Fix missing images in paragraphs
All checks were successful
Python CI / test (3.10) (push) Successful in 2m6s
Python CI / test (3.12) (push) Successful in 1m58s
Python CI / test (3.13) (push) Successful in 1m51s

This commit is contained in:
Duncan Tourolle 2025-11-09 22:25:23 +01:00
parent 40c1b913ec
commit 9fb6792e10
3 changed files with 307 additions and 7 deletions

View File

@ -485,8 +485,65 @@ def process_element(
# Union[Block, List[Block], None]
def paragraph_handler(element: Tag, context: StyleContext) -> Paragraph:
"""Handle <p> elements."""
def paragraph_handler(element: Tag, context: StyleContext) -> Union[Paragraph, List[Block], Image]:
"""
Handle <p> elements.
Special handling for paragraphs containing images:
- If the paragraph contains only an image (common in EPUBs), return the image block
- If the paragraph contains images mixed with text, split into separate blocks
- Otherwise, return a normal paragraph with text content
"""
# Check if paragraph contains any img tags (including nested ones)
img_tags = element.find_all('img')
if img_tags:
# Paragraph contains images - need special handling
blocks = []
# Check if this is an image-only paragraph (very common in EPUBs)
# Get text content without the img tags
text_content = element.get_text(strip=True)
if not text_content or len(text_content.strip()) == 0:
# Image-only paragraph - return just the image(s)
for img_tag in img_tags:
child_context = apply_element_styling(context, img_tag)
img_block = image_handler(img_tag, child_context)
if img_block:
blocks.append(img_block)
# Return single image or list of images
if len(blocks) == 1:
return blocks[0]
return blocks if blocks else Paragraph(context.font)
# Mixed content - paragraph has both text and images
# Process children in order to preserve structure
for child in element.children:
if isinstance(child, Tag):
if child.name == 'img':
# Add the image as a separate block
child_context = apply_element_styling(context, child)
img_block = image_handler(child, child_context)
if img_block:
blocks.append(img_block)
else:
# Process other inline elements as part of text
# This will be handled by extract_text_content below
pass
# Also add a paragraph with the text content
paragraph = Paragraph(context.font)
words = extract_text_content(element, context)
if words:
for word in words:
paragraph.add_word(word)
blocks.insert(0, paragraph) # Text comes before images
return blocks if blocks else Paragraph(context.font)
# No images - normal paragraph handling
paragraph = Paragraph(context.font)
words = extract_text_content(element, context)
for word in words:

View File

@ -13,9 +13,11 @@ from pathlib import Path
from .ereader_layout import RenderingPosition, ChapterNavigator, ChapterInfo
from .page_buffer import BufferedPageRenderer
from pyWebLayout.abstract.block import Block, HeadingLevel
from pyWebLayout.abstract.block import Block, HeadingLevel, Image, BlockType
from pyWebLayout.concrete.page import Page
from pyWebLayout.concrete.image import RenderableImage
from pyWebLayout.style.page_style import PageStyle
from pyWebLayout.layout.document_layouter import image_layouter
class BookmarkManager:
@ -194,6 +196,10 @@ class EreaderLayoutManager:
self.current_position = RenderingPosition()
self.font_scale = 1.0
# Cover page handling
self._has_cover = self._detect_cover()
self._on_cover_page = self._has_cover # Start on cover if one exists
# Page position history for fast backward navigation
# List of (position, font_scale) tuples representing the start of each page visited
self._page_history: List[Tuple[RenderingPosition, float]] = []
@ -203,6 +209,7 @@ class EreaderLayoutManager:
saved_position = self.bookmark_manager.load_reading_position()
if saved_position:
self.current_position = saved_position
self._on_cover_page = False # If we have a saved position, we're past the cover
# Callbacks for UI updates
self.position_changed_callback: Optional[Callable[[
@ -220,6 +227,69 @@ class EreaderLayoutManager:
"""Set callback for chapter changes"""
self.chapter_changed_callback = callback
def _detect_cover(self) -> bool:
    """
    Decide whether the document starts with a cover page.

    The only rule applied here is structural: a document whose very first
    block is an ``Image`` is treated as having a cover. Cover metadata is
    not consulted.

    Returns:
        True when ``self.blocks`` is non-empty and its first entry is an
        ``Image``; False otherwise.
    """
    # bool() keeps the return value a real bool even when blocks is empty.
    return bool(self.blocks) and isinstance(self.blocks[0], Image)
def _render_cover_page(self) -> Page:
    """
    Build the dedicated cover page.

    A fresh ``Page`` is created and, when the first block is an ``Image``,
    that image is handed to ``image_layouter`` with the page size reduced by
    twice the border width on each axis. Any failure is reported with a
    printed warning and the page is returned as it stands.

    Returns:
        The cover page (blank when there is no leading image block).
    """
    cover_page = Page(self.page_size, self.page_style)

    starts_with_image = bool(self.blocks) and isinstance(self.blocks[0], Image)
    if not starts_with_image:
        # Nothing to draw: hand back the empty page.
        return cover_page

    cover_image_block = self.blocks[0]

    try:
        # The border is subtracted once per side, hence the factor of two.
        border_total = 2 * self.page_style.border_width
        laid_out = image_layouter(
            image=cover_image_block,
            page=cover_page,
            max_width=self.page_size[0] - border_total,
            max_height=self.page_size[1] - border_total,
        )
        if not laid_out:
            print("Warning: Failed to layout cover image")
    except Exception as e:
        # Best-effort rendering: a broken cover must not prevent page display.
        print(f"Warning: Failed to load cover image: {e}")

    return cover_page
def _notify_position_changed(self):
"""Notify UI of position change"""
if self.position_changed_callback:
@ -238,9 +308,16 @@ class EreaderLayoutManager:
"""
Get the page at the current reading position.
If on the cover page, returns the rendered cover.
Otherwise, returns the regular content page.
Returns:
Rendered page
"""
# Check if we're on the cover page
if self._on_cover_page and self._has_cover:
return self._render_cover_page()
page, _ = self.renderer.render_page(self.current_position, self.font_scale)
return page
@ -248,9 +325,23 @@ class EreaderLayoutManager:
"""
Advance to the next page.
If currently on the cover page, advances to the first content page.
Otherwise, advances to the next content page.
Returns:
Next page or None if at end of document
"""
# Special case: transitioning from cover to first content page
if self._on_cover_page and self._has_cover:
self._on_cover_page = False
# If first block is an image (the cover), skip it and start from block 1
if self.blocks and isinstance(self.blocks[0], Image):
self.current_position = RenderingPosition(chapter_index=0, block_index=1)
else:
self.current_position = RenderingPosition()
self._notify_position_changed()
return self.get_current_page()
# Save current position to history before moving forward
self._add_to_history(self.current_position, self.font_scale)
@ -271,10 +362,21 @@ class EreaderLayoutManager:
Uses cached page history for instant navigation when available,
falls back to iterative refinement algorithm when needed.
Can navigate back to the cover page if it exists.
Returns:
Previous page or None if at beginning of document
Previous page or None if at beginning of document (or on cover)
"""
# Special case: if at the beginning of content and there's a cover, go back to it
if self._has_cover and self._is_at_beginning() and not self._on_cover_page:
self._on_cover_page = True
self._notify_position_changed()
return self.get_current_page()
# Can't go before the cover
if self._on_cover_page:
return None
if self._is_at_beginning():
return None
@ -303,9 +405,17 @@ class EreaderLayoutManager:
return None # At beginning of document
def _is_at_beginning(self) -> bool:
"""Check if we're at the beginning of the document"""
"""
Check if we're at the beginning of the document content.
If a cover exists (first block is an Image), the beginning of content
is at block_index=1. Otherwise, it's at block_index=0.
"""
# Determine the first content block index
first_content_block = 1 if (self._has_cover and self.blocks and isinstance(self.blocks[0], Image)) else 0
return (self.current_position.chapter_index == 0 and
self.current_position.block_index == 0 and
self.current_position.block_index == first_content_block and
self.current_position.word_index == 0)
def jump_to_position(self, position: RenderingPosition) -> Page:
@ -319,6 +429,7 @@ class EreaderLayoutManager:
Page at the new position
"""
self.current_position = position
self._on_cover_page = False # Jumping to a position means we're past the cover
self._notify_position_changed()
return self.get_current_page()
@ -636,6 +747,38 @@ class EreaderLayoutManager:
return current_block / max(1, total_blocks - 1)
def has_cover(self) -> bool:
    """
    Report whether a cover page was detected for this document.

    Returns:
        The stored ``_has_cover`` flag.
    """
    return self._has_cover
def is_on_cover(self) -> bool:
    """
    Report whether the cover page is the page currently being shown.

    Returns:
        The stored ``_on_cover_page`` flag.
    """
    return self._on_cover_page
def jump_to_cover(self) -> Optional[Page]:
    """
    Switch the view to the cover page, when the document has one.

    On success the ``_on_cover_page`` flag is set, the position-changed
    notification is fired, and the current page is rendered and returned.

    Returns:
        The page from ``get_current_page()``, or None when there is no cover
        (in which case no state is modified).
    """
    if self._has_cover:
        self._on_cover_page = True
        self._notify_position_changed()
        return self.get_current_page()
    return None
def get_position_info(self) -> Dict[str, Any]:
"""
Get detailed information about the current position.
@ -647,6 +790,8 @@ class EreaderLayoutManager:
return {
'position': self.current_position.to_dict(),
'on_cover': self._on_cover_page,
'has_cover': self._has_cover,
'chapter': {
'title': current_chapter.title if current_chapter else None,
'level': current_chapter.level if current_chapter else None,

View File

@ -7,7 +7,7 @@ including styled content within paragraphs and block-level elements.
import unittest
from pyWebLayout.io.readers.html_extraction import parse_html_string
from pyWebLayout.abstract.block import Paragraph, Heading, HeadingLevel, Quote, CodeBlock, HList, ListStyle, Table
from pyWebLayout.abstract.block import Paragraph, Heading, HeadingLevel, Quote, CodeBlock, HList, ListStyle, Table, Image
from pyWebLayout.abstract.document import Document
from pyWebLayout.style import Font, FontWeight, FontStyle, TextDecoration
@ -585,5 +585,103 @@ class TestHTMLFontRegistryIntegration(unittest.TestCase):
"Should create separate styles for style combinations")
class TestHTMLImagesInParagraphs(unittest.TestCase):
    """Verify how ``<img>`` tags found inside ``<p>`` elements are parsed into blocks."""

    def setUp(self):
        """Create the base font shared by every test."""
        self.base_font = Font(font_size=14)

    def _parse(self, html):
        """Parse *html* with the fixture font and return the resulting blocks."""
        return parse_html_string(html, base_font=self.base_font)

    @staticmethod
    def _only(blocks, block_type):
        """Return the blocks that are instances of *block_type*, order preserved."""
        return [block for block in blocks if isinstance(block, block_type)]

    def test_image_only_paragraph(self):
        """A paragraph holding nothing but an image yields an Image block."""
        blocks = self._parse('<p><img src="cover.jpg" alt="Book Cover"/></p>')

        self.assertGreater(len(blocks), 0, "Should parse at least one block")

        found_images = self._only(blocks, Image)
        self.assertGreater(len(found_images), 0, "Should have at least one Image block")

        # The first image must carry the attributes from the markup.
        first = found_images[0]
        self.assertEqual(first.source, "cover.jpg")
        self.assertEqual(first.alt_text, "Book Cover")

    def test_paragraph_with_multiple_images(self):
        """Two images in one paragraph yield two Image blocks."""
        blocks = self._parse(
            '<p><img src="img1.jpg" alt="First"/><img src="img2.jpg" alt="Second"/></p>'
        )

        found_images = self._only(blocks, Image)
        self.assertEqual(len(found_images), 2, "Should have two Image blocks")

        parsed_sources = [image.source for image in found_images]
        self.assertIn("img1.jpg", parsed_sources)
        self.assertIn("img2.jpg", parsed_sources)

    def test_paragraph_with_text_and_image(self):
        """Mixed text and image content yields both a Paragraph and an Image."""
        blocks = self._parse(
            '<p>Some text before <img src="inline.jpg" alt="Inline"/> and after</p>'
        )

        text_blocks = self._only(blocks, Paragraph)
        found_images = self._only(blocks, Image)

        self.assertGreater(len(text_blocks), 0, "Should have a Paragraph block for text")
        self.assertGreater(len(found_images), 0, "Should have an Image block")

        self.assertEqual(found_images[0].source, "inline.jpg")

        # text_blocks is known to be non-empty here thanks to the assertion above.
        extracted_words = list(text_blocks[0].words_iter())
        self.assertGreater(len(extracted_words), 0, "Paragraph should have words")

    def test_regular_paragraph_still_works(self):
        """A paragraph without images still parses to exactly one Paragraph."""
        blocks = self._parse('<p>Just regular text without any images.</p>')

        self.assertEqual(len(blocks), 1, "Should have exactly one block")
        self.assertIsInstance(blocks[0], Paragraph, "Should be a Paragraph block")

        found_images = self._only(blocks, Image)
        self.assertEqual(len(found_images), 0, "Should have no Image blocks")

    def test_image_with_width_and_height(self):
        """Width and height attributes are carried over to the Image block."""
        blocks = self._parse(
            '<p><img src="sized.jpg" alt="Sized Image" width="400" height="300"/></p>'
        )

        found_images = self._only(blocks, Image)
        self.assertEqual(len(found_images), 1, "Should have one Image block")

        sized = found_images[0]
        self.assertEqual(sized.width, 400)
        self.assertEqual(sized.height, 300)

    def test_nested_paragraph_with_image_in_span(self):
        """An image wrapped in a nested inline element is still extracted."""
        blocks = self._parse('<p><span><img src="nested.jpg" alt="Nested"/></span></p>')

        found_images = self._only(blocks, Image)
        self.assertGreater(len(found_images), 0, "Should find image even when nested")

        self.assertEqual(found_images[0].source, "nested.jpg")
if __name__ == '__main__':
unittest.main()