From 9fb6792e10ce790b43b2df561923e973ea296b1f Mon Sep 17 00:00:00 2001 From: Duncan Tourolle Date: Sun, 9 Nov 2025 22:25:23 +0100 Subject: [PATCH] fix missing images in paras --- pyWebLayout/io/readers/html_extraction.py | 61 ++++++++- pyWebLayout/layout/ereader_manager.py | 153 +++++++++++++++++++++- tests/io_tests/test_html_extraction.py | 100 +++++++++++++- 3 files changed, 307 insertions(+), 7 deletions(-) diff --git a/pyWebLayout/io/readers/html_extraction.py b/pyWebLayout/io/readers/html_extraction.py index 65f3caf..0cf7ac9 100644 --- a/pyWebLayout/io/readers/html_extraction.py +++ b/pyWebLayout/io/readers/html_extraction.py @@ -485,8 +485,65 @@ def process_element( # Union[Block, List[Block], None] -def paragraph_handler(element: Tag, context: StyleContext) -> Paragraph: - """Handle

<p> elements.""" +def paragraph_handler(element: Tag, context: StyleContext) -> Union[Paragraph, List[Block], Image]: + """ + Handle <p>

elements. + + Special handling for paragraphs containing images: + - If the paragraph contains only an image (common in EPUBs), return the image block + - If the paragraph contains images mixed with text, split into separate blocks + - Otherwise, return a normal paragraph with text content + """ + # Check if paragraph contains any img tags (including nested ones) + img_tags = element.find_all('img') + + if img_tags: + # Paragraph contains images - need special handling + blocks = [] + + # Check if this is an image-only paragraph (very common in EPUBs) + # Get text content without the img tags + text_content = element.get_text(strip=True) + + if not text_content or len(text_content.strip()) == 0: + # Image-only paragraph - return just the image(s) + for img_tag in img_tags: + child_context = apply_element_styling(context, img_tag) + img_block = image_handler(img_tag, child_context) + if img_block: + blocks.append(img_block) + + # Return single image or list of images + if len(blocks) == 1: + return blocks[0] + return blocks if blocks else Paragraph(context.font) + + # Mixed content - paragraph has both text and images + # Process children in order to preserve structure + for child in element.children: + if isinstance(child, Tag): + if child.name == 'img': + # Add the image as a separate block + child_context = apply_element_styling(context, child) + img_block = image_handler(child, child_context) + if img_block: + blocks.append(img_block) + else: + # Process other inline elements as part of text + # This will be handled by extract_text_content below + pass + + # Also add a paragraph with the text content + paragraph = Paragraph(context.font) + words = extract_text_content(element, context) + if words: + for word in words: + paragraph.add_word(word) + blocks.insert(0, paragraph) # Text comes before images + + return blocks if blocks else Paragraph(context.font) + + # No images - normal paragraph handling paragraph = Paragraph(context.font) words = 
extract_text_content(element, context) for word in words: diff --git a/pyWebLayout/layout/ereader_manager.py b/pyWebLayout/layout/ereader_manager.py index 17eee58..316431b 100644 --- a/pyWebLayout/layout/ereader_manager.py +++ b/pyWebLayout/layout/ereader_manager.py @@ -13,9 +13,11 @@ from pathlib import Path from .ereader_layout import RenderingPosition, ChapterNavigator, ChapterInfo from .page_buffer import BufferedPageRenderer -from pyWebLayout.abstract.block import Block, HeadingLevel +from pyWebLayout.abstract.block import Block, HeadingLevel, Image, BlockType from pyWebLayout.concrete.page import Page +from pyWebLayout.concrete.image import RenderableImage from pyWebLayout.style.page_style import PageStyle +from pyWebLayout.layout.document_layouter import image_layouter class BookmarkManager: @@ -194,6 +196,10 @@ class EreaderLayoutManager: self.current_position = RenderingPosition() self.font_scale = 1.0 + # Cover page handling + self._has_cover = self._detect_cover() + self._on_cover_page = self._has_cover # Start on cover if one exists + # Page position history for fast backward navigation # List of (position, font_scale) tuples representing the start of each page visited self._page_history: List[Tuple[RenderingPosition, float]] = [] @@ -203,6 +209,7 @@ class EreaderLayoutManager: saved_position = self.bookmark_manager.load_reading_position() if saved_position: self.current_position = saved_position + self._on_cover_page = False # If we have a saved position, we're past the cover # Callbacks for UI updates self.position_changed_callback: Optional[Callable[[ @@ -220,6 +227,69 @@ class EreaderLayoutManager: """Set callback for chapter changes""" self.chapter_changed_callback = callback + def _detect_cover(self) -> bool: + """ + Detect if the document has a cover page. + + A cover is detected if: + 1. The first block is an Image block, OR + 2. 
The document has cover metadata (future enhancement) + + Returns: + True if a cover page should be rendered + """ + if not self.blocks: + return False + + # Check if first block is an image - treat it as a cover + first_block = self.blocks[0] + if isinstance(first_block, Image): + return True + + return False + + def _render_cover_page(self) -> Page: + """ + Render a dedicated cover page. + + The cover page displays the first image block (if it exists) + using the standard image layouter with maximum dimensions to fill the page. + + Returns: + Rendered cover page + """ + # Create a new page for the cover + page = Page(self.page_size, self.page_style) + + if not self.blocks or not isinstance(self.blocks[0], Image): + # No cover image, return blank page + return page + + cover_image_block = self.blocks[0] + + # Use the image layouter to render the cover image + # Use full page dimensions (minus borders/padding) for cover + try: + max_width = self.page_size[0] - 2 * self.page_style.border_width + max_height = self.page_size[1] - 2 * self.page_style.border_width + + # Layout the image on the page + success = image_layouter( + image=cover_image_block, + page=page, + max_width=max_width, + max_height=max_height + ) + + if not success: + print("Warning: Failed to layout cover image") + + except Exception as e: + # If image loading fails, just return the blank page + print(f"Warning: Failed to load cover image: {e}") + + return page + def _notify_position_changed(self): """Notify UI of position change""" if self.position_changed_callback: @@ -238,9 +308,16 @@ class EreaderLayoutManager: """ Get the page at the current reading position. + If on the cover page, returns the rendered cover. + Otherwise, returns the regular content page. 
+ Returns: Rendered page """ + # Check if we're on the cover page + if self._on_cover_page and self._has_cover: + return self._render_cover_page() + page, _ = self.renderer.render_page(self.current_position, self.font_scale) return page @@ -248,9 +325,23 @@ class EreaderLayoutManager: """ Advance to the next page. + If currently on the cover page, advances to the first content page. + Otherwise, advances to the next content page. + Returns: Next page or None if at end of document """ + # Special case: transitioning from cover to first content page + if self._on_cover_page and self._has_cover: + self._on_cover_page = False + # If first block is an image (the cover), skip it and start from block 1 + if self.blocks and isinstance(self.blocks[0], Image): + self.current_position = RenderingPosition(chapter_index=0, block_index=1) + else: + self.current_position = RenderingPosition() + self._notify_position_changed() + return self.get_current_page() + # Save current position to history before moving forward self._add_to_history(self.current_position, self.font_scale) @@ -271,10 +362,21 @@ class EreaderLayoutManager: Uses cached page history for instant navigation when available, falls back to iterative refinement algorithm when needed. + Can navigate back to the cover page if it exists. 
Returns: - Previous page or None if at beginning of document + Previous page or None if at beginning of document (or on cover) """ + # Special case: if at the beginning of content and there's a cover, go back to it + if self._has_cover and self._is_at_beginning() and not self._on_cover_page: + self._on_cover_page = True + self._notify_position_changed() + return self.get_current_page() + + # Can't go before the cover + if self._on_cover_page: + return None + if self._is_at_beginning(): return None @@ -303,9 +405,17 @@ class EreaderLayoutManager: return None # At beginning of document def _is_at_beginning(self) -> bool: - """Check if we're at the beginning of the document""" + """ + Check if we're at the beginning of the document content. + + If a cover exists (first block is an Image), the beginning of content + is at block_index=1. Otherwise, it's at block_index=0. + """ + # Determine the first content block index + first_content_block = 1 if (self._has_cover and self.blocks and isinstance(self.blocks[0], Image)) else 0 + return (self.current_position.chapter_index == 0 and - self.current_position.block_index == 0 and + self.current_position.block_index == first_content_block and self.current_position.word_index == 0) def jump_to_position(self, position: RenderingPosition) -> Page: @@ -319,6 +429,7 @@ class EreaderLayoutManager: Page at the new position """ self.current_position = position + self._on_cover_page = False # Jumping to a position means we're past the cover self._notify_position_changed() return self.get_current_page() @@ -636,6 +747,38 @@ class EreaderLayoutManager: return current_block / max(1, total_blocks - 1) + def has_cover(self) -> bool: + """ + Check if the document has a cover page. + + Returns: + True if a cover page is available + """ + return self._has_cover + + def is_on_cover(self) -> bool: + """ + Check if currently viewing the cover page. 
+ + Returns: + True if on the cover page + """ + return self._on_cover_page + + def jump_to_cover(self) -> Optional[Page]: + """ + Jump to the cover page if one exists. + + Returns: + Cover page or None if no cover exists + """ + if not self._has_cover: + return None + + self._on_cover_page = True + self._notify_position_changed() + return self.get_current_page() + def get_position_info(self) -> Dict[str, Any]: """ Get detailed information about the current position. @@ -647,6 +790,8 @@ class EreaderLayoutManager: return { 'position': self.current_position.to_dict(), + 'on_cover': self._on_cover_page, + 'has_cover': self._has_cover, 'chapter': { 'title': current_chapter.title if current_chapter else None, 'level': current_chapter.level if current_chapter else None, diff --git a/tests/io_tests/test_html_extraction.py b/tests/io_tests/test_html_extraction.py index 0fe7320..fe98867 100644 --- a/tests/io_tests/test_html_extraction.py +++ b/tests/io_tests/test_html_extraction.py @@ -7,7 +7,7 @@ including styled content within paragraphs and block-level elements. import unittest from pyWebLayout.io.readers.html_extraction import parse_html_string -from pyWebLayout.abstract.block import Paragraph, Heading, HeadingLevel, Quote, CodeBlock, HList, ListStyle, Table +from pyWebLayout.abstract.block import Paragraph, Heading, HeadingLevel, Quote, CodeBlock, HList, ListStyle, Table, Image from pyWebLayout.abstract.document import Document from pyWebLayout.style import Font, FontWeight, FontStyle, TextDecoration @@ -585,5 +585,103 @@ class TestHTMLFontRegistryIntegration(unittest.TestCase): "Should create separate styles for style combinations") +class TestHTMLImagesInParagraphs(unittest.TestCase): + """Test cases for handling images inside paragraph tags.""" + + def setUp(self): + """Set up test fixtures.""" + self.base_font = Font(font_size=14) + + def test_image_only_paragraph(self): + """Test paragraph containing only an image (common in EPUBs).""" + html = '

<p><img src="cover.jpg" alt="Book Cover"/></p>

' + blocks = parse_html_string(html, base_font=self.base_font) + + # Should parse as an Image block, not a Paragraph + self.assertGreater(len(blocks), 0, "Should parse at least one block") + + # Check that we have an Image block + image_blocks = [b for b in blocks if isinstance(b, Image)] + self.assertGreater(len(image_blocks), 0, "Should have at least one Image block") + + # Verify image properties + img = image_blocks[0] + self.assertEqual(img.source, "cover.jpg") + self.assertEqual(img.alt_text, "Book Cover") + + def test_paragraph_with_multiple_images(self): + """Test paragraph with multiple images.""" + html = '

<p><img src="img1.jpg" alt="First"/><img src="img2.jpg" alt="Second"/></p>

' + blocks = parse_html_string(html, base_font=self.base_font) + + # Should have multiple Image blocks + image_blocks = [b for b in blocks if isinstance(b, Image)] + self.assertEqual(len(image_blocks), 2, "Should have two Image blocks") + + # Verify both images were parsed + sources = [img.source for img in image_blocks] + self.assertIn("img1.jpg", sources) + self.assertIn("img2.jpg", sources) + + def test_paragraph_with_text_and_image(self): + """Test paragraph with mixed text and image content.""" + html = '

<p>Some text before <img src="inline.jpg" alt="Inline"/> and after</p>

' + blocks = parse_html_string(html, base_font=self.base_font) + + # Should have both paragraph and image blocks + paragraphs = [b for b in blocks if isinstance(b, Paragraph)] + images = [b for b in blocks if isinstance(b, Image)] + + self.assertGreater(len(paragraphs), 0, "Should have a Paragraph block for text") + self.assertGreater(len(images), 0, "Should have an Image block") + + # Verify image was parsed + self.assertEqual(images[0].source, "inline.jpg") + + # Verify text was extracted (should have words like "Some", "text", etc.) + if paragraphs: + words = list(paragraphs[0].words_iter()) + self.assertGreater(len(words), 0, "Paragraph should have words") + + def test_regular_paragraph_still_works(self): + """Test that regular paragraphs without images still work correctly.""" + html = '

<p>Just regular text without any images.</p>

' + blocks = parse_html_string(html, base_font=self.base_font) + + # Should be exactly one Paragraph block + self.assertEqual(len(blocks), 1, "Should have exactly one block") + self.assertIsInstance(blocks[0], Paragraph, "Should be a Paragraph block") + + # Should not have any Image blocks + image_blocks = [b for b in blocks if isinstance(b, Image)] + self.assertEqual(len(image_blocks), 0, "Should have no Image blocks") + + def test_image_with_width_and_height(self): + """Test image parsing with width and height attributes.""" + html = '

<p><img src="sized.jpg" alt="Sized Image" width="400" height="300"/></p>

' + blocks = parse_html_string(html, base_font=self.base_font) + + # Should have an Image block + image_blocks = [b for b in blocks if isinstance(b, Image)] + self.assertEqual(len(image_blocks), 1, "Should have one Image block") + + # Verify dimensions were parsed + img = image_blocks[0] + self.assertEqual(img.width, 400) + self.assertEqual(img.height, 300) + + def test_nested_paragraph_with_image_in_span(self): + """Test image inside nested inline elements.""" + html = '

<p><span><img src="nested.jpg" alt="Nested"/></span></p>

' + blocks = parse_html_string(html, base_font=self.base_font) + + # Should still extract the image + image_blocks = [b for b in blocks if isinstance(b, Image)] + self.assertGreater(len(image_blocks), 0, "Should find image even when nested") + + # Verify image was parsed correctly + self.assertEqual(image_blocks[0].source, "nested.jpg") + + if __name__ == '__main__': unittest.main()