fix missing images in paras

2025-11-09 22:25:23 +01:00 · 2025-11-09 22:25:23 +01:00 · 9fb6792e10
commit 9fb6792e10
parent 40c1b913ec
3 changed files with 307 additions and 7 deletions
--- a/pyWebLayout/io/readers/html_extraction.py
+++ b/pyWebLayout/io/readers/html_extraction.py
@ -485,8 +485,65 @@ def process_element(
 # Union[Block, List[Block], None]
-def paragraph_handler(element: Tag, context: StyleContext) -> Paragraph:
+def paragraph_handler(element: Tag, context: StyleContext) -> Union[Paragraph, List[Block], Image]:
-    """Handle <p> elements."""
+    """
    Handle <p> elements.
    Special handling for paragraphs containing images:
    - If the paragraph contains only an image (common in EPUBs), return the image block
    - If the paragraph contains images mixed with text, split into separate blocks
    - Otherwise, return a normal paragraph with text content
    """
    # Check if paragraph contains any img tags (including nested ones)
    img_tags = element.find_all('img')
    if img_tags:
        # Paragraph contains images - need special handling
        blocks = []
        # Check if this is an image-only paragraph (very common in EPUBs)
        # Get text content without the img tags
        text_content = element.get_text(strip=True)
        if not text_content or len(text_content.strip()) == 0:
            # Image-only paragraph - return just the image(s)
            for img_tag in img_tags:
                child_context = apply_element_styling(context, img_tag)
                img_block = image_handler(img_tag, child_context)
                if img_block:
                    blocks.append(img_block)
            # Return single image or list of images
            if len(blocks) == 1:
                return blocks[0]
            return blocks if blocks else Paragraph(context.font)
        # Mixed content - paragraph has both text and images
        # Process children in order to preserve structure
        for child in element.children:
            if isinstance(child, Tag):
                if child.name == 'img':
                    # Add the image as a separate block
                    child_context = apply_element_styling(context, child)
                    img_block = image_handler(child, child_context)
                    if img_block:
                        blocks.append(img_block)
                else:
                    # Process other inline elements as part of text
                    # This will be handled by extract_text_content below
                    pass
        # Also add a paragraph with the text content
        paragraph = Paragraph(context.font)
        words = extract_text_content(element, context)
        if words:
            for word in words:
                paragraph.add_word(word)
            blocks.insert(0, paragraph)  # Text comes before images
        return blocks if blocks else Paragraph(context.font)
    # No images - normal paragraph handling
    paragraph = Paragraph(context.font)
    words = extract_text_content(element, context)
    for word in words:
--- a/pyWebLayout/layout/ereader_manager.py
+++ b/pyWebLayout/layout/ereader_manager.py
@ -13,9 +13,11 @@ from pathlib import Path
 from .ereader_layout import RenderingPosition, ChapterNavigator, ChapterInfo
 from .page_buffer import BufferedPageRenderer
-from pyWebLayout.abstract.block import Block, HeadingLevel
+from pyWebLayout.abstract.block import Block, HeadingLevel, Image, BlockType
 from pyWebLayout.concrete.page import Page
 from pyWebLayout.concrete.image import RenderableImage
 from pyWebLayout.style.page_style import PageStyle
 from pyWebLayout.layout.document_layouter import image_layouter
 class BookmarkManager:
@ -194,6 +196,10 @@ class EreaderLayoutManager:
        self.current_position = RenderingPosition()
        self.font_scale = 1.0
        # Cover page handling
        self._has_cover = self._detect_cover()
        self._on_cover_page = self._has_cover  # Start on cover if one exists
        # Page position history for fast backward navigation
        # List of (position, font_scale) tuples representing the start of each page visited
        self._page_history: List[Tuple[RenderingPosition, float]] = []
@ -203,6 +209,7 @@ class EreaderLayoutManager:
        saved_position = self.bookmark_manager.load_reading_position()
        if saved_position:
            self.current_position = saved_position
            self._on_cover_page = False  # If we have a saved position, we're past the cover
        # Callbacks for UI updates
        self.position_changed_callback: Optional[Callable[[
@ -220,6 +227,69 @@ class EreaderLayoutManager:
        """Set callback for chapter changes"""
        self.chapter_changed_callback = callback
    def _detect_cover(self) -> bool:
        """
        Report whether the document opens with a cover image.

        The only check performed is positional: a document whose very first
        block is an ``Image`` is considered to have a cover. Cover metadata
        is not consulted here.

        Returns:
            True when ``self.blocks`` is non-empty and its first entry is an
            ``Image``; False otherwise.
        """
        # Short-circuiting keeps the index access safe for an empty block list.
        return bool(self.blocks) and isinstance(self.blocks[0], Image)
    def _render_cover_page(self) -> Page:
        """
        Build the page that is shown for the cover.

        A fresh ``Page`` is created from ``self.page_size`` and
        ``self.page_style``. When the first block is an ``Image`` it is passed
        to ``image_layouter`` with a bounding box equal to the page size
        reduced by twice ``page_style.border_width`` on each axis. Layout is
        best-effort: a falsy layouter result or any raised exception only
        prints a warning, and the page is still returned.

        Returns:
            The cover page; no image layout is attempted when the document
            does not start with an ``Image`` block.
        """
        cover_page = Page(self.page_size, self.page_style)

        # Nothing to draw unless the document starts with an image block.
        if not (self.blocks and isinstance(self.blocks[0], Image)):
            return cover_page

        leading_image = self.blocks[0]
        try:
            # Bounding box: page size minus the border width on both sides.
            width_limit = self.page_size[0] - 2 * self.page_style.border_width
            height_limit = self.page_size[1] - 2 * self.page_style.border_width
            laid_out = image_layouter(
                image=leading_image,
                page=cover_page,
                max_width=width_limit,
                max_height=height_limit,
            )
            if not laid_out:
                print("Warning: Failed to layout cover image")
        except Exception as exc:
            # Deliberately broad: a cover that cannot be laid out must not
            # stop the reader from getting a page back.
            print(f"Warning: Failed to load cover image: {exc}")
        return cover_page
    def _notify_position_changed(self):
        """Notify UI of position change"""
        if self.position_changed_callback:
@ -238,9 +308,16 @@ class EreaderLayoutManager:
        """
        Get the page at the current reading position.
        If on the cover page, returns the rendered cover.
        Otherwise, returns the regular content page.
        Returns:
            Rendered page
        """
        # Check if we're on the cover page
        if self._on_cover_page and self._has_cover:
            return self._render_cover_page()
        page, _ = self.renderer.render_page(self.current_position, self.font_scale)
        return page
@ -248,9 +325,23 @@ class EreaderLayoutManager:
        """
        Advance to the next page.
        If currently on the cover page, advances to the first content page.
        Otherwise, advances to the next content page.
        Returns:
            Next page or None if at end of document
        """
        # Special case: transitioning from cover to first content page
        if self._on_cover_page and self._has_cover:
            self._on_cover_page = False
            # If first block is an image (the cover), skip it and start from block 1
            if self.blocks and isinstance(self.blocks[0], Image):
                self.current_position = RenderingPosition(chapter_index=0, block_index=1)
            else:
                self.current_position = RenderingPosition()
            self._notify_position_changed()
            return self.get_current_page()
        # Save current position to history before moving forward
        self._add_to_history(self.current_position, self.font_scale)
@ -271,10 +362,21 @@ class EreaderLayoutManager:
        Uses cached page history for instant navigation when available,
        falls back to iterative refinement algorithm when needed.
        Can navigate back to the cover page if it exists.
        Returns:
-            Previous page or None if at beginning of document
+            Previous page or None if at beginning of document (or on cover)
        """
        # Special case: if at the beginning of content and there's a cover, go back to it
        if self._has_cover and self._is_at_beginning() and not self._on_cover_page:
            self._on_cover_page = True
            self._notify_position_changed()
            return self.get_current_page()
        # Can't go before the cover
        if self._on_cover_page:
            return None
        if self._is_at_beginning():
            return None
@ -303,9 +405,17 @@ class EreaderLayoutManager:
        return None  # At beginning of document
    def _is_at_beginning(self) -> bool:
-        """Check if we're at the beginning of the document"""
+        """
        Check if we're at the beginning of the document content.
        If a cover exists (first block is an Image), the beginning of content
        is at block_index=1. Otherwise, it's at block_index=0.
        """
        # Determine the first content block index
        first_content_block = 1 if (self._has_cover and self.blocks and isinstance(self.blocks[0], Image)) else 0
        return (self.current_position.chapter_index == 0 and
-                self.current_position.block_index == 0 and
+                self.current_position.block_index == first_content_block and
                self.current_position.word_index == 0)
    def jump_to_position(self, position: RenderingPosition) -> Page:
@ -319,6 +429,7 @@ class EreaderLayoutManager:
            Page at the new position
        """
        self.current_position = position
        self._on_cover_page = False  # Jumping to a position means we're past the cover
        self._notify_position_changed()
        return self.get_current_page()
@ -636,6 +747,38 @@ class EreaderLayoutManager:
        return current_block / max(1, total_blocks - 1)
    def has_cover(self) -> bool:
        """
        Tell whether a cover page is available for this document.

        Returns:
            The stored ``_has_cover`` flag.
        """
        cover_available = self._has_cover
        return cover_available
    def is_on_cover(self) -> bool:
        """
        Tell whether the cover page is the one currently being viewed.

        Returns:
            The stored ``_on_cover_page`` flag.
        """
        viewing_cover = self._on_cover_page
        return viewing_cover
    def jump_to_cover(self) -> Optional[Page]:
        """
        Jump to the cover page if one exists.
        Returns:
            Cover page or None if no cover exists
        """
        if not self._has_cover:
            return None
        self._on_cover_page = True
        self._notify_position_changed()
        return self.get_current_page()
    def get_position_info(self) -> Dict[str, Any]:
        """
        Get detailed information about the current position.
@ -647,6 +790,8 @@ class EreaderLayoutManager:
        return {
            'position': self.current_position.to_dict(),
            'on_cover': self._on_cover_page,
            'has_cover': self._has_cover,
            'chapter': {
                'title': current_chapter.title if current_chapter else None,
                'level': current_chapter.level if current_chapter else None,
--- a/tests/io_tests/test_html_extraction.py
+++ b/tests/io_tests/test_html_extraction.py
@ -7,7 +7,7 @@ including styled content within paragraphs and block-level elements.
 import unittest
 from pyWebLayout.io.readers.html_extraction import parse_html_string
-from pyWebLayout.abstract.block import Paragraph, Heading, HeadingLevel, Quote, CodeBlock, HList, ListStyle, Table
+from pyWebLayout.abstract.block import Paragraph, Heading, HeadingLevel, Quote, CodeBlock, HList, ListStyle, Table, Image
 from pyWebLayout.abstract.document import Document
 from pyWebLayout.style import Font, FontWeight, FontStyle, TextDecoration
@ -585,5 +585,103 @@ class TestHTMLFontRegistryIntegration(unittest.TestCase):
                           "Should create separate styles for style combinations")
 class TestHTMLImagesInParagraphs(unittest.TestCase):
    """Verify how ``<p>`` elements that contain ``<img>`` tags are parsed."""

    def setUp(self):
        """Create the base font handed to every parse call."""
        self.base_font = Font(font_size=14)

    def test_image_only_paragraph(self):
        """A paragraph holding nothing but one image yields an Image block."""
        parsed = parse_html_string(
            '<p><img src="cover.jpg" alt="Book Cover"/></p>',
            base_font=self.base_font,
        )
        self.assertGreater(len(parsed), 0, "Should parse at least one block")

        # The image must surface as an Image block rather than a Paragraph.
        found_images = [block for block in parsed if isinstance(block, Image)]
        self.assertGreater(len(found_images), 0, "Should have at least one Image block")

        first_image = found_images[0]
        self.assertEqual(first_image.source, "cover.jpg")
        self.assertEqual(first_image.alt_text, "Book Cover")

    def test_paragraph_with_multiple_images(self):
        """Two images in one paragraph produce two Image blocks."""
        parsed = parse_html_string(
            '<p><img src="img1.jpg" alt="First"/><img src="img2.jpg" alt="Second"/></p>',
            base_font=self.base_font,
        )
        found_images = [block for block in parsed if isinstance(block, Image)]
        self.assertEqual(len(found_images), 2, "Should have two Image blocks")

        # Both sources must be present; their order is not checked.
        seen_sources = [image.source for image in found_images]
        self.assertIn("img1.jpg", seen_sources)
        self.assertIn("img2.jpg", seen_sources)

    def test_paragraph_with_text_and_image(self):
        """Text mixed with an image yields both a Paragraph and an Image."""
        parsed = parse_html_string(
            '<p>Some text before <img src="inline.jpg" alt="Inline"/> and after</p>',
            base_font=self.base_font,
        )
        text_blocks = [block for block in parsed if isinstance(block, Paragraph)]
        image_blocks = [block for block in parsed if isinstance(block, Image)]
        self.assertGreater(len(text_blocks), 0, "Should have a Paragraph block for text")
        self.assertGreater(len(image_blocks), 0, "Should have an Image block")

        self.assertEqual(image_blocks[0].source, "inline.jpg")

        # At least one paragraph exists here (asserted above), so the
        # surrounding text must have produced words in the first one.
        extracted_words = list(text_blocks[0].words_iter())
        self.assertGreater(len(extracted_words), 0, "Paragraph should have words")

    def test_regular_paragraph_still_works(self):
        """A paragraph without images is still parsed as one Paragraph."""
        parsed = parse_html_string(
            '<p>Just regular text without any images.</p>',
            base_font=self.base_font,
        )
        self.assertEqual(len(parsed), 1, "Should have exactly one block")
        self.assertIsInstance(parsed[0], Paragraph, "Should be a Paragraph block")

        stray_images = [block for block in parsed if isinstance(block, Image)]
        self.assertEqual(len(stray_images), 0, "Should have no Image blocks")

    def test_image_with_width_and_height(self):
        """The width and height attributes end up on the Image block."""
        parsed = parse_html_string(
            '<p><img src="sized.jpg" alt="Sized Image" width="400" height="300"/></p>',
            base_font=self.base_font,
        )
        found_images = [block for block in parsed if isinstance(block, Image)]
        self.assertEqual(len(found_images), 1, "Should have one Image block")

        sized_image = found_images[0]
        self.assertEqual(sized_image.width, 400)
        self.assertEqual(sized_image.height, 300)

    def test_nested_paragraph_with_image_in_span(self):
        """An image wrapped in an inline element is still extracted."""
        parsed = parse_html_string(
            '<p><span><img src="nested.jpg" alt="Nested"/></span></p>',
            base_font=self.base_font,
        )
        found_images = [block for block in parsed if isinstance(block, Image)]
        self.assertGreater(len(found_images), 0, "Should find image even when nested")

        self.assertEqual(found_images[0].source, "nested.jpg")
 # Run the unittest runner when this module is executed as a script.
 if __name__ == '__main__':
    unittest.main()