Fix missing images in paragraphs and add cover page handling

2025-11-09 22:25:23 +01:00 · 2025-11-09 22:25:23 +01:00 · 9fb6792e10
commit 9fb6792e10
parent 40c1b913ec
3 changed files with 307 additions and 7 deletions
--- a/pyWebLayout/io/readers/html_extraction.py
+++ b/pyWebLayout/io/readers/html_extraction.py
@@ -485,8 +485,65 @@ def process_element(
 # Union[Block, List[Block], None]


-def paragraph_handler(element: Tag, context: StyleContext) -> Paragraph:
-    """Handle <p> elements."""
+def paragraph_handler(element: Tag, context: StyleContext) -> Union[Paragraph, List[Block], Image]:
+    """
+    Handle <p> elements.
+
+    Special handling for paragraphs containing images:
+    - If the paragraph contains only an image (common in EPUBs), return the image block
+    - If the paragraph contains images mixed with text, split into separate blocks
+    - Otherwise, return a normal paragraph with text content
+    """
+    # Check if paragraph contains any img tags (including nested ones)
+    img_tags = element.find_all('img')
+
+    if img_tags:
+        # Paragraph contains images - need special handling
+        blocks = []
+
+        # Check if this is an image-only paragraph (very common in EPUBs)
+        # Get text content without the img tags
+        text_content = element.get_text(strip=True)
+
+        if not text_content or len(text_content.strip()) == 0:
+            # Image-only paragraph - return just the image(s)
+            for img_tag in img_tags:
+                child_context = apply_element_styling(context, img_tag)
+                img_block = image_handler(img_tag, child_context)
+                if img_block:
+                    blocks.append(img_block)
+
+            # Return single image or list of images
+            if len(blocks) == 1:
+                return blocks[0]
+            return blocks if blocks else Paragraph(context.font)
+
+        # Mixed content - paragraph has both text and images
+        # Process children in order to preserve structure
+        for child in element.children:
+            if isinstance(child, Tag):
+                if child.name == 'img':
+                    # Add the image as a separate block
+                    child_context = apply_element_styling(context, child)
+                    img_block = image_handler(child, child_context)
+                    if img_block:
+                        blocks.append(img_block)
+                else:
+                    # Process other inline elements as part of text
+                    # This will be handled by extract_text_content below
+                    pass
+
+        # Also add a paragraph with the text content
+        paragraph = Paragraph(context.font)
+        words = extract_text_content(element, context)
+        if words:
+            for word in words:
+                paragraph.add_word(word)
+            blocks.insert(0, paragraph)  # Text comes before images
+
+        return blocks if blocks else Paragraph(context.font)
+
+    # No images - normal paragraph handling
    paragraph = Paragraph(context.font)
    words = extract_text_content(element, context)
    for word in words:
--- a/pyWebLayout/layout/ereader_manager.py
+++ b/pyWebLayout/layout/ereader_manager.py
@@ -13,9 +13,11 @@ from pathlib import Path

 from .ereader_layout import RenderingPosition, ChapterNavigator, ChapterInfo
 from .page_buffer import BufferedPageRenderer
-from pyWebLayout.abstract.block import Block, HeadingLevel
+from pyWebLayout.abstract.block import Block, HeadingLevel, Image, BlockType
 from pyWebLayout.concrete.page import Page
+from pyWebLayout.concrete.image import RenderableImage
 from pyWebLayout.style.page_style import PageStyle
+from pyWebLayout.layout.document_layouter import image_layouter


 class BookmarkManager:
@@ -194,6 +196,10 @@ class EreaderLayoutManager:
        self.current_position = RenderingPosition()
        self.font_scale = 1.0

+        # Cover page handling
+        self._has_cover = self._detect_cover()
+        self._on_cover_page = self._has_cover  # Start on cover if one exists
+
        # Page position history for fast backward navigation
        # List of (position, font_scale) tuples representing the start of each page visited
        self._page_history: List[Tuple[RenderingPosition, float]] = []
@@ -203,6 +209,7 @@ class EreaderLayoutManager:
        saved_position = self.bookmark_manager.load_reading_position()
        if saved_position:
            self.current_position = saved_position
+            self._on_cover_page = False  # If we have a saved position, we're past the cover

        # Callbacks for UI updates
        self.position_changed_callback: Optional[Callable[[
@@ -220,6 +227,69 @@ class EreaderLayoutManager:
        """Set callback for chapter changes"""
        self.chapter_changed_callback = callback

+    def _detect_cover(self) -> bool:
+        """
+        Detect if the document has a cover page.
+
+        A cover is detected if:
+        1. The first block is an Image block, OR
+        2. The document has cover metadata (future enhancement)
+
+        Returns:
+            True if a cover page should be rendered
+        """
+        if not self.blocks:
+            return False
+
+        # Check if first block is an image - treat it as a cover
+        first_block = self.blocks[0]
+        if isinstance(first_block, Image):
+            return True
+
+        return False
+
+    def _render_cover_page(self) -> Page:
+        """
+        Render a dedicated cover page.
+
+        The cover page displays the first image block (if it exists)
+        using the standard image layouter with maximum dimensions to fill the page.
+
+        Returns:
+            Rendered cover page
+        """
+        # Create a new page for the cover
+        page = Page(self.page_size, self.page_style)
+
+        if not self.blocks or not isinstance(self.blocks[0], Image):
+            # No cover image, return blank page
+            return page
+
+        cover_image_block = self.blocks[0]
+
+        # Use the image layouter to render the cover image
+        # Use full page dimensions (minus borders/padding) for cover
+        try:
+            max_width = self.page_size[0] - 2 * self.page_style.border_width
+            max_height = self.page_size[1] - 2 * self.page_style.border_width
+
+            # Layout the image on the page
+            success = image_layouter(
+                image=cover_image_block,
+                page=page,
+                max_width=max_width,
+                max_height=max_height
+            )
+
+            if not success:
+                print("Warning: Failed to layout cover image")
+
+        except Exception as e:
+            # If image loading fails, just return the blank page
+            print(f"Warning: Failed to load cover image: {e}")
+
+        return page
+
    def _notify_position_changed(self):
        """Notify UI of position change"""
        if self.position_changed_callback:
@@ -238,9 +308,16 @@ class EreaderLayoutManager:
        """
        Get the page at the current reading position.

+        If on the cover page, returns the rendered cover.
+        Otherwise, returns the regular content page.
+
        Returns:
            Rendered page
        """
+        # Check if we're on the cover page
+        if self._on_cover_page and self._has_cover:
+            return self._render_cover_page()
+
        page, _ = self.renderer.render_page(self.current_position, self.font_scale)
        return page

@@ -248,9 +325,23 @@ class EreaderLayoutManager:
        """
        Advance to the next page.

+        If currently on the cover page, advances to the first content page.
+        Otherwise, advances to the next content page.
+
        Returns:
            Next page or None if at end of document
        """
+        # Special case: transitioning from cover to first content page
+        if self._on_cover_page and self._has_cover:
+            self._on_cover_page = False
+            # If first block is an image (the cover), skip it and start from block 1
+            if self.blocks and isinstance(self.blocks[0], Image):
+                self.current_position = RenderingPosition(chapter_index=0, block_index=1)
+            else:
+                self.current_position = RenderingPosition()
+            self._notify_position_changed()
+            return self.get_current_page()
+
        # Save current position to history before moving forward
        self._add_to_history(self.current_position, self.font_scale)

@@ -271,10 +362,21 @@ class EreaderLayoutManager:

        Uses cached page history for instant navigation when available,
        falls back to iterative refinement algorithm when needed.
+        Can navigate back to the cover page if it exists.

        Returns:
-            Previous page or None if at beginning of document
+            Previous page or None if at beginning of document (or on cover)
        """
+        # Special case: if at the beginning of content and there's a cover, go back to it
+        if self._has_cover and self._is_at_beginning() and not self._on_cover_page:
+            self._on_cover_page = True
+            self._notify_position_changed()
+            return self.get_current_page()
+
+        # Can't go before the cover
+        if self._on_cover_page:
+            return None
+
        if self._is_at_beginning():
            return None

@@ -303,9 +405,17 @@ class EreaderLayoutManager:
        return None  # At beginning of document

    def _is_at_beginning(self) -> bool:
-        """Check if we're at the beginning of the document"""
+        """
+        Check if we're at the beginning of the document content.
+
+        If a cover exists (first block is an Image), the beginning of content
+        is at block_index=1. Otherwise, it's at block_index=0.
+        """
+        # Determine the first content block index
+        first_content_block = 1 if (self._has_cover and self.blocks and isinstance(self.blocks[0], Image)) else 0
+
        return (self.current_position.chapter_index == 0 and
-                self.current_position.block_index == 0 and
+                self.current_position.block_index == first_content_block and
                self.current_position.word_index == 0)

    def jump_to_position(self, position: RenderingPosition) -> Page:
@@ -319,6 +429,7 @@ class EreaderLayoutManager:
            Page at the new position
        """
        self.current_position = position
+        self._on_cover_page = False  # Jumping to a position means we're past the cover
        self._notify_position_changed()
        return self.get_current_page()

@@ -636,6 +747,38 @@ class EreaderLayoutManager:

        return current_block / max(1, total_blocks - 1)

+    def has_cover(self) -> bool:
+        """
+        Check if the document has a cover page.
+
+        Returns:
+            True if a cover page is available
+        """
+        return self._has_cover
+
+    def is_on_cover(self) -> bool:
+        """
+        Check if currently viewing the cover page.
+
+        Returns:
+            True if on the cover page
+        """
+        return self._on_cover_page
+
+    def jump_to_cover(self) -> Optional[Page]:
+        """
+        Jump to the cover page if one exists.
+
+        Returns:
+            Cover page or None if no cover exists
+        """
+        if not self._has_cover:
+            return None
+
+        self._on_cover_page = True
+        self._notify_position_changed()
+        return self.get_current_page()
+
    def get_position_info(self) -> Dict[str, Any]:
        """
        Get detailed information about the current position.
@@ -647,6 +790,8 @@ class EreaderLayoutManager:

        return {
            'position': self.current_position.to_dict(),
+            'on_cover': self._on_cover_page,
+            'has_cover': self._has_cover,
            'chapter': {
                'title': current_chapter.title if current_chapter else None,
                'level': current_chapter.level if current_chapter else None,
--- a/tests/io_tests/test_html_extraction.py
+++ b/tests/io_tests/test_html_extraction.py
@@ -7,7 +7,7 @@ including styled content within paragraphs and block-level elements.

 import unittest
 from pyWebLayout.io.readers.html_extraction import parse_html_string
-from pyWebLayout.abstract.block import Paragraph, Heading, HeadingLevel, Quote, CodeBlock, HList, ListStyle, Table
+from pyWebLayout.abstract.block import Paragraph, Heading, HeadingLevel, Quote, CodeBlock, HList, ListStyle, Table, Image
 from pyWebLayout.abstract.document import Document
 from pyWebLayout.style import Font, FontWeight, FontStyle, TextDecoration

@@ -585,5 +585,103 @@ class TestHTMLFontRegistryIntegration(unittest.TestCase):
                           "Should create separate styles for style combinations")


+class TestHTMLImagesInParagraphs(unittest.TestCase):
+    """Test cases for handling images inside paragraph tags."""
+
+    def setUp(self):
+        """Set up test fixtures."""
+        self.base_font = Font(font_size=14)
+
+    def test_image_only_paragraph(self):
+        """Test paragraph containing only an image (common in EPUBs)."""
+        html = '<p><img src="cover.jpg" alt="Book Cover"/></p>'
+        blocks = parse_html_string(html, base_font=self.base_font)
+
+        # Should parse as an Image block, not a Paragraph
+        self.assertGreater(len(blocks), 0, "Should parse at least one block")
+
+        # Check that we have an Image block
+        image_blocks = [b for b in blocks if isinstance(b, Image)]
+        self.assertGreater(len(image_blocks), 0, "Should have at least one Image block")
+
+        # Verify image properties
+        img = image_blocks[0]
+        self.assertEqual(img.source, "cover.jpg")
+        self.assertEqual(img.alt_text, "Book Cover")
+
+    def test_paragraph_with_multiple_images(self):
+        """Test paragraph with multiple images."""
+        html = '<p><img src="img1.jpg" alt="First"/><img src="img2.jpg" alt="Second"/></p>'
+        blocks = parse_html_string(html, base_font=self.base_font)
+
+        # Should have multiple Image blocks
+        image_blocks = [b for b in blocks if isinstance(b, Image)]
+        self.assertEqual(len(image_blocks), 2, "Should have two Image blocks")
+
+        # Verify both images were parsed
+        sources = [img.source for img in image_blocks]
+        self.assertIn("img1.jpg", sources)
+        self.assertIn("img2.jpg", sources)
+
+    def test_paragraph_with_text_and_image(self):
+        """Test paragraph with mixed text and image content."""
+        html = '<p>Some text before <img src="inline.jpg" alt="Inline"/> and after</p>'
+        blocks = parse_html_string(html, base_font=self.base_font)
+
+        # Should have both paragraph and image blocks
+        paragraphs = [b for b in blocks if isinstance(b, Paragraph)]
+        images = [b for b in blocks if isinstance(b, Image)]
+
+        self.assertGreater(len(paragraphs), 0, "Should have a Paragraph block for text")
+        self.assertGreater(len(images), 0, "Should have an Image block")
+
+        # Verify image was parsed
+        self.assertEqual(images[0].source, "inline.jpg")
+
+        # Verify text was extracted (should have words like "Some", "text", etc.)
+        if paragraphs:
+            words = list(paragraphs[0].words_iter())
+            self.assertGreater(len(words), 0, "Paragraph should have words")
+
+    def test_regular_paragraph_still_works(self):
+        """Test that regular paragraphs without images still work correctly."""
+        html = '<p>Just regular text without any images.</p>'
+        blocks = parse_html_string(html, base_font=self.base_font)
+
+        # Should be exactly one Paragraph block
+        self.assertEqual(len(blocks), 1, "Should have exactly one block")
+        self.assertIsInstance(blocks[0], Paragraph, "Should be a Paragraph block")
+
+        # Should not have any Image blocks
+        image_blocks = [b for b in blocks if isinstance(b, Image)]
+        self.assertEqual(len(image_blocks), 0, "Should have no Image blocks")
+
+    def test_image_with_width_and_height(self):
+        """Test image parsing with width and height attributes."""
+        html = '<p><img src="sized.jpg" alt="Sized Image" width="400" height="300"/></p>'
+        blocks = parse_html_string(html, base_font=self.base_font)
+
+        # Should have an Image block
+        image_blocks = [b for b in blocks if isinstance(b, Image)]
+        self.assertEqual(len(image_blocks), 1, "Should have one Image block")
+
+        # Verify dimensions were parsed
+        img = image_blocks[0]
+        self.assertEqual(img.width, 400)
+        self.assertEqual(img.height, 300)
+
+    def test_nested_paragraph_with_image_in_span(self):
+        """Test image inside nested inline elements."""
+        html = '<p><span><img src="nested.jpg" alt="Nested"/></span></p>'
+        blocks = parse_html_string(html, base_font=self.base_font)
+
+        # Should still extract the image
+        image_blocks = [b for b in blocks if isinstance(b, Image)]
+        self.assertGreater(len(image_blocks), 0, "Should find image even when nested")
+
+        # Verify image was parsed correctly
+        self.assertEqual(image_blocks[0].source, "nested.jpg")
+
+
 if __name__ == '__main__':
    unittest.main()