Fix missing images in paragraphs
This commit is contained in:
parent
40c1b913ec
commit
9fb6792e10
@ -485,8 +485,65 @@ def process_element(
|
||||
# Union[Block, List[Block], None]
|
||||
|
||||
|
||||
def paragraph_handler(element: Tag, context: StyleContext) -> Paragraph:
|
||||
"""Handle <p> elements."""
|
||||
def paragraph_handler(element: Tag, context: StyleContext) -> Union[Paragraph, List[Block], Image]:
|
||||
"""
|
||||
Handle <p> elements.
|
||||
|
||||
Special handling for paragraphs containing images:
|
||||
- If the paragraph contains only an image (common in EPUBs), return the image block
|
||||
- If the paragraph contains images mixed with text, split into separate blocks
|
||||
- Otherwise, return a normal paragraph with text content
|
||||
"""
|
||||
# Check if paragraph contains any img tags (including nested ones)
|
||||
img_tags = element.find_all('img')
|
||||
|
||||
if img_tags:
|
||||
# Paragraph contains images - need special handling
|
||||
blocks = []
|
||||
|
||||
# Check if this is an image-only paragraph (very common in EPUBs)
|
||||
# Get text content without the img tags
|
||||
text_content = element.get_text(strip=True)
|
||||
|
||||
if not text_content or len(text_content.strip()) == 0:
|
||||
# Image-only paragraph - return just the image(s)
|
||||
for img_tag in img_tags:
|
||||
child_context = apply_element_styling(context, img_tag)
|
||||
img_block = image_handler(img_tag, child_context)
|
||||
if img_block:
|
||||
blocks.append(img_block)
|
||||
|
||||
# Return single image or list of images
|
||||
if len(blocks) == 1:
|
||||
return blocks[0]
|
||||
return blocks if blocks else Paragraph(context.font)
|
||||
|
||||
# Mixed content - paragraph has both text and images
|
||||
# Process children in order to preserve structure
|
||||
for child in element.children:
|
||||
if isinstance(child, Tag):
|
||||
if child.name == 'img':
|
||||
# Add the image as a separate block
|
||||
child_context = apply_element_styling(context, child)
|
||||
img_block = image_handler(child, child_context)
|
||||
if img_block:
|
||||
blocks.append(img_block)
|
||||
else:
|
||||
# Process other inline elements as part of text
|
||||
# This will be handled by extract_text_content below
|
||||
pass
|
||||
|
||||
# Also add a paragraph with the text content
|
||||
paragraph = Paragraph(context.font)
|
||||
words = extract_text_content(element, context)
|
||||
if words:
|
||||
for word in words:
|
||||
paragraph.add_word(word)
|
||||
blocks.insert(0, paragraph) # Text comes before images
|
||||
|
||||
return blocks if blocks else Paragraph(context.font)
|
||||
|
||||
# No images - normal paragraph handling
|
||||
paragraph = Paragraph(context.font)
|
||||
words = extract_text_content(element, context)
|
||||
for word in words:
|
||||
|
||||
@ -13,9 +13,11 @@ from pathlib import Path
|
||||
|
||||
from .ereader_layout import RenderingPosition, ChapterNavigator, ChapterInfo
|
||||
from .page_buffer import BufferedPageRenderer
|
||||
from pyWebLayout.abstract.block import Block, HeadingLevel
|
||||
from pyWebLayout.abstract.block import Block, HeadingLevel, Image, BlockType
|
||||
from pyWebLayout.concrete.page import Page
|
||||
from pyWebLayout.concrete.image import RenderableImage
|
||||
from pyWebLayout.style.page_style import PageStyle
|
||||
from pyWebLayout.layout.document_layouter import image_layouter
|
||||
|
||||
|
||||
class BookmarkManager:
|
||||
@ -194,6 +196,10 @@ class EreaderLayoutManager:
|
||||
self.current_position = RenderingPosition()
|
||||
self.font_scale = 1.0
|
||||
|
||||
# Cover page handling
|
||||
self._has_cover = self._detect_cover()
|
||||
self._on_cover_page = self._has_cover # Start on cover if one exists
|
||||
|
||||
# Page position history for fast backward navigation
|
||||
# List of (position, font_scale) tuples representing the start of each page visited
|
||||
self._page_history: List[Tuple[RenderingPosition, float]] = []
|
||||
@ -203,6 +209,7 @@ class EreaderLayoutManager:
|
||||
saved_position = self.bookmark_manager.load_reading_position()
|
||||
if saved_position:
|
||||
self.current_position = saved_position
|
||||
self._on_cover_page = False # If we have a saved position, we're past the cover
|
||||
|
||||
# Callbacks for UI updates
|
||||
self.position_changed_callback: Optional[Callable[[
|
||||
@ -220,6 +227,69 @@ class EreaderLayoutManager:
|
||||
"""Set callback for chapter changes"""
|
||||
self.chapter_changed_callback = callback
|
||||
|
||||
def _detect_cover(self) -> bool:
|
||||
"""
|
||||
Detect if the document has a cover page.
|
||||
|
||||
A cover is detected if:
|
||||
1. The first block is an Image block, OR
|
||||
2. The document has cover metadata (future enhancement)
|
||||
|
||||
Returns:
|
||||
True if a cover page should be rendered
|
||||
"""
|
||||
if not self.blocks:
|
||||
return False
|
||||
|
||||
# Check if first block is an image - treat it as a cover
|
||||
first_block = self.blocks[0]
|
||||
if isinstance(first_block, Image):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def _render_cover_page(self) -> Page:
    """
    Render a dedicated cover page.

    The cover page displays the first image block (if it exists) using
    the standard image layouter, sized to the page dimensions minus the
    page border on each side.

    Returns:
        The rendered cover page. A blank page is returned when the first
        block is not an Image, or when laying out the image fails.
    """
    # Create a new page for the cover
    page = Page(self.page_size, self.page_style)

    if not self.blocks or not isinstance(self.blocks[0], Image):
        # No cover image, return blank page
        return page

    cover_image_block = self.blocks[0]

    # Best effort: a cover that cannot be laid out must not prevent the
    # reader from opening, so failures are reported and the (possibly
    # blank) page is still returned.
    try:
        # Use full page dimensions (minus borders) for the cover
        border = self.page_style.border_width
        max_width = self.page_size[0] - 2 * border
        max_height = self.page_size[1] - 2 * border

        # Layout the image on the page
        success = image_layouter(
            image=cover_image_block,
            page=page,
            max_width=max_width,
            max_height=max_height
        )

        if not success:
            print("Warning: Failed to layout cover image")

    except Exception as e:
        # If image loading fails, just return the blank page
        print(f"Warning: Failed to load cover image: {e}")

    return page
|
||||
|
||||
def _notify_position_changed(self):
|
||||
"""Notify UI of position change"""
|
||||
if self.position_changed_callback:
|
||||
@ -238,9 +308,16 @@ class EreaderLayoutManager:
|
||||
"""
|
||||
Get the page at the current reading position.
|
||||
|
||||
If on the cover page, returns the rendered cover.
|
||||
Otherwise, returns the regular content page.
|
||||
|
||||
Returns:
|
||||
Rendered page
|
||||
"""
|
||||
# Check if we're on the cover page
|
||||
if self._on_cover_page and self._has_cover:
|
||||
return self._render_cover_page()
|
||||
|
||||
page, _ = self.renderer.render_page(self.current_position, self.font_scale)
|
||||
return page
|
||||
|
||||
@ -248,9 +325,23 @@ class EreaderLayoutManager:
|
||||
"""
|
||||
Advance to the next page.
|
||||
|
||||
If currently on the cover page, advances to the first content page.
|
||||
Otherwise, advances to the next content page.
|
||||
|
||||
Returns:
|
||||
Next page or None if at end of document
|
||||
"""
|
||||
# Special case: transitioning from cover to first content page
|
||||
if self._on_cover_page and self._has_cover:
|
||||
self._on_cover_page = False
|
||||
# If first block is an image (the cover), skip it and start from block 1
|
||||
if self.blocks and isinstance(self.blocks[0], Image):
|
||||
self.current_position = RenderingPosition(chapter_index=0, block_index=1)
|
||||
else:
|
||||
self.current_position = RenderingPosition()
|
||||
self._notify_position_changed()
|
||||
return self.get_current_page()
|
||||
|
||||
# Save current position to history before moving forward
|
||||
self._add_to_history(self.current_position, self.font_scale)
|
||||
|
||||
@ -271,10 +362,21 @@ class EreaderLayoutManager:
|
||||
|
||||
Uses cached page history for instant navigation when available,
|
||||
falls back to iterative refinement algorithm when needed.
|
||||
Can navigate back to the cover page if it exists.
|
||||
|
||||
Returns:
|
||||
Previous page or None if at beginning of document
|
||||
Previous page or None if at beginning of document (or on cover)
|
||||
"""
|
||||
# Special case: if at the beginning of content and there's a cover, go back to it
|
||||
if self._has_cover and self._is_at_beginning() and not self._on_cover_page:
|
||||
self._on_cover_page = True
|
||||
self._notify_position_changed()
|
||||
return self.get_current_page()
|
||||
|
||||
# Can't go before the cover
|
||||
if self._on_cover_page:
|
||||
return None
|
||||
|
||||
if self._is_at_beginning():
|
||||
return None
|
||||
|
||||
@ -303,9 +405,17 @@ class EreaderLayoutManager:
|
||||
return None # At beginning of document
|
||||
|
||||
def _is_at_beginning(self) -> bool:
|
||||
"""Check if we're at the beginning of the document"""
|
||||
"""
|
||||
Check if we're at the beginning of the document content.
|
||||
|
||||
If a cover exists (first block is an Image), the beginning of content
|
||||
is at block_index=1. Otherwise, it's at block_index=0.
|
||||
"""
|
||||
# Determine the first content block index
|
||||
first_content_block = 1 if (self._has_cover and self.blocks and isinstance(self.blocks[0], Image)) else 0
|
||||
|
||||
return (self.current_position.chapter_index == 0 and
|
||||
self.current_position.block_index == 0 and
|
||||
self.current_position.block_index == first_content_block and
|
||||
self.current_position.word_index == 0)
|
||||
|
||||
def jump_to_position(self, position: RenderingPosition) -> Page:
|
||||
@ -319,6 +429,7 @@ class EreaderLayoutManager:
|
||||
Page at the new position
|
||||
"""
|
||||
self.current_position = position
|
||||
self._on_cover_page = False # Jumping to a position means we're past the cover
|
||||
self._notify_position_changed()
|
||||
return self.get_current_page()
|
||||
|
||||
@ -636,6 +747,38 @@ class EreaderLayoutManager:
|
||||
|
||||
return current_block / max(1, total_blocks - 1)
|
||||
|
||||
def has_cover(self) -> bool:
    """
    Check if the document has a cover page.

    Returns:
        True if a cover page is available
    """
    return self._has_cover
|
||||
|
||||
def is_on_cover(self) -> bool:
    """
    Check if currently viewing the cover page.

    Returns:
        True if on the cover page
    """
    return self._on_cover_page
|
||||
|
||||
def jump_to_cover(self) -> Optional[Page]:
    """
    Jump to the cover page if one exists.

    Marks the manager as being on the cover, notifies the
    position-changed listener, and returns the page produced by
    ``get_current_page()``. State is left untouched when there is
    no cover.

    Returns:
        Cover page or None if no cover exists
    """
    if not self._has_cover:
        return None

    self._on_cover_page = True
    self._notify_position_changed()
    return self.get_current_page()
|
||||
|
||||
def get_position_info(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Get detailed information about the current position.
|
||||
@ -647,6 +790,8 @@ class EreaderLayoutManager:
|
||||
|
||||
return {
|
||||
'position': self.current_position.to_dict(),
|
||||
'on_cover': self._on_cover_page,
|
||||
'has_cover': self._has_cover,
|
||||
'chapter': {
|
||||
'title': current_chapter.title if current_chapter else None,
|
||||
'level': current_chapter.level if current_chapter else None,
|
||||
|
||||
@ -7,7 +7,7 @@ including styled content within paragraphs and block-level elements.
|
||||
|
||||
import unittest
|
||||
from pyWebLayout.io.readers.html_extraction import parse_html_string
|
||||
from pyWebLayout.abstract.block import Paragraph, Heading, HeadingLevel, Quote, CodeBlock, HList, ListStyle, Table
|
||||
from pyWebLayout.abstract.block import Paragraph, Heading, HeadingLevel, Quote, CodeBlock, HList, ListStyle, Table, Image
|
||||
from pyWebLayout.abstract.document import Document
|
||||
from pyWebLayout.style import Font, FontWeight, FontStyle, TextDecoration
|
||||
|
||||
@ -585,5 +585,103 @@ class TestHTMLFontRegistryIntegration(unittest.TestCase):
|
||||
"Should create separate styles for style combinations")
|
||||
|
||||
|
||||
class TestHTMLImagesInParagraphs(unittest.TestCase):
    """Test cases for handling images inside paragraph tags."""

    def setUp(self):
        """Set up test fixtures."""
        self.base_font = Font(font_size=14)

    def test_image_only_paragraph(self):
        """Test paragraph containing only an image (common in EPUBs)."""
        html = '<p><img src="cover.jpg" alt="Book Cover"/></p>'
        blocks = parse_html_string(html, base_font=self.base_font)

        # Should parse as an Image block, not a Paragraph
        self.assertGreater(len(blocks), 0, "Should parse at least one block")

        # Check that we have an Image block
        image_blocks = [b for b in blocks if isinstance(b, Image)]
        self.assertGreater(len(image_blocks), 0, "Should have at least one Image block")

        # Verify image properties
        img = image_blocks[0]
        self.assertEqual(img.source, "cover.jpg")
        self.assertEqual(img.alt_text, "Book Cover")

    def test_paragraph_with_multiple_images(self):
        """Test paragraph with multiple images."""
        html = '<p><img src="img1.jpg" alt="First"/><img src="img2.jpg" alt="Second"/></p>'
        blocks = parse_html_string(html, base_font=self.base_font)

        # Should have multiple Image blocks
        image_blocks = [b for b in blocks if isinstance(b, Image)]
        self.assertEqual(len(image_blocks), 2, "Should have two Image blocks")

        # Verify both images were parsed
        sources = [img.source for img in image_blocks]
        self.assertIn("img1.jpg", sources)
        self.assertIn("img2.jpg", sources)

    def test_paragraph_with_text_and_image(self):
        """Test paragraph with mixed text and image content."""
        html = '<p>Some text before <img src="inline.jpg" alt="Inline"/> and after</p>'
        blocks = parse_html_string(html, base_font=self.base_font)

        # Should have both paragraph and image blocks
        paragraphs = [b for b in blocks if isinstance(b, Paragraph)]
        images = [b for b in blocks if isinstance(b, Image)]

        self.assertGreater(len(paragraphs), 0, "Should have a Paragraph block for text")
        self.assertGreater(len(images), 0, "Should have an Image block")

        # Verify image was parsed
        self.assertEqual(images[0].source, "inline.jpg")

        # Verify text was extracted (should have words like "Some", "text", etc.).
        # The assertion above guarantees at least one paragraph exists.
        words = list(paragraphs[0].words_iter())
        self.assertGreater(len(words), 0, "Paragraph should have words")

    def test_regular_paragraph_still_works(self):
        """Test that regular paragraphs without images still work correctly."""
        html = '<p>Just regular text without any images.</p>'
        blocks = parse_html_string(html, base_font=self.base_font)

        # Should be exactly one Paragraph block
        self.assertEqual(len(blocks), 1, "Should have exactly one block")
        self.assertIsInstance(blocks[0], Paragraph, "Should be a Paragraph block")

        # Should not have any Image blocks
        image_blocks = [b for b in blocks if isinstance(b, Image)]
        self.assertEqual(len(image_blocks), 0, "Should have no Image blocks")

    def test_image_with_width_and_height(self):
        """Test image parsing with width and height attributes."""
        html = '<p><img src="sized.jpg" alt="Sized Image" width="400" height="300"/></p>'
        blocks = parse_html_string(html, base_font=self.base_font)

        # Should have an Image block
        image_blocks = [b for b in blocks if isinstance(b, Image)]
        self.assertEqual(len(image_blocks), 1, "Should have one Image block")

        # Verify dimensions were parsed
        img = image_blocks[0]
        self.assertEqual(img.width, 400)
        self.assertEqual(img.height, 300)

    def test_nested_paragraph_with_image_in_span(self):
        """Test image inside nested inline elements."""
        html = '<p><span><img src="nested.jpg" alt="Nested"/></span></p>'
        blocks = parse_html_string(html, base_font=self.base_font)

        # Should still extract the image
        image_blocks = [b for b in blocks if isinstance(b, Image)]
        self.assertGreater(len(image_blocks), 0, "Should find image even when nested")

        # Verify image was parsed correctly
        self.assertEqual(image_blocks[0].source, "nested.jpg")
|
||||
|
||||
|
||||
# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user