fix missing images in paras
This commit is contained in:
parent
40c1b913ec
commit
9fb6792e10
@ -485,8 +485,65 @@ def process_element(
|
|||||||
# Union[Block, List[Block], None]
|
# Union[Block, List[Block], None]
|
||||||
|
|
||||||
|
|
||||||
def paragraph_handler(element: Tag, context: StyleContext) -> Paragraph:
|
def paragraph_handler(element: Tag, context: StyleContext) -> Union[Paragraph, List[Block], Image]:
|
||||||
"""Handle <p> elements."""
|
"""
|
||||||
|
Handle <p> elements.
|
||||||
|
|
||||||
|
Special handling for paragraphs containing images:
|
||||||
|
- If the paragraph contains only an image (common in EPUBs), return the image block
|
||||||
|
- If the paragraph contains images mixed with text, split into separate blocks
|
||||||
|
- Otherwise, return a normal paragraph with text content
|
||||||
|
"""
|
||||||
|
# Check if paragraph contains any img tags (including nested ones)
|
||||||
|
img_tags = element.find_all('img')
|
||||||
|
|
||||||
|
if img_tags:
|
||||||
|
# Paragraph contains images - need special handling
|
||||||
|
blocks = []
|
||||||
|
|
||||||
|
# Check if this is an image-only paragraph (very common in EPUBs)
|
||||||
|
# Get text content without the img tags
|
||||||
|
text_content = element.get_text(strip=True)
|
||||||
|
|
||||||
|
if not text_content or len(text_content.strip()) == 0:
|
||||||
|
# Image-only paragraph - return just the image(s)
|
||||||
|
for img_tag in img_tags:
|
||||||
|
child_context = apply_element_styling(context, img_tag)
|
||||||
|
img_block = image_handler(img_tag, child_context)
|
||||||
|
if img_block:
|
||||||
|
blocks.append(img_block)
|
||||||
|
|
||||||
|
# Return single image or list of images
|
||||||
|
if len(blocks) == 1:
|
||||||
|
return blocks[0]
|
||||||
|
return blocks if blocks else Paragraph(context.font)
|
||||||
|
|
||||||
|
# Mixed content - paragraph has both text and images
|
||||||
|
# Process children in order to preserve structure
|
||||||
|
for child in element.children:
|
||||||
|
if isinstance(child, Tag):
|
||||||
|
if child.name == 'img':
|
||||||
|
# Add the image as a separate block
|
||||||
|
child_context = apply_element_styling(context, child)
|
||||||
|
img_block = image_handler(child, child_context)
|
||||||
|
if img_block:
|
||||||
|
blocks.append(img_block)
|
||||||
|
else:
|
||||||
|
# Process other inline elements as part of text
|
||||||
|
# This will be handled by extract_text_content below
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Also add a paragraph with the text content
|
||||||
|
paragraph = Paragraph(context.font)
|
||||||
|
words = extract_text_content(element, context)
|
||||||
|
if words:
|
||||||
|
for word in words:
|
||||||
|
paragraph.add_word(word)
|
||||||
|
blocks.insert(0, paragraph) # Text comes before images
|
||||||
|
|
||||||
|
return blocks if blocks else Paragraph(context.font)
|
||||||
|
|
||||||
|
# No images - normal paragraph handling
|
||||||
paragraph = Paragraph(context.font)
|
paragraph = Paragraph(context.font)
|
||||||
words = extract_text_content(element, context)
|
words = extract_text_content(element, context)
|
||||||
for word in words:
|
for word in words:
|
||||||
|
|||||||
@ -13,9 +13,11 @@ from pathlib import Path
|
|||||||
|
|
||||||
from .ereader_layout import RenderingPosition, ChapterNavigator, ChapterInfo
|
from .ereader_layout import RenderingPosition, ChapterNavigator, ChapterInfo
|
||||||
from .page_buffer import BufferedPageRenderer
|
from .page_buffer import BufferedPageRenderer
|
||||||
from pyWebLayout.abstract.block import Block, HeadingLevel
|
from pyWebLayout.abstract.block import Block, HeadingLevel, Image, BlockType
|
||||||
from pyWebLayout.concrete.page import Page
|
from pyWebLayout.concrete.page import Page
|
||||||
|
from pyWebLayout.concrete.image import RenderableImage
|
||||||
from pyWebLayout.style.page_style import PageStyle
|
from pyWebLayout.style.page_style import PageStyle
|
||||||
|
from pyWebLayout.layout.document_layouter import image_layouter
|
||||||
|
|
||||||
|
|
||||||
class BookmarkManager:
|
class BookmarkManager:
|
||||||
@ -194,6 +196,10 @@ class EreaderLayoutManager:
|
|||||||
self.current_position = RenderingPosition()
|
self.current_position = RenderingPosition()
|
||||||
self.font_scale = 1.0
|
self.font_scale = 1.0
|
||||||
|
|
||||||
|
# Cover page handling
|
||||||
|
self._has_cover = self._detect_cover()
|
||||||
|
self._on_cover_page = self._has_cover # Start on cover if one exists
|
||||||
|
|
||||||
# Page position history for fast backward navigation
|
# Page position history for fast backward navigation
|
||||||
# List of (position, font_scale) tuples representing the start of each page visited
|
# List of (position, font_scale) tuples representing the start of each page visited
|
||||||
self._page_history: List[Tuple[RenderingPosition, float]] = []
|
self._page_history: List[Tuple[RenderingPosition, float]] = []
|
||||||
@ -203,6 +209,7 @@ class EreaderLayoutManager:
|
|||||||
saved_position = self.bookmark_manager.load_reading_position()
|
saved_position = self.bookmark_manager.load_reading_position()
|
||||||
if saved_position:
|
if saved_position:
|
||||||
self.current_position = saved_position
|
self.current_position = saved_position
|
||||||
|
self._on_cover_page = False # If we have a saved position, we're past the cover
|
||||||
|
|
||||||
# Callbacks for UI updates
|
# Callbacks for UI updates
|
||||||
self.position_changed_callback: Optional[Callable[[
|
self.position_changed_callback: Optional[Callable[[
|
||||||
@ -220,6 +227,69 @@ class EreaderLayoutManager:
|
|||||||
"""Set callback for chapter changes"""
|
"""Set callback for chapter changes"""
|
||||||
self.chapter_changed_callback = callback
|
self.chapter_changed_callback = callback
|
||||||
|
|
||||||
|
def _detect_cover(self) -> bool:
|
||||||
|
"""
|
||||||
|
Detect if the document has a cover page.
|
||||||
|
|
||||||
|
A cover is detected if:
|
||||||
|
1. The first block is an Image block, OR
|
||||||
|
2. The document has cover metadata (future enhancement)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True if a cover page should be rendered
|
||||||
|
"""
|
||||||
|
if not self.blocks:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Check if first block is an image - treat it as a cover
|
||||||
|
first_block = self.blocks[0]
|
||||||
|
if isinstance(first_block, Image):
|
||||||
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
def _render_cover_page(self) -> Page:
    """
    Render a dedicated cover page.

    A fresh Page is created from this manager's page size and page style.
    If the first block is an Image, it is handed to image_layouter with
    the page dimensions minus twice the border width as the maximum size.
    Layout is best-effort: any failure is logged as a warning and the page
    is returned as it stands instead of raising.

    Returns:
        Rendered cover page (blank when there is no leading Image block)
    """
    # Function-scope import keeps this method self-contained.
    import logging

    logger = logging.getLogger(__name__)

    page = Page(self.page_size, self.page_style)

    # No leading image means nothing to draw: return the blank page.
    if not self.blocks or not isinstance(self.blocks[0], Image):
        return page

    cover_image_block = self.blocks[0]

    try:
        # Leave room for the border on both sides of each axis.
        border_total = 2 * self.page_style.border_width
        max_width = self.page_size[0] - border_total
        max_height = self.page_size[1] - border_total

        success = image_layouter(
            image=cover_image_block,
            page=page,
            max_width=max_width,
            max_height=max_height,
        )
    except Exception as e:
        # Deliberately broad: a broken cover must never prevent the book
        # from opening, so report the problem and fall through.
        logger.warning("Failed to load cover image: %s", e)
    else:
        if not success:
            logger.warning("Failed to layout cover image")

    return page
|
||||||
|
|
||||||
def _notify_position_changed(self):
|
def _notify_position_changed(self):
|
||||||
"""Notify UI of position change"""
|
"""Notify UI of position change"""
|
||||||
if self.position_changed_callback:
|
if self.position_changed_callback:
|
||||||
@ -238,9 +308,16 @@ class EreaderLayoutManager:
|
|||||||
"""
|
"""
|
||||||
Get the page at the current reading position.
|
Get the page at the current reading position.
|
||||||
|
|
||||||
|
If on the cover page, returns the rendered cover.
|
||||||
|
Otherwise, returns the regular content page.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Rendered page
|
Rendered page
|
||||||
"""
|
"""
|
||||||
|
# Check if we're on the cover page
|
||||||
|
if self._on_cover_page and self._has_cover:
|
||||||
|
return self._render_cover_page()
|
||||||
|
|
||||||
page, _ = self.renderer.render_page(self.current_position, self.font_scale)
|
page, _ = self.renderer.render_page(self.current_position, self.font_scale)
|
||||||
return page
|
return page
|
||||||
|
|
||||||
@ -248,9 +325,23 @@ class EreaderLayoutManager:
|
|||||||
"""
|
"""
|
||||||
Advance to the next page.
|
Advance to the next page.
|
||||||
|
|
||||||
|
If currently on the cover page, advances to the first content page.
|
||||||
|
Otherwise, advances to the next content page.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Next page or None if at end of document
|
Next page or None if at end of document
|
||||||
"""
|
"""
|
||||||
|
# Special case: transitioning from cover to first content page
|
||||||
|
if self._on_cover_page and self._has_cover:
|
||||||
|
self._on_cover_page = False
|
||||||
|
# If first block is an image (the cover), skip it and start from block 1
|
||||||
|
if self.blocks and isinstance(self.blocks[0], Image):
|
||||||
|
self.current_position = RenderingPosition(chapter_index=0, block_index=1)
|
||||||
|
else:
|
||||||
|
self.current_position = RenderingPosition()
|
||||||
|
self._notify_position_changed()
|
||||||
|
return self.get_current_page()
|
||||||
|
|
||||||
# Save current position to history before moving forward
|
# Save current position to history before moving forward
|
||||||
self._add_to_history(self.current_position, self.font_scale)
|
self._add_to_history(self.current_position, self.font_scale)
|
||||||
|
|
||||||
@ -271,10 +362,21 @@ class EreaderLayoutManager:
|
|||||||
|
|
||||||
Uses cached page history for instant navigation when available,
|
Uses cached page history for instant navigation when available,
|
||||||
falls back to iterative refinement algorithm when needed.
|
falls back to iterative refinement algorithm when needed.
|
||||||
|
Can navigate back to the cover page if it exists.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Previous page or None if at beginning of document
|
Previous page or None if at beginning of document (or on cover)
|
||||||
"""
|
"""
|
||||||
|
# Special case: if at the beginning of content and there's a cover, go back to it
|
||||||
|
if self._has_cover and self._is_at_beginning() and not self._on_cover_page:
|
||||||
|
self._on_cover_page = True
|
||||||
|
self._notify_position_changed()
|
||||||
|
return self.get_current_page()
|
||||||
|
|
||||||
|
# Can't go before the cover
|
||||||
|
if self._on_cover_page:
|
||||||
|
return None
|
||||||
|
|
||||||
if self._is_at_beginning():
|
if self._is_at_beginning():
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@ -303,9 +405,17 @@ class EreaderLayoutManager:
|
|||||||
return None # At beginning of document
|
return None # At beginning of document
|
||||||
|
|
||||||
def _is_at_beginning(self) -> bool:
|
def _is_at_beginning(self) -> bool:
|
||||||
"""Check if we're at the beginning of the document"""
|
"""
|
||||||
|
Check if we're at the beginning of the document content.
|
||||||
|
|
||||||
|
If a cover exists (first block is an Image), the beginning of content
|
||||||
|
is at block_index=1. Otherwise, it's at block_index=0.
|
||||||
|
"""
|
||||||
|
# Determine the first content block index
|
||||||
|
first_content_block = 1 if (self._has_cover and self.blocks and isinstance(self.blocks[0], Image)) else 0
|
||||||
|
|
||||||
return (self.current_position.chapter_index == 0 and
|
return (self.current_position.chapter_index == 0 and
|
||||||
self.current_position.block_index == 0 and
|
self.current_position.block_index == first_content_block and
|
||||||
self.current_position.word_index == 0)
|
self.current_position.word_index == 0)
|
||||||
|
|
||||||
def jump_to_position(self, position: RenderingPosition) -> Page:
|
def jump_to_position(self, position: RenderingPosition) -> Page:
|
||||||
@ -319,6 +429,7 @@ class EreaderLayoutManager:
|
|||||||
Page at the new position
|
Page at the new position
|
||||||
"""
|
"""
|
||||||
self.current_position = position
|
self.current_position = position
|
||||||
|
self._on_cover_page = False # Jumping to a position means we're past the cover
|
||||||
self._notify_position_changed()
|
self._notify_position_changed()
|
||||||
return self.get_current_page()
|
return self.get_current_page()
|
||||||
|
|
||||||
@ -636,6 +747,38 @@ class EreaderLayoutManager:
|
|||||||
|
|
||||||
return current_block / max(1, total_blocks - 1)
|
return current_block / max(1, total_blocks - 1)
|
||||||
|
|
||||||
|
def has_cover(self) -> bool:
    """
    Tell callers whether this document has a cover page.

    Returns:
        The stored cover flag: True if a cover page is available
    """
    return self._has_cover
|
||||||
|
|
||||||
|
def is_on_cover(self) -> bool:
    """
    Tell callers whether the cover page is the one currently shown.

    Returns:
        The stored on-cover flag: True if on the cover page
    """
    return self._on_cover_page
|
||||||
|
|
||||||
|
def jump_to_cover(self) -> Optional[Page]:
    """
    Switch the view to the cover page, when the document has one.

    When a cover exists, the on-cover flag is set, the position-changed
    notification is fired, and the current page is returned. Without a
    cover, nothing is changed.

    Returns:
        Cover page or None if no cover exists
    """
    if self._has_cover:
        self._on_cover_page = True
        self._notify_position_changed()
        return self.get_current_page()

    return None
|
||||||
|
|
||||||
def get_position_info(self) -> Dict[str, Any]:
|
def get_position_info(self) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Get detailed information about the current position.
|
Get detailed information about the current position.
|
||||||
@ -647,6 +790,8 @@ class EreaderLayoutManager:
|
|||||||
|
|
||||||
return {
|
return {
|
||||||
'position': self.current_position.to_dict(),
|
'position': self.current_position.to_dict(),
|
||||||
|
'on_cover': self._on_cover_page,
|
||||||
|
'has_cover': self._has_cover,
|
||||||
'chapter': {
|
'chapter': {
|
||||||
'title': current_chapter.title if current_chapter else None,
|
'title': current_chapter.title if current_chapter else None,
|
||||||
'level': current_chapter.level if current_chapter else None,
|
'level': current_chapter.level if current_chapter else None,
|
||||||
|
|||||||
@ -7,7 +7,7 @@ including styled content within paragraphs and block-level elements.
|
|||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
from pyWebLayout.io.readers.html_extraction import parse_html_string
|
from pyWebLayout.io.readers.html_extraction import parse_html_string
|
||||||
from pyWebLayout.abstract.block import Paragraph, Heading, HeadingLevel, Quote, CodeBlock, HList, ListStyle, Table
|
from pyWebLayout.abstract.block import Paragraph, Heading, HeadingLevel, Quote, CodeBlock, HList, ListStyle, Table, Image
|
||||||
from pyWebLayout.abstract.document import Document
|
from pyWebLayout.abstract.document import Document
|
||||||
from pyWebLayout.style import Font, FontWeight, FontStyle, TextDecoration
|
from pyWebLayout.style import Font, FontWeight, FontStyle, TextDecoration
|
||||||
|
|
||||||
@ -585,5 +585,103 @@ class TestHTMLFontRegistryIntegration(unittest.TestCase):
|
|||||||
"Should create separate styles for style combinations")
|
"Should create separate styles for style combinations")
|
||||||
|
|
||||||
|
|
||||||
|
class TestHTMLImagesInParagraphs(unittest.TestCase):
    """Check how <img> tags placed inside <p> tags are turned into blocks."""

    def setUp(self):
        """Create the base font shared by every test."""
        self.base_font = Font(font_size=14)

    def _parse(self, html):
        """Parse *html* with the shared base font and return the blocks."""
        return parse_html_string(html, base_font=self.base_font)

    @staticmethod
    def _only(blocks, block_type):
        """Return the members of *blocks* that are instances of *block_type*."""
        return [block for block in blocks if isinstance(block, block_type)]

    def test_image_only_paragraph(self):
        """A paragraph holding nothing but an image (common in EPUBs)."""
        blocks = self._parse('<p><img src="cover.jpg" alt="Book Cover"/></p>')

        # Something must come out of the parser at all.
        self.assertGreater(len(blocks), 0, "Should parse at least one block")

        # The result should contain an Image block, not just a Paragraph.
        found = self._only(blocks, Image)
        self.assertGreater(len(found), 0, "Should have at least one Image block")

        # The attributes of the tag must survive parsing.
        cover = found[0]
        self.assertEqual(cover.source, "cover.jpg")
        self.assertEqual(cover.alt_text, "Book Cover")

    def test_paragraph_with_multiple_images(self):
        """A paragraph holding two images and no text."""
        blocks = self._parse(
            '<p><img src="img1.jpg" alt="First"/><img src="img2.jpg" alt="Second"/></p>'
        )

        found = self._only(blocks, Image)
        self.assertEqual(len(found), 2, "Should have two Image blocks")

        # Both sources must be present; order is not checked.
        seen_sources = [image.source for image in found]
        self.assertIn("img1.jpg", seen_sources)
        self.assertIn("img2.jpg", seen_sources)

    def test_paragraph_with_text_and_image(self):
        """A paragraph mixing text with an inline image."""
        blocks = self._parse(
            '<p>Some text before <img src="inline.jpg" alt="Inline"/> and after</p>'
        )

        text_blocks = self._only(blocks, Paragraph)
        picture_blocks = self._only(blocks, Image)

        self.assertGreater(len(text_blocks), 0, "Should have a Paragraph block for text")
        self.assertGreater(len(picture_blocks), 0, "Should have an Image block")

        # The image keeps its source.
        self.assertEqual(picture_blocks[0].source, "inline.jpg")

        # The text keeps its words; the assertion above guarantees that
        # text_blocks is non-empty here.
        extracted = list(text_blocks[0].words_iter())
        self.assertGreater(len(extracted), 0, "Paragraph should have words")

    def test_regular_paragraph_still_works(self):
        """A paragraph without images must still yield a single Paragraph."""
        blocks = self._parse('<p>Just regular text without any images.</p>')

        self.assertEqual(len(blocks), 1, "Should have exactly one block")
        self.assertIsInstance(blocks[0], Paragraph, "Should be a Paragraph block")

        # No Image block may appear for image-free input.
        self.assertEqual(len(self._only(blocks, Image)), 0, "Should have no Image blocks")

    def test_image_with_width_and_height(self):
        """Explicit width and height attributes are carried onto the Image."""
        blocks = self._parse(
            '<p><img src="sized.jpg" alt="Sized Image" width="400" height="300"/></p>'
        )

        found = self._only(blocks, Image)
        self.assertEqual(len(found), 1, "Should have one Image block")

        sized = found[0]
        self.assertEqual(sized.width, 400)
        self.assertEqual(sized.height, 300)

    def test_nested_paragraph_with_image_in_span(self):
        """An image wrapped in an inline element inside the paragraph."""
        blocks = self._parse('<p><span><img src="nested.jpg" alt="Nested"/></span></p>')

        # The image must be found even though it is not a direct child.
        found = self._only(blocks, Image)
        self.assertGreater(len(found), 0, "Should find image even when nested")

        self.assertEqual(found[0].source, "nested.jpg")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user