Duncan Tourolle 8c35cbf5ce
Some checks failed
Python CI / test (push) Failing after 4m8s
Improved handling of pagination.
2025-06-08 13:29:44 +02:00

678 lines
26 KiB
Python

from typing import List, Tuple, Optional, Dict, Any
import numpy as np
import re
import os
from urllib.parse import urljoin, urlparse
from PIL import Image
from pyWebLayout.core.base import Renderable, Layoutable
from .box import Box
from pyWebLayout.style.layout import Alignment
from .text import Text
from .image import RenderableImage
from .functional import RenderableLink, RenderableButton
from pyWebLayout.abstract.block import Block, Paragraph, Heading, HList, Image as AbstractImage, HeadingLevel, ListStyle
from pyWebLayout.abstract.inline import Word
from pyWebLayout.abstract.functional import Link, LinkType
from pyWebLayout.style.fonts import Font, FontWeight, FontStyle, TextDecoration
from pyWebLayout.typesetting.paragraph_layout import ParagraphLayout, ParagraphLayoutResult
from pyWebLayout.io.readers.html_extraction import parse_html_string
from pyWebLayout.typesetting.document_cursor import DocumentCursor, DocumentPosition
class Container(Box, Layoutable):
    """
    A container that can hold multiple renderable objects and lay them out.

    Children are stacked along one axis (``direction``) inside the padded
    area and aligned on the other axis using the container's horizontal or
    vertical alignment. ``layout()`` writes every child's ``_origin`` in
    coordinates relative to this container's top-left corner, and
    ``render()`` pastes the children at exactly those coordinates.
    """

    # Image modes Pillow accepts as a paste mask. Children rendered in any
    # other mode (for example plain RGB) are pasted without a mask.
    _MASKABLE_MODES = ('1', 'L', 'LA', 'RGBA', 'RGBa')

    def __init__(self, origin, size, direction='vertical', spacing=5,
                 callback=None, sheet=None, mode=None,
                 halign=Alignment.CENTER, valign=Alignment.CENTER,
                 padding: Tuple[int, int, int, int] = (10, 10, 10, 10)):
        """
        Initialize a container.

        Args:
            origin: Top-left corner coordinates
            size: Width and height of the container
            direction: Layout direction ('vertical'; any other value is
                treated as horizontal)
            spacing: Space between elements
            callback: Optional callback function
            sheet: Optional image sheet
            mode: Optional image mode
            halign: Horizontal alignment
            valign: Vertical alignment
            padding: Padding as (top, right, bottom, left)
        """
        super().__init__(origin, size, callback, sheet, mode, halign, valign)
        self._children: List[Renderable] = []
        self._direction = direction
        self._spacing = spacing
        self._padding = padding

    def add_child(self, child: Renderable):
        """
        Add a child element to this container.

        Returns:
            Self for method chaining
        """
        self._children.append(child)
        return self

    @staticmethod
    def _aligned_offset(alignment, start_alignment, end_alignment,
                        start, available, extent):
        """
        Compute where an element of length ``extent`` starts on one axis.

        Args:
            alignment: The container's alignment for this axis
            start_alignment: Alignment value meaning "flush with the start"
            end_alignment: Alignment value meaning "flush with the end"
            start: Offset of the padded area on this axis
            available: Length of the padded area on this axis
            extent: Length of the element on this axis

        Returns:
            The element's offset; any alignment other than the two given
            values centers the element.
        """
        if alignment == start_alignment:
            return start
        if alignment == end_alignment:
            return start + available - extent
        return start + (available - extent) // 2

    def layout(self):
        """
        Layout the children according to the container's direction and spacing.

        Children that expose both ``_size`` and ``_origin`` are positioned
        one after another along the main axis, separated by the spacing,
        starting at the leading padding. Children without those attributes
        are left untouched. Every child that is itself ``Layoutable`` is
        laid out after it has been positioned.
        """
        if not self._children:
            return

        # Get available space after padding
        pad_top, pad_right, pad_bottom, pad_left = self._padding
        inner_width = self._size[0] - pad_left - pad_right
        inner_height = self._size[1] - pad_top - pad_bottom

        stack_vertically = self._direction == 'vertical'
        # Running position along the main axis
        offset = pad_top if stack_vertically else pad_left

        for child in self._children:
            if hasattr(child, '_size') and hasattr(child, '_origin'):
                child_width, child_height = child._size
                if stack_vertically:
                    x_pos = self._aligned_offset(
                        self._halign, Alignment.LEFT, Alignment.RIGHT,
                        pad_left, inner_width, child_width
                    )
                    child._origin = np.array([x_pos, offset])
                    offset += child_height + self._spacing
                else:
                    y_pos = self._aligned_offset(
                        self._valign, Alignment.TOP, Alignment.BOTTOM,
                        pad_top, inner_height, child_height
                    )
                    child._origin = np.array([offset, y_pos])
                    offset += child_width + self._spacing

            # Layout the child if it's layoutable
            if isinstance(child, Layoutable):
                child.layout()

    def render(self) -> Image.Image:
        """
        Render the container with all its children.

        Returns:
            The canvas produced by the parent class with every child that
            has an ``_origin`` pasted onto it.
        """
        # Make sure children are laid out
        self.layout()

        # Create base canvas
        canvas = super().render()

        for child in self._children:
            if not hasattr(child, '_origin'):
                continue
            child_img = child.render()

            # layout() stores child origins relative to this container, so
            # they are used as paste coordinates directly. Subtracting this
            # container's own origin (which is expressed in the parent's
            # coordinates) would shift nested content and clip it.
            position = tuple(int(value) for value in child._origin)

            # Use the child as its own mask only when its mode can act as
            # one; Pillow rejects other modes as a transparency mask.
            if child_img.mode in self._MASKABLE_MODES:
                canvas.paste(child_img, position, child_img)
            else:
                canvas.paste(child_img, position)

        return canvas
class Page(Container):
    """
    Top-level container representing an HTML page.

    A page converts abstract document blocks (paragraphs, headings, lists,
    images) into renderable children, stacks them vertically and renders
    them onto a canvas filled with the background color.
    """

    # Font size per heading level; other levels use the default below.
    _HEADING_FONT_SIZES = {
        HeadingLevel.H1: 24,
        HeadingLevel.H2: 20,
        HeadingLevel.H3: 18,
        HeadingLevel.H4: 16,
        HeadingLevel.H5: 14,
        HeadingLevel.H6: 12
    }
    _DEFAULT_HEADING_FONT_SIZE = 16

    # Line geometry handed to ParagraphLayout when a paragraph is split
    # across pages; one laid-out line consumes height + spacing.
    _SPLIT_LINE_HEIGHT = 20
    _SPLIT_LINE_SPACING = 3

    def __init__(self, size=(800, 600), background_color=(255, 255, 255), mode='RGBA'):
        """
        Initialize a page.

        Args:
            size: Width and height of the page
            background_color: Background color as RGB tuple
            mode: Image mode
        """
        super().__init__(
            origin=(0, 0),
            size=size,
            direction='vertical',
            spacing=10,
            mode=mode,
            halign=Alignment.CENTER,  # Center horizontally to match test expectation
            valign=Alignment.TOP,
            padding=(10, 10, 10, 10)  # Use 10 padding to match test expectation
        )
        self._background_color = background_color
        # Document position this page starts from; set by
        # render_from_cursor() or set_start_position().
        self._start_position: Optional[DocumentPosition] = None

    @staticmethod
    def _height_of(renderable) -> int:
        """Return the renderable's height, or 0 when it exposes no ``_size``."""
        return getattr(renderable, '_size', [0, 0])[1]

    def render_document(self, document, start_block: int = 0, max_blocks: Optional[int] = None) -> 'Page':
        """
        Render blocks from a Document into this page.

        Args:
            document: The Document object to render
            start_block: Which block to start rendering from (for pagination)
            max_blocks: Maximum number of blocks to render (None for all remaining)

        Returns:
            Self for method chaining
        """
        blocks = document.blocks[start_block:]
        if max_blocks is not None:
            blocks = blocks[:max_blocks]
        return self.render_blocks(blocks)

    def render_blocks(self, blocks: List[Block]) -> 'Page':
        """
        Render a list of abstract blocks into this page.

        Existing children are discarded. Blocks whose conversion yields
        nothing are skipped.

        Args:
            blocks: List of Block objects to render

        Returns:
            Self for method chaining
        """
        self._children.clear()
        for block in blocks:
            renderable = self._convert_block_to_renderable(block)
            if renderable is not None:
                self.add_child(renderable)
        return self

    def render_chapter(self, chapter) -> 'Page':
        """
        Render a Chapter into this page.

        Args:
            chapter: The Chapter object to render

        Returns:
            Self for method chaining
        """
        return self.render_blocks(chapter.blocks)

    def render_from_cursor(self, cursor: DocumentCursor, max_height: Optional[int] = None) -> Tuple['Page', DocumentCursor]:
        """
        Render content starting from a document cursor position, filling the page
        and returning the cursor position where the page ends.

        Blocks are added while they fit into ``max_height``; the spacing the
        page inserts between children counts against that budget. A paragraph
        that does not fit is split at a line boundary and the remaining lines
        are recorded in the returned cursor's ``paragraph_line_index`` so the
        next page continues from there. A block that does not fit on an
        otherwise empty page is placed anyway so pagination always makes
        progress.

        Args:
            cursor: Starting position in the document
            max_height: Maximum height to fill (defaults to page height minus 40)

        Returns:
            Tuple of (self, end_cursor) where end_cursor points to where next page should start
        """
        # Clear existing children
        self._children.clear()

        if max_height is None:
            # NOTE(review): 40 exceeds the page's 10 + 10 top/bottom padding;
            # kept unchanged so the default page capacity does not grow.
            max_height = self._size[1] - 40

        # Remember where this page starts so get_position_bookmark() works.
        self._start_position = cursor.position.copy()

        end_cursor = DocumentCursor(cursor.document, cursor.position.copy())
        current_height = 0

        # Keep adding content until we reach the height limit
        while current_height < max_height:
            block = end_cursor.get_current_block()
            if block is None:
                break  # End of document

            page_is_empty = not self._children
            # layout() inserts the spacing before every child but the first.
            gap = 0 if page_is_empty else self._spacing
            remaining = max_height - current_height - gap
            min_lines = 1 if page_is_empty else 0
            resume_line = getattr(end_cursor.position, 'paragraph_line_index', 0) or 0

            if isinstance(block, Paragraph) and resume_line > 0:
                # Continue a paragraph that was split on a previous page.
                fragment, next_line, finished = self._layout_paragraph_fragment(
                    block, remaining, resume_line, min_lines
                )
                if fragment is not None:
                    self.add_child(fragment)
                    current_height += gap + self._height_of(fragment)
                end_cursor.position.paragraph_line_index = next_line
                if not finished:
                    break  # Page is full; the rest goes on the next page
                if not end_cursor.advance_block():
                    break  # End of document
                # The next block starts at its first line.
                # NOTE(review): advance_block() presumably resets this
                # already; the explicit reset is defensive.
                end_cursor.position.paragraph_line_index = 0
                continue

            renderable = self._convert_block_to_renderable(block)
            if renderable is None:
                # Skip blocks that can't be rendered
                if not end_cursor.advance_block():
                    break
                continue

            renderable_height = self._height_of(renderable)
            if renderable_height <= remaining:
                # Add the full block
                self.add_child(renderable)
                current_height += gap + renderable_height
                if not end_cursor.advance_block():
                    break  # End of document
                continue

            # The block would exceed the page - handle partial rendering
            placed = False
            if isinstance(block, Paragraph):
                fragment, next_line, finished = self._layout_paragraph_fragment(
                    block, remaining, 0, min_lines
                )
                if fragment is not None:
                    self.add_child(fragment)
                    placed = True
                    end_cursor.position.paragraph_line_index = next_line
                    if finished and end_cursor.advance_block():
                        end_cursor.position.paragraph_line_index = 0

            if not placed and page_is_empty:
                # Nothing fits on an empty page: place the block anyway
                # (it will be clipped) rather than never moving past it.
                self.add_child(renderable)
                end_cursor.advance_block()
            break

        return self, end_cursor

    def _layout_paragraph_fragment(self, paragraph: Paragraph, available_height: int,
                                   start_line: int = 0,
                                   min_lines: int = 0) -> Tuple[Optional[Container], int, bool]:
        """
        Lay a paragraph out into lines and wrap a slice of them in a container.

        Args:
            paragraph: The paragraph to lay out
            available_height: Height available for the lines
            start_line: Index of the first line to include
            min_lines: Minimum number of lines to include even if they do not
                fit (used to guarantee progress on an empty page)

        Returns:
            Tuple of (container, next_line, finished)
            - container: Container holding the selected lines, or None when
              no line was selected
            - next_line: Index of the first line not included
            - finished: True when no lines remain after the selection
        """
        # Use the paragraph layout system to break into lines
        layout = ParagraphLayout(
            line_width=self._size[0] - 40,  # Account for margins
            line_height=self._SPLIT_LINE_HEIGHT,
            word_spacing=(3, 8),
            line_spacing=self._SPLIT_LINE_SPACING,
            halign=Alignment.LEFT
        )
        lines = layout.layout_paragraph(paragraph)
        total_lines = len(lines) if lines else 0
        if start_line >= total_lines:
            return None, start_line, True

        # Calculate how many lines we can fit
        line_step = self._SPLIT_LINE_HEIGHT + self._SPLIT_LINE_SPACING
        max_lines = max(int(available_height // line_step), min_lines)
        if max_lines <= 0:
            return None, start_line, False

        # Take only the lines that fit
        selected = lines[start_line:start_line + max_lines]
        fragment = Container(
            origin=(0, 0),
            size=(self._size[0], len(selected) * line_step),
            direction='vertical',
            spacing=0,
            padding=(0, 0, 0, 0)
        )
        for line in selected:
            fragment.add_child(line)

        next_line = start_line + len(selected)
        return fragment, next_line, next_line >= total_lines

    def _render_partial_paragraph(self, paragraph: Paragraph, available_height: int, cursor: DocumentCursor) -> Optional[Container]:
        """
        Render part of a paragraph that fits in the available height.
        Updates the cursor to point to the remaining content.

        Rendering starts at the cursor's ``paragraph_line_index``. When lines
        remain afterwards the index is moved to the first remaining line;
        when the paragraph is exhausted the cursor is advanced to the next
        block. The cursor is left untouched when nothing could be rendered.

        Args:
            paragraph: The paragraph to partially render
            available_height: Available height for content
            cursor: Cursor to update with new position

        Returns:
            Container with partial paragraph content or None
        """
        start_line = getattr(cursor.position, 'paragraph_line_index', 0) or 0
        fragment, next_line, finished = self._layout_paragraph_fragment(
            paragraph, available_height, start_line
        )
        if fragment is None:
            return None

        if finished:
            # We rendered the rest of the paragraph - advance to next block
            cursor.advance_block()
        else:
            # Lines remain - point the cursor at the next line in the paragraph
            cursor.position.paragraph_line_index = next_line
        return fragment

    def get_position_bookmark(self) -> Optional[DocumentPosition]:
        """
        Get a bookmark position representing the start of content on this page.
        This can be used to return to this exact page later.

        Returns:
            The position stored by render_from_cursor() or
            set_start_position(), or None if neither has been called
        """
        return getattr(self, '_start_position', None)

    def set_start_position(self, position: DocumentPosition):
        """
        Set the document position that this page starts from.

        Args:
            position: The starting position for this page
        """
        self._start_position = position

    def fill_with_blocks(self, blocks: List[Block], start_index: int = 0) -> Tuple[int, List[Block]]:
        """
        Fill this page with blocks using the external pagination system.

        Delegates to ``BlockPaginator.fill_page``.

        Args:
            blocks: List of blocks to add to the page
            start_index: Index in blocks list to start from

        Returns:
            Tuple of (next_start_index, remainder_blocks)
            - next_start_index: Index where pagination stopped
            - remainder_blocks: Any partial blocks that need to continue on next page
        """
        # Imported here rather than at module level, as in the original.
        from pyWebLayout.typesetting.block_pagination import BlockPaginator

        paginator = BlockPaginator()
        return paginator.fill_page(self, blocks, start_index)

    def try_add_block_external(self, block: Block, available_height: Optional[int] = None) -> Tuple[bool, Optional[Block], int]:
        """
        Try to add a single block to this page using external handlers.

        Delegates to ``BlockPaginator.paginate_block`` and adds the resulting
        renderable to the page when pagination succeeds.

        Args:
            block: The block to try to add
            available_height: Available height (defaults to remaining page height)

        Returns:
            Tuple of (success, remainder_block, height_used)
            - success: Whether the block was successfully added
            - remainder_block: Any remaining content that couldn't fit
            - height_used: Height consumed by the added content
        """
        from pyWebLayout.typesetting.block_pagination import BlockPaginator

        if available_height is None:
            # Calculate available height based on current content
            used_height = self._calculate_current_content_height()
            max_height = self._size[1] - 40  # Same default budget as render_from_cursor
            available_height = max_height - used_height

        paginator = BlockPaginator()
        result = paginator.paginate_block(block, self, available_height)

        if result.success and result.renderable:
            self.add_child(result.renderable)
            return True, result.remainder, result.height_used

        remainder = result.remainder if result.can_continue else None
        return False, remainder, 0

    def _calculate_current_content_height(self) -> int:
        """
        Calculate the height currently used by content on this page.

        Returns:
            The largest bottom edge (origin y + height) among the laid-out
            children, never less than 0; 0 for an empty page.
        """
        if not self._children:
            return 0

        # Trigger layout to ensure positions are calculated
        self.layout()

        bottoms = (
            child._origin[1] + child._size[1]
            for child in self._children
            if hasattr(child, '_origin') and hasattr(child, '_size')
        )
        return max(0, max(bottoms, default=0))

    def _convert_block_to_renderable(self, block: Block) -> Optional[Renderable]:
        """
        Convert an abstract block to a renderable object.

        Args:
            block: Abstract block to convert

        Returns:
            Renderable object, None if the block has nothing to render, or a
            red error Text if the conversion raised
        """
        try:
            if isinstance(block, Paragraph):
                return self._convert_paragraph(block)
            if isinstance(block, Heading):
                return self._convert_heading(block)
            if isinstance(block, HList):
                return self._convert_list(block)
            if isinstance(block, AbstractImage):
                return self._convert_image(block)
            # For other block types, try to extract text content
            return self._convert_generic_block(block)
        except Exception as e:
            # Deliberate best-effort: one bad block must not abort the whole
            # page, so the failure is shown in place of the block.
            error_font = Font(colour=(255, 0, 0))
            return Text(f"[Conversion Error: {str(e)}]", error_font)

    def _convert_paragraph(self, paragraph: Paragraph) -> Optional[Container]:
        """
        Convert a paragraph block to a Container with proper Line objects.

        The paragraph text is split on whitespace and the words are packed
        into justified lines as wide as the page minus its left and right
        padding.

        Args:
            paragraph: The paragraph to convert

        Returns:
            Container holding one Line per laid-out line, or None when the
            paragraph has no text or no word could be placed
        """
        text_content = self._extract_text_from_block(paragraph)
        if not text_content:
            return None

        # Use the font of the first word that has one; otherwise a default.
        paragraph_font = Font(font_size=16)
        try:
            for _, word in paragraph.words():
                if hasattr(word, 'font') and word.font:
                    paragraph_font = word.font
                    break
        except Exception:
            pass  # Best effort: keep the default font if extraction fails

        # Calculate available width using the page's padding system
        padding_left = self._padding[3]
        padding_right = self._padding[1]
        available_width = self._size[0] - padding_left - padding_right

        words = text_content.split()
        if not words:
            return None

        # Import the Line class
        from .text import Line

        lines = []
        line_height = paragraph_font.font_size + 4  # Font size + small line spacing
        word_spacing = (3, 8)  # min, max spacing between words

        word_index = 0
        line_y_offset = 0
        while word_index < len(words):
            # Use JUSTIFY alignment for better text flow
            line = Line(
                spacing=word_spacing,
                origin=(0, line_y_offset),
                size=(available_width, line_height),
                font=paragraph_font,
                halign=Alignment.JUSTIFY
            )

            # Add words to this line until it's full
            while word_index < len(words):
                remaining_text = line.add_word(words[word_index], paragraph_font)
                if remaining_text is None:
                    # Word fit completely
                    word_index += 1
                    continue
                if remaining_text != words[word_index]:
                    # Word was partially fit (hyphenated): carry the rest
                    # over to the next line.
                    words[word_index] = remaining_text
                break

            if len(line.renderable_words) > 0:
                lines.append(line)
                line_y_offset += line_height
            else:
                # Not even one word fits on an empty line: drop the word
                # to prevent an infinite loop.
                word_index += 1

        if not lines:
            return None

        paragraph_container = Container(
            origin=(0, 0),
            size=(available_width, len(lines) * line_height),
            direction='vertical',
            spacing=0,  # Lines handle their own spacing
            padding=(0, 0, 0, 0)  # No additional padding since page handles it
        )
        for line in lines:
            paragraph_container.add_child(line)
        return paragraph_container

    def _convert_heading(self, heading: Heading) -> Optional[Text]:
        """
        Convert a heading block to a Text renderable with appropriate font.

        Args:
            heading: The heading to convert

        Returns:
            Bold Text sized by heading level, or None when the heading has no words
        """
        words = [word.text for _, word in heading.words()]
        if not words:
            return None

        font_size = self._HEADING_FONT_SIZES.get(
            heading.level, self._DEFAULT_HEADING_FONT_SIZE
        )
        heading_font = Font(font_size=font_size, weight=FontWeight.BOLD)
        return Text(' '.join(words), heading_font)

    def _convert_list(self, hlist: HList) -> Optional[Container]:
        """
        Convert a list block to a Container with list items.

        Unordered lists get a bullet prefix; every other list style is
        numbered in item order. Items without text are skipped. The
        container is sized to its items instead of a fixed height so long
        lists are not clipped and short lists do not waste page space.

        Args:
            hlist: The list to convert

        Returns:
            Container with one Text per item, or None when no item has text
        """
        spacing = 5
        padding = (5, 20, 5, 20)  # Add indentation
        fallback_height = 100  # Used when item heights cannot be determined

        item_font = Font()
        item_renderables = []
        for item in hlist.items():
            item_text = self._extract_text_from_block(item)
            if not item_text:
                continue
            if hlist.style == ListStyle.UNORDERED:
                prefix = "\u2022 "
            else:
                prefix = f"{len(item_renderables) + 1}. "
            item_renderables.append(Text(prefix + item_text, item_font))

        if not item_renderables:
            return None

        try:
            content_height = sum(int(r._size[1]) for r in item_renderables)
        except (AttributeError, TypeError, IndexError):
            container_height = fallback_height
        else:
            content_height += spacing * (len(item_renderables) - 1)
            container_height = content_height + padding[0] + padding[2]

        list_container = Container(
            origin=(0, 0),
            size=(self._size[0] - 40, container_height),
            direction='vertical',
            spacing=spacing,
            padding=padding
        )
        for renderable in item_renderables:
            list_container.add_child(renderable)
        return list_container

    def _convert_image(self, image: AbstractImage) -> Optional[Renderable]:
        """
        Convert an image block to a RenderableImage.

        Args:
            image: The abstract image to convert

        Returns:
            RenderableImage limited to 400x300, or a grey placeholder Text
            naming the image when it cannot be created
        """
        try:
            return RenderableImage(image, max_width=400, max_height=300)
        except Exception as e:
            print(f"Image rendering failed: {e}")
            # Prefer the alt text, then the source, then a fixed label.
            # Each lookup is guarded: the previous single expression
            # discarded the alt text whenever the image had no `src`.
            label = (
                getattr(image, 'alt_text', None)
                or getattr(image, 'src', None)
                or 'Unknown'
            )
            error_font = Font(colour=(128, 128, 128))
            return Text(f"[Image: {label}]", error_font)

    def _convert_generic_block(self, block: Block) -> Optional[Text]:
        """
        Convert a generic block by extracting its text content.

        Returns:
            Text in the default font, or None when the block yields no text
        """
        text_content = self._extract_text_from_block(block)
        if text_content:
            return Text(text_content, Font())
        return None

    def _extract_text_from_block(self, block: Block) -> str:
        """
        Extract plain text content from any block type.

        Args:
            block: The block to read

        Returns:
            The block's words joined by single spaces when it has a callable
            ``words``; otherwise ``str(block.text)`` when it has ``text``;
            otherwise ``str(block)``
        """
        words = getattr(block, 'words', None)
        if callable(words):
            return ' '.join(word.text for _, word in words())
        if hasattr(block, 'text'):
            return str(block.text)
        # Every object supports str(), so this is the final fallback.
        return str(block)

    def render(self) -> Image.Image:
        """
        Render the page with all its content.

        Returns:
            A new image of the page size filled with the background color,
            with every child that has an ``_origin`` pasted at that origin
        """
        # Make sure children are laid out
        self.layout()

        # Create base canvas with background color
        canvas = Image.new(self._mode, tuple(self._size), self._background_color)

        for child in self._children:
            if not hasattr(child, '_origin'):
                continue
            child_img = child.render()

            # Pillow needs plain integer coordinates.
            position = tuple(int(value) for value in child._origin)

            # Paste the child onto the canvas with alpha channel if available
            if 'A' in self._mode and child_img.mode == 'RGBA':
                canvas.paste(child_img, position, child_img)
            else:
                canvas.paste(child_img, position)

        return canvas