pyWebLayout/examples/html_multipage_simple.py
Duncan Tourolle 65ab46556f
Some checks failed
Python CI / test (push) Failing after 3m55s
big update with ok rendering
2025-08-27 22:22:54 +02:00

366 lines
14 KiB
Python

#!/usr/bin/env python3
"""
Simple HTML Multi-Page Rendering Demo
This example demonstrates a working HTML to multi-page layout system using
the proven patterns from the integration tests. It shows:
1. Parse HTML content using pyWebLayout's HTML extraction system
2. Layout the parsed content across multiple pages using the document layouter
3. Save each page as an image file
This is a simplified but functional implementation.
"""
import os
import sys
from pathlib import Path
from typing import List, Tuple
from PIL import Image, ImageDraw, ImageFont
# Add pyWebLayout to path
sys.path.insert(0, str(Path(__file__).parent.parent))
from pyWebLayout.io.readers.html_extraction import parse_html_string
from pyWebLayout.layout.document_layouter import paragraph_layouter
from pyWebLayout.style.abstract_style import AbstractStyle
from pyWebLayout.style.concrete_style import StyleResolver, RenderingContext
from pyWebLayout.style import Font
from pyWebLayout.abstract.block import Block, Paragraph, Heading
from pyWebLayout.abstract.inline import Word
from pyWebLayout.concrete.text import Line
class SimplePage:
    """A simple page implementation for multi-page layout.

    The page owns a white RGB PIL image with a light grey outline, a drawing
    context for it, and a style resolver. It exposes the attributes and
    methods this demo passes to ``paragraph_layouter``: ``border_size``,
    ``available_width``, ``_current_y_offset``, ``draw``, ``style_resolver``,
    ``can_fit_line`` and ``add_child``.

    NOTE(review): nothing in this class advances ``_current_y_offset``;
    presumably the layouter updates it while placing lines -- confirm.
    """

    # Preferred font for line text; PIL's built-in font is the fallback.
    _FONT_PATH = "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"
    _FONT_SIZE = 14
    # Font object shared by all pages so the file is opened at most once.
    _cached_font = None

    def __init__(self, width=600, height=800, max_lines=30):
        """Create a blank page.

        Args:
            width: Page width in pixels.
            height: Page height in pixels.
            max_lines: Upper bound on the number of children the page accepts
                before ``can_fit_line`` reports the page as full.
        """
        self.border_size = 40
        self._current_y_offset = self.border_size
        self.available_width = width - (2 * self.border_size)
        self.available_height = height - (2 * self.border_size)
        self.max_lines = max_lines
        self.lines_added = 0
        self.children = []
        self.page_size = (width, height)

        # Create a real drawing context
        self.image = Image.new('RGB', (width, height), 'white')
        self.draw = ImageDraw.Draw(self.image)

        # Create a real style resolver
        context = RenderingContext(base_font_size=16)
        self.style_resolver = StyleResolver(context)

        # Draw page border
        border_color = (220, 220, 220)
        self.draw.rectangle([0, 0, width-1, height-1], outline=border_color, width=2)

    @classmethod
    def _get_line_font(cls):
        """Return the font used for line text, loading it on first use.

        Only the errors that signal "this font cannot be loaded" are caught,
        so KeyboardInterrupt/SystemExit are no longer swallowed.
        """
        if cls._cached_font is None:
            try:
                cls._cached_font = ImageFont.truetype(cls._FONT_PATH, cls._FONT_SIZE)
            except (OSError, ImportError):
                # Font file missing/unreadable, or FreeType support absent.
                cls._cached_font = ImageFont.load_default()
        return cls._cached_font

    def can_fit_line(self, line_height):
        """Check if another line can fit on the page.

        Args:
            line_height: Height in pixels of the line about to be placed.

        Returns:
            True when the vertical space left below ``_current_y_offset`` is
            at least ``line_height`` and fewer than ``max_lines`` children
            have been added; False otherwise.
        """
        used_height = self._current_y_offset - self.border_size
        remaining_height = self.available_height - used_height
        return remaining_height >= line_height and self.lines_added < self.max_lines

    def add_child(self, child):
        """Add a child element (like a Line) to the page.

        Every child counts towards ``lines_added``; only ``Line`` instances
        are additionally drawn onto the page image.

        Returns:
            Always True.
        """
        self.children.append(child)
        self.lines_added += 1

        # Draw the line content on the page
        if isinstance(child, Line):
            self._draw_line(child)
        return True

    def _draw_line(self, line):
        """Draw a line of text on the page at the current vertical offset.

        The text is read from the line's ``_text_content`` attribute, with the
        literal 'Line content' used when that attribute is absent.
        """
        x = self.border_size + 10
        y = self._current_y_offset
        try:
            # Get line text (simplified)
            line_text = getattr(line, '_text_content', 'Line content')
            self.draw.text((x, y), line_text, fill=(0, 0, 0), font=self._get_line_font())
        except Exception:
            # Deliberate best effort: a line that cannot be rendered is
            # replaced by a placeholder instead of aborting the whole demo.
            self.draw.text((x, y), "Text line", fill=(0, 0, 0))
class SimpleWord(Word):
    """Word subclass with a default font and a naive hyphenation rule."""

    def __init__(self, text, style=None):
        """Build the word, using a 14pt ``Font`` when no style is supplied."""
        effective_style = Font(font_size=14) if style is None else style
        super().__init__(text, effective_style)

    def possible_hyphenation(self):
        """Return candidate (head, tail) splits for this word.

        Words longer than six characters yield a single split at the midpoint,
        with a trailing hyphen on the head; shorter words yield no candidates.
        """
        word = self.text
        if len(word) > 6:
            # Simple hyphenation: split roughly in the middle
            half = len(word) // 2
            return [(word[:half] + "-", word[half:])]
        return []
class SimpleParagraph:
    """Paragraph stand-in exposing ``style``, ``line_height`` and ``words``."""

    def __init__(self, text_content, style=None):
        """Split ``text_content`` on whitespace into ``SimpleWord`` objects.

        Args:
            text_content: Plain text of the paragraph.
            style: Style object stored on the paragraph; when omitted an
                ``AbstractStyle`` with word spacing 4.0 (min 2.0, max 8.0)
                is created.
        """
        self.line_height = 20
        self.style = (
            style
            if style is not None
            else AbstractStyle(
                word_spacing=4.0,
                word_spacing_min=2.0,
                word_spacing_max=8.0
            )
        )
        # str.split() with no separator already drops empty tokens and
        # surrounding whitespace, so each token can be wrapped directly.
        self.words = [SimpleWord(token) for token in text_content.split()]
def create_sample_html() -> str:
    """Return the fixed HTML document used as input for the demo.

    The document contains two ``<h1>`` chapter headings, one ``<h2>`` section
    heading and eight ``<p>`` paragraphs inside ``<html><body>``.
    """
    sample_document = """
<html>
<body>
<h1>Chapter 1: Introduction</h1>
<p>This is the first paragraph of our sample document. It demonstrates how HTML content
can be parsed and then laid out across multiple pages using the pyWebLayout system.</p>
<p>Here's another paragraph with some more text to show how the system handles
multiple paragraphs and automatic page breaking when content exceeds page boundaries.</p>
<h2>Section 1.1: Features</h2>
<p>The multi-page layout system includes several key features that make it suitable
for ereader applications and document processing systems.</p>
<p>Each paragraph is processed individually and can span multiple lines or even
multiple pages if the content is long enough to require it.</p>
<h1>Chapter 2: Implementation</h1>
<p>The implementation uses a sophisticated layout engine that processes abstract
document elements and renders them onto concrete pages.</p>
<p>This separation allows for flexible styling and layout while maintaining
the semantic structure of the original content.</p>
<p>The system can handle various HTML elements including headings, paragraphs,
lists, and other block-level elements commonly found in documents.</p>
<p>Position tracking is maintained throughout the layout process, enabling
features like bookmarking and navigation between different views of the content.</p>
</body>
</html>
"""
    return sample_document
class HTMLMultiPageRenderer:
    """Simple HTML to multi-page renderer.

    Workflow: ``parse_html_to_paragraphs`` turns HTML into ``SimpleParagraph``
    objects, ``render_pages`` distributes them over ``SimplePage`` objects via
    ``paragraph_layouter``, and ``save_pages`` writes one PNG per page.
    """

    def __init__(self, page_size: Tuple[int, int] = (600, 800)):
        """Store the (width, height) in pixels used for every created page."""
        self.page_size = page_size

    @staticmethod
    def _block_text_parts(block) -> List[str]:
        """Collect the word texts of a parsed block, in order.

        Items yielded by ``block.words()`` may be objects with a ``text``
        attribute, tuples whose second element has ``text``, or plain
        strings. If that yields nothing, ``block._words`` is tried instead.
        """
        text_parts = []

        # Get words from the block - handle tuple format
        if hasattr(block, 'words') and callable(block.words):
            for word_item in block.words():
                # Handle both Word objects and tuples
                if hasattr(word_item, 'text'):
                    text_parts.append(word_item.text)
                elif isinstance(word_item, tuple) and len(word_item) >= 2:
                    # Tuple format: (position, word_object)
                    word_obj = word_item[1]
                    if hasattr(word_obj, 'text'):
                        text_parts.append(word_obj.text)
                elif isinstance(word_item, str):
                    text_parts.append(word_item)

        # Fallback: try _words attribute directly
        if not text_parts and hasattr(block, '_words'):
            for word_item in block._words:
                if hasattr(word_item, 'text'):
                    text_parts.append(word_item.text)
                elif isinstance(word_item, str):
                    text_parts.append(word_item)

        return text_parts

    def parse_html_to_paragraphs(self, html_content: str) -> List["SimpleParagraph"]:
        """Parse HTML content into simple paragraphs.

        Only ``Paragraph`` and ``Heading`` blocks returned by
        ``parse_html_string`` are kept; blocks without any extractable text
        are skipped. Headings get slightly wider word spacing than body text.

        Args:
            html_content: HTML source to parse.

        Returns:
            One ``SimpleParagraph`` per non-empty paragraph/heading block, in
            document order.
        """
        # Parse HTML using the extraction system
        base_font = Font(font_size=14)
        blocks = parse_html_string(html_content, base_font=base_font)

        paragraphs = []
        for block in blocks:
            if not isinstance(block, (Paragraph, Heading)):
                continue

            text_parts = self._block_text_parts(block)
            if not text_parts:
                continue

            # Create appropriate style based on block type
            if isinstance(block, Heading):
                style = AbstractStyle(
                    word_spacing=5.0,
                    word_spacing_min=3.0,
                    word_spacing_max=10.0
                )
            else:
                style = AbstractStyle(
                    word_spacing=4.0,
                    word_spacing_min=2.0,
                    word_spacing_max=8.0
                )
            paragraphs.append(SimpleParagraph(" ".join(text_parts), style))

        return paragraphs

    def render_pages(self, paragraphs: List["SimpleParagraph"]) -> List["SimplePage"]:
        """Render paragraphs into multiple pages.

        Each paragraph is handed to ``paragraph_layouter`` together with the
        current page. When the layouter reports failure a new page is started
        and layout resumes from the word index it returned.

        Args:
            paragraphs: Paragraphs to lay out, in reading order.

        Returns:
            The pages created, in order; an empty list for empty input.
        """
        if not paragraphs:
            return []

        current_page = SimplePage(*self.page_size)
        pages = [current_page]

        for paragraph in paragraphs:
            start_word = 0
            while start_word < len(paragraph.words):
                page_was_empty = current_page.lines_added == 0

                # Try to layout the paragraph (or remaining part) on current page.
                # NOTE(review): the third return value is discarded, as in the
                # original code -- confirm whether it should be fed back into
                # the next call.
                success, failed_word_index, _remaining_pretext = paragraph_layouter(
                    paragraph, current_page, start_word
                )
                if success:
                    # Paragraph completed on this page
                    break

                if (
                    page_was_empty
                    and current_page.lines_added == 0
                    and failed_word_index == start_word
                ):
                    # A blank page accepted nothing, so another blank page
                    # would fail the same way. Without this guard the loop
                    # would allocate pages forever.
                    print(
                        "Warning: paragraph content does not fit on an empty page; skipping it",
                        file=sys.stderr,
                    )
                    break

                # Page is full, create a new page
                current_page = SimplePage(*self.page_size)
                pages.append(current_page)

                if failed_word_index is None:
                    # If no specific word failed, move to next paragraph
                    break
                # Continue with the failed word on the new page
                start_word = failed_word_index

        return pages

    def save_pages(self, pages: List["SimplePage"], output_dir: str = "output/html_simple"):
        """Save pages as image files.

        Creates ``output_dir`` if needed, stamps a centred "Page N" footer on
        every page image and writes it as ``page_NNN.png``.

        Args:
            pages: Pages to write, numbered from 1 in list order.
            output_dir: Directory that receives the PNG files.
        """
        os.makedirs(output_dir, exist_ok=True)

        # The footer font is the same for every page, so load it once. Only
        # font-loading failures trigger the fallback to PIL's built-in font.
        try:
            font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 12)
        except (OSError, ImportError):
            font = ImageFont.load_default()

        for i, page in enumerate(pages, 1):
            # Add page number, horizontally centred near the bottom edge
            page_text = f"Page {i}"
            text_bbox = page.draw.textbbox((0, 0), page_text, font=font)
            text_width = text_bbox[2] - text_bbox[0]
            text_x = (page.page_size[0] - text_width) // 2
            text_y = page.page_size[1] - 25
            page.draw.text((text_x, text_y), page_text, fill=(100, 100, 100), font=font)

            # Save the page
            filename = f"page_{i:03d}.png"
            filepath = os.path.join(output_dir, filename)
            page.image.save(filepath)
            print(f"Saved {filepath}")

        print(f"\nRendered {len(pages)} pages to {output_dir}/")
def main():
    """Run the demo: build sample HTML, parse it, paginate, save, report."""
    print("Simple HTML Multi-Page Rendering Demo")
    print("=" * 45)

    # Step 1: sample input
    print("1. Creating sample HTML content...")
    html_content = create_sample_html()
    html_length = len(html_content)
    print(f" Created HTML document ({html_length} characters)")

    # Step 2: renderer
    print("\n2. Initializing renderer...")
    renderer = HTMLMultiPageRenderer(page_size=(600, 800))
    print(" Renderer initialized")

    # Step 3: HTML -> paragraphs
    print("\n3. Parsing HTML to paragraphs...")
    paragraphs = renderer.parse_html_to_paragraphs(html_content)
    paragraph_count = len(paragraphs)
    print(f" Parsed {paragraph_count} paragraphs")

    # Preview the first three paragraphs, eight words each
    for number, para in enumerate(paragraphs[:3], start=1):
        shown = " ".join(word.text for word in para.words[:8])
        suffix = "..." if len(para.words) > 8 else ""
        print(f" Paragraph {number}: {shown}{suffix}")
    hidden = paragraph_count - 3
    if hidden > 0:
        print(f" ... and {hidden} more paragraphs")

    # Step 4: paragraphs -> pages
    print("\n4. Rendering pages...")
    pages = renderer.render_pages(paragraphs)
    page_count = len(pages)
    print(f" Rendered {page_count} pages")

    # Per-page line counts
    for number, page in enumerate(pages, 1):
        print(f" Page {number}: {page.lines_added} lines")

    # Step 5: write PNG files
    print("\n5. Saving pages...")
    renderer.save_pages(pages)

    print("\n✓ Demo completed successfully!")
    print("\nTo view the results:")
    print(" - Check the output/html_simple/ directory")
    print(" - Open the PNG files to see each rendered page")

    # Summary
    total_lines = sum(page.lines_added for page in pages)
    page_width, page_height = renderer.page_size[0], renderer.page_size[1]
    print("\nStatistics:")
    print(f" - Original HTML: {html_length} characters")
    print(f" - Parsed paragraphs: {paragraph_count}")
    print(f" - Rendered pages: {page_count}")
    print(f" - Total lines: {total_lines}")
    print(f" - Page size: {page_width}x{page_height} pixels")
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()