pyWebLayout/examples/html_multipage_demo.py
Duncan Tourolle 65ab46556f
Some checks failed
Python CI / test (push) Failing after 3m55s
big update with ok rendering
2025-08-27 22:22:54 +02:00

327 lines
12 KiB
Python

#!/usr/bin/env python3
"""
HTML Multi-Page Rendering Demo
This example demonstrates how to:
1. Parse HTML content using pyWebLayout's HTML extraction system
2. Layout the parsed content across multiple pages using the ereader layout system
3. Render each page as an image file
The demo shows the complete pipeline from HTML to multi-page layout.
"""
import os
import sys
from pathlib import Path
from typing import List, Optional, Tuple

from PIL import Image, ImageDraw

# Add pyWebLayout to path
sys.path.insert(0, str(Path(__file__).parent.parent))

from pyWebLayout.io.readers.html_extraction import parse_html_string
from pyWebLayout.layout.ereader_layout import BidirectionalLayouter, RenderingPosition
from pyWebLayout.concrete.page import Page
from pyWebLayout.style.page_style import PageStyle
from pyWebLayout.style import Font
from pyWebLayout.abstract.block import Block
def create_sample_html() -> str:
    """
    Build the HTML document used as input for the demo.

    The document is a fixed literal containing two chapters with headings,
    paragraphs, inline bold/italic markup, an unordered list and a blockquote.

    Returns:
        The complete HTML document as a single string
    """
    sample_document = """
<!DOCTYPE html>
<html>
<head>
<title>Sample Document</title>
</head>
<body>
<h1>Chapter 1: Introduction to Multi-Page Layout</h1>
<p>This is the first paragraph of our sample document. It demonstrates how HTML content
can be parsed and then laid out across multiple pages using the pyWebLayout system.
The system handles various HTML elements including headings, paragraphs, lists, and more.</p>
<p>Here's another paragraph with <strong>bold text</strong> and <em>italic text</em>
to show how inline formatting is preserved during the conversion process. The layout
engine will automatically handle word wrapping and page breaks as needed.</p>
<h2>Section 1.1: Features</h2>
<p>The multi-page layout system includes several key features:</p>
<ul>
<li>Automatic page breaking when content exceeds page boundaries</li>
<li>Font scaling support for different reading preferences</li>
<li>Position tracking for bookmarks and navigation</li>
<li>Support for various HTML elements and styling</li>
</ul>
<p>Each of these features works together to provide a seamless reading experience
that adapts to different page sizes and user preferences.</p>
<h2>Section 1.2: Technical Implementation</h2>
<p>The implementation uses a sophisticated layout engine that processes abstract
document elements and renders them onto concrete pages. This separation allows
for flexible styling and layout while maintaining the semantic structure of
the original content.</p>
<blockquote>
"The best way to understand a complex system is to see it in action with
real examples and practical demonstrations."
</blockquote>
<p>This quote illustrates the philosophy behind this demo - showing how the
various components work together in practice.</p>
<h1>Chapter 2: Advanced Layout Concepts</h1>
<p>Moving into more advanced territory, we can explore how the layout system
handles complex scenarios such as page breaks within paragraphs, font scaling
effects on layout, and position tracking across multiple pages.</p>
<p>The system maintains precise position information that allows for features
like bookmarking, search result highlighting, and seamless navigation between
different views of the same content.</p>
<h2>Section 2.1: Position Tracking</h2>
<p>Position tracking is implemented using a hierarchical system that can
reference any point in the document structure. This includes not just
paragraph and word positions, but also positions within tables, lists,
and other complex structures.</p>
<p>The position system is designed to be stable across different rendering
parameters, so a bookmark created with one font size will still be valid
when the user changes to a different font size.</p>
<h2>Section 2.2: Multi-Page Rendering</h2>
<p>The multi-page rendering system can generate pages both forward and
backward from any given position. This bidirectional capability is
essential for smooth navigation in ereader applications.</p>
<p>Each page is rendered independently, which allows for efficient
caching and parallel processing of multiple pages when needed.</p>
<p>This concludes our sample document. The layout system will automatically
determine how many pages are needed to display all this content based on
the page size and font settings used during rendering.</p>
</body>
</html>
"""
    return sample_document
class HTMLMultiPageRenderer:
    """
    Renderer that converts HTML to multiple page images.

    The pipeline is: parse HTML into abstract blocks, lay the blocks out page
    by page with a BidirectionalLayouter, and produce one PIL image per page.

    NOTE(review): the images produced here contain only a border and a page
    label; the laid-out Page returned by the layouter is not drawn onto them.
    """

    def __init__(self, page_size: Tuple[int, int] = (600, 800), font_scale: float = 1.0):
        """
        Initialize the renderer.

        Args:
            page_size: Size of each page in pixels (width, height)
            font_scale: Font scaling factor passed to the layouter
        """
        self.page_size = page_size
        self.font_scale = font_scale
        self.page_style = PageStyle()

    def parse_html_to_blocks(self, html_content: str) -> List[Block]:
        """
        Parse HTML content into abstract blocks.

        Args:
            html_content: HTML string to parse

        Returns:
            List of abstract Block objects
        """
        base_font = Font(font_size=14)  # Base font for the document
        return parse_html_string(html_content, base_font=base_font)

    def render_pages(self, blocks: List[Block], max_pages: int = 20) -> List[Image.Image]:
        """
        Render blocks into multiple page images.

        Pages are laid out forward from the start of the document until the
        end is detected, an error occurs, or max_pages images were produced.
        A layout error is reported on stdout and ends rendering; the pages
        produced so far are still returned.

        Args:
            blocks: List of abstract blocks to render
            max_pages: Maximum number of pages to render (safety limit)

        Returns:
            List of PIL Image objects, one per page
        """
        if not blocks:
            return []

        layouter = BidirectionalLayouter(blocks, self.page_style, self.page_size)
        pages: List[Image.Image] = []
        current_position = RenderingPosition()  # Start at beginning

        while len(pages) < max_pages:
            try:
                page, next_position = layouter.render_page_forward(
                    current_position, self.font_scale
                )
                # Hand over the 1-based page number so the label is "Page N".
                pages.append(self._page_to_image(page, len(pages) + 1))

                if self._is_end_position(next_position, current_position, blocks):
                    break
                current_position = next_position
            except Exception as e:
                # Demo-level boundary: report the failure and keep what we have.
                print(f"Error rendering page {len(pages) + 1}: {e}")
                break

        return pages

    @staticmethod
    def _page_label(page_number: Optional[int] = None) -> str:
        """Return the footer text: "Page N", or just "Page" without a number."""
        if page_number is None:
            return "Page"
        return f"Page {page_number}"

    def _page_to_image(self, page: Page, page_number: Optional[int] = None) -> Image.Image:
        """
        Convert a Page object to a PIL Image.

        Draws a white page with a light grey border and a centred page label
        near the bottom edge.

        Args:
            page: Page object to convert
            page_number: 1-based page number for the label; when omitted the
                label is just "Page"

        Returns:
            PIL Image object
        """
        width, height = self.page_size[0], self.page_size[1]

        # White background with a light grey border
        image = Image.new('RGB', self.page_size, 'white')
        draw = ImageDraw.Draw(image)
        draw.rectangle([0, 0, width - 1, height - 1], outline=(200, 200, 200))

        # NOTE(review): `page` is not drawn here; presumably the Page object
        # can render its own content - confirm against the Page API.

        # Best effort: fall back to font=None if the default font cannot load.
        try:
            from PIL import ImageFont
            font = ImageFont.load_default()
        except Exception:
            font = None

        label = self._page_label(page_number)
        left, _, right, _ = draw.textbbox((0, 0), label, font=font)
        text_x = (width - (right - left)) // 2  # centre horizontally
        text_y = height - 30
        draw.text((text_x, text_y), label, fill='black', font=font)
        return image

    def _is_end_position(self, current_pos: RenderingPosition, previous_pos: RenderingPosition, blocks: List[Block]) -> bool:
        """
        Check if we've reached the end of the document.

        Args:
            current_pos: Current rendering position
            previous_pos: Previous rendering position
            blocks: List of all blocks in document

        Returns:
            True if at end of document
        """
        # No progress between two pages means nothing more can be laid out.
        stalled = (
            current_pos.block_index == previous_pos.block_index
            and current_pos.word_index == previous_pos.word_index
        )
        # A block index past the last block means everything was consumed.
        return stalled or current_pos.block_index >= len(blocks)

    def save_pages(self, pages: List[Image.Image], output_dir: str = "output/html_multipage"):
        """
        Save rendered pages as image files.

        Files are named page_001.png, page_002.png, ... and each saved path is
        printed to stdout. The output directory is created if missing.

        Args:
            pages: List of page images
            output_dir: Directory to save images
        """
        os.makedirs(output_dir, exist_ok=True)

        for i, page_image in enumerate(pages, 1):
            filepath = os.path.join(output_dir, f"page_{i:03d}.png")
            page_image.save(filepath)
            print(f"Saved {filepath}")

        print(f"\nRendered {len(pages)} pages to {output_dir}/")
def main():
    """
    Run the demo end to end.

    Builds the sample HTML, parses it into blocks, renders up to 10 pages,
    saves them as PNG files and prints progress and summary statistics.
    """
    print("HTML Multi-Page Rendering Demo")
    print("=" * 40)

    # Step 1: sample input
    print("1. Creating sample HTML content...")
    sample_html = create_sample_html()
    html_length = len(sample_html)
    print(f" Created HTML document ({html_length} characters)")

    # Step 2: renderer
    print("\n2. Initializing renderer...")
    demo_renderer = HTMLMultiPageRenderer(page_size=(600, 800), font_scale=1.0)
    print(" Renderer initialized")

    # Step 3: HTML -> abstract blocks
    print("\n3. Parsing HTML to abstract blocks...")
    parsed_blocks = demo_renderer.parse_html_to_blocks(sample_html)
    block_total = len(parsed_blocks)
    print(f" Parsed {block_total} blocks")

    # Tally blocks by class name, keeping first-seen order
    tally = {}
    for parsed in parsed_blocks:
        kind = type(parsed).__name__
        if kind in tally:
            tally[kind] += 1
        else:
            tally[kind] = 1
    print(" Block types found:")
    for kind in tally:
        print(f" - {kind}: {tally[kind]}")

    # Step 4: blocks -> page images
    print("\n4. Rendering pages...")
    rendered = demo_renderer.render_pages(parsed_blocks, max_pages=10)
    page_total = len(rendered)
    print(f" Rendered {page_total} pages")

    # Step 5: write images to disk
    print("\n5. Saving pages...")
    demo_renderer.save_pages(rendered)

    print("\n✓ Demo completed successfully!")
    print("\nTo view the results:")
    print(" - Check the output/html_multipage/ directory")
    print(" - Open the PNG files to see each rendered page")

    # Summary statistics
    page_width, page_height = demo_renderer.page_size
    print("\nStatistics:")
    print(f" - Original HTML: {html_length} characters")
    print(f" - Abstract blocks: {block_total}")
    print(f" - Rendered pages: {page_total}")
    print(f" - Page size: {page_width}x{page_height} pixels")
    print(f" - Font scale: {demo_renderer.font_scale}x")


if __name__ == "__main__":
    main()