#!/usr/bin/env python3
"""
HTML Multi-Page Rendering Demo

This example demonstrates how to:
1. Parse HTML content using pyWebLayout's HTML extraction system
2. Layout the parsed content across multiple pages using the ereader layout system
3. Render each page as an image file

The demo shows the complete pipeline from HTML to multi-page layout.
"""

import os
import sys
from collections import Counter
from pathlib import Path
from typing import List, Optional, Tuple

from PIL import Image, ImageDraw, ImageFont

# Add pyWebLayout to path
sys.path.insert(0, str(Path(__file__).parent.parent))

from pyWebLayout.io.readers.html_extraction import parse_html_string
from pyWebLayout.layout.ereader_layout import BidirectionalLayouter, RenderingPosition
from pyWebLayout.concrete.page import Page
from pyWebLayout.style.page_style import PageStyle
from pyWebLayout.style import Font
from pyWebLayout.abstract.block import Block


def create_sample_html() -> str:
    """Create a sample HTML document with various elements for testing.

    Returns:
        An HTML string containing headings, paragraphs with inline bold and
        italic runs, an unordered list and a block quote, long enough to
        spill over several pages at the demo's default page size.
    """
    return """
    <html>
    <head>
        <title>Sample Document</title>
    </head>
    <body>
        <h1>Chapter 1: Introduction to Multi-Page Layout</h1>

        <p>This is the first paragraph of our sample document. It demonstrates how HTML content can be parsed and then laid out across multiple pages using the pyWebLayout system. The system handles various HTML elements including headings, paragraphs, lists, and more.</p>

        <p>Here's another paragraph with <b>bold text</b> and <i>italic text</i> to show how inline formatting is preserved during the conversion process. The layout engine will automatically handle word wrapping and page breaks as needed.</p>

        <h2>Section 1.1: Features</h2>

        <p>The multi-page layout system includes several key features:</p>

        <ul>
            <li>Automatic page breaking when content exceeds page boundaries</li>
            <li>Font scaling support for different reading preferences</li>
            <li>Position tracking for bookmarks and navigation</li>
            <li>Support for various HTML elements and styling</li>
        </ul>

        <p>Each of these features works together to provide a seamless reading experience that adapts to different page sizes and user preferences.</p>

        <h2>Section 1.2: Technical Implementation</h2>

        <p>The implementation uses a sophisticated layout engine that processes abstract document elements and renders them onto concrete pages. This separation allows for flexible styling and layout while maintaining the semantic structure of the original content.</p>

        <blockquote>
            <p>"The best way to understand a complex system is to see it in action with real examples and practical demonstrations."</p>
        </blockquote>

        <p>This quote illustrates the philosophy behind this demo - showing how the various components work together in practice.</p>

        <h1>Chapter 2: Advanced Layout Concepts</h1>

        <p>Moving into more advanced territory, we can explore how the layout system handles complex scenarios such as page breaks within paragraphs, font scaling effects on layout, and position tracking across multiple pages.</p>

        <p>The system maintains precise position information that allows for features like bookmarking, search result highlighting, and seamless navigation between different views of the same content.</p>

        <h2>Section 2.1: Position Tracking</h2>

        <p>Position tracking is implemented using a hierarchical system that can reference any point in the document structure. This includes not just paragraph and word positions, but also positions within tables, lists, and other complex structures.</p>

        <p>The position system is designed to be stable across different rendering parameters, so a bookmark created with one font size will still be valid when the user changes to a different font size.</p>

        <h2>Section 2.2: Multi-Page Rendering</h2>

        <p>The multi-page rendering system can generate pages both forward and backward from any given position. This bidirectional capability is essential for smooth navigation in ereader applications.</p>

        <p>Each page is rendered independently, which allows for efficient caching and parallel processing of multiple pages when needed.</p>

        <p>This concludes our sample document. The layout system will automatically determine how many pages are needed to display all this content based on the page size and font settings used during rendering.</p>
    </body>
    </html>
    """


class HTMLMultiPageRenderer:
    """
    Renderer that converts HTML to multiple page images.

    The pipeline is: parse HTML into abstract blocks, lay the blocks out
    page by page with ``BidirectionalLayouter``, turn each laid-out page
    into a PIL image, and optionally save the images as PNG files.
    """

    def __init__(self, page_size: Tuple[int, int] = (600, 800), font_scale: float = 1.0):
        """
        Initialize the renderer.

        Args:
            page_size: Size of each page in pixels (width, height)
            font_scale: Font scaling factor
        """
        self.page_size = page_size
        self.font_scale = font_scale
        self.page_style = PageStyle()

    def parse_html_to_blocks(self, html_content: str) -> List[Block]:
        """
        Parse HTML content into abstract blocks.

        Args:
            html_content: HTML string to parse

        Returns:
            List of abstract Block objects
        """
        base_font = Font(font_size=14)  # Base font for the document
        return parse_html_string(html_content, base_font=base_font)

    def render_pages(self, blocks: List[Block], max_pages: int = 20) -> List[Image.Image]:
        """
        Render blocks into multiple page images.

        Pages are laid out forward from the start of the document until the
        end-of-document check fires, ``max_pages`` is reached, or an error
        occurs. On error the pages rendered so far are returned.

        Args:
            blocks: List of abstract blocks to render
            max_pages: Maximum number of pages to render (safety limit)

        Returns:
            List of PIL Image objects, one per page
        """
        if not blocks:
            return []

        # Create the bidirectional layouter
        layouter = BidirectionalLayouter(blocks, self.page_style, self.page_size)

        pages: List[Image.Image] = []
        current_position = RenderingPosition()  # Start at beginning
        page_count = 0

        while page_count < max_pages:
            try:
                # Render the next page
                page, next_position = layouter.render_page_forward(
                    current_position, self.font_scale
                )

                # Convert page to image, labelled with its 1-based number
                pages.append(self._page_to_image(page, page_count + 1))
                page_count += 1

                # Check if we've reached the end
                if self._is_end_position(next_position, current_position, blocks):
                    break

                current_position = next_position
            except Exception as e:
                # Deliberate best-effort boundary for the demo: report the
                # failure and keep whatever pages were already produced.
                print(f"Error rendering page {page_count + 1}: {e}")
                break

        return pages

    def _page_to_image(self, page: Page, page_number: Optional[int] = None) -> Image.Image:
        """
        Convert a Page object to a PIL Image.

        Produces a white canvas of ``self.page_size`` with a light grey
        border and a centred label near the bottom edge.

        NOTE(review): the laid-out content held by ``page`` is not drawn
        here; only a placeholder frame is produced. Confirm whether Page
        exposes its own rendering API that should be used instead.

        Args:
            page: Page object to convert
            page_number: 1-based page number to show in the label; when
                omitted the label is just "Page"

        Returns:
            PIL Image object
        """
        width, height = self.page_size

        # Create a white background image
        image = Image.new('RGB', self.page_size, 'white')
        draw = ImageDraw.Draw(image)

        # Draw page border
        border_color = (200, 200, 200)
        draw.rectangle([0, 0, width - 1, height - 1], outline=border_color)

        # Fall back to Pillow's implicit default if the bundled font cannot
        # be loaded; font=None is accepted by the drawing calls below.
        try:
            font = ImageFont.load_default()
        except (ImportError, OSError):
            font = None

        # Add page number at bottom
        page_num_text = "Page" if page_number is None else f"Page {page_number}"
        text_bbox = draw.textbbox((0, 0), page_num_text, font=font)
        text_width = text_bbox[2] - text_bbox[0]
        text_x = (width - text_width) // 2
        text_y = height - 30

        draw.text((text_x, text_y), page_num_text, fill='black', font=font)

        return image

    def _is_end_position(self, current_pos: RenderingPosition,
                         previous_pos: RenderingPosition,
                         blocks: List[Block]) -> bool:
        """
        Check if we've reached the end of the document.

        Args:
            current_pos: Current rendering position
            previous_pos: Previous rendering position
            blocks: List of all blocks in document

        Returns:
            True if the position did not advance (same block and word index
            as before) or the block index is past the last block.
        """
        # If position hasn't advanced, we're likely at the end
        if (current_pos.block_index == previous_pos.block_index and
                current_pos.word_index == previous_pos.word_index):
            return True

        # If we've processed all blocks
        return current_pos.block_index >= len(blocks)

    def save_pages(self, pages: List[Image.Image],
                   output_dir: str = "output/html_multipage") -> None:
        """
        Save rendered pages as image files.

        Files are named ``page_001.png``, ``page_002.png``, ... and the
        output directory is created if it does not exist.

        Args:
            pages: List of page images
            output_dir: Directory to save images
        """
        # Create output directory
        os.makedirs(output_dir, exist_ok=True)

        for i, page_image in enumerate(pages, 1):
            filename = f"page_{i:03d}.png"
            filepath = os.path.join(output_dir, filename)
            page_image.save(filepath)
            print(f"Saved {filepath}")

        print(f"\nRendered {len(pages)} pages to {output_dir}/")


def main() -> None:
    """Main demo function.

    Runs the full pipeline on the built-in sample document: create the
    HTML, parse it to blocks, render up to 10 pages, save them as PNG
    files, and print a short summary.
    """
    print("HTML Multi-Page Rendering Demo")
    print("=" * 40)

    # Create sample HTML content
    print("1. Creating sample HTML content...")
    html_content = create_sample_html()
    print(f" Created HTML document ({len(html_content)} characters)")

    # Initialize renderer
    print("\n2. Initializing renderer...")
    renderer = HTMLMultiPageRenderer(page_size=(600, 800), font_scale=1.0)
    print(" Renderer initialized")

    # Parse HTML to blocks
    print("\n3. Parsing HTML to abstract blocks...")
    blocks = renderer.parse_html_to_blocks(html_content)
    print(f" Parsed {len(blocks)} blocks")

    # Print block summary (Counter keeps first-seen order of block types)
    block_types = Counter(type(block).__name__ for block in blocks)

    print(" Block types found:")
    for block_type, count in block_types.items():
        print(f" - {block_type}: {count}")

    # Render pages
    print("\n4. Rendering pages...")
    pages = renderer.render_pages(blocks, max_pages=10)
    print(f" Rendered {len(pages)} pages")

    # Save pages
    print("\n5. Saving pages...")
    renderer.save_pages(pages)

    print("\n✓ Demo completed successfully!")
    print("\nTo view the results:")
    print(" - Check the output/html_multipage/ directory")
    print(" - Open the PNG files to see each rendered page")

    # Show some statistics
    print("\nStatistics:")
    print(f" - Original HTML: {len(html_content)} characters")
    print(f" - Abstract blocks: {len(blocks)}")
    print(f" - Rendered pages: {len(pages)}")
    print(f" - Page size: {renderer.page_size[0]}x{renderer.page_size[1]} pixels")
    print(f" - Font scale: {renderer.font_scale}x")


if __name__ == "__main__":
    main()