# pyWebLayout/examples/html_multipage_demo.py

#!/usr/bin/env python3
"""
HTML Multi-Page Rendering Demo

This example demonstrates how to:
1. Parse HTML content using pyWebLayout's HTML extraction system
2. Layout the parsed content across multiple pages using the ereader layout system
3. Render each page as an image file

The demo shows the complete pipeline from HTML to multi-page layout.
"""

import os
import sys
from pathlib import Path
from typing import List, Optional, Tuple

from PIL import Image, ImageDraw

# Add pyWebLayout to path
sys.path.insert(0, str(Path(__file__).parent.parent))

from pyWebLayout.io.readers.html_extraction import parse_html_string
from pyWebLayout.layout.ereader_layout import BidirectionalLayouter, RenderingPosition
from pyWebLayout.concrete.page import Page
from pyWebLayout.style.page_style import PageStyle
from pyWebLayout.style import Font
from pyWebLayout.abstract.block import Block


def create_sample_html() -> str:
    """
    Build the HTML document used as input for the demo.

    The markup is a fixed, two-chapter sample containing headings (h1/h2),
    paragraphs with inline bold/italic runs, an unordered list and a
    blockquote, so that the parser and layouter have a mix of element
    types to work with.

    Returns:
        The complete sample document as a single HTML string.
    """
    sample_document = """
    <!DOCTYPE html>
    <html>
    <head>
        <title>Sample Document</title>
    </head>
    <body>
        <h1>Chapter 1: Introduction to Multi-Page Layout</h1>

        <p>This is the first paragraph of our sample document. It demonstrates how HTML content
        can be parsed and then laid out across multiple pages using the pyWebLayout system.
        The system handles various HTML elements including headings, paragraphs, lists, and more.</p>

        <p>Here's another paragraph with <strong>bold text</strong> and <em>italic text</em>
        to show how inline formatting is preserved during the conversion process. The layout
        engine will automatically handle word wrapping and page breaks as needed.</p>

        <h2>Section 1.1: Features</h2>

        <p>The multi-page layout system includes several key features:</p>

        <ul>
            <li>Automatic page breaking when content exceeds page boundaries</li>
            <li>Font scaling support for different reading preferences</li>
            <li>Position tracking for bookmarks and navigation</li>
            <li>Support for various HTML elements and styling</li>
        </ul>

        <p>Each of these features works together to provide a seamless reading experience
        that adapts to different page sizes and user preferences.</p>

        <h2>Section 1.2: Technical Implementation</h2>

        <p>The implementation uses a sophisticated layout engine that processes abstract
        document elements and renders them onto concrete pages. This separation allows
        for flexible styling and layout while maintaining the semantic structure of
        the original content.</p>

        <blockquote>
            "The best way to understand a complex system is to see it in action with
            real examples and practical demonstrations."
        </blockquote>

        <p>This quote illustrates the philosophy behind this demo - showing how the
        various components work together in practice.</p>

        <h1>Chapter 2: Advanced Layout Concepts</h1>

        <p>Moving into more advanced territory, we can explore how the layout system
        handles complex scenarios such as page breaks within paragraphs, font scaling
        effects on layout, and position tracking across multiple pages.</p>

        <p>The system maintains precise position information that allows for features
        like bookmarking, search result highlighting, and seamless navigation between
        different views of the same content.</p>

        <h2>Section 2.1: Position Tracking</h2>

        <p>Position tracking is implemented using a hierarchical system that can
        reference any point in the document structure. This includes not just
        paragraph and word positions, but also positions within tables, lists,
        and other complex structures.</p>

        <p>The position system is designed to be stable across different rendering
        parameters, so a bookmark created with one font size will still be valid
        when the user changes to a different font size.</p>

        <h2>Section 2.2: Multi-Page Rendering</h2>

        <p>The multi-page rendering system can generate pages both forward and
        backward from any given position. This bidirectional capability is
        essential for smooth navigation in ereader applications.</p>

        <p>Each page is rendered independently, which allows for efficient
        caching and parallel processing of multiple pages when needed.</p>

        <p>This concludes our sample document. The layout system will automatically
        determine how many pages are needed to display all this content based on
        the page size and font settings used during rendering.</p>
    </body>
    </html>
    """
    return sample_document


class HTMLMultiPageRenderer:
    """
    Renderer that converts HTML to multiple page images.

    Typical use is a three-step pipeline: ``parse_html_to_blocks`` turns an
    HTML string into abstract blocks, ``render_pages`` walks the layouter
    forward one page at a time, and ``save_pages`` writes the resulting
    images to disk.

    Note that the images produced by ``_page_to_image`` are placeholders
    (white background, border and page-number label); the laid-out content
    of the ``Page`` object is not drawn onto them.
    """

    # Point size of the base font handed to the HTML parser.
    _BASE_FONT_SIZE = 14
    # Light grey outline drawn around every page image.
    _BORDER_COLOR = (200, 200, 200)
    # Distance in pixels from the bottom edge to the top of the page label.
    _PAGE_LABEL_BOTTOM_OFFSET = 30

    def __init__(self, page_size: Tuple[int, int] = (600, 800), font_scale: float = 1.0):
        """
        Initialize the renderer.

        Args:
            page_size: Size of each page in pixels (width, height)
            font_scale: Font scaling factor
        """
        self.page_size = page_size
        self.font_scale = font_scale
        self.page_style = PageStyle()

    def parse_html_to_blocks(self, html_content: str) -> List[Block]:
        """
        Parse HTML content into abstract blocks.

        Args:
            html_content: HTML string to parse

        Returns:
            List of abstract Block objects
        """
        base_font = Font(font_size=self._BASE_FONT_SIZE)  # Base font for the document
        return parse_html_string(html_content, base_font=base_font)

    def render_pages(self, blocks: List[Block], max_pages: int = 20) -> List[Image.Image]:
        """
        Render blocks into multiple page images.

        Pages are laid out forward from the start of the document until the
        end is detected, ``max_pages`` is reached, or an error occurs. An
        error is reported on stdout and stops rendering; the pages produced
        up to that point are still returned.

        Args:
            blocks: List of abstract blocks to render
            max_pages: Maximum number of pages to render (safety limit)

        Returns:
            List of PIL Image objects, one per page (empty if ``blocks`` is
            empty)
        """
        if not blocks:
            return []

        # Create the bidirectional layouter
        layouter = BidirectionalLayouter(blocks, self.page_style, self.page_size)

        pages: List[Image.Image] = []
        current_position = RenderingPosition()  # Start at beginning

        while len(pages) < max_pages:
            try:
                # Lay out the next page starting from the current position
                page, next_position = layouter.render_page_forward(current_position, self.font_scale)

                # Pages are numbered from 1 in the order they are produced
                pages.append(self._page_to_image(page, len(pages) + 1))

                # Stop once the layouter has nothing more to give us
                if self._is_end_position(next_position, current_position, blocks):
                    break
            except Exception as e:
                # Best-effort demo: report the failure and keep what we have
                print(f"Error rendering page {len(pages) + 1}: {e}")
                break

            current_position = next_position

        return pages

    @staticmethod
    def _page_label(page_number: Optional[int] = None) -> str:
        """
        Build the footer label for a page.

        Args:
            page_number: 1-based page number, or None when it is unknown

        Returns:
            "Page <n>" when a number is given, otherwise just "Page"
        """
        return "Page" if page_number is None else f"Page {page_number}"

    def _page_to_image(self, page: Page, page_number: Optional[int] = None) -> Image.Image:
        """
        Convert a Page object to a PIL Image.

        The returned image is a simple placeholder: a white canvas of
        ``self.page_size`` with a grey border and a centred page label near
        the bottom. The ``page`` argument is accepted for interface purposes
        but its content is not drawn.

        Args:
            page: Page object to convert (currently unused)
            page_number: 1-based number shown in the label; when omitted the
                label is just "Page"

        Returns:
            PIL Image object
        """
        width, height = self.page_size

        # Create a white background image with a border
        image = Image.new('RGB', self.page_size, 'white')
        draw = ImageDraw.Draw(image)
        draw.rectangle([0, 0, width - 1, height - 1], outline=self._BORDER_COLOR)

        # Prefer PIL's built-in font; font=None leaves the choice to ImageDraw
        try:
            from PIL import ImageFont
            font = ImageFont.load_default()
        except (ImportError, OSError):
            font = None

        # Centre the label horizontally using its measured width
        label = self._page_label(page_number)
        left, _, right, _ = draw.textbbox((0, 0), label, font=font)
        text_x = (width - (right - left)) // 2
        text_y = height - self._PAGE_LABEL_BOTTOM_OFFSET

        draw.text((text_x, text_y), label, fill='black', font=font)

        return image

    def _is_end_position(self, current_pos: RenderingPosition, previous_pos: RenderingPosition, blocks: List[Block]) -> bool:
        """
        Check if we've reached the end of the document.

        Args:
            current_pos: Current rendering position
            previous_pos: Previous rendering position
            blocks: List of all blocks in document

        Returns:
            True if at end of document
        """
        # If position hasn't advanced, we're likely at the end
        stalled = (current_pos.block_index == previous_pos.block_index
                   and current_pos.word_index == previous_pos.word_index)

        # Otherwise we are done once every block has been processed
        return stalled or current_pos.block_index >= len(blocks)

    def save_pages(self, pages: List[Image.Image], output_dir: str = "output/html_multipage"):
        """
        Save rendered pages as image files.

        Files are named ``page_001.png``, ``page_002.png``, ... in list
        order, and each saved path is printed. The directory is created if
        it does not exist.

        Args:
            pages: List of page images
            output_dir: Directory to save images
        """
        # Create output directory
        os.makedirs(output_dir, exist_ok=True)

        for i, page_image in enumerate(pages, 1):
            filepath = os.path.join(output_dir, f"page_{i:03d}.png")
            page_image.save(filepath)
            print(f"Saved {filepath}")

        print(f"\nRendered {len(pages)} pages to {output_dir}/")


def main():
    """
    Run the demo end to end and report progress on stdout.

    Steps: build the sample HTML, parse it into abstract blocks, render up
    to 10 pages at 600x800 with a 1.0 font scale, and save the page images.
    Success is only reported when at least one page was rendered; the
    renderer reports rendering errors itself and may return no pages, in
    which case nothing is saved.
    """
    # Single source of truth for where images go and what the hints mention
    output_dir = "output/html_multipage"

    print("HTML Multi-Page Rendering Demo")
    print("=" * 40)

    # Create sample HTML content
    print("1. Creating sample HTML content...")
    html_content = create_sample_html()
    print(f"   Created HTML document ({len(html_content)} characters)")

    # Initialize renderer
    print("\n2. Initializing renderer...")
    renderer = HTMLMultiPageRenderer(page_size=(600, 800), font_scale=1.0)
    print("   Renderer initialized")

    # Parse HTML to blocks
    print("\n3. Parsing HTML to abstract blocks...")
    blocks = renderer.parse_html_to_blocks(html_content)
    print(f"   Parsed {len(blocks)} blocks")

    # Print block summary: how many blocks of each concrete class were parsed
    block_types = {}
    for block in blocks:
        block_type = type(block).__name__
        block_types[block_type] = block_types.get(block_type, 0) + 1

    print("   Block types found:")
    for block_type, count in block_types.items():
        print(f"     - {block_type}: {count}")

    # Render pages
    print("\n4. Rendering pages...")
    pages = renderer.render_pages(blocks, max_pages=10)
    print(f"   Rendered {len(pages)} pages")

    if pages:
        # Save pages
        print("\n5. Saving pages...")
        renderer.save_pages(pages, output_dir)

        print("\n✓ Demo completed successfully!")
        print("\nTo view the results:")
        print(f"  - Check the {output_dir}/ directory")
        print("  - Open the PNG files to see each rendered page")
    else:
        # Do not claim success (or create an empty output directory) when
        # rendering produced nothing.
        print("\n✗ No pages were rendered; nothing was saved.")

    # Show some statistics
    print("\nStatistics:")
    print(f"  - Original HTML: {len(html_content)} characters")
    print(f"  - Abstract blocks: {len(blocks)}")
    print(f"  - Rendered pages: {len(pages)}")
    print(f"  - Page size: {renderer.page_size[0]}x{renderer.page_size[1]} pixels")
    print(f"  - Font scale: {renderer.font_scale}x")


# Run the demo only when this file is executed as a script, so that
# importing the module does not trigger rendering or write any files.
if __name__ == "__main__":
    main()