# pyWebLayout/examples/html_multipage_simple.py

#!/usr/bin/env python3
"""
Simple HTML Multi-Page Rendering Demo

This example demonstrates a working HTML to multi-page layout system using
the proven patterns from the integration tests. It shows:

1. Parse HTML content using pyWebLayout's HTML extraction system
2. Layout the parsed content across multiple pages using the document layouter
3. Save each page as an image file

This is a simplified but functional implementation.
"""

import os
import sys
from pathlib import Path
from typing import List, Tuple
from PIL import Image, ImageDraw, ImageFont

# Add pyWebLayout to path
sys.path.insert(0, str(Path(__file__).parent.parent))

from pyWebLayout.io.readers.html_extraction import parse_html_string
from pyWebLayout.layout.document_layouter import paragraph_layouter
from pyWebLayout.style.abstract_style import AbstractStyle
from pyWebLayout.style.concrete_style import StyleResolver, RenderingContext
from pyWebLayout.style import Font
from pyWebLayout.abstract.block import Block, Paragraph, Heading
from pyWebLayout.abstract.inline import Word
from pyWebLayout.concrete.text import Line


class SimplePage:
    """A simple page implementation for multi-page layout.

    The page owns a white RGB PIL image plus its drawing context and keeps
    the bookkeeping attributes (``border_size``, ``available_width``,
    ``_current_y_offset``, ``draw``, ``style_resolver`` ...) that are set up
    in ``__init__``.

    NOTE(review): nothing in this class advances ``_current_y_offset``.
    Presumably the layouter that places lines on the page does so - confirm;
    otherwise every line is drawn at the same vertical position and only
    ``max_lines`` ever limits a page.
    """

    # Preferred TrueType font; a built-in PIL font is used when it is missing.
    _FONT_PATH = "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"
    _LINE_FONT_SIZE = 14

    # Lazily loaded font used by ``_draw_line`` (loaded once per page instead
    # of once per drawn line).
    _line_font = None

    def __init__(self, width=600, height=800, max_lines=30):
        """Create a blank page.

        Args:
            width: Page width in pixels.
            height: Page height in pixels.
            max_lines: Maximum number of children accepted before
                ``can_fit_line`` reports the page as full.
        """
        self.border_size = 40
        self._current_y_offset = self.border_size
        self.available_width = width - (2 * self.border_size)
        self.available_height = height - (2 * self.border_size)
        self.max_lines = max_lines
        self.lines_added = 0
        self.children = []
        self.page_size = (width, height)

        # Create a real drawing context
        self.image = Image.new('RGB', (width, height), 'white')
        self.draw = ImageDraw.Draw(self.image)

        # Create a real style resolver
        context = RenderingContext(base_font_size=16)
        self.style_resolver = StyleResolver(context)

        # Draw page border
        border_color = (220, 220, 220)
        self.draw.rectangle([0, 0, width-1, height-1], outline=border_color, width=2)

    @classmethod
    def _load_font(cls, size):
        """Return the preferred TrueType font at *size*, or PIL's default font.

        Only the failures expected from a missing font file or a Pillow build
        without FreeType are handled; anything else (including
        ``KeyboardInterrupt``) propagates.
        """
        try:
            return ImageFont.truetype(cls._FONT_PATH, size)
        except (OSError, ImportError):
            return ImageFont.load_default()

    def can_fit_line(self, line_height):
        """Check if another line can fit on the page.

        A line fits when the vertical space left below the current offset is
        at least *line_height* and fewer than ``max_lines`` children have been
        added.
        """
        used_height = self._current_y_offset - self.border_size
        remaining_height = self.available_height - used_height
        return remaining_height >= line_height and self.lines_added < self.max_lines

    def add_child(self, child):
        """Add a child element (like a Line) to the page.

        Every child counts towards ``lines_added``; only ``Line`` instances
        are drawn. Always returns ``True``.
        """
        self.children.append(child)
        self.lines_added += 1

        # Draw the line content on the page
        if isinstance(child, Line):
            self._draw_line(child)

        return True

    def _draw_line(self, line):
        """Draw a line of text on the page at the current vertical offset.

        The text comes from the line's ``_text_content`` attribute when
        present, otherwise the placeholder ``'Line content'`` is drawn.
        """
        x = self.border_size + 10
        y = self._current_y_offset

        try:
            if self._line_font is None:
                self._line_font = self._load_font(self._LINE_FONT_SIZE)

            # Get line text (simplified)
            line_text = getattr(line, '_text_content', 'Line content')

            self.draw.text((x, y), line_text, fill=(0, 0, 0), font=self._line_font)
        except Exception:
            # Deliberate best effort: a demo page should still show something
            # for this line even if font loading or text drawing failed.
            self.draw.text((x, y), "Text line", fill=(0, 0, 0))


class SimpleWord(Word):
    """Word subclass with a default font and a naive hyphenation rule."""

    def __init__(self, text, style=None):
        """Create the word, using a 14pt ``Font`` when no style is supplied."""
        effective_style = Font(font_size=14) if style is None else style
        super().__init__(text, effective_style)

    def possible_hyphenation(self):
        """Return the candidate hyphenation splits for this word.

        Words longer than six characters yield a single ``(head, tail)`` pair
        split at the midpoint, with a trailing ``-`` on the head; shorter
        words yield an empty list.
        """
        if len(self.text) > 6:
            split_at = len(self.text) // 2
            head = self.text[:split_at] + "-"
            tail = self.text[split_at:]
            return [(head, tail)]

        return []


class SimpleParagraph:
    """Lightweight paragraph: a style, a line height and a list of words."""

    def __init__(self, text_content, style=None):
        """Split *text_content* on whitespace into ``SimpleWord`` objects.

        When *style* is omitted, an ``AbstractStyle`` with word spacing
        4.0 (min 2.0, max 8.0) is used.
        """
        self.style = style if style is not None else AbstractStyle(
            word_spacing=4.0,
            word_spacing_min=2.0,
            word_spacing_max=8.0
        )
        self.line_height = 20

        # str.split() without arguments already drops empty tokens and
        # surrounding whitespace, so every token maps directly to a word.
        self.words = [SimpleWord(token) for token in text_content.split()]


def create_sample_html() -> str:
    """Return the built-in sample HTML document used by the demo.

    The document has two chapters (``<h1>``), one section (``<h2>``) and
    eight paragraphs.
    """
    sample_document = """
    <html>
    <body>
        <h1>Chapter 1: Introduction</h1>

        <p>This is the first paragraph of our sample document. It demonstrates how HTML content
        can be parsed and then laid out across multiple pages using the pyWebLayout system.</p>

        <p>Here's another paragraph with some more text to show how the system handles
        multiple paragraphs and automatic page breaking when content exceeds page boundaries.</p>

        <h2>Section 1.1: Features</h2>

        <p>The multi-page layout system includes several key features that make it suitable
        for ereader applications and document processing systems.</p>

        <p>Each paragraph is processed individually and can span multiple lines or even
        multiple pages if the content is long enough to require it.</p>

        <h1>Chapter 2: Implementation</h1>

        <p>The implementation uses a sophisticated layout engine that processes abstract
        document elements and renders them onto concrete pages.</p>

        <p>This separation allows for flexible styling and layout while maintaining
        the semantic structure of the original content.</p>

        <p>The system can handle various HTML elements including headings, paragraphs,
        lists, and other block-level elements commonly found in documents.</p>

        <p>Position tracking is maintained throughout the layout process, enabling
        features like bookmarking and navigation between different views of the content.</p>
    </body>
    </html>
    """
    return sample_document


class HTMLMultiPageRenderer:
    """Simple HTML to multi-page renderer.

    The workflow is: ``parse_html_to_paragraphs`` -> ``render_pages`` ->
    ``save_pages``.
    """

    # Preferred TrueType font for page numbers; PIL's default font is used
    # when it cannot be loaded.
    _FONT_PATH = "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"

    def __init__(self, page_size: Tuple[int, int] = (600, 800)):
        """Store the ``(width, height)`` in pixels used for every page."""
        self.page_size = page_size

    @staticmethod
    def _word_text(word_item):
        """Return the text of one word entry, or ``None`` if unrecognised.

        Accepts objects with a ``text`` attribute, tuples whose second item
        has a ``text`` attribute, and plain strings.
        """
        if hasattr(word_item, 'text'):
            return word_item.text
        if isinstance(word_item, tuple) and len(word_item) >= 2:
            # Tuple format: (position, word_object)
            word_obj = word_item[1]
            return word_obj.text if hasattr(word_obj, 'text') else None
        if isinstance(word_item, str):
            return word_item
        return None

    @classmethod
    def _extract_text_parts(cls, block):
        """Collect the word texts of *block*.

        ``block.words()`` is used when it is callable; if that yields nothing
        the ``_words`` attribute is read directly as a fallback.
        """
        text_parts = []

        words_attr = getattr(block, 'words', None)
        if callable(words_attr):
            for word_item in words_attr():
                text = cls._word_text(word_item)
                if text is not None:
                    text_parts.append(text)

        # Fallback: try _words attribute directly
        if not text_parts and hasattr(block, '_words'):
            for word_item in block._words:
                text = cls._word_text(word_item)
                if text is not None:
                    text_parts.append(text)

        return text_parts

    @staticmethod
    def _style_for_block(block):
        """Return the spacing style for *block* (headings get wider spacing)."""
        if isinstance(block, Heading):
            return AbstractStyle(
                word_spacing=5.0,
                word_spacing_min=3.0,
                word_spacing_max=10.0
            )
        return AbstractStyle(
            word_spacing=4.0,
            word_spacing_min=2.0,
            word_spacing_max=8.0
        )

    @classmethod
    def _load_font(cls, size):
        """Return the preferred TrueType font at *size*, or PIL's default font."""
        try:
            return ImageFont.truetype(cls._FONT_PATH, size)
        except (OSError, ImportError):
            # Font file missing/unreadable, or Pillow built without FreeType.
            return ImageFont.load_default()

    def parse_html_to_paragraphs(self, html_content: str) -> List[SimpleParagraph]:
        """Parse HTML content into simple paragraphs.

        Only ``Paragraph`` and ``Heading`` blocks returned by
        ``parse_html_string`` are converted; blocks from which no word text
        could be extracted are skipped.

        Args:
            html_content: HTML source text.

        Returns:
            One ``SimpleParagraph`` per converted block, in document order.
        """
        # Parse HTML using the extraction system
        base_font = Font(font_size=14)
        blocks = parse_html_string(html_content, base_font=base_font)

        paragraphs = []
        for block in blocks:
            if not isinstance(block, (Paragraph, Heading)):
                continue

            text_parts = self._extract_text_parts(block)
            if not text_parts:
                continue

            paragraphs.append(
                SimpleParagraph(" ".join(text_parts), self._style_for_block(block))
            )

        return paragraphs

    def render_pages(self, paragraphs: List[SimpleParagraph]) -> List[SimplePage]:
        """Render paragraphs into multiple pages.

        Each paragraph is handed to ``paragraph_layouter``; when it reports
        failure a new page is started and layout resumes at the reported word
        index.

        NOTE(review): the third value returned by ``paragraph_layouter``
        (``remaining_pretext`` in the original code) is discarded here -
        confirm whether it should be passed to the next call.

        Args:
            paragraphs: Paragraphs to lay out, in order.

        Returns:
            The pages created, or an empty list when *paragraphs* is empty.
        """
        if not paragraphs:
            return []

        current_page = SimplePage(*self.page_size)
        pages = [current_page]

        for paragraph in paragraphs:
            start_word = 0

            while start_word < len(paragraph.words):
                page_was_blank = current_page.lines_added == 0

                # Try to layout the paragraph (or remaining part) on current page
                success, failed_word_index, _remaining_pretext = paragraph_layouter(
                    paragraph, current_page, start_word
                )

                if success:
                    # Paragraph completed on this page
                    break

                if (
                    failed_word_index is not None
                    and page_was_blank
                    and failed_word_index <= start_word
                ):
                    # The layouter could not get past the starting word even
                    # on a blank page. Every new page has the same size, so
                    # retrying would create pages forever; skip the rest of
                    # this paragraph instead.
                    print(
                        f"Warning: word {start_word} does not fit on an empty "
                        "page; skipping the rest of this paragraph",
                        file=sys.stderr,
                    )
                    break

                # Page is full, create a new page
                current_page = SimplePage(*self.page_size)
                pages.append(current_page)

                if failed_word_index is None:
                    # If no specific word failed, move to next paragraph
                    break

                # Continue with the failed word on the new page
                start_word = failed_word_index

        return pages

    def save_pages(self, pages: List[SimplePage], output_dir: str = "output/html_simple"):
        """Save pages as image files.

        A centred ``Page N`` footer is drawn on each page before it is
        written to ``<output_dir>/page_NNN.png``. The directory is created
        if needed.

        Args:
            pages: Pages to save; numbering starts at 1.
            output_dir: Destination directory.
        """
        os.makedirs(output_dir, exist_ok=True)

        # The footer font is the same for every page, so load it once.
        font = self._load_font(12)

        for i, page in enumerate(pages, 1):
            # Add page number
            page_text = f"Page {i}"
            text_bbox = page.draw.textbbox((0, 0), page_text, font=font)
            text_width = text_bbox[2] - text_bbox[0]
            text_x = (page.page_size[0] - text_width) // 2
            text_y = page.page_size[1] - 25

            page.draw.text((text_x, text_y), page_text, fill=(100, 100, 100), font=font)

            # Save the page
            filename = f"page_{i:03d}.png"
            filepath = os.path.join(output_dir, filename)
            page.image.save(filepath)
            print(f"Saved {filepath}")

        print(f"\nRendered {len(pages)} pages to {output_dir}/")


def main():
    """Run the demo end to end and report progress on stdout.

    Steps: build the sample HTML, parse it into paragraphs, lay the
    paragraphs out on pages, save the pages, then print summary statistics.
    """
    preview_paragraphs = 3
    preview_words = 8

    print("Simple HTML Multi-Page Rendering Demo")
    print("=" * 45)

    # Step 1: sample content
    print("1. Creating sample HTML content...")
    html_content = create_sample_html()
    html_length = len(html_content)
    print(f"   Created HTML document ({html_length} characters)")

    # Step 2: renderer
    print("\n2. Initializing renderer...")
    renderer = HTMLMultiPageRenderer(page_size=(600, 800))
    print("   Renderer initialized")

    # Step 3: parsing
    print("\n3. Parsing HTML to paragraphs...")
    paragraphs = renderer.parse_html_to_paragraphs(html_content)
    paragraph_count = len(paragraphs)
    print(f"   Parsed {paragraph_count} paragraphs")

    # Preview the first few paragraphs, truncated to their leading words
    for number, para in enumerate(paragraphs[:preview_paragraphs], start=1):
        leading_words = [word.text for word in para.words[:preview_words]]
        ellipsis = "..." if len(para.words) > preview_words else ""
        preview = " ".join(leading_words) + ellipsis
        print(f"     Paragraph {number}: {preview}")

    if paragraph_count > preview_paragraphs:
        print(f"     ... and {paragraph_count - preview_paragraphs} more paragraphs")

    # Step 4: layout
    print("\n4. Rendering pages...")
    pages = renderer.render_pages(paragraphs)
    page_count = len(pages)
    print(f"   Rendered {page_count} pages")

    # Per-page line counts
    for number, page in enumerate(pages, start=1):
        print(f"     Page {number}: {page.lines_added} lines")

    # Step 5: output
    print("\n5. Saving pages...")
    renderer.save_pages(pages)

    print("\n✓ Demo completed successfully!")
    print("\nTo view the results:")
    print("  - Check the output/html_simple/ directory")
    print("  - Open the PNG files to see each rendered page")

    # Summary
    total_lines = sum(page.lines_added for page in pages)
    page_width, page_height = renderer.page_size[0], renderer.page_size[1]
    print("\nStatistics:")
    print(f"  - Original HTML: {html_length} characters")
    print(f"  - Parsed paragraphs: {paragraph_count}")
    print(f"  - Rendered pages: {page_count}")
    print(f"  - Total lines: {total_lines}")
    print(f"  - Page size: {page_width}x{page_height} pixels")


# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()