# pyWebLayout/examples/html_multipage_demo_final.py

#!/usr/bin/env python3
"""
HTML Multi-Page Rendering Demo - Final Version

This example demonstrates a complete HTML to multi-page layout system that:
1. Parses HTML content using pyWebLayout's HTML extraction system
2. Lays out content across multiple pages using the document layouter
3. Saves each page as an image file
4. Shows true multi-page functionality with smaller pages

This demonstrates the complete pipeline from HTML to multi-page layout.
"""

import os
import sys
from pathlib import Path
from typing import List, Tuple
from PIL import Image, ImageDraw, ImageFont

# Add pyWebLayout to path
sys.path.insert(0, str(Path(__file__).parent.parent))

from pyWebLayout.io.readers.html_extraction import parse_html_string
from pyWebLayout.layout.document_layouter import paragraph_layouter
from pyWebLayout.style.abstract_style import AbstractStyle
from pyWebLayout.style.concrete_style import StyleResolver, RenderingContext
from pyWebLayout.style import Font
from pyWebLayout.abstract.block import Block, Paragraph, Heading
from pyWebLayout.abstract.inline import Word
from pyWebLayout.concrete.text import Line


class MultiPage:
    """A small fixed-size page backed by a PIL image, used by this demo.

    The page tracks a vertical cursor (``_current_y_offset``) and a count of
    added children, and answers ``can_fit_line`` so a layout routine can tell
    when the page is full.  A border and a header rule are drawn on the image
    at construction time.

    Args:
        width: Page width in pixels.
        height: Page height in pixels.
        max_lines: Maximum number of children accepted before the page
            reports itself full.
        line_spacing: Distance in pixels the vertical cursor advances after
            each added child.
    """

    # Body-text font.  Cached on the class so the font file is read once
    # instead of once per drawn line.
    _FONT_PATH = "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"
    _FONT_SIZE = 12
    _font = None

    def __init__(self, width=400, height=500, max_lines=15, line_spacing=18):
        self.border_size = 30
        # The cursor starts 20px below the border to leave room for the header.
        self._current_y_offset = self.border_size + 20
        self.available_width = width - (2 * self.border_size)
        # 40px are reserved for the header and footer areas.
        self.available_height = height - (2 * self.border_size) - 40
        self.max_lines = max_lines
        self.line_spacing = line_spacing
        self.lines_added = 0
        self.children = []
        self.page_size = (width, height)

        # Real drawing surface for the page.
        self.image = Image.new('RGB', (width, height), 'white')
        self.draw = ImageDraw.Draw(self.image)

        # Real style resolver, built from a rendering context.
        context = RenderingContext(base_font_size=14)
        self.style_resolver = StyleResolver(context)

        # Page border.
        border_color = (180, 180, 180)
        self.draw.rectangle([0, 0, width-1, height-1], outline=border_color, width=2)

        # Horizontal rule separating the header from the content area.
        header_y = self.border_size + 15
        self.draw.line([self.border_size, header_y, width - self.border_size, header_y],
                      fill=border_color, width=1)

    @classmethod
    def _body_font(cls):
        """Return the shared body font, loading it on first use."""
        if cls._font is None:
            try:
                cls._font = ImageFont.truetype(cls._FONT_PATH, cls._FONT_SIZE)
            except (OSError, ImportError):
                # OSError: font file missing or unreadable.
                # ImportError: Pillow without FreeType support.
                cls._font = ImageFont.load_default()
        return cls._font

    def can_fit_line(self, line_height):
        """Return True if a line of ``line_height`` pixels still fits.

        A line fits when the remaining content height is at least
        ``line_height`` and fewer than ``max_lines`` children were added.
        """
        # Height already consumed, measured from the top of the content area.
        used_height = self._current_y_offset - self.border_size - 20
        remaining_height = self.available_height - used_height
        return remaining_height >= line_height and self.lines_added < self.max_lines

    def add_child(self, child):
        """Record ``child`` on the page, draw it if it is a ``Line``, and advance the cursor.

        Always returns True; the caller is expected to have consulted
        ``can_fit_line`` beforehand.
        """
        self.children.append(child)
        self.lines_added += 1

        # Only Line instances are painted; other children are merely recorded.
        if isinstance(child, Line):
            self._draw_line(child)

        # Move the cursor down for the next child.
        self._current_y_offset += self.line_spacing

        return True

    def _draw_line(self, line):
        """Draw ``line`` at the current cursor position.

        NOTE(review): the text is read from a ``_text_content`` attribute and
        falls back to the placeholder "Text line" when that attribute is
        absent - confirm whether ``Line`` actually exposes it.
        """
        x = self.border_size + 5
        y = self._current_y_offset
        try:
            line_text = getattr(line, '_text_content', 'Text line')
            self.draw.text((x, y), line_text, fill=(0, 0, 0), font=self._body_font())
        except Exception:
            # Deliberate best effort: one undrawable line must not abort the
            # whole rendering, so draw a placeholder with the default font.
            self.draw.text((x, y), "Text line", fill=(0, 0, 0))


class SimpleWord(Word):
    """Word subclass with a default font and naive midpoint hyphenation."""

    def __init__(self, text, style=None):
        # Without an explicit style, use a font_size=12 Font so more content
        # fits on each small demo page.
        chosen_style = Font(font_size=12) if style is None else style
        super().__init__(text, chosen_style)

    def possible_hyphenation(self):
        """Return the candidate splits for this word.

        Words of six characters or fewer are never split.  Longer words get
        exactly one candidate: a ``(head + "-", tail)`` pair cut at the
        midpoint of the text.
        """
        word = self.text
        if len(word) > 6:
            cut = len(word) // 2
            return [(word[:cut] + "-", word[cut:])]
        return []


class SimpleParagraph:
    """Lightweight paragraph: a style, a line height and a list of words.

    Args:
        text_content: Raw paragraph text; it is split on whitespace and each
            token becomes a ``SimpleWord``.
        style: Style object to store.  When None, an ``AbstractStyle`` with
            heading or body word spacing is created.
        is_heading: Selects heading defaults (wider word spacing, taller
            line height) instead of body defaults.
    """

    def __init__(self, text_content, style=None, is_heading=False):
        if style is None:
            # Headings use slightly wider word spacing than body text; the
            # minimum spacing is the same for both.
            spacing, spacing_max = (4.0, 8.0) if is_heading else (3.0, 6.0)
            style = AbstractStyle(
                word_spacing=spacing,
                word_spacing_min=2.0,
                word_spacing_max=spacing_max
            )

        self.style = style
        self.is_heading = is_heading
        # Headings get a slightly taller line.
        self.line_height = 22 if is_heading else 18

        # str.split() with no arguments never yields empty or padded tokens,
        # so every token can be wrapped directly.
        self.words = [SimpleWord(token) for token in text_content.split()]


def create_longer_html() -> str:
    """Return the sample HTML document used as input for the demo.

    The markup contains one ``<h1>``, five ``<h2>`` headings and sixteen
    ``<p>`` paragraphs, and is intended to be long enough to need several of
    the demo's small pages.
    """
    document = """
    <html>
    <body>
        <h1>The Complete Guide to Multi-Page Layout Systems</h1>

        <p>This comprehensive document demonstrates the capabilities of the pyWebLayout system
        for rendering HTML content across multiple pages. The system is designed to handle
        complex document structures while maintaining precise control over layout and formatting.</p>

        <p>The multi-page layout engine processes content incrementally, ensuring that text
        flows naturally from one page to the next. This approach is essential for creating
        professional-quality documents and ereader applications.</p>

        <h2>Chapter 1: Introduction to Document Layout</h2>

        <p>Document layout systems have evolved significantly over the years, from simple
        text processors to sophisticated engines capable of handling complex typography,
        multiple columns, and advanced formatting features.</p>

        <p>The pyWebLayout system represents a modern approach to document processing,
        combining the flexibility of HTML with the precision required for high-quality
        page layout. This makes it suitable for a wide range of applications.</p>

        <p>Key features of the system include automatic page breaking, font scaling support,
        position tracking for navigation, and comprehensive support for HTML elements
        including headings, paragraphs, lists, tables, and inline formatting.</p>

        <h2>Chapter 2: Technical Architecture</h2>

        <p>The system is built on a layered architecture that separates content parsing
        from layout rendering. This separation allows for maximum flexibility while
        maintaining performance and reliability.</p>

        <p>At the core of the system is the HTML extraction module, which converts HTML
        elements into abstract document structures. These structures are then processed
        by the layout engine to produce concrete page representations.</p>

        <p>The layout engine uses sophisticated algorithms to determine optimal line breaks,
        word spacing, and page boundaries. It can handle complex scenarios such as
        hyphenation, widow and orphan control, and multi-column layouts.</p>

        <h2>Chapter 3: Practical Applications</h2>

        <p>This technology has numerous practical applications in modern software development.
        Ereader applications benefit from the precise position tracking and font scaling
        capabilities, while document processing systems can leverage the robust HTML parsing.</p>

        <p>The system is particularly well-suited for applications that need to display
        long-form content in a paginated format. This includes digital books, technical
        documentation, reports, and academic papers.</p>

        <p>Performance characteristics are excellent, with sub-second rendering times for
        typical documents. The system can handle documents with thousands of pages while
        maintaining responsive user interaction.</p>

        <h2>Chapter 4: Advanced Features</h2>

        <p>Beyond basic text layout, the system supports advanced features such as
        bidirectional text rendering, complex table layouts, and embedded images.
        These features make it suitable for international applications and rich content.</p>

        <p>The position tracking system is particularly noteworthy, as it maintains
        stable references to content locations even when layout parameters change.
        This enables features like bookmarking and search result highlighting.</p>

        <p>Font scaling is implemented at the layout level, ensuring that all elements
        scale proportionally while maintaining optimal readability. This is crucial
        for accessibility and user preference support.</p>

        <h2>Conclusion</h2>

        <p>The pyWebLayout system demonstrates that it's possible to create sophisticated
        document layout engines using modern Python technologies. The combination of
        HTML parsing, abstract document modeling, and precise layout control provides
        a powerful foundation for document-centric applications.</p>

        <p>This example has shown the complete pipeline from HTML input to multi-page
        output, illustrating how the various components work together to produce
        high-quality results. The system is ready for use in production applications
        requiring professional document layout capabilities.</p>
    </body>
    </html>
    """
    return document


class HTMLMultiPageRenderer:
    """Turn an HTML string into a sequence of rendered demo pages.

    Typical use is ``parse_html_to_paragraphs`` followed by ``render_pages``
    and then ``save_pages``.

    Args:
        page_size: ``(width, height)`` unpacked positionally into every
            ``MultiPage`` this renderer creates.
    """

    # Fonts used for the page header (bold) and footer (regular).
    _FONT_REGULAR = "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"
    _FONT_BOLD = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"

    def __init__(self, page_size: Tuple[int, int] = (400, 500)):
        self.page_size = page_size

    def parse_html_to_paragraphs(self, html_content: str) -> "List[SimpleParagraph]":
        """Parse HTML and return one ``SimpleParagraph`` per paragraph/heading block.

        Blocks that are neither ``Paragraph`` nor ``Heading``, and blocks from
        which no text could be extracted, are skipped.

        Args:
            html_content: HTML source handed to ``parse_html_string``.

        Returns:
            Paragraphs in document order; headings have ``is_heading`` set.
        """
        base_font = Font(font_size=12)
        blocks = parse_html_string(html_content, base_font=base_font)

        paragraphs = []
        for block in blocks:
            if not isinstance(block, (Paragraph, Heading)):
                continue

            text_parts = self._extract_text_parts(block)
            if not text_parts:
                continue

            is_heading = isinstance(block, Heading)
            paragraphs.append(
                SimpleParagraph(" ".join(text_parts), self._style_for(is_heading), is_heading)
            )

        return paragraphs

    @staticmethod
    def _extract_text_parts(block) -> List[str]:
        """Collect the word texts of ``block``, tolerating several word layouts."""
        parts = []

        # Preferred source: a callable ``words`` attribute.
        words_attr = getattr(block, 'words', None)
        if callable(words_attr):
            for item in words_attr():
                if hasattr(item, 'text'):
                    parts.append(item.text)
                elif isinstance(item, tuple) and len(item) >= 2:
                    # Tuple layout: (position, word_object).
                    if hasattr(item[1], 'text'):
                        parts.append(item[1].text)
                elif isinstance(item, str):
                    parts.append(item)

        # Fallback: read the ``_words`` attribute directly.
        if not parts and hasattr(block, '_words'):
            for item in block._words:
                if hasattr(item, 'text'):
                    parts.append(item.text)
                elif isinstance(item, str):
                    parts.append(item)

        return parts

    @staticmethod
    def _style_for(is_heading: bool) -> "AbstractStyle":
        """Return the word-spacing style for a heading or a body paragraph."""
        if is_heading:
            return AbstractStyle(
                word_spacing=4.0,
                word_spacing_min=2.0,
                word_spacing_max=8.0
            )
        return AbstractStyle(
            word_spacing=3.0,
            word_spacing_min=2.0,
            word_spacing_max=6.0
        )

    def _start_page(self, pages) -> "MultiPage":
        """Create a fresh page, append it to ``pages`` and return it."""
        page = MultiPage(*self.page_size)
        pages.append(page)
        return page

    def render_pages(self, paragraphs: "List[SimpleParagraph]") -> "List[MultiPage]":
        """Lay the paragraphs out over as many pages as needed.

        Args:
            paragraphs: Paragraphs to lay out, in order.

        Returns:
            The pages created, in order; an empty list when ``paragraphs``
            is empty.
        """
        if not paragraphs:
            return []

        pages = []
        current_page = self._start_page(pages)

        for para_idx, paragraph in enumerate(paragraphs):
            start_word = 0

            # A heading that would land within the last three line slots of a
            # partly filled page starts a new page instead, so it is not
            # stranded at the bottom without any following content.
            if (paragraph.is_heading and para_idx > 0
                    and current_page.lines_added > 0
                    and current_page.lines_added >= current_page.max_lines - 3):
                current_page = self._start_page(pages)

            while start_word < len(paragraph.words):
                # Lay out the paragraph (or its remainder) on the current page.
                # NOTE(review): the third return value is discarded; if the
                # layouter hands back leftover text of a split word there, it
                # is not carried over to the next page - confirm.
                success, failed_word_index, _remaining_pretext = paragraph_layouter(
                    paragraph, current_page, start_word
                )

                if success:
                    # Paragraph finished on this page.
                    break

                # Guard against an endless loop: if a still-empty page took
                # nothing and the word index did not advance, another
                # identical empty page cannot help.  Give up on the rest of
                # this paragraph instead of appending pages forever.
                stalled = (
                    failed_word_index is not None
                    and failed_word_index <= start_word
                    and current_page.lines_added == 0
                )
                if stalled:
                    break

                # Page is full: continue on a new one.
                current_page = self._start_page(pages)

                if failed_word_index is None:
                    # No resume point reported; move on to the next paragraph.
                    break
                start_word = failed_word_index

        return pages

    @staticmethod
    def _draw_centered(page, text, y, font, fill):
        """Draw ``text`` horizontally centred on ``page`` at vertical position ``y``."""
        bbox = page.draw.textbbox((0, 0), text, font=font)
        text_width = bbox[2] - bbox[0]
        x = (page.page_size[0] - text_width) // 2
        page.draw.text((x, y), text, fill=fill, font=font)

    def save_pages(self, pages: "List[MultiPage]", output_dir: str = "output/html_multipage_final"):
        """Stamp a header and footer on every page and save each as a PNG.

        Files are named ``page_001.png``, ``page_002.png``, ... inside
        ``output_dir``, which is created if it does not exist.  The path of
        each saved file is printed.

        Args:
            pages: Pages to decorate and save.
            output_dir: Destination directory.
        """
        os.makedirs(output_dir, exist_ok=True)

        # The fonts are the same for every page, so load them once up front.
        try:
            font = ImageFont.truetype(self._FONT_REGULAR, 10)
            title_font = ImageFont.truetype(self._FONT_BOLD, 11)
        except (OSError, ImportError):
            # OSError: font file missing or unreadable.
            # ImportError: Pillow without FreeType support.
            font = ImageFont.load_default()
            title_font = font

        for i, page in enumerate(pages, 1):
            # Document title in the header.
            self._draw_centered(
                page, "HTML Multi-Page Layout Demo", 8, title_font, (100, 100, 100)
            )

            # Page number in the footer, 20px above the bottom edge.
            self._draw_centered(
                page, f"Page {i} of {len(pages)}", page.page_size[1] - 20, font, (120, 120, 120)
            )

            # Write the page image to disk.
            filename = f"page_{i:03d}.png"
            filepath = os.path.join(output_dir, filename)
            page.image.save(filepath)
            print(f"Saved {filepath}")

        print(f"\nRendered {len(pages)} pages to {output_dir}/")


def main():
    """Run the demo end to end and print a progress report.

    Steps: build the sample HTML, parse it into paragraphs, lay the
    paragraphs out over pages, save the pages as PNG files, then print
    summary statistics.
    """
    print("HTML Multi-Page Rendering Demo - Final Version")
    print("=" * 55)

    # Build the longer HTML content for the multi-page demo.
    print("1. Creating comprehensive HTML content...")
    html_content = create_longer_html()
    print(f"   Created HTML document ({len(html_content)} characters)")

    # Small pages force the content onto several pages.
    print("\n2. Initializing renderer with smaller pages...")
    renderer = HTMLMultiPageRenderer(page_size=(400, 500))
    print("   Renderer initialized (400x500 pixel pages)")

    # Parse HTML to paragraphs.
    print("\n3. Parsing HTML to paragraphs...")
    paragraphs = renderer.parse_html_to_paragraphs(html_content)
    print(f"   Parsed {len(paragraphs)} paragraphs")

    # Break the paragraphs down into headings and regular paragraphs.
    heading_count = sum(1 for p in paragraphs if p.is_heading)
    regular_count = len(paragraphs) - heading_count
    print(f"   Found {heading_count} headings and {regular_count} regular paragraphs")

    # Render pages.
    print("\n4. Rendering pages...")
    pages = renderer.render_pages(paragraphs)
    print(f"   Rendered {len(pages)} pages")

    # Per-page line counts.
    total_lines = 0
    for i, page in enumerate(pages, 1):
        total_lines += page.lines_added
        print(f"     Page {i}: {page.lines_added} lines")

    # Save pages.
    print("\n5. Saving pages...")
    renderer.save_pages(pages)

    print("\n✓ Multi-page demo completed successfully!")
    print("\nTo view the results:")
    print("  - Check the output/html_multipage_final/ directory")
    print("  - Open the PNG files to see each rendered page")
    print("  - Notice how content flows naturally across pages")

    # render_pages returns no pages when nothing was parsed; avoid dividing
    # by zero in that case.
    average_lines = total_lines / len(pages) if pages else 0.0

    # Final statistics.
    print("\nFinal Statistics:")
    print(f"  - Original HTML: {len(html_content)} characters")
    print(f"  - Parsed paragraphs: {len(paragraphs)} ({heading_count} headings, {regular_count} regular)")
    print(f"  - Rendered pages: {len(pages)}")
    print(f"  - Total lines: {total_lines}")
    print(f"  - Average lines per page: {average_lines:.1f}")
    print(f"  - Page size: {renderer.page_size[0]}x{renderer.page_size[1]} pixels")

    print("\n🎉 This demonstrates the complete HTML → Multi-Page pipeline!")
    print(f"   The system successfully parsed HTML and laid it out across {len(pages)} pages.")


# Run the demo only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()