# pyWebLayout/examples/html_line_breaking_demo.py

#!/usr/bin/env python3
"""
HTML Line Breaking and Paragraph Breaking Demo

This example demonstrates the proper use of pyWebLayout's line breaking system:
1. Line breaking with very long sentences
2. Word wrapping with long words
3. Hyphenation of extremely long words using pyphen
4. Paragraph breaking across pages
5. Various text formatting scenarios

This showcases the robustness of the layout engine's text flow capabilities
using the actual pyWebLayout concrete classes and layout system.
"""

import os
import sys
from pathlib import Path
from typing import List, Tuple
from PIL import Image, ImageDraw, ImageFont

# Add pyWebLayout to path
sys.path.insert(0, str(Path(__file__).parent.parent))

from pyWebLayout.io.readers.html_extraction import parse_html_string
from pyWebLayout.layout.document_layouter import paragraph_layouter
from pyWebLayout.style.abstract_style import AbstractStyle
from pyWebLayout.style.concrete_style import StyleResolver, RenderingContext, ConcreteStyleRegistry
from pyWebLayout.style.page_style import PageStyle
from pyWebLayout.concrete import Page
from pyWebLayout.abstract.block import Paragraph, Heading
from pyWebLayout.abstract.inline import Word


def create_line_breaking_html() -> str:
    """Return the fixed HTML document used as input for the demo.

    The markup consists of one <h1>, four <h2> sections and ten <p>
    paragraphs containing very long words, long unbroken sentences,
    a URL, an e-mail address and a single space-free pseudo-word.
    """
    markup = """
    <html>
    <body>
        <h1>Line Breaking and Text Flow Demonstration</h1>

        <p>This paragraph contains some extraordinarily long words that will definitely require hyphenation when rendered on narrow pages: supercalifragilisticexpialidocious, antidisestablishmentarianism, pneumonoultramicroscopicsilicovolcanoconiosisology, and floccinaucinihilipilificationism.</p>

        <p>Here we have an extremely long sentence that goes on and on and on without any natural breaking points, demonstrating how the layout engine handles continuous text flow across multiple lines when the content exceeds the available width of the page and must be wrapped appropriately to maintain readability while preserving the semantic meaning of the original text content.</p>

        <h2>Technical Terms and Specialized Vocabulary</h2>

        <p>In the field of computational linguistics and natural language processing, we often encounter terminology such as morphophonological, psychopharmacological, electroencephalographic, and immunoelectrophoresis that challenges traditional typesetting systems.</p>

        <p>The implementation of sophisticated algorithms for handling such complex lexical items requires careful consideration of hyphenation patterns, word spacing constraints, and line breaking optimization to ensure that the resulting layout maintains both aesthetic appeal and functional readability across various display contexts and page dimensions.</p>

        <h2>Continuous Text Flow Example</h2>

        <p>Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>

        <p>Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt.</p>

        <h2>Mixed Content Challenges</h2>

        <p>URLs like https://www.verylongdomainnamethatshoulddemonstratehowurlsarehandledinlayoutsystems.com/with/very/long/paths/that/might/need/special/treatment and email addresses such as someone.with.a.very.long.email.address@anextraordinarilylong.domainname.extension can present unique challenges.</p>

        <p>Similarly, technical identifiers like ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890 or chemical compound names such as methylenedioxymethamphetamine require special handling for proper text flow and readability.</p>

        <h2>Extreme Line Breaking Test</h2>

        <p>Thisisaverylongwordwithoutanyspacesorpunctuationthatwillrequireforcedhyphenationtofitonnarrowpagesanddemonstratehowtheenginehandlesextremecases.</p>

        <p>Finally, we test mixed scenarios: normal words, supercalifragilisticexpialidocious, more normal text, antidisestablishmentarianism, and regular content to show how the engine transitions between different text types seamlessly.</p>

    </body>
    </html>
    """
    return markup


class HTMLMultiPageRenderer:
    """Lay out parsed HTML on a sequence of fixed-size pages and save them as PNGs.

    Typical use is ``render_html(html)`` followed by ``save_pages(directory)``.
    Every page created by the renderer is appended to ``self.pages``; the page
    currently being filled is kept in ``self.current_page``.
    """

    # Font used for the "Page X of Y" footer. When this file cannot be loaded
    # the renderer falls back to PIL's built-in default font.
    FOOTER_FONT_PATH = "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"
    FOOTER_FONT_SIZE = 8

    def __init__(self, page_width: int = 300, page_height: int = 400):
        """Set up the rendering context, style resolver and page style.

        Args:
            page_width: Width of every generated page, in pixels.
            page_height: Height of every generated page, in pixels.
        """
        self.page_width = page_width
        self.page_height = page_height
        self.pages = []
        self.current_page = None

        # Create rendering context for narrow pages
        self.context = RenderingContext(
            base_font_size=10,  # Small font for narrow pages
            available_width=page_width - 50,  # Account for borders
            available_height=page_height - 80,  # Account for borders and header
            default_language="en-US"
        )

        # Create style resolver
        self.style_resolver = StyleResolver(self.context)

        # Create page style for narrow pages
        self.page_style = PageStyle(
            border_width=2,
            border_color=(160, 160, 160),
            background_color=(255, 255, 255),
            padding=(20, 25, 20, 25)  # top, right, bottom, left
        )

    def create_new_page(self) -> "Page":
        """Create an empty page, register it in ``self.pages`` and return it.

        The page receives this renderer's style resolver, its available
        width/height are taken from ``page.content_size`` and its vertical
        cursor starts just below the top border and top padding.
        """
        page = Page(
            size=(self.page_width, self.page_height),
            style=self.page_style
        )

        # Set up the page with style resolver
        page.style_resolver = self.style_resolver

        # Calculate available dimensions
        page.available_width = page.content_size[0]
        page.available_height = page.content_size[1]
        page._current_y_offset = self.page_style.border_width + self.page_style.padding_top

        self.pages.append(page)
        return page

    def render_html(self, html_content: str) -> List["Page"]:
        """Parse ``html_content`` and flow its headings and paragraphs onto pages.

        Args:
            html_content: HTML markup to parse with ``parse_html_string``.

        Returns:
            ``self.pages``, including the pages created by this call.
        """
        print("Parsing HTML content...")

        # Parse HTML into blocks
        blocks = parse_html_string(html_content)
        print(f"Parsed {len(blocks)} blocks from HTML")

        paragraphs = self._convert_blocks(blocks)
        print(f"Created {len(paragraphs)} paragraphs for layout")

        # Layout always starts on a fresh page
        self.current_page = self.create_new_page()

        for number, paragraph in enumerate(paragraphs, 1):
            print(f"Laying out paragraph {number}/{len(paragraphs)} ({len(paragraph._words)} words)")
            self._layout_paragraph(paragraph, number)

        print("\nLayout complete:")
        print(f"  - Total pages: {len(self.pages)}")
        print(f"  - Total lines: {sum(len(page.children) for page in self.pages)}")

        return self.pages

    def _convert_blocks(self, blocks) -> List["Paragraph"]:
        """Turn parsed Heading/Paragraph blocks into freshly styled Paragraphs.

        Blocks of any other type, and blocks that end up without words, are
        skipped.
        """
        paragraphs = []
        for block in blocks:
            # Heading is checked before Paragraph, as in the original ordering,
            # in case Heading is a Paragraph subclass.
            if isinstance(block, Heading):
                is_major = block.level.value <= 2
                style = AbstractStyle(
                    font_size=14 if is_major else 12,
                    word_spacing=3.0,
                    word_spacing_min=1.0,
                    word_spacing_max=6.0,
                    language="en-US"
                )
                line_height = 18 if is_major else 16
                # words_iter() yields pairs; only the second item (the word) is used
                words = (word for _, word in block.words_iter())
                label = "heading"
            elif isinstance(block, Paragraph):
                style = AbstractStyle(
                    font_size=10,
                    word_spacing=2.0,
                    word_spacing_min=1.0,
                    word_spacing_max=4.0,
                    language="en-US"
                )
                line_height = 14
                words = block.words
                label = "paragraph"
            else:
                continue

            paragraph = Paragraph(style=style)
            paragraph.line_height = line_height
            for word in words:
                paragraph.add_word(word)

            if paragraph._words:
                paragraphs.append(paragraph)
                preview = ' '.join(w.text for w in paragraph._words[:5])
                print(f"Added {label}: {preview}...")

        return paragraphs

    def _layout_paragraph(self, paragraph, number: int) -> None:
        """Flow one paragraph onto the current page, adding pages when it is full.

        ``number`` is the 1-based paragraph index used in progress messages.
        """
        start_word = 0
        pretext = None

        while start_word < len(paragraph._words):
            page_was_empty = len(self.current_page.children) == 0

            success, failed_word_index, remaining_pretext = paragraph_layouter(
                paragraph, self.current_page, start_word, pretext
            )

            if success:
                # Paragraph completed on this page
                lines_on_page = len(self.current_page.children)
                print(f"  ✓ Paragraph completed on page {len(self.pages)} ({lines_on_page} lines)")
                return

            if failed_word_index is None:
                print(f"  ✗ Layout failed for paragraph {number}")
                return

            # Guard against an endless loop: if the layouter hands back exactly
            # the resume point it was given although the page was empty, another
            # identical empty page cannot help.
            # NOTE(review): assumes paragraph_layouter behaves the same for
            # identical arguments - confirm against its implementation.
            no_progress = failed_word_index == start_word and remaining_pretext == pretext
            if page_was_empty and no_progress:
                print(f"  ✗ Layout failed for paragraph {number}: no progress on an empty page")
                return

            # Page is full, continue on a new one from the reported position
            print(f"  → Page {len(self.pages)} full, continuing from word {failed_word_index}")
            start_word = failed_word_index
            pretext = remaining_pretext
            self.current_page = self.create_new_page()

    def _load_footer_font(self):
        """Return the footer font, or PIL's default font if the file is unusable."""
        try:
            return ImageFont.truetype(self.FOOTER_FONT_PATH, self.FOOTER_FONT_SIZE)
        except OSError:
            # Raised by PIL when the font file is missing or unreadable
            return ImageFont.load_default()

    def save_pages(self, output_dir: str):
        """Render every page to ``page_NNN.png`` inside ``output_dir``.

        The directory (including parents) is created if necessary. Each image
        gets a centred "Page X of Y" footer drawn 15 pixels above the bottom
        edge before it is written.

        Args:
            output_dir: Target directory for the PNG files.
        """
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        print(f"\nSaving {len(self.pages)} pages to {output_path}")
        if not self.pages:
            return

        # The footer font is the same for all pages, so load it once
        font = self._load_footer_font()
        page_count = len(self.pages)

        for i, page in enumerate(self.pages, 1):
            filename = f"page_{i:03d}.png"

            # Render the page using proper Page.render() method
            page_image = page.render()

            # Add page number at bottom, horizontally centred
            draw = ImageDraw.Draw(page_image)
            page_text = f"Page {i} of {page_count}"
            text_bbox = draw.textbbox((0, 0), page_text, font=font)
            text_width = text_bbox[2] - text_bbox[0]

            x = (self.page_width - text_width) // 2
            y = self.page_height - 15
            draw.text((x, y), page_text, fill=(120, 120, 120), font=font)

            # Save the page
            page_image.save(output_path / filename)
            print(f"  Saved {filename} ({len(page.children)} lines)")


def main():
    """Run the demo end to end.

    Builds the test HTML, lays it out on 300x400 pixel pages, writes the
    pages to ``output/html_line_breaking`` and prints a short report with
    line statistics.
    """
    print("HTML Line Breaking and Paragraph Breaking Demo")
    print("=" * 50)

    # Challenging input text
    html_content = create_line_breaking_html()
    print(f"Created HTML content ({len(html_content)} characters)")

    # Narrow pages force frequent line breaks
    renderer = HTMLMultiPageRenderer(page_width=300, page_height=400)
    pages = renderer.render_html(html_content)

    output_dir = "output/html_line_breaking"
    renderer.save_pages(output_dir)

    page_count = len(pages)
    print("\n✅ Demo complete!")
    print(f"   Generated {page_count} pages demonstrating:")
    for bullet in (
        "   - Line breaking with long sentences",
        "   - Word hyphenation for extremely long words",
        "   - Paragraph flow across multiple pages",
        "   - Mixed content handling",
    ):
        print(bullet)
    print(f"\n📁 Output saved to: {output_dir}/")

    # Summary statistics; the average is 0 when no pages were produced
    total_lines = sum(len(page.children) for page in pages)
    if pages:
        avg_lines_per_page = total_lines / page_count
    else:
        avg_lines_per_page = 0

    print("\n📊 Statistics:")
    print(f"   - Total lines rendered: {total_lines}")
    print(f"   - Average lines per page: {avg_lines_per_page:.1f}")
    print(f"   - Page dimensions: {renderer.page_width}x{renderer.page_height} pixels")


# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()