#!/usr/bin/env python3
"""
Simple HTML Multi-Page Rendering Demo

This example demonstrates a working HTML to multi-page layout system using the
proven patterns from the integration tests. It shows:

1. Parse HTML content using pyWebLayout's HTML extraction system
2. Layout the parsed content across multiple pages using the document layouter
3. Save each page as an image file

This is a simplified but functional implementation.
"""

import os
import sys
from functools import lru_cache
from pathlib import Path
from typing import List, Tuple

from PIL import Image, ImageDraw, ImageFont

# Add pyWebLayout to path
sys.path.insert(0, str(Path(__file__).parent.parent))

from pyWebLayout.io.readers.html_extraction import parse_html_string
from pyWebLayout.layout.document_layouter import paragraph_layouter
from pyWebLayout.style.abstract_style import AbstractStyle
from pyWebLayout.style.concrete_style import StyleResolver, RenderingContext
from pyWebLayout.style import Font
from pyWebLayout.abstract.block import Block, Paragraph, Heading
from pyWebLayout.abstract.inline import Word
from pyWebLayout.concrete.text import Line

# TrueType font used for all text drawn by this demo; when it cannot be read
# the PIL built-in default font is used instead.
_DEFAULT_FONT_PATH = "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"


@lru_cache(maxsize=8)
def _load_font(size):
    """Return the demo font at ``size`` points, or PIL's default font.

    The result is cached so the font file is read once per size instead of
    once per drawn line / saved page.
    """
    try:
        return ImageFont.truetype(_DEFAULT_FONT_PATH, size)
    except OSError:
        # Font file missing or unreadable on this system.
        return ImageFont.load_default()


class SimplePage:
    """A simple page implementation for multi-page layout.

    The page owns a white RGB image with a light grey border and exposes the
    attributes and methods this demo hands to ``paragraph_layouter``
    (``border_size``, ``available_width``, ``can_fit_line``, ``add_child``,
    ``style_resolver``, ``draw`` ...).
    """

    def __init__(self, width=600, height=800, max_lines=30):
        """Create a blank page.

        Args:
            width: Page width in pixels.
            height: Page height in pixels.
            max_lines: Maximum number of children accepted by
                ``can_fit_line`` regardless of remaining height.
        """
        self.border_size = 40
        # NOTE(review): this class never advances ``_current_y_offset``
        # itself; presumably the layouter does -- confirm.
        self._current_y_offset = self.border_size
        self.available_width = width - (2 * self.border_size)
        self.available_height = height - (2 * self.border_size)
        self.max_lines = max_lines
        self.lines_added = 0
        self.children = []
        self.page_size = (width, height)

        # Create a real drawing context
        self.image = Image.new('RGB', (width, height), 'white')
        self.draw = ImageDraw.Draw(self.image)

        # Create a real style resolver
        context = RenderingContext(base_font_size=16)
        self.style_resolver = StyleResolver(context)

        # Draw page border
        border_color = (220, 220, 220)
        self.draw.rectangle(
            [0, 0, width - 1, height - 1], outline=border_color, width=2
        )

    def can_fit_line(self, line_height):
        """Return True if a line of ``line_height`` pixels still fits.

        A line fits when the remaining vertical space is at least
        ``line_height`` and fewer than ``max_lines`` children were added.
        """
        used_height = self._current_y_offset - self.border_size
        remaining_height = self.available_height - used_height
        return remaining_height >= line_height and self.lines_added < self.max_lines

    def add_child(self, child):
        """Add a child element (like a Line) to the page and draw it.

        Every child counts towards ``lines_added``; only ``Line`` instances
        are drawn. Always returns True.
        """
        self.children.append(child)
        self.lines_added += 1

        # Draw the line content on the page
        if isinstance(child, Line):
            self._draw_line(child)

        return True

    def _draw_line(self, line):
        """Draw a line of text at the current vertical offset."""
        x = self.border_size + 10
        y = self._current_y_offset
        try:
            # Get line text (simplified)
            line_text = getattr(line, '_text_content', 'Line content')
            self.draw.text((x, y), line_text, fill=(0, 0, 0), font=_load_font(14))
        except Exception:
            # Deliberate best-effort: a line that cannot be drawn must not
            # abort the demo, so draw a simple placeholder instead.
            self.draw.text((x, y), "Text line", fill=(0, 0, 0))


class SimpleWord(Word):
    """A simple word implementation that works with the layouter."""

    def __init__(self, text, style=None):
        """Create a word; ``style`` defaults to a 14pt ``Font``."""
        if style is None:
            style = Font(font_size=14)
        super().__init__(text, style)

    def possible_hyphenation(self):
        """Return possible hyphenation points.

        Words of six characters or fewer are never split. Longer words get a
        single candidate split in the middle, returned as a one-element list
        of ``(first_half + "-", second_half)``.
        """
        if len(self.text) <= 6:
            return []

        # Simple hyphenation: split roughly in the middle
        mid = len(self.text) // 2
        return [(self.text[:mid] + "-", self.text[mid:])]


class SimpleParagraph:
    """A simple paragraph implementation that works with the layouter."""

    def __init__(self, text_content, style=None):
        """Split ``text_content`` on whitespace into ``SimpleWord`` objects.

        Args:
            text_content: Plain text of the paragraph.
            style: Paragraph style; defaults to an ``AbstractStyle`` with
                word spacing 4.0 (min 2.0, max 8.0).
        """
        if style is None:
            style = AbstractStyle(
                word_spacing=4.0,
                word_spacing_min=2.0,
                word_spacing_max=8.0
            )

        self.style = style
        self.line_height = 20

        # str.split() with no argument never yields empty or padded tokens,
        # so no extra stripping or filtering is needed.
        self.words = [SimpleWord(token) for token in text_content.split()]


def create_sample_html() -> str:
    """Create a sample HTML document for testing."""
    # NOTE(review): the sample below contains no HTML tags as written;
    # confirm that the parser still yields Paragraph/Heading blocks for it.
    return """
This is the first paragraph of our sample document. It demonstrates how HTML content can be parsed and then laid out across multiple pages using the pyWebLayout system.
Here's another paragraph with some more text to show how the system handles multiple paragraphs and automatic page breaking when content exceeds page boundaries.
The multi-page layout system includes several key features that make it suitable for ereader applications and document processing systems.
Each paragraph is processed individually and can span multiple lines or even multiple pages if the content is long enough to require it.
The implementation uses a sophisticated layout engine that processes abstract document elements and renders them onto concrete pages.
This separation allows for flexible styling and layout while maintaining the semantic structure of the original content.
The system can handle various HTML elements including headings, paragraphs, lists, and other block-level elements commonly found in documents.
Position tracking is maintained throughout the layout process, enabling features like bookmarking and navigation between different views of the content.
"""


def _word_text(word_item, unpack_tuples=True):
    """Return the text carried by one word entry, or None if there is none.

    Accepts objects with a ``text`` attribute, plain strings and, when
    ``unpack_tuples`` is true, ``(position, word_object)`` tuples.
    """
    if hasattr(word_item, 'text'):
        return word_item.text
    if unpack_tuples and isinstance(word_item, tuple) and len(word_item) >= 2:
        # Tuple format: (position, word_object)
        return getattr(word_item[1], 'text', None)
    if isinstance(word_item, str):
        return word_item
    return None


def _block_text(block):
    """Return the words of ``block`` joined by single spaces ('' if none)."""
    text_parts = []

    # Preferred source: the callable ``words`` accessor.
    if hasattr(block, 'words') and callable(block.words):
        for word_item in block.words():
            text = _word_text(word_item)
            if text is not None:
                text_parts.append(text)

    # Fallback: try _words attribute directly
    if not text_parts and hasattr(block, '_words'):
        for word_item in block._words:
            text = _word_text(word_item, unpack_tuples=False)
            if text is not None:
                text_parts.append(text)

    return " ".join(text_parts)


class HTMLMultiPageRenderer:
    """Simple HTML to multi-page renderer."""

    def __init__(self, page_size: Tuple[int, int] = (600, 800)):
        """Store the ``(width, height)`` in pixels used for every page."""
        self.page_size = page_size

    def parse_html_to_paragraphs(self, html_content: str) -> List[SimpleParagraph]:
        """Parse HTML content into simple paragraphs.

        Only ``Paragraph`` and ``Heading`` blocks that carry text are kept;
        headings receive a slightly wider word spacing than paragraphs.
        """
        # Parse HTML using the extraction system
        base_font = Font(font_size=14)
        blocks = parse_html_string(html_content, base_font=base_font)

        paragraphs = []
        for block in blocks:
            if not isinstance(block, (Paragraph, Heading)):
                continue

            text_content = _block_text(block)
            if not text_content:
                continue

            # Create appropriate style based on block type
            if isinstance(block, Heading):
                style = AbstractStyle(
                    word_spacing=5.0,
                    word_spacing_min=3.0,
                    word_spacing_max=10.0
                )
            else:
                style = AbstractStyle(
                    word_spacing=4.0,
                    word_spacing_min=2.0,
                    word_spacing_max=8.0
                )

            paragraphs.append(SimpleParagraph(text_content, style))

        return paragraphs

    def render_pages(self, paragraphs: List[SimpleParagraph]) -> List[SimplePage]:
        """Render paragraphs into multiple pages.

        Each paragraph is handed to ``paragraph_layouter``; when the layouter
        reports failure a new page is started and layout resumes at the
        reported word index.

        Returns:
            The pages in order; an empty list when ``paragraphs`` is empty.
        """
        if not paragraphs:
            return []

        current_page = SimplePage(*self.page_size)
        pages = [current_page]
        # True while nothing has been laid out on ``current_page`` yet.
        page_is_fresh = True

        for paragraph in paragraphs:
            start_word = 0

            while start_word < len(paragraph.words):
                # Try to layout the paragraph (or remaining part) on current page
                # NOTE(review): the returned pretext is not fed back into the
                # next call; confirm whether the layouter expects it.
                success, failed_word_index, _remaining_pretext = paragraph_layouter(
                    paragraph, current_page, start_word
                )

                if success:
                    # Paragraph completed on this page
                    page_is_fresh = False
                    break

                # Nothing fit on a brand-new page and the word index did not
                # advance: another blank page cannot help, so stop instead of
                # creating pages forever.
                stalled = (
                    page_is_fresh
                    and current_page.lines_added == 0
                    and failed_word_index is not None
                    and failed_word_index <= start_word
                )
                if stalled:
                    break

                # Page is full, create a new page
                current_page = SimplePage(*self.page_size)
                pages.append(current_page)
                page_is_fresh = True

                if failed_word_index is None:
                    # If no specific word failed, move to next paragraph
                    break

                # Continue with the failed word on the new page
                start_word = failed_word_index

        return pages

    def save_pages(self, pages: List[SimplePage], output_dir: str = "output/html_simple"):
        """Save pages as image files.

        Draws a centred "Page N" footer on every page and writes it to
        ``output_dir`` as ``page_NNN.png``, creating the directory if needed.
        """
        os.makedirs(output_dir, exist_ok=True)

        font = _load_font(12)

        for i, page in enumerate(pages, 1):
            # Add page number
            page_text = f"Page {i}"
            text_bbox = page.draw.textbbox((0, 0), page_text, font=font)
            text_width = text_bbox[2] - text_bbox[0]
            text_x = (page.page_size[0] - text_width) // 2
            text_y = page.page_size[1] - 25

            page.draw.text((text_x, text_y), page_text, fill=(100, 100, 100), font=font)

            # Save the page
            filename = f"page_{i:03d}.png"
            filepath = os.path.join(output_dir, filename)
            page.image.save(filepath)
            print(f"Saved {filepath}")

        print(f"\nRendered {len(pages)} pages to {output_dir}/")


def main():
    """Main demo function: build sample content, lay it out, save the pages."""
    print("Simple HTML Multi-Page Rendering Demo")
    print("=" * 45)

    # Create sample HTML content
    print("1. Creating sample HTML content...")
    html_content = create_sample_html()
    print(f" Created HTML document ({len(html_content)} characters)")

    # Initialize renderer
    print("\n2. Initializing renderer...")
    renderer = HTMLMultiPageRenderer(page_size=(600, 800))
    print(" Renderer initialized")

    # Parse HTML to paragraphs
    print("\n3. Parsing HTML to paragraphs...")
    paragraphs = renderer.parse_html_to_paragraphs(html_content)
    print(f" Parsed {len(paragraphs)} paragraphs")

    # Show paragraph preview
    for i, para in enumerate(paragraphs[:3]):  # Show first 3
        preview = " ".join(word.text for word in para.words[:8])  # First 8 words
        if len(para.words) > 8:
            preview += "..."
        print(f" Paragraph {i+1}: {preview}")

    if len(paragraphs) > 3:
        print(f" ... and {len(paragraphs) - 3} more paragraphs")

    # Render pages
    print("\n4. Rendering pages...")
    pages = renderer.render_pages(paragraphs)
    print(f" Rendered {len(pages)} pages")

    # Show page statistics
    for i, page in enumerate(pages, 1):
        print(f" Page {i}: {page.lines_added} lines")

    # Save pages
    print("\n5. Saving pages...")
    renderer.save_pages(pages)

    print("\n✓ Demo completed successfully!")
    print("\nTo view the results:")
    print(" - Check the output/html_simple/ directory")
    print(" - Open the PNG files to see each rendered page")

    # Show statistics
    print("\nStatistics:")
    print(f" - Original HTML: {len(html_content)} characters")
    print(f" - Parsed paragraphs: {len(paragraphs)}")
    print(f" - Rendered pages: {len(pages)}")
    print(f" - Total lines: {sum(page.lines_added for page in pages)}")
    print(f" - Page size: {renderer.page_size[0]}x{renderer.page_size[1]} pixels")


if __name__ == "__main__":
    main()