#!/usr/bin/env python3
"""
HTML Multi-Page Rendering Demo - Final Version

This example demonstrates a complete HTML to multi-page layout system that:
1. Parses HTML content using pyWebLayout's HTML extraction system
2. Layouts content across multiple pages using the document layouter
3. Saves each page as an image file
4. Shows true multi-page functionality with smaller pages

This demonstrates the complete pipeline from HTML to multi-page layout.
"""

import functools
import os
import sys
from pathlib import Path
from typing import List, Optional, Tuple

from PIL import Image, ImageDraw, ImageFont

# Add pyWebLayout to path (must run before the pyWebLayout imports below)
sys.path.insert(0, str(Path(__file__).parent.parent))

from pyWebLayout.io.readers.html_extraction import parse_html_string
from pyWebLayout.layout.document_layouter import paragraph_layouter
from pyWebLayout.style.abstract_style import AbstractStyle
from pyWebLayout.style.concrete_style import StyleResolver, RenderingContext
from pyWebLayout.style import Font
from pyWebLayout.abstract.block import Block, Paragraph, Heading
from pyWebLayout.abstract.inline import Word
from pyWebLayout.concrete.text import Line

# Font files used for drawing; a built-in PIL font is used when they are absent.
_REGULAR_FONT_PATH = "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"
_BOLD_FONT_PATH = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"


@functools.lru_cache(maxsize=8)
def _load_font(path: str, size: int):
    """Return the TrueType font at *path*, or PIL's default font if unreadable.

    The result is cached so the font file is not re-read for every drawn line.
    """
    try:
        return ImageFont.truetype(path, size)
    except OSError:
        # The font file is missing or cannot be read on this machine.
        return ImageFont.load_default()


def _paragraph_style(is_heading: bool) -> AbstractStyle:
    """Build the word-spacing style the demo uses for headings or body text."""
    if is_heading:
        return AbstractStyle(
            word_spacing=4.0,
            word_spacing_min=2.0,
            word_spacing_max=8.0
        )
    return AbstractStyle(
        word_spacing=3.0,
        word_spacing_min=2.0,
        word_spacing_max=6.0
    )


class MultiPage:
    """A page implementation optimized for multi-page layout demonstration.

    The page owns a PIL image and drawing context, counts the lines added to
    it, and reports whether another line still fits.
    """

    HEADER_SPACE = 20           # Vertical space left above the first line
    HEADER_FOOTER_RESERVE = 40  # Height kept free for header and footer
    LINE_SPACING = 18           # Vertical advance after each added child

    def __init__(self, width=400, height=500, max_lines=15):
        """Create a blank white page and draw its border and header rule.

        Args:
            width: Page width in pixels.
            height: Page height in pixels.
            max_lines: Maximum number of children the page accepts.
        """
        # Smaller pages for multi-page demo
        self.border_size = 30
        self._current_y_offset = self.border_size + self.HEADER_SPACE
        self.available_width = width - (2 * self.border_size)
        self.available_height = (
            height - (2 * self.border_size) - self.HEADER_FOOTER_RESERVE
        )
        self.max_lines = max_lines
        self.lines_added = 0
        self.children = []
        self.page_size = (width, height)

        # Create a real drawing context
        self.image = Image.new('RGB', (width, height), 'white')
        self.draw = ImageDraw.Draw(self.image)

        # Create a real style resolver
        context = RenderingContext(base_font_size=14)
        self.style_resolver = StyleResolver(context)

        # Draw page border and header area
        border_color = (180, 180, 180)
        self.draw.rectangle([0, 0, width - 1, height - 1],
                            outline=border_color, width=2)

        # Draw header line
        header_y = self.border_size + 15
        self.draw.line(
            [self.border_size, header_y, width - self.border_size, header_y],
            fill=border_color, width=1)

    def can_fit_line(self, line_height):
        """Check if another line can fit on the page.

        Returns True only when the remaining vertical space is at least
        *line_height* and fewer than ``max_lines`` children have been added.
        """
        used_height = (
            self._current_y_offset - self.border_size - self.HEADER_SPACE
        )
        remaining_height = self.available_height - used_height
        return (remaining_height >= line_height
                and self.lines_added < self.max_lines)

    def add_child(self, child):
        """Add a child element (like a Line) to the page.

        Every child counts as one line and advances the vertical offset by
        ``LINE_SPACING``; only ``Line`` instances are drawn. Always returns
        True.
        """
        self.children.append(child)
        self.lines_added += 1

        # Draw the line content on the page
        if isinstance(child, Line):
            self._draw_line(child)

        # Update y offset for next line
        self._current_y_offset += self.LINE_SPACING
        return True

    def _draw_line(self, line):
        """Draw a line of text on the page at the current vertical offset."""
        position = (self.border_size + 5, self._current_y_offset)
        black = (0, 0, 0)
        try:
            font = _load_font(_REGULAR_FONT_PATH, 12)
            # Get line text (simplified - in real implementation this would
            # be more complex)
            line_text = getattr(line, '_text_content', 'Text line')
            self.draw.text(position, line_text, fill=black, font=font)
        except Exception:
            # Deliberate best effort for the demo: whatever went wrong, still
            # mark the line's position with a simple representation.
            self.draw.text(position, "Text line", fill=black)


class SimpleWord(Word):
    """A simple word implementation that works with the layouter."""

    def __init__(self, text, style=None):
        """Create the word, defaulting to a 12pt ``Font`` when no style is given."""
        if style is None:
            style = Font(font_size=12)  # Smaller font for more content per page
        super().__init__(text, style)

    def possible_hyphenation(self):
        """Return possible hyphenation points.

        Words of six characters or fewer yield an empty list; longer words
        yield a single ``(first_half + "-", second_half)`` split.
        """
        if len(self.text) <= 6:
            return []

        # Simple hyphenation: split roughly in the middle
        mid = len(self.text) // 2
        return [(self.text[:mid] + "-", self.text[mid:])]


class SimpleParagraph:
    """A simple paragraph implementation that works with the layouter."""

    def __init__(self, text_content, style=None, is_heading=False):
        """Split *text_content* into words and record the paragraph styling.

        Args:
            text_content: Paragraph text; split on whitespace into words.
            style: Style to use; a default word-spacing style is chosen from
                *is_heading* when omitted.
            is_heading: Whether the paragraph is a heading.
        """
        if style is None:
            style = _paragraph_style(is_heading)

        self.style = style
        # Slightly larger for headings
        self.line_height = 22 if is_heading else 18
        self.is_heading = is_heading

        # str.split() without arguments never yields empty or padded tokens,
        # so the tokens can be wrapped directly.
        self.words = [SimpleWord(token) for token in text_content.split()]


def create_longer_html() -> str:
    """Create a longer HTML document that will definitely span multiple pages."""
    # NOTE(review): no HTML tags are visible in this string as seen here;
    # confirm the markup against the original file.
    return """
This comprehensive document demonstrates the capabilities of the pyWebLayout system for rendering HTML content across multiple pages. The system is designed to handle complex document structures while maintaining precise control over layout and formatting.
The multi-page layout engine processes content incrementally, ensuring that text flows naturally from one page to the next. This approach is essential for creating professional-quality documents and ereader applications.
Document layout systems have evolved significantly over the years, from simple text processors to sophisticated engines capable of handling complex typography, multiple columns, and advanced formatting features.
The pyWebLayout system represents a modern approach to document processing, combining the flexibility of HTML with the precision required for high-quality page layout. This makes it suitable for a wide range of applications.
Key features of the system include automatic page breaking, font scaling support, position tracking for navigation, and comprehensive support for HTML elements including headings, paragraphs, lists, tables, and inline formatting.
The system is built on a layered architecture that separates content parsing from layout rendering. This separation allows for maximum flexibility while maintaining performance and reliability.
At the core of the system is the HTML extraction module, which converts HTML elements into abstract document structures. These structures are then processed by the layout engine to produce concrete page representations.
The layout engine uses sophisticated algorithms to determine optimal line breaks, word spacing, and page boundaries. It can handle complex scenarios such as hyphenation, widow and orphan control, and multi-column layouts.
This technology has numerous practical applications in modern software development. Ereader applications benefit from the precise position tracking and font scaling capabilities, while document processing systems can leverage the robust HTML parsing.
The system is particularly well-suited for applications that need to display long-form content in a paginated format. This includes digital books, technical documentation, reports, and academic papers.
Performance characteristics are excellent, with sub-second rendering times for typical documents. The system can handle documents with thousands of pages while maintaining responsive user interaction.
Beyond basic text layout, the system supports advanced features such as bidirectional text rendering, complex table layouts, and embedded images. These features make it suitable for international applications and rich content.
The position tracking system is particularly noteworthy, as it maintains stable references to content locations even when layout parameters change. This enables features like bookmarking and search result highlighting.
Font scaling is implemented at the layout level, ensuring that all elements scale proportionally while maintaining optimal readability. This is crucial for accessibility and user preference support.
The pyWebLayout system demonstrates that it's possible to create sophisticated document layout engines using modern Python technologies. The combination of HTML parsing, abstract document modeling, and precise layout control provides a powerful foundation for document-centric applications.
This example has shown the complete pipeline from HTML input to multi-page output, illustrating how the various components work together to produce high-quality results. The system is ready for use in production applications requiring professional document layout capabilities.
"""


class HTMLMultiPageRenderer:
    """HTML to multi-page renderer with enhanced multi-page demonstration."""

    def __init__(self, page_size: Tuple[int, int] = (400, 500)):
        """Remember the ``(width, height)`` used for every page created."""
        self.page_size = page_size

    @staticmethod
    def _word_text(word_item) -> Optional[str]:
        """Return the text carried by one word entry, or None if it has none.

        Accepts objects exposing ``.text``, tuples whose second element
        exposes ``.text`` (position, word_object), and plain strings.
        """
        if hasattr(word_item, 'text'):
            return word_item.text
        if isinstance(word_item, tuple) and len(word_item) >= 2:
            word_obj = word_item[1]
            return word_obj.text if hasattr(word_obj, 'text') else None
        if isinstance(word_item, str):
            return word_item
        return None

    @classmethod
    def _block_text(cls, block) -> str:
        """Join the word texts of *block* with spaces ('' when none found)."""
        texts: List[str] = []

        # Preferred source: the block's words() callable.
        words_attr = getattr(block, 'words', None)
        if callable(words_attr):
            texts = [
                text for text in map(cls._word_text, words_attr())
                if text is not None
            ]

        # Fallback: try _words attribute directly
        if not texts and hasattr(block, '_words'):
            texts = [
                text for text in map(cls._word_text, block._words)
                if text is not None
            ]

        return " ".join(texts)

    def parse_html_to_paragraphs(self, html_content: str) -> List[SimpleParagraph]:
        """Parse HTML content into simple paragraphs.

        Only ``Paragraph`` and ``Heading`` blocks are kept, and blocks from
        which no text could be extracted are skipped.
        """
        # Parse HTML using the extraction system
        base_font = Font(font_size=12)
        blocks = parse_html_string(html_content, base_font=base_font)

        paragraphs = []
        for block in blocks:
            if not isinstance(block, (Paragraph, Heading)):
                continue

            text_content = self._block_text(block)
            if not text_content:
                continue

            is_heading = isinstance(block, Heading)
            style = _paragraph_style(is_heading)
            paragraphs.append(SimpleParagraph(text_content, style, is_heading))

        return paragraphs

    def render_pages(self, paragraphs: List[SimpleParagraph]) -> List[MultiPage]:
        """Render paragraphs into multiple pages.

        Returns an empty list for empty input; otherwise at least one page.
        """
        if not paragraphs:
            return []

        current_page = MultiPage(*self.page_size)
        pages = [current_page]

        for para_idx, paragraph in enumerate(paragraphs):
            start_word = 0

            # Keep a heading together with some content: if the page is
            # within three lines of its limit, start the heading on a new
            # page (never for the first paragraph or on an empty page).
            heading_near_page_end = (
                paragraph.is_heading
                and para_idx > 0
                and current_page.lines_added > 0
                and current_page.lines_added >= current_page.max_lines - 3
            )
            if heading_near_page_end:
                current_page = MultiPage(*self.page_size)
                pages.append(current_page)

            while start_word < len(paragraph.words):
                page_was_empty = current_page.lines_added == 0

                # Try to layout the paragraph (or remaining part) on current page
                success, failed_word_index, _remaining_pretext = paragraph_layouter(
                    paragraph, current_page, start_word
                )
                if success:
                    # Paragraph completed on this page
                    break

                # An untouched page that accepted nothing and did not advance
                # the word index would fail identically on every fresh page;
                # stop here instead of allocating pages forever.
                if (
                    page_was_empty
                    and current_page.lines_added == 0
                    and failed_word_index is not None
                    and failed_word_index <= start_word
                ):
                    break

                # Page is full, create a new page
                current_page = MultiPage(*self.page_size)
                pages.append(current_page)

                if failed_word_index is None:
                    # If no specific word failed, move to next paragraph
                    break

                # Continue with the failed word on the new page
                start_word = failed_word_index

        return pages

    def save_pages(self, pages: List[MultiPage],
                   output_dir: str = "output/html_multipage_final"):
        """Save pages as image files with enhanced formatting.

        Draws a centered title header and a "Page i of n" footer on each
        page, then writes it to ``output_dir`` as ``page_NNN.png``. The
        directory is created if it does not exist.
        """
        os.makedirs(output_dir, exist_ok=True)

        # The fonts do not depend on the page, so load them once. If either
        # file is unreadable, both fall back to the default font.
        try:
            font = ImageFont.truetype(_REGULAR_FONT_PATH, 10)
            title_font = ImageFont.truetype(_BOLD_FONT_PATH, 11)
        except OSError:
            font = ImageFont.load_default()
            title_font = font

        header_text = "HTML Multi-Page Layout Demo"
        page_count = len(pages)

        for i, page in enumerate(pages, 1):
            page_width, page_height = page.page_size

            # Add document title in header
            text_bbox = page.draw.textbbox((0, 0), header_text, font=title_font)
            text_width = text_bbox[2] - text_bbox[0]
            text_x = (page_width - text_width) // 2
            page.draw.text((text_x, 8), header_text,
                           fill=(100, 100, 100), font=title_font)

            # Add page number in footer
            page_text = f"Page {i} of {page_count}"
            text_bbox = page.draw.textbbox((0, 0), page_text, font=font)
            text_width = text_bbox[2] - text_bbox[0]
            text_x = (page_width - text_width) // 2
            page.draw.text((text_x, page_height - 20), page_text,
                           fill=(120, 120, 120), font=font)

            # Save the page
            filename = f"page_{i:03d}.png"
            filepath = os.path.join(output_dir, filename)
            page.image.save(filepath)
            print(f"Saved {filepath}")

        print(f"\nRendered {len(pages)} pages to {output_dir}/")


def main():
    """Main demo function."""
    print("HTML Multi-Page Rendering Demo - Final Version")
    print("=" * 55)

    # Create longer HTML content for multi-page demo
    print("1. Creating comprehensive HTML content...")
    html_content = create_longer_html()
    print(f" Created HTML document ({len(html_content)} characters)")

    # Initialize renderer with smaller pages to force multi-page layout
    print("\n2. Initializing renderer with smaller pages...")
    renderer = HTMLMultiPageRenderer(page_size=(400, 500))  # Smaller pages
    print(" Renderer initialized (400x500 pixel pages)")

    # Parse HTML to paragraphs
    print("\n3. Parsing HTML to paragraphs...")
    paragraphs = renderer.parse_html_to_paragraphs(html_content)
    print(f" Parsed {len(paragraphs)} paragraphs")

    # Show paragraph preview
    heading_count = sum(1 for p in paragraphs if p.is_heading)
    regular_count = len(paragraphs) - heading_count
    print(f" Found {heading_count} headings and {regular_count} regular paragraphs")

    # Render pages
    print("\n4. Rendering pages...")
    pages = renderer.render_pages(paragraphs)
    print(f" Rendered {len(pages)} pages")

    # Show page statistics
    total_lines = 0
    for i, page in enumerate(pages, 1):
        total_lines += page.lines_added
        print(f" Page {i}: {page.lines_added} lines")

    # Save pages
    print("\n5. Saving pages...")
    renderer.save_pages(pages)

    # NOTE(review): the leading glyph in the next message (and the glyphs in
    # the closing message below) look mis-encoded; confirm the intended
    # characters against the original file.
    print("\nā Multi-page demo completed successfully!")
    print("\nTo view the results:")
    print(" - Check the output/html_multipage_final/ directory")
    print(" - Open the PNG files to see each rendered page")
    print(" - Notice how content flows naturally across pages")

    # render_pages() returns no pages when nothing was parsed; avoid dividing
    # by zero in that case.
    average_lines = total_lines / len(pages) if pages else 0.0

    # Show final statistics
    print("\nFinal Statistics:")
    print(f" - Original HTML: {len(html_content)} characters")
    print(f" - Parsed paragraphs: {len(paragraphs)} ({heading_count} headings, {regular_count} regular)")
    print(f" - Rendered pages: {len(pages)}")
    print(f" - Total lines: {total_lines}")
    print(f" - Average lines per page: {average_lines:.1f}")
    print(f" - Page size: {renderer.page_size[0]}x{renderer.page_size[1]} pixels")

    print("\nš This demonstrates the complete HTML ā Multi-Page pipeline!")
    print(f" The system successfully parsed HTML and laid it out across {len(pages)} pages.")


if __name__ == "__main__":
    main()