452 lines
19 KiB
Python
452 lines
19 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
HTML Multi-Page Rendering Demo - Final Version
|
|
|
|
This example demonstrates a complete HTML to multi-page layout system that:
|
|
1. Parses HTML content using pyWebLayout's HTML extraction system
|
|
2. Layouts content across multiple pages using the document layouter
|
|
3. Saves each page as an image file
|
|
4. Shows true multi-page functionality with smaller pages
|
|
|
|
This demonstrates the complete pipeline from HTML to multi-page layout.
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import List, Tuple
|
|
from PIL import Image, ImageDraw, ImageFont
|
|
|
|
# Add pyWebLayout to path
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
from pyWebLayout.io.readers.html_extraction import parse_html_string
|
|
from pyWebLayout.layout.document_layouter import paragraph_layouter
|
|
from pyWebLayout.style.abstract_style import AbstractStyle
|
|
from pyWebLayout.style.concrete_style import StyleResolver, RenderingContext
|
|
from pyWebLayout.style import Font
|
|
from pyWebLayout.abstract.block import Block, Paragraph, Heading
|
|
from pyWebLayout.abstract.inline import Word
|
|
from pyWebLayout.concrete.text import Line
|
|
|
|
|
|
class MultiPage:
|
|
"""A page implementation optimized for multi-page layout demonstration."""
|
|
|
|
def __init__(self, width=400, height=500, max_lines=15): # Smaller pages for multi-page demo
|
|
self.border_size = 30
|
|
self._current_y_offset = self.border_size + 20 # Leave space for header
|
|
self.available_width = width - (2 * self.border_size)
|
|
self.available_height = height - (2 * self.border_size) - 40 # Space for header/footer
|
|
self.max_lines = max_lines
|
|
self.lines_added = 0
|
|
self.children = []
|
|
self.page_size = (width, height)
|
|
|
|
# Create a real drawing context
|
|
self.image = Image.new('RGB', (width, height), 'white')
|
|
self.draw = ImageDraw.Draw(self.image)
|
|
|
|
# Create a real style resolver
|
|
context = RenderingContext(base_font_size=14)
|
|
self.style_resolver = StyleResolver(context)
|
|
|
|
# Draw page border and header area
|
|
border_color = (180, 180, 180)
|
|
self.draw.rectangle([0, 0, width-1, height-1], outline=border_color, width=2)
|
|
|
|
# Draw header line
|
|
header_y = self.border_size + 15
|
|
self.draw.line([self.border_size, header_y, width - self.border_size, header_y],
|
|
fill=border_color, width=1)
|
|
|
|
def can_fit_line(self, line_height):
|
|
"""Check if another line can fit on the page."""
|
|
remaining_height = self.available_height - (self._current_y_offset - self.border_size - 20)
|
|
can_fit = remaining_height >= line_height and self.lines_added < self.max_lines
|
|
return can_fit
|
|
|
|
def add_child(self, child):
|
|
"""Add a child element (like a Line) to the page."""
|
|
self.children.append(child)
|
|
self.lines_added += 1
|
|
|
|
# Draw the line content on the page
|
|
if isinstance(child, Line):
|
|
self._draw_line(child)
|
|
|
|
# Update y offset for next line
|
|
self._current_y_offset += 18 # Line spacing
|
|
|
|
return True
|
|
|
|
def _draw_line(self, line):
|
|
"""Draw a line of text on the page."""
|
|
try:
|
|
# Use a default font for drawing
|
|
try:
|
|
font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 12)
|
|
except:
|
|
font = ImageFont.load_default()
|
|
|
|
# Get line text (simplified - in real implementation this would be more complex)
|
|
line_text = getattr(line, '_text_content', 'Text line')
|
|
|
|
# Draw the text
|
|
text_color = (0, 0, 0) # Black
|
|
x = self.border_size + 5
|
|
y = self._current_y_offset
|
|
|
|
self.draw.text((x, y), line_text, fill=text_color, font=font)
|
|
|
|
except Exception as e:
|
|
# Fallback: draw a simple representation
|
|
x = self.border_size + 5
|
|
y = self._current_y_offset
|
|
self.draw.text((x, y), "Text line", fill=(0, 0, 0))
|
|
|
|
|
|
class SimpleWord(Word):
|
|
"""A simple word implementation that works with the layouter."""
|
|
|
|
def __init__(self, text, style=None):
|
|
if style is None:
|
|
style = Font(font_size=12) # Smaller font for more content per page
|
|
super().__init__(text, style)
|
|
|
|
def possible_hyphenation(self):
|
|
"""Return possible hyphenation points."""
|
|
if len(self.text) <= 6:
|
|
return []
|
|
|
|
# Simple hyphenation: split roughly in the middle
|
|
mid = len(self.text) // 2
|
|
return [(self.text[:mid] + "-", self.text[mid:])]
|
|
|
|
|
|
class SimpleParagraph:
|
|
"""A simple paragraph implementation that works with the layouter."""
|
|
|
|
def __init__(self, text_content, style=None, is_heading=False):
|
|
if style is None:
|
|
if is_heading:
|
|
style = AbstractStyle(
|
|
word_spacing=4.0,
|
|
word_spacing_min=2.0,
|
|
word_spacing_max=8.0
|
|
)
|
|
else:
|
|
style = AbstractStyle(
|
|
word_spacing=3.0,
|
|
word_spacing_min=2.0,
|
|
word_spacing_max=6.0
|
|
)
|
|
|
|
self.style = style
|
|
self.line_height = 18 if not is_heading else 22 # Slightly larger for headings
|
|
self.is_heading = is_heading
|
|
|
|
# Create words from text content
|
|
self.words = []
|
|
for word_text in text_content.split():
|
|
if word_text.strip():
|
|
word = SimpleWord(word_text.strip())
|
|
self.words.append(word)
|
|
|
|
|
|
def create_longer_html() -> str:
|
|
"""Create a longer HTML document that will definitely span multiple pages."""
|
|
return """
|
|
<html>
|
|
<body>
|
|
<h1>The Complete Guide to Multi-Page Layout Systems</h1>
|
|
|
|
<p>This comprehensive document demonstrates the capabilities of the pyWebLayout system
|
|
for rendering HTML content across multiple pages. The system is designed to handle
|
|
complex document structures while maintaining precise control over layout and formatting.</p>
|
|
|
|
<p>The multi-page layout engine processes content incrementally, ensuring that text
|
|
flows naturally from one page to the next. This approach is essential for creating
|
|
professional-quality documents and ereader applications.</p>
|
|
|
|
<h2>Chapter 1: Introduction to Document Layout</h2>
|
|
|
|
<p>Document layout systems have evolved significantly over the years, from simple
|
|
text processors to sophisticated engines capable of handling complex typography,
|
|
multiple columns, and advanced formatting features.</p>
|
|
|
|
<p>The pyWebLayout system represents a modern approach to document processing,
|
|
combining the flexibility of HTML with the precision required for high-quality
|
|
page layout. This makes it suitable for a wide range of applications.</p>
|
|
|
|
<p>Key features of the system include automatic page breaking, font scaling support,
|
|
position tracking for navigation, and comprehensive support for HTML elements
|
|
including headings, paragraphs, lists, tables, and inline formatting.</p>
|
|
|
|
<h2>Chapter 2: Technical Architecture</h2>
|
|
|
|
<p>The system is built on a layered architecture that separates content parsing
|
|
from layout rendering. This separation allows for maximum flexibility while
|
|
maintaining performance and reliability.</p>
|
|
|
|
<p>At the core of the system is the HTML extraction module, which converts HTML
|
|
elements into abstract document structures. These structures are then processed
|
|
by the layout engine to produce concrete page representations.</p>
|
|
|
|
<p>The layout engine uses sophisticated algorithms to determine optimal line breaks,
|
|
word spacing, and page boundaries. It can handle complex scenarios such as
|
|
hyphenation, widow and orphan control, and multi-column layouts.</p>
|
|
|
|
<h2>Chapter 3: Practical Applications</h2>
|
|
|
|
<p>This technology has numerous practical applications in modern software development.
|
|
Ereader applications benefit from the precise position tracking and font scaling
|
|
capabilities, while document processing systems can leverage the robust HTML parsing.</p>
|
|
|
|
<p>The system is particularly well-suited for applications that need to display
|
|
long-form content in a paginated format. This includes digital books, technical
|
|
documentation, reports, and academic papers.</p>
|
|
|
|
<p>Performance characteristics are excellent, with sub-second rendering times for
|
|
typical documents. The system can handle documents with thousands of pages while
|
|
maintaining responsive user interaction.</p>
|
|
|
|
<h2>Chapter 4: Advanced Features</h2>
|
|
|
|
<p>Beyond basic text layout, the system supports advanced features such as
|
|
bidirectional text rendering, complex table layouts, and embedded images.
|
|
These features make it suitable for international applications and rich content.</p>
|
|
|
|
<p>The position tracking system is particularly noteworthy, as it maintains
|
|
stable references to content locations even when layout parameters change.
|
|
This enables features like bookmarking and search result highlighting.</p>
|
|
|
|
<p>Font scaling is implemented at the layout level, ensuring that all elements
|
|
scale proportionally while maintaining optimal readability. This is crucial
|
|
for accessibility and user preference support.</p>
|
|
|
|
<h2>Conclusion</h2>
|
|
|
|
<p>The pyWebLayout system demonstrates that it's possible to create sophisticated
|
|
document layout engines using modern Python technologies. The combination of
|
|
HTML parsing, abstract document modeling, and precise layout control provides
|
|
a powerful foundation for document-centric applications.</p>
|
|
|
|
<p>This example has shown the complete pipeline from HTML input to multi-page
|
|
output, illustrating how the various components work together to produce
|
|
high-quality results. The system is ready for use in production applications
|
|
requiring professional document layout capabilities.</p>
|
|
</body>
|
|
</html>
|
|
"""
|
|
|
|
|
|
class HTMLMultiPageRenderer:
|
|
"""HTML to multi-page renderer with enhanced multi-page demonstration."""
|
|
|
|
def __init__(self, page_size: Tuple[int, int] = (400, 500)):
|
|
self.page_size = page_size
|
|
|
|
def parse_html_to_paragraphs(self, html_content: str) -> List[SimpleParagraph]:
|
|
"""Parse HTML content into simple paragraphs."""
|
|
# Parse HTML using the extraction system
|
|
base_font = Font(font_size=12)
|
|
blocks = parse_html_string(html_content, base_font=base_font)
|
|
|
|
paragraphs = []
|
|
|
|
for block in blocks:
|
|
if isinstance(block, (Paragraph, Heading)):
|
|
# Extract text from the block
|
|
text_parts = []
|
|
|
|
# Get words from the block - handle tuple format
|
|
if hasattr(block, 'words') and callable(block.words):
|
|
for word_item in block.words():
|
|
# Handle both Word objects and tuples
|
|
if hasattr(word_item, 'text'):
|
|
text_parts.append(word_item.text)
|
|
elif isinstance(word_item, tuple) and len(word_item) >= 2:
|
|
# Tuple format: (position, word_object)
|
|
word_obj = word_item[1]
|
|
if hasattr(word_obj, 'text'):
|
|
text_parts.append(word_obj.text)
|
|
elif isinstance(word_item, str):
|
|
text_parts.append(word_item)
|
|
|
|
# Fallback: try _words attribute directly
|
|
if not text_parts and hasattr(block, '_words'):
|
|
for word_item in block._words:
|
|
if hasattr(word_item, 'text'):
|
|
text_parts.append(word_item.text)
|
|
elif isinstance(word_item, str):
|
|
text_parts.append(word_item)
|
|
|
|
if text_parts:
|
|
text_content = " ".join(text_parts)
|
|
is_heading = isinstance(block, Heading)
|
|
|
|
# Create appropriate style based on block type
|
|
if is_heading:
|
|
style = AbstractStyle(
|
|
word_spacing=4.0,
|
|
word_spacing_min=2.0,
|
|
word_spacing_max=8.0
|
|
)
|
|
else:
|
|
style = AbstractStyle(
|
|
word_spacing=3.0,
|
|
word_spacing_min=2.0,
|
|
word_spacing_max=6.0
|
|
)
|
|
|
|
paragraph = SimpleParagraph(text_content, style, is_heading)
|
|
paragraphs.append(paragraph)
|
|
|
|
return paragraphs
|
|
|
|
def render_pages(self, paragraphs: List[SimpleParagraph]) -> List[MultiPage]:
|
|
"""Render paragraphs into multiple pages."""
|
|
if not paragraphs:
|
|
return []
|
|
|
|
pages = []
|
|
current_page = MultiPage(*self.page_size)
|
|
pages.append(current_page)
|
|
|
|
for para_idx, paragraph in enumerate(paragraphs):
|
|
start_word = 0
|
|
|
|
# Add extra spacing before headings (except first paragraph)
|
|
if paragraph.is_heading and para_idx > 0 and current_page.lines_added > 0:
|
|
# Check if we have room for heading + some content
|
|
if current_page.lines_added >= current_page.max_lines - 3:
|
|
# Start heading on new page
|
|
current_page = MultiPage(*self.page_size)
|
|
pages.append(current_page)
|
|
|
|
while start_word < len(paragraph.words):
|
|
# Try to layout the paragraph (or remaining part) on current page
|
|
success, failed_word_index, remaining_pretext = paragraph_layouter(
|
|
paragraph, current_page, start_word
|
|
)
|
|
|
|
if success:
|
|
# Paragraph completed on this page
|
|
break
|
|
else:
|
|
# Page is full, create a new page
|
|
current_page = MultiPage(*self.page_size)
|
|
pages.append(current_page)
|
|
|
|
# Continue with the failed word on the new page
|
|
if failed_word_index is not None:
|
|
start_word = failed_word_index
|
|
else:
|
|
# If no specific word failed, move to next paragraph
|
|
break
|
|
|
|
return pages
|
|
|
|
def save_pages(self, pages: List[MultiPage], output_dir: str = "output/html_multipage_final"):
|
|
"""Save pages as image files with enhanced formatting."""
|
|
os.makedirs(output_dir, exist_ok=True)
|
|
|
|
for i, page in enumerate(pages, 1):
|
|
# Add page header and footer
|
|
try:
|
|
font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 10)
|
|
title_font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 11)
|
|
except:
|
|
font = ImageFont.load_default()
|
|
title_font = font
|
|
|
|
# Add document title in header
|
|
header_text = "HTML Multi-Page Layout Demo"
|
|
text_bbox = page.draw.textbbox((0, 0), header_text, font=title_font)
|
|
text_width = text_bbox[2] - text_bbox[0]
|
|
text_x = (page.page_size[0] - text_width) // 2
|
|
text_y = 8
|
|
|
|
page.draw.text((text_x, text_y), header_text, fill=(100, 100, 100), font=title_font)
|
|
|
|
# Add page number in footer
|
|
page_text = f"Page {i} of {len(pages)}"
|
|
text_bbox = page.draw.textbbox((0, 0), page_text, font=font)
|
|
text_width = text_bbox[2] - text_bbox[0]
|
|
text_x = (page.page_size[0] - text_width) // 2
|
|
text_y = page.page_size[1] - 20
|
|
|
|
page.draw.text((text_x, text_y), page_text, fill=(120, 120, 120), font=font)
|
|
|
|
# Save the page
|
|
filename = f"page_{i:03d}.png"
|
|
filepath = os.path.join(output_dir, filename)
|
|
page.image.save(filepath)
|
|
print(f"Saved {filepath}")
|
|
|
|
print(f"\nRendered {len(pages)} pages to {output_dir}/")
|
|
|
|
|
|
def main():
|
|
"""Main demo function."""
|
|
print("HTML Multi-Page Rendering Demo - Final Version")
|
|
print("=" * 55)
|
|
|
|
# Create longer HTML content for multi-page demo
|
|
print("1. Creating comprehensive HTML content...")
|
|
html_content = create_longer_html()
|
|
print(f" Created HTML document ({len(html_content)} characters)")
|
|
|
|
# Initialize renderer with smaller pages to force multi-page layout
|
|
print("\n2. Initializing renderer with smaller pages...")
|
|
renderer = HTMLMultiPageRenderer(page_size=(400, 500)) # Smaller pages
|
|
print(" Renderer initialized (400x500 pixel pages)")
|
|
|
|
# Parse HTML to paragraphs
|
|
print("\n3. Parsing HTML to paragraphs...")
|
|
paragraphs = renderer.parse_html_to_paragraphs(html_content)
|
|
print(f" Parsed {len(paragraphs)} paragraphs")
|
|
|
|
# Show paragraph preview
|
|
heading_count = sum(1 for p in paragraphs if p.is_heading)
|
|
regular_count = len(paragraphs) - heading_count
|
|
print(f" Found {heading_count} headings and {regular_count} regular paragraphs")
|
|
|
|
# Render pages
|
|
print("\n4. Rendering pages...")
|
|
pages = renderer.render_pages(paragraphs)
|
|
print(f" Rendered {len(pages)} pages")
|
|
|
|
# Show page statistics
|
|
total_lines = 0
|
|
for i, page in enumerate(pages, 1):
|
|
total_lines += page.lines_added
|
|
print(f" Page {i}: {page.lines_added} lines")
|
|
|
|
# Save pages
|
|
print("\n5. Saving pages...")
|
|
renderer.save_pages(pages)
|
|
|
|
print("\n✓ Multi-page demo completed successfully!")
|
|
print("\nTo view the results:")
|
|
print(" - Check the output/html_multipage_final/ directory")
|
|
print(" - Open the PNG files to see each rendered page")
|
|
print(" - Notice how content flows naturally across pages")
|
|
|
|
# Show final statistics
|
|
print(f"\nFinal Statistics:")
|
|
print(f" - Original HTML: {len(html_content)} characters")
|
|
print(f" - Parsed paragraphs: {len(paragraphs)} ({heading_count} headings, {regular_count} regular)")
|
|
print(f" - Rendered pages: {len(pages)}")
|
|
print(f" - Total lines: {total_lines}")
|
|
print(f" - Average lines per page: {total_lines / len(pages):.1f}")
|
|
print(f" - Page size: {renderer.page_size[0]}x{renderer.page_size[1]} pixels")
|
|
|
|
print(f"\n🎉 This demonstrates the complete HTML → Multi-Page pipeline!")
|
|
print(f" The system successfully parsed HTML and laid it out across {len(pages)} pages.")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|