#!/usr/bin/env python3
"""
Simple HTML Multi-Page Rendering Demo

This example demonstrates a working HTML to multi-page layout system using
the proven patterns from the integration tests. It shows how to:

1. Parse HTML content using pyWebLayout's HTML extraction system
2. Lay out the parsed content across multiple pages using the document layouter
3. Save each page as an image file

This is a simplified but functional implementation.
"""
|
|
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import List, Tuple
|
|
from PIL import Image, ImageDraw, ImageFont
|
|
|
|
# Add pyWebLayout to path
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
from pyWebLayout.io.readers.html_extraction import parse_html_string
|
|
from pyWebLayout.layout.document_layouter import paragraph_layouter
|
|
from pyWebLayout.style.abstract_style import AbstractStyle
|
|
from pyWebLayout.style.concrete_style import StyleResolver, RenderingContext
|
|
from pyWebLayout.style import Font
|
|
from pyWebLayout.abstract.block import Block, Paragraph, Heading
|
|
from pyWebLayout.abstract.inline import Word
|
|
from pyWebLayout.concrete.text import Line
|
|
|
|
|
|
class SimplePage:
    """A simple page implementation for multi-page layout.

    The page owns a PIL image and its drawing context, and keeps count of the
    children added to it. ``can_fit_line`` accepts a new line only while both
    the remaining vertical space and the ``max_lines`` budget allow it.

    Args:
        width: Page width in pixels.
        height: Page height in pixels.
        max_lines: Maximum number of children before the page reports full.
    """

    # Preferred TrueType font; PIL's built-in font is the fallback.
    _FONT_PATH = "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"
    _FONT_SIZE = 14

    def __init__(self, width=600, height=800, max_lines=30):
        self.border_size = 40
        # Vertical cursor. This class only reads it.
        # NOTE(review): presumably advanced by paragraph_layouter after each
        # line it places - confirm against the layouter.
        self._current_y_offset = self.border_size
        self.available_width = width - (2 * self.border_size)
        self.available_height = height - (2 * self.border_size)
        self.max_lines = max_lines
        self.lines_added = 0
        self.children = []
        self.page_size = (width, height)
        # Loaded lazily on the first drawn line, then reused for the page.
        self._font = None

        # Create a real drawing context
        self.image = Image.new('RGB', (width, height), 'white')
        self.draw = ImageDraw.Draw(self.image)

        # Create a real style resolver (not used by this class itself;
        # presumably read by the layouter - TODO confirm).
        context = RenderingContext(base_font_size=16)
        self.style_resolver = StyleResolver(context)

        # Draw page border
        border_color = (220, 220, 220)
        self.draw.rectangle([0, 0, width-1, height-1], outline=border_color, width=2)

    def can_fit_line(self, line_height):
        """Return True if a line of ``line_height`` pixels still fits.

        A line fits when the vertical space left inside the borders is at
        least ``line_height`` and fewer than ``max_lines`` children were added.
        """
        used_height = self._current_y_offset - self.border_size
        remaining_height = self.available_height - used_height
        return remaining_height >= line_height and self.lines_added < self.max_lines

    def add_child(self, child):
        """Add a child element (like a Line) to the page.

        Every child counts towards ``lines_added``; only ``Line`` instances
        are drawn. Always returns True.
        """
        self.children.append(child)
        self.lines_added += 1

        # Draw the line content on the page
        if isinstance(child, Line):
            self._draw_line(child)

        return True

    def _get_font(self):
        """Return the drawing font, loading and caching it on first use."""
        font = getattr(self, "_font", None)
        if font is None:
            try:
                font = ImageFont.truetype(self._FONT_PATH, self._FONT_SIZE)
            except (OSError, ImportError):
                # Font file missing/unreadable or FreeType support absent.
                font = ImageFont.load_default()
            self._font = font
        return font

    def _draw_line(self, line):
        """Draw a line of text on the page at the current vertical offset.

        The text is taken from the line's ``_text_content`` attribute when it
        exists, otherwise the placeholder ``'Line content'`` is drawn.
        """
        x = self.border_size + 10
        y = self._current_y_offset
        try:
            # Get line text (simplified)
            line_text = getattr(line, '_text_content', 'Line content')
            self.draw.text((x, y), line_text, fill=(0, 0, 0), font=self._get_font())
        except Exception:
            # Best-effort fallback: a drawing failure must not abort layout,
            # so draw a simple representation with PIL's default font.
            self.draw.text((x, y), "Text line", fill=(0, 0, 0))
|
|
|
|
|
|
class SimpleWord(Word):
    """Word subclass with a default font and naive mid-word hyphenation."""

    def __init__(self, text, style=None):
        """Create the word, falling back to a 14pt ``Font`` when no style is given."""
        resolved_style = Font(font_size=14) if style is None else style
        super().__init__(text, resolved_style)

    def possible_hyphenation(self):
        """Return candidate hyphenation splits for this word.

        Words of six characters or fewer are never split. Longer words yield
        a single ``(head + "-", tail)`` pair cut at the integer midpoint.
        """
        word = self.text
        if len(word) > 6:
            # Simple hyphenation: split roughly in the middle
            split_at = len(word) // 2
            return [(word[:split_at] + "-", word[split_at:])]
        return []
|
|
|
|
|
|
class SimpleParagraph:
    """Paragraph stand-in exposing ``style``, ``line_height`` and ``words``."""

    def __init__(self, text_content, style=None):
        """Split ``text_content`` on whitespace into ``SimpleWord`` objects.

        Args:
            text_content: Plain text of the paragraph.
            style: Style object to attach; when None, an ``AbstractStyle``
                with word spacing 4.0 (min 2.0, max 8.0) is created.
        """
        self.style = (
            AbstractStyle(
                word_spacing=4.0,
                word_spacing_min=2.0,
                word_spacing_max=8.0
            )
            if style is None
            else style
        )
        self.line_height = 20

        # str.split() with no separator never yields empty or padded tokens,
        # so every token maps directly to one word.
        self.words = [SimpleWord(token) for token in text_content.split()]
|
|
|
|
|
|
def create_sample_html() -> str:
    """Create a sample HTML document for testing.

    Returns:
        An HTML string with two chapters: three headings (two ``<h1>``, one
        ``<h2>``) and eight ``<p>`` paragraphs inside ``<html><body>``.
    """
    return """
    <html>
    <body>
        <h1>Chapter 1: Introduction</h1>

        <p>This is the first paragraph of our sample document. It demonstrates how HTML content
        can be parsed and then laid out across multiple pages using the pyWebLayout system.</p>

        <p>Here's another paragraph with some more text to show how the system handles
        multiple paragraphs and automatic page breaking when content exceeds page boundaries.</p>

        <h2>Section 1.1: Features</h2>

        <p>The multi-page layout system includes several key features that make it suitable
        for ereader applications and document processing systems.</p>

        <p>Each paragraph is processed individually and can span multiple lines or even
        multiple pages if the content is long enough to require it.</p>

        <h1>Chapter 2: Implementation</h1>

        <p>The implementation uses a sophisticated layout engine that processes abstract
        document elements and renders them onto concrete pages.</p>

        <p>This separation allows for flexible styling and layout while maintaining
        the semantic structure of the original content.</p>

        <p>The system can handle various HTML elements including headings, paragraphs,
        lists, and other block-level elements commonly found in documents.</p>

        <p>Position tracking is maintained throughout the layout process, enabling
        features like bookmarking and navigation between different views of the content.</p>
    </body>
    </html>
    """
|
|
|
|
|
|
class HTMLMultiPageRenderer:
    """Simple HTML to multi-page renderer.

    Pipeline: ``parse_html_to_paragraphs`` turns HTML into ``SimpleParagraph``
    objects, ``render_pages`` flows them over ``SimplePage`` instances, and
    ``save_pages`` writes each page image to disk.

    Args:
        page_size: ``(width, height)`` in pixels used for every page.
    """

    def __init__(self, page_size: Tuple[int, int] = (600, 800)):
        self.page_size = page_size

    @staticmethod
    def _iter_word_texts(word_items):
        """Yield the text of each item: a ``.text`` holder, a tuple, or a str."""
        for item in word_items:
            if hasattr(item, 'text'):
                yield item.text
            elif isinstance(item, tuple) and len(item) >= 2:
                # Tuple format: (position, word_object)
                word_obj = item[1]
                if hasattr(word_obj, 'text'):
                    yield word_obj.text
            elif isinstance(item, str):
                yield item

    def _extract_text_parts(self, block):
        """Collect the word texts of a block, trying ``words()`` then ``_words``."""
        text_parts = []
        if hasattr(block, 'words') and callable(block.words):
            text_parts.extend(self._iter_word_texts(block.words()))

        # Fallback: try _words attribute directly
        if not text_parts and hasattr(block, '_words'):
            text_parts.extend(self._iter_word_texts(block._words))
        return text_parts

    @staticmethod
    def _style_for_block(block):
        """Return the word-spacing style: wider for headings than for body text."""
        if isinstance(block, Heading):
            return AbstractStyle(
                word_spacing=5.0,
                word_spacing_min=3.0,
                word_spacing_max=10.0
            )
        return AbstractStyle(
            word_spacing=4.0,
            word_spacing_min=2.0,
            word_spacing_max=8.0
        )

    def parse_html_to_paragraphs(self, html_content: str) -> List[SimpleParagraph]:
        """Parse HTML content into simple paragraphs.

        Only ``Paragraph`` and ``Heading`` blocks are converted; other block
        types, and blocks that yield no word text, are skipped.

        Args:
            html_content: HTML markup to parse.

        Returns:
            One ``SimpleParagraph`` per converted block, in document order.
        """
        # Parse HTML using the extraction system
        base_font = Font(font_size=14)
        blocks = parse_html_string(html_content, base_font=base_font)

        paragraphs = []
        for block in blocks:
            if not isinstance(block, (Paragraph, Heading)):
                continue
            text_parts = self._extract_text_parts(block)
            if not text_parts:
                continue
            text_content = " ".join(text_parts)
            paragraphs.append(
                SimpleParagraph(text_content, self._style_for_block(block))
            )
        return paragraphs

    def render_pages(self, paragraphs: List[SimpleParagraph]) -> List[SimplePage]:
        """Render paragraphs into multiple pages.

        Each paragraph is laid out on the current page; when the layouter
        reports failure, a new page is started and layout resumes from the
        reported word index.

        Args:
            paragraphs: Paragraphs to lay out, in order.

        Returns:
            The pages created, or an empty list when ``paragraphs`` is empty.
        """
        if not paragraphs:
            return []

        current_page = SimplePage(*self.page_size)
        pages = [current_page]

        for paragraph in paragraphs:
            start_word = 0
            # True once this paragraph has been moved onto a brand-new page.
            on_fresh_page = False

            while start_word < len(paragraph.words):
                lines_before = current_page.lines_added

                # Try to layout the paragraph (or remaining part) on current page.
                # NOTE(review): the third return value (remaining pretext) is
                # discarded, as in the original - confirm whether it should be
                # handed back to the layouter on the next page.
                success, failed_word_index, _remaining_pretext = paragraph_layouter(
                    paragraph, current_page, start_word
                )
                if success:
                    # Paragraph completed on this page
                    break

                # A brand-new page that accepts nothing will not behave
                # differently next time; skip the rest of this paragraph
                # instead of appending pages forever.
                if on_fresh_page and current_page.lines_added == lines_before:
                    break

                # Page is full, create a new page
                current_page = SimplePage(*self.page_size)
                pages.append(current_page)
                on_fresh_page = True

                if failed_word_index is None:
                    # If no specific word failed, move to next paragraph
                    break
                # Continue with the failed word on the new page
                start_word = failed_word_index

        return pages

    def save_pages(self, pages: List[SimplePage], output_dir: str = "output/html_simple"):
        """Save pages as image files.

        Draws a centered ``Page N`` footer on each page and writes it as
        ``page_NNN.png`` in ``output_dir``, which is created if missing.

        Args:
            pages: Pages to save; numbering starts at 1.
            output_dir: Destination directory.
        """
        os.makedirs(output_dir, exist_ok=True)

        # The footer font is the same for every page, so load it once.
        try:
            font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 12)
        except (OSError, ImportError):
            # Font file missing/unreadable or FreeType support absent.
            font = ImageFont.load_default()

        for i, page in enumerate(pages, 1):
            # Add page number, horizontally centered near the bottom edge
            page_text = f"Page {i}"
            text_bbox = page.draw.textbbox((0, 0), page_text, font=font)
            text_width = text_bbox[2] - text_bbox[0]
            text_x = (page.page_size[0] - text_width) // 2
            text_y = page.page_size[1] - 25

            page.draw.text((text_x, text_y), page_text, fill=(100, 100, 100), font=font)

            # Save the page
            filename = f"page_{i:03d}.png"
            filepath = os.path.join(output_dir, filename)
            page.image.save(filepath)
            print(f"Saved {filepath}")

        print(f"\nRendered {len(pages)} pages to {output_dir}/")
|
|
|
|
|
|
def main():
    """Run the demo: build sample HTML, parse, paginate, save and report."""
    preview_paragraphs = 3
    preview_words = 8

    print("Simple HTML Multi-Page Rendering Demo")
    print("=" * 45)

    # Step 1: sample HTML
    print("1. Creating sample HTML content...")
    html_content = create_sample_html()
    print(f" Created HTML document ({len(html_content)} characters)")

    # Step 2: renderer
    print("\n2. Initializing renderer...")
    renderer = HTMLMultiPageRenderer(page_size=(600, 800))
    print(" Renderer initialized")

    # Step 3: HTML -> paragraphs
    print("\n3. Parsing HTML to paragraphs...")
    paragraphs = renderer.parse_html_to_paragraphs(html_content)
    print(f" Parsed {len(paragraphs)} paragraphs")

    # Preview the opening words of the first few paragraphs
    for number, para in enumerate(paragraphs[:preview_paragraphs], start=1):
        preview = " ".join(word.text for word in para.words[:preview_words])
        if len(para.words) > preview_words:
            preview += "..."
        print(f" Paragraph {number}: {preview}")

    hidden_count = len(paragraphs) - preview_paragraphs
    if hidden_count > 0:
        print(f" ... and {hidden_count} more paragraphs")

    # Step 4: paragraphs -> pages
    print("\n4. Rendering pages...")
    pages = renderer.render_pages(paragraphs)
    print(f" Rendered {len(pages)} pages")

    # Per-page line counts
    for page_number, page in enumerate(pages, 1):
        print(f" Page {page_number}: {page.lines_added} lines")

    # Step 5: pages -> PNG files
    print("\n5. Saving pages...")
    renderer.save_pages(pages)

    for message in (
        "\n✓ Demo completed successfully!",
        "\nTo view the results:",
        " - Check the output/html_simple/ directory",
        " - Open the PNG files to see each rendered page",
    ):
        print(message)

    # Summary statistics
    total_lines = sum(page.lines_added for page in pages)
    page_width, page_height = renderer.page_size[0], renderer.page_size[1]
    print("\nStatistics:")
    print(f" - Original HTML: {len(html_content)} characters")
    print(f" - Parsed paragraphs: {len(paragraphs)}")
    print(f" - Rendered pages: {len(pages)}")
    print(f" - Total lines: {total_lines}")
    print(f" - Page size: {page_width}x{page_height} pixels")
|
|
|
|
|
|
# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|