#!/usr/bin/env python3
"""
HTML Multi-Page Rendering Demo

This example demonstrates how to:
1. Parse HTML content using pyWebLayout's HTML extraction system
2. Lay out the parsed content across multiple pages using the ereader layout system
3. Render each page as an image file

The demo shows the complete pipeline from HTML to multi-page layout.
"""

import os
import sys
from collections import Counter
from pathlib import Path
from typing import List, Optional, Tuple

from PIL import Image, ImageDraw

# Add pyWebLayout to path
sys.path.insert(0, str(Path(__file__).parent.parent))

from pyWebLayout.io.readers.html_extraction import parse_html_string
from pyWebLayout.layout.ereader_layout import BidirectionalLayouter, RenderingPosition
from pyWebLayout.concrete.page import Page
from pyWebLayout.style.page_style import PageStyle
from pyWebLayout.style import Font
from pyWebLayout.abstract.block import Block


def create_sample_html() -> str:
    """
    Build the HTML document used as input for the demo.

    The markup is a fixed literal containing two chapters (``<h1>``), four
    sections (``<h2>``), paragraphs with inline ``<strong>``/``<em>``
    formatting, an unordered list and a blockquote.

    Returns:
        The complete HTML document as a string.
    """
    sample_document = """
    <!DOCTYPE html>
    <html>
    <head>
        <title>Sample Document</title>
    </head>
    <body>
        <h1>Chapter 1: Introduction to Multi-Page Layout</h1>

        <p>This is the first paragraph of our sample document. It demonstrates how HTML content
        can be parsed and then laid out across multiple pages using the pyWebLayout system.
        The system handles various HTML elements including headings, paragraphs, lists, and more.</p>

        <p>Here's another paragraph with <strong>bold text</strong> and <em>italic text</em>
        to show how inline formatting is preserved during the conversion process. The layout
        engine will automatically handle word wrapping and page breaks as needed.</p>

        <h2>Section 1.1: Features</h2>

        <p>The multi-page layout system includes several key features:</p>

        <ul>
            <li>Automatic page breaking when content exceeds page boundaries</li>
            <li>Font scaling support for different reading preferences</li>
            <li>Position tracking for bookmarks and navigation</li>
            <li>Support for various HTML elements and styling</li>
        </ul>

        <p>Each of these features works together to provide a seamless reading experience
        that adapts to different page sizes and user preferences.</p>

        <h2>Section 1.2: Technical Implementation</h2>

        <p>The implementation uses a sophisticated layout engine that processes abstract
        document elements and renders them onto concrete pages. This separation allows
        for flexible styling and layout while maintaining the semantic structure of
        the original content.</p>

        <blockquote>
            "The best way to understand a complex system is to see it in action with
            real examples and practical demonstrations."
        </blockquote>

        <p>This quote illustrates the philosophy behind this demo - showing how the
        various components work together in practice.</p>

        <h1>Chapter 2: Advanced Layout Concepts</h1>

        <p>Moving into more advanced territory, we can explore how the layout system
        handles complex scenarios such as page breaks within paragraphs, font scaling
        effects on layout, and position tracking across multiple pages.</p>

        <p>The system maintains precise position information that allows for features
        like bookmarking, search result highlighting, and seamless navigation between
        different views of the same content.</p>

        <h2>Section 2.1: Position Tracking</h2>

        <p>Position tracking is implemented using a hierarchical system that can
        reference any point in the document structure. This includes not just
        paragraph and word positions, but also positions within tables, lists,
        and other complex structures.</p>

        <p>The position system is designed to be stable across different rendering
        parameters, so a bookmark created with one font size will still be valid
        when the user changes to a different font size.</p>

        <h2>Section 2.2: Multi-Page Rendering</h2>

        <p>The multi-page rendering system can generate pages both forward and
        backward from any given position. This bidirectional capability is
        essential for smooth navigation in ereader applications.</p>

        <p>Each page is rendered independently, which allows for efficient
        caching and parallel processing of multiple pages when needed.</p>

        <p>This concludes our sample document. The layout system will automatically
        determine how many pages are needed to display all this content based on
        the page size and font settings used during rendering.</p>
    </body>
    </html>
    """
    return sample_document


class HTMLMultiPageRenderer:
    """
    Renderer that converts HTML to multiple page images.

    Typical use: ``parse_html_to_blocks`` turns an HTML string into abstract
    blocks, ``render_pages`` lays those blocks out page by page and produces
    one PIL image per page, and ``save_pages`` writes the images to disk.
    """

    def __init__(self, page_size: Tuple[int, int] = (600, 800), font_scale: float = 1.0):
        """
        Initialize the renderer.

        Args:
            page_size: Size of each page in pixels (width, height)
            font_scale: Font scaling factor passed to the layouter for every page
        """
        self.page_size = page_size
        self.font_scale = font_scale
        self.page_style = PageStyle()

    def parse_html_to_blocks(self, html_content: str) -> List[Block]:
        """
        Parse HTML content into abstract blocks.

        Args:
            html_content: HTML string to parse

        Returns:
            List of abstract Block objects
        """
        base_font = Font(font_size=14)  # Base font for the document
        return parse_html_string(html_content, base_font=base_font)

    def render_pages(self, blocks: List[Block], max_pages: int = 20) -> List[Image.Image]:
        """
        Render blocks into multiple page images.

        Pages are laid out forward from the start of the document until the
        end is detected, ``max_pages`` is reached, or an error occurs. On an
        error the message is printed and the pages rendered so far are
        returned.

        Args:
            blocks: List of abstract blocks to render
            max_pages: Maximum number of pages to render (safety limit)

        Returns:
            List of PIL Image objects, one per page
        """
        if not blocks:
            return []

        layouter = BidirectionalLayouter(blocks, self.page_style, self.page_size)

        pages: List[Image.Image] = []
        current_position = RenderingPosition()  # Start at beginning

        for page_number in range(1, max_pages + 1):
            try:
                page, next_position = layouter.render_page_forward(
                    current_position, self.font_scale
                )

                # Pass the 1-based page number so the footer label is numbered.
                pages.append(self._page_to_image(page, page_number))

                if self._is_end_position(next_position, current_position, blocks):
                    break

                current_position = next_position

            except Exception as e:
                # Demo boundary: report the failure and keep what was rendered.
                print(f"Error rendering page {page_number}: {e}")
                break

        return pages

    def _page_to_image(self, page: Page, page_number: Optional[int] = None) -> Image.Image:
        """
        Convert a Page object to a PIL Image.

        Draws a white canvas of ``self.page_size`` with a light grey border
        and a centred footer label.

        Args:
            page: Page object to convert
            page_number: 1-based page number for the footer label. When
                omitted, the label is just "Page".

        Returns:
            PIL Image object
        """
        # NOTE(review): the laid-out content of `page` is not drawn here; this
        # demo only produces a placeholder frame and footer. Confirm whether
        # the Page API exposes a rendered image that should be used instead.
        width, height = self.page_size
        image = Image.new('RGB', self.page_size, 'white')
        draw = ImageDraw.Draw(image)

        # Draw page border
        border_color = (200, 200, 200)
        draw.rectangle([0, 0, width - 1, height - 1], outline=border_color)

        # Best effort: fall back to Pillow's implicit default font on failure.
        try:
            from PIL import ImageFont
            font = ImageFont.load_default()
        except (ImportError, OSError):
            font = None

        # Add page number at bottom, horizontally centred
        page_num_text = "Page" if page_number is None else f"Page {page_number}"
        text_bbox = draw.textbbox((0, 0), page_num_text, font=font)
        text_width = text_bbox[2] - text_bbox[0]
        text_x = (width - text_width) // 2
        text_y = height - 30

        draw.text((text_x, text_y), page_num_text, fill='black', font=font)

        return image

    def _is_end_position(self, current_pos: RenderingPosition, previous_pos: RenderingPosition, blocks: List[Block]) -> bool:
        """
        Check if we've reached the end of the document.

        Args:
            current_pos: Current rendering position
            previous_pos: Previous rendering position
            blocks: List of all blocks in document

        Returns:
            True if the position did not advance (same block and word index
            as before) or the block index is past the last block
        """
        # A position that has not advanced means no further progress is possible.
        stalled = (
            current_pos.block_index == previous_pos.block_index
            and current_pos.word_index == previous_pos.word_index
        )
        return stalled or current_pos.block_index >= len(blocks)

    def save_pages(self, pages: List[Image.Image], output_dir: str = "output/html_multipage") -> None:
        """
        Save rendered pages as image files.

        Files are named ``page_001.png``, ``page_002.png``, ... in
        ``output_dir``, which is created if it does not exist.

        Args:
            pages: List of page images
            output_dir: Directory to save images
        """
        os.makedirs(output_dir, exist_ok=True)

        for i, page_image in enumerate(pages, 1):
            filepath = os.path.join(output_dir, f"page_{i:03d}.png")
            page_image.save(filepath)
            print(f"Saved {filepath}")

        print(f"\nRendered {len(pages)} pages to {output_dir}/")


def main():
    """
    Main demo function.

    Runs the whole pipeline on the built-in sample document: create the
    HTML, parse it into blocks, render up to 10 pages, save them as PNG
    files and print a summary. The success banner is printed only when at
    least one page was rendered.
    """
    print("HTML Multi-Page Rendering Demo")
    print("=" * 40)

    # Create sample HTML content
    print("1. Creating sample HTML content...")
    html_content = create_sample_html()
    print(f" Created HTML document ({len(html_content)} characters)")

    # Initialize renderer
    print("\n2. Initializing renderer...")
    renderer = HTMLMultiPageRenderer(page_size=(600, 800), font_scale=1.0)
    print(" Renderer initialized")

    # Parse HTML to blocks
    print("\n3. Parsing HTML to abstract blocks...")
    blocks = renderer.parse_html_to_blocks(html_content)
    print(f" Parsed {len(blocks)} blocks")

    # Print block summary (Counter keeps first-seen order of the type names)
    block_types = Counter(type(block).__name__ for block in blocks)

    print(" Block types found:")
    for block_type, count in block_types.items():
        print(f" - {block_type}: {count}")

    # Render pages
    print("\n4. Rendering pages...")
    pages = renderer.render_pages(blocks, max_pages=10)
    print(f" Rendered {len(pages)} pages")

    # Save pages
    print("\n5. Saving pages...")
    renderer.save_pages(pages)

    # render_pages reports errors itself and may return nothing, so only
    # claim success when there is output to look at.
    if pages:
        print("\n✓ Demo completed successfully!")
        print("\nTo view the results:")
        print(" - Check the output/html_multipage/ directory")
        print(" - Open the PNG files to see each rendered page")
    else:
        print("\n✗ Demo finished, but no pages were rendered.")

    # Show some statistics
    print("\nStatistics:")
    print(f" - Original HTML: {len(html_content)} characters")
    print(f" - Abstract blocks: {len(blocks)}")
    print(f" - Rendered pages: {len(pages)}")
    print(f" - Page size: {renderer.page_size[0]}x{renderer.page_size[1]} pixels")
    print(f" - Font scale: {renderer.font_scale}x")


# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()