pyWebLayout/examples/html_line_breaking_demo.py
Duncan Tourolle 65ab46556f
Some checks failed
Python CI / test (push) Failing after 3m55s
big update with ok rendering
2025-08-27 22:22:54 +02:00

293 lines
13 KiB
Python

#!/usr/bin/env python3
"""
HTML Line Breaking and Paragraph Breaking Demo
This example demonstrates the proper use of pyWebLayout's line breaking system:
1. Line breaking with very long sentences
2. Word wrapping with long words
3. Hyphenation of extremely long words using pyphen
4. Paragraph breaking across pages
5. Various text formatting scenarios
This showcases the robustness of the layout engine's text flow capabilities
using the actual pyWebLayout concrete classes and layout system.
"""
import os
import sys
from pathlib import Path
from typing import List, Tuple
from PIL import Image, ImageDraw, ImageFont
# Add pyWebLayout to path
sys.path.insert(0, str(Path(__file__).parent.parent))
from pyWebLayout.io.readers.html_extraction import parse_html_string
from pyWebLayout.layout.document_layouter import paragraph_layouter
from pyWebLayout.style.abstract_style import AbstractStyle
from pyWebLayout.style.concrete_style import StyleResolver, RenderingContext, ConcreteStyleRegistry
from pyWebLayout.style.page_style import PageStyle
from pyWebLayout.concrete import Page
from pyWebLayout.abstract.block import Paragraph, Heading
from pyWebLayout.abstract.inline import Word
def create_line_breaking_html() -> str:
    """Return the demo HTML document used to stress line and paragraph breaking.

    The markup contains one ``<h1>``, four ``<h2>`` sections and ten ``<p>``
    paragraphs mixing very long sentences, very long single words, a URL,
    an e-mail address and an unbroken run of letters, so that a narrow page
    is forced to wrap, hyphenate and spill onto further pages.

    Returns:
        The HTML source as a single string. The literal is kept flush-left
        on purpose: any indentation added here would become part of the
        returned text.
    """
    return """
<html>
<body>
<h1>Line Breaking and Text Flow Demonstration</h1>
<p>This paragraph contains some extraordinarily long words that will definitely require hyphenation when rendered on narrow pages: supercalifragilisticexpialidocious, antidisestablishmentarianism, pneumonoultramicroscopicsilicovolcanoconiosisology, and floccinaucinihilipilificationism.</p>
<p>Here we have an extremely long sentence that goes on and on and on without any natural breaking points, demonstrating how the layout engine handles continuous text flow across multiple lines when the content exceeds the available width of the page and must be wrapped appropriately to maintain readability while preserving the semantic meaning of the original text content.</p>
<h2>Technical Terms and Specialized Vocabulary</h2>
<p>In the field of computational linguistics and natural language processing, we often encounter terminology such as morphophonological, psychopharmacological, electroencephalographic, and immunoelectrophoresis that challenges traditional typesetting systems.</p>
<p>The implementation of sophisticated algorithms for handling such complex lexical items requires careful consideration of hyphenation patterns, word spacing constraints, and line breaking optimization to ensure that the resulting layout maintains both aesthetic appeal and functional readability across various display contexts and page dimensions.</p>
<h2>Continuous Text Flow Example</h2>
<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>
<p>Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt.</p>
<h2>Mixed Content Challenges</h2>
<p>URLs like https://www.verylongdomainnamethatshoulddemonstratehowurlsarehandledinlayoutsystems.com/with/very/long/paths/that/might/need/special/treatment and email addresses such as someone.with.a.very.long.email.address@anextraordinarilylong.domainname.extension can present unique challenges.</p>
<p>Similarly, technical identifiers like ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890 or chemical compound names such as methylenedioxymethamphetamine require special handling for proper text flow and readability.</p>
<h2>Extreme Line Breaking Test</h2>
<p>Thisisaverylongwordwithoutanyspacesorpunctuationthatwillrequireforcedhyphenationtofitonnarrowpagesanddemonstratehowtheenginehandlesextremecases.</p>
<p>Finally, we test mixed scenarios: normal words, supercalifragilisticexpialidocious, more normal text, antidisestablishmentarianism, and regular content to show how the engine transitions between different text types seamlessly.</p>
</body>
</html>
"""
class HTMLMultiPageRenderer:
    """Renderer for HTML content across multiple narrow pages.

    The renderer parses an HTML string into blocks, rebuilds each heading or
    paragraph block as a styled ``Paragraph``, and feeds those paragraphs to
    ``paragraph_layouter`` one page at a time, opening a new ``Page`` whenever
    the layouter reports that the current one is full.

    Attributes:
        page_width: Page width passed to ``Page(size=...)``.
        page_height: Page height passed to ``Page(size=...)``.
        pages: Every page created so far, in creation order.
        current_page: The page currently receiving lines (``None`` until
            ``render_html`` is called).
        context: ``RenderingContext`` built for the narrow page.
        style_resolver: ``StyleResolver`` attached to every created page.
        page_style: ``PageStyle`` shared by all pages.
    """

    # Font used for the "Page i of n" footer; PIL's built-in default font is
    # used instead when this file cannot be loaded.
    PAGE_NUMBER_FONT_PATH = "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"
    PAGE_NUMBER_FONT_SIZE = 8

    def __init__(self, page_width=300, page_height=400):
        """Set up the page geometry, rendering context and shared page style.

        Args:
            page_width: Width of every generated page.
            page_height: Height of every generated page.
        """
        self.page_width = page_width
        self.page_height = page_height
        self.pages = []
        self.current_page = None

        # Create rendering context for narrow pages
        self.context = RenderingContext(
            base_font_size=10,  # Small font for narrow pages
            available_width=page_width - 50,  # Account for borders
            available_height=page_height - 80,  # Account for borders and header
            default_language="en-US"
        )

        # Create style resolver
        self.style_resolver = StyleResolver(self.context)

        # Create page style for narrow pages
        self.page_style = PageStyle(
            border_width=2,
            border_color=(160, 160, 160),
            background_color=(255, 255, 255),
            padding=(20, 25, 20, 25)  # top, right, bottom, left
        )

    def create_new_page(self) -> "Page":
        """Create a page, prime its layout state and append it to ``self.pages``.

        Returns:
            The newly created page (it is *not* assigned to
            ``self.current_page``; the caller does that).
        """
        page = Page(
            size=(self.page_width, self.page_height),
            style=self.page_style
        )

        # Set up the page with style resolver
        page.style_resolver = self.style_resolver

        # Calculate available dimensions
        page.available_width = page.content_size[0]
        page.available_height = page.content_size[1]
        # Start placing content just below the top border and top padding
        page._current_y_offset = self.page_style.border_width + self.page_style.padding_top

        self.pages.append(page)
        return page

    @staticmethod
    def _convert_block(block):
        """Rebuild a parsed block as a styled ``Paragraph``.

        Returns:
            ``(label, paragraph)`` where ``label`` is ``"heading"`` or
            ``"paragraph"``, or ``None`` for any other block type.
        """
        # Heading is tested first, exactly as before, so a block matching
        # both checks is treated as a heading.
        if isinstance(block, Heading):
            is_major = block.level.value <= 2
            style = AbstractStyle(
                font_size=14 if is_major else 12,
                word_spacing=3.0,
                word_spacing_min=1.0,
                word_spacing_max=6.0,
                language="en-US"
            )
            line_height = 18 if is_major else 16
            # words_iter() yields pairs; only the second item (the word) is
            # used. Kept lazy so words are pulled while they are being added.
            words = (word for _, word in block.words_iter())
            label = "heading"
        elif isinstance(block, Paragraph):
            style = AbstractStyle(
                font_size=10,
                word_spacing=2.0,
                word_spacing_min=1.0,
                word_spacing_max=4.0,
                language="en-US"
            )
            line_height = 14
            # Use the words property directly
            words = block.words
            label = "paragraph"
        else:
            return None

        paragraph = Paragraph(style=style)
        paragraph.line_height = line_height
        for word in words:
            paragraph.add_word(word)
        return label, paragraph

    def _layout_paragraph(self, paragraph, number, total):
        """Flow one paragraph onto ``self.current_page``, adding pages as needed.

        Args:
            paragraph: The paragraph to lay out.
            number: 1-based position of the paragraph, used in progress output.
            total: Total number of paragraphs, used in progress output.
        """
        print(f"Laying out paragraph {number}/{total} ({len(paragraph._words)} words)")

        start_word = 0
        pretext = None

        while start_word < len(paragraph._words):
            page_was_empty = len(self.current_page.children) == 0

            # Use the proper paragraph_layouter function
            success, failed_word_index, remaining_pretext = paragraph_layouter(
                paragraph, self.current_page, start_word, pretext
            )

            if success:
                # Paragraph completed on this page
                lines_on_page = len(self.current_page.children)
                print(f" ✓ Paragraph completed on page {len(self.pages)} ({lines_on_page} lines)")
                return

            if failed_word_index is None:
                print(f" ✗ Layout failed for paragraph {number}")
                return

            # Nothing was consumed although the page had no content: retrying
            # on another empty page would repeat this result and add pages
            # without end, so give up on this paragraph.
            if (
                page_was_empty
                and failed_word_index == start_word
                and remaining_pretext == pretext
            ):
                print(
                    f" ✗ Paragraph {number} makes no progress on an empty page; "
                    f"stopping at word {failed_word_index}"
                )
                return

            # Page is full, need new page
            print(f" → Page {len(self.pages)} full, continuing from word {failed_word_index}")
            start_word = failed_word_index
            pretext = remaining_pretext
            self.current_page = self.create_new_page()

    def render_html(self, html_content: str) -> List["Page"]:
        """Parse ``html_content`` and lay it out across pages.

        Heading and paragraph blocks that contain at least one word are laid
        out; other block types are ignored. New pages are appended to
        ``self.pages``, which is not cleared first, so a second call on the
        same renderer adds to the pages produced by the first.

        Args:
            html_content: HTML source to render.

        Returns:
            ``self.pages`` (the same list object held by the renderer).
        """
        print("Parsing HTML content...")

        # Parse HTML into blocks
        blocks = parse_html_string(html_content)
        print(f"Parsed {len(blocks)} blocks from HTML")

        # Convert blocks to proper pyWebLayout objects
        paragraphs = []
        for block in blocks:
            converted = self._convert_block(block)
            if converted is None:
                continue
            label, paragraph = converted
            if paragraph._words:
                paragraphs.append(paragraph)
                print(f"Added {label}: {' '.join(w.text for w in paragraph._words[:5])}...")

        print(f"Created {len(paragraphs)} paragraphs for layout")

        # Layout paragraphs across pages using proper paragraph_layouter
        self.current_page = self.create_new_page()
        for number, paragraph in enumerate(paragraphs, 1):
            self._layout_paragraph(paragraph, number, len(paragraphs))

        print("\nLayout complete:")
        print(f" - Total pages: {len(self.pages)}")
        print(f" - Total lines: {sum(len(page.children) for page in self.pages)}")

        return self.pages

    def _load_page_number_font(self):
        """Return the footer font, or PIL's default font if it cannot be read."""
        try:
            return ImageFont.truetype(self.PAGE_NUMBER_FONT_PATH, self.PAGE_NUMBER_FONT_SIZE)
        except OSError:
            # truetype() raises OSError when the font file cannot be read
            return ImageFont.load_default()

    def save_pages(self, output_dir: str) -> None:
        """Save all pages as PNG images named ``page_001.png``, ``page_002.png``, ...

        The output directory (and any missing parents) is created even when
        there are no pages to write. Each image gets a centred
        "Page i of n" footer drawn onto it before saving.

        Args:
            output_dir: Directory that receives the PNG files.
        """
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        print(f"\nSaving {len(self.pages)} pages to {output_path}")

        if not self.pages:
            return

        # The font is the same for every page, so load it once
        font = self._load_page_number_font()
        page_count = len(self.pages)

        for i, page in enumerate(self.pages, 1):
            filename = f"page_{i:03d}.png"
            filepath = output_path / filename

            # Render the page using proper Page.render() method
            page_image = page.render()

            # Add page number at bottom
            draw = ImageDraw.Draw(page_image)
            page_text = f"Page {i} of {page_count}"
            text_bbox = draw.textbbox((0, 0), page_text, font=font)
            text_width = text_bbox[2] - text_bbox[0]
            x = (self.page_width - text_width) // 2
            y = self.page_height - 15
            draw.text((x, y), page_text, fill=(120, 120, 120), font=font)

            # Save the page
            page_image.save(filepath)
            print(f" Saved {filename} ({len(page.children)} lines)")
def main():
    """Run the line breaking demonstration end to end.

    Builds the demo HTML, lays it out on 300x400 pages, writes the pages as
    PNG files under ``output/html_line_breaking`` (relative to the current
    working directory) and prints a short summary with line statistics.
    """
    print("HTML Line Breaking and Paragraph Breaking Demo")
    print("=" * 50)

    # Create HTML content with challenging text
    html_content = create_line_breaking_html()
    print(f"Created HTML content ({len(html_content)} characters)")

    # Create renderer with narrow pages to force line breaking
    renderer = HTMLMultiPageRenderer(
        page_width=300,  # Very narrow to force line breaks
        page_height=400  # Moderate height
    )

    # Render HTML to pages
    pages = renderer.render_html(html_content)

    # Save pages
    output_dir = "output/html_line_breaking"
    renderer.save_pages(output_dir)

    print("\n✅ Demo complete!")
    print(f" Generated {len(pages)} pages demonstrating:")
    print(" - Line breaking with long sentences")
    print(" - Word hyphenation for extremely long words")
    print(" - Paragraph flow across multiple pages")
    print(" - Mixed content handling")
    print(f"\n📁 Output saved to: {output_dir}/")

    # Print summary statistics
    total_lines = sum(len(page.children) for page in pages)
    # Guard against division by zero when no page was produced
    avg_lines_per_page = total_lines / len(pages) if pages else 0
    print("\n📊 Statistics:")
    print(f" - Total lines rendered: {total_lines}")
    print(f" - Average lines per page: {avg_lines_per_page:.1f}")
    print(f" - Page dimensions: {renderer.page_width}x{renderer.page_height} pixels")


if __name__ == "__main__":
    main()