293 lines
13 KiB
Python
293 lines
13 KiB
Python
#!/usr/bin/env python3
|
|
"""
HTML Line Breaking and Paragraph Breaking Demo

This example demonstrates the proper use of pyWebLayout's line breaking system:
1. Line breaking with very long sentences
2. Word wrapping with long words
3. Hyphenation of extremely long words using pyphen
4. Paragraph breaking across pages
5. Various text formatting scenarios

This showcases the robustness of the layout engine's text flow capabilities
using the actual pyWebLayout concrete classes and layout system.
"""
|
|
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import List, Tuple
|
|
from PIL import Image, ImageDraw, ImageFont
|
|
|
|
# Add pyWebLayout to path
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
from pyWebLayout.io.readers.html_extraction import parse_html_string
|
|
from pyWebLayout.layout.document_layouter import paragraph_layouter
|
|
from pyWebLayout.style.abstract_style import AbstractStyle
|
|
from pyWebLayout.style.concrete_style import StyleResolver, RenderingContext, ConcreteStyleRegistry
|
|
from pyWebLayout.style.page_style import PageStyle
|
|
from pyWebLayout.concrete import Page
|
|
from pyWebLayout.abstract.block import Paragraph, Heading
|
|
from pyWebLayout.abstract.inline import Word
|
|
|
|
|
|
def create_line_breaking_html() -> str:
    """Return the HTML document used to stress line and paragraph breaking.

    The markup holds one <h1>, four <h2> sections and ten <p> paragraphs
    containing very long words, long unbroken sentences, URLs, e-mail
    addresses and identifiers without spaces.
    """
    demo_markup = """
<html>
<body>
<h1>Line Breaking and Text Flow Demonstration</h1>

<p>This paragraph contains some extraordinarily long words that will definitely require hyphenation when rendered on narrow pages: supercalifragilisticexpialidocious, antidisestablishmentarianism, pneumonoultramicroscopicsilicovolcanoconiosisology, and floccinaucinihilipilificationism.</p>

<p>Here we have an extremely long sentence that goes on and on and on without any natural breaking points, demonstrating how the layout engine handles continuous text flow across multiple lines when the content exceeds the available width of the page and must be wrapped appropriately to maintain readability while preserving the semantic meaning of the original text content.</p>

<h2>Technical Terms and Specialized Vocabulary</h2>

<p>In the field of computational linguistics and natural language processing, we often encounter terminology such as morphophonological, psychopharmacological, electroencephalographic, and immunoelectrophoresis that challenges traditional typesetting systems.</p>

<p>The implementation of sophisticated algorithms for handling such complex lexical items requires careful consideration of hyphenation patterns, word spacing constraints, and line breaking optimization to ensure that the resulting layout maintains both aesthetic appeal and functional readability across various display contexts and page dimensions.</p>

<h2>Continuous Text Flow Example</h2>

<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>

<p>Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt.</p>

<h2>Mixed Content Challenges</h2>

<p>URLs like https://www.verylongdomainnamethatshoulddemonstratehowurlsarehandledinlayoutsystems.com/with/very/long/paths/that/might/need/special/treatment and email addresses such as someone.with.a.very.long.email.address@anextraordinarilylong.domainname.extension can present unique challenges.</p>

<p>Similarly, technical identifiers like ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890 or chemical compound names such as methylenedioxymethamphetamine require special handling for proper text flow and readability.</p>

<h2>Extreme Line Breaking Test</h2>

<p>Thisisaverylongwordwithoutanyspacesorpunctuationthatwillrequireforcedhyphenationtofitonnarrowpagesanddemonstratehowtheenginehandlesextremecases.</p>

<p>Finally, we test mixed scenarios: normal words, supercalifragilisticexpialidocious, more normal text, antidisestablishmentarianism, and regular content to show how the engine transitions between different text types seamlessly.</p>

</body>
</html>
"""
    return demo_markup
|
|
|
|
|
|
class HTMLMultiPageRenderer:
    """Renderer for HTML content across multiple narrow pages using pyWebLayout classes.

    The renderer parses an HTML string into blocks, converts headings and
    paragraphs into freshly styled ``Paragraph`` objects, flows them over as
    many ``Page`` objects as needed via ``paragraph_layouter`` and can write
    every page to disk as a PNG with a "Page X of Y" footer.

    Attributes:
        page_width: Page width in pixels.
        page_height: Page height in pixels.
        pages: Every page created so far, in creation order. The list is never
            cleared, so repeated ``render_html`` calls keep appending to it.
        current_page: The page currently receiving lines (``None`` until the
            first ``render_html`` call).
        context: ``RenderingContext`` built for the narrow page.
        style_resolver: ``StyleResolver`` bound to ``context``.
        page_style: ``PageStyle`` shared by all pages.
    """

    def __init__(self, page_width=300, page_height=400):
        """Store the page size and build the shared rendering objects.

        Args:
            page_width: Page width in pixels.
            page_height: Page height in pixels.
        """
        self.page_width = page_width
        self.page_height = page_height
        self.pages = []
        self.current_page = None

        # Create rendering context for narrow pages
        self.context = RenderingContext(
            base_font_size=10,  # Small font for narrow pages
            available_width=page_width - 50,  # Account for borders
            available_height=page_height - 80,  # Account for borders and header
            default_language="en-US"
        )

        # Create style resolver
        self.style_resolver = StyleResolver(self.context)

        # Create page style for narrow pages
        self.page_style = PageStyle(
            border_width=2,
            border_color=(160, 160, 160),
            background_color=(255, 255, 255),
            padding=(20, 25, 20, 25)  # top, right, bottom, left
        )

    def create_new_page(self) -> Page:
        """Create a page, prime its layout attributes and append it to ``self.pages``.

        Returns:
            The newly created page.
        """
        page = Page(
            size=(self.page_width, self.page_height),
            style=self.page_style
        )

        # Set up the page with style resolver
        page.style_resolver = self.style_resolver

        # Expose the content box dimensions and start the vertical cursor
        # just below the top border and top padding.
        page.available_width = page.content_size[0]
        page.available_height = page.content_size[1]
        page._current_y_offset = self.page_style.border_width + self.page_style.padding_top

        self.pages.append(page)
        return page

    @staticmethod
    def _build_paragraph(style, line_height, words, label):
        """Build a styled ``Paragraph`` from *words*; return ``None`` if it ends up empty."""
        paragraph = Paragraph(style=style)
        paragraph.line_height = line_height
        for word in words:
            paragraph.add_word(word)

        if not paragraph._words:
            return None

        preview = ' '.join(w.text for w in paragraph._words[:5])
        print(f"Added {label}: {preview}...")
        return paragraph

    def _blocks_to_paragraphs(self, blocks) -> list:
        """Convert parsed heading/paragraph blocks into layout-ready paragraphs."""
        paragraphs = []
        for block in blocks:
            # Heading is tested first, exactly as in the original branch order.
            if isinstance(block, Heading):
                is_major = block.level.value <= 2
                # Headings get a larger font and wider word spacing.
                heading_style = AbstractStyle(
                    font_size=14 if is_major else 12,
                    word_spacing=3.0,
                    word_spacing_min=1.0,
                    word_spacing_max=6.0,
                    language="en-US"
                )
                paragraph = self._build_paragraph(
                    heading_style,
                    18 if is_major else 16,
                    # words_iter() yields (index, word) pairs; only the word is needed.
                    (word for _, word in block.words_iter()),
                    "heading",
                )
            elif isinstance(block, Paragraph):
                para_style = AbstractStyle(
                    font_size=10,
                    word_spacing=2.0,
                    word_spacing_min=1.0,
                    word_spacing_max=4.0,
                    language="en-US"
                )
                # Use the words property (list) directly.
                paragraph = self._build_paragraph(para_style, 14, block.words, "paragraph")
            else:
                # Other block types are not rendered by this demo.
                continue

            if paragraph is not None:
                paragraphs.append(paragraph)
        return paragraphs

    def _layout_paragraph(self, paragraph, number, total):
        """Flow one paragraph onto the current page, opening new pages as they fill up."""
        print(f"Laying out paragraph {number}/{total} ({len(paragraph._words)} words)")

        start_word = 0
        pretext = None

        while start_word < len(paragraph._words):
            lines_before = len(self.current_page.children)

            # Use the proper paragraph_layouter function
            success, failed_word_index, remaining_pretext = paragraph_layouter(
                paragraph, self.current_page, start_word, pretext
            )
            lines_on_page = len(self.current_page.children)

            if success:
                # Paragraph completed on this page
                print(f" ✓ Paragraph completed on page {len(self.pages)} ({lines_on_page} lines)")
                return

            if failed_word_index is None:
                print(f" ✗ Layout failed for paragraph {number}")
                return

            # If the page was empty before the call and the layouter hands back
            # the very same resume state, a fresh page would fail identically;
            # stop instead of creating pages forever.
            if (
                lines_before == 0
                and failed_word_index == start_word
                and remaining_pretext == pretext
            ):
                print(f" ✗ No progress on an empty page for paragraph {number}; skipping remainder")
                return

            # Page is full, need new page
            print(f" → Page {len(self.pages)} full, continuing from word {failed_word_index}")
            start_word = failed_word_index
            pretext = remaining_pretext
            self.current_page = self.create_new_page()

    def render_html(self, html_content: str) -> List[Page]:
        """Parse *html_content* and lay it out across pages.

        Args:
            html_content: HTML markup to parse with ``parse_html_string``.

        Returns:
            ``self.pages``, which includes pages from earlier calls as well
            because the list is never reset.
        """
        print("Parsing HTML content...")

        # Parse HTML into blocks
        blocks = parse_html_string(html_content)
        print(f"Parsed {len(blocks)} blocks from HTML")

        # Convert blocks to proper pyWebLayout objects
        paragraphs = self._blocks_to_paragraphs(blocks)
        print(f"Created {len(paragraphs)} paragraphs for layout")

        # Layout paragraphs across pages using proper paragraph_layouter
        self.current_page = self.create_new_page()
        for number, paragraph in enumerate(paragraphs, 1):
            self._layout_paragraph(paragraph, number, len(paragraphs))

        print(f"\nLayout complete:")
        print(f" - Total pages: {len(self.pages)}")
        print(f" - Total lines: {sum(len(page.children) for page in self.pages)}")

        return self.pages

    @staticmethod
    def _load_page_number_font():
        """Return the DejaVu Sans footer font, or Pillow's default font if unavailable."""
        try:
            return ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 8)
        except (OSError, ImportError):
            # OSError: font file missing/unreadable. ImportError: Pillow built
            # without FreeType. Anything else (e.g. KeyboardInterrupt) propagates.
            return ImageFont.load_default()

    def save_pages(self, output_dir: str):
        """Save all pages as PNG images named ``page_001.png``, ``page_002.png``, ...

        Args:
            output_dir: Target directory; created (with parents) if missing.
        """
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        print(f"\nSaving {len(self.pages)} pages to {output_path}")

        # The footer font is the same for every page, so load it once.
        font = self._load_page_number_font()

        for i, page in enumerate(self.pages, 1):
            filename = f"page_{i:03d}.png"
            filepath = output_path / filename

            # Render the page using proper Page.render() method
            page_image = page.render()

            # Add page number at bottom, horizontally centred
            draw = ImageDraw.Draw(page_image)
            page_text = f"Page {i} of {len(self.pages)}"
            text_bbox = draw.textbbox((0, 0), page_text, font=font)
            text_width = text_bbox[2] - text_bbox[0]

            x = (self.page_width - text_width) // 2
            y = self.page_height - 15
            draw.text((x, y), page_text, fill=(120, 120, 120), font=font)

            # Save the page
            page_image.save(filepath)
            print(f" Saved {filename} ({len(page.children)} lines)")
|
|
|
|
|
|
def main():
    """Run the demo end to end: build the HTML, lay it out, save PNGs, print a summary."""
    print("HTML Line Breaking and Paragraph Breaking Demo")
    print("=" * 50)

    # Build the challenging HTML sample.
    html_content = create_line_breaking_html()
    print(f"Created HTML content ({len(html_content)} characters)")

    # Narrow pages force frequent line breaks; the height is moderate.
    renderer = HTMLMultiPageRenderer(page_width=300, page_height=400)

    # Lay the document out and write every page to disk.
    pages = renderer.render_html(html_content)
    output_dir = "output/html_line_breaking"
    renderer.save_pages(output_dir)

    page_count = len(pages)
    print(f"\n✅ Demo complete!")
    print(f" Generated {page_count} pages demonstrating:")
    demonstrated = (
        "Line breaking with long sentences",
        "Word hyphenation for extremely long words",
        "Paragraph flow across multiple pages",
        "Mixed content handling",
    )
    for feature in demonstrated:
        print(f" - {feature}")
    print(f"\n📁 Output saved to: {output_dir}/")

    # Summary statistics; guard the average against an empty page list.
    total_lines = sum(len(page.children) for page in pages)
    if pages:
        avg_lines_per_page = total_lines / page_count
    else:
        avg_lines_per_page = 0

    print(f"\n📊 Statistics:")
    print(f" - Total lines rendered: {total_lines}")
    print(f" - Average lines per page: {avg_lines_per_page:.1f}")
    print(f" - Page dimensions: {renderer.page_width}x{renderer.page_height} pixels")
|
|
|
|
|
|
# Run the demo only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
|