first code commit

This commit is contained in:
Duncan Tourolle 2025-05-27 11:58:19 +02:00
commit f7ad69f9ec
55 changed files with 10682 additions and 0 deletions

33
.gitignore vendored Normal file
View File

@ -0,0 +1,33 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
*/__pycache__
# Distribution / packaging
dist/
build/
*.egg-info/
# Environment
venv/
env/
.env/
.venv/
# Tests
.pytest_cache/
.coverage
htmlcov/
# IDE files
.idea/
.vscode/
*.swp
*.swo
# Project specific
*.png
*.jpg
*.jpeg
*.gif
*.svg

21
LICENSE Normal file
View File

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2025 Duncan Tourolle
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

10
MANIFEST.in Normal file
View File

@ -0,0 +1,10 @@
include README.md
include LICENSE
include pyWebLayout/*.py
recursive-include pyWebLayout/abstract *.py
recursive-include pyWebLayout/concrete *.py
recursive-include pyWebLayout/style *.py
recursive-include pyWebLayout/core *.py
recursive-include pyWebLayout/typesetting *.py
recursive-include pyWebLayout/io *.py
recursive-include pyWebLayout/examples *.py

93
README.md Normal file
View File

@ -0,0 +1,93 @@
# PyWebLayout
A Python library for HTML-like layout and rendering.
## Description
PyWebLayout provides classes for rendering HTML-like content to images using a box-based layout system. It includes support for text, tables, and containers, as well as an HTML parser for converting HTML to layout objects.
## Features
- HTML-like layout system
- Text rendering with font support
- Table layouts
- Container elements
- HTML parsing
- Image output
## Installation
```bash
pip install pyWebLayout
```
## Usage
### Basic Example
```python
from pyWebLayout.concrete.page import Page, Container
from pyWebLayout.concrete.text import Line
from pyWebLayout.style import Alignment
from PIL import ImageFont
# Create a page
page = Page(size=(800, 600), background_color=(240, 240, 240))
# Add a title container
title_container = Container(
origin=(0, 0),
size=(780, 60),
direction='horizontal',
spacing=10,
padding=(10, 10, 10, 10),
halign=Alignment.CENTER,
valign=Alignment.CENTER
)
page.add_child(title_container)
# Create a title line with text
title_font = ImageFont.load_default()
title_line = Line(
spacing=(8, 15),
origin=(0, 0),
size=(760, 40),
font=title_font,
text_color=(0, 0, 0),
halign=Alignment.CENTER
)
title_container.add_child(title_line)
title_line.add_word("PyWebLayout", title_font)
title_line.add_word("Example", title_font)
# Layout and render the page
page.layout()
image = page.render()
image.save("example.png")
```
### HTML Example
```python
from pyWebLayout.html_parser import html_to_image
html = """
<div style="text-align: center; padding: 10px;">
<h1>PyWebLayout HTML Example</h1>
<p>This is a paragraph rendered from HTML.</p>
<p>The library supports <b>bold</b>, <i>italic</i>, and <u>underlined</u> text.</p>
</div>
"""
# Render HTML to an image
image = html_to_image(html, page_size=(800, 600))
image.save("html_example.png")
```
## License
MIT License
## Author
Duncan Tourolle - duncan@tourolle.paris

44
pyWebLayout/__init__.py Normal file
View File

@ -0,0 +1,44 @@
"""
PyWebLayout - A Python library for HTML-like layout and rendering.
This library provides classes for rendering HTML-like content to images
using a box-based layout system. It includes support for text, tables,
and containers, as well as parsers for HTML and EPUB content. It also
supports pagination for ebook-like content with the ability to pause,
save state, and resume rendering.
"""
__version__ = '0.1.0'
# Core abstractions
from pyWebLayout.core import Renderable, Interactable, Layoutable, Queriable
# Style components
from pyWebLayout.style import Alignment, Font, FontWeight, FontStyle, TextDecoration
# Typesetting algorithms
from pyWebLayout.typesetting import (
FlowLayout,
Paginator, PaginationState,
DocumentPaginator, DocumentPaginationState
)
# Abstract document model
from pyWebLayout.abstract.document import Document, Book, Chapter, MetadataType
# Concrete implementations
from pyWebLayout.concrete.box import Box
from pyWebLayout.concrete.text import Line
from pyWebLayout.concrete.page import Container, Page
# Abstract components
from pyWebLayout.abstract.inline import Word
# Layout components
from pyWebLayout.table import Table, TableCell
# IO functionality (reading and writing)
from pyWebLayout.io import (
parse_html, html_to_document, # HTML parsing
read_epub # EPUB reading
)

12
pyWebLayout/__main__.py Normal file
View File

@ -0,0 +1,12 @@
import os
import sys
# Direct execution support: put the directory that contains the package at the
# front of sys.path so that the absolute import below resolves.
sys.path.insert(
    0,
    os.path.abspath(os.path.join(os.path.dirname(__file__), '..')),
)

# This import has to come after the sys.path adjustment above.
from pyWebLayout.example import save_examples

if __name__ == "__main__":
    print("Running PyWebLayout examples...")
    save_examples()

View File

@ -0,0 +1,6 @@
from .block import Block, BlockType, Parapgraph, Heading, HeadingLevel, Quote, CodeBlock
from .block import HList, ListItem, ListStyle, Table, TableRow, TableCell
from .block import HorizontalRule, LineBreak, Image
from .inline import Word, FormattedSpan
from .document import Document, MetadataType, Chapter, Book
from .functional import Link, LinkType, Button, Form, FormField, FormFieldType

View File

@ -0,0 +1,783 @@
from typing import List, Iterator, Tuple, Dict, Optional, Union, Any
from enum import Enum
from .inline import Word, FormattedSpan
class BlockType(Enum):
    """Enumeration of different block types for classification purposes"""
    # Every Block stores one of these members and exposes it through its
    # ``block_type`` property. The values are plain sequential integers.

    # Text-carrying blocks
    PARAGRAPH = 1
    HEADING = 2
    QUOTE = 3
    CODE_BLOCK = 4
    # Lists
    LIST = 5
    LIST_ITEM = 6
    # Tables
    TABLE = 7
    TABLE_ROW = 8
    TABLE_CELL = 9
    # Standalone elements
    HORIZONTAL_RULE = 10
    LINE_BREAK = 11
    IMAGE = 12
class Block:
    """
    Common ancestor of every block-level element.

    A block remembers which kind of element it is (a BlockType member) and,
    optionally, the block that contains it. Block elements typically
    represent visual blocks of content that stack vertically.
    """

    def __init__(self, block_type: BlockType):
        """
        Create a block that is not attached to any parent yet.

        Args:
            block_type: The BlockType member classifying this element
        """
        self._parent = None
        self._block_type = block_type

    @property
    def block_type(self) -> BlockType:
        """The BlockType member this element is tagged with"""
        return self._block_type

    @property
    def parent(self):
        """The block containing this one, or None while it is unattached"""
        return self._parent

    @parent.setter
    def parent(self, parent):
        """Record ``parent`` as the block that contains this one"""
        self._parent = parent
class Parapgraph(Block):
    """
    A paragraph is a block-level element that contains a sequence of words.

    Words and formatted spans are kept in two independent, insertion-ordered
    lists. The class name carries a historical misspelling; it is kept so
    existing imports continue to work, and the correctly spelled alias
    ``Paragraph`` is defined right after the class.
    """

    def __init__(self):
        """Initialize an empty paragraph"""
        super().__init__(BlockType.PARAGRAPH)
        self._words: List[Word] = []
        self._spans: List[FormattedSpan] = []

    def add_word(self, word: Word):
        """
        Add a word to this paragraph.

        Args:
            word: The Word object to add
        """
        self._words.append(word)

    def add_span(self, span: FormattedSpan):
        """
        Add a formatted span to this paragraph.

        Args:
            span: The FormattedSpan object to add
        """
        self._spans.append(span)

    def words(self) -> Iterator[Tuple[int, Word]]:
        """
        Iterate over the words in this paragraph.

        Yields:
            Tuples of (index, word) for each word, with indices starting at 0
        """
        yield from enumerate(self._words)

    def spans(self) -> Iterator[FormattedSpan]:
        """
        Iterate over the formatted spans in this paragraph.

        Yields:
            Each FormattedSpan in the order it was added
        """
        yield from self._spans

    @property
    def word_count(self) -> int:
        """Get the number of words in this paragraph"""
        return len(self._words)


# Correctly spelled alias for the misspelled public class name above. Both
# names refer to the same class object, so isinstance checks are unaffected.
Paragraph = Parapgraph
class HeadingLevel(Enum):
    """Enumeration representing HTML heading levels (h1-h6)"""
    # Each member's integer value equals the numeric heading level, so
    # ``HeadingLevel.H3.value`` is 3.
    H1 = 1
    H2 = 2
    H3 = 3
    H4 = 4
    H5 = 5
    H6 = 6
class Heading(Parapgraph):
    """
    A heading (h1 through h6): paragraph-like text plus a heading level.

    Heading derives from the paragraph class because it stores words the
    same way; it only adds the level and reports itself as a HEADING block.
    """

    def __init__(self, level: HeadingLevel = HeadingLevel.H1):
        """
        Create an empty heading.

        Args:
            level: The heading level (h1-h6); defaults to H1
        """
        super().__init__()
        self._level = level
        # The paragraph constructor tags the block as PARAGRAPH; retag it.
        self._block_type = BlockType.HEADING

    @property
    def level(self) -> HeadingLevel:
        """The current heading level"""
        return self._level

    @level.setter
    def level(self, level: HeadingLevel):
        """Replace the heading level"""
        self._level = level
class Quote(Block):
    """
    A blockquote: a container holding an ordered list of child blocks.
    """

    def __init__(self):
        """Create a blockquote with no children"""
        super().__init__(BlockType.QUOTE)
        self._blocks: List[Block] = []

    def add_block(self, block: Block):
        """
        Append a child block and make this quote its parent.

        Args:
            block: The Block object to add
        """
        self._blocks.append(block)
        block.parent = self

    def blocks(self) -> Iterator[Block]:
        """
        Walk the child blocks in insertion order.

        Yields:
            Each Block in the quote
        """
        yield from self._blocks
class CodeBlock(Block):
    """
    Pre-formatted code stored line by line, tagged with a language name.

    The language string is only stored here (intended for syntax
    highlighting); this class does not interpret it.
    """

    def __init__(self, language: str = ""):
        """
        Create an empty code block.

        Args:
            language: The programming language for syntax highlighting
        """
        super().__init__(BlockType.CODE_BLOCK)
        self._lines: List[str] = []
        self._language = language

    @property
    def language(self) -> str:
        """The programming language recorded for this block"""
        return self._language

    @language.setter
    def language(self, language: str):
        """Replace the recorded programming language"""
        self._language = language

    def add_line(self, line: str):
        """
        Append one line of code.

        Args:
            line: The line of code to add
        """
        self._lines.append(line)

    def lines(self) -> Iterator[Tuple[int, str]]:
        """
        Walk the stored lines together with their zero-based position.

        Yields:
            Tuples of (line_number, line_text) for each line
        """
        yield from enumerate(self._lines)

    @property
    def line_count(self) -> int:
        """How many lines this code block holds"""
        return len(self._lines)
class ListStyle(Enum):
    """Enumeration of list styles"""
    # One member per HTML list element; HList stores one of these as its style.
    UNORDERED = 1  # <ul>
    ORDERED = 2  # <ol>
    DEFINITION = 3  # <dl>
class HList(Block):
    """
    An HTML list (ul, ol or dl) holding an ordered sequence of list items.
    """

    def __init__(self, style: ListStyle = ListStyle.UNORDERED):
        """
        Create an empty list.

        Args:
            style: The style of list (unordered, ordered, definition)
        """
        super().__init__(BlockType.LIST)
        self._items: List[ListItem] = []
        self._style = style

    @property
    def style(self) -> ListStyle:
        """The list style currently in effect"""
        return self._style

    @style.setter
    def style(self, style: ListStyle):
        """Replace the list style"""
        self._style = style

    def add_item(self, item: 'ListItem'):
        """
        Append an item and make this list its parent.

        Args:
            item: The ListItem to add
        """
        self._items.append(item)
        item.parent = self

    def items(self) -> Iterator['ListItem']:
        """
        Walk the items in insertion order.

        Yields:
            Each ListItem in the list
        """
        yield from self._items

    @property
    def item_count(self) -> int:
        """How many items the list holds"""
        return len(self._items)
class ListItem(Block):
    """
    One entry of a list; a container for child blocks with an optional term.
    """

    def __init__(self, term: Optional[str] = None):
        """
        Create an empty list item.

        Args:
            term: Optional term for definition lists (dt element)
        """
        super().__init__(BlockType.LIST_ITEM)
        self._term = term
        self._blocks: List[Block] = []

    @property
    def term(self) -> Optional[str]:
        """The definition term (for definition lists), or None"""
        return self._term

    @term.setter
    def term(self, term: str):
        """Replace the definition term"""
        self._term = term

    def add_block(self, block: Block):
        """
        Append a child block and make this item its parent.

        Args:
            block: The Block object to add
        """
        self._blocks.append(block)
        block.parent = self

    def blocks(self) -> Iterator[Block]:
        """
        Walk the child blocks in insertion order.

        Yields:
            Each Block in the list item
        """
        yield from self._blocks
class TableCell(Block):
    """
    A table cell element that can contain other block elements.

    Column and row spans are always at least 1: values below 1 are clamped
    both at construction time and when assigned through the properties.
    """

    def __init__(self, is_header: bool = False, colspan: int = 1, rowspan: int = 1):
        """
        Initialize a table cell.

        Args:
            is_header: Whether this cell is a header cell (th) or data cell (td)
            colspan: Number of columns this cell spans (clamped to a minimum of 1)
            rowspan: Number of rows this cell spans (clamped to a minimum of 1)
        """
        super().__init__(BlockType.TABLE_CELL)
        self._is_header = is_header
        # Apply the same lower bound as the property setters so a cell can
        # never start out with a zero or negative span.
        self._colspan = max(1, colspan)
        self._rowspan = max(1, rowspan)
        self._blocks: List[Block] = []

    @property
    def is_header(self) -> bool:
        """Check if this is a header cell"""
        return self._is_header

    @is_header.setter
    def is_header(self, is_header: bool):
        """Set whether this is a header cell"""
        self._is_header = is_header

    @property
    def colspan(self) -> int:
        """Get the column span"""
        return self._colspan

    @colspan.setter
    def colspan(self, colspan: int):
        """Set the column span"""
        self._colspan = max(1, colspan)  # Ensure minimum of 1

    @property
    def rowspan(self) -> int:
        """Get the row span"""
        return self._rowspan

    @rowspan.setter
    def rowspan(self, rowspan: int):
        """Set the row span"""
        self._rowspan = max(1, rowspan)  # Ensure minimum of 1

    def add_block(self, block: Block):
        """
        Add a block element to this cell and make the cell its parent.

        Args:
            block: The Block object to add
        """
        self._blocks.append(block)
        block.parent = self

    def blocks(self) -> Iterator[Block]:
        """
        Iterate over the blocks in this cell.

        Yields:
            Each Block in the cell, in insertion order
        """
        yield from self._blocks
class TableRow(Block):
    """
    One row of a table: an ordered sequence of table cells.
    """

    def __init__(self):
        """Create a row with no cells"""
        super().__init__(BlockType.TABLE_ROW)
        self._cells: List[TableCell] = []

    def add_cell(self, cell: TableCell):
        """
        Append a cell and make this row its parent.

        Args:
            cell: The TableCell to add
        """
        self._cells.append(cell)
        cell.parent = self

    def cells(self) -> Iterator[TableCell]:
        """
        Walk the cells in insertion order.

        Yields:
            Each TableCell in the row
        """
        yield from self._cells

    @property
    def cell_count(self) -> int:
        """How many cells the row holds"""
        return len(self._cells)
class Table(Block):
    """
    A table made of rows grouped into header, body and footer sections.
    """

    def __init__(self, caption: Optional[str] = None):
        """
        Create an empty table.

        Args:
            caption: Optional caption for the table
        """
        super().__init__(BlockType.TABLE)
        self._caption = caption
        self._header_rows: List[TableRow] = []
        self._rows: List[TableRow] = []
        self._footer_rows: List[TableRow] = []

    @property
    def caption(self) -> Optional[str]:
        """The table caption, or None"""
        return self._caption

    @caption.setter
    def caption(self, caption: Optional[str]):
        """Replace the table caption"""
        self._caption = caption

    def add_row(self, row: TableRow, section: str = "body"):
        """
        Append a row to one section of the table and become its parent.

        Args:
            row: The TableRow to add
            section: The section to add the row to ("header", "body", or
                "footer"), matched case-insensitively; any other value is
                treated as "body"
        """
        row.parent = self
        section_key = section.lower()
        if section_key == "header":
            target = self._header_rows
        elif section_key == "footer":
            target = self._footer_rows
        else:
            # Anything unrecognised lands in the body.
            target = self._rows
        target.append(row)

    def header_rows(self) -> Iterator[TableRow]:
        """
        Walk the rows of the header section.

        Yields:
            Each TableRow in the table header
        """
        yield from self._header_rows

    def body_rows(self) -> Iterator[TableRow]:
        """
        Walk the rows of the body section.

        Yields:
            Each TableRow in the table body
        """
        yield from self._rows

    def footer_rows(self) -> Iterator[TableRow]:
        """
        Walk the rows of the footer section.

        Yields:
            Each TableRow in the table footer
        """
        yield from self._footer_rows

    def all_rows(self) -> Iterator[Tuple[str, TableRow]]:
        """
        Walk every row, header first, then body, then footer.

        Yields:
            Tuples of (section, row) for each row
        """
        for header_row in self._header_rows:
            yield "header", header_row
        for body_row in self._rows:
            yield "body", body_row
        for footer_row in self._footer_rows:
            yield "footer", footer_row

    @property
    def row_count(self) -> Dict[str, int]:
        """Row counts keyed by "header", "body", "footer" and "total" """
        n_header = len(self._header_rows)
        n_body = len(self._rows)
        n_footer = len(self._footer_rows)
        return {
            "header": n_header,
            "body": n_body,
            "footer": n_footer,
            "total": n_header + n_body + n_footer
        }
class HorizontalRule(Block):
    """
    The block counterpart of an HTML <hr>; it carries no data of its own.
    """

    def __init__(self):
        """Create the rule, tagged as HORIZONTAL_RULE"""
        super().__init__(BlockType.HORIZONTAL_RULE)
class LineBreak(Block):
    """
    The block counterpart of an HTML <br>; it carries no data of its own.
    """

    def __init__(self):
        """Create the break, tagged as LINE_BREAK"""
        super().__init__(BlockType.LINE_BREAK)
class Image(Block):
    """
    An image element that can be displayed in a document.

    The image is loaded eagerly: constructing an Image, or assigning a new
    ``source``, immediately calls ``load()``. ``load()`` reports failures
    through the ``error`` property instead of raising.
    """

    # Upper bound, in seconds, on how long a URL download may block. Without
    # a timeout a stalled server would hang the constructor indefinitely.
    _URL_TIMEOUT = 10

    def __init__(self, source: str, alt_text: Optional[str] = None,
                 width: Optional[int] = None, height: Optional[int] = None):
        """
        Initialize an image.

        Args:
            source: The path or URL to the image (local file, http(s) URL,
                or ``data:image/...`` URI)
            alt_text: Alternative text description of the image
            width: Optional width to display the image
            height: Optional height to display the image
        """
        super().__init__(BlockType.IMAGE)
        self._source = source
        self._alt_text = alt_text or ""
        self._width = width
        self._height = height
        self._loaded_image = None
        self._error = None
        # Try to load the image immediately
        self.load()

    @property
    def source(self) -> str:
        """Get the image source path or URL"""
        return self._source

    @source.setter
    def source(self, source: str):
        """Set the image source path or URL and reload the image from it"""
        self._source = source
        self._loaded_image = None  # Reset loaded image when source changes
        self._error = None
        # Try to load the image with the new source
        self.load()

    @property
    def alt_text(self) -> str:
        """Get the alternative text for the image"""
        return self._alt_text

    @alt_text.setter
    def alt_text(self, alt_text: str):
        """Set the alternative text for the image"""
        self._alt_text = alt_text

    @property
    def width(self) -> Optional[int]:
        """Get the specified width for the image"""
        return self._width

    @width.setter
    def width(self, width: Optional[int]):
        """Set the specified width for the image"""
        self._width = width

    @property
    def height(self) -> Optional[int]:
        """Get the specified height for the image"""
        return self._height

    @height.setter
    def height(self, height: Optional[int]):
        """Set the specified height for the image"""
        self._height = height

    @property
    def loaded_image(self):
        """Get the loaded image data, if available"""
        return self._loaded_image

    @property
    def error(self) -> Optional[str]:
        """Get any error message from attempting to load the image"""
        return self._error

    def load(self):
        """
        Load the image from the source.

        The source is tried, in order, as a local file, an http(s) URL and a
        ``data:image/`` URI. On failure the reason is stored in ``error``.

        Returns:
            True if the image was loaded successfully, False otherwise
        """
        try:
            import os
            from PIL import Image as PILImage

            if os.path.isfile(self._source):
                # Local file
                self._loaded_image = PILImage.open(self._source)
                self._error = None
                return True
            if self._source.startswith(('http://', 'https://')):
                return self._load_from_url(PILImage)
            if self._source.startswith('data:image/'):
                return self._load_from_data_uri(PILImage)

            self._error = f"Unable to load image from source: {self._source}"
            return False
        except ImportError as e:
            self._error = f"PIL library not available: {str(e)}"
            return False
        except Exception as e:
            self._error = f"Error loading image: {str(e)}"
            return False

    def _load_from_url(self, pil_image) -> bool:
        """Download the source URL and decode it; requires ``requests``."""
        try:
            import requests
            from io import BytesIO

            # No stream=True: the body is needed in full anyway, and a
            # non-streamed response releases its connection in every branch.
            response = requests.get(self._source, timeout=self._URL_TIMEOUT)
            if response.status_code != 200:
                self._error = f"Failed to load image: HTTP status {response.status_code}"
                return False

            self._loaded_image = pil_image.open(BytesIO(response.content))
            self._error = None
            return True
        except ImportError:
            self._error = "Requests library not available for URL loading"
            return False
        except Exception as e:
            self._error = f"Error loading image from URL: {str(e)}"
            return False

    def _load_from_data_uri(self, pil_image) -> bool:
        """Decode a ``data:image/...;base64,<payload>`` source."""
        try:
            import base64
            from io import BytesIO

            # Everything after the first comma is the base64 payload; a
            # source without a comma fails the unpacking and is reported.
            _header, encoded = self._source.split(',', 1)
            decoded = base64.b64decode(encoded)
            self._loaded_image = pil_image.open(BytesIO(decoded))
            self._error = None
            return True
        except Exception as e:
            self._error = f"Error loading image from data URI: {str(e)}"
            return False

    def get_dimensions(self) -> Tuple[Optional[int], Optional[int]]:
        """
        Get the dimensions of the image.

        Returns:
            The loaded image's (width, height) when an image is loaded;
            otherwise the explicitly specified (width, height), either of
            which may be None
        """
        if self._loaded_image:
            return self._loaded_image.size
        return self._width, self._height

    def get_aspect_ratio(self) -> Optional[float]:
        """
        Get the aspect ratio of the image (width/height).

        Returns:
            The aspect ratio as a float, or None if the image is not loaded
            and no usable dimensions are specified
        """
        if self._loaded_image:
            width, height = self._loaded_image.size
            if height > 0:
                return width / height
        elif self._width is not None and self._height is not None and self._height > 0:
            return self._width / self._height
        return None

    def calculate_scaled_dimensions(self, max_width: Optional[int] = None,
                                    max_height: Optional[int] = None) -> Tuple[int, int]:
        """
        Calculate the scaled dimensions of the image within constraints.

        Explicitly specified dimensions take precedence and are returned
        without applying ``max_width``/``max_height``. The constraints are
        only applied to the natural size of a loaded image.

        Args:
            max_width: The maximum width constraint
            max_height: The maximum height constraint

        Returns:
            A tuple of (width, height)
        """
        # Use specified dimensions if available
        if self._width is not None and self._height is not None:
            return self._width, self._height

        # If no image is loaded, fill missing dimensions with 300x200 defaults
        if not self._loaded_image:
            return self._width or 300, self._height or 200

        orig_width, orig_height = self._loaded_image.size

        # If only one dimension is specified, derive the other from the
        # loaded image's aspect ratio
        if self._width is not None and self._height is None:
            aspect = orig_width / orig_height
            return self._width, int(self._width / aspect)
        if self._height is not None and self._width is None:
            aspect = orig_width / orig_height
            return int(self._height * aspect), self._height

        # Apply max constraints if provided, width first and then height
        width, height = orig_width, orig_height
        if max_width is not None and width > max_width:
            height = int(height * (max_width / width))
            width = max_width
        if max_height is not None and height > max_height:
            width = int(width * (max_height / height))
            height = max_height
        return width, height

View File

@ -0,0 +1,377 @@
from __future__ import annotations
from typing import List, Dict, Optional, Tuple, Union, Any
from enum import Enum
from .block import Block, BlockType, Heading, HeadingLevel, Parapgraph
from .functional import Link, Button, Form
from .inline import Word, FormattedSpan
class MetadataType(Enum):
    """Types of metadata that can be associated with a document"""
    # Members serve as the keys of Document's metadata dictionary.
    TITLE = 1
    AUTHOR = 2
    DESCRIPTION = 3
    KEYWORDS = 4
    LANGUAGE = 5
    PUBLICATION_DATE = 6
    MODIFIED_DATE = 7
    PUBLISHER = 8
    IDENTIFIER = 9
    COVER_IMAGE = 10
    # Numbered well apart from the sequential members above.
    CUSTOM = 100
class Document:
    """
    Abstract representation of a complete document like an HTML page or an ebook.
    This class manages the logical structure of the document without rendering concerns.
    """

    # Private attributes under which the block classes keep their children:
    # generic containers (_blocks), lists (_items), tables (the three row
    # sections) and table rows (_cells). Used to walk the block tree.
    _CHILD_ATTRS = ('_blocks', '_items', '_header_rows', '_rows',
                    '_footer_rows', '_cells')

    def __init__(self, title: Optional[str] = None, language: str = "en-US"):
        """
        Initialize a new document.

        Args:
            title: The document title
            language: The document language code
        """
        self._blocks: List[Block] = []
        self._metadata: Dict[MetadataType, Any] = {}
        self._anchors: Dict[str, Block] = {}  # Named anchors for navigation
        self._resources: Dict[str, Any] = {}  # External resources like images
        self._stylesheets: List[Dict[str, Any]] = []  # CSS stylesheets
        self._scripts: List[str] = []  # JavaScript code

        # Set basic metadata
        if title:
            self.set_metadata(MetadataType.TITLE, title)
        self.set_metadata(MetadataType.LANGUAGE, language)

    @property
    def blocks(self) -> List[Block]:
        """Get the top-level blocks in this document"""
        return self._blocks

    def add_block(self, block: Block):
        """
        Add a block to this document.

        Args:
            block: The block to add
        """
        self._blocks.append(block)

    def set_metadata(self, meta_type: MetadataType, value: Any):
        """
        Set a metadata value.

        Args:
            meta_type: The type of metadata
            value: The metadata value
        """
        self._metadata[meta_type] = value

    def get_metadata(self, meta_type: MetadataType) -> Optional[Any]:
        """
        Get a metadata value.

        Args:
            meta_type: The type of metadata

        Returns:
            The metadata value, or None if not set
        """
        return self._metadata.get(meta_type)

    def add_anchor(self, name: str, target: Block):
        """
        Add a named anchor to this document.

        Args:
            name: The anchor name
            target: The target block
        """
        self._anchors[name] = target

    def get_anchor(self, name: str) -> Optional[Block]:
        """
        Get a named anchor from this document.

        Args:
            name: The anchor name

        Returns:
            The target block, or None if not found
        """
        return self._anchors.get(name)

    def add_resource(self, name: str, resource: Any):
        """
        Add a resource to this document.

        Args:
            name: The resource name
            resource: The resource data
        """
        self._resources[name] = resource

    def get_resource(self, name: str) -> Optional[Any]:
        """
        Get a resource from this document.

        Args:
            name: The resource name

        Returns:
            The resource data, or None if not found
        """
        return self._resources.get(name)

    def add_stylesheet(self, stylesheet: Dict[str, Any]):
        """
        Add a stylesheet to this document.

        Args:
            stylesheet: The stylesheet data
        """
        self._stylesheets.append(stylesheet)

    def add_script(self, script: str):
        """
        Add a script to this document.

        Args:
            script: The script code
        """
        self._scripts.append(script)

    def get_title(self) -> Optional[str]:
        """
        Get the document title.

        Returns:
            The document title, or None if not set
        """
        return self.get_metadata(MetadataType.TITLE)

    def set_title(self, title: str):
        """
        Set the document title.

        Args:
            title: The document title
        """
        self.set_metadata(MetadataType.TITLE, title)

    def find_blocks_by_type(self, block_type: BlockType) -> List[Block]:
        """
        Find all blocks of a specific type, at any nesting depth.

        The search is depth-first in document order and descends into every
        child container listed in ``_CHILD_ATTRS``, so blocks nested inside
        quotes, list items, and table rows and cells are all found.

        Args:
            block_type: The type of blocks to find

        Returns:
            A list of matching blocks
        """
        result = []

        def _find_recursive(blocks):
            for block in blocks:
                if block.block_type == block_type:
                    result.append(block)
                # Descend into whichever child containers this block has.
                for attr in self._CHILD_ATTRS:
                    children = getattr(block, attr, None)
                    if isinstance(children, (list, tuple)):
                        _find_recursive(children)

        _find_recursive(self._blocks)
        return result

    def find_headings(self) -> List[Heading]:
        """
        Find all headings in the document.

        Returns:
            A list of heading blocks
        """
        blocks = self.find_blocks_by_type(BlockType.HEADING)
        return [block for block in blocks if isinstance(block, Heading)]

    def generate_table_of_contents(self) -> List[Tuple[int, str, Block]]:
        """
        Generate a table of contents from headings.

        Returns:
            A list of tuples containing (level, title, heading_block), where
            level is the numeric heading level and title is the heading's
            words joined by single spaces
        """
        toc = []
        for heading in self.find_headings():
            title = " ".join(word.text for _, word in heading.words()).strip()
            level = heading.level.value  # Numeric value of the HeadingLevel member
            toc.append((level, title, heading))
        return toc
class Chapter:
    """
    A chapter or section of a document.

    It bundles a title, a nesting level, an ordered list of blocks and a
    free-form metadata dictionary keyed by string.
    """

    def __init__(self, title: Optional[str] = None, level: int = 1):
        """
        Create an empty chapter.

        Args:
            title: The chapter title
            level: The chapter level (1 = top level, 2 = subsection, etc.)
        """
        self._metadata: Dict[str, Any] = {}
        self._blocks: List[Block] = []
        self._level = level
        self._title = title

    @property
    def title(self) -> Optional[str]:
        """The chapter title, or None when untitled"""
        return self._title

    @title.setter
    def title(self, title: str):
        """Replace the chapter title"""
        self._title = title

    @property
    def level(self) -> int:
        """The chapter's nesting level"""
        return self._level

    @property
    def blocks(self) -> List[Block]:
        """The chapter's blocks (the internal list itself, not a copy)"""
        return self._blocks

    def add_block(self, block: Block):
        """
        Append a block to the end of the chapter.

        Args:
            block: The block to add
        """
        self._blocks.append(block)

    def set_metadata(self, key: str, value: Any):
        """
        Store a metadata value, replacing any earlier value for the key.

        Args:
            key: The metadata key
            value: The metadata value
        """
        self._metadata[key] = value

    def get_metadata(self, key: str) -> Optional[Any]:
        """
        Look up a metadata value.

        Args:
            key: The metadata key

        Returns:
            The metadata value, or None if not set
        """
        return self._metadata.get(key, None)
class Book(Document):
    """
    Abstract representation of an ebook: a document organised into chapters.
    """

    def __init__(self, title: Optional[str] = None, author: Optional[str] = None, language: str = "en-US"):
        """
        Create a book without chapters.

        Args:
            title: The book title
            author: The book author
            language: The book language code
        """
        super().__init__(title, language)
        self._chapters: List[Chapter] = []
        # An empty or missing author leaves the AUTHOR metadata unset.
        if author:
            self.set_metadata(MetadataType.AUTHOR, author)

    @property
    def chapters(self) -> List[Chapter]:
        """The book's chapters (the internal list itself, not a copy)"""
        return self._chapters

    def add_chapter(self, chapter: Chapter):
        """
        Append a chapter to the end of the book.

        Args:
            chapter: The chapter to add
        """
        self._chapters.append(chapter)

    def create_chapter(self, title: Optional[str] = None, level: int = 1) -> Chapter:
        """
        Build a new chapter, append it to the book and hand it back.

        Args:
            title: The chapter title
            level: The chapter level

        Returns:
            The new chapter
        """
        new_chapter = Chapter(title, level)
        self.add_chapter(new_chapter)
        return new_chapter

    def get_author(self) -> Optional[str]:
        """
        Look up the book author.

        Returns:
            The book author, or None if not set
        """
        return self.get_metadata(MetadataType.AUTHOR)

    def set_author(self, author: str):
        """
        Store the book author.

        Args:
            author: The book author
        """
        self.set_metadata(MetadataType.AUTHOR, author)

    def generate_table_of_contents(self) -> List[Tuple[int, str, Chapter]]:
        """
        Build a table of contents from the chapters that have a title.

        Returns:
            A list of tuples containing (level, title, chapter)
        """
        return [
            (entry.level, entry.title, entry)
            for entry in self._chapters
            if entry.title
        ]

View File

@ -0,0 +1,310 @@
from __future__ import annotations
from enum import Enum
from typing import Callable, Dict, Any, Optional, Union, List, Tuple
from pyWebLayout.base import Interactable
class LinkType(Enum):
    """Enumeration of different types of links for classification purposes"""
    # Link.execute() runs the callback only for API and FUNCTION links; for
    # INTERNAL and EXTERNAL links it just returns the link's location.
    INTERNAL = 1  # Links within the same document (e.g., chapter references, bookmarks)
    EXTERNAL = 2  # Links to external resources (e.g., websites, other documents)
    API = 3  # Links that trigger API calls (e.g., for settings management)
    FUNCTION = 4  # Links that execute a specific function
class Link(Interactable):
    """
    A link that either names a location to navigate to or runs a callback.

    Links cover navigation inside a document, references to external
    resources, and API/function triggers such as settings management.
    """

    def __init__(self,
                 location: str,
                 link_type: LinkType = LinkType.INTERNAL,
                 callback: Optional[Callable] = None,
                 params: Optional[Dict[str, Any]] = None,
                 title: Optional[str] = None):
        """
        Create a link.

        Args:
            location: The target location or identifier for this link
            link_type: The type of link (internal, external, API, function)
            callback: Optional callback function to execute when the link is activated
            params: Optional parameters to pass to the callback or API
            title: Optional title/tooltip for the link
        """
        # NOTE(review): execute() reads self._callback, presumably stored by
        # Interactable.__init__ - its definition is not shown here.
        super().__init__(callback)
        self._title = title
        self._params = params or {}
        self._link_type = link_type
        self._location = location

    @property
    def location(self) -> str:
        """The target location of this link"""
        return self._location

    @property
    def link_type(self) -> LinkType:
        """The LinkType this link was created with"""
        return self._link_type

    @property
    def params(self) -> Dict[str, Any]:
        """The keyword parameters handed to the callback"""
        return self._params

    @property
    def title(self) -> Optional[str]:
        """The title/tooltip for this link, or None"""
        return self._title

    def execute(self) -> Any:
        """
        Carry out the link action according to its type.

        API and FUNCTION links that have a callback call it with the
        location as the first argument and the params as keyword arguments.
        Every other link simply returns its location.

        Returns:
            The callback's result, or the location string.
        """
        runs_callback = self._link_type in (LinkType.API, LinkType.FUNCTION)
        if not (runs_callback and self._callback):
            # For INTERNAL and EXTERNAL links, return the location
            # The renderer/browser will handle the navigation
            return self._location
        return self._callback(self._location, **self._params)
class Button(Interactable):
    """
    A clickable button that runs a callback.

    Behaves much like a function link, but is meant to be drawn as a button.
    """

    def __init__(self,
                 label: str,
                 callback: Callable,
                 params: Optional[Dict[str, Any]] = None,
                 enabled: bool = True):
        """
        Create a button.

        Args:
            label: Text shown on the button
            callback: Function run when the button is clicked
            params: Optional keyword parameters handed to the callback
            enabled: Whether the button starts out enabled
        """
        super().__init__(callback)
        self._enabled = enabled
        self._params = params or {}
        self._label = label

    @property
    def label(self) -> str:
        """Text shown on the button."""
        return self._label

    @label.setter
    def label(self, label: str):
        """Replace the button text."""
        self._label = label

    @property
    def enabled(self) -> bool:
        """Whether the button is currently enabled."""
        return self._enabled

    @enabled.setter
    def enabled(self, enabled: bool):
        """Enable or disable the button."""
        self._enabled = enabled

    def execute(self) -> Any:
        """
        Run the callback with the stored parameters as keyword arguments.

        Returns:
            The callback's result, or None when the button is disabled or
            has no callback.
        """
        if not self._enabled or not self._callback:
            return None
        return self._callback(**self._params)
class Form(Interactable):
    """
    A submittable collection of named input fields.

    Intended for user input and settings configuration.
    """

    def __init__(self,
                 form_id: str,
                 action: Optional[str] = None,
                 callback: Optional[Callable] = None):
        """
        Create a form.

        Args:
            form_id: Unique identifier of the form
            action: Action URL or endpoint used for submission
            callback: Optional function run when the form is submitted
        """
        super().__init__(callback)
        self._fields: Dict[str, FormField] = {}
        self._action = action
        self._form_id = form_id

    @property
    def form_id(self) -> str:
        """Identifier of the form."""
        return self._form_id

    @property
    def action(self) -> Optional[str]:
        """Action URL or endpoint of the form."""
        return self._action

    def add_field(self, field: FormField):
        """
        Register a field under its name and point the field back at this form.

        A field added under a name that is already present replaces the
        earlier one.

        Args:
            field: The FormField to add
        """
        field.form = self
        self._fields[field.name] = field

    def get_field(self, name: str) -> Optional[FormField]:
        """
        Look a field up by name.

        Args:
            name: Name of the wanted field

        Returns:
            The matching FormField, or None when no field has that name
        """
        return self._fields.get(name)

    def get_values(self) -> Dict[str, Any]:
        """
        Collect the current value of every field.

        Returns:
            A dictionary mapping field names to their current values
        """
        values = {}
        for field_name, field in self._fields.items():
            values[field_name] = field.value
        return values

    def execute(self) -> Any:
        """
        Submit the form.

        Returns:
            The result of calling the callback with the form id and the
            field values, or the field values when there is no callback.
        """
        values = self.get_values()
        if not self._callback:
            return values
        return self._callback(self._form_id, values)
class FormFieldType(Enum):
    """The kinds of input a form field can represent."""

    # Free-text inputs
    TEXT = 1
    PASSWORD = 2
    # Choice inputs
    CHECKBOX = 3
    RADIO = 4
    SELECT = 5
    # Multi-line text
    TEXTAREA = 6
    # Typed single-value inputs
    NUMBER = 7
    DATE = 8
    TIME = 9
    EMAIL = 10
    URL = 11
    COLOR = 12
    RANGE = 13
    # Carried with the form without a visible input
    HIDDEN = 14
class FormField:
    """
    A single input of a form.

    Holds the field's name, type, label, current value, required flag,
    options and a reference to the form it was added to.
    """

    def __init__(self,
                 name: str,
                 field_type: FormFieldType,
                 label: Optional[str] = None,
                 value: Any = None,
                 required: bool = False,
                 options: Optional[List[Tuple[str, str]]] = None):
        """
        Create a form field.

        Args:
            name: Name of the field
            field_type: Kind of field
            label: Optional label; the name is used when it is missing or empty
            value: Initial value
            required: Whether the field is required
            options: Options for select, radio, or checkbox fields as
                (value, label) tuples
        """
        self._form: Optional[Form] = None
        self._options = options if options else []
        self._required = required
        self._value = value
        self._label = label if label else name
        self._field_type = field_type
        self._name = name

    @property
    def name(self) -> str:
        """Name of the field."""
        return self._name

    @property
    def field_type(self) -> FormFieldType:
        """Kind of field."""
        return self._field_type

    @property
    def label(self) -> str:
        """Label of the field."""
        return self._label

    @property
    def value(self) -> Any:
        """Current value of the field."""
        return self._value

    @value.setter
    def value(self, value: Any):
        """Replace the current value."""
        self._value = value

    @property
    def required(self) -> bool:
        """Whether the field is required."""
        return self._required

    @property
    def options(self) -> List[Tuple[str, str]]:
        """Options of the field."""
        return self._options

    @property
    def form(self) -> Optional[Form]:
        """Form this field belongs to, or None if it has not been attached."""
        return self._form

    @form.setter
    def form(self, form: Form):
        """Attach the field to a form."""
        self._form = form

View File

@ -0,0 +1,208 @@
from __future__ import annotations
from pyWebLayout.base import Queriable
from pyWebLayout.style import Font
from typing import Tuple, Union, List, Optional, Dict
class Word:
    """
    The logical representation of one word of a document.

    A word knows its text, style, background and its neighbours in the word
    sequence, and can hold the parts it was split into by hyphenation. It
    carries no rendering specifics.
    """

    def __init__(self, text: str, style: Font, background=None, previous: Union[Word, None] = None):
        """
        Create a word.

        Args:
            text: Text content of the word
            style: Font style information for the word
            background: Optional background colour override; the style's
                background is used when this is missing or falsy
            previous: The word that comes before this one, if any
        """
        if not background:
            background = style.background
        self._text = text
        self._style = style
        self._background = background
        self._previous = previous
        self._next = None
        # Filled in by hyphenate(); None means "not hyphenated".
        self._hyphenated_parts = None

    @property
    def text(self) -> str:
        """Text content of the word."""
        return self._text

    @property
    def style(self) -> Font:
        """Font style of the word."""
        return self._style

    @property
    def background(self):
        """Background colour of the word."""
        return self._background

    @property
    def previous(self) -> Union[Word, None]:
        """The preceding word in sequence, if any."""
        return self._previous

    @property
    def next(self) -> Union[Word, None]:
        """The following word in sequence, if any."""
        return self._next

    @property
    def hyphenated_parts(self) -> Union[List[str], None]:
        """Parts produced by hyphenate(), or None if not hyphenated."""
        return self._hyphenated_parts

    def add_next(self, next_word: Word):
        """Record the word that follows this one."""
        self._next = next_word

    def _insert_hyphens(self, language: str = None) -> str:
        """
        Return the word's text with '-' inserted by pyphen.

        Args:
            language: Language code for hyphenation. If None (or empty),
                the style's language is used.
        """
        # pyphen is imported lazily so it is only needed when hyphenating.
        import pyphen
        lang = language if language else self._style.language
        return pyphen.Pyphen(lang=lang).inserted(self._text, hyphen='-')

    def can_hyphenate(self, language: str = None) -> bool:
        """
        Report whether the hyphen-inserted form of the word contains a '-'.

        Args:
            language: Language code for hyphenation. If None, uses the
                style's language.

        Returns:
            bool: True if the hyphen-inserted text contains a hyphen,
            False otherwise.
        """
        return '-' in self._insert_hyphens(language)

    def hyphenate(self, language: str = None) -> bool:
        """
        Split the word at its hyphenation points and store the parts.

        Every part except the last keeps a trailing '-'.

        Args:
            language: Language code for hyphenation. If None, uses the
                style's language.

        Returns:
            bool: True if parts were stored, False if the hyphen-inserted
            text contains no hyphen (stored parts are left untouched).
        """
        marked = self._insert_hyphens(language)
        if '-' not in marked:
            return False
        pieces = marked.split('-')
        self._hyphenated_parts = [piece + '-' for piece in pieces[:-1]] + pieces[-1:]
        return True

    def dehyphenate(self):
        """Discard any stored hyphenation parts."""
        self._hyphenated_parts = None

    def get_hyphenated_part(self, index: int) -> str:
        """
        Return one stored hyphenation part.

        Args:
            index: Index of the wanted part.

        Returns:
            The text of that part.

        Raises:
            IndexError: If the index is out of range or the word has not
                been hyphenated.
        """
        parts = self._hyphenated_parts
        if not parts:
            raise IndexError("Word has not been hyphenated")
        return parts[index]

    def get_hyphenated_part_count(self) -> int:
        """
        Count the stored hyphenation parts.

        Returns:
            The number of parts, or 0 if the word has not been hyphenated.
        """
        parts = self._hyphenated_parts
        if not parts:
            return 0
        return len(parts)
class FormattedSpan:
    """
    A sequence of words that all share one style and background.
    """

    def __init__(self, style: Font, background=None):
        """
        Create an empty span.

        Args:
            style: Font style information shared by every word of the span
            background: Optional background colour override; the style's
                background is used when this is missing or falsy
        """
        if not background:
            background = style.background
        self._style = style
        self._background = background
        self._words: List[Word] = []

    @property
    def style(self) -> Font:
        """Font style of the span."""
        return self._style

    @property
    def background(self):
        """Background colour of the span."""
        return self._background

    @property
    def words(self) -> List[Word]:
        """Words of the span, in insertion order."""
        return self._words

    def add_word(self, text: str) -> Word:
        """
        Append a new word carrying the span's style and background.

        The new word is linked to the span's current last word in both
        directions.

        Args:
            text: Text content of the word

        Returns:
            The newly created Word object
        """
        last = None
        if self._words:
            last = self._words[-1]
        created = Word(text, self._style, self._background, last)
        if last:
            # Complete the forward half of the doubly linked chain.
            last.add_next(created)
        self._words.append(created)
        return created

68
pyWebLayout/base.py Normal file
View File

@ -0,0 +1,68 @@
from abc import ABC
import numpy as np
from pyWebLayout.style import Alignment
class Renderable(ABC):
    """
    Base class for objects that can be rendered to an image.

    Subclasses are expected to provide their own render method; the
    version here does nothing.
    """

    def render(self):
        """
        Render the object to an image.

        Returns:
            PIL.Image: The rendered image (this base version returns None)
        """
        return None
class Interactable(ABC):
    """
    Base class for objects that react to being interacted with.

    An interactable stores a callback that is run on interaction.
    """

    def __init__(self, callback=None):
        """
        Store the interaction callback.

        Args:
            callback: The function to call when this object is interacted with
        """
        self._callback = callback

    def interact(self, point: np.generic):
        """
        Forward an interaction at the given point to the callback.

        Args:
            point: The coordinates of the interaction

        Returns:
            The callback's result for the point, or None when no callback
            is set
        """
        handler = self._callback
        return None if handler is None else handler(point)
class Layoutable(ABC):
    """
    Base class for objects whose contents can be arranged.

    Subclasses are expected to provide their own layout method; the
    version here does nothing.
    """

    def layout(self):
        """
        Arrange the object's contents.

        Meant to be called before rendering so the contents are positioned.
        """
        return None
class Queriable(ABC):
    """
    Base class for objects that can be asked whether they contain a point.
    """

    def in_object(self, point: np.generic):
        """
        Check whether a point lies inside the object.

        This base version performs no check and returns None; subclasses
        provide the actual test.

        Args:
            point: The coordinates to test
        """
        return None

View File

@ -0,0 +1,5 @@
from .box import Box
from .page import Container, Page
from .text import Text, RenderableWord, Line
from .functional import RenderableLink, RenderableButton, RenderableForm, RenderableFormField
from .image import RenderableImage

View File

@ -0,0 +1,61 @@
import numpy as np
from PIL import Image
from pyWebLayout.base import Renderable, Queriable
from pyWebLayout.layout import Alignment
class Box(Renderable, Queriable):
    """
    A rectangular region that renders to an image canvas and can be hit-tested.

    A box is described by an origin and a size. When an instance carries a
    ``_content`` attribute that is not None, that content is rendered and
    pasted onto the box's canvas according to the horizontal and vertical
    alignment.
    """

    def __init__(self, origin, size, callback=None, sheet: Image.Image = None, mode: str = None,
                 halign=Alignment.CENTER, valign=Alignment.CENTER):
        """
        Create a box.

        Args:
            origin: Coordinates of the box's first corner
            size: Extent of the box as (width, height)
            callback: Optional callback stored on the box
            sheet: Optional image whose mode is used for the canvas
            mode: Canvas mode used when no sheet is given ('RGBA' if None)
            halign: Horizontal alignment of the content
            valign: Vertical alignment of the content
        """
        self._origin = np.array(origin)
        self._size = np.array(size)
        self._end = self._origin + self._size
        self._callback = callback
        self._sheet = sheet
        # A sheet dictates the mode; otherwise use the explicit mode.
        self._mode = mode if sheet is None else sheet.mode
        self._halign = halign
        self._valign = valign

    def in_shape(self, point):
        """
        Test whether a point lies inside the box.

        The box covers ``origin <= point < origin + size``. The far corner
        is derived from the current origin and size on every call, so the
        answer stays correct after either attribute has been reassigned.

        Args:
            point: Coordinates to test; a trailing axis holds the components

        Returns:
            The result of ``np.all`` over the last axis of the comparison
        """
        point = np.asarray(point)
        end = self._origin + self._size
        return np.all((point >= self._origin) & (point < end), axis=-1)

    def in_object(self, point):
        """
        Check whether a point lies inside the box.

        Same test as in_shape, exposed under the name Queriable declares.
        """
        return self.in_shape(point)

    def render(self) -> Image.Image:
        """
        Render the box to a new image.

        Returns:
            A canvas of the box's size, with the rendered content pasted at
            its aligned position when content is present
        """
        # Plain ints keep Pillow independent of NumPy scalar types.
        box_width, box_height = (int(v) for v in self._size)
        canvas = Image.new(self._mode if self._mode else 'RGBA', (box_width, box_height))

        content = getattr(self, '_content', None)
        if content is None:
            return canvas

        content_render = content.render()
        content_width, content_height = content_render.size

        # Horizontal alignment (CENTER is the default)
        if self._halign == Alignment.LEFT:
            x_offset = 0
        elif self._halign == Alignment.RIGHT:
            x_offset = box_width - content_width
        else:
            x_offset = (box_width - content_width) // 2

        # Vertical alignment (CENTER is the default)
        if self._valign == Alignment.TOP:
            y_offset = 0
        elif self._valign == Alignment.BOTTOM:
            y_offset = box_height - content_height
        else:
            y_offset = (box_height - content_height) // 2

        canvas.paste(content_render, (x_offset, y_offset))
        return canvas

View File

@ -0,0 +1,545 @@
from __future__ import annotations
from typing import Optional, Dict, Any, Tuple, List, Union
import numpy as np
from PIL import Image, ImageDraw, ImageFont
from pyWebLayout.base import Renderable, Queriable
from pyWebLayout.abstract.functional import Link, Button, Form, FormField, LinkType, FormFieldType
from pyWebLayout.style import Font, TextDecoration
from .box import Box
from .text import Text
class RenderableLink(Box, Queriable):
    """
    A concrete implementation for rendering Link objects.

    The link text is underlined, coloured by link type, surrounded by
    padding, and shown on a highlight while hovered.
    """

    # Text colour per link type; a type missing here keeps the font's colour.
    _LINK_COLOURS = {
        LinkType.INTERNAL: (0, 0, 200),  # Blue for internal links
        LinkType.EXTERNAL: (0, 0, 180),  # Darker blue for external links
        LinkType.API: (150, 0, 0),       # Red for API links
        LinkType.FUNCTION: (0, 120, 0),  # Green for function links
    }
    # Fill painted behind the text while hovered (light blue with alpha).
    _HOVER_FILL = (220, 220, 255, 100)

    def __init__(self, link: Link, text: str, font: Font,
                 padding: Tuple[int, int, int, int] = (2, 4, 2, 4),
                 origin=None, size=None, callback=None, sheet=None, mode=None):
        """
        Initialize a renderable link.

        Args:
            link: The abstract Link object to render
            text: The text to display for the link
            font: The font to use for the link text
            padding: Padding as (top, right, bottom, left)
            origin: Optional origin coordinates ((0, 0) when None)
            size: Optional size override (text size plus padding when None)
            callback: Optional callback override (link.execute when None)
            sheet: Optional sheet for rendering
            mode: Optional mode for rendering
        """
        link_font = font.with_decoration(TextDecoration.UNDERLINE)
        colour = self._LINK_COLOURS.get(link.link_type)
        if colour is not None:
            link_font = link_font.with_colour(colour)
        self._text_obj = Text(text, link_font)

        if size is None:
            text_width, text_height = self._text_obj.size
            size = (
                text_width + padding[1] + padding[3],   # width + right + left padding
                text_height + padding[0] + padding[2],  # height + top + bottom padding
            )
        if callback is None:
            callback = link.execute
        # Compare with None: truth-testing an array origin raises ValueError.
        if origin is None:
            origin = (0, 0)

        super().__init__(origin, size, callback, sheet, mode)

        self._link = link
        self._padding = padding
        self._hovered = False

    @property
    def link(self) -> Link:
        """Get the abstract Link object"""
        return self._link

    def render(self) -> Image.Image:
        """
        Render the link.

        Returns:
            A PIL Image containing the rendered link
        """
        canvas = super().render()

        if self._hovered:
            # The highlight is a background, so it is painted before the
            # text; drawing it afterwards would overwrite the text pixels.
            width, height = (int(v) for v in self._size)
            ImageDraw.Draw(canvas).rectangle([(0, 0), (width, height)], fill=self._HOVER_FILL)

        text_img = self._text_obj.render()
        # Text starts after the left and top padding.
        canvas.paste(text_img, (self._padding[3], self._padding[0]), text_img)
        return canvas

    def set_hovered(self, hovered: bool):
        """Set whether the link is being hovered over"""
        self._hovered = hovered

    def in_object(self, point):
        """Check if a point is within this link"""
        relative_point = np.array(point) - self._origin
        return (0 <= relative_point[0] < self._size[0] and
                0 <= relative_point[1] < self._size[1])
class RenderableButton(Box, Queriable):
    """
    A concrete implementation for rendering Button objects.

    The button is drawn as a rounded rectangle whose background, border and
    text colours depend on whether it is disabled, pressed or hovered.
    """

    def __init__(self, button: Button, font: Font,
                 padding: Tuple[int, int, int, int] = (6, 10, 6, 10),
                 border_radius: int = 4,
                 origin=None, size=None, callback=None, sheet=None, mode=None):
        """
        Initialize a renderable button.

        Args:
            button: The abstract Button object to render
            font: The font to use for the button text
            padding: Padding as (top, right, bottom, left)
            border_radius: Radius for rounded corners
            origin: Optional origin coordinates ((0, 0) when None)
            size: Optional size override (text size plus padding when None)
            callback: Optional callback override (button.execute when None)
            sheet: Optional sheet for rendering
            mode: Optional mode for rendering
        """
        # Used to measure the label when no explicit size is given.
        self._text_obj = Text(button.label, font)

        if size is None:
            text_width, text_height = self._text_obj.size
            size = (
                text_width + padding[1] + padding[3],   # width + right + left padding
                text_height + padding[0] + padding[2],  # height + top + bottom padding
            )
        if callback is None:
            callback = button.execute
        # Compare with None: truth-testing an array origin raises ValueError.
        if origin is None:
            origin = (0, 0)

        super().__init__(origin, size, callback, sheet, mode)

        self._button = button
        self._font = font
        self._padding = padding
        self._border_radius = border_radius
        self._pressed = False
        self._hovered = False

    @property
    def button(self) -> Button:
        """Get the abstract Button object"""
        return self._button

    def _state_colours(self):
        """Return (background, border, text) colours for the current state."""
        if not self._button.enabled:
            return (200, 200, 200), (150, 150, 150), (100, 100, 100)
        if self._pressed:
            return (70, 130, 180), (50, 100, 150), (255, 255, 255)
        if self._hovered:
            return (100, 160, 220), (70, 130, 180), (255, 255, 255)
        return (100, 150, 200), (70, 120, 170), (255, 255, 255)

    def render(self) -> Image.Image:
        """
        Render the button.

        Returns:
            A PIL Image containing the rendered button
        """
        canvas = super().render()
        draw = ImageDraw.Draw(canvas)
        bg_color, border_color, text_color = self._state_colours()

        width, height = (int(v) for v in self._size)
        # Rectangle corners are inclusive, so the far corner is size - 1;
        # ending at `size` would put the right and bottom border off-canvas.
        far_corner = (max(width - 1, 0), max(height - 1, 0))
        draw.rounded_rectangle([(0, 0), far_corner], fill=bg_color,
                               outline=border_color, width=1,
                               radius=self._border_radius)

        # Draw the label in the colour chosen for the current state.
        label = Text(self._button.label, self._font.with_colour(text_color))
        text_img = label.render()
        text_x = (width - text_img.width) // 2
        text_y = (height - text_img.height) // 2
        canvas.paste(text_img, (text_x, text_y), text_img)
        return canvas

    def set_pressed(self, pressed: bool):
        """Set whether the button is being pressed"""
        self._pressed = pressed

    def set_hovered(self, hovered: bool):
        """Set whether the button is being hovered over"""
        self._hovered = hovered

    def in_object(self, point):
        """Check if a point is within this button"""
        relative_point = np.array(point) - self._origin
        return (0 <= relative_point[0] < self._size[0] and
                0 <= relative_point[1] < self._size[1])
class RenderableForm(Box):
    """
    A concrete implementation for rendering Form objects.

    The form is drawn as a vertical stack of its fields followed by a
    horizontally centred "Submit" button.
    """

    def __init__(self, form: Form, font: Font,
                 field_padding: Tuple[int, int, int, int] = (5, 10, 5, 10),
                 spacing: int = 10,
                 origin=None, size=None, callback=None, sheet=None, mode=None):
        """
        Initialize a renderable form.

        Args:
            form: The abstract Form object to render
            font: The font to use for form text
            field_padding: Padding for form fields
            spacing: Spacing between form elements
            origin: Optional origin coordinates ((0, 0) when None)
            size: Optional size override (computed from the elements when None)
            callback: Optional callback override (form.execute when None)
            sheet: Optional sheet for rendering
            mode: Optional mode for rendering
        """
        if callback is None:
            callback = form.execute
        # Compare with None: truth-testing an array origin/size raises ValueError.
        if origin is None:
            origin = (0, 0)
        # Placeholder size; replaced below when no size was given.
        temp_size = size if size is not None else (400, 300)
        super().__init__(origin, temp_size, callback, sheet, mode)

        self._form = form
        self._font = font
        self._field_padding = field_padding
        self._spacing = spacing

        self._renderable_fields: List[RenderableFormField] = []
        self._submit_button: Optional[RenderableButton] = None
        self._create_form_elements()

        if size is None:
            self._calculate_size()

    def _create_form_elements(self):
        """Create a renderable for each form field, plus the submit button"""
        for field in self._form._fields.values():
            self._renderable_fields.append(
                RenderableFormField(field, self._font, self._field_padding))

        submit_button = Button("Submit", self._form.execute)
        self._submit_button = RenderableButton(submit_button, self._font)

    def _calculate_size(self):
        """Calculate the size of the form based on its elements"""
        fields = self._renderable_fields
        button = self._submit_button

        # Width: widest element plus 20px of padding.
        widths = [field.size[0] for field in fields]
        widths.append(button.size[0] if button else 0)
        max_width = max(widths) + 20

        # Height: all fields, the gaps between them, the button, 20px padding.
        total_height = sum(field.size[1] for field in fields)
        total_height += self._spacing * (len(fields) - 1 if fields else 0)
        if button:
            total_height += self._spacing + button.size[1]
        total_height += 20

        self._size = np.array([max_width, total_height])
        # Keep the far corner in step with the new size.
        self._end = self._origin + self._size

    def layout(self):
        """Position the fields top to bottom and centre the submit button"""
        y_pos = 10  # Start with some padding
        for field in self._renderable_fields:
            field._origin = np.array([10, y_pos])
            y_pos += field.size[1] + self._spacing

        if self._submit_button:
            submit_x = (self._size[0] - self._submit_button.size[0]) // 2
            self._submit_button._origin = np.array([submit_x, y_pos])

    def render(self) -> Image.Image:
        """
        Render the form.

        Returns:
            A PIL Image containing the rendered form
        """
        # Element positions are assigned by layout().
        self.layout()
        canvas = super().render()

        for field in self._renderable_fields:
            field_img = field.render()
            canvas.paste(field_img, tuple(int(v) for v in field._origin), field_img)

        if self._submit_button:
            button_img = self._submit_button.render()
            button_pos = tuple(int(v) for v in self._submit_button._origin)
            canvas.paste(button_img, button_pos, button_img)

        return canvas

    def handle_click(self, point):
        """
        Handle a click on the form.

        Args:
            point: The coordinates of the click, in the same coordinate
                system as the element origins assigned by layout()

        Returns:
            The result of the clicked element's callback or click handler,
            or None if no element was clicked
        """
        # Origins are only assigned by layout(); run it so a click arriving
        # before the first render is tested against real positions.
        self.layout()
        point = np.asarray(point)

        if self._submit_button and self._submit_button.in_object(point):
            return self._submit_button._callback()

        for field in self._renderable_fields:
            if field.in_object(point):
                return field.handle_click(point - field._origin)
        return None
class RenderableFormField(Box, Queriable):
    """
    A concrete implementation for rendering FormField objects.

    A field is drawn as its label with a bordered input area underneath
    that shows the field's current value.
    """

    # Vertical gap, in pixels, between the label and the input area.
    _LABEL_GAP = 5
    # Character drawn in place of each character of a password value.
    _PASSWORD_MASK = "*"

    def __init__(self, field: FormField, font: Font,
                 padding: Tuple[int, int, int, int] = (5, 10, 5, 10),
                 origin=None, size=None, callback=None, sheet=None, mode=None):
        """
        Initialize a renderable form field.

        Args:
            field: The abstract FormField object to render
            font: The font to use for field text
            padding: Padding as (top, right, bottom, left)
            origin: Optional origin coordinates ((0, 0) when None)
            size: Optional size override (label plus input area when None)
            callback: Optional callback override
            sheet: Optional sheet for rendering
            mode: Optional mode for rendering
        """
        self._label_text = Text(field.label, font)

        if size is None:
            label_width, label_height = self._label_text.size
            # Text areas and selects get a wider input area.
            if field.field_type in (FormFieldType.TEXTAREA, FormFieldType.SELECT):
                field_width = 200
            else:
                field_width = 150
            field_height = self._input_height(field.field_type)
            total_width = max(label_width, field_width) + padding[1] + padding[3]
            total_height = (label_height + field_height
                            + padding[0] + padding[2] + self._LABEL_GAP)
            size = (total_width, total_height)
        # Compare with None: truth-testing an array origin raises ValueError.
        if origin is None:
            origin = (0, 0)

        super().__init__(origin, size, callback, sheet, mode)

        self._field = field
        self._font = font
        self._padding = padding
        self._focused = False

    @staticmethod
    def _input_height(field_type) -> int:
        """Return the input-area height in pixels for a field type."""
        return 80 if field_type == FormFieldType.TEXTAREA else 24

    def _input_rect(self, label_height):
        """Return (x, y, width, height) of the input area below the label."""
        field_x = self._padding[3]
        field_y = self._padding[0] + label_height + self._LABEL_GAP
        field_width = self._size[0] - self._padding[1] - self._padding[3]
        field_height = self._input_height(self._field.field_type)
        return field_x, field_y, field_width, field_height

    def render(self) -> Image.Image:
        """
        Render the form field.

        Returns:
            A PIL Image containing the rendered form field
        """
        canvas = super().render()
        draw = ImageDraw.Draw(canvas)

        # Label in the top-left corner, inside the padding.
        label_img = self._label_text.render()
        canvas.paste(label_img, (self._padding[3], self._padding[0]), label_img)

        field_x, field_y, field_width, field_height = self._input_rect(label_img.height)

        # White input area; the border is highlighted while focused.
        bg_color = (255, 255, 255)
        border_color = (100, 150, 200) if self._focused else (200, 200, 200)
        draw.rectangle(
            [(field_x, field_y), (field_x + field_width, field_y + field_height)],
            fill=bg_color, outline=border_color, width=1
        )

        if self._field.value is not None:
            value_text = str(self._field.value)
            if self._field.field_type == FormFieldType.PASSWORD:
                # One mask character per typed character, so the value's
                # length stays visible without revealing its content.
                value_text = self._PASSWORD_MASK * len(value_text)

            value_img = Text(value_text, self._font).render()
            # Left-aligned with 5px inset, vertically centred in the input.
            value_x = field_x + 5
            value_y = field_y + (field_height - value_img.height) // 2
            canvas.paste(value_img, (value_x, value_y), value_img)

        return canvas

    def set_focused(self, focused: bool):
        """Set whether the field is focused"""
        self._focused = focused

    def handle_click(self, point):
        """
        Handle a click on the field.

        A click inside the input area focuses the field.

        Args:
            point: The coordinates of the click relative to the field

        Returns:
            True if the click landed on the input area, False otherwise
        """
        field_x, field_y, field_width, field_height = self._input_rect(self._label_text.size[1])
        inside = (field_x <= point[0] <= field_x + field_width and
                  field_y <= point[1] <= field_y + field_height)
        if inside:
            self.set_focused(True)
            return True
        return False

    def in_object(self, point):
        """Check if a point is within this field"""
        relative_point = np.array(point) - self._origin
        return (0 <= relative_point[0] < self._size[0] and
                0 <= relative_point[1] < self._size[1])

View File

@ -0,0 +1,233 @@
import os
from typing import Optional, Tuple, Union, Dict, Any
import numpy as np
from PIL import Image as PILImage, ImageDraw, ImageFont
from pyWebLayout.base import Renderable, Queriable
from pyWebLayout.abstract.block import Image as AbstractImage
from .box import Box
from pyWebLayout.layout import Alignment
class RenderableImage(Box, Queriable):
"""
A concrete implementation for rendering Image objects.
"""
def __init__(self, image: AbstractImage,
             max_width: Optional[int] = None, max_height: Optional[int] = None,
             origin=None, size=None, callback=None, sheet=None, mode=None,
             halign=Alignment.CENTER, valign=Alignment.CENTER):
    """
    Initialize a renderable image.

    Loading is attempted immediately; a failure is recorded as an error
    message rather than raised.

    Args:
        image: The abstract Image object to render
        max_width: Maximum width constraint for the image
        max_height: Maximum height constraint for the image
        origin: Optional origin coordinates ((0, 0) when None)
        size: Optional size override (the image's scaled dimensions when None)
        callback: Optional callback function
        sheet: Optional sheet for rendering
        mode: Optional image mode
        halign: Horizontal alignment
        valign: Vertical alignment
    """
    self._abstract_image = image
    self._pil_image = None
    self._error_message = None

    self._load_image()

    if size is None:
        size = image.calculate_scaled_dimensions(max_width, max_height)
    # Compare with None: truth-testing an array origin raises ValueError.
    if origin is None:
        origin = (0, 0)

    super().__init__(origin, size, callback, sheet, mode, halign, valign)
def _load_image(self):
    """
    Load the image from the abstract image's source.

    The source may be a local file path or an http(s) URL. On success the
    PIL image is stored on this object and on the abstract image. On
    failure an error message is stored instead; nothing is raised. The
    abstract image's ``_error`` is always set to the resulting message
    (None on success).
    """
    # Upper bound, in seconds, on waiting for the remote server, so an
    # unresponsive host cannot block loading indefinitely.
    request_timeout = 10

    try:
        source = self._abstract_image.source

        if os.path.isfile(source):
            # Local file
            self._pil_image = PILImage.open(source)
            self._abstract_image._loaded_image = self._pil_image
        elif source.startswith(('http://', 'https://')):
            # URL - requires the optional requests library
            try:
                import requests
            except ImportError:
                self._error_message = "Requests library not available for URL loading"
            else:
                from io import BytesIO
                response = requests.get(source, timeout=request_timeout)
                if response.status_code == 200:
                    self._pil_image = PILImage.open(BytesIO(response.content))
                    self._abstract_image._loaded_image = self._pil_image
                else:
                    self._error_message = f"Failed to load image: HTTP status {response.status_code}"
        else:
            self._error_message = f"Unable to load image from source: {source}"
    except Exception as e:
        # Best effort: any failure (bad path, network error, timeout,
        # undecodable data) becomes an error message on the placeholder.
        self._error_message = f"Error loading image: {str(e)}"

    self._abstract_image._error = self._error_message
def render(self) -> PILImage.Image:
    """
    Render the image onto the box's canvas.

    A loaded image is scaled to the box and placed according to the
    alignment settings; otherwise the error placeholder is drawn.

    Returns:
        A PIL Image containing the rendered image
    """
    canvas = super().render()

    if not self._pil_image:
        # Nothing was loaded: show the placeholder instead.
        self._draw_error_placeholder(canvas)
        return canvas

    scaled = self._resize_image()
    scaled_width, scaled_height = scaled.size
    box_width, box_height = self._size

    # Horizontal placement (CENTER is the default)
    if self._halign == Alignment.LEFT:
        left = 0
    elif self._halign == Alignment.RIGHT:
        left = box_width - scaled_width
    else:
        left = (box_width - scaled_width) // 2

    # Vertical placement (CENTER is the default)
    if self._valign == Alignment.TOP:
        top = 0
    elif self._valign == Alignment.BOTTOM:
        top = box_height - scaled_height
    else:
        top = (box_height - scaled_height) // 2

    position = (left, top)
    # The image serves as its own mask only when both sides are RGBA.
    both_rgba = scaled.mode == 'RGBA' and canvas.mode == 'RGBA'
    if both_rgba:
        canvas.paste(scaled, position, scaled)
    else:
        canvas.paste(scaled, position)
    return canvas
def _resize_image(self) -> PILImage.Image:
    """
    Resize the image to fit within the box while maintaining aspect ratio.

    The scaling factor is the smaller of the width and height ratios, so the
    result always fits inside the box. The result is always in RGBA mode.
    Each target dimension is clamped to at least one pixel so that extreme
    aspect ratios never ask PIL for a zero-sized image.

    Returns:
        A resized PIL Image. When no image is loaded, or the loaded image has
        a zero width or height, a translucent grey placeholder of the box
        size is returned instead.
    """
    target_width, target_height = self._size

    def placeholder() -> PILImage.Image:
        # PILImage.new wants a plain tuple of ints for the size
        box_size = (int(target_width), int(target_height))
        return PILImage.new('RGBA', box_size, (200, 200, 200, 100))

    if not self._pil_image:
        return placeholder()

    orig_width, orig_height = self._pil_image.size
    # A source without area has no aspect ratio (and would divide by zero)
    if orig_width <= 0 or orig_height <= 0:
        return placeholder()

    # Use the smaller ratio to ensure the image fits within the box
    ratio = min(target_width / orig_width, target_height / orig_height)

    # int() truncates, so a very thin image could otherwise collapse to 0 px
    new_size = (
        max(1, int(orig_width * ratio)),
        max(1, int(orig_height * ratio)),
    )

    # Convert to RGBA if needed so callers can rely on an alpha channel
    source = self._pil_image
    if source.mode != 'RGBA':
        source = source.convert('RGBA')
    return source.resize(new_size, PILImage.LANCZOS)
def _draw_error_placeholder(self, canvas: PILImage.Image):
    """
    Draw a placeholder for when the image can't be loaded.

    The placeholder is a light grey box with a border and a diagonal cross.
    If an error message is available it is word-wrapped to the box width
    (10px padding on each side) and drawn centred, starting 10px from the top.

    Args:
        canvas: The canvas to draw on (modified in place)
    """
    draw = ImageDraw.Draw(canvas)
    border_colour = (180, 180, 180)

    # Draw a gray box with a border
    draw.rectangle([(0, 0), self._size], fill=(240, 240, 240), outline=border_colour, width=2)
    # Draw an X across the box
    draw.line([(0, 0), self._size], fill=border_colour, width=2)
    draw.line([(0, self._size[1]), (self._size[0], 0)], fill=border_colour, width=2)

    # Add error text if available
    if not self._error_message:
        return

    try:
        # Try to use a basic font
        font = ImageFont.load_default()
        error_text = "Error: " + self._error_message
        max_text_width = self._size[0] - 20  # 10px padding on each side

        # Simple text wrapping - split by words and add lines
        lines = []
        current_line = ""
        for word in error_text.split():
            test_line = current_line + " " + word if current_line else word
            text_bbox = draw.textbbox((0, 0), test_line, font=font)
            if text_bbox[2] - text_bbox[0] <= max_text_width:
                current_line = test_line
            else:
                # A single word wider than the box must not emit an empty
                # line in front of it, so only flush a non-empty line
                if current_line:
                    lines.append(current_line)
                current_line = word
        if current_line:
            lines.append(current_line)

        # Draw each line, centred horizontally
        y_pos = 10
        for line in lines:
            text_bbox = draw.textbbox((0, 0), line, font=font)
            text_width = text_bbox[2] - text_bbox[0]
            text_height = text_bbox[3] - text_bbox[1]
            x_pos = (self._size[0] - text_width) // 2
            draw.text((x_pos, y_pos), line, fill=(80, 80, 80), font=font)
            # Move to the next line
            y_pos += text_height + 2
    except Exception:
        # Best effort: the box and cross already signal the error, so a
        # failure while rendering the message text is deliberately ignored
        pass
def in_object(self, point):
    """
    Report whether a point lies inside this image's box.

    The point is translated by the box origin; the left/top edges are
    inclusive and the right/bottom edges are exclusive.
    """
    local = np.array(point) - self._origin
    inside_x = 0 <= local[0] < self._size[0]
    if not inside_x:
        return inside_x
    return 0 <= local[1] < self._size[1]

View File

@ -0,0 +1,175 @@
from typing import List, Tuple, Optional, Dict, Any
import numpy as np
from PIL import Image
from pyWebLayout.base import Renderable, Layoutable
from .box import Box
from pyWebLayout.layout import Alignment
class Container(Box, Layoutable):
    """
    A container that can hold multiple renderable objects and lay them out.

    Children are placed one after another along ``direction`` with ``spacing``
    pixels between them, inside the area left over after ``padding``. The
    origins assigned to children by :meth:`layout` are relative to the
    container's own top-left corner.
    """

    def __init__(self, origin, size, direction='vertical', spacing=5,
                 callback=None, sheet=None, mode=None,
                 halign=Alignment.CENTER, valign=Alignment.CENTER,
                 padding: Tuple[int, int, int, int] = (10, 10, 10, 10)):
        """
        Initialize a container.

        Args:
            origin: Top-left corner coordinates
            size: Width and height of the container
            direction: Layout direction ('vertical' or 'horizontal')
            spacing: Space between elements
            callback: Optional callback function
            sheet: Optional image sheet
            mode: Optional image mode
            halign: Horizontal alignment
            valign: Vertical alignment
            padding: Padding as (top, right, bottom, left)
        """
        super().__init__(origin, size, callback, sheet, mode, halign, valign)
        self._children: List[Renderable] = []
        self._direction = direction
        self._spacing = spacing
        self._padding = padding

    def add_child(self, child: Renderable):
        """Add a child element to this container and return the container."""
        self._children.append(child)
        return self

    def layout(self):
        """
        Layout the children according to the container's direction and spacing.

        Only children exposing both ``_size`` and ``_origin`` are positioned.
        Any direction other than 'vertical' is treated as horizontal. Children
        that are themselves ``Layoutable`` are laid out recursively.
        """
        if not self._children:
            return

        # Get available space after padding
        padding_top, padding_right, padding_bottom, padding_left = self._padding
        available_width = self._size[0] - padding_left - padding_right
        available_height = self._size[1] - padding_top - padding_bottom

        if self._direction == 'vertical':
            current_y = padding_top
            for child in self._children:
                if hasattr(child, '_size') and hasattr(child, '_origin'):
                    child_width, child_height = child._size
                    # Cross-axis (horizontal) position based on alignment
                    if self._halign == Alignment.LEFT:
                        x_pos = padding_left
                    elif self._halign == Alignment.RIGHT:
                        x_pos = padding_left + available_width - child_width
                    else:  # CENTER
                        x_pos = padding_left + (available_width - child_width) // 2
                    child._origin = np.array([x_pos, current_y])
                    # Move down for next child
                    current_y += child_height + self._spacing
                # Layout the child if it's layoutable
                if isinstance(child, Layoutable):
                    child.layout()
        else:  # horizontal
            current_x = padding_left
            for child in self._children:
                if hasattr(child, '_size') and hasattr(child, '_origin'):
                    child_width, child_height = child._size
                    # Cross-axis (vertical) position based on alignment
                    if self._valign == Alignment.TOP:
                        y_pos = padding_top
                    elif self._valign == Alignment.BOTTOM:
                        y_pos = padding_top + available_height - child_height
                    else:  # CENTER
                        y_pos = padding_top + (available_height - child_height) // 2
                    child._origin = np.array([current_x, y_pos])
                    # Move right for next child
                    current_x += child_width + self._spacing
                # Layout the child if it's layoutable
                if isinstance(child, Layoutable):
                    child.layout()

    def render(self) -> Image.Image:
        """
        Render the container with all its children.

        Returns:
            The container's canvas with every child that has an ``_origin``
            pasted at the position computed by :meth:`layout`.
        """
        # Make sure children are laid out
        self.layout()
        # Create base canvas
        canvas = super().render()

        for child in self._children:
            if not hasattr(child, '_origin'):
                continue
            child_img = child.render()
            # layout() stores origins relative to this container, so they are
            # already canvas coordinates; subtracting the container origin
            # again would misplace children of a container not at (0, 0)
            rel_pos = tuple(int(coord) for coord in child._origin)
            # An image can only act as its own mask if it has an alpha band
            if child_img.mode in ('RGBA', 'LA'):
                canvas.paste(child_img, rel_pos, child_img)
            else:
                canvas.paste(child_img, rel_pos)
        return canvas
class Page(Container):
    """
    Top-level container representing an HTML page.

    A page is a vertical container anchored at (0, 0) whose canvas is filled
    with a background colour before the children are drawn.
    """

    def __init__(self, size=(800, 600), background_color=(255, 255, 255), mode='RGBA'):
        """
        Initialize a page.

        Args:
            size: Width and height of the page
            background_color: Background color as RGB tuple
            mode: Image mode
        """
        container_options = {
            'origin': (0, 0),
            'size': size,
            'direction': 'vertical',
            'spacing': 10,
            'mode': mode,
            'halign': Alignment.CENTER,
            'valign': Alignment.TOP,
        }
        super().__init__(**container_options)
        self._background_color = background_color

    def render(self) -> Image:
        """Render the page with all its content"""
        # Children must be positioned before they are drawn
        self.layout()

        # Background-filled canvas for the whole page
        page_canvas = Image.new(self._mode, tuple(self._size), self._background_color)
        canvas_has_alpha = 'A' in self._mode

        for element in self._children:
            if not hasattr(element, '_origin'):
                continue
            element_img = element.render()
            # Child origins are used directly as page coordinates
            position = tuple(element._origin)
            # Use the child's alpha channel as mask only when both sides have one
            if canvas_has_alpha and element_img.mode == 'RGBA':
                page_canvas.paste(element_img, position, element_img)
            else:
                page_canvas.paste(element_img, position)
        return page_canvas

View File

@ -0,0 +1,455 @@
from __future__ import annotations
from pyWebLayout.base import Renderable, Queriable
from .box import Box
from pyWebLayout.layout import Alignment
from pyWebLayout.style import Font, FontStyle, FontWeight, TextDecoration
from pyWebLayout.abstract.inline import Word
from PIL import Image, ImageDraw, ImageFont
from typing import Tuple, Union, List, Optional
import numpy as np
class Text(Renderable, Queriable):
    """
    Concrete implementation for rendering text.

    This class handles the visual representation of text fragments. The
    reported size is the tight bounding box of the text, and the rendered
    image is exactly that size.
    """

    def __init__(self, text: str, style: Font):
        """
        Initialize a Text object.

        Args:
            text: The text content to render
            style: The font style to use for rendering
        """
        super().__init__()
        self._text = text
        self._style = style
        self._line = None
        self._previous = None
        self._next = None
        self._origin = np.array([0, 0])
        # Where the text must be drawn so that its bounding box starts at (0, 0)
        self._draw_offset = (0, 0)
        # Calculate dimensions
        self._calculate_dimensions()

    def _calculate_dimensions(self):
        """Calculate the width and height of the text based on the font metrics"""
        font = self._style.font
        try:
            # The bounding box is (left, top, right, bottom)
            left, top, right, bottom = font.getbbox(self._text)
        except AttributeError:
            # Fallback for older PIL versions without getbbox
            self._width, self._height = font.getsize(self._text)
            self._draw_offset = (0, 0)
        else:
            self._width = right - left
            self._height = bottom - top
            # The box does not start at the drawing origin; remember the shift
            # so render() can place the text fully inside the tight canvas
            self._draw_offset = (-left, -top)
        self._size = (self._width, self._height)

    @property
    def text(self) -> str:
        """Get the text content"""
        return self._text

    @property
    def style(self) -> Font:
        """Get the text style"""
        return self._style

    @property
    def line(self) -> Optional[Line]:
        """Get the line containing this text"""
        return self._line

    @line.setter
    def line(self, line):
        """Set the line containing this text"""
        self._line = line

    @property
    def width(self) -> int:
        """Get the width of the text"""
        return self._width

    @property
    def height(self) -> int:
        """Get the height of the text"""
        return self._height

    @property
    def size(self) -> Tuple[int, int]:
        """Get the size (width, height) of the text"""
        return self._size

    def set_origin(self, x: int, y: int):
        """Set the origin (top-left corner) of this text element"""
        self._origin = np.array([x, y])

    def add_to_line(self, line):
        """Add this text to a line"""
        self._line = line

    def _apply_decoration(self, draw: ImageDraw.Draw):
        """Apply text decoration (underline or strikethrough)"""
        decoration = self._style.decoration
        if decoration == TextDecoration.UNDERLINE:
            # Draw underline at about 90% of the height
            y_position = int(self._height * 0.9)
        elif decoration == TextDecoration.STRIKETHROUGH:
            # Draw strikethrough at about 50% of the height
            y_position = int(self._height * 0.5)
        else:
            return
        # Line thickness scales with the font size but is at least 1px
        thickness = max(1, int(self._style.font_size / 15))
        draw.line([(0, y_position), (self._width, y_position)],
                  fill=self._style.colour, width=thickness)

    def render(self) -> Image.Image:
        """
        Render the text to an image.

        Returns:
            A PIL Image (RGBA, transparent unless a background is set)
            containing the rendered text
        """
        # Create a transparent image with the appropriate size
        canvas = Image.new('RGBA', self._size, (0, 0, 0, 0))
        draw = ImageDraw.Draw(canvas)

        # Draw the text background if specified. A background without an
        # alpha component (plain RGB) is treated as opaque.
        background = self._style.background
        if background and (len(background) < 4 or background[3] > 0):
            draw.rectangle([(0, 0), self._size], fill=background)

        # Draw at the bbox offset so the text is not clipped by the tight canvas
        draw.text(self._draw_offset, self._text, font=self._style.font, fill=self._style.colour)

        # Apply any text decorations
        self._apply_decoration(draw)
        return canvas

    def get_size(self) -> Tuple[int, int]:
        """Get the size (width, height) of the text"""
        return self._size

    def in_object(self, point):
        """Check if a point is within this text object"""
        relative_point = np.array(point) - self._origin
        # Left/top edges are inclusive, right/bottom edges exclusive
        return (0 <= relative_point[0] < self._width and
                0 <= relative_point[1] < self._height)
class RenderableWord(Renderable, Queriable):
    """
    A concrete implementation for rendering Word objects.

    This bridges between the abstract Word class and rendering capabilities.
    A word is rendered as one Text part, or as one Text part per hyphenated
    fragment laid out side by side.
    """

    def __init__(self, word: Word):
        """
        Initialize a new renderable word.

        Args:
            word: The abstract Word object to render
        """
        super().__init__()
        self._word = word
        self._text_parts: List[Text] = []
        self._origin = np.array([0, 0])
        self._size = (0, 0)
        # Initialize with the full word as a single text part
        self._initialize_text_parts()

    def _initialize_text_parts(self):
        """Initialize the text parts based on the word's current state"""
        self._text_parts.clear()
        if self._word.hyphenated_parts:
            # If the word is hyphenated, create a Text object for each part
            for part in self._word.hyphenated_parts:
                self._text_parts.append(Text(part, self._word.style))
        else:
            # Otherwise, create a single Text object for the whole word
            self._text_parts.append(Text(self._word.text, self._word.style))
        self._recalculate_size()

    def _recalculate_size(self):
        """Recalculate the size of the word based on its text parts"""
        if not self._text_parts:
            self._size = (0, 0)
            return
        # For a non-hyphenated word, use the size of the single text part
        if len(self._text_parts) == 1:
            self._size = self._text_parts[0].size
            return
        # For a hyphenated word that's not yet split across lines,
        # calculate the total width and maximum height
        total_width = sum(part.width for part in self._text_parts)
        max_height = max(part.height for part in self._text_parts)
        self._size = (total_width, max_height)

    @property
    def word(self) -> Word:
        """Get the abstract Word object"""
        return self._word

    @property
    def text_parts(self) -> List[Text]:
        """Get the list of Text objects that make up this word"""
        return self._text_parts

    def update_from_word(self):
        """Update the text parts based on changes to the word"""
        self._initialize_text_parts()

    def get_part_size(self, index: int) -> Tuple[int, int]:
        """
        Get the size of a specific text part.

        Args:
            index: The index of the part to query.

        Returns:
            A tuple (width, height) of the part.

        Raises:
            IndexError: If the index is out of range.
        """
        if index >= len(self._text_parts):
            raise IndexError(f"Part index {index} out of range")
        return self._text_parts[index].size

    @property
    def width(self) -> int:
        """Get the total width of the word"""
        return self._size[0]

    @property
    def height(self) -> int:
        """Get the height of the word"""
        return self._size[1]

    def set_origin(self, x: int, y: int):
        """Set the origin (top-left corner) of this word and of its parts"""
        self._origin = np.array([x, y])
        # Parts sit side by side, each shifted by the widths before it
        x_offset = 0
        for part in self._text_parts:
            part.set_origin(x + x_offset, y)
            x_offset += part.width

    def render(self) -> Image.Image:
        """
        Render the word to an image.

        Returns:
            A PIL Image containing the rendered word
        """
        # For a non-hyphenated word or if there's only one part, render just that part
        if len(self._text_parts) == 1:
            return self._text_parts[0].render()
        # For a hyphenated word, create a canvas and paste all parts
        canvas = Image.new('RGBA', self._size, (0, 0, 0, 0))
        x_offset = 0
        for part in self._text_parts:
            part_img = part.render()
            canvas.paste(part_img, (x_offset, 0), part_img)
            x_offset += part.width
        return canvas

    def in_object(self, point):
        """
        Check if a point is within this word.

        The point must fall inside the word's bounding box and inside the
        part that covers its x coordinate (parts may be shorter than the
        word's overall height).
        """
        relative_point = np.array(point) - self._origin
        # First check if the point is within the word's boundaries
        if not (0 <= relative_point[0] < self._size[0] and
                0 <= relative_point[1] < self._size[1]):
            return False

        # Then find the part whose horizontal span contains the point. The
        # hit test uses the part's geometry inside the word directly rather
        # than the part's stored origin, which already includes its x offset
        # after set_origin() and would otherwise be subtracted twice.
        x_offset = 0
        for part in self._text_parts:
            part_width = part.width
            if x_offset <= relative_point[0] < x_offset + part_width:
                return bool(0 <= relative_point[1] < part.height)
            x_offset += part_width
        return False
class Line(Box):
    """
    A line of text consisting of words with consistent spacing.
    """

    def __init__(self, spacing: Tuple[int, int], origin, size, font: Optional[Font] = None,
                 callback=None, sheet=None, mode=None, halign=Alignment.CENTER,
                 valign=Alignment.CENTER, previous = None):
        """
        Create an empty line.

        Args:
            spacing: A tuple of (min_spacing, max_spacing) between words
            origin: The top-left position of the line
            size: The width and height of the line
            font: The default font to use for text in this line
            callback: Optional callback function
            sheet: Optional image sheet
            mode: Optional image mode
            halign: Horizontal alignment of text within the line
            valign: Vertical alignment of text within the line
            previous: Reference to the previous line
        """
        super().__init__(origin, size, callback, sheet, mode, halign, valign)
        # Fall back to the default font when none is supplied
        if not font:
            font = Font()
        self._font = font
        self._spacing = spacing  # (min_spacing, max_spacing)
        self._renderable_words: List[RenderableWord] = []
        self._current_width = 0  # Width consumed so far by words and gaps
        self._previous = previous
        self._next = None

    @property
    def renderable_words(self) -> List[RenderableWord]:
        """The renderable words currently placed on this line"""
        return self._renderable_words

    def set_next(self, line: Line):
        """Record the line that follows this one"""
        self._next = line

    def add_word(self, text: str, font: Optional[Font] = None) -> Union[None, str]:
        """
        Try to place a word on this line.

        If the whole word does not fit, the word is hyphenated and only its
        first part is placed when that part fits.

        Args:
            text: The text content of the word
            font: The font to use for this word, or None to use the line's default font

        Returns:
            None if the word fits, or the remaining text if it doesn't fit
        """
        if not font:
            font = self._font

        abstract_word = Word(text, font)
        candidate = RenderableWord(abstract_word)

        min_gap, _max_gap = self._spacing
        # The first word on a line is not preceded by a gap
        gap = min_gap if self._renderable_words else 0
        start = self._current_width + gap
        line_width = self._size[0]

        # Whole word fits: place it and report success
        full_width = candidate.width
        if start + full_width <= line_width:
            self._renderable_words.append(candidate)
            self._current_width = start + full_width
            return None

        # Otherwise see whether a hyphenated first part can be placed
        if abstract_word.hyphenate():
            candidate.update_from_word()
            head_width = candidate.get_part_size(0)[0]
            if start + head_width <= line_width:
                head_text = abstract_word.get_hyphenated_part(0)
                head = RenderableWord(Word(head_text, font))
                self._renderable_words.append(head)
                self._current_width = start + head_width
                # Everything after the first part goes back to the caller
                part_count = abstract_word.get_hyphenated_part_count()
                tail = [abstract_word.get_hyphenated_part(i) for i in range(1, part_count)]
                return ''.join(tail)

        # Could not hyphenate, or even the first part is too wide
        return text

    def render(self) -> Image.Image:
        """
        Render the line with all its words.

        Returns:
            A PIL Image containing the rendered line
        """
        canvas = super().render()
        words = self._renderable_words
        if not words:
            return canvas

        line_width = self._size[0]
        total_word_width = sum(item.width for item in words)
        gap_count = len(words) - 1

        # Gap between words: none for a single word, the free space spread
        # evenly for justified text, the minimum spacing otherwise
        if gap_count <= 0:
            gap = 0
        elif self._halign == Alignment.JUSTIFY:
            gap = (line_width - total_word_width) // gap_count
        else:
            gap = self._spacing[0]

        # Starting x position based on alignment (default is centred)
        content_width = total_word_width + gap * gap_count
        if self._halign == Alignment.LEFT:
            cursor_x = 0
        elif self._halign == Alignment.RIGHT:
            cursor_x = line_width - content_width
        else:
            cursor_x = (line_width - content_width) // 2

        # All words share one y position, centring the tallest word
        tallest = max(item.height for item in words)
        top = (self._size[1] - tallest) // 2

        for item in words:
            item.set_origin(cursor_x, top)
            item_img = item.render()
            canvas.paste(item_img, (cursor_x, top), item_img)
            cursor_x += item.width + gap
        return canvas

View File

@ -0,0 +1,10 @@
"""
Core functionality for the pyWebLayout library.
This package contains the core abstractions and base classes that form the foundation
of the pyWebLayout rendering system.
"""
from pyWebLayout.core.base import (
Renderable, Interactable, Layoutable, Queriable
)

67
pyWebLayout/core/base.py Normal file
View File

@ -0,0 +1,67 @@
from abc import ABC
import numpy as np
from pyWebLayout.style import Alignment
class Renderable(ABC):
    """
    Base class for objects that can be rendered to an image.

    Subclasses are expected to override :meth:`render`; the default
    implementation does nothing.
    """

    def render(self):
        """
        Render the object to an image.

        Returns:
            PIL.Image: The rendered image (the base implementation returns None)
        """
        return None
class Interactable(ABC):
    """
    Base class for objects that can be interacted with.

    An interactable object stores a callback that is invoked with the
    interaction point.
    """

    def __init__(self, callback=None):
        """
        Create an interactable object.

        Args:
            callback: The function to call when this object is interacted with
        """
        self._callback = callback

    def interact(self, point: np.generic):
        """
        Handle interaction at the given point.

        Args:
            point: The coordinates of the interaction

        Returns:
            The result of calling the callback function with the point,
            or None when no callback is set
        """
        handler = self._callback
        return handler(point) if handler is not None else None
class Layoutable(ABC):
    """
    Base class for objects whose contents can be laid out.

    Subclasses are expected to override :meth:`layout` to arrange their
    contents; the default implementation does nothing.
    """

    def layout(self):
        """
        Layout the object's contents.

        This method should be called before rendering to properly arrange
        the object's contents.
        """
        return None
class Queriable(ABC):
    """
    Base class for objects that can be hit-tested against a point.
    """

    def in_object(self, point:np.generic):
        """
        Check if a point is in the object.

        The base implementation does nothing and returns None.
        """
        return None

View File

@ -0,0 +1,100 @@
#!/usr/bin/env python3
"""
Example EPUB viewer using pyWebLayout.
This example demonstrates how to use pyWebLayout to load an EPUB file,
paginate it, and render pages as images.
"""
import os
import sys
import argparse
from pathlib import Path
from PIL import Image
# Add the parent directory to the path to import pyWebLayout
sys.path.append(str(Path(__file__).parent.parent.parent))
from pyWebLayout import (
Document, Book, read_epub,
DocumentPaginator, Page
)
def main(argv=None):
    """
    Render the first pages of an EPUB file to PNG images.

    Parses the command line, reads the EPUB, paginates it with the requested
    page size and margins, and saves each rendered page into the output
    directory as ``page_NNN.png``.

    Args:
        argv: Optional list of command line arguments; when None, argparse
            reads them from ``sys.argv``.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='EPUB viewer example')
    parser.add_argument('epub_file', help='Path to EPUB file')
    parser.add_argument('--output-dir', '-o', default='output', help='Output directory for rendered pages')
    parser.add_argument('--width', '-w', type=int, default=800, help='Page width')
    # '-h' is already taken by argparse's built-in --help and registering it
    # again raises a conflict error at startup, so height uses '-H'
    parser.add_argument('--height', '-H', type=int, default=1000, help='Page height')
    parser.add_argument('--margin', '-m', type=int, default=50, help='Page margin')
    parser.add_argument('--max-pages', '-p', type=int, default=10, help='Maximum number of pages to render')
    args = parser.parse_args(argv)

    # Create output directory
    os.makedirs(args.output_dir, exist_ok=True)

    # Read EPUB file
    print(f"Reading EPUB file: {args.epub_file}")
    book = read_epub(args.epub_file)

    # Display book metadata
    print(f"Title: {book.get_title()}")
    print(f"Author: {book.get_metadata('AUTHOR')}")
    print(f"Chapters: {len(book.chapters)}")

    # Create a paginator; the same margin is used on all four sides
    page_size = (args.width, args.height)
    margins = (args.margin, args.margin, args.margin, args.margin)
    paginator = DocumentPaginator(
        document=book,
        page_size=page_size,
        margins=margins
    )

    # Paginate and render pages
    print("Paginating and rendering pages...")

    # Option 1: Render all pages at once
    pages = paginator.paginate(max_pages=args.max_pages)
    for i, page in enumerate(pages):
        # Render the page
        image = page.render()
        # Save the image
        output_path = os.path.join(args.output_dir, f"page_{i+1:03d}.png")
        image.save(output_path)
        print(f"Saved page {i+1} to {output_path}")

    # Option 2: Render pages one by one with state saving
    """
    # Clear paginator state
    paginator.state = DocumentPaginationState()
    for i in range(args.max_pages):
        # Get next page
        page = paginator.paginate_next()
        if page is None:
            print(f"No more pages after page {i}")
            break
        # Render the page
        image = page.render()
        # Save the image
        output_path = os.path.join(args.output_dir, f"page_{i+1:03d}.png")
        image.save(output_path)
        print(f"Saved page {i+1} to {output_path}")
        # Save pagination state (could be saved to a file for later resumption)
        state_dict = paginator.get_state()
        # Progress information
        progress = paginator.get_progress() * 100
        print(f"Progress: {progress:.1f}%")
    """


if __name__ == "__main__":
    main()

918
pyWebLayout/html_parser.py Normal file
View File

@ -0,0 +1,918 @@
import re
from html.parser import HTMLParser as BaseHTMLParser
from typing import Dict, List, Optional, Tuple, Union, Any, Set, Callable
import urllib.parse
from PIL import Image
from .style import Font, FontStyle, FontWeight, TextDecoration
from .abstract.document import Document, MetadataType, Book, Chapter
from .abstract.block import (
Block, BlockType, Parapgraph, Heading, HeadingLevel, Quote, CodeBlock,
HList, ListStyle, ListItem, Table, TableRow, TableCell, HorizontalRule, LineBreak
)
from .abstract.inline import Word, FormattedSpan
from .abstract.functional import Link, LinkType, Button, Form, FormField, FormFieldType
from .concrete.page import Page
from pyWebLayout.layout import Alignment
class HTMLParser(BaseHTMLParser):
"""
HTML parser that builds an abstract document representation from HTML content.
This parser converts HTML to abstract document classes without any rendering specifics.
"""
def __init__(self, base_url: Optional[str] = None):
    """
    Set up a parser with an empty document and default parsing state.

    Args:
        base_url: Base URL for resolving relative links
    """
    super().__init__()

    # The document being built
    self.document = Document()

    # Block nesting state
    self._current_block = None
    self._block_stack: List[Block] = []

    # Pending text and the paragraph/span it belongs to
    self._current_paragraph = None
    self._current_span = None
    self._text_buffer = ""

    # Style state: saved styles plus the style currently in effect
    self._style_stack: List[Dict[str, Any]] = []
    default_style = {
        'font_size': 12,
        'font_weight': FontWeight.NORMAL,
        'font_style': FontStyle.NORMAL,
        'decoration': TextDecoration.NONE,
        'color': (0, 0, 0),
        'background': None,
        'language': 'en_US'
    }
    self._current_style = default_style

    # Open lists and tables
    self._list_stack: List[HList] = []
    self._table_stack: List[Table] = []
    self._current_table_row = None

    # Link handling
    self._base_url = base_url
    self._in_link = False
    self._current_link = None

    # Flags for elements whose content is collected separately
    self._in_head = False
    self._in_title = False
    self._in_script = False
    self._in_style = False

    # Buffers matching the flags above
    self._script_buffer = ""
    self._style_buffer = ""
    self._title_buffer = ""
def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]):
    """
    Handle the start of an HTML tag.

    Pushes the tag's effective style (inline ``style`` attribute layered
    over the tag defaults) and then creates or updates the abstract
    block, list, table, link or form object that corresponds to the tag.

    Args:
        tag: The tag name
        attrs: List of attribute tuples (name, value); the value is None
            for attributes written without one (e.g. ``<a href>``)
    """
    tag = tag.lower()
    attrs_dict = dict(attrs)

    # Inside <script>/<style> everything is raw content for the buffers
    if self._in_script and tag != 'script':
        return
    if self._in_style and tag != 'style':
        return

    # Inline style wins; tag defaults only fill in what it leaves unset.
    # A value-less ``style`` attribute is None and is treated as absent.
    style = {}
    if attrs_dict.get('style'):
        style = self._parse_style(attrs_dict['style'])
    for key, value in self._get_tag_style(tag).items():
        style.setdefault(key, value)

    # NOTE(review): handle_endtag pops one style per end tag, so void
    # elements written without a trailing slash (<br>, <img>, <hr>, ...)
    # leave the style pushed here on the stack.
    self._push_style(style)

    if tag == 'html':
        # Set document language if specified
        if 'lang' in attrs_dict:
            self.document.set_metadata(MetadataType.LANGUAGE, attrs_dict['lang'])

    elif tag == 'head':
        self._in_head = True

    elif tag == 'title' and self._in_head:
        self._in_title = True
        self._title_buffer = ""

    elif tag == 'meta' and self._in_head:
        self._handle_meta_tag(attrs_dict)

    elif tag == 'link' and self._in_head:
        self._handle_link_tag(attrs_dict)

    elif tag == 'script':
        self._in_script = True
        self._script_buffer = ""

    elif tag == 'style':
        self._in_style = True
        self._style_buffer = ""

    elif tag == 'body':
        # Body attributes can contain style information
        pass

    elif tag == 'p':
        self._flush_text()
        self._current_paragraph = Parapgraph()
        self._enter_block(self._current_paragraph)

    elif tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
        self._flush_text()
        # 'h1' -> HeadingLevel.H1, ..., 'h6' -> HeadingLevel.H6
        heading = Heading(level=getattr(HeadingLevel, tag.upper()))
        self._enter_block(heading)
        self._current_paragraph = heading  # Heading inherits from Paragraph

    elif tag == 'div':
        self._flush_text()
        # For divs, we create a new paragraph as a container
        div_para = Parapgraph()
        self._enter_block(div_para)
        self._current_paragraph = div_para

    elif tag == 'blockquote':
        self._flush_text()
        self._enter_block(Quote())

    elif tag == 'pre':
        self._flush_text()
        # A <pre> starts life as a paragraph; a nested <code> upgrades it
        # to a CodeBlock. Remember which paragraph came from <pre> so that
        # inline <code> inside ordinary paragraphs is left alone.
        pre_para = Parapgraph()
        self._enter_block(pre_para)
        self._current_paragraph = pre_para
        self._pre_paragraph = pre_para

    elif tag == 'code':
        top = self._block_stack[-1] if self._block_stack else None
        if top is not None and top is getattr(self, '_pre_paragraph', None):
            # <pre><code>: replace the placeholder paragraph with a code block
            pre_para = self._block_stack.pop()

            # Language comes from a "language-xxx" class token, if any
            language = ""
            for class_name in (attrs_dict.get('class') or '').split():
                if class_name.startswith('language-'):
                    language = class_name[len('language-'):]
                    break
            code_block = CodeBlock(language=language)

            # Swap the paragraph for the code block in its parent
            if pre_para.parent:
                parent = pre_para.parent
                if hasattr(parent, '_blocks'):
                    for i, block in enumerate(parent._blocks):
                        if block == pre_para:
                            parent._blocks[i] = code_block
                            break

            self._push_block(code_block)
            self._current_paragraph = None
        else:
            # Inline code: force creation of a new span with code style
            self._current_span = None

    elif tag in ('ul', 'ol', 'dl'):
        self._flush_text()
        style_map = {
            'ul': ListStyle.UNORDERED,
            'ol': ListStyle.ORDERED,
            'dl': ListStyle.DEFINITION
        }
        list_block = HList(style=style_map[tag])
        self._enter_block(list_block)
        self._list_stack.append(list_block)
        self._current_paragraph = None

    elif tag == 'li' and self._list_stack:
        self._flush_text()
        list_item = ListItem()
        self._list_stack[-1].add_item(list_item)
        self._push_block(list_item)
        self._current_paragraph = None

    elif tag == 'dt' and self._list_stack and self._list_stack[-1].style == ListStyle.DEFINITION:
        self._flush_text()
        # A definition term is a list item with a term; content fills it
        list_item = ListItem(term="")
        self._list_stack[-1].add_item(list_item)
        self._push_block(list_item)
        # Paragraph that receives the term content
        term_para = Parapgraph()
        list_item.add_block(term_para)
        self._current_paragraph = term_para

    elif tag == 'dd' and self._list_stack and self._list_stack[-1].style == ListStyle.DEFINITION:
        self._flush_text()
        current_list = self._list_stack[-1]
        if current_list._items:
            # Attach the description to the most recent item (the <dt>)
            list_item = current_list._items[-1]
        else:
            # No preceding <dt>: the description gets its own item
            list_item = ListItem()
            current_list.add_item(list_item)
            self._push_block(list_item)
        # Paragraph that receives the description content
        desc_para = Parapgraph()
        list_item.add_block(desc_para)
        self._current_paragraph = desc_para

    elif tag == 'table':
        self._flush_text()
        caption = None
        if 'summary' in attrs_dict:
            caption = attrs_dict['summary']
        table = Table(caption=caption)
        self._enter_block(table)
        self._table_stack.append(table)
        self._current_paragraph = None

    elif tag in ('thead', 'tbody', 'tfoot') and self._table_stack:
        # Just track the current section - no need to create new objects
        self._current_table_section = tag

    elif tag == 'tr' and self._table_stack:
        self._flush_text()
        row = TableRow()
        # Rows outside <thead>/<tfoot> belong to the body
        section = "body"
        current_section = getattr(self, '_current_table_section', None)
        if current_section == 'thead':
            section = "header"
        elif current_section == 'tfoot':
            section = "footer"
        self._table_stack[-1].add_row(row, section=section)
        self._current_table_row = row
        self._current_paragraph = None

    elif tag in ('td', 'th') and self._current_table_row:
        self._flush_text()
        cell = TableCell(
            is_header=(tag == 'th'),
            colspan=self._int_attr(attrs_dict, 'colspan', 1),
            rowspan=self._int_attr(attrs_dict, 'rowspan', 1),
        )
        self._current_table_row.add_cell(cell)
        self._push_block(cell)
        # Paragraph that receives the cell content
        cell_para = Parapgraph()
        cell.add_block(cell_para)
        self._current_paragraph = cell_para

    elif tag == 'a':
        self._flush_text()
        # ``or ''`` guards against value-less attributes, which are None
        href = attrs_dict.get('href') or ''
        title = attrs_dict.get('title') or ''

        # Determine link type
        link_type = LinkType.INTERNAL
        if href.startswith(('http://', 'https://')):
            link_type = LinkType.EXTERNAL
        elif href.startswith('javascript:'):
            link_type = LinkType.FUNCTION
        elif href.startswith('api:'):
            link_type = LinkType.API
            href = href[4:]  # Remove api: prefix

        # Only relative document links are resolved against the base URL;
        # API names, script links and pure fragments are kept verbatim.
        if self._base_url and link_type == LinkType.INTERNAL and not href.startswith('#'):
            href = urllib.parse.urljoin(self._base_url, href)

        self._current_link = Link(
            location=href,
            link_type=link_type,
            title=title if title else None
        )
        self._in_link = True
        # Force creation of a new span with link style
        self._current_span = None

    elif tag == 'img':
        src = attrs_dict.get('src') or ''
        alt = attrs_dict.get('alt') or ''
        width = self._int_attr(attrs_dict, 'width', None)
        height = self._int_attr(attrs_dict, 'height', None)

        # If we have a base URL and the src is relative, resolve it
        if self._base_url and not src.startswith(('http://', 'https://')):
            src = urllib.parse.urljoin(self._base_url, src)

        # Imported locally under an alias: the module-level ``Image`` is PIL's
        from .abstract.block import Image as ImageBlock
        image = ImageBlock(source=src, alt_text=alt, width=width, height=height)
        self._attach_block(image)

        # Also add as a resource for backwards compatibility
        resource_name = f"img_{len(self.document._resources) + 1}"
        self.document.add_resource(resource_name, {
            'type': 'image',
            'src': src,
            'alt': alt,
            'width': width,
            'height': height,
            'image_object': image
        })

    elif tag == 'br':
        # Flush the text that precedes the break first so that the break
        # is appended after it rather than before it.
        self._flush_text()
        if self._current_paragraph and hasattr(self._current_paragraph, 'add_block'):
            self._current_paragraph.add_block(LineBreak())

    elif tag == 'hr':
        self._flush_text()
        self._attach_block(HorizontalRule())

    elif tag in ('b', 'strong'):
        # Bold always applies, even over an inline style attribute
        self._current_style['font_weight'] = FontWeight.BOLD
        self._current_span = None  # Force creation of a new span

    elif tag in ('i', 'em'):
        self._current_style['font_style'] = FontStyle.ITALIC
        self._current_span = None  # Force creation of a new span

    elif tag == 'u':
        self._current_style['decoration'] = TextDecoration.UNDERLINE
        self._current_span = None  # Force creation of a new span

    elif tag == 'span':
        # Span can have style attributes
        self._current_span = None  # Force creation of a new span

    elif tag == 'form':
        self._flush_text()
        form_id = attrs_dict.get('id') or f"form_{len(self.document._resources) + 1}"
        action = attrs_dict.get('action') or ''
        form = Form(form_id=form_id, action=action)
        # Add as a resource
        self.document.add_resource(form_id, form)
        # TODO: Create a proper Form block class and add it to the document

    elif tag == 'input':
        input_type = attrs_dict.get('type') or 'text'
        input_name = attrs_dict.get('name') or ''
        input_value = attrs_dict.get('value') or ''
        input_required = 'required' in attrs_dict

        # Map HTML input types to FormFieldType
        type_map = {
            'text': FormFieldType.TEXT,
            'password': FormFieldType.PASSWORD,
            'checkbox': FormFieldType.CHECKBOX,
            'radio': FormFieldType.RADIO,
            'number': FormFieldType.NUMBER,
            'date': FormFieldType.DATE,
            'time': FormFieldType.TIME,
            'email': FormFieldType.EMAIL,
            'url': FormFieldType.URL,
            'color': FormFieldType.COLOR,
            'range': FormFieldType.RANGE,
            'hidden': FormFieldType.HIDDEN
        }
        # The field is built but not attached anywhere yet.
        # TODO: Add the field to a form if inside a form
        FormField(
            name=input_name,
            field_type=type_map.get(input_type, FormFieldType.TEXT),
            label=attrs_dict.get('placeholder') or input_name,
            value=input_value,
            required=input_required
        )

    elif tag in ('textarea', 'select', 'button'):
        # Not modelled yet: textarea/select content arrives via handle_data.
        # TODO: Create a Button object and add it to the document
        pass

def _attach_block(self, block):
    """Add *block* to the current container block, or to the document if there is none."""
    if self._current_block and hasattr(self._current_block, 'add_block'):
        self._current_block.add_block(block)
    else:
        self.document.add_block(block)

def _push_block(self, block):
    """Make *block* the innermost open block without attaching it anywhere."""
    self._block_stack.append(block)
    self._current_block = block

def _enter_block(self, block):
    """Attach *block* to the current container and make it the innermost open block."""
    self._attach_block(block)
    self._push_block(block)

def _int_attr(self, attrs_dict, name, default):
    """Return attribute *name* as an int, or *default* if it is absent or not a number."""
    if name not in attrs_dict:
        return default
    try:
        return int(attrs_dict[name])
    except (ValueError, TypeError):
        return default
def handle_endtag(self, tag: str):
    """
    Handle the end of an HTML tag.

    Finalises collected ``<script>``/``<style>``/``<title>`` content,
    flushes pending text, closes the block that belongs to the tag and
    restores the style that was active before the matching start tag.

    Args:
        tag: The tag name
    """
    tag = tag.lower()

    # Elements whose raw content was collected into a buffer
    if tag == 'script' and self._in_script:
        self._in_script = False
        self.document.add_script(self._script_buffer)
        self._script_buffer = ""
        self._pop_style()
        return

    if tag == 'style' and self._in_style:
        self._in_style = False
        # Parse the style and add to document
        stylesheet = self._parse_css(self._style_buffer)
        if stylesheet:
            self.document.add_stylesheet(stylesheet)
        self._style_buffer = ""
        self._pop_style()
        return

    if tag == 'title' and self._in_title:
        self._in_title = False
        self.document.set_title(self._title_buffer.strip())
        self._title_buffer = ""
        self._pop_style()
        return

    # Anything else inside <script>/<style> is part of the raw content
    if self._in_script and tag != 'script':
        return
    if self._in_style and tag != 'style':
        return

    # Flush any accumulated text
    self._flush_text()

    if tag == 'head':
        self._in_head = False

    elif tag == 'body':
        pass  # Nothing special to do

    elif tag in ('p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'blockquote', 'pre'):
        self._leave_block()

    elif tag == 'code':
        # A code block opened inside <pre> is closed by </pre>
        pass

    elif tag in ('ul', 'ol', 'dl'):
        self._leave_block()
        if self._list_stack:
            self._list_stack.pop()

    elif tag in ('li', 'dt', 'dd'):
        # The start tag pushes a ListItem only in some cases (a <dd> that
        # joins an existing <dt> item, or an item outside any list, pushes
        # nothing). Pop only when a ListItem is really on top, otherwise
        # the enclosing list itself would be removed from the stack.
        self._leave_block(ListItem)

    elif tag == 'table':
        self._leave_block()
        if self._table_stack:
            self._table_stack.pop()
        self._current_table_row = None
        if hasattr(self, '_current_table_section'):
            delattr(self, '_current_table_section')

    elif tag in ('thead', 'tbody', 'tfoot'):
        # Clear current section
        if hasattr(self, '_current_table_section'):
            delattr(self, '_current_table_section')

    elif tag == 'tr':
        # Reset current row
        self._current_table_row = None

    elif tag in ('td', 'th'):
        # A cell is pushed only while a row is open; see the list item note
        self._leave_block(TableCell)

    elif tag == 'a':
        # End of link
        self._in_link = False
        self._current_link = None

    elif tag in ('b', 'strong', 'i', 'em', 'u', 'span'):
        # End of styled text
        self._current_span = None

    # Pop style regardless of tag
    self._pop_style()

def _leave_block(self, expected_type=None):
    """
    Close the innermost open block and reset the inline text state.

    Args:
        expected_type: If given, the block stack is only popped when its
            top entry is an instance of this type
    """
    if self._block_stack and (
        expected_type is None or isinstance(self._block_stack[-1], expected_type)
    ):
        self._block_stack.pop()
    self._current_block = self._block_stack[-1] if self._block_stack else None
    self._current_paragraph = None
    self._current_span = None
def handle_data(self, data: str):
"""
Handle text data.
Args:
data: The text data
"""
if self._in_script:
self._script_buffer += data
return
if self._in_style:
self._style_buffer += data
return
if self._in_title:
self._title_buffer += data
return
# Add to text buffer
self._text_buffer += data
def handle_entityref(self, name: str):
"""
Handle an HTML entity reference.
Args:
name: The entity name
"""
# Map common entity references to characters
entities = {
'lt': '<',
'gt': '>',
'amp': '&',
'quot': '"',
'apos': "'",
'nbsp': ' ',
'copy': '©',
'reg': '®',
'trade': '',
}
if name in entities:
char = entities[name]
else:
try:
import html.entities
char = chr(html.entities.name2codepoint[name])
except (KeyError, ImportError):
char = f'&{name};'
# Handle based on context
if self._in_script:
self._script_buffer += char
elif self._in_style:
self._style_buffer += char
elif self._in_title:
self._title_buffer += char
else:
self._text_buffer += char
def handle_charref(self, name: str):
"""
Handle a character reference.
Args:
name: The character reference (decimal or hex)
"""
# Convert character reference to character
if name.startswith('x'):
# Hexadecimal reference
char = chr(int(name[1:], 16))
else:
# Decimal reference
char = chr(int(name))
# Handle based on context
if self._in_script:
self._script_buffer += char
elif self._in_style:
self._style_buffer += char
elif self._in_title:
self._title_buffer += char
else:
self._text_buffer += char
def _push_style(self, style: Dict[str, Any]):
"""
Push a new style onto the style stack.
Args:
style: The style to push
"""
# Save the current style
self._style_stack.append(self._current_style.copy())
# Apply the new style
for key, value in style.items():
self._current_style[key] = value
def _pop_style(self):
"""Pop a style from the style stack."""
if self._style_stack:
self._current_style = self._style_stack.pop()
def _get_tag_style(self, tag: str) -> Dict[str, Any]:
    """
    Look up the built-in default style of a tag.

    Args:
        tag: The tag name
    Returns:
        A dictionary of style properties; empty for tags without defaults
    """
    # Headings differ only in size; all of them are bold
    heading_sizes = {'h1': 24, 'h2': 20, 'h3': 18, 'h4': 16, 'h5': 14, 'h6': 12}
    defaults = {
        name: {'font_size': size, 'font_weight': FontWeight.BOLD}
        for name, size in heading_sizes.items()
    }

    # Inline emphasis, links and code
    defaults.update({
        'b': {'font_weight': FontWeight.BOLD},
        'strong': {'font_weight': FontWeight.BOLD},
        'i': {'font_style': FontStyle.ITALIC},
        'em': {'font_style': FontStyle.ITALIC},
        'u': {'decoration': TextDecoration.UNDERLINE},
        'a': {'decoration': TextDecoration.UNDERLINE, 'color': (0, 0, 255)},
        'code': {'font_family': 'monospace', 'background': (240, 240, 240, 255)},
        'pre': {'font_family': 'monospace'},
    })

    if tag in defaults:
        return defaults[tag]
    return {}
def _create_font(self) -> Font:
"""
Create a Font object from the current style.
Returns:
Font: A font object with the current style settings
"""

View File

@ -0,0 +1,69 @@
"""
Input/Output module for pyWebLayout.
This package provides functionality for reading and writing various file formats,
including HTML, EPUB, and other document formats.
The module uses a decomposed architecture with specialized readers for different
aspects of document parsing (metadata, content, resources), following the same
pattern as the abstract module.
"""
# Legacy readers (for backward compatibility)
# Legacy functions provided by new HTML reader for backward compatibility
from pyWebLayout.io.readers.html import parse_html_string as parse_html
from pyWebLayout.io.readers.html import read_html_file as html_to_document
from pyWebLayout.io.readers.epub_reader import read_epub
# New decomposed readers
from pyWebLayout.io.readers.html import HTMLReader, read_html, read_html_file, parse_html_string
from pyWebLayout.io.readers.base import BaseReader, MetadataReader, ContentReader, ResourceReader, CompositeReader
# Specialized HTML readers
from pyWebLayout.io.readers.html_metadata import HTMLMetadataReader
from pyWebLayout.io.readers.html_content import HTMLContentReader
from pyWebLayout.io.readers.html_resources import HTMLResourceReader
# Specialized EPUB readers
from pyWebLayout.io.readers.epub_metadata import EPUBMetadataReader
# Convenience functions using the new architecture
def read_document(source, format_hint=None, **options):
    """
    Read a document using the appropriate reader based on format detection.

    Args:
        source: The source to read (file path, URL, or content)
        format_hint: Optional hint about the format ('html', 'epub', etc.)
        **options: Additional options for reading (passed to the HTML
            reader only; the legacy EPUB reader takes none)
    Returns:
        Document: The parsed document
    Raises:
        ValueError: If no reader can handle the source
    """
    if format_hint == 'html' or (not format_hint and _is_html_source(source)):
        return HTMLReader().read(source, **options)

    if format_hint == 'epub' or (not format_hint and _is_epub_source(source)):
        # Use legacy EPUB reader for now
        return read_epub(source)

    # Unrecognised hint: try the HTML reader as a best-effort fallback
    fallback_error = None
    try:
        reader = HTMLReader()
        if reader.can_read(source):
            return reader.read(source, **options)
    except Exception as exc:
        # Only ordinary errors are absorbed (not KeyboardInterrupt or
        # SystemExit); the failure is kept as the cause of the error below.
        fallback_error = exc

    raise ValueError(f"Cannot determine format for source: {source}") from fallback_error
def _is_html_source(source):
    """Return whether a fresh HTMLReader reports that it can read *source*."""
    return HTMLReader().can_read(source)
def _is_epub_source(source):
"""Check if source appears to be EPUB."""
if isinstance(source, str):
return source.lower().endswith('.epub')
return False

View File

@ -0,0 +1,36 @@
"""
Readers module for pyWebLayout.
This module provides specialized readers for different document formats
using a decomposed architecture pattern.
"""
# Base classes for the decomposed architecture
from .base import BaseReader, MetadataReader, ContentReader, ResourceReader, CompositeReader
# HTML readers (decomposed)
from .html import HTMLReader, read_html, read_html_file, parse_html_string
from .html_metadata import HTMLMetadataReader
from .html_content import HTMLContentReader
from .html_resources import HTMLResourceReader
# HTML processing components (supporting modules)
from .html_style import HTMLStyleManager
from .html_text import HTMLTextProcessor
from .html_elements import BlockElementHandler, ListElementHandler, TableElementHandler, InlineElementHandler
# EPUB readers
from .epub_reader import read_epub # Legacy
from .epub_metadata import EPUBMetadataReader # New decomposed
# Names exported by ``from pyWebLayout.io.readers import *``.
# NOTE(review): the HTML processing components imported above
# (HTMLStyleManager, HTMLTextProcessor and the *ElementHandler classes)
# are importable from this package but are not listed here - confirm
# whether that omission is intentional.
__all__ = [
    # Base classes
    'BaseReader', 'MetadataReader', 'ContentReader', 'ResourceReader', 'CompositeReader',
    # HTML readers
    'HTMLReader', 'read_html', 'read_html_file', 'parse_html_string',
    'HTMLMetadataReader', 'HTMLContentReader', 'HTMLResourceReader',
    # EPUB readers
    'read_epub', 'EPUBMetadataReader',
]

View File

@ -0,0 +1,229 @@
"""
Base classes for document readers in pyWebLayout.
This module provides the foundational classes that all readers inherit from,
similar to how the abstract module provides base classes for document elements.
"""
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional, Union
from pyWebLayout.abstract.document import Document
class BaseReader(ABC):
    """
    Common interface shared by every document reader.

    Subclasses implement ``can_read`` and ``read``; option storage is
    provided here.
    """

    def __init__(self):
        """Start with no document and no options."""
        self._options = {}
        self._document = None

    @abstractmethod
    def can_read(self, source: Union[str, bytes]) -> bool:
        """
        Tell whether this reader is able to handle the given source.

        Args:
            source: The source to check (file path, URL, or content)
        Returns:
            True if this reader can handle the source, False otherwise
        """

    @abstractmethod
    def read(self, source: Union[str, bytes], **options) -> Document:
        """
        Parse the source into a Document.

        Args:
            source: The source to read (file path, URL, or content)
            **options: Additional options for reading
        Returns:
            The parsed Document
        """

    def set_option(self, key: str, value: Any):
        """
        Store a reader option.

        Args:
            key: The option name
            value: The option value
        """
        self._options.update({key: value})

    def get_option(self, key: str, default: Any = None) -> Any:
        """
        Look up a reader option.

        Args:
            key: The option name
            default: Value returned when the option is not set
        Returns:
            The option value or default
        """
        try:
            return self._options[key]
        except KeyError:
            return default
class MetadataReader(ABC):
    """
    Interface for components that read document metadata
    (title, author, and so on).
    """

    @abstractmethod
    def extract_metadata(self, source: Any, document: Document) -> Dict[str, Any]:
        """
        Pull metadata out of the source.

        Args:
            source: The source data
            document: The document to populate with metadata
        Returns:
            Dictionary of extracted metadata
        """
        return None
class StructureReader(ABC):
    """
    Interface for components that read document structure
    (headings, sections, and so on).
    """

    @abstractmethod
    def extract_structure(self, source: Any, document: Document) -> List[Any]:
        """
        Pull structure information out of the source.

        Args:
            source: The source data
            document: The document to populate with structure
        Returns:
            List of structural elements
        """
        return None
class ContentReader(ABC):
    """
    Interface for components that read document content
    (text, formatting, and so on).
    """

    @abstractmethod
    def extract_content(self, source: Any, document: Document) -> Any:
        """
        Pull content out of the source.

        Args:
            source: The source data
            document: The document to populate with content
        Returns:
            The extracted content
        """
        return None
class ResourceReader(ABC):
    """
    Interface for components that read document resources
    (images, stylesheets, and so on).
    """

    @abstractmethod
    def extract_resources(self, source: Any, document: Document) -> Dict[str, Any]:
        """
        Pull resources out of the source.

        Args:
            source: The source data
            document: The document to populate with resources
        Returns:
            Dictionary of extracted resources
        """
        return None
class CompositeReader(BaseReader):
    """
    Reader assembled from specialised part readers.

    Metadata, structure, content and resource readers are plugged in
    individually; ``read`` runs whichever of them are configured.
    """

    def __init__(self):
        """Start with no part readers configured."""
        super().__init__()
        self._metadata_reader: Optional[MetadataReader] = None
        self._structure_reader: Optional[StructureReader] = None
        self._content_reader: Optional[ContentReader] = None
        self._resource_reader: Optional[ResourceReader] = None

    def set_metadata_reader(self, reader: MetadataReader):
        """Install the reader used for metadata."""
        self._metadata_reader = reader

    def set_structure_reader(self, reader: StructureReader):
        """Install the reader used for structure."""
        self._structure_reader = reader

    def set_content_reader(self, reader: ContentReader):
        """Install the reader used for content."""
        self._content_reader = reader

    def set_resource_reader(self, reader: ResourceReader):
        """Install the reader used for resources."""
        self._resource_reader = reader

    def read(self, source: Union[str, bytes], **options) -> Document:
        """
        Run every configured part reader over the source.

        Args:
            source: The source to read
            **options: Additional options for reading; merged into the
                options already stored on this reader
        Returns:
            The parsed Document
        """
        document = Document()
        self._options.update(options)

        # (attribute holding the part reader, method to call on it), in
        # the order the passes run: metadata, structure, content, resources
        passes = (
            ('_metadata_reader', 'extract_metadata'),
            ('_structure_reader', 'extract_structure'),
            ('_content_reader', 'extract_content'),
            ('_resource_reader', 'extract_resources'),
        )
        for reader_attr, method_name in passes:
            part_reader = getattr(self, reader_attr)
            if part_reader:
                getattr(part_reader, method_name)(source, document)

        return document

View File

@ -0,0 +1,352 @@
"""
EPUB metadata reader for pyWebLayout.
This module provides specialized functionality for extracting metadata
from EPUB documents, following the decomposed architecture pattern.
"""
import os
import zipfile
import tempfile
from typing import Dict, Any, Optional, List
import xml.etree.ElementTree as ET
from pyWebLayout.abstract.document import Document, MetadataType
from pyWebLayout.io.readers.base import MetadataReader
# XML namespaces used in EPUB files
NAMESPACES = {
'opf': 'http://www.idpf.org/2007/opf',
'dc': 'http://purl.org/dc/elements/1.1/',
'dcterms': 'http://purl.org/dc/terms/',
}
class EPUBMetadataReader(MetadataReader):
"""
Specialized reader for extracting metadata from EPUB documents.
This class handles EPUB package document metadata including
Dublin Core elements and custom metadata.
"""
def __init__(self):
"""Initialize the EPUB metadata reader."""
self._metadata = {}
self._temp_dir = None
self._package_path = None
def extract_metadata(self, epub_path: str, document: Document) -> Dict[str, Any]:
    """
    Extract metadata from EPUB file.

    The archive is unpacked to a temporary directory, the package
    document is located and parsed, and the result is copied onto
    *document*. ``_cleanup`` runs afterwards whether or not an error
    occurred.

    Args:
        epub_path: Path to the EPUB file
        document: The document to populate with metadata
    Returns:
        Dictionary of extracted metadata
    """
    # Reset internal state
    self._reset()
    try:
        # Extract EPUB to temporary directory
        self._extract_epub(epub_path)
        # Find and parse package document
        self._find_package_document()
        # Without a package document the metadata simply stays empty
        if self._package_path:
            self._parse_package_metadata()
        # Populate document with extracted metadata
        self._populate_document(document)
        return self._metadata
    finally:
        # Clean up temporary files
        self._cleanup()
def _reset(self):
"""Reset internal state for a new extraction."""
self._metadata = {}
self._temp_dir = None
self._package_path = None
def _extract_epub(self, epub_path: str):
"""
Extract EPUB file to temporary directory.
Args:
epub_path: Path to the EPUB file
"""
self._temp_dir = tempfile.mkdtemp()
with zipfile.ZipFile(epub_path, 'r') as zip_ref:
zip_ref.extractall(self._temp_dir)
def _find_package_document(self):
"""Find the package document (content.opf) in the extracted EPUB."""
# First, try to find it via META-INF/container.xml
container_path = os.path.join(self._temp_dir, 'META-INF', 'container.xml')
if os.path.exists(container_path):
try:
tree = ET.parse(container_path)
root = tree.getroot()
# Find rootfile element
for rootfile in root.findall('.//{urn:oasis:names:tc:opendocument:xmlns:container}rootfile'):
full_path = rootfile.get('full-path')
if full_path:
self._package_path = os.path.join(self._temp_dir, full_path)
if os.path.exists(self._package_path):
return
except ET.ParseError:
pass
# Fallback: search for .opf files
for root, dirs, files in os.walk(self._temp_dir):
for file in files:
if file.endswith('.opf'):
self._package_path = os.path.join(root, file)
return
def _parse_package_metadata(self):
    """
    Read the ``metadata`` element of the package document.

    Does nothing when no package document is known or the file is missing.
    XML parse errors are reported with ``print`` and otherwise ignored.
    """
    package_path = self._package_path
    if not (package_path and os.path.exists(package_path)):
        return
    try:
        package_root = ET.parse(package_path).getroot()
        metadata_elem = package_root.find('.//{{{0}}}metadata'.format(NAMESPACES['opf']))
        if metadata_elem is not None:
            # Dublin Core elements first, then the OPF-specific <meta> entries.
            self._parse_dublin_core(metadata_elem)
            self._parse_opf_metadata(metadata_elem)
    except ET.ParseError as e:
        print(f"Error parsing package document: {e}")
def _parse_dublin_core(self, metadata_elem: ET.Element):
    """
    Collect the Dublin Core elements found below the metadata element.

    A Dublin Core name that occurs once is stored as a stripped string; for
    ``creator``, ``identifier`` and ``date`` selected attributes are stored
    as well under ``<name>_<suffix>`` keys.  A name that occurs several
    times is stored as a list of the stripped texts, without attributes.

    Args:
        metadata_elem: The metadata XML element
    """
    dc_names = (
        'title', 'creator', 'subject', 'description', 'publisher',
        'contributor', 'date', 'type', 'format', 'identifier', 'source',
        'language', 'relation', 'coverage', 'rights',
    )
    # Per element: (attribute name template, key suffix).  Templates are
    # expanded with the OPF namespace; the plain 'id' has no placeholder.
    tracked_attributes = {
        'creator': (('{{{0}}}role', 'role'), ('{{{0}}}file-as', 'file_as')),
        'identifier': (('{{{0}}}scheme', 'scheme'), ('id', 'id')),
        'date': (('{{{0}}}event', 'event'),),
    }

    for dc_name in dc_names:
        found = metadata_elem.findall('.//{{{0}}}{1}'.format(NAMESPACES['dc'], dc_name))
        if not found:
            continue

        if len(found) > 1:
            # Multiple elements - store as list
            texts = [item.text.strip() for item in found if item.text]
            if texts:
                self._metadata[dc_name] = texts
            continue

        only = found[0]
        if only.text:
            self._metadata[dc_name] = only.text.strip()
        for template, suffix in tracked_attributes.get(dc_name, ()):
            attribute_value = only.get(template.format(NAMESPACES['opf']))
            if attribute_value:
                self._metadata[f'{dc_name}_{suffix}'] = attribute_value
def _parse_opf_metadata(self, metadata_elem: ET.Element):
    """
    Collect ``<meta name=... content=...>`` and ``x-metadata`` entries.

    ``meta`` elements are stored under ``meta_<name>``; children of
    ``x-metadata`` elements are stored under ``x_meta_<local tag name>``.

    Args:
        metadata_elem: The metadata XML element
    """
    opf_namespace = NAMESPACES['opf']

    for meta in metadata_elem.findall('.//{{{0}}}meta'.format(opf_namespace)):
        name, content = meta.get('name'), meta.get('content')
        if not (name and content):
            continue
        self._metadata[f'meta_{name}'] = content

    for x_meta in metadata_elem.findall('.//{{{0}}}x-metadata'.format(opf_namespace)):
        for child in x_meta:
            if not (child.tag and child.text):
                continue
            # Keep only the part after the '{namespace}' prefix, if any.
            tag_name = child.tag.rsplit('}', 1)[-1]
            self._metadata[f'x_meta_{tag_name}'] = child.text.strip()
def _populate_document(self, document: Document):
    """
    Copy the collected EPUB metadata onto the document.

    List values are reduced to a single value: subjects are joined with
    ``', '``, every other list contributes its first entry.  The cover id
    (``meta_cover``) and the complete raw dictionary are stored as well.

    Args:
        document: The document to populate
    """
    # (EPUB key, document metadata type), applied in this order.
    metadata_mapping = (
        ('title', MetadataType.TITLE),
        ('creator', MetadataType.AUTHOR),
        ('description', MetadataType.DESCRIPTION),
        ('subject', MetadataType.KEYWORDS),
        ('language', MetadataType.LANGUAGE),
        ('date', MetadataType.PUBLICATION_DATE),
        ('publisher', MetadataType.PUBLISHER),
        ('identifier', MetadataType.IDENTIFIER),
    )

    for epub_key, doc_type in metadata_mapping:
        try:
            value = self._metadata[epub_key]
        except KeyError:
            continue
        if isinstance(value, list):
            value = ', '.join(value) if epub_key == 'subject' else value[0]
        document.set_metadata(doc_type, value)

    cover_meta = self._metadata.get('meta_cover')
    if cover_meta:
        document.set_metadata(MetadataType.COVER_IMAGE, cover_meta)

    # Keep the untouched EPUB metadata available for reference.
    document.set_metadata(MetadataType.CUSTOM, {
        'epub_metadata': self._metadata
    })
def _cleanup(self):
    """
    Remove the temporary extraction directory, if one exists.

    Removal is best-effort: ``shutil.rmtree`` is called with
    ``ignore_errors=True``, so failures to delete files are ignored.  No
    blanket ``except`` is used, which lets ``KeyboardInterrupt`` and
    ``SystemExit`` propagate instead of being swallowed.
    """
    if not self._temp_dir:
        return
    import shutil
    # Clear the attribute first so the reader never keeps a reference to a
    # directory that is being (or has been) deleted.
    temp_dir, self._temp_dir = self._temp_dir, None
    shutil.rmtree(temp_dir, ignore_errors=True)
def get_unique_identifier(self) -> Optional[str]:
    """
    Get the identifier from the EPUB metadata.

    When the package declared several ``dc:identifier`` elements they are
    stored as a list; the first one is returned so the result is always a
    single string, as the return annotation promises.

    Returns:
        The identifier string, or None if not found
    """
    identifier = self._metadata.get('identifier')
    if isinstance(identifier, list):
        return identifier[0] if identifier else None
    return identifier
def get_cover_id(self) -> Optional[str]:
    """
    Return the value of the ``cover`` meta entry.

    Returns:
        The value stored under ``meta_cover``, or None if there is none
    """
    try:
        return self._metadata['meta_cover']
    except KeyError:
        return None
def get_creators(self) -> List[Dict[str, str]]:
    """
    Describe the creators found in the metadata.

    A single creator is returned with its ``role`` and ``file_as`` values
    when those were recorded.  Several creators are returned by name only,
    because their attributes are not stored per creator.

    Returns:
        List of creator dictionaries
    """
    creator_value = self._metadata.get('creator')
    if not creator_value:
        return []

    if isinstance(creator_value, list):
        return [{'name': name} for name in creator_value]

    creator_info = {'name': creator_value}
    optional_fields = (('role', 'creator_role'), ('file_as', 'creator_file_as'))
    for field, metadata_key in optional_fields:
        extra = self._metadata.get(metadata_key)
        if extra:
            creator_info[field] = extra
    return [creator_info]

View File

@ -0,0 +1,400 @@
"""
EPUB reader for pyWebLayout.
This module provides functionality for reading EPUB documents and converting them
to pyWebLayout's abstract document model.
"""
import os
import zipfile
import tempfile
from typing import Dict, List, Optional, Any, Tuple
import xml.etree.ElementTree as ET
import re
import urllib.parse
from pyWebLayout.abstract.document import Document, Book, Chapter, MetadataType
from pyWebLayout.io.readers.html import parse_html_string as parse_html, read_html_file as html_to_document
# XML namespaces used in EPUB files
# Maps a short prefix to the namespace URI.  The reader builds qualified
# ElementTree tag names from these values, e.g. '{<uri>}metadata'.
NAMESPACES = {
# Package document elements: metadata, manifest, item, spine, itemref
'opf': 'http://www.idpf.org/2007/opf',
# Dublin Core metadata elements (title, creator, ...)
'dc': 'http://purl.org/dc/elements/1.1/',
'dcterms': 'http://purl.org/dc/terms/',
'xhtml': 'http://www.w3.org/1999/xhtml',
# NCX table of contents elements: navMap, navPoint, navLabel, text, content
'ncx': 'http://www.daisy.org/z3986/2005/ncx/',
}
class EPUBReader:
    """
    Reader for EPUB documents.

    The reader unpacks the EPUB archive into a temporary directory, parses
    the package document (metadata, manifest and spine) and the NCX table of
    contents, and builds a Book with one chapter per spine item.  The
    temporary directory is removed before ``read`` returns.
    """

    def __init__(self, epub_path: str):
        """
        Initialize an EPUB reader.

        Args:
            epub_path: Path to the EPUB file
        """
        self.epub_path = epub_path
        self.book = Book()
        self.temp_dir = None
        self.content_dir = None
        # Package document named by META-INF/container.xml, if any.
        self.opf_path = None
        # NCX file named by the spine's "toc" attribute or found by search.
        self.toc_path = None
        self.metadata = {}
        self.toc = []
        self.spine = []
        self.manifest = {}

    def read(self) -> Book:
        """
        Read the EPUB file and convert it to a Book.

        Returns:
            Book: The parsed book

        Raises:
            ValueError: If the archive contains no package document
        """
        try:
            # Extract the EPUB file
            self.temp_dir = tempfile.mkdtemp()
            self._extract_epub()

            # Parse the package document (content.opf)
            self._parse_package_document()

            # Parse the table of contents
            self._parse_toc()

            # Create a Book object
            self._create_book()

            # Add chapters to the book
            self._add_chapters()

            return self.book
        finally:
            # Clean up temporary files
            if self.temp_dir:
                import shutil
                shutil.rmtree(self.temp_dir, ignore_errors=True)

    def _extract_epub(self):
        """
        Extract the EPUB file and determine the content directory.

        The content directory is the directory of the package document named
        in ``META-INF/container.xml``.  Without a usable container entry the
        common directory names ``OEBPS``, ``OPS`` and ``Content`` are tried,
        and finally the extraction root itself is used.
        """
        with zipfile.ZipFile(self.epub_path, 'r') as zip_ref:
            zip_ref.extractall(self.temp_dir)

        self.opf_path = None
        container_path = os.path.join(self.temp_dir, 'META-INF', 'container.xml')
        container_root = None
        if os.path.exists(container_path):
            try:
                container_root = ET.parse(container_path).getroot()
            except ET.ParseError:
                # A broken container is not fatal: fall back to probing
                # the well-known directory names below.
                container_root = None

        if container_root is not None:
            # Get the path to the package document (content.opf)
            rootfiles = container_root.findall(
                './/{urn:oasis:names:tc:opendocument:xmlns:container}rootfile')
            for rootfile in rootfiles:
                full_path = rootfile.get('full-path')
                if full_path:
                    package_path = os.path.join(self.temp_dir, full_path)
                    self.content_dir = os.path.dirname(package_path)
                    # Remember the exact file so it is preferred over an
                    # arbitrary *.opf found by a directory walk.
                    self.opf_path = package_path
                    return

        # Fallback: look for common content directories
        for content_dir in ['OEBPS', 'OPS', 'Content']:
            if os.path.exists(os.path.join(self.temp_dir, content_dir)):
                self.content_dir = os.path.join(self.temp_dir, content_dir)
                return

        # If no content directory found, use the root
        self.content_dir = self.temp_dir

    def _find_file_with_suffix(self, suffix: str) -> Optional[str]:
        """Return the first file below the content directory ending in suffix, or None."""
        for dirpath, _dirnames, filenames in os.walk(self.content_dir):
            for filename in filenames:
                if filename.endswith(suffix):
                    return os.path.join(dirpath, filename)
        return None

    def _parse_package_document(self):
        """
        Parse the package document (content.opf).

        The file named by the container is used when it exists; otherwise
        the content directory is searched for a ``*.opf`` file.

        Raises:
            ValueError: If no package document can be found
        """
        opf_path = getattr(self, 'opf_path', None)
        if not opf_path or not os.path.isfile(opf_path):
            opf_path = self._find_file_with_suffix('.opf')

        if not opf_path:
            raise ValueError("No package document (.opf) found in EPUB")

        # Parse the package document
        root = ET.parse(opf_path).getroot()

        self._parse_metadata(root)
        self._parse_manifest(root)
        self._parse_spine(root)

    def _parse_metadata(self, root: ET.Element):
        """
        Parse Dublin Core metadata from the package document.

        Values are stripped of surrounding whitespace and empty elements are
        skipped.  Subjects are collected into the ``subjects`` list; for any
        other element that is repeated, the first occurrence is kept.

        Args:
            root: Root element of the package document
        """
        metadata_elem = root.find('.//{{{0}}}metadata'.format(NAMESPACES['opf']))
        if metadata_elem is None:
            return

        dc_prefix = '{{{0}}}'.format(NAMESPACES['dc'])
        for elem in metadata_elem:
            tag = elem.tag
            if not isinstance(tag, str) or not tag.startswith(dc_prefix):
                continue

            # Local name (without namespace)
            name = tag[len(dc_prefix):]
            value = (elem.text or '').strip()
            if not value:
                # An empty element carries no information; storing None
                # would later break joining the subjects.
                continue

            if name == 'subject':
                self.metadata.setdefault('subjects', []).append(value)
            else:
                self.metadata.setdefault(name, value)

    def _is_inside_extraction_dir(self, path: str) -> bool:
        """Return True when path lies within the directory the EPUB was extracted to."""
        base_dir = os.path.abspath(self.temp_dir or self.content_dir)
        candidate = os.path.abspath(path)
        return candidate == base_dir or candidate.startswith(base_dir + os.sep)

    def _parse_manifest(self, root: ET.Element):
        """
        Parse the manifest from the package document.

        Each item is stored under its id with the unquoted ``href``, the
        resolved file ``path`` and the ``media_type``.  Items whose href
        resolves to a location outside the extraction directory are ignored,
        because the archive content is untrusted.

        Args:
            root: Root element of the package document
        """
        manifest_elem = root.find('.//{{{0}}}manifest'.format(NAMESPACES['opf']))
        if manifest_elem is None:
            return

        for item in manifest_elem.findall('.//{{{0}}}item'.format(NAMESPACES['opf'])):
            item_id = item.get('id')
            href = item.get('href')
            media_type = item.get('media-type')

            if not (item_id and href):
                continue

            # Resolve relative path
            href = urllib.parse.unquote(href)
            path = os.path.normpath(os.path.join(self.content_dir, href))
            if not self._is_inside_extraction_dir(path):
                continue

            self.manifest[item_id] = {
                'href': href,
                'path': path,
                'media_type': media_type
            }

    def _parse_spine(self, root: ET.Element):
        """
        Parse the spine from the package document.

        Records the NCX path named by the spine's ``toc`` attribute and the
        reading order as a list of manifest ids.

        Args:
            root: Root element of the package document
        """
        spine_elem = root.find('.//{{{0}}}spine'.format(NAMESPACES['opf']))
        if spine_elem is None:
            return

        # Get the toc attribute (NCX file ID)
        toc_id = spine_elem.get('toc')
        if toc_id and toc_id in self.manifest:
            self.toc_path = self.manifest[toc_id]['path']

        # Parse itemrefs; references to unknown manifest ids are dropped
        for itemref in spine_elem.findall('.//{{{0}}}itemref'.format(NAMESPACES['opf'])):
            idref = itemref.get('idref')
            if idref and idref in self.manifest:
                self.spine.append(idref)

    def _parse_toc(self):
        """
        Parse the NCX table of contents, if the EPUB has one.

        When the spine did not name an existing NCX file, the content
        directory is searched for a ``*.ncx`` file.  A missing or malformed
        NCX leaves the table of contents empty instead of failing the read.
        """
        toc_path = getattr(self, 'toc_path', None)
        if not toc_path or not os.path.exists(toc_path):
            # Try to find the toc.ncx file
            toc_path = self._find_file_with_suffix('.ncx')
            if not toc_path:
                # No TOC found
                return
            self.toc_path = toc_path

        try:
            root = ET.parse(toc_path).getroot()
        except ET.ParseError as e:
            print(f"Error parsing table of contents: {e}")
            return

        nav_map = root.find('.//{{{0}}}navMap'.format(NAMESPACES['ncx']))
        if nav_map is None:
            return

        self._parse_nav_points(nav_map, [])

    def _parse_nav_points(self, parent: ET.Element, path: List[Dict[str, Any]]):
        """
        Recursively parse navPoints from the NCX file.

        Only the direct navPoint children of ``parent`` are handled at each
        level; nested navPoints are reached through the recursion, so each
        entry appears exactly once in the hierarchy.

        Args:
            parent: Parent element containing navPoints
            path: Current path in the TOC hierarchy
        """
        ncx_prefix = '{{{0}}}'.format(NAMESPACES['ncx'])

        for nav_point in parent.findall(ncx_prefix + 'navPoint'):
            # Elements are compared with None explicitly: an Element without
            # children is falsy.
            nav_label = nav_point.find(ncx_prefix + 'navLabel')
            text_elem = nav_label.find(ncx_prefix + 'text') if nav_label is not None else None
            label = ""
            if text_elem is not None and text_elem.text:
                label = text_elem.text.strip()

            content = nav_point.find(ncx_prefix + 'content')
            src = (content.get('src') or "") if content is not None else ""

            entry = {
                'id': nav_point.get('id'),
                'label': label,
                'src': src,
                'play_order': nav_point.get('playOrder'),
                'children': []
            }

            # Add to TOC
            if path:
                path[-1]['children'].append(entry)
            else:
                self.toc.append(entry)

            # Parse child navPoints
            self._parse_nav_points(nav_point, path + [entry])

    def _create_book(self):
        """Copy the parsed metadata onto the Book object."""
        metadata = self.metadata

        if 'title' in metadata:
            self.book.set_title(metadata['title'])

        fields = (
            ('creator', MetadataType.AUTHOR),
            ('language', MetadataType.LANGUAGE),
            ('description', MetadataType.DESCRIPTION),
            ('subjects', MetadataType.KEYWORDS),
            ('date', MetadataType.PUBLICATION_DATE),
            ('identifier', MetadataType.IDENTIFIER),
            ('publisher', MetadataType.PUBLISHER),
        )
        for key, metadata_type in fields:
            if key not in metadata:
                continue
            value = metadata[key]
            if key == 'subjects':
                # Subjects are stored as a list; keywords are one string.
                value = ', '.join(subject for subject in value if subject)
            self.book.set_metadata(metadata_type, value)

    def _build_toc_map(self) -> Dict[str, Dict[str, Any]]:
        """
        Map content file references to TOC entries.

        Fragments are removed and percent-escapes decoded so the keys are
        comparable with manifest hrefs.  When several entries refer to the
        same file, the first one in document order is kept.
        """
        toc_map: Dict[str, Dict[str, Any]] = {}
        pending = list(self.toc)
        while pending:
            entry = pending.pop(0)
            if entry['src']:
                file_part = urllib.parse.unquote(entry['src'].split('#', 1)[0])
                toc_map.setdefault(file_part, entry)
            # Children are visited right after their parent (pre-order).
            pending[0:0] = entry['children']
        return toc_map

    def _add_chapters(self):
        """
        Add one chapter per spine item to the book.

        The chapter title is taken from the TOC entry that refers to the
        item's file, if there is one.  A chapter whose content cannot be
        read or parsed receives a paragraph describing the error instead.
        """
        toc_map = self._build_toc_map()

        for i, idref in enumerate(self.spine):
            if idref not in self.manifest:
                continue

            item = self.manifest[idref]
            path = item['path']
            href = item['href']

            # Check if this item is in the TOC
            chapter_title = None
            if href in toc_map:
                chapter_title = toc_map[href]['label']

            chapter = self.book.create_chapter(chapter_title, i + 1)

            try:
                with open(path, 'r', encoding='utf-8') as f:
                    html = f.read()

                # The HTML helper only accepts options as keywords.
                base_url = os.path.dirname(path)
                document = parse_html(html, base_url=base_url)

                # Copy blocks to the chapter
                for block in document.blocks:
                    chapter.add_block(block)

            except Exception as e:
                # Best effort: one broken chapter must not abort the book.
                print(f"Error parsing chapter {i+1}: {str(e)}")
                # Add an error message block
                from pyWebLayout.abstract.block import Parapgraph
                from pyWebLayout.abstract.inline import Word
                error_para = Parapgraph()
                error_para.add_word(Word(f"Error loading chapter: {str(e)}"))
                chapter.add_block(error_para)
def read_epub(epub_path: str) -> Book:
    """
    Convenience wrapper: parse an EPUB file with a fresh EPUBReader.

    Args:
        epub_path: Path to the EPUB file

    Returns:
        Book: The parsed book
    """
    return EPUBReader(epub_path).read()

View File

@ -0,0 +1,190 @@
"""
Modern HTML reader for pyWebLayout.
This module provides a decomposed HTML reader that uses specialized
readers for metadata, content, and resources, following the pattern
established in the abstract module.
"""
import os
from typing import Union, Optional
from pyWebLayout.abstract.document import Document
from pyWebLayout.io.readers.base import CompositeReader
from pyWebLayout.io.readers.html_metadata import HTMLMetadataReader
from pyWebLayout.io.readers.html_content import HTMLContentReader
from pyWebLayout.io.readers.html_resources import HTMLResourceReader
class HTMLReader(CompositeReader):
    """
    Modern HTML reader using decomposed architecture.

    This reader combines specialized readers for metadata, content,
    and resources to provide a complete HTML parsing solution.
    """

    # File extensions accepted when the source is a path to an existing file.
    _HTML_EXTENSIONS = ('.html', '.htm', '.xhtml')

    def __init__(self):
        """Initialize the HTML reader with all specialized readers."""
        super().__init__()

        # Set up specialized readers
        self.set_metadata_reader(HTMLMetadataReader())
        self.set_content_reader(HTMLContentReader())
        self.set_resource_reader(HTMLResourceReader())

    @staticmethod
    def _looks_like_html(text: str) -> bool:
        """
        Return True when the beginning of text looks like an HTML document.

        Only the first 200 characters after leading whitespace are
        inspected, so large documents are not lowercased as a whole.
        """
        head = text.lstrip()[:200].lower()
        return head.startswith('<!doctype html') or '<html' in head

    def can_read(self, source: Union[str, bytes]) -> bool:
        """
        Check if this reader can handle the given source.

        A string naming an existing file is judged by its extension; any
        other string, and bytes, are judged by a basic look at the content.

        Args:
            source: The source to check (file path or content)

        Returns:
            True if this reader can handle the source, False otherwise
        """
        if isinstance(source, str):
            if os.path.isfile(source):
                return source.lower().endswith(self._HTML_EXTENSIONS)
            return self._looks_like_html(source)

        if isinstance(source, bytes):
            # Decoding with errors='ignore' cannot fail, so no exception
            # handling is needed.  800 bytes cover 200 characters of UTF-8.
            head = source.lstrip()[:800].decode('utf-8', errors='ignore')
            return self._looks_like_html(head)

        return False

    def read(self, source: Union[str, bytes], **options) -> Document:
        """
        Read and parse the HTML source into a Document.

        Args:
            source: The HTML source to read (file path or content)
            **options: Additional options for reading
                - base_url: Base URL for resolving relative links
                - encoding: Character encoding (default: 'utf-8')
                - extract_metadata: Whether to extract metadata (default: True)
                - extract_resources: Whether to extract resources (default: True)

        Returns:
            The parsed Document

        Raises:
            ValueError: If source is neither str nor bytes
        """
        base_url = options.get('base_url')
        encoding = options.get('encoding', 'utf-8')
        extract_metadata = options.get('extract_metadata', True)
        extract_resources = options.get('extract_resources', True)

        # Read the HTML content
        html_content = self._read_html_content(source, encoding)

        # Set base URL if not provided and source is a file
        if not base_url and isinstance(source, str) and os.path.isfile(source):
            base_url = f"file://{os.path.dirname(os.path.abspath(source))}/"

        # Set base URL in content reader
        if self._content_reader and hasattr(self._content_reader, 'set_base_url'):
            self._content_reader.set_base_url(base_url)

        document = Document()

        if extract_metadata and self._metadata_reader:
            self._metadata_reader.extract_metadata(html_content, document)

        if self._content_reader:
            self._content_reader.extract_content(html_content, document)

        if extract_resources and self._resource_reader:
            self._resource_reader.extract_resources(html_content, document)

        return document

    def _read_html_content(self, source: Union[str, bytes], encoding: str = 'utf-8') -> str:
        """
        Read HTML content from various sources.

        Bytes are decoded, a string naming an existing file is read from
        disk, and any other string is taken to be the HTML itself.
        Undecodable bytes are replaced rather than raising.

        Args:
            source: The source to read from
            encoding: Character encoding to use

        Returns:
            The HTML content as a string

        Raises:
            ValueError: If source is neither str nor bytes
        """
        if isinstance(source, bytes):
            return source.decode(encoding, errors='replace')

        if isinstance(source, str):
            if os.path.isfile(source):
                with open(source, 'r', encoding=encoding, errors='replace') as f:
                    return f.read()
            # Assume it's HTML content
            return source

        raise ValueError(f"Unsupported source type: {type(source)}")
def read_html(source: Union[str, bytes], **options) -> Document:
    """
    Parse HTML with a fresh HTMLReader.

    Args:
        source: The HTML source to read (file path or content)
        **options: Passed through to ``HTMLReader.read``

    Returns:
        The parsed Document
    """
    return HTMLReader().read(source, **options)
def read_html_file(file_path: str, **options) -> Document:
    """
    Parse the HTML file at the given path.

    Args:
        file_path: Path to the HTML file
        **options: Passed through to ``HTMLReader.read``

    Returns:
        The parsed Document

    Raises:
        FileNotFoundError: If file_path does not name an existing file
    """
    if os.path.isfile(file_path):
        return HTMLReader().read(file_path, **options)
    raise FileNotFoundError(f"HTML file not found: {file_path}")
def parse_html_string(html_content: str, **options) -> Document:
    """
    Parse HTML given as a string with a fresh HTMLReader.

    Args:
        html_content: The HTML content as a string
        **options: Passed through to ``HTMLReader.read``

    Returns:
        The parsed Document
    """
    return HTMLReader().read(html_content, **options)

View File

@ -0,0 +1,269 @@
"""
Modern HTML content reader for pyWebLayout.
This module provides a decomposed HTML content reader that uses specialized
handlers and managers for different aspects of HTML parsing.
"""
from html.parser import HTMLParser as BaseHTMLParser
from typing import Dict, List, Optional, Tuple, Union, Any
from pyWebLayout.abstract.document import Document
from pyWebLayout.io.readers.base import ContentReader
from pyWebLayout.io.readers.html_style import HTMLStyleManager
from pyWebLayout.io.readers.html_text import HTMLTextProcessor
from pyWebLayout.io.readers.html_elements import (
BlockElementHandler, ListElementHandler, TableElementHandler, InlineElementHandler
)
class HTMLContentReader(ContentReader, BaseHTMLParser):
    """
    Modern HTML content reader using decomposed architecture.

    This class orchestrates specialized handlers to parse HTML content
    and convert it to pyWebLayout's abstract document model.

    Every element that reaches the handlers pushes one style on the style
    manager when it starts and pops it when it ends.  Section markers
    (head, body, script, style) never push a style, and void elements pop
    theirs immediately, so pushes and pops stay paired.
    """

    # Elements that never have an end tag in HTML.
    _VOID_ELEMENTS = frozenset((
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ))

    def __init__(self):
        """Initialize the HTML content reader."""
        BaseHTMLParser.__init__(self)

        # Initialize managers and processors
        self.style_manager = HTMLStyleManager()
        self.text_processor = HTMLTextProcessor(self.style_manager)

        # Initialize element handlers
        self.block_handler = BlockElementHandler(self.style_manager, self.text_processor)
        self.list_handler = ListElementHandler(self.text_processor)
        self.table_handler = TableElementHandler(self.text_processor)
        self.inline_handler = InlineElementHandler(self.text_processor)

        # Document and parsing state
        self._document: Optional[Document] = None
        self._in_head = False
        self._in_script = False
        self._in_style = False

    def extract_content(self, html_content: str, document: Document) -> Any:
        """
        Extract content from HTML.

        Args:
            html_content: The HTML content to parse
            document: The document to populate with content

        Returns:
            The document with populated content
        """
        self._document = document
        self._reset_state()

        # Parse the HTML content.  close() makes the parser process any
        # data it buffered while waiting for more input.
        self.feed(html_content)
        BaseHTMLParser.close(self)

        # Flush any remaining text
        self.text_processor.flush_text()

        return document

    def set_base_url(self, base_url: str):
        """Set the base URL for resolving relative links."""
        self.inline_handler.set_base_url(base_url)

    def _reset_state(self):
        """Reset all parser state for new content."""
        # Drop unprocessed data and tokenizer state left by an earlier
        # document; the reader instance is reused between documents.
        BaseHTMLParser.reset(self)

        # Reset managers and processors
        self.style_manager.reset()
        self.text_processor.reset()

        # Reset element handlers
        self.block_handler.reset()
        self.list_handler.reset()
        self.table_handler.reset()
        self.inline_handler.reset()

        # Reset parser flags
        self._in_head = False
        self._in_script = False
        self._in_style = False

    def _is_skipping(self) -> bool:
        """Return True while inside head, script or style content."""
        return self._in_head or self._in_script or self._in_style

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]):
        """Handle the start of an HTML tag."""
        tag = tag.lower()
        attrs_dict = dict(attrs)

        # Skip content in head, script, style (except body)
        if self._should_skip_content(tag):
            return

        # Handle special section markers
        if self._handle_special_sections_start(tag):
            return

        # Apply styles for this element
        style = self.style_manager.apply_style_to_element(tag, attrs_dict)
        self.style_manager.push_style(style)

        # Delegate to appropriate handler
        self._delegate_start_tag(tag, attrs_dict)

        if tag in self._VOID_ELEMENTS:
            # No end tag will follow, so the style is released right away.
            self.style_manager.pop_style()

    def handle_endtag(self, tag: str):
        """Handle the end of an HTML tag."""
        tag = tag.lower()

        # Handle special section markers
        if self._handle_special_sections_end(tag):
            return

        # Skip content in head, script, style
        if self._is_skipping():
            return

        # Flush any accumulated text
        self.text_processor.flush_text()

        if tag in self._VOID_ELEMENTS:
            # Reached for self-closing syntax such as <br/>; the style was
            # already popped when the element started.
            return

        # Delegate to appropriate handler
        self._delegate_end_tag(tag)

        # Pop style regardless of tag
        self.style_manager.pop_style()

    def handle_data(self, data: str):
        """Handle text data."""
        if self._is_skipping():
            return
        self.text_processor.add_text(data)

    def handle_entityref(self, name: str):
        """Handle an HTML entity reference."""
        if self._is_skipping():
            return
        self.text_processor.add_entity_reference(name)

    def handle_charref(self, name: str):
        """Handle a character reference."""
        if self._is_skipping():
            return
        self.text_processor.add_character_reference(name)

    def _should_skip_content(self, tag: str) -> bool:
        """Check if we should skip content based on current state."""
        if self._is_skipping():
            if tag in ('head', 'script', 'style'):
                return False  # Let special section handlers deal with these
            if tag != 'body':
                return True
        return False

    def _handle_special_sections_start(self, tag: str) -> bool:
        """Handle special section start tags. Returns True if handled."""
        if tag == 'head':
            self._in_head = True
            return True
        elif tag == 'body':
            self._in_head = False
            return True
        elif tag == 'script':
            self._in_script = True
            return True
        elif tag == 'style':
            self._in_style = True
            return True
        return False

    def _handle_special_sections_end(self, tag: str) -> bool:
        """
        Handle special section end tags. Returns True if handled.

        No style is popped here: the matching start tags return before a
        style is pushed, so popping would remove the style of an enclosing
        element.
        """
        if tag == 'head':
            self._in_head = False
            return True
        elif tag == 'script':
            self._in_script = False
            return True
        elif tag == 'style':
            self._in_style = False
            return True
        elif tag == 'body':
            if not self._is_skipping():
                self.text_processor.flush_text()
            return True
        return False

    def _delegate_start_tag(self, tag: str, attrs: Dict[str, str]):
        """Delegate start tag handling to appropriate handler."""
        # Block elements
        if tag == 'p':
            self.block_handler.handle_paragraph_start(self._document)
        elif tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
            self.block_handler.handle_heading_start(tag, self._document)
        elif tag == 'div':
            self.block_handler.handle_div_start(self._document)
        elif tag == 'blockquote':
            self.block_handler.handle_blockquote_start(self._document)
        elif tag == 'pre':
            self.block_handler.handle_pre_start(self._document)
        elif tag == 'code':
            self.block_handler.handle_code_start(attrs, self._document)

        # List elements
        elif tag in ('ul', 'ol', 'dl'):
            self.list_handler.handle_list_start(tag, self.block_handler, self._document)
        elif tag == 'li':
            self.list_handler.handle_list_item_start(self.block_handler)
        elif tag in ('dt', 'dd'):
            self.list_handler.handle_definition_start(tag, self.block_handler)

        # Table elements
        elif tag == 'table':
            self.table_handler.handle_table_start(attrs, self.block_handler, self._document)
        elif tag in ('thead', 'tbody', 'tfoot'):
            self.table_handler.handle_table_section_start(tag)
        elif tag == 'tr':
            self.table_handler.handle_table_row_start()
        elif tag in ('td', 'th'):
            self.table_handler.handle_table_cell_start(tag, attrs, self.block_handler)

        # Inline elements
        elif tag == 'a':
            self.inline_handler.handle_link_start(attrs)
        elif tag == 'img':
            self.inline_handler.handle_image(attrs, self.block_handler, self._document)
        elif tag == 'br':
            self.inline_handler.handle_line_break(self.block_handler)
        elif tag == 'hr':
            self.inline_handler.handle_horizontal_rule(self.block_handler, self._document)

        # Style-only elements (no special handling needed, just styling)
        elif tag in ('b', 'strong', 'i', 'em', 'u', 'span'):
            pass  # Styles are already applied by style manager

    def _delegate_end_tag(self, tag: str):
        """Delegate end tag handling to appropriate handler."""
        # Block elements
        if tag in ('p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'blockquote', 'pre', 'code'):
            self.block_handler.handle_block_end()

        # List elements
        elif tag in ('ul', 'ol', 'dl'):
            self.list_handler.handle_list_end(self.block_handler)
        elif tag in ('li', 'dt', 'dd'):
            self.list_handler.handle_list_item_end(self.block_handler)

        # Table elements
        elif tag == 'table':
            self.table_handler.handle_table_end(self.block_handler)
        elif tag in ('thead', 'tbody', 'tfoot'):
            self.table_handler.handle_table_section_end()
        elif tag == 'tr':
            self.table_handler.handle_table_row_end()
        elif tag in ('td', 'th'):
            self.table_handler.handle_table_cell_end(self.block_handler)

        # Inline elements
        elif tag == 'a':
            self.inline_handler.handle_link_end()

        # Style-only elements (no special handling needed)
        elif tag in ('b', 'strong', 'i', 'em', 'u', 'span'):
            pass  # Styles are handled by style manager

View File

@ -0,0 +1,472 @@
"""
HTML element handlers for pyWebLayout.
This module provides specialized handlers for different types of HTML elements,
using composition and delegation to handle specific element types.
"""
from typing import Dict, List, Optional, Any
import urllib.parse
from pyWebLayout.abstract.document import Document
from pyWebLayout.abstract.block import (
Block, Parapgraph, Heading, HeadingLevel, Quote, CodeBlock,
HList, ListStyle, ListItem, Table, TableRow, TableCell,
HorizontalRule, LineBreak, Image
)
from pyWebLayout.abstract.functional import Link, LinkType
from pyWebLayout.io.readers.html_style import HTMLStyleManager
from pyWebLayout.io.readers.html_text import HTMLTextProcessor
class BlockElementHandler:
"""Handles block-level HTML elements like paragraphs, headings, divs."""
def __init__(self, style_manager: HTMLStyleManager, text_processor: HTMLTextProcessor):
self.style_manager = style_manager
self.text_processor = text_processor
self.block_stack: List[Block] = []
self.current_block: Optional[Block] = None
self.current_paragraph: Optional[Parapgraph] = None
def reset(self):
"""Reset the handler state."""
self.block_stack = []
self.current_block = None
self.current_paragraph = None
def add_block_to_document_or_parent(self, block: Block, document: Document):
"""Add a block to the document or current parent block."""
if self.current_block and hasattr(self.current_block, 'add_block'):
self.current_block.add_block(block)
else:
document.add_block(block)
def handle_paragraph_start(self, document: Document):
"""Handle the start of a paragraph element."""
self.text_processor.flush_text()
paragraph = Parapgraph()
self.add_block_to_document_or_parent(paragraph, document)
self.block_stack.append(paragraph)
self.current_block = paragraph
self.current_paragraph = paragraph
self.text_processor.set_current_paragraph(paragraph)
def handle_heading_start(self, tag: str, document: Document):
"""Handle the start of a heading element."""
self.text_processor.flush_text()
level_map = {
'h1': HeadingLevel.H1, 'h2': HeadingLevel.H2, 'h3': HeadingLevel.H3,
'h4': HeadingLevel.H4, 'h5': HeadingLevel.H5, 'h6': HeadingLevel.H6
}
heading = Heading(level=level_map[tag])
self.add_block_to_document_or_parent(heading, document)
self.block_stack.append(heading)
self.current_block = heading
self.current_paragraph = heading # Heading inherits from Paragraph
self.text_processor.set_current_paragraph(heading)
def handle_div_start(self, document: Document):
"""Handle the start of a div element."""
self.text_processor.flush_text()
div_para = Parapgraph()
self.add_block_to_document_or_parent(div_para, document)
self.block_stack.append(div_para)
self.current_block = div_para
self.current_paragraph = div_para
self.text_processor.set_current_paragraph(div_para)
def handle_blockquote_start(self, document: Document):
"""Handle the start of a blockquote element."""
self.text_processor.flush_text()
quote = Quote()
self.add_block_to_document_or_parent(quote, document)
self.block_stack.append(quote)
self.current_block = quote
self.current_paragraph = None
self.text_processor.set_current_paragraph(None)
def handle_pre_start(self, document: Document):
    """Open a block for a pre element, represented here as a paragraph.

    Buffered text is flushed, then a new paragraph is attached to the current
    parent (or the document), pushed onto the block stack and installed as
    the current block, current paragraph and text target.

    Args:
        document: Document that receives the block when no parent is open.
    """
    self.text_processor.flush_text()

    preformatted = Parapgraph()
    self.add_block_to_document_or_parent(preformatted, document)
    self.block_stack.append(preformatted)
    self.current_block = self.current_paragraph = preformatted
    self.text_processor.set_current_paragraph(preformatted)
def handle_code_start(self, attrs: Dict[str, str], document: Document):
    """Handle the start of a code element.

    When the innermost open block is a ``Parapgraph`` (such as the
    placeholder opened for a pre element), that paragraph is popped and
    replaced by a ``CodeBlock`` in its parent, or in ``document.blocks`` when
    it has no parent.  The code block is then pushed onto the block stack
    and text collection into a paragraph is switched off.  In any other
    context the call does nothing.

    Args:
        attrs: Attributes of the code element.  A ``class`` token of the
            form ``language-xyz`` selects the code block's language.
        document: Document whose top-level blocks are searched when the
            paragraph has no parent.
    """
    # NOTE(review): this check cannot tell a <pre> placeholder from an
    # ordinary paragraph, so <code> directly inside <p> takes the same path.
    if not (self.block_stack and isinstance(self.block_stack[-1], Parapgraph)):
        return

    pre_para = self.block_stack.pop()

    # ``class`` is a whitespace-separated token list (e.g. "hljs
    # language-python"), so inspect each token rather than the whole string.
    # ``or ''`` guards against a valueless attribute reported as None.
    language = ""
    prefix = 'language-'
    for token in (attrs.get('class') or '').split():
        if token.startswith(prefix):
            language = token[len(prefix):]
            break

    code_block = CodeBlock(language=language)

    # Replace the paragraph with the code block wherever it currently lives.
    parent = pre_para.parent
    if parent:
        if hasattr(parent, '_blocks'):
            for i, block in enumerate(parent._blocks):
                if block == pre_para:
                    parent._blocks[i] = code_block
                    code_block.parent = parent
                    break
    else:
        # No parent: the paragraph is a top-level block of the document.
        for i, block in enumerate(document.blocks):
            if block == pre_para:
                document.blocks[i] = code_block
                break

    self.block_stack.append(code_block)
    self.current_block = code_block
    self.current_paragraph = None
    self.text_processor.set_current_paragraph(None)
def handle_block_end(self):
    """Close the innermost open block and restore the enclosing context.

    The top of the block stack is discarded (if there is one).  The new top,
    when present, becomes the current block and - only if it is a
    ``Parapgraph`` - the current paragraph.  With an empty stack both are
    cleared.  The text processor is always told the resulting paragraph.
    """
    stack = self.block_stack
    if stack:
        stack.pop()

    if stack:
        enclosing = stack[-1]
        self.current_block = enclosing
        self.current_paragraph = enclosing if isinstance(enclosing, Parapgraph) else None
    else:
        self.current_block = self.current_paragraph = None

    self.text_processor.set_current_paragraph(self.current_paragraph)
class ListElementHandler:
    """Handles list-related HTML elements (ul, ol, dl, li, dt, dd).

    Open lists are tracked on ``list_stack``.  The shared block stack and the
    current block/paragraph live on the ``BlockElementHandler`` passed to
    each method and are updated in place.
    """

    def __init__(self, text_processor: HTMLTextProcessor):
        """Remember the text processor and start with no open lists."""
        self.list_stack: List[HList] = []
        self.text_processor = text_processor

    def reset(self):
        """Forget every open list."""
        self.list_stack = []

    def handle_list_start(self, tag: str, block_handler: BlockElementHandler, document: Document):
        """Open a list block for ``ul``, ``ol`` or ``dl``.

        Raises:
            KeyError: If ``tag`` is not one of the three list tag names.
        """
        self.text_processor.flush_text()

        if tag == 'ul':
            list_style = ListStyle.UNORDERED
        elif tag == 'ol':
            list_style = ListStyle.ORDERED
        elif tag == 'dl':
            list_style = ListStyle.DEFINITION
        else:
            raise KeyError(tag)

        new_list = HList(style=list_style)
        block_handler.add_block_to_document_or_parent(new_list, document)
        block_handler.block_stack.append(new_list)
        self.list_stack.append(new_list)

        # A list is a container only; text needs a list item first.
        block_handler.current_block = new_list
        block_handler.current_paragraph = None
        self.text_processor.set_current_paragraph(None)

    def handle_list_item_start(self, block_handler: BlockElementHandler):
        """Open a list item in the innermost list; ignored outside any list."""
        if not self.list_stack:
            return

        self.text_processor.flush_text()
        entry = ListItem()
        self.list_stack[-1].add_item(entry)
        self._enter_item(entry, block_handler)

    def handle_definition_start(self, tag: str, block_handler: BlockElementHandler):
        """Open a definition term (``dt``) or description (``dd``).

        Ignored unless the innermost list is a definition list.  A ``dt``
        creates a new list item; a ``dd`` adds a paragraph to the most recent
        item and is dropped when the list has no items yet.
        """
        inside_definition_list = (
            bool(self.list_stack)
            and self.list_stack[-1].style == ListStyle.DEFINITION
        )
        if not inside_definition_list:
            return

        self.text_processor.flush_text()
        definition_list = self.list_stack[-1]

        if tag == 'dt':
            term_item = ListItem(term="")
            definition_list.add_item(term_item)
            self._enter_item(term_item, block_handler)
        elif tag == 'dd' and definition_list._items:
            # The description belongs to the term that was opened last.
            self._start_paragraph_in(definition_list._items[-1], block_handler)

    def handle_list_end(self, block_handler: BlockElementHandler):
        """Close the innermost list and return to the enclosing block."""
        if self.list_stack:
            self.list_stack.pop()
        self._leave_block(block_handler)

    def handle_list_item_end(self, block_handler: BlockElementHandler):
        """Close the current list item and return to the enclosing block."""
        self._leave_block(block_handler)

    def _enter_item(self, item, block_handler: BlockElementHandler):
        """Push ``item`` as the current block and give it a text paragraph."""
        block_handler.block_stack.append(item)
        block_handler.current_block = item
        self._start_paragraph_in(item, block_handler)

    def _start_paragraph_in(self, item, block_handler: BlockElementHandler):
        """Add a fresh paragraph to ``item`` and route text into it."""
        content_para = Parapgraph()
        item.add_block(content_para)
        block_handler.current_paragraph = content_para
        self.text_processor.set_current_paragraph(content_para)

    def _leave_block(self, block_handler: BlockElementHandler):
        """Pop the shared block stack and clear the text target."""
        open_blocks = block_handler.block_stack
        if open_blocks:
            open_blocks.pop()
        block_handler.current_block = open_blocks[-1] if open_blocks else None
        block_handler.current_paragraph = None
        self.text_processor.set_current_paragraph(None)
class TableElementHandler:
    """Handles table-related HTML elements (table, tr, td, th, thead, tbody, tfoot).

    Open tables are tracked on ``table_stack``; the row being filled and the
    section tag last seen are kept as instance state.  The shared block stack
    and current block/paragraph live on the ``BlockElementHandler`` passed to
    the methods that need them.
    """

    # Section tag name -> section name passed to ``Table.add_row``.
    _SECTION_NAMES = {'thead': 'header', 'tfoot': 'footer'}

    def __init__(self, text_processor: HTMLTextProcessor):
        """Remember the text processor and start with no open table."""
        self.text_processor = text_processor
        self.table_stack: List[Table] = []
        self.current_table_row: Optional[TableRow] = None
        self.current_table_section = "body"

    def reset(self):
        """Reset the handler state."""
        self.table_stack = []
        self.current_table_row = None
        self.current_table_section = "body"

    def handle_table_start(self, attrs: Dict[str, str], block_handler: BlockElementHandler, document: Document):
        """Open a table block; its caption is taken from the ``summary`` attribute."""
        self.text_processor.flush_text()

        table = Table(caption=attrs.get('summary'))
        block_handler.add_block_to_document_or_parent(table, document)
        block_handler.block_stack.append(table)
        self.table_stack.append(table)

        block_handler.current_block = table
        block_handler.current_paragraph = None
        self.text_processor.set_current_paragraph(None)

    def handle_table_section_start(self, tag: str):
        """Record the section tag (``thead``/``tbody``/``tfoot``) now open."""
        self.current_table_section = tag

    def handle_table_row_start(self):
        """Add a new row to the innermost table; ignored outside any table."""
        if not self.table_stack:
            return

        self.text_processor.flush_text()
        row = TableRow()
        # Anything other than thead/tfoot (including the default) is body.
        section = self._SECTION_NAMES.get(self.current_table_section, "body")
        self.table_stack[-1].add_row(row, section=section)
        self.current_table_row = row

    @staticmethod
    def _parse_span(raw: Optional[str], minimum: int = 1) -> int:
        """Return ``raw`` as a span count, or 1 when it is missing or unusable.

        ``None`` (attribute absent or valueless), non-numeric text and values
        below ``minimum`` all fall back to 1.
        """
        try:
            span = int(raw)
        except (TypeError, ValueError):
            return 1
        return span if span >= minimum else 1

    def handle_table_cell_start(self, tag: str, attrs: Dict[str, str], block_handler: BlockElementHandler):
        """Open a ``td``/``th`` cell in the current row; ignored outside a row.

        ``colspan`` and ``rowspan`` are parsed independently so one malformed
        value no longer discards the other.  ``rowspan="0"`` is passed through
        unchanged; a zero ``colspan`` is treated as 1.
        """
        if not self.current_table_row:
            return

        self.text_processor.flush_text()

        cell = TableCell(
            is_header=(tag == 'th'),
            colspan=self._parse_span(attrs.get('colspan')),
            rowspan=self._parse_span(attrs.get('rowspan'), minimum=0),
        )
        self.current_table_row.add_cell(cell)
        block_handler.block_stack.append(cell)
        block_handler.current_block = cell

        # Create a paragraph for the cell content
        cell_para = Parapgraph()
        cell.add_block(cell_para)
        block_handler.current_paragraph = cell_para
        self.text_processor.set_current_paragraph(cell_para)

    def handle_table_end(self, block_handler: BlockElementHandler):
        """Close the innermost table and reset row/section tracking."""
        if self.table_stack:
            self.table_stack.pop()
        self._leave_block(block_handler)

        # NOTE(review): this also clears the row of an enclosing table when
        # tables are nested - confirm whether nested tables must be supported.
        self.current_table_row = None
        self.current_table_section = "body"

    def handle_table_section_end(self):
        """Return to the default ``body`` section."""
        self.current_table_section = "body"

    def handle_table_row_end(self):
        """Stop adding cells to the current row."""
        self.current_table_row = None

    def handle_table_cell_end(self, block_handler: BlockElementHandler):
        """Close the current cell and return to the enclosing block."""
        self._leave_block(block_handler)

    def _leave_block(self, block_handler: BlockElementHandler):
        """Pop the shared block stack and clear the text target."""
        open_blocks = block_handler.block_stack
        if open_blocks:
            open_blocks.pop()
        block_handler.current_block = open_blocks[-1] if open_blocks else None
        block_handler.current_paragraph = None
        self.text_processor.set_current_paragraph(None)
class InlineElementHandler:
    """Handles inline and special HTML elements (a, img, br, hr).

    Link state (``in_link`` / ``current_link``) is kept on the instance.
    Relative link and image URLs are resolved against ``base_url`` when one
    is set.
    """

    def __init__(self, text_processor: HTMLTextProcessor, base_url: Optional[str] = None):
        """Store the text processor and optional base URL; no link is open."""
        self.text_processor = text_processor
        self.base_url = base_url
        self.in_link = False
        self.current_link: Optional[Link] = None

    def reset(self):
        """Reset the handler state."""
        self.in_link = False
        self.current_link = None

    def set_base_url(self, base_url: Optional[str]):
        """Set the base URL for resolving relative links."""
        self.base_url = base_url

    def handle_link_start(self, attrs: Dict[str, str]):
        """Handle the start of a link element.

        The link type is derived from the ``href`` prefix: ``http(s)://`` is
        external, ``javascript:`` is a function link, ``api:`` is an API link
        (stored without the prefix) and everything else is internal.  Only
        internal, non-fragment targets are resolved against ``base_url``.

        Args:
            attrs: Attributes of the anchor element (``href``, ``title``).
        """
        self.text_processor.flush_text()

        # ``or ''`` also covers valueless attributes reported as None.
        href = attrs.get('href') or ''
        title = attrs.get('title') or ''

        if href.startswith(('http://', 'https://')):
            link_type = LinkType.EXTERNAL
        elif href.startswith('javascript:'):
            link_type = LinkType.FUNCTION
        elif href.startswith('api:'):
            link_type = LinkType.API
            # The API target is an identifier, not a URL: strip the scheme
            # and keep it away from base-URL resolution.
            href = href[4:]
        else:
            link_type = LinkType.INTERNAL
            if self.base_url and not href.startswith('#'):
                href = urllib.parse.urljoin(self.base_url, href)

        self.current_link = Link(
            location=href,
            link_type=link_type,
            title=title or None
        )
        self.in_link = True

    def handle_link_end(self):
        """Handle the end of a link element."""
        self.in_link = False
        self.current_link = None

    @staticmethod
    def _parse_dimension(raw: Optional[str]) -> Optional[int]:
        """Return ``raw`` as an int, or None when it is absent or not an integer."""
        try:
            return int(raw)
        except (TypeError, ValueError):
            return None

    def handle_image(self, attrs: Dict[str, str], block_handler: BlockElementHandler, document: Document):
        """Handle an image element.

        ``width`` and ``height`` are parsed independently, so a non-integer
        width (e.g. ``"100%"``) no longer discards a valid height.  A
        non-empty relative ``src`` is resolved against ``base_url``; an empty
        ``src`` is left empty instead of turning into the base URL.

        Args:
            attrs: Attributes of the image element.
            block_handler: Handler whose current block receives the image.
            document: Document that receives the image when no block is open.
        """
        src = attrs.get('src') or ''
        alt = attrs.get('alt') or ''

        width = self._parse_dimension(attrs.get('width'))
        height = self._parse_dimension(attrs.get('height'))

        if src and self.base_url and not src.startswith(('http://', 'https://')):
            src = urllib.parse.urljoin(self.base_url, src)

        image = Image(source=src, alt_text=alt, width=width, height=height)
        block_handler.add_block_to_document_or_parent(image, document)

    def handle_line_break(self, block_handler: BlockElementHandler):
        """Handle a line break element.

        Pending text is flushed before the break is inserted, matching every
        other handler, so that text preceding the break stays in front of it.
        """
        # NOTE(review): assumes flush_text() appends buffered text to the
        # current paragraph - confirm against HTMLTextProcessor.
        self.text_processor.flush_text()

        paragraph = block_handler.current_paragraph
        if paragraph and hasattr(paragraph, 'add_block'):
            paragraph.add_block(LineBreak())

    def handle_horizontal_rule(self, block_handler: BlockElementHandler, document: Document):
        """Handle a horizontal rule element."""
        self.text_processor.flush_text()
        hr = HorizontalRule()
        block_handler.add_block_to_document_or_parent(hr, document)

View File

@ -0,0 +1,426 @@
"""
HTML metadata reader for pyWebLayout.
This module provides specialized functionality for extracting metadata
from HTML documents, following the decomposed architecture pattern.
"""
from typing import Dict, Any, Optional
import re
from pyWebLayout.abstract.document import Document, MetadataType
from pyWebLayout.io.readers.base import MetadataReader
class HTMLMetadataReader(MetadataReader):
    """
    Specialized reader for extracting metadata from HTML documents.

    This class handles HTML meta tags, title elements, and other metadata
    sources like Open Graph tags, Twitter Card tags and JSON-LD structured
    data.  Extraction is regex based, and every call to
    ``extract_metadata`` starts from a clean state.
    """

    # A ``<meta ...>`` tag; group 1 is its raw attribute text.
    _META_TAG_RE = re.compile(r'<meta\s+([^>]+)>', re.IGNORECASE)
    # The opening ``<html ...>`` tag; group 1 is its raw attribute text.
    _HTML_TAG_RE = re.compile(r'<html\b([^>]*)>', re.IGNORECASE)
    # One attribute: a name (hyphens allowed), optionally followed by ``=``
    # and a double-quoted, single-quoted or unquoted value (groups 2, 3, 4).
    _ATTRIBUTE_RE = re.compile(
        r'''([^\s"'<>/=]+)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+)))?'''
    )
    # A JSON-LD ``<script>`` element; group 1 is the script body.
    _JSON_LD_RE = re.compile(
        r'<script\b[^>]*?\stype\s*=\s*["\']?application/ld\+json["\']?[^>]*>(.*?)</script>',
        re.IGNORECASE | re.DOTALL,
    )

    def __init__(self):
        """Initialize the HTML metadata reader with empty extraction state."""
        self._reset()

    def extract_metadata(self, html_content: str, document: Document) -> Dict[str, Any]:
        """
        Extract metadata from HTML content.

        Args:
            html_content: The HTML content to parse
            document: The document to populate with metadata

        Returns:
            Dictionary with the keys ``title``, ``meta_tags``, ``open_graph``,
            ``twitter_cards`` and ``json_ld``
        """
        self._reset()

        self._extract_title(html_content)
        self._extract_html_lang(html_content)
        self._extract_meta_tags(html_content)
        self._extract_open_graph(html_content)
        self._extract_twitter_cards(html_content)
        self._extract_json_ld(html_content)

        self._populate_document(document)

        return {
            'title': self._title,
            'meta_tags': self._meta_tags,
            'open_graph': self._og_tags,
            'twitter_cards': self._twitter_tags,
            'json_ld': self._json_ld
        }

    def _reset(self):
        """Reset internal state for a new extraction."""
        self._title = None
        self._html_lang = None
        self._meta_tags = {}
        self._og_tags = {}
        self._twitter_tags = {}
        self._json_ld = {}

    def _extract_title(self, html_content: str):
        """
        Extract the title from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        title_match = re.search(
            r'<title\b[^>]*>(.*?)</title>', html_content, re.IGNORECASE | re.DOTALL
        )
        if title_match:
            self._title = self._clean_text(title_match.group(1))

    def _extract_html_lang(self, html_content: str):
        """
        Record the ``lang`` attribute of the ``<html>`` tag, if any.

        Args:
            html_content: The HTML content to parse
        """
        html_match = self._HTML_TAG_RE.search(html_content)
        if html_match:
            self._html_lang = self._parse_attributes(html_match.group(1)).get('lang') or None

    def _iter_meta_attributes(self, html_content: str):
        """Yield the parsed attribute dictionary of every ``<meta>`` tag."""
        for match in self._META_TAG_RE.finditer(html_content):
            yield self._parse_attributes(match.group(1))

    def _extract_meta_tags(self, html_content: str):
        """
        Extract meta tags from HTML content.

        ``name``/``content`` pairs are stored under the lower-cased name,
        ``http-equiv`` pairs under ``http-equiv:<name>`` and a ``charset``
        attribute under ``charset``.

        Args:
            html_content: The HTML content to parse
        """
        for attrs in self._iter_meta_attributes(html_content):
            content = attrs.get('content', '')

            name = attrs.get('name', '').lower()
            if name and content:
                self._meta_tags[name] = content

            http_equiv = attrs.get('http-equiv', '').lower()
            if http_equiv and content:
                self._meta_tags[f'http-equiv:{http_equiv}'] = content

            charset = attrs.get('charset', '')
            if charset:
                self._meta_tags['charset'] = charset

    def _collect_prefixed_meta(self, html_content: str, key_attribute: str,
                               prefix: str, target: Dict[str, str]):
        """Store ``content`` of meta tags whose ``key_attribute`` starts with ``prefix``.

        The prefix is matched case-insensitively and removed from the stored
        key.  Attribute order and quoting style do not matter.
        """
        for attrs in self._iter_meta_attributes(html_content):
            key = attrs.get(key_attribute, '')
            suffix = key[len(prefix):]
            if suffix and key[:len(prefix)].lower() == prefix and 'content' in attrs:
                target[suffix] = attrs['content']

    def _extract_open_graph(self, html_content: str):
        """
        Extract Open Graph meta tags (``property="og:..."``) from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        self._collect_prefixed_meta(html_content, 'property', 'og:', self._og_tags)

    def _extract_twitter_cards(self, html_content: str):
        """
        Extract Twitter Card meta tags (``name="twitter:..."``) from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        self._collect_prefixed_meta(html_content, 'name', 'twitter:', self._twitter_tags)

    def _extract_json_ld(self, html_content: str):
        """
        Extract JSON-LD structured data from HTML content.

        Each object is filed under every string ``@type`` it declares.
        Top-level arrays and ``@graph`` arrays are unpacked one level deep.
        Script bodies that are not valid JSON are skipped.

        Args:
            html_content: The HTML content to parse
        """
        # Imported here because the module's import block does not provide it.
        import json

        for match in self._JSON_LD_RE.finditer(html_content):
            try:
                data = json.loads(match.group(1).strip())
            except ValueError:
                # json.JSONDecodeError is a ValueError: skip invalid JSON-LD.
                continue

            if isinstance(data, dict):
                graph = data.get('@graph')
                nodes = [data] + (graph if isinstance(graph, list) else [])
            elif isinstance(data, list):
                nodes = data
            else:
                nodes = []

            for node in nodes:
                if isinstance(node, dict):
                    self._store_json_ld_item(node)

    def _store_json_ld_item(self, item: Dict[str, Any]):
        """File ``item`` under each string ``@type`` it declares."""
        declared = item.get('@type')
        # ``@type`` may be a single string or a list of strings; a list
        # cannot be used as a dictionary key directly.
        type_names = declared if isinstance(declared, list) else [declared]
        for type_name in type_names:
            if isinstance(type_name, str):
                self._json_ld.setdefault(type_name, []).append(item)

    def _iter_json_ld_items(self):
        """Yield every stored JSON-LD object, grouped by type in insertion order."""
        for items in self._json_ld.values():
            yield from items

    def _entity_text(self, value: Any, key: str = 'name') -> Optional[str]:
        """Reduce a JSON-LD value to a non-empty string, if possible.

        Accepts a plain string, an object carrying the string under ``key``,
        or a list of either (the first usable entry wins).
        """
        if isinstance(value, str):
            return value or None
        if isinstance(value, dict):
            nested = value.get(key)
            return nested if isinstance(nested, str) and nested else None
        if isinstance(value, list):
            for entry in value:
                text = self._entity_text(entry, key)
                if text:
                    return text
        return None

    def _populate_document(self, document: Document):
        """
        Populate the document with extracted metadata.

        Only values that were actually found (truthy) are written.

        Args:
            document: The document to populate
        """
        candidates = (
            (MetadataType.TITLE, self._get_best_title),
            (MetadataType.DESCRIPTION, self._get_best_description),
            (MetadataType.AUTHOR, self._get_best_author),
            (MetadataType.KEYWORDS, self._get_keywords),
            (MetadataType.LANGUAGE, self._get_language),
            (MetadataType.COVER_IMAGE, self._get_cover_image),
            (MetadataType.PUBLISHER, self._get_publisher),
            (MetadataType.PUBLICATION_DATE, self._get_publication_date),
        )
        for metadata_type, getter in candidates:
            value = getter()
            if value:
                document.set_metadata(metadata_type, value)

    def _get_best_title(self) -> Optional[str]:
        """Get the best available title from all sources.

        Priority order: Open Graph > Twitter > JSON-LD > meta > HTML title.
        """
        if 'title' in self._og_tags:
            return self._og_tags['title']

        if 'title' in self._twitter_tags:
            return self._twitter_tags['title']

        for item in self._iter_json_ld_items():
            for key in ('name', 'headline'):
                if key in item:
                    return item[key]

        for key in ('title', 'og:title', 'twitter:title'):
            if key in self._meta_tags:
                return self._meta_tags[key]

        return self._title

    def _get_best_description(self) -> Optional[str]:
        """Get the best available description from all sources.

        Priority order: Open Graph > Twitter > meta description > JSON-LD.
        """
        for source in (self._og_tags, self._twitter_tags, self._meta_tags):
            if 'description' in source:
                return source['description']

        for item in self._iter_json_ld_items():
            if 'description' in item:
                return item['description']

        return None

    def _get_best_author(self) -> Optional[str]:
        """Get the best available author from meta tags, then JSON-LD.

        JSON-LD ``author`` is preferred over ``creator``; either may be a
        string, an object with a ``name``, or a list of those.
        """
        if 'author' in self._meta_tags:
            return self._meta_tags['author']

        for item in self._iter_json_ld_items():
            for key in ('author', 'creator'):
                name = self._entity_text(item.get(key))
                if name:
                    return name

        return None

    def _get_keywords(self) -> Optional[str]:
        """Get keywords from meta tags."""
        return self._meta_tags.get('keywords')

    def _get_language(self) -> Optional[str]:
        """Get language from meta tags or the HTML ``lang`` attribute."""
        for key in ('language', 'http-equiv:content-language'):
            language = self._meta_tags.get(key)
            if language:
                return language
        return self._html_lang

    def _get_cover_image(self) -> Optional[str]:
        """Get the best available cover image from all sources.

        Priority order: Open Graph > Twitter > JSON-LD ``image`` (a URL
        string, an object with a ``url``, or a list of those).
        """
        for source in (self._og_tags, self._twitter_tags):
            if 'image' in source:
                return source['image']

        for item in self._iter_json_ld_items():
            url = self._entity_text(item.get('image'), key='url')
            if url:
                return url

        return None

    def _get_publisher(self) -> Optional[str]:
        """Get publisher from JSON-LD."""
        for item in self._iter_json_ld_items():
            name = self._entity_text(item.get('publisher'))
            if name:
                return name
        return None

    def _get_publication_date(self) -> Optional[str]:
        """Get publication date from JSON-LD (``datePublished`` or ``publishDate``)."""
        for item in self._iter_json_ld_items():
            for key in ('datePublished', 'publishDate'):
                if key in item:
                    return item[key]
        return None

    def _parse_attributes(self, attr_string: str) -> Dict[str, str]:
        """
        Parse HTML attributes from a string.

        Names are lower-cased and may contain hyphens (``http-equiv``).
        Values may be double-quoted, single-quoted or unquoted.  Attributes
        without a value (``async``) map to an empty string and never replace
        a value already recorded for the same name.

        Args:
            attr_string: String containing HTML attributes

        Returns:
            Dictionary of attribute name-value pairs
        """
        attrs = {}
        for match in self._ATTRIBUTE_RE.finditer(attr_string):
            name = match.group(1).lower()
            values = [group for group in match.groups()[1:] if group is not None]
            if values:
                attrs[name] = values[0]
            else:
                attrs.setdefault(name, '')
        return attrs

    def _clean_text(self, text: str) -> str:
        """
        Clean up text content by decoding HTML entities and collapsing whitespace.

        Entities are decoded in a single pass, so ``&amp;quot;`` becomes the
        literal text ``&quot;`` rather than a quote character.  Non-breaking
        spaces count as whitespace.

        Args:
            text: The text to clean

        Returns:
            Cleaned text
        """
        # Imported here because the module's import block does not provide it.
        import html

        decoded = html.unescape(text)
        return re.sub(r'\s+', ' ', decoded).strip()

View File

@ -0,0 +1,483 @@
"""
HTML resources reader for pyWebLayout.
This module provides specialized functionality for extracting resources
from HTML documents, such as stylesheets, scripts, and external files.
"""
from typing import Dict, Any, Optional, List
import re
import urllib.parse
from pyWebLayout.abstract.document import Document
from pyWebLayout.io.readers.base import ResourceReader
class HTMLResourceReader(ResourceReader):
"""
Specialized reader for extracting resources from HTML documents.
This class handles CSS stylesheets, JavaScript files, images,
and other external resources referenced in HTML.
"""
def __init__(self):
"""Initialize the HTML resource reader."""
self._stylesheets = []
self._scripts = []
self._external_resources = {}
self._inline_styles = {}
self._inline_scripts = []
def extract_resources(self, html_content: str, document: Document) -> Dict[str, Any]:
    """
    Extract resources from HTML content.

    State from any previous run is cleared, every extraction pass is run
    over ``html_content`` in a fixed order, and the results are then copied
    into ``document``.

    Args:
        html_content: The HTML content to parse
        document: The document to populate with resources

    Returns:
        Dictionary with the keys ``stylesheets``, ``scripts``,
        ``external_resources``, ``inline_styles`` and ``inline_scripts``
    """
    self._reset()

    extraction_passes = (
        self._extract_stylesheets,
        self._extract_scripts,
        self._extract_external_resources,
        self._extract_inline_styles,
        self._extract_inline_scripts,
    )
    for run_pass in extraction_passes:
        run_pass(html_content)

    self._populate_document(document)

    extracted = {}
    extracted['stylesheets'] = self._stylesheets
    extracted['scripts'] = self._scripts
    extracted['external_resources'] = self._external_resources
    extracted['inline_styles'] = self._inline_styles
    extracted['inline_scripts'] = self._inline_scripts
    return extracted
def _reset(self):
"""Reset internal state for a new extraction."""
self._stylesheets = []
self._scripts = []
self._external_resources = {}
self._inline_styles = {}
self._inline_scripts = []
def _extract_stylesheets(self, html_content: str):
"""
Extract CSS stylesheet references from HTML content.
Args:
html_content: The HTML content to parse
"""
# Regular expression to match link tags for stylesheets
link_pattern = r'<link\s+([^>]+)>'
for match in re.finditer(link_pattern, html_content, re.IGNORECASE):
attrs = self._parse_attributes(match.group(1))
# Check if this is a stylesheet
rel = attrs.get('rel', '').lower()
if rel == 'stylesheet':
href = attrs.get('href', '')
media = attrs.get('media', 'all')
type_attr = attrs.get('type', 'text/css')
if href:
stylesheet = {
'type': 'external',
'href': href,
'media': media,
'content_type': type_attr
}
self._stylesheets.append(stylesheet)
# Handle other link types
elif rel in ('icon', 'shortcut icon', 'apple-touch-icon'):
href = attrs.get('href', '')
if href:
self._external_resources[f'icon_{len(self._external_resources)}'] = {
'type': 'icon',
'rel': rel,
'href': href,
'sizes': attrs.get('sizes', ''),
'content_type': attrs.get('type', '')
}
elif rel == 'preload':
href = attrs.get('href', '')
if href:
self._external_resources[f'preload_{len(self._external_resources)}'] = {
'type': 'preload',
'href': href,
'as': attrs.get('as', ''),
'content_type': attrs.get('type', '')
}
def _extract_scripts(self, html_content: str):
"""
Extract script references from HTML content.
Args:
html_content: The HTML content to parse
"""
# Regular expression to match script tags
script_pattern = r'<script\s*([^>]*)>(.*?)</script>'
for match in re.finditer(script_pattern, html_content, re.IGNORECASE | re.DOTALL):
attrs_str = match.group(1)
content = match.group(2).strip()
attrs = self._parse_attributes(attrs_str)
src = attrs.get('src', '')
script_type = attrs.get('type', 'text/javascript')
if src:
# External script
script = {
'type': 'external',
'src': src,
'content_type': script_type,
'async': 'async' in attrs,
'defer': 'defer' in attrs,
'integrity': attrs.get('integrity', ''),
'crossorigin': attrs.get('crossorigin', '')
}
self._scripts.append(script)
elif content:
# Inline script
script = {
'type': 'inline',
'content': content,
'content_type': script_type
}
self._scripts.append(script)
def _extract_external_resources(self, html_content: str):
"""
Extract other external resources from HTML content.
Args:
html_content: The HTML content to parse
"""
# Extract images
img_pattern = r'<img\s+([^>]+)>'
for match in re.finditer(img_pattern, html_content, re.IGNORECASE):
attrs = self._parse_attributes(match.group(1))
src = attrs.get('src', '')
if src:
self._external_resources[f'image_{len(self._external_resources)}'] = {
'type': 'image',
'src': src,
'alt': attrs.get('alt', ''),
'width': attrs.get('width', ''),
'height': attrs.get('height', ''),
'loading': attrs.get('loading', ''),
'srcset': attrs.get('srcset', '')
}
# Extract audio
audio_pattern = r'<audio\s+([^>]+)>'
for match in re.finditer(audio_pattern, html_content, re.IGNORECASE):
attrs = self._parse_attributes(match.group(1))
src = attrs.get('src', '')
if src:
self._external_resources[f'audio_{len(self._external_resources)}'] = {
'type': 'audio',
'src': src,
'controls': 'controls' in attrs,
'autoplay': 'autoplay' in attrs,
'loop': 'loop' in attrs,
'muted': 'muted' in attrs
}
# Extract video
video_pattern = r'<video\s+([^>]+)>'
for match in re.finditer(video_pattern, html_content, re.IGNORECASE):
attrs = self._parse_attributes(match.group(1))
src = attrs.get('src', '')
if src:
self._external_resources[f'video_{len(self._external_resources)}'] = {
'type': 'video',
'src': src,
'controls': 'controls' in attrs,
'autoplay': 'autoplay' in attrs,
'loop': 'loop' in attrs,
'muted': 'muted' in attrs,
'width': attrs.get('width', ''),
'height': attrs.get('height', ''),
'poster': attrs.get('poster', '')
}
# Extract embed/object resources
embed_pattern = r'<embed\s+([^>]+)>'
for match in re.finditer(embed_pattern, html_content, re.IGNORECASE):
attrs = self._parse_attributes(match.group(1))
src = attrs.get('src', '')
if src:
self._external_resources[f'embed_{len(self._external_resources)}'] = {
'type': 'embed',
'src': src,
'content_type': attrs.get('type', ''),
'width': attrs.get('width', ''),
'height': attrs.get('height', '')
}
# Extract iframe sources
iframe_pattern = r'<iframe\s+([^>]+)>'
for match in re.finditer(iframe_pattern, html_content, re.IGNORECASE):
attrs = self._parse_attributes(match.group(1))
src = attrs.get('src', '')
if src:
self._external_resources[f'iframe_{len(self._external_resources)}'] = {
'type': 'iframe',
'src': src,
'width': attrs.get('width', ''),
'height': attrs.get('height', ''),
'loading': attrs.get('loading', ''),
'sandbox': attrs.get('sandbox', '')
}
def _extract_inline_styles(self, html_content: str):
"""
Extract inline CSS styles from HTML content.
Args:
html_content: The HTML content to parse
"""
# Extract style blocks
style_pattern = r'<style\s*([^>]*)>(.*?)</style>'
for i, match in enumerate(re.finditer(style_pattern, html_content, re.IGNORECASE | re.DOTALL)):
attrs_str = match.group(1)
content = match.group(2).strip()
attrs = self._parse_attributes(attrs_str)
if content:
style_block = {
'content': content,
'media': attrs.get('media', 'all'),
'content_type': attrs.get('type', 'text/css')
}
self._inline_styles[f'style_block_{i}'] = style_block
# Extract inline style attributes (this would be more complex
# as it requires parsing all elements with style attributes)
style_attr_pattern = r'<[^>]+style\s*=\s*["\']([^"\']+)["\'][^>]*>'
for i, match in enumerate(re.finditer(style_attr_pattern, html_content, re.IGNORECASE)):
style_content = match.group(1)
if style_content:
style_attr = {
'content': style_content,
'type': 'attribute'
}
self._inline_styles[f'style_attr_{i}'] = style_attr
def _extract_inline_scripts(self, html_content: str):
"""
Extract inline JavaScript from HTML content.
Args:
html_content: The HTML content to parse
"""
# This is already handled in _extract_scripts, but we keep this
# method for consistency and potential future extensions
pass
def _populate_document(self, document: Document):
    """
    Populate the document with extracted resources.

    External stylesheets are added as stylesheets, inline scripts as scripts
    and external scripts as resources named ``script_<n>``.  Other external
    resources are added under their stored names.  ``<style>`` blocks that
    parse to at least one rule are added as inline stylesheets; ``style``
    attribute entries are not added.

    Args:
        document: The document to populate
    """
    for sheet in self._stylesheets:
        document.add_stylesheet(sheet)

    for entry in self._scripts:
        if entry['type'] != 'inline':
            # External scripts are stored as resources, numbered by the
            # document's current resource count.
            document.add_resource(f"script_{len(document._resources)}", entry)
            continue
        document.add_script(entry['content'])

    for resource_name, payload in self._external_resources.items():
        document.add_resource(resource_name, payload)

    for style_entry in self._inline_styles.values():
        if style_entry.get('type') == 'attribute':
            continue
        rules = self._parse_css(style_entry['content'])
        if not rules:
            continue
        document.add_stylesheet({
            'type': 'inline',
            'content': style_entry['content'],
            'parsed': rules,
            'media': style_entry.get('media', 'all')
        })
def _parse_attributes(self, attr_string: str) -> Dict[str, str]:
"""
Parse HTML attributes from a string.
Args:
attr_string: String containing HTML attributes
Returns:
Dictionary of attribute name-value pairs
"""
attrs = {}
# Regular expression to match attribute="value" or attribute='value'
attr_pattern = r'(\w+)=(?:"([^"]*)"|\'([^\']*)|([^\s>]+))'
for match in re.finditer(attr_pattern, attr_string):
name = match.group(1).lower()
value = match.group(2) or match.group(3) or match.group(4) or ''
attrs[name] = value
# Handle standalone attributes (like async, defer)
standalone_pattern = r'\b(\w+)(?!=)'
for match in re.finditer(standalone_pattern, attr_string):
attr_name = match.group(1).lower()
if attr_name not in attrs:
attrs[attr_name] = ''
return attrs
def _parse_css(self, css_str: str) -> Dict[str, Dict[str, str]]:
"""
Parse a CSS stylesheet.
Args:
css_str: CSS stylesheet string
Returns:
Dictionary of selectors and their style properties
"""
stylesheet = {}
# Remove comments
css_str = re.sub(r'/\*.*?\*/', '', css_str, flags=re.DOTALL)
# Split into rule sets
rule_sets = css_str.split('}')
for rule_set in rule_sets:
# Split into selector and declarations
parts = rule_set.split('{', 1)
if len(parts) != 2:
continue
selector = parts[0].strip()
declarations = parts[1].strip()
# Parse declarations
style = self._parse_css_declarations(declarations)
# Add to stylesheet
if selector and style:
stylesheet[selector] = style
return stylesheet
def _parse_css_declarations(self, declarations_str: str) -> Dict[str, str]:
"""
Parse CSS declarations.
Args:
declarations_str: CSS declarations string
Returns:
Dictionary of CSS properties and values
"""
declarations = {}
# Split the declarations string into individual declarations
decl_list = [d.strip() for d in declarations_str.split(';') if d.strip()]
for declaration in decl_list:
# Split into property and value
parts = declaration.split(':', 1)
if len(parts) != 2:
continue
prop = parts[0].strip().lower()
value = parts[1].strip()
# Store the declaration
declarations[prop] = value
return declarations
def resolve_url(self, url: str, base_url: Optional[str] = None) -> str:
    """
    Resolve a relative URL against a base URL.

    URLs starting with ``http://``, ``https://``, ``//`` or ``data:`` are
    returned untouched, as is any URL when no base URL is supplied.

    Args:
        url: The URL to resolve
        base_url: The base URL to resolve against

    Returns:
        The resolved URL
    """
    passthrough_prefixes = ('http://', 'https://', '//', 'data:')
    if not base_url or url.startswith(passthrough_prefixes):
        return url
    return urllib.parse.urljoin(base_url, url)
def get_resource_dependencies(self, resource: Dict[str, Any]) -> List[str]:
    """
    Get the dependencies of a resource (e.g., CSS imports, script dependencies).

    Only resources whose ``type`` is ``'external'`` and that carry a
    ``content`` entry are inspected. CSS content is scanned for ``@import``
    rules; JavaScript content is scanned for ES6 ``import ... from`` statements
    first and CommonJS ``require(...)`` calls second (basic, regex-based
    detection).

    Args:
        resource: The resource to analyze

    Returns:
        List of dependency URLs
    """
    if resource.get('type') != 'external' or 'content' not in resource:
        return []

    content = resource['content']
    content_type = resource.get('content_type', '')

    if content_type.startswith('text/css'):
        css_import = r'@import\s+(?:url\()?["\']?([^"\'()]+)["\']?\)?'
        return [m.group(1) for m in re.finditer(css_import, content, re.IGNORECASE)]

    if content_type.startswith('text/javascript'):
        js_patterns = (
            r'import\s+.*?\s+from\s+["\']([^"\']+)["\']',   # ES6 imports
            r'require\(\s*["\']([^"\']+)["\']\s*\)',        # CommonJS requires
        )
        found = []
        for pattern in js_patterns:
            found.extend(m.group(1) for m in re.finditer(pattern, content))
        return found

    return []

View File

@ -0,0 +1,281 @@
"""
HTML style management for pyWebLayout.
This module provides specialized functionality for handling CSS styles,
style stacks, and style parsing in HTML documents.
"""
from typing import Dict, List, Any, Optional, Tuple
import re
from pyWebLayout.style import Font, FontStyle, FontWeight, TextDecoration
class HTMLStyleManager:
    """
    Manages CSS styles and style stacks during HTML parsing.

    This class handles style parsing, style inheritance, and maintains
    the style stack for proper style nesting.
    """

    # Named CSS colors understood by parse_color().
    _NAMED_COLORS = {
        'black': (0, 0, 0),
        'white': (255, 255, 255),
        'red': (255, 0, 0),
        'green': (0, 128, 0),
        'blue': (0, 0, 255),
        'yellow': (255, 255, 0),
        'cyan': (0, 255, 255),
        'magenta': (255, 0, 255),
        'gray': (128, 128, 128),
        'grey': (128, 128, 128),
        'silver': (192, 192, 192),
        'maroon': (128, 0, 0),
        'olive': (128, 128, 0),
        'navy': (0, 0, 128),
        'purple': (128, 0, 128),
        'teal': (0, 128, 128),
        'lime': (0, 255, 0),
        'aqua': (0, 255, 255),
        'fuchsia': (255, 0, 255),
    }

    def __init__(self):
        """Initialize the style manager with an empty stack and the default style."""
        self._style_stack: List[Dict[str, Any]] = []
        self._current_style = self._get_default_style()

    def _get_default_style(self) -> Dict[str, Any]:
        """Get the default style settings (a fresh dictionary on every call)."""
        return {
            'font_size': 12,
            'font_weight': FontWeight.NORMAL,
            'font_style': FontStyle.NORMAL,
            'decoration': TextDecoration.NONE,
            'color': (0, 0, 0),
            'background': None,
            'language': 'en_US'
        }

    def reset(self):
        """Reset the style manager to initial state."""
        self._style_stack = []
        self._current_style = self._get_default_style()

    def push_style(self, style: Dict[str, Any]):
        """
        Push a new style onto the style stack.

        A copy of the current style is saved so pop_style() can restore it,
        then the given properties are overlaid on the current style.

        Args:
            style: The style to push
        """
        self._style_stack.append(self._current_style.copy())
        self._current_style.update(style)

    def pop_style(self):
        """Pop a style from the style stack (no-op when the stack is empty)."""
        if self._style_stack:
            self._current_style = self._style_stack.pop()

    def get_current_style(self) -> Dict[str, Any]:
        """Get a copy of the current style."""
        return self._current_style.copy()

    def get_tag_style(self, tag: str) -> Dict[str, Any]:
        """
        Get the default style for a tag.

        Args:
            tag: The tag name

        Returns:
            A dictionary of style properties (empty for unknown tags)
        """
        tag_styles = {
            'h1': {'font_size': 24, 'font_weight': FontWeight.BOLD},
            'h2': {'font_size': 20, 'font_weight': FontWeight.BOLD},
            'h3': {'font_size': 18, 'font_weight': FontWeight.BOLD},
            'h4': {'font_size': 16, 'font_weight': FontWeight.BOLD},
            'h5': {'font_size': 14, 'font_weight': FontWeight.BOLD},
            'h6': {'font_size': 12, 'font_weight': FontWeight.BOLD},
            'b': {'font_weight': FontWeight.BOLD},
            'strong': {'font_weight': FontWeight.BOLD},
            'i': {'font_style': FontStyle.ITALIC},
            'em': {'font_style': FontStyle.ITALIC},
            'u': {'decoration': TextDecoration.UNDERLINE},
            'a': {'decoration': TextDecoration.UNDERLINE, 'color': (0, 0, 255)},
            'code': {'font_family': 'monospace', 'background': (240, 240, 240, 255)},
            'pre': {'font_family': 'monospace'},
        }
        return tag_styles.get(tag, {})

    def create_font(self) -> "Font":
        """
        Create a Font object from the current style.

        Returns:
            Font: A font object with the current style settings
        """
        return Font(
            font_size=self._current_style['font_size'],
            colour=self._current_style['color'],
            weight=self._current_style['font_weight'],
            style=self._current_style['font_style'],
            decoration=self._current_style['decoration'],
            background=self._current_style['background'],
            # 'langauge' is the (misspelled) keyword the Font constructor declares.
            langauge=self._current_style['language']
        )

    def parse_inline_style(self, style_str: str) -> Dict[str, Any]:
        """
        Parse inline CSS style string.

        Recognized properties: font-size (px/pt), font-weight, font-style,
        text-decoration, color and background-color. Keyword values and
        units are matched case-insensitively; unrecognized properties and
        unparseable values are ignored.

        Args:
            style_str: CSS style string

        Returns:
            Dictionary of style properties
        """
        if not style_str:
            return {}

        style_dict: Dict[str, Any] = {}
        declarations = [d.strip() for d in style_str.split(';') if d.strip()]
        for declaration in declarations:
            parts = declaration.split(':', 1)
            if len(parts) != 2:
                continue
            prop = parts[0].strip().lower()
            value = parts[1].strip()
            # CSS keywords and units are case-insensitive.
            keyword = value.lower()

            if prop == 'font-size':
                if keyword.endswith(('px', 'pt')):
                    try:
                        # Accept fractional sizes ("12.5px") and round to the
                        # integer size the font machinery works with.
                        style_dict['font_size'] = round(float(keyword[:-2]))
                    except (ValueError, OverflowError):
                        pass
            elif prop == 'font-weight':
                if keyword == 'bold':
                    style_dict['font_weight'] = FontWeight.BOLD
                elif keyword == 'normal':
                    style_dict['font_weight'] = FontWeight.NORMAL
            elif prop == 'font-style':
                if keyword == 'italic':
                    style_dict['font_style'] = FontStyle.ITALIC
                elif keyword == 'normal':
                    style_dict['font_style'] = FontStyle.NORMAL
            elif prop == 'text-decoration':
                if keyword == 'underline':
                    style_dict['decoration'] = TextDecoration.UNDERLINE
                elif keyword == 'line-through':
                    style_dict['decoration'] = TextDecoration.STRIKETHROUGH
                elif keyword == 'none':
                    style_dict['decoration'] = TextDecoration.NONE
            elif prop == 'color':
                color = self.parse_color(value)
                if color:
                    style_dict['color'] = color
            elif prop == 'background-color':
                color = self.parse_color(value)
                if color:
                    # Backgrounds are RGBA; inline backgrounds are fully opaque.
                    style_dict['background'] = color + (255,)
        return style_dict

    def parse_color(self, color_str: str) -> Optional[Tuple[int, int, int]]:
        """
        Parse a CSS color string.

        Supports the named colors in ``_NAMED_COLORS``, ``#RGB`` / ``#RRGGBB``
        hex notation, ``rgb(r, g, b)`` (rejected when a channel exceeds 255)
        and ``rgba(r, g, b, a)`` (channels clamped to 255, alpha ignored).

        Args:
            color_str: CSS color string

        Returns:
            RGB tuple or None if parsing fails
        """
        color_str = color_str.lower().strip()

        # Named color
        if color_str in self._NAMED_COLORS:
            return self._NAMED_COLORS[color_str]

        # Hex color
        if color_str.startswith('#'):
            digits = color_str[1:]
            try:
                if len(digits) == 3:  # #RGB: each digit is doubled
                    return tuple(int(ch * 2, 16) for ch in digits)
                if len(digits) == 6:  # #RRGGBB
                    return tuple(int(digits[i:i + 2], 16) for i in (0, 2, 4))
            except ValueError:
                pass

        # rgb() color: the pattern only admits unsigned digits, so the
        # channels can never be negative; only the upper bound is checked.
        rgb_match = re.match(r'rgb\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\)', color_str)
        if rgb_match:
            channels = tuple(int(group) for group in rgb_match.groups())
            if any(channel > 255 for channel in channels):
                return None  # Invalid color values
            return channels

        # rgba() color (alpha is ignored, channels are clamped)
        rgba_match = re.match(
            r'rgba\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*[\d.]+\s*\)', color_str)
        if rgba_match:
            return tuple(min(255, int(group)) for group in rgba_match.groups())

        # Failed to parse color
        return None

    def apply_style_to_element(self, tag: str, attrs: Dict[str, str]) -> Dict[str, Any]:
        """
        Apply combined styles (tag defaults + inline styles) for an element.

        Args:
            tag: The HTML tag name
            attrs: Dictionary of tag attributes

        Returns:
            Combined style dictionary
        """
        # Start with tag-specific styles (a fresh dict, safe to mutate)
        style = self.get_tag_style(tag)
        # Override with inline styles if present
        if 'style' in attrs:
            style.update(self.parse_inline_style(attrs['style']))
        return style

View File

@ -0,0 +1,163 @@
"""
HTML text processing for pyWebLayout.
This module provides specialized functionality for handling text content,
entity references, and word creation in HTML documents.
"""
from typing import Optional
from pyWebLayout.abstract.inline import Word
from pyWebLayout.abstract.block import Parapgraph
from pyWebLayout.io.readers.html_style import HTMLStyleManager
class HTMLTextProcessor:
    """
    Processes text content during HTML parsing.

    This class handles text buffering, entity resolution, and word creation
    with proper styling applied.
    """

    def __init__(self, style_manager: "HTMLStyleManager"):
        """
        Initialize the text processor.

        Args:
            style_manager: The style manager for creating styled words
        """
        self._style_manager = style_manager
        self._text_buffer = ""
        self._current_paragraph: Optional[Parapgraph] = None

    def reset(self):
        """Reset the text processor state."""
        self._text_buffer = ""
        self._current_paragraph = None

    def set_current_paragraph(self, paragraph: Optional["Parapgraph"]):
        """
        Set the current paragraph for text output.

        Args:
            paragraph: The paragraph to receive text, or None
        """
        self._current_paragraph = paragraph

    def add_text(self, text: str):
        """
        Add text to the buffer.

        Args:
            text: The text to add
        """
        self._text_buffer += text

    def add_entity_reference(self, name: str):
        """
        Add an HTML entity reference to the buffer.

        The name is resolved through the standard library's HTML5 entity
        table, so every named character reference (``lt``, ``amp``,
        ``mdash``, ``euro``, ...) is supported. ``nbsp`` resolves to U+00A0,
        which flush_text() still treats as a word separator. Unknown names
        are kept verbatim as ``&name;``.

        Args:
            name: The entity name (e.g., 'lt', 'gt', 'amp')
        """
        # Local import keeps this class self-contained; the table's keys
        # carry the trailing semicolon (e.g. 'amp;').
        from html.entities import html5

        self._text_buffer += html5.get(f'{name};', f'&{name};')

    def add_character_reference(self, name: str):
        """
        Add a character reference to the buffer.

        Args:
            name: The character reference (decimal, or hex prefixed with
                'x' or 'X'); invalid references are kept verbatim as
                ``&#name;``
        """
        try:
            if name[:1] in ('x', 'X'):
                # Hexadecimal reference
                char = chr(int(name[1:], 16))
            else:
                # Decimal reference
                char = chr(int(name))
        except (ValueError, OverflowError):
            # Invalid or out-of-range character reference
            char = f'&#{name};'
        self._text_buffer += char

    def flush_text(self) -> bool:
        """
        Flush the text buffer, creating words as needed.

        The buffer is split on whitespace and each piece is appended to the
        current paragraph as a Word styled with the style manager's current
        font. The buffer is always emptied, even when nothing is emitted.

        Returns:
            True if text was flushed, False if the buffer was empty (or
            whitespace only) or no paragraph is set
        """
        text = self._text_buffer.strip()
        self._text_buffer = ""

        # Explicit None check: an empty paragraph must still receive text
        # even if the paragraph type evaluates as falsy when empty.
        if not text or self._current_paragraph is None:
            return False

        for word_text in text.split():
            font = self._style_manager.create_font()
            self._current_paragraph.add_word(Word(word_text, font))
        return True

    def has_pending_text(self) -> bool:
        """
        Check if there is pending text in the buffer.

        Returns:
            True if there is non-whitespace text waiting to be flushed
        """
        return bool(self._text_buffer.strip())

    def get_buffer_content(self) -> str:
        """
        Get the current buffer content without flushing.

        Returns:
            The current text buffer content
        """
        return self._text_buffer

    def clear_buffer(self):
        """Clear the text buffer without creating words."""
        self._text_buffer = ""

11
pyWebLayout/layout.py Normal file
View File

@ -0,0 +1,11 @@
from enum import Enum
class Alignment(Enum):
    """
    Enum for alignment options used in layout and rendering.
    """
    # NOTE(review): LEFT/CENTER/RIGHT look like horizontal options and
    # TOP/BOTTOM vertical ones (CENTER is passed for both halign and valign
    # elsewhere in the package) -- confirm against the renderers.
    LEFT = 1
    CENTER = 2
    RIGHT = 3
    TOP = 4
    BOTTOM = 5
    JUSTIFY = 6

View File

@ -0,0 +1 @@
## list languages

176
pyWebLayout/style.py Normal file
View File

@ -0,0 +1,176 @@
# This module should contain classes describing how different objects can be rendered, e.g. bold, italic, regular.
from PIL import ImageFont
from enum import Enum
from typing import Tuple, Union, Optional
class FontWeight(Enum):
    """Weight of a font: normal or bold."""
    NORMAL = "normal"
    BOLD = "bold"
class FontStyle(Enum):
    """Slant style of a font: normal (upright) or italic."""
    NORMAL = "normal"
    ITALIC = "italic"
class TextDecoration(Enum):
    """Decoration applied to text: none, underline, or strikethrough."""
    NONE = "none"
    UNDERLINE = "underline"
    STRIKETHROUGH = "strikethrough"
class Font:
    """
    Font class to manage text rendering properties including font face, size, color, and styling.
    This class is used by the text renderer to determine how to render text.
    """

    def __init__(self,
                 font_path: Optional[str] = None,
                 font_size: int = 12,
                 colour: Tuple[int, int, int] = (0, 0, 0),
                 weight: FontWeight = FontWeight.NORMAL,
                 style: FontStyle = FontStyle.NORMAL,
                 decoration: TextDecoration = TextDecoration.NONE,
                 background: Optional[Tuple[int, int, int, int]] = None,
                 langauge = "en_EN"):
        """
        Initialize a Font object with the specified properties.

        Args:
            font_path: Path to the font file (.ttf, .otf). If None, uses default font.
            font_size: Size of the font in points.
            colour: RGB color tuple for the text.
            weight: Font weight (normal or bold).
            style: Font style (normal or italic).
            decoration: Text decoration (none, underline, or strikethrough).
            background: RGBA background color for the text. If None, transparent background.
            langauge: Language code stored on the font as ``language``. The
                parameter name is misspelled but kept for backward compatibility.
        """
        self._font_path = font_path
        self._font_size = font_size
        self._colour = colour
        self._weight = weight
        self._style = style
        self._decoration = decoration
        self._background = background if background else (255, 255, 255, 0)
        self.language = langauge
        # Load the font file or use default
        self._load_font()

    def _load_font(self):
        """Load the font using PIL's ImageFont, falling back to PIL's default font on failure."""
        try:
            if self._font_path:
                self._font = ImageFont.truetype(self._font_path, self._font_size)
            else:
                self._font = self._load_default_font()
        except Exception as e:
            # Deliberate best effort: a missing or broken font file must not
            # prevent rendering, so report it and use the default font.
            print(f"Error loading font: {e}")
            self._font = ImageFont.load_default()

    def _load_default_font(self):
        """Return PIL's default font, scaled to the requested size when Pillow supports it."""
        try:
            # Pillow >= 10.1 accepts a size for its bundled default font.
            return ImageFont.load_default(size=self._font_size)
        except TypeError:
            # Older Pillow: load_default() takes no size argument and
            # returns a fixed-size font.
            return ImageFont.load_default()

    @property
    def font(self):
        """Get the PIL ImageFont object"""
        return self._font

    @property
    def font_size(self):
        """Get the font size"""
        return self._font_size

    @property
    def colour(self):
        """Get the text color"""
        return self._colour

    @property
    def color(self):
        """Alias for colour (American spelling)"""
        return self._colour

    @property
    def background(self):
        """Get the background color"""
        return self._background

    @property
    def weight(self):
        """Get the font weight"""
        return self._weight

    @property
    def style(self):
        """Get the font style"""
        return self._style

    @property
    def decoration(self):
        """Get the text decoration"""
        return self._decoration

    def _derive(self, **overrides):
        """Create a new Font carrying over every property of this one (including language) except the overrides."""
        params = dict(
            font_path=self._font_path,
            font_size=self._font_size,
            colour=self._colour,
            weight=self._weight,
            style=self._style,
            decoration=self._decoration,
            background=self._background,
            langauge=self.language,
        )
        params.update(overrides)
        return Font(**params)

    def with_size(self, size: int):
        """Create a new Font object with modified size"""
        return self._derive(font_size=size)

    def with_colour(self, colour: Tuple[int, int, int]):
        """Create a new Font object with modified colour"""
        return self._derive(colour=colour)

    def with_weight(self, weight: FontWeight):
        """Create a new Font object with modified weight"""
        return self._derive(weight=weight)

    def with_style(self, style: FontStyle):
        """Create a new Font object with modified style"""
        return self._derive(style=style)

    def with_decoration(self, decoration: TextDecoration):
        """Create a new Font object with modified decoration"""
        return self._derive(decoration=decoration)

View File

@ -0,0 +1,17 @@
"""
Styling module for the pyWebLayout library.
This package contains styling-related components including:
- Font handling and text styling
- Color management
- Text decoration and formatting
- Alignment and positioning properties
"""
# Import alignment options
from pyWebLayout.style.alignment import Alignment
# Import font-related classes
from pyWebLayout.style.fonts import (
Font, FontWeight, FontStyle, TextDecoration
)

View File

@ -0,0 +1,16 @@
"""
Alignment options for text and elements in the pyWebLayout library.
"""
from enum import Enum
class Alignment(Enum):
    """
    Enum for alignment options used in layout and rendering.
    """
    # NOTE(review): LEFT/CENTER/RIGHT look like horizontal options and
    # TOP/BOTTOM vertical ones -- confirm against the renderers.
    LEFT = 1
    CENTER = 2
    RIGHT = 3
    TOP = 4
    BOTTOM = 5
    JUSTIFY = 6

176
pyWebLayout/style/fonts.py Normal file
View File

@ -0,0 +1,176 @@
# This module should contain classes describing how different objects can be rendered, e.g. bold, italic, regular.
from PIL import ImageFont
from enum import Enum
from typing import Tuple, Union, Optional
class FontWeight(Enum):
    """Weight of a font: normal or bold."""
    NORMAL = "normal"
    BOLD = "bold"
class FontStyle(Enum):
    """Slant style of a font: normal (upright) or italic."""
    NORMAL = "normal"
    ITALIC = "italic"
class TextDecoration(Enum):
    """Decoration applied to text: none, underline, or strikethrough."""
    NONE = "none"
    UNDERLINE = "underline"
    STRIKETHROUGH = "strikethrough"
class Font:
    """
    Font class to manage text rendering properties including font face, size, color, and styling.
    This class is used by the text renderer to determine how to render text.
    """

    def __init__(self,
                 font_path: Optional[str] = None,
                 font_size: int = 12,
                 colour: Tuple[int, int, int] = (0, 0, 0),
                 weight: FontWeight = FontWeight.NORMAL,
                 style: FontStyle = FontStyle.NORMAL,
                 decoration: TextDecoration = TextDecoration.NONE,
                 background: Optional[Tuple[int, int, int, int]] = None,
                 langauge = "en_EN"):
        """
        Initialize a Font object with the specified properties.

        Args:
            font_path: Path to the font file (.ttf, .otf). If None, uses default font.
            font_size: Size of the font in points.
            colour: RGB color tuple for the text.
            weight: Font weight (normal or bold).
            style: Font style (normal or italic).
            decoration: Text decoration (none, underline, or strikethrough).
            background: RGBA background color for the text. If None, transparent background.
            langauge: Language code stored on the font as ``language``. The
                parameter name is misspelled but kept for backward compatibility.
        """
        self._font_path = font_path
        self._font_size = font_size
        self._colour = colour
        self._weight = weight
        self._style = style
        self._decoration = decoration
        self._background = background if background else (255, 255, 255, 0)
        self.language = langauge
        # Load the font file or use default
        self._load_font()

    def _load_font(self):
        """Load the font using PIL's ImageFont, falling back to PIL's default font on failure."""
        try:
            if self._font_path:
                self._font = ImageFont.truetype(self._font_path, self._font_size)
            else:
                self._font = self._load_default_font()
        except Exception as e:
            # Deliberate best effort: a missing or broken font file must not
            # prevent rendering, so report it and use the default font.
            print(f"Error loading font: {e}")
            self._font = ImageFont.load_default()

    def _load_default_font(self):
        """Return PIL's default font, scaled to the requested size when Pillow supports it."""
        try:
            # Pillow >= 10.1 accepts a size for its bundled default font.
            return ImageFont.load_default(size=self._font_size)
        except TypeError:
            # Older Pillow: load_default() takes no size argument and
            # returns a fixed-size font.
            return ImageFont.load_default()

    @property
    def font(self):
        """Get the PIL ImageFont object"""
        return self._font

    @property
    def font_size(self):
        """Get the font size"""
        return self._font_size

    @property
    def colour(self):
        """Get the text color"""
        return self._colour

    @property
    def color(self):
        """Alias for colour (American spelling)"""
        return self._colour

    @property
    def background(self):
        """Get the background color"""
        return self._background

    @property
    def weight(self):
        """Get the font weight"""
        return self._weight

    @property
    def style(self):
        """Get the font style"""
        return self._style

    @property
    def decoration(self):
        """Get the text decoration"""
        return self._decoration

    def _derive(self, **overrides):
        """Create a new Font carrying over every property of this one (including language) except the overrides."""
        params = dict(
            font_path=self._font_path,
            font_size=self._font_size,
            colour=self._colour,
            weight=self._weight,
            style=self._style,
            decoration=self._decoration,
            background=self._background,
            langauge=self.language,
        )
        params.update(overrides)
        return Font(**params)

    def with_size(self, size: int):
        """Create a new Font object with modified size"""
        return self._derive(font_size=size)

    def with_colour(self, colour: Tuple[int, int, int]):
        """Create a new Font object with modified colour"""
        return self._derive(colour=colour)

    def with_weight(self, weight: FontWeight):
        """Create a new Font object with modified weight"""
        return self._derive(weight=weight)

    def with_style(self, style: FontStyle):
        """Create a new Font object with modified style"""
        return self._derive(style=style)

    def with_decoration(self, decoration: TextDecoration):
        """Create a new Font object with modified decoration"""
        return self._derive(decoration=decoration)

137
pyWebLayout/table.py Normal file
View File

@ -0,0 +1,137 @@
from pyWebLayout.base import Renderable
from .concrete.box import Box
from pyWebLayout.layout import Alignment
import numpy as np
from PIL import Image, ImageDraw
from typing import List, Tuple, Optional
class TableCell(Box):
    """A single table cell: a Box that holds optional content and draws a 1px black border."""

    def __init__(self, origin, size, content: Optional[Renderable] = None,
                 callback=None, sheet=None, mode=None,
                 halign=Alignment.CENTER, valign=Alignment.CENTER,
                 padding: Tuple[int, int, int, int] = (5, 5, 5, 5)):
        """
        Initialize a table cell.

        Args:
            origin: Top-left corner coordinates
            size: Width and height of the cell
            content: Optional renderable content to place in the cell
            callback: Optional callback function
            sheet: Optional image sheet
            mode: Optional image mode
            halign: Horizontal alignment
            valign: Vertical alignment
            padding: Padding as (top, right, bottom, left)
        """
        super().__init__(origin, size, callback, sheet, mode, halign, valign)
        # Padding order is (top, right, bottom, left).
        self._padding = padding
        self._content = content

    def set_content(self, content: Renderable):
        """Set the content of this cell"""
        self._content = content

    def render(self) -> Image:
        """
        Render the cell and its border.

        The base Box canvas is produced first and a 1px black outline is
        drawn along its edge. The stored content is not drawn here.
        """
        cell_canvas = super().render()
        # The outline's far corner is one pixel inside the cell size.
        far_corner = tuple(self._size - np.array([1, 1]))
        pen = ImageDraw.Draw(cell_canvas)
        pen.rectangle([(0, 0), far_corner], outline=(0, 0, 0), width=1)
        return cell_canvas
class Table(Box):
    """A grid of equally sized TableCell objects rendered onto a single Box canvas."""

    def __init__(self, rows: int, columns: int, origin, size,
                 cell_padding: Tuple[int, int, int, int] = (5, 5, 5, 5),
                 callback=None, sheet=None, mode=None,
                 halign=Alignment.CENTER, valign=Alignment.CENTER):
        """
        Initialize a table with specified number of rows and columns.

        Args:
            rows: Number of rows in the table
            columns: Number of columns in the table
            origin: Top-left corner coordinates
            size: Width and height of the table
            cell_padding: Padding for each cell as (top, right, bottom, left)
            callback: Optional callback function
            sheet: Optional image sheet
            mode: Optional image mode
            halign: Horizontal alignment
            valign: Vertical alignment
        """
        super().__init__(origin, size, callback, sheet, mode, halign, valign)
        self._rows = rows
        self._columns = columns
        self._cell_padding = cell_padding

        # Every cell gets the same integer size; leftover pixels from the
        # floor division stay unused at the right/bottom edge.
        cell_width = size[0] // columns
        cell_height = size[1] // rows

        # Row-major grid: self._cells[row][col].
        self._cells: List[List[TableCell]] = [
            [
                TableCell(
                    origin=np.array([col * cell_width, row * cell_height]),
                    size=np.array([cell_width, cell_height]),
                    sheet=sheet,
                    mode=mode,
                    halign=halign,
                    valign=valign,
                    padding=cell_padding
                )
                for col in range(columns)
            ]
            for row in range(rows)
        ]

    def add_to_cell(self, x: int, y: int, content: Renderable):
        """
        Add content to a specific cell in the table.

        Args:
            x: Column index (0-based)
            y: Row index (0-based)
            content: Renderable content to add to the cell

        Raises:
            IndexError: If the indices fall outside the table
        """
        inside = 0 <= y < self._rows and 0 <= x < self._columns
        if not inside:
            raise IndexError(f"Cell indices ({x}, {y}) out of range. Table is {self._columns}x{self._rows}")
        self._cells[y][x].set_content(content)

    def render(self) -> Image:
        """Render the complete table with all cells"""
        table_canvas = super().render()
        for row in range(self._rows):
            for col in range(self._columns):
                rendered = self._cells[row][col].render()
                # Pixel offset of this cell inside the table canvas.
                left = col * (self._size[0] // self._columns)
                top = row * (self._size[1] // self._rows)
                # The cell image doubles as its own paste mask.
                table_canvas.paste(rendered, (left, top), rendered)
        return table_canvas

View File

@ -0,0 +1,15 @@
"""
Typesetting module for the pyWebLayout library.
This package handles the organization and arrangement of elements for rendering, including:
- Flow layout algorithms
- Container management
- Element positioning and sizing
- Content wrapping and overflow
- Coordinate systems and transformations
- Pagination for book-like content
"""
from pyWebLayout.typesetting.flow import FlowLayout
from pyWebLayout.typesetting.pagination import Paginator, PaginationState
from pyWebLayout.typesetting.document_pagination import DocumentPaginator, DocumentPaginationState

View File

@ -0,0 +1,323 @@
"""
Document-aware pagination system for pyWebLayout.
This module provides functionality for paginating Document and Book objects
across multiple pages, with the ability to stop, save state, and resume pagination.
"""
from typing import List, Tuple, Dict, Any, Optional, Iterator, Generator
import copy
import json
from pyWebLayout.core import Layoutable, Renderable
from pyWebLayout.style import Alignment
from pyWebLayout.abstract.document import Document, Book, Chapter
from pyWebLayout.abstract.block import Block
from pyWebLayout.typesetting.pagination import PaginationState, Paginator
from pyWebLayout.concrete.page import Page
class DocumentPaginationState(PaginationState):
    """
    Pagination state extended with document-structure information.

    On top of what PaginationState tracks, this records the current chapter
    and section and the set of blocks that have already been rendered.
    """

    def __init__(self):
        """Initialize a new document pagination state."""
        super().__init__()
        self.rendered_blocks = set()  # Identifiers of blocks already rendered
        self.current_section = 0
        self.current_chapter = 0

    def save(self) -> Dict[str, Any]:
        """
        Save the current pagination state to a dictionary.

        Returns:
            A dictionary representing the pagination state
        """
        snapshot = super().save()
        snapshot['current_chapter'] = self.current_chapter
        snapshot['current_section'] = self.current_section
        # Sets are not JSON-serializable, so store the members as a list.
        snapshot['rendered_blocks'] = list(self.rendered_blocks)
        return snapshot

    @classmethod
    def load(cls, state_dict: Dict[str, Any]) -> 'DocumentPaginationState':
        """
        Load pagination state from a dictionary.

        Missing document-specific keys fall back to 0 / an empty set.

        Args:
            state_dict: Dictionary containing pagination state

        Returns:
            A DocumentPaginationState object
        """
        restored = super().load(state_dict)
        restored.current_chapter = state_dict.get('current_chapter', 0)
        restored.current_section = state_dict.get('current_section', 0)
        restored.rendered_blocks = set(state_dict.get('rendered_blocks', []))
        return restored

    def to_json(self) -> str:
        """
        Convert the state to a JSON string for persistence.

        Returns:
            JSON string representation of the state
        """
        snapshot = self.save()
        return json.dumps(snapshot)

    @classmethod
    def from_json(cls, json_str: str) -> 'DocumentPaginationState':
        """
        Load state from a JSON string.

        Args:
            json_str: JSON string representation of state

        Returns:
            A DocumentPaginationState object
        """
        state_dict = json.loads(json_str)
        return cls.load(state_dict)
class DocumentPaginator:
"""
Paginator for Document and Book objects.
This class paginates Document or Book objects into a series of pages,
respecting the document structure and allowing for state tracking.
"""
def __init__(
self,
document: Document,
page_size: Tuple[int, int],
margins: Tuple[int, int, int, int] = (20, 20, 20, 20), # top, right, bottom, left
spacing: int = 5,
halign: Alignment = Alignment.LEFT,
):
"""
Initialize a document paginator.
Args:
document: The document to paginate
page_size: Size of each page (width, height)
margins: Margins for each page (top, right, bottom, left)
spacing: Spacing between elements
halign: Horizontal alignment of elements
"""
self.document = document
self.page_size = page_size
self.margins = margins
self.spacing = spacing
self.halign = halign
self.state = DocumentPaginationState()
# Preprocess document to get all blocks
self._blocks = self._collect_blocks()
def _collect_blocks(self) -> List[Block]:
"""
Collect all blocks from the document in a flat list.
For Books, this includes blocks from all chapters.
Returns:
List of blocks from the document
"""
all_blocks = []
if isinstance(self.document, Book):
# For books, process chapters
for chapter in self.document.chapters:
# Add a heading block for the chapter if it has a title
if chapter.title:
from pyWebLayout.abstract.block import Heading, HeadingLevel, Parapgraph
from pyWebLayout.abstract.inline import Word
# Create a heading for the chapter
heading = Heading(level=HeadingLevel.H1)
heading_word = Word(chapter.title)
heading.add_word(heading_word)
all_blocks.append(heading)
# Add all blocks from the chapter
all_blocks.extend(chapter.blocks)
else:
# For regular documents, just add all blocks
all_blocks.extend(self.document.blocks)
return all_blocks
def paginate(self, max_pages: Optional[int] = None) -> List[Page]:
"""
Paginate the document into pages.
Args:
max_pages: Maximum number of pages to generate (None for all)
Returns:
List of Page objects
"""
pages = []
# Reset state
self.state = DocumentPaginationState()
# Create a generator for pagination
page_generator = self._paginate_generator()
# Generate pages up to max_pages or until all content is paginated
page_count = 0
for page in page_generator:
pages.append(page)
page_count += 1
if max_pages is not None and page_count >= max_pages:
break
return pages
def paginate_next(self) -> Optional[Page]:
"""
Paginate and return the next page only.
Returns:
The next Page object, or None if no more content
"""
try:
return next(self._paginate_generator())
except StopIteration:
return None
def _paginate_generator(self) -> Generator[Page, None, None]:
    """
    Generator that yields one page at a time.

    Pagination resumes from ``self.state.current_element_index``. The state
    is updated *before* each page is yielded, so a caller may abandon the
    generator after any page and later continue with a fresh one.

    Blocks are stacked vertically between the top margin (``margins[0]``)
    and the bottom margin (``margins[2]``). For ``Book`` documents a page is
    also ended whenever the next block is a ``Heading``, which is treated as
    the start of a new chapter.

    A block that does not fit even on an empty page is reported with a
    warning and skipped; pagination then continues with the following block
    instead of stopping.

    Yields:
        A Page object for each page in the document
    """
    current_index = self.state.current_element_index
    current_chapter = self.state.current_chapter

    # Vertical extent of the content area. The cursor starts at the top
    # margin, so the limit must be the absolute bottom of the content area
    # (not the content height), otherwise the top margin is counted twice.
    top_margin = self.margins[0]
    page_bottom = self.page_size[1] - self.margins[2]

    # Chapter detection only applies to Book documents. The import stays
    # function-local (as in the original code) and is done once, not per block.
    is_book = isinstance(self.document, Book)
    if is_book:
        from pyWebLayout.abstract.block import Heading

    while current_index < len(self._blocks):
        page = Page(size=self.page_size)
        placed_any = False
        cursor_y = top_margin

        # Add blocks until the page is full, a chapter ends, or we run out.
        while current_index < len(self._blocks):
            block = self._blocks[current_index]

            # Make sure the block has an up-to-date size before measuring it.
            if hasattr(block, 'layout'):
                block.layout()
            block_height = getattr(block, 'size', (0, 0))[1]

            if cursor_y + block_height > page_bottom:
                # Block doesn't fit, leave it for the next page.
                break

            page.add_child(block)
            placed_any = True
            cursor_y += block_height + self.spacing
            self.state.rendered_blocks.add(id(block))
            current_index += 1

            # Simplified chapter-boundary check: a Heading coming up next
            # closes the current page so the chapter starts on a new one.
            if (is_book
                    and current_index < len(self._blocks)
                    and isinstance(self._blocks[current_index], Heading)):
                current_chapter += 1
                break

        if not placed_any:
            # The inner loop always runs at least once, so an empty page
            # means the block at current_index is taller than a whole page.
            # Skip it to avoid an infinite loop and keep paginating the rest.
            print(f"Warning: Block at index {current_index} is too large to fit on a page")
            current_index += 1
            self.state.current_element_index = current_index
            continue

        # Record progress before yielding so pagination can be resumed.
        self.state.current_page += 1
        self.state.current_element_index = current_index
        self.state.current_chapter = current_chapter

        page.layout()
        yield page
def get_state(self) -> Dict[str, Any]:
    """
    Export the paginator's current position.

    Returns:
        Dictionary representing pagination state
    """
    snapshot = self.state.save()
    return snapshot
def set_state(self, state: Dict[str, Any]) -> None:
    """
    Replace the paginator's position with one built from a state dictionary.

    Args:
        state: Dictionary representing pagination state
    """
    restored = DocumentPaginationState.load(state)
    self.state = restored
def is_complete(self) -> bool:
    """
    Report whether every block has been consumed by pagination.

    Returns:
        True if all blocks have been paginated, False otherwise
    """
    blocks_left = len(self._blocks) - self.state.current_element_index
    return blocks_left <= 0
def get_progress(self) -> float:
    """
    Get the pagination progress as a fraction.

    Returns:
        Fraction of blocks that have been paginated, from 0.0 to 1.0.
        A document without blocks counts as fully paginated (1.0).
    """
    total_blocks = len(self._blocks)
    if total_blocks == 0:
        return 1.0

    fraction = self.state.current_element_index / total_blocks
    # The state can be replaced wholesale via set_state(), so the stored
    # index is not guaranteed to lie within this document's block list.
    # Clamp to keep the documented 0.0-1.0 range.
    return min(1.0, max(0.0, fraction))

View File

@ -0,0 +1,155 @@
"""
Flow layout implementation for pyWebLayout.
This module provides a flow layout algorithm similar to HTML's normal flow,
where elements are positioned sequentially, wrapping to the next line when
they exceed the container width.
"""
from typing import List, Tuple, Optional, Any
import numpy as np
from pyWebLayout.core import Layoutable
from pyWebLayout.style import Alignment
class FlowLayout:
    """
    Flow layout algorithm for arranging elements in a container.

    Elements are placed left to right and wrap onto a new row when the next
    element would cross the right edge of the padded container. Each row can
    then be shifted horizontally (``halign``), and the whole stack of rows can
    be shifted vertically (``valign``) when it is shorter than the container.
    """

    @staticmethod
    def layout_elements(
        elements: List[Layoutable],
        container_size: Tuple[int, int],
        padding: Tuple[int, int, int, int] = (0, 0, 0, 0),  # top, right, bottom, left
        spacing: int = 0,
        halign: Alignment = Alignment.LEFT,
        valign: Alignment = Alignment.TOP
    ) -> List[Tuple[int, int]]:
        """
        Layout elements in a flow layout within the given container.

        Every element that has a ``layout`` method is laid out first so its
        ``size`` is current. Elements without a ``size`` attribute are
        treated as zero-sized.

        Args:
            elements: List of layoutable elements to arrange
            container_size: (width, height) tuple for the container
            padding: (top, right, bottom, left) padding inside the container
            spacing: Gap inserted between neighbouring elements in a row and
                between consecutive rows
            halign: Horizontal alignment (LEFT, CENTER, RIGHT)
            valign: Vertical alignment (TOP, CENTER, BOTTOM)

        Returns:
            List of (x, y) positions for each element, in input order
        """
        # Usable area once padding is removed.
        avail_width = container_size[0] - padding[1] - padding[3]
        avail_height = container_size[1] - padding[0] - padding[2]
        left_edge = padding[3]
        right_edge = left_edge + avail_width

        # Ensure elements are properly laid out internally before measuring.
        for element in elements:
            if hasattr(element, 'layout'):
                element.layout()

        positions = []
        current_x = left_edge
        current_y = padding[0]
        row_height = 0
        row_start_idx = 0

        # First pass - place elements left to right, wrapping into rows.
        for i, element in enumerate(elements):
            element_width, element_height = FlowLayout._element_size(element)

            # Wrap only if the row already holds something; a lone element
            # wider than the container still gets a row of its own.
            if current_x + element_width > right_edge and i > row_start_idx:
                FlowLayout._align_row(
                    positions, elements, row_start_idx, i,
                    left_edge, avail_width, halign
                )
                current_x = left_edge
                current_y += row_height + spacing
                row_height = 0
                row_start_idx = i

            positions.append((current_x, current_y))
            current_x += element_width + spacing
            row_height = max(row_height, element_height)

        # The last row is never closed by a wrap, so align it explicitly.
        if row_start_idx < len(elements):
            FlowLayout._align_row(
                positions, elements, row_start_idx, len(elements),
                left_edge, avail_width, halign
            )

        # Second pass - shift everything down when the content is shorter
        # than the container and a non-top vertical alignment is requested.
        if valign != Alignment.TOP:
            total_height = current_y + row_height - padding[0]
            if total_height < avail_height:
                offset = 0
                if valign == Alignment.CENTER:
                    offset = (avail_height - total_height) // 2
                elif valign == Alignment.BOTTOM:
                    offset = avail_height - total_height
                positions = [(x, y + offset) for x, y in positions]

        return positions

    @staticmethod
    def _element_size(element: Any) -> Tuple[int, int]:
        """Return ``(width, height)`` of *element*, or ``(0, 0)`` if it has no ``size``."""
        size = getattr(element, 'size', (0, 0))
        return size[0], size[1]

    @staticmethod
    def _align_row(
        positions: List[Tuple[int, int]],
        elements: List[Any],
        start_idx: int,
        end_idx: int,
        left_margin: int,
        avail_width: int,
        halign: Alignment
    ) -> None:
        """
        Adjust positions of elements in a row based on horizontal alignment.

        The row's width is measured from the positions already assigned to
        it, so whatever gaps were inserted between its elements are included.
        ``positions`` is modified in place.

        Args:
            positions: List of element positions to adjust
            elements: List of elements
            start_idx: Index of the first element of the row
            end_idx: Index one past the last element of the row
            left_margin: X coordinate of the left edge of the content area
            avail_width: Available width of the container
            halign: Horizontal alignment
        """
        if halign == Alignment.LEFT or end_idx <= start_idx:
            # Left-aligned rows are already in place; empty rows have
            # nothing to move.
            return

        # Span from the first element's left edge to the last element's
        # right edge. Unlike summing the element widths, this accounts for
        # the spacing between elements, so right-aligned rows end exactly
        # at the right edge and centred rows are truly centred.
        row_start_x = positions[start_idx][0]
        last_width = FlowLayout._element_size(elements[end_idx - 1])[0]
        row_width = positions[end_idx - 1][0] + last_width - row_start_x

        if halign == Alignment.CENTER:
            offset = (avail_width - row_width) // 2
        elif halign == Alignment.RIGHT:
            offset = avail_width - row_width
        else:
            # Any other alignment value leaves the row untouched.
            return

        # Move the row so that it starts `offset` to the right of the margin.
        shift = left_margin + offset - row_start_x
        for i in range(start_idx, end_idx):
            positions[i] = (positions[i][0] + shift, positions[i][1])

View File

@ -0,0 +1,231 @@
"""
Pagination system for pyWebLayout.
This module provides functionality for paginating content across multiple pages,
with the ability to stop, save state, and resume pagination.
"""
from typing import List, Tuple, Dict, Any, Optional, Iterator, Generator
import copy
from pyWebLayout.core import Layoutable
from pyWebLayout.style import Alignment
from pyWebLayout.typesetting.flow import FlowLayout
class PaginationState:
    """
    Class to hold the state of a pagination process.

    This allows pagination to be paused, saved, and resumed later.
    Dictionaries produced by :meth:`save` are independent snapshots: they
    do not change when the live state is modified afterwards, and a state
    created by :meth:`load` does not share its containers with the
    dictionary it was loaded from.
    """

    def __init__(self):
        """Initialize a new pagination state."""
        self.current_page = 0
        self.current_element_index = 0
        self.position_in_element = 0  # For elements that might be split across pages
        self.consumed_elements = []
        self.metadata = {}  # For any additional state information

    def save(self) -> Dict[str, Any]:
        """
        Save the current pagination state to a dictionary.

        Returns:
            A dictionary representing the pagination state
        """
        return {
            'current_page': self.current_page,
            'current_element_index': self.current_element_index,
            'position_in_element': self.position_in_element,
            # Copy the containers so that continuing pagination cannot
            # silently alter a snapshot taken earlier. The list is copied
            # shallowly (its items are kept as-is); metadata is free-form
            # and may be nested, so it is copied deeply.
            'consumed_elements': list(self.consumed_elements),
            'metadata': copy.deepcopy(self.metadata),
        }

    @classmethod
    def load(cls, state_dict: Dict[str, Any]) -> 'PaginationState':
        """
        Load pagination state from a dictionary.

        Missing keys fall back to the values of a freshly created state.

        Args:
            state_dict: Dictionary containing pagination state

        Returns:
            A PaginationState object
        """
        state = cls()
        state.current_page = state_dict.get('current_page', 0)
        state.current_element_index = state_dict.get('current_element_index', 0)
        state.position_in_element = state_dict.get('position_in_element', 0)
        # Copy for the same reason as in save(): the restored state must not
        # write through to the caller's dictionary.
        state.consumed_elements = list(state_dict.get('consumed_elements') or [])
        state.metadata = copy.deepcopy(state_dict.get('metadata') or {})
        return state
class Paginator:
    """
    Class for paginating content across multiple pages.

    Elements are stacked vertically inside the page margins, one below the
    other, each positioned horizontally according to ``halign``. Progress is
    kept in a :class:`PaginationState` so pagination can be stopped, saved
    and resumed.
    """

    def __init__(
        self,
        elements: List[Layoutable],
        page_size: Tuple[int, int],
        margins: Tuple[int, int, int, int] = (20, 20, 20, 20),  # top, right, bottom, left
        spacing: int = 5,
        halign: Alignment = Alignment.LEFT,
    ):
        """
        Initialize a paginator.

        Args:
            elements: List of elements to paginate
            page_size: Size of each page (width, height)
            margins: Margins for each page (top, right, bottom, left)
            spacing: Vertical spacing between consecutive elements
            halign: Horizontal alignment of elements
        """
        self.elements = elements
        self.page_size = page_size
        self.margins = margins
        self.spacing = spacing
        self.halign = halign
        self.state = PaginationState()

    def paginate(self, max_pages: Optional[int] = None) -> List[List[Tuple[Layoutable, Tuple[int, int]]]]:
        """
        Paginate all content into pages.

        The pagination state is reset first, so this always starts from the
        first element.

        Args:
            max_pages: Maximum number of pages to generate (None for all)

        Returns:
            List of pages, where each page is a list of (element, position) tuples
        """
        # Reset state
        self.state = PaginationState()

        pages = []
        if max_pages is not None and max_pages <= 0:
            # Nothing requested; don't generate (and consume) a page.
            return pages

        for page in self._paginate_generator():
            pages.append(page)
            if max_pages is not None and len(pages) >= max_pages:
                break
        return pages

    def paginate_next(self) -> Optional[List[Tuple[Layoutable, Tuple[int, int]]]]:
        """
        Paginate and return the next page only.

        Continues from the position stored in ``self.state``.

        Returns:
            A list of (element, position) tuples for the next page, or None if no more content
        """
        return next(self._paginate_generator(), None)

    def _paginate_generator(self) -> Generator[List[Tuple[Layoutable, Tuple[int, int]]], None, None]:
        """
        Generator that yields one page at a time.

        The state is updated before each page is yielded, so the generator
        can be abandoned after any page and pagination resumed later. An
        element that does not fit even on an empty page is reported with a
        warning and skipped; pagination then continues with the next element.

        Yields:
            A list of (element, position) tuples for each page
        """
        # Calculate available space on a page
        avail_width = self.page_size[0] - self.margins[1] - self.margins[3]
        avail_height = self.page_size[1] - self.margins[0] - self.margins[2]
        page_bottom = self.margins[0] + avail_height

        current_index = self.state.current_element_index

        while current_index < len(self.elements):
            # Start a new page
            page_elements = []
            current_y = self.margins[0]

            # Fill the page with elements
            while current_index < len(self.elements):
                element = self.elements[current_index]

                # Ensure element is laid out properly before measuring it
                if hasattr(element, 'layout'):
                    element.layout()
                element_width = element.size[0] if hasattr(element, 'size') else 0
                element_height = element.size[1] if hasattr(element, 'size') else 0

                if current_y + element_height > page_bottom:
                    # Element doesn't fit, move to next page
                    break

                element_x = self._aligned_x(element_width, avail_width)
                page_elements.append((element, (element_x, current_y)))

                current_index += 1
                current_y += element_height + self.spacing

            if not page_elements:
                # The inner loop always runs at least once, so an empty page
                # means this element is taller than a whole page. Skip it to
                # avoid an infinite loop, then keep going with the rest.
                skipped_index = current_index
                current_index += 1
                self.state.current_element_index = current_index
                print(f"Warning: Element at index {skipped_index} is too large to fit on a page")
                continue

            # Record progress before yielding so pagination can be resumed.
            self.state.current_page += 1
            self.state.current_element_index = current_index
            yield page_elements

    def _aligned_x(self, element_width: int, avail_width: int) -> int:
        """Return the x coordinate for an element of the given width under ``self.halign``."""
        left = self.margins[3]
        if self.halign == Alignment.CENTER:
            return left + (avail_width - element_width) // 2
        if self.halign == Alignment.RIGHT:
            return left + (avail_width - element_width)
        # LEFT, and any other value, falls back to left alignment.
        return left

    def get_state(self) -> Dict[str, Any]:
        """
        Get the current pagination state.

        Returns:
            Dictionary representing pagination state
        """
        return self.state.save()

    def set_state(self, state: Dict[str, Any]) -> None:
        """
        Set the pagination state.

        Args:
            state: Dictionary representing pagination state
        """
        self.state = PaginationState.load(state)

18
pyproject.toml Normal file
View File

@ -0,0 +1,18 @@
[build-system]
requires = ["setuptools>=42", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "pyWebLayout"
description = "A Python library for HTML-like layout and rendering"
readme = "README.md"
requires-python = ">=3.6"
license = {file = "LICENSE"}
authors = [
{name = "Duncan Tourolle", email = "duncan@tourolle.paris"}
]
dynamic = ["version"]
dependencies = [
"Pillow",
"numpy",
]

23
setup.cfg Normal file
View File

@ -0,0 +1,23 @@
[metadata]
name = pyWebLayout
version = 0.1.0
author = Duncan Tourolle
author_email = duncan@tourolle.paris
description = A Python library for HTML-like layout and rendering
long_description = file: README.md
long_description_content_type = text/markdown
url = https://gitea.tourolle.paris/pyWebLayout
classifiers =
Programming Language :: Python :: 3
License :: OSI Approved :: MIT License
Operating System :: OS Independent
[options]
packages = find:
python_requires = >=3.6
install_requires =
Pillow
numpy
[options.packages.find]
include = pyWebLayout*

32
setup.py Normal file
View File

@ -0,0 +1,32 @@
from setuptools import setup, find_packages
# Read the long description up front: the context manager closes the file
# handle deterministically, and the explicit encoding keeps the build from
# depending on the platform's default text encoding.
with open("README.md", encoding="utf-8") as readme_file:
    long_description = readme_file.read()

setup(
    name="pyWebLayout",
    version="0.1.0",
    packages=find_packages(),
    install_requires=[
        "Pillow",
        "numpy",
    ],
    extras_require={
        "test": [
            "coverage>=5.0",
        ],
        "dev": [
            "coverage>=5.0",
            "pytest>=6.0",
        ],
    },
    author="Duncan Tourolle",
    author_email="duncan@tourolle.paris",
    description="A Python library for HTML-like layout and rendering",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://gitea.tourolle.paris/pyWebLayout",
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires=">=3.6",
)

299
tests/TESTING_STRATEGY.md Normal file
View File

@ -0,0 +1,299 @@
# pyWebLayout Testing Strategy
This document outlines the comprehensive unit testing strategy for the pyWebLayout project.
## Testing Philosophy
The testing strategy follows these principles:
- **Separation of Concerns**: Each component is tested independently
- **Comprehensive Coverage**: All public APIs and critical functionality are tested
- **Integration Testing**: End-to-end workflows are validated
- **Regression Prevention**: Tests prevent breaking changes
- **Documentation**: Tests serve as living documentation of expected behavior
## Test Organization
### Current Test Files (Implemented)
#### ✅ `test_html_style.py`
Tests the `HTMLStyleManager` class for CSS parsing and style management.
**Coverage:**
- Style initialization and defaults
- Style stack operations (push/pop)
- CSS property parsing (font-size, font-weight, colors, etc.)
- Color parsing (named, hex, rgb, rgba)
- Tag-specific default styles
- Inline style parsing
- Font object creation
- Style combination (tag + inline styles)
#### ✅ `test_html_text.py`
Tests the `HTMLTextProcessor` class for text buffering and word creation.
**Coverage:**
- Text buffer management
- HTML entity reference handling
- Character reference processing (decimal/hex)
- Word creation with styling
- Paragraph management
- Text flushing operations
- Buffer state operations
#### ✅ `test_html_content.py`
Integration tests for the `HTMLContentReader` class covering complete HTML parsing.
**Coverage:**
- Simple paragraph parsing
- Heading levels (h1-h6)
- Styled text (bold, italic)
- Lists (ul, ol, dl)
- Tables with headers and cells
- Blockquotes with nested content
- Code blocks with language detection
- HTML entities
- Nested element structures
- Complex document parsing
#### ✅ `test_abstract_blocks.py`
Tests for the core abstract block element classes.
**Coverage:**
- Paragraph word management
- Heading levels and properties
- Quote nesting capabilities
- Code block line management
- List creation and item handling
- Table structure (rows, cells, sections)
- Image properties and scaling
- Simple elements (hr, br)
#### ✅ `test_runner.py`
Test runner script for executing all tests with summary reporting.
---
## Additional Tests Needed
### 🔄 High Priority (Should Implement Next)
#### `test_abstract_inline.py`
Tests for inline elements and text formatting.
**Needed Coverage:**
- Word creation and properties
- Word hyphenation functionality
- FormattedSpan management
- Word chaining (previous/next relationships)
- Font style application
- Language-specific hyphenation
#### `test_abstract_document.py`
Tests for document structure and metadata.
**Needed Coverage:**
- Document creation and initialization
- Metadata management (title, author, language, etc.)
- Block addition and management
- Anchor creation and resolution
- Resource management
- Table of contents generation
- Chapter and book structures
#### `test_abstract_functional.py`
Tests for functional elements (links, buttons, forms).
**Needed Coverage:**
- Link creation and type detection
- Link execution for different types
- Button functionality and state
- Form field management
- Form validation and submission
- Parameter handling
#### `test_style_system.py`
Tests for the style system (fonts, colors, alignment).
**Needed Coverage:**
- Font creation and properties
- Color representation and manipulation
- Font weight, style, decoration enums
- Alignment enums and behavior
- Style inheritance and cascading
### 🔧 Medium Priority
#### `test_html_elements.py`
Unit tests for the HTML element handlers.
**Needed Coverage:**
- BlockElementHandler individual methods
- ListElementHandler state management
- TableElementHandler complex scenarios
- InlineElementHandler link processing
- Handler coordination and delegation
- Error handling in handlers
#### `test_html_metadata.py`
Tests for HTML metadata extraction.
**Needed Coverage:**
- Meta tag parsing
- Open Graph extraction
- JSON-LD structured data
- Title and description extraction
- Language detection
- Character encoding handling
#### `test_html_resources.py`
Tests for HTML resource extraction.
**Needed Coverage:**
- CSS stylesheet extraction
- JavaScript resource identification
- Image source collection
- Media element detection
- External resource resolution
- Base URL handling
#### `test_io_base.py`
Tests for the base reader architecture.
**Needed Coverage:**
- BaseReader interface compliance
- MetadataReader abstract methods
- ContentReader abstract methods
- ResourceReader abstract methods
- CompositeReader coordination
### 🔍 Lower Priority
#### `test_concrete_elements.py`
Tests for concrete rendering implementations.
**Needed Coverage:**
- Box model calculations
- Text rendering specifics
- Image rendering and scaling
- Page layout management
- Functional element rendering
#### `test_typesetting.py`
Tests for the typesetting system.
**Needed Coverage:**
- Flow algorithms
- Pagination logic
- Document pagination
- Line breaking
- Hyphenation integration
#### `test_epub_reader.py`
Tests for EPUB reading functionality.
**Needed Coverage:**
- EPUB file structure parsing
- Manifest processing
- Chapter extraction
- Metadata reading
- Navigation document parsing
#### `test_integration.py`
End-to-end integration tests.
**Needed Coverage:**
- Complete HTML-to-document workflows
- EPUB-to-document workflows
- Style application across parsers
- Resource resolution chains
- Error handling scenarios
## Testing Infrastructure
### Test Dependencies
```python
# Required for testing
unittest # Built-in Python testing framework
unittest.mock # For mocking and test doubles
```
### Test Data
- Create `tests/data/` directory with sample files:
- `sample.html` - Well-formed HTML document
- `complex.html` - Complex nested HTML
- `malformed.html` - Edge cases and error conditions
- `sample.epub` - Sample EPUB file
- `test_images/` - Sample images for testing
### Continuous Integration
- Tests should run on Python 3.6+
- All tests must pass before merging
- Aim for >90% code coverage
- Performance regression testing for parsing speed
## Running Tests
### Run All Tests
```bash
python tests/test_runner.py
```
### Run Specific Test Module
```bash
python tests/test_runner.py html_style
python -m unittest tests.test_html_style
```
### Run Individual Test
```bash
python -m unittest tests.test_html_style.TestHTMLStyleManager.test_color_parsing
```
### Run with Coverage
```bash
pip install coverage
coverage run -m unittest discover tests/
coverage report -m
coverage html # Generate HTML report
```
## Test Quality Guidelines
### Test Naming
- Test files: `test_<module_name>.py`
- Test classes: `Test<ClassName>`
- Test methods: `test_<specific_functionality>`
### Test Structure
1. **Arrange**: Set up test data and mocks
2. **Act**: Execute the functionality being tested
3. **Assert**: Verify the expected behavior
### Mock Usage
- Mock external dependencies (file I/O, network)
- Mock complex objects when testing units in isolation
- Prefer real objects for integration tests
### Edge Cases
- Empty inputs
- Invalid inputs
- Boundary conditions
- Error scenarios
- Performance edge cases
## Success Metrics
- **Coverage**: >90% line coverage across all modules
- **Performance**: No test takes longer than 1 second
- **Reliability**: Tests pass consistently across environments
- **Maintainability**: Tests are easy to understand and modify
- **Documentation**: Tests clearly show expected behavior
## Implementation Priority
1. **Week 1**: Complete high-priority abstract tests
2. **Week 2**: Implement HTML processing component tests
3. **Week 3**: Add integration and end-to-end tests
4. **Week 4**: Performance and edge case testing
This testing strategy ensures comprehensive coverage of the pyWebLayout library while maintaining good separation of concerns and providing clear documentation of expected behavior.

6
tests/__init__.py Normal file
View File

@ -0,0 +1,6 @@
"""
Test suite for pyWebLayout.
This package contains comprehensive unit tests for all components of the pyWebLayout library,
organized by module and functionality.
"""

View File

@ -0,0 +1,275 @@
"""
Unit tests for abstract block elements.
Tests the core abstract block classes that form the foundation of the document model.
"""
import unittest
from pyWebLayout.abstract.block import (
Block, BlockType, Parapgraph, Heading, HeadingLevel, Quote, CodeBlock,
HList, ListStyle, ListItem, Table, TableRow, TableCell,
HorizontalRule, LineBreak, Image
)
from pyWebLayout.abstract.inline import Word
from pyWebLayout.style import Font
class TestBlockElements(unittest.TestCase):
    """Checks for the abstract block classes of the document model."""

    def test_paragraph_creation(self):
        """A new paragraph starts empty and parentless and collects added words."""
        para = Parapgraph()
        self.assertEqual(para.block_type, BlockType.PARAGRAPH)
        self.assertEqual(para.word_count, 0)
        self.assertIsNone(para.parent)

        # Build both words first, then attach them.
        shared_font = Font()
        new_words = [Word(text, shared_font) for text in ("Hello", "World")]
        for new_word in new_words:
            para.add_word(new_word)
        self.assertEqual(para.word_count, 2)

        # Iteration yields entries whose second item is the word.
        entries = list(para.words())
        self.assertEqual(len(entries), 2)
        self.assertEqual(entries[0][1].text, "Hello")
        self.assertEqual(entries[1][1].text, "World")

    def test_heading_levels(self):
        """Headings keep the level they were created with and allow changing it."""
        top = Heading(HeadingLevel.H1)
        middle = Heading(HeadingLevel.H3)
        deepest = Heading(HeadingLevel.H6)

        expectations = (
            (top, HeadingLevel.H1),
            (middle, HeadingLevel.H3),
            (deepest, HeadingLevel.H6),
        )
        for heading, expected_level in expectations:
            self.assertEqual(heading.level, expected_level)
        self.assertEqual(top.block_type, BlockType.HEADING)

        # The level is writable.
        top.level = HeadingLevel.H2
        self.assertEqual(top.level, HeadingLevel.H2)

    def test_quote_nesting(self):
        """Blocks added to a quote get it as parent and are iterated in order."""
        blockquote = Quote()

        first_child = Parapgraph()
        second_child = Parapgraph()
        blockquote.add_block(first_child)
        blockquote.add_block(second_child)

        self.assertEqual(first_child.parent, blockquote)
        self.assertEqual(second_child.parent, blockquote)

        children = list(blockquote.blocks())
        self.assertEqual(len(children), 2)
        self.assertEqual(children[0], first_child)
        self.assertEqual(children[1], second_child)

    def test_code_block(self):
        """Code blocks track language and lines."""
        signature_line = "def hello():"
        body_line = " print('Hello!')"

        snippet = CodeBlock("python")
        self.assertEqual(snippet.language, "python")
        self.assertEqual(snippet.line_count, 0)

        snippet.add_line(signature_line)
        snippet.add_line(body_line)
        self.assertEqual(snippet.line_count, 2)

        # Iteration yields entries whose second item is the line text.
        stored = list(snippet.lines())
        self.assertEqual(len(stored), 2)
        self.assertEqual(stored[0][1], signature_line)
        self.assertEqual(stored[1][1], body_line)

        # The language is writable.
        snippet.language = "javascript"
        self.assertEqual(snippet.language, "javascript")

    def test_list_creation(self):
        """Lists count their items, parent them, and allow a style change."""
        bullet_list = HList(ListStyle.UNORDERED)
        self.assertEqual(bullet_list.style, ListStyle.UNORDERED)
        self.assertEqual(bullet_list.item_count, 0)

        first_entry = ListItem()
        second_entry = ListItem()
        bullet_list.add_item(first_entry)
        bullet_list.add_item(second_entry)

        self.assertEqual(bullet_list.item_count, 2)
        self.assertEqual(first_entry.parent, bullet_list)
        self.assertEqual(second_entry.parent, bullet_list)

        self.assertEqual(len(list(bullet_list.items())), 2)

        bullet_list.style = ListStyle.ORDERED
        self.assertEqual(bullet_list.style, ListStyle.ORDERED)

    def test_definition_list(self):
        """List items of a definition list carry a modifiable term."""
        glossary = HList(ListStyle.DEFINITION)

        python_entry = ListItem(term="Python")
        js_entry = ListItem(term="JavaScript")
        glossary.add_item(python_entry)
        glossary.add_item(js_entry)

        self.assertEqual(python_entry.term, "Python")
        self.assertEqual(js_entry.term, "JavaScript")

        python_entry.term = "Python 3"
        self.assertEqual(python_entry.term, "Python 3")

    def test_table_structure(self):
        """Tables count rows per section; cells expose header flag and spans."""
        grid = Table(caption="Test Table")
        self.assertEqual(grid.caption, "Test Table")
        self.assertEqual(grid.row_count["total"], 0)

        heading_row = TableRow()
        content_row = TableRow()

        # Two header cells for the heading row.
        head_a = TableCell(is_header=True)
        head_b = TableCell(is_header=True)
        heading_row.add_cell(head_a)
        heading_row.add_cell(head_b)

        # Two data cells, the second spanning two columns.
        data_a = TableCell(is_header=False)
        data_b = TableCell(is_header=False, colspan=2)
        content_row.add_cell(data_a)
        content_row.add_cell(data_b)

        grid.add_row(heading_row, "header")
        grid.add_row(content_row, "body")

        counts = (("header", 1), ("body", 1), ("total", 2))
        for section_name, expected_rows in counts:
            self.assertEqual(grid.row_count[section_name], expected_rows)

        self.assertTrue(head_a.is_header)
        self.assertFalse(data_a.is_header)
        self.assertEqual(data_b.colspan, 2)
        self.assertEqual(data_b.rowspan, 1)  # not given, so the default applies

        self.assertEqual(heading_row.cell_count, 2)
        self.assertEqual(content_row.cell_count, 2)

    def test_table_sections(self):
        """Rows are grouped by section and all_rows() reports the section labels."""
        grid = Table()

        placements = [
            (TableRow(), "header"),
            (TableRow(), "body"),
            (TableRow(), "body"),
            (TableRow(), "footer"),
        ]
        for row, section_name in placements:
            grid.add_row(row, section_name)

        self.assertEqual(len(list(grid.header_rows())), 1)
        self.assertEqual(len(list(grid.body_rows())), 2)
        self.assertEqual(len(list(grid.footer_rows())), 1)

        combined = list(grid.all_rows())
        self.assertEqual(len(combined), 4)

        # Each entry unpacks into a section label and a row.
        labels = [label for label, _row in combined]
        self.assertEqual(labels, ["header", "body", "body", "footer"])

    def test_image_loading(self):
        """Image source, alt text and dimensions are readable and writable."""
        picture = Image("test.jpg", "Test image", 100, 200)
        self.assertEqual(picture.source, "test.jpg")
        self.assertEqual(picture.alt_text, "Test image")
        self.assertEqual(picture.width, 100)
        self.assertEqual(picture.height, 200)

        picture.source = "new.png"
        picture.alt_text = "New image"
        picture.width = 150
        picture.height = 300

        self.assertEqual(picture.source, "new.png")
        self.assertEqual(picture.alt_text, "New image")
        self.assertEqual(picture.width, 150)
        self.assertEqual(picture.height, 300)

        self.assertEqual(picture.get_dimensions(), (150, 300))

    def test_aspect_ratio_calculation(self):
        """Aspect ratio needs both dimensions; scaling keeps the proportions."""
        wide = Image("test.jpg", width=400, height=200)
        self.assertEqual(wide.get_aspect_ratio(), 2.0)  # 400/200

        # Height is missing here, so no ratio is reported.
        width_only = Image("test.jpg", width=300)
        self.assertIsNone(width_only.get_aspect_ratio())

        # 400x200 limited to 200x150: the width limit applies and the
        # height follows proportionally.
        fitted = wide.calculate_scaled_dimensions(max_width=200, max_height=150)
        self.assertEqual(fitted[0], 200)
        self.assertEqual(fitted[1], 100)

    def test_simple_elements(self):
        """Horizontal rule and line break report their block type and no parent."""
        rule = HorizontalRule()
        line_break = LineBreak()

        self.assertEqual(rule.block_type, BlockType.HORIZONTAL_RULE)
        self.assertEqual(line_break.block_type, BlockType.LINE_BREAK)

        self.assertIsNone(rule.parent)
        self.assertIsNone(line_break.parent)
# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()

354
tests/test_html_content.py Normal file
View File

@ -0,0 +1,354 @@
"""
Unit tests for HTML content reading.
Tests the HTMLContentReader class for parsing complete HTML documents.
This is more of an integration test covering the entire parsing pipeline.
"""
import unittest
from pyWebLayout.io.readers.html_content import HTMLContentReader
from pyWebLayout.abstract.document import Document
from pyWebLayout.abstract.block import (
Parapgraph, Heading, HeadingLevel, HList, ListStyle,
Table, Quote, CodeBlock, HorizontalRule, LineBreak
)
class TestHTMLContentReader(unittest.TestCase):
"""Test cases for HTMLContentReader."""
def setUp(self):
"""Set up test fixtures."""
self.reader = HTMLContentReader()
self.document = Document()
def test_simple_paragraph(self):
"""Test parsing a simple paragraph."""
html = '<p>Hello world!</p>'
result = self.reader.extract_content(html, self.document)
self.assertEqual(len(self.document.blocks), 1)
self.assertIsInstance(self.document.blocks[0], Parapgraph)
paragraph = self.document.blocks[0]
words = list(paragraph.words())
self.assertEqual(len(words), 2)
self.assertEqual(words[0][1].text, "Hello")
self.assertEqual(words[1][1].text, "world!")
def test_headings(self):
"""Test parsing different heading levels."""
html = '''
<h1>Heading 1</h1>
<h2>Heading 2</h2>
<h3>Heading 3</h3>
<h6>Heading 6</h6>
'''
self.reader.extract_content(html, self.document)
# Should have 4 heading blocks
headings = [block for block in self.document.blocks if isinstance(block, Heading)]
self.assertEqual(len(headings), 4)
# Check heading levels
self.assertEqual(headings[0].level, HeadingLevel.H1)
self.assertEqual(headings[1].level, HeadingLevel.H2)
self.assertEqual(headings[2].level, HeadingLevel.H3)
self.assertEqual(headings[3].level, HeadingLevel.H6)
# Check text content
h1_words = list(headings[0].words())
self.assertEqual(len(h1_words), 2)
self.assertEqual(h1_words[0][1].text, "Heading")
self.assertEqual(h1_words[1][1].text, "1")
def test_styled_text(self):
"""Test parsing text with inline styling."""
html = '<p>This is <b>bold</b> and <i>italic</i> text.</p>'
self.reader.extract_content(html, self.document)
self.assertEqual(len(self.document.blocks), 1)
paragraph = self.document.blocks[0]
words = list(paragraph.words())
# Should have words: "This", "is", "bold", "and", "italic", "text."
self.assertEqual(len(words), 6)
# The styling information is embedded in the Font objects
# We can't easily test the exact styling without more complex setup
# but we can verify the words are created correctly
word_texts = [word[1].text for word in words]
self.assertEqual(word_texts, ["This", "is", "bold", "and", "italic", "text."])
def test_unordered_list(self):
    """A <ul> with three <li> children becomes one unordered HList."""
    markup = '''
<ul>
<li>First item</li>
<li>Second item</li>
<li>Third item</li>
</ul>
'''
    self.reader.extract_content(markup, self.document)

    self.assertEqual(len(self.document.blocks), 1)
    self.assertIsInstance(self.document.blocks[0], HList)

    bullet_list = self.document.blocks[0]
    self.assertEqual(bullet_list.style, ListStyle.UNORDERED)

    entries = list(bullet_list.items())
    self.assertEqual(len(entries), 3)

    # Only the first item's content is inspected: a single paragraph.
    leading_item_content = list(entries[0].blocks())
    self.assertEqual(len(leading_item_content), 1)
    self.assertIsInstance(leading_item_content[0], Parapgraph)
def test_ordered_list(self):
    """An <ol> with two <li> children becomes one ordered list block."""
    markup = '''
<ol>
<li>First step</li>
<li>Second step</li>
</ol>
'''
    self.reader.extract_content(markup, self.document)

    self.assertEqual(len(self.document.blocks), 1)

    numbered_list = self.document.blocks[0]
    self.assertEqual(numbered_list.style, ListStyle.ORDERED)

    steps = list(numbered_list.items())
    self.assertEqual(len(steps), 2)
def test_definition_list(self):
    """A <dl> with two dt/dd pairs becomes one definition list block."""
    markup = '''
<dl>
<dt>Term 1</dt>
<dd>Definition 1</dd>
<dt>Term 2</dt>
<dd>Definition 2</dd>
</dl>
'''
    self.reader.extract_content(markup, self.document)

    self.assertEqual(len(self.document.blocks), 1)

    glossary = self.document.blocks[0]
    self.assertEqual(glossary.style, ListStyle.DEFINITION)

    # One item is expected per dt/dd pair.
    pairs = list(glossary.items())
    self.assertEqual(len(pairs), 2)
def test_table(self):
    """A two-row table yields header cells in row one and data cells in row two."""
    markup = '''
<table>
<tr>
<th>Header 1</th>
<th>Header 2</th>
</tr>
<tr>
<td>Cell 1</td>
<td>Cell 2</td>
</tr>
</table>
'''
    self.reader.extract_content(markup, self.document)

    self.assertEqual(len(self.document.blocks), 1)
    self.assertIsInstance(self.document.blocks[0], Table)

    grid = self.document.blocks[0]

    # Both <tr> elements are reported as body rows (header row + data row).
    rows = list(grid.body_rows())
    self.assertEqual(len(rows), 2)
    header_row, data_row = rows

    # First row: two <th> cells, both flagged as headers.
    header_cells = list(header_row.cells())
    self.assertEqual(len(header_cells), 2)
    for cell in header_cells:
        self.assertTrue(cell.is_header)

    # Second row: two <td> cells, neither flagged as a header.
    data_cells = list(data_row.cells())
    self.assertEqual(len(data_cells), 2)
    for cell in data_cells:
        self.assertFalse(cell.is_header)
def test_blockquote(self):
    """A <blockquote> becomes one Quote block wrapping its two paragraphs."""
    markup = '''
<blockquote>
<p>This is a quoted paragraph.</p>
<p>Another quoted paragraph.</p>
</blockquote>
'''
    self.reader.extract_content(markup, self.document)

    self.assertEqual(len(self.document.blocks), 1)
    self.assertIsInstance(self.document.blocks[0], Quote)

    quoted = self.document.blocks[0]
    children = list(quoted.blocks())
    self.assertEqual(len(children), 2)
    for child in children:
        self.assertIsInstance(child, Parapgraph)
def test_code_block(self):
    """A <pre><code> element becomes a CodeBlock with its language detected."""
    markup = '''
<pre><code class="language-python">
def hello():
print("Hello, world!")
</code></pre>
'''
    self.reader.extract_content(markup, self.document)

    self.assertEqual(len(self.document.blocks), 1)
    self.assertIsInstance(self.document.blocks[0], CodeBlock)

    # The "language-python" class is expected to surface as "python".
    snippet = self.document.blocks[0]
    self.assertEqual(snippet.language, "python")
def test_horizontal_rule(self):
    """An <hr> between two paragraphs yields paragraph, rule, paragraph."""
    markup = '<p>Before</p><hr><p>After</p>'
    self.reader.extract_content(markup, self.document)

    self.assertEqual(len(self.document.blocks), 3)

    before, rule, after = self.document.blocks[0], self.document.blocks[1], self.document.blocks[2]
    self.assertIsInstance(before, Parapgraph)
    self.assertIsInstance(rule, HorizontalRule)
    self.assertIsInstance(after, Parapgraph)
def test_html_entities(self):
    """Named entities for <, > and & are decoded into standalone words."""
    markup = '<p>Less than: &lt; Greater than: &gt; Ampersand: &amp;</p>'
    self.reader.extract_content(markup, self.document)

    first_block = self.document.blocks[0]
    decoded_texts = [entry[1].text for entry in list(first_block.words())]

    # Each decoded character must appear as a word of its own.
    for symbol in ('<', '>', '&'):
        self.assertIn(symbol, decoded_texts)
def test_nested_elements(self):
    """Content nested inside a <div> still produces several block types."""
    markup = '''
<div>
<h2>Section Title</h2>
<p>Section content with <strong>important</strong> text.</p>
<ul>
<li>List item 1</li>
<li>List item 2</li>
</ul>
</div>
'''
    self.reader.extract_content(markup, self.document)

    # More than one block is expected at document level.
    self.assertGreater(len(self.document.blocks), 1)

    # Compare by class name so the check only depends on what was produced.
    produced_names = []
    for block in self.document.blocks:
        produced_names.append(type(block).__name__)

    self.assertIn('Parapgraph', produced_names)  # From div
    self.assertIn('Heading', produced_names)
    self.assertIn('HList', produced_names)
def test_empty_elements(self):
    """Empty <p>, <div> and <ul> elements each still produce a block."""
    markup = '<p></p><div></div><ul></ul>'
    self.reader.extract_content(markup, self.document)

    # Three empty elements in, three blocks out.
    block_count = len(self.document.blocks)
    self.assertEqual(block_count, 3)
def test_whitespace_handling(self):
    """Surrounding spaces and line breaks must not end up inside words."""
    markup = '''
<p> Word1 Word2
Word3 </p>
'''
    self.reader.extract_content(markup, self.document)

    first_block = self.document.blocks[0]
    entries = list(first_block.words())

    # Whitespace is normalised, leaving exactly the three words.
    actual_texts = [entry[1].text for entry in entries]
    self.assertEqual(actual_texts, ["Word1", "Word2", "Word3"])
def test_base_url_setting(self):
    """set_base_url() must forward the URL to the reader's inline handler."""
    expected_url = "https://example.com/path/"
    self.reader.set_base_url(expected_url)

    # The inline handler is where the value is read back from.
    handler = self.reader.inline_handler
    self.assertEqual(handler.base_url, expected_url)
def test_complex_document(self):
    """A full HTML page yields more than five blocks of the expected kinds."""
    markup = '''
<!DOCTYPE html>
<html>
<head>
<title>Test Document</title>
<style>body { font-family: Arial; }</style>
</head>
<body>
<h1>Main Title</h1>
<p>Introduction paragraph with <em>emphasis</em>.</p>
<h2>Section 1</h2>
<p>Content with <a href="link.html">a link</a>.</p>
<ul>
<li>Item 1</li>
<li>Item 2 with <strong>bold text</strong></li>
</ul>
<h2>Section 2</h2>
<blockquote>
<p>A quoted paragraph.</p>
</blockquote>
<table>
<tr><th>Col1</th><th>Col2</th></tr>
<tr><td>A</td><td>B</td></tr>
</table>
</body>
</html>
'''
    self.reader.extract_content(markup, self.document)

    # The page body holds well over five block-level elements.
    self.assertGreater(len(self.document.blocks), 5)

    # Every required block kind must be present; extra kinds are allowed.
    produced_kinds = {type(block).__name__ for block in self.document.blocks}
    required_kinds = {'Heading', 'Parapgraph', 'HList', 'Quote', 'Table'}
    self.assertTrue(required_kinds.issubset(produced_kinds))
# Run this module's tests when the file is executed directly.
if __name__ == "__main__":
    unittest.main()

182
tests/test_html_style.py Normal file
View File

@ -0,0 +1,182 @@
"""
Unit tests for HTML style management.
Tests the HTMLStyleManager class for CSS parsing, style stacks, and font creation.
"""
import unittest
from pyWebLayout.io.readers.html_style import HTMLStyleManager
from pyWebLayout.style import FontStyle, FontWeight, TextDecoration
class TestHTMLStyleManager(unittest.TestCase):
    """Exercise HTMLStyleManager: defaults, style stack, CSS parsing, fonts."""

    def setUp(self):
        """Give every test its own freshly constructed style manager."""
        self.style_manager = HTMLStyleManager()

    def test_initialization(self):
        """A new manager reports the expected default style values."""
        defaults = self.style_manager.get_current_style()

        self.assertEqual(defaults['font_size'], 12)
        self.assertEqual(defaults['font_weight'], FontWeight.NORMAL)
        self.assertEqual(defaults['font_style'], FontStyle.NORMAL)
        self.assertEqual(defaults['decoration'], TextDecoration.NONE)
        self.assertEqual(defaults['color'], (0, 0, 0))
        self.assertIsNone(defaults['background'])
        self.assertEqual(defaults['language'], 'en_US')

    def test_style_stack_operations(self):
        """Pushing overlays values on the current style; popping restores it."""
        manager = self.style_manager

        # Snapshot taken before anything is pushed.
        baseline = manager.get_current_style()

        # Overlay two properties.
        overlay = {'font_size': 16, 'font_weight': FontWeight.BOLD}
        manager.push_style(overlay)

        active = manager.get_current_style()
        self.assertEqual(active['font_size'], 16)
        self.assertEqual(active['font_weight'], FontWeight.BOLD)
        self.assertEqual(active['color'], (0, 0, 0))  # Unchanged

        # Dropping the overlay must bring back the baseline.
        manager.pop_style()
        after_pop = manager.get_current_style()
        self.assertEqual(after_pop, baseline)

    def test_tag_styles(self):
        """Known tags carry default styles; unknown tags yield an empty dict."""
        lookup = self.style_manager.get_tag_style

        top_heading = lookup('h1')
        self.assertEqual(top_heading['font_size'], 24)
        self.assertEqual(top_heading['font_weight'], FontWeight.BOLD)

        smallest_heading = lookup('h6')
        self.assertEqual(smallest_heading['font_size'], 12)
        self.assertEqual(smallest_heading['font_weight'], FontWeight.BOLD)

        emphasis = lookup('em')
        self.assertEqual(emphasis['font_style'], FontStyle.ITALIC)

        unrecognised = lookup('unknown')
        self.assertEqual(unrecognised, {})

    def test_inline_style_parsing(self):
        """Inline CSS declarations are translated into style dictionary entries."""
        parse = self.style_manager.parse_inline_style

        # font-size, in px and in pt
        parsed = parse('font-size: 18px')
        self.assertEqual(parsed['font_size'], 18)
        parsed = parse('font-size: 14pt')
        self.assertEqual(parsed['font_size'], 14)

        # font-weight
        parsed = parse('font-weight: bold')
        self.assertEqual(parsed['font_weight'], FontWeight.BOLD)

        # font-style
        parsed = parse('font-style: italic')
        self.assertEqual(parsed['font_style'], FontStyle.ITALIC)

        # text-decoration
        parsed = parse('text-decoration: underline')
        self.assertEqual(parsed['decoration'], TextDecoration.UNDERLINE)

        # Several declarations in one attribute value
        parsed = parse('font-size: 20px; font-weight: bold; color: red')
        self.assertEqual(parsed['font_size'], 20)
        self.assertEqual(parsed['font_weight'], FontWeight.BOLD)
        self.assertEqual(parsed['color'], (255, 0, 0))

    def test_color_parsing(self):
        """Named, hex, rgb() and rgba() colours parse; invalid input gives None."""
        parse = self.style_manager.parse_color

        # Named colours
        self.assertEqual(parse('red'), (255, 0, 0))
        self.assertEqual(parse('blue'), (0, 0, 255))
        self.assertEqual(parse('white'), (255, 255, 255))
        self.assertEqual(parse('gray'), (128, 128, 128))
        self.assertEqual(parse('grey'), (128, 128, 128))

        # Hex colours, long and short form
        self.assertEqual(parse('#ff0000'), (255, 0, 0))
        self.assertEqual(parse('#00ff00'), (0, 255, 0))
        self.assertEqual(parse('#f00'), (255, 0, 0))
        self.assertEqual(parse('#0f0'), (0, 255, 0))

        # rgb() notation, including generous inner spacing
        self.assertEqual(parse('rgb(255, 0, 0)'), (255, 0, 0))
        self.assertEqual(parse('rgb(128, 128, 128)'), (128, 128, 128))
        self.assertEqual(parse('rgb( 255 , 255 , 255 )'), (255, 255, 255))

        # rgba() notation: the alpha component is ignored
        self.assertEqual(parse('rgba(255, 0, 0, 0.5)'), (255, 0, 0))

        # Unparseable or out-of-range input yields None
        self.assertIsNone(parse('invalid'))
        self.assertIsNone(parse('#gg0000'))
        self.assertIsNone(parse('rgb(300, 0, 0)'))

    def test_color_clamping(self):
        """rgb() components outside 0-255 are rejected rather than clamped."""
        out_of_range = self.style_manager.parse_color('rgb(300, -10, 128)')
        self.assertIsNone(out_of_range)

    def test_apply_style_to_element(self):
        """Tag defaults and inline styles are merged, inline values winning."""
        apply_style = self.style_manager.apply_style_to_element

        # h1 carrying an inline style attribute
        attributes = {'style': 'color: blue; font-size: 30px'}
        merged = apply_style('h1', attributes)

        self.assertEqual(merged['font_size'], 30)  # Overridden inline
        self.assertEqual(merged['font_weight'], FontWeight.BOLD)  # From h1
        self.assertEqual(merged['color'], (0, 0, 255))  # Inline

        # No attributes at all: only the tag default applies
        merged = apply_style('strong', {})
        self.assertEqual(merged['font_weight'], FontWeight.BOLD)

    def test_reset(self):
        """reset() discards pushed styles and restores the defaults."""
        manager = self.style_manager

        # Build up some state first.
        manager.push_style({'font_size': 20})
        manager.push_style({'color': (255, 0, 0)})

        manager.reset()

        restored = manager.get_current_style()
        self.assertEqual(restored['font_size'], 12)
        self.assertEqual(restored['color'], (0, 0, 0))
        # The internal stack is checked directly to confirm it was emptied.
        self.assertEqual(len(manager._style_stack), 0)

    def test_font_creation(self):
        """create_font() builds a Font reflecting the currently active style."""
        requested = {
            'font_size': 16,
            'font_weight': FontWeight.BOLD,
            'font_style': FontStyle.ITALIC,
            'decoration': TextDecoration.UNDERLINE,
            'color': (255, 0, 0),
            'background': (255, 255, 0, 255),
        }
        self.style_manager.push_style(requested)

        created = self.style_manager.create_font()

        self.assertEqual(created.font_size, 16)
        self.assertEqual(created.weight, FontWeight.BOLD)
        self.assertEqual(created.style, FontStyle.ITALIC)
        self.assertEqual(created.decoration, TextDecoration.UNDERLINE)
        # The Font attribute uses the British spelling "colour".
        self.assertEqual(created.colour, (255, 0, 0))
        self.assertEqual(created.background, (255, 255, 0, 255))
# Run this module's tests when the file is executed directly.
if __name__ == "__main__":
    unittest.main()

247
tests/test_html_text.py Normal file
View File

@ -0,0 +1,247 @@
"""
Unit tests for HTML text processing.
Tests the HTMLTextProcessor class for text buffering, entity handling, and word creation.
"""
import unittest
from unittest.mock import Mock, MagicMock
from pyWebLayout.io.readers.html_text import HTMLTextProcessor
from pyWebLayout.io.readers.html_style import HTMLStyleManager
from pyWebLayout.abstract.block import Parapgraph
from pyWebLayout.abstract.inline import Word
class TestHTMLTextProcessor(unittest.TestCase):
    """Test cases for HTMLTextProcessor.

    Covers text buffering, entity and character reference decoding, and the
    flushing of buffered text into words on the current paragraph.
    """

    def setUp(self):
        """Create a style manager, a text processor and a mock paragraph."""
        self.style_manager = HTMLStyleManager()
        self.text_processor = HTMLTextProcessor(self.style_manager)
        # Mock paragraph restricted to the Parapgraph interface; add_word is
        # replaced explicitly so the tests can inspect its calls.
        self.mock_paragraph = Mock(spec=Parapgraph)
        self.mock_paragraph.add_word = Mock()

    def test_initialization(self):
        """A new processor has an empty buffer and no current paragraph."""
        self.assertEqual(self.text_processor._text_buffer, "")
        self.assertIsNone(self.text_processor._current_paragraph)
        self.assertEqual(self.text_processor._style_manager, self.style_manager)

    def test_add_text(self):
        """Successive add_text() calls append to the buffer."""
        self.text_processor.add_text("Hello")
        self.assertEqual(self.text_processor.get_buffer_content(), "Hello")
        self.text_processor.add_text(" World")
        self.assertEqual(self.text_processor.get_buffer_content(), "Hello World")

    def test_entity_references(self):
        """Named entities decode to their characters; unknown ones are kept."""
        # Non-ASCII expectations are spelled as escapes so they cannot be
        # lost or altered by the file's encoding.
        test_cases = [
            ('lt', '<'),
            ('gt', '>'),
            ('amp', '&'),
            ('quot', '"'),
            ('apos', "'"),
            # NOTE(review): expected value kept as a plain space; confirm the
            # processor does not emit U+00A0 for nbsp.
            ('nbsp', ' '),
            ('copy', '\u00a9'),    # copyright sign
            ('reg', '\u00ae'),     # registered sign
            ('trade', '\u2122'),   # trade mark sign
            ('mdash', '\u2014'),   # em dash
            ('ndash', '\u2013'),   # en dash
            ('hellip', '\u2026'),  # horizontal ellipsis
            ('euro', '\u20ac'),    # euro sign
            ('unknown', '&unknown;'),  # Unknown entities should be preserved
        ]
        for entity, expected in test_cases:
            with self.subTest(entity=entity):
                self.text_processor.clear_buffer()
                self.text_processor.add_entity_reference(entity)
                self.assertEqual(self.text_processor.get_buffer_content(), expected)

    def test_character_references(self):
        """Numeric references decode; invalid ones are kept as literal text."""
        # Decimal character reference
        self.text_processor.clear_buffer()
        self.text_processor.add_character_reference('65')  # 'A'
        self.assertEqual(self.text_processor.get_buffer_content(), 'A')

        # Hexadecimal character reference
        self.text_processor.clear_buffer()
        self.text_processor.add_character_reference('x41')  # 'A'
        self.assertEqual(self.text_processor.get_buffer_content(), 'A')

        # Non-ASCII character: 8364 is U+20AC, the euro sign
        self.text_processor.clear_buffer()
        self.text_processor.add_character_reference('8364')
        self.assertEqual(self.text_processor.get_buffer_content(), '\u20ac')

        # Invalid character reference is preserved verbatim
        self.text_processor.clear_buffer()
        self.text_processor.add_character_reference('invalid')
        self.assertEqual(self.text_processor.get_buffer_content(), '&#invalid;')

        # Out-of-range code point is also preserved as a reference
        self.text_processor.clear_buffer()
        self.text_processor.add_character_reference('99999999999')
        self.assertTrue(self.text_processor.get_buffer_content().startswith('&#'))

    def test_buffer_operations(self):
        """has_pending_text() and clear_buffer() track the buffer state."""
        self.assertFalse(self.text_processor.has_pending_text())
        self.text_processor.add_text("Some text")
        self.assertTrue(self.text_processor.has_pending_text())

        self.text_processor.clear_buffer()
        self.assertFalse(self.text_processor.has_pending_text())
        self.assertEqual(self.text_processor.get_buffer_content(), "")

        # Whitespace-only content does not count as pending text.
        self.text_processor.add_text(" \n\t ")
        self.assertFalse(self.text_processor.has_pending_text())

    def test_paragraph_management(self):
        """set_current_paragraph() stores and clears the target paragraph."""
        self.assertIsNone(self.text_processor._current_paragraph)

        self.text_processor.set_current_paragraph(self.mock_paragraph)
        self.assertEqual(self.text_processor._current_paragraph, self.mock_paragraph)

        self.text_processor.set_current_paragraph(None)
        self.assertIsNone(self.text_processor._current_paragraph)

    def test_flush_text_with_paragraph(self):
        """Flushing with a paragraph set adds one word per token."""
        self.text_processor.set_current_paragraph(self.mock_paragraph)
        self.text_processor.add_text("Hello world test")

        # Replace font creation so no real Font has to be built.
        mock_font = Mock()
        self.style_manager.create_font = Mock(return_value=mock_font)

        result = self.text_processor.flush_text()

        # Text was flushed.
        self.assertTrue(result)
        # "Hello", "world", "test"
        self.assertEqual(self.mock_paragraph.add_word.call_count, 3)

        # The first positional argument of each add_word call carries the text.
        calls = self.mock_paragraph.add_word.call_args_list
        word_texts = [call[0][0].text for call in calls]
        self.assertEqual(word_texts, ["Hello", "world", "test"])

        # The buffer is empty after a flush.
        self.assertEqual(self.text_processor.get_buffer_content(), "")

    def test_flush_text_without_paragraph(self):
        """Flushing with no paragraph returns False but still clears the buffer."""
        self.text_processor.add_text("Hello world")
        result = self.text_processor.flush_text()

        self.assertFalse(result)
        self.assertEqual(self.text_processor.get_buffer_content(), "")

    def test_flush_empty_buffer(self):
        """Flushing an empty buffer returns False and adds no words."""
        self.text_processor.set_current_paragraph(self.mock_paragraph)
        result = self.text_processor.flush_text()

        self.assertFalse(result)
        self.mock_paragraph.add_word.assert_not_called()

    def test_flush_whitespace_only(self):
        """Flushing whitespace-only content returns False and adds no words."""
        self.text_processor.set_current_paragraph(self.mock_paragraph)
        self.text_processor.add_text(" \n\t ")
        result = self.text_processor.flush_text()

        self.assertFalse(result)
        self.mock_paragraph.add_word.assert_not_called()

    def test_word_creation_with_styling(self):
        """Every flushed word carries the font returned by the style manager."""
        self.text_processor.set_current_paragraph(self.mock_paragraph)
        self.text_processor.add_text("styled text")

        mock_font = Mock()
        mock_font.font_size = 16
        mock_font.weight = "bold"
        self.style_manager.create_font = Mock(return_value=mock_font)

        self.text_processor.flush_text()

        self.style_manager.create_font.assert_called()

        # Each word handed to add_word must use the mocked font as its style.
        for call in self.mock_paragraph.add_word.call_args_list:
            word = call[0][0]
            self.assertEqual(word.style, mock_font)

    def test_reset(self):
        """reset() empties the buffer and forgets the current paragraph."""
        self.text_processor.set_current_paragraph(self.mock_paragraph)
        self.text_processor.add_text("Some text")

        self.text_processor.reset()

        self.assertEqual(self.text_processor._text_buffer, "")
        self.assertIsNone(self.text_processor._current_paragraph)

    def test_complex_text_processing(self):
        """Plain text, an entity and a character reference combine in order."""
        self.text_processor.set_current_paragraph(self.mock_paragraph)

        mock_font = Mock()
        self.style_manager.create_font = Mock(return_value=mock_font)

        self.text_processor.add_text("Hello ")
        self.text_processor.add_entity_reference('amp')
        self.text_processor.add_text(" world")
        self.text_processor.add_character_reference('33')  # '!'

        expected_content = "Hello & world!"
        self.assertEqual(self.text_processor.get_buffer_content(), expected_content)

        # After flushing, the buffer content is split into three words.
        self.text_processor.flush_text()
        calls = self.mock_paragraph.add_word.call_args_list
        word_texts = [call[0][0].text for call in calls]
        self.assertEqual(word_texts, ["Hello", "&", "world!"])
# Run this module's tests when the file is executed directly.
if __name__ == "__main__":
    unittest.main()

84
tests/test_runner.py Normal file
View File

@ -0,0 +1,84 @@
"""
Test runner for pyWebLayout.
This script runs all unit tests and provides a summary of results.
"""
import unittest
import sys
import os
# Add the project root to the Python path.
# The root is taken to be the parent of the directory containing this file;
# it is inserted at position 0 so it is searched before other entries.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
def run_all_tests():
    """Discover and run every ``test_*.py`` module next to this file.

    Tests are run with verbose output on stdout, followed by a summary with
    the number of tests run, failures, errors and skips, and the names of
    the failing and erroring tests.

    Returns:
        bool: ``True`` when the run was successful, ``False`` otherwise.
        Success is taken from ``TestResult.wasSuccessful()``, so a run with
        unexpected successes counts as failed, matching the verdict that
        unittest itself prints.
    """
    # Discover all test modules in the directory that holds this file.
    loader = unittest.TestLoader()
    start_dir = os.path.dirname(os.path.abspath(__file__))
    suite = loader.discover(start_dir, pattern='test_*.py')

    # Run tests with detailed output
    runner = unittest.TextTestRunner(
        verbosity=2,
        stream=sys.stdout,
        descriptions=True,
        failfast=False,
    )
    result = runner.run(suite)

    # Print summary
    separator = "=" * 70
    print("\n" + separator)
    print("TEST SUMMARY")
    print(separator)
    print(f"Tests run: {result.testsRun}")
    print(f"Failures: {len(result.failures)}")
    print(f"Errors: {len(result.errors)}")
    print(f"Skipped: {len(result.skipped)}")
    # Only shown when relevant, so the usual summary layout is unchanged.
    if result.unexpectedSuccesses:
        print(f"Unexpected successes: {len(result.unexpectedSuccesses)}")

    # failures/errors hold (test, traceback-text) pairs; only the test is listed.
    if result.failures:
        print(f"\nFAILURES ({len(result.failures)}):")
        for test, _ in result.failures:
            print(f"- {test}")
    if result.errors:
        print(f"\nERRORS ({len(result.errors)}):")
        for test, _ in result.errors:
            print(f"- {test}")

    success = result.wasSuccessful()
    print(f"\nResult: {'PASSED' if success else 'FAILED'}")
    print(separator)
    return success
def run_specific_test(test_module):
"""Run a specific test module."""
loader = unittest.TestLoader()
suite = loader.loadTestsFromName(test_module)
runner = unittest.TextTestRunner(verbosity=2)
result = runner.run(suite)
return len(result.failures) == 0 and len(result.errors) == 0
# Script entry point: with an argument, run that one test module; without,
# run the whole suite. The process exit status is 0 on success and 1 otherwise.
if __name__ == '__main__':
    cli_args = sys.argv[1:]
    if cli_args:
        # Accept "html_style", "test_html_style" or "test_html_style.py" and
        # reduce each of them to the importable module name.
        module_name = cli_args[0]
        if not module_name.startswith('test_'):
            module_name = f'test_{module_name}'
        if module_name.endswith('.py'):
            module_name = module_name[:-3]  # Remove .py extension
        success = run_specific_test(module_name)
    else:
        success = run_all_tests()
    sys.exit(0 if success else 1)