first code commit
This commit is contained in:
commit
f7ad69f9ec
33
.gitignore
vendored
Normal file
33
.gitignore
vendored
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
# Byte-compiled / optimized / DLL files
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
*/__pycache__
|
||||||
|
# Distribution / packaging
|
||||||
|
dist/
|
||||||
|
build/
|
||||||
|
*.egg-info/
|
||||||
|
|
||||||
|
# Environment
|
||||||
|
venv/
|
||||||
|
env/
|
||||||
|
.env/
|
||||||
|
.venv/
|
||||||
|
|
||||||
|
# Tests
|
||||||
|
.pytest_cache/
|
||||||
|
.coverage
|
||||||
|
htmlcov/
|
||||||
|
|
||||||
|
# IDE files
|
||||||
|
.idea/
|
||||||
|
.vscode/
|
||||||
|
*.swp
|
||||||
|
*.swo
|
||||||
|
|
||||||
|
# Project specific
|
||||||
|
*.png
|
||||||
|
*.jpg
|
||||||
|
*.jpeg
|
||||||
|
*.gif
|
||||||
|
*.svg
|
||||||
21
LICENSE
Normal file
21
LICENSE
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2025 Duncan Tourolle
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
||||||
10
MANIFEST.in
Normal file
10
MANIFEST.in
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
include README.md
|
||||||
|
include LICENSE
|
||||||
|
include pyWebLayout/*.py
|
||||||
|
recursive-include pyWebLayout/abstract *.py
|
||||||
|
recursive-include pyWebLayout/concrete *.py
|
||||||
|
recursive-include pyWebLayout/style *.py
|
||||||
|
recursive-include pyWebLayout/core *.py
|
||||||
|
recursive-include pyWebLayout/typesetting *.py
|
||||||
|
recursive-include pyWebLayout/io *.py
|
||||||
|
recursive-include pyWebLayout/examples *.py
|
||||||
93
README.md
Normal file
93
README.md
Normal file
@ -0,0 +1,93 @@
|
|||||||
|
# PyWebLayout
|
||||||
|
|
||||||
|
A Python library for HTML-like layout and rendering.
|
||||||
|
|
||||||
|
## Description
|
||||||
|
|
||||||
|
PyWebLayout provides classes for rendering HTML-like content to images using a box-based layout system. It includes support for text, tables, and containers, as well as an HTML parser for converting HTML to layout objects.
|
||||||
|
|
||||||
|
## Features
|
||||||
|
|
||||||
|
- HTML-like layout system
|
||||||
|
- Text rendering with font support
|
||||||
|
- Table layouts
|
||||||
|
- Container elements
|
||||||
|
- HTML parsing
|
||||||
|
- Image output
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install pyWebLayout
|
||||||
|
```
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
### Basic Example
|
||||||
|
|
||||||
|
```python
|
||||||
|
from pyWebLayout.concrete.page import Page, Container
|
||||||
|
from pyWebLayout.concrete.text import Line
|
||||||
|
from pyWebLayout.style import Alignment
|
||||||
|
from PIL import ImageFont
|
||||||
|
|
||||||
|
# Create a page
|
||||||
|
page = Page(size=(800, 600), background_color=(240, 240, 240))
|
||||||
|
|
||||||
|
# Add a title container
|
||||||
|
title_container = Container(
|
||||||
|
origin=(0, 0),
|
||||||
|
size=(780, 60),
|
||||||
|
direction='horizontal',
|
||||||
|
spacing=10,
|
||||||
|
padding=(10, 10, 10, 10),
|
||||||
|
halign=Alignment.CENTER,
|
||||||
|
valign=Alignment.CENTER
|
||||||
|
)
|
||||||
|
page.add_child(title_container)
|
||||||
|
|
||||||
|
# Create a title line with text
|
||||||
|
title_font = ImageFont.load_default()
|
||||||
|
title_line = Line(
|
||||||
|
spacing=(8, 15),
|
||||||
|
origin=(0, 0),
|
||||||
|
size=(760, 40),
|
||||||
|
font=title_font,
|
||||||
|
text_color=(0, 0, 0),
|
||||||
|
halign=Alignment.CENTER
|
||||||
|
)
|
||||||
|
title_container.add_child(title_line)
|
||||||
|
title_line.add_word("PyWebLayout", title_font)
|
||||||
|
title_line.add_word("Example", title_font)
|
||||||
|
|
||||||
|
# Layout and render the page
|
||||||
|
page.layout()
|
||||||
|
image = page.render()
|
||||||
|
image.save("example.png")
|
||||||
|
```
|
||||||
|
|
||||||
|
### HTML Example
|
||||||
|
|
||||||
|
```python
|
||||||
|
from pyWebLayout.html_parser import html_to_image
|
||||||
|
|
||||||
|
html = """
|
||||||
|
<div style="text-align: center; padding: 10px;">
|
||||||
|
<h1>PyWebLayout HTML Example</h1>
|
||||||
|
<p>This is a paragraph rendered from HTML.</p>
|
||||||
|
<p>The library supports <b>bold</b>, <i>italic</i>, and <u>underlined</u> text.</p>
|
||||||
|
</div>
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Render HTML to an image
|
||||||
|
image = html_to_image(html, page_size=(800, 600))
|
||||||
|
image.save("html_example.png")
|
||||||
|
```
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
MIT License
|
||||||
|
|
||||||
|
## Author
|
||||||
|
|
||||||
|
Duncan Tourolle - duncan@tourolle.paris
|
||||||
44
pyWebLayout/__init__.py
Normal file
44
pyWebLayout/__init__.py
Normal file
@ -0,0 +1,44 @@
|
|||||||
|
"""
|
||||||
|
PyWebLayout - A Python library for HTML-like layout and rendering.
|
||||||
|
|
||||||
|
This library provides classes for rendering HTML-like content to images
|
||||||
|
using a box-based layout system. It includes support for text, tables,
|
||||||
|
and containers, as well as parsers for HTML and EPUB content. It also
|
||||||
|
supports pagination for ebook-like content with the ability to pause,
|
||||||
|
save state, and resume rendering.
|
||||||
|
"""
|
||||||
|
|
||||||
|
__version__ = '0.1.0'
|
||||||
|
|
||||||
|
# Core abstractions
|
||||||
|
from pyWebLayout.core import Renderable, Interactable, Layoutable, Queriable
|
||||||
|
|
||||||
|
# Style components
|
||||||
|
from pyWebLayout.style import Alignment, Font, FontWeight, FontStyle, TextDecoration
|
||||||
|
|
||||||
|
# Typesetting algorithms
|
||||||
|
from pyWebLayout.typesetting import (
|
||||||
|
FlowLayout,
|
||||||
|
Paginator, PaginationState,
|
||||||
|
DocumentPaginator, DocumentPaginationState
|
||||||
|
)
|
||||||
|
|
||||||
|
# Abstract document model
|
||||||
|
from pyWebLayout.abstract.document import Document, Book, Chapter, MetadataType
|
||||||
|
|
||||||
|
# Concrete implementations
|
||||||
|
from pyWebLayout.concrete.box import Box
|
||||||
|
from pyWebLayout.concrete.text import Line
|
||||||
|
from pyWebLayout.concrete.page import Container, Page
|
||||||
|
|
||||||
|
# Abstract components
|
||||||
|
from pyWebLayout.abstract.inline import Word
|
||||||
|
|
||||||
|
# Layout components
|
||||||
|
from pyWebLayout.table import Table, TableCell
|
||||||
|
|
||||||
|
# IO functionality (reading and writing)
|
||||||
|
from pyWebLayout.io import (
|
||||||
|
parse_html, html_to_document, # HTML parsing
|
||||||
|
read_epub # EPUB reading
|
||||||
|
)
|
||||||
12
pyWebLayout/__main__.py
Normal file
12
pyWebLayout/__main__.py
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
import os
import sys

# Make the directory that contains the ``pyWebLayout`` package importable so
# this file also works when executed directly rather than via ``python -m``.
_package_dir = os.path.dirname(__file__)
_project_root = os.path.abspath(os.path.join(_package_dir, '..'))
sys.path.insert(0, _project_root)

# Imported only after sys.path has been adjusted above.
from pyWebLayout.example import save_examples

if __name__ == "__main__":
    print("Running PyWebLayout examples...")
    save_examples()
|
||||||
6
pyWebLayout/abstract/__init__.py
Normal file
6
pyWebLayout/abstract/__init__.py
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
from .block import Block, BlockType, Parapgraph, Heading, HeadingLevel, Quote, CodeBlock
|
||||||
|
from .block import HList, ListItem, ListStyle, Table, TableRow, TableCell
|
||||||
|
from .block import HorizontalRule, LineBreak, Image
|
||||||
|
from .inline import Word, FormattedSpan
|
||||||
|
from .document import Document, MetadataType, Chapter, Book
|
||||||
|
from .functional import Link, LinkType, Button, Form, FormField, FormFieldType
|
||||||
783
pyWebLayout/abstract/block.py
Normal file
783
pyWebLayout/abstract/block.py
Normal file
@ -0,0 +1,783 @@
|
|||||||
|
from typing import List, Iterator, Tuple, Dict, Optional, Union, Any
|
||||||
|
from enum import Enum
|
||||||
|
from .inline import Word, FormattedSpan
|
||||||
|
|
||||||
|
|
||||||
|
class BlockType(Enum):
    """Classification tag identifying which kind of block an element is."""

    # Text-carrying blocks
    PARAGRAPH = 1
    HEADING = 2
    QUOTE = 3
    CODE_BLOCK = 4

    # Lists
    LIST = 5
    LIST_ITEM = 6

    # Tables
    TABLE = 7
    TABLE_ROW = 8
    TABLE_CELL = 9

    # Stand-alone elements
    HORIZONTAL_RULE = 10
    LINE_BREAK = 11
    IMAGE = 12
|
||||||
|
|
||||||
|
|
||||||
|
class Block:
    """
    Common base for every block-level element.

    A block records which kind of block it is and, once it has been added to
    a container block, a reference to that container.
    """

    def __init__(self, block_type: BlockType):
        """
        Create a block of the given type with no parent.

        Args:
            block_type: The BlockType classifying this element
        """
        self._parent = None
        self._block_type = block_type

    @property
    def block_type(self) -> BlockType:
        """The BlockType this element was created with"""
        return self._block_type

    @property
    def parent(self):
        """The containing block, or None when this block has no parent"""
        return self._parent

    @parent.setter
    def parent(self, parent):
        """Record the block that contains this one"""
        self._parent = parent
|
||||||
|
|
||||||
|
|
||||||
|
class Parapgraph(Block):
    """
    A paragraph is a block-level element that contains a sequence of words.

    Words and formatted spans are stored in two independent lists, each kept
    in insertion order.

    Note:
        The class name is a historical misspelling that is kept so existing
        imports continue to work. New code should use the ``Paragraph`` alias
        defined right after this class.
    """

    def __init__(self):
        """Initialize an empty paragraph"""
        super().__init__(BlockType.PARAGRAPH)
        self._words: List[Word] = []
        self._spans: List[FormattedSpan] = []

    def add_word(self, word: Word):
        """
        Add a word to this paragraph.

        Args:
            word: The Word object to add
        """
        self._words.append(word)

    def add_span(self, span: FormattedSpan):
        """
        Add a formatted span to this paragraph.

        Args:
            span: The FormattedSpan object to add
        """
        self._spans.append(span)

    def words(self) -> Iterator[Tuple[int, Word]]:
        """
        Iterate over the words in this paragraph.

        Yields:
            Tuples of (index, word) for each word in the paragraph
        """
        yield from enumerate(self._words)

    def spans(self) -> Iterator[FormattedSpan]:
        """
        Iterate over the formatted spans in this paragraph.

        Yields:
            Each FormattedSpan in the paragraph
        """
        yield from self._spans

    @property
    def word_count(self) -> int:
        """Get the number of words in this paragraph"""
        return len(self._words)


# Correctly spelled public name; ``Parapgraph`` remains for backward compatibility.
Paragraph = Parapgraph
|
||||||
|
|
||||||
|
|
||||||
|
class HeadingLevel(Enum):
    """The six HTML heading levels; each member's value is its level number."""

    H1 = 1  # <h1>
    H2 = 2  # <h2>
    H3 = 3  # <h3>
    H4 = 4  # <h4>
    H5 = 5  # <h5>
    H6 = 6  # <h6>
|
||||||
|
|
||||||
|
|
||||||
|
class Heading(Parapgraph):
    """
    Heading block (h1 through h6).

    Behaves like a paragraph, since it holds words and spans, but reports
    BlockType.HEADING and carries a HeadingLevel.
    """

    def __init__(self, level: HeadingLevel = HeadingLevel.H1):
        """
        Create an empty heading.

        Args:
            level: The heading level (h1-h6); defaults to H1
        """
        super().__init__()
        self._level = level
        # The paragraph initializer tags the block as PARAGRAPH; re-tag it here.
        self._block_type = BlockType.HEADING

    @property
    def level(self) -> HeadingLevel:
        """The current heading level"""
        return self._level

    @level.setter
    def level(self, level: HeadingLevel):
        """Replace the heading level"""
        self._level = level
|
||||||
|
|
||||||
|
|
||||||
|
class Quote(Block):
    """
    Blockquote: a container holding an ordered list of child blocks.
    """

    def __init__(self):
        """Create a blockquote with no children"""
        super().__init__(BlockType.QUOTE)
        self._blocks: List[Block] = []

    def add_block(self, block: Block):
        """
        Append a child block and make this quote its parent.

        Args:
            block: The Block object to add
        """
        self._blocks.append(block)
        block.parent = self

    def blocks(self) -> Iterator[Block]:
        """
        Walk the child blocks in insertion order.

        Yields:
            Each Block in the quote
        """
        yield from self._blocks
|
||||||
|
|
||||||
|
|
||||||
|
class CodeBlock(Block):
    """
    Pre-formatted code stored line by line, tagged with a language name.
    """

    def __init__(self, language: str = ""):
        """
        Create an empty code block.

        Args:
            language: The programming language for syntax highlighting
        """
        super().__init__(BlockType.CODE_BLOCK)
        self._lines: List[str] = []
        self._language = language

    @property
    def language(self) -> str:
        """The language name this block is tagged with"""
        return self._language

    @language.setter
    def language(self, language: str):
        """Replace the language name"""
        self._language = language

    def add_line(self, line: str):
        """
        Append one line of code.

        Args:
            line: The line of code to add
        """
        self._lines.append(line)

    def lines(self) -> Iterator[Tuple[int, str]]:
        """
        Walk the stored lines together with their zero-based position.

        Yields:
            Tuples of (line_number, line_text) for each line
        """
        yield from enumerate(self._lines)

    @property
    def line_count(self) -> int:
        """How many lines have been added"""
        return len(self._lines)
|
||||||
|
|
||||||
|
|
||||||
|
class ListStyle(Enum):
    """The kinds of HTML list a list block can represent."""

    # Bulleted list (<ul>)
    UNORDERED = 1
    # Numbered list (<ol>)
    ORDERED = 2
    # Definition list (<dl>)
    DEFINITION = 3
|
||||||
|
|
||||||
|
|
||||||
|
class HList(Block):
    """
    List block (ul, ol or dl) holding an ordered collection of list items.
    """

    def __init__(self, style: ListStyle = ListStyle.UNORDERED):
        """
        Create an empty list.

        Args:
            style: The style of list (unordered, ordered, definition)
        """
        super().__init__(BlockType.LIST)
        self._items: List['ListItem'] = []
        self._style = style

    @property
    def style(self) -> ListStyle:
        """The ListStyle of this list"""
        return self._style

    @style.setter
    def style(self, style: ListStyle):
        """Replace the ListStyle of this list"""
        self._style = style

    def add_item(self, item: 'ListItem'):
        """
        Append an item and make this list its parent.

        Args:
            item: The ListItem to add
        """
        self._items.append(item)
        item.parent = self

    def items(self) -> Iterator['ListItem']:
        """
        Walk the items in insertion order.

        Yields:
            Each ListItem in the list
        """
        yield from self._items

    @property
    def item_count(self) -> int:
        """How many items the list holds"""
        return len(self._items)
|
||||||
|
|
||||||
|
|
||||||
|
class ListItem(Block):
    """
    One entry of a list: a container of child blocks with an optional
    definition term.
    """

    def __init__(self, term: Optional[str] = None):
        """
        Create an empty list item.

        Args:
            term: Optional term for definition lists (dt element)
        """
        super().__init__(BlockType.LIST_ITEM)
        self._term = term
        self._blocks: List[Block] = []

    @property
    def term(self) -> Optional[str]:
        """The definition term, or None when no term was given"""
        return self._term

    @term.setter
    def term(self, term: str):
        """Replace the definition term"""
        self._term = term

    def add_block(self, block: Block):
        """
        Append a child block and make this item its parent.

        Args:
            block: The Block object to add
        """
        self._blocks.append(block)
        block.parent = self

    def blocks(self) -> Iterator[Block]:
        """
        Walk the child blocks in insertion order.

        Yields:
            Each Block in the list item
        """
        yield from self._blocks
|
||||||
|
|
||||||
|
|
||||||
|
class TableCell(Block):
    """
    A table cell element that can contain other block elements.

    ``colspan`` and ``rowspan`` are always at least 1: values below 1 are
    clamped both at construction time and when assigned through the setters.
    """

    def __init__(self, is_header: bool = False, colspan: int = 1, rowspan: int = 1):
        """
        Initialize a table cell.

        Args:
            is_header: Whether this cell is a header cell (th) or data cell (td)
            colspan: Number of columns this cell spans (values below 1 become 1)
            rowspan: Number of rows this cell spans (values below 1 become 1)
        """
        super().__init__(BlockType.TABLE_CELL)
        self._is_header = is_header
        # Apply the same minimum as the property setters so a cell can never
        # start out with a zero or negative span.
        self._colspan = max(1, colspan)
        self._rowspan = max(1, rowspan)
        self._blocks: List[Block] = []

    @property
    def is_header(self) -> bool:
        """Check if this is a header cell"""
        return self._is_header

    @is_header.setter
    def is_header(self, is_header: bool):
        """Set whether this is a header cell"""
        self._is_header = is_header

    @property
    def colspan(self) -> int:
        """Get the column span"""
        return self._colspan

    @colspan.setter
    def colspan(self, colspan: int):
        """Set the column span"""
        self._colspan = max(1, colspan)  # Ensure minimum of 1

    @property
    def rowspan(self) -> int:
        """Get the row span"""
        return self._rowspan

    @rowspan.setter
    def rowspan(self, rowspan: int):
        """Set the row span"""
        self._rowspan = max(1, rowspan)  # Ensure minimum of 1

    def add_block(self, block: Block):
        """
        Add a block element to this cell and make the cell its parent.

        Args:
            block: The Block object to add
        """
        self._blocks.append(block)
        block.parent = self

    def blocks(self) -> Iterator[Block]:
        """
        Iterate over the blocks in this cell.

        Yields:
            Each Block in the cell
        """
        yield from self._blocks
|
||||||
|
|
||||||
|
|
||||||
|
class TableRow(Block):
    """
    One row of a table: an ordered collection of table cells.
    """

    def __init__(self):
        """Create a row with no cells"""
        super().__init__(BlockType.TABLE_ROW)
        self._cells: List[TableCell] = []

    def add_cell(self, cell: TableCell):
        """
        Append a cell and make this row its parent.

        Args:
            cell: The TableCell to add
        """
        self._cells.append(cell)
        cell.parent = self

    def cells(self) -> Iterator[TableCell]:
        """
        Walk the cells in insertion order.

        Yields:
            Each TableCell in the row
        """
        yield from self._cells

    @property
    def cell_count(self) -> int:
        """How many cells the row holds"""
        return len(self._cells)
|
||||||
|
|
||||||
|
|
||||||
|
class Table(Block):
    """
    Table block whose rows are kept in three sections (header, body and
    footer), with an optional caption.
    """

    def __init__(self, caption: Optional[str] = None):
        """
        Create a table with no rows.

        Args:
            caption: Optional caption for the table
        """
        super().__init__(BlockType.TABLE)
        self._header_rows: List[TableRow] = []
        self._rows: List[TableRow] = []
        self._footer_rows: List[TableRow] = []
        self._caption = caption

    @property
    def caption(self) -> Optional[str]:
        """The caption, or None when the table has none"""
        return self._caption

    @caption.setter
    def caption(self, caption: Optional[str]):
        """Replace the caption"""
        self._caption = caption

    def add_row(self, row: TableRow, section: str = "body"):
        """
        Append a row to one of the table sections and make this table its parent.

        Args:
            row: The TableRow to add
            section: The section to add the row to ("header", "body", or "footer");
                matched case-insensitively, and any other value selects the body
        """
        row.parent = self

        wanted = section.lower()
        if wanted == "header":
            destination = self._header_rows
        elif wanted == "footer":
            destination = self._footer_rows
        else:
            # Anything that is not header/footer lands in the body.
            destination = self._rows
        destination.append(row)

    def header_rows(self) -> Iterator[TableRow]:
        """
        Walk the header rows in insertion order.

        Yields:
            Each TableRow in the table header
        """
        yield from self._header_rows

    def body_rows(self) -> Iterator[TableRow]:
        """
        Walk the body rows in insertion order.

        Yields:
            Each TableRow in the table body
        """
        yield from self._rows

    def footer_rows(self) -> Iterator[TableRow]:
        """
        Walk the footer rows in insertion order.

        Yields:
            Each TableRow in the table footer
        """
        yield from self._footer_rows

    def all_rows(self) -> Iterator[Tuple[str, TableRow]]:
        """
        Walk every row, header first, then body, then footer.

        Yields:
            Tuples of (section, row) for each row
        """
        sections = (
            ("header", self._header_rows),
            ("body", self._rows),
            ("footer", self._footer_rows),
        )
        for label, group in sections:
            for row in group:
                yield label, row

    @property
    def row_count(self) -> Dict[str, int]:
        """Row counts keyed by "header", "body", "footer" and "total" """
        header_total = len(self._header_rows)
        body_total = len(self._rows)
        footer_total = len(self._footer_rows)
        return {
            "header": header_total,
            "body": body_total,
            "footer": footer_total,
            "total": header_total + body_total + footer_total,
        }
|
||||||
|
|
||||||
|
|
||||||
|
class HorizontalRule(Block):
    """
    Horizontal rule (<hr>); carries no data beyond its block type.
    """

    def __init__(self):
        """Create a horizontal rule"""
        super().__init__(BlockType.HORIZONTAL_RULE)
|
||||||
|
|
||||||
|
|
||||||
|
class LineBreak(Block):
    """
    Line break (<br>); carries no data beyond its block type.
    """

    def __init__(self):
        """Create a line break"""
        super().__init__(BlockType.LINE_BREAK)
|
||||||
|
|
||||||
|
|
||||||
|
class Image(Block):
|
||||||
|
"""
|
||||||
|
An image element that can be displayed in a document.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, source: str, alt_text: Optional[str] = None,
|
||||||
|
width: Optional[int] = None, height: Optional[int] = None):
|
||||||
|
"""
|
||||||
|
Initialize an image.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
source: The path or URL to the image
|
||||||
|
alt_text: Alternative text description of the image
|
||||||
|
width: Optional width to display the image
|
||||||
|
height: Optional height to display the image
|
||||||
|
"""
|
||||||
|
super().__init__(BlockType.IMAGE)
|
||||||
|
self._source = source
|
||||||
|
self._alt_text = alt_text or ""
|
||||||
|
self._width = width
|
||||||
|
self._height = height
|
||||||
|
self._loaded_image = None
|
||||||
|
self._error = None
|
||||||
|
|
||||||
|
# Try to load the image immediately
|
||||||
|
self.load()
|
||||||
|
|
||||||
|
@property
|
||||||
|
def source(self) -> str:
|
||||||
|
"""Get the image source path or URL"""
|
||||||
|
return self._source
|
||||||
|
|
||||||
|
@source.setter
|
||||||
|
def source(self, source: str):
|
||||||
|
"""Set the image source path or URL"""
|
||||||
|
self._source = source
|
||||||
|
self._loaded_image = None # Reset loaded image when source changes
|
||||||
|
self._error = None
|
||||||
|
# Try to load the image with the new source
|
||||||
|
self.load()
|
||||||
|
|
||||||
|
@property
|
||||||
|
def alt_text(self) -> str:
|
||||||
|
"""Get the alternative text for the image"""
|
||||||
|
return self._alt_text
|
||||||
|
|
||||||
|
@alt_text.setter
|
||||||
|
def alt_text(self, alt_text: str):
|
||||||
|
"""Set the alternative text for the image"""
|
||||||
|
self._alt_text = alt_text
|
||||||
|
|
||||||
|
@property
|
||||||
|
def width(self) -> Optional[int]:
|
||||||
|
"""Get the specified width for the image"""
|
||||||
|
return self._width
|
||||||
|
|
||||||
|
@width.setter
|
||||||
|
def width(self, width: Optional[int]):
|
||||||
|
"""Set the specified width for the image"""
|
||||||
|
self._width = width
|
||||||
|
|
||||||
|
@property
|
||||||
|
def height(self) -> Optional[int]:
|
||||||
|
"""Get the specified height for the image"""
|
||||||
|
return self._height
|
||||||
|
|
||||||
|
@height.setter
|
||||||
|
def height(self, height: Optional[int]):
|
||||||
|
"""Set the specified height for the image"""
|
||||||
|
self._height = height
|
||||||
|
|
||||||
|
@property
|
||||||
|
def loaded_image(self):
|
||||||
|
"""Get the loaded image data, if available"""
|
||||||
|
return self._loaded_image
|
||||||
|
|
||||||
|
@property
|
||||||
|
def error(self) -> Optional[str]:
|
||||||
|
"""Get any error message from attempting to load the image"""
|
||||||
|
return self._error
|
||||||
|
|
||||||
|
def load(self):
|
||||||
|
"""
|
||||||
|
Load the image from the source.
|
||||||
|
This method handles loading from local files and URLs.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True if the image was loaded successfully, False otherwise
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
import os
|
||||||
|
from PIL import Image as PILImage
|
||||||
|
|
||||||
|
# Handle different types of sources
|
||||||
|
if os.path.isfile(self._source):
|
||||||
|
# Local file
|
||||||
|
self._loaded_image = PILImage.open(self._source)
|
||||||
|
self._error = None
|
||||||
|
return True
|
||||||
|
elif self._source.startswith(('http://', 'https://')):
|
||||||
|
# URL - requires requests library
|
||||||
|
try:
|
||||||
|
import requests
|
||||||
|
from io import BytesIO
|
||||||
|
|
||||||
|
response = requests.get(self._source, stream=True)
|
||||||
|
if response.status_code == 200:
|
||||||
|
self._loaded_image = PILImage.open(BytesIO(response.content))
|
||||||
|
self._error = None
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
self._error = f"Failed to load image: HTTP status {response.status_code}"
|
||||||
|
return False
|
||||||
|
except ImportError:
|
||||||
|
self._error = "Requests library not available for URL loading"
|
||||||
|
return False
|
||||||
|
except Exception as e:
|
||||||
|
self._error = f"Error loading image from URL: {str(e)}"
|
||||||
|
return False
|
||||||
|
elif self._source.startswith('data:image/'):
|
||||||
|
# Data URI
|
||||||
|
try:
|
||||||
|
import base64
|
||||||
|
from io import BytesIO
|
||||||
|
|
||||||
|
# Parse the data URI
|
||||||
|
# Format: data:image/png;base64,<base64-encoded-data>
|
||||||
|
header, encoded = self._source.split(',', 1)
|
||||||
|
mime_type = header.split(';')[0].split(':')[1]
|
||||||
|
|
||||||
|
# Decode the base64 data
|
||||||
|
decoded = base64.b64decode(encoded)
|
||||||
|
self._loaded_image = PILImage.open(BytesIO(decoded))
|
||||||
|
self._error = None
|
||||||
|
return True
|
||||||
|
except Exception as e:
|
||||||
|
self._error = f"Error loading image from data URI: {str(e)}"
|
||||||
|
return False
|
||||||
|
else:
|
||||||
|
self._error = f"Unable to load image from source: {self._source}"
|
||||||
|
return False
|
||||||
|
|
||||||
|
except ImportError as e:
|
||||||
|
self._error = f"PIL library not available: {str(e)}"
|
||||||
|
return False
|
||||||
|
except Exception as e:
|
||||||
|
self._error = f"Error loading image: {str(e)}"
|
||||||
|
return False
|
||||||
|
|
||||||
|
def get_dimensions(self) -> Tuple[Optional[int], Optional[int]]:
|
||||||
|
"""
|
||||||
|
Get the dimensions of the image.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A tuple of (width, height), or (None, None) if the image is not loaded
|
||||||
|
"""
|
||||||
|
if self._loaded_image:
|
||||||
|
return self._loaded_image.size
|
||||||
|
return self._width, self._height
|
||||||
|
|
||||||
|
def get_aspect_ratio(self) -> Optional[float]:
|
||||||
|
"""
|
||||||
|
Get the aspect ratio of the image (width/height).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
The aspect ratio as a float, or None if the image is not loaded
|
||||||
|
and no dimensions are specified
|
||||||
|
"""
|
||||||
|
if self._loaded_image:
|
||||||
|
width, height = self._loaded_image.size
|
||||||
|
if height > 0:
|
||||||
|
return width / height
|
||||||
|
elif self._width is not None and self._height is not None and self._height > 0:
|
||||||
|
return self._width / self._height
|
||||||
|
return None
|
||||||
|
|
||||||
|
def calculate_scaled_dimensions(self, max_width: Optional[int] = None,
                                max_height: Optional[int] = None) -> Tuple[int, int]:
    """
    Work out the size at which the image should be displayed.

    The result is resolved in this order:

    1. Both ``_width`` and ``_height`` are set: they are returned unchanged.
    2. No image is loaded: each stored dimension is used where it is truthy,
       otherwise the defaults 300 (width) and 200 (height) are substituted.
    3. Exactly one stored dimension is set: the other one is derived from the
       loaded image's aspect ratio and truncated to an int. If the loaded
       image reports a non-positive width or height there is no usable
       aspect ratio, so the missing dimension falls back to its default
       instead of raising ZeroDivisionError.
    4. No stored dimension is set: the loaded image's own size is used,
       shrunk (never enlarged) to fit ``max_width`` and then ``max_height``.

    Note that the max constraints are only applied in case 4.

    Args:
        max_width: The maximum width constraint
        max_height: The maximum height constraint

    Returns:
        A tuple of (width, height)
    """
    default_width, default_height = 300, 200

    # Use specified dimensions if both are available
    if self._width is not None and self._height is not None:
        return self._width, self._height

    # No loaded image: specified values where present, defaults otherwise
    if not self._loaded_image:
        return self._width or default_width, self._height or default_height

    orig_width, orig_height = self._loaded_image.size

    # Exactly one dimension is specified: derive the other from the image
    if self._width is not None or self._height is not None:
        if orig_width <= 0 or orig_height <= 0:
            # Degenerate image: the ratio would divide by zero, so treat the
            # image as if it provided no intrinsic size at all.
            return self._width or default_width, self._height or default_height

        aspect = orig_width / orig_height
        if self._width is not None:
            return self._width, int(self._width / aspect)
        return int(self._height * aspect), self._height

    # Nothing specified: start from the natural size and apply max constraints
    width, height = orig_width, orig_height

    if max_width is not None and width > max_width:
        height = int(height * (max_width / width))
        width = max_width

    if max_height is not None and height > max_height:
        width = int(width * (max_height / height))
        height = max_height

    return width, height
|
||||||
377
pyWebLayout/abstract/document.py
Normal file
377
pyWebLayout/abstract/document.py
Normal file
@ -0,0 +1,377 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
from typing import List, Dict, Optional, Tuple, Union, Any
|
||||||
|
from enum import Enum
|
||||||
|
from .block import Block, BlockType, Heading, HeadingLevel, Parapgraph
|
||||||
|
from .functional import Link, Button, Form
|
||||||
|
from .inline import Word, FormattedSpan
|
||||||
|
|
||||||
|
|
||||||
|
class MetadataType(Enum):
    """Keys identifying the kinds of metadata a document can carry."""

    # Descriptive metadata
    TITLE = 1
    AUTHOR = 2
    DESCRIPTION = 3
    KEYWORDS = 4
    LANGUAGE = 5

    # Publication details
    PUBLICATION_DATE = 6
    MODIFIED_DATE = 7
    PUBLISHER = 8
    IDENTIFIER = 9
    COVER_IMAGE = 10

    # Numbered well apart from the predefined members above
    CUSTOM = 100
|
||||||
|
|
||||||
|
|
||||||
|
class Document:
    """
    Logical model of a whole document, such as an HTML page or an ebook.

    A document holds its top-level blocks together with metadata, named
    anchors, resources, stylesheets and scripts. Nothing in this class deals
    with rendering.
    """

    def __init__(self, title: Optional[str] = None, language: str = "en-US"):
        """
        Create an empty document.

        Args:
            title: The document title; stored as TITLE metadata only when truthy
            language: The document language code; always stored as LANGUAGE metadata
        """
        self._blocks: List[Block] = []
        self._metadata: Dict[MetadataType, Any] = {}
        self._anchors: Dict[str, Block] = {}  # anchor name -> target block
        self._resources: Dict[str, Any] = {}  # resource name -> data (e.g. images)
        self._stylesheets: List[Dict[str, Any]] = []  # CSS stylesheets, in insertion order
        self._scripts: List[str] = []  # JavaScript code, in insertion order

        # Basic metadata: the title is optional, the language is always recorded
        if title:
            self.set_metadata(MetadataType.TITLE, title)
        self.set_metadata(MetadataType.LANGUAGE, language)

    @property
    def blocks(self) -> List[Block]:
        """The top-level blocks of this document (the internal list itself)."""
        return self._blocks

    def add_block(self, block: Block):
        """
        Append a block to the end of the document.

        Args:
            block: The block to add
        """
        self._blocks.append(block)

    def set_metadata(self, meta_type: MetadataType, value: Any):
        """
        Store a metadata value, replacing any value already stored for the type.

        Args:
            meta_type: The type of metadata
            value: The metadata value
        """
        self._metadata[meta_type] = value

    def get_metadata(self, meta_type: MetadataType) -> Optional[Any]:
        """
        Look up a metadata value.

        Args:
            meta_type: The type of metadata

        Returns:
            The stored value, or None if nothing is stored for that type
        """
        return self._metadata.get(meta_type)

    def add_anchor(self, name: str, target: Block):
        """
        Register a named anchor, replacing any anchor with the same name.

        Args:
            name: The anchor name
            target: The block the anchor points at
        """
        self._anchors[name] = target

    def get_anchor(self, name: str) -> Optional[Block]:
        """
        Look up a named anchor.

        Args:
            name: The anchor name

        Returns:
            The target block, or None if no such anchor exists
        """
        return self._anchors.get(name)

    def add_resource(self, name: str, resource: Any):
        """
        Register a resource, replacing any resource with the same name.

        Args:
            name: The resource name
            resource: The resource data
        """
        self._resources[name] = resource

    def get_resource(self, name: str) -> Optional[Any]:
        """
        Look up a resource.

        Args:
            name: The resource name

        Returns:
            The resource data, or None if no such resource exists
        """
        return self._resources.get(name)

    def add_stylesheet(self, stylesheet: Dict[str, Any]):
        """
        Append a stylesheet to the document.

        Args:
            stylesheet: The stylesheet data
        """
        self._stylesheets.append(stylesheet)

    def add_script(self, script: str):
        """
        Append a script to the document.

        Args:
            script: The script code
        """
        self._scripts.append(script)

    def get_title(self) -> Optional[str]:
        """
        Read the title from the document metadata.

        Returns:
            The document title, or None if not set
        """
        return self.get_metadata(MetadataType.TITLE)

    def set_title(self, title: str):
        """
        Store the title in the document metadata.

        Args:
            title: The document title
        """
        self.set_metadata(MetadataType.TITLE, title)

    def find_blocks_by_type(self, block_type: BlockType) -> List[Block]:
        """
        Collect every block, at any nesting depth, whose type matches.

        Blocks are visited depth-first: each block is examined before its
        children. Children are taken from a ``_blocks`` attribute when the
        block has one; otherwise from ``_items`` when that attribute exists
        and is a list.

        Args:
            block_type: The type of blocks to find

        Returns:
            The matching blocks, in visiting order
        """
        def _walk(blocks: List[Block]):
            for candidate in blocks:
                yield candidate

                # Child containers differ per block kind; `_blocks` wins over `_items`
                if hasattr(candidate, '_blocks'):
                    yield from _walk(candidate._blocks)
                elif hasattr(candidate, '_items') and isinstance(candidate._items, list):
                    yield from _walk(candidate._items)

        return [found for found in _walk(self._blocks)
                if found.block_type == block_type]

    def find_headings(self) -> List[Heading]:
        """
        Collect all headings in the document.

        Returns:
            Blocks of type HEADING that are also Heading instances
        """
        candidates = self.find_blocks_by_type(BlockType.HEADING)
        return [candidate for candidate in candidates if isinstance(candidate, Heading)]

    def generate_table_of_contents(self) -> List[Tuple[int, str, Block]]:
        """
        Build a table of contents from the document's headings.

        Returns:
            One (level, title, heading_block) tuple per heading, where the
            title is the heading's words joined by single spaces and the
            level is the numeric value of the heading's level enum
        """
        entries = []
        for heading in self.find_headings():
            # words() yields pairs; only the second element (the word) is used
            heading_text = " ".join(word.text for _, word in heading.words()).strip()
            entries.append((heading.level.value, heading_text, heading))
        return entries
|
||||||
|
|
||||||
|
|
||||||
|
class Chapter:
    """
    A chapter or section of a document.

    A chapter is an ordered list of blocks plus a title, a nesting level and
    free-form string-keyed metadata.
    """

    def __init__(self, title: Optional[str] = None, level: int = 1):
        """
        Create an empty chapter.

        Args:
            title: The chapter title
            level: The chapter level (1 = top level, 2 = subsection, etc.)
        """
        self._title = title
        self._level = level
        self._blocks: List[Block] = []
        self._metadata: Dict[str, Any] = {}

    @property
    def title(self) -> Optional[str]:
        """The chapter title, or None if the chapter is untitled."""
        return self._title

    @title.setter
    def title(self, title: str):
        """Replace the chapter title."""
        self._title = title

    @property
    def level(self) -> int:
        """The chapter level (read-only)."""
        return self._level

    @property
    def blocks(self) -> List[Block]:
        """The blocks of this chapter (the internal list itself)."""
        return self._blocks

    def add_block(self, block: Block):
        """
        Append a block to the end of the chapter.

        Args:
            block: The block to add
        """
        self._blocks.append(block)

    def set_metadata(self, key: str, value: Any):
        """
        Store a metadata value, replacing any value already stored for the key.

        Args:
            key: The metadata key
            value: The metadata value
        """
        self._metadata[key] = value

    def get_metadata(self, key: str) -> Optional[Any]:
        """
        Look up a metadata value.

        Args:
            key: The metadata key

        Returns:
            The stored value, or None if nothing is stored for that key
        """
        return self._metadata.get(key)
|
||||||
|
|
||||||
|
|
||||||
|
class Book(Document):
    """
    Logical model of an ebook.

    A book is a Document that additionally keeps an ordered list of chapters.
    """

    def __init__(self, title: Optional[str] = None, author: Optional[str] = None, language: str = "en-US"):
        """
        Create an empty book.

        Args:
            title: The book title
            author: The book author; stored as AUTHOR metadata only when truthy
            language: The book language code
        """
        super().__init__(title, language)
        self._chapters: List[Chapter] = []

        if author:
            self.set_metadata(MetadataType.AUTHOR, author)

    @property
    def chapters(self) -> List[Chapter]:
        """The chapters of this book (the internal list itself)."""
        return self._chapters

    def add_chapter(self, chapter: Chapter):
        """
        Append an existing chapter to the book.

        Args:
            chapter: The chapter to add
        """
        self._chapters.append(chapter)

    def create_chapter(self, title: Optional[str] = None, level: int = 1) -> Chapter:
        """
        Build a new chapter and append it to the book.

        Args:
            title: The chapter title
            level: The chapter level

        Returns:
            The newly created chapter
        """
        new_chapter = Chapter(title, level)
        self.add_chapter(new_chapter)
        return new_chapter

    def get_author(self) -> Optional[str]:
        """
        Read the author from the book metadata.

        Returns:
            The book author, or None if not set
        """
        return self.get_metadata(MetadataType.AUTHOR)

    def set_author(self, author: str):
        """
        Store the author in the book metadata.

        Args:
            author: The book author
        """
        self.set_metadata(MetadataType.AUTHOR, author)

    def generate_table_of_contents(self) -> List[Tuple[int, str, Chapter]]:
        """
        Build a table of contents from the book's chapters.

        Chapters whose title is falsy (None or empty) are left out.

        Returns:
            One (level, title, chapter) tuple per titled chapter, in book order
        """
        return [
            (entry.level, entry.title, entry)
            for entry in self._chapters
            if entry.title
        ]
|
||||||
310
pyWebLayout/abstract/functional.py
Normal file
310
pyWebLayout/abstract/functional.py
Normal file
@ -0,0 +1,310 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
from enum import Enum
|
||||||
|
from typing import Callable, Dict, Any, Optional, Union, List, Tuple
|
||||||
|
from pyWebLayout.base import Interactable
|
||||||
|
|
||||||
|
|
||||||
|
class LinkType(Enum):
    """Classification of links by what activating them is meant to do."""

    # Target lives in the same document (e.g., chapter references, bookmarks)
    INTERNAL = 1
    # Target is an external resource (e.g., websites, other documents)
    EXTERNAL = 2
    # Activation triggers an API call (e.g., for settings management)
    API = 3
    # Activation executes a specific function
    FUNCTION = 4
|
||||||
|
|
||||||
|
|
||||||
|
class Link(Interactable):
    """
    A link that either names a location to navigate to or runs a callback.

    Links cover navigation inside a document, navigation to external
    resources, and API/function triggers such as settings management.
    """

    def __init__(self,
                 location: str,
                 link_type: LinkType = LinkType.INTERNAL,
                 callback: Optional[Callable] = None,
                 params: Optional[Dict[str, Any]] = None,
                 title: Optional[str] = None):
        """
        Create a link.

        Args:
            location: The target location or identifier for this link
            link_type: The type of link (internal, external, API, function)
            callback: Optional callback function to execute when the link is activated
            params: Optional parameters to pass to the callback or API;
                a falsy value is replaced by a new empty dict
            title: Optional title/tooltip for the link
        """
        super().__init__(callback)
        self._location = location
        self._link_type = link_type
        self._params = params or {}
        self._title = title

    @property
    def location(self) -> str:
        """The target location of this link."""
        return self._location

    @property
    def link_type(self) -> LinkType:
        """The type of this link."""
        return self._link_type

    @property
    def params(self) -> Dict[str, Any]:
        """The parameters of this link (the internal dict itself)."""
        return self._params

    @property
    def title(self) -> Optional[str]:
        """The title/tooltip of this link."""
        return self._title

    def execute(self) -> Any:
        """
        Carry out the link action.

        API and FUNCTION links that have a callback invoke it with the
        location as the first argument and the params as keyword arguments.
        Every other link, including API/FUNCTION links without a callback,
        simply yields its location so the caller can handle navigation.

        Returns:
            The callback's return value, or the link location
        """
        runs_callback = self._link_type in (LinkType.API, LinkType.FUNCTION)
        if not (runs_callback and self._callback):
            # The renderer/browser is expected to handle the navigation
            return self._location
        return self._callback(self._location, **self._params)
|
||||||
|
|
||||||
|
|
||||||
|
class Button(Interactable):
    """
    A clickable button that runs a callback.

    Buttons behave much like function links but are rendered differently.
    """

    def __init__(self,
                 label: str,
                 callback: Callable,
                 params: Optional[Dict[str, Any]] = None,
                 enabled: bool = True):
        """
        Create a button.

        Args:
            label: The text label for the button
            callback: The function to execute when the button is clicked
            params: Optional parameters to pass to the callback;
                a falsy value is replaced by a new empty dict
            enabled: Whether the button is initially enabled
        """
        super().__init__(callback)
        self._label = label
        self._params = params or {}
        self._enabled = enabled

    @property
    def label(self) -> str:
        """The button label."""
        return self._label

    @label.setter
    def label(self, label: str):
        """Replace the button label."""
        self._label = label

    @property
    def enabled(self) -> bool:
        """Whether the button is currently enabled."""
        return self._enabled

    @enabled.setter
    def enabled(self, enabled: bool):
        """Enable or disable the button."""
        self._enabled = enabled

    def execute(self) -> Any:
        """
        Run the callback, passing the stored params as keyword arguments.

        Returns:
            The callback's return value, or None when the button is disabled
            or has no callback
        """
        if not (self._enabled and self._callback):
            return None
        return self._callback(**self._params)
|
||||||
|
|
||||||
|
|
||||||
|
class Form(Interactable):
    """
    A submittable collection of named input fields.

    Forms serve user input and settings configuration.
    """

    def __init__(self,
                 form_id: str,
                 action: Optional[str] = None,
                 callback: Optional[Callable] = None):
        """
        Create an empty form.

        Args:
            form_id: The unique identifier for this form
            action: The action URL or endpoint for form submission
            callback: Optional callback function to execute on form submission
        """
        super().__init__(callback)
        self._form_id = form_id
        self._action = action
        self._fields: Dict[str, FormField] = {}  # field name -> field

    @property
    def form_id(self) -> str:
        """The form ID."""
        return self._form_id

    @property
    def action(self) -> Optional[str]:
        """The form action."""
        return self._action

    def add_field(self, field: FormField):
        """
        Attach a field to this form.

        The field is stored under its name (replacing any field with the
        same name) and its ``form`` attribute is pointed back at this form.

        Args:
            field: The FormField to add
        """
        self._fields[field.name] = field
        field.form = self

    def get_field(self, name: str) -> Optional[FormField]:
        """
        Look up a field by name.

        Args:
            name: The name of the field to get

        Returns:
            The FormField with the specified name, or None if not found
        """
        return self._fields.get(name)

    def get_values(self) -> Dict[str, Any]:
        """
        Snapshot the current value of every field.

        Returns:
            A new dictionary mapping field names to their current values
        """
        snapshot = {}
        for field_name, field in self._fields.items():
            snapshot[field_name] = field.value
        return snapshot

    def execute(self) -> Any:
        """
        Submit the form.

        The field values are collected first; when a callback is set it is
        called with the form ID and those values.

        Returns:
            The callback's return value, or the field values if no callback is set
        """
        submitted = self.get_values()
        return self._callback(self._form_id, submitted) if self._callback else submitted
|
||||||
|
|
||||||
|
|
||||||
|
class FormFieldType(Enum):
    """The kinds of input field a form can contain."""

    # Free-text entry
    TEXT = 1
    PASSWORD = 2

    # Choice fields
    CHECKBOX = 3
    RADIO = 4
    SELECT = 5

    TEXTAREA = 6

    # Typed / specialised inputs
    NUMBER = 7
    DATE = 8
    TIME = 9
    EMAIL = 10
    URL = 11
    COLOR = 12
    RANGE = 13

    HIDDEN = 14
|
||||||
|
|
||||||
|
|
||||||
|
class FormField:
    """
    A single input field belonging to a form.
    """

    def __init__(self,
                 name: str,
                 field_type: FormFieldType,
                 label: Optional[str] = None,
                 value: Any = None,
                 required: bool = False,
                 options: Optional[List[Tuple[str, str]]] = None):
        """
        Create a form field.

        Args:
            name: The name of this field
            field_type: The type of this field
            label: Optional label for this field; the name is used when falsy
            value: Initial value for this field
            required: Whether this field is required
            options: Options for select, radio, or checkbox fields
                (list of (value, label) tuples); a falsy value becomes a new empty list
        """
        self._name = name
        self._field_type = field_type
        self._label = label or name
        self._value = value
        self._required = required
        self._options = options or []
        # Owning form; stays None until assigned through the `form` setter
        self._form: Optional[Form] = None

    @property
    def name(self) -> str:
        """The field name."""
        return self._name

    @property
    def field_type(self) -> FormFieldType:
        """The field type."""
        return self._field_type

    @property
    def label(self) -> str:
        """The field label."""
        return self._label

    @property
    def value(self) -> Any:
        """The current field value."""
        return self._value

    @value.setter
    def value(self, value: Any):
        """Replace the field value."""
        self._value = value

    @property
    def required(self) -> bool:
        """Whether the field is required."""
        return self._required

    @property
    def options(self) -> List[Tuple[str, str]]:
        """The field options (the internal list itself)."""
        return self._options

    @property
    def form(self) -> Optional[Form]:
        """The form containing this field, or None if it has no form yet."""
        return self._form

    @form.setter
    def form(self, form: Form):
        """Record the form containing this field."""
        self._form = form
|
||||||
208
pyWebLayout/abstract/inline.py
Normal file
208
pyWebLayout/abstract/inline.py
Normal file
@ -0,0 +1,208 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
from pyWebLayout.base import Queriable
|
||||||
|
from pyWebLayout.style import Font
|
||||||
|
from typing import Tuple, Union, List, Optional, Dict
|
||||||
|
|
||||||
|
|
||||||
|
class Word:
    """
    Logical representation of a single word in a document.

    A word may end up split across lines or pages when rendered; this class
    only tracks the text, its style, its neighbours in the word sequence and
    any hyphenation split, with no rendering specifics.
    """

    def __init__(self, text: str, style: Font, background=None, previous: Union[Word, None] = None):
        """
        Create a word.

        Args:
            text: The text content of the word
            style: Font style information for the word
            background: Optional background color override; when falsy the
                style's background is used
            previous: Reference to the previous word in sequence
        """
        self._text = text
        self._style = style
        self._background = background if background else style.background
        self._previous = previous
        self._next = None
        # Filled in by hyphenate(); None means "not hyphenated"
        self._hyphenated_parts = None

    @property
    def text(self) -> str:
        """The text content of the word."""
        return self._text

    @property
    def style(self) -> Font:
        """The font style of the word."""
        return self._style

    @property
    def background(self):
        """The background color of the word."""
        return self._background

    @property
    def previous(self) -> Union[Word, None]:
        """The previous word in sequence, if any."""
        return self._previous

    @property
    def next(self) -> Union[Word, None]:
        """The next word in sequence, if any."""
        return self._next

    @property
    def hyphenated_parts(self) -> Union[List[str], None]:
        """The hyphenated parts of the word, or None if it is not hyphenated."""
        return self._hyphenated_parts

    def add_next(self, next_word: Word):
        """Record the next word in sequence."""
        self._next = next_word

    def _insert_hyphens(self, language):
        """Return the word text with '-' inserted at pyphen's hyphenation points."""
        # Only import pyphen when needed
        import pyphen

        # An explicit (truthy) language wins over the style's language
        chosen_language = language or self._style.language
        dictionary = pyphen.Pyphen(lang=chosen_language)
        return dictionary.inserted(self._text, hyphen='-')

    def can_hyphenate(self, language: str = None) -> bool:
        """
        Check whether hyphenating the word would produce any '-'.

        Args:
            language: Language code for hyphenation. If None, uses the style's language.

        Returns:
            bool: True if the hyphen-inserted text contains a '-', False otherwise.
        """
        return '-' in self._insert_hyphens(language)

    def hyphenate(self, language: str = None) -> bool:
        """
        Hyphenate the word and remember the resulting parts.

        The hyphen-inserted text is split on '-'; every part except the last
        gets a trailing '-' appended.

        Args:
            language: Language code for hyphenation. If None, uses the style's language.

        Returns:
            bool: True if the word was hyphenated, False otherwise (in which
            case any previously stored parts are left untouched).
        """
        marked = self._insert_hyphens(language)

        # No hyphen anywhere means there is nothing to split
        if '-' not in marked:
            return False

        *leading, final = marked.split('-')
        self._hyphenated_parts = [piece + '-' for piece in leading] + [final]
        return True

    def dehyphenate(self):
        """Discard any stored hyphenation."""
        self._hyphenated_parts = None

    def get_hyphenated_part(self, index: int) -> str:
        """
        Fetch one hyphenated part of the word.

        Args:
            index: The index of the part to retrieve.

        Returns:
            The text of the specified part.

        Raises:
            IndexError: If the index is out of range or the word has not been hyphenated.
        """
        parts = self._hyphenated_parts
        if parts:
            return parts[index]
        raise IndexError("Word has not been hyphenated")

    def get_hyphenated_part_count(self) -> int:
        """
        Count the hyphenated parts.

        Returns:
            The number of parts, or 0 if the word has not been hyphenated.
        """
        if not self._hyphenated_parts:
            return 0
        return len(self._hyphenated_parts)
|
||||||
|
|
||||||
|
|
||||||
|
class FormattedSpan:
    """
    A run of consecutive words sharing one set of formatting.

    Every word created through the span receives the span's style and
    background.
    """

    def __init__(self, style: Font, background=None):
        """
        Create an empty span.

        Args:
            style: Font style information for all words in this span
            background: Optional background color override; when falsy the
                style's background is used
        """
        self._style = style
        self._background = background if background else style.background
        self._words: List[Word] = []

    @property
    def style(self) -> Font:
        """The font style of this span."""
        return self._style

    @property
    def background(self):
        """The background color of this span."""
        return self._background

    @property
    def words(self) -> List[Word]:
        """The words in this span (the internal list itself)."""
        return self._words

    def add_word(self, text: str) -> Word:
        """
        Create a word with this span's formatting and append it.

        The new word is linked both ways with the span's current last word,
        if there is one.

        Args:
            text: The text content of the word

        Returns:
            The newly created Word object
        """
        # Current tail of the span, if any
        predecessor = None
        if self._words:
            predecessor = self._words[-1]

        new_word = Word(text, self._style, self._background, predecessor)

        # The constructor only sets the backward link; add the forward one here
        if predecessor:
            predecessor.add_next(new_word)

        self._words.append(new_word)
        return new_word
|
||||||
68
pyWebLayout/base.py
Normal file
68
pyWebLayout/base.py
Normal file
@ -0,0 +1,68 @@
|
|||||||
|
|
||||||
|
from abc import ABC
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from pyWebLayout.style import Alignment
|
||||||
|
|
||||||
|
|
||||||
|
class Renderable(ABC):
    """
    Base class for every object that can draw itself into an image.

    Subclasses are expected to override render(); the implementation
    provided here draws nothing and returns None.
    """

    def render(self):
        """
        Produce the image for this object.

        Returns:
            PIL.Image: The rendered image (None in this base implementation)
        """
        return None
|
||||||
|
|
||||||
|
class Interactable(ABC):
    """
    Base class for objects that react to an interaction at a point.

    The reaction is delegated to an optional callback supplied at
    construction time.
    """

    def __init__(self, callback=None):
        """
        Store the interaction callback.

        Args:
            callback: The function to call when this object is interacted
                with; None means interactions are ignored
        """
        self._callback = callback

    def interact(self, point: np.generic):
        """
        Forward an interaction at point to the stored callback.

        Args:
            point: The coordinates of the interaction

        Returns:
            Whatever the callback returns for point, or None when no
            callback was registered
        """
        handler = self._callback
        return handler(point) if handler is not None else None
|
||||||
|
|
||||||
|
class Layoutable(ABC):
    """
    Base class for objects whose contents can be arranged.

    Subclasses are expected to override layout(); the implementation
    provided here does nothing.
    """

    def layout(self):
        """
        Arrange the object's contents.

        Intended to be called before rendering so that the contents are
        positioned properly. This base implementation is a no-op.
        """
        return None
|
||||||
|
|
||||||
|
class Queriable(ABC):
    """Base class for objects that can be hit-tested against a point."""

    def in_object(self, point: np.generic):
        """
        Check whether point lies inside the object.

        This base implementation performs no test and returns None;
        subclasses provide the actual geometry check.
        """
        return None
|
||||||
5
pyWebLayout/concrete/__init__.py
Normal file
5
pyWebLayout/concrete/__init__.py
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
from .box import Box
|
||||||
|
from .page import Container, Page
|
||||||
|
from .text import Text, RenderableWord, Line
|
||||||
|
from .functional import RenderableLink, RenderableButton, RenderableForm, RenderableFormField
|
||||||
|
from .image import RenderableImage
|
||||||
61
pyWebLayout/concrete/box.py
Normal file
61
pyWebLayout/concrete/box.py
Normal file
@ -0,0 +1,61 @@
|
|||||||
|
import numpy as np
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
from pyWebLayout.base import Renderable, Queriable
|
||||||
|
from pyWebLayout.layout import Alignment
|
||||||
|
|
||||||
|
class Box(Renderable, Queriable):
    """
    Rectangular region that renders itself to a PIL image.

    A box is described by an origin and a size. render() produces a blank
    canvas of that size; if the instance carries a `_content` attribute,
    the content's rendering is pasted onto the canvas according to the
    horizontal and vertical alignment.
    """

    def __init__(self, origin, size, callback=None, sheet: Image.Image = None, mode: str = None,
                 halign=Alignment.CENTER, valign=Alignment.CENTER):
        """
        Initialize a box.

        Args:
            origin: (x, y) coordinates of the box's origin corner
            size: (width, height) of the box
            callback: Optional callable stored on the box for interaction
            sheet: Optional image; when given, its mode is used for the canvas
            mode: Image mode used when no sheet is given (render() falls
                back to 'RGBA' if this is also unset)
            halign: Horizontal alignment of content inside the box
            valign: Vertical alignment of content inside the box
        """
        self._origin = np.array(origin)
        self._size = np.array(size)
        self._end = self._origin + self._size
        self._callback = callback
        self._sheet = sheet
        # Identity test instead of `== None`, which would be routed through
        # the image's own __eq__.
        self._mode = mode if sheet is None else sheet.mode
        self._halign = halign
        self._valign = valign

    @property
    def origin(self):
        """Origin of the box as a numpy array."""
        return self._origin

    @origin.setter
    def origin(self, value):
        self._origin = np.array(value)
        # Keep the cached far corner consistent with the new position.
        self._end = self._origin + self._size

    @property
    def size(self):
        """Size of the box as a numpy array (width, height)."""
        return self._size

    @size.setter
    def size(self, value):
        self._size = np.array(value)
        self._end = self._origin + self._size

    def in_shape(self, point):
        """
        Check whether point (or each point of an array of points) is inside the box.

        The origin edge is inclusive and the far edge is exclusive.

        Args:
            point: A coordinate pair, or an array whose last axis holds
                coordinate pairs

        Returns:
            A boolean (or boolean array) that is True where the point is inside
        """
        point = np.asarray(point)
        # The original read `self.origin`, an attribute that did not exist.
        return np.all((point >= self._origin) & (point < self._end), axis=-1)

    def in_object(self, point):
        """Queriable entry point; delegates to in_shape()."""
        return self.in_shape(point)

    @staticmethod
    def _aligned_offset(alignment, start, end, free_space):
        """Offset of content along one axis: 0 at start, free_space at end, centred otherwise."""
        if alignment == start:
            return 0
        if alignment == end:
            return free_space
        return free_space // 2

    def render(self) -> Image.Image:
        """
        Render the box to a new image.

        Returns:
            A PIL Image of the box's size, with `_content` (if present)
            pasted at the aligned position
        """
        # Plain ints: PIL is stricter about sizes/offsets than numpy scalars allow.
        box_width, box_height = (int(v) for v in self._size)

        if self._sheet is not None:
            canvas_mode = self._sheet.mode
        else:
            # Default to RGBA if neither a sheet nor a mode was provided
            canvas_mode = self._mode if self._mode else 'RGBA'
        canvas = Image.new(canvas_mode, (box_width, box_height))

        content = getattr(self, '_content', None)
        if content is not None:
            content_render = content.render()
            content_width, content_height = content_render.size

            x_offset = self._aligned_offset(
                self._halign, Alignment.LEFT, Alignment.RIGHT, box_width - content_width)
            y_offset = self._aligned_offset(
                self._valign, Alignment.TOP, Alignment.BOTTOM, box_height - content_height)

            canvas.paste(content_render, (x_offset, y_offset))

        return canvas
|
||||||
545
pyWebLayout/concrete/functional.py
Normal file
545
pyWebLayout/concrete/functional.py
Normal file
@ -0,0 +1,545 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
from typing import Optional, Dict, Any, Tuple, List, Union
|
||||||
|
import numpy as np
|
||||||
|
from PIL import Image, ImageDraw, ImageFont
|
||||||
|
|
||||||
|
from pyWebLayout.base import Renderable, Queriable
|
||||||
|
from pyWebLayout.abstract.functional import Link, Button, Form, FormField, LinkType, FormFieldType
|
||||||
|
from pyWebLayout.style import Font, TextDecoration
|
||||||
|
from .box import Box
|
||||||
|
from .text import Text
|
||||||
|
|
||||||
|
|
||||||
|
class RenderableLink(Box, Queriable):
    """
    A concrete implementation for rendering Link objects.

    The link text is drawn underlined and coloured according to the link
    type; a highlight is drawn behind the text while the link is hovered.
    """

    def __init__(self, link: Link, text: str, font: Font,
                 padding: Tuple[int, int, int, int] = (2, 4, 2, 4),
                 origin=None, size=None, callback=None, sheet=None, mode=None):
        """
        Initialize a renderable link.

        Args:
            link: The abstract Link object to render
            text: The text to display for the link
            font: The font to use for the link text
            padding: Padding as (top, right, bottom, left)
            origin: Optional origin coordinates (defaults to (0, 0))
            size: Optional size override (defaults to text size plus padding)
            callback: Optional callback override (defaults to link.execute)
            sheet: Optional sheet for rendering
            mode: Optional mode for rendering
        """
        # Create link style font (underlined, coloured by link type)
        link_font = font.with_decoration(TextDecoration.UNDERLINE)
        if link.link_type == LinkType.INTERNAL:
            link_font = link_font.with_colour((0, 0, 200))  # Blue for internal links
        elif link.link_type == LinkType.EXTERNAL:
            link_font = link_font.with_colour((0, 0, 180))  # Darker blue for external links
        elif link.link_type == LinkType.API:
            link_font = link_font.with_colour((150, 0, 0))  # Red for API links
        elif link.link_type == LinkType.FUNCTION:
            link_font = link_font.with_colour((0, 120, 0))  # Green for function links

        self._text_obj = Text(text, link_font)

        if size is None:
            text_width, text_height = self._text_obj.size
            size = (
                text_width + padding[1] + padding[3],   # width + right + left padding
                text_height + padding[0] + padding[2],  # height + top + bottom padding
            )

        if callback is None:
            callback = link.execute

        # `origin or (0, 0)` raised ValueError for numpy-array origins
        # (ambiguous truth value), so test against None explicitly.
        super().__init__(origin if origin is not None else (0, 0), size, callback, sheet, mode)

        self._link = link
        self._padding = padding
        self._hovered = False

    @property
    def link(self) -> Link:
        """Get the abstract Link object"""
        return self._link

    def render(self) -> Image.Image:
        """
        Render the link.

        Returns:
            A PIL Image containing the rendered link
        """
        canvas = super().render()

        # The highlight must go down BEFORE the text: ImageDraw overwrites
        # pixels rather than blending, so drawing it afterwards hid the text.
        if self._hovered:
            draw = ImageDraw.Draw(canvas)
            highlight_color = (220, 220, 255, 100)  # Light blue with alpha
            # Rectangle corners are inclusive, so the last pixel is size - 1.
            right = max(int(self._size[0]) - 1, 0)
            bottom = max(int(self._size[1]) - 1, 0)
            draw.rectangle([(0, 0), (right, bottom)], fill=highlight_color)

        text_x = self._padding[3]  # left padding
        text_y = self._padding[0]  # top padding

        text_img = self._text_obj.render()
        # The text image doubles as its own paste mask.
        canvas.paste(text_img, (text_x, text_y), text_img)

        return canvas

    def set_hovered(self, hovered: bool):
        """Set whether the link is being hovered over"""
        self._hovered = hovered

    def in_object(self, point):
        """
        Check if a point is within this link.

        Args:
            point: Coordinates in the same frame as the link's origin

        Returns:
            True if the point is inside the link's rectangle
        """
        relative_point = np.array(point) - self._origin
        return (0 <= relative_point[0] < self._size[0] and
                0 <= relative_point[1] < self._size[1])
|
||||||
|
|
||||||
|
|
||||||
|
class RenderableButton(Box, Queriable):
    """
    A concrete implementation for rendering Button objects.

    The button is drawn as a rounded rectangle whose colours depend on the
    enabled / pressed / hovered state, with the label centred inside it.
    """

    def __init__(self, button: Button, font: Font,
                 padding: Tuple[int, int, int, int] = (6, 10, 6, 10),
                 border_radius: int = 4,
                 origin=None, size=None, callback=None, sheet=None, mode=None):
        """
        Initialize a renderable button.

        Args:
            button: The abstract Button object to render
            font: The font to use for the button text
            padding: Padding as (top, right, bottom, left)
            border_radius: Radius for rounded corners
            origin: Optional origin coordinates (defaults to (0, 0))
            size: Optional size override (defaults to label size plus padding)
            callback: Optional callback override (defaults to button.execute)
            sheet: Optional sheet for rendering
            mode: Optional mode for rendering
        """
        self._text_obj = Text(button.label, font)

        if size is None:
            text_width, text_height = self._text_obj.size
            size = (
                text_width + padding[1] + padding[3],   # width + right + left padding
                text_height + padding[0] + padding[2],  # height + top + bottom padding
            )

        if callback is None:
            callback = button.execute

        # `origin or (0, 0)` raised ValueError for numpy-array origins.
        super().__init__(origin if origin is not None else (0, 0), size, callback, sheet, mode)

        self._button = button
        self._padding = padding
        self._border_radius = border_radius
        self._pressed = False
        self._hovered = False

    @property
    def button(self) -> Button:
        """Get the abstract Button object"""
        return self._button

    def render(self) -> Image.Image:
        """
        Render the button.

        Returns:
            A PIL Image containing the rendered button
        """
        canvas = super().render()
        draw = ImageDraw.Draw(canvas)

        # Pick (background, border) colours for the current state.
        # NOTE(review): the original also computed a per-state text colour
        # but never applied it; the label is drawn with the font's own colour.
        if not self._button.enabled:
            bg_color, border_color = (200, 200, 200), (150, 150, 150)   # disabled
        elif self._pressed:
            bg_color, border_color = (70, 130, 180), (50, 100, 150)     # pressed
        elif self._hovered:
            bg_color, border_color = (100, 160, 220), (70, 130, 180)    # hovered
        else:
            bg_color, border_color = (100, 150, 200), (70, 120, 170)    # normal

        width, height = int(self._size[0]), int(self._size[1])

        # Corners are inclusive: drawing to (width, height) pushed the right
        # and bottom border outside the canvas, so stop at size - 1.
        draw.rounded_rectangle(
            [(0, 0), (max(width - 1, 0), max(height - 1, 0))],
            fill=bg_color, outline=border_color, width=1,
            radius=self._border_radius)

        # Centre the label within the button
        text_img = self._text_obj.render()
        text_x = (width - text_img.width) // 2
        text_y = (height - text_img.height) // 2
        canvas.paste(text_img, (text_x, text_y), text_img)

        return canvas

    def set_pressed(self, pressed: bool):
        """Set whether the button is being pressed"""
        self._pressed = pressed

    def set_hovered(self, hovered: bool):
        """Set whether the button is being hovered over"""
        self._hovered = hovered

    def in_object(self, point):
        """
        Check if a point is within this button.

        Args:
            point: Coordinates in the same frame as the button's origin

        Returns:
            True if the point is inside the button's rectangle
        """
        relative_point = np.array(point) - self._origin
        return (0 <= relative_point[0] < self._size[0] and
                0 <= relative_point[1] < self._size[1])
|
||||||
|
|
||||||
|
|
||||||
|
class RenderableForm(Box):
    """
    A concrete implementation for rendering Form objects.

    The form stacks one RenderableFormField per form field vertically and
    places a horizontally centred "Submit" button underneath them.
    """

    def __init__(self, form: Form, font: Font,
                 field_padding: Tuple[int, int, int, int] = (5, 10, 5, 10),
                 spacing: int = 10,
                 origin=None, size=None, callback=None, sheet=None, mode=None):
        """
        Initialize a renderable form.

        Args:
            form: The abstract Form object to render
            font: The font to use for form text
            field_padding: Padding for form fields
            spacing: Spacing between form elements
            origin: Optional origin coordinates (defaults to (0, 0))
            size: Optional size override; when omitted the size is computed
                from the form elements
            callback: Optional callback override (defaults to form.execute)
            sheet: Optional sheet for rendering
            mode: Optional mode for rendering
        """
        if callback is None:
            callback = form.execute

        # Explicit None tests: `x or default` raises for numpy-array arguments.
        # The (400, 300) size is a placeholder, replaced below when no size is given.
        temp_size = size if size is not None else (400, 300)
        super().__init__(origin if origin is not None else (0, 0), temp_size, callback, sheet, mode)

        self._form = form
        self._font = font
        self._field_padding = field_padding
        self._spacing = spacing

        self._renderable_fields: List[RenderableFormField] = []
        self._submit_button: Optional[RenderableButton] = None

        self._create_form_elements()

        if size is None:
            self._calculate_size()

    def _create_form_elements(self):
        """Create renderable field objects for each form field, plus the submit button"""
        self._renderable_fields = [
            RenderableFormField(field, self._font, self._field_padding)
            for field in self._form._fields.values()
        ]

        submit_button = Button("Submit", self._form.execute)
        self._submit_button = RenderableButton(submit_button, self._font)

    def _calculate_size(self):
        """Calculate the size of the form based on its elements"""
        # Read `_size` directly: Box-derived elements expose no public
        # `size` attribute, so `element.size` raised AttributeError.
        fields = self._renderable_fields
        button = self._submit_button

        widths = [field._size[0] for field in fields]
        widths.append(button._size[0] if button is not None else 0)
        max_width = max(widths) + 20  # Add some padding

        total_height = sum(field._size[1] for field in fields)
        if fields:
            total_height += self._spacing * (len(fields) - 1)

        if button is not None:
            total_height += self._spacing + button._size[1]

        total_height += 20  # Add some padding

        self._size = np.array([max_width, total_height])
        # Keep the cached far corner in step with the new size.
        self._end = self._origin + self._size

    def layout(self):
        """Position the fields top-to-bottom and the submit button below them"""
        y_pos = 10  # Start with some padding

        for field in self._renderable_fields:
            field._origin = np.array([10, y_pos])
            field._end = field._origin + field._size
            y_pos += field._size[1] + self._spacing

        button = self._submit_button
        if button is not None:
            # Center the submit button horizontally
            submit_x = (self._size[0] - button._size[0]) // 2
            button._origin = np.array([submit_x, y_pos])
            button._end = button._origin + button._size

    def render(self) -> Image.Image:
        """
        Render the form.

        Returns:
            A PIL Image containing the rendered form
        """
        # Layout elements before rendering
        self.layout()

        canvas = super().render()

        for field in self._renderable_fields:
            field_img = field.render()
            field_pos = (int(field._origin[0]), int(field._origin[1]))
            canvas.paste(field_img, field_pos, field_img)

        button = self._submit_button
        if button is not None:
            button_img = button.render()
            button_pos = (int(button._origin[0]), int(button._origin[1]))
            canvas.paste(button_img, button_pos, button_img)

        return canvas

    def handle_click(self, point):
        """
        Handle a click on the form.

        Args:
            point: The coordinates of the click, in the same frame as the
                element origins assigned by layout() (presumably form-local
                coordinates; confirm against callers)

        Returns:
            The result of the clicked element's callback, or None if no element was clicked
        """
        # Element origins are only assigned by layout(); run it so a click
        # that arrives before the first render() is still hit-tested correctly.
        self.layout()
        point = np.asarray(point)

        button = self._submit_button
        if button is not None and button.in_object(point):
            return button._callback()

        for field in self._renderable_fields:
            if field.in_object(point):
                # Hand the field a point relative to its own origin.
                return field.handle_click(point - field._origin)

        return None
|
||||||
|
|
||||||
|
|
||||||
|
class RenderableFormField(Box, Queriable):
    """
    A concrete implementation for rendering FormField objects.

    A field is drawn as a label with an input rectangle below it; the
    current value (masked for password fields) is drawn inside the rectangle.
    """

    # Geometry shared by sizing, rendering and hit-testing.
    _LABEL_GAP = 5           # vertical gap between label and input box
    _TEXTAREA_HEIGHT = 80    # input box height for TEXTAREA fields
    _DEFAULT_HEIGHT = 24     # input box height for every other field type
    _WIDE_WIDTH = 200        # default input width for TEXTAREA / SELECT
    _DEFAULT_WIDTH = 150     # default input width for every other field type

    def __init__(self, field: FormField, font: Font,
                 padding: Tuple[int, int, int, int] = (5, 10, 5, 10),
                 origin=None, size=None, callback=None, sheet=None, mode=None):
        """
        Initialize a renderable form field.

        Args:
            field: The abstract FormField object to render
            font: The font to use for field text
            padding: Padding as (top, right, bottom, left)
            origin: Optional origin coordinates (defaults to (0, 0))
            size: Optional size override (defaults to label + input box + padding)
            callback: Optional callback override
            sheet: Optional sheet for rendering
            mode: Optional mode for rendering
        """
        self._label_text = Text(field.label, font)

        if size is None:
            label_width, label_height = self._label_text.size

            if field.field_type in (FormFieldType.TEXTAREA, FormFieldType.SELECT):
                field_width = self._WIDE_WIDTH
            else:
                field_width = self._DEFAULT_WIDTH
            field_height = self._input_height(field.field_type)

            size = (
                max(label_width, field_width) + padding[1] + padding[3],
                label_height + field_height + padding[0] + padding[2] + self._LABEL_GAP,
            )

        # `origin or (0, 0)` raised ValueError for numpy-array origins.
        super().__init__(origin if origin is not None else (0, 0), size, callback, sheet, mode)

        self._field = field
        self._font = font
        self._padding = padding
        self._focused = False

    @classmethod
    def _input_height(cls, field_type):
        """Height of the input box for the given field type."""
        if field_type == FormFieldType.TEXTAREA:
            return cls._TEXTAREA_HEIGHT
        return cls._DEFAULT_HEIGHT

    def _input_box(self, label_height):
        """Return (x, y, width, height) of the input box in field-local coordinates."""
        x = self._padding[3]
        y = self._padding[0] + label_height + self._LABEL_GAP
        # int() keeps numpy scalars out of the PIL drawing calls.
        width = int(self._size[0]) - self._padding[1] - self._padding[3]
        height = self._input_height(self._field.field_type)
        return x, y, width, height

    def render(self) -> Image.Image:
        """
        Render the form field.

        Returns:
            A PIL Image containing the rendered form field
        """
        canvas = super().render()
        draw = ImageDraw.Draw(canvas)

        # Label goes in the top-left corner, inside the padding
        label_img = self._label_text.render()
        canvas.paste(label_img, (self._padding[3], self._padding[0]), label_img)

        field_x, field_y, field_width, field_height = self._input_box(label_img.height)

        bg_color = (255, 255, 255)
        # A focused field gets a blue border instead of the grey one.
        border_color = (100, 150, 200) if self._focused else (200, 200, 200)

        draw.rectangle(
            [(field_x, field_y), (field_x + field_width, field_y + field_height)],
            fill=bg_color, outline=border_color, width=1
        )

        if self._field.value is not None:
            value_text = str(self._field.value)

            # For password fields, mask the text
            if self._field.field_type == FormFieldType.PASSWORD:
                value_text = "•" * len(value_text)

            value_img = Text(value_text, self._font).render()

            # Left-aligned with a small inset, vertically centred in the box
            value_x = field_x + 5
            value_y = field_y + (field_height - value_img.height) // 2
            canvas.paste(value_img, (value_x, value_y), value_img)

        return canvas

    def set_focused(self, focused: bool):
        """Set whether the field is focused"""
        self._focused = focused

    def handle_click(self, point):
        """
        Handle a click on the field.

        Args:
            point: The coordinates of the click relative to the field

        Returns:
            True if the click landed on the input box (the field is then
            focused), False otherwise
        """
        field_x, field_y, field_width, field_height = self._input_box(self._label_text.size[1])

        inside = (field_x <= point[0] <= field_x + field_width and
                  field_y <= point[1] <= field_y + field_height)
        if inside:
            self.set_focused(True)
            return True

        return False

    def in_object(self, point):
        """
        Check if a point is within this field.

        Args:
            point: Coordinates in the same frame as the field's origin

        Returns:
            True if the point is inside the field's rectangle
        """
        relative_point = np.array(point) - self._origin
        return (0 <= relative_point[0] < self._size[0] and
                0 <= relative_point[1] < self._size[1])
|
||||||
233
pyWebLayout/concrete/image.py
Normal file
233
pyWebLayout/concrete/image.py
Normal file
@ -0,0 +1,233 @@
|
|||||||
|
import os
|
||||||
|
from typing import Optional, Tuple, Union, Dict, Any
|
||||||
|
import numpy as np
|
||||||
|
from PIL import Image as PILImage, ImageDraw, ImageFont
|
||||||
|
|
||||||
|
from pyWebLayout.base import Renderable, Queriable
|
||||||
|
from pyWebLayout.abstract.block import Image as AbstractImage
|
||||||
|
from .box import Box
|
||||||
|
from pyWebLayout.layout import Alignment
|
||||||
|
|
||||||
|
|
||||||
|
class RenderableImage(Box, Queriable):
|
||||||
|
"""
|
||||||
|
A concrete implementation for rendering Image objects.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, image: AbstractImage,
             max_width: Optional[int] = None, max_height: Optional[int] = None,
             origin=None, size=None, callback=None, sheet=None, mode=None,
             halign=Alignment.CENTER, valign=Alignment.CENTER):
    """
    Initialize a renderable image.

    The image source is loaded immediately; a load failure is recorded
    rather than raised.

    Args:
        image: The abstract Image object to render
        max_width: Maximum width constraint for the image
        max_height: Maximum height constraint for the image
        origin: Optional origin coordinates (defaults to (0, 0))
        size: Optional size override; when omitted it is taken from
            image.calculate_scaled_dimensions(max_width, max_height)
        callback: Optional callback function
        sheet: Optional sheet for rendering
        mode: Optional image mode
        halign: Horizontal alignment
        valign: Vertical alignment
    """
    self._abstract_image = image
    self._pil_image = None
    self._error_message = None

    # Try to load the image
    self._load_image()

    if size is None:
        size = image.calculate_scaled_dimensions(max_width, max_height)

    # `origin or (0, 0)` raised ValueError for numpy-array origins
    # (ambiguous truth value), so test against None explicitly.
    box_origin = origin if origin is not None else (0, 0)
    super().__init__(box_origin, size, callback, sheet, mode, halign, valign)
|
||||||
|
|
||||||
|
def _load_image(self):
    """
    Load the image from the abstract image's source.

    The source may be a local file path or an http(s) URL. On success the
    PIL image is stored on this object and on the abstract image; on any
    failure an error message is stored on both instead. Nothing is raised.
    """
    try:
        source = self._abstract_image.source

        if os.path.isfile(source):
            # Local file
            self._pil_image = PILImage.open(source)
        elif source.startswith(('http://', 'https://')):
            # URL - requires the optional requests library
            try:
                import requests
                from io import BytesIO
            except ImportError:
                self._error_message = "Requests library not available for URL loading"
            else:
                # Without a timeout an unresponsive server blocks forever.
                # The body is read in full below, so streaming is not needed.
                response = requests.get(source, timeout=10)
                if response.status_code == 200:
                    self._pil_image = PILImage.open(BytesIO(response.content))
                else:
                    self._error_message = f"Failed to load image: HTTP status {response.status_code}"
        else:
            self._error_message = f"Unable to load image from source: {source}"

    except Exception as e:
        # Deliberate best-effort boundary: a broken image must not break
        # rendering, it is replaced by an error placeholder instead.
        self._error_message = f"Error loading image: {str(e)}"

    if self._pil_image is not None:
        self._abstract_image._loaded_image = self._pil_image

    # Mirror the failure onto the abstract image for every error path,
    # not only for raised exceptions.
    if self._error_message is not None:
        self._abstract_image._error = self._error_message
|
||||||
|
|
||||||
|
def render(self) -> PILImage.Image:
    """
    Render the image.

    The loaded image is scaled to fit the box and pasted at the position
    given by the horizontal and vertical alignment. If no image could be
    loaded, an error placeholder is drawn instead.

    Returns:
        A PIL Image containing the rendered image
    """
    canvas = super().render()

    if self._pil_image is None:
        self._draw_error_placeholder(canvas)
        return canvas

    # Resize the image to fit the box while maintaining aspect ratio
    resized_image = self._resize_image()

    img_width, img_height = resized_image.size
    # Plain ints so the paste offsets are not numpy scalars.
    free_x = int(self._size[0]) - img_width
    free_y = int(self._size[1]) - img_height

    # Horizontal alignment (CENTER is the default)
    if self._halign == Alignment.LEFT:
        x_offset = 0
    elif self._halign == Alignment.RIGHT:
        x_offset = free_x
    else:
        x_offset = free_x // 2

    # Vertical alignment (CENTER is the default)
    if self._valign == Alignment.TOP:
        y_offset = 0
    elif self._valign == Alignment.BOTTOM:
        y_offset = free_y
    else:
        y_offset = free_y // 2

    # Use the image's alpha band as paste mask whenever it has one. The
    # original only did so for RGBA canvases, so on any other canvas mode
    # transparent pixels were pasted as opaque.
    if resized_image.mode in ('RGBA', 'LA'):
        canvas.paste(resized_image, (x_offset, y_offset), resized_image)
    else:
        canvas.paste(resized_image, (x_offset, y_offset))

    return canvas
|
||||||
|
|
||||||
|
def _resize_image(self) -> PILImage.Image:
    """
    Resize the image to fit within the box while maintaining aspect ratio.

    Returns:
        An RGBA PIL Image scaled to fit the box, or a translucent grey
        placeholder of the box's size when no image is loaded
    """
    # PILImage.new() only accepts a tuple/list size, not a numpy array.
    target_width, target_height = int(self._size[0]), int(self._size[1])

    if self._pil_image is None:
        return PILImage.new('RGBA', (target_width, target_height), (200, 200, 200, 100))

    orig_width, orig_height = self._pil_image.size

    # Use the smaller ratio to ensure the image fits within the box
    ratio = min(target_width / orig_width, target_height / orig_height)

    # Clamp to 1px: truncation can yield 0 for extreme aspect ratios,
    # and resize() rejects empty dimensions.
    new_size = (
        max(1, int(orig_width * ratio)),
        max(1, int(orig_height * ratio)),
    )

    # Always hand back RGBA so callers can rely on an alpha band.
    source = self._pil_image
    if source.mode != 'RGBA':
        source = source.convert('RGBA')

    return source.resize(new_size, PILImage.LANCZOS)
|
||||||
|
|
||||||
|
def _draw_error_placeholder(self, canvas: PILImage.Image):
    """
    Draw a placeholder for when the image can't be loaded.

    The placeholder is a light gray bordered box with an X across it. If
    an error message is set, it is word-wrapped to the box width (minus
    10px padding on each side) and drawn horizontally centered, starting
    10px from the top.

    Args:
        canvas: The canvas to draw on
    """
    draw = ImageDraw.Draw(canvas)
    box_width, box_height = self._size[0], self._size[1]
    border_colour = (180, 180, 180)

    # Draw a gray box with a border
    draw.rectangle([(0, 0), (box_width, box_height)],
                   fill=(240, 240, 240), outline=border_colour, width=2)

    # Draw an X across the box
    draw.line([(0, 0), (box_width, box_height)], fill=border_colour, width=2)
    draw.line([(0, box_height), (box_width, 0)], fill=border_colour, width=2)

    # Add error text if available
    if not self._error_message:
        return

    try:
        font = ImageFont.load_default()
        max_text_width = box_width - 20  # 10px padding on each side

        # Simple greedy word wrapping
        lines = []
        current_line = ""
        for word in ("Error: " + self._error_message).split():
            candidate = f"{current_line} {word}" if current_line else word
            bbox = draw.textbbox((0, 0), candidate, font=font)
            if bbox[2] - bbox[0] <= max_text_width:
                current_line = candidate
                continue
            # Only flush a non-empty line: when the very first word is
            # already too wide there is nothing to flush, and appending ""
            # would render a blank leading line.
            if current_line:
                lines.append(current_line)
            current_line = word

        if current_line:
            lines.append(current_line)

        # Draw each line, centered horizontally
        y_pos = 10
        for line in lines:
            bbox = draw.textbbox((0, 0), line, font=font)
            text_width = bbox[2] - bbox[0]
            text_height = bbox[3] - bbox[1]
            x_pos = (box_width - text_width) // 2
            draw.text((x_pos, y_pos), line, fill=(80, 80, 80), font=font)
            y_pos += text_height + 2
    except Exception:
        # Deliberately best-effort: the box and X drawn above already signal
        # the failure, so a text rendering problem must not raise from here.
        pass
|
||||||
|
|
||||||
|
def in_object(self, point):
    """
    Check if a point is within this image.

    Args:
        point: Coordinates in the same frame as this object's origin

    Returns:
        A truthy value when the point lies inside the box spanned by the
        origin and size (left/top edges inclusive, right/bottom exclusive)
    """
    # Translate the point so the origin of this object becomes (0, 0)
    offset = np.array(point) - self._origin

    within_columns = 0 <= offset[0] < self._size[0]
    return within_columns and 0 <= offset[1] < self._size[1]
|
||||||
175
pyWebLayout/concrete/page.py
Normal file
175
pyWebLayout/concrete/page.py
Normal file
@ -0,0 +1,175 @@
|
|||||||
|
from typing import List, Tuple, Optional, Dict, Any
|
||||||
|
import numpy as np
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
from pyWebLayout.base import Renderable, Layoutable
|
||||||
|
from .box import Box
|
||||||
|
from pyWebLayout.layout import Alignment
|
||||||
|
|
||||||
|
|
||||||
|
class Container(Box, Layoutable):
    """
    A container that can hold multiple renderable objects and lay them out.

    Children are stacked along one axis ('vertical' or 'horizontal') with a
    fixed spacing between them, and aligned on the other axis according to
    the container's horizontal or vertical alignment. The origins assigned
    to children by layout() are relative to the container's top-left corner.
    """

    def __init__(self, origin, size, direction='vertical', spacing=5,
                 callback=None, sheet=None, mode=None,
                 halign=Alignment.CENTER, valign=Alignment.CENTER,
                 padding: Tuple[int, int, int, int] = (10, 10, 10, 10)):
        """
        Initialize a container.

        Args:
            origin: Top-left corner coordinates
            size: Width and height of the container
            direction: Layout direction ('vertical' or 'horizontal')
            spacing: Space between elements
            callback: Optional callback function
            sheet: Optional image sheet
            mode: Optional image mode
            halign: Horizontal alignment
            valign: Vertical alignment
            padding: Padding as (top, right, bottom, left)
        """
        super().__init__(origin, size, callback, sheet, mode, halign, valign)
        self._children: List[Renderable] = []
        self._direction = direction
        self._spacing = spacing
        self._padding = padding

    def add_child(self, child: Renderable):
        """
        Add a child element to this container.

        Returns:
            The container itself, so calls can be chained
        """
        self._children.append(child)
        return self

    @staticmethod
    def _aligned_offset(alignment, near, far, start, available, extent):
        """
        Return the cross-axis position of a child.

        Args:
            alignment: The alignment in effect on this axis
            near: Alignment value meaning "flush with the start" (LEFT/TOP)
            far: Alignment value meaning "flush with the end" (RIGHT/BOTTOM)
            start: Offset of the padded area on this axis
            available: Extent of the padded area on this axis
            extent: Extent of the child on this axis
        """
        if alignment == near:
            return start
        if alignment == far:
            return start + available - extent
        # Anything else is treated as CENTER
        return start + (available - extent) // 2

    def layout(self):
        """
        Layout the children according to the container's direction and spacing.

        Children lacking a `_size` or `_origin` attribute are skipped and do
        not consume space. Positioned children that are Layoutable are laid
        out recursively.
        """
        if not self._children:
            return

        # Get available space after padding
        padding_top, padding_right, padding_bottom, padding_left = self._padding
        available_width = self._size[0] - padding_left - padding_right
        available_height = self._size[1] - padding_top - padding_bottom

        vertical = self._direction == 'vertical'
        # Running position along the main axis
        cursor = padding_top if vertical else padding_left

        for child in self._children:
            if not (hasattr(child, '_size') and hasattr(child, '_origin')):
                continue

            child_width, child_height = child._size

            if vertical:
                x_pos = self._aligned_offset(self._halign, Alignment.LEFT, Alignment.RIGHT,
                                             padding_left, available_width, child_width)
                child._origin = np.array([x_pos, cursor])
                cursor += child_height + self._spacing
            else:
                y_pos = self._aligned_offset(self._valign, Alignment.TOP, Alignment.BOTTOM,
                                             padding_top, available_height, child_height)
                child._origin = np.array([cursor, y_pos])
                cursor += child_width + self._spacing

            # Layout the child if it's layoutable
            if isinstance(child, Layoutable):
                child.layout()

    def render(self) -> Image.Image:
        """
        Render the container with all its children.

        Returns:
            The container's base canvas with every child that has an
            `_origin` pasted at that origin
        """
        # Make sure children are laid out
        self.layout()

        # Create base canvas
        canvas = super().render()

        for child in self._children:
            if not hasattr(child, '_origin'):
                continue

            child_img = child.render()

            # layout() stores origins relative to this container, which is
            # also the coordinate frame of the canvas. Subtracting
            # self._origin again would shift children whenever the
            # container itself is not at (0, 0).
            position = tuple(int(value) for value in child._origin)

            # Only images carrying an alpha channel can act as their own mask
            if child_img.mode in ('RGBA', 'LA'):
                canvas.paste(child_img, position, child_img)
            else:
                canvas.paste(child_img, position)

        return canvas
|
||||||
|
|
||||||
|
|
||||||
|
class Page(Container):
    """
    Top-level container representing an HTML page.

    A page is a vertical container anchored at (0, 0) with 10px spacing,
    horizontally centered and top-aligned content, drawn over a solid
    background color.
    """

    def __init__(self, size=(800, 600), background_color=(255, 255, 255), mode='RGBA'):
        """
        Initialize a page.

        Args:
            size: Width and height of the page
            background_color: Background color as RGB tuple
            mode: Image mode
        """
        super().__init__(
            origin=(0, 0),
            size=size,
            direction='vertical',
            spacing=10,
            mode=mode,
            halign=Alignment.CENTER,
            valign=Alignment.TOP
        )
        self._background_color = background_color

    def render(self) -> Image.Image:
        """
        Render the page with all its content.

        Returns:
            A new image in the page's mode, filled with the background
            color, with every child that has an `_origin` pasted on top
        """
        # Make sure children are laid out
        self.layout()

        # Create base canvas with background color
        canvas = Image.new(self._mode, tuple(self._size), self._background_color)

        for child in self._children:
            if not hasattr(child, '_origin'):
                continue

            child_img = child.render()
            # The page sits at (0, 0), so child origins are page coordinates
            position = tuple(int(value) for value in child._origin)

            # Use the child's alpha as the paste mask whenever it has one,
            # regardless of the page mode. Without the mask, transparent
            # regions of the child would overwrite the background on pages
            # that have no alpha channel themselves.
            if child_img.mode in ('RGBA', 'LA'):
                canvas.paste(child_img, position, child_img)
            else:
                canvas.paste(child_img, position)

        return canvas
|
||||||
455
pyWebLayout/concrete/text.py
Normal file
455
pyWebLayout/concrete/text.py
Normal file
@ -0,0 +1,455 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
from pyWebLayout.base import Renderable, Queriable
|
||||||
|
from .box import Box
|
||||||
|
from pyWebLayout.layout import Alignment
|
||||||
|
from pyWebLayout.style import Font, FontStyle, FontWeight, TextDecoration
|
||||||
|
from pyWebLayout.abstract.inline import Word
|
||||||
|
from PIL import Image, ImageDraw, ImageFont
|
||||||
|
from typing import Tuple, Union, List, Optional
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
class Text(Renderable, Queriable):
    """
    Concrete, renderable fragment of text.

    A Text pairs a string with a font style, measures itself on
    construction and can draw itself onto a transparent RGBA image.
    """

    def __init__(self, text: str, style: Font):
        """
        Create a text fragment and measure it.

        Args:
            text: The text content to render
            style: The font style to use for rendering
        """
        super().__init__()
        self._text = text
        self._style = style

        # Links to the surrounding structure; unset until assigned
        self._line = None
        self._previous = None
        self._next = None

        self._origin = np.array([0, 0])
        self._calculate_dimensions()

    def _calculate_dimensions(self):
        """
        Measure the text with the style's font and store width, height and size.

        NOTE(review): render() draws at (0, 0) on a canvas sized from the
        tight bounding box; if the font reports a non-zero left/top offset
        some glyph pixels may fall outside the canvas - confirm.
        """
        font = self._style.font

        try:
            # The bounding box is (left, top, right, bottom)
            bbox = font.getbbox(self._text)
        except AttributeError:
            # Fallback for older PIL versions without getbbox
            self._width, self._height = font.getsize(self._text)
        else:
            self._width = bbox[2] - bbox[0]
            self._height = bbox[3] - bbox[1]

        self._size = (self._width, self._height)

    @property
    def text(self) -> str:
        """The text content"""
        return self._text

    @property
    def style(self) -> Font:
        """The font style used for this text"""
        return self._style

    @property
    def line(self) -> Optional[Line]:
        """The line containing this text, if any"""
        return self._line

    @line.setter
    def line(self, line):
        """Assign the line containing this text"""
        self._line = line

    @property
    def width(self) -> int:
        """The measured width of the text"""
        return self._width

    @property
    def height(self) -> int:
        """The measured height of the text"""
        return self._height

    @property
    def size(self) -> Tuple[int, int]:
        """The measured size as (width, height)"""
        return self._size

    def set_origin(self, x: int, y: int):
        """Move the top-left corner of this text element to (x, y)"""
        self._origin = np.array([x, y])

    def add_to_line(self, line):
        """Record the line this text belongs to"""
        self._line = line

    def _apply_decoration(self, draw: ImageDraw.Draw):
        """
        Draw the underline or strikethrough requested by the style, if any.

        Args:
            draw: The drawing context of the canvas holding the text
        """
        decoration = self._style.decoration

        # Vertical placement of the rule as a fraction of the text height
        if decoration == TextDecoration.UNDERLINE:
            height_fraction = 0.9
        elif decoration == TextDecoration.STRIKETHROUGH:
            height_fraction = 0.5
        else:
            return

        y_position = int(self._height * height_fraction)
        thickness = max(1, int(self._style.font_size / 15))
        draw.line([(0, y_position), (self._width, y_position)],
                  fill=self._style.colour, width=thickness)

    def render(self) -> Image.Image:
        """
        Render the text to an image.

        Returns:
            An RGBA PIL Image of the measured size containing the optional
            background, the text and any decoration
        """
        canvas = Image.new('RGBA', self._size, (0, 0, 0, 0))
        draw = ImageDraw.Draw(canvas)

        # Fill the background only when one is set and it is not fully transparent
        background = self._style.background
        if background and background[3] > 0:
            draw.rectangle([(0, 0), self._size], fill=background)

        draw.text((0, 0), self._text, font=self._style.font, fill=self._style.colour)
        self._apply_decoration(draw)

        return canvas

    def get_size(self) -> Tuple[int, int]:
        """Return the measured size as (width, height)"""
        return self._size

    def in_object(self, point):
        """
        Check if a point is within this text object.

        Args:
            point: Coordinates in the same frame as this object's origin

        Returns:
            A truthy value when the point lies inside the text's box
        """
        offset = np.array(point) - self._origin

        within_columns = 0 <= offset[0] < self._width
        return within_columns and 0 <= offset[1] < self._height
|
||||||
|
class RenderableWord(Renderable, Queriable):
    """
    A concrete implementation for rendering Word objects.

    This bridges between the abstract Word class and rendering capabilities.
    A word is rendered from one Text part, or from one Text part per
    hyphenated fragment laid out side by side.
    """

    def __init__(self, word: Word):
        """
        Initialize a new renderable word.

        Args:
            word: The abstract Word object to render
        """
        super().__init__()
        self._word = word
        self._text_parts: List[Text] = []
        self._origin = np.array([0, 0])
        self._size = (0, 0)

        # Initialize with the full word as a single text part
        self._initialize_text_parts()

    def _initialize_text_parts(self):
        """Rebuild the text parts from the word's current state and re-measure"""
        # Clear in place so references handed out via text_parts stay valid
        self._text_parts.clear()

        if self._word.hyphenated_parts:
            # One Text object per hyphenated fragment
            for part in self._word.hyphenated_parts:
                self._text_parts.append(Text(part, self._word.style))
        else:
            # A single Text object for the whole word
            self._text_parts.append(Text(self._word.text, self._word.style))

        self._recalculate_size()

    def _recalculate_size(self):
        """Recalculate the size of the word based on its text parts"""
        if not self._text_parts:
            self._size = (0, 0)
            return

        # For a non-hyphenated word, use the size of the single text part
        if len(self._text_parts) == 1:
            self._size = self._text_parts[0].size
            return

        # Parts sit side by side: widths add up, the tallest part sets the height
        total_width = sum(part.width for part in self._text_parts)
        max_height = max(part.height for part in self._text_parts)
        self._size = (total_width, max_height)

    @property
    def word(self) -> Word:
        """Get the abstract Word object"""
        return self._word

    @property
    def text_parts(self) -> List[Text]:
        """Get the list of Text objects that make up this word"""
        return self._text_parts

    def update_from_word(self):
        """Update the text parts based on changes to the word"""
        self._initialize_text_parts()

    def get_part_size(self, index: int) -> Tuple[int, int]:
        """
        Get the size of a specific text part.

        Args:
            index: The index of the part to query.

        Returns:
            A tuple (width, height) of the part.

        Raises:
            IndexError: If the index is out of range.
        """
        if index >= len(self._text_parts):
            raise IndexError(f"Part index {index} out of range")

        return self._text_parts[index].size

    @property
    def width(self) -> int:
        """Get the total width of the word"""
        return self._size[0]

    @property
    def height(self) -> int:
        """Get the height of the word"""
        return self._size[1]

    def set_origin(self, x: int, y: int):
        """
        Set the origin (top-left corner) of this word.

        The text parts are repositioned left to right starting at (x, y).
        """
        self._origin = np.array([x, y])

        x_offset = 0
        for part in self._text_parts:
            part.set_origin(x + x_offset, y)
            x_offset += part.width

    def render(self) -> Image.Image:
        """
        Render the word to an image.

        Returns:
            A PIL Image containing the rendered word
        """
        # With a single part the part's own image is the word's image
        if len(self._text_parts) == 1:
            return self._text_parts[0].render()

        # Otherwise paste all parts side by side on a transparent canvas
        canvas = Image.new('RGBA', self._size, (0, 0, 0, 0))

        x_offset = 0
        for part in self._text_parts:
            part_img = part.render()
            canvas.paste(part_img, (x_offset, 0), part_img)
            x_offset += part.width

        return canvas

    def in_object(self, point):
        """
        Check if a point is within this word.

        Args:
            point: Coordinates in the same frame as this word's origin

        Returns:
            True when the point lies inside the word's bounding box and
            inside the part that covers its horizontal position
        """
        relative_point = np.array(point) - self._origin

        # First check the word's overall bounding box
        if not (0 <= relative_point[0] < self._size[0] and
                0 <= relative_point[1] < self._size[1]):
            return False

        # Then find the part covering the horizontal position. The test is
        # done in word-local coordinates against the part's extents instead
        # of delegating to part.in_object: a point already shifted by
        # x_offset would be offset a second time by the part's own origin,
        # so parts after the first would almost never match.
        x_offset = 0
        for part in self._text_parts:
            part_width = part.width
            if x_offset <= relative_point[0] < x_offset + part_width:
                # Parts are top-aligned and may be shorter than the word
                return bool(0 <= relative_point[1] < part.height)
            x_offset += part_width

        return False
||||||
|
|
||||||
|
|
||||||
|
class Line(Box):
    """
    One line of text: a sequence of words placed with consistent spacing.
    """

    def __init__(self, spacing: Tuple[int, int], origin, size, font: Optional[Font] = None,
                 callback=None, sheet=None, mode=None, halign=Alignment.CENTER,
                 valign=Alignment.CENTER, previous=None):
        """
        Create an empty line.

        Args:
            spacing: A tuple of (min_spacing, max_spacing) between words
            origin: The top-left position of the line
            size: The width and height of the line
            font: The default font for words in this line; a default Font
                is created when none is given
            callback: Optional callback function
            sheet: Optional image sheet
            mode: Optional image mode
            halign: Horizontal alignment of text within the line
            valign: Vertical alignment of text within the line
            previous: Reference to the previous line
        """
        super().__init__(origin, size, callback, sheet, mode, halign, valign)
        self._renderable_words: List[RenderableWord] = []
        self._spacing = spacing  # (min_spacing, max_spacing)
        self._font = font if font else Font()
        self._current_width = 0  # Width consumed by words and gaps so far

        # Neighbouring lines
        self._previous = previous
        self._next = None

    @property
    def renderable_words(self) -> List[RenderableWord]:
        """The renderable words currently placed on this line"""
        return self._renderable_words

    def set_next(self, line: Line):
        """Record the line that follows this one"""
        self._next = line

    def add_word(self, text: str, font: Optional[Font] = None) -> Union[None, str]:
        """
        Try to place a word on this line.

        If the whole word does not fit, hyphenation is attempted and the
        first hyphenated part is placed when it fits.

        Args:
            text: The text content of the word
            font: The font to use for this word, or None to use the line's default font

        Returns:
            None if the word fits, the remaining text if only the first
            hyphenated part fits, or the entire text if nothing was placed
        """
        if not font:
            font = self._font

        abstract_word = Word(text, font)
        renderable_word = RenderableWord(abstract_word)

        min_spacing, max_spacing = self._spacing
        word_width = renderable_word.width

        # The first word on a line needs no gap in front of it
        gap = min_spacing if self._renderable_words else 0
        line_width = self._size[0]

        # Whole word fits
        if self._current_width + gap + word_width <= line_width:
            self._renderable_words.append(renderable_word)
            self._current_width += gap + word_width
            return None

        # Word cannot be split: hand it back untouched
        if not abstract_word.hyphenate():
            return text

        # Reflect the hyphenation in the renderable word and measure part one
        renderable_word.update_from_word()
        first_part_width = renderable_word.get_part_size(0)[0]
        if self._current_width + gap + first_part_width > line_width:
            return text

        # Place only the first part as a word of its own
        first_word = Word(abstract_word.get_hyphenated_part(0), font)
        self._renderable_words.append(RenderableWord(first_word))
        self._current_width += gap + first_part_width

        # Everything after the first part goes back to the caller as one string
        part_count = abstract_word.get_hyphenated_part_count()
        return ''.join([abstract_word.get_hyphenated_part(i) for i in range(1, part_count)])

    def render(self) -> Image.Image:
        """
        Render the line with all its words.

        Words are spaced by the minimum spacing, or by an even share of the
        free width for justified lines, and are centered vertically.

        Returns:
            A PIL Image containing the rendered line
        """
        canvas = super().render()

        words = self._renderable_words
        if not words:
            return canvas

        line_width = self._size[0]
        total_word_width = sum(word.width for word in words)
        num_spaces = len(words) - 1

        # Gap between consecutive words
        if num_spaces <= 0:
            spacing = 0
        elif self._halign == Alignment.JUSTIFY:
            spacing = (line_width - total_word_width) // num_spaces
        else:
            spacing = self._spacing[0]

        # Horizontal start position; anything but LEFT/RIGHT is centered
        content_width = total_word_width + spacing * num_spaces
        if self._halign == Alignment.LEFT:
            x_pos = 0
        elif self._halign == Alignment.RIGHT:
            x_pos = line_width - content_width
        else:
            x_pos = (line_width - content_width) // 2

        # Center vertically using the tallest word
        tallest = max(word.height for word in words)
        y_pos = (self._size[1] - tallest) // 2

        for word in words:
            word.set_origin(x_pos, y_pos)
            word_img = word.render()
            canvas.paste(word_img, (x_pos, y_pos), word_img)
            x_pos += word.width + spacing

        return canvas
|
||||||
10
pyWebLayout/core/__init__.py
Normal file
10
pyWebLayout/core/__init__.py
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
"""
|
||||||
|
Core functionality for the pyWebLayout library.
|
||||||
|
|
||||||
|
This package contains the core abstractions and base classes that form the foundation
|
||||||
|
of the pyWebLayout rendering system.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from pyWebLayout.core.base import (
|
||||||
|
Renderable, Interactable, Layoutable, Queriable
|
||||||
|
)
|
||||||
67
pyWebLayout/core/base.py
Normal file
67
pyWebLayout/core/base.py
Normal file
@ -0,0 +1,67 @@
|
|||||||
|
from abc import ABC
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from pyWebLayout.style import Alignment
|
||||||
|
|
||||||
|
|
||||||
|
class Renderable(ABC):
    """
    Base class for objects that can be drawn to an image.

    Subclasses are expected to override render(); the base implementation
    does nothing and returns None.
    """

    def render(self):
        """
        Render the object to an image.

        Returns:
            PIL.Image: The rendered image (None in this base implementation)
        """
        return None
|
||||||
|
|
||||||
|
class Interactable(ABC):
    """
    Base class for objects that react to interaction.

    An interactable object carries an optional callback that is invoked
    when the object is interacted with.
    """

    def __init__(self, callback=None):
        """
        Initialize an interactable object.

        Args:
            callback: The function to call when this object is interacted with
        """
        self._callback = callback

    def interact(self, point: np.generic):
        """
        Handle interaction at the given point.

        Args:
            point: The coordinates of the interaction

        Returns:
            The result of calling the callback with the point, or None
            when no callback is set
        """
        handler = self._callback
        return handler(point) if handler is not None else None
|
||||||
|
|
||||||
|
class Layoutable(ABC):
    """
    Base class for objects whose contents can be arranged.

    Subclasses are expected to override layout(); the base implementation
    does nothing and returns None.
    """

    def layout(self):
        """
        Layout the object's contents.

        Intended to be called before rendering so the contents are
        arranged properly.
        """
        return None
|
||||||
|
|
||||||
|
class Queriable(ABC):
    """
    Base class for objects that can be asked whether they contain a point.

    Subclasses are expected to override in_object(); the base
    implementation does nothing and returns None.
    """

    def in_object(self, point: np.generic):
        """
        Check if a point is in the object.

        Args:
            point: The coordinates to test
        """
        return None
|
||||||
100
pyWebLayout/examples/epub_viewer.py
Normal file
100
pyWebLayout/examples/epub_viewer.py
Normal file
@ -0,0 +1,100 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Example EPUB viewer using pyWebLayout.
|
||||||
|
|
||||||
|
This example demonstrates how to use pyWebLayout to load an EPUB file,
|
||||||
|
paginate it, and render pages as images.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import argparse
|
||||||
|
from pathlib import Path
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
# Add the parent directory to the path to import pyWebLayout
|
||||||
|
sys.path.append(str(Path(__file__).parent.parent.parent))
|
||||||
|
|
||||||
|
from pyWebLayout import (
|
||||||
|
Document, Book, read_epub,
|
||||||
|
DocumentPaginator, Page
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """
    Load an EPUB file, paginate it and save each rendered page as a PNG.

    Command line arguments:
        epub_file: Path to the EPUB file
        --output-dir / -o: Output directory for rendered pages (default "output")
        --width / -w: Page width (default 800)
        --height / -H: Page height (default 1000)
        --margin / -m: Margin used on all four sides (default 50)
        --max-pages / -p: Maximum number of pages to render (default 10)
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='EPUB viewer example')
    parser.add_argument('epub_file', help='Path to EPUB file')
    parser.add_argument('--output-dir', '-o', default='output', help='Output directory for rendered pages')
    parser.add_argument('--width', '-w', type=int, default=800, help='Page width')
    # '-h' is reserved by argparse for --help; registering it again raises
    # "conflicting option strings" before any argument is parsed, so the
    # short flag for the height is '-H'.
    parser.add_argument('--height', '-H', type=int, default=1000, help='Page height')
    parser.add_argument('--margin', '-m', type=int, default=50, help='Page margin')
    parser.add_argument('--max-pages', '-p', type=int, default=10, help='Maximum number of pages to render')
    args = parser.parse_args()

    # Create output directory
    os.makedirs(args.output_dir, exist_ok=True)

    # Read EPUB file
    print(f"Reading EPUB file: {args.epub_file}")
    book = read_epub(args.epub_file)

    # Display book metadata
    print(f"Title: {book.get_title()}")
    print(f"Author: {book.get_metadata('AUTHOR')}")
    print(f"Chapters: {len(book.chapters)}")

    # Create a paginator; the same margin is used for all four sides
    page_size = (args.width, args.height)
    margins = (args.margin, args.margin, args.margin, args.margin)
    paginator = DocumentPaginator(
        document=book,
        page_size=page_size,
        margins=margins
    )

    # Paginate and render pages
    print("Paginating and rendering pages...")

    # Option 1: Render all pages at once
    pages = paginator.paginate(max_pages=args.max_pages)
    for i, page in enumerate(pages):
        # Render the page
        image = page.render()

        # Save the image (file names are 1-based and zero padded)
        output_path = os.path.join(args.output_dir, f"page_{i+1:03d}.png")
        image.save(output_path)
        print(f"Saved page {i+1} to {output_path}")

    # Option 2: Render pages one by one with state saving
    # (kept as an inert string literal; it is never executed)
    """
    # Clear paginator state
    paginator.state = DocumentPaginationState()

    for i in range(args.max_pages):
        # Get next page
        page = paginator.paginate_next()
        if page is None:
            print(f"No more pages after page {i}")
            break

        # Render the page
        image = page.render()

        # Save the image
        output_path = os.path.join(args.output_dir, f"page_{i+1:03d}.png")
        image.save(output_path)
        print(f"Saved page {i+1} to {output_path}")

        # Save pagination state (could be saved to a file for later resumption)
        state_dict = paginator.get_state()

        # Progress information
        progress = paginator.get_progress() * 100
        print(f"Progress: {progress:.1f}%")
    """
|
||||||
|
|
||||||
|
|
||||||
|
# Run the example only when this file is executed as a script, not on import
if __name__ == "__main__":
    main()
|
||||||
918
pyWebLayout/html_parser.py
Normal file
918
pyWebLayout/html_parser.py
Normal file
@ -0,0 +1,918 @@
|
|||||||
|
import re
|
||||||
|
from html.parser import HTMLParser as BaseHTMLParser
|
||||||
|
from typing import Dict, List, Optional, Tuple, Union, Any, Set, Callable
|
||||||
|
import urllib.parse
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
from .style import Font, FontStyle, FontWeight, TextDecoration
|
||||||
|
from .abstract.document import Document, MetadataType, Book, Chapter
|
||||||
|
from .abstract.block import (
|
||||||
|
Block, BlockType, Parapgraph, Heading, HeadingLevel, Quote, CodeBlock,
|
||||||
|
HList, ListStyle, ListItem, Table, TableRow, TableCell, HorizontalRule, LineBreak
|
||||||
|
)
|
||||||
|
from .abstract.inline import Word, FormattedSpan
|
||||||
|
from .abstract.functional import Link, LinkType, Button, Form, FormField, FormFieldType
|
||||||
|
from .concrete.page import Page
|
||||||
|
from pyWebLayout.layout import Alignment
|
||||||
|
|
||||||
|
|
||||||
|
class HTMLParser(BaseHTMLParser):
|
||||||
|
"""
|
||||||
|
HTML parser that builds an abstract document representation from HTML content.
|
||||||
|
This parser converts HTML to abstract document classes without any rendering specifics.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, base_url: Optional[str] = None):
    """
    Set up an empty document and the parser's bookkeeping state.

    Args:
        base_url: Base URL for resolving relative links
    """
    super().__init__()

    # The document being built
    self.document = Document()

    # Link handling
    self._base_url = base_url
    self._current_link = None
    self._in_link = False

    # Block nesting
    self._block_stack: List[Block] = []
    self._current_block = None

    # Lists and tables
    self._list_stack: List[HList] = []
    self._table_stack: List[Table] = []
    self._current_table_row = None

    # Pending text and the paragraph/span it belongs to
    self._text_buffer = ""
    self._current_paragraph = None
    self._current_span = None

    # Style in effect, plus the saved styles of the enclosing tags
    self._style_stack: List[Dict[str, Any]] = []
    self._current_style = dict(
        font_size=12,
        font_weight=FontWeight.NORMAL,
        font_style=FontStyle.NORMAL,
        decoration=TextDecoration.NONE,
        color=(0, 0, 0),
        background=None,
        language='en_US',
    )

    # Flags and buffers for elements whose content is collected verbatim
    self._in_head = False
    self._in_title = False
    self._title_buffer = ""
    self._in_script = False
    self._script_buffer = ""
    self._in_style = False
    self._style_buffer = ""
|
||||||
|
|
||||||
|
def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]):
    """
    Handle the start of an HTML tag.

    Pushes the tag's style onto the style stack, then creates the abstract
    object matching the tag (block, list, table, link, image, form field)
    and updates the block/list/table state accordingly.

    Args:
        tag: The tag name
        attrs: List of attribute tuples (name, value); the value is None for
            attributes written without a value

    NOTE(review): a style is pushed for every start tag and only popped in
    handle_endtag. Void elements written without a closing slash (e.g.
    <br>, <img>) produce no end tag, so their pushed style is never
    popped - confirm how the style stack should be balanced for them.
    """
    tag = tag.lower()
    attrs_dict = dict(attrs)

    # While collecting <script>/<style> content, ignore any nested markup
    if self._in_script and tag != 'script':
        return
    if self._in_style and tag != 'style':
        return

    def attach(block):
        # Add to the open container if it accepts child blocks, else to the document
        if self._current_block and hasattr(self._current_block, 'add_block'):
            self._current_block.add_block(block)
        else:
            self.document.add_block(block)

    def enter(block):
        # Make the block the innermost open block
        self._block_stack.append(block)
        self._current_block = block

    def int_attr(name, default):
        # Integer attribute value, or the default when missing or malformed
        try:
            return int(attrs_dict[name])
        except (KeyError, ValueError, TypeError):
            return default

    # Parse style attribute if present (a value-less attribute yields None)
    style = {}
    if attrs_dict.get('style') is not None:
        style = self._parse_style(attrs_dict['style'])

    # Tag defaults only fill in what the inline style did not set
    for key, value in self._get_tag_style(tag).items():
        if key not in style:
            style[key] = value

    # Push the current style and apply the new style
    self._push_style(style)

    # Handle specific tags
    if tag == 'html':
        # Set document language if specified
        if 'lang' in attrs_dict:
            self.document.set_metadata(MetadataType.LANGUAGE, attrs_dict['lang'])

    elif tag == 'head':
        self._in_head = True

    elif tag == 'title' and self._in_head:
        self._in_title = True
        self._title_buffer = ""

    elif tag == 'meta' and self._in_head:
        self._handle_meta_tag(attrs_dict)

    elif tag == 'link' and self._in_head:
        self._handle_link_tag(attrs_dict)

    elif tag == 'script':
        self._in_script = True
        self._script_buffer = ""

    elif tag == 'style':
        self._in_style = True
        self._style_buffer = ""

    elif tag == 'body':
        # Body attributes can contain style information
        pass

    elif tag == 'p':
        self._flush_text()  # Flush any pending text
        self._current_paragraph = Parapgraph()
        attach(self._current_paragraph)
        enter(self._current_paragraph)

    elif tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
        self._flush_text()  # Flush any pending text

        # Determine heading level
        level_map = {
            'h1': HeadingLevel.H1,
            'h2': HeadingLevel.H2,
            'h3': HeadingLevel.H3,
            'h4': HeadingLevel.H4,
            'h5': HeadingLevel.H5,
            'h6': HeadingLevel.H6
        }
        heading = Heading(level=level_map[tag])
        attach(heading)
        enter(heading)
        self._current_paragraph = heading  # Heading inherits from Paragraph

    elif tag == 'div':
        self._flush_text()  # Flush any pending text

        # For divs, we create a new paragraph as a container
        div_para = Parapgraph()
        attach(div_para)
        enter(div_para)
        self._current_paragraph = div_para

    elif tag == 'blockquote':
        self._flush_text()  # Flush any pending text
        quote = Quote()
        attach(quote)
        enter(quote)

    elif tag == 'pre':
        self._flush_text()  # Flush any pending text

        # Pre can optionally contain a code block. Start with a paragraph;
        # a nested <code> tag replaces it with a code block.
        pre_para = Parapgraph()
        attach(pre_para)
        enter(pre_para)
        self._current_paragraph = pre_para

    elif tag == 'code':
        # NOTE(review): this matches any paragraph on top of the stack, so a
        # <code> inside <p>/<div> is also turned into a code block, not only
        # one inside <pre> - confirm whether that is intended.
        if self._block_stack and isinstance(self._block_stack[-1], Parapgraph):
            pre_para = self._block_stack.pop()

            # Get the language from class if specified (e.g., class="language-python");
            # the class attribute may hold several space-separated names
            language = ""
            for class_name in (attrs_dict.get('class') or '').split():
                if class_name.startswith('language-'):
                    language = class_name[len('language-'):]
                    break

            code_block = CodeBlock(language=language)

            # Replace the paragraph with the code block
            if pre_para.parent:
                parent = pre_para.parent
                if hasattr(parent, '_blocks'):
                    # Find the paragraph in the parent's blocks and replace it
                    for i, block in enumerate(parent._blocks):
                        if block == pre_para:
                            parent._blocks[i] = code_block
                            break

            # Push the code block to the stack
            enter(code_block)
            self._current_paragraph = None
        else:
            # If not in a paragraph, just create a formatted span for code
            self._current_span = None  # Force creation of a new span with code style

    elif tag in ('ul', 'ol', 'dl'):
        self._flush_text()  # Flush any pending text

        # Determine list style
        style_map = {
            'ul': ListStyle.UNORDERED,
            'ol': ListStyle.ORDERED,
            'dl': ListStyle.DEFINITION
        }
        list_block = HList(style=style_map[tag])
        attach(list_block)

        # Push to block stack and list stack
        self._block_stack.append(list_block)
        self._list_stack.append(list_block)
        self._current_block = list_block
        self._current_paragraph = None

    elif tag == 'li' and self._list_stack:
        self._flush_text()  # Flush any pending text

        list_item = ListItem()

        # Add to the current list
        self._list_stack[-1].add_item(list_item)

        enter(list_item)
        self._current_paragraph = None

    elif tag == 'dt' and self._list_stack and self._list_stack[-1].style == ListStyle.DEFINITION:
        self._flush_text()  # Flush any pending text

        # For definition term, we create a list item with a term
        list_item = ListItem(term="")  # Will be filled by content

        # Add to the current list
        self._list_stack[-1].add_item(list_item)

        enter(list_item)

        # Create a paragraph for the term content
        term_para = Parapgraph()
        list_item.add_block(term_para)
        self._current_paragraph = term_para

    elif tag == 'dd' and self._list_stack and self._list_stack[-1].style == ListStyle.DEFINITION:
        self._flush_text()  # Flush any pending text

        current_list = self._list_stack[-1]
        if current_list._items:
            # Attach the description to the most recent item of the list
            list_item = current_list._items[-1]
        else:
            # If no item exists yet, create a new list item
            list_item = ListItem()
            current_list.add_item(list_item)

        # Always push: handle_endtag pops one block for every </dd>, so
        # skipping the push here would unbalance the block stack
        enter(list_item)

        # Create a paragraph for the description content
        desc_para = Parapgraph()
        list_item.add_block(desc_para)
        self._current_paragraph = desc_para

    elif tag == 'table':
        self._flush_text()  # Flush any pending text

        # Create a new table, using the summary attribute as caption
        caption = None
        if 'summary' in attrs_dict:
            caption = attrs_dict['summary']
        table = Table(caption=caption)
        attach(table)

        # Push to block stack and table stack
        self._block_stack.append(table)
        self._table_stack.append(table)
        self._current_block = table
        self._current_paragraph = None

    elif tag in ('thead', 'tbody', 'tfoot') and self._table_stack:
        # Just track the current section - no need to create new objects
        self._current_table_section = tag

    elif tag == 'tr' and self._table_stack:
        self._flush_text()  # Flush any pending text

        # Create a new row
        row = TableRow()

        # Rows outside <thead>/<tfoot> belong to the body
        section_names = {'thead': "header", 'tfoot': "footer"}
        section = section_names.get(getattr(self, '_current_table_section', None), "body")
        self._table_stack[-1].add_row(row, section=section)

        # Update state
        self._current_table_row = row
        self._current_paragraph = None

    elif tag in ('td', 'th') and self._current_table_row:
        self._flush_text()  # Flush any pending text

        # Create a new cell; malformed spans fall back to 1
        cell = TableCell(
            is_header=(tag == 'th'),
            colspan=int_attr('colspan', 1),
            rowspan=int_attr('rowspan', 1),
        )

        # Add to the current row
        self._current_table_row.add_cell(cell)

        enter(cell)

        # Create a paragraph for the cell content
        cell_para = Parapgraph()
        cell.add_block(cell_para)
        self._current_paragraph = cell_para

    elif tag == 'a':
        self._flush_text()  # Flush any pending text

        # Parse attributes (value-less attributes are treated as empty)
        href = attrs_dict.get('href') or ''
        title = attrs_dict.get('title') or ''

        # Determine link type
        link_type = LinkType.INTERNAL
        if href.startswith(('http://', 'https://')):
            link_type = LinkType.EXTERNAL
        elif href.startswith('javascript:'):
            link_type = LinkType.FUNCTION
        elif href.startswith('api:'):
            link_type = LinkType.API
            href = href[4:]  # Remove api: prefix

        # Only relative internal links are resolved against the base URL.
        # Checking the link type (not the prefix) keeps API targets, whose
        # "api:" prefix was just stripped, from being joined to the base URL.
        if self._base_url and link_type == LinkType.INTERNAL and not href.startswith('#'):
            href = urllib.parse.urljoin(self._base_url, href)

        # Create a Link object
        self._current_link = Link(
            location=href,
            link_type=link_type,
            title=title if title else None
        )

        # Set the flag to indicate we're inside a link
        self._in_link = True

        # Force creation of a new span with link style
        self._current_span = None

    elif tag == 'img':
        # Handle image (value-less attributes are treated as empty)
        src = attrs_dict.get('src') or ''
        alt = attrs_dict.get('alt') or ''

        # Parse width and height if provided; malformed values become None
        width = int_attr('width', None)
        height = int_attr('height', None)

        # If we have a base URL and the src is relative, resolve it
        if self._base_url and not src.startswith(('http://', 'https://')):
            src = urllib.parse.urljoin(self._base_url, src)

        # Create an Image block (the abstract block class, imported locally)
        from .abstract.block import Image
        image = Image(source=src, alt_text=alt, width=width, height=height)
        attach(image)

        # Also add as a resource for backwards compatibility
        resource_name = f"img_{len(self.document._resources) + 1}"
        self.document.add_resource(resource_name, {
            'type': 'image',
            'src': src,
            'alt': alt,
            'width': width,
            'height': height,
            'image_object': image
        })

    elif tag == 'br':
        # Flush the text before the break first so the break is recorded
        # after that text, not ahead of it
        self._flush_text()

        # Add a line break
        if self._current_paragraph:
            line_break = LineBreak()
            if hasattr(self._current_paragraph, 'add_block'):
                self._current_paragraph.add_block(line_break)

    elif tag == 'hr':
        self._flush_text()  # Flush any pending text

        # Create a horizontal rule
        hr = HorizontalRule()
        attach(hr)

    elif tag in ('b', 'strong'):
        # Bold text
        self._current_style['font_weight'] = FontWeight.BOLD
        self._current_span = None  # Force creation of a new span

    elif tag in ('i', 'em'):
        # Italic text
        self._current_style['font_style'] = FontStyle.ITALIC
        self._current_span = None  # Force creation of a new span

    elif tag == 'u':
        # Underlined text
        self._current_style['decoration'] = TextDecoration.UNDERLINE
        self._current_span = None  # Force creation of a new span

    elif tag == 'span':
        # Span can have style attributes
        self._current_span = None  # Force creation of a new span

    elif tag == 'form':
        self._flush_text()  # Flush any pending text

        # Parse attributes
        form_id = attrs_dict.get('id', f"form_{len(self.document._resources) + 1}")
        action = attrs_dict.get('action', '')

        # Create a Form object
        form = Form(form_id=form_id, action=action)

        # Add as a resource
        self.document.add_resource(form_id, form)

        # TODO: Create a proper Form block class and add it to the document

    elif tag == 'input':
        # Parse attributes
        input_type = attrs_dict.get('type', 'text')
        input_name = attrs_dict.get('name', '')
        input_value = attrs_dict.get('value', '')
        input_required = 'required' in attrs_dict

        # Map HTML input types to FormFieldType
        type_map = {
            'text': FormFieldType.TEXT,
            'password': FormFieldType.PASSWORD,
            'checkbox': FormFieldType.CHECKBOX,
            'radio': FormFieldType.RADIO,
            'number': FormFieldType.NUMBER,
            'date': FormFieldType.DATE,
            'time': FormFieldType.TIME,
            'email': FormFieldType.EMAIL,
            'url': FormFieldType.URL,
            'color': FormFieldType.COLOR,
            'range': FormFieldType.RANGE,
            'hidden': FormFieldType.HIDDEN
        }

        # Unknown input types are treated as plain text fields
        field_type = type_map.get(input_type, FormFieldType.TEXT)

        # Create a FormField object (not attached anywhere yet, see TODO)
        field = FormField(
            name=input_name,
            field_type=field_type,
            label=attrs_dict.get('placeholder', input_name),
            value=input_value,
            required=input_required
        )

        # TODO: Add the field to a form if inside a form

    elif tag == 'textarea':
        # Similar to input but with multiline content
        # We'll handle the content in handle_data
        pass

    elif tag == 'select':
        # Similar to input but with options
        # We'll handle the options in handle_data
        pass

    elif tag == 'button':
        # Parse attributes
        button_type = attrs_dict.get('type', 'button')
        button_name = attrs_dict.get('name', '')

        # TODO: Create a Button object and add it to the document
|
||||||
|
|
||||||
|
def handle_endtag(self, tag: str):
    """
    Handle the end of an HTML tag.

    Closes whatever the matching start tag opened (collected script/style/
    title content, blocks, lists, tables, links, styled spans) and restores
    the style that was in effect before the tag.

    Args:
        tag: The tag name
    """
    tag = tag.lower()

    # Elements whose content was collected verbatim
    if tag == 'script' and self._in_script:
        self._in_script = False
        self.document.add_script(self._script_buffer)
        self._script_buffer = ""
        self._pop_style()
        return

    if tag == 'style' and self._in_style:
        self._in_style = False
        # Parse the style and add to document
        stylesheet = self._parse_css(self._style_buffer)
        if stylesheet:
            self.document.add_stylesheet(stylesheet)
        self._style_buffer = ""
        self._pop_style()
        return

    if tag == 'title' and self._in_title:
        self._in_title = False
        self.document.set_title(self._title_buffer.strip())
        self._title_buffer = ""
        self._pop_style()
        return

    # Still inside script/style content: the matching end tags were handled
    # above, so any other end tag is ignored
    if self._in_script or self._in_style:
        return

    # Flush any accumulated text
    self._flush_text()

    list_tags = ('ul', 'ol', 'dl')
    # End tags that close an entry on the block stack
    block_closing_tags = (
        'p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'blockquote', 'pre',
        'ul', 'ol', 'dl', 'li', 'dt', 'dd', 'table', 'td', 'th',
    )

    if tag == 'head':
        self._in_head = False

    elif tag in block_closing_tags:
        if self._block_stack:
            self._block_stack.pop()

        # Lists and tables are additionally tracked on their own stacks
        if tag in list_tags and self._list_stack:
            self._list_stack.pop()
        if tag == 'table' and self._table_stack:
            self._table_stack.pop()

        # The enclosing block (if any) becomes current again
        self._current_block = self._block_stack[-1] if self._block_stack else None

        # Reset current paragraph
        self._current_paragraph = None
        self._current_span = None

        if tag == 'table':
            # Reset table state
            self._current_table_row = None
            if hasattr(self, '_current_table_section'):
                delattr(self, '_current_table_section')

    elif tag in ('thead', 'tbody', 'tfoot'):
        # Clear current section
        if hasattr(self, '_current_table_section'):
            delattr(self, '_current_table_section')

    elif tag == 'tr':
        # Reset current row
        self._current_table_row = None

    elif tag == 'a':
        # End of link
        self._in_link = False
        self._current_link = None

    elif tag in ('b', 'strong', 'i', 'em', 'u', 'span'):
        # End of styled text
        self._current_span = None

    # 'body', 'code' and all other tags need no specific handling.
    # Pop style regardless of tag.
    self._pop_style()
|
||||||
|
|
||||||
|
def handle_data(self, data: str):
|
||||||
|
"""
|
||||||
|
Handle text data.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
data: The text data
|
||||||
|
"""
|
||||||
|
if self._in_script:
|
||||||
|
self._script_buffer += data
|
||||||
|
return
|
||||||
|
|
||||||
|
if self._in_style:
|
||||||
|
self._style_buffer += data
|
||||||
|
return
|
||||||
|
|
||||||
|
if self._in_title:
|
||||||
|
self._title_buffer += data
|
||||||
|
return
|
||||||
|
|
||||||
|
# Add to text buffer
|
||||||
|
self._text_buffer += data
|
||||||
|
|
||||||
|
def handle_entityref(self, name: str):
    """
    Resolve a named entity reference and append it to the active buffer.

    Resolution order: a small built-in table of common entities, then
    the standard ``html.entities.name2codepoint`` table. A name found in
    neither is emitted verbatim as ``&name;``.

    Args:
        name: The entity name
    """
    # Entities resolved locally, before consulting the standard table
    well_known = {
        'lt': '<',
        'gt': '>',
        'amp': '&',
        'quot': '"',
        'apos': "'",
        'nbsp': ' ',
        'copy': '©',
        'reg': '®',
        'trade': '™',
    }

    try:
        char = well_known[name]
    except KeyError:
        try:
            import html.entities
            char = chr(html.entities.name2codepoint[name])
        except (KeyError, ImportError):
            # Unknown entity: keep the original reference text
            char = f'&{name};'

    # Same context priority as handle_data: script, style, title, body
    if self._in_script:
        self._script_buffer += char
        return
    if self._in_style:
        self._style_buffer += char
        return
    if self._in_title:
        self._title_buffer += char
        return
    self._text_buffer += char
|
||||||
|
|
||||||
|
def handle_charref(self, name: str):
    """
    Resolve a numeric character reference and append it to the active buffer.

    Args:
        name: The character reference body (decimal digits, or ``x``/``X``
            followed by hexadecimal digits)

    A reference that is malformed or outside the Unicode range is not
    allowed to abort parsing; it is emitted verbatim as ``&#name;``,
    mirroring how unknown named entities are kept as ``&name;``.
    """
    try:
        if name[:1] in ('x', 'X'):
            # Hexadecimal reference; the prefix may be either case
            char = chr(int(name[1:], 16))
        else:
            # Decimal reference
            char = chr(int(name))
    except (ValueError, OverflowError):
        # int() rejects non-numeric text; chr() rejects code points that
        # are out of range (ValueError) or too large (OverflowError)
        char = f'&#{name};'

    # Handle based on context
    if self._in_script:
        self._script_buffer += char
    elif self._in_style:
        self._style_buffer += char
    elif self._in_title:
        self._title_buffer += char
    else:
        self._text_buffer += char
|
||||||
|
|
||||||
|
def _push_style(self, style: Dict[str, Any]):
    """
    Save the current style on the stack, then overlay ``style`` on it.

    A copy of the current style is pushed so that ``_pop_style`` can
    restore it; the entries of ``style`` then override or extend the
    current style in place.

    Args:
        style: The style to push
    """
    snapshot = self._current_style.copy()
    self._style_stack.append(snapshot)

    # Overlay the new properties on top of the inherited ones
    self._current_style.update(style)
|
||||||
|
|
||||||
|
def _pop_style(self):
    """Restore the most recently saved style; do nothing if the stack is empty."""
    if not self._style_stack:
        return
    self._current_style = self._style_stack.pop()
|
||||||
|
|
||||||
|
def _get_tag_style(self, tag: str) -> Dict[str, Any]:
    """
    Look up the built-in default style for an HTML tag.

    Args:
        tag: The tag name

    Returns:
        A dictionary of style properties; empty when the tag has no
        built-in default. The table is rebuilt on every call, so the
        returned dictionary is not shared between calls.
    """
    # Headings differ only in size; all of them are bold
    heading_sizes = {'h1': 24, 'h2': 20, 'h3': 18, 'h4': 16, 'h5': 14, 'h6': 12}
    defaults = {
        heading: {'font_size': size, 'font_weight': FontWeight.BOLD}
        for heading, size in heading_sizes.items()
    }

    # Inline emphasis, links and code-like elements
    defaults.update({
        'b': {'font_weight': FontWeight.BOLD},
        'strong': {'font_weight': FontWeight.BOLD},
        'i': {'font_style': FontStyle.ITALIC},
        'em': {'font_style': FontStyle.ITALIC},
        'u': {'decoration': TextDecoration.UNDERLINE},
        'a': {'decoration': TextDecoration.UNDERLINE, 'color': (0, 0, 255)},
        'code': {'font_family': 'monospace', 'background': (240, 240, 240, 255)},
        'pre': {'font_family': 'monospace'},
    })

    return defaults.get(tag, {})
|
||||||
|
|
||||||
|
def _create_font(self) -> Font:
|
||||||
|
"""
|
||||||
|
Create a Font object from the current style.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Font: A font object with the current style settings
|
||||||
|
"""
|
||||||
69
pyWebLayout/io/__init__.py
Normal file
69
pyWebLayout/io/__init__.py
Normal file
@ -0,0 +1,69 @@
|
|||||||
|
"""
|
||||||
|
Input/Output module for pyWebLayout.
|
||||||
|
|
||||||
|
This package provides functionality for reading and writing various file formats,
|
||||||
|
including HTML, EPUB, and other document formats.
|
||||||
|
|
||||||
|
The module uses a decomposed architecture with specialized readers for different
|
||||||
|
aspects of document parsing (metadata, content, resources), following the same
|
||||||
|
pattern as the abstract module.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Legacy readers (for backward compatibility)
|
||||||
|
# Legacy functions provided by new HTML reader for backward compatibility
|
||||||
|
from pyWebLayout.io.readers.html import parse_html_string as parse_html
|
||||||
|
from pyWebLayout.io.readers.html import read_html_file as html_to_document
|
||||||
|
from pyWebLayout.io.readers.epub_reader import read_epub
|
||||||
|
|
||||||
|
# New decomposed readers
|
||||||
|
from pyWebLayout.io.readers.html import HTMLReader, read_html, read_html_file, parse_html_string
|
||||||
|
from pyWebLayout.io.readers.base import BaseReader, MetadataReader, ContentReader, ResourceReader, CompositeReader
|
||||||
|
|
||||||
|
# Specialized HTML readers
|
||||||
|
from pyWebLayout.io.readers.html_metadata import HTMLMetadataReader
|
||||||
|
from pyWebLayout.io.readers.html_content import HTMLContentReader
|
||||||
|
from pyWebLayout.io.readers.html_resources import HTMLResourceReader
|
||||||
|
|
||||||
|
# Specialized EPUB readers
|
||||||
|
from pyWebLayout.io.readers.epub_metadata import EPUBMetadataReader
|
||||||
|
|
||||||
|
# Convenience functions using the new architecture
|
||||||
|
def read_document(source, format_hint=None, **options):
    """
    Read a document using the appropriate reader based on format detection.

    Args:
        source: The source to read (file path, URL, or content)
        format_hint: Optional hint about the format ('html', 'epub', etc.)
        **options: Additional options for reading (forwarded to the HTML
            reader only; the legacy EPUB reader takes none)

    Returns:
        Document: The parsed document

    Raises:
        ValueError: If no reader accepts the source. When the HTML
            fallback itself failed, that error is attached as the cause.
    """
    if format_hint == 'html' or (not format_hint and _is_html_source(source)):
        return HTMLReader().read(source, **options)

    if format_hint == 'epub' or (not format_hint and _is_epub_source(source)):
        # Use legacy EPUB reader for now
        return read_epub(source)

    # Unrecognised hint or undetected format: try the HTML reader as a
    # last resort. Only ordinary errors are absorbed here, so that
    # KeyboardInterrupt / SystemExit still propagate.
    fallback_error = None
    try:
        reader = HTMLReader()
        if reader.can_read(source):
            return reader.read(source, **options)
    except Exception as exc:
        fallback_error = exc

    raise ValueError(f"Cannot determine format for source: {source}") from fallback_error
|
||||||
|
|
||||||
|
def _is_html_source(source):
    """Return whether a fresh ``HTMLReader`` reports it can read ``source``."""
    return HTMLReader().can_read(source)
|
||||||
|
|
||||||
|
def _is_epub_source(source):
    """Return True when ``source`` is a string ending in ``.epub`` (case-insensitive)."""
    # Only the file name is checked; non-string sources are never EPUB here
    return isinstance(source, str) and source.lower().endswith('.epub')
|
||||||
36
pyWebLayout/io/readers/__init__.py
Normal file
36
pyWebLayout/io/readers/__init__.py
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
"""
|
||||||
|
Readers module for pyWebLayout.
|
||||||
|
|
||||||
|
This module provides specialized readers for different document formats
|
||||||
|
using a decomposed architecture pattern.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Base classes for the decomposed architecture
|
||||||
|
from .base import BaseReader, MetadataReader, ContentReader, ResourceReader, CompositeReader
|
||||||
|
|
||||||
|
# HTML readers (decomposed)
|
||||||
|
from .html import HTMLReader, read_html, read_html_file, parse_html_string
|
||||||
|
from .html_metadata import HTMLMetadataReader
|
||||||
|
from .html_content import HTMLContentReader
|
||||||
|
from .html_resources import HTMLResourceReader
|
||||||
|
|
||||||
|
# HTML processing components (supporting modules)
|
||||||
|
from .html_style import HTMLStyleManager
|
||||||
|
from .html_text import HTMLTextProcessor
|
||||||
|
from .html_elements import BlockElementHandler, ListElementHandler, TableElementHandler, InlineElementHandler
|
||||||
|
|
||||||
|
# EPUB readers
|
||||||
|
from .epub_reader import read_epub # Legacy
|
||||||
|
from .epub_metadata import EPUBMetadataReader # New decomposed
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
# Base classes
|
||||||
|
'BaseReader', 'MetadataReader', 'ContentReader', 'ResourceReader', 'CompositeReader',
|
||||||
|
|
||||||
|
# HTML readers
|
||||||
|
'HTMLReader', 'read_html', 'read_html_file', 'parse_html_string',
|
||||||
|
'HTMLMetadataReader', 'HTMLContentReader', 'HTMLResourceReader',
|
||||||
|
|
||||||
|
# EPUB readers
|
||||||
|
'read_epub', 'EPUBMetadataReader',
|
||||||
|
]
|
||||||
229
pyWebLayout/io/readers/base.py
Normal file
229
pyWebLayout/io/readers/base.py
Normal file
@ -0,0 +1,229 @@
|
|||||||
|
"""
|
||||||
|
Base classes for document readers in pyWebLayout.
|
||||||
|
|
||||||
|
This module provides the foundational classes that all readers inherit from,
|
||||||
|
similar to how the abstract module provides base classes for document elements.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from typing import Any, Dict, List, Optional, Union
|
||||||
|
from pyWebLayout.abstract.document import Document
|
||||||
|
|
||||||
|
|
||||||
|
class BaseReader(ABC):
    """
    Common interface for every document reader.

    Subclasses must implement ``can_read`` and ``read``. The base class
    only supplies a small key/value option store.
    """

    def __init__(self):
        """Start with no document and an empty option store."""
        self._options = {}
        self._document = None

    @abstractmethod
    def can_read(self, source: Union[str, bytes]) -> bool:
        """
        Report whether this reader is able to handle ``source``.

        Args:
            source: The source to check (file path, URL, or content)

        Returns:
            True if this reader can handle the source, False otherwise
        """

    @abstractmethod
    def read(self, source: Union[str, bytes], **options) -> Document:
        """
        Parse ``source`` and return the resulting Document.

        Args:
            source: The source to read (file path, URL, or content)
            **options: Additional options for reading

        Returns:
            The parsed Document
        """

    def set_option(self, key: str, value: Any):
        """
        Store a reader option, replacing any previous value for ``key``.

        Args:
            key: The option name
            value: The option value
        """
        self._options.update({key: value})

    def get_option(self, key: str, default: Any = None) -> Any:
        """
        Fetch a reader option.

        Args:
            key: The option name
            default: Default value if option is not set

        Returns:
            The option value or default
        """
        if key in self._options:
            return self._options[key]
        return default
|
||||||
|
|
||||||
|
|
||||||
|
class MetadataReader(ABC):
    """
    Interface for components that extract document metadata
    (title, author, and similar).
    """

    @abstractmethod
    def extract_metadata(self, source: Any, document: Document) -> Dict[str, Any]:
        """
        Pull metadata out of ``source``.

        Args:
            source: The source data
            document: The document to populate with metadata

        Returns:
            Dictionary of extracted metadata
        """
|
||||||
|
|
||||||
|
|
||||||
|
class StructureReader(ABC):
    """
    Interface for components that extract document structure
    (headings, sections, and similar).
    """

    @abstractmethod
    def extract_structure(self, source: Any, document: Document) -> List[Any]:
        """
        Pull structural information out of ``source``.

        Args:
            source: The source data
            document: The document to populate with structure

        Returns:
            List of structural elements
        """
|
||||||
|
|
||||||
|
|
||||||
|
class ContentReader(ABC):
    """
    Interface for components that extract document content
    (text, formatting, and similar).
    """

    @abstractmethod
    def extract_content(self, source: Any, document: Document) -> Any:
        """
        Pull content out of ``source``.

        Args:
            source: The source data
            document: The document to populate with content

        Returns:
            The extracted content
        """
|
||||||
|
|
||||||
|
|
||||||
|
class ResourceReader(ABC):
    """
    Interface for components that extract document resources
    (images, stylesheets, and similar).
    """

    @abstractmethod
    def extract_resources(self, source: Any, document: Document) -> Dict[str, Any]:
        """
        Pull resources out of ``source``.

        Args:
            source: The source data
            document: The document to populate with resources

        Returns:
            Dictionary of extracted resources
        """
|
||||||
|
|
||||||
|
|
||||||
|
class CompositeReader(BaseReader):
    """
    A reader assembled from optional specialised sub-readers.

    Metadata, structure, content and resource extraction are each
    delegated to a separately configured reader; any that is not set is
    skipped. ``can_read`` is not implemented here, so concrete
    subclasses still have to provide it.
    """

    def __init__(self):
        """Create a composite reader with no sub-readers configured."""
        super().__init__()
        self._metadata_reader: Optional[MetadataReader] = None
        self._structure_reader: Optional[StructureReader] = None
        self._content_reader: Optional[ContentReader] = None
        self._resource_reader: Optional[ResourceReader] = None

    def set_metadata_reader(self, reader: MetadataReader):
        """Install the reader used for metadata extraction."""
        self._metadata_reader = reader

    def set_structure_reader(self, reader: StructureReader):
        """Install the reader used for structure extraction."""
        self._structure_reader = reader

    def set_content_reader(self, reader: ContentReader):
        """Install the reader used for content extraction."""
        self._content_reader = reader

    def set_resource_reader(self, reader: ResourceReader):
        """Install the reader used for resource extraction."""
        self._resource_reader = reader

    def read(self, source: Union[str, bytes], **options) -> Document:
        """
        Build a new Document by running every configured sub-reader.

        Args:
            source: The source to read
            **options: Additional options for reading; merged into the
                reader's stored options

        Returns:
            The parsed Document
        """
        document = Document()
        self._options.update(options)

        # Fixed pipeline order: metadata, structure, content, resources.
        # Each stage runs only when its reader has been configured.
        pipeline = (
            ('_metadata_reader', 'extract_metadata'),
            ('_structure_reader', 'extract_structure'),
            ('_content_reader', 'extract_content'),
            ('_resource_reader', 'extract_resources'),
        )
        for reader_attr, method_name in pipeline:
            stage_reader = getattr(self, reader_attr)
            if stage_reader:
                getattr(stage_reader, method_name)(source, document)

        return document
|
||||||
352
pyWebLayout/io/readers/epub_metadata.py
Normal file
352
pyWebLayout/io/readers/epub_metadata.py
Normal file
@ -0,0 +1,352 @@
|
|||||||
|
"""
|
||||||
|
EPUB metadata reader for pyWebLayout.
|
||||||
|
|
||||||
|
This module provides specialized functionality for extracting metadata
|
||||||
|
from EPUB documents, following the decomposed architecture pattern.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import zipfile
|
||||||
|
import tempfile
|
||||||
|
from typing import Dict, Any, Optional, List
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
from pyWebLayout.abstract.document import Document, MetadataType
|
||||||
|
from pyWebLayout.io.readers.base import MetadataReader
|
||||||
|
|
||||||
|
|
||||||
|
# XML namespaces used in EPUB files
|
||||||
|
NAMESPACES = {
|
||||||
|
'opf': 'http://www.idpf.org/2007/opf',
|
||||||
|
'dc': 'http://purl.org/dc/elements/1.1/',
|
||||||
|
'dcterms': 'http://purl.org/dc/terms/',
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class EPUBMetadataReader(MetadataReader):
    """
    Specialized reader for extracting metadata from EPUB documents.

    This class handles EPUB package document metadata including
    Dublin Core elements and custom metadata. The archive is unpacked
    into a temporary directory for the duration of one
    ``extract_metadata`` call and removed again afterwards; the parsed
    metadata stays available through the ``get_*`` accessors.
    """

    def __init__(self):
        """Initialize the EPUB metadata reader."""
        self._metadata = {}
        self._temp_dir = None
        self._package_path = None

    def extract_metadata(self, epub_path: str, document: Document) -> Dict[str, Any]:
        """
        Extract metadata from EPUB file.

        Args:
            epub_path: Path to the EPUB file
            document: The document to populate with metadata

        Returns:
            Dictionary of extracted metadata (empty when no package
            document could be found or parsed)
        """
        # Reset internal state
        self._reset()

        try:
            # Extract EPUB to temporary directory
            self._extract_epub(epub_path)

            # Find and parse package document
            self._find_package_document()
            if self._package_path:
                self._parse_package_metadata()

            # Populate document with extracted metadata
            self._populate_document(document)

            return self._metadata
        finally:
            # Clean up temporary files, also when extraction failed
            self._cleanup()

    def _reset(self):
        """Reset internal state for a new extraction."""
        self._metadata = {}
        self._temp_dir = None
        self._package_path = None

    def _extract_epub(self, epub_path: str):
        """
        Extract EPUB file to temporary directory.

        Args:
            epub_path: Path to the EPUB file
        """
        self._temp_dir = tempfile.mkdtemp()

        with zipfile.ZipFile(epub_path, 'r') as zip_ref:
            zip_ref.extractall(self._temp_dir)

    def _find_package_document(self):
        """
        Find the package document (content.opf) in the extracted EPUB.

        ``self._package_path`` is set only to a file that exists; it is
        left as None when nothing is found.
        """
        # First, try to find it via META-INF/container.xml
        container_path = os.path.join(self._temp_dir, 'META-INF', 'container.xml')

        if os.path.exists(container_path):
            try:
                container_root = ET.parse(container_path).getroot()
            except ET.ParseError:
                # Unreadable container: fall through to the .opf search
                container_root = None

            if container_root is not None:
                rootfiles = container_root.findall(
                    './/{urn:oasis:names:tc:opendocument:xmlns:container}rootfile')
                for rootfile in rootfiles:
                    full_path = rootfile.get('full-path')
                    if not full_path:
                        continue
                    candidate = os.path.join(self._temp_dir, full_path)
                    # Only accept the declared path if the file is really there
                    if os.path.exists(candidate):
                        self._package_path = candidate
                        return

        # Fallback: search for .opf files
        for dirpath, _dirnames, filenames in os.walk(self._temp_dir):
            for filename in filenames:
                if filename.endswith('.opf'):
                    self._package_path = os.path.join(dirpath, filename)
                    return

    def _parse_package_metadata(self):
        """Parse metadata from the package document."""
        if not self._package_path or not os.path.exists(self._package_path):
            return

        try:
            tree = ET.parse(self._package_path)
            root = tree.getroot()

            # Find metadata element
            metadata_elem = root.find('.//{{{0}}}metadata'.format(NAMESPACES['opf']))
            if metadata_elem is None:
                return

            # Parse Dublin Core metadata
            self._parse_dublin_core(metadata_elem)

            # Parse OPF-specific metadata
            self._parse_opf_metadata(metadata_elem)

        except ET.ParseError as e:
            print(f"Error parsing package document: {e}")

    def _parse_dublin_core(self, metadata_elem: ET.Element):
        """
        Parse Dublin Core metadata elements.

        A name that occurs once is stored as a stripped string, together
        with selected attributes. A name that occurs several times is
        stored as a list of stripped strings, without attributes.

        Args:
            metadata_elem: The metadata XML element
        """
        # Dublin Core element names; each is stored under the same key
        dc_names = (
            'title', 'creator', 'subject', 'description', 'publisher',
            'contributor', 'date', 'type', 'format', 'identifier',
            'source', 'language', 'relation', 'coverage', 'rights',
        )

        for dc_name in dc_names:
            elements = metadata_elem.findall(
                './/{{{0}}}{1}'.format(NAMESPACES['dc'], dc_name))
            if not elements:
                continue

            if len(elements) == 1:
                # Single element
                elem = elements[0]
                if elem.text:
                    self._metadata[dc_name] = elem.text.strip()

                # Attributes are recorded even when the element has no text
                self._record_dc_attributes(dc_name, elem)
            else:
                # Multiple elements - store as list
                values = [elem.text.strip() for elem in elements if elem.text]
                if values:
                    self._metadata[dc_name] = values

    def _record_dc_attributes(self, dc_name: str, elem: ET.Element):
        """Store the OPF attributes of a single creator/identifier/date element."""
        opf = NAMESPACES['opf']

        if dc_name == 'creator':
            # Check for role attribute
            role = elem.get('{{{0}}}role'.format(opf))
            if role:
                self._metadata[f'{dc_name}_role'] = role

            # Check for file-as attribute for sorting
            file_as = elem.get('{{{0}}}file-as'.format(opf))
            if file_as:
                self._metadata[f'{dc_name}_file_as'] = file_as

        elif dc_name == 'identifier':
            # Check for scheme (ISBN, DOI, etc.)
            scheme = elem.get('{{{0}}}scheme'.format(opf))
            if scheme:
                self._metadata[f'{dc_name}_scheme'] = scheme

            # Record the element's id attribute
            id_attr = elem.get('id')
            if id_attr:
                self._metadata[f'{dc_name}_id'] = id_attr

        elif dc_name == 'date':
            # Check for event type
            event = elem.get('{{{0}}}event'.format(opf))
            if event:
                self._metadata[f'{dc_name}_event'] = event

    def _parse_opf_metadata(self, metadata_elem: ET.Element):
        """
        Parse OPF-specific metadata elements.

        Args:
            metadata_elem: The metadata XML element
        """
        # Parse meta elements
        meta_elements = metadata_elem.findall('.//{{{0}}}meta'.format(NAMESPACES['opf']))

        for meta in meta_elements:
            name = meta.get('name')
            content = meta.get('content')

            if name and content:
                self._metadata[f'meta_{name}'] = content

        # Parse x-metadata elements (custom metadata)
        x_meta_elements = metadata_elem.findall('.//{{{0}}}x-metadata'.format(NAMESPACES['opf']))

        for x_meta in x_meta_elements:
            for child in x_meta:
                if child.tag and child.text:
                    # Remove namespace prefix for cleaner key names
                    tag_name = child.tag.split('}')[-1] if '}' in child.tag else child.tag
                    self._metadata[f'x_meta_{tag_name}'] = child.text.strip()

    def _populate_document(self, document: Document):
        """
        Populate the document with extracted metadata.

        Args:
            document: The document to populate
        """
        # Map EPUB metadata to document metadata types
        metadata_mapping = {
            'title': MetadataType.TITLE,
            'creator': MetadataType.AUTHOR,
            'description': MetadataType.DESCRIPTION,
            'subject': MetadataType.KEYWORDS,
            'language': MetadataType.LANGUAGE,
            'date': MetadataType.PUBLICATION_DATE,
            'publisher': MetadataType.PUBLISHER,
            'identifier': MetadataType.IDENTIFIER,
        }

        for epub_key, doc_type in metadata_mapping.items():
            if epub_key not in self._metadata:
                continue
            value = self._metadata[epub_key]

            # Handle list values (like multiple subjects)
            if isinstance(value, list):
                if epub_key == 'subject':
                    # Join subjects with commas for keywords
                    document.set_metadata(doc_type, ', '.join(value))
                else:
                    # For other list values, use the first one
                    document.set_metadata(doc_type, value[0])
            else:
                document.set_metadata(doc_type, value)

        # Handle cover image
        cover_meta = self._metadata.get('meta_cover')
        if cover_meta:
            document.set_metadata(MetadataType.COVER_IMAGE, cover_meta)

        # Store original EPUB metadata for reference
        document.set_metadata(MetadataType.CUSTOM, {
            'epub_metadata': self._metadata
        })

    def _cleanup(self):
        """Clean up temporary files."""
        if self._temp_dir:
            import shutil
            # ignore_errors=True makes removal best-effort, so no extra
            # exception handling is needed around it
            shutil.rmtree(self._temp_dir, ignore_errors=True)
            self._temp_dir = None

    def get_unique_identifier(self) -> Optional[str]:
        """
        Get the identifier from the EPUB metadata.

        Returns:
            The value stored under ``identifier``, or None if not found.
            NOTE(review): when the package declares several identifiers
            this value is a list, and the package's ``unique-identifier``
            attribute is not consulted - confirm whether callers need
            the designated one.
        """
        return self._metadata.get('identifier')

    def get_cover_id(self) -> Optional[str]:
        """
        Get the cover image ID from metadata.

        Returns:
            The cover image ID, or None if not found
        """
        return self._metadata.get('meta_cover')

    def get_creators(self) -> List[Dict[str, str]]:
        """
        Get creator information with roles.

        Returns:
            List of creator dictionaries with name, role, and file-as info.
            Role and file-as are only available for a single creator.
        """
        creators = []
        creator_value = self._metadata.get('creator')

        if creator_value:
            if isinstance(creator_value, list):
                # Multiple creators - this is simplified, real implementation
                # would need to correlate with role and file-as attributes
                creators.extend({'name': creator} for creator in creator_value)
            else:
                # Single creator
                creator_info = {'name': creator_value}

                # Add role if available
                role = self._metadata.get('creator_role')
                if role:
                    creator_info['role'] = role

                # Add file-as if available
                file_as = self._metadata.get('creator_file_as')
                if file_as:
                    creator_info['file_as'] = file_as

                creators.append(creator_info)

        return creators
|
||||||
400
pyWebLayout/io/readers/epub_reader.py
Normal file
400
pyWebLayout/io/readers/epub_reader.py
Normal file
@ -0,0 +1,400 @@
|
|||||||
|
"""
|
||||||
|
EPUB reader for pyWebLayout.
|
||||||
|
|
||||||
|
This module provides functionality for reading EPUB documents and converting them
|
||||||
|
to pyWebLayout's abstract document model.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import zipfile
|
||||||
|
import tempfile
|
||||||
|
from typing import Dict, List, Optional, Any, Tuple
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
import re
|
||||||
|
import urllib.parse
|
||||||
|
|
||||||
|
from pyWebLayout.abstract.document import Document, Book, Chapter, MetadataType
|
||||||
|
from pyWebLayout.io.readers.html import parse_html_string as parse_html, read_html_file as html_to_document
|
||||||
|
|
||||||
|
|
||||||
|
# XML namespaces used in EPUB files
|
||||||
|
NAMESPACES = {
|
||||||
|
'opf': 'http://www.idpf.org/2007/opf',
|
||||||
|
'dc': 'http://purl.org/dc/elements/1.1/',
|
||||||
|
'dcterms': 'http://purl.org/dc/terms/',
|
||||||
|
'xhtml': 'http://www.w3.org/1999/xhtml',
|
||||||
|
'ncx': 'http://www.daisy.org/z3986/2005/ncx/',
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class EPUBReader:
|
||||||
|
"""
|
||||||
|
Reader for EPUB documents.
|
||||||
|
|
||||||
|
This class extracts content from EPUB files and converts it to
|
||||||
|
pyWebLayout's abstract document model.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, epub_path: str):
    """
    Set up a reader for one EPUB file. Nothing is read until ``read()``.

    Args:
        epub_path: Path to the EPUB file
    """
    # Input archive and the Book being built from it
    self.epub_path = epub_path
    self.book = Book()

    # Extraction locations; both unset at construction
    self.temp_dir = None
    self.content_dir = None

    # Parsed package data, all empty at construction
    self.metadata = {}
    self.manifest = {}
    self.toc, self.spine = [], []
|
||||||
|
|
||||||
|
def read(self) -> Book:
    """
    Unpack the EPUB, parse it, and build the Book.

    The temporary extraction directory is removed again whether or not
    parsing succeeds.

    Returns:
        Book: The parsed book
    """
    # Processing stages, run in this order after the archive directory
    # has been created: extract, package document, table of contents,
    # book creation, chapters.
    stages = (
        '_extract_epub',
        '_parse_package_document',
        '_parse_toc',
        '_create_book',
        '_add_chapters',
    )
    try:
        self.temp_dir = tempfile.mkdtemp()
        for stage in stages:
            getattr(self, stage)()
        return self.book
    finally:
        # Clean up temporary files
        if self.temp_dir:
            import shutil
            shutil.rmtree(self.temp_dir, ignore_errors=True)
|
||||||
|
|
||||||
|
def _extract_epub(self):
|
||||||
|
"""Extract the EPUB file to a temporary directory."""
|
||||||
|
with zipfile.ZipFile(self.epub_path, 'r') as zip_ref:
|
||||||
|
zip_ref.extractall(self.temp_dir)
|
||||||
|
|
||||||
|
# Find the content directory (typically OEBPS or OPS)
|
||||||
|
container_path = os.path.join(self.temp_dir, 'META-INF', 'container.xml')
|
||||||
|
if os.path.exists(container_path):
|
||||||
|
tree = ET.parse(container_path)
|
||||||
|
root = tree.getroot()
|
||||||
|
|
||||||
|
# Get the path to the package document (content.opf)
|
||||||
|
for rootfile in root.findall('.//{urn:oasis:names:tc:opendocument:xmlns:container}rootfile'):
|
||||||
|
full_path = rootfile.get('full-path')
|
||||||
|
if full_path:
|
||||||
|
self.content_dir = os.path.dirname(os.path.join(self.temp_dir, full_path))
|
||||||
|
return
|
||||||
|
|
||||||
|
# Fallback: look for common content directories
|
||||||
|
for content_dir in ['OEBPS', 'OPS', 'Content']:
|
||||||
|
if os.path.exists(os.path.join(self.temp_dir, content_dir)):
|
||||||
|
self.content_dir = os.path.join(self.temp_dir, content_dir)
|
||||||
|
return
|
||||||
|
|
||||||
|
# If no content directory found, use the root
|
||||||
|
self.content_dir = self.temp_dir
|
||||||
|
|
||||||
|
def _parse_package_document(self):
|
||||||
|
"""Parse the package document (content.opf)."""
|
||||||
|
# Find the package document
|
||||||
|
opf_path = None
|
||||||
|
for root, dirs, files in os.walk(self.content_dir):
|
||||||
|
for file in files:
|
||||||
|
if file.endswith('.opf'):
|
||||||
|
opf_path = os.path.join(root, file)
|
||||||
|
break
|
||||||
|
if opf_path:
|
||||||
|
break
|
||||||
|
|
||||||
|
if not opf_path:
|
||||||
|
raise ValueError("No package document (.opf) found in EPUB")
|
||||||
|
|
||||||
|
# Parse the package document
|
||||||
|
tree = ET.parse(opf_path)
|
||||||
|
root = tree.getroot()
|
||||||
|
|
||||||
|
# Parse metadata
|
||||||
|
self._parse_metadata(root)
|
||||||
|
|
||||||
|
# Parse manifest
|
||||||
|
self._parse_manifest(root)
|
||||||
|
|
||||||
|
# Parse spine
|
||||||
|
self._parse_spine(root)
|
||||||
|
|
||||||
|
def _parse_metadata(self, root: ET.Element):
|
||||||
|
"""
|
||||||
|
Parse metadata from the package document.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
root: Root element of the package document
|
||||||
|
"""
|
||||||
|
# Find the metadata element
|
||||||
|
metadata_elem = root.find('.//{{{0}}}metadata'.format(NAMESPACES['opf']))
|
||||||
|
if metadata_elem is None:
|
||||||
|
return
|
||||||
|
|
||||||
|
# Parse DC metadata
|
||||||
|
for elem in metadata_elem:
|
||||||
|
if elem.tag.startswith('{{{0}}}'.format(NAMESPACES['dc'])):
|
||||||
|
# Get the local name (without namespace)
|
||||||
|
name = elem.tag.split('}', 1)[1]
|
||||||
|
value = elem.text
|
||||||
|
|
||||||
|
if name == 'title':
|
||||||
|
self.metadata['title'] = value
|
||||||
|
elif name == 'creator':
|
||||||
|
self.metadata['creator'] = value
|
||||||
|
elif name == 'language':
|
||||||
|
self.metadata['language'] = value
|
||||||
|
elif name == 'description':
|
||||||
|
self.metadata['description'] = value
|
||||||
|
elif name == 'subject':
|
||||||
|
if 'subjects' not in self.metadata:
|
||||||
|
self.metadata['subjects'] = []
|
||||||
|
self.metadata['subjects'].append(value)
|
||||||
|
elif name == 'date':
|
||||||
|
self.metadata['date'] = value
|
||||||
|
elif name == 'identifier':
|
||||||
|
self.metadata['identifier'] = value
|
||||||
|
elif name == 'publisher':
|
||||||
|
self.metadata['publisher'] = value
|
||||||
|
else:
|
||||||
|
# Store other metadata
|
||||||
|
self.metadata[name] = value
|
||||||
|
|
||||||
|
def _parse_manifest(self, root: ET.Element):
|
||||||
|
"""
|
||||||
|
Parse manifest from the package document.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
root: Root element of the package document
|
||||||
|
"""
|
||||||
|
# Find the manifest element
|
||||||
|
manifest_elem = root.find('.//{{{0}}}manifest'.format(NAMESPACES['opf']))
|
||||||
|
if manifest_elem is None:
|
||||||
|
return
|
||||||
|
|
||||||
|
# Parse items
|
||||||
|
for item in manifest_elem.findall('.//{{{0}}}item'.format(NAMESPACES['opf'])):
|
||||||
|
id = item.get('id')
|
||||||
|
href = item.get('href')
|
||||||
|
media_type = item.get('media-type')
|
||||||
|
|
||||||
|
if id and href:
|
||||||
|
# Resolve relative path
|
||||||
|
href = urllib.parse.unquote(href)
|
||||||
|
path = os.path.normpath(os.path.join(self.content_dir, href))
|
||||||
|
|
||||||
|
self.manifest[id] = {
|
||||||
|
'href': href,
|
||||||
|
'path': path,
|
||||||
|
'media_type': media_type
|
||||||
|
}
|
||||||
|
|
||||||
|
def _parse_spine(self, root: ET.Element):
|
||||||
|
"""
|
||||||
|
Parse spine from the package document.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
root: Root element of the package document
|
||||||
|
"""
|
||||||
|
# Find the spine element
|
||||||
|
spine_elem = root.find('.//{{{0}}}spine'.format(NAMESPACES['opf']))
|
||||||
|
if spine_elem is None:
|
||||||
|
return
|
||||||
|
|
||||||
|
# Get the toc attribute (NCX file ID)
|
||||||
|
toc_id = spine_elem.get('toc')
|
||||||
|
if toc_id and toc_id in self.manifest:
|
||||||
|
self.toc_path = self.manifest[toc_id]['path']
|
||||||
|
|
||||||
|
# Parse itemrefs
|
||||||
|
for itemref in spine_elem.findall('.//{{{0}}}itemref'.format(NAMESPACES['opf'])):
|
||||||
|
idref = itemref.get('idref')
|
||||||
|
if idref and idref in self.manifest:
|
||||||
|
self.spine.append(idref)
|
||||||
|
|
||||||
|
def _parse_toc(self):
|
||||||
|
"""Parse the table of contents."""
|
||||||
|
if not hasattr(self, 'toc_path') or not self.toc_path or not os.path.exists(self.toc_path):
|
||||||
|
# Try to find the toc.ncx file
|
||||||
|
for root, dirs, files in os.walk(self.content_dir):
|
||||||
|
for file in files:
|
||||||
|
if file.endswith('.ncx'):
|
||||||
|
self.toc_path = os.path.join(root, file)
|
||||||
|
break
|
||||||
|
if hasattr(self, 'toc_path') and self.toc_path:
|
||||||
|
break
|
||||||
|
|
||||||
|
if not hasattr(self, 'toc_path') or not self.toc_path or not os.path.exists(self.toc_path):
|
||||||
|
# No TOC found
|
||||||
|
return
|
||||||
|
|
||||||
|
# Parse the NCX file
|
||||||
|
tree = ET.parse(self.toc_path)
|
||||||
|
root = tree.getroot()
|
||||||
|
|
||||||
|
# Parse navMap
|
||||||
|
nav_map = root.find('.//{{{0}}}navMap'.format(NAMESPACES['ncx']))
|
||||||
|
if nav_map is None:
|
||||||
|
return
|
||||||
|
|
||||||
|
# Parse navPoints
|
||||||
|
self._parse_nav_points(nav_map, [])
|
||||||
|
|
||||||
|
def _parse_nav_points(self, parent: ET.Element, path: List[Dict[str, Any]]):
|
||||||
|
"""
|
||||||
|
Recursively parse navPoints from the NCX file.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
parent: Parent element containing navPoints
|
||||||
|
path: Current path in the TOC hierarchy
|
||||||
|
"""
|
||||||
|
for nav_point in parent.findall('.//{{{0}}}navPoint'.format(NAMESPACES['ncx'])):
|
||||||
|
# Get navPoint attributes
|
||||||
|
id = nav_point.get('id')
|
||||||
|
play_order = nav_point.get('playOrder')
|
||||||
|
|
||||||
|
# Get navLabel
|
||||||
|
nav_label = nav_point.find('.//{{{0}}}navLabel'.format(NAMESPACES['ncx']))
|
||||||
|
text_elem = nav_label.find('.//{{{0}}}text'.format(NAMESPACES['ncx'])) if nav_label else None
|
||||||
|
label = text_elem.text if text_elem is not None else ""
|
||||||
|
|
||||||
|
# Get content
|
||||||
|
content = nav_point.find('.//{{{0}}}content'.format(NAMESPACES['ncx']))
|
||||||
|
src = content.get('src') if content is not None else ""
|
||||||
|
|
||||||
|
# Create a TOC entry
|
||||||
|
entry = {
|
||||||
|
'id': id,
|
||||||
|
'label': label,
|
||||||
|
'src': src,
|
||||||
|
'play_order': play_order,
|
||||||
|
'children': []
|
||||||
|
}
|
||||||
|
|
||||||
|
# Add to TOC
|
||||||
|
if path:
|
||||||
|
path[-1]['children'].append(entry)
|
||||||
|
else:
|
||||||
|
self.toc.append(entry)
|
||||||
|
|
||||||
|
# Parse child navPoints
|
||||||
|
self._parse_nav_points(nav_point, path + [entry])
|
||||||
|
|
||||||
|
def _create_book(self):
|
||||||
|
"""Create a Book object from the parsed metadata."""
|
||||||
|
# Set book metadata
|
||||||
|
if 'title' in self.metadata:
|
||||||
|
self.book.set_title(self.metadata['title'])
|
||||||
|
|
||||||
|
if 'creator' in self.metadata:
|
||||||
|
self.book.set_metadata(MetadataType.AUTHOR, self.metadata['creator'])
|
||||||
|
|
||||||
|
if 'language' in self.metadata:
|
||||||
|
self.book.set_metadata(MetadataType.LANGUAGE, self.metadata['language'])
|
||||||
|
|
||||||
|
if 'description' in self.metadata:
|
||||||
|
self.book.set_metadata(MetadataType.DESCRIPTION, self.metadata['description'])
|
||||||
|
|
||||||
|
if 'subjects' in self.metadata:
|
||||||
|
self.book.set_metadata(MetadataType.KEYWORDS, ', '.join(self.metadata['subjects']))
|
||||||
|
|
||||||
|
if 'date' in self.metadata:
|
||||||
|
self.book.set_metadata(MetadataType.PUBLICATION_DATE, self.metadata['date'])
|
||||||
|
|
||||||
|
if 'identifier' in self.metadata:
|
||||||
|
self.book.set_metadata(MetadataType.IDENTIFIER, self.metadata['identifier'])
|
||||||
|
|
||||||
|
if 'publisher' in self.metadata:
|
||||||
|
self.book.set_metadata(MetadataType.PUBLISHER, self.metadata['publisher'])
|
||||||
|
|
||||||
|
def _add_chapters(self):
|
||||||
|
"""Add chapters to the book based on the spine and TOC."""
|
||||||
|
# Create a mapping from src to TOC entry
|
||||||
|
toc_map = {}
|
||||||
|
|
||||||
|
def add_to_toc_map(entries):
|
||||||
|
for entry in entries:
|
||||||
|
if entry['src']:
|
||||||
|
# Extract the path part of the src (remove fragment)
|
||||||
|
src_parts = entry['src'].split('#', 1)
|
||||||
|
path = src_parts[0]
|
||||||
|
toc_map[path] = entry
|
||||||
|
|
||||||
|
# Process children
|
||||||
|
if entry['children']:
|
||||||
|
add_to_toc_map(entry['children'])
|
||||||
|
|
||||||
|
add_to_toc_map(self.toc)
|
||||||
|
|
||||||
|
# Process spine items
|
||||||
|
for i, idref in enumerate(self.spine):
|
||||||
|
if idref not in self.manifest:
|
||||||
|
continue
|
||||||
|
|
||||||
|
item = self.manifest[idref]
|
||||||
|
path = item['path']
|
||||||
|
href = item['href']
|
||||||
|
|
||||||
|
# Check if this item is in the TOC
|
||||||
|
chapter_title = None
|
||||||
|
if href in toc_map:
|
||||||
|
chapter_title = toc_map[href]['label']
|
||||||
|
|
||||||
|
# Create a chapter
|
||||||
|
chapter = self.book.create_chapter(chapter_title, i + 1)
|
||||||
|
|
||||||
|
# Parse the HTML content
|
||||||
|
try:
|
||||||
|
# Read the HTML file
|
||||||
|
with open(path, 'r', encoding='utf-8') as f:
|
||||||
|
html = f.read()
|
||||||
|
|
||||||
|
# Parse HTML and add blocks to chapter
|
||||||
|
base_url = os.path.dirname(path)
|
||||||
|
document = parse_html(html, base_url)
|
||||||
|
|
||||||
|
# Copy blocks to the chapter
|
||||||
|
for block in document.blocks:
|
||||||
|
chapter.add_block(block)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error parsing chapter {i+1}: {str(e)}")
|
||||||
|
# Add an error message block
|
||||||
|
from pyWebLayout.abstract.block import Parapgraph
|
||||||
|
from pyWebLayout.abstract.inline import Word
|
||||||
|
error_para = Parapgraph()
|
||||||
|
error_para.add_word(Word(f"Error loading chapter: {str(e)}"))
|
||||||
|
chapter.add_block(error_para)
|
||||||
|
|
||||||
|
|
||||||
|
def read_epub(epub_path: str) -> Book:
    """
    Parse the EPUB file at ``epub_path`` into a Book.

    Shortcut for constructing an ``EPUBReader`` and calling its ``read()``.

    Args:
        epub_path: Location of the EPUB file on disk

    Returns:
        Book: The book produced by the reader
    """
    return EPUBReader(epub_path).read()
|
||||||
190
pyWebLayout/io/readers/html.py
Normal file
190
pyWebLayout/io/readers/html.py
Normal file
@ -0,0 +1,190 @@
|
|||||||
|
"""
|
||||||
|
Modern HTML reader for pyWebLayout.
|
||||||
|
|
||||||
|
This module provides a decomposed HTML reader that uses specialized
|
||||||
|
readers for metadata, content, and resources, following the pattern
|
||||||
|
established in the abstract module.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
from typing import Union, Optional
|
||||||
|
from pyWebLayout.abstract.document import Document
|
||||||
|
from pyWebLayout.io.readers.base import CompositeReader
|
||||||
|
from pyWebLayout.io.readers.html_metadata import HTMLMetadataReader
|
||||||
|
from pyWebLayout.io.readers.html_content import HTMLContentReader
|
||||||
|
from pyWebLayout.io.readers.html_resources import HTMLResourceReader
|
||||||
|
|
||||||
|
|
||||||
|
class HTMLReader(CompositeReader):
    """
    Modern HTML reader using decomposed architecture.

    This reader combines specialized readers for metadata, content,
    and resources to provide a complete HTML parsing solution.
    """

    def __init__(self):
        """Initialize the HTML reader with all specialized readers."""
        super().__init__()

        # Set up specialized readers
        self.set_metadata_reader(HTMLMetadataReader())
        self.set_content_reader(HTMLContentReader())
        self.set_resource_reader(HTMLResourceReader())

    @staticmethod
    def _looks_like_html(text: str) -> bool:
        """
        Very basic sniff test: does *text* look like an HTML document?

        True when the text (ignoring surrounding whitespace and case) starts
        with an HTML doctype or contains '<html' within its first 200
        characters.
        """
        head = text.lower().strip()
        return head.startswith('<!doctype html') or '<html' in head[:200]

    def can_read(self, source: Union[str, bytes]) -> bool:
        """
        Check if this reader can handle the given source.

        Args:
            source: The source to check: a file path, or HTML markup as
                str or bytes

        Returns:
            True if this reader can handle the source, False otherwise.
            An existing file is accepted by extension (.html, .htm, .xhtml);
            anything else is accepted when it looks like HTML markup.
        """
        if isinstance(source, str):
            # An existing file is judged by its extension only.
            if os.path.isfile(source):
                return source.lower().endswith(('.html', '.htm', '.xhtml'))
            return self._looks_like_html(source)

        if isinstance(source, bytes):
            # errors='ignore' drops undecodable bytes instead of raising,
            # so no exception handling is needed around the decode.
            return self._looks_like_html(source.decode('utf-8', errors='ignore'))

        return False

    def read(self, source: Union[str, bytes], **options) -> Document:
        """
        Read and parse the HTML source into a Document.

        Args:
            source: The HTML source to read: a file path, or HTML markup as
                str or bytes. URLs are not fetched; a string that is not an
                existing file is parsed as markup.
            **options: Additional options for reading
                - base_url: Base URL for resolving relative links
                - encoding: Character encoding (default: 'utf-8')
                - extract_metadata: Whether to extract metadata (default: True)
                - extract_resources: Whether to extract resources (default: True)

        Returns:
            The parsed Document

        Raises:
            ValueError: If source is neither str nor bytes.
        """
        base_url = options.get('base_url')
        encoding = options.get('encoding', 'utf-8')
        extract_metadata = options.get('extract_metadata', True)
        extract_resources = options.get('extract_resources', True)

        html_content = self._read_html_content(source, encoding)

        # Default the base URL to the file's directory when reading a file.
        if not base_url and isinstance(source, str) and os.path.isfile(source):
            base_url = f"file://{os.path.dirname(os.path.abspath(source))}/"

        # Always forwarded (even when None) so a base URL from an earlier
        # read() on the same reader does not linger.
        if self._content_reader and hasattr(self._content_reader, 'set_base_url'):
            self._content_reader.set_base_url(base_url)

        document = Document()

        if extract_metadata and self._metadata_reader:
            self._metadata_reader.extract_metadata(html_content, document)

        if self._content_reader:
            self._content_reader.extract_content(html_content, document)

        if extract_resources and self._resource_reader:
            self._resource_reader.extract_resources(html_content, document)

        return document

    def _read_html_content(self, source: Union[str, bytes], encoding: str = 'utf-8') -> str:
        """
        Read HTML content from various sources.

        Args:
            source: bytes (decoded with *encoding*), the path of an existing
                file (read with *encoding*), or a str of HTML markup
                (returned unchanged)
            encoding: Character encoding to use; undecodable input is
                replaced rather than raising

        Returns:
            The HTML content as a string

        Raises:
            ValueError: If source is neither str nor bytes.
        """
        if isinstance(source, bytes):
            return source.decode(encoding, errors='replace')

        if isinstance(source, str):
            if os.path.isfile(source):
                with open(source, 'r', encoding=encoding, errors='replace') as f:
                    return f.read()
            # Not an existing file: assume it's HTML content
            return source

        raise ValueError(f"Unsupported source type: {type(source)}")
|
||||||
|
|
||||||
|
|
||||||
|
def read_html(source: Union[str, bytes], **options) -> Document:
    """
    Parse an HTML source with a freshly created ``HTMLReader``.

    Args:
        source: What to read; passed unchanged to ``HTMLReader.read``
        **options: Reader options, passed unchanged to ``HTMLReader.read``

    Returns:
        The Document produced by the reader
    """
    return HTMLReader().read(source, **options)
|
||||||
|
|
||||||
|
|
||||||
|
def read_html_file(file_path: str, **options) -> Document:
    """
    Parse the HTML file at ``file_path`` with a freshly created ``HTMLReader``.

    Args:
        file_path: Location of the HTML file on disk
        **options: Reader options, passed unchanged to ``HTMLReader.read``

    Returns:
        The Document produced by the reader

    Raises:
        FileNotFoundError: If ``file_path`` is not an existing file
    """
    if os.path.isfile(file_path):
        html_reader = HTMLReader()
        return html_reader.read(file_path, **options)

    raise FileNotFoundError(f"HTML file not found: {file_path}")
|
||||||
|
|
||||||
|
|
||||||
|
def parse_html_string(html_content: str, **options) -> Document:
    """
    Parse HTML markup held in a string with a freshly created ``HTMLReader``.

    Only ``html_content`` may be given positionally; everything else
    (for example ``base_url``) must be supplied as a keyword option.

    Args:
        html_content: The markup to parse
        **options: Reader options, passed unchanged to ``HTMLReader.read``

    Returns:
        The Document produced by the reader
    """
    return HTMLReader().read(html_content, **options)
|
||||||
269
pyWebLayout/io/readers/html_content.py
Normal file
269
pyWebLayout/io/readers/html_content.py
Normal file
@ -0,0 +1,269 @@
|
|||||||
|
"""
|
||||||
|
Modern HTML content reader for pyWebLayout.
|
||||||
|
|
||||||
|
This module provides a decomposed HTML content reader that uses specialized
|
||||||
|
handlers and managers for different aspects of HTML parsing.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from html.parser import HTMLParser as BaseHTMLParser
|
||||||
|
from typing import Dict, List, Optional, Tuple, Union, Any
|
||||||
|
from pyWebLayout.abstract.document import Document
|
||||||
|
from pyWebLayout.io.readers.base import ContentReader
|
||||||
|
from pyWebLayout.io.readers.html_style import HTMLStyleManager
|
||||||
|
from pyWebLayout.io.readers.html_text import HTMLTextProcessor
|
||||||
|
from pyWebLayout.io.readers.html_elements import (
|
||||||
|
BlockElementHandler, ListElementHandler, TableElementHandler, InlineElementHandler
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class HTMLContentReader(ContentReader, BaseHTMLParser):
    """
    Modern HTML content reader using decomposed architecture.

    This class orchestrates specialized handlers to parse HTML content
    and convert it to pyWebLayout's abstract document model.

    The class is itself an ``html.parser.HTMLParser``: ``extract_content()``
    feeds the markup to the parser and the ``handle_*`` callbacks forward
    each tag to the block, list, table or inline handler. A style is pushed
    on the style manager for every element that is delegated and popped when
    that element ends.
    """

    # Elements that have no content and no end tag. '<br>' produces only a
    # start-tag callback, while '<br/>' produces start and end callbacks, so
    # their style is pushed and popped inside handle_starttag() and their end
    # tag (if any) must not pop again.
    _VOID_ELEMENTS = frozenset((
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'source', 'track', 'wbr',
    ))

    def __init__(self):
        """Initialize the HTML content reader."""
        BaseHTMLParser.__init__(self)

        # Initialize managers and processors
        self.style_manager = HTMLStyleManager()
        self.text_processor = HTMLTextProcessor(self.style_manager)

        # Initialize element handlers
        self.block_handler = BlockElementHandler(self.style_manager, self.text_processor)
        self.list_handler = ListElementHandler(self.text_processor)
        self.table_handler = TableElementHandler(self.text_processor)
        self.inline_handler = InlineElementHandler(self.text_processor)

        # Document and parsing state
        self._document: Optional[Document] = None
        self._in_head = False
        self._in_script = False
        self._in_style = False

    def extract_content(self, html_content: str, document: Document) -> Any:
        """
        Extract content from HTML.

        Args:
            html_content: The HTML content to parse
            document: The document to populate with content

        Returns:
            The document with populated content
        """
        self._document = document
        self._reset_state()

        # Parse the HTML content
        self.feed(html_content)
        # The parser may hold back trailing text until it is told that the
        # input is complete; close() forces it through the callbacks.
        BaseHTMLParser.close(self)

        # Flush any remaining text
        self.text_processor.flush_text()

        return document

    def set_base_url(self, base_url: str):
        """Set the base URL for resolving relative links."""
        self.inline_handler.set_base_url(base_url)

    def _reset_state(self):
        """Reset all parser state for new content."""
        # Drop anything the underlying HTMLParser still buffers from a
        # previous document (e.g. an unterminated tag).
        BaseHTMLParser.reset(self)

        # Reset managers and processors
        self.style_manager.reset()
        self.text_processor.reset()

        # Reset element handlers
        self.block_handler.reset()
        self.list_handler.reset()
        self.table_handler.reset()
        self.inline_handler.reset()

        # Reset parser flags
        self._in_head = False
        self._in_script = False
        self._in_style = False

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]):
        """Handle the start of an HTML tag."""
        tag = tag.lower()
        attrs_dict = dict(attrs)

        # Skip content in head, script, style (except body)
        if self._should_skip_content(tag):
            return

        # head/body/script/style only toggle flags; no style is pushed for
        # them, so their end tags must not pop one either.
        if self._handle_special_sections_start(tag):
            return

        # Apply styles for this element
        style = self.style_manager.apply_style_to_element(tag, attrs_dict)
        self.style_manager.push_style(style)

        # Delegate to appropriate handler
        self._delegate_start_tag(tag, attrs_dict)

        if tag in self._VOID_ELEMENTS:
            # No end tag is guaranteed to follow; balance the push now.
            self.style_manager.pop_style()

    def handle_endtag(self, tag: str):
        """Handle the end of an HTML tag."""
        tag = tag.lower()

        # Handle special section markers
        if self._handle_special_sections_end(tag):
            return

        # Skip content in head, script, style
        if self._in_head or self._in_script or self._in_style:
            return

        # Flush any accumulated text
        self.text_processor.flush_text()

        # Neither 'body' nor void elements have a style left on the stack
        # (see handle_starttag), and neither has an end-tag handler.
        if tag == 'body' or tag in self._VOID_ELEMENTS:
            return

        # Delegate to appropriate handler
        self._delegate_end_tag(tag)

        # Pop the style pushed by the matching start tag
        self.style_manager.pop_style()

    def handle_data(self, data: str):
        """Handle text data."""
        if self._in_head or self._in_script or self._in_style:
            return

        self.text_processor.add_text(data)

    def handle_entityref(self, name: str):
        """
        Handle an HTML entity reference.

        Only called when the parser runs with ``convert_charrefs=False``;
        with the default used by ``__init__`` references arrive already
        decoded through ``handle_data``.
        """
        if self._in_head or self._in_script or self._in_style:
            return

        self.text_processor.add_entity_reference(name)

    def handle_charref(self, name: str):
        """
        Handle a character reference.

        Only called when the parser runs with ``convert_charrefs=False``;
        see ``handle_entityref``.
        """
        if self._in_head or self._in_script or self._in_style:
            return

        self.text_processor.add_character_reference(name)

    def _should_skip_content(self, tag: str) -> bool:
        """
        Check if a start tag should be ignored based on current state.

        Inside head/script/style every start tag is ignored except
        'head', 'script', 'style' and 'body', which the special-section
        handler needs to see.
        """
        if self._in_head or self._in_script or self._in_style:
            if tag in ('head', 'script', 'style'):
                return False  # Let special section handlers deal with these
            if tag != 'body':
                return True
        return False

    def _handle_special_sections_start(self, tag: str) -> bool:
        """Handle special section start tags. Returns True if handled."""
        if tag == 'head':
            self._in_head = True
            return True
        elif tag == 'body':
            # <body> implicitly ends the head section.
            self._in_head = False
            return True
        elif tag == 'script':
            self._in_script = True
            return True
        elif tag == 'style':
            self._in_style = True
            return True
        return False

    def _handle_special_sections_end(self, tag: str) -> bool:
        """
        Handle special section end tags. Returns True if handled.

        Only clears the matching flag. No style is popped here because
        handle_starttag() never pushes one for these tags; popping would
        remove the style of the enclosing element instead.
        """
        if tag == 'head':
            self._in_head = False
            return True
        elif tag == 'script':
            self._in_script = False
            return True
        elif tag == 'style':
            self._in_style = False
            return True
        return False

    def _delegate_start_tag(self, tag: str, attrs: Dict[str, str]):
        """Delegate start tag handling to appropriate handler."""
        # Block elements
        if tag == 'p':
            self.block_handler.handle_paragraph_start(self._document)
        elif tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
            self.block_handler.handle_heading_start(tag, self._document)
        elif tag == 'div':
            self.block_handler.handle_div_start(self._document)
        elif tag == 'blockquote':
            self.block_handler.handle_blockquote_start(self._document)
        elif tag == 'pre':
            self.block_handler.handle_pre_start(self._document)
        elif tag == 'code':
            self.block_handler.handle_code_start(attrs, self._document)

        # List elements
        elif tag in ('ul', 'ol', 'dl'):
            self.list_handler.handle_list_start(tag, self.block_handler, self._document)
        elif tag == 'li':
            self.list_handler.handle_list_item_start(self.block_handler)
        elif tag in ('dt', 'dd'):
            self.list_handler.handle_definition_start(tag, self.block_handler)

        # Table elements
        elif tag == 'table':
            self.table_handler.handle_table_start(attrs, self.block_handler, self._document)
        elif tag in ('thead', 'tbody', 'tfoot'):
            self.table_handler.handle_table_section_start(tag)
        elif tag == 'tr':
            self.table_handler.handle_table_row_start()
        elif tag in ('td', 'th'):
            self.table_handler.handle_table_cell_start(tag, attrs, self.block_handler)

        # Inline elements
        elif tag == 'a':
            self.inline_handler.handle_link_start(attrs)
        elif tag == 'img':
            self.inline_handler.handle_image(attrs, self.block_handler, self._document)
        elif tag == 'br':
            self.inline_handler.handle_line_break(self.block_handler)
        elif tag == 'hr':
            self.inline_handler.handle_horizontal_rule(self.block_handler, self._document)

        # Style-only elements (no special handling needed, just styling)
        elif tag in ('b', 'strong', 'i', 'em', 'u', 'span'):
            pass  # Styles are already applied by style manager

    def _delegate_end_tag(self, tag: str):
        """Delegate end tag handling to appropriate handler."""
        # Block elements
        if tag in ('p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'blockquote', 'pre', 'code'):
            self.block_handler.handle_block_end()

        # List elements
        elif tag in ('ul', 'ol', 'dl'):
            self.list_handler.handle_list_end(self.block_handler)
        elif tag in ('li', 'dt', 'dd'):
            self.list_handler.handle_list_item_end(self.block_handler)

        # Table elements
        elif tag == 'table':
            self.table_handler.handle_table_end(self.block_handler)
        elif tag in ('thead', 'tbody', 'tfoot'):
            self.table_handler.handle_table_section_end()
        elif tag == 'tr':
            self.table_handler.handle_table_row_end()
        elif tag in ('td', 'th'):
            self.table_handler.handle_table_cell_end(self.block_handler)

        # Inline elements
        elif tag == 'a':
            self.inline_handler.handle_link_end()

        # Style-only elements (no special handling needed)
        elif tag in ('b', 'strong', 'i', 'em', 'u', 'span'):
            pass  # Styles are handled by style manager
|
||||||
472
pyWebLayout/io/readers/html_elements.py
Normal file
472
pyWebLayout/io/readers/html_elements.py
Normal file
@ -0,0 +1,472 @@
|
|||||||
|
"""
|
||||||
|
HTML element handlers for pyWebLayout.
|
||||||
|
|
||||||
|
This module provides specialized handlers for different types of HTML elements,
|
||||||
|
using composition and delegation to handle specific element types.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import Dict, List, Optional, Any
|
||||||
|
import urllib.parse
|
||||||
|
from pyWebLayout.abstract.document import Document
|
||||||
|
from pyWebLayout.abstract.block import (
|
||||||
|
Block, Parapgraph, Heading, HeadingLevel, Quote, CodeBlock,
|
||||||
|
HList, ListStyle, ListItem, Table, TableRow, TableCell,
|
||||||
|
HorizontalRule, LineBreak, Image
|
||||||
|
)
|
||||||
|
from pyWebLayout.abstract.functional import Link, LinkType
|
||||||
|
from pyWebLayout.io.readers.html_style import HTMLStyleManager
|
||||||
|
from pyWebLayout.io.readers.html_text import HTMLTextProcessor
|
||||||
|
|
||||||
|
|
||||||
|
class BlockElementHandler:
    """Handles block-level HTML elements like paragraphs, headings, divs.

    The handler tracks the blocks that are currently open in ``block_stack``.
    ``current_block`` mirrors the top of that stack and ``current_paragraph``
    is the paragraph-like block that should receive text (or ``None`` when
    the open block is a pure container).  Every change of the text target is
    forwarded to the shared text processor.
    """

    # Prefix of the conventional ``class="language-xxx"`` marker on <code>.
    _LANGUAGE_CLASS_PREFIX = 'language-'

    def __init__(self, style_manager: HTMLStyleManager, text_processor: HTMLTextProcessor):
        """
        Initialize the handler.

        Args:
            style_manager: Style manager shared with the HTML reader
            text_processor: Text processor that receives the current paragraph
        """
        self.style_manager = style_manager
        self.text_processor = text_processor
        self.block_stack: List[Block] = []
        self.current_block: Optional[Block] = None
        self.current_paragraph: Optional[Parapgraph] = None

    def reset(self):
        """Reset the handler state."""
        self.block_stack = []
        self.current_block = None
        self.current_paragraph = None

    def add_block_to_document_or_parent(self, block: Block, document: Document):
        """
        Add a block to the document or current parent block.

        The block goes to ``current_block`` when that block is truthy and
        exposes ``add_block``; otherwise it is added to the document.

        Args:
            block: The block to attach
            document: The document used as fallback parent
        """
        if self.current_block and hasattr(self.current_block, 'add_block'):
            self.current_block.add_block(block)
        else:
            document.add_block(block)

    def _open_block(self, block: Block, document: Document,
                    paragraph: Optional[Parapgraph]):
        """Attach ``block``, push it on the stack and select ``paragraph`` as text target."""
        self.add_block_to_document_or_parent(block, document)
        self.block_stack.append(block)
        self.current_block = block
        self.current_paragraph = paragraph
        self.text_processor.set_current_paragraph(paragraph)

    def handle_paragraph_start(self, document: Document):
        """Handle the start of a paragraph element."""
        self.text_processor.flush_text()
        paragraph = Parapgraph()
        self._open_block(paragraph, document, paragraph)

    def handle_heading_start(self, tag: str, document: Document):
        """
        Handle the start of a heading element.

        Args:
            tag: One of 'h1' .. 'h6'
            document: The document being built

        Raises:
            KeyError: If ``tag`` is not a heading tag name
        """
        self.text_processor.flush_text()

        level_map = {
            'h1': HeadingLevel.H1, 'h2': HeadingLevel.H2, 'h3': HeadingLevel.H3,
            'h4': HeadingLevel.H4, 'h5': HeadingLevel.H5, 'h6': HeadingLevel.H6
        }

        heading = Heading(level=level_map[tag])
        # The heading doubles as the text target, exactly like a paragraph.
        self._open_block(heading, document, heading)

    def handle_div_start(self, document: Document):
        """Handle the start of a div element (modelled as a paragraph)."""
        self.text_processor.flush_text()
        div_para = Parapgraph()
        self._open_block(div_para, document, div_para)

    def handle_blockquote_start(self, document: Document):
        """Handle the start of a blockquote element.

        The quote becomes the current block but no paragraph is selected, so
        the text target is cleared until a nested paragraph is opened.
        """
        self.text_processor.flush_text()
        quote = Quote()
        self._open_block(quote, document, None)

    def handle_pre_start(self, document: Document):
        """Handle the start of a pre element (modelled as a paragraph)."""
        self.text_processor.flush_text()
        pre_para = Parapgraph()
        self._open_block(pre_para, document, pre_para)

    @classmethod
    def _language_from_class(cls, class_attr: Optional[str]) -> str:
        """
        Extract the language from a ``class`` attribute value.

        The attribute may hold several whitespace-separated class names
        (e.g. ``"hljs language-python"``) or be ``None`` for a valueless
        attribute; the first ``language-*`` token wins.

        Returns:
            The language name, or an empty string when none is declared
        """
        prefix = cls._LANGUAGE_CLASS_PREFIX
        for token in (class_attr or '').split():
            if token.startswith(prefix):
                return token[len(prefix):]
        return ""

    @staticmethod
    def _replace_block(old: Block, new: Block, document: Document):
        """
        Swap ``old`` for ``new`` in the container that holds ``old``.

        The parent's ``_blocks`` list is used when ``old`` has a parent that
        exposes one (``new.parent`` is then set to that parent); otherwise the
        document's ``blocks`` sequence is searched.  Blocks are matched by
        identity so an equal-looking sibling is never replaced by mistake.
        """
        parent = old.parent
        in_parent = bool(parent) and hasattr(parent, '_blocks')
        siblings = parent._blocks if in_parent else document.blocks

        for index, candidate in enumerate(siblings):
            if candidate is old:
                siblings[index] = new
                if in_parent:
                    new.parent = parent
                break

    def handle_code_start(self, attrs: Dict[str, str], document: Document):
        """
        Handle the start of a code element.

        When the innermost open block is a paragraph instance (which is how
        ``<pre>`` is represented), that paragraph is replaced by a
        ``CodeBlock`` both in its container and on the block stack.  In any
        other context the element is ignored here.

        Args:
            attrs: Attributes of the code element; ``class`` may carry a
                ``language-*`` marker
            document: The document being built
        """
        if not (self.block_stack and isinstance(self.block_stack[-1], Parapgraph)):
            return

        pre_para = self.block_stack.pop()
        code_block = CodeBlock(language=self._language_from_class(attrs.get('class')))
        self._replace_block(pre_para, code_block, document)

        # NOTE(review): the code block takes the paragraph's stack slot, so a
        # single block-end call must close the pair - confirm in the caller.
        self.block_stack.append(code_block)
        self.current_block = code_block
        self.current_paragraph = None
        self.text_processor.set_current_paragraph(None)

    def handle_block_end(self):
        """
        Handle the end of a block element.

        Pops the innermost block (if any) and re-selects the new innermost
        block; it becomes the text target only when it is a paragraph
        instance.
        """
        if self.block_stack:
            self.block_stack.pop()

        if self.block_stack:
            self.current_block = self.block_stack[-1]
            if isinstance(self.current_block, Parapgraph):
                self.current_paragraph = self.current_block
            else:
                self.current_paragraph = None
        else:
            self.current_block = None
            self.current_paragraph = None

        self.text_processor.set_current_paragraph(self.current_paragraph)
|
||||||
|
|
||||||
|
|
||||||
|
class ListElementHandler:
    """Builds list structures from ul/ol/dl containers and li/dt/dd entries.

    Open lists are tracked innermost-last in ``list_stack``; the block
    handler passed to each method owns the general block stack and the
    current block / paragraph pointers that this class updates.
    """

    def __init__(self, text_processor: HTMLTextProcessor):
        self.text_processor = text_processor
        self.list_stack: List[HList] = []

    def reset(self):
        """Forget every open list."""
        self.list_stack = []

    def _select_paragraph(self, block_handler: BlockElementHandler, paragraph):
        """Make ``paragraph`` (possibly ``None``) the target for following text."""
        block_handler.current_paragraph = paragraph
        self.text_processor.set_current_paragraph(paragraph)

    def _append_paragraph(self, block_handler: BlockElementHandler, item):
        """Give ``item`` a fresh paragraph and select it as text target."""
        content = Parapgraph()
        item.add_block(content)
        self._select_paragraph(block_handler, content)

    def _open_item(self, block_handler: BlockElementHandler, item):
        """Register ``item`` with the innermost list and open it for content."""
        self.list_stack[-1].add_item(item)
        block_handler.block_stack.append(item)
        block_handler.current_block = item
        self._append_paragraph(block_handler, item)

    def _leave_block(self, block_handler: BlockElementHandler):
        """Pop the innermost block (if any) and clear the text target."""
        stack = block_handler.block_stack
        if stack:
            stack.pop()
        block_handler.current_block = stack[-1] if stack else None
        self._select_paragraph(block_handler, None)

    def handle_list_start(self, tag: str, block_handler: BlockElementHandler, document: Document):
        """
        Open a list for a 'ul', 'ol' or 'dl' tag.

        Raises:
            KeyError: If ``tag`` is none of the three list tags
        """
        self.text_processor.flush_text()

        if tag == 'ul':
            style = ListStyle.UNORDERED
        elif tag == 'ol':
            style = ListStyle.ORDERED
        elif tag == 'dl':
            style = ListStyle.DEFINITION
        else:
            raise KeyError(tag)

        new_list = HList(style=style)
        block_handler.add_block_to_document_or_parent(new_list, document)

        self.list_stack.append(new_list)
        block_handler.block_stack.append(new_list)
        block_handler.current_block = new_list
        # A list is a container: text has no paragraph to go to until an item opens.
        self._select_paragraph(block_handler, None)

    def handle_list_item_start(self, block_handler: BlockElementHandler):
        """Open a list item in the innermost list; ignored when no list is open."""
        if self.list_stack:
            self.text_processor.flush_text()
            self._open_item(block_handler, ListItem())

    def handle_definition_start(self, tag: str, block_handler: BlockElementHandler):
        """
        Open a definition term ('dt') or description ('dd').

        Ignored unless the innermost open list is a definition list.  A term
        starts a new item; a description adds a paragraph to the most recent
        item of that list, if there is one.
        """
        innermost = self.list_stack[-1] if self.list_stack else None
        if innermost is None or innermost.style != ListStyle.DEFINITION:
            return

        self.text_processor.flush_text()

        if tag == 'dt':
            self._open_item(block_handler, ListItem(term=""))
        elif tag == 'dd':
            existing = innermost._items
            if existing:
                self._append_paragraph(block_handler, existing[-1])

    def handle_list_end(self, block_handler: BlockElementHandler):
        """Close the innermost list and fall back to the enclosing block."""
        if self.list_stack:
            self.list_stack.pop()
        self._leave_block(block_handler)

    def handle_list_item_end(self, block_handler: BlockElementHandler):
        """Close the innermost block and fall back to the enclosing block."""
        self._leave_block(block_handler)
|
||||||
|
|
||||||
|
|
||||||
|
class TableElementHandler:
    """Handles table-related HTML elements (table, tr, td, th, thead, tbody, tfoot).

    Open tables are tracked innermost-last in ``table_stack``.  The row and
    section being filled belong to the innermost table; when a nested table
    opens they are put aside and restored once the nested table closes, so
    the enclosing table can continue receiving cells.
    """

    # HTML section tags that map to a non-default section name of the table.
    _SECTION_NAMES = {'thead': 'header', 'tfoot': 'footer'}

    def __init__(self, text_processor: HTMLTextProcessor):
        self.text_processor = text_processor
        self.table_stack: List[Table] = []
        self.current_table_row: Optional[TableRow] = None
        self.current_table_section = "body"
        # (row, section) pairs of enclosing tables while a nested table is open.
        self._suspended_state: List[Any] = []

    def reset(self):
        """Reset the handler state."""
        self.table_stack = []
        self.current_table_row = None
        self.current_table_section = "body"
        self._suspended_state = []

    @staticmethod
    def _parse_span(attrs: Dict[str, str], name: str) -> int:
        """
        Read an integer span attribute.

        Returns:
            The parsed value, or 1 when the attribute is absent, valueless
            (``None``) or not an integer
        """
        try:
            return int(attrs.get(name, 1))
        except (TypeError, ValueError):
            return 1

    def _leave_block(self, block_handler: BlockElementHandler):
        """Pop the innermost block (if any) and clear the text target."""
        if block_handler.block_stack:
            block_handler.block_stack.pop()

        if block_handler.block_stack:
            block_handler.current_block = block_handler.block_stack[-1]
        else:
            block_handler.current_block = None

        block_handler.current_paragraph = None
        self.text_processor.set_current_paragraph(None)

    def handle_table_start(self, attrs: Dict[str, str], block_handler: BlockElementHandler, document: Document):
        """
        Handle the start of a table element.

        Args:
            attrs: Attributes of the table element; ``summary`` is used as caption
            block_handler: Owner of the block stack
            document: The document being built
        """
        self.text_processor.flush_text()

        caption = attrs.get('summary')
        table = Table(caption=caption)

        block_handler.add_block_to_document_or_parent(table, document)
        block_handler.block_stack.append(table)
        self.table_stack.append(table)
        block_handler.current_block = table
        block_handler.current_paragraph = None
        self.text_processor.set_current_paragraph(None)

        # Put the enclosing table's progress aside; the new table starts clean.
        self._suspended_state.append((self.current_table_row, self.current_table_section))
        self.current_table_row = None
        self.current_table_section = "body"

    def handle_table_section_start(self, tag: str):
        """Handle the start of a table section (thead, tbody or tfoot)."""
        self.current_table_section = tag

    def handle_table_row_start(self):
        """Handle the start of a table row; ignored when no table is open."""
        if not self.table_stack:
            return

        self.text_processor.flush_text()
        row = TableRow()

        # Anything that is not thead/tfoot (including "body") lands in the body.
        section = self._SECTION_NAMES.get(self.current_table_section, "body")
        self.table_stack[-1].add_row(row, section=section)
        self.current_table_row = row

    def handle_table_cell_start(self, tag: str, attrs: Dict[str, str], block_handler: BlockElementHandler):
        """
        Handle the start of a table cell; ignored when no row is open.

        ``colspan`` and ``rowspan`` are parsed independently, so one invalid
        value does not discard the other.

        Args:
            tag: 'th' for a header cell, anything else for a data cell
            attrs: Attributes of the cell element
            block_handler: Owner of the block stack
        """
        if not self.current_table_row:
            return

        self.text_processor.flush_text()

        colspan = self._parse_span(attrs, 'colspan')
        rowspan = self._parse_span(attrs, 'rowspan')
        is_header = (tag == 'th')

        cell = TableCell(is_header=is_header, colspan=colspan, rowspan=rowspan)
        self.current_table_row.add_cell(cell)

        block_handler.block_stack.append(cell)
        block_handler.current_block = cell

        # Create a paragraph for the cell content
        cell_para = Parapgraph()
        cell.add_block(cell_para)
        block_handler.current_paragraph = cell_para
        self.text_processor.set_current_paragraph(cell_para)

    def handle_table_end(self, block_handler: BlockElementHandler):
        """
        Handle the end of a table.

        Closes the innermost table and restores the row and section of the
        enclosing table, or the defaults when there is none.
        """
        if self.table_stack:
            self.table_stack.pop()
        self._leave_block(block_handler)

        if self._suspended_state:
            self.current_table_row, self.current_table_section = self._suspended_state.pop()
        else:
            self.current_table_row = None
            self.current_table_section = "body"

    def handle_table_section_end(self):
        """Handle the end of a table section."""
        self.current_table_section = "body"

    def handle_table_row_end(self):
        """Handle the end of a table row."""
        self.current_table_row = None

    def handle_table_cell_end(self, block_handler: BlockElementHandler):
        """Handle the end of a table cell."""
        # NOTE(review): this pops whatever block is innermost, also when the
        # matching cell start was ignored - confirm the caller keeps them paired.
        self._leave_block(block_handler)
|
||||||
|
|
||||||
|
|
||||||
|
class InlineElementHandler:
    """Handles inline and special HTML elements (a, img, br, hr).

    Attribute values may be ``None`` for valueless attributes; such values
    are treated like missing ones.
    """

    _API_PREFIX = 'api:'

    def __init__(self, text_processor: HTMLTextProcessor, base_url: Optional[str] = None):
        """
        Initialize the handler.

        Args:
            text_processor: Text processor shared with the HTML reader
            base_url: Base URL for resolving relative links and image sources
        """
        self.text_processor = text_processor
        self.base_url = base_url
        self.in_link = False
        self.current_link: Optional[Link] = None

    def reset(self):
        """Reset the handler state."""
        self.in_link = False
        self.current_link = None

    def set_base_url(self, base_url: Optional[str]):
        """Set the base URL for resolving relative links."""
        self.base_url = base_url

    def handle_link_start(self, attrs: Dict[str, str]):
        """
        Handle the start of a link element.

        The link type is derived from the scheme of ``href`` (matched
        case-insensitively): http/https is external, ``javascript:`` is a
        function link, ``api:`` is an API link whose prefix is removed from
        the stored location, everything else is internal.  Only internal,
        non-fragment links are resolved against the base URL.

        Args:
            attrs: Attributes of the anchor element (``href``, ``title``)
        """
        self.text_processor.flush_text()

        href = (attrs.get('href') or '').strip()
        title = attrs.get('title') or ''

        scheme_probe = href.lower()
        link_type = LinkType.INTERNAL
        if scheme_probe.startswith(('http://', 'https://')):
            link_type = LinkType.EXTERNAL
        elif scheme_probe.startswith('javascript:'):
            link_type = LinkType.FUNCTION
        elif scheme_probe.startswith(self._API_PREFIX):
            link_type = LinkType.API
            href = href[len(self._API_PREFIX):]

        # Decide on the link type rather than on the (possibly stripped) href,
        # so an API location is never mistaken for a relative path.
        if (link_type == LinkType.INTERNAL and self.base_url
                and not href.startswith('#')):
            href = urllib.parse.urljoin(self.base_url, href)

        self.current_link = Link(
            location=href,
            link_type=link_type,
            title=title or None
        )
        self.in_link = True

    def handle_link_end(self):
        """Handle the end of a link element."""
        self.in_link = False
        self.current_link = None

    @staticmethod
    def _parse_dimension(attrs: Dict[str, str], name: str) -> Optional[int]:
        """
        Read an integer pixel dimension.

        Returns:
            The parsed value, or ``None`` when the attribute is absent,
            valueless or not a plain integer (e.g. ``"50%"``)
        """
        try:
            return int(attrs[name])
        except (KeyError, TypeError, ValueError):
            return None

    def handle_image(self, attrs: Dict[str, str], block_handler: BlockElementHandler, document: Document):
        """
        Handle an image element.

        ``width`` and ``height`` are parsed independently.  A non-empty
        ``src`` that does not start with http:// or https:// is resolved
        against the base URL; an empty ``src`` is left empty.

        Args:
            attrs: Attributes of the image element
            block_handler: Used to attach the image to the current block
            document: The document being built
        """
        src = attrs.get('src') or ''
        alt = attrs.get('alt') or ''

        width = self._parse_dimension(attrs, 'width')
        height = self._parse_dimension(attrs, 'height')

        if src and self.base_url and not src.startswith(('http://', 'https://')):
            src = urllib.parse.urljoin(self.base_url, src)

        image = Image(source=src, alt_text=alt, width=width, height=height)
        block_handler.add_block_to_document_or_parent(image, document)

    def handle_line_break(self, block_handler: BlockElementHandler):
        """
        Handle a line break element.

        Pending text is flushed first, as in the other handlers, and the
        break is then added to the current paragraph when that paragraph
        exposes ``add_block``.
        """
        self.text_processor.flush_text()

        paragraph = block_handler.current_paragraph
        if paragraph and hasattr(paragraph, 'add_block'):
            paragraph.add_block(LineBreak())

    def handle_horizontal_rule(self, block_handler: BlockElementHandler, document: Document):
        """Handle a horizontal rule element."""
        self.text_processor.flush_text()
        hr = HorizontalRule()
        block_handler.add_block_to_document_or_parent(hr, document)
|
||||||
426
pyWebLayout/io/readers/html_metadata.py
Normal file
426
pyWebLayout/io/readers/html_metadata.py
Normal file
@ -0,0 +1,426 @@
|
|||||||
|
"""
|
||||||
|
HTML metadata reader for pyWebLayout.
|
||||||
|
|
||||||
|
This module provides specialized functionality for extracting metadata
|
||||||
|
from HTML documents, following the decomposed architecture pattern.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import Dict, Any, Optional
|
||||||
|
import re
|
||||||
|
from pyWebLayout.abstract.document import Document, MetadataType
|
||||||
|
from pyWebLayout.io.readers.base import MetadataReader
|
||||||
|
|
||||||
|
|
||||||
|
class HTMLMetadataReader(MetadataReader):
    """
    Specialized reader for extracting metadata from HTML documents.

    This class handles HTML meta tags, title elements, and other metadata
    sources like Open Graph tags and JSON-LD structured data.  Extraction is
    regex based; no DOM is built.
    """

    # <title> element; the word boundary keeps e.g. <titlebar> from matching.
    _TITLE_RE = re.compile(r'<title\b[^>]*>(.*?)</title>', re.IGNORECASE | re.DOTALL)

    # <meta ...> tag; quoted values may contain '>' without ending the tag.
    _META_RE = re.compile(r'''<meta\s+((?:"[^"]*"|'[^']*'|[^>"'])+)>''', re.IGNORECASE)

    # name=value with a double-quoted, single-quoted or unquoted value.
    # Names may contain '-' and ':' (http-equiv, og:title).
    _ATTR_RE = re.compile(
        r'''([^\s"'<>/=]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))'''
    )

    # A bare attribute name, applied to what is left once pairs are removed.
    _BARE_ATTR_RE = re.compile(r'''[^\s"'<>/=]+''')

    # JSON-LD script element; the type value may be quoted either way or not at all.
    _JSON_LD_RE = re.compile(
        r'''<script[^>]*type=["']?application/ld\+json["']?[^>]*>(.*?)</script>''',
        re.IGNORECASE | re.DOTALL
    )

    def __init__(self):
        """Initialize the HTML metadata reader."""
        self._title = None
        self._meta_tags = {}
        self._og_tags = {}
        self._twitter_tags = {}
        self._json_ld = {}

    def extract_metadata(self, html_content: str, document: Document) -> Dict[str, Any]:
        """
        Extract metadata from HTML content.

        Args:
            html_content: The HTML content to parse
            document: The document to populate with metadata

        Returns:
            Dictionary of extracted metadata with the keys 'title',
            'meta_tags', 'open_graph', 'twitter_cards' and 'json_ld'
        """
        self._reset()

        self._extract_title(html_content)
        self._extract_meta_tags(html_content)
        self._extract_open_graph(html_content)
        self._extract_twitter_cards(html_content)
        self._extract_json_ld(html_content)

        self._populate_document(document)

        return {
            'title': self._title,
            'meta_tags': self._meta_tags,
            'open_graph': self._og_tags,
            'twitter_cards': self._twitter_tags,
            'json_ld': self._json_ld
        }

    def _reset(self):
        """Reset internal state for a new extraction."""
        self._title = None
        self._meta_tags = {}
        self._og_tags = {}
        self._twitter_tags = {}
        self._json_ld = {}

    def _iter_meta_attributes(self, html_content: str):
        """Yield the parsed attribute dictionary of every meta tag, in document order."""
        for match in self._META_RE.finditer(html_content):
            yield self._parse_attributes(match.group(1))

    def _extract_title(self, html_content: str):
        """
        Extract the title from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        title_match = self._TITLE_RE.search(html_content)
        if title_match:
            self._title = self._clean_text(title_match.group(1))

    def _extract_meta_tags(self, html_content: str):
        """
        Extract meta tags from HTML content.

        ``name``/``content`` pairs are stored under the lower-cased name,
        ``http-equiv`` pairs under ``'http-equiv:<name>'`` and a ``charset``
        attribute under ``'charset'``.

        Args:
            html_content: The HTML content to parse
        """
        for attrs in self._iter_meta_attributes(html_content):
            name = attrs.get('name', '').lower()
            content = attrs.get('content', '')

            if name and content:
                self._meta_tags[name] = content

            http_equiv = attrs.get('http-equiv', '').lower()
            if http_equiv and content:
                self._meta_tags[f'http-equiv:{http_equiv}'] = content

            charset = attrs.get('charset', '')
            if charset:
                self._meta_tags['charset'] = charset

    @staticmethod
    def _collect_prefixed(attrs: Dict[str, str], attr_name: str, prefix: str,
                          target: Dict[str, str]):
        """
        Store ``content`` in ``target`` when ``attrs[attr_name]`` starts with ``prefix``.

        The prefix is matched case-insensitively and removed from the key;
        a tag without a ``content`` attribute or with nothing after the
        prefix is ignored.
        """
        key = attrs.get(attr_name, '')
        if 'content' not in attrs or len(key) <= len(prefix):
            return
        if key[:len(prefix)].lower() == prefix:
            target[key[len(prefix):]] = attrs['content']

    def _extract_open_graph(self, html_content: str):
        """
        Extract Open Graph meta tags from HTML content.

        Matches ``<meta property="og:...">`` regardless of attribute order
        and quoting style.

        Args:
            html_content: The HTML content to parse
        """
        for attrs in self._iter_meta_attributes(html_content):
            self._collect_prefixed(attrs, 'property', 'og:', self._og_tags)

    def _extract_twitter_cards(self, html_content: str):
        """
        Extract Twitter Card meta tags from HTML content.

        Matches ``<meta name="twitter:...">`` regardless of attribute order
        and quoting style.

        Args:
            html_content: The HTML content to parse
        """
        for attrs in self._iter_meta_attributes(html_content):
            self._collect_prefixed(attrs, 'name', 'twitter:', self._twitter_tags)

    def _extract_json_ld(self, html_content: str):
        """
        Extract JSON-LD structured data from HTML content.

        Script bodies that are not valid JSON are skipped.

        Args:
            html_content: The HTML content to parse
        """
        import json

        for match in self._JSON_LD_RE.finditer(html_content):
            try:
                data = json.loads(match.group(1).strip())
            except ValueError:
                # json.JSONDecodeError is a ValueError: skip invalid JSON-LD
                continue

            for node in (data if isinstance(data, list) else [data]):
                self._store_json_ld_node(node)

    def _store_json_ld_node(self, node: Any):
        """
        File a JSON-LD node under each of its ``@type`` names.

        Nodes listed in an ``@graph`` array are filed as well.  ``@type`` may
        be a single value or an array; entries that cannot serve as a
        dictionary key are ignored.
        """
        if not isinstance(node, dict):
            return

        graph = node.get('@graph')
        if isinstance(graph, list):
            for member in graph:
                self._store_json_ld_node(member)

        if '@type' not in node:
            return

        declared = node['@type']
        for type_name in (declared if isinstance(declared, list) else [declared]):
            if isinstance(type_name, (dict, list)):
                continue
            self._json_ld.setdefault(type_name, []).append(node)

    def _populate_document(self, document: Document):
        """
        Populate the document with extracted metadata.

        Only values that are truthy are written.

        Args:
            document: The document to populate
        """
        title = self._get_best_title()
        if title:
            document.set_metadata(MetadataType.TITLE, title)

        description = self._get_best_description()
        if description:
            document.set_metadata(MetadataType.DESCRIPTION, description)

        author = self._get_best_author()
        if author:
            document.set_metadata(MetadataType.AUTHOR, author)

        keywords = self._get_keywords()
        if keywords:
            document.set_metadata(MetadataType.KEYWORDS, keywords)

        language = self._get_language()
        if language:
            document.set_metadata(MetadataType.LANGUAGE, language)

        cover_image = self._get_cover_image()
        if cover_image:
            document.set_metadata(MetadataType.COVER_IMAGE, cover_image)

        publisher = self._get_publisher()
        if publisher:
            document.set_metadata(MetadataType.PUBLISHER, publisher)

        pub_date = self._get_publication_date()
        if pub_date:
            document.set_metadata(MetadataType.PUBLICATION_DATE, pub_date)

    def _json_ld_nodes(self):
        """Yield every stored JSON-LD node, grouped by type in insertion order."""
        for nodes in self._json_ld.values():
            for node in nodes:
                yield node

    @classmethod
    def _resolve_value(cls, value: Any, key: str) -> Optional[str]:
        """
        Reduce a JSON-LD property value to a single entry.

        Args:
            value: A string, an object, or an array of those
            key: Member to read when the value is an object ('name', 'url')

        Returns:
            The string itself, ``value[key]`` for an object holding ``key``,
            the first resolvable entry of an array, otherwise ``None``
        """
        if isinstance(value, str):
            return value
        if isinstance(value, dict):
            return value.get(key)
        if isinstance(value, list):
            for entry in value:
                resolved = cls._resolve_value(entry, key)
                if resolved is not None:
                    return resolved
        return None

    def _first_json_ld_value(self, properties, key: str) -> Optional[str]:
        """Return the first resolvable value of any of ``properties`` across all nodes."""
        for node in self._json_ld_nodes():
            for prop in properties:
                if prop in node:
                    resolved = self._resolve_value(node[prop], key)
                    if resolved is not None:
                        return resolved
        return None

    def _get_best_title(self) -> Optional[str]:
        """Get the best available title from all sources."""
        # Priority order: Open Graph > Twitter > JSON-LD > meta > HTML title
        if 'title' in self._og_tags:
            return self._og_tags['title']

        if 'title' in self._twitter_tags:
            return self._twitter_tags['title']

        for node in self._json_ld_nodes():
            if 'name' in node:
                return node['name']
            elif 'headline' in node:
                return node['headline']

        for key in ['title', 'og:title', 'twitter:title']:
            if key in self._meta_tags:
                return self._meta_tags[key]

        return self._title

    def _get_best_description(self) -> Optional[str]:
        """Get the best available description from all sources."""
        # Priority order: Open Graph > Twitter > meta description > JSON-LD
        for source in (self._og_tags, self._twitter_tags, self._meta_tags):
            if 'description' in source:
                return source['description']

        for node in self._json_ld_nodes():
            if 'description' in node:
                return node['description']

        return None

    def _get_best_author(self) -> Optional[str]:
        """Get the best available author from meta tags, then JSON-LD author/creator."""
        if 'author' in self._meta_tags:
            return self._meta_tags['author']

        return self._first_json_ld_value(('author', 'creator'), 'name')

    def _get_keywords(self) -> Optional[str]:
        """Get keywords from meta tags."""
        return self._meta_tags.get('keywords')

    def _get_language(self) -> Optional[str]:
        """Get language from the 'language' meta tag, if present."""
        # Could also extract from html lang attribute if needed
        return self._meta_tags.get('language')

    def _get_cover_image(self) -> Optional[str]:
        """Get the best available cover image: Open Graph > Twitter > JSON-LD."""
        if 'image' in self._og_tags:
            return self._og_tags['image']

        if 'image' in self._twitter_tags:
            return self._twitter_tags['image']

        return self._first_json_ld_value(('image',), 'url')

    def _get_publisher(self) -> Optional[str]:
        """Get publisher from JSON-LD."""
        return self._first_json_ld_value(('publisher',), 'name')

    def _get_publication_date(self) -> Optional[str]:
        """Get publication date from JSON-LD (datePublished, then publishDate)."""
        for node in self._json_ld_nodes():
            if 'datePublished' in node:
                return node['datePublished']
            elif 'publishDate' in node:
                return node['publishDate']

        return None

    def _parse_attributes(self, attr_string: str) -> Dict[str, str]:
        """
        Parse HTML attributes from a string.

        Attribute names are lower-cased.  When a name occurs more than once
        the last value wins.  Valueless attributes map to an empty string.

        Args:
            attr_string: String containing HTML attributes

        Returns:
            Dictionary of attribute name-value pairs
        """
        attrs = {}

        def store_pair(match):
            values = match.group(2, 3, 4)
            attrs[match.group(1).lower()] = next(
                (value for value in values if value is not None), ''
            )
            return ' '

        # Remove every name=value pair while recording it, so that words
        # inside values can never be mistaken for standalone attributes.
        remainder = self._ATTR_RE.sub(store_pair, attr_string)

        for bare_name in self._BARE_ATTR_RE.findall(remainder):
            attrs.setdefault(bare_name.lower(), '')

        return attrs

    def _clean_text(self, text: str) -> str:
        """
        Clean up text content by decoding HTML entities and collapsing whitespace.

        Entities are decoded first so that a non-breaking space is treated
        like any other whitespace.

        Args:
            text: The text to clean

        Returns:
            Cleaned text
        """
        import html

        return re.sub(r'\s+', ' ', html.unescape(text)).strip()
|
||||||
483
pyWebLayout/io/readers/html_resources.py
Normal file
483
pyWebLayout/io/readers/html_resources.py
Normal file
@ -0,0 +1,483 @@
|
|||||||
|
"""
|
||||||
|
HTML resources reader for pyWebLayout.
|
||||||
|
|
||||||
|
This module provides specialized functionality for extracting resources
|
||||||
|
from HTML documents, such as stylesheets, scripts, and external files.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import Dict, Any, Optional, List
|
||||||
|
import re
|
||||||
|
import urllib.parse
|
||||||
|
from pyWebLayout.abstract.document import Document
|
||||||
|
from pyWebLayout.io.readers.base import ResourceReader
|
||||||
|
|
||||||
|
|
||||||
|
class HTMLResourceReader(ResourceReader):
    """
    Specialized reader for extracting resources from HTML documents.

    This class handles CSS stylesheets, JavaScript files, images,
    and other external resources referenced in HTML. Extraction is
    regex-based: tags are located with patterns of the form
    ``<tag ...>``, so attribute values containing ``>`` are not supported.
    """

    def __init__(self):
        """Initialize the HTML resource reader with empty result containers."""
        self._reset()

    def extract_resources(self, html_content: str, document: Document) -> Dict[str, Any]:
        """
        Extract resources from HTML content.

        Args:
            html_content: The HTML content to parse
            document: The document to populate with resources

        Returns:
            Dictionary of extracted resources with the keys ``stylesheets``,
            ``scripts``, ``external_resources``, ``inline_styles`` and
            ``inline_scripts``. The values are the reader's own containers
            and are replaced on the next call.
        """
        # Reset internal state so results of a previous call do not leak in
        self._reset()

        self._extract_stylesheets(html_content)
        self._extract_scripts(html_content)
        self._extract_external_resources(html_content)
        self._extract_inline_styles(html_content)
        self._extract_inline_scripts(html_content)

        # Populate document with extracted resources
        self._populate_document(document)

        return {
            'stylesheets': self._stylesheets,
            'scripts': self._scripts,
            'external_resources': self._external_resources,
            'inline_styles': self._inline_styles,
            'inline_scripts': self._inline_scripts
        }

    def _reset(self):
        """Reset internal state for a new extraction."""
        self._stylesheets = []
        self._scripts = []
        self._external_resources = {}
        self._inline_styles = {}
        self._inline_scripts = []

    def _iter_tag_attributes(self, tag: str, html_content: str):
        """
        Yield the parsed attributes of every ``<tag ...>`` opening tag.

        Args:
            tag: Tag name to look for (matched case-insensitively)
            html_content: The HTML content to scan

        Yields:
            One attribute dictionary per matching tag, in document order.
            Tags without any attribute text are not matched.
        """
        pattern = rf'<{tag}\s+([^>]+)>'
        for match in re.finditer(pattern, html_content, re.IGNORECASE):
            yield self._parse_attributes(match.group(1))

    def _add_external_resource(self, prefix: str, resource: Dict[str, Any]):
        """Store *resource* under a key unique within this extraction run."""
        # The running size of the mapping makes the suffix unique.
        self._external_resources[f'{prefix}_{len(self._external_resources)}'] = resource

    def _extract_stylesheets(self, html_content: str):
        """
        Extract CSS stylesheet references from HTML content.

        Besides ``rel="stylesheet"`` links this also records icon and
        preload links as external resources. Links without an ``href``
        are ignored.

        Args:
            html_content: The HTML content to parse
        """
        for attrs in self._iter_tag_attributes('link', html_content):
            # Normalise case and inner whitespace ("Shortcut  Icon").
            rel = ' '.join(attrs.get('rel', '').lower().split())
            href = attrs.get('href', '')
            if not href:
                continue

            if rel == 'stylesheet':
                self._stylesheets.append({
                    'type': 'external',
                    'href': href,
                    'media': attrs.get('media', 'all'),
                    'content_type': attrs.get('type', 'text/css')
                })
            elif rel in ('icon', 'shortcut icon', 'apple-touch-icon'):
                self._add_external_resource('icon', {
                    'type': 'icon',
                    'rel': rel,
                    'href': href,
                    'sizes': attrs.get('sizes', ''),
                    'content_type': attrs.get('type', '')
                })
            elif rel == 'preload':
                self._add_external_resource('preload', {
                    'type': 'preload',
                    'href': href,
                    'as': attrs.get('as', ''),
                    'content_type': attrs.get('type', '')
                })

    def _extract_scripts(self, html_content: str):
        """
        Extract script references from HTML content.

        A script with a ``src`` attribute is recorded as external (any
        inline body is ignored); otherwise a non-empty body is recorded as
        an inline script.

        Args:
            html_content: The HTML content to parse
        """
        # \b keeps tags that merely start with "script" from matching.
        script_pattern = r'<script\b([^>]*)>(.*?)</script\s*>'

        for match in re.finditer(script_pattern, html_content, re.IGNORECASE | re.DOTALL):
            attrs = self._parse_attributes(match.group(1))
            content = match.group(2).strip()

            src = attrs.get('src', '')
            script_type = attrs.get('type', 'text/javascript')

            if src:
                # External script
                self._scripts.append({
                    'type': 'external',
                    'src': src,
                    'content_type': script_type,
                    'async': 'async' in attrs,
                    'defer': 'defer' in attrs,
                    'integrity': attrs.get('integrity', ''),
                    'crossorigin': attrs.get('crossorigin', '')
                })
            elif content:
                # Inline script
                self._scripts.append({
                    'type': 'inline',
                    'content': content,
                    'content_type': script_type
                })

    def _extract_external_resources(self, html_content: str):
        """
        Extract other external resources from HTML content.

        Handles ``img``, ``audio``, ``video``, ``embed`` and ``iframe``
        tags, in that order. Only tags carrying a non-empty ``src``
        attribute are recorded.

        Args:
            html_content: The HTML content to parse
        """
        # Extract images
        for attrs in self._iter_tag_attributes('img', html_content):
            src = attrs.get('src', '')
            if src:
                self._add_external_resource('image', {
                    'type': 'image',
                    'src': src,
                    'alt': attrs.get('alt', ''),
                    'width': attrs.get('width', ''),
                    'height': attrs.get('height', ''),
                    'loading': attrs.get('loading', ''),
                    'srcset': attrs.get('srcset', '')
                })

        # Extract audio
        for attrs in self._iter_tag_attributes('audio', html_content):
            src = attrs.get('src', '')
            if src:
                self._add_external_resource('audio', {
                    'type': 'audio',
                    'src': src,
                    'controls': 'controls' in attrs,
                    'autoplay': 'autoplay' in attrs,
                    'loop': 'loop' in attrs,
                    'muted': 'muted' in attrs
                })

        # Extract video
        for attrs in self._iter_tag_attributes('video', html_content):
            src = attrs.get('src', '')
            if src:
                self._add_external_resource('video', {
                    'type': 'video',
                    'src': src,
                    'controls': 'controls' in attrs,
                    'autoplay': 'autoplay' in attrs,
                    'loop': 'loop' in attrs,
                    'muted': 'muted' in attrs,
                    'width': attrs.get('width', ''),
                    'height': attrs.get('height', ''),
                    'poster': attrs.get('poster', '')
                })

        # Extract embed resources
        for attrs in self._iter_tag_attributes('embed', html_content):
            src = attrs.get('src', '')
            if src:
                self._add_external_resource('embed', {
                    'type': 'embed',
                    'src': src,
                    'content_type': attrs.get('type', ''),
                    'width': attrs.get('width', ''),
                    'height': attrs.get('height', '')
                })

        # Extract iframe sources
        for attrs in self._iter_tag_attributes('iframe', html_content):
            src = attrs.get('src', '')
            if src:
                self._add_external_resource('iframe', {
                    'type': 'iframe',
                    'src': src,
                    'width': attrs.get('width', ''),
                    'height': attrs.get('height', ''),
                    'loading': attrs.get('loading', ''),
                    'sandbox': attrs.get('sandbox', '')
                })

    def _extract_inline_styles(self, html_content: str):
        """
        Extract inline CSS styles from HTML content.

        ``<style>`` blocks are stored under ``style_block_<n>`` and
        ``style="..."`` attributes under ``style_attr_<n>`` (the latter
        marked with ``'type': 'attribute'``).

        Args:
            html_content: The HTML content to parse
        """
        # Extract style blocks; \b keeps tags that merely start with
        # "style" from matching.
        style_pattern = r'<style\b([^>]*)>(.*?)</style\s*>'

        for i, match in enumerate(re.finditer(style_pattern, html_content, re.IGNORECASE | re.DOTALL)):
            attrs = self._parse_attributes(match.group(1))
            content = match.group(2).strip()

            if content:
                self._inline_styles[f'style_block_{i}'] = {
                    'content': content,
                    'media': attrs.get('media', 'all'),
                    'content_type': attrs.get('type', 'text/css')
                }

        # Extract inline style attributes. Going through the attribute
        # parser (rather than a dedicated regex) keeps values that contain
        # the other quote character intact and ignores look-alike
        # attributes such as data-style.
        tag_pattern = r'<[a-zA-Z][^\s/>]*\s+([^>]+)>'
        style_attr_count = 0

        for match in re.finditer(tag_pattern, html_content):
            style_content = self._parse_attributes(match.group(1)).get('style', '')
            if style_content:
                self._inline_styles[f'style_attr_{style_attr_count}'] = {
                    'content': style_content,
                    'type': 'attribute'
                }
                style_attr_count += 1

    def _extract_inline_scripts(self, html_content: str):
        """
        Extract inline JavaScript from HTML content.

        Inline scripts are already collected by ``_extract_scripts`` (as
        entries with ``'type': 'inline'``); this hook is intentionally a
        no-op kept for consistency and potential future extensions.

        Args:
            html_content: The HTML content to parse
        """
        pass

    def _populate_document(self, document: Document):
        """
        Populate the document with extracted resources.

        Args:
            document: The document to populate
        """
        # Add stylesheets
        for stylesheet in self._stylesheets:
            document.add_stylesheet(stylesheet)

        # Add scripts
        for script in self._scripts:
            if script['type'] == 'inline':
                document.add_script(script['content'])
            else:
                # For external scripts, we store them as resources.
                # NOTE(review): this reads the document's private
                # _resources mapping to derive a name; a public accessor
                # would be preferable if Document offers one.
                script_name = f"script_{len(document._resources)}"
                document.add_resource(script_name, script)

        # Add external resources
        for name, resource in self._external_resources.items():
            document.add_resource(name, resource)

        # Add inline style blocks as stylesheets; individual style
        # attributes are deliberately not added.
        for style in self._inline_styles.values():
            if style.get('type') == 'attribute':
                continue
            parsed_style = self._parse_css(style['content'])
            if parsed_style:
                document.add_stylesheet({
                    'type': 'inline',
                    'content': style['content'],
                    'parsed': parsed_style,
                    'media': style.get('media', 'all')
                })

    def _parse_attributes(self, attr_string: str) -> Dict[str, str]:
        """
        Parse HTML attributes from a string.

        The string is tokenised in a single left-to-right pass, so text
        inside an attribute value is never mistaken for an attribute name
        (``src="async.js"`` must not make the script look ``async``).

        Args:
            attr_string: String containing HTML attributes

        Returns:
            Dictionary mapping lower-cased attribute names to their values.
            Attributes without a value (like ``async`` or ``defer``) map to
            an empty string.
        """
        attrs: Dict[str, str] = {}

        # One token is a name optionally followed by =value, where the
        # value is double-quoted, single-quoted or unquoted. Names may
        # contain hyphens and colons (data-*, http-equiv, xml:lang).
        token_pattern = (
            r'([^\s"\'<>/=]+)'
            r'(?:\s*=\s*(?:"([^"]*)"|\'([^\']*)\'|([^\s>]+)))?'
        )

        for match in re.finditer(token_pattern, attr_string):
            name = match.group(1).lower()
            values = [v for v in match.group(2, 3, 4) if v is not None]
            if values:
                # A later valued occurrence overrides an earlier one.
                attrs[name] = values[0]
            else:
                # A bare attribute never clobbers an already-seen value.
                attrs.setdefault(name, '')

        return attrs

    def _parse_css(self, css_str: str) -> Dict[str, Dict[str, str]]:
        """
        Parse a CSS stylesheet.

        This is a flat parser: the text is split on ``}`` and each piece
        on its first ``{``. Nested blocks such as ``@media`` rules are
        therefore not understood.

        Args:
            css_str: CSS stylesheet string

        Returns:
            Dictionary of selectors and their style properties. When a
            selector occurs more than once its declarations are merged,
            later declarations overriding earlier ones.
        """
        stylesheet: Dict[str, Dict[str, str]] = {}

        # Remove comments
        css_str = re.sub(r'/\*.*?\*/', '', css_str, flags=re.DOTALL)

        for rule_set in css_str.split('}'):
            selector, brace, declarations = rule_set.partition('{')
            if not brace:
                continue

            selector = selector.strip()
            style = self._parse_css_declarations(declarations.strip())

            if selector and style:
                # Merge instead of overwrite so repeated selectors keep
                # the declarations of earlier rules.
                stylesheet.setdefault(selector, {}).update(style)

        return stylesheet

    def _parse_css_declarations(self, declarations_str: str) -> Dict[str, str]:
        """
        Parse CSS declarations.

        Args:
            declarations_str: CSS declarations string, e.g.
                ``"color: red; margin: 0"``

        Returns:
            Dictionary of CSS properties (lower-cased) and values.
            Fragments without a ``:`` are skipped.
        """
        declarations: Dict[str, str] = {}

        for declaration in declarations_str.split(';'):
            prop, colon, value = declaration.strip().partition(':')
            if not colon:
                continue
            declarations[prop.strip().lower()] = value.strip()

        return declarations

    def resolve_url(self, url: str, base_url: Optional[str] = None) -> str:
        """
        Resolve a relative URL against a base URL.

        Args:
            url: The URL to resolve
            base_url: The base URL to resolve against

        Returns:
            The resolved URL. URLs that are already absolute
            (``http://``, ``https://``), protocol-relative (``//``) or
            ``data:`` URLs are returned unchanged, as is any URL when no
            base URL is given.
        """
        if base_url and not url.startswith(('http://', 'https://', '//', 'data:')):
            return urllib.parse.urljoin(base_url, url)
        return url

    def get_resource_dependencies(self, resource: Dict[str, Any]) -> List[str]:
        """
        Get the dependencies of a resource (e.g., CSS imports, script dependencies).

        Only resources of type ``external`` that carry a ``content`` entry
        are analyzed; detection is pattern-based and intentionally basic.

        Args:
            resource: The resource to analyze

        Returns:
            List of dependency URLs / module specifiers, in order of
            appearance
        """
        dependencies: List[str] = []

        if resource.get('type') != 'external' or 'content' not in resource:
            return dependencies

        content = resource['content']
        content_type = resource.get('content_type', '')

        # Check for CSS @import rules
        if content_type.startswith('text/css'):
            # Accepts @import "x", @import 'x', @import url(x) and the
            # quoted url() forms, with optional whitespace inside url().
            import_pattern = (
                r'@import\s+(?:url\(\s*)?'
                r'(?:"([^"]+)"|\'([^\']+)\'|([^"\'()\s;]+))'
            )
            for match in re.finditer(import_pattern, content, re.IGNORECASE):
                dependencies.append(next(g for g in match.groups() if g is not None))

        # Check for JavaScript imports/requires (basic detection)
        elif content_type.startswith(('text/javascript', 'application/javascript', 'module')):
            # ES6 imports
            import_pattern = r'import\s+.*?\s+from\s+["\']([^"\']+)["\']'
            for match in re.finditer(import_pattern, content):
                dependencies.append(match.group(1))

            # CommonJS requires
            require_pattern = r'require\(\s*["\']([^"\']+)["\']\s*\)'
            for match in re.finditer(require_pattern, content):
                dependencies.append(match.group(1))

        return dependencies
|
||||||
281
pyWebLayout/io/readers/html_style.py
Normal file
281
pyWebLayout/io/readers/html_style.py
Normal file
@ -0,0 +1,281 @@
|
|||||||
|
"""
|
||||||
|
HTML style management for pyWebLayout.
|
||||||
|
|
||||||
|
This module provides specialized functionality for handling CSS styles,
|
||||||
|
style stacks, and style parsing in HTML documents.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import Dict, List, Any, Optional, Tuple
|
||||||
|
import re
|
||||||
|
from pyWebLayout.style import Font, FontStyle, FontWeight, TextDecoration
|
||||||
|
|
||||||
|
|
||||||
|
class HTMLStyleManager:
    """
    Manages CSS styles and style stacks during HTML parsing.

    This class handles style parsing, style inheritance, and maintains
    the style stack for proper style nesting.
    """

    def __init__(self):
        """Initialize the style manager."""
        self._style_stack: List[Dict[str, Any]] = []
        self._current_style = self._get_default_style()

    def _get_default_style(self) -> Dict[str, Any]:
        """Get the default style settings."""
        return {
            'font_size': 12,
            'font_weight': FontWeight.NORMAL,
            'font_style': FontStyle.NORMAL,
            'decoration': TextDecoration.NONE,
            'color': (0, 0, 0),
            'background': None,
            'language': 'en_US'
        }

    def reset(self):
        """Reset the style manager to initial state."""
        self._style_stack = []
        self._current_style = self._get_default_style()

    def push_style(self, style: Dict[str, Any]):
        """
        Push a new style onto the style stack.

        The current style is saved and then overlaid with *style*, so
        properties not mentioned in *style* are inherited.

        Args:
            style: The style to push
        """
        # Save a copy so later changes cannot alter the saved state
        self._style_stack.append(self._current_style.copy())
        self._current_style.update(style)

    def pop_style(self):
        """Pop a style from the style stack; a no-op when the stack is empty."""
        if self._style_stack:
            self._current_style = self._style_stack.pop()

    def get_current_style(self) -> Dict[str, Any]:
        """Get a copy of the current style."""
        return self._current_style.copy()

    def get_tag_style(self, tag: str) -> Dict[str, Any]:
        """
        Get the default style for a tag.

        Args:
            tag: The tag name (expected in lower case)

        Returns:
            A dictionary of style properties; empty for tags without a
            default style. The dictionary is freshly built on each call,
            so callers may modify it.
        """
        tag_styles = {
            'h1': {'font_size': 24, 'font_weight': FontWeight.BOLD},
            'h2': {'font_size': 20, 'font_weight': FontWeight.BOLD},
            'h3': {'font_size': 18, 'font_weight': FontWeight.BOLD},
            'h4': {'font_size': 16, 'font_weight': FontWeight.BOLD},
            'h5': {'font_size': 14, 'font_weight': FontWeight.BOLD},
            'h6': {'font_size': 12, 'font_weight': FontWeight.BOLD},
            'b': {'font_weight': FontWeight.BOLD},
            'strong': {'font_weight': FontWeight.BOLD},
            'i': {'font_style': FontStyle.ITALIC},
            'em': {'font_style': FontStyle.ITALIC},
            'u': {'decoration': TextDecoration.UNDERLINE},
            'a': {'decoration': TextDecoration.UNDERLINE, 'color': (0, 0, 255)},
            'code': {'font_family': 'monospace', 'background': (240, 240, 240, 255)},
            'pre': {'font_family': 'monospace'},
        }

        return tag_styles.get(tag, {})

    def create_font(self) -> Font:
        """
        Create a Font object from the current style.

        Returns:
            Font: A font object with the current style settings
        """
        # NOTE(review): the keyword is spelled "langauge"; presumably this
        # matches the Font constructor's parameter name - confirm before
        # correcting the spelling.
        return Font(
            font_size=self._current_style['font_size'],
            colour=self._current_style['color'],
            weight=self._current_style['font_weight'],
            style=self._current_style['font_style'],
            decoration=self._current_style['decoration'],
            background=self._current_style['background'],
            langauge=self._current_style['language']
        )

    @staticmethod
    def _parse_font_size(value: str) -> Optional[int]:
        """Return the size of a ``px``/``pt`` length as an int, or None."""
        match = re.fullmatch(r'\+?(\d+(?:\.\d*)?|\.\d+)\s*(?:px|pt)', value)
        if match is None:
            return None
        # Fractional sizes are rounded; px and pt are treated alike.
        return int(round(float(match.group(1))))

    def parse_inline_style(self, style_str: str) -> Dict[str, Any]:
        """
        Parse inline CSS style string.

        Recognised properties are ``font-size`` (``px``/``pt``),
        ``font-weight``, ``font-style``, ``text-decoration``, ``color`` and
        ``background-color``. Property names and keyword values are matched
        case-insensitively and a trailing ``!important`` is ignored.
        Unrecognised properties and values are skipped.

        Args:
            style_str: CSS style string

        Returns:
            Dictionary of style properties
        """
        if not style_str:
            return {}

        style_dict: Dict[str, Any] = {}

        for declaration in style_str.split(';'):
            prop, colon, value = declaration.partition(':')
            if not colon:
                continue

            prop = prop.strip().lower()
            value = re.sub(r'\s*!\s*important\s*$', '', value.strip(), flags=re.IGNORECASE)
            keyword = value.lower()

            # Handle specific properties
            if prop == 'font-size':
                size = self._parse_font_size(keyword)
                if size is not None:
                    style_dict['font_size'] = size
            elif prop == 'font-weight':
                if keyword == 'bold':
                    style_dict['font_weight'] = FontWeight.BOLD
                elif keyword == 'normal':
                    style_dict['font_weight'] = FontWeight.NORMAL
            elif prop == 'font-style':
                if keyword == 'italic':
                    style_dict['font_style'] = FontStyle.ITALIC
                elif keyword == 'normal':
                    style_dict['font_style'] = FontStyle.NORMAL
            elif prop == 'text-decoration':
                if keyword == 'underline':
                    style_dict['decoration'] = TextDecoration.UNDERLINE
                elif keyword == 'line-through':
                    style_dict['decoration'] = TextDecoration.STRIKETHROUGH
                elif keyword == 'none':
                    style_dict['decoration'] = TextDecoration.NONE
            elif prop == 'color':
                color = self.parse_color(value)
                if color:
                    style_dict['color'] = color
            elif prop == 'background-color':
                color = self.parse_color(value)
                if color:
                    # Backgrounds carry an alpha channel; always opaque here
                    style_dict['background'] = color + (255,)

        return style_dict

    def parse_color(self, color_str: str) -> Optional[Tuple[int, int, int]]:
        """
        Parse a CSS color string.

        Supported forms are a fixed set of named colors, hex notation
        (``#RGB``, ``#RGBA``, ``#RRGGBB``, ``#RRGGBBAA``), ``rgb(r, g, b)``
        and ``rgba(r, g, b, a)``. Any alpha component is ignored.

        Args:
            color_str: CSS color string

        Returns:
            RGB tuple or None if parsing fails. ``rgb()`` with a channel
            above 255 is rejected, while ``rgba()`` channels are clamped.
        """
        # Named colors
        color_map = {
            'black': (0, 0, 0),
            'white': (255, 255, 255),
            'red': (255, 0, 0),
            'green': (0, 128, 0),
            'blue': (0, 0, 255),
            'yellow': (255, 255, 0),
            'cyan': (0, 255, 255),
            'magenta': (255, 0, 255),
            'gray': (128, 128, 128),
            'grey': (128, 128, 128),
            'silver': (192, 192, 192),
            'maroon': (128, 0, 0),
            'olive': (128, 128, 0),
            'navy': (0, 0, 128),
            'purple': (128, 0, 128),
            'teal': (0, 128, 128),
            'lime': (0, 255, 0),
            'aqua': (0, 255, 255),
            'fuchsia': (255, 0, 255),
        }

        # Check for named color
        color_str = color_str.lower().strip()
        if color_str in color_map:
            return color_map[color_str]

        # Check for hex color. The digits are validated by the pattern
        # because int(..., 16) would also accept signs and whitespace.
        if color_str.startswith('#'):
            hex_match = re.fullmatch(r'#([0-9a-f]{3,4}|[0-9a-f]{6}|[0-9a-f]{8})', color_str)
            if hex_match:
                digits = hex_match.group(1)
                if len(digits) <= 4:
                    # Short form: each digit is doubled (#abc -> #aabbcc)
                    digits = ''.join(ch * 2 for ch in digits)
                return (
                    int(digits[0:2], 16),
                    int(digits[2:4], 16),
                    int(digits[4:6], 16),
                )

        # Check for rgb() color
        rgb_match = re.match(r'rgb\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\)', color_str)
        if rgb_match:
            channels = tuple(int(part) for part in rgb_match.groups())
            # The pattern cannot produce negatives, so only the upper
            # bound needs checking.
            if max(channels) > 255:
                return None  # Invalid color values
            return channels

        # Check for rgba() color (ignore alpha)
        rgba_match = re.match(r'rgba\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*[\d.]+\s*\)', color_str)
        if rgba_match:
            return tuple(min(255, int(part)) for part in rgba_match.groups())

        # Failed to parse color
        return None

    def apply_style_to_element(self, tag: str, attrs: Dict[str, str]) -> Dict[str, Any]:
        """
        Apply combined styles (tag defaults + inline styles) for an element.

        Args:
            tag: The HTML tag name
            attrs: Dictionary of tag attributes

        Returns:
            Combined style dictionary; inline ``style`` declarations
            override the tag defaults
        """
        # Start with tag-specific styles
        style = self.get_tag_style(tag)

        # Override with inline styles if present
        if 'style' in attrs:
            style.update(self.parse_inline_style(attrs['style']))

        return style
|
||||||
163
pyWebLayout/io/readers/html_text.py
Normal file
163
pyWebLayout/io/readers/html_text.py
Normal file
@ -0,0 +1,163 @@
|
|||||||
|
"""
|
||||||
|
HTML text processing for pyWebLayout.
|
||||||
|
|
||||||
|
This module provides specialized functionality for handling text content,
|
||||||
|
entity references, and word creation in HTML documents.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import Optional
|
||||||
|
from pyWebLayout.abstract.inline import Word
|
||||||
|
from pyWebLayout.abstract.block import Parapgraph
|
||||||
|
from pyWebLayout.io.readers.html_style import HTMLStyleManager
|
||||||
|
|
||||||
|
|
||||||
|
class HTMLTextProcessor:
|
||||||
|
"""
|
||||||
|
Processes text content during HTML parsing.
|
||||||
|
|
||||||
|
This class handles text buffering, entity resolution, and word creation
|
||||||
|
with proper styling applied.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, style_manager: HTMLStyleManager):
|
||||||
|
"""
|
||||||
|
Initialize the text processor.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
style_manager: The style manager for creating styled words
|
||||||
|
"""
|
||||||
|
self._style_manager = style_manager
|
||||||
|
self._text_buffer = ""
|
||||||
|
self._current_paragraph: Optional[Parapgraph] = None
|
||||||
|
|
||||||
|
def reset(self):
|
||||||
|
"""Reset the text processor state."""
|
||||||
|
self._text_buffer = ""
|
||||||
|
self._current_paragraph = None
|
||||||
|
|
||||||
|
def set_current_paragraph(self, paragraph: Optional[Parapgraph]):
|
||||||
|
"""
|
||||||
|
Set the current paragraph for text output.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
paragraph: The paragraph to receive text, or None
|
||||||
|
"""
|
||||||
|
self._current_paragraph = paragraph
|
||||||
|
|
||||||
|
def add_text(self, text: str):
|
||||||
|
"""
|
||||||
|
Add text to the buffer.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
text: The text to add
|
||||||
|
"""
|
||||||
|
self._text_buffer += text
|
||||||
|
|
||||||
|
def add_entity_reference(self, name: str):
|
||||||
|
"""
|
||||||
|
Add an HTML entity reference to the buffer.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
name: The entity name (e.g., 'lt', 'gt', 'amp')
|
||||||
|
"""
|
||||||
|
# Map common entity references to characters
|
||||||
|
entities = {
|
||||||
|
'lt': '<',
|
||||||
|
'gt': '>',
|
||||||
|
'amp': '&',
|
||||||
|
'quot': '"',
|
||||||
|
'apos': "'",
|
||||||
|
'nbsp': ' ',
|
||||||
|
'copy': '©',
|
||||||
|
'reg': '®',
|
||||||
|
'trade': '™',
|
||||||
|
'mdash': '—',
|
||||||
|
'ndash': '–',
|
||||||
|
'hellip': '…',
|
||||||
|
'laquo': '«',
|
||||||
|
'raquo': '»',
|
||||||
|
'ldquo': '"',
|
||||||
|
'rdquo': '"',
|
||||||
|
'lsquo': ''',
|
||||||
|
'rsquo': ''',
|
||||||
|
'deg': '°',
|
||||||
|
'plusmn': '±',
|
||||||
|
'times': '×',
|
||||||
|
'divide': '÷',
|
||||||
|
'euro': '€',
|
||||||
|
'pound': '£',
|
||||||
|
'yen': '¥',
|
||||||
|
}
|
||||||
|
|
||||||
|
char = entities.get(name, f'&{name};')
|
||||||
|
self._text_buffer += char
|
||||||
|
|
||||||
|
def add_character_reference(self, name: str):
    """
    Append the character for a numeric character reference to the buffer.

    Args:
        name: The reference without the surrounding ``&#`` and ``;``.
            Either decimal digits (``'65'``) or hexadecimal digits
            prefixed with ``x`` or ``X`` (``'x41'``, ``'X41'``).

    A reference that cannot be converted to a character (not a number,
    or outside the range accepted by ``chr``) is written back verbatim
    as ``&#name;``.
    """
    try:
        # The hex marker may be written in either case.
        if name[:1] in ('x', 'X'):
            codepoint = int(name[1:], 16)
        else:
            codepoint = int(name)
        char = chr(codepoint)
    except (ValueError, OverflowError):
        # Invalid character reference: keep it as written
        char = f'&#{name};'

    self._text_buffer += char
|
||||||
|
|
||||||
|
def flush_text(self) -> bool:
    """
    Flush the text buffer into the current paragraph as words.

    The buffer is split on whitespace and one Word is added to the
    current paragraph per token, each with a font obtained from the
    style manager. The buffer is always empty after a normal return.

    Returns:
        True if at least one word was added. False if the buffer held
        no non-whitespace text, or if no paragraph is set (in which case
        the buffered text is discarded).
    """
    paragraph = self._current_paragraph

    # Compare against None explicitly: a paragraph object that happens to
    # be falsy (for instance one that defines __len__ and has no words
    # yet) must still receive text.
    # str.split() with no arguments already ignores leading/trailing
    # whitespace and never yields empty tokens.
    words = self._text_buffer.split() if paragraph is not None else []

    if not words:
        self._text_buffer = ""
        return False

    for word_text in words:
        # A font is requested per word, as before, so the style manager
        # sees exactly one create_font() call for each word.
        font = self._style_manager.create_font()
        paragraph.add_word(Word(word_text, font))

    # Reset only after every word was added successfully.
    self._text_buffer = ""
    return True
|
||||||
|
|
||||||
|
def has_pending_text(self) -> bool:
    """
    Report whether the buffer holds anything other than whitespace.

    Returns:
        True if there is non-whitespace text waiting to be flushed
    """
    pending = self._text_buffer.strip()
    return len(pending) > 0
|
||||||
|
|
||||||
|
def get_buffer_content(self) -> str:
    """
    Peek at the buffered text; the buffer itself is left untouched.

    Returns:
        The text currently held in the buffer, including any
        surrounding whitespace
    """
    return self._text_buffer
|
||||||
|
|
||||||
|
def clear_buffer(self):
    """Discard the buffered text; no words are created from it."""
    self._text_buffer = ""
|
||||||
11
pyWebLayout/layout.py
Normal file
11
pyWebLayout/layout.py
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
from enum import Enum
|
||||||
|
|
||||||
|
class Alignment(Enum):
    """
    Alignment options for laying out elements.

    NOTE(review): pyWebLayout/style/alignment.py declares an enum with
    the same name and members. The two are distinct classes, so a member
    of one never compares equal to the same-named member of the other.
    """

    LEFT = 1
    CENTER = 2
    RIGHT = 3
    TOP = 4
    BOTTOM = 5
    JUSTIFY = 6
|
||||||
|
|
||||||
|
|
||||||
1
pyWebLayout/localisation.py
Normal file
1
pyWebLayout/localisation.py
Normal file
@ -0,0 +1 @@
|
|||||||
|
## list languages
|
||||||
176
pyWebLayout/style.py
Normal file
176
pyWebLayout/style.py
Normal file
@ -0,0 +1,176 @@
|
|||||||
|
# This module contains classes describing how different objects can be rendered, e.g. bold, italic, regular
|
||||||
|
from PIL import ImageFont
|
||||||
|
from enum import Enum
|
||||||
|
from typing import Tuple, Union, Optional
|
||||||
|
|
||||||
|
|
||||||
|
class FontWeight(Enum):
    """Weight of a font: normal or bold."""

    NORMAL = "normal"
    BOLD = "bold"
|
||||||
|
|
||||||
|
|
||||||
|
class FontStyle(Enum):
    """Slant of a font: normal (upright) or italic."""

    NORMAL = "normal"
    ITALIC = "italic"
|
||||||
|
|
||||||
|
|
||||||
|
class TextDecoration(Enum):
    """Line decoration applied to text: none, underline or strikethrough."""

    NONE = "none"
    UNDERLINE = "underline"
    STRIKETHROUGH = "strikethrough"
|
||||||
|
|
||||||
|
|
||||||
|
class Font:
    """
    Font class to manage text rendering properties including font face, size, color, and styling.
    This class is used by the text renderer to determine how to render text.

    The ``with_*`` methods never modify the instance; each returns a new
    Font that differs in one property and keeps all others, including
    the language.
    """

    def __init__(self,
                 font_path: Optional[str] = None,
                 font_size: int = 12,
                 colour: Tuple[int, int, int] = (0, 0, 0),
                 weight: FontWeight = FontWeight.NORMAL,
                 style: FontStyle = FontStyle.NORMAL,
                 decoration: TextDecoration = TextDecoration.NONE,
                 background: Optional[Tuple[int, int, int, int]] = None,
                 langauge = "en_EN"):
        """
        Initialize a Font object with the specified properties.

        Args:
            font_path: Path to the font file (.ttf, .otf). If None, uses default font.
            font_size: Size of the font in points.
            colour: RGB color tuple for the text.
            weight: Font weight (normal or bold).
            style: Font style (normal or italic).
            decoration: Text decoration (none, underline, or strikethrough).
            background: RGBA background color for the text. If None, transparent background.
            langauge: Language tag stored on the ``language`` attribute.
                The parameter name is misspelled; it is kept as is
                because callers may pass it by keyword.
        """
        self._font_path = font_path
        self._font_size = font_size
        self._colour = colour
        self._weight = weight
        self._style = style
        self._decoration = decoration
        self._background = background if background else (255, 255, 255, 0)
        self.language = langauge
        # Load the font file or use default
        self._load_font()

    def _load_font(self):
        """
        Load the PIL font object into ``self._font``.

        Loading is best effort: if the requested font cannot be loaded
        the error is printed and PIL's default font is used instead, so
        constructing a Font does not fail because of a bad font path.
        """
        try:
            if self._font_path:
                self._font = ImageFont.truetype(
                    self._font_path,
                    self._font_size
                )
            else:
                self._font = self._load_default_font()
        except Exception as e:
            print(f"Error loading font: {e}")
            self._font = ImageFont.load_default()

    def _load_default_font(self):
        """Return PIL's default font, at the requested size when supported."""
        try:
            # Pillow 10.1+ accepts a size for the built-in default font.
            return ImageFont.load_default(size=self._font_size)
        except TypeError:
            # Older Pillow: load_default() takes no size argument and
            # returns a fixed-size font.
            return ImageFont.load_default()

    def _derive(self, **overrides):
        """Return a new Font with this font's properties, updated by overrides."""
        params = {
            'font_path': self._font_path,
            'font_size': self._font_size,
            'colour': self._colour,
            'weight': self._weight,
            'style': self._style,
            'decoration': self._decoration,
            'background': self._background,
            # Carry the language over; otherwise every copy would fall
            # back to the constructor default.
            'langauge': self.language,
        }
        params.update(overrides)
        return Font(**params)

    @property
    def font(self):
        """Get the PIL ImageFont object"""
        return self._font

    @property
    def font_size(self):
        """Get the font size"""
        return self._font_size

    @property
    def colour(self):
        """Get the text color"""
        return self._colour

    @property
    def color(self):
        """Alias for colour (American spelling)"""
        return self._colour

    @property
    def background(self):
        """Get the background color"""
        return self._background

    @property
    def weight(self):
        """Get the font weight"""
        return self._weight

    @property
    def style(self):
        """Get the font style"""
        return self._style

    @property
    def decoration(self):
        """Get the text decoration"""
        return self._decoration

    def with_size(self, size: int):
        """Create a new Font object with modified size"""
        return self._derive(font_size=size)

    def with_colour(self, colour: Tuple[int, int, int]):
        """Create a new Font object with modified colour"""
        return self._derive(colour=colour)

    def with_weight(self, weight: FontWeight):
        """Create a new Font object with modified weight"""
        return self._derive(weight=weight)

    def with_style(self, style: FontStyle):
        """Create a new Font object with modified style"""
        return self._derive(style=style)

    def with_decoration(self, decoration: TextDecoration):
        """Create a new Font object with modified decoration"""
        return self._derive(decoration=decoration)
|
||||||
17
pyWebLayout/style/__init__.py
Normal file
17
pyWebLayout/style/__init__.py
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
"""
|
||||||
|
Styling module for the pyWebLayout library.
|
||||||
|
|
||||||
|
This package contains styling-related components including:
|
||||||
|
- Font handling and text styling
|
||||||
|
- Color management
|
||||||
|
- Text decoration and formatting
|
||||||
|
- Alignment and positioning properties
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Import alignment options
|
||||||
|
from pyWebLayout.style.alignment import Alignment
|
||||||
|
|
||||||
|
# Import font-related classes
|
||||||
|
from pyWebLayout.style.fonts import (
|
||||||
|
Font, FontWeight, FontStyle, TextDecoration
|
||||||
|
)
|
||||||
16
pyWebLayout/style/alignment.py
Normal file
16
pyWebLayout/style/alignment.py
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
"""
|
||||||
|
Alignment options for text and elements in the pyWebLayout library.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from enum import Enum
|
||||||
|
|
||||||
|
class Alignment(Enum):
    """
    Alignment options used in layout and rendering.

    NOTE(review): pyWebLayout/layout.py declares an enum with the same
    name and members. The two are distinct classes, so a member of one
    never compares equal to the same-named member of the other.
    """

    LEFT = 1
    CENTER = 2
    RIGHT = 3
    TOP = 4
    BOTTOM = 5
    JUSTIFY = 6
|
||||||
176
pyWebLayout/style/fonts.py
Normal file
176
pyWebLayout/style/fonts.py
Normal file
@ -0,0 +1,176 @@
|
|||||||
|
# This module contains classes describing how different objects can be rendered, e.g. bold, italic, regular
|
||||||
|
from PIL import ImageFont
|
||||||
|
from enum import Enum
|
||||||
|
from typing import Tuple, Union, Optional
|
||||||
|
|
||||||
|
|
||||||
|
class FontWeight(Enum):
    """Weight of a font: normal or bold."""

    NORMAL = "normal"
    BOLD = "bold"
|
||||||
|
|
||||||
|
|
||||||
|
class FontStyle(Enum):
    """Slant of a font: normal (upright) or italic."""

    NORMAL = "normal"
    ITALIC = "italic"
|
||||||
|
|
||||||
|
|
||||||
|
class TextDecoration(Enum):
    """Line decoration applied to text: none, underline or strikethrough."""

    NONE = "none"
    UNDERLINE = "underline"
    STRIKETHROUGH = "strikethrough"
|
||||||
|
|
||||||
|
|
||||||
|
class Font:
    """
    Font class to manage text rendering properties including font face, size, color, and styling.
    This class is used by the text renderer to determine how to render text.

    The ``with_*`` methods never modify the instance; each returns a new
    Font that differs in one property and keeps all others, including
    the language.
    """

    def __init__(self,
                 font_path: Optional[str] = None,
                 font_size: int = 12,
                 colour: Tuple[int, int, int] = (0, 0, 0),
                 weight: FontWeight = FontWeight.NORMAL,
                 style: FontStyle = FontStyle.NORMAL,
                 decoration: TextDecoration = TextDecoration.NONE,
                 background: Optional[Tuple[int, int, int, int]] = None,
                 langauge = "en_EN"):
        """
        Initialize a Font object with the specified properties.

        Args:
            font_path: Path to the font file (.ttf, .otf). If None, uses default font.
            font_size: Size of the font in points.
            colour: RGB color tuple for the text.
            weight: Font weight (normal or bold).
            style: Font style (normal or italic).
            decoration: Text decoration (none, underline, or strikethrough).
            background: RGBA background color for the text. If None, transparent background.
            langauge: Language tag stored on the ``language`` attribute.
                The parameter name is misspelled; it is kept as is
                because callers may pass it by keyword.
        """
        self._font_path = font_path
        self._font_size = font_size
        self._colour = colour
        self._weight = weight
        self._style = style
        self._decoration = decoration
        self._background = background if background else (255, 255, 255, 0)
        self.language = langauge
        # Load the font file or use default
        self._load_font()

    def _load_font(self):
        """
        Load the PIL font object into ``self._font``.

        Loading is best effort: if the requested font cannot be loaded
        the error is printed and PIL's default font is used instead, so
        constructing a Font does not fail because of a bad font path.
        """
        try:
            if self._font_path:
                self._font = ImageFont.truetype(
                    self._font_path,
                    self._font_size
                )
            else:
                self._font = self._load_default_font()
        except Exception as e:
            print(f"Error loading font: {e}")
            self._font = ImageFont.load_default()

    def _load_default_font(self):
        """Return PIL's default font, at the requested size when supported."""
        try:
            # Pillow 10.1+ accepts a size for the built-in default font.
            return ImageFont.load_default(size=self._font_size)
        except TypeError:
            # Older Pillow: load_default() takes no size argument and
            # returns a fixed-size font.
            return ImageFont.load_default()

    def _derive(self, **overrides):
        """Return a new Font with this font's properties, updated by overrides."""
        params = {
            'font_path': self._font_path,
            'font_size': self._font_size,
            'colour': self._colour,
            'weight': self._weight,
            'style': self._style,
            'decoration': self._decoration,
            'background': self._background,
            # Carry the language over; otherwise every copy would fall
            # back to the constructor default.
            'langauge': self.language,
        }
        params.update(overrides)
        return Font(**params)

    @property
    def font(self):
        """Get the PIL ImageFont object"""
        return self._font

    @property
    def font_size(self):
        """Get the font size"""
        return self._font_size

    @property
    def colour(self):
        """Get the text color"""
        return self._colour

    @property
    def color(self):
        """Alias for colour (American spelling)"""
        return self._colour

    @property
    def background(self):
        """Get the background color"""
        return self._background

    @property
    def weight(self):
        """Get the font weight"""
        return self._weight

    @property
    def style(self):
        """Get the font style"""
        return self._style

    @property
    def decoration(self):
        """Get the text decoration"""
        return self._decoration

    def with_size(self, size: int):
        """Create a new Font object with modified size"""
        return self._derive(font_size=size)

    def with_colour(self, colour: Tuple[int, int, int]):
        """Create a new Font object with modified colour"""
        return self._derive(colour=colour)

    def with_weight(self, weight: FontWeight):
        """Create a new Font object with modified weight"""
        return self._derive(weight=weight)

    def with_style(self, style: FontStyle):
        """Create a new Font object with modified style"""
        return self._derive(style=style)

    def with_decoration(self, decoration: TextDecoration):
        """Create a new Font object with modified decoration"""
        return self._derive(decoration=decoration)
|
||||||
137
pyWebLayout/table.py
Normal file
137
pyWebLayout/table.py
Normal file
@ -0,0 +1,137 @@
|
|||||||
|
from pyWebLayout.base import Renderable
|
||||||
|
from .concrete.box import Box
|
||||||
|
from pyWebLayout.layout import Alignment
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from PIL import Image, ImageDraw
|
||||||
|
from typing import List, Tuple, Optional
|
||||||
|
|
||||||
|
|
||||||
|
class TableCell(Box):
    """A Box that represents one cell of a Table and draws its own border."""

    def __init__(self, origin, size, content: Optional[Renderable] = None,
                 callback=None, sheet=None, mode=None,
                 halign=Alignment.CENTER, valign=Alignment.CENTER,
                 padding: Tuple[int, int, int, int] = (5, 5, 5, 5)):
        """
        Initialize a table cell.

        Args:
            origin: Top-left corner coordinates
            size: Width and height of the cell
            content: Optional renderable content to place in the cell
            callback: Optional callback function
            sheet: Optional image sheet
            mode: Optional image mode
            halign: Horizontal alignment
            valign: Vertical alignment
            padding: Padding as (top, right, bottom, left)
        """
        super().__init__(origin, size, callback, sheet, mode, halign, valign)
        self._content = content
        self._padding = padding  # (top, right, bottom, left)

    def set_content(self, content: Renderable):
        """Set the content of this cell"""
        self._content = content

    def render(self) -> Image.Image:
        """
        Render the cell's base canvas with a 1-pixel black border.

        NOTE(review): the stored content and padding are not used here;
        only the canvas produced by the base class and the border are
        drawn. Confirm whether content is meant to be drawn elsewhere.

        Returns:
            The canvas returned by the base class render(), with the
            border drawn onto it
        """
        # Create the base canvas
        canvas = super().render()
        draw = ImageDraw.Draw(canvas)

        # The border runs along the outermost pixels, hence size - 1.
        bottom_right = tuple(self._size - np.array([1, 1]))
        draw.rectangle([(0, 0), bottom_right],
                       outline=(0, 0, 0), width=1)

        return canvas
|
||||||
|
|
||||||
|
|
||||||
|
class Table(Box):
    """A Box divided into a fixed grid of equally sized TableCell objects."""

    def __init__(self, rows: int, columns: int, origin, size,
                 cell_padding: Tuple[int, int, int, int] = (5, 5, 5, 5),
                 callback=None, sheet=None, mode=None,
                 halign=Alignment.CENTER, valign=Alignment.CENTER):
        """
        Initialize a table with specified number of rows and columns.

        Every cell gets the size ``(size[0] // columns, size[1] // rows)``;
        pixels left over by the integer division are not assigned to any
        cell.

        Args:
            rows: Number of rows in the table (at least 1)
            columns: Number of columns in the table (at least 1)
            origin: Top-left corner coordinates
            size: Width and height of the table
            cell_padding: Padding for each cell as (top, right, bottom, left)
            callback: Optional callback function
            sheet: Optional image sheet
            mode: Optional image mode
            halign: Horizontal alignment
            valign: Vertical alignment

        Raises:
            ValueError: If rows or columns is smaller than 1
        """
        # Reject empty grids up front instead of failing later with a
        # division by zero while computing the cell size.
        if rows < 1 or columns < 1:
            raise ValueError(
                f"Table needs at least one row and one column, got {rows} rows and {columns} columns"
            )

        super().__init__(origin, size, callback, sheet, mode, halign, valign)

        self._rows = rows
        self._columns = columns
        self._cell_padding = cell_padding

        # Calculate cell dimensions
        cell_width = size[0] // columns
        cell_height = size[1] // rows

        # Create a 2D array of cells, indexed as self._cells[row][column]
        self._cells: List[List[TableCell]] = []

        for row in range(rows):
            cell_row = []
            for col in range(columns):
                # Cell origins are relative to the table's own canvas.
                cell = TableCell(
                    origin=np.array([col * cell_width, row * cell_height]),
                    size=np.array([cell_width, cell_height]),
                    sheet=sheet,
                    mode=mode,
                    halign=halign,
                    valign=valign,
                    padding=cell_padding
                )
                cell_row.append(cell)

            self._cells.append(cell_row)

    def add_to_cell(self, x: int, y: int, content: Renderable):
        """
        Add content to a specific cell in the table.

        Args:
            x: Column index (0-based)
            y: Row index (0-based)
            content: Renderable content to add to the cell

        Raises:
            IndexError: If x or y lies outside the table
        """
        if 0 <= y < self._rows and 0 <= x < self._columns:
            self._cells[y][x].set_content(content)
        else:
            raise IndexError(f"Cell indices ({x}, {y}) out of range. Table is {self._columns}x{self._rows}")

    def render(self) -> Image.Image:
        """
        Render the complete table with all cells.

        Returns:
            The canvas returned by the base class render(), with every
            cell image pasted at its grid position
        """
        # Create base canvas
        canvas = super().render()

        # The cell size is the same for the whole grid; compute it once.
        cell_width = self._size[0] // self._columns
        cell_height = self._size[1] // self._rows

        # Render each cell and paste it onto the canvas
        for row in range(self._rows):
            for col in range(self._columns):
                cell_img = self._cells[row][col].render()

                # Plain ints keep the paste position independent of the
                # numeric type stored in self._size.
                cell_pos = (int(col * cell_width), int(row * cell_height))

                # NOTE(review): the cell image is also used as the paste
                # mask, which requires an image mode with transparency;
                # confirm that the base class render() produces one.
                canvas.paste(cell_img, cell_pos, cell_img)

        return canvas
|
||||||
15
pyWebLayout/typesetting/__init__.py
Normal file
15
pyWebLayout/typesetting/__init__.py
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
"""
|
||||||
|
Typesetting module for the pyWebLayout library.
|
||||||
|
|
||||||
|
This package handles the organization and arrangement of elements for rendering, including:
|
||||||
|
- Flow layout algorithms
|
||||||
|
- Container management
|
||||||
|
- Element positioning and sizing
|
||||||
|
- Content wrapping and overflow
|
||||||
|
- Coordinate systems and transformations
|
||||||
|
- Pagination for book-like content
|
||||||
|
"""
|
||||||
|
|
||||||
|
from pyWebLayout.typesetting.flow import FlowLayout
|
||||||
|
from pyWebLayout.typesetting.pagination import Paginator, PaginationState
|
||||||
|
from pyWebLayout.typesetting.document_pagination import DocumentPaginator, DocumentPaginationState
|
||||||
323
pyWebLayout/typesetting/document_pagination.py
Normal file
323
pyWebLayout/typesetting/document_pagination.py
Normal file
@ -0,0 +1,323 @@
|
|||||||
|
"""
|
||||||
|
Document-aware pagination system for pyWebLayout.
|
||||||
|
|
||||||
|
This module provides functionality for paginating Document and Book objects
|
||||||
|
across multiple pages, with the ability to stop, save state, and resume pagination.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import List, Tuple, Dict, Any, Optional, Iterator, Generator
|
||||||
|
import copy
|
||||||
|
import json
|
||||||
|
|
||||||
|
from pyWebLayout.core import Layoutable, Renderable
|
||||||
|
from pyWebLayout.style import Alignment
|
||||||
|
from pyWebLayout.abstract.document import Document, Book, Chapter
|
||||||
|
from pyWebLayout.abstract.block import Block
|
||||||
|
from pyWebLayout.typesetting.pagination import PaginationState, Paginator
|
||||||
|
from pyWebLayout.concrete.page import Page
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentPaginationState(PaginationState):
    """
    Pagination state extended with document-structure information.

    On top of what PaginationState stores, this keeps the current
    chapter, the current section and the set of blocks that were already
    placed on a page.

    NOTE(review): DocumentPaginator records blocks here by ``id(block)``.
    Such ids are only meaningful inside the running process, so the
    'rendered_blocks' entry of a saved state presumably cannot be matched
    against blocks after a restart - confirm before relying on it.
    """

    def __init__(self):
        """Create a state positioned at the first chapter and section."""
        super().__init__()
        self.current_chapter = 0
        self.current_section = 0
        # Track which blocks have been rendered
        self.rendered_blocks = set()

    def save(self) -> Dict[str, Any]:
        """
        Export the state as a dictionary.

        Returns:
            The dictionary produced by the base class, extended with the
            keys 'current_chapter', 'current_section' and
            'rendered_blocks'
        """
        exported = super().save()
        exported.update({
            'current_chapter': self.current_chapter,
            'current_section': self.current_section,
            # A set is not JSON serializable, so it is stored as a list
            'rendered_blocks': list(self.rendered_blocks),
        })
        return exported

    @classmethod
    def load(cls, state_dict: Dict[str, Any]) -> 'DocumentPaginationState':
        """
        Rebuild a state from a dictionary created by save().

        Keys that are missing fall back to chapter 0, section 0 and an
        empty set of rendered blocks.

        Args:
            state_dict: Dictionary containing pagination state

        Returns:
            The state object created by the base class load(), with the
            document-specific attributes filled in
        """
        restored = super().load(state_dict)
        restored.current_chapter = state_dict.get('current_chapter', 0)
        restored.current_section = state_dict.get('current_section', 0)
        restored.rendered_blocks = set(state_dict.get('rendered_blocks', []))
        return restored

    def to_json(self) -> str:
        """
        Serialize the state to a JSON string for persistence.

        Returns:
            JSON text of the dictionary returned by save()
        """
        exported = self.save()
        return json.dumps(exported)

    @classmethod
    def from_json(cls, json_str: str) -> 'DocumentPaginationState':
        """
        Rebuild a state from a JSON string created by to_json().

        Args:
            json_str: JSON string representation of state

        Returns:
            A DocumentPaginationState object
        """
        state_dict = json.loads(json_str)
        return cls.load(state_dict)
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentPaginator:
|
||||||
|
"""
|
||||||
|
Paginator for Document and Book objects.
|
||||||
|
|
||||||
|
This class paginates Document or Book objects into a series of pages,
|
||||||
|
respecting the document structure and allowing for state tracking.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
document: Document,
|
||||||
|
page_size: Tuple[int, int],
|
||||||
|
margins: Tuple[int, int, int, int] = (20, 20, 20, 20), # top, right, bottom, left
|
||||||
|
spacing: int = 5,
|
||||||
|
halign: Alignment = Alignment.LEFT,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Initialize a document paginator.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
document: The document to paginate
|
||||||
|
page_size: Size of each page (width, height)
|
||||||
|
margins: Margins for each page (top, right, bottom, left)
|
||||||
|
spacing: Spacing between elements
|
||||||
|
halign: Horizontal alignment of elements
|
||||||
|
"""
|
||||||
|
self.document = document
|
||||||
|
self.page_size = page_size
|
||||||
|
self.margins = margins
|
||||||
|
self.spacing = spacing
|
||||||
|
self.halign = halign
|
||||||
|
self.state = DocumentPaginationState()
|
||||||
|
|
||||||
|
# Preprocess document to get all blocks
|
||||||
|
self._blocks = self._collect_blocks()
|
||||||
|
|
||||||
|
def _collect_blocks(self) -> List[Block]:
|
||||||
|
"""
|
||||||
|
Collect all blocks from the document in a flat list.
|
||||||
|
|
||||||
|
For Books, this includes blocks from all chapters.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of blocks from the document
|
||||||
|
"""
|
||||||
|
all_blocks = []
|
||||||
|
|
||||||
|
if isinstance(self.document, Book):
|
||||||
|
# For books, process chapters
|
||||||
|
for chapter in self.document.chapters:
|
||||||
|
# Add a heading block for the chapter if it has a title
|
||||||
|
if chapter.title:
|
||||||
|
from pyWebLayout.abstract.block import Heading, HeadingLevel, Parapgraph
|
||||||
|
from pyWebLayout.abstract.inline import Word
|
||||||
|
|
||||||
|
# Create a heading for the chapter
|
||||||
|
heading = Heading(level=HeadingLevel.H1)
|
||||||
|
heading_word = Word(chapter.title)
|
||||||
|
heading.add_word(heading_word)
|
||||||
|
all_blocks.append(heading)
|
||||||
|
|
||||||
|
# Add all blocks from the chapter
|
||||||
|
all_blocks.extend(chapter.blocks)
|
||||||
|
else:
|
||||||
|
# For regular documents, just add all blocks
|
||||||
|
all_blocks.extend(self.document.blocks)
|
||||||
|
|
||||||
|
return all_blocks
|
||||||
|
|
||||||
|
def paginate(self, max_pages: Optional[int] = None) -> List[Page]:
|
||||||
|
"""
|
||||||
|
Paginate the document into pages.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
max_pages: Maximum number of pages to generate (None for all)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of Page objects
|
||||||
|
"""
|
||||||
|
pages = []
|
||||||
|
|
||||||
|
# Reset state
|
||||||
|
self.state = DocumentPaginationState()
|
||||||
|
|
||||||
|
# Create a generator for pagination
|
||||||
|
page_generator = self._paginate_generator()
|
||||||
|
|
||||||
|
# Generate pages up to max_pages or until all content is paginated
|
||||||
|
page_count = 0
|
||||||
|
for page in page_generator:
|
||||||
|
pages.append(page)
|
||||||
|
page_count += 1
|
||||||
|
if max_pages is not None and page_count >= max_pages:
|
||||||
|
break
|
||||||
|
|
||||||
|
return pages
|
||||||
|
|
||||||
|
def paginate_next(self) -> Optional[Page]:
|
||||||
|
"""
|
||||||
|
Paginate and return the next page only.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
The next Page object, or None if no more content
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
return next(self._paginate_generator())
|
||||||
|
except StopIteration:
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _paginate_generator(self) -> Generator[Page, None, None]:
|
||||||
|
"""
|
||||||
|
Generator that yields one page at a time.
|
||||||
|
|
||||||
|
Yields:
|
||||||
|
A Page object for each page in the document
|
||||||
|
"""
|
||||||
|
# Get blocks starting from the current position
|
||||||
|
current_index = self.state.current_element_index
|
||||||
|
remaining_blocks = self._blocks[current_index:]
|
||||||
|
|
||||||
|
# Keep track of which chapter we're in
|
||||||
|
current_chapter = self.state.current_chapter
|
||||||
|
|
||||||
|
# Process blocks until we run out
|
||||||
|
while current_index < len(self._blocks):
|
||||||
|
# Create a new page
|
||||||
|
page = Page(size=self.page_size)
|
||||||
|
|
||||||
|
# Fill the page with blocks
|
||||||
|
page_blocks = []
|
||||||
|
|
||||||
|
# Track how much space we've used on the page
|
||||||
|
used_height = self.margins[0] # Start at top margin
|
||||||
|
avail_height = self.page_size[1] - self.margins[0] - self.margins[2]
|
||||||
|
|
||||||
|
# Add blocks until we fill the page or run out
|
||||||
|
while current_index < len(self._blocks):
|
||||||
|
block = self._blocks[current_index]
|
||||||
|
|
||||||
|
# Make sure the block is properly laid out
|
||||||
|
if hasattr(block, 'layout'):
|
||||||
|
block.layout()
|
||||||
|
|
||||||
|
# Get the rendered height of the block
|
||||||
|
block_height = getattr(block, 'size', (0, 0))[1]
|
||||||
|
|
||||||
|
# Check if the block fits on this page
|
||||||
|
if used_height + block_height > avail_height:
|
||||||
|
# Block doesn't fit, move to next page
|
||||||
|
break
|
||||||
|
|
||||||
|
# Add the block to the page
|
||||||
|
page_blocks.append(block)
|
||||||
|
page.add_child(block)
|
||||||
|
|
||||||
|
# Update position
|
||||||
|
used_height += block_height + self.spacing
|
||||||
|
|
||||||
|
# Track that we've rendered this block
|
||||||
|
self.state.rendered_blocks.add(id(block))
|
||||||
|
|
||||||
|
# Move to the next block
|
||||||
|
current_index += 1
|
||||||
|
|
||||||
|
# Check if we're moving to a new chapter (for Book objects)
|
||||||
|
if isinstance(self.document, Book) and current_index < len(self._blocks):
|
||||||
|
# Check if the next block is a heading that starts a new chapter
|
||||||
|
# This is a simplified check - in a real implementation you'd need
|
||||||
|
# a more robust way to identify chapter boundaries
|
||||||
|
from pyWebLayout.abstract.block import Heading
|
||||||
|
if isinstance(self._blocks[current_index], Heading):
|
||||||
|
# We're at a chapter boundary, might want to start a new page
|
||||||
|
# This is optional and depends on your layout preferences
|
||||||
|
current_chapter += 1
|
||||||
|
break
|
||||||
|
|
||||||
|
# Update state
|
||||||
|
self.state.current_page += 1
|
||||||
|
self.state.current_element_index = current_index
|
||||||
|
self.state.current_chapter = current_chapter
|
||||||
|
|
||||||
|
# Layout the page
|
||||||
|
page.layout()
|
||||||
|
|
||||||
|
# If we couldn't fit any blocks on this page but have more, skip the block
|
||||||
|
if not page_blocks and current_index < len(self._blocks):
|
||||||
|
print(f"Warning: Block at index {current_index} is too large to fit on a page")
|
||||||
|
current_index += 1
|
||||||
|
self.state.current_element_index = current_index
|
||||||
|
|
||||||
|
# Yield the page
|
||||||
|
if page_blocks:
|
||||||
|
yield page
|
||||||
|
else:
|
||||||
|
# No more blocks to paginate
|
||||||
|
break
|
||||||
|
|
||||||
|
def get_state(self) -> Dict[str, Any]:
    """
    Export the paginator's current position as a dictionary.

    Returns:
        The dictionary produced by ``self.state.save()``
    """
    snapshot = self.state.save()
    return snapshot
|
||||||
|
|
||||||
|
def set_state(self, state: Dict[str, Any]) -> None:
    """
    Replace the paginator's state with one rebuilt from a dictionary.

    Args:
        state: Dictionary representing pagination state, handed to
            ``DocumentPaginationState.load``
    """
    restored = DocumentPaginationState.load(state)
    self.state = restored
|
||||||
|
|
||||||
|
def is_complete(self) -> bool:
    """
    Report whether every block has been consumed by pagination.

    Returns:
        True once the state's element index has reached (or passed) the
        number of blocks, False while blocks remain
    """
    next_index = self.state.current_element_index
    total_blocks = len(self._blocks)
    return next_index >= total_blocks
|
||||||
|
|
||||||
|
def get_progress(self) -> float:
    """
    Get the pagination progress as a fraction.

    Returns:
        Fraction of blocks that have been paginated, always within
        0.0 to 1.0. An empty block list counts as fully paginated (1.0).
    """
    if not self._blocks:
        return 1.0

    fraction = self.state.current_element_index / len(self._blocks)
    # The index comes from state that can be replaced via set_state(), so it
    # may not match the current block list; keep the documented range.
    return min(1.0, max(0.0, fraction))
|
||||||
155
pyWebLayout/typesetting/flow.py
Normal file
155
pyWebLayout/typesetting/flow.py
Normal file
@ -0,0 +1,155 @@
|
|||||||
|
"""
|
||||||
|
Flow layout implementation for pyWebLayout.
|
||||||
|
|
||||||
|
This module provides a flow layout algorithm similar to HTML's normal flow,
|
||||||
|
where elements are positioned sequentially, wrapping to the next line when
|
||||||
|
they exceed the container width.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import List, Tuple, Optional, Any
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from pyWebLayout.core import Layoutable
|
||||||
|
from pyWebLayout.style import Alignment
|
||||||
|
|
||||||
|
|
||||||
|
class FlowLayout:
    """
    Flow layout algorithm for arranging elements in a container.

    Flow layout places elements sequentially from left to right, wrapping to the
    next line when the elements exceed the container's width. It supports various
    alignment options for both horizontal and vertical positioning.
    """

    @staticmethod
    def layout_elements(
        elements: List[Layoutable],
        container_size: Tuple[int, int],
        padding: Tuple[int, int, int, int] = (0, 0, 0, 0),  # top, right, bottom, left
        spacing: int = 0,
        halign: Alignment = Alignment.LEFT,
        valign: Alignment = Alignment.TOP
    ) -> List[Tuple[int, int]]:
        """
        Layout elements in a flow layout within the given container.

        Every element that has a ``layout`` method is laid out first, then
        elements are placed left to right and wrapped into rows.

        Args:
            elements: List of layoutable elements to arrange
            container_size: (width, height) tuple for the container
            padding: (top, right, bottom, left) padding inside the container
            spacing: Gap inserted between neighbouring elements in a row and
                also between consecutive rows
            halign: Horizontal alignment (LEFT, CENTER, RIGHT), applied per row
            valign: Vertical alignment (TOP, CENTER, BOTTOM), applied to the
                whole content and only when it is shorter than the available
                height

        Returns:
            List of (x, y) positions for each element, in input order
        """
        # Calculate available width and height after padding
        avail_width = container_size[0] - padding[1] - padding[3]
        avail_height = container_size[1] - padding[0] - padding[2]

        positions = []
        current_x = padding[3]  # Start at left padding
        current_y = padding[0]  # Start at top padding
        row_height = 0
        row_start_idx = 0

        # Ensure elements are properly laid out internally so their sizes are final
        for element in elements:
            if hasattr(element, 'layout'):
                element.layout()

        # First pass - group elements into rows
        for i, element in enumerate(elements):
            # Elements without a size are treated as zero-sized
            element_width = element.size[0] if hasattr(element, 'size') else 0
            element_height = element.size[1] if hasattr(element, 'size') else 0

            # Wrap when the element would cross the right edge; the
            # `i > row_start_idx` guard keeps an oversized element alone on
            # its row instead of wrapping forever.
            if current_x + element_width > padding[3] + avail_width and i > row_start_idx:
                # Adjust positions for the completed row based on halign
                FlowLayout._align_row(
                    positions, elements, row_start_idx, i,
                    padding[3], avail_width, halign, spacing
                )

                # Move to next row
                current_x = padding[3]
                current_y += row_height + spacing
                row_height = 0
                row_start_idx = i

            # Add element to current row
            positions.append((current_x, current_y))
            current_x += element_width + spacing
            row_height = max(row_height, element_height)

        # Handle the last row
        if row_start_idx < len(elements):
            FlowLayout._align_row(
                positions, elements, row_start_idx, len(elements),
                padding[3], avail_width, halign, spacing
            )

        # Second pass - adjust vertical positions based on valign
        if valign != Alignment.TOP:
            total_height = current_y + row_height - padding[0]
            if total_height < avail_height:
                offset = 0
                if valign == Alignment.CENTER:
                    offset = (avail_height - total_height) // 2
                elif valign == Alignment.BOTTOM:
                    offset = avail_height - total_height

                # Apply vertical offset to all positions
                positions = [(x, y + offset) for x, y in positions]

        return positions

    @staticmethod
    def _align_row(
        positions: List[Tuple[int, int]],
        elements: List[Any],
        start_idx: int,
        end_idx: int,
        left_margin: int,
        avail_width: int,
        halign: Alignment,
        spacing: int = 0
    ) -> None:
        """
        Adjust positions of elements in a row based on horizontal alignment.

        ``positions`` is modified in place; entries outside
        ``[start_idx, end_idx)`` are left untouched.

        Args:
            positions: List of element positions to adjust
            elements: List of elements
            start_idx: Start index of the row (inclusive)
            end_idx: End index of the row (exclusive)
            left_margin: Left margin of the container (not needed for the
                offset computation; kept for interface compatibility)
            avail_width: Available width of the container
            halign: Horizontal alignment
            spacing: Gap that was placed between neighbouring elements of the
                row; defaults to 0 for callers that do not pass it
        """
        if halign == Alignment.LEFT:
            # No adjustment needed for left alignment
            return

        # Calculate total width of elements in the row
        total_width = sum(
            elements[i].size[0] if hasattr(elements[i], 'size') else 0
            for i in range(start_idx, end_idx)
        )

        # Account for the gaps between elements so the row's real extent is
        # centred / right-aligned rather than just the sum of element widths
        if end_idx - start_idx > 1:
            total_width += (end_idx - start_idx - 1) * spacing

        # Calculate the adjustment
        offset = 0
        if halign == Alignment.CENTER:
            offset = (avail_width - total_width) // 2
        elif halign == Alignment.RIGHT:
            offset = avail_width - total_width

        # Apply the offset
        for i in range(start_idx, end_idx):
            positions[i] = (positions[i][0] + offset, positions[i][1])
|
||||||
231
pyWebLayout/typesetting/pagination.py
Normal file
231
pyWebLayout/typesetting/pagination.py
Normal file
@ -0,0 +1,231 @@
|
|||||||
|
"""
|
||||||
|
Pagination system for pyWebLayout.
|
||||||
|
|
||||||
|
This module provides functionality for paginating content across multiple pages,
|
||||||
|
with the ability to stop, save state, and resume pagination.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import List, Tuple, Dict, Any, Optional, Iterator, Generator
|
||||||
|
import copy
|
||||||
|
|
||||||
|
from pyWebLayout.core import Layoutable
|
||||||
|
from pyWebLayout.style import Alignment
|
||||||
|
from pyWebLayout.typesetting.flow import FlowLayout
|
||||||
|
|
||||||
|
|
||||||
|
class PaginationState:
    """
    Class to hold the state of a pagination process.

    This allows pagination to be paused, saved, and resumed later.
    """

    def __init__(self):
        """Initialize a new pagination state."""
        self.current_page = 0
        self.current_element_index = 0
        self.position_in_element = 0  # For elements that might be split across pages
        self.consumed_elements = []
        self.metadata = {}  # For any additional state information

    def save(self) -> Dict[str, Any]:
        """
        Save the current pagination state to a dictionary.

        The mutable containers are copied (shallowly) so the returned snapshot
        does not change when this state keeps being updated afterwards.

        Returns:
            A dictionary representing the pagination state
        """
        return {
            'current_page': self.current_page,
            'current_element_index': self.current_element_index,
            'position_in_element': self.position_in_element,
            'consumed_elements': list(self.consumed_elements),
            'metadata': dict(self.metadata),
        }

    @classmethod
    def load(cls, state_dict: Dict[str, Any]) -> 'PaginationState':
        """
        Load pagination state from a dictionary.

        Missing keys fall back to the same defaults as a fresh state. The
        containers are copied so the new state never aliases ``state_dict``.

        Args:
            state_dict: Dictionary containing pagination state

        Returns:
            A PaginationState object
        """
        state = cls()
        state.current_page = state_dict.get('current_page', 0)
        state.current_element_index = state_dict.get('current_element_index', 0)
        state.position_in_element = state_dict.get('position_in_element', 0)
        # `or` also covers an explicit None stored under the key
        state.consumed_elements = list(state_dict.get('consumed_elements') or [])
        state.metadata = dict(state_dict.get('metadata') or {})
        return state
|
||||||
|
|
||||||
|
|
||||||
|
class Paginator:
    """
    Class for paginating content across multiple pages.

    Elements are stacked top to bottom inside the page margins, one element
    per row, and a new page is started when the next element would cross the
    bottom margin. Progress is kept in ``self.state`` between pages so
    pagination can be paused and resumed.
    """

    def __init__(
        self,
        elements: List[Layoutable],
        page_size: Tuple[int, int],
        margins: Tuple[int, int, int, int] = (20, 20, 20, 20),  # top, right, bottom, left
        spacing: int = 5,
        halign: Alignment = Alignment.LEFT,
    ):
        """
        Initialize a paginator.

        Args:
            elements: List of elements to paginate
            page_size: Size of each page (width, height)
            margins: Margins for each page (top, right, bottom, left)
            spacing: Vertical spacing between consecutive elements
            halign: Horizontal alignment of elements
        """
        self.elements = elements
        self.page_size = page_size
        self.margins = margins
        self.spacing = spacing
        self.halign = halign
        self.state = PaginationState()

    def paginate(self, max_pages: Optional[int] = None) -> List[List[Tuple[Layoutable, Tuple[int, int]]]]:
        """
        Paginate all content into pages, starting over from the first element.

        The pagination state is reset before generating pages.

        Args:
            max_pages: Maximum number of pages to generate (None for all)

        Returns:
            List of pages, where each page is a list of (element, position) tuples
        """
        pages = []

        # Reset state so pagination starts from the first element
        self.state = PaginationState()

        # Generate pages up to max_pages or until all content is paginated
        for page in self._paginate_generator():
            pages.append(page)
            if max_pages is not None and len(pages) >= max_pages:
                break

        return pages

    def paginate_next(self) -> Optional[List[Tuple[Layoutable, Tuple[int, int]]]]:
        """
        Paginate and return the next page only.

        A fresh generator is created on every call; it resumes from
        ``self.state.current_element_index``, which the generator updates
        before yielding each page.

        Returns:
            A list of (element, position) tuples for the next page, or None if no more content
        """
        return next(self._paginate_generator(), None)

    def _paginate_generator(self) -> Generator[List[Tuple[Layoutable, Tuple[int, int]]], None, None]:
        """
        Generator that yields one page at a time.

        An element that is taller than the usable page height is skipped with
        a printed warning, and pagination continues with the element after it.
        Skipped elements do not produce a page and are not counted in
        ``self.state.current_page``.

        Yields:
            A list of (element, position) tuples for each page
        """
        # Calculate available space on a page
        avail_width = self.page_size[0] - self.margins[1] - self.margins[3]
        avail_height = self.page_size[1] - self.margins[0] - self.margins[2]
        page_bottom = self.margins[0] + avail_height

        # Resume from wherever the saved state points
        current_index = self.state.current_element_index

        # Process elements until we run out
        while current_index < len(self.elements):
            # Start a new page
            page_elements = []
            current_y = self.margins[0]

            # Fill the page with elements
            while current_index < len(self.elements):
                element = self.elements[current_index]

                # Ensure element is laid out properly
                if hasattr(element, 'layout'):
                    element.layout()

                # Get element size; elements without a size count as zero-sized
                element_width = element.size[0] if hasattr(element, 'size') else 0
                element_height = element.size[1] if hasattr(element, 'size') else 0

                # Check if element fits on current page
                if current_y + element_height > page_bottom:
                    # Element doesn't fit, move to next page
                    break

                # Position the element on the page based on alignment
                if self.halign == Alignment.CENTER:
                    element_x = self.margins[3] + (avail_width - element_width) // 2
                elif self.halign == Alignment.RIGHT:
                    element_x = self.margins[3] + (avail_width - element_width)
                else:
                    # LEFT, and any other alignment, starts at the left margin
                    element_x = self.margins[3]

                # Add element to page
                page_elements.append((element, (element_x, current_y)))

                # Move to next element and update position
                current_index += 1
                current_y += element_height + self.spacing

            if not page_elements:
                # The outer loop guarantees an element was examined, so an
                # empty page means that element does not fit even on a blank
                # page. Skip it (to avoid an infinite loop) and keep going
                # with the remaining elements instead of stopping here.
                print(f"Warning: Element at index {current_index} is too large to fit on a page")
                current_index += 1
                self.state.current_element_index = current_index
                continue

            # Update state before handing the page out, so a consumer that
            # stops iterating still leaves the state pointing at the next page
            self.state.current_page += 1
            self.state.current_element_index = current_index

            yield page_elements

    def get_state(self) -> Dict[str, Any]:
        """
        Get the current pagination state.

        Returns:
            Dictionary representing pagination state
        """
        return self.state.save()

    def set_state(self, state: Dict[str, Any]) -> None:
        """
        Set the pagination state.

        Args:
            state: Dictionary representing pagination state
        """
        self.state = PaginationState.load(state)
|
||||||
18
pyproject.toml
Normal file
18
pyproject.toml
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
[build-system]
|
||||||
|
requires = ["setuptools>=42", "wheel"]
|
||||||
|
build-backend = "setuptools.build_meta"
|
||||||
|
|
||||||
|
[project]
|
||||||
|
name = "pyWebLayout"
|
||||||
|
description = "A Python library for HTML-like layout and rendering"
|
||||||
|
readme = "README.md"
|
||||||
|
requires-python = ">=3.6"
|
||||||
|
license = {file = "LICENSE"}
|
||||||
|
authors = [
|
||||||
|
{name = "Duncan Tourolle", email = "duncan@tourolle.paris"}
|
||||||
|
]
|
||||||
|
dynamic = ["version"]
|
||||||
|
dependencies = [
|
||||||
|
"Pillow",
|
||||||
|
"numpy",
|
||||||
|
]
|
||||||
23
setup.cfg
Normal file
23
setup.cfg
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
[metadata]
|
||||||
|
name = pyWebLayout
|
||||||
|
version = 0.1.0
|
||||||
|
author = Duncan Tourolle
|
||||||
|
author_email = duncan@tourolle.paris
|
||||||
|
description = A Python library for HTML-like layout and rendering
|
||||||
|
long_description = file: README.md
|
||||||
|
long_description_content_type = text/markdown
|
||||||
|
url = https://gitea.tourolle.paris/pyWebLayout
|
||||||
|
classifiers =
|
||||||
|
Programming Language :: Python :: 3
|
||||||
|
License :: OSI Approved :: MIT License
|
||||||
|
Operating System :: OS Independent
|
||||||
|
|
||||||
|
[options]
|
||||||
|
packages = find:
|
||||||
|
python_requires = >=3.6
|
||||||
|
install_requires =
|
||||||
|
Pillow
|
||||||
|
numpy
|
||||||
|
|
||||||
|
[options.packages.find]
|
||||||
|
include = pyWebLayout*
|
||||||
32
setup.py
Normal file
32
setup.py
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
from setuptools import setup, find_packages

# Read the long description with an explicit encoding and close the file
# deterministically instead of relying on the platform default encoding and
# on garbage collection to release the handle.
with open("README.md", encoding="utf-8") as readme_file:
    long_description = readme_file.read()

setup(
    name="pyWebLayout",
    version="0.1.0",
    packages=find_packages(),
    install_requires=[
        "Pillow",
        "numpy",
    ],
    extras_require={
        "test": [
            "coverage>=5.0",
        ],
        "dev": [
            "coverage>=5.0",
            "pytest>=6.0",
        ],
    },
    author="Duncan Tourolle",
    author_email="duncan@tourolle.paris",
    description="A Python library for HTML-like layout and rendering",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://gitea.tourolle.paris/pyWebLayout",
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires=">=3.6",
)
|
||||||
299
tests/TESTING_STRATEGY.md
Normal file
299
tests/TESTING_STRATEGY.md
Normal file
@ -0,0 +1,299 @@
|
|||||||
|
# pyWebLayout Testing Strategy
|
||||||
|
|
||||||
|
This document outlines the comprehensive unit testing strategy for the pyWebLayout project.
|
||||||
|
|
||||||
|
## Testing Philosophy
|
||||||
|
|
||||||
|
The testing strategy follows these principles:
|
||||||
|
- **Separation of Concerns**: Each component is tested independently
|
||||||
|
- **Comprehensive Coverage**: All public APIs and critical functionality are tested
|
||||||
|
- **Integration Testing**: End-to-end workflows are validated
|
||||||
|
- **Regression Prevention**: Tests prevent breaking changes
|
||||||
|
- **Documentation**: Tests serve as living documentation of expected behavior
|
||||||
|
|
||||||
|
## Test Organization
|
||||||
|
|
||||||
|
### Current Test Files (Implemented)
|
||||||
|
|
||||||
|
#### ✅ `test_html_style.py`
|
||||||
|
Tests the `HTMLStyleManager` class for CSS parsing and style management.
|
||||||
|
|
||||||
|
**Coverage:**
|
||||||
|
- Style initialization and defaults
|
||||||
|
- Style stack operations (push/pop)
|
||||||
|
- CSS property parsing (font-size, font-weight, colors, etc.)
|
||||||
|
- Color parsing (named, hex, rgb, rgba)
|
||||||
|
- Tag-specific default styles
|
||||||
|
- Inline style parsing
|
||||||
|
- Font object creation
|
||||||
|
- Style combination (tag + inline styles)
|
||||||
|
|
||||||
|
#### ✅ `test_html_text.py`
|
||||||
|
Tests the `HTMLTextProcessor` class for text buffering and word creation.
|
||||||
|
|
||||||
|
**Coverage:**
|
||||||
|
- Text buffer management
|
||||||
|
- HTML entity reference handling
|
||||||
|
- Character reference processing (decimal/hex)
|
||||||
|
- Word creation with styling
|
||||||
|
- Paragraph management
|
||||||
|
- Text flushing operations
|
||||||
|
- Buffer state operations
|
||||||
|
|
||||||
|
#### ✅ `test_html_content.py`
|
||||||
|
Integration tests for the `HTMLContentReader` class covering complete HTML parsing.
|
||||||
|
|
||||||
|
**Coverage:**
|
||||||
|
- Simple paragraph parsing
|
||||||
|
- Heading levels (h1-h6)
|
||||||
|
- Styled text (bold, italic)
|
||||||
|
- Lists (ul, ol, dl)
|
||||||
|
- Tables with headers and cells
|
||||||
|
- Blockquotes with nested content
|
||||||
|
- Code blocks with language detection
|
||||||
|
- HTML entities
|
||||||
|
- Nested element structures
|
||||||
|
- Complex document parsing
|
||||||
|
|
||||||
|
#### ✅ `test_abstract_blocks.py`
|
||||||
|
Tests for the core abstract block element classes.
|
||||||
|
|
||||||
|
**Coverage:**
|
||||||
|
- Paragraph word management
|
||||||
|
- Heading levels and properties
|
||||||
|
- Quote nesting capabilities
|
||||||
|
- Code block line management
|
||||||
|
- List creation and item handling
|
||||||
|
- Table structure (rows, cells, sections)
|
||||||
|
- Image properties and scaling
|
||||||
|
- Simple elements (hr, br)
|
||||||
|
|
||||||
|
#### ✅ `test_runner.py`
|
||||||
|
Test runner script for executing all tests with summary reporting.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Additional Tests Needed
|
||||||
|
|
||||||
|
### 🔄 High Priority (Should Implement Next)
|
||||||
|
|
||||||
|
#### `test_abstract_inline.py`
|
||||||
|
Tests for inline elements and text formatting.
|
||||||
|
|
||||||
|
**Needed Coverage:**
|
||||||
|
- Word creation and properties
|
||||||
|
- Word hyphenation functionality
|
||||||
|
- FormattedSpan management
|
||||||
|
- Word chaining (previous/next relationships)
|
||||||
|
- Font style application
|
||||||
|
- Language-specific hyphenation
|
||||||
|
|
||||||
|
#### `test_abstract_document.py`
|
||||||
|
Tests for document structure and metadata.
|
||||||
|
|
||||||
|
**Needed Coverage:**
|
||||||
|
- Document creation and initialization
|
||||||
|
- Metadata management (title, author, language, etc.)
|
||||||
|
- Block addition and management
|
||||||
|
- Anchor creation and resolution
|
||||||
|
- Resource management
|
||||||
|
- Table of contents generation
|
||||||
|
- Chapter and book structures
|
||||||
|
|
||||||
|
#### `test_abstract_functional.py`
|
||||||
|
Tests for functional elements (links, buttons, forms).
|
||||||
|
|
||||||
|
**Needed Coverage:**
|
||||||
|
- Link creation and type detection
|
||||||
|
- Link execution for different types
|
||||||
|
- Button functionality and state
|
||||||
|
- Form field management
|
||||||
|
- Form validation and submission
|
||||||
|
- Parameter handling
|
||||||
|
|
||||||
|
#### `test_style_system.py`
|
||||||
|
Tests for the style system (fonts, colors, alignment).
|
||||||
|
|
||||||
|
**Needed Coverage:**
|
||||||
|
- Font creation and properties
|
||||||
|
- Color representation and manipulation
|
||||||
|
- Font weight, style, decoration enums
|
||||||
|
- Alignment enums and behavior
|
||||||
|
- Style inheritance and cascading
|
||||||
|
|
||||||
|
### 🔧 Medium Priority
|
||||||
|
|
||||||
|
#### `test_html_elements.py`
|
||||||
|
Unit tests for the HTML element handlers.
|
||||||
|
|
||||||
|
**Needed Coverage:**
|
||||||
|
- BlockElementHandler individual methods
|
||||||
|
- ListElementHandler state management
|
||||||
|
- TableElementHandler complex scenarios
|
||||||
|
- InlineElementHandler link processing
|
||||||
|
- Handler coordination and delegation
|
||||||
|
- Error handling in handlers
|
||||||
|
|
||||||
|
#### `test_html_metadata.py`
|
||||||
|
Tests for HTML metadata extraction.
|
||||||
|
|
||||||
|
**Needed Coverage:**
|
||||||
|
- Meta tag parsing
|
||||||
|
- Open Graph extraction
|
||||||
|
- JSON-LD structured data
|
||||||
|
- Title and description extraction
|
||||||
|
- Language detection
|
||||||
|
- Character encoding handling
|
||||||
|
|
||||||
|
#### `test_html_resources.py`
|
||||||
|
Tests for HTML resource extraction.
|
||||||
|
|
||||||
|
**Needed Coverage:**
|
||||||
|
- CSS stylesheet extraction
|
||||||
|
- JavaScript resource identification
|
||||||
|
- Image source collection
|
||||||
|
- Media element detection
|
||||||
|
- External resource resolution
|
||||||
|
- Base URL handling
|
||||||
|
|
||||||
|
#### `test_io_base.py`
|
||||||
|
Tests for the base reader architecture.
|
||||||
|
|
||||||
|
**Needed Coverage:**
|
||||||
|
- BaseReader interface compliance
|
||||||
|
- MetadataReader abstract methods
|
||||||
|
- ContentReader abstract methods
|
||||||
|
- ResourceReader abstract methods
|
||||||
|
- CompositeReader coordination
|
||||||
|
|
||||||
|
### 🔍 Lower Priority
|
||||||
|
|
||||||
|
#### `test_concrete_elements.py`
|
||||||
|
Tests for concrete rendering implementations.
|
||||||
|
|
||||||
|
**Needed Coverage:**
|
||||||
|
- Box model calculations
|
||||||
|
- Text rendering specifics
|
||||||
|
- Image rendering and scaling
|
||||||
|
- Page layout management
|
||||||
|
- Functional element rendering
|
||||||
|
|
||||||
|
#### `test_typesetting.py`
|
||||||
|
Tests for the typesetting system.
|
||||||
|
|
||||||
|
**Needed Coverage:**
|
||||||
|
- Flow algorithms
|
||||||
|
- Pagination logic
|
||||||
|
- Document pagination
|
||||||
|
- Line breaking
|
||||||
|
- Hyphenation integration
|
||||||
|
|
||||||
|
#### `test_epub_reader.py`
|
||||||
|
Tests for EPUB reading functionality.
|
||||||
|
|
||||||
|
**Needed Coverage:**
|
||||||
|
- EPUB file structure parsing
|
||||||
|
- Manifest processing
|
||||||
|
- Chapter extraction
|
||||||
|
- Metadata reading
|
||||||
|
- Navigation document parsing
|
||||||
|
|
||||||
|
#### `test_integration.py`
|
||||||
|
End-to-end integration tests.
|
||||||
|
|
||||||
|
**Needed Coverage:**
|
||||||
|
- Complete HTML-to-document workflows
|
||||||
|
- EPUB-to-document workflows
|
||||||
|
- Style application across parsers
|
||||||
|
- Resource resolution chains
|
||||||
|
- Error handling scenarios
|
||||||
|
|
||||||
|
## Testing Infrastructure
|
||||||
|
|
||||||
|
### Test Dependencies
|
||||||
|
```python
|
||||||
|
# Required for testing
|
||||||
|
unittest # Built-in Python testing framework
|
||||||
|
unittest.mock # For mocking and test doubles
|
||||||
|
```
|
||||||
|
|
||||||
|
### Test Data
|
||||||
|
- Create `tests/data/` directory with sample files:
|
||||||
|
- `sample.html` - Well-formed HTML document
|
||||||
|
- `complex.html` - Complex nested HTML
|
||||||
|
- `malformed.html` - Edge cases and error conditions
|
||||||
|
- `sample.epub` - Sample EPUB file
|
||||||
|
- `test_images/` - Sample images for testing
|
||||||
|
|
||||||
|
### Continuous Integration
|
||||||
|
- Tests should run on Python 3.6+
|
||||||
|
- All tests must pass before merging
|
||||||
|
- Aim for >90% code coverage
|
||||||
|
- Performance regression testing for parsing speed
|
||||||
|
|
||||||
|
## Running Tests
|
||||||
|
|
||||||
|
### Run All Tests
|
||||||
|
```bash
|
||||||
|
python tests/test_runner.py
|
||||||
|
```
|
||||||
|
|
||||||
|
### Run Specific Test Module
|
||||||
|
```bash
|
||||||
|
python tests/test_runner.py html_style
|
||||||
|
python -m unittest tests.test_html_style
|
||||||
|
```
|
||||||
|
|
||||||
|
### Run Individual Test
|
||||||
|
```bash
|
||||||
|
python -m unittest tests.test_html_style.TestHTMLStyleManager.test_color_parsing
|
||||||
|
```
|
||||||
|
|
||||||
|
### Run with Coverage
|
||||||
|
```bash
|
||||||
|
pip install coverage
|
||||||
|
coverage run -m unittest discover tests/
|
||||||
|
coverage report -m
|
||||||
|
coverage html # Generate HTML report
|
||||||
|
```
|
||||||
|
|
||||||
|
## Test Quality Guidelines
|
||||||
|
|
||||||
|
### Test Naming
|
||||||
|
- Test files: `test_<module_name>.py`
|
||||||
|
- Test classes: `Test<ClassName>`
|
||||||
|
- Test methods: `test_<specific_functionality>`
|
||||||
|
|
||||||
|
### Test Structure
|
||||||
|
1. **Arrange**: Set up test data and mocks
|
||||||
|
2. **Act**: Execute the functionality being tested
|
||||||
|
3. **Assert**: Verify the expected behavior
|
||||||
|
|
||||||
|
### Mock Usage
|
||||||
|
- Mock external dependencies (file I/O, network)
|
||||||
|
- Mock complex objects when testing units in isolation
|
||||||
|
- Prefer real objects for integration tests
|
||||||
|
|
||||||
|
### Edge Cases
|
||||||
|
- Empty inputs
|
||||||
|
- Invalid inputs
|
||||||
|
- Boundary conditions
|
||||||
|
- Error scenarios
|
||||||
|
- Performance edge cases
|
||||||
|
|
||||||
|
## Success Metrics
|
||||||
|
|
||||||
|
- **Coverage**: >90% line coverage across all modules
|
||||||
|
- **Performance**: No test takes longer than 1 second
|
||||||
|
- **Reliability**: Tests pass consistently across environments
|
||||||
|
- **Maintainability**: Tests are easy to understand and modify
|
||||||
|
- **Documentation**: Tests clearly show expected behavior
|
||||||
|
|
||||||
|
## Implementation Priority
|
||||||
|
|
||||||
|
1. **Week 1**: Complete high-priority abstract tests
|
||||||
|
2. **Week 2**: Implement HTML processing component tests
|
||||||
|
3. **Week 3**: Add integration and end-to-end tests
|
||||||
|
4. **Week 4**: Performance and edge case testing
|
||||||
|
|
||||||
|
This testing strategy ensures comprehensive coverage of the pyWebLayout library while maintaining good separation of concerns and providing clear documentation of expected behavior.
|
||||||
6
tests/__init__.py
Normal file
6
tests/__init__.py
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
"""
|
||||||
|
Test suite for pyWebLayout.
|
||||||
|
|
||||||
|
This package contains comprehensive unit tests for all components of the pyWebLayout library,
|
||||||
|
organized by module and functionality.
|
||||||
|
"""
|
||||||
275
tests/test_abstract_blocks.py
Normal file
275
tests/test_abstract_blocks.py
Normal file
@ -0,0 +1,275 @@
|
|||||||
|
"""
|
||||||
|
Unit tests for abstract block elements.
|
||||||
|
|
||||||
|
Tests the core abstract block classes that form the foundation of the document model.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
from pyWebLayout.abstract.block import (
|
||||||
|
Block, BlockType, Parapgraph, Heading, HeadingLevel, Quote, CodeBlock,
|
||||||
|
HList, ListStyle, ListItem, Table, TableRow, TableCell,
|
||||||
|
HorizontalRule, LineBreak, Image
|
||||||
|
)
|
||||||
|
from pyWebLayout.abstract.inline import Word
|
||||||
|
from pyWebLayout.style import Font
|
||||||
|
|
||||||
|
|
||||||
|
class TestBlockElements(unittest.TestCase):
|
||||||
|
"""Test cases for basic block elements."""
|
||||||
|
|
||||||
|
def test_paragraph_creation(self):
    """A new paragraph is empty and parentless; added words are counted and iterable."""
    para = Parapgraph()

    # Freshly created paragraph
    self.assertEqual(para.block_type, BlockType.PARAGRAPH)
    self.assertEqual(para.word_count, 0)
    self.assertIsNone(para.parent)

    # Build both words with one shared font, then append them in order
    shared_font = Font()
    expected_texts = ("Hello", "World")
    new_words = [Word(text, shared_font) for text in expected_texts]
    for word in new_words:
        para.add_word(word)

    self.assertEqual(para.word_count, 2)

    # words() yields entries whose second item is the Word
    entries = list(para.words())
    self.assertEqual(len(entries), 2)
    for entry, expected in zip(entries, expected_texts):
        self.assertEqual(entry[1].text, expected)
|
||||||
|
|
||||||
|
def test_heading_levels(self):
|
||||||
|
"""Test heading creation with different levels."""
|
||||||
|
h1 = Heading(HeadingLevel.H1)
|
||||||
|
h3 = Heading(HeadingLevel.H3)
|
||||||
|
h6 = Heading(HeadingLevel.H6)
|
||||||
|
|
||||||
|
self.assertEqual(h1.level, HeadingLevel.H1)
|
||||||
|
self.assertEqual(h3.level, HeadingLevel.H3)
|
||||||
|
self.assertEqual(h6.level, HeadingLevel.H6)
|
||||||
|
|
||||||
|
self.assertEqual(h1.block_type, BlockType.HEADING)
|
||||||
|
|
||||||
|
# Test level modification
|
||||||
|
h1.level = HeadingLevel.H2
|
||||||
|
self.assertEqual(h1.level, HeadingLevel.H2)
|
||||||
|
|
||||||
|
def test_quote_nesting(self):
|
||||||
|
"""Test blockquote with nested content."""
|
||||||
|
quote = Quote()
|
||||||
|
|
||||||
|
# Add nested paragraphs
|
||||||
|
p1 = Parapgraph()
|
||||||
|
p2 = Parapgraph()
|
||||||
|
|
||||||
|
quote.add_block(p1)
|
||||||
|
quote.add_block(p2)
|
||||||
|
|
||||||
|
self.assertEqual(p1.parent, quote)
|
||||||
|
self.assertEqual(p2.parent, quote)
|
||||||
|
|
||||||
|
# Test block iteration
|
||||||
|
blocks = list(quote.blocks())
|
||||||
|
self.assertEqual(len(blocks), 2)
|
||||||
|
self.assertEqual(blocks[0], p1)
|
||||||
|
self.assertEqual(blocks[1], p2)
|
||||||
|
|
||||||
|
def test_code_block(self):
|
||||||
|
"""Test code block functionality."""
|
||||||
|
code = CodeBlock("python")
|
||||||
|
|
||||||
|
self.assertEqual(code.language, "python")
|
||||||
|
self.assertEqual(code.line_count, 0)
|
||||||
|
|
||||||
|
# Add code lines
|
||||||
|
code.add_line("def hello():")
|
||||||
|
code.add_line(" print('Hello!')")
|
||||||
|
|
||||||
|
self.assertEqual(code.line_count, 2)
|
||||||
|
|
||||||
|
# Test line iteration
|
||||||
|
lines = list(code.lines())
|
||||||
|
self.assertEqual(len(lines), 2)
|
||||||
|
self.assertEqual(lines[0][1], "def hello():")
|
||||||
|
self.assertEqual(lines[1][1], " print('Hello!')")
|
||||||
|
|
||||||
|
# Test language modification
|
||||||
|
code.language = "javascript"
|
||||||
|
self.assertEqual(code.language, "javascript")
|
||||||
|
|
||||||
|
def test_list_creation(self):
|
||||||
|
"""Test list creation and item management."""
|
||||||
|
# Unordered list
|
||||||
|
ul = HList(ListStyle.UNORDERED)
|
||||||
|
self.assertEqual(ul.style, ListStyle.UNORDERED)
|
||||||
|
self.assertEqual(ul.item_count, 0)
|
||||||
|
|
||||||
|
# Add list items
|
||||||
|
item1 = ListItem()
|
||||||
|
item2 = ListItem()
|
||||||
|
|
||||||
|
ul.add_item(item1)
|
||||||
|
ul.add_item(item2)
|
||||||
|
|
||||||
|
self.assertEqual(ul.item_count, 2)
|
||||||
|
self.assertEqual(item1.parent, ul)
|
||||||
|
self.assertEqual(item2.parent, ul)
|
||||||
|
|
||||||
|
# Test item iteration
|
||||||
|
items = list(ul.items())
|
||||||
|
self.assertEqual(len(items), 2)
|
||||||
|
|
||||||
|
# Test list style change
|
||||||
|
ul.style = ListStyle.ORDERED
|
||||||
|
self.assertEqual(ul.style, ListStyle.ORDERED)
|
||||||
|
|
||||||
|
def test_definition_list(self):
|
||||||
|
"""Test definition list with terms."""
|
||||||
|
dl = HList(ListStyle.DEFINITION)
|
||||||
|
|
||||||
|
# Add definition items with terms
|
||||||
|
dt1 = ListItem(term="Python")
|
||||||
|
dt2 = ListItem(term="JavaScript")
|
||||||
|
|
||||||
|
dl.add_item(dt1)
|
||||||
|
dl.add_item(dt2)
|
||||||
|
|
||||||
|
self.assertEqual(dt1.term, "Python")
|
||||||
|
self.assertEqual(dt2.term, "JavaScript")
|
||||||
|
|
||||||
|
# Test term modification
|
||||||
|
dt1.term = "Python 3"
|
||||||
|
self.assertEqual(dt1.term, "Python 3")
|
||||||
|
|
||||||
|
def test_table_structure(self):
|
||||||
|
"""Test table, row, and cell structure."""
|
||||||
|
table = Table(caption="Test Table")
|
||||||
|
|
||||||
|
self.assertEqual(table.caption, "Test Table")
|
||||||
|
self.assertEqual(table.row_count["total"], 0)
|
||||||
|
|
||||||
|
# Create rows and cells
|
||||||
|
header_row = TableRow()
|
||||||
|
data_row = TableRow()
|
||||||
|
|
||||||
|
# Header cells
|
||||||
|
h1 = TableCell(is_header=True)
|
||||||
|
h2 = TableCell(is_header=True)
|
||||||
|
header_row.add_cell(h1)
|
||||||
|
header_row.add_cell(h2)
|
||||||
|
|
||||||
|
# Data cells
|
||||||
|
d1 = TableCell(is_header=False)
|
||||||
|
d2 = TableCell(is_header=False, colspan=2)
|
||||||
|
data_row.add_cell(d1)
|
||||||
|
data_row.add_cell(d2)
|
||||||
|
|
||||||
|
# Add rows to table
|
||||||
|
table.add_row(header_row, "header")
|
||||||
|
table.add_row(data_row, "body")
|
||||||
|
|
||||||
|
# Test structure
|
||||||
|
self.assertEqual(table.row_count["header"], 1)
|
||||||
|
self.assertEqual(table.row_count["body"], 1)
|
||||||
|
self.assertEqual(table.row_count["total"], 2)
|
||||||
|
|
||||||
|
# Test cell properties
|
||||||
|
self.assertTrue(h1.is_header)
|
||||||
|
self.assertFalse(d1.is_header)
|
||||||
|
self.assertEqual(d2.colspan, 2)
|
||||||
|
self.assertEqual(d2.rowspan, 1) # Default
|
||||||
|
|
||||||
|
# Test row cell count
|
||||||
|
self.assertEqual(header_row.cell_count, 2)
|
||||||
|
self.assertEqual(data_row.cell_count, 2)
|
||||||
|
|
||||||
|
def test_table_sections(self):
|
||||||
|
"""Test table header, body, and footer sections."""
|
||||||
|
table = Table()
|
||||||
|
|
||||||
|
# Add rows to different sections
|
||||||
|
header = TableRow()
|
||||||
|
body1 = TableRow()
|
||||||
|
body2 = TableRow()
|
||||||
|
footer = TableRow()
|
||||||
|
|
||||||
|
table.add_row(header, "header")
|
||||||
|
table.add_row(body1, "body")
|
||||||
|
table.add_row(body2, "body")
|
||||||
|
table.add_row(footer, "footer")
|
||||||
|
|
||||||
|
# Test section iteration
|
||||||
|
header_rows = list(table.header_rows())
|
||||||
|
body_rows = list(table.body_rows())
|
||||||
|
footer_rows = list(table.footer_rows())
|
||||||
|
|
||||||
|
self.assertEqual(len(header_rows), 1)
|
||||||
|
self.assertEqual(len(body_rows), 2)
|
||||||
|
self.assertEqual(len(footer_rows), 1)
|
||||||
|
|
||||||
|
# Test all_rows iteration
|
||||||
|
all_rows = list(table.all_rows())
|
||||||
|
self.assertEqual(len(all_rows), 4)
|
||||||
|
|
||||||
|
# Check section labels
|
||||||
|
sections = [section for section, row in all_rows]
|
||||||
|
self.assertEqual(sections, ["header", "body", "body", "footer"])
|
||||||
|
|
||||||
|
def test_image_loading(self):
|
||||||
|
"""Test image element properties."""
|
||||||
|
# Test with basic properties
|
||||||
|
img = Image("test.jpg", "Test image", 100, 200)
|
||||||
|
|
||||||
|
self.assertEqual(img.source, "test.jpg")
|
||||||
|
self.assertEqual(img.alt_text, "Test image")
|
||||||
|
self.assertEqual(img.width, 100)
|
||||||
|
self.assertEqual(img.height, 200)
|
||||||
|
|
||||||
|
# Test property modification
|
||||||
|
img.source = "new.png"
|
||||||
|
img.alt_text = "New image"
|
||||||
|
img.width = 150
|
||||||
|
img.height = 300
|
||||||
|
|
||||||
|
self.assertEqual(img.source, "new.png")
|
||||||
|
self.assertEqual(img.alt_text, "New image")
|
||||||
|
self.assertEqual(img.width, 150)
|
||||||
|
self.assertEqual(img.height, 300)
|
||||||
|
|
||||||
|
# Test dimensions tuple
|
||||||
|
self.assertEqual(img.get_dimensions(), (150, 300))
|
||||||
|
|
||||||
|
def test_aspect_ratio_calculation(self):
|
||||||
|
"""Test image aspect ratio calculations."""
|
||||||
|
# Test with specified dimensions
|
||||||
|
img = Image("test.jpg", width=400, height=200)
|
||||||
|
self.assertEqual(img.get_aspect_ratio(), 2.0) # 400/200
|
||||||
|
|
||||||
|
# Test with only one dimension
|
||||||
|
img2 = Image("test.jpg", width=300)
|
||||||
|
self.assertIsNone(img2.get_aspect_ratio()) # No height specified
|
||||||
|
|
||||||
|
# Test scaled dimensions
|
||||||
|
scaled = img.calculate_scaled_dimensions(max_width=200, max_height=150)
|
||||||
|
# Should scale down proportionally
|
||||||
|
self.assertEqual(scaled[0], 200) # Width limited by max_width
|
||||||
|
self.assertEqual(scaled[1], 100) # Height scaled proportionally
|
||||||
|
|
||||||
|
def test_simple_elements(self):
|
||||||
|
"""Test simple block elements."""
|
||||||
|
hr = HorizontalRule()
|
||||||
|
br = LineBreak()
|
||||||
|
|
||||||
|
self.assertEqual(hr.block_type, BlockType.HORIZONTAL_RULE)
|
||||||
|
self.assertEqual(br.block_type, BlockType.LINE_BREAK)
|
||||||
|
|
||||||
|
# These elements have no additional properties
|
||||||
|
self.assertIsNone(hr.parent)
|
||||||
|
self.assertIsNone(br.parent)
|
||||||
|
|
||||||
|
|
||||||
|
# Run this module's tests when the file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
|
||||||
354
tests/test_html_content.py
Normal file
354
tests/test_html_content.py
Normal file
@ -0,0 +1,354 @@
|
|||||||
|
"""
|
||||||
|
Unit tests for HTML content reading.
|
||||||
|
|
||||||
|
Tests the HTMLContentReader class for parsing complete HTML documents.
|
||||||
|
This is more of an integration test covering the entire parsing pipeline.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
from pyWebLayout.io.readers.html_content import HTMLContentReader
|
||||||
|
from pyWebLayout.abstract.document import Document
|
||||||
|
from pyWebLayout.abstract.block import (
|
||||||
|
Parapgraph, Heading, HeadingLevel, HList, ListStyle,
|
||||||
|
Table, Quote, CodeBlock, HorizontalRule, LineBreak
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestHTMLContentReader(unittest.TestCase):
    """Integration-style tests for HTMLContentReader.

    Each test feeds an HTML snippet to ``extract_content`` and inspects the
    blocks that end up on ``self.document``.
    """

    def setUp(self):
        """Create a new reader and a new target document for every test."""
        self.reader = HTMLContentReader()
        self.document = Document()

    def test_simple_paragraph(self):
        """A single <p> becomes one paragraph block holding one word per token."""
        html = '<p>Hello world!</p>'

        # Only the effect on the document is checked; the return value is not used.
        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 1)
        self.assertIsInstance(self.document.blocks[0], Parapgraph)

        paragraph = self.document.blocks[0]
        words = list(paragraph.words())
        self.assertEqual(len(words), 2)
        self.assertEqual(words[0][1].text, "Hello")
        self.assertEqual(words[1][1].text, "world!")

    def test_headings(self):
        """<h1>..<h6> tags become Heading blocks with the matching level."""
        html = '''
        <h1>Heading 1</h1>
        <h2>Heading 2</h2>
        <h3>Heading 3</h3>
        <h6>Heading 6</h6>
        '''

        self.reader.extract_content(html, self.document)

        # Should have 4 heading blocks
        headings = [block for block in self.document.blocks if isinstance(block, Heading)]
        self.assertEqual(len(headings), 4)

        # Check heading levels
        self.assertEqual(headings[0].level, HeadingLevel.H1)
        self.assertEqual(headings[1].level, HeadingLevel.H2)
        self.assertEqual(headings[2].level, HeadingLevel.H3)
        self.assertEqual(headings[3].level, HeadingLevel.H6)

        # Check text content
        h1_words = list(headings[0].words())
        self.assertEqual(len(h1_words), 2)
        self.assertEqual(h1_words[0][1].text, "Heading")
        self.assertEqual(h1_words[1][1].text, "1")

    def test_styled_text(self):
        """Inline <b>/<i> markup does not split or drop words of the paragraph."""
        html = '<p>This is <b>bold</b> and <i>italic</i> text.</p>'

        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 1)
        paragraph = self.document.blocks[0]
        words = list(paragraph.words())

        # Should have words: "This", "is", "bold", "and", "italic", "text."
        self.assertEqual(len(words), 6)

        # Styling lives on each word's Font and is not inspected here;
        # this test only verifies that the word texts are produced correctly.
        word_texts = [word[1].text for word in words]
        self.assertEqual(word_texts, ["This", "is", "bold", "and", "italic", "text."])

    def test_unordered_list(self):
        """A <ul> becomes an unordered HList whose items wrap their text in a paragraph."""
        html = '''
        <ul>
            <li>First item</li>
            <li>Second item</li>
            <li>Third item</li>
        </ul>
        '''

        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 1)
        self.assertIsInstance(self.document.blocks[0], HList)

        list_block = self.document.blocks[0]
        self.assertEqual(list_block.style, ListStyle.UNORDERED)

        items = list(list_block.items())
        self.assertEqual(len(items), 3)

        # Check first item content
        first_item_blocks = list(items[0].blocks())
        self.assertEqual(len(first_item_blocks), 1)
        self.assertIsInstance(first_item_blocks[0], Parapgraph)

    def test_ordered_list(self):
        """An <ol> becomes an ordered HList with one item per <li>."""
        html = '''
        <ol>
            <li>First step</li>
            <li>Second step</li>
        </ol>
        '''

        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 1)
        list_block = self.document.blocks[0]
        self.assertEqual(list_block.style, ListStyle.ORDERED)

        items = list(list_block.items())
        self.assertEqual(len(items), 2)

    def test_definition_list(self):
        """A <dl> becomes a definition HList with one item per dt/dd pair."""
        html = '''
        <dl>
            <dt>Term 1</dt>
            <dd>Definition 1</dd>
            <dt>Term 2</dt>
            <dd>Definition 2</dd>
        </dl>
        '''

        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 1)
        list_block = self.document.blocks[0]
        self.assertEqual(list_block.style, ListStyle.DEFINITION)

        items = list(list_block.items())
        self.assertEqual(len(items), 2)  # Two dt/dd pairs

    def test_table(self):
        """A <table> without <thead> puts every row, including the <th> row, in the body."""
        html = '''
        <table>
            <tr>
                <th>Header 1</th>
                <th>Header 2</th>
            </tr>
            <tr>
                <td>Cell 1</td>
                <td>Cell 2</td>
            </tr>
        </table>
        '''

        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 1)
        self.assertIsInstance(self.document.blocks[0], Table)

        table = self.document.blocks[0]

        # Check body rows
        body_rows = list(table.body_rows())
        self.assertEqual(len(body_rows), 2)  # Header row + data row

        # Check first row (header)
        first_row_cells = list(body_rows[0].cells())
        self.assertEqual(len(first_row_cells), 2)
        self.assertTrue(first_row_cells[0].is_header)
        self.assertTrue(first_row_cells[1].is_header)

        # Check second row (data)
        second_row_cells = list(body_rows[1].cells())
        self.assertEqual(len(second_row_cells), 2)
        self.assertFalse(second_row_cells[0].is_header)
        self.assertFalse(second_row_cells[1].is_header)

    def test_blockquote(self):
        """A <blockquote> becomes a Quote block containing its nested paragraphs."""
        html = '''
        <blockquote>
            <p>This is a quoted paragraph.</p>
            <p>Another quoted paragraph.</p>
        </blockquote>
        '''

        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 1)
        self.assertIsInstance(self.document.blocks[0], Quote)

        quote = self.document.blocks[0]
        quote_blocks = list(quote.blocks())
        self.assertEqual(len(quote_blocks), 2)
        self.assertIsInstance(quote_blocks[0], Parapgraph)
        self.assertIsInstance(quote_blocks[1], Parapgraph)

    def test_code_block(self):
        """<pre><code class="language-X"> becomes a CodeBlock whose language is X."""
        html = '''
        <pre><code class="language-python">
        def hello():
            print("Hello, world!")
        </code></pre>
        '''

        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 1)
        self.assertIsInstance(self.document.blocks[0], CodeBlock)

        code_block = self.document.blocks[0]
        self.assertEqual(code_block.language, "python")

    def test_horizontal_rule(self):
        """An <hr> between two paragraphs yields a HorizontalRule block between them."""
        html = '<p>Before</p><hr><p>After</p>'

        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 3)
        self.assertIsInstance(self.document.blocks[0], Parapgraph)
        self.assertIsInstance(self.document.blocks[1], HorizontalRule)
        self.assertIsInstance(self.document.blocks[2], Parapgraph)

    def test_html_entities(self):
        """Named entity references are decoded into their literal characters."""
        # The input must contain entity *references*; raw '<', '>' and '&'
        # would be malformed HTML and would not exercise entity decoding at all.
        html = '<p>Less than: &lt; Greater than: &gt; Ampersand: &amp;</p>'

        self.reader.extract_content(html, self.document)

        paragraph = self.document.blocks[0]
        words = list(paragraph.words())

        # Find the entity words
        word_texts = [word[1].text for word in words]
        self.assertIn('<', word_texts)
        self.assertIn('>', word_texts)
        self.assertIn('&', word_texts)

    def test_nested_elements(self):
        """Blocks nested inside a <div> all surface on the document."""
        html = '''
        <div>
            <h2>Section Title</h2>
            <p>Section content with <strong>important</strong> text.</p>
            <ul>
                <li>List item 1</li>
                <li>List item 2</li>
            </ul>
        </div>
        '''

        self.reader.extract_content(html, self.document)

        # Should have multiple blocks
        self.assertGreater(len(self.document.blocks), 1)

        # Check that we have different types of blocks
        block_types = [type(block).__name__ for block in self.document.blocks]
        self.assertIn('Parapgraph', block_types)
        self.assertIn('Heading', block_types)
        self.assertIn('HList', block_types)

    def test_empty_elements(self):
        """Elements without content still produce one block each."""
        html = '<p></p><div></div><ul></ul>'

        self.reader.extract_content(html, self.document)

        # Empty elements should still create blocks
        self.assertEqual(len(self.document.blocks), 3)

    def test_whitespace_handling(self):
        """Runs of spaces and line breaks between words collapse to word boundaries."""
        # Leading, trailing and repeated whitespace plus a line break, so that
        # the normalization is actually exercised.
        html = '''
        <p>  Word1    Word2
            Word3  </p>
        '''

        self.reader.extract_content(html, self.document)

        paragraph = self.document.blocks[0]
        words = list(paragraph.words())

        # Should normalize whitespace and create separate words
        word_texts = [word[1].text for word in words]
        self.assertEqual(word_texts, ["Word1", "Word2", "Word3"])

    def test_base_url_setting(self):
        """set_base_url forwards the URL to the reader's inline handler."""
        base_url = "https://example.com/path/"
        self.reader.set_base_url(base_url)

        # The base URL should be passed to the inline handler
        self.assertEqual(self.reader.inline_handler.base_url, base_url)

    def test_complex_document(self):
        """A full document with head, body and mixed content yields every expected block type."""
        html = '''
        <!DOCTYPE html>
        <html>
        <head>
            <title>Test Document</title>
            <style>body { font-family: Arial; }</style>
        </head>
        <body>
            <h1>Main Title</h1>
            <p>Introduction paragraph with <em>emphasis</em>.</p>

            <h2>Section 1</h2>
            <p>Content with <a href="link.html">a link</a>.</p>

            <ul>
                <li>Item 1</li>
                <li>Item 2 with <strong>bold text</strong></li>
            </ul>

            <h2>Section 2</h2>
            <blockquote>
                <p>A quoted paragraph.</p>
            </blockquote>

            <table>
                <tr><th>Col1</th><th>Col2</th></tr>
                <tr><td>A</td><td>B</td></tr>
            </table>
        </body>
        </html>
        '''

        self.reader.extract_content(html, self.document)

        # Should have parsed multiple blocks
        self.assertGreater(len(self.document.blocks), 5)

        # Should have different types of content
        block_types = set(type(block).__name__ for block in self.document.blocks)
        expected_types = {'Heading', 'Parapgraph', 'HList', 'Quote', 'Table'}
        self.assertTrue(expected_types.issubset(block_types))
|
||||||
|
|
||||||
|
|
||||||
|
# Run this module's tests when the file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
|
||||||
182
tests/test_html_style.py
Normal file
182
tests/test_html_style.py
Normal file
@ -0,0 +1,182 @@
|
|||||||
|
"""
|
||||||
|
Unit tests for HTML style management.
|
||||||
|
|
||||||
|
Tests the HTMLStyleManager class for CSS parsing, style stacks, and font creation.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
from pyWebLayout.io.readers.html_style import HTMLStyleManager
|
||||||
|
from pyWebLayout.style import FontStyle, FontWeight, TextDecoration
|
||||||
|
|
||||||
|
|
||||||
|
class TestHTMLStyleManager(unittest.TestCase):
    """Checks for HTMLStyleManager: defaults, style stack, CSS parsing, font creation."""

    def setUp(self):
        """Give every test its own style manager."""
        self.style_manager = HTMLStyleManager()

    def test_initialization(self):
        """A new manager reports the expected default style values."""
        defaults = self.style_manager.get_current_style()

        self.assertEqual(defaults['font_size'], 12)
        self.assertEqual(defaults['font_weight'], FontWeight.NORMAL)
        self.assertEqual(defaults['font_style'], FontStyle.NORMAL)
        self.assertEqual(defaults['decoration'], TextDecoration.NONE)
        self.assertEqual(defaults['color'], (0, 0, 0))
        self.assertIsNone(defaults['background'])
        self.assertEqual(defaults['language'], 'en_US')

    def test_style_stack_operations(self):
        """Pushing overlays the given keys; popping restores the previous style."""
        manager = self.style_manager
        before = manager.get_current_style()

        manager.push_style({'font_size': 16, 'font_weight': FontWeight.BOLD})

        overlaid = manager.get_current_style()
        self.assertEqual(overlaid['font_size'], 16)
        self.assertEqual(overlaid['font_weight'], FontWeight.BOLD)
        self.assertEqual(overlaid['color'], (0, 0, 0))  # key not pushed, so unchanged

        manager.pop_style()
        after = manager.get_current_style()
        self.assertEqual(after, before)

    def test_tag_styles(self):
        """Known tags have default styles; unknown tags yield an empty dict."""
        lookup = self.style_manager.get_tag_style

        h1_defaults = lookup('h1')
        self.assertEqual(h1_defaults['font_size'], 24)
        self.assertEqual(h1_defaults['font_weight'], FontWeight.BOLD)

        h6_defaults = lookup('h6')
        self.assertEqual(h6_defaults['font_size'], 12)
        self.assertEqual(h6_defaults['font_weight'], FontWeight.BOLD)

        em_defaults = lookup('em')
        self.assertEqual(em_defaults['font_style'], FontStyle.ITALIC)

        self.assertEqual(lookup('unknown'), {})

    def test_inline_style_parsing(self):
        """Inline CSS declarations are translated into style-dict entries."""
        parse = self.style_manager.parse_inline_style

        # font-size in px and in pt
        parsed = parse('font-size: 18px')
        self.assertEqual(parsed['font_size'], 18)

        parsed = parse('font-size: 14pt')
        self.assertEqual(parsed['font_size'], 14)

        # font-weight
        parsed = parse('font-weight: bold')
        self.assertEqual(parsed['font_weight'], FontWeight.BOLD)

        # font-style
        parsed = parse('font-style: italic')
        self.assertEqual(parsed['font_style'], FontStyle.ITALIC)

        # text-decoration
        parsed = parse('text-decoration: underline')
        self.assertEqual(parsed['decoration'], TextDecoration.UNDERLINE)

        # Several declarations in one attribute value
        parsed = parse('font-size: 20px; font-weight: bold; color: red')
        self.assertEqual(parsed['font_size'], 20)
        self.assertEqual(parsed['font_weight'], FontWeight.BOLD)
        self.assertEqual(parsed['color'], (255, 0, 0))

    def test_color_parsing(self):
        """Named, hex, rgb() and rgba() colors parse to RGB tuples; invalid ones to None."""
        parse = self.style_manager.parse_color

        valid_cases = (
            # Named colors
            ('red', (255, 0, 0)),
            ('blue', (0, 0, 255)),
            ('white', (255, 255, 255)),
            ('gray', (128, 128, 128)),
            ('grey', (128, 128, 128)),
            # Hex colors, long and short form
            ('#ff0000', (255, 0, 0)),
            ('#00ff00', (0, 255, 0)),
            ('#f00', (255, 0, 0)),
            ('#0f0', (0, 255, 0)),
            # rgb(), including extra whitespace
            ('rgb(255, 0, 0)', (255, 0, 0)),
            ('rgb(128, 128, 128)', (128, 128, 128)),
            ('rgb( 255 , 255 , 255 )', (255, 255, 255)),
            # rgba(): the alpha component is ignored
            ('rgba(255, 0, 0, 0.5)', (255, 0, 0)),
        )
        for css_value, rgb in valid_cases:
            self.assertEqual(parse(css_value), rgb)

        # Unknown names, bad hex digits and out-of-range components give None.
        for css_value in ('invalid', '#gg0000', 'rgb(300, 0, 0)'):
            self.assertIsNone(parse(css_value))

    def test_color_clamping(self):
        """RGB components outside 0-255 are rejected (None) rather than clamped."""
        parsed = self.style_manager.parse_color('rgb(300, -10, 128)')
        self.assertIsNone(parsed)

    def test_apply_style_to_element(self):
        """Tag defaults and inline declarations are merged, inline taking precedence."""
        apply_style = self.style_manager.apply_style_to_element

        # h1 with an inline style attribute
        merged = apply_style('h1', {'style': 'color: blue; font-size: 30px'})

        self.assertEqual(merged['font_size'], 30)  # inline value replaces the h1 default
        self.assertEqual(merged['font_weight'], FontWeight.BOLD)  # h1 default
        self.assertEqual(merged['color'], (0, 0, 255))  # inline value

        # Tag defaults only, no attributes at all
        merged = apply_style('strong', {})
        self.assertEqual(merged['font_weight'], FontWeight.BOLD)

    def test_reset(self):
        """reset() drops every pushed style and restores the defaults."""
        manager = self.style_manager

        for overlay in ({'font_size': 20}, {'color': (255, 0, 0)}):
            manager.push_style(overlay)

        manager.reset()

        restored = manager.get_current_style()
        self.assertEqual(restored['font_size'], 12)
        self.assertEqual(restored['color'], (0, 0, 0))
        self.assertEqual(len(manager._style_stack), 0)

    def test_font_creation(self):
        """create_font() builds a Font that mirrors the current style."""
        overlay = {
            'font_size': 16,
            'font_weight': FontWeight.BOLD,
            'font_style': FontStyle.ITALIC,
            'decoration': TextDecoration.UNDERLINE,
            'color': (255, 0, 0),
            'background': (255, 255, 0, 255)
        }
        self.style_manager.push_style(overlay)

        built = self.style_manager.create_font()

        self.assertEqual(built.font_size, 16)
        self.assertEqual(built.weight, FontWeight.BOLD)
        self.assertEqual(built.style, FontStyle.ITALIC)
        self.assertEqual(built.decoration, TextDecoration.UNDERLINE)
        self.assertEqual(built.colour, (255, 0, 0))
        self.assertEqual(built.background, (255, 255, 0, 255))
|
||||||
|
|
||||||
|
|
||||||
|
# Run this module's tests when the file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
|
||||||
247
tests/test_html_text.py
Normal file
247
tests/test_html_text.py
Normal file
@ -0,0 +1,247 @@
|
|||||||
|
"""
|
||||||
|
Unit tests for HTML text processing.
|
||||||
|
|
||||||
|
Tests the HTMLTextProcessor class for text buffering, entity handling, and word creation.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
from unittest.mock import Mock, MagicMock
|
||||||
|
from pyWebLayout.io.readers.html_text import HTMLTextProcessor
|
||||||
|
from pyWebLayout.io.readers.html_style import HTMLStyleManager
|
||||||
|
from pyWebLayout.abstract.block import Parapgraph
|
||||||
|
from pyWebLayout.abstract.inline import Word
|
||||||
|
|
||||||
|
|
||||||
|
class TestHTMLTextProcessor(unittest.TestCase):
|
||||||
|
"""Test cases for HTMLTextProcessor."""
|
||||||
|
|
||||||
|
def setUp(self):
    """Build a text processor on a real style manager, plus a paragraph double."""
    manager = HTMLStyleManager()
    self.style_manager = manager
    self.text_processor = HTMLTextProcessor(manager)

    # Paragraph stand-in restricted to the Parapgraph interface, with
    # add_word replaced by its own Mock.
    paragraph_double = Mock(spec=Parapgraph)
    paragraph_double.add_word = Mock()
    self.mock_paragraph = paragraph_double
|
||||||
|
|
||||||
|
def test_initialization(self):
    """A new processor has an empty buffer, no paragraph, and the given style manager."""
    processor = self.text_processor

    self.assertEqual(processor._text_buffer, "")
    self.assertIsNone(processor._current_paragraph)
    self.assertEqual(processor._style_manager, self.style_manager)
|
||||||
|
|
||||||
|
def test_add_text(self):
|
||||||
|
"""Test adding text to buffer."""
|
||||||
|
self.text_processor.add_text("Hello")
|
||||||
|
self.assertEqual(self.text_processor.get_buffer_content(), "Hello")
|
||||||
|
|
||||||
|
self.text_processor.add_text(" World")
|
||||||
|
self.assertEqual(self.text_processor.get_buffer_content(), "Hello World")
|
||||||
|
|
||||||
|
def test_entity_references(self):
|
||||||
|
"""Test HTML entity reference handling."""
|
||||||
|
test_cases = [
|
||||||
|
('lt', '<'),
|
||||||
|
('gt', '>'),
|
||||||
|
('amp', '&'),
|
||||||
|
('quot', '"'),
|
||||||
|
('apos', "'"),
|
||||||
|
('nbsp', ' '),
|
||||||
|
('copy', '©'),
|
||||||
|
('reg', '®'),
|
||||||
|
('trade', '™'),
|
||||||
|
('mdash', '—'),
|
||||||
|
('ndash', '–'),
|
||||||
|
('hellip', '…'),
|
||||||
|
('euro', '€'),
|
||||||
|
('unknown', '&unknown;') # Unknown entities should be preserved
|
||||||
|
]
|
||||||
|
|
||||||
|
for entity, expected in test_cases:
|
||||||
|
with self.subTest(entity=entity):
|
||||||
|
self.text_processor.clear_buffer()
|
||||||
|
self.text_processor.add_entity_reference(entity)
|
||||||
|
self.assertEqual(self.text_processor.get_buffer_content(), expected)
|
||||||
|
|
||||||
|
def test_character_references(self):
|
||||||
|
"""Test character reference handling."""
|
||||||
|
# Decimal character references
|
||||||
|
self.text_processor.clear_buffer()
|
||||||
|
self.text_processor.add_character_reference('65') # 'A'
|
||||||
|
self.assertEqual(self.text_processor.get_buffer_content(), 'A')
|
||||||
|
|
||||||
|
# Hexadecimal character references
|
||||||
|
self.text_processor.clear_buffer()
|
||||||
|
self.text_processor.add_character_reference('x41') # 'A'
|
||||||
|
self.assertEqual(self.text_processor.get_buffer_content(), 'A')
|
||||||
|
|
||||||
|
# Unicode character
|
||||||
|
self.text_processor.clear_buffer()
|
||||||
|
self.text_processor.add_character_reference('8364') # Euro symbol
|
||||||
|
self.assertEqual(self.text_processor.get_buffer_content(), '€')
|
||||||
|
|
||||||
|
# Invalid character reference
|
||||||
|
self.text_processor.clear_buffer()
|
||||||
|
self.text_processor.add_character_reference('invalid')
|
||||||
|
self.assertEqual(self.text_processor.get_buffer_content(), '&#invalid;')
|
||||||
|
|
||||||
|
# Out of range character
|
||||||
|
self.text_processor.clear_buffer()
|
||||||
|
self.text_processor.add_character_reference('99999999999')
|
||||||
|
self.assertTrue(self.text_processor.get_buffer_content().startswith('&#'))
|
||||||
|
|
||||||
|
def test_buffer_operations(self):
|
||||||
|
"""Test buffer state operations."""
|
||||||
|
# Test has_pending_text
|
||||||
|
self.assertFalse(self.text_processor.has_pending_text())
|
||||||
|
|
||||||
|
self.text_processor.add_text("Some text")
|
||||||
|
self.assertTrue(self.text_processor.has_pending_text())
|
||||||
|
|
||||||
|
# Test clear_buffer
|
||||||
|
self.text_processor.clear_buffer()
|
||||||
|
self.assertFalse(self.text_processor.has_pending_text())
|
||||||
|
self.assertEqual(self.text_processor.get_buffer_content(), "")
|
||||||
|
|
||||||
|
# Test with whitespace only
|
||||||
|
self.text_processor.add_text(" \n\t ")
|
||||||
|
self.assertFalse(self.text_processor.has_pending_text()) # Should ignore whitespace
|
||||||
|
|
||||||
|
def test_paragraph_management(self):
|
||||||
|
"""Test current paragraph setting."""
|
||||||
|
# Initially no paragraph
|
||||||
|
self.assertIsNone(self.text_processor._current_paragraph)
|
||||||
|
|
||||||
|
# Set paragraph
|
||||||
|
self.text_processor.set_current_paragraph(self.mock_paragraph)
|
||||||
|
self.assertEqual(self.text_processor._current_paragraph, self.mock_paragraph)
|
||||||
|
|
||||||
|
# Clear paragraph
|
||||||
|
self.text_processor.set_current_paragraph(None)
|
||||||
|
self.assertIsNone(self.text_processor._current_paragraph)
|
||||||
|
|
||||||
|
def test_flush_text_with_paragraph(self):
|
||||||
|
"""Test flushing text when paragraph is set."""
|
||||||
|
self.text_processor.set_current_paragraph(self.mock_paragraph)
|
||||||
|
self.text_processor.add_text("Hello world test")
|
||||||
|
|
||||||
|
# Mock the style manager to return a specific font
|
||||||
|
mock_font = Mock()
|
||||||
|
self.style_manager.create_font = Mock(return_value=mock_font)
|
||||||
|
|
||||||
|
result = self.text_processor.flush_text()
|
||||||
|
|
||||||
|
# Should return True (text was flushed)
|
||||||
|
self.assertTrue(result)
|
||||||
|
|
||||||
|
# Should have created words
|
||||||
|
self.assertEqual(self.mock_paragraph.add_word.call_count, 3) # "Hello", "world", "test"
|
||||||
|
|
||||||
|
# Verify the words were created with correct text
|
||||||
|
calls = self.mock_paragraph.add_word.call_args_list
|
||||||
|
word_texts = [call[0][0].text for call in calls]
|
||||||
|
self.assertEqual(word_texts, ["Hello", "world", "test"])
|
||||||
|
|
||||||
|
# Buffer should be empty after flush
|
||||||
|
self.assertEqual(self.text_processor.get_buffer_content(), "")
|
||||||
|
|
||||||
|
def test_flush_text_without_paragraph(self):
|
||||||
|
"""Test flushing text when no paragraph is set."""
|
||||||
|
self.text_processor.add_text("Hello world")
|
||||||
|
|
||||||
|
result = self.text_processor.flush_text()
|
||||||
|
|
||||||
|
# Should return False (no paragraph to flush to)
|
||||||
|
self.assertFalse(result)
|
||||||
|
|
||||||
|
# Buffer should be cleared anyway
|
||||||
|
self.assertEqual(self.text_processor.get_buffer_content(), "")
|
||||||
|
|
||||||
|
def test_flush_empty_buffer(self):
|
||||||
|
"""Test flushing when buffer is empty."""
|
||||||
|
self.text_processor.set_current_paragraph(self.mock_paragraph)
|
||||||
|
|
||||||
|
result = self.text_processor.flush_text()
|
||||||
|
|
||||||
|
# Should return False (nothing to flush)
|
||||||
|
self.assertFalse(result)
|
||||||
|
|
||||||
|
# No words should be added
|
||||||
|
self.mock_paragraph.add_word.assert_not_called()
|
||||||
|
|
||||||
|
def test_flush_whitespace_only(self):
|
||||||
|
"""Test flushing when buffer contains only whitespace."""
|
||||||
|
self.text_processor.set_current_paragraph(self.mock_paragraph)
|
||||||
|
self.text_processor.add_text(" \n\t ")
|
||||||
|
|
||||||
|
result = self.text_processor.flush_text()
|
||||||
|
|
||||||
|
# Should return False (no meaningful content)
|
||||||
|
self.assertFalse(result)
|
||||||
|
|
||||||
|
# No words should be added
|
||||||
|
self.mock_paragraph.add_word.assert_not_called()
|
||||||
|
|
||||||
|
def test_word_creation_with_styling(self):
|
||||||
|
"""Test that words are created with proper styling."""
|
||||||
|
self.text_processor.set_current_paragraph(self.mock_paragraph)
|
||||||
|
self.text_processor.add_text("styled text")
|
||||||
|
|
||||||
|
# Set up style manager to return specific font
|
||||||
|
mock_font = Mock()
|
||||||
|
mock_font.font_size = 16
|
||||||
|
mock_font.weight = "bold"
|
||||||
|
self.style_manager.create_font = Mock(return_value=mock_font)
|
||||||
|
|
||||||
|
self.text_processor.flush_text()
|
||||||
|
|
||||||
|
# Verify font was created
|
||||||
|
self.style_manager.create_font.assert_called()
|
||||||
|
|
||||||
|
# Verify words were created with the font
|
||||||
|
calls = self.mock_paragraph.add_word.call_args_list
|
||||||
|
for call in calls:
|
||||||
|
word = call[0][0]
|
||||||
|
self.assertEqual(word.style, mock_font)
|
||||||
|
|
||||||
|
def test_reset(self):
|
||||||
|
"""Test resetting the text processor."""
|
||||||
|
# Set up some state
|
||||||
|
self.text_processor.set_current_paragraph(self.mock_paragraph)
|
||||||
|
self.text_processor.add_text("Some text")
|
||||||
|
|
||||||
|
# Reset
|
||||||
|
self.text_processor.reset()
|
||||||
|
|
||||||
|
# Should be back to initial state
|
||||||
|
self.assertEqual(self.text_processor._text_buffer, "")
|
||||||
|
self.assertIsNone(self.text_processor._current_paragraph)
|
||||||
|
|
||||||
|
def test_complex_text_processing(self):
|
||||||
|
"""Test processing text with mixed content."""
|
||||||
|
self.text_processor.set_current_paragraph(self.mock_paragraph)
|
||||||
|
|
||||||
|
# Mock font creation
|
||||||
|
mock_font = Mock()
|
||||||
|
self.style_manager.create_font = Mock(return_value=mock_font)
|
||||||
|
|
||||||
|
# Add mixed content
|
||||||
|
self.text_processor.add_text("Hello ")
|
||||||
|
self.text_processor.add_entity_reference('amp')
|
||||||
|
self.text_processor.add_text(" world")
|
||||||
|
self.text_processor.add_character_reference('33') # '!'
|
||||||
|
|
||||||
|
# Should have "Hello & world!"
|
||||||
|
expected_content = "Hello & world!"
|
||||||
|
self.assertEqual(self.text_processor.get_buffer_content(), expected_content)
|
||||||
|
|
||||||
|
# Flush and verify words
|
||||||
|
self.text_processor.flush_text()
|
||||||
|
|
||||||
|
calls = self.mock_paragraph.add_word.call_args_list
|
||||||
|
word_texts = [call[0][0].text for call in calls]
|
||||||
|
self.assertEqual(word_texts, ["Hello", "&", "world!"])
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Executed directly as a script: hand control to unittest's CLI runner.
    unittest.main()
|
||||||
84
tests/test_runner.py
Normal file
84
tests/test_runner.py
Normal file
@ -0,0 +1,84 @@
|
|||||||
|
"""
|
||||||
|
Test runner for pyWebLayout.
|
||||||
|
|
||||||
|
This script runs all unit tests and provides a summary of results.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
# Add the project root to the Python path
|
||||||
|
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||||
|
|
||||||
|
|
||||||
|
def run_all_tests():
    """Discover and run every ``test_*.py`` module in this file's directory.

    unittest's verbose output is written to stdout, followed by a summary
    block listing the counts and the names of failing or erroring tests.

    Returns:
        bool: True when ``TestResult.wasSuccessful()`` reports success (no
        failures, no errors and no unexpected successes), False otherwise.
    """
    # Discover all tests that live next to this runner.
    loader = unittest.TestLoader()
    start_dir = os.path.dirname(os.path.abspath(__file__))
    suite = loader.discover(start_dir, pattern='test_*.py')

    # Run tests with detailed output
    runner = unittest.TextTestRunner(
        verbosity=2,
        stream=sys.stdout,
        descriptions=True,
        failfast=False,
    )
    result = runner.run(suite)

    # Print summary
    separator = "=" * 70
    print("\n" + separator)
    print("TEST SUMMARY")
    print(separator)
    print(f"Tests run: {result.testsRun}")
    print(f"Failures: {len(result.failures)}")
    print(f"Errors: {len(result.errors)}")
    print(f"Skipped: {len(result.skipped)}")

    # Each entry is a (test, formatted_traceback) pair; only the test is listed.
    for heading, entries in (("FAILURES", result.failures),
                             ("ERRORS", result.errors)):
        if entries:
            print(f"\n{heading} ({len(entries)}):")
            for test, _ in entries:
                print(f"- {test}")

    # Only mentioned when present so the usual summary layout is unchanged.
    if result.unexpectedSuccesses:
        print(f"\nUNEXPECTED SUCCESSES ({len(result.unexpectedSuccesses)}):")
        for test in result.unexpectedSuccesses:
            print(f"- {test}")

    # wasSuccessful() also fails the run on unexpected successes, which a
    # plain failures/errors count would report as PASSED.
    success = result.wasSuccessful()
    print(f"\nResult: {'PASSED' if success else 'FAILED'}")
    print(separator)

    return success
|
||||||
|
|
||||||
|
|
||||||
|
def run_specific_test(test_module):
    """Run the tests of a single module.

    Args:
        test_module: Dotted name handed to ``TestLoader.loadTestsFromName``,
            for example ``"test_html_text"``.

    Returns:
        bool: True when ``TestResult.wasSuccessful()`` reports success (no
        failures, no errors and no unexpected successes), False otherwise.
    """
    suite = unittest.TestLoader().loadTestsFromName(test_module)
    result = unittest.TextTestRunner(verbosity=2).run(suite)

    # wasSuccessful() also fails the run on unexpected successes, which a
    # plain failures/errors count would silently accept.
    return result.wasSuccessful()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    cli_args = sys.argv[1:]
    if cli_args:
        # A module was named on the command line: normalise it to the bare
        # "test_<name>" module name, accepting "<name>", "test_<name>" and
        # either form with a trailing ".py".
        module_name = cli_args[0]
        if module_name.endswith('.py'):
            module_name = module_name[:-3]
        if not module_name.startswith('test_'):
            module_name = f'test_{module_name}'
        success = run_specific_test(module_name)
    else:
        # No argument: run the whole suite.
        success = run_all_tests()

    # Exit status 0 on success, 1 otherwise.
    sys.exit(0 if success else 1)
|
||||||
Loading…
x
Reference in New Issue
Block a user