This commit is contained in:
parent
ab84691278
commit
ad0ac238f3
@ -1011,14 +1011,246 @@ class Table(Block):
|
||||
elif section.lower() == "footer":
|
||||
self._footer_rows.append(row)
|
||||
else: # Default to body
|
||||
self._rows
|
||||
self._rows.append(row)
|
||||
|
||||
def create_row(self, section: str = "body", style=None) -> TableRow:
    """
    Build a new row and attach it to this table in a single call.

    The work is delegated to ``TableRow.create_and_add_to``, which receives
    this table, the section name and the style override unchanged.

    Args:
        section: Target section for the row ("header", "body", or "footer")
        style: Optional style override. If None, inherits from table

    Returns:
        The TableRow object produced by the factory
    """
    new_row = TableRow.create_and_add_to(self, section, style)
    return new_row
|
||||
|
||||
def header_rows(self) -> Iterator[TableRow]:
    """
    Lazily walk the rows stored in the header section.

    Yields:
        Every TableRow of the header section, in stored order
    """
    yield from self._header_rows
|
||||
|
||||
def body_rows(self) -> Iterator[TableRow]:
    """
    Lazily walk the rows stored in the body section.

    Yields:
        Every TableRow of the body section, in stored order
    """
    yield from self._rows
|
||||
|
||||
def footer_rows(self) -> Iterator[TableRow]:
    """
    Lazily walk the rows stored in the footer section.

    Yields:
        Every TableRow of the footer section, in stored order
    """
    yield from self._footer_rows
|
||||
|
||||
def all_rows(self) -> Iterator[Tuple[str, TableRow]]:
    """
    Walk every row of the table, tagging each one with its section name.

    Sections are visited in the order header, body, footer.

    Yields:
        (section, row) tuples where section is "header", "body" or "footer"
    """
    # Attribute names are resolved one section at a time, so each row list is
    # only looked up when iteration actually reaches that section.
    section_attributes = (
        ("header", "_header_rows"),
        ("body", "_rows"),
        ("footer", "_footer_rows"),
    )
    for label, attribute in section_attributes:
        for row in getattr(self, attribute):
            yield (label, row)
|
||||
|
||||
@property
def row_count(self) -> Dict[str, int]:
    """
    Report how many rows each section holds.

    Returns:
        A dict with the keys "header", "body", "footer" and "total",
        where "total" is the sum of the three section counts
    """
    counts = {
        "header": len(self._header_rows),
        "body": len(self._rows),
        "footer": len(self._footer_rows),
    }
    # Added last so the key order stays header, body, footer, total.
    counts["total"] = sum(counts.values())
    return counts
|
||||
|
||||
|
||||
class Image(Block):
    """
    An image element with source, dimensions, and alternative text.
    """

    def __init__(self, source: str = "", alt_text: str = "", width: Optional[int] = None, height: Optional[int] = None):
        """
        Initialize an image element.

        Args:
            source: The image source URL or path
            alt_text: Alternative text for accessibility
            width: Optional image width in pixels
            height: Optional image height in pixels
        """
        super().__init__(BlockType.IMAGE)
        self._source = source
        self._alt_text = alt_text
        self._width = width
        self._height = height

    @classmethod
    def create_and_add_to(cls, container, source: str = "", alt_text: str = "",
                          width: Optional[int] = None, height: Optional[int] = None) -> 'Image':
        """
        Create a new Image and add it to a container.

        Args:
            container: The container to add the image to (must have add_block method)
            source: The image source URL or path
            alt_text: Alternative text for accessibility
            width: Optional image width in pixels
            height: Optional image height in pixels

        Returns:
            The newly created Image object

        Raises:
            AttributeError: If the container doesn't have the required add_block method
        """
        # The image is built before the container is checked, so a failing
        # constructor surfaces before the container error does.
        image = cls(source, alt_text, width, height)

        if not hasattr(container, 'add_block'):
            raise AttributeError(f"Container {type(container).__name__} must have an 'add_block' method")

        container.add_block(image)
        return image

    @property
    def source(self) -> str:
        """Get the image source"""
        return self._source

    @source.setter
    def source(self, source: str):
        """Set the image source"""
        self._source = source

    @property
    def alt_text(self) -> str:
        """Get the alternative text"""
        return self._alt_text

    @alt_text.setter
    def alt_text(self, alt_text: str):
        """Set the alternative text"""
        self._alt_text = alt_text

    @property
    def width(self) -> Optional[int]:
        """Get the image width"""
        return self._width

    @width.setter
    def width(self, width: Optional[int]):
        """Set the image width"""
        self._width = width

    @property
    def height(self) -> Optional[int]:
        """Get the image height"""
        return self._height

    @height.setter
    def height(self, height: Optional[int]):
        """Set the image height"""
        self._height = height

    def get_dimensions(self) -> Tuple[Optional[int], Optional[int]]:
        """
        Get the image dimensions as a tuple.

        Returns:
            Tuple of (width, height)
        """
        return (self._width, self._height)

    def get_aspect_ratio(self) -> Optional[float]:
        """
        Calculate the aspect ratio of the image.

        Returns:
            The aspect ratio (width/height), or None if either dimension is
            missing or the height is not positive
        """
        if self._width is None or self._height is None:
            return None
        if self._height <= 0:
            return None
        return self._width / self._height

    @staticmethod
    def _scale_dimension(value: int, limit: int, reference: int) -> int:
        """
        Scale ``value`` by the ratio ``limit / reference``, truncating to int.

        A positive dimension scaled against a positive limit never collapses
        below one pixel; without this, extreme aspect ratios (for example
        1000x1 limited to a width of 100) would yield a zero-sized image.
        """
        scaled = int(value * limit / reference)
        if scaled < 1 and value > 0 and limit > 0:
            return 1
        return scaled

    def calculate_scaled_dimensions(self, max_width: Optional[int] = None,
                                    max_height: Optional[int] = None) -> Tuple[Optional[int], Optional[int]]:
        """
        Calculate scaled dimensions that fit within the given constraints.

        The image is only ever scaled down. The width constraint is applied
        first, then the height constraint is applied to the intermediate
        result; each step keeps the aspect ratio up to integer truncation.

        Args:
            max_width: Maximum allowed width
            max_height: Maximum allowed height

        Returns:
            Tuple of (scaled_width, scaled_height). If either stored
            dimension is None the stored dimensions are returned unchanged.
        """
        if self._width is None or self._height is None:
            return (self._width, self._height)

        width, height = self._width, self._height

        # Scale down if needed
        if max_width is not None and width > max_width:
            height = self._scale_dimension(height, max_width, width)
            width = max_width

        if max_height is not None and height > max_height:
            width = self._scale_dimension(width, max_height, height)
            height = max_height

        return (width, height)
|
||||
|
||||
class HorizontalRule(Block):
    """
    A horizontal rule element (hr tag).
    """

    def __init__(self):
        """Initialize a horizontal rule element."""
        super().__init__(BlockType.HORIZONTAL_RULE)

    @classmethod
    def create_and_add_to(cls, container) -> 'HorizontalRule':
        """
        Build a HorizontalRule and hand it to a container.

        Args:
            container: Receiver of the new rule; it must expose an add_block method

        Returns:
            The HorizontalRule that was created

        Raises:
            AttributeError: If the container doesn't have the required add_block method
        """
        # The rule is constructed first; the container is validated afterwards.
        rule = cls()

        if not hasattr(container, 'add_block'):
            raise AttributeError(f"Container {type(container).__name__} must have an 'add_block' method")

        container.add_block(rule)
        return rule
|
||||
|
||||
@ -124,6 +124,11 @@ class Button(Interactable):
|
||||
"""Enable or disable the button"""
|
||||
self._enabled = enabled
|
||||
|
||||
@property
def params(self) -> Dict[str, Any]:
    """Return the parameter mapping stored on this button (the same object, not a copy)."""
    stored_params = self._params
    return stored_params
|
||||
|
||||
def execute(self) -> Any:
|
||||
"""
|
||||
Execute the button's callback function if the button is enabled.
|
||||
|
||||
@ -2,6 +2,7 @@ from __future__ import annotations
|
||||
from pyWebLayout.base import Queriable
|
||||
from pyWebLayout.style import Font
|
||||
from typing import Tuple, Union, List, Optional, Dict
|
||||
import pyphen
|
||||
|
||||
|
||||
class Word:
|
||||
@ -157,9 +158,6 @@ class Word:
|
||||
Returns:
|
||||
bool: True if the word can be hyphenated, False otherwise.
|
||||
"""
|
||||
# Only import pyphen when needed
|
||||
import pyphen
|
||||
|
||||
# Use the provided language or fall back to style language
|
||||
lang = language if language else self._style.language
|
||||
dic = pyphen.Pyphen(lang=lang)
|
||||
@ -178,9 +176,6 @@ class Word:
|
||||
Returns:
|
||||
bool: True if the word was hyphenated, False otherwise.
|
||||
"""
|
||||
# Only import pyphen when needed
|
||||
import pyphen
|
||||
|
||||
# Use the provided language or fall back to style language
|
||||
lang = language if language else self._style.language
|
||||
dic = pyphen.Pyphen(lang=lang)
|
||||
@ -333,5 +328,58 @@ class FormattedSpan:
|
||||
|
||||
|
||||
class LineBreak:
    """
    A line break element that forces a new line within text content.

    While this is an inline element that can occur within paragraphs,
    it has block-like properties for consistency with the abstract model.
    """

    def __init__(self):
        """Initialize a line break element."""
        # Import here to avoid circular imports
        from .block import BlockType

        self._parent = None
        self._block_type = BlockType.LINE_BREAK

    @property
    def block_type(self):
        """Block type recorded for this line break"""
        return self._block_type

    @property
    def parent(self):
        """Element currently registered as the parent of this line break, if any"""
        return self._parent

    @parent.setter
    def parent(self, parent):
        """Register the parent element"""
        self._parent = parent

    @classmethod
    def create_and_add_to(cls, container) -> 'LineBreak':
        """
        Build a LineBreak and attach it to a container.

        The first of ``add_line_break``, ``add_element`` and ``add_word``
        that the container exposes is called with the new line break. If
        the container offers none of them, it is only recorded as the line
        break's parent.

        Args:
            container: The container to add the line break to

        Returns:
            The LineBreak that was created
        """
        line_break = cls()

        # Probe the supported adder methods in priority order; some
        # containers treat line breaks like words, hence add_word last.
        for adder_name in ('add_line_break', 'add_element', 'add_word'):
            if hasattr(container, adder_name):
                getattr(container, adder_name)(line_break)
                break
        else:
            # No adder available: set the parent relationship manually
            line_break.parent = container

        return line_break
|
||||
|
||||
@ -21,9 +21,11 @@ from pyWebLayout.io.readers.base import BaseReader, MetadataReader, ContentReade
|
||||
|
||||
# Specialized HTML readers
|
||||
from pyWebLayout.io.readers.html_metadata import HTMLMetadataReader
|
||||
from pyWebLayout.io.readers.html_content import HTMLContentReader
|
||||
from pyWebLayout.io.readers.html_resources import HTMLResourceReader
|
||||
|
||||
# HTML extraction parser (the best approach)
|
||||
from pyWebLayout.io.readers.html_extraction import parse_html_string as parse_html_extraction
|
||||
|
||||
# Specialized EPUB readers
|
||||
from pyWebLayout.io.readers.epub_metadata import EPUBMetadataReader
|
||||
|
||||
|
||||
@ -11,13 +11,8 @@ from .base import BaseReader, MetadataReader, ContentReader, ResourceReader, Com
|
||||
# HTML readers (decomposed)
|
||||
from .html import HTMLReader, read_html, read_html_file, parse_html_string
|
||||
from .html_metadata import HTMLMetadataReader
|
||||
from .html_content import HTMLContentReader
|
||||
from .html_resources import HTMLResourceReader
|
||||
|
||||
# HTML processing components (supporting modules)
|
||||
from .html_style import HTMLStyleManager
|
||||
from .html_text import HTMLTextProcessor
|
||||
from .html_elements import BlockElementHandler, ListElementHandler, TableElementHandler, InlineElementHandler
|
||||
|
||||
# EPUB readers
|
||||
from .epub_reader import read_epub # Legacy
|
||||
@ -29,7 +24,7 @@ __all__ = [
|
||||
|
||||
# HTML readers
|
||||
'HTMLReader', 'read_html', 'read_html_file', 'parse_html_string',
|
||||
'HTMLMetadataReader', 'HTMLContentReader', 'HTMLResourceReader',
|
||||
'HTMLMetadataReader', 'HTMLResourceReader',
|
||||
|
||||
# EPUB readers
|
||||
'read_epub', 'EPUBMetadataReader',
|
||||
|
||||
@ -1,36 +1,33 @@
|
||||
"""
|
||||
Modern HTML reader for pyWebLayout.
|
||||
|
||||
This module provides a decomposed HTML reader that uses specialized
|
||||
readers for metadata, content, and resources, following the pattern
|
||||
established in the abstract module.
|
||||
This module provides an HTML reader that uses the html_extraction module
|
||||
for clean, handler-based parsing using BeautifulSoup.
|
||||
"""
|
||||
|
||||
import os
|
||||
from typing import Union, Optional
|
||||
from pyWebLayout.abstract.document import Document
|
||||
from pyWebLayout.io.readers.base import CompositeReader
|
||||
from pyWebLayout.io.readers.base import BaseReader
|
||||
from pyWebLayout.io.readers.html_metadata import HTMLMetadataReader
|
||||
from pyWebLayout.io.readers.html_content import HTMLContentReader
|
||||
from pyWebLayout.io.readers.html_resources import HTMLResourceReader
|
||||
from pyWebLayout.io.readers.html_extraction import parse_html_string
|
||||
from pyWebLayout.style import Font
|
||||
|
||||
|
||||
class HTMLReader(CompositeReader):
|
||||
class HTMLReader(BaseReader):
|
||||
"""
|
||||
Modern HTML reader using decomposed architecture.
|
||||
Modern HTML reader using the html_extraction parser.
|
||||
|
||||
This reader combines specialized readers for metadata, content,
|
||||
and resources to provide a complete HTML parsing solution.
|
||||
This reader uses the clean, handler-based architecture from html_extraction.py
|
||||
for parsing HTML content into pyWebLayout's abstract document structure.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the HTML reader with all specialized readers."""
|
||||
"""Initialize the HTML reader."""
|
||||
super().__init__()
|
||||
|
||||
# Set up specialized readers
|
||||
self.set_metadata_reader(HTMLMetadataReader())
|
||||
self.set_content_reader(HTMLContentReader())
|
||||
self.set_resource_reader(HTMLResourceReader())
|
||||
self._metadata_reader = HTMLMetadataReader()
|
||||
self._resource_reader = HTMLResourceReader()
|
||||
|
||||
def can_read(self, source: Union[str, bytes]) -> bool:
|
||||
"""
|
||||
@ -76,6 +73,7 @@ class HTMLReader(CompositeReader):
|
||||
- encoding: Character encoding (default: 'utf-8')
|
||||
- extract_metadata: Whether to extract metadata (default: True)
|
||||
- extract_resources: Whether to extract resources (default: True)
|
||||
- base_font: Base font for styling (default: None)
|
||||
|
||||
Returns:
|
||||
The parsed Document
|
||||
@ -85,6 +83,7 @@ class HTMLReader(CompositeReader):
|
||||
encoding = options.get('encoding', 'utf-8')
|
||||
extract_metadata = options.get('extract_metadata', True)
|
||||
extract_resources = options.get('extract_resources', True)
|
||||
base_font = options.get('base_font')
|
||||
|
||||
# Read the HTML content
|
||||
html_content = self._read_html_content(source, encoding)
|
||||
@ -93,10 +92,6 @@ class HTMLReader(CompositeReader):
|
||||
if not base_url and isinstance(source, str) and os.path.isfile(source):
|
||||
base_url = f"file://{os.path.dirname(os.path.abspath(source))}/"
|
||||
|
||||
# Set base URL in content reader
|
||||
if self._content_reader and hasattr(self._content_reader, 'set_base_url'):
|
||||
self._content_reader.set_base_url(base_url)
|
||||
|
||||
# Create a new document
|
||||
document = Document()
|
||||
|
||||
@ -104,9 +99,10 @@ class HTMLReader(CompositeReader):
|
||||
if extract_metadata and self._metadata_reader:
|
||||
self._metadata_reader.extract_metadata(html_content, document)
|
||||
|
||||
# Extract content
|
||||
if self._content_reader:
|
||||
self._content_reader.extract_content(html_content, document)
|
||||
# Parse content using html_extraction
|
||||
blocks = parse_html_string(html_content, base_font)
|
||||
for block in blocks:
|
||||
document.add_block(block)
|
||||
|
||||
# Extract resources if enabled
|
||||
if extract_resources and self._resource_reader:
|
||||
|
||||
@ -1,269 +0,0 @@
|
||||
"""
|
||||
Modern HTML content reader for pyWebLayout.
|
||||
|
||||
This module provides a decomposed HTML content reader that uses specialized
|
||||
handlers and managers for different aspects of HTML parsing.
|
||||
"""
|
||||
|
||||
from html.parser import HTMLParser as BaseHTMLParser
|
||||
from typing import Dict, List, Optional, Tuple, Union, Any
|
||||
from pyWebLayout.abstract.document import Document
|
||||
from pyWebLayout.io.readers.base import ContentReader
|
||||
from pyWebLayout.io.readers.html_style import HTMLStyleManager
|
||||
from pyWebLayout.io.readers.html_text import HTMLTextProcessor
|
||||
from pyWebLayout.io.readers.html_elements import (
|
||||
BlockElementHandler, ListElementHandler, TableElementHandler, InlineElementHandler
|
||||
)
|
||||
|
||||
|
||||
class HTMLContentReader(ContentReader, BaseHTMLParser):
    """
    Modern HTML content reader using decomposed architecture.

    This class orchestrates specialized handlers to parse HTML content
    and convert it to pyWebLayout's abstract document model.

    The style manager is driven as a stack: every start tag that reaches the
    element handlers pushes exactly one style, and the matching end tag pops
    it. Tags that never push (head, script, style, body) never pop, and void
    elements such as <br> pop immediately because no end tag follows them.
    """

    # Void elements per the HTML standard: they have no content and no end tag.
    _VOID_TAGS = frozenset((
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ))

    def __init__(self):
        """Initialize the HTML content reader."""
        BaseHTMLParser.__init__(self)

        # Initialize managers and processors
        self.style_manager = HTMLStyleManager()
        self.text_processor = HTMLTextProcessor(self.style_manager)

        # Initialize element handlers
        self.block_handler = BlockElementHandler(self.style_manager, self.text_processor)
        self.list_handler = ListElementHandler(self.text_processor)
        self.table_handler = TableElementHandler(self.text_processor)
        self.inline_handler = InlineElementHandler(self.text_processor)

        # Document and parsing state
        self._document: Optional[Document] = None
        self._in_head = False
        self._in_script = False
        self._in_style = False

    def extract_content(self, html_content: str, document: Document) -> Any:
        """
        Extract content from HTML.

        Args:
            html_content: The HTML content to parse
            document: The document to populate with content

        Returns:
            The document with populated content
        """
        self._document = document

        # Discard input buffered by a previous parse so that documents
        # cannot leak into each other when the reader is reused.
        BaseHTMLParser.reset(self)
        self._reset_state()

        # Parse the HTML content; close() forces any input that feed() is
        # still holding back to be processed as if end-of-file was reached.
        self.feed(html_content)
        BaseHTMLParser.close(self)

        # Flush any remaining text
        self.text_processor.flush_text()

        return document

    def set_base_url(self, base_url: str):
        """Set the base URL for resolving relative links."""
        self.inline_handler.set_base_url(base_url)

    def _reset_state(self):
        """Reset all handler state and parser flags for new content."""
        # Reset managers and processors
        self.style_manager.reset()
        self.text_processor.reset()

        # Reset element handlers
        self.block_handler.reset()
        self.list_handler.reset()
        self.table_handler.reset()
        self.inline_handler.reset()

        # Reset parser flags
        self._in_head = False
        self._in_script = False
        self._in_style = False

    def _in_skipped_section(self) -> bool:
        """Return True while inside head, script or style content."""
        return self._in_head or self._in_script or self._in_style

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]):
        """Handle the start of an HTML tag."""
        tag = tag.lower()
        attrs_dict = dict(attrs)

        # Skip content in head, script, style (except body)
        if self._should_skip_content(tag):
            return

        # Handle special section markers (these never push a style)
        if self._handle_special_sections_start(tag):
            return

        # Apply styles for this element
        style = self.style_manager.apply_style_to_element(tag, attrs_dict)
        self.style_manager.push_style(style)

        # Delegate to appropriate handler
        self._delegate_start_tag(tag, attrs_dict)

        # Void elements get no end tag, so their style must be popped here;
        # otherwise it would stay on the stack and the next end tag would
        # pop it instead of its own style.
        if tag in self._VOID_TAGS:
            self.style_manager.pop_style()

    def handle_endtag(self, tag: str):
        """Handle the end of an HTML tag."""
        tag = tag.lower()

        # Handle special section markers
        if self._handle_special_sections_end(tag):
            return

        # Skip content in head, script, style
        if self._in_skipped_section():
            return

        # Flush any accumulated text
        self.text_processor.flush_text()

        # Self-closed void elements (<br/>) already popped their style in
        # handle_starttag, and <body> never pushed one: nothing to undo.
        if tag in self._VOID_TAGS or tag == 'body':
            return

        # Delegate to appropriate handler
        self._delegate_end_tag(tag)

        # Pop the style pushed by the matching start tag
        self.style_manager.pop_style()

    def handle_data(self, data: str):
        """Handle text data."""
        if self._in_skipped_section():
            return

        self.text_processor.add_text(data)

    def handle_entityref(self, name: str):
        """
        Handle an HTML entity reference.

        HTMLParser does not call this while convert_charrefs is True, which
        is the default that __init__ uses; it is kept for parsers configured
        otherwise.
        """
        if self._in_skipped_section():
            return

        self.text_processor.add_entity_reference(name)

    def handle_charref(self, name: str):
        """
        Handle a character reference.

        HTMLParser does not call this while convert_charrefs is True, which
        is the default that __init__ uses; it is kept for parsers configured
        otherwise.
        """
        if self._in_skipped_section():
            return

        self.text_processor.add_character_reference(name)

    def _should_skip_content(self, tag: str) -> bool:
        """Check if a start tag should be ignored based on current state."""
        if not self._in_skipped_section():
            return False
        if tag in ('head', 'script', 'style'):
            return False  # Let special section handlers deal with these
        # <body> is let through so that it can end the head section.
        return tag != 'body'

    def _handle_special_sections_start(self, tag: str) -> bool:
        """Handle special section start tags. Returns True if handled."""
        if tag == 'head':
            self._in_head = True
            return True
        elif tag == 'body':
            self._in_head = False
            return True
        elif tag == 'script':
            self._in_script = True
            return True
        elif tag == 'style':
            self._in_style = True
            return True
        return False

    def _handle_special_sections_end(self, tag: str) -> bool:
        """
        Handle special section end tags. Returns True if handled.

        No style is popped here: the corresponding start tags return from
        handle_starttag before any style is pushed, so popping would remove
        the style of an enclosing element.
        """
        if tag == 'head':
            self._in_head = False
            return True
        elif tag == 'script':
            self._in_script = False
            return True
        elif tag == 'style':
            self._in_style = False
            return True
        return False

    def _delegate_start_tag(self, tag: str, attrs: Dict[str, str]):
        """Delegate start tag handling to appropriate handler."""
        # Block elements
        if tag == 'p':
            self.block_handler.handle_paragraph_start(self._document)
        elif tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
            self.block_handler.handle_heading_start(tag, self._document)
        elif tag == 'div':
            self.block_handler.handle_div_start(self._document)
        elif tag == 'blockquote':
            self.block_handler.handle_blockquote_start(self._document)
        elif tag == 'pre':
            self.block_handler.handle_pre_start(self._document)
        elif tag == 'code':
            self.block_handler.handle_code_start(attrs, self._document)

        # List elements
        elif tag in ('ul', 'ol', 'dl'):
            self.list_handler.handle_list_start(tag, self.block_handler, self._document)
        elif tag == 'li':
            self.list_handler.handle_list_item_start(self.block_handler)
        elif tag in ('dt', 'dd'):
            self.list_handler.handle_definition_start(tag, self.block_handler)

        # Table elements
        elif tag == 'table':
            self.table_handler.handle_table_start(attrs, self.block_handler, self._document)
        elif tag in ('thead', 'tbody', 'tfoot'):
            self.table_handler.handle_table_section_start(tag)
        elif tag == 'tr':
            self.table_handler.handle_table_row_start()
        elif tag in ('td', 'th'):
            self.table_handler.handle_table_cell_start(tag, attrs, self.block_handler)

        # Inline elements
        elif tag == 'a':
            self.inline_handler.handle_link_start(attrs)
        elif tag == 'img':
            self.inline_handler.handle_image(attrs, self.block_handler, self._document)
        elif tag == 'br':
            self.inline_handler.handle_line_break(self.block_handler)
        elif tag == 'hr':
            self.inline_handler.handle_horizontal_rule(self.block_handler, self._document)

        # Style-only elements (b, strong, i, em, u, span) and unknown tags
        # need no handler: their styles are already applied by the style manager.

    def _delegate_end_tag(self, tag: str):
        """Delegate end tag handling to appropriate handler."""
        # Block elements
        if tag in ('p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'blockquote', 'pre', 'code'):
            self.block_handler.handle_block_end()

        # List elements
        elif tag in ('ul', 'ol', 'dl'):
            self.list_handler.handle_list_end(self.block_handler)
        elif tag in ('li', 'dt', 'dd'):
            self.list_handler.handle_list_item_end(self.block_handler)

        # Table elements
        elif tag == 'table':
            self.table_handler.handle_table_end(self.block_handler)
        elif tag in ('thead', 'tbody', 'tfoot'):
            self.table_handler.handle_table_section_end()
        elif tag == 'tr':
            self.table_handler.handle_table_row_end()
        elif tag in ('td', 'th'):
            self.table_handler.handle_table_cell_end(self.block_handler)

        # Inline elements
        elif tag == 'a':
            self.inline_handler.handle_link_end()

        # Style-only elements (b, strong, i, em, u, span) and unknown tags
        # need no handler: styles are handled by the style manager.
|
||||
@ -1,473 +0,0 @@
|
||||
"""
|
||||
HTML element handlers for pyWebLayout.
|
||||
|
||||
This module provides specialized handlers for different types of HTML elements,
|
||||
using composition and delegation to handle specific element types.
|
||||
"""
|
||||
|
||||
from typing import Dict, List, Optional, Any
|
||||
import urllib.parse
|
||||
from pyWebLayout.abstract.document import Document
|
||||
from pyWebLayout.abstract.block import (
|
||||
Block, Paragraph, Heading, HeadingLevel, Quote, CodeBlock,
|
||||
HList, ListStyle, ListItem, Table, TableRow, TableCell,
|
||||
HorizontalRule, Image
|
||||
)
|
||||
from pyWebLayout.abstract.inline import LineBreak
|
||||
from pyWebLayout.abstract.functional import Link, LinkType
|
||||
from pyWebLayout.io.readers.html_style import HTMLStyleManager
|
||||
from pyWebLayout.io.readers.html_text import HTMLTextProcessor
|
||||
|
||||
|
||||
class BlockElementHandler:
|
||||
"""Handles block-level HTML elements like paragraphs, headings, divs."""
|
||||
|
||||
def __init__(self, style_manager: HTMLStyleManager, text_processor: HTMLTextProcessor):
|
||||
self.style_manager = style_manager
|
||||
self.text_processor = text_processor
|
||||
self.block_stack: List[Block] = []
|
||||
self.current_block: Optional[Block] = None
|
||||
self.current_paragraph: Optional[Paragraph] = None
|
||||
|
||||
def reset(self):
|
||||
"""Reset the handler state."""
|
||||
self.block_stack = []
|
||||
self.current_block = None
|
||||
self.current_paragraph = None
|
||||
|
||||
def add_block_to_document_or_parent(self, block: Block, document: Document):
|
||||
"""Add a block to the document or current parent block."""
|
||||
if self.current_block and hasattr(self.current_block, 'add_block'):
|
||||
self.current_block.add_block(block)
|
||||
else:
|
||||
document.add_block(block)
|
||||
|
||||
def handle_paragraph_start(self, document: Document):
|
||||
"""Handle the start of a paragraph element."""
|
||||
self.text_processor.flush_text()
|
||||
paragraph = Paragraph()
|
||||
|
||||
self.add_block_to_document_or_parent(paragraph, document)
|
||||
self.block_stack.append(paragraph)
|
||||
self.current_block = paragraph
|
||||
self.current_paragraph = paragraph
|
||||
self.text_processor.set_current_paragraph(paragraph)
|
||||
|
||||
def handle_heading_start(self, tag: str, document: Document):
|
||||
"""Handle the start of a heading element."""
|
||||
self.text_processor.flush_text()
|
||||
|
||||
level_map = {
|
||||
'h1': HeadingLevel.H1, 'h2': HeadingLevel.H2, 'h3': HeadingLevel.H3,
|
||||
'h4': HeadingLevel.H4, 'h5': HeadingLevel.H5, 'h6': HeadingLevel.H6
|
||||
}
|
||||
|
||||
heading = Heading(level=level_map[tag])
|
||||
self.add_block_to_document_or_parent(heading, document)
|
||||
self.block_stack.append(heading)
|
||||
self.current_block = heading
|
||||
self.current_paragraph = heading # Heading inherits from Paragraph
|
||||
self.text_processor.set_current_paragraph(heading)
|
||||
|
||||
def handle_div_start(self, document: Document):
|
||||
"""Handle the start of a div element."""
|
||||
self.text_processor.flush_text()
|
||||
div_para = Paragraph()
|
||||
|
||||
self.add_block_to_document_or_parent(div_para, document)
|
||||
self.block_stack.append(div_para)
|
||||
self.current_block = div_para
|
||||
self.current_paragraph = div_para
|
||||
self.text_processor.set_current_paragraph(div_para)
|
||||
|
||||
def handle_blockquote_start(self, document: Document):
|
||||
"""Handle the start of a blockquote element."""
|
||||
self.text_processor.flush_text()
|
||||
quote = Quote()
|
||||
|
||||
self.add_block_to_document_or_parent(quote, document)
|
||||
self.block_stack.append(quote)
|
||||
self.current_block = quote
|
||||
self.current_paragraph = None
|
||||
self.text_processor.set_current_paragraph(None)
|
||||
|
||||
def handle_pre_start(self, document: Document):
|
||||
"""Handle the start of a pre element."""
|
||||
self.text_processor.flush_text()
|
||||
pre_para = Paragraph()
|
||||
|
||||
self.add_block_to_document_or_parent(pre_para, document)
|
||||
self.block_stack.append(pre_para)
|
||||
self.current_block = pre_para
|
||||
self.current_paragraph = pre_para
|
||||
self.text_processor.set_current_paragraph(pre_para)
|
||||
|
||||
def handle_code_start(self, attrs: Dict[str, str], document: Document):
|
||||
"""Handle the start of a code element."""
|
||||
# If we're inside a pre, replace the paragraph with a code block
|
||||
if self.block_stack and isinstance(self.block_stack[-1], Paragraph):
|
||||
pre_para = self.block_stack.pop()
|
||||
|
||||
# Get the language from class if specified
|
||||
language = ""
|
||||
if 'class' in attrs:
|
||||
class_attr = attrs['class']
|
||||
if class_attr.startswith('language-'):
|
||||
language = class_attr[9:]
|
||||
|
||||
code_block = CodeBlock(language=language)
|
||||
|
||||
# Replace the paragraph with the code block in its parent
|
||||
if pre_para.parent:
|
||||
parent = pre_para.parent
|
||||
if hasattr(parent, '_blocks'):
|
||||
for i, block in enumerate(parent._blocks):
|
||||
if block == pre_para:
|
||||
parent._blocks[i] = code_block
|
||||
code_block.parent = parent
|
||||
break
|
||||
else:
|
||||
# Replace in document blocks
|
||||
for i, block in enumerate(document.blocks):
|
||||
if block == pre_para:
|
||||
document.blocks[i] = code_block
|
||||
break
|
||||
|
||||
self.block_stack.append(code_block)
|
||||
self.current_block = code_block
|
||||
self.current_paragraph = None
|
||||
self.text_processor.set_current_paragraph(None)
|
||||
|
||||
def handle_block_end(self):
    """Close the innermost open block and re-sync the current block state.

    After popping, the new top of the stack (if any) becomes the current
    block; it also becomes the current paragraph when it is a ``Paragraph``.
    The text processor is pointed at the resulting paragraph (or ``None``).
    """
    stack = self.block_stack
    if stack:
        stack.pop()

    if not stack:
        self.current_block = None
        self.current_paragraph = None
    else:
        self.current_block = stack[-1]
        # Only paragraphs can receive text directly.
        self.current_paragraph = (
            self.current_block
            if isinstance(self.current_block, Paragraph)
            else None
        )

    self.text_processor.set_current_paragraph(self.current_paragraph)
|
||||
|
||||
|
||||
class ListElementHandler:
    """Builds list blocks for the ``ul``, ``ol``, ``dl``, ``li``, ``dt`` and ``dd`` tags."""

    def __init__(self, text_processor: HTMLTextProcessor):
        """Store the text processor and start with no open lists.

        Args:
            text_processor: Receives flush requests and paragraph changes.
        """
        self.text_processor = text_processor
        self.list_stack: List[HList] = []

    def reset(self):
        """Forget every list that is currently open."""
        self.list_stack = []

    def _leave_block(self, block_handler: BlockElementHandler):
        """Pop the innermost block and clear the active paragraph."""
        blocks = block_handler.block_stack
        if blocks:
            blocks.pop()
        self._sync_after_pop(block_handler)

    def _sync_after_pop(self, block_handler: BlockElementHandler):
        """Point the block handler at the new stack top with no paragraph."""
        blocks = block_handler.block_stack
        block_handler.current_block = blocks[-1] if blocks else None
        block_handler.current_paragraph = None
        self.text_processor.set_current_paragraph(None)

    def handle_list_start(self, tag: str, block_handler: BlockElementHandler, document: Document):
        """Open a new list for ``ul``, ``ol`` or ``dl``.

        Args:
            tag: The list tag name; any other value raises ``KeyError``.
            block_handler: Block handler whose stack receives the list.
            document: The document being built.
        """
        self.text_processor.flush_text()

        if tag == 'ul':
            list_style = ListStyle.UNORDERED
        elif tag == 'ol':
            list_style = ListStyle.ORDERED
        elif tag == 'dl':
            list_style = ListStyle.DEFINITION
        else:
            raise KeyError(tag)

        new_list = HList(style=list_style)
        block_handler.add_block_to_document_or_parent(new_list, document)

        block_handler.block_stack.append(new_list)
        self.list_stack.append(new_list)
        block_handler.current_block = new_list
        block_handler.current_paragraph = None
        self.text_processor.set_current_paragraph(None)

    def handle_list_item_start(self, block_handler: BlockElementHandler):
        """Open a list item in the innermost list; ignored outside a list."""
        if not self.list_stack:
            return

        self.text_processor.flush_text()

        item = ListItem()
        self.list_stack[-1].add_item(item)

        block_handler.block_stack.append(item)
        block_handler.current_block = item

        # Item text is collected in a paragraph owned by the item.
        paragraph = Paragraph()
        item.add_block(paragraph)
        block_handler.current_paragraph = paragraph
        self.text_processor.set_current_paragraph(paragraph)

    def handle_definition_start(self, tag: str, block_handler: BlockElementHandler):
        """Open a ``dt`` term or a ``dd`` description inside a definition list.

        Nothing happens unless the innermost open list is a definition list.
        """
        if not self.list_stack:
            return
        definition_list = self.list_stack[-1]
        if definition_list.style != ListStyle.DEFINITION:
            return

        self.text_processor.flush_text()

        if tag == 'dt':
            # A term starts a new item and becomes the current block.
            item = ListItem(term="")
            definition_list.add_item(item)
            block_handler.block_stack.append(item)
            block_handler.current_block = item
        elif tag == 'dd' and definition_list._items:
            # A description is appended to the most recent item.
            item = definition_list._items[-1]
        else:
            return

        paragraph = Paragraph()
        item.add_block(paragraph)
        block_handler.current_paragraph = paragraph
        self.text_processor.set_current_paragraph(paragraph)

    def handle_list_end(self, block_handler: BlockElementHandler):
        """Close the innermost list and restore the enclosing block."""
        if block_handler.block_stack:
            block_handler.block_stack.pop()
        if self.list_stack:
            self.list_stack.pop()
        self._sync_after_pop(block_handler)

    def handle_list_item_end(self, block_handler: BlockElementHandler):
        """Close the current list item and restore the enclosing block."""
        self._leave_block(block_handler)
|
||||
|
||||
|
||||
class TableElementHandler:
    """Handles table-related HTML elements (table, tr, td, th, thead, tbody, tfoot).

    Row and section state is tracked per table: when a nested table starts,
    the enclosing table's current row and section are saved and restored
    again once the nested table ends.
    """

    # HTML section tags mapped to the section names passed to Table.add_row;
    # every other tag (including 'tbody') is treated as "body".
    _SECTION_NAMES = {'thead': 'header', 'tfoot': 'footer'}

    def __init__(self, text_processor: HTMLTextProcessor):
        """Store the text processor and start with no open table.

        Args:
            text_processor: Receives flush requests and paragraph changes.
        """
        self.text_processor = text_processor
        self.table_stack: List[Table] = []
        self.current_table_row: Optional[TableRow] = None
        self.current_table_section = "body"
        # (row, section) of each enclosing table while a nested one is open.
        self._outer_table_state = []

    def reset(self):
        """Reset the handler state."""
        self.table_stack = []
        self.current_table_row = None
        self.current_table_section = "body"
        self._outer_table_state = []

    @staticmethod
    def _parse_span(attrs: Dict[str, str], name: str) -> int:
        """Return ``attrs[name]`` as an int, or 1 when missing or not a number."""
        try:
            return int(attrs.get(name, 1))
        except (TypeError, ValueError):
            # TypeError covers valueless attributes, which parse as None.
            return 1

    def _restore_after_pop(self, block_handler: BlockElementHandler):
        """Point the block handler at the new stack top with no paragraph."""
        blocks = block_handler.block_stack
        block_handler.current_block = blocks[-1] if blocks else None
        block_handler.current_paragraph = None
        self.text_processor.set_current_paragraph(None)

    def handle_table_start(self, attrs: Dict[str, str], block_handler: BlockElementHandler, document: Document):
        """Handle the start of a table element.

        Args:
            attrs: Tag attributes; ``summary`` is used as the table caption.
            block_handler: Block handler whose stack receives the table.
            document: The document being built.
        """
        self.text_processor.flush_text()

        table = Table(caption=attrs.get('summary'))
        block_handler.add_block_to_document_or_parent(table, document)
        block_handler.block_stack.append(table)

        # Save the enclosing table's position so a nested table cannot
        # clobber it, and start the new table from a clean state.
        self._outer_table_state.append(
            (self.current_table_row, self.current_table_section)
        )
        self.current_table_row = None
        self.current_table_section = "body"

        self.table_stack.append(table)
        block_handler.current_block = table
        block_handler.current_paragraph = None
        self.text_processor.set_current_paragraph(None)

    def handle_table_section_start(self, tag: str):
        """Remember which section tag (thead/tbody/tfoot) is now open."""
        self.current_table_section = tag

    def handle_table_row_start(self):
        """Add a new row to the innermost table; ignored outside a table."""
        if not self.table_stack:
            return

        self.text_processor.flush_text()
        row = TableRow()

        section = self._SECTION_NAMES.get(self.current_table_section, "body")
        self.table_stack[-1].add_row(row, section=section)
        self.current_table_row = row

    def handle_table_cell_start(self, tag: str, attrs: Dict[str, str], block_handler: BlockElementHandler):
        """Add a ``td``/``th`` cell to the current row; ignored outside a row.

        Args:
            tag: ``'th'`` creates a header cell, anything else a data cell.
            attrs: Tag attributes; ``colspan`` and ``rowspan`` are parsed
                independently, each falling back to 1 when invalid.
            block_handler: Block handler whose stack receives the cell.
        """
        if not self.current_table_row:
            return

        self.text_processor.flush_text()

        cell = TableCell(
            is_header=(tag == 'th'),
            colspan=self._parse_span(attrs, 'colspan'),
            rowspan=self._parse_span(attrs, 'rowspan'),
        )
        self.current_table_row.add_cell(cell)

        block_handler.block_stack.append(cell)
        block_handler.current_block = cell

        # Cell text is collected in a paragraph owned by the cell.
        cell_para = Paragraph()
        cell.add_block(cell_para)
        block_handler.current_paragraph = cell_para
        self.text_processor.set_current_paragraph(cell_para)

    def handle_table_end(self, block_handler: BlockElementHandler):
        """Close the innermost table and restore the enclosing table's state."""
        if block_handler.block_stack:
            block_handler.block_stack.pop()
        if self.table_stack:
            self.table_stack.pop()

        self._restore_after_pop(block_handler)

        if self._outer_table_state:
            # Resume the enclosing table (or the defaults saved for a
            # top-level table) exactly where it was left.
            self.current_table_row, self.current_table_section = (
                self._outer_table_state.pop()
            )
        else:
            self.current_table_row = None
            self.current_table_section = "body"

    def handle_table_section_end(self):
        """Handle the end of a table section by falling back to the body."""
        self.current_table_section = "body"

    def handle_table_row_end(self):
        """Handle the end of a table row."""
        self.current_table_row = None

    def handle_table_cell_end(self, block_handler: BlockElementHandler):
        """Close the current cell and restore the enclosing block."""
        if block_handler.block_stack:
            block_handler.block_stack.pop()
        self._restore_after_pop(block_handler)
|
||||
|
||||
|
||||
class InlineElementHandler:
    """Handles inline and special HTML elements (a, img, br, hr).

    Tracks the link currently being parsed and resolves relative link and
    image locations against an optional base URL.
    """

    def __init__(self, text_processor: HTMLTextProcessor, base_url: Optional[str] = None):
        """Store collaborators and start outside of any link.

        Args:
            text_processor: Receives flush requests.
            base_url: Base URL for resolving relative locations, if any.
        """
        self.text_processor = text_processor
        self.base_url = base_url
        self.in_link = False
        self.current_link: Optional[Link] = None

    def reset(self):
        """Reset the handler state."""
        self.in_link = False
        self.current_link = None

    def set_base_url(self, base_url: Optional[str]):
        """Set the base URL for resolving relative links."""
        self.base_url = base_url

    @staticmethod
    def _parse_dimension(attrs: Dict[str, str], name: str) -> Optional[int]:
        """Return ``attrs[name]`` as an int, or None when absent or invalid."""
        if name not in attrs:
            return None
        try:
            return int(attrs[name])
        except (TypeError, ValueError):
            # TypeError covers valueless attributes, which parse as None.
            return None

    def handle_link_start(self, attrs: Dict[str, str]):
        """Handle the start of a link element.

        The ``href`` prefix decides the link type: ``http(s)://`` is
        external, ``javascript:`` a function, ``api:`` an API link (the
        prefix is stripped from the stored location); everything else is
        internal. Only internal, non-fragment locations are resolved
        against the base URL.

        Args:
            attrs: Tag attributes; ``href`` and ``title`` are read.
        """
        self.text_processor.flush_text()

        # ``or`` guards against valueless attributes, which parse as None.
        href = attrs.get('href') or ''
        title = attrs.get('title') or None

        link_type = LinkType.INTERNAL
        is_relative = False
        if href.startswith(('http://', 'https://')):
            link_type = LinkType.EXTERNAL
        elif href.startswith('javascript:'):
            link_type = LinkType.FUNCTION
        elif href.startswith('api:'):
            link_type = LinkType.API
            href = href[4:]
        else:
            # Fragment-only links point into the current document.
            is_relative = not href.startswith('#')

        # Decide resolvability from the original href: an API location must
        # not be joined to the base URL after its prefix has been stripped.
        if self.base_url and is_relative:
            href = urllib.parse.urljoin(self.base_url, href)

        self.current_link = Link(
            location=href,
            link_type=link_type,
            title=title
        )
        self.in_link = True

    def handle_link_end(self):
        """Handle the end of a link element."""
        self.in_link = False
        self.current_link = None

    def handle_image(self, attrs: Dict[str, str], block_handler: BlockElementHandler, document: Document):
        """Handle an image element.

        Args:
            attrs: Tag attributes; ``src``, ``alt``, ``width`` and ``height``
                are read. Width and height are parsed independently, so one
                invalid value does not discard the other.
            block_handler: Block handler used to attach the image.
            document: The document being built.
        """
        src = attrs.get('src') or ''
        alt = attrs.get('alt') or ''

        width = self._parse_dimension(attrs, 'width')
        height = self._parse_dimension(attrs, 'height')

        # Resolve relative URLs
        if self.base_url and not src.startswith(('http://', 'https://')):
            src = urllib.parse.urljoin(self.base_url, src)

        image = Image(source=src, alt_text=alt, width=width, height=height)
        block_handler.add_block_to_document_or_parent(image, document)

    def handle_line_break(self, block_handler: BlockElementHandler):
        """Handle a line break element.

        Pending text is flushed before the break is inserted so that words
        preceding ``<br>`` end up in front of it.
        """
        self.text_processor.flush_text()

        paragraph = block_handler.current_paragraph
        if paragraph and hasattr(paragraph, 'add_block'):
            paragraph.add_block(LineBreak())

    def handle_horizontal_rule(self, block_handler: BlockElementHandler, document: Document):
        """Handle a horizontal rule element."""
        self.text_processor.flush_text()
        block_handler.add_block_to_document_or_parent(HorizontalRule(), document)
|
||||
@ -12,7 +12,8 @@ from bs4 import BeautifulSoup, Tag, NavigableString
|
||||
from pyWebLayout.abstract.inline import Word, FormattedSpan
|
||||
from pyWebLayout.abstract.block import (
|
||||
Block, Paragraph, Heading, HeadingLevel, Quote, CodeBlock,
|
||||
HList, ListItem, ListStyle, Table, TableRow, TableCell
|
||||
HList, ListItem, ListStyle, Table, TableRow, TableCell,
|
||||
HorizontalRule, Image
|
||||
)
|
||||
from pyWebLayout.style import Font, FontWeight, FontStyle, TextDecoration
|
||||
|
||||
@ -576,11 +577,9 @@ def table_header_cell_handler(element: Tag, context: StyleContext) -> TableCell:
|
||||
return cell
|
||||
|
||||
|
||||
def horizontal_rule_handler(element: Tag, context: StyleContext) -> HorizontalRule:
    """Handle <hr> elements.

    Args:
        element: The ``<hr>`` tag (unused).
        context: The active style context (unused).

    Returns:
        A new HorizontalRule block.
    """
    return HorizontalRule()
|
||||
|
||||
|
||||
def line_break_handler(element: Tag, context: StyleContext) -> None:
|
||||
@ -589,18 +588,22 @@ def line_break_handler(element: Tag, context: StyleContext) -> None:
|
||||
return None
|
||||
|
||||
|
||||
def image_handler(element: Tag, context: StyleContext) -> Image:
    """Handle <img> elements.

    Args:
        element: The ``<img>`` tag (unused; attributes come from context).
        context: Style context whose ``element_attributes`` supply ``src``,
            ``alt``, ``width`` and ``height``.

    Returns:
        An Image block. Width and height are None when missing or not
        integers; each is parsed independently of the other.
    """
    attributes = context.element_attributes
    src = attributes.get('src', '')
    alt_text = attributes.get('alt', '')

    def parse_dimension(name):
        # One malformed dimension must not discard the other one.
        if name not in attributes:
            return None
        try:
            return int(attributes[name])
        except (TypeError, ValueError):
            return None

    return Image(
        source=src,
        alt_text=alt_text,
        width=parse_dimension('width'),
        height=parse_dimension('height'),
    )
|
||||
|
||||
|
||||
def ignore_handler(element: Tag, context: StyleContext) -> None:
|
||||
|
||||
@ -1,281 +0,0 @@
|
||||
"""
|
||||
HTML style management for pyWebLayout.
|
||||
|
||||
This module provides specialized functionality for handling CSS styles,
|
||||
style stacks, and style parsing in HTML documents.
|
||||
"""
|
||||
|
||||
from typing import Dict, List, Any, Optional, Tuple
|
||||
import re
|
||||
from pyWebLayout.style import Font, FontStyle, FontWeight, TextDecoration
|
||||
|
||||
|
||||
class HTMLStyleManager:
|
||||
"""
|
||||
Manages CSS styles and style stacks during HTML parsing.
|
||||
|
||||
This class handles style parsing, style inheritance, and maintains
|
||||
the style stack for proper style nesting.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the style manager."""
|
||||
self._style_stack: List[Dict[str, Any]] = []
|
||||
self._current_style = self._get_default_style()
|
||||
|
||||
def _get_default_style(self) -> Dict[str, Any]:
|
||||
"""Get the default style settings."""
|
||||
return {
|
||||
'font_size': 12,
|
||||
'font_weight': FontWeight.NORMAL,
|
||||
'font_style': FontStyle.NORMAL,
|
||||
'decoration': TextDecoration.NONE,
|
||||
'color': (0, 0, 0),
|
||||
'background': None,
|
||||
'language': 'en_US'
|
||||
}
|
||||
|
||||
def reset(self):
|
||||
"""Reset the style manager to initial state."""
|
||||
self._style_stack = []
|
||||
self._current_style = self._get_default_style()
|
||||
|
||||
def push_style(self, style: Dict[str, Any]):
|
||||
"""
|
||||
Push a new style onto the style stack.
|
||||
|
||||
Args:
|
||||
style: The style to push
|
||||
"""
|
||||
# Save the current style
|
||||
self._style_stack.append(self._current_style.copy())
|
||||
|
||||
# Apply the new style
|
||||
for key, value in style.items():
|
||||
self._current_style[key] = value
|
||||
|
||||
def pop_style(self):
|
||||
"""Pop a style from the style stack."""
|
||||
if self._style_stack:
|
||||
self._current_style = self._style_stack.pop()
|
||||
|
||||
def get_current_style(self) -> Dict[str, Any]:
|
||||
"""Get the current style."""
|
||||
return self._current_style.copy()
|
||||
|
||||
def get_tag_style(self, tag: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Get the default style for a tag.
|
||||
|
||||
Args:
|
||||
tag: The tag name
|
||||
|
||||
Returns:
|
||||
A dictionary of style properties
|
||||
"""
|
||||
tag_styles = {
|
||||
'h1': {'font_size': 24, 'font_weight': FontWeight.BOLD},
|
||||
'h2': {'font_size': 20, 'font_weight': FontWeight.BOLD},
|
||||
'h3': {'font_size': 18, 'font_weight': FontWeight.BOLD},
|
||||
'h4': {'font_size': 16, 'font_weight': FontWeight.BOLD},
|
||||
'h5': {'font_size': 14, 'font_weight': FontWeight.BOLD},
|
||||
'h6': {'font_size': 12, 'font_weight': FontWeight.BOLD},
|
||||
'b': {'font_weight': FontWeight.BOLD},
|
||||
'strong': {'font_weight': FontWeight.BOLD},
|
||||
'i': {'font_style': FontStyle.ITALIC},
|
||||
'em': {'font_style': FontStyle.ITALIC},
|
||||
'u': {'decoration': TextDecoration.UNDERLINE},
|
||||
'a': {'decoration': TextDecoration.UNDERLINE, 'color': (0, 0, 255)},
|
||||
'code': {'font_family': 'monospace', 'background': (240, 240, 240, 255)},
|
||||
'pre': {'font_family': 'monospace'},
|
||||
}
|
||||
|
||||
return tag_styles.get(tag, {})
|
||||
|
||||
def create_font(self) -> Font:
|
||||
"""
|
||||
Create a Font object from the current style.
|
||||
|
||||
Returns:
|
||||
Font: A font object with the current style settings
|
||||
"""
|
||||
return Font(
|
||||
font_size=self._current_style['font_size'],
|
||||
colour=self._current_style['color'],
|
||||
weight=self._current_style['font_weight'],
|
||||
style=self._current_style['font_style'],
|
||||
decoration=self._current_style['decoration'],
|
||||
background=self._current_style['background'],
|
||||
langauge=self._current_style['language']
|
||||
)
|
||||
|
||||
def parse_inline_style(self, style_str: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Parse inline CSS style string.
|
||||
|
||||
Args:
|
||||
style_str: CSS style string
|
||||
|
||||
Returns:
|
||||
Dictionary of style properties
|
||||
"""
|
||||
if not style_str:
|
||||
return {}
|
||||
|
||||
style_dict = {}
|
||||
declarations = [d.strip() for d in style_str.split(';') if d.strip()]
|
||||
|
||||
for declaration in declarations:
|
||||
parts = declaration.split(':', 1)
|
||||
if len(parts) != 2:
|
||||
continue
|
||||
|
||||
prop = parts[0].strip().lower()
|
||||
value = parts[1].strip()
|
||||
|
||||
# Handle specific properties
|
||||
if prop == 'font-size':
|
||||
if value.endswith('px'):
|
||||
try:
|
||||
size = int(value[:-2])
|
||||
style_dict['font_size'] = size
|
||||
except ValueError:
|
||||
pass
|
||||
elif value.endswith('pt'):
|
||||
try:
|
||||
size = int(value[:-2])
|
||||
style_dict['font_size'] = size
|
||||
except ValueError:
|
||||
pass
|
||||
elif prop == 'font-weight':
|
||||
if value == 'bold':
|
||||
style_dict['font_weight'] = FontWeight.BOLD
|
||||
elif value == 'normal':
|
||||
style_dict['font_weight'] = FontWeight.NORMAL
|
||||
elif prop == 'font-style':
|
||||
if value == 'italic':
|
||||
style_dict['font_style'] = FontStyle.ITALIC
|
||||
elif value == 'normal':
|
||||
style_dict['font_style'] = FontStyle.NORMAL
|
||||
elif prop == 'text-decoration':
|
||||
if value == 'underline':
|
||||
style_dict['decoration'] = TextDecoration.UNDERLINE
|
||||
elif value == 'line-through':
|
||||
style_dict['decoration'] = TextDecoration.STRIKETHROUGH
|
||||
elif value == 'none':
|
||||
style_dict['decoration'] = TextDecoration.NONE
|
||||
elif prop == 'color':
|
||||
color = self.parse_color(value)
|
||||
if color:
|
||||
style_dict['color'] = color
|
||||
elif prop == 'background-color':
|
||||
color = self.parse_color(value)
|
||||
if color:
|
||||
style_dict['background'] = color + (255,)
|
||||
|
||||
return style_dict
|
||||
|
||||
def parse_color(self, color_str: str) -> Optional[Tuple[int, int, int]]:
|
||||
"""
|
||||
Parse a CSS color string.
|
||||
|
||||
Args:
|
||||
color_str: CSS color string
|
||||
|
||||
Returns:
|
||||
RGB tuple or None if parsing fails
|
||||
"""
|
||||
# Named colors
|
||||
color_map = {
|
||||
'black': (0, 0, 0),
|
||||
'white': (255, 255, 255),
|
||||
'red': (255, 0, 0),
|
||||
'green': (0, 128, 0),
|
||||
'blue': (0, 0, 255),
|
||||
'yellow': (255, 255, 0),
|
||||
'cyan': (0, 255, 255),
|
||||
'magenta': (255, 0, 255),
|
||||
'gray': (128, 128, 128),
|
||||
'grey': (128, 128, 128),
|
||||
'silver': (192, 192, 192),
|
||||
'maroon': (128, 0, 0),
|
||||
'olive': (128, 128, 0),
|
||||
'navy': (0, 0, 128),
|
||||
'purple': (128, 0, 128),
|
||||
'teal': (0, 128, 128),
|
||||
'lime': (0, 255, 0),
|
||||
'aqua': (0, 255, 255),
|
||||
'fuchsia': (255, 0, 255),
|
||||
}
|
||||
|
||||
# Check for named color
|
||||
color_str = color_str.lower().strip()
|
||||
if color_str in color_map:
|
||||
return color_map[color_str]
|
||||
|
||||
# Check for hex color
|
||||
if color_str.startswith('#'):
|
||||
try:
|
||||
if len(color_str) == 4: # #RGB
|
||||
r = int(color_str[1] + color_str[1], 16)
|
||||
g = int(color_str[2] + color_str[2], 16)
|
||||
b = int(color_str[3] + color_str[3], 16)
|
||||
return (r, g, b)
|
||||
elif len(color_str) == 7: # #RRGGBB
|
||||
r = int(color_str[1:3], 16)
|
||||
g = int(color_str[3:5], 16)
|
||||
b = int(color_str[5:7], 16)
|
||||
return (r, g, b)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
# Check for rgb() color
|
||||
rgb_match = re.match(r'rgb\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\)', color_str)
|
||||
if rgb_match:
|
||||
try:
|
||||
r_val = int(rgb_match.group(1))
|
||||
g_val = int(rgb_match.group(2))
|
||||
b_val = int(rgb_match.group(3))
|
||||
|
||||
# Check if values are in valid range (0-255)
|
||||
if r_val > 255 or g_val > 255 or b_val > 255 or r_val < 0 or g_val < 0 or b_val < 0:
|
||||
return None # Invalid color values
|
||||
|
||||
return (r_val, g_val, b_val)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
# Check for rgba() color (ignore alpha)
|
||||
rgba_match = re.match(r'rgba\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*[\d.]+\s*\)', color_str)
|
||||
if rgba_match:
|
||||
try:
|
||||
r = min(255, max(0, int(rgba_match.group(1))))
|
||||
g = min(255, max(0, int(rgba_match.group(2))))
|
||||
b = min(255, max(0, int(rgba_match.group(3))))
|
||||
return (r, g, b)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
# Failed to parse color
|
||||
return None
|
||||
|
||||
def apply_style_to_element(self, tag: str, attrs: Dict[str, str]) -> Dict[str, Any]:
|
||||
"""
|
||||
Apply combined styles (tag defaults + inline styles) for an element.
|
||||
|
||||
Args:
|
||||
tag: The HTML tag name
|
||||
attrs: Dictionary of tag attributes
|
||||
|
||||
Returns:
|
||||
Combined style dictionary
|
||||
"""
|
||||
# Start with tag-specific styles
|
||||
style = self.get_tag_style(tag)
|
||||
|
||||
# Override with inline styles if present
|
||||
if 'style' in attrs:
|
||||
inline_style = self.parse_inline_style(attrs['style'])
|
||||
style.update(inline_style)
|
||||
|
||||
return style
|
||||
@ -1,163 +0,0 @@
|
||||
"""
|
||||
HTML text processing for pyWebLayout.
|
||||
|
||||
This module provides specialized functionality for handling text content,
|
||||
entity references, and word creation in HTML documents.
|
||||
"""
|
||||
|
||||
from typing import Optional
|
||||
from pyWebLayout.abstract.inline import Word
|
||||
from pyWebLayout.abstract.block import Paragraph
|
||||
from pyWebLayout.io.readers.html_style import HTMLStyleManager
|
||||
|
||||
|
||||
class HTMLTextProcessor:
    """
    Processes text content during HTML parsing.

    This class handles text buffering, entity resolution, and word creation
    with proper styling applied.
    """

    # Entities whose replacement intentionally differs from the HTML5
    # table: a non-breaking space is buffered as a plain space.
    _ENTITY_OVERRIDES = {'nbsp': ' '}

    def __init__(self, style_manager: HTMLStyleManager):
        """
        Initialize the text processor.

        Args:
            style_manager: The style manager for creating styled words
        """
        self._style_manager = style_manager
        self._text_buffer = ""
        self._current_paragraph: Optional[Paragraph] = None

    def reset(self):
        """Reset the text processor state."""
        self._text_buffer = ""
        self._current_paragraph = None

    def set_current_paragraph(self, paragraph: Optional[Paragraph]):
        """
        Set the current paragraph for text output.

        Args:
            paragraph: The paragraph to receive text, or None
        """
        self._current_paragraph = paragraph

    def add_text(self, text: str):
        """
        Add text to the buffer.

        Args:
            text: The text to add
        """
        self._text_buffer += text

    def add_entity_reference(self, name: str):
        """
        Add an HTML entity reference to the buffer.

        Names are resolved through the standard library's HTML5 entity
        table; unknown names are kept verbatim as ``&name;``.

        Args:
            name: The entity name (e.g., 'lt', 'gt', 'amp')
        """
        # Imported locally so the module's import block stays untouched.
        from html.entities import html5

        char = self._ENTITY_OVERRIDES.get(name)
        if char is None:
            # html5 keys carry the trailing semicolon ('amp;', 'rsquo;', ...).
            char = html5.get(f'{name};', f'&{name};')
        self._text_buffer += char

    def add_character_reference(self, name: str):
        """
        Add a character reference to the buffer.

        Args:
            name: The character reference (decimal, or hex prefixed with
                ``x``/``X``). Invalid references are kept verbatim as
                ``&#name;``.
        """
        try:
            if name.startswith(('x', 'X')):
                # Hexadecimal reference
                char = chr(int(name[1:], 16))
            else:
                # Decimal reference
                char = chr(int(name))
        except (ValueError, OverflowError):
            # Not a number, or outside the Unicode range.
            char = f'&#{name};'
        self._text_buffer += char

    def flush_text(self) -> bool:
        """
        Flush the text buffer, creating words as needed.

        The buffer is always emptied. Words are only created when a
        current paragraph is set and the buffer holds non-whitespace text.

        Returns:
            True if text was flushed, False if buffer was empty
        """
        text = self._text_buffer
        self._text_buffer = ""

        if not text or not self._current_paragraph:
            return False

        words = text.split()
        if not words:
            return False

        for word_text in words:
            # Each word gets its own Font built from the current style.
            font = self._style_manager.create_font()
            self._current_paragraph.add_word(Word(word_text, font))

        return True

    def has_pending_text(self) -> bool:
        """
        Check if there is pending text in the buffer.

        Returns:
            True if there is non-whitespace text waiting to be flushed
        """
        return bool(self._text_buffer.strip())

    def get_buffer_content(self) -> str:
        """
        Get the current buffer content without flushing.

        Returns:
            The current text buffer content
        """
        return self._text_buffer

    def clear_buffer(self):
        """Clear the text buffer without creating words."""
        self._text_buffer = ""
|
||||
@ -34,7 +34,7 @@ class Font:
|
||||
style: FontStyle = FontStyle.NORMAL,
|
||||
decoration: TextDecoration = TextDecoration.NONE,
|
||||
background: Optional[Tuple[int, int, int, int]] = None,
|
||||
langauge = "en_EN"):
|
||||
language = "en_EN"):
|
||||
"""
|
||||
Initialize a Font object with the specified properties.
|
||||
|
||||
@ -46,6 +46,7 @@ class Font:
|
||||
style: Font style (normal or italic).
|
||||
decoration: Text decoration (none, underline, or strikethrough).
|
||||
background: RGBA background color for the text. If None, transparent background.
|
||||
language: Language code for hyphenation and text processing.
|
||||
"""
|
||||
self._font_path = font_path
|
||||
self._font_size = font_size
|
||||
@ -54,7 +55,7 @@ class Font:
|
||||
self._style = style
|
||||
self._decoration = decoration
|
||||
self._background = background if background else (255, 255, 255, 0)
|
||||
self.language = langauge
|
||||
self.language = language
|
||||
# Load the font file or use default
|
||||
self._load_font()
|
||||
|
||||
|
||||
@ -1,354 +0,0 @@
|
||||
"""
|
||||
Unit tests for HTML content reading.
|
||||
|
||||
Tests the HTMLContentReader class for parsing complete HTML documents.
|
||||
This is more of an integration test covering the entire parsing pipeline.
|
||||
"""
|
||||
|
||||
import unittest
|
||||
from pyWebLayout.io.readers.html_content import HTMLContentReader
|
||||
from pyWebLayout.abstract.document import Document
|
||||
from pyWebLayout.abstract.block import (
|
||||
Paragraph, Heading, HeadingLevel, HList, ListStyle,
|
||||
Table, Quote, CodeBlock, HorizontalRule
|
||||
)
|
||||
from pyWebLayout.abstract.inline import LineBreak
|
||||
|
||||
class TestHTMLContentReader(unittest.TestCase):
|
||||
"""Test cases for HTMLContentReader."""
|
||||
|
||||
def setUp(self):
    """Give every test its own reader and an empty target document."""
    self.reader, self.document = HTMLContentReader(), Document()
|
||||
|
||||
def test_simple_paragraph(self):
    """A lone <p> yields one Paragraph holding one word per token."""
    html = '<p>Hello world!</p>'

    result = self.reader.extract_content(html, self.document)

    blocks = self.document.blocks
    self.assertEqual(len(blocks), 1)
    paragraph = blocks[0]
    self.assertIsInstance(paragraph, Paragraph)

    # words() yields tuples whose second element is the Word object.
    words = list(paragraph.words())
    self.assertEqual(len(words), 2)
    first, second = words
    self.assertEqual(first[1].text, "Hello")
    self.assertEqual(second[1].text, "world!")
|
||||
|
||||
def test_headings(self):
    """<h1>, <h2>, <h3> and <h6> become Headings with matching levels."""
    html = '''
    <h1>Heading 1</h1>
    <h2>Heading 2</h2>
    <h3>Heading 3</h3>
    <h6>Heading 6</h6>
    '''

    self.reader.extract_content(html, self.document)

    # Only heading blocks are of interest here.
    headings = []
    for block in self.document.blocks:
        if isinstance(block, Heading):
            headings.append(block)
    self.assertEqual(len(headings), 4)

    # Levels must follow document order.
    expected_levels = (
        HeadingLevel.H1,
        HeadingLevel.H2,
        HeadingLevel.H3,
        HeadingLevel.H6,
    )
    for index, level in enumerate(expected_levels):
        self.assertEqual(headings[index].level, level)

    # The first heading carries the two words of its text.
    first_heading_words = list(headings[0].words())
    self.assertEqual(len(first_heading_words), 2)
    self.assertEqual(first_heading_words[0][1].text, "Heading")
    self.assertEqual(first_heading_words[1][1].text, "1")
|
||||
|
||||
def test_styled_text(self):
    """Inline <b>/<i> markup keeps every word, in order, in one paragraph."""
    html = '<p>This is <b>bold</b> and <i>italic</i> text.</p>'

    self.reader.extract_content(html, self.document)

    self.assertEqual(len(self.document.blocks), 1)
    words = list(self.document.blocks[0].words())

    expected_texts = ["This", "is", "bold", "and", "italic", "text."]
    self.assertEqual(len(words), 6)

    # Styling lives in each word's Font and is not inspected here; only
    # the word texts and their order are checked.
    actual_texts = []
    for entry in words:
        actual_texts.append(entry[1].text)
    self.assertEqual(actual_texts, expected_texts)
|
||||
|
||||
def test_unordered_list(self):
|
||||
"""Test parsing unordered lists."""
|
||||
html = '''
|
||||
<ul>
|
||||
<li>First item</li>
|
||||
<li>Second item</li>
|
||||
<li>Third item</li>
|
||||
</ul>
|
||||
'''
|
||||
|
||||
self.reader.extract_content(html, self.document)
|
||||
|
||||
self.assertEqual(len(self.document.blocks), 1)
|
||||
self.assertIsInstance(self.document.blocks[0], HList)
|
||||
|
||||
list_block = self.document.blocks[0]
|
||||
self.assertEqual(list_block.style, ListStyle.UNORDERED)
|
||||
|
||||
items = list(list_block.items())
|
||||
self.assertEqual(len(items), 3)
|
||||
|
||||
# Check first item content
|
||||
first_item_blocks = list(items[0].blocks())
|
||||
self.assertEqual(len(first_item_blocks), 1)
|
||||
self.assertIsInstance(first_item_blocks[0], Paragraph)
|
||||
|
||||
def test_ordered_list(self):
|
||||
"""Test parsing ordered lists."""
|
||||
html = '''
|
||||
<ol>
|
||||
<li>First step</li>
|
||||
<li>Second step</li>
|
||||
</ol>
|
||||
'''
|
||||
|
||||
self.reader.extract_content(html, self.document)
|
||||
|
||||
self.assertEqual(len(self.document.blocks), 1)
|
||||
list_block = self.document.blocks[0]
|
||||
self.assertEqual(list_block.style, ListStyle.ORDERED)
|
||||
|
||||
items = list(list_block.items())
|
||||
self.assertEqual(len(items), 2)
|
||||
|
||||
def test_definition_list(self):
|
||||
"""Test parsing definition lists."""
|
||||
html = '''
|
||||
<dl>
|
||||
<dt>Term 1</dt>
|
||||
<dd>Definition 1</dd>
|
||||
<dt>Term 2</dt>
|
||||
<dd>Definition 2</dd>
|
||||
</dl>
|
||||
'''
|
||||
|
||||
self.reader.extract_content(html, self.document)
|
||||
|
||||
self.assertEqual(len(self.document.blocks), 1)
|
||||
list_block = self.document.blocks[0]
|
||||
self.assertEqual(list_block.style, ListStyle.DEFINITION)
|
||||
|
||||
items = list(list_block.items())
|
||||
self.assertEqual(len(items), 2) # Two dt/dd pairs
|
||||
|
||||
def test_table(self):
|
||||
"""Test parsing simple tables."""
|
||||
html = '''
|
||||
<table>
|
||||
<tr>
|
||||
<th>Header 1</th>
|
||||
<th>Header 2</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Cell 1</td>
|
||||
<td>Cell 2</td>
|
||||
</tr>
|
||||
</table>
|
||||
'''
|
||||
|
||||
self.reader.extract_content(html, self.document)
|
||||
|
||||
self.assertEqual(len(self.document.blocks), 1)
|
||||
self.assertIsInstance(self.document.blocks[0], Table)
|
||||
|
||||
table = self.document.blocks[0]
|
||||
|
||||
# Check body rows
|
||||
body_rows = list(table.body_rows())
|
||||
self.assertEqual(len(body_rows), 2) # Header row + data row
|
||||
|
||||
# Check first row (header)
|
||||
first_row_cells = list(body_rows[0].cells())
|
||||
self.assertEqual(len(first_row_cells), 2)
|
||||
self.assertTrue(first_row_cells[0].is_header)
|
||||
self.assertTrue(first_row_cells[1].is_header)
|
||||
|
||||
# Check second row (data)
|
||||
second_row_cells = list(body_rows[1].cells())
|
||||
self.assertEqual(len(second_row_cells), 2)
|
||||
self.assertFalse(second_row_cells[0].is_header)
|
||||
self.assertFalse(second_row_cells[1].is_header)
|
||||
|
||||
def test_blockquote(self):
|
||||
"""Test parsing blockquotes."""
|
||||
html = '''
|
||||
<blockquote>
|
||||
<p>This is a quoted paragraph.</p>
|
||||
<p>Another quoted paragraph.</p>
|
||||
</blockquote>
|
||||
'''
|
||||
|
||||
self.reader.extract_content(html, self.document)
|
||||
|
||||
self.assertEqual(len(self.document.blocks), 1)
|
||||
self.assertIsInstance(self.document.blocks[0], Quote)
|
||||
|
||||
quote = self.document.blocks[0]
|
||||
quote_blocks = list(quote.blocks())
|
||||
self.assertEqual(len(quote_blocks), 2)
|
||||
self.assertIsInstance(quote_blocks[0], Paragraph)
|
||||
self.assertIsInstance(quote_blocks[1], Paragraph)
|
||||
|
||||
def test_code_block(self):
|
||||
"""Test parsing code blocks."""
|
||||
html = '''
|
||||
<pre><code class="language-python">
|
||||
def hello():
|
||||
print("Hello, world!")
|
||||
</code></pre>
|
||||
'''
|
||||
|
||||
self.reader.extract_content(html, self.document)
|
||||
|
||||
self.assertEqual(len(self.document.blocks), 1)
|
||||
self.assertIsInstance(self.document.blocks[0], CodeBlock)
|
||||
|
||||
code_block = self.document.blocks[0]
|
||||
self.assertEqual(code_block.language, "python")
|
||||
|
||||
def test_horizontal_rule(self):
|
||||
"""Test parsing horizontal rules."""
|
||||
html = '<p>Before</p><hr><p>After</p>'
|
||||
|
||||
self.reader.extract_content(html, self.document)
|
||||
|
||||
self.assertEqual(len(self.document.blocks), 3)
|
||||
self.assertIsInstance(self.document.blocks[0], Paragraph)
|
||||
self.assertIsInstance(self.document.blocks[1], HorizontalRule)
|
||||
self.assertIsInstance(self.document.blocks[2], Paragraph)
|
||||
|
||||
def test_html_entities(self):
|
||||
"""Test handling HTML entities."""
|
||||
html = '<p>Less than: < Greater than: > Ampersand: &</p>'
|
||||
|
||||
self.reader.extract_content(html, self.document)
|
||||
|
||||
paragraph = self.document.blocks[0]
|
||||
words = list(paragraph.words())
|
||||
|
||||
# Find the entity words
|
||||
word_texts = [word[1].text for word in words]
|
||||
self.assertIn('<', word_texts)
|
||||
self.assertIn('>', word_texts)
|
||||
self.assertIn('&', word_texts)
|
||||
|
||||
def test_nested_elements(self):
|
||||
"""Test parsing nested HTML elements."""
|
||||
html = '''
|
||||
<div>
|
||||
<h2>Section Title</h2>
|
||||
<p>Section content with <strong>important</strong> text.</p>
|
||||
<ul>
|
||||
<li>List item 1</li>
|
||||
<li>List item 2</li>
|
||||
</ul>
|
||||
</div>
|
||||
'''
|
||||
|
||||
self.reader.extract_content(html, self.document)
|
||||
|
||||
# Should have multiple blocks
|
||||
self.assertGreater(len(self.document.blocks), 1)
|
||||
|
||||
# Check that we have different types of blocks
|
||||
block_types = [type(block).__name__ for block in self.document.blocks]
|
||||
self.assertIn('Paragraph', block_types) # From div
|
||||
self.assertIn('Heading', block_types)
|
||||
self.assertIn('HList', block_types)
|
||||
|
||||
def test_empty_elements(self):
|
||||
"""Test handling empty HTML elements."""
|
||||
html = '<p></p><div></div><ul></ul>'
|
||||
|
||||
self.reader.extract_content(html, self.document)
|
||||
|
||||
# Empty elements should still create blocks
|
||||
self.assertEqual(len(self.document.blocks), 3)
|
||||
|
||||
def test_whitespace_handling(self):
|
||||
"""Test proper whitespace handling."""
|
||||
html = '''
|
||||
<p> Word1 Word2
|
||||
Word3 </p>
|
||||
'''
|
||||
|
||||
self.reader.extract_content(html, self.document)
|
||||
|
||||
paragraph = self.document.blocks[0]
|
||||
words = list(paragraph.words())
|
||||
|
||||
# Should normalize whitespace and create separate words
|
||||
word_texts = [word[1].text for word in words]
|
||||
self.assertEqual(word_texts, ["Word1", "Word2", "Word3"])
|
||||
|
||||
def test_base_url_setting(self):
|
||||
"""Test setting base URL for link resolution."""
|
||||
base_url = "https://example.com/path/"
|
||||
self.reader.set_base_url(base_url)
|
||||
|
||||
# The base URL should be passed to the inline handler
|
||||
self.assertEqual(self.reader.inline_handler.base_url, base_url)
|
||||
|
||||
def test_complex_document(self):
|
||||
"""Test parsing a complex HTML document."""
|
||||
html = '''
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>Test Document</title>
|
||||
<style>body { font-family: Arial; }</style>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Main Title</h1>
|
||||
<p>Introduction paragraph with <em>emphasis</em>.</p>
|
||||
|
||||
<h2>Section 1</h2>
|
||||
<p>Content with <a href="link.html">a link</a>.</p>
|
||||
|
||||
<ul>
|
||||
<li>Item 1</li>
|
||||
<li>Item 2 with <strong>bold text</strong></li>
|
||||
</ul>
|
||||
|
||||
<h2>Section 2</h2>
|
||||
<blockquote>
|
||||
<p>A quoted paragraph.</p>
|
||||
</blockquote>
|
||||
|
||||
<table>
|
||||
<tr><th>Col1</th><th>Col2</th></tr>
|
||||
<tr><td>A</td><td>B</td></tr>
|
||||
</table>
|
||||
</body>
|
||||
</html>
|
||||
'''
|
||||
|
||||
self.reader.extract_content(html, self.document)
|
||||
|
||||
# Should have parsed multiple blocks
|
||||
self.assertGreater(len(self.document.blocks), 5)
|
||||
|
||||
# Should have different types of content
|
||||
block_types = set(type(block).__name__ for block in self.document.blocks)
|
||||
expected_types = {'Heading', 'Paragraph', 'HList', 'Quote', 'Table'}
|
||||
self.assertTrue(expected_types.issubset(block_types))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
@ -1,181 +1,181 @@
|
||||
"""
|
||||
Unit tests for HTML style management.
|
||||
Unit tests for pyWebLayout style objects.
|
||||
|
||||
Tests the HTMLStyleManager class for CSS parsing, style stacks, and font creation.
|
||||
Tests the Font class and style enums for proper functionality and immutability.
|
||||
"""
|
||||
|
||||
import unittest
|
||||
from pyWebLayout.io.readers.html_style import HTMLStyleManager
|
||||
from pyWebLayout.style import FontStyle, FontWeight, TextDecoration
|
||||
from pyWebLayout.style import Font, FontStyle, FontWeight, TextDecoration, Alignment
|
||||
|
||||
|
||||
class TestHTMLStyleManager(unittest.TestCase):
|
||||
"""Test cases for HTMLStyleManager."""
|
||||
class TestStyleObjects(unittest.TestCase):
|
||||
"""Test cases for pyWebLayout style objects."""
|
||||
|
||||
def setUp(self):
|
||||
"""Set up test fixtures."""
|
||||
self.style_manager = HTMLStyleManager()
|
||||
def test_font_weight_enum(self):
|
||||
"""Test FontWeight enum values."""
|
||||
self.assertEqual(FontWeight.NORMAL.value, "normal")
|
||||
self.assertEqual(FontWeight.BOLD.value, "bold")
|
||||
|
||||
# Test that all expected values exist
|
||||
weights = [FontWeight.NORMAL, FontWeight.BOLD]
|
||||
self.assertEqual(len(weights), 2)
|
||||
|
||||
def test_initialization(self):
|
||||
"""Test proper initialization of style manager."""
|
||||
style = self.style_manager.get_current_style()
|
||||
def test_font_style_enum(self):
|
||||
"""Test FontStyle enum values."""
|
||||
self.assertEqual(FontStyle.NORMAL.value, "normal")
|
||||
self.assertEqual(FontStyle.ITALIC.value, "italic")
|
||||
|
||||
self.assertEqual(style['font_size'], 12)
|
||||
self.assertEqual(style['font_weight'], FontWeight.NORMAL)
|
||||
self.assertEqual(style['font_style'], FontStyle.NORMAL)
|
||||
self.assertEqual(style['decoration'], TextDecoration.NONE)
|
||||
self.assertEqual(style['color'], (0, 0, 0))
|
||||
self.assertIsNone(style['background'])
|
||||
self.assertEqual(style['language'], 'en_US')
|
||||
# Test that all expected values exist
|
||||
styles = [FontStyle.NORMAL, FontStyle.ITALIC]
|
||||
self.assertEqual(len(styles), 2)
|
||||
|
||||
def test_style_stack_operations(self):
|
||||
"""Test push and pop operations on style stack."""
|
||||
# Initial state
|
||||
initial_style = self.style_manager.get_current_style()
|
||||
def test_text_decoration_enum(self):
|
||||
"""Test TextDecoration enum values."""
|
||||
self.assertEqual(TextDecoration.NONE.value, "none")
|
||||
self.assertEqual(TextDecoration.UNDERLINE.value, "underline")
|
||||
self.assertEqual(TextDecoration.STRIKETHROUGH.value, "strikethrough")
|
||||
|
||||
# Push a new style
|
||||
new_style = {'font_size': 16, 'font_weight': FontWeight.BOLD}
|
||||
self.style_manager.push_style(new_style)
|
||||
|
||||
current_style = self.style_manager.get_current_style()
|
||||
self.assertEqual(current_style['font_size'], 16)
|
||||
self.assertEqual(current_style['font_weight'], FontWeight.BOLD)
|
||||
self.assertEqual(current_style['color'], (0, 0, 0)) # Unchanged
|
||||
|
||||
# Pop the style
|
||||
self.style_manager.pop_style()
|
||||
restored_style = self.style_manager.get_current_style()
|
||||
self.assertEqual(restored_style, initial_style)
|
||||
# Test that all expected values exist
|
||||
decorations = [TextDecoration.NONE, TextDecoration.UNDERLINE, TextDecoration.STRIKETHROUGH]
|
||||
self.assertEqual(len(decorations), 3)
|
||||
|
||||
def test_tag_styles(self):
|
||||
"""Test default styles for HTML tags."""
|
||||
h1_style = self.style_manager.get_tag_style('h1')
|
||||
self.assertEqual(h1_style['font_size'], 24)
|
||||
self.assertEqual(h1_style['font_weight'], FontWeight.BOLD)
|
||||
|
||||
h6_style = self.style_manager.get_tag_style('h6')
|
||||
self.assertEqual(h6_style['font_size'], 12)
|
||||
self.assertEqual(h6_style['font_weight'], FontWeight.BOLD)
|
||||
|
||||
em_style = self.style_manager.get_tag_style('em')
|
||||
self.assertEqual(em_style['font_style'], FontStyle.ITALIC)
|
||||
|
||||
unknown_style = self.style_manager.get_tag_style('unknown')
|
||||
self.assertEqual(unknown_style, {})
|
||||
def test_alignment_enum(self):
|
||||
"""Test Alignment enum values."""
|
||||
self.assertEqual(Alignment.LEFT.value, 1)
|
||||
self.assertEqual(Alignment.CENTER.value, 2)
|
||||
self.assertEqual(Alignment.RIGHT.value, 3)
|
||||
self.assertEqual(Alignment.TOP.value, 4)
|
||||
self.assertEqual(Alignment.BOTTOM.value, 5)
|
||||
self.assertEqual(Alignment.JUSTIFY.value, 6)
|
||||
|
||||
def test_inline_style_parsing(self):
|
||||
"""Test parsing of inline CSS styles."""
|
||||
# Test font-size
|
||||
style = self.style_manager.parse_inline_style('font-size: 18px')
|
||||
self.assertEqual(style['font_size'], 18)
|
||||
def test_font_initialization_defaults(self):
|
||||
"""Test Font initialization with default values."""
|
||||
font = Font()
|
||||
|
||||
style = self.style_manager.parse_inline_style('font-size: 14pt')
|
||||
self.assertEqual(style['font_size'], 14)
|
||||
|
||||
# Test font-weight
|
||||
style = self.style_manager.parse_inline_style('font-weight: bold')
|
||||
self.assertEqual(style['font_weight'], FontWeight.BOLD)
|
||||
|
||||
# Test font-style
|
||||
style = self.style_manager.parse_inline_style('font-style: italic')
|
||||
self.assertEqual(style['font_style'], FontStyle.ITALIC)
|
||||
|
||||
# Test text-decoration
|
||||
style = self.style_manager.parse_inline_style('text-decoration: underline')
|
||||
self.assertEqual(style['decoration'], TextDecoration.UNDERLINE)
|
||||
|
||||
# Test multiple properties
|
||||
style = self.style_manager.parse_inline_style(
|
||||
'font-size: 20px; font-weight: bold; color: red'
|
||||
self.assertIsNone(font._font_path)
|
||||
self.assertEqual(font.font_size, 12)
|
||||
self.assertEqual(font.colour, (0, 0, 0))
|
||||
self.assertEqual(font.color, (0, 0, 0)) # Alias
|
||||
self.assertEqual(font.weight, FontWeight.NORMAL)
|
||||
self.assertEqual(font.style, FontStyle.NORMAL)
|
||||
self.assertEqual(font.decoration, TextDecoration.NONE)
|
||||
self.assertEqual(font.background, (255, 255, 255, 0)) # Transparent
|
||||
self.assertEqual(font.language, "en_EN")
|
||||
|
||||
def test_font_initialization_custom(self):
|
||||
"""Test Font initialization with custom values."""
|
||||
font = Font(
|
||||
font_path="/path/to/font.ttf",
|
||||
font_size=16,
|
||||
colour=(255, 0, 0),
|
||||
weight=FontWeight.BOLD,
|
||||
style=FontStyle.ITALIC,
|
||||
decoration=TextDecoration.UNDERLINE,
|
||||
background=(255, 255, 0, 255),
|
||||
langauge="fr_FR"
|
||||
)
|
||||
self.assertEqual(style['font_size'], 20)
|
||||
self.assertEqual(style['font_weight'], FontWeight.BOLD)
|
||||
self.assertEqual(style['color'], (255, 0, 0))
|
||||
|
||||
def test_color_parsing(self):
|
||||
"""Test CSS color parsing."""
|
||||
# Named colors
|
||||
self.assertEqual(self.style_manager.parse_color('red'), (255, 0, 0))
|
||||
self.assertEqual(self.style_manager.parse_color('blue'), (0, 0, 255))
|
||||
self.assertEqual(self.style_manager.parse_color('white'), (255, 255, 255))
|
||||
self.assertEqual(self.style_manager.parse_color('gray'), (128, 128, 128))
|
||||
self.assertEqual(self.style_manager.parse_color('grey'), (128, 128, 128))
|
||||
|
||||
# Hex colors
|
||||
self.assertEqual(self.style_manager.parse_color('#ff0000'), (255, 0, 0))
|
||||
self.assertEqual(self.style_manager.parse_color('#00ff00'), (0, 255, 0))
|
||||
self.assertEqual(self.style_manager.parse_color('#f00'), (255, 0, 0))
|
||||
self.assertEqual(self.style_manager.parse_color('#0f0'), (0, 255, 0))
|
||||
|
||||
# RGB colors
|
||||
self.assertEqual(self.style_manager.parse_color('rgb(255, 0, 0)'), (255, 0, 0))
|
||||
self.assertEqual(self.style_manager.parse_color('rgb(128, 128, 128)'), (128, 128, 128))
|
||||
self.assertEqual(self.style_manager.parse_color('rgb( 255 , 255 , 255 )'), (255, 255, 255))
|
||||
|
||||
# RGBA colors (alpha ignored)
|
||||
self.assertEqual(self.style_manager.parse_color('rgba(255, 0, 0, 0.5)'), (255, 0, 0))
|
||||
|
||||
# Invalid colors
|
||||
self.assertIsNone(self.style_manager.parse_color('invalid'))
|
||||
self.assertIsNone(self.style_manager.parse_color('#gg0000'))
|
||||
self.assertIsNone(self.style_manager.parse_color('rgb(300, 0, 0)')) # Invalid values return None
|
||||
|
||||
def test_color_clamping(self):
|
||||
"""Test that RGB values outside valid range return None."""
|
||||
# Values outside 0-255 range should return None
|
||||
color = self.style_manager.parse_color('rgb(300, -10, 128)')
|
||||
self.assertIsNone(color) # Invalid values return None
|
||||
|
||||
def test_apply_style_to_element(self):
|
||||
"""Test combining tag styles with inline styles."""
|
||||
# Test h1 with inline style
|
||||
attrs = {'style': 'color: blue; font-size: 30px'}
|
||||
combined = self.style_manager.apply_style_to_element('h1', attrs)
|
||||
|
||||
# Should have h1 defaults plus inline overrides
|
||||
self.assertEqual(combined['font_size'], 30) # Overridden
|
||||
self.assertEqual(combined['font_weight'], FontWeight.BOLD) # From h1
|
||||
self.assertEqual(combined['color'], (0, 0, 255)) # Inline
|
||||
|
||||
# Test without inline styles
|
||||
combined = self.style_manager.apply_style_to_element('strong', {})
|
||||
self.assertEqual(combined['font_weight'], FontWeight.BOLD)
|
||||
|
||||
def test_reset(self):
|
||||
"""Test resetting the style manager."""
|
||||
# Change the state
|
||||
self.style_manager.push_style({'font_size': 20})
|
||||
self.style_manager.push_style({'color': (255, 0, 0)})
|
||||
|
||||
# Reset
|
||||
self.style_manager.reset()
|
||||
|
||||
# Should be back to initial state
|
||||
style = self.style_manager.get_current_style()
|
||||
self.assertEqual(style['font_size'], 12)
|
||||
self.assertEqual(style['color'], (0, 0, 0))
|
||||
self.assertEqual(len(self.style_manager._style_stack), 0)
|
||||
|
||||
def test_font_creation(self):
|
||||
"""Test Font object creation from current style."""
|
||||
# Set some specific styles
|
||||
self.style_manager.push_style({
|
||||
'font_size': 16,
|
||||
'font_weight': FontWeight.BOLD,
|
||||
'font_style': FontStyle.ITALIC,
|
||||
'decoration': TextDecoration.UNDERLINE,
|
||||
'color': (255, 0, 0),
|
||||
'background': (255, 255, 0, 255)
|
||||
})
|
||||
|
||||
font = self.style_manager.create_font()
|
||||
|
||||
self.assertEqual(font._font_path, "/path/to/font.ttf")
|
||||
self.assertEqual(font.font_size, 16)
|
||||
self.assertEqual(font.colour, (255, 0, 0))
|
||||
self.assertEqual(font.weight, FontWeight.BOLD)
|
||||
self.assertEqual(font.style, FontStyle.ITALIC)
|
||||
self.assertEqual(font.decoration, TextDecoration.UNDERLINE)
|
||||
self.assertEqual(font.colour, (255, 0, 0))
|
||||
self.assertEqual(font.background, (255, 255, 0, 255))
|
||||
self.assertEqual(font.language, "fr_FR")
|
||||
|
||||
def test_font_with_methods(self):
|
||||
"""Test Font immutable modification methods."""
|
||||
original_font = Font(
|
||||
font_size=12,
|
||||
colour=(0, 0, 0),
|
||||
weight=FontWeight.NORMAL,
|
||||
style=FontStyle.NORMAL,
|
||||
decoration=TextDecoration.NONE
|
||||
)
|
||||
|
||||
# Test with_size
|
||||
size_font = original_font.with_size(16)
|
||||
self.assertEqual(size_font.font_size, 16)
|
||||
self.assertEqual(original_font.font_size, 12) # Original unchanged
|
||||
self.assertEqual(size_font.colour, (0, 0, 0)) # Other properties preserved
|
||||
|
||||
# Test with_colour
|
||||
color_font = original_font.with_colour((255, 0, 0))
|
||||
self.assertEqual(color_font.colour, (255, 0, 0))
|
||||
self.assertEqual(original_font.colour, (0, 0, 0)) # Original unchanged
|
||||
self.assertEqual(color_font.font_size, 12) # Other properties preserved
|
||||
|
||||
# Test with_weight
|
||||
weight_font = original_font.with_weight(FontWeight.BOLD)
|
||||
self.assertEqual(weight_font.weight, FontWeight.BOLD)
|
||||
self.assertEqual(original_font.weight, FontWeight.NORMAL) # Original unchanged
|
||||
|
||||
# Test with_style
|
||||
style_font = original_font.with_style(FontStyle.ITALIC)
|
||||
self.assertEqual(style_font.style, FontStyle.ITALIC)
|
||||
self.assertEqual(original_font.style, FontStyle.NORMAL) # Original unchanged
|
||||
|
||||
# Test with_decoration
|
||||
decoration_font = original_font.with_decoration(TextDecoration.UNDERLINE)
|
||||
self.assertEqual(decoration_font.decoration, TextDecoration.UNDERLINE)
|
||||
self.assertEqual(original_font.decoration, TextDecoration.NONE) # Original unchanged
|
||||
|
||||
def test_font_property_access(self):
|
||||
"""Test Font property access methods."""
|
||||
font = Font(
|
||||
font_size=20,
|
||||
colour=(128, 128, 128),
|
||||
weight=FontWeight.BOLD,
|
||||
style=FontStyle.ITALIC,
|
||||
decoration=TextDecoration.STRIKETHROUGH
|
||||
)
|
||||
|
||||
# Test all property getters
|
||||
self.assertEqual(font.font_size, 20)
|
||||
self.assertEqual(font.colour, (128, 128, 128))
|
||||
self.assertEqual(font.color, (128, 128, 128)) # Alias
|
||||
self.assertEqual(font.weight, FontWeight.BOLD)
|
||||
self.assertEqual(font.style, FontStyle.ITALIC)
|
||||
self.assertEqual(font.decoration, TextDecoration.STRIKETHROUGH)
|
||||
|
||||
# Test that font object is accessible
|
||||
self.assertIsNotNone(font.font)
|
||||
|
||||
def test_font_immutability(self):
|
||||
"""Test that Font objects behave immutably."""
|
||||
font1 = Font(font_size=12, colour=(0, 0, 0))
|
||||
font2 = font1.with_size(16)
|
||||
font3 = font2.with_colour((255, 0, 0))
|
||||
|
||||
# Each should be different objects
|
||||
self.assertIsNot(font1, font2)
|
||||
self.assertIsNot(font2, font3)
|
||||
self.assertIsNot(font1, font3)
|
||||
|
||||
# Original properties should be unchanged
|
||||
self.assertEqual(font1.font_size, 12)
|
||||
self.assertEqual(font1.colour, (0, 0, 0))
|
||||
|
||||
self.assertEqual(font2.font_size, 16)
|
||||
self.assertEqual(font2.colour, (0, 0, 0))
|
||||
|
||||
self.assertEqual(font3.font_size, 16)
|
||||
self.assertEqual(font3.colour, (255, 0, 0))
|
||||
|
||||
def test_background_handling(self):
|
||||
"""Test background color handling."""
|
||||
# Test default transparent background
|
||||
font1 = Font()
|
||||
self.assertEqual(font1.background, (255, 255, 255, 0))
|
||||
|
||||
# Test explicit background
|
||||
font2 = Font(background=(255, 0, 0, 128))
|
||||
self.assertEqual(font2.background, (255, 0, 0, 128))
|
||||
|
||||
# Test None background becomes transparent
|
||||
font3 = Font(background=None)
|
||||
self.assertEqual(font3.background, (255, 255, 255, 0))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@ -1,247 +0,0 @@
|
||||
"""
|
||||
Unit tests for HTML text processing.
|
||||
|
||||
Tests the HTMLTextProcessor class for text buffering, entity handling, and word creation.
|
||||
"""
|
||||
|
||||
import unittest
|
||||
from unittest.mock import Mock, MagicMock
|
||||
from pyWebLayout.io.readers.html_text import HTMLTextProcessor
|
||||
from pyWebLayout.io.readers.html_style import HTMLStyleManager
|
||||
from pyWebLayout.abstract.block import Paragraph
|
||||
from pyWebLayout.abstract.inline import Word
|
||||
|
||||
|
||||
class TestHTMLTextProcessor(unittest.TestCase):
|
||||
"""Test cases for HTMLTextProcessor."""
|
||||
|
||||
def setUp(self):
|
||||
"""Set up test fixtures."""
|
||||
self.style_manager = HTMLStyleManager()
|
||||
self.text_processor = HTMLTextProcessor(self.style_manager)
|
||||
|
||||
# Create a mock paragraph
|
||||
self.mock_paragraph = Mock(spec=Paragraph)
|
||||
self.mock_paragraph.add_word = Mock()
|
||||
|
||||
def test_initialization(self):
|
||||
"""Test proper initialization of text processor."""
|
||||
self.assertEqual(self.text_processor._text_buffer, "")
|
||||
self.assertIsNone(self.text_processor._current_paragraph)
|
||||
self.assertEqual(self.text_processor._style_manager, self.style_manager)
|
||||
|
||||
def test_add_text(self):
|
||||
"""Test adding text to buffer."""
|
||||
self.text_processor.add_text("Hello")
|
||||
self.assertEqual(self.text_processor.get_buffer_content(), "Hello")
|
||||
|
||||
self.text_processor.add_text(" World")
|
||||
self.assertEqual(self.text_processor.get_buffer_content(), "Hello World")
|
||||
|
||||
def test_entity_references(self):
|
||||
"""Test HTML entity reference handling."""
|
||||
test_cases = [
|
||||
('lt', '<'),
|
||||
('gt', '>'),
|
||||
('amp', '&'),
|
||||
('quot', '"'),
|
||||
('apos', "'"),
|
||||
('nbsp', ' '),
|
||||
('copy', '©'),
|
||||
('reg', '®'),
|
||||
('trade', '™'),
|
||||
('mdash', '—'),
|
||||
('ndash', '–'),
|
||||
('hellip', '…'),
|
||||
('euro', '€'),
|
||||
('unknown', '&unknown;') # Unknown entities should be preserved
|
||||
]
|
||||
|
||||
for entity, expected in test_cases:
|
||||
with self.subTest(entity=entity):
|
||||
self.text_processor.clear_buffer()
|
||||
self.text_processor.add_entity_reference(entity)
|
||||
self.assertEqual(self.text_processor.get_buffer_content(), expected)
|
||||
|
||||
def test_character_references(self):
|
||||
"""Test character reference handling."""
|
||||
# Decimal character references
|
||||
self.text_processor.clear_buffer()
|
||||
self.text_processor.add_character_reference('65') # 'A'
|
||||
self.assertEqual(self.text_processor.get_buffer_content(), 'A')
|
||||
|
||||
# Hexadecimal character references
|
||||
self.text_processor.clear_buffer()
|
||||
self.text_processor.add_character_reference('x41') # 'A'
|
||||
self.assertEqual(self.text_processor.get_buffer_content(), 'A')
|
||||
|
||||
# Unicode character
|
||||
self.text_processor.clear_buffer()
|
||||
self.text_processor.add_character_reference('8364') # Euro symbol
|
||||
self.assertEqual(self.text_processor.get_buffer_content(), '€')
|
||||
|
||||
# Invalid character reference
|
||||
self.text_processor.clear_buffer()
|
||||
self.text_processor.add_character_reference('invalid')
|
||||
self.assertEqual(self.text_processor.get_buffer_content(), '&#invalid;')
|
||||
|
||||
# Out of range character
|
||||
self.text_processor.clear_buffer()
|
||||
self.text_processor.add_character_reference('99999999999')
|
||||
self.assertTrue(self.text_processor.get_buffer_content().startswith('&#'))
|
||||
|
||||
def test_buffer_operations(self):
|
||||
"""Test buffer state operations."""
|
||||
# Test has_pending_text
|
||||
self.assertFalse(self.text_processor.has_pending_text())
|
||||
|
||||
self.text_processor.add_text("Some text")
|
||||
self.assertTrue(self.text_processor.has_pending_text())
|
||||
|
||||
# Test clear_buffer
|
||||
self.text_processor.clear_buffer()
|
||||
self.assertFalse(self.text_processor.has_pending_text())
|
||||
self.assertEqual(self.text_processor.get_buffer_content(), "")
|
||||
|
||||
# Test with whitespace only
|
||||
self.text_processor.add_text(" \n\t ")
|
||||
self.assertFalse(self.text_processor.has_pending_text()) # Should ignore whitespace
|
||||
|
||||
def test_paragraph_management(self):
|
||||
"""Test current paragraph setting."""
|
||||
# Initially no paragraph
|
||||
self.assertIsNone(self.text_processor._current_paragraph)
|
||||
|
||||
# Set paragraph
|
||||
self.text_processor.set_current_paragraph(self.mock_paragraph)
|
||||
self.assertEqual(self.text_processor._current_paragraph, self.mock_paragraph)
|
||||
|
||||
# Clear paragraph
|
||||
self.text_processor.set_current_paragraph(None)
|
||||
self.assertIsNone(self.text_processor._current_paragraph)
|
||||
|
||||
def test_flush_text_with_paragraph(self):
|
||||
"""Test flushing text when paragraph is set."""
|
||||
self.text_processor.set_current_paragraph(self.mock_paragraph)
|
||||
self.text_processor.add_text("Hello world test")
|
||||
|
||||
# Mock the style manager to return a specific font
|
||||
mock_font = Mock()
|
||||
self.style_manager.create_font = Mock(return_value=mock_font)
|
||||
|
||||
result = self.text_processor.flush_text()
|
||||
|
||||
# Should return True (text was flushed)
|
||||
self.assertTrue(result)
|
||||
|
||||
# Should have created words
|
||||
self.assertEqual(self.mock_paragraph.add_word.call_count, 3) # "Hello", "world", "test"
|
||||
|
||||
# Verify the words were created with correct text
|
||||
calls = self.mock_paragraph.add_word.call_args_list
|
||||
word_texts = [call[0][0].text for call in calls]
|
||||
self.assertEqual(word_texts, ["Hello", "world", "test"])
|
||||
|
||||
# Buffer should be empty after flush
|
||||
self.assertEqual(self.text_processor.get_buffer_content(), "")
|
||||
|
||||
def test_flush_text_without_paragraph(self):
|
||||
"""Test flushing text when no paragraph is set."""
|
||||
self.text_processor.add_text("Hello world")
|
||||
|
||||
result = self.text_processor.flush_text()
|
||||
|
||||
# Should return False (no paragraph to flush to)
|
||||
self.assertFalse(result)
|
||||
|
||||
# Buffer should be cleared anyway
|
||||
self.assertEqual(self.text_processor.get_buffer_content(), "")
|
||||
|
||||
def test_flush_empty_buffer(self):
|
||||
"""Test flushing when buffer is empty."""
|
||||
self.text_processor.set_current_paragraph(self.mock_paragraph)
|
||||
|
||||
result = self.text_processor.flush_text()
|
||||
|
||||
# Should return False (nothing to flush)
|
||||
self.assertFalse(result)
|
||||
|
||||
# No words should be added
|
||||
self.mock_paragraph.add_word.assert_not_called()
|
||||
|
||||
def test_flush_whitespace_only(self):
|
||||
"""Test flushing when buffer contains only whitespace."""
|
||||
self.text_processor.set_current_paragraph(self.mock_paragraph)
|
||||
self.text_processor.add_text(" \n\t ")
|
||||
|
||||
result = self.text_processor.flush_text()
|
||||
|
||||
# Should return False (no meaningful content)
|
||||
self.assertFalse(result)
|
||||
|
||||
# No words should be added
|
||||
self.mock_paragraph.add_word.assert_not_called()
|
||||
|
||||
def test_word_creation_with_styling(self):
|
||||
"""Test that words are created with proper styling."""
|
||||
self.text_processor.set_current_paragraph(self.mock_paragraph)
|
||||
self.text_processor.add_text("styled text")
|
||||
|
||||
# Set up style manager to return specific font
|
||||
mock_font = Mock()
|
||||
mock_font.font_size = 16
|
||||
mock_font.weight = "bold"
|
||||
self.style_manager.create_font = Mock(return_value=mock_font)
|
||||
|
||||
self.text_processor.flush_text()
|
||||
|
||||
# Verify font was created
|
||||
self.style_manager.create_font.assert_called()
|
||||
|
||||
# Verify words were created with the font
|
||||
calls = self.mock_paragraph.add_word.call_args_list
|
||||
for call in calls:
|
||||
word = call[0][0]
|
||||
self.assertEqual(word.style, mock_font)
|
||||
|
||||
def test_reset(self):
|
||||
"""Test resetting the text processor."""
|
||||
# Set up some state
|
||||
self.text_processor.set_current_paragraph(self.mock_paragraph)
|
||||
self.text_processor.add_text("Some text")
|
||||
|
||||
# Reset
|
||||
self.text_processor.reset()
|
||||
|
||||
# Should be back to initial state
|
||||
self.assertEqual(self.text_processor._text_buffer, "")
|
||||
self.assertIsNone(self.text_processor._current_paragraph)
|
||||
|
||||
def test_complex_text_processing(self):
|
||||
"""Test processing text with mixed content."""
|
||||
self.text_processor.set_current_paragraph(self.mock_paragraph)
|
||||
|
||||
# Mock font creation
|
||||
mock_font = Mock()
|
||||
self.style_manager.create_font = Mock(return_value=mock_font)
|
||||
|
||||
# Add mixed content
|
||||
self.text_processor.add_text("Hello ")
|
||||
self.text_processor.add_entity_reference('amp')
|
||||
self.text_processor.add_text(" world")
|
||||
self.text_processor.add_character_reference('33') # '!'
|
||||
|
||||
# Should have "Hello & world!"
|
||||
expected_content = "Hello & world!"
|
||||
self.assertEqual(self.text_processor.get_buffer_content(), expected_content)
|
||||
|
||||
# Flush and verify words
|
||||
self.text_processor.flush_text()
|
||||
|
||||
calls = self.mock_paragraph.add_word.call_args_list
|
||||
word_texts = [call[0][0].text for call in calls]
|
||||
self.assertEqual(word_texts, ["Hello", "&", "world!"])
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
Loading…
x
Reference in New Issue
Block a user