This commit is contained in:
parent
ab84691278
commit
ad0ac238f3
@ -1011,14 +1011,246 @@ class Table(Block):
|
|||||||
elif section.lower() == "footer":
|
elif section.lower() == "footer":
|
||||||
self._footer_rows.append(row)
|
self._footer_rows.append(row)
|
||||||
else: # Default to body
|
else: # Default to body
|
||||||
self._rows
|
self._rows.append(row)
|
||||||
|
|
||||||
|
def create_row(self, section: str = "body", style=None) -> TableRow:
|
||||||
|
"""
|
||||||
|
Create a new table row and add it to this table.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
section: The section to add the row to ("header", "body", or "footer")
|
||||||
|
style: Optional style override. If None, inherits from table
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
The newly created TableRow object
|
||||||
|
"""
|
||||||
|
return TableRow.create_and_add_to(self, section, style)
|
||||||
|
|
||||||
|
def header_rows(self) -> Iterator[TableRow]:
|
||||||
|
"""
|
||||||
|
Iterate over the header rows in this table.
|
||||||
|
|
||||||
|
Yields:
|
||||||
|
Each TableRow in the header section
|
||||||
|
"""
|
||||||
|
for row in self._header_rows:
|
||||||
|
yield row
|
||||||
|
|
||||||
|
def body_rows(self) -> Iterator[TableRow]:
|
||||||
|
"""
|
||||||
|
Iterate over the body rows in this table.
|
||||||
|
|
||||||
|
Yields:
|
||||||
|
Each TableRow in the body section
|
||||||
|
"""
|
||||||
|
for row in self._rows:
|
||||||
|
yield row
|
||||||
|
|
||||||
|
def footer_rows(self) -> Iterator[TableRow]:
|
||||||
|
"""
|
||||||
|
Iterate over the footer rows in this table.
|
||||||
|
|
||||||
|
Yields:
|
||||||
|
Each TableRow in the footer section
|
||||||
|
"""
|
||||||
|
for row in self._footer_rows:
|
||||||
|
yield row
|
||||||
|
|
||||||
|
def all_rows(self) -> Iterator[Tuple[str, TableRow]]:
|
||||||
|
"""
|
||||||
|
Iterate over all rows in this table with their section labels.
|
||||||
|
|
||||||
|
Yields:
|
||||||
|
Tuples of (section, row) for each row in the table
|
||||||
|
"""
|
||||||
|
for row in self._header_rows:
|
||||||
|
yield ("header", row)
|
||||||
|
for row in self._rows:
|
||||||
|
yield ("body", row)
|
||||||
|
for row in self._footer_rows:
|
||||||
|
yield ("footer", row)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def row_count(self) -> Dict[str, int]:
|
||||||
|
"""Get the row counts by section"""
|
||||||
|
return {
|
||||||
|
"header": len(self._header_rows),
|
||||||
|
"body": len(self._rows),
|
||||||
|
"footer": len(self._footer_rows),
|
||||||
|
"total": len(self._header_rows) + len(self._rows) + len(self._footer_rows)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class Image(Block):
|
||||||
|
"""
|
||||||
|
An image element with source, dimensions, and alternative text.
|
||||||
|
"""
|
||||||
|
|
||||||
class Image:
|
def __init__(self, source: str = "", alt_text: str = "", width: Optional[int] = None, height: Optional[int] = None):
|
||||||
|
"""
|
||||||
|
Initialize an image element.
|
||||||
|
|
||||||
pass
|
Args:
|
||||||
|
source: The image source URL or path
|
||||||
|
alt_text: Alternative text for accessibility
|
||||||
|
width: Optional image width in pixels
|
||||||
|
height: Optional image height in pixels
|
||||||
|
"""
|
||||||
|
super().__init__(BlockType.IMAGE)
|
||||||
|
self._source = source
|
||||||
|
self._alt_text = alt_text
|
||||||
|
self._width = width
|
||||||
|
self._height = height
|
||||||
|
|
||||||
class HorizontalRule:
|
@classmethod
|
||||||
|
def create_and_add_to(cls, container, source: str = "", alt_text: str = "",
|
||||||
|
width: Optional[int] = None, height: Optional[int] = None) -> 'Image':
|
||||||
|
"""
|
||||||
|
Create a new Image and add it to a container.
|
||||||
|
|
||||||
pass
|
Args:
|
||||||
|
container: The container to add the image to (must have add_block method)
|
||||||
|
source: The image source URL or path
|
||||||
|
alt_text: Alternative text for accessibility
|
||||||
|
width: Optional image width in pixels
|
||||||
|
height: Optional image height in pixels
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
The newly created Image object
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
AttributeError: If the container doesn't have the required add_block method
|
||||||
|
"""
|
||||||
|
# Create the new image
|
||||||
|
image = cls(source, alt_text, width, height)
|
||||||
|
|
||||||
|
# Add the image to the container
|
||||||
|
if hasattr(container, 'add_block'):
|
||||||
|
container.add_block(image)
|
||||||
|
else:
|
||||||
|
raise AttributeError(f"Container {type(container).__name__} must have an 'add_block' method")
|
||||||
|
|
||||||
|
return image
|
||||||
|
|
||||||
|
@property
|
||||||
|
def source(self) -> str:
|
||||||
|
"""Get the image source"""
|
||||||
|
return self._source
|
||||||
|
|
||||||
|
@source.setter
|
||||||
|
def source(self, source: str):
|
||||||
|
"""Set the image source"""
|
||||||
|
self._source = source
|
||||||
|
|
||||||
|
@property
|
||||||
|
def alt_text(self) -> str:
|
||||||
|
"""Get the alternative text"""
|
||||||
|
return self._alt_text
|
||||||
|
|
||||||
|
@alt_text.setter
|
||||||
|
def alt_text(self, alt_text: str):
|
||||||
|
"""Set the alternative text"""
|
||||||
|
self._alt_text = alt_text
|
||||||
|
|
||||||
|
@property
|
||||||
|
def width(self) -> Optional[int]:
|
||||||
|
"""Get the image width"""
|
||||||
|
return self._width
|
||||||
|
|
||||||
|
@width.setter
|
||||||
|
def width(self, width: Optional[int]):
|
||||||
|
"""Set the image width"""
|
||||||
|
self._width = width
|
||||||
|
|
||||||
|
@property
|
||||||
|
def height(self) -> Optional[int]:
|
||||||
|
"""Get the image height"""
|
||||||
|
return self._height
|
||||||
|
|
||||||
|
@height.setter
|
||||||
|
def height(self, height: Optional[int]):
|
||||||
|
"""Set the image height"""
|
||||||
|
self._height = height
|
||||||
|
|
||||||
|
def get_dimensions(self) -> Tuple[Optional[int], Optional[int]]:
|
||||||
|
"""
|
||||||
|
Get the image dimensions as a tuple.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple of (width, height)
|
||||||
|
"""
|
||||||
|
return (self._width, self._height)
|
||||||
|
|
||||||
|
def get_aspect_ratio(self) -> Optional[float]:
|
||||||
|
"""
|
||||||
|
Calculate the aspect ratio of the image.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
The aspect ratio (width/height) or None if either dimension is missing
|
||||||
|
"""
|
||||||
|
if self._width is not None and self._height is not None and self._height > 0:
|
||||||
|
return self._width / self._height
|
||||||
|
return None
|
||||||
|
|
||||||
|
def calculate_scaled_dimensions(self, max_width: Optional[int] = None,
|
||||||
|
max_height: Optional[int] = None) -> Tuple[Optional[int], Optional[int]]:
|
||||||
|
"""
|
||||||
|
Calculate scaled dimensions that fit within the given constraints.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
max_width: Maximum allowed width
|
||||||
|
max_height: Maximum allowed height
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple of (scaled_width, scaled_height)
|
||||||
|
"""
|
||||||
|
if self._width is None or self._height is None:
|
||||||
|
return (self._width, self._height)
|
||||||
|
|
||||||
|
width, height = self._width, self._height
|
||||||
|
|
||||||
|
# Scale down if needed
|
||||||
|
if max_width is not None and width > max_width:
|
||||||
|
height = int(height * max_width / width)
|
||||||
|
width = max_width
|
||||||
|
|
||||||
|
if max_height is not None and height > max_height:
|
||||||
|
width = int(width * max_height / height)
|
||||||
|
height = max_height
|
||||||
|
|
||||||
|
return (width, height)
|
||||||
|
|
||||||
|
|
||||||
|
class HorizontalRule(Block):
|
||||||
|
"""
|
||||||
|
A horizontal rule element (hr tag).
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
"""Initialize a horizontal rule element."""
|
||||||
|
super().__init__(BlockType.HORIZONTAL_RULE)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def create_and_add_to(cls, container) -> 'HorizontalRule':
|
||||||
|
"""
|
||||||
|
Create a new HorizontalRule and add it to a container.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
container: The container to add the horizontal rule to (must have add_block method)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
The newly created HorizontalRule object
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
AttributeError: If the container doesn't have the required add_block method
|
||||||
|
"""
|
||||||
|
# Create the new horizontal rule
|
||||||
|
hr = cls()
|
||||||
|
|
||||||
|
# Add the horizontal rule to the container
|
||||||
|
if hasattr(container, 'add_block'):
|
||||||
|
container.add_block(hr)
|
||||||
|
else:
|
||||||
|
raise AttributeError(f"Container {type(container).__name__} must have an 'add_block' method")
|
||||||
|
|
||||||
|
return hr
|
||||||
|
|||||||
@ -124,6 +124,11 @@ class Button(Interactable):
|
|||||||
"""Enable or disable the button"""
|
"""Enable or disable the button"""
|
||||||
self._enabled = enabled
|
self._enabled = enabled
|
||||||
|
|
||||||
|
@property
|
||||||
|
def params(self) -> Dict[str, Any]:
|
||||||
|
"""Get the button parameters"""
|
||||||
|
return self._params
|
||||||
|
|
||||||
def execute(self) -> Any:
|
def execute(self) -> Any:
|
||||||
"""
|
"""
|
||||||
Execute the button's callback function if the button is enabled.
|
Execute the button's callback function if the button is enabled.
|
||||||
|
|||||||
@ -2,6 +2,7 @@ from __future__ import annotations
|
|||||||
from pyWebLayout.base import Queriable
|
from pyWebLayout.base import Queriable
|
||||||
from pyWebLayout.style import Font
|
from pyWebLayout.style import Font
|
||||||
from typing import Tuple, Union, List, Optional, Dict
|
from typing import Tuple, Union, List, Optional, Dict
|
||||||
|
import pyphen
|
||||||
|
|
||||||
|
|
||||||
class Word:
|
class Word:
|
||||||
@ -157,9 +158,6 @@ class Word:
|
|||||||
Returns:
|
Returns:
|
||||||
bool: True if the word can be hyphenated, False otherwise.
|
bool: True if the word can be hyphenated, False otherwise.
|
||||||
"""
|
"""
|
||||||
# Only import pyphen when needed
|
|
||||||
import pyphen
|
|
||||||
|
|
||||||
# Use the provided language or fall back to style language
|
# Use the provided language or fall back to style language
|
||||||
lang = language if language else self._style.language
|
lang = language if language else self._style.language
|
||||||
dic = pyphen.Pyphen(lang=lang)
|
dic = pyphen.Pyphen(lang=lang)
|
||||||
@ -178,9 +176,6 @@ class Word:
|
|||||||
Returns:
|
Returns:
|
||||||
bool: True if the word was hyphenated, False otherwise.
|
bool: True if the word was hyphenated, False otherwise.
|
||||||
"""
|
"""
|
||||||
# Only import pyphen when needed
|
|
||||||
import pyphen
|
|
||||||
|
|
||||||
# Use the provided language or fall back to style language
|
# Use the provided language or fall back to style language
|
||||||
lang = language if language else self._style.language
|
lang = language if language else self._style.language
|
||||||
dic = pyphen.Pyphen(lang=lang)
|
dic = pyphen.Pyphen(lang=lang)
|
||||||
@ -333,5 +328,58 @@ class FormattedSpan:
|
|||||||
|
|
||||||
|
|
||||||
class LineBreak:
|
class LineBreak:
|
||||||
|
"""
|
||||||
|
A line break element that forces a new line within text content.
|
||||||
|
While this is an inline element that can occur within paragraphs,
|
||||||
|
it has block-like properties for consistency with the abstract model.
|
||||||
|
"""
|
||||||
|
|
||||||
pass
|
def __init__(self):
|
||||||
|
"""Initialize a line break element."""
|
||||||
|
# Import here to avoid circular imports
|
||||||
|
from .block import BlockType
|
||||||
|
self._block_type = BlockType.LINE_BREAK
|
||||||
|
self._parent = None
|
||||||
|
|
||||||
|
@property
|
||||||
|
def block_type(self):
|
||||||
|
"""Get the block type for this line break"""
|
||||||
|
return self._block_type
|
||||||
|
|
||||||
|
@property
|
||||||
|
def parent(self):
|
||||||
|
"""Get the parent element containing this line break, if any"""
|
||||||
|
return self._parent
|
||||||
|
|
||||||
|
@parent.setter
|
||||||
|
def parent(self, parent):
|
||||||
|
"""Set the parent element"""
|
||||||
|
self._parent = parent
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def create_and_add_to(cls, container) -> 'LineBreak':
|
||||||
|
"""
|
||||||
|
Create a new LineBreak and add it to a container.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
container: The container to add the line break to
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
The newly created LineBreak object
|
||||||
|
"""
|
||||||
|
# Create the new line break
|
||||||
|
line_break = cls()
|
||||||
|
|
||||||
|
# Add the line break to the container if it has an appropriate method
|
||||||
|
if hasattr(container, 'add_line_break'):
|
||||||
|
container.add_line_break(line_break)
|
||||||
|
elif hasattr(container, 'add_element'):
|
||||||
|
container.add_element(line_break)
|
||||||
|
elif hasattr(container, 'add_word'):
|
||||||
|
# Some containers might treat line breaks like words
|
||||||
|
container.add_word(line_break)
|
||||||
|
else:
|
||||||
|
# Set parent relationship manually
|
||||||
|
line_break.parent = container
|
||||||
|
|
||||||
|
return line_break
|
||||||
|
|||||||
@ -21,9 +21,11 @@ from pyWebLayout.io.readers.base import BaseReader, MetadataReader, ContentReade
|
|||||||
|
|
||||||
# Specialized HTML readers
|
# Specialized HTML readers
|
||||||
from pyWebLayout.io.readers.html_metadata import HTMLMetadataReader
|
from pyWebLayout.io.readers.html_metadata import HTMLMetadataReader
|
||||||
from pyWebLayout.io.readers.html_content import HTMLContentReader
|
|
||||||
from pyWebLayout.io.readers.html_resources import HTMLResourceReader
|
from pyWebLayout.io.readers.html_resources import HTMLResourceReader
|
||||||
|
|
||||||
|
# HTML extraction parser (the best approach)
|
||||||
|
from pyWebLayout.io.readers.html_extraction import parse_html_string as parse_html_extraction
|
||||||
|
|
||||||
# Specialized EPUB readers
|
# Specialized EPUB readers
|
||||||
from pyWebLayout.io.readers.epub_metadata import EPUBMetadataReader
|
from pyWebLayout.io.readers.epub_metadata import EPUBMetadataReader
|
||||||
|
|
||||||
|
|||||||
@ -11,13 +11,8 @@ from .base import BaseReader, MetadataReader, ContentReader, ResourceReader, Com
|
|||||||
# HTML readers (decomposed)
|
# HTML readers (decomposed)
|
||||||
from .html import HTMLReader, read_html, read_html_file, parse_html_string
|
from .html import HTMLReader, read_html, read_html_file, parse_html_string
|
||||||
from .html_metadata import HTMLMetadataReader
|
from .html_metadata import HTMLMetadataReader
|
||||||
from .html_content import HTMLContentReader
|
|
||||||
from .html_resources import HTMLResourceReader
|
from .html_resources import HTMLResourceReader
|
||||||
|
|
||||||
# HTML processing components (supporting modules)
|
|
||||||
from .html_style import HTMLStyleManager
|
|
||||||
from .html_text import HTMLTextProcessor
|
|
||||||
from .html_elements import BlockElementHandler, ListElementHandler, TableElementHandler, InlineElementHandler
|
|
||||||
|
|
||||||
# EPUB readers
|
# EPUB readers
|
||||||
from .epub_reader import read_epub # Legacy
|
from .epub_reader import read_epub # Legacy
|
||||||
@ -29,7 +24,7 @@ __all__ = [
|
|||||||
|
|
||||||
# HTML readers
|
# HTML readers
|
||||||
'HTMLReader', 'read_html', 'read_html_file', 'parse_html_string',
|
'HTMLReader', 'read_html', 'read_html_file', 'parse_html_string',
|
||||||
'HTMLMetadataReader', 'HTMLContentReader', 'HTMLResourceReader',
|
'HTMLMetadataReader', 'HTMLResourceReader',
|
||||||
|
|
||||||
# EPUB readers
|
# EPUB readers
|
||||||
'read_epub', 'EPUBMetadataReader',
|
'read_epub', 'EPUBMetadataReader',
|
||||||
|
|||||||
@ -1,36 +1,33 @@
|
|||||||
"""
|
"""
|
||||||
Modern HTML reader for pyWebLayout.
|
Modern HTML reader for pyWebLayout.
|
||||||
|
|
||||||
This module provides a decomposed HTML reader that uses specialized
|
This module provides an HTML reader that uses the html_extraction module
|
||||||
readers for metadata, content, and resources, following the pattern
|
for clean, handler-based parsing using BeautifulSoup.
|
||||||
established in the abstract module.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
from typing import Union, Optional
|
from typing import Union, Optional
|
||||||
from pyWebLayout.abstract.document import Document
|
from pyWebLayout.abstract.document import Document
|
||||||
from pyWebLayout.io.readers.base import CompositeReader
|
from pyWebLayout.io.readers.base import BaseReader
|
||||||
from pyWebLayout.io.readers.html_metadata import HTMLMetadataReader
|
from pyWebLayout.io.readers.html_metadata import HTMLMetadataReader
|
||||||
from pyWebLayout.io.readers.html_content import HTMLContentReader
|
|
||||||
from pyWebLayout.io.readers.html_resources import HTMLResourceReader
|
from pyWebLayout.io.readers.html_resources import HTMLResourceReader
|
||||||
|
from pyWebLayout.io.readers.html_extraction import parse_html_string
|
||||||
|
from pyWebLayout.style import Font
|
||||||
|
|
||||||
|
|
||||||
class HTMLReader(CompositeReader):
|
class HTMLReader(BaseReader):
|
||||||
"""
|
"""
|
||||||
Modern HTML reader using decomposed architecture.
|
Modern HTML reader using the html_extraction parser.
|
||||||
|
|
||||||
This reader combines specialized readers for metadata, content,
|
This reader uses the clean, handler-based architecture from html_extraction.py
|
||||||
and resources to provide a complete HTML parsing solution.
|
for parsing HTML content into pyWebLayout's abstract document structure.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
"""Initialize the HTML reader with all specialized readers."""
|
"""Initialize the HTML reader."""
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
self._metadata_reader = HTMLMetadataReader()
|
||||||
# Set up specialized readers
|
self._resource_reader = HTMLResourceReader()
|
||||||
self.set_metadata_reader(HTMLMetadataReader())
|
|
||||||
self.set_content_reader(HTMLContentReader())
|
|
||||||
self.set_resource_reader(HTMLResourceReader())
|
|
||||||
|
|
||||||
def can_read(self, source: Union[str, bytes]) -> bool:
|
def can_read(self, source: Union[str, bytes]) -> bool:
|
||||||
"""
|
"""
|
||||||
@ -76,6 +73,7 @@ class HTMLReader(CompositeReader):
|
|||||||
- encoding: Character encoding (default: 'utf-8')
|
- encoding: Character encoding (default: 'utf-8')
|
||||||
- extract_metadata: Whether to extract metadata (default: True)
|
- extract_metadata: Whether to extract metadata (default: True)
|
||||||
- extract_resources: Whether to extract resources (default: True)
|
- extract_resources: Whether to extract resources (default: True)
|
||||||
|
- base_font: Base font for styling (default: None)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
The parsed Document
|
The parsed Document
|
||||||
@ -85,6 +83,7 @@ class HTMLReader(CompositeReader):
|
|||||||
encoding = options.get('encoding', 'utf-8')
|
encoding = options.get('encoding', 'utf-8')
|
||||||
extract_metadata = options.get('extract_metadata', True)
|
extract_metadata = options.get('extract_metadata', True)
|
||||||
extract_resources = options.get('extract_resources', True)
|
extract_resources = options.get('extract_resources', True)
|
||||||
|
base_font = options.get('base_font')
|
||||||
|
|
||||||
# Read the HTML content
|
# Read the HTML content
|
||||||
html_content = self._read_html_content(source, encoding)
|
html_content = self._read_html_content(source, encoding)
|
||||||
@ -93,10 +92,6 @@ class HTMLReader(CompositeReader):
|
|||||||
if not base_url and isinstance(source, str) and os.path.isfile(source):
|
if not base_url and isinstance(source, str) and os.path.isfile(source):
|
||||||
base_url = f"file://{os.path.dirname(os.path.abspath(source))}/"
|
base_url = f"file://{os.path.dirname(os.path.abspath(source))}/"
|
||||||
|
|
||||||
# Set base URL in content reader
|
|
||||||
if self._content_reader and hasattr(self._content_reader, 'set_base_url'):
|
|
||||||
self._content_reader.set_base_url(base_url)
|
|
||||||
|
|
||||||
# Create a new document
|
# Create a new document
|
||||||
document = Document()
|
document = Document()
|
||||||
|
|
||||||
@ -104,9 +99,10 @@ class HTMLReader(CompositeReader):
|
|||||||
if extract_metadata and self._metadata_reader:
|
if extract_metadata and self._metadata_reader:
|
||||||
self._metadata_reader.extract_metadata(html_content, document)
|
self._metadata_reader.extract_metadata(html_content, document)
|
||||||
|
|
||||||
# Extract content
|
# Parse content using html_extraction
|
||||||
if self._content_reader:
|
blocks = parse_html_string(html_content, base_font)
|
||||||
self._content_reader.extract_content(html_content, document)
|
for block in blocks:
|
||||||
|
document.add_block(block)
|
||||||
|
|
||||||
# Extract resources if enabled
|
# Extract resources if enabled
|
||||||
if extract_resources and self._resource_reader:
|
if extract_resources and self._resource_reader:
|
||||||
|
|||||||
@ -1,269 +0,0 @@
|
|||||||
"""
|
|
||||||
Modern HTML content reader for pyWebLayout.
|
|
||||||
|
|
||||||
This module provides a decomposed HTML content reader that uses specialized
|
|
||||||
handlers and managers for different aspects of HTML parsing.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from html.parser import HTMLParser as BaseHTMLParser
|
|
||||||
from typing import Dict, List, Optional, Tuple, Union, Any
|
|
||||||
from pyWebLayout.abstract.document import Document
|
|
||||||
from pyWebLayout.io.readers.base import ContentReader
|
|
||||||
from pyWebLayout.io.readers.html_style import HTMLStyleManager
|
|
||||||
from pyWebLayout.io.readers.html_text import HTMLTextProcessor
|
|
||||||
from pyWebLayout.io.readers.html_elements import (
|
|
||||||
BlockElementHandler, ListElementHandler, TableElementHandler, InlineElementHandler
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class HTMLContentReader(ContentReader, BaseHTMLParser):
|
|
||||||
"""
|
|
||||||
Modern HTML content reader using decomposed architecture.
|
|
||||||
|
|
||||||
This class orchestrates specialized handlers to parse HTML content
|
|
||||||
and convert it to pyWebLayout's abstract document model.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
"""Initialize the HTML content reader."""
|
|
||||||
BaseHTMLParser.__init__(self)
|
|
||||||
|
|
||||||
# Initialize managers and processors
|
|
||||||
self.style_manager = HTMLStyleManager()
|
|
||||||
self.text_processor = HTMLTextProcessor(self.style_manager)
|
|
||||||
|
|
||||||
# Initialize element handlers
|
|
||||||
self.block_handler = BlockElementHandler(self.style_manager, self.text_processor)
|
|
||||||
self.list_handler = ListElementHandler(self.text_processor)
|
|
||||||
self.table_handler = TableElementHandler(self.text_processor)
|
|
||||||
self.inline_handler = InlineElementHandler(self.text_processor)
|
|
||||||
|
|
||||||
# Document and parsing state
|
|
||||||
self._document: Optional[Document] = None
|
|
||||||
self._in_head = False
|
|
||||||
self._in_script = False
|
|
||||||
self._in_style = False
|
|
||||||
|
|
||||||
def extract_content(self, html_content: str, document: Document) -> Any:
|
|
||||||
"""
|
|
||||||
Extract content from HTML.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
html_content: The HTML content to parse
|
|
||||||
document: The document to populate with content
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
The document with populated content
|
|
||||||
"""
|
|
||||||
self._document = document
|
|
||||||
self._reset_state()
|
|
||||||
|
|
||||||
# Parse the HTML content
|
|
||||||
self.feed(html_content)
|
|
||||||
|
|
||||||
# Flush any remaining text
|
|
||||||
self.text_processor.flush_text()
|
|
||||||
|
|
||||||
return document
|
|
||||||
|
|
||||||
def set_base_url(self, base_url: str):
|
|
||||||
"""Set the base URL for resolving relative links."""
|
|
||||||
self.inline_handler.set_base_url(base_url)
|
|
||||||
|
|
||||||
def _reset_state(self):
|
|
||||||
"""Reset all parser state for new content."""
|
|
||||||
# Reset managers and processors
|
|
||||||
self.style_manager.reset()
|
|
||||||
self.text_processor.reset()
|
|
||||||
|
|
||||||
# Reset element handlers
|
|
||||||
self.block_handler.reset()
|
|
||||||
self.list_handler.reset()
|
|
||||||
self.table_handler.reset()
|
|
||||||
self.inline_handler.reset()
|
|
||||||
|
|
||||||
# Reset parser flags
|
|
||||||
self._in_head = False
|
|
||||||
self._in_script = False
|
|
||||||
self._in_style = False
|
|
||||||
|
|
||||||
def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]):
|
|
||||||
"""Handle the start of an HTML tag."""
|
|
||||||
tag = tag.lower()
|
|
||||||
attrs_dict = dict(attrs)
|
|
||||||
|
|
||||||
# Skip content in head, script, style (except body)
|
|
||||||
if self._should_skip_content(tag):
|
|
||||||
return
|
|
||||||
|
|
||||||
# Handle special section markers
|
|
||||||
if self._handle_special_sections_start(tag):
|
|
||||||
return
|
|
||||||
|
|
||||||
# Apply styles for this element
|
|
||||||
style = self.style_manager.apply_style_to_element(tag, attrs_dict)
|
|
||||||
self.style_manager.push_style(style)
|
|
||||||
|
|
||||||
# Delegate to appropriate handler
|
|
||||||
self._delegate_start_tag(tag, attrs_dict)
|
|
||||||
|
|
||||||
def handle_endtag(self, tag: str):
|
|
||||||
"""Handle the end of an HTML tag."""
|
|
||||||
tag = tag.lower()
|
|
||||||
|
|
||||||
# Handle special section markers
|
|
||||||
if self._handle_special_sections_end(tag):
|
|
||||||
return
|
|
||||||
|
|
||||||
# Skip content in head, script, style
|
|
||||||
if self._in_head or self._in_script or self._in_style:
|
|
||||||
return
|
|
||||||
|
|
||||||
# Flush any accumulated text
|
|
||||||
self.text_processor.flush_text()
|
|
||||||
|
|
||||||
# Delegate to appropriate handler
|
|
||||||
self._delegate_end_tag(tag)
|
|
||||||
|
|
||||||
# Pop style regardless of tag
|
|
||||||
self.style_manager.pop_style()
|
|
||||||
|
|
||||||
def handle_data(self, data: str):
|
|
||||||
"""Handle text data."""
|
|
||||||
if self._in_head or self._in_script or self._in_style:
|
|
||||||
return
|
|
||||||
|
|
||||||
self.text_processor.add_text(data)
|
|
||||||
|
|
||||||
def handle_entityref(self, name: str):
|
|
||||||
"""Handle an HTML entity reference."""
|
|
||||||
if self._in_head or self._in_script or self._in_style:
|
|
||||||
return
|
|
||||||
|
|
||||||
self.text_processor.add_entity_reference(name)
|
|
||||||
|
|
||||||
def handle_charref(self, name: str):
|
|
||||||
"""Handle a character reference."""
|
|
||||||
if self._in_head or self._in_script or self._in_style:
|
|
||||||
return
|
|
||||||
|
|
||||||
self.text_processor.add_character_reference(name)
|
|
||||||
|
|
||||||
def _should_skip_content(self, tag: str) -> bool:
|
|
||||||
"""Check if we should skip content based on current state."""
|
|
||||||
if self._in_head or self._in_script or self._in_style:
|
|
||||||
if tag in ('head', 'script', 'style'):
|
|
||||||
return False # Let special section handlers deal with these
|
|
||||||
if tag != 'body':
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
def _handle_special_sections_start(self, tag: str) -> bool:
|
|
||||||
"""Handle special section start tags. Returns True if handled."""
|
|
||||||
if tag == 'head':
|
|
||||||
self._in_head = True
|
|
||||||
return True
|
|
||||||
elif tag == 'body':
|
|
||||||
self._in_head = False
|
|
||||||
return True
|
|
||||||
elif tag == 'script':
|
|
||||||
self._in_script = True
|
|
||||||
return True
|
|
||||||
elif tag == 'style':
|
|
||||||
self._in_style = True
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
def _handle_special_sections_end(self, tag: str) -> bool:
|
|
||||||
"""Handle special section end tags. Returns True if handled."""
|
|
||||||
if tag == 'head':
|
|
||||||
self._in_head = False
|
|
||||||
self.style_manager.pop_style()
|
|
||||||
return True
|
|
||||||
elif tag == 'script':
|
|
||||||
self._in_script = False
|
|
||||||
self.style_manager.pop_style()
|
|
||||||
return True
|
|
||||||
elif tag == 'style':
|
|
||||||
self._in_style = False
|
|
||||||
self.style_manager.pop_style()
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
def _delegate_start_tag(self, tag: str, attrs: Dict[str, str]):
|
|
||||||
"""Delegate start tag handling to appropriate handler."""
|
|
||||||
# Block elements
|
|
||||||
if tag == 'p':
|
|
||||||
self.block_handler.handle_paragraph_start(self._document)
|
|
||||||
elif tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
|
|
||||||
self.block_handler.handle_heading_start(tag, self._document)
|
|
||||||
elif tag == 'div':
|
|
||||||
self.block_handler.handle_div_start(self._document)
|
|
||||||
elif tag == 'blockquote':
|
|
||||||
self.block_handler.handle_blockquote_start(self._document)
|
|
||||||
elif tag == 'pre':
|
|
||||||
self.block_handler.handle_pre_start(self._document)
|
|
||||||
elif tag == 'code':
|
|
||||||
self.block_handler.handle_code_start(attrs, self._document)
|
|
||||||
|
|
||||||
# List elements
|
|
||||||
elif tag in ('ul', 'ol', 'dl'):
|
|
||||||
self.list_handler.handle_list_start(tag, self.block_handler, self._document)
|
|
||||||
elif tag == 'li':
|
|
||||||
self.list_handler.handle_list_item_start(self.block_handler)
|
|
||||||
elif tag in ('dt', 'dd'):
|
|
||||||
self.list_handler.handle_definition_start(tag, self.block_handler)
|
|
||||||
|
|
||||||
# Table elements
|
|
||||||
elif tag == 'table':
|
|
||||||
self.table_handler.handle_table_start(attrs, self.block_handler, self._document)
|
|
||||||
elif tag in ('thead', 'tbody', 'tfoot'):
|
|
||||||
self.table_handler.handle_table_section_start(tag)
|
|
||||||
elif tag == 'tr':
|
|
||||||
self.table_handler.handle_table_row_start()
|
|
||||||
elif tag in ('td', 'th'):
|
|
||||||
self.table_handler.handle_table_cell_start(tag, attrs, self.block_handler)
|
|
||||||
|
|
||||||
# Inline elements
|
|
||||||
elif tag == 'a':
|
|
||||||
self.inline_handler.handle_link_start(attrs)
|
|
||||||
elif tag == 'img':
|
|
||||||
self.inline_handler.handle_image(attrs, self.block_handler, self._document)
|
|
||||||
elif tag == 'br':
|
|
||||||
self.inline_handler.handle_line_break(self.block_handler)
|
|
||||||
elif tag == 'hr':
|
|
||||||
self.inline_handler.handle_horizontal_rule(self.block_handler, self._document)
|
|
||||||
|
|
||||||
# Style-only elements (no special handling needed, just styling)
|
|
||||||
elif tag in ('b', 'strong', 'i', 'em', 'u', 'span'):
|
|
||||||
pass # Styles are already applied by style manager
|
|
||||||
|
|
||||||
def _delegate_end_tag(self, tag: str):
|
|
||||||
"""Delegate end tag handling to appropriate handler."""
|
|
||||||
# Block elements
|
|
||||||
if tag in ('p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'blockquote', 'pre', 'code'):
|
|
||||||
self.block_handler.handle_block_end()
|
|
||||||
|
|
||||||
# List elements
|
|
||||||
elif tag in ('ul', 'ol', 'dl'):
|
|
||||||
self.list_handler.handle_list_end(self.block_handler)
|
|
||||||
elif tag in ('li', 'dt', 'dd'):
|
|
||||||
self.list_handler.handle_list_item_end(self.block_handler)
|
|
||||||
|
|
||||||
# Table elements
|
|
||||||
elif tag == 'table':
|
|
||||||
self.table_handler.handle_table_end(self.block_handler)
|
|
||||||
elif tag in ('thead', 'tbody', 'tfoot'):
|
|
||||||
self.table_handler.handle_table_section_end()
|
|
||||||
elif tag == 'tr':
|
|
||||||
self.table_handler.handle_table_row_end()
|
|
||||||
elif tag in ('td', 'th'):
|
|
||||||
self.table_handler.handle_table_cell_end(self.block_handler)
|
|
||||||
|
|
||||||
# Inline elements
|
|
||||||
elif tag == 'a':
|
|
||||||
self.inline_handler.handle_link_end()
|
|
||||||
|
|
||||||
# Style-only elements (no special handling needed)
|
|
||||||
elif tag in ('b', 'strong', 'i', 'em', 'u', 'span'):
|
|
||||||
pass # Styles are handled by style manager
|
|
||||||
@ -1,473 +0,0 @@
|
|||||||
"""
|
|
||||||
HTML element handlers for pyWebLayout.
|
|
||||||
|
|
||||||
This module provides specialized handlers for different types of HTML elements,
|
|
||||||
using composition and delegation to handle specific element types.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from typing import Dict, List, Optional, Any
|
|
||||||
import urllib.parse
|
|
||||||
from pyWebLayout.abstract.document import Document
|
|
||||||
from pyWebLayout.abstract.block import (
|
|
||||||
Block, Paragraph, Heading, HeadingLevel, Quote, CodeBlock,
|
|
||||||
HList, ListStyle, ListItem, Table, TableRow, TableCell,
|
|
||||||
HorizontalRule, Image
|
|
||||||
)
|
|
||||||
from pyWebLayout.abstract.inline import LineBreak
|
|
||||||
from pyWebLayout.abstract.functional import Link, LinkType
|
|
||||||
from pyWebLayout.io.readers.html_style import HTMLStyleManager
|
|
||||||
from pyWebLayout.io.readers.html_text import HTMLTextProcessor
|
|
||||||
|
|
||||||
|
|
||||||
class BlockElementHandler:
    """Handles block-level HTML elements like paragraphs, headings, divs.

    The handler keeps ``block_stack`` (the blocks that are currently open),
    ``current_block`` (the top of that stack, or None) and
    ``current_paragraph`` (the paragraph that should receive text, or None).
    Every ``handle_*_start`` pushes onto the stack and ``handle_block_end``
    pops one entry, so start and end calls must stay balanced.
    """

    def __init__(self, style_manager: HTMLStyleManager, text_processor: HTMLTextProcessor):
        """Store the collaborators and start with no open blocks.

        Args:
            style_manager: Stored on the instance; not used by the methods of
                this class.
            text_processor: Receives ``flush_text`` and
                ``set_current_paragraph`` calls as blocks open and close.
        """
        self.style_manager = style_manager
        self.text_processor = text_processor
        self.block_stack: List[Block] = []
        self.current_block: Optional[Block] = None
        self.current_paragraph: Optional[Paragraph] = None
        # Paragraphs created for <pre>, tracked by identity so that
        # handle_code_start can tell <pre><code> apart from inline <code>.
        self._pre_paragraphs: List[Paragraph] = []

    def reset(self):
        """Reset the handler state."""
        self.block_stack = []
        self.current_block = None
        self.current_paragraph = None
        self._pre_paragraphs = []

    def add_block_to_document_or_parent(self, block: Block, document: Document):
        """Add a block to the current parent block, or to the document.

        The block goes to ``current_block`` when one is open and exposes
        ``add_block``; otherwise it is added to ``document``.
        """
        parent = self.current_block
        # Compare against None explicitly: a container that happens to be
        # falsy (for example an empty one defining __len__) is still a parent.
        if parent is not None and hasattr(parent, 'add_block'):
            parent.add_block(block)
        else:
            document.add_block(block)

    def _open_block(self, block: Block, document: Document, paragraph: Optional[Paragraph]):
        """Attach ``block``, push it, and make ``paragraph`` the text target."""
        self.add_block_to_document_or_parent(block, document)
        self.block_stack.append(block)
        self.current_block = block
        self.current_paragraph = paragraph
        self.text_processor.set_current_paragraph(paragraph)

    def handle_paragraph_start(self, document: Document):
        """Handle the start of a paragraph element."""
        self.text_processor.flush_text()
        paragraph = Paragraph()
        self._open_block(paragraph, document, paragraph)

    def handle_heading_start(self, tag: str, document: Document):
        """Handle the start of a heading element.

        Args:
            tag: One of 'h1' .. 'h6'.
            document: Fallback container when no parent block is open.

        Raises:
            KeyError: If ``tag`` is not one of the six heading tags.
        """
        self.text_processor.flush_text()

        level_map = {
            'h1': HeadingLevel.H1, 'h2': HeadingLevel.H2, 'h3': HeadingLevel.H3,
            'h4': HeadingLevel.H4, 'h5': HeadingLevel.H5, 'h6': HeadingLevel.H6
        }

        heading = Heading(level=level_map[tag])
        # The heading doubles as the text target (Heading inherits from Paragraph).
        self._open_block(heading, document, heading)

    def handle_div_start(self, document: Document):
        """Handle the start of a div element (represented as a Paragraph)."""
        self.text_processor.flush_text()
        div_para = Paragraph()
        self._open_block(div_para, document, div_para)

    def handle_blockquote_start(self, document: Document):
        """Handle the start of a blockquote element.

        The quote becomes the current block but there is no text target until
        a nested paragraph is opened.
        """
        self.text_processor.flush_text()
        self._open_block(Quote(), document, None)

    def handle_pre_start(self, document: Document):
        """Handle the start of a pre element (represented as a Paragraph)."""
        self.text_processor.flush_text()
        pre_para = Paragraph()
        # Remember it so a directly nested <code> can upgrade it to a CodeBlock.
        self._pre_paragraphs.append(pre_para)
        self._open_block(pre_para, document, pre_para)

    @staticmethod
    def _language_from_class(class_attr: Optional[str]) -> str:
        """Return the language named by a ``language-xxx`` class token, or ""."""
        prefix = 'language-'
        for token in (class_attr or '').split():
            if token.startswith(prefix):
                return token[len(prefix):]
        return ""

    @staticmethod
    def _replace_block(old: Block, new: Block, document: Document):
        """Swap ``old`` for ``new`` in the container that holds ``old``.

        Blocks with a parent are replaced in the parent's ``_blocks`` list
        (and ``new.parent`` is set); parentless blocks are replaced in
        ``document.blocks``.
        NOTE(review): the document branch assumes ``document.blocks`` returns
        the live list rather than a copy -- confirm.
        """
        parent = old.parent
        if parent is not None:
            siblings = getattr(parent, '_blocks', None)
            if siblings is None:
                return
        else:
            siblings = document.blocks

        for index, candidate in enumerate(siblings):
            # Identity, not equality: only the very same block is replaced.
            if candidate is old:
                siblings[index] = new
                if parent is not None:
                    new.parent = parent
                return

    def handle_code_start(self, attrs: Dict[str, str], document: Document):
        """Handle the start of a code element.

        Directly inside a <pre>, the pre paragraph is replaced by a CodeBlock
        whose language comes from a ``language-xxx`` class. Anywhere else the
        element is treated as inline code and the enclosing block is kept.

        In both cases exactly one extra entry is pushed on ``block_stack`` so
        that the ``handle_block_end`` call issued for ``</code>`` pops that
        entry and not the enclosing block.
        NOTE(review): this relies on the parser calling handle_block_end for
        both ``</code>`` and ``</pre>`` -- confirm against the tag dispatcher.
        """
        if not self.block_stack:
            # Nothing is open, so there is nothing to replace and nothing the
            # matching end tag could pop.
            return

        top = self.block_stack[-1]
        if not any(top is pre for pre in self._pre_paragraphs):
            # Inline <code>: balance the upcoming pop with a duplicate entry.
            self.block_stack.append(top)
            return

        self._pre_paragraphs = [pre for pre in self._pre_paragraphs if pre is not top]
        code_block = CodeBlock(language=self._language_from_class(attrs.get('class')))
        self._replace_block(top, code_block, document)

        # The <pre> slot now holds the code block; the second entry is the one
        # that </code> will pop, leaving the first for </pre>.
        self.block_stack[-1] = code_block
        self.block_stack.append(code_block)
        self.current_block = code_block
        self.current_paragraph = None
        self.text_processor.set_current_paragraph(None)

    def handle_block_end(self):
        """Handle the end of a block element.

        Pops the innermost open block (if any) and makes the new top of the
        stack current. The text target becomes that block when it is a
        Paragraph and None otherwise.
        """
        if self.block_stack:
            closed = self.block_stack.pop()
            self._pre_paragraphs = [pre for pre in self._pre_paragraphs if pre is not closed]

        if self.block_stack:
            self.current_block = self.block_stack[-1]
            if isinstance(self.current_block, Paragraph):
                self.current_paragraph = self.current_block
            else:
                self.current_paragraph = None
        else:
            self.current_block = None
            self.current_paragraph = None

        self.text_processor.set_current_paragraph(self.current_paragraph)
|
|
||||||
|
|
||||||
|
|
||||||
class ListElementHandler:
    """Handles list-related HTML elements (ul, ol, dl, li, dt, dd).

    Open lists are tracked in ``list_stack`` and also pushed on the block
    handler's ``block_stack``. Items opened by this handler are remembered in
    ``_open_items`` so that an end tag only ever closes something this
    handler actually opened.
    """

    def __init__(self, text_processor: HTMLTextProcessor):
        """Store the text processor and start with no open lists or items."""
        self.text_processor = text_processor
        self.list_stack: List[HList] = []
        # Items pushed on the block stack by this handler, innermost last.
        self._open_items: List[ListItem] = []

    def reset(self):
        """Reset the handler state."""
        self.list_stack = []
        self._open_items = []

    @staticmethod
    def _unwind_to(block_handler: BlockElementHandler, target, boundary=None) -> bool:
        """Pop ``target`` and everything above it from the block stack.

        The stack is scanned from the top. If ``boundary`` is met before
        ``target``, or ``target`` is not on the stack, nothing is popped.

        Returns:
            True when ``target`` was found and removed, False otherwise.
        """
        stack = block_handler.block_stack
        for index in range(len(stack) - 1, -1, -1):
            if stack[index] is target:
                del stack[index:]
                return True
            if boundary is not None and stack[index] is boundary:
                break
        return False

    def _leave(self, block_handler: BlockElementHandler):
        """Make the new stack top current and clear the text target."""
        stack = block_handler.block_stack
        block_handler.current_block = stack[-1] if stack else None
        block_handler.current_paragraph = None
        self.text_processor.set_current_paragraph(None)

    def _open_item(self, list_item: ListItem, block_handler: BlockElementHandler):
        """Push ``list_item`` and give it a fresh paragraph as text target."""
        block_handler.block_stack.append(list_item)
        self._open_items.append(list_item)
        block_handler.current_block = list_item

        paragraph = Paragraph()
        list_item.add_block(paragraph)
        block_handler.current_paragraph = paragraph
        self.text_processor.set_current_paragraph(paragraph)

    def handle_list_start(self, tag: str, block_handler: BlockElementHandler, document: Document):
        """Handle the start of a list element.

        Args:
            tag: 'ul', 'ol' or 'dl'.
            block_handler: Owner of the shared block stack.
            document: Fallback container when no parent block is open.

        Raises:
            KeyError: If ``tag`` is not one of the three list tags.
        """
        self.text_processor.flush_text()

        style_map = {
            'ul': ListStyle.UNORDERED,
            'ol': ListStyle.ORDERED,
            'dl': ListStyle.DEFINITION
        }

        list_block = HList(style=style_map[tag])
        block_handler.add_block_to_document_or_parent(list_block, document)

        block_handler.block_stack.append(list_block)
        self.list_stack.append(list_block)
        block_handler.current_block = list_block
        block_handler.current_paragraph = None
        self.text_processor.set_current_paragraph(None)

    def handle_list_item_start(self, block_handler: BlockElementHandler):
        """Handle the start of a list item; ignored when no list is open."""
        if not self.list_stack:
            return

        self.text_processor.flush_text()
        list_item = ListItem()
        self.list_stack[-1].add_item(list_item)
        self._open_item(list_item, block_handler)

    def handle_definition_start(self, tag: str, block_handler: BlockElementHandler):
        """Handle the start of definition terms or descriptions.

        Ignored unless the innermost open list is a definition list. A 'dt'
        creates a new item; a 'dd' re-opens the most recent item of the list
        and adds a new paragraph to it. A 'dd' in a list without items is
        ignored.
        """
        if not self.list_stack or self.list_stack[-1].style != ListStyle.DEFINITION:
            return

        self.text_processor.flush_text()
        current_list = self.list_stack[-1]

        if tag == 'dt':
            list_item = ListItem(term="")
            current_list.add_item(list_item)
            self._open_item(list_item, block_handler)
        elif tag == 'dd':
            if current_list._items:
                # Push the item again so that </dd> has a matching entry to
                # pop; without it the end tag would close the list itself.
                self._open_item(current_list._items[-1], block_handler)

    def handle_list_end(self, block_handler: BlockElementHandler):
        """Handle the end of a list.

        Closes the innermost open list together with anything still open
        inside it (for example an <li> whose end tag was omitted). A stray
        end tag with no open list is ignored.
        """
        if not self.list_stack:
            return

        closing = self.list_stack.pop()
        self._unwind_to(block_handler, closing)

        # Forget items that were discarded together with the list.
        stack = block_handler.block_stack
        self._open_items = [
            item for item in self._open_items
            if any(item is block for block in stack)
        ]
        self._leave(block_handler)

    def handle_list_item_end(self, block_handler: BlockElementHandler):
        """Handle the end of a list item.

        Closes the innermost item opened by this handler, together with any
        blocks left open inside it. The call is ignored when no item is open
        or when the item lies outside the innermost open list.
        """
        if not self._open_items:
            return

        boundary = self.list_stack[-1] if self.list_stack else None
        if not self._unwind_to(block_handler, self._open_items[-1], boundary):
            return

        self._open_items.pop()
        self._leave(block_handler)
|
|
||||||
|
|
||||||
|
|
||||||
class TableElementHandler:
    """Handles table-related HTML elements (table, tr, td, th, thead, tbody, tfoot).

    Open tables are tracked in ``table_stack`` and on the block handler's
    ``block_stack``. ``current_table_row`` and ``current_table_section``
    describe the innermost table; the values of enclosing tables are parked
    in ``_suspended`` while a nested table is open.
    """

    def __init__(self, text_processor: HTMLTextProcessor):
        """Store the text processor and start with no open tables."""
        self.text_processor = text_processor
        self.table_stack: List[Table] = []
        self.current_table_row: Optional[TableRow] = None
        self.current_table_section = "body"
        # (row, section) of each enclosing table while a nested one is open.
        self._suspended: List[Any] = []
        # Cells pushed on the block stack by this handler, innermost last.
        self._open_cells: List[TableCell] = []

    def reset(self):
        """Reset the handler state."""
        self.table_stack = []
        self.current_table_row = None
        self.current_table_section = "body"
        self._suspended = []
        self._open_cells = []

    @staticmethod
    def _parse_span(raw, default: int = 1) -> int:
        """Parse a colspan/rowspan attribute value.

        Returns ``default`` when the value is missing, not an integer, or
        smaller than 1.
        """
        try:
            span = int(raw)
        except (TypeError, ValueError):
            return default
        return span if span >= 1 else default

    @staticmethod
    def _unwind_to(block_handler: BlockElementHandler, target, boundary=None) -> bool:
        """Pop ``target`` and everything above it from the block stack.

        The stack is scanned from the top. If ``boundary`` is met before
        ``target``, or ``target`` is not on the stack, nothing is popped.

        Returns:
            True when ``target`` was found and removed, False otherwise.
        """
        stack = block_handler.block_stack
        for index in range(len(stack) - 1, -1, -1):
            if stack[index] is target:
                del stack[index:]
                return True
            if boundary is not None and stack[index] is boundary:
                break
        return False

    def _leave(self, block_handler: BlockElementHandler):
        """Make the new stack top current and clear the text target."""
        stack = block_handler.block_stack
        block_handler.current_block = stack[-1] if stack else None
        block_handler.current_paragraph = None
        self.text_processor.set_current_paragraph(None)

    def handle_table_start(self, attrs: Dict[str, str], block_handler: BlockElementHandler, document: Document):
        """Handle the start of a table element.

        The table's caption is taken from the ``summary`` attribute. Row and
        section state of an enclosing table is parked until this table ends.
        """
        self.text_processor.flush_text()

        table = Table(caption=attrs.get('summary'))

        block_handler.add_block_to_document_or_parent(table, document)
        block_handler.block_stack.append(table)
        self.table_stack.append(table)

        # Start the new table with a clean row/section so its cells can never
        # land in a row of the enclosing table.
        self._suspended.append((self.current_table_row, self.current_table_section))
        self.current_table_row = None
        self.current_table_section = "body"

        block_handler.current_block = table
        block_handler.current_paragraph = None
        self.text_processor.set_current_paragraph(None)

    def handle_table_section_start(self, tag: str):
        """Handle the start of a table section by recording its tag name."""
        self.current_table_section = tag

    def handle_table_row_start(self):
        """Handle the start of a table row; ignored when no table is open.

        The row is added to the header for 'thead', to the footer for
        'tfoot' and to the body for every other section value.
        """
        if not self.table_stack:
            return

        self.text_processor.flush_text()
        row = TableRow()

        section_names = {'thead': "header", 'tfoot': "footer"}
        section = section_names.get(self.current_table_section, "body")

        self.table_stack[-1].add_row(row, section=section)
        self.current_table_row = row

    def handle_table_cell_start(self, tag: str, attrs: Dict[str, str], block_handler: BlockElementHandler):
        """Handle the start of a table cell; ignored when no row is open.

        Args:
            tag: 'td' or 'th'; 'th' produces a header cell.
            attrs: Element attributes; ``colspan`` and ``rowspan`` are read.
            block_handler: Owner of the shared block stack.
        """
        # Explicit None test: a row object that evaluates as falsy (for
        # example an empty one defining __len__) must still accept cells.
        if self.current_table_row is None:
            return

        self.text_processor.flush_text()

        # Each attribute is parsed on its own so one bad value cannot discard
        # the other.
        colspan = self._parse_span(attrs.get('colspan', 1))
        rowspan = self._parse_span(attrs.get('rowspan', 1))

        cell = TableCell(is_header=(tag == 'th'), colspan=colspan, rowspan=rowspan)
        self.current_table_row.add_cell(cell)

        block_handler.block_stack.append(cell)
        self._open_cells.append(cell)
        block_handler.current_block = cell

        # Text inside the cell is collected in a paragraph of its own.
        cell_para = Paragraph()
        cell.add_block(cell_para)
        block_handler.current_paragraph = cell_para
        self.text_processor.set_current_paragraph(cell_para)

    def handle_table_end(self, block_handler: BlockElementHandler):
        """Handle the end of a table.

        Closes the innermost open table together with anything still open
        inside it, then restores the row and section of the enclosing table
        (or the defaults when there is none). A stray end tag with no open
        table is ignored.
        """
        if not self.table_stack:
            return

        closing = self.table_stack.pop()
        self._unwind_to(block_handler, closing)

        # Forget cells that were discarded together with the table.
        stack = block_handler.block_stack
        self._open_cells = [
            cell for cell in self._open_cells
            if any(cell is block for block in stack)
        ]

        if self._suspended:
            self.current_table_row, self.current_table_section = self._suspended.pop()
        else:
            self.current_table_row = None
            self.current_table_section = "body"

        self._leave(block_handler)

    def handle_table_section_end(self):
        """Handle the end of a table section by falling back to "body"."""
        self.current_table_section = "body"

    def handle_table_row_end(self):
        """Handle the end of a table row."""
        self.current_table_row = None

    def handle_table_cell_end(self, block_handler: BlockElementHandler):
        """Handle the end of a table cell.

        Closes the innermost cell opened by this handler, together with any
        blocks left open inside it. The call is ignored when no cell is open
        or when the cell lies outside the innermost open table.
        """
        if not self._open_cells:
            return

        boundary = self.table_stack[-1] if self.table_stack else None
        if not self._unwind_to(block_handler, self._open_cells[-1], boundary):
            return

        self._open_cells.pop()
        self._leave(block_handler)
|
|
||||||
|
|
||||||
|
|
||||||
class InlineElementHandler:
    """Handles inline and special HTML elements (a, img, br, hr).

    Tracks the link that is currently open (``in_link`` / ``current_link``)
    and resolves relative URLs against ``base_url`` when one is set.
    """

    def __init__(self, text_processor: HTMLTextProcessor, base_url: Optional[str] = None):
        """Store the collaborators and start outside of any link.

        Args:
            text_processor: Receives ``flush_text`` calls before structural
                changes.
            base_url: Base for resolving relative URLs, or None to leave
                URLs untouched.
        """
        self.text_processor = text_processor
        self.base_url = base_url
        self.in_link = False
        self.current_link: Optional[Link] = None

    def reset(self):
        """Reset the handler state."""
        self.in_link = False
        self.current_link = None

    def set_base_url(self, base_url: Optional[str]):
        """Set the base URL for resolving relative links."""
        self.base_url = base_url

    @staticmethod
    def _classify_href(href: str) -> tuple:
        """Split an href into a link kind and the location to store.

        Returns:
            ``(kind, location)`` where kind is 'external', 'function', 'api'
            or 'internal'. The ``api:`` prefix is removed from the location;
            every other href is returned unchanged.
        """
        if href.startswith(('http://', 'https://')):
            return 'external', href
        if href.startswith('javascript:'):
            return 'function', href
        if href.startswith('api:'):
            return 'api', href[len('api:'):]
        return 'internal', href

    def _resolve_location(self, kind: str, location: str) -> str:
        """Resolve an internal, non-fragment location against the base URL.

        Only 'internal' links are resolved. In particular an API location,
        whose ``api:`` prefix has already been stripped, is returned as is
        instead of being mistaken for a relative URL.
        """
        if kind != 'internal' or not self.base_url or location.startswith('#'):
            return location
        return urllib.parse.urljoin(self.base_url, location)

    @staticmethod
    def _parse_dimension(raw) -> Optional[int]:
        """Return ``raw`` as an int, or None when it is missing or not an integer."""
        try:
            return int(raw)
        except (TypeError, ValueError):
            return None

    def handle_link_start(self, attrs: Dict[str, str]):
        """Handle the start of a link element.

        Builds the Link from the ``href`` and ``title`` attributes, stores it
        in ``current_link`` and sets ``in_link``. An empty title is stored as
        None.
        """
        self.text_processor.flush_text()

        # A valueless attribute may come through as None; treat it as empty.
        href = attrs.get('href') or ''
        title = attrs.get('title') or ''

        kind, location = self._classify_href(href)
        link_types = {
            'external': LinkType.EXTERNAL,
            'function': LinkType.FUNCTION,
            'api': LinkType.API,
            'internal': LinkType.INTERNAL,
        }

        self.current_link = Link(
            location=self._resolve_location(kind, location),
            link_type=link_types[kind],
            title=title if title else None
        )
        self.in_link = True

    def handle_link_end(self):
        """Handle the end of a link element."""
        self.in_link = False
        self.current_link = None

    def handle_image(self, attrs: Dict[str, str], block_handler: BlockElementHandler, document: Document):
        """Handle an image element.

        Creates an Image from ``src``, ``alt``, ``width`` and ``height`` and
        adds it to the current parent block or the document. Width and height
        are parsed independently; a value that is not an integer becomes
        None without affecting the other one.
        """
        src = attrs.get('src') or ''
        alt = attrs.get('alt') or ''

        width = self._parse_dimension(attrs.get('width'))
        height = self._parse_dimension(attrs.get('height'))

        # Resolve relative URLs
        if self.base_url and not src.startswith(('http://', 'https://')):
            src = urllib.parse.urljoin(self.base_url, src)

        image = Image(source=src, alt_text=alt, width=width, height=height)
        block_handler.add_block_to_document_or_parent(image, document)

    def handle_line_break(self, block_handler: BlockElementHandler):
        """Handle a line break element.

        Buffered text is flushed first so that it precedes the break, then a
        LineBreak is added to the current paragraph if there is one and it
        exposes ``add_block``.
        """
        self.text_processor.flush_text()

        paragraph = block_handler.current_paragraph
        # Explicit None test: an empty paragraph may evaluate as falsy.
        if paragraph is not None and hasattr(paragraph, 'add_block'):
            paragraph.add_block(LineBreak())

    def handle_horizontal_rule(self, block_handler: BlockElementHandler, document: Document):
        """Handle a horizontal rule element."""
        self.text_processor.flush_text()
        hr = HorizontalRule()
        block_handler.add_block_to_document_or_parent(hr, document)
|
|
||||||
@ -12,7 +12,8 @@ from bs4 import BeautifulSoup, Tag, NavigableString
|
|||||||
from pyWebLayout.abstract.inline import Word, FormattedSpan
|
from pyWebLayout.abstract.inline import Word, FormattedSpan
|
||||||
from pyWebLayout.abstract.block import (
|
from pyWebLayout.abstract.block import (
|
||||||
Block, Paragraph, Heading, HeadingLevel, Quote, CodeBlock,
|
Block, Paragraph, Heading, HeadingLevel, Quote, CodeBlock,
|
||||||
HList, ListItem, ListStyle, Table, TableRow, TableCell
|
HList, ListItem, ListStyle, Table, TableRow, TableCell,
|
||||||
|
HorizontalRule, Image
|
||||||
)
|
)
|
||||||
from pyWebLayout.style import Font, FontWeight, FontStyle, TextDecoration
|
from pyWebLayout.style import Font, FontWeight, FontStyle, TextDecoration
|
||||||
|
|
||||||
@ -576,11 +577,9 @@ def table_header_cell_handler(element: Tag, context: StyleContext) -> TableCell:
|
|||||||
return cell
|
return cell
|
||||||
|
|
||||||
|
|
||||||
def horizontal_rule_handler(element: Tag, context: StyleContext) -> Block:
|
def horizontal_rule_handler(element: Tag, context: StyleContext) -> HorizontalRule:
|
||||||
"""Handle <hr> elements."""
|
"""Handle <hr> elements."""
|
||||||
# TODO: Create a specific HorizontalRule block type
|
return HorizontalRule()
|
||||||
# For now, return an empty paragraph
|
|
||||||
return Paragraph(context.font)
|
|
||||||
|
|
||||||
|
|
||||||
def line_break_handler(element: Tag, context: StyleContext) -> None:
|
def line_break_handler(element: Tag, context: StyleContext) -> None:
|
||||||
@ -589,18 +588,22 @@ def line_break_handler(element: Tag, context: StyleContext) -> None:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def image_handler(element: Tag, context: StyleContext) -> Block:
|
def image_handler(element: Tag, context: StyleContext) -> Image:
|
||||||
"""Handle <img> elements."""
|
"""Handle <img> elements."""
|
||||||
# TODO: Create Image block type
|
src = context.element_attributes.get('src', '')
|
||||||
# For now, return empty paragraph with alt text if available
|
|
||||||
paragraph = Paragraph(context.font)
|
|
||||||
alt_text = context.element_attributes.get('alt', '')
|
alt_text = context.element_attributes.get('alt', '')
|
||||||
if alt_text:
|
|
||||||
words = alt_text.split()
|
# Parse dimensions if provided
|
||||||
for word_text in words:
|
width = height = None
|
||||||
if word_text:
|
try:
|
||||||
paragraph.add_word(Word(word_text, context.font))
|
if 'width' in context.element_attributes:
|
||||||
return paragraph
|
width = int(context.element_attributes['width'])
|
||||||
|
if 'height' in context.element_attributes:
|
||||||
|
height = int(context.element_attributes['height'])
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
return Image(source=src, alt_text=alt_text, width=width, height=height)
|
||||||
|
|
||||||
|
|
||||||
def ignore_handler(element: Tag, context: StyleContext) -> None:
|
def ignore_handler(element: Tag, context: StyleContext) -> None:
|
||||||
|
|||||||
@ -1,281 +0,0 @@
|
|||||||
"""
|
|
||||||
HTML style management for pyWebLayout.
|
|
||||||
|
|
||||||
This module provides specialized functionality for handling CSS styles,
|
|
||||||
style stacks, and style parsing in HTML documents.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from typing import Dict, List, Any, Optional, Tuple
|
|
||||||
import re
|
|
||||||
from pyWebLayout.style import Font, FontStyle, FontWeight, TextDecoration
|
|
||||||
|
|
||||||
|
|
||||||
class HTMLStyleManager:
    """
    Manages CSS styles and style stacks during HTML parsing.

    This class handles style parsing, style inheritance, and maintains
    the style stack for proper style nesting.
    """

    # Named colours understood by parse_color (lower-case name -> RGB).
    _NAMED_COLORS = {
        'black': (0, 0, 0),
        'white': (255, 255, 255),
        'red': (255, 0, 0),
        'green': (0, 128, 0),
        'blue': (0, 0, 255),
        'yellow': (255, 255, 0),
        'cyan': (0, 255, 255),
        'magenta': (255, 0, 255),
        'gray': (128, 128, 128),
        'grey': (128, 128, 128),
        'silver': (192, 192, 192),
        'maroon': (128, 0, 0),
        'olive': (128, 128, 0),
        'navy': (0, 0, 128),
        'purple': (128, 0, 128),
        'teal': (0, 128, 128),
        'lime': (0, 255, 0),
        'aqua': (0, 255, 255),
        'fuchsia': (255, 0, 255),
    }

    # '#rgb' or '#rrggbb', hex digits only (input is lower-cased first).
    _HEX_COLOR_RE = re.compile(r'#([0-9a-f]{3}|[0-9a-f]{6})')
    _RGB_RE = re.compile(r'rgb\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\)')
    _RGBA_RE = re.compile(r'rgba\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*[\d.]+\s*\)')

    def __init__(self):
        """Initialize the style manager."""
        self._style_stack: List[Dict[str, Any]] = []
        self._current_style = self._get_default_style()

    def _get_default_style(self) -> Dict[str, Any]:
        """Get the default style settings."""
        return {
            'font_size': 12,
            'font_weight': FontWeight.NORMAL,
            'font_style': FontStyle.NORMAL,
            'decoration': TextDecoration.NONE,
            'color': (0, 0, 0),
            'background': None,
            'language': 'en_US'
        }

    def reset(self):
        """Reset the style manager to initial state."""
        self._style_stack = []
        self._current_style = self._get_default_style()

    def push_style(self, style: Dict[str, Any]):
        """
        Push a new style onto the style stack.

        The current style is saved first, then the given properties are
        applied on top of it.

        Args:
            style: The style properties to apply
        """
        self._style_stack.append(self._current_style.copy())
        self._current_style.update(style)

    def pop_style(self):
        """Restore the style saved by the matching push; no-op on an empty stack."""
        if self._style_stack:
            self._current_style = self._style_stack.pop()

    def get_current_style(self) -> Dict[str, Any]:
        """Get a copy of the current style."""
        return self._current_style.copy()

    def get_tag_style(self, tag: str) -> Dict[str, Any]:
        """
        Get the default style for a tag.

        Args:
            tag: The tag name

        Returns:
            A new dictionary of style properties (empty for unknown tags).
            Note that 'font_family' is set for 'code' and 'pre' but is not
            among the keys read by create_font.
        """
        tag_styles = {
            'h1': {'font_size': 24, 'font_weight': FontWeight.BOLD},
            'h2': {'font_size': 20, 'font_weight': FontWeight.BOLD},
            'h3': {'font_size': 18, 'font_weight': FontWeight.BOLD},
            'h4': {'font_size': 16, 'font_weight': FontWeight.BOLD},
            'h5': {'font_size': 14, 'font_weight': FontWeight.BOLD},
            'h6': {'font_size': 12, 'font_weight': FontWeight.BOLD},
            'b': {'font_weight': FontWeight.BOLD},
            'strong': {'font_weight': FontWeight.BOLD},
            'i': {'font_style': FontStyle.ITALIC},
            'em': {'font_style': FontStyle.ITALIC},
            'u': {'decoration': TextDecoration.UNDERLINE},
            'a': {'decoration': TextDecoration.UNDERLINE, 'color': (0, 0, 255)},
            'code': {'font_family': 'monospace', 'background': (240, 240, 240, 255)},
            'pre': {'font_family': 'monospace'},
        }

        return tag_styles.get(tag, {})

    def create_font(self) -> Font:
        """
        Create a Font object from the current style.

        Returns:
            Font: A font object with the current style settings
        """
        # NOTE(review): the keyword below is spelled 'langauge'; presumably it
        # mirrors the parameter name of Font -- confirm before renaming.
        return Font(
            font_size=self._current_style['font_size'],
            colour=self._current_style['color'],
            weight=self._current_style['font_weight'],
            style=self._current_style['font_style'],
            decoration=self._current_style['decoration'],
            background=self._current_style['background'],
            langauge=self._current_style['language']
        )

    def parse_inline_style(self, style_str: str) -> Dict[str, Any]:
        """
        Parse inline CSS style string.

        Recognised properties are font-size (integer px/pt values),
        font-weight, font-style, text-decoration, color and background-color.
        Keyword values are matched case-insensitively; declarations that are
        malformed or not recognised are skipped.

        Args:
            style_str: CSS style string

        Returns:
            Dictionary of style properties
        """
        if not style_str:
            return {}

        style_dict = {}
        declarations = [d.strip() for d in style_str.split(';') if d.strip()]

        for declaration in declarations:
            parts = declaration.split(':', 1)
            if len(parts) != 2:
                continue

            prop = parts[0].strip().lower()
            value = parts[1].strip()
            # CSS keywords and units are case-insensitive.
            keyword = value.lower()

            if prop == 'font-size':
                # px and pt are both taken at face value as the font size.
                if keyword.endswith(('px', 'pt')):
                    try:
                        style_dict['font_size'] = int(keyword[:-2])
                    except ValueError:
                        pass
            elif prop == 'font-weight':
                if keyword == 'bold':
                    style_dict['font_weight'] = FontWeight.BOLD
                elif keyword == 'normal':
                    style_dict['font_weight'] = FontWeight.NORMAL
            elif prop == 'font-style':
                if keyword == 'italic':
                    style_dict['font_style'] = FontStyle.ITALIC
                elif keyword == 'normal':
                    style_dict['font_style'] = FontStyle.NORMAL
            elif prop == 'text-decoration':
                if keyword == 'underline':
                    style_dict['decoration'] = TextDecoration.UNDERLINE
                elif keyword == 'line-through':
                    style_dict['decoration'] = TextDecoration.STRIKETHROUGH
                elif keyword == 'none':
                    style_dict['decoration'] = TextDecoration.NONE
            elif prop == 'color':
                color = self.parse_color(value)
                if color:
                    style_dict['color'] = color
            elif prop == 'background-color':
                color = self.parse_color(value)
                if color:
                    # Backgrounds carry an alpha channel; use fully opaque.
                    style_dict['background'] = color + (255,)

        return style_dict

    def parse_color(self, color_str: str) -> Optional[Tuple[int, int, int]]:
        """
        Parse a CSS color string.

        Supported forms are the named colors in ``_NAMED_COLORS``, '#rgb',
        '#rrggbb', 'rgb(r, g, b)' and 'rgba(r, g, b, a)' (alpha is ignored).
        An rgb() component above 255 makes the color invalid, whereas rgba()
        components are clamped to 255.

        Args:
            color_str: CSS color string

        Returns:
            RGB tuple or None if parsing fails
        """
        color_str = color_str.lower().strip()

        named = self._NAMED_COLORS.get(color_str)
        if named is not None:
            return named

        if color_str.startswith('#'):
            # Validate the digits explicitly: int(x, 16) alone would also
            # accept signs and surrounding whitespace such as '-1' or ' 1'.
            hex_match = self._HEX_COLOR_RE.fullmatch(color_str)
            if hex_match:
                digits = hex_match.group(1)
                if len(digits) == 3:  # #RGB -> #RRGGBB
                    digits = ''.join(ch * 2 for ch in digits)
                return (
                    int(digits[0:2], 16),
                    int(digits[2:4], 16),
                    int(digits[4:6], 16),
                )

        rgb_match = self._RGB_RE.match(color_str)
        if rgb_match:
            components = tuple(int(group) for group in rgb_match.groups())
            # The pattern only admits unsigned digits, so the upper bound is
            # the only range check needed.
            if max(components) > 255:
                return None  # Invalid color values
            return components

        rgba_match = self._RGBA_RE.match(color_str)
        if rgba_match:
            return tuple(min(255, int(group)) for group in rgba_match.groups())

        # Failed to parse color
        return None

    def apply_style_to_element(self, tag: str, attrs: Dict[str, str]) -> Dict[str, Any]:
        """
        Apply combined styles (tag defaults + inline styles) for an element.

        Args:
            tag: The HTML tag name
            attrs: Dictionary of tag attributes

        Returns:
            Combined style dictionary; inline styles override tag defaults
        """
        style = self.get_tag_style(tag)

        if 'style' in attrs:
            style.update(self.parse_inline_style(attrs['style']))

        return style
|
|
||||||
@ -1,163 +0,0 @@
|
|||||||
"""
|
|
||||||
HTML text processing for pyWebLayout.
|
|
||||||
|
|
||||||
This module provides specialized functionality for handling text content,
|
|
||||||
entity references, and word creation in HTML documents.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from typing import Optional
|
|
||||||
from pyWebLayout.abstract.inline import Word
|
|
||||||
from pyWebLayout.abstract.block import Paragraph
|
|
||||||
from pyWebLayout.io.readers.html_style import HTMLStyleManager
|
|
||||||
|
|
||||||
|
|
||||||
class HTMLTextProcessor:
    """
    Processes text content during HTML parsing.

    Text, named entity references and numeric character references are
    accumulated in an internal string buffer. ``flush_text`` splits that
    buffer on whitespace and adds one ``Word`` per token to the current
    paragraph, using a font obtained from the style manager.
    """

    # Named entities understood by add_entity_reference. Typographic quotes
    # are written as escapes so they cannot be confused with (or silently
    # normalised to) the ASCII quote characters that delimit the literals.
    _ENTITIES = {
        'lt': '<',
        'gt': '>',
        'amp': '&',
        'quot': '"',
        'apos': "'",
        # NOTE(review): kept as a plain space; presumably a non-breaking
        # space (U+00A0) was intended -- confirm. Either way flush_text
        # treats it as a word separator.
        'nbsp': ' ',
        'copy': '©',
        'reg': '®',
        'trade': '™',
        'mdash': '—',
        'ndash': '–',
        'hellip': '…',
        'laquo': '«',
        'raquo': '»',
        'ldquo': '\u201c',
        'rdquo': '\u201d',
        'lsquo': '\u2018',
        'rsquo': '\u2019',
        'deg': '°',
        'plusmn': '±',
        'times': '×',
        'divide': '÷',
        'euro': '€',
        'pound': '£',
        'yen': '¥',
    }

    def __init__(self, style_manager: "HTMLStyleManager"):
        """
        Initialize the text processor.

        Args:
            style_manager: The style manager for creating styled words
        """
        self._style_manager = style_manager
        self._text_buffer = ""
        self._current_paragraph: Optional["Paragraph"] = None

    def reset(self):
        """Reset the text processor state (empty buffer, no paragraph)."""
        self._text_buffer = ""
        self._current_paragraph = None

    def set_current_paragraph(self, paragraph: Optional["Paragraph"]):
        """
        Set the current paragraph for text output.

        Args:
            paragraph: The paragraph to receive text, or None
        """
        self._current_paragraph = paragraph

    def add_text(self, text: str):
        """
        Add text to the buffer.

        Args:
            text: The text to add
        """
        self._text_buffer += text

    def add_entity_reference(self, name: str):
        """
        Add an HTML entity reference to the buffer.

        Unknown names are appended unchanged in their ``&name;`` form.

        Args:
            name: The entity name (e.g., 'lt', 'gt', 'amp')
        """
        self._text_buffer += self._ENTITIES.get(name, f'&{name};')

    def add_character_reference(self, name: str):
        """
        Add a numeric character reference to the buffer.

        Args:
            name: The reference without ``&#`` and ``;``: decimal digits,
                or ``x``/``X`` followed by hexadecimal digits.
        """
        try:
            # HTML accepts both "&#x41;" and "&#X41;" for hexadecimal.
            if name[:1] in ('x', 'X'):
                char = chr(int(name[1:], 16))
            else:
                char = chr(int(name))
        except (ValueError, OverflowError):
            # Not a number or not a valid code point: keep the reference
            # visible in the output instead of dropping it.
            char = f'&#{name};'
        self._text_buffer += char

    def flush_text(self) -> bool:
        """
        Flush the text buffer, creating words as needed.

        The buffer is always emptied, even when nothing is emitted.

        Returns:
            True if at least one word was added to the current paragraph;
            False if the buffer was empty or whitespace-only, or if no
            paragraph is set.
        """
        pending = self._text_buffer
        self._text_buffer = ""

        # Compare against None explicitly: a paragraph object that happens
        # to be falsy must still receive its text.
        if not pending or self._current_paragraph is None:
            return False

        # split() with no argument discards leading/trailing whitespace and
        # never yields empty strings.
        tokens = pending.split()
        if not tokens:
            return False

        for word_text in tokens:
            # A font is requested from the style manager for every word.
            font = self._style_manager.create_font()
            self._current_paragraph.add_word(Word(word_text, font))

        return True

    def has_pending_text(self) -> bool:
        """
        Check if there is pending text in the buffer.

        Returns:
            True if the buffer contains any non-whitespace text
        """
        return bool(self._text_buffer.strip())

    def get_buffer_content(self) -> str:
        """
        Get the current buffer content without flushing.

        Returns:
            The current text buffer content
        """
        return self._text_buffer

    def clear_buffer(self):
        """Clear the text buffer without creating words."""
        self._text_buffer = ""
|
|
||||||
@ -34,7 +34,7 @@ class Font:
|
|||||||
style: FontStyle = FontStyle.NORMAL,
|
style: FontStyle = FontStyle.NORMAL,
|
||||||
decoration: TextDecoration = TextDecoration.NONE,
|
decoration: TextDecoration = TextDecoration.NONE,
|
||||||
background: Optional[Tuple[int, int, int, int]] = None,
|
background: Optional[Tuple[int, int, int, int]] = None,
|
||||||
langauge = "en_EN"):
|
language = "en_EN"):
|
||||||
"""
|
"""
|
||||||
Initialize a Font object with the specified properties.
|
Initialize a Font object with the specified properties.
|
||||||
|
|
||||||
@ -46,6 +46,7 @@ class Font:
|
|||||||
style: Font style (normal or italic).
|
style: Font style (normal or italic).
|
||||||
decoration: Text decoration (none, underline, or strikethrough).
|
decoration: Text decoration (none, underline, or strikethrough).
|
||||||
background: RGBA background color for the text. If None, transparent background.
|
background: RGBA background color for the text. If None, transparent background.
|
||||||
|
language: Language code for hyphenation and text processing.
|
||||||
"""
|
"""
|
||||||
self._font_path = font_path
|
self._font_path = font_path
|
||||||
self._font_size = font_size
|
self._font_size = font_size
|
||||||
@ -54,7 +55,7 @@ class Font:
|
|||||||
self._style = style
|
self._style = style
|
||||||
self._decoration = decoration
|
self._decoration = decoration
|
||||||
self._background = background if background else (255, 255, 255, 0)
|
self._background = background if background else (255, 255, 255, 0)
|
||||||
self.language = langauge
|
self.language = language
|
||||||
# Load the font file or use default
|
# Load the font file or use default
|
||||||
self._load_font()
|
self._load_font()
|
||||||
|
|
||||||
|
|||||||
@ -1,354 +0,0 @@
|
|||||||
"""
|
|
||||||
Unit tests for HTML content reading.
|
|
||||||
|
|
||||||
Tests the HTMLContentReader class for parsing complete HTML documents.
|
|
||||||
This is more of an integration test covering the entire parsing pipeline.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import unittest
|
|
||||||
from pyWebLayout.io.readers.html_content import HTMLContentReader
|
|
||||||
from pyWebLayout.abstract.document import Document
|
|
||||||
from pyWebLayout.abstract.block import (
|
|
||||||
Paragraph, Heading, HeadingLevel, HList, ListStyle,
|
|
||||||
Table, Quote, CodeBlock, HorizontalRule
|
|
||||||
)
|
|
||||||
from pyWebLayout.abstract.inline import LineBreak
|
|
||||||
|
|
||||||
class TestHTMLContentReader(unittest.TestCase):
    """
    Test cases for HTMLContentReader.

    Each test feeds an HTML snippet to ``extract_content`` and inspects the
    blocks that end up in ``self.document.blocks``.
    """

    def setUp(self):
        """Create a fresh reader and an empty document for every test."""
        self.reader = HTMLContentReader()
        self.document = Document()

    def test_simple_paragraph(self):
        """A single <p> yields one Paragraph whose text is split into words."""
        html = '<p>Hello world!</p>'

        # The return value is not inspected; assertions go through
        # self.document.
        result = self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 1)
        self.assertIsInstance(self.document.blocks[0], Paragraph)

        paragraph = self.document.blocks[0]
        # Items from words() are indexed with [1] to reach the word object
        # (presumably (index, word) pairs -- confirm against Paragraph).
        words = list(paragraph.words())
        self.assertEqual(len(words), 2)
        self.assertEqual(words[0][1].text, "Hello")
        self.assertEqual(words[1][1].text, "world!")

    def test_headings(self):
        """<h1>, <h2>, <h3> and <h6> map to the matching HeadingLevel."""
        html = '''
        <h1>Heading 1</h1>
        <h2>Heading 2</h2>
        <h3>Heading 3</h3>
        <h6>Heading 6</h6>
        '''

        self.reader.extract_content(html, self.document)

        # Should have 4 heading blocks
        headings = [block for block in self.document.blocks if isinstance(block, Heading)]
        self.assertEqual(len(headings), 4)

        # Check heading levels
        self.assertEqual(headings[0].level, HeadingLevel.H1)
        self.assertEqual(headings[1].level, HeadingLevel.H2)
        self.assertEqual(headings[2].level, HeadingLevel.H3)
        self.assertEqual(headings[3].level, HeadingLevel.H6)

        # Check text content
        h1_words = list(headings[0].words())
        self.assertEqual(len(h1_words), 2)
        self.assertEqual(h1_words[0][1].text, "Heading")
        self.assertEqual(h1_words[1][1].text, "1")

    def test_styled_text(self):
        """Inline <b>/<i> markup does not change how text is split into words."""
        html = '<p>This is <b>bold</b> and <i>italic</i> text.</p>'

        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 1)
        paragraph = self.document.blocks[0]
        words = list(paragraph.words())

        # Should have words: "This", "is", "bold", "and", "italic", "text."
        self.assertEqual(len(words), 6)

        # The styling information is embedded in the Font objects
        # We can't easily test the exact styling without more complex setup
        # but we can verify the words are created correctly
        word_texts = [word[1].text for word in words]
        self.assertEqual(word_texts, ["This", "is", "bold", "and", "italic", "text."])

    def test_unordered_list(self):
        """<ul> becomes an UNORDERED HList with one item per <li>."""
        html = '''
        <ul>
            <li>First item</li>
            <li>Second item</li>
            <li>Third item</li>
        </ul>
        '''

        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 1)
        self.assertIsInstance(self.document.blocks[0], HList)

        list_block = self.document.blocks[0]
        self.assertEqual(list_block.style, ListStyle.UNORDERED)

        items = list(list_block.items())
        self.assertEqual(len(items), 3)

        # Check first item content
        first_item_blocks = list(items[0].blocks())
        self.assertEqual(len(first_item_blocks), 1)
        self.assertIsInstance(first_item_blocks[0], Paragraph)

    def test_ordered_list(self):
        """<ol> becomes an ORDERED HList with one item per <li>."""
        html = '''
        <ol>
            <li>First step</li>
            <li>Second step</li>
        </ol>
        '''

        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 1)
        list_block = self.document.blocks[0]
        self.assertEqual(list_block.style, ListStyle.ORDERED)

        items = list(list_block.items())
        self.assertEqual(len(items), 2)

    def test_definition_list(self):
        """<dl> becomes a DEFINITION HList with one item per dt/dd pair."""
        html = '''
        <dl>
            <dt>Term 1</dt>
            <dd>Definition 1</dd>
            <dt>Term 2</dt>
            <dd>Definition 2</dd>
        </dl>
        '''

        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 1)
        list_block = self.document.blocks[0]
        self.assertEqual(list_block.style, ListStyle.DEFINITION)

        items = list(list_block.items())
        self.assertEqual(len(items), 2)  # Two dt/dd pairs

    def test_table(self):
        """A <table> without explicit sections puts every <tr> in the body."""
        html = '''
        <table>
            <tr>
                <th>Header 1</th>
                <th>Header 2</th>
            </tr>
            <tr>
                <td>Cell 1</td>
                <td>Cell 2</td>
            </tr>
        </table>
        '''

        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 1)
        self.assertIsInstance(self.document.blocks[0], Table)

        table = self.document.blocks[0]

        # Check body rows
        body_rows = list(table.body_rows())
        self.assertEqual(len(body_rows), 2)  # Header row + data row

        # Check first row (header): <th> cells are flagged via is_header
        first_row_cells = list(body_rows[0].cells())
        self.assertEqual(len(first_row_cells), 2)
        self.assertTrue(first_row_cells[0].is_header)
        self.assertTrue(first_row_cells[1].is_header)

        # Check second row (data): <td> cells are not headers
        second_row_cells = list(body_rows[1].cells())
        self.assertEqual(len(second_row_cells), 2)
        self.assertFalse(second_row_cells[0].is_header)
        self.assertFalse(second_row_cells[1].is_header)

    def test_blockquote(self):
        """<blockquote> becomes a Quote containing its child paragraphs."""
        html = '''
        <blockquote>
            <p>This is a quoted paragraph.</p>
            <p>Another quoted paragraph.</p>
        </blockquote>
        '''

        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 1)
        self.assertIsInstance(self.document.blocks[0], Quote)

        quote = self.document.blocks[0]
        quote_blocks = list(quote.blocks())
        self.assertEqual(len(quote_blocks), 2)
        self.assertIsInstance(quote_blocks[0], Paragraph)
        self.assertIsInstance(quote_blocks[1], Paragraph)

    def test_code_block(self):
        """<pre><code class="language-X"> becomes a CodeBlock with language X."""
        html = '''
        <pre><code class="language-python">
def hello():
    print("Hello, world!")
        </code></pre>
        '''

        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 1)
        self.assertIsInstance(self.document.blocks[0], CodeBlock)

        code_block = self.document.blocks[0]
        # Only the language is checked here, not the code text itself.
        self.assertEqual(code_block.language, "python")

    def test_horizontal_rule(self):
        """<hr> between two paragraphs yields Paragraph, HorizontalRule, Paragraph."""
        html = '<p>Before</p><hr><p>After</p>'

        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 3)
        self.assertIsInstance(self.document.blocks[0], Paragraph)
        self.assertIsInstance(self.document.blocks[1], HorizontalRule)
        self.assertIsInstance(self.document.blocks[2], Paragraph)

    def test_html_entities(self):
        """'<', '>' and '&' end up as standalone words in the paragraph."""
        # NOTE(review): the literal below contains raw '<', '>' and '&';
        # presumably it was written with entity references
        # (&lt; &gt; &amp;) -- confirm against the repository.
        html = '<p>Less than: < Greater than: > Ampersand: &</p>'

        self.reader.extract_content(html, self.document)

        paragraph = self.document.blocks[0]
        words = list(paragraph.words())

        # Find the entity words
        word_texts = [word[1].text for word in words]
        self.assertIn('<', word_texts)
        self.assertIn('>', word_texts)
        self.assertIn('&', word_texts)

    def test_nested_elements(self):
        """Blocks nested inside a <div> are all present at document level."""
        html = '''
        <div>
            <h2>Section Title</h2>
            <p>Section content with <strong>important</strong> text.</p>
            <ul>
                <li>List item 1</li>
                <li>List item 2</li>
            </ul>
        </div>
        '''

        self.reader.extract_content(html, self.document)

        # Should have multiple blocks
        self.assertGreater(len(self.document.blocks), 1)

        # Check that we have different types of blocks (compared by class
        # name, so subclasses with other names would not match)
        block_types = [type(block).__name__ for block in self.document.blocks]
        self.assertIn('Paragraph', block_types)  # From div
        self.assertIn('Heading', block_types)
        self.assertIn('HList', block_types)

    def test_empty_elements(self):
        """Empty <p>, <div> and <ul> each still produce a block."""
        html = '<p></p><div></div><ul></ul>'

        self.reader.extract_content(html, self.document)

        # Empty elements should still create blocks
        self.assertEqual(len(self.document.blocks), 3)

    def test_whitespace_handling(self):
        """Spaces and newlines between words are collapsed when splitting."""
        html = '''
        <p> Word1 Word2
        Word3 </p>
        '''

        self.reader.extract_content(html, self.document)

        paragraph = self.document.blocks[0]
        words = list(paragraph.words())

        # Should normalize whitespace and create separate words
        word_texts = [word[1].text for word in words]
        self.assertEqual(word_texts, ["Word1", "Word2", "Word3"])

    def test_base_url_setting(self):
        """set_base_url stores the URL on the reader's inline handler."""
        base_url = "https://example.com/path/"
        self.reader.set_base_url(base_url)

        # The base URL should be passed to the inline handler
        self.assertEqual(self.reader.inline_handler.base_url, base_url)

    def test_complex_document(self):
        """A full document with head/body yields all expected block types."""
        html = '''
        <!DOCTYPE html>
        <html>
        <head>
            <title>Test Document</title>
            <style>body { font-family: Arial; }</style>
        </head>
        <body>
            <h1>Main Title</h1>
            <p>Introduction paragraph with <em>emphasis</em>.</p>

            <h2>Section 1</h2>
            <p>Content with <a href="link.html">a link</a>.</p>

            <ul>
                <li>Item 1</li>
                <li>Item 2 with <strong>bold text</strong></li>
            </ul>

            <h2>Section 2</h2>
            <blockquote>
                <p>A quoted paragraph.</p>
            </blockquote>

            <table>
                <tr><th>Col1</th><th>Col2</th></tr>
                <tr><td>A</td><td>B</td></tr>
            </table>
        </body>
        </html>
        '''

        self.reader.extract_content(html, self.document)

        # Should have parsed multiple blocks
        self.assertGreater(len(self.document.blocks), 5)

        # Should have different types of content; extra block types beyond
        # the expected ones are tolerated (subset check).
        block_types = set(type(block).__name__ for block in self.document.blocks)
        expected_types = {'Heading', 'Paragraph', 'HList', 'Quote', 'Table'}
        self.assertTrue(expected_types.issubset(block_types))
|
|
||||||
|
|
||||||
|
|
||||||
# Run the test suite when this module is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
|
|
||||||
@ -1,181 +1,181 @@
|
|||||||
"""
|
"""
|
||||||
Unit tests for HTML style management.
|
Unit tests for pyWebLayout style objects.
|
||||||
|
|
||||||
Tests the HTMLStyleManager class for CSS parsing, style stacks, and font creation.
|
Tests the Font class and style enums for proper functionality and immutability.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
from pyWebLayout.io.readers.html_style import HTMLStyleManager
|
from pyWebLayout.style import Font, FontStyle, FontWeight, TextDecoration, Alignment
|
||||||
from pyWebLayout.style import FontStyle, FontWeight, TextDecoration
|
|
||||||
|
|
||||||
|
|
||||||
class TestHTMLStyleManager(unittest.TestCase):
|
class TestStyleObjects(unittest.TestCase):
|
||||||
"""Test cases for HTMLStyleManager."""
|
"""Test cases for pyWebLayout style objects."""
|
||||||
|
|
||||||
def setUp(self):
|
def test_font_weight_enum(self):
|
||||||
"""Set up test fixtures."""
|
"""Test FontWeight enum values."""
|
||||||
self.style_manager = HTMLStyleManager()
|
self.assertEqual(FontWeight.NORMAL.value, "normal")
|
||||||
|
self.assertEqual(FontWeight.BOLD.value, "bold")
|
||||||
|
|
||||||
def test_initialization(self):
|
# Test that all expected values exist
|
||||||
"""Test proper initialization of style manager."""
|
weights = [FontWeight.NORMAL, FontWeight.BOLD]
|
||||||
style = self.style_manager.get_current_style()
|
self.assertEqual(len(weights), 2)
|
||||||
|
|
||||||
self.assertEqual(style['font_size'], 12)
|
def test_font_style_enum(self):
|
||||||
self.assertEqual(style['font_weight'], FontWeight.NORMAL)
|
"""Test FontStyle enum values."""
|
||||||
self.assertEqual(style['font_style'], FontStyle.NORMAL)
|
self.assertEqual(FontStyle.NORMAL.value, "normal")
|
||||||
self.assertEqual(style['decoration'], TextDecoration.NONE)
|
self.assertEqual(FontStyle.ITALIC.value, "italic")
|
||||||
self.assertEqual(style['color'], (0, 0, 0))
|
|
||||||
self.assertIsNone(style['background'])
|
|
||||||
self.assertEqual(style['language'], 'en_US')
|
|
||||||
|
|
||||||
def test_style_stack_operations(self):
|
# Test that all expected values exist
|
||||||
"""Test push and pop operations on style stack."""
|
styles = [FontStyle.NORMAL, FontStyle.ITALIC]
|
||||||
# Initial state
|
self.assertEqual(len(styles), 2)
|
||||||
initial_style = self.style_manager.get_current_style()
|
|
||||||
|
|
||||||
# Push a new style
|
def test_text_decoration_enum(self):
|
||||||
new_style = {'font_size': 16, 'font_weight': FontWeight.BOLD}
|
"""Test TextDecoration enum values."""
|
||||||
self.style_manager.push_style(new_style)
|
self.assertEqual(TextDecoration.NONE.value, "none")
|
||||||
|
self.assertEqual(TextDecoration.UNDERLINE.value, "underline")
|
||||||
|
self.assertEqual(TextDecoration.STRIKETHROUGH.value, "strikethrough")
|
||||||
|
|
||||||
current_style = self.style_manager.get_current_style()
|
# Test that all expected values exist
|
||||||
self.assertEqual(current_style['font_size'], 16)
|
decorations = [TextDecoration.NONE, TextDecoration.UNDERLINE, TextDecoration.STRIKETHROUGH]
|
||||||
self.assertEqual(current_style['font_weight'], FontWeight.BOLD)
|
self.assertEqual(len(decorations), 3)
|
||||||
self.assertEqual(current_style['color'], (0, 0, 0)) # Unchanged
|
|
||||||
|
|
||||||
# Pop the style
|
def test_alignment_enum(self):
|
||||||
self.style_manager.pop_style()
|
"""Test Alignment enum values."""
|
||||||
restored_style = self.style_manager.get_current_style()
|
self.assertEqual(Alignment.LEFT.value, 1)
|
||||||
self.assertEqual(restored_style, initial_style)
|
self.assertEqual(Alignment.CENTER.value, 2)
|
||||||
|
self.assertEqual(Alignment.RIGHT.value, 3)
|
||||||
|
self.assertEqual(Alignment.TOP.value, 4)
|
||||||
|
self.assertEqual(Alignment.BOTTOM.value, 5)
|
||||||
|
self.assertEqual(Alignment.JUSTIFY.value, 6)
|
||||||
|
|
||||||
def test_tag_styles(self):
|
def test_font_initialization_defaults(self):
|
||||||
"""Test default styles for HTML tags."""
|
"""Test Font initialization with default values."""
|
||||||
h1_style = self.style_manager.get_tag_style('h1')
|
font = Font()
|
||||||
self.assertEqual(h1_style['font_size'], 24)
|
|
||||||
self.assertEqual(h1_style['font_weight'], FontWeight.BOLD)
|
|
||||||
|
|
||||||
h6_style = self.style_manager.get_tag_style('h6')
|
self.assertIsNone(font._font_path)
|
||||||
self.assertEqual(h6_style['font_size'], 12)
|
self.assertEqual(font.font_size, 12)
|
||||||
self.assertEqual(h6_style['font_weight'], FontWeight.BOLD)
|
self.assertEqual(font.colour, (0, 0, 0))
|
||||||
|
self.assertEqual(font.color, (0, 0, 0)) # Alias
|
||||||
|
self.assertEqual(font.weight, FontWeight.NORMAL)
|
||||||
|
self.assertEqual(font.style, FontStyle.NORMAL)
|
||||||
|
self.assertEqual(font.decoration, TextDecoration.NONE)
|
||||||
|
self.assertEqual(font.background, (255, 255, 255, 0)) # Transparent
|
||||||
|
self.assertEqual(font.language, "en_EN")
|
||||||
|
|
||||||
em_style = self.style_manager.get_tag_style('em')
|
def test_font_initialization_custom(self):
|
||||||
self.assertEqual(em_style['font_style'], FontStyle.ITALIC)
|
"""Test Font initialization with custom values."""
|
||||||
|
font = Font(
|
||||||
unknown_style = self.style_manager.get_tag_style('unknown')
|
font_path="/path/to/font.ttf",
|
||||||
self.assertEqual(unknown_style, {})
|
font_size=16,
|
||||||
|
colour=(255, 0, 0),
|
||||||
def test_inline_style_parsing(self):
|
weight=FontWeight.BOLD,
|
||||||
"""Test parsing of inline CSS styles."""
|
style=FontStyle.ITALIC,
|
||||||
# Test font-size
|
decoration=TextDecoration.UNDERLINE,
|
||||||
style = self.style_manager.parse_inline_style('font-size: 18px')
|
background=(255, 255, 0, 255),
|
||||||
self.assertEqual(style['font_size'], 18)
|
langauge="fr_FR"
|
||||||
|
|
||||||
style = self.style_manager.parse_inline_style('font-size: 14pt')
|
|
||||||
self.assertEqual(style['font_size'], 14)
|
|
||||||
|
|
||||||
# Test font-weight
|
|
||||||
style = self.style_manager.parse_inline_style('font-weight: bold')
|
|
||||||
self.assertEqual(style['font_weight'], FontWeight.BOLD)
|
|
||||||
|
|
||||||
# Test font-style
|
|
||||||
style = self.style_manager.parse_inline_style('font-style: italic')
|
|
||||||
self.assertEqual(style['font_style'], FontStyle.ITALIC)
|
|
||||||
|
|
||||||
# Test text-decoration
|
|
||||||
style = self.style_manager.parse_inline_style('text-decoration: underline')
|
|
||||||
self.assertEqual(style['decoration'], TextDecoration.UNDERLINE)
|
|
||||||
|
|
||||||
# Test multiple properties
|
|
||||||
style = self.style_manager.parse_inline_style(
|
|
||||||
'font-size: 20px; font-weight: bold; color: red'
|
|
||||||
)
|
)
|
||||||
self.assertEqual(style['font_size'], 20)
|
|
||||||
self.assertEqual(style['font_weight'], FontWeight.BOLD)
|
|
||||||
self.assertEqual(style['color'], (255, 0, 0))
|
|
||||||
|
|
||||||
def test_color_parsing(self):
|
|
||||||
"""Test CSS color parsing."""
|
|
||||||
# Named colors
|
|
||||||
self.assertEqual(self.style_manager.parse_color('red'), (255, 0, 0))
|
|
||||||
self.assertEqual(self.style_manager.parse_color('blue'), (0, 0, 255))
|
|
||||||
self.assertEqual(self.style_manager.parse_color('white'), (255, 255, 255))
|
|
||||||
self.assertEqual(self.style_manager.parse_color('gray'), (128, 128, 128))
|
|
||||||
self.assertEqual(self.style_manager.parse_color('grey'), (128, 128, 128))
|
|
||||||
|
|
||||||
# Hex colors
|
|
||||||
self.assertEqual(self.style_manager.parse_color('#ff0000'), (255, 0, 0))
|
|
||||||
self.assertEqual(self.style_manager.parse_color('#00ff00'), (0, 255, 0))
|
|
||||||
self.assertEqual(self.style_manager.parse_color('#f00'), (255, 0, 0))
|
|
||||||
self.assertEqual(self.style_manager.parse_color('#0f0'), (0, 255, 0))
|
|
||||||
|
|
||||||
# RGB colors
|
|
||||||
self.assertEqual(self.style_manager.parse_color('rgb(255, 0, 0)'), (255, 0, 0))
|
|
||||||
self.assertEqual(self.style_manager.parse_color('rgb(128, 128, 128)'), (128, 128, 128))
|
|
||||||
self.assertEqual(self.style_manager.parse_color('rgb( 255 , 255 , 255 )'), (255, 255, 255))
|
|
||||||
|
|
||||||
# RGBA colors (alpha ignored)
|
|
||||||
self.assertEqual(self.style_manager.parse_color('rgba(255, 0, 0, 0.5)'), (255, 0, 0))
|
|
||||||
|
|
||||||
# Invalid colors
|
|
||||||
self.assertIsNone(self.style_manager.parse_color('invalid'))
|
|
||||||
self.assertIsNone(self.style_manager.parse_color('#gg0000'))
|
|
||||||
self.assertIsNone(self.style_manager.parse_color('rgb(300, 0, 0)')) # Invalid values return None
|
|
||||||
|
|
||||||
def test_color_clamping(self):
|
|
||||||
"""Test that RGB values outside valid range return None."""
|
|
||||||
# Values outside 0-255 range should return None
|
|
||||||
color = self.style_manager.parse_color('rgb(300, -10, 128)')
|
|
||||||
self.assertIsNone(color) # Invalid values return None
|
|
||||||
|
|
||||||
def test_apply_style_to_element(self):
|
|
||||||
"""Test combining tag styles with inline styles."""
|
|
||||||
# Test h1 with inline style
|
|
||||||
attrs = {'style': 'color: blue; font-size: 30px'}
|
|
||||||
combined = self.style_manager.apply_style_to_element('h1', attrs)
|
|
||||||
|
|
||||||
# Should have h1 defaults plus inline overrides
|
|
||||||
self.assertEqual(combined['font_size'], 30) # Overridden
|
|
||||||
self.assertEqual(combined['font_weight'], FontWeight.BOLD) # From h1
|
|
||||||
self.assertEqual(combined['color'], (0, 0, 255)) # Inline
|
|
||||||
|
|
||||||
# Test without inline styles
|
|
||||||
combined = self.style_manager.apply_style_to_element('strong', {})
|
|
||||||
self.assertEqual(combined['font_weight'], FontWeight.BOLD)
|
|
||||||
|
|
||||||
def test_reset(self):
|
|
||||||
"""Test resetting the style manager."""
|
|
||||||
# Change the state
|
|
||||||
self.style_manager.push_style({'font_size': 20})
|
|
||||||
self.style_manager.push_style({'color': (255, 0, 0)})
|
|
||||||
|
|
||||||
# Reset
|
|
||||||
self.style_manager.reset()
|
|
||||||
|
|
||||||
# Should be back to initial state
|
|
||||||
style = self.style_manager.get_current_style()
|
|
||||||
self.assertEqual(style['font_size'], 12)
|
|
||||||
self.assertEqual(style['color'], (0, 0, 0))
|
|
||||||
self.assertEqual(len(self.style_manager._style_stack), 0)
|
|
||||||
|
|
||||||
def test_font_creation(self):
|
|
||||||
"""Test Font object creation from current style."""
|
|
||||||
# Set some specific styles
|
|
||||||
self.style_manager.push_style({
|
|
||||||
'font_size': 16,
|
|
||||||
'font_weight': FontWeight.BOLD,
|
|
||||||
'font_style': FontStyle.ITALIC,
|
|
||||||
'decoration': TextDecoration.UNDERLINE,
|
|
||||||
'color': (255, 0, 0),
|
|
||||||
'background': (255, 255, 0, 255)
|
|
||||||
})
|
|
||||||
|
|
||||||
font = self.style_manager.create_font()
|
|
||||||
|
|
||||||
|
self.assertEqual(font._font_path, "/path/to/font.ttf")
|
||||||
self.assertEqual(font.font_size, 16)
|
self.assertEqual(font.font_size, 16)
|
||||||
|
self.assertEqual(font.colour, (255, 0, 0))
|
||||||
self.assertEqual(font.weight, FontWeight.BOLD)
|
self.assertEqual(font.weight, FontWeight.BOLD)
|
||||||
self.assertEqual(font.style, FontStyle.ITALIC)
|
self.assertEqual(font.style, FontStyle.ITALIC)
|
||||||
self.assertEqual(font.decoration, TextDecoration.UNDERLINE)
|
self.assertEqual(font.decoration, TextDecoration.UNDERLINE)
|
||||||
self.assertEqual(font.colour, (255, 0, 0))
|
|
||||||
self.assertEqual(font.background, (255, 255, 0, 255))
|
self.assertEqual(font.background, (255, 255, 0, 255))
|
||||||
|
self.assertEqual(font.language, "fr_FR")
|
||||||
|
|
||||||
|
def test_font_with_methods(self):
|
||||||
|
"""Test Font immutable modification methods."""
|
||||||
|
original_font = Font(
|
||||||
|
font_size=12,
|
||||||
|
colour=(0, 0, 0),
|
||||||
|
weight=FontWeight.NORMAL,
|
||||||
|
style=FontStyle.NORMAL,
|
||||||
|
decoration=TextDecoration.NONE
|
||||||
|
)
|
||||||
|
|
||||||
|
# Test with_size
|
||||||
|
size_font = original_font.with_size(16)
|
||||||
|
self.assertEqual(size_font.font_size, 16)
|
||||||
|
self.assertEqual(original_font.font_size, 12) # Original unchanged
|
||||||
|
self.assertEqual(size_font.colour, (0, 0, 0)) # Other properties preserved
|
||||||
|
|
||||||
|
# Test with_colour
|
||||||
|
color_font = original_font.with_colour((255, 0, 0))
|
||||||
|
self.assertEqual(color_font.colour, (255, 0, 0))
|
||||||
|
self.assertEqual(original_font.colour, (0, 0, 0)) # Original unchanged
|
||||||
|
self.assertEqual(color_font.font_size, 12) # Other properties preserved
|
||||||
|
|
||||||
|
# Test with_weight
|
||||||
|
weight_font = original_font.with_weight(FontWeight.BOLD)
|
||||||
|
self.assertEqual(weight_font.weight, FontWeight.BOLD)
|
||||||
|
self.assertEqual(original_font.weight, FontWeight.NORMAL) # Original unchanged
|
||||||
|
|
||||||
|
# Test with_style
|
||||||
|
style_font = original_font.with_style(FontStyle.ITALIC)
|
||||||
|
self.assertEqual(style_font.style, FontStyle.ITALIC)
|
||||||
|
self.assertEqual(original_font.style, FontStyle.NORMAL) # Original unchanged
|
||||||
|
|
||||||
|
# Test with_decoration
|
||||||
|
decoration_font = original_font.with_decoration(TextDecoration.UNDERLINE)
|
||||||
|
self.assertEqual(decoration_font.decoration, TextDecoration.UNDERLINE)
|
||||||
|
self.assertEqual(original_font.decoration, TextDecoration.NONE) # Original unchanged
|
||||||
|
|
||||||
|
def test_font_property_access(self):
|
||||||
|
"""Test Font property access methods."""
|
||||||
|
font = Font(
|
||||||
|
font_size=20,
|
||||||
|
colour=(128, 128, 128),
|
||||||
|
weight=FontWeight.BOLD,
|
||||||
|
style=FontStyle.ITALIC,
|
||||||
|
decoration=TextDecoration.STRIKETHROUGH
|
||||||
|
)
|
||||||
|
|
||||||
|
# Test all property getters
|
||||||
|
self.assertEqual(font.font_size, 20)
|
||||||
|
self.assertEqual(font.colour, (128, 128, 128))
|
||||||
|
self.assertEqual(font.color, (128, 128, 128)) # Alias
|
||||||
|
self.assertEqual(font.weight, FontWeight.BOLD)
|
||||||
|
self.assertEqual(font.style, FontStyle.ITALIC)
|
||||||
|
self.assertEqual(font.decoration, TextDecoration.STRIKETHROUGH)
|
||||||
|
|
||||||
|
# Test that font object is accessible
|
||||||
|
self.assertIsNotNone(font.font)
|
||||||
|
|
||||||
|
def test_font_immutability(self):
|
||||||
|
"""Test that Font objects behave immutably."""
|
||||||
|
font1 = Font(font_size=12, colour=(0, 0, 0))
|
||||||
|
font2 = font1.with_size(16)
|
||||||
|
font3 = font2.with_colour((255, 0, 0))
|
||||||
|
|
||||||
|
# Each should be different objects
|
||||||
|
self.assertIsNot(font1, font2)
|
||||||
|
self.assertIsNot(font2, font3)
|
||||||
|
self.assertIsNot(font1, font3)
|
||||||
|
|
||||||
|
# Original properties should be unchanged
|
||||||
|
self.assertEqual(font1.font_size, 12)
|
||||||
|
self.assertEqual(font1.colour, (0, 0, 0))
|
||||||
|
|
||||||
|
self.assertEqual(font2.font_size, 16)
|
||||||
|
self.assertEqual(font2.colour, (0, 0, 0))
|
||||||
|
|
||||||
|
self.assertEqual(font3.font_size, 16)
|
||||||
|
self.assertEqual(font3.colour, (255, 0, 0))
|
||||||
|
|
||||||
|
def test_background_handling(self):
|
||||||
|
"""Test background color handling."""
|
||||||
|
# Test default transparent background
|
||||||
|
font1 = Font()
|
||||||
|
self.assertEqual(font1.background, (255, 255, 255, 0))
|
||||||
|
|
||||||
|
# Test explicit background
|
||||||
|
font2 = Font(background=(255, 0, 0, 128))
|
||||||
|
self.assertEqual(font2.background, (255, 0, 0, 128))
|
||||||
|
|
||||||
|
# Test None background becomes transparent
|
||||||
|
font3 = Font(background=None)
|
||||||
|
self.assertEqual(font3.background, (255, 255, 255, 0))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|||||||
@ -1,247 +0,0 @@
|
|||||||
"""
|
|
||||||
Unit tests for HTML text processing.
|
|
||||||
|
|
||||||
Tests the HTMLTextProcessor class for text buffering, entity handling, and word creation.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import unittest
|
|
||||||
from unittest.mock import Mock, MagicMock
|
|
||||||
from pyWebLayout.io.readers.html_text import HTMLTextProcessor
|
|
||||||
from pyWebLayout.io.readers.html_style import HTMLStyleManager
|
|
||||||
from pyWebLayout.abstract.block import Paragraph
|
|
||||||
from pyWebLayout.abstract.inline import Word
|
|
||||||
|
|
||||||
|
|
||||||
class TestHTMLTextProcessor(unittest.TestCase):
|
|
||||||
"""Test cases for HTMLTextProcessor."""
|
|
||||||
|
|
||||||
def setUp(self):
|
|
||||||
"""Set up test fixtures."""
|
|
||||||
self.style_manager = HTMLStyleManager()
|
|
||||||
self.text_processor = HTMLTextProcessor(self.style_manager)
|
|
||||||
|
|
||||||
# Create a mock paragraph
|
|
||||||
self.mock_paragraph = Mock(spec=Paragraph)
|
|
||||||
self.mock_paragraph.add_word = Mock()
|
|
||||||
|
|
||||||
def test_initialization(self):
|
|
||||||
"""Test proper initialization of text processor."""
|
|
||||||
self.assertEqual(self.text_processor._text_buffer, "")
|
|
||||||
self.assertIsNone(self.text_processor._current_paragraph)
|
|
||||||
self.assertEqual(self.text_processor._style_manager, self.style_manager)
|
|
||||||
|
|
||||||
def test_add_text(self):
|
|
||||||
"""Test adding text to buffer."""
|
|
||||||
self.text_processor.add_text("Hello")
|
|
||||||
self.assertEqual(self.text_processor.get_buffer_content(), "Hello")
|
|
||||||
|
|
||||||
self.text_processor.add_text(" World")
|
|
||||||
self.assertEqual(self.text_processor.get_buffer_content(), "Hello World")
|
|
||||||
|
|
||||||
def test_entity_references(self):
|
|
||||||
"""Test HTML entity reference handling."""
|
|
||||||
test_cases = [
|
|
||||||
('lt', '<'),
|
|
||||||
('gt', '>'),
|
|
||||||
('amp', '&'),
|
|
||||||
('quot', '"'),
|
|
||||||
('apos', "'"),
|
|
||||||
('nbsp', ' '),
|
|
||||||
('copy', '©'),
|
|
||||||
('reg', '®'),
|
|
||||||
('trade', '™'),
|
|
||||||
('mdash', '—'),
|
|
||||||
('ndash', '–'),
|
|
||||||
('hellip', '…'),
|
|
||||||
('euro', '€'),
|
|
||||||
('unknown', '&unknown;') # Unknown entities should be preserved
|
|
||||||
]
|
|
||||||
|
|
||||||
for entity, expected in test_cases:
|
|
||||||
with self.subTest(entity=entity):
|
|
||||||
self.text_processor.clear_buffer()
|
|
||||||
self.text_processor.add_entity_reference(entity)
|
|
||||||
self.assertEqual(self.text_processor.get_buffer_content(), expected)
|
|
||||||
|
|
||||||
def test_character_references(self):
|
|
||||||
"""Test character reference handling."""
|
|
||||||
# Decimal character references
|
|
||||||
self.text_processor.clear_buffer()
|
|
||||||
self.text_processor.add_character_reference('65') # 'A'
|
|
||||||
self.assertEqual(self.text_processor.get_buffer_content(), 'A')
|
|
||||||
|
|
||||||
# Hexadecimal character references
|
|
||||||
self.text_processor.clear_buffer()
|
|
||||||
self.text_processor.add_character_reference('x41') # 'A'
|
|
||||||
self.assertEqual(self.text_processor.get_buffer_content(), 'A')
|
|
||||||
|
|
||||||
# Unicode character
|
|
||||||
self.text_processor.clear_buffer()
|
|
||||||
self.text_processor.add_character_reference('8364') # Euro symbol
|
|
||||||
self.assertEqual(self.text_processor.get_buffer_content(), '€')
|
|
||||||
|
|
||||||
# Invalid character reference
|
|
||||||
self.text_processor.clear_buffer()
|
|
||||||
self.text_processor.add_character_reference('invalid')
|
|
||||||
self.assertEqual(self.text_processor.get_buffer_content(), '&#invalid;')
|
|
||||||
|
|
||||||
# Out of range character
|
|
||||||
self.text_processor.clear_buffer()
|
|
||||||
self.text_processor.add_character_reference('99999999999')
|
|
||||||
self.assertTrue(self.text_processor.get_buffer_content().startswith('&#'))
|
|
||||||
|
|
||||||
def test_buffer_operations(self):
|
|
||||||
"""Test buffer state operations."""
|
|
||||||
# Test has_pending_text
|
|
||||||
self.assertFalse(self.text_processor.has_pending_text())
|
|
||||||
|
|
||||||
self.text_processor.add_text("Some text")
|
|
||||||
self.assertTrue(self.text_processor.has_pending_text())
|
|
||||||
|
|
||||||
# Test clear_buffer
|
|
||||||
self.text_processor.clear_buffer()
|
|
||||||
self.assertFalse(self.text_processor.has_pending_text())
|
|
||||||
self.assertEqual(self.text_processor.get_buffer_content(), "")
|
|
||||||
|
|
||||||
# Test with whitespace only
|
|
||||||
self.text_processor.add_text(" \n\t ")
|
|
||||||
self.assertFalse(self.text_processor.has_pending_text()) # Should ignore whitespace
|
|
||||||
|
|
||||||
def test_paragraph_management(self):
|
|
||||||
"""Test current paragraph setting."""
|
|
||||||
# Initially no paragraph
|
|
||||||
self.assertIsNone(self.text_processor._current_paragraph)
|
|
||||||
|
|
||||||
# Set paragraph
|
|
||||||
self.text_processor.set_current_paragraph(self.mock_paragraph)
|
|
||||||
self.assertEqual(self.text_processor._current_paragraph, self.mock_paragraph)
|
|
||||||
|
|
||||||
# Clear paragraph
|
|
||||||
self.text_processor.set_current_paragraph(None)
|
|
||||||
self.assertIsNone(self.text_processor._current_paragraph)
|
|
||||||
|
|
||||||
def test_flush_text_with_paragraph(self):
|
|
||||||
"""Test flushing text when paragraph is set."""
|
|
||||||
self.text_processor.set_current_paragraph(self.mock_paragraph)
|
|
||||||
self.text_processor.add_text("Hello world test")
|
|
||||||
|
|
||||||
# Mock the style manager to return a specific font
|
|
||||||
mock_font = Mock()
|
|
||||||
self.style_manager.create_font = Mock(return_value=mock_font)
|
|
||||||
|
|
||||||
result = self.text_processor.flush_text()
|
|
||||||
|
|
||||||
# Should return True (text was flushed)
|
|
||||||
self.assertTrue(result)
|
|
||||||
|
|
||||||
# Should have created words
|
|
||||||
self.assertEqual(self.mock_paragraph.add_word.call_count, 3) # "Hello", "world", "test"
|
|
||||||
|
|
||||||
# Verify the words were created with correct text
|
|
||||||
calls = self.mock_paragraph.add_word.call_args_list
|
|
||||||
word_texts = [call[0][0].text for call in calls]
|
|
||||||
self.assertEqual(word_texts, ["Hello", "world", "test"])
|
|
||||||
|
|
||||||
# Buffer should be empty after flush
|
|
||||||
self.assertEqual(self.text_processor.get_buffer_content(), "")
|
|
||||||
|
|
||||||
def test_flush_text_without_paragraph(self):
|
|
||||||
"""Test flushing text when no paragraph is set."""
|
|
||||||
self.text_processor.add_text("Hello world")
|
|
||||||
|
|
||||||
result = self.text_processor.flush_text()
|
|
||||||
|
|
||||||
# Should return False (no paragraph to flush to)
|
|
||||||
self.assertFalse(result)
|
|
||||||
|
|
||||||
# Buffer should be cleared anyway
|
|
||||||
self.assertEqual(self.text_processor.get_buffer_content(), "")
|
|
||||||
|
|
||||||
def test_flush_empty_buffer(self):
|
|
||||||
"""Test flushing when buffer is empty."""
|
|
||||||
self.text_processor.set_current_paragraph(self.mock_paragraph)
|
|
||||||
|
|
||||||
result = self.text_processor.flush_text()
|
|
||||||
|
|
||||||
# Should return False (nothing to flush)
|
|
||||||
self.assertFalse(result)
|
|
||||||
|
|
||||||
# No words should be added
|
|
||||||
self.mock_paragraph.add_word.assert_not_called()
|
|
||||||
|
|
||||||
def test_flush_whitespace_only(self):
|
|
||||||
"""Test flushing when buffer contains only whitespace."""
|
|
||||||
self.text_processor.set_current_paragraph(self.mock_paragraph)
|
|
||||||
self.text_processor.add_text(" \n\t ")
|
|
||||||
|
|
||||||
result = self.text_processor.flush_text()
|
|
||||||
|
|
||||||
# Should return False (no meaningful content)
|
|
||||||
self.assertFalse(result)
|
|
||||||
|
|
||||||
# No words should be added
|
|
||||||
self.mock_paragraph.add_word.assert_not_called()
|
|
||||||
|
|
||||||
def test_word_creation_with_styling(self):
|
|
||||||
"""Test that words are created with proper styling."""
|
|
||||||
self.text_processor.set_current_paragraph(self.mock_paragraph)
|
|
||||||
self.text_processor.add_text("styled text")
|
|
||||||
|
|
||||||
# Set up style manager to return specific font
|
|
||||||
mock_font = Mock()
|
|
||||||
mock_font.font_size = 16
|
|
||||||
mock_font.weight = "bold"
|
|
||||||
self.style_manager.create_font = Mock(return_value=mock_font)
|
|
||||||
|
|
||||||
self.text_processor.flush_text()
|
|
||||||
|
|
||||||
# Verify font was created
|
|
||||||
self.style_manager.create_font.assert_called()
|
|
||||||
|
|
||||||
# Verify words were created with the font
|
|
||||||
calls = self.mock_paragraph.add_word.call_args_list
|
|
||||||
for call in calls:
|
|
||||||
word = call[0][0]
|
|
||||||
self.assertEqual(word.style, mock_font)
|
|
||||||
|
|
||||||
def test_reset(self):
|
|
||||||
"""Test resetting the text processor."""
|
|
||||||
# Set up some state
|
|
||||||
self.text_processor.set_current_paragraph(self.mock_paragraph)
|
|
||||||
self.text_processor.add_text("Some text")
|
|
||||||
|
|
||||||
# Reset
|
|
||||||
self.text_processor.reset()
|
|
||||||
|
|
||||||
# Should be back to initial state
|
|
||||||
self.assertEqual(self.text_processor._text_buffer, "")
|
|
||||||
self.assertIsNone(self.text_processor._current_paragraph)
|
|
||||||
|
|
||||||
def test_complex_text_processing(self):
|
|
||||||
"""Test processing text with mixed content."""
|
|
||||||
self.text_processor.set_current_paragraph(self.mock_paragraph)
|
|
||||||
|
|
||||||
# Mock font creation
|
|
||||||
mock_font = Mock()
|
|
||||||
self.style_manager.create_font = Mock(return_value=mock_font)
|
|
||||||
|
|
||||||
# Add mixed content
|
|
||||||
self.text_processor.add_text("Hello ")
|
|
||||||
self.text_processor.add_entity_reference('amp')
|
|
||||||
self.text_processor.add_text(" world")
|
|
||||||
self.text_processor.add_character_reference('33') # '!'
|
|
||||||
|
|
||||||
# Should have "Hello & world!"
|
|
||||||
expected_content = "Hello & world!"
|
|
||||||
self.assertEqual(self.text_processor.get_buffer_content(), expected_content)
|
|
||||||
|
|
||||||
# Flush and verify words
|
|
||||||
self.text_processor.flush_text()
|
|
||||||
|
|
||||||
calls = self.mock_paragraph.add_word.call_args_list
|
|
||||||
word_texts = [call[0][0].text for call in calls]
|
|
||||||
self.assertEqual(word_texts, ["Hello", "&", "world!"])
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
unittest.main()
|
|
||||||
Loading…
x
Reference in New Issue
Block a user