diff --git a/pyWebLayout/abstract/block.py b/pyWebLayout/abstract/block.py index c139fc5..8f568dd 100644 --- a/pyWebLayout/abstract/block.py +++ b/pyWebLayout/abstract/block.py @@ -1011,14 +1011,246 @@ class Table(Block): elif section.lower() == "footer": self._footer_rows.append(row) else: # Default to body - self._rows + self._rows.append(row) + + def create_row(self, section: str = "body", style=None) -> TableRow: + """ + Create a new table row and add it to this table. + + Args: + section: The section to add the row to ("header", "body", or "footer") + style: Optional style override. If None, inherits from table + + Returns: + The newly created TableRow object + """ + return TableRow.create_and_add_to(self, section, style) + + def header_rows(self) -> Iterator[TableRow]: + """ + Iterate over the header rows in this table. + + Yields: + Each TableRow in the header section + """ + for row in self._header_rows: + yield row + + def body_rows(self) -> Iterator[TableRow]: + """ + Iterate over the body rows in this table. + + Yields: + Each TableRow in the body section + """ + for row in self._rows: + yield row + + def footer_rows(self) -> Iterator[TableRow]: + """ + Iterate over the footer rows in this table. + + Yields: + Each TableRow in the footer section + """ + for row in self._footer_rows: + yield row + + def all_rows(self) -> Iterator[Tuple[str, TableRow]]: + """ + Iterate over all rows in this table with their section labels. 
+ + Yields: + Tuples of (section, row) for each row in the table + """ + for row in self._header_rows: + yield ("header", row) + for row in self._rows: + yield ("body", row) + for row in self._footer_rows: + yield ("footer", row) + + @property + def row_count(self) -> Dict[str, int]: + """Get the row counts by section""" + return { + "header": len(self._header_rows), + "body": len(self._rows), + "footer": len(self._footer_rows), + "total": len(self._header_rows) + len(self._rows) + len(self._footer_rows) + } +class Image(Block): + """ + An image element with source, dimensions, and alternative text. + """ + + def __init__(self, source: str = "", alt_text: str = "", width: Optional[int] = None, height: Optional[int] = None): + """ + Initialize an image element. + + Args: + source: The image source URL or path + alt_text: Alternative text for accessibility + width: Optional image width in pixels + height: Optional image height in pixels + """ + super().__init__(BlockType.IMAGE) + self._source = source + self._alt_text = alt_text + self._width = width + self._height = height + + @classmethod + def create_and_add_to(cls, container, source: str = "", alt_text: str = "", + width: Optional[int] = None, height: Optional[int] = None) -> 'Image': + """ + Create a new Image and add it to a container. 
+ + Args: + container: The container to add the image to (must have add_block method) + source: The image source URL or path + alt_text: Alternative text for accessibility + width: Optional image width in pixels + height: Optional image height in pixels + + Returns: + The newly created Image object + + Raises: + AttributeError: If the container doesn't have the required add_block method + """ + # Create the new image + image = cls(source, alt_text, width, height) + + # Add the image to the container + if hasattr(container, 'add_block'): + container.add_block(image) + else: + raise AttributeError(f"Container {type(container).__name__} must have an 'add_block' method") + + return image + + @property + def source(self) -> str: + """Get the image source""" + return self._source + + @source.setter + def source(self, source: str): + """Set the image source""" + self._source = source + + @property + def alt_text(self) -> str: + """Get the alternative text""" + return self._alt_text + + @alt_text.setter + def alt_text(self, alt_text: str): + """Set the alternative text""" + self._alt_text = alt_text + + @property + def width(self) -> Optional[int]: + """Get the image width""" + return self._width + + @width.setter + def width(self, width: Optional[int]): + """Set the image width""" + self._width = width + + @property + def height(self) -> Optional[int]: + """Get the image height""" + return self._height + + @height.setter + def height(self, height: Optional[int]): + """Set the image height""" + self._height = height + + def get_dimensions(self) -> Tuple[Optional[int], Optional[int]]: + """ + Get the image dimensions as a tuple. + + Returns: + Tuple of (width, height) + """ + return (self._width, self._height) + + def get_aspect_ratio(self) -> Optional[float]: + """ + Calculate the aspect ratio of the image. 
+ + Returns: + The aspect ratio (width/height) or None if either dimension is missing + """ + if self._width is not None and self._height is not None and self._height > 0: + return self._width / self._height + return None + + def calculate_scaled_dimensions(self, max_width: Optional[int] = None, + max_height: Optional[int] = None) -> Tuple[Optional[int], Optional[int]]: + """ + Calculate scaled dimensions that fit within the given constraints. + + Args: + max_width: Maximum allowed width + max_height: Maximum allowed height + + Returns: + Tuple of (scaled_width, scaled_height) + """ + if self._width is None or self._height is None: + return (self._width, self._height) + + width, height = self._width, self._height + + # Scale down if needed + if max_width is not None and width > max_width: + height = int(height * max_width / width) + width = max_width + + if max_height is not None and height > max_height: + width = int(width * max_height / height) + height = max_height + + return (width, height) -class Image: - pass - -class HorizontalRule: - - pass \ No newline at end of file +class HorizontalRule(Block): + """ + A horizontal rule element (hr tag). + """ + + def __init__(self): + """Initialize a horizontal rule element.""" + super().__init__(BlockType.HORIZONTAL_RULE) + + @classmethod + def create_and_add_to(cls, container) -> 'HorizontalRule': + """ + Create a new HorizontalRule and add it to a container. 
+ + Args: + container: The container to add the horizontal rule to (must have add_block method) + + Returns: + The newly created HorizontalRule object + + Raises: + AttributeError: If the container doesn't have the required add_block method + """ + # Create the new horizontal rule + hr = cls() + + # Add the horizontal rule to the container + if hasattr(container, 'add_block'): + container.add_block(hr) + else: + raise AttributeError(f"Container {type(container).__name__} must have an 'add_block' method") + + return hr diff --git a/pyWebLayout/abstract/functional.py b/pyWebLayout/abstract/functional.py index a8bf658..fa120f3 100644 --- a/pyWebLayout/abstract/functional.py +++ b/pyWebLayout/abstract/functional.py @@ -124,6 +124,11 @@ class Button(Interactable): """Enable or disable the button""" self._enabled = enabled + @property + def params(self) -> Dict[str, Any]: + """Get the button parameters""" + return self._params + def execute(self) -> Any: """ Execute the button's callback function if the button is enabled. diff --git a/pyWebLayout/abstract/inline.py b/pyWebLayout/abstract/inline.py index 7081612..e56b96a 100644 --- a/pyWebLayout/abstract/inline.py +++ b/pyWebLayout/abstract/inline.py @@ -2,6 +2,7 @@ from __future__ import annotations from pyWebLayout.base import Queriable from pyWebLayout.style import Font from typing import Tuple, Union, List, Optional, Dict +import pyphen class Word: @@ -157,9 +158,6 @@ class Word: Returns: bool: True if the word can be hyphenated, False otherwise. """ - # Only import pyphen when needed - import pyphen - # Use the provided language or fall back to style language lang = language if language else self._style.language dic = pyphen.Pyphen(lang=lang) @@ -178,9 +176,6 @@ class Word: Returns: bool: True if the word was hyphenated, False otherwise. 
""" - # Only import pyphen when needed - import pyphen - # Use the provided language or fall back to style language lang = language if language else self._style.language dic = pyphen.Pyphen(lang=lang) @@ -333,5 +328,58 @@ class FormattedSpan: class LineBreak: - - pass \ No newline at end of file + """ + A line break element that forces a new line within text content. + While this is an inline element that can occur within paragraphs, + it has block-like properties for consistency with the abstract model. + """ + + def __init__(self): + """Initialize a line break element.""" + # Import here to avoid circular imports + from .block import BlockType + self._block_type = BlockType.LINE_BREAK + self._parent = None + + @property + def block_type(self): + """Get the block type for this line break""" + return self._block_type + + @property + def parent(self): + """Get the parent element containing this line break, if any""" + return self._parent + + @parent.setter + def parent(self, parent): + """Set the parent element""" + self._parent = parent + + @classmethod + def create_and_add_to(cls, container) -> 'LineBreak': + """ + Create a new LineBreak and add it to a container. 
+ + Args: + container: The container to add the line break to + + Returns: + The newly created LineBreak object + """ + # Create the new line break + line_break = cls() + + # Add the line break to the container if it has an appropriate method + if hasattr(container, 'add_line_break'): + container.add_line_break(line_break) + elif hasattr(container, 'add_element'): + container.add_element(line_break) + elif hasattr(container, 'add_word'): + # Some containers might treat line breaks like words + container.add_word(line_break) + else: + # Set parent relationship manually + line_break.parent = container + + return line_break diff --git a/pyWebLayout/io/__init__.py b/pyWebLayout/io/__init__.py index 3e3abcb..a1fd9ac 100644 --- a/pyWebLayout/io/__init__.py +++ b/pyWebLayout/io/__init__.py @@ -21,9 +21,11 @@ from pyWebLayout.io.readers.base import BaseReader, MetadataReader, ContentReade # Specialized HTML readers from pyWebLayout.io.readers.html_metadata import HTMLMetadataReader -from pyWebLayout.io.readers.html_content import HTMLContentReader from pyWebLayout.io.readers.html_resources import HTMLResourceReader +# HTML extraction parser (the best approach) +from pyWebLayout.io.readers.html_extraction import parse_html_string as parse_html_extraction + # Specialized EPUB readers from pyWebLayout.io.readers.epub_metadata import EPUBMetadataReader diff --git a/pyWebLayout/io/readers/__init__.py b/pyWebLayout/io/readers/__init__.py index 93be0d7..950535a 100644 --- a/pyWebLayout/io/readers/__init__.py +++ b/pyWebLayout/io/readers/__init__.py @@ -11,13 +11,8 @@ from .base import BaseReader, MetadataReader, ContentReader, ResourceReader, Com # HTML readers (decomposed) from .html import HTMLReader, read_html, read_html_file, parse_html_string from .html_metadata import HTMLMetadataReader -from .html_content import HTMLContentReader from .html_resources import HTMLResourceReader -# HTML processing components (supporting modules) -from .html_style import HTMLStyleManager -from 
.html_text import HTMLTextProcessor -from .html_elements import BlockElementHandler, ListElementHandler, TableElementHandler, InlineElementHandler # EPUB readers from .epub_reader import read_epub # Legacy @@ -29,7 +24,7 @@ __all__ = [ # HTML readers 'HTMLReader', 'read_html', 'read_html_file', 'parse_html_string', - 'HTMLMetadataReader', 'HTMLContentReader', 'HTMLResourceReader', + 'HTMLMetadataReader', 'HTMLResourceReader', # EPUB readers 'read_epub', 'EPUBMetadataReader', diff --git a/pyWebLayout/io/readers/html.py b/pyWebLayout/io/readers/html.py index 4433421..4e1cc16 100644 --- a/pyWebLayout/io/readers/html.py +++ b/pyWebLayout/io/readers/html.py @@ -1,36 +1,33 @@ """ Modern HTML reader for pyWebLayout. -This module provides a decomposed HTML reader that uses specialized -readers for metadata, content, and resources, following the pattern -established in the abstract module. +This module provides an HTML reader that uses the html_extraction module +for clean, handler-based parsing using BeautifulSoup. """ import os from typing import Union, Optional from pyWebLayout.abstract.document import Document -from pyWebLayout.io.readers.base import CompositeReader +from pyWebLayout.io.readers.base import BaseReader from pyWebLayout.io.readers.html_metadata import HTMLMetadataReader -from pyWebLayout.io.readers.html_content import HTMLContentReader from pyWebLayout.io.readers.html_resources import HTMLResourceReader +from pyWebLayout.io.readers.html_extraction import parse_html_string +from pyWebLayout.style import Font -class HTMLReader(CompositeReader): +class HTMLReader(BaseReader): """ - Modern HTML reader using decomposed architecture. + Modern HTML reader using the html_extraction parser. - This reader combines specialized readers for metadata, content, - and resources to provide a complete HTML parsing solution. 
+ This reader uses the clean, handler-based architecture from html_extraction.py + for parsing HTML content into pyWebLayout's abstract document structure. """ def __init__(self): - """Initialize the HTML reader with all specialized readers.""" + """Initialize the HTML reader.""" super().__init__() - - # Set up specialized readers - self.set_metadata_reader(HTMLMetadataReader()) - self.set_content_reader(HTMLContentReader()) - self.set_resource_reader(HTMLResourceReader()) + self._metadata_reader = HTMLMetadataReader() + self._resource_reader = HTMLResourceReader() def can_read(self, source: Union[str, bytes]) -> bool: """ @@ -76,6 +73,7 @@ class HTMLReader(CompositeReader): - encoding: Character encoding (default: 'utf-8') - extract_metadata: Whether to extract metadata (default: True) - extract_resources: Whether to extract resources (default: True) + - base_font: Base font for styling (default: None) Returns: The parsed Document @@ -85,6 +83,7 @@ class HTMLReader(CompositeReader): encoding = options.get('encoding', 'utf-8') extract_metadata = options.get('extract_metadata', True) extract_resources = options.get('extract_resources', True) + base_font = options.get('base_font') # Read the HTML content html_content = self._read_html_content(source, encoding) @@ -93,10 +92,6 @@ class HTMLReader(CompositeReader): if not base_url and isinstance(source, str) and os.path.isfile(source): base_url = f"file://{os.path.dirname(os.path.abspath(source))}/" - # Set base URL in content reader - if self._content_reader and hasattr(self._content_reader, 'set_base_url'): - self._content_reader.set_base_url(base_url) - # Create a new document document = Document() @@ -104,9 +99,10 @@ class HTMLReader(CompositeReader): if extract_metadata and self._metadata_reader: self._metadata_reader.extract_metadata(html_content, document) - # Extract content - if self._content_reader: - self._content_reader.extract_content(html_content, document) + # Parse content using html_extraction + blocks 
= parse_html_string(html_content, base_font) + for block in blocks: + document.add_block(block) # Extract resources if enabled if extract_resources and self._resource_reader: diff --git a/pyWebLayout/io/readers/html_content.py b/pyWebLayout/io/readers/html_content.py deleted file mode 100644 index 18373e5..0000000 --- a/pyWebLayout/io/readers/html_content.py +++ /dev/null @@ -1,269 +0,0 @@ -""" -Modern HTML content reader for pyWebLayout. - -This module provides a decomposed HTML content reader that uses specialized -handlers and managers for different aspects of HTML parsing. -""" - -from html.parser import HTMLParser as BaseHTMLParser -from typing import Dict, List, Optional, Tuple, Union, Any -from pyWebLayout.abstract.document import Document -from pyWebLayout.io.readers.base import ContentReader -from pyWebLayout.io.readers.html_style import HTMLStyleManager -from pyWebLayout.io.readers.html_text import HTMLTextProcessor -from pyWebLayout.io.readers.html_elements import ( - BlockElementHandler, ListElementHandler, TableElementHandler, InlineElementHandler -) - - -class HTMLContentReader(ContentReader, BaseHTMLParser): - """ - Modern HTML content reader using decomposed architecture. - - This class orchestrates specialized handlers to parse HTML content - and convert it to pyWebLayout's abstract document model. 
- """ - - def __init__(self): - """Initialize the HTML content reader.""" - BaseHTMLParser.__init__(self) - - # Initialize managers and processors - self.style_manager = HTMLStyleManager() - self.text_processor = HTMLTextProcessor(self.style_manager) - - # Initialize element handlers - self.block_handler = BlockElementHandler(self.style_manager, self.text_processor) - self.list_handler = ListElementHandler(self.text_processor) - self.table_handler = TableElementHandler(self.text_processor) - self.inline_handler = InlineElementHandler(self.text_processor) - - # Document and parsing state - self._document: Optional[Document] = None - self._in_head = False - self._in_script = False - self._in_style = False - - def extract_content(self, html_content: str, document: Document) -> Any: - """ - Extract content from HTML. - - Args: - html_content: The HTML content to parse - document: The document to populate with content - - Returns: - The document with populated content - """ - self._document = document - self._reset_state() - - # Parse the HTML content - self.feed(html_content) - - # Flush any remaining text - self.text_processor.flush_text() - - return document - - def set_base_url(self, base_url: str): - """Set the base URL for resolving relative links.""" - self.inline_handler.set_base_url(base_url) - - def _reset_state(self): - """Reset all parser state for new content.""" - # Reset managers and processors - self.style_manager.reset() - self.text_processor.reset() - - # Reset element handlers - self.block_handler.reset() - self.list_handler.reset() - self.table_handler.reset() - self.inline_handler.reset() - - # Reset parser flags - self._in_head = False - self._in_script = False - self._in_style = False - - def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]): - """Handle the start of an HTML tag.""" - tag = tag.lower() - attrs_dict = dict(attrs) - - # Skip content in head, script, style (except body) - if self._should_skip_content(tag): - 
return - - # Handle special section markers - if self._handle_special_sections_start(tag): - return - - # Apply styles for this element - style = self.style_manager.apply_style_to_element(tag, attrs_dict) - self.style_manager.push_style(style) - - # Delegate to appropriate handler - self._delegate_start_tag(tag, attrs_dict) - - def handle_endtag(self, tag: str): - """Handle the end of an HTML tag.""" - tag = tag.lower() - - # Handle special section markers - if self._handle_special_sections_end(tag): - return - - # Skip content in head, script, style - if self._in_head or self._in_script or self._in_style: - return - - # Flush any accumulated text - self.text_processor.flush_text() - - # Delegate to appropriate handler - self._delegate_end_tag(tag) - - # Pop style regardless of tag - self.style_manager.pop_style() - - def handle_data(self, data: str): - """Handle text data.""" - if self._in_head or self._in_script or self._in_style: - return - - self.text_processor.add_text(data) - - def handle_entityref(self, name: str): - """Handle an HTML entity reference.""" - if self._in_head or self._in_script or self._in_style: - return - - self.text_processor.add_entity_reference(name) - - def handle_charref(self, name: str): - """Handle a character reference.""" - if self._in_head or self._in_script or self._in_style: - return - - self.text_processor.add_character_reference(name) - - def _should_skip_content(self, tag: str) -> bool: - """Check if we should skip content based on current state.""" - if self._in_head or self._in_script or self._in_style: - if tag in ('head', 'script', 'style'): - return False # Let special section handlers deal with these - if tag != 'body': - return True - return False - - def _handle_special_sections_start(self, tag: str) -> bool: - """Handle special section start tags. 
Returns True if handled.""" - if tag == 'head': - self._in_head = True - return True - elif tag == 'body': - self._in_head = False - return True - elif tag == 'script': - self._in_script = True - return True - elif tag == 'style': - self._in_style = True - return True - return False - - def _handle_special_sections_end(self, tag: str) -> bool: - """Handle special section end tags. Returns True if handled.""" - if tag == 'head': - self._in_head = False - self.style_manager.pop_style() - return True - elif tag == 'script': - self._in_script = False - self.style_manager.pop_style() - return True - elif tag == 'style': - self._in_style = False - self.style_manager.pop_style() - return True - return False - - def _delegate_start_tag(self, tag: str, attrs: Dict[str, str]): - """Delegate start tag handling to appropriate handler.""" - # Block elements - if tag == 'p': - self.block_handler.handle_paragraph_start(self._document) - elif tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'): - self.block_handler.handle_heading_start(tag, self._document) - elif tag == 'div': - self.block_handler.handle_div_start(self._document) - elif tag == 'blockquote': - self.block_handler.handle_blockquote_start(self._document) - elif tag == 'pre': - self.block_handler.handle_pre_start(self._document) - elif tag == 'code': - self.block_handler.handle_code_start(attrs, self._document) - - # List elements - elif tag in ('ul', 'ol', 'dl'): - self.list_handler.handle_list_start(tag, self.block_handler, self._document) - elif tag == 'li': - self.list_handler.handle_list_item_start(self.block_handler) - elif tag in ('dt', 'dd'): - self.list_handler.handle_definition_start(tag, self.block_handler) - - # Table elements - elif tag == 'table': - self.table_handler.handle_table_start(attrs, self.block_handler, self._document) - elif tag in ('thead', 'tbody', 'tfoot'): - self.table_handler.handle_table_section_start(tag) - elif tag == 'tr': - self.table_handler.handle_table_row_start() - elif tag in ('td', 
'th'): - self.table_handler.handle_table_cell_start(tag, attrs, self.block_handler) - - # Inline elements - elif tag == 'a': - self.inline_handler.handle_link_start(attrs) - elif tag == 'img': - self.inline_handler.handle_image(attrs, self.block_handler, self._document) - elif tag == 'br': - self.inline_handler.handle_line_break(self.block_handler) - elif tag == 'hr': - self.inline_handler.handle_horizontal_rule(self.block_handler, self._document) - - # Style-only elements (no special handling needed, just styling) - elif tag in ('b', 'strong', 'i', 'em', 'u', 'span'): - pass # Styles are already applied by style manager - - def _delegate_end_tag(self, tag: str): - """Delegate end tag handling to appropriate handler.""" - # Block elements - if tag in ('p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'blockquote', 'pre', 'code'): - self.block_handler.handle_block_end() - - # List elements - elif tag in ('ul', 'ol', 'dl'): - self.list_handler.handle_list_end(self.block_handler) - elif tag in ('li', 'dt', 'dd'): - self.list_handler.handle_list_item_end(self.block_handler) - - # Table elements - elif tag == 'table': - self.table_handler.handle_table_end(self.block_handler) - elif tag in ('thead', 'tbody', 'tfoot'): - self.table_handler.handle_table_section_end() - elif tag == 'tr': - self.table_handler.handle_table_row_end() - elif tag in ('td', 'th'): - self.table_handler.handle_table_cell_end(self.block_handler) - - # Inline elements - elif tag == 'a': - self.inline_handler.handle_link_end() - - # Style-only elements (no special handling needed) - elif tag in ('b', 'strong', 'i', 'em', 'u', 'span'): - pass # Styles are handled by style manager diff --git a/pyWebLayout/io/readers/html_elements.py b/pyWebLayout/io/readers/html_elements.py deleted file mode 100644 index 57070d0..0000000 --- a/pyWebLayout/io/readers/html_elements.py +++ /dev/null @@ -1,473 +0,0 @@ -""" -HTML element handlers for pyWebLayout. 
- -This module provides specialized handlers for different types of HTML elements, -using composition and delegation to handle specific element types. -""" - -from typing import Dict, List, Optional, Any -import urllib.parse -from pyWebLayout.abstract.document import Document -from pyWebLayout.abstract.block import ( - Block, Paragraph, Heading, HeadingLevel, Quote, CodeBlock, - HList, ListStyle, ListItem, Table, TableRow, TableCell, - HorizontalRule, Image -) -from pyWebLayout.abstract.inline import LineBreak -from pyWebLayout.abstract.functional import Link, LinkType -from pyWebLayout.io.readers.html_style import HTMLStyleManager -from pyWebLayout.io.readers.html_text import HTMLTextProcessor - - -class BlockElementHandler: - """Handles block-level HTML elements like paragraphs, headings, divs.""" - - def __init__(self, style_manager: HTMLStyleManager, text_processor: HTMLTextProcessor): - self.style_manager = style_manager - self.text_processor = text_processor - self.block_stack: List[Block] = [] - self.current_block: Optional[Block] = None - self.current_paragraph: Optional[Paragraph] = None - - def reset(self): - """Reset the handler state.""" - self.block_stack = [] - self.current_block = None - self.current_paragraph = None - - def add_block_to_document_or_parent(self, block: Block, document: Document): - """Add a block to the document or current parent block.""" - if self.current_block and hasattr(self.current_block, 'add_block'): - self.current_block.add_block(block) - else: - document.add_block(block) - - def handle_paragraph_start(self, document: Document): - """Handle the start of a paragraph element.""" - self.text_processor.flush_text() - paragraph = Paragraph() - - self.add_block_to_document_or_parent(paragraph, document) - self.block_stack.append(paragraph) - self.current_block = paragraph - self.current_paragraph = paragraph - self.text_processor.set_current_paragraph(paragraph) - - def handle_heading_start(self, tag: str, document: Document): - 
"""Handle the start of a heading element.""" - self.text_processor.flush_text() - - level_map = { - 'h1': HeadingLevel.H1, 'h2': HeadingLevel.H2, 'h3': HeadingLevel.H3, - 'h4': HeadingLevel.H4, 'h5': HeadingLevel.H5, 'h6': HeadingLevel.H6 - } - - heading = Heading(level=level_map[tag]) - self.add_block_to_document_or_parent(heading, document) - self.block_stack.append(heading) - self.current_block = heading - self.current_paragraph = heading # Heading inherits from Paragraph - self.text_processor.set_current_paragraph(heading) - - def handle_div_start(self, document: Document): - """Handle the start of a div element.""" - self.text_processor.flush_text() - div_para = Paragraph() - - self.add_block_to_document_or_parent(div_para, document) - self.block_stack.append(div_para) - self.current_block = div_para - self.current_paragraph = div_para - self.text_processor.set_current_paragraph(div_para) - - def handle_blockquote_start(self, document: Document): - """Handle the start of a blockquote element.""" - self.text_processor.flush_text() - quote = Quote() - - self.add_block_to_document_or_parent(quote, document) - self.block_stack.append(quote) - self.current_block = quote - self.current_paragraph = None - self.text_processor.set_current_paragraph(None) - - def handle_pre_start(self, document: Document): - """Handle the start of a pre element.""" - self.text_processor.flush_text() - pre_para = Paragraph() - - self.add_block_to_document_or_parent(pre_para, document) - self.block_stack.append(pre_para) - self.current_block = pre_para - self.current_paragraph = pre_para - self.text_processor.set_current_paragraph(pre_para) - - def handle_code_start(self, attrs: Dict[str, str], document: Document): - """Handle the start of a code element.""" - # If we're inside a pre, replace the paragraph with a code block - if self.block_stack and isinstance(self.block_stack[-1], Paragraph): - pre_para = self.block_stack.pop() - - # Get the language from class if specified - language = 
"" - if 'class' in attrs: - class_attr = attrs['class'] - if class_attr.startswith('language-'): - language = class_attr[9:] - - code_block = CodeBlock(language=language) - - # Replace the paragraph with the code block in its parent - if pre_para.parent: - parent = pre_para.parent - if hasattr(parent, '_blocks'): - for i, block in enumerate(parent._blocks): - if block == pre_para: - parent._blocks[i] = code_block - code_block.parent = parent - break - else: - # Replace in document blocks - for i, block in enumerate(document.blocks): - if block == pre_para: - document.blocks[i] = code_block - break - - self.block_stack.append(code_block) - self.current_block = code_block - self.current_paragraph = None - self.text_processor.set_current_paragraph(None) - - def handle_block_end(self): - """Handle the end of a block element.""" - if self.block_stack: - self.block_stack.pop() - - if self.block_stack: - self.current_block = self.block_stack[-1] - # Update current paragraph based on block type - if isinstance(self.current_block, Paragraph): - self.current_paragraph = self.current_block - else: - self.current_paragraph = None - else: - self.current_block = None - self.current_paragraph = None - - self.text_processor.set_current_paragraph(self.current_paragraph) - - -class ListElementHandler: - """Handles list-related HTML elements (ul, ol, dl, li, dt, dd).""" - - def __init__(self, text_processor: HTMLTextProcessor): - self.text_processor = text_processor - self.list_stack: List[HList] = [] - - def reset(self): - """Reset the handler state.""" - self.list_stack = [] - - def handle_list_start(self, tag: str, block_handler: BlockElementHandler, document: Document): - """Handle the start of a list element.""" - self.text_processor.flush_text() - - style_map = { - 'ul': ListStyle.UNORDERED, - 'ol': ListStyle.ORDERED, - 'dl': ListStyle.DEFINITION - } - - list_block = HList(style=style_map[tag]) - block_handler.add_block_to_document_or_parent(list_block, document) - - 
block_handler.block_stack.append(list_block) - self.list_stack.append(list_block) - block_handler.current_block = list_block - block_handler.current_paragraph = None - self.text_processor.set_current_paragraph(None) - - def handle_list_item_start(self, block_handler: BlockElementHandler): - """Handle the start of a list item.""" - if not self.list_stack: - return - - self.text_processor.flush_text() - list_item = ListItem() - - current_list = self.list_stack[-1] - current_list.add_item(list_item) - - block_handler.block_stack.append(list_item) - block_handler.current_block = list_item - - # Create a paragraph for the list item content - item_para = Paragraph() - list_item.add_block(item_para) - block_handler.current_paragraph = item_para - self.text_processor.set_current_paragraph(item_para) - - def handle_definition_start(self, tag: str, block_handler: BlockElementHandler): - """Handle the start of definition terms or descriptions.""" - if not self.list_stack or self.list_stack[-1].style != ListStyle.DEFINITION: - return - - self.text_processor.flush_text() - current_list = self.list_stack[-1] - - if tag == 'dt': - list_item = ListItem(term="") - current_list.add_item(list_item) - block_handler.block_stack.append(list_item) - block_handler.current_block = list_item - - term_para = Paragraph() - list_item.add_block(term_para) - block_handler.current_paragraph = term_para - self.text_processor.set_current_paragraph(term_para) - - elif tag == 'dd': - if current_list._items: - list_item = current_list._items[-1] - desc_para = Paragraph() - list_item.add_block(desc_para) - block_handler.current_paragraph = desc_para - self.text_processor.set_current_paragraph(desc_para) - - def handle_list_end(self, block_handler: BlockElementHandler): - """Handle the end of a list.""" - if block_handler.block_stack: - block_handler.block_stack.pop() - if self.list_stack: - self.list_stack.pop() - - if block_handler.block_stack: - block_handler.current_block = 
block_handler.block_stack[-1] - else: - block_handler.current_block = None - - block_handler.current_paragraph = None - self.text_processor.set_current_paragraph(None) - - def handle_list_item_end(self, block_handler: BlockElementHandler): - """Handle the end of a list item.""" - if block_handler.block_stack: - block_handler.block_stack.pop() - - if block_handler.block_stack: - block_handler.current_block = block_handler.block_stack[-1] - else: - block_handler.current_block = None - - block_handler.current_paragraph = None - self.text_processor.set_current_paragraph(None) - - -class TableElementHandler: - """Handles table-related HTML elements (table, tr, td, th, thead, tbody, tfoot).""" - - def __init__(self, text_processor: HTMLTextProcessor): - self.text_processor = text_processor - self.table_stack: List[Table] = [] - self.current_table_row: Optional[TableRow] = None - self.current_table_section = "body" - - def reset(self): - """Reset the handler state.""" - self.table_stack = [] - self.current_table_row = None - self.current_table_section = "body" - - def handle_table_start(self, attrs: Dict[str, str], block_handler: BlockElementHandler, document: Document): - """Handle the start of a table element.""" - self.text_processor.flush_text() - - caption = attrs.get('summary') - table = Table(caption=caption) - - block_handler.add_block_to_document_or_parent(table, document) - block_handler.block_stack.append(table) - self.table_stack.append(table) - block_handler.current_block = table - block_handler.current_paragraph = None - self.text_processor.set_current_paragraph(None) - - def handle_table_section_start(self, tag: str): - """Handle the start of a table section.""" - self.current_table_section = tag - - def handle_table_row_start(self): - """Handle the start of a table row.""" - if not self.table_stack: - return - - self.text_processor.flush_text() - row = TableRow() - - current_table = self.table_stack[-1] - section = self.current_table_section - - if section 
== 'thead': - section = "header" - elif section == 'tfoot': - section = "footer" - else: - section = "body" - - current_table.add_row(row, section=section) - self.current_table_row = row - - def handle_table_cell_start(self, tag: str, attrs: Dict[str, str], block_handler: BlockElementHandler): - """Handle the start of a table cell.""" - if not self.current_table_row: - return - - self.text_processor.flush_text() - - # Parse attributes - try: - colspan = int(attrs.get('colspan', 1)) - rowspan = int(attrs.get('rowspan', 1)) - except ValueError: - colspan, rowspan = 1, 1 - - is_header = (tag == 'th') - - cell = TableCell(is_header=is_header, colspan=colspan, rowspan=rowspan) - self.current_table_row.add_cell(cell) - - block_handler.block_stack.append(cell) - block_handler.current_block = cell - - # Create a paragraph for the cell content - cell_para = Paragraph() - cell.add_block(cell_para) - block_handler.current_paragraph = cell_para - self.text_processor.set_current_paragraph(cell_para) - - def handle_table_end(self, block_handler: BlockElementHandler): - """Handle the end of a table.""" - if block_handler.block_stack: - block_handler.block_stack.pop() - if self.table_stack: - self.table_stack.pop() - - if block_handler.block_stack: - block_handler.current_block = block_handler.block_stack[-1] - else: - block_handler.current_block = None - - block_handler.current_paragraph = None - self.text_processor.set_current_paragraph(None) - self.current_table_row = None - self.current_table_section = "body" - - def handle_table_section_end(self): - """Handle the end of a table section.""" - self.current_table_section = "body" - - def handle_table_row_end(self): - """Handle the end of a table row.""" - self.current_table_row = None - - def handle_table_cell_end(self, block_handler: BlockElementHandler): - """Handle the end of a table cell.""" - if block_handler.block_stack: - block_handler.block_stack.pop() - - if block_handler.block_stack: - block_handler.current_block = 
block_handler.block_stack[-1] - else: - block_handler.current_block = None - - block_handler.current_paragraph = None - self.text_processor.set_current_paragraph(None) - - -class InlineElementHandler: - """Handles inline and special HTML elements (a, img, br, hr).""" - - def __init__(self, text_processor: HTMLTextProcessor, base_url: Optional[str] = None): - self.text_processor = text_processor - self.base_url = base_url - self.in_link = False - self.current_link: Optional[Link] = None - - def reset(self): - """Reset the handler state.""" - self.in_link = False - self.current_link = None - - def set_base_url(self, base_url: Optional[str]): - """Set the base URL for resolving relative links.""" - self.base_url = base_url - - def handle_link_start(self, attrs: Dict[str, str]): - """Handle the start of a link element.""" - self.text_processor.flush_text() - - href = attrs.get('href', '') - title = attrs.get('title', '') - - # Determine link type - link_type = LinkType.INTERNAL - if href.startswith('http://') or href.startswith('https://'): - link_type = LinkType.EXTERNAL - elif href.startswith('javascript:'): - link_type = LinkType.FUNCTION - elif href.startswith('api:'): - link_type = LinkType.API - href = href[4:] - - # Resolve relative URLs - if self.base_url and not href.startswith(('http://', 'https://', 'javascript:', 'api:', '#')): - href = urllib.parse.urljoin(self.base_url, href) - - self.current_link = Link( - location=href, - link_type=link_type, - title=title if title else None - ) - - self.in_link = True - - def handle_link_end(self): - """Handle the end of a link element.""" - self.in_link = False - self.current_link = None - - def handle_image(self, attrs: Dict[str, str], block_handler: BlockElementHandler, document: Document): - """Handle an image element.""" - src = attrs.get('src', '') - alt = attrs.get('alt', '') - - # Parse dimensions - width = height = None - try: - if 'width' in attrs: - width = int(attrs['width']) - if 'height' in attrs: - 
height = int(attrs['height']) - except ValueError: - pass - - # Resolve relative URLs - if self.base_url and not src.startswith(('http://', 'https://')): - src = urllib.parse.urljoin(self.base_url, src) - - image = Image(source=src, alt_text=alt, width=width, height=height) - block_handler.add_block_to_document_or_parent(image, document) - - def handle_line_break(self, block_handler: BlockElementHandler): - """Handle a line break element.""" - if block_handler.current_paragraph: - line_break = LineBreak() - if hasattr(block_handler.current_paragraph, 'add_block'): - block_handler.current_paragraph.add_block(line_break) - self.text_processor.flush_text() - - def handle_horizontal_rule(self, block_handler: BlockElementHandler, document: Document): - """Handle a horizontal rule element.""" - self.text_processor.flush_text() - hr = HorizontalRule() - block_handler.add_block_to_document_or_parent(hr, document) diff --git a/pyWebLayout/io/readers/html_extraction.py b/pyWebLayout/io/readers/html_extraction.py index 6e359a3..73c2df2 100644 --- a/pyWebLayout/io/readers/html_extraction.py +++ b/pyWebLayout/io/readers/html_extraction.py @@ -12,7 +12,8 @@ from bs4 import BeautifulSoup, Tag, NavigableString from pyWebLayout.abstract.inline import Word, FormattedSpan from pyWebLayout.abstract.block import ( Block, Paragraph, Heading, HeadingLevel, Quote, CodeBlock, - HList, ListItem, ListStyle, Table, TableRow, TableCell + HList, ListItem, ListStyle, Table, TableRow, TableCell, + HorizontalRule, Image ) from pyWebLayout.style import Font, FontWeight, FontStyle, TextDecoration @@ -576,11 +577,9 @@ def table_header_cell_handler(element: Tag, context: StyleContext) -> TableCell: return cell -def horizontal_rule_handler(element: Tag, context: StyleContext) -> Block: +def horizontal_rule_handler(element: Tag, context: StyleContext) -> HorizontalRule: """Handle
<hr> elements.""" - # TODO: Create a specific HorizontalRule block type - # For now, return an empty paragraph - return Paragraph(context.font) + return HorizontalRule() def line_break_handler(element: Tag, context: StyleContext) -> None: @@ -589,18 +588,22 @@ def line_break_handler(element: Tag, context: StyleContext) -> None: return None -def image_handler(element: Tag, context: StyleContext) -> Block: +def image_handler(element: Tag, context: StyleContext) -> Image: """Handle <img> elements.""" - # TODO: Create Image block type - # For now, return empty paragraph with alt text if available - paragraph = Paragraph(context.font) + src = context.element_attributes.get('src', '') alt_text = context.element_attributes.get('alt', '') - if alt_text: - words = alt_text.split() - for word_text in words: - if word_text: - paragraph.add_word(Word(word_text, context.font)) - return paragraph + + # Parse dimensions if provided + width = height = None + try: + if 'width' in context.element_attributes: + width = int(context.element_attributes['width']) + if 'height' in context.element_attributes: + height = int(context.element_attributes['height']) + except ValueError: + pass + + return Image(source=src, alt_text=alt_text, width=width, height=height) def ignore_handler(element: Tag, context: StyleContext) -> None: diff --git a/pyWebLayout/io/readers/html_style.py b/pyWebLayout/io/readers/html_style.py deleted file mode 100644 index 19f7d49..0000000 --- a/pyWebLayout/io/readers/html_style.py +++ /dev/null @@ -1,281 +0,0 @@ -""" -HTML style management for pyWebLayout. - -This module provides specialized functionality for handling CSS styles, -style stacks, and style parsing in HTML documents. -""" - -from typing import Dict, List, Any, Optional, Tuple -import re -from pyWebLayout.style import Font, FontStyle, FontWeight, TextDecoration - - -class HTMLStyleManager: - """ - Manages CSS styles and style stacks during HTML parsing. 
- - This class handles style parsing, style inheritance, and maintains - the style stack for proper style nesting. - """ - - def __init__(self): - """Initialize the style manager.""" - self._style_stack: List[Dict[str, Any]] = [] - self._current_style = self._get_default_style() - - def _get_default_style(self) -> Dict[str, Any]: - """Get the default style settings.""" - return { - 'font_size': 12, - 'font_weight': FontWeight.NORMAL, - 'font_style': FontStyle.NORMAL, - 'decoration': TextDecoration.NONE, - 'color': (0, 0, 0), - 'background': None, - 'language': 'en_US' - } - - def reset(self): - """Reset the style manager to initial state.""" - self._style_stack = [] - self._current_style = self._get_default_style() - - def push_style(self, style: Dict[str, Any]): - """ - Push a new style onto the style stack. - - Args: - style: The style to push - """ - # Save the current style - self._style_stack.append(self._current_style.copy()) - - # Apply the new style - for key, value in style.items(): - self._current_style[key] = value - - def pop_style(self): - """Pop a style from the style stack.""" - if self._style_stack: - self._current_style = self._style_stack.pop() - - def get_current_style(self) -> Dict[str, Any]: - """Get the current style.""" - return self._current_style.copy() - - def get_tag_style(self, tag: str) -> Dict[str, Any]: - """ - Get the default style for a tag. 
- - Args: - tag: The tag name - - Returns: - A dictionary of style properties - """ - tag_styles = { - 'h1': {'font_size': 24, 'font_weight': FontWeight.BOLD}, - 'h2': {'font_size': 20, 'font_weight': FontWeight.BOLD}, - 'h3': {'font_size': 18, 'font_weight': FontWeight.BOLD}, - 'h4': {'font_size': 16, 'font_weight': FontWeight.BOLD}, - 'h5': {'font_size': 14, 'font_weight': FontWeight.BOLD}, - 'h6': {'font_size': 12, 'font_weight': FontWeight.BOLD}, - 'b': {'font_weight': FontWeight.BOLD}, - 'strong': {'font_weight': FontWeight.BOLD}, - 'i': {'font_style': FontStyle.ITALIC}, - 'em': {'font_style': FontStyle.ITALIC}, - 'u': {'decoration': TextDecoration.UNDERLINE}, - 'a': {'decoration': TextDecoration.UNDERLINE, 'color': (0, 0, 255)}, - 'code': {'font_family': 'monospace', 'background': (240, 240, 240, 255)}, - 'pre': {'font_family': 'monospace'}, - } - - return tag_styles.get(tag, {}) - - def create_font(self) -> Font: - """ - Create a Font object from the current style. - - Returns: - Font: A font object with the current style settings - """ - return Font( - font_size=self._current_style['font_size'], - colour=self._current_style['color'], - weight=self._current_style['font_weight'], - style=self._current_style['font_style'], - decoration=self._current_style['decoration'], - background=self._current_style['background'], - langauge=self._current_style['language'] - ) - - def parse_inline_style(self, style_str: str) -> Dict[str, Any]: - """ - Parse inline CSS style string. 
- - Args: - style_str: CSS style string - - Returns: - Dictionary of style properties - """ - if not style_str: - return {} - - style_dict = {} - declarations = [d.strip() for d in style_str.split(';') if d.strip()] - - for declaration in declarations: - parts = declaration.split(':', 1) - if len(parts) != 2: - continue - - prop = parts[0].strip().lower() - value = parts[1].strip() - - # Handle specific properties - if prop == 'font-size': - if value.endswith('px'): - try: - size = int(value[:-2]) - style_dict['font_size'] = size - except ValueError: - pass - elif value.endswith('pt'): - try: - size = int(value[:-2]) - style_dict['font_size'] = size - except ValueError: - pass - elif prop == 'font-weight': - if value == 'bold': - style_dict['font_weight'] = FontWeight.BOLD - elif value == 'normal': - style_dict['font_weight'] = FontWeight.NORMAL - elif prop == 'font-style': - if value == 'italic': - style_dict['font_style'] = FontStyle.ITALIC - elif value == 'normal': - style_dict['font_style'] = FontStyle.NORMAL - elif prop == 'text-decoration': - if value == 'underline': - style_dict['decoration'] = TextDecoration.UNDERLINE - elif value == 'line-through': - style_dict['decoration'] = TextDecoration.STRIKETHROUGH - elif value == 'none': - style_dict['decoration'] = TextDecoration.NONE - elif prop == 'color': - color = self.parse_color(value) - if color: - style_dict['color'] = color - elif prop == 'background-color': - color = self.parse_color(value) - if color: - style_dict['background'] = color + (255,) - - return style_dict - - def parse_color(self, color_str: str) -> Optional[Tuple[int, int, int]]: - """ - Parse a CSS color string. 
- - Args: - color_str: CSS color string - - Returns: - RGB tuple or None if parsing fails - """ - # Named colors - color_map = { - 'black': (0, 0, 0), - 'white': (255, 255, 255), - 'red': (255, 0, 0), - 'green': (0, 128, 0), - 'blue': (0, 0, 255), - 'yellow': (255, 255, 0), - 'cyan': (0, 255, 255), - 'magenta': (255, 0, 255), - 'gray': (128, 128, 128), - 'grey': (128, 128, 128), - 'silver': (192, 192, 192), - 'maroon': (128, 0, 0), - 'olive': (128, 128, 0), - 'navy': (0, 0, 128), - 'purple': (128, 0, 128), - 'teal': (0, 128, 128), - 'lime': (0, 255, 0), - 'aqua': (0, 255, 255), - 'fuchsia': (255, 0, 255), - } - - # Check for named color - color_str = color_str.lower().strip() - if color_str in color_map: - return color_map[color_str] - - # Check for hex color - if color_str.startswith('#'): - try: - if len(color_str) == 4: # #RGB - r = int(color_str[1] + color_str[1], 16) - g = int(color_str[2] + color_str[2], 16) - b = int(color_str[3] + color_str[3], 16) - return (r, g, b) - elif len(color_str) == 7: # #RRGGBB - r = int(color_str[1:3], 16) - g = int(color_str[3:5], 16) - b = int(color_str[5:7], 16) - return (r, g, b) - except ValueError: - pass - - # Check for rgb() color - rgb_match = re.match(r'rgb\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\)', color_str) - if rgb_match: - try: - r_val = int(rgb_match.group(1)) - g_val = int(rgb_match.group(2)) - b_val = int(rgb_match.group(3)) - - # Check if values are in valid range (0-255) - if r_val > 255 or g_val > 255 or b_val > 255 or r_val < 0 or g_val < 0 or b_val < 0: - return None # Invalid color values - - return (r_val, g_val, b_val) - except ValueError: - pass - - # Check for rgba() color (ignore alpha) - rgba_match = re.match(r'rgba\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*[\d.]+\s*\)', color_str) - if rgba_match: - try: - r = min(255, max(0, int(rgba_match.group(1)))) - g = min(255, max(0, int(rgba_match.group(2)))) - b = min(255, max(0, int(rgba_match.group(3)))) - return (r, g, b) - except ValueError: - pass - - # 
Failed to parse color - return None - - def apply_style_to_element(self, tag: str, attrs: Dict[str, str]) -> Dict[str, Any]: - """ - Apply combined styles (tag defaults + inline styles) for an element. - - Args: - tag: The HTML tag name - attrs: Dictionary of tag attributes - - Returns: - Combined style dictionary - """ - # Start with tag-specific styles - style = self.get_tag_style(tag) - - # Override with inline styles if present - if 'style' in attrs: - inline_style = self.parse_inline_style(attrs['style']) - style.update(inline_style) - - return style diff --git a/pyWebLayout/io/readers/html_text.py b/pyWebLayout/io/readers/html_text.py deleted file mode 100644 index 7864148..0000000 --- a/pyWebLayout/io/readers/html_text.py +++ /dev/null @@ -1,163 +0,0 @@ -""" -HTML text processing for pyWebLayout. - -This module provides specialized functionality for handling text content, -entity references, and word creation in HTML documents. -""" - -from typing import Optional -from pyWebLayout.abstract.inline import Word -from pyWebLayout.abstract.block import Paragraph -from pyWebLayout.io.readers.html_style import HTMLStyleManager - - -class HTMLTextProcessor: - """ - Processes text content during HTML parsing. - - This class handles text buffering, entity resolution, and word creation - with proper styling applied. - """ - - def __init__(self, style_manager: HTMLStyleManager): - """ - Initialize the text processor. - - Args: - style_manager: The style manager for creating styled words - """ - self._style_manager = style_manager - self._text_buffer = "" - self._current_paragraph: Optional[Paragraph] = None - - def reset(self): - """Reset the text processor state.""" - self._text_buffer = "" - self._current_paragraph = None - - def set_current_paragraph(self, paragraph: Optional[Paragraph]): - """ - Set the current paragraph for text output. 
- - Args: - paragraph: The paragraph to receive text, or None - """ - self._current_paragraph = paragraph - - def add_text(self, text: str): - """ - Add text to the buffer. - - Args: - text: The text to add - """ - self._text_buffer += text - - def add_entity_reference(self, name: str): - """ - Add an HTML entity reference to the buffer. - - Args: - name: The entity name (e.g., 'lt', 'gt', 'amp') - """ - # Map common entity references to characters - entities = { - 'lt': '<', - 'gt': '>', - 'amp': '&', - 'quot': '"', - 'apos': "'", - 'nbsp': ' ', - 'copy': '©', - 'reg': '®', - 'trade': '™', - 'mdash': '—', - 'ndash': '–', - 'hellip': '…', - 'laquo': '«', - 'raquo': '»', - 'ldquo': '"', - 'rdquo': '"', - 'lsquo': ''', - 'rsquo': ''', - 'deg': '°', - 'plusmn': '±', - 'times': '×', - 'divide': '÷', - 'euro': '€', - 'pound': '£', - 'yen': '¥', - } - - char = entities.get(name, f'&{name};') - self._text_buffer += char - - def add_character_reference(self, name: str): - """ - Add a character reference to the buffer. - - Args: - name: The character reference (decimal or hex) - """ - try: - if name.startswith('x'): - # Hexadecimal reference - char = chr(int(name[1:], 16)) - else: - # Decimal reference - char = chr(int(name)) - self._text_buffer += char - except (ValueError, OverflowError): - # Invalid character reference - self._text_buffer += f'&#{name};' - - def flush_text(self) -> bool: - """ - Flush the text buffer, creating words as needed. 
- - Returns: - True if text was flushed, False if buffer was empty - """ - if not self._text_buffer or not self._current_paragraph: - self._text_buffer = "" - return False - - # Clean up the text - text = self._text_buffer.strip() - if not text: - self._text_buffer = "" - return False - - # Create words from the text - words = text.split() - for word_text in words: - if word_text: - font = self._style_manager.create_font() - word = Word(word_text, font) - self._current_paragraph.add_word(word) - - # Reset text buffer - self._text_buffer = "" - return True - - def has_pending_text(self) -> bool: - """ - Check if there is pending text in the buffer. - - Returns: - True if there is text waiting to be flushed - """ - return bool(self._text_buffer.strip()) - - def get_buffer_content(self) -> str: - """ - Get the current buffer content without flushing. - - Returns: - The current text buffer content - """ - return self._text_buffer - - def clear_buffer(self): - """Clear the text buffer without creating words.""" - self._text_buffer = "" diff --git a/pyWebLayout/style/fonts.py b/pyWebLayout/style/fonts.py index 936ed62..42b07b7 100644 --- a/pyWebLayout/style/fonts.py +++ b/pyWebLayout/style/fonts.py @@ -34,7 +34,7 @@ class Font: style: FontStyle = FontStyle.NORMAL, decoration: TextDecoration = TextDecoration.NONE, background: Optional[Tuple[int, int, int, int]] = None, - langauge = "en_EN"): + language = "en_EN"): """ Initialize a Font object with the specified properties. @@ -46,6 +46,7 @@ class Font: style: Font style (normal or italic). decoration: Text decoration (none, underline, or strikethrough). background: RGBA background color for the text. If None, transparent background. + language: Language code for hyphenation and text processing. 
""" self._font_path = font_path self._font_size = font_size @@ -54,7 +55,7 @@ class Font: self._style = style self._decoration = decoration self._background = background if background else (255, 255, 255, 0) - self.language = langauge + self.language = language # Load the font file or use default self._load_font() diff --git a/tests/test_html_content.py b/tests/test_html_content.py deleted file mode 100644 index 7376c4c..0000000 --- a/tests/test_html_content.py +++ /dev/null @@ -1,354 +0,0 @@ -""" -Unit tests for HTML content reading. - -Tests the HTMLContentReader class for parsing complete HTML documents. -This is more of an integration test covering the entire parsing pipeline. -""" - -import unittest -from pyWebLayout.io.readers.html_content import HTMLContentReader -from pyWebLayout.abstract.document import Document -from pyWebLayout.abstract.block import ( - Paragraph, Heading, HeadingLevel, HList, ListStyle, - Table, Quote, CodeBlock, HorizontalRule -) -from pyWebLayout.abstract.inline import LineBreak - -class TestHTMLContentReader(unittest.TestCase): - """Test cases for HTMLContentReader.""" - - def setUp(self): - """Set up test fixtures.""" - self.reader = HTMLContentReader() - self.document = Document() - - def test_simple_paragraph(self): - """Test parsing a simple paragraph.""" - html = '

Hello world!

' - - result = self.reader.extract_content(html, self.document) - - self.assertEqual(len(self.document.blocks), 1) - self.assertIsInstance(self.document.blocks[0], Paragraph) - - paragraph = self.document.blocks[0] - words = list(paragraph.words()) - self.assertEqual(len(words), 2) - self.assertEqual(words[0][1].text, "Hello") - self.assertEqual(words[1][1].text, "world!") - - def test_headings(self): - """Test parsing different heading levels.""" - html = ''' -

Heading 1

-

Heading 2

-

Heading 3

-
Heading 6
- ''' - - self.reader.extract_content(html, self.document) - - # Should have 4 heading blocks - headings = [block for block in self.document.blocks if isinstance(block, Heading)] - self.assertEqual(len(headings), 4) - - # Check heading levels - self.assertEqual(headings[0].level, HeadingLevel.H1) - self.assertEqual(headings[1].level, HeadingLevel.H2) - self.assertEqual(headings[2].level, HeadingLevel.H3) - self.assertEqual(headings[3].level, HeadingLevel.H6) - - # Check text content - h1_words = list(headings[0].words()) - self.assertEqual(len(h1_words), 2) - self.assertEqual(h1_words[0][1].text, "Heading") - self.assertEqual(h1_words[1][1].text, "1") - - def test_styled_text(self): - """Test parsing text with inline styling.""" - html = '

This is bold and italic text.

' - - self.reader.extract_content(html, self.document) - - self.assertEqual(len(self.document.blocks), 1) - paragraph = self.document.blocks[0] - words = list(paragraph.words()) - - # Should have words: "This", "is", "bold", "and", "italic", "text." - self.assertEqual(len(words), 6) - - # The styling information is embedded in the Font objects - # We can't easily test the exact styling without more complex setup - # but we can verify the words are created correctly - word_texts = [word[1].text for word in words] - self.assertEqual(word_texts, ["This", "is", "bold", "and", "italic", "text."]) - - def test_unordered_list(self): - """Test parsing unordered lists.""" - html = ''' - - ''' - - self.reader.extract_content(html, self.document) - - self.assertEqual(len(self.document.blocks), 1) - self.assertIsInstance(self.document.blocks[0], HList) - - list_block = self.document.blocks[0] - self.assertEqual(list_block.style, ListStyle.UNORDERED) - - items = list(list_block.items()) - self.assertEqual(len(items), 3) - - # Check first item content - first_item_blocks = list(items[0].blocks()) - self.assertEqual(len(first_item_blocks), 1) - self.assertIsInstance(first_item_blocks[0], Paragraph) - - def test_ordered_list(self): - """Test parsing ordered lists.""" - html = ''' -
    -
  1. First step
  2. -
  3. Second step
  4. -
- ''' - - self.reader.extract_content(html, self.document) - - self.assertEqual(len(self.document.blocks), 1) - list_block = self.document.blocks[0] - self.assertEqual(list_block.style, ListStyle.ORDERED) - - items = list(list_block.items()) - self.assertEqual(len(items), 2) - - def test_definition_list(self): - """Test parsing definition lists.""" - html = ''' -
-
Term 1
-
Definition 1
-
Term 2
-
Definition 2
-
- ''' - - self.reader.extract_content(html, self.document) - - self.assertEqual(len(self.document.blocks), 1) - list_block = self.document.blocks[0] - self.assertEqual(list_block.style, ListStyle.DEFINITION) - - items = list(list_block.items()) - self.assertEqual(len(items), 2) # Two dt/dd pairs - - def test_table(self): - """Test parsing simple tables.""" - html = ''' - - - - - - - - - -
Header 1Header 2
Cell 1Cell 2
- ''' - - self.reader.extract_content(html, self.document) - - self.assertEqual(len(self.document.blocks), 1) - self.assertIsInstance(self.document.blocks[0], Table) - - table = self.document.blocks[0] - - # Check body rows - body_rows = list(table.body_rows()) - self.assertEqual(len(body_rows), 2) # Header row + data row - - # Check first row (header) - first_row_cells = list(body_rows[0].cells()) - self.assertEqual(len(first_row_cells), 2) - self.assertTrue(first_row_cells[0].is_header) - self.assertTrue(first_row_cells[1].is_header) - - # Check second row (data) - second_row_cells = list(body_rows[1].cells()) - self.assertEqual(len(second_row_cells), 2) - self.assertFalse(second_row_cells[0].is_header) - self.assertFalse(second_row_cells[1].is_header) - - def test_blockquote(self): - """Test parsing blockquotes.""" - html = ''' -
-

This is a quoted paragraph.

-

Another quoted paragraph.

-
- ''' - - self.reader.extract_content(html, self.document) - - self.assertEqual(len(self.document.blocks), 1) - self.assertIsInstance(self.document.blocks[0], Quote) - - quote = self.document.blocks[0] - quote_blocks = list(quote.blocks()) - self.assertEqual(len(quote_blocks), 2) - self.assertIsInstance(quote_blocks[0], Paragraph) - self.assertIsInstance(quote_blocks[1], Paragraph) - - def test_code_block(self): - """Test parsing code blocks.""" - html = ''' -

-def hello():
-    print("Hello, world!")
-        
- ''' - - self.reader.extract_content(html, self.document) - - self.assertEqual(len(self.document.blocks), 1) - self.assertIsInstance(self.document.blocks[0], CodeBlock) - - code_block = self.document.blocks[0] - self.assertEqual(code_block.language, "python") - - def test_horizontal_rule(self): - """Test parsing horizontal rules.""" - html = '

Before


After

' - - self.reader.extract_content(html, self.document) - - self.assertEqual(len(self.document.blocks), 3) - self.assertIsInstance(self.document.blocks[0], Paragraph) - self.assertIsInstance(self.document.blocks[1], HorizontalRule) - self.assertIsInstance(self.document.blocks[2], Paragraph) - - def test_html_entities(self): - """Test handling HTML entities.""" - html = '

Less than: < Greater than: > Ampersand: &

' - - self.reader.extract_content(html, self.document) - - paragraph = self.document.blocks[0] - words = list(paragraph.words()) - - # Find the entity words - word_texts = [word[1].text for word in words] - self.assertIn('<', word_texts) - self.assertIn('>', word_texts) - self.assertIn('&', word_texts) - - def test_nested_elements(self): - """Test parsing nested HTML elements.""" - html = ''' -
-

Section Title

-

Section content with important text.

- -
- ''' - - self.reader.extract_content(html, self.document) - - # Should have multiple blocks - self.assertGreater(len(self.document.blocks), 1) - - # Check that we have different types of blocks - block_types = [type(block).__name__ for block in self.document.blocks] - self.assertIn('Paragraph', block_types) # From div - self.assertIn('Heading', block_types) - self.assertIn('HList', block_types) - - def test_empty_elements(self): - """Test handling empty HTML elements.""" - html = '

' - - self.reader.extract_content(html, self.document) - - # Empty elements should still create blocks - self.assertEqual(len(self.document.blocks), 3) - - def test_whitespace_handling(self): - """Test proper whitespace handling.""" - html = ''' -

Word1 Word2 - Word3

- ''' - - self.reader.extract_content(html, self.document) - - paragraph = self.document.blocks[0] - words = list(paragraph.words()) - - # Should normalize whitespace and create separate words - word_texts = [word[1].text for word in words] - self.assertEqual(word_texts, ["Word1", "Word2", "Word3"]) - - def test_base_url_setting(self): - """Test setting base URL for link resolution.""" - base_url = "https://example.com/path/" - self.reader.set_base_url(base_url) - - # The base URL should be passed to the inline handler - self.assertEqual(self.reader.inline_handler.base_url, base_url) - - def test_complex_document(self): - """Test parsing a complex HTML document.""" - html = ''' - - - - Test Document - - - -

Main Title

-

Introduction paragraph with emphasis.

- -

Section 1

-

Content with a link.

- - - -

Section 2

-
-

A quoted paragraph.

-
- - - - -
Col1Col2
AB
- - - ''' - - self.reader.extract_content(html, self.document) - - # Should have parsed multiple blocks - self.assertGreater(len(self.document.blocks), 5) - - # Should have different types of content - block_types = set(type(block).__name__ for block in self.document.blocks) - expected_types = {'Heading', 'Paragraph', 'HList', 'Quote', 'Table'} - self.assertTrue(expected_types.issubset(block_types)) - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/test_html_style.py b/tests/test_html_style.py index 9cfb203..c5fb09d 100644 --- a/tests/test_html_style.py +++ b/tests/test_html_style.py @@ -1,181 +1,181 @@ """ -Unit tests for HTML style management. +Unit tests for pyWebLayout style objects. -Tests the HTMLStyleManager class for CSS parsing, style stacks, and font creation. +Tests the Font class and style enums for proper functionality and immutability. """ import unittest -from pyWebLayout.io.readers.html_style import HTMLStyleManager -from pyWebLayout.style import FontStyle, FontWeight, TextDecoration +from pyWebLayout.style import Font, FontStyle, FontWeight, TextDecoration, Alignment -class TestHTMLStyleManager(unittest.TestCase): - """Test cases for HTMLStyleManager.""" +class TestStyleObjects(unittest.TestCase): + """Test cases for pyWebLayout style objects.""" - def setUp(self): - """Set up test fixtures.""" - self.style_manager = HTMLStyleManager() + def test_font_weight_enum(self): + """Test FontWeight enum values.""" + self.assertEqual(FontWeight.NORMAL.value, "normal") + self.assertEqual(FontWeight.BOLD.value, "bold") + + # Test that all expected values exist + weights = [FontWeight.NORMAL, FontWeight.BOLD] + self.assertEqual(len(weights), 2) - def test_initialization(self): - """Test proper initialization of style manager.""" - style = self.style_manager.get_current_style() + def test_font_style_enum(self): + """Test FontStyle enum values.""" + self.assertEqual(FontStyle.NORMAL.value, "normal") + self.assertEqual(FontStyle.ITALIC.value, 
"italic") - self.assertEqual(style['font_size'], 12) - self.assertEqual(style['font_weight'], FontWeight.NORMAL) - self.assertEqual(style['font_style'], FontStyle.NORMAL) - self.assertEqual(style['decoration'], TextDecoration.NONE) - self.assertEqual(style['color'], (0, 0, 0)) - self.assertIsNone(style['background']) - self.assertEqual(style['language'], 'en_US') + # Test that all expected values exist + styles = [FontStyle.NORMAL, FontStyle.ITALIC] + self.assertEqual(len(styles), 2) - def test_style_stack_operations(self): - """Test push and pop operations on style stack.""" - # Initial state - initial_style = self.style_manager.get_current_style() + def test_text_decoration_enum(self): + """Test TextDecoration enum values.""" + self.assertEqual(TextDecoration.NONE.value, "none") + self.assertEqual(TextDecoration.UNDERLINE.value, "underline") + self.assertEqual(TextDecoration.STRIKETHROUGH.value, "strikethrough") - # Push a new style - new_style = {'font_size': 16, 'font_weight': FontWeight.BOLD} - self.style_manager.push_style(new_style) - - current_style = self.style_manager.get_current_style() - self.assertEqual(current_style['font_size'], 16) - self.assertEqual(current_style['font_weight'], FontWeight.BOLD) - self.assertEqual(current_style['color'], (0, 0, 0)) # Unchanged - - # Pop the style - self.style_manager.pop_style() - restored_style = self.style_manager.get_current_style() - self.assertEqual(restored_style, initial_style) + # Test that all expected values exist + decorations = [TextDecoration.NONE, TextDecoration.UNDERLINE, TextDecoration.STRIKETHROUGH] + self.assertEqual(len(decorations), 3) - def test_tag_styles(self): - """Test default styles for HTML tags.""" - h1_style = self.style_manager.get_tag_style('h1') - self.assertEqual(h1_style['font_size'], 24) - self.assertEqual(h1_style['font_weight'], FontWeight.BOLD) - - h6_style = self.style_manager.get_tag_style('h6') - self.assertEqual(h6_style['font_size'], 12) - 
self.assertEqual(h6_style['font_weight'], FontWeight.BOLD) - - em_style = self.style_manager.get_tag_style('em') - self.assertEqual(em_style['font_style'], FontStyle.ITALIC) - - unknown_style = self.style_manager.get_tag_style('unknown') - self.assertEqual(unknown_style, {}) + def test_alignment_enum(self): + """Test Alignment enum values.""" + self.assertEqual(Alignment.LEFT.value, 1) + self.assertEqual(Alignment.CENTER.value, 2) + self.assertEqual(Alignment.RIGHT.value, 3) + self.assertEqual(Alignment.TOP.value, 4) + self.assertEqual(Alignment.BOTTOM.value, 5) + self.assertEqual(Alignment.JUSTIFY.value, 6) - def test_inline_style_parsing(self): - """Test parsing of inline CSS styles.""" - # Test font-size - style = self.style_manager.parse_inline_style('font-size: 18px') - self.assertEqual(style['font_size'], 18) + def test_font_initialization_defaults(self): + """Test Font initialization with default values.""" + font = Font() - style = self.style_manager.parse_inline_style('font-size: 14pt') - self.assertEqual(style['font_size'], 14) - - # Test font-weight - style = self.style_manager.parse_inline_style('font-weight: bold') - self.assertEqual(style['font_weight'], FontWeight.BOLD) - - # Test font-style - style = self.style_manager.parse_inline_style('font-style: italic') - self.assertEqual(style['font_style'], FontStyle.ITALIC) - - # Test text-decoration - style = self.style_manager.parse_inline_style('text-decoration: underline') - self.assertEqual(style['decoration'], TextDecoration.UNDERLINE) - - # Test multiple properties - style = self.style_manager.parse_inline_style( - 'font-size: 20px; font-weight: bold; color: red' + self.assertIsNone(font._font_path) + self.assertEqual(font.font_size, 12) + self.assertEqual(font.colour, (0, 0, 0)) + self.assertEqual(font.color, (0, 0, 0)) # Alias + self.assertEqual(font.weight, FontWeight.NORMAL) + self.assertEqual(font.style, FontStyle.NORMAL) + self.assertEqual(font.decoration, TextDecoration.NONE) + 
self.assertEqual(font.background, (255, 255, 255, 0)) # Transparent + self.assertEqual(font.language, "en_EN") + + def test_font_initialization_custom(self): + """Test Font initialization with custom values.""" + font = Font( + font_path="/path/to/font.ttf", + font_size=16, + colour=(255, 0, 0), + weight=FontWeight.BOLD, + style=FontStyle.ITALIC, + decoration=TextDecoration.UNDERLINE, + background=(255, 255, 0, 255), + langauge="fr_FR" ) - self.assertEqual(style['font_size'], 20) - self.assertEqual(style['font_weight'], FontWeight.BOLD) - self.assertEqual(style['color'], (255, 0, 0)) - - def test_color_parsing(self): - """Test CSS color parsing.""" - # Named colors - self.assertEqual(self.style_manager.parse_color('red'), (255, 0, 0)) - self.assertEqual(self.style_manager.parse_color('blue'), (0, 0, 255)) - self.assertEqual(self.style_manager.parse_color('white'), (255, 255, 255)) - self.assertEqual(self.style_manager.parse_color('gray'), (128, 128, 128)) - self.assertEqual(self.style_manager.parse_color('grey'), (128, 128, 128)) - - # Hex colors - self.assertEqual(self.style_manager.parse_color('#ff0000'), (255, 0, 0)) - self.assertEqual(self.style_manager.parse_color('#00ff00'), (0, 255, 0)) - self.assertEqual(self.style_manager.parse_color('#f00'), (255, 0, 0)) - self.assertEqual(self.style_manager.parse_color('#0f0'), (0, 255, 0)) - - # RGB colors - self.assertEqual(self.style_manager.parse_color('rgb(255, 0, 0)'), (255, 0, 0)) - self.assertEqual(self.style_manager.parse_color('rgb(128, 128, 128)'), (128, 128, 128)) - self.assertEqual(self.style_manager.parse_color('rgb( 255 , 255 , 255 )'), (255, 255, 255)) - - # RGBA colors (alpha ignored) - self.assertEqual(self.style_manager.parse_color('rgba(255, 0, 0, 0.5)'), (255, 0, 0)) - - # Invalid colors - self.assertIsNone(self.style_manager.parse_color('invalid')) - self.assertIsNone(self.style_manager.parse_color('#gg0000')) - self.assertIsNone(self.style_manager.parse_color('rgb(300, 0, 0)')) # Invalid values 
return None - - def test_color_clamping(self): - """Test that RGB values outside valid range return None.""" - # Values outside 0-255 range should return None - color = self.style_manager.parse_color('rgb(300, -10, 128)') - self.assertIsNone(color) # Invalid values return None - - def test_apply_style_to_element(self): - """Test combining tag styles with inline styles.""" - # Test h1 with inline style - attrs = {'style': 'color: blue; font-size: 30px'} - combined = self.style_manager.apply_style_to_element('h1', attrs) - - # Should have h1 defaults plus inline overrides - self.assertEqual(combined['font_size'], 30) # Overridden - self.assertEqual(combined['font_weight'], FontWeight.BOLD) # From h1 - self.assertEqual(combined['color'], (0, 0, 255)) # Inline - - # Test without inline styles - combined = self.style_manager.apply_style_to_element('strong', {}) - self.assertEqual(combined['font_weight'], FontWeight.BOLD) - - def test_reset(self): - """Test resetting the style manager.""" - # Change the state - self.style_manager.push_style({'font_size': 20}) - self.style_manager.push_style({'color': (255, 0, 0)}) - - # Reset - self.style_manager.reset() - - # Should be back to initial state - style = self.style_manager.get_current_style() - self.assertEqual(style['font_size'], 12) - self.assertEqual(style['color'], (0, 0, 0)) - self.assertEqual(len(self.style_manager._style_stack), 0) - - def test_font_creation(self): - """Test Font object creation from current style.""" - # Set some specific styles - self.style_manager.push_style({ - 'font_size': 16, - 'font_weight': FontWeight.BOLD, - 'font_style': FontStyle.ITALIC, - 'decoration': TextDecoration.UNDERLINE, - 'color': (255, 0, 0), - 'background': (255, 255, 0, 255) - }) - - font = self.style_manager.create_font() + self.assertEqual(font._font_path, "/path/to/font.ttf") self.assertEqual(font.font_size, 16) + self.assertEqual(font.colour, (255, 0, 0)) self.assertEqual(font.weight, FontWeight.BOLD) 
self.assertEqual(font.style, FontStyle.ITALIC) self.assertEqual(font.decoration, TextDecoration.UNDERLINE) - self.assertEqual(font.colour, (255, 0, 0)) self.assertEqual(font.background, (255, 255, 0, 255)) + self.assertEqual(font.language, "fr_FR") + + def test_font_with_methods(self): + """Test Font immutable modification methods.""" + original_font = Font( + font_size=12, + colour=(0, 0, 0), + weight=FontWeight.NORMAL, + style=FontStyle.NORMAL, + decoration=TextDecoration.NONE + ) + + # Test with_size + size_font = original_font.with_size(16) + self.assertEqual(size_font.font_size, 16) + self.assertEqual(original_font.font_size, 12) # Original unchanged + self.assertEqual(size_font.colour, (0, 0, 0)) # Other properties preserved + + # Test with_colour + color_font = original_font.with_colour((255, 0, 0)) + self.assertEqual(color_font.colour, (255, 0, 0)) + self.assertEqual(original_font.colour, (0, 0, 0)) # Original unchanged + self.assertEqual(color_font.font_size, 12) # Other properties preserved + + # Test with_weight + weight_font = original_font.with_weight(FontWeight.BOLD) + self.assertEqual(weight_font.weight, FontWeight.BOLD) + self.assertEqual(original_font.weight, FontWeight.NORMAL) # Original unchanged + + # Test with_style + style_font = original_font.with_style(FontStyle.ITALIC) + self.assertEqual(style_font.style, FontStyle.ITALIC) + self.assertEqual(original_font.style, FontStyle.NORMAL) # Original unchanged + + # Test with_decoration + decoration_font = original_font.with_decoration(TextDecoration.UNDERLINE) + self.assertEqual(decoration_font.decoration, TextDecoration.UNDERLINE) + self.assertEqual(original_font.decoration, TextDecoration.NONE) # Original unchanged + + def test_font_property_access(self): + """Test Font property access methods.""" + font = Font( + font_size=20, + colour=(128, 128, 128), + weight=FontWeight.BOLD, + style=FontStyle.ITALIC, + decoration=TextDecoration.STRIKETHROUGH + ) + + # Test all property getters + 
self.assertEqual(font.font_size, 20) + self.assertEqual(font.colour, (128, 128, 128)) + self.assertEqual(font.color, (128, 128, 128)) # Alias + self.assertEqual(font.weight, FontWeight.BOLD) + self.assertEqual(font.style, FontStyle.ITALIC) + self.assertEqual(font.decoration, TextDecoration.STRIKETHROUGH) + + # Test that font object is accessible + self.assertIsNotNone(font.font) + + def test_font_immutability(self): + """Test that Font objects behave immutably.""" + font1 = Font(font_size=12, colour=(0, 0, 0)) + font2 = font1.with_size(16) + font3 = font2.with_colour((255, 0, 0)) + + # Each should be different objects + self.assertIsNot(font1, font2) + self.assertIsNot(font2, font3) + self.assertIsNot(font1, font3) + + # Original properties should be unchanged + self.assertEqual(font1.font_size, 12) + self.assertEqual(font1.colour, (0, 0, 0)) + + self.assertEqual(font2.font_size, 16) + self.assertEqual(font2.colour, (0, 0, 0)) + + self.assertEqual(font3.font_size, 16) + self.assertEqual(font3.colour, (255, 0, 0)) + + def test_background_handling(self): + """Test background color handling.""" + # Test default transparent background + font1 = Font() + self.assertEqual(font1.background, (255, 255, 255, 0)) + + # Test explicit background + font2 = Font(background=(255, 0, 0, 128)) + self.assertEqual(font2.background, (255, 0, 0, 128)) + + # Test None background becomes transparent + font3 = Font(background=None) + self.assertEqual(font3.background, (255, 255, 255, 0)) if __name__ == '__main__': diff --git a/tests/test_html_text.py b/tests/test_html_text.py deleted file mode 100644 index cb4f49b..0000000 --- a/tests/test_html_text.py +++ /dev/null @@ -1,247 +0,0 @@ -""" -Unit tests for HTML text processing. - -Tests the HTMLTextProcessor class for text buffering, entity handling, and word creation. 
-""" - -import unittest -from unittest.mock import Mock, MagicMock -from pyWebLayout.io.readers.html_text import HTMLTextProcessor -from pyWebLayout.io.readers.html_style import HTMLStyleManager -from pyWebLayout.abstract.block import Paragraph -from pyWebLayout.abstract.inline import Word - - -class TestHTMLTextProcessor(unittest.TestCase): - """Test cases for HTMLTextProcessor.""" - - def setUp(self): - """Set up test fixtures.""" - self.style_manager = HTMLStyleManager() - self.text_processor = HTMLTextProcessor(self.style_manager) - - # Create a mock paragraph - self.mock_paragraph = Mock(spec=Paragraph) - self.mock_paragraph.add_word = Mock() - - def test_initialization(self): - """Test proper initialization of text processor.""" - self.assertEqual(self.text_processor._text_buffer, "") - self.assertIsNone(self.text_processor._current_paragraph) - self.assertEqual(self.text_processor._style_manager, self.style_manager) - - def test_add_text(self): - """Test adding text to buffer.""" - self.text_processor.add_text("Hello") - self.assertEqual(self.text_processor.get_buffer_content(), "Hello") - - self.text_processor.add_text(" World") - self.assertEqual(self.text_processor.get_buffer_content(), "Hello World") - - def test_entity_references(self): - """Test HTML entity reference handling.""" - test_cases = [ - ('lt', '<'), - ('gt', '>'), - ('amp', '&'), - ('quot', '"'), - ('apos', "'"), - ('nbsp', ' '), - ('copy', '©'), - ('reg', '®'), - ('trade', '™'), - ('mdash', '—'), - ('ndash', '–'), - ('hellip', '…'), - ('euro', '€'), - ('unknown', '&unknown;') # Unknown entities should be preserved - ] - - for entity, expected in test_cases: - with self.subTest(entity=entity): - self.text_processor.clear_buffer() - self.text_processor.add_entity_reference(entity) - self.assertEqual(self.text_processor.get_buffer_content(), expected) - - def test_character_references(self): - """Test character reference handling.""" - # Decimal character references - 
self.text_processor.clear_buffer() - self.text_processor.add_character_reference('65') # 'A' - self.assertEqual(self.text_processor.get_buffer_content(), 'A') - - # Hexadecimal character references - self.text_processor.clear_buffer() - self.text_processor.add_character_reference('x41') # 'A' - self.assertEqual(self.text_processor.get_buffer_content(), 'A') - - # Unicode character - self.text_processor.clear_buffer() - self.text_processor.add_character_reference('8364') # Euro symbol - self.assertEqual(self.text_processor.get_buffer_content(), '€') - - # Invalid character reference - self.text_processor.clear_buffer() - self.text_processor.add_character_reference('invalid') - self.assertEqual(self.text_processor.get_buffer_content(), '&#invalid;') - - # Out of range character - self.text_processor.clear_buffer() - self.text_processor.add_character_reference('99999999999') - self.assertTrue(self.text_processor.get_buffer_content().startswith('&#')) - - def test_buffer_operations(self): - """Test buffer state operations.""" - # Test has_pending_text - self.assertFalse(self.text_processor.has_pending_text()) - - self.text_processor.add_text("Some text") - self.assertTrue(self.text_processor.has_pending_text()) - - # Test clear_buffer - self.text_processor.clear_buffer() - self.assertFalse(self.text_processor.has_pending_text()) - self.assertEqual(self.text_processor.get_buffer_content(), "") - - # Test with whitespace only - self.text_processor.add_text(" \n\t ") - self.assertFalse(self.text_processor.has_pending_text()) # Should ignore whitespace - - def test_paragraph_management(self): - """Test current paragraph setting.""" - # Initially no paragraph - self.assertIsNone(self.text_processor._current_paragraph) - - # Set paragraph - self.text_processor.set_current_paragraph(self.mock_paragraph) - self.assertEqual(self.text_processor._current_paragraph, self.mock_paragraph) - - # Clear paragraph - self.text_processor.set_current_paragraph(None) - 
self.assertIsNone(self.text_processor._current_paragraph) - - def test_flush_text_with_paragraph(self): - """Test flushing text when paragraph is set.""" - self.text_processor.set_current_paragraph(self.mock_paragraph) - self.text_processor.add_text("Hello world test") - - # Mock the style manager to return a specific font - mock_font = Mock() - self.style_manager.create_font = Mock(return_value=mock_font) - - result = self.text_processor.flush_text() - - # Should return True (text was flushed) - self.assertTrue(result) - - # Should have created words - self.assertEqual(self.mock_paragraph.add_word.call_count, 3) # "Hello", "world", "test" - - # Verify the words were created with correct text - calls = self.mock_paragraph.add_word.call_args_list - word_texts = [call[0][0].text for call in calls] - self.assertEqual(word_texts, ["Hello", "world", "test"]) - - # Buffer should be empty after flush - self.assertEqual(self.text_processor.get_buffer_content(), "") - - def test_flush_text_without_paragraph(self): - """Test flushing text when no paragraph is set.""" - self.text_processor.add_text("Hello world") - - result = self.text_processor.flush_text() - - # Should return False (no paragraph to flush to) - self.assertFalse(result) - - # Buffer should be cleared anyway - self.assertEqual(self.text_processor.get_buffer_content(), "") - - def test_flush_empty_buffer(self): - """Test flushing when buffer is empty.""" - self.text_processor.set_current_paragraph(self.mock_paragraph) - - result = self.text_processor.flush_text() - - # Should return False (nothing to flush) - self.assertFalse(result) - - # No words should be added - self.mock_paragraph.add_word.assert_not_called() - - def test_flush_whitespace_only(self): - """Test flushing when buffer contains only whitespace.""" - self.text_processor.set_current_paragraph(self.mock_paragraph) - self.text_processor.add_text(" \n\t ") - - result = self.text_processor.flush_text() - - # Should return False (no meaningful 
content) - self.assertFalse(result) - - # No words should be added - self.mock_paragraph.add_word.assert_not_called() - - def test_word_creation_with_styling(self): - """Test that words are created with proper styling.""" - self.text_processor.set_current_paragraph(self.mock_paragraph) - self.text_processor.add_text("styled text") - - # Set up style manager to return specific font - mock_font = Mock() - mock_font.font_size = 16 - mock_font.weight = "bold" - self.style_manager.create_font = Mock(return_value=mock_font) - - self.text_processor.flush_text() - - # Verify font was created - self.style_manager.create_font.assert_called() - - # Verify words were created with the font - calls = self.mock_paragraph.add_word.call_args_list - for call in calls: - word = call[0][0] - self.assertEqual(word.style, mock_font) - - def test_reset(self): - """Test resetting the text processor.""" - # Set up some state - self.text_processor.set_current_paragraph(self.mock_paragraph) - self.text_processor.add_text("Some text") - - # Reset - self.text_processor.reset() - - # Should be back to initial state - self.assertEqual(self.text_processor._text_buffer, "") - self.assertIsNone(self.text_processor._current_paragraph) - - def test_complex_text_processing(self): - """Test processing text with mixed content.""" - self.text_processor.set_current_paragraph(self.mock_paragraph) - - # Mock font creation - mock_font = Mock() - self.style_manager.create_font = Mock(return_value=mock_font) - - # Add mixed content - self.text_processor.add_text("Hello ") - self.text_processor.add_entity_reference('amp') - self.text_processor.add_text(" world") - self.text_processor.add_character_reference('33') # '!' - - # Should have "Hello & world!" - expected_content = "Hello & world!" 
- self.assertEqual(self.text_processor.get_buffer_content(), expected_content) - - # Flush and verify words - self.text_processor.flush_text() - - calls = self.mock_paragraph.add_word.call_args_list - word_texts = [call[0][0].text for call in calls] - self.assertEqual(word_texts, ["Hello", "&", "world!"]) - - -if __name__ == '__main__': - unittest.main()