diff --git a/pyWebLayout/abstract/block.py b/pyWebLayout/abstract/block.py index c139fc5..8f568dd 100644 --- a/pyWebLayout/abstract/block.py +++ b/pyWebLayout/abstract/block.py @@ -1011,14 +1011,246 @@ class Table(Block): elif section.lower() == "footer": self._footer_rows.append(row) else: # Default to body - self._rows + self._rows.append(row) + + def create_row(self, section: str = "body", style=None) -> TableRow: + """ + Create a new table row and add it to this table. + + Args: + section: The section to add the row to ("header", "body", or "footer") + style: Optional style override. If None, inherits from table + + Returns: + The newly created TableRow object + """ + return TableRow.create_and_add_to(self, section, style) + + def header_rows(self) -> Iterator[TableRow]: + """ + Iterate over the header rows in this table. + + Yields: + Each TableRow in the header section + """ + for row in self._header_rows: + yield row + + def body_rows(self) -> Iterator[TableRow]: + """ + Iterate over the body rows in this table. + + Yields: + Each TableRow in the body section + """ + for row in self._rows: + yield row + + def footer_rows(self) -> Iterator[TableRow]: + """ + Iterate over the footer rows in this table. + + Yields: + Each TableRow in the footer section + """ + for row in self._footer_rows: + yield row + + def all_rows(self) -> Iterator[Tuple[str, TableRow]]: + """ + Iterate over all rows in this table with their section labels. + + Yields: + Tuples of (section, row) for each row in the table + """ + for row in self._header_rows: + yield ("header", row) + for row in self._rows: + yield ("body", row) + for row in self._footer_rows: + yield ("footer", row) + + @property + def row_count(self) -> Dict[str, int]: + """Get the row counts by section""" + return { + "header": len(self._header_rows), + "body": len(self._rows), + "footer": len(self._footer_rows), + "total": len(self._header_rows) + len(self._rows) + len(self._footer_rows) + } +class Image(Block): + """ + An image element with source, dimensions, and alternative text. + """ + + def __init__(self, source: str = "", alt_text: str = "", width: Optional[int] = None, height: Optional[int] = None): + """ + Initialize an image element. + + Args: + source: The image source URL or path + alt_text: Alternative text for accessibility + width: Optional image width in pixels + height: Optional image height in pixels + """ + super().__init__(BlockType.IMAGE) + self._source = source + self._alt_text = alt_text + self._width = width + self._height = height + + @classmethod + def create_and_add_to(cls, container, source: str = "", alt_text: str = "", + width: Optional[int] = None, height: Optional[int] = None) -> 'Image': + """ + Create a new Image and add it to a container. + + Args: + container: The container to add the image to (must have add_block method) + source: The image source URL or path + alt_text: Alternative text for accessibility + width: Optional image width in pixels + height: Optional image height in pixels + + Returns: + The newly created Image object + + Raises: + AttributeError: If the container doesn't have the required add_block method + """ + # Create the new image + image = cls(source, alt_text, width, height) + + # Add the image to the container + if hasattr(container, 'add_block'): + container.add_block(image) + else: + raise AttributeError(f"Container {type(container).__name__} must have an 'add_block' method") + + return image + + @property + def source(self) -> str: + """Get the image source""" + return self._source + + @source.setter + def source(self, source: str): + """Set the image source""" + self._source = source + + @property + def alt_text(self) -> str: + """Get the alternative text""" + return self._alt_text + + @alt_text.setter + def alt_text(self, alt_text: str): + """Set the alternative text""" + self._alt_text = alt_text + + @property + def width(self) -> Optional[int]: + """Get the image width""" + return self._width + + @width.setter + def width(self, width: Optional[int]): + """Set the image width""" + self._width = width + + @property + def height(self) -> Optional[int]: + """Get the image height""" + return self._height + + @height.setter + def height(self, height: Optional[int]): + """Set the image height""" + self._height = height + + def get_dimensions(self) -> Tuple[Optional[int], Optional[int]]: + """ + Get the image dimensions as a tuple. + + Returns: + Tuple of (width, height) + """ + return (self._width, self._height) + + def get_aspect_ratio(self) -> Optional[float]: + """ + Calculate the aspect ratio of the image. + + Returns: + The aspect ratio (width/height) or None if either dimension is missing + """ + if self._width is not None and self._height is not None and self._height > 0: + return self._width / self._height + return None + + def calculate_scaled_dimensions(self, max_width: Optional[int] = None, + max_height: Optional[int] = None) -> Tuple[Optional[int], Optional[int]]: + """ + Calculate scaled dimensions that fit within the given constraints. + + Args: + max_width: Maximum allowed width + max_height: Maximum allowed height + + Returns: + Tuple of (scaled_width, scaled_height) + """ + if self._width is None or self._height is None: + return (self._width, self._height) + + width, height = self._width, self._height + + # Scale down if needed + if max_width is not None and width > max_width: + height = int(height * max_width / width) + width = max_width + + if max_height is not None and height > max_height: + width = int(width * max_height / height) + height = max_height + + return (width, height) -class Image: - pass - -class HorizontalRule: - - pass \ No newline at end of file +class HorizontalRule(Block): + """ + A horizontal rule element (hr tag). + """ + + def __init__(self): + """Initialize a horizontal rule element.""" + super().__init__(BlockType.HORIZONTAL_RULE) + + @classmethod + def create_and_add_to(cls, container) -> 'HorizontalRule': + """ + Create a new HorizontalRule and add it to a container. + + Args: + container: The container to add the horizontal rule to (must have add_block method) + + Returns: + The newly created HorizontalRule object + + Raises: + AttributeError: If the container doesn't have the required add_block method + """ + # Create the new horizontal rule + hr = cls() + + # Add the horizontal rule to the container + if hasattr(container, 'add_block'): + container.add_block(hr) + else: + raise AttributeError(f"Container {type(container).__name__} must have an 'add_block' method") + + return hr diff --git a/pyWebLayout/abstract/functional.py b/pyWebLayout/abstract/functional.py index a8bf658..fa120f3 100644 --- a/pyWebLayout/abstract/functional.py +++ b/pyWebLayout/abstract/functional.py @@ -124,6 +124,11 @@ class Button(Interactable): """Enable or disable the button""" self._enabled = enabled + @property + def params(self) -> Dict[str, Any]: + """Get the button parameters""" + return self._params + def execute(self) -> Any: """ Execute the button's callback function if the button is enabled. diff --git a/pyWebLayout/abstract/inline.py b/pyWebLayout/abstract/inline.py index 7081612..e56b96a 100644 --- a/pyWebLayout/abstract/inline.py +++ b/pyWebLayout/abstract/inline.py @@ -2,6 +2,7 @@ from __future__ import annotations from pyWebLayout.base import Queriable from pyWebLayout.style import Font from typing import Tuple, Union, List, Optional, Dict +import pyphen class Word: @@ -157,9 +158,6 @@ class Word: Returns: bool: True if the word can be hyphenated, False otherwise. """ - # Only import pyphen when needed - import pyphen - # Use the provided language or fall back to style language lang = language if language else self._style.language dic = pyphen.Pyphen(lang=lang) @@ -178,9 +176,6 @@ class Word: Returns: bool: True if the word was hyphenated, False otherwise. """ - # Only import pyphen when needed - import pyphen - # Use the provided language or fall back to style language lang = language if language else self._style.language dic = pyphen.Pyphen(lang=lang) @@ -333,5 +328,58 @@ class FormattedSpan: class LineBreak: - - pass \ No newline at end of file + """ + A line break element that forces a new line within text content. + While this is an inline element that can occur within paragraphs, + it has block-like properties for consistency with the abstract model. + """ + + def __init__(self): + """Initialize a line break element.""" + # Import here to avoid circular imports + from .block import BlockType + self._block_type = BlockType.LINE_BREAK + self._parent = None + + @property + def block_type(self): + """Get the block type for this line break""" + return self._block_type + + @property + def parent(self): + """Get the parent element containing this line break, if any""" + return self._parent + + @parent.setter + def parent(self, parent): + """Set the parent element""" + self._parent = parent + + @classmethod + def create_and_add_to(cls, container) -> 'LineBreak': + """ + Create a new LineBreak and add it to a container. + + Args: + container: The container to add the line break to + + Returns: + The newly created LineBreak object + """ + # Create the new line break + line_break = cls() + + # Add the line break to the container if it has an appropriate method + if hasattr(container, 'add_line_break'): + container.add_line_break(line_break) + elif hasattr(container, 'add_element'): + container.add_element(line_break) + elif hasattr(container, 'add_word'): + # Some containers might treat line breaks like words + container.add_word(line_break) + else: + # Set parent relationship manually + line_break.parent = container + + return line_break diff --git a/pyWebLayout/io/__init__.py b/pyWebLayout/io/__init__.py index 3e3abcb..a1fd9ac 100644 --- a/pyWebLayout/io/__init__.py +++ b/pyWebLayout/io/__init__.py @@ -21,9 +21,11 @@ from pyWebLayout.io.readers.base import BaseReader, MetadataReader, ContentReade # Specialized HTML readers from pyWebLayout.io.readers.html_metadata import HTMLMetadataReader -from pyWebLayout.io.readers.html_content import HTMLContentReader from pyWebLayout.io.readers.html_resources import HTMLResourceReader +# HTML extraction parser (the best approach) +from pyWebLayout.io.readers.html_extraction import parse_html_string as parse_html_extraction + # Specialized EPUB readers from pyWebLayout.io.readers.epub_metadata import EPUBMetadataReader diff --git a/pyWebLayout/io/readers/__init__.py b/pyWebLayout/io/readers/__init__.py index 93be0d7..950535a 100644 --- a/pyWebLayout/io/readers/__init__.py +++ b/pyWebLayout/io/readers/__init__.py @@ -11,13 +11,8 @@ from .base import BaseReader, MetadataReader, ContentReader, ResourceReader, Com # HTML readers (decomposed) from .html import HTMLReader, read_html, read_html_file, parse_html_string from .html_metadata import HTMLMetadataReader -from .html_content import HTMLContentReader from .html_resources import HTMLResourceReader -# HTML processing components (supporting modules) -from .html_style import HTMLStyleManager -from .html_text import HTMLTextProcessor -from .html_elements import BlockElementHandler, ListElementHandler, TableElementHandler, InlineElementHandler # EPUB readers from .epub_reader import read_epub # Legacy @@ -29,7 +24,7 @@ __all__ = [ # HTML readers 'HTMLReader', 'read_html', 'read_html_file', 'parse_html_string', - 'HTMLMetadataReader', 'HTMLContentReader', 'HTMLResourceReader', + 'HTMLMetadataReader', 'HTMLResourceReader', # EPUB readers 'read_epub', 'EPUBMetadataReader', diff --git a/pyWebLayout/io/readers/html.py b/pyWebLayout/io/readers/html.py index 4433421..4e1cc16 100644 --- a/pyWebLayout/io/readers/html.py +++ b/pyWebLayout/io/readers/html.py @@ -1,36 +1,33 @@ """ Modern HTML reader for pyWebLayout. -This module provides a decomposed HTML reader that uses specialized -readers for metadata, content, and resources, following the pattern -established in the abstract module. +This module provides an HTML reader that uses the html_extraction module +for clean, handler-based parsing using BeautifulSoup. """ import os from typing import Union, Optional from pyWebLayout.abstract.document import Document -from pyWebLayout.io.readers.base import CompositeReader +from pyWebLayout.io.readers.base import BaseReader from pyWebLayout.io.readers.html_metadata import HTMLMetadataReader -from pyWebLayout.io.readers.html_content import HTMLContentReader from pyWebLayout.io.readers.html_resources import HTMLResourceReader +from pyWebLayout.io.readers.html_extraction import parse_html_string +from pyWebLayout.style import Font -class HTMLReader(CompositeReader): +class HTMLReader(BaseReader): """ - Modern HTML reader using decomposed architecture. + Modern HTML reader using the html_extraction parser. - This reader combines specialized readers for metadata, content, - and resources to provide a complete HTML parsing solution. + This reader uses the clean, handler-based architecture from html_extraction.py + for parsing HTML content into pyWebLayout's abstract document structure. """ def __init__(self): - """Initialize the HTML reader with all specialized readers.""" + """Initialize the HTML reader.""" super().__init__() - - # Set up specialized readers - self.set_metadata_reader(HTMLMetadataReader()) - self.set_content_reader(HTMLContentReader()) - self.set_resource_reader(HTMLResourceReader()) + self._metadata_reader = HTMLMetadataReader() + self._resource_reader = HTMLResourceReader() def can_read(self, source: Union[str, bytes]) -> bool: """ @@ -76,6 +73,7 @@ class HTMLReader(CompositeReader): - encoding: Character encoding (default: 'utf-8') - extract_metadata: Whether to extract metadata (default: True) - extract_resources: Whether to extract resources (default: True) + - base_font: Base font for styling (default: None) Returns: The parsed Document @@ -85,6 +83,7 @@ class HTMLReader(CompositeReader): encoding = options.get('encoding', 'utf-8') extract_metadata = options.get('extract_metadata', True) extract_resources = options.get('extract_resources', True) + base_font = options.get('base_font') # Read the HTML content html_content = self._read_html_content(source, encoding) @@ -93,10 +92,6 @@ class HTMLReader(CompositeReader): if not base_url and isinstance(source, str) and os.path.isfile(source): base_url = f"file://{os.path.dirname(os.path.abspath(source))}/" - # Set base URL in content reader - if self._content_reader and hasattr(self._content_reader, 'set_base_url'): - self._content_reader.set_base_url(base_url) - # Create a new document document = Document() @@ -104,9 +99,10 @@ class HTMLReader(CompositeReader): if extract_metadata and self._metadata_reader: self._metadata_reader.extract_metadata(html_content, document) - # Extract content - if self._content_reader: - self._content_reader.extract_content(html_content, document) + # Parse content using html_extraction + blocks = parse_html_string(html_content, base_font) + for block in blocks: + document.add_block(block) # Extract resources if enabled if extract_resources and self._resource_reader: diff --git a/pyWebLayout/io/readers/html_content.py b/pyWebLayout/io/readers/html_content.py deleted file mode 100644 index 18373e5..0000000 --- a/pyWebLayout/io/readers/html_content.py +++ /dev/null @@ -1,269 +0,0 @@ -""" -Modern HTML content reader for pyWebLayout. - -This module provides a decomposed HTML content reader that uses specialized -handlers and managers for different aspects of HTML parsing. -""" - -from html.parser import HTMLParser as BaseHTMLParser -from typing import Dict, List, Optional, Tuple, Union, Any -from pyWebLayout.abstract.document import Document -from pyWebLayout.io.readers.base import ContentReader -from pyWebLayout.io.readers.html_style import HTMLStyleManager -from pyWebLayout.io.readers.html_text import HTMLTextProcessor -from pyWebLayout.io.readers.html_elements import ( - BlockElementHandler, ListElementHandler, TableElementHandler, InlineElementHandler -) - - -class HTMLContentReader(ContentReader, BaseHTMLParser): - """ - Modern HTML content reader using decomposed architecture. - - This class orchestrates specialized handlers to parse HTML content - and convert it to pyWebLayout's abstract document model. - """ - - def __init__(self): - """Initialize the HTML content reader.""" - BaseHTMLParser.__init__(self) - - # Initialize managers and processors - self.style_manager = HTMLStyleManager() - self.text_processor = HTMLTextProcessor(self.style_manager) - - # Initialize element handlers - self.block_handler = BlockElementHandler(self.style_manager, self.text_processor) - self.list_handler = ListElementHandler(self.text_processor) - self.table_handler = TableElementHandler(self.text_processor) - self.inline_handler = InlineElementHandler(self.text_processor) - - # Document and parsing state - self._document: Optional[Document] = None - self._in_head = False - self._in_script = False - self._in_style = False - - def extract_content(self, html_content: str, document: Document) -> Any: - """ - Extract content from HTML. - - Args: - html_content: The HTML content to parse - document: The document to populate with content - - Returns: - The document with populated content - """ - self._document = document - self._reset_state() - - # Parse the HTML content - self.feed(html_content) - - # Flush any remaining text - self.text_processor.flush_text() - - return document - - def set_base_url(self, base_url: str): - """Set the base URL for resolving relative links.""" - self.inline_handler.set_base_url(base_url) - - def _reset_state(self): - """Reset all parser state for new content.""" - # Reset managers and processors - self.style_manager.reset() - self.text_processor.reset() - - # Reset element handlers - self.block_handler.reset() - self.list_handler.reset() - self.table_handler.reset() - self.inline_handler.reset() - - # Reset parser flags - self._in_head = False - self._in_script = False - self._in_style = False - - def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]): - """Handle the start of an HTML tag.""" - tag = tag.lower() - attrs_dict = dict(attrs) - - # Skip content in head, script, style (except body) - if self._should_skip_content(tag): - return - - # Handle special section markers - if self._handle_special_sections_start(tag): - return - - # Apply styles for this element - style = self.style_manager.apply_style_to_element(tag, attrs_dict) - self.style_manager.push_style(style) - - # Delegate to appropriate handler - self._delegate_start_tag(tag, attrs_dict) - - def handle_endtag(self, tag: str): - """Handle the end of an HTML tag.""" - tag = tag.lower() - - # Handle special section markers - if self._handle_special_sections_end(tag): - return - - # Skip content in head, script, style - if self._in_head or self._in_script or self._in_style: - return - - # Flush any accumulated text - self.text_processor.flush_text() - - # Delegate to appropriate handler - self._delegate_end_tag(tag) - - # Pop style regardless of tag - self.style_manager.pop_style() - - def handle_data(self, data: str): - """Handle text data.""" - if self._in_head or self._in_script or self._in_style: - return - - self.text_processor.add_text(data) - - def handle_entityref(self, name: str): - """Handle an HTML entity reference.""" - if self._in_head or self._in_script or self._in_style: - return - - self.text_processor.add_entity_reference(name) - - def handle_charref(self, name: str): - """Handle a character reference.""" - if self._in_head or self._in_script or self._in_style: - return - - self.text_processor.add_character_reference(name) - - def _should_skip_content(self, tag: str) -> bool: - """Check if we should skip content based on current state.""" - if self._in_head or self._in_script or self._in_style: - if tag in ('head', 'script', 'style'): - return False # Let special section handlers deal with these - if tag != 'body': - return True - return False - - def _handle_special_sections_start(self, tag: str) -> bool: - """Handle special section start tags. Returns True if handled.""" - if tag == 'head': - self._in_head = True - return True - elif tag == 'body': - self._in_head = False - return True - elif tag == 'script': - self._in_script = True - return True - elif tag == 'style': - self._in_style = True - return True - return False - - def _handle_special_sections_end(self, tag: str) -> bool: - """Handle special section end tags. Returns True if handled.""" - if tag == 'head': - self._in_head = False - self.style_manager.pop_style() - return True - elif tag == 'script': - self._in_script = False - self.style_manager.pop_style() - return True - elif tag == 'style': - self._in_style = False - self.style_manager.pop_style() - return True - return False - - def _delegate_start_tag(self, tag: str, attrs: Dict[str, str]): - """Delegate start tag handling to appropriate handler.""" - # Block elements - if tag == 'p': - self.block_handler.handle_paragraph_start(self._document) - elif tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'): - self.block_handler.handle_heading_start(tag, self._document) - elif tag == 'div': - self.block_handler.handle_div_start(self._document) - elif tag == 'blockquote': - self.block_handler.handle_blockquote_start(self._document) - elif tag == 'pre': - self.block_handler.handle_pre_start(self._document) - elif tag == 'code': - self.block_handler.handle_code_start(attrs, self._document) - - # List elements - elif tag in ('ul', 'ol', 'dl'): - self.list_handler.handle_list_start(tag, self.block_handler, self._document) - elif tag == 'li': - self.list_handler.handle_list_item_start(self.block_handler) - elif tag in ('dt', 'dd'): - self.list_handler.handle_definition_start(tag, self.block_handler) - - # Table elements - elif tag == 'table': - self.table_handler.handle_table_start(attrs, self.block_handler, self._document) - elif tag in ('thead', 'tbody', 'tfoot'): - self.table_handler.handle_table_section_start(tag) - elif tag == 'tr': - self.table_handler.handle_table_row_start() - elif tag in ('td', 'th'): - self.table_handler.handle_table_cell_start(tag, attrs, self.block_handler) - - # Inline elements - elif tag == 'a': - self.inline_handler.handle_link_start(attrs) - elif tag == 'img': - self.inline_handler.handle_image(attrs, self.block_handler, self._document) - elif tag == 'br': - self.inline_handler.handle_line_break(self.block_handler) - elif tag == 'hr': - self.inline_handler.handle_horizontal_rule(self.block_handler, self._document) - - # Style-only elements (no special handling needed, just styling) - elif tag in ('b', 'strong', 'i', 'em', 'u', 'span'): - pass # Styles are already applied by style manager - - def _delegate_end_tag(self, tag: str): - """Delegate end tag handling to appropriate handler.""" - # Block elements - if tag in ('p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'blockquote', 'pre', 'code'): - self.block_handler.handle_block_end() - - # List elements - elif tag in ('ul', 'ol', 'dl'): - self.list_handler.handle_list_end(self.block_handler) - elif tag in ('li', 'dt', 'dd'): - self.list_handler.handle_list_item_end(self.block_handler) - - # Table elements - elif tag == 'table': - self.table_handler.handle_table_end(self.block_handler) - elif tag in ('thead', 'tbody', 'tfoot'): - self.table_handler.handle_table_section_end() - elif tag == 'tr': - self.table_handler.handle_table_row_end() - elif tag in ('td', 'th'): - self.table_handler.handle_table_cell_end(self.block_handler) - - # Inline elements - elif tag == 'a': - self.inline_handler.handle_link_end() - - # Style-only elements (no special handling needed) - elif tag in ('b', 'strong', 'i', 'em', 'u', 'span'): - pass # Styles are handled by style manager diff --git a/pyWebLayout/io/readers/html_elements.py b/pyWebLayout/io/readers/html_elements.py deleted file mode 100644 index 57070d0..0000000 --- a/pyWebLayout/io/readers/html_elements.py +++ /dev/null @@ -1,473 +0,0 @@ -""" -HTML element handlers for pyWebLayout. - -This module provides specialized handlers for different types of HTML elements, -using composition and delegation to handle specific element types. -""" - -from typing import Dict, List, Optional, Any -import urllib.parse -from pyWebLayout.abstract.document import Document -from pyWebLayout.abstract.block import ( - Block, Paragraph, Heading, HeadingLevel, Quote, CodeBlock, - HList, ListStyle, ListItem, Table, TableRow, TableCell, - HorizontalRule, Image -) -from pyWebLayout.abstract.inline import LineBreak -from pyWebLayout.abstract.functional import Link, LinkType -from pyWebLayout.io.readers.html_style import HTMLStyleManager -from pyWebLayout.io.readers.html_text import HTMLTextProcessor - - -class BlockElementHandler: - """Handles block-level HTML elements like paragraphs, headings, divs.""" - - def __init__(self, style_manager: HTMLStyleManager, text_processor: HTMLTextProcessor): - self.style_manager = style_manager - self.text_processor = text_processor - self.block_stack: List[Block] = [] - self.current_block: Optional[Block] = None - self.current_paragraph: Optional[Paragraph] = None - - def reset(self): - """Reset the handler state.""" - self.block_stack = [] - self.current_block = None - self.current_paragraph = None - - def add_block_to_document_or_parent(self, block: Block, document: Document): - """Add a block to the document or current parent block.""" - if self.current_block and hasattr(self.current_block, 'add_block'): - self.current_block.add_block(block) - else: - document.add_block(block) - - def handle_paragraph_start(self, document: Document): - """Handle the start of a paragraph element.""" - self.text_processor.flush_text() - paragraph = Paragraph() - - self.add_block_to_document_or_parent(paragraph, document) - self.block_stack.append(paragraph) - self.current_block = paragraph - self.current_paragraph = paragraph - self.text_processor.set_current_paragraph(paragraph) - - def handle_heading_start(self, tag: str, document: Document): - """Handle the start of a heading element.""" - self.text_processor.flush_text() - - level_map = { - 'h1': HeadingLevel.H1, 'h2': HeadingLevel.H2, 'h3': HeadingLevel.H3, - 'h4': HeadingLevel.H4, 'h5': HeadingLevel.H5, 'h6': HeadingLevel.H6 - } - - heading = Heading(level=level_map[tag]) - self.add_block_to_document_or_parent(heading, document) - self.block_stack.append(heading) - self.current_block = heading - self.current_paragraph = heading # Heading inherits from Paragraph - self.text_processor.set_current_paragraph(heading) - - def handle_div_start(self, document: Document): - """Handle the start of a div element.""" - self.text_processor.flush_text() - div_para = Paragraph() - - self.add_block_to_document_or_parent(div_para, document) - self.block_stack.append(div_para) - self.current_block = div_para - self.current_paragraph = div_para - self.text_processor.set_current_paragraph(div_para) - - def handle_blockquote_start(self, document: Document): - """Handle the start of a blockquote element.""" - self.text_processor.flush_text() - quote = Quote() - - self.add_block_to_document_or_parent(quote, document) - self.block_stack.append(quote) - self.current_block = quote - self.current_paragraph = None - self.text_processor.set_current_paragraph(None) - - def handle_pre_start(self, document: Document): - """Handle the start of a pre element.""" - self.text_processor.flush_text() - pre_para = Paragraph() - - self.add_block_to_document_or_parent(pre_para, document) - self.block_stack.append(pre_para) - self.current_block = pre_para - self.current_paragraph = pre_para - self.text_processor.set_current_paragraph(pre_para) - - def handle_code_start(self, attrs: Dict[str, str], document: Document): - """Handle the start of a code element.""" - # If we're inside a pre, replace the paragraph with a code block - if self.block_stack and isinstance(self.block_stack[-1], Paragraph): - pre_para = self.block_stack.pop() - - # Get the language from class if specified - language = "" - if 'class' in attrs: - class_attr = attrs['class'] - if class_attr.startswith('language-'): - language = class_attr[9:] - - code_block = CodeBlock(language=language) - - # Replace the paragraph with the code block in its parent - if pre_para.parent: - parent = pre_para.parent - if hasattr(parent, '_blocks'): - for i, block in enumerate(parent._blocks): - if block == pre_para: - parent._blocks[i] = code_block - code_block.parent = parent - break - else: - # Replace in document blocks - for i, block in enumerate(document.blocks): - if block == pre_para: - document.blocks[i] = code_block - break - - self.block_stack.append(code_block) - self.current_block = code_block - self.current_paragraph = None - self.text_processor.set_current_paragraph(None) - - def handle_block_end(self): - """Handle the end of a block element.""" - if self.block_stack: - self.block_stack.pop() - - if self.block_stack: - self.current_block = self.block_stack[-1] - # Update current paragraph based on block type - if isinstance(self.current_block, Paragraph): - self.current_paragraph = self.current_block - else: - self.current_paragraph = None - else: - self.current_block = None - self.current_paragraph = None - - self.text_processor.set_current_paragraph(self.current_paragraph) - - -class ListElementHandler: - """Handles list-related HTML elements (ul, ol, dl, li, dt, dd).""" - - def __init__(self, text_processor: HTMLTextProcessor): - self.text_processor = text_processor - self.list_stack: List[HList] = [] - - def reset(self): - """Reset the handler state.""" - self.list_stack = [] - - def handle_list_start(self, tag: str, block_handler: BlockElementHandler, document: Document): - """Handle the start of a list element.""" - self.text_processor.flush_text() - - style_map = { - 'ul': ListStyle.UNORDERED, - 'ol': ListStyle.ORDERED, - 'dl': ListStyle.DEFINITION - } - - list_block = HList(style=style_map[tag]) - block_handler.add_block_to_document_or_parent(list_block, document) - - block_handler.block_stack.append(list_block) - self.list_stack.append(list_block) - block_handler.current_block = list_block - block_handler.current_paragraph = None - self.text_processor.set_current_paragraph(None) - - def handle_list_item_start(self, block_handler: BlockElementHandler): - """Handle the start of a list item.""" - if not self.list_stack: - return - - self.text_processor.flush_text() - list_item = ListItem() - - current_list = self.list_stack[-1] - current_list.add_item(list_item) - - block_handler.block_stack.append(list_item) - block_handler.current_block = list_item - - # Create a paragraph for the list item content - item_para = Paragraph() - list_item.add_block(item_para) - block_handler.current_paragraph = item_para - self.text_processor.set_current_paragraph(item_para) - - def handle_definition_start(self, tag: str, block_handler: BlockElementHandler): - """Handle the start of definition terms or descriptions.""" - if not self.list_stack or self.list_stack[-1].style != ListStyle.DEFINITION: - return - - self.text_processor.flush_text() - current_list = self.list_stack[-1] - - if tag == 'dt': - list_item = ListItem(term="") - current_list.add_item(list_item) - block_handler.block_stack.append(list_item) - block_handler.current_block = list_item - - term_para = Paragraph() - list_item.add_block(term_para) - block_handler.current_paragraph = term_para - self.text_processor.set_current_paragraph(term_para) - - elif tag == 'dd': - if current_list._items: - list_item = current_list._items[-1] - desc_para = Paragraph() - list_item.add_block(desc_para) - block_handler.current_paragraph = desc_para - self.text_processor.set_current_paragraph(desc_para) - - def handle_list_end(self, block_handler: BlockElementHandler): - """Handle the end of a list.""" - if block_handler.block_stack: - block_handler.block_stack.pop() - if self.list_stack: - self.list_stack.pop() - - if block_handler.block_stack: - block_handler.current_block = block_handler.block_stack[-1] - else: - block_handler.current_block = None - - block_handler.current_paragraph = None - self.text_processor.set_current_paragraph(None) - - def handle_list_item_end(self, block_handler: BlockElementHandler): - """Handle the end of a list item.""" - if block_handler.block_stack: - block_handler.block_stack.pop() - - if block_handler.block_stack: - block_handler.current_block = block_handler.block_stack[-1] - else: - block_handler.current_block = None - - block_handler.current_paragraph = None - self.text_processor.set_current_paragraph(None) - - -class TableElementHandler: - """Handles table-related HTML elements (table, tr, td, th, thead, tbody, tfoot).""" - - def __init__(self, text_processor: HTMLTextProcessor): - self.text_processor = text_processor - self.table_stack: List[Table] = [] - self.current_table_row: Optional[TableRow] = None - self.current_table_section = "body" - - def reset(self): - """Reset the handler state.""" - self.table_stack = [] - self.current_table_row = None - self.current_table_section = "body" - - def handle_table_start(self, attrs: Dict[str, str], block_handler: BlockElementHandler, document: Document): - """Handle the start of a table element.""" - self.text_processor.flush_text() - - caption = attrs.get('summary') - table = Table(caption=caption) - - block_handler.add_block_to_document_or_parent(table, document) - block_handler.block_stack.append(table) - self.table_stack.append(table) - block_handler.current_block = table - block_handler.current_paragraph = None - self.text_processor.set_current_paragraph(None) - - def handle_table_section_start(self, tag: str): - """Handle the start of a table section.""" - self.current_table_section = tag - - def handle_table_row_start(self): - """Handle the start of a table row.""" - if not self.table_stack: - return - - self.text_processor.flush_text() - row = TableRow() - - current_table = self.table_stack[-1] - section = self.current_table_section - - if section == 'thead': - section = "header" - elif section == 'tfoot': - section = "footer" - else: - section = "body" - - current_table.add_row(row, section=section) - self.current_table_row = row - - def handle_table_cell_start(self, tag: str, attrs: Dict[str, str], block_handler: BlockElementHandler): - """Handle the start of a table cell.""" - if not self.current_table_row: - return - - self.text_processor.flush_text() - - # Parse attributes - try: - colspan = int(attrs.get('colspan', 1)) - rowspan = int(attrs.get('rowspan', 1)) - except ValueError: - colspan, rowspan = 1, 1 - - is_header = (tag == 'th') - - cell = TableCell(is_header=is_header, colspan=colspan, rowspan=rowspan) - self.current_table_row.add_cell(cell) - - block_handler.block_stack.append(cell) - block_handler.current_block = cell - - # Create a paragraph for the cell content - cell_para = Paragraph() - cell.add_block(cell_para) - block_handler.current_paragraph = cell_para - self.text_processor.set_current_paragraph(cell_para) - - def handle_table_end(self, block_handler: BlockElementHandler): - """Handle the end of a table.""" - if block_handler.block_stack: - block_handler.block_stack.pop() - if self.table_stack: - self.table_stack.pop() - - if block_handler.block_stack: - block_handler.current_block = block_handler.block_stack[-1] - else: - block_handler.current_block = None - - block_handler.current_paragraph = None - self.text_processor.set_current_paragraph(None) - self.current_table_row = None - self.current_table_section = "body" - - def handle_table_section_end(self): - """Handle the end of a table section.""" - self.current_table_section = "body" - - def handle_table_row_end(self): - """Handle the end of a table row.""" - self.current_table_row = None - - def handle_table_cell_end(self, block_handler: BlockElementHandler): - """Handle the end of a table cell.""" - if block_handler.block_stack: - block_handler.block_stack.pop() - - if block_handler.block_stack: - block_handler.current_block = block_handler.block_stack[-1] - else: - block_handler.current_block = None - - block_handler.current_paragraph = None - self.text_processor.set_current_paragraph(None) - - -class InlineElementHandler: - """Handles inline and special HTML elements (a, img, br, hr).""" - - def __init__(self, text_processor: HTMLTextProcessor, base_url: Optional[str] = None): - self.text_processor = text_processor - self.base_url = base_url - self.in_link = False - self.current_link: Optional[Link] = None - - def reset(self): - """Reset the handler state.""" - self.in_link = False - self.current_link = None - - def set_base_url(self, base_url: Optional[str]): - """Set the base URL for resolving relative links.""" - self.base_url = base_url - - def handle_link_start(self, attrs: Dict[str, str]): - """Handle the start of a link element.""" - self.text_processor.flush_text() - - href = attrs.get('href', '') - title = attrs.get('title', '') - - # Determine link type - link_type = LinkType.INTERNAL - if href.startswith('http://') or href.startswith('https://'): - link_type = LinkType.EXTERNAL - elif href.startswith('javascript:'): - link_type = LinkType.FUNCTION - elif href.startswith('api:'): - link_type = LinkType.API - href = href[4:] - - # Resolve relative URLs - if self.base_url and not href.startswith(('http://', 'https://', 'javascript:', 'api:', '#')): - href = urllib.parse.urljoin(self.base_url, href) - - self.current_link = Link( - location=href, - link_type=link_type, - title=title if title else None - ) - - self.in_link = True - - def handle_link_end(self): - """Handle the end of a link element.""" - self.in_link = False - self.current_link = None - - def handle_image(self, attrs: Dict[str, str], block_handler: BlockElementHandler, document: Document): - """Handle an image element.""" - src = attrs.get('src', '') - alt = attrs.get('alt', '') - - # Parse dimensions - width = height = None - try: - if 'width' in attrs: - width = int(attrs['width']) - if 'height' in attrs: - height = int(attrs['height']) - except ValueError: - pass - - # Resolve relative URLs - if self.base_url and not src.startswith(('http://', 'https://')): - src = urllib.parse.urljoin(self.base_url, src) - - image = Image(source=src, alt_text=alt, width=width, height=height) - block_handler.add_block_to_document_or_parent(image, document) - - def handle_line_break(self, block_handler: BlockElementHandler): - """Handle a line break element.""" - if block_handler.current_paragraph: - line_break = LineBreak() - if hasattr(block_handler.current_paragraph, 'add_block'): - block_handler.current_paragraph.add_block(line_break) - self.text_processor.flush_text() - - def handle_horizontal_rule(self, block_handler: BlockElementHandler, document: Document): - """Handle a horizontal rule element.""" - self.text_processor.flush_text() - hr = HorizontalRule() - block_handler.add_block_to_document_or_parent(hr, document) diff --git a/pyWebLayout/io/readers/html_extraction.py b/pyWebLayout/io/readers/html_extraction.py index 6e359a3..73c2df2 100644 --- a/pyWebLayout/io/readers/html_extraction.py +++ b/pyWebLayout/io/readers/html_extraction.py @@ -12,7 +12,8 @@ from bs4 import BeautifulSoup, Tag, NavigableString from pyWebLayout.abstract.inline import Word, FormattedSpan from pyWebLayout.abstract.block import ( Block, Paragraph, Heading, HeadingLevel, Quote, CodeBlock, - HList, ListItem, ListStyle, Table, TableRow, TableCell + HList, ListItem, ListStyle, Table, TableRow, TableCell, + HorizontalRule, Image ) from pyWebLayout.style import Font, FontWeight, FontStyle, TextDecoration @@ -576,11 +577,9 @@ def table_header_cell_handler(element: Tag, context: StyleContext) -> TableCell: return cell -def horizontal_rule_handler(element: Tag, context: StyleContext) -> Block: +def horizontal_rule_handler(element: Tag, context: StyleContext) -> HorizontalRule: """Handle
Hello world!
' - - result = self.reader.extract_content(html, self.document) - - self.assertEqual(len(self.document.blocks), 1) - self.assertIsInstance(self.document.blocks[0], Paragraph) - - paragraph = self.document.blocks[0] - words = list(paragraph.words()) - self.assertEqual(len(words), 2) - self.assertEqual(words[0][1].text, "Hello") - self.assertEqual(words[1][1].text, "world!") - - def test_headings(self): - """Test parsing different heading levels.""" - html = ''' -This is bold and italic text.
' - - self.reader.extract_content(html, self.document) - - self.assertEqual(len(self.document.blocks), 1) - paragraph = self.document.blocks[0] - words = list(paragraph.words()) - - # Should have words: "This", "is", "bold", "and", "italic", "text." - self.assertEqual(len(words), 6) - - # The styling information is embedded in the Font objects - # We can't easily test the exact styling without more complex setup - # but we can verify the words are created correctly - word_texts = [word[1].text for word in words] - self.assertEqual(word_texts, ["This", "is", "bold", "and", "italic", "text."]) - - def test_unordered_list(self): - """Test parsing unordered lists.""" - html = ''' -| Header 1 | -Header 2 | -
|---|---|
| Cell 1 | -Cell 2 | -
-- ''' - - self.reader.extract_content(html, self.document) - - self.assertEqual(len(self.document.blocks), 1) - self.assertIsInstance(self.document.blocks[0], Quote) - - quote = self.document.blocks[0] - quote_blocks = list(quote.blocks()) - self.assertEqual(len(quote_blocks), 2) - self.assertIsInstance(quote_blocks[0], Paragraph) - self.assertIsInstance(quote_blocks[1], Paragraph) - - def test_code_block(self): - """Test parsing code blocks.""" - html = ''' -This is a quoted paragraph.
-Another quoted paragraph.
-
-def hello():
- print("Hello, world!")
-
- '''
-
- self.reader.extract_content(html, self.document)
-
- self.assertEqual(len(self.document.blocks), 1)
- self.assertIsInstance(self.document.blocks[0], CodeBlock)
-
- code_block = self.document.blocks[0]
- self.assertEqual(code_block.language, "python")
-
- def test_horizontal_rule(self):
- """Test parsing horizontal rules."""
- html = 'Before
After
' - - self.reader.extract_content(html, self.document) - - self.assertEqual(len(self.document.blocks), 3) - self.assertIsInstance(self.document.blocks[0], Paragraph) - self.assertIsInstance(self.document.blocks[1], HorizontalRule) - self.assertIsInstance(self.document.blocks[2], Paragraph) - - def test_html_entities(self): - """Test handling HTML entities.""" - html = 'Less than: < Greater than: > Ampersand: &
' - - self.reader.extract_content(html, self.document) - - paragraph = self.document.blocks[0] - words = list(paragraph.words()) - - # Find the entity words - word_texts = [word[1].text for word in words] - self.assertIn('<', word_texts) - self.assertIn('>', word_texts) - self.assertIn('&', word_texts) - - def test_nested_elements(self): - """Test parsing nested HTML elements.""" - html = ''' -Section content with important text.
-Word1 Word2 - Word3
- ''' - - self.reader.extract_content(html, self.document) - - paragraph = self.document.blocks[0] - words = list(paragraph.words()) - - # Should normalize whitespace and create separate words - word_texts = [word[1].text for word in words] - self.assertEqual(word_texts, ["Word1", "Word2", "Word3"]) - - def test_base_url_setting(self): - """Test setting base URL for link resolution.""" - base_url = "https://example.com/path/" - self.reader.set_base_url(base_url) - - # The base URL should be passed to the inline handler - self.assertEqual(self.reader.inline_handler.base_url, base_url) - - def test_complex_document(self): - """Test parsing a complex HTML document.""" - html = ''' - - - -Introduction paragraph with emphasis.
- -Content with a link.
- --- -A quoted paragraph.
-
| Col1 | Col2 |
|---|---|
| A | B |