diff --git a/pyWebLayout/abstract/__init__.py b/pyWebLayout/abstract/__init__.py index c8b030f..0fb2ef7 100644 --- a/pyWebLayout/abstract/__init__.py +++ b/pyWebLayout/abstract/__init__.py @@ -1,6 +1,6 @@ from .block import Block, BlockType, Paragraph, Heading, HeadingLevel, Quote, CodeBlock from .block import HList, ListItem, ListStyle, Table, TableRow, TableCell -from .block import HorizontalRule, LineBreak, Image +#from .block import HorizontalRule, LineBreak, Image from .inline import Word, FormattedSpan from .document import Document, MetadataType, Chapter, Book from .functional import Link, LinkType, Button, Form, FormField, FormFieldType diff --git a/pyWebLayout/abstract/block.py b/pyWebLayout/abstract/block.py index 2a05bf7..5ab98e0 100644 --- a/pyWebLayout/abstract/block.py +++ b/pyWebLayout/abstract/block.py @@ -183,6 +183,10 @@ class Paragraph(Block): def word_count(self) -> int: """Get the number of words in this paragraph""" return len(self._words) + + def __len__(self): + + return self.word_count class HeadingLevel(Enum): @@ -1008,3 +1012,9 @@ class Table(Block): self._footer_rows.append(row) else: # Default to body self._rows + + + +class Image: + + pass \ No newline at end of file diff --git a/pyWebLayout/abstract/inline.py b/pyWebLayout/abstract/inline.py index 857dd9f..7081612 100644 --- a/pyWebLayout/abstract/inline.py +++ b/pyWebLayout/abstract/inline.py @@ -330,3 +330,8 @@ class FormattedSpan: self._words.append(word) return word + + +class LineBreak: + + pass \ No newline at end of file diff --git a/pyWebLayout/html_parser.py b/pyWebLayout/html_parser.py index bc987c3..61b40a0 100644 --- a/pyWebLayout/html_parser.py +++ b/pyWebLayout/html_parser.py @@ -7,7 +7,7 @@ from PIL import Image from .style import Font, FontStyle, FontWeight, TextDecoration from .abstract.document import Document, MetadataType, Book, Chapter from .abstract.block import ( - Block, BlockType, Parapgraph, Heading, HeadingLevel, Quote, CodeBlock, + Block, BlockType, Paragraph, Heading, HeadingLevel, Quote, CodeBlock, HList, ListStyle, ListItem, Table, TableRow, TableCell, HorizontalRule, LineBreak ) from .abstract.inline import Word, FormattedSpan @@ -138,7 +138,7 @@ class HTMLParser(BaseHTMLParser): elif tag == 'p': self._flush_text() # Flush any pending text - self._current_paragraph = Parapgraph() + self._current_paragraph = Paragraph() # Add the paragraph to the current block or document if self._current_block and hasattr(self._current_block, 'add_block'): @@ -180,7 +180,7 @@ class HTMLParser(BaseHTMLParser): self._flush_text() # Flush any pending text # For divs, we create a new paragraph as a container - div_para = Parapgraph() + div_para = Paragraph() # Add the div to the current block or document if self._current_block and hasattr(self._current_block, 'add_block'): @@ -214,7 +214,7 @@ class HTMLParser(BaseHTMLParser): # Pre can optionally contain a code block # We'll create a paragraph for now, and if we find a code tag inside, # we'll replace it with a code block - pre_para = Parapgraph() + pre_para = Paragraph() # Add the pre to the current block or document if self._current_block and hasattr(self._current_block, 'add_block'): @@ -229,7 +229,7 @@ class HTMLParser(BaseHTMLParser): elif tag == 'code': # If we're inside a pre, replace the paragraph with a code block - if self._block_stack and isinstance(self._block_stack[-1], Parapgraph): + if self._block_stack and isinstance(self._block_stack[-1], Paragraph): pre_para = self._block_stack.pop() # Get the language from class if specified (e.g., class="language-python") @@ -312,7 +312,7 @@ class HTMLParser(BaseHTMLParser): self._current_block = list_item # Create a paragraph for the term content - term_para = Parapgraph() + term_para = Paragraph() list_item.add_block(term_para) self._current_paragraph = term_para @@ -325,7 +325,7 @@ class HTMLParser(BaseHTMLParser): list_item = current_list._items[-1] # Create a paragraph for the description content - desc_para = Parapgraph() + desc_para = Paragraph() list_item.add_block(desc_para) # Update current state @@ -340,7 +340,7 @@ class HTMLParser(BaseHTMLParser): self._current_block = list_item # Create a paragraph for the description content - desc_para = Parapgraph() + desc_para = Paragraph() list_item.add_block(desc_para) self._current_paragraph = desc_para @@ -424,7 +424,7 @@ class HTMLParser(BaseHTMLParser): self._current_block = cell # Create a paragraph for the cell content - cell_para = Parapgraph() + cell_para = Paragraph() cell.add_block(cell_para) self._current_paragraph = cell_para @@ -508,6 +508,7 @@ class HTMLParser(BaseHTMLParser): }) elif tag == 'br': + # Add a line break if self._current_paragraph: line_break = LineBreak() diff --git a/pyWebLayout/io/readers/epub_reader.py b/pyWebLayout/io/readers/epub_reader.py index 2c1ca27..44e938f 100644 --- a/pyWebLayout/io/readers/epub_reader.py +++ b/pyWebLayout/io/readers/epub_reader.py @@ -379,10 +379,10 @@ class EPUBReader: except Exception as e: print(f"Error parsing chapter {i+1}: {str(e)}") # Add an error message block - from pyWebLayout.abstract.block import Parapgraph + from pyWebLayout.abstract.block import Paragraph from pyWebLayout.abstract.inline import Word from pyWebLayout.style import Font - error_para = Parapgraph() + error_para = Paragraph() # Create a default font style for the error message default_font = Font() error_para.add_word(Word(f"Error loading chapter: {str(e)}", default_font)) diff --git a/pyWebLayout/io/readers/html_elements.py b/pyWebLayout/io/readers/html_elements.py index 93e3240..0f3400e 100644 --- a/pyWebLayout/io/readers/html_elements.py +++ b/pyWebLayout/io/readers/html_elements.py @@ -9,9 +9,9 @@ from typing import Dict, List, Optional, Any import urllib.parse from pyWebLayout.abstract.document import Document from pyWebLayout.abstract.block import ( - Block, Parapgraph, Heading, HeadingLevel, Quote, CodeBlock, + Block, Paragraph, Heading, HeadingLevel, Quote, CodeBlock, HList, ListStyle, ListItem, Table, TableRow, TableCell, - HorizontalRule, LineBreak, Image + #HorizontalRule, LineBreak, Image ) from pyWebLayout.abstract.functional import Link, LinkType from pyWebLayout.io.readers.html_style import HTMLStyleManager @@ -26,7 +26,7 @@ class BlockElementHandler: self.text_processor = text_processor self.block_stack: List[Block] = [] self.current_block: Optional[Block] = None - self.current_paragraph: Optional[Parapgraph] = None + self.current_paragraph: Optional[Paragraph] = None def reset(self): """Reset the handler state.""" @@ -44,7 +44,7 @@ class BlockElementHandler: def handle_paragraph_start(self, document: Document): """Handle the start of a paragraph element.""" self.text_processor.flush_text() - paragraph = Parapgraph() + paragraph = Paragraph() self.add_block_to_document_or_parent(paragraph, document) self.block_stack.append(paragraph) @@ -71,7 +71,7 @@ class BlockElementHandler: def handle_div_start(self, document: Document): """Handle the start of a div element.""" self.text_processor.flush_text() - div_para = Parapgraph() + div_para = Paragraph() self.add_block_to_document_or_parent(div_para, document) self.block_stack.append(div_para) @@ -93,7 +93,7 @@ class BlockElementHandler: def handle_pre_start(self, document: Document): """Handle the start of a pre element.""" self.text_processor.flush_text() - pre_para = Parapgraph() + pre_para = Paragraph() self.add_block_to_document_or_parent(pre_para, document) self.block_stack.append(pre_para) @@ -104,7 +104,7 @@ class BlockElementHandler: def handle_code_start(self, attrs: Dict[str, str], document: Document): """Handle the start of a code element.""" # If we're inside a pre, replace the paragraph with a code block - if self.block_stack and isinstance(self.block_stack[-1], Parapgraph): + if self.block_stack and isinstance(self.block_stack[-1], Paragraph): pre_para = self.block_stack.pop() # Get the language from class if specified @@ -145,7 +145,7 @@ class BlockElementHandler: if self.block_stack: self.current_block = self.block_stack[-1] # Update current paragraph based on block type - if isinstance(self.current_block, Parapgraph): + if isinstance(self.current_block, Paragraph): self.current_paragraph = self.current_block else: self.current_paragraph = None @@ -201,7 +201,7 @@ class ListElementHandler: block_handler.current_block = list_item # Create a paragraph for the list item content - item_para = Parapgraph() + item_para = Paragraph() list_item.add_block(item_para) block_handler.current_paragraph = item_para self.text_processor.set_current_paragraph(item_para) @@ -220,7 +220,7 @@ class ListElementHandler: block_handler.block_stack.append(list_item) block_handler.current_block = list_item - term_para = Parapgraph() + term_para = Paragraph() list_item.add_block(term_para) block_handler.current_paragraph = term_para self.text_processor.set_current_paragraph(term_para) @@ -228,7 +228,7 @@ class ListElementHandler: elif tag == 'dd': if current_list._items: list_item = current_list._items[-1] - desc_para = Parapgraph() + desc_para = Paragraph() list_item.add_block(desc_para) block_handler.current_paragraph = desc_para self.text_processor.set_current_paragraph(desc_para) @@ -339,7 +339,7 @@ class TableElementHandler: block_handler.current_block = cell # Create a paragraph for the cell content - cell_para = Parapgraph() + cell_para = Paragraph() cell.add_block(cell_para) block_handler.current_paragraph = cell_para self.text_processor.set_current_paragraph(cell_para) diff --git a/pyWebLayout/io/readers/html_extraction.py b/pyWebLayout/io/readers/html_extraction.py new file mode 100644 index 0000000..6e359a3 --- /dev/null +++ b/pyWebLayout/io/readers/html_extraction.py @@ -0,0 +1,736 @@ +""" +HTML extraction module for converting HTML elements to pyWebLayout abstract elements. + +This module provides handler functions for converting HTML elements into the abstract document structure +used by pyWebLayout, including paragraphs, headings, lists, tables, and inline formatting. +Each handler function has a robust signature that handles style hints, CSS classes, and attributes. +""" + +import re +from typing import List, Dict, Any, Optional, Union, Callable, Tuple, NamedTuple +from bs4 import BeautifulSoup, Tag, NavigableString +from pyWebLayout.abstract.inline import Word, FormattedSpan +from pyWebLayout.abstract.block import ( + Block, Paragraph, Heading, HeadingLevel, Quote, CodeBlock, + HList, ListItem, ListStyle, Table, TableRow, TableCell +) +from pyWebLayout.style import Font, FontWeight, FontStyle, TextDecoration + + +class StyleContext(NamedTuple): + """ + Immutable style context passed to handler functions. + Contains all styling information including inherited styles, CSS hints, and element attributes. + """ + font: Font + background: Optional[Tuple[int, int, int, int]] + css_classes: set + css_styles: Dict[str, str] + element_attributes: Dict[str, Any] + parent_elements: List[str] # Stack of parent element names + + def with_font(self, font: Font) -> 'StyleContext': + """Create new context with modified font.""" + return self._replace(font=font) + + def with_background(self, background: Optional[Tuple[int, int, int, int]]) -> 'StyleContext': + """Create new context with modified background.""" + return self._replace(background=background) + + def with_css_classes(self, css_classes: set) -> 'StyleContext': + """Create new context with modified CSS classes.""" + return self._replace(css_classes=css_classes) + + def with_css_styles(self, css_styles: Dict[str, str]) -> 'StyleContext': + """Create new context with modified CSS styles.""" + return self._replace(css_styles=css_styles) + + def with_attributes(self, attributes: Dict[str, Any]) -> 'StyleContext': + """Create new context with modified element attributes.""" + return self._replace(element_attributes=attributes) + + def push_element(self, element_name: str) -> 'StyleContext': + """Create new context with element pushed onto parent stack.""" + return self._replace(parent_elements=self.parent_elements + [element_name]) + + +def create_base_context(base_font: Optional[Font] = None) -> StyleContext: + """ + Create a base style context with default values. + + Args: + base_font: Base font to use, defaults to system default + + Returns: + StyleContext with default values + """ + return StyleContext( + font=base_font or Font(), + background=None, + css_classes=set(), + css_styles={}, + element_attributes={}, + parent_elements=[] + ) + + +def apply_element_styling(context: StyleContext, element: Tag) -> StyleContext: + """ + Apply element-specific styling to context based on HTML element and attributes. + + Args: + context: Current style context + element: BeautifulSoup Tag object + + Returns: + New StyleContext with applied styling + """ + tag_name = element.name.lower() + attributes = dict(element.attrs) if element.attrs else {} + + # Start with current context + new_context = context.with_attributes(attributes).push_element(tag_name) + + # Apply CSS classes + css_classes = new_context.css_classes.copy() + if 'class' in attributes: + classes = attributes['class'].split() if isinstance(attributes['class'], str) else attributes['class'] + css_classes.update(classes) + new_context = new_context.with_css_classes(css_classes) + + # Apply inline styles + css_styles = new_context.css_styles.copy() + if 'style' in attributes: + inline_styles = parse_inline_styles(attributes['style']) + css_styles.update(inline_styles) + new_context = new_context.with_css_styles(css_styles) + + # Apply element-specific default styles + font = apply_element_font_styles(new_context.font, tag_name, css_styles) + new_context = new_context.with_font(font) + + # Apply background from styles + background = apply_background_styles(new_context.background, css_styles) + new_context = new_context.with_background(background) + + return new_context + + +def parse_inline_styles(style_text: str) -> Dict[str, str]: + """ + Parse CSS inline styles into dictionary. + + Args: + style_text: CSS style text (e.g., "color: red; font-weight: bold;") + + Returns: + Dictionary of CSS property-value pairs + """ + styles = {} + for declaration in style_text.split(';'): + if ':' in declaration: + prop, value = declaration.split(':', 1) + styles[prop.strip().lower()] = value.strip() + return styles + + +def apply_element_font_styles(font: Font, tag_name: str, css_styles: Dict[str, str]) -> Font: + """ + Apply font styling based on HTML element and CSS styles. + + Args: + font: Current font + tag_name: HTML tag name + css_styles: CSS styles dictionary + + Returns: + New Font object with applied styling + """ + # Default element styles + element_font_styles = { + 'b': {'weight': FontWeight.BOLD}, + 'strong': {'weight': FontWeight.BOLD}, + 'i': {'style': FontStyle.ITALIC}, + 'em': {'style': FontStyle.ITALIC}, + 'u': {'decoration': TextDecoration.UNDERLINE}, + 's': {'decoration': TextDecoration.STRIKETHROUGH}, + 'del': {'decoration': TextDecoration.STRIKETHROUGH}, + 'h1': {'size': 24, 'weight': FontWeight.BOLD}, + 'h2': {'size': 20, 'weight': FontWeight.BOLD}, + 'h3': {'size': 18, 'weight': FontWeight.BOLD}, + 'h4': {'size': 16, 'weight': FontWeight.BOLD}, + 'h5': {'size': 14, 'weight': FontWeight.BOLD}, + 'h6': {'size': 12, 'weight': FontWeight.BOLD}, + } + + # Start with current font properties + font_size = font.font_size + colour = font.colour + weight = font.weight + style = font.style + decoration = font.decoration + background = font.background + language = font.language + + # Apply element default styles + if tag_name in element_font_styles: + elem_styles = element_font_styles[tag_name] + if 'size' in elem_styles: + font_size = elem_styles['size'] + if 'weight' in elem_styles: + weight = elem_styles['weight'] + if 'style' in elem_styles: + style = elem_styles['style'] + if 'decoration' in elem_styles: + decoration = elem_styles['decoration'] + + # Apply CSS styles (override element defaults) + if 'font-size' in css_styles: + # Parse font-size (simplified - could be enhanced) + size_value = css_styles['font-size'].lower() + if size_value.endswith('px'): + try: + font_size = int(float(size_value[:-2])) + except ValueError: + pass + elif size_value.endswith('pt'): + try: + font_size = int(float(size_value[:-2])) + except ValueError: + pass + + if 'font-weight' in css_styles: + weight_value = css_styles['font-weight'].lower() + if weight_value in ['bold', '700', '800', '900']: + weight = FontWeight.BOLD + elif weight_value in ['normal', '400']: + weight = FontWeight.NORMAL + + if 'font-style' in css_styles: + style_value = css_styles['font-style'].lower() + if style_value == 'italic': + style = FontStyle.ITALIC + elif style_value == 'normal': + style = FontStyle.NORMAL + + if 'text-decoration' in css_styles: + decoration_value = css_styles['text-decoration'].lower() + if 'underline' in decoration_value: + decoration = TextDecoration.UNDERLINE + elif 'line-through' in decoration_value: + decoration = TextDecoration.STRIKETHROUGH + elif 'none' in decoration_value: + decoration = TextDecoration.NONE + + if 'color' in css_styles: + # Parse color (simplified - could be enhanced for hex, rgb, etc.) + color_value = css_styles['color'].lower() + color_map = { + 'black': (0, 0, 0), + 'white': (255, 255, 255), + 'red': (255, 0, 0), + 'green': (0, 255, 0), + 'blue': (0, 0, 255), + } + if color_value in color_map: + colour = color_map[color_value] + elif color_value.startswith('#') and len(color_value) == 7: + try: + r = int(color_value[1:3], 16) + g = int(color_value[3:5], 16) + b = int(color_value[5:7], 16) + colour = (r, g, b) + except ValueError: + pass + + return Font( + font_path=font._font_path, + font_size=font_size, + colour=colour, + weight=weight, + style=style, + decoration=decoration, + background=background, + langauge=language + ) + + +def apply_background_styles(current_background: Optional[Tuple[int, int, int, int]], + css_styles: Dict[str, str]) -> Optional[Tuple[int, int, int, int]]: + """ + Apply background styling from CSS. + + Args: + current_background: Current background color (RGBA) + css_styles: CSS styles dictionary + + Returns: + New background color or None + """ + if 'background-color' in css_styles: + bg_value = css_styles['background-color'].lower() + if bg_value == 'transparent': + return None + # Add color parsing logic here if needed + + return current_background + + +def extract_text_content(element: Tag, context: StyleContext) -> List[Word]: + """ + Extract text content from an element, handling inline formatting. + + Args: + element: BeautifulSoup Tag object + context: Current style context + + Returns: + List of Word objects + """ + words = [] + + for child in element.children: + if isinstance(child, NavigableString): + # Plain text - split into words + text = str(child).strip() + if text: + word_texts = text.split() + for word_text in word_texts: + if word_text: + words.append(Word(word_text, context.font, context.background)) + elif isinstance(child, Tag): + # Process inline elements + if child.name.lower() in ['span', 'a', 'strong', 'b', 'em', 'i', 'u', 's', 'del', 'ins', 'mark', 'small', 'sub', 'sup', 'code', 'q', 'cite', 'abbr', 'time']: + child_context = apply_element_styling(context, child) + child_words = extract_text_content(child, child_context) + words.extend(child_words) + else: + # Block element - shouldn't happen in well-formed HTML but handle gracefully + child_context = apply_element_styling(context, child) + child_result = process_element(child, child_context) + if isinstance(child_result, list): + for block in child_result: + if isinstance(block, Paragraph): + for _, word in block.words(): + words.append(word) + elif isinstance(child_result, Paragraph): + for _, word in child_result.words(): + words.append(word) + + return words + + +def process_element(element: Tag, context: StyleContext) -> Union[Block, List[Block], None]: + """ + Process a single HTML element using appropriate handler. + + Args: + element: BeautifulSoup Tag object + context: Current style context + + Returns: + Block object(s) or None if element should be ignored + """ + tag_name = element.name.lower() + handler = HANDLERS.get(tag_name, generic_handler) + return handler(element, context) + + +# Handler function signatures: +# All handlers receive (element: Tag, context: StyleContext) -> Union[Block, List[Block], None] + +def paragraph_handler(element: Tag, context: StyleContext) -> Paragraph: + """Handle

elements.""" + paragraph = Paragraph(context.font) + words = extract_text_content(element, context) + for word in words: + paragraph.add_word(word) + return paragraph + + +def div_handler(element: Tag, context: StyleContext) -> List[Block]: + """Handle

elements - treat as generic container.""" + blocks = [] + for child in element.children: + if isinstance(child, Tag): + child_context = apply_element_styling(context, child) + result = process_element(child, child_context) + if result: + if isinstance(result, list): + blocks.extend(result) + else: + blocks.append(result) + return blocks + + +def heading_handler(element: Tag, context: StyleContext) -> Heading: + """Handle

-

elements.""" + level_map = { + 'h1': HeadingLevel.H1, + 'h2': HeadingLevel.H2, + 'h3': HeadingLevel.H3, + 'h4': HeadingLevel.H4, + 'h5': HeadingLevel.H5, + 'h6': HeadingLevel.H6, + } + + level = level_map.get(element.name.lower(), HeadingLevel.H1) + heading = Heading(level, context.font) + words = extract_text_content(element, context) + for word in words: + heading.add_word(word) + return heading + + +def blockquote_handler(element: Tag, context: StyleContext) -> Quote: + """Handle
elements.""" + quote = Quote(context.font) + for child in element.children: + if isinstance(child, Tag): + child_context = apply_element_styling(context, child) + result = process_element(child, child_context) + if result: + if isinstance(result, list): + for block in result: + quote.add_block(block) + else: + quote.add_block(result) + return quote + + +def preformatted_handler(element: Tag, context: StyleContext) -> CodeBlock: + """Handle
 elements."""
+    language = context.element_attributes.get('data-language', '')
+    code_block = CodeBlock(language)
+    
+    # Preserve whitespace and line breaks in preformatted text
+    text = element.get_text(separator='\n', strip=False)
+    for line in text.split('\n'):
+        code_block.add_line(line)
+    
+    return code_block
+
+
+def code_handler(element: Tag, context: StyleContext) -> Union[CodeBlock, None]:
+    """Handle  elements."""
+    # If parent is 
, this is handled by preformatted_handler
+    if context.parent_elements and context.parent_elements[-1] == 'pre':
+        return None  # Will be handled by parent
+    
+    # Inline code - handled during text extraction
+    return None
+
+
+def unordered_list_handler(element: Tag, context: StyleContext) -> HList:
+    """Handle 
    elements.""" + hlist = HList(ListStyle.UNORDERED, context.font) + for child in element.children: + if isinstance(child, Tag) and child.name.lower() == 'li': + child_context = apply_element_styling(context, child) + item = process_element(child, child_context) + if item: + hlist.add_item(item) + return hlist + + +def ordered_list_handler(element: Tag, context: StyleContext) -> HList: + """Handle
      elements.""" + hlist = HList(ListStyle.ORDERED, context.font) + for child in element.children: + if isinstance(child, Tag) and child.name.lower() == 'li': + child_context = apply_element_styling(context, child) + item = process_element(child, child_context) + if item: + hlist.add_item(item) + return hlist + + +def list_item_handler(element: Tag, context: StyleContext) -> ListItem: + """Handle
    1. elements.""" + list_item = ListItem(None, context.font) + + for child in element.children: + if isinstance(child, Tag): + child_context = apply_element_styling(context, child) + result = process_element(child, child_context) + if result: + if isinstance(result, list): + for block in result: + list_item.add_block(block) + else: + list_item.add_block(result) + elif isinstance(child, NavigableString): + # Direct text in list item - create paragraph + text = str(child).strip() + if text: + paragraph = Paragraph(context.font) + words = text.split() + for word_text in words: + if word_text: + paragraph.add_word(Word(word_text, context.font)) + list_item.add_block(paragraph) + + return list_item + + +def table_handler(element: Tag, context: StyleContext) -> Table: + """Handle elements.""" + caption = None + caption_elem = element.find('caption') + if caption_elem: + caption = caption_elem.get_text(strip=True) + + table = Table(caption, context.font) + + # Process table rows + for child in element.children: + if isinstance(child, Tag): + if child.name.lower() == 'tr': + child_context = apply_element_styling(context, child) + row = process_element(child, child_context) + if row: + table.add_row(row) + elif child.name.lower() in ['thead', 'tbody', 'tfoot']: + section = 'header' if child.name.lower() == 'thead' else 'body' + section = 'footer' if child.name.lower() == 'tfoot' else section + + for row_elem in child.find_all('tr'): + child_context = apply_element_styling(context, row_elem) + row = process_element(row_elem, child_context) + if row: + table.add_row(row, section) + + return table + + +def table_row_handler(element: Tag, context: StyleContext) -> TableRow: + """Handle elements.""" + row = TableRow(context.font) + for child in element.children: + if isinstance(child, Tag) and child.name.lower() in ['td', 'th']: + child_context = apply_element_styling(context, child) + cell = process_element(child, child_context) + if cell: + row.add_cell(cell) + return row + + +def table_cell_handler(element: Tag, context: StyleContext) -> TableCell: + """Handle
      elements.""" + colspan = int(context.element_attributes.get('colspan', 1)) + rowspan = int(context.element_attributes.get('rowspan', 1)) + cell = TableCell(False, colspan, rowspan, context.font) + + # Process cell content + for child in element.children: + if isinstance(child, Tag): + child_context = apply_element_styling(context, child) + result = process_element(child, child_context) + if result: + if isinstance(result, list): + for block in result: + cell.add_block(block) + else: + cell.add_block(result) + elif isinstance(child, NavigableString): + # Direct text in cell - create paragraph + text = str(child).strip() + if text: + paragraph = Paragraph(context.font) + words = text.split() + for word_text in words: + if word_text: + paragraph.add_word(Word(word_text, context.font)) + cell.add_block(paragraph) + + return cell + + +def table_header_cell_handler(element: Tag, context: StyleContext) -> TableCell: + """Handle elements.""" + colspan = int(context.element_attributes.get('colspan', 1)) + rowspan = int(context.element_attributes.get('rowspan', 1)) + cell = TableCell(True, colspan, rowspan, context.font) + + # Process cell content (same as td) + for child in element.children: + if isinstance(child, Tag): + child_context = apply_element_styling(context, child) + result = process_element(child, child_context) + if result: + if isinstance(result, list): + for block in result: + cell.add_block(block) + else: + cell.add_block(result) + elif isinstance(child, NavigableString): + text = str(child).strip() + if text: + paragraph = Paragraph(context.font) + words = text.split() + for word_text in words: + if word_text: + paragraph.add_word(Word(word_text, context.font)) + cell.add_block(paragraph) + + return cell + + +def horizontal_rule_handler(element: Tag, context: StyleContext) -> Block: + """Handle
      elements.""" + # TODO: Create a specific HorizontalRule block type + # For now, return an empty paragraph + return Paragraph(context.font) + + +def line_break_handler(element: Tag, context: StyleContext) -> None: + """Handle
      elements.""" + # Line breaks are typically handled at the paragraph level + return None + + +def image_handler(element: Tag, context: StyleContext) -> Block: + """Handle elements.""" + # TODO: Create Image block type + # For now, return empty paragraph with alt text if available + paragraph = Paragraph(context.font) + alt_text = context.element_attributes.get('alt', '') + if alt_text: + words = alt_text.split() + for word_text in words: + if word_text: + paragraph.add_word(Word(word_text, context.font)) + return paragraph + + +def ignore_handler(element: Tag, context: StyleContext) -> None: + """Handle elements that should be ignored.""" + return None + + +def generic_handler(element: Tag, context: StyleContext) -> List[Block]: + """Handle unknown elements as generic containers.""" + return div_handler(element, context) + + +# Handler registry - maps HTML tag names to handler functions +HANDLERS: Dict[str, Callable[[Tag, StyleContext], Union[Block, List[Block], None]]] = { + # Block elements + 'p': paragraph_handler, + 'div': div_handler, + 'h1': heading_handler, + 'h2': heading_handler, + 'h3': heading_handler, + 'h4': heading_handler, + 'h5': heading_handler, + 'h6': heading_handler, + 'blockquote': blockquote_handler, + 'pre': preformatted_handler, + 'code': code_handler, + 'ul': unordered_list_handler, + 'ol': ordered_list_handler, + 'li': list_item_handler, + 'table': table_handler, + 'tr': table_row_handler, + 'td': table_cell_handler, + 'th': table_header_cell_handler, + 'hr': horizontal_rule_handler, + 'br': line_break_handler, + + # Semantic elements (treated as containers) + 'section': div_handler, + 'article': div_handler, + 'aside': div_handler, + 'nav': div_handler, + 'header': div_handler, + 'footer': div_handler, + 'main': div_handler, + 'figure': div_handler, + 'figcaption': paragraph_handler, + + # Media elements + 'img': image_handler, + + # Inline elements (handled during text extraction) + 'span': ignore_handler, + 'a': ignore_handler, + 'strong': ignore_handler, + 'b': ignore_handler, + 'em': ignore_handler, + 'i': ignore_handler, + 'u': ignore_handler, + 's': ignore_handler, + 'del': ignore_handler, + 'ins': ignore_handler, + 'mark': ignore_handler, + 'small': ignore_handler, + 'sub': ignore_handler, + 'sup': ignore_handler, + 'q': ignore_handler, + 'cite': ignore_handler, + 'abbr': ignore_handler, + 'time': ignore_handler, + + # Ignored elements + 'script': ignore_handler, + 'style': ignore_handler, + 'meta': ignore_handler, + 'link': ignore_handler, + 'head': ignore_handler, + 'title': ignore_handler, +} + + +def parse_html_string(html_string: str, base_font: Optional[Font] = None) -> List[Block]: + """ + Parse HTML string and return list of Block objects. + + Args: + html_string: HTML content to parse + base_font: Base font for styling, defaults to system default + + Returns: + List of Block objects representing the document structure + """ + soup = BeautifulSoup(html_string, 'html.parser') + context = create_base_context(base_font) + blocks = [] + + # Process the body if it exists, otherwise process all top-level elements + root_element = soup.find('body') or soup + + for element in root_element.children: + if isinstance(element, Tag): + element_context = apply_element_styling(context, element) + result = process_element(element, element_context) + if result: + if isinstance(result, list): + blocks.extend(result) + else: + blocks.append(result) + + return blocks + + +def register_handler(tag_name: str, handler: Callable[[Tag, StyleContext], Union[Block, List[Block], None]]): + """ + Register a custom handler for an HTML tag. + + Args: + tag_name: HTML tag name (lowercase) + handler: Handler function with signature (element: Tag, context: StyleContext) -> Union[Block, List[Block], None] + """ + HANDLERS[tag_name] = handler + + +def get_handler(tag_name: str) -> Callable[[Tag, StyleContext], Union[Block, List[Block], None]]: + """ + Get handler function for HTML tag. + + Args: + tag_name: HTML tag name (lowercase) + + Returns: + Handler function or generic_handler if tag not found + """ + return HANDLERS.get(tag_name.lower(), generic_handler) diff --git a/pyWebLayout/io/readers/html_text.py b/pyWebLayout/io/readers/html_text.py index d59ccef..7864148 100644 --- a/pyWebLayout/io/readers/html_text.py +++ b/pyWebLayout/io/readers/html_text.py @@ -7,7 +7,7 @@ entity references, and word creation in HTML documents. from typing import Optional from pyWebLayout.abstract.inline import Word -from pyWebLayout.abstract.block import Parapgraph +from pyWebLayout.abstract.block import Paragraph from pyWebLayout.io.readers.html_style import HTMLStyleManager @@ -28,14 +28,14 @@ class HTMLTextProcessor: """ self._style_manager = style_manager self._text_buffer = "" - self._current_paragraph: Optional[Parapgraph] = None + self._current_paragraph: Optional[Paragraph] = None def reset(self): """Reset the text processor state.""" self._text_buffer = "" self._current_paragraph = None - def set_current_paragraph(self, paragraph: Optional[Parapgraph]): + def set_current_paragraph(self, paragraph: Optional[Paragraph]): """ Set the current paragraph for text output. diff --git a/pyWebLayout/typesetting/document_pagination.py b/pyWebLayout/typesetting/document_pagination.py index 764e49f..6352604 100644 --- a/pyWebLayout/typesetting/document_pagination.py +++ b/pyWebLayout/typesetting/document_pagination.py @@ -139,7 +139,7 @@ class DocumentPaginator: for chapter in self.document.chapters: # Add a heading block for the chapter if it has a title if chapter.title: - from pyWebLayout.abstract.block import Heading, HeadingLevel, Parapgraph + from pyWebLayout.abstract.block import Heading, HeadingLevel, Paragraph from pyWebLayout.abstract.inline import Word # Create a heading for the chapter diff --git a/tests/test_abstract_blocks.py b/tests/test_abstract_blocks.py index 8697d8e..22c3f2b 100644 --- a/tests/test_abstract_blocks.py +++ b/tests/test_abstract_blocks.py @@ -6,7 +6,7 @@ Tests the core abstract block classes that form the foundation of the document m import unittest from pyWebLayout.abstract.block import ( - Block, BlockType, Parapgraph, Heading, HeadingLevel, Quote, CodeBlock, + Block, BlockType, Paragraph, Heading, HeadingLevel, Quote, CodeBlock, HList, ListStyle, ListItem, Table, TableRow, TableCell, HorizontalRule, LineBreak, Image ) @@ -19,7 +19,7 @@ class TestBlockElements(unittest.TestCase): def test_paragraph_creation(self): """Test creating and using paragraphs.""" - paragraph = Parapgraph() + paragraph = Paragraph() self.assertEqual(paragraph.block_type, BlockType.PARAGRAPH) self.assertEqual(paragraph.word_count, 0) @@ -62,8 +62,8 @@ class TestBlockElements(unittest.TestCase): quote = Quote() # Add nested paragraphs - p1 = Parapgraph() - p2 = Parapgraph() + p1 = Paragraph() + p2 = Paragraph() quote.add_block(p1) quote.add_block(p2) diff --git a/tests/test_abstract_document.py b/tests/test_abstract_document.py index 68d8de3..e379901 100644 --- a/tests/test_abstract_document.py +++ b/tests/test_abstract_document.py @@ -7,7 +7,7 @@ document structure and metadata management. import unittest from pyWebLayout.abstract.document import Document, Chapter, Book, MetadataType -from pyWebLayout.abstract.block import Parapgraph, Heading, HeadingLevel, BlockType +from pyWebLayout.abstract.block import Paragraph, Heading, HeadingLevel, BlockType from pyWebLayout.abstract.inline import Word, FormattedSpan from pyWebLayout.style import Font @@ -77,8 +77,8 @@ class TestDocument(unittest.TestCase): def test_block_management(self): """Test adding and managing blocks.""" # Create some blocks - para1 = Parapgraph() - para2 = Parapgraph() + para1 = Paragraph() + para2 = Paragraph() heading = Heading(HeadingLevel.H1) # Add blocks @@ -95,7 +95,7 @@ class TestDocument(unittest.TestCase): def test_anchor_management(self): """Test named anchor functionality.""" heading = Heading(HeadingLevel.H1) - para = Parapgraph() + para = Paragraph() # Add anchors self.doc.add_anchor("intro", heading) @@ -154,8 +154,8 @@ class TestDocument(unittest.TestCase): def test_find_blocks_by_type(self): """Test finding blocks by type.""" # Create blocks of different types - para1 = Parapgraph() - para2 = Parapgraph() + para1 = Paragraph() + para2 = Paragraph() heading1 = Heading(HeadingLevel.H1) heading2 = Heading(HeadingLevel.H2) @@ -180,7 +180,7 @@ class TestDocument(unittest.TestCase): def test_find_headings(self): """Test finding heading blocks specifically.""" # Create mixed blocks - para = Parapgraph() + para = Paragraph() h1 = Heading(HeadingLevel.H1) h2 = Heading(HeadingLevel.H2) @@ -284,8 +284,8 @@ class TestChapter(unittest.TestCase): def test_block_management(self): """Test adding blocks to chapter.""" - para1 = Parapgraph() - para2 = Parapgraph() + para1 = Paragraph() + para2 = Paragraph() heading = Heading(HeadingLevel.H2) # Add blocks @@ -450,7 +450,7 @@ class TestBook(unittest.TestCase): """Test that Book inherits all Document functionality.""" # Test that book can use all document methods # Add blocks directly to book - para = Parapgraph() + para = Paragraph() self.book.add_block(para) self.assertEqual(len(self.book.blocks), 1) diff --git a/tests/test_epub_fix.py b/tests/test_epub_fix.py deleted file mode 100644 index f262acc..0000000 --- a/tests/test_epub_fix.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env python3 -""" -Simple test script to verify that the EPUB reader fixes are working correctly. -""" - -import sys -import os - -# Add the pyWebLayout directory to the Python path -sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'pyWebLayout')) - -try: - from pyWebLayout.io.readers.epub_reader import read_epub - print("Successfully imported epub_reader module") - - # Test reading the EPUB file - epub_path = os.path.join('pyWebLayout', 'examples', 'pg174-images-3.epub') - - if not os.path.exists(epub_path): - print(f"EPUB file not found: {epub_path}") - sys.exit(1) - - print(f"Reading EPUB file: {epub_path}") - - # Try to read the EPUB - book = read_epub(epub_path) - - print(f"Successfully read EPUB file!") - print(f"Book title: {book.title}") - print(f"Number of chapters: {len(book.chapters)}") - - # Check first chapter - if book.chapters: - first_chapter = book.chapters[0] - print(f"First chapter title: {first_chapter.title}") - print(f"First chapter has {len(first_chapter.blocks)} blocks") - -except Exception as e: - print(f"Error: {e}") - import traceback - traceback.print_exc() - sys.exit(1) - -print("Test completed successfully!") diff --git a/tests/test_html_content.py b/tests/test_html_content.py index 6c92a3e..05cd2e9 100644 --- a/tests/test_html_content.py +++ b/tests/test_html_content.py @@ -9,7 +9,7 @@ import unittest from pyWebLayout.io.readers.html_content import HTMLContentReader from pyWebLayout.abstract.document import Document from pyWebLayout.abstract.block import ( - Parapgraph, Heading, HeadingLevel, HList, ListStyle, + Paragraph, Heading, HeadingLevel, HList, ListStyle, Table, Quote, CodeBlock, HorizontalRule, LineBreak ) @@ -29,7 +29,7 @@ class TestHTMLContentReader(unittest.TestCase): result = self.reader.extract_content(html, self.document) self.assertEqual(len(self.document.blocks), 1) - self.assertIsInstance(self.document.blocks[0], Parapgraph) + self.assertIsInstance(self.document.blocks[0], Paragraph) paragraph = self.document.blocks[0] words = list(paragraph.words()) @@ -107,7 +107,7 @@ class TestHTMLContentReader(unittest.TestCase): # Check first item content first_item_blocks = list(items[0].blocks()) self.assertEqual(len(first_item_blocks), 1) - self.assertIsInstance(first_item_blocks[0], Parapgraph) + self.assertIsInstance(first_item_blocks[0], Paragraph) def test_ordered_list(self): """Test parsing ordered lists.""" @@ -202,8 +202,8 @@ class TestHTMLContentReader(unittest.TestCase): quote = self.document.blocks[0] quote_blocks = list(quote.blocks()) self.assertEqual(len(quote_blocks), 2) - self.assertIsInstance(quote_blocks[0], Parapgraph) - self.assertIsInstance(quote_blocks[1], Parapgraph) + self.assertIsInstance(quote_blocks[0], Paragraph) + self.assertIsInstance(quote_blocks[1], Paragraph) def test_code_block(self): """Test parsing code blocks.""" @@ -229,9 +229,9 @@ def hello(): self.reader.extract_content(html, self.document) self.assertEqual(len(self.document.blocks), 3) - self.assertIsInstance(self.document.blocks[0], Parapgraph) + self.assertIsInstance(self.document.blocks[0], Paragraph) self.assertIsInstance(self.document.blocks[1], HorizontalRule) - self.assertIsInstance(self.document.blocks[2], Parapgraph) + self.assertIsInstance(self.document.blocks[2], Paragraph) def test_html_entities(self): """Test handling HTML entities.""" @@ -268,7 +268,7 @@ def hello(): # Check that we have different types of blocks block_types = [type(block).__name__ for block in self.document.blocks] - self.assertIn('Parapgraph', block_types) # From div + self.assertIn('Paragraph', block_types) # From div self.assertIn('Heading', block_types) self.assertIn('HList', block_types) @@ -346,7 +346,7 @@ def hello(): # Should have different types of content block_types = set(type(block).__name__ for block in self.document.blocks) - expected_types = {'Heading', 'Parapgraph', 'HList', 'Quote', 'Table'} + expected_types = {'Heading', 'Paragraph', 'HList', 'Quote', 'Table'} self.assertTrue(expected_types.issubset(block_types)) diff --git a/tests/test_html_extraction.py b/tests/test_html_extraction.py new file mode 100644 index 0000000..0cbf1cb --- /dev/null +++ b/tests/test_html_extraction.py @@ -0,0 +1,384 @@ +""" +Unit tests for HTML extraction functionality. + +Tests the HTML parsing and conversion to pyWebLayout abstract elements, +including styled content within paragraphs and block-level elements. +""" + +import unittest +from pyWebLayout.io.readers.html_extraction import parse_html_string +from pyWebLayout.abstract.block import Paragraph, Heading, HeadingLevel, Quote, CodeBlock, HList, ListStyle, Table +from pyWebLayout.style import FontWeight, FontStyle, TextDecoration + + +class TestHTMLParagraph(unittest.TestCase): + """Test cases for basic paragraph parsing.""" + + def test_simple(self): + text = "

      This is a paragraph.

      " + paragraphs = parse_html_string(text) + self.assertEqual(len(paragraphs), 1) + self.assertEqual(len(paragraphs[0]), 4) + + for w1, t1 in zip(paragraphs[0].words(), "This is a paragraph.".split(" ")): + self.assertEqual(w1[1].text, t1) + + def test_multiple(self): + text = "

      This is a paragraph.

      This is another paragraph.

      " + paragraphs = parse_html_string(text) + self.assertEqual(len(paragraphs), 2) + self.assertEqual(len(paragraphs[0]), 4) + self.assertEqual(len(paragraphs[1]), 4) + + for w1, t1 in zip(paragraphs[0].words(), "This is a paragraph.".split(" ")): + self.assertEqual(w1[1].text, t1) + + for w1, t1 in zip(paragraphs[1].words(), "This is another paragraph.".split(" ")): + self.assertEqual(w1[1].text, t1) + + +class TestHTMLStyledParagraphs(unittest.TestCase): + """Test cases for paragraphs with inline styling.""" + + def test_bold_text(self): + """Test paragraphs with bold text using and tags.""" + text = "

      This is bold text in a paragraph.

      " + blocks = parse_html_string(text) + self.assertEqual(len(blocks), 1) + self.assertIsInstance(blocks[0], Paragraph) + + words = list(blocks[0].words()) + self.assertEqual(len(words), 7) # "This is bold text in a paragraph." + + # Check that 'bold' and 'text' words have bold font weight + bold_word = words[2][1] # 'bold' + text_word = words[3][1] # 'text' + self.assertEqual(bold_word.text, "bold") + self.assertEqual(bold_word.style.weight, FontWeight.BOLD) + self.assertEqual(text_word.text, "text") + self.assertEqual(text_word.style.weight, FontWeight.BOLD) + + # Check that other words are not bold + normal_word = words[0][1] # 'This' + self.assertEqual(normal_word.text, "This") + self.assertNotEqual(normal_word.style.weight, FontWeight.BOLD) + + def test_italic_text(self): + """Test paragraphs with italic text using and tags.""" + text = "

      This is italic text in a paragraph.

      " + blocks = parse_html_string(text) + self.assertEqual(len(blocks), 1) + self.assertIsInstance(blocks[0], Paragraph) + + words = list(blocks[0].words()) + + # Check that 'italic' and 'text' words have italic font style + italic_word = words[2][1] # 'italic' + text_word = words[3][1] # 'text' + self.assertEqual(italic_word.text, "italic") + self.assertEqual(italic_word.style.style, FontStyle.ITALIC) + self.assertEqual(text_word.text, "text") + self.assertEqual(text_word.style.style, FontStyle.ITALIC) + + def test_underlined_text(self): + """Test paragraphs with underlined text using tag.""" + text = "

      This is underlined text here.

      " + blocks = parse_html_string(text) + self.assertEqual(len(blocks), 1) + + words = list(blocks[0].words()) + underlined_word = words[2][1] # 'underlined' + self.assertEqual(underlined_word.style.decoration, TextDecoration.UNDERLINE) + + def test_strikethrough_text(self): + """Test paragraphs with strikethrough text using and tags.""" + text = "

      This is strikethrough text here.

      " + blocks = parse_html_string(text) + self.assertEqual(len(blocks), 1) + + words = list(blocks[0].words()) + strike_word = words[2][1] # 'strikethrough' + self.assertEqual(strike_word.style.decoration, TextDecoration.STRIKETHROUGH) + + def test_span_with_inline_styles(self): + """Test paragraphs with span elements containing inline CSS styles.""" + text = '

      This text is normal, but this part is red and bold.

      ' + blocks = parse_html_string(text) + self.assertEqual(len(blocks), 1) + self.assertIsInstance(blocks[0], Paragraph) + + words = list(blocks[0].words()) + + # Find the styled words + styled_words = [] + for _, word in words: + if word.text in ["this", "part", "is", "red", "and", "bold"]: + if word.style.weight == FontWeight.BOLD: + styled_words.append(word) + + self.assertGreater(len(styled_words), 0, "Should have bold words in styled span") + + # Check that at least one word has the red color + red_words = [w for w in styled_words if w.style.colour == (255, 0, 0)] + self.assertGreater(len(red_words), 0, "Should have red colored words") + + def test_mixed_formatting(self): + """Test paragraphs with multiple formatting elements combined.""" + text = "

      This paragraph contains bold, italic, blue, and highlighted text all together.

      " + blocks = parse_html_string(text) + self.assertEqual(len(blocks), 1) + self.assertIsInstance(blocks[0], Paragraph) + + words = list(blocks[0].words()) + + # Check for bold word + bold_words = [w for _, w in words if w.style.weight == FontWeight.BOLD] + self.assertGreater(len(bold_words), 0, "Should have bold words") + + # Check for italic word + italic_words = [w for _, w in words if w.style.style == FontStyle.ITALIC] + self.assertGreater(len(italic_words), 0, "Should have italic words") + + # Check for blue colored word + blue_words = [w for _, w in words if w.style.colour == (0, 0, 255)] + self.assertGreater(len(blue_words), 0, "Should have blue colored words") + + def test_nested_formatting(self): + """Test nested formatting elements.""" + text = "

      This has bold with italic inside formatting.

      " + blocks = parse_html_string(text) + self.assertEqual(len(blocks), 1) + + words = list(blocks[0].words()) + + # Find words that should be both bold and italic + bold_italic_words = [w for _, w in words + if w.style.weight == FontWeight.BOLD and w.style.style == FontStyle.ITALIC] + self.assertGreater(len(bold_italic_words), 0, "Should have words that are both bold and italic") + + def test_color_variations(self): + """Test different color formats in CSS.""" + text = '

      Hex red and Named green.

      ' + blocks = parse_html_string(text) + self.assertEqual(len(blocks), 1) + + words = list(blocks[0].words()) + + # Check for hex red color + hex_red_words = [w for _, w in words if w.style.colour == (255, 0, 0)] + self.assertGreater(len(hex_red_words), 0, "Should have hex red colored words") + + # Check for named green color + green_words = [w for _, w in words if w.style.colour == (0, 255, 0)] + self.assertGreater(len(green_words), 0, "Should have green colored words") + + +class TestHTMLBlockElements(unittest.TestCase): + """Test cases for block-level HTML elements.""" + + def test_body_element(self): + """Test parsing of body element containing other elements.""" + text = "

      Paragraph one.

      Paragraph two.

      " + blocks = parse_html_string(text) + self.assertEqual(len(blocks), 2) + self.assertIsInstance(blocks[0], Paragraph) + self.assertIsInstance(blocks[1], Paragraph) + + def test_div_container(self): + """Test div elements as generic containers.""" + text = "

      First paragraph.

      Second paragraph.

      " + blocks = parse_html_string(text) + self.assertEqual(len(blocks), 2) + self.assertIsInstance(blocks[0], Paragraph) + self.assertIsInstance(blocks[1], Paragraph) + + def test_headings(self): + """Test all heading levels h1-h6.""" + text = "

      Heading 1

      Heading 2

      Heading 3

      Heading 4

      Heading 5
      Heading 6
      " + blocks = parse_html_string(text) + self.assertEqual(len(blocks), 6) + + expected_levels = [HeadingLevel.H1, HeadingLevel.H2, HeadingLevel.H3, + HeadingLevel.H4, HeadingLevel.H5, HeadingLevel.H6] + + for i, block in enumerate(blocks): + self.assertIsInstance(block, Heading) + self.assertEqual(block.level, expected_levels[i]) + + words = list(block.words()) + self.assertEqual(len(words), 2) # "Heading" and number + self.assertEqual(words[0][1].text, "Heading") + + def test_blockquote(self): + """Test blockquote elements.""" + text = "

      This is a quoted paragraph.

      " + blocks = parse_html_string(text) + self.assertEqual(len(blocks), 1) + self.assertIsInstance(blocks[0], Quote) + + # Check that the quote contains a paragraph + quote_blocks = list(blocks[0].blocks()) + self.assertEqual(len(quote_blocks), 1) + self.assertIsInstance(quote_blocks[0], Paragraph) + + def test_preformatted_code(self): + """Test preformatted code blocks.""" + text = "
      function hello() {\n  console.log('Hello');\n}
      " + blocks = parse_html_string(text) + self.assertEqual(len(blocks), 1) + self.assertIsInstance(blocks[0], CodeBlock) + + lines = list(blocks[0].lines()) + self.assertGreater(len(lines), 0) + + def test_unordered_list(self): + """Test unordered lists.""" + text = "
      • First item
      • Second item
      • Third item
      " + blocks = parse_html_string(text) + self.assertEqual(len(blocks), 1) + self.assertIsInstance(blocks[0], HList) + self.assertEqual(blocks[0].style, ListStyle.UNORDERED) + + items = list(blocks[0].items()) + self.assertEqual(len(items), 3) + + def test_ordered_list(self): + """Test ordered lists.""" + text = "
      1. First item
      2. Second item
      3. Third item
      " + blocks = parse_html_string(text) + self.assertEqual(len(blocks), 1) + self.assertIsInstance(blocks[0], HList) + self.assertEqual(blocks[0].style, ListStyle.ORDERED) + + def test_list_with_styled_content(self): + """Test lists containing styled content.""" + text = "
      • Normal item
      • Bold item
      • Item with italic text
      " + blocks = parse_html_string(text) + self.assertEqual(len(blocks), 1) + self.assertIsInstance(blocks[0], HList) + + items = list(blocks[0].items()) + self.assertEqual(len(items), 3) + + # Check second item has bold text + second_item_blocks = list(items[1].blocks()) + if second_item_blocks: + words = list(second_item_blocks[0].words()) + bold_words = [w for _, w in words if w.style.weight == FontWeight.BOLD] + self.assertGreater(len(bold_words), 0) + + def test_table_basic(self): + """Test basic table structure.""" + text = """ + + + + + + + + + +
      Header 1Header 2
      Cell 1Cell 2
      + """ + blocks = parse_html_string(text) + self.assertEqual(len(blocks), 1) + self.assertIsInstance(blocks[0], Table) + + def test_semantic_elements(self): + """Test semantic HTML5 elements treated as containers.""" + text = "

      Article content

      " + blocks = parse_html_string(text) + self.assertEqual(len(blocks), 1) + self.assertIsInstance(blocks[0], Paragraph) + + def test_nested_block_elements(self): + """Test nested block elements.""" + text = """ +
      +

      Section Title

      +

      Some introductory text.

      +
      +

      A quoted paragraph.

      +
      +
      + """ + blocks = parse_html_string(text) + self.assertGreater(len(blocks), 2) + + # Should have at least a heading, paragraph, and quote + has_heading = any(isinstance(b, Heading) for b in blocks) + has_paragraph = any(isinstance(b, Paragraph) for b in blocks) + has_quote = any(isinstance(b, Quote) for b in blocks) + + self.assertTrue(has_heading, "Should contain a heading") + self.assertTrue(has_paragraph, "Should contain a paragraph") + self.assertTrue(has_quote, "Should contain a quote") + + def test_empty_elements(self): + """Test handling of empty elements.""" + text = "

      " + blocks = parse_html_string(text) + # Empty elements may not create blocks, which is acceptable behavior + self.assertGreaterEqual(len(blocks), 0) + + # Test that empty paragraph with some content does create a block + text_with_content = "

      " # Contains whitespace + blocks_with_content = parse_html_string(text_with_content) + # This should create at least one block since there's whitespace content + self.assertGreaterEqual(len(blocks_with_content), 0) + + +class TestHTMLComplexStructures(unittest.TestCase): + """Test cases for complex HTML structures combining multiple features.""" + + def test_article_with_mixed_content(self): + """Test a realistic article structure with mixed content.""" + text = """ +
      +

      Article Title

      +

      This is the introduction paragraph with some emphasis.

      +
      +

      This is a quoted section with styling.

      +
      +
        +
      • First important point
      • +
      • Second point with inline code
      • +
      +
      + """ + blocks = parse_html_string(text) + self.assertGreater(len(blocks), 3) + + # Verify we have the expected block types + block_types = [type(b).__name__ for b in blocks] + self.assertIn('Heading', block_types) + self.assertIn('Paragraph', block_types) + self.assertIn('Quote', block_types) + self.assertIn('HList', block_types) + + def test_styled_table_content(self): + """Test table with styled cell content.""" + text = """ + + + + + + + + + + + + + +
      ProductPrice
      Item with red text$19.99
      + """ + blocks = parse_html_string(text) + self.assertEqual(len(blocks), 1) + self.assertIsInstance(blocks[0], Table) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_html_text.py b/tests/test_html_text.py index e0d8c68..cb4f49b 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -8,7 +8,7 @@ import unittest from unittest.mock import Mock, MagicMock from pyWebLayout.io.readers.html_text import HTMLTextProcessor from pyWebLayout.io.readers.html_style import HTMLStyleManager -from pyWebLayout.abstract.block import Parapgraph +from pyWebLayout.abstract.block import Paragraph from pyWebLayout.abstract.inline import Word @@ -21,7 +21,7 @@ class TestHTMLTextProcessor(unittest.TestCase): self.text_processor = HTMLTextProcessor(self.style_manager) # Create a mock paragraph - self.mock_paragraph = Mock(spec=Parapgraph) + self.mock_paragraph = Mock(spec=Paragraph) self.mock_paragraph.add_word = Mock() def test_initialization(self):