better parsing using handlers

2025-06-07 14:38:11 +02:00 · 2025-06-07 14:38:11 +02:00 · ba6d8ca906
commit ba6d8ca906
parent 81d85386c5
15 changed files with 1189 additions and 97 deletions
--- a/pyWebLayout/abstract/init.py
+++ b/pyWebLayout/abstract/init.py
@ -1,6 +1,6 @@
 from .block import Block, BlockType, Paragraph, Heading, HeadingLevel, Quote, CodeBlock
 from .block import HList, ListItem, ListStyle, Table, TableRow, TableCell
-from .block import HorizontalRule, LineBreak, Image
+#from .block import HorizontalRule, LineBreak, Image
 from .inline import Word, FormattedSpan
 from .document import Document, MetadataType, Chapter, Book
 from .functional import Link, LinkType, Button, Form, FormField, FormFieldType
--- a/pyWebLayout/abstract/block.py
+++ b/pyWebLayout/abstract/block.py
@ -183,6 +183,10 @@ class Paragraph(Block):
    def word_count(self) -> int:
        """Get the number of words in this paragraph"""
        return len(self._words)
    def __len__(self):
        """Expose the paragraph's word count through ``len(paragraph)``."""
        # NOTE(review): `self.word_count` is returned without being called. That is
        # only a valid length if `word_count` is exposed as a property (any decorator
        # sits above the visible hunk) -- otherwise len() raises TypeError. TODO confirm.
        return self.word_count
 class HeadingLevel(Enum):
@ -1008,3 +1012,9 @@ class Table(Block):
            self._footer_rows.append(row)
        else:  # Default to body
            self._rows
 class Image:
    """Empty placeholder class; it defines no attributes or behaviour yet."""
    pass
--- a/pyWebLayout/abstract/inline.py
+++ b/pyWebLayout/abstract/inline.py
@ -330,3 +330,8 @@ class FormattedSpan:
        self._words.append(word)
        return word
 class LineBreak:
    """Empty placeholder class; it defines no attributes or behaviour yet."""
    pass
--- a/pyWebLayout/html_parser.py
+++ b/pyWebLayout/html_parser.py
@ -7,7 +7,7 @@ from PIL import Image
 from .style import Font, FontStyle, FontWeight, TextDecoration
 from .abstract.document import Document, MetadataType, Book, Chapter
 from .abstract.block import (
-    Block, BlockType, Parapgraph, Heading, HeadingLevel, Quote, CodeBlock,
+    Block, BlockType, Paragraph, Heading, HeadingLevel, Quote, CodeBlock,
    HList, ListStyle, ListItem, Table, TableRow, TableCell, HorizontalRule, LineBreak
 )
 from .abstract.inline import Word, FormattedSpan
@ -138,7 +138,7 @@ class HTMLParser(BaseHTMLParser):
        elif tag == 'p':
            self._flush_text()  # Flush any pending text
-            self._current_paragraph = Parapgraph()
+            self._current_paragraph = Paragraph()
            # Add the paragraph to the current block or document
            if self._current_block and hasattr(self._current_block, 'add_block'):
@ -180,7 +180,7 @@ class HTMLParser(BaseHTMLParser):
            self._flush_text()  # Flush any pending text
            # For divs, we create a new paragraph as a container
-            div_para = Parapgraph()
+            div_para = Paragraph()
            # Add the div to the current block or document
            if self._current_block and hasattr(self._current_block, 'add_block'):
@ -214,7 +214,7 @@ class HTMLParser(BaseHTMLParser):
            # Pre can optionally contain a code block
            # We'll create a paragraph for now, and if we find a code tag inside,
            # we'll replace it with a code block
-            pre_para = Parapgraph()
+            pre_para = Paragraph()
            # Add the pre to the current block or document
            if self._current_block and hasattr(self._current_block, 'add_block'):
@ -229,7 +229,7 @@ class HTMLParser(BaseHTMLParser):
        elif tag == 'code':
            # If we're inside a pre, replace the paragraph with a code block
-            if self._block_stack and isinstance(self._block_stack[-1], Parapgraph):
+            if self._block_stack and isinstance(self._block_stack[-1], Paragraph):
                pre_para = self._block_stack.pop()
                # Get the language from class if specified (e.g., class="language-python")
@ -312,7 +312,7 @@ class HTMLParser(BaseHTMLParser):
            self._current_block = list_item
            # Create a paragraph for the term content
-            term_para = Parapgraph()
+            term_para = Paragraph()
            list_item.add_block(term_para)
            self._current_paragraph = term_para
@ -325,7 +325,7 @@ class HTMLParser(BaseHTMLParser):
                list_item = current_list._items[-1]
                # Create a paragraph for the description content
-                desc_para = Parapgraph()
+                desc_para = Paragraph()
                list_item.add_block(desc_para)
                # Update current state
@ -340,7 +340,7 @@ class HTMLParser(BaseHTMLParser):
                self._current_block = list_item
                # Create a paragraph for the description content
-                desc_para = Parapgraph()
+                desc_para = Paragraph()
                list_item.add_block(desc_para)
                self._current_paragraph = desc_para
@ -424,7 +424,7 @@ class HTMLParser(BaseHTMLParser):
            self._current_block = cell
            # Create a paragraph for the cell content
-            cell_para = Parapgraph()
+            cell_para = Paragraph()
            cell.add_block(cell_para)
            self._current_paragraph = cell_para
@ -508,6 +508,7 @@ class HTMLParser(BaseHTMLParser):
            })
        elif tag == 'br':
            # Add a line break
            if self._current_paragraph:
                line_break = LineBreak()
--- a/pyWebLayout/io/readers/epub_reader.py
+++ b/pyWebLayout/io/readers/epub_reader.py
@ -379,10 +379,10 @@ class EPUBReader:
            except Exception as e:
                print(f"Error parsing chapter {i+1}: {str(e)}")
                # Add an error message block
-                from pyWebLayout.abstract.block import Parapgraph
+                from pyWebLayout.abstract.block import Paragraph
                from pyWebLayout.abstract.inline import Word
                from pyWebLayout.style import Font
-                error_para = Parapgraph()
+                error_para = Paragraph()
                # Create a default font style for the error message
                default_font = Font()
                error_para.add_word(Word(f"Error loading chapter: {str(e)}", default_font))
--- a/pyWebLayout/io/readers/html_elements.py
+++ b/pyWebLayout/io/readers/html_elements.py
@ -9,9 +9,9 @@ from typing import Dict, List, Optional, Any
 import urllib.parse
 from pyWebLayout.abstract.document import Document
 from pyWebLayout.abstract.block import (
-    Block, Parapgraph, Heading, HeadingLevel, Quote, CodeBlock,
+    Block, Paragraph, Heading, HeadingLevel, Quote, CodeBlock,
    HList, ListStyle, ListItem, Table, TableRow, TableCell, 
-    HorizontalRule, LineBreak, Image
+    #HorizontalRule, LineBreak, Image
 )
 from pyWebLayout.abstract.functional import Link, LinkType
 from pyWebLayout.io.readers.html_style import HTMLStyleManager
@ -26,7 +26,7 @@ class BlockElementHandler:
        self.text_processor = text_processor
        self.block_stack: List[Block] = []
        self.current_block: Optional[Block] = None
-        self.current_paragraph: Optional[Parapgraph] = None
+        self.current_paragraph: Optional[Paragraph] = None
    def reset(self):
        """Reset the handler state."""
@ -44,7 +44,7 @@ class BlockElementHandler:
    def handle_paragraph_start(self, document: Document):
        """Handle the start of a paragraph element."""
        self.text_processor.flush_text()
-        paragraph = Parapgraph()
+        paragraph = Paragraph()
        self.add_block_to_document_or_parent(paragraph, document)
        self.block_stack.append(paragraph)
@ -71,7 +71,7 @@ class BlockElementHandler:
    def handle_div_start(self, document: Document):
        """Handle the start of a div element."""
        self.text_processor.flush_text()
-        div_para = Parapgraph()
+        div_para = Paragraph()
        self.add_block_to_document_or_parent(div_para, document)
        self.block_stack.append(div_para)
@ -93,7 +93,7 @@ class BlockElementHandler:
    def handle_pre_start(self, document: Document):
        """Handle the start of a pre element."""
        self.text_processor.flush_text()
-        pre_para = Parapgraph()
+        pre_para = Paragraph()
        self.add_block_to_document_or_parent(pre_para, document)
        self.block_stack.append(pre_para)
@ -104,7 +104,7 @@ class BlockElementHandler:
    def handle_code_start(self, attrs: Dict[str, str], document: Document):
        """Handle the start of a code element."""
        # If we're inside a pre, replace the paragraph with a code block
-        if self.block_stack and isinstance(self.block_stack[-1], Parapgraph):
+        if self.block_stack and isinstance(self.block_stack[-1], Paragraph):
            pre_para = self.block_stack.pop()
            # Get the language from class if specified
@ -145,7 +145,7 @@ class BlockElementHandler:
        if self.block_stack:
            self.current_block = self.block_stack[-1]
            # Update current paragraph based on block type
-            if isinstance(self.current_block, Parapgraph):
+            if isinstance(self.current_block, Paragraph):
                self.current_paragraph = self.current_block
            else:
                self.current_paragraph = None
@ -201,7 +201,7 @@ class ListElementHandler:
        block_handler.current_block = list_item
        # Create a paragraph for the list item content
-        item_para = Parapgraph()
+        item_para = Paragraph()
        list_item.add_block(item_para)
        block_handler.current_paragraph = item_para
        self.text_processor.set_current_paragraph(item_para)
@ -220,7 +220,7 @@ class ListElementHandler:
            block_handler.block_stack.append(list_item)
            block_handler.current_block = list_item
-            term_para = Parapgraph()
+            term_para = Paragraph()
            list_item.add_block(term_para)
            block_handler.current_paragraph = term_para
            self.text_processor.set_current_paragraph(term_para)
@ -228,7 +228,7 @@ class ListElementHandler:
        elif tag == 'dd':
            if current_list._items:
                list_item = current_list._items[-1]
-                desc_para = Parapgraph()
+                desc_para = Paragraph()
                list_item.add_block(desc_para)
                block_handler.current_paragraph = desc_para
                self.text_processor.set_current_paragraph(desc_para)
@ -339,7 +339,7 @@ class TableElementHandler:
        block_handler.current_block = cell
        # Create a paragraph for the cell content
-        cell_para = Parapgraph()
+        cell_para = Paragraph()
        cell.add_block(cell_para)
        block_handler.current_paragraph = cell_para
        self.text_processor.set_current_paragraph(cell_para)
--- a/pyWebLayout/io/readers/html_extraction.py
+++ b/pyWebLayout/io/readers/html_extraction.py
@ -0,0 +1,736 @@
 """
 HTML extraction module for converting HTML elements to pyWebLayout abstract elements.
 This module provides handler functions for converting HTML elements into the abstract document structure
 used by pyWebLayout, including paragraphs, headings, lists, tables, and inline formatting.
 Each handler function has a robust signature that handles style hints, CSS classes, and attributes.
 """
 import re
 from typing import List, Dict, Any, Optional, Union, Callable, Tuple, NamedTuple
 from bs4 import BeautifulSoup, Tag, NavigableString
 from pyWebLayout.abstract.inline import Word, FormattedSpan
 from pyWebLayout.abstract.block import (
    Block, Paragraph, Heading, HeadingLevel, Quote, CodeBlock, 
    HList, ListItem, ListStyle, Table, TableRow, TableCell
 )
 from pyWebLayout.style import Font, FontWeight, FontStyle, TextDecoration
 class StyleContext(NamedTuple):
    """
    Read-only bundle of styling state that is handed to every handler function.

    It groups the active font and background, a set of CSS classes, a dict of
    CSS declarations, the attributes of an element and a list of element names.
    None of the helper methods modify the receiver: each one returns a copy in
    which exactly one field has been swapped.
    """
    font: Font
    background: Optional[Tuple[int, int, int, int]]
    css_classes: set
    css_styles: Dict[str, str]
    element_attributes: Dict[str, Any]
    parent_elements: List[str]  # Stack of parent element names

    def with_font(self, font: Font) -> 'StyleContext':
        """Return a copy of this context that carries ``font``."""
        updated = self._replace(font=font)
        return updated

    def with_background(self, background: Optional[Tuple[int, int, int, int]]) -> 'StyleContext':
        """Return a copy of this context that carries ``background``."""
        updated = self._replace(background=background)
        return updated

    def with_css_classes(self, css_classes: set) -> 'StyleContext':
        """Return a copy of this context whose class set is ``css_classes``."""
        updated = self._replace(css_classes=css_classes)
        return updated

    def with_css_styles(self, css_styles: Dict[str, str]) -> 'StyleContext':
        """Return a copy of this context whose style dict is ``css_styles``."""
        updated = self._replace(css_styles=css_styles)
        return updated

    def with_attributes(self, attributes: Dict[str, Any]) -> 'StyleContext':
        """Return a copy of this context whose element attributes are ``attributes``."""
        updated = self._replace(element_attributes=attributes)
        return updated

    def push_element(self, element_name: str) -> 'StyleContext':
        """Return a copy whose element stack has ``element_name`` appended."""
        # Concatenation builds a fresh list, so the receiver's stack is left alone.
        extended_stack = self.parent_elements + [element_name]
        return self._replace(parent_elements=extended_stack)
 def create_base_context(base_font: Optional[Font] = None) -> StyleContext:
    """
    Build the root style context from which processing starts.

    Args:
        base_font: Font to start from; when it is missing (or falsy) a
            default-constructed ``Font()`` is used instead.

    Returns:
        StyleContext with no background, no CSS classes or styles, no element
        attributes and an empty element stack.
    """
    starting_font = base_font or Font()
    # Positional order follows the StyleContext field order:
    # font, background, css_classes, css_styles, element_attributes, parent_elements.
    return StyleContext(starting_font, None, set(), {}, {}, [])
 def apply_element_styling(context: StyleContext, element: Tag) -> StyleContext:
    """
    Derive the style context for ``element`` from the context it is nested in.

    The element's classes and inline ``style`` declarations are merged on top
    of the ones already present in ``context``, then the font and background
    are recomputed from the merged declarations.

    Args:
        context: Style context to start from
        element: BeautifulSoup Tag object

    Returns:
        New StyleContext; ``context`` itself is not modified
    """
    tag_name = element.name.lower()
    attributes = dict(element.attrs) if element.attrs else {}

    # Classes: work on a copy so the incoming context's set is untouched.
    merged_classes = context.css_classes.copy()
    if 'class' in attributes:
        raw_classes = attributes['class']
        # The attribute may be one whitespace-separated string or already a sequence.
        merged_classes.update(raw_classes.split() if isinstance(raw_classes, str) else raw_classes)

    # Inline styles: the element's declarations override the ones carried so far.
    merged_styles = context.css_styles.copy()
    if 'style' in attributes:
        merged_styles.update(parse_inline_styles(attributes['style']))

    # Tag defaults plus CSS overrides for the font, then the background.
    font = apply_element_font_styles(context.font, tag_name, merged_styles)
    background = apply_background_styles(context.background, merged_styles)

    return (
        context
        .with_attributes(attributes)
        .push_element(tag_name)
        .with_css_classes(merged_classes)
        .with_css_styles(merged_styles)
        .with_font(font)
        .with_background(background)
    )
 def parse_inline_styles(style_text: str) -> Dict[str, str]:
    """
    Turn the text of a ``style`` attribute into a property -> value mapping.

    Args:
        style_text: CSS style text (e.g., "color: red; font-weight: bold;")

    Returns:
        Dictionary keyed by lower-cased, stripped property name with the
        stripped value; a later declaration of a property replaces an earlier one
    """
    parsed: Dict[str, str] = {}
    for chunk in style_text.split(';'):
        # Cut at the first colon only, so values may themselves contain ':'.
        name, colon, value = chunk.partition(':')
        if not colon:
            # No colon at all (empty piece or junk): not a declaration.
            continue
        parsed[name.strip().lower()] = value.strip()
    return parsed
 def apply_element_font_styles(font: Font, tag_name: str, css_styles: Dict[str, str]) -> Font:
    """
    Build a new Font from ``font`` by layering tag defaults and then CSS on top.

    Args:
        font: Font to start from
        tag_name: HTML tag name
        css_styles: CSS styles dictionary; the properties read are font-size,
            font-weight, font-style, text-decoration and color

    Returns:
        New Font object; values that cannot be interpreted leave the
        corresponding property as it was
    """
    # Built-in appearance of formatting and heading tags.
    tag_defaults = {
        'b': {'weight': FontWeight.BOLD},
        'strong': {'weight': FontWeight.BOLD},
        'i': {'style': FontStyle.ITALIC},
        'em': {'style': FontStyle.ITALIC},
        'u': {'decoration': TextDecoration.UNDERLINE},
        's': {'decoration': TextDecoration.STRIKETHROUGH},
        'del': {'decoration': TextDecoration.STRIKETHROUGH},
        'h1': {'size': 24, 'weight': FontWeight.BOLD},
        'h2': {'size': 20, 'weight': FontWeight.BOLD},
        'h3': {'size': 18, 'weight': FontWeight.BOLD},
        'h4': {'size': 16, 'weight': FontWeight.BOLD},
        'h5': {'size': 14, 'weight': FontWeight.BOLD},
        'h6': {'size': 12, 'weight': FontWeight.BOLD},
    }

    # Seed every property from the incoming font.
    font_size = font.font_size
    colour = font.colour
    weight = font.weight
    style = font.style
    decoration = font.decoration
    background = font.background
    language = font.language

    # Layer 1: defaults implied by the tag itself (no entry -> nothing changes).
    defaults = tag_defaults.get(tag_name, {})
    font_size = defaults.get('size', font_size)
    weight = defaults.get('weight', weight)
    style = defaults.get('style', style)
    decoration = defaults.get('decoration', decoration)

    # Layer 2: CSS declarations win over the tag defaults.
    if 'font-size' in css_styles:
        raw_size = css_styles['font-size'].lower()
        # px and pt are both taken at face value; other units are ignored.
        if raw_size.endswith(('px', 'pt')):
            try:
                font_size = int(float(raw_size[:-2]))
            except ValueError:
                pass

    if 'font-weight' in css_styles:
        weight_keywords = {
            'bold': FontWeight.BOLD,
            '700': FontWeight.BOLD,
            '800': FontWeight.BOLD,
            '900': FontWeight.BOLD,
            'normal': FontWeight.NORMAL,
            '400': FontWeight.NORMAL,
        }
        weight = weight_keywords.get(css_styles['font-weight'].lower(), weight)

    if 'font-style' in css_styles:
        style_keywords = {'italic': FontStyle.ITALIC, 'normal': FontStyle.NORMAL}
        style = style_keywords.get(css_styles['font-style'].lower(), style)

    if 'text-decoration' in css_styles:
        raw_decoration = css_styles['text-decoration'].lower()
        # Substring match, first hit wins: underline, then line-through, then none.
        decoration_keywords = (
            ('underline', TextDecoration.UNDERLINE),
            ('line-through', TextDecoration.STRIKETHROUGH),
            ('none', TextDecoration.NONE),
        )
        for keyword, keyword_value in decoration_keywords:
            if keyword in raw_decoration:
                decoration = keyword_value
                break

    if 'color' in css_styles:
        raw_colour = css_styles['color'].lower()
        named_colours = {
            'black': (0, 0, 0),
            'white': (255, 255, 255),
            'red': (255, 0, 0),
            'green': (0, 255, 0),
            'blue': (0, 0, 255),
        }
        if raw_colour in named_colours:
            colour = named_colours[raw_colour]
        elif raw_colour.startswith('#') and len(raw_colour) == 7:
            # '#rrggbb': one two-digit hex pair per channel.
            try:
                colour = tuple(int(raw_colour[start:start + 2], 16) for start in (1, 3, 5))
            except ValueError:
                pass

    # NOTE(review): the keyword below is spelled `langauge` exactly as before;
    # confirm that it matches the parameter name declared by Font.
    return Font(
        font_path=font._font_path,
        font_size=font_size,
        colour=colour,
        weight=weight,
        style=style,
        decoration=decoration,
        background=background,
        langauge=language
    )
 def apply_background_styles(current_background: Optional[Tuple[int, int, int, int]],
                           css_styles: Dict[str, str]) -> Optional[Tuple[int, int, int, int]]:
    """
    Apply background styling from CSS.

    The ``background-color`` property is interpreted with the same vocabulary
    that the ``color`` property accepts elsewhere in this module: the keywords
    black/white/red/green/blue and the six-digit ``#rrggbb`` form. A parsed
    colour is returned fully opaque (alpha 255).

    Args:
        current_background: Current background color (RGBA)
        css_styles: CSS styles dictionary

    Returns:
        None for ``transparent``, the parsed RGBA colour when the value is
        understood, otherwise ``current_background`` unchanged
    """
    if 'background-color' not in css_styles:
        return current_background

    bg_value = css_styles['background-color'].strip().lower()
    if bg_value == 'transparent':
        return None

    named_colours = {
        'black': (0, 0, 0),
        'white': (255, 255, 255),
        'red': (255, 0, 0),
        'green': (0, 255, 0),
        'blue': (0, 0, 255),
    }
    if bg_value in named_colours:
        return named_colours[bg_value] + (255,)

    # '#rrggbb' -- check the digits explicitly, because int(..., 16) would also
    # accept signs and surrounding whitespace.
    hex_digits = bg_value[1:]
    if (bg_value.startswith('#') and len(hex_digits) == 6
            and all(digit in '0123456789abcdef' for digit in hex_digits)):
        red, green, blue = (int(hex_digits[start:start + 2], 16) for start in (0, 2, 4))
        return (red, green, blue, 255)

    # Anything else (rgb(), gradients, unknown keywords) keeps the current value.
    return current_background
 def extract_text_content(element: Tag, context: StyleContext) -> List[Word]:
    """
    Extract text content from an element, handling inline formatting.

    Text nodes are split on whitespace into Word objects that use the font and
    background of ``context``. Inline child elements are walked recursively
    with their own derived style context. Any other child element is sent
    through ``process_element`` and only the words of the Paragraph blocks it
    yields are kept. HTML comments are skipped.

    Args:
        element: BeautifulSoup Tag object
        context: Current style context

    Returns:
        List of Word objects in document order
    """
    # Comment subclasses NavigableString, so without this check the text of
    # <!-- ... --> would be emitted as visible words.
    from bs4.element import Comment

    inline_tags = (
        'span', 'a', 'strong', 'b', 'em', 'i', 'u', 's', 'del', 'ins', 'mark',
        'small', 'sub', 'sup', 'code', 'q', 'cite', 'abbr', 'time',
    )

    words = []
    for child in element.children:
        if isinstance(child, Comment):
            continue
        if isinstance(child, NavigableString):
            # Plain text - split() already drops surrounding and repeated whitespace.
            for word_text in str(child).split():
                words.append(Word(word_text, context.font, context.background))
        elif isinstance(child, Tag):
            child_context = apply_element_styling(context, child)
            if child.name.lower() in inline_tags:
                words.extend(extract_text_content(child, child_context))
            else:
                # Block element - shouldn't happen in well-formed HTML but handle gracefully
                child_result = process_element(child, child_context)
                blocks = child_result if isinstance(child_result, list) else [child_result]
                for block in blocks:
                    if isinstance(block, Paragraph):
                        words.extend(word for _, word in block.words())
    return words
 def process_element(element: Tag, context: StyleContext) -> Union[Block, List[Block], None]:
    """
    Dispatch one HTML element to the handler registered for its tag name.

    Args:
        element: BeautifulSoup Tag object
        context: Style context handed on to the handler

    Returns:
        Whatever the handler returns: a block, a list of blocks, or None when
        the element produces nothing
    """
    tag_name = element.name.lower()
    try:
        handler = HANDLERS[tag_name]
    except KeyError:
        # Tags without a registry entry are treated as generic containers.
        handler = generic_handler
    return handler(element, context)
 # Handler function signatures:
 # All handlers receive (element: Tag, context: StyleContext) -> Union[Block, List[Block], None]
 def paragraph_handler(element: Tag, context: StyleContext) -> Paragraph:
    """Build a Paragraph for a <p> element from the words extracted from it."""
    paragraph = Paragraph(context.font)
    for extracted_word in extract_text_content(element, context):
        paragraph.add_word(extracted_word)
    return paragraph
 def div_handler(element: Tag, context: StyleContext) -> List[Block]:
    """
    Handle <div> elements as a transparent container.

    Only child elements are processed; text nodes that sit directly inside the
    container are not looked at. The blocks produced by the children are
    returned as one flat list.
    """
    collected = []
    for child in element.children:
        if not isinstance(child, Tag):
            continue
        produced = process_element(child, apply_element_styling(context, child))
        # Truthiness test: skips None, empty lists and any block that evaluates false.
        if not produced:
            continue
        if isinstance(produced, list):
            collected.extend(produced)
        else:
            collected.append(produced)
    return collected
 def heading_handler(element: Tag, context: StyleContext) -> Heading:
    """
    Build a Heading for an <h1>-<h6> element.

    The level follows the tag name (any other name falls back to H1) and the
    heading is filled with the words extracted from the element.
    """
    tag_name = element.name.lower()
    levels_by_tag = {
        'h1': HeadingLevel.H1,
        'h2': HeadingLevel.H2,
        'h3': HeadingLevel.H3,
        'h4': HeadingLevel.H4,
        'h5': HeadingLevel.H5,
        'h6': HeadingLevel.H6,
    }
    heading = Heading(levels_by_tag.get(tag_name, HeadingLevel.H1), context.font)
    for extracted_word in extract_text_content(element, context):
        heading.add_word(extracted_word)
    return heading
 def blockquote_handler(element: Tag, context: StyleContext) -> Quote:
    """
    Build a Quote for a <blockquote> element.

    Every child element is processed and the blocks it yields are added to the
    quote in order; text nodes directly inside the blockquote are not looked at.
    """
    quote = Quote(context.font)
    for child in element.children:
        if not isinstance(child, Tag):
            continue
        produced = process_element(child, apply_element_styling(context, child))
        # Truthiness test: skips None, empty lists and any block that evaluates false.
        if not produced:
            continue
        # Treat a single block like a one-element list so both cases share a loop.
        for block in (produced if isinstance(produced, list) else [produced]):
            quote.add_block(block)
    return quote
 def preformatted_handler(element: Tag, context: StyleContext) -> CodeBlock:
    """
    Handle <pre> elements.

    The text of all descendants is concatenated exactly as written, a <br>
    contributes a line break, and the result is split on newlines into the
    lines of a CodeBlock. The language is read from the ``data-language``
    attribute held in ``context`` (empty string when absent).

    Args:
        element: BeautifulSoup Tag object for the <pre>
        context: Current style context

    Returns:
        CodeBlock holding one entry per line of preformatted text
    """
    # Local import: needed to leave comments out of the code text.
    from bs4.element import Comment

    language = context.element_attributes.get('data-language', '')
    code_block = CodeBlock(language)

    # Join text nodes with nothing in between. Using '\n' as a separator would
    # break a line at every inline tag boundary, e.g. inside syntax-highlighted
    # markup such as <span>def</span> <span>f</span>.
    pieces = []
    for node in element.descendants:
        if isinstance(node, Tag):
            if node.name.lower() == 'br':
                pieces.append('\n')
        elif isinstance(node, NavigableString) and not isinstance(node, Comment):
            pieces.append(str(node))

    for line in ''.join(pieces).split('\n'):
        code_block.add_line(line)
    return code_block
 def code_handler(element: Tag, context: StyleContext) -> Union[CodeBlock, None]:
    """
    Handle <code> elements; this handler never produces a block of its own.

    Returns:
        Always None
    """
    ancestry = context.parent_elements
    if ancestry and ancestry[-1] == 'pre':
        # Inside <pre>: the text is collected by preformatted_handler.
        return None
    # Inline code: the text is collected during text extraction.
    return None
 def unordered_list_handler(element: Tag, context: StyleContext) -> HList:
    """Build an unordered HList for a <ul> element from its direct <li> children."""
    hlist = HList(ListStyle.UNORDERED, context.font)
    # Anything that is not a direct <li> child (text, other tags) is skipped.
    li_children = (
        child for child in element.children
        if isinstance(child, Tag) and child.name.lower() == 'li'
    )
    for li in li_children:
        item = process_element(li, apply_element_styling(context, li))
        if item:
            hlist.add_item(item)
    return hlist
 def ordered_list_handler(element: Tag, context: StyleContext) -> HList:
    """Build an ordered HList for an <ol> element from its direct <li> children."""
    hlist = HList(ListStyle.ORDERED, context.font)
    # Anything that is not a direct <li> child (text, other tags) is skipped.
    li_children = (
        child for child in element.children
        if isinstance(child, Tag) and child.name.lower() == 'li'
    )
    for li in li_children:
        item = process_element(li, apply_element_styling(context, li))
        if item:
            hlist.add_item(item)
    return hlist
 def list_item_handler(element: Tag, context: StyleContext) -> ListItem:
    """
    Handle <li> elements.

    Consecutive text and inline elements (``Item with <b>bold</b> text``) are
    gathered into a single Paragraph; every other child element is processed
    through ``process_element`` and its blocks are added in document order.
    HTML comments are skipped.

    Args:
        element: BeautifulSoup Tag object for the <li>
        context: Current style context

    Returns:
        ListItem containing the blocks found in the element
    """
    # Comment subclasses NavigableString and must not be rendered as text.
    from bs4.element import Comment

    # Same inline vocabulary that extract_text_content recognises. These tags
    # dispatch to handlers that return None, so sending them through
    # process_element would silently drop their text.
    inline_tags = (
        'span', 'a', 'strong', 'b', 'em', 'i', 'u', 's', 'del', 'ins', 'mark',
        'small', 'sub', 'sup', 'code', 'q', 'cite', 'abbr', 'time',
    )

    list_item = ListItem(None, context.font)
    pending_words = []

    def flush_pending():
        """Wrap the words gathered so far in a Paragraph and attach it to the item."""
        if not pending_words:
            return
        paragraph = Paragraph(context.font)
        for word in pending_words:
            paragraph.add_word(word)
        list_item.add_block(paragraph)
        pending_words.clear()

    for child in element.children:
        if isinstance(child, Comment):
            continue
        if isinstance(child, NavigableString):
            # Direct text in list item - becomes part of the current paragraph
            pending_words.extend(Word(word_text, context.font) for word_text in str(child).split())
            continue
        if not isinstance(child, Tag):
            continue

        child_context = apply_element_styling(context, child)
        if child.name.lower() in inline_tags:
            pending_words.extend(extract_text_content(child, child_context))
            continue

        # A block-level child ends the running paragraph.
        flush_pending()
        result = process_element(child, child_context)
        if result:
            if isinstance(result, list):
                for block in result:
                    list_item.add_block(block)
            else:
                list_item.add_block(result)

    flush_pending()
    return list_item
 def table_handler(element: Tag, context: StyleContext) -> Table:
    """
    Handle <table> elements.

    Rows are taken from direct <tr> children (default section) and from the
    direct <tr> children of <thead>, <tbody> and <tfoot>, which are added to
    the 'header', 'body' and 'footer' sections respectively. Only this
    table's own <caption> is used as its caption.

    Args:
        element: BeautifulSoup Tag object for the <table>
        context: Current style context

    Returns:
        Table populated with the rows found in the element
    """
    # recursive=False: a table nested inside a cell must not lend its caption
    # to the outer table.
    caption = None
    caption_elem = element.find('caption', recursive=False)
    if caption_elem is not None:
        caption = caption_elem.get_text(strip=True)
    table = Table(caption, context.font)

    section_names = {'thead': 'header', 'tbody': 'body', 'tfoot': 'footer'}

    # Process table rows
    for child in element.children:
        if not isinstance(child, Tag):
            continue
        child_name = child.name.lower()
        if child_name == 'tr':
            child_context = apply_element_styling(context, child)
            row = process_element(child, child_context)
            if row:
                table.add_row(row)
        elif child_name in section_names:
            section = section_names[child_name]
            # Direct rows only: a recursive search would also return the rows
            # of tables nested inside cells and add them to this table.
            for row_elem in child.find_all('tr', recursive=False):
                child_context = apply_element_styling(context, row_elem)
                row = process_element(row_elem, child_context)
                if row:
                    table.add_row(row, section)
    return table
 def table_row_handler(element: Tag, context: StyleContext) -> TableRow:
    """Build a TableRow for a <tr> element from its direct <td>/<th> children."""
    row = TableRow(context.font)
    # Children that are not <td>/<th> elements are skipped.
    cell_children = (
        child for child in element.children
        if isinstance(child, Tag) and child.name.lower() in ['td', 'th']
    )
    for cell_elem in cell_children:
        cell = process_element(cell_elem, apply_element_styling(context, cell_elem))
        if cell:
            row.add_cell(cell)
    return row
 def _cell_span(attributes: Dict[str, Any], name: str) -> int:
    """Read a colspan/rowspan attribute, falling back to 1 when absent or not an integer."""
    try:
        return int(attributes.get(name, 1))
    except (TypeError, ValueError):
        # Malformed markup such as colspan="" or colspan="abc".
        return 1


 def _build_table_cell(element: Tag, context: StyleContext, is_header: bool) -> TableCell:
    """
    Shared implementation behind the <td> and <th> handlers.

    Consecutive text and inline elements are gathered into a single Paragraph;
    every other child element is processed through ``process_element`` and its
    blocks are added in document order. HTML comments are skipped.
    """
    # Comment subclasses NavigableString and must not be rendered as text.
    from bs4.element import Comment

    # Same inline vocabulary that extract_text_content recognises. These tags
    # dispatch to handlers that return None, so sending them through
    # process_element would silently drop their text.
    inline_tags = (
        'span', 'a', 'strong', 'b', 'em', 'i', 'u', 's', 'del', 'ins', 'mark',
        'small', 'sub', 'sup', 'code', 'q', 'cite', 'abbr', 'time',
    )

    colspan = _cell_span(context.element_attributes, 'colspan')
    rowspan = _cell_span(context.element_attributes, 'rowspan')
    cell = TableCell(is_header, colspan, rowspan, context.font)
    pending_words = []

    def flush_pending():
        """Wrap the words gathered so far in a Paragraph and attach it to the cell."""
        if not pending_words:
            return
        paragraph = Paragraph(context.font)
        for word in pending_words:
            paragraph.add_word(word)
        cell.add_block(paragraph)
        pending_words.clear()

    for child in element.children:
        if isinstance(child, Comment):
            continue
        if isinstance(child, NavigableString):
            # Direct text in cell - becomes part of the current paragraph
            pending_words.extend(Word(word_text, context.font) for word_text in str(child).split())
            continue
        if not isinstance(child, Tag):
            continue

        child_context = apply_element_styling(context, child)
        if child.name.lower() in inline_tags:
            pending_words.extend(extract_text_content(child, child_context))
            continue

        # A block-level child ends the running paragraph.
        flush_pending()
        result = process_element(child, child_context)
        if result:
            if isinstance(result, list):
                for block in result:
                    cell.add_block(block)
            else:
                cell.add_block(result)

    flush_pending()
    return cell


 def table_cell_handler(element: Tag, context: StyleContext) -> TableCell:
    """Handle <td> elements: build a non-header TableCell from the element's content."""
    return _build_table_cell(element, context, False)


 def table_header_cell_handler(element: Tag, context: StyleContext) -> TableCell:
    """Handle <th> elements: build a header TableCell from the element's content."""
    return _build_table_cell(element, context, True)
 def horizontal_rule_handler(element: Tag, context: StyleContext) -> Block:
    """Handle <hr> elements by returning an empty Paragraph as a stand-in."""
    # TODO: Create a specific HorizontalRule block type
    placeholder = Paragraph(context.font)
    return placeholder
 def line_break_handler(element: Tag, context: StyleContext) -> None:
    """Handle <br> elements; no block is produced for a line break here."""
    # Line breaks are typically dealt with at the paragraph level.
    return None
 def image_handler(element: Tag, context: StyleContext) -> Block:
    """
    Handle <img> elements by returning a Paragraph that holds the alt text.

    The paragraph is empty when the element has no (or an empty) ``alt``
    attribute.
    """
    # TODO: Create Image block type
    paragraph = Paragraph(context.font)
    alt_text = context.element_attributes.get('alt', '')
    if not alt_text:
        return paragraph
    for alt_word in alt_text.split():
        paragraph.add_word(Word(alt_word, context.font))
    return paragraph
 def ignore_handler(element: Tag, context: StyleContext) -> None:
    """
    Handle elements that should be ignored.
    Both arguments are unused; the handler always returns None so the
    element contributes no block.
    """
    return
 def generic_handler(element: Tag, context: StyleContext) -> List[Block]:
    """
    Handle unknown elements as generic containers.
    Delegates to div_handler() with the same element and context and
    returns its result unchanged.
    """
    container_blocks = div_handler(element, context)
    return container_blocks
 # Handler registry - maps HTML tag names to handler functions.
 # Tags that share a handler are grouped with dict.fromkeys(); the insertion
 # order of the keys is the same as if every tag were listed individually.
 HANDLERS: Dict[str, Callable[[Tag, StyleContext], Union[Block, List[Block], None]]] = {
    # Block elements
    'p': paragraph_handler,
    'div': div_handler,
    **dict.fromkeys(('h1', 'h2', 'h3', 'h4', 'h5', 'h6'), heading_handler),
    'blockquote': blockquote_handler,
    'pre': preformatted_handler,
    'code': code_handler,
    'ul': unordered_list_handler,
    'ol': ordered_list_handler,
    'li': list_item_handler,
    'table': table_handler,
    'tr': table_row_handler,
    'td': table_cell_handler,
    'th': table_header_cell_handler,
    'hr': horizontal_rule_handler,
    'br': line_break_handler,
    # Semantic elements (treated as containers)
    **dict.fromkeys(
        ('section', 'article', 'aside', 'nav', 'header', 'footer', 'main', 'figure'),
        div_handler,
    ),
    'figcaption': paragraph_handler,
    # Media elements
    'img': image_handler,
    # Inline elements (handled during text extraction)
    **dict.fromkeys(
        (
            'span', 'a', 'strong', 'b', 'em', 'i', 'u', 's', 'del', 'ins',
            'mark', 'small', 'sub', 'sup', 'q', 'cite', 'abbr', 'time',
        ),
        ignore_handler,
    ),
    # Ignored elements
    **dict.fromkeys(('script', 'style', 'meta', 'link', 'head', 'title'), ignore_handler),
 }
 def parse_html_string(html_string: str, base_font: Optional[Font] = None) -> List[Block]:
    """
    Convert an HTML string into a flat list of Block objects.
    Only tag children of the root are processed; each one is styled with
    apply_element_styling() and dispatched through process_element().
    Args:
        html_string: HTML content to parse
        base_font: Base font for styling, defaults to system default
    Returns:
        List of Block objects representing the document structure
    """
    document = BeautifulSoup(html_string, 'html.parser')
    base_context = create_base_context(base_font)
    parsed_blocks = []
    # Walk <body> when the markup has one, otherwise the parsed document itself
    root = document.find('body') or document
    for node in root.children:
        if not isinstance(node, Tag):
            continue
        outcome = process_element(node, apply_element_styling(base_context, node))
        # Falsy outcomes are skipped: None, an empty list, and any block
        # whose truth value is False
        if not outcome:
            continue
        if isinstance(outcome, list):
            parsed_blocks.extend(outcome)
        else:
            parsed_blocks.append(outcome)
    return parsed_blocks
 def register_handler(tag_name: str, handler: Callable[[Tag, StyleContext], Union[Block, List[Block], None]]):
    """
    Register a custom handler for an HTML tag.
    The tag name is stored lowercased because get_handler() lowercases the
    name it looks up; a handler stored under a mixed-case key such as 'DIV'
    could otherwise never be retrieved. An existing entry for the same tag
    is replaced.
    Args:
        tag_name: HTML tag name (any case; normalized to lowercase)
        handler: Handler function with signature (element: Tag, context: StyleContext) -> Union[Block, List[Block], None]
    """
    HANDLERS[tag_name.lower()] = handler
 def get_handler(tag_name: str) -> Callable[[Tag, StyleContext], Union[Block, List[Block], None]]:
    """
    Look up the handler registered for an HTML tag.
    The lookup is case-insensitive: the tag name is lowercased before the
    registry is consulted.
    Args:
        tag_name: HTML tag name (any case)
    Returns:
        Handler function or generic_handler if tag not found
    """
    normalized = tag_name.lower()
    return HANDLERS.get(normalized, generic_handler)
--- a/pyWebLayout/io/readers/html_text.py
+++ b/pyWebLayout/io/readers/html_text.py
@ -7,7 +7,7 @@ entity references, and word creation in HTML documents.
 from typing import Optional
 from pyWebLayout.abstract.inline import Word
-from pyWebLayout.abstract.block import Parapgraph
+from pyWebLayout.abstract.block import Paragraph
 from pyWebLayout.io.readers.html_style import HTMLStyleManager
@ -28,14 +28,14 @@ class HTMLTextProcessor:
        """
        self._style_manager = style_manager
        self._text_buffer = ""
-        self._current_paragraph: Optional[Parapgraph] = None
+        self._current_paragraph: Optional[Paragraph] = None
    def reset(self):
        """Reset the text processor state."""
        self._text_buffer = ""
        self._current_paragraph = None
-    def set_current_paragraph(self, paragraph: Optional[Parapgraph]):
+    def set_current_paragraph(self, paragraph: Optional[Paragraph]):
        """
        Set the current paragraph for text output.
--- a/pyWebLayout/typesetting/document_pagination.py
+++ b/pyWebLayout/typesetting/document_pagination.py
@ -139,7 +139,7 @@ class DocumentPaginator:
            for chapter in self.document.chapters:
                # Add a heading block for the chapter if it has a title
                if chapter.title:
-                    from pyWebLayout.abstract.block import Heading, HeadingLevel, Parapgraph
+                    from pyWebLayout.abstract.block import Heading, HeadingLevel, Paragraph
                    from pyWebLayout.abstract.inline import Word
                    # Create a heading for the chapter
--- a/tests/test_abstract_blocks.py
+++ b/tests/test_abstract_blocks.py
@ -6,7 +6,7 @@ Tests the core abstract block classes that form the foundation of the document m
 import unittest
 from pyWebLayout.abstract.block import (
-    Block, BlockType, Parapgraph, Heading, HeadingLevel, Quote, CodeBlock,
+    Block, BlockType, Paragraph, Heading, HeadingLevel, Quote, CodeBlock,
    HList, ListStyle, ListItem, Table, TableRow, TableCell, 
    HorizontalRule, LineBreak, Image
 )
@ -19,7 +19,7 @@ class TestBlockElements(unittest.TestCase):
    def test_paragraph_creation(self):
        """Test creating and using paragraphs."""
-        paragraph = Parapgraph()
+        paragraph = Paragraph()
        self.assertEqual(paragraph.block_type, BlockType.PARAGRAPH)
        self.assertEqual(paragraph.word_count, 0)
@ -62,8 +62,8 @@ class TestBlockElements(unittest.TestCase):
        quote = Quote()
        # Add nested paragraphs
-        p1 = Parapgraph()
+        p1 = Paragraph()
-        p2 = Parapgraph()
+        p2 = Paragraph()
        quote.add_block(p1)
        quote.add_block(p2)
--- a/tests/test_abstract_document.py
+++ b/tests/test_abstract_document.py
@ -7,7 +7,7 @@ document structure and metadata management.
 import unittest
 from pyWebLayout.abstract.document import Document, Chapter, Book, MetadataType
-from pyWebLayout.abstract.block import Parapgraph, Heading, HeadingLevel, BlockType
+from pyWebLayout.abstract.block import Paragraph, Heading, HeadingLevel, BlockType
 from pyWebLayout.abstract.inline import Word, FormattedSpan
 from pyWebLayout.style import Font
@ -77,8 +77,8 @@ class TestDocument(unittest.TestCase):
    def test_block_management(self):
        """Test adding and managing blocks."""
        # Create some blocks
-        para1 = Parapgraph()
+        para1 = Paragraph()
-        para2 = Parapgraph()
+        para2 = Paragraph()
        heading = Heading(HeadingLevel.H1)
        # Add blocks
@ -95,7 +95,7 @@ class TestDocument(unittest.TestCase):
    def test_anchor_management(self):
        """Test named anchor functionality."""
        heading = Heading(HeadingLevel.H1)
-        para = Parapgraph()
+        para = Paragraph()
        # Add anchors
        self.doc.add_anchor("intro", heading)
@ -154,8 +154,8 @@ class TestDocument(unittest.TestCase):
    def test_find_blocks_by_type(self):
        """Test finding blocks by type."""
        # Create blocks of different types
-        para1 = Parapgraph()
+        para1 = Paragraph()
-        para2 = Parapgraph()
+        para2 = Paragraph()
        heading1 = Heading(HeadingLevel.H1)
        heading2 = Heading(HeadingLevel.H2)
@ -180,7 +180,7 @@ class TestDocument(unittest.TestCase):
    def test_find_headings(self):
        """Test finding heading blocks specifically."""
        # Create mixed blocks
-        para = Parapgraph()
+        para = Paragraph()
        h1 = Heading(HeadingLevel.H1)
        h2 = Heading(HeadingLevel.H2)
@ -284,8 +284,8 @@ class TestChapter(unittest.TestCase):
    def test_block_management(self):
        """Test adding blocks to chapter."""
-        para1 = Parapgraph()
+        para1 = Paragraph()
-        para2 = Parapgraph()
+        para2 = Paragraph()
        heading = Heading(HeadingLevel.H2)
        # Add blocks
@ -450,7 +450,7 @@ class TestBook(unittest.TestCase):
        """Test that Book inherits all Document functionality."""
        # Test that book can use all document methods
        # Add blocks directly to book
-        para = Parapgraph()
+        para = Paragraph()
        self.book.add_block(para)
        self.assertEqual(len(self.book.blocks), 1)
--- a/tests/test_epub_fix.py
+++ b/tests/test_epub_fix.py
@ -1,44 +0,0 @@
 #!/usr/bin/env python3
 """
 Simple test script to verify that the EPUB reader fixes are working correctly.
 It imports read_epub, reads the example EPUB under pyWebLayout/examples and
 prints the book title, the chapter count and details of the first chapter.
 The script exits with status 1 when the EPUB file is missing or any step raises.
 """
 import sys
 import os
 # Put the 'pyWebLayout' directory located next to this script at the front of sys.path
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'pyWebLayout'))
 try:
    from pyWebLayout.io.readers.epub_reader import read_epub
    print("Successfully imported epub_reader module")
    # Test reading the EPUB file
    # NOTE: this is a relative path, so it is resolved against the current working directory
    epub_path = os.path.join('pyWebLayout', 'examples', 'pg174-images-3.epub')
    if not os.path.exists(epub_path):
        print(f"EPUB file not found: {epub_path}")
        # SystemExit is not an Exception subclass, so the handler below does not intercept it
        sys.exit(1)
    print(f"Reading EPUB file: {epub_path}")
    # Try to read the EPUB
    book = read_epub(epub_path)
    print(f"Successfully read EPUB file!")
    print(f"Book title: {book.title}")
    print(f"Number of chapters: {len(book.chapters)}")
    # Check first chapter (only when the book has at least one)
    if book.chapters:
        first_chapter = book.chapters[0]
        print(f"First chapter title: {first_chapter.title}")
        print(f"First chapter has {len(first_chapter.blocks)} blocks")
 # Any failure above, a failed import included, is reported with a traceback and exit status 1
 except Exception as e:
    print(f"Error: {e}")
    import traceback
    traceback.print_exc()
    sys.exit(1)
 print("Test completed successfully!")
--- a/tests/test_html_content.py
+++ b/tests/test_html_content.py
@ -9,7 +9,7 @@ import unittest
 from pyWebLayout.io.readers.html_content import HTMLContentReader
 from pyWebLayout.abstract.document import Document
 from pyWebLayout.abstract.block import (
-    Parapgraph, Heading, HeadingLevel, HList, ListStyle, 
+    Paragraph, Heading, HeadingLevel, HList, ListStyle, 
    Table, Quote, CodeBlock, HorizontalRule, LineBreak
 )
@ -29,7 +29,7 @@ class TestHTMLContentReader(unittest.TestCase):
        result = self.reader.extract_content(html, self.document)
        self.assertEqual(len(self.document.blocks), 1)
-        self.assertIsInstance(self.document.blocks[0], Parapgraph)
+        self.assertIsInstance(self.document.blocks[0], Paragraph)
        paragraph = self.document.blocks[0]
        words = list(paragraph.words())
@ -107,7 +107,7 @@ class TestHTMLContentReader(unittest.TestCase):
        # Check first item content
        first_item_blocks = list(items[0].blocks())
        self.assertEqual(len(first_item_blocks), 1)
-        self.assertIsInstance(first_item_blocks[0], Parapgraph)
+        self.assertIsInstance(first_item_blocks[0], Paragraph)
    def test_ordered_list(self):
        """Test parsing ordered lists."""
@ -202,8 +202,8 @@ class TestHTMLContentReader(unittest.TestCase):
        quote = self.document.blocks[0]
        quote_blocks = list(quote.blocks())
        self.assertEqual(len(quote_blocks), 2)
-        self.assertIsInstance(quote_blocks[0], Parapgraph)
+        self.assertIsInstance(quote_blocks[0], Paragraph)
-        self.assertIsInstance(quote_blocks[1], Parapgraph)
+        self.assertIsInstance(quote_blocks[1], Paragraph)
    def test_code_block(self):
        """Test parsing code blocks."""
@ -229,9 +229,9 @@ def hello():
        self.reader.extract_content(html, self.document)
        self.assertEqual(len(self.document.blocks), 3)
-        self.assertIsInstance(self.document.blocks[0], Parapgraph)
+        self.assertIsInstance(self.document.blocks[0], Paragraph)
        self.assertIsInstance(self.document.blocks[1], HorizontalRule)
-        self.assertIsInstance(self.document.blocks[2], Parapgraph)
+        self.assertIsInstance(self.document.blocks[2], Paragraph)
    def test_html_entities(self):
        """Test handling HTML entities."""
@ -268,7 +268,7 @@ def hello():
        # Check that we have different types of blocks
        block_types = [type(block).__name__ for block in self.document.blocks]
-        self.assertIn('Parapgraph', block_types)  # From div
+        self.assertIn('Paragraph', block_types)  # From div
        self.assertIn('Heading', block_types)
        self.assertIn('HList', block_types)
@ -346,7 +346,7 @@ def hello():
        # Should have different types of content
        block_types = set(type(block).__name__ for block in self.document.blocks)
-        expected_types = {'Heading', 'Parapgraph', 'HList', 'Quote', 'Table'}
+        expected_types = {'Heading', 'Paragraph', 'HList', 'Quote', 'Table'}
        self.assertTrue(expected_types.issubset(block_types))
--- a/tests/test_html_extraction.py
+++ b/tests/test_html_extraction.py
@ -0,0 +1,384 @@
 """
 Unit tests for HTML extraction functionality.
 Tests the HTML parsing and conversion to pyWebLayout abstract elements,
 including styled content within paragraphs and block-level elements.
 """
 import unittest
 from pyWebLayout.io.readers.html_extraction import parse_html_string
 from pyWebLayout.abstract.block import Paragraph, Heading, HeadingLevel, Quote, CodeBlock, HList, ListStyle, Table
 from pyWebLayout.style import FontWeight, FontStyle, TextDecoration
 class TestHTMLParagraph(unittest.TestCase):
    """Basic paragraph parsing: plain <p> elements without inline markup."""
    def test_simple(self):
        """A single <p> yields one paragraph holding its four words in order."""
        sentence = "This is a paragraph."
        blocks = parse_html_string("<p>" + sentence + "</p>")
        self.assertEqual(len(blocks), 1)
        self.assertEqual(len(blocks[0]), 4)
        # words() yields (index, word) pairs; only the word is compared
        for entry, expected_text in zip(blocks[0].words(), sentence.split(" ")):
            self.assertEqual(entry[1].text, expected_text)
    def test_multiple(self):
        """Two sibling <p> elements yield two paragraphs of four words each."""
        sentences = ["This is a paragraph.", "This is another paragraph."]
        blocks = parse_html_string("".join("<p>" + s + "</p>" for s in sentences))
        self.assertEqual(len(blocks), 2)
        for block in blocks:
            self.assertEqual(len(block), 4)
        for block, sentence in zip(blocks, sentences):
            for entry, expected_text in zip(block.words(), sentence.split(" ")):
                self.assertEqual(entry[1].text, expected_text)
 class TestHTMLStyledParagraphs(unittest.TestCase):
    """Paragraphs whose words carry inline styling (tags or CSS in style="")."""
    @staticmethod
    def _words_of(block):
        """Return the Word objects of a block, dropping the index that words() pairs with each."""
        return [word for _, word in block.words()]
    def test_bold_text(self):
        """<strong> makes exactly the enclosed words bold (<b> is not exercised here)."""
        blocks = parse_html_string("<p>This is <strong>bold text</strong> in a paragraph.</p>")
        self.assertEqual(len(blocks), 1)
        self.assertIsInstance(blocks[0], Paragraph)
        words = self._words_of(blocks[0])
        self.assertEqual(len(words), 7)  # This / is / bold / text / in / a / paragraph.
        # The two words inside <strong> are bold
        for position, expected_text in ((2, "bold"), (3, "text")):
            self.assertEqual(words[position].text, expected_text)
            self.assertEqual(words[position].style.weight, FontWeight.BOLD)
        # A word outside <strong> is not
        leading = words[0]
        self.assertEqual(leading.text, "This")
        self.assertNotEqual(leading.style.weight, FontWeight.BOLD)
    def test_italic_text(self):
        """<em> makes the enclosed words italic (<i> is not exercised here)."""
        blocks = parse_html_string("<p>This is <em>italic text</em> in a paragraph.</p>")
        self.assertEqual(len(blocks), 1)
        self.assertIsInstance(blocks[0], Paragraph)
        words = self._words_of(blocks[0])
        for position, expected_text in ((2, "italic"), (3, "text")):
            self.assertEqual(words[position].text, expected_text)
            self.assertEqual(words[position].style.style, FontStyle.ITALIC)
    def test_underlined_text(self):
        """<u> gives the first enclosed word an underline decoration."""
        blocks = parse_html_string("<p>This is <u>underlined text</u> here.</p>")
        self.assertEqual(len(blocks), 1)
        underlined = self._words_of(blocks[0])[2]  # 'underlined'
        self.assertEqual(underlined.style.decoration, TextDecoration.UNDERLINE)
    def test_strikethrough_text(self):
        """<s> gives the first enclosed word a strikethrough decoration (<del> is not exercised here)."""
        blocks = parse_html_string("<p>This is <s>strikethrough text</s> here.</p>")
        self.assertEqual(len(blocks), 1)
        struck = self._words_of(blocks[0])[2]  # 'strikethrough'
        self.assertEqual(struck.style.decoration, TextDecoration.STRIKETHROUGH)
    def test_span_with_inline_styles(self):
        """A <span> with inline CSS makes its words bold and red."""
        html = '<p>This text is normal, but <span style="color: red; font-weight: bold;">this part is red and bold</span>.</p>'
        blocks = parse_html_string(html)
        self.assertEqual(len(blocks), 1)
        self.assertIsInstance(blocks[0], Paragraph)
        span_texts = ["this", "part", "is", "red", "and", "bold"]
        # Words whose text appears in the span and that came out bold
        styled_words = [
            word for word in self._words_of(blocks[0])
            if word.text in span_texts and word.style.weight == FontWeight.BOLD
        ]
        self.assertGreater(len(styled_words), 0, "Should have bold words in styled span")
        # At least one of those must also be red
        red_words = [word for word in styled_words if word.style.colour == (255, 0, 0)]
        self.assertGreater(len(red_words), 0, "Should have red colored words")
    def test_mixed_formatting(self):
        """Bold, italic and coloured runs can coexist in one paragraph (<mark> is present but not asserted)."""
        html = "<p>This paragraph contains <strong>bold</strong>, <em>italic</em>, <span style=\"color: blue;\">blue</span>, and <mark>highlighted</mark> text all together.</p>"
        blocks = parse_html_string(html)
        self.assertEqual(len(blocks), 1)
        self.assertIsInstance(blocks[0], Paragraph)
        words = self._words_of(blocks[0])
        bold_words = [word for word in words if word.style.weight == FontWeight.BOLD]
        self.assertGreater(len(bold_words), 0, "Should have bold words")
        italic_words = [word for word in words if word.style.style == FontStyle.ITALIC]
        self.assertGreater(len(italic_words), 0, "Should have italic words")
        blue_words = [word for word in words if word.style.colour == (0, 0, 255)]
        self.assertGreater(len(blue_words), 0, "Should have blue colored words")
    def test_nested_formatting(self):
        """Words inside <em> nested in <strong> are both bold and italic."""
        blocks = parse_html_string("<p>This has <strong>bold with <em>italic inside</em></strong> formatting.</p>")
        self.assertEqual(len(blocks), 1)
        bold_italic_words = [
            word for word in self._words_of(blocks[0])
            if word.style.weight == FontWeight.BOLD and word.style.style == FontStyle.ITALIC
        ]
        self.assertGreater(len(bold_italic_words), 0, "Should have words that are both bold and italic")
    def test_color_variations(self):
        """Hex and named CSS colours are both resolved to RGB tuples."""
        html = '<p><span style="color: #ff0000;">Hex red</span> and <span style="color: green;">Named green</span>.</p>'
        blocks = parse_html_string(html)
        self.assertEqual(len(blocks), 1)
        words = self._words_of(blocks[0])
        hex_red_words = [word for word in words if word.style.colour == (255, 0, 0)]
        self.assertGreater(len(hex_red_words), 0, "Should have hex red colored words")
        # NOTE(review): the CSS keyword "green" is #008000, i.e. (0, 128, 0);
        # (0, 255, 0) is "lime". Confirm this matches the parser's colour table.
        green_words = [word for word in words if word.style.colour == (0, 255, 0)]
        self.assertGreater(len(green_words), 0, "Should have green colored words")
 class TestHTMLBlockElements(unittest.TestCase):
    """Block-level HTML elements and the block types they are converted to."""
    def test_body_element(self):
        """Children of <body> are returned as top-level blocks."""
        parsed = parse_html_string("<body><p>Paragraph one.</p><p>Paragraph two.</p></body>")
        self.assertEqual(len(parsed), 2)
        for position in (0, 1):
            self.assertIsInstance(parsed[position], Paragraph)
    def test_div_container(self):
        """A <div> is transparent: its paragraphs are returned directly."""
        parsed = parse_html_string("<div><p>First paragraph.</p><p>Second paragraph.</p></div>")
        self.assertEqual(len(parsed), 2)
        for position in (0, 1):
            self.assertIsInstance(parsed[position], Paragraph)
    def test_headings(self):
        """<h1>..<h6> map to Heading blocks with the matching HeadingLevel."""
        html = "<h1>Heading 1</h1><h2>Heading 2</h2><h3>Heading 3</h3><h4>Heading 4</h4><h5>Heading 5</h5><h6>Heading 6</h6>"
        parsed = parse_html_string(html)
        self.assertEqual(len(parsed), 6)
        expected_levels = [HeadingLevel.H1, HeadingLevel.H2, HeadingLevel.H3,
                           HeadingLevel.H4, HeadingLevel.H5, HeadingLevel.H6]
        for heading, expected_level in zip(parsed, expected_levels):
            self.assertIsInstance(heading, Heading)
            self.assertEqual(heading.level, expected_level)
            heading_words = list(heading.words())
            self.assertEqual(len(heading_words), 2)  # "Heading" and number
            self.assertEqual(heading_words[0][1].text, "Heading")
    def test_blockquote(self):
        """<blockquote> becomes a Quote that contains the nested paragraph."""
        parsed = parse_html_string("<blockquote><p>This is a quoted paragraph.</p></blockquote>")
        self.assertEqual(len(parsed), 1)
        self.assertIsInstance(parsed[0], Quote)
        nested = list(parsed[0].blocks())
        self.assertEqual(len(nested), 1)
        self.assertIsInstance(nested[0], Paragraph)
    def test_preformatted_code(self):
        """<pre><code> becomes a CodeBlock with at least one line."""
        html = "<pre><code>function hello() {\n  console.log('Hello');\n}</code></pre>"
        parsed = parse_html_string(html)
        self.assertEqual(len(parsed), 1)
        self.assertIsInstance(parsed[0], CodeBlock)
        code_lines = list(parsed[0].lines())
        self.assertGreater(len(code_lines), 0)
    def test_unordered_list(self):
        """<ul> becomes an unordered HList with one item per <li>."""
        parsed = parse_html_string("<ul><li>First item</li><li>Second item</li><li>Third item</li></ul>")
        self.assertEqual(len(parsed), 1)
        self.assertIsInstance(parsed[0], HList)
        self.assertEqual(parsed[0].style, ListStyle.UNORDERED)
        list_items = list(parsed[0].items())
        self.assertEqual(len(list_items), 3)
    def test_ordered_list(self):
        """<ol> becomes an ordered HList."""
        parsed = parse_html_string("<ol><li>First item</li><li>Second item</li><li>Third item</li></ol>")
        self.assertEqual(len(parsed), 1)
        self.assertIsInstance(parsed[0], HList)
        self.assertEqual(parsed[0].style, ListStyle.ORDERED)
    def test_list_with_styled_content(self):
        """Inline styling inside <li> is carried through to the item's words."""
        html = "<ul><li>Normal item</li><li><strong>Bold item</strong></li><li>Item with <em>italic</em> text</li></ul>"
        parsed = parse_html_string(html)
        self.assertEqual(len(parsed), 1)
        self.assertIsInstance(parsed[0], HList)
        list_items = list(parsed[0].items())
        self.assertEqual(len(list_items), 3)
        # The bold check only runs when the second item produced any block;
        # an item without blocks passes silently.
        second_item_blocks = list(list_items[1].blocks())
        if second_item_blocks:
            bold_words = [
                word for _, word in second_item_blocks[0].words()
                if word.style.weight == FontWeight.BOLD
            ]
            self.assertGreater(len(bold_words), 0)
    def test_table_basic(self):
        """A <table> with header and data rows becomes a single Table block."""
        html = """
        <table>
            <tr>
                <th>Header 1</th>
                <th>Header 2</th>
            </tr>
            <tr>
                <td>Cell 1</td>
                <td>Cell 2</td>
            </tr>
        </table>
        """
        parsed = parse_html_string(html)
        self.assertEqual(len(parsed), 1)
        self.assertIsInstance(parsed[0], Table)
    def test_semantic_elements(self):
        """<section>/<article> act as containers: only the inner paragraph is returned."""
        parsed = parse_html_string("<section><article><p>Article content</p></article></section>")
        self.assertEqual(len(parsed), 1)
        self.assertIsInstance(parsed[0], Paragraph)
    def test_nested_block_elements(self):
        """A <div> with heading, paragraph and blockquote yields all three block kinds."""
        html = """
        <div>
            <h2>Section Title</h2>
            <p>Some introductory text.</p>
            <blockquote>
                <p>A quoted paragraph.</p>
            </blockquote>
        </div>
        """
        parsed = parse_html_string(html)
        self.assertGreater(len(parsed), 2)
        # Should have at least a heading, paragraph, and quote
        expectations = (
            (Heading, "Should contain a heading"),
            (Paragraph, "Should contain a paragraph"),
            (Quote, "Should contain a quote"),
        )
        for block_type, message in expectations:
            self.assertTrue(any(isinstance(block, block_type) for block in parsed), message)
    def test_empty_elements(self):
        """Empty and whitespace-only elements are parsed without raising."""
        parsed = parse_html_string("<p></p><div></div><span></span>")
        # Empty elements may not create blocks, which is acceptable behavior.
        # A length is never negative, so this assertion only proves the call returned.
        self.assertGreaterEqual(len(parsed), 0)
        # Same smoke check for a paragraph containing only whitespace
        parsed_whitespace = parse_html_string("<p> </p>")
        self.assertGreaterEqual(len(parsed_whitespace), 0)
 class TestHTMLComplexStructures(unittest.TestCase):
    """Larger HTML documents that combine several block and inline features."""
    def test_article_with_mixed_content(self):
        """An <article> with heading, paragraph, quote and list yields all four block kinds."""
        html = """
        <article>
            <h1>Article Title</h1>
            <p>This is the <strong>introduction</strong> paragraph with <em>some emphasis</em>.</p>
            <blockquote>
                <p>This is a <span style="color: blue;">quoted section</span> with styling.</p>
            </blockquote>
            <ul>
                <li>First <strong>important</strong> point</li>
                <li>Second point with <code>inline code</code></li>
            </ul>
        </article>
        """
        parsed = parse_html_string(html)
        self.assertGreater(len(parsed), 3)
        # Compare by class name so the check does not depend on inheritance between block types
        type_names = [type(block).__name__ for block in parsed]
        for expected_name in ('Heading', 'Paragraph', 'Quote', 'HList'):
            self.assertIn(expected_name, type_names)
    def test_styled_table_content(self):
        """A table with <thead>/<tbody> and styled cells still becomes one Table block."""
        html = """
        <table>
            <thead>
                <tr>
                    <th><strong>Product</strong></th>
                    <th><em>Price</em></th>
                </tr>
            </thead>
            <tbody>
                <tr>
                    <td>Item with <span style="color: red;">red text</span></td>
                    <td><strong>$19.99</strong></td>
                </tr>
            </tbody>
        </table>
        """
        parsed = parse_html_string(html)
        self.assertEqual(len(parsed), 1)
        self.assertIsInstance(parsed[0], Table)
 # Allow running this test module directly as a script
 if __name__ == '__main__':
    unittest.main()
--- a/tests/test_html_text.py
+++ b/tests/test_html_text.py
@ -8,7 +8,7 @@ import unittest
 from unittest.mock import Mock, MagicMock
 from pyWebLayout.io.readers.html_text import HTMLTextProcessor
 from pyWebLayout.io.readers.html_style import HTMLStyleManager
-from pyWebLayout.abstract.block import Parapgraph
+from pyWebLayout.abstract.block import Paragraph
 from pyWebLayout.abstract.inline import Word
@ -21,7 +21,7 @@ class TestHTMLTextProcessor(unittest.TestCase):
        self.text_processor = HTMLTextProcessor(self.style_manager)
        # Create a mock paragraph
-        self.mock_paragraph = Mock(spec=Parapgraph)
+        self.mock_paragraph = Mock(spec=Paragraph)
        self.mock_paragraph.add_word = Mock()
    def test_initialization(self):