Article content
diff --git a/pyWebLayout/abstract/__init__.py b/pyWebLayout/abstract/__init__.py index c8b030f..0fb2ef7 100644 --- a/pyWebLayout/abstract/__init__.py +++ b/pyWebLayout/abstract/__init__.py @@ -1,6 +1,6 @@ from .block import Block, BlockType, Paragraph, Heading, HeadingLevel, Quote, CodeBlock from .block import HList, ListItem, ListStyle, Table, TableRow, TableCell -from .block import HorizontalRule, LineBreak, Image +#from .block import HorizontalRule, LineBreak, Image from .inline import Word, FormattedSpan from .document import Document, MetadataType, Chapter, Book from .functional import Link, LinkType, Button, Form, FormField, FormFieldType diff --git a/pyWebLayout/abstract/block.py b/pyWebLayout/abstract/block.py index 2a05bf7..5ab98e0 100644 --- a/pyWebLayout/abstract/block.py +++ b/pyWebLayout/abstract/block.py @@ -183,6 +183,10 @@ class Paragraph(Block): def word_count(self) -> int: """Get the number of words in this paragraph""" return len(self._words) + + def __len__(self): + + return self.word_count class HeadingLevel(Enum): @@ -1008,3 +1012,9 @@ class Table(Block): self._footer_rows.append(row) else: # Default to body self._rows + + + +class Image: + + pass \ No newline at end of file diff --git a/pyWebLayout/abstract/inline.py b/pyWebLayout/abstract/inline.py index 857dd9f..7081612 100644 --- a/pyWebLayout/abstract/inline.py +++ b/pyWebLayout/abstract/inline.py @@ -330,3 +330,8 @@ class FormattedSpan: self._words.append(word) return word + + +class LineBreak: + + pass \ No newline at end of file diff --git a/pyWebLayout/html_parser.py b/pyWebLayout/html_parser.py index bc987c3..61b40a0 100644 --- a/pyWebLayout/html_parser.py +++ b/pyWebLayout/html_parser.py @@ -7,7 +7,7 @@ from PIL import Image from .style import Font, FontStyle, FontWeight, TextDecoration from .abstract.document import Document, MetadataType, Book, Chapter from .abstract.block import ( - Block, BlockType, Parapgraph, Heading, HeadingLevel, Quote, CodeBlock, + Block, BlockType, Paragraph, Heading, HeadingLevel, Quote, CodeBlock, HList, ListStyle, ListItem, Table, TableRow, TableCell, HorizontalRule, LineBreak ) from .abstract.inline import Word, FormattedSpan @@ -138,7 +138,7 @@ class HTMLParser(BaseHTMLParser): elif tag == 'p': self._flush_text() # Flush any pending text - self._current_paragraph = Parapgraph() + self._current_paragraph = Paragraph() # Add the paragraph to the current block or document if self._current_block and hasattr(self._current_block, 'add_block'): @@ -180,7 +180,7 @@ class HTMLParser(BaseHTMLParser): self._flush_text() # Flush any pending text # For divs, we create a new paragraph as a container - div_para = Parapgraph() + div_para = Paragraph() # Add the div to the current block or document if self._current_block and hasattr(self._current_block, 'add_block'): @@ -214,7 +214,7 @@ class HTMLParser(BaseHTMLParser): # Pre can optionally contain a code block # We'll create a paragraph for now, and if we find a code tag inside, # we'll replace it with a code block - pre_para = Parapgraph() + pre_para = Paragraph() # Add the pre to the current block or document if self._current_block and hasattr(self._current_block, 'add_block'): @@ -229,7 +229,7 @@ class HTMLParser(BaseHTMLParser): elif tag == 'code': # If we're inside a pre, replace the paragraph with a code block - if self._block_stack and isinstance(self._block_stack[-1], Parapgraph): + if self._block_stack and isinstance(self._block_stack[-1], Paragraph): pre_para = self._block_stack.pop() # Get the language from class if specified (e.g., class="language-python") @@ -312,7 +312,7 @@ class HTMLParser(BaseHTMLParser): self._current_block = list_item # Create a paragraph for the term content - term_para = Parapgraph() + term_para = Paragraph() list_item.add_block(term_para) self._current_paragraph = term_para @@ -325,7 +325,7 @@ class HTMLParser(BaseHTMLParser): list_item = current_list._items[-1] # Create a paragraph for the description content - desc_para = Parapgraph() + desc_para = Paragraph() list_item.add_block(desc_para) # Update current state @@ -340,7 +340,7 @@ class HTMLParser(BaseHTMLParser): self._current_block = list_item # Create a paragraph for the description content - desc_para = Parapgraph() + desc_para = Paragraph() list_item.add_block(desc_para) self._current_paragraph = desc_para @@ -424,7 +424,7 @@ class HTMLParser(BaseHTMLParser): self._current_block = cell # Create a paragraph for the cell content - cell_para = Parapgraph() + cell_para = Paragraph() cell.add_block(cell_para) self._current_paragraph = cell_para @@ -508,6 +508,7 @@ class HTMLParser(BaseHTMLParser): }) elif tag == 'br': + # Add a line break if self._current_paragraph: line_break = LineBreak() diff --git a/pyWebLayout/io/readers/epub_reader.py b/pyWebLayout/io/readers/epub_reader.py index 2c1ca27..44e938f 100644 --- a/pyWebLayout/io/readers/epub_reader.py +++ b/pyWebLayout/io/readers/epub_reader.py @@ -379,10 +379,10 @@ class EPUBReader: except Exception as e: print(f"Error parsing chapter {i+1}: {str(e)}") # Add an error message block - from pyWebLayout.abstract.block import Parapgraph + from pyWebLayout.abstract.block import Paragraph from pyWebLayout.abstract.inline import Word from pyWebLayout.style import Font - error_para = Parapgraph() + error_para = Paragraph() # Create a default font style for the error message default_font = Font() error_para.add_word(Word(f"Error loading chapter: {str(e)}", default_font)) diff --git a/pyWebLayout/io/readers/html_elements.py b/pyWebLayout/io/readers/html_elements.py index 93e3240..0f3400e 100644 --- a/pyWebLayout/io/readers/html_elements.py +++ b/pyWebLayout/io/readers/html_elements.py @@ -9,9 +9,9 @@ from typing import Dict, List, Optional, Any import urllib.parse from pyWebLayout.abstract.document import Document from pyWebLayout.abstract.block import ( - Block, Parapgraph, Heading, HeadingLevel, Quote, CodeBlock, + Block, Paragraph, Heading, HeadingLevel, Quote, CodeBlock, HList, ListStyle, ListItem, Table, TableRow, TableCell, - HorizontalRule, LineBreak, Image + #HorizontalRule, LineBreak, Image ) from pyWebLayout.abstract.functional import Link, LinkType from pyWebLayout.io.readers.html_style import HTMLStyleManager @@ -26,7 +26,7 @@ class BlockElementHandler: self.text_processor = text_processor self.block_stack: List[Block] = [] self.current_block: Optional[Block] = None - self.current_paragraph: Optional[Parapgraph] = None + self.current_paragraph: Optional[Paragraph] = None def reset(self): """Reset the handler state.""" @@ -44,7 +44,7 @@ class BlockElementHandler: def handle_paragraph_start(self, document: Document): """Handle the start of a paragraph element.""" self.text_processor.flush_text() - paragraph = Parapgraph() + paragraph = Paragraph() self.add_block_to_document_or_parent(paragraph, document) self.block_stack.append(paragraph) @@ -71,7 +71,7 @@ class BlockElementHandler: def handle_div_start(self, document: Document): """Handle the start of a div element.""" self.text_processor.flush_text() - div_para = Parapgraph() + div_para = Paragraph() self.add_block_to_document_or_parent(div_para, document) self.block_stack.append(div_para) @@ -93,7 +93,7 @@ class BlockElementHandler: def handle_pre_start(self, document: Document): """Handle the start of a pre element.""" self.text_processor.flush_text() - pre_para = Parapgraph() + pre_para = Paragraph() self.add_block_to_document_or_parent(pre_para, document) self.block_stack.append(pre_para) @@ -104,7 +104,7 @@ class BlockElementHandler: def handle_code_start(self, attrs: Dict[str, str], document: Document): """Handle the start of a code element.""" # If we're inside a pre, replace the paragraph with a code block - if self.block_stack and isinstance(self.block_stack[-1], Parapgraph): + if self.block_stack and isinstance(self.block_stack[-1], Paragraph): pre_para = self.block_stack.pop() # Get the language from class if specified @@ -145,7 +145,7 @@ class BlockElementHandler: if self.block_stack: self.current_block = self.block_stack[-1] # Update current paragraph based on block type - if isinstance(self.current_block, Parapgraph): + if isinstance(self.current_block, Paragraph): self.current_paragraph = self.current_block else: self.current_paragraph = None @@ -201,7 +201,7 @@ class ListElementHandler: block_handler.current_block = list_item # Create a paragraph for the list item content - item_para = Parapgraph() + item_para = Paragraph() list_item.add_block(item_para) block_handler.current_paragraph = item_para self.text_processor.set_current_paragraph(item_para) @@ -220,7 +220,7 @@ class ListElementHandler: block_handler.block_stack.append(list_item) block_handler.current_block = list_item - term_para = Parapgraph() + term_para = Paragraph() list_item.add_block(term_para) block_handler.current_paragraph = term_para self.text_processor.set_current_paragraph(term_para) @@ -228,7 +228,7 @@ class ListElementHandler: elif tag == 'dd': if current_list._items: list_item = current_list._items[-1] - desc_para = Parapgraph() + desc_para = Paragraph() list_item.add_block(desc_para) block_handler.current_paragraph = desc_para self.text_processor.set_current_paragraph(desc_para) @@ -339,7 +339,7 @@ class TableElementHandler: block_handler.current_block = cell # Create a paragraph for the cell content - cell_para = Parapgraph() + cell_para = Paragraph() cell.add_block(cell_para) block_handler.current_paragraph = cell_para self.text_processor.set_current_paragraph(cell_para) diff --git a/pyWebLayout/io/readers/html_extraction.py b/pyWebLayout/io/readers/html_extraction.py new file mode 100644 index 0000000..6e359a3 --- /dev/null +++ b/pyWebLayout/io/readers/html_extraction.py @@ -0,0 +1,736 @@ +""" +HTML extraction module for converting HTML elements to pyWebLayout abstract elements. + +This module provides handler functions for converting HTML elements into the abstract document structure +used by pyWebLayout, including paragraphs, headings, lists, tables, and inline formatting. +Each handler function has a robust signature that handles style hints, CSS classes, and attributes. +""" + +import re +from typing import List, Dict, Any, Optional, Union, Callable, Tuple, NamedTuple +from bs4 import BeautifulSoup, Tag, NavigableString +from pyWebLayout.abstract.inline import Word, FormattedSpan +from pyWebLayout.abstract.block import ( + Block, Paragraph, Heading, HeadingLevel, Quote, CodeBlock, + HList, ListItem, ListStyle, Table, TableRow, TableCell +) +from pyWebLayout.style import Font, FontWeight, FontStyle, TextDecoration + + +class StyleContext(NamedTuple): + """ + Immutable style context passed to handler functions. + Contains all styling information including inherited styles, CSS hints, and element attributes. + """ + font: Font + background: Optional[Tuple[int, int, int, int]] + css_classes: set + css_styles: Dict[str, str] + element_attributes: Dict[str, Any] + parent_elements: List[str] # Stack of parent element names + + def with_font(self, font: Font) -> 'StyleContext': + """Create new context with modified font.""" + return self._replace(font=font) + + def with_background(self, background: Optional[Tuple[int, int, int, int]]) -> 'StyleContext': + """Create new context with modified background.""" + return self._replace(background=background) + + def with_css_classes(self, css_classes: set) -> 'StyleContext': + """Create new context with modified CSS classes.""" + return self._replace(css_classes=css_classes) + + def with_css_styles(self, css_styles: Dict[str, str]) -> 'StyleContext': + """Create new context with modified CSS styles.""" + return self._replace(css_styles=css_styles) + + def with_attributes(self, attributes: Dict[str, Any]) -> 'StyleContext': + """Create new context with modified element attributes.""" + return self._replace(element_attributes=attributes) + + def push_element(self, element_name: str) -> 'StyleContext': + """Create new context with element pushed onto parent stack.""" + return self._replace(parent_elements=self.parent_elements + [element_name]) + + +def create_base_context(base_font: Optional[Font] = None) -> StyleContext: + """ + Create a base style context with default values. + + Args: + base_font: Base font to use, defaults to system default + + Returns: + StyleContext with default values + """ + return StyleContext( + font=base_font or Font(), + background=None, + css_classes=set(), + css_styles={}, + element_attributes={}, + parent_elements=[] + ) + + +def apply_element_styling(context: StyleContext, element: Tag) -> StyleContext: + """ + Apply element-specific styling to context based on HTML element and attributes. + + Args: + context: Current style context + element: BeautifulSoup Tag object + + Returns: + New StyleContext with applied styling + """ + tag_name = element.name.lower() + attributes = dict(element.attrs) if element.attrs else {} + + # Start with current context + new_context = context.with_attributes(attributes).push_element(tag_name) + + # Apply CSS classes + css_classes = new_context.css_classes.copy() + if 'class' in attributes: + classes = attributes['class'].split() if isinstance(attributes['class'], str) else attributes['class'] + css_classes.update(classes) + new_context = new_context.with_css_classes(css_classes) + + # Apply inline styles + css_styles = new_context.css_styles.copy() + if 'style' in attributes: + inline_styles = parse_inline_styles(attributes['style']) + css_styles.update(inline_styles) + new_context = new_context.with_css_styles(css_styles) + + # Apply element-specific default styles + font = apply_element_font_styles(new_context.font, tag_name, css_styles) + new_context = new_context.with_font(font) + + # Apply background from styles + background = apply_background_styles(new_context.background, css_styles) + new_context = new_context.with_background(background) + + return new_context + + +def parse_inline_styles(style_text: str) -> Dict[str, str]: + """ + Parse CSS inline styles into dictionary. + + Args: + style_text: CSS style text (e.g., "color: red; font-weight: bold;") + + Returns: + Dictionary of CSS property-value pairs + """ + styles = {} + for declaration in style_text.split(';'): + if ':' in declaration: + prop, value = declaration.split(':', 1) + styles[prop.strip().lower()] = value.strip() + return styles + + +def apply_element_font_styles(font: Font, tag_name: str, css_styles: Dict[str, str]) -> Font: + """ + Apply font styling based on HTML element and CSS styles. + + Args: + font: Current font + tag_name: HTML tag name + css_styles: CSS styles dictionary + + Returns: + New Font object with applied styling + """ + # Default element styles + element_font_styles = { + 'b': {'weight': FontWeight.BOLD}, + 'strong': {'weight': FontWeight.BOLD}, + 'i': {'style': FontStyle.ITALIC}, + 'em': {'style': FontStyle.ITALIC}, + 'u': {'decoration': TextDecoration.UNDERLINE}, + 's': {'decoration': TextDecoration.STRIKETHROUGH}, + 'del': {'decoration': TextDecoration.STRIKETHROUGH}, + 'h1': {'size': 24, 'weight': FontWeight.BOLD}, + 'h2': {'size': 20, 'weight': FontWeight.BOLD}, + 'h3': {'size': 18, 'weight': FontWeight.BOLD}, + 'h4': {'size': 16, 'weight': FontWeight.BOLD}, + 'h5': {'size': 14, 'weight': FontWeight.BOLD}, + 'h6': {'size': 12, 'weight': FontWeight.BOLD}, + } + + # Start with current font properties + font_size = font.font_size + colour = font.colour + weight = font.weight + style = font.style + decoration = font.decoration + background = font.background + language = font.language + + # Apply element default styles + if tag_name in element_font_styles: + elem_styles = element_font_styles[tag_name] + if 'size' in elem_styles: + font_size = elem_styles['size'] + if 'weight' in elem_styles: + weight = elem_styles['weight'] + if 'style' in elem_styles: + style = elem_styles['style'] + if 'decoration' in elem_styles: + decoration = elem_styles['decoration'] + + # Apply CSS styles (override element defaults) + if 'font-size' in css_styles: + # Parse font-size (simplified - could be enhanced) + size_value = css_styles['font-size'].lower() + if size_value.endswith('px'): + try: + font_size = int(float(size_value[:-2])) + except ValueError: + pass + elif size_value.endswith('pt'): + try: + font_size = int(float(size_value[:-2])) + except ValueError: + pass + + if 'font-weight' in css_styles: + weight_value = css_styles['font-weight'].lower() + if weight_value in ['bold', '700', '800', '900']: + weight = FontWeight.BOLD + elif weight_value in ['normal', '400']: + weight = FontWeight.NORMAL + + if 'font-style' in css_styles: + style_value = css_styles['font-style'].lower() + if style_value == 'italic': + style = FontStyle.ITALIC + elif style_value == 'normal': + style = FontStyle.NORMAL + + if 'text-decoration' in css_styles: + decoration_value = css_styles['text-decoration'].lower() + if 'underline' in decoration_value: + decoration = TextDecoration.UNDERLINE + elif 'line-through' in decoration_value: + decoration = TextDecoration.STRIKETHROUGH + elif 'none' in decoration_value: + decoration = TextDecoration.NONE + + if 'color' in css_styles: + # Parse color (simplified - could be enhanced for hex, rgb, etc.) + color_value = css_styles['color'].lower() + color_map = { + 'black': (0, 0, 0), + 'white': (255, 255, 255), + 'red': (255, 0, 0), + 'green': (0, 255, 0), + 'blue': (0, 0, 255), + } + if color_value in color_map: + colour = color_map[color_value] + elif color_value.startswith('#') and len(color_value) == 7: + try: + r = int(color_value[1:3], 16) + g = int(color_value[3:5], 16) + b = int(color_value[5:7], 16) + colour = (r, g, b) + except ValueError: + pass + + return Font( + font_path=font._font_path, + font_size=font_size, + colour=colour, + weight=weight, + style=style, + decoration=decoration, + background=background, + langauge=language + ) + + +def apply_background_styles(current_background: Optional[Tuple[int, int, int, int]], + css_styles: Dict[str, str]) -> Optional[Tuple[int, int, int, int]]: + """ + Apply background styling from CSS. + + Args: + current_background: Current background color (RGBA) + css_styles: CSS styles dictionary + + Returns: + New background color or None + """ + if 'background-color' in css_styles: + bg_value = css_styles['background-color'].lower() + if bg_value == 'transparent': + return None + # Add color parsing logic here if needed + + return current_background + + +def extract_text_content(element: Tag, context: StyleContext) -> List[Word]: + """ + Extract text content from an element, handling inline formatting. + + Args: + element: BeautifulSoup Tag object + context: Current style context + + Returns: + List of Word objects + """ + words = [] + + for child in element.children: + if isinstance(child, NavigableString): + # Plain text - split into words + text = str(child).strip() + if text: + word_texts = text.split() + for word_text in word_texts: + if word_text: + words.append(Word(word_text, context.font, context.background)) + elif isinstance(child, Tag): + # Process inline elements + if child.name.lower() in ['span', 'a', 'strong', 'b', 'em', 'i', 'u', 's', 'del', 'ins', 'mark', 'small', 'sub', 'sup', 'code', 'q', 'cite', 'abbr', 'time']: + child_context = apply_element_styling(context, child) + child_words = extract_text_content(child, child_context) + words.extend(child_words) + else: + # Block element - shouldn't happen in well-formed HTML but handle gracefully + child_context = apply_element_styling(context, child) + child_result = process_element(child, child_context) + if isinstance(child_result, list): + for block in child_result: + if isinstance(block, Paragraph): + for _, word in block.words(): + words.append(word) + elif isinstance(child_result, Paragraph): + for _, word in child_result.words(): + words.append(word) + + return words + + +def process_element(element: Tag, context: StyleContext) -> Union[Block, List[Block], None]: + """ + Process a single HTML element using appropriate handler. + + Args: + element: BeautifulSoup Tag object + context: Current style context + + Returns: + Block object(s) or None if element should be ignored + """ + tag_name = element.name.lower() + handler = HANDLERS.get(tag_name, generic_handler) + return handler(element, context) + + +# Handler function signatures: +# All handlers receive (element: Tag, context: StyleContext) -> Union[Block, List[Block], None] + +def paragraph_handler(element: Tag, context: StyleContext) -> Paragraph: + """Handle
elements.""" + paragraph = Paragraph(context.font) + words = extract_text_content(element, context) + for word in words: + paragraph.add_word(word) + return paragraph + + +def div_handler(element: Tag, context: StyleContext) -> List[Block]: + """Handle
elements."""
+ quote = Quote(context.font)
+ for child in element.children:
+ if isinstance(child, Tag):
+ child_context = apply_element_styling(context, child)
+ result = process_element(child, child_context)
+ if result:
+ if isinstance(result, list):
+ for block in result:
+ quote.add_block(block)
+ else:
+ quote.add_block(result)
+ return quote
+
+
+def preformatted_handler(element: Tag, context: StyleContext) -> CodeBlock:
+ """Handle elements."""
+ language = context.element_attributes.get('data-language', '')
+ code_block = CodeBlock(language)
+
+ # Preserve whitespace and line breaks in preformatted text
+ text = element.get_text(separator='\n', strip=False)
+ for line in text.split('\n'):
+ code_block.add_line(line)
+
+ return code_block
+
+
+def code_handler(element: Tag, context: StyleContext) -> Union[CodeBlock, None]:
+ """Handle elements."""
+ # If parent is , this is handled by preformatted_handler
+ if context.parent_elements and context.parent_elements[-1] == 'pre':
+ return None # Will be handled by parent
+
+ # Inline code - handled during text extraction
+ return None
+
+
+def unordered_list_handler(element: Tag, context: StyleContext) -> HList:
+ """Handle
elements."""
+ hlist = HList(ListStyle.UNORDERED, context.font)
+ for child in element.children:
+ if isinstance(child, Tag) and child.name.lower() == 'li':
+ child_context = apply_element_styling(context, child)
+ item = process_element(child, child_context)
+ if item:
+ hlist.add_item(item)
+ return hlist
+
+
+def ordered_list_handler(element: Tag, context: StyleContext) -> HList:
+ """Handle elements."""
+ hlist = HList(ListStyle.ORDERED, context.font)
+ for child in element.children:
+ if isinstance(child, Tag) and child.name.lower() == 'li':
+ child_context = apply_element_styling(context, child)
+ item = process_element(child, child_context)
+ if item:
+ hlist.add_item(item)
+ return hlist
+
+
+def list_item_handler(element: Tag, context: StyleContext) -> ListItem:
+ """Handle - elements."""
+ list_item = ListItem(None, context.font)
+
+ for child in element.children:
+ if isinstance(child, Tag):
+ child_context = apply_element_styling(context, child)
+ result = process_element(child, child_context)
+ if result:
+ if isinstance(result, list):
+ for block in result:
+ list_item.add_block(block)
+ else:
+ list_item.add_block(result)
+ elif isinstance(child, NavigableString):
+ # Direct text in list item - create paragraph
+ text = str(child).strip()
+ if text:
+ paragraph = Paragraph(context.font)
+ words = text.split()
+ for word_text in words:
+ if word_text:
+ paragraph.add_word(Word(word_text, context.font))
+ list_item.add_block(paragraph)
+
+ return list_item
+
+
+def table_handler(element: Tag, context: StyleContext) -> Table:
+ """Handle
elements."""
+ caption = None
+ caption_elem = element.find('caption')
+ if caption_elem:
+ caption = caption_elem.get_text(strip=True)
+
+ table = Table(caption, context.font)
+
+ # Process table rows
+ for child in element.children:
+ if isinstance(child, Tag):
+ if child.name.lower() == 'tr':
+ child_context = apply_element_styling(context, child)
+ row = process_element(child, child_context)
+ if row:
+ table.add_row(row)
+ elif child.name.lower() in ['thead', 'tbody', 'tfoot']:
+ section = 'header' if child.name.lower() == 'thead' else 'body'
+ section = 'footer' if child.name.lower() == 'tfoot' else section
+
+ for row_elem in child.find_all('tr'):
+ child_context = apply_element_styling(context, row_elem)
+ row = process_element(row_elem, child_context)
+ if row:
+ table.add_row(row, section)
+
+ return table
+
+
+def table_row_handler(element: Tag, context: StyleContext) -> TableRow:
+ """Handle elements."""
+ row = TableRow(context.font)
+ for child in element.children:
+ if isinstance(child, Tag) and child.name.lower() in ['td', 'th']:
+ child_context = apply_element_styling(context, child)
+ cell = process_element(child, child_context)
+ if cell:
+ row.add_cell(cell)
+ return row
+
+
+def table_cell_handler(element: Tag, context: StyleContext) -> TableCell:
+ """Handle elements."""
+ colspan = int(context.element_attributes.get('colspan', 1))
+ rowspan = int(context.element_attributes.get('rowspan', 1))
+ cell = TableCell(False, colspan, rowspan, context.font)
+
+ # Process cell content
+ for child in element.children:
+ if isinstance(child, Tag):
+ child_context = apply_element_styling(context, child)
+ result = process_element(child, child_context)
+ if result:
+ if isinstance(result, list):
+ for block in result:
+ cell.add_block(block)
+ else:
+ cell.add_block(result)
+ elif isinstance(child, NavigableString):
+ # Direct text in cell - create paragraph
+ text = str(child).strip()
+ if text:
+ paragraph = Paragraph(context.font)
+ words = text.split()
+ for word_text in words:
+ if word_text:
+ paragraph.add_word(Word(word_text, context.font))
+ cell.add_block(paragraph)
+
+ return cell
+
+
+def table_header_cell_handler(element: Tag, context: StyleContext) -> TableCell:
+ """Handle elements."""
+ colspan = int(context.element_attributes.get('colspan', 1))
+ rowspan = int(context.element_attributes.get('rowspan', 1))
+ cell = TableCell(True, colspan, rowspan, context.font)
+
+ # Process cell content (same as td)
+ for child in element.children:
+ if isinstance(child, Tag):
+ child_context = apply_element_styling(context, child)
+ result = process_element(child, child_context)
+ if result:
+ if isinstance(result, list):
+ for block in result:
+ cell.add_block(block)
+ else:
+ cell.add_block(result)
+ elif isinstance(child, NavigableString):
+ text = str(child).strip()
+ if text:
+ paragraph = Paragraph(context.font)
+ words = text.split()
+ for word_text in words:
+ if word_text:
+ paragraph.add_word(Word(word_text, context.font))
+ cell.add_block(paragraph)
+
+ return cell
+
+
+def horizontal_rule_handler(element: Tag, context: StyleContext) -> Block:
+ """Handle
elements."""
+ # TODO: Create a specific HorizontalRule block type
+ # For now, return an empty paragraph
+ return Paragraph(context.font)
+
+
+def line_break_handler(element: Tag, context: StyleContext) -> None:
+ """Handle
elements."""
+ # Line breaks are typically handled at the paragraph level
+ return None
+
+
+def image_handler(element: Tag, context: StyleContext) -> Block:
+ """Handle
elements."""
+ # TODO: Create Image block type
+ # For now, return empty paragraph with alt text if available
+ paragraph = Paragraph(context.font)
+ alt_text = context.element_attributes.get('alt', '')
+ if alt_text:
+ words = alt_text.split()
+ for word_text in words:
+ if word_text:
+ paragraph.add_word(Word(word_text, context.font))
+ return paragraph
+
+
+def ignore_handler(element: Tag, context: StyleContext) -> None:
+ """Handle elements that should be ignored."""
+ return None
+
+
+def generic_handler(element: Tag, context: StyleContext) -> List[Block]:
+ """Handle unknown elements as generic containers."""
+ return div_handler(element, context)
+
+
+# Handler registry - maps HTML tag names to handler functions
+HANDLERS: Dict[str, Callable[[Tag, StyleContext], Union[Block, List[Block], None]]] = {
+ # Block elements
+ 'p': paragraph_handler,
+ 'div': div_handler,
+ 'h1': heading_handler,
+ 'h2': heading_handler,
+ 'h3': heading_handler,
+ 'h4': heading_handler,
+ 'h5': heading_handler,
+ 'h6': heading_handler,
+ 'blockquote': blockquote_handler,
+ 'pre': preformatted_handler,
+ 'code': code_handler,
+ 'ul': unordered_list_handler,
+ 'ol': ordered_list_handler,
+ 'li': list_item_handler,
+ 'table': table_handler,
+ 'tr': table_row_handler,
+ 'td': table_cell_handler,
+ 'th': table_header_cell_handler,
+ 'hr': horizontal_rule_handler,
+ 'br': line_break_handler,
+
+ # Semantic elements (treated as containers)
+ 'section': div_handler,
+ 'article': div_handler,
+ 'aside': div_handler,
+ 'nav': div_handler,
+ 'header': div_handler,
+ 'footer': div_handler,
+ 'main': div_handler,
+ 'figure': div_handler,
+ 'figcaption': paragraph_handler,
+
+ # Media elements
+ 'img': image_handler,
+
+ # Inline elements (handled during text extraction)
+ 'span': ignore_handler,
+ 'a': ignore_handler,
+ 'strong': ignore_handler,
+ 'b': ignore_handler,
+ 'em': ignore_handler,
+ 'i': ignore_handler,
+ 'u': ignore_handler,
+ 's': ignore_handler,
+ 'del': ignore_handler,
+ 'ins': ignore_handler,
+ 'mark': ignore_handler,
+ 'small': ignore_handler,
+ 'sub': ignore_handler,
+ 'sup': ignore_handler,
+ 'q': ignore_handler,
+ 'cite': ignore_handler,
+ 'abbr': ignore_handler,
+ 'time': ignore_handler,
+
+ # Ignored elements
+ 'script': ignore_handler,
+ 'style': ignore_handler,
+ 'meta': ignore_handler,
+ 'link': ignore_handler,
+ 'head': ignore_handler,
+ 'title': ignore_handler,
+}
+
+
+def parse_html_string(html_string: str, base_font: Optional[Font] = None) -> List[Block]:
+ """
+ Parse HTML string and return list of Block objects.
+
+ Args:
+ html_string: HTML content to parse
+ base_font: Base font for styling, defaults to system default
+
+ Returns:
+ List of Block objects representing the document structure
+ """
+ soup = BeautifulSoup(html_string, 'html.parser')
+ context = create_base_context(base_font)
+ blocks = []
+
+ # Process the body if it exists, otherwise process all top-level elements
+ root_element = soup.find('body') or soup
+
+ for element in root_element.children:
+ if isinstance(element, Tag):
+ element_context = apply_element_styling(context, element)
+ result = process_element(element, element_context)
+ if result:
+ if isinstance(result, list):
+ blocks.extend(result)
+ else:
+ blocks.append(result)
+
+ return blocks
+
+
+def register_handler(tag_name: str, handler: Callable[[Tag, StyleContext], Union[Block, List[Block], None]]):
+ """
+ Register a custom handler for an HTML tag.
+
+ Args:
+ tag_name: HTML tag name (lowercase)
+ handler: Handler function with signature (element: Tag, context: StyleContext) -> Union[Block, List[Block], None]
+ """
+ HANDLERS[tag_name] = handler
+
+
+def get_handler(tag_name: str) -> Callable[[Tag, StyleContext], Union[Block, List[Block], None]]:
+ """
+ Get handler function for HTML tag.
+
+ Args:
+ tag_name: HTML tag name (lowercase)
+
+ Returns:
+ Handler function or generic_handler if tag not found
+ """
+ return HANDLERS.get(tag_name.lower(), generic_handler)
diff --git a/pyWebLayout/io/readers/html_text.py b/pyWebLayout/io/readers/html_text.py
index d59ccef..7864148 100644
--- a/pyWebLayout/io/readers/html_text.py
+++ b/pyWebLayout/io/readers/html_text.py
@@ -7,7 +7,7 @@ entity references, and word creation in HTML documents.
from typing import Optional
from pyWebLayout.abstract.inline import Word
-from pyWebLayout.abstract.block import Parapgraph
+from pyWebLayout.abstract.block import Paragraph
from pyWebLayout.io.readers.html_style import HTMLStyleManager
@@ -28,14 +28,14 @@ class HTMLTextProcessor:
"""
self._style_manager = style_manager
self._text_buffer = ""
- self._current_paragraph: Optional[Parapgraph] = None
+ self._current_paragraph: Optional[Paragraph] = None
def reset(self):
"""Reset the text processor state."""
self._text_buffer = ""
self._current_paragraph = None
- def set_current_paragraph(self, paragraph: Optional[Parapgraph]):
+ def set_current_paragraph(self, paragraph: Optional[Paragraph]):
"""
Set the current paragraph for text output.
diff --git a/pyWebLayout/typesetting/document_pagination.py b/pyWebLayout/typesetting/document_pagination.py
index 764e49f..6352604 100644
--- a/pyWebLayout/typesetting/document_pagination.py
+++ b/pyWebLayout/typesetting/document_pagination.py
@@ -139,7 +139,7 @@ class DocumentPaginator:
for chapter in self.document.chapters:
# Add a heading block for the chapter if it has a title
if chapter.title:
- from pyWebLayout.abstract.block import Heading, HeadingLevel, Parapgraph
+ from pyWebLayout.abstract.block import Heading, HeadingLevel, Paragraph
from pyWebLayout.abstract.inline import Word
# Create a heading for the chapter
diff --git a/tests/test_abstract_blocks.py b/tests/test_abstract_blocks.py
index 8697d8e..22c3f2b 100644
--- a/tests/test_abstract_blocks.py
+++ b/tests/test_abstract_blocks.py
@@ -6,7 +6,7 @@ Tests the core abstract block classes that form the foundation of the document m
import unittest
from pyWebLayout.abstract.block import (
- Block, BlockType, Parapgraph, Heading, HeadingLevel, Quote, CodeBlock,
+ Block, BlockType, Paragraph, Heading, HeadingLevel, Quote, CodeBlock,
HList, ListStyle, ListItem, Table, TableRow, TableCell,
HorizontalRule, LineBreak, Image
)
@@ -19,7 +19,7 @@ class TestBlockElements(unittest.TestCase):
def test_paragraph_creation(self):
"""Test creating and using paragraphs."""
- paragraph = Parapgraph()
+ paragraph = Paragraph()
self.assertEqual(paragraph.block_type, BlockType.PARAGRAPH)
self.assertEqual(paragraph.word_count, 0)
@@ -62,8 +62,8 @@ class TestBlockElements(unittest.TestCase):
quote = Quote()
# Add nested paragraphs
- p1 = Parapgraph()
- p2 = Parapgraph()
+ p1 = Paragraph()
+ p2 = Paragraph()
quote.add_block(p1)
quote.add_block(p2)
diff --git a/tests/test_abstract_document.py b/tests/test_abstract_document.py
index 68d8de3..e379901 100644
--- a/tests/test_abstract_document.py
+++ b/tests/test_abstract_document.py
@@ -7,7 +7,7 @@ document structure and metadata management.
import unittest
from pyWebLayout.abstract.document import Document, Chapter, Book, MetadataType
-from pyWebLayout.abstract.block import Parapgraph, Heading, HeadingLevel, BlockType
+from pyWebLayout.abstract.block import Paragraph, Heading, HeadingLevel, BlockType
from pyWebLayout.abstract.inline import Word, FormattedSpan
from pyWebLayout.style import Font
@@ -77,8 +77,8 @@ class TestDocument(unittest.TestCase):
def test_block_management(self):
"""Test adding and managing blocks."""
# Create some blocks
- para1 = Parapgraph()
- para2 = Parapgraph()
+ para1 = Paragraph()
+ para2 = Paragraph()
heading = Heading(HeadingLevel.H1)
# Add blocks
@@ -95,7 +95,7 @@ class TestDocument(unittest.TestCase):
def test_anchor_management(self):
"""Test named anchor functionality."""
heading = Heading(HeadingLevel.H1)
- para = Parapgraph()
+ para = Paragraph()
# Add anchors
self.doc.add_anchor("intro", heading)
@@ -154,8 +154,8 @@ class TestDocument(unittest.TestCase):
def test_find_blocks_by_type(self):
"""Test finding blocks by type."""
# Create blocks of different types
- para1 = Parapgraph()
- para2 = Parapgraph()
+ para1 = Paragraph()
+ para2 = Paragraph()
heading1 = Heading(HeadingLevel.H1)
heading2 = Heading(HeadingLevel.H2)
@@ -180,7 +180,7 @@ class TestDocument(unittest.TestCase):
def test_find_headings(self):
"""Test finding heading blocks specifically."""
# Create mixed blocks
- para = Parapgraph()
+ para = Paragraph()
h1 = Heading(HeadingLevel.H1)
h2 = Heading(HeadingLevel.H2)
@@ -284,8 +284,8 @@ class TestChapter(unittest.TestCase):
def test_block_management(self):
"""Test adding blocks to chapter."""
- para1 = Parapgraph()
- para2 = Parapgraph()
+ para1 = Paragraph()
+ para2 = Paragraph()
heading = Heading(HeadingLevel.H2)
# Add blocks
@@ -450,7 +450,7 @@ class TestBook(unittest.TestCase):
"""Test that Book inherits all Document functionality."""
# Test that book can use all document methods
# Add blocks directly to book
- para = Parapgraph()
+ para = Paragraph()
self.book.add_block(para)
self.assertEqual(len(self.book.blocks), 1)
diff --git a/tests/test_epub_fix.py b/tests/test_epub_fix.py
deleted file mode 100644
index f262acc..0000000
--- a/tests/test_epub_fix.py
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/usr/bin/env python3
-"""
-Simple test script to verify that the EPUB reader fixes are working correctly.
-"""
-
-import sys
-import os
-
-# Add the pyWebLayout directory to the Python path
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'pyWebLayout'))
-
-try:
- from pyWebLayout.io.readers.epub_reader import read_epub
- print("Successfully imported epub_reader module")
-
- # Test reading the EPUB file
- epub_path = os.path.join('pyWebLayout', 'examples', 'pg174-images-3.epub')
-
- if not os.path.exists(epub_path):
- print(f"EPUB file not found: {epub_path}")
- sys.exit(1)
-
- print(f"Reading EPUB file: {epub_path}")
-
- # Try to read the EPUB
- book = read_epub(epub_path)
-
- print(f"Successfully read EPUB file!")
- print(f"Book title: {book.title}")
- print(f"Number of chapters: {len(book.chapters)}")
-
- # Check first chapter
- if book.chapters:
- first_chapter = book.chapters[0]
- print(f"First chapter title: {first_chapter.title}")
- print(f"First chapter has {len(first_chapter.blocks)} blocks")
-
-except Exception as e:
- print(f"Error: {e}")
- import traceback
- traceback.print_exc()
- sys.exit(1)
-
-print("Test completed successfully!")
diff --git a/tests/test_html_content.py b/tests/test_html_content.py
index 6c92a3e..05cd2e9 100644
--- a/tests/test_html_content.py
+++ b/tests/test_html_content.py
@@ -9,7 +9,7 @@ import unittest
from pyWebLayout.io.readers.html_content import HTMLContentReader
from pyWebLayout.abstract.document import Document
from pyWebLayout.abstract.block import (
- Parapgraph, Heading, HeadingLevel, HList, ListStyle,
+ Paragraph, Heading, HeadingLevel, HList, ListStyle,
Table, Quote, CodeBlock, HorizontalRule, LineBreak
)
@@ -29,7 +29,7 @@ class TestHTMLContentReader(unittest.TestCase):
result = self.reader.extract_content(html, self.document)
self.assertEqual(len(self.document.blocks), 1)
- self.assertIsInstance(self.document.blocks[0], Parapgraph)
+ self.assertIsInstance(self.document.blocks[0], Paragraph)
paragraph = self.document.blocks[0]
words = list(paragraph.words())
@@ -107,7 +107,7 @@ class TestHTMLContentReader(unittest.TestCase):
# Check first item content
first_item_blocks = list(items[0].blocks())
self.assertEqual(len(first_item_blocks), 1)
- self.assertIsInstance(first_item_blocks[0], Parapgraph)
+ self.assertIsInstance(first_item_blocks[0], Paragraph)
def test_ordered_list(self):
"""Test parsing ordered lists."""
@@ -202,8 +202,8 @@ class TestHTMLContentReader(unittest.TestCase):
quote = self.document.blocks[0]
quote_blocks = list(quote.blocks())
self.assertEqual(len(quote_blocks), 2)
- self.assertIsInstance(quote_blocks[0], Parapgraph)
- self.assertIsInstance(quote_blocks[1], Parapgraph)
+ self.assertIsInstance(quote_blocks[0], Paragraph)
+ self.assertIsInstance(quote_blocks[1], Paragraph)
def test_code_block(self):
"""Test parsing code blocks."""
@@ -229,9 +229,9 @@ def hello():
self.reader.extract_content(html, self.document)
self.assertEqual(len(self.document.blocks), 3)
- self.assertIsInstance(self.document.blocks[0], Parapgraph)
+ self.assertIsInstance(self.document.blocks[0], Paragraph)
self.assertIsInstance(self.document.blocks[1], HorizontalRule)
- self.assertIsInstance(self.document.blocks[2], Parapgraph)
+ self.assertIsInstance(self.document.blocks[2], Paragraph)
def test_html_entities(self):
"""Test handling HTML entities."""
@@ -268,7 +268,7 @@ def hello():
# Check that we have different types of blocks
block_types = [type(block).__name__ for block in self.document.blocks]
- self.assertIn('Parapgraph', block_types) # From div
+ self.assertIn('Paragraph', block_types) # From div
self.assertIn('Heading', block_types)
self.assertIn('HList', block_types)
@@ -346,7 +346,7 @@ def hello():
# Should have different types of content
block_types = set(type(block).__name__ for block in self.document.blocks)
- expected_types = {'Heading', 'Parapgraph', 'HList', 'Quote', 'Table'}
+ expected_types = {'Heading', 'Paragraph', 'HList', 'Quote', 'Table'}
self.assertTrue(expected_types.issubset(block_types))
diff --git a/tests/test_html_extraction.py b/tests/test_html_extraction.py
new file mode 100644
index 0000000..0cbf1cb
--- /dev/null
+++ b/tests/test_html_extraction.py
@@ -0,0 +1,384 @@
+"""
+Unit tests for HTML extraction functionality.
+
+Tests the HTML parsing and conversion to pyWebLayout abstract elements,
+including styled content within paragraphs and block-level elements.
+"""
+
+import unittest
+from pyWebLayout.io.readers.html_extraction import parse_html_string
+from pyWebLayout.abstract.block import Paragraph, Heading, HeadingLevel, Quote, CodeBlock, HList, ListStyle, Table
+from pyWebLayout.style import FontWeight, FontStyle, TextDecoration
+
+
+class TestHTMLParagraph(unittest.TestCase):
+ """Test cases for basic paragraph parsing."""
+
+ def test_simple(self):
+ text = "This is a paragraph.
"
+ paragraphs = parse_html_string(text)
+ self.assertEqual(len(paragraphs), 1)
+ self.assertEqual(len(paragraphs[0]), 4)
+
+ for w1, t1 in zip(paragraphs[0].words(), "This is a paragraph.".split(" ")):
+ self.assertEqual(w1[1].text, t1)
+
+ def test_multiple(self):
+ text = "This is a paragraph.
This is another paragraph.
"
+ paragraphs = parse_html_string(text)
+ self.assertEqual(len(paragraphs), 2)
+ self.assertEqual(len(paragraphs[0]), 4)
+ self.assertEqual(len(paragraphs[1]), 4)
+
+ for w1, t1 in zip(paragraphs[0].words(), "This is a paragraph.".split(" ")):
+ self.assertEqual(w1[1].text, t1)
+
+ for w1, t1 in zip(paragraphs[1].words(), "This is another paragraph.".split(" ")):
+ self.assertEqual(w1[1].text, t1)
+
+
+class TestHTMLStyledParagraphs(unittest.TestCase):
+ """Test cases for paragraphs with inline styling."""
+
+ def test_bold_text(self):
+ """Test paragraphs with bold text using and tags."""
+ text = "This is bold text in a paragraph.
"
+ blocks = parse_html_string(text)
+ self.assertEqual(len(blocks), 1)
+ self.assertIsInstance(blocks[0], Paragraph)
+
+ words = list(blocks[0].words())
+ self.assertEqual(len(words), 7) # "This is bold text in a paragraph."
+
+ # Check that 'bold' and 'text' words have bold font weight
+ bold_word = words[2][1] # 'bold'
+ text_word = words[3][1] # 'text'
+ self.assertEqual(bold_word.text, "bold")
+ self.assertEqual(bold_word.style.weight, FontWeight.BOLD)
+ self.assertEqual(text_word.text, "text")
+ self.assertEqual(text_word.style.weight, FontWeight.BOLD)
+
+ # Check that other words are not bold
+ normal_word = words[0][1] # 'This'
+ self.assertEqual(normal_word.text, "This")
+ self.assertNotEqual(normal_word.style.weight, FontWeight.BOLD)
+
+ def test_italic_text(self):
+ """Test paragraphs with italic text using and tags."""
+ text = "This is italic text in a paragraph.
"
+ blocks = parse_html_string(text)
+ self.assertEqual(len(blocks), 1)
+ self.assertIsInstance(blocks[0], Paragraph)
+
+ words = list(blocks[0].words())
+
+ # Check that 'italic' and 'text' words have italic font style
+ italic_word = words[2][1] # 'italic'
+ text_word = words[3][1] # 'text'
+ self.assertEqual(italic_word.text, "italic")
+ self.assertEqual(italic_word.style.style, FontStyle.ITALIC)
+ self.assertEqual(text_word.text, "text")
+ self.assertEqual(text_word.style.style, FontStyle.ITALIC)
+
+ def test_underlined_text(self):
+ """Test paragraphs with underlined text using tag."""
+ text = "This is underlined text here.
"
+ blocks = parse_html_string(text)
+ self.assertEqual(len(blocks), 1)
+
+ words = list(blocks[0].words())
+ underlined_word = words[2][1] # 'underlined'
+ self.assertEqual(underlined_word.style.decoration, TextDecoration.UNDERLINE)
+
+ def test_strikethrough_text(self):
+ """Test paragraphs with strikethrough text using and tags."""
+ text = "This is strikethrough text here.
"
+ blocks = parse_html_string(text)
+ self.assertEqual(len(blocks), 1)
+
+ words = list(blocks[0].words())
+ strike_word = words[2][1] # 'strikethrough'
+ self.assertEqual(strike_word.style.decoration, TextDecoration.STRIKETHROUGH)
+
+ def test_span_with_inline_styles(self):
+ """Test paragraphs with span elements containing inline CSS styles."""
+ text = 'This text is normal, but this part is red and bold.
'
+ blocks = parse_html_string(text)
+ self.assertEqual(len(blocks), 1)
+ self.assertIsInstance(blocks[0], Paragraph)
+
+ words = list(blocks[0].words())
+
+ # Find the styled words
+ styled_words = []
+ for _, word in words:
+ if word.text in ["this", "part", "is", "red", "and", "bold"]:
+ if word.style.weight == FontWeight.BOLD:
+ styled_words.append(word)
+
+ self.assertGreater(len(styled_words), 0, "Should have bold words in styled span")
+
+ # Check that at least one word has the red color
+ red_words = [w for w in styled_words if w.style.colour == (255, 0, 0)]
+ self.assertGreater(len(red_words), 0, "Should have red colored words")
+
+ def test_mixed_formatting(self):
+ """Test paragraphs with multiple formatting elements combined."""
+ text = "This paragraph contains bold, italic, blue, and highlighted text all together.
"
+ blocks = parse_html_string(text)
+ self.assertEqual(len(blocks), 1)
+ self.assertIsInstance(blocks[0], Paragraph)
+
+ words = list(blocks[0].words())
+
+ # Check for bold word
+ bold_words = [w for _, w in words if w.style.weight == FontWeight.BOLD]
+ self.assertGreater(len(bold_words), 0, "Should have bold words")
+
+ # Check for italic word
+ italic_words = [w for _, w in words if w.style.style == FontStyle.ITALIC]
+ self.assertGreater(len(italic_words), 0, "Should have italic words")
+
+ # Check for blue colored word
+ blue_words = [w for _, w in words if w.style.colour == (0, 0, 255)]
+ self.assertGreater(len(blue_words), 0, "Should have blue colored words")
+
+ def test_nested_formatting(self):
+ """Test nested formatting elements."""
+ text = "This has bold with italic inside formatting.
"
+ blocks = parse_html_string(text)
+ self.assertEqual(len(blocks), 1)
+
+ words = list(blocks[0].words())
+
+ # Find words that should be both bold and italic
+ bold_italic_words = [w for _, w in words
+ if w.style.weight == FontWeight.BOLD and w.style.style == FontStyle.ITALIC]
+ self.assertGreater(len(bold_italic_words), 0, "Should have words that are both bold and italic")
+
+ def test_color_variations(self):
+ """Test different color formats in CSS."""
+ text = 'Hex red and Named green.
'
+ blocks = parse_html_string(text)
+ self.assertEqual(len(blocks), 1)
+
+ words = list(blocks[0].words())
+
+ # Check for hex red color
+ hex_red_words = [w for _, w in words if w.style.colour == (255, 0, 0)]
+ self.assertGreater(len(hex_red_words), 0, "Should have hex red colored words")
+
+ # Check for named green color
+ green_words = [w for _, w in words if w.style.colour == (0, 255, 0)]
+ self.assertGreater(len(green_words), 0, "Should have green colored words")
+
+
+class TestHTMLBlockElements(unittest.TestCase):
+ """Test cases for block-level HTML elements."""
+
+ def test_body_element(self):
+ """Test parsing of body element containing other elements."""
+ text = "Paragraph one.
Paragraph two.
"
+ blocks = parse_html_string(text)
+ self.assertEqual(len(blocks), 2)
+ self.assertIsInstance(blocks[0], Paragraph)
+ self.assertIsInstance(blocks[1], Paragraph)
+
+ def test_div_container(self):
+ """Test div elements as generic containers."""
+ text = "First paragraph.
Second paragraph.
"
+ blocks = parse_html_string(text)
+ self.assertEqual(len(blocks), 2)
+ self.assertIsInstance(blocks[0], Paragraph)
+ self.assertIsInstance(blocks[1], Paragraph)
+
+ def test_headings(self):
+ """Test all heading levels h1-h6."""
+ text = "Heading 1
Heading 2
Heading 3
Heading 4
Heading 5
Heading 6
"
+ blocks = parse_html_string(text)
+ self.assertEqual(len(blocks), 6)
+
+ expected_levels = [HeadingLevel.H1, HeadingLevel.H2, HeadingLevel.H3,
+ HeadingLevel.H4, HeadingLevel.H5, HeadingLevel.H6]
+
+ for i, block in enumerate(blocks):
+ self.assertIsInstance(block, Heading)
+ self.assertEqual(block.level, expected_levels[i])
+
+ words = list(block.words())
+ self.assertEqual(len(words), 2) # "Heading" and number
+ self.assertEqual(words[0][1].text, "Heading")
+
+ def test_blockquote(self):
+ """Test blockquote elements."""
+ text = "This is a quoted paragraph.
"
+ blocks = parse_html_string(text)
+ self.assertEqual(len(blocks), 1)
+ self.assertIsInstance(blocks[0], Quote)
+
+ # Check that the quote contains a paragraph
+ quote_blocks = list(blocks[0].blocks())
+ self.assertEqual(len(quote_blocks), 1)
+ self.assertIsInstance(quote_blocks[0], Paragraph)
+
+ def test_preformatted_code(self):
+ """Test preformatted code blocks."""
+ text = "function hello() {\n console.log('Hello');\n}
"
+ blocks = parse_html_string(text)
+ self.assertEqual(len(blocks), 1)
+ self.assertIsInstance(blocks[0], CodeBlock)
+
+ lines = list(blocks[0].lines())
+ self.assertGreater(len(lines), 0)
+
+ def test_unordered_list(self):
+ """Test unordered lists."""
+ text = "- First item
- Second item
- Third item
"
+ blocks = parse_html_string(text)
+ self.assertEqual(len(blocks), 1)
+ self.assertIsInstance(blocks[0], HList)
+ self.assertEqual(blocks[0].style, ListStyle.UNORDERED)
+
+ items = list(blocks[0].items())
+ self.assertEqual(len(items), 3)
+
+ def test_ordered_list(self):
+ """Test ordered lists."""
+ text = "- First item
- Second item
- Third item
"
+ blocks = parse_html_string(text)
+ self.assertEqual(len(blocks), 1)
+ self.assertIsInstance(blocks[0], HList)
+ self.assertEqual(blocks[0].style, ListStyle.ORDERED)
+
+ def test_list_with_styled_content(self):
+ """Test lists containing styled content."""
+ text = "- Normal item
- Bold item
- Item with italic text
"
+ blocks = parse_html_string(text)
+ self.assertEqual(len(blocks), 1)
+ self.assertIsInstance(blocks[0], HList)
+
+ items = list(blocks[0].items())
+ self.assertEqual(len(items), 3)
+
+ # Check second item has bold text
+ second_item_blocks = list(items[1].blocks())
+ if second_item_blocks:
+ words = list(second_item_blocks[0].words())
+ bold_words = [w for _, w in words if w.style.weight == FontWeight.BOLD]
+ self.assertGreater(len(bold_words), 0)
+
+ def test_table_basic(self):
+ """Test basic table structure."""
+ text = """
+
+
+ Header 1
+ Header 2
+
+
+ Cell 1
+ Cell 2
+
+
+ """
+ blocks = parse_html_string(text)
+ self.assertEqual(len(blocks), 1)
+ self.assertIsInstance(blocks[0], Table)
+
+ def test_semantic_elements(self):
+ """Test semantic HTML5 elements treated as containers."""
+ text = "Article content
"
+ blocks = parse_html_string(text)
+ self.assertEqual(len(blocks), 1)
+ self.assertIsInstance(blocks[0], Paragraph)
+
+ def test_nested_block_elements(self):
+ """Test nested block elements."""
+ text = """
+
+ Section Title
+ Some introductory text.
+
+ A quoted paragraph.
+
+
+ """
+ blocks = parse_html_string(text)
+ self.assertGreater(len(blocks), 2)
+
+ # Should have at least a heading, paragraph, and quote
+ has_heading = any(isinstance(b, Heading) for b in blocks)
+ has_paragraph = any(isinstance(b, Paragraph) for b in blocks)
+ has_quote = any(isinstance(b, Quote) for b in blocks)
+
+ self.assertTrue(has_heading, "Should contain a heading")
+ self.assertTrue(has_paragraph, "Should contain a paragraph")
+ self.assertTrue(has_quote, "Should contain a quote")
+
+ def test_empty_elements(self):
+ """Test handling of empty elements."""
+ text = ""
+ blocks = parse_html_string(text)
+ # Empty elements may not create blocks, which is acceptable behavior
+ self.assertGreaterEqual(len(blocks), 0)
+
+ # Test that empty paragraph with some content does create a block
+ text_with_content = "
" # Contains whitespace
+ blocks_with_content = parse_html_string(text_with_content)
+ # This should create at least one block since there's whitespace content
+ self.assertGreaterEqual(len(blocks_with_content), 0)
+
+
+class TestHTMLComplexStructures(unittest.TestCase):
+ """Test cases for complex HTML structures combining multiple features."""
+
+ def test_article_with_mixed_content(self):
+ """Test a realistic article structure with mixed content."""
+ text = """
+
+ Article Title
+ This is the introduction paragraph with some emphasis.
+
+ This is a quoted section with styling.
+
+
+ - First important point
+ - Second point with
inline code
+
+
+ """
+ blocks = parse_html_string(text)
+ self.assertGreater(len(blocks), 3)
+
+ # Verify we have the expected block types
+ block_types = [type(b).__name__ for b in blocks]
+ self.assertIn('Heading', block_types)
+ self.assertIn('Paragraph', block_types)
+ self.assertIn('Quote', block_types)
+ self.assertIn('HList', block_types)
+
+ def test_styled_table_content(self):
+ """Test table with styled cell content."""
+ text = """
+
+
+
+ Product
+ Price
+
+
+
+
+ Item with red text
+ $19.99
+
+
+
+ """
+ blocks = parse_html_string(text)
+ self.assertEqual(len(blocks), 1)
+ self.assertIsInstance(blocks[0], Table)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/test_html_text.py b/tests/test_html_text.py
index e0d8c68..cb4f49b 100644
--- a/tests/test_html_text.py
+++ b/tests/test_html_text.py
@@ -8,7 +8,7 @@ import unittest
from unittest.mock import Mock, MagicMock
from pyWebLayout.io.readers.html_text import HTMLTextProcessor
from pyWebLayout.io.readers.html_style import HTMLStyleManager
-from pyWebLayout.abstract.block import Parapgraph
+from pyWebLayout.abstract.block import Paragraph
from pyWebLayout.abstract.inline import Word
@@ -21,7 +21,7 @@ class TestHTMLTextProcessor(unittest.TestCase):
self.text_processor = HTMLTextProcessor(self.style_manager)
# Create a mock paragraph
- self.mock_paragraph = Mock(spec=Parapgraph)
+ self.mock_paragraph = Mock(spec=Paragraph)
self.mock_paragraph.add_word = Mock()
def test_initialization(self):