better parsing using handlers
Some checks failed
Python CI / test (push) Failing after 21s

This commit is contained in:
Duncan Tourolle 2025-06-07 14:38:11 +02:00
parent 81d85386c5
commit ba6d8ca906
15 changed files with 1189 additions and 97 deletions

View File

@ -1,6 +1,6 @@
from .block import Block, BlockType, Paragraph, Heading, HeadingLevel, Quote, CodeBlock from .block import Block, BlockType, Paragraph, Heading, HeadingLevel, Quote, CodeBlock
from .block import HList, ListItem, ListStyle, Table, TableRow, TableCell from .block import HList, ListItem, ListStyle, Table, TableRow, TableCell
from .block import HorizontalRule, LineBreak, Image #from .block import HorizontalRule, LineBreak, Image
from .inline import Word, FormattedSpan from .inline import Word, FormattedSpan
from .document import Document, MetadataType, Chapter, Book from .document import Document, MetadataType, Chapter, Book
from .functional import Link, LinkType, Button, Form, FormField, FormFieldType from .functional import Link, LinkType, Button, Form, FormField, FormFieldType

View File

@ -183,6 +183,10 @@ class Paragraph(Block):
def word_count(self) -> int: def word_count(self) -> int:
"""Get the number of words in this paragraph""" """Get the number of words in this paragraph"""
return len(self._words) return len(self._words)
def __len__(self):
return self.word_count
class HeadingLevel(Enum): class HeadingLevel(Enum):
@ -1008,3 +1012,9 @@ class Table(Block):
self._footer_rows.append(row) self._footer_rows.append(row)
else: # Default to body else: # Default to body
self._rows self._rows
class Image:
pass

View File

@ -330,3 +330,8 @@ class FormattedSpan:
self._words.append(word) self._words.append(word)
return word return word
class LineBreak:
pass

View File

@ -7,7 +7,7 @@ from PIL import Image
from .style import Font, FontStyle, FontWeight, TextDecoration from .style import Font, FontStyle, FontWeight, TextDecoration
from .abstract.document import Document, MetadataType, Book, Chapter from .abstract.document import Document, MetadataType, Book, Chapter
from .abstract.block import ( from .abstract.block import (
Block, BlockType, Parapgraph, Heading, HeadingLevel, Quote, CodeBlock, Block, BlockType, Paragraph, Heading, HeadingLevel, Quote, CodeBlock,
HList, ListStyle, ListItem, Table, TableRow, TableCell, HorizontalRule, LineBreak HList, ListStyle, ListItem, Table, TableRow, TableCell, HorizontalRule, LineBreak
) )
from .abstract.inline import Word, FormattedSpan from .abstract.inline import Word, FormattedSpan
@ -138,7 +138,7 @@ class HTMLParser(BaseHTMLParser):
elif tag == 'p': elif tag == 'p':
self._flush_text() # Flush any pending text self._flush_text() # Flush any pending text
self._current_paragraph = Parapgraph() self._current_paragraph = Paragraph()
# Add the paragraph to the current block or document # Add the paragraph to the current block or document
if self._current_block and hasattr(self._current_block, 'add_block'): if self._current_block and hasattr(self._current_block, 'add_block'):
@ -180,7 +180,7 @@ class HTMLParser(BaseHTMLParser):
self._flush_text() # Flush any pending text self._flush_text() # Flush any pending text
# For divs, we create a new paragraph as a container # For divs, we create a new paragraph as a container
div_para = Parapgraph() div_para = Paragraph()
# Add the div to the current block or document # Add the div to the current block or document
if self._current_block and hasattr(self._current_block, 'add_block'): if self._current_block and hasattr(self._current_block, 'add_block'):
@ -214,7 +214,7 @@ class HTMLParser(BaseHTMLParser):
# Pre can optionally contain a code block # Pre can optionally contain a code block
# We'll create a paragraph for now, and if we find a code tag inside, # We'll create a paragraph for now, and if we find a code tag inside,
# we'll replace it with a code block # we'll replace it with a code block
pre_para = Parapgraph() pre_para = Paragraph()
# Add the pre to the current block or document # Add the pre to the current block or document
if self._current_block and hasattr(self._current_block, 'add_block'): if self._current_block and hasattr(self._current_block, 'add_block'):
@ -229,7 +229,7 @@ class HTMLParser(BaseHTMLParser):
elif tag == 'code': elif tag == 'code':
# If we're inside a pre, replace the paragraph with a code block # If we're inside a pre, replace the paragraph with a code block
if self._block_stack and isinstance(self._block_stack[-1], Parapgraph): if self._block_stack and isinstance(self._block_stack[-1], Paragraph):
pre_para = self._block_stack.pop() pre_para = self._block_stack.pop()
# Get the language from class if specified (e.g., class="language-python") # Get the language from class if specified (e.g., class="language-python")
@ -312,7 +312,7 @@ class HTMLParser(BaseHTMLParser):
self._current_block = list_item self._current_block = list_item
# Create a paragraph for the term content # Create a paragraph for the term content
term_para = Parapgraph() term_para = Paragraph()
list_item.add_block(term_para) list_item.add_block(term_para)
self._current_paragraph = term_para self._current_paragraph = term_para
@ -325,7 +325,7 @@ class HTMLParser(BaseHTMLParser):
list_item = current_list._items[-1] list_item = current_list._items[-1]
# Create a paragraph for the description content # Create a paragraph for the description content
desc_para = Parapgraph() desc_para = Paragraph()
list_item.add_block(desc_para) list_item.add_block(desc_para)
# Update current state # Update current state
@ -340,7 +340,7 @@ class HTMLParser(BaseHTMLParser):
self._current_block = list_item self._current_block = list_item
# Create a paragraph for the description content # Create a paragraph for the description content
desc_para = Parapgraph() desc_para = Paragraph()
list_item.add_block(desc_para) list_item.add_block(desc_para)
self._current_paragraph = desc_para self._current_paragraph = desc_para
@ -424,7 +424,7 @@ class HTMLParser(BaseHTMLParser):
self._current_block = cell self._current_block = cell
# Create a paragraph for the cell content # Create a paragraph for the cell content
cell_para = Parapgraph() cell_para = Paragraph()
cell.add_block(cell_para) cell.add_block(cell_para)
self._current_paragraph = cell_para self._current_paragraph = cell_para
@ -508,6 +508,7 @@ class HTMLParser(BaseHTMLParser):
}) })
elif tag == 'br': elif tag == 'br':
# Add a line break # Add a line break
if self._current_paragraph: if self._current_paragraph:
line_break = LineBreak() line_break = LineBreak()

View File

@ -379,10 +379,10 @@ class EPUBReader:
except Exception as e: except Exception as e:
print(f"Error parsing chapter {i+1}: {str(e)}") print(f"Error parsing chapter {i+1}: {str(e)}")
# Add an error message block # Add an error message block
from pyWebLayout.abstract.block import Parapgraph from pyWebLayout.abstract.block import Paragraph
from pyWebLayout.abstract.inline import Word from pyWebLayout.abstract.inline import Word
from pyWebLayout.style import Font from pyWebLayout.style import Font
error_para = Parapgraph() error_para = Paragraph()
# Create a default font style for the error message # Create a default font style for the error message
default_font = Font() default_font = Font()
error_para.add_word(Word(f"Error loading chapter: {str(e)}", default_font)) error_para.add_word(Word(f"Error loading chapter: {str(e)}", default_font))

View File

@ -9,9 +9,9 @@ from typing import Dict, List, Optional, Any
import urllib.parse import urllib.parse
from pyWebLayout.abstract.document import Document from pyWebLayout.abstract.document import Document
from pyWebLayout.abstract.block import ( from pyWebLayout.abstract.block import (
Block, Parapgraph, Heading, HeadingLevel, Quote, CodeBlock, Block, Paragraph, Heading, HeadingLevel, Quote, CodeBlock,
HList, ListStyle, ListItem, Table, TableRow, TableCell, HList, ListStyle, ListItem, Table, TableRow, TableCell,
HorizontalRule, LineBreak, Image #HorizontalRule, LineBreak, Image
) )
from pyWebLayout.abstract.functional import Link, LinkType from pyWebLayout.abstract.functional import Link, LinkType
from pyWebLayout.io.readers.html_style import HTMLStyleManager from pyWebLayout.io.readers.html_style import HTMLStyleManager
@ -26,7 +26,7 @@ class BlockElementHandler:
self.text_processor = text_processor self.text_processor = text_processor
self.block_stack: List[Block] = [] self.block_stack: List[Block] = []
self.current_block: Optional[Block] = None self.current_block: Optional[Block] = None
self.current_paragraph: Optional[Parapgraph] = None self.current_paragraph: Optional[Paragraph] = None
def reset(self): def reset(self):
"""Reset the handler state.""" """Reset the handler state."""
@ -44,7 +44,7 @@ class BlockElementHandler:
def handle_paragraph_start(self, document: Document): def handle_paragraph_start(self, document: Document):
"""Handle the start of a paragraph element.""" """Handle the start of a paragraph element."""
self.text_processor.flush_text() self.text_processor.flush_text()
paragraph = Parapgraph() paragraph = Paragraph()
self.add_block_to_document_or_parent(paragraph, document) self.add_block_to_document_or_parent(paragraph, document)
self.block_stack.append(paragraph) self.block_stack.append(paragraph)
@ -71,7 +71,7 @@ class BlockElementHandler:
def handle_div_start(self, document: Document): def handle_div_start(self, document: Document):
"""Handle the start of a div element.""" """Handle the start of a div element."""
self.text_processor.flush_text() self.text_processor.flush_text()
div_para = Parapgraph() div_para = Paragraph()
self.add_block_to_document_or_parent(div_para, document) self.add_block_to_document_or_parent(div_para, document)
self.block_stack.append(div_para) self.block_stack.append(div_para)
@ -93,7 +93,7 @@ class BlockElementHandler:
def handle_pre_start(self, document: Document): def handle_pre_start(self, document: Document):
"""Handle the start of a pre element.""" """Handle the start of a pre element."""
self.text_processor.flush_text() self.text_processor.flush_text()
pre_para = Parapgraph() pre_para = Paragraph()
self.add_block_to_document_or_parent(pre_para, document) self.add_block_to_document_or_parent(pre_para, document)
self.block_stack.append(pre_para) self.block_stack.append(pre_para)
@ -104,7 +104,7 @@ class BlockElementHandler:
def handle_code_start(self, attrs: Dict[str, str], document: Document): def handle_code_start(self, attrs: Dict[str, str], document: Document):
"""Handle the start of a code element.""" """Handle the start of a code element."""
# If we're inside a pre, replace the paragraph with a code block # If we're inside a pre, replace the paragraph with a code block
if self.block_stack and isinstance(self.block_stack[-1], Parapgraph): if self.block_stack and isinstance(self.block_stack[-1], Paragraph):
pre_para = self.block_stack.pop() pre_para = self.block_stack.pop()
# Get the language from class if specified # Get the language from class if specified
@ -145,7 +145,7 @@ class BlockElementHandler:
if self.block_stack: if self.block_stack:
self.current_block = self.block_stack[-1] self.current_block = self.block_stack[-1]
# Update current paragraph based on block type # Update current paragraph based on block type
if isinstance(self.current_block, Parapgraph): if isinstance(self.current_block, Paragraph):
self.current_paragraph = self.current_block self.current_paragraph = self.current_block
else: else:
self.current_paragraph = None self.current_paragraph = None
@ -201,7 +201,7 @@ class ListElementHandler:
block_handler.current_block = list_item block_handler.current_block = list_item
# Create a paragraph for the list item content # Create a paragraph for the list item content
item_para = Parapgraph() item_para = Paragraph()
list_item.add_block(item_para) list_item.add_block(item_para)
block_handler.current_paragraph = item_para block_handler.current_paragraph = item_para
self.text_processor.set_current_paragraph(item_para) self.text_processor.set_current_paragraph(item_para)
@ -220,7 +220,7 @@ class ListElementHandler:
block_handler.block_stack.append(list_item) block_handler.block_stack.append(list_item)
block_handler.current_block = list_item block_handler.current_block = list_item
term_para = Parapgraph() term_para = Paragraph()
list_item.add_block(term_para) list_item.add_block(term_para)
block_handler.current_paragraph = term_para block_handler.current_paragraph = term_para
self.text_processor.set_current_paragraph(term_para) self.text_processor.set_current_paragraph(term_para)
@ -228,7 +228,7 @@ class ListElementHandler:
elif tag == 'dd': elif tag == 'dd':
if current_list._items: if current_list._items:
list_item = current_list._items[-1] list_item = current_list._items[-1]
desc_para = Parapgraph() desc_para = Paragraph()
list_item.add_block(desc_para) list_item.add_block(desc_para)
block_handler.current_paragraph = desc_para block_handler.current_paragraph = desc_para
self.text_processor.set_current_paragraph(desc_para) self.text_processor.set_current_paragraph(desc_para)
@ -339,7 +339,7 @@ class TableElementHandler:
block_handler.current_block = cell block_handler.current_block = cell
# Create a paragraph for the cell content # Create a paragraph for the cell content
cell_para = Parapgraph() cell_para = Paragraph()
cell.add_block(cell_para) cell.add_block(cell_para)
block_handler.current_paragraph = cell_para block_handler.current_paragraph = cell_para
self.text_processor.set_current_paragraph(cell_para) self.text_processor.set_current_paragraph(cell_para)

View File

@ -0,0 +1,736 @@
"""
HTML extraction module for converting HTML elements to pyWebLayout abstract elements.
This module provides handler functions for converting HTML elements into the abstract document structure
used by pyWebLayout, including paragraphs, headings, lists, tables, and inline formatting.
Each handler function has a robust signature that handles style hints, CSS classes, and attributes.
"""
import re
from typing import List, Dict, Any, Optional, Union, Callable, Tuple, NamedTuple

from bs4 import BeautifulSoup, Tag, NavigableString, Comment

from pyWebLayout.abstract.inline import Word, FormattedSpan
from pyWebLayout.abstract.block import (
    Block, Paragraph, Heading, HeadingLevel, Quote, CodeBlock,
    HList, ListItem, ListStyle, Table, TableRow, TableCell
)
from pyWebLayout.style import Font, FontWeight, FontStyle, TextDecoration
class StyleContext(NamedTuple):
    """
    Read-only bundle of styling state handed to every handler function.

    Because this is a NamedTuple, none of the helper methods mutate the
    instance; each one returns a new ``StyleContext`` with a single field
    swapped out.
    """
    font: Font
    background: Optional[Tuple[int, int, int, int]]
    css_classes: set
    css_styles: Dict[str, str]
    element_attributes: Dict[str, Any]
    parent_elements: List[str]  # Element names pushed so far, most recent last

    def with_font(self, font: Font) -> 'StyleContext':
        """Return a copy of this context that carries ``font``."""
        replacement = {'font': font}
        return self._replace(**replacement)

    def with_background(self, background: Optional[Tuple[int, int, int, int]]) -> 'StyleContext':
        """Return a copy of this context that carries ``background``."""
        replacement = {'background': background}
        return self._replace(**replacement)

    def with_css_classes(self, css_classes: set) -> 'StyleContext':
        """Return a copy of this context that carries ``css_classes``."""
        replacement = {'css_classes': css_classes}
        return self._replace(**replacement)

    def with_css_styles(self, css_styles: Dict[str, str]) -> 'StyleContext':
        """Return a copy of this context that carries ``css_styles``."""
        replacement = {'css_styles': css_styles}
        return self._replace(**replacement)

    def with_attributes(self, attributes: Dict[str, Any]) -> 'StyleContext':
        """Return a copy of this context whose element attributes are ``attributes``."""
        replacement = {'element_attributes': attributes}
        return self._replace(**replacement)

    def push_element(self, element_name: str) -> 'StyleContext':
        """Return a copy of this context with ``element_name`` appended to the element stack."""
        # Concatenation builds a new list, so the stack of this context is untouched.
        extended_stack = self.parent_elements + [element_name]
        return self._replace(parent_elements=extended_stack)
def create_base_context(base_font: Optional[Font] = None) -> StyleContext:
    """
    Build the root style context from which all element contexts derive.

    Args:
        base_font: Font to start from; when omitted (or falsy) a
            default-constructed ``Font()`` is used instead.

    Returns:
        StyleContext with no background, no CSS classes or styles, no
        element attributes and an empty element stack.
    """
    root_font = base_font or Font()
    initial_state = {
        'font': root_font,
        'background': None,
        'css_classes': set(),
        'css_styles': {},
        'element_attributes': {},
        'parent_elements': [],
    }
    return StyleContext(**initial_state)
def apply_element_styling(context: StyleContext, element: Tag) -> StyleContext:
    """
    Derive the style context for ``element`` from its parent's context.

    The element's attributes replace the inherited ones, its tag name is
    pushed onto the element stack, its ``class`` and ``style`` attributes are
    merged into the inherited CSS classes/styles, and finally the font and
    background are recomputed from the merged styles.

    Args:
        context: Style context inherited from the enclosing element
        element: BeautifulSoup Tag being entered

    Returns:
        New StyleContext for the element; ``context`` itself is not modified
    """
    tag = element.name.lower()
    attrs = dict(element.attrs) if element.attrs else {}

    # Merge CSS classes; the attribute may be a whitespace-separated string or a list.
    merged_classes = context.css_classes.copy()
    if 'class' in attrs:
        declared = attrs['class']
        if isinstance(declared, str):
            declared = declared.split()
        merged_classes.update(declared)

    # Merge inline style declarations over the inherited ones.
    merged_styles = context.css_styles.copy()
    if 'style' in attrs:
        merged_styles.update(parse_inline_styles(attrs['style']))

    styled = (
        context.with_attributes(attrs)
        .push_element(tag)
        .with_css_classes(merged_classes)
        .with_css_styles(merged_styles)
    )

    # Font first, then background, both driven by the merged style dictionary.
    styled = styled.with_font(apply_element_font_styles(styled.font, tag, merged_styles))
    return styled.with_background(apply_background_styles(styled.background, merged_styles))
def parse_inline_styles(style_text: str) -> Dict[str, str]:
    """
    Turn the text of a ``style`` attribute into a property/value mapping.

    Declarations are separated by ``;`` and split on their first ``:``.
    Property names are stripped and lower-cased, values are only stripped.
    Fragments without a colon are ignored, and a later declaration of the
    same property overwrites an earlier one.

    Args:
        style_text: CSS style text (e.g., "color: red; font-weight: bold;")

    Returns:
        Dictionary of CSS property-value pairs
    """
    parsed: Dict[str, str] = {}
    for fragment in style_text.split(';'):
        name, colon, value = fragment.partition(':')
        if not colon:
            # No colon at all: not a declaration.
            continue
        parsed[name.strip().lower()] = value.strip()
    return parsed
def apply_element_font_styles(font: Font, tag_name: str, css_styles: Dict[str, str]) -> Font:
    """
    Compute the font for an element from its tag defaults and CSS styles.

    Tag defaults (bold for ``b``/``strong``, sizes for ``h1``-``h6``, ...) are
    applied on top of the incoming font first; recognised CSS properties
    (``font-size``, ``font-weight``, ``font-style``, ``text-decoration``,
    ``color``) are applied afterwards and therefore win. Unrecognised or
    unparsable CSS values leave the corresponding property untouched.

    Args:
        font: Font inherited from the enclosing context
        tag_name: HTML tag name
        css_styles: CSS property/value dictionary

    Returns:
        A newly constructed Font; ``font`` itself is not modified
    """
    # Per-tag default overrides.
    bold = {'weight': FontWeight.BOLD}
    italic = {'style': FontStyle.ITALIC}
    struck = {'decoration': TextDecoration.STRIKETHROUGH}
    tag_defaults = {
        'b': bold,
        'strong': bold,
        'i': italic,
        'em': italic,
        'u': {'decoration': TextDecoration.UNDERLINE},
        's': struck,
        'del': struck,
    }
    for level, size in enumerate((24, 20, 18, 16, 14, 12), start=1):
        tag_defaults[f'h{level}'] = {'size': size, 'weight': FontWeight.BOLD}

    # Inherited values, overridden by whatever the tag defines.
    overrides = tag_defaults.get(tag_name, {})
    font_size = overrides.get('size', font.font_size)
    weight = overrides.get('weight', font.weight)
    style = overrides.get('style', font.style)
    decoration = overrides.get('decoration', font.decoration)
    colour = font.colour
    background = font.background
    language = font.language

    # CSS overrides follow; they take precedence over the tag defaults.
    if 'font-size' in css_styles:
        raw_size = css_styles['font-size'].lower()
        # px and pt are treated identically: the numeric part is truncated to an int.
        if raw_size.endswith(('px', 'pt')):
            try:
                font_size = int(float(raw_size[:-2]))
            except ValueError:
                pass

    if 'font-weight' in css_styles:
        raw_weight = css_styles['font-weight'].lower()
        if raw_weight in ('bold', '700', '800', '900'):
            weight = FontWeight.BOLD
        elif raw_weight in ('normal', '400'):
            weight = FontWeight.NORMAL

    if 'font-style' in css_styles:
        known_styles = {'italic': FontStyle.ITALIC, 'normal': FontStyle.NORMAL}
        style = known_styles.get(css_styles['font-style'].lower(), style)

    if 'text-decoration' in css_styles:
        raw_decoration = css_styles['text-decoration'].lower()
        # Substring match, first hit wins, in this priority order.
        for keyword, value in (
            ('underline', TextDecoration.UNDERLINE),
            ('line-through', TextDecoration.STRIKETHROUGH),
            ('none', TextDecoration.NONE),
        ):
            if keyword in raw_decoration:
                decoration = value
                break

    if 'color' in css_styles:
        raw_colour = css_styles['color'].lower()
        named_colours = {
            'black': (0, 0, 0),
            'white': (255, 255, 255),
            'red': (255, 0, 0),
            'green': (0, 255, 0),
            'blue': (0, 0, 255),
        }
        if raw_colour in named_colours:
            colour = named_colours[raw_colour]
        elif raw_colour.startswith('#') and len(raw_colour) == 7:
            # Only the six-digit #rrggbb form is understood.
            try:
                colour = tuple(int(raw_colour[start:start + 2], 16) for start in (1, 3, 5))
            except ValueError:
                pass

    # NOTE(review): the keyword below is spelled 'langauge' — confirm it matches
    # the parameter name in Font's constructor before renaming it.
    return Font(
        font_path=font._font_path,
        font_size=font_size,
        colour=colour,
        weight=weight,
        style=style,
        decoration=decoration,
        background=background,
        langauge=language
    )
def apply_background_styles(current_background: Optional[Tuple[int, int, int, int]],
                            css_styles: Dict[str, str]) -> Optional[Tuple[int, int, int, int]]:
    """
    Resolve the background colour for an element from its CSS styles.

    Only ``background-color: transparent`` (case-insensitive) is recognised,
    and it clears the background. Any other value, or no ``background-color``
    at all, leaves the inherited background in place.

    Args:
        current_background: Inherited background color (RGBA) or None
        css_styles: CSS styles dictionary

    Returns:
        None when the background is declared transparent, otherwise
        ``current_background`` unchanged
    """
    if 'background-color' not in css_styles:
        return current_background

    declared = css_styles['background-color'].lower()
    # Colour values other than 'transparent' are not parsed yet.
    return None if declared == 'transparent' else current_background
def extract_text_content(element: Tag, context: StyleContext) -> List[Word]:
    """
    Extract text content from an element, handling inline formatting.

    Text nodes are split on whitespace into ``Word`` objects that carry the
    font and background of ``context``. Inline child elements are styled and
    recursed into. Any other child element is run through ``process_element``
    and the words of every ``Paragraph`` it yields are collected, so text in
    misplaced block elements is not lost. HTML comments are skipped.

    Args:
        element: BeautifulSoup Tag object
        context: Current style context (already styled for ``element``)

    Returns:
        List of Word objects in document order
    """
    inline_tags = {
        'span', 'a', 'strong', 'b', 'em', 'i', 'u', 's', 'del', 'ins', 'mark',
        'small', 'sub', 'sup', 'code', 'q', 'cite', 'abbr', 'time',
    }
    words: List[Word] = []

    for child in element.children:
        if isinstance(child, Comment):
            # Comment is a NavigableString subclass; without this guard the
            # text of <!-- ... --> would be emitted as visible words.
            continue

        if isinstance(child, NavigableString):
            # str.split() with no argument trims and never yields empty tokens.
            words.extend(
                Word(token, context.font, context.background)
                for token in str(child).split()
            )
            continue

        if not isinstance(child, Tag):
            continue

        child_context = apply_element_styling(context, child)
        if child.name.lower() in inline_tags:
            words.extend(extract_text_content(child, child_context))
            continue

        # Block element - shouldn't happen in well-formed HTML but handle gracefully
        result = process_element(child, child_context)
        blocks = result if isinstance(result, list) else [result]
        for block in blocks:
            if isinstance(block, Paragraph):
                words.extend(word for _, word in block.words())

    return words
def process_element(element: Tag, context: StyleContext) -> Union[Block, List[Block], None]:
    """
    Dispatch one HTML element to the handler registered for its tag.

    The tag name is lower-cased and looked up in ``HANDLERS``; tags without
    an entry fall back to ``generic_handler``.

    Args:
        element: BeautifulSoup Tag object
        context: Style context to hand to the handler

    Returns:
        Whatever the selected handler returns: a Block, a list of Blocks,
        or None when the element produces no output
    """
    tag = element.name.lower()
    chosen = HANDLERS[tag] if tag in HANDLERS else generic_handler
    return chosen(element, context)
# Handler function signatures:
# All handlers receive (element: Tag, context: StyleContext) -> Union[Block, List[Block], None]
def paragraph_handler(element: Tag, context: StyleContext) -> Paragraph:
    """Convert a <p> element into a Paragraph filled with its extracted words."""
    result = Paragraph(context.font)
    for item in extract_text_content(element, context):
        result.add_word(item)
    return result
def div_handler(element: Tag, context: StyleContext) -> List[Block]:
    """
    Convert a <div> (or any generic container) into a flat list of blocks.

    Only child elements are processed; bare text nodes directly inside the
    container are skipped. Falsy handler results (``None``, empty lists, and
    any block that evaluates as false) are dropped, and list results are
    flattened into the returned list.
    """
    collected: List[Block] = []
    for node in element.children:
        if not isinstance(node, Tag):
            continue
        outcome = process_element(node, apply_element_styling(context, node))
        if not outcome:
            continue
        if isinstance(outcome, list):
            collected.extend(outcome)
        else:
            collected.append(outcome)
    return collected
def heading_handler(element: Tag, context: StyleContext) -> Heading:
    """
    Convert an <h1>-<h6> element into a Heading filled with its extracted words.

    The heading level follows the tag name; any other tag name falls back to
    ``HeadingLevel.H1``.
    """
    levels_by_tag = {
        f'h{number}': getattr(HeadingLevel, f'H{number}')
        for number in range(1, 7)
    }
    tag = element.name.lower()
    chosen_level = levels_by_tag[tag] if tag in levels_by_tag else HeadingLevel.H1

    result = Heading(chosen_level, context.font)
    for item in extract_text_content(element, context):
        result.add_word(item)
    return result
def blockquote_handler(element: Tag, context: StyleContext) -> Quote:
    """
    Convert a <blockquote> element into a Quote holding its child blocks.

    Only child elements are processed; bare text nodes directly inside the
    blockquote are skipped. Falsy handler results are dropped and list
    results are added block by block.
    """
    result = Quote(context.font)
    for node in element.children:
        if not isinstance(node, Tag):
            continue
        outcome = process_element(node, apply_element_styling(context, node))
        if not outcome:
            continue
        # Normalise to a list so both result shapes share one code path.
        produced = outcome if isinstance(outcome, list) else [outcome]
        for block in produced:
            result.add_block(block)
    return result
def _detect_code_language(element: Tag, context: StyleContext) -> str:
    """
    Work out the language of a <pre> block.

    An explicit ``data-language`` attribute in ``context.element_attributes``
    wins. Otherwise the ``class`` of the <pre> itself and of a nested <code>
    element is searched for a ``language-xxx`` class
    (e.g. ``class="language-python"``). Returns '' when nothing is found.
    """
    explicit = context.element_attributes.get('data-language', '')
    if explicit:
        return explicit

    candidates = [element]
    nested_code = element.find('code')
    if nested_code is not None:
        candidates.append(nested_code)

    prefix = 'language-'
    for node in candidates:
        classes = node.get('class') or []
        if isinstance(classes, str):
            # Some parsers keep the attribute as one whitespace-separated string.
            classes = classes.split()
        for css_class in classes:
            if css_class.startswith(prefix) and len(css_class) > len(prefix):
                return css_class[len(prefix):]
    return ''


def preformatted_handler(element: Tag, context: StyleContext) -> CodeBlock:
    """
    Handle <pre> elements.

    The text of the element is taken verbatim and added to a CodeBlock one
    line at a time, split on the newlines that are actually present in the
    source.

    Args:
        element: The <pre> Tag
        context: Style context already styled for ``element``

    Returns:
        CodeBlock containing the preformatted lines
    """
    code_block = CodeBlock(_detect_code_language(element, context))

    # Do not pass a separator to get_text(): it is inserted between every pair
    # of text nodes, which would break lines wherever the <pre> contains
    # inline markup such as syntax-highlighting <span>s.
    text = element.get_text()
    for line in text.split('\n'):
        code_block.add_line(line)
    return code_block
def code_handler(element: Tag, context: StyleContext) -> Union[CodeBlock, None]:
    """
    Handle <code> elements; always produces no block of its own.

    A <code> nested in <pre> is covered by ``preformatted_handler``, and
    inline <code> is picked up during text extraction, so in both situations
    there is nothing to return here.
    """
    return None
def unordered_list_handler(element: Tag, context: StyleContext) -> HList:
    """
    Convert a <ul> element into an unordered HList.

    Only direct <li> child elements are processed; each one is styled,
    dispatched through ``process_element`` and added when the result is truthy.
    """
    result = HList(ListStyle.UNORDERED, context.font)
    li_nodes = (
        node for node in element.children
        if isinstance(node, Tag) and node.name.lower() == 'li'
    )
    for li_node in li_nodes:
        entry = process_element(li_node, apply_element_styling(context, li_node))
        if entry:
            result.add_item(entry)
    return result
def ordered_list_handler(element: Tag, context: StyleContext) -> HList:
    """
    Convert an <ol> element into an ordered HList.

    Only direct <li> child elements are processed; each one is styled,
    dispatched through ``process_element`` and added when the result is truthy.
    """
    result = HList(ListStyle.ORDERED, context.font)
    li_nodes = (
        node for node in element.children
        if isinstance(node, Tag) and node.name.lower() == 'li'
    )
    for li_node in li_nodes:
        entry = process_element(li_node, apply_element_styling(context, li_node))
        if entry:
            result.add_item(entry)
    return result
def list_item_handler(element: Tag, context: StyleContext) -> ListItem:
    """
    Handle <li> elements.

    Child elements are styled and dispatched through ``process_element``;
    their truthy results are added to the item (lists are added block by
    block). Each non-empty text node directly inside the <li> becomes its own
    Paragraph. HTML comments are skipped.

    Args:
        element: The <li> Tag
        context: Style context already styled for ``element``

    Returns:
        ListItem holding the converted child blocks
    """
    list_item = ListItem(None, context.font)

    for child in element.children:
        if isinstance(child, Tag):
            child_context = apply_element_styling(context, child)
            result = process_element(child, child_context)
            if not result:
                continue
            blocks = result if isinstance(result, list) else [result]
            for block in blocks:
                list_item.add_block(block)
        elif isinstance(child, NavigableString) and not isinstance(child, Comment):
            # Comment subclasses NavigableString; excluding it keeps the text
            # of <!-- ... --> out of the rendered list item.
            tokens = str(child).split()
            if tokens:
                # Direct text in list item - create paragraph
                paragraph = Paragraph(context.font)
                for token in tokens:
                    # Pass the background too, as extract_text_content does.
                    paragraph.add_word(Word(token, context.font, context.background))
                list_item.add_block(paragraph)

    return list_item
def table_handler(element: Tag, context: StyleContext) -> Table:
    """
    Handle <table> elements.

    The caption and the rows are taken only from the table's own structure:
    a <caption> or <tr> directly under the table, and <tr> elements directly
    under its <thead>/<tbody>/<tfoot>. Tables nested inside cells therefore
    keep their own caption and rows instead of leaking them into this table.

    Args:
        element: The <table> Tag
        context: Style context already styled for ``element``

    Returns:
        Table with its rows assigned to the header, body or footer section
    """
    # recursive=False: do not pick up the caption of a nested table.
    caption_elem = element.find('caption', recursive=False)
    caption = caption_elem.get_text(strip=True) if caption_elem else None

    table = Table(caption, context.font)
    section_by_tag = {'thead': 'header', 'tbody': 'body', 'tfoot': 'footer'}

    for child in element.children:
        if not isinstance(child, Tag):
            continue
        child_tag = child.name.lower()

        if child_tag == 'tr':
            # Row placed directly under <table>: let the table pick its default section.
            row = process_element(child, apply_element_styling(context, child))
            if row:
                table.add_row(row)
        elif child_tag in section_by_tag:
            section = section_by_tag[child_tag]
            # recursive=False: rows of tables nested in cells belong to those tables.
            for row_elem in child.find_all('tr', recursive=False):
                row = process_element(row_elem, apply_element_styling(context, row_elem))
                if row:
                    table.add_row(row, section)

    return table
def table_row_handler(element: Tag, context: StyleContext) -> TableRow:
    """
    Convert a <tr> element into a TableRow.

    Only direct <td>/<th> child elements are processed; each one is styled,
    dispatched through ``process_element`` and added when the result is truthy.
    """
    result = TableRow(context.font)
    cell_nodes = (
        node for node in element.children
        if isinstance(node, Tag) and node.name.lower() in ['td', 'th']
    )
    for cell_node in cell_nodes:
        converted = process_element(cell_node, apply_element_styling(context, cell_node))
        if converted:
            result.add_cell(converted)
    return result
def _parse_span(attributes: Dict[str, Any], name: str) -> int:
    """
    Read a ``colspan``/``rowspan`` style attribute as an integer.

    Returns 1 when the attribute is missing or cannot be parsed as an integer
    (e.g. ``colspan=""`` or ``colspan="2px"``), so malformed markup does not
    abort the conversion of the whole table.
    """
    try:
        return int(attributes.get(name, 1))
    except (TypeError, ValueError):
        return 1


def _build_table_cell(element: Tag, context: StyleContext, is_header: bool) -> TableCell:
    """
    Shared implementation for <td> and <th>.

    Child elements are styled and dispatched through ``process_element``;
    their truthy results are added to the cell (lists block by block). Each
    non-empty text node directly inside the cell becomes its own Paragraph.
    HTML comments are skipped.
    """
    colspan = _parse_span(context.element_attributes, 'colspan')
    rowspan = _parse_span(context.element_attributes, 'rowspan')
    cell = TableCell(is_header, colspan, rowspan, context.font)

    for child in element.children:
        if isinstance(child, Tag):
            child_context = apply_element_styling(context, child)
            result = process_element(child, child_context)
            if not result:
                continue
            blocks = result if isinstance(result, list) else [result]
            for block in blocks:
                cell.add_block(block)
        elif isinstance(child, NavigableString) and not isinstance(child, Comment):
            # Comment subclasses NavigableString; excluding it keeps the text
            # of <!-- ... --> out of the rendered cell.
            tokens = str(child).split()
            if tokens:
                # Direct text in cell - create paragraph
                paragraph = Paragraph(context.font)
                for token in tokens:
                    # Pass the background too, as extract_text_content does.
                    paragraph.add_word(Word(token, context.font, context.background))
                cell.add_block(paragraph)

    return cell


def table_cell_handler(element: Tag, context: StyleContext) -> TableCell:
    """
    Handle <td> elements.

    Args:
        element: The <td> Tag
        context: Style context already styled for ``element``

    Returns:
        Non-header TableCell holding the converted cell content
    """
    return _build_table_cell(element, context, False)


def table_header_cell_handler(element: Tag, context: StyleContext) -> TableCell:
    """
    Handle <th> elements.

    Args:
        element: The <th> Tag
        context: Style context already styled for ``element``

    Returns:
        Header TableCell holding the converted cell content
    """
    return _build_table_cell(element, context, True)
def horizontal_rule_handler(element: Tag, context: StyleContext) -> Block:
    """
    Stand-in conversion for <hr> elements.

    There is no dedicated horizontal-rule block yet, so an empty Paragraph
    using the context font is returned in its place.
    """
    # TODO: Create a specific HorizontalRule block type
    placeholder = Paragraph(context.font)
    return placeholder
def line_break_handler(element: Tag, context: StyleContext) -> None:
    """
    Produce nothing for <br> elements.

    Line breaks are expected to be dealt with at the paragraph level, so this
    handler contributes no block.
    """
def image_handler(element: Tag, context: StyleContext) -> Block:
    """
    Stand-in conversion for <img> elements.

    There is no image block yet, so a Paragraph is returned that contains the
    words of the ``alt`` attribute (read from ``context.element_attributes``),
    or no words at all when the attribute is missing or empty.
    """
    # TODO: Create Image block type
    placeholder = Paragraph(context.font)
    alt = context.element_attributes.get('alt', '')
    tokens = alt.split() if alt else []
    for token in tokens:
        placeholder.add_word(Word(token, context.font))
    return placeholder
def ignore_handler(element: Tag, context: StyleContext) -> None:
    """Produce nothing; registered for tags that contribute no block of their own."""
def generic_handler(element: Tag, context: StyleContext) -> List[Block]:
    """Fallback for tags without a dedicated handler.

    Unknown elements are treated as plain containers: the work is delegated
    unchanged to ``div_handler`` and its result is returned as-is.
    """
    container_blocks = div_handler(element, context)
    return container_blocks
# Handler registry - maps lower-case HTML tag names to handler functions.
# Tags that share a handler are grouped with dict.fromkeys(); the resulting
# keys, values and insertion order match a flat one-entry-per-tag literal.
HANDLERS: Dict[str, Callable[[Tag, StyleContext], Union[Block, List[Block], None]]] = {
    # Block elements
    'p': paragraph_handler,
    'div': div_handler,
    **dict.fromkeys(('h1', 'h2', 'h3', 'h4', 'h5', 'h6'), heading_handler),
    'blockquote': blockquote_handler,
    'pre': preformatted_handler,
    'code': code_handler,
    'ul': unordered_list_handler,
    'ol': ordered_list_handler,
    'li': list_item_handler,
    'table': table_handler,
    'tr': table_row_handler,
    'td': table_cell_handler,
    'th': table_header_cell_handler,
    'hr': horizontal_rule_handler,
    'br': line_break_handler,

    # Semantic elements (treated as containers)
    **dict.fromkeys(
        ('section', 'article', 'aside', 'nav', 'header', 'footer', 'main', 'figure'),
        div_handler,
    ),
    'figcaption': paragraph_handler,

    # Media elements
    'img': image_handler,

    # Inline elements (handled during text extraction), so dispatching them
    # directly yields no block.
    **dict.fromkeys(
        (
            'span', 'a', 'strong', 'b', 'em', 'i', 'u', 's', 'del', 'ins',
            'mark', 'small', 'sub', 'sup', 'q', 'cite', 'abbr', 'time',
        ),
        ignore_handler,
    ),

    # Ignored elements
    **dict.fromkeys(
        ('script', 'style', 'meta', 'link', 'head', 'title'),
        ignore_handler,
    ),
}
def parse_html_string(html_string: str, base_font: Optional[Font] = None) -> List[Block]:
    """
    Convert an HTML string into a flat list of Block objects.

    Only the direct ``Tag`` children of ``<body>`` (or of the whole document
    when there is no ``<body>``) are dispatched; top-level text nodes are
    not processed here.

    Args:
        html_string: HTML content to parse
        base_font: Base font handed to ``create_base_context``; may be None

    Returns:
        List of Block objects representing the document structure

    NOTE(review): results are filtered by truthiness, so ``None``, empty
    lists and any block that evaluates as false (possibly an empty
    ``Paragraph`` if it defines ``__len__`` - confirm) are skipped.
    """
    document = BeautifulSoup(html_string, 'html.parser')
    base_context = create_base_context(base_font)

    # Process the body if it exists, otherwise process all top-level elements
    container = document.find('body') or document

    collected = []
    for node in container.children:
        if not isinstance(node, Tag):
            continue
        node_context = apply_element_styling(base_context, node)
        outcome = process_element(node, node_context)
        if not outcome:
            continue
        # A handler may return a single block or a list of blocks.
        if isinstance(outcome, list):
            collected.extend(outcome)
        else:
            collected.append(outcome)
    return collected
def register_handler(tag_name: str, handler: Callable[[Tag, StyleContext], Union[Block, List[Block], None]]):
    """
    Register a custom handler for an HTML tag.

    The name is stored lower-cased because ``get_handler`` lower-cases the
    tag before looking it up; a mixed-case registration would otherwise
    never be found. An existing entry for the same tag is replaced.

    Args:
        tag_name: HTML tag name (case-insensitive; stored in lowercase)
        handler: Handler function with signature (element: Tag, context: StyleContext) -> Union[Block, List[Block], None]
    """
    HANDLERS[tag_name.lower()] = handler
def get_handler(tag_name: str) -> Callable[[Tag, StyleContext], Union[Block, List[Block], None]]:
    """
    Look up the handler registered for an HTML tag.

    Args:
        tag_name: HTML tag name; it is lower-cased before the lookup

    Returns:
        The registered handler, or generic_handler when the tag is unknown
    """
    normalized = tag_name.lower()
    try:
        return HANDLERS[normalized]
    except KeyError:
        # Unknown tags fall back to the generic container handler.
        return generic_handler

View File

@ -7,7 +7,7 @@ entity references, and word creation in HTML documents.
from typing import Optional from typing import Optional
from pyWebLayout.abstract.inline import Word from pyWebLayout.abstract.inline import Word
from pyWebLayout.abstract.block import Parapgraph from pyWebLayout.abstract.block import Paragraph
from pyWebLayout.io.readers.html_style import HTMLStyleManager from pyWebLayout.io.readers.html_style import HTMLStyleManager
@ -28,14 +28,14 @@ class HTMLTextProcessor:
""" """
self._style_manager = style_manager self._style_manager = style_manager
self._text_buffer = "" self._text_buffer = ""
self._current_paragraph: Optional[Parapgraph] = None self._current_paragraph: Optional[Paragraph] = None
def reset(self): def reset(self):
"""Reset the text processor state.""" """Reset the text processor state."""
self._text_buffer = "" self._text_buffer = ""
self._current_paragraph = None self._current_paragraph = None
def set_current_paragraph(self, paragraph: Optional[Parapgraph]): def set_current_paragraph(self, paragraph: Optional[Paragraph]):
""" """
Set the current paragraph for text output. Set the current paragraph for text output.

View File

@ -139,7 +139,7 @@ class DocumentPaginator:
for chapter in self.document.chapters: for chapter in self.document.chapters:
# Add a heading block for the chapter if it has a title # Add a heading block for the chapter if it has a title
if chapter.title: if chapter.title:
from pyWebLayout.abstract.block import Heading, HeadingLevel, Parapgraph from pyWebLayout.abstract.block import Heading, HeadingLevel, Paragraph
from pyWebLayout.abstract.inline import Word from pyWebLayout.abstract.inline import Word
# Create a heading for the chapter # Create a heading for the chapter

View File

@ -6,7 +6,7 @@ Tests the core abstract block classes that form the foundation of the document m
import unittest import unittest
from pyWebLayout.abstract.block import ( from pyWebLayout.abstract.block import (
Block, BlockType, Parapgraph, Heading, HeadingLevel, Quote, CodeBlock, Block, BlockType, Paragraph, Heading, HeadingLevel, Quote, CodeBlock,
HList, ListStyle, ListItem, Table, TableRow, TableCell, HList, ListStyle, ListItem, Table, TableRow, TableCell,
HorizontalRule, LineBreak, Image HorizontalRule, LineBreak, Image
) )
@ -19,7 +19,7 @@ class TestBlockElements(unittest.TestCase):
def test_paragraph_creation(self): def test_paragraph_creation(self):
"""Test creating and using paragraphs.""" """Test creating and using paragraphs."""
paragraph = Parapgraph() paragraph = Paragraph()
self.assertEqual(paragraph.block_type, BlockType.PARAGRAPH) self.assertEqual(paragraph.block_type, BlockType.PARAGRAPH)
self.assertEqual(paragraph.word_count, 0) self.assertEqual(paragraph.word_count, 0)
@ -62,8 +62,8 @@ class TestBlockElements(unittest.TestCase):
quote = Quote() quote = Quote()
# Add nested paragraphs # Add nested paragraphs
p1 = Parapgraph() p1 = Paragraph()
p2 = Parapgraph() p2 = Paragraph()
quote.add_block(p1) quote.add_block(p1)
quote.add_block(p2) quote.add_block(p2)

View File

@ -7,7 +7,7 @@ document structure and metadata management.
import unittest import unittest
from pyWebLayout.abstract.document import Document, Chapter, Book, MetadataType from pyWebLayout.abstract.document import Document, Chapter, Book, MetadataType
from pyWebLayout.abstract.block import Parapgraph, Heading, HeadingLevel, BlockType from pyWebLayout.abstract.block import Paragraph, Heading, HeadingLevel, BlockType
from pyWebLayout.abstract.inline import Word, FormattedSpan from pyWebLayout.abstract.inline import Word, FormattedSpan
from pyWebLayout.style import Font from pyWebLayout.style import Font
@ -77,8 +77,8 @@ class TestDocument(unittest.TestCase):
def test_block_management(self): def test_block_management(self):
"""Test adding and managing blocks.""" """Test adding and managing blocks."""
# Create some blocks # Create some blocks
para1 = Parapgraph() para1 = Paragraph()
para2 = Parapgraph() para2 = Paragraph()
heading = Heading(HeadingLevel.H1) heading = Heading(HeadingLevel.H1)
# Add blocks # Add blocks
@ -95,7 +95,7 @@ class TestDocument(unittest.TestCase):
def test_anchor_management(self): def test_anchor_management(self):
"""Test named anchor functionality.""" """Test named anchor functionality."""
heading = Heading(HeadingLevel.H1) heading = Heading(HeadingLevel.H1)
para = Parapgraph() para = Paragraph()
# Add anchors # Add anchors
self.doc.add_anchor("intro", heading) self.doc.add_anchor("intro", heading)
@ -154,8 +154,8 @@ class TestDocument(unittest.TestCase):
def test_find_blocks_by_type(self): def test_find_blocks_by_type(self):
"""Test finding blocks by type.""" """Test finding blocks by type."""
# Create blocks of different types # Create blocks of different types
para1 = Parapgraph() para1 = Paragraph()
para2 = Parapgraph() para2 = Paragraph()
heading1 = Heading(HeadingLevel.H1) heading1 = Heading(HeadingLevel.H1)
heading2 = Heading(HeadingLevel.H2) heading2 = Heading(HeadingLevel.H2)
@ -180,7 +180,7 @@ class TestDocument(unittest.TestCase):
def test_find_headings(self): def test_find_headings(self):
"""Test finding heading blocks specifically.""" """Test finding heading blocks specifically."""
# Create mixed blocks # Create mixed blocks
para = Parapgraph() para = Paragraph()
h1 = Heading(HeadingLevel.H1) h1 = Heading(HeadingLevel.H1)
h2 = Heading(HeadingLevel.H2) h2 = Heading(HeadingLevel.H2)
@ -284,8 +284,8 @@ class TestChapter(unittest.TestCase):
def test_block_management(self): def test_block_management(self):
"""Test adding blocks to chapter.""" """Test adding blocks to chapter."""
para1 = Parapgraph() para1 = Paragraph()
para2 = Parapgraph() para2 = Paragraph()
heading = Heading(HeadingLevel.H2) heading = Heading(HeadingLevel.H2)
# Add blocks # Add blocks
@ -450,7 +450,7 @@ class TestBook(unittest.TestCase):
"""Test that Book inherits all Document functionality.""" """Test that Book inherits all Document functionality."""
# Test that book can use all document methods # Test that book can use all document methods
# Add blocks directly to book # Add blocks directly to book
para = Parapgraph() para = Paragraph()
self.book.add_block(para) self.book.add_block(para)
self.assertEqual(len(self.book.blocks), 1) self.assertEqual(len(self.book.blocks), 1)

View File

@ -1,44 +0,0 @@
#!/usr/bin/env python3
"""
Simple test script to verify that the EPUB reader fixes are working correctly.
"""
import sys
import os
# Add the pyWebLayout directory (next to this script) to the Python path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'pyWebLayout'))
# Everything below runs inside one try block so that any import or parsing
# failure is reported with a traceback and a non-zero exit status.
try:
    from pyWebLayout.io.readers.epub_reader import read_epub
    print("Successfully imported epub_reader module")
    # Test reading the EPUB file (path is relative to the current working directory)
    epub_path = os.path.join('pyWebLayout', 'examples', 'pg174-images-3.epub')
    if not os.path.exists(epub_path):
        print(f"EPUB file not found: {epub_path}")
        # SystemExit is not an Exception subclass, so the handler below does not catch it.
        sys.exit(1)
    print(f"Reading EPUB file: {epub_path}")
    # Try to read the EPUB
    book = read_epub(epub_path)
    print(f"Successfully read EPUB file!")
    print(f"Book title: {book.title}")
    print(f"Number of chapters: {len(book.chapters)}")
    # Check first chapter (only when the book has at least one)
    if book.chapters:
        first_chapter = book.chapters[0]
        print(f"First chapter title: {first_chapter.title}")
        print(f"First chapter has {len(first_chapter.blocks)} blocks")
except Exception as e:
    # Top-level boundary: report the error, dump the traceback, and exit with status 1.
    print(f"Error: {e}")
    import traceback
    traceback.print_exc()
    sys.exit(1)
# Reached only when no exception was raised and the EPUB file existed.
print("Test completed successfully!")

View File

@ -9,7 +9,7 @@ import unittest
from pyWebLayout.io.readers.html_content import HTMLContentReader from pyWebLayout.io.readers.html_content import HTMLContentReader
from pyWebLayout.abstract.document import Document from pyWebLayout.abstract.document import Document
from pyWebLayout.abstract.block import ( from pyWebLayout.abstract.block import (
Parapgraph, Heading, HeadingLevel, HList, ListStyle, Paragraph, Heading, HeadingLevel, HList, ListStyle,
Table, Quote, CodeBlock, HorizontalRule, LineBreak Table, Quote, CodeBlock, HorizontalRule, LineBreak
) )
@ -29,7 +29,7 @@ class TestHTMLContentReader(unittest.TestCase):
result = self.reader.extract_content(html, self.document) result = self.reader.extract_content(html, self.document)
self.assertEqual(len(self.document.blocks), 1) self.assertEqual(len(self.document.blocks), 1)
self.assertIsInstance(self.document.blocks[0], Parapgraph) self.assertIsInstance(self.document.blocks[0], Paragraph)
paragraph = self.document.blocks[0] paragraph = self.document.blocks[0]
words = list(paragraph.words()) words = list(paragraph.words())
@ -107,7 +107,7 @@ class TestHTMLContentReader(unittest.TestCase):
# Check first item content # Check first item content
first_item_blocks = list(items[0].blocks()) first_item_blocks = list(items[0].blocks())
self.assertEqual(len(first_item_blocks), 1) self.assertEqual(len(first_item_blocks), 1)
self.assertIsInstance(first_item_blocks[0], Parapgraph) self.assertIsInstance(first_item_blocks[0], Paragraph)
def test_ordered_list(self): def test_ordered_list(self):
"""Test parsing ordered lists.""" """Test parsing ordered lists."""
@ -202,8 +202,8 @@ class TestHTMLContentReader(unittest.TestCase):
quote = self.document.blocks[0] quote = self.document.blocks[0]
quote_blocks = list(quote.blocks()) quote_blocks = list(quote.blocks())
self.assertEqual(len(quote_blocks), 2) self.assertEqual(len(quote_blocks), 2)
self.assertIsInstance(quote_blocks[0], Parapgraph) self.assertIsInstance(quote_blocks[0], Paragraph)
self.assertIsInstance(quote_blocks[1], Parapgraph) self.assertIsInstance(quote_blocks[1], Paragraph)
def test_code_block(self): def test_code_block(self):
"""Test parsing code blocks.""" """Test parsing code blocks."""
@ -229,9 +229,9 @@ def hello():
self.reader.extract_content(html, self.document) self.reader.extract_content(html, self.document)
self.assertEqual(len(self.document.blocks), 3) self.assertEqual(len(self.document.blocks), 3)
self.assertIsInstance(self.document.blocks[0], Parapgraph) self.assertIsInstance(self.document.blocks[0], Paragraph)
self.assertIsInstance(self.document.blocks[1], HorizontalRule) self.assertIsInstance(self.document.blocks[1], HorizontalRule)
self.assertIsInstance(self.document.blocks[2], Parapgraph) self.assertIsInstance(self.document.blocks[2], Paragraph)
def test_html_entities(self): def test_html_entities(self):
"""Test handling HTML entities.""" """Test handling HTML entities."""
@ -268,7 +268,7 @@ def hello():
# Check that we have different types of blocks # Check that we have different types of blocks
block_types = [type(block).__name__ for block in self.document.blocks] block_types = [type(block).__name__ for block in self.document.blocks]
self.assertIn('Parapgraph', block_types) # From div self.assertIn('Paragraph', block_types) # From div
self.assertIn('Heading', block_types) self.assertIn('Heading', block_types)
self.assertIn('HList', block_types) self.assertIn('HList', block_types)
@ -346,7 +346,7 @@ def hello():
# Should have different types of content # Should have different types of content
block_types = set(type(block).__name__ for block in self.document.blocks) block_types = set(type(block).__name__ for block in self.document.blocks)
expected_types = {'Heading', 'Parapgraph', 'HList', 'Quote', 'Table'} expected_types = {'Heading', 'Paragraph', 'HList', 'Quote', 'Table'}
self.assertTrue(expected_types.issubset(block_types)) self.assertTrue(expected_types.issubset(block_types))

View File

@ -0,0 +1,384 @@
"""
Unit tests for HTML extraction functionality.
Tests the HTML parsing and conversion to pyWebLayout abstract elements,
including styled content within paragraphs and block-level elements.
"""
import unittest
from pyWebLayout.io.readers.html_extraction import parse_html_string
from pyWebLayout.abstract.block import Paragraph, Heading, HeadingLevel, Quote, CodeBlock, HList, ListStyle, Table
from pyWebLayout.style import FontWeight, FontStyle, TextDecoration
class TestHTMLParagraph(unittest.TestCase):
    """Plain ``<p>`` elements become paragraphs holding one word per token."""

    def _assert_words(self, paragraph, sentence):
        """Check the paragraph's words against the space-separated tokens of *sentence*."""
        expected_tokens = sentence.split(" ")
        for entry, expected in zip(paragraph.words(), expected_tokens):
            # words() yields indexable entries; the word object is at index 1.
            self.assertEqual(entry[1].text, expected)

    def test_simple(self):
        """A single paragraph yields one block with four words."""
        blocks = parse_html_string("<p>This is a paragraph.</p>")

        self.assertEqual(len(blocks), 1)
        self.assertEqual(len(blocks[0]), 4)
        self._assert_words(blocks[0], "This is a paragraph.")

    def test_multiple(self):
        """Two sibling paragraphs yield two blocks, each with its own words."""
        html = "<p>This is a paragraph.</p><p>This is another paragraph.</p>"
        blocks = parse_html_string(html)

        self.assertEqual(len(blocks), 2)
        first, second = blocks[0], blocks[1]
        self.assertEqual(len(first), 4)
        self.assertEqual(len(second), 4)
        self._assert_words(first, "This is a paragraph.")
        self._assert_words(second, "This is another paragraph.")
class TestHTMLStyledParagraphs(unittest.TestCase):
    """Inline markup inside a paragraph must show up in each word's style."""

    def test_bold_text(self):
        """<strong> makes exactly the enclosed words bold."""
        html = "<p>This is <strong>bold text</strong> in a paragraph.</p>"
        blocks = parse_html_string(html)
        self.assertEqual(len(blocks), 1)
        self.assertIsInstance(blocks[0], Paragraph)

        entries = list(blocks[0].words())
        self.assertEqual(len(entries), 7)  # "This is bold text in a paragraph."

        # The two words wrapped by <strong> sit at positions 2 and 3.
        for position, expected in ((2, "bold"), (3, "text")):
            word = entries[position][1]
            self.assertEqual(word.text, expected)
            self.assertEqual(word.style.weight, FontWeight.BOLD)

        # A word outside the tag must not be bold.
        leading = entries[0][1]
        self.assertEqual(leading.text, "This")
        self.assertNotEqual(leading.style.weight, FontWeight.BOLD)

    def test_italic_text(self):
        """<em> makes the enclosed words italic."""
        html = "<p>This is <em>italic text</em> in a paragraph.</p>"
        blocks = parse_html_string(html)
        self.assertEqual(len(blocks), 1)
        self.assertIsInstance(blocks[0], Paragraph)

        entries = list(blocks[0].words())
        # Fetch both words before asserting anything about either of them.
        italic_word, text_word = entries[2][1], entries[3][1]
        for word, expected in ((italic_word, "italic"), (text_word, "text")):
            self.assertEqual(word.text, expected)
            self.assertEqual(word.style.style, FontStyle.ITALIC)

    def test_underlined_text(self):
        """<u> sets the underline decoration on the enclosed word."""
        blocks = parse_html_string("<p>This is <u>underlined text</u> here.</p>")
        self.assertEqual(len(blocks), 1)

        target = list(blocks[0].words())[2][1]  # 'underlined'
        self.assertEqual(target.style.decoration, TextDecoration.UNDERLINE)

    def test_strikethrough_text(self):
        """<s> sets the strikethrough decoration on the enclosed word."""
        blocks = parse_html_string("<p>This is <s>strikethrough text</s> here.</p>")
        self.assertEqual(len(blocks), 1)

        target = list(blocks[0].words())[2][1]  # 'strikethrough'
        self.assertEqual(target.style.decoration, TextDecoration.STRIKETHROUGH)

    def test_span_with_inline_styles(self):
        """A <span> with inline CSS applies weight and colour to its words."""
        html = '<p>This text is normal, but <span style="color: red; font-weight: bold;">this part is red and bold</span>.</p>'
        blocks = parse_html_string(html)
        self.assertEqual(len(blocks), 1)
        self.assertIsInstance(blocks[0], Paragraph)

        entries = list(blocks[0].words())
        span_tokens = ("this", "part", "is", "red", "and", "bold")

        # Collect the bold words whose text matches a token from the span.
        styled_words = [
            word
            for _, word in entries
            if word.text in span_tokens and word.style.weight == FontWeight.BOLD
        ]
        self.assertGreater(len(styled_words), 0, "Should have bold words in styled span")

        # At least one of them must also carry the red colour.
        red_words = [word for word in styled_words if word.style.colour == (255, 0, 0)]
        self.assertGreater(len(red_words), 0, "Should have red colored words")

    def test_mixed_formatting(self):
        """Several independent inline styles can coexist in one paragraph."""
        html = '<p>This paragraph contains <strong>bold</strong>, <em>italic</em>, <span style="color: blue;">blue</span>, and <mark>highlighted</mark> text all together.</p>'
        blocks = parse_html_string(html)
        self.assertEqual(len(blocks), 1)
        self.assertIsInstance(blocks[0], Paragraph)

        styles = [word.style for _, word in blocks[0].words()]

        bold = [s for s in styles if s.weight == FontWeight.BOLD]
        self.assertGreater(len(bold), 0, "Should have bold words")

        italic = [s for s in styles if s.style == FontStyle.ITALIC]
        self.assertGreater(len(italic), 0, "Should have italic words")

        blue = [s for s in styles if s.colour == (0, 0, 255)]
        self.assertGreater(len(blue), 0, "Should have blue colored words")

    def test_nested_formatting(self):
        """Styles from nested inline tags accumulate on the innermost words."""
        html = "<p>This has <strong>bold with <em>italic inside</em></strong> formatting.</p>"
        blocks = parse_html_string(html)
        self.assertEqual(len(blocks), 1)

        # Words inside <em> are also inside <strong>, so they need both styles.
        doubly_styled = []
        for _, word in blocks[0].words():
            style = word.style
            if style.weight == FontWeight.BOLD and style.style == FontStyle.ITALIC:
                doubly_styled.append(word)
        self.assertGreater(len(doubly_styled), 0, "Should have words that are both bold and italic")

    def test_color_variations(self):
        """Hex and named CSS colours are both converted to RGB tuples."""
        html = '<p><span style="color: #ff0000;">Hex red</span> and <span style="color: green;">Named green</span>.</p>'
        blocks = parse_html_string(html)
        self.assertEqual(len(blocks), 1)

        colours = [word.style.colour for _, word in blocks[0].words()]

        hex_red = [c for c in colours if c == (255, 0, 0)]
        self.assertGreater(len(hex_red), 0, "Should have hex red colored words")

        # NOTE(review): the CSS keyword "green" is #008000, i.e. (0, 128, 0);
        # (0, 255, 0) is "lime". Confirm which mapping the parser intends.
        named_green = [c for c in colours if c == (0, 255, 0)]
        self.assertGreater(len(named_green), 0, "Should have green colored words")
class TestHTMLBlockElements(unittest.TestCase):
    """Block-level HTML elements map onto the matching abstract block types."""

    def _assert_two_paragraphs(self, blocks):
        """Assert that *blocks* is exactly two Paragraph instances."""
        self.assertEqual(len(blocks), 2)
        self.assertIsInstance(blocks[0], Paragraph)
        self.assertIsInstance(blocks[1], Paragraph)

    def test_body_element(self):
        """Children of <body> are returned as top-level blocks."""
        blocks = parse_html_string("<body><p>Paragraph one.</p><p>Paragraph two.</p></body>")
        self._assert_two_paragraphs(blocks)

    def test_div_container(self):
        """A <div> is transparent: its children are returned directly."""
        blocks = parse_html_string("<div><p>First paragraph.</p><p>Second paragraph.</p></div>")
        self._assert_two_paragraphs(blocks)

    def test_headings(self):
        """<h1> through <h6> each produce a Heading with the matching level."""
        html = "".join(f"<h{n}>Heading {n}</h{n}>" for n in range(1, 7))
        blocks = parse_html_string(html)
        self.assertEqual(len(blocks), 6)

        expected_levels = (
            HeadingLevel.H1, HeadingLevel.H2, HeadingLevel.H3,
            HeadingLevel.H4, HeadingLevel.H5, HeadingLevel.H6,
        )
        for block, level in zip(blocks, expected_levels):
            self.assertIsInstance(block, Heading)
            self.assertEqual(block.level, level)

            entries = list(block.words())
            self.assertEqual(len(entries), 2)  # "Heading" and number
            self.assertEqual(entries[0][1].text, "Heading")

    def test_blockquote(self):
        """<blockquote> becomes a Quote that contains the nested paragraph."""
        blocks = parse_html_string("<blockquote><p>This is a quoted paragraph.</p></blockquote>")
        self.assertEqual(len(blocks), 1)
        quote = blocks[0]
        self.assertIsInstance(quote, Quote)

        nested = list(quote.blocks())
        self.assertEqual(len(nested), 1)
        self.assertIsInstance(nested[0], Paragraph)

    def test_preformatted_code(self):
        """<pre><code> becomes a CodeBlock with at least one line."""
        html = "<pre><code>function hello() {\n console.log('Hello');\n}</code></pre>"
        blocks = parse_html_string(html)
        self.assertEqual(len(blocks), 1)
        self.assertIsInstance(blocks[0], CodeBlock)

        code_lines = list(blocks[0].lines())
        self.assertGreater(len(code_lines), 0)

    def test_unordered_list(self):
        """<ul> becomes an unordered HList with one item per <li>."""
        blocks = parse_html_string("<ul><li>First item</li><li>Second item</li><li>Third item</li></ul>")
        self.assertEqual(len(blocks), 1)
        bullet_list = blocks[0]
        self.assertIsInstance(bullet_list, HList)
        self.assertEqual(bullet_list.style, ListStyle.UNORDERED)

        self.assertEqual(len(list(bullet_list.items())), 3)

    def test_ordered_list(self):
        """<ol> becomes an ordered HList."""
        blocks = parse_html_string("<ol><li>First item</li><li>Second item</li><li>Third item</li></ol>")
        self.assertEqual(len(blocks), 1)
        numbered_list = blocks[0]
        self.assertIsInstance(numbered_list, HList)
        self.assertEqual(numbered_list.style, ListStyle.ORDERED)

    def test_list_with_styled_content(self):
        """Inline styling inside list items is preserved on their words."""
        html = "<ul><li>Normal item</li><li><strong>Bold item</strong></li><li>Item with <em>italic</em> text</li></ul>"
        blocks = parse_html_string(html)
        self.assertEqual(len(blocks), 1)
        self.assertIsInstance(blocks[0], HList)

        list_items = list(blocks[0].items())
        self.assertEqual(len(list_items), 3)

        # The bold check only runs when the second item produced any blocks.
        inner_blocks = list(list_items[1].blocks())
        if inner_blocks:
            bold_words = [
                word
                for _, word in inner_blocks[0].words()
                if word.style.weight == FontWeight.BOLD
            ]
            self.assertGreater(len(bold_words), 0)

    def test_table_basic(self):
        """A simple <table> with header and data rows yields one Table block."""
        html = "\n".join([
            "",
            "<table>",
            "<tr>",
            "<th>Header 1</th>",
            "<th>Header 2</th>",
            "</tr>",
            "<tr>",
            "<td>Cell 1</td>",
            "<td>Cell 2</td>",
            "</tr>",
            "</table>",
            "",
        ])
        blocks = parse_html_string(html)
        self.assertEqual(len(blocks), 1)
        self.assertIsInstance(blocks[0], Table)

    def test_semantic_elements(self):
        """Nested HTML5 semantic containers are flattened to their content."""
        blocks = parse_html_string("<section><article><p>Article content</p></article></section>")
        self.assertEqual(len(blocks), 1)
        self.assertIsInstance(blocks[0], Paragraph)

    def test_nested_block_elements(self):
        """A container holding mixed block types exposes all of them."""
        html = "\n".join([
            "",
            "<div>",
            "<h2>Section Title</h2>",
            "<p>Some introductory text.</p>",
            "<blockquote>",
            "<p>A quoted paragraph.</p>",
            "</blockquote>",
            "</div>",
            "",
        ])
        blocks = parse_html_string(html)
        self.assertGreater(len(blocks), 2)

        # Should have at least a heading, paragraph, and quote
        required = (
            (Heading, "Should contain a heading"),
            (Paragraph, "Should contain a paragraph"),
            (Quote, "Should contain a quote"),
        )
        for block_type, message in required:
            found = any(isinstance(block, block_type) for block in blocks)
            self.assertTrue(found, message)

    def test_empty_elements(self):
        """Parsing empty or whitespace-only elements must not raise.

        A length is never negative, so both assertions below only prove that
        parsing completes; they do not pin how many blocks are produced.
        """
        blocks = parse_html_string("<p></p><div></div><span></span>")
        # Empty elements may not create blocks, which is acceptable behavior
        self.assertGreaterEqual(len(blocks), 0)

        # Same check for a paragraph that contains only whitespace.
        whitespace_blocks = parse_html_string("<p> </p>")
        self.assertGreaterEqual(len(whitespace_blocks), 0)
class TestHTMLComplexStructures(unittest.TestCase):
    """Realistic documents that combine several block and inline features."""

    def test_article_with_mixed_content(self):
        """An article with heading, text, quote and list yields all four block types."""
        html = "\n".join([
            '',
            '<article>',
            '<h1>Article Title</h1>',
            '<p>This is the <strong>introduction</strong> paragraph with <em>some emphasis</em>.</p>',
            '<blockquote>',
            '<p>This is a <span style="color: blue;">quoted section</span> with styling.</p>',
            '</blockquote>',
            '<ul>',
            '<li>First <strong>important</strong> point</li>',
            '<li>Second point with <code>inline code</code></li>',
            '</ul>',
            '</article>',
            '',
        ])
        blocks = parse_html_string(html)
        self.assertGreater(len(blocks), 3)

        # Verify we have the expected block types
        type_names = [type(block).__name__ for block in blocks]
        for expected_name in ('Heading', 'Paragraph', 'Quote', 'HList'):
            self.assertIn(expected_name, type_names)

    def test_styled_table_content(self):
        """A table with <thead>/<tbody> and styled cells still yields one Table."""
        html = "\n".join([
            '',
            '<table>',
            '<thead>',
            '<tr>',
            '<th><strong>Product</strong></th>',
            '<th><em>Price</em></th>',
            '</tr>',
            '</thead>',
            '<tbody>',
            '<tr>',
            '<td>Item with <span style="color: red;">red text</span></td>',
            '<td><strong>$19.99</strong></td>',
            '</tr>',
            '</tbody>',
            '</table>',
            '',
        ])
        blocks = parse_html_string(html)
        self.assertEqual(len(blocks), 1)
        self.assertIsInstance(blocks[0], Table)
# Allow running this test module directly as a script.
if "__main__" == __name__:
    unittest.main()

View File

@ -8,7 +8,7 @@ import unittest
from unittest.mock import Mock, MagicMock from unittest.mock import Mock, MagicMock
from pyWebLayout.io.readers.html_text import HTMLTextProcessor from pyWebLayout.io.readers.html_text import HTMLTextProcessor
from pyWebLayout.io.readers.html_style import HTMLStyleManager from pyWebLayout.io.readers.html_style import HTMLStyleManager
from pyWebLayout.abstract.block import Parapgraph from pyWebLayout.abstract.block import Paragraph
from pyWebLayout.abstract.inline import Word from pyWebLayout.abstract.inline import Word
@ -21,7 +21,7 @@ class TestHTMLTextProcessor(unittest.TestCase):
self.text_processor = HTMLTextProcessor(self.style_manager) self.text_processor = HTMLTextProcessor(self.style_manager)
# Create a mock paragraph # Create a mock paragraph
self.mock_paragraph = Mock(spec=Parapgraph) self.mock_paragraph = Mock(spec=Paragraph)
self.mock_paragraph.add_word = Mock() self.mock_paragraph.add_word = Mock()
def test_initialization(self): def test_initialization(self):