This commit is contained in:
parent
81d85386c5
commit
ba6d8ca906
@ -1,6 +1,6 @@
|
|||||||
from .block import Block, BlockType, Paragraph, Heading, HeadingLevel, Quote, CodeBlock
|
from .block import Block, BlockType, Paragraph, Heading, HeadingLevel, Quote, CodeBlock
|
||||||
from .block import HList, ListItem, ListStyle, Table, TableRow, TableCell
|
from .block import HList, ListItem, ListStyle, Table, TableRow, TableCell
|
||||||
from .block import HorizontalRule, LineBreak, Image
|
#from .block import HorizontalRule, LineBreak, Image
|
||||||
from .inline import Word, FormattedSpan
|
from .inline import Word, FormattedSpan
|
||||||
from .document import Document, MetadataType, Chapter, Book
|
from .document import Document, MetadataType, Chapter, Book
|
||||||
from .functional import Link, LinkType, Button, Form, FormField, FormFieldType
|
from .functional import Link, LinkType, Button, Form, FormField, FormFieldType
|
||||||
|
|||||||
@ -183,6 +183,10 @@ class Paragraph(Block):
|
|||||||
def word_count(self) -> int:
|
def word_count(self) -> int:
|
||||||
"""Get the number of words in this paragraph"""
|
"""Get the number of words in this paragraph"""
|
||||||
return len(self._words)
|
return len(self._words)
|
||||||
|
|
||||||
|
def __len__(self):
    """Return the number of words in this paragraph so ``len(paragraph)`` works.

    The words are counted straight from ``self._words`` -- the same value
    ``word_count`` computes -- so the result is always an ``int`` regardless
    of whether ``word_count`` is exposed as a plain method or as a property.
    """
    return len(self._words)
|
||||||
|
|
||||||
|
|
||||||
class HeadingLevel(Enum):
|
class HeadingLevel(Enum):
|
||||||
@ -1008,3 +1012,9 @@ class Table(Block):
|
|||||||
self._footer_rows.append(row)
|
self._footer_rows.append(row)
|
||||||
else: # Default to body
|
else: # Default to body
|
||||||
self._rows
|
self._rows
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class Image:
    """Empty class: it declares no attributes or methods of its own.

    NOTE(review): looks like a stub that only exists so the name ``Image``
    can be referenced -- confirm before relying on any behaviour.
    """
|
||||||
@ -330,3 +330,8 @@ class FormattedSpan:
|
|||||||
self._words.append(word)
|
self._words.append(word)
|
||||||
|
|
||||||
return word
|
return word
|
||||||
|
|
||||||
|
|
||||||
|
class LineBreak:
    """Empty class: it declares no attributes or methods of its own.

    NOTE(review): looks like a stub that only exists so the name
    ``LineBreak`` can be referenced -- confirm before relying on any
    behaviour.
    """
|
||||||
@ -7,7 +7,7 @@ from PIL import Image
|
|||||||
from .style import Font, FontStyle, FontWeight, TextDecoration
|
from .style import Font, FontStyle, FontWeight, TextDecoration
|
||||||
from .abstract.document import Document, MetadataType, Book, Chapter
|
from .abstract.document import Document, MetadataType, Book, Chapter
|
||||||
from .abstract.block import (
|
from .abstract.block import (
|
||||||
Block, BlockType, Parapgraph, Heading, HeadingLevel, Quote, CodeBlock,
|
Block, BlockType, Paragraph, Heading, HeadingLevel, Quote, CodeBlock,
|
||||||
HList, ListStyle, ListItem, Table, TableRow, TableCell, HorizontalRule, LineBreak
|
HList, ListStyle, ListItem, Table, TableRow, TableCell, HorizontalRule, LineBreak
|
||||||
)
|
)
|
||||||
from .abstract.inline import Word, FormattedSpan
|
from .abstract.inline import Word, FormattedSpan
|
||||||
@ -138,7 +138,7 @@ class HTMLParser(BaseHTMLParser):
|
|||||||
|
|
||||||
elif tag == 'p':
|
elif tag == 'p':
|
||||||
self._flush_text() # Flush any pending text
|
self._flush_text() # Flush any pending text
|
||||||
self._current_paragraph = Parapgraph()
|
self._current_paragraph = Paragraph()
|
||||||
|
|
||||||
# Add the paragraph to the current block or document
|
# Add the paragraph to the current block or document
|
||||||
if self._current_block and hasattr(self._current_block, 'add_block'):
|
if self._current_block and hasattr(self._current_block, 'add_block'):
|
||||||
@ -180,7 +180,7 @@ class HTMLParser(BaseHTMLParser):
|
|||||||
self._flush_text() # Flush any pending text
|
self._flush_text() # Flush any pending text
|
||||||
|
|
||||||
# For divs, we create a new paragraph as a container
|
# For divs, we create a new paragraph as a container
|
||||||
div_para = Parapgraph()
|
div_para = Paragraph()
|
||||||
|
|
||||||
# Add the div to the current block or document
|
# Add the div to the current block or document
|
||||||
if self._current_block and hasattr(self._current_block, 'add_block'):
|
if self._current_block and hasattr(self._current_block, 'add_block'):
|
||||||
@ -214,7 +214,7 @@ class HTMLParser(BaseHTMLParser):
|
|||||||
# Pre can optionally contain a code block
|
# Pre can optionally contain a code block
|
||||||
# We'll create a paragraph for now, and if we find a code tag inside,
|
# We'll create a paragraph for now, and if we find a code tag inside,
|
||||||
# we'll replace it with a code block
|
# we'll replace it with a code block
|
||||||
pre_para = Parapgraph()
|
pre_para = Paragraph()
|
||||||
|
|
||||||
# Add the pre to the current block or document
|
# Add the pre to the current block or document
|
||||||
if self._current_block and hasattr(self._current_block, 'add_block'):
|
if self._current_block and hasattr(self._current_block, 'add_block'):
|
||||||
@ -229,7 +229,7 @@ class HTMLParser(BaseHTMLParser):
|
|||||||
|
|
||||||
elif tag == 'code':
|
elif tag == 'code':
|
||||||
# If we're inside a pre, replace the paragraph with a code block
|
# If we're inside a pre, replace the paragraph with a code block
|
||||||
if self._block_stack and isinstance(self._block_stack[-1], Parapgraph):
|
if self._block_stack and isinstance(self._block_stack[-1], Paragraph):
|
||||||
pre_para = self._block_stack.pop()
|
pre_para = self._block_stack.pop()
|
||||||
|
|
||||||
# Get the language from class if specified (e.g., class="language-python")
|
# Get the language from class if specified (e.g., class="language-python")
|
||||||
@ -312,7 +312,7 @@ class HTMLParser(BaseHTMLParser):
|
|||||||
self._current_block = list_item
|
self._current_block = list_item
|
||||||
|
|
||||||
# Create a paragraph for the term content
|
# Create a paragraph for the term content
|
||||||
term_para = Parapgraph()
|
term_para = Paragraph()
|
||||||
list_item.add_block(term_para)
|
list_item.add_block(term_para)
|
||||||
self._current_paragraph = term_para
|
self._current_paragraph = term_para
|
||||||
|
|
||||||
@ -325,7 +325,7 @@ class HTMLParser(BaseHTMLParser):
|
|||||||
list_item = current_list._items[-1]
|
list_item = current_list._items[-1]
|
||||||
|
|
||||||
# Create a paragraph for the description content
|
# Create a paragraph for the description content
|
||||||
desc_para = Parapgraph()
|
desc_para = Paragraph()
|
||||||
list_item.add_block(desc_para)
|
list_item.add_block(desc_para)
|
||||||
|
|
||||||
# Update current state
|
# Update current state
|
||||||
@ -340,7 +340,7 @@ class HTMLParser(BaseHTMLParser):
|
|||||||
self._current_block = list_item
|
self._current_block = list_item
|
||||||
|
|
||||||
# Create a paragraph for the description content
|
# Create a paragraph for the description content
|
||||||
desc_para = Parapgraph()
|
desc_para = Paragraph()
|
||||||
list_item.add_block(desc_para)
|
list_item.add_block(desc_para)
|
||||||
self._current_paragraph = desc_para
|
self._current_paragraph = desc_para
|
||||||
|
|
||||||
@ -424,7 +424,7 @@ class HTMLParser(BaseHTMLParser):
|
|||||||
self._current_block = cell
|
self._current_block = cell
|
||||||
|
|
||||||
# Create a paragraph for the cell content
|
# Create a paragraph for the cell content
|
||||||
cell_para = Parapgraph()
|
cell_para = Paragraph()
|
||||||
cell.add_block(cell_para)
|
cell.add_block(cell_para)
|
||||||
self._current_paragraph = cell_para
|
self._current_paragraph = cell_para
|
||||||
|
|
||||||
@ -508,6 +508,7 @@ class HTMLParser(BaseHTMLParser):
|
|||||||
})
|
})
|
||||||
|
|
||||||
elif tag == 'br':
|
elif tag == 'br':
|
||||||
|
|
||||||
# Add a line break
|
# Add a line break
|
||||||
if self._current_paragraph:
|
if self._current_paragraph:
|
||||||
line_break = LineBreak()
|
line_break = LineBreak()
|
||||||
|
|||||||
@ -379,10 +379,10 @@ class EPUBReader:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error parsing chapter {i+1}: {str(e)}")
|
print(f"Error parsing chapter {i+1}: {str(e)}")
|
||||||
# Add an error message block
|
# Add an error message block
|
||||||
from pyWebLayout.abstract.block import Parapgraph
|
from pyWebLayout.abstract.block import Paragraph
|
||||||
from pyWebLayout.abstract.inline import Word
|
from pyWebLayout.abstract.inline import Word
|
||||||
from pyWebLayout.style import Font
|
from pyWebLayout.style import Font
|
||||||
error_para = Parapgraph()
|
error_para = Paragraph()
|
||||||
# Create a default font style for the error message
|
# Create a default font style for the error message
|
||||||
default_font = Font()
|
default_font = Font()
|
||||||
error_para.add_word(Word(f"Error loading chapter: {str(e)}", default_font))
|
error_para.add_word(Word(f"Error loading chapter: {str(e)}", default_font))
|
||||||
|
|||||||
@ -9,9 +9,9 @@ from typing import Dict, List, Optional, Any
|
|||||||
import urllib.parse
|
import urllib.parse
|
||||||
from pyWebLayout.abstract.document import Document
|
from pyWebLayout.abstract.document import Document
|
||||||
from pyWebLayout.abstract.block import (
|
from pyWebLayout.abstract.block import (
|
||||||
Block, Parapgraph, Heading, HeadingLevel, Quote, CodeBlock,
|
Block, Paragraph, Heading, HeadingLevel, Quote, CodeBlock,
|
||||||
HList, ListStyle, ListItem, Table, TableRow, TableCell,
|
HList, ListStyle, ListItem, Table, TableRow, TableCell,
|
||||||
HorizontalRule, LineBreak, Image
|
#HorizontalRule, LineBreak, Image
|
||||||
)
|
)
|
||||||
from pyWebLayout.abstract.functional import Link, LinkType
|
from pyWebLayout.abstract.functional import Link, LinkType
|
||||||
from pyWebLayout.io.readers.html_style import HTMLStyleManager
|
from pyWebLayout.io.readers.html_style import HTMLStyleManager
|
||||||
@ -26,7 +26,7 @@ class BlockElementHandler:
|
|||||||
self.text_processor = text_processor
|
self.text_processor = text_processor
|
||||||
self.block_stack: List[Block] = []
|
self.block_stack: List[Block] = []
|
||||||
self.current_block: Optional[Block] = None
|
self.current_block: Optional[Block] = None
|
||||||
self.current_paragraph: Optional[Parapgraph] = None
|
self.current_paragraph: Optional[Paragraph] = None
|
||||||
|
|
||||||
def reset(self):
|
def reset(self):
|
||||||
"""Reset the handler state."""
|
"""Reset the handler state."""
|
||||||
@ -44,7 +44,7 @@ class BlockElementHandler:
|
|||||||
def handle_paragraph_start(self, document: Document):
|
def handle_paragraph_start(self, document: Document):
|
||||||
"""Handle the start of a paragraph element."""
|
"""Handle the start of a paragraph element."""
|
||||||
self.text_processor.flush_text()
|
self.text_processor.flush_text()
|
||||||
paragraph = Parapgraph()
|
paragraph = Paragraph()
|
||||||
|
|
||||||
self.add_block_to_document_or_parent(paragraph, document)
|
self.add_block_to_document_or_parent(paragraph, document)
|
||||||
self.block_stack.append(paragraph)
|
self.block_stack.append(paragraph)
|
||||||
@ -71,7 +71,7 @@ class BlockElementHandler:
|
|||||||
def handle_div_start(self, document: Document):
|
def handle_div_start(self, document: Document):
|
||||||
"""Handle the start of a div element."""
|
"""Handle the start of a div element."""
|
||||||
self.text_processor.flush_text()
|
self.text_processor.flush_text()
|
||||||
div_para = Parapgraph()
|
div_para = Paragraph()
|
||||||
|
|
||||||
self.add_block_to_document_or_parent(div_para, document)
|
self.add_block_to_document_or_parent(div_para, document)
|
||||||
self.block_stack.append(div_para)
|
self.block_stack.append(div_para)
|
||||||
@ -93,7 +93,7 @@ class BlockElementHandler:
|
|||||||
def handle_pre_start(self, document: Document):
|
def handle_pre_start(self, document: Document):
|
||||||
"""Handle the start of a pre element."""
|
"""Handle the start of a pre element."""
|
||||||
self.text_processor.flush_text()
|
self.text_processor.flush_text()
|
||||||
pre_para = Parapgraph()
|
pre_para = Paragraph()
|
||||||
|
|
||||||
self.add_block_to_document_or_parent(pre_para, document)
|
self.add_block_to_document_or_parent(pre_para, document)
|
||||||
self.block_stack.append(pre_para)
|
self.block_stack.append(pre_para)
|
||||||
@ -104,7 +104,7 @@ class BlockElementHandler:
|
|||||||
def handle_code_start(self, attrs: Dict[str, str], document: Document):
|
def handle_code_start(self, attrs: Dict[str, str], document: Document):
|
||||||
"""Handle the start of a code element."""
|
"""Handle the start of a code element."""
|
||||||
# If we're inside a pre, replace the paragraph with a code block
|
# If we're inside a pre, replace the paragraph with a code block
|
||||||
if self.block_stack and isinstance(self.block_stack[-1], Parapgraph):
|
if self.block_stack and isinstance(self.block_stack[-1], Paragraph):
|
||||||
pre_para = self.block_stack.pop()
|
pre_para = self.block_stack.pop()
|
||||||
|
|
||||||
# Get the language from class if specified
|
# Get the language from class if specified
|
||||||
@ -145,7 +145,7 @@ class BlockElementHandler:
|
|||||||
if self.block_stack:
|
if self.block_stack:
|
||||||
self.current_block = self.block_stack[-1]
|
self.current_block = self.block_stack[-1]
|
||||||
# Update current paragraph based on block type
|
# Update current paragraph based on block type
|
||||||
if isinstance(self.current_block, Parapgraph):
|
if isinstance(self.current_block, Paragraph):
|
||||||
self.current_paragraph = self.current_block
|
self.current_paragraph = self.current_block
|
||||||
else:
|
else:
|
||||||
self.current_paragraph = None
|
self.current_paragraph = None
|
||||||
@ -201,7 +201,7 @@ class ListElementHandler:
|
|||||||
block_handler.current_block = list_item
|
block_handler.current_block = list_item
|
||||||
|
|
||||||
# Create a paragraph for the list item content
|
# Create a paragraph for the list item content
|
||||||
item_para = Parapgraph()
|
item_para = Paragraph()
|
||||||
list_item.add_block(item_para)
|
list_item.add_block(item_para)
|
||||||
block_handler.current_paragraph = item_para
|
block_handler.current_paragraph = item_para
|
||||||
self.text_processor.set_current_paragraph(item_para)
|
self.text_processor.set_current_paragraph(item_para)
|
||||||
@ -220,7 +220,7 @@ class ListElementHandler:
|
|||||||
block_handler.block_stack.append(list_item)
|
block_handler.block_stack.append(list_item)
|
||||||
block_handler.current_block = list_item
|
block_handler.current_block = list_item
|
||||||
|
|
||||||
term_para = Parapgraph()
|
term_para = Paragraph()
|
||||||
list_item.add_block(term_para)
|
list_item.add_block(term_para)
|
||||||
block_handler.current_paragraph = term_para
|
block_handler.current_paragraph = term_para
|
||||||
self.text_processor.set_current_paragraph(term_para)
|
self.text_processor.set_current_paragraph(term_para)
|
||||||
@ -228,7 +228,7 @@ class ListElementHandler:
|
|||||||
elif tag == 'dd':
|
elif tag == 'dd':
|
||||||
if current_list._items:
|
if current_list._items:
|
||||||
list_item = current_list._items[-1]
|
list_item = current_list._items[-1]
|
||||||
desc_para = Parapgraph()
|
desc_para = Paragraph()
|
||||||
list_item.add_block(desc_para)
|
list_item.add_block(desc_para)
|
||||||
block_handler.current_paragraph = desc_para
|
block_handler.current_paragraph = desc_para
|
||||||
self.text_processor.set_current_paragraph(desc_para)
|
self.text_processor.set_current_paragraph(desc_para)
|
||||||
@ -339,7 +339,7 @@ class TableElementHandler:
|
|||||||
block_handler.current_block = cell
|
block_handler.current_block = cell
|
||||||
|
|
||||||
# Create a paragraph for the cell content
|
# Create a paragraph for the cell content
|
||||||
cell_para = Parapgraph()
|
cell_para = Paragraph()
|
||||||
cell.add_block(cell_para)
|
cell.add_block(cell_para)
|
||||||
block_handler.current_paragraph = cell_para
|
block_handler.current_paragraph = cell_para
|
||||||
self.text_processor.set_current_paragraph(cell_para)
|
self.text_processor.set_current_paragraph(cell_para)
|
||||||
|
|||||||
736
pyWebLayout/io/readers/html_extraction.py
Normal file
736
pyWebLayout/io/readers/html_extraction.py
Normal file
@ -0,0 +1,736 @@
|
|||||||
|
"""
|
||||||
|
HTML extraction module for converting HTML elements to pyWebLayout abstract elements.
|
||||||
|
|
||||||
|
This module provides handler functions for converting HTML elements into the abstract document structure
|
||||||
|
used by pyWebLayout, including paragraphs, headings, lists, tables, and inline formatting.
|
||||||
|
Each handler function has a robust signature that handles style hints, CSS classes, and attributes.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
from typing import List, Dict, Any, Optional, Union, Callable, Tuple, NamedTuple
|
||||||
|
from bs4 import BeautifulSoup, Tag, NavigableString
|
||||||
|
from pyWebLayout.abstract.inline import Word, FormattedSpan
|
||||||
|
from pyWebLayout.abstract.block import (
|
||||||
|
Block, Paragraph, Heading, HeadingLevel, Quote, CodeBlock,
|
||||||
|
HList, ListItem, ListStyle, Table, TableRow, TableCell
|
||||||
|
)
|
||||||
|
from pyWebLayout.style import Font, FontWeight, FontStyle, TextDecoration
|
||||||
|
|
||||||
|
|
||||||
|
class StyleContext(NamedTuple):
    """
    Bundle of styling state handed to every handler function.

    Holds the active font, the background colour, the accumulated CSS classes
    and inline styles, the current element's attributes and the names of the
    elements pushed so far.  Fields cannot be reassigned; the ``with_*``
    helpers and ``push_element`` each return a copy with one field replaced.
    """
    font: Font
    background: Optional[Tuple[int, int, int, int]]
    css_classes: set
    css_styles: Dict[str, str]
    element_attributes: Dict[str, Any]
    parent_elements: List[str]  # Element names in push order, most recent last

    def with_font(self, font: Font) -> 'StyleContext':
        """Return a copy of this context that uses ``font``."""
        updated = self._replace(font=font)
        return updated

    def with_background(self, background: Optional[Tuple[int, int, int, int]]) -> 'StyleContext':
        """Return a copy of this context that uses ``background``."""
        updated = self._replace(background=background)
        return updated

    def with_css_classes(self, css_classes: set) -> 'StyleContext':
        """Return a copy of this context whose CSS class set is ``css_classes``."""
        updated = self._replace(css_classes=css_classes)
        return updated

    def with_css_styles(self, css_styles: Dict[str, str]) -> 'StyleContext':
        """Return a copy of this context whose CSS styles are ``css_styles``."""
        updated = self._replace(css_styles=css_styles)
        return updated

    def with_attributes(self, attributes: Dict[str, Any]) -> 'StyleContext':
        """Return a copy of this context whose element attributes are ``attributes``."""
        updated = self._replace(element_attributes=attributes)
        return updated

    def push_element(self, element_name: str) -> 'StyleContext':
        """Return a copy of this context with ``element_name`` appended to the parent stack."""
        # Concatenation builds a new list, so the stack held by ``self`` is untouched.
        extended_stack = self.parent_elements + [element_name]
        return self._replace(parent_elements=extended_stack)
|
||||||
|
|
||||||
|
|
||||||
|
def create_base_context(base_font: Optional[Font] = None) -> StyleContext:
    """
    Build the starting style context, with every field at its default.

    Args:
        base_font: Font to start from; a default-constructed ``Font`` is used
            when this is ``None`` (or otherwise falsy)

    Returns:
        StyleContext with no background, no CSS classes or styles, no element
        attributes and an empty parent stack
    """
    starting_font = base_font or Font()
    defaults = {
        'font': starting_font,
        'background': None,
        'css_classes': set(),
        'css_styles': {},
        'element_attributes': {},
        'parent_elements': [],
    }
    return StyleContext(**defaults)
|
||||||
|
|
||||||
|
|
||||||
|
def apply_element_styling(context: StyleContext, element: Tag) -> StyleContext:
    """
    Derive the style context for ``element`` from the context it is nested in.

    The element's attributes and lower-cased tag name are recorded first, then
    its ``class`` and ``style`` attributes are merged into copies of the
    inherited CSS classes and styles, and finally the font and background are
    recomputed from the merged styles.

    Args:
        context: Style context the element is nested in
        element: BeautifulSoup Tag object

    Returns:
        New StyleContext for the element; ``context`` itself is not modified
    """
    tag_name = element.name.lower()
    attributes = dict(element.attrs) if element.attrs else {}

    # Record the element's own attributes and name before anything else.
    styled = context.with_attributes(attributes).push_element(tag_name)

    # CSS classes: work on a copy so the inherited set is left alone.
    merged_classes = styled.css_classes.copy()
    if 'class' in attributes:
        class_attr = attributes['class']
        if isinstance(class_attr, str):
            class_attr = class_attr.split()
        merged_classes.update(class_attr)
    styled = styled.with_css_classes(merged_classes)

    # Inline styles: the element's declarations override inherited ones.
    merged_styles = styled.css_styles.copy()
    if 'style' in attributes:
        merged_styles.update(parse_inline_styles(attributes['style']))
    styled = styled.with_css_styles(merged_styles)

    # Font: tag defaults plus the merged CSS styles.
    styled = styled.with_font(
        apply_element_font_styles(styled.font, tag_name, merged_styles)
    )

    # Background: taken from the merged CSS styles.
    return styled.with_background(
        apply_background_styles(styled.background, merged_styles)
    )
|
||||||
|
|
||||||
|
|
||||||
|
def parse_inline_styles(style_text: str) -> Dict[str, str]:
    """
    Parse CSS inline styles into dictionary.

    Declarations are separated by ``;`` and split on their first ``:``, so
    values that themselves contain a colon (e.g. URLs) are kept whole.
    Property names are lower-cased; values keep their case.  A declaration
    with no colon, an empty property name or an empty value is skipped, so
    it cannot replace a style that was set earlier.

    Args:
        style_text: CSS style text (e.g., "color: red; font-weight: bold;");
            ``None`` or an empty string yields an empty dictionary

    Returns:
        Dictionary of CSS property-value pairs
    """
    styles: Dict[str, str] = {}
    if not style_text:
        return styles

    for declaration in style_text.split(';'):
        prop, colon, value = declaration.partition(':')
        prop = prop.strip().lower()
        value = value.strip()
        if not colon or not prop or not value:
            continue
        styles[prop] = value
    return styles
|
||||||
|
|
||||||
|
|
||||||
|
def apply_element_font_styles(font: Font, tag_name: str, css_styles: Dict[str, str]) -> Font:
    """
    Apply font styling based on HTML element and CSS styles.

    The result starts from ``font``, then takes the defaults implied by the
    tag (bold for ``b``/``strong``, italic for ``i``/``em``, fixed sizes for
    ``h1``-``h6`` and so on), and finally the ``font-size``, ``font-weight``,
    ``font-style``, ``text-decoration`` and ``color`` entries of
    ``css_styles``, which override the tag defaults.  Values that are not
    recognised leave the corresponding property unchanged.

    Args:
        font: Current font
        tag_name: HTML tag name
        css_styles: CSS styles dictionary

    Returns:
        New Font object with applied styling
    """
    # Default element styles
    element_font_styles = {
        'b': {'weight': FontWeight.BOLD},
        'strong': {'weight': FontWeight.BOLD},
        'i': {'style': FontStyle.ITALIC},
        'em': {'style': FontStyle.ITALIC},
        'u': {'decoration': TextDecoration.UNDERLINE},
        's': {'decoration': TextDecoration.STRIKETHROUGH},
        'del': {'decoration': TextDecoration.STRIKETHROUGH},
        'h1': {'size': 24, 'weight': FontWeight.BOLD},
        'h2': {'size': 20, 'weight': FontWeight.BOLD},
        'h3': {'size': 18, 'weight': FontWeight.BOLD},
        'h4': {'size': 16, 'weight': FontWeight.BOLD},
        'h5': {'size': 14, 'weight': FontWeight.BOLD},
        'h6': {'size': 12, 'weight': FontWeight.BOLD},
    }

    # Start with current font properties
    font_size = font.font_size
    colour = font.colour
    weight = font.weight
    style = font.style
    decoration = font.decoration
    background = font.background
    language = font.language

    # Apply element default styles; tags without an entry change nothing.
    elem_styles = element_font_styles.get(tag_name, {})
    font_size = elem_styles.get('size', font_size)
    weight = elem_styles.get('weight', weight)
    style = elem_styles.get('style', style)
    decoration = elem_styles.get('decoration', decoration)

    # Apply CSS styles (override element defaults)
    if 'font-size' in css_styles:
        size_value = css_styles['font-size'].lower()
        # px and pt are both taken as the numeric size unchanged; any other
        # unit is ignored.
        if size_value.endswith(('px', 'pt')):
            try:
                parsed_size = int(float(size_value[:-2]))
            except (ValueError, OverflowError):
                # Not a number, or "inf"/"nan": keep the current size.
                parsed_size = None
            # A zero or negative size is not usable, so it is ignored too.
            if parsed_size is not None and parsed_size > 0:
                font_size = parsed_size

    if 'font-weight' in css_styles:
        weight_value = css_styles['font-weight'].lower()
        if weight_value in ('bold', '700', '800', '900'):
            weight = FontWeight.BOLD
        elif weight_value in ('normal', '400'):
            weight = FontWeight.NORMAL

    if 'font-style' in css_styles:
        style_value = css_styles['font-style'].lower()
        if style_value == 'italic':
            style = FontStyle.ITALIC
        elif style_value == 'normal':
            style = FontStyle.NORMAL

    if 'text-decoration' in css_styles:
        decoration_value = css_styles['text-decoration'].lower()
        # Only one decoration is kept; underline wins when several are listed.
        if 'underline' in decoration_value:
            decoration = TextDecoration.UNDERLINE
        elif 'line-through' in decoration_value:
            decoration = TextDecoration.STRIKETHROUGH
        elif 'none' in decoration_value:
            decoration = TextDecoration.NONE

    if 'color' in css_styles:
        # Supports a handful of colour names plus #rgb and #rrggbb hex forms.
        color_value = css_styles['color'].lower()
        color_map = {
            'black': (0, 0, 0),
            'white': (255, 255, 255),
            'red': (255, 0, 0),
            'green': (0, 255, 0),
            'blue': (0, 0, 255),
        }
        if color_value in color_map:
            colour = color_map[color_value]
        elif color_value.startswith('#'):
            hex_digits = color_value[1:]
            if len(hex_digits) == 3:
                # Shorthand: each digit is doubled (#f80 -> #ff8800).
                hex_digits = ''.join(digit * 2 for digit in hex_digits)
            if len(hex_digits) == 6:
                try:
                    colour = tuple(int(hex_digits[i:i + 2], 16) for i in (0, 2, 4))
                except ValueError:
                    pass  # Not valid hex: keep the current colour.

    return Font(
        font_path=font._font_path,
        font_size=font_size,
        colour=colour,
        weight=weight,
        style=style,
        decoration=decoration,
        background=background,
        # NOTE(review): keyword is spelled "langauge"; presumably it matches
        # the parameter name in Font.__init__ -- confirm before renaming.
        langauge=language
    )
|
||||||
|
|
||||||
|
|
||||||
|
def apply_background_styles(current_background: Optional[Tuple[int, int, int, int]],
                            css_styles: Dict[str, str]) -> Optional[Tuple[int, int, int, int]]:
    """
    Apply background styling from CSS.

    Only the ``background-color`` entry is consulted.  ``transparent`` clears
    the background; the names black, white, red, green and blue and the hex
    forms ``#rgb``, ``#rgba``, ``#rrggbb`` and ``#rrggbbaa`` are converted to
    an RGBA tuple (alpha 255 when the value does not specify one).  Any other
    value, or a missing entry, leaves the current background in place.

    Args:
        current_background: Current background color (RGBA)
        css_styles: CSS styles dictionary

    Returns:
        New background color or None
    """
    bg_value = css_styles.get('background-color')
    if bg_value is None:
        return current_background

    bg_value = bg_value.strip().lower()
    if bg_value == 'transparent':
        return None

    named_colours = {
        'black': (0, 0, 0),
        'white': (255, 255, 255),
        'red': (255, 0, 0),
        'green': (0, 255, 0),
        'blue': (0, 0, 255),
    }
    if bg_value in named_colours:
        return named_colours[bg_value] + (255,)

    if bg_value.startswith('#'):
        hex_digits = bg_value[1:]
        if len(hex_digits) in (3, 4):
            # Shorthand: each digit is doubled (#f80 -> #ff8800).
            hex_digits = ''.join(digit * 2 for digit in hex_digits)
        if len(hex_digits) == 6:
            hex_digits += 'ff'  # No alpha given: fully opaque.
        if len(hex_digits) == 8:
            try:
                return tuple(int(hex_digits[i:i + 2], 16) for i in (0, 2, 4, 6))
            except ValueError:
                pass  # Not valid hex: fall through and keep the current value.

    return current_background
|
||||||
|
|
||||||
|
|
||||||
|
def extract_text_content(element: Tag, context: StyleContext) -> List[Word]:
    """
    Extract text content from an element, handling inline formatting.

    Text nodes are split on whitespace into words carrying the context's font
    and background.  Inline child elements are processed recursively with
    their own derived style context.  Any other child element is passed to
    ``process_element`` and the words of the ``Paragraph`` blocks it returns
    are collected; other block types are ignored.  Comments, doctypes,
    declarations and processing instructions are skipped because they are
    markup rather than visible text.

    Args:
        element: BeautifulSoup Tag object
        context: Current style context

    Returns:
        List of Word objects
    """
    # Local import: these names are not part of the module-level bs4 import.
    from bs4.element import Comment, Declaration, Doctype, ProcessingInstruction

    markup_only_nodes = (Comment, Declaration, Doctype, ProcessingInstruction)
    inline_tags = frozenset((
        'span', 'a', 'strong', 'b', 'em', 'i', 'u', 's', 'del', 'ins', 'mark',
        'small', 'sub', 'sup', 'code', 'q', 'cite', 'abbr', 'time',
    ))

    words: List[Word] = []

    for child in element.children:
        if isinstance(child, NavigableString):
            # The markup-only classes subclass NavigableString, so they must
            # be filtered out explicitly.
            if isinstance(child, markup_only_nodes):
                continue
            # Plain text - split() drops surrounding and repeated whitespace.
            words.extend(
                Word(word_text, context.font, context.background)
                for word_text in str(child).split()
            )
            continue

        if not isinstance(child, Tag):
            continue

        child_context = apply_element_styling(context, child)

        if child.name.lower() in inline_tags:
            words.extend(extract_text_content(child, child_context))
            continue

        # Block element - shouldn't happen in well-formed HTML but handle gracefully
        child_result = process_element(child, child_context)
        blocks = child_result if isinstance(child_result, list) else [child_result]
        for block in blocks:
            if isinstance(block, Paragraph):
                words.extend(word for _, word in block.words())

    return words
|
||||||
|
|
||||||
|
|
||||||
|
def process_element(element: Tag, context: StyleContext) -> Union[Block, List[Block], None]:
    """
    Dispatch one HTML element to the handler registered for its tag.

    The lower-cased tag name is looked up in ``HANDLERS``; tags without an
    entry are passed to ``generic_handler``.

    Args:
        element: BeautifulSoup Tag object
        context: Current style context

    Returns:
        Whatever the chosen handler returns: Block object(s) or None if the
        element should be ignored
    """
    key = element.name.lower()
    if key in HANDLERS:
        dispatch = HANDLERS[key]
    else:
        dispatch = generic_handler
    return dispatch(element, context)
|
||||||
|
|
||||||
|
|
||||||
|
# Handler function signatures:
|
||||||
|
# All handlers receive (element: Tag, context: StyleContext) -> Union[Block, List[Block], None]
|
||||||
|
|
||||||
|
def paragraph_handler(element: Tag, context: StyleContext) -> Paragraph:
    """Handle <p> elements by collecting their text into a new Paragraph."""
    result = Paragraph(context.font)
    for extracted_word in extract_text_content(element, context):
        result.add_word(extracted_word)
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def div_handler(element: Tag, context: StyleContext) -> List[Block]:
    """
    Handle <div> elements - treat as generic container.

    Every direct child element is processed with its own derived style
    context and the resulting blocks are returned as one flat list.  Text
    nodes directly inside the div are not processed, and falsy results
    (``None``, an empty list, or a block that evaluates as false) are skipped.
    """
    collected = []
    for node in element.children:
        if not isinstance(node, Tag):
            continue
        outcome = process_element(node, apply_element_styling(context, node))
        if not outcome:
            continue
        if isinstance(outcome, list):
            collected.extend(outcome)
        else:
            collected.append(outcome)
    return collected
|
||||||
|
|
||||||
|
|
||||||
|
def heading_handler(element: Tag, context: StyleContext) -> Heading:
    """
    Handle <h1>-<h6> elements.

    The heading level follows the tag name; any other tag name falls back to
    level H1.  The element's text is added to the heading word by word.
    """
    # 'h1' -> HeadingLevel.H1 ... 'h6' -> HeadingLevel.H6
    levels = {f'h{n}': getattr(HeadingLevel, f'H{n}') for n in range(1, 7)}

    tag = element.name.lower()
    level = levels[tag] if tag in levels else HeadingLevel.H1

    result = Heading(level, context.font)
    for extracted_word in extract_text_content(element, context):
        result.add_word(extracted_word)
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def blockquote_handler(element: Tag, context: StyleContext) -> Quote:
    """
    Handle <blockquote> elements.

    Every direct child element is processed with its own derived style
    context and each resulting block is added to a new Quote.  Text nodes
    directly inside the blockquote are not processed, and falsy results
    (``None``, an empty list, or a block that evaluates as false) are skipped.
    """
    result = Quote(context.font)
    for node in element.children:
        if not isinstance(node, Tag):
            continue
        produced = process_element(node, apply_element_styling(context, node))
        if not produced:
            continue
        nested = produced if isinstance(produced, list) else [produced]
        for block in nested:
            result.add_block(block)
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def preformatted_handler(element: Tag, context: StyleContext) -> CodeBlock:
    """
    Handle <pre> elements.

    The text of the element is collected verbatim and split into lines that
    are added to a CodeBlock. The language is read from the ``data-language``
    attribute of the style context (empty string when absent).

    Args:
        element: The <pre> tag
        context: Style context carrying the element attributes

    Returns:
        CodeBlock with one entry per line of preformatted text
    """
    language = context.element_attributes.get('data-language', '')
    code_block = CodeBlock(language)

    # Concatenate text nodes without a separator. Joining them with '\n'
    # (as get_text(separator='\n') does) breaks a single source line apart
    # wherever inline markup such as <span> or <code> starts or ends.
    # <br> carries no text of its own, so it is mapped to a newline here.
    parts = []
    for node in element.descendants:
        if isinstance(node, Tag):
            if node.name.lower() == 'br':
                parts.append('\n')
        elif type(node) is NavigableString:
            # Exact type on purpose: subclasses such as comments are markup,
            # not preformatted content.
            parts.append(str(node))
    text = ''.join(parts)

    for line in text.split('\n'):
        code_block.add_line(line)

    return code_block
|
||||||
|
|
||||||
|
|
||||||
|
def code_handler(element: Tag, context: StyleContext) -> Union[CodeBlock, None]:
    """
    Handle <code> elements.

    This handler never produces a block of its own and always returns None.

    Args:
        element: The <code> tag (unused)
        context: Style context of the element (unused)

    Returns:
        Always None
    """
    # Two cases, same outcome, so no branching is needed:
    #   * <code> inside <pre> is left to preformatted_handler on the parent;
    #   * inline <code> is left to text extraction.
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def unordered_list_handler(element: Tag, context: StyleContext) -> HList:
    """
    Handle <ul> elements.

    Only direct <li> child tags are considered; each one is processed and,
    when it yields a truthy item, added to the list.

    Args:
        element: The <ul> tag
        context: Style context supplying the list font

    Returns:
        Unordered HList holding the processed items
    """
    hlist = HList(ListStyle.UNORDERED, context.font)
    for child in element.children:
        if not isinstance(child, Tag):
            continue
        if child.name.lower() != 'li':
            continue
        item_context = apply_element_styling(context, child)
        item = process_element(child, item_context)
        if not item:
            continue
        hlist.add_item(item)
    return hlist
|
||||||
|
|
||||||
|
|
||||||
|
def ordered_list_handler(element: Tag, context: StyleContext) -> HList:
    """
    Handle <ol> elements.

    Only direct <li> child tags are considered; each one is processed and,
    when it yields a truthy item, added to the list.

    Args:
        element: The <ol> tag
        context: Style context supplying the list font

    Returns:
        Ordered HList holding the processed items
    """
    hlist = HList(ListStyle.ORDERED, context.font)
    for child in element.children:
        if not isinstance(child, Tag):
            continue
        if child.name.lower() != 'li':
            continue
        item_context = apply_element_styling(context, child)
        item = process_element(child, item_context)
        if not item:
            continue
        hlist.add_item(item)
    return hlist
|
||||||
|
|
||||||
|
|
||||||
|
def list_item_handler(element: Tag, context: StyleContext) -> ListItem:
    """
    Handle <li> elements.

    Child tags are processed recursively and their blocks added to the item.
    Non-blank text placed directly inside the <li> is wrapped in its own
    Paragraph, one Word per whitespace-separated token.

    Args:
        element: The <li> tag
        context: Style context supplying the item font

    Returns:
        ListItem containing the blocks built from the element's children
    """
    list_item = ListItem(None, context.font)

    for child in element.children:
        if isinstance(child, Tag):
            child_context = apply_element_styling(context, child)
            result = process_element(child, child_context)
            # Skip only missing results. Paragraph defines __len__ as its
            # word count, so a truthiness test would also discard word-less
            # paragraphs.
            if result is None:
                continue
            nested_blocks = result if isinstance(result, list) else [result]
            for block in nested_blocks:
                list_item.add_block(block)
        elif isinstance(child, NavigableString):
            # Direct text in the list item becomes a paragraph of its own.
            text = str(child).strip()
            if not text:
                continue
            paragraph = Paragraph(context.font)
            # str.split() without arguments never yields empty strings, so
            # no per-token emptiness check is required.
            for word_text in text.split():
                paragraph.add_word(Word(word_text, context.font))
            list_item.add_block(paragraph)

    return list_item
|
||||||
|
|
||||||
|
|
||||||
|
def table_handler(element: Tag, context: StyleContext) -> Table:
    """
    Handle <table> elements.

    Rows found directly under the table are added without a section
    argument; rows directly under <thead>, <tbody> or <tfoot> are added to
    the 'header', 'body' or 'footer' section respectively.

    Args:
        element: The <table> tag
        context: Style context supplying the table font

    Returns:
        Table with its caption (if any) and processed rows
    """
    # Look only at direct children: a recursive search would pick up the
    # caption of a table nested inside one of this table's cells.
    caption = None
    caption_elem = element.find('caption', recursive=False)
    if caption_elem is not None:
        caption = caption_elem.get_text(strip=True)

    table = Table(caption, context.font)

    section_by_tag = {'thead': 'header', 'tbody': 'body', 'tfoot': 'footer'}

    for child in element.children:
        if not isinstance(child, Tag):
            continue
        tag_name = child.name.lower()

        if tag_name == 'tr':
            child_context = apply_element_styling(context, child)
            row = process_element(child, child_context)
            if row:
                table.add_row(row)
        elif tag_name in section_by_tag:
            section = section_by_tag[tag_name]
            # recursive=False keeps rows of nested tables out of this table;
            # they are handled when the owning cell is processed.
            for row_elem in child.find_all('tr', recursive=False):
                child_context = apply_element_styling(context, row_elem)
                row = process_element(row_elem, child_context)
                if row:
                    table.add_row(row, section)

    return table
|
||||||
|
|
||||||
|
|
||||||
|
def table_row_handler(element: Tag, context: StyleContext) -> TableRow:
    """
    Handle <tr> elements.

    Direct <td> and <th> child tags are processed and every truthy result
    is added to the row as a cell.

    Args:
        element: The <tr> tag
        context: Style context supplying the row font

    Returns:
        TableRow holding the processed cells
    """
    row = TableRow(context.font)
    cell_tags = ['td', 'th']
    for child in element.children:
        if not isinstance(child, Tag):
            continue
        if child.name.lower() not in cell_tags:
            continue
        cell_context = apply_element_styling(context, child)
        cell = process_element(child, cell_context)
        if not cell:
            continue
        row.add_cell(cell)
    return row
|
||||||
|
|
||||||
|
|
||||||
|
def table_cell_handler(element: Tag, context: StyleContext) -> TableCell:
    """
    Handle <td> elements.

    Child tags are processed recursively and their blocks added to the cell.
    Non-blank text placed directly inside the cell is wrapped in its own
    Paragraph, one Word per whitespace-separated token.

    Args:
        element: The <td> tag
        context: Style context carrying the font and element attributes

    Returns:
        Non-header TableCell with the span values and content of the element
    """
    def span_value(attribute: str) -> int:
        # Attribute values come from arbitrary markup; fall back to a span
        # of 1 instead of aborting the parse on values such as "" or "2px".
        try:
            return int(context.element_attributes.get(attribute, 1))
        except (TypeError, ValueError):
            return 1

    colspan = span_value('colspan')
    rowspan = span_value('rowspan')
    cell = TableCell(False, colspan, rowspan, context.font)

    # Process cell content
    for child in element.children:
        if isinstance(child, Tag):
            child_context = apply_element_styling(context, child)
            result = process_element(child, child_context)
            # Skip only missing results. Paragraph defines __len__ as its
            # word count, so a truthiness test would also discard word-less
            # paragraphs.
            if result is None:
                continue
            nested_blocks = result if isinstance(result, list) else [result]
            for block in nested_blocks:
                cell.add_block(block)
        elif isinstance(child, NavigableString):
            # Direct text in the cell becomes a paragraph of its own.
            text = str(child).strip()
            if not text:
                continue
            paragraph = Paragraph(context.font)
            # str.split() without arguments never yields empty strings.
            for word_text in text.split():
                paragraph.add_word(Word(word_text, context.font))
            cell.add_block(paragraph)

    return cell
|
||||||
|
|
||||||
|
|
||||||
|
def table_header_cell_handler(element: Tag, context: StyleContext) -> TableCell:
    """
    Handle <th> elements.

    Works like the <td> handling, except that the resulting cell is flagged
    as a header cell. Child tags are processed recursively; non-blank text
    placed directly inside the cell is wrapped in its own Paragraph.

    Args:
        element: The <th> tag
        context: Style context carrying the font and element attributes

    Returns:
        Header TableCell with the span values and content of the element
    """
    def span_value(attribute: str) -> int:
        # Attribute values come from arbitrary markup; fall back to a span
        # of 1 instead of aborting the parse on values such as "" or "2px".
        try:
            return int(context.element_attributes.get(attribute, 1))
        except (TypeError, ValueError):
            return 1

    colspan = span_value('colspan')
    rowspan = span_value('rowspan')
    cell = TableCell(True, colspan, rowspan, context.font)

    # Process cell content
    for child in element.children:
        if isinstance(child, Tag):
            child_context = apply_element_styling(context, child)
            result = process_element(child, child_context)
            # Skip only missing results. Paragraph defines __len__ as its
            # word count, so a truthiness test would also discard word-less
            # paragraphs.
            if result is None:
                continue
            nested_blocks = result if isinstance(result, list) else [result]
            for block in nested_blocks:
                cell.add_block(block)
        elif isinstance(child, NavigableString):
            # Direct text in the cell becomes a paragraph of its own.
            text = str(child).strip()
            if not text:
                continue
            paragraph = Paragraph(context.font)
            # str.split() without arguments never yields empty strings.
            for word_text in text.split():
                paragraph.add_word(Word(word_text, context.font))
            cell.add_block(paragraph)

    return cell
|
||||||
|
|
||||||
|
|
||||||
|
def horizontal_rule_handler(element: Tag, context: StyleContext) -> Block:
    """
    Handle <hr> elements.

    No dedicated horizontal-rule block is produced yet; a Paragraph without
    words, created with the context font, stands in for it.
    """
    # TODO: Create a specific HorizontalRule block type
    placeholder = Paragraph(context.font)
    return placeholder
|
||||||
|
|
||||||
|
|
||||||
|
def line_break_handler(element: Tag, context: StyleContext) -> None:
    """
    Handle <br> elements.

    Produces no block: line breaks are expected to be dealt with at the
    paragraph level.
    """
    return
|
||||||
|
|
||||||
|
|
||||||
|
def image_handler(element: Tag, context: StyleContext) -> Block:
    """
    Handle <img> elements.

    Until an image block type exists, the image is represented by a
    Paragraph holding the words of its ``alt`` attribute; the paragraph
    stays empty when there is no alt text.
    """
    # TODO: Create Image block type
    font = context.font
    paragraph = Paragraph(font)
    alt_text = context.element_attributes.get('alt', '')
    if not alt_text:
        return paragraph
    for token in alt_text.split():
        if not token:
            continue
        paragraph.add_word(Word(token, font))
    return paragraph
|
||||||
|
|
||||||
|
|
||||||
|
def ignore_handler(element: Tag, context: StyleContext) -> None:
    """
    Handle elements that should be ignored.

    Both arguments are accepted only to satisfy the handler signature; no
    block is produced.
    """
    return
|
||||||
|
|
||||||
|
|
||||||
|
def generic_handler(element: Tag, context: StyleContext) -> List[Block]:
    """
    Handle unknown elements as generic containers.

    Delegates to ``div_handler`` and returns its result unchanged.
    """
    container_blocks = div_handler(element, context)
    return container_blocks
|
||||||
|
|
||||||
|
|
||||||
|
# Handler registry - maps HTML tag names to handler functions
|
||||||
|
HANDLERS: Dict[str, Callable[[Tag, StyleContext], Union[Block, List[Block], None]]] = {
    # Block elements
    'p': paragraph_handler,
    'div': div_handler,
    **dict.fromkeys(('h1', 'h2', 'h3', 'h4', 'h5', 'h6'), heading_handler),
    'blockquote': blockquote_handler,
    'pre': preformatted_handler,
    'code': code_handler,
    'ul': unordered_list_handler,
    'ol': ordered_list_handler,
    'li': list_item_handler,
    'table': table_handler,
    'tr': table_row_handler,
    'td': table_cell_handler,
    'th': table_header_cell_handler,
    'hr': horizontal_rule_handler,
    'br': line_break_handler,

    # Semantic elements (treated as containers)
    **dict.fromkeys(
        ('section', 'article', 'aside', 'nav', 'header', 'footer', 'main', 'figure'),
        div_handler,
    ),
    'figcaption': paragraph_handler,

    # Media elements
    'img': image_handler,

    # Inline elements (handled during text extraction)
    **dict.fromkeys(
        (
            'span', 'a', 'strong', 'b', 'em', 'i', 'u', 's', 'del', 'ins',
            'mark', 'small', 'sub', 'sup', 'q', 'cite', 'abbr', 'time',
        ),
        ignore_handler,
    ),

    # Ignored elements
    **dict.fromkeys(
        ('script', 'style', 'meta', 'link', 'head', 'title'),
        ignore_handler,
    ),
}
|
||||||
|
|
||||||
|
|
||||||
|
def parse_html_string(html_string: str, base_font: Optional[Font] = None) -> List[Block]:
    """
    Parse HTML string and return list of Block objects.

    The markup is parsed with BeautifulSoup's 'html.parser'. When a <body>
    is present only its child tags are processed, otherwise the top-level
    tags of the document are. Text nodes at that level are not converted.

    Args:
        html_string: HTML content to parse
        base_font: Base font for styling, defaults to system default

    Returns:
        List of Block objects representing the document structure
    """
    soup = BeautifulSoup(html_string, 'html.parser')
    context = create_base_context(base_font)
    blocks: List[Block] = []

    # Process the body if it exists, otherwise process all top-level elements
    root_element = soup.find('body') or soup

    for element in root_element.children:
        if not isinstance(element, Tag):
            continue
        element_context = apply_element_styling(context, element)
        result = process_element(element, element_context)
        # Test against None instead of truthiness: Paragraph defines __len__
        # (its word count), so a paragraph without words is falsy and would
        # otherwise vanish from the output.
        if result is None:
            continue
        if isinstance(result, list):
            blocks.extend(result)
        else:
            blocks.append(result)

    return blocks
|
||||||
|
|
||||||
|
|
||||||
|
def register_handler(tag_name: str, handler: Callable[[Tag, StyleContext], Union[Block, List[Block], None]]):
    """
    Register a custom handler for an HTML tag.

    An existing registration for the same tag is replaced.

    Args:
        tag_name: HTML tag name; stored in lowercase
        handler: Handler function with signature (element: Tag, context: StyleContext) -> Union[Block, List[Block], None]
    """
    # Handler lookup lowercases the tag name, so a key stored with any
    # uppercase character could never be found. Normalise on registration.
    HANDLERS[tag_name.lower()] = handler
|
||||||
|
|
||||||
|
|
||||||
|
def get_handler(tag_name: str) -> Callable[[Tag, StyleContext], Union[Block, List[Block], None]]:
    """
    Get handler function for HTML tag.

    The lookup is case-insensitive: the tag name is lowercased before the
    registry is consulted.

    Args:
        tag_name: HTML tag name

    Returns:
        Handler function or generic_handler if tag not found
    """
    normalized = tag_name.lower()
    if normalized in HANDLERS:
        return HANDLERS[normalized]
    return generic_handler
|
||||||
@ -7,7 +7,7 @@ entity references, and word creation in HTML documents.
|
|||||||
|
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
from pyWebLayout.abstract.inline import Word
|
from pyWebLayout.abstract.inline import Word
|
||||||
from pyWebLayout.abstract.block import Parapgraph
|
from pyWebLayout.abstract.block import Paragraph
|
||||||
from pyWebLayout.io.readers.html_style import HTMLStyleManager
|
from pyWebLayout.io.readers.html_style import HTMLStyleManager
|
||||||
|
|
||||||
|
|
||||||
@ -28,14 +28,14 @@ class HTMLTextProcessor:
|
|||||||
"""
|
"""
|
||||||
self._style_manager = style_manager
|
self._style_manager = style_manager
|
||||||
self._text_buffer = ""
|
self._text_buffer = ""
|
||||||
self._current_paragraph: Optional[Parapgraph] = None
|
self._current_paragraph: Optional[Paragraph] = None
|
||||||
|
|
||||||
def reset(self):
|
def reset(self):
|
||||||
"""Reset the text processor state."""
|
"""Reset the text processor state."""
|
||||||
self._text_buffer = ""
|
self._text_buffer = ""
|
||||||
self._current_paragraph = None
|
self._current_paragraph = None
|
||||||
|
|
||||||
def set_current_paragraph(self, paragraph: Optional[Parapgraph]):
|
def set_current_paragraph(self, paragraph: Optional[Paragraph]):
|
||||||
"""
|
"""
|
||||||
Set the current paragraph for text output.
|
Set the current paragraph for text output.
|
||||||
|
|
||||||
|
|||||||
@ -139,7 +139,7 @@ class DocumentPaginator:
|
|||||||
for chapter in self.document.chapters:
|
for chapter in self.document.chapters:
|
||||||
# Add a heading block for the chapter if it has a title
|
# Add a heading block for the chapter if it has a title
|
||||||
if chapter.title:
|
if chapter.title:
|
||||||
from pyWebLayout.abstract.block import Heading, HeadingLevel, Parapgraph
|
from pyWebLayout.abstract.block import Heading, HeadingLevel, Paragraph
|
||||||
from pyWebLayout.abstract.inline import Word
|
from pyWebLayout.abstract.inline import Word
|
||||||
|
|
||||||
# Create a heading for the chapter
|
# Create a heading for the chapter
|
||||||
|
|||||||
@ -6,7 +6,7 @@ Tests the core abstract block classes that form the foundation of the document m
|
|||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
from pyWebLayout.abstract.block import (
|
from pyWebLayout.abstract.block import (
|
||||||
Block, BlockType, Parapgraph, Heading, HeadingLevel, Quote, CodeBlock,
|
Block, BlockType, Paragraph, Heading, HeadingLevel, Quote, CodeBlock,
|
||||||
HList, ListStyle, ListItem, Table, TableRow, TableCell,
|
HList, ListStyle, ListItem, Table, TableRow, TableCell,
|
||||||
HorizontalRule, LineBreak, Image
|
HorizontalRule, LineBreak, Image
|
||||||
)
|
)
|
||||||
@ -19,7 +19,7 @@ class TestBlockElements(unittest.TestCase):
|
|||||||
|
|
||||||
def test_paragraph_creation(self):
|
def test_paragraph_creation(self):
|
||||||
"""Test creating and using paragraphs."""
|
"""Test creating and using paragraphs."""
|
||||||
paragraph = Parapgraph()
|
paragraph = Paragraph()
|
||||||
|
|
||||||
self.assertEqual(paragraph.block_type, BlockType.PARAGRAPH)
|
self.assertEqual(paragraph.block_type, BlockType.PARAGRAPH)
|
||||||
self.assertEqual(paragraph.word_count, 0)
|
self.assertEqual(paragraph.word_count, 0)
|
||||||
@ -62,8 +62,8 @@ class TestBlockElements(unittest.TestCase):
|
|||||||
quote = Quote()
|
quote = Quote()
|
||||||
|
|
||||||
# Add nested paragraphs
|
# Add nested paragraphs
|
||||||
p1 = Parapgraph()
|
p1 = Paragraph()
|
||||||
p2 = Parapgraph()
|
p2 = Paragraph()
|
||||||
|
|
||||||
quote.add_block(p1)
|
quote.add_block(p1)
|
||||||
quote.add_block(p2)
|
quote.add_block(p2)
|
||||||
|
|||||||
@ -7,7 +7,7 @@ document structure and metadata management.
|
|||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
from pyWebLayout.abstract.document import Document, Chapter, Book, MetadataType
|
from pyWebLayout.abstract.document import Document, Chapter, Book, MetadataType
|
||||||
from pyWebLayout.abstract.block import Parapgraph, Heading, HeadingLevel, BlockType
|
from pyWebLayout.abstract.block import Paragraph, Heading, HeadingLevel, BlockType
|
||||||
from pyWebLayout.abstract.inline import Word, FormattedSpan
|
from pyWebLayout.abstract.inline import Word, FormattedSpan
|
||||||
from pyWebLayout.style import Font
|
from pyWebLayout.style import Font
|
||||||
|
|
||||||
@ -77,8 +77,8 @@ class TestDocument(unittest.TestCase):
|
|||||||
def test_block_management(self):
|
def test_block_management(self):
|
||||||
"""Test adding and managing blocks."""
|
"""Test adding and managing blocks."""
|
||||||
# Create some blocks
|
# Create some blocks
|
||||||
para1 = Parapgraph()
|
para1 = Paragraph()
|
||||||
para2 = Parapgraph()
|
para2 = Paragraph()
|
||||||
heading = Heading(HeadingLevel.H1)
|
heading = Heading(HeadingLevel.H1)
|
||||||
|
|
||||||
# Add blocks
|
# Add blocks
|
||||||
@ -95,7 +95,7 @@ class TestDocument(unittest.TestCase):
|
|||||||
def test_anchor_management(self):
|
def test_anchor_management(self):
|
||||||
"""Test named anchor functionality."""
|
"""Test named anchor functionality."""
|
||||||
heading = Heading(HeadingLevel.H1)
|
heading = Heading(HeadingLevel.H1)
|
||||||
para = Parapgraph()
|
para = Paragraph()
|
||||||
|
|
||||||
# Add anchors
|
# Add anchors
|
||||||
self.doc.add_anchor("intro", heading)
|
self.doc.add_anchor("intro", heading)
|
||||||
@ -154,8 +154,8 @@ class TestDocument(unittest.TestCase):
|
|||||||
def test_find_blocks_by_type(self):
|
def test_find_blocks_by_type(self):
|
||||||
"""Test finding blocks by type."""
|
"""Test finding blocks by type."""
|
||||||
# Create blocks of different types
|
# Create blocks of different types
|
||||||
para1 = Parapgraph()
|
para1 = Paragraph()
|
||||||
para2 = Parapgraph()
|
para2 = Paragraph()
|
||||||
heading1 = Heading(HeadingLevel.H1)
|
heading1 = Heading(HeadingLevel.H1)
|
||||||
heading2 = Heading(HeadingLevel.H2)
|
heading2 = Heading(HeadingLevel.H2)
|
||||||
|
|
||||||
@ -180,7 +180,7 @@ class TestDocument(unittest.TestCase):
|
|||||||
def test_find_headings(self):
|
def test_find_headings(self):
|
||||||
"""Test finding heading blocks specifically."""
|
"""Test finding heading blocks specifically."""
|
||||||
# Create mixed blocks
|
# Create mixed blocks
|
||||||
para = Parapgraph()
|
para = Paragraph()
|
||||||
h1 = Heading(HeadingLevel.H1)
|
h1 = Heading(HeadingLevel.H1)
|
||||||
h2 = Heading(HeadingLevel.H2)
|
h2 = Heading(HeadingLevel.H2)
|
||||||
|
|
||||||
@ -284,8 +284,8 @@ class TestChapter(unittest.TestCase):
|
|||||||
|
|
||||||
def test_block_management(self):
|
def test_block_management(self):
|
||||||
"""Test adding blocks to chapter."""
|
"""Test adding blocks to chapter."""
|
||||||
para1 = Parapgraph()
|
para1 = Paragraph()
|
||||||
para2 = Parapgraph()
|
para2 = Paragraph()
|
||||||
heading = Heading(HeadingLevel.H2)
|
heading = Heading(HeadingLevel.H2)
|
||||||
|
|
||||||
# Add blocks
|
# Add blocks
|
||||||
@ -450,7 +450,7 @@ class TestBook(unittest.TestCase):
|
|||||||
"""Test that Book inherits all Document functionality."""
|
"""Test that Book inherits all Document functionality."""
|
||||||
# Test that book can use all document methods
|
# Test that book can use all document methods
|
||||||
# Add blocks directly to book
|
# Add blocks directly to book
|
||||||
para = Parapgraph()
|
para = Paragraph()
|
||||||
self.book.add_block(para)
|
self.book.add_block(para)
|
||||||
self.assertEqual(len(self.book.blocks), 1)
|
self.assertEqual(len(self.book.blocks), 1)
|
||||||
|
|
||||||
|
|||||||
@ -1,44 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Simple test script to verify that the EPUB reader fixes are working correctly.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import sys
|
|
||||||
import os
|
|
||||||
|
|
||||||
# Add the pyWebLayout directory to the Python path
|
|
||||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'pyWebLayout'))
|
|
||||||
|
|
||||||
try:
|
|
||||||
from pyWebLayout.io.readers.epub_reader import read_epub
|
|
||||||
print("Successfully imported epub_reader module")
|
|
||||||
|
|
||||||
# Test reading the EPUB file
|
|
||||||
epub_path = os.path.join('pyWebLayout', 'examples', 'pg174-images-3.epub')
|
|
||||||
|
|
||||||
if not os.path.exists(epub_path):
|
|
||||||
print(f"EPUB file not found: {epub_path}")
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
print(f"Reading EPUB file: {epub_path}")
|
|
||||||
|
|
||||||
# Try to read the EPUB
|
|
||||||
book = read_epub(epub_path)
|
|
||||||
|
|
||||||
print(f"Successfully read EPUB file!")
|
|
||||||
print(f"Book title: {book.title}")
|
|
||||||
print(f"Number of chapters: {len(book.chapters)}")
|
|
||||||
|
|
||||||
# Check first chapter
|
|
||||||
if book.chapters:
|
|
||||||
first_chapter = book.chapters[0]
|
|
||||||
print(f"First chapter title: {first_chapter.title}")
|
|
||||||
print(f"First chapter has {len(first_chapter.blocks)} blocks")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Error: {e}")
|
|
||||||
import traceback
|
|
||||||
traceback.print_exc()
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
print("Test completed successfully!")
|
|
||||||
@ -9,7 +9,7 @@ import unittest
|
|||||||
from pyWebLayout.io.readers.html_content import HTMLContentReader
|
from pyWebLayout.io.readers.html_content import HTMLContentReader
|
||||||
from pyWebLayout.abstract.document import Document
|
from pyWebLayout.abstract.document import Document
|
||||||
from pyWebLayout.abstract.block import (
|
from pyWebLayout.abstract.block import (
|
||||||
Parapgraph, Heading, HeadingLevel, HList, ListStyle,
|
Paragraph, Heading, HeadingLevel, HList, ListStyle,
|
||||||
Table, Quote, CodeBlock, HorizontalRule, LineBreak
|
Table, Quote, CodeBlock, HorizontalRule, LineBreak
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -29,7 +29,7 @@ class TestHTMLContentReader(unittest.TestCase):
|
|||||||
result = self.reader.extract_content(html, self.document)
|
result = self.reader.extract_content(html, self.document)
|
||||||
|
|
||||||
self.assertEqual(len(self.document.blocks), 1)
|
self.assertEqual(len(self.document.blocks), 1)
|
||||||
self.assertIsInstance(self.document.blocks[0], Parapgraph)
|
self.assertIsInstance(self.document.blocks[0], Paragraph)
|
||||||
|
|
||||||
paragraph = self.document.blocks[0]
|
paragraph = self.document.blocks[0]
|
||||||
words = list(paragraph.words())
|
words = list(paragraph.words())
|
||||||
@ -107,7 +107,7 @@ class TestHTMLContentReader(unittest.TestCase):
|
|||||||
# Check first item content
|
# Check first item content
|
||||||
first_item_blocks = list(items[0].blocks())
|
first_item_blocks = list(items[0].blocks())
|
||||||
self.assertEqual(len(first_item_blocks), 1)
|
self.assertEqual(len(first_item_blocks), 1)
|
||||||
self.assertIsInstance(first_item_blocks[0], Parapgraph)
|
self.assertIsInstance(first_item_blocks[0], Paragraph)
|
||||||
|
|
||||||
def test_ordered_list(self):
|
def test_ordered_list(self):
|
||||||
"""Test parsing ordered lists."""
|
"""Test parsing ordered lists."""
|
||||||
@ -202,8 +202,8 @@ class TestHTMLContentReader(unittest.TestCase):
|
|||||||
quote = self.document.blocks[0]
|
quote = self.document.blocks[0]
|
||||||
quote_blocks = list(quote.blocks())
|
quote_blocks = list(quote.blocks())
|
||||||
self.assertEqual(len(quote_blocks), 2)
|
self.assertEqual(len(quote_blocks), 2)
|
||||||
self.assertIsInstance(quote_blocks[0], Parapgraph)
|
self.assertIsInstance(quote_blocks[0], Paragraph)
|
||||||
self.assertIsInstance(quote_blocks[1], Parapgraph)
|
self.assertIsInstance(quote_blocks[1], Paragraph)
|
||||||
|
|
||||||
def test_code_block(self):
|
def test_code_block(self):
|
||||||
"""Test parsing code blocks."""
|
"""Test parsing code blocks."""
|
||||||
@ -229,9 +229,9 @@ def hello():
|
|||||||
self.reader.extract_content(html, self.document)
|
self.reader.extract_content(html, self.document)
|
||||||
|
|
||||||
self.assertEqual(len(self.document.blocks), 3)
|
self.assertEqual(len(self.document.blocks), 3)
|
||||||
self.assertIsInstance(self.document.blocks[0], Parapgraph)
|
self.assertIsInstance(self.document.blocks[0], Paragraph)
|
||||||
self.assertIsInstance(self.document.blocks[1], HorizontalRule)
|
self.assertIsInstance(self.document.blocks[1], HorizontalRule)
|
||||||
self.assertIsInstance(self.document.blocks[2], Parapgraph)
|
self.assertIsInstance(self.document.blocks[2], Paragraph)
|
||||||
|
|
||||||
def test_html_entities(self):
|
def test_html_entities(self):
|
||||||
"""Test handling HTML entities."""
|
"""Test handling HTML entities."""
|
||||||
@ -268,7 +268,7 @@ def hello():
|
|||||||
|
|
||||||
# Check that we have different types of blocks
|
# Check that we have different types of blocks
|
||||||
block_types = [type(block).__name__ for block in self.document.blocks]
|
block_types = [type(block).__name__ for block in self.document.blocks]
|
||||||
self.assertIn('Parapgraph', block_types) # From div
|
self.assertIn('Paragraph', block_types) # From div
|
||||||
self.assertIn('Heading', block_types)
|
self.assertIn('Heading', block_types)
|
||||||
self.assertIn('HList', block_types)
|
self.assertIn('HList', block_types)
|
||||||
|
|
||||||
@ -346,7 +346,7 @@ def hello():
|
|||||||
|
|
||||||
# Should have different types of content
|
# Should have different types of content
|
||||||
block_types = set(type(block).__name__ for block in self.document.blocks)
|
block_types = set(type(block).__name__ for block in self.document.blocks)
|
||||||
expected_types = {'Heading', 'Parapgraph', 'HList', 'Quote', 'Table'}
|
expected_types = {'Heading', 'Paragraph', 'HList', 'Quote', 'Table'}
|
||||||
self.assertTrue(expected_types.issubset(block_types))
|
self.assertTrue(expected_types.issubset(block_types))
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
384
tests/test_html_extraction.py
Normal file
384
tests/test_html_extraction.py
Normal file
@ -0,0 +1,384 @@
|
|||||||
|
"""
|
||||||
|
Unit tests for HTML extraction functionality.
|
||||||
|
|
||||||
|
Tests the HTML parsing and conversion to pyWebLayout abstract elements,
|
||||||
|
including styled content within paragraphs and block-level elements.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
from pyWebLayout.io.readers.html_extraction import parse_html_string
|
||||||
|
from pyWebLayout.abstract.block import Paragraph, Heading, HeadingLevel, Quote, CodeBlock, HList, ListStyle, Table
|
||||||
|
from pyWebLayout.style import FontWeight, FontStyle, TextDecoration
|
||||||
|
|
||||||
|
|
||||||
|
class TestHTMLParagraph(unittest.TestCase):
    """Test cases for basic paragraph parsing."""

    def _assert_word_texts(self, paragraph, sentence):
        """Compare the paragraph's words pairwise with the space-split sentence."""
        expected_tokens = sentence.split(" ")
        for entry, expected in zip(paragraph.words(), expected_tokens):
            # words() yields pairs; the Word object is the second element
            self.assertEqual(entry[1].text, expected)

    def test_simple(self):
        """A single <p> yields one paragraph with four words."""
        paragraphs = parse_html_string("<p>This is a paragraph.</p>")

        self.assertEqual(len(paragraphs), 1)
        self.assertEqual(len(paragraphs[0]), 4)

        self._assert_word_texts(paragraphs[0], "This is a paragraph.")

    def test_multiple(self):
        """Two consecutive <p> elements yield two four-word paragraphs."""
        markup = "<p>This is a paragraph.</p><p>This is another paragraph.</p>"
        paragraphs = parse_html_string(markup)

        self.assertEqual(len(paragraphs), 2)
        first, second = paragraphs[0], paragraphs[1]
        self.assertEqual(len(first), 4)
        self.assertEqual(len(second), 4)

        self._assert_word_texts(first, "This is a paragraph.")
        self._assert_word_texts(second, "This is another paragraph.")
|
||||||
|
|
||||||
|
|
||||||
|
class TestHTMLStyledParagraphs(unittest.TestCase):
|
||||||
|
"""Test cases for paragraphs with inline styling."""
|
||||||
|
|
||||||
|
def test_bold_text(self):
|
||||||
|
"""Test paragraphs with bold text using <strong> and <b> tags."""
|
||||||
|
text = "<p>This is <strong>bold text</strong> in a paragraph.</p>"
|
||||||
|
blocks = parse_html_string(text)
|
||||||
|
self.assertEqual(len(blocks), 1)
|
||||||
|
self.assertIsInstance(blocks[0], Paragraph)
|
||||||
|
|
||||||
|
words = list(blocks[0].words())
|
||||||
|
self.assertEqual(len(words), 7) # "This is bold text in a paragraph."
|
||||||
|
|
||||||
|
# Check that 'bold' and 'text' words have bold font weight
|
||||||
|
bold_word = words[2][1] # 'bold'
|
||||||
|
text_word = words[3][1] # 'text'
|
||||||
|
self.assertEqual(bold_word.text, "bold")
|
||||||
|
self.assertEqual(bold_word.style.weight, FontWeight.BOLD)
|
||||||
|
self.assertEqual(text_word.text, "text")
|
||||||
|
self.assertEqual(text_word.style.weight, FontWeight.BOLD)
|
||||||
|
|
||||||
|
# Check that other words are not bold
|
||||||
|
normal_word = words[0][1] # 'This'
|
||||||
|
self.assertEqual(normal_word.text, "This")
|
||||||
|
self.assertNotEqual(normal_word.style.weight, FontWeight.BOLD)
|
||||||
|
|
||||||
|
def test_italic_text(self):
|
||||||
|
"""Test paragraphs with italic text using <em> and <i> tags."""
|
||||||
|
text = "<p>This is <em>italic text</em> in a paragraph.</p>"
|
||||||
|
blocks = parse_html_string(text)
|
||||||
|
self.assertEqual(len(blocks), 1)
|
||||||
|
self.assertIsInstance(blocks[0], Paragraph)
|
||||||
|
|
||||||
|
words = list(blocks[0].words())
|
||||||
|
|
||||||
|
# Check that 'italic' and 'text' words have italic font style
|
||||||
|
italic_word = words[2][1] # 'italic'
|
||||||
|
text_word = words[3][1] # 'text'
|
||||||
|
self.assertEqual(italic_word.text, "italic")
|
||||||
|
self.assertEqual(italic_word.style.style, FontStyle.ITALIC)
|
||||||
|
self.assertEqual(text_word.text, "text")
|
||||||
|
self.assertEqual(text_word.style.style, FontStyle.ITALIC)
|
||||||
|
|
||||||
|
def test_underlined_text(self):
|
||||||
|
"""Test paragraphs with underlined text using <u> tag."""
|
||||||
|
text = "<p>This is <u>underlined text</u> here.</p>"
|
||||||
|
blocks = parse_html_string(text)
|
||||||
|
self.assertEqual(len(blocks), 1)
|
||||||
|
|
||||||
|
words = list(blocks[0].words())
|
||||||
|
underlined_word = words[2][1] # 'underlined'
|
||||||
|
self.assertEqual(underlined_word.style.decoration, TextDecoration.UNDERLINE)
|
||||||
|
|
||||||
|
def test_strikethrough_text(self):
|
||||||
|
"""Test paragraphs with strikethrough text using <s> and <del> tags."""
|
||||||
|
text = "<p>This is <s>strikethrough text</s> here.</p>"
|
||||||
|
blocks = parse_html_string(text)
|
||||||
|
self.assertEqual(len(blocks), 1)
|
||||||
|
|
||||||
|
words = list(blocks[0].words())
|
||||||
|
strike_word = words[2][1] # 'strikethrough'
|
||||||
|
self.assertEqual(strike_word.style.decoration, TextDecoration.STRIKETHROUGH)
|
||||||
|
|
||||||
|
def test_span_with_inline_styles(self):
    """Check that inline CSS on a <span> reaches the words it encloses.

    The span declares ``color: red`` and ``font-weight: bold``. The test
    requires at least one bold word whose text matches a word of the span,
    and at least one of those bold words to have colour (255, 0, 0).
    """
    markup = '<p>This text is normal, but <span style="color: red; font-weight: bold;">this part is red and bold</span>.</p>'
    parsed = parse_html_string(markup)
    self.assertEqual(len(parsed), 1)
    paragraph = parsed[0]
    self.assertIsInstance(paragraph, Paragraph)

    # Candidates are matched by text only, so a bold "is" from outside the
    # span would be collected as well.
    span_texts = ["this", "part", "is", "red", "and", "bold"]
    bold_in_span = [
        word
        for _, word in paragraph.words()
        if word.text in span_texts and word.style.weight == FontWeight.BOLD
    ]
    self.assertGreater(len(bold_in_span), 0, "Should have bold words in styled span")

    # Of the bold candidates, keep those that also picked up the red colour.
    red_and_bold = []
    for word in bold_in_span:
        if word.style.colour == (255, 0, 0):
            red_and_bold.append(word)
    self.assertGreater(len(red_and_bold), 0, "Should have red colored words")
|
||||||
|
|
||||||
|
def test_mixed_formatting(self):
    """Check a paragraph that mixes bold, italic and coloured inline markup.

    The markup also contains a <mark> element, but nothing is asserted
    about the highlighted word.
    """
    markup = "<p>This paragraph contains <strong>bold</strong>, <em>italic</em>, <span style=\"color: blue;\">blue</span>, and <mark>highlighted</mark> text all together.</p>"
    parsed = parse_html_string(markup)
    self.assertEqual(len(parsed), 1)
    paragraph = parsed[0]
    self.assertIsInstance(paragraph, Paragraph)

    entries = list(paragraph.words())

    # At least one word must be bold (from <strong>).
    bold = []
    for _, word in entries:
        if word.style.weight == FontWeight.BOLD:
            bold.append(word)
    self.assertGreater(len(bold), 0, "Should have bold words")

    # At least one word must be italic (from <em>).
    italic = []
    for _, word in entries:
        if word.style.style == FontStyle.ITALIC:
            italic.append(word)
    self.assertGreater(len(italic), 0, "Should have italic words")

    # At least one word must be blue (from the styled <span>).
    blue = []
    for _, word in entries:
        if word.style.colour == (0, 0, 255):
            blue.append(word)
    self.assertGreater(len(blue), 0, "Should have blue colored words")
|
||||||
|
|
||||||
|
def test_nested_formatting(self):
    """Check that an <em> nested inside <strong> yields bold-and-italic words."""
    markup = "<p>This has <strong>bold with <em>italic inside</em></strong> formatting.</p>"
    parsed = parse_html_string(markup)
    self.assertEqual(len(parsed), 1)

    # Collect every word that carries both styles at once.
    both_styles = []
    for _, word in list(parsed[0].words()):
        is_bold = word.style.weight == FontWeight.BOLD
        if is_bold and word.style.style == FontStyle.ITALIC:
            both_styles.append(word)

    self.assertGreater(len(both_styles), 0, "Should have words that are both bold and italic")
|
||||||
|
|
||||||
|
def test_color_variations(self):
    """Check that hex and named CSS colours both end up as RGB tuples."""
    markup = '<p><span style="color: #ff0000;">Hex red</span> and <span style="color: green;">Named green</span>.</p>'
    parsed = parse_html_string(markup)
    self.assertEqual(len(parsed), 1)

    entries = list(parsed[0].words())

    # "#ff0000" must come through as pure red.
    hex_red = []
    for _, word in entries:
        if word.style.colour == (255, 0, 0):
            hex_red.append(word)
    self.assertGreater(len(hex_red), 0, "Should have hex red colored words")

    # NOTE(review): the CSS named colour "green" is #008000, i.e.
    # (0, 128, 0); this test expects (0, 255, 0). Confirm that the
    # parser's colour table is intentionally different.
    named_green = []
    for _, word in entries:
        if word.style.colour == (0, 255, 0):
            named_green.append(word)
    self.assertGreater(len(named_green), 0, "Should have green colored words")
|
||||||
|
|
||||||
|
|
||||||
|
class TestHTMLBlockElements(unittest.TestCase):
    """Tests for how block-level HTML elements are turned into blocks."""

    def test_body_element(self):
        """A <body> wrapper yields exactly its two child paragraphs."""
        markup = "<body><p>Paragraph one.</p><p>Paragraph two.</p></body>"
        parsed = parse_html_string(markup)
        self.assertEqual(len(parsed), 2)
        for index in (0, 1):
            self.assertIsInstance(parsed[index], Paragraph)

    def test_div_container(self):
        """A <div> wrapper yields exactly its two child paragraphs."""
        markup = "<div><p>First paragraph.</p><p>Second paragraph.</p></div>"
        parsed = parse_html_string(markup)
        self.assertEqual(len(parsed), 2)
        for index in (0, 1):
            self.assertIsInstance(parsed[index], Paragraph)

    def test_headings(self):
        """Each of <h1>..<h6> becomes a Heading with the matching level."""
        markup = (
            "<h1>Heading 1</h1><h2>Heading 2</h2><h3>Heading 3</h3>"
            "<h4>Heading 4</h4><h5>Heading 5</h5><h6>Heading 6</h6>"
        )
        parsed = parse_html_string(markup)
        self.assertEqual(len(parsed), 6)

        levels_in_order = [
            HeadingLevel.H1, HeadingLevel.H2, HeadingLevel.H3,
            HeadingLevel.H4, HeadingLevel.H5, HeadingLevel.H6,
        ]

        # The length check above guarantees both sequences have six entries.
        for heading, level in zip(parsed, levels_in_order):
            self.assertIsInstance(heading, Heading)
            self.assertEqual(heading.level, level)

            entries = list(heading.words())
            self.assertEqual(len(entries), 2)  # "Heading" and number
            self.assertEqual(entries[0][1].text, "Heading")

    def test_blockquote(self):
        """A <blockquote> becomes a Quote that holds the inner paragraph."""
        markup = "<blockquote><p>This is a quoted paragraph.</p></blockquote>"
        parsed = parse_html_string(markup)
        self.assertEqual(len(parsed), 1)
        quote = parsed[0]
        self.assertIsInstance(quote, Quote)

        # The quote must contain exactly one child block, a Paragraph.
        children = list(quote.blocks())
        self.assertEqual(len(children), 1)
        self.assertIsInstance(children[0], Paragraph)

    def test_preformatted_code(self):
        """<pre><code> becomes a single CodeBlock with at least one line."""
        markup = "<pre><code>function hello() {\n console.log('Hello');\n}</code></pre>"
        parsed = parse_html_string(markup)
        self.assertEqual(len(parsed), 1)
        code_block = parsed[0]
        self.assertIsInstance(code_block, CodeBlock)

        code_lines = list(code_block.lines())
        self.assertGreater(len(code_lines), 0)

    def test_unordered_list(self):
        """A <ul> becomes one UNORDERED HList with an item per <li>."""
        markup = "<ul><li>First item</li><li>Second item</li><li>Third item</li></ul>"
        parsed = parse_html_string(markup)
        self.assertEqual(len(parsed), 1)
        hlist = parsed[0]
        self.assertIsInstance(hlist, HList)
        self.assertEqual(hlist.style, ListStyle.UNORDERED)

        list_items = list(hlist.items())
        self.assertEqual(len(list_items), 3)

    def test_ordered_list(self):
        """An <ol> becomes one ORDERED HList."""
        markup = "<ol><li>First item</li><li>Second item</li><li>Third item</li></ol>"
        parsed = parse_html_string(markup)
        self.assertEqual(len(parsed), 1)
        hlist = parsed[0]
        self.assertIsInstance(hlist, HList)
        self.assertEqual(hlist.style, ListStyle.ORDERED)

    def test_list_with_styled_content(self):
        """A list whose items contain inline formatting keeps all three items.

        The bold check on the second item runs only when that item reports
        at least one block.
        """
        markup = "<ul><li>Normal item</li><li><strong>Bold item</strong></li><li>Item with <em>italic</em> text</li></ul>"
        parsed = parse_html_string(markup)
        self.assertEqual(len(parsed), 1)
        hlist = parsed[0]
        self.assertIsInstance(hlist, HList)

        list_items = list(hlist.items())
        self.assertEqual(len(list_items), 3)

        second_item_blocks = list(list_items[1].blocks())
        # NOTE(review): if the second item has no blocks the test ends here
        # without verifying any bold text, i.e. it passes vacuously.
        if not second_item_blocks:
            return

        entries = list(second_item_blocks[0].words())
        bold = []
        for _, word in entries:
            if word.style.weight == FontWeight.BOLD:
                bold.append(word)
        self.assertGreater(len(bold), 0)

    def test_table_basic(self):
        """A <table> with a header row and a data row becomes one Table."""
        markup = """
        <table>
            <tr>
                <th>Header 1</th>
                <th>Header 2</th>
            </tr>
            <tr>
                <td>Cell 1</td>
                <td>Cell 2</td>
            </tr>
        </table>
        """
        parsed = parse_html_string(markup)
        self.assertEqual(len(parsed), 1)
        self.assertIsInstance(parsed[0], Table)

    def test_semantic_elements(self):
        """<section>/<article> wrappers yield only the paragraph they contain."""
        markup = "<section><article><p>Article content</p></article></section>"
        parsed = parse_html_string(markup)
        self.assertEqual(len(parsed), 1)
        self.assertIsInstance(parsed[0], Paragraph)

    def test_nested_block_elements(self):
        """A <div> holding a heading, a paragraph and a quote yields all three."""
        markup = """
        <div>
            <h2>Section Title</h2>
            <p>Some introductory text.</p>
            <blockquote>
                <p>A quoted paragraph.</p>
            </blockquote>
        </div>
        """
        parsed = parse_html_string(markup)
        self.assertGreater(len(parsed), 2)

        def contains(block_type):
            # True when any top-level block is an instance of block_type.
            return any(isinstance(block, block_type) for block in parsed)

        # Should have at least a heading, paragraph, and quote
        self.assertTrue(contains(Heading), "Should contain a heading")
        self.assertTrue(contains(Paragraph), "Should contain a paragraph")
        self.assertTrue(contains(Quote), "Should contain a quote")

    def test_empty_elements(self):
        """Smoke-test parsing of empty and whitespace-only elements.

        NOTE(review): both assertions compare a length against ``>= 0`` and
        so cannot fail; the test only shows that parsing does not raise and
        that the result has a length.
        """
        empty_result = parse_html_string("<p></p><div></div><span></span>")
        # Empty elements may not create blocks, which is acceptable behavior
        self.assertGreaterEqual(len(empty_result), 0)

        # A paragraph holding only whitespace; no minimum block count is
        # actually enforced for it.
        whitespace_result = parse_html_string("<p> </p>")
        self.assertGreaterEqual(len(whitespace_result), 0)
|
||||||
|
|
||||||
|
|
||||||
|
class TestHTMLComplexStructures(unittest.TestCase):
    """Tests for HTML documents that combine several block and inline features."""

    def test_article_with_mixed_content(self):
        """An <article> with heading, paragraph, quote and list yields each type."""
        markup = """
        <article>
            <h1>Article Title</h1>
            <p>This is the <strong>introduction</strong> paragraph with <em>some emphasis</em>.</p>
            <blockquote>
                <p>This is a <span style="color: blue;">quoted section</span> with styling.</p>
            </blockquote>
            <ul>
                <li>First <strong>important</strong> point</li>
                <li>Second point with <code>inline code</code></li>
            </ul>
        </article>
        """
        parsed = parse_html_string(markup)
        self.assertGreater(len(parsed), 3)

        # Compare by class name so each expected block type is looked up once.
        type_names = [type(block).__name__ for block in parsed]
        for expected_name in ('Heading', 'Paragraph', 'Quote', 'HList'):
            self.assertIn(expected_name, type_names)

    def test_styled_table_content(self):
        """A table with <thead>/<tbody> and styled cells becomes one Table."""
        markup = """
        <table>
            <thead>
                <tr>
                    <th><strong>Product</strong></th>
                    <th><em>Price</em></th>
                </tr>
            </thead>
            <tbody>
                <tr>
                    <td>Item with <span style="color: red;">red text</span></td>
                    <td><strong>$19.99</strong></td>
                </tr>
            </tbody>
        </table>
        """
        parsed = parse_html_string(markup)
        self.assertEqual(len(parsed), 1)
        self.assertIsInstance(parsed[0], Table)
|
||||||
|
|
||||||
|
|
||||||
|
# Run the test cases in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
|
||||||
@ -8,7 +8,7 @@ import unittest
|
|||||||
from unittest.mock import Mock, MagicMock
|
from unittest.mock import Mock, MagicMock
|
||||||
from pyWebLayout.io.readers.html_text import HTMLTextProcessor
|
from pyWebLayout.io.readers.html_text import HTMLTextProcessor
|
||||||
from pyWebLayout.io.readers.html_style import HTMLStyleManager
|
from pyWebLayout.io.readers.html_style import HTMLStyleManager
|
||||||
from pyWebLayout.abstract.block import Parapgraph
|
from pyWebLayout.abstract.block import Paragraph
|
||||||
from pyWebLayout.abstract.inline import Word
|
from pyWebLayout.abstract.inline import Word
|
||||||
|
|
||||||
|
|
||||||
@ -21,7 +21,7 @@ class TestHTMLTextProcessor(unittest.TestCase):
|
|||||||
self.text_processor = HTMLTextProcessor(self.style_manager)
|
self.text_processor = HTMLTextProcessor(self.style_manager)
|
||||||
|
|
||||||
# Create a mock paragraph
|
# Create a mock paragraph
|
||||||
self.mock_paragraph = Mock(spec=Parapgraph)
|
self.mock_paragraph = Mock(spec=Paragraph)
|
||||||
self.mock_paragraph.add_word = Mock()
|
self.mock_paragraph.add_word = Mock()
|
||||||
|
|
||||||
def test_initialization(self):
|
def test_initialization(self):
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user