# pyWebLayout/tests/io_tests/test_html_extraction_functions.py
#
# 525 lines
# 20 KiB
# Python

"""
Unit tests for individual HTML extraction functions.
Tests the specific handler functions and utility functions in html_extraction module,
reusing test patterns from test_html_extraction.py that are known to pass.
"""
import unittest
from bs4 import BeautifulSoup
from pyWebLayout.io.readers.html_extraction import (
create_base_context,
apply_element_styling,
parse_inline_styles,
apply_element_font_styles,
extract_text_content,
paragraph_handler,
div_handler,
heading_handler,
blockquote_handler,
preformatted_handler,
unordered_list_handler,
ordered_list_handler,
list_item_handler,
table_handler,
table_row_handler,
table_cell_handler,
table_header_cell_handler,
horizontal_rule_handler,
image_handler,
StyleContext,
)
from pyWebLayout.abstract.block import (
Paragraph,
Heading,
HeadingLevel,
Quote,
CodeBlock,
HList,
ListItem,
ListStyle,
Table,
TableRow,
TableCell,
HorizontalRule,
Image,
)
from pyWebLayout.abstract.inline import Word
from pyWebLayout.style import Font, FontWeight, FontStyle, TextDecoration
class TestUtilityFunctions(unittest.TestCase):
"""Test cases for utility functions."""
def test_create_base_context(self):
"""Test creation of base style context."""
context = create_base_context()
self.assertIsInstance(context, StyleContext)
self.assertIsInstance(context.font, Font)
self.assertIsNone(context.background)
self.assertEqual(context.css_classes, set())
self.assertEqual(context.css_styles, {})
self.assertEqual(context.element_attributes, {})
self.assertEqual(context.parent_elements, [])
def test_parse_inline_styles_from_existing_tests(self):
"""Test parsing CSS inline styles - adapted from test_span_with_inline_styles."""
# From: '<span style="color: red; font-weight: bold;">this part is red and bold</span>'
style_text = "color: red; font-weight: bold;"
styles = parse_inline_styles(style_text)
expected = {
"color": "red",
"font-weight": "bold"
}
self.assertEqual(styles, expected)
def test_parse_inline_styles_color_variations(self):
"""Test parsing different color formats - adapted from test_color_variations."""
# Test hex color parsing
hex_style = "color: #ff0000;"
styles = parse_inline_styles(hex_style)
self.assertEqual(styles.get("color"), "#ff0000")
# Test named color parsing
named_style = "color: green;"
styles = parse_inline_styles(named_style)
self.assertEqual(styles.get("color"), "green")
def test_apply_element_font_styles_bold_elements(self):
"""Test font style application for bold elements - adapted from test_bold_text."""
base_font = Font()
# Test <strong> tag - from "<strong>bold text</strong>"
font = apply_element_font_styles(base_font, "strong", {})
self.assertEqual(font.weight, FontWeight.BOLD)
# Test <b> tag
font = apply_element_font_styles(base_font, "b", {})
self.assertEqual(font.weight, FontWeight.BOLD)
def test_apply_element_font_styles_italic_elements(self):
"""Test font style application for italic elements - adapted from test_italic_text."""
base_font = Font()
# Test <em> tag - from "<em>italic text</em>"
font = apply_element_font_styles(base_font, "em", {})
self.assertEqual(font.style, FontStyle.ITALIC)
# Test <i> tag
font = apply_element_font_styles(base_font, "i", {})
self.assertEqual(font.style, FontStyle.ITALIC)
def test_apply_element_font_styles_decoration_elements(self):
"""Test font decoration - adapted from test_underlined_text and test_strikethrough_text."""
base_font = Font()
# Test <u> tag - from "<u>underlined text</u>"
font = apply_element_font_styles(base_font, "u", {})
self.assertEqual(font.decoration, TextDecoration.UNDERLINE)
# Test <s> tag - from "<s>strikethrough text</s>"
font = apply_element_font_styles(base_font, "s", {})
self.assertEqual(font.decoration, TextDecoration.STRIKETHROUGH)
# Test <del> tag
font = apply_element_font_styles(base_font, "del", {})
self.assertEqual(font.decoration, TextDecoration.STRIKETHROUGH)
def test_apply_element_font_styles_headings(self):
"""Test heading font styles - adapted from test_headings."""
base_font = Font()
# Test heading sizes and weights - from test_headings which tests h1-h6
headings = [("h1", 24), ("h2", 20), ("h3", 18),
("h4", 16), ("h5", 14), ("h6", 12)]
for tag, expected_size in headings:
font = apply_element_font_styles(base_font, tag, {})
self.assertEqual(font.font_size, expected_size, f"Size mismatch for {tag}")
self.assertEqual(
font.weight,
FontWeight.BOLD,
f"Weight should be bold for {tag}")
def test_apply_element_font_styles_color_parsing(self):
"""Test color parsing - adapted from test_color_variations."""
base_font = Font()
# Test named colors - from '<span style="color: green;">Named green</span>'
css_styles = {"color": "green"}
font = apply_element_font_styles(base_font, "span", css_styles)
self.assertEqual(font.colour, (0, 255, 0))
# Test hex colors - from '<span style="color: #ff0000;">Hex red</span>'
css_styles = {"color": "#ff0000"}
font = apply_element_font_styles(base_font, "span", css_styles)
self.assertEqual(font.colour, (255, 0, 0))
def test_apply_element_styling_with_classes_and_styles(self):
"""Test complete element styling - adapted from test_span_with_inline_styles."""
# From: '<span style="color: red; font-weight: bold;">this part is red and bold</span>'
soup = BeautifulSoup(
'<span class="highlight" style="color: red; font-weight: bold;">text</span>',
'html.parser')
element = soup.find('span')
base_context = create_base_context()
styled_context = apply_element_styling(base_context, element)
# Check CSS classes
self.assertIn("highlight", styled_context.css_classes)
# Check CSS styles
self.assertEqual(styled_context.css_styles.get("color"), "red")
self.assertEqual(styled_context.css_styles.get("font-weight"), "bold")
# Check font styling
self.assertEqual(styled_context.font.colour, (255, 0, 0))
self.assertEqual(styled_context.font.weight, FontWeight.BOLD)
class TestExtractTextContent(unittest.TestCase):
    """Checks for ``extract_text_content`` on plain and inline-styled markup."""

    def setUp(self):
        """Create a fresh base style context for every test."""
        self.base_context = create_base_context()

    def test_extract_simple_text(self):
        """Plain paragraph text is split into ``Word`` objects - adapted from test_simple."""
        # Markup taken from: "<p>This is a paragraph.</p>"
        paragraph = BeautifulSoup('<p>This is a paragraph.</p>', 'html.parser').find('p')
        extracted = extract_text_content(paragraph, self.base_context)
        # Same word count as the original test: "This", "is", "a", "paragraph."
        self.assertEqual(len(extracted), 4)
        first_word = extracted[0]
        self.assertIsInstance(first_word, Word)
        self.assertEqual(first_word.text, "This")

    def test_extract_styled_text_bold(self):
        """Words inside <strong> come back bold - adapted from test_bold_text."""
        # Adapted from: "<p>This is <strong>bold text</strong> in a paragraph.</p>"
        markup = '<span>This is <strong>bold text</strong> in a paragraph.</span>'
        span = BeautifulSoup(markup, 'html.parser').find('span')
        extracted = extract_text_content(span, self.base_context)
        # Keep only the words whose style weight is bold.
        bold_only = [word for word in extracted if word.style.weight == FontWeight.BOLD]
        self.assertGreater(len(bold_only), 0, "Should have bold words")
        # The two words wrapped by <strong> must be among them.
        bold_texts = [word.text for word in bold_only]
        for expected_text in ("bold", "text"):
            self.assertIn(expected_text, bold_texts)

    def test_extract_nested_formatting(self):
        """<em> nested in <strong> yields bold italic words - adapted from test_nested_formatting."""
        # Adapted from:
        # "<p>This has <strong>bold with <em>italic inside</em></strong> formatting.</p>"
        markup = (
            '<span>This has <strong>bold with <em>italic inside</em></strong> formatting.</span>'
        )
        span = BeautifulSoup(markup, 'html.parser').find('span')
        extracted = extract_text_content(span, self.base_context)

        def is_bold_italic(word):
            # Both style properties must match for the word to count.
            return (word.style.weight == FontWeight.BOLD
                    and word.style.style == FontStyle.ITALIC)

        matching = [word for word in extracted if is_bold_italic(word)]
        self.assertGreater(
            len(matching),
            0,
            "Should have words that are both bold and italic")
class TestHandlerFunctions(unittest.TestCase):
"""Test cases for HTML element handler functions using known working patterns."""
def setUp(self):
"""Set up test fixtures."""
self.base_context = create_base_context()
def test_paragraph_handler_simple(self):
"""Test paragraph handler - adapted from test_simple."""
# From: "<p>This is a paragraph.</p>"
soup = BeautifulSoup('<p>This is a paragraph.</p>', 'html.parser')
element = soup.find('p')
result = paragraph_handler(element, self.base_context)
self.assertIsInstance(result, Paragraph)
# Should match original test expectations
self.assertEqual(len(result), 4) # 4 words
words = list(result.words_iter())
expected_texts = ["This", "is", "a", "paragraph."]
for i, expected_text in enumerate(expected_texts):
self.assertEqual(words[i][1].text, expected_text)
def test_heading_handler_all_levels(self):
"""Test heading handler - adapted from test_headings."""
# From: "<h1>Heading 1</h1><h2>Heading 2</h2>..."
expected_levels = [HeadingLevel.H1, HeadingLevel.H2, HeadingLevel.H3,
HeadingLevel.H4, HeadingLevel.H5, HeadingLevel.H6]
for i, expected_level in enumerate(expected_levels, 1):
tag = f"h{i}"
soup = BeautifulSoup(f'<{tag}>Heading {i}</{tag}>', 'html.parser')
element = soup.find(tag)
result = heading_handler(element, self.base_context)
self.assertIsInstance(result, Heading)
self.assertEqual(result.level, expected_level)
# Should match original test word expectations
words = list(result.words_iter())
self.assertEqual(len(words), 2) # "Heading" and number
self.assertEqual(words[0][1].text, "Heading")
def test_blockquote_handler(self):
"""Test blockquote handler - adapted from test_blockquote."""
# From: "<blockquote><p>This is a quoted paragraph.</p></blockquote>"
soup = BeautifulSoup(
'<blockquote><p>This is a quoted paragraph.</p></blockquote>',
'html.parser')
element = soup.find('blockquote')
result = blockquote_handler(element, self.base_context)
self.assertIsInstance(result, Quote)
# Check that the quote contains a paragraph (from original test)
quote_blocks = list(result.blocks())
self.assertEqual(len(quote_blocks), 1)
self.assertIsInstance(quote_blocks[0], Paragraph)
def test_preformatted_handler(self):
"""Test preformatted handler - adapted from test_preformatted_code."""
# From: "<pre><code>function hello() {\n console.log('Hello');\n}</code></pre>"
soup = BeautifulSoup(
'<pre><code>function hello() {\n console.log(\'Hello\');\n}</code></pre>',
'html.parser')
element = soup.find('pre')
result = preformatted_handler(element, self.base_context)
self.assertIsInstance(result, CodeBlock)
# Should have lines (from original test expectation)
lines = list(result.lines())
self.assertGreater(len(lines), 0)
def test_unordered_list_handler(self):
"""Test unordered list handler - adapted from test_unordered_list."""
# From: "<ul><li>First item</li><li>Second item</li><li>Third item</li></ul>"
soup = BeautifulSoup(
'<ul><li>First item</li><li>Second item</li><li>Third item</li></ul>',
'html.parser')
element = soup.find('ul')
result = unordered_list_handler(element, self.base_context)
self.assertIsInstance(result, HList)
self.assertEqual(result.style, ListStyle.UNORDERED)
# Should match original test expectations
items = list(result.items())
self.assertEqual(len(items), 3)
def test_ordered_list_handler(self):
"""Test ordered list handler - adapted from test_ordered_list."""
# From: "<ol><li>First item</li><li>Second item</li><li>Third item</li></ol>"
soup = BeautifulSoup(
'<ol><li>First item</li><li>Second item</li><li>Third item</li></ol>',
'html.parser')
element = soup.find('ol')
result = ordered_list_handler(element, self.base_context)
self.assertIsInstance(result, HList)
self.assertEqual(result.style, ListStyle.ORDERED)
# Should match original test expectations
items = list(result.items())
self.assertEqual(len(items), 3) # "First item", "Second item", "Third item"
def test_list_item_handler(self):
"""Test list item handler."""
soup = BeautifulSoup('<li>List item content</li>', 'html.parser')
element = soup.find('li')
result = list_item_handler(element, self.base_context)
self.assertIsInstance(result, ListItem)
blocks = list(result.blocks())
self.assertGreater(len(blocks), 0)
def test_table_handler(self):
"""Test table handler - adapted from test_table_basic."""
# From test_table_basic structure
soup = BeautifulSoup('''
<table>
<tr>
<th>Header 1</th>
<th>Header 2</th>
</tr>
<tr>
<td>Cell 1</td>
<td>Cell 2</td>
</tr>
</table>
''', 'html.parser')
element = soup.find('table')
result = table_handler(element, self.base_context)
self.assertIsInstance(result, Table)
def test_table_row_handler(self):
"""Test table row handler."""
soup = BeautifulSoup('<tr><td>Cell 1</td><td>Cell 2</td></tr>', 'html.parser')
element = soup.find('tr')
result = table_row_handler(element, self.base_context)
self.assertIsInstance(result, TableRow)
def test_table_cell_handler(self):
"""Test table cell handler."""
soup = BeautifulSoup('<td>Cell content</td>', 'html.parser')
element = soup.find('td')
# Apply styling to get attributes
styled_context = apply_element_styling(self.base_context, element)
result = table_cell_handler(element, styled_context)
self.assertIsInstance(result, TableCell)
self.assertEqual(result.is_header, False)
def test_table_header_cell_handler(self):
"""Test table header cell handler."""
soup = BeautifulSoup('<th>Header content</th>', 'html.parser')
element = soup.find('th')
# Apply styling to get attributes
styled_context = apply_element_styling(self.base_context, element)
result = table_header_cell_handler(element, styled_context)
self.assertIsInstance(result, TableCell)
self.assertEqual(result.is_header, True)
def test_horizontal_rule_handler(self):
"""Test horizontal rule handler."""
soup = BeautifulSoup('<hr>', 'html.parser')
element = soup.find('hr')
result = horizontal_rule_handler(element, self.base_context)
self.assertIsInstance(result, HorizontalRule)
def test_image_handler(self):
"""Test image handler."""
soup = BeautifulSoup(
'<img src="test.jpg" alt="Test image" width="100" height="50">',
'html.parser')
element = soup.find('img')
# Need to apply styling first to get attributes
styled_context = apply_element_styling(self.base_context, element)
result = image_handler(element, styled_context)
self.assertIsInstance(result, Image)
self.assertEqual(result.source, "test.jpg")
self.assertEqual(result.alt_text, "Test image")
self.assertEqual(result.width, 100)
self.assertEqual(result.height, 50)
def test_div_handler_container(self):
"""Test div handler - adapted from test_div_container."""
# From: "<div><p>First paragraph.</p><p>Second paragraph.</p></div>"
soup = BeautifulSoup(
'<div><p>First paragraph.</p><p>Second paragraph.</p></div>',
'html.parser')
element = soup.find('div')
result = div_handler(element, self.base_context)
self.assertIsInstance(result, list)
# Should match original test expectations
self.assertEqual(len(result), 2)
self.assertIsInstance(result[0], Paragraph)
self.assertIsInstance(result[1], Paragraph)
class TestStyledContentHandling(unittest.TestCase):
    """Paragraph-level checks that inline styling reaches individual words."""

    def setUp(self):
        """Create a fresh base style context for every test."""
        self.base_context = create_base_context()

    def test_paragraph_with_bold_content(self):
        """Only the <strong> words are bold - adapted from test_bold_text."""
        # Markup taken from: "<p>This is <strong>bold text</strong> in a paragraph.</p>"
        markup = '<p>This is <strong>bold text</strong> in a paragraph.</p>'
        paragraph_tag = BeautifulSoup(markup, 'html.parser').find('p')
        paragraph = paragraph_handler(paragraph_tag, self.base_context)
        self.assertIsInstance(paragraph, Paragraph)
        entries = list(paragraph.words_iter())
        # Word count expected by the original test.
        self.assertEqual(len(entries), 7)
        # Each entry is indexed as a pair; the word object sits at index 1.
        third = entries[2][1]   # 'bold'
        fourth = entries[3][1]  # 'text'
        self.assertEqual(third.text, "bold")
        self.assertEqual(third.style.weight, FontWeight.BOLD)
        self.assertEqual(fourth.text, "text")
        self.assertEqual(fourth.style.weight, FontWeight.BOLD)
        # A word outside <strong> must not be bold.
        leading = entries[0][1]  # 'This'
        self.assertEqual(leading.text, "This")
        self.assertNotEqual(leading.style.weight, FontWeight.BOLD)

    def test_paragraph_with_mixed_formatting(self):
        """Bold, italic and coloured spans coexist - adapted from test_mixed_formatting."""
        # Adapted from: "<p>This paragraph contains <strong>bold</strong>, <em>italic</em>..."
        markup = (
            '<p>This paragraph contains <strong>bold</strong>, <em>italic</em>, '
            '<span style="color: blue;">blue</span> text.</p>'
        )
        paragraph_tag = BeautifulSoup(markup, 'html.parser').find('p')
        paragraph = paragraph_handler(paragraph_tag, self.base_context)
        self.assertIsInstance(paragraph, Paragraph)
        # Drop the first member of every pair and keep just the word objects.
        only_words = [word for _, word in list(paragraph.words_iter())]
        # At least one bold word (from original test pattern).
        bold = [word for word in only_words if word.style.weight == FontWeight.BOLD]
        self.assertGreater(len(bold), 0, "Should have bold words")
        # At least one italic word (from original test pattern).
        italic = [word for word in only_words if word.style.style == FontStyle.ITALIC]
        self.assertGreater(len(italic), 0, "Should have italic words")
        # At least one word coloured pure blue (from original test pattern).
        blue = [word for word in only_words if word.style.colour == (0, 0, 255)]
        self.assertGreater(len(blue), 0, "Should have blue colored words")
if __name__ == "__main__":
    # Run every TestCase in this module when the file is executed directly.
    unittest.main()