494 lines
20 KiB
Python
494 lines
20 KiB
Python
"""
|
|
Unit tests for individual HTML extraction functions.
|
|
|
|
Tests the specific handler functions and utility functions in html_extraction module,
|
|
reusing test patterns from test_html_extraction.py that are known to pass.
|
|
"""
|
|
|
|
import unittest
|
|
from bs4 import BeautifulSoup, Tag
|
|
from pyWebLayout.io.readers.html_extraction import (
|
|
create_base_context,
|
|
apply_element_styling,
|
|
parse_inline_styles,
|
|
apply_element_font_styles,
|
|
extract_text_content,
|
|
paragraph_handler,
|
|
div_handler,
|
|
heading_handler,
|
|
blockquote_handler,
|
|
preformatted_handler,
|
|
unordered_list_handler,
|
|
ordered_list_handler,
|
|
list_item_handler,
|
|
table_handler,
|
|
table_row_handler,
|
|
table_cell_handler,
|
|
table_header_cell_handler,
|
|
horizontal_rule_handler,
|
|
image_handler,
|
|
StyleContext,
|
|
)
|
|
from pyWebLayout.abstract.block import (
|
|
Paragraph,
|
|
Heading,
|
|
HeadingLevel,
|
|
Quote,
|
|
CodeBlock,
|
|
HList,
|
|
ListItem,
|
|
ListStyle,
|
|
Table,
|
|
TableRow,
|
|
TableCell,
|
|
HorizontalRule,
|
|
Image,
|
|
)
|
|
from pyWebLayout.abstract.inline import Word
|
|
from pyWebLayout.style import Font, FontWeight, FontStyle, TextDecoration
|
|
|
|
|
|
class TestUtilityFunctions(unittest.TestCase):
    """Test cases for the style-context, CSS parsing and font utility functions."""

    def test_create_base_context(self):
        """A fresh base context carries a Font and otherwise empty style state."""
        context = create_base_context()

        self.assertIsInstance(context, StyleContext)
        self.assertIsInstance(context.font, Font)
        self.assertIsNone(context.background)
        self.assertEqual(context.css_classes, set())
        self.assertEqual(context.css_styles, {})
        self.assertEqual(context.element_attributes, {})
        self.assertEqual(context.parent_elements, [])

    def test_parse_inline_styles_from_existing_tests(self):
        """Test parsing CSS inline styles - adapted from test_span_with_inline_styles."""
        # From: '<span style="color: red; font-weight: bold;">this part is red and bold</span>'
        style_text = "color: red; font-weight: bold;"
        styles = parse_inline_styles(style_text)

        expected = {
            "color": "red",
            "font-weight": "bold",
        }
        self.assertEqual(styles, expected)

    def test_parse_inline_styles_color_variations(self):
        """Test parsing different color formats - adapted from test_color_variations."""
        # Hex and named colours must both come back verbatim under the "color" key.
        cases = [
            ("color: #ff0000;", "#ff0000"),
            ("color: green;", "green"),
        ]

        for style_text, expected_color in cases:
            with self.subTest(style=style_text):
                styles = parse_inline_styles(style_text)
                self.assertEqual(styles.get("color"), expected_color)

    def test_apply_element_font_styles_bold_elements(self):
        """Test font style application for bold elements - adapted from test_bold_text."""
        base_font = Font()

        # <strong> comes from "<strong>bold text</strong>"; <b> is its presentational twin.
        for tag in ("strong", "b"):
            with self.subTest(tag=tag):
                font = apply_element_font_styles(base_font, tag, {})
                self.assertEqual(font.weight, FontWeight.BOLD)

    def test_apply_element_font_styles_italic_elements(self):
        """Test font style application for italic elements - adapted from test_italic_text."""
        base_font = Font()

        # <em> comes from "<em>italic text</em>"; <i> is its presentational twin.
        for tag in ("em", "i"):
            with self.subTest(tag=tag):
                font = apply_element_font_styles(base_font, tag, {})
                self.assertEqual(font.style, FontStyle.ITALIC)

    def test_apply_element_font_styles_decoration_elements(self):
        """Test font decoration - adapted from test_underlined_text and test_strikethrough_text."""
        base_font = Font()

        cases = [
            ("u", TextDecoration.UNDERLINE),        # "<u>underlined text</u>"
            ("s", TextDecoration.STRIKETHROUGH),    # "<s>strikethrough text</s>"
            ("del", TextDecoration.STRIKETHROUGH),
        ]

        for tag, expected_decoration in cases:
            with self.subTest(tag=tag):
                font = apply_element_font_styles(base_font, tag, {})
                self.assertEqual(font.decoration, expected_decoration)

    def test_apply_element_font_styles_headings(self):
        """Test heading font styles - adapted from test_headings."""
        base_font = Font()

        # Test heading sizes and weights - from test_headings which tests h1-h6
        headings = [("h1", 24), ("h2", 20), ("h3", 18), ("h4", 16), ("h5", 14), ("h6", 12)]

        for tag, expected_size in headings:
            # subTest keeps checking the remaining levels after one level fails.
            with self.subTest(tag=tag):
                font = apply_element_font_styles(base_font, tag, {})
                self.assertEqual(font.font_size, expected_size, f"Size mismatch for {tag}")
                self.assertEqual(font.weight, FontWeight.BOLD, f"Weight should be bold for {tag}")

    def test_apply_element_font_styles_color_parsing(self):
        """Test color parsing - adapted from test_color_variations."""
        base_font = Font()

        cases = [
            # From '<span style="color: green;">Named green</span>'
            ({"color": "green"}, (0, 255, 0)),
            # From '<span style="color: #ff0000;">Hex red</span>'
            ({"color": "#ff0000"}, (255, 0, 0)),
        ]

        for css_styles, expected_colour in cases:
            with self.subTest(css_styles=css_styles):
                font = apply_element_font_styles(base_font, "span", css_styles)
                self.assertEqual(font.colour, expected_colour)

    def test_apply_element_styling_with_classes_and_styles(self):
        """Test complete element styling - adapted from test_span_with_inline_styles."""
        # From: '<span style="color: red; font-weight: bold;">this part is red and bold</span>'
        soup = BeautifulSoup('<span class="highlight" style="color: red; font-weight: bold;">text</span>', 'html.parser')
        element = soup.find('span')
        base_context = create_base_context()

        styled_context = apply_element_styling(base_context, element)

        # Check CSS classes
        self.assertIn("highlight", styled_context.css_classes)

        # Check CSS styles
        self.assertEqual(styled_context.css_styles.get("color"), "red")
        self.assertEqual(styled_context.css_styles.get("font-weight"), "bold")

        # Check font styling
        self.assertEqual(styled_context.font.colour, (255, 0, 0))
        self.assertEqual(styled_context.font.weight, FontWeight.BOLD)
|
|
|
|
|
|
class TestExtractTextContent(unittest.TestCase):
    """Test cases for text content extraction via extract_text_content."""

    def setUp(self):
        """Create a fresh base style context for every test."""
        self.base_context = create_base_context()

    def test_extract_simple_text(self):
        """Test extracting simple text - adapted from test_simple."""
        # From: "<p>This is a paragraph.</p>"
        soup = BeautifulSoup('<p>This is a paragraph.</p>', 'html.parser')
        element = soup.find('p')

        words = extract_text_content(element, self.base_context)

        # Should match the expected word count from original test
        self.assertEqual(len(words), 4)  # "This", "is", "a", "paragraph."
        self.assertIsInstance(words[0], Word)
        self.assertEqual(words[0].text, "This")

    def test_extract_styled_text_bold(self):
        """Test extracting bold styled text - adapted from test_bold_text."""
        # From: "<p>This is <strong>bold text</strong> in a paragraph.</p>"
        soup = BeautifulSoup('<span>This is <strong>bold text</strong> in a paragraph.</span>', 'html.parser')
        element = soup.find('span')

        words = extract_text_content(element, self.base_context)

        # Find the bold words
        bold_words = [w for w in words if w.style.weight == FontWeight.BOLD]
        self.assertGreater(len(bold_words), 0, "Should have bold words")

        # Check specific words are bold (from original test expectations)
        bold_word_texts = [w.text for w in bold_words]
        self.assertIn("bold", bold_word_texts)
        self.assertIn("text", bold_word_texts)

    def test_extract_nested_formatting(self):
        """Test nested formatting - adapted from test_nested_formatting."""
        # From: "<p>This has <strong>bold with <em>italic inside</em></strong> formatting.</p>"
        soup = BeautifulSoup('<span>This has <strong>bold with <em>italic inside</em></strong> formatting.</span>', 'html.parser')
        element = soup.find('span')

        words = extract_text_content(element, self.base_context)

        # Find words that should be both bold and italic
        bold_italic_words = [
            w for w in words
            if w.style.weight == FontWeight.BOLD and w.style.style == FontStyle.ITALIC
        ]
        self.assertGreater(len(bold_italic_words), 0, "Should have words that are both bold and italic")
|
|
|
|
|
|
class TestHandlerFunctions(unittest.TestCase):
    """Test cases for HTML element handler functions using known working patterns."""

    def setUp(self):
        """Create a fresh base style context for every test."""
        self.base_context = create_base_context()

    def test_paragraph_handler_simple(self):
        """Test paragraph handler - adapted from test_simple."""
        # From: "<p>This is a paragraph.</p>"
        soup = BeautifulSoup('<p>This is a paragraph.</p>', 'html.parser')
        element = soup.find('p')

        result = paragraph_handler(element, self.base_context)

        self.assertIsInstance(result, Paragraph)
        # Should match original test expectations
        self.assertEqual(len(result), 4)  # 4 words

        # words_iter() yields pairs whose second item is the Word; compare all
        # texts at once so a mismatch reports the whole sequence.
        words = list(result.words_iter())
        expected_texts = ["This", "is", "a", "paragraph."]
        self.assertEqual([word.text for _, word in words], expected_texts)

    def test_heading_handler_all_levels(self):
        """Test heading handler - adapted from test_headings."""
        # From: "<h1>Heading 1</h1><h2>Heading 2</h2>..."
        expected_levels = [HeadingLevel.H1, HeadingLevel.H2, HeadingLevel.H3,
                           HeadingLevel.H4, HeadingLevel.H5, HeadingLevel.H6]

        for i, expected_level in enumerate(expected_levels, 1):
            tag = f"h{i}"
            # subTest keeps checking the remaining levels after one level fails.
            with self.subTest(tag=tag):
                soup = BeautifulSoup(f'<{tag}>Heading {i}</{tag}>', 'html.parser')
                element = soup.find(tag)

                result = heading_handler(element, self.base_context)

                self.assertIsInstance(result, Heading)
                self.assertEqual(result.level, expected_level)

                # Should match original test word expectations
                words = list(result.words_iter())
                self.assertEqual(len(words), 2)  # "Heading" and number
                self.assertEqual(words[0][1].text, "Heading")

    def test_blockquote_handler(self):
        """Test blockquote handler - adapted from test_blockquote."""
        # From: "<blockquote><p>This is a quoted paragraph.</p></blockquote>"
        soup = BeautifulSoup('<blockquote><p>This is a quoted paragraph.</p></blockquote>', 'html.parser')
        element = soup.find('blockquote')

        result = blockquote_handler(element, self.base_context)

        self.assertIsInstance(result, Quote)

        # Check that the quote contains a paragraph (from original test)
        quote_blocks = list(result.blocks())
        self.assertEqual(len(quote_blocks), 1)
        self.assertIsInstance(quote_blocks[0], Paragraph)

    def test_preformatted_handler(self):
        """Test preformatted handler - adapted from test_preformatted_code."""
        # From: "<pre><code>function hello() {\n console.log('Hello');\n}</code></pre>"
        soup = BeautifulSoup('<pre><code>function hello() {\n console.log(\'Hello\');\n}</code></pre>', 'html.parser')
        element = soup.find('pre')

        result = preformatted_handler(element, self.base_context)

        self.assertIsInstance(result, CodeBlock)

        # Should have lines (from original test expectation)
        lines = list(result.lines())
        self.assertGreater(len(lines), 0)

    def test_unordered_list_handler(self):
        """Test unordered list handler - adapted from test_unordered_list."""
        # From: "<ul><li>First item</li><li>Second item</li><li>Third item</li></ul>"
        soup = BeautifulSoup('<ul><li>First item</li><li>Second item</li><li>Third item</li></ul>', 'html.parser')
        element = soup.find('ul')

        result = unordered_list_handler(element, self.base_context)

        self.assertIsInstance(result, HList)
        self.assertEqual(result.style, ListStyle.UNORDERED)

        # Should match original test expectations
        items = list(result.items())
        self.assertEqual(len(items), 3)

    def test_ordered_list_handler(self):
        """Test ordered list handler - adapted from test_ordered_list."""
        # From: "<ol><li>First item</li><li>Second item</li><li>Third item</li></ol>"
        soup = BeautifulSoup('<ol><li>First item</li><li>Second item</li><li>Third item</li></ol>', 'html.parser')
        element = soup.find('ol')

        result = ordered_list_handler(element, self.base_context)

        self.assertIsInstance(result, HList)
        self.assertEqual(result.style, ListStyle.ORDERED)

        # Should match original test expectations
        items = list(result.items())
        self.assertEqual(len(items), 3)  # "First item", "Second item", "Third item"

    def test_list_item_handler(self):
        """Test list item handler."""
        soup = BeautifulSoup('<li>List item content</li>', 'html.parser')
        element = soup.find('li')

        result = list_item_handler(element, self.base_context)

        self.assertIsInstance(result, ListItem)
        blocks = list(result.blocks())
        self.assertGreater(len(blocks), 0)

    def test_table_handler(self):
        """Test table handler - adapted from test_table_basic."""
        # From test_table_basic structure
        soup = BeautifulSoup('''
        <table>
            <tr>
                <th>Header 1</th>
                <th>Header 2</th>
            </tr>
            <tr>
                <td>Cell 1</td>
                <td>Cell 2</td>
            </tr>
        </table>
        ''', 'html.parser')
        element = soup.find('table')

        result = table_handler(element, self.base_context)

        self.assertIsInstance(result, Table)

    def test_table_row_handler(self):
        """Test table row handler."""
        soup = BeautifulSoup('<tr><td>Cell 1</td><td>Cell 2</td></tr>', 'html.parser')
        element = soup.find('tr')

        result = table_row_handler(element, self.base_context)

        self.assertIsInstance(result, TableRow)

    def test_table_cell_handler(self):
        """Test table cell handler."""
        soup = BeautifulSoup('<td>Cell content</td>', 'html.parser')
        element = soup.find('td')

        # Apply styling to get attributes
        styled_context = apply_element_styling(self.base_context, element)
        result = table_cell_handler(element, styled_context)

        self.assertIsInstance(result, TableCell)
        self.assertFalse(result.is_header)

    def test_table_header_cell_handler(self):
        """Test table header cell handler."""
        soup = BeautifulSoup('<th>Header content</th>', 'html.parser')
        element = soup.find('th')

        # Apply styling to get attributes
        styled_context = apply_element_styling(self.base_context, element)
        result = table_header_cell_handler(element, styled_context)

        self.assertIsInstance(result, TableCell)
        self.assertTrue(result.is_header)

    def test_horizontal_rule_handler(self):
        """Test horizontal rule handler."""
        soup = BeautifulSoup('<hr>', 'html.parser')
        element = soup.find('hr')

        result = horizontal_rule_handler(element, self.base_context)

        self.assertIsInstance(result, HorizontalRule)

    def test_image_handler(self):
        """Test image handler."""
        soup = BeautifulSoup('<img src="test.jpg" alt="Test image" width="100" height="50">', 'html.parser')
        element = soup.find('img')

        # Need to apply styling first to get attributes
        styled_context = apply_element_styling(self.base_context, element)
        result = image_handler(element, styled_context)

        self.assertIsInstance(result, Image)
        self.assertEqual(result.source, "test.jpg")
        self.assertEqual(result.alt_text, "Test image")
        self.assertEqual(result.width, 100)
        self.assertEqual(result.height, 50)

    def test_div_handler_container(self):
        """Test div handler - adapted from test_div_container."""
        # From: "<div><p>First paragraph.</p><p>Second paragraph.</p></div>"
        soup = BeautifulSoup('<div><p>First paragraph.</p><p>Second paragraph.</p></div>', 'html.parser')
        element = soup.find('div')

        result = div_handler(element, self.base_context)

        self.assertIsInstance(result, list)
        # Should match original test expectations
        self.assertEqual(len(result), 2)
        self.assertIsInstance(result[0], Paragraph)
        self.assertIsInstance(result[1], Paragraph)
|
|
|
|
|
|
class TestStyledContentHandling(unittest.TestCase):
    """Test styled content handling using patterns from existing tests."""

    def setUp(self):
        """Create a fresh base style context for every test."""
        self.base_context = create_base_context()

    def test_paragraph_with_bold_content(self):
        """Test paragraph with bold content - adapted from test_bold_text."""
        # From: "<p>This is <strong>bold text</strong> in a paragraph.</p>"
        soup = BeautifulSoup('<p>This is <strong>bold text</strong> in a paragraph.</p>', 'html.parser')
        element = soup.find('p')

        result = paragraph_handler(element, self.base_context)

        self.assertIsInstance(result, Paragraph)
        words = list(result.words_iter())
        # The length check also guards the fixed-index lookups below.
        self.assertEqual(len(words), 7)  # From original test expectation

        # Check that 'bold' and 'text' words have bold font weight (from original test)
        bold_word = words[2][1]  # 'bold'
        text_word = words[3][1]  # 'text'
        self.assertEqual(bold_word.text, "bold")
        self.assertEqual(bold_word.style.weight, FontWeight.BOLD)
        self.assertEqual(text_word.text, "text")
        self.assertEqual(text_word.style.weight, FontWeight.BOLD)

        # Check that other words are not bold (from original test)
        normal_word = words[0][1]  # 'This'
        self.assertEqual(normal_word.text, "This")
        self.assertNotEqual(normal_word.style.weight, FontWeight.BOLD)

    def test_paragraph_with_mixed_formatting(self):
        """Test mixed formatting - adapted from test_mixed_formatting."""
        # From: "<p>This paragraph contains <strong>bold</strong>, <em>italic</em>, <span style=\"color: blue;\">blue</span>..."
        soup = BeautifulSoup('<p>This paragraph contains <strong>bold</strong>, <em>italic</em>, <span style="color: blue;">blue</span> text.</p>', 'html.parser')
        element = soup.find('p')

        result = paragraph_handler(element, self.base_context)

        self.assertIsInstance(result, Paragraph)
        words = list(result.words_iter())

        # Check for bold word (from original test pattern)
        bold_words = [w for _, w in words if w.style.weight == FontWeight.BOLD]
        self.assertGreater(len(bold_words), 0, "Should have bold words")

        # Check for italic word (from original test pattern)
        italic_words = [w for _, w in words if w.style.style == FontStyle.ITALIC]
        self.assertGreater(len(italic_words), 0, "Should have italic words")

        # Check for blue colored word (from original test pattern)
        blue_words = [w for _, w in words if w.style.colour == (0, 0, 255)]
        self.assertGreater(len(blue_words), 0, "Should have blue colored words")
|
|
|
|
|
|
# Run the suite only when this file is executed directly, never on import.
if __name__ == '__main__':
    unittest.main()
|