"""
Unit tests for individual HTML extraction functions.

Tests the specific handler functions and utility functions in html_extraction module,
reusing test patterns from test_html_extraction.py that are known to pass.

Each test builds a minimal HTML fragment with BeautifulSoup, locates the element
under test with ``soup.find(...)`` and passes it straight to the handler or
utility function, so a failure points at a single function rather than at the
whole extraction pipeline.
"""

import unittest
from bs4 import BeautifulSoup, Tag
from pyWebLayout.io.readers.html_extraction import (
    create_base_context,
    apply_element_styling,
    parse_inline_styles,
    apply_element_font_styles,
    extract_text_content,
    paragraph_handler,
    div_handler,
    heading_handler,
    blockquote_handler,
    preformatted_handler,
    unordered_list_handler,
    ordered_list_handler,
    list_item_handler,
    table_handler,
    table_row_handler,
    table_cell_handler,
    table_header_cell_handler,
    horizontal_rule_handler,
    image_handler,
    StyleContext,
)
from pyWebLayout.abstract.block import (
    Paragraph,
    Heading,
    HeadingLevel,
    Quote,
    CodeBlock,
    HList,
    ListItem,
    ListStyle,
    Table,
    TableRow,
    TableCell,
    HorizontalRule,
    Image,
)
from pyWebLayout.abstract.inline import Word
from pyWebLayout.style import Font, FontWeight, FontStyle, TextDecoration


class TestUtilityFunctions(unittest.TestCase):
    """Test cases for utility functions."""

    def test_create_base_context(self):
        """Test creation of base style context."""
        context = create_base_context()

        self.assertIsInstance(context, StyleContext)
        self.assertIsInstance(context.font, Font)
        self.assertIsNone(context.background)
        self.assertEqual(context.css_classes, set())
        self.assertEqual(context.css_styles, {})
        self.assertEqual(context.element_attributes, {})
        self.assertEqual(context.parent_elements, [])

    def test_parse_inline_styles_from_existing_tests(self):
        """Test parsing CSS inline styles - adapted from test_span_with_inline_styles."""
        # From: '<span style="color: red; font-weight: bold;">this part is red and bold</span>'
        style_text = "color: red; font-weight: bold;"
        styles = parse_inline_styles(style_text)

        expected = {
            "color": "red",
            "font-weight": "bold"
        }
        self.assertEqual(styles, expected)

    def test_parse_inline_styles_color_variations(self):
        """Test parsing different color formats - adapted from test_color_variations."""
        # Test hex color parsing
        hex_style = "color: #ff0000;"
        styles = parse_inline_styles(hex_style)
        self.assertEqual(styles.get("color"), "#ff0000")

        # Test named color parsing
        named_style = "color: green;"
        styles = parse_inline_styles(named_style)
        self.assertEqual(styles.get("color"), "green")

    def test_apply_element_font_styles_bold_elements(self):
        """Test font style application for bold elements - adapted from test_bold_text."""
        base_font = Font()

        # Test <strong> tag - from "<strong>bold text</strong>"
        font = apply_element_font_styles(base_font, "strong", {})
        self.assertEqual(font.weight, FontWeight.BOLD)

        # Test <b> tag
        font = apply_element_font_styles(base_font, "b", {})
        self.assertEqual(font.weight, FontWeight.BOLD)

    def test_apply_element_font_styles_italic_elements(self):
        """Test font style application for italic elements - adapted from test_italic_text."""
        base_font = Font()

        # Test <em> tag - from "<em>italic text</em>"
        font = apply_element_font_styles(base_font, "em", {})
        self.assertEqual(font.style, FontStyle.ITALIC)

        # Test <i> tag
        font = apply_element_font_styles(base_font, "i", {})
        self.assertEqual(font.style, FontStyle.ITALIC)

    def test_apply_element_font_styles_decoration_elements(self):
        """Test font decoration - adapted from test_underlined_text and test_strikethrough_text."""
        base_font = Font()

        # Test <u> tag - from "<u>underlined text</u>"
        font = apply_element_font_styles(base_font, "u", {})
        self.assertEqual(font.decoration, TextDecoration.UNDERLINE)

        # Test <s> tag - from "<s>strikethrough text</s>"
        font = apply_element_font_styles(base_font, "s", {})
        self.assertEqual(font.decoration, TextDecoration.STRIKETHROUGH)

        # Test <del> tag
        font = apply_element_font_styles(base_font, "del", {})
        self.assertEqual(font.decoration, TextDecoration.STRIKETHROUGH)

    def test_apply_element_font_styles_headings(self):
        """Test heading font styles - adapted from test_headings."""
        base_font = Font()

        # Test heading sizes and weights - from test_headings which tests h1-h6
        headings = [("h1", 24), ("h2", 20), ("h3", 18), ("h4", 16), ("h5", 14), ("h6", 12)]

        for tag, expected_size in headings:
            # subTest keeps the loop running so every failing level is reported,
            # not just the first one.
            with self.subTest(tag=tag):
                font = apply_element_font_styles(base_font, tag, {})
                self.assertEqual(font.font_size, expected_size, f"Size mismatch for {tag}")
                self.assertEqual(font.weight, FontWeight.BOLD, f"Weight should be bold for {tag}")

    def test_apply_element_font_styles_color_parsing(self):
        """Test color parsing - adapted from test_color_variations."""
        base_font = Font()

        # Test named colors - from '<span style="color: green;">Named green</span>'
        css_styles = {"color": "green"}
        font = apply_element_font_styles(base_font, "span", css_styles)
        self.assertEqual(font.colour, (0, 255, 0))

        # Test hex colors - from '<span style="color: #ff0000;">Hex red</span>'
        css_styles = {"color": "#ff0000"}
        font = apply_element_font_styles(base_font, "span", css_styles)
        self.assertEqual(font.colour, (255, 0, 0))

    def test_apply_element_styling_with_classes_and_styles(self):
        """Test complete element styling - adapted from test_span_with_inline_styles."""
        # From: '<span style="color: red; font-weight: bold;">this part is red and bold</span>'
        # The class attribute is added here so css_classes can be checked as well.
        soup = BeautifulSoup(
            '<span class="highlight" style="color: red; font-weight: bold;">text</span>',
            'html.parser',
        )
        element = soup.find('span')
        base_context = create_base_context()

        styled_context = apply_element_styling(base_context, element)

        # Check CSS classes
        self.assertIn("highlight", styled_context.css_classes)

        # Check CSS styles
        self.assertEqual(styled_context.css_styles.get("color"), "red")
        self.assertEqual(styled_context.css_styles.get("font-weight"), "bold")

        # Check font styling
        self.assertEqual(styled_context.font.colour, (255, 0, 0))
        self.assertEqual(styled_context.font.weight, FontWeight.BOLD)


class TestExtractTextContent(unittest.TestCase):
    """Test cases for text content extraction."""

    def setUp(self):
        """Set up test fixtures."""
        self.base_context = create_base_context()

    def test_extract_simple_text(self):
        """Test extracting simple text - adapted from test_simple."""
        # From: "<p>This is a paragraph.</p>"
        soup = BeautifulSoup('<p>This is a paragraph.</p>', 'html.parser')
        element = soup.find('p')

        words = extract_text_content(element, self.base_context)

        # Should match the expected word count from original test
        self.assertEqual(len(words), 4)  # "This", "is", "a", "paragraph."
        self.assertIsInstance(words[0], Word)
        self.assertEqual(words[0].text, "This")

    def test_extract_styled_text_bold(self):
        """Test extracting bold styled text - adapted from test_bold_text."""
        # From: "<p>This is <strong>bold text</strong> in a paragraph.</p>"
        soup = BeautifulSoup(
            '<span>This is <strong>bold text</strong> in a paragraph.</span>',
            'html.parser',
        )
        element = soup.find('span')

        words = extract_text_content(element, self.base_context)

        # Find the bold words
        bold_words = [w for w in words if w.style.weight == FontWeight.BOLD]
        self.assertGreater(len(bold_words), 0, "Should have bold words")

        # Check specific words are bold (from original test expectations)
        bold_word_texts = [w.text for w in bold_words]
        self.assertIn("bold", bold_word_texts)
        self.assertIn("text", bold_word_texts)

    def test_extract_nested_formatting(self):
        """Test nested formatting - adapted from test_nested_formatting."""
        # From: "<p>This has <strong>bold with <em>italic inside</em></strong> formatting.</p>"
        soup = BeautifulSoup(
            '<span>This has <strong>bold with <em>italic inside</em></strong> formatting.</span>',
            'html.parser',
        )
        element = soup.find('span')

        words = extract_text_content(element, self.base_context)

        # Find words that should be both bold and italic
        bold_italic_words = [w for w in words
                             if w.style.weight == FontWeight.BOLD and w.style.style == FontStyle.ITALIC]
        self.assertGreater(len(bold_italic_words), 0, "Should have words that are both bold and italic")


class TestHandlerFunctions(unittest.TestCase):
    """Test cases for HTML element handler functions using known working patterns."""

    def setUp(self):
        """Set up test fixtures."""
        self.base_context = create_base_context()

    def test_paragraph_handler_simple(self):
        """Test paragraph handler - adapted from test_simple."""
        # From: "<p>This is a paragraph.</p>"
        soup = BeautifulSoup('<p>This is a paragraph.</p>', 'html.parser')
        element = soup.find('p')

        result = paragraph_handler(element, self.base_context)

        self.assertIsInstance(result, Paragraph)
        # Should match original test expectations
        self.assertEqual(len(result), 4)  # 4 words

        # words() yields pairs whose second item is the Word; compare all texts
        # at once so a mismatch shows the full sequence in the failure message.
        expected_texts = ["This", "is", "a", "paragraph."]
        actual_texts = [word.text for _, word in result.words()]
        self.assertEqual(actual_texts, expected_texts)

    def test_heading_handler_all_levels(self):
        """Test heading handler - adapted from test_headings."""
        # From: "<h1>Heading 1</h1><h2>Heading 2</h2>..."
        expected_levels = [HeadingLevel.H1, HeadingLevel.H2, HeadingLevel.H3,
                           HeadingLevel.H4, HeadingLevel.H5, HeadingLevel.H6]

        for i, expected_level in enumerate(expected_levels, 1):
            tag = f"h{i}"
            # Report each heading level independently instead of stopping at
            # the first failing one.
            with self.subTest(tag=tag):
                soup = BeautifulSoup(f'<{tag}>Heading {i}</{tag}>', 'html.parser')
                element = soup.find(tag)

                result = heading_handler(element, self.base_context)

                self.assertIsInstance(result, Heading)
                self.assertEqual(result.level, expected_level)

                # Should match original test word expectations
                words = list(result.words())
                self.assertEqual(len(words), 2)  # "Heading" and number
                self.assertEqual(words[0][1].text, "Heading")

    def test_blockquote_handler(self):
        """Test blockquote handler - adapted from test_blockquote."""
        # From: "<blockquote><p>This is a quoted paragraph.</p></blockquote>"
        soup = BeautifulSoup(
            '<blockquote><p>This is a quoted paragraph.</p></blockquote>',
            'html.parser',
        )
        element = soup.find('blockquote')

        result = blockquote_handler(element, self.base_context)

        self.assertIsInstance(result, Quote)

        # Check that the quote contains a paragraph (from original test)
        quote_blocks = list(result.blocks())
        self.assertEqual(len(quote_blocks), 1)
        self.assertIsInstance(quote_blocks[0], Paragraph)

    def test_preformatted_handler(self):
        """Test preformatted handler - adapted from test_preformatted_code."""
        # From: "<pre><code>function hello() {\n  console.log('Hello');\n}</code></pre>"
        # The \n escapes are real newlines in the fixture, giving the handler
        # multi-line content to split.
        soup = BeautifulSoup(
            '<pre><code>function hello() {\n  console.log(\'Hello\');\n}</code></pre>',
            'html.parser',
        )
        element = soup.find('pre')

        result = preformatted_handler(element, self.base_context)

        self.assertIsInstance(result, CodeBlock)

        # Should have lines (from original test expectation)
        lines = list(result.lines())
        self.assertGreater(len(lines), 0)

    def test_unordered_list_handler(self):
        """Test unordered list handler - adapted from test_unordered_list."""
        # From: "<ul><li>First item</li><li>Second item</li><li>Third item</li></ul>"
        soup = BeautifulSoup(
            '<ul><li>First item</li><li>Second item</li><li>Third item</li></ul>',
            'html.parser',
        )
        element = soup.find('ul')

        result = unordered_list_handler(element, self.base_context)

        self.assertIsInstance(result, HList)
        self.assertEqual(result.style, ListStyle.UNORDERED)

        # Should match original test expectations
        items = list(result.items())
        self.assertEqual(len(items), 3)

    def test_ordered_list_handler(self):
        """Test ordered list handler - adapted from test_ordered_list."""
        # From: "<ol><li>First item</li><li>Second item</li><li>Third item</li></ol>"
        soup = BeautifulSoup(
            '<ol><li>First item</li><li>Second item</li><li>Third item</li></ol>',
            'html.parser',
        )
        element = soup.find('ol')

        result = ordered_list_handler(element, self.base_context)

        self.assertIsInstance(result, HList)
        self.assertEqual(result.style, ListStyle.ORDERED)

        # Should match original test expectations
        items = list(result.items())
        self.assertEqual(len(items), 3)  # "First item", "Second item", "Third item"

    def test_list_item_handler(self):
        """Test list item handler."""
        soup = BeautifulSoup('<li>List item content</li>', 'html.parser')
        element = soup.find('li')

        result = list_item_handler(element, self.base_context)

        self.assertIsInstance(result, ListItem)
        blocks = list(result.blocks())
        self.assertGreater(len(blocks), 0)

    def test_table_handler(self):
        """Test table handler - adapted from test_table_basic."""
        # From test_table_basic structure
        soup = BeautifulSoup('''
        <table>
            <thead>
                <tr>
                    <th>Header 1</th><th>Header 2</th>
                </tr>
            </thead>
            <tbody>
                <tr>
                    <td>Cell 1</td><td>Cell 2</td>
                </tr>
            </tbody>
        </table>
        ''', 'html.parser')
        element = soup.find('table')

        result = table_handler(element, self.base_context)

        self.assertIsInstance(result, Table)

    def test_table_row_handler(self):
        """Test table row handler."""
        soup = BeautifulSoup('<tr><td>Cell 1</td><td>Cell 2</td></tr>', 'html.parser')
        element = soup.find('tr')

        result = table_row_handler(element, self.base_context)

        self.assertIsInstance(result, TableRow)

    def test_table_cell_handler(self):
        """Test table cell handler."""
        soup = BeautifulSoup('<td>Cell content</td>', 'html.parser')
        element = soup.find('td')

        # Apply styling to get attributes
        styled_context = apply_element_styling(self.base_context, element)
        result = table_cell_handler(element, styled_context)

        self.assertIsInstance(result, TableCell)
        self.assertEqual(result.is_header, False)

    def test_table_header_cell_handler(self):
        """Test table header cell handler."""
        soup = BeautifulSoup('<th>Header content</th>', 'html.parser')
        element = soup.find('th')

        # Apply styling to get attributes
        styled_context = apply_element_styling(self.base_context, element)
        result = table_header_cell_handler(element, styled_context)

        self.assertIsInstance(result, TableCell)
        self.assertEqual(result.is_header, True)

    def test_horizontal_rule_handler(self):
        """Test horizontal rule handler."""
        soup = BeautifulSoup('<hr>', 'html.parser')
        element = soup.find('hr')

        result = horizontal_rule_handler(element, self.base_context)

        self.assertIsInstance(result, HorizontalRule)

    def test_image_handler(self):
        """Test image handler."""
        soup = BeautifulSoup(
            '<img src="test.jpg" alt="Test image" width="100" height="50">',
            'html.parser',
        )
        element = soup.find('img')

        # Need to apply styling first to get attributes
        styled_context = apply_element_styling(self.base_context, element)
        result = image_handler(element, styled_context)

        self.assertIsInstance(result, Image)
        self.assertEqual(result.source, "test.jpg")
        self.assertEqual(result.alt_text, "Test image")
        self.assertEqual(result.width, 100)
        self.assertEqual(result.height, 50)

    def test_div_handler_container(self):
        """Test div handler - adapted from test_div_container."""
        # From: "<div><p>First paragraph.</p><p>Second paragraph.</p></div>"
        soup = BeautifulSoup(
            '<div><p>First paragraph.</p><p>Second paragraph.</p></div>',
            'html.parser',
        )
        element = soup.find('div')

        result = div_handler(element, self.base_context)

        self.assertIsInstance(result, list)
        # Should match original test expectations
        self.assertEqual(len(result), 2)
        self.assertIsInstance(result[0], Paragraph)
        self.assertIsInstance(result[1], Paragraph)


class TestStyledContentHandling(unittest.TestCase):
    """Test styled content handling using patterns from existing tests."""

    def setUp(self):
        """Set up test fixtures."""
        self.base_context = create_base_context()

    def test_paragraph_with_bold_content(self):
        """Test paragraph with bold content - adapted from test_bold_text."""
        # From: "<p>This is <strong>bold text</strong> in a paragraph.</p>"
        soup = BeautifulSoup(
            '<p>This is <strong>bold text</strong> in a paragraph.</p>',
            'html.parser',
        )
        element = soup.find('p')

        result = paragraph_handler(element, self.base_context)

        self.assertIsInstance(result, Paragraph)
        words = list(result.words())
        self.assertEqual(len(words), 7)  # From original test expectation

        # Check that 'bold' and 'text' words have bold font weight (from original test)
        bold_word = words[2][1]  # 'bold'
        text_word = words[3][1]  # 'text'
        self.assertEqual(bold_word.text, "bold")
        self.assertEqual(bold_word.style.weight, FontWeight.BOLD)
        self.assertEqual(text_word.text, "text")
        self.assertEqual(text_word.style.weight, FontWeight.BOLD)

        # Check that other words are not bold (from original test)
        normal_word = words[0][1]  # 'This'
        self.assertEqual(normal_word.text, "This")
        self.assertNotEqual(normal_word.style.weight, FontWeight.BOLD)

    def test_paragraph_with_mixed_formatting(self):
        """Test mixed formatting - adapted from test_mixed_formatting."""
        # From: "<p>This paragraph contains <strong>bold</strong>, <em>italic</em>,
        #        <span style="color: blue;">blue</span>..."
        soup = BeautifulSoup(
            '<p>This paragraph contains <strong>bold</strong>, <em>italic</em>, '
            '<span style="color: blue;">blue</span> text.</p>',
            'html.parser',
        )
        element = soup.find('p')

        result = paragraph_handler(element, self.base_context)

        self.assertIsInstance(result, Paragraph)
        words = list(result.words())

        # Check for bold word (from original test pattern)
        bold_words = [w for _, w in words if w.style.weight == FontWeight.BOLD]
        self.assertGreater(len(bold_words), 0, "Should have bold words")

        # Check for italic word (from original test pattern)
        italic_words = [w for _, w in words if w.style.style == FontStyle.ITALIC]
        self.assertGreater(len(italic_words), 0, "Should have italic words")

        # Check for blue colored word (from original test pattern)
        blue_words = [w for _, w in words if w.style.colour == (0, 0, 255)]
        self.assertGreater(len(blue_words), 0, "Should have blue colored words")


if __name__ == '__main__':
    unittest.main()