""" Unit tests for individual HTML extraction functions. Tests the specific handler functions and utility functions in html_extraction module, reusing test patterns from test_html_extraction.py that are known to pass. """ import unittest from bs4 import BeautifulSoup, Tag from pyWebLayout.io.readers.html_extraction import ( create_base_context, apply_element_styling, parse_inline_styles, apply_element_font_styles, extract_text_content, paragraph_handler, div_handler, heading_handler, blockquote_handler, preformatted_handler, unordered_list_handler, ordered_list_handler, list_item_handler, table_handler, table_row_handler, table_cell_handler, table_header_cell_handler, horizontal_rule_handler, image_handler, StyleContext, ) from pyWebLayout.abstract.block import ( Paragraph, Heading, HeadingLevel, Quote, CodeBlock, HList, ListItem, ListStyle, Table, TableRow, TableCell, HorizontalRule, Image, ) from pyWebLayout.abstract.inline import Word from pyWebLayout.style import Font, FontWeight, FontStyle, TextDecoration class TestUtilityFunctions(unittest.TestCase): """Test cases for utility functions.""" def test_create_base_context(self): """Test creation of base style context.""" context = create_base_context() self.assertIsInstance(context, StyleContext) self.assertIsInstance(context.font, Font) self.assertIsNone(context.background) self.assertEqual(context.css_classes, set()) self.assertEqual(context.css_styles, {}) self.assertEqual(context.element_attributes, {}) self.assertEqual(context.parent_elements, []) def test_parse_inline_styles_from_existing_tests(self): """Test parsing CSS inline styles - adapted from test_span_with_inline_styles.""" # From: 'this part is red and bold' style_text = "color: red; font-weight: bold;" styles = parse_inline_styles(style_text) expected = { "color": "red", "font-weight": "bold" } self.assertEqual(styles, expected) def test_parse_inline_styles_color_variations(self): """Test parsing different color formats - adapted from test_color_variations.""" # Test hex color parsing hex_style = "color: #ff0000;" styles = parse_inline_styles(hex_style) self.assertEqual(styles.get("color"), "#ff0000") # Test named color parsing named_style = "color: green;" styles = parse_inline_styles(named_style) self.assertEqual(styles.get("color"), "green") def test_apply_element_font_styles_bold_elements(self): """Test font style application for bold elements - adapted from test_bold_text.""" base_font = Font() # Test tag - from "bold text" font = apply_element_font_styles(base_font, "strong", {}) self.assertEqual(font.weight, FontWeight.BOLD) # Test tag font = apply_element_font_styles(base_font, "b", {}) self.assertEqual(font.weight, FontWeight.BOLD) def test_apply_element_font_styles_italic_elements(self): """Test font style application for italic elements - adapted from test_italic_text.""" base_font = Font() # Test tag - from "italic text" font = apply_element_font_styles(base_font, "em", {}) self.assertEqual(font.style, FontStyle.ITALIC) # Test tag font = apply_element_font_styles(base_font, "i", {}) self.assertEqual(font.style, FontStyle.ITALIC) def test_apply_element_font_styles_decoration_elements(self): """Test font decoration - adapted from test_underlined_text and test_strikethrough_text.""" base_font = Font() # Test tag - from "underlined text" font = apply_element_font_styles(base_font, "u", {}) self.assertEqual(font.decoration, TextDecoration.UNDERLINE) # Test tag - from "strikethrough text" font = apply_element_font_styles(base_font, "s", {}) self.assertEqual(font.decoration, TextDecoration.STRIKETHROUGH) # Test tag font = apply_element_font_styles(base_font, "del", {}) self.assertEqual(font.decoration, TextDecoration.STRIKETHROUGH) def test_apply_element_font_styles_headings(self): """Test heading font styles - adapted from test_headings.""" base_font = Font() # Test heading sizes and weights - from test_headings which tests h1-h6 headings = [("h1", 24), ("h2", 20), ("h3", 18), ("h4", 16), ("h5", 14), ("h6", 12)] for tag, expected_size in headings: font = apply_element_font_styles(base_font, tag, {}) self.assertEqual(font.font_size, expected_size, f"Size mismatch for {tag}") self.assertEqual(font.weight, FontWeight.BOLD, f"Weight should be bold for {tag}") def test_apply_element_font_styles_color_parsing(self): """Test color parsing - adapted from test_color_variations.""" base_font = Font() # Test named colors - from 'Named green' css_styles = {"color": "green"} font = apply_element_font_styles(base_font, "span", css_styles) self.assertEqual(font.colour, (0, 255, 0)) # Test hex colors - from 'Hex red' css_styles = {"color": "#ff0000"} font = apply_element_font_styles(base_font, "span", css_styles) self.assertEqual(font.colour, (255, 0, 0)) def test_apply_element_styling_with_classes_and_styles(self): """Test complete element styling - adapted from test_span_with_inline_styles.""" # From: 'this part is red and bold' soup = BeautifulSoup('text', 'html.parser') element = soup.find('span') base_context = create_base_context() styled_context = apply_element_styling(base_context, element) # Check CSS classes self.assertIn("highlight", styled_context.css_classes) # Check CSS styles self.assertEqual(styled_context.css_styles.get("color"), "red") self.assertEqual(styled_context.css_styles.get("font-weight"), "bold") # Check font styling self.assertEqual(styled_context.font.colour, (255, 0, 0)) self.assertEqual(styled_context.font.weight, FontWeight.BOLD) class TestExtractTextContent(unittest.TestCase): """Test cases for text content extraction.""" def setUp(self): """Set up test fixtures.""" self.base_context = create_base_context() def test_extract_simple_text(self): """Test extracting simple text - adapted from test_simple.""" # From: "

This is a paragraph.

" soup = BeautifulSoup('

This is a paragraph.

', 'html.parser') element = soup.find('p') words = extract_text_content(element, self.base_context) # Should match the expected word count from original test self.assertEqual(len(words), 4) # "This", "is", "a", "paragraph." self.assertIsInstance(words[0], Word) self.assertEqual(words[0].text, "This") def test_extract_styled_text_bold(self): """Test extracting bold styled text - adapted from test_bold_text.""" # From: "

This is bold text in a paragraph.

" soup = BeautifulSoup('This is bold text in a paragraph.', 'html.parser') element = soup.find('span') words = extract_text_content(element, self.base_context) # Find the bold words bold_words = [w for w in words if w.style.weight == FontWeight.BOLD] self.assertGreater(len(bold_words), 0, "Should have bold words") # Check specific words are bold (from original test expectations) bold_word_texts = [w.text for w in bold_words] self.assertIn("bold", bold_word_texts) self.assertIn("text", bold_word_texts) def test_extract_nested_formatting(self): """Test nested formatting - adapted from test_nested_formatting.""" # From: "

This has bold with italic inside formatting.

" soup = BeautifulSoup('This has bold with italic inside formatting.', 'html.parser') element = soup.find('span') words = extract_text_content(element, self.base_context) # Find words that should be both bold and italic bold_italic_words = [w for w in words if w.style.weight == FontWeight.BOLD and w.style.style == FontStyle.ITALIC] self.assertGreater(len(bold_italic_words), 0, "Should have words that are both bold and italic") class TestHandlerFunctions(unittest.TestCase): """Test cases for HTML element handler functions using known working patterns.""" def setUp(self): """Set up test fixtures.""" self.base_context = create_base_context() def test_paragraph_handler_simple(self): """Test paragraph handler - adapted from test_simple.""" # From: "

This is a paragraph.

" soup = BeautifulSoup('

This is a paragraph.

', 'html.parser') element = soup.find('p') result = paragraph_handler(element, self.base_context) self.assertIsInstance(result, Paragraph) # Should match original test expectations self.assertEqual(len(result), 4) # 4 words words = list(result.words_iter()) expected_texts = ["This", "is", "a", "paragraph."] for i, expected_text in enumerate(expected_texts): self.assertEqual(words[i][1].text, expected_text) def test_heading_handler_all_levels(self): """Test heading handler - adapted from test_headings.""" # From: "

Heading 1

Heading 2

..." expected_levels = [HeadingLevel.H1, HeadingLevel.H2, HeadingLevel.H3, HeadingLevel.H4, HeadingLevel.H5, HeadingLevel.H6] for i, expected_level in enumerate(expected_levels, 1): tag = f"h{i}" soup = BeautifulSoup(f'<{tag}>Heading {i}', 'html.parser') element = soup.find(tag) result = heading_handler(element, self.base_context) self.assertIsInstance(result, Heading) self.assertEqual(result.level, expected_level) # Should match original test word expectations words = list(result.words_iter()) self.assertEqual(len(words), 2) # "Heading" and number self.assertEqual(words[0][1].text, "Heading") def test_blockquote_handler(self): """Test blockquote handler - adapted from test_blockquote.""" # From: "

This is a quoted paragraph.

" soup = BeautifulSoup('

This is a quoted paragraph.

', 'html.parser') element = soup.find('blockquote') result = blockquote_handler(element, self.base_context) self.assertIsInstance(result, Quote) # Check that the quote contains a paragraph (from original test) quote_blocks = list(result.blocks()) self.assertEqual(len(quote_blocks), 1) self.assertIsInstance(quote_blocks[0], Paragraph) def test_preformatted_handler(self): """Test preformatted handler - adapted from test_preformatted_code.""" # From: "
function hello() {\n  console.log('Hello');\n}
" soup = BeautifulSoup('
function hello() {\n  console.log(\'Hello\');\n}
', 'html.parser') element = soup.find('pre') result = preformatted_handler(element, self.base_context) self.assertIsInstance(result, CodeBlock) # Should have lines (from original test expectation) lines = list(result.lines()) self.assertGreater(len(lines), 0) def test_unordered_list_handler(self): """Test unordered list handler - adapted from test_unordered_list.""" # From: "
  • First item
  • Second item
  • Third item
" soup = BeautifulSoup('
  • First item
  • Second item
  • Third item
', 'html.parser') element = soup.find('ul') result = unordered_list_handler(element, self.base_context) self.assertIsInstance(result, HList) self.assertEqual(result.style, ListStyle.UNORDERED) # Should match original test expectations items = list(result.items()) self.assertEqual(len(items), 3) def test_ordered_list_handler(self): """Test ordered list handler - adapted from test_ordered_list.""" # From: "
  1. First item
  2. Second item
  3. Third item
" soup = BeautifulSoup('
  1. First item
  2. Second item
  3. Third item
', 'html.parser') element = soup.find('ol') result = ordered_list_handler(element, self.base_context) self.assertIsInstance(result, HList) self.assertEqual(result.style, ListStyle.ORDERED) # Should match original test expectations items = list(result.items()) self.assertEqual(len(items), 3) # "First item", "Second item", "Third item" def test_list_item_handler(self): """Test list item handler.""" soup = BeautifulSoup('
  • List item content
  • ', 'html.parser') element = soup.find('li') result = list_item_handler(element, self.base_context) self.assertIsInstance(result, ListItem) blocks = list(result.blocks()) self.assertGreater(len(blocks), 0) def test_table_handler(self): """Test table handler - adapted from test_table_basic.""" # From test_table_basic structure soup = BeautifulSoup('''
    Header 1 Header 2
    Cell 1 Cell 2
    ''', 'html.parser') element = soup.find('table') result = table_handler(element, self.base_context) self.assertIsInstance(result, Table) def test_table_row_handler(self): """Test table row handler.""" soup = BeautifulSoup('Cell 1Cell 2', 'html.parser') element = soup.find('tr') result = table_row_handler(element, self.base_context) self.assertIsInstance(result, TableRow) def test_table_cell_handler(self): """Test table cell handler.""" soup = BeautifulSoup('Cell content', 'html.parser') element = soup.find('td') # Apply styling to get attributes styled_context = apply_element_styling(self.base_context, element) result = table_cell_handler(element, styled_context) self.assertIsInstance(result, TableCell) self.assertEqual(result.is_header, False) def test_table_header_cell_handler(self): """Test table header cell handler.""" soup = BeautifulSoup('Header content', 'html.parser') element = soup.find('th') # Apply styling to get attributes styled_context = apply_element_styling(self.base_context, element) result = table_header_cell_handler(element, styled_context) self.assertIsInstance(result, TableCell) self.assertEqual(result.is_header, True) def test_horizontal_rule_handler(self): """Test horizontal rule handler.""" soup = BeautifulSoup('
    ', 'html.parser') element = soup.find('hr') result = horizontal_rule_handler(element, self.base_context) self.assertIsInstance(result, HorizontalRule) def test_image_handler(self): """Test image handler.""" soup = BeautifulSoup('Test image', 'html.parser') element = soup.find('img') # Need to apply styling first to get attributes styled_context = apply_element_styling(self.base_context, element) result = image_handler(element, styled_context) self.assertIsInstance(result, Image) self.assertEqual(result.source, "test.jpg") self.assertEqual(result.alt_text, "Test image") self.assertEqual(result.width, 100) self.assertEqual(result.height, 50) def test_div_handler_container(self): """Test div handler - adapted from test_div_container.""" # From: "

    First paragraph.

    Second paragraph.

    " soup = BeautifulSoup('

    First paragraph.

    Second paragraph.

    ', 'html.parser') element = soup.find('div') result = div_handler(element, self.base_context) self.assertIsInstance(result, list) # Should match original test expectations self.assertEqual(len(result), 2) self.assertIsInstance(result[0], Paragraph) self.assertIsInstance(result[1], Paragraph) class TestStyledContentHandling(unittest.TestCase): """Test styled content handling using patterns from existing tests.""" def setUp(self): """Set up test fixtures.""" self.base_context = create_base_context() def test_paragraph_with_bold_content(self): """Test paragraph with bold content - adapted from test_bold_text.""" # From: "

    This is bold text in a paragraph.

    " soup = BeautifulSoup('

    This is bold text in a paragraph.

    ', 'html.parser') element = soup.find('p') result = paragraph_handler(element, self.base_context) self.assertIsInstance(result, Paragraph) words = list(result.words_iter()) self.assertEqual(len(words), 7) # From original test expectation # Check that 'bold' and 'text' words have bold font weight (from original test) bold_word = words[2][1] # 'bold' text_word = words[3][1] # 'text' self.assertEqual(bold_word.text, "bold") self.assertEqual(bold_word.style.weight, FontWeight.BOLD) self.assertEqual(text_word.text, "text") self.assertEqual(text_word.style.weight, FontWeight.BOLD) # Check that other words are not bold (from original test) normal_word = words[0][1] # 'This' self.assertEqual(normal_word.text, "This") self.assertNotEqual(normal_word.style.weight, FontWeight.BOLD) def test_paragraph_with_mixed_formatting(self): """Test mixed formatting - adapted from test_mixed_formatting.""" # From: "

    This paragraph contains bold, italic, blue..." soup = BeautifulSoup('

    This paragraph contains bold, italic, blue text.

    ', 'html.parser') element = soup.find('p') result = paragraph_handler(element, self.base_context) self.assertIsInstance(result, Paragraph) words = list(result.words_iter()) # Check for bold word (from original test pattern) bold_words = [w for _, w in words if w.style.weight == FontWeight.BOLD] self.assertGreater(len(bold_words), 0, "Should have bold words") # Check for italic word (from original test pattern) italic_words = [w for _, w in words if w.style.style == FontStyle.ITALIC] self.assertGreater(len(italic_words), 0, "Should have italic words") # Check for blue colored word (from original test pattern) blue_words = [w for _, w in words if w.style.colour == (0, 0, 255)] self.assertGreater(len(blue_words), 0, "Should have blue colored words") if __name__ == '__main__': unittest.main()