"""
Unit tests for HTML content reading.

Tests the HTMLContentReader class for parsing complete HTML documents.
This is more of an integration test covering the entire parsing pipeline.
"""

import unittest

from pyWebLayout.io.readers.html_content import HTMLContentReader
from pyWebLayout.abstract.document import Document
from pyWebLayout.abstract.block import (
    Parapgraph, Heading, HeadingLevel, HList, ListStyle,
    Table, Quote, CodeBlock, HorizontalRule, LineBreak
)


class TestHTMLContentReader(unittest.TestCase):
    """Test cases for HTMLContentReader.

    Each test feeds an HTML snippet to ``extract_content`` and inspects the
    blocks that end up in ``self.document.blocks``.

    NOTE(review): the HTML fixtures were reconstructed from the assertions
    each test makes (block types, word lists, list styles, header flags);
    confirm they match the intended markup.
    """

    def setUp(self):
        """Create a fresh reader and an empty document for every test."""
        self.reader = HTMLContentReader()
        self.document = Document()

    def test_simple_paragraph(self):
        """Test parsing a simple paragraph."""
        html = '<p>Hello world!</p>'
        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 1)
        self.assertIsInstance(self.document.blocks[0], Parapgraph)

        paragraph = self.document.blocks[0]
        # words() yields tuples; the word object sits at index 1.
        words = list(paragraph.words())
        self.assertEqual(len(words), 2)
        self.assertEqual(words[0][1].text, "Hello")
        self.assertEqual(words[1][1].text, "world!")

    def test_headings(self):
        """Test parsing different heading levels."""
        html = '''
        <h1>Heading 1</h1>
        <h2>Heading 2</h2>
        <h3>Heading 3</h3>
        <h6>Heading 6</h6>
        '''
        self.reader.extract_content(html, self.document)

        # Should have 4 heading blocks
        headings = [block for block in self.document.blocks
                    if isinstance(block, Heading)]
        self.assertEqual(len(headings), 4)

        # Check heading levels
        self.assertEqual(headings[0].level, HeadingLevel.H1)
        self.assertEqual(headings[1].level, HeadingLevel.H2)
        self.assertEqual(headings[2].level, HeadingLevel.H3)
        self.assertEqual(headings[3].level, HeadingLevel.H6)

        # Check text content
        h1_words = list(headings[0].words())
        self.assertEqual(len(h1_words), 2)
        self.assertEqual(h1_words[0][1].text, "Heading")
        self.assertEqual(h1_words[1][1].text, "1")

    def test_styled_text(self):
        """Test parsing text with inline styling."""
        html = '<p>This is <strong>bold</strong> and <em>italic</em> text.</p>'
        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 1)
        paragraph = self.document.blocks[0]
        words = list(paragraph.words())

        # Should have words: "This", "is", "bold", "and", "italic", "text."
        self.assertEqual(len(words), 6)

        # The styling information is embedded in the Font objects.
        # We can't easily test the exact styling without more complex setup,
        # but we can verify the words are created correctly.
        word_texts = [word[1].text for word in words]
        self.assertEqual(
            word_texts,
            ["This", "is", "bold", "and", "italic", "text."],
        )

    def test_unordered_list(self):
        """Test parsing unordered lists."""
        html = '''
        <ul>
            <li>First item</li>
            <li>Second item</li>
            <li>Third item</li>
        </ul>
        '''
        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 1)
        self.assertIsInstance(self.document.blocks[0], HList)

        list_block = self.document.blocks[0]
        self.assertEqual(list_block.style, ListStyle.UNORDERED)

        items = list(list_block.items())
        self.assertEqual(len(items), 3)

        # Check first item content
        first_item_blocks = list(items[0].blocks())
        self.assertEqual(len(first_item_blocks), 1)
        self.assertIsInstance(first_item_blocks[0], Parapgraph)

    def test_ordered_list(self):
        """Test parsing ordered lists."""
        html = '''
        <ol>
            <li>First step</li>
            <li>Second step</li>
        </ol>
        '''
        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 1)
        list_block = self.document.blocks[0]
        self.assertEqual(list_block.style, ListStyle.ORDERED)

        items = list(list_block.items())
        self.assertEqual(len(items), 2)

    def test_definition_list(self):
        """Test parsing definition lists."""
        html = '''
        <dl>
            <dt>Term 1</dt>
            <dd>Definition 1</dd>
            <dt>Term 2</dt>
            <dd>Definition 2</dd>
        </dl>
        '''
        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 1)
        list_block = self.document.blocks[0]
        self.assertEqual(list_block.style, ListStyle.DEFINITION)

        items = list(list_block.items())
        self.assertEqual(len(items), 2)  # Two dt/dd pairs

    def test_table(self):
        """Test parsing simple tables."""
        html = '''
        <table>
            <tr>
                <th>Header 1</th>
                <th>Header 2</th>
            </tr>
            <tr>
                <td>Cell 1</td>
                <td>Cell 2</td>
            </tr>
        </table>
        '''
        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 1)
        self.assertIsInstance(self.document.blocks[0], Table)

        table = self.document.blocks[0]

        # Check body rows
        body_rows = list(table.body_rows())
        self.assertEqual(len(body_rows), 2)  # Header row + data row

        # Check first row (header)
        first_row_cells = list(body_rows[0].cells())
        self.assertEqual(len(first_row_cells), 2)
        self.assertTrue(first_row_cells[0].is_header)
        self.assertTrue(first_row_cells[1].is_header)

        # Check second row (data)
        second_row_cells = list(body_rows[1].cells())
        self.assertEqual(len(second_row_cells), 2)
        self.assertFalse(second_row_cells[0].is_header)
        self.assertFalse(second_row_cells[1].is_header)

    def test_blockquote(self):
        """Test parsing blockquotes."""
        html = '''
        <blockquote>
            <p>This is a quoted paragraph.</p>
            <p>Another quoted paragraph.</p>
        </blockquote>
        '''
        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 1)
        self.assertIsInstance(self.document.blocks[0], Quote)

        quote = self.document.blocks[0]
        quote_blocks = list(quote.blocks())
        self.assertEqual(len(quote_blocks), 2)
        self.assertIsInstance(quote_blocks[0], Parapgraph)
        self.assertIsInstance(quote_blocks[1], Parapgraph)

    def test_code_block(self):
        """Test parsing code blocks."""
        html = '''
        <pre><code class="language-python">
def hello():
    print("Hello, world!")
        </code></pre>
        '''
        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 1)
        self.assertIsInstance(self.document.blocks[0], CodeBlock)

        code_block = self.document.blocks[0]
        self.assertEqual(code_block.language, "python")

    def test_horizontal_rule(self):
        """Test parsing horizontal rules."""
        html = '<p>Before</p><hr><p>After</p>'
        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 3)
        self.assertIsInstance(self.document.blocks[0], Parapgraph)
        self.assertIsInstance(self.document.blocks[1], HorizontalRule)
        self.assertIsInstance(self.document.blocks[2], Parapgraph)

    def test_html_entities(self):
        """Test handling HTML entities."""
        # The fixture uses entity references so that decoding is exercised;
        # the assertions below look for the decoded characters.
        html = '<p>Less than: &lt; Greater than: &gt; Ampersand: &amp;</p>'
        self.reader.extract_content(html, self.document)

        paragraph = self.document.blocks[0]
        words = list(paragraph.words())

        # Find the entity words
        word_texts = [word[1].text for word in words]
        self.assertIn('<', word_texts)
        self.assertIn('>', word_texts)
        self.assertIn('&', word_texts)

    def test_nested_elements(self):
        """Test parsing nested HTML elements."""
        html = '''
        <div>
            <h2>Section Title</h2>
            <p>Section content with <strong>important</strong> text.</p>
            <ul>
                <li>List item 1</li>
                <li>List item 2</li>
            </ul>
        </div>
        '''
        self.reader.extract_content(html, self.document)

        # Should have multiple blocks
        self.assertGreater(len(self.document.blocks), 1)

        # Check that we have different types of blocks.
        # Compared by class name, so the strings must match the names
        # exported by pyWebLayout.abstract.block exactly.
        block_types = [type(block).__name__ for block in self.document.blocks]
        self.assertIn('Parapgraph', block_types)  # Content nested in the div
        self.assertIn('Heading', block_types)
        self.assertIn('HList', block_types)

    def test_empty_elements(self):
        """Test handling empty HTML elements."""
        html = '<p></p><div></div><ul></ul>'
        self.reader.extract_content(html, self.document)

        # Empty elements should still create blocks
        self.assertEqual(len(self.document.blocks), 3)

    def test_whitespace_handling(self):
        """Test proper whitespace handling."""
        html = '''
        <p>
            Word1    Word2
            Word3
        </p>
        '''
        self.reader.extract_content(html, self.document)

        paragraph = self.document.blocks[0]
        words = list(paragraph.words())

        # Should normalize whitespace and create separate words
        word_texts = [word[1].text for word in words]
        self.assertEqual(word_texts, ["Word1", "Word2", "Word3"])

    def test_base_url_setting(self):
        """Test setting base URL for link resolution."""
        base_url = "https://example.com/path/"
        self.reader.set_base_url(base_url)

        # The base URL should be passed to the inline handler
        self.assertEqual(self.reader.inline_handler.base_url, base_url)

    def test_complex_document(self):
        """Test parsing a complex HTML document."""
        html = '''
        <html>
        <head>
            <title>Test Document</title>
        </head>
        <body>
            <h1>Main Title</h1>
            <p>Introduction paragraph with <em>emphasis</em>.</p>

            <h2>Section 1</h2>
            <p>Content with <a href="/link">a link</a>.</p>

            <ul>
                <li>Item 1</li>
                <li>Item 2 with <strong>bold text</strong></li>
            </ul>

            <h2>Section 2</h2>
            <blockquote>
                <p>A quoted paragraph.</p>
            </blockquote>

            <table>
                <tr><th>Col1</th><th>Col2</th></tr>
                <tr><td>A</td><td>B</td></tr>
            </table>
        </body>
        </html>
        '''
        self.reader.extract_content(html, self.document)

        # Should have parsed multiple blocks
        self.assertGreater(len(self.document.blocks), 5)

        # Should have different types of content
        block_types = set(type(block).__name__
                          for block in self.document.blocks)
        expected_types = {'Heading', 'Parapgraph', 'HList', 'Quote', 'Table'}
        self.assertTrue(expected_types.issubset(block_types))


if __name__ == '__main__':
    unittest.main()