""" Unit tests for HTML content reading. Tests the HTMLContentReader class for parsing complete HTML documents. This is more of an integration test covering the entire parsing pipeline. """ import unittest from pyWebLayout.io.readers.html_content import HTMLContentReader from pyWebLayout.abstract.document import Document from pyWebLayout.abstract.block import ( Parapgraph, Heading, HeadingLevel, HList, ListStyle, Table, Quote, CodeBlock, HorizontalRule, LineBreak ) class TestHTMLContentReader(unittest.TestCase): """Test cases for HTMLContentReader.""" def setUp(self): """Set up test fixtures.""" self.reader = HTMLContentReader() self.document = Document() def test_simple_paragraph(self): """Test parsing a simple paragraph.""" html = '
Hello world!
' result = self.reader.extract_content(html, self.document) self.assertEqual(len(self.document.blocks), 1) self.assertIsInstance(self.document.blocks[0], Parapgraph) paragraph = self.document.blocks[0] words = list(paragraph.words()) self.assertEqual(len(words), 2) self.assertEqual(words[0][1].text, "Hello") self.assertEqual(words[1][1].text, "world!") def test_headings(self): """Test parsing different heading levels.""" html = '''This is bold and italic text.
' self.reader.extract_content(html, self.document) self.assertEqual(len(self.document.blocks), 1) paragraph = self.document.blocks[0] words = list(paragraph.words()) # Should have words: "This", "is", "bold", "and", "italic", "text." self.assertEqual(len(words), 6) # The styling information is embedded in the Font objects # We can't easily test the exact styling without more complex setup # but we can verify the words are created correctly word_texts = [word[1].text for word in words] self.assertEqual(word_texts, ["This", "is", "bold", "and", "italic", "text."]) def test_unordered_list(self): """Test parsing unordered lists.""" html = '''| Header 1 | Header 2 |
|---|---|
| Cell 1 | Cell 2 |
''' self.reader.extract_content(html, self.document) self.assertEqual(len(self.document.blocks), 1) self.assertIsInstance(self.document.blocks[0], Quote) quote = self.document.blocks[0] quote_blocks = list(quote.blocks()) self.assertEqual(len(quote_blocks), 2) self.assertIsInstance(quote_blocks[0], Parapgraph) self.assertIsInstance(quote_blocks[1], Parapgraph) def test_code_block(self): """Test parsing code blocks.""" html = '''This is a quoted paragraph.
Another quoted paragraph.
def hello():
print("Hello, world!")
'''
# NOTE(review): the fixtures above appear to have lost their HTML markup in
# this copy of the file. The list test holds a pipe table, the code-block
# fixture starts with blockquote text, and several test headers are missing.
# TODO restore the original fixtures from version control before relying on
# these tests.
# Parse the fixture into the document created in setUp.
self.reader.extract_content(html, self.document)
# The code-block fixture is expected to yield exactly one top-level block...
self.assertEqual(len(self.document.blocks), 1)
# ...and that block must be a CodeBlock.
self.assertIsInstance(self.document.blocks[0], CodeBlock)
code_block = self.document.blocks[0]
# The block must report "python" as its language. Presumably this is read
# from an attribute in the fixture's markup, which is not visible here --
# confirm against the reader.
self.assertEqual(code_block.language, "python")
def test_horizontal_rule(self):
"""Test parsing horizontal rules."""
html = 'Before
After
' self.reader.extract_content(html, self.document) self.assertEqual(len(self.document.blocks), 3) self.assertIsInstance(self.document.blocks[0], Parapgraph) self.assertIsInstance(self.document.blocks[1], HorizontalRule) self.assertIsInstance(self.document.blocks[2], Parapgraph) def test_html_entities(self): """Test handling HTML entities.""" html = 'Less than: < Greater than: > Ampersand: &
' self.reader.extract_content(html, self.document) paragraph = self.document.blocks[0] words = list(paragraph.words()) # Find the entity words word_texts = [word[1].text for word in words] self.assertIn('<', word_texts) self.assertIn('>', word_texts) self.assertIn('&', word_texts) def test_nested_elements(self): """Test parsing nested HTML elements.""" html = '''Section content with important text.
Word1 Word2 Word3
''' self.reader.extract_content(html, self.document) paragraph = self.document.blocks[0] words = list(paragraph.words()) # Should normalize whitespace and create separate words word_texts = [word[1].text for word in words] self.assertEqual(word_texts, ["Word1", "Word2", "Word3"]) def test_base_url_setting(self): """Test setting base URL for link resolution.""" base_url = "https://example.com/path/" self.reader.set_base_url(base_url) # The base URL should be passed to the inline handler self.assertEqual(self.reader.inline_handler.base_url, base_url) def test_complex_document(self): """Test parsing a complex HTML document.""" html = '''Introduction paragraph with emphasis.
Content with a link.
A quoted paragraph.
| Col1 | Col2 |
|---|---|
| A | B |