# pyWebLayout/tests/test_html_content.py
#
# 355 lines
# 12 KiB
# Python

"""
Unit tests for HTML content reading.
Tests the HTMLContentReader class for parsing complete HTML documents.
This is more of an integration test covering the entire parsing pipeline.
"""
import unittest
from pyWebLayout.io.readers.html_content import HTMLContentReader
from pyWebLayout.abstract.document import Document
from pyWebLayout.abstract.block import (
Parapgraph, Heading, HeadingLevel, HList, ListStyle,
Table, Quote, CodeBlock, HorizontalRule, LineBreak
)
class TestHTMLContentReader(unittest.TestCase):
    """Test cases for HTMLContentReader.

    Every test passes an HTML snippet to ``extract_content`` together with
    the ``Document`` built in ``setUp`` and then inspects
    ``self.document.blocks``.

    Multi-line HTML fixtures are written as adjacent string literals so the
    fixture text itself carries no source-code indentation.
    """

    # NOTE(review): ``Parapgraph`` is spelled exactly as it is imported from
    # pyWebLayout.abstract.block, and the type-name strings used further down
    # rely on the same spelling -- confirm against the library before renaming.

    def setUp(self):
        """Create a new reader and a new document for each test."""
        self.reader = HTMLContentReader()
        self.document = Document()

    @staticmethod
    def _word_texts(block):
        """Return the ``.text`` of every word yielded by ``block.words()``.

        Each entry produced by ``words()`` is indexable; the word object is
        read from position 1 of the entry.
        """
        return [entry[1].text for entry in block.words()]

    def test_simple_paragraph(self):
        """Test parsing a simple paragraph."""
        self.reader.extract_content('<p>Hello world!</p>', self.document)

        blocks = self.document.blocks
        self.assertEqual(len(blocks), 1)
        self.assertIsInstance(blocks[0], Parapgraph)
        # Exactly two words, in document order.
        self.assertEqual(self._word_texts(blocks[0]), ["Hello", "world!"])

    def test_headings(self):
        """Test parsing different heading levels."""
        html = (
            '\n'
            '<h1>Heading 1</h1>\n'
            '<h2>Heading 2</h2>\n'
            '<h3>Heading 3</h3>\n'
            '<h6>Heading 6</h6>\n'
        )
        self.reader.extract_content(html, self.document)

        # Should have 4 heading blocks
        headings = [b for b in self.document.blocks if isinstance(b, Heading)]
        self.assertEqual(len(headings), 4)

        # Check heading levels, in document order
        self.assertEqual(
            [heading.level for heading in headings],
            [HeadingLevel.H1, HeadingLevel.H2, HeadingLevel.H3, HeadingLevel.H6],
        )

        # Check text content of the first heading
        self.assertEqual(self._word_texts(headings[0]), ["Heading", "1"])

    def test_styled_text(self):
        """Test parsing text with inline styling."""
        html = '<p>This is <b>bold</b> and <i>italic</i> text.</p>'
        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 1)
        paragraph = self.document.blocks[0]

        # The styling information is embedded in the Font objects and is not
        # checked here; this test only verifies that the six words are
        # created correctly and in order.
        self.assertEqual(
            self._word_texts(paragraph),
            ["This", "is", "bold", "and", "italic", "text."],
        )

    def test_unordered_list(self):
        """Test parsing unordered lists."""
        html = (
            '\n'
            '<ul>\n'
            '<li>First item</li>\n'
            '<li>Second item</li>\n'
            '<li>Third item</li>\n'
            '</ul>\n'
        )
        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 1)
        list_block = self.document.blocks[0]
        self.assertIsInstance(list_block, HList)
        self.assertEqual(list_block.style, ListStyle.UNORDERED)

        items = list(list_block.items())
        self.assertEqual(len(items), 3)

        # Check first item content: a single paragraph block
        first_item_blocks = list(items[0].blocks())
        self.assertEqual(len(first_item_blocks), 1)
        self.assertIsInstance(first_item_blocks[0], Parapgraph)

    def test_ordered_list(self):
        """Test parsing ordered lists."""
        html = (
            '\n'
            '<ol>\n'
            '<li>First step</li>\n'
            '<li>Second step</li>\n'
            '</ol>\n'
        )
        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 1)
        list_block = self.document.blocks[0]
        self.assertEqual(list_block.style, ListStyle.ORDERED)
        self.assertEqual(len(list(list_block.items())), 2)

    def test_definition_list(self):
        """Test parsing definition lists."""
        html = (
            '\n'
            '<dl>\n'
            '<dt>Term 1</dt>\n'
            '<dd>Definition 1</dd>\n'
            '<dt>Term 2</dt>\n'
            '<dd>Definition 2</dd>\n'
            '</dl>\n'
        )
        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 1)
        list_block = self.document.blocks[0]
        self.assertEqual(list_block.style, ListStyle.DEFINITION)
        # Two dt/dd pairs
        self.assertEqual(len(list(list_block.items())), 2)

    def test_table(self):
        """Test parsing simple tables."""
        html = (
            '\n'
            '<table>\n'
            '<tr>\n'
            '<th>Header 1</th>\n'
            '<th>Header 2</th>\n'
            '</tr>\n'
            '<tr>\n'
            '<td>Cell 1</td>\n'
            '<td>Cell 2</td>\n'
            '</tr>\n'
            '</table>\n'
        )
        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 1)
        table = self.document.blocks[0]
        self.assertIsInstance(table, Table)

        # Check body rows: header row + data row
        body_rows = list(table.body_rows())
        self.assertEqual(len(body_rows), 2)

        # Check first row (header): both cells flagged as header cells
        header_cells = list(body_rows[0].cells())
        self.assertEqual(len(header_cells), 2)
        for cell in header_cells:
            self.assertTrue(cell.is_header)

        # Check second row (data): neither cell flagged as header
        data_cells = list(body_rows[1].cells())
        self.assertEqual(len(data_cells), 2)
        for cell in data_cells:
            self.assertFalse(cell.is_header)

    def test_blockquote(self):
        """Test parsing blockquotes."""
        html = (
            '\n'
            '<blockquote>\n'
            '<p>This is a quoted paragraph.</p>\n'
            '<p>Another quoted paragraph.</p>\n'
            '</blockquote>\n'
        )
        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 1)
        quote = self.document.blocks[0]
        self.assertIsInstance(quote, Quote)

        quote_blocks = list(quote.blocks())
        self.assertEqual(len(quote_blocks), 2)
        for inner in quote_blocks:
            self.assertIsInstance(inner, Parapgraph)

    def test_code_block(self):
        """Test parsing code blocks."""
        html = (
            '\n'
            '<pre><code class="language-python">\n'
            'def hello():\n'
            'print("Hello, world!")\n'
            '</code></pre>\n'
        )
        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 1)
        code_block = self.document.blocks[0]
        self.assertIsInstance(code_block, CodeBlock)
        # Only the language is checked here, not the code text itself.
        self.assertEqual(code_block.language, "python")

    def test_horizontal_rule(self):
        """Test parsing horizontal rules."""
        html = '<p>Before</p><hr><p>After</p>'
        self.reader.extract_content(html, self.document)

        blocks = self.document.blocks
        self.assertEqual(len(blocks), 3)
        self.assertIsInstance(blocks[0], Parapgraph)
        self.assertIsInstance(blocks[1], HorizontalRule)
        self.assertIsInstance(blocks[2], Parapgraph)

    def test_html_entities(self):
        """Test handling HTML entities."""
        html = '<p>Less than: &lt; Greater than: &gt; Ampersand: &amp;</p>'
        self.reader.extract_content(html, self.document)

        # Each entity is expected to show up as a word of its own.
        word_texts = self._word_texts(self.document.blocks[0])
        for symbol in ('<', '>', '&'):
            self.assertIn(symbol, word_texts)

    def test_nested_elements(self):
        """Test parsing nested HTML elements."""
        html = (
            '\n'
            '<div>\n'
            '<h2>Section Title</h2>\n'
            '<p>Section content with <strong>important</strong> text.</p>\n'
            '<ul>\n'
            '<li>List item 1</li>\n'
            '<li>List item 2</li>\n'
            '</ul>\n'
            '</div>\n'
        )
        self.reader.extract_content(html, self.document)

        # Should have multiple blocks
        self.assertGreater(len(self.document.blocks), 1)

        # Check that we have different types of blocks
        block_types = [type(block).__name__ for block in self.document.blocks]
        self.assertIn('Parapgraph', block_types)  # From div
        self.assertIn('Heading', block_types)
        self.assertIn('HList', block_types)

    def test_empty_elements(self):
        """Test handling empty HTML elements."""
        html = '<p></p><div></div><ul></ul>'
        self.reader.extract_content(html, self.document)

        # Empty elements should still create blocks
        self.assertEqual(len(self.document.blocks), 3)

    def test_whitespace_handling(self):
        """Test proper whitespace handling."""
        html = (
            '\n'
            '<p> Word1 Word2\n'
            'Word3 </p>\n'
        )
        self.reader.extract_content(html, self.document)

        # Should normalize whitespace (including the embedded newline) and
        # create separate words
        self.assertEqual(
            self._word_texts(self.document.blocks[0]),
            ["Word1", "Word2", "Word3"],
        )

    def test_base_url_setting(self):
        """Test setting base URL for link resolution."""
        base_url = "https://example.com/path/"
        self.reader.set_base_url(base_url)

        # The base URL should be passed to the inline handler
        self.assertEqual(self.reader.inline_handler.base_url, base_url)

    def test_complex_document(self):
        """Test parsing a complex HTML document."""
        html = (
            '\n'
            '<!DOCTYPE html>\n'
            '<html>\n'
            '<head>\n'
            '<title>Test Document</title>\n'
            '<style>body { font-family: Arial; }</style>\n'
            '</head>\n'
            '<body>\n'
            '<h1>Main Title</h1>\n'
            '<p>Introduction paragraph with <em>emphasis</em>.</p>\n'
            '<h2>Section 1</h2>\n'
            '<p>Content with <a href="link.html">a link</a>.</p>\n'
            '<ul>\n'
            '<li>Item 1</li>\n'
            '<li>Item 2 with <strong>bold text</strong></li>\n'
            '</ul>\n'
            '<h2>Section 2</h2>\n'
            '<blockquote>\n'
            '<p>A quoted paragraph.</p>\n'
            '</blockquote>\n'
            '<table>\n'
            '<tr><th>Col1</th><th>Col2</th></tr>\n'
            '<tr><td>A</td><td>B</td></tr>\n'
            '</table>\n'
            '</body>\n'
            '</html>\n'
        )
        self.reader.extract_content(html, self.document)

        # Should have parsed multiple blocks
        self.assertGreater(len(self.document.blocks), 5)

        # Should have different types of content. Comparing the set
        # difference with an empty set makes a failure list exactly which
        # expected block types are missing.
        block_types = {type(block).__name__ for block in self.document.blocks}
        expected_types = {'Heading', 'Parapgraph', 'HList', 'Quote', 'Table'}
        self.assertEqual(expected_types - block_types, set())
if __name__ == '__main__':
    # Run the tests in this module when the file is executed directly.
    unittest.main()