# Listing metadata: 355 lines, 12 KiB, Python
"""
Unit tests for HTML content reading.

Tests the HTMLContentReader class for parsing complete HTML documents.
This is more of an integration test covering the entire parsing pipeline.
"""
|
|
|
|
import unittest
|
|
from pyWebLayout.io.readers.html_content import HTMLContentReader
|
|
from pyWebLayout.abstract.document import Document
|
|
from pyWebLayout.abstract.block import (
|
|
Parapgraph, Heading, HeadingLevel, HList, ListStyle,
|
|
Table, Quote, CodeBlock, HorizontalRule, LineBreak
|
|
)
|
|
|
|
|
|
class TestHTMLContentReader(unittest.TestCase):
    """Test cases for HTMLContentReader.

    Every test passes an HTML snippet and a fresh ``Document`` to
    ``HTMLContentReader.extract_content`` and then inspects
    ``document.blocks``.
    """

    def setUp(self):
        """Create a new reader and an empty document for each test."""
        self.reader = HTMLContentReader()
        self.document = Document()

    @staticmethod
    def _word_texts(block):
        """Return the ``text`` of every word yielded by ``block.words()``.

        Each entry yielded by ``words()`` is indexed at position 1 to reach
        the word object (presumably entries are ``(index, word)`` pairs --
        confirm against the block API).
        """
        return [entry[1].text for entry in block.words()]

    def test_simple_paragraph(self):
        """Test parsing a simple paragraph."""
        html = '<p>Hello world!</p>'

        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 1)
        self.assertIsInstance(self.document.blocks[0], Parapgraph)

        paragraph = self.document.blocks[0]
        # Comparing the whole list checks both the word count and the texts.
        self.assertEqual(self._word_texts(paragraph), ["Hello", "world!"])

    def test_headings(self):
        """Test parsing different heading levels."""
        html = '''
        <h1>Heading 1</h1>
        <h2>Heading 2</h2>
        <h3>Heading 3</h3>
        <h6>Heading 6</h6>
        '''

        self.reader.extract_content(html, self.document)

        # Should have 4 heading blocks
        headings = [
            block for block in self.document.blocks
            if isinstance(block, Heading)
        ]
        self.assertEqual(len(headings), 4)

        # Check heading levels, in document order
        self.assertEqual(
            [heading.level for heading in headings],
            [HeadingLevel.H1, HeadingLevel.H2, HeadingLevel.H3, HeadingLevel.H6],
        )

        # Check text content
        self.assertEqual(self._word_texts(headings[0]), ["Heading", "1"])

    def test_styled_text(self):
        """Test parsing text with inline styling."""
        html = '<p>This is <b>bold</b> and <i>italic</i> text.</p>'

        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 1)
        paragraph = self.document.blocks[0]

        # The styling information is embedded in the Font objects.
        # We can't easily test the exact styling without more complex setup,
        # but we can verify the words are created correctly.
        self.assertEqual(
            self._word_texts(paragraph),
            ["This", "is", "bold", "and", "italic", "text."],
        )

    def test_unordered_list(self):
        """Test parsing unordered lists."""
        html = '''
        <ul>
            <li>First item</li>
            <li>Second item</li>
            <li>Third item</li>
        </ul>
        '''

        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 1)
        self.assertIsInstance(self.document.blocks[0], HList)

        list_block = self.document.blocks[0]
        self.assertEqual(list_block.style, ListStyle.UNORDERED)

        items = list(list_block.items())
        self.assertEqual(len(items), 3)

        # Check first item content
        first_item_blocks = list(items[0].blocks())
        self.assertEqual(len(first_item_blocks), 1)
        self.assertIsInstance(first_item_blocks[0], Parapgraph)

    def test_ordered_list(self):
        """Test parsing ordered lists."""
        html = '''
        <ol>
            <li>First step</li>
            <li>Second step</li>
        </ol>
        '''

        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 1)
        list_block = self.document.blocks[0]
        self.assertEqual(list_block.style, ListStyle.ORDERED)

        items = list(list_block.items())
        self.assertEqual(len(items), 2)

    def test_definition_list(self):
        """Test parsing definition lists."""
        html = '''
        <dl>
            <dt>Term 1</dt>
            <dd>Definition 1</dd>
            <dt>Term 2</dt>
            <dd>Definition 2</dd>
        </dl>
        '''

        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 1)
        list_block = self.document.blocks[0]
        self.assertEqual(list_block.style, ListStyle.DEFINITION)

        items = list(list_block.items())
        self.assertEqual(len(items), 2)  # Two dt/dd pairs

    def test_table(self):
        """Test parsing simple tables."""
        html = '''
        <table>
            <tr>
                <th>Header 1</th>
                <th>Header 2</th>
            </tr>
            <tr>
                <td>Cell 1</td>
                <td>Cell 2</td>
            </tr>
        </table>
        '''

        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 1)
        self.assertIsInstance(self.document.blocks[0], Table)

        table = self.document.blocks[0]

        # Check body rows
        body_rows = list(table.body_rows())
        self.assertEqual(len(body_rows), 2)  # Header row + data row

        # Check first row (header)
        first_row_cells = list(body_rows[0].cells())
        self.assertEqual(len(first_row_cells), 2)
        self.assertTrue(first_row_cells[0].is_header)
        self.assertTrue(first_row_cells[1].is_header)

        # Check second row (data)
        second_row_cells = list(body_rows[1].cells())
        self.assertEqual(len(second_row_cells), 2)
        self.assertFalse(second_row_cells[0].is_header)
        self.assertFalse(second_row_cells[1].is_header)

    def test_blockquote(self):
        """Test parsing blockquotes."""
        html = '''
        <blockquote>
            <p>This is a quoted paragraph.</p>
            <p>Another quoted paragraph.</p>
        </blockquote>
        '''

        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 1)
        self.assertIsInstance(self.document.blocks[0], Quote)

        quote = self.document.blocks[0]
        quote_blocks = list(quote.blocks())
        self.assertEqual(len(quote_blocks), 2)
        self.assertIsInstance(quote_blocks[0], Parapgraph)
        self.assertIsInstance(quote_blocks[1], Parapgraph)

    def test_code_block(self):
        """Test parsing code blocks."""
        html = '''
        <pre><code class="language-python">
        def hello():
            print("Hello, world!")
        </code></pre>
        '''

        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 1)
        self.assertIsInstance(self.document.blocks[0], CodeBlock)

        code_block = self.document.blocks[0]
        self.assertEqual(code_block.language, "python")

    def test_horizontal_rule(self):
        """Test parsing horizontal rules."""
        html = '<p>Before</p><hr><p>After</p>'

        self.reader.extract_content(html, self.document)

        self.assertEqual(len(self.document.blocks), 3)
        self.assertIsInstance(self.document.blocks[0], Parapgraph)
        self.assertIsInstance(self.document.blocks[1], HorizontalRule)
        self.assertIsInstance(self.document.blocks[2], Parapgraph)

    def test_html_entities(self):
        """Test handling HTML entities."""
        # The input must contain the *escaped* character references; raw
        # '<', '>' and '&' would be malformed markup and would not exercise
        # entity decoding at all.
        html = '<p>Less than: &lt; Greater than: &gt; Ampersand: &amp;</p>'

        self.reader.extract_content(html, self.document)

        paragraph = self.document.blocks[0]

        # Each entity stands alone between spaces, so after decoding it is
        # expected to appear as its own word.
        word_texts = self._word_texts(paragraph)
        self.assertIn('<', word_texts)
        self.assertIn('>', word_texts)
        self.assertIn('&', word_texts)

    def test_nested_elements(self):
        """Test parsing nested HTML elements."""
        html = '''
        <div>
            <h2>Section Title</h2>
            <p>Section content with <strong>important</strong> text.</p>
            <ul>
                <li>List item 1</li>
                <li>List item 2</li>
            </ul>
        </div>
        '''

        self.reader.extract_content(html, self.document)

        # Should have multiple blocks
        self.assertGreater(len(self.document.blocks), 1)

        # Check that we have different types of blocks
        block_types = [type(block).__name__ for block in self.document.blocks]
        self.assertIn('Parapgraph', block_types)  # From div
        self.assertIn('Heading', block_types)
        self.assertIn('HList', block_types)

    def test_empty_elements(self):
        """Test handling empty HTML elements."""
        html = '<p></p><div></div><ul></ul>'

        self.reader.extract_content(html, self.document)

        # Empty elements should still create blocks
        self.assertEqual(len(self.document.blocks), 3)

    def test_whitespace_handling(self):
        """Test proper whitespace handling."""
        html = '''
        <p> Word1 Word2
        Word3 </p>
        '''

        self.reader.extract_content(html, self.document)

        paragraph = self.document.blocks[0]

        # Should normalize whitespace and create separate words
        self.assertEqual(
            self._word_texts(paragraph), ["Word1", "Word2", "Word3"]
        )

    def test_base_url_setting(self):
        """Test setting base URL for link resolution."""
        base_url = "https://example.com/path/"
        self.reader.set_base_url(base_url)

        # The base URL should be passed to the inline handler
        self.assertEqual(self.reader.inline_handler.base_url, base_url)

    def test_complex_document(self):
        """Test parsing a complex HTML document."""
        html = '''
        <!DOCTYPE html>
        <html>
        <head>
            <title>Test Document</title>
            <style>body { font-family: Arial; }</style>
        </head>
        <body>
            <h1>Main Title</h1>
            <p>Introduction paragraph with <em>emphasis</em>.</p>

            <h2>Section 1</h2>
            <p>Content with <a href="link.html">a link</a>.</p>

            <ul>
                <li>Item 1</li>
                <li>Item 2 with <strong>bold text</strong></li>
            </ul>

            <h2>Section 2</h2>
            <blockquote>
                <p>A quoted paragraph.</p>
            </blockquote>

            <table>
                <tr><th>Col1</th><th>Col2</th></tr>
                <tr><td>A</td><td>B</td></tr>
            </table>
        </body>
        </html>
        '''

        self.reader.extract_content(html, self.document)

        # Should have parsed multiple blocks
        self.assertGreater(len(self.document.blocks), 5)

        # Should have different types of content
        block_types = {type(block).__name__ for block in self.document.blocks}
        expected_types = {'Heading', 'Parapgraph', 'HList', 'Quote', 'Table'}
        # Comparing the difference to the empty set makes a failure name the
        # block types that were not produced.
        self.assertEqual(expected_types - block_types, set())
|
|
|
|
|
|
# Run the suite only when this file is executed as a script, not on import.
if __name__ == "__main__":
    unittest.main()
|