# 248 lines, 9.4 KiB, Python
"""
|
||
Unit tests for HTML text processing.
|
||
|
||
Tests the HTMLTextProcessor class for text buffering, entity handling, and word creation.
|
||
"""
|
||
|
||
import unittest
|
||
from unittest.mock import Mock, MagicMock
|
||
from pyWebLayout.io.readers.html_text import HTMLTextProcessor
|
||
from pyWebLayout.io.readers.html_style import HTMLStyleManager
|
||
from pyWebLayout.abstract.block import Parapgraph
|
||
from pyWebLayout.abstract.inline import Word
|
||
|
||
|
||
class TestHTMLTextProcessor(unittest.TestCase):
    """Test cases for HTMLTextProcessor.

    Each test runs against a fresh ``HTMLTextProcessor`` built in ``setUp``
    and, where words are flushed, a mock paragraph whose ``add_word`` calls
    are inspected.
    """

    def setUp(self):
        """Create a style manager, a text processor and a mock paragraph."""
        self.style_manager = HTMLStyleManager()
        self.text_processor = HTMLTextProcessor(self.style_manager)

        # Mock paragraph; add_word is replaced explicitly so its calls can be
        # counted and inspected.
        # NOTE(review): the spelling ``Parapgraph`` matches the import at the
        # top of this file - confirm it matches the name exported by
        # pyWebLayout.abstract.block.
        self.mock_paragraph = Mock(spec=Parapgraph)
        self.mock_paragraph.add_word = Mock()

    def test_initialization(self):
        """A new processor has an empty buffer, no paragraph, and keeps its style manager."""
        self.assertEqual(self.text_processor._text_buffer, "")
        self.assertIsNone(self.text_processor._current_paragraph)
        self.assertEqual(self.text_processor._style_manager, self.style_manager)

    def test_add_text(self):
        """Successive add_text calls append to the buffer."""
        self.text_processor.add_text("Hello")
        self.assertEqual(self.text_processor.get_buffer_content(), "Hello")

        self.text_processor.add_text(" World")
        self.assertEqual(self.text_processor.get_buffer_content(), "Hello World")

    def test_entity_references(self):
        """Named entity references are appended to the buffer as their character."""
        test_cases = [
            ('lt', '<'),
            ('gt', '>'),
            ('amp', '&'),
            ('quot', '"'),
            ('apos', "'"),
            ('nbsp', ' '),
            ('copy', '©'),
            ('reg', '®'),
            ('trade', '™'),
            ('mdash', '—'),
            ('ndash', '–'),
            ('hellip', '…'),
            ('euro', '€'),
            ('unknown', '&unknown;'),  # Unknown entities should be preserved
        ]

        for entity, expected in test_cases:
            with self.subTest(entity=entity):
                # Start every case from an empty buffer so cases are independent.
                self.text_processor.clear_buffer()
                self.text_processor.add_entity_reference(entity)
                self.assertEqual(self.text_processor.get_buffer_content(), expected)

    def test_character_references(self):
        """Numeric character references are decoded; bad ones are kept as text."""
        exact_cases = [
            ('65', 'A'),                # Decimal character reference
            ('x41', 'A'),               # Hexadecimal character reference
            ('8364', '€'),              # Unicode character (Euro symbol)
            ('invalid', '&#invalid;'),  # Invalid character reference
        ]

        # subTest keeps one failing reference from hiding the remaining cases.
        for reference, expected in exact_cases:
            with self.subTest(reference=reference):
                self.text_processor.clear_buffer()
                self.text_processor.add_character_reference(reference)
                self.assertEqual(self.text_processor.get_buffer_content(), expected)

        # Out of range character: only the '&#' prefix of the output is pinned.
        with self.subTest(reference='99999999999'):
            self.text_processor.clear_buffer()
            self.text_processor.add_character_reference('99999999999')
            self.assertTrue(self.text_processor.get_buffer_content().startswith('&#'))

    def test_buffer_operations(self):
        """has_pending_text and clear_buffer reflect the buffer state."""
        # Test has_pending_text
        self.assertFalse(self.text_processor.has_pending_text())

        self.text_processor.add_text("Some text")
        self.assertTrue(self.text_processor.has_pending_text())

        # Test clear_buffer
        self.text_processor.clear_buffer()
        self.assertFalse(self.text_processor.has_pending_text())
        self.assertEqual(self.text_processor.get_buffer_content(), "")

        # Test with whitespace only
        self.text_processor.add_text("   \n\t  ")
        self.assertFalse(self.text_processor.has_pending_text())  # Should ignore whitespace

    def test_paragraph_management(self):
        """set_current_paragraph stores the paragraph and accepts None to clear it."""
        # Initially no paragraph
        self.assertIsNone(self.text_processor._current_paragraph)

        # Set paragraph
        self.text_processor.set_current_paragraph(self.mock_paragraph)
        self.assertEqual(self.text_processor._current_paragraph, self.mock_paragraph)

        # Clear paragraph
        self.text_processor.set_current_paragraph(None)
        self.assertIsNone(self.text_processor._current_paragraph)

    def test_flush_text_with_paragraph(self):
        """Flushing with a paragraph set adds one word per token and empties the buffer."""
        self.text_processor.set_current_paragraph(self.mock_paragraph)
        self.text_processor.add_text("Hello world test")

        # Mock the style manager to return a specific font
        mock_font = Mock()
        self.style_manager.create_font = Mock(return_value=mock_font)

        result = self.text_processor.flush_text()

        # Should return True (text was flushed)
        self.assertTrue(result)

        # Should have created words
        self.assertEqual(self.mock_paragraph.add_word.call_count, 3)  # "Hello", "world", "test"

        # Verify the words were created with correct text; the word object is
        # the first positional argument of each recorded add_word call.
        recorded_calls = self.mock_paragraph.add_word.call_args_list
        word_texts = [recorded[0][0].text for recorded in recorded_calls]
        self.assertEqual(word_texts, ["Hello", "world", "test"])

        # Buffer should be empty after flush
        self.assertEqual(self.text_processor.get_buffer_content(), "")

    def test_flush_text_without_paragraph(self):
        """Flushing with no paragraph set returns False and still clears the buffer."""
        self.text_processor.add_text("Hello world")

        result = self.text_processor.flush_text()

        # Should return False (no paragraph to flush to)
        self.assertFalse(result)

        # Buffer should be cleared anyway
        self.assertEqual(self.text_processor.get_buffer_content(), "")

    def test_flush_empty_buffer(self):
        """Flushing an empty buffer returns False and adds no words."""
        self.text_processor.set_current_paragraph(self.mock_paragraph)

        result = self.text_processor.flush_text()

        # Should return False (nothing to flush)
        self.assertFalse(result)

        # No words should be added
        self.mock_paragraph.add_word.assert_not_called()

    def test_flush_whitespace_only(self):
        """Flushing a whitespace-only buffer returns False and adds no words."""
        self.text_processor.set_current_paragraph(self.mock_paragraph)
        self.text_processor.add_text("   \n\t  ")

        result = self.text_processor.flush_text()

        # Should return False (no meaningful content)
        self.assertFalse(result)

        # No words should be added
        self.mock_paragraph.add_word.assert_not_called()

    def test_word_creation_with_styling(self):
        """Words added on flush carry the font returned by the style manager."""
        self.text_processor.set_current_paragraph(self.mock_paragraph)
        self.text_processor.add_text("styled text")

        # Set up style manager to return specific font
        mock_font = Mock()
        mock_font.font_size = 16
        mock_font.weight = "bold"
        self.style_manager.create_font = Mock(return_value=mock_font)

        self.text_processor.flush_text()

        # Verify font was created
        self.style_manager.create_font.assert_called()

        recorded_calls = self.mock_paragraph.add_word.call_args_list

        # Without this check the loop below asserts nothing when no words were
        # added, and the test would pass vacuously.
        self.assertTrue(recorded_calls, "flush_text() added no words to the paragraph")

        # Verify words were created with the font
        for recorded in recorded_calls:
            word = recorded[0][0]
            self.assertEqual(word.style, mock_font)

    def test_reset(self):
        """reset() clears both the text buffer and the current paragraph."""
        # Set up some state
        self.text_processor.set_current_paragraph(self.mock_paragraph)
        self.text_processor.add_text("Some text")

        # Reset
        self.text_processor.reset()

        # Should be back to initial state
        self.assertEqual(self.text_processor._text_buffer, "")
        self.assertIsNone(self.text_processor._current_paragraph)

    def test_complex_text_processing(self):
        """Plain text, an entity and a character reference combine in one buffer."""
        self.text_processor.set_current_paragraph(self.mock_paragraph)

        # Mock font creation
        mock_font = Mock()
        self.style_manager.create_font = Mock(return_value=mock_font)

        # Add mixed content
        self.text_processor.add_text("Hello ")
        self.text_processor.add_entity_reference('amp')
        self.text_processor.add_text(" world")
        self.text_processor.add_character_reference('33')  # '!'

        # Should have "Hello & world!"
        expected_content = "Hello & world!"
        self.assertEqual(self.text_processor.get_buffer_content(), expected_content)

        # Flush and verify words
        self.text_processor.flush_text()

        recorded_calls = self.mock_paragraph.add_word.call_args_list
        word_texts = [recorded[0][0].text for recorded in recorded_calls]
        self.assertEqual(word_texts, ["Hello", "&", "world!"])
|
||
|
||
|
||
# Run this module's tests when the file is executed directly.
if __name__ == "__main__":
    unittest.main()
|