"""
Unit tests for HTML text processing.
Tests the HTMLTextProcessor class for text buffering, entity handling, and word creation.
"""
import unittest
from unittest.mock import Mock, MagicMock

from pyWebLayout.io.readers.html_text import HTMLTextProcessor
from pyWebLayout.io.readers.html_style import HTMLStyleManager
from pyWebLayout.abstract.block import Paragraph
from pyWebLayout.abstract.inline import Word


class TestHTMLTextProcessor(unittest.TestCase):
    """Test cases for HTMLTextProcessor.

    Covers the text buffer, entity and character reference handling,
    current-paragraph management, and flushing buffered text into words
    on a (mocked) paragraph.
    """

    def setUp(self):
        """Create a fresh style manager, text processor and mock paragraph."""
        self.style_manager = HTMLStyleManager()
        self.text_processor = HTMLTextProcessor(self.style_manager)

        # Spec the mock on Paragraph so the processor can only touch
        # attributes the real class exposes; add_word is replaced explicitly
        # so its calls can be inspected.
        self.mock_paragraph = Mock(spec=Paragraph)
        self.mock_paragraph.add_word = Mock()

    def _added_word_texts(self):
        """Return the ``text`` of every word passed to ``add_word``, in order."""
        recorded = self.mock_paragraph.add_word.call_args_list
        # Each recorded call is (args, kwargs); the word is the first
        # positional argument.
        return [recorded_call[0][0].text for recorded_call in recorded]

    def test_initialization(self):
        """Test proper initialization of text processor."""
        self.assertEqual(self.text_processor._text_buffer, "")
        self.assertIsNone(self.text_processor._current_paragraph)
        self.assertEqual(self.text_processor._style_manager, self.style_manager)

    def test_add_text(self):
        """Test that successive add_text calls append to the buffer."""
        self.text_processor.add_text("Hello")
        self.assertEqual(self.text_processor.get_buffer_content(), "Hello")

        self.text_processor.add_text(" World")
        self.assertEqual(self.text_processor.get_buffer_content(), "Hello World")

    def test_entity_references(self):
        """Test HTML entity reference handling."""
        test_cases = [
            ('lt', '<'),
            ('gt', '>'),
            ('amp', '&'),
            ('quot', '"'),
            ('apos', "'"),
            ('nbsp', ' '),
            ('copy', '©'),
            ('reg', '®'),
            ('trade', '™'),
            ('mdash', '—'),
            ('ndash', '–'),
            ('hellip', '…'),
            ('euro', '€'),
            ('unknown', '&unknown;'),  # Unknown entities should be preserved
        ]

        for entity, expected in test_cases:
            with self.subTest(entity=entity):
                self.text_processor.clear_buffer()
                self.text_processor.add_entity_reference(entity)
                self.assertEqual(self.text_processor.get_buffer_content(), expected)

    def test_character_references(self):
        """Test decimal, hexadecimal, invalid and out-of-range references."""
        # NOTE(review): the expected fallback for the invalid reference is
        # kept exactly as the test has always stated it; confirm it against
        # the fallback format HTMLTextProcessor actually emits.
        test_cases = [
            ('65', 'A'),              # Decimal character reference
            ('x41', 'A'),             # Hexadecimal character reference
            ('8364', '€'),            # Unicode character (euro symbol)
            ('invalid', 'invalid;'),  # Invalid character reference
        ]

        for reference, expected in test_cases:
            with self.subTest(reference=reference):
                self.text_processor.clear_buffer()
                self.text_processor.add_character_reference(reference)
                self.assertEqual(self.text_processor.get_buffer_content(), expected)

        # Out of range character: the code point is far beyond anything
        # chr() accepts, so the reference is expected to fall back to literal
        # text like the invalid case above. A startswith('') check would be
        # true for every string, so check the preserved digits and trailing
        # ';' instead; a leading marker such as '&#' is tolerated.
        self.text_processor.clear_buffer()
        self.text_processor.add_character_reference('99999999999')
        content = self.text_processor.get_buffer_content()
        self.assertTrue(
            content.endswith('99999999999;'),
            msg=f"out-of-range reference was not preserved: {content!r}",
        )

    def test_buffer_operations(self):
        """Test buffer state operations."""
        # Test has_pending_text
        self.assertFalse(self.text_processor.has_pending_text())

        self.text_processor.add_text("Some text")
        self.assertTrue(self.text_processor.has_pending_text())

        # Test clear_buffer
        self.text_processor.clear_buffer()
        self.assertFalse(self.text_processor.has_pending_text())
        self.assertEqual(self.text_processor.get_buffer_content(), "")

        # Test with whitespace only
        self.text_processor.add_text(" \n\t ")
        self.assertFalse(self.text_processor.has_pending_text())  # Should ignore whitespace

    def test_paragraph_management(self):
        """Test current paragraph setting."""
        # Initially no paragraph
        self.assertIsNone(self.text_processor._current_paragraph)

        # Set paragraph
        self.text_processor.set_current_paragraph(self.mock_paragraph)
        self.assertEqual(self.text_processor._current_paragraph, self.mock_paragraph)

        # Clear paragraph
        self.text_processor.set_current_paragraph(None)
        self.assertIsNone(self.text_processor._current_paragraph)

    def test_flush_text_with_paragraph(self):
        """Test flushing text when paragraph is set."""
        self.text_processor.set_current_paragraph(self.mock_paragraph)
        self.text_processor.add_text("Hello world test")

        # Mock the style manager to return a specific font
        mock_font = Mock()
        self.style_manager.create_font = Mock(return_value=mock_font)

        result = self.text_processor.flush_text()

        # Should return True (text was flushed)
        self.assertTrue(result)

        # Should have created words
        self.assertEqual(self.mock_paragraph.add_word.call_count, 3)  # "Hello", "world", "test"

        # Verify the words were created with correct text
        self.assertEqual(self._added_word_texts(), ["Hello", "world", "test"])

        # Buffer should be empty after flush
        self.assertEqual(self.text_processor.get_buffer_content(), "")

    def test_flush_text_without_paragraph(self):
        """Test flushing text when no paragraph is set."""
        self.text_processor.add_text("Hello world")

        result = self.text_processor.flush_text()

        # Should return False (no paragraph to flush to)
        self.assertFalse(result)

        # Buffer should be cleared anyway
        self.assertEqual(self.text_processor.get_buffer_content(), "")

    def test_flush_empty_buffer(self):
        """Test flushing when buffer is empty."""
        self.text_processor.set_current_paragraph(self.mock_paragraph)

        result = self.text_processor.flush_text()

        # Should return False (nothing to flush)
        self.assertFalse(result)

        # No words should be added
        self.mock_paragraph.add_word.assert_not_called()

    def test_flush_whitespace_only(self):
        """Test flushing when buffer contains only whitespace."""
        self.text_processor.set_current_paragraph(self.mock_paragraph)
        self.text_processor.add_text(" \n\t ")

        result = self.text_processor.flush_text()

        # Should return False (no meaningful content)
        self.assertFalse(result)

        # No words should be added
        self.mock_paragraph.add_word.assert_not_called()

    def test_word_creation_with_styling(self):
        """Test that words are created with proper styling."""
        self.text_processor.set_current_paragraph(self.mock_paragraph)
        self.text_processor.add_text("styled text")

        # Set up style manager to return specific font
        mock_font = Mock()
        mock_font.font_size = 16
        mock_font.weight = "bold"
        self.style_manager.create_font = Mock(return_value=mock_font)

        self.text_processor.flush_text()

        # Verify font was created
        self.style_manager.create_font.assert_called()

        # Guard against a vacuous pass: without this, the loop below would
        # succeed even if no word had been added at all.
        self.assertEqual(self.mock_paragraph.add_word.call_count, 2)  # "styled", "text"

        # Verify words were created with the font
        for recorded_call in self.mock_paragraph.add_word.call_args_list:
            word = recorded_call[0][0]
            self.assertEqual(word.style, mock_font)

    def test_reset(self):
        """Test resetting the text processor."""
        # Set up some state
        self.text_processor.set_current_paragraph(self.mock_paragraph)
        self.text_processor.add_text("Some text")

        # Reset
        self.text_processor.reset()

        # Should be back to initial state
        self.assertEqual(self.text_processor._text_buffer, "")
        self.assertIsNone(self.text_processor._current_paragraph)

    def test_complex_text_processing(self):
        """Test processing text with mixed content."""
        self.text_processor.set_current_paragraph(self.mock_paragraph)

        # Mock font creation
        mock_font = Mock()
        self.style_manager.create_font = Mock(return_value=mock_font)

        # Add mixed content
        self.text_processor.add_text("Hello ")
        self.text_processor.add_entity_reference('amp')
        self.text_processor.add_text(" world")
        self.text_processor.add_character_reference('33')  # '!'

        # Should have "Hello & world!"
        expected_content = "Hello & world!"
        self.assertEqual(self.text_processor.get_buffer_content(), expected_content)

        # Flush and verify words
        self.text_processor.flush_text()

        self.assertEqual(self._added_word_texts(), ["Hello", "&", "world!"])
# Run the test suite when this file is executed directly as a script;
# importing it as a module (e.g. via a test runner) does not trigger this.
if __name__ == '__main__':
    unittest.main()