""" Unit tests for HTML text processing. Tests the HTMLTextProcessor class for text buffering, entity handling, and word creation. """ import unittest from unittest.mock import Mock, MagicMock from pyWebLayout.io.readers.html_text import HTMLTextProcessor from pyWebLayout.io.readers.html_style import HTMLStyleManager from pyWebLayout.abstract.block import Parapgraph from pyWebLayout.abstract.inline import Word class TestHTMLTextProcessor(unittest.TestCase): """Test cases for HTMLTextProcessor.""" def setUp(self): """Set up test fixtures.""" self.style_manager = HTMLStyleManager() self.text_processor = HTMLTextProcessor(self.style_manager) # Create a mock paragraph self.mock_paragraph = Mock(spec=Parapgraph) self.mock_paragraph.add_word = Mock() def test_initialization(self): """Test proper initialization of text processor.""" self.assertEqual(self.text_processor._text_buffer, "") self.assertIsNone(self.text_processor._current_paragraph) self.assertEqual(self.text_processor._style_manager, self.style_manager) def test_add_text(self): """Test adding text to buffer.""" self.text_processor.add_text("Hello") self.assertEqual(self.text_processor.get_buffer_content(), "Hello") self.text_processor.add_text(" World") self.assertEqual(self.text_processor.get_buffer_content(), "Hello World") def test_entity_references(self): """Test HTML entity reference handling.""" test_cases = [ ('lt', '<'), ('gt', '>'), ('amp', '&'), ('quot', '"'), ('apos', "'"), ('nbsp', ' '), ('copy', '©'), ('reg', '®'), ('trade', '™'), ('mdash', '—'), ('ndash', '–'), ('hellip', '…'), ('euro', '€'), ('unknown', '&unknown;') # Unknown entities should be preserved ] for entity, expected in test_cases: with self.subTest(entity=entity): self.text_processor.clear_buffer() self.text_processor.add_entity_reference(entity) self.assertEqual(self.text_processor.get_buffer_content(), expected) def test_character_references(self): """Test character reference handling.""" # Decimal character references self.text_processor.clear_buffer() self.text_processor.add_character_reference('65') # 'A' self.assertEqual(self.text_processor.get_buffer_content(), 'A') # Hexadecimal character references self.text_processor.clear_buffer() self.text_processor.add_character_reference('x41') # 'A' self.assertEqual(self.text_processor.get_buffer_content(), 'A') # Unicode character self.text_processor.clear_buffer() self.text_processor.add_character_reference('8364') # Euro symbol self.assertEqual(self.text_processor.get_buffer_content(), '€') # Invalid character reference self.text_processor.clear_buffer() self.text_processor.add_character_reference('invalid') self.assertEqual(self.text_processor.get_buffer_content(), '&#invalid;') # Out of range character self.text_processor.clear_buffer() self.text_processor.add_character_reference('99999999999') self.assertTrue(self.text_processor.get_buffer_content().startswith('&#')) def test_buffer_operations(self): """Test buffer state operations.""" # Test has_pending_text self.assertFalse(self.text_processor.has_pending_text()) self.text_processor.add_text("Some text") self.assertTrue(self.text_processor.has_pending_text()) # Test clear_buffer self.text_processor.clear_buffer() self.assertFalse(self.text_processor.has_pending_text()) self.assertEqual(self.text_processor.get_buffer_content(), "") # Test with whitespace only self.text_processor.add_text(" \n\t ") self.assertFalse(self.text_processor.has_pending_text()) # Should ignore whitespace def test_paragraph_management(self): """Test current paragraph setting.""" # Initially no paragraph self.assertIsNone(self.text_processor._current_paragraph) # Set paragraph self.text_processor.set_current_paragraph(self.mock_paragraph) self.assertEqual(self.text_processor._current_paragraph, self.mock_paragraph) # Clear paragraph self.text_processor.set_current_paragraph(None) self.assertIsNone(self.text_processor._current_paragraph) def test_flush_text_with_paragraph(self): """Test flushing text when paragraph is set.""" self.text_processor.set_current_paragraph(self.mock_paragraph) self.text_processor.add_text("Hello world test") # Mock the style manager to return a specific font mock_font = Mock() self.style_manager.create_font = Mock(return_value=mock_font) result = self.text_processor.flush_text() # Should return True (text was flushed) self.assertTrue(result) # Should have created words self.assertEqual(self.mock_paragraph.add_word.call_count, 3) # "Hello", "world", "test" # Verify the words were created with correct text calls = self.mock_paragraph.add_word.call_args_list word_texts = [call[0][0].text for call in calls] self.assertEqual(word_texts, ["Hello", "world", "test"]) # Buffer should be empty after flush self.assertEqual(self.text_processor.get_buffer_content(), "") def test_flush_text_without_paragraph(self): """Test flushing text when no paragraph is set.""" self.text_processor.add_text("Hello world") result = self.text_processor.flush_text() # Should return False (no paragraph to flush to) self.assertFalse(result) # Buffer should be cleared anyway self.assertEqual(self.text_processor.get_buffer_content(), "") def test_flush_empty_buffer(self): """Test flushing when buffer is empty.""" self.text_processor.set_current_paragraph(self.mock_paragraph) result = self.text_processor.flush_text() # Should return False (nothing to flush) self.assertFalse(result) # No words should be added self.mock_paragraph.add_word.assert_not_called() def test_flush_whitespace_only(self): """Test flushing when buffer contains only whitespace.""" self.text_processor.set_current_paragraph(self.mock_paragraph) self.text_processor.add_text(" \n\t ") result = self.text_processor.flush_text() # Should return False (no meaningful content) self.assertFalse(result) # No words should be added self.mock_paragraph.add_word.assert_not_called() def test_word_creation_with_styling(self): """Test that words are created with proper styling.""" self.text_processor.set_current_paragraph(self.mock_paragraph) self.text_processor.add_text("styled text") # Set up style manager to return specific font mock_font = Mock() mock_font.font_size = 16 mock_font.weight = "bold" self.style_manager.create_font = Mock(return_value=mock_font) self.text_processor.flush_text() # Verify font was created self.style_manager.create_font.assert_called() # Verify words were created with the font calls = self.mock_paragraph.add_word.call_args_list for call in calls: word = call[0][0] self.assertEqual(word.style, mock_font) def test_reset(self): """Test resetting the text processor.""" # Set up some state self.text_processor.set_current_paragraph(self.mock_paragraph) self.text_processor.add_text("Some text") # Reset self.text_processor.reset() # Should be back to initial state self.assertEqual(self.text_processor._text_buffer, "") self.assertIsNone(self.text_processor._current_paragraph) def test_complex_text_processing(self): """Test processing text with mixed content.""" self.text_processor.set_current_paragraph(self.mock_paragraph) # Mock font creation mock_font = Mock() self.style_manager.create_font = Mock(return_value=mock_font) # Add mixed content self.text_processor.add_text("Hello ") self.text_processor.add_entity_reference('amp') self.text_processor.add_text(" world") self.text_processor.add_character_reference('33') # '!' # Should have "Hello & world!" expected_content = "Hello & world!" self.assertEqual(self.text_processor.get_buffer_content(), expected_content) # Flush and verify words self.text_processor.flush_text() calls = self.mock_paragraph.add_word.call_args_list word_texts = [call[0][0].text for call in calls] self.assertEqual(word_texts, ["Hello", "&", "world!"]) if __name__ == '__main__': unittest.main()