pyWebLayout/tests/test_html_text.py

248 lines
9.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Unit tests for HTML text processing.
Tests the HTMLTextProcessor class for text buffering, entity handling, and word creation.
"""
import unittest
from unittest.mock import Mock, MagicMock
from pyWebLayout.io.readers.html_text import HTMLTextProcessor
from pyWebLayout.io.readers.html_style import HTMLStyleManager
from pyWebLayout.abstract.block import Parapgraph
from pyWebLayout.abstract.inline import Word
class TestHTMLTextProcessor(unittest.TestCase):
    """Test cases for HTMLTextProcessor.

    Exercises the text buffer, entity and character references, the
    current-paragraph pointer, and flushing buffered text into words.
    """

    def setUp(self):
        """Create a fresh style manager, text processor, and mock paragraph."""
        self.style_manager = HTMLStyleManager()
        self.text_processor = HTMLTextProcessor(self.style_manager)

        # The paragraph is mocked so tests can inspect add_word() calls
        # without building a real document tree.
        self.mock_paragraph = Mock(spec=Parapgraph)
        self.mock_paragraph.add_word = Mock()

    def test_initialization(self):
        """A new processor has an empty buffer, no paragraph, and our style manager."""
        self.assertEqual(self.text_processor._text_buffer, "")
        self.assertIsNone(self.text_processor._current_paragraph)
        self.assertEqual(self.text_processor._style_manager, self.style_manager)

    def test_add_text(self):
        """Successive add_text() calls append to the buffer."""
        self.text_processor.add_text("Hello")
        self.assertEqual(self.text_processor.get_buffer_content(), "Hello")

        self.text_processor.add_text(" World")
        self.assertEqual(self.text_processor.get_buffer_content(), "Hello World")

    def test_entity_references(self):
        """Named HTML entities are appended to the buffer as their characters."""
        # Non-ASCII expectations are written as escapes so they cannot be
        # silently dropped by an editor or transport that mishandles Unicode.
        test_cases = [
            ('lt', '<'),
            ('gt', '>'),
            ('amp', '&'),
            ('quot', '"'),
            ('apos', "'"),
            # NOTE(review): literal kept as found; confirm whether the
            # processor emits a plain space or U+00A0 for nbsp.
            ('nbsp', ' '),
            ('copy', '©'),
            ('reg', '®'),
            ('trade', '\u2122'),   # trade mark sign
            ('mdash', '\u2014'),   # em dash
            ('ndash', '\u2013'),   # en dash
            ('hellip', '\u2026'),  # horizontal ellipsis
            ('euro', '\u20ac'),    # euro sign
            ('unknown', '&unknown;'),  # Unknown entities should be preserved
        ]

        for entity, expected in test_cases:
            with self.subTest(entity=entity):
                # Start each case from an empty buffer so results don't accumulate.
                self.text_processor.clear_buffer()
                self.text_processor.add_entity_reference(entity)
                self.assertEqual(self.text_processor.get_buffer_content(), expected)

    def test_character_references(self):
        """Numeric character references are decoded; bad ones are kept as text."""
        # Decimal character references
        self.text_processor.clear_buffer()
        self.text_processor.add_character_reference('65')  # 'A'
        self.assertEqual(self.text_processor.get_buffer_content(), 'A')

        # Hexadecimal character references
        self.text_processor.clear_buffer()
        self.text_processor.add_character_reference('x41')  # 'A'
        self.assertEqual(self.text_processor.get_buffer_content(), 'A')

        # Unicode character
        self.text_processor.clear_buffer()
        self.text_processor.add_character_reference('8364')  # Euro symbol
        self.assertEqual(self.text_processor.get_buffer_content(), '\u20ac')

        # Invalid character reference
        self.text_processor.clear_buffer()
        self.text_processor.add_character_reference('invalid')
        self.assertEqual(self.text_processor.get_buffer_content(), '&#invalid;')

        # Out of range character
        self.text_processor.clear_buffer()
        self.text_processor.add_character_reference('99999999999')
        self.assertTrue(self.text_processor.get_buffer_content().startswith('&#'))

    def test_buffer_operations(self):
        """has_pending_text() and clear_buffer() reflect the buffer state."""
        # Test has_pending_text
        self.assertFalse(self.text_processor.has_pending_text())

        self.text_processor.add_text("Some text")
        self.assertTrue(self.text_processor.has_pending_text())

        # Test clear_buffer
        self.text_processor.clear_buffer()
        self.assertFalse(self.text_processor.has_pending_text())
        self.assertEqual(self.text_processor.get_buffer_content(), "")

        # Test with whitespace only
        self.text_processor.add_text(" \n\t ")
        self.assertFalse(self.text_processor.has_pending_text())  # Should ignore whitespace

    def test_paragraph_management(self):
        """set_current_paragraph() stores and clears the current paragraph."""
        # Initially no paragraph
        self.assertIsNone(self.text_processor._current_paragraph)

        # Set paragraph
        self.text_processor.set_current_paragraph(self.mock_paragraph)
        self.assertEqual(self.text_processor._current_paragraph, self.mock_paragraph)

        # Clear paragraph
        self.text_processor.set_current_paragraph(None)
        self.assertIsNone(self.text_processor._current_paragraph)

    def test_flush_text_with_paragraph(self):
        """Flushing with a paragraph set adds one word per whitespace-separated token."""
        self.text_processor.set_current_paragraph(self.mock_paragraph)
        self.text_processor.add_text("Hello world test")

        # Mock the style manager to return a specific font
        mock_font = Mock()
        self.style_manager.create_font = Mock(return_value=mock_font)

        result = self.text_processor.flush_text()

        # Should return True (text was flushed)
        self.assertTrue(result)

        # Should have created words
        self.assertEqual(self.mock_paragraph.add_word.call_count, 3)  # "Hello", "world", "test"

        # Verify the words were created with correct text
        calls = self.mock_paragraph.add_word.call_args_list
        word_texts = [call[0][0].text for call in calls]
        self.assertEqual(word_texts, ["Hello", "world", "test"])

        # Buffer should be empty after flush
        self.assertEqual(self.text_processor.get_buffer_content(), "")

    def test_flush_text_without_paragraph(self):
        """Flushing with no paragraph returns False but still empties the buffer."""
        self.text_processor.add_text("Hello world")

        result = self.text_processor.flush_text()

        # Should return False (no paragraph to flush to)
        self.assertFalse(result)

        # Buffer should be cleared anyway
        self.assertEqual(self.text_processor.get_buffer_content(), "")

    def test_flush_empty_buffer(self):
        """Flushing an empty buffer returns False and adds no words."""
        self.text_processor.set_current_paragraph(self.mock_paragraph)

        result = self.text_processor.flush_text()

        # Should return False (nothing to flush)
        self.assertFalse(result)

        # No words should be added
        self.mock_paragraph.add_word.assert_not_called()

    def test_flush_whitespace_only(self):
        """Flushing a whitespace-only buffer returns False and adds no words."""
        self.text_processor.set_current_paragraph(self.mock_paragraph)
        self.text_processor.add_text(" \n\t ")

        result = self.text_processor.flush_text()

        # Should return False (no meaningful content)
        self.assertFalse(result)

        # No words should be added
        self.mock_paragraph.add_word.assert_not_called()

    def test_word_creation_with_styling(self):
        """Words produced by a flush carry the font from the style manager."""
        self.text_processor.set_current_paragraph(self.mock_paragraph)
        self.text_processor.add_text("styled text")

        # Set up style manager to return specific font
        mock_font = Mock()
        mock_font.font_size = 16
        mock_font.weight = "bold"
        self.style_manager.create_font = Mock(return_value=mock_font)

        self.text_processor.flush_text()

        # Verify font was created
        self.style_manager.create_font.assert_called()

        # Verify words were created with the font
        calls = self.mock_paragraph.add_word.call_args_list
        # Guard against a vacuous pass: the loop below checks nothing if
        # no words were added at all.
        self.assertTrue(calls, "flush_text() did not add any words")
        for call in calls:
            word = call[0][0]
            self.assertEqual(word.style, mock_font)

    def test_reset(self):
        """reset() returns the processor to its initial state."""
        # Set up some state
        self.text_processor.set_current_paragraph(self.mock_paragraph)
        self.text_processor.add_text("Some text")

        # Reset
        self.text_processor.reset()

        # Should be back to initial state
        self.assertEqual(self.text_processor._text_buffer, "")
        self.assertIsNone(self.text_processor._current_paragraph)

    def test_complex_text_processing(self):
        """Plain text, an entity, and a character reference combine in one buffer."""
        self.text_processor.set_current_paragraph(self.mock_paragraph)

        # Mock font creation
        mock_font = Mock()
        self.style_manager.create_font = Mock(return_value=mock_font)

        # Add mixed content
        self.text_processor.add_text("Hello ")
        self.text_processor.add_entity_reference('amp')
        self.text_processor.add_text(" world")
        self.text_processor.add_character_reference('33')  # '!'

        # Should have "Hello & world!"
        expected_content = "Hello & world!"
        self.assertEqual(self.text_processor.get_buffer_content(), expected_content)

        # Flush and verify words
        self.text_processor.flush_text()

        calls = self.mock_paragraph.add_word.call_args_list
        word_texts = [call[0][0].text for call in calls]
        self.assertEqual(word_texts, ["Hello", "&", "world!"])
# Run the suite with unittest's command-line runner when executed as a script.
if __name__ == "__main__":
    unittest.main()