""" Test module for loading HTML files using the html_extraction module. This test verifies that HTML files can be loaded from disk and processed using the html_extraction.parse_html_string function. """ import os import unittest from pyWebLayout.io.readers.html_extraction import parse_html_string from pyWebLayout.abstract.block import Block from pyWebLayout.style import Font class TestHTMLFileLoader(unittest.TestCase): """Test class for HTML file loading functionality.""" def test_load_html_file(self): """Test loading and parsing an HTML file from disk.""" # Path to the test HTML file html_file_path = os.path.join( "tests", "data", "Kimi Räikkönen - Wikipedia.html") # Verify the test file exists self.assertTrue( os.path.exists(html_file_path), f"Test HTML file not found: {html_file_path}") # Read the HTML file with open(html_file_path, 'r', encoding='utf-8') as file: html_content = file.read() # Verify we got some content self.assertGreater(len(html_content), 0, "HTML file should not be empty") # Parse the HTML content using the html_extraction module try: blocks = parse_html_string(html_content) except Exception as e: self.fail(f"Failed to parse HTML file: {e}") # Verify we got some blocks self.assertIsInstance(blocks, list, "parse_html_string should return a list") self.assertGreater( len(blocks), 0, "Should extract at least one block from the HTML file") # Verify all returned items are Block instances for i, block in enumerate(blocks): self.assertIsInstance( block, Block, f"Item {i} should be a Block instance, got {type(block)}" ) print(f"Successfully loaded and parsed HTML file with {len(blocks)} blocks") def test_load_html_file_with_custom_font(self): """Test loading HTML file with a custom base font.""" html_file_path = os.path.join( "tests", "data", "Kimi Räikkönen - Wikipedia.html") # Skip if file doesn't exist if not os.path.exists(html_file_path): self.skipTest(f"Test HTML file not found: {html_file_path}") # Create a custom font custom_font = Font(font_size=14, colour=(100, 100, 100)) # Read and parse with custom font with open(html_file_path, 'r', encoding='utf-8') as file: html_content = file.read() blocks = parse_html_string(html_content, base_font=custom_font) # Verify we got blocks self.assertGreater(len(blocks), 0, "Should extract blocks with custom font") print( f"Successfully parsed HTML file with custom font, got { len(blocks)} blocks") def test_load_html_file_content_types(self): """Test that the loaded HTML file contains expected content types.""" html_file_path = os.path.join( "tests", "data", "Kimi Räikkönen - Wikipedia.html") # Skip if file doesn't exist if not os.path.exists(html_file_path): self.skipTest(f"Test HTML file not found: {html_file_path}") with open(html_file_path, 'r', encoding='utf-8') as file: html_content = file.read() blocks = parse_html_string(html_content) # Check that we have different types of blocks block_type_names = [type(block).__name__ for block in blocks] unique_types = set(block_type_names) # A Wikipedia page should contain multiple types of content self.assertGreater( len(unique_types), 1, "Should have multiple types of blocks in Wikipedia page") print(f"Found block types: {sorted(unique_types)}") def test_html_file_size_handling(self): """Test that large HTML files can be handled gracefully.""" html_file_path = os.path.join( "tests", "data", "Kimi Räikkönen - Wikipedia.html") # Skip if file doesn't exist if not os.path.exists(html_file_path): self.skipTest(f"Test HTML file not found: {html_file_path}") # Get file size file_size = os.path.getsize(html_file_path) print(f"HTML file size: {file_size} bytes") # Read and parse with open(html_file_path, 'r', encoding='utf-8') as file: html_content = file.read() # This should not raise an exception even for large files blocks = parse_html_string(html_content) # Basic verification self.assertIsInstance(blocks, list) print(f"Successfully processed {file_size} byte file into {len(blocks)} blocks") if __name__ == '__main__': unittest.main()