119 lines
4.7 KiB
Python
119 lines
4.7 KiB
Python
"""
Test module for loading HTML files using the html_extraction module.

This test verifies that HTML files can be loaded from disk and processed
using the html_extraction.parse_html_string function.
"""
|
|
|
|
import os
|
|
import unittest
|
|
from pyWebLayout.io.readers.html_extraction import parse_html_string
|
|
from pyWebLayout.abstract.block import Block
|
|
from pyWebLayout.style import Font
|
|
|
|
|
|
class TestHTMLFileLoader(unittest.TestCase):
    """Test class for HTML file loading functionality.

    Every test reads the same on-disk fixture, ``HTML_FILE_PATH``, and passes
    its text to ``parse_html_string``. The path is relative, so it is resolved
    against the current working directory of the test run.
    """

    # Single definition of the fixture location, shared by all tests below.
    HTML_FILE_PATH = os.path.join("tests", "data", "Kimi Räikkönen - Wikipedia.html")

    def _skip_unless_fixture_exists(self):
        """Skip the calling test when the fixture file is not on disk."""
        if not os.path.exists(self.HTML_FILE_PATH):
            self.skipTest(f"Test HTML file not found: {self.HTML_FILE_PATH}")

    def _read_html(self):
        """Return the full text of the fixture file, decoded as UTF-8."""
        with open(self.HTML_FILE_PATH, 'r', encoding='utf-8') as file:
            return file.read()

    def test_load_html_file(self):
        """Test loading and parsing an HTML file from disk."""
        # Unlike the other tests, a missing fixture fails this test instead of
        # skipping it.
        self.assertTrue(
            os.path.exists(self.HTML_FILE_PATH),
            f"Test HTML file not found: {self.HTML_FILE_PATH}",
        )

        # Read the HTML file
        html_content = self._read_html()

        # Verify we got some content
        self.assertGreater(len(html_content), 0, "HTML file should not be empty")

        # Parse the HTML content using the html_extraction module; any
        # exception raised by the parser is reported as a test failure.
        try:
            blocks = parse_html_string(html_content)
        except Exception as e:
            self.fail(f"Failed to parse HTML file: {e}")

        # Verify we got some blocks
        self.assertIsInstance(blocks, list, "parse_html_string should return a list")
        self.assertGreater(len(blocks), 0, "Should extract at least one block from the HTML file")

        # Verify all returned items are Block instances
        for i, block in enumerate(blocks):
            self.assertIsInstance(block, Block, f"Item {i} should be a Block instance, got {type(block)}")

        print(f"Successfully loaded and parsed HTML file with {len(blocks)} blocks")

    def test_load_html_file_with_custom_font(self):
        """Test loading HTML file with a custom base font."""
        self._skip_unless_fixture_exists()

        # Create a custom font
        custom_font = Font(font_size=14, colour=(100, 100, 100))

        # Read and parse with custom font
        html_content = self._read_html()
        blocks = parse_html_string(html_content, base_font=custom_font)

        # Verify we got blocks
        self.assertGreater(len(blocks), 0, "Should extract blocks with custom font")

        print(f"Successfully parsed HTML file with custom font, got {len(blocks)} blocks")

    def test_load_html_file_content_types(self):
        """Test that the loaded HTML file contains expected content types."""
        self._skip_unless_fixture_exists()

        html_content = self._read_html()
        blocks = parse_html_string(html_content)

        # Check that we have different types of blocks, compared by class name
        block_type_names = [type(block).__name__ for block in blocks]
        unique_types = set(block_type_names)

        # A Wikipedia page should contain multiple types of content
        self.assertGreater(len(unique_types), 1, "Should have multiple types of blocks in Wikipedia page")

        print(f"Found block types: {sorted(unique_types)}")

    def test_html_file_size_handling(self):
        """Test that large HTML files can be handled gracefully."""
        self._skip_unless_fixture_exists()

        # Get file size
        file_size = os.path.getsize(self.HTML_FILE_PATH)
        print(f"HTML file size: {file_size} bytes")

        # Read and parse
        html_content = self._read_html()

        # This should not raise an exception even for large files
        blocks = parse_html_string(html_content)

        # Basic verification
        self.assertIsInstance(blocks, list)
        print(f"Successfully processed {file_size} byte file into {len(blocks)} blocks")
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the tests in this module when the file is executed as a script.
    unittest.main()
|