pyWebLayout/tests/io_tests/test_html_file_loader.py
Duncan Tourolle 10612fefae
Some checks failed
Python CI / test (push) Failing after 43s
undoing more autoflake8 damage
2025-11-09 00:11:48 +01:00

137 lines
4.8 KiB
Python

"""
Test module for loading HTML files using the html_extraction module.
This test verifies that HTML files can be loaded from disk and processed
using the html_extraction.parse_html_string function.
"""
import os
import unittest
from pyWebLayout.io.readers.html_extraction import parse_html_string
from pyWebLayout.abstract.block import Block
from pyWebLayout.style import Font
class TestHTMLFileLoader(unittest.TestCase):
    """Exercise ``parse_html_string`` on an HTML page stored on disk.

    Every test reads the same saved Wikipedia page.  The path is relative,
    so it is resolved against the working directory of the test run.
    """

    # Shared HTML fixture used by every test in this class.
    HTML_FILE_PATH = os.path.join(
        "tests", "data", "Kimi Räikkönen - Wikipedia.html")

    def _skip_if_fixture_missing(self) -> None:
        """Skip the running test when the HTML fixture is not on disk."""
        if not os.path.exists(self.HTML_FILE_PATH):
            self.skipTest(f"Test HTML file not found: {self.HTML_FILE_PATH}")

    def _read_fixture(self) -> str:
        """Return the full text of the HTML fixture, decoded as UTF-8."""
        with open(self.HTML_FILE_PATH, 'r', encoding='utf-8') as file:
            return file.read()

    def test_load_html_file(self) -> None:
        """Test loading and parsing an HTML file from disk."""
        html_file_path = self.HTML_FILE_PATH

        # Unlike the other tests, a missing fixture is a failure here rather
        # than a skip, so an absent file cannot go unnoticed.
        self.assertTrue(
            os.path.exists(html_file_path),
            f"Test HTML file not found: {html_file_path}")

        html_content = self._read_fixture()
        self.assertGreater(len(html_content), 0, "HTML file should not be empty")

        # Any exception from the parser is reported as a test failure that
        # carries the exception text in its message.
        try:
            blocks = parse_html_string(html_content)
        except Exception as e:
            self.fail(f"Failed to parse HTML file: {e}")

        self.assertIsInstance(blocks, list, "parse_html_string should return a list")
        self.assertGreater(
            len(blocks),
            0,
            "Should extract at least one block from the HTML file")

        # Every returned item must be a Block instance.
        for i, block in enumerate(blocks):
            self.assertIsInstance(
                block,
                Block,
                f"Item {i} should be a Block instance, got {type(block)}"
            )

        print(f"Successfully loaded and parsed HTML file with {len(blocks)} blocks")

    def test_load_html_file_with_custom_font(self) -> None:
        """Test loading HTML file with a custom base font."""
        self._skip_if_fixture_missing()

        custom_font = Font(font_size=14, colour=(100, 100, 100))

        html_content = self._read_fixture()
        blocks = parse_html_string(html_content, base_font=custom_font)

        self.assertGreater(len(blocks), 0, "Should extract blocks with custom font")
        print(
            f"Successfully parsed HTML file with custom font, got {len(blocks)} blocks"
        )

    def test_load_html_file_content_types(self) -> None:
        """Test that the loaded HTML file contains expected content types."""
        self._skip_if_fixture_missing()

        html_content = self._read_fixture()
        blocks = parse_html_string(html_content)

        # Collapse the parsed blocks to the distinct class names they use.
        unique_types = {type(block).__name__ for block in blocks}

        # A Wikipedia page should contain multiple types of content.
        self.assertGreater(
            len(unique_types),
            1,
            "Should have multiple types of blocks in Wikipedia page")
        print(f"Found block types: {sorted(unique_types)}")

    def test_html_file_size_handling(self) -> None:
        """Test that large HTML files can be handled gracefully."""
        self._skip_if_fixture_missing()

        file_size = os.path.getsize(self.HTML_FILE_PATH)
        print(f"HTML file size: {file_size} bytes")

        html_content = self._read_fixture()

        # Parsing is expected to complete without raising, whatever the size.
        blocks = parse_html_string(html_content)

        self.assertIsInstance(blocks, list)
        print(f"Successfully processed {file_size} byte file into {len(blocks)} blocks")
# Run this module's tests when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()