# pyWebLayout/tests/io_tests/test_html_file_loader.py

"""
Test module for loading HTML files using the html_extraction module.

This test verifies that HTML files can be loaded from disk and processed
using the html_extraction.parse_html_string function.
"""

import os
import unittest
from pyWebLayout.io.readers.html_extraction import parse_html_string
from pyWebLayout.abstract.block import Block
from pyWebLayout.style import Font


class TestHTMLFileLoader(unittest.TestCase):
    """Tests that read an HTML fixture from disk and parse it into blocks.

    Every test reads the same saved HTML page and passes its text to
    ``parse_html_string``. The fixture location and the skip/read steps are
    defined once (``HTML_FILE_PATH`` and the two private helpers) so the
    individual tests cannot drift apart.
    """

    # Relative path: it is resolved against the current working directory
    # of the test run.
    HTML_FILE_PATH = os.path.join("tests", "data", "Kimi Räikkönen - Wikipedia.html")

    def _skip_if_fixture_missing(self) -> None:
        """Skip the calling test when the HTML fixture is not on disk."""
        if not os.path.exists(self.HTML_FILE_PATH):
            self.skipTest(f"Test HTML file not found: {self.HTML_FILE_PATH}")

    def _read_fixture(self) -> str:
        """Return the complete text of the HTML fixture, decoded as UTF-8."""
        with open(self.HTML_FILE_PATH, 'r', encoding='utf-8') as file:
            return file.read()

    def test_load_html_file(self):
        """Test loading and parsing an HTML file from disk."""
        # A missing fixture FAILS this test; the remaining tests skip instead.
        self.assertTrue(
            os.path.exists(self.HTML_FILE_PATH),
            f"Test HTML file not found: {self.HTML_FILE_PATH}",
        )

        html_content = self._read_fixture()

        # Verify we got some content
        self.assertGreater(len(html_content), 0, "HTML file should not be empty")

        # Any exception raised by the parser is reported as a test failure
        # carrying the exception text.
        try:
            blocks = parse_html_string(html_content)
        except Exception as e:
            self.fail(f"Failed to parse HTML file: {e}")

        # The parser must hand back a non-empty list ...
        self.assertIsInstance(blocks, list, "parse_html_string should return a list")
        self.assertGreater(len(blocks), 0, "Should extract at least one block from the HTML file")

        # ... whose items are all Block instances.
        for i, block in enumerate(blocks):
            self.assertIsInstance(block, Block, f"Item {i} should be a Block instance, got {type(block)}")

        print(f"Successfully loaded and parsed HTML file with {len(blocks)} blocks")

    def test_load_html_file_with_custom_font(self):
        """Test loading HTML file with a custom base font."""
        self._skip_if_fixture_missing()

        # Create a custom font
        custom_font = Font(font_size=14, colour=(100, 100, 100))

        # Read and parse with the custom font as the parser's base font
        html_content = self._read_fixture()
        blocks = parse_html_string(html_content, base_font=custom_font)

        # Verify we got blocks
        self.assertGreater(len(blocks), 0, "Should extract blocks with custom font")

        print(f"Successfully parsed HTML file with custom font, got {len(blocks)} blocks")

    def test_load_html_file_content_types(self):
        """Test that the loaded HTML file contains expected content types."""
        self._skip_if_fixture_missing()

        blocks = parse_html_string(self._read_fixture())

        # Collect the distinct class names of the parsed blocks
        unique_types = {type(block).__name__ for block in blocks}

        # A Wikipedia page should contain multiple types of content
        self.assertGreater(len(unique_types), 1, "Should have multiple types of blocks in Wikipedia page")

        print(f"Found block types: {sorted(unique_types)}")

    def test_html_file_size_handling(self):
        """Test that large HTML files can be handled gracefully."""
        self._skip_if_fixture_missing()

        # Report the on-disk size before doing any parsing work
        file_size = os.path.getsize(self.HTML_FILE_PATH)
        print(f"HTML file size: {file_size} bytes")

        html_content = self._read_fixture()

        # This should not raise an exception even for large files
        blocks = parse_html_string(html_content)

        # Basic verification
        self.assertIsInstance(blocks, list)
        print(f"Successfully processed {file_size} byte file into {len(blocks)} blocks")


# When this file is executed directly (not imported), hand control to
# unittest's command-line runner so the tests in this module are run.
if __name__ == '__main__':
    unittest.main()