# pyWebLayout/tests/test_html_extraction.py

"""
Unit tests for HTML extraction functionality.

Tests the HTML parsing and conversion to pyWebLayout abstract elements,
including styled content within paragraphs and block-level elements.
"""

import unittest
from pyWebLayout.io.readers.html_extraction import parse_html_string
from pyWebLayout.abstract.block import Paragraph, Heading, HeadingLevel, Quote, CodeBlock, HList, ListStyle, Table
from pyWebLayout.abstract.document import Document
from pyWebLayout.style import Font, FontWeight, FontStyle, TextDecoration


class TestHTMLParagraph(unittest.TestCase):
    """Plain ``<p>`` parsing: block count, reported length and word order."""

    def _assert_word_texts(self, paragraph, sentence):
        """Compare the words yielded by *paragraph* with the tokens of *sentence*.

        Only as many words as both sides provide are compared, so callers
        check the paragraph length separately.
        """
        expected_tokens = sentence.split(" ")
        for (_, word), token in zip(paragraph.words(), expected_tokens):
            self.assertEqual(word.text, token)

    def test_simple(self):
        """One ``<p>`` element produces one block of length four."""
        parsed = parse_html_string("<p>This is a paragraph.</p>")
        self.assertEqual(len(parsed), 1)
        self.assertEqual(len(parsed[0]), 4)

        self._assert_word_texts(parsed[0], "This is a paragraph.")

    def test_multiple(self):
        """Two sibling ``<p>`` elements produce two blocks, in document order."""
        html = "<p>This is a paragraph.</p><p>This is another paragraph.</p>"
        parsed = parse_html_string(html)
        self.assertEqual(len(parsed), 2)

        first, second = parsed[0], parsed[1]
        self.assertEqual(len(first), 4)
        self.assertEqual(len(second), 4)

        self._assert_word_texts(first, "This is a paragraph.")
        self._assert_word_texts(second, "This is another paragraph.")


class TestHTMLStyledParagraphs(unittest.TestCase):
    """Inline styling inside a paragraph: tags, inline CSS and nesting."""

    def _words_of_single_block(self, html, *, require_paragraph):
        """Parse *html*, assert it yields exactly one block, return its words.

        When *require_paragraph* is true the block must also be a
        ``Paragraph``. The words come back as a list of the entries yielded
        by the block's ``words()``.
        """
        parsed = parse_html_string(html)
        self.assertEqual(len(parsed), 1)
        if require_paragraph:
            self.assertIsInstance(parsed[0], Paragraph)
        return list(parsed[0].words())

    def test_bold_text(self):
        """Words wrapped in <strong> are bold; words outside it are not."""
        entries = self._words_of_single_block(
            "<p>This is <strong>bold text</strong> in a paragraph.</p>",
            require_paragraph=True,
        )
        self.assertEqual(len(entries), 7)  # "This is bold text in a paragraph."

        # Positions 2 and 3 are the two words inside <strong>.
        first_bold, second_bold = entries[2][1], entries[3][1]
        for word, expected_text in ((first_bold, "bold"), (second_bold, "text")):
            self.assertEqual(word.text, expected_text)
            self.assertEqual(word.style.weight, FontWeight.BOLD)

        # The leading word sits outside the <strong> element.
        leading = entries[0][1]
        self.assertEqual(leading.text, "This")
        self.assertNotEqual(leading.style.weight, FontWeight.BOLD)

    def test_italic_text(self):
        """Words wrapped in <em> carry the italic font style."""
        entries = self._words_of_single_block(
            "<p>This is <em>italic text</em> in a paragraph.</p>",
            require_paragraph=True,
        )

        # Positions 2 and 3 are the two words inside <em>.
        first_italic, second_italic = entries[2][1], entries[3][1]
        for word, expected_text in ((first_italic, "italic"), (second_italic, "text")):
            self.assertEqual(word.text, expected_text)
            self.assertEqual(word.style.style, FontStyle.ITALIC)

    def test_underlined_text(self):
        """The first word inside <u> has the underline decoration."""
        entries = self._words_of_single_block(
            "<p>This is <u>underlined text</u> here.</p>",
            require_paragraph=False,
        )
        target = entries[2][1]  # 'underlined'
        self.assertEqual(target.style.decoration, TextDecoration.UNDERLINE)

    def test_strikethrough_text(self):
        """The first word inside <s> has the strikethrough decoration."""
        entries = self._words_of_single_block(
            "<p>This is <s>strikethrough text</s> here.</p>",
            require_paragraph=False,
        )
        target = entries[2][1]  # 'strikethrough'
        self.assertEqual(target.style.decoration, TextDecoration.STRIKETHROUGH)

    def test_span_with_inline_styles(self):
        """A <span> with inline CSS yields words that are both bold and red."""
        entries = self._words_of_single_block(
            '<p>This text is normal, but <span style="color: red; font-weight: bold;">this part is red and bold</span>.</p>',
            require_paragraph=True,
        )

        # Keep only bold words whose text matches one of the span's tokens.
        span_tokens = ["this", "part", "is", "red", "and", "bold"]
        bold_span_words = [
            word
            for _, word in entries
            if word.text in span_tokens and word.style.weight == FontWeight.BOLD
        ]
        self.assertGreater(len(bold_span_words), 0, "Should have bold words in styled span")

        # At least one of those bold words must also be red.
        red_span_words = [
            word for word in bold_span_words if word.style.colour == (255, 0, 0)
        ]
        self.assertGreater(len(red_span_words), 0, "Should have red colored words")

    def test_mixed_formatting(self):
        """Bold, italic and blue words are all found in one mixed paragraph."""
        entries = self._words_of_single_block(
            '<p>This paragraph contains <strong>bold</strong>, <em>italic</em>, <span style="color: blue;">blue</span>, and <mark>highlighted</mark> text all together.</p>',
            require_paragraph=True,
        )

        # Each check pairs a style predicate with its failure message.
        checks = (
            (lambda w: w.style.weight == FontWeight.BOLD, "Should have bold words"),
            (lambda w: w.style.style == FontStyle.ITALIC, "Should have italic words"),
            (lambda w: w.style.colour == (0, 0, 255), "Should have blue colored words"),
        )
        for matches, failure_message in checks:
            matching = [word for _, word in entries if matches(word)]
            self.assertGreater(len(matching), 0, failure_message)

    def test_nested_formatting(self):
        """<em> nested in <strong> yields words that are bold and italic."""
        entries = self._words_of_single_block(
            "<p>This has <strong>bold with <em>italic inside</em></strong> formatting.</p>",
            require_paragraph=False,
        )

        combined = []
        for _, word in entries:
            if word.style.weight == FontWeight.BOLD and word.style.style == FontStyle.ITALIC:
                combined.append(word)
        self.assertGreater(len(combined), 0, "Should have words that are both bold and italic")

    def test_color_variations(self):
        """Hex and named CSS colours both end up as RGB tuples on the words."""
        entries = self._words_of_single_block(
            '<p><span style="color: #ff0000;">Hex red</span> and <span style="color: green;">Named green</span>.</p>',
            require_paragraph=False,
        )
        colours = [word.style.colour for _, word in entries]

        # "#ff0000" is expected as (255, 0, 0).
        hex_red_total = sum(1 for colour in colours if colour == (255, 0, 0))
        self.assertGreater(hex_red_total, 0, "Should have hex red colored words")

        # The named colour "green" is expected as (0, 255, 0).
        green_total = sum(1 for colour in colours if colour == (0, 255, 0))
        self.assertGreater(green_total, 0, "Should have green colored words")


class TestHTMLBlockElements(unittest.TestCase):
    """Test cases for block-level HTML elements."""

    def test_body_element(self):
        """A <body> wrapper is transparent: its two paragraphs are returned."""
        text = "<body><p>Paragraph one.</p><p>Paragraph two.</p></body>"
        blocks = parse_html_string(text)
        self.assertEqual(len(blocks), 2)
        self.assertIsInstance(blocks[0], Paragraph)
        self.assertIsInstance(blocks[1], Paragraph)

    def test_div_container(self):
        """A <div> wrapper is transparent: its two paragraphs are returned."""
        text = "<div><p>First paragraph.</p><p>Second paragraph.</p></div>"
        blocks = parse_html_string(text)
        self.assertEqual(len(blocks), 2)
        self.assertIsInstance(blocks[0], Paragraph)
        self.assertIsInstance(blocks[1], Paragraph)

    def test_headings(self):
        """Each of h1-h6 becomes a Heading with the matching HeadingLevel."""
        text = "<h1>Heading 1</h1><h2>Heading 2</h2><h3>Heading 3</h3><h4>Heading 4</h4><h5>Heading 5</h5><h6>Heading 6</h6>"
        blocks = parse_html_string(text)
        self.assertEqual(len(blocks), 6)

        expected_levels = [HeadingLevel.H1, HeadingLevel.H2, HeadingLevel.H3,
                           HeadingLevel.H4, HeadingLevel.H5, HeadingLevel.H6]

        # The length check above guarantees zip pairs every block with a level.
        for block, expected_level in zip(blocks, expected_levels):
            self.assertIsInstance(block, Heading)
            self.assertEqual(block.level, expected_level)

            words = list(block.words())
            self.assertEqual(len(words), 2)  # "Heading" and number
            self.assertEqual(words[0][1].text, "Heading")

    def test_blockquote(self):
        """A <blockquote> becomes a Quote holding its inner paragraph."""
        text = "<blockquote><p>This is a quoted paragraph.</p></blockquote>"
        blocks = parse_html_string(text)
        self.assertEqual(len(blocks), 1)
        self.assertIsInstance(blocks[0], Quote)

        # Check that the quote contains a paragraph
        quote_blocks = list(blocks[0].blocks())
        self.assertEqual(len(quote_blocks), 1)
        self.assertIsInstance(quote_blocks[0], Paragraph)

    def test_preformatted_code(self):
        """<pre><code> becomes a CodeBlock with at least one line."""
        text = "<pre><code>function hello() {\n  console.log('Hello');\n}</code></pre>"
        blocks = parse_html_string(text)
        self.assertEqual(len(blocks), 1)
        self.assertIsInstance(blocks[0], CodeBlock)

        lines = list(blocks[0].lines())
        self.assertGreater(len(lines), 0)

    def test_unordered_list(self):
        """A <ul> becomes an unordered HList with one item per <li>."""
        text = "<ul><li>First item</li><li>Second item</li><li>Third item</li></ul>"
        blocks = parse_html_string(text)
        self.assertEqual(len(blocks), 1)
        self.assertIsInstance(blocks[0], HList)
        self.assertEqual(blocks[0].style, ListStyle.UNORDERED)

        items = list(blocks[0].items())
        self.assertEqual(len(items), 3)

    def test_ordered_list(self):
        """An <ol> becomes an HList with the ordered list style."""
        text = "<ol><li>First item</li><li>Second item</li><li>Third item</li></ol>"
        blocks = parse_html_string(text)
        self.assertEqual(len(blocks), 1)
        self.assertIsInstance(blocks[0], HList)
        self.assertEqual(blocks[0].style, ListStyle.ORDERED)

    def test_list_with_styled_content(self):
        """Inline styling inside a list item is kept on that item's words."""
        text = "<ul><li>Normal item</li><li><strong>Bold item</strong></li><li>Item with <em>italic</em> text</li></ul>"
        blocks = parse_html_string(text)
        self.assertEqual(len(blocks), 1)
        self.assertIsInstance(blocks[0], HList)

        items = list(blocks[0].items())
        self.assertEqual(len(items), 3)

        # Check second item has bold text.  The item must expose at least one
        # block; asserting this explicitly stops the test from passing
        # without ever inspecting the bold styling.
        second_item_blocks = list(items[1].blocks())
        self.assertTrue(second_item_blocks,
                        "Second list item should contain at least one block")

        words = list(second_item_blocks[0].words())
        bold_words = [w for _, w in words if w.style.weight == FontWeight.BOLD]
        self.assertGreater(len(bold_words), 0)

    def test_table_basic(self):
        """A table with a header row and a data row becomes one Table."""
        text = """
        <table>
            <tr>
                <th>Header 1</th>
                <th>Header 2</th>
            </tr>
            <tr>
                <td>Cell 1</td>
                <td>Cell 2</td>
            </tr>
        </table>
        """
        blocks = parse_html_string(text)
        self.assertEqual(len(blocks), 1)
        self.assertIsInstance(blocks[0], Table)

    def test_semantic_elements(self):
        """<section>/<article> wrappers are transparent containers."""
        text = "<section><article><p>Article content</p></article></section>"
        blocks = parse_html_string(text)
        self.assertEqual(len(blocks), 1)
        self.assertIsInstance(blocks[0], Paragraph)

    def test_nested_block_elements(self):
        """A <div> holding a heading, paragraph and quote yields all three."""
        text = """
        <div>
            <h2>Section Title</h2>
            <p>Some introductory text.</p>
            <blockquote>
                <p>A quoted paragraph.</p>
            </blockquote>
        </div>
        """
        blocks = parse_html_string(text)
        self.assertGreater(len(blocks), 2)

        # Should have at least a heading, paragraph, and quote
        has_heading = any(isinstance(b, Heading) for b in blocks)
        has_paragraph = any(isinstance(b, Paragraph) for b in blocks)
        has_quote = any(isinstance(b, Quote) for b in blocks)

        self.assertTrue(has_heading, "Should contain a heading")
        self.assertTrue(has_paragraph, "Should contain a paragraph")
        self.assertTrue(has_quote, "Should contain a quote")

    def test_empty_elements(self):
        """Empty and whitespace-only elements must parse without raising.

        How many blocks such input produces is deliberately left open, so the
        length checks below only confirm that a sized result is returned.
        """
        text = "<p></p><div></div><span></span>"
        blocks = parse_html_string(text)
        # Empty elements may not create blocks, which is acceptable behavior
        self.assertGreaterEqual(len(blocks), 0)

        # A whitespace-only paragraph is likewise allowed to produce no block;
        # no minimum count is enforced for it.
        text_with_content = "<p> </p>"  # Contains only whitespace
        blocks_with_content = parse_html_string(text_with_content)
        self.assertGreaterEqual(len(blocks_with_content), 0)


class TestHTMLComplexStructures(unittest.TestCase):
    """Larger documents that mix several block and inline features."""

    def test_article_with_mixed_content(self):
        """An article with heading, paragraph, quote and list yields each type."""
        article_html = """
        <article>
            <h1>Article Title</h1>
            <p>This is the <strong>introduction</strong> paragraph with <em>some emphasis</em>.</p>
            <blockquote>
                <p>This is a <span style="color: blue;">quoted section</span> with styling.</p>
            </blockquote>
            <ul>
                <li>First <strong>important</strong> point</li>
                <li>Second point with <code>inline code</code></li>
            </ul>
        </article>
        """
        parsed = parse_html_string(article_html)
        self.assertGreater(len(parsed), 3)

        # Compare by class name so every expected block type is looked up
        # the same way.
        type_names = [block.__class__.__name__ for block in parsed]
        for expected_name in ('Heading', 'Paragraph', 'Quote', 'HList'):
            self.assertIn(expected_name, type_names)

    def test_styled_table_content(self):
        """A table with <thead>/<tbody> and styled cells is a single Table."""
        table_html = """
        <table>
            <thead>
                <tr>
                    <th><strong>Product</strong></th>
                    <th><em>Price</em></th>
                </tr>
            </thead>
            <tbody>
                <tr>
                    <td>Item with <span style="color: red;">red text</span></td>
                    <td><strong>$19.99</strong></td>
                </tr>
            </tbody>
        </table>
        """
        parsed = parse_html_string(table_html)
        self.assertEqual(len(parsed), 1)
        self.assertIsInstance(parsed[0], Table)


class TestHTMLFontRegistryIntegration(unittest.TestCase):
    """HTML extraction combined with a Document's style registry."""

    def setUp(self):
        """Create a fresh document and base font for every test."""
        self.base_font = Font(font_size=16, colour=(0, 0, 0))
        self.doc = Document("Test Document", "en-US")

    def _style_count(self):
        """Return the number of styles currently held by the document registry."""
        registry = self.doc.get_style_registry()
        return registry.get_style_count()

    def _parse_with_document(self, html):
        """Parse *html* using the fixture base font and document."""
        return parse_html_string(html, self.base_font, document=self.doc)

    def test_font_registry_creates_fonts(self):
        """Parsing styled HTML with a document adds styles to its registry."""
        html_content = """
        <div>
            <p>This is <strong>bold text</strong> and <em>italic text</em>.</p>
            <h1>Main Header</h1>
        </div>
        """

        # Registry size before any parsing has happened.
        count_before = self._style_count()

        parsed = self._parse_with_document(html_content)

        # Parsing must have registered at least one additional style.
        count_after = self._style_count()
        self.assertGreater(count_after, count_before,
                           "Should have created styles in registry")

        self.assertGreater(len(parsed), 0, "Should have created blocks")

    def test_font_registry_reuses_fonts(self):
        """Parsing identical HTML twice leaves the registry size unchanged."""
        html_content = """
        <div>
            <p>This is <strong>bold text</strong> and <em>italic text</em>.</p>
            <h1>Main Header</h1>
        </div>
        """

        first_blocks = self._parse_with_document(html_content)
        count_after_first = self._style_count()

        second_blocks = self._parse_with_document(html_content)
        count_after_second = self._style_count()

        # No new styles on the second pass.
        self.assertEqual(count_after_first, count_after_second,
                         "Should reuse existing styles instead of creating new ones")

        # Same number of blocks from both passes.
        self.assertEqual(len(first_blocks), len(second_blocks),
                         "Should create same structure on both parses")

    def test_font_registry_different_styles_create_different_fonts(self):
        """Styles differing in size or colour receive distinct style IDs."""
        # Only the returned IDs are compared; the style objects are ignored.
        small_red_id, _ = self.doc.get_or_create_style(
            font_size=14, color=(255, 0, 0), font_weight=FontWeight.BOLD
        )
        large_red_id, _ = self.doc.get_or_create_style(
            font_size=16, color=(255, 0, 0), font_weight=FontWeight.BOLD
        )
        small_green_id, _ = self.doc.get_or_create_style(
            font_size=14, color=(0, 255, 0), font_weight=FontWeight.BOLD
        )

        self.assertNotEqual(small_red_id, large_red_id, "Different sizes should create different styles")
        self.assertNotEqual(small_red_id, small_green_id, "Different colors should create different styles")
        self.assertNotEqual(large_red_id, small_green_id, "All styles should be different")

        # All three styles must be present in the registry.
        self.assertGreaterEqual(self._style_count(), 3)

    def test_font_registry_integration_with_html_styles(self):
        """Normal, bold, italic and red words all appear after parsing."""
        html_content = """
        <p>Normal text with <strong>bold</strong> and <em>italic</em> and
        <span style="color: red;">red text</span>.</p>
        """

        parsed = self._parse_with_document(html_content)

        # Work on the bare word objects of the first block.
        word_objects = [word for _, word in parsed[0].words()]

        plain = [w for w in word_objects
                 if w.style.weight == FontWeight.NORMAL and w.style.style == FontStyle.NORMAL]
        bold = [w for w in word_objects if w.style.weight == FontWeight.BOLD]
        italic = [w for w in word_objects if w.style.style == FontStyle.ITALIC]
        red = [w for w in word_objects if w.style.colour == (255, 0, 0)]

        self.assertGreater(len(plain), 0, "Should have normal words")
        self.assertGreater(len(bold), 0, "Should have bold words")
        self.assertGreater(len(italic), 0, "Should have italic words")
        self.assertGreater(len(red), 0, "Should have red words")

        # More than one style must have been registered.
        self.assertGreater(self._style_count(), 1,
                           "Should have multiple styles for different formatting")

    def test_font_registry_without_document_context(self):
        """Parsing without a document works and leaves the fixture registry alone."""
        html_content = "<p>This is <strong>bold text</strong>.</p>"

        count_before = self._style_count()

        # No document is passed here, only the base font.
        parsed = parse_html_string(html_content, self.base_font)

        self.assertEqual(len(parsed), 1)
        self.assertIsInstance(parsed[0], Paragraph)

        # The fixture document's registry must be the same size as before.
        count_after = self._style_count()
        self.assertEqual(count_after, count_before,
                         "Document style registry should remain unchanged")

    def test_complex_html_font_reuse(self):
        """Repeated headings and bold runs do not grow the registry on re-parse."""
        html_content = """
        <div>
            <h1>First Header</h1>
            <p>Paragraph with <strong>bold</strong> text.</p>
            <h1>Second Header</h1>
            <p>Another paragraph with <strong>bold</strong> text.</p>
        </div>
        """

        first_blocks = self._parse_with_document(html_content)
        count_after_first = self._style_count()

        second_blocks = self._parse_with_document(html_content)
        count_after_second = self._style_count()

        self.assertEqual(count_after_first, count_after_second,
                         "Styles should be reused for repeated formatting")

        # Both passes must return the same number of blocks.
        self.assertEqual(len(first_blocks), len(second_blocks))

    def test_font_registry_with_nested_styles(self):
        """<em> nested in <strong> yields bold+italic words and several styles."""
        html_content = """
        <p>Text with <strong>bold and <em>bold italic</em> nested</strong> styles.</p>
        """

        parsed = self._parse_with_document(html_content)

        # Collect the words of the first block carrying both attributes.
        combined = []
        for _, word in parsed[0].words():
            if word.style.weight == FontWeight.BOLD and word.style.style == FontStyle.ITALIC:
                combined.append(word)

        self.assertGreater(len(combined), 0,
                           "Should have words with combined bold+italic style")

        # More than one style must have been registered.
        self.assertGreater(self._style_count(), 1,
                           "Should create separate styles for style combinations")


# Run the unittest runner when this module is executed directly as a script.
if __name__ == '__main__':
    unittest.main()