# pyWebLayout/tests/io_tests/test_html_links.py

"""
Unit tests for HTML link extraction.
"""

import unittest
from bs4 import BeautifulSoup
from pyWebLayout.io.readers.html_extraction import (
    parse_html_string,
    extract_text_content,
    create_base_context
)
from pyWebLayout.abstract.inline import LinkedWord
from pyWebLayout.abstract.functional import LinkType
from pyWebLayout.abstract.block import Paragraph


class TestHTMLLinkExtraction(unittest.TestCase):
    """Test cases for HTML hyperlink extraction.

    Small HTML fragments are run through ``parse_html_string`` (or through
    ``extract_text_content`` directly) and the resulting words are checked
    for being ``LinkedWord`` instances carrying the expected link metadata.
    """

    def setUp(self):
        """Create a fresh base context for tests calling the extractor directly."""
        self.base_context = create_base_context()

    def _paragraph_words(self, html):
        """Parse *html* and return the words of its first block as a list.

        Fails with an assertion (instead of an ``IndexError``) when nothing
        was parsed, and checks that the first block is a ``Paragraph``.
        """
        blocks = parse_html_string(html)
        self.assertGreaterEqual(
            len(blocks), 1, "parse_html_string returned no blocks")

        paragraph = blocks[0]
        self.assertIsInstance(paragraph, Paragraph)
        return list(paragraph.words)

    def test_simple_external_link(self):
        """Test extracting a simple external link."""
        html = '<p>Visit <a href="https://example.com">this site</a> for more.</p>'
        blocks = parse_html_string(html)

        self.assertEqual(len(blocks), 1)
        self.assertIsInstance(blocks[0], Paragraph)

        paragraph = blocks[0]
        words = list(paragraph.words)

        # Should have: "Visit", "this", "site", "for", "more."
        self.assertEqual(len(words), 5)

        # Check that "this" and "site" are LinkedWords
        self.assertIsInstance(words[1], LinkedWord)
        self.assertIsInstance(words[2], LinkedWord)

        # Check link properties
        self.assertEqual(words[1].location, "https://example.com")
        self.assertEqual(words[1].link_type, LinkType.EXTERNAL)
        self.assertEqual(words[2].location, "https://example.com")
        self.assertEqual(words[2].link_type, LinkType.EXTERNAL)

    def test_internal_link(self):
        """Test extracting an internal anchor link."""
        words = self._paragraph_words(
            '<p>Go to <a href="#section2">section 2</a> below.</p>')

        # Find LinkedWords
        linked_words = [w for w in words if isinstance(w, LinkedWord)]
        self.assertEqual(len(linked_words), 2)  # "section" and "2"

        # Check they're internal links
        for word in linked_words:
            self.assertEqual(word.link_type, LinkType.INTERNAL)
            self.assertEqual(word.location, "#section2")

    def test_multi_word_link(self):
        """Test that multi-word links create separate LinkedWords."""
        words = self._paragraph_words(
            '<p><a href="/next">click here for next page</a></p>')

        # All words should be LinkedWords
        self.assertEqual(len(words), 5)
        for word in words:
            self.assertIsInstance(word, LinkedWord)
            self.assertEqual(word.location, "/next")
            self.assertEqual(word.link_type, LinkType.INTERNAL)

    def test_link_with_title(self):
        """Test extracting link with title attribute."""
        words = self._paragraph_words(
            '<p><a href="https://example.com" title="Visit Example">click</a></p>')

        self.assertEqual(len(words), 1)
        self.assertIsInstance(words[0], LinkedWord)
        self.assertEqual(words[0].link_title, "Visit Example")

    def test_mixed_linked_and_normal_text(self):
        """Test paragraph with both linked and normal text."""
        words = self._paragraph_words(
            '<p>Some <a href="/page">linked text</a> and normal text.</p>')

        # Expected layout:
        # "Some" - normal
        # "linked" - LinkedWord
        # "text" - LinkedWord
        # "and" - normal
        # "normal" - normal
        # "text." - normal
        self.assertEqual(len(words), 6)

        self.assertNotIsInstance(words[0], LinkedWord)  # "Some"
        self.assertIsInstance(words[1], LinkedWord)     # "linked"
        self.assertIsInstance(words[2], LinkedWord)     # "text"
        self.assertNotIsInstance(words[3], LinkedWord)  # "and"
        self.assertNotIsInstance(words[4], LinkedWord)  # "normal"
        self.assertNotIsInstance(words[5], LinkedWord)  # "text."

    def test_link_without_href(self):
        """Test that <a> without href is treated as normal text."""
        words = self._paragraph_words('<p><a>not a link</a></p>')

        # Guard against a vacuous pass: the loop below checks nothing if
        # the anchor text was dropped entirely.
        self.assertGreater(len(words), 0, "anchor text produced no words")

        # Should be regular Words, not LinkedWords
        for word in words:
            self.assertNotIsInstance(word, LinkedWord)

    def test_javascript_link(self):
        """Test that javascript: links are detected as API type."""
        words = self._paragraph_words(
            '<p><a href="javascript:alert()">click</a></p>')

        # Single-word link; check the count before indexing.
        self.assertEqual(len(words), 1)
        self.assertIsInstance(words[0], LinkedWord)
        self.assertEqual(words[0].link_type, LinkType.API)

    def test_nested_formatting_in_link(self):
        """Test link with nested formatting."""
        words = self._paragraph_words(
            '<p><a href="/page">text with <strong>bold</strong> word</a></p>')

        # Guard against a vacuous pass when no words are extracted.
        self.assertGreater(len(words), 0, "link text produced no words")

        # All should be LinkedWords regardless of formatting
        for word in words:
            self.assertIsInstance(word, LinkedWord)
            self.assertEqual(word.location, "/page")

    def test_multiple_links_in_paragraph(self):
        """Test paragraph with multiple separate links."""
        words = self._paragraph_words(
            '<p><a href="/page1">first</a> and <a href="/page2">second</a> link</p>')

        # Find LinkedWords and their locations
        linked_words = [(w.text, w.location)
                        for w in words if isinstance(w, LinkedWord)]

        # Should have "first" linked to /page1 and "second" linked to /page2
        self.assertIn(("first", "/page1"), linked_words)
        self.assertIn(("second", "/page2"), linked_words)

    def test_extract_text_content_with_links(self):
        """Test extract_text_content directly with link elements."""
        html = '<span>Visit <a href="https://example.com">our site</a> today</span>'
        soup = BeautifulSoup(html, 'html.parser')
        element = soup.find('span')

        # Reuse the per-test context built in setUp.
        words = extract_text_content(element, self.base_context)

        # Should have: "Visit", "our", "site", "today"
        self.assertEqual(len(words), 4)

        # Check types
        self.assertNotIsInstance(words[0], LinkedWord)  # "Visit"
        self.assertIsInstance(words[1], LinkedWord)     # "our"
        self.assertIsInstance(words[2], LinkedWord)     # "site"
        self.assertNotIsInstance(words[3], LinkedWord)  # "today"

# Run the tests in this module when the file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()