# File: pyWebLayout/tests/io_tests/test_html_links.py
#
# Repository viewer metadata: 185 lines, 6.5 KiB, Python

"""
Unit tests for HTML link extraction.
"""
import unittest
from bs4 import BeautifulSoup
from pyWebLayout.io.readers.html_extraction import (
parse_html_string,
extract_text_content,
create_base_context
)
from pyWebLayout.abstract.inline import LinkedWord
from pyWebLayout.abstract.functional import LinkType
from pyWebLayout.abstract.block import Paragraph
class TestHTMLLinkExtraction(unittest.TestCase):
    """Test cases for HTML hyperlink extraction.

    Each test feeds a small HTML snippet through ``parse_html_string`` (or
    ``extract_text_content`` directly) and inspects the resulting words.
    Words that originate inside ``<a href="...">`` are expected to be
    ``LinkedWord`` instances exposing ``location``, ``link_type`` and
    ``link_title``.
    """

    def setUp(self):
        """Set up test fixtures."""
        self.base_context = create_base_context()

    def _paragraph_words(self, html):
        """Parse *html* and return the words of its first block as a list.

        Reports an empty parse result as a regular test failure rather than
        letting ``blocks[0]`` raise ``IndexError``.
        """
        blocks = parse_html_string(html)
        self.assertGreaterEqual(len(blocks), 1, "parser returned no blocks")
        return list(blocks[0].words)

    def test_simple_external_link(self):
        """Test extracting a simple external link."""
        html = '<p>Visit <a href="https://example.com">this site</a> for more.</p>'
        blocks = parse_html_string(html)

        self.assertEqual(len(blocks), 1)
        self.assertIsInstance(blocks[0], Paragraph)

        words = list(blocks[0].words)

        # Should have: "Visit", "this", "site", "for", "more."
        self.assertEqual(len(words), 5)

        # "this" and "site" are the linked words; both carry the same target.
        for index in (1, 2):
            with self.subTest(index=index):
                self.assertIsInstance(words[index], LinkedWord)
                self.assertEqual(words[index].location, "https://example.com")
                self.assertEqual(words[index].link_type, LinkType.EXTERNAL)

    def test_internal_link(self):
        """Test extracting an internal anchor link."""
        words = self._paragraph_words(
            '<p>Go to <a href="#section2">section 2</a> below.</p>'
        )

        linked_words = [w for w in words if isinstance(w, LinkedWord)]
        self.assertEqual(len(linked_words), 2)  # "section" and "2"

        # Check they're internal links
        for index, word in enumerate(linked_words):
            with self.subTest(index=index):
                self.assertEqual(word.link_type, LinkType.INTERNAL)
                self.assertEqual(word.location, "#section2")

    def test_multi_word_link(self):
        """Test that multi-word links create separate LinkedWords."""
        words = self._paragraph_words(
            '<p><a href="/next">click here for next page</a></p>'
        )

        # All five words should be LinkedWords
        self.assertEqual(len(words), 5)
        for index, word in enumerate(words):
            with self.subTest(index=index):
                self.assertIsInstance(word, LinkedWord)
                self.assertEqual(word.location, "/next")
                self.assertEqual(word.link_type, LinkType.INTERNAL)

    def test_link_with_title(self):
        """Test extracting link with title attribute."""
        words = self._paragraph_words(
            '<p><a href="https://example.com" title="Visit Example">click</a></p>'
        )

        self.assertEqual(len(words), 1)
        self.assertIsInstance(words[0], LinkedWord)
        self.assertEqual(words[0].link_title, "Visit Example")

    def test_mixed_linked_and_normal_text(self):
        """Test paragraph with both linked and normal text."""
        words = self._paragraph_words(
            '<p>Some <a href="/page">linked text</a> and normal text.</p>'
        )

        # Expected words: "Some", "linked", "text", "and", "normal", "text."
        self.assertEqual(len(words), 6)

        self.assertNotIsInstance(words[0], LinkedWord)  # "Some"
        self.assertIsInstance(words[1], LinkedWord)  # "linked"
        self.assertIsInstance(words[2], LinkedWord)  # "text"
        self.assertNotIsInstance(words[3], LinkedWord)  # "and"
        # The text after the link must stay unlinked all the way to the end.
        self.assertNotIsInstance(words[4], LinkedWord)  # "normal"
        self.assertNotIsInstance(words[5], LinkedWord)  # "text."

    def test_link_without_href(self):
        """Test that <a> without href is treated as normal text."""
        words = self._paragraph_words('<p><a>not a link</a></p>')

        # Guard against a vacuous pass: the loop below checks nothing if the
        # anchor text was dropped entirely.
        self.assertGreater(len(words), 0, "anchor text produced no words")

        # Should be regular Words, not LinkedWords
        for index, word in enumerate(words):
            with self.subTest(index=index):
                self.assertNotIsInstance(word, LinkedWord)

    def test_javascript_link(self):
        """Test that javascript: links are detected as API type."""
        words = self._paragraph_words(
            '<p><a href="javascript:alert()">click</a></p>'
        )

        self.assertEqual(len(words), 1)
        self.assertIsInstance(words[0], LinkedWord)
        self.assertEqual(words[0].link_type, LinkType.API)

    def test_nested_formatting_in_link(self):
        """Test link with nested formatting."""
        words = self._paragraph_words(
            '<p><a href="/page">text with <strong>bold</strong> word</a></p>'
        )

        # Guard against a vacuous pass when no words are produced.
        self.assertGreater(len(words), 0, "link text produced no words")

        # All should be LinkedWords regardless of formatting
        for index, word in enumerate(words):
            with self.subTest(index=index):
                self.assertIsInstance(word, LinkedWord)
                self.assertEqual(word.location, "/page")

    def test_multiple_links_in_paragraph(self):
        """Test paragraph with multiple separate links."""
        words = self._paragraph_words(
            '<p><a href="/page1">first</a> and <a href="/page2">second</a> link</p>'
        )

        # Collect (text, target) pairs for every linked word.
        linked_words = [
            (w.text, w.location) for w in words if isinstance(w, LinkedWord)
        ]

        # Should have "first" linked to /page1 and "second" linked to /page2
        self.assertIn(("first", "/page1"), linked_words)
        self.assertIn(("second", "/page2"), linked_words)
        # ...and nothing else: "and" / "link" sit outside both anchors.
        self.assertEqual(len(linked_words), 2)

    def test_extract_text_content_with_links(self):
        """Test extract_text_content directly with link elements."""
        html = '<span>Visit <a href="https://example.com">our site</a> today</span>'
        soup = BeautifulSoup(html, 'html.parser')
        element = soup.find('span')

        # Reuse the per-test context built in setUp instead of a second one.
        words = extract_text_content(element, self.base_context)

        # Should have: "Visit", "our", "site", "today"
        self.assertEqual(len(words), 4)

        # Check types
        self.assertNotIsInstance(words[0], LinkedWord)  # "Visit"
        self.assertIsInstance(words[1], LinkedWord)  # "our"
        self.assertIsInstance(words[2], LinkedWord)  # "site"
        self.assertNotIsInstance(words[3], LinkedWord)  # "today"
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()