185 lines
6.5 KiB
Python
185 lines
6.5 KiB
Python
"""
|
|
Unit tests for HTML link extraction.
|
|
"""
|
|
|
|
import unittest
|
|
from bs4 import BeautifulSoup
|
|
from pyWebLayout.io.readers.html_extraction import (
|
|
parse_html_string,
|
|
extract_text_content,
|
|
create_base_context
|
|
)
|
|
from pyWebLayout.abstract.inline import LinkedWord
|
|
from pyWebLayout.abstract.functional import LinkType
|
|
from pyWebLayout.abstract.block import Paragraph
|
|
|
|
|
|
class TestHTMLLinkExtraction(unittest.TestCase):
    """Test cases for HTML hyperlink extraction.

    Each test feeds a small HTML snippet to the extraction functions and
    checks which of the resulting words are ``LinkedWord`` instances and
    what ``location`` / ``link_type`` / ``link_title`` they carry.
    """

    def setUp(self):
        """Set up test fixtures: a fresh base context for every test."""
        self.base_context = create_base_context()

    @staticmethod
    def _first_paragraph_words(html):
        """Parse *html* and return the words of its first block as a list.

        Raises ``IndexError`` if parsing produced no blocks, which makes the
        calling test fail instead of silently checking nothing.
        """
        blocks = parse_html_string(html)
        return list(blocks[0].words)

    def test_simple_external_link(self):
        """Test extracting a simple external link."""
        html = '<p>Visit <a href="https://example.com">this site</a> for more.</p>'
        blocks = parse_html_string(html)

        self.assertEqual(len(blocks), 1)
        self.assertIsInstance(blocks[0], Paragraph)

        words = list(blocks[0].words)

        # Should have: "Visit", "this", "site", "for", "more."
        self.assertEqual(len(words), 5)

        # "this" and "site" are the linked words and share the same target.
        for word in words[1:3]:
            self.assertIsInstance(word, LinkedWord)
            self.assertEqual(word.location, "https://example.com")
            self.assertEqual(word.link_type, LinkType.EXTERNAL)

    def test_internal_link(self):
        """Test extracting an internal anchor link."""
        html = '<p>Go to <a href="#section2">section 2</a> below.</p>'
        words = self._first_paragraph_words(html)

        linked_words = [w for w in words if isinstance(w, LinkedWord)]
        self.assertEqual(len(linked_words), 2)  # "section" and "2"

        # Check they're internal links
        for word in linked_words:
            self.assertEqual(word.link_type, LinkType.INTERNAL)
            self.assertEqual(word.location, "#section2")

    def test_multi_word_link(self):
        """Test that multi-word links create separate LinkedWords."""
        html = '<p><a href="/next">click here for next page</a></p>'
        words = self._first_paragraph_words(html)

        # All five words should be LinkedWords pointing at the same target.
        self.assertEqual(len(words), 5)
        for word in words:
            self.assertIsInstance(word, LinkedWord)
            self.assertEqual(word.location, "/next")
            self.assertEqual(word.link_type, LinkType.INTERNAL)

    def test_link_with_title(self):
        """Test extracting link with title attribute."""
        html = '<p><a href="https://example.com" title="Visit Example">click</a></p>'
        words = self._first_paragraph_words(html)

        self.assertEqual(len(words), 1)
        self.assertIsInstance(words[0], LinkedWord)
        self.assertEqual(words[0].link_title, "Visit Example")

    def test_mixed_linked_and_normal_text(self):
        """Test paragraph with both linked and normal text."""
        html = '<p>Some <a href="/page">linked text</a> and normal text.</p>'
        words = self._first_paragraph_words(html)

        # Expected words: "Some", "linked", "text", "and", "normal", "text."
        # Checking the count first keeps the index-based assertions below
        # from raising IndexError or silently skipping the trailing words.
        self.assertEqual(len(words), 6)

        self.assertNotIsInstance(words[0], LinkedWord)  # "Some"
        self.assertIsInstance(words[1], LinkedWord)     # "linked"
        self.assertIsInstance(words[2], LinkedWord)     # "text"
        self.assertNotIsInstance(words[3], LinkedWord)  # "and"
        self.assertNotIsInstance(words[4], LinkedWord)  # "normal"
        self.assertNotIsInstance(words[5], LinkedWord)  # "text."

    def test_link_without_href(self):
        """Test that <a> without href is treated as normal text."""
        html = '<p><a>not a link</a></p>'
        words = self._first_paragraph_words(html)

        # Guard against a vacuous pass: the loop below asserts nothing if
        # no words were extracted at all.
        self.assertTrue(words, "expected at least one word to be extracted")

        # Should be regular Words, not LinkedWords
        for word in words:
            self.assertNotIsInstance(word, LinkedWord)

    def test_javascript_link(self):
        """Test that javascript: links are detected as API type."""
        html = '<p><a href="javascript:alert()">click</a></p>'
        words = self._first_paragraph_words(html)

        self.assertIsInstance(words[0], LinkedWord)
        self.assertEqual(words[0].link_type, LinkType.API)

    def test_nested_formatting_in_link(self):
        """Test link with nested formatting."""
        html = '<p><a href="/page">text with <strong>bold</strong> word</a></p>'
        words = self._first_paragraph_words(html)

        # Guard against a vacuous pass: the loop below asserts nothing if
        # no words were extracted at all.
        self.assertTrue(words, "expected at least one word to be extracted")

        # All should be LinkedWords regardless of formatting
        for word in words:
            self.assertIsInstance(word, LinkedWord)
            self.assertEqual(word.location, "/page")

    def test_multiple_links_in_paragraph(self):
        """Test paragraph with multiple separate links."""
        html = '<p><a href="/page1">first</a> and <a href="/page2">second</a> link</p>'
        words = self._first_paragraph_words(html)

        # Collect (text, target) pairs for every linked word.
        linked_words = [
            (w.text, w.location) for w in words if isinstance(w, LinkedWord)
        ]

        # Should have "first" linked to /page1 and "second" linked to /page2
        self.assertIn(("first", "/page1"), linked_words)
        self.assertIn(("second", "/page2"), linked_words)

    def test_extract_text_content_with_links(self):
        """Test extract_text_content directly with link elements."""
        html = '<span>Visit <a href="https://example.com">our site</a> today</span>'
        soup = BeautifulSoup(html, 'html.parser')
        element = soup.find('span')

        # Reuse the per-test context built in setUp rather than creating a
        # second, identical one here.
        words = extract_text_content(element, self.base_context)

        # Should have: "Visit", "our", "site", "today"
        self.assertEqual(len(words), 4)

        # Check types
        self.assertNotIsInstance(words[0], LinkedWord)  # "Visit"
        self.assertIsInstance(words[1], LinkedWord)     # "our"
        self.assertIsInstance(words[2], LinkedWord)     # "site"
        self.assertNotIsInstance(words[3], LinkedWord)  # "today"
|
|
|
|
|
|
# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()
|