""" Unit tests for HTML link extraction. """ import unittest from bs4 import BeautifulSoup from pyWebLayout.io.readers.html_extraction import ( parse_html_string, extract_text_content, create_base_context ) from pyWebLayout.abstract.inline import LinkedWord from pyWebLayout.abstract.functional import LinkType from pyWebLayout.abstract.block import Paragraph class TestHTMLLinkExtraction(unittest.TestCase): """Test cases for HTML hyperlink extraction.""" def setUp(self): """Set up test fixtures.""" self.base_context = create_base_context() def test_simple_external_link(self): """Test extracting a simple external link.""" html = '
Visit this site for more.
' blocks = parse_html_string(html) self.assertEqual(len(blocks), 1) self.assertIsInstance(blocks[0], Paragraph) paragraph = blocks[0] words = list(paragraph.words) # Should have: "Visit", "this", "site", "for", "more." self.assertEqual(len(words), 5) # Check that "this" and "site" are LinkedWords self.assertIsInstance(words[1], LinkedWord) self.assertIsInstance(words[2], LinkedWord) # Check link properties self.assertEqual(words[1].location, "https://example.com") self.assertEqual(words[1].link_type, LinkType.EXTERNAL) self.assertEqual(words[2].location, "https://example.com") self.assertEqual(words[2].link_type, LinkType.EXTERNAL) def test_internal_link(self): """Test extracting an internal anchor link.""" html = 'Go to section 2 below.
' blocks = parse_html_string(html) paragraph = blocks[0] words = list(paragraph.words) # Find LinkedWords linked_words = [w for w in words if isinstance(w, LinkedWord)] self.assertEqual(len(linked_words), 2) # "section" and "2" # Check they're internal links for word in linked_words: self.assertEqual(word.link_type, LinkType.INTERNAL) self.assertEqual(word.location, "#section2") def test_multi_word_link(self): """Test that multi-word links create separate LinkedWords.""" html = '' blocks = parse_html_string(html) paragraph = blocks[0] words = list(paragraph.words) # All words should be LinkedWords self.assertEqual(len(words), 5) for word in words: self.assertIsInstance(word, LinkedWord) self.assertEqual(word.location, "/next") self.assertEqual(word.link_type, LinkType.INTERNAL) def test_link_with_title(self): """Test extracting link with title attribute.""" html = '' blocks = parse_html_string(html) paragraph = blocks[0] words = list(paragraph.words) self.assertEqual(len(words), 1) self.assertIsInstance(words[0], LinkedWord) self.assertEqual(words[0].link_title, "Visit Example") def test_mixed_linked_and_normal_text(self): """Test paragraph with both linked and normal text.""" html = 'Some linked text and normal text.
' blocks = parse_html_string(html) paragraph = blocks[0] words = list(paragraph.words) # "Some" - normal # "linked" - LinkedWord # "text" - LinkedWord # "and" - normal # "normal" - normal # "text." - normal self.assertNotIsInstance(words[0], LinkedWord) # "Some" self.assertIsInstance(words[1], LinkedWord) # "linked" self.assertIsInstance(words[2], LinkedWord) # "text" self.assertNotIsInstance(words[3], LinkedWord) # "and" def test_link_without_href(self): """Test that without href is treated as normal text.""" html = '' blocks = parse_html_string(html) paragraph = blocks[0] words = list(paragraph.words) # Should be regular Words, not LinkedWords for word in words: self.assertNotIsInstance(word, LinkedWord) def test_javascript_link(self): """Test that javascript: links are detected as API type.""" html = '' blocks = parse_html_string(html) paragraph = blocks[0] words = list(paragraph.words) self.assertIsInstance(words[0], LinkedWord) self.assertEqual(words[0].link_type, LinkType.API) def test_nested_formatting_in_link(self): """Test link with nested formatting.""" html = '' blocks = parse_html_string(html) paragraph = blocks[0] words = list(paragraph.words) # All should be LinkedWords regardless of formatting for word in words: self.assertIsInstance(word, LinkedWord) self.assertEqual(word.location, "/page") def test_multiple_links_in_paragraph(self): """Test paragraph with multiple separate links.""" html = '' blocks = parse_html_string(html) paragraph = blocks[0] words = list(paragraph.words) # Find LinkedWords and their locations linked_words = [(w.text, w.location) for w in words if isinstance(w, LinkedWord)] # Should have "first" linked to /page1 and "second" linked to /page2 self.assertIn(("first", "/page1"), linked_words) self.assertIn(("second", "/page2"), linked_words) def test_extract_text_content_with_links(self): """Test extract_text_content directly with link elements.""" html = 'Visit our site today' soup = BeautifulSoup(html, 'html.parser') element = soup.find('span') context = create_base_context() words = extract_text_content(element, context) # Should have: "Visit", "our", "site", "today" self.assertEqual(len(words), 4) # Check types self.assertNotIsInstance(words[0], LinkedWord) # "Visit" self.assertIsInstance(words[1], LinkedWord) # "our" self.assertIsInstance(words[2], LinkedWord) # "site" self.assertNotIsInstance(words[3], LinkedWord) # "today" if __name__ == '__main__': unittest.main()