Article content
""" Unit tests for HTML extraction functionality. Tests the HTML parsing and conversion to pyWebLayout abstract elements, including styled content within paragraphs and block-level elements. """ import unittest from pyWebLayout.io.readers.html_extraction import parse_html_string from pyWebLayout.abstract.block import Paragraph, Heading, HeadingLevel, Quote, CodeBlock, HList, ListStyle, Table, Image from pyWebLayout.abstract.document import Document from pyWebLayout.style import Font, FontWeight, FontStyle, TextDecoration class TestHTMLParagraph(unittest.TestCase): """Test cases for basic paragraph parsing.""" def test_simple(self): text = "
This is a paragraph.
" paragraphs = parse_html_string(text) self.assertEqual(len(paragraphs), 1) self.assertEqual(len(paragraphs[0]), 4) for w1, t1 in zip(paragraphs[0].words_iter(), "This is a paragraph.".split(" ")): self.assertEqual(w1[1].text, t1) def test_multiple(self): text = "This is a paragraph.
This is another paragraph.
" paragraphs = parse_html_string(text) self.assertEqual(len(paragraphs), 2) self.assertEqual(len(paragraphs[0]), 4) self.assertEqual(len(paragraphs[1]), 4) for w1, t1 in zip(paragraphs[0].words_iter(), "This is a paragraph.".split(" ")): self.assertEqual(w1[1].text, t1) for w1, t1 in zip(paragraphs[1].words_iter(), "This is another paragraph.".split(" ")): self.assertEqual(w1[1].text, t1) class TestHTMLStyledParagraphs(unittest.TestCase): """Test cases for paragraphs with inline styling.""" def test_bold_text(self): """Test paragraphs with bold text using and tags.""" text = "This is bold text in a paragraph.
" blocks = parse_html_string(text) self.assertEqual(len(blocks), 1) self.assertIsInstance(blocks[0], Paragraph) words = list(blocks[0].words_iter()) self.assertEqual(len(words), 7) # "This is bold text in a paragraph." # Check that 'bold' and 'text' words have bold font weight bold_word = words[2][1] # 'bold' text_word = words[3][1] # 'text' self.assertEqual(bold_word.text, "bold") self.assertEqual(bold_word.style.weight, FontWeight.BOLD) self.assertEqual(text_word.text, "text") self.assertEqual(text_word.style.weight, FontWeight.BOLD) # Check that other words are not bold normal_word = words[0][1] # 'This' self.assertEqual(normal_word.text, "This") self.assertNotEqual(normal_word.style.weight, FontWeight.BOLD) def test_italic_text(self): """Test paragraphs with italic text using and tags.""" text = "This is italic text in a paragraph.
" blocks = parse_html_string(text) self.assertEqual(len(blocks), 1) self.assertIsInstance(blocks[0], Paragraph) words = list(blocks[0].words_iter()) # Check that 'italic' and 'text' words have italic font style italic_word = words[2][1] # 'italic' text_word = words[3][1] # 'text' self.assertEqual(italic_word.text, "italic") self.assertEqual(italic_word.style.style, FontStyle.ITALIC) self.assertEqual(text_word.text, "text") self.assertEqual(text_word.style.style, FontStyle.ITALIC) def test_underlined_text(self): """Test paragraphs with underlined text using tag.""" text = "This is underlined text here.
" blocks = parse_html_string(text) self.assertEqual(len(blocks), 1) words = list(blocks[0].words_iter()) underlined_word = words[2][1] # 'underlined' self.assertEqual(underlined_word.style.decoration, TextDecoration.UNDERLINE) def test_strikethrough_text(self): """Test paragraphs with strikethrough text usingThis is strikethrough text here.
This text is normal, but ' 'this part is red and bold.
' ) blocks = parse_html_string(text) self.assertEqual(len(blocks), 1) self.assertIsInstance(blocks[0], Paragraph) words = list(blocks[0].words_iter()) # Find the styled words styled_words = [] for _, word in words: if word.text in ["this", "part", "is", "red", "and", "bold"]: if word.style.weight == FontWeight.BOLD: styled_words.append(word) self.assertGreater( len(styled_words), 0, "Should have bold words in styled span") # Check that at least one word has the red color red_words = [w for w in styled_words if w.style.colour == (255, 0, 0)] self.assertGreater(len(red_words), 0, "Should have red colored words") def test_mixed_formatting(self): """Test paragraphs with multiple formatting elements combined.""" text = ( 'This paragraph contains bold, italic, ' 'blue, and highlighted ' 'text all together.
' ) blocks = parse_html_string(text) self.assertEqual(len(blocks), 1) self.assertIsInstance(blocks[0], Paragraph) words = list(blocks[0].words_iter()) # Check for bold word bold_words = [w for _, w in words if w.style.weight == FontWeight.BOLD] self.assertGreater(len(bold_words), 0, "Should have bold words") # Check for italic word italic_words = [w for _, w in words if w.style.style == FontStyle.ITALIC] self.assertGreater(len(italic_words), 0, "Should have italic words") # Check for blue colored word blue_words = [w for _, w in words if w.style.colour == (0, 0, 255)] self.assertGreater(len(blue_words), 0, "Should have blue colored words") def test_nested_formatting(self): """Test nested formatting elements.""" text = "This has bold with italic inside formatting.
" blocks = parse_html_string(text) self.assertEqual(len(blocks), 1) words = list(blocks[0].words_iter()) # Find words that should be both bold and italic bold_italic_words = [w for _, w in words if w.style.weight == FontWeight.BOLD and w.style.style == FontStyle.ITALIC] self.assertGreater( len(bold_italic_words), 0, "Should have words that are both bold and italic") def test_color_variations(self): """Test different color formats in CSS.""" text = 'Hex red and Named green.
' blocks = parse_html_string(text) self.assertEqual(len(blocks), 1) words = list(blocks[0].words_iter()) # Check for hex red color hex_red_words = [w for _, w in words if w.style.colour == (255, 0, 0)] self.assertGreater(len(hex_red_words), 0, "Should have hex red colored words") # Check for named green color green_words = [w for _, w in words if w.style.colour == (0, 255, 0)] self.assertGreater(len(green_words), 0, "Should have green colored words") class TestHTMLBlockElements(unittest.TestCase): """Test cases for block-level HTML elements.""" def test_body_element(self): """Test parsing of body element containing other elements.""" text = "Paragraph one.
Paragraph two.
" blocks = parse_html_string(text) self.assertEqual(len(blocks), 2) self.assertIsInstance(blocks[0], Paragraph) self.assertIsInstance(blocks[1], Paragraph) def test_div_container(self): """Test div elements as generic containers.""" text = "First paragraph.
Second paragraph.
" blocks = parse_html_string(text) self.assertEqual(len(blocks), 1) self.assertIsInstance(blocks[0], Quote) # Check that the quote contains a paragraph quote_blocks = list(blocks[0].blocks()) self.assertEqual(len(quote_blocks), 1) self.assertIsInstance(quote_blocks[0], Paragraph) def test_preformatted_code(self): """Test preformatted code blocks.""" text = "This is a quoted paragraph.
function hello() {\n console.log('Hello');\n}"
blocks = parse_html_string(text)
self.assertEqual(len(blocks), 1)
self.assertIsInstance(blocks[0], CodeBlock)
lines = list(blocks[0].lines())
self.assertGreater(len(lines), 0)
def test_unordered_list(self):
"""Test unordered lists."""
text = "| Header 1 | Header 2 |
|---|---|
| Cell 1 | Cell 2 |
Article content
Some introductory text.
A quoted paragraph.
" # Contains whitespace blocks_with_content = parse_html_string(text_with_content) # This should create at least one block since there's whitespace content self.assertGreaterEqual(len(blocks_with_content), 0) class TestHTMLComplexStructures(unittest.TestCase): """Test cases for complex HTML structures combining multiple features.""" def test_article_with_mixed_content(self): """Test a realistic article structure with mixed content.""" text = """
This is the introduction paragraph with some emphasis.
This is a quoted section with styling.
inline code| Product | Price |
|---|---|
| Item with red text | $19.99 |
This is bold text and italic text.
This is bold text and italic text.
Normal text with bold and italic and red text.
""" # Parse content blocks = parse_html_string(html_content, self.base_font, document=self.doc) # Extract all words from the paragraph paragraph = blocks[0] words = list(paragraph.words_iter()) # Find words with different styles normal_words = [w for _, w in words if w.style.weight == FontWeight.NORMAL and w.style.style == FontStyle.NORMAL] bold_words = [w for _, w in words if w.style.weight == FontWeight.BOLD] italic_words = [w for _, w in words if w.style.style == FontStyle.ITALIC] red_words = [w for _, w in words if w.style.colour == (255, 0, 0)] # Should have words with different styles self.assertGreater(len(normal_words), 0, "Should have normal words") self.assertGreater(len(bold_words), 0, "Should have bold words") self.assertGreater(len(italic_words), 0, "Should have italic words") self.assertGreater(len(red_words), 0, "Should have red words") # Style registry should contain multiple styles for different formatting self.assertGreater(self.doc.get_style_registry().get_style_count(), 1, "Should have multiple styles for different formatting") def test_font_registry_without_document_context(self): """Test that parsing without document context works (fallback behavior).""" html_content = "This is bold text.
" # Get initial style count (should include default style) initial_style_count = self.doc.get_style_registry().get_style_count() # Parse without document context blocks = parse_html_string(html_content, self.base_font) # Should still create blocks successfully self.assertEqual(len(blocks), 1) self.assertIsInstance(blocks[0], Paragraph) # Should not affect document's style registry final_style_count = self.doc.get_style_registry().get_style_count() self.assertEqual(final_style_count, initial_style_count, "Document style registry should remain unchanged") def test_complex_html_font_reuse(self): """Test style reuse with complex HTML containing repeated styles.""" html_content = """Paragraph with bold text.
Another paragraph with bold text.
Text with bold and bold italic nested styles.
""" # Parse content blocks = parse_html_string(html_content, self.base_font, document=self.doc) # Should create styles for different style combinations paragraph = blocks[0] words = list(paragraph.words_iter()) # Find words that are both bold and italic bold_italic_words = [w for _, w in words if w.style.weight == FontWeight.BOLD and w.style.style == FontStyle.ITALIC] self.assertGreater(len(bold_italic_words), 0, "Should have words with combined bold+italic style") # Should have multiple styles in registry for different combinations self.assertGreater(self.doc.get_style_registry().get_style_count(), 1, "Should create separate styles for style combinations") class TestHTMLImagesInParagraphs(unittest.TestCase): """Test cases for handling images inside paragraph tags.""" def setUp(self): """Set up test fixtures.""" self.base_font = Font(font_size=14) def test_image_only_paragraph(self): """Test paragraph containing only an image (common in EPUBs).""" html = '


Some text before
and after
Just regular text without any images.
' blocks = parse_html_string(html, base_font=self.base_font) # Should be exactly one Paragraph block self.assertEqual(len(blocks), 1, "Should have exactly one block") self.assertIsInstance(blocks[0], Paragraph, "Should be a Paragraph block") # Should not have any Image blocks image_blocks = [b for b in blocks if isinstance(b, Image)] self.assertEqual(len(image_blocks), 0, "Should have no Image blocks") def test_image_with_width_and_height(self): """Test image parsing with width and height attributes.""" html = '
