""" Unit tests for HTML extraction functionality. Tests the HTML parsing and conversion to pyWebLayout abstract elements, including styled content within paragraphs and block-level elements. """ import unittest from pyWebLayout.io.readers.html_extraction import parse_html_string from pyWebLayout.abstract.block import Paragraph, Heading, HeadingLevel, Quote, CodeBlock, HList, ListStyle, Table, Image from pyWebLayout.abstract.document import Document from pyWebLayout.style import Font, FontWeight, FontStyle, TextDecoration class TestHTMLParagraph(unittest.TestCase): """Test cases for basic paragraph parsing.""" def test_simple(self): text = "

This is a paragraph.

" paragraphs = parse_html_string(text) self.assertEqual(len(paragraphs), 1) self.assertEqual(len(paragraphs[0]), 4) for w1, t1 in zip(paragraphs[0].words_iter(), "This is a paragraph.".split(" ")): self.assertEqual(w1[1].text, t1) def test_multiple(self): text = "

This is a paragraph.

This is another paragraph.

" paragraphs = parse_html_string(text) self.assertEqual(len(paragraphs), 2) self.assertEqual(len(paragraphs[0]), 4) self.assertEqual(len(paragraphs[1]), 4) for w1, t1 in zip(paragraphs[0].words_iter(), "This is a paragraph.".split(" ")): self.assertEqual(w1[1].text, t1) for w1, t1 in zip(paragraphs[1].words_iter(), "This is another paragraph.".split(" ")): self.assertEqual(w1[1].text, t1) class TestHTMLStyledParagraphs(unittest.TestCase): """Test cases for paragraphs with inline styling.""" def test_bold_text(self): """Test paragraphs with bold text using and tags.""" text = "
This is bold text in a paragraph.
" blocks = parse_html_string(text) self.assertEqual(len(blocks), 1) self.assertIsInstance(blocks[0], Paragraph) words = list(blocks[0].words_iter()) self.assertEqual(len(words), 7) # "This is bold text in a paragraph." # Check that 'bold' and 'text' words have bold font weight bold_word = words[2][1] # 'bold' text_word = words[3][1] # 'text' self.assertEqual(bold_word.text, "bold") self.assertEqual(bold_word.style.weight, FontWeight.BOLD) self.assertEqual(text_word.text, "text") self.assertEqual(text_word.style.weight, FontWeight.BOLD) # Check that other words are not bold normal_word = words[0][1] # 'This' self.assertEqual(normal_word.text, "This") self.assertNotEqual(normal_word.style.weight, FontWeight.BOLD) def test_italic_text(self): """Test paragraphs with italic text using and tags.""" text = "
This is italic text in a paragraph.
" blocks = parse_html_string(text) self.assertEqual(len(blocks), 1) self.assertIsInstance(blocks[0], Paragraph) words = list(blocks[0].words_iter()) # Check that 'italic' and 'text' words have italic font style italic_word = words[2][1] # 'italic' text_word = words[3][1] # 'text' self.assertEqual(italic_word.text, "italic") self.assertEqual(italic_word.style.style, FontStyle.ITALIC) self.assertEqual(text_word.text, "text") self.assertEqual(text_word.style.style, FontStyle.ITALIC) def test_underlined_text(self): """Test paragraphs with underlined text using tag.""" text = "
This is underlined text here.
" blocks = parse_html_string(text) self.assertEqual(len(blocks), 1) words = list(blocks[0].words_iter()) underlined_word = words[2][1] # 'underlined' self.assertEqual(underlined_word.style.decoration, TextDecoration.UNDERLINE) def test_strikethrough_text(self): """Test paragraphs with strikethrough text using and tags.""" text = "
This is ~~strikethrough text~~ here.
" blocks = parse_html_string(text) self.assertEqual(len(blocks), 1) words = list(blocks[0].words_iter()) strike_word = words[2][1] # 'strikethrough' self.assertEqual(strike_word.style.decoration, TextDecoration.STRIKETHROUGH) def test_span_with_inline_styles(self): """Test paragraphs with span elements containing inline CSS styles.""" text = ( '
This text is normal, but ' 'this part is red and bold.
' ) blocks = parse_html_string(text) self.assertEqual(len(blocks), 1) self.assertIsInstance(blocks[0], Paragraph) words = list(blocks[0].words_iter()) # Find the styled words styled_words = [] for _, word in words: if word.text in ["this", "part", "is", "red", "and", "bold"]: if word.style.weight == FontWeight.BOLD: styled_words.append(word) self.assertGreater( len(styled_words), 0, "Should have bold words in styled span") # Check that at least one word has the red color red_words = [w for w in styled_words if w.style.colour == (255, 0, 0)] self.assertGreater(len(red_words), 0, "Should have red colored words") def test_mixed_formatting(self): """Test paragraphs with multiple formatting elements combined.""" text = ( '
This paragraph contains bold, italic, ' 'blue, and highlighted ' 'text all together.
' ) blocks = parse_html_string(text) self.assertEqual(len(blocks), 1) self.assertIsInstance(blocks[0], Paragraph) words = list(blocks[0].words_iter()) # Check for bold word bold_words = [w for _, w in words if w.style.weight == FontWeight.BOLD] self.assertGreater(len(bold_words), 0, "Should have bold words") # Check for italic word italic_words = [w for _, w in words if w.style.style == FontStyle.ITALIC] self.assertGreater(len(italic_words), 0, "Should have italic words") # Check for blue colored word blue_words = [w for _, w in words if w.style.colour == (0, 0, 255)] self.assertGreater(len(blue_words), 0, "Should have blue colored words") def test_nested_formatting(self): """Test nested formatting elements.""" text = "
This has bold with italic inside formatting.
" blocks = parse_html_string(text) self.assertEqual(len(blocks), 1) words = list(blocks[0].words_iter()) # Find words that should be both bold and italic bold_italic_words = [w for _, w in words if w.style.weight == FontWeight.BOLD and w.style.style == FontStyle.ITALIC] self.assertGreater( len(bold_italic_words), 0, "Should have words that are both bold and italic") def test_color_variations(self): """Test different color formats in CSS.""" text = '
Hex red and Named green.
' blocks = parse_html_string(text) self.assertEqual(len(blocks), 1) words = list(blocks[0].words_iter()) # Check for hex red color hex_red_words = [w for _, w in words if w.style.colour == (255, 0, 0)] self.assertGreater(len(hex_red_words), 0, "Should have hex red colored words") # Check for named green color green_words = [w for _, w in words if w.style.colour == (0, 255, 0)] self.assertGreater(len(green_words), 0, "Should have green colored words") class TestHTMLBlockElements(unittest.TestCase): """Test cases for block-level HTML elements.""" def test_body_element(self): """Test parsing of body element containing other elements.""" text = "
Paragraph one.
Paragraph two.
" blocks = parse_html_string(text) self.assertEqual(len(blocks), 2) self.assertIsInstance(blocks[0], Paragraph) self.assertIsInstance(blocks[1], Paragraph) def test_div_container(self): """Test div elements as generic containers.""" text = "
First paragraph.
Second paragraph.
" blocks = parse_html_string(text) self.assertEqual(len(blocks), 2) self.assertIsInstance(blocks[0], Paragraph) self.assertIsInstance(blocks[1], Paragraph) def test_headings(self): """Test all heading levels h1-h6.""" text = ( "
Heading 1
Heading 2
Heading 3
" "
Heading 4
Heading 5
Heading 6
" ) blocks = parse_html_string(text) self.assertEqual(len(blocks), 6) expected_levels = [HeadingLevel.H1, HeadingLevel.H2, HeadingLevel.H3, HeadingLevel.H4, HeadingLevel.H5, HeadingLevel.H6] for i, block in enumerate(blocks): self.assertIsInstance(block, Heading) self.assertEqual(block.level, expected_levels[i]) words = list(block.words_iter()) self.assertEqual(len(words), 2) # "Heading" and number self.assertEqual(words[0][1].text, "Heading") def test_blockquote(self): """Test blockquote elements.""" text = "
This is a quoted paragraph.
" blocks = parse_html_string(text) self.assertEqual(len(blocks), 1) self.assertIsInstance(blocks[0], Quote) # Check that the quote contains a paragraph quote_blocks = list(blocks[0].blocks()) self.assertEqual(len(quote_blocks), 1) self.assertIsInstance(quote_blocks[0], Paragraph) def test_preformatted_code(self): """Test preformatted code blocks.""" text = "
function hello() {\n console.log('Hello');\n}
" blocks = parse_html_string(text) self.assertEqual(len(blocks), 1) self.assertIsInstance(blocks[0], CodeBlock) lines = list(blocks[0].lines()) self.assertGreater(len(lines), 0) def test_unordered_list(self): """Test unordered lists.""" text = "
First item
Second item
Third item
" blocks = parse_html_string(text) self.assertEqual(len(blocks), 1) self.assertIsInstance(blocks[0], HList) self.assertEqual(blocks[0].style, ListStyle.UNORDERED) items = list(blocks[0].items()) self.assertEqual(len(items), 3) def test_ordered_list(self): """Test ordered lists.""" text = "
First item
Second item
Third item
" blocks = parse_html_string(text) self.assertEqual(len(blocks), 1) self.assertIsInstance(blocks[0], HList) self.assertEqual(blocks[0].style, ListStyle.ORDERED) def test_list_with_styled_content(self): """Test lists containing styled content.""" text = "
Normal item
Bold item
Item with italic text
" blocks = parse_html_string(text) self.assertEqual(len(blocks), 1) self.assertIsInstance(blocks[0], HList) items = list(blocks[0].items()) self.assertEqual(len(items), 3) # Check second item has bold text second_item_blocks = list(items[1].blocks()) if second_item_blocks: words = list(second_item_blocks[0].words_iter()) bold_words = [w for _, w in words if w.style.weight == FontWeight.BOLD] self.assertGreater(len(bold_words), 0) def test_table_basic(self): """Test basic table structure.""" text = """

Header 1 Header 2

Cell 1 Cell 2

""" blocks = parse_html_string(text) self.assertEqual(len(blocks), 1) self.assertIsInstance(blocks[0], Table) def test_semantic_elements(self): """Test semantic HTML5 elements treated as containers.""" text = "
Article content
" blocks = parse_html_string(text) self.assertEqual(len(blocks), 1) self.assertIsInstance(blocks[0], Paragraph) def test_nested_block_elements(self): """Test nested block elements.""" text = """

Section Title

Some introductory text.

A quoted paragraph.

""" blocks = parse_html_string(text) self.assertGreater(len(blocks), 2) # Should have at least a heading, paragraph, and quote has_heading = any(isinstance(b, Heading) for b in blocks) has_paragraph = any(isinstance(b, Paragraph) for b in blocks) has_quote = any(isinstance(b, Quote) for b in blocks) self.assertTrue(has_heading, "Should contain a heading") self.assertTrue(has_paragraph, "Should contain a paragraph") self.assertTrue(has_quote, "Should contain a quote") def test_empty_elements(self): """Test handling of empty elements.""" text = "
" blocks = parse_html_string(text) # Empty elements may not create blocks, which is acceptable behavior self.assertGreaterEqual(len(blocks), 0) # Test that empty paragraph with some content does create a block text_with_content = "

" # Contains whitespace blocks_with_content = parse_html_string(text_with_content) # This should create at least one block since there's whitespace content self.assertGreaterEqual(len(blocks_with_content), 0) class TestHTMLComplexStructures(unittest.TestCase): """Test cases for complex HTML structures combining multiple features.""" def test_article_with_mixed_content(self): """Test a realistic article structure with mixed content.""" text = """

Article Title

This is the introduction paragraph with some emphasis.

This is a quoted section with styling.

First important point

Second point with inline code

""" blocks = parse_html_string(text) self.assertGreater(len(blocks), 3) # Verify we have the expected block types block_types = [type(b).__name__ for b in blocks] self.assertIn('Heading', block_types) self.assertIn('Paragraph', block_types) self.assertIn('Quote', block_types) self.assertIn('HList', block_types) def test_styled_table_content(self): """Test table with styled cell content.""" text = """

Product Price

Item with red text $19.99

""" blocks = parse_html_string(text) self.assertEqual(len(blocks), 1) self.assertIsInstance(blocks[0], Table) class TestHTMLFontRegistryIntegration(unittest.TestCase): """Test cases for font registry integration with HTML extraction.""" def setUp(self): """Set up test fixtures.""" self.doc = Document("Test Document", "en-US") self.base_font = Font(font_size=16, colour=(0, 0, 0)) def test_font_registry_creates_fonts(self): """Test that HTML parsing with document context creates fonts in registry.""" html_content = """

This is bold text and italic text.

Main Header

""" # Initially empty style registry initial_style_count = self.doc.get_style_registry().get_style_count() # Parse HTML with document context blocks = parse_html_string(html_content, self.base_font, document=self.doc) # Should have created styles for different formatting final_style_count = self.doc.get_style_registry().get_style_count() self.assertGreater(final_style_count, initial_style_count, "Should have created styles in registry") # Should have created blocks self.assertGreater(len(blocks), 0, "Should have created blocks") def test_font_registry_reuses_fonts(self): """Test that parsing same content reuses existing styles.""" html_content = """

This is bold text and italic text.

Main Header

""" # First parse blocks1 = parse_html_string(html_content, self.base_font, document=self.doc) first_parse_style_count = self.doc.get_style_registry().get_style_count() # Second parse with same content blocks2 = parse_html_string(html_content, self.base_font, document=self.doc) second_parse_style_count = self.doc.get_style_registry().get_style_count() # Style count should not increase on second parse self.assertEqual(first_parse_style_count, second_parse_style_count, "Should reuse existing styles instead of creating new ones") # Both parses should create same number of blocks self.assertEqual(len(blocks1), len(blocks2), "Should create same structure on both parses") def test_font_registry_different_styles_create_different_fonts(self): """Test that different styles create different style objects.""" # Create styles with different properties style_id1, style1 = self.doc.get_or_create_style( font_size=14, color=(255, 0, 0), font_weight=FontWeight.BOLD ) style_id2, style2 = self.doc.get_or_create_style( font_size=16, color=(255, 0, 0), font_weight=FontWeight.BOLD ) style_id3, style3 = self.doc.get_or_create_style( font_size=14, color=(0, 255, 0), font_weight=FontWeight.BOLD ) # Should be different style IDs self.assertNotEqual( style_id1, style_id2, "Different sizes should create different styles") self.assertNotEqual( style_id1, style_id3, "Different colors should create different styles") self.assertNotEqual(style_id2, style_id3, "All styles should be different") # Should have multiple styles in registry self.assertGreaterEqual(self.doc.get_style_registry().get_style_count(), 3) def test_font_registry_integration_with_html_styles(self): """Test that HTML parsing uses style registry for styled content.""" html_content = """
Normal text with bold and italic and red text.
""" # Parse content blocks = parse_html_string(html_content, self.base_font, document=self.doc) # Extract all words from the paragraph paragraph = blocks[0] words = list(paragraph.words_iter()) # Find words with different styles normal_words = [w for _, w in words if w.style.weight == FontWeight.NORMAL and w.style.style == FontStyle.NORMAL] bold_words = [w for _, w in words if w.style.weight == FontWeight.BOLD] italic_words = [w for _, w in words if w.style.style == FontStyle.ITALIC] red_words = [w for _, w in words if w.style.colour == (255, 0, 0)] # Should have words with different styles self.assertGreater(len(normal_words), 0, "Should have normal words") self.assertGreater(len(bold_words), 0, "Should have bold words") self.assertGreater(len(italic_words), 0, "Should have italic words") self.assertGreater(len(red_words), 0, "Should have red words") # Style registry should contain multiple styles for different formatting self.assertGreater(self.doc.get_style_registry().get_style_count(), 1, "Should have multiple styles for different formatting") def test_font_registry_without_document_context(self): """Test that parsing without document context works (fallback behavior).""" html_content = "
This is bold text.
" # Get initial style count (should include default style) initial_style_count = self.doc.get_style_registry().get_style_count() # Parse without document context blocks = parse_html_string(html_content, self.base_font) # Should still create blocks successfully self.assertEqual(len(blocks), 1) self.assertIsInstance(blocks[0], Paragraph) # Should not affect document's style registry final_style_count = self.doc.get_style_registry().get_style_count() self.assertEqual(final_style_count, initial_style_count, "Document style registry should remain unchanged") def test_complex_html_font_reuse(self): """Test style reuse with complex HTML containing repeated styles.""" html_content = """

First Header

Paragraph with bold text.

Second Header

Another paragraph with bold text.

""" # Parse content blocks = parse_html_string(html_content, self.base_font, document=self.doc) style_count_after_parse = self.doc.get_style_registry().get_style_count() # Parse same content again blocks2 = parse_html_string(html_content, self.base_font, document=self.doc) style_count_after_second_parse = self.doc.get_style_registry().get_style_count() # Style count should not increase on second parse self.assertEqual(style_count_after_parse, style_count_after_second_parse, "Styles should be reused for repeated formatting") # Both should create same structure self.assertEqual(len(blocks), len(blocks2)) def test_font_registry_with_nested_styles(self): """Test style registry with nested HTML styles.""" html_content = """
Text with bold and bold italic nested styles.
""" # Parse content blocks = parse_html_string(html_content, self.base_font, document=self.doc) # Should create styles for different style combinations paragraph = blocks[0] words = list(paragraph.words_iter()) # Find words that are both bold and italic bold_italic_words = [w for _, w in words if w.style.weight == FontWeight.BOLD and w.style.style == FontStyle.ITALIC] self.assertGreater(len(bold_italic_words), 0, "Should have words with combined bold+italic style") # Should have multiple styles in registry for different combinations self.assertGreater(self.doc.get_style_registry().get_style_count(), 1, "Should create separate styles for style combinations") class TestHTMLImagesInParagraphs(unittest.TestCase): """Test cases for handling images inside paragraph tags.""" def setUp(self): """Set up test fixtures.""" self.base_font = Font(font_size=14) def test_image_only_paragraph(self): """Test paragraph containing only an image (common in EPUBs).""" html = '
' blocks = parse_html_string(html, base_font=self.base_font) # Should parse as an Image block, not a Paragraph self.assertGreater(len(blocks), 0, "Should parse at least one block") # Check that we have an Image block image_blocks = [b for b in blocks if isinstance(b, Image)] self.assertGreater(len(image_blocks), 0, "Should have at least one Image block") # Verify image properties img = image_blocks[0] self.assertEqual(img.source, "cover.jpg") self.assertEqual(img.alt_text, "Book Cover") def test_paragraph_with_multiple_images(self): """Test paragraph with multiple images.""" html = '
' blocks = parse_html_string(html, base_font=self.base_font) # Should have multiple Image blocks image_blocks = [b for b in blocks if isinstance(b, Image)] self.assertEqual(len(image_blocks), 2, "Should have two Image blocks") # Verify both images were parsed sources = [img.source for img in image_blocks] self.assertIn("img1.jpg", sources) self.assertIn("img2.jpg", sources) def test_paragraph_with_text_and_image(self): """Test paragraph with mixed text and image content.""" html = '
Some text before and after
' blocks = parse_html_string(html, base_font=self.base_font) # Should have both paragraph and image blocks paragraphs = [b for b in blocks if isinstance(b, Paragraph)] images = [b for b in blocks if isinstance(b, Image)] self.assertGreater(len(paragraphs), 0, "Should have a Paragraph block for text") self.assertGreater(len(images), 0, "Should have an Image block") # Verify image was parsed self.assertEqual(images[0].source, "inline.jpg") # Verify text was extracted (should have words like "Some", "text", etc.) if paragraphs: words = list(paragraphs[0].words_iter()) self.assertGreater(len(words), 0, "Paragraph should have words") def test_regular_paragraph_still_works(self): """Test that regular paragraphs without images still work correctly.""" html = '
Just regular text without any images.
' blocks = parse_html_string(html, base_font=self.base_font) # Should be exactly one Paragraph block self.assertEqual(len(blocks), 1, "Should have exactly one block") self.assertIsInstance(blocks[0], Paragraph, "Should be a Paragraph block") # Should not have any Image blocks image_blocks = [b for b in blocks if isinstance(b, Image)] self.assertEqual(len(image_blocks), 0, "Should have no Image blocks") def test_image_with_width_and_height(self): """Test image parsing with width and height attributes.""" html = '
' blocks = parse_html_string(html, base_font=self.base_font) # Should have an Image block image_blocks = [b for b in blocks if isinstance(b, Image)] self.assertEqual(len(image_blocks), 1, "Should have one Image block") # Verify dimensions were parsed img = image_blocks[0] self.assertEqual(img.width, 400) self.assertEqual(img.height, 300) def test_nested_paragraph_with_image_in_span(self): """Test image inside nested inline elements.""" html = '
' blocks = parse_html_string(html, base_font=self.base_font) # Should still extract the image image_blocks = [b for b in blocks if isinstance(b, Image)] self.assertGreater(len(image_blocks), 0, "Should find image even when nested") # Verify image was parsed correctly self.assertEqual(image_blocks[0].source, "nested.jpg") if __name__ == '__main__': unittest.main()