Article Header
Article content with nested formatting.
""" Unit tests for EPUB reader functionality. Tests the EPUB parsing and conversion to pyWebLayout abstract elements, using ebooklib to generate test EPUB files. """ import unittest import tempfile import os import shutil # Import ebooklib for creating test EPUB files try: from ebooklib import epub EBOOKLIB_AVAILABLE = True except ImportError: EBOOKLIB_AVAILABLE = False from pyWebLayout.io.readers.epub_reader import read_epub, EPUBReader from pyWebLayout.abstract.document import Book from pyWebLayout.abstract.block import ( Paragraph, Heading, Quote, CodeBlock, HList, ListStyle, Table, Image ) from pyWebLayout.style import FontWeight, FontStyle, TextDecoration @unittest.skipUnless(EBOOKLIB_AVAILABLE, "ebooklib not available") class TestEPUBReader(unittest.TestCase): """Test cases for EPUB reader functionality.""" def setUp(self): """Set up test environment.""" self.test_dir = tempfile.mkdtemp() self.epub_files = [] def tearDown(self): """Clean up test environment.""" # Clean up test EPUB files for epub_file in self.epub_files: try: os.remove(epub_file) except OSError: pass # Clean up test directory if os.path.exists(self.test_dir): shutil.rmtree(self.test_dir, ignore_errors=True) def create_simple_epub(self, title="Test Book", author="Test Author"): """Create a simple EPUB file for testing.""" book = epub.EpubBook() # Set metadata book.set_identifier('test-id-123') book.set_title(title) book.set_language('en') book.add_author(author) # Create a simple chapter chapter1 = epub.EpubHtml( title='Chapter 1', file_name='chapter1.xhtml', lang='en' ) chapter1.content = '''
This is the first paragraph of the first chapter.
This is a second paragraph with some formatting.
''' # Add chapter to book book.add_item(chapter1) # Define table of contents book.toc = (epub.Link("chapter1.xhtml", "Chapter 1", "ch1"),) # Add navigation files book.add_item(epub.EpubNcx()) book.add_item(epub.EpubNav()) # Define spine book.spine = ['nav', chapter1] # Create temporary file epub_path = os.path.join(self.test_dir, f'test_simple_{len(self.epub_files)}.epub') epub.write_epub(epub_path, book, {}) self.epub_files.append(epub_path) return epub_path def create_complex_epub(self): """Create a more complex EPUB file with multiple chapters and content types.""" book = epub.EpubBook() # Set metadata book.set_identifier('complex-test-id-456') book.set_title('Complex Test Book') book.set_language('en') book.add_author('Test Author') book.add_metadata('DC', 'description', 'A test book with complex content') book.add_metadata('DC', 'subject', 'Testing') book.add_metadata('DC', 'date', '2024-01-01') book.add_metadata('DC', 'publisher', 'Test Publisher') # Chapter 1: Basic content chapter1 = epub.EpubHtml( title='Introduction', file_name='chapter1.xhtml', lang='en' ) chapter1.content = '''Welcome to this complex test book.
This chapter contains basic content to test paragraph parsing.
''' # Chapter 2: Styled content chapter2 = epub.EpubHtml( title='Styled Content', file_name='chapter2.xhtml', lang='en' ) chapter2.content = '''This chapter contains various bold text, italic text, and colored text.
Text with underline and strikethrough.
Nested formatting: bold with italic inside.
''' # Chapter 3: Lists and quotes chapter3 = epub.EpubHtml( title='Lists and Quotes', file_name='chapter3.xhtml', lang='en' ) chapter3.content = '''''' # Chapter 4: Tables and code chapter4 = epub.EpubHtml( title='Tables and Code', file_name='chapter4.xhtml', lang='en' ) chapter4.content = '''This is a quoted paragraph with some styling.
| Header 1 | Header 2 |
|---|---|
| Cell 1 | Cell 2 with blue text |
| Bold cell | Normal cell |
function test() {
console.log("Hello, world!");
return true;
}
Use the print() function to output text.
Paragraph inside div.
Article content with nested formatting.
This chapter tests metadata extraction.
''' book.add_item(chapter) book.toc = (epub.Link("metadata.xhtml", "Metadata Test", "meta"),) book.add_item(epub.EpubNcx()) book.add_item(epub.EpubNav()) book.spine = ['nav', chapter] # Write and test epub_path = os.path.join(self.test_dir, f'test_metadata_{len(self.epub_files)}.epub') epub.write_epub(epub_path, book, {}) self.epub_files.append(epub_path) # Read and verify parsed_book = read_epub(epub_path) self.assertEqual(parsed_book.title, "Custom Metadata Test") # Verify chapters were created chapters = list(parsed_book.chapters) self.assertEqual(len(chapters), 1) class TestEPUBIntegrationWithHTMLExtraction(unittest.TestCase): """Test cases that specifically verify EPUB reader uses html_extraction properly.""" def setUp(self): """Set up test environment.""" self.test_dir = tempfile.mkdtemp() self.epub_files = [] def tearDown(self): """Clean up test environment.""" for epub_file in self.epub_files: try: os.remove(epub_file) except OSError: pass if os.path.exists(self.test_dir): shutil.rmtree(self.test_dir, ignore_errors=True) @unittest.skipUnless(EBOOKLIB_AVAILABLE, "ebooklib not available") def test_html_extraction_integration(self): """Test that EPUB reader properly uses html_extraction functionality.""" # Create an EPUB that exercises various HTML extraction features book = epub.EpubBook() book.set_identifier('html-extraction-test') book.set_title('HTML Extraction Test') book.set_language('en') book.add_author('Test Author') # Chapter that exercises html_extraction features chapter = epub.EpubHtml( title='HTML Features', file_name='html_features.xhtml', lang='en' ) chapter.content = '''This paragraph has bold, italic, underlined, and styled text.
| Header | Value |
|---|---|
| Blue text | Normal text |
This is a quoted paragraph with bold text.
def test_function():
return "Hello, World!"
Nested formatting: bold with italic nested inside.
Red text, Green hex, Blue underlined.
''' book.add_item(chapter) book.toc = (epub.Link("html_features.xhtml", "HTML Features", "html"),) book.add_item(epub.EpubNcx()) book.add_item(epub.EpubNav()) book.spine = ['nav', chapter] # Write EPUB epub_path = os.path.join(self.test_dir, 'html_extraction_test.epub') epub.write_epub(epub_path, book, {}) self.epub_files.append(epub_path) # Read and analyze parsed_book = read_epub(epub_path) chapters = list(parsed_book.chapters) self.assertEqual(len(chapters), 1) blocks = list(chapters[0].blocks) self.assertGreater(len(blocks), 5) # Should have multiple blocks # Test that we get the expected block types block_types = [type(block).__name__ for block in blocks] self.assertIn('Heading', block_types, "Should have heading blocks") self.assertIn('Paragraph', block_types, "Should have paragraph blocks") self.assertIn('HList', block_types, "Should have list blocks") self.assertIn('Table', block_types, "Should have table blocks") self.assertIn('Quote', block_types, "Should have quote blocks") self.assertIn('CodeBlock', block_types, "Should have code blocks") # Test styled content was preserved styled_content_found = False for block in blocks: if isinstance(block, Paragraph): words = list(block.words_iter()) for _, word in words: if (word.style.weight == FontWeight.BOLD or word.style.style == FontStyle.ITALIC or word.style.decoration == TextDecoration.UNDERLINE or word.style.colour != (0, 0, 0)): styled_content_found = True break if styled_content_found: break self.assertTrue( styled_content_found, "Should find styled content in parsed blocks") # Test specific color parsing red_text_found = False green_text_found = False blue_text_found = False for block in blocks: if isinstance(block, (Paragraph, Table)): if isinstance(block, Paragraph): words = list(block.words_iter()) for _, word in words: if word.style.colour == (255, 0, 0): # Red red_text_found = True elif word.style.colour == (0, 255, 0): # Green green_text_found = True elif word.style.colour == (0, 0, 255): # Blue blue_text_found = True # At least one color should be found (depending on implementation) color_found = red_text_found or green_text_found or blue_text_found self.assertTrue(color_found, "Should find at least one colored text") def test_epub_with_image(self): """Test that images in EPUB are properly parsed.""" book = epub.EpubBook() book.set_identifier('image-test-id') book.set_title('Image Test Book') book.set_language('en') book.add_author('Test Author') # Create minimal JPEG data for testing img_data = ( b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00H\x00H\x00\x00' b'\xff\xdb\x00C\x00\x08\x06\x06\x07\x06\x05\x08\x07\x07\x07\t\t' b'\x08\n\x0c\x14\r\x0c\x0b\x0b\x0c\x19\x12\x13\x0f\x14\x1d\x1a' b'\x1f\x1e\x1d\x1a\x1c\x1c $.\' ",#\x1c\x1c(7),01444\x1f\'9=82<.342' b'\xff\xc0\x00\x11\x08\x00d\x00d\x01\x01\x11\x00\x02\x11\x01\x03' b'\x11\x01\xff\xc4\x00\x14\x00\x01\x00\x00\x00\x00\x00\x00\x00' b'\x00\x00\x00\x00\x00\x00\x00\x00\x08\xff\xc4\x00\x14\x10\x01' b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' b'\x00\xff\xda\x00\x0c\x03\x01\x00\x02\x11\x03\x11\x00\x3f\x00' b'\xaa\xff\xd9' ) # Create an EpubImage item image_item = epub.EpubImage() image_item.id = 'test_img' image_item.file_name = 'images/test_image.jpg' image_item.media_type = 'image/jpeg' image_item.content = img_data # Add image to book book.add_item(image_item) # Create a chapter that references the image chapter = epub.EpubHtml( title='Image Chapter', file_name='image_chapter.xhtml', lang='en' ) chapter.content = '''This chapter contains an image:
Text after the image.
''' book.add_item(chapter) book.toc = (epub.Link("image_chapter.xhtml", "Image Chapter", "img_ch"),) book.add_item(epub.EpubNcx()) book.add_item(epub.EpubNav()) book.spine = ['nav', chapter] # Write EPUB epub_path = os.path.join(self.test_dir, f'test_image_{len(self.epub_files)}.epub') epub.write_epub(epub_path, book, {}) self.epub_files.append(epub_path) # Read and analyze parsed_book = read_epub(epub_path) chapters = list(parsed_book.chapters) self.assertEqual(len(chapters), 1) blocks = list(chapters[0].blocks) self.assertGreater(len(blocks), 0) # Find blocks by type heading_blocks = [block for block in blocks if isinstance(block, Heading)] paragraph_blocks = [block for block in blocks if isinstance(block, Paragraph)] image_blocks = [block for block in blocks if isinstance(block, Image)] # Verify we have the expected blocks self.assertEqual( len(heading_blocks), 1, "Should find exactly one heading block") self.assertGreaterEqual( len(paragraph_blocks), 2, "Should find at least two paragraph blocks") self.assertEqual(len(image_blocks), 1, "Should find exactly one image block") # Verify image properties image_block = image_blocks[0] self.assertEqual(image_block.alt_text, "Test image") self.assertEqual(image_block.width, 300) self.assertEqual(image_block.height, 200) self.assertIn("test_image.jpg", image_block.source) if __name__ == '__main__': unittest.main()