"""
Unit tests for EPUB reader functionality.

Tests the EPUB parsing and conversion to pyWebLayout abstract elements,
using ebooklib to generate test EPUB files.
"""

import unittest
import tempfile
import os
import shutil

# Import ebooklib for creating test EPUB files
try:
    from ebooklib import epub
    EBOOKLIB_AVAILABLE = True
except ImportError:
    EBOOKLIB_AVAILABLE = False

from pyWebLayout.io.readers.epub_reader import read_epub, EPUBReader
from pyWebLayout.abstract.document import Book, MetadataType
from pyWebLayout.abstract.block import (
    Paragraph, Heading, Quote, CodeBlock, HList, ListStyle, Table, Image
)
from pyWebLayout.style import FontWeight, FontStyle, TextDecoration


# ---------------------------------------------------------------------------
# Fixture markup
#
# Each constant is the <body> content of one test chapter. The markup is kept
# at module level so that <pre> blocks are not polluted by method indentation.
# ---------------------------------------------------------------------------

_SIMPLE_BODY = """\
<h1>Chapter One</h1>
<p>This is the first paragraph of the first chapter.</p>
<p>This is a second paragraph with <strong>some</strong> <em>formatting</em>.</p>"""

_INTRO_BODY = """\
<h1>Introduction</h1>
<p>Welcome to this complex test book.</p>
<p>This chapter contains basic content to test paragraph parsing.</p>"""

_STYLED_BODY = """\
<h1>Styled Content</h1>
<p>This chapter contains various <strong>bold text</strong>, \
<em>italic text</em>, and <span style="color: red;">colored text</span>.</p>
<h2>Subsection</h2>
<p>Text with <u>underline</u> and <s>strikethrough</s>.</p>
<h3>More Formatting</h3>
<p>Nested formatting: <strong>bold with <em>italic inside</em></strong>.</p>"""

_LISTS_BODY = """\
<h1>Lists and Quotes</h1>
<h2>Unordered List</h2>
<ul>
<li>First item</li>
<li><strong>Bold item</strong></li>
<li>Item with <em>italic text</em></li>
</ul>
<h2>Ordered List</h2>
<ol>
<li>First numbered item</li>
<li>Second numbered item</li>
<li>Third numbered item</li>
</ol>
<h2>Quote</h2>
<blockquote>
<p>This is a quoted paragraph with <em>some styling</em>.</p>
</blockquote>"""

_TABLES_BODY = """\
<h1>Tables and Code</h1>
<h2>Simple Table</h2>
<table>
<thead>
<tr><th>Header 1</th><th>Header 2</th></tr>
</thead>
<tbody>
<tr><td>Cell 1</td><td>Cell 2 with <span style="color: blue;">blue text</span></td></tr>
<tr><td><strong>Bold cell</strong></td><td>Normal cell</td></tr>
</tbody>
</table>
<h2>Code Block</h2>
<pre><code>function test() {
    console.log("Hello, world!");
    return true;
}</code></pre>
<h2>Inline Code</h2>
<p>Use the <code>print()</code> function to output text.</p>"""

# The <ul> is required: test_epub_nested_content asserts that at least one
# list is extracted from inside the nested containers.
_NESTED_BODY = """\
<h1>Nested Content Examples</h1>
<div>
<h2>Section in Div</h2>
<p>Paragraph inside div.</p>
<section>
<h3>Subsection</h3>
<article>
<h4>Article Header</h4>
<p>Article content with <strong>nested <em>formatting</em></strong>.</p>
<ul>
<li>List item inside article</li>
<li><strong>Bold list item</strong></li>
</ul>
</article>
</section>
</div>"""

_METADATA_BODY = """\
<h1>Metadata Test Chapter</h1>
<p>This chapter tests metadata extraction.</p>"""

_MINIMAL_BODY = """\
<h1>Chapter</h1>
<p>Test content.</p>"""

_HTML_FEATURES_BODY = """\
<h1>HTML Extraction Test</h1>
<p>This paragraph has <strong>bold</strong>, <em>italic</em>, \
<u>underlined</u>, and <span style="color: #ff0000;">styled</span> text.</p>
<h2>Second Level Heading</h2>
<h3>Third Level Heading</h3>
<ul>
<li>Plain list item</li>
<li><strong>Bold list item</strong></li>
<li>List item with <em>italic text</em></li>
</ul>
<table>
<tr><th>Header</th><th>Value</th></tr>
<tr><td><span style="color: blue;">Blue text</span></td><td>Normal text</td></tr>
</table>
<blockquote>
<p>This is a quoted paragraph with <strong>bold text</strong>.</p>
</blockquote>
<pre><code>def test_function():
    return "Hello, World!"</code></pre>
<p>Nested formatting: <strong>bold with <em>italic nested</em> inside</strong>.</p>
<p><span style="color: red;">Red text</span>, \
<span style="color: #00ff00;">Green hex</span>, \
<span style="color: blue; text-decoration: underline;">Blue underlined</span>.</p>"""

_IMAGE_BODY = """\
<h1>Chapter with Image</h1>
<p>This chapter contains an image:</p>
<img src="images/test_image.jpg" alt="Test image" width="300" height="200" />
<p>Text after the image.</p>"""

# Minimal JPEG data for the image test.
_MINIMAL_JPEG = (
    b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00H\x00H\x00\x00'
    b'\xff\xdb\x00C\x00\x08\x06\x06\x07\x06\x05\x08\x07\x07\x07\t\t'
    b'\x08\n\x0c\x14\r\x0c\x0b\x0b\x0c\x19\x12\x13\x0f\x14\x1d\x1a'
    b'\x1f\x1e\x1d\x1a\x1c\x1c $.\' ",#\x1c\x1c(7),01444\x1f\'9=82<.342'
    b'\xff\xc0\x00\x11\x08\x00d\x00d\x01\x01\x11\x00\x02\x11\x01\x03'
    b'\x11\x01\xff\xc4\x00\x14\x00\x01\x00\x00\x00\x00\x00\x00\x00'
    b'\x00\x00\x00\x00\x00\x00\x00\x00\x08\xff\xc4\x00\x14\x10\x01'
    b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'
    b'\x00\xff\xda\x00\x0c\x03\x01\x00\x02\x11\x03\x11\x00\x3f\x00'
    b'\xaa\xff\xd9'
)

_BLACK = (0, 0, 0)
_PRIMARY_COLOURS = ((255, 0, 0), (0, 255, 0), (0, 0, 255))


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _xhtml(title, body, lang="en"):
    """Wrap *body* markup in a minimal HTML document titled *title*."""
    return (
        "<!DOCTYPE html>\n"
        f'<html lang="{lang}">\n'
        f"<head><title>{title}</title></head>\n"
        f"<body>\n{body}\n</body>\n"
        "</html>"
    )


def _make_book(identifier, title, author="Test Author", language="en"):
    """Return an ``EpubBook`` carrying the four basic metadata fields."""
    book = epub.EpubBook()
    book.set_identifier(identifier)
    book.set_title(title)
    book.set_language(language)
    book.add_author(author)
    return book


def _make_chapter(title, file_name, body, lang="en"):
    """Return an ``EpubHtml`` chapter whose document wraps *body*."""
    chapter = epub.EpubHtml(title=title, file_name=file_name, lang=lang)
    chapter.content = _xhtml(title, body, lang)
    return chapter


def _paragraph_words(blocks):
    """Yield every word of every ``Paragraph`` found in *blocks*."""
    for block in blocks:
        if isinstance(block, Paragraph):
            for _, word in block.words_iter():
                yield word


def _is_styled(word, check_underline=False):
    """Return True if *word* is bold, italic, non-black or (optionally) underlined."""
    style = word.style
    if style.weight == FontWeight.BOLD or style.style == FontStyle.ITALIC:
        return True
    if check_underline and style.decoration == TextDecoration.UNDERLINE:
        return True
    return style.colour != _BLACK


class _EPUBTestCase(unittest.TestCase):
    """Shared fixture: a scratch directory plus a helper that writes EPUBs into it."""

    def setUp(self):
        """Set up test environment."""
        self.test_dir = tempfile.mkdtemp()
        self.epub_files = []

    def tearDown(self):
        """Clean up test environment."""
        # Every generated EPUB lives inside test_dir, so removing the
        # directory (best effort) removes the files as well.
        shutil.rmtree(self.test_dir, ignore_errors=True)

    def _write_book(self, book, chapters, file_name):
        """
        Finish *book* and write it to ``test_dir/file_name``.

        Args:
            book: The ``EpubBook`` to complete; metadata must already be set.
            chapters: Iterable of ``(chapter, toc_uid)`` pairs in reading
                order. Each chapter is added to the book, the table of
                contents and the spine.
            file_name: Name of the EPUB file to create inside ``test_dir``.

        Returns:
            The path of the written EPUB file.
        """
        toc = []
        spine = ['nav']
        for chapter, toc_uid in chapters:
            book.add_item(chapter)
            toc.append(epub.Link(chapter.file_name, chapter.title, toc_uid))
            spine.append(chapter)

        book.toc = tuple(toc)
        book.add_item(epub.EpubNcx())
        book.add_item(epub.EpubNav())
        book.spine = spine

        epub_path = os.path.join(self.test_dir, file_name)
        epub.write_epub(epub_path, book, {})
        self.epub_files.append(epub_path)
        return epub_path


@unittest.skipUnless(EBOOKLIB_AVAILABLE, "ebooklib not available")
class TestEPUBReader(_EPUBTestCase):
    """Test cases for EPUB reader functionality."""

    # -- fixture builders ---------------------------------------------------

    def create_simple_epub(self, title="Test Book", author="Test Author"):
        """Create a simple single-chapter EPUB file and return its path."""
        book = _make_book('test-id-123', title, author=author)
        chapter1 = _make_chapter('Chapter 1', 'chapter1.xhtml', _SIMPLE_BODY)
        return self._write_book(
            book,
            [(chapter1, "ch1")],
            f'test_simple_{len(self.epub_files)}.epub',
        )

    def create_complex_epub(self):
        """Create an EPUB with four chapters covering several content types."""
        book = _make_book('complex-test-id-456', 'Complex Test Book')
        book.add_metadata('DC', 'description', 'A test book with complex content')
        book.add_metadata('DC', 'subject', 'Testing')
        book.add_metadata('DC', 'date', '2024-01-01')
        book.add_metadata('DC', 'publisher', 'Test Publisher')

        chapters = [
            # Chapter 1: Basic content
            (_make_chapter('Introduction', 'chapter1.xhtml', _INTRO_BODY),
             "intro"),
            # Chapter 2: Styled content
            (_make_chapter('Styled Content', 'chapter2.xhtml', _STYLED_BODY),
             "styled"),
            # Chapter 3: Lists and quotes
            (_make_chapter('Lists and Quotes', 'chapter3.xhtml', _LISTS_BODY),
             "lists"),
            # Chapter 4: Tables and code
            (_make_chapter('Tables and Code', 'chapter4.xhtml', _TABLES_BODY),
             "tables"),
        ]
        return self._write_book(
            book, chapters, f'test_complex_{len(self.epub_files)}.epub')

    def create_epub_with_nested_content(self):
        """Create an EPUB whose chapter nests content in div/section/article."""
        book = _make_book('nested-test-id-789', 'Nested Content Test')
        chapter = _make_chapter('Nested Content', 'nested.xhtml', _NESTED_BODY)
        return self._write_book(
            book,
            [(chapter, "nested")],
            f'test_nested_{len(self.epub_files)}.epub',
        )

    def _complex_chapter_blocks(self, index):
        """
        Return the blocks of chapter *index* (0-based) of the complex EPUB.

        Fails the test outright if the chapter does not exist, so that the
        caller's assertions can never be skipped silently.
        """
        book = read_epub(self.create_complex_epub())
        chapters = list(book.chapters)
        self.assertGreater(
            len(chapters), index,
            f"Complex EPUB should have a chapter {index + 1}")
        return list(chapters[index].blocks)

    # -- tests --------------------------------------------------------------

    def test_simple_epub_reading(self):
        """Test reading a simple EPUB file."""
        book = read_epub(self.create_simple_epub())

        self.assertIsInstance(book, Book)
        self.assertEqual(book.title, "Test Book")

        chapters = list(book.chapters)
        self.assertEqual(len(chapters), 1)

        blocks = list(chapters[0].blocks)
        self.assertGreater(len(blocks), 0)

        # Should have a heading and paragraphs
        self.assertTrue(
            any(isinstance(block, Heading) for block in blocks),
            "Should contain at least one heading")
        self.assertTrue(
            any(isinstance(block, Paragraph) for block in blocks),
            "Should contain at least one paragraph")

    def test_complex_epub_reading(self):
        """Test reading a complex EPUB file with multiple chapters."""
        book = read_epub(self.create_complex_epub())

        self.assertIsInstance(book, Book)
        self.assertEqual(book.title, "Complex Test Book")

        chapters = list(book.chapters)
        self.assertEqual(len(chapters), 4)

        for number, chapter in enumerate(chapters, start=1):
            blocks = list(chapter.blocks)
            self.assertGreater(
                len(blocks), 0, f"Chapter {number} should have blocks")
            # Each chapter should start with a heading
            self.assertIsInstance(
                blocks[0], Heading,
                f"Chapter {number} should start with heading")

    def test_epub_styled_content(self):
        """Test that styled content in EPUB is properly parsed."""
        blocks = self._complex_chapter_blocks(1)
        self.assertTrue(
            any(_is_styled(word) for word in _paragraph_words(blocks)),
            "Should find styled words in chapter 2")

    def test_epub_lists(self):
        """Test that lists in EPUB are properly parsed."""
        blocks = self._complex_chapter_blocks(2)
        lists = [block for block in blocks if isinstance(block, HList)]

        unordered = [b for b in lists if b.style == ListStyle.UNORDERED]
        ordered = [b for b in lists if b.style == ListStyle.ORDERED]

        self.assertTrue(unordered, "Should find unordered list in chapter 3")
        self.assertTrue(ordered, "Should find ordered list in chapter 3")
        for hlist in unordered:
            self.assertGreater(
                len(list(hlist.items())), 0,
                "Unordered list should have items")
        for hlist in ordered:
            self.assertGreater(
                len(list(hlist.items())), 0,
                "Ordered list should have items")

        self.assertTrue(
            any(isinstance(block, Quote) for block in blocks),
            "Should find quote in chapter 3")

    def test_epub_tables(self):
        """Test that tables and code blocks in EPUB are properly parsed."""
        blocks = self._complex_chapter_blocks(3)

        tables = [block for block in blocks if isinstance(block, Table)]
        code_blocks = [block for block in blocks if isinstance(block, CodeBlock)]

        self.assertTrue(tables, "Should find table in chapter 4")
        self.assertTrue(code_blocks, "Should find code block in chapter 4")
        for table in tables:
            self.assertGreater(
                len(list(table.all_rows())), 0, "Table should have rows")
        for code_block in code_blocks:
            self.assertGreater(
                len(list(code_block.lines())), 0,
                "Code block should have lines")

    def test_epub_nested_content(self):
        """Test that nested content structures are properly parsed."""
        book = read_epub(self.create_epub_with_nested_content())

        chapters = list(book.chapters)
        self.assertEqual(len(chapters), 1)

        chapter_blocks = list(chapters[0].blocks)
        self.assertGreater(len(chapter_blocks), 0)

        # Should have multiple headings (h1, h2, h3, h4)
        headings = [b for b in chapter_blocks if isinstance(b, Heading)]
        self.assertGreater(
            len(headings), 2,
            "Should have multiple headings from nested content")

        # Should have paragraphs and lists from nested content
        paragraphs = [b for b in chapter_blocks if isinstance(b, Paragraph)]
        lists = [b for b in chapter_blocks if isinstance(b, HList)]
        self.assertGreater(
            len(paragraphs), 0, "Should have paragraphs from nested content")
        self.assertGreater(
            len(lists), 0, "Should have lists from nested content")

    def test_epub_metadata_extraction(self):
        """Test that EPUB metadata is properly extracted."""
        book = read_epub(self.create_complex_epub())

        self.assertEqual(book.title, "Complex Test Book")

        expected = [
            (MetadataType.AUTHOR, "Test Author"),
            (MetadataType.LANGUAGE, "en"),
            (MetadataType.DESCRIPTION, "A test book with complex content"),
            (MetadataType.PUBLISHER, "Test Publisher"),
            (MetadataType.PUBLICATION_DATE, "2024-01-01"),
            (MetadataType.IDENTIFIER, "complex-test-id-456"),
        ]
        for field, value in expected:
            with self.subTest(metadata=field):
                # assertEqual also fails (with a clear diff) when the field
                # was not extracted at all and comes back as None.
                self.assertEqual(book.get_metadata(field), value)

    def test_epub_reader_class_direct(self):
        """Test EPUBReader class directly."""
        reader = EPUBReader(self.create_simple_epub())
        book = reader.read()

        self.assertIsInstance(book, Book)
        self.assertEqual(book.title, "Test Book")

        # Verify author and language from simple EPUB
        self.assertEqual(
            book.get_metadata(MetadataType.AUTHOR), "Test Author",
            "Author should be extracted")
        self.assertEqual(
            book.get_metadata(MetadataType.LANGUAGE), "en",
            "Language should be extracted")

    def test_epub_with_different_languages(self):
        """Test EPUB with various language codes and non-ASCII authors."""
        test_cases = [
            ("Test French Book", "François Dupont", "fr"),
            ("Test German Book", "Hans Mueller", "de"),
            ("Test Spanish Book", "Juan García", "es"),
            ("Test Japanese Book", "田中太郎", "ja"),
        ]

        for title, author, lang_code in test_cases:
            with self.subTest(language=lang_code):
                book_obj = _make_book(
                    f'lang-test-{lang_code}', title,
                    author=author, language=lang_code)
                chapter = _make_chapter(
                    'Chapter', 'chapter.xhtml',
                    f'<h1>Test Chapter</h1>\n<p>Content in {lang_code}.</p>',
                    lang=lang_code)
                epub_path = self._write_book(
                    book_obj, [(chapter, "ch")],
                    f'test_lang_{lang_code}.epub')

                # Each book must round-trip the values written for it.
                parsed_book = read_epub(epub_path)
                self.assertEqual(parsed_book.title, title)
                self.assertEqual(
                    parsed_book.get_metadata(MetadataType.AUTHOR), author)
                self.assertEqual(
                    parsed_book.get_metadata(MetadataType.LANGUAGE), lang_code)

    def test_epub_with_minimal_metadata(self):
        """Test that an EPUB with only identifier and title can be read."""
        book_obj = epub.EpubBook()
        book_obj.set_identifier('minimal-test')
        book_obj.set_title('Minimal Metadata Book')
        # No author and no explicit language on purpose.

        chapter = epub.EpubHtml(title='Chapter', file_name='chapter.xhtml')
        chapter.content = _xhtml('Chapter', _MINIMAL_BODY)
        epub_path = self._write_book(
            book_obj, [(chapter, "ch")],
            f'test_minimal_{len(self.epub_files)}.epub')

        parsed_book = read_epub(epub_path)
        self.assertEqual(parsed_book.title, "Minimal Metadata Book")

        # Author may be None and language may be None or a default; the
        # lookups only need to succeed without raising.
        parsed_book.get_metadata(MetadataType.AUTHOR)
        parsed_book.get_metadata(MetadataType.LANGUAGE)

    def test_invalid_epub_handling(self):
        """Test handling of invalid EPUB files."""
        # Create a non-EPUB file
        invalid_path = os.path.join(self.test_dir, 'invalid.epub')
        with open(invalid_path, 'w', encoding='utf-8') as f:
            f.write("This is not an EPUB file")

        # The reader's exception type is not part of its documented
        # contract, so only require that it refuses the file.
        with self.assertRaises(Exception):
            read_epub(invalid_path)

    def test_nonexistent_epub_handling(self):
        """Test handling of nonexistent EPUB files."""
        nonexistent_path = os.path.join(self.test_dir, 'nonexistent.epub')

        with self.assertRaises(Exception):
            read_epub(nonexistent_path)

    def test_epub_with_custom_metadata(self):
        """Test EPUB with various metadata fields."""
        book = _make_book(
            'custom-metadata-test', 'Custom Metadata Test',
            author='Primary Author')
        book.add_author('Secondary Author')
        book.add_metadata(
            'DC', 'description', 'A comprehensive test of metadata extraction')
        book.add_metadata('DC', 'subject', 'Testing')
        book.add_metadata('DC', 'subject', 'EPUB')
        book.add_metadata('DC', 'date', '2024-06-07')
        book.add_metadata('DC', 'publisher', 'Test Publishing House')
        book.add_metadata('DC', 'rights', 'Public Domain')

        chapter = _make_chapter(
            'Metadata Test', 'metadata.xhtml', _METADATA_BODY)
        epub_path = self._write_book(
            book, [(chapter, "meta")],
            f'test_metadata_{len(self.epub_files)}.epub')

        parsed_book = read_epub(epub_path)
        self.assertEqual(parsed_book.title, "Custom Metadata Test")

        # Note: When multiple authors are added with ebooklib, the behavior
        # may vary. The EPUB reader currently only extracts the first
        # DC:creator element it finds, so accept either author.
        author = parsed_book.get_metadata(MetadataType.AUTHOR)
        self.assertIsNotNone(author, "Author should be extracted")
        self.assertIn(
            "Author", author,
            f"Author metadata should contain an author name, got: {author}")

        self.assertEqual(
            parsed_book.get_metadata(MetadataType.LANGUAGE), "en",
            "Language should be 'en'")
        self.assertEqual(
            parsed_book.get_metadata(MetadataType.DESCRIPTION),
            "A comprehensive test of metadata extraction",
            "Description should match")
        self.assertEqual(
            parsed_book.get_metadata(MetadataType.PUBLISHER),
            "Test Publishing House", "Publisher should match")
        self.assertEqual(
            parsed_book.get_metadata(MetadataType.PUBLICATION_DATE),
            "2024-06-07", "Publication date should match")
        self.assertEqual(
            parsed_book.get_metadata(MetadataType.IDENTIFIER),
            "custom-metadata-test", "Identifier should match")

        # Verify chapters were created
        self.assertEqual(len(list(parsed_book.chapters)), 1)


# The skip is applied to the whole class: every test here builds its fixture
# with ebooklib, not just the first one.
@unittest.skipUnless(EBOOKLIB_AVAILABLE, "ebooklib not available")
class TestEPUBIntegrationWithHTMLExtraction(_EPUBTestCase):
    """Test cases that specifically verify EPUB reader uses html_extraction properly."""

    def test_html_extraction_integration(self):
        """Test that EPUB reader properly uses html_extraction functionality."""
        book = _make_book('html-extraction-test', 'HTML Extraction Test')
        chapter = _make_chapter(
            'HTML Features', 'html_features.xhtml', _HTML_FEATURES_BODY)
        epub_path = self._write_book(
            book, [(chapter, "html")], 'html_extraction_test.epub')

        parsed_book = read_epub(epub_path)
        chapters = list(parsed_book.chapters)
        self.assertEqual(len(chapters), 1)

        blocks = list(chapters[0].blocks)
        self.assertGreater(len(blocks), 5)  # Should have multiple blocks

        # Test that we get the expected block types
        block_types = [type(block).__name__ for block in blocks]
        self.assertIn('Heading', block_types, "Should have heading blocks")
        self.assertIn('Paragraph', block_types, "Should have paragraph blocks")
        self.assertIn('HList', block_types, "Should have list blocks")
        self.assertIn('Table', block_types, "Should have table blocks")
        self.assertIn('Quote', block_types, "Should have quote blocks")
        self.assertIn('CodeBlock', block_types, "Should have code blocks")

        # Test styled content was preserved
        words = list(_paragraph_words(blocks))
        self.assertTrue(
            any(_is_styled(word, check_underline=True) for word in words),
            "Should find styled content in parsed blocks")

        # At least one pure red, green or blue word should be found
        # (which ones depends on the colour parsing implementation).
        self.assertTrue(
            any(word.style.colour in _PRIMARY_COLOURS for word in words),
            "Should find at least one colored text")

    def test_epub_with_image(self):
        """Test that images in EPUB are properly parsed."""
        book = _make_book('image-test-id', 'Image Test Book')

        # Create an EpubImage item and add it to the book
        image_item = epub.EpubImage()
        image_item.id = 'test_img'
        image_item.file_name = 'images/test_image.jpg'
        image_item.media_type = 'image/jpeg'
        image_item.content = _MINIMAL_JPEG
        book.add_item(image_item)

        # Create a chapter that references the image
        chapter = _make_chapter(
            'Image Chapter', 'image_chapter.xhtml', _IMAGE_BODY)
        epub_path = self._write_book(
            book, [(chapter, "img_ch")],
            f'test_image_{len(self.epub_files)}.epub')

        parsed_book = read_epub(epub_path)
        chapters = list(parsed_book.chapters)
        self.assertEqual(len(chapters), 1)

        blocks = list(chapters[0].blocks)
        self.assertGreater(len(blocks), 0)

        # Find blocks by type
        heading_blocks = [b for b in blocks if isinstance(b, Heading)]
        paragraph_blocks = [b for b in blocks if isinstance(b, Paragraph)]
        image_blocks = [b for b in blocks if isinstance(b, Image)]

        # Verify we have the expected blocks
        self.assertEqual(
            len(heading_blocks), 1, "Should find exactly one heading block")
        self.assertGreaterEqual(
            len(paragraph_blocks), 2,
            "Should find at least two paragraph blocks")
        self.assertEqual(
            len(image_blocks), 1, "Should find exactly one image block")

        # Verify image properties
        image_block = image_blocks[0]
        self.assertEqual(image_block.alt_text, "Test image")
        self.assertEqual(image_block.width, 300)
        self.assertEqual(image_block.height, 200)
        self.assertIn("test_image.jpg", image_block.source)


if __name__ == '__main__':
    unittest.main()