From 303179865d27fd75cdad0ae4e4e083a4de231c08 Mon Sep 17 00:00:00 2001 From: Duncan Tourolle Date: Mon, 10 Nov 2025 13:54:36 +0100 Subject: [PATCH] tests for author names and metadata extraction --- tests/io_tests/test_epub_reader.py | 182 ++++++++++++++++++++++++++++- 1 file changed, 178 insertions(+), 4 deletions(-) diff --git a/tests/io_tests/test_epub_reader.py b/tests/io_tests/test_epub_reader.py index 84d0351..8f0f53a 100644 --- a/tests/io_tests/test_epub_reader.py +++ b/tests/io_tests/test_epub_reader.py @@ -18,7 +18,7 @@ except ImportError: EBOOKLIB_AVAILABLE = False from pyWebLayout.io.readers.epub_reader import read_epub, EPUBReader -from pyWebLayout.abstract.document import Book +from pyWebLayout.abstract.document import Book, MetadataType from pyWebLayout.abstract.block import ( Paragraph, Heading, Quote, CodeBlock, HList, ListStyle, Table, Image @@ -523,9 +523,35 @@ class TestEPUBReader(unittest.TestCase): # Check basic metadata self.assertEqual(book.title, "Complex Test Book") - # Check that metadata was set (implementation may vary) - # This tests that the metadata parsing doesn't crash - self.assertIsNotNone(book.title) + # Check author extraction + author = book.get_metadata(MetadataType.AUTHOR) + self.assertIsNotNone(author, "Author metadata should be extracted") + self.assertEqual(author, "Test Author") + + # Check language extraction + language = book.get_metadata(MetadataType.LANGUAGE) + self.assertIsNotNone(language, "Language metadata should be extracted") + self.assertEqual(language, "en") + + # Check description extraction + description = book.get_metadata(MetadataType.DESCRIPTION) + self.assertIsNotNone(description, "Description should be extracted") + self.assertEqual(description, "A test book with complex content") + + # Check publisher extraction + publisher = book.get_metadata(MetadataType.PUBLISHER) + self.assertIsNotNone(publisher, "Publisher should be extracted") + self.assertEqual(publisher, "Test Publisher") + + # Check publication date extraction + pub_date = book.get_metadata(MetadataType.PUBLICATION_DATE) + self.assertIsNotNone(pub_date, "Publication date should be extracted") + self.assertEqual(pub_date, "2024-01-01") + + # Check identifier extraction + identifier = book.get_metadata(MetadataType.IDENTIFIER) + self.assertIsNotNone(identifier, "Identifier should be extracted") + self.assertEqual(identifier, "complex-test-id-456") def test_epub_reader_class_direct(self): """Test EPUBReader class directly.""" @@ -537,6 +563,121 @@ class TestEPUBReader(unittest.TestCase): self.assertIsInstance(book, Book) self.assertEqual(book.title, "Test Book") + # Verify author and language from simple EPUB + author = book.get_metadata(MetadataType.AUTHOR) + self.assertEqual(author, "Test Author", "Author should be extracted") + + language = book.get_metadata(MetadataType.LANGUAGE) + self.assertEqual(language, "en", "Language should be extracted") + + def test_epub_with_different_languages(self): + """Test EPUB with various language codes.""" + test_cases = [ + ("Test French Book", "François Dupont", "fr"), + ("Test German Book", "Hans Mueller", "de"), + ("Test Spanish Book", "Juan García", "es"), + ("Test Japanese Book", "田中太郎", "ja"), + ] + + for title, author, lang_code in test_cases: + with self.subTest(language=lang_code): + book_obj = epub.EpubBook() + book_obj.set_identifier(f'lang-test-{lang_code}') + book_obj.set_title(title) + book_obj.set_language(lang_code) + book_obj.add_author(author) + + chapter = epub.EpubHtml( + title='Chapter', + file_name='chapter.xhtml', + lang=lang_code + ) + chapter.content = f''' + + Chapter + +

Test Chapter

+

Content in {lang_code}.

+ + + ''' + + book_obj.add_item(chapter) + book_obj.toc = (epub.Link("chapter.xhtml", "Chapter", "ch"),) + book_obj.add_item(epub.EpubNcx()) + book_obj.add_item(epub.EpubNav()) + book_obj.spine = ['nav', chapter] + + # Write EPUB + epub_path = os.path.join( + self.test_dir, + f'test_lang_{lang_code}_{len(self.epub_files)}.epub') + epub.write_epub(epub_path, book_obj, {}) + self.epub_files.append(epub_path) + + # Read and verify + parsed_book = read_epub(epub_path) + self.assertEqual(parsed_book.title, title) + + # Verify language is correctly extracted + language = parsed_book.get_metadata(MetadataType.LANGUAGE) + self.assertEqual( + language, lang_code, + f"Language should be {lang_code}") + + # Verify author is correctly extracted + parsed_author = parsed_book.get_metadata(MetadataType.AUTHOR) + self.assertEqual( + parsed_author, author, + f"Author should be {author}") + + def test_epub_with_minimal_metadata(self): + """Test EPUB with minimal metadata (only title, no author/language).""" + book_obj = epub.EpubBook() + + # Set only minimal metadata - no author or language + book_obj.set_identifier('minimal-metadata-test') + book_obj.set_title('Minimal Metadata Book') + + # Simple chapter + chapter = epub.EpubHtml( + title='Chapter', + file_name='chapter.xhtml' + ) + chapter.content = ''' + + Chapter + +

Chapter

+

Test content.

+ + + ''' + + book_obj.add_item(chapter) + book_obj.toc = (epub.Link("chapter.xhtml", "Chapter", "ch"),) + book_obj.add_item(epub.EpubNcx()) + book_obj.add_item(epub.EpubNav()) + book_obj.spine = ['nav', chapter] + + # Write EPUB + epub_path = os.path.join(self.test_dir, + f'test_minimal_{len(self.epub_files)}.epub') + epub.write_epub(epub_path, book_obj, {}) + self.epub_files.append(epub_path) + + # Read and verify + parsed_book = read_epub(epub_path) + self.assertEqual(parsed_book.title, "Minimal Metadata Book") + + # Author should be None or empty when not provided + author = parsed_book.get_metadata(MetadataType.AUTHOR) + # It's ok if author is None when not provided in the EPUB + + # Language might have a default value or be None + language = parsed_book.get_metadata(MetadataType.LANGUAGE) + # Just verify it doesn't crash - language handling may vary + def test_invalid_epub_handling(self): """Test handling of invalid EPUB files.""" # Create a non-EPUB file @@ -608,6 +749,39 @@ class TestEPUBReader(unittest.TestCase): parsed_book = read_epub(epub_path) self.assertEqual(parsed_book.title, "Custom Metadata Test") + # Verify all metadata fields are extracted correctly + # Note: When multiple authors are added with ebooklib, the behavior may vary + # The EPUB reader currently only extracts the first DC:creator element it finds + author = parsed_book.get_metadata(MetadataType.AUTHOR) + self.assertIsNotNone(author, "Author should be extracted") + # Accept either author as valid since multiple author handling may vary + self.assertTrue( + "Author" in author, + f"Author metadata should contain an author name, got: {author}") + + # Verify language + language = parsed_book.get_metadata(MetadataType.LANGUAGE) + self.assertEqual(language, "en", "Language should be 'en'") + + # Verify description + description = parsed_book.get_metadata(MetadataType.DESCRIPTION) + self.assertEqual( + description, + "A comprehensive test of metadata extraction", + "Description should match") + + # Verify publisher + publisher = parsed_book.get_metadata(MetadataType.PUBLISHER) + self.assertEqual(publisher, "Test Publishing House", "Publisher should match") + + # Verify publication date + pub_date = parsed_book.get_metadata(MetadataType.PUBLICATION_DATE) + self.assertEqual(pub_date, "2024-06-07", "Publication date should match") + + # Verify identifier + identifier = parsed_book.get_metadata(MetadataType.IDENTIFIER) + self.assertEqual(identifier, "custom-metadata-test", "Identifier should match") + # Verify chapters were created chapters = list(parsed_book.chapters) self.assertEqual(len(chapters), 1)