tests for author names and metadata extraction

2025-11-10 13:54:36 +01:00 · 2025-11-10 13:54:36 +01:00 · 303179865d
commit 303179865d
parent fb52178cc6
1 changed file with 178 additions and 4 deletions
--- a/tests/io_tests/test_epub_reader.py
+++ b/tests/io_tests/test_epub_reader.py
@ -18,7 +18,7 @@ except ImportError:
    EBOOKLIB_AVAILABLE = False
 from pyWebLayout.io.readers.epub_reader import read_epub, EPUBReader
-from pyWebLayout.abstract.document import Book
+from pyWebLayout.abstract.document import Book, MetadataType
 from pyWebLayout.abstract.block import (
    Paragraph, Heading, Quote, CodeBlock, HList,
    ListStyle, Table, Image
@ -523,9 +523,35 @@ class TestEPUBReader(unittest.TestCase):
        # Check basic metadata
        self.assertEqual(book.title, "Complex Test Book")
-        # Check that metadata was set (implementation may vary)
+        # Check author extraction
-        # This tests that the metadata parsing doesn't crash
+        author = book.get_metadata(MetadataType.AUTHOR)
-        self.assertIsNotNone(book.title)
+        self.assertIsNotNone(author, "Author metadata should be extracted")
        self.assertEqual(author, "Test Author")
        # Check language extraction
        language = book.get_metadata(MetadataType.LANGUAGE)
        self.assertIsNotNone(language, "Language metadata should be extracted")
        self.assertEqual(language, "en")
        # Check description extraction
        description = book.get_metadata(MetadataType.DESCRIPTION)
        self.assertIsNotNone(description, "Description should be extracted")
        self.assertEqual(description, "A test book with complex content")
        # Check publisher extraction
        publisher = book.get_metadata(MetadataType.PUBLISHER)
        self.assertIsNotNone(publisher, "Publisher should be extracted")
        self.assertEqual(publisher, "Test Publisher")
        # Check publication date extraction
        pub_date = book.get_metadata(MetadataType.PUBLICATION_DATE)
        self.assertIsNotNone(pub_date, "Publication date should be extracted")
        self.assertEqual(pub_date, "2024-01-01")
        # Check identifier extraction
        identifier = book.get_metadata(MetadataType.IDENTIFIER)
        self.assertIsNotNone(identifier, "Identifier should be extracted")
        self.assertEqual(identifier, "complex-test-id-456")
    def test_epub_reader_class_direct(self):
        """Test EPUBReader class directly."""
@ -537,6 +563,121 @@ class TestEPUBReader(unittest.TestCase):
        self.assertIsInstance(book, Book)
        self.assertEqual(book.title, "Test Book")
        # Verify author and language from simple EPUB
        author = book.get_metadata(MetadataType.AUTHOR)
        self.assertEqual(author, "Test Author", "Author should be extracted")
        language = book.get_metadata(MetadataType.LANGUAGE)
        self.assertEqual(language, "en", "Language should be extracted")
    def test_epub_with_different_languages(self):
        """Round-trip EPUBs written in several languages through read_epub.

        For each (title, author, language code) sample, a one-chapter EPUB is
        written into ``self.test_dir`` and read back; the parsed title,
        LANGUAGE metadata and AUTHOR metadata must equal what was written.
        Author names include non-ASCII characters.
        """
        # Each entry: (title, author, language code)
        samples = (
            ("Test French Book", "François Dupont", "fr"),
            ("Test German Book", "Hans Mueller", "de"),
            ("Test Spanish Book", "Juan García", "es"),
            ("Test Japanese Book", "田中太郎", "ja"),
        )

        for want_title, want_author, lang_code in samples:
            # subTest keeps one failing language from hiding the others.
            with self.subTest(language=lang_code):
                src_book = epub.EpubBook()
                src_book.set_identifier(f'lang-test-{lang_code}')
                src_book.set_title(want_title)
                src_book.set_language(lang_code)
                src_book.add_author(want_author)

                page = epub.EpubHtml(
                    title='Chapter', file_name='chapter.xhtml', lang=lang_code)
                page.content = f'''
                <html xmlns="http://www.w3.org/1999/xhtml">
                <head><title>Chapter</title></head>
                <body>
                    <h1>Test Chapter</h1>
                    <p>Content in {lang_code}.</p>
                </body>
                </html>
                '''
                src_book.add_item(page)

                # Navigation: a single TOC link plus the NCX and Nav documents.
                src_book.toc = (epub.Link("chapter.xhtml", "Chapter", "ch"),)
                src_book.add_item(epub.EpubNcx())
                src_book.add_item(epub.EpubNav())
                src_book.spine = ['nav', page]

                # The running count of generated files keeps the name unique.
                file_name = f'test_lang_{lang_code}_{len(self.epub_files)}.epub'
                out_path = os.path.join(self.test_dir, file_name)
                epub.write_epub(out_path, src_book, {})
                self.epub_files.append(out_path)

                # Read the file back and compare against what was written.
                loaded = read_epub(out_path)
                self.assertEqual(loaded.title, want_title)

                found_lang = loaded.get_metadata(MetadataType.LANGUAGE)
                self.assertEqual(
                    found_lang, lang_code, f"Language should be {lang_code}")

                found_author = loaded.get_metadata(MetadataType.AUTHOR)
                self.assertEqual(
                    found_author, want_author,
                    f"Author should be {want_author}")
    def test_epub_with_minimal_metadata(self):
        """Test EPUB with minimal metadata (only title, no author/language).

        Builds a one-chapter EPUB on which only the identifier and title are
        set, writes it into ``self.test_dir``, reads it back with
        ``read_epub`` and checks the title. The AUTHOR and LANGUAGE lookups
        are exercised only to confirm they do not raise.
        """
        book_obj = epub.EpubBook()

        # Set only minimal metadata - no author or language
        book_obj.set_identifier('minimal-metadata-test')
        book_obj.set_title('Minimal Metadata Book')

        # Simple chapter
        chapter = epub.EpubHtml(
            title='Chapter',
            file_name='chapter.xhtml'
        )
        chapter.content = '''
        <html xmlns="http://www.w3.org/1999/xhtml">
        <head><title>Chapter</title></head>
        <body>
            <h1>Chapter</h1>
            <p>Test content.</p>
        </body>
        </html>
        '''
        book_obj.add_item(chapter)
        book_obj.toc = (epub.Link("chapter.xhtml", "Chapter", "ch"),)
        book_obj.add_item(epub.EpubNcx())
        book_obj.add_item(epub.EpubNav())
        book_obj.spine = ['nav', chapter]

        # Write EPUB
        epub_path = os.path.join(self.test_dir,
                                 f'test_minimal_{len(self.epub_files)}.epub')
        epub.write_epub(epub_path, book_obj, {})
        self.epub_files.append(epub_path)

        # Read and verify
        parsed_book = read_epub(epub_path)
        self.assertEqual(parsed_book.title, "Minimal Metadata Book")

        # Author and language were never set on the source book. What the
        # reader returns for them (None, an empty value, or a default) is
        # deliberately left unasserted because handling may vary; the results
        # are not bound to names since nothing reads them. These calls only
        # verify that looking up absent metadata does not crash.
        parsed_book.get_metadata(MetadataType.AUTHOR)
        parsed_book.get_metadata(MetadataType.LANGUAGE)
    def test_invalid_epub_handling(self):
        """Test handling of invalid EPUB files."""
        # Create a non-EPUB file
@ -608,6 +749,39 @@ class TestEPUBReader(unittest.TestCase):
        parsed_book = read_epub(epub_path)
        self.assertEqual(parsed_book.title, "Custom Metadata Test")
        # Verify all metadata fields are extracted correctly
        # Note: When multiple authors are added with ebooklib, the behavior may vary
        # The EPUB reader currently only extracts the first DC:creator element it finds
        author = parsed_book.get_metadata(MetadataType.AUTHOR)
        self.assertIsNotNone(author, "Author should be extracted")
        # Accept either author as valid since multiple author handling may vary
        self.assertTrue(
            "Author" in author,
            f"Author metadata should contain an author name, got: {author}")
        # Verify language
        language = parsed_book.get_metadata(MetadataType.LANGUAGE)
        self.assertEqual(language, "en", "Language should be 'en'")
        # Verify description
        description = parsed_book.get_metadata(MetadataType.DESCRIPTION)
        self.assertEqual(
            description,
            "A comprehensive test of metadata extraction",
            "Description should match")
        # Verify publisher
        publisher = parsed_book.get_metadata(MetadataType.PUBLISHER)
        self.assertEqual(publisher, "Test Publishing House", "Publisher should match")
        # Verify publication date
        pub_date = parsed_book.get_metadata(MetadataType.PUBLICATION_DATE)
        self.assertEqual(pub_date, "2024-06-07", "Publication date should match")
        # Verify identifier
        identifier = parsed_book.get_metadata(MetadataType.IDENTIFIER)
        self.assertEqual(identifier, "custom-metadata-test", "Identifier should match")
        # Verify chapters were created
        chapters = list(parsed_book.chapters)
        self.assertEqual(len(chapters), 1)