From 303179865d27fd75cdad0ae4e4e083a4de231c08 Mon Sep 17 00:00:00 2001
From: Duncan Tourolle <duncan@tourolle.paris>
Date: Mon, 10 Nov 2025 13:54:36 +0100
Subject: [PATCH] tests for author names and metadata extraction

---
 tests/io_tests/test_epub_reader.py | 182 ++++++++++++++++++++++++++++-
 1 file changed, 178 insertions(+), 4 deletions(-)

diff --git a/tests/io_tests/test_epub_reader.py b/tests/io_tests/test_epub_reader.py
index 84d0351..8f0f53a 100644
--- a/tests/io_tests/test_epub_reader.py
+++ b/tests/io_tests/test_epub_reader.py
@@ -18,7 +18,7 @@ except ImportError:
     EBOOKLIB_AVAILABLE = False
 
 from pyWebLayout.io.readers.epub_reader import read_epub, EPUBReader
-from pyWebLayout.abstract.document import Book
+from pyWebLayout.abstract.document import Book, MetadataType
 from pyWebLayout.abstract.block import (
     Paragraph, Heading, Quote, CodeBlock, HList,
     ListStyle, Table, Image
@@ -523,9 +523,35 @@ class TestEPUBReader(unittest.TestCase):
         # Check basic metadata
         self.assertEqual(book.title, "Complex Test Book")
 
-        # Check that metadata was set (implementation may vary)
-        # This tests that the metadata parsing doesn't crash
-        self.assertIsNotNone(book.title)
+        # Check author extraction
+        author = book.get_metadata(MetadataType.AUTHOR)
+        self.assertIsNotNone(author, "Author metadata should be extracted")
+        self.assertEqual(author, "Test Author")
+
+        # Check language extraction
+        language = book.get_metadata(MetadataType.LANGUAGE)
+        self.assertIsNotNone(language, "Language metadata should be extracted")
+        self.assertEqual(language, "en")
+
+        # Check description extraction
+        description = book.get_metadata(MetadataType.DESCRIPTION)
+        self.assertIsNotNone(description, "Description should be extracted")
+        self.assertEqual(description, "A test book with complex content")
+
+        # Check publisher extraction
+        publisher = book.get_metadata(MetadataType.PUBLISHER)
+        self.assertIsNotNone(publisher, "Publisher should be extracted")
+        self.assertEqual(publisher, "Test Publisher")
+
+        # Check publication date extraction
+        pub_date = book.get_metadata(MetadataType.PUBLICATION_DATE)
+        self.assertIsNotNone(pub_date, "Publication date should be extracted")
+        self.assertEqual(pub_date, "2024-01-01")
+
+        # Check identifier extraction
+        identifier = book.get_metadata(MetadataType.IDENTIFIER)
+        self.assertIsNotNone(identifier, "Identifier should be extracted")
+        self.assertEqual(identifier, "complex-test-id-456")
 
     def test_epub_reader_class_direct(self):
         """Test EPUBReader class directly."""
@@ -537,6 +563,121 @@ class TestEPUBReader(unittest.TestCase):
         self.assertIsInstance(book, Book)
         self.assertEqual(book.title, "Test Book")
 
+        # Verify author and language from simple EPUB
+        author = book.get_metadata(MetadataType.AUTHOR)
+        self.assertEqual(author, "Test Author", "Author should be extracted")
+
+        language = book.get_metadata(MetadataType.LANGUAGE)
+        self.assertEqual(language, "en", "Language should be extracted")
+
+    def test_epub_with_different_languages(self):
+        """Round-trip title, author and language for several locales.
+
+        Every case is written with epub.write_epub and read back via read_epub.
+        """
+        # (title, author, language code) triples; some authors are non-ASCII
+        locale_samples = (
+            ("Test French Book", "François Dupont", "fr"),
+            ("Test German Book", "Hans Mueller", "de"),
+            ("Test Spanish Book", "Juan García", "es"),
+            ("Test Japanese Book", "田中太郎", "ja"),
+        )
+
+        for title, author, lang_code in locale_samples:
+            with self.subTest(language=lang_code):
+                # Assemble a single-chapter book tagged with this locale
+                source = epub.EpubBook()
+                source.set_identifier(f'lang-test-{lang_code}')
+                source.set_title(title)
+                source.set_language(lang_code)
+                source.add_author(author)
+
+                page = epub.EpubHtml(
+                    title='Chapter', file_name='chapter.xhtml', lang=lang_code)
+                page.content = f'''
+                <html xmlns="http://www.w3.org/1999/xhtml">
+                <head><title>Chapter</title></head>
+                <body>
+                    <h1>Test Chapter</h1>
+                    <p>Content in {lang_code}.</p>
+                </body>
+                </html>
+                '''
+
+                source.add_item(page)
+                source.toc = (epub.Link("chapter.xhtml", "Chapter", "ch"),)
+                source.add_item(epub.EpubNcx())
+                source.add_item(epub.EpubNav())
+                source.spine = ['nav', page]
+
+                # Persist the book; the running file count keeps names unique
+                # and the path is recorded in self.epub_files.
+                file_name = f'test_lang_{lang_code}_{len(self.epub_files)}.epub'
+                target = os.path.join(self.test_dir, file_name)
+                epub.write_epub(target, source, {})
+                self.epub_files.append(target)
+
+                # Parse the file back and compare against what was written
+                loaded = read_epub(target)
+                self.assertEqual(loaded.title, title)
+
+                # The language code must survive the round trip
+                self.assertEqual(
+                    loaded.get_metadata(MetadataType.LANGUAGE), lang_code,
+                    f"Language should be {lang_code}")
+
+                # The author name (including non-ASCII text) must survive too
+                self.assertEqual(
+                    loaded.get_metadata(MetadataType.AUTHOR), author,
+                    f"Author should be {author}")
+
+    def test_epub_with_minimal_metadata(self):
+        """Test EPUB with minimal metadata (only title, no author/language)."""
+        book_obj = epub.EpubBook()
+
+        # Only identifier and title are set; add_author/set_language are skipped
+        book_obj.set_identifier('minimal-metadata-test')
+        book_obj.set_title('Minimal Metadata Book')
+
+        # Single chapter so the written EPUB has readable content
+        chapter = epub.EpubHtml(
+            title='Chapter',
+            file_name='chapter.xhtml'
+        )
+        chapter.content = '''
+        <html xmlns="http://www.w3.org/1999/xhtml">
+        <head><title>Chapter</title></head>
+        <body>
+            <h1>Chapter</h1>
+            <p>Test content.</p>
+        </body>
+        </html>
+        '''
+
+        book_obj.add_item(chapter)
+        book_obj.toc = (epub.Link("chapter.xhtml", "Chapter", "ch"),)
+        book_obj.add_item(epub.EpubNcx())
+        book_obj.add_item(epub.EpubNav())
+        book_obj.spine = ['nav', chapter]
+
+        # Write EPUB (the running file count keeps the name unique)
+        epub_path = os.path.join(self.test_dir,
+                                 f'test_minimal_{len(self.epub_files)}.epub')
+        epub.write_epub(epub_path, book_obj, {})
+        self.epub_files.append(epub_path)
+
+        # Read and verify: parsing must succeed and the title must round-trip
+        parsed_book = read_epub(epub_path)
+        self.assertEqual(parsed_book.title, "Minimal Metadata Book")
+
+        # No author was added above, so it must come back as None or empty
+        author = parsed_book.get_metadata(MetadataType.AUTHOR)
+        self.assertFalse(author, f"Expected no author, got: {author!r}")
+
+        # Language may be None or a default value, but never a non-string
+        language = parsed_book.get_metadata(MetadataType.LANGUAGE)
+        self.assertIsInstance(language, (str, type(None)))
+
     def test_invalid_epub_handling(self):
         """Test handling of invalid EPUB files."""
         # Create a non-EPUB file
@@ -608,6 +749,39 @@ class TestEPUBReader(unittest.TestCase):
         parsed_book = read_epub(epub_path)
         self.assertEqual(parsed_book.title, "Custom Metadata Test")
 
+        # Verify all metadata fields are extracted correctly
+        # Note: when multiple authors are added with ebooklib, behavior may vary.
+        # The EPUB reader currently extracts only the first dc:creator element it finds.
+        author = parsed_book.get_metadata(MetadataType.AUTHOR)
+        self.assertIsNotNone(author, "Author should be extracted")
+        # Accept either author as valid since multiple author handling may vary
+        self.assertTrue(
+            "Author" in author,
+            f"Author metadata should contain an author name, got: {author}")
+
+        # Verify language
+        language = parsed_book.get_metadata(MetadataType.LANGUAGE)
+        self.assertEqual(language, "en", "Language should be 'en'")
+
+        # Verify description
+        description = parsed_book.get_metadata(MetadataType.DESCRIPTION)
+        self.assertEqual(
+            description,
+            "A comprehensive test of metadata extraction",
+            "Description should match")
+
+        # Verify publisher
+        publisher = parsed_book.get_metadata(MetadataType.PUBLISHER)
+        self.assertEqual(publisher, "Test Publishing House", "Publisher should match")
+
+        # Verify publication date
+        pub_date = parsed_book.get_metadata(MetadataType.PUBLICATION_DATE)
+        self.assertEqual(pub_date, "2024-06-07", "Publication date should match")
+
+        # Verify identifier
+        identifier = parsed_book.get_metadata(MetadataType.IDENTIFIER)
+        self.assertEqual(identifier, "custom-metadata-test", "Identifier should match")
+
         # Verify chapters were created
         chapters = list(parsed_book.chapters)
         self.assertEqual(len(chapters), 1)