tests for author names and metadata extraction
All checks were successful
Python CI / test (3.10) (push) Successful in 2m16s
Python CI / test (3.12) (push) Successful in 2m7s
Python CI / test (3.13) (push) Successful in 2m2s

This commit is contained in:
Duncan Tourolle 2025-11-10 13:54:36 +01:00
parent fb52178cc6
commit 303179865d

View File

@ -18,7 +18,7 @@ except ImportError:
EBOOKLIB_AVAILABLE = False EBOOKLIB_AVAILABLE = False
from pyWebLayout.io.readers.epub_reader import read_epub, EPUBReader from pyWebLayout.io.readers.epub_reader import read_epub, EPUBReader
from pyWebLayout.abstract.document import Book from pyWebLayout.abstract.document import Book, MetadataType
from pyWebLayout.abstract.block import ( from pyWebLayout.abstract.block import (
Paragraph, Heading, Quote, CodeBlock, HList, Paragraph, Heading, Quote, CodeBlock, HList,
ListStyle, Table, Image ListStyle, Table, Image
@ -523,9 +523,35 @@ class TestEPUBReader(unittest.TestCase):
# Check basic metadata # Check basic metadata
self.assertEqual(book.title, "Complex Test Book") self.assertEqual(book.title, "Complex Test Book")
# Check that metadata was set (implementation may vary) # Check author extraction
# This tests that the metadata parsing doesn't crash author = book.get_metadata(MetadataType.AUTHOR)
self.assertIsNotNone(book.title) self.assertIsNotNone(author, "Author metadata should be extracted")
self.assertEqual(author, "Test Author")
# Check language extraction
language = book.get_metadata(MetadataType.LANGUAGE)
self.assertIsNotNone(language, "Language metadata should be extracted")
self.assertEqual(language, "en")
# Check description extraction
description = book.get_metadata(MetadataType.DESCRIPTION)
self.assertIsNotNone(description, "Description should be extracted")
self.assertEqual(description, "A test book with complex content")
# Check publisher extraction
publisher = book.get_metadata(MetadataType.PUBLISHER)
self.assertIsNotNone(publisher, "Publisher should be extracted")
self.assertEqual(publisher, "Test Publisher")
# Check publication date extraction
pub_date = book.get_metadata(MetadataType.PUBLICATION_DATE)
self.assertIsNotNone(pub_date, "Publication date should be extracted")
self.assertEqual(pub_date, "2024-01-01")
# Check identifier extraction
identifier = book.get_metadata(MetadataType.IDENTIFIER)
self.assertIsNotNone(identifier, "Identifier should be extracted")
self.assertEqual(identifier, "complex-test-id-456")
def test_epub_reader_class_direct(self): def test_epub_reader_class_direct(self):
"""Test EPUBReader class directly.""" """Test EPUBReader class directly."""
@ -537,6 +563,121 @@ class TestEPUBReader(unittest.TestCase):
self.assertIsInstance(book, Book) self.assertIsInstance(book, Book)
self.assertEqual(book.title, "Test Book") self.assertEqual(book.title, "Test Book")
# Verify author and language from simple EPUB
author = book.get_metadata(MetadataType.AUTHOR)
self.assertEqual(author, "Test Author", "Author should be extracted")
language = book.get_metadata(MetadataType.LANGUAGE)
self.assertEqual(language, "en", "Language should be extracted")
def test_epub_with_different_languages(self):
    """Round-trip EPUBs declared in several languages through ``read_epub``.

    For every (title, author, language code) sample, a single-chapter
    EPUB is built with ebooklib, written under ``self.test_dir``, read
    back, and its title, language and author are compared with the
    values that were written. Each sample runs in its own ``subTest``
    so one failing language does not hide the others.
    """
    samples = (
        ("Test French Book", "François Dupont", "fr"),
        ("Test German Book", "Hans Mueller", "de"),
        ("Test Spanish Book", "Juan García", "es"),
        ("Test Japanese Book", "田中太郎", "ja"),
    )

    for expected_title, expected_author, code in samples:
        with self.subTest(language=code):
            source = epub.EpubBook()
            source.set_identifier(f'lang-test-{code}')
            source.set_title(expected_title)
            source.set_language(code)
            source.add_author(expected_author)

            page = epub.EpubHtml(
                title='Chapter', file_name='chapter.xhtml', lang=code)
            # Markup is assembled line by line; the resulting text is
            # the same document the chapter has always carried.
            page.content = (
                '\n'
                '<html xmlns="http://www.w3.org/1999/xhtml">\n'
                '<head><title>Chapter</title></head>\n'
                '<body>\n'
                '<h1>Test Chapter</h1>\n'
                f'<p>Content in {code}.</p>\n'
                '</body>\n'
                '</html>\n'
            )

            source.add_item(page)
            source.toc = (epub.Link("chapter.xhtml", "Chapter", "ch"),)
            source.add_item(epub.EpubNcx())
            source.add_item(epub.EpubNav())
            source.spine = ['nav', page]

            # The running count of generated files keeps names unique.
            file_name = f'test_lang_{code}_{len(self.epub_files)}.epub'
            target = os.path.join(self.test_dir, file_name)
            epub.write_epub(target, source, {})
            self.epub_files.append(target)

            # Read the file back and compare against what was written.
            result = read_epub(target)
            self.assertEqual(result.title, expected_title)

            found_language = result.get_metadata(MetadataType.LANGUAGE)
            self.assertEqual(
                found_language, code,
                f"Language should be {code}")

            found_author = result.get_metadata(MetadataType.AUTHOR)
            self.assertEqual(
                found_author, expected_author,
                f"Author should be {expected_author}")
def test_epub_with_minimal_metadata(self):
    """Test EPUB with minimal metadata (only title, no author/language).

    Builds a single-chapter EPUB that sets only an identifier and a
    title, writes it under ``self.test_dir``, reads it back and checks
    that the title survives and that the absent optional fields are
    reported in a sane form rather than crashing the reader.
    """
    book_obj = epub.EpubBook()

    # Set only minimal metadata - no author or language
    book_obj.set_identifier('minimal-metadata-test')
    book_obj.set_title('Minimal Metadata Book')

    # Simple chapter
    chapter = epub.EpubHtml(
        title='Chapter',
        file_name='chapter.xhtml'
    )
    chapter.content = (
        '\n'
        '<html xmlns="http://www.w3.org/1999/xhtml">\n'
        '<head><title>Chapter</title></head>\n'
        '<body>\n'
        '<h1>Chapter</h1>\n'
        '<p>Test content.</p>\n'
        '</body>\n'
        '</html>\n'
    )

    book_obj.add_item(chapter)
    book_obj.toc = (epub.Link("chapter.xhtml", "Chapter", "ch"),)
    book_obj.add_item(epub.EpubNcx())
    book_obj.add_item(epub.EpubNav())
    book_obj.spine = ['nav', chapter]

    # Write EPUB
    epub_path = os.path.join(self.test_dir,
                             f'test_minimal_{len(self.epub_files)}.epub')
    epub.write_epub(epub_path, book_obj, {})
    self.epub_files.append(epub_path)

    # Read and verify
    parsed_book = read_epub(epub_path)
    self.assertEqual(parsed_book.title, "Minimal Metadata Book")

    # No author was written, so the reader must not invent one:
    # either None or an empty value is acceptable.
    author = parsed_book.get_metadata(MetadataType.AUTHOR)
    self.assertFalse(
        author,
        f"Author should be None or empty when not provided, got: {author!r}")

    # Language handling may vary (the writer or reader may supply a
    # default), so only the shape of the value is checked: absent, or
    # a plain string like the other language tests expect.
    language = parsed_book.get_metadata(MetadataType.LANGUAGE)
    self.assertTrue(
        language is None or isinstance(language, str),
        f"Language should be None or a string, got: {language!r}")
def test_invalid_epub_handling(self): def test_invalid_epub_handling(self):
"""Test handling of invalid EPUB files.""" """Test handling of invalid EPUB files."""
# Create a non-EPUB file # Create a non-EPUB file
@ -608,6 +749,39 @@ class TestEPUBReader(unittest.TestCase):
parsed_book = read_epub(epub_path) parsed_book = read_epub(epub_path)
self.assertEqual(parsed_book.title, "Custom Metadata Test") self.assertEqual(parsed_book.title, "Custom Metadata Test")
# Verify all metadata fields are extracted correctly
# Note: When multiple authors are added with ebooklib, the behavior may vary
# The EPUB reader currently only extracts the first DC:creator element it finds
author = parsed_book.get_metadata(MetadataType.AUTHOR)
self.assertIsNotNone(author, "Author should be extracted")
# Accept either author as valid since multiple author handling may vary
self.assertTrue(
"Author" in author,
f"Author metadata should contain an author name, got: {author}")
# Verify language
language = parsed_book.get_metadata(MetadataType.LANGUAGE)
self.assertEqual(language, "en", "Language should be 'en'")
# Verify description
description = parsed_book.get_metadata(MetadataType.DESCRIPTION)
self.assertEqual(
description,
"A comprehensive test of metadata extraction",
"Description should match")
# Verify publisher
publisher = parsed_book.get_metadata(MetadataType.PUBLISHER)
self.assertEqual(publisher, "Test Publishing House", "Publisher should match")
# Verify publication date
pub_date = parsed_book.get_metadata(MetadataType.PUBLICATION_DATE)
self.assertEqual(pub_date, "2024-06-07", "Publication date should match")
# Verify identifier
identifier = parsed_book.get_metadata(MetadataType.IDENTIFIER)
self.assertEqual(identifier, "custom-metadata-test", "Identifier should match")
# Verify chapters were created # Verify chapters were created
chapters = list(parsed_book.chapters) chapters = list(parsed_book.chapters)
self.assertEqual(len(chapters), 1) self.assertEqual(len(chapters), 1)