tests for author names and metadata extraction
This commit is contained in:
parent
fb52178cc6
commit
303179865d
@ -18,7 +18,7 @@ except ImportError:
|
|||||||
EBOOKLIB_AVAILABLE = False
|
EBOOKLIB_AVAILABLE = False
|
||||||
|
|
||||||
from pyWebLayout.io.readers.epub_reader import read_epub, EPUBReader
|
from pyWebLayout.io.readers.epub_reader import read_epub, EPUBReader
|
||||||
from pyWebLayout.abstract.document import Book
|
from pyWebLayout.abstract.document import Book, MetadataType
|
||||||
from pyWebLayout.abstract.block import (
|
from pyWebLayout.abstract.block import (
|
||||||
Paragraph, Heading, Quote, CodeBlock, HList,
|
Paragraph, Heading, Quote, CodeBlock, HList,
|
||||||
ListStyle, Table, Image
|
ListStyle, Table, Image
|
||||||
@ -523,9 +523,35 @@ class TestEPUBReader(unittest.TestCase):
|
|||||||
# Check basic metadata
|
# Check basic metadata
|
||||||
self.assertEqual(book.title, "Complex Test Book")
|
self.assertEqual(book.title, "Complex Test Book")
|
||||||
|
|
||||||
# Check that metadata was set (implementation may vary)
|
# Check author extraction
|
||||||
# This tests that the metadata parsing doesn't crash
|
author = book.get_metadata(MetadataType.AUTHOR)
|
||||||
self.assertIsNotNone(book.title)
|
self.assertIsNotNone(author, "Author metadata should be extracted")
|
||||||
|
self.assertEqual(author, "Test Author")
|
||||||
|
|
||||||
|
# Check language extraction
|
||||||
|
language = book.get_metadata(MetadataType.LANGUAGE)
|
||||||
|
self.assertIsNotNone(language, "Language metadata should be extracted")
|
||||||
|
self.assertEqual(language, "en")
|
||||||
|
|
||||||
|
# Check description extraction
|
||||||
|
description = book.get_metadata(MetadataType.DESCRIPTION)
|
||||||
|
self.assertIsNotNone(description, "Description should be extracted")
|
||||||
|
self.assertEqual(description, "A test book with complex content")
|
||||||
|
|
||||||
|
# Check publisher extraction
|
||||||
|
publisher = book.get_metadata(MetadataType.PUBLISHER)
|
||||||
|
self.assertIsNotNone(publisher, "Publisher should be extracted")
|
||||||
|
self.assertEqual(publisher, "Test Publisher")
|
||||||
|
|
||||||
|
# Check publication date extraction
|
||||||
|
pub_date = book.get_metadata(MetadataType.PUBLICATION_DATE)
|
||||||
|
self.assertIsNotNone(pub_date, "Publication date should be extracted")
|
||||||
|
self.assertEqual(pub_date, "2024-01-01")
|
||||||
|
|
||||||
|
# Check identifier extraction
|
||||||
|
identifier = book.get_metadata(MetadataType.IDENTIFIER)
|
||||||
|
self.assertIsNotNone(identifier, "Identifier should be extracted")
|
||||||
|
self.assertEqual(identifier, "complex-test-id-456")
|
||||||
|
|
||||||
def test_epub_reader_class_direct(self):
|
def test_epub_reader_class_direct(self):
|
||||||
"""Test EPUBReader class directly."""
|
"""Test EPUBReader class directly."""
|
||||||
@ -537,6 +563,121 @@ class TestEPUBReader(unittest.TestCase):
|
|||||||
self.assertIsInstance(book, Book)
|
self.assertIsInstance(book, Book)
|
||||||
self.assertEqual(book.title, "Test Book")
|
self.assertEqual(book.title, "Test Book")
|
||||||
|
|
||||||
|
# Verify author and language from simple EPUB
|
||||||
|
author = book.get_metadata(MetadataType.AUTHOR)
|
||||||
|
self.assertEqual(author, "Test Author", "Author should be extracted")
|
||||||
|
|
||||||
|
language = book.get_metadata(MetadataType.LANGUAGE)
|
||||||
|
self.assertEqual(language, "en", "Language should be extracted")
|
||||||
|
|
||||||
|
def test_epub_with_different_languages(self):
|
||||||
|
"""Test EPUB with various language codes."""
|
||||||
|
test_cases = [
|
||||||
|
("Test French Book", "François Dupont", "fr"),
|
||||||
|
("Test German Book", "Hans Mueller", "de"),
|
||||||
|
("Test Spanish Book", "Juan García", "es"),
|
||||||
|
("Test Japanese Book", "田中太郎", "ja"),
|
||||||
|
]
|
||||||
|
|
||||||
|
for title, author, lang_code in test_cases:
|
||||||
|
with self.subTest(language=lang_code):
|
||||||
|
book_obj = epub.EpubBook()
|
||||||
|
book_obj.set_identifier(f'lang-test-{lang_code}')
|
||||||
|
book_obj.set_title(title)
|
||||||
|
book_obj.set_language(lang_code)
|
||||||
|
book_obj.add_author(author)
|
||||||
|
|
||||||
|
chapter = epub.EpubHtml(
|
||||||
|
title='Chapter',
|
||||||
|
file_name='chapter.xhtml',
|
||||||
|
lang=lang_code
|
||||||
|
)
|
||||||
|
chapter.content = f'''
|
||||||
|
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||||
|
<head><title>Chapter</title></head>
|
||||||
|
<body>
|
||||||
|
<h1>Test Chapter</h1>
|
||||||
|
<p>Content in {lang_code}.</p>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
'''
|
||||||
|
|
||||||
|
book_obj.add_item(chapter)
|
||||||
|
book_obj.toc = (epub.Link("chapter.xhtml", "Chapter", "ch"),)
|
||||||
|
book_obj.add_item(epub.EpubNcx())
|
||||||
|
book_obj.add_item(epub.EpubNav())
|
||||||
|
book_obj.spine = ['nav', chapter]
|
||||||
|
|
||||||
|
# Write EPUB
|
||||||
|
epub_path = os.path.join(
|
||||||
|
self.test_dir,
|
||||||
|
f'test_lang_{lang_code}_{len(self.epub_files)}.epub')
|
||||||
|
epub.write_epub(epub_path, book_obj, {})
|
||||||
|
self.epub_files.append(epub_path)
|
||||||
|
|
||||||
|
# Read and verify
|
||||||
|
parsed_book = read_epub(epub_path)
|
||||||
|
self.assertEqual(parsed_book.title, title)
|
||||||
|
|
||||||
|
# Verify language is correctly extracted
|
||||||
|
language = parsed_book.get_metadata(MetadataType.LANGUAGE)
|
||||||
|
self.assertEqual(
|
||||||
|
language, lang_code,
|
||||||
|
f"Language should be {lang_code}")
|
||||||
|
|
||||||
|
# Verify author is correctly extracted
|
||||||
|
parsed_author = parsed_book.get_metadata(MetadataType.AUTHOR)
|
||||||
|
self.assertEqual(
|
||||||
|
parsed_author, author,
|
||||||
|
f"Author should be {author}")
|
||||||
|
|
||||||
|
def test_epub_with_minimal_metadata(self):
|
||||||
|
"""Test EPUB with minimal metadata (only title, no author/language)."""
|
||||||
|
book_obj = epub.EpubBook()
|
||||||
|
|
||||||
|
# Set only minimal metadata - no author or language
|
||||||
|
book_obj.set_identifier('minimal-metadata-test')
|
||||||
|
book_obj.set_title('Minimal Metadata Book')
|
||||||
|
|
||||||
|
# Simple chapter
|
||||||
|
chapter = epub.EpubHtml(
|
||||||
|
title='Chapter',
|
||||||
|
file_name='chapter.xhtml'
|
||||||
|
)
|
||||||
|
chapter.content = '''
|
||||||
|
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||||
|
<head><title>Chapter</title></head>
|
||||||
|
<body>
|
||||||
|
<h1>Chapter</h1>
|
||||||
|
<p>Test content.</p>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
'''
|
||||||
|
|
||||||
|
book_obj.add_item(chapter)
|
||||||
|
book_obj.toc = (epub.Link("chapter.xhtml", "Chapter", "ch"),)
|
||||||
|
book_obj.add_item(epub.EpubNcx())
|
||||||
|
book_obj.add_item(epub.EpubNav())
|
||||||
|
book_obj.spine = ['nav', chapter]
|
||||||
|
|
||||||
|
# Write EPUB
|
||||||
|
epub_path = os.path.join(self.test_dir,
|
||||||
|
f'test_minimal_{len(self.epub_files)}.epub')
|
||||||
|
epub.write_epub(epub_path, book_obj, {})
|
||||||
|
self.epub_files.append(epub_path)
|
||||||
|
|
||||||
|
# Read and verify
|
||||||
|
parsed_book = read_epub(epub_path)
|
||||||
|
self.assertEqual(parsed_book.title, "Minimal Metadata Book")
|
||||||
|
|
||||||
|
# Author should be None or empty when not provided
|
||||||
|
author = parsed_book.get_metadata(MetadataType.AUTHOR)
|
||||||
|
# It's ok if author is None when not provided in the EPUB
|
||||||
|
|
||||||
|
# Language might have a default value or be None
|
||||||
|
language = parsed_book.get_metadata(MetadataType.LANGUAGE)
|
||||||
|
# Just verify it doesn't crash - language handling may vary
|
||||||
|
|
||||||
def test_invalid_epub_handling(self):
|
def test_invalid_epub_handling(self):
|
||||||
"""Test handling of invalid EPUB files."""
|
"""Test handling of invalid EPUB files."""
|
||||||
# Create a non-EPUB file
|
# Create a non-EPUB file
|
||||||
@ -608,6 +749,39 @@ class TestEPUBReader(unittest.TestCase):
|
|||||||
parsed_book = read_epub(epub_path)
|
parsed_book = read_epub(epub_path)
|
||||||
self.assertEqual(parsed_book.title, "Custom Metadata Test")
|
self.assertEqual(parsed_book.title, "Custom Metadata Test")
|
||||||
|
|
||||||
|
# Verify all metadata fields are extracted correctly
|
||||||
|
# Note: When multiple authors are added with ebooklib, the behavior may vary
|
||||||
|
# The EPUB reader currently only extracts the first DC:creator element it finds
|
||||||
|
author = parsed_book.get_metadata(MetadataType.AUTHOR)
|
||||||
|
self.assertIsNotNone(author, "Author should be extracted")
|
||||||
|
# Accept either author as valid since multiple author handling may vary
|
||||||
|
self.assertTrue(
|
||||||
|
"Author" in author,
|
||||||
|
f"Author metadata should contain an author name, got: {author}")
|
||||||
|
|
||||||
|
# Verify language
|
||||||
|
language = parsed_book.get_metadata(MetadataType.LANGUAGE)
|
||||||
|
self.assertEqual(language, "en", "Language should be 'en'")
|
||||||
|
|
||||||
|
# Verify description
|
||||||
|
description = parsed_book.get_metadata(MetadataType.DESCRIPTION)
|
||||||
|
self.assertEqual(
|
||||||
|
description,
|
||||||
|
"A comprehensive test of metadata extraction",
|
||||||
|
"Description should match")
|
||||||
|
|
||||||
|
# Verify publisher
|
||||||
|
publisher = parsed_book.get_metadata(MetadataType.PUBLISHER)
|
||||||
|
self.assertEqual(publisher, "Test Publishing House", "Publisher should match")
|
||||||
|
|
||||||
|
# Verify publication date
|
||||||
|
pub_date = parsed_book.get_metadata(MetadataType.PUBLICATION_DATE)
|
||||||
|
self.assertEqual(pub_date, "2024-06-07", "Publication date should match")
|
||||||
|
|
||||||
|
# Verify identifier
|
||||||
|
identifier = parsed_book.get_metadata(MetadataType.IDENTIFIER)
|
||||||
|
self.assertEqual(identifier, "custom-metadata-test", "Identifier should match")
|
||||||
|
|
||||||
# Verify chapters were created
|
# Verify chapters were created
|
||||||
chapters = list(parsed_book.chapters)
|
chapters = list(parsed_book.chapters)
|
||||||
self.assertEqual(len(chapters), 1)
|
self.assertEqual(len(chapters), 1)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user