pyWebLayout/tests/test_epub_reader.py
Duncan Tourolle 952b7d4394
All checks were successful
Python CI / test (push) Successful in 51s
tests for epub reader
2025-06-07 18:52:15 +02:00

757 lines
27 KiB
Python

"""
Unit tests for EPUB reader functionality.
Tests the EPUB parsing and conversion to pyWebLayout abstract elements,
using ebooklib to generate test EPUB files.
"""
import unittest
import tempfile
import os
import shutil
from datetime import datetime
# Import ebooklib for creating test EPUB files
try:
from ebooklib import epub
EBOOKLIB_AVAILABLE = True
except ImportError:
EBOOKLIB_AVAILABLE = False
from pyWebLayout.io.readers.epub_reader import read_epub, EPUBReader
from pyWebLayout.abstract.document import Book
from pyWebLayout.abstract.block import (
Paragraph, Heading, HeadingLevel, Quote, CodeBlock,
HList, ListStyle, Table, HorizontalRule, Image
)
from pyWebLayout.style import FontWeight, FontStyle, TextDecoration
@unittest.skipUnless(EBOOKLIB_AVAILABLE, "ebooklib not available")
class TestEPUBReader(unittest.TestCase):
    """Test cases for EPUB reader functionality.

    Every test builds a throw-away EPUB with ebooklib inside a per-test
    scratch directory, parses it with ``read_epub`` / ``EPUBReader`` and
    inspects the resulting ``Book``.
    """

    def setUp(self):
        """Create the scratch directory and the registry of written EPUBs."""
        self.test_dir = tempfile.mkdtemp()
        self.epub_files = []

    def tearDown(self):
        """Remove the generated EPUB files and the scratch directory."""
        # Best-effort removal: a file that is already gone must not turn a
        # passing test into a failing one.
        for epub_file in self.epub_files:
            try:
                os.remove(epub_file)
            except OSError:
                pass
        if os.path.exists(self.test_dir):
            shutil.rmtree(self.test_dir, ignore_errors=True)

    # ------------------------------------------------------------------
    # Fixture helpers
    # ------------------------------------------------------------------

    def _make_chapter(self, title, file_name, content):
        """Return an English ``EpubHtml`` chapter carrying *content*."""
        chapter = epub.EpubHtml(title=title, file_name=file_name, lang='en')
        chapter.content = content
        return chapter

    def _write_epub(self, book, chapters, toc, prefix):
        """Finish *book*, write it to the scratch directory, return its path.

        Adds *chapters* to the book, installs *toc*, adds the NCX and nav
        items, sets the spine to ``'nav'`` followed by *chapters*, and writes
        the result to ``<prefix>_<n>.epub`` where ``n`` is the number of
        EPUBs this test has written so far. The path is appended to
        ``self.epub_files`` so ``tearDown`` removes it.
        """
        for chapter in chapters:
            book.add_item(chapter)
        book.toc = tuple(toc)
        book.add_item(epub.EpubNcx())
        book.add_item(epub.EpubNav())
        book.spine = ['nav', *chapters]
        epub_path = os.path.join(
            self.test_dir, f'{prefix}_{len(self.epub_files)}.epub'
        )
        epub.write_epub(epub_path, book, {})
        self.epub_files.append(epub_path)
        return epub_path

    def create_simple_epub(self, title="Test Book", author="Test Author"):
        """Create a single-chapter EPUB and return its path."""
        book = epub.EpubBook()
        book.set_identifier('test-id-123')
        book.set_title(title)
        book.set_language('en')
        book.add_author(author)

        chapter1 = self._make_chapter('Chapter 1', 'chapter1.xhtml', '''
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Chapter 1</title></head>
<body>
<h1>Chapter One</h1>
<p>This is the first paragraph of the first chapter.</p>
<p>This is a <strong>second paragraph</strong> with <em>some formatting</em>.</p>
</body>
</html>
''')

        return self._write_epub(
            book,
            [chapter1],
            [epub.Link("chapter1.xhtml", "Chapter 1", "ch1")],
            'test_simple',
        )

    def create_complex_epub(self):
        """Create a four-chapter EPUB covering several content types.

        Chapter order: basic paragraphs, inline styling, lists and a quote,
        a table and code. Returns the path of the written file.
        """
        book = epub.EpubBook()
        book.set_identifier('complex-test-id-456')
        book.set_title('Complex Test Book')
        book.set_language('en')
        book.add_author('Test Author')
        book.add_metadata('DC', 'description', 'A test book with complex content')
        book.add_metadata('DC', 'subject', 'Testing')
        book.add_metadata('DC', 'date', '2024-01-01')
        book.add_metadata('DC', 'publisher', 'Test Publisher')

        # Chapter 1: basic content
        chapter1 = self._make_chapter('Introduction', 'chapter1.xhtml', '''
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Introduction</title></head>
<body>
<h1>Introduction</h1>
<p>Welcome to this <strong>complex test book</strong>.</p>
<p>This chapter contains basic content to test paragraph parsing.</p>
</body>
</html>
''')

        # Chapter 2: styled content
        chapter2 = self._make_chapter('Styled Content', 'chapter2.xhtml', '''
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Styled Content</title></head>
<body>
<h1>Styled Content</h1>
<p>This chapter contains various <strong>bold text</strong>, <em>italic text</em>,
and <span style="color: red; font-weight: bold;">colored text</span>.</p>
<h2>Subsection</h2>
<p>Text with <u>underline</u> and <s>strikethrough</s>.</p>
<h3>More Formatting</h3>
<p>Nested formatting: <strong>bold with <em>italic inside</em></strong>.</p>
</body>
</html>
''')

        # Chapter 3: lists and quotes
        chapter3 = self._make_chapter('Lists and Quotes', 'chapter3.xhtml', '''
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Lists and Quotes</title></head>
<body>
<h1>Lists and Quotes</h1>
<h2>Unordered List</h2>
<ul>
<li>First item</li>
<li><strong>Bold item</strong></li>
<li>Item with <em>italic text</em></li>
</ul>
<h2>Ordered List</h2>
<ol>
<li>First numbered item</li>
<li>Second numbered item</li>
<li>Third numbered item</li>
</ol>
<h2>Quote</h2>
<blockquote>
<p>This is a <span style="font-style: italic;">quoted paragraph</span>
with some styling.</p>
</blockquote>
</body>
</html>
''')

        # Chapter 4: tables and code
        chapter4 = self._make_chapter('Tables and Code', 'chapter4.xhtml', '''
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Tables and Code</title></head>
<body>
<h1>Tables and Code</h1>
<h2>Simple Table</h2>
<table>
<thead>
<tr>
<th><strong>Header 1</strong></th>
<th><em>Header 2</em></th>
</tr>
</thead>
<tbody>
<tr>
<td>Cell 1</td>
<td>Cell 2 with <span style="color: blue;">blue text</span></td>
</tr>
<tr>
<td><strong>Bold cell</strong></td>
<td>Normal cell</td>
</tr>
</tbody>
</table>
<h2>Code Block</h2>
<pre><code>function test() {
console.log("Hello, world!");
return true;
}</code></pre>
<h2>Inline Code</h2>
<p>Use the <code>print()</code> function to output text.</p>
</body>
</html>
''')

        return self._write_epub(
            book,
            [chapter1, chapter2, chapter3, chapter4],
            [
                epub.Link("chapter1.xhtml", "Introduction", "intro"),
                epub.Link("chapter2.xhtml", "Styled Content", "styled"),
                epub.Link("chapter3.xhtml", "Lists and Quotes", "lists"),
                epub.Link("chapter4.xhtml", "Tables and Code", "tables"),
            ],
            'test_complex',
        )

    def create_epub_with_nested_content(self):
        """Create a single-chapter EPUB whose blocks sit inside nested containers."""
        book = epub.EpubBook()
        book.set_identifier('nested-test-id-789')
        book.set_title('Nested Content Test')
        book.set_language('en')
        book.add_author('Test Author')

        chapter = self._make_chapter('Nested Content', 'nested.xhtml', '''
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Nested Content</title></head>
<body>
<h1>Nested Content Examples</h1>
<div>
<h2>Section in Div</h2>
<p>Paragraph inside div.</p>
<section>
<h3>Subsection</h3>
<article>
<h4>Article Header</h4>
<p>Article content with <strong>nested <em>formatting</em></strong>.</p>
<aside>
<p>Sidebar content in aside element.</p>
<ul>
<li>Nested list item</li>
<li>Another <strong>bold</strong> item</li>
</ul>
</aside>
</article>
</section>
</div>
<footer>
<p>Footer content with <span style="font-size: 12px; color: gray;">small gray text</span>.</p>
</footer>
</body>
</html>
''')

        return self._write_epub(
            book,
            [chapter],
            [epub.Link("nested.xhtml", "Nested Content", "nested")],
            'test_nested',
        )

    # ------------------------------------------------------------------
    # Tests
    # ------------------------------------------------------------------

    def test_simple_epub_reading(self):
        """Test reading a simple EPUB file."""
        book = read_epub(self.create_simple_epub())

        self.assertIsInstance(book, Book)
        self.assertEqual(book.title, "Test Book")

        chapters = list(book.chapters)
        self.assertEqual(len(chapters), 1)

        blocks = list(chapters[0].blocks)
        self.assertGreater(len(blocks), 0)

        # The chapter was written with one <h1> and two <p> elements.
        has_heading = any(isinstance(block, Heading) for block in blocks)
        has_paragraph = any(isinstance(block, Paragraph) for block in blocks)
        self.assertTrue(has_heading, "Should contain at least one heading")
        self.assertTrue(has_paragraph, "Should contain at least one paragraph")

    def test_complex_epub_reading(self):
        """Test reading a complex EPUB file with multiple chapters."""
        book = read_epub(self.create_complex_epub())

        self.assertIsInstance(book, Book)
        self.assertEqual(book.title, "Complex Test Book")

        chapters = list(book.chapters)
        self.assertEqual(len(chapters), 4)

        for number, chapter in enumerate(chapters, start=1):
            blocks = list(chapter.blocks)
            self.assertGreater(len(blocks), 0, f"Chapter {number} should have blocks")
            # Every fixture chapter body opens with an <h1>.
            self.assertIsInstance(
                blocks[0], Heading, f"Chapter {number} should start with heading"
            )

    def test_epub_styled_content(self):
        """Test that styled content in EPUB is properly parsed."""
        book = read_epub(self.create_complex_epub())
        chapters = list(book.chapters)

        # Fail loudly instead of silently skipping every check when the
        # styled chapter (index 1) is missing.
        self.assertGreater(len(chapters), 1, "Complex EPUB should have a second chapter")

        styled_words_found = any(
            word.style.weight == FontWeight.BOLD
            or word.style.style == FontStyle.ITALIC
            or word.style.colour != (0, 0, 0)  # Non-black color
            for block in chapters[1].blocks
            if isinstance(block, Paragraph)
            for _, word in block.words()
        )
        self.assertTrue(styled_words_found, "Should find styled words in chapter 2")

    def test_epub_lists(self):
        """Test that lists in EPUB are properly parsed."""
        book = read_epub(self.create_complex_epub())
        chapters = list(book.chapters)

        # Fail loudly instead of silently skipping every check when the
        # lists chapter (index 2) is missing.
        self.assertGreater(len(chapters), 2, "Complex EPUB should have a third chapter")

        unordered_list_found = False
        ordered_list_found = False
        quote_found = False
        for block in chapters[2].blocks:
            if isinstance(block, HList):
                if block.style == ListStyle.UNORDERED:
                    unordered_list_found = True
                    items = list(block.items())
                    self.assertGreater(len(items), 0, "Unordered list should have items")
                elif block.style == ListStyle.ORDERED:
                    ordered_list_found = True
                    items = list(block.items())
                    self.assertGreater(len(items), 0, "Ordered list should have items")
            elif isinstance(block, Quote):
                quote_found = True

        self.assertTrue(unordered_list_found, "Should find unordered list in chapter 3")
        self.assertTrue(ordered_list_found, "Should find ordered list in chapter 3")
        self.assertTrue(quote_found, "Should find quote in chapter 3")

    def test_epub_tables(self):
        """Test that tables and code blocks in EPUB are properly parsed."""
        book = read_epub(self.create_complex_epub())
        chapters = list(book.chapters)

        # Fail loudly instead of silently skipping every check when the
        # tables chapter (index 3) is missing.
        self.assertGreater(len(chapters), 3, "Complex EPUB should have a fourth chapter")

        table_found = False
        code_block_found = False
        for block in chapters[3].blocks:
            if isinstance(block, Table):
                table_found = True
                rows = list(block.all_rows())
                self.assertGreater(len(rows), 0, "Table should have rows")
            elif isinstance(block, CodeBlock):
                code_block_found = True
                lines = list(block.lines())
                self.assertGreater(len(lines), 0, "Code block should have lines")

        self.assertTrue(table_found, "Should find table in chapter 4")
        self.assertTrue(code_block_found, "Should find code block in chapter 4")

    def test_epub_nested_content(self):
        """Test that nested content structures are properly parsed."""
        book = read_epub(self.create_epub_with_nested_content())

        chapters = list(book.chapters)
        self.assertEqual(len(chapters), 1)

        chapter_blocks = list(chapters[0].blocks)
        self.assertGreater(len(chapter_blocks), 0)

        # The fixture contains h1..h4 spread over nested containers.
        headings = [block for block in chapter_blocks if isinstance(block, Heading)]
        self.assertGreater(len(headings), 2, "Should have multiple headings from nested content")

        paragraphs = [block for block in chapter_blocks if isinstance(block, Paragraph)]
        lists = [block for block in chapter_blocks if isinstance(block, HList)]
        self.assertGreater(len(paragraphs), 0, "Should have paragraphs from nested content")
        self.assertGreater(len(lists), 0, "Should have lists from nested content")

    def test_epub_metadata_extraction(self):
        """Test that EPUB metadata is properly extracted."""
        book = read_epub(self.create_complex_epub())

        # Only the title is checked; reading a book that carries additional
        # DC metadata must at least not crash.
        self.assertEqual(book.title, "Complex Test Book")

    def test_epub_reader_class_direct(self):
        """Test EPUBReader class directly."""
        reader = EPUBReader(self.create_simple_epub())
        book = reader.read()

        self.assertIsInstance(book, Book)
        self.assertEqual(book.title, "Test Book")

    def test_invalid_epub_handling(self):
        """Test handling of invalid EPUB files."""
        # A plain-text file with an .epub extension.
        invalid_path = os.path.join(self.test_dir, 'invalid.epub')
        with open(invalid_path, 'w', encoding='utf-8') as f:
            f.write("This is not an EPUB file")

        # NOTE(review): the concrete exception type raised by read_epub is not
        # visible from this file, so the broad check is kept - narrow it once
        # the reader's error contract is confirmed.
        with self.assertRaises(Exception):
            read_epub(invalid_path)

    def test_nonexistent_epub_handling(self):
        """Test handling of nonexistent EPUB files."""
        nonexistent_path = os.path.join(self.test_dir, 'nonexistent.epub')

        # NOTE(review): broad on purpose, see test_invalid_epub_handling.
        with self.assertRaises(Exception):
            read_epub(nonexistent_path)

    def test_epub_with_custom_metadata(self):
        """Test EPUB with various metadata fields."""
        book = epub.EpubBook()
        book.set_identifier('custom-metadata-test')
        book.set_title('Custom Metadata Test')
        book.set_language('en')
        book.add_author('Primary Author')
        book.add_author('Secondary Author')
        book.add_metadata('DC', 'description', 'A comprehensive test of metadata extraction')
        book.add_metadata('DC', 'subject', 'Testing')
        book.add_metadata('DC', 'subject', 'EPUB')
        book.add_metadata('DC', 'date', '2024-06-07')
        book.add_metadata('DC', 'publisher', 'Test Publishing House')
        book.add_metadata('DC', 'rights', 'Public Domain')

        chapter = self._make_chapter('Metadata Test', 'metadata.xhtml', '''
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Metadata Test</title></head>
<body>
<h1>Metadata Test Chapter</h1>
<p>This chapter tests metadata extraction.</p>
</body>
</html>
''')

        epub_path = self._write_epub(
            book,
            [chapter],
            [epub.Link("metadata.xhtml", "Metadata Test", "meta")],
            'test_metadata',
        )

        parsed_book = read_epub(epub_path)
        self.assertEqual(parsed_book.title, "Custom Metadata Test")

        chapters = list(parsed_book.chapters)
        self.assertEqual(len(chapters), 1)
class TestEPUBIntegrationWithHTMLExtraction(unittest.TestCase):
    """Test cases that specifically verify EPUB reader uses html_extraction properly."""

    def setUp(self):
        """Create the scratch directory and the registry of written EPUBs."""
        self.test_dir = tempfile.mkdtemp()
        self.epub_files = []

    def tearDown(self):
        """Remove the generated EPUB files and the scratch directory."""
        # Best-effort removal: a missing file must not fail the test.
        for epub_file in self.epub_files:
            try:
                os.remove(epub_file)
            except OSError:
                pass
        if os.path.exists(self.test_dir):
            shutil.rmtree(self.test_dir, ignore_errors=True)

    @staticmethod
    def _paragraph_words(blocks):
        """Yield every word of every ``Paragraph`` found in *blocks*."""
        for block in blocks:
            if isinstance(block, Paragraph):
                for _, word in block.words():
                    yield word

    @unittest.skipUnless(EBOOKLIB_AVAILABLE, "ebooklib not available")
    def test_html_extraction_integration(self):
        """Test that EPUB reader properly uses html_extraction functionality."""
        # One chapter that touches headings, inline styles, a list, a table,
        # a blockquote, a code block and several colour notations.
        book = epub.EpubBook()
        book.set_identifier('html-extraction-test')
        book.set_title('HTML Extraction Test')
        book.set_language('en')
        book.add_author('Test Author')

        chapter = epub.EpubHtml(
            title='HTML Features',
            file_name='html_features.xhtml',
            lang='en',
        )
        chapter.content = '''
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>HTML Features</title></head>
<body>
<h1>HTML Extraction Test</h1>
<!-- Test paragraph with inline formatting -->
<p>This paragraph has <strong>bold</strong>, <em>italic</em>,
<u>underlined</u>, and <span style="color: #ff0000; font-weight: bold;">styled</span> text.</p>
<!-- Test headings -->
<h2>Second Level Heading</h2>
<h3>Third Level Heading</h3>
<!-- Test lists with styled content -->
<ul>
<li>Plain list item</li>
<li><strong>Bold list item</strong></li>
<li>List item with <em>italic text</em></li>
</ul>
<!-- Test table with styled cells -->
<table>
<tr>
<th style="font-weight: bold;">Header</th>
<th>Value</th>
</tr>
<tr>
<td><span style="color: blue;">Blue text</span></td>
<td>Normal text</td>
</tr>
</table>
<!-- Test blockquote -->
<blockquote>
<p>This is a quoted paragraph with <strong>bold text</strong>.</p>
</blockquote>
<!-- Test code block -->
<pre><code>def test_function():
return "Hello, World!"</code></pre>
<!-- Test nested formatting -->
<p>Nested formatting: <strong>bold with <em>italic nested</em> inside</strong>.</p>
<!-- Test color variations -->
<p>
<span style="color: red;">Red text</span>,
<span style="color: #00ff00;">Green hex</span>,
<span style="color: blue; text-decoration: underline;">Blue underlined</span>.
</p>
</body>
</html>
'''
        book.add_item(chapter)
        book.toc = (epub.Link("html_features.xhtml", "HTML Features", "html"),)
        book.add_item(epub.EpubNcx())
        book.add_item(epub.EpubNav())
        book.spine = ['nav', chapter]

        epub_path = os.path.join(self.test_dir, 'html_extraction_test.epub')
        epub.write_epub(epub_path, book, {})
        self.epub_files.append(epub_path)

        parsed_book = read_epub(epub_path)
        chapters = list(parsed_book.chapters)
        self.assertEqual(len(chapters), 1)

        blocks = list(chapters[0].blocks)
        self.assertGreater(len(blocks), 5)  # Should have multiple blocks

        # Every block kind present in the fixture must show up in the result.
        block_types = [type(block).__name__ for block in blocks]
        self.assertIn('Heading', block_types, "Should have heading blocks")
        self.assertIn('Paragraph', block_types, "Should have paragraph blocks")
        self.assertIn('HList', block_types, "Should have list blocks")
        self.assertIn('Table', block_types, "Should have table blocks")
        self.assertIn('Quote', block_types, "Should have quote blocks")
        self.assertIn('CodeBlock', block_types, "Should have code blocks")

        # Inline styling must survive: at least one paragraph word is bold,
        # italic, underlined or non-black.
        styled_content_found = any(
            word.style.weight == FontWeight.BOLD
            or word.style.style == FontStyle.ITALIC
            or word.style.decoration == TextDecoration.UNDERLINE
            or word.style.colour != (0, 0, 0)
            for word in self._paragraph_words(blocks)
        )
        self.assertTrue(styled_content_found, "Should find styled content in parsed blocks")

        # Colour parsing: only paragraph words are inspected, and at least one
        # of pure red / green / blue is expected (depending on implementation).
        # A tuple (not a set) keeps the comparison a plain ``==``.
        primary_colours = ((255, 0, 0), (0, 255, 0), (0, 0, 255))
        color_found = any(
            word.style.colour in primary_colours
            for word in self._paragraph_words(blocks)
        )
        self.assertTrue(color_found, "Should find at least one colored text")
# Run the test suite when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()