pyWebLayout/tests/io_tests/test_epub_reader.py
Duncan Tourolle 303179865d
All checks were successful
Python CI / test (3.10) (push) Successful in 2m16s
Python CI / test (3.12) (push) Successful in 2m7s
Python CI / test (3.13) (push) Successful in 2m2s
tests for author names and metadata extraction
2025-11-10 13:54:36 +01:00

1042 lines
37 KiB
Python

"""
Unit tests for EPUB reader functionality.
Tests the EPUB parsing and conversion to pyWebLayout abstract elements,
using ebooklib to generate test EPUB files.
"""
import unittest
import tempfile
import os
import shutil
# Import ebooklib for creating test EPUB files
try:
from ebooklib import epub
EBOOKLIB_AVAILABLE = True
except ImportError:
EBOOKLIB_AVAILABLE = False
from pyWebLayout.io.readers.epub_reader import read_epub, EPUBReader
from pyWebLayout.abstract.document import Book, MetadataType
from pyWebLayout.abstract.block import (
Paragraph, Heading, Quote, CodeBlock, HList,
ListStyle, Table, Image
)
from pyWebLayout.style import FontWeight, FontStyle, TextDecoration
@unittest.skipUnless(EBOOKLIB_AVAILABLE, "ebooklib not available")
class TestEPUBReader(unittest.TestCase):
    """Test cases for EPUB reader functionality.

    Every test builds a throw-away EPUB with ebooklib inside a per-test
    temporary directory, parses it with ``read_epub`` / ``EPUBReader`` and
    checks the resulting ``Book`` object.
    """

    def setUp(self):
        """Create the scratch directory and the registry of written EPUBs."""
        self.test_dir = tempfile.mkdtemp()
        self.epub_files = []

    def tearDown(self):
        """Delete every generated EPUB file, then the scratch directory."""
        for epub_file in self.epub_files:
            try:
                os.remove(epub_file)
            except OSError:
                # Best effort: the file may never have been written.
                pass
        # ignore_errors=True also covers an already-missing directory.
        shutil.rmtree(self.test_dir, ignore_errors=True)

    # ------------------------------------------------------------------
    # Fixture helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _make_chapter(title, file_name, content, lang='en'):
        """Return an ``epub.EpubHtml`` chapter whose body is *content*."""
        chapter = epub.EpubHtml(title=title, file_name=file_name, lang=lang)
        chapter.content = content
        return chapter

    def _write_book(self, book, chapters, toc, stem):
        """Attach chapters and navigation to *book*, write it, return its path.

        Args:
            book: ``epub.EpubBook`` whose metadata has already been set.
            chapters: chapter items in reading order.
            toc: ``(href, title, uid)`` triples, one per table-of-contents link.
            stem: file-name prefix. The number of EPUBs this test has written
                so far is appended, so repeated calls never collide.

        Returns:
            Path of the EPUB file written inside ``self.test_dir``.
        """
        for chapter in chapters:
            book.add_item(chapter)
        book.toc = tuple(epub.Link(href, title, uid) for href, title, uid in toc)
        # Navigation files must be present for the 'nav' spine entry below.
        book.add_item(epub.EpubNcx())
        book.add_item(epub.EpubNav())
        book.spine = ['nav', *chapters]

        epub_path = os.path.join(
            self.test_dir, f'{stem}_{len(self.epub_files)}.epub')
        epub.write_epub(epub_path, book, {})
        self.epub_files.append(epub_path)
        return epub_path

    def create_simple_epub(self, title="Test Book", author="Test Author"):
        """Create a one-chapter EPUB and return its path.

        Args:
            title: book title stored in the EPUB metadata.
            author: author name stored in the EPUB metadata.
        """
        book = epub.EpubBook()
        book.set_identifier('test-id-123')
        book.set_title(title)
        book.set_language('en')
        book.add_author(author)

        chapter_html = '''
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Chapter 1</title></head>
<body>
<h1>Chapter One</h1>
<p>This is the first paragraph of the first chapter.</p>
<p>This is a <strong>second paragraph</strong> with <em>some formatting</em>.</p>
</body>
</html>
'''
        chapter1 = self._make_chapter('Chapter 1', 'chapter1.xhtml', chapter_html)
        return self._write_book(
            book,
            [chapter1],
            [("chapter1.xhtml", "Chapter 1", "ch1")],
            'test_simple')

    def create_complex_epub(self):
        """Create a four-chapter EPUB covering many content types.

        The chapters contain, in order: plain paragraphs, inline styling,
        lists plus a block quote, and a table plus code. Returns the path of
        the written file.
        """
        book = epub.EpubBook()
        book.set_identifier('complex-test-id-456')
        book.set_title('Complex Test Book')
        book.set_language('en')
        book.add_author('Test Author')
        book.add_metadata('DC', 'description', 'A test book with complex content')
        book.add_metadata('DC', 'subject', 'Testing')
        book.add_metadata('DC', 'date', '2024-01-01')
        book.add_metadata('DC', 'publisher', 'Test Publisher')

        # Chapter 1: basic content
        intro_html = '''
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Introduction</title></head>
<body>
<h1>Introduction</h1>
<p>Welcome to this <strong>complex test book</strong>.</p>
<p>This chapter contains basic content to test paragraph parsing.</p>
</body>
</html>
'''
        # Chapter 2: styled content
        styled_html = '''
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Styled Content</title></head>
<body>
<h1>Styled Content</h1>
<p>This chapter contains various <strong>bold text</strong>, <em>italic text</em>,
and <span style="color: red; font-weight: bold;">colored text</span>.</p>
<h2>Subsection</h2>
<p>Text with <u>underline</u> and <s>strikethrough</s>.</p>
<h3>More Formatting</h3>
<p>Nested formatting: <strong>bold with <em>italic inside</em></strong>.</p>
</body>
</html>
'''
        # Chapter 3: lists and quotes
        lists_html = '''
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Lists and Quotes</title></head>
<body>
<h1>Lists and Quotes</h1>
<h2>Unordered List</h2>
<ul>
<li>First item</li>
<li><strong>Bold item</strong></li>
<li>Item with <em>italic text</em></li>
</ul>
<h2>Ordered List</h2>
<ol>
<li>First numbered item</li>
<li>Second numbered item</li>
<li>Third numbered item</li>
</ol>
<h2>Quote</h2>
<blockquote>
<p>This is a <span style="font-style: italic;">quoted paragraph</span>
with some styling.</p>
</blockquote>
</body>
</html>
'''
        # Chapter 4: tables and code
        tables_html = '''
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Tables and Code</title></head>
<body>
<h1>Tables and Code</h1>
<h2>Simple Table</h2>
<table>
<thead>
<tr>
<th><strong>Header 1</strong></th>
<th><em>Header 2</em></th>
</tr>
</thead>
<tbody>
<tr>
<td>Cell 1</td>
<td>Cell 2 with <span style="color: blue;">blue text</span></td>
</tr>
<tr>
<td><strong>Bold cell</strong></td>
<td>Normal cell</td>
</tr>
</tbody>
</table>
<h2>Code Block</h2>
<pre><code>function test() {
console.log("Hello, world!");
return true;
}</code></pre>
<h2>Inline Code</h2>
<p>Use the <code>print()</code> function to output text.</p>
</body>
</html>
'''
        chapters = [
            self._make_chapter('Introduction', 'chapter1.xhtml', intro_html),
            self._make_chapter('Styled Content', 'chapter2.xhtml', styled_html),
            self._make_chapter('Lists and Quotes', 'chapter3.xhtml', lists_html),
            self._make_chapter('Tables and Code', 'chapter4.xhtml', tables_html),
        ]
        toc = [
            ("chapter1.xhtml", "Introduction", "intro"),
            ("chapter2.xhtml", "Styled Content", "styled"),
            ("chapter3.xhtml", "Lists and Quotes", "lists"),
            ("chapter4.xhtml", "Tables and Code", "tables"),
        ]
        return self._write_book(book, chapters, toc, 'test_complex')

    def create_epub_with_nested_content(self):
        """Create a one-chapter EPUB with deeply nested container elements.

        Returns the path of the written file.
        """
        book = epub.EpubBook()
        book.set_identifier('nested-test-id-789')
        book.set_title('Nested Content Test')
        book.set_language('en')
        book.add_author('Test Author')

        nested_html = '''
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Nested Content</title></head>
<body>
<h1>Nested Content Examples</h1>
<div>
<h2>Section in Div</h2>
<p>Paragraph inside div.</p>
<section>
<h3>Subsection</h3>
<article>
<h4>Article Header</h4>
<p>Article content with <strong>nested <em>formatting</em></strong>.</p>
<aside>
<p>Sidebar content in aside element.</p>
<ul>
<li>Nested list item</li>
<li>Another <strong>bold</strong> item</li>
</ul>
</aside>
</article>
</section>
</div>
<footer>
<p>Footer content with <span style="font-size: 12px; color: gray;">small gray text</span>.</p>
</footer>
</body>
</html>
'''
        chapter = self._make_chapter('Nested Content', 'nested.xhtml', nested_html)
        return self._write_book(
            book,
            [chapter],
            [("nested.xhtml", "Nested Content", "nested")],
            'test_nested')

    # ------------------------------------------------------------------
    # Tests
    # ------------------------------------------------------------------

    def test_simple_epub_reading(self):
        """Test reading a simple EPUB file."""
        book = read_epub(self.create_simple_epub())

        self.assertIsInstance(book, Book)
        self.assertEqual(book.title, "Test Book")

        chapters = list(book.chapters)
        self.assertEqual(len(chapters), 1)

        blocks = list(chapters[0].blocks)
        self.assertGreater(len(blocks), 0)

        # The single chapter holds one <h1> and two <p> elements.
        self.assertTrue(
            any(isinstance(block, Heading) for block in blocks),
            "Should contain at least one heading")
        self.assertTrue(
            any(isinstance(block, Paragraph) for block in blocks),
            "Should contain at least one paragraph")

    def test_complex_epub_reading(self):
        """Test reading a complex EPUB file with multiple chapters."""
        book = read_epub(self.create_complex_epub())

        self.assertIsInstance(book, Book)
        self.assertEqual(book.title, "Complex Test Book")

        chapters = list(book.chapters)
        self.assertEqual(len(chapters), 4)

        for number, chapter in enumerate(chapters, start=1):
            blocks = list(chapter.blocks)
            self.assertGreater(
                len(blocks), 0, f"Chapter {number} should have blocks")
            # Every fixture chapter opens with an <h1>.
            self.assertIsInstance(
                blocks[0], Heading, f"Chapter {number} should start with heading")

    def test_epub_styled_content(self):
        """Test that styled content in EPUB is properly parsed."""
        book = read_epub(self.create_complex_epub())
        chapters = list(book.chapters)

        # Fail loudly rather than silently skipping the checks when the
        # reader returns fewer chapters than the fixture contains.
        self.assertGreater(
            len(chapters), 1, "Complex EPUB should provide chapter 2")

        # Chapter 2 (index 1) carries bold, italic and coloured text.
        styled_words_found = any(
            word.style.weight == FontWeight.BOLD
            or word.style.style == FontStyle.ITALIC
            or word.style.colour != (0, 0, 0)  # non-black colour
            for block in chapters[1].blocks
            if isinstance(block, Paragraph)
            for _, word in block.words_iter()
        )
        self.assertTrue(styled_words_found, "Should find styled words in chapter 2")

    def test_epub_lists(self):
        """Test that lists in EPUB are properly parsed."""
        book = read_epub(self.create_complex_epub())
        chapters = list(book.chapters)

        # Fail loudly rather than silently skipping the checks when the
        # reader returns fewer chapters than the fixture contains.
        self.assertGreater(
            len(chapters), 2, "Complex EPUB should provide chapter 3")

        unordered_list_found = False
        ordered_list_found = False
        quote_found = False

        # Chapter 3 (index 2) holds one <ul>, one <ol> and one <blockquote>.
        for block in chapters[2].blocks:
            if isinstance(block, HList):
                if block.style == ListStyle.UNORDERED:
                    unordered_list_found = True
                    self.assertGreater(
                        len(list(block.items())), 0,
                        "Unordered list should have items")
                elif block.style == ListStyle.ORDERED:
                    ordered_list_found = True
                    self.assertGreater(
                        len(list(block.items())), 0,
                        "Ordered list should have items")
            elif isinstance(block, Quote):
                quote_found = True

        self.assertTrue(
            unordered_list_found, "Should find unordered list in chapter 3")
        self.assertTrue(ordered_list_found, "Should find ordered list in chapter 3")
        self.assertTrue(quote_found, "Should find quote in chapter 3")

    def test_epub_tables(self):
        """Test that tables in EPUB are properly parsed."""
        book = read_epub(self.create_complex_epub())
        chapters = list(book.chapters)

        # Fail loudly rather than silently skipping the checks when the
        # reader returns fewer chapters than the fixture contains.
        self.assertGreater(
            len(chapters), 3, "Complex EPUB should provide chapter 4")

        table_found = False
        code_block_found = False

        # Chapter 4 (index 3) holds one <table> and one <pre><code> block.
        for block in chapters[3].blocks:
            if isinstance(block, Table):
                table_found = True
                self.assertGreater(
                    len(list(block.all_rows())), 0, "Table should have rows")
            elif isinstance(block, CodeBlock):
                code_block_found = True
                self.assertGreater(
                    len(list(block.lines())), 0, "Code block should have lines")

        self.assertTrue(table_found, "Should find table in chapter 4")
        self.assertTrue(code_block_found, "Should find code block in chapter 4")

    def test_epub_nested_content(self):
        """Test that nested content structures are properly parsed."""
        book = read_epub(self.create_epub_with_nested_content())

        chapters = list(book.chapters)
        self.assertEqual(len(chapters), 1)

        chapter_blocks = list(chapters[0].blocks)
        self.assertGreater(len(chapter_blocks), 0)

        headings = [b for b in chapter_blocks if isinstance(b, Heading)]
        paragraphs = [b for b in chapter_blocks if isinstance(b, Paragraph)]
        lists = [b for b in chapter_blocks if isinstance(b, HList)]

        # The fixture nests h1..h4 inside div/section/article containers.
        self.assertGreater(
            len(headings), 2,
            "Should have multiple headings from nested content")
        self.assertGreater(
            len(paragraphs), 0,
            "Should have paragraphs from nested content")
        self.assertGreater(len(lists), 0, "Should have lists from nested content")

    def test_epub_metadata_extraction(self):
        """Test that EPUB metadata is properly extracted."""
        book = read_epub(self.create_complex_epub())

        self.assertEqual(book.title, "Complex Test Book")

        # (metadata key, expected value, message when the value is missing)
        expectations = [
            (MetadataType.AUTHOR, "Test Author",
             "Author metadata should be extracted"),
            (MetadataType.LANGUAGE, "en",
             "Language metadata should be extracted"),
            (MetadataType.DESCRIPTION, "A test book with complex content",
             "Description should be extracted"),
            (MetadataType.PUBLISHER, "Test Publisher",
             "Publisher should be extracted"),
            (MetadataType.PUBLICATION_DATE, "2024-01-01",
             "Publication date should be extracted"),
            (MetadataType.IDENTIFIER, "complex-test-id-456",
             "Identifier should be extracted"),
        ]
        for metadata_type, expected, missing_message in expectations:
            # subTest reports every failing field instead of only the first.
            with self.subTest(metadata=metadata_type):
                value = book.get_metadata(metadata_type)
                self.assertIsNotNone(value, missing_message)
                self.assertEqual(value, expected)

    def test_epub_reader_class_direct(self):
        """Test EPUBReader class directly."""
        reader = EPUBReader(self.create_simple_epub())
        book = reader.read()

        self.assertIsInstance(book, Book)
        self.assertEqual(book.title, "Test Book")

        self.assertEqual(
            book.get_metadata(MetadataType.AUTHOR),
            "Test Author", "Author should be extracted")
        self.assertEqual(
            book.get_metadata(MetadataType.LANGUAGE),
            "en", "Language should be extracted")

    def test_epub_with_different_languages(self):
        """Test EPUB with various language codes."""
        test_cases = [
            ("Test French Book", "François Dupont", "fr"),
            ("Test German Book", "Hans Mueller", "de"),
            ("Test Spanish Book", "Juan García", "es"),
            ("Test Japanese Book", "田中太郎", "ja"),
        ]

        for title, author, lang_code in test_cases:
            with self.subTest(language=lang_code):
                book_obj = epub.EpubBook()
                book_obj.set_identifier(f'lang-test-{lang_code}')
                book_obj.set_title(title)
                book_obj.set_language(lang_code)
                book_obj.add_author(author)

                chapter_html = f'''
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Chapter</title></head>
<body>
<h1>Test Chapter</h1>
<p>Content in {lang_code}.</p>
</body>
</html>
'''
                chapter = self._make_chapter(
                    'Chapter', 'chapter.xhtml', chapter_html, lang=lang_code)
                epub_path = self._write_book(
                    book_obj,
                    [chapter],
                    [("chapter.xhtml", "Chapter", "ch")],
                    f'test_lang_{lang_code}')

                parsed_book = read_epub(epub_path)
                self.assertEqual(parsed_book.title, title)
                self.assertEqual(
                    parsed_book.get_metadata(MetadataType.LANGUAGE), lang_code,
                    f"Language should be {lang_code}")
                self.assertEqual(
                    parsed_book.get_metadata(MetadataType.AUTHOR), author,
                    f"Author should be {author}")

    def test_epub_with_minimal_metadata(self):
        """Test EPUB with minimal metadata (only title, no author/language)."""
        book_obj = epub.EpubBook()
        # Deliberately no add_author() / set_language() call.
        book_obj.set_identifier('minimal-metadata-test')
        book_obj.set_title('Minimal Metadata Book')

        chapter_html = '''
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Chapter</title></head>
<body>
<h1>Chapter</h1>
<p>Test content.</p>
</body>
</html>
'''
        # lang=None: the chapter itself carries no language either.
        chapter = self._make_chapter(
            'Chapter', 'chapter.xhtml', chapter_html, lang=None)
        epub_path = self._write_book(
            book_obj,
            [chapter],
            [("chapter.xhtml", "Chapter", "ch")],
            'test_minimal')

        parsed_book = read_epub(epub_path)
        self.assertEqual(parsed_book.title, "Minimal Metadata Book")

        # Only verify that looking up absent metadata does not raise; the
        # returned value (None, empty, or a default) is intentionally not
        # pinned because handling may vary.
        parsed_book.get_metadata(MetadataType.AUTHOR)
        parsed_book.get_metadata(MetadataType.LANGUAGE)

    def test_invalid_epub_handling(self):
        """Test handling of invalid EPUB files."""
        invalid_path = os.path.join(self.test_dir, 'invalid.epub')
        with open(invalid_path, 'w', encoding='utf-8') as f:
            f.write("This is not an EPUB file")

        # The concrete exception type is not pinned by this test.
        with self.assertRaises(Exception):
            read_epub(invalid_path)

    def test_nonexistent_epub_handling(self):
        """Test handling of nonexistent EPUB files."""
        nonexistent_path = os.path.join(self.test_dir, 'nonexistent.epub')

        # The concrete exception type is not pinned by this test.
        with self.assertRaises(Exception):
            read_epub(nonexistent_path)

    def test_epub_with_custom_metadata(self):
        """Test EPUB with various metadata fields."""
        book = epub.EpubBook()
        book.set_identifier('custom-metadata-test')
        book.set_title('Custom Metadata Test')
        book.set_language('en')
        book.add_author('Primary Author')
        book.add_author('Secondary Author')
        book.add_metadata(
            'DC', 'description', 'A comprehensive test of metadata extraction')
        book.add_metadata('DC', 'subject', 'Testing')
        book.add_metadata('DC', 'subject', 'EPUB')
        book.add_metadata('DC', 'date', '2024-06-07')
        book.add_metadata('DC', 'publisher', 'Test Publishing House')
        book.add_metadata('DC', 'rights', 'Public Domain')

        chapter_html = '''
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Metadata Test</title></head>
<body>
<h1>Metadata Test Chapter</h1>
<p>This chapter tests metadata extraction.</p>
</body>
</html>
'''
        chapter = self._make_chapter(
            'Metadata Test', 'metadata.xhtml', chapter_html)
        epub_path = self._write_book(
            book,
            [chapter],
            [("metadata.xhtml", "Metadata Test", "meta")],
            'test_metadata')

        parsed_book = read_epub(epub_path)
        self.assertEqual(parsed_book.title, "Custom Metadata Test")

        # Note: when multiple authors are added with ebooklib, the behavior
        # may vary. The EPUB reader currently only extracts the first
        # DC:creator element it finds, so either author name is accepted.
        author = parsed_book.get_metadata(MetadataType.AUTHOR)
        self.assertIsNotNone(author, "Author should be extracted")
        self.assertIn(
            "Author", author,
            f"Author metadata should contain an author name, got: {author}")

        self.assertEqual(
            parsed_book.get_metadata(MetadataType.LANGUAGE),
            "en", "Language should be 'en'")
        self.assertEqual(
            parsed_book.get_metadata(MetadataType.DESCRIPTION),
            "A comprehensive test of metadata extraction",
            "Description should match")
        self.assertEqual(
            parsed_book.get_metadata(MetadataType.PUBLISHER),
            "Test Publishing House", "Publisher should match")
        self.assertEqual(
            parsed_book.get_metadata(MetadataType.PUBLICATION_DATE),
            "2024-06-07", "Publication date should match")
        self.assertEqual(
            parsed_book.get_metadata(MetadataType.IDENTIFIER),
            "custom-metadata-test", "Identifier should match")

        chapters = list(parsed_book.chapters)
        self.assertEqual(len(chapters), 1)
# Both tests build their fixture with ebooklib, so the availability guard
# belongs on the whole class: without it test_epub_with_image would raise
# NameError on ``epub`` instead of being skipped.
@unittest.skipUnless(EBOOKLIB_AVAILABLE, "ebooklib not available")
class TestEPUBIntegrationWithHTMLExtraction(unittest.TestCase):
    """Test cases that specifically verify EPUB reader uses html_extraction properly."""

    def setUp(self):
        """Create the scratch directory and the registry of written EPUBs."""
        self.test_dir = tempfile.mkdtemp()
        self.epub_files = []

    def tearDown(self):
        """Delete every generated EPUB file, then the scratch directory."""
        for epub_file in self.epub_files:
            try:
                os.remove(epub_file)
            except OSError:
                # Best effort: the file may never have been written.
                pass
        # ignore_errors=True also covers an already-missing directory.
        shutil.rmtree(self.test_dir, ignore_errors=True)

    def _write_book(self, book, chapter, toc_link, file_name):
        """Add *chapter* and navigation to *book*, write it, return its path.

        Args:
            book: ``epub.EpubBook`` with metadata (and any extra items) set.
            chapter: the single ``epub.EpubHtml`` chapter of the book.
            toc_link: ``(href, title, uid)`` triple for the table of contents.
            file_name: name of the EPUB file to create inside ``self.test_dir``.
        """
        book.add_item(chapter)
        book.toc = (epub.Link(*toc_link),)
        # Navigation files must be present for the 'nav' spine entry below.
        book.add_item(epub.EpubNcx())
        book.add_item(epub.EpubNav())
        book.spine = ['nav', chapter]

        epub_path = os.path.join(self.test_dir, file_name)
        epub.write_epub(epub_path, book, {})
        self.epub_files.append(epub_path)
        return epub_path

    def test_html_extraction_integration(self):
        """Test that EPUB reader properly uses html_extraction functionality."""
        book = epub.EpubBook()
        book.set_identifier('html-extraction-test')
        book.set_title('HTML Extraction Test')
        book.set_language('en')
        book.add_author('Test Author')

        # One chapter exercising many html_extraction features at once.
        chapter = epub.EpubHtml(
            title='HTML Features',
            file_name='html_features.xhtml',
            lang='en'
        )
        chapter.content = '''
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>HTML Features</title></head>
<body>
<h1>HTML Extraction Test</h1>
<!-- Test paragraph with inline formatting -->
<p>This paragraph has <strong>bold</strong>, <em>italic</em>,
<u>underlined</u>, and <span style="color: #ff0000; font-weight: bold;">styled</span> text.</p>
<!-- Test headings -->
<h2>Second Level Heading</h2>
<h3>Third Level Heading</h3>
<!-- Test lists with styled content -->
<ul>
<li>Plain list item</li>
<li><strong>Bold list item</strong></li>
<li>List item with <em>italic text</em></li>
</ul>
<!-- Test table with styled cells -->
<table>
<tr>
<th style="font-weight: bold;">Header</th>
<th>Value</th>
</tr>
<tr>
<td><span style="color: blue;">Blue text</span></td>
<td>Normal text</td>
</tr>
</table>
<!-- Test blockquote -->
<blockquote>
<p>This is a quoted paragraph with <strong>bold text</strong>.</p>
</blockquote>
<!-- Test code block -->
<pre><code>def test_function():
return "Hello, World!"</code></pre>
<!-- Test nested formatting -->
<p>Nested formatting: <strong>bold with <em>italic nested</em> inside</strong>.</p>
<!-- Test color variations -->
<p>
<span style="color: red;">Red text</span>,
<span style="color: #00ff00;">Green hex</span>,
<span style="color: blue; text-decoration: underline;">Blue underlined</span>.
</p>
</body>
</html>
'''
        epub_path = self._write_book(
            book,
            chapter,
            ("html_features.xhtml", "HTML Features", "html"),
            'html_extraction_test.epub')

        parsed_book = read_epub(epub_path)
        chapters = list(parsed_book.chapters)
        self.assertEqual(len(chapters), 1)

        blocks = list(chapters[0].blocks)
        self.assertGreater(len(blocks), 5)  # Should have multiple blocks

        # Every block kind present in the fixture must come back.
        block_types = [type(block).__name__ for block in blocks]
        self.assertIn('Heading', block_types, "Should have heading blocks")
        self.assertIn('Paragraph', block_types, "Should have paragraph blocks")
        self.assertIn('HList', block_types, "Should have list blocks")
        self.assertIn('Table', block_types, "Should have table blocks")
        self.assertIn('Quote', block_types, "Should have quote blocks")
        self.assertIn('CodeBlock', block_types, "Should have code blocks")

        paragraphs = [block for block in blocks if isinstance(block, Paragraph)]

        # Inline styling must survive: any bold, italic, underlined or
        # non-black word in any paragraph is sufficient.
        styled_content_found = any(
            word.style.weight == FontWeight.BOLD
            or word.style.style == FontStyle.ITALIC
            or word.style.decoration == TextDecoration.UNDERLINE
            or word.style.colour != (0, 0, 0)
            for paragraph in paragraphs
            for _, word in paragraph.words_iter()
        )
        self.assertTrue(
            styled_content_found,
            "Should find styled content in parsed blocks")

        # Colour parsing: at least one pure red, green or blue word should be
        # found (which ones exactly depends on the implementation).
        primary_colours = ((255, 0, 0), (0, 255, 0), (0, 0, 255))
        color_found = any(
            word.style.colour in primary_colours
            for paragraph in paragraphs
            for _, word in paragraph.words_iter()
        )
        self.assertTrue(color_found, "Should find at least one colored text")

    def test_epub_with_image(self):
        """Test that images in EPUB are properly parsed."""
        book = epub.EpubBook()
        book.set_identifier('image-test-id')
        book.set_title('Image Test Book')
        book.set_language('en')
        book.add_author('Test Author')

        # Minimal JPEG data for testing
        img_data = (
            b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00H\x00H\x00\x00'
            b'\xff\xdb\x00C\x00\x08\x06\x06\x07\x06\x05\x08\x07\x07\x07\t\t'
            b'\x08\n\x0c\x14\r\x0c\x0b\x0b\x0c\x19\x12\x13\x0f\x14\x1d\x1a'
            b'\x1f\x1e\x1d\x1a\x1c\x1c $.\' ",#\x1c\x1c(7),01444\x1f\'9=82<.342'
            b'\xff\xc0\x00\x11\x08\x00d\x00d\x01\x01\x11\x00\x02\x11\x01\x03'
            b'\x11\x01\xff\xc4\x00\x14\x00\x01\x00\x00\x00\x00\x00\x00\x00'
            b'\x00\x00\x00\x00\x00\x00\x00\x00\x08\xff\xc4\x00\x14\x10\x01'
            b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'
            b'\x00\xff\xda\x00\x0c\x03\x01\x00\x02\x11\x03\x11\x00\x3f\x00'
            b'\xaa\xff\xd9'
        )

        # The image item is added before the chapter that references it.
        image_item = epub.EpubImage()
        image_item.id = 'test_img'
        image_item.file_name = 'images/test_image.jpg'
        image_item.media_type = 'image/jpeg'
        image_item.content = img_data
        book.add_item(image_item)

        chapter = epub.EpubHtml(
            title='Image Chapter',
            file_name='image_chapter.xhtml',
            lang='en'
        )
        chapter.content = '''<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Image Chapter</title></head>
<body>
<h1>Chapter with Image</h1>
<p>This chapter contains an image:</p>
<img src="images/test_image.jpg" alt="Test image" width="300" height="200" />
<p>Text after the image.</p>
</body>
</html>'''
        epub_path = self._write_book(
            book,
            chapter,
            ("image_chapter.xhtml", "Image Chapter", "img_ch"),
            f'test_image_{len(self.epub_files)}.epub')

        parsed_book = read_epub(epub_path)
        chapters = list(parsed_book.chapters)
        self.assertEqual(len(chapters), 1)

        blocks = list(chapters[0].blocks)
        self.assertGreater(len(blocks), 0)

        heading_blocks = [b for b in blocks if isinstance(b, Heading)]
        paragraph_blocks = [b for b in blocks if isinstance(b, Paragraph)]
        image_blocks = [b for b in blocks if isinstance(b, Image)]

        self.assertEqual(
            len(heading_blocks), 1,
            "Should find exactly one heading block")
        self.assertGreaterEqual(
            len(paragraph_blocks), 2,
            "Should find at least two paragraph blocks")
        self.assertEqual(len(image_blocks), 1, "Should find exactly one image block")

        # Attributes of the <img> tag must be carried over to the block.
        image_block = image_blocks[0]
        self.assertEqual(image_block.alt_text, "Test image")
        self.assertEqual(image_block.width, 300)
        self.assertEqual(image_block.height, 200)
        self.assertIn("test_image.jpg", image_block.source)
# Run the test suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()