831 lines
31 KiB
Python
831 lines
31 KiB
Python
"""
Unit tests for EPUB reader functionality.

Tests the EPUB parsing and conversion to pyWebLayout abstract elements,
using ebooklib to generate test EPUB files.
"""
|
|
|
|
import unittest
|
|
import tempfile
|
|
import os
|
|
import shutil
|
|
from datetime import datetime
|
|
|
|
# Import ebooklib for creating test EPUB files
|
|
try:
|
|
from ebooklib import epub
|
|
EBOOKLIB_AVAILABLE = True
|
|
except ImportError:
|
|
EBOOKLIB_AVAILABLE = False
|
|
|
|
from pyWebLayout.io.readers.epub_reader import read_epub, EPUBReader
|
|
from pyWebLayout.abstract.document import Book
|
|
from pyWebLayout.abstract.block import (
|
|
Paragraph, Heading, HeadingLevel, Quote, CodeBlock,
|
|
HList, ListStyle, Table, HorizontalRule, Image
|
|
)
|
|
from pyWebLayout.style import FontWeight, FontStyle, TextDecoration
|
|
|
|
|
|
@unittest.skipUnless(EBOOKLIB_AVAILABLE, "ebooklib not available")
class TestEPUBReader(unittest.TestCase):
    """Test cases for EPUB reader functionality.

    Every test builds a throwaway EPUB with ebooklib inside a per-test
    temporary directory, parses it with ``read_epub`` / ``EPUBReader`` and
    inspects the resulting chapters and blocks.
    """

    def setUp(self):
        """Create the scratch directory and the list of generated EPUB paths."""
        self.test_dir = tempfile.mkdtemp()
        self.epub_files = []

    def tearDown(self):
        """Delete generated EPUB files and the scratch directory (best effort)."""
        for epub_file in self.epub_files:
            try:
                os.remove(epub_file)
            except OSError:
                # Deliberately ignored: cleanup must never fail a test, and
                # the rmtree below removes whatever is left in the directory.
                pass

        if os.path.exists(self.test_dir):
            shutil.rmtree(self.test_dir, ignore_errors=True)

    # ------------------------------------------------------------------
    # Private fixture helpers
    # ------------------------------------------------------------------

    def _new_book(self, identifier, title, authors=('Test Author',)):
        """Return an ``epub.EpubBook`` with identifier, title, language 'en' and authors set."""
        book = epub.EpubBook()
        book.set_identifier(identifier)
        book.set_title(title)
        book.set_language('en')
        for author in authors:
            book.add_author(author)
        return book

    def _new_chapter(self, title, file_name, content):
        """Return an English ``epub.EpubHtml`` item holding *content*."""
        chapter = epub.EpubHtml(title=title, file_name=file_name, lang='en')
        chapter.content = content
        return chapter

    def _write_book(self, book, chapters, toc, stem):
        """Attach chapters, TOC, navigation and spine to *book* and write it.

        The file is written into ``self.test_dir`` as
        ``<stem>_<number of files generated so far>.epub`` and its path is
        appended to ``self.epub_files`` so ``tearDown`` removes it.

        Returns:
            The path of the written EPUB file.
        """
        for chapter in chapters:
            book.add_item(chapter)

        book.toc = tuple(toc)

        # Navigation files
        book.add_item(epub.EpubNcx())
        book.add_item(epub.EpubNav())

        book.spine = ['nav', *chapters]

        epub_path = os.path.join(
            self.test_dir, f'{stem}_{len(self.epub_files)}.epub'
        )
        epub.write_epub(epub_path, book, {})
        self.epub_files.append(epub_path)
        return epub_path

    # ------------------------------------------------------------------
    # EPUB builders
    # ------------------------------------------------------------------

    def create_simple_epub(self, title="Test Book", author="Test Author"):
        """Create a single-chapter EPUB (heading plus two paragraphs); return its path."""
        book = self._new_book('test-id-123', title, authors=(author,))

        chapter1 = self._new_chapter('Chapter 1', 'chapter1.xhtml', '''
        <html xmlns="http://www.w3.org/1999/xhtml">
        <head><title>Chapter 1</title></head>
        <body>
            <h1>Chapter One</h1>
            <p>This is the first paragraph of the first chapter.</p>
            <p>This is a <strong>second paragraph</strong> with <em>some formatting</em>.</p>
        </body>
        </html>
        ''')

        toc = [epub.Link("chapter1.xhtml", "Chapter 1", "ch1")]
        return self._write_book(book, [chapter1], toc, 'test_simple')

    def create_complex_epub(self):
        """Create a four-chapter EPUB covering several content types; return its path.

        Chapters, in spine order: basic paragraphs, inline styling, lists and
        a blockquote, a table plus code.
        """
        book = self._new_book('complex-test-id-456', 'Complex Test Book')
        book.add_metadata('DC', 'description', 'A test book with complex content')
        book.add_metadata('DC', 'subject', 'Testing')
        book.add_metadata('DC', 'date', '2024-01-01')
        book.add_metadata('DC', 'publisher', 'Test Publisher')

        # Chapter 1: Basic content
        chapter1 = self._new_chapter('Introduction', 'chapter1.xhtml', '''
        <html xmlns="http://www.w3.org/1999/xhtml">
        <head><title>Introduction</title></head>
        <body>
            <h1>Introduction</h1>
            <p>Welcome to this <strong>complex test book</strong>.</p>
            <p>This chapter contains basic content to test paragraph parsing.</p>
        </body>
        </html>
        ''')

        # Chapter 2: Styled content
        chapter2 = self._new_chapter('Styled Content', 'chapter2.xhtml', '''
        <html xmlns="http://www.w3.org/1999/xhtml">
        <head><title>Styled Content</title></head>
        <body>
            <h1>Styled Content</h1>
            <p>This chapter contains various <strong>bold text</strong>, <em>italic text</em>,
            and <span style="color: red; font-weight: bold;">colored text</span>.</p>
            <h2>Subsection</h2>
            <p>Text with <u>underline</u> and <s>strikethrough</s>.</p>
            <h3>More Formatting</h3>
            <p>Nested formatting: <strong>bold with <em>italic inside</em></strong>.</p>
        </body>
        </html>
        ''')

        # Chapter 3: Lists and quotes
        chapter3 = self._new_chapter('Lists and Quotes', 'chapter3.xhtml', '''
        <html xmlns="http://www.w3.org/1999/xhtml">
        <head><title>Lists and Quotes</title></head>
        <body>
            <h1>Lists and Quotes</h1>

            <h2>Unordered List</h2>
            <ul>
                <li>First item</li>
                <li><strong>Bold item</strong></li>
                <li>Item with <em>italic text</em></li>
            </ul>

            <h2>Ordered List</h2>
            <ol>
                <li>First numbered item</li>
                <li>Second numbered item</li>
                <li>Third numbered item</li>
            </ol>

            <h2>Quote</h2>
            <blockquote>
                <p>This is a <span style="font-style: italic;">quoted paragraph</span>
                with some styling.</p>
            </blockquote>
        </body>
        </html>
        ''')

        # Chapter 4: Tables and code
        chapter4 = self._new_chapter('Tables and Code', 'chapter4.xhtml', '''
        <html xmlns="http://www.w3.org/1999/xhtml">
        <head><title>Tables and Code</title></head>
        <body>
            <h1>Tables and Code</h1>

            <h2>Simple Table</h2>
            <table>
                <thead>
                    <tr>
                        <th><strong>Header 1</strong></th>
                        <th><em>Header 2</em></th>
                    </tr>
                </thead>
                <tbody>
                    <tr>
                        <td>Cell 1</td>
                        <td>Cell 2 with <span style="color: blue;">blue text</span></td>
                    </tr>
                    <tr>
                        <td><strong>Bold cell</strong></td>
                        <td>Normal cell</td>
                    </tr>
                </tbody>
            </table>

            <h2>Code Block</h2>
            <pre><code>function test() {
    console.log("Hello, world!");
    return true;
}</code></pre>

            <h2>Inline Code</h2>
            <p>Use the <code>print()</code> function to output text.</p>
        </body>
        </html>
        ''')

        toc = [
            epub.Link("chapter1.xhtml", "Introduction", "intro"),
            epub.Link("chapter2.xhtml", "Styled Content", "styled"),
            epub.Link("chapter3.xhtml", "Lists and Quotes", "lists"),
            epub.Link("chapter4.xhtml", "Tables and Code", "tables"),
        ]
        return self._write_book(
            book, [chapter1, chapter2, chapter3, chapter4], toc, 'test_complex'
        )

    def create_epub_with_nested_content(self):
        """Create an EPUB whose chapter nests div/section/article/aside/footer; return its path."""
        book = self._new_book('nested-test-id-789', 'Nested Content Test')

        chapter = self._new_chapter('Nested Content', 'nested.xhtml', '''
        <html xmlns="http://www.w3.org/1999/xhtml">
        <head><title>Nested Content</title></head>
        <body>
            <h1>Nested Content Examples</h1>

            <div>
                <h2>Section in Div</h2>
                <p>Paragraph inside div.</p>

                <section>
                    <h3>Subsection</h3>
                    <article>
                        <h4>Article Header</h4>
                        <p>Article content with <strong>nested <em>formatting</em></strong>.</p>

                        <aside>
                            <p>Sidebar content in aside element.</p>
                            <ul>
                                <li>Nested list item</li>
                                <li>Another <strong>bold</strong> item</li>
                            </ul>
                        </aside>
                    </article>
                </section>
            </div>

            <footer>
                <p>Footer content with <span style="font-size: 12px; color: gray;">small gray text</span>.</p>
            </footer>
        </body>
        </html>
        ''')

        toc = [epub.Link("nested.xhtml", "Nested Content", "nested")]
        return self._write_book(book, [chapter], toc, 'test_nested')

    # ------------------------------------------------------------------
    # Tests
    # ------------------------------------------------------------------

    def test_simple_epub_reading(self):
        """Test reading a simple EPUB file."""
        book = read_epub(self.create_simple_epub())

        self.assertIsInstance(book, Book)
        self.assertEqual(book.title, "Test Book")

        chapters = list(book.chapters)
        self.assertEqual(len(chapters), 1)

        blocks = list(chapters[0].blocks)
        self.assertGreater(len(blocks), 0)

        # Should have a heading and paragraphs
        has_heading = any(isinstance(block, Heading) for block in blocks)
        has_paragraph = any(isinstance(block, Paragraph) for block in blocks)

        self.assertTrue(has_heading, "Should contain at least one heading")
        self.assertTrue(has_paragraph, "Should contain at least one paragraph")

    def test_complex_epub_reading(self):
        """Test reading a complex EPUB file with multiple chapters."""
        book = read_epub(self.create_complex_epub())

        self.assertIsInstance(book, Book)
        self.assertEqual(book.title, "Complex Test Book")

        chapters = list(book.chapters)
        self.assertEqual(len(chapters), 4)

        for number, chapter in enumerate(chapters, start=1):
            blocks = list(chapter.blocks)
            self.assertGreater(len(blocks), 0, f"Chapter {number} should have blocks")

            # Each chapter should start with a heading
            self.assertIsInstance(
                blocks[0], Heading, f"Chapter {number} should start with heading"
            )

    def test_epub_styled_content(self):
        """Test that styled content in EPUB is properly parsed."""
        book = read_epub(self.create_complex_epub())
        chapters = list(book.chapters)

        # Fail loudly instead of silently skipping the checks below when the
        # styled chapter (index 1) is missing.
        self.assertGreater(len(chapters), 1, "Complex EPUB should expose chapter 2")
        chapter2_blocks = list(chapters[1].blocks)

        # A word counts as styled when it is bold, italic or non-black.
        styled_words_found = any(
            word.style.weight == FontWeight.BOLD
            or word.style.style == FontStyle.ITALIC
            or word.style.colour != (0, 0, 0)
            for block in chapter2_blocks
            if isinstance(block, Paragraph)
            for _, word in block.words_iter()
        )

        self.assertTrue(styled_words_found, "Should find styled words in chapter 2")

    def test_epub_lists(self):
        """Test that lists in EPUB are properly parsed."""
        book = read_epub(self.create_complex_epub())
        chapters = list(book.chapters)

        # Fail loudly instead of silently skipping the checks below when the
        # lists chapter (index 2) is missing.
        self.assertGreater(len(chapters), 2, "Complex EPUB should expose chapter 3")
        chapter3_blocks = list(chapters[2].blocks)

        unordered_list_found = False
        ordered_list_found = False
        quote_found = False

        for block in chapter3_blocks:
            if isinstance(block, HList):
                if block.style == ListStyle.UNORDERED:
                    unordered_list_found = True
                    items = list(block.items())
                    self.assertGreater(len(items), 0, "Unordered list should have items")
                elif block.style == ListStyle.ORDERED:
                    ordered_list_found = True
                    items = list(block.items())
                    self.assertGreater(len(items), 0, "Ordered list should have items")
            elif isinstance(block, Quote):
                quote_found = True

        self.assertTrue(unordered_list_found, "Should find unordered list in chapter 3")
        self.assertTrue(ordered_list_found, "Should find ordered list in chapter 3")
        self.assertTrue(quote_found, "Should find quote in chapter 3")

    def test_epub_tables(self):
        """Test that tables and code blocks in EPUB are properly parsed."""
        book = read_epub(self.create_complex_epub())
        chapters = list(book.chapters)

        # Fail loudly instead of silently skipping the checks below when the
        # tables chapter (index 3) is missing.
        self.assertGreater(len(chapters), 3, "Complex EPUB should expose chapter 4")
        chapter4_blocks = list(chapters[3].blocks)

        table_found = False
        code_block_found = False

        for block in chapter4_blocks:
            if isinstance(block, Table):
                table_found = True
                rows = list(block.all_rows())
                self.assertGreater(len(rows), 0, "Table should have rows")
            elif isinstance(block, CodeBlock):
                code_block_found = True
                lines = list(block.lines())
                self.assertGreater(len(lines), 0, "Code block should have lines")

        self.assertTrue(table_found, "Should find table in chapter 4")
        self.assertTrue(code_block_found, "Should find code block in chapter 4")

    def test_epub_nested_content(self):
        """Test that nested content structures are properly parsed."""
        book = read_epub(self.create_epub_with_nested_content())

        chapters = list(book.chapters)
        self.assertEqual(len(chapters), 1)

        chapter_blocks = list(chapters[0].blocks)
        self.assertGreater(len(chapter_blocks), 0)

        # Should have multiple headings (h1, h2, h3, h4)
        headings = [block for block in chapter_blocks if isinstance(block, Heading)]
        self.assertGreater(len(headings), 2, "Should have multiple headings from nested content")

        # Should have paragraphs and lists from nested content
        paragraphs = [block for block in chapter_blocks if isinstance(block, Paragraph)]
        lists = [block for block in chapter_blocks if isinstance(block, HList)]

        self.assertGreater(len(paragraphs), 0, "Should have paragraphs from nested content")
        self.assertGreater(len(lists), 0, "Should have lists from nested content")

    def test_epub_metadata_extraction(self):
        """Test that EPUB metadata is properly extracted.

        Only the title is asserted; reading a book that carries the extra
        Dublin Core fields also checks that parsing them does not raise.
        """
        book = read_epub(self.create_complex_epub())
        self.assertEqual(book.title, "Complex Test Book")

    def test_epub_reader_class_direct(self):
        """Test EPUBReader class directly."""
        reader = EPUBReader(self.create_simple_epub())
        book = reader.read()

        self.assertIsInstance(book, Book)
        self.assertEqual(book.title, "Test Book")

    def test_invalid_epub_handling(self):
        """Test handling of invalid EPUB files."""
        # A plain-text file with an .epub extension
        invalid_path = os.path.join(self.test_dir, 'invalid.epub')
        with open(invalid_path, 'w', encoding='utf-8') as f:
            f.write("This is not an EPUB file")

        # The concrete exception type is not pinned here; any exception passes.
        with self.assertRaises(Exception):
            read_epub(invalid_path)

    def test_nonexistent_epub_handling(self):
        """Test handling of nonexistent EPUB files."""
        nonexistent_path = os.path.join(self.test_dir, 'nonexistent.epub')

        with self.assertRaises(Exception):
            read_epub(nonexistent_path)

    def test_epub_with_custom_metadata(self):
        """Test EPUB with various metadata fields."""
        book = self._new_book(
            'custom-metadata-test',
            'Custom Metadata Test',
            authors=('Primary Author', 'Secondary Author'),
        )
        book.add_metadata('DC', 'description', 'A comprehensive test of metadata extraction')
        book.add_metadata('DC', 'subject', 'Testing')
        book.add_metadata('DC', 'subject', 'EPUB')
        book.add_metadata('DC', 'date', '2024-06-07')
        book.add_metadata('DC', 'publisher', 'Test Publishing House')
        book.add_metadata('DC', 'rights', 'Public Domain')

        chapter = self._new_chapter('Metadata Test', 'metadata.xhtml', '''
        <html xmlns="http://www.w3.org/1999/xhtml">
        <head><title>Metadata Test</title></head>
        <body>
            <h1>Metadata Test Chapter</h1>
            <p>This chapter tests metadata extraction.</p>
        </body>
        </html>
        ''')

        toc = [epub.Link("metadata.xhtml", "Metadata Test", "meta")]
        epub_path = self._write_book(book, [chapter], toc, 'test_metadata')

        # Read and verify
        parsed_book = read_epub(epub_path)
        self.assertEqual(parsed_book.title, "Custom Metadata Test")

        # Verify chapters were created
        chapters = list(parsed_book.chapters)
        self.assertEqual(len(chapters), 1)
|
|
|
|
|
|
@unittest.skipUnless(EBOOKLIB_AVAILABLE, "ebooklib not available")
class TestEPUBIntegrationWithHTMLExtraction(unittest.TestCase):
    """Test cases that specifically verify EPUB reader uses html_extraction properly.

    The whole class is skipped when ebooklib is unavailable, because every
    test here builds its fixture with ``epub``.
    """

    def setUp(self):
        """Create the scratch directory and the list of generated EPUB paths."""
        self.test_dir = tempfile.mkdtemp()
        self.epub_files = []

    def tearDown(self):
        """Delete generated EPUB files and the scratch directory (best effort)."""
        for epub_file in self.epub_files:
            try:
                os.remove(epub_file)
            except OSError:
                # Deliberately ignored: cleanup must never fail a test, and
                # the rmtree below removes whatever is left in the directory.
                pass

        if os.path.exists(self.test_dir):
            shutil.rmtree(self.test_dir, ignore_errors=True)

    def test_html_extraction_integration(self):
        """Test that EPUB reader properly uses html_extraction functionality."""
        # Create an EPUB that exercises various HTML extraction features
        book = epub.EpubBook()
        book.set_identifier('html-extraction-test')
        book.set_title('HTML Extraction Test')
        book.set_language('en')
        book.add_author('Test Author')

        chapter = epub.EpubHtml(
            title='HTML Features',
            file_name='html_features.xhtml',
            lang='en',
        )
        chapter.content = '''
        <html xmlns="http://www.w3.org/1999/xhtml">
        <head><title>HTML Features</title></head>
        <body>
            <h1>HTML Extraction Test</h1>

            <!-- Test paragraph with inline formatting -->
            <p>This paragraph has <strong>bold</strong>, <em>italic</em>,
            <u>underlined</u>, and <span style="color: #ff0000; font-weight: bold;">styled</span> text.</p>

            <!-- Test headings -->
            <h2>Second Level Heading</h2>
            <h3>Third Level Heading</h3>

            <!-- Test lists with styled content -->
            <ul>
                <li>Plain list item</li>
                <li><strong>Bold list item</strong></li>
                <li>List item with <em>italic text</em></li>
            </ul>

            <!-- Test table with styled cells -->
            <table>
                <tr>
                    <th style="font-weight: bold;">Header</th>
                    <th>Value</th>
                </tr>
                <tr>
                    <td><span style="color: blue;">Blue text</span></td>
                    <td>Normal text</td>
                </tr>
            </table>

            <!-- Test blockquote -->
            <blockquote>
                <p>This is a quoted paragraph with <strong>bold text</strong>.</p>
            </blockquote>

            <!-- Test code block -->
            <pre><code>def test_function():
    return "Hello, World!"</code></pre>

            <!-- Test nested formatting -->
            <p>Nested formatting: <strong>bold with <em>italic nested</em> inside</strong>.</p>

            <!-- Test color variations -->
            <p>
                <span style="color: red;">Red text</span>,
                <span style="color: #00ff00;">Green hex</span>,
                <span style="color: blue; text-decoration: underline;">Blue underlined</span>.
            </p>
        </body>
        </html>
        '''

        book.add_item(chapter)
        book.toc = (epub.Link("html_features.xhtml", "HTML Features", "html"),)
        book.add_item(epub.EpubNcx())
        book.add_item(epub.EpubNav())
        book.spine = ['nav', chapter]

        # Write EPUB
        epub_path = os.path.join(self.test_dir, 'html_extraction_test.epub')
        epub.write_epub(epub_path, book, {})
        self.epub_files.append(epub_path)

        # Read and analyze
        parsed_book = read_epub(epub_path)
        chapters = list(parsed_book.chapters)
        self.assertEqual(len(chapters), 1)

        blocks = list(chapters[0].blocks)
        self.assertGreater(len(blocks), 5)  # Should have multiple blocks

        # Test that we get the expected block types
        block_types = [type(block).__name__ for block in blocks]
        self.assertIn('Heading', block_types, "Should have heading blocks")
        self.assertIn('Paragraph', block_types, "Should have paragraph blocks")
        self.assertIn('HList', block_types, "Should have list blocks")
        self.assertIn('Table', block_types, "Should have table blocks")
        self.assertIn('Quote', block_types, "Should have quote blocks")
        self.assertIn('CodeBlock', block_types, "Should have code blocks")

        paragraphs = [block for block in blocks if isinstance(block, Paragraph)]

        # Test styled content was preserved: bold, italic, underlined or
        # non-black words all count.
        styled_content_found = any(
            word.style.weight == FontWeight.BOLD
            or word.style.style == FontStyle.ITALIC
            or word.style.decoration == TextDecoration.UNDERLINE
            or word.style.colour != (0, 0, 0)
            for paragraph in paragraphs
            for _, word in paragraph.words_iter()
        )
        self.assertTrue(styled_content_found, "Should find styled content in parsed blocks")

        # Test specific color parsing. Only paragraph words are inspected;
        # at least one pure red, green or blue word must be present.
        primary_colours = ((255, 0, 0), (0, 255, 0), (0, 0, 255))
        color_found = any(
            word.style.colour in primary_colours
            for paragraph in paragraphs
            for _, word in paragraph.words_iter()
        )
        self.assertTrue(color_found, "Should find at least one colored text")

    def test_epub_with_image(self):
        """Test that images in EPUB are properly parsed."""
        book = epub.EpubBook()
        book.set_identifier('image-test-id')
        book.set_title('Image Test Book')
        book.set_language('en')
        book.add_author('Test Author')

        # Create minimal JPEG data for testing
        img_data = b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00H\x00H\x00\x00\xff\xdb\x00C\x00\x08\x06\x06\x07\x06\x05\x08\x07\x07\x07\t\t\x08\n\x0c\x14\r\x0c\x0b\x0b\x0c\x19\x12\x13\x0f\x14\x1d\x1a\x1f\x1e\x1d\x1a\x1c\x1c $.\' ",#\x1c\x1c(7),01444\x1f\'9=82<.342\xff\xc0\x00\x11\x08\x00d\x00d\x01\x01\x11\x00\x02\x11\x01\x03\x11\x01\xff\xc4\x00\x14\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08\xff\xc4\x00\x14\x10\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xda\x00\x0c\x03\x01\x00\x02\x11\x03\x11\x00\x3f\x00\xaa\xff\xd9'

        # Create an EpubImage item
        image_item = epub.EpubImage()
        image_item.id = 'test_img'
        image_item.file_name = 'images/test_image.jpg'
        image_item.media_type = 'image/jpeg'
        image_item.content = img_data

        # Add image to book
        book.add_item(image_item)

        # Create a chapter that references the image
        chapter = epub.EpubHtml(
            title='Image Chapter',
            file_name='image_chapter.xhtml',
            lang='en',
        )
        chapter.content = '''<html xmlns="http://www.w3.org/1999/xhtml">
        <head><title>Image Chapter</title></head>
        <body>
            <h1>Chapter with Image</h1>
            <p>This chapter contains an image:</p>
            <img src="images/test_image.jpg" alt="Test image" width="300" height="200" />
            <p>Text after the image.</p>
        </body>
        </html>'''

        book.add_item(chapter)
        book.toc = (epub.Link("image_chapter.xhtml", "Image Chapter", "img_ch"),)
        book.add_item(epub.EpubNcx())
        book.add_item(epub.EpubNav())
        book.spine = ['nav', chapter]

        # Write EPUB
        epub_path = os.path.join(self.test_dir, f'test_image_{len(self.epub_files)}.epub')
        epub.write_epub(epub_path, book, {})
        self.epub_files.append(epub_path)

        # Read and analyze
        parsed_book = read_epub(epub_path)
        chapters = list(parsed_book.chapters)
        self.assertEqual(len(chapters), 1)

        blocks = list(chapters[0].blocks)
        self.assertGreater(len(blocks), 0)

        # Find blocks by type
        heading_blocks = [block for block in blocks if isinstance(block, Heading)]
        paragraph_blocks = [block for block in blocks if isinstance(block, Paragraph)]
        image_blocks = [block for block in blocks if isinstance(block, Image)]

        # Verify we have the expected blocks
        self.assertEqual(len(heading_blocks), 1, "Should find exactly one heading block")
        self.assertGreaterEqual(len(paragraph_blocks), 2, "Should find at least two paragraph blocks")
        self.assertEqual(len(image_blocks), 1, "Should find exactly one image block")

        # Verify image properties
        image_block = image_blocks[0]
        self.assertEqual(image_block.alt_text, "Test image")
        self.assertEqual(image_block.width, 300)
        self.assertEqual(image_block.height, 200)
        self.assertIn("test_image.jpg", image_block.source)
|
|
|
|
|
|
# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()
|