# pyWebLayout/tests/test_epub_reader.py

"""
Unit tests for EPUB reader functionality.

Tests the EPUB parsing and conversion to pyWebLayout abstract elements,
using ebooklib to generate test EPUB files.
"""

import unittest
import tempfile
import os
import shutil
from datetime import datetime

# Import ebooklib for creating test EPUB files
try:
    from ebooklib import epub
    EBOOKLIB_AVAILABLE = True
except ImportError:
    EBOOKLIB_AVAILABLE = False

from pyWebLayout.io.readers.epub_reader import read_epub, EPUBReader
from pyWebLayout.abstract.document import Book
from pyWebLayout.abstract.block import (
    Paragraph, Heading, HeadingLevel, Quote, CodeBlock,
    HList, ListStyle, Table, HorizontalRule, Image
)
from pyWebLayout.style import FontWeight, FontStyle, TextDecoration


@unittest.skipUnless(EBOOKLIB_AVAILABLE, "ebooklib not available")
class TestEPUBReader(unittest.TestCase):
    """Test cases for EPUB reader functionality."""

    def setUp(self):
        """Start each test with an empty EPUB registry and a fresh scratch directory."""
        self.epub_files = []
        self.test_dir = tempfile.mkdtemp()

    def tearDown(self):
        """Delete every registered EPUB file, then remove the scratch directory."""
        # Best-effort removal: a file that cannot be removed must not fail the test.
        for generated_path in self.epub_files:
            try:
                os.remove(generated_path)
            except OSError:
                continue

        # Nothing left to do when the scratch directory is already gone.
        scratch_dir = self.test_dir
        if not os.path.exists(scratch_dir):
            return
        shutil.rmtree(scratch_dir, ignore_errors=True)

    def create_simple_epub(self, title="Test Book", author="Test Author"):
        """
        Write a one-chapter EPUB into the scratch directory.

        The chapter holds an h1 heading and two paragraphs; the second
        paragraph mixes strong and em inline formatting.

        Args:
            title: Title stored in the EPUB metadata.
            author: Author stored in the EPUB metadata.

        Returns:
            Path of the written file. The path is also appended to
            ``self.epub_files``.
        """
        ebook = epub.EpubBook()

        # Metadata
        ebook.set_identifier('test-id-123')
        ebook.set_title(title)
        ebook.set_language('en')
        ebook.add_author(author)

        # The only chapter of the book
        opening_chapter = epub.EpubHtml(title='Chapter 1', file_name='chapter1.xhtml', lang='en')
        opening_chapter.content = '''
        <html xmlns="http://www.w3.org/1999/xhtml">
        <head><title>Chapter 1</title></head>
        <body>
            <h1>Chapter One</h1>
            <p>This is the first paragraph of the first chapter.</p>
            <p>This is a <strong>second paragraph</strong> with <em>some formatting</em>.</p>
        </body>
        </html>
        '''
        ebook.add_item(opening_chapter)

        # Table of contents, navigation documents and reading order
        ebook.toc = (epub.Link("chapter1.xhtml", "Chapter 1", "ch1"),)
        ebook.add_item(epub.EpubNcx())
        ebook.add_item(epub.EpubNav())
        ebook.spine = ['nav', opening_chapter]

        # The registry length keeps file names unique within one test
        output_path = os.path.join(self.test_dir, f'test_simple_{len(self.epub_files)}.epub')
        epub.write_epub(output_path, ebook, {})
        self.epub_files.append(output_path)
        return output_path

    def create_complex_epub(self):
        """
        Write a four-chapter EPUB covering several kinds of content.

        Chapters, in spine order: plain paragraphs; inline styling; lists and
        a block quote; a table plus block and inline code. Extra Dublin Core
        metadata (description, subject, date, publisher) is attached as well.

        Returns:
            Path of the written file. The path is also appended to
            ``self.epub_files``.
        """
        ebook = epub.EpubBook()

        # Metadata
        ebook.set_identifier('complex-test-id-456')
        ebook.set_title('Complex Test Book')
        ebook.set_language('en')
        ebook.add_author('Test Author')
        dublin_core_fields = (
            ('description', 'A test book with complex content'),
            ('subject', 'Testing'),
            ('date', '2024-01-01'),
            ('publisher', 'Test Publisher'),
        )
        for field_name, field_value in dublin_core_fields:
            ebook.add_metadata('DC', field_name, field_value)

        # Chapter 1: basic content
        intro_chapter = epub.EpubHtml(title='Introduction', file_name='chapter1.xhtml', lang='en')
        intro_chapter.content = '''
        <html xmlns="http://www.w3.org/1999/xhtml">
        <head><title>Introduction</title></head>
        <body>
            <h1>Introduction</h1>
            <p>Welcome to this <strong>complex test book</strong>.</p>
            <p>This chapter contains basic content to test paragraph parsing.</p>
        </body>
        </html>
        '''

        # Chapter 2: styled content
        styled_chapter = epub.EpubHtml(title='Styled Content', file_name='chapter2.xhtml', lang='en')
        styled_chapter.content = '''
        <html xmlns="http://www.w3.org/1999/xhtml">
        <head><title>Styled Content</title></head>
        <body>
            <h1>Styled Content</h1>
            <p>This chapter contains various <strong>bold text</strong>, <em>italic text</em>,
               and <span style="color: red; font-weight: bold;">colored text</span>.</p>
            <h2>Subsection</h2>
            <p>Text with <u>underline</u> and <s>strikethrough</s>.</p>
            <h3>More Formatting</h3>
            <p>Nested formatting: <strong>bold with <em>italic inside</em></strong>.</p>
        </body>
        </html>
        '''

        # Chapter 3: lists and quotes
        lists_chapter = epub.EpubHtml(title='Lists and Quotes', file_name='chapter3.xhtml', lang='en')
        lists_chapter.content = '''
        <html xmlns="http://www.w3.org/1999/xhtml">
        <head><title>Lists and Quotes</title></head>
        <body>
            <h1>Lists and Quotes</h1>

            <h2>Unordered List</h2>
            <ul>
                <li>First item</li>
                <li><strong>Bold item</strong></li>
                <li>Item with <em>italic text</em></li>
            </ul>

            <h2>Ordered List</h2>
            <ol>
                <li>First numbered item</li>
                <li>Second numbered item</li>
                <li>Third numbered item</li>
            </ol>

            <h2>Quote</h2>
            <blockquote>
                <p>This is a <span style="font-style: italic;">quoted paragraph</span>
                   with some styling.</p>
            </blockquote>
        </body>
        </html>
        '''

        # Chapter 4: tables and code
        tables_chapter = epub.EpubHtml(title='Tables and Code', file_name='chapter4.xhtml', lang='en')
        tables_chapter.content = '''
        <html xmlns="http://www.w3.org/1999/xhtml">
        <head><title>Tables and Code</title></head>
        <body>
            <h1>Tables and Code</h1>

            <h2>Simple Table</h2>
            <table>
                <thead>
                    <tr>
                        <th><strong>Header 1</strong></th>
                        <th><em>Header 2</em></th>
                    </tr>
                </thead>
                <tbody>
                    <tr>
                        <td>Cell 1</td>
                        <td>Cell 2 with <span style="color: blue;">blue text</span></td>
                    </tr>
                    <tr>
                        <td><strong>Bold cell</strong></td>
                        <td>Normal cell</td>
                    </tr>
                </tbody>
            </table>

            <h2>Code Block</h2>
            <pre><code>function test() {
    console.log("Hello, world!");
    return true;
}</code></pre>

            <h2>Inline Code</h2>
            <p>Use the <code>print()</code> function to output text.</p>
        </body>
        </html>
        '''

        # Register the chapters in reading order
        ordered_chapters = [intro_chapter, styled_chapter, lists_chapter, tables_chapter]
        for chapter_item in ordered_chapters:
            ebook.add_item(chapter_item)

        # Table of contents: one link per chapter
        ebook.toc = (
            epub.Link("chapter1.xhtml", "Introduction", "intro"),
            epub.Link("chapter2.xhtml", "Styled Content", "styled"),
            epub.Link("chapter3.xhtml", "Lists and Quotes", "lists"),
            epub.Link("chapter4.xhtml", "Tables and Code", "tables")
        )

        # Navigation documents and reading order
        ebook.add_item(epub.EpubNcx())
        ebook.add_item(epub.EpubNav())
        ebook.spine = ['nav'] + ordered_chapters

        # The registry length keeps file names unique within one test
        output_path = os.path.join(self.test_dir, f'test_complex_{len(self.epub_files)}.epub')
        epub.write_epub(output_path, ebook, {})
        self.epub_files.append(output_path)
        return output_path

    def create_epub_with_nested_content(self):
        """
        Write a one-chapter EPUB whose blocks sit inside nested containers.

        Headings, paragraphs and a list are wrapped in div, section, article,
        aside and footer elements, up to several levels deep.

        Returns:
            Path of the written file. The path is also appended to
            ``self.epub_files``.
        """
        ebook = epub.EpubBook()

        # Metadata
        ebook.set_identifier('nested-test-id-789')
        ebook.set_title('Nested Content Test')
        ebook.set_language('en')
        ebook.add_author('Test Author')

        # The only chapter: containers nested inside containers
        nested_chapter = epub.EpubHtml(title='Nested Content', file_name='nested.xhtml', lang='en')
        nested_chapter.content = '''
        <html xmlns="http://www.w3.org/1999/xhtml">
        <head><title>Nested Content</title></head>
        <body>
            <h1>Nested Content Examples</h1>

            <div>
                <h2>Section in Div</h2>
                <p>Paragraph inside div.</p>

                <section>
                    <h3>Subsection</h3>
                    <article>
                        <h4>Article Header</h4>
                        <p>Article content with <strong>nested <em>formatting</em></strong>.</p>

                        <aside>
                            <p>Sidebar content in aside element.</p>
                            <ul>
                                <li>Nested list item</li>
                                <li>Another <strong>bold</strong> item</li>
                            </ul>
                        </aside>
                    </article>
                </section>
            </div>

            <footer>
                <p>Footer content with <span style="font-size: 12px; color: gray;">small gray text</span>.</p>
            </footer>
        </body>
        </html>
        '''
        ebook.add_item(nested_chapter)

        # Table of contents, navigation documents and reading order
        ebook.toc = (epub.Link("nested.xhtml", "Nested Content", "nested"),)
        ebook.add_item(epub.EpubNcx())
        ebook.add_item(epub.EpubNav())
        ebook.spine = ['nav', nested_chapter]

        # The registry length keeps file names unique within one test
        output_path = os.path.join(self.test_dir, f'test_nested_{len(self.epub_files)}.epub')
        epub.write_epub(output_path, ebook, {})
        self.epub_files.append(output_path)
        return output_path

    def test_simple_epub_reading(self):
        """The simple EPUB loads as a Book with one chapter holding a heading and a paragraph."""
        parsed = read_epub(self.create_simple_epub())

        # Result type and title
        self.assertIsInstance(parsed, Book)
        self.assertEqual(parsed.title, "Test Book")

        # Exactly one chapter, and it is not empty
        all_chapters = list(parsed.chapters)
        self.assertEqual(len(all_chapters), 1)
        content_blocks = list(all_chapters[0].blocks)
        self.assertGreater(len(content_blocks), 0)

        # One pass over the blocks records which block kinds are present
        heading_seen = False
        paragraph_seen = False
        for content_block in content_blocks:
            if isinstance(content_block, Heading):
                heading_seen = True
            if isinstance(content_block, Paragraph):
                paragraph_seen = True

        self.assertTrue(heading_seen, "Should contain at least one heading")
        self.assertTrue(paragraph_seen, "Should contain at least one paragraph")

    def test_complex_epub_reading(self):
        """The complex EPUB loads as a Book with four chapters, each opening with a heading."""
        parsed = read_epub(self.create_complex_epub())

        # Result type and title
        self.assertIsInstance(parsed, Book)
        self.assertEqual(parsed.title, "Complex Test Book")

        # One parsed chapter per content document
        all_chapters = list(parsed.chapters)
        self.assertEqual(len(all_chapters), 4)

        # Every chapter must be non-empty and open with its heading
        for i, chapter in enumerate(all_chapters):
            chapter_blocks = list(chapter.blocks)
            self.assertGreater(len(chapter_blocks), 0, f"Chapter {i+1} should have blocks")
            self.assertIsInstance(chapter_blocks[0], Heading, f"Chapter {i+1} should start with heading")

    def test_epub_styled_content(self):
        """
        Test that styled content in EPUB is properly parsed.

        Chapter 2 of the complex EPUB (index 1) must contain at least one
        paragraph word that is bold, italic, or has a non-black colour.
        """
        book = read_epub(self.create_complex_epub())
        chapters = list(book.chapters)

        # Assert instead of guarding with ``if``: a missing chapter must fail
        # the test rather than let it pass without checking anything.
        self.assertGreater(len(chapters), 1, "Complex EPUB should contain chapter 2")

        def is_styled(word):
            """Return True when the word's style is bold, italic or non-black."""
            style = word.style
            return (style.weight == FontWeight.BOLD or
                    style.style == FontStyle.ITALIC or
                    style.colour != (0, 0, 0))  # Non-black color

        # any() stops at the first styled word, like the early break it replaces
        styled_words_found = any(
            is_styled(word)
            for block in chapters[1].blocks
            if isinstance(block, Paragraph)
            for _, word in block.words()
        )

        self.assertTrue(styled_words_found, "Should find styled words in chapter 2")

    def test_epub_lists(self):
        """
        Test that lists in EPUB are properly parsed.

        Chapter 3 of the complex EPUB (index 2) must contain an unordered
        list and an ordered list, each with at least one item, plus a quote.
        """
        book = read_epub(self.create_complex_epub())
        chapters = list(book.chapters)

        # Assert instead of guarding with ``if``: a missing chapter must fail
        # the test rather than let it pass without checking anything.
        self.assertGreater(len(chapters), 2, "Complex EPUB should contain chapter 3")

        unordered_list_found = False
        ordered_list_found = False
        quote_found = False

        for block in chapters[2].blocks:
            if isinstance(block, HList):
                if block.style == ListStyle.UNORDERED:
                    unordered_list_found = True
                    items = list(block.items())
                    self.assertGreater(len(items), 0, "Unordered list should have items")
                elif block.style == ListStyle.ORDERED:
                    ordered_list_found = True
                    items = list(block.items())
                    self.assertGreater(len(items), 0, "Ordered list should have items")
            elif isinstance(block, Quote):
                quote_found = True

        self.assertTrue(unordered_list_found, "Should find unordered list in chapter 3")
        self.assertTrue(ordered_list_found, "Should find ordered list in chapter 3")
        self.assertTrue(quote_found, "Should find quote in chapter 3")

    def test_epub_tables(self):
        """
        Test that tables in EPUB are properly parsed.

        Chapter 4 of the complex EPUB (index 3) must contain a table with at
        least one row and a code block with at least one line.
        """
        book = read_epub(self.create_complex_epub())
        chapters = list(book.chapters)

        # Assert instead of guarding with ``if``: a missing chapter must fail
        # the test rather than let it pass without checking anything.
        self.assertGreater(len(chapters), 3, "Complex EPUB should contain chapter 4")

        table_found = False
        code_block_found = False

        for block in chapters[3].blocks:
            if isinstance(block, Table):
                table_found = True
                rows = list(block.all_rows())
                self.assertGreater(len(rows), 0, "Table should have rows")
            elif isinstance(block, CodeBlock):
                code_block_found = True
                lines = list(block.lines())
                self.assertGreater(len(lines), 0, "Code block should have lines")

        self.assertTrue(table_found, "Should find table in chapter 4")
        self.assertTrue(code_block_found, "Should find code block in chapter 4")

    def test_epub_nested_content(self):
        """Blocks wrapped in nested containers show up in the chapter's block list."""
        parsed = read_epub(self.create_epub_with_nested_content())

        all_chapters = list(parsed.chapters)
        self.assertEqual(len(all_chapters), 1)

        content_blocks = list(all_chapters[0].blocks)
        self.assertGreater(len(content_blocks), 0)

        def count_of(block_type):
            """Count the chapter blocks that are instances of block_type."""
            return sum(1 for content_block in content_blocks if isinstance(content_block, block_type))

        # The fixture declares h1 through h4, so more than two headings are expected
        self.assertGreater(count_of(Heading), 2, "Should have multiple headings from nested content")

        # Paragraphs and the list sit inside div/section/article/aside wrappers
        self.assertGreater(count_of(Paragraph), 0, "Should have paragraphs from nested content")
        self.assertGreater(count_of(HList), 0, "Should have lists from nested content")

    def test_epub_metadata_extraction(self):
        """Reading the metadata-rich complex EPUB succeeds and yields its title."""
        parsed = read_epub(self.create_complex_epub())

        # The title is the only metadata field asserted on here
        self.assertEqual(parsed.title, "Complex Test Book")

        # Reaching this point also shows that parsing the extra metadata
        # fields of the fixture did not raise
        self.assertIsNotNone(parsed.title)

    def test_epub_reader_class_direct(self):
        """Using EPUBReader directly, without the read_epub helper, yields the Book."""
        parsed = EPUBReader(self.create_simple_epub()).read()

        self.assertIsInstance(parsed, Book)
        self.assertEqual(parsed.title, "Test Book")

    def test_invalid_epub_handling(self):
        """Reading a plain-text file that only carries the .epub extension raises."""
        # Plain text written under an .epub name
        bogus_path = os.path.join(self.test_dir, 'invalid.epub')
        with open(bogus_path, 'w') as handle:
            handle.write("This is not an EPUB file")

        # Any exception type is accepted; the reader must not return normally
        self.assertRaises(Exception, read_epub, bogus_path)

    def test_nonexistent_epub_handling(self):
        """Reading a path that was never created raises."""
        missing_path = os.path.join(self.test_dir, 'nonexistent.epub')

        # Any exception type is accepted; the reader must not return normally
        self.assertRaises(Exception, read_epub, missing_path)

    def test_epub_with_custom_metadata(self):
        """An EPUB with two authors and repeated Dublin Core fields still parses to one chapter."""
        ebook = epub.EpubBook()

        # Comprehensive metadata: two authors, two subjects, plus single-valued fields
        ebook.set_identifier('custom-metadata-test')
        ebook.set_title('Custom Metadata Test')
        ebook.set_language('en')
        for author_name in ('Primary Author', 'Secondary Author'):
            ebook.add_author(author_name)
        dublin_core_fields = (
            ('description', 'A comprehensive test of metadata extraction'),
            ('subject', 'Testing'),
            ('subject', 'EPUB'),
            ('date', '2024-06-07'),
            ('publisher', 'Test Publishing House'),
            ('rights', 'Public Domain'),
        )
        for field_name, field_value in dublin_core_fields:
            ebook.add_metadata('DC', field_name, field_value)

        # Minimal chapter so the book has readable content
        metadata_chapter = epub.EpubHtml(title='Metadata Test', file_name='metadata.xhtml', lang='en')
        metadata_chapter.content = '''
        <html xmlns="http://www.w3.org/1999/xhtml">
        <head><title>Metadata Test</title></head>
        <body>
            <h1>Metadata Test Chapter</h1>
            <p>This chapter tests metadata extraction.</p>
        </body>
        </html>
        '''

        ebook.add_item(metadata_chapter)
        ebook.toc = (epub.Link("metadata.xhtml", "Metadata Test", "meta"),)
        ebook.add_item(epub.EpubNcx())
        ebook.add_item(epub.EpubNav())
        ebook.spine = ['nav', metadata_chapter]

        # Write the file and register it for cleanup
        output_path = os.path.join(self.test_dir, f'test_metadata_{len(self.epub_files)}.epub')
        epub.write_epub(output_path, ebook, {})
        self.epub_files.append(output_path)

        # Read it back: the title round-trips and the single chapter is present
        parsed = read_epub(output_path)
        self.assertEqual(parsed.title, "Custom Metadata Test")
        self.assertEqual(len(list(parsed.chapters)), 1)


class TestEPUBIntegrationWithHTMLExtraction(unittest.TestCase):
    """Test cases that specifically verify EPUB reader uses html_extraction properly."""

    def setUp(self):
        """Start each test with an empty EPUB registry and a fresh scratch directory."""
        self.epub_files = []
        self.test_dir = tempfile.mkdtemp()

    def tearDown(self):
        """Delete every registered EPUB file, then remove the scratch directory."""
        # Best-effort removal: a file that cannot be removed must not fail the test.
        for generated_path in self.epub_files:
            try:
                os.remove(generated_path)
            except OSError:
                continue

        # Nothing left to do when the scratch directory is already gone.
        scratch_dir = self.test_dir
        if not os.path.exists(scratch_dir):
            return
        shutil.rmtree(scratch_dir, ignore_errors=True)

    @unittest.skipUnless(EBOOKLIB_AVAILABLE, "ebooklib not available")
    def test_html_extraction_integration(self):
        """
        Read back one feature-rich chapter and check block types and word styles.

        The chapter combines headings, inline formatting, a list, a table, a
        block quote, a code block and coloured spans. The parsed chapter must
        expose every expected block type, at least one styled paragraph word,
        and at least one paragraph word in pure red, green or blue.
        """
        ebook = epub.EpubBook()
        ebook.set_identifier('html-extraction-test')
        ebook.set_title('HTML Extraction Test')
        ebook.set_language('en')
        ebook.add_author('Test Author')

        # Single chapter exercising the HTML features under test
        features_chapter = epub.EpubHtml(title='HTML Features', file_name='html_features.xhtml', lang='en')
        features_chapter.content = '''
        <html xmlns="http://www.w3.org/1999/xhtml">
        <head><title>HTML Features</title></head>
        <body>
            <h1>HTML Extraction Test</h1>

            <!-- Test paragraph with inline formatting -->
            <p>This paragraph has <strong>bold</strong>, <em>italic</em>,
               <u>underlined</u>, and <span style="color: #ff0000; font-weight: bold;">styled</span> text.</p>

            <!-- Test headings -->
            <h2>Second Level Heading</h2>
            <h3>Third Level Heading</h3>

            <!-- Test lists with styled content -->
            <ul>
                <li>Plain list item</li>
                <li><strong>Bold list item</strong></li>
                <li>List item with <em>italic text</em></li>
            </ul>

            <!-- Test table with styled cells -->
            <table>
                <tr>
                    <th style="font-weight: bold;">Header</th>
                    <th>Value</th>
                </tr>
                <tr>
                    <td><span style="color: blue;">Blue text</span></td>
                    <td>Normal text</td>
                </tr>
            </table>

            <!-- Test blockquote -->
            <blockquote>
                <p>This is a quoted paragraph with <strong>bold text</strong>.</p>
            </blockquote>

            <!-- Test code block -->
            <pre><code>def test_function():
    return "Hello, World!"</code></pre>

            <!-- Test nested formatting -->
            <p>Nested formatting: <strong>bold with <em>italic nested</em> inside</strong>.</p>

            <!-- Test color variations -->
            <p>
                <span style="color: red;">Red text</span>,
                <span style="color: #00ff00;">Green hex</span>,
                <span style="color: blue; text-decoration: underline;">Blue underlined</span>.
            </p>
        </body>
        </html>
        '''

        ebook.add_item(features_chapter)
        ebook.toc = (epub.Link("html_features.xhtml", "HTML Features", "html"),)
        ebook.add_item(epub.EpubNcx())
        ebook.add_item(epub.EpubNav())
        ebook.spine = ['nav', features_chapter]

        # Write the file and register it for cleanup
        output_path = os.path.join(self.test_dir, 'html_extraction_test.epub')
        epub.write_epub(output_path, ebook, {})
        self.epub_files.append(output_path)

        # Read it back
        parsed = read_epub(output_path)
        all_chapters = list(parsed.chapters)
        self.assertEqual(len(all_chapters), 1)

        parsed_blocks = list(all_chapters[0].blocks)
        self.assertGreater(len(parsed_blocks), 5)  # Should have multiple blocks

        # Every expected block type must occur, matched by class name
        seen_type_names = [type(parsed_block).__name__ for parsed_block in parsed_blocks]
        expected_types = (
            ('Heading', "Should have heading blocks"),
            ('Paragraph', "Should have paragraph blocks"),
            ('HList', "Should have list blocks"),
            ('Table', "Should have table blocks"),
            ('Quote', "Should have quote blocks"),
            ('CodeBlock', "Should have code blocks"),
        )
        for type_name, failure_message in expected_types:
            self.assertIn(type_name, seen_type_names, failure_message)

        # Both style checks below only look at words that belong to paragraphs
        paragraph_blocks = [parsed_block for parsed_block in parsed_blocks
                            if isinstance(parsed_block, Paragraph)]

        # Styling survived if any word is bold, italic, underlined or non-black
        styled_content_found = False
        for paragraph in paragraph_blocks:
            styled_content_found = any(
                word.style.weight == FontWeight.BOLD
                or word.style.style == FontStyle.ITALIC
                or word.style.decoration == TextDecoration.UNDERLINE
                or word.style.colour != (0, 0, 0)
                for _, word in paragraph.words()
            )
            if styled_content_found:
                break

        self.assertTrue(styled_content_found, "Should find styled content in parsed blocks")

        # Colour parsing: look for pure red, green or blue words
        red_text_found = False
        green_text_found = False
        blue_text_found = False
        for paragraph in paragraph_blocks:
            for _, word in paragraph.words():
                word_colour = word.style.colour
                if word_colour == (255, 0, 0):  # Red
                    red_text_found = True
                elif word_colour == (0, 255, 0):  # Green
                    green_text_found = True
                elif word_colour == (0, 0, 255):  # Blue
                    blue_text_found = True

        # One matching colour is enough; which ones match depends on the implementation
        color_found = red_text_found or green_text_found or blue_text_found
        self.assertTrue(color_found, "Should find at least one colored text")


# Run the tests with unittest's command-line runner when this file is
# executed directly as a script.
if __name__ == '__main__':
    unittest.main()