# pyWebLayout/pyWebLayout/io/readers/epub_reader.py

"""
EPUB reader for pyWebLayout.

This module provides functionality for reading EPUB documents and converting them
to pyWebLayout's abstract document model.
"""

import os
import zipfile
import tempfile
from typing import Dict, List, Optional, Any, Tuple
import xml.etree.ElementTree as ET
import re
import urllib.parse

from pyWebLayout.abstract.document import Document, Book, Chapter, MetadataType
from pyWebLayout.io.readers.html_extraction import parse_html_string


# XML namespaces used in EPUB files
NAMESPACES = {
    'opf': 'http://www.idpf.org/2007/opf',
    'dc': 'http://purl.org/dc/elements/1.1/',
    'dcterms': 'http://purl.org/dc/terms/',
    'xhtml': 'http://www.w3.org/1999/xhtml',
    'ncx': 'http://www.daisy.org/z3986/2005/ncx/',
}


class EPUBReader:
    """
    Reader for EPUB documents.

    This class extracts content from EPUB files and converts it to
    pyWebLayout's abstract document model.
    """

    def __init__(self, epub_path: str):
        """
        Initialize an EPUB reader.

        Args:
            epub_path: Path to the EPUB file
        """
        self.epub_path = epub_path
        self.book = Book()
        self.temp_dir = None
        self.content_dir = None
        self.metadata = {}
        self.toc = []
        self.spine = []
        self.manifest = {}
        self.cover_id = None  # ID of the cover image in manifest

    def read(self) -> Book:
        """
        Read the EPUB file and convert it to a Book.

        Returns:
            Book: The parsed book
        """
        try:
            # Extract the EPUB file
            self.temp_dir = tempfile.mkdtemp()
            self._extract_epub()
            self._parse_package_document()
            self._parse_toc()
            self._create_book()

            # Add chapters to the book
            self._add_chapters()

            return self.book

        finally:
            # Clean up temporary files
            if self.temp_dir:
                import shutil
                shutil.rmtree(self.temp_dir, ignore_errors=True)

    def _extract_epub(self):
        """Extract the EPUB file to a temporary directory."""
        with zipfile.ZipFile(self.epub_path, 'r') as zip_ref:
            zip_ref.extractall(self.temp_dir)

        # Find the content directory (typically OEBPS or OPS)
        container_path = os.path.join(self.temp_dir, 'META-INF', 'container.xml')
        if os.path.exists(container_path):
            tree = ET.parse(container_path)
            root = tree.getroot()

            # Get the path to the package document (content.opf)
            for rootfile in root.findall('.//{urn:oasis:names:tc:opendocument:xmlns:container}rootfile'):
                full_path = rootfile.get('full-path')
                if full_path:
                    self.content_dir = os.path.dirname(os.path.join(self.temp_dir, full_path))
                    return

        # Fallback: look for common content directories
        for content_dir in ['OEBPS', 'OPS', 'Content']:
            if os.path.exists(os.path.join(self.temp_dir, content_dir)):
                self.content_dir = os.path.join(self.temp_dir, content_dir)
                return

        # If no content directory found, use the root
        self.content_dir = self.temp_dir

    def _parse_package_document(self):
        """Parse the package document (content.opf)."""
        # Find the package document
        opf_path = None
        for root, dirs, files in os.walk(self.content_dir):
            for file in files:
                if file.endswith('.opf'):
                    opf_path = os.path.join(root, file)
                    break
            if opf_path:
                break

        if not opf_path:
            raise ValueError("No package document (.opf) found in EPUB")

        # Parse the package document
        tree = ET.parse(opf_path)
        root = tree.getroot()

        # Parse metadata
        self._parse_metadata(root)

        # Parse manifest
        self._parse_manifest(root)

        # Parse spine
        self._parse_spine(root)

    def _parse_metadata(self, root: ET.Element):
        """
        Parse metadata from the package document.

        Args:
            root: Root element of the package document
        """
        # Find the metadata element
        metadata_elem = root.find('.//{{{0}}}metadata'.format(NAMESPACES['opf']))
        if metadata_elem is None:
            return

        # Parse DC metadata
        for elem in metadata_elem:
            if elem.tag.startswith('{{{0}}}'.format(NAMESPACES['dc'])):
                # Get the local name (without namespace)
                name = elem.tag.split('}', 1)[1]
                value = elem.text

                if name == 'title':
                    self.metadata['title'] = value
                elif name == 'creator':
                    self.metadata['creator'] = value
                elif name == 'language':
                    self.metadata['language'] = value
                elif name == 'description':
                    self.metadata['description'] = value
                elif name == 'subject':
                    if 'subjects' not in self.metadata:
                        self.metadata['subjects'] = []
                    self.metadata['subjects'].append(value)
                elif name == 'date':
                    self.metadata['date'] = value
                elif name == 'identifier':
                    self.metadata['identifier'] = value
                elif name == 'publisher':
                    self.metadata['publisher'] = value
                else:
                    # Store other metadata
                    self.metadata[name] = value

        # Parse meta elements for cover reference
        for meta in metadata_elem.findall('.//{{{0}}}meta'.format(NAMESPACES['opf'])):
            name = meta.get('name')
            content = meta.get('content')

            if name == 'cover' and content:
                # This is a reference to the cover image in the manifest
                self.cover_id = content

    def _parse_manifest(self, root: ET.Element):
        """
        Parse manifest from the package document.

        Args:
            root: Root element of the package document
        """
        # Find the manifest element
        manifest_elem = root.find('.//{{{0}}}manifest'.format(NAMESPACES['opf']))
        if manifest_elem is None:
            return

        # Parse items
        for item in manifest_elem.findall('.//{{{0}}}item'.format(NAMESPACES['opf'])):
            id = item.get('id')
            href = item.get('href')
            media_type = item.get('media-type')

            if id and href:
                # Resolve relative path
                href = urllib.parse.unquote(href)
                path = os.path.normpath(os.path.join(self.content_dir, href))

                self.manifest[id] = {
                    'href': href,
                    'path': path,
                    'media_type': media_type
                }

    def _parse_spine(self, root: ET.Element):
        """
        Parse spine from the package document.

        Args:
            root: Root element of the package document
        """
        # Find the spine element
        spine_elem = root.find('.//{{{0}}}spine'.format(NAMESPACES['opf']))
        if spine_elem is None:
            return

        # Get the toc attribute (NCX file ID)
        toc_id = spine_elem.get('toc')
        if toc_id and toc_id in self.manifest:
            self.toc_path = self.manifest[toc_id]['path']

        # Parse itemrefs
        for itemref in spine_elem.findall('.//{{{0}}}itemref'.format(NAMESPACES['opf'])):
            idref = itemref.get('idref')
            if idref and idref in self.manifest:
                self.spine.append(idref)

    def _parse_toc(self):
        """Parse the table of contents."""
        if not hasattr(self, 'toc_path') or not self.toc_path or not os.path.exists(self.toc_path):
            # Try to find the toc.ncx file
            for root, dirs, files in os.walk(self.content_dir):
                for file in files:
                    if file.endswith('.ncx'):
                        self.toc_path = os.path.join(root, file)
                        break
                if hasattr(self, 'toc_path') and self.toc_path:
                    break

        if not hasattr(self, 'toc_path') or not self.toc_path or not os.path.exists(self.toc_path):
            # No TOC found
            return

        # Parse the NCX file
        tree = ET.parse(self.toc_path)
        root = tree.getroot()

        # Parse navMap
        nav_map = root.find('.//{{{0}}}navMap'.format(NAMESPACES['ncx']))
        if nav_map is None:
            return

        # Parse navPoints
        self._parse_nav_points(nav_map, [])

    def _parse_nav_points(self, parent: ET.Element, path: List[Dict[str, Any]]):
        """
        Recursively parse navPoints from the NCX file.

        Args:
            parent: Parent element containing navPoints
            path: Current path in the TOC hierarchy
        """
        for nav_point in parent.findall('.//{{{0}}}navPoint'.format(NAMESPACES['ncx'])):
            # Get navPoint attributes
            id = nav_point.get('id')
            play_order = nav_point.get('playOrder')

            # Get navLabel
            nav_label = nav_point.find('.//{{{0}}}navLabel'.format(NAMESPACES['ncx']))
            text_elem = nav_label.find('.//{{{0}}}text'.format(NAMESPACES['ncx'])) if nav_label else None
            label = text_elem.text if text_elem is not None else ""

            # Get content
            content = nav_point.find('.//{{{0}}}content'.format(NAMESPACES['ncx']))
            src = content.get('src') if content is not None else ""

            # Create a TOC entry
            entry = {
                'id': id,
                'label': label,
                'src': src,
                'play_order': play_order,
                'children': []
            }

            # Add to TOC
            if path:
                path[-1]['children'].append(entry)
            else:
                self.toc.append(entry)

            # Parse child navPoints
            self._parse_nav_points(nav_point, path + [entry])

    def _create_book(self):
        """Create a Book object from the parsed metadata."""
        # Set book metadata
        if 'title' in self.metadata:
            self.book.set_title(self.metadata['title'])

        if 'creator' in self.metadata:
            self.book.set_metadata(MetadataType.AUTHOR, self.metadata['creator'])

        if 'language' in self.metadata:
            self.book.set_metadata(MetadataType.LANGUAGE, self.metadata['language'])

        if 'description' in self.metadata:
            self.book.set_metadata(MetadataType.DESCRIPTION, self.metadata['description'])

        if 'subjects' in self.metadata:
            self.book.set_metadata(MetadataType.KEYWORDS, ', '.join(self.metadata['subjects']))

        if 'date' in self.metadata:
            self.book.set_metadata(MetadataType.PUBLICATION_DATE, self.metadata['date'])

        if 'identifier' in self.metadata:
            self.book.set_metadata(MetadataType.IDENTIFIER, self.metadata['identifier'])

        if 'publisher' in self.metadata:
            self.book.set_metadata(MetadataType.PUBLISHER, self.metadata['publisher'])

    def _add_cover_chapter(self):
        """Add a cover chapter if a cover image is available."""
        if not self.cover_id or self.cover_id not in self.manifest:
            return

        # Get the cover image path from the manifest
        cover_item = self.manifest[self.cover_id]
        cover_path = cover_item['path']

        # Check if the file exists
        if not os.path.exists(cover_path):
            print(f"Warning: Cover image file not found: {cover_path}")
            return

        # Create a cover chapter
        cover_chapter = self.book.create_chapter("Cover", 0)

        try:
            # Create an Image block for the cover
            from pyWebLayout.abstract.block import Image as AbstractImage
            from PIL import Image as PILImage
            import io

            # Load the image into memory before the temp directory is cleaned up
            # We need to fully copy the image data to ensure it persists after temp cleanup
            with open(cover_path, 'rb') as f:
                image_bytes = f.read()

            # Create PIL image from bytes in memory
            pil_image = PILImage.open(io.BytesIO(image_bytes))
            pil_image.load()  # Force loading into memory

            # Create a copy to ensure all data is in memory
            pil_image = pil_image.copy()

            # Create an AbstractImage block with the cover image path
            cover_image = AbstractImage(source=cover_path, alt_text="Cover Image")

            # Set dimensions from the loaded image
            cover_image._width = pil_image.width
            cover_image._height = pil_image.height

            # Store the loaded PIL image in the abstract image so it persists after temp cleanup
            cover_image._loaded_image = pil_image

            # Add the image to the cover chapter
            cover_chapter.add_block(cover_image)

        except Exception as e:
            print(f"Error creating cover chapter: {str(e)}")
            import traceback
            traceback.print_exc()
            # If we can't create the cover image, remove the chapter
            if hasattr(self.book, 'chapters') and cover_chapter in self.book.chapters:
                self.book.chapters.remove(cover_chapter)

    def _add_chapters(self):
        """Add chapters to the book based on the spine and TOC."""
        # Add cover chapter first if available
        self._add_cover_chapter()

        # Create a mapping from src to TOC entry
        toc_map = {}

        def add_to_toc_map(entries):
            for entry in entries:
                if entry['src']:
                    # Extract the path part of the src (remove fragment)
                    src_parts = entry['src'].split('#', 1)
                    path = src_parts[0]
                    toc_map[path] = entry

                # Process children
                if entry['children']:
                    add_to_toc_map(entry['children'])

        add_to_toc_map(self.toc)

        # Process spine items
        # Start from chapter_index = 1 if cover was added, otherwise 0
        chapter_index = 1 if (self.cover_id and self.cover_id in self.manifest) else 0
        for i, idref in enumerate(self.spine):
            if idref not in self.manifest:
                continue

            item = self.manifest[idref]
            path = item['path']
            href = item['href']

            # Skip navigation files
            if (idref == 'nav' or
                item.get('media_type') == 'application/xhtml+xml' and
                ('nav' in href.lower() or 'toc' in href.lower())):
                continue

            # Check if this item is in the TOC
            chapter_title = None
            if href in toc_map:
                chapter_title = toc_map[href]['label']

            # Create a chapter
            chapter_index += 1
            chapter = self.book.create_chapter(chapter_title, chapter_index)

            # Parse the HTML content
            try:
                # Read the HTML file
                with open(path, 'r', encoding='utf-8') as f:
                    html = f.read()

                # Parse HTML and add blocks to chapter
                blocks = parse_html_string(html, document=self.book)

                # Copy blocks to the chapter
                for block in blocks:
                    chapter.add_block(block)

            except Exception as e:
                print(f"Error parsing chapter {i+1}: {str(e)}")
                # Add an error message block
                from pyWebLayout.abstract.block import Paragraph
                from pyWebLayout.abstract.inline import Word
                from pyWebLayout.style import Font
                error_para = Paragraph()
                # Create a default font style for the error message
                default_font = Font()
                error_para.add_word(Word(f"Error loading chapter: {str(e)}", default_font))
                chapter.add_block(error_para)


def read_epub(epub_path: str) -> Book:
    """
    Convenience wrapper: read an EPUB file and return it as a Book.

    A new EPUBReader is created for the given path and its read() result is
    returned.

    Args:
        epub_path: Path to the EPUB file

    Returns:
        Book: The parsed book
    """
    return EPUBReader(epub_path).read()