# dreader-application/dreader/managers/document.py

"""
Document loading and metadata management.

This module handles EPUB and HTML loading, extracting blocks and metadata.
"""

from __future__ import annotations
from typing import List, Tuple, Dict, Any, Optional
from pathlib import Path
import os

from pyWebLayout.io.readers.epub_reader import read_epub
from pyWebLayout.io.readers.html_extraction import parse_html_string
from pyWebLayout.abstract.block import Block


class DocumentManager:
    """
    Handles document loading and metadata extraction.

    Responsibilities:
    - Load EPUB files
    - Load HTML content
    - Extract document metadata (title, author, etc.)
    - Extract content blocks for rendering

    Loading is all-or-nothing: a failed ``load_epub`` / ``load_html`` call
    returns False and leaves whatever document was previously loaded
    untouched.
    """

    def __init__(self):
        """Initialize the document manager with no document loaded."""
        self.document_id: Optional[str] = None
        self.title: Optional[str] = None
        self.author: Optional[str] = None
        self.blocks: Optional[List[Block]] = None

    def load_epub(self, epub_path: str) -> bool:
        """
        Load an EPUB file and extract content.

        The title, author, document ID (the file name without its suffix)
        and the blocks of every chapter are gathered first and only stored
        on the manager once the whole load has succeeded.

        Args:
            epub_path: Path to the EPUB file

        Returns:
            True if loaded successfully, False otherwise. On failure the
            error is printed and the manager's state is left unchanged.
        """
        try:
            # Validate path
            if not os.path.exists(epub_path):
                raise FileNotFoundError(f"EPUB file not found: {epub_path}")

            # Load the EPUB
            book = read_epub(epub_path)

            # Extract metadata into locals; nothing is committed to self yet.
            # NOTE(review): metadata is looked up with the plain string
            # 'AUTHOR' - confirm the reader keys metadata by string, otherwise
            # this always falls back to "Unknown Author".
            title = book.get_title() or "Unknown Title"
            author = book.get_metadata('AUTHOR') or "Unknown Author"

            # Create document ID from filename
            document_id = Path(epub_path).stem

            # Extract all blocks from chapters; chapters without a
            # ``_blocks`` attribute are skipped.
            blocks: List[Block] = []
            for chapter in book.chapters:
                if hasattr(chapter, '_blocks'):
                    blocks.extend(chapter._blocks)

            if not blocks:
                raise ValueError("No content blocks found in EPUB")

        except Exception as e:
            # Best-effort API: report the problem and signal failure via the
            # return value instead of propagating.
            print(f"Error loading EPUB: {e}")
            return False

        # Commit only after everything succeeded, so a failed load can never
        # leave the manager half-updated or wipe the previous document.
        self.title = title
        self.author = author
        self.document_id = document_id
        self.blocks = blocks
        return True

    def load_html(self, html_string: str, title: str = "HTML Document",
                  author: str = "Unknown", document_id: str = "html_doc") -> bool:
        """
        Load HTML content directly.

        This is useful for rendering library screens, menus, or other HTML-based UI elements.

        Args:
            html_string: HTML content to render
            title: Document title (for metadata)
            author: Document author (for metadata)
            document_id: Unique identifier for this HTML document

        Returns:
            True if loaded successfully, False otherwise. On failure the
            error is printed and the manager's state is left unchanged.
        """
        try:
            # Parse HTML into blocks
            blocks = parse_html_string(html_string)

            if not blocks:
                raise ValueError("No content blocks parsed from HTML")

        except Exception as e:
            print(f"Error loading HTML: {e}")
            return False

        # Commit metadata and content together once parsing has succeeded.
        self.title = title
        self.author = author
        self.document_id = document_id
        self.blocks = blocks
        return True

    def is_loaded(self) -> bool:
        """Check if a document is currently loaded (has at least one block)."""
        return bool(self.blocks)

    def get_metadata(self) -> Dict[str, Any]:
        """
        Get document metadata.

        Returns:
            Dictionary with metadata (title, author, document_id, total_blocks).
            ``total_blocks`` is 0 when no document is loaded.
        """
        return {
            'title': self.title,
            'author': self.author,
            'document_id': self.document_id,
            'total_blocks': len(self.blocks) if self.blocks else 0
        }

    def get_blocks(self) -> Optional[List["Block"]]:
        """Get the list of content blocks, or None if nothing is loaded."""
        return self.blocks

    def clear(self):
        """Clear the currently loaded document and its metadata."""
        self.document_id = None
        self.title = None
        self.author = None
        self.blocks = None