""" Document loading and metadata management. This module handles EPUB and HTML loading, extracting blocks and metadata. """ from __future__ import annotations from typing import List, Tuple, Dict, Any, Optional from pathlib import Path import os from pyWebLayout.io.readers.epub_reader import read_epub from pyWebLayout.io.readers.html_extraction import parse_html_string from pyWebLayout.abstract.block import Block class DocumentManager: """ Handles document loading and metadata extraction. Responsibilities: - Load EPUB files - Load HTML content - Extract document metadata (title, author, etc.) - Extract content blocks for rendering """ def __init__(self): """Initialize the document manager.""" self.document_id: Optional[str] = None self.title: Optional[str] = None self.author: Optional[str] = None self.blocks: Optional[List[Block]] = None def load_epub(self, epub_path: str) -> bool: """ Load an EPUB file and extract content. Args: epub_path: Path to the EPUB file Returns: True if loaded successfully, False otherwise """ try: # Validate path if not os.path.exists(epub_path): raise FileNotFoundError(f"EPUB file not found: {epub_path}") # Load the EPUB book = read_epub(epub_path) # Extract metadata self.title = book.get_title() or "Unknown Title" self.author = book.get_metadata('AUTHOR') or "Unknown Author" # Create document ID from filename self.document_id = Path(epub_path).stem # Extract all blocks from chapters self.blocks = [] for chapter in book.chapters: if hasattr(chapter, '_blocks'): self.blocks.extend(chapter._blocks) if not self.blocks: raise ValueError("No content blocks found in EPUB") return True except Exception as e: print(f"Error loading EPUB: {e}") import traceback print(f"Full traceback:") traceback.print_exc() return False def load_html(self, html_string: str, title: str = "HTML Document", author: str = "Unknown", document_id: str = "html_doc") -> bool: """ Load HTML content directly. This is useful for rendering library screens, menus, or other HTML-based UI elements. Args: html_string: HTML content to render title: Document title (for metadata) author: Document author (for metadata) document_id: Unique identifier for this HTML document Returns: True if loaded successfully, False otherwise """ try: # Parse HTML into blocks blocks = parse_html_string(html_string) if not blocks: raise ValueError("No content blocks parsed from HTML") # Set metadata self.title = title self.author = author self.document_id = document_id self.blocks = blocks return True except Exception as e: print(f"Error loading HTML: {e}") return False def is_loaded(self) -> bool: """Check if a document is currently loaded.""" return self.blocks is not None and len(self.blocks) > 0 def get_metadata(self) -> Dict[str, Any]: """ Get document metadata. Returns: Dictionary with metadata (title, author, document_id, total_blocks) """ return { 'title': self.title, 'author': self.author, 'document_id': self.document_id, 'total_blocks': len(self.blocks) if self.blocks else 0 } def get_blocks(self) -> Optional[List[Block]]: """Get the list of content blocks.""" return self.blocks def clear(self): """Clear the currently loaded document.""" self.document_id = None self.title = None self.author = None self.blocks = None