138 lines
4.0 KiB
Python
138 lines
4.0 KiB
Python
"""
Document loading and metadata management.

This module handles EPUB and HTML loading, extracting blocks and metadata.
"""
|
|
|
|
from __future__ import annotations
|
|
from typing import List, Tuple, Dict, Any, Optional
|
|
from pathlib import Path
|
|
import os
|
|
|
|
from pyWebLayout.io.readers.epub_reader import read_epub
|
|
from pyWebLayout.io.readers.html_extraction import parse_html_string
|
|
from pyWebLayout.abstract.block import Block
|
|
|
|
|
|
class DocumentManager:
    """
    Handles document loading and metadata extraction.

    Responsibilities:
    - Load EPUB files
    - Load HTML content
    - Extract document metadata (title, author, etc.)
    - Extract content blocks for rendering

    Both loaders are all-or-nothing: the manager's state is only replaced
    once a document has been read successfully and produced at least one
    content block. A failed load leaves any previously loaded document intact.
    """

    def __init__(self) -> None:
        """Initialize the document manager with no document loaded."""
        self.document_id: Optional[str] = None
        self.title: Optional[str] = None
        self.author: Optional[str] = None
        self.blocks: Optional[List[Block]] = None

    def load_epub(self, epub_path: str) -> bool:
        """
        Load an EPUB file and extract content.

        The title falls back to "Unknown Title" and the author to
        "Unknown Author" when the book does not provide them. The document
        ID is the file name without its extension.

        Args:
            epub_path: Path to the EPUB file

        Returns:
            True if loaded successfully, False otherwise. On failure the
            error is printed and the manager's current state is unchanged.
        """
        try:
            # Validate path
            if not os.path.exists(epub_path):
                raise FileNotFoundError(f"EPUB file not found: {epub_path}")

            # Load the EPUB
            book = read_epub(epub_path)

            # Gather everything into locals first so that a failure part-way
            # through cannot leave the manager half-updated.
            title = book.get_title() or "Unknown Title"
            author = book.get_metadata('AUTHOR') or "Unknown Author"
            document_id = Path(epub_path).stem

            # Extract all blocks from chapters; chapters that expose no
            # `_blocks` attribute are skipped.
            blocks: List[Block] = []
            for chapter in book.chapters:
                if hasattr(chapter, '_blocks'):
                    blocks.extend(chapter._blocks)

            if not blocks:
                raise ValueError("No content blocks found in EPUB")

        except Exception as e:
            # Loading is best-effort: report the problem and signal failure
            # through the return value instead of propagating.
            print(f"Error loading EPUB: {e}")
            return False

        # Commit only after the whole document was read successfully.
        self.title = title
        self.author = author
        self.document_id = document_id
        self.blocks = blocks
        return True

    def load_html(self, html_string: str, title: str = "HTML Document",
                  author: str = "Unknown", document_id: str = "html_doc") -> bool:
        """
        Load HTML content directly.

        This is useful for rendering library screens, menus, or other
        HTML-based UI elements.

        Args:
            html_string: HTML content to render
            title: Document title (for metadata)
            author: Document author (for metadata)
            document_id: Unique identifier for this HTML document

        Returns:
            True if loaded successfully, False otherwise. On failure the
            error is printed and the manager's current state is unchanged.
        """
        try:
            # Parse HTML into blocks
            blocks = parse_html_string(html_string)

            if not blocks:
                raise ValueError("No content blocks parsed from HTML")

        except Exception as e:
            # Same best-effort contract as load_epub: report and return False.
            print(f"Error loading HTML: {e}")
            return False

        # Set metadata only once parsing produced content.
        self.title = title
        self.author = author
        self.document_id = document_id
        self.blocks = blocks
        return True

    def is_loaded(self) -> bool:
        """Check if a document with at least one block is currently loaded."""
        return self.blocks is not None and len(self.blocks) > 0

    def get_metadata(self) -> Dict[str, Any]:
        """
        Get document metadata.

        Returns:
            Dictionary with metadata (title, author, document_id,
            total_blocks). total_blocks is 0 when nothing is loaded.
        """
        return {
            'title': self.title,
            'author': self.author,
            'document_id': self.document_id,
            'total_blocks': len(self.blocks) if self.blocks else 0,
        }

    def get_blocks(self) -> Optional[List[Block]]:
        """Get the list of content blocks, or None if nothing is loaded."""
        return self.blocks

    def clear(self) -> None:
        """Clear the currently loaded document."""
        self.document_id = None
        self.title = None
        self.author = None
        self.blocks = None
|