Duncan Tourolle dd392d2e15
Some checks failed
Python CI / test (3.12) (push) Successful in 6m50s
Python CI / test (3.13) (push) Has been cancelled
improved library screen, fixed issues with image rendering and navigation
2025-11-10 14:13:59 +01:00

141 lines
4.1 KiB
Python

"""
Document loading and metadata management.
This module handles EPUB and HTML loading, extracting blocks and metadata.
"""
from __future__ import annotations
from typing import List, Tuple, Dict, Any, Optional
from pathlib import Path
import os
from pyWebLayout.io.readers.epub_reader import read_epub
from pyWebLayout.io.readers.html_extraction import parse_html_string
from pyWebLayout.abstract.block import Block
class DocumentManager:
    """
    Handles document loading and metadata extraction.

    Responsibilities:
    - Load EPUB files
    - Load HTML content
    - Extract document metadata (title, author, etc.)
    - Extract content blocks for rendering

    Both loaders are all-or-nothing: the manager's state is replaced only
    after a document has been read and validated completely. A failed load
    returns False and leaves any previously loaded document untouched.
    """

    def __init__(self):
        """Initialize the document manager with no document loaded."""
        self.document_id: Optional[str] = None
        self.title: Optional[str] = None
        self.author: Optional[str] = None
        self.blocks: Optional[List[Block]] = None

    def load_epub(self, epub_path: str) -> bool:
        """
        Load an EPUB file and extract its metadata and content blocks.

        The title and author fall back to "Unknown Title" / "Unknown Author"
        when the book reports a falsy value. The document ID is the file
        name without its extension.

        Args:
            epub_path: Path to the EPUB file

        Returns:
            True if loaded successfully, False otherwise. On failure the
            error and its traceback are printed and the manager keeps
            whatever state it had before the call.
        """
        try:
            # Validate path
            if not os.path.exists(epub_path):
                raise FileNotFoundError(f"EPUB file not found: {epub_path}")

            # Load the EPUB
            book = read_epub(epub_path)

            # Collect everything into locals first so that a failure
            # part-way through cannot leave half-updated state on self.
            title = book.get_title() or "Unknown Title"
            author = book.get_author() or "Unknown Author"
            document_id = Path(epub_path).stem

            # Gather blocks from every chapter; chapters without a
            # `_blocks` attribute are skipped.
            blocks: List[Block] = []
            for chapter in book.chapters:
                blocks.extend(getattr(chapter, '_blocks', ()))

            if not blocks:
                raise ValueError("No content blocks found in EPUB")
        except Exception as e:
            # Boundary handler: callers expect a bool, so report and bail.
            print(f"Error loading EPUB: {e}")
            import traceback
            print("Full traceback:")
            traceback.print_exc()
            return False

        # Commit only after the whole document has been validated.
        self.title = title
        self.author = author
        self.document_id = document_id
        self.blocks = blocks
        return True

    def load_html(self, html_string: str, title: str = "HTML Document",
                  author: str = "Unknown", document_id: str = "html_doc") -> bool:
        """
        Load HTML content directly.

        This is useful for rendering library screens, menus, or other
        HTML-based UI elements.

        Args:
            html_string: HTML content to render
            title: Document title (for metadata)
            author: Document author (for metadata)
            document_id: Unique identifier for this HTML document

        Returns:
            True if loaded successfully, False otherwise. On failure the
            error is printed and the manager keeps whatever state it had
            before the call.
        """
        try:
            # Parse HTML into blocks
            blocks = parse_html_string(html_string)
            if not blocks:
                raise ValueError("No content blocks parsed from HTML")
        except Exception as e:
            print(f"Error loading HTML: {e}")
            return False

        # Commit metadata and content together once parsing succeeded.
        self.title = title
        self.author = author
        self.document_id = document_id
        self.blocks = blocks
        return True

    def is_loaded(self) -> bool:
        """Check if a document is currently loaded (has at least one block)."""
        # None and an empty list both mean "nothing to render".
        return bool(self.blocks)

    def get_metadata(self) -> Dict[str, Any]:
        """
        Get document metadata.

        Returns:
            Dictionary with metadata (title, author, document_id,
            total_blocks). total_blocks is 0 when no document is loaded.
        """
        return {
            'title': self.title,
            'author': self.author,
            'document_id': self.document_id,
            'total_blocks': len(self.blocks) if self.blocks else 0
        }

    def get_blocks(self) -> Optional[List[Block]]:
        """Get the list of content blocks, or None if nothing is loaded."""
        return self.blocks

    def clear(self):
        """Clear the currently loaded document and reset all metadata."""
        self.document_id = None
        self.title = None
        self.author = None
        self.blocks = None