353 lines
13 KiB
Python
353 lines
13 KiB
Python
"""
|
|
EPUB metadata reader for pyWebLayout.
|
|
|
|
This module provides specialized functionality for extracting metadata
|
|
from EPUB documents, following the decomposed architecture pattern.
|
|
"""
|
|
|
|
import logging
import os
import shutil
import tempfile
import xml.etree.ElementTree as ET
import zipfile
from typing import Dict, Any, Optional, List

from pyWebLayout.abstract.document import Document, MetadataType
from pyWebLayout.io.readers.base import MetadataReader
|
|
|
|
|
|
# XML namespace URIs referenced while reading an EPUB package document,
# keyed by the conventional prefix used for each of them.
NAMESPACES = {
    # Open Packaging Format: <package>, <metadata>, <meta>, opf:* attributes
    'opf': 'http://www.idpf.org/2007/opf',
    # Dublin Core elements: <dc:title>, <dc:creator>, ...
    'dc': 'http://purl.org/dc/elements/1.1/',
    # Dublin Core terms
    'dcterms': 'http://purl.org/dc/terms/',
}
|
|
|
|
|
|
class EPUBMetadataReader(MetadataReader):
    """
    Specialized reader for extracting metadata from EPUB documents.

    The reader unpacks the EPUB archive into a temporary directory, locates
    the package document (via ``META-INF/container.xml`` or, failing that,
    by searching for a ``.opf`` file), parses its ``<metadata>`` element and
    copies the result onto a :class:`Document`.

    Handled metadata:
        * Dublin Core elements (``dc:title``, ``dc:creator``, ...), including
          the ``opf:role`` / ``opf:file-as`` / ``opf:scheme`` / ``opf:event``
          attributes and the ``id`` attribute of identifiers.
        * ``<meta name="..." content="...">`` elements (stored as
          ``meta_<name>``).
        * Children of ``<x-metadata>`` elements (stored as
          ``x_meta_<tag>``).

    Instances keep the result of the most recent extraction, so the
    ``get_*`` accessors are meaningful after ``extract_metadata`` returns.
    """

    # Namespace used by META-INF/container.xml
    _CONTAINER_NS = 'urn:oasis:names:tc:opendocument:xmlns:container'

    # Dublin Core element names to look for; each is stored in the metadata
    # dictionary under its own name, in this order.
    _DC_ELEMENTS = (
        'title',
        'creator',
        'subject',
        'description',
        'publisher',
        'contributor',
        'date',
        'type',
        'format',
        'identifier',
        'source',
        'language',
        'relation',
        'coverage',
        'rights',
    )

    def __init__(self):
        """Initialize the EPUB metadata reader."""
        self._metadata: Dict[str, Any] = {}
        self._temp_dir: Optional[str] = None
        self._package_path: Optional[str] = None
        # One dictionary per non-empty <dc:creator>, in document order
        self._creators: List[Dict[str, str]] = []
        # Text of the <dc:identifier> named by the package's
        # "unique-identifier" attribute, when it could be resolved
        self._unique_identifier: Optional[str] = None

    def extract_metadata(self, epub_path: str, document: Document) -> Dict[str, Any]:
        """
        Extract metadata from EPUB file.

        Errors raised while opening or unpacking the archive are not caught
        here and propagate to the caller; the temporary directory is removed
        in every case.

        Args:
            epub_path: Path to the EPUB file
            document: The document to populate with metadata

        Returns:
            Dictionary of extracted metadata
        """
        # Reset internal state
        self._reset()

        try:
            # Extract EPUB to temporary directory
            self._extract_epub(epub_path)

            # Find and parse package document
            self._find_package_document()
            if self._package_path:
                self._parse_package_metadata()

            # Populate document with extracted metadata
            self._populate_document(document)

            return self._metadata
        finally:
            # Clean up temporary files
            self._cleanup()

    def _reset(self):
        """Reset internal state for a new extraction."""
        self._metadata = {}
        self._temp_dir = None
        self._package_path = None
        self._creators = []
        self._unique_identifier = None

    def _extract_epub(self, epub_path: str):
        """
        Extract EPUB file to temporary directory.

        ``self._temp_dir`` is assigned before the archive is opened so that
        ``_cleanup`` can remove the directory even when extraction fails.

        Args:
            epub_path: Path to the EPUB file
        """
        self._temp_dir = tempfile.mkdtemp()

        with zipfile.ZipFile(epub_path, 'r') as zip_ref:
            zip_ref.extractall(self._temp_dir)

    def _is_within_temp_dir(self, path: str) -> bool:
        """
        Tell whether ``path`` resolves to a location inside the temp directory.

        Args:
            path: Candidate path built from data found in the EPUB

        Returns:
            True when the resolved path lies inside ``self._temp_dir``
        """
        base = os.path.realpath(self._temp_dir)
        target = os.path.realpath(path)
        try:
            return os.path.commonpath([base, target]) == base
        except ValueError:
            # commonpath refuses paths it cannot compare (e.g. other drive)
            return False

    def _find_package_document(self):
        """
        Find the package document (content.opf) in the extracted EPUB.

        Sets ``self._package_path`` to the located file, or leaves it as
        None when no package document can be found.
        """
        self._package_path = None
        if not self._temp_dir:
            return

        # First, try to find it via META-INF/container.xml
        container_path = os.path.join(self._temp_dir, 'META-INF', 'container.xml')
        if os.path.exists(container_path):
            try:
                container_root = ET.parse(container_path).getroot()
            except ET.ParseError:
                # Malformed container.xml: fall through to the file search
                container_root = None

            if container_root is not None:
                rootfile_tag = './/{{{0}}}rootfile'.format(self._CONTAINER_NS)
                for rootfile in container_root.findall(rootfile_tag):
                    full_path = rootfile.get('full-path')
                    if not full_path:
                        continue
                    candidate = os.path.join(self._temp_dir, full_path)
                    # full-path comes from the archive itself; refuse values
                    # ('../..', absolute paths) that point outside the
                    # extraction directory.
                    if self._is_within_temp_dir(candidate) and os.path.isfile(candidate):
                        self._package_path = candidate
                        return

        # Fallback: search for .opf files (sorted so the choice does not
        # depend on directory listing order)
        for dirpath, dirnames, filenames in os.walk(self._temp_dir):
            dirnames.sort()
            for filename in sorted(filenames):
                if filename.lower().endswith('.opf'):
                    self._package_path = os.path.join(dirpath, filename)
                    return

    def _parse_package_metadata(self):
        """
        Parse metadata from the package document.

        A package document that is not well-formed XML is logged as a
        warning and otherwise ignored, leaving the metadata empty.
        """
        if not self._package_path or not os.path.exists(self._package_path):
            return

        try:
            package_root = ET.parse(self._package_path).getroot()
        except ET.ParseError as e:
            logging.getLogger(__name__).warning(
                "Error parsing package document: %s", e
            )
            return

        # Find metadata element
        metadata_elem = package_root.find(
            './/{{{0}}}metadata'.format(NAMESPACES['opf'])
        )
        if metadata_elem is None:
            return

        # Parse Dublin Core metadata
        self._parse_dublin_core(metadata_elem)

        # Parse OPF-specific metadata
        self._parse_opf_metadata(metadata_elem)

        # Resolve the identifier the package declares as its unique one
        self._unique_identifier = self._resolve_unique_identifier(
            metadata_elem, package_root.get('unique-identifier')
        )

    def _resolve_unique_identifier(self, metadata_elem: ET.Element,
                                   unique_ref: Optional[str]) -> Optional[str]:
        """
        Look up the identifier referenced by the package's unique-identifier.

        Args:
            metadata_elem: The metadata XML element
            unique_ref: Value of the package ``unique-identifier`` attribute

        Returns:
            Stripped text of the ``dc:identifier`` whose ``id`` equals
            ``unique_ref``, or None when there is no such element or it is
            empty
        """
        if not unique_ref:
            return None

        identifier_tag = './/{{{0}}}identifier'.format(NAMESPACES['dc'])
        for elem in metadata_elem.findall(identifier_tag):
            if elem.get('id') == unique_ref:
                return (elem.text or '').strip() or None
        return None

    @staticmethod
    def _describe_creator(elem: ET.Element) -> Optional[Dict[str, str]]:
        """
        Build the creator dictionary for one ``dc:creator`` element.

        Args:
            elem: A ``dc:creator`` XML element

        Returns:
            Dictionary with ``name`` and, when present, ``role`` and
            ``file_as``; None when the element has no text
        """
        name = (elem.text or '').strip()
        if not name:
            return None

        info = {'name': name}
        role = elem.get('{{{0}}}role'.format(NAMESPACES['opf']))
        if role:
            info['role'] = role
        file_as = elem.get('{{{0}}}file-as'.format(NAMESPACES['opf']))
        if file_as:
            info['file_as'] = file_as
        return info

    def _record_element_attributes(self, dc_name: str, elem: ET.Element):
        """
        Store the special attributes of a lone Dublin Core element.

        Keys written (only for non-empty attribute values):
        ``creator_role``, ``creator_file_as``, ``identifier_scheme``,
        ``identifier_id`` and ``date_event``.

        Args:
            dc_name: Local name of the Dublin Core element
            elem: The element whose attributes are inspected
        """
        def opf_attr(name: str) -> Optional[str]:
            return elem.get('{{{0}}}{1}'.format(NAMESPACES['opf'], name))

        if dc_name == 'creator':
            # Role (e.g. author, editor) and sortable form of the name
            extras = {'role': opf_attr('role'), 'file_as': opf_attr('file-as')}
        elif dc_name == 'identifier':
            # Scheme (ISBN, DOI, etc.) and the element id
            extras = {'scheme': opf_attr('scheme'), 'id': elem.get('id')}
        elif dc_name == 'date':
            # Event type
            extras = {'event': opf_attr('event')}
        else:
            return

        for suffix, value in extras.items():
            if value:
                self._metadata[f'{dc_name}_{suffix}'] = value

    def _parse_dublin_core(self, metadata_elem: ET.Element):
        """
        Parse Dublin Core metadata elements.

        An element that occurs once is stored as a string; an element that
        occurs several times is stored as a list of its non-empty values.
        Text is stripped and empty values are skipped.

        Args:
            metadata_elem: The metadata XML element
        """
        for dc_name in self._DC_ELEMENTS:
            elements = metadata_elem.findall(
                './/{{{0}}}{1}'.format(NAMESPACES['dc'], dc_name)
            )
            if not elements:
                continue

            if dc_name == 'creator':
                # Keep per-creator attributes, which the flat metadata
                # dictionary cannot represent for several creators
                described = (self._describe_creator(elem) for elem in elements)
                self._creators = [info for info in described if info]

            if len(elements) == 1:
                # Single element
                elem = elements[0]
                text = (elem.text or '').strip()
                if text:
                    self._metadata[dc_name] = text

                # Handle special attributes
                self._record_element_attributes(dc_name, elem)
            else:
                # Multiple elements - store as list
                stripped = ((elem.text or '').strip() for elem in elements)
                values = [text for text in stripped if text]
                if values:
                    self._metadata[dc_name] = values

    def _parse_opf_metadata(self, metadata_elem: ET.Element):
        """
        Parse OPF-specific metadata elements.

        Args:
            metadata_elem: The metadata XML element
        """
        # Parse meta elements
        meta_tag = './/{{{0}}}meta'.format(NAMESPACES['opf'])
        for meta in metadata_elem.findall(meta_tag):
            name = meta.get('name')
            content = meta.get('content')
            if name and content:
                self._metadata[f'meta_{name}'] = content

        # Parse x-metadata elements (custom metadata)
        x_meta_tag = './/{{{0}}}x-metadata'.format(NAMESPACES['opf'])
        for x_meta in metadata_elem.findall(x_meta_tag):
            for child in x_meta:
                if child.tag and child.text:
                    # Remove namespace prefix for cleaner key names
                    tag_name = child.tag.split('}')[-1] if '}' in child.tag else child.tag
                    self._metadata[f'x_meta_{tag_name}'] = child.text.strip()

    def _populate_document(self, document: Document):
        """
        Populate the document with extracted metadata.

        Args:
            document: The document to populate
        """
        # Map EPUB metadata to document metadata types
        metadata_mapping = {
            'title': MetadataType.TITLE,
            'creator': MetadataType.AUTHOR,
            'description': MetadataType.DESCRIPTION,
            'subject': MetadataType.KEYWORDS,
            'language': MetadataType.LANGUAGE,
            'date': MetadataType.PUBLICATION_DATE,
            'publisher': MetadataType.PUBLISHER,
            'identifier': MetadataType.IDENTIFIER,
        }

        for epub_key, doc_type in metadata_mapping.items():
            if epub_key not in self._metadata:
                continue

            value = self._metadata[epub_key]

            # Handle list values (like multiple subjects)
            if isinstance(value, list):
                if epub_key == 'subject':
                    # Join subjects with commas for keywords
                    value = ', '.join(value)
                else:
                    # For other list values, use the first one
                    value = value[0]

            document.set_metadata(doc_type, value)

        # Handle cover image
        cover_meta = self._metadata.get('meta_cover')
        if cover_meta:
            document.set_metadata(MetadataType.COVER_IMAGE, cover_meta)

        # Store original EPUB metadata for reference
        document.set_metadata(MetadataType.CUSTOM, {
            'epub_metadata': self._metadata
        })

    def _cleanup(self):
        """Clean up temporary files."""
        if self._temp_dir:
            # Best effort: ignore_errors keeps removal failures from raising
            shutil.rmtree(self._temp_dir, ignore_errors=True)
            self._temp_dir = None

    def get_unique_identifier(self) -> Optional[str]:
        """
        Get the unique identifier from the EPUB metadata.

        Returns:
            The identifier referenced by the package's ``unique-identifier``
            attribute when it was resolved; otherwise the first extracted
            identifier, or None if not found
        """
        if self._unique_identifier:
            return self._unique_identifier

        # Fallback to any identifier
        identifier = self._metadata.get('identifier')
        if isinstance(identifier, list):
            return identifier[0] if identifier else None
        return identifier

    def get_cover_id(self) -> Optional[str]:
        """
        Get the cover image ID from metadata.

        Returns:
            The cover image ID, or None if not found
        """
        return self._metadata.get('meta_cover')

    def get_creators(self) -> List[Dict[str, str]]:
        """
        Get creator information with roles.

        Returns:
            List of creator dictionaries with name, role, and file-as info
        """
        if self._creators:
            # Hand out copies so callers cannot alter the reader's state
            return [dict(info) for info in self._creators]

        # No per-element details recorded: derive from the flat metadata
        creator_value = self._metadata.get('creator')
        if not creator_value:
            return []

        if isinstance(creator_value, list):
            return [{'name': creator} for creator in creator_value]

        # Single creator
        creator_info = {'name': creator_value}

        # Add role if available
        role = self._metadata.get('creator_role')
        if role:
            creator_info['role'] = role

        # Add file-as if available
        file_as = self._metadata.get('creator_file_as')
        if file_as:
            creator_info['file_as'] = file_as

        return [creator_info]
|