# Source: pyWebLayout/pyWebLayout/io/readers/epub_metadata.py (353 lines, 13 KiB, Python)

"""
EPUB metadata reader for pyWebLayout.
This module provides specialized functionality for extracting metadata
from EPUB documents, following the decomposed architecture pattern.
"""
import logging
import os
import shutil
import tempfile
import xml.etree.ElementTree as ET
import zipfile
from typing import Dict, Any, Optional, List

from pyWebLayout.abstract.document import Document, MetadataType
from pyWebLayout.io.readers.base import MetadataReader
# Namespace URIs, keyed by their conventional prefix, used to build
# fully-qualified tag and attribute names when querying EPUB package XML.
NAMESPACES = {
    # OPF package document elements (<package>, <metadata>, <meta>, ...)
    'opf': 'http://www.idpf.org/2007/opf',
    # Dublin Core elements (dc:title, dc:creator, ...)
    'dc': 'http://purl.org/dc/elements/1.1/',
    # Dublin Core terms
    'dcterms': 'http://purl.org/dc/terms/',
}
class EPUBMetadataReader(MetadataReader):
    """
    Specialized reader for extracting metadata from EPUB documents.

    This class handles EPUB package document metadata including
    Dublin Core elements and custom metadata.

    A call to extract_metadata() unpacks the archive into a temporary
    directory, locates the package (.opf) document, parses its <metadata>
    section, copies the recognised fields onto the supplied Document and
    removes the temporary directory again. The parsed values stay available
    through the get_* accessors until the next extraction.

    NOTE: XML is parsed with xml.etree.ElementTree, which the Python
    documentation warns is not hardened against maliciously constructed
    documents.
    """

    # Namespace of META-INF/container.xml, which names the package document.
    _CONTAINER_NS = 'urn:oasis:names:tc:opendocument:xmlns:container'

    # Dublin Core elements collected from the package metadata. Each one is
    # stored in the result dictionary under its own element name.
    _DC_ELEMENTS = (
        'title',
        'creator',
        'subject',
        'description',
        'publisher',
        'contributor',
        'date',
        'type',
        'format',
        'identifier',
        'source',
        'language',
        'relation',
        'coverage',
        'rights',
    )

    # opf:-namespaced attributes recorded when an element occurs exactly
    # once, as (attribute name, metadata key suffix) pairs.
    _DC_ATTRIBUTES = {
        'creator': (('role', 'role'), ('file-as', 'file_as')),
        'identifier': (('scheme', 'scheme'),),
        'date': (('event', 'event'),),
    }

    def __init__(self):
        """Initialize the EPUB metadata reader."""
        self._metadata = {}
        self._temp_dir = None
        self._package_path = None
        # Kept outside ``_metadata`` so the returned dictionary only ever
        # holds strings and lists of strings.
        self._creators = []
        self._unique_identifier = None

    def extract_metadata(self, epub_path: str, document: Document) -> Dict[str, Any]:
        """
        Extract metadata from EPUB file.

        Args:
            epub_path: Path to the EPUB file
            document: The document to populate with metadata

        Returns:
            Dictionary of extracted metadata
        """
        # Reset internal state
        self._reset()
        try:
            # Extract EPUB to temporary directory
            self._extract_epub(epub_path)
            # Find and parse package document
            self._find_package_document()
            if self._package_path:
                self._parse_package_metadata()
            # Populate document with extracted metadata
            self._populate_document(document)
            return self._metadata
        finally:
            # Clean up temporary files, also when extraction or parsing raised
            self._cleanup()

    def _reset(self) -> None:
        """Reset internal state for a new extraction."""
        # Rebind rather than clear: a dictionary returned by an earlier
        # extraction must not be emptied behind the caller's back.
        self._metadata = {}
        self._temp_dir = None
        self._package_path = None
        self._creators = []
        self._unique_identifier = None

    def _extract_epub(self, epub_path: str) -> None:
        """
        Extract EPUB file to temporary directory.

        Args:
            epub_path: Path to the EPUB file
        """
        # The directory is recorded before the archive is opened so that
        # _cleanup() removes it even if opening or extracting fails.
        self._temp_dir = tempfile.mkdtemp()
        with zipfile.ZipFile(epub_path, 'r') as zip_ref:
            zip_ref.extractall(self._temp_dir)

    def _resolve_in_temp_dir(self, relative_path: Optional[str]) -> Optional[str]:
        """
        Resolve an archive-relative path inside the extraction directory.

        Args:
            relative_path: Path as written in the EPUB (may be None or empty)

        Returns:
            The resolved path, or None when the input is empty or would
            resolve to a location outside the extraction directory
        """
        if not relative_path:
            return None
        base = os.path.realpath(self._temp_dir)
        candidate = os.path.realpath(os.path.join(base, relative_path))
        try:
            inside = os.path.commonpath([base, candidate]) == base
        except ValueError:
            # commonpath() raises when the paths cannot be compared, e.g.
            # when they live on different drives.
            inside = False
        return candidate if inside else None

    def _find_package_document(self) -> None:
        """Find the package document (content.opf) in the extracted EPUB."""
        self._package_path = None
        if not self._temp_dir:
            return

        # First, try to find it via META-INF/container.xml
        container_path = os.path.join(self._temp_dir, 'META-INF', 'container.xml')
        if os.path.exists(container_path):
            try:
                container_root = ET.parse(container_path).getroot()
            except ET.ParseError:
                # Malformed container: fall through to the .opf search
                container_root = None
            if container_root is not None:
                rootfile_tag = './/{{{0}}}rootfile'.format(self._CONTAINER_NS)
                for rootfile in container_root.findall(rootfile_tag):
                    # full-path comes from the archive, so it is only
                    # accepted if it stays inside the extraction directory.
                    candidate = self._resolve_in_temp_dir(rootfile.get('full-path'))
                    if candidate and os.path.isfile(candidate):
                        self._package_path = candidate
                        return

        # Fallback: search for .opf files. Names are sorted so the choice
        # does not depend on directory listing order.
        for dirpath, dirnames, filenames in os.walk(self._temp_dir):
            dirnames.sort()
            for filename in sorted(filenames):
                if filename.lower().endswith('.opf'):
                    self._package_path = os.path.join(dirpath, filename)
                    return

    def _parse_package_metadata(self) -> None:
        """Parse metadata from the package document."""
        if not self._package_path or not os.path.isfile(self._package_path):
            return
        try:
            root = ET.parse(self._package_path).getroot()
        except ET.ParseError as exc:
            # Best effort: an unreadable package document yields no
            # metadata instead of aborting the extraction.
            logging.getLogger(__name__).warning(
                "Error parsing package document %s: %s", self._package_path, exc
            )
            return

        # Find metadata element
        metadata_elem = root.find('.//{{{0}}}metadata'.format(NAMESPACES['opf']))
        if metadata_elem is None:
            return

        # Parse Dublin Core metadata
        self._parse_dublin_core(metadata_elem)
        # Parse OPF-specific metadata
        self._parse_opf_metadata(metadata_elem)
        # Resolve the identifier named by the root's unique-identifier attribute
        self._unique_identifier = self._resolve_unique_identifier(root, metadata_elem)

    @staticmethod
    def _element_text(elem: ET.Element) -> Optional[str]:
        """Return the element's text with whitespace stripped, or None if empty."""
        text = (elem.text or '').strip()
        return text or None

    @staticmethod
    def _opf_attribute(elem: ET.Element, name: str) -> Optional[str]:
        """Return the value of the opf:-namespaced attribute *name*, if present."""
        return elem.get('{{{0}}}{1}'.format(NAMESPACES['opf'], name))

    def _resolve_unique_identifier(
        self, root: ET.Element, metadata_elem: ET.Element
    ) -> Optional[str]:
        """
        Look up the identifier referenced by the root's unique-identifier attribute.

        Args:
            root: Root element of the package document
            metadata_elem: The metadata XML element

        Returns:
            Text of the dc:identifier whose id matches, or None if the
            attribute is missing or no identifier with text matches
        """
        wanted_id = root.get('unique-identifier')
        if not wanted_id:
            return None
        identifier_tag = './/{{{0}}}identifier'.format(NAMESPACES['dc'])
        for elem in metadata_elem.findall(identifier_tag):
            if elem.get('id') == wanted_id:
                return self._element_text(elem)
        return None

    def _collect_creators(self, elements: List[ET.Element]) -> List[Dict[str, str]]:
        """
        Build one dictionary per dc:creator element that has text.

        Args:
            elements: The dc:creator elements in document order

        Returns:
            List of dictionaries with 'name' and, when the corresponding
            opf: attribute is set, 'role' and 'file_as'
        """
        creators = []
        for elem in elements:
            name = self._element_text(elem)
            if not name:
                continue
            info = {'name': name}
            role = self._opf_attribute(elem, 'role')
            if role:
                info['role'] = role
            file_as = self._opf_attribute(elem, 'file-as')
            if file_as:
                info['file_as'] = file_as
            creators.append(info)
        return creators

    def _parse_dublin_core(self, metadata_elem: ET.Element) -> None:
        """
        Parse Dublin Core metadata elements.

        An element that occurs once is stored as a string, together with
        its opf: attributes under '<name>_<attribute>' keys. An element
        that occurs several times is stored as a list of strings.

        Args:
            metadata_elem: The metadata XML element
        """
        for dc_name in self._DC_ELEMENTS:
            elements = metadata_elem.findall(
                './/{{{0}}}{1}'.format(NAMESPACES['dc'], dc_name)
            )
            if not elements:
                continue

            if dc_name == 'creator':
                # Keep per-creator attributes for get_creators(); the flat
                # dictionary below cannot hold them for several creators.
                self._creators = self._collect_creators(elements)

            if len(elements) > 1:
                # Multiple elements - store as list, skipping empty ones
                values = [text for text in map(self._element_text, elements) if text]
                if values:
                    self._metadata[dc_name] = values
                continue

            # Single element
            elem = elements[0]
            text = self._element_text(elem)
            if text:
                self._metadata[dc_name] = text

            # Handle special attributes (role/file-as, scheme, event)
            for attr_name, suffix in self._DC_ATTRIBUTES.get(dc_name, ()):
                attr_value = self._opf_attribute(elem, attr_name)
                if attr_value:
                    self._metadata[f'{dc_name}_{suffix}'] = attr_value

            if dc_name == 'identifier':
                # The plain (non-namespaced) id is what the package's
                # unique-identifier attribute refers to.
                id_attr = elem.get('id')
                if id_attr:
                    self._metadata[f'{dc_name}_id'] = id_attr

    def _parse_opf_metadata(self, metadata_elem: ET.Element) -> None:
        """
        Parse OPF-specific metadata elements.

        Args:
            metadata_elem: The metadata XML element
        """
        meta_elements = metadata_elem.findall('.//{{{0}}}meta'.format(NAMESPACES['opf']))

        # <meta name="..." content="..."/> entries
        for meta in meta_elements:
            name = meta.get('name')
            content = meta.get('content')
            if name and content:
                self._metadata[f'meta_{name}'] = content

        # <meta property="...">value</meta> entries. Entries carrying
        # 'refines' describe another element rather than the publication
        # and are skipped. setdefault() keeps name/content keys untouched.
        for meta in meta_elements:
            prop = meta.get('property')
            if not prop or meta.get('refines'):
                continue
            value = self._element_text(meta)
            if value:
                self._metadata.setdefault(f'meta_{prop}', value)

        # Parse x-metadata elements (custom metadata)
        x_meta_elements = metadata_elem.findall(
            './/{{{0}}}x-metadata'.format(NAMESPACES['opf'])
        )
        for x_meta in x_meta_elements:
            for child in x_meta:
                if not child.tag:
                    continue
                text = self._element_text(child)
                if not text:
                    continue
                # Remove namespace prefix for cleaner key names
                tag_name = child.tag.split('}')[-1] if '}' in child.tag else child.tag
                self._metadata[f'x_meta_{tag_name}'] = text

    def _populate_document(self, document: Document) -> None:
        """
        Populate the document with extracted metadata.

        Args:
            document: The document to populate
        """
        # Map EPUB metadata to document metadata types
        metadata_mapping = {
            'title': MetadataType.TITLE,
            'creator': MetadataType.AUTHOR,
            'description': MetadataType.DESCRIPTION,
            'subject': MetadataType.KEYWORDS,
            'language': MetadataType.LANGUAGE,
            'date': MetadataType.PUBLICATION_DATE,
            'publisher': MetadataType.PUBLISHER,
            'identifier': MetadataType.IDENTIFIER,
        }
        for epub_key, doc_type in metadata_mapping.items():
            if epub_key not in self._metadata:
                continue
            value = self._metadata[epub_key]
            # Handle list values (like multiple subjects)
            if isinstance(value, list):
                if not value:
                    continue
                if epub_key == 'subject':
                    # Join subjects with commas for keywords
                    value = ', '.join(value)
                else:
                    # For other list values, use the first one
                    value = value[0]
            document.set_metadata(doc_type, value)

        # Handle cover image
        cover_meta = self._metadata.get('meta_cover')
        if cover_meta:
            document.set_metadata(MetadataType.COVER_IMAGE, cover_meta)

        # Store original EPUB metadata for reference
        document.set_metadata(MetadataType.CUSTOM, {
            'epub_metadata': self._metadata
        })

    def _cleanup(self) -> None:
        """Clean up temporary files."""
        if self._temp_dir:
            # ignore_errors=True already makes removal best-effort, so no
            # additional exception handler is needed around it.
            shutil.rmtree(self._temp_dir, ignore_errors=True)
        self._temp_dir = None

    def get_unique_identifier(self) -> Optional[str]:
        """
        Get the unique identifier from the EPUB metadata.

        Returns:
            The identifier referenced by the package's unique-identifier
            attribute when it was resolved; otherwise the first identifier
            found, or None if there is none
        """
        if self._unique_identifier:
            return self._unique_identifier
        # Fallback to any identifier
        identifier = self._metadata.get('identifier')
        if isinstance(identifier, list):
            return identifier[0] if identifier else None
        return identifier

    def get_cover_id(self) -> Optional[str]:
        """
        Get the cover image ID from metadata.

        Returns:
            The cover image ID, or None if not found
        """
        return self._metadata.get('meta_cover')

    def get_creators(self) -> List[Dict[str, str]]:
        """
        Get creator information with roles.

        Returns:
            List of creator dictionaries with name, role, and file-as info
        """
        if self._creators:
            # Hand out copies so callers cannot alter the reader's state
            return [dict(creator) for creator in self._creators]

        # Fallback for metadata that was not filled in by _parse_dublin_core
        creator_value = self._metadata.get('creator')
        if not creator_value:
            return []
        if isinstance(creator_value, list):
            return [{'name': creator} for creator in creator_value]

        # Single creator
        creator_info = {'name': creator_value}
        # Add role if available
        role = self._metadata.get('creator_role')
        if role:
            creator_info['role'] = role
        # Add file-as if available
        file_as = self._metadata.get('creator_file_as')
        if file_as:
            creator_info['file_as'] = file_as
        return [creator_info]