230 lines
6.4 KiB
Python
230 lines
6.4 KiB
Python
"""
|
|
Base classes for document readers in pyWebLayout.
|
|
|
|
This module provides the foundational classes that all readers inherit from,
|
|
similar to how the abstract module provides base classes for document elements.
|
|
"""
|
|
|
|
from abc import ABC, abstractmethod
|
|
from typing import Any, Dict, List, Optional, Union
|
|
from pyWebLayout.abstract.document import Document
|
|
|
|
|
|
class BaseReader(ABC):
    """
    Common interface shared by every document reader.

    Concrete readers must implement :meth:`can_read` and :meth:`read`. A small
    per-instance option store is exposed through :meth:`set_option` and
    :meth:`get_option`.
    """

    def __init__(self):
        """Start with an empty option store and ``_document`` set to None."""
        self._options = {}
        self._document = None

    @abstractmethod
    def can_read(self, source: Union[str, bytes]) -> bool:
        """
        Report whether this reader is able to handle ``source``.

        Args:
            source: The candidate input (file path, URL, or content)

        Returns:
            True when the reader supports the source, False otherwise
        """

    @abstractmethod
    def read(self, source: Union[str, bytes], **options) -> Document:
        """
        Parse ``source`` and produce a Document.

        Args:
            source: The input to read (file path, URL, or content)
            **options: Extra keyword options that influence reading

        Returns:
            The parsed Document
        """

    def set_option(self, key: str, value: Any):
        """
        Store ``value`` under ``key`` in this reader's options.

        An existing entry with the same key is overwritten.

        Args:
            key: The option name
            value: The option value
        """
        self._options[key] = value

    def get_option(self, key: str, default: Any = None) -> Any:
        """
        Look up a previously stored option.

        Args:
            key: The option name
            default: Value handed back when the option has not been set

        Returns:
            The stored option value, or ``default`` if the key is absent
        """
        stored_options = self._options
        return stored_options.get(key, default)
|
|
|
|
|
|
class MetadataReader(ABC):
    """
    Interface for components that pull metadata out of a source.

    Implementations supply :meth:`extract_metadata`; the intended metadata
    covers items such as the document title and author.
    """

    @abstractmethod
    def extract_metadata(self, source: Any, document: Document) -> Dict[str, Any]:
        """
        Collect metadata from ``source``.

        Args:
            source: The source data
            document: The document that should receive the metadata

        Returns:
            Dictionary of extracted metadata
        """
|
|
|
|
|
|
class StructureReader(ABC):
    """
    Interface for components that discover a document's structure.

    Implementations supply :meth:`extract_structure`; the intended structure
    covers items such as headings and sections.
    """

    @abstractmethod
    def extract_structure(self, source: Any, document: Document) -> List[Any]:
        """
        Collect structural information from ``source``.

        Args:
            source: The source data
            document: The document that should receive the structure

        Returns:
            List of structural elements
        """
|
|
|
|
|
|
class ContentReader(ABC):
    """
    Interface for components that pull the body content out of a source.

    Implementations supply :meth:`extract_content`; the intended content
    covers items such as text and formatting.
    """

    @abstractmethod
    def extract_content(self, source: Any, document: Document) -> Any:
        """
        Collect content from ``source``.

        Args:
            source: The source data
            document: The document that should receive the content

        Returns:
            The extracted content
        """
|
|
|
|
|
|
class ResourceReader(ABC):
    """
    Interface for components that gather a document's resources.

    Implementations supply :meth:`extract_resources`; the intended resources
    cover items such as images and stylesheets.
    """

    @abstractmethod
    def extract_resources(self, source: Any, document: Document) -> Dict[str, Any]:
        """
        Collect resources from ``source``.

        Args:
            source: The source data
            document: The document that should receive the resources

        Returns:
            Dictionary of extracted resources
        """
|
|
|
|
|
|
class CompositeReader(BaseReader):
    """
    A reader that combines multiple specialized readers.

    This class uses composition to combine metadata, structure, content,
    and resource readers into a complete document reader. Every slot is
    optional; slots that were never set are skipped by :meth:`read`.

    ``can_read`` is not implemented here, so it remains abstract (it is
    declared abstract on ``BaseReader``) and concrete subclasses must
    provide it before the class can be instantiated.
    """

    def __init__(self):
        """Initialize the composite reader with all four slots empty."""
        super().__init__()
        self._metadata_reader: Optional[MetadataReader] = None
        self._structure_reader: Optional[StructureReader] = None
        self._content_reader: Optional[ContentReader] = None
        self._resource_reader: Optional[ResourceReader] = None

    def set_metadata_reader(self, reader: MetadataReader):
        """
        Set the metadata reader.

        Args:
            reader: The reader used for the metadata step of :meth:`read`
        """
        self._metadata_reader = reader

    def set_structure_reader(self, reader: StructureReader):
        """
        Set the structure reader.

        Args:
            reader: The reader used for the structure step of :meth:`read`
        """
        self._structure_reader = reader

    def set_content_reader(self, reader: ContentReader):
        """
        Set the content reader.

        Args:
            reader: The reader used for the content step of :meth:`read`
        """
        self._content_reader = reader

    def set_resource_reader(self, reader: ResourceReader):
        """
        Set the resource reader.

        Args:
            reader: The reader used for the resource step of :meth:`read`
        """
        self._resource_reader = reader

    def read(self, source: Union[str, bytes], **options) -> Document:
        """
        Read the source using all configured readers.

        A fresh Document is created and handed, together with ``source``, to
        each configured reader in a fixed order: metadata, structure, content,
        resources. The values returned by the readers are discarded; only the
        document they were given is returned.

        Args:
            source: The source to read
            **options: Additional options for reading; merged into this
                reader's stored options

        Returns:
            The parsed Document
        """
        document = Document()

        # Per-call options are merged into the persistent option store, so
        # they stay visible through get_option() after this call returns.
        self._options.update(options)

        # Slots are compared against None instead of being tested for
        # truthiness: a configured reader whose class defines __len__ or
        # __bool__ could evaluate as False and would otherwise be skipped
        # silently.
        if self._metadata_reader is not None:
            self._metadata_reader.extract_metadata(source, document)

        if self._structure_reader is not None:
            self._structure_reader.extract_structure(source, document)

        if self._content_reader is not None:
            self._content_reader.extract_content(source, document)

        if self._resource_reader is not None:
            self._resource_reader.extract_resources(source, document)

        return document
|