# 230 lines
# 6.4 KiB
# Python

"""
Base classes for document readers in pyWebLayout.
This module provides the foundational classes that all readers inherit from,
similar to how the abstract module provides base classes for document elements.
"""
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional, Union
from pyWebLayout.abstract.document import Document
class BaseReader(ABC):
    """
    Common contract shared by every document reader.

    Subclasses must implement ``can_read`` and ``read``. The option
    helpers (``set_option`` / ``get_option``) are provided here and operate
    on a per-instance dictionary.
    """

    def __init__(self):
        """Set up an empty reader: no document yet and no options."""
        self._document = None
        self._options = {}

    @abstractmethod
    def can_read(self, source: Union[str, bytes]) -> bool:
        """
        Report whether this reader is able to process ``source``.

        Args:
            source: The candidate input (file path, URL, or content)

        Returns:
            True when the reader can handle the source, False otherwise
        """

    @abstractmethod
    def read(self, source: Union[str, bytes], **options) -> Document:
        """
        Parse ``source`` and produce a Document.

        Args:
            source: The input to read (file path, URL, or content)
            **options: Extra reading options

        Returns:
            The Document built from the source
        """

    def set_option(self, key: str, value: Any):
        """
        Store a reader option, replacing any previous value for ``key``.

        Args:
            key: Name of the option
            value: Value to associate with the option
        """
        self._options[key] = value

    def get_option(self, key: str, default: Any = None) -> Any:
        """
        Look up a reader option.

        Args:
            key: Name of the option
            default: Value returned when the option has not been set

        Returns:
            The stored value, or ``default`` if the option is absent
        """
        if key in self._options:
            return self._options[key]
        return default
class MetadataReader(ABC):
    """
    Interface for components that pull metadata out of a source.

    Implementations are responsible for document metadata such as title,
    author, etc.
    """

    @abstractmethod
    def extract_metadata(self, source: Any, document: Document) -> Dict[str, Any]:
        """
        Pull metadata out of ``source``.

        Args:
            source: The source data
            document: The document that should receive the metadata

        Returns:
            A dictionary holding the extracted metadata
        """
class StructureReader(ABC):
    """
    Interface for components that pull structure out of a source.

    Implementations are responsible for document structure such as
    headings, sections, etc.
    """

    @abstractmethod
    def extract_structure(self, source: Any, document: Document) -> List[Any]:
        """
        Pull structural information out of ``source``.

        Args:
            source: The source data
            document: The document that should receive the structure

        Returns:
            A list of the structural elements found
        """
class ContentReader(ABC):
    """
    Interface for components that pull content out of a source.

    Implementations are responsible for document content such as text,
    formatting, etc.
    """

    @abstractmethod
    def extract_content(self, source: Any, document: Document) -> Any:
        """
        Pull content out of ``source``.

        Args:
            source: The source data
            document: The document that should receive the content

        Returns:
            Whatever content was extracted
        """
class ResourceReader(ABC):
    """
    Interface for components that pull resources out of a source.

    Implementations are responsible for document resources such as images,
    stylesheets, etc.
    """

    @abstractmethod
    def extract_resources(self, source: Any, document: Document) -> Dict[str, Any]:
        """
        Pull resources out of ``source``.

        Args:
            source: The source data
            document: The document that should receive the resources

        Returns:
            A dictionary holding the extracted resources
        """
class CompositeReader(BaseReader):
    """
    A reader that combines multiple specialized readers.

    This class uses composition to combine metadata, structure, content,
    and resource readers into a complete document reader. Each of the four
    readers is optional; any that has not been set is skipped by ``read``.

    No ``can_read`` implementation is defined here, so subclasses are
    expected to supply one.
    """

    def __init__(self):
        """Initialize the composite reader with no sub-readers configured."""
        super().__init__()
        self._metadata_reader: Optional[MetadataReader] = None
        self._structure_reader: Optional[StructureReader] = None
        self._content_reader: Optional[ContentReader] = None
        self._resource_reader: Optional[ResourceReader] = None

    def set_metadata_reader(self, reader: MetadataReader):
        """
        Set the metadata reader.

        Args:
            reader: The reader used by ``read`` to extract metadata
        """
        self._metadata_reader = reader

    def set_structure_reader(self, reader: StructureReader):
        """
        Set the structure reader.

        Args:
            reader: The reader used by ``read`` to extract structure
        """
        self._structure_reader = reader

    def set_content_reader(self, reader: ContentReader):
        """
        Set the content reader.

        Args:
            reader: The reader used by ``read`` to extract content
        """
        self._content_reader = reader

    def set_resource_reader(self, reader: ResourceReader):
        """
        Set the resource reader.

        Args:
            reader: The reader used by ``read`` to extract resources
        """
        self._resource_reader = reader

    def read(self, source: Union[str, bytes], **options) -> Document:
        """
        Read the source using all configured readers.

        A fresh Document is created and handed, together with ``source``,
        to each configured reader in the order metadata, structure,
        content, resources. The readers' return values are ignored; they
        are expected to populate the document they are given.

        Args:
            source: The source to read
            **options: Additional options for reading; these are merged
                into the reader's stored options and therefore remain
                visible through ``get_option`` after the call

        Returns:
            The parsed Document
        """
        document = Document()

        # Merge per-call options into the persistent option table.
        self._options.update(options)

        # Compare against None explicitly rather than relying on
        # truthiness: a configured reader must run even if its class
        # defines __bool__/__len__ and happens to evaluate as falsy.
        if self._metadata_reader is not None:
            self._metadata_reader.extract_metadata(source, document)

        if self._structure_reader is not None:
            self._structure_reader.extract_structure(source, document)

        if self._content_reader is not None:
            self._content_reader.extract_content(source, document)

        if self._resource_reader is not None:
            self._resource_reader.extract_resources(source, document)

        return document