# pyWebLayout/pyWebLayout/io/readers/base.py

"""
Base classes for document readers in pyWebLayout.

This module provides the foundational classes that all readers inherit from,
similar to how the abstract module provides base classes for document elements.
"""

from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional, Union
from pyWebLayout.abstract.document import Document


class BaseReader(ABC):
    """
    Common interface shared by every document reader.

    Concrete readers have to supply ``can_read`` and ``read``. The base class
    itself only maintains a dictionary of named options and a ``_document``
    attribute that starts out as ``None``.
    """

    def __init__(self):
        """Create the reader with no options set and no document attached."""
        self._options = {}
        self._document = None

    @abstractmethod
    def can_read(self, source: Union[str, bytes]) -> bool:
        """
        Report whether this reader is able to process the given source.

        Args:
            source: What to inspect (file path, URL, or content)

        Returns:
            True when the reader supports the source, False otherwise
        """

    @abstractmethod
    def read(self, source: Union[str, bytes], **options) -> Document:
        """
        Parse the source and produce a Document.

        Args:
            source: What to read (file path, URL, or content)
            **options: Extra keyword options that influence reading

        Returns:
            The Document built from the source
        """

    def set_option(self, key: str, value: Any):
        """
        Store a reader option, replacing any previous value for that name.

        Args:
            key: Name of the option
            value: Value to associate with the name
        """
        self._options[key] = value

    def get_option(self, key: str, default: Any = None) -> Any:
        """
        Look up a reader option.

        Args:
            key: Name of the option
            default: Value handed back when the option has not been set

        Returns:
            The stored value, or ``default`` if the option is absent
        """
        stored = self._options
        if key in stored:
            return stored[key]
        return default


class MetadataReader(ABC):
    """
    Interface for components that pull metadata out of a source.

    Implementations are meant to deal with document-level metadata such as
    title and author. Subclasses must implement ``extract_metadata``.
    """

    @abstractmethod
    def extract_metadata(
        self,
        source: Any,
        document: Document,
    ) -> Dict[str, Any]:
        """
        Collect metadata from the source.

        Args:
            source: The source data
            document: The document that should receive the metadata

        Returns:
            A dictionary holding the extracted metadata
        """
        ...


class StructureReader(ABC):
    """
    Interface for components that pull structural information out of a source.

    Implementations are meant to deal with document structure such as
    headings and sections. Subclasses must implement ``extract_structure``.
    """

    @abstractmethod
    def extract_structure(
        self,
        source: Any,
        document: Document,
    ) -> List[Any]:
        """
        Collect structure information from the source.

        Args:
            source: The source data
            document: The document that should receive the structure

        Returns:
            A list of the structural elements found
        """
        ...


class ContentReader(ABC):
    """
    Interface for components that pull the actual content out of a source.

    Implementations are meant to deal with document content such as text
    and formatting. Subclasses must implement ``extract_content``.
    """

    @abstractmethod
    def extract_content(
        self,
        source: Any,
        document: Document,
    ) -> Any:
        """
        Collect content from the source.

        Args:
            source: The source data
            document: The document that should receive the content

        Returns:
            Whatever content was extracted
        """
        ...


class ResourceReader(ABC):
    """
    Interface for components that pull auxiliary resources out of a source.

    Implementations are meant to deal with document resources such as
    images and stylesheets. Subclasses must implement ``extract_resources``.
    """

    @abstractmethod
    def extract_resources(
        self,
        source: Any,
        document: Document,
    ) -> Dict[str, Any]:
        """
        Collect resources from the source.

        Args:
            source: The source data
            document: The document that should receive the resources

        Returns:
            A dictionary holding the extracted resources
        """
        ...


class CompositeReader(BaseReader):
    """
    A reader that combines multiple specialized readers.

    This class uses composition to combine metadata, structure, content,
    and resource readers into a complete document reader. Each specialized
    reader is optional; ``read`` simply skips the ones that were never set.

    ``can_read`` is not implemented here, so this class is still abstract:
    concrete subclasses must provide it before they can be instantiated.
    """

    def __init__(self):
        """Initialize the composite reader with no specialized readers set."""
        super().__init__()
        self._metadata_reader: Optional[MetadataReader] = None
        self._structure_reader: Optional[StructureReader] = None
        self._content_reader: Optional[ContentReader] = None
        self._resource_reader: Optional[ResourceReader] = None

    def set_metadata_reader(self, reader: MetadataReader) -> None:
        """
        Set the metadata reader.

        Args:
            reader: The reader whose ``extract_metadata`` is called by ``read``
        """
        self._metadata_reader = reader

    def set_structure_reader(self, reader: StructureReader) -> None:
        """
        Set the structure reader.

        Args:
            reader: The reader whose ``extract_structure`` is called by ``read``
        """
        self._structure_reader = reader

    def set_content_reader(self, reader: ContentReader) -> None:
        """
        Set the content reader.

        Args:
            reader: The reader whose ``extract_content`` is called by ``read``
        """
        self._content_reader = reader

    def set_resource_reader(self, reader: ResourceReader) -> None:
        """
        Set the resource reader.

        Args:
            reader: The reader whose ``extract_resources`` is called by ``read``
        """
        self._resource_reader = reader

    def read(self, source: Union[str, bytes], **options) -> Document:
        """
        Read the source using all configured readers.

        A fresh Document is created and handed, together with ``source``, to
        each configured reader in a fixed order: metadata, structure, content,
        then resources. The values returned by the individual readers are not
        used here; the Document instance passed to them is what gets returned.

        Args:
            source: The source to read
            **options: Additional options for reading; they are merged into
                the reader's stored options and therefore remain set for
                later calls

        Returns:
            The parsed Document
        """
        # Create a new document
        document = Document()

        # Store options
        self._options.update(options)

        # Readers are compared against None explicitly rather than tested for
        # truthiness: a configured reader whose class defines __len__ or
        # __bool__ could evaluate as falsy and would otherwise be skipped.

        # Extract metadata if reader is available
        if self._metadata_reader is not None:
            self._metadata_reader.extract_metadata(source, document)

        # Extract structure if reader is available
        if self._structure_reader is not None:
            self._structure_reader.extract_structure(source, document)

        # Extract content if reader is available
        if self._content_reader is not None:
            self._content_reader.extract_content(source, document)

        # Extract resources if reader is available
        if self._resource_reader is not None:
            self._resource_reader.extract_resources(source, document)

        return document