more repo cleaning

2025-11-06 17:40:34 +01:00 · 2025-11-06 17:40:34 +01:00 · 1bd9fdb551
commit 1bd9fdb551
parent 84229ad4da
6 changed files with 5 additions and 572 deletions
--- a/pyWebLayout/main.py
+++ b/pyWebLayout/main.py
@ -1,12 +0,0 @@
 import os
 import sys
 # Put this file's parent directory at the front of sys.path so that the
 # `pyWebLayout` import below resolves when the file is executed directly.
 sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
 # This import must stay below the sys.path change above; it relies on it.
 from pyWebLayout.example import save_examples
 # Run the examples only when executed as a script, not when imported.
 if __name__ == "__main__":
    print("Running PyWebLayout examples...")
    save_examples()
--- a/pyWebLayout/io/init.py
+++ b/pyWebLayout/io/init.py
@ -3,13 +3,7 @@ Input/Output module for pyWebLayout.
 This package provides functionality for reading and writing various file formats,
 including HTML, EPUB, and other document formats.
 The module uses a decomposed architecture with specialized readers for different
 aspects of document parsing (metadata, content, resources), following the same
 pattern as the abstract module.
 """
-# Legacy readers (for backward compatibility)
+# Readers
 # Legacy functions provided by new HTML reader for backward compatibility
 from pyWebLayout.io.readers.epub_reader import EPUBReader
--- a/pyWebLayout/io/readers/init.py
+++ b/pyWebLayout/io/readers/init.py
@ -1,30 +1,17 @@
 """
 Readers module for pyWebLayout.
-This module provides specialized readers for different document formats
+This module provides specialized readers for different document formats.
 using a decomposed architecture pattern.
 """
 # Base classes for the decomposed architecture
 from .base import BaseReader, MetadataReader, ContentReader, ResourceReader, CompositeReader
 # HTML readers (decomposed)
 # EPUB readers
 from .epub_reader import read_epub  # Legacy
 __all__ = [
    # Base classes
    'BaseReader', 'MetadataReader', 'ContentReader', 'ResourceReader', 'CompositeReader',
    # HTML readers
-    'HTMLReader', 'read_html', 'read_html_file', 'parse_html_string',
+    'read_html', 'read_html_file', 'parse_html_string',
    'HTMLMetadataReader', 'HTMLResourceReader',
    # EPUB readers
-    'read_epub', 'EPUBMetadataReader',
+    'read_epub',
 ]
--- a/pyWebLayout/io/readers/base.py
+++ b/pyWebLayout/io/readers/base.py
@ -1,229 +0,0 @@
 """
 Base classes for document readers in pyWebLayout.
 This module provides the foundational classes that all readers inherit from,
 similar to how the abstract module provides base classes for document elements.
 """
 from abc import ABC, abstractmethod
 from typing import Any, Dict, List, Optional, Union
 from pyWebLayout.abstract.document import Document
 class BaseReader(ABC):
    """
    Common interface shared by every document reader.
    Subclasses must implement ``can_read`` and ``read``. A per-instance
    option store is exposed through ``set_option`` and ``get_option``.
    """
    def __init__(self):
        """Start with an empty option store and ``_document`` set to None."""
        self._options = dict()
        self._document = None
    @abstractmethod
    def can_read(self, source: Union[str, bytes]) -> bool:
        """
        Report whether this reader is able to handle ``source``.
        Args:
            source: The source to check (file path, URL, or content)
        Returns:
            True if this reader can handle the source, False otherwise
        """
    @abstractmethod
    def read(self, source: Union[str, bytes], **options) -> Document:
        """
        Parse ``source`` and return the resulting Document.
        Args:
            source: The source to read (file path, URL, or content)
            **options: Additional options for reading
        Returns:
            The parsed Document
        """
    def set_option(self, key: str, value: Any):
        """
        Store ``value`` under ``key``, replacing any earlier value.
        Args:
            key: The option name
            value: The option value
        """
        self._options[key] = value
    def get_option(self, key: str, default: Any = None) -> Any:
        """
        Look up a previously stored option.
        Args:
            key: The option name
            default: Value returned when ``key`` has not been set
        Returns:
            The stored value, or ``default`` when the option is missing
        """
        try:
            return self._options[key]
        except KeyError:
            return default
 class MetadataReader(ABC):
    """
    Interface for the reader component that extracts document metadata
    (title, author, and similar fields).
    """
    @abstractmethod
    def extract_metadata(self, source: Any, document: Document) -> Dict[str, Any]:
        """
        Pull metadata out of ``source``.
        Args:
            source: The source data
            document: The document to populate with metadata
        Returns:
            Dictionary of extracted metadata
        """
 class StructureReader(ABC):
    """
    Interface for the reader component that extracts document structure
    (headings, sections, and similar elements).
    """
    @abstractmethod
    def extract_structure(self, source: Any, document: Document) -> List[Any]:
        """
        Pull structural information out of ``source``.
        Args:
            source: The source data
            document: The document to populate with structure
        Returns:
            List of structural elements
        """
 class ContentReader(ABC):
    """
    Interface for the reader component that extracts document content
    (text, formatting, and similar data).
    """
    @abstractmethod
    def extract_content(self, source: Any, document: Document) -> Any:
        """
        Pull content out of ``source``.
        Args:
            source: The source data
            document: The document to populate with content
        Returns:
            The extracted content
        """
 class ResourceReader(ABC):
    """
    Interface for the reader component that extracts document resources
    (images, stylesheets, and similar assets).
    """
    @abstractmethod
    def extract_resources(self, source: Any, document: Document) -> Dict[str, Any]:
        """
        Pull resources out of ``source``.
        Args:
            source: The source data
            document: The document to populate with resources
        Returns:
            Dictionary of extracted resources
        """
 class CompositeReader(BaseReader):
    """
    A reader assembled from optional, single-purpose sub-readers.
    Metadata, structure, content and resource readers are attached through
    the ``set_*_reader`` methods; ``read`` runs whichever ones are attached.
    ``can_read`` is not implemented here, so it must still be supplied by a
    subclass before the class can be instantiated.
    """
    def __init__(self):
        """Initialize the composite reader with no sub-readers attached."""
        super().__init__()
        self._metadata_reader: Optional[MetadataReader] = None
        self._structure_reader: Optional[StructureReader] = None
        self._content_reader: Optional[ContentReader] = None
        self._resource_reader: Optional[ResourceReader] = None
    def set_metadata_reader(self, reader: MetadataReader):
        """Attach the reader used for the metadata step."""
        self._metadata_reader = reader
    def set_structure_reader(self, reader: StructureReader):
        """Attach the reader used for the structure step."""
        self._structure_reader = reader
    def set_content_reader(self, reader: ContentReader):
        """Attach the reader used for the content step."""
        self._content_reader = reader
    def set_resource_reader(self, reader: ResourceReader):
        """Attach the reader used for the resource step."""
        self._resource_reader = reader
    def read(self, source: Union[str, bytes], **options) -> Document:
        """
        Read the source using all configured readers.
        The steps run in a fixed order: metadata, structure, content,
        resources. Each attached reader receives the same ``source`` and the
        same new Document; the values the readers return are not used here.
        Args:
            source: The source to read
            **options: Additional options for reading; they are merged into
                this reader's option store and stay there after the call
        Returns:
            The parsed Document
        """
        document = Document()
        self._options.update(options)
        # Compare against None rather than relying on truthiness: a reader
        # object that happens to be falsy (for example one defining __len__
        # or __bool__) has still been configured and must not be skipped.
        if self._metadata_reader is not None:
            self._metadata_reader.extract_metadata(source, document)
        if self._structure_reader is not None:
            self._structure_reader.extract_structure(source, document)
        if self._content_reader is not None:
            self._content_reader.extract_content(source, document)
        if self._resource_reader is not None:
            self._resource_reader.extract_resources(source, document)
        return document
--- a/pyWebLayout/style/layout.py
+++ b/pyWebLayout/style/layout.py
@ -1,5 +0,0 @@
 """
 Layout options for the pyWebLayout library.
 This module provides layout-related functionality.
 """
--- a/tests/io_tests/test_base_reader.py
+++ b/tests/io_tests/test_base_reader.py
@ -1,302 +0,0 @@
 """
 Tests for pyWebLayout.io.readers.base module.
 Tests the base reader classes and their functionality.
 """
 import pytest
 from pyWebLayout.io.readers.base import (
    BaseReader,
    MetadataReader,
    StructureReader,
    ContentReader,
    ResourceReader,
    CompositeReader
 )
 from pyWebLayout.abstract.document import Document
 # Concrete implementations for testing
 class ConcreteBaseReader(BaseReader):
    """Minimal BaseReader subclass used as a test fixture."""
    def can_read(self, source):
        # Only string sources carrying the '.test' suffix are accepted
        if not isinstance(source, str):
            return False
        return source.endswith('.test')
    def read(self, source, **options):
        # Record the source on the document so tests can check it
        document = Document()
        document.set_metadata('source', source)
        return document
 class ConcreteMetadataReader(MetadataReader):
    """MetadataReader fixture that always reports a fixed title and author."""
    def extract_metadata(self, source, document):
        metadata = dict(title='Test Title', author='Test Author')
        # Copy both fields onto the document, title first
        for field in ('title', 'author'):
            document.set_metadata(field, metadata[field])
        return metadata
 class ConcreteStructureReader(StructureReader):
    """StructureReader fixture that returns two fixed heading names."""
    def extract_structure(self, source, document):
        # The document is left untouched; only the return value matters
        headings = ['heading1', 'heading2']
        return headings
 class ConcreteContentReader(ContentReader):
    """ContentReader fixture that returns a fixed string."""
    def extract_content(self, source, document):
        # The document is left untouched; only the return value matters
        content = "Test content"
        return content
 class ConcreteResourceReader(ResourceReader):
    """ResourceReader fixture that registers one image and one stylesheet."""
    def extract_resources(self, source, document):
        resources = {}
        resources['image1.png'] = b'fake image data'
        resources['style.css'] = 'fake css'
        # Register every resource on the document in insertion order
        for name in resources:
            document.add_resource(name, resources[name])
        return resources
 class ConcreteCompositeReader(CompositeReader):
    """CompositeReader fixture that supplies the missing ``can_read``."""
    def can_read(self, source):
        # Accept every source so the tests can focus on read()
        return True
 # Test Cases
 class TestBaseReaderOptions:
    """Exercise the option store exposed by BaseReader."""
    def test_set_and_get_option(self):
        """An option that was set can be read back."""
        subject = ConcreteBaseReader()
        subject.set_option('font_size', 12)
        stored = subject.get_option('font_size')
        assert stored == 12
    def test_get_option_with_default(self):
        """A missing option yields the supplied default."""
        subject = ConcreteBaseReader()
        fallback = subject.get_option('nonexistent', 'default_value')
        assert fallback == 'default_value'
    def test_get_option_without_default(self):
        """A missing option yields None when no default is supplied."""
        subject = ConcreteBaseReader()
        missing = subject.get_option('nonexistent')
        assert missing is None
    def test_multiple_options(self):
        """Several options can be stored side by side."""
        subject = ConcreteBaseReader()
        expected = {'font_size': 12, 'line_height': 1.5, 'color': 'black'}
        # Store everything first, then read each value back
        for key, value in expected.items():
            subject.set_option(key, value)
        for key, value in expected.items():
            assert subject.get_option(key) == value
 class TestBaseReaderConcrete:
    """Exercise the ConcreteBaseReader fixture through the BaseReader API."""
    def test_can_read_valid_source(self):
        """A '.test' source is accepted."""
        verdict = ConcreteBaseReader().can_read('document.test')
        assert verdict is True
    def test_can_read_invalid_source(self):
        """A source with another suffix is rejected."""
        verdict = ConcreteBaseReader().can_read('document.html')
        assert verdict is False
    def test_read_creates_document(self):
        """read() returns a Document carrying the source as metadata."""
        parsed = ConcreteBaseReader().read('test.test')
        assert isinstance(parsed, Document)
        assert parsed.get_metadata('source') == 'test.test'
 class TestMetadataReaderConcrete:
    """Exercise the ConcreteMetadataReader fixture."""
    def test_extract_metadata(self):
        """Metadata is both returned and written onto the document."""
        extractor = ConcreteMetadataReader()
        target = Document()
        extracted = extractor.extract_metadata('source', target)
        expected = (('title', 'Test Title'), ('author', 'Test Author'))
        # First the returned mapping, then the document itself
        for field, value in expected:
            assert extracted[field] == value
        for field, value in expected:
            assert target.get_metadata(field) == value
 class TestStructureReaderConcrete:
    """Exercise the ConcreteStructureReader fixture."""
    def test_extract_structure(self):
        """The fixture returns its two headings as a list."""
        extractor = ConcreteStructureReader()
        target = Document()
        headings = extractor.extract_structure('source', target)
        assert isinstance(headings, list)
        assert len(headings) == 2
        first, second = headings
        assert first == 'heading1'
        assert second == 'heading2'
 class TestContentReaderConcrete:
    """Exercise the ConcreteContentReader fixture."""
    def test_extract_content(self):
        """The fixture returns its fixed content string."""
        extractor = ConcreteContentReader()
        target = Document()
        extracted = extractor.extract_content('source', target)
        assert extracted == "Test content"
 class TestResourceReaderConcrete:
    """Exercise the ConcreteResourceReader fixture."""
    def test_extract_resources(self):
        """Resources are both returned and registered on the document."""
        extractor = ConcreteResourceReader()
        target = Document()
        extracted = extractor.extract_resources('source', target)
        assert isinstance(extracted, dict)
        for name in ('image1.png', 'style.css'):
            assert name in extracted
        assert target.get_resource('image1.png') == b'fake image data'
        assert target.get_resource('style.css') == 'fake css'
 class TestCompositeReader:
    """Exercise CompositeReader through the ConcreteCompositeReader fixture."""
    def test_initialization(self):
        """A fresh composite reader has no sub-readers attached."""
        composite = ConcreteCompositeReader()
        slots = (
            composite._metadata_reader,
            composite._structure_reader,
            composite._content_reader,
            composite._resource_reader,
        )
        for slot in slots:
            assert slot is None
    def test_set_metadata_reader(self):
        """The metadata setter stores the exact object it was given."""
        composite = ConcreteCompositeReader()
        part = ConcreteMetadataReader()
        composite.set_metadata_reader(part)
        assert composite._metadata_reader is part
    def test_set_structure_reader(self):
        """The structure setter stores the exact object it was given."""
        composite = ConcreteCompositeReader()
        part = ConcreteStructureReader()
        composite.set_structure_reader(part)
        assert composite._structure_reader is part
    def test_set_content_reader(self):
        """The content setter stores the exact object it was given."""
        composite = ConcreteCompositeReader()
        part = ConcreteContentReader()
        composite.set_content_reader(part)
        assert composite._content_reader is part
    def test_set_resource_reader(self):
        """The resource setter stores the exact object it was given."""
        composite = ConcreteCompositeReader()
        part = ConcreteResourceReader()
        composite.set_resource_reader(part)
        assert composite._resource_reader is part
    def test_read_with_all_readers(self):
        """With every sub-reader attached, metadata and resources show up."""
        composite = ConcreteCompositeReader()
        composite.set_metadata_reader(ConcreteMetadataReader())
        composite.set_structure_reader(ConcreteStructureReader())
        composite.set_content_reader(ConcreteContentReader())
        composite.set_resource_reader(ConcreteResourceReader())
        result = composite.read('test_source')
        # Metadata written by the metadata fixture
        assert result.get_metadata('title') == 'Test Title'
        assert result.get_metadata('author') == 'Test Author'
        # Resources registered by the resource fixture
        assert result.get_resource('image1.png') == b'fake image data'
        assert result.get_resource('style.css') == 'fake css'
    def test_read_with_no_readers(self):
        """Without sub-readers, read() still hands back a Document."""
        result = ConcreteCompositeReader().read('test_source')
        assert isinstance(result, Document)
    def test_read_with_only_metadata_reader(self):
        """A single attached sub-reader is enough for read() to work."""
        composite = ConcreteCompositeReader()
        composite.set_metadata_reader(ConcreteMetadataReader())
        result = composite.read('test_source')
        assert result.get_metadata('title') == 'Test Title'
    def test_read_with_options(self):
        """Keyword options passed to read() are kept on the reader."""
        composite = ConcreteCompositeReader()
        composite.set_metadata_reader(ConcreteMetadataReader())
        # The returned document is not needed; only the stored options are
        composite.read('test_source', font_size=14, encoding='utf-8')
        assert composite.get_option('font_size') == 14
        assert composite.get_option('encoding') == 'utf-8'
    def test_can_read_implemented(self):
        """The fixture's can_read accepts any source."""
        verdict = ConcreteCompositeReader().can_read('test_source')
        assert verdict is True
 class TestCompositeReaderIntegration:
    """End-to-end check of CompositeReader with every fixture attached."""
    def test_full_document_reading_workflow(self):
        """All sub-readers and the option store cooperate in one read()."""
        composite = ConcreteCompositeReader()
        composite.set_metadata_reader(ConcreteMetadataReader())
        composite.set_structure_reader(ConcreteStructureReader())
        composite.set_content_reader(ConcreteContentReader())
        composite.set_resource_reader(ConcreteResourceReader())
        result = composite.read('complex_document.test', font_size=16, page_width=800)
        # Document side: metadata and resources are present
        assert result.get_metadata('title') == 'Test Title'
        assert result.get_metadata('author') == 'Test Author'
        assert result.get_resource('image1.png') is not None
        # Reader side: the keyword options were stored
        assert composite.get_option('font_size') == 16
        assert composite.get_option('page_width') == 800