diff --git a/pyWebLayout/__main__.py b/pyWebLayout/__main__.py deleted file mode 100644 index b40fdf5..0000000 --- a/pyWebLayout/__main__.py +++ /dev/null @@ -1,12 +0,0 @@ -import os -import sys - -# Add the parent directory to sys.path for direct execution -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) - -# Now import the example module -from pyWebLayout.example import save_examples - -if __name__ == "__main__": - print("Running PyWebLayout examples...") - save_examples() diff --git a/pyWebLayout/io/__init__.py b/pyWebLayout/io/__init__.py index 8365e96..d7d47f7 100644 --- a/pyWebLayout/io/__init__.py +++ b/pyWebLayout/io/__init__.py @@ -3,13 +3,7 @@ Input/Output module for pyWebLayout. This package provides functionality for reading and writing various file formats, including HTML, EPUB, and other document formats. - -The module uses a decomposed architecture with specialized readers for different -aspects of document parsing (metadata, content, resources), following the same -pattern as the abstract module. """ -# Legacy readers (for backward compatibility) -# Legacy functions provided by new HTML reader for backward compatibility - +# Readers from pyWebLayout.io.readers.epub_reader import EPUBReader diff --git a/pyWebLayout/io/readers/__init__.py b/pyWebLayout/io/readers/__init__.py index c2350ad..9ee1022 100644 --- a/pyWebLayout/io/readers/__init__.py +++ b/pyWebLayout/io/readers/__init__.py @@ -1,30 +1,17 @@ """ Readers module for pyWebLayout. -This module provides specialized readers for different document formats -using a decomposed architecture pattern. +This module provides specialized readers for different document formats. """ -# Base classes for the decomposed architecture -from .base import BaseReader, MetadataReader, ContentReader, ResourceReader, CompositeReader - -# HTML readers (decomposed) - - - - # EPUB readers from .epub_reader import read_epub # Legacy __all__ = [ - # Base classes - 'BaseReader', 'MetadataReader', 'ContentReader', 'ResourceReader', 'CompositeReader', - # HTML readers - 'HTMLReader', 'read_html', 'read_html_file', 'parse_html_string', - 'HTMLMetadataReader', 'HTMLResourceReader', - + 'read_html', 'read_html_file', 'parse_html_string', + # EPUB readers - 'read_epub', 'EPUBMetadataReader', + 'read_epub', ] diff --git a/pyWebLayout/io/readers/base.py b/pyWebLayout/io/readers/base.py deleted file mode 100644 index 984f580..0000000 --- a/pyWebLayout/io/readers/base.py +++ /dev/null @@ -1,229 +0,0 @@ -""" -Base classes for document readers in pyWebLayout. - -This module provides the foundational classes that all readers inherit from, -similar to how the abstract module provides base classes for document elements. -""" - -from abc import ABC, abstractmethod -from typing import Any, Dict, List, Optional, Union -from pyWebLayout.abstract.document import Document - - -class BaseReader(ABC): - """ - Abstract base class for all document readers. - - This class defines the common interface that all readers must implement. - """ - - def __init__(self): - """Initialize the base reader.""" - self._document = None - self._options = {} - - @abstractmethod - def can_read(self, source: Union[str, bytes]) -> bool: - """ - Check if this reader can handle the given source. - - Args: - source: The source to check (file path, URL, or content) - - Returns: - True if this reader can handle the source, False otherwise - """ - pass - - @abstractmethod - def read(self, source: Union[str, bytes], **options) -> Document: - """ - Read and parse the source into a Document. - - Args: - source: The source to read (file path, URL, or content) - **options: Additional options for reading - - Returns: - The parsed Document - """ - pass - - def set_option(self, key: str, value: Any): - """ - Set a reader option. - - Args: - key: The option name - value: The option value - """ - self._options[key] = value - - def get_option(self, key: str, default: Any = None) -> Any: - """ - Get a reader option. - - Args: - key: The option name - default: Default value if option is not set - - Returns: - The option value or default - """ - return self._options.get(key, default) - - -class MetadataReader(ABC): - """ - Abstract base class for reading document metadata. - - This class handles extraction of document metadata like title, author, etc. - """ - - @abstractmethod - def extract_metadata(self, source: Any, document: Document) -> Dict[str, Any]: - """ - Extract metadata from the source. - - Args: - source: The source data - document: The document to populate with metadata - - Returns: - Dictionary of extracted metadata - """ - pass - - -class StructureReader(ABC): - """ - Abstract base class for reading document structure. - - This class handles extraction of document structure like headings, sections, etc. - """ - - @abstractmethod - def extract_structure(self, source: Any, document: Document) -> List[Any]: - """ - Extract structure information from the source. - - Args: - source: The source data - document: The document to populate with structure - - Returns: - List of structural elements - """ - pass - - -class ContentReader(ABC): - """ - Abstract base class for reading document content. - - This class handles extraction of document content like text, formatting, etc. - """ - - @abstractmethod - def extract_content(self, source: Any, document: Document) -> Any: - """ - Extract content from the source. - - Args: - source: The source data - document: The document to populate with content - - Returns: - The extracted content - """ - pass - - -class ResourceReader(ABC): - """ - Abstract base class for reading document resources. - - This class handles extraction of document resources like images, stylesheets, etc. - """ - - @abstractmethod - def extract_resources(self, source: Any, document: Document) -> Dict[str, Any]: - """ - Extract resources from the source. - - Args: - source: The source data - document: The document to populate with resources - - Returns: - Dictionary of extracted resources - """ - pass - - -class CompositeReader(BaseReader): - """ - A reader that combines multiple specialized readers. - - This class uses composition to combine metadata, structure, content, - and resource readers into a complete document reader. - """ - - def __init__(self): - """Initialize the composite reader.""" - super().__init__() - self._metadata_reader: Optional[MetadataReader] = None - self._structure_reader: Optional[StructureReader] = None - self._content_reader: Optional[ContentReader] = None - self._resource_reader: Optional[ResourceReader] = None - - def set_metadata_reader(self, reader: MetadataReader): - """Set the metadata reader.""" - self._metadata_reader = reader - - def set_structure_reader(self, reader: StructureReader): - """Set the structure reader.""" - self._structure_reader = reader - - def set_content_reader(self, reader: ContentReader): - """Set the content reader.""" - self._content_reader = reader - - def set_resource_reader(self, reader: ResourceReader): - """Set the resource reader.""" - self._resource_reader = reader - - def read(self, source: Union[str, bytes], **options) -> Document: - """ - Read the source using all configured readers. - - Args: - source: The source to read - **options: Additional options for reading - - Returns: - The parsed Document - """ - # Create a new document - document = Document() - - # Store options - self._options.update(options) - - # Extract metadata if reader is available - if self._metadata_reader: - self._metadata_reader.extract_metadata(source, document) - - # Extract structure if reader is available - if self._structure_reader: - self._structure_reader.extract_structure(source, document) - - # Extract content if reader is available - if self._content_reader: - self._content_reader.extract_content(source, document) - - # Extract resources if reader is available - if self._resource_reader: - self._resource_reader.extract_resources(source, document) - - return document diff --git a/pyWebLayout/style/layout.py b/pyWebLayout/style/layout.py deleted file mode 100644 index 456adf8..0000000 --- a/pyWebLayout/style/layout.py +++ /dev/null @@ -1,5 +0,0 @@ -""" -Layout options for the pyWebLayout library. - -This module provides layout-related functionality. -""" diff --git a/tests/io_tests/test_base_reader.py b/tests/io_tests/test_base_reader.py deleted file mode 100644 index 4a25a44..0000000 --- a/tests/io_tests/test_base_reader.py +++ /dev/null @@ -1,302 +0,0 @@ -""" -Tests for pyWebLayout.io.readers.base module. - -Tests the base reader classes and their functionality. -""" - -import pytest -from pyWebLayout.io.readers.base import ( - BaseReader, - MetadataReader, - StructureReader, - ContentReader, - ResourceReader, - CompositeReader -) -from pyWebLayout.abstract.document import Document - - -# Concrete implementations for testing - -class ConcreteBaseReader(BaseReader): - """Test implementation of BaseReader.""" - - def can_read(self, source): - return isinstance(source, str) and source.endswith('.test') - - def read(self, source, **options): - doc = Document() - doc.set_metadata('source', source) - return doc - - -class ConcreteMetadataReader(MetadataReader): - """Test implementation of MetadataReader.""" - - def extract_metadata(self, source, document): - metadata = { - 'title': 'Test Title', - 'author': 'Test Author' - } - document.set_metadata('title', metadata['title']) - document.set_metadata('author', metadata['author']) - return metadata - - -class ConcreteStructureReader(StructureReader): - """Test implementation of StructureReader.""" - - def extract_structure(self, source, document): - return ['heading1', 'heading2'] - - -class ConcreteContentReader(ContentReader): - """Test implementation of ContentReader.""" - - def extract_content(self, source, document): - return "Test content" - - -class ConcreteResourceReader(ResourceReader): - """Test implementation of ResourceReader.""" - - def extract_resources(self, source, document): - resources = { - 'image1.png': b'fake image data', - 'style.css': 'fake css' - } - for name, data in resources.items(): - document.add_resource(name, data) - return resources - - -class ConcreteCompositeReader(CompositeReader): - """Test implementation of CompositeReader.""" - - def can_read(self, source): - return True - - -# Test Cases - -class TestBaseReaderOptions: - """Test BaseReader options functionality.""" - - def test_set_and_get_option(self): - """Test setting and getting options.""" - reader = ConcreteBaseReader() - reader.set_option('font_size', 12) - assert reader.get_option('font_size') == 12 - - def test_get_option_with_default(self): - """Test getting option with default value.""" - reader = ConcreteBaseReader() - assert reader.get_option('nonexistent', 'default_value') == 'default_value' - - def test_get_option_without_default(self): - """Test getting nonexistent option without default.""" - reader = ConcreteBaseReader() - assert reader.get_option('nonexistent') is None - - def test_multiple_options(self): - """Test setting multiple options.""" - reader = ConcreteBaseReader() - reader.set_option('font_size', 12) - reader.set_option('line_height', 1.5) - reader.set_option('color', 'black') - - assert reader.get_option('font_size') == 12 - assert reader.get_option('line_height') == 1.5 - assert reader.get_option('color') == 'black' - - -class TestBaseReaderConcrete: - """Test concrete BaseReader implementation.""" - - def test_can_read_valid_source(self): - """Test can_read with valid source.""" - reader = ConcreteBaseReader() - assert reader.can_read('document.test') is True - - def test_can_read_invalid_source(self): - """Test can_read with invalid source.""" - reader = ConcreteBaseReader() - assert reader.can_read('document.html') is False - - def test_read_creates_document(self): - """Test read creates a Document.""" - reader = ConcreteBaseReader() - doc = reader.read('test.test') - assert isinstance(doc, Document) - assert doc.get_metadata('source') == 'test.test' - - -class TestMetadataReaderConcrete: - """Test concrete MetadataReader implementation.""" - - def test_extract_metadata(self): - """Test metadata extraction.""" - reader = ConcreteMetadataReader() - doc = Document() - metadata = reader.extract_metadata('source', doc) - - assert metadata['title'] == 'Test Title' - assert metadata['author'] == 'Test Author' - assert doc.get_metadata('title') == 'Test Title' - assert doc.get_metadata('author') == 'Test Author' - - -class TestStructureReaderConcrete: - """Test concrete StructureReader implementation.""" - - def test_extract_structure(self): - """Test structure extraction.""" - reader = ConcreteStructureReader() - doc = Document() - structure = reader.extract_structure('source', doc) - - assert isinstance(structure, list) - assert len(structure) == 2 - assert structure[0] == 'heading1' - assert structure[1] == 'heading2' - - -class TestContentReaderConcrete: - """Test concrete ContentReader implementation.""" - - def test_extract_content(self): - """Test content extraction.""" - reader = ConcreteContentReader() - doc = Document() - content = reader.extract_content('source', doc) - - assert content == "Test content" - - -class TestResourceReaderConcrete: - """Test concrete ResourceReader implementation.""" - - def test_extract_resources(self): - """Test resource extraction.""" - reader = ConcreteResourceReader() - doc = Document() - resources = reader.extract_resources('source', doc) - - assert isinstance(resources, dict) - assert 'image1.png' in resources - assert 'style.css' in resources - assert doc.get_resource('image1.png') == b'fake image data' - assert doc.get_resource('style.css') == 'fake css' - - -class TestCompositeReader: - """Test CompositeReader functionality.""" - - def test_initialization(self): - """Test composite reader initialization.""" - reader = ConcreteCompositeReader() - assert reader._metadata_reader is None - assert reader._structure_reader is None - assert reader._content_reader is None - assert reader._resource_reader is None - - def test_set_metadata_reader(self): - """Test setting metadata reader.""" - reader = ConcreteCompositeReader() - metadata_reader = ConcreteMetadataReader() - reader.set_metadata_reader(metadata_reader) - assert reader._metadata_reader is metadata_reader - - def test_set_structure_reader(self): - """Test setting structure reader.""" - reader = ConcreteCompositeReader() - structure_reader = ConcreteStructureReader() - reader.set_structure_reader(structure_reader) - assert reader._structure_reader is structure_reader - - def test_set_content_reader(self): - """Test setting content reader.""" - reader = ConcreteCompositeReader() - content_reader = ConcreteContentReader() - reader.set_content_reader(content_reader) - assert reader._content_reader is content_reader - - def test_set_resource_reader(self): - """Test setting resource reader.""" - reader = ConcreteCompositeReader() - resource_reader = ConcreteResourceReader() - reader.set_resource_reader(resource_reader) - assert reader._resource_reader is resource_reader - - def test_read_with_all_readers(self): - """Test reading with all readers configured.""" - reader = ConcreteCompositeReader() - reader.set_metadata_reader(ConcreteMetadataReader()) - reader.set_structure_reader(ConcreteStructureReader()) - reader.set_content_reader(ConcreteContentReader()) - reader.set_resource_reader(ConcreteResourceReader()) - - doc = reader.read('test_source') - - # Verify metadata was extracted - assert doc.get_metadata('title') == 'Test Title' - assert doc.get_metadata('author') == 'Test Author' - - # Verify resources were extracted - assert doc.get_resource('image1.png') == b'fake image data' - assert doc.get_resource('style.css') == 'fake css' - - def test_read_with_no_readers(self): - """Test reading with no readers configured.""" - reader = ConcreteCompositeReader() - doc = reader.read('test_source') - - # Should create an empty document - assert isinstance(doc, Document) - - def test_read_with_only_metadata_reader(self): - """Test reading with only metadata reader.""" - reader = ConcreteCompositeReader() - reader.set_metadata_reader(ConcreteMetadataReader()) - - doc = reader.read('test_source') - assert doc.get_metadata('title') == 'Test Title' - - def test_read_with_options(self): - """Test reading with options.""" - reader = ConcreteCompositeReader() - reader.set_metadata_reader(ConcreteMetadataReader()) - - doc = reader.read('test_source', font_size=14, encoding='utf-8') - - # Verify options were stored - assert reader.get_option('font_size') == 14 - assert reader.get_option('encoding') == 'utf-8' - - def test_can_read_implemented(self): - """Test that can_read is implemented in ConcreteCompositeReader.""" - reader = ConcreteCompositeReader() - assert reader.can_read('test_source') is True - - -class TestCompositeReaderIntegration: - """Integration tests for CompositeReader.""" - - def test_full_document_reading_workflow(self): - """Test complete document reading workflow.""" - # Create and configure composite reader - reader = ConcreteCompositeReader() - reader.set_metadata_reader(ConcreteMetadataReader()) - reader.set_structure_reader(ConcreteStructureReader()) - reader.set_content_reader(ConcreteContentReader()) - reader.set_resource_reader(ConcreteResourceReader()) - - # Read document with options - doc = reader.read('complex_document.test', font_size=16, page_width=800) - - # Verify all components worked together - assert doc.get_metadata('title') == 'Test Title' - assert doc.get_metadata('author') == 'Test Author' - assert doc.get_resource('image1.png') is not None - assert reader.get_option('font_size') == 16 - assert reader.get_option('page_width') == 800