more repo cleaning
All checks were successful
Python CI / test (push) Successful in 10m0s

This commit is contained in:
Duncan Tourolle 2025-11-06 17:40:34 +01:00
parent 84229ad4da
commit 1bd9fdb551
6 changed files with 5 additions and 572 deletions

View File

@ -1,12 +0,0 @@
import os
import sys
# Put the parent directory of this file's folder on sys.path so that the
# `pyWebLayout` package can be imported when this file is executed directly.
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

# Intentionally imported after the sys.path adjustment above.
from pyWebLayout.example import save_examples  # noqa: E402

if __name__ == "__main__":
    print("Running PyWebLayout examples...")
    save_examples()

View File

@ -3,13 +3,7 @@ Input/Output module for pyWebLayout.
This package provides functionality for reading and writing various file formats, This package provides functionality for reading and writing various file formats,
including HTML, EPUB, and other document formats. including HTML, EPUB, and other document formats.
The module uses a decomposed architecture with specialized readers for different
aspects of document parsing (metadata, content, resources), following the same
pattern as the abstract module.
""" """
# Legacy readers (for backward compatibility) # Readers
# Legacy functions provided by new HTML reader for backward compatibility
from pyWebLayout.io.readers.epub_reader import EPUBReader from pyWebLayout.io.readers.epub_reader import EPUBReader

View File

@ -1,30 +1,17 @@
""" """
Readers module for pyWebLayout. Readers module for pyWebLayout.
This module provides specialized readers for different document formats This module provides specialized readers for different document formats.
using a decomposed architecture pattern.
""" """
# Base classes for the decomposed architecture
from .base import BaseReader, MetadataReader, ContentReader, ResourceReader, CompositeReader
# HTML readers (decomposed)
# EPUB readers # EPUB readers
from .epub_reader import read_epub # Legacy from .epub_reader import read_epub # Legacy
__all__ = [ __all__ = [
# Base classes
'BaseReader', 'MetadataReader', 'ContentReader', 'ResourceReader', 'CompositeReader',
# HTML readers # HTML readers
'HTMLReader', 'read_html', 'read_html_file', 'parse_html_string', 'read_html', 'read_html_file', 'parse_html_string',
'HTMLMetadataReader', 'HTMLResourceReader',
# EPUB readers # EPUB readers
'read_epub', 'EPUBMetadataReader', 'read_epub',
] ]

View File

@ -1,229 +0,0 @@
"""
Base classes for document readers in pyWebLayout.
This module provides the foundational classes that all readers inherit from,
similar to how the abstract module provides base classes for document elements.
"""
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional, Union
from pyWebLayout.abstract.document import Document
class BaseReader(ABC):
    """
    Abstract base class for all document readers.

    Subclasses must implement :meth:`can_read` and :meth:`read`. The base
    class itself only provides a small key/value store for reader options.
    """

    def __init__(self) -> None:
        """Initialize the base reader with no document and no options."""
        # Not read or written anywhere else in this class.
        self._document = None
        # Option name -> option value, see set_option() / get_option().
        self._options: Dict[str, Any] = {}

    @abstractmethod
    def can_read(self, source: Union[str, bytes]) -> bool:
        """
        Check if this reader can handle the given source.

        Args:
            source: The source to check (file path, URL, or content)

        Returns:
            True if this reader can handle the source, False otherwise
        """
        pass

    @abstractmethod
    def read(self, source: Union[str, bytes], **options) -> Document:
        """
        Read and parse the source into a Document.

        Args:
            source: The source to read (file path, URL, or content)
            **options: Additional options for reading

        Returns:
            The parsed Document
        """
        pass

    def set_option(self, key: str, value: Any) -> None:
        """
        Set a reader option, overwriting any previous value for ``key``.

        Args:
            key: The option name
            value: The option value
        """
        self._options[key] = value

    def get_option(self, key: str, default: Any = None) -> Any:
        """
        Get a reader option.

        Args:
            key: The option name
            default: Value returned when the option has not been set

        Returns:
            The option value, or ``default`` if ``key`` is unknown
        """
        return self._options.get(key, default)
class MetadataReader(ABC):
    """
    Abstract base class for reading document metadata.

    Implementations extract document metadata such as title or author.
    """

    @abstractmethod
    def extract_metadata(self, source: Any, document: Document) -> Dict[str, Any]:
        """
        Extract metadata from the source.

        Args:
            source: The source data
            document: The document to populate with metadata

        Returns:
            Dictionary of extracted metadata
        """
        pass
class StructureReader(ABC):
    """
    Abstract base class for reading document structure.

    Implementations extract structural information such as headings or
    sections.
    """

    @abstractmethod
    def extract_structure(self, source: Any, document: Document) -> List[Any]:
        """
        Extract structure information from the source.

        Args:
            source: The source data
            document: The document to populate with structure

        Returns:
            List of structural elements
        """
        pass
class ContentReader(ABC):
    """
    Abstract base class for reading document content.

    Implementations extract the document content such as text and
    formatting.
    """

    @abstractmethod
    def extract_content(self, source: Any, document: Document) -> Any:
        """
        Extract content from the source.

        Args:
            source: The source data
            document: The document to populate with content

        Returns:
            The extracted content
        """
        pass
class ResourceReader(ABC):
    """
    Abstract base class for reading document resources.

    Implementations extract document resources such as images or
    stylesheets.
    """

    @abstractmethod
    def extract_resources(self, source: Any, document: Document) -> Dict[str, Any]:
        """
        Extract resources from the source.

        Args:
            source: The source data
            document: The document to populate with resources

        Returns:
            Dictionary of extracted resources
        """
        pass
class CompositeReader(BaseReader):
    """
    A reader that combines multiple specialized readers.

    Metadata, structure, content and resource extraction are delegated to
    optional collaborator objects attached through the ``set_*`` methods.
    ``can_read`` is not implemented here, so this class is still abstract
    and has to be subclassed before it can be instantiated.
    """

    def __init__(self) -> None:
        """Initialize the composite reader with no sub-readers attached."""
        super().__init__()
        self._metadata_reader: Optional[MetadataReader] = None
        self._structure_reader: Optional[StructureReader] = None
        self._content_reader: Optional[ContentReader] = None
        self._resource_reader: Optional[ResourceReader] = None

    def set_metadata_reader(self, reader: MetadataReader) -> None:
        """Set the metadata reader."""
        self._metadata_reader = reader

    def set_structure_reader(self, reader: StructureReader) -> None:
        """Set the structure reader."""
        self._structure_reader = reader

    def set_content_reader(self, reader: ContentReader) -> None:
        """Set the content reader."""
        self._content_reader = reader

    def set_resource_reader(self, reader: ResourceReader) -> None:
        """Set the resource reader."""
        self._resource_reader = reader

    def read(self, source: Union[str, bytes], **options) -> Document:
        """
        Read the source using all configured readers.

        A fresh Document is created and handed, together with ``source``,
        to each attached reader in the order metadata, structure, content,
        resources. The values returned by the readers are discarded; they
        are expected to populate the document themselves.

        Args:
            source: The source to read
            **options: Additional options for reading; merged into the
                reader's stored options and kept after the call

        Returns:
            The parsed Document
        """
        document = Document()

        # Options accumulate on the reader and stay visible via get_option().
        self._options.update(options)

        # Compare against None rather than relying on truthiness, so that a
        # reader object which happens to be falsy (e.g. defines __len__ or
        # __bool__) is not silently skipped.
        if self._metadata_reader is not None:
            self._metadata_reader.extract_metadata(source, document)

        if self._structure_reader is not None:
            self._structure_reader.extract_structure(source, document)

        if self._content_reader is not None:
            self._content_reader.extract_content(source, document)

        if self._resource_reader is not None:
            self._resource_reader.extract_resources(source, document)

        return document

View File

@ -1,5 +0,0 @@
"""
Layout options for the pyWebLayout library.
This module provides layout-related functionality.
"""

View File

@ -1,302 +0,0 @@
"""
Tests for pyWebLayout.io.readers.base module.
Tests the base reader classes and their functionality.
"""
import pytest
from pyWebLayout.io.readers.base import (
BaseReader,
MetadataReader,
StructureReader,
ContentReader,
ResourceReader,
CompositeReader
)
from pyWebLayout.abstract.document import Document
# Concrete implementations for testing
class ConcreteBaseReader(BaseReader):
    """Test implementation of BaseReader."""

    def can_read(self, source):
        """Accept only string sources whose name ends in ``.test``."""
        return isinstance(source, str) and source.endswith('.test')

    def read(self, source, **options):
        """Return a new Document whose 'source' metadata is ``source``."""
        doc = Document()
        doc.set_metadata('source', source)
        return doc
class ConcreteMetadataReader(MetadataReader):
    """Test implementation of MetadataReader."""

    def extract_metadata(self, source, document):
        """Store a fixed title and author on ``document`` and return them."""
        metadata = {
            'title': 'Test Title',
            'author': 'Test Author',
        }
        # Dicts preserve insertion order, so title is set before author.
        for key, value in metadata.items():
            document.set_metadata(key, value)
        return metadata
class ConcreteStructureReader(StructureReader):
    """Test implementation of StructureReader."""

    def extract_structure(self, source, document):
        """Return two fixed heading names; ``document`` is left untouched."""
        return ['heading1', 'heading2']
class ConcreteContentReader(ContentReader):
    """Test implementation of ContentReader."""

    def extract_content(self, source, document):
        """Return a fixed content string; ``document`` is left untouched."""
        return "Test content"
class ConcreteResourceReader(ResourceReader):
    """Test implementation of ResourceReader."""

    def extract_resources(self, source, document):
        """Add two fixed fake resources to ``document`` and return them."""
        resources = {
            'image1.png': b'fake image data',
            'style.css': 'fake css',
        }
        for name, data in resources.items():
            document.add_resource(name, data)
        return resources
class ConcreteCompositeReader(CompositeReader):
    """Test implementation of CompositeReader."""

    def can_read(self, source):
        """Accept every source; defined so the class can be instantiated."""
        return True
# Test Cases
class TestBaseReaderOptions:
    """Test BaseReader options functionality."""

    def test_set_and_get_option(self):
        """Test setting and getting options."""
        reader = ConcreteBaseReader()
        reader.set_option('font_size', 12)
        assert reader.get_option('font_size') == 12

    def test_get_option_with_default(self):
        """Test getting option with default value."""
        reader = ConcreteBaseReader()
        assert reader.get_option('nonexistent', 'default_value') == 'default_value'

    def test_get_option_without_default(self):
        """Test getting nonexistent option without default."""
        reader = ConcreteBaseReader()
        assert reader.get_option('nonexistent') is None

    def test_multiple_options(self):
        """Test that several options can be stored side by side."""
        reader = ConcreteBaseReader()
        reader.set_option('font_size', 12)
        reader.set_option('line_height', 1.5)
        reader.set_option('color', 'black')

        assert reader.get_option('font_size') == 12
        assert reader.get_option('line_height') == 1.5
        assert reader.get_option('color') == 'black'
class TestBaseReaderConcrete:
    """Test concrete BaseReader implementation."""

    def test_can_read_valid_source(self):
        """Test can_read with valid source."""
        reader = ConcreteBaseReader()
        assert reader.can_read('document.test') is True

    def test_can_read_invalid_source(self):
        """Test can_read with invalid source."""
        reader = ConcreteBaseReader()
        assert reader.can_read('document.html') is False

    def test_read_creates_document(self):
        """Test read creates a Document carrying the source as metadata."""
        reader = ConcreteBaseReader()
        doc = reader.read('test.test')

        assert isinstance(doc, Document)
        assert doc.get_metadata('source') == 'test.test'
class TestMetadataReaderConcrete:
    """Test concrete MetadataReader implementation."""

    def test_extract_metadata(self):
        """Test metadata is both returned and written to the document."""
        reader = ConcreteMetadataReader()
        doc = Document()

        metadata = reader.extract_metadata('source', doc)

        # Returned dictionary
        assert metadata['title'] == 'Test Title'
        assert metadata['author'] == 'Test Author'
        # Side effect on the document
        assert doc.get_metadata('title') == 'Test Title'
        assert doc.get_metadata('author') == 'Test Author'
class TestStructureReaderConcrete:
    """Test concrete StructureReader implementation."""

    def test_extract_structure(self):
        """Test structure extraction."""
        reader = ConcreteStructureReader()
        doc = Document()

        structure = reader.extract_structure('source', doc)

        assert isinstance(structure, list)
        # One equality check covers the length and both elements.
        assert structure == ['heading1', 'heading2']
class TestContentReaderConcrete:
    """Test concrete ContentReader implementation."""

    def test_extract_content(self):
        """Test content extraction."""
        reader = ConcreteContentReader()
        doc = Document()

        content = reader.extract_content('source', doc)

        assert content == "Test content"
class TestResourceReaderConcrete:
    """Test concrete ResourceReader implementation."""

    def test_extract_resources(self):
        """Test resources are both returned and added to the document."""
        reader = ConcreteResourceReader()
        doc = Document()

        resources = reader.extract_resources('source', doc)

        # Returned dictionary
        assert isinstance(resources, dict)
        assert 'image1.png' in resources
        assert 'style.css' in resources
        # Side effect on the document
        assert doc.get_resource('image1.png') == b'fake image data'
        assert doc.get_resource('style.css') == 'fake css'
class TestCompositeReader:
    """Test CompositeReader functionality."""

    def test_initialization(self):
        """Test composite reader starts without any sub-readers."""
        reader = ConcreteCompositeReader()

        assert reader._metadata_reader is None
        assert reader._structure_reader is None
        assert reader._content_reader is None
        assert reader._resource_reader is None

    def test_set_metadata_reader(self):
        """Test setting metadata reader."""
        reader = ConcreteCompositeReader()
        metadata_reader = ConcreteMetadataReader()

        reader.set_metadata_reader(metadata_reader)

        assert reader._metadata_reader is metadata_reader

    def test_set_structure_reader(self):
        """Test setting structure reader."""
        reader = ConcreteCompositeReader()
        structure_reader = ConcreteStructureReader()

        reader.set_structure_reader(structure_reader)

        assert reader._structure_reader is structure_reader

    def test_set_content_reader(self):
        """Test setting content reader."""
        reader = ConcreteCompositeReader()
        content_reader = ConcreteContentReader()

        reader.set_content_reader(content_reader)

        assert reader._content_reader is content_reader

    def test_set_resource_reader(self):
        """Test setting resource reader."""
        reader = ConcreteCompositeReader()
        resource_reader = ConcreteResourceReader()

        reader.set_resource_reader(resource_reader)

        assert reader._resource_reader is resource_reader

    def test_read_with_all_readers(self):
        """Test reading with all readers configured."""
        reader = ConcreteCompositeReader()
        reader.set_metadata_reader(ConcreteMetadataReader())
        reader.set_structure_reader(ConcreteStructureReader())
        reader.set_content_reader(ConcreteContentReader())
        reader.set_resource_reader(ConcreteResourceReader())

        doc = reader.read('test_source')

        # Verify metadata was extracted
        assert doc.get_metadata('title') == 'Test Title'
        assert doc.get_metadata('author') == 'Test Author'
        # Verify resources were extracted
        assert doc.get_resource('image1.png') == b'fake image data'
        assert doc.get_resource('style.css') == 'fake css'

    def test_read_with_no_readers(self):
        """Test reading with no readers configured."""
        reader = ConcreteCompositeReader()

        doc = reader.read('test_source')

        # Should create an empty document
        assert isinstance(doc, Document)

    def test_read_with_only_metadata_reader(self):
        """Test reading with only metadata reader."""
        reader = ConcreteCompositeReader()
        reader.set_metadata_reader(ConcreteMetadataReader())

        doc = reader.read('test_source')

        assert doc.get_metadata('title') == 'Test Title'

    def test_read_with_options(self):
        """Test that keyword options passed to read() are stored."""
        reader = ConcreteCompositeReader()
        reader.set_metadata_reader(ConcreteMetadataReader())

        doc = reader.read('test_source', font_size=14, encoding='utf-8')

        # The result was previously bound but never checked.
        assert isinstance(doc, Document)
        # Verify options were stored
        assert reader.get_option('font_size') == 14
        assert reader.get_option('encoding') == 'utf-8'

    def test_can_read_implemented(self):
        """Test that can_read is implemented in ConcreteCompositeReader."""
        reader = ConcreteCompositeReader()
        assert reader.can_read('test_source') is True
class TestCompositeReaderIntegration:
    """Integration tests for CompositeReader."""

    def test_full_document_reading_workflow(self):
        """Test complete document reading workflow."""
        # Create and configure composite reader
        reader = ConcreteCompositeReader()
        reader.set_metadata_reader(ConcreteMetadataReader())
        reader.set_structure_reader(ConcreteStructureReader())
        reader.set_content_reader(ConcreteContentReader())
        reader.set_resource_reader(ConcreteResourceReader())

        # Read document with options
        doc = reader.read('complex_document.test', font_size=16, page_width=800)

        # Verify all components worked together
        assert doc.get_metadata('title') == 'Test Title'
        assert doc.get_metadata('author') == 'Test Author'
        assert doc.get_resource('image1.png') is not None
        assert reader.get_option('font_size') == 16
        assert reader.get_option('page_width') == 800