This commit is contained in:
parent
84229ad4da
commit
1bd9fdb551
@ -1,12 +0,0 @@
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Put the parent directory of this file's directory at the front of
# sys.path so the script can be executed directly.
sys.path.insert(
    0,
    os.path.abspath(
        os.path.join(os.path.dirname(__file__), '..')
    ),
)

# Imported only after the path entry above has been added.
from pyWebLayout.example import save_examples


if __name__ == "__main__":
    print("Running PyWebLayout examples...")
    save_examples()
|
||||
@ -3,13 +3,7 @@ Input/Output module for pyWebLayout.
|
||||
|
||||
This package provides functionality for reading and writing various file formats,
|
||||
including HTML, EPUB, and other document formats.
|
||||
|
||||
The module uses a decomposed architecture with specialized readers for different
|
||||
aspects of document parsing (metadata, content, resources), following the same
|
||||
pattern as the abstract module.
|
||||
"""
|
||||
|
||||
# Legacy readers (for backward compatibility)
|
||||
# Legacy functions provided by new HTML reader for backward compatibility
|
||||
|
||||
# Readers
|
||||
from pyWebLayout.io.readers.epub_reader import EPUBReader
|
||||
|
||||
@ -1,30 +1,17 @@
|
||||
"""
|
||||
Readers module for pyWebLayout.
|
||||
|
||||
This module provides specialized readers for different document formats
|
||||
using a decomposed architecture pattern.
|
||||
This module provides specialized readers for different document formats.
|
||||
"""
|
||||
|
||||
# Base classes for the decomposed architecture
|
||||
from .base import BaseReader, MetadataReader, ContentReader, ResourceReader, CompositeReader
|
||||
|
||||
# HTML readers (decomposed)
|
||||
|
||||
|
||||
|
||||
|
||||
# EPUB readers
|
||||
from .epub_reader import read_epub # Legacy
|
||||
|
||||
|
||||
__all__ = [
|
||||
# Base classes
|
||||
'BaseReader', 'MetadataReader', 'ContentReader', 'ResourceReader', 'CompositeReader',
|
||||
|
||||
# HTML readers
|
||||
'HTMLReader', 'read_html', 'read_html_file', 'parse_html_string',
|
||||
'HTMLMetadataReader', 'HTMLResourceReader',
|
||||
|
||||
'read_html', 'read_html_file', 'parse_html_string',
|
||||
|
||||
# EPUB readers
|
||||
'read_epub', 'EPUBMetadataReader',
|
||||
'read_epub',
|
||||
]
|
||||
|
||||
@ -1,229 +0,0 @@
|
||||
"""
|
||||
Base classes for document readers in pyWebLayout.
|
||||
|
||||
This module provides the foundational classes that all readers inherit from,
|
||||
similar to how the abstract module provides base classes for document elements.
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
from pyWebLayout.abstract.document import Document
|
||||
|
||||
|
||||
class BaseReader(ABC):
    """
    Abstract interface shared by all document readers.

    Concrete readers have to provide ``can_read`` and ``read``. A simple
    key/value option store is implemented here.
    """

    def __init__(self):
        """Start with no document and an empty option table."""
        self._document = None
        self._options = {}

    @abstractmethod
    def can_read(self, source: Union[str, bytes]) -> bool:
        """
        Report whether this reader is able to handle ``source``.

        Args:
            source: The source to check (file path, URL, or content)

        Returns:
            True if this reader can handle the source, False otherwise
        """

    @abstractmethod
    def read(self, source: Union[str, bytes], **options) -> Document:
        """
        Parse ``source`` and return the resulting Document.

        Args:
            source: The source to read (file path, URL, or content)
            **options: Additional options for reading

        Returns:
            The parsed Document
        """

    def set_option(self, key: str, value: Any):
        """
        Store ``value`` under the option name ``key``.

        An existing value for the same key is overwritten.

        Args:
            key: The option name
            value: The option value
        """
        self._options.update({key: value})

    def get_option(self, key: str, default: Any = None) -> Any:
        """
        Look up a previously stored option.

        Args:
            key: The option name
            default: Value handed back when the option was never set

        Returns:
            The stored option value, or ``default`` when the key is absent
        """
        try:
            return self._options[key]
        except KeyError:
            return default
|
||||
|
||||
|
||||
class MetadataReader(ABC):
    """
    Abstract interface for the metadata part of a reader.

    Implementations extract document metadata such as title or author.
    """

    @abstractmethod
    def extract_metadata(self, source: Any, document: Document) -> Dict[str, Any]:
        """
        Pull metadata out of ``source``.

        Args:
            source: The source data
            document: The document to populate with metadata

        Returns:
            Dictionary of extracted metadata
        """
|
||||
|
||||
|
||||
class StructureReader(ABC):
    """
    Abstract interface for the structure part of a reader.

    Implementations extract document structure such as headings or sections.
    """

    @abstractmethod
    def extract_structure(self, source: Any, document: Document) -> List[Any]:
        """
        Pull structure information out of ``source``.

        Args:
            source: The source data
            document: The document to populate with structure

        Returns:
            List of structural elements
        """
|
||||
|
||||
|
||||
class ContentReader(ABC):
    """
    Abstract interface for the content part of a reader.

    Implementations extract document content such as text or formatting.
    """

    @abstractmethod
    def extract_content(self, source: Any, document: Document) -> Any:
        """
        Pull content out of ``source``.

        Args:
            source: The source data
            document: The document to populate with content

        Returns:
            The extracted content
        """
|
||||
|
||||
|
||||
class ResourceReader(ABC):
    """
    Abstract interface for the resource part of a reader.

    Implementations extract document resources such as images or stylesheets.
    """

    @abstractmethod
    def extract_resources(self, source: Any, document: Document) -> Dict[str, Any]:
        """
        Pull resources out of ``source``.

        Args:
            source: The source data
            document: The document to populate with resources

        Returns:
            Dictionary of extracted resources
        """
|
||||
|
||||
|
||||
class CompositeReader(BaseReader):
    """
    Reader assembled from several specialized readers.

    Metadata, structure, content and resource readers are plugged in
    through the ``set_*_reader`` methods and run in that order by
    :meth:`read`. ``can_read`` is not implemented here, so the class
    stays abstract.
    """

    def __init__(self):
        """Create a composite reader with no specialized readers attached."""
        super().__init__()
        self._metadata_reader: Optional[MetadataReader] = None
        self._structure_reader: Optional[StructureReader] = None
        self._content_reader: Optional[ContentReader] = None
        self._resource_reader: Optional[ResourceReader] = None

    def set_metadata_reader(self, reader: MetadataReader):
        """Attach the reader used for the metadata step."""
        self._metadata_reader = reader

    def set_structure_reader(self, reader: StructureReader):
        """Attach the reader used for the structure step."""
        self._structure_reader = reader

    def set_content_reader(self, reader: ContentReader):
        """Attach the reader used for the content step."""
        self._content_reader = reader

    def set_resource_reader(self, reader: ResourceReader):
        """Attach the reader used for the resource step."""
        self._resource_reader = reader

    def read(self, source: Union[str, bytes], **options) -> Document:
        """
        Run every attached reader against ``source``.

        A fresh Document is created, ``options`` are merged into the
        reader's stored options, and each attached reader is then invoked
        with ``(source, document)``. Steps without a reader are skipped.
        The values returned by the individual readers are discarded.

        Args:
            source: The source to read
            **options: Additional options for reading

        Returns:
            The parsed Document
        """
        document = Document()

        # Options passed here are kept on the reader after the call.
        self._options.update(options)

        # (attribute holding the reader, method to call), in execution order.
        steps = (
            ('_metadata_reader', 'extract_metadata'),
            ('_structure_reader', 'extract_structure'),
            ('_content_reader', 'extract_content'),
            ('_resource_reader', 'extract_resources'),
        )
        for slot, method_name in steps:
            # Looked up per step, right before it would run.
            part = getattr(self, slot)
            if part:
                getattr(part, method_name)(source, document)

        return document
|
||||
@ -1,5 +0,0 @@
|
||||
"""
|
||||
Layout options for the pyWebLayout library.
|
||||
|
||||
This module provides layout-related functionality.
|
||||
"""
|
||||
@ -1,302 +0,0 @@
|
||||
"""
|
||||
Tests for pyWebLayout.io.readers.base module.
|
||||
|
||||
Tests the base reader classes and their functionality.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from pyWebLayout.io.readers.base import (
|
||||
BaseReader,
|
||||
MetadataReader,
|
||||
StructureReader,
|
||||
ContentReader,
|
||||
ResourceReader,
|
||||
CompositeReader
|
||||
)
|
||||
from pyWebLayout.abstract.document import Document
|
||||
|
||||
|
||||
# Concrete implementations for testing
|
||||
|
||||
class ConcreteBaseReader(BaseReader):
    """Minimal BaseReader subclass used as a test fixture."""

    def can_read(self, source):
        """Accept only string sources ending in ``.test``."""
        if not isinstance(source, str):
            return False
        return source.endswith('.test')

    def read(self, source, **options):
        """Return a new Document with ``source`` stored as metadata."""
        document = Document()
        document.set_metadata('source', source)
        return document
|
||||
|
||||
|
||||
class ConcreteMetadataReader(MetadataReader):
    """MetadataReader fixture that always reports a fixed title and author."""

    def extract_metadata(self, source, document):
        """Write the fixed title/author onto ``document`` and return them."""
        extracted = dict(title='Test Title', author='Test Author')
        for field in ('title', 'author'):
            document.set_metadata(field, extracted[field])
        return extracted
|
||||
|
||||
|
||||
class ConcreteStructureReader(StructureReader):
    """StructureReader fixture that returns two fixed heading names."""

    def extract_structure(self, source, document):
        """Return the fixed heading list; ``document`` is left untouched."""
        headings = ['heading1', 'heading2']
        return headings
|
||||
|
||||
|
||||
class ConcreteContentReader(ContentReader):
    """ContentReader fixture that returns a fixed string."""

    def extract_content(self, source, document):
        """Return the fixed content string; ``document`` is left untouched."""
        content = "Test content"
        return content
|
||||
|
||||
|
||||
class ConcreteResourceReader(ResourceReader):
    """ResourceReader fixture that registers two fake resources."""

    def extract_resources(self, source, document):
        """Add each fake resource to ``document`` and return the mapping."""
        bundle = {
            'image1.png': b'fake image data',
            'style.css': 'fake css',
        }
        for resource_name in bundle:
            document.add_resource(resource_name, bundle[resource_name])
        return bundle
|
||||
|
||||
|
||||
class ConcreteCompositeReader(CompositeReader):
    """CompositeReader fixture that supplies the missing ``can_read``."""

    def can_read(self, source):
        """Accept every source."""
        return True
|
||||
|
||||
|
||||
# Test Cases
|
||||
|
||||
class TestBaseReaderOptions:
    """Checks for the option store provided by BaseReader."""

    def test_set_and_get_option(self):
        """A stored option can be read back."""
        subject = ConcreteBaseReader()
        subject.set_option('font_size', 12)
        stored = subject.get_option('font_size')
        assert stored == 12

    def test_get_option_with_default(self):
        """An unknown option yields the supplied default."""
        subject = ConcreteBaseReader()
        fallback = subject.get_option('nonexistent', 'default_value')
        assert fallback == 'default_value'

    def test_get_option_without_default(self):
        """An unknown option without a default yields None."""
        subject = ConcreteBaseReader()
        missing = subject.get_option('nonexistent')
        assert missing is None

    def test_multiple_options(self):
        """Several options can be stored side by side."""
        subject = ConcreteBaseReader()
        expected = {
            'font_size': 12,
            'line_height': 1.5,
            'color': 'black',
        }
        for option_name, option_value in expected.items():
            subject.set_option(option_name, option_value)

        for option_name, option_value in expected.items():
            assert subject.get_option(option_name) == option_value
|
||||
|
||||
|
||||
class TestBaseReaderConcrete:
    """Checks for the ConcreteBaseReader fixture."""

    def test_can_read_valid_source(self):
        """A ``.test`` path is accepted."""
        accepted = ConcreteBaseReader().can_read('document.test')
        assert accepted is True

    def test_can_read_invalid_source(self):
        """A path with another extension is rejected."""
        accepted = ConcreteBaseReader().can_read('document.html')
        assert accepted is False

    def test_read_creates_document(self):
        """``read`` returns a Document carrying the source as metadata."""
        subject = ConcreteBaseReader()
        result = subject.read('test.test')
        assert isinstance(result, Document)
        assert result.get_metadata('source') == 'test.test'
|
||||
|
||||
|
||||
class TestMetadataReaderConcrete:
    """Checks for the ConcreteMetadataReader fixture."""

    def test_extract_metadata(self):
        """Metadata is both returned and written onto the document."""
        target = Document()
        extracted = ConcreteMetadataReader().extract_metadata('source', target)

        expected = (('title', 'Test Title'), ('author', 'Test Author'))
        for field, value in expected:
            assert extracted[field] == value
        for field, value in expected:
            assert target.get_metadata(field) == value
|
||||
|
||||
|
||||
class TestStructureReaderConcrete:
    """Checks for the ConcreteStructureReader fixture."""

    def test_extract_structure(self):
        """The fixture returns a two-element list of heading names."""
        target = Document()
        outline = ConcreteStructureReader().extract_structure('source', target)

        assert isinstance(outline, list)
        assert len(outline) == 2
        first, second = outline[0], outline[1]
        assert first == 'heading1'
        assert second == 'heading2'
|
||||
|
||||
|
||||
class TestContentReaderConcrete:
    """Checks for the ConcreteContentReader fixture."""

    def test_extract_content(self):
        """The fixture returns its fixed content string."""
        target = Document()
        extracted = ConcreteContentReader().extract_content('source', target)

        assert extracted == "Test content"
|
||||
|
||||
|
||||
class TestResourceReaderConcrete:
    """Checks for the ConcreteResourceReader fixture."""

    def test_extract_resources(self):
        """Resources are both returned and registered on the document."""
        target = Document()
        bundle = ConcreteResourceReader().extract_resources('source', target)

        assert isinstance(bundle, dict)
        for resource_name in ('image1.png', 'style.css'):
            assert resource_name in bundle
        assert target.get_resource('image1.png') == b'fake image data'
        assert target.get_resource('style.css') == 'fake css'
|
||||
|
||||
|
||||
class TestCompositeReader:
    """Checks for CompositeReader via the ConcreteCompositeReader fixture."""

    def test_initialization(self):
        """A new composite reader has no specialized readers attached."""
        composite = ConcreteCompositeReader()
        slots = (
            '_metadata_reader',
            '_structure_reader',
            '_content_reader',
            '_resource_reader',
        )
        for slot in slots:
            assert getattr(composite, slot) is None

    def test_set_metadata_reader(self):
        """The metadata setter stores the exact object passed in."""
        composite = ConcreteCompositeReader()
        part = ConcreteMetadataReader()
        composite.set_metadata_reader(part)
        assert composite._metadata_reader is part

    def test_set_structure_reader(self):
        """The structure setter stores the exact object passed in."""
        composite = ConcreteCompositeReader()
        part = ConcreteStructureReader()
        composite.set_structure_reader(part)
        assert composite._structure_reader is part

    def test_set_content_reader(self):
        """The content setter stores the exact object passed in."""
        composite = ConcreteCompositeReader()
        part = ConcreteContentReader()
        composite.set_content_reader(part)
        assert composite._content_reader is part

    def test_set_resource_reader(self):
        """The resource setter stores the exact object passed in."""
        composite = ConcreteCompositeReader()
        part = ConcreteResourceReader()
        composite.set_resource_reader(part)
        assert composite._resource_reader is part

    def test_read_with_all_readers(self):
        """With every reader attached, metadata and resources reach the document."""
        composite = ConcreteCompositeReader()
        composite.set_metadata_reader(ConcreteMetadataReader())
        composite.set_structure_reader(ConcreteStructureReader())
        composite.set_content_reader(ConcreteContentReader())
        composite.set_resource_reader(ConcreteResourceReader())

        result = composite.read('test_source')

        # Metadata written by the metadata reader
        assert result.get_metadata('title') == 'Test Title'
        assert result.get_metadata('author') == 'Test Author'

        # Resources registered by the resource reader
        assert result.get_resource('image1.png') == b'fake image data'
        assert result.get_resource('style.css') == 'fake css'

    def test_read_with_no_readers(self):
        """Without any reader attached, ``read`` still returns a Document."""
        result = ConcreteCompositeReader().read('test_source')

        assert isinstance(result, Document)

    def test_read_with_only_metadata_reader(self):
        """A lone metadata reader is enough to populate the title."""
        composite = ConcreteCompositeReader()
        composite.set_metadata_reader(ConcreteMetadataReader())

        result = composite.read('test_source')
        assert result.get_metadata('title') == 'Test Title'

    def test_read_with_options(self):
        """Keyword options given to ``read`` are kept on the reader."""
        composite = ConcreteCompositeReader()
        composite.set_metadata_reader(ConcreteMetadataReader())

        composite.read('test_source', font_size=14, encoding='utf-8')

        # Options remain retrievable after the call
        assert composite.get_option('font_size') == 14
        assert composite.get_option('encoding') == 'utf-8'

    def test_can_read_implemented(self):
        """The fixture's ``can_read`` accepts any source."""
        verdict = ConcreteCompositeReader().can_read('test_source')
        assert verdict is True
|
||||
|
||||
|
||||
class TestCompositeReaderIntegration:
    """End-to-end check of CompositeReader with all fixtures attached."""

    def test_full_document_reading_workflow(self):
        """Attach every fixture reader, read once, and inspect the outcome."""
        # Build the fully configured composite reader
        composite = ConcreteCompositeReader()
        composite.set_metadata_reader(ConcreteMetadataReader())
        composite.set_structure_reader(ConcreteStructureReader())
        composite.set_content_reader(ConcreteContentReader())
        composite.set_resource_reader(ConcreteResourceReader())

        # Read with a couple of keyword options
        result = composite.read(
            'complex_document.test',
            font_size=16,
            page_width=800,
        )

        # Document side: metadata and resources
        assert result.get_metadata('title') == 'Test Title'
        assert result.get_metadata('author') == 'Test Author'
        assert result.get_resource('image1.png') is not None

        # Reader side: options kept from the call
        assert composite.get_option('font_size') == 16
        assert composite.get_option('page_width') == 800
|
||||
Loading…
x
Reference in New Issue
Block a user