This commit is contained in:
parent
2b1170cac7
commit
28c7b6700b
@@ -35,8 +35,3 @@ from pyWebLayout.concrete.page import Container, Page
from pyWebLayout.abstract.inline import Word


# IO functionality (reading and writing)
from pyWebLayout.io import (
    parse_html, html_to_document,  # HTML parsing
    read_epub  # EPUB reading
)
@@ -11,61 +11,5 @@ pattern as the abstract module.

# Legacy readers (for backward compatibility)
# Legacy functions provided by new HTML reader for backward compatibility
from pyWebLayout.io.readers.html import parse_html_string as parse_html
from pyWebLayout.io.readers.html import read_html_file as html_to_document
from pyWebLayout.io.readers.epub_reader import read_epub

# New decomposed readers
from pyWebLayout.io.readers.html import HTMLReader, read_html, read_html_file, parse_html_string
from pyWebLayout.io.readers.base import BaseReader, MetadataReader, ContentReader, ResourceReader, CompositeReader

# Specialized HTML readers
from pyWebLayout.io.readers.html_metadata import HTMLMetadataReader
from pyWebLayout.io.readers.html_resources import HTMLResourceReader

# HTML extraction parser (the best approach)
from pyWebLayout.io.readers.html_extraction import parse_html_string as parse_html_extraction

# Specialized EPUB readers
from pyWebLayout.io.readers.epub_metadata import EPUBMetadataReader


# Convenience functions using the new architecture
def read_document(source, format_hint=None, **options):
    """
    Read a document using the appropriate reader based on format detection.

    Args:
        source: The source to read (file path, URL, or content)
        format_hint: Optional hint about the format ('html', 'epub', etc.)
        **options: Additional options for reading

    Returns:
        Document: The parsed document
    """
    if format_hint == 'html' or (not format_hint and _is_html_source(source)):
        reader = HTMLReader()
        return reader.read(source, **options)
    elif format_hint == 'epub' or (not format_hint and _is_epub_source(source)):
        # Use legacy EPUB reader for now
        return read_epub(source)
    else:
        # Try HTML reader as fallback
        try:
            reader = HTMLReader()
            if reader.can_read(source):
                return reader.read(source, **options)
        except:
            pass

        raise ValueError(f"Cannot determine format for source: {source}")


def _is_html_source(source):
    """Check if source appears to be HTML."""
    reader = HTMLReader()
    return reader.can_read(source)


def _is_epub_source(source):
    """Check if source appears to be EPUB."""
    if isinstance(source, str):
        return source.lower().endswith('.epub')
    return False
from pyWebLayout.io.readers.epub_reader import EPUBReader
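For reference, a minimal usage sketch of the read_document helper removed in this hunk (not part of the commit; sample.html and sample.epub are placeholder file names):

from pyWebLayout.io import read_document

# An explicit hint skips detection; otherwise HTMLReader.can_read() and the
# .epub suffix check decide which reader handles the source.
doc = read_document("sample.html")
book = read_document("sample.epub", format_hint="epub")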
@@ -9,14 +9,13 @@ using a decomposed architecture pattern.
from .base import BaseReader, MetadataReader, ContentReader, ResourceReader, CompositeReader

# HTML readers (decomposed)
from .html import HTMLReader, read_html, read_html_file, parse_html_string
from .html_metadata import HTMLMetadataReader
from .html_resources import HTMLResourceReader


# EPUB readers
from .epub_reader import read_epub  # Legacy
from .epub_metadata import EPUBMetadataReader  # New decomposed


__all__ = [
    # Base classes
@@ -1,352 +0,0 @@
"""
EPUB metadata reader for pyWebLayout.

This module provides specialized functionality for extracting metadata
from EPUB documents, following the decomposed architecture pattern.
"""

import os
import zipfile
import tempfile
from typing import Dict, Any, Optional, List
import xml.etree.ElementTree as ET
from pyWebLayout.abstract.document import Document, MetadataType
from pyWebLayout.io.readers.base import MetadataReader


# XML namespaces used in EPUB files
NAMESPACES = {
    'opf': 'http://www.idpf.org/2007/opf',
    'dc': 'http://purl.org/dc/elements/1.1/',
    'dcterms': 'http://purl.org/dc/terms/',
}


class EPUBMetadataReader(MetadataReader):
    """
    Specialized reader for extracting metadata from EPUB documents.

    This class handles EPUB package document metadata including
    Dublin Core elements and custom metadata.
    """

    def __init__(self):
        """Initialize the EPUB metadata reader."""
        self._metadata = {}
        self._temp_dir = None
        self._package_path = None

    def extract_metadata(self, epub_path: str, document: Document) -> Dict[str, Any]:
        """
        Extract metadata from EPUB file.

        Args:
            epub_path: Path to the EPUB file
            document: The document to populate with metadata

        Returns:
            Dictionary of extracted metadata
        """
        # Reset internal state
        self._reset()

        try:
            # Extract EPUB to temporary directory
            self._extract_epub(epub_path)

            # Find and parse package document
            self._find_package_document()

            if self._package_path:
                self._parse_package_metadata()

            # Populate document with extracted metadata
            self._populate_document(document)

            return self._metadata

        finally:
            # Clean up temporary files
            self._cleanup()

    def _reset(self):
        """Reset internal state for a new extraction."""
        self._metadata = {}
        self._temp_dir = None
        self._package_path = None

    def _extract_epub(self, epub_path: str):
        """
        Extract EPUB file to temporary directory.

        Args:
            epub_path: Path to the EPUB file
        """
        self._temp_dir = tempfile.mkdtemp()

        with zipfile.ZipFile(epub_path, 'r') as zip_ref:
            zip_ref.extractall(self._temp_dir)

    def _find_package_document(self):
        """Find the package document (content.opf) in the extracted EPUB."""
        # First, try to find it via META-INF/container.xml
        container_path = os.path.join(self._temp_dir, 'META-INF', 'container.xml')

        if os.path.exists(container_path):
            try:
                tree = ET.parse(container_path)
                root = tree.getroot()

                # Find rootfile element
                for rootfile in root.findall('.//{urn:oasis:names:tc:opendocument:xmlns:container}rootfile'):
                    full_path = rootfile.get('full-path')
                    if full_path:
                        self._package_path = os.path.join(self._temp_dir, full_path)
                        if os.path.exists(self._package_path):
                            return
            except ET.ParseError:
                pass

        # Fallback: search for .opf files
        for root, dirs, files in os.walk(self._temp_dir):
            for file in files:
                if file.endswith('.opf'):
                    self._package_path = os.path.join(root, file)
                    return

    def _parse_package_metadata(self):
        """Parse metadata from the package document."""
        if not self._package_path or not os.path.exists(self._package_path):
            return

        try:
            tree = ET.parse(self._package_path)
            root = tree.getroot()

            # Find metadata element
            metadata_elem = root.find('.//{{{0}}}metadata'.format(NAMESPACES['opf']))
            if metadata_elem is None:
                return

            # Parse Dublin Core metadata
            self._parse_dublin_core(metadata_elem)

            # Parse OPF-specific metadata
            self._parse_opf_metadata(metadata_elem)

        except ET.ParseError as e:
            print(f"Error parsing package document: {e}")

    def _parse_dublin_core(self, metadata_elem: ET.Element):
        """
        Parse Dublin Core metadata elements.

        Args:
            metadata_elem: The metadata XML element
        """
        dc_elements = {
            'title': 'title',
            'creator': 'creator',
            'subject': 'subject',
            'description': 'description',
            'publisher': 'publisher',
            'contributor': 'contributor',
            'date': 'date',
            'type': 'type',
            'format': 'format',
            'identifier': 'identifier',
            'source': 'source',
            'language': 'language',
            'relation': 'relation',
            'coverage': 'coverage',
            'rights': 'rights'
        }

        for dc_name, meta_key in dc_elements.items():
            elements = metadata_elem.findall('.//{{{0}}}{1}'.format(NAMESPACES['dc'], dc_name))

            if elements:
                if len(elements) == 1:
                    # Single element
                    text = elements[0].text
                    if text:
                        self._metadata[meta_key] = text.strip()

                    # Handle special attributes
                    elem = elements[0]
                    if dc_name == 'creator':
                        # Check for role attribute
                        role = elem.get('{{{0}}}role'.format(NAMESPACES['opf']))
                        if role:
                            self._metadata[f'{meta_key}_role'] = role

                        # Check for file-as attribute for sorting
                        file_as = elem.get('{{{0}}}file-as'.format(NAMESPACES['opf']))
                        if file_as:
                            self._metadata[f'{meta_key}_file_as'] = file_as

                    elif dc_name == 'identifier':
                        # Check for scheme (ISBN, DOI, etc.)
                        scheme = elem.get('{{{0}}}scheme'.format(NAMESPACES['opf']))
                        if scheme:
                            self._metadata[f'{meta_key}_scheme'] = scheme

                        # Check if this is the unique identifier
                        id_attr = elem.get('id')
                        if id_attr:
                            self._metadata[f'{meta_key}_id'] = id_attr

                    elif dc_name == 'date':
                        # Check for event type
                        event = elem.get('{{{0}}}event'.format(NAMESPACES['opf']))
                        if event:
                            self._metadata[f'{meta_key}_event'] = event

                else:
                    # Multiple elements - store as list
                    values = []
                    for elem in elements:
                        if elem.text:
                            values.append(elem.text.strip())

                    if values:
                        self._metadata[meta_key] = values

    def _parse_opf_metadata(self, metadata_elem: ET.Element):
        """
        Parse OPF-specific metadata elements.

        Args:
            metadata_elem: The metadata XML element
        """
        # Parse meta elements
        meta_elements = metadata_elem.findall('.//{{{0}}}meta'.format(NAMESPACES['opf']))

        for meta in meta_elements:
            name = meta.get('name')
            content = meta.get('content')

            if name and content:
                self._metadata[f'meta_{name}'] = content

        # Parse x-metadata elements (custom metadata)
        x_meta_elements = metadata_elem.findall('.//{{{0}}}x-metadata'.format(NAMESPACES['opf']))

        for x_meta in x_meta_elements:
            for child in x_meta:
                if child.tag and child.text:
                    # Remove namespace prefix for cleaner key names
                    tag_name = child.tag.split('}')[-1] if '}' in child.tag else child.tag
                    self._metadata[f'x_meta_{tag_name}'] = child.text.strip()

    def _populate_document(self, document: Document):
        """
        Populate the document with extracted metadata.

        Args:
            document: The document to populate
        """
        # Map EPUB metadata to document metadata types
        metadata_mapping = {
            'title': MetadataType.TITLE,
            'creator': MetadataType.AUTHOR,
            'description': MetadataType.DESCRIPTION,
            'subject': MetadataType.KEYWORDS,
            'language': MetadataType.LANGUAGE,
            'date': MetadataType.PUBLICATION_DATE,
            'publisher': MetadataType.PUBLISHER,
            'identifier': MetadataType.IDENTIFIER,
        }

        for epub_key, doc_type in metadata_mapping.items():
            if epub_key in self._metadata:
                value = self._metadata[epub_key]

                # Handle list values (like multiple subjects)
                if isinstance(value, list):
                    if epub_key == 'subject':
                        # Join subjects with commas for keywords
                        document.set_metadata(doc_type, ', '.join(value))
                    else:
                        # For other list values, use the first one
                        document.set_metadata(doc_type, value[0])
                else:
                    document.set_metadata(doc_type, value)

        # Handle cover image
        cover_meta = self._metadata.get('meta_cover')
        if cover_meta:
            document.set_metadata(MetadataType.COVER_IMAGE, cover_meta)

        # Store original EPUB metadata for reference
        document.set_metadata(MetadataType.CUSTOM, {
            'epub_metadata': self._metadata
        })

    def _cleanup(self):
        """Clean up temporary files."""
        if self._temp_dir:
            try:
                import shutil
                shutil.rmtree(self._temp_dir, ignore_errors=True)
            except:
                pass
            self._temp_dir = None

    def get_unique_identifier(self) -> Optional[str]:
        """
        Get the unique identifier from the EPUB metadata.

        Returns:
            The unique identifier string, or None if not found
        """
        # Look for identifier with specific ID
        for key, value in self._metadata.items():
            if key.startswith('identifier') and key.endswith('_id'):
                return self._metadata.get('identifier')

        # Fallback to any identifier
        return self._metadata.get('identifier')

    def get_cover_id(self) -> Optional[str]:
        """
        Get the cover image ID from metadata.

        Returns:
            The cover image ID, or None if not found
        """
        return self._metadata.get('meta_cover')

    def get_creators(self) -> List[Dict[str, str]]:
        """
        Get creator information with roles.

        Returns:
            List of creator dictionaries with name, role, and file-as info
        """
        creators = []
        creator_value = self._metadata.get('creator')

        if creator_value:
            if isinstance(creator_value, list):
                # Multiple creators - this is simplified, real implementation
                # would need to correlate with role and file-as attributes
                for creator in creator_value:
                    creators.append({'name': creator})
            else:
                # Single creator
                creator_info = {'name': creator_value}

                # Add role if available
                role = self._metadata.get('creator_role')
                if role:
                    creator_info['role'] = role

                # Add file-as if available
                file_as = self._metadata.get('creator_file_as')
                if file_as:
                    creator_info['file_as'] = file_as

                creators.append(creator_info)

        return creators
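For context, a minimal sketch of how the deleted EPUBMetadataReader was driven (not part of the commit; book.epub is a placeholder, and it assumes Document() can be constructed with no arguments):

from pyWebLayout.abstract.document import Document
from pyWebLayout.io.readers.epub_metadata import EPUBMetadataReader

reader = EPUBMetadataReader()
document = Document()
# extract_metadata() unzips the EPUB to a temp dir, locates content.opf via
# META-INF/container.xml, parses the Dublin Core block, fills the document,
# and returns the raw metadata dict.
meta = reader.extract_metadata("book.epub", document)
print(meta.get("title"), meta.get("creator"))
print(reader.get_creators())  # e.g. [{'name': ..., 'role': ..., 'file_as': ...}]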
@@ -1,186 +0,0 @@
"""
Modern HTML reader for pyWebLayout.

This module provides an HTML reader that uses the html_extraction module
for clean, handler-based parsing using BeautifulSoup.
"""

import os
from typing import Union, Optional
from pyWebLayout.abstract.document import Document
from pyWebLayout.io.readers.base import BaseReader
from pyWebLayout.io.readers.html_metadata import HTMLMetadataReader
from pyWebLayout.io.readers.html_resources import HTMLResourceReader
from pyWebLayout.io.readers.html_extraction import parse_html_string
from pyWebLayout.style import Font


class HTMLReader(BaseReader):
    """
    Modern HTML reader using the html_extraction parser.

    This reader uses the clean, handler-based architecture from html_extraction.py
    for parsing HTML content into pyWebLayout's abstract document structure.
    """

    def __init__(self):
        """Initialize the HTML reader."""
        super().__init__()
        self._metadata_reader = HTMLMetadataReader()
        self._resource_reader = HTMLResourceReader()

    def can_read(self, source: Union[str, bytes]) -> bool:
        """
        Check if this reader can handle the given source.

        Args:
            source: The source to check (file path, URL, or content)

        Returns:
            True if this reader can handle the source, False otherwise
        """
        if isinstance(source, str):
            # Check if it's a file path
            if os.path.isfile(source):
                return source.lower().endswith(('.html', '.htm', '.xhtml'))

            # Check if it's HTML content (very basic check)
            source_lower = source.lower().strip()
            return (source_lower.startswith('<!doctype html') or
                    source_lower.startswith('<html') or
                    '<html' in source_lower[:200])

        elif isinstance(source, bytes):
            # Check if it's HTML content in bytes
            try:
                source_str = source.decode('utf-8', errors='ignore').lower().strip()
                return (source_str.startswith('<!doctype html') or
                        source_str.startswith('<html') or
                        '<html' in source_str[:200])
            except:
                return False

        return False

    def read(self, source: Union[str, bytes], **options) -> Document:
        """
        Read and parse the HTML source into a Document.

        Args:
            source: The HTML source to read (file path, URL, or content)
            **options: Additional options for reading
                - base_url: Base URL for resolving relative links
                - encoding: Character encoding (default: 'utf-8')
                - extract_metadata: Whether to extract metadata (default: True)
                - extract_resources: Whether to extract resources (default: True)
                - base_font: Base font for styling (default: None)

        Returns:
            The parsed Document
        """
        # Get options
        base_url = options.get('base_url')
        encoding = options.get('encoding', 'utf-8')
        extract_metadata = options.get('extract_metadata', True)
        extract_resources = options.get('extract_resources', True)
        base_font = options.get('base_font')

        # Read the HTML content
        html_content = self._read_html_content(source, encoding)

        # Set base URL if not provided and source is a file
        if not base_url and isinstance(source, str) and os.path.isfile(source):
            base_url = f"file://{os.path.dirname(os.path.abspath(source))}/"

        # Create a new document
        document = Document()

        # Extract metadata if enabled
        if extract_metadata and self._metadata_reader:
            self._metadata_reader.extract_metadata(html_content, document)

        # Parse content using html_extraction
        blocks = parse_html_string(html_content, base_font)
        for block in blocks:
            document.add_block(block)

        # Extract resources if enabled
        if extract_resources and self._resource_reader:
            self._resource_reader.extract_resources(html_content, document)

        return document

    def _read_html_content(self, source: Union[str, bytes], encoding: str = 'utf-8') -> str:
        """
        Read HTML content from various sources.

        Args:
            source: The source to read from
            encoding: Character encoding to use

        Returns:
            The HTML content as a string
        """
        if isinstance(source, bytes):
            # Source is already bytes, decode it
            return source.decode(encoding, errors='replace')

        elif isinstance(source, str):
            # Check if it's a file path
            if os.path.isfile(source):
                with open(source, 'r', encoding=encoding, errors='replace') as f:
                    return f.read()
            else:
                # Assume it's HTML content
                return source

        else:
            raise ValueError(f"Unsupported source type: {type(source)}")


def read_html(source: Union[str, bytes], **options) -> Document:
    """
    Convenience function to read HTML content.

    Args:
        source: The HTML source to read (file path, URL, or content)
        **options: Additional options for reading

    Returns:
        The parsed Document
    """
    reader = HTMLReader()
    return reader.read(source, **options)


def read_html_file(file_path: str, **options) -> Document:
    """
    Convenience function to read HTML from a file.

    Args:
        file_path: Path to the HTML file
        **options: Additional options for reading

    Returns:
        The parsed Document
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"HTML file not found: {file_path}")

    reader = HTMLReader()
    return reader.read(file_path, **options)


def parse_html_string(html_content: str, **options) -> Document:
    """
    Convenience function to parse HTML content from a string.

    Args:
        html_content: The HTML content as a string
        **options: Additional options for reading

    Returns:
        The parsed Document
    """
    reader = HTMLReader()
    return reader.read(html_content, **options)
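A short sketch of the deleted HTMLReader convenience path (not part of the commit; page.html is a placeholder file name):

from pyWebLayout.io.readers.html import HTMLReader, read_html_file

# Convenience wrapper: raises FileNotFoundError if the path does not exist.
doc = read_html_file("page.html", extract_resources=False)

# The reader can also sniff raw markup before parsing it.
reader = HTMLReader()
markup = "<html><body><p>hello</p></body></html>"
if reader.can_read(markup):
    doc = reader.read(markup)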
@@ -1,426 +0,0 @@
"""
HTML metadata reader for pyWebLayout.

This module provides specialized functionality for extracting metadata
from HTML documents, following the decomposed architecture pattern.
"""

from typing import Dict, Any, Optional
import re
from pyWebLayout.abstract.document import Document, MetadataType
from pyWebLayout.io.readers.base import MetadataReader


class HTMLMetadataReader(MetadataReader):
    """
    Specialized reader for extracting metadata from HTML documents.

    This class handles HTML meta tags, title elements, and other metadata
    sources like Open Graph tags and JSON-LD structured data.
    """

    def __init__(self):
        """Initialize the HTML metadata reader."""
        self._title = None
        self._meta_tags = {}
        self._og_tags = {}
        self._twitter_tags = {}
        self._json_ld = {}

    def extract_metadata(self, html_content: str, document: Document) -> Dict[str, Any]:
        """
        Extract metadata from HTML content.

        Args:
            html_content: The HTML content to parse
            document: The document to populate with metadata

        Returns:
            Dictionary of extracted metadata
        """
        # Reset internal state
        self._reset()

        # Extract title
        self._extract_title(html_content)

        # Extract meta tags
        self._extract_meta_tags(html_content)

        # Extract Open Graph tags
        self._extract_open_graph(html_content)

        # Extract Twitter Card tags
        self._extract_twitter_cards(html_content)

        # Extract JSON-LD structured data
        self._extract_json_ld(html_content)

        # Populate document with extracted metadata
        self._populate_document(document)

        # Return all extracted metadata
        return {
            'title': self._title,
            'meta_tags': self._meta_tags,
            'open_graph': self._og_tags,
            'twitter_cards': self._twitter_tags,
            'json_ld': self._json_ld
        }

    def _reset(self):
        """Reset internal state for a new extraction."""
        self._title = None
        self._meta_tags = {}
        self._og_tags = {}
        self._twitter_tags = {}
        self._json_ld = {}

    def _extract_title(self, html_content: str):
        """
        Extract the title from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # Look for title tag
        title_match = re.search(r'<title[^>]*>(.*?)</title>', html_content, re.IGNORECASE | re.DOTALL)
        if title_match:
            # Clean up the title text
            self._title = self._clean_text(title_match.group(1))

    def _extract_meta_tags(self, html_content: str):
        """
        Extract meta tags from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # Regular expression to match meta tags
        meta_pattern = r'<meta\s+([^>]+)>'

        for match in re.finditer(meta_pattern, html_content, re.IGNORECASE):
            attrs = self._parse_attributes(match.group(1))

            # Get name and content
            name = attrs.get('name', '').lower()
            content = attrs.get('content', '')

            # Handle different types of meta tags
            if name and content:
                self._meta_tags[name] = content

            # Handle http-equiv meta tags
            http_equiv = attrs.get('http-equiv', '').lower()
            if http_equiv and content:
                self._meta_tags[f'http-equiv:{http_equiv}'] = content

            # Handle charset meta tags
            charset = attrs.get('charset', '')
            if charset:
                self._meta_tags['charset'] = charset

    def _extract_open_graph(self, html_content: str):
        """
        Extract Open Graph meta tags from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # Regular expression to match Open Graph meta tags
        og_pattern = r'<meta\s+property="og:([^"]+)"\s+content="([^"]*)"[^>]*>'

        for match in re.finditer(og_pattern, html_content, re.IGNORECASE):
            property_name = match.group(1)
            content = match.group(2)
            self._og_tags[property_name] = content

    def _extract_twitter_cards(self, html_content: str):
        """
        Extract Twitter Card meta tags from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # Regular expression to match Twitter Card meta tags
        twitter_pattern = r'<meta\s+name="twitter:([^"]+)"\s+content="([^"]*)"[^>]*>'

        for match in re.finditer(twitter_pattern, html_content, re.IGNORECASE):
            property_name = match.group(1)
            content = match.group(2)
            self._twitter_tags[property_name] = content

    def _extract_json_ld(self, html_content: str):
        """
        Extract JSON-LD structured data from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # Regular expression to match JSON-LD script tags
        json_ld_pattern = r'<script[^>]*type="application/ld\+json"[^>]*>(.*?)</script>'

        for match in re.finditer(json_ld_pattern, html_content, re.IGNORECASE | re.DOTALL):
            try:
                import json
                json_content = match.group(1).strip()
                data = json.loads(json_content)

                # Store JSON-LD data by type if available
                if isinstance(data, dict) and '@type' in data:
                    type_name = data['@type']
                    if type_name not in self._json_ld:
                        self._json_ld[type_name] = []
                    self._json_ld[type_name].append(data)
                elif isinstance(data, list):
                    # Handle arrays of structured data
                    for item in data:
                        if isinstance(item, dict) and '@type' in item:
                            type_name = item['@type']
                            if type_name not in self._json_ld:
                                self._json_ld[type_name] = []
                            self._json_ld[type_name].append(item)
            except (json.JSONDecodeError, ImportError):
                # Skip invalid JSON-LD
                continue

    def _populate_document(self, document: Document):
        """
        Populate the document with extracted metadata.

        Args:
            document: The document to populate
        """
        # Set title
        title = self._get_best_title()
        if title:
            document.set_metadata(MetadataType.TITLE, title)

        # Set description
        description = self._get_best_description()
        if description:
            document.set_metadata(MetadataType.DESCRIPTION, description)

        # Set author
        author = self._get_best_author()
        if author:
            document.set_metadata(MetadataType.AUTHOR, author)

        # Set keywords
        keywords = self._get_keywords()
        if keywords:
            document.set_metadata(MetadataType.KEYWORDS, keywords)

        # Set language
        language = self._get_language()
        if language:
            document.set_metadata(MetadataType.LANGUAGE, language)

        # Set cover image
        cover_image = self._get_cover_image()
        if cover_image:
            document.set_metadata(MetadataType.COVER_IMAGE, cover_image)

        # Set publisher
        publisher = self._get_publisher()
        if publisher:
            document.set_metadata(MetadataType.PUBLISHER, publisher)

        # Set publication date
        pub_date = self._get_publication_date()
        if pub_date:
            document.set_metadata(MetadataType.PUBLICATION_DATE, pub_date)

    def _get_best_title(self) -> Optional[str]:
        """Get the best available title from all sources."""
        # Priority order: Open Graph > Twitter > JSON-LD > meta > HTML title

        # Check Open Graph
        if 'title' in self._og_tags:
            return self._og_tags['title']

        # Check Twitter Cards
        if 'title' in self._twitter_tags:
            return self._twitter_tags['title']

        # Check JSON-LD
        for type_name, items in self._json_ld.items():
            for item in items:
                if 'name' in item:
                    return item['name']
                elif 'headline' in item:
                    return item['headline']

        # Check meta tags
        for key in ['title', 'og:title', 'twitter:title']:
            if key in self._meta_tags:
                return self._meta_tags[key]

        # Fall back to HTML title
        return self._title

    def _get_best_description(self) -> Optional[str]:
        """Get the best available description from all sources."""
        # Priority order: Open Graph > Twitter > meta description > JSON-LD

        # Check Open Graph
        if 'description' in self._og_tags:
            return self._og_tags['description']

        # Check Twitter Cards
        if 'description' in self._twitter_tags:
            return self._twitter_tags['description']

        # Check meta description
        if 'description' in self._meta_tags:
            return self._meta_tags['description']

        # Check JSON-LD
        for type_name, items in self._json_ld.items():
            for item in items:
                if 'description' in item:
                    return item['description']

        return None

    def _get_best_author(self) -> Optional[str]:
        """Get the best available author from all sources."""
        # Check meta tags
        if 'author' in self._meta_tags:
            return self._meta_tags['author']

        # Check JSON-LD
        for type_name, items in self._json_ld.items():
            for item in items:
                if 'author' in item:
                    author = item['author']
                    if isinstance(author, dict) and 'name' in author:
                        return author['name']
                    elif isinstance(author, str):
                        return author
                elif 'creator' in item:
                    creator = item['creator']
                    if isinstance(creator, dict) and 'name' in creator:
                        return creator['name']
                    elif isinstance(creator, str):
                        return creator

        return None

    def _get_keywords(self) -> Optional[str]:
        """Get keywords from meta tags."""
        return self._meta_tags.get('keywords')

    def _get_language(self) -> Optional[str]:
        """Get language from meta tags or HTML lang attribute."""
        # Check meta tags first
        if 'language' in self._meta_tags:
            return self._meta_tags['language']

        # Could also extract from html lang attribute if needed
        return None

    def _get_cover_image(self) -> Optional[str]:
        """Get the best available cover image from all sources."""
        # Check Open Graph
        if 'image' in self._og_tags:
            return self._og_tags['image']

        # Check Twitter Cards
        if 'image' in self._twitter_tags:
            return self._twitter_tags['image']

        # Check JSON-LD
        for type_name, items in self._json_ld.items():
            for item in items:
                if 'image' in item:
                    image = item['image']
                    if isinstance(image, dict) and 'url' in image:
                        return image['url']
                    elif isinstance(image, str):
                        return image

        return None

    def _get_publisher(self) -> Optional[str]:
        """Get publisher from JSON-LD or other sources."""
        # Check JSON-LD
        for type_name, items in self._json_ld.items():
            for item in items:
                if 'publisher' in item:
                    publisher = item['publisher']
                    if isinstance(publisher, dict) and 'name' in publisher:
                        return publisher['name']
                    elif isinstance(publisher, str):
                        return publisher

        return None

    def _get_publication_date(self) -> Optional[str]:
        """Get publication date from JSON-LD or other sources."""
        # Check JSON-LD
        for type_name, items in self._json_ld.items():
            for item in items:
                if 'datePublished' in item:
                    return item['datePublished']
                elif 'publishDate' in item:
                    return item['publishDate']

        return None

    def _parse_attributes(self, attr_string: str) -> Dict[str, str]:
        """
        Parse HTML attributes from a string.

        Args:
            attr_string: String containing HTML attributes

        Returns:
            Dictionary of attribute name-value pairs
        """
        attrs = {}

        # Regular expression to match attribute="value" or attribute='value'
        attr_pattern = r'(\w+)=(?:"([^"]*)"|\'([^\']*)\'|([^\s>]+))'

        for match in re.finditer(attr_pattern, attr_string):
            name = match.group(1).lower()
            value = match.group(2) or match.group(3) or match.group(4) or ''
            attrs[name] = value

        # Handle standalone attributes (like charset)
        standalone_pattern = r'\b(\w+)(?!=)'
        for match in re.finditer(standalone_pattern, attr_string):
            attr_name = match.group(1).lower()
            if attr_name not in attrs:
                attrs[attr_name] = ''

        return attrs

    def _clean_text(self, text: str) -> str:
        """
        Clean up text content by removing extra whitespace and HTML entities.

        Args:
            text: The text to clean

        Returns:
            Cleaned text
        """
        # Remove extra whitespace
        cleaned = re.sub(r'\s+', ' ', text).strip()

        # Decode common HTML entities
        entities = {
            '&lt;': '<',
            '&gt;': '>',
            '&amp;': '&',
            '&quot;': '"',
            '&#39;': "'",
            '&nbsp;': ' ',
        }

        for entity, char in entities.items():
            cleaned = cleaned.replace(entity, char)

        return cleaned
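For context, a minimal sketch of the deleted HTMLMetadataReader on a hand-written snippet (not part of the commit):

from pyWebLayout.abstract.document import Document
from pyWebLayout.io.readers.html_metadata import HTMLMetadataReader

html = '''<html><head>
<title>Example</title>
<meta name="author" content="A. Writer">
<meta property="og:title" content="Example (OG)">
</head><body></body></html>'''

reader = HTMLMetadataReader()
result = reader.extract_metadata(html, Document())
# _get_best_title() prefers Open Graph over the <title> element, so the
# document receives "Example (OG)" while result['title'] stays "Example".
print(result['title'], result['open_graph'], result['meta_tags'])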
@@ -1,483 +0,0 @@
"""
HTML resources reader for pyWebLayout.

This module provides specialized functionality for extracting resources
from HTML documents, such as stylesheets, scripts, and external files.
"""

from typing import Dict, Any, Optional, List
import re
import urllib.parse
from pyWebLayout.abstract.document import Document
from pyWebLayout.io.readers.base import ResourceReader


class HTMLResourceReader(ResourceReader):
    """
    Specialized reader for extracting resources from HTML documents.

    This class handles CSS stylesheets, JavaScript files, images,
    and other external resources referenced in HTML.
    """

    def __init__(self):
        """Initialize the HTML resource reader."""
        self._stylesheets = []
        self._scripts = []
        self._external_resources = {}
        self._inline_styles = {}
        self._inline_scripts = []

    def extract_resources(self, html_content: str, document: Document) -> Dict[str, Any]:
        """
        Extract resources from HTML content.

        Args:
            html_content: The HTML content to parse
            document: The document to populate with resources

        Returns:
            Dictionary of extracted resources
        """
        # Reset internal state
        self._reset()

        # Extract stylesheets
        self._extract_stylesheets(html_content)

        # Extract scripts
        self._extract_scripts(html_content)

        # Extract other external resources
        self._extract_external_resources(html_content)

        # Extract inline styles
        self._extract_inline_styles(html_content)

        # Extract inline scripts
        self._extract_inline_scripts(html_content)

        # Populate document with extracted resources
        self._populate_document(document)

        # Return all extracted resources
        return {
            'stylesheets': self._stylesheets,
            'scripts': self._scripts,
            'external_resources': self._external_resources,
            'inline_styles': self._inline_styles,
            'inline_scripts': self._inline_scripts
        }

    def _reset(self):
        """Reset internal state for a new extraction."""
        self._stylesheets = []
        self._scripts = []
        self._external_resources = {}
        self._inline_styles = {}
        self._inline_scripts = []

    def _extract_stylesheets(self, html_content: str):
        """
        Extract CSS stylesheet references from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # Regular expression to match link tags for stylesheets
        link_pattern = r'<link\s+([^>]+)>'

        for match in re.finditer(link_pattern, html_content, re.IGNORECASE):
            attrs = self._parse_attributes(match.group(1))

            # Check if this is a stylesheet
            rel = attrs.get('rel', '').lower()
            if rel == 'stylesheet':
                href = attrs.get('href', '')
                media = attrs.get('media', 'all')
                type_attr = attrs.get('type', 'text/css')

                if href:
                    stylesheet = {
                        'type': 'external',
                        'href': href,
                        'media': media,
                        'content_type': type_attr
                    }
                    self._stylesheets.append(stylesheet)

            # Handle other link types
            elif rel in ('icon', 'shortcut icon', 'apple-touch-icon'):
                href = attrs.get('href', '')
                if href:
                    self._external_resources[f'icon_{len(self._external_resources)}'] = {
                        'type': 'icon',
                        'rel': rel,
                        'href': href,
                        'sizes': attrs.get('sizes', ''),
                        'content_type': attrs.get('type', '')
                    }

            elif rel == 'preload':
                href = attrs.get('href', '')
                if href:
                    self._external_resources[f'preload_{len(self._external_resources)}'] = {
                        'type': 'preload',
                        'href': href,
                        'as': attrs.get('as', ''),
                        'content_type': attrs.get('type', '')
                    }

    def _extract_scripts(self, html_content: str):
        """
        Extract script references from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # Regular expression to match script tags
        script_pattern = r'<script\s*([^>]*)>(.*?)</script>'

        for match in re.finditer(script_pattern, html_content, re.IGNORECASE | re.DOTALL):
            attrs_str = match.group(1)
            content = match.group(2).strip()

            attrs = self._parse_attributes(attrs_str)

            src = attrs.get('src', '')
            script_type = attrs.get('type', 'text/javascript')

            if src:
                # External script
                script = {
                    'type': 'external',
                    'src': src,
                    'content_type': script_type,
                    'async': 'async' in attrs,
                    'defer': 'defer' in attrs,
                    'integrity': attrs.get('integrity', ''),
                    'crossorigin': attrs.get('crossorigin', '')
                }
                self._scripts.append(script)

            elif content:
                # Inline script
                script = {
                    'type': 'inline',
                    'content': content,
                    'content_type': script_type
                }
                self._scripts.append(script)

    def _extract_external_resources(self, html_content: str):
        """
        Extract other external resources from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # Extract images
        img_pattern = r'<img\s+([^>]+)>'
        for match in re.finditer(img_pattern, html_content, re.IGNORECASE):
            attrs = self._parse_attributes(match.group(1))
            src = attrs.get('src', '')
            if src:
                self._external_resources[f'image_{len(self._external_resources)}'] = {
                    'type': 'image',
                    'src': src,
                    'alt': attrs.get('alt', ''),
                    'width': attrs.get('width', ''),
                    'height': attrs.get('height', ''),
                    'loading': attrs.get('loading', ''),
                    'srcset': attrs.get('srcset', '')
                }

        # Extract audio
        audio_pattern = r'<audio\s+([^>]+)>'
        for match in re.finditer(audio_pattern, html_content, re.IGNORECASE):
            attrs = self._parse_attributes(match.group(1))
            src = attrs.get('src', '')
            if src:
                self._external_resources[f'audio_{len(self._external_resources)}'] = {
                    'type': 'audio',
                    'src': src,
                    'controls': 'controls' in attrs,
                    'autoplay': 'autoplay' in attrs,
                    'loop': 'loop' in attrs,
                    'muted': 'muted' in attrs
                }

        # Extract video
        video_pattern = r'<video\s+([^>]+)>'
        for match in re.finditer(video_pattern, html_content, re.IGNORECASE):
            attrs = self._parse_attributes(match.group(1))
            src = attrs.get('src', '')
            if src:
                self._external_resources[f'video_{len(self._external_resources)}'] = {
                    'type': 'video',
                    'src': src,
                    'controls': 'controls' in attrs,
                    'autoplay': 'autoplay' in attrs,
                    'loop': 'loop' in attrs,
                    'muted': 'muted' in attrs,
                    'width': attrs.get('width', ''),
                    'height': attrs.get('height', ''),
                    'poster': attrs.get('poster', '')
                }

        # Extract embed/object resources
        embed_pattern = r'<embed\s+([^>]+)>'
        for match in re.finditer(embed_pattern, html_content, re.IGNORECASE):
            attrs = self._parse_attributes(match.group(1))
            src = attrs.get('src', '')
            if src:
                self._external_resources[f'embed_{len(self._external_resources)}'] = {
                    'type': 'embed',
                    'src': src,
                    'content_type': attrs.get('type', ''),
                    'width': attrs.get('width', ''),
                    'height': attrs.get('height', '')
                }

        # Extract iframe sources
        iframe_pattern = r'<iframe\s+([^>]+)>'
        for match in re.finditer(iframe_pattern, html_content, re.IGNORECASE):
            attrs = self._parse_attributes(match.group(1))
            src = attrs.get('src', '')
            if src:
                self._external_resources[f'iframe_{len(self._external_resources)}'] = {
                    'type': 'iframe',
                    'src': src,
                    'width': attrs.get('width', ''),
                    'height': attrs.get('height', ''),
                    'loading': attrs.get('loading', ''),
                    'sandbox': attrs.get('sandbox', '')
                }

    def _extract_inline_styles(self, html_content: str):
        """
        Extract inline CSS styles from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # Extract style blocks
        style_pattern = r'<style\s*([^>]*)>(.*?)</style>'

        for i, match in enumerate(re.finditer(style_pattern, html_content, re.IGNORECASE | re.DOTALL)):
            attrs_str = match.group(1)
            content = match.group(2).strip()

            attrs = self._parse_attributes(attrs_str)

            if content:
                style_block = {
                    'content': content,
                    'media': attrs.get('media', 'all'),
                    'content_type': attrs.get('type', 'text/css')
                }
                self._inline_styles[f'style_block_{i}'] = style_block

        # Extract inline style attributes (this would be more complex
        # as it requires parsing all elements with style attributes)
        style_attr_pattern = r'<[^>]+style\s*=\s*["\']([^"\']+)["\'][^>]*>'

        for i, match in enumerate(re.finditer(style_attr_pattern, html_content, re.IGNORECASE)):
            style_content = match.group(1)
            if style_content:
                style_attr = {
                    'content': style_content,
                    'type': 'attribute'
                }
                self._inline_styles[f'style_attr_{i}'] = style_attr

    def _extract_inline_scripts(self, html_content: str):
        """
        Extract inline JavaScript from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # This is already handled in _extract_scripts, but we keep this
        # method for consistency and potential future extensions
        pass

    def _populate_document(self, document: Document):
        """
        Populate the document with extracted resources.

        Args:
            document: The document to populate
        """
        # Add stylesheets
        for stylesheet in self._stylesheets:
            document.add_stylesheet(stylesheet)

        # Add scripts
        for script in self._scripts:
            if script['type'] == 'inline':
                document.add_script(script['content'])
            else:
                # For external scripts, we store them as resources
                script_name = f"script_{len(document._resources)}"
                document.add_resource(script_name, script)

        # Add external resources
        for name, resource in self._external_resources.items():
            document.add_resource(name, resource)

        # Add inline styles as stylesheets
        for name, style in self._inline_styles.items():
            if style.get('type') != 'attribute':  # Don't add individual style attributes
                parsed_style = self._parse_css(style['content'])
                if parsed_style:
                    document.add_stylesheet({
                        'type': 'inline',
                        'content': style['content'],
                        'parsed': parsed_style,
                        'media': style.get('media', 'all')
                    })

    def _parse_attributes(self, attr_string: str) -> Dict[str, str]:
        """
        Parse HTML attributes from a string.

        Args:
            attr_string: String containing HTML attributes

        Returns:
            Dictionary of attribute name-value pairs
        """
        attrs = {}

        # Regular expression to match attribute="value" or attribute='value'
        attr_pattern = r'(\w+)=(?:"([^"]*)"|\'([^\']*)\'|([^\s>]+))'

        for match in re.finditer(attr_pattern, attr_string):
            name = match.group(1).lower()
            value = match.group(2) or match.group(3) or match.group(4) or ''
            attrs[name] = value

        # Handle standalone attributes (like async, defer)
        standalone_pattern = r'\b(\w+)(?!=)'
        for match in re.finditer(standalone_pattern, attr_string):
            attr_name = match.group(1).lower()
            if attr_name not in attrs:
                attrs[attr_name] = ''

        return attrs

    def _parse_css(self, css_str: str) -> Dict[str, Dict[str, str]]:
        """
        Parse a CSS stylesheet.

        Args:
            css_str: CSS stylesheet string

        Returns:
            Dictionary of selectors and their style properties
        """
        stylesheet = {}

        # Remove comments
        css_str = re.sub(r'/\*.*?\*/', '', css_str, flags=re.DOTALL)

        # Split into rule sets
        rule_sets = css_str.split('}')

        for rule_set in rule_sets:
            # Split into selector and declarations
            parts = rule_set.split('{', 1)
            if len(parts) != 2:
                continue

            selector = parts[0].strip()
            declarations = parts[1].strip()

            # Parse declarations
            style = self._parse_css_declarations(declarations)

            # Add to stylesheet
            if selector and style:
                stylesheet[selector] = style

        return stylesheet

    def _parse_css_declarations(self, declarations_str: str) -> Dict[str, str]:
        """
        Parse CSS declarations.

        Args:
            declarations_str: CSS declarations string

        Returns:
            Dictionary of CSS properties and values
        """
        declarations = {}

        # Split the declarations string into individual declarations
        decl_list = [d.strip() for d in declarations_str.split(';') if d.strip()]

        for declaration in decl_list:
            # Split into property and value
            parts = declaration.split(':', 1)
            if len(parts) != 2:
                continue

            prop = parts[0].strip().lower()
            value = parts[1].strip()

            # Store the declaration
            declarations[prop] = value

        return declarations

    def resolve_url(self, url: str, base_url: Optional[str] = None) -> str:
        """
        Resolve a relative URL against a base URL.

        Args:
            url: The URL to resolve
            base_url: The base URL to resolve against

        Returns:
            The resolved URL
        """
        if base_url and not url.startswith(('http://', 'https://', '//', 'data:')):
            return urllib.parse.urljoin(base_url, url)
        return url

    def get_resource_dependencies(self, resource: Dict[str, Any]) -> List[str]:
        """
        Get the dependencies of a resource (e.g., CSS imports, script dependencies).

        Args:
            resource: The resource to analyze

        Returns:
            List of dependency URLs
        """
        dependencies = []

        if resource.get('type') == 'external' and 'content' in resource:
            content = resource['content']

            # Check for CSS @import rules
            if resource.get('content_type', '').startswith('text/css'):
                import_pattern = r'@import\s+(?:url\()?["\']?([^"\'()]+)["\']?\)?'
                for match in re.finditer(import_pattern, content, re.IGNORECASE):
                    dependencies.append(match.group(1))

            # Check for JavaScript imports/requires (basic detection)
            elif resource.get('content_type', '').startswith('text/javascript'):
                # ES6 imports
                import_pattern = r'import\s+.*?\s+from\s+["\']([^"\']+)["\']'
                for match in re.finditer(import_pattern, content):
                    dependencies.append(match.group(1))

                # CommonJS requires
                require_pattern = r'require\(\s*["\']([^"\']+)["\']\s*\)'
                for match in re.finditer(require_pattern, content):
                    dependencies.append(match.group(1))

        return dependencies
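Finally, a minimal sketch of the deleted HTMLResourceReader (not part of the commit; the add_stylesheet/add_resource side effects depend on the Document API):

from pyWebLayout.abstract.document import Document
from pyWebLayout.io.readers.html_resources import HTMLResourceReader

html = '<link rel="stylesheet" href="site.css"><img src="logo.png" alt="logo">'
reader = HTMLResourceReader()
resources = reader.extract_resources(html, Document())
print(resources['stylesheets'])          # [{'type': 'external', 'href': 'site.css', ...}]
print(resources['external_resources'])   # {'image_0': {'type': 'image', 'src': 'logo.png', ...}}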