This commit is contained in:
parent 2b1170cac7
commit 28c7b6700b
@@ -35,8 +35,3 @@ from pyWebLayout.concrete.page import Container, Page
from pyWebLayout.abstract.inline import Word


# IO functionality (reading and writing)
from pyWebLayout.io import (
    parse_html, html_to_document,  # HTML parsing
    read_epub  # EPUB reading
)
@@ -11,61 +11,5 @@ pattern as the abstract module.

# Legacy readers (for backward compatibility)
# Legacy functions provided by new HTML reader for backward compatibility
from pyWebLayout.io.readers.html import parse_html_string as parse_html
from pyWebLayout.io.readers.html import read_html_file as html_to_document
from pyWebLayout.io.readers.epub_reader import read_epub

# New decomposed readers
from pyWebLayout.io.readers.epub_reader import EPUBReader
from pyWebLayout.io.readers.html import HTMLReader, read_html, read_html_file, parse_html_string
from pyWebLayout.io.readers.base import BaseReader, MetadataReader, ContentReader, ResourceReader, CompositeReader

# Specialized HTML readers
from pyWebLayout.io.readers.html_metadata import HTMLMetadataReader
from pyWebLayout.io.readers.html_resources import HTMLResourceReader

# HTML extraction parser (the best approach)
from pyWebLayout.io.readers.html_extraction import parse_html_string as parse_html_extraction

# Specialized EPUB readers
from pyWebLayout.io.readers.epub_metadata import EPUBMetadataReader

# Convenience functions using the new architecture
def read_document(source, format_hint=None, **options):
    """
    Read a document using the appropriate reader based on format detection.

    Args:
        source: The source to read (file path, URL, or content)
        format_hint: Optional hint about the format ('html', 'epub', etc.)
        **options: Additional options for reading

    Returns:
        Document: The parsed document
    """
    if format_hint == 'html' or (not format_hint and _is_html_source(source)):
        reader = HTMLReader()
        return reader.read(source, **options)
    elif format_hint == 'epub' or (not format_hint and _is_epub_source(source)):
        # Use legacy EPUB reader for now
        return read_epub(source)
    else:
        # Try HTML reader as fallback
        try:
            reader = HTMLReader()
            if reader.can_read(source):
                return reader.read(source, **options)
        except:
            pass

        raise ValueError(f"Cannot determine format for source: {source}")

def _is_html_source(source):
    """Check if source appears to be HTML."""
    reader = HTMLReader()
    return reader.can_read(source)

def _is_epub_source(source):
    """Check if source appears to be EPUB."""
    if isinstance(source, str):
        return source.lower().endswith('.epub')
    return False
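Usage sketch (not part of the diff): how the read_document helper removed above was typically called; the file names are hypothetical.

from pyWebLayout.io import read_document

# Format is detected from the source when no hint is given
doc = read_document('page.html')

# An explicit hint bypasses detection; extra options are forwarded to the reader
book = read_document('book.epub', format_hint='epub')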
@@ -9,14 +9,13 @@ using a decomposed architecture pattern.
from .base import BaseReader, MetadataReader, ContentReader, ResourceReader, CompositeReader

# HTML readers (decomposed)
from .html import HTMLReader, read_html, read_html_file, parse_html_string
from .html_metadata import HTMLMetadataReader
from .html_resources import HTMLResourceReader


# EPUB readers
from .epub_reader import read_epub  # Legacy
from .epub_metadata import EPUBMetadataReader  # New decomposed

__all__ = [
    # Base classes
@@ -1,352 +0,0 @@
"""
EPUB metadata reader for pyWebLayout.

This module provides specialized functionality for extracting metadata
from EPUB documents, following the decomposed architecture pattern.
"""

import os
import zipfile
import tempfile
from typing import Dict, Any, Optional, List
import xml.etree.ElementTree as ET
from pyWebLayout.abstract.document import Document, MetadataType
from pyWebLayout.io.readers.base import MetadataReader


# XML namespaces used in EPUB files
NAMESPACES = {
    'opf': 'http://www.idpf.org/2007/opf',
    'dc': 'http://purl.org/dc/elements/1.1/',
    'dcterms': 'http://purl.org/dc/terms/',
}


class EPUBMetadataReader(MetadataReader):
    """
    Specialized reader for extracting metadata from EPUB documents.

    This class handles EPUB package document metadata including
    Dublin Core elements and custom metadata.
    """

    def __init__(self):
        """Initialize the EPUB metadata reader."""
        self._metadata = {}
        self._temp_dir = None
        self._package_path = None

    def extract_metadata(self, epub_path: str, document: Document) -> Dict[str, Any]:
        """
        Extract metadata from EPUB file.

        Args:
            epub_path: Path to the EPUB file
            document: The document to populate with metadata

        Returns:
            Dictionary of extracted metadata
        """
        # Reset internal state
        self._reset()

        try:
            # Extract EPUB to temporary directory
            self._extract_epub(epub_path)

            # Find and parse package document
            self._find_package_document()

            if self._package_path:
                self._parse_package_metadata()

            # Populate document with extracted metadata
            self._populate_document(document)

            return self._metadata

        finally:
            # Clean up temporary files
            self._cleanup()

    def _reset(self):
        """Reset internal state for a new extraction."""
        self._metadata = {}
        self._temp_dir = None
        self._package_path = None

    def _extract_epub(self, epub_path: str):
        """
        Extract EPUB file to temporary directory.

        Args:
            epub_path: Path to the EPUB file
        """
        self._temp_dir = tempfile.mkdtemp()

        with zipfile.ZipFile(epub_path, 'r') as zip_ref:
            zip_ref.extractall(self._temp_dir)

    def _find_package_document(self):
        """Find the package document (content.opf) in the extracted EPUB."""
        # First, try to find it via META-INF/container.xml
        container_path = os.path.join(self._temp_dir, 'META-INF', 'container.xml')

        if os.path.exists(container_path):
            try:
                tree = ET.parse(container_path)
                root = tree.getroot()

                # Find rootfile element
                for rootfile in root.findall('.//{urn:oasis:names:tc:opendocument:xmlns:container}rootfile'):
                    full_path = rootfile.get('full-path')
                    if full_path:
                        self._package_path = os.path.join(self._temp_dir, full_path)
                        if os.path.exists(self._package_path):
                            return
            except ET.ParseError:
                pass

        # Fallback: search for .opf files
        for root, dirs, files in os.walk(self._temp_dir):
            for file in files:
                if file.endswith('.opf'):
                    self._package_path = os.path.join(root, file)
                    return

    def _parse_package_metadata(self):
        """Parse metadata from the package document."""
        if not self._package_path or not os.path.exists(self._package_path):
            return

        try:
            tree = ET.parse(self._package_path)
            root = tree.getroot()

            # Find metadata element
            metadata_elem = root.find('.//{{{0}}}metadata'.format(NAMESPACES['opf']))
            if metadata_elem is None:
                return

            # Parse Dublin Core metadata
            self._parse_dublin_core(metadata_elem)

            # Parse OPF-specific metadata
            self._parse_opf_metadata(metadata_elem)

        except ET.ParseError as e:
            print(f"Error parsing package document: {e}")

    def _parse_dublin_core(self, metadata_elem: ET.Element):
        """
        Parse Dublin Core metadata elements.

        Args:
            metadata_elem: The metadata XML element
        """
        dc_elements = {
            'title': 'title',
            'creator': 'creator',
            'subject': 'subject',
            'description': 'description',
            'publisher': 'publisher',
            'contributor': 'contributor',
            'date': 'date',
            'type': 'type',
            'format': 'format',
            'identifier': 'identifier',
            'source': 'source',
            'language': 'language',
            'relation': 'relation',
            'coverage': 'coverage',
            'rights': 'rights'
        }

        for dc_name, meta_key in dc_elements.items():
            elements = metadata_elem.findall('.//{{{0}}}{1}'.format(NAMESPACES['dc'], dc_name))

            if elements:
                if len(elements) == 1:
                    # Single element
                    text = elements[0].text
                    if text:
                        self._metadata[meta_key] = text.strip()

                    # Handle special attributes
                    elem = elements[0]
                    if dc_name == 'creator':
                        # Check for role attribute
                        role = elem.get('{{{0}}}role'.format(NAMESPACES['opf']))
                        if role:
                            self._metadata[f'{meta_key}_role'] = role

                        # Check for file-as attribute for sorting
                        file_as = elem.get('{{{0}}}file-as'.format(NAMESPACES['opf']))
                        if file_as:
                            self._metadata[f'{meta_key}_file_as'] = file_as

                    elif dc_name == 'identifier':
                        # Check for scheme (ISBN, DOI, etc.)
                        scheme = elem.get('{{{0}}}scheme'.format(NAMESPACES['opf']))
                        if scheme:
                            self._metadata[f'{meta_key}_scheme'] = scheme

                        # Check if this is the unique identifier
                        id_attr = elem.get('id')
                        if id_attr:
                            self._metadata[f'{meta_key}_id'] = id_attr

                    elif dc_name == 'date':
                        # Check for event type
                        event = elem.get('{{{0}}}event'.format(NAMESPACES['opf']))
                        if event:
                            self._metadata[f'{meta_key}_event'] = event

                else:
                    # Multiple elements - store as list
                    values = []
                    for elem in elements:
                        if elem.text:
                            values.append(elem.text.strip())

                    if values:
                        self._metadata[meta_key] = values

    def _parse_opf_metadata(self, metadata_elem: ET.Element):
        """
        Parse OPF-specific metadata elements.

        Args:
            metadata_elem: The metadata XML element
        """
        # Parse meta elements
        meta_elements = metadata_elem.findall('.//{{{0}}}meta'.format(NAMESPACES['opf']))

        for meta in meta_elements:
            name = meta.get('name')
            content = meta.get('content')

            if name and content:
                self._metadata[f'meta_{name}'] = content

        # Parse x-metadata elements (custom metadata)
        x_meta_elements = metadata_elem.findall('.//{{{0}}}x-metadata'.format(NAMESPACES['opf']))

        for x_meta in x_meta_elements:
            for child in x_meta:
                if child.tag and child.text:
                    # Remove namespace prefix for cleaner key names
                    tag_name = child.tag.split('}')[-1] if '}' in child.tag else child.tag
                    self._metadata[f'x_meta_{tag_name}'] = child.text.strip()

    def _populate_document(self, document: Document):
        """
        Populate the document with extracted metadata.

        Args:
            document: The document to populate
        """
        # Map EPUB metadata to document metadata types
        metadata_mapping = {
            'title': MetadataType.TITLE,
            'creator': MetadataType.AUTHOR,
            'description': MetadataType.DESCRIPTION,
            'subject': MetadataType.KEYWORDS,
            'language': MetadataType.LANGUAGE,
            'date': MetadataType.PUBLICATION_DATE,
            'publisher': MetadataType.PUBLISHER,
            'identifier': MetadataType.IDENTIFIER,
        }

        for epub_key, doc_type in metadata_mapping.items():
            if epub_key in self._metadata:
                value = self._metadata[epub_key]

                # Handle list values (like multiple subjects)
                if isinstance(value, list):
                    if epub_key == 'subject':
                        # Join subjects with commas for keywords
                        document.set_metadata(doc_type, ', '.join(value))
                    else:
                        # For other list values, use the first one
                        document.set_metadata(doc_type, value[0])
                else:
                    document.set_metadata(doc_type, value)

        # Handle cover image
        cover_meta = self._metadata.get('meta_cover')
        if cover_meta:
            document.set_metadata(MetadataType.COVER_IMAGE, cover_meta)

        # Store original EPUB metadata for reference
        document.set_metadata(MetadataType.CUSTOM, {
            'epub_metadata': self._metadata
        })

    def _cleanup(self):
        """Clean up temporary files."""
        if self._temp_dir:
            try:
                import shutil
                shutil.rmtree(self._temp_dir, ignore_errors=True)
            except:
                pass
            self._temp_dir = None

    def get_unique_identifier(self) -> Optional[str]:
        """
        Get the unique identifier from the EPUB metadata.

        Returns:
            The unique identifier string, or None if not found
        """
        # Look for identifier with specific ID
        for key, value in self._metadata.items():
            if key.startswith('identifier') and key.endswith('_id'):
                return self._metadata.get('identifier')

        # Fallback to any identifier
        return self._metadata.get('identifier')

    def get_cover_id(self) -> Optional[str]:
        """
        Get the cover image ID from metadata.

        Returns:
            The cover image ID, or None if not found
        """
        return self._metadata.get('meta_cover')

    def get_creators(self) -> List[Dict[str, str]]:
        """
        Get creator information with roles.

        Returns:
            List of creator dictionaries with name, role, and file-as info
        """
        creators = []
        creator_value = self._metadata.get('creator')

        if creator_value:
            if isinstance(creator_value, list):
                # Multiple creators - this is simplified, real implementation
                # would need to correlate with role and file-as attributes
                for creator in creator_value:
                    creators.append({'name': creator})
            else:
                # Single creator
                creator_info = {'name': creator_value}

                # Add role if available
                role = self._metadata.get('creator_role')
                if role:
                    creator_info['role'] = role

                # Add file-as if available
                file_as = self._metadata.get('creator_file_as')
                if file_as:
                    creator_info['file_as'] = file_as

                creators.append(creator_info)

        return creators
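Usage sketch (not part of the diff): driving the EPUBMetadataReader deleted above, assuming a Document that exposes the set_metadata method the reader calls; the EPUB path is hypothetical.

from pyWebLayout.abstract.document import Document
from pyWebLayout.io.readers.epub_metadata import EPUBMetadataReader

reader = EPUBMetadataReader()
document = Document()

# Unpacks the EPUB, parses content.opf and fills the document in one call
metadata = reader.extract_metadata('book.epub', document)

print(metadata.get('title'), metadata.get('creator'))
print(reader.get_creators())            # e.g. [{'name': ..., 'role': ..., 'file_as': ...}]
print(reader.get_unique_identifier())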
@@ -1,186 +0,0 @@
"""
Modern HTML reader for pyWebLayout.

This module provides an HTML reader that uses the html_extraction module
for clean, handler-based parsing using BeautifulSoup.
"""

import os
from typing import Union, Optional
from pyWebLayout.abstract.document import Document
from pyWebLayout.io.readers.base import BaseReader
from pyWebLayout.io.readers.html_metadata import HTMLMetadataReader
from pyWebLayout.io.readers.html_resources import HTMLResourceReader
from pyWebLayout.io.readers.html_extraction import parse_html_string
from pyWebLayout.style import Font


class HTMLReader(BaseReader):
    """
    Modern HTML reader using the html_extraction parser.

    This reader uses the clean, handler-based architecture from html_extraction.py
    for parsing HTML content into pyWebLayout's abstract document structure.
    """

    def __init__(self):
        """Initialize the HTML reader."""
        super().__init__()
        self._metadata_reader = HTMLMetadataReader()
        self._resource_reader = HTMLResourceReader()

    def can_read(self, source: Union[str, bytes]) -> bool:
        """
        Check if this reader can handle the given source.

        Args:
            source: The source to check (file path, URL, or content)

        Returns:
            True if this reader can handle the source, False otherwise
        """
        if isinstance(source, str):
            # Check if it's a file path
            if os.path.isfile(source):
                return source.lower().endswith(('.html', '.htm', '.xhtml'))

            # Check if it's HTML content (very basic check)
            source_lower = source.lower().strip()
            return (source_lower.startswith('<!doctype html') or
                    source_lower.startswith('<html') or
                    '<html' in source_lower[:200])

        elif isinstance(source, bytes):
            # Check if it's HTML content in bytes
            try:
                source_str = source.decode('utf-8', errors='ignore').lower().strip()
                return (source_str.startswith('<!doctype html') or
                        source_str.startswith('<html') or
                        '<html' in source_str[:200])
            except:
                return False

        return False

    def read(self, source: Union[str, bytes], **options) -> Document:
        """
        Read and parse the HTML source into a Document.

        Args:
            source: The HTML source to read (file path, URL, or content)
            **options: Additional options for reading
                - base_url: Base URL for resolving relative links
                - encoding: Character encoding (default: 'utf-8')
                - extract_metadata: Whether to extract metadata (default: True)
                - extract_resources: Whether to extract resources (default: True)
                - base_font: Base font for styling (default: None)

        Returns:
            The parsed Document
        """
        # Get options
        base_url = options.get('base_url')
        encoding = options.get('encoding', 'utf-8')
        extract_metadata = options.get('extract_metadata', True)
        extract_resources = options.get('extract_resources', True)
        base_font = options.get('base_font')

        # Read the HTML content
        html_content = self._read_html_content(source, encoding)

        # Set base URL if not provided and source is a file
        if not base_url and isinstance(source, str) and os.path.isfile(source):
            base_url = f"file://{os.path.dirname(os.path.abspath(source))}/"

        # Create a new document
        document = Document()

        # Extract metadata if enabled
        if extract_metadata and self._metadata_reader:
            self._metadata_reader.extract_metadata(html_content, document)

        # Parse content using html_extraction
        blocks = parse_html_string(html_content, base_font)
        for block in blocks:
            document.add_block(block)

        # Extract resources if enabled
        if extract_resources and self._resource_reader:
            self._resource_reader.extract_resources(html_content, document)

        return document

    def _read_html_content(self, source: Union[str, bytes], encoding: str = 'utf-8') -> str:
        """
        Read HTML content from various sources.

        Args:
            source: The source to read from
            encoding: Character encoding to use

        Returns:
            The HTML content as a string
        """
        if isinstance(source, bytes):
            # Source is already bytes, decode it
            return source.decode(encoding, errors='replace')

        elif isinstance(source, str):
            # Check if it's a file path
            if os.path.isfile(source):
                with open(source, 'r', encoding=encoding, errors='replace') as f:
                    return f.read()
            else:
                # Assume it's HTML content
                return source

        else:
            raise ValueError(f"Unsupported source type: {type(source)}")


def read_html(source: Union[str, bytes], **options) -> Document:
    """
    Convenience function to read HTML content.

    Args:
        source: The HTML source to read (file path, URL, or content)
        **options: Additional options for reading

    Returns:
        The parsed Document
    """
    reader = HTMLReader()
    return reader.read(source, **options)


def read_html_file(file_path: str, **options) -> Document:
    """
    Convenience function to read HTML from a file.

    Args:
        file_path: Path to the HTML file
        **options: Additional options for reading

    Returns:
        The parsed Document
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"HTML file not found: {file_path}")

    reader = HTMLReader()
    return reader.read(file_path, **options)


def parse_html_string(html_content: str, **options) -> Document:
    """
    Convenience function to parse HTML content from a string.

    Args:
        html_content: The HTML content as a string
        **options: Additional options for reading

    Returns:
        The parsed Document
    """
    reader = HTMLReader()
    return reader.read(html_content, **options)
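Usage sketch (not part of the diff): the HTMLReader deleted above, fed a raw HTML string and, hypothetically, a file path; assumes the Document API (add_block, set_metadata) that the reader calls.

from pyWebLayout.io.readers.html import HTMLReader, read_html_file

reader = HTMLReader()
html = '<html><head><title>T</title></head><body><p>Hi</p></body></html>'

# can_read accepts file paths, HTML strings or bytes
assert reader.can_read(html)

# read() chains metadata extraction, html_extraction parsing and resource extraction
doc = reader.read(html)

# Convenience wrapper for files; keyword options are passed through to read()
doc = read_html_file('page.html', extract_resources=False)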
@@ -1,426 +0,0 @@
"""
HTML metadata reader for pyWebLayout.

This module provides specialized functionality for extracting metadata
from HTML documents, following the decomposed architecture pattern.
"""

from typing import Dict, Any, Optional
import re
from pyWebLayout.abstract.document import Document, MetadataType
from pyWebLayout.io.readers.base import MetadataReader


class HTMLMetadataReader(MetadataReader):
    """
    Specialized reader for extracting metadata from HTML documents.

    This class handles HTML meta tags, title elements, and other metadata
    sources like Open Graph tags and JSON-LD structured data.
    """

    def __init__(self):
        """Initialize the HTML metadata reader."""
        self._title = None
        self._meta_tags = {}
        self._og_tags = {}
        self._twitter_tags = {}
        self._json_ld = {}

    def extract_metadata(self, html_content: str, document: Document) -> Dict[str, Any]:
        """
        Extract metadata from HTML content.

        Args:
            html_content: The HTML content to parse
            document: The document to populate with metadata

        Returns:
            Dictionary of extracted metadata
        """
        # Reset internal state
        self._reset()

        # Extract title
        self._extract_title(html_content)

        # Extract meta tags
        self._extract_meta_tags(html_content)

        # Extract Open Graph tags
        self._extract_open_graph(html_content)

        # Extract Twitter Card tags
        self._extract_twitter_cards(html_content)

        # Extract JSON-LD structured data
        self._extract_json_ld(html_content)

        # Populate document with extracted metadata
        self._populate_document(document)

        # Return all extracted metadata
        return {
            'title': self._title,
            'meta_tags': self._meta_tags,
            'open_graph': self._og_tags,
            'twitter_cards': self._twitter_tags,
            'json_ld': self._json_ld
        }

    def _reset(self):
        """Reset internal state for a new extraction."""
        self._title = None
        self._meta_tags = {}
        self._og_tags = {}
        self._twitter_tags = {}
        self._json_ld = {}

    def _extract_title(self, html_content: str):
        """
        Extract the title from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # Look for title tag
        title_match = re.search(r'<title[^>]*>(.*?)</title>', html_content, re.IGNORECASE | re.DOTALL)
        if title_match:
            # Clean up the title text
            self._title = self._clean_text(title_match.group(1))

    def _extract_meta_tags(self, html_content: str):
        """
        Extract meta tags from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # Regular expression to match meta tags
        meta_pattern = r'<meta\s+([^>]+)>'

        for match in re.finditer(meta_pattern, html_content, re.IGNORECASE):
            attrs = self._parse_attributes(match.group(1))

            # Get name and content
            name = attrs.get('name', '').lower()
            content = attrs.get('content', '')

            # Handle different types of meta tags
            if name and content:
                self._meta_tags[name] = content

            # Handle http-equiv meta tags
            http_equiv = attrs.get('http-equiv', '').lower()
            if http_equiv and content:
                self._meta_tags[f'http-equiv:{http_equiv}'] = content

            # Handle charset meta tags
            charset = attrs.get('charset', '')
            if charset:
                self._meta_tags['charset'] = charset

    def _extract_open_graph(self, html_content: str):
        """
        Extract Open Graph meta tags from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # Regular expression to match Open Graph meta tags
        og_pattern = r'<meta\s+property="og:([^"]+)"\s+content="([^"]*)"[^>]*>'

        for match in re.finditer(og_pattern, html_content, re.IGNORECASE):
            property_name = match.group(1)
            content = match.group(2)
            self._og_tags[property_name] = content

    def _extract_twitter_cards(self, html_content: str):
        """
        Extract Twitter Card meta tags from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # Regular expression to match Twitter Card meta tags
        twitter_pattern = r'<meta\s+name="twitter:([^"]+)"\s+content="([^"]*)"[^>]*>'

        for match in re.finditer(twitter_pattern, html_content, re.IGNORECASE):
            property_name = match.group(1)
            content = match.group(2)
            self._twitter_tags[property_name] = content

    def _extract_json_ld(self, html_content: str):
        """
        Extract JSON-LD structured data from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # Regular expression to match JSON-LD script tags
        json_ld_pattern = r'<script[^>]*type="application/ld\+json"[^>]*>(.*?)</script>'

        for match in re.finditer(json_ld_pattern, html_content, re.IGNORECASE | re.DOTALL):
            try:
                import json
                json_content = match.group(1).strip()
                data = json.loads(json_content)

                # Store JSON-LD data by type if available
                if isinstance(data, dict) and '@type' in data:
                    type_name = data['@type']
                    if type_name not in self._json_ld:
                        self._json_ld[type_name] = []
                    self._json_ld[type_name].append(data)
                elif isinstance(data, list):
                    # Handle arrays of structured data
                    for item in data:
                        if isinstance(item, dict) and '@type' in item:
                            type_name = item['@type']
                            if type_name not in self._json_ld:
                                self._json_ld[type_name] = []
                            self._json_ld[type_name].append(item)
            except (json.JSONDecodeError, ImportError):
                # Skip invalid JSON-LD
                continue

    def _populate_document(self, document: Document):
        """
        Populate the document with extracted metadata.

        Args:
            document: The document to populate
        """
        # Set title
        title = self._get_best_title()
        if title:
            document.set_metadata(MetadataType.TITLE, title)

        # Set description
        description = self._get_best_description()
        if description:
            document.set_metadata(MetadataType.DESCRIPTION, description)

        # Set author
        author = self._get_best_author()
        if author:
            document.set_metadata(MetadataType.AUTHOR, author)

        # Set keywords
        keywords = self._get_keywords()
        if keywords:
            document.set_metadata(MetadataType.KEYWORDS, keywords)

        # Set language
        language = self._get_language()
        if language:
            document.set_metadata(MetadataType.LANGUAGE, language)

        # Set cover image
        cover_image = self._get_cover_image()
        if cover_image:
            document.set_metadata(MetadataType.COVER_IMAGE, cover_image)

        # Set publisher
        publisher = self._get_publisher()
        if publisher:
            document.set_metadata(MetadataType.PUBLISHER, publisher)

        # Set publication date
        pub_date = self._get_publication_date()
        if pub_date:
            document.set_metadata(MetadataType.PUBLICATION_DATE, pub_date)

    def _get_best_title(self) -> Optional[str]:
        """Get the best available title from all sources."""
        # Priority order: Open Graph > Twitter > JSON-LD > meta > HTML title

        # Check Open Graph
        if 'title' in self._og_tags:
            return self._og_tags['title']

        # Check Twitter Cards
        if 'title' in self._twitter_tags:
            return self._twitter_tags['title']

        # Check JSON-LD
        for type_name, items in self._json_ld.items():
            for item in items:
                if 'name' in item:
                    return item['name']
                elif 'headline' in item:
                    return item['headline']

        # Check meta tags
        for key in ['title', 'og:title', 'twitter:title']:
            if key in self._meta_tags:
                return self._meta_tags[key]

        # Fall back to HTML title
        return self._title

    def _get_best_description(self) -> Optional[str]:
        """Get the best available description from all sources."""
        # Priority order: Open Graph > Twitter > meta description > JSON-LD

        # Check Open Graph
        if 'description' in self._og_tags:
            return self._og_tags['description']

        # Check Twitter Cards
        if 'description' in self._twitter_tags:
            return self._twitter_tags['description']

        # Check meta description
        if 'description' in self._meta_tags:
            return self._meta_tags['description']

        # Check JSON-LD
        for type_name, items in self._json_ld.items():
            for item in items:
                if 'description' in item:
                    return item['description']

        return None

    def _get_best_author(self) -> Optional[str]:
        """Get the best available author from all sources."""
        # Check meta tags
        if 'author' in self._meta_tags:
            return self._meta_tags['author']

        # Check JSON-LD
        for type_name, items in self._json_ld.items():
            for item in items:
                if 'author' in item:
                    author = item['author']
                    if isinstance(author, dict) and 'name' in author:
                        return author['name']
                    elif isinstance(author, str):
                        return author
                elif 'creator' in item:
                    creator = item['creator']
                    if isinstance(creator, dict) and 'name' in creator:
                        return creator['name']
                    elif isinstance(creator, str):
                        return creator

        return None

    def _get_keywords(self) -> Optional[str]:
        """Get keywords from meta tags."""
        return self._meta_tags.get('keywords')

    def _get_language(self) -> Optional[str]:
        """Get language from meta tags or HTML lang attribute."""
        # Check meta tags first
        if 'language' in self._meta_tags:
            return self._meta_tags['language']

        # Could also extract from html lang attribute if needed
        return None

    def _get_cover_image(self) -> Optional[str]:
        """Get the best available cover image from all sources."""
        # Check Open Graph
        if 'image' in self._og_tags:
            return self._og_tags['image']

        # Check Twitter Cards
        if 'image' in self._twitter_tags:
            return self._twitter_tags['image']

        # Check JSON-LD
        for type_name, items in self._json_ld.items():
            for item in items:
                if 'image' in item:
                    image = item['image']
                    if isinstance(image, dict) and 'url' in image:
                        return image['url']
                    elif isinstance(image, str):
                        return image

        return None

    def _get_publisher(self) -> Optional[str]:
        """Get publisher from JSON-LD or other sources."""
        # Check JSON-LD
        for type_name, items in self._json_ld.items():
            for item in items:
                if 'publisher' in item:
                    publisher = item['publisher']
                    if isinstance(publisher, dict) and 'name' in publisher:
                        return publisher['name']
                    elif isinstance(publisher, str):
                        return publisher

        return None

    def _get_publication_date(self) -> Optional[str]:
        """Get publication date from JSON-LD or other sources."""
        # Check JSON-LD
        for type_name, items in self._json_ld.items():
            for item in items:
                if 'datePublished' in item:
                    return item['datePublished']
                elif 'publishDate' in item:
                    return item['publishDate']

        return None

    def _parse_attributes(self, attr_string: str) -> Dict[str, str]:
        """
        Parse HTML attributes from a string.

        Args:
            attr_string: String containing HTML attributes

        Returns:
            Dictionary of attribute name-value pairs
        """
        attrs = {}

        # Regular expression to match attribute="value" or attribute='value'
        attr_pattern = r'(\w+)=(?:"([^"]*)"|\'([^\']*)|([^\s>]+))'

        for match in re.finditer(attr_pattern, attr_string):
            name = match.group(1).lower()
            value = match.group(2) or match.group(3) or match.group(4) or ''
            attrs[name] = value

        # Handle standalone attributes (like charset)
        standalone_pattern = r'\b(\w+)(?!=)'
        for match in re.finditer(standalone_pattern, attr_string):
            attr_name = match.group(1).lower()
            if attr_name not in attrs:
                attrs[attr_name] = ''

        return attrs

    def _clean_text(self, text: str) -> str:
        """
        Clean up text content by removing extra whitespace and HTML entities.

        Args:
            text: The text to clean

        Returns:
            Cleaned text
        """
        # Remove extra whitespace
        cleaned = re.sub(r'\s+', ' ', text).strip()

        # Decode common HTML entities
        entities = {
            '&lt;': '<',
            '&gt;': '>',
            '&amp;': '&',
            '&quot;': '"',
            '&#39;': "'",
            '&nbsp;': ' ',
        }

        for entity, char in entities.items():
            cleaned = cleaned.replace(entity, char)

        return cleaned
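Usage sketch (not part of the diff): the HTMLMetadataReader deleted above, assuming Document exposes the set_metadata method the reader calls; the HTML snippet is hypothetical.

from pyWebLayout.abstract.document import Document
from pyWebLayout.io.readers.html_metadata import HTMLMetadataReader

html = '''<html><head>
  <title>Example</title>
  <meta name="author" content="Jane Doe">
  <meta property="og:title" content="OG title">
</head><body></body></html>'''

reader = HTMLMetadataReader()
metadata = reader.extract_metadata(html, Document())

# Open Graph data takes priority over the <title> element in _get_best_title()
print(metadata['open_graph'])   # {'title': 'OG title'}
print(metadata['meta_tags'])    # {'author': 'Jane Doe'}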
@@ -1,483 +0,0 @@
"""
HTML resources reader for pyWebLayout.

This module provides specialized functionality for extracting resources
from HTML documents, such as stylesheets, scripts, and external files.
"""

from typing import Dict, Any, Optional, List
import re
import urllib.parse
from pyWebLayout.abstract.document import Document
from pyWebLayout.io.readers.base import ResourceReader


class HTMLResourceReader(ResourceReader):
    """
    Specialized reader for extracting resources from HTML documents.

    This class handles CSS stylesheets, JavaScript files, images,
    and other external resources referenced in HTML.
    """

    def __init__(self):
        """Initialize the HTML resource reader."""
        self._stylesheets = []
        self._scripts = []
        self._external_resources = {}
        self._inline_styles = {}
        self._inline_scripts = []

    def extract_resources(self, html_content: str, document: Document) -> Dict[str, Any]:
        """
        Extract resources from HTML content.

        Args:
            html_content: The HTML content to parse
            document: The document to populate with resources

        Returns:
            Dictionary of extracted resources
        """
        # Reset internal state
        self._reset()

        # Extract stylesheets
        self._extract_stylesheets(html_content)

        # Extract scripts
        self._extract_scripts(html_content)

        # Extract other external resources
        self._extract_external_resources(html_content)

        # Extract inline styles
        self._extract_inline_styles(html_content)

        # Extract inline scripts
        self._extract_inline_scripts(html_content)

        # Populate document with extracted resources
        self._populate_document(document)

        # Return all extracted resources
        return {
            'stylesheets': self._stylesheets,
            'scripts': self._scripts,
            'external_resources': self._external_resources,
            'inline_styles': self._inline_styles,
            'inline_scripts': self._inline_scripts
        }

    def _reset(self):
        """Reset internal state for a new extraction."""
        self._stylesheets = []
        self._scripts = []
        self._external_resources = {}
        self._inline_styles = {}
        self._inline_scripts = []

    def _extract_stylesheets(self, html_content: str):
        """
        Extract CSS stylesheet references from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # Regular expression to match link tags for stylesheets
        link_pattern = r'<link\s+([^>]+)>'

        for match in re.finditer(link_pattern, html_content, re.IGNORECASE):
            attrs = self._parse_attributes(match.group(1))

            # Check if this is a stylesheet
            rel = attrs.get('rel', '').lower()
            if rel == 'stylesheet':
                href = attrs.get('href', '')
                media = attrs.get('media', 'all')
                type_attr = attrs.get('type', 'text/css')

                if href:
                    stylesheet = {
                        'type': 'external',
                        'href': href,
                        'media': media,
                        'content_type': type_attr
                    }
                    self._stylesheets.append(stylesheet)

            # Handle other link types
            elif rel in ('icon', 'shortcut icon', 'apple-touch-icon'):
                href = attrs.get('href', '')
                if href:
                    self._external_resources[f'icon_{len(self._external_resources)}'] = {
                        'type': 'icon',
                        'rel': rel,
                        'href': href,
                        'sizes': attrs.get('sizes', ''),
                        'content_type': attrs.get('type', '')
                    }

            elif rel == 'preload':
                href = attrs.get('href', '')
                if href:
                    self._external_resources[f'preload_{len(self._external_resources)}'] = {
                        'type': 'preload',
                        'href': href,
                        'as': attrs.get('as', ''),
                        'content_type': attrs.get('type', '')
                    }

    def _extract_scripts(self, html_content: str):
        """
        Extract script references from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # Regular expression to match script tags
        script_pattern = r'<script\s*([^>]*)>(.*?)</script>'

        for match in re.finditer(script_pattern, html_content, re.IGNORECASE | re.DOTALL):
            attrs_str = match.group(1)
            content = match.group(2).strip()

            attrs = self._parse_attributes(attrs_str)

            src = attrs.get('src', '')
            script_type = attrs.get('type', 'text/javascript')

            if src:
                # External script
                script = {
                    'type': 'external',
                    'src': src,
                    'content_type': script_type,
                    'async': 'async' in attrs,
                    'defer': 'defer' in attrs,
                    'integrity': attrs.get('integrity', ''),
                    'crossorigin': attrs.get('crossorigin', '')
                }
                self._scripts.append(script)

            elif content:
                # Inline script
                script = {
                    'type': 'inline',
                    'content': content,
                    'content_type': script_type
                }
                self._scripts.append(script)

    def _extract_external_resources(self, html_content: str):
        """
        Extract other external resources from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # Extract images
        img_pattern = r'<img\s+([^>]+)>'
        for match in re.finditer(img_pattern, html_content, re.IGNORECASE):
            attrs = self._parse_attributes(match.group(1))
            src = attrs.get('src', '')
            if src:
                self._external_resources[f'image_{len(self._external_resources)}'] = {
                    'type': 'image',
                    'src': src,
                    'alt': attrs.get('alt', ''),
                    'width': attrs.get('width', ''),
                    'height': attrs.get('height', ''),
                    'loading': attrs.get('loading', ''),
                    'srcset': attrs.get('srcset', '')
                }

        # Extract audio
        audio_pattern = r'<audio\s+([^>]+)>'
        for match in re.finditer(audio_pattern, html_content, re.IGNORECASE):
            attrs = self._parse_attributes(match.group(1))
            src = attrs.get('src', '')
            if src:
                self._external_resources[f'audio_{len(self._external_resources)}'] = {
                    'type': 'audio',
                    'src': src,
                    'controls': 'controls' in attrs,
                    'autoplay': 'autoplay' in attrs,
                    'loop': 'loop' in attrs,
                    'muted': 'muted' in attrs
                }

        # Extract video
        video_pattern = r'<video\s+([^>]+)>'
        for match in re.finditer(video_pattern, html_content, re.IGNORECASE):
            attrs = self._parse_attributes(match.group(1))
            src = attrs.get('src', '')
            if src:
                self._external_resources[f'video_{len(self._external_resources)}'] = {
                    'type': 'video',
                    'src': src,
                    'controls': 'controls' in attrs,
                    'autoplay': 'autoplay' in attrs,
                    'loop': 'loop' in attrs,
                    'muted': 'muted' in attrs,
                    'width': attrs.get('width', ''),
                    'height': attrs.get('height', ''),
                    'poster': attrs.get('poster', '')
                }

        # Extract embed/object resources
        embed_pattern = r'<embed\s+([^>]+)>'
        for match in re.finditer(embed_pattern, html_content, re.IGNORECASE):
            attrs = self._parse_attributes(match.group(1))
            src = attrs.get('src', '')
            if src:
                self._external_resources[f'embed_{len(self._external_resources)}'] = {
                    'type': 'embed',
                    'src': src,
                    'content_type': attrs.get('type', ''),
                    'width': attrs.get('width', ''),
                    'height': attrs.get('height', '')
                }

        # Extract iframe sources
        iframe_pattern = r'<iframe\s+([^>]+)>'
        for match in re.finditer(iframe_pattern, html_content, re.IGNORECASE):
            attrs = self._parse_attributes(match.group(1))
            src = attrs.get('src', '')
            if src:
                self._external_resources[f'iframe_{len(self._external_resources)}'] = {
                    'type': 'iframe',
                    'src': src,
                    'width': attrs.get('width', ''),
                    'height': attrs.get('height', ''),
                    'loading': attrs.get('loading', ''),
                    'sandbox': attrs.get('sandbox', '')
                }

    def _extract_inline_styles(self, html_content: str):
        """
        Extract inline CSS styles from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # Extract style blocks
        style_pattern = r'<style\s*([^>]*)>(.*?)</style>'

        for i, match in enumerate(re.finditer(style_pattern, html_content, re.IGNORECASE | re.DOTALL)):
            attrs_str = match.group(1)
            content = match.group(2).strip()

            attrs = self._parse_attributes(attrs_str)

            if content:
                style_block = {
                    'content': content,
                    'media': attrs.get('media', 'all'),
                    'content_type': attrs.get('type', 'text/css')
                }
                self._inline_styles[f'style_block_{i}'] = style_block

        # Extract inline style attributes (this would be more complex
        # as it requires parsing all elements with style attributes)
        style_attr_pattern = r'<[^>]+style\s*=\s*["\']([^"\']+)["\'][^>]*>'

        for i, match in enumerate(re.finditer(style_attr_pattern, html_content, re.IGNORECASE)):
            style_content = match.group(1)
            if style_content:
                style_attr = {
                    'content': style_content,
                    'type': 'attribute'
                }
                self._inline_styles[f'style_attr_{i}'] = style_attr

    def _extract_inline_scripts(self, html_content: str):
        """
        Extract inline JavaScript from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # This is already handled in _extract_scripts, but we keep this
        # method for consistency and potential future extensions
        pass

    def _populate_document(self, document: Document):
        """
        Populate the document with extracted resources.

        Args:
            document: The document to populate
        """
        # Add stylesheets
        for stylesheet in self._stylesheets:
            document.add_stylesheet(stylesheet)

        # Add scripts
        for script in self._scripts:
            if script['type'] == 'inline':
                document.add_script(script['content'])
            else:
                # For external scripts, we store them as resources
                script_name = f"script_{len(document._resources)}"
                document.add_resource(script_name, script)

        # Add external resources
        for name, resource in self._external_resources.items():
            document.add_resource(name, resource)

        # Add inline styles as stylesheets
        for name, style in self._inline_styles.items():
            if style.get('type') != 'attribute':  # Don't add individual style attributes
                parsed_style = self._parse_css(style['content'])
                if parsed_style:
                    document.add_stylesheet({
                        'type': 'inline',
                        'content': style['content'],
                        'parsed': parsed_style,
                        'media': style.get('media', 'all')
                    })

    def _parse_attributes(self, attr_string: str) -> Dict[str, str]:
        """
        Parse HTML attributes from a string.

        Args:
            attr_string: String containing HTML attributes

        Returns:
            Dictionary of attribute name-value pairs
        """
        attrs = {}

        # Regular expression to match attribute="value" or attribute='value'
        attr_pattern = r'(\w+)=(?:"([^"]*)"|\'([^\']*)|([^\s>]+))'

        for match in re.finditer(attr_pattern, attr_string):
            name = match.group(1).lower()
            value = match.group(2) or match.group(3) or match.group(4) or ''
            attrs[name] = value

        # Handle standalone attributes (like async, defer)
        standalone_pattern = r'\b(\w+)(?!=)'
        for match in re.finditer(standalone_pattern, attr_string):
            attr_name = match.group(1).lower()
            if attr_name not in attrs:
                attrs[attr_name] = ''

        return attrs

    def _parse_css(self, css_str: str) -> Dict[str, Dict[str, str]]:
        """
        Parse a CSS stylesheet.

        Args:
            css_str: CSS stylesheet string

        Returns:
            Dictionary of selectors and their style properties
        """
        stylesheet = {}

        # Remove comments
        css_str = re.sub(r'/\*.*?\*/', '', css_str, flags=re.DOTALL)

        # Split into rule sets
        rule_sets = css_str.split('}')

        for rule_set in rule_sets:
            # Split into selector and declarations
            parts = rule_set.split('{', 1)
            if len(parts) != 2:
                continue

            selector = parts[0].strip()
            declarations = parts[1].strip()

            # Parse declarations
            style = self._parse_css_declarations(declarations)

            # Add to stylesheet
            if selector and style:
                stylesheet[selector] = style

        return stylesheet

    def _parse_css_declarations(self, declarations_str: str) -> Dict[str, str]:
        """
        Parse CSS declarations.

        Args:
            declarations_str: CSS declarations string

        Returns:
            Dictionary of CSS properties and values
        """
        declarations = {}

        # Split the declarations string into individual declarations
        decl_list = [d.strip() for d in declarations_str.split(';') if d.strip()]

        for declaration in decl_list:
            # Split into property and value
            parts = declaration.split(':', 1)
            if len(parts) != 2:
                continue

            prop = parts[0].strip().lower()
            value = parts[1].strip()

            # Store the declaration
            declarations[prop] = value

        return declarations

    def resolve_url(self, url: str, base_url: Optional[str] = None) -> str:
        """
        Resolve a relative URL against a base URL.

        Args:
            url: The URL to resolve
            base_url: The base URL to resolve against

        Returns:
            The resolved URL
        """
        if base_url and not url.startswith(('http://', 'https://', '//', 'data:')):
            return urllib.parse.urljoin(base_url, url)
        return url

    def get_resource_dependencies(self, resource: Dict[str, Any]) -> List[str]:
        """
        Get the dependencies of a resource (e.g., CSS imports, script dependencies).

        Args:
            resource: The resource to analyze

        Returns:
            List of dependency URLs
        """
        dependencies = []

        if resource.get('type') == 'external' and 'content' in resource:
            content = resource['content']

            # Check for CSS @import rules
            if resource.get('content_type', '').startswith('text/css'):
                import_pattern = r'@import\s+(?:url\()?["\']?([^"\'()]+)["\']?\)?'
                for match in re.finditer(import_pattern, content, re.IGNORECASE):
                    dependencies.append(match.group(1))

            # Check for JavaScript imports/requires (basic detection)
            elif resource.get('content_type', '').startswith('text/javascript'):
                # ES6 imports
                import_pattern = r'import\s+.*?\s+from\s+["\']([^"\']+)["\']'
                for match in re.finditer(import_pattern, content):
                    dependencies.append(match.group(1))

                # CommonJS requires
                require_pattern = r'require\(\s*["\']([^"\']+)["\']\s*\)'
                for match in re.finditer(require_pattern, content):
                    dependencies.append(match.group(1))

        return dependencies
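Usage sketch (not part of the diff): the HTMLResourceReader deleted above, assuming Document provides the add_stylesheet/add_script/add_resource hooks the reader expects; the HTML snippet and URLs are hypothetical.

from pyWebLayout.abstract.document import Document
from pyWebLayout.io.readers.html_resources import HTMLResourceReader

html = '''<html><head>
  <link rel="stylesheet" href="site.css">
  <script src="app.js" defer></script>
</head><body><img src="logo.png" alt="logo"></body></html>'''

reader = HTMLResourceReader()
resources = reader.extract_resources(html, Document())

print(resources['stylesheets'])          # [{'type': 'external', 'href': 'site.css', ...}]
print(resources['scripts'][0]['defer'])  # True
print(reader.resolve_url('logo.png', 'https://example.com/docs/'))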