From 28c7b6700bff5c6759428e8e9038e753b06d9995 Mon Sep 17 00:00:00 2001 From: Duncan Tourolle Date: Sat, 7 Jun 2025 19:16:27 +0200 Subject: [PATCH] removed more unneed junk --- pyWebLayout/__init__.py | 5 - pyWebLayout/io/__init__.py | 58 +-- pyWebLayout/io/readers/__init__.py | 7 +- pyWebLayout/io/readers/epub_metadata.py | 352 ----------------- pyWebLayout/io/readers/html.py | 186 --------- pyWebLayout/io/readers/html_metadata.py | 426 -------------------- pyWebLayout/io/readers/html_resources.py | 483 ----------------------- 7 files changed, 4 insertions(+), 1513 deletions(-) delete mode 100644 pyWebLayout/io/readers/epub_metadata.py delete mode 100644 pyWebLayout/io/readers/html.py delete mode 100644 pyWebLayout/io/readers/html_metadata.py delete mode 100644 pyWebLayout/io/readers/html_resources.py diff --git a/pyWebLayout/__init__.py b/pyWebLayout/__init__.py index dd4611e..3d3891e 100644 --- a/pyWebLayout/__init__.py +++ b/pyWebLayout/__init__.py @@ -35,8 +35,3 @@ from pyWebLayout.concrete.page import Container, Page from pyWebLayout.abstract.inline import Word -# IO functionality (reading and writing) -from pyWebLayout.io import ( - parse_html, html_to_document, # HTML parsing - read_epub # EPUB reading -) diff --git a/pyWebLayout/io/__init__.py b/pyWebLayout/io/__init__.py index a1fd9ac..8365e96 100644 --- a/pyWebLayout/io/__init__.py +++ b/pyWebLayout/io/__init__.py @@ -11,61 +11,5 @@ pattern as the abstract module. # Legacy readers (for backward compatibility) # Legacy functions provided by new HTML reader for backward compatibility -from pyWebLayout.io.readers.html import parse_html_string as parse_html -from pyWebLayout.io.readers.html import read_html_file as html_to_document -from pyWebLayout.io.readers.epub_reader import read_epub -# New decomposed readers -from pyWebLayout.io.readers.html import HTMLReader, read_html, read_html_file, parse_html_string -from pyWebLayout.io.readers.base import BaseReader, MetadataReader, ContentReader, ResourceReader, CompositeReader - -# Specialized HTML readers -from pyWebLayout.io.readers.html_metadata import HTMLMetadataReader -from pyWebLayout.io.readers.html_resources import HTMLResourceReader - -# HTML extraction parser (the best approach) -from pyWebLayout.io.readers.html_extraction import parse_html_string as parse_html_extraction - -# Specialized EPUB readers -from pyWebLayout.io.readers.epub_metadata import EPUBMetadataReader - -# Convenience functions using the new architecture -def read_document(source, format_hint=None, **options): - """ - Read a document using the appropriate reader based on format detection. - - Args: - source: The source to read (file path, URL, or content) - format_hint: Optional hint about the format ('html', 'epub', etc.) 
- **options: Additional options for reading - - Returns: - Document: The parsed document - """ - if format_hint == 'html' or (not format_hint and _is_html_source(source)): - reader = HTMLReader() - return reader.read(source, **options) - elif format_hint == 'epub' or (not format_hint and _is_epub_source(source)): - # Use legacy EPUB reader for now - return read_epub(source) - else: - # Try HTML reader as fallback - try: - reader = HTMLReader() - if reader.can_read(source): - return reader.read(source, **options) - except: - pass - - raise ValueError(f"Cannot determine format for source: {source}") - -def _is_html_source(source): - """Check if source appears to be HTML.""" - reader = HTMLReader() - return reader.can_read(source) - -def _is_epub_source(source): - """Check if source appears to be EPUB.""" - if isinstance(source, str): - return source.lower().endswith('.epub') - return False +from pyWebLayout.io.readers.epub_reader import EPUBReader diff --git a/pyWebLayout/io/readers/__init__.py b/pyWebLayout/io/readers/__init__.py index 950535a..c2350ad 100644 --- a/pyWebLayout/io/readers/__init__.py +++ b/pyWebLayout/io/readers/__init__.py @@ -9,14 +9,13 @@ using a decomposed architecture pattern. from .base import BaseReader, MetadataReader, ContentReader, ResourceReader, CompositeReader # HTML readers (decomposed) -from .html import HTMLReader, read_html, read_html_file, parse_html_string -from .html_metadata import HTMLMetadataReader -from .html_resources import HTMLResourceReader + + # EPUB readers from .epub_reader import read_epub # Legacy -from .epub_metadata import EPUBMetadataReader # New decomposed + __all__ = [ # Base classes diff --git a/pyWebLayout/io/readers/epub_metadata.py b/pyWebLayout/io/readers/epub_metadata.py deleted file mode 100644 index 1ee2770..0000000 --- a/pyWebLayout/io/readers/epub_metadata.py +++ /dev/null @@ -1,352 +0,0 @@ -""" -EPUB metadata reader for pyWebLayout. - -This module provides specialized functionality for extracting metadata -from EPUB documents, following the decomposed architecture pattern. -""" - -import os -import zipfile -import tempfile -from typing import Dict, Any, Optional, List -import xml.etree.ElementTree as ET -from pyWebLayout.abstract.document import Document, MetadataType -from pyWebLayout.io.readers.base import MetadataReader - - -# XML namespaces used in EPUB files -NAMESPACES = { - 'opf': 'http://www.idpf.org/2007/opf', - 'dc': 'http://purl.org/dc/elements/1.1/', - 'dcterms': 'http://purl.org/dc/terms/', -} - - -class EPUBMetadataReader(MetadataReader): - """ - Specialized reader for extracting metadata from EPUB documents. - - This class handles EPUB package document metadata including - Dublin Core elements and custom metadata. - """ - - def __init__(self): - """Initialize the EPUB metadata reader.""" - self._metadata = {} - self._temp_dir = None - self._package_path = None - - def extract_metadata(self, epub_path: str, document: Document) -> Dict[str, Any]: - """ - Extract metadata from EPUB file. 
- - Args: - epub_path: Path to the EPUB file - document: The document to populate with metadata - - Returns: - Dictionary of extracted metadata - """ - # Reset internal state - self._reset() - - try: - # Extract EPUB to temporary directory - self._extract_epub(epub_path) - - # Find and parse package document - self._find_package_document() - - if self._package_path: - self._parse_package_metadata() - - # Populate document with extracted metadata - self._populate_document(document) - - return self._metadata - - finally: - # Clean up temporary files - self._cleanup() - - def _reset(self): - """Reset internal state for a new extraction.""" - self._metadata = {} - self._temp_dir = None - self._package_path = None - - def _extract_epub(self, epub_path: str): - """ - Extract EPUB file to temporary directory. - - Args: - epub_path: Path to the EPUB file - """ - self._temp_dir = tempfile.mkdtemp() - - with zipfile.ZipFile(epub_path, 'r') as zip_ref: - zip_ref.extractall(self._temp_dir) - - def _find_package_document(self): - """Find the package document (content.opf) in the extracted EPUB.""" - # First, try to find it via META-INF/container.xml - container_path = os.path.join(self._temp_dir, 'META-INF', 'container.xml') - - if os.path.exists(container_path): - try: - tree = ET.parse(container_path) - root = tree.getroot() - - # Find rootfile element - for rootfile in root.findall('.//{urn:oasis:names:tc:opendocument:xmlns:container}rootfile'): - full_path = rootfile.get('full-path') - if full_path: - self._package_path = os.path.join(self._temp_dir, full_path) - if os.path.exists(self._package_path): - return - except ET.ParseError: - pass - - # Fallback: search for .opf files - for root, dirs, files in os.walk(self._temp_dir): - for file in files: - if file.endswith('.opf'): - self._package_path = os.path.join(root, file) - return - - def _parse_package_metadata(self): - """Parse metadata from the package document.""" - if not self._package_path or not os.path.exists(self._package_path): - return - - try: - tree = ET.parse(self._package_path) - root = tree.getroot() - - # Find metadata element - metadata_elem = root.find('.//{{{0}}}metadata'.format(NAMESPACES['opf'])) - if metadata_elem is None: - return - - # Parse Dublin Core metadata - self._parse_dublin_core(metadata_elem) - - # Parse OPF-specific metadata - self._parse_opf_metadata(metadata_elem) - - except ET.ParseError as e: - print(f"Error parsing package document: {e}") - - def _parse_dublin_core(self, metadata_elem: ET.Element): - """ - Parse Dublin Core metadata elements. 
- - Args: - metadata_elem: The metadata XML element - """ - dc_elements = { - 'title': 'title', - 'creator': 'creator', - 'subject': 'subject', - 'description': 'description', - 'publisher': 'publisher', - 'contributor': 'contributor', - 'date': 'date', - 'type': 'type', - 'format': 'format', - 'identifier': 'identifier', - 'source': 'source', - 'language': 'language', - 'relation': 'relation', - 'coverage': 'coverage', - 'rights': 'rights' - } - - for dc_name, meta_key in dc_elements.items(): - elements = metadata_elem.findall('.//{{{0}}}{1}'.format(NAMESPACES['dc'], dc_name)) - - if elements: - if len(elements) == 1: - # Single element - text = elements[0].text - if text: - self._metadata[meta_key] = text.strip() - - # Handle special attributes - elem = elements[0] - if dc_name == 'creator': - # Check for role attribute - role = elem.get('{{{0}}}role'.format(NAMESPACES['opf'])) - if role: - self._metadata[f'{meta_key}_role'] = role - - # Check for file-as attribute for sorting - file_as = elem.get('{{{0}}}file-as'.format(NAMESPACES['opf'])) - if file_as: - self._metadata[f'{meta_key}_file_as'] = file_as - - elif dc_name == 'identifier': - # Check for scheme (ISBN, DOI, etc.) - scheme = elem.get('{{{0}}}scheme'.format(NAMESPACES['opf'])) - if scheme: - self._metadata[f'{meta_key}_scheme'] = scheme - - # Check if this is the unique identifier - id_attr = elem.get('id') - if id_attr: - self._metadata[f'{meta_key}_id'] = id_attr - - elif dc_name == 'date': - # Check for event type - event = elem.get('{{{0}}}event'.format(NAMESPACES['opf'])) - if event: - self._metadata[f'{meta_key}_event'] = event - - else: - # Multiple elements - store as list - values = [] - for elem in elements: - if elem.text: - values.append(elem.text.strip()) - - if values: - self._metadata[meta_key] = values - - def _parse_opf_metadata(self, metadata_elem: ET.Element): - """ - Parse OPF-specific metadata elements. - - Args: - metadata_elem: The metadata XML element - """ - # Parse meta elements - meta_elements = metadata_elem.findall('.//{{{0}}}meta'.format(NAMESPACES['opf'])) - - for meta in meta_elements: - name = meta.get('name') - content = meta.get('content') - - if name and content: - self._metadata[f'meta_{name}'] = content - - # Parse x-metadata elements (custom metadata) - x_meta_elements = metadata_elem.findall('.//{{{0}}}x-metadata'.format(NAMESPACES['opf'])) - - for x_meta in x_meta_elements: - for child in x_meta: - if child.tag and child.text: - # Remove namespace prefix for cleaner key names - tag_name = child.tag.split('}')[-1] if '}' in child.tag else child.tag - self._metadata[f'x_meta_{tag_name}'] = child.text.strip() - - def _populate_document(self, document: Document): - """ - Populate the document with extracted metadata. 
- - Args: - document: The document to populate - """ - # Map EPUB metadata to document metadata types - metadata_mapping = { - 'title': MetadataType.TITLE, - 'creator': MetadataType.AUTHOR, - 'description': MetadataType.DESCRIPTION, - 'subject': MetadataType.KEYWORDS, - 'language': MetadataType.LANGUAGE, - 'date': MetadataType.PUBLICATION_DATE, - 'publisher': MetadataType.PUBLISHER, - 'identifier': MetadataType.IDENTIFIER, - } - - for epub_key, doc_type in metadata_mapping.items(): - if epub_key in self._metadata: - value = self._metadata[epub_key] - - # Handle list values (like multiple subjects) - if isinstance(value, list): - if epub_key == 'subject': - # Join subjects with commas for keywords - document.set_metadata(doc_type, ', '.join(value)) - else: - # For other list values, use the first one - document.set_metadata(doc_type, value[0]) - else: - document.set_metadata(doc_type, value) - - # Handle cover image - cover_meta = self._metadata.get('meta_cover') - if cover_meta: - document.set_metadata(MetadataType.COVER_IMAGE, cover_meta) - - # Store original EPUB metadata for reference - document.set_metadata(MetadataType.CUSTOM, { - 'epub_metadata': self._metadata - }) - - def _cleanup(self): - """Clean up temporary files.""" - if self._temp_dir: - try: - import shutil - shutil.rmtree(self._temp_dir, ignore_errors=True) - except: - pass - self._temp_dir = None - - def get_unique_identifier(self) -> Optional[str]: - """ - Get the unique identifier from the EPUB metadata. - - Returns: - The unique identifier string, or None if not found - """ - # Look for identifier with specific ID - for key, value in self._metadata.items(): - if key.startswith('identifier') and key.endswith('_id'): - return self._metadata.get('identifier') - - # Fallback to any identifier - return self._metadata.get('identifier') - - def get_cover_id(self) -> Optional[str]: - """ - Get the cover image ID from metadata. - - Returns: - The cover image ID, or None if not found - """ - return self._metadata.get('meta_cover') - - def get_creators(self) -> List[Dict[str, str]]: - """ - Get creator information with roles. - - Returns: - List of creator dictionaries with name, role, and file-as info - """ - creators = [] - creator_value = self._metadata.get('creator') - - if creator_value: - if isinstance(creator_value, list): - # Multiple creators - this is simplified, real implementation - # would need to correlate with role and file-as attributes - for creator in creator_value: - creators.append({'name': creator}) - else: - # Single creator - creator_info = {'name': creator_value} - - # Add role if available - role = self._metadata.get('creator_role') - if role: - creator_info['role'] = role - - # Add file-as if available - file_as = self._metadata.get('creator_file_as') - if file_as: - creator_info['file_as'] = file_as - - creators.append(creator_info) - - return creators diff --git a/pyWebLayout/io/readers/html.py b/pyWebLayout/io/readers/html.py deleted file mode 100644 index 4e1cc16..0000000 --- a/pyWebLayout/io/readers/html.py +++ /dev/null @@ -1,186 +0,0 @@ -""" -Modern HTML reader for pyWebLayout. - -This module provides an HTML reader that uses the html_extraction module -for clean, handler-based parsing using BeautifulSoup. 
-""" - -import os -from typing import Union, Optional -from pyWebLayout.abstract.document import Document -from pyWebLayout.io.readers.base import BaseReader -from pyWebLayout.io.readers.html_metadata import HTMLMetadataReader -from pyWebLayout.io.readers.html_resources import HTMLResourceReader -from pyWebLayout.io.readers.html_extraction import parse_html_string -from pyWebLayout.style import Font - - -class HTMLReader(BaseReader): - """ - Modern HTML reader using the html_extraction parser. - - This reader uses the clean, handler-based architecture from html_extraction.py - for parsing HTML content into pyWebLayout's abstract document structure. - """ - - def __init__(self): - """Initialize the HTML reader.""" - super().__init__() - self._metadata_reader = HTMLMetadataReader() - self._resource_reader = HTMLResourceReader() - - def can_read(self, source: Union[str, bytes]) -> bool: - """ - Check if this reader can handle the given source. - - Args: - source: The source to check (file path, URL, or content) - - Returns: - True if this reader can handle the source, False otherwise - """ - if isinstance(source, str): - # Check if it's a file path - if os.path.isfile(source): - return source.lower().endswith(('.html', '.htm', '.xhtml')) - - # Check if it's HTML content (very basic check) - source_lower = source.lower().strip() - return (source_lower.startswith(' Document: - """ - Read and parse the HTML source into a Document. - - Args: - source: The HTML source to read (file path, URL, or content) - **options: Additional options for reading - - base_url: Base URL for resolving relative links - - encoding: Character encoding (default: 'utf-8') - - extract_metadata: Whether to extract metadata (default: True) - - extract_resources: Whether to extract resources (default: True) - - base_font: Base font for styling (default: None) - - Returns: - The parsed Document - """ - # Get options - base_url = options.get('base_url') - encoding = options.get('encoding', 'utf-8') - extract_metadata = options.get('extract_metadata', True) - extract_resources = options.get('extract_resources', True) - base_font = options.get('base_font') - - # Read the HTML content - html_content = self._read_html_content(source, encoding) - - # Set base URL if not provided and source is a file - if not base_url and isinstance(source, str) and os.path.isfile(source): - base_url = f"file://{os.path.dirname(os.path.abspath(source))}/" - - # Create a new document - document = Document() - - # Extract metadata if enabled - if extract_metadata and self._metadata_reader: - self._metadata_reader.extract_metadata(html_content, document) - - # Parse content using html_extraction - blocks = parse_html_string(html_content, base_font) - for block in blocks: - document.add_block(block) - - # Extract resources if enabled - if extract_resources and self._resource_reader: - self._resource_reader.extract_resources(html_content, document) - - return document - - def _read_html_content(self, source: Union[str, bytes], encoding: str = 'utf-8') -> str: - """ - Read HTML content from various sources. 
- - Args: - source: The source to read from - encoding: Character encoding to use - - Returns: - The HTML content as a string - """ - if isinstance(source, bytes): - # Source is already bytes, decode it - return source.decode(encoding, errors='replace') - - elif isinstance(source, str): - # Check if it's a file path - if os.path.isfile(source): - with open(source, 'r', encoding=encoding, errors='replace') as f: - return f.read() - else: - # Assume it's HTML content - return source - - else: - raise ValueError(f"Unsupported source type: {type(source)}") - - -def read_html(source: Union[str, bytes], **options) -> Document: - """ - Convenience function to read HTML content. - - Args: - source: The HTML source to read (file path, URL, or content) - **options: Additional options for reading - - Returns: - The parsed Document - """ - reader = HTMLReader() - return reader.read(source, **options) - - -def read_html_file(file_path: str, **options) -> Document: - """ - Convenience function to read HTML from a file. - - Args: - file_path: Path to the HTML file - **options: Additional options for reading - - Returns: - The parsed Document - """ - if not os.path.isfile(file_path): - raise FileNotFoundError(f"HTML file not found: {file_path}") - - reader = HTMLReader() - return reader.read(file_path, **options) - - -def parse_html_string(html_content: str, **options) -> Document: - """ - Convenience function to parse HTML content from a string. - - Args: - html_content: The HTML content as a string - **options: Additional options for reading - - Returns: - The parsed Document - """ - reader = HTMLReader() - return reader.read(html_content, **options) diff --git a/pyWebLayout/io/readers/html_metadata.py b/pyWebLayout/io/readers/html_metadata.py deleted file mode 100644 index 89c48cb..0000000 --- a/pyWebLayout/io/readers/html_metadata.py +++ /dev/null @@ -1,426 +0,0 @@ -""" -HTML metadata reader for pyWebLayout. - -This module provides specialized functionality for extracting metadata -from HTML documents, following the decomposed architecture pattern. -""" - -from typing import Dict, Any, Optional -import re -from pyWebLayout.abstract.document import Document, MetadataType -from pyWebLayout.io.readers.base import MetadataReader - - -class HTMLMetadataReader(MetadataReader): - """ - Specialized reader for extracting metadata from HTML documents. - - This class handles HTML meta tags, title elements, and other metadata - sources like Open Graph tags and JSON-LD structured data. - """ - - def __init__(self): - """Initialize the HTML metadata reader.""" - self._title = None - self._meta_tags = {} - self._og_tags = {} - self._twitter_tags = {} - self._json_ld = {} - - def extract_metadata(self, html_content: str, document: Document) -> Dict[str, Any]: - """ - Extract metadata from HTML content. 
- - Args: - html_content: The HTML content to parse - document: The document to populate with metadata - - Returns: - Dictionary of extracted metadata - """ - # Reset internal state - self._reset() - - # Extract title - self._extract_title(html_content) - - # Extract meta tags - self._extract_meta_tags(html_content) - - # Extract Open Graph tags - self._extract_open_graph(html_content) - - # Extract Twitter Card tags - self._extract_twitter_cards(html_content) - - # Extract JSON-LD structured data - self._extract_json_ld(html_content) - - # Populate document with extracted metadata - self._populate_document(document) - - # Return all extracted metadata - return { - 'title': self._title, - 'meta_tags': self._meta_tags, - 'open_graph': self._og_tags, - 'twitter_cards': self._twitter_tags, - 'json_ld': self._json_ld - } - - def _reset(self): - """Reset internal state for a new extraction.""" - self._title = None - self._meta_tags = {} - self._og_tags = {} - self._twitter_tags = {} - self._json_ld = {} - - def _extract_title(self, html_content: str): - """ - Extract the title from HTML content. - - Args: - html_content: The HTML content to parse - """ - # Look for title tag - title_match = re.search(r']*>(.*?)', html_content, re.IGNORECASE | re.DOTALL) - if title_match: - # Clean up the title text - self._title = self._clean_text(title_match.group(1)) - - def _extract_meta_tags(self, html_content: str): - """ - Extract meta tags from HTML content. - - Args: - html_content: The HTML content to parse - """ - # Regular expression to match meta tags - meta_pattern = r']+)>' - - for match in re.finditer(meta_pattern, html_content, re.IGNORECASE): - attrs = self._parse_attributes(match.group(1)) - - # Get name and content - name = attrs.get('name', '').lower() - content = attrs.get('content', '') - - # Handle different types of meta tags - if name and content: - self._meta_tags[name] = content - - # Handle http-equiv meta tags - http_equiv = attrs.get('http-equiv', '').lower() - if http_equiv and content: - self._meta_tags[f'http-equiv:{http_equiv}'] = content - - # Handle charset meta tags - charset = attrs.get('charset', '') - if charset: - self._meta_tags['charset'] = charset - - def _extract_open_graph(self, html_content: str): - """ - Extract Open Graph meta tags from HTML content. - - Args: - html_content: The HTML content to parse - """ - # Regular expression to match Open Graph meta tags - og_pattern = r']*>' - - for match in re.finditer(og_pattern, html_content, re.IGNORECASE): - property_name = match.group(1) - content = match.group(2) - self._og_tags[property_name] = content - - def _extract_twitter_cards(self, html_content: str): - """ - Extract Twitter Card meta tags from HTML content. - - Args: - html_content: The HTML content to parse - """ - # Regular expression to match Twitter Card meta tags - twitter_pattern = r']*>' - - for match in re.finditer(twitter_pattern, html_content, re.IGNORECASE): - property_name = match.group(1) - content = match.group(2) - self._twitter_tags[property_name] = content - - def _extract_json_ld(self, html_content: str): - """ - Extract JSON-LD structured data from HTML content. 
- - Args: - html_content: The HTML content to parse - """ - # Regular expression to match JSON-LD script tags - json_ld_pattern = r']*type="application/ld\+json"[^>]*>(.*?)' - - for match in re.finditer(json_ld_pattern, html_content, re.IGNORECASE | re.DOTALL): - try: - import json - json_content = match.group(1).strip() - data = json.loads(json_content) - - # Store JSON-LD data by type if available - if isinstance(data, dict) and '@type' in data: - type_name = data['@type'] - if type_name not in self._json_ld: - self._json_ld[type_name] = [] - self._json_ld[type_name].append(data) - elif isinstance(data, list): - # Handle arrays of structured data - for item in data: - if isinstance(item, dict) and '@type' in item: - type_name = item['@type'] - if type_name not in self._json_ld: - self._json_ld[type_name] = [] - self._json_ld[type_name].append(item) - except (json.JSONDecodeError, ImportError): - # Skip invalid JSON-LD - continue - - def _populate_document(self, document: Document): - """ - Populate the document with extracted metadata. - - Args: - document: The document to populate - """ - # Set title - title = self._get_best_title() - if title: - document.set_metadata(MetadataType.TITLE, title) - - # Set description - description = self._get_best_description() - if description: - document.set_metadata(MetadataType.DESCRIPTION, description) - - # Set author - author = self._get_best_author() - if author: - document.set_metadata(MetadataType.AUTHOR, author) - - # Set keywords - keywords = self._get_keywords() - if keywords: - document.set_metadata(MetadataType.KEYWORDS, keywords) - - # Set language - language = self._get_language() - if language: - document.set_metadata(MetadataType.LANGUAGE, language) - - # Set cover image - cover_image = self._get_cover_image() - if cover_image: - document.set_metadata(MetadataType.COVER_IMAGE, cover_image) - - # Set publisher - publisher = self._get_publisher() - if publisher: - document.set_metadata(MetadataType.PUBLISHER, publisher) - - # Set publication date - pub_date = self._get_publication_date() - if pub_date: - document.set_metadata(MetadataType.PUBLICATION_DATE, pub_date) - - def _get_best_title(self) -> Optional[str]: - """Get the best available title from all sources.""" - # Priority order: Open Graph > Twitter > JSON-LD > meta > HTML title - - # Check Open Graph - if 'title' in self._og_tags: - return self._og_tags['title'] - - # Check Twitter Cards - if 'title' in self._twitter_tags: - return self._twitter_tags['title'] - - # Check JSON-LD - for type_name, items in self._json_ld.items(): - for item in items: - if 'name' in item: - return item['name'] - elif 'headline' in item: - return item['headline'] - - # Check meta tags - for key in ['title', 'og:title', 'twitter:title']: - if key in self._meta_tags: - return self._meta_tags[key] - - # Fall back to HTML title - return self._title - - def _get_best_description(self) -> Optional[str]: - """Get the best available description from all sources.""" - # Priority order: Open Graph > Twitter > meta description > JSON-LD - - # Check Open Graph - if 'description' in self._og_tags: - return self._og_tags['description'] - - # Check Twitter Cards - if 'description' in self._twitter_tags: - return self._twitter_tags['description'] - - # Check meta description - if 'description' in self._meta_tags: - return self._meta_tags['description'] - - # Check JSON-LD - for type_name, items in self._json_ld.items(): - for item in items: - if 'description' in item: - return item['description'] - - return None - - 
def _get_best_author(self) -> Optional[str]: - """Get the best available author from all sources.""" - # Check meta tags - if 'author' in self._meta_tags: - return self._meta_tags['author'] - - # Check JSON-LD - for type_name, items in self._json_ld.items(): - for item in items: - if 'author' in item: - author = item['author'] - if isinstance(author, dict) and 'name' in author: - return author['name'] - elif isinstance(author, str): - return author - elif 'creator' in item: - creator = item['creator'] - if isinstance(creator, dict) and 'name' in creator: - return creator['name'] - elif isinstance(creator, str): - return creator - - return None - - def _get_keywords(self) -> Optional[str]: - """Get keywords from meta tags.""" - return self._meta_tags.get('keywords') - - def _get_language(self) -> Optional[str]: - """Get language from meta tags or HTML lang attribute.""" - # Check meta tags first - if 'language' in self._meta_tags: - return self._meta_tags['language'] - - # Could also extract from html lang attribute if needed - return None - - def _get_cover_image(self) -> Optional[str]: - """Get the best available cover image from all sources.""" - # Check Open Graph - if 'image' in self._og_tags: - return self._og_tags['image'] - - # Check Twitter Cards - if 'image' in self._twitter_tags: - return self._twitter_tags['image'] - - # Check JSON-LD - for type_name, items in self._json_ld.items(): - for item in items: - if 'image' in item: - image = item['image'] - if isinstance(image, dict) and 'url' in image: - return image['url'] - elif isinstance(image, str): - return image - - return None - - def _get_publisher(self) -> Optional[str]: - """Get publisher from JSON-LD or other sources.""" - # Check JSON-LD - for type_name, items in self._json_ld.items(): - for item in items: - if 'publisher' in item: - publisher = item['publisher'] - if isinstance(publisher, dict) and 'name' in publisher: - return publisher['name'] - elif isinstance(publisher, str): - return publisher - - return None - - def _get_publication_date(self) -> Optional[str]: - """Get publication date from JSON-LD or other sources.""" - # Check JSON-LD - for type_name, items in self._json_ld.items(): - for item in items: - if 'datePublished' in item: - return item['datePublished'] - elif 'publishDate' in item: - return item['publishDate'] - - return None - - def _parse_attributes(self, attr_string: str) -> Dict[str, str]: - """ - Parse HTML attributes from a string. - - Args: - attr_string: String containing HTML attributes - - Returns: - Dictionary of attribute name-value pairs - """ - attrs = {} - - # Regular expression to match attribute="value" or attribute='value' - attr_pattern = r'(\w+)=(?:"([^"]*)"|\'([^\']*)|([^\s>]+))' - - for match in re.finditer(attr_pattern, attr_string): - name = match.group(1).lower() - value = match.group(2) or match.group(3) or match.group(4) or '' - attrs[name] = value - - # Handle standalone attributes (like charset) - standalone_pattern = r'\b(\w+)(?!=)' - for match in re.finditer(standalone_pattern, attr_string): - attr_name = match.group(1).lower() - if attr_name not in attrs: - attrs[attr_name] = '' - - return attrs - - def _clean_text(self, text: str) -> str: - """ - Clean up text content by removing extra whitespace and HTML entities. 
- - Args: - text: The text to clean - - Returns: - Cleaned text - """ - # Remove extra whitespace - cleaned = re.sub(r'\s+', ' ', text).strip() - - # Decode common HTML entities - entities = { - '<': '<', - '>': '>', - '&': '&', - '"': '"', - ''': "'", - ' ': ' ', - } - - for entity, char in entities.items(): - cleaned = cleaned.replace(entity, char) - - return cleaned diff --git a/pyWebLayout/io/readers/html_resources.py b/pyWebLayout/io/readers/html_resources.py deleted file mode 100644 index ffab0e0..0000000 --- a/pyWebLayout/io/readers/html_resources.py +++ /dev/null @@ -1,483 +0,0 @@ -""" -HTML resources reader for pyWebLayout. - -This module provides specialized functionality for extracting resources -from HTML documents, such as stylesheets, scripts, and external files. -""" - -from typing import Dict, Any, Optional, List -import re -import urllib.parse -from pyWebLayout.abstract.document import Document -from pyWebLayout.io.readers.base import ResourceReader - - -class HTMLResourceReader(ResourceReader): - """ - Specialized reader for extracting resources from HTML documents. - - This class handles CSS stylesheets, JavaScript files, images, - and other external resources referenced in HTML. - """ - - def __init__(self): - """Initialize the HTML resource reader.""" - self._stylesheets = [] - self._scripts = [] - self._external_resources = {} - self._inline_styles = {} - self._inline_scripts = [] - - def extract_resources(self, html_content: str, document: Document) -> Dict[str, Any]: - """ - Extract resources from HTML content. - - Args: - html_content: The HTML content to parse - document: The document to populate with resources - - Returns: - Dictionary of extracted resources - """ - # Reset internal state - self._reset() - - # Extract stylesheets - self._extract_stylesheets(html_content) - - # Extract scripts - self._extract_scripts(html_content) - - # Extract other external resources - self._extract_external_resources(html_content) - - # Extract inline styles - self._extract_inline_styles(html_content) - - # Extract inline scripts - self._extract_inline_scripts(html_content) - - # Populate document with extracted resources - self._populate_document(document) - - # Return all extracted resources - return { - 'stylesheets': self._stylesheets, - 'scripts': self._scripts, - 'external_resources': self._external_resources, - 'inline_styles': self._inline_styles, - 'inline_scripts': self._inline_scripts - } - - def _reset(self): - """Reset internal state for a new extraction.""" - self._stylesheets = [] - self._scripts = [] - self._external_resources = {} - self._inline_styles = {} - self._inline_scripts = [] - - def _extract_stylesheets(self, html_content: str): - """ - Extract CSS stylesheet references from HTML content. 
- - Args: - html_content: The HTML content to parse - """ - # Regular expression to match link tags for stylesheets - link_pattern = r']+)>' - - for match in re.finditer(link_pattern, html_content, re.IGNORECASE): - attrs = self._parse_attributes(match.group(1)) - - # Check if this is a stylesheet - rel = attrs.get('rel', '').lower() - if rel == 'stylesheet': - href = attrs.get('href', '') - media = attrs.get('media', 'all') - type_attr = attrs.get('type', 'text/css') - - if href: - stylesheet = { - 'type': 'external', - 'href': href, - 'media': media, - 'content_type': type_attr - } - self._stylesheets.append(stylesheet) - - # Handle other link types - elif rel in ('icon', 'shortcut icon', 'apple-touch-icon'): - href = attrs.get('href', '') - if href: - self._external_resources[f'icon_{len(self._external_resources)}'] = { - 'type': 'icon', - 'rel': rel, - 'href': href, - 'sizes': attrs.get('sizes', ''), - 'content_type': attrs.get('type', '') - } - - elif rel == 'preload': - href = attrs.get('href', '') - if href: - self._external_resources[f'preload_{len(self._external_resources)}'] = { - 'type': 'preload', - 'href': href, - 'as': attrs.get('as', ''), - 'content_type': attrs.get('type', '') - } - - def _extract_scripts(self, html_content: str): - """ - Extract script references from HTML content. - - Args: - html_content: The HTML content to parse - """ - # Regular expression to match script tags - script_pattern = r']*)>(.*?)' - - for match in re.finditer(script_pattern, html_content, re.IGNORECASE | re.DOTALL): - attrs_str = match.group(1) - content = match.group(2).strip() - - attrs = self._parse_attributes(attrs_str) - - src = attrs.get('src', '') - script_type = attrs.get('type', 'text/javascript') - - if src: - # External script - script = { - 'type': 'external', - 'src': src, - 'content_type': script_type, - 'async': 'async' in attrs, - 'defer': 'defer' in attrs, - 'integrity': attrs.get('integrity', ''), - 'crossorigin': attrs.get('crossorigin', '') - } - self._scripts.append(script) - - elif content: - # Inline script - script = { - 'type': 'inline', - 'content': content, - 'content_type': script_type - } - self._scripts.append(script) - - def _extract_external_resources(self, html_content: str): - """ - Extract other external resources from HTML content. 
- - Args: - html_content: The HTML content to parse - """ - # Extract images - img_pattern = r']+)>' - for match in re.finditer(img_pattern, html_content, re.IGNORECASE): - attrs = self._parse_attributes(match.group(1)) - src = attrs.get('src', '') - if src: - self._external_resources[f'image_{len(self._external_resources)}'] = { - 'type': 'image', - 'src': src, - 'alt': attrs.get('alt', ''), - 'width': attrs.get('width', ''), - 'height': attrs.get('height', ''), - 'loading': attrs.get('loading', ''), - 'srcset': attrs.get('srcset', '') - } - - # Extract audio - audio_pattern = r']+)>' - for match in re.finditer(audio_pattern, html_content, re.IGNORECASE): - attrs = self._parse_attributes(match.group(1)) - src = attrs.get('src', '') - if src: - self._external_resources[f'audio_{len(self._external_resources)}'] = { - 'type': 'audio', - 'src': src, - 'controls': 'controls' in attrs, - 'autoplay': 'autoplay' in attrs, - 'loop': 'loop' in attrs, - 'muted': 'muted' in attrs - } - - # Extract video - video_pattern = r']+)>' - for match in re.finditer(video_pattern, html_content, re.IGNORECASE): - attrs = self._parse_attributes(match.group(1)) - src = attrs.get('src', '') - if src: - self._external_resources[f'video_{len(self._external_resources)}'] = { - 'type': 'video', - 'src': src, - 'controls': 'controls' in attrs, - 'autoplay': 'autoplay' in attrs, - 'loop': 'loop' in attrs, - 'muted': 'muted' in attrs, - 'width': attrs.get('width', ''), - 'height': attrs.get('height', ''), - 'poster': attrs.get('poster', '') - } - - # Extract embed/object resources - embed_pattern = r']+)>' - for match in re.finditer(embed_pattern, html_content, re.IGNORECASE): - attrs = self._parse_attributes(match.group(1)) - src = attrs.get('src', '') - if src: - self._external_resources[f'embed_{len(self._external_resources)}'] = { - 'type': 'embed', - 'src': src, - 'content_type': attrs.get('type', ''), - 'width': attrs.get('width', ''), - 'height': attrs.get('height', '') - } - - # Extract iframe sources - iframe_pattern = r']+)>' - for match in re.finditer(iframe_pattern, html_content, re.IGNORECASE): - attrs = self._parse_attributes(match.group(1)) - src = attrs.get('src', '') - if src: - self._external_resources[f'iframe_{len(self._external_resources)}'] = { - 'type': 'iframe', - 'src': src, - 'width': attrs.get('width', ''), - 'height': attrs.get('height', ''), - 'loading': attrs.get('loading', ''), - 'sandbox': attrs.get('sandbox', '') - } - - def _extract_inline_styles(self, html_content: str): - """ - Extract inline CSS styles from HTML content. 
- - Args: - html_content: The HTML content to parse - """ - # Extract style blocks - style_pattern = r']*)>(.*?)' - - for i, match in enumerate(re.finditer(style_pattern, html_content, re.IGNORECASE | re.DOTALL)): - attrs_str = match.group(1) - content = match.group(2).strip() - - attrs = self._parse_attributes(attrs_str) - - if content: - style_block = { - 'content': content, - 'media': attrs.get('media', 'all'), - 'content_type': attrs.get('type', 'text/css') - } - self._inline_styles[f'style_block_{i}'] = style_block - - # Extract inline style attributes (this would be more complex - # as it requires parsing all elements with style attributes) - style_attr_pattern = r'<[^>]+style\s*=\s*["\']([^"\']+)["\'][^>]*>' - - for i, match in enumerate(re.finditer(style_attr_pattern, html_content, re.IGNORECASE)): - style_content = match.group(1) - if style_content: - style_attr = { - 'content': style_content, - 'type': 'attribute' - } - self._inline_styles[f'style_attr_{i}'] = style_attr - - def _extract_inline_scripts(self, html_content: str): - """ - Extract inline JavaScript from HTML content. - - Args: - html_content: The HTML content to parse - """ - # This is already handled in _extract_scripts, but we keep this - # method for consistency and potential future extensions - pass - - def _populate_document(self, document: Document): - """ - Populate the document with extracted resources. - - Args: - document: The document to populate - """ - # Add stylesheets - for stylesheet in self._stylesheets: - document.add_stylesheet(stylesheet) - - # Add scripts - for script in self._scripts: - if script['type'] == 'inline': - document.add_script(script['content']) - else: - # For external scripts, we store them as resources - script_name = f"script_{len(document._resources)}" - document.add_resource(script_name, script) - - # Add external resources - for name, resource in self._external_resources.items(): - document.add_resource(name, resource) - - # Add inline styles as stylesheets - for name, style in self._inline_styles.items(): - if style.get('type') != 'attribute': # Don't add individual style attributes - parsed_style = self._parse_css(style['content']) - if parsed_style: - document.add_stylesheet({ - 'type': 'inline', - 'content': style['content'], - 'parsed': parsed_style, - 'media': style.get('media', 'all') - }) - - def _parse_attributes(self, attr_string: str) -> Dict[str, str]: - """ - Parse HTML attributes from a string. - - Args: - attr_string: String containing HTML attributes - - Returns: - Dictionary of attribute name-value pairs - """ - attrs = {} - - # Regular expression to match attribute="value" or attribute='value' - attr_pattern = r'(\w+)=(?:"([^"]*)"|\'([^\']*)|([^\s>]+))' - - for match in re.finditer(attr_pattern, attr_string): - name = match.group(1).lower() - value = match.group(2) or match.group(3) or match.group(4) or '' - attrs[name] = value - - # Handle standalone attributes (like async, defer) - standalone_pattern = r'\b(\w+)(?!=)' - for match in re.finditer(standalone_pattern, attr_string): - attr_name = match.group(1).lower() - if attr_name not in attrs: - attrs[attr_name] = '' - - return attrs - - def _parse_css(self, css_str: str) -> Dict[str, Dict[str, str]]: - """ - Parse a CSS stylesheet. 
- - Args: - css_str: CSS stylesheet string - - Returns: - Dictionary of selectors and their style properties - """ - stylesheet = {} - - # Remove comments - css_str = re.sub(r'/\*.*?\*/', '', css_str, flags=re.DOTALL) - - # Split into rule sets - rule_sets = css_str.split('}') - - for rule_set in rule_sets: - # Split into selector and declarations - parts = rule_set.split('{', 1) - if len(parts) != 2: - continue - - selector = parts[0].strip() - declarations = parts[1].strip() - - # Parse declarations - style = self._parse_css_declarations(declarations) - - # Add to stylesheet - if selector and style: - stylesheet[selector] = style - - return stylesheet - - def _parse_css_declarations(self, declarations_str: str) -> Dict[str, str]: - """ - Parse CSS declarations. - - Args: - declarations_str: CSS declarations string - - Returns: - Dictionary of CSS properties and values - """ - declarations = {} - - # Split the declarations string into individual declarations - decl_list = [d.strip() for d in declarations_str.split(';') if d.strip()] - - for declaration in decl_list: - # Split into property and value - parts = declaration.split(':', 1) - if len(parts) != 2: - continue - - prop = parts[0].strip().lower() - value = parts[1].strip() - - # Store the declaration - declarations[prop] = value - - return declarations - - def resolve_url(self, url: str, base_url: Optional[str] = None) -> str: - """ - Resolve a relative URL against a base URL. - - Args: - url: The URL to resolve - base_url: The base URL to resolve against - - Returns: - The resolved URL - """ - if base_url and not url.startswith(('http://', 'https://', '//', 'data:')): - return urllib.parse.urljoin(base_url, url) - return url - - def get_resource_dependencies(self, resource: Dict[str, Any]) -> List[str]: - """ - Get the dependencies of a resource (e.g., CSS imports, script dependencies). - - Args: - resource: The resource to analyze - - Returns: - List of dependency URLs - """ - dependencies = [] - - if resource.get('type') == 'external' and 'content' in resource: - content = resource['content'] - - # Check for CSS @import rules - if resource.get('content_type', '').startswith('text/css'): - import_pattern = r'@import\s+(?:url\()?["\']?([^"\'()]+)["\']?\)?' - for match in re.finditer(import_pattern, content, re.IGNORECASE): - dependencies.append(match.group(1)) - - # Check for JavaScript imports/requires (basic detection) - elif resource.get('content_type', '').startswith('text/javascript'): - # ES6 imports - import_pattern = r'import\s+.*?\s+from\s+["\']([^"\']+)["\']' - for match in re.finditer(import_pattern, content): - dependencies.append(match.group(1)) - - # CommonJS requires - require_pattern = r'require\(\s*["\']([^"\']+)["\']\s*\)' - for match in re.finditer(require_pattern, content): - dependencies.append(match.group(1)) - - return dependencies
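
Migration note (not part of the patch itself): the helpers deleted by this commit (parse_html, html_to_document, read_document, HTMLReader, HTMLMetadataReader, HTMLResourceReader, EPUBMetadataReader) were thin wrappers around modules that survive it. Below is a minimal sketch of equivalent usage, assuming epub_reader.py and html_extraction.py keep the call signatures visible in this diff; EPUBReader's own interface is not shown here, so the legacy read_epub function (still re-exported by readers/__init__.py) is used instead, and the helper names load_epub/load_html are hypothetical.

# Hypothetical migration sketch; anything not visible in the diff above is an assumption.
from pyWebLayout.io.readers.epub_reader import read_epub              # still exported by readers/__init__.py
from pyWebLayout.io.readers.html_extraction import parse_html_string  # html_extraction.py is not removed by this patch
from pyWebLayout.abstract.document import Document


def load_epub(path: str) -> Document:
    # The removed read_document(..., format_hint='epub') simply delegated to read_epub.
    return read_epub(path)


def load_html(html_content: str, base_font=None) -> Document:
    # The removed HTMLReader called parse_html_string(html_content, base_font) and
    # copied the resulting blocks into a fresh Document; this does the same, minus
    # the metadata/resource extraction that was deleted alongside it.
    document = Document()
    for block in parse_html_string(html_content, base_font):
        document.add_block(block)
    return document

Keeping callers on html_extraction.parse_html_string directly, rather than reintroducing a wrapper class, matches the direction of this commit: one parsing path instead of three overlapping readers.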