This commit is contained in:
parent
2b1170cac7
commit
28c7b6700b
@@ -35,8 +35,3 @@ from pyWebLayout.concrete.page import Container, Page
from pyWebLayout.abstract.inline import Word


# IO functionality (reading and writing)
from pyWebLayout.io import (
    parse_html, html_to_document,  # HTML parsing
    read_epub  # EPUB reading
)
@@ -11,61 +11,5 @@ pattern as the abstract module.

# Legacy readers (for backward compatibility)
# Legacy functions provided by new HTML reader for backward compatibility
from pyWebLayout.io.readers.html import parse_html_string as parse_html
from pyWebLayout.io.readers.html import read_html_file as html_to_document
from pyWebLayout.io.readers.epub_reader import read_epub

# New decomposed readers
from pyWebLayout.io.readers.html import HTMLReader, read_html, read_html_file, parse_html_string
from pyWebLayout.io.readers.base import BaseReader, MetadataReader, ContentReader, ResourceReader, CompositeReader

# Specialized HTML readers
from pyWebLayout.io.readers.html_metadata import HTMLMetadataReader
from pyWebLayout.io.readers.html_resources import HTMLResourceReader

# HTML extraction parser (the best approach)
from pyWebLayout.io.readers.html_extraction import parse_html_string as parse_html_extraction

# Specialized EPUB readers
from pyWebLayout.io.readers.epub_metadata import EPUBMetadataReader


# Convenience functions using the new architecture
def read_document(source, format_hint=None, **options):
    """
    Read a document using the appropriate reader based on format detection.

    Args:
        source: The source to read (file path, URL, or content)
        format_hint: Optional hint about the format ('html', 'epub', etc.)
        **options: Additional options for reading

    Returns:
        Document: The parsed document
    """
    if format_hint == 'html' or (not format_hint and _is_html_source(source)):
        reader = HTMLReader()
        return reader.read(source, **options)
    elif format_hint == 'epub' or (not format_hint and _is_epub_source(source)):
        # Use legacy EPUB reader for now
        return read_epub(source)
    else:
        # Try HTML reader as fallback
        try:
            reader = HTMLReader()
            if reader.can_read(source):
                return reader.read(source, **options)
        except:
            pass

        raise ValueError(f"Cannot determine format for source: {source}")


def _is_html_source(source):
    """Check if source appears to be HTML."""
    reader = HTMLReader()
    return reader.can_read(source)


def _is_epub_source(source):
    """Check if source appears to be EPUB."""
    if isinstance(source, str):
        return source.lower().endswith('.epub')
    return False
from pyWebLayout.io.readers.epub_reader import EPUBReader
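For reference, a minimal usage sketch of the read_document helper removed in this hunk (not part of the commit; sample.html and sample.epub are placeholder file names):

from pyWebLayout.io import read_document

# An explicit hint skips detection; otherwise HTMLReader.can_read() and the
# .epub suffix check decide which reader handles the source.
doc = read_document("sample.html")
book = read_document("sample.epub", format_hint="epub")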
@@ -9,14 +9,13 @@ using a decomposed architecture pattern.
from .base import BaseReader, MetadataReader, ContentReader, ResourceReader, CompositeReader

# HTML readers (decomposed)
from .html import HTMLReader, read_html, read_html_file, parse_html_string
from .html_metadata import HTMLMetadataReader
from .html_resources import HTMLResourceReader


# EPUB readers
from .epub_reader import read_epub  # Legacy
from .epub_metadata import EPUBMetadataReader  # New decomposed


__all__ = [
    # Base classes
@@ -1,352 +0,0 @@
"""
EPUB metadata reader for pyWebLayout.

This module provides specialized functionality for extracting metadata
from EPUB documents, following the decomposed architecture pattern.
"""

import os
import zipfile
import tempfile
from typing import Dict, Any, Optional, List
import xml.etree.ElementTree as ET
from pyWebLayout.abstract.document import Document, MetadataType
from pyWebLayout.io.readers.base import MetadataReader


# XML namespaces used in EPUB files
NAMESPACES = {
    'opf': 'http://www.idpf.org/2007/opf',
    'dc': 'http://purl.org/dc/elements/1.1/',
    'dcterms': 'http://purl.org/dc/terms/',
}


class EPUBMetadataReader(MetadataReader):
    """
    Specialized reader for extracting metadata from EPUB documents.

    This class handles EPUB package document metadata including
    Dublin Core elements and custom metadata.
    """

    def __init__(self):
        """Initialize the EPUB metadata reader."""
        self._metadata = {}
        self._temp_dir = None
        self._package_path = None

    def extract_metadata(self, epub_path: str, document: Document) -> Dict[str, Any]:
        """
        Extract metadata from EPUB file.

        Args:
            epub_path: Path to the EPUB file
            document: The document to populate with metadata

        Returns:
            Dictionary of extracted metadata
        """
        # Reset internal state
        self._reset()

        try:
            # Extract EPUB to temporary directory
            self._extract_epub(epub_path)

            # Find and parse package document
            self._find_package_document()

            if self._package_path:
                self._parse_package_metadata()

            # Populate document with extracted metadata
            self._populate_document(document)

            return self._metadata

        finally:
            # Clean up temporary files
            self._cleanup()

    def _reset(self):
        """Reset internal state for a new extraction."""
        self._metadata = {}
        self._temp_dir = None
        self._package_path = None

    def _extract_epub(self, epub_path: str):
        """
        Extract EPUB file to temporary directory.

        Args:
            epub_path: Path to the EPUB file
        """
        self._temp_dir = tempfile.mkdtemp()

        with zipfile.ZipFile(epub_path, 'r') as zip_ref:
            zip_ref.extractall(self._temp_dir)

    def _find_package_document(self):
        """Find the package document (content.opf) in the extracted EPUB."""
        # First, try to find it via META-INF/container.xml
        container_path = os.path.join(self._temp_dir, 'META-INF', 'container.xml')

        if os.path.exists(container_path):
            try:
                tree = ET.parse(container_path)
                root = tree.getroot()

                # Find rootfile element
                for rootfile in root.findall('.//{urn:oasis:names:tc:opendocument:xmlns:container}rootfile'):
                    full_path = rootfile.get('full-path')
                    if full_path:
                        self._package_path = os.path.join(self._temp_dir, full_path)
                        if os.path.exists(self._package_path):
                            return
            except ET.ParseError:
                pass

        # Fallback: search for .opf files
        for root, dirs, files in os.walk(self._temp_dir):
            for file in files:
                if file.endswith('.opf'):
                    self._package_path = os.path.join(root, file)
                    return

    def _parse_package_metadata(self):
        """Parse metadata from the package document."""
        if not self._package_path or not os.path.exists(self._package_path):
            return

        try:
            tree = ET.parse(self._package_path)
            root = tree.getroot()

            # Find metadata element
            metadata_elem = root.find('.//{{{0}}}metadata'.format(NAMESPACES['opf']))
            if metadata_elem is None:
                return

            # Parse Dublin Core metadata
            self._parse_dublin_core(metadata_elem)

            # Parse OPF-specific metadata
            self._parse_opf_metadata(metadata_elem)

        except ET.ParseError as e:
            print(f"Error parsing package document: {e}")

    def _parse_dublin_core(self, metadata_elem: ET.Element):
        """
        Parse Dublin Core metadata elements.

        Args:
            metadata_elem: The metadata XML element
        """
        dc_elements = {
            'title': 'title',
            'creator': 'creator',
            'subject': 'subject',
            'description': 'description',
            'publisher': 'publisher',
            'contributor': 'contributor',
            'date': 'date',
            'type': 'type',
            'format': 'format',
            'identifier': 'identifier',
            'source': 'source',
            'language': 'language',
            'relation': 'relation',
            'coverage': 'coverage',
            'rights': 'rights'
        }

        for dc_name, meta_key in dc_elements.items():
            elements = metadata_elem.findall('.//{{{0}}}{1}'.format(NAMESPACES['dc'], dc_name))

            if elements:
                if len(elements) == 1:
                    # Single element
                    text = elements[0].text
                    if text:
                        self._metadata[meta_key] = text.strip()

                    # Handle special attributes
                    elem = elements[0]
                    if dc_name == 'creator':
                        # Check for role attribute
                        role = elem.get('{{{0}}}role'.format(NAMESPACES['opf']))
                        if role:
                            self._metadata[f'{meta_key}_role'] = role

                        # Check for file-as attribute for sorting
                        file_as = elem.get('{{{0}}}file-as'.format(NAMESPACES['opf']))
                        if file_as:
                            self._metadata[f'{meta_key}_file_as'] = file_as

                    elif dc_name == 'identifier':
                        # Check for scheme (ISBN, DOI, etc.)
                        scheme = elem.get('{{{0}}}scheme'.format(NAMESPACES['opf']))
                        if scheme:
                            self._metadata[f'{meta_key}_scheme'] = scheme

                        # Check if this is the unique identifier
                        id_attr = elem.get('id')
                        if id_attr:
                            self._metadata[f'{meta_key}_id'] = id_attr

                    elif dc_name == 'date':
                        # Check for event type
                        event = elem.get('{{{0}}}event'.format(NAMESPACES['opf']))
                        if event:
                            self._metadata[f'{meta_key}_event'] = event

                else:
                    # Multiple elements - store as list
                    values = []
                    for elem in elements:
                        if elem.text:
                            values.append(elem.text.strip())

                    if values:
                        self._metadata[meta_key] = values

    def _parse_opf_metadata(self, metadata_elem: ET.Element):
        """
        Parse OPF-specific metadata elements.

        Args:
            metadata_elem: The metadata XML element
        """
        # Parse meta elements
        meta_elements = metadata_elem.findall('.//{{{0}}}meta'.format(NAMESPACES['opf']))

        for meta in meta_elements:
            name = meta.get('name')
            content = meta.get('content')

            if name and content:
                self._metadata[f'meta_{name}'] = content

        # Parse x-metadata elements (custom metadata)
        x_meta_elements = metadata_elem.findall('.//{{{0}}}x-metadata'.format(NAMESPACES['opf']))

        for x_meta in x_meta_elements:
            for child in x_meta:
                if child.tag and child.text:
                    # Remove namespace prefix for cleaner key names
                    tag_name = child.tag.split('}')[-1] if '}' in child.tag else child.tag
                    self._metadata[f'x_meta_{tag_name}'] = child.text.strip()

    def _populate_document(self, document: Document):
        """
        Populate the document with extracted metadata.

        Args:
            document: The document to populate
        """
        # Map EPUB metadata to document metadata types
        metadata_mapping = {
            'title': MetadataType.TITLE,
            'creator': MetadataType.AUTHOR,
            'description': MetadataType.DESCRIPTION,
            'subject': MetadataType.KEYWORDS,
            'language': MetadataType.LANGUAGE,
            'date': MetadataType.PUBLICATION_DATE,
            'publisher': MetadataType.PUBLISHER,
            'identifier': MetadataType.IDENTIFIER,
        }

        for epub_key, doc_type in metadata_mapping.items():
            if epub_key in self._metadata:
                value = self._metadata[epub_key]

                # Handle list values (like multiple subjects)
                if isinstance(value, list):
                    if epub_key == 'subject':
                        # Join subjects with commas for keywords
                        document.set_metadata(doc_type, ', '.join(value))
                    else:
                        # For other list values, use the first one
                        document.set_metadata(doc_type, value[0])
                else:
                    document.set_metadata(doc_type, value)

        # Handle cover image
        cover_meta = self._metadata.get('meta_cover')
        if cover_meta:
            document.set_metadata(MetadataType.COVER_IMAGE, cover_meta)

        # Store original EPUB metadata for reference
        document.set_metadata(MetadataType.CUSTOM, {
            'epub_metadata': self._metadata
        })

    def _cleanup(self):
        """Clean up temporary files."""
        if self._temp_dir:
            try:
                import shutil
                shutil.rmtree(self._temp_dir, ignore_errors=True)
            except:
                pass
            self._temp_dir = None

    def get_unique_identifier(self) -> Optional[str]:
        """
        Get the unique identifier from the EPUB metadata.

        Returns:
            The unique identifier string, or None if not found
        """
        # Look for identifier with specific ID
        for key, value in self._metadata.items():
            if key.startswith('identifier') and key.endswith('_id'):
                return self._metadata.get('identifier')

        # Fallback to any identifier
        return self._metadata.get('identifier')

    def get_cover_id(self) -> Optional[str]:
        """
        Get the cover image ID from metadata.

        Returns:
            The cover image ID, or None if not found
        """
        return self._metadata.get('meta_cover')

    def get_creators(self) -> List[Dict[str, str]]:
        """
        Get creator information with roles.

        Returns:
            List of creator dictionaries with name, role, and file-as info
        """
        creators = []
        creator_value = self._metadata.get('creator')

        if creator_value:
            if isinstance(creator_value, list):
                # Multiple creators - this is simplified, real implementation
                # would need to correlate with role and file-as attributes
                for creator in creator_value:
                    creators.append({'name': creator})
            else:
                # Single creator
                creator_info = {'name': creator_value}

                # Add role if available
                role = self._metadata.get('creator_role')
                if role:
                    creator_info['role'] = role

                # Add file-as if available
                file_as = self._metadata.get('creator_file_as')
                if file_as:
                    creator_info['file_as'] = file_as

                creators.append(creator_info)

        return creators
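For context, a minimal sketch of how the deleted EPUBMetadataReader was driven (not part of the commit; book.epub is a placeholder, and it assumes Document() can be constructed with no arguments):

from pyWebLayout.abstract.document import Document
from pyWebLayout.io.readers.epub_metadata import EPUBMetadataReader

reader = EPUBMetadataReader()
document = Document()
# extract_metadata() unzips the EPUB to a temp dir, locates content.opf via
# META-INF/container.xml, parses the Dublin Core block, fills the document,
# and returns the raw metadata dict.
meta = reader.extract_metadata("book.epub", document)
print(meta.get("title"), meta.get("creator"))
print(reader.get_creators())  # e.g. [{'name': ..., 'role': ..., 'file_as': ...}]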
@@ -1,186 +0,0 @@
"""
Modern HTML reader for pyWebLayout.

This module provides an HTML reader that uses the html_extraction module
for clean, handler-based parsing using BeautifulSoup.
"""

import os
from typing import Union, Optional
from pyWebLayout.abstract.document import Document
from pyWebLayout.io.readers.base import BaseReader
from pyWebLayout.io.readers.html_metadata import HTMLMetadataReader
from pyWebLayout.io.readers.html_resources import HTMLResourceReader
from pyWebLayout.io.readers.html_extraction import parse_html_string
from pyWebLayout.style import Font


class HTMLReader(BaseReader):
    """
    Modern HTML reader using the html_extraction parser.

    This reader uses the clean, handler-based architecture from html_extraction.py
    for parsing HTML content into pyWebLayout's abstract document structure.
    """

    def __init__(self):
        """Initialize the HTML reader."""
        super().__init__()
        self._metadata_reader = HTMLMetadataReader()
        self._resource_reader = HTMLResourceReader()

    def can_read(self, source: Union[str, bytes]) -> bool:
        """
        Check if this reader can handle the given source.

        Args:
            source: The source to check (file path, URL, or content)

        Returns:
            True if this reader can handle the source, False otherwise
        """
        if isinstance(source, str):
            # Check if it's a file path
            if os.path.isfile(source):
                return source.lower().endswith(('.html', '.htm', '.xhtml'))

            # Check if it's HTML content (very basic check)
            source_lower = source.lower().strip()
            return (source_lower.startswith('<!doctype html') or
                    source_lower.startswith('<html') or
                    '<html' in source_lower[:200])

        elif isinstance(source, bytes):
            # Check if it's HTML content in bytes
            try:
                source_str = source.decode('utf-8', errors='ignore').lower().strip()
                return (source_str.startswith('<!doctype html') or
                        source_str.startswith('<html') or
                        '<html' in source_str[:200])
            except:
                return False

        return False

    def read(self, source: Union[str, bytes], **options) -> Document:
        """
        Read and parse the HTML source into a Document.

        Args:
            source: The HTML source to read (file path, URL, or content)
            **options: Additional options for reading
                - base_url: Base URL for resolving relative links
                - encoding: Character encoding (default: 'utf-8')
                - extract_metadata: Whether to extract metadata (default: True)
                - extract_resources: Whether to extract resources (default: True)
                - base_font: Base font for styling (default: None)

        Returns:
            The parsed Document
        """
        # Get options
        base_url = options.get('base_url')
        encoding = options.get('encoding', 'utf-8')
        extract_metadata = options.get('extract_metadata', True)
        extract_resources = options.get('extract_resources', True)
        base_font = options.get('base_font')

        # Read the HTML content
        html_content = self._read_html_content(source, encoding)

        # Set base URL if not provided and source is a file
        if not base_url and isinstance(source, str) and os.path.isfile(source):
            base_url = f"file://{os.path.dirname(os.path.abspath(source))}/"

        # Create a new document
        document = Document()

        # Extract metadata if enabled
        if extract_metadata and self._metadata_reader:
            self._metadata_reader.extract_metadata(html_content, document)

        # Parse content using html_extraction
        blocks = parse_html_string(html_content, base_font)
        for block in blocks:
            document.add_block(block)

        # Extract resources if enabled
        if extract_resources and self._resource_reader:
            self._resource_reader.extract_resources(html_content, document)

        return document

    def _read_html_content(self, source: Union[str, bytes], encoding: str = 'utf-8') -> str:
        """
        Read HTML content from various sources.

        Args:
            source: The source to read from
            encoding: Character encoding to use

        Returns:
            The HTML content as a string
        """
        if isinstance(source, bytes):
            # Source is already bytes, decode it
            return source.decode(encoding, errors='replace')

        elif isinstance(source, str):
            # Check if it's a file path
            if os.path.isfile(source):
                with open(source, 'r', encoding=encoding, errors='replace') as f:
                    return f.read()
            else:
                # Assume it's HTML content
                return source

        else:
            raise ValueError(f"Unsupported source type: {type(source)}")


def read_html(source: Union[str, bytes], **options) -> Document:
    """
    Convenience function to read HTML content.

    Args:
        source: The HTML source to read (file path, URL, or content)
        **options: Additional options for reading

    Returns:
        The parsed Document
    """
    reader = HTMLReader()
    return reader.read(source, **options)


def read_html_file(file_path: str, **options) -> Document:
    """
    Convenience function to read HTML from a file.

    Args:
        file_path: Path to the HTML file
        **options: Additional options for reading

    Returns:
        The parsed Document
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"HTML file not found: {file_path}")

    reader = HTMLReader()
    return reader.read(file_path, **options)


def parse_html_string(html_content: str, **options) -> Document:
    """
    Convenience function to parse HTML content from a string.

    Args:
        html_content: The HTML content as a string
        **options: Additional options for reading

    Returns:
        The parsed Document
    """
    reader = HTMLReader()
    return reader.read(html_content, **options)
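A short sketch of the deleted HTMLReader convenience path (not part of the commit; page.html is a placeholder file name):

from pyWebLayout.io.readers.html import HTMLReader, read_html_file

# Convenience wrapper: raises FileNotFoundError if the path does not exist.
doc = read_html_file("page.html", extract_resources=False)

# The reader can also sniff raw markup before parsing it.
reader = HTMLReader()
markup = "<html><body><p>hello</p></body></html>"
if reader.can_read(markup):
    doc = reader.read(markup)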
@@ -1,426 +0,0 @@
"""
HTML metadata reader for pyWebLayout.

This module provides specialized functionality for extracting metadata
from HTML documents, following the decomposed architecture pattern.
"""

from typing import Dict, Any, Optional
import re
from pyWebLayout.abstract.document import Document, MetadataType
from pyWebLayout.io.readers.base import MetadataReader


class HTMLMetadataReader(MetadataReader):
    """
    Specialized reader for extracting metadata from HTML documents.

    This class handles HTML meta tags, title elements, and other metadata
    sources like Open Graph tags and JSON-LD structured data.
    """

    def __init__(self):
        """Initialize the HTML metadata reader."""
        self._title = None
        self._meta_tags = {}
        self._og_tags = {}
        self._twitter_tags = {}
        self._json_ld = {}

    def extract_metadata(self, html_content: str, document: Document) -> Dict[str, Any]:
        """
        Extract metadata from HTML content.

        Args:
            html_content: The HTML content to parse
            document: The document to populate with metadata

        Returns:
            Dictionary of extracted metadata
        """
        # Reset internal state
        self._reset()

        # Extract title
        self._extract_title(html_content)

        # Extract meta tags
        self._extract_meta_tags(html_content)

        # Extract Open Graph tags
        self._extract_open_graph(html_content)

        # Extract Twitter Card tags
        self._extract_twitter_cards(html_content)

        # Extract JSON-LD structured data
        self._extract_json_ld(html_content)

        # Populate document with extracted metadata
        self._populate_document(document)

        # Return all extracted metadata
        return {
            'title': self._title,
            'meta_tags': self._meta_tags,
            'open_graph': self._og_tags,
            'twitter_cards': self._twitter_tags,
            'json_ld': self._json_ld
        }

    def _reset(self):
        """Reset internal state for a new extraction."""
        self._title = None
        self._meta_tags = {}
        self._og_tags = {}
        self._twitter_tags = {}
        self._json_ld = {}

    def _extract_title(self, html_content: str):
        """
        Extract the title from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # Look for title tag
        title_match = re.search(r'<title[^>]*>(.*?)</title>', html_content, re.IGNORECASE | re.DOTALL)
        if title_match:
            # Clean up the title text
            self._title = self._clean_text(title_match.group(1))

    def _extract_meta_tags(self, html_content: str):
        """
        Extract meta tags from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # Regular expression to match meta tags
        meta_pattern = r'<meta\s+([^>]+)>'

        for match in re.finditer(meta_pattern, html_content, re.IGNORECASE):
            attrs = self._parse_attributes(match.group(1))

            # Get name and content
            name = attrs.get('name', '').lower()
            content = attrs.get('content', '')

            # Handle different types of meta tags
            if name and content:
                self._meta_tags[name] = content

            # Handle http-equiv meta tags
            http_equiv = attrs.get('http-equiv', '').lower()
            if http_equiv and content:
                self._meta_tags[f'http-equiv:{http_equiv}'] = content

            # Handle charset meta tags
            charset = attrs.get('charset', '')
            if charset:
                self._meta_tags['charset'] = charset

    def _extract_open_graph(self, html_content: str):
        """
        Extract Open Graph meta tags from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # Regular expression to match Open Graph meta tags
        og_pattern = r'<meta\s+property="og:([^"]+)"\s+content="([^"]*)"[^>]*>'

        for match in re.finditer(og_pattern, html_content, re.IGNORECASE):
            property_name = match.group(1)
            content = match.group(2)
            self._og_tags[property_name] = content

    def _extract_twitter_cards(self, html_content: str):
        """
        Extract Twitter Card meta tags from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # Regular expression to match Twitter Card meta tags
        twitter_pattern = r'<meta\s+name="twitter:([^"]+)"\s+content="([^"]*)"[^>]*>'

        for match in re.finditer(twitter_pattern, html_content, re.IGNORECASE):
            property_name = match.group(1)
            content = match.group(2)
            self._twitter_tags[property_name] = content

    def _extract_json_ld(self, html_content: str):
        """
        Extract JSON-LD structured data from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # Regular expression to match JSON-LD script tags
        json_ld_pattern = r'<script[^>]*type="application/ld\+json"[^>]*>(.*?)</script>'

        for match in re.finditer(json_ld_pattern, html_content, re.IGNORECASE | re.DOTALL):
            try:
                import json
                json_content = match.group(1).strip()
                data = json.loads(json_content)

                # Store JSON-LD data by type if available
                if isinstance(data, dict) and '@type' in data:
                    type_name = data['@type']
                    if type_name not in self._json_ld:
                        self._json_ld[type_name] = []
                    self._json_ld[type_name].append(data)
                elif isinstance(data, list):
                    # Handle arrays of structured data
                    for item in data:
                        if isinstance(item, dict) and '@type' in item:
                            type_name = item['@type']
                            if type_name not in self._json_ld:
                                self._json_ld[type_name] = []
                            self._json_ld[type_name].append(item)
            except (json.JSONDecodeError, ImportError):
                # Skip invalid JSON-LD
                continue

    def _populate_document(self, document: Document):
        """
        Populate the document with extracted metadata.

        Args:
            document: The document to populate
        """
        # Set title
        title = self._get_best_title()
        if title:
            document.set_metadata(MetadataType.TITLE, title)

        # Set description
        description = self._get_best_description()
        if description:
            document.set_metadata(MetadataType.DESCRIPTION, description)

        # Set author
        author = self._get_best_author()
        if author:
            document.set_metadata(MetadataType.AUTHOR, author)

        # Set keywords
        keywords = self._get_keywords()
        if keywords:
            document.set_metadata(MetadataType.KEYWORDS, keywords)

        # Set language
        language = self._get_language()
        if language:
            document.set_metadata(MetadataType.LANGUAGE, language)

        # Set cover image
        cover_image = self._get_cover_image()
        if cover_image:
            document.set_metadata(MetadataType.COVER_IMAGE, cover_image)

        # Set publisher
        publisher = self._get_publisher()
        if publisher:
            document.set_metadata(MetadataType.PUBLISHER, publisher)

        # Set publication date
        pub_date = self._get_publication_date()
        if pub_date:
            document.set_metadata(MetadataType.PUBLICATION_DATE, pub_date)

    def _get_best_title(self) -> Optional[str]:
        """Get the best available title from all sources."""
        # Priority order: Open Graph > Twitter > JSON-LD > meta > HTML title

        # Check Open Graph
        if 'title' in self._og_tags:
            return self._og_tags['title']

        # Check Twitter Cards
        if 'title' in self._twitter_tags:
            return self._twitter_tags['title']

        # Check JSON-LD
        for type_name, items in self._json_ld.items():
            for item in items:
                if 'name' in item:
                    return item['name']
                elif 'headline' in item:
                    return item['headline']

        # Check meta tags
        for key in ['title', 'og:title', 'twitter:title']:
            if key in self._meta_tags:
                return self._meta_tags[key]

        # Fall back to HTML title
        return self._title

    def _get_best_description(self) -> Optional[str]:
        """Get the best available description from all sources."""
        # Priority order: Open Graph > Twitter > meta description > JSON-LD

        # Check Open Graph
        if 'description' in self._og_tags:
            return self._og_tags['description']

        # Check Twitter Cards
        if 'description' in self._twitter_tags:
            return self._twitter_tags['description']

        # Check meta description
        if 'description' in self._meta_tags:
            return self._meta_tags['description']

        # Check JSON-LD
        for type_name, items in self._json_ld.items():
            for item in items:
                if 'description' in item:
                    return item['description']

        return None

    def _get_best_author(self) -> Optional[str]:
        """Get the best available author from all sources."""
        # Check meta tags
        if 'author' in self._meta_tags:
            return self._meta_tags['author']

        # Check JSON-LD
        for type_name, items in self._json_ld.items():
            for item in items:
                if 'author' in item:
                    author = item['author']
                    if isinstance(author, dict) and 'name' in author:
                        return author['name']
                    elif isinstance(author, str):
                        return author
                elif 'creator' in item:
                    creator = item['creator']
                    if isinstance(creator, dict) and 'name' in creator:
                        return creator['name']
                    elif isinstance(creator, str):
                        return creator

        return None

    def _get_keywords(self) -> Optional[str]:
        """Get keywords from meta tags."""
        return self._meta_tags.get('keywords')

    def _get_language(self) -> Optional[str]:
        """Get language from meta tags or HTML lang attribute."""
        # Check meta tags first
        if 'language' in self._meta_tags:
            return self._meta_tags['language']

        # Could also extract from html lang attribute if needed
        return None

    def _get_cover_image(self) -> Optional[str]:
        """Get the best available cover image from all sources."""
        # Check Open Graph
        if 'image' in self._og_tags:
            return self._og_tags['image']

        # Check Twitter Cards
        if 'image' in self._twitter_tags:
            return self._twitter_tags['image']

        # Check JSON-LD
        for type_name, items in self._json_ld.items():
            for item in items:
                if 'image' in item:
                    image = item['image']
                    if isinstance(image, dict) and 'url' in image:
                        return image['url']
                    elif isinstance(image, str):
                        return image

        return None

    def _get_publisher(self) -> Optional[str]:
        """Get publisher from JSON-LD or other sources."""
        # Check JSON-LD
        for type_name, items in self._json_ld.items():
            for item in items:
                if 'publisher' in item:
                    publisher = item['publisher']
                    if isinstance(publisher, dict) and 'name' in publisher:
                        return publisher['name']
                    elif isinstance(publisher, str):
                        return publisher

        return None

    def _get_publication_date(self) -> Optional[str]:
        """Get publication date from JSON-LD or other sources."""
        # Check JSON-LD
        for type_name, items in self._json_ld.items():
            for item in items:
                if 'datePublished' in item:
                    return item['datePublished']
                elif 'publishDate' in item:
                    return item['publishDate']

        return None

    def _parse_attributes(self, attr_string: str) -> Dict[str, str]:
        """
        Parse HTML attributes from a string.

        Args:
            attr_string: String containing HTML attributes

        Returns:
            Dictionary of attribute name-value pairs
        """
        attrs = {}

        # Regular expression to match attribute="value" or attribute='value'
        attr_pattern = r'(\w+)=(?:"([^"]*)"|\'([^\']*)\'|([^\s>]+))'

        for match in re.finditer(attr_pattern, attr_string):
            name = match.group(1).lower()
            value = match.group(2) or match.group(3) or match.group(4) or ''
            attrs[name] = value

        # Handle standalone attributes (like charset)
        standalone_pattern = r'\b(\w+)(?!=)'
        for match in re.finditer(standalone_pattern, attr_string):
            attr_name = match.group(1).lower()
            if attr_name not in attrs:
                attrs[attr_name] = ''

        return attrs

    def _clean_text(self, text: str) -> str:
        """
        Clean up text content by removing extra whitespace and HTML entities.

        Args:
            text: The text to clean

        Returns:
            Cleaned text
        """
        # Remove extra whitespace
        cleaned = re.sub(r'\s+', ' ', text).strip()

        # Decode common HTML entities
        entities = {
            '&lt;': '<',
            '&gt;': '>',
            '&amp;': '&',
            '&quot;': '"',
            '&#39;': "'",
            '&nbsp;': ' ',
        }

        for entity, char in entities.items():
            cleaned = cleaned.replace(entity, char)

        return cleaned
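For context, a minimal sketch of the deleted HTMLMetadataReader on a hand-written snippet (not part of the commit):

from pyWebLayout.abstract.document import Document
from pyWebLayout.io.readers.html_metadata import HTMLMetadataReader

html = '''<html><head>
<title>Example</title>
<meta name="author" content="A. Writer">
<meta property="og:title" content="Example (OG)">
</head><body></body></html>'''

reader = HTMLMetadataReader()
result = reader.extract_metadata(html, Document())
# _get_best_title() prefers Open Graph over the <title> element, so the
# document receives "Example (OG)" while result['title'] stays "Example".
print(result['title'], result['open_graph'], result['meta_tags'])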
@@ -1,483 +0,0 @@
"""
HTML resources reader for pyWebLayout.

This module provides specialized functionality for extracting resources
from HTML documents, such as stylesheets, scripts, and external files.
"""

from typing import Dict, Any, Optional, List
import re
import urllib.parse
from pyWebLayout.abstract.document import Document
from pyWebLayout.io.readers.base import ResourceReader


class HTMLResourceReader(ResourceReader):
    """
    Specialized reader for extracting resources from HTML documents.

    This class handles CSS stylesheets, JavaScript files, images,
    and other external resources referenced in HTML.
    """

    def __init__(self):
        """Initialize the HTML resource reader."""
        self._stylesheets = []
        self._scripts = []
        self._external_resources = {}
        self._inline_styles = {}
        self._inline_scripts = []

    def extract_resources(self, html_content: str, document: Document) -> Dict[str, Any]:
        """
        Extract resources from HTML content.

        Args:
            html_content: The HTML content to parse
            document: The document to populate with resources

        Returns:
            Dictionary of extracted resources
        """
        # Reset internal state
        self._reset()

        # Extract stylesheets
        self._extract_stylesheets(html_content)

        # Extract scripts
        self._extract_scripts(html_content)

        # Extract other external resources
        self._extract_external_resources(html_content)

        # Extract inline styles
        self._extract_inline_styles(html_content)

        # Extract inline scripts
        self._extract_inline_scripts(html_content)

        # Populate document with extracted resources
        self._populate_document(document)

        # Return all extracted resources
        return {
            'stylesheets': self._stylesheets,
            'scripts': self._scripts,
            'external_resources': self._external_resources,
            'inline_styles': self._inline_styles,
            'inline_scripts': self._inline_scripts
        }

    def _reset(self):
        """Reset internal state for a new extraction."""
        self._stylesheets = []
        self._scripts = []
        self._external_resources = {}
        self._inline_styles = {}
        self._inline_scripts = []

    def _extract_stylesheets(self, html_content: str):
        """
        Extract CSS stylesheet references from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # Regular expression to match link tags for stylesheets
        link_pattern = r'<link\s+([^>]+)>'

        for match in re.finditer(link_pattern, html_content, re.IGNORECASE):
            attrs = self._parse_attributes(match.group(1))

            # Check if this is a stylesheet
            rel = attrs.get('rel', '').lower()
            if rel == 'stylesheet':
                href = attrs.get('href', '')
                media = attrs.get('media', 'all')
                type_attr = attrs.get('type', 'text/css')

                if href:
                    stylesheet = {
                        'type': 'external',
                        'href': href,
                        'media': media,
                        'content_type': type_attr
                    }
                    self._stylesheets.append(stylesheet)

            # Handle other link types
            elif rel in ('icon', 'shortcut icon', 'apple-touch-icon'):
                href = attrs.get('href', '')
                if href:
                    self._external_resources[f'icon_{len(self._external_resources)}'] = {
                        'type': 'icon',
                        'rel': rel,
                        'href': href,
                        'sizes': attrs.get('sizes', ''),
                        'content_type': attrs.get('type', '')
                    }

            elif rel == 'preload':
                href = attrs.get('href', '')
                if href:
                    self._external_resources[f'preload_{len(self._external_resources)}'] = {
                        'type': 'preload',
                        'href': href,
                        'as': attrs.get('as', ''),
                        'content_type': attrs.get('type', '')
                    }

    def _extract_scripts(self, html_content: str):
        """
        Extract script references from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # Regular expression to match script tags
        script_pattern = r'<script\s*([^>]*)>(.*?)</script>'

        for match in re.finditer(script_pattern, html_content, re.IGNORECASE | re.DOTALL):
            attrs_str = match.group(1)
            content = match.group(2).strip()

            attrs = self._parse_attributes(attrs_str)

            src = attrs.get('src', '')
            script_type = attrs.get('type', 'text/javascript')

            if src:
                # External script
                script = {
                    'type': 'external',
                    'src': src,
                    'content_type': script_type,
                    'async': 'async' in attrs,
                    'defer': 'defer' in attrs,
                    'integrity': attrs.get('integrity', ''),
                    'crossorigin': attrs.get('crossorigin', '')
                }
                self._scripts.append(script)

            elif content:
                # Inline script
                script = {
                    'type': 'inline',
                    'content': content,
                    'content_type': script_type
                }
                self._scripts.append(script)

    def _extract_external_resources(self, html_content: str):
        """
        Extract other external resources from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # Extract images
        img_pattern = r'<img\s+([^>]+)>'
        for match in re.finditer(img_pattern, html_content, re.IGNORECASE):
            attrs = self._parse_attributes(match.group(1))
            src = attrs.get('src', '')
            if src:
                self._external_resources[f'image_{len(self._external_resources)}'] = {
                    'type': 'image',
                    'src': src,
                    'alt': attrs.get('alt', ''),
                    'width': attrs.get('width', ''),
                    'height': attrs.get('height', ''),
                    'loading': attrs.get('loading', ''),
                    'srcset': attrs.get('srcset', '')
                }

        # Extract audio
        audio_pattern = r'<audio\s+([^>]+)>'
        for match in re.finditer(audio_pattern, html_content, re.IGNORECASE):
            attrs = self._parse_attributes(match.group(1))
            src = attrs.get('src', '')
            if src:
                self._external_resources[f'audio_{len(self._external_resources)}'] = {
                    'type': 'audio',
                    'src': src,
                    'controls': 'controls' in attrs,
                    'autoplay': 'autoplay' in attrs,
                    'loop': 'loop' in attrs,
                    'muted': 'muted' in attrs
                }

        # Extract video
        video_pattern = r'<video\s+([^>]+)>'
        for match in re.finditer(video_pattern, html_content, re.IGNORECASE):
            attrs = self._parse_attributes(match.group(1))
            src = attrs.get('src', '')
            if src:
                self._external_resources[f'video_{len(self._external_resources)}'] = {
                    'type': 'video',
                    'src': src,
                    'controls': 'controls' in attrs,
                    'autoplay': 'autoplay' in attrs,
                    'loop': 'loop' in attrs,
                    'muted': 'muted' in attrs,
                    'width': attrs.get('width', ''),
                    'height': attrs.get('height', ''),
                    'poster': attrs.get('poster', '')
                }

        # Extract embed/object resources
        embed_pattern = r'<embed\s+([^>]+)>'
        for match in re.finditer(embed_pattern, html_content, re.IGNORECASE):
            attrs = self._parse_attributes(match.group(1))
            src = attrs.get('src', '')
            if src:
                self._external_resources[f'embed_{len(self._external_resources)}'] = {
                    'type': 'embed',
                    'src': src,
                    'content_type': attrs.get('type', ''),
                    'width': attrs.get('width', ''),
                    'height': attrs.get('height', '')
                }

        # Extract iframe sources
        iframe_pattern = r'<iframe\s+([^>]+)>'
        for match in re.finditer(iframe_pattern, html_content, re.IGNORECASE):
            attrs = self._parse_attributes(match.group(1))
            src = attrs.get('src', '')
            if src:
                self._external_resources[f'iframe_{len(self._external_resources)}'] = {
                    'type': 'iframe',
                    'src': src,
                    'width': attrs.get('width', ''),
                    'height': attrs.get('height', ''),
                    'loading': attrs.get('loading', ''),
                    'sandbox': attrs.get('sandbox', '')
                }

    def _extract_inline_styles(self, html_content: str):
        """
        Extract inline CSS styles from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # Extract style blocks
        style_pattern = r'<style\s*([^>]*)>(.*?)</style>'

        for i, match in enumerate(re.finditer(style_pattern, html_content, re.IGNORECASE | re.DOTALL)):
            attrs_str = match.group(1)
            content = match.group(2).strip()

            attrs = self._parse_attributes(attrs_str)

            if content:
                style_block = {
                    'content': content,
                    'media': attrs.get('media', 'all'),
                    'content_type': attrs.get('type', 'text/css')
                }
                self._inline_styles[f'style_block_{i}'] = style_block

        # Extract inline style attributes (this would be more complex
        # as it requires parsing all elements with style attributes)
        style_attr_pattern = r'<[^>]+style\s*=\s*["\']([^"\']+)["\'][^>]*>'

        for i, match in enumerate(re.finditer(style_attr_pattern, html_content, re.IGNORECASE)):
            style_content = match.group(1)
            if style_content:
                style_attr = {
                    'content': style_content,
                    'type': 'attribute'
                }
                self._inline_styles[f'style_attr_{i}'] = style_attr

    def _extract_inline_scripts(self, html_content: str):
        """
        Extract inline JavaScript from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # This is already handled in _extract_scripts, but we keep this
        # method for consistency and potential future extensions
        pass

    def _populate_document(self, document: Document):
        """
        Populate the document with extracted resources.

        Args:
            document: The document to populate
        """
        # Add stylesheets
        for stylesheet in self._stylesheets:
            document.add_stylesheet(stylesheet)

        # Add scripts
        for script in self._scripts:
            if script['type'] == 'inline':
                document.add_script(script['content'])
            else:
                # For external scripts, we store them as resources
                script_name = f"script_{len(document._resources)}"
                document.add_resource(script_name, script)

        # Add external resources
        for name, resource in self._external_resources.items():
            document.add_resource(name, resource)

        # Add inline styles as stylesheets
        for name, style in self._inline_styles.items():
            if style.get('type') != 'attribute':  # Don't add individual style attributes
                parsed_style = self._parse_css(style['content'])
                if parsed_style:
                    document.add_stylesheet({
                        'type': 'inline',
                        'content': style['content'],
                        'parsed': parsed_style,
                        'media': style.get('media', 'all')
                    })

    def _parse_attributes(self, attr_string: str) -> Dict[str, str]:
        """
        Parse HTML attributes from a string.

        Args:
            attr_string: String containing HTML attributes

        Returns:
            Dictionary of attribute name-value pairs
        """
        attrs = {}

        # Regular expression to match attribute="value" or attribute='value'
        attr_pattern = r'(\w+)=(?:"([^"]*)"|\'([^\']*)\'|([^\s>]+))'

        for match in re.finditer(attr_pattern, attr_string):
            name = match.group(1).lower()
            value = match.group(2) or match.group(3) or match.group(4) or ''
            attrs[name] = value

        # Handle standalone attributes (like async, defer)
        standalone_pattern = r'\b(\w+)(?!=)'
        for match in re.finditer(standalone_pattern, attr_string):
            attr_name = match.group(1).lower()
            if attr_name not in attrs:
                attrs[attr_name] = ''

        return attrs

    def _parse_css(self, css_str: str) -> Dict[str, Dict[str, str]]:
        """
        Parse a CSS stylesheet.

        Args:
            css_str: CSS stylesheet string

        Returns:
            Dictionary of selectors and their style properties
        """
        stylesheet = {}

        # Remove comments
        css_str = re.sub(r'/\*.*?\*/', '', css_str, flags=re.DOTALL)

        # Split into rule sets
        rule_sets = css_str.split('}')

        for rule_set in rule_sets:
            # Split into selector and declarations
            parts = rule_set.split('{', 1)
            if len(parts) != 2:
                continue

            selector = parts[0].strip()
            declarations = parts[1].strip()

            # Parse declarations
            style = self._parse_css_declarations(declarations)

            # Add to stylesheet
            if selector and style:
                stylesheet[selector] = style

        return stylesheet

    def _parse_css_declarations(self, declarations_str: str) -> Dict[str, str]:
        """
        Parse CSS declarations.

        Args:
            declarations_str: CSS declarations string

        Returns:
            Dictionary of CSS properties and values
        """
        declarations = {}

        # Split the declarations string into individual declarations
        decl_list = [d.strip() for d in declarations_str.split(';') if d.strip()]

        for declaration in decl_list:
            # Split into property and value
            parts = declaration.split(':', 1)
            if len(parts) != 2:
                continue

            prop = parts[0].strip().lower()
            value = parts[1].strip()

            # Store the declaration
            declarations[prop] = value

        return declarations

    def resolve_url(self, url: str, base_url: Optional[str] = None) -> str:
        """
        Resolve a relative URL against a base URL.

        Args:
            url: The URL to resolve
            base_url: The base URL to resolve against

        Returns:
            The resolved URL
        """
        if base_url and not url.startswith(('http://', 'https://', '//', 'data:')):
            return urllib.parse.urljoin(base_url, url)
        return url

    def get_resource_dependencies(self, resource: Dict[str, Any]) -> List[str]:
        """
        Get the dependencies of a resource (e.g., CSS imports, script dependencies).

        Args:
            resource: The resource to analyze

        Returns:
            List of dependency URLs
        """
        dependencies = []

        if resource.get('type') == 'external' and 'content' in resource:
            content = resource['content']

            # Check for CSS @import rules
            if resource.get('content_type', '').startswith('text/css'):
                import_pattern = r'@import\s+(?:url\()?["\']?([^"\'()]+)["\']?\)?'
                for match in re.finditer(import_pattern, content, re.IGNORECASE):
                    dependencies.append(match.group(1))

            # Check for JavaScript imports/requires (basic detection)
            elif resource.get('content_type', '').startswith('text/javascript'):
                # ES6 imports
                import_pattern = r'import\s+.*?\s+from\s+["\']([^"\']+)["\']'
                for match in re.finditer(import_pattern, content):
                    dependencies.append(match.group(1))

                # CommonJS requires
                require_pattern = r'require\(\s*["\']([^"\']+)["\']\s*\)'
                for match in re.finditer(require_pattern, content):
                    dependencies.append(match.group(1))

        return dependencies
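Finally, a minimal sketch of the deleted HTMLResourceReader (not part of the commit; the add_stylesheet/add_resource side effects depend on the Document API):

from pyWebLayout.abstract.document import Document
from pyWebLayout.io.readers.html_resources import HTMLResourceReader

html = '<link rel="stylesheet" href="site.css"><img src="logo.png" alt="logo">'
reader = HTMLResourceReader()
resources = reader.extract_resources(html, Document())
print(resources['stylesheets'])          # [{'type': 'external', 'href': 'site.css', ...}]
print(resources['external_resources'])   # {'image_0': {'type': 'image', 'src': 'logo.png', ...}}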