This commit is contained in:
parent 2b1170cac7
commit 28c7b6700b
@@ -35,8 +35,3 @@ from pyWebLayout.concrete.page import Container, Page
from pyWebLayout.abstract.inline import Word


# IO functionality (reading and writing)
from pyWebLayout.io import (
    parse_html, html_to_document,  # HTML parsing
    read_epub  # EPUB reading
)
@@ -11,61 +11,5 @@ pattern as the abstract module.

# Legacy readers (for backward compatibility)
# Legacy functions provided by new HTML reader for backward compatibility
from pyWebLayout.io.readers.html import parse_html_string as parse_html
from pyWebLayout.io.readers.html import read_html_file as html_to_document
from pyWebLayout.io.readers.epub_reader import read_epub

# New decomposed readers
from pyWebLayout.io.readers.epub_reader import EPUBReader
from pyWebLayout.io.readers.html import HTMLReader, read_html, read_html_file, parse_html_string
from pyWebLayout.io.readers.base import BaseReader, MetadataReader, ContentReader, ResourceReader, CompositeReader

# Specialized HTML readers
from pyWebLayout.io.readers.html_metadata import HTMLMetadataReader
from pyWebLayout.io.readers.html_resources import HTMLResourceReader

# HTML extraction parser (the best approach)
from pyWebLayout.io.readers.html_extraction import parse_html_string as parse_html_extraction

# Specialized EPUB readers
from pyWebLayout.io.readers.epub_metadata import EPUBMetadataReader

# Convenience functions using the new architecture
def read_document(source, format_hint=None, **options):
    """
    Read a document using the appropriate reader based on format detection.

    Args:
        source: The source to read (file path, URL, or content)
        format_hint: Optional hint about the format ('html', 'epub', etc.)
        **options: Additional options for reading

    Returns:
        Document: The parsed document
    """
    if format_hint == 'html' or (not format_hint and _is_html_source(source)):
        reader = HTMLReader()
        return reader.read(source, **options)
    elif format_hint == 'epub' or (not format_hint and _is_epub_source(source)):
        # Use legacy EPUB reader for now
        return read_epub(source)
    else:
        # Try HTML reader as fallback
        try:
            reader = HTMLReader()
            if reader.can_read(source):
                return reader.read(source, **options)
        except:
            pass

        raise ValueError(f"Cannot determine format for source: {source}")

def _is_html_source(source):
    """Check if source appears to be HTML."""
    reader = HTMLReader()
    return reader.can_read(source)

def _is_epub_source(source):
    """Check if source appears to be EPUB."""
    if isinstance(source, str):
        return source.lower().endswith('.epub')
    return False
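Usage sketch (not part of the diff): how the read_document helper removed above was typically called; the file names are hypothetical.

from pyWebLayout.io import read_document

# Format is detected from the source when no hint is given
doc = read_document('page.html')

# An explicit hint bypasses detection; extra options are forwarded to the reader
book = read_document('book.epub', format_hint='epub')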
@@ -9,14 +9,13 @@ using a decomposed architecture pattern.
from .base import BaseReader, MetadataReader, ContentReader, ResourceReader, CompositeReader

# HTML readers (decomposed)
from .html import HTMLReader, read_html, read_html_file, parse_html_string
from .html_metadata import HTMLMetadataReader
from .html_resources import HTMLResourceReader


# EPUB readers
from .epub_reader import read_epub  # Legacy
from .epub_metadata import EPUBMetadataReader  # New decomposed

__all__ = [
    # Base classes
@@ -1,352 +0,0 @@
"""
EPUB metadata reader for pyWebLayout.

This module provides specialized functionality for extracting metadata
from EPUB documents, following the decomposed architecture pattern.
"""

import os
import zipfile
import tempfile
from typing import Dict, Any, Optional, List
import xml.etree.ElementTree as ET
from pyWebLayout.abstract.document import Document, MetadataType
from pyWebLayout.io.readers.base import MetadataReader


# XML namespaces used in EPUB files
NAMESPACES = {
    'opf': 'http://www.idpf.org/2007/opf',
    'dc': 'http://purl.org/dc/elements/1.1/',
    'dcterms': 'http://purl.org/dc/terms/',
}


class EPUBMetadataReader(MetadataReader):
    """
    Specialized reader for extracting metadata from EPUB documents.

    This class handles EPUB package document metadata including
    Dublin Core elements and custom metadata.
    """

    def __init__(self):
        """Initialize the EPUB metadata reader."""
        self._metadata = {}
        self._temp_dir = None
        self._package_path = None

    def extract_metadata(self, epub_path: str, document: Document) -> Dict[str, Any]:
        """
        Extract metadata from EPUB file.

        Args:
            epub_path: Path to the EPUB file
            document: The document to populate with metadata

        Returns:
            Dictionary of extracted metadata
        """
        # Reset internal state
        self._reset()

        try:
            # Extract EPUB to temporary directory
            self._extract_epub(epub_path)

            # Find and parse package document
            self._find_package_document()

            if self._package_path:
                self._parse_package_metadata()

            # Populate document with extracted metadata
            self._populate_document(document)

            return self._metadata

        finally:
            # Clean up temporary files
            self._cleanup()

    def _reset(self):
        """Reset internal state for a new extraction."""
        self._metadata = {}
        self._temp_dir = None
        self._package_path = None

    def _extract_epub(self, epub_path: str):
        """
        Extract EPUB file to temporary directory.

        Args:
            epub_path: Path to the EPUB file
        """
        self._temp_dir = tempfile.mkdtemp()

        with zipfile.ZipFile(epub_path, 'r') as zip_ref:
            zip_ref.extractall(self._temp_dir)

    def _find_package_document(self):
        """Find the package document (content.opf) in the extracted EPUB."""
        # First, try to find it via META-INF/container.xml
        container_path = os.path.join(self._temp_dir, 'META-INF', 'container.xml')

        if os.path.exists(container_path):
            try:
                tree = ET.parse(container_path)
                root = tree.getroot()

                # Find rootfile element
                for rootfile in root.findall('.//{urn:oasis:names:tc:opendocument:xmlns:container}rootfile'):
                    full_path = rootfile.get('full-path')
                    if full_path:
                        self._package_path = os.path.join(self._temp_dir, full_path)
                        if os.path.exists(self._package_path):
                            return
            except ET.ParseError:
                pass

        # Fallback: search for .opf files
        for root, dirs, files in os.walk(self._temp_dir):
            for file in files:
                if file.endswith('.opf'):
                    self._package_path = os.path.join(root, file)
                    return

    def _parse_package_metadata(self):
        """Parse metadata from the package document."""
        if not self._package_path or not os.path.exists(self._package_path):
            return

        try:
            tree = ET.parse(self._package_path)
            root = tree.getroot()

            # Find metadata element
            metadata_elem = root.find('.//{{{0}}}metadata'.format(NAMESPACES['opf']))
            if metadata_elem is None:
                return

            # Parse Dublin Core metadata
            self._parse_dublin_core(metadata_elem)

            # Parse OPF-specific metadata
            self._parse_opf_metadata(metadata_elem)

        except ET.ParseError as e:
            print(f"Error parsing package document: {e}")

    def _parse_dublin_core(self, metadata_elem: ET.Element):
        """
        Parse Dublin Core metadata elements.

        Args:
            metadata_elem: The metadata XML element
        """
        dc_elements = {
            'title': 'title',
            'creator': 'creator',
            'subject': 'subject',
            'description': 'description',
            'publisher': 'publisher',
            'contributor': 'contributor',
            'date': 'date',
            'type': 'type',
            'format': 'format',
            'identifier': 'identifier',
            'source': 'source',
            'language': 'language',
            'relation': 'relation',
            'coverage': 'coverage',
            'rights': 'rights'
        }

        for dc_name, meta_key in dc_elements.items():
            elements = metadata_elem.findall('.//{{{0}}}{1}'.format(NAMESPACES['dc'], dc_name))

            if elements:
                if len(elements) == 1:
                    # Single element
                    text = elements[0].text
                    if text:
                        self._metadata[meta_key] = text.strip()

                    # Handle special attributes
                    elem = elements[0]
                    if dc_name == 'creator':
                        # Check for role attribute
                        role = elem.get('{{{0}}}role'.format(NAMESPACES['opf']))
                        if role:
                            self._metadata[f'{meta_key}_role'] = role

                        # Check for file-as attribute for sorting
                        file_as = elem.get('{{{0}}}file-as'.format(NAMESPACES['opf']))
                        if file_as:
                            self._metadata[f'{meta_key}_file_as'] = file_as

                    elif dc_name == 'identifier':
                        # Check for scheme (ISBN, DOI, etc.)
                        scheme = elem.get('{{{0}}}scheme'.format(NAMESPACES['opf']))
                        if scheme:
                            self._metadata[f'{meta_key}_scheme'] = scheme

                        # Check if this is the unique identifier
                        id_attr = elem.get('id')
                        if id_attr:
                            self._metadata[f'{meta_key}_id'] = id_attr

                    elif dc_name == 'date':
                        # Check for event type
                        event = elem.get('{{{0}}}event'.format(NAMESPACES['opf']))
                        if event:
                            self._metadata[f'{meta_key}_event'] = event

                else:
                    # Multiple elements - store as list
                    values = []
                    for elem in elements:
                        if elem.text:
                            values.append(elem.text.strip())

                    if values:
                        self._metadata[meta_key] = values

    def _parse_opf_metadata(self, metadata_elem: ET.Element):
        """
        Parse OPF-specific metadata elements.

        Args:
            metadata_elem: The metadata XML element
        """
        # Parse meta elements
        meta_elements = metadata_elem.findall('.//{{{0}}}meta'.format(NAMESPACES['opf']))

        for meta in meta_elements:
            name = meta.get('name')
            content = meta.get('content')

            if name and content:
                self._metadata[f'meta_{name}'] = content

        # Parse x-metadata elements (custom metadata)
        x_meta_elements = metadata_elem.findall('.//{{{0}}}x-metadata'.format(NAMESPACES['opf']))

        for x_meta in x_meta_elements:
            for child in x_meta:
                if child.tag and child.text:
                    # Remove namespace prefix for cleaner key names
                    tag_name = child.tag.split('}')[-1] if '}' in child.tag else child.tag
                    self._metadata[f'x_meta_{tag_name}'] = child.text.strip()

    def _populate_document(self, document: Document):
        """
        Populate the document with extracted metadata.

        Args:
            document: The document to populate
        """
        # Map EPUB metadata to document metadata types
        metadata_mapping = {
            'title': MetadataType.TITLE,
            'creator': MetadataType.AUTHOR,
            'description': MetadataType.DESCRIPTION,
            'subject': MetadataType.KEYWORDS,
            'language': MetadataType.LANGUAGE,
            'date': MetadataType.PUBLICATION_DATE,
            'publisher': MetadataType.PUBLISHER,
            'identifier': MetadataType.IDENTIFIER,
        }

        for epub_key, doc_type in metadata_mapping.items():
            if epub_key in self._metadata:
                value = self._metadata[epub_key]

                # Handle list values (like multiple subjects)
                if isinstance(value, list):
                    if epub_key == 'subject':
                        # Join subjects with commas for keywords
                        document.set_metadata(doc_type, ', '.join(value))
                    else:
                        # For other list values, use the first one
                        document.set_metadata(doc_type, value[0])
                else:
                    document.set_metadata(doc_type, value)

        # Handle cover image
        cover_meta = self._metadata.get('meta_cover')
        if cover_meta:
            document.set_metadata(MetadataType.COVER_IMAGE, cover_meta)

        # Store original EPUB metadata for reference
        document.set_metadata(MetadataType.CUSTOM, {
            'epub_metadata': self._metadata
        })

    def _cleanup(self):
        """Clean up temporary files."""
        if self._temp_dir:
            try:
                import shutil
                shutil.rmtree(self._temp_dir, ignore_errors=True)
            except:
                pass
            self._temp_dir = None

    def get_unique_identifier(self) -> Optional[str]:
        """
        Get the unique identifier from the EPUB metadata.

        Returns:
            The unique identifier string, or None if not found
        """
        # Look for identifier with specific ID
        for key, value in self._metadata.items():
            if key.startswith('identifier') and key.endswith('_id'):
                return self._metadata.get('identifier')

        # Fallback to any identifier
        return self._metadata.get('identifier')

    def get_cover_id(self) -> Optional[str]:
        """
        Get the cover image ID from metadata.

        Returns:
            The cover image ID, or None if not found
        """
        return self._metadata.get('meta_cover')

    def get_creators(self) -> List[Dict[str, str]]:
        """
        Get creator information with roles.

        Returns:
            List of creator dictionaries with name, role, and file-as info
        """
        creators = []
        creator_value = self._metadata.get('creator')

        if creator_value:
            if isinstance(creator_value, list):
                # Multiple creators - this is simplified, real implementation
                # would need to correlate with role and file-as attributes
                for creator in creator_value:
                    creators.append({'name': creator})
            else:
                # Single creator
                creator_info = {'name': creator_value}

                # Add role if available
                role = self._metadata.get('creator_role')
                if role:
                    creator_info['role'] = role

                # Add file-as if available
                file_as = self._metadata.get('creator_file_as')
                if file_as:
                    creator_info['file_as'] = file_as

                creators.append(creator_info)

        return creators
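Usage sketch (not part of the diff): driving the EPUBMetadataReader deleted above, assuming a Document that exposes the set_metadata method the reader calls; the EPUB path is hypothetical.

from pyWebLayout.abstract.document import Document
from pyWebLayout.io.readers.epub_metadata import EPUBMetadataReader

reader = EPUBMetadataReader()
document = Document()

# Unpacks the EPUB, parses content.opf and fills the document in one call
metadata = reader.extract_metadata('book.epub', document)

print(metadata.get('title'), metadata.get('creator'))
print(reader.get_creators())            # e.g. [{'name': ..., 'role': ..., 'file_as': ...}]
print(reader.get_unique_identifier())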
@@ -1,186 +0,0 @@
"""
Modern HTML reader for pyWebLayout.

This module provides an HTML reader that uses the html_extraction module
for clean, handler-based parsing using BeautifulSoup.
"""

import os
from typing import Union, Optional
from pyWebLayout.abstract.document import Document
from pyWebLayout.io.readers.base import BaseReader
from pyWebLayout.io.readers.html_metadata import HTMLMetadataReader
from pyWebLayout.io.readers.html_resources import HTMLResourceReader
from pyWebLayout.io.readers.html_extraction import parse_html_string
from pyWebLayout.style import Font


class HTMLReader(BaseReader):
    """
    Modern HTML reader using the html_extraction parser.

    This reader uses the clean, handler-based architecture from html_extraction.py
    for parsing HTML content into pyWebLayout's abstract document structure.
    """

    def __init__(self):
        """Initialize the HTML reader."""
        super().__init__()
        self._metadata_reader = HTMLMetadataReader()
        self._resource_reader = HTMLResourceReader()

    def can_read(self, source: Union[str, bytes]) -> bool:
        """
        Check if this reader can handle the given source.

        Args:
            source: The source to check (file path, URL, or content)

        Returns:
            True if this reader can handle the source, False otherwise
        """
        if isinstance(source, str):
            # Check if it's a file path
            if os.path.isfile(source):
                return source.lower().endswith(('.html', '.htm', '.xhtml'))

            # Check if it's HTML content (very basic check)
            source_lower = source.lower().strip()
            return (source_lower.startswith('<!doctype html') or
                    source_lower.startswith('<html') or
                    '<html' in source_lower[:200])

        elif isinstance(source, bytes):
            # Check if it's HTML content in bytes
            try:
                source_str = source.decode('utf-8', errors='ignore').lower().strip()
                return (source_str.startswith('<!doctype html') or
                        source_str.startswith('<html') or
                        '<html' in source_str[:200])
            except:
                return False

        return False

    def read(self, source: Union[str, bytes], **options) -> Document:
        """
        Read and parse the HTML source into a Document.

        Args:
            source: The HTML source to read (file path, URL, or content)
            **options: Additional options for reading
                - base_url: Base URL for resolving relative links
                - encoding: Character encoding (default: 'utf-8')
                - extract_metadata: Whether to extract metadata (default: True)
                - extract_resources: Whether to extract resources (default: True)
                - base_font: Base font for styling (default: None)

        Returns:
            The parsed Document
        """
        # Get options
        base_url = options.get('base_url')
        encoding = options.get('encoding', 'utf-8')
        extract_metadata = options.get('extract_metadata', True)
        extract_resources = options.get('extract_resources', True)
        base_font = options.get('base_font')

        # Read the HTML content
        html_content = self._read_html_content(source, encoding)

        # Set base URL if not provided and source is a file
        if not base_url and isinstance(source, str) and os.path.isfile(source):
            base_url = f"file://{os.path.dirname(os.path.abspath(source))}/"

        # Create a new document
        document = Document()

        # Extract metadata if enabled
        if extract_metadata and self._metadata_reader:
            self._metadata_reader.extract_metadata(html_content, document)

        # Parse content using html_extraction
        blocks = parse_html_string(html_content, base_font)
        for block in blocks:
            document.add_block(block)

        # Extract resources if enabled
        if extract_resources and self._resource_reader:
            self._resource_reader.extract_resources(html_content, document)

        return document

    def _read_html_content(self, source: Union[str, bytes], encoding: str = 'utf-8') -> str:
        """
        Read HTML content from various sources.

        Args:
            source: The source to read from
            encoding: Character encoding to use

        Returns:
            The HTML content as a string
        """
        if isinstance(source, bytes):
            # Source is already bytes, decode it
            return source.decode(encoding, errors='replace')

        elif isinstance(source, str):
            # Check if it's a file path
            if os.path.isfile(source):
                with open(source, 'r', encoding=encoding, errors='replace') as f:
                    return f.read()
            else:
                # Assume it's HTML content
                return source

        else:
            raise ValueError(f"Unsupported source type: {type(source)}")


def read_html(source: Union[str, bytes], **options) -> Document:
    """
    Convenience function to read HTML content.

    Args:
        source: The HTML source to read (file path, URL, or content)
        **options: Additional options for reading

    Returns:
        The parsed Document
    """
    reader = HTMLReader()
    return reader.read(source, **options)


def read_html_file(file_path: str, **options) -> Document:
    """
    Convenience function to read HTML from a file.

    Args:
        file_path: Path to the HTML file
        **options: Additional options for reading

    Returns:
        The parsed Document
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"HTML file not found: {file_path}")

    reader = HTMLReader()
    return reader.read(file_path, **options)


def parse_html_string(html_content: str, **options) -> Document:
    """
    Convenience function to parse HTML content from a string.

    Args:
        html_content: The HTML content as a string
        **options: Additional options for reading

    Returns:
        The parsed Document
    """
    reader = HTMLReader()
    return reader.read(html_content, **options)
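Usage sketch (not part of the diff): the HTMLReader deleted above, fed a raw HTML string and, hypothetically, a file path; assumes the Document API (add_block, set_metadata) that the reader calls.

from pyWebLayout.io.readers.html import HTMLReader, read_html_file

reader = HTMLReader()
html = '<html><head><title>T</title></head><body><p>Hi</p></body></html>'

# can_read accepts file paths, HTML strings or bytes
assert reader.can_read(html)

# read() chains metadata extraction, html_extraction parsing and resource extraction
doc = reader.read(html)

# Convenience wrapper for files; keyword options are passed through to read()
doc = read_html_file('page.html', extract_resources=False)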
@@ -1,426 +0,0 @@
"""
HTML metadata reader for pyWebLayout.

This module provides specialized functionality for extracting metadata
from HTML documents, following the decomposed architecture pattern.
"""

from typing import Dict, Any, Optional
import re
from pyWebLayout.abstract.document import Document, MetadataType
from pyWebLayout.io.readers.base import MetadataReader


class HTMLMetadataReader(MetadataReader):
    """
    Specialized reader for extracting metadata from HTML documents.

    This class handles HTML meta tags, title elements, and other metadata
    sources like Open Graph tags and JSON-LD structured data.
    """

    def __init__(self):
        """Initialize the HTML metadata reader."""
        self._title = None
        self._meta_tags = {}
        self._og_tags = {}
        self._twitter_tags = {}
        self._json_ld = {}

    def extract_metadata(self, html_content: str, document: Document) -> Dict[str, Any]:
        """
        Extract metadata from HTML content.

        Args:
            html_content: The HTML content to parse
            document: The document to populate with metadata

        Returns:
            Dictionary of extracted metadata
        """
        # Reset internal state
        self._reset()

        # Extract title
        self._extract_title(html_content)

        # Extract meta tags
        self._extract_meta_tags(html_content)

        # Extract Open Graph tags
        self._extract_open_graph(html_content)

        # Extract Twitter Card tags
        self._extract_twitter_cards(html_content)

        # Extract JSON-LD structured data
        self._extract_json_ld(html_content)

        # Populate document with extracted metadata
        self._populate_document(document)

        # Return all extracted metadata
        return {
            'title': self._title,
            'meta_tags': self._meta_tags,
            'open_graph': self._og_tags,
            'twitter_cards': self._twitter_tags,
            'json_ld': self._json_ld
        }

    def _reset(self):
        """Reset internal state for a new extraction."""
        self._title = None
        self._meta_tags = {}
        self._og_tags = {}
        self._twitter_tags = {}
        self._json_ld = {}

    def _extract_title(self, html_content: str):
        """
        Extract the title from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # Look for title tag
        title_match = re.search(r'<title[^>]*>(.*?)</title>', html_content, re.IGNORECASE | re.DOTALL)
        if title_match:
            # Clean up the title text
            self._title = self._clean_text(title_match.group(1))

    def _extract_meta_tags(self, html_content: str):
        """
        Extract meta tags from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # Regular expression to match meta tags
        meta_pattern = r'<meta\s+([^>]+)>'

        for match in re.finditer(meta_pattern, html_content, re.IGNORECASE):
            attrs = self._parse_attributes(match.group(1))

            # Get name and content
            name = attrs.get('name', '').lower()
            content = attrs.get('content', '')

            # Handle different types of meta tags
            if name and content:
                self._meta_tags[name] = content

            # Handle http-equiv meta tags
            http_equiv = attrs.get('http-equiv', '').lower()
            if http_equiv and content:
                self._meta_tags[f'http-equiv:{http_equiv}'] = content

            # Handle charset meta tags
            charset = attrs.get('charset', '')
            if charset:
                self._meta_tags['charset'] = charset

    def _extract_open_graph(self, html_content: str):
        """
        Extract Open Graph meta tags from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # Regular expression to match Open Graph meta tags
        og_pattern = r'<meta\s+property="og:([^"]+)"\s+content="([^"]*)"[^>]*>'

        for match in re.finditer(og_pattern, html_content, re.IGNORECASE):
            property_name = match.group(1)
            content = match.group(2)
            self._og_tags[property_name] = content

    def _extract_twitter_cards(self, html_content: str):
        """
        Extract Twitter Card meta tags from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # Regular expression to match Twitter Card meta tags
        twitter_pattern = r'<meta\s+name="twitter:([^"]+)"\s+content="([^"]*)"[^>]*>'

        for match in re.finditer(twitter_pattern, html_content, re.IGNORECASE):
            property_name = match.group(1)
            content = match.group(2)
            self._twitter_tags[property_name] = content

    def _extract_json_ld(self, html_content: str):
        """
        Extract JSON-LD structured data from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # Regular expression to match JSON-LD script tags
        json_ld_pattern = r'<script[^>]*type="application/ld\+json"[^>]*>(.*?)</script>'

        for match in re.finditer(json_ld_pattern, html_content, re.IGNORECASE | re.DOTALL):
            try:
                import json
                json_content = match.group(1).strip()
                data = json.loads(json_content)

                # Store JSON-LD data by type if available
                if isinstance(data, dict) and '@type' in data:
                    type_name = data['@type']
                    if type_name not in self._json_ld:
                        self._json_ld[type_name] = []
                    self._json_ld[type_name].append(data)
                elif isinstance(data, list):
                    # Handle arrays of structured data
                    for item in data:
                        if isinstance(item, dict) and '@type' in item:
                            type_name = item['@type']
                            if type_name not in self._json_ld:
                                self._json_ld[type_name] = []
                            self._json_ld[type_name].append(item)
            except (json.JSONDecodeError, ImportError):
                # Skip invalid JSON-LD
                continue

    def _populate_document(self, document: Document):
        """
        Populate the document with extracted metadata.

        Args:
            document: The document to populate
        """
        # Set title
        title = self._get_best_title()
        if title:
            document.set_metadata(MetadataType.TITLE, title)

        # Set description
        description = self._get_best_description()
        if description:
            document.set_metadata(MetadataType.DESCRIPTION, description)

        # Set author
        author = self._get_best_author()
        if author:
            document.set_metadata(MetadataType.AUTHOR, author)

        # Set keywords
        keywords = self._get_keywords()
        if keywords:
            document.set_metadata(MetadataType.KEYWORDS, keywords)

        # Set language
        language = self._get_language()
        if language:
            document.set_metadata(MetadataType.LANGUAGE, language)

        # Set cover image
        cover_image = self._get_cover_image()
        if cover_image:
            document.set_metadata(MetadataType.COVER_IMAGE, cover_image)

        # Set publisher
        publisher = self._get_publisher()
        if publisher:
            document.set_metadata(MetadataType.PUBLISHER, publisher)

        # Set publication date
        pub_date = self._get_publication_date()
        if pub_date:
            document.set_metadata(MetadataType.PUBLICATION_DATE, pub_date)

    def _get_best_title(self) -> Optional[str]:
        """Get the best available title from all sources."""
        # Priority order: Open Graph > Twitter > JSON-LD > meta > HTML title

        # Check Open Graph
        if 'title' in self._og_tags:
            return self._og_tags['title']

        # Check Twitter Cards
        if 'title' in self._twitter_tags:
            return self._twitter_tags['title']

        # Check JSON-LD
        for type_name, items in self._json_ld.items():
            for item in items:
                if 'name' in item:
                    return item['name']
                elif 'headline' in item:
                    return item['headline']

        # Check meta tags
        for key in ['title', 'og:title', 'twitter:title']:
            if key in self._meta_tags:
                return self._meta_tags[key]

        # Fall back to HTML title
        return self._title

    def _get_best_description(self) -> Optional[str]:
        """Get the best available description from all sources."""
        # Priority order: Open Graph > Twitter > meta description > JSON-LD

        # Check Open Graph
        if 'description' in self._og_tags:
            return self._og_tags['description']

        # Check Twitter Cards
        if 'description' in self._twitter_tags:
            return self._twitter_tags['description']

        # Check meta description
        if 'description' in self._meta_tags:
            return self._meta_tags['description']

        # Check JSON-LD
        for type_name, items in self._json_ld.items():
            for item in items:
                if 'description' in item:
                    return item['description']

        return None

    def _get_best_author(self) -> Optional[str]:
        """Get the best available author from all sources."""
        # Check meta tags
        if 'author' in self._meta_tags:
            return self._meta_tags['author']

        # Check JSON-LD
        for type_name, items in self._json_ld.items():
            for item in items:
                if 'author' in item:
                    author = item['author']
                    if isinstance(author, dict) and 'name' in author:
                        return author['name']
                    elif isinstance(author, str):
                        return author
                elif 'creator' in item:
                    creator = item['creator']
                    if isinstance(creator, dict) and 'name' in creator:
                        return creator['name']
                    elif isinstance(creator, str):
                        return creator

        return None

    def _get_keywords(self) -> Optional[str]:
        """Get keywords from meta tags."""
        return self._meta_tags.get('keywords')

    def _get_language(self) -> Optional[str]:
        """Get language from meta tags or HTML lang attribute."""
        # Check meta tags first
        if 'language' in self._meta_tags:
            return self._meta_tags['language']

        # Could also extract from html lang attribute if needed
        return None

    def _get_cover_image(self) -> Optional[str]:
        """Get the best available cover image from all sources."""
        # Check Open Graph
        if 'image' in self._og_tags:
            return self._og_tags['image']

        # Check Twitter Cards
        if 'image' in self._twitter_tags:
            return self._twitter_tags['image']

        # Check JSON-LD
        for type_name, items in self._json_ld.items():
            for item in items:
                if 'image' in item:
                    image = item['image']
                    if isinstance(image, dict) and 'url' in image:
                        return image['url']
                    elif isinstance(image, str):
                        return image

        return None

    def _get_publisher(self) -> Optional[str]:
        """Get publisher from JSON-LD or other sources."""
        # Check JSON-LD
        for type_name, items in self._json_ld.items():
            for item in items:
                if 'publisher' in item:
                    publisher = item['publisher']
                    if isinstance(publisher, dict) and 'name' in publisher:
                        return publisher['name']
                    elif isinstance(publisher, str):
                        return publisher

        return None

    def _get_publication_date(self) -> Optional[str]:
        """Get publication date from JSON-LD or other sources."""
        # Check JSON-LD
        for type_name, items in self._json_ld.items():
            for item in items:
                if 'datePublished' in item:
                    return item['datePublished']
                elif 'publishDate' in item:
                    return item['publishDate']

        return None

    def _parse_attributes(self, attr_string: str) -> Dict[str, str]:
        """
        Parse HTML attributes from a string.

        Args:
            attr_string: String containing HTML attributes

        Returns:
            Dictionary of attribute name-value pairs
        """
        attrs = {}

        # Regular expression to match attribute="value" or attribute='value'
        attr_pattern = r'(\w+)=(?:"([^"]*)"|\'([^\']*)|([^\s>]+))'

        for match in re.finditer(attr_pattern, attr_string):
            name = match.group(1).lower()
            value = match.group(2) or match.group(3) or match.group(4) or ''
            attrs[name] = value

        # Handle standalone attributes (like charset)
        standalone_pattern = r'\b(\w+)(?!=)'
        for match in re.finditer(standalone_pattern, attr_string):
            attr_name = match.group(1).lower()
            if attr_name not in attrs:
                attrs[attr_name] = ''

        return attrs

    def _clean_text(self, text: str) -> str:
        """
        Clean up text content by removing extra whitespace and HTML entities.

        Args:
            text: The text to clean

        Returns:
            Cleaned text
        """
        # Remove extra whitespace
        cleaned = re.sub(r'\s+', ' ', text).strip()

        # Decode common HTML entities
        entities = {
            '&lt;': '<',
            '&gt;': '>',
            '&amp;': '&',
            '&quot;': '"',
            '&#39;': "'",
            '&nbsp;': ' ',
        }

        for entity, char in entities.items():
            cleaned = cleaned.replace(entity, char)

        return cleaned
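Usage sketch (not part of the diff): the HTMLMetadataReader deleted above, assuming Document exposes the set_metadata method the reader calls; the HTML snippet is hypothetical.

from pyWebLayout.abstract.document import Document
from pyWebLayout.io.readers.html_metadata import HTMLMetadataReader

html = '''<html><head>
  <title>Example</title>
  <meta name="author" content="Jane Doe">
  <meta property="og:title" content="OG title">
</head><body></body></html>'''

reader = HTMLMetadataReader()
metadata = reader.extract_metadata(html, Document())

# Open Graph data takes priority over the <title> element in _get_best_title()
print(metadata['open_graph'])   # {'title': 'OG title'}
print(metadata['meta_tags'])    # {'author': 'Jane Doe'}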
@@ -1,483 +0,0 @@
"""
HTML resources reader for pyWebLayout.

This module provides specialized functionality for extracting resources
from HTML documents, such as stylesheets, scripts, and external files.
"""

from typing import Dict, Any, Optional, List
import re
import urllib.parse
from pyWebLayout.abstract.document import Document
from pyWebLayout.io.readers.base import ResourceReader


class HTMLResourceReader(ResourceReader):
    """
    Specialized reader for extracting resources from HTML documents.

    This class handles CSS stylesheets, JavaScript files, images,
    and other external resources referenced in HTML.
    """

    def __init__(self):
        """Initialize the HTML resource reader."""
        self._stylesheets = []
        self._scripts = []
        self._external_resources = {}
        self._inline_styles = {}
        self._inline_scripts = []

    def extract_resources(self, html_content: str, document: Document) -> Dict[str, Any]:
        """
        Extract resources from HTML content.

        Args:
            html_content: The HTML content to parse
            document: The document to populate with resources

        Returns:
            Dictionary of extracted resources
        """
        # Reset internal state
        self._reset()

        # Extract stylesheets
        self._extract_stylesheets(html_content)

        # Extract scripts
        self._extract_scripts(html_content)

        # Extract other external resources
        self._extract_external_resources(html_content)

        # Extract inline styles
        self._extract_inline_styles(html_content)

        # Extract inline scripts
        self._extract_inline_scripts(html_content)

        # Populate document with extracted resources
        self._populate_document(document)

        # Return all extracted resources
        return {
            'stylesheets': self._stylesheets,
            'scripts': self._scripts,
            'external_resources': self._external_resources,
            'inline_styles': self._inline_styles,
            'inline_scripts': self._inline_scripts
        }

    def _reset(self):
        """Reset internal state for a new extraction."""
        self._stylesheets = []
        self._scripts = []
        self._external_resources = {}
        self._inline_styles = {}
        self._inline_scripts = []

    def _extract_stylesheets(self, html_content: str):
        """
        Extract CSS stylesheet references from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # Regular expression to match link tags for stylesheets
        link_pattern = r'<link\s+([^>]+)>'

        for match in re.finditer(link_pattern, html_content, re.IGNORECASE):
            attrs = self._parse_attributes(match.group(1))

            # Check if this is a stylesheet
            rel = attrs.get('rel', '').lower()
            if rel == 'stylesheet':
                href = attrs.get('href', '')
                media = attrs.get('media', 'all')
                type_attr = attrs.get('type', 'text/css')

                if href:
                    stylesheet = {
                        'type': 'external',
                        'href': href,
                        'media': media,
                        'content_type': type_attr
                    }
                    self._stylesheets.append(stylesheet)

            # Handle other link types
            elif rel in ('icon', 'shortcut icon', 'apple-touch-icon'):
                href = attrs.get('href', '')
                if href:
                    self._external_resources[f'icon_{len(self._external_resources)}'] = {
                        'type': 'icon',
                        'rel': rel,
                        'href': href,
                        'sizes': attrs.get('sizes', ''),
                        'content_type': attrs.get('type', '')
                    }

            elif rel == 'preload':
                href = attrs.get('href', '')
                if href:
                    self._external_resources[f'preload_{len(self._external_resources)}'] = {
                        'type': 'preload',
                        'href': href,
                        'as': attrs.get('as', ''),
                        'content_type': attrs.get('type', '')
                    }

    def _extract_scripts(self, html_content: str):
        """
        Extract script references from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # Regular expression to match script tags
        script_pattern = r'<script\s*([^>]*)>(.*?)</script>'

        for match in re.finditer(script_pattern, html_content, re.IGNORECASE | re.DOTALL):
            attrs_str = match.group(1)
            content = match.group(2).strip()

            attrs = self._parse_attributes(attrs_str)

            src = attrs.get('src', '')
            script_type = attrs.get('type', 'text/javascript')

            if src:
                # External script
                script = {
                    'type': 'external',
                    'src': src,
                    'content_type': script_type,
                    'async': 'async' in attrs,
                    'defer': 'defer' in attrs,
                    'integrity': attrs.get('integrity', ''),
                    'crossorigin': attrs.get('crossorigin', '')
                }
                self._scripts.append(script)

            elif content:
                # Inline script
                script = {
                    'type': 'inline',
                    'content': content,
                    'content_type': script_type
                }
                self._scripts.append(script)

    def _extract_external_resources(self, html_content: str):
        """
        Extract other external resources from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # Extract images
        img_pattern = r'<img\s+([^>]+)>'
        for match in re.finditer(img_pattern, html_content, re.IGNORECASE):
            attrs = self._parse_attributes(match.group(1))
            src = attrs.get('src', '')
            if src:
                self._external_resources[f'image_{len(self._external_resources)}'] = {
                    'type': 'image',
                    'src': src,
                    'alt': attrs.get('alt', ''),
                    'width': attrs.get('width', ''),
                    'height': attrs.get('height', ''),
                    'loading': attrs.get('loading', ''),
                    'srcset': attrs.get('srcset', '')
                }

        # Extract audio
        audio_pattern = r'<audio\s+([^>]+)>'
        for match in re.finditer(audio_pattern, html_content, re.IGNORECASE):
            attrs = self._parse_attributes(match.group(1))
            src = attrs.get('src', '')
            if src:
                self._external_resources[f'audio_{len(self._external_resources)}'] = {
                    'type': 'audio',
                    'src': src,
                    'controls': 'controls' in attrs,
                    'autoplay': 'autoplay' in attrs,
                    'loop': 'loop' in attrs,
                    'muted': 'muted' in attrs
                }

        # Extract video
        video_pattern = r'<video\s+([^>]+)>'
        for match in re.finditer(video_pattern, html_content, re.IGNORECASE):
            attrs = self._parse_attributes(match.group(1))
            src = attrs.get('src', '')
            if src:
                self._external_resources[f'video_{len(self._external_resources)}'] = {
                    'type': 'video',
                    'src': src,
                    'controls': 'controls' in attrs,
                    'autoplay': 'autoplay' in attrs,
                    'loop': 'loop' in attrs,
                    'muted': 'muted' in attrs,
                    'width': attrs.get('width', ''),
                    'height': attrs.get('height', ''),
                    'poster': attrs.get('poster', '')
                }

        # Extract embed/object resources
        embed_pattern = r'<embed\s+([^>]+)>'
        for match in re.finditer(embed_pattern, html_content, re.IGNORECASE):
            attrs = self._parse_attributes(match.group(1))
            src = attrs.get('src', '')
            if src:
                self._external_resources[f'embed_{len(self._external_resources)}'] = {
                    'type': 'embed',
                    'src': src,
                    'content_type': attrs.get('type', ''),
                    'width': attrs.get('width', ''),
                    'height': attrs.get('height', '')
                }

        # Extract iframe sources
        iframe_pattern = r'<iframe\s+([^>]+)>'
        for match in re.finditer(iframe_pattern, html_content, re.IGNORECASE):
            attrs = self._parse_attributes(match.group(1))
            src = attrs.get('src', '')
            if src:
                self._external_resources[f'iframe_{len(self._external_resources)}'] = {
                    'type': 'iframe',
                    'src': src,
                    'width': attrs.get('width', ''),
                    'height': attrs.get('height', ''),
                    'loading': attrs.get('loading', ''),
                    'sandbox': attrs.get('sandbox', '')
                }

    def _extract_inline_styles(self, html_content: str):
        """
        Extract inline CSS styles from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # Extract style blocks
        style_pattern = r'<style\s*([^>]*)>(.*?)</style>'

        for i, match in enumerate(re.finditer(style_pattern, html_content, re.IGNORECASE | re.DOTALL)):
            attrs_str = match.group(1)
            content = match.group(2).strip()

            attrs = self._parse_attributes(attrs_str)

            if content:
                style_block = {
                    'content': content,
                    'media': attrs.get('media', 'all'),
                    'content_type': attrs.get('type', 'text/css')
                }
                self._inline_styles[f'style_block_{i}'] = style_block

        # Extract inline style attributes (this would be more complex
        # as it requires parsing all elements with style attributes)
        style_attr_pattern = r'<[^>]+style\s*=\s*["\']([^"\']+)["\'][^>]*>'

        for i, match in enumerate(re.finditer(style_attr_pattern, html_content, re.IGNORECASE)):
            style_content = match.group(1)
            if style_content:
                style_attr = {
                    'content': style_content,
                    'type': 'attribute'
                }
                self._inline_styles[f'style_attr_{i}'] = style_attr

    def _extract_inline_scripts(self, html_content: str):
        """
        Extract inline JavaScript from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # This is already handled in _extract_scripts, but we keep this
        # method for consistency and potential future extensions
        pass

    def _populate_document(self, document: Document):
        """
        Populate the document with extracted resources.

        Args:
            document: The document to populate
        """
        # Add stylesheets
        for stylesheet in self._stylesheets:
            document.add_stylesheet(stylesheet)

        # Add scripts
        for script in self._scripts:
            if script['type'] == 'inline':
                document.add_script(script['content'])
            else:
                # For external scripts, we store them as resources
                script_name = f"script_{len(document._resources)}"
                document.add_resource(script_name, script)

        # Add external resources
        for name, resource in self._external_resources.items():
            document.add_resource(name, resource)

        # Add inline styles as stylesheets
        for name, style in self._inline_styles.items():
            if style.get('type') != 'attribute':  # Don't add individual style attributes
                parsed_style = self._parse_css(style['content'])
                if parsed_style:
                    document.add_stylesheet({
                        'type': 'inline',
                        'content': style['content'],
                        'parsed': parsed_style,
                        'media': style.get('media', 'all')
                    })

    def _parse_attributes(self, attr_string: str) -> Dict[str, str]:
        """
        Parse HTML attributes from a string.

        Args:
            attr_string: String containing HTML attributes

        Returns:
            Dictionary of attribute name-value pairs
        """
        attrs = {}

        # Regular expression to match attribute="value" or attribute='value'
        attr_pattern = r'(\w+)=(?:"([^"]*)"|\'([^\']*)|([^\s>]+))'

        for match in re.finditer(attr_pattern, attr_string):
            name = match.group(1).lower()
            value = match.group(2) or match.group(3) or match.group(4) or ''
            attrs[name] = value

        # Handle standalone attributes (like async, defer)
        standalone_pattern = r'\b(\w+)(?!=)'
        for match in re.finditer(standalone_pattern, attr_string):
            attr_name = match.group(1).lower()
            if attr_name not in attrs:
                attrs[attr_name] = ''

        return attrs

    def _parse_css(self, css_str: str) -> Dict[str, Dict[str, str]]:
        """
        Parse a CSS stylesheet.

        Args:
            css_str: CSS stylesheet string

        Returns:
            Dictionary of selectors and their style properties
        """
        stylesheet = {}

        # Remove comments
        css_str = re.sub(r'/\*.*?\*/', '', css_str, flags=re.DOTALL)

        # Split into rule sets
        rule_sets = css_str.split('}')

        for rule_set in rule_sets:
            # Split into selector and declarations
            parts = rule_set.split('{', 1)
            if len(parts) != 2:
                continue

            selector = parts[0].strip()
            declarations = parts[1].strip()

            # Parse declarations
            style = self._parse_css_declarations(declarations)

            # Add to stylesheet
            if selector and style:
                stylesheet[selector] = style

        return stylesheet

    def _parse_css_declarations(self, declarations_str: str) -> Dict[str, str]:
        """
        Parse CSS declarations.

        Args:
            declarations_str: CSS declarations string

        Returns:
            Dictionary of CSS properties and values
        """
        declarations = {}

        # Split the declarations string into individual declarations
        decl_list = [d.strip() for d in declarations_str.split(';') if d.strip()]

        for declaration in decl_list:
            # Split into property and value
            parts = declaration.split(':', 1)
            if len(parts) != 2:
                continue

            prop = parts[0].strip().lower()
            value = parts[1].strip()

            # Store the declaration
            declarations[prop] = value

        return declarations

    def resolve_url(self, url: str, base_url: Optional[str] = None) -> str:
        """
        Resolve a relative URL against a base URL.

        Args:
            url: The URL to resolve
            base_url: The base URL to resolve against

        Returns:
            The resolved URL
        """
        if base_url and not url.startswith(('http://', 'https://', '//', 'data:')):
            return urllib.parse.urljoin(base_url, url)
        return url

    def get_resource_dependencies(self, resource: Dict[str, Any]) -> List[str]:
        """
        Get the dependencies of a resource (e.g., CSS imports, script dependencies).

        Args:
            resource: The resource to analyze

        Returns:
            List of dependency URLs
        """
        dependencies = []

        if resource.get('type') == 'external' and 'content' in resource:
            content = resource['content']

            # Check for CSS @import rules
            if resource.get('content_type', '').startswith('text/css'):
                import_pattern = r'@import\s+(?:url\()?["\']?([^"\'()]+)["\']?\)?'
                for match in re.finditer(import_pattern, content, re.IGNORECASE):
                    dependencies.append(match.group(1))

            # Check for JavaScript imports/requires (basic detection)
            elif resource.get('content_type', '').startswith('text/javascript'):
                # ES6 imports
                import_pattern = r'import\s+.*?\s+from\s+["\']([^"\']+)["\']'
                for match in re.finditer(import_pattern, content):
                    dependencies.append(match.group(1))

                # CommonJS requires
                require_pattern = r'require\(\s*["\']([^"\']+)["\']\s*\)'
                for match in re.finditer(require_pattern, content):
                    dependencies.append(match.group(1))

        return dependencies
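Usage sketch (not part of the diff): the HTMLResourceReader deleted above, assuming Document provides the add_stylesheet/add_script/add_resource hooks the reader expects; the HTML snippet and URLs are hypothetical.

from pyWebLayout.abstract.document import Document
from pyWebLayout.io.readers.html_resources import HTMLResourceReader

html = '''<html><head>
  <link rel="stylesheet" href="site.css">
  <script src="app.js" defer></script>
</head><body><img src="logo.png" alt="logo"></body></html>'''

reader = HTMLResourceReader()
resources = reader.extract_resources(html, Document())

print(resources['stylesheets'])          # [{'type': 'external', 'href': 'site.css', ...}]
print(resources['scripts'][0]['defer'])  # True
print(reader.resolve_url('logo.png', 'https://example.com/docs/'))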