removed more unneed junk
Some checks failed
Python CI / test (push) Failing after 45s

Duncan Tourolle 2025-06-07 19:16:27 +02:00
parent 2b1170cac7
commit 28c7b6700b
7 changed files with 4 additions and 1513 deletions


@@ -35,8 +35,3 @@ from pyWebLayout.concrete.page import Container, Page
from pyWebLayout.abstract.inline import Word
# IO functionality (reading and writing)
from pyWebLayout.io import (
    parse_html, html_to_document,  # HTML parsing
    read_epub  # EPUB reading
)


@@ -11,61 +11,5 @@ pattern as the abstract module.
# Legacy readers (for backward compatibility)
# Legacy functions provided by the new HTML reader for backward compatibility
from pyWebLayout.io.readers.html import parse_html_string as parse_html
from pyWebLayout.io.readers.html import read_html_file as html_to_document
from pyWebLayout.io.readers.epub_reader import read_epub

# New decomposed readers
from pyWebLayout.io.readers.html import HTMLReader, read_html, read_html_file, parse_html_string
from pyWebLayout.io.readers.base import BaseReader, MetadataReader, ContentReader, ResourceReader, CompositeReader

# Specialized HTML readers
from pyWebLayout.io.readers.html_metadata import HTMLMetadataReader
from pyWebLayout.io.readers.html_resources import HTMLResourceReader

# HTML extraction parser (handler-based approach)
from pyWebLayout.io.readers.html_extraction import parse_html_string as parse_html_extraction

# Specialized EPUB readers
from pyWebLayout.io.readers.epub_metadata import EPUBMetadataReader


# Convenience functions using the new architecture
def read_document(source, format_hint=None, **options):
    """
    Read a document using the appropriate reader based on format detection.

    Args:
        source: The source to read (file path, URL, or content)
        format_hint: Optional hint about the format ('html', 'epub', etc.)
        **options: Additional options for reading

    Returns:
        Document: The parsed document
    """
    if format_hint == 'html' or (not format_hint and _is_html_source(source)):
        reader = HTMLReader()
        return reader.read(source, **options)
    elif format_hint == 'epub' or (not format_hint and _is_epub_source(source)):
        # Use the legacy EPUB reader for now
        return read_epub(source)
    else:
        # Try the HTML reader as a fallback
        try:
            reader = HTMLReader()
            if reader.can_read(source):
                return reader.read(source, **options)
        except Exception:
            pass
        raise ValueError(f"Cannot determine format for source: {source}")


def _is_html_source(source):
    """Check if the source appears to be HTML."""
    reader = HTMLReader()
    return reader.can_read(source)


def _is_epub_source(source):
    """Check if the source appears to be an EPUB file."""
    if isinstance(source, str):
        return source.lower().endswith('.epub')
    return False


from pyWebLayout.io.readers.epub_reader import EPUBReader
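
For orientation, here is a minimal sketch of how this convenience entry point could be exercised, assuming it is still exposed from pyWebLayout.io; the sample paths and option values are placeholders, not files from the repository.

from pyWebLayout.io import read_document

# Explicit hint: dispatches straight to HTMLReader.read() with the extra options
page = read_document("samples/page.html", format_hint="html", encoding="utf-8")

# No hint: falls back to HTMLReader.can_read() sniffing or the .epub extension check
book = read_document("samples/book.epub")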


@@ -9,14 +9,13 @@ using a decomposed architecture pattern.
from .base import BaseReader, MetadataReader, ContentReader, ResourceReader, CompositeReader
# HTML readers (decomposed)
from .html import HTMLReader, read_html, read_html_file, parse_html_string
from .html_metadata import HTMLMetadataReader
from .html_resources import HTMLResourceReader
# EPUB readers
from .epub_reader import read_epub # Legacy
from .epub_metadata import EPUBMetadataReader # New decomposed
__all__ = [
    # Base classes


@@ -1,352 +0,0 @@
"""
EPUB metadata reader for pyWebLayout.
This module provides specialized functionality for extracting metadata
from EPUB documents, following the decomposed architecture pattern.
"""
import os
import zipfile
import tempfile
from typing import Dict, Any, Optional, List
import xml.etree.ElementTree as ET
from pyWebLayout.abstract.document import Document, MetadataType
from pyWebLayout.io.readers.base import MetadataReader
# XML namespaces used in EPUB files
NAMESPACES = {
'opf': 'http://www.idpf.org/2007/opf',
'dc': 'http://purl.org/dc/elements/1.1/',
'dcterms': 'http://purl.org/dc/terms/',
}
class EPUBMetadataReader(MetadataReader):
"""
Specialized reader for extracting metadata from EPUB documents.
This class handles EPUB package document metadata including
Dublin Core elements and custom metadata.
"""
def __init__(self):
"""Initialize the EPUB metadata reader."""
self._metadata = {}
self._temp_dir = None
self._package_path = None
def extract_metadata(self, epub_path: str, document: Document) -> Dict[str, Any]:
"""
Extract metadata from EPUB file.
Args:
epub_path: Path to the EPUB file
document: The document to populate with metadata
Returns:
Dictionary of extracted metadata
"""
# Reset internal state
self._reset()
try:
# Extract EPUB to temporary directory
self._extract_epub(epub_path)
# Find and parse package document
self._find_package_document()
if self._package_path:
self._parse_package_metadata()
# Populate document with extracted metadata
self._populate_document(document)
return self._metadata
finally:
# Clean up temporary files
self._cleanup()
def _reset(self):
"""Reset internal state for a new extraction."""
self._metadata = {}
self._temp_dir = None
self._package_path = None
def _extract_epub(self, epub_path: str):
"""
Extract EPUB file to temporary directory.
Args:
epub_path: Path to the EPUB file
"""
self._temp_dir = tempfile.mkdtemp()
with zipfile.ZipFile(epub_path, 'r') as zip_ref:
zip_ref.extractall(self._temp_dir)
def _find_package_document(self):
"""Find the package document (content.opf) in the extracted EPUB."""
# First, try to find it via META-INF/container.xml
container_path = os.path.join(self._temp_dir, 'META-INF', 'container.xml')
if os.path.exists(container_path):
try:
tree = ET.parse(container_path)
root = tree.getroot()
# Find rootfile element
for rootfile in root.findall('.//{urn:oasis:names:tc:opendocument:xmlns:container}rootfile'):
full_path = rootfile.get('full-path')
if full_path:
self._package_path = os.path.join(self._temp_dir, full_path)
if os.path.exists(self._package_path):
return
except ET.ParseError:
pass
# Fallback: search for .opf files
for root, dirs, files in os.walk(self._temp_dir):
for file in files:
if file.endswith('.opf'):
self._package_path = os.path.join(root, file)
return
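
The lookup above follows the standard OCF layout, where META-INF/container.xml points at the package document. Below is a small, self-contained sketch of that file and the same namespaced query used above; the container content is illustrative only.

import xml.etree.ElementTree as ET

# Minimal META-INF/container.xml as found in typical EPUBs (illustrative content)
CONTAINER_XML = """<?xml version="1.0" encoding="UTF-8"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
  <rootfiles>
    <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
  </rootfiles>
</container>"""

root = ET.fromstring(CONTAINER_XML)
container_ns = "{urn:oasis:names:tc:opendocument:xmlns:container}"
for rootfile in root.findall(f".//{container_ns}rootfile"):
    print(rootfile.get("full-path"))  # OEBPS/content.opf
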
def _parse_package_metadata(self):
"""Parse metadata from the package document."""
if not self._package_path or not os.path.exists(self._package_path):
return
try:
tree = ET.parse(self._package_path)
root = tree.getroot()
# Find metadata element
metadata_elem = root.find('.//{{{0}}}metadata'.format(NAMESPACES['opf']))
if metadata_elem is None:
return
# Parse Dublin Core metadata
self._parse_dublin_core(metadata_elem)
# Parse OPF-specific metadata
self._parse_opf_metadata(metadata_elem)
except ET.ParseError as e:
print(f"Error parsing package document: {e}")
def _parse_dublin_core(self, metadata_elem: ET.Element):
"""
Parse Dublin Core metadata elements.
Args:
metadata_elem: The metadata XML element
"""
dc_elements = {
'title': 'title',
'creator': 'creator',
'subject': 'subject',
'description': 'description',
'publisher': 'publisher',
'contributor': 'contributor',
'date': 'date',
'type': 'type',
'format': 'format',
'identifier': 'identifier',
'source': 'source',
'language': 'language',
'relation': 'relation',
'coverage': 'coverage',
'rights': 'rights'
}
for dc_name, meta_key in dc_elements.items():
elements = metadata_elem.findall('.//{{{0}}}{1}'.format(NAMESPACES['dc'], dc_name))
if elements:
if len(elements) == 1:
# Single element
text = elements[0].text
if text:
self._metadata[meta_key] = text.strip()
# Handle special attributes
elem = elements[0]
if dc_name == 'creator':
# Check for role attribute
role = elem.get('{{{0}}}role'.format(NAMESPACES['opf']))
if role:
self._metadata[f'{meta_key}_role'] = role
# Check for file-as attribute for sorting
file_as = elem.get('{{{0}}}file-as'.format(NAMESPACES['opf']))
if file_as:
self._metadata[f'{meta_key}_file_as'] = file_as
elif dc_name == 'identifier':
# Check for scheme (ISBN, DOI, etc.)
scheme = elem.get('{{{0}}}scheme'.format(NAMESPACES['opf']))
if scheme:
self._metadata[f'{meta_key}_scheme'] = scheme
# Check if this is the unique identifier
id_attr = elem.get('id')
if id_attr:
self._metadata[f'{meta_key}_id'] = id_attr
elif dc_name == 'date':
# Check for event type
event = elem.get('{{{0}}}event'.format(NAMESPACES['opf']))
if event:
self._metadata[f'{meta_key}_event'] = event
else:
# Multiple elements - store as list
values = []
for elem in elements:
if elem.text:
values.append(elem.text.strip())
if values:
self._metadata[meta_key] = values
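
To make the attribute handling above concrete, here is a hedged sketch that feeds the method an invented OPF fragment and prints the flat keys it produces; calling the private _parse_dublin_core directly is for illustration only, and the import assumes this module is still present.

import xml.etree.ElementTree as ET
from pyWebLayout.io.readers.epub_metadata import EPUBMetadataReader

# Invented <metadata> fragment exercising the role/file-as/scheme/event attributes
OPF_METADATA = """<metadata xmlns:dc="http://purl.org/dc/elements/1.1/"
                            xmlns:opf="http://www.idpf.org/2007/opf">
  <dc:creator opf:role="aut" opf:file-as="Doe, Jane">Jane Doe</dc:creator>
  <dc:identifier id="bookid" opf:scheme="ISBN">978-0-00-000000-0</dc:identifier>
  <dc:date opf:event="publication">2025-06-07</dc:date>
</metadata>"""

reader = EPUBMetadataReader()
reader._parse_dublin_core(ET.fromstring(OPF_METADATA))
print(reader._metadata)
# expected keys: creator, creator_role, creator_file_as,
#                identifier, identifier_scheme, identifier_id, date, date_event
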
def _parse_opf_metadata(self, metadata_elem: ET.Element):
"""
Parse OPF-specific metadata elements.
Args:
metadata_elem: The metadata XML element
"""
# Parse meta elements
meta_elements = metadata_elem.findall('.//{{{0}}}meta'.format(NAMESPACES['opf']))
for meta in meta_elements:
name = meta.get('name')
content = meta.get('content')
if name and content:
self._metadata[f'meta_{name}'] = content
# Parse x-metadata elements (custom metadata)
x_meta_elements = metadata_elem.findall('.//{{{0}}}x-metadata'.format(NAMESPACES['opf']))
for x_meta in x_meta_elements:
for child in x_meta:
if child.tag and child.text:
# Remove namespace prefix for cleaner key names
tag_name = child.tag.split('}')[-1] if '}' in child.tag else child.tag
self._metadata[f'x_meta_{tag_name}'] = child.text.strip()
def _populate_document(self, document: Document):
"""
Populate the document with extracted metadata.
Args:
document: The document to populate
"""
# Map EPUB metadata to document metadata types
metadata_mapping = {
'title': MetadataType.TITLE,
'creator': MetadataType.AUTHOR,
'description': MetadataType.DESCRIPTION,
'subject': MetadataType.KEYWORDS,
'language': MetadataType.LANGUAGE,
'date': MetadataType.PUBLICATION_DATE,
'publisher': MetadataType.PUBLISHER,
'identifier': MetadataType.IDENTIFIER,
}
for epub_key, doc_type in metadata_mapping.items():
if epub_key in self._metadata:
value = self._metadata[epub_key]
# Handle list values (like multiple subjects)
if isinstance(value, list):
if epub_key == 'subject':
# Join subjects with commas for keywords
document.set_metadata(doc_type, ', '.join(value))
else:
# For other list values, use the first one
document.set_metadata(doc_type, value[0])
else:
document.set_metadata(doc_type, value)
# Handle cover image
cover_meta = self._metadata.get('meta_cover')
if cover_meta:
document.set_metadata(MetadataType.COVER_IMAGE, cover_meta)
# Store original EPUB metadata for reference
document.set_metadata(MetadataType.CUSTOM, {
'epub_metadata': self._metadata
})
def _cleanup(self):
"""Clean up temporary files."""
if self._temp_dir:
try:
import shutil
shutil.rmtree(self._temp_dir, ignore_errors=True)
except Exception:
pass
self._temp_dir = None
def get_unique_identifier(self) -> Optional[str]:
"""
Get the unique identifier from the EPUB metadata.
Returns:
The unique identifier string, or None if not found
"""
# Look for identifier with specific ID
for key, value in self._metadata.items():
if key.startswith('identifier') and key.endswith('_id'):
return self._metadata.get('identifier')
# Fallback to any identifier
return self._metadata.get('identifier')
def get_cover_id(self) -> Optional[str]:
"""
Get the cover image ID from metadata.
Returns:
The cover image ID, or None if not found
"""
return self._metadata.get('meta_cover')
def get_creators(self) -> List[Dict[str, str]]:
"""
Get creator information with roles.
Returns:
List of creator dictionaries with name, role, and file-as info
"""
creators = []
creator_value = self._metadata.get('creator')
if creator_value:
if isinstance(creator_value, list):
# Multiple creators - this is simplified, real implementation
# would need to correlate with role and file-as attributes
for creator in creator_value:
creators.append({'name': creator})
else:
# Single creator
creator_info = {'name': creator_value}
# Add role if available
role = self._metadata.get('creator_role')
if role:
creator_info['role'] = role
# Add file-as if available
file_as = self._metadata.get('creator_file_as')
if file_as:
creator_info['file_as'] = file_as
creators.append(creator_info)
return creators
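
A hedged end-to-end sketch of how this reader was meant to be driven; the EPUB path is a placeholder, and Document() is constructed with no arguments in the same way the HTML reader in this commit does.

from pyWebLayout.abstract.document import Document
from pyWebLayout.io.readers.epub_metadata import EPUBMetadataReader

doc = Document()
reader = EPUBMetadataReader()

# Unzips the EPUB into a temp dir, parses content.opf, fills doc, then cleans up
metadata = reader.extract_metadata("samples/book.epub", doc)

print(metadata.get("title"))
print(reader.get_unique_identifier())
print(reader.get_creators())   # e.g. [{'name': ..., 'role': ..., 'file_as': ...}]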


@@ -1,186 +0,0 @@
"""
Modern HTML reader for pyWebLayout.
This module provides an HTML reader that uses the html_extraction module
for clean, handler-based parsing using BeautifulSoup.
"""
import os
from typing import Union, Optional
from pyWebLayout.abstract.document import Document
from pyWebLayout.io.readers.base import BaseReader
from pyWebLayout.io.readers.html_metadata import HTMLMetadataReader
from pyWebLayout.io.readers.html_resources import HTMLResourceReader
from pyWebLayout.io.readers.html_extraction import parse_html_string as _parse_html_blocks  # aliased: a parse_html_string wrapper is defined at the bottom of this module
from pyWebLayout.style import Font
class HTMLReader(BaseReader):
"""
Modern HTML reader using the html_extraction parser.
This reader uses the clean, handler-based architecture from html_extraction.py
for parsing HTML content into pyWebLayout's abstract document structure.
"""
def __init__(self):
"""Initialize the HTML reader."""
super().__init__()
self._metadata_reader = HTMLMetadataReader()
self._resource_reader = HTMLResourceReader()
def can_read(self, source: Union[str, bytes]) -> bool:
"""
Check if this reader can handle the given source.
Args:
source: The source to check (file path, URL, or content)
Returns:
True if this reader can handle the source, False otherwise
"""
if isinstance(source, str):
# Check if it's a file path
if os.path.isfile(source):
return source.lower().endswith(('.html', '.htm', '.xhtml'))
# Check if it's HTML content (very basic check)
source_lower = source.lower().strip()
return (source_lower.startswith('<!doctype html') or
source_lower.startswith('<html') or
'<html' in source_lower[:200])
elif isinstance(source, bytes):
# Check if it's HTML content in bytes
try:
source_str = source.decode('utf-8', errors='ignore').lower().strip()
return (source_str.startswith('<!doctype html') or
source_str.startswith('<html') or
'<html' in source_str[:200])
except Exception:
return False
return False
def read(self, source: Union[str, bytes], **options) -> Document:
"""
Read and parse the HTML source into a Document.
Args:
source: The HTML source to read (file path, URL, or content)
**options: Additional options for reading
- base_url: Base URL for resolving relative links
- encoding: Character encoding (default: 'utf-8')
- extract_metadata: Whether to extract metadata (default: True)
- extract_resources: Whether to extract resources (default: True)
- base_font: Base font for styling (default: None)
Returns:
The parsed Document
"""
# Get options
base_url = options.get('base_url')
encoding = options.get('encoding', 'utf-8')
extract_metadata = options.get('extract_metadata', True)
extract_resources = options.get('extract_resources', True)
base_font = options.get('base_font')
# Read the HTML content
html_content = self._read_html_content(source, encoding)
# Set base URL if not provided and source is a file
if not base_url and isinstance(source, str) and os.path.isfile(source):
base_url = f"file://{os.path.dirname(os.path.abspath(source))}/"
# Create a new document
document = Document()
# Extract metadata if enabled
if extract_metadata and self._metadata_reader:
self._metadata_reader.extract_metadata(html_content, document)
# Parse content using the html_extraction parser (via the alias, so the
# parse_html_string convenience wrapper defined below does not shadow it)
blocks = _parse_html_blocks(html_content, base_font)
for block in blocks:
document.add_block(block)
# Extract resources if enabled
if extract_resources and self._resource_reader:
self._resource_reader.extract_resources(html_content, document)
return document
def _read_html_content(self, source: Union[str, bytes], encoding: str = 'utf-8') -> str:
"""
Read HTML content from various sources.
Args:
source: The source to read from
encoding: Character encoding to use
Returns:
The HTML content as a string
"""
if isinstance(source, bytes):
# Source is already bytes, decode it
return source.decode(encoding, errors='replace')
elif isinstance(source, str):
# Check if it's a file path
if os.path.isfile(source):
with open(source, 'r', encoding=encoding, errors='replace') as f:
return f.read()
else:
# Assume it's HTML content
return source
else:
raise ValueError(f"Unsupported source type: {type(source)}")
def read_html(source: Union[str, bytes], **options) -> Document:
"""
Convenience function to read HTML content.
Args:
source: The HTML source to read (file path, URL, or content)
**options: Additional options for reading
Returns:
The parsed Document
"""
reader = HTMLReader()
return reader.read(source, **options)
def read_html_file(file_path: str, **options) -> Document:
"""
Convenience function to read HTML from a file.
Args:
file_path: Path to the HTML file
**options: Additional options for reading
Returns:
The parsed Document
"""
if not os.path.isfile(file_path):
raise FileNotFoundError(f"HTML file not found: {file_path}")
reader = HTMLReader()
return reader.read(file_path, **options)
def parse_html_string(html_content: str, **options) -> Document:
"""
Convenience function to parse HTML content from a string.
Args:
html_content: The HTML content as a string
**options: Additional options for reading
Returns:
The parsed Document
"""
reader = HTMLReader()
return reader.read(html_content, **options)
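
A short usage sketch for the reader and its convenience wrappers, assuming the module is importable; the file path is a placeholder, and the options shown are the ones documented in read() above.

from pyWebLayout.io.readers.html import HTMLReader, read_html_file

# Parse markup held in memory
reader = HTMLReader()
doc = reader.read("<html><head><title>Hello</title></head><body><p>Hi there.</p></body></html>")

# Parse a file on disk, skipping resource extraction
doc = read_html_file("samples/page.html", encoding="utf-8", extract_resources=False)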


@@ -1,426 +0,0 @@
"""
HTML metadata reader for pyWebLayout.
This module provides specialized functionality for extracting metadata
from HTML documents, following the decomposed architecture pattern.
"""
from typing import Dict, Any, Optional
import json
import re
from pyWebLayout.abstract.document import Document, MetadataType
from pyWebLayout.io.readers.base import MetadataReader
class HTMLMetadataReader(MetadataReader):
"""
Specialized reader for extracting metadata from HTML documents.
This class handles HTML meta tags, title elements, and other metadata
sources like Open Graph tags and JSON-LD structured data.
"""
def __init__(self):
"""Initialize the HTML metadata reader."""
self._title = None
self._meta_tags = {}
self._og_tags = {}
self._twitter_tags = {}
self._json_ld = {}
def extract_metadata(self, html_content: str, document: Document) -> Dict[str, Any]:
"""
Extract metadata from HTML content.
Args:
html_content: The HTML content to parse
document: The document to populate with metadata
Returns:
Dictionary of extracted metadata
"""
# Reset internal state
self._reset()
# Extract title
self._extract_title(html_content)
# Extract meta tags
self._extract_meta_tags(html_content)
# Extract Open Graph tags
self._extract_open_graph(html_content)
# Extract Twitter Card tags
self._extract_twitter_cards(html_content)
# Extract JSON-LD structured data
self._extract_json_ld(html_content)
# Populate document with extracted metadata
self._populate_document(document)
# Return all extracted metadata
return {
'title': self._title,
'meta_tags': self._meta_tags,
'open_graph': self._og_tags,
'twitter_cards': self._twitter_tags,
'json_ld': self._json_ld
}
def _reset(self):
"""Reset internal state for a new extraction."""
self._title = None
self._meta_tags = {}
self._og_tags = {}
self._twitter_tags = {}
self._json_ld = {}
def _extract_title(self, html_content: str):
"""
Extract the title from HTML content.
Args:
html_content: The HTML content to parse
"""
# Look for title tag
title_match = re.search(r'<title[^>]*>(.*?)</title>', html_content, re.IGNORECASE | re.DOTALL)
if title_match:
# Clean up the title text
self._title = self._clean_text(title_match.group(1))
def _extract_meta_tags(self, html_content: str):
"""
Extract meta tags from HTML content.
Args:
html_content: The HTML content to parse
"""
# Regular expression to match meta tags
meta_pattern = r'<meta\s+([^>]+)>'
for match in re.finditer(meta_pattern, html_content, re.IGNORECASE):
attrs = self._parse_attributes(match.group(1))
# Get name and content
name = attrs.get('name', '').lower()
content = attrs.get('content', '')
# Handle different types of meta tags
if name and content:
self._meta_tags[name] = content
# Handle http-equiv meta tags
http_equiv = attrs.get('http-equiv', '').lower()
if http_equiv and content:
self._meta_tags[f'http-equiv:{http_equiv}'] = content
# Handle charset meta tags
charset = attrs.get('charset', '')
if charset:
self._meta_tags['charset'] = charset
def _extract_open_graph(self, html_content: str):
"""
Extract Open Graph meta tags from HTML content.
Args:
html_content: The HTML content to parse
"""
# Regular expression to match Open Graph meta tags
og_pattern = r'<meta\s+property="og:([^"]+)"\s+content="([^"]*)"[^>]*>'
for match in re.finditer(og_pattern, html_content, re.IGNORECASE):
property_name = match.group(1)
content = match.group(2)
self._og_tags[property_name] = content
def _extract_twitter_cards(self, html_content: str):
"""
Extract Twitter Card meta tags from HTML content.
Args:
html_content: The HTML content to parse
"""
# Regular expression to match Twitter Card meta tags
twitter_pattern = r'<meta\s+name="twitter:([^"]+)"\s+content="([^"]*)"[^>]*>'
for match in re.finditer(twitter_pattern, html_content, re.IGNORECASE):
property_name = match.group(1)
content = match.group(2)
self._twitter_tags[property_name] = content
def _extract_json_ld(self, html_content: str):
"""
Extract JSON-LD structured data from HTML content.
Args:
html_content: The HTML content to parse
"""
# Regular expression to match JSON-LD script tags
json_ld_pattern = r'<script[^>]*type="application/ld\+json"[^>]*>(.*?)</script>'
for match in re.finditer(json_ld_pattern, html_content, re.IGNORECASE | re.DOTALL):
try:
json_content = match.group(1).strip()
data = json.loads(json_content)
# Store JSON-LD data by type if available
if isinstance(data, dict) and '@type' in data:
type_name = data['@type']
if type_name not in self._json_ld:
self._json_ld[type_name] = []
self._json_ld[type_name].append(data)
elif isinstance(data, list):
# Handle arrays of structured data
for item in data:
if isinstance(item, dict) and '@type' in item:
type_name = item['@type']
if type_name not in self._json_ld:
self._json_ld[type_name] = []
self._json_ld[type_name].append(item)
except json.JSONDecodeError:
# Skip invalid JSON-LD
continue
def _populate_document(self, document: Document):
"""
Populate the document with extracted metadata.
Args:
document: The document to populate
"""
# Set title
title = self._get_best_title()
if title:
document.set_metadata(MetadataType.TITLE, title)
# Set description
description = self._get_best_description()
if description:
document.set_metadata(MetadataType.DESCRIPTION, description)
# Set author
author = self._get_best_author()
if author:
document.set_metadata(MetadataType.AUTHOR, author)
# Set keywords
keywords = self._get_keywords()
if keywords:
document.set_metadata(MetadataType.KEYWORDS, keywords)
# Set language
language = self._get_language()
if language:
document.set_metadata(MetadataType.LANGUAGE, language)
# Set cover image
cover_image = self._get_cover_image()
if cover_image:
document.set_metadata(MetadataType.COVER_IMAGE, cover_image)
# Set publisher
publisher = self._get_publisher()
if publisher:
document.set_metadata(MetadataType.PUBLISHER, publisher)
# Set publication date
pub_date = self._get_publication_date()
if pub_date:
document.set_metadata(MetadataType.PUBLICATION_DATE, pub_date)
def _get_best_title(self) -> Optional[str]:
"""Get the best available title from all sources."""
# Priority order: Open Graph > Twitter > JSON-LD > meta > HTML title
# Check Open Graph
if 'title' in self._og_tags:
return self._og_tags['title']
# Check Twitter Cards
if 'title' in self._twitter_tags:
return self._twitter_tags['title']
# Check JSON-LD
for type_name, items in self._json_ld.items():
for item in items:
if 'name' in item:
return item['name']
elif 'headline' in item:
return item['headline']
# Check meta tags
for key in ['title', 'og:title', 'twitter:title']:
if key in self._meta_tags:
return self._meta_tags[key]
# Fall back to HTML title
return self._title
def _get_best_description(self) -> Optional[str]:
"""Get the best available description from all sources."""
# Priority order: Open Graph > Twitter > meta description > JSON-LD
# Check Open Graph
if 'description' in self._og_tags:
return self._og_tags['description']
# Check Twitter Cards
if 'description' in self._twitter_tags:
return self._twitter_tags['description']
# Check meta description
if 'description' in self._meta_tags:
return self._meta_tags['description']
# Check JSON-LD
for type_name, items in self._json_ld.items():
for item in items:
if 'description' in item:
return item['description']
return None
def _get_best_author(self) -> Optional[str]:
"""Get the best available author from all sources."""
# Check meta tags
if 'author' in self._meta_tags:
return self._meta_tags['author']
# Check JSON-LD
for type_name, items in self._json_ld.items():
for item in items:
if 'author' in item:
author = item['author']
if isinstance(author, dict) and 'name' in author:
return author['name']
elif isinstance(author, str):
return author
elif 'creator' in item:
creator = item['creator']
if isinstance(creator, dict) and 'name' in creator:
return creator['name']
elif isinstance(creator, str):
return creator
return None
def _get_keywords(self) -> Optional[str]:
"""Get keywords from meta tags."""
return self._meta_tags.get('keywords')
def _get_language(self) -> Optional[str]:
"""Get language from meta tags or HTML lang attribute."""
# Check meta tags first
if 'language' in self._meta_tags:
return self._meta_tags['language']
# Could also extract from html lang attribute if needed
return None
def _get_cover_image(self) -> Optional[str]:
"""Get the best available cover image from all sources."""
# Check Open Graph
if 'image' in self._og_tags:
return self._og_tags['image']
# Check Twitter Cards
if 'image' in self._twitter_tags:
return self._twitter_tags['image']
# Check JSON-LD
for type_name, items in self._json_ld.items():
for item in items:
if 'image' in item:
image = item['image']
if isinstance(image, dict) and 'url' in image:
return image['url']
elif isinstance(image, str):
return image
return None
def _get_publisher(self) -> Optional[str]:
"""Get publisher from JSON-LD or other sources."""
# Check JSON-LD
for type_name, items in self._json_ld.items():
for item in items:
if 'publisher' in item:
publisher = item['publisher']
if isinstance(publisher, dict) and 'name' in publisher:
return publisher['name']
elif isinstance(publisher, str):
return publisher
return None
def _get_publication_date(self) -> Optional[str]:
"""Get publication date from JSON-LD or other sources."""
# Check JSON-LD
for type_name, items in self._json_ld.items():
for item in items:
if 'datePublished' in item:
return item['datePublished']
elif 'publishDate' in item:
return item['publishDate']
return None
def _parse_attributes(self, attr_string: str) -> Dict[str, str]:
"""
Parse HTML attributes from a string.
Args:
attr_string: String containing HTML attributes
Returns:
Dictionary of attribute name-value pairs
"""
attrs = {}
# Regular expression to match attribute="value" or attribute='value'
attr_pattern = r'(\w+)=(?:"([^"]*)"|\'([^\']*)\'|([^\s>]+))'
for match in re.finditer(attr_pattern, attr_string):
name = match.group(1).lower()
value = match.group(2) or match.group(3) or match.group(4) or ''
attrs[name] = value
# Handle standalone attributes (like charset): scan only what is left after
# removing the name="value" pairs matched above, so words inside quoted
# values are not picked up as attribute names
remaining = re.sub(attr_pattern, ' ', attr_string)
standalone_pattern = r'\b([\w-]+)\b(?!\s*=)'
for match in re.finditer(standalone_pattern, remaining):
attr_name = match.group(1).lower()
if attr_name not in attrs:
attrs[attr_name] = ''
return attrs
def _clean_text(self, text: str) -> str:
"""
Clean up text content by removing extra whitespace and HTML entities.
Args:
text: The text to clean
Returns:
Cleaned text
"""
# Remove extra whitespace
cleaned = re.sub(r'\s+', ' ', text).strip()
# Decode common HTML entities
entities = {
'&lt;': '<',
'&gt;': '>',
'&amp;': '&',
'&quot;': '"',
'&apos;': "'",
'&nbsp;': ' ',
}
for entity, char in entities.items():
cleaned = cleaned.replace(entity, char)
return cleaned
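
A hedged sketch of the metadata reader on a small invented page, assuming Document() can be constructed with no arguments as elsewhere in this commit; the keys in the comments follow the dictionary returned by extract_metadata() above.

from pyWebLayout.abstract.document import Document
from pyWebLayout.io.readers.html_metadata import HTMLMetadataReader

HTML = """<html><head>
  <title>Example page</title>
  <meta name="description" content="A short description.">
  <meta name="author" content="Jane Doe">
  <meta property="og:title" content="Example page (Open Graph)">
  <meta property="og:image" content="https://example.com/cover.png">
</head><body></body></html>"""

doc = Document()
reader = HTMLMetadataReader()
info = reader.extract_metadata(HTML, doc)

print(info["title"])         # text of the <title> element
print(info["meta_tags"])     # {'description': ..., 'author': ...}
print(info["open_graph"])    # {'title': ..., 'image': ...}; preferred by _get_best_title()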


@@ -1,483 +0,0 @@
"""
HTML resources reader for pyWebLayout.
This module provides specialized functionality for extracting resources
from HTML documents, such as stylesheets, scripts, and external files.
"""
from typing import Dict, Any, Optional, List
import re
import urllib.parse
from pyWebLayout.abstract.document import Document
from pyWebLayout.io.readers.base import ResourceReader
class HTMLResourceReader(ResourceReader):
"""
Specialized reader for extracting resources from HTML documents.
This class handles CSS stylesheets, JavaScript files, images,
and other external resources referenced in HTML.
"""
def __init__(self):
"""Initialize the HTML resource reader."""
self._stylesheets = []
self._scripts = []
self._external_resources = {}
self._inline_styles = {}
self._inline_scripts = []
def extract_resources(self, html_content: str, document: Document) -> Dict[str, Any]:
"""
Extract resources from HTML content.
Args:
html_content: The HTML content to parse
document: The document to populate with resources
Returns:
Dictionary of extracted resources
"""
# Reset internal state
self._reset()
# Extract stylesheets
self._extract_stylesheets(html_content)
# Extract scripts
self._extract_scripts(html_content)
# Extract other external resources
self._extract_external_resources(html_content)
# Extract inline styles
self._extract_inline_styles(html_content)
# Extract inline scripts
self._extract_inline_scripts(html_content)
# Populate document with extracted resources
self._populate_document(document)
# Return all extracted resources
return {
'stylesheets': self._stylesheets,
'scripts': self._scripts,
'external_resources': self._external_resources,
'inline_styles': self._inline_styles,
'inline_scripts': self._inline_scripts
}
def _reset(self):
"""Reset internal state for a new extraction."""
self._stylesheets = []
self._scripts = []
self._external_resources = {}
self._inline_styles = {}
self._inline_scripts = []
def _extract_stylesheets(self, html_content: str):
"""
Extract CSS stylesheet references from HTML content.
Args:
html_content: The HTML content to parse
"""
# Regular expression to match link tags for stylesheets
link_pattern = r'<link\s+([^>]+)>'
for match in re.finditer(link_pattern, html_content, re.IGNORECASE):
attrs = self._parse_attributes(match.group(1))
# Check if this is a stylesheet
rel = attrs.get('rel', '').lower()
if rel == 'stylesheet':
href = attrs.get('href', '')
media = attrs.get('media', 'all')
type_attr = attrs.get('type', 'text/css')
if href:
stylesheet = {
'type': 'external',
'href': href,
'media': media,
'content_type': type_attr
}
self._stylesheets.append(stylesheet)
# Handle other link types
elif rel in ('icon', 'shortcut icon', 'apple-touch-icon'):
href = attrs.get('href', '')
if href:
self._external_resources[f'icon_{len(self._external_resources)}'] = {
'type': 'icon',
'rel': rel,
'href': href,
'sizes': attrs.get('sizes', ''),
'content_type': attrs.get('type', '')
}
elif rel == 'preload':
href = attrs.get('href', '')
if href:
self._external_resources[f'preload_{len(self._external_resources)}'] = {
'type': 'preload',
'href': href,
'as': attrs.get('as', ''),
'content_type': attrs.get('type', '')
}
def _extract_scripts(self, html_content: str):
"""
Extract script references from HTML content.
Args:
html_content: The HTML content to parse
"""
# Regular expression to match script tags
script_pattern = r'<script\s*([^>]*)>(.*?)</script>'
for match in re.finditer(script_pattern, html_content, re.IGNORECASE | re.DOTALL):
attrs_str = match.group(1)
content = match.group(2).strip()
attrs = self._parse_attributes(attrs_str)
src = attrs.get('src', '')
script_type = attrs.get('type', 'text/javascript')
if src:
# External script
script = {
'type': 'external',
'src': src,
'content_type': script_type,
'async': 'async' in attrs,
'defer': 'defer' in attrs,
'integrity': attrs.get('integrity', ''),
'crossorigin': attrs.get('crossorigin', '')
}
self._scripts.append(script)
elif content:
# Inline script
script = {
'type': 'inline',
'content': content,
'content_type': script_type
}
self._scripts.append(script)
def _extract_external_resources(self, html_content: str):
"""
Extract other external resources from HTML content.
Args:
html_content: The HTML content to parse
"""
# Extract images
img_pattern = r'<img\s+([^>]+)>'
for match in re.finditer(img_pattern, html_content, re.IGNORECASE):
attrs = self._parse_attributes(match.group(1))
src = attrs.get('src', '')
if src:
self._external_resources[f'image_{len(self._external_resources)}'] = {
'type': 'image',
'src': src,
'alt': attrs.get('alt', ''),
'width': attrs.get('width', ''),
'height': attrs.get('height', ''),
'loading': attrs.get('loading', ''),
'srcset': attrs.get('srcset', '')
}
# Extract audio
audio_pattern = r'<audio\s+([^>]+)>'
for match in re.finditer(audio_pattern, html_content, re.IGNORECASE):
attrs = self._parse_attributes(match.group(1))
src = attrs.get('src', '')
if src:
self._external_resources[f'audio_{len(self._external_resources)}'] = {
'type': 'audio',
'src': src,
'controls': 'controls' in attrs,
'autoplay': 'autoplay' in attrs,
'loop': 'loop' in attrs,
'muted': 'muted' in attrs
}
# Extract video
video_pattern = r'<video\s+([^>]+)>'
for match in re.finditer(video_pattern, html_content, re.IGNORECASE):
attrs = self._parse_attributes(match.group(1))
src = attrs.get('src', '')
if src:
self._external_resources[f'video_{len(self._external_resources)}'] = {
'type': 'video',
'src': src,
'controls': 'controls' in attrs,
'autoplay': 'autoplay' in attrs,
'loop': 'loop' in attrs,
'muted': 'muted' in attrs,
'width': attrs.get('width', ''),
'height': attrs.get('height', ''),
'poster': attrs.get('poster', '')
}
# Extract embed/object resources
embed_pattern = r'<embed\s+([^>]+)>'
for match in re.finditer(embed_pattern, html_content, re.IGNORECASE):
attrs = self._parse_attributes(match.group(1))
src = attrs.get('src', '')
if src:
self._external_resources[f'embed_{len(self._external_resources)}'] = {
'type': 'embed',
'src': src,
'content_type': attrs.get('type', ''),
'width': attrs.get('width', ''),
'height': attrs.get('height', '')
}
# Extract iframe sources
iframe_pattern = r'<iframe\s+([^>]+)>'
for match in re.finditer(iframe_pattern, html_content, re.IGNORECASE):
attrs = self._parse_attributes(match.group(1))
src = attrs.get('src', '')
if src:
self._external_resources[f'iframe_{len(self._external_resources)}'] = {
'type': 'iframe',
'src': src,
'width': attrs.get('width', ''),
'height': attrs.get('height', ''),
'loading': attrs.get('loading', ''),
'sandbox': attrs.get('sandbox', '')
}
def _extract_inline_styles(self, html_content: str):
"""
Extract inline CSS styles from HTML content.
Args:
html_content: The HTML content to parse
"""
# Extract style blocks
style_pattern = r'<style\s*([^>]*)>(.*?)</style>'
for i, match in enumerate(re.finditer(style_pattern, html_content, re.IGNORECASE | re.DOTALL)):
attrs_str = match.group(1)
content = match.group(2).strip()
attrs = self._parse_attributes(attrs_str)
if content:
style_block = {
'content': content,
'media': attrs.get('media', 'all'),
'content_type': attrs.get('type', 'text/css')
}
self._inline_styles[f'style_block_{i}'] = style_block
# Extract inline style attributes (this would be more complex
# as it requires parsing all elements with style attributes)
style_attr_pattern = r'<[^>]+style\s*=\s*["\']([^"\']+)["\'][^>]*>'
for i, match in enumerate(re.finditer(style_attr_pattern, html_content, re.IGNORECASE)):
style_content = match.group(1)
if style_content:
style_attr = {
'content': style_content,
'type': 'attribute'
}
self._inline_styles[f'style_attr_{i}'] = style_attr
def _extract_inline_scripts(self, html_content: str):
"""
Extract inline JavaScript from HTML content.
Args:
html_content: The HTML content to parse
"""
# This is already handled in _extract_scripts, but we keep this
# method for consistency and potential future extensions
pass
def _populate_document(self, document: Document):
"""
Populate the document with extracted resources.
Args:
document: The document to populate
"""
# Add stylesheets
for stylesheet in self._stylesheets:
document.add_stylesheet(stylesheet)
# Add scripts
for script in self._scripts:
if script['type'] == 'inline':
document.add_script(script['content'])
else:
# For external scripts, we store them as resources
script_name = f"script_{len(document._resources)}"
document.add_resource(script_name, script)
# Add external resources
for name, resource in self._external_resources.items():
document.add_resource(name, resource)
# Add inline styles as stylesheets
for name, style in self._inline_styles.items():
if style.get('type') != 'attribute': # Don't add individual style attributes
parsed_style = self._parse_css(style['content'])
if parsed_style:
document.add_stylesheet({
'type': 'inline',
'content': style['content'],
'parsed': parsed_style,
'media': style.get('media', 'all')
})
def _parse_attributes(self, attr_string: str) -> Dict[str, str]:
"""
Parse HTML attributes from a string.
Args:
attr_string: String containing HTML attributes
Returns:
Dictionary of attribute name-value pairs
"""
attrs = {}
# Regular expression to match attribute="value" or attribute='value'
attr_pattern = r'(\w+)=(?:"([^"]*)"|\'([^\']*)\'|([^\s>]+))'
for match in re.finditer(attr_pattern, attr_string):
name = match.group(1).lower()
value = match.group(2) or match.group(3) or match.group(4) or ''
attrs[name] = value
# Handle standalone attributes (like async, defer): scan only what is left
# after removing the name="value" pairs matched above, so words inside quoted
# values are not picked up as attribute names
remaining = re.sub(attr_pattern, ' ', attr_string)
standalone_pattern = r'\b([\w-]+)\b(?!\s*=)'
for match in re.finditer(standalone_pattern, remaining):
attr_name = match.group(1).lower()
if attr_name not in attrs:
attrs[attr_name] = ''
return attrs
def _parse_css(self, css_str: str) -> Dict[str, Dict[str, str]]:
"""
Parse a CSS stylesheet.
Args:
css_str: CSS stylesheet string
Returns:
Dictionary of selectors and their style properties
"""
stylesheet = {}
# Remove comments
css_str = re.sub(r'/\*.*?\*/', '', css_str, flags=re.DOTALL)
# Split into rule sets
rule_sets = css_str.split('}')
for rule_set in rule_sets:
# Split into selector and declarations
parts = rule_set.split('{', 1)
if len(parts) != 2:
continue
selector = parts[0].strip()
declarations = parts[1].strip()
# Parse declarations
style = self._parse_css_declarations(declarations)
# Add to stylesheet
if selector and style:
stylesheet[selector] = style
return stylesheet
def _parse_css_declarations(self, declarations_str: str) -> Dict[str, str]:
"""
Parse CSS declarations.
Args:
declarations_str: CSS declarations string
Returns:
Dictionary of CSS properties and values
"""
declarations = {}
# Split the declarations string into individual declarations
decl_list = [d.strip() for d in declarations_str.split(';') if d.strip()]
for declaration in decl_list:
# Split into property and value
parts = declaration.split(':', 1)
if len(parts) != 2:
continue
prop = parts[0].strip().lower()
value = parts[1].strip()
# Store the declaration
declarations[prop] = value
return declarations
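
To show what the two CSS helpers above produce, here is a small sketch that calls _parse_css directly on an invented stylesheet; private-method access is for illustration only.

from pyWebLayout.io.readers.html_resources import HTMLResourceReader

CSS = """
/* comments are stripped first */
p { color: #333; margin: 0 0 1em; }
h1, h2 { font-weight: bold }
"""

print(HTMLResourceReader()._parse_css(CSS))
# {'p': {'color': '#333', 'margin': '0 0 1em'}, 'h1, h2': {'font-weight': 'bold'}}
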
def resolve_url(self, url: str, base_url: Optional[str] = None) -> str:
"""
Resolve a relative URL against a base URL.
Args:
url: The URL to resolve
base_url: The base URL to resolve against
Returns:
The resolved URL
"""
if base_url and not url.startswith(('http://', 'https://', '//', 'data:')):
return urllib.parse.urljoin(base_url, url)
return url
def get_resource_dependencies(self, resource: Dict[str, Any]) -> List[str]:
"""
Get the dependencies of a resource (e.g., CSS imports, script dependencies).
Args:
resource: The resource to analyze
Returns:
List of dependency URLs
"""
dependencies = []
if resource.get('type') == 'external' and 'content' in resource:
content = resource['content']
# Check for CSS @import rules
if resource.get('content_type', '').startswith('text/css'):
import_pattern = r'@import\s+(?:url\()?["\']?([^"\'()]+)["\']?\)?'
for match in re.finditer(import_pattern, content, re.IGNORECASE):
dependencies.append(match.group(1))
# Check for JavaScript imports/requires (basic detection)
elif resource.get('content_type', '').startswith('text/javascript'):
# ES6 imports
import_pattern = r'import\s+.*?\s+from\s+["\']([^"\']+)["\']'
for match in re.finditer(import_pattern, content):
dependencies.append(match.group(1))
# CommonJS requires
require_pattern = r'require\(\s*["\']([^"\']+)["\']\s*\)'
for match in re.finditer(require_pattern, content):
dependencies.append(match.group(1))
return dependencies
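
Finally, a hedged sketch of the resource reader on an invented page; it assumes Document() takes no arguments and exposes the add_stylesheet/add_script/add_resource hooks that _populate_document() calls above.

from pyWebLayout.abstract.document import Document
from pyWebLayout.io.readers.html_resources import HTMLResourceReader

HTML = """<html><head>
  <link rel="stylesheet" href="css/site.css" media="screen">
  <script src="js/app.js" defer></script>
  <style>body { margin: 0 }</style>
</head><body>
  <img src="images/logo.png" alt="Logo" width="120">
</body></html>"""

doc = Document()
reader = HTMLResourceReader()
resources = reader.extract_resources(HTML, doc)

print(resources["stylesheets"])          # the external css/site.css entry
print(resources["scripts"])              # js/app.js with defer detected
print(resources["inline_styles"])        # the <style> block
print(resources["external_resources"])   # the <img> entry
print(reader.resolve_url("css/site.css", "https://example.com/docs/"))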