"""
|
|
HTML metadata reader for pyWebLayout.
|
|
|
|
This module provides specialized functionality for extracting metadata
|
|
from HTML documents, following the decomposed architecture pattern.
|
|
"""
|
|
|
|
from typing import Dict, Any, Optional
|
|
import re
|
|
from pyWebLayout.abstract.document import Document, MetadataType
|
|
from pyWebLayout.io.readers.base import MetadataReader
|
|
|
|
|
|
class HTMLMetadataReader(MetadataReader):
    """
    Specialized reader for extracting metadata from HTML documents.

    This class handles HTML meta tags, the title element, and other metadata
    sources such as Open Graph tags, Twitter Card tags, and JSON-LD
    structured data.
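
    The dictionary returned by extract_metadata() groups results by source;
    its keys mirror the reader's internal caches (the values shown below are
    purely illustrative):

        {
            'title': 'Title from the <title> element',
            'meta_tags': {'description': '...', 'author': '...'},
            'open_graph': {'title': '...', 'image': '...'},
            'twitter_cards': {'card': '...', 'title': '...'},
            'json_ld': {'Article': [{'@type': 'Article', 'headline': '...'}]}
        }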
    """

    def __init__(self):
        """Initialize the HTML metadata reader."""
        self._title = None
        self._meta_tags = {}
        self._og_tags = {}
        self._twitter_tags = {}
        self._json_ld = {}

    def extract_metadata(self, html_content: str, document: Document) -> Dict[str, Any]:
        """
        Extract metadata from HTML content.

        Args:
            html_content: The HTML content to parse
            document: The document to populate with metadata

        Returns:
            Dictionary of extracted metadata
        """
        # Reset internal state
        self._reset()

        # Extract title
        self._extract_title(html_content)

        # Extract meta tags
        self._extract_meta_tags(html_content)

        # Extract Open Graph tags
        self._extract_open_graph(html_content)

        # Extract Twitter Card tags
        self._extract_twitter_cards(html_content)

        # Extract JSON-LD structured data
        self._extract_json_ld(html_content)

        # Populate document with extracted metadata
        self._populate_document(document)

        # Return all extracted metadata
        return {
            'title': self._title,
            'meta_tags': self._meta_tags,
            'open_graph': self._og_tags,
            'twitter_cards': self._twitter_tags,
            'json_ld': self._json_ld
        }

    def _reset(self):
        """Reset internal state for a new extraction."""
        self._title = None
        self._meta_tags = {}
        self._og_tags = {}
        self._twitter_tags = {}
        self._json_ld = {}

    def _extract_title(self, html_content: str):
        """
        Extract the title from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # Look for the title tag
        title_match = re.search(r'<title[^>]*>(.*?)</title>', html_content, re.IGNORECASE | re.DOTALL)
        if title_match:
            # Clean up the title text
            self._title = self._clean_text(title_match.group(1))

    def _extract_meta_tags(self, html_content: str):
        """
        Extract meta tags from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # Regular expression to match meta tags
        meta_pattern = r'<meta\s+([^>]+)>'

        for match in re.finditer(meta_pattern, html_content, re.IGNORECASE):
            attrs = self._parse_attributes(match.group(1))

            # Get name and content
            name = attrs.get('name', '').lower()
            content = attrs.get('content', '')

            # Handle name/content meta tags
            if name and content:
                self._meta_tags[name] = content

            # Handle http-equiv meta tags
            http_equiv = attrs.get('http-equiv', '').lower()
            if http_equiv and content:
                self._meta_tags[f'http-equiv:{http_equiv}'] = content

            # Handle charset meta tags
            charset = attrs.get('charset', '')
            if charset:
                self._meta_tags['charset'] = charset

    def _extract_open_graph(self, html_content: str):
        """
        Extract Open Graph meta tags from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # Regular expression to match Open Graph meta tags
        og_pattern = r'<meta\s+property="og:([^"]+)"\s+content="([^"]*)"[^>]*>'

        for match in re.finditer(og_pattern, html_content, re.IGNORECASE):
            property_name = match.group(1)
            content = match.group(2)
            self._og_tags[property_name] = content

    def _extract_twitter_cards(self, html_content: str):
        """
        Extract Twitter Card meta tags from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # Regular expression to match Twitter Card meta tags
        twitter_pattern = r'<meta\s+name="twitter:([^"]+)"\s+content="([^"]*)"[^>]*>'

        for match in re.finditer(twitter_pattern, html_content, re.IGNORECASE):
            property_name = match.group(1)
            content = match.group(2)
            self._twitter_tags[property_name] = content

    def _extract_json_ld(self, html_content: str):
        """
        Extract JSON-LD structured data from HTML content.

        Args:
            html_content: The HTML content to parse
        """
        # Regular expression to match JSON-LD script tags
        json_ld_pattern = r'<script[^>]*type="application/ld\+json"[^>]*>(.*?)</script>'

        for match in re.finditer(json_ld_pattern, html_content, re.IGNORECASE | re.DOTALL):
            try:
                json_content = match.group(1).strip()
                data = json.loads(json_content)
            except json.JSONDecodeError:
                # Skip invalid JSON-LD
                continue

            # Store JSON-LD data by type if available
            if isinstance(data, dict) and '@type' in data:
                type_name = data['@type']
                if type_name not in self._json_ld:
                    self._json_ld[type_name] = []
                self._json_ld[type_name].append(data)
            elif isinstance(data, list):
                # Handle arrays of structured data
                for item in data:
                    if isinstance(item, dict) and '@type' in item:
                        type_name = item['@type']
                        if type_name not in self._json_ld:
                            self._json_ld[type_name] = []
                        self._json_ld[type_name].append(item)

    def _populate_document(self, document: Document):
        """
        Populate the document with extracted metadata.

        Args:
            document: The document to populate
        """
        # Set title
        title = self._get_best_title()
        if title:
            document.set_metadata(MetadataType.TITLE, title)

        # Set description
        description = self._get_best_description()
        if description:
            document.set_metadata(MetadataType.DESCRIPTION, description)

        # Set author
        author = self._get_best_author()
        if author:
            document.set_metadata(MetadataType.AUTHOR, author)

        # Set keywords
        keywords = self._get_keywords()
        if keywords:
            document.set_metadata(MetadataType.KEYWORDS, keywords)

        # Set language
        language = self._get_language()
        if language:
            document.set_metadata(MetadataType.LANGUAGE, language)

        # Set cover image
        cover_image = self._get_cover_image()
        if cover_image:
            document.set_metadata(MetadataType.COVER_IMAGE, cover_image)

        # Set publisher
        publisher = self._get_publisher()
        if publisher:
            document.set_metadata(MetadataType.PUBLISHER, publisher)

        # Set publication date
        pub_date = self._get_publication_date()
        if pub_date:
            document.set_metadata(MetadataType.PUBLICATION_DATE, pub_date)

    def _get_best_title(self) -> Optional[str]:
        """Get the best available title from all sources."""
        # Priority order: Open Graph > Twitter > JSON-LD > meta > HTML title

        # Check Open Graph
        if 'title' in self._og_tags:
            return self._og_tags['title']

        # Check Twitter Cards
        if 'title' in self._twitter_tags:
            return self._twitter_tags['title']

        # Check JSON-LD
        for type_name, items in self._json_ld.items():
            for item in items:
                if 'name' in item:
                    return item['name']
                elif 'headline' in item:
                    return item['headline']

        # Check meta tags
        for key in ['title', 'og:title', 'twitter:title']:
            if key in self._meta_tags:
                return self._meta_tags[key]

        # Fall back to the HTML <title> element
        return self._title

    def _get_best_description(self) -> Optional[str]:
        """Get the best available description from all sources."""
        # Priority order: Open Graph > Twitter > meta description > JSON-LD

        # Check Open Graph
        if 'description' in self._og_tags:
            return self._og_tags['description']

        # Check Twitter Cards
        if 'description' in self._twitter_tags:
            return self._twitter_tags['description']

        # Check meta description
        if 'description' in self._meta_tags:
            return self._meta_tags['description']

        # Check JSON-LD
        for type_name, items in self._json_ld.items():
            for item in items:
                if 'description' in item:
                    return item['description']

        return None

    def _get_best_author(self) -> Optional[str]:
        """Get the best available author from all sources."""
        # Check meta tags
        if 'author' in self._meta_tags:
            return self._meta_tags['author']

        # Check JSON-LD
        for type_name, items in self._json_ld.items():
            for item in items:
                if 'author' in item:
                    author = item['author']
                    if isinstance(author, dict) and 'name' in author:
                        return author['name']
                    elif isinstance(author, str):
                        return author
                elif 'creator' in item:
                    creator = item['creator']
                    if isinstance(creator, dict) and 'name' in creator:
                        return creator['name']
                    elif isinstance(creator, str):
                        return creator

        return None

    def _get_keywords(self) -> Optional[str]:
        """Get keywords from meta tags."""
        return self._meta_tags.get('keywords')

    def _get_language(self) -> Optional[str]:
        """Get language from meta tags or the HTML lang attribute."""
        # Check meta tags first
        if 'language' in self._meta_tags:
            return self._meta_tags['language']

        # Could also extract from the <html lang="..."> attribute if needed
        return None

    def _get_cover_image(self) -> Optional[str]:
        """Get the best available cover image from all sources."""
        # Check Open Graph
        if 'image' in self._og_tags:
            return self._og_tags['image']

        # Check Twitter Cards
        if 'image' in self._twitter_tags:
            return self._twitter_tags['image']

        # Check JSON-LD
        for type_name, items in self._json_ld.items():
            for item in items:
                if 'image' in item:
                    image = item['image']
                    if isinstance(image, dict) and 'url' in image:
                        return image['url']
                    elif isinstance(image, str):
                        return image

        return None

    def _get_publisher(self) -> Optional[str]:
        """Get publisher from JSON-LD or other sources."""
        # Check JSON-LD
        for type_name, items in self._json_ld.items():
            for item in items:
                if 'publisher' in item:
                    publisher = item['publisher']
                    if isinstance(publisher, dict) and 'name' in publisher:
                        return publisher['name']
                    elif isinstance(publisher, str):
                        return publisher

        return None

    def _get_publication_date(self) -> Optional[str]:
        """Get publication date from JSON-LD or other sources."""
        # Check JSON-LD
        for type_name, items in self._json_ld.items():
            for item in items:
                if 'datePublished' in item:
                    return item['datePublished']
                elif 'publishDate' in item:
                    return item['publishDate']

        return None

    def _parse_attributes(self, attr_string: str) -> Dict[str, str]:
        """
        Parse HTML attributes from a string.

        Args:
            attr_string: String containing HTML attributes

        Returns:
            Dictionary of attribute name-value pairs
        """
        attrs = {}

        # Match attribute="value", attribute='value' or attribute=value
        # (hyphens are allowed in names so that e.g. http-equiv is captured)
        attr_pattern = r'([\w-]+)=(?:"([^"]*)"|\'([^\']*)\'|([^\s>]+))'

        for match in re.finditer(attr_pattern, attr_string):
            name = match.group(1).lower()
            value = match.group(2) or match.group(3) or match.group(4) or ''
            attrs[name] = value

        # Handle standalone (valueless) attributes such as async or defer
        standalone_pattern = r'\b([\w-]+)\b(?!\s*=)'
        for match in re.finditer(standalone_pattern, attr_string):
            attr_name = match.group(1).lower()
            if attr_name not in attrs:
                attrs[attr_name] = ''

        return attrs

    def _clean_text(self, text: str) -> str:
        """
        Clean up text content by removing extra whitespace and HTML entities.

        Args:
            text: The text to clean

        Returns:
            Cleaned text
        """
        # Remove extra whitespace
        cleaned = re.sub(r'\s+', ' ', text).strip()

        # Decode common HTML entities
        entities = {
            '&lt;': '<',
            '&gt;': '>',
            '&amp;': '&',
            '&quot;': '"',
            '&#39;': "'",
            '&nbsp;': ' ',
        }

        for entity, char in entities.items():
            cleaned = cleaned.replace(entity, char)

        return cleaned
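

# Minimal usage sketch (illustrative only): drives HTMLMetadataReader against an
# inline HTML snippet. It assumes Document() can be constructed without
# arguments; adapt to the actual pyWebLayout.abstract.document API if it differs.
if __name__ == "__main__":
    sample_html = """
    <html lang="en">
      <head>
        <meta charset="utf-8">
        <title>Sample Page</title>
        <meta name="description" content="A short demo page">
        <meta name="author" content="Jane Doe">
        <meta property="og:title" content="Sample Page | Example Site">
        <meta property="og:image" content="https://example.com/cover.png">
      </head>
      <body></body>
    </html>
    """

    reader = HTMLMetadataReader()
    document = Document()  # assumption: no required constructor arguments
    metadata = reader.extract_metadata(sample_html, document)

    print(metadata['title'])       # raw <title> text: "Sample Page"
    print(metadata['open_graph'])  # {'title': 'Sample Page | Example Site', 'image': ...}
    print(metadata['meta_tags'])   # {'charset': 'utf-8', 'description': ..., 'author': ...}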