# pyWebLayout/io/readers/html_metadata.py
"""
HTML metadata reader for pyWebLayout.
This module provides specialized functionality for extracting metadata
from HTML documents, following the decomposed architecture pattern.
"""
import html
import json
import re
from typing import Any, Dict, Optional

from pyWebLayout.abstract.document import Document, MetadataType
from pyWebLayout.io.readers.base import MetadataReader
class HTMLMetadataReader(MetadataReader):
    """
    Specialized reader for extracting metadata from HTML documents.

    This class handles HTML meta tags, title elements, and other metadata
    sources like Open Graph tags, Twitter Cards and JSON-LD structured data.
    All parsing is regex-based; no external HTML parser is required.
    """

    def __init__(self):
        """Initialize the HTML metadata reader with empty state."""
        self._title: Optional[str] = None          # text of the <title> tag, if any
        self._meta_tags: Dict[str, str] = {}       # name=/http-equiv=/charset= meta values
        self._og_tags: Dict[str, str] = {}         # Open Graph properties (without "og:" prefix)
        self._twitter_tags: Dict[str, str] = {}    # Twitter Card properties (without "twitter:" prefix)
        self._json_ld: Dict[str, list] = {}        # JSON-LD items grouped by their @type string

    def extract_metadata(self, html_content: str, document: Document) -> Dict[str, Any]:
        """
        Extract metadata from HTML content and populate the document.

        Args:
            html_content: The HTML content to parse
            document: The document to populate with metadata

        Returns:
            Dictionary of extracted metadata with keys 'title', 'meta_tags',
            'open_graph', 'twitter_cards' and 'json_ld'.
        """
        # Reset internal state so the reader can be reused across documents
        self._reset()

        self._extract_title(html_content)
        self._extract_meta_tags(html_content)
        self._extract_open_graph(html_content)
        self._extract_twitter_cards(html_content)
        self._extract_json_ld(html_content)

        # Push the best value found for each metadata field into the document
        self._populate_document(document)

        return {
            'title': self._title,
            'meta_tags': self._meta_tags,
            'open_graph': self._og_tags,
            'twitter_cards': self._twitter_tags,
            'json_ld': self._json_ld
        }

    def _reset(self):
        """Reset internal state for a new extraction."""
        self._title = None
        self._meta_tags = {}
        self._og_tags = {}
        self._twitter_tags = {}
        self._json_ld = {}

    def _extract_title(self, html_content: str):
        """
        Extract the document title from the first <title> element.

        Args:
            html_content: The HTML content to parse
        """
        # DOTALL lets the title span multiple lines; whitespace is collapsed below
        title_match = re.search(r'<title[^>]*>(.*?)</title>', html_content,
                                re.IGNORECASE | re.DOTALL)
        if title_match:
            self._title = self._clean_text(title_match.group(1))

    def _extract_meta_tags(self, html_content: str):
        """
        Extract <meta> tags: name=, http-equiv= and charset= variants.

        Args:
            html_content: The HTML content to parse
        """
        for match in re.finditer(r'<meta\s+([^>]+)>', html_content, re.IGNORECASE):
            attrs = self._parse_attributes(match.group(1))

            name = attrs.get('name', '').lower()
            content = attrs.get('content', '')
            if name and content:
                self._meta_tags[name] = content

            # http-equiv directives are namespaced to avoid clashing with name= keys
            http_equiv = attrs.get('http-equiv', '').lower()
            if http_equiv and content:
                self._meta_tags[f'http-equiv:{http_equiv}'] = content

            charset = attrs.get('charset', '')
            if charset:
                self._meta_tags['charset'] = charset

    def _extract_open_graph(self, html_content: str):
        """
        Extract Open Graph meta tags (<meta property="og:..." content="...">).

        Attribute order and quoting style do not matter: each meta tag's
        attributes are parsed, then property values with an "og:" prefix are
        stored (keyed without the prefix) alongside their content value.

        Args:
            html_content: The HTML content to parse
        """
        for match in re.finditer(r'<meta\s+([^>]+)>', html_content, re.IGNORECASE):
            attrs = self._parse_attributes(match.group(1))
            prop = attrs.get('property', '')
            if prop.lower().startswith('og:'):
                self._og_tags[prop[3:]] = attrs.get('content', '')

    def _extract_twitter_cards(self, html_content: str):
        """
        Extract Twitter Card meta tags (<meta name="twitter:..." content="...">).

        Args:
            html_content: The HTML content to parse
        """
        for match in re.finditer(r'<meta\s+([^>]+)>', html_content, re.IGNORECASE):
            attrs = self._parse_attributes(match.group(1))
            name = attrs.get('name', '')
            if name.lower().startswith('twitter:'):
                self._twitter_tags[name[8:]] = attrs.get('content', '')

    def _extract_json_ld(self, html_content: str):
        """
        Extract JSON-LD structured data from <script type="application/ld+json">.

        Items (and members of top-level arrays) carrying a string @type are
        grouped in self._json_ld by that type name. Invalid JSON and items
        without a usable @type are skipped silently.

        Args:
            html_content: The HTML content to parse
        """
        json_ld_pattern = (r'<script[^>]*type=["\']application/ld\+json["\']'
                           r'[^>]*>(.*?)</script>')
        for match in re.finditer(json_ld_pattern, html_content,
                                 re.IGNORECASE | re.DOTALL):
            try:
                data = json.loads(match.group(1).strip())
            except json.JSONDecodeError:
                # Skip invalid JSON-LD payloads
                continue

            items = data if isinstance(data, list) else [data]
            for item in items:
                if not isinstance(item, dict):
                    continue
                type_name = item.get('@type')
                # @type may be a list or missing; only plain strings are usable keys
                if isinstance(type_name, str):
                    self._json_ld.setdefault(type_name, []).append(item)

    def _populate_document(self, document: Document):
        """
        Populate the document with the best value found for each field.

        Fields with no available value are left untouched.

        Args:
            document: The document to populate
        """
        candidates = [
            (MetadataType.TITLE, self._get_best_title()),
            (MetadataType.DESCRIPTION, self._get_best_description()),
            (MetadataType.AUTHOR, self._get_best_author()),
            (MetadataType.KEYWORDS, self._get_keywords()),
            (MetadataType.LANGUAGE, self._get_language()),
            (MetadataType.COVER_IMAGE, self._get_cover_image()),
            (MetadataType.PUBLISHER, self._get_publisher()),
            (MetadataType.PUBLICATION_DATE, self._get_publication_date()),
        ]
        for meta_type, value in candidates:
            if value:
                document.set_metadata(meta_type, value)

    def _get_best_title(self) -> Optional[str]:
        """Get the best available title from all sources.

        Priority order: Open Graph > Twitter > JSON-LD > meta > HTML <title>.
        """
        if 'title' in self._og_tags:
            return self._og_tags['title']
        if 'title' in self._twitter_tags:
            return self._twitter_tags['title']
        # JSON-LD: 'name' is preferred over 'headline' within each item
        for items in self._json_ld.values():
            for item in items:
                if 'name' in item:
                    return item['name']
                if 'headline' in item:
                    return item['headline']
        for key in ('title', 'og:title', 'twitter:title'):
            if key in self._meta_tags:
                return self._meta_tags[key]
        # Fall back to the HTML <title> element
        return self._title

    def _get_best_description(self) -> Optional[str]:
        """Get the best available description from all sources.

        Priority order: Open Graph > Twitter > meta description > JSON-LD.
        """
        if 'description' in self._og_tags:
            return self._og_tags['description']
        if 'description' in self._twitter_tags:
            return self._twitter_tags['description']
        if 'description' in self._meta_tags:
            return self._meta_tags['description']
        for items in self._json_ld.values():
            for item in items:
                if 'description' in item:
                    return item['description']
        return None

    def _get_best_author(self) -> Optional[str]:
        """Get the best available author: meta tag first, then JSON-LD.

        JSON-LD 'author'/'creator' entries may be objects ({'name': ...})
        or plain strings; both forms are handled.
        """
        if 'author' in self._meta_tags:
            return self._meta_tags['author']
        for items in self._json_ld.values():
            for item in items:
                for key in ('author', 'creator'):
                    if key in item:
                        person = item[key]
                        if isinstance(person, dict) and 'name' in person:
                            return person['name']
                        if isinstance(person, str):
                            return person
        return None

    def _get_keywords(self) -> Optional[str]:
        """Get keywords from meta tags."""
        return self._meta_tags.get('keywords')

    def _get_language(self) -> Optional[str]:
        """Get language from meta tags.

        NOTE(review): the html lang attribute is not consulted; extend here
        if that source is needed.
        """
        return self._meta_tags.get('language')

    def _get_cover_image(self) -> Optional[str]:
        """Get the best available cover image URL from all sources.

        Priority order: Open Graph > Twitter > JSON-LD. JSON-LD images may
        be objects ({'url': ...}) or plain strings.
        """
        if 'image' in self._og_tags:
            return self._og_tags['image']
        if 'image' in self._twitter_tags:
            return self._twitter_tags['image']
        for items in self._json_ld.values():
            for item in items:
                if 'image' in item:
                    image = item['image']
                    if isinstance(image, dict) and 'url' in image:
                        return image['url']
                    if isinstance(image, str):
                        return image
        return None

    def _get_publisher(self) -> Optional[str]:
        """Get publisher from JSON-LD ('publisher' as object or string)."""
        for items in self._json_ld.values():
            for item in items:
                if 'publisher' in item:
                    publisher = item['publisher']
                    if isinstance(publisher, dict) and 'name' in publisher:
                        return publisher['name']
                    if isinstance(publisher, str):
                        return publisher
        return None

    def _get_publication_date(self) -> Optional[str]:
        """Get publication date from JSON-LD ('datePublished' or 'publishDate')."""
        for items in self._json_ld.values():
            for item in items:
                if 'datePublished' in item:
                    return item['datePublished']
                if 'publishDate' in item:
                    return item['publishDate']
        return None

    def _parse_attributes(self, attr_string: str) -> Dict[str, str]:
        """
        Parse HTML attributes from the interior of a tag.

        Supports double-quoted, single-quoted and unquoted values, hyphenated
        attribute names (e.g. http-equiv), and standalone/boolean attributes
        (stored with an empty string value).

        Args:
            attr_string: String containing HTML attributes

        Returns:
            Dictionary mapping lower-cased attribute names to values.
        """
        attrs: Dict[str, str] = {}
        # name="value" | name='value' | name=value  (hyphens allowed in names)
        valued = re.compile(r'([\w-]+)\s*=\s*(?:"([^"]*)"|\'([^\']*)\'|([^\s>]+))')
        for match in valued.finditer(attr_string):
            name = match.group(1).lower()
            value = match.group(2) or match.group(3) or match.group(4) or ''
            attrs[name] = value

        # Scan only what is left after removing name=value pairs, so words
        # inside quoted values are not mistaken for standalone attributes.
        remainder = valued.sub(' ', attr_string)
        for match in re.finditer(r'\b([\w-]+)\b', remainder):
            attrs.setdefault(match.group(1).lower(), '')
        return attrs

    def _clean_text(self, text: str) -> str:
        """
        Clean up text content: collapse whitespace and decode HTML entities.

        Args:
            text: The text to clean

        Returns:
            Cleaned text
        """
        collapsed = re.sub(r'\s+', ' ', text).strip()
        # html.unescape handles named and numeric character references,
        # a superset of the handful of entities decoded previously.
        return html.unescape(collapsed)