# pyWebLayout/io/readers/html_metadata.py
"""
HTML metadata reader for pyWebLayout.
This module provides specialized functionality for extracting metadata
from HTML documents, following the decomposed architecture pattern.
"""
import html
import json
import re
from typing import Any, Dict, Optional

from pyWebLayout.abstract.document import Document, MetadataType
from pyWebLayout.io.readers.base import MetadataReader
class HTMLMetadataReader(MetadataReader):
    """
    Specialized reader for extracting metadata from HTML documents.

    This class handles HTML meta tags, title elements, and other metadata
    sources like Open Graph tags, Twitter Cards and JSON-LD structured data.
    All parsing is regex-based; no external HTML parser is required.
    """

    def __init__(self):
        """Initialize the HTML metadata reader with empty state."""
        self._title: Optional[str] = None          # text of the <title> tag, if any
        self._meta_tags: Dict[str, str] = {}       # name=/http-equiv=/charset= meta values
        self._og_tags: Dict[str, str] = {}         # Open Graph properties (without "og:" prefix)
        self._twitter_tags: Dict[str, str] = {}    # Twitter Card properties (without "twitter:" prefix)
        self._json_ld: Dict[str, list] = {}        # JSON-LD items grouped by their @type string

    def extract_metadata(self, html_content: str, document: Document) -> Dict[str, Any]:
        """
        Extract metadata from HTML content and populate the document.

        Args:
            html_content: The HTML content to parse
            document: The document to populate with metadata

        Returns:
            Dictionary of extracted metadata with keys 'title', 'meta_tags',
            'open_graph', 'twitter_cards' and 'json_ld'.
        """
        # Reset internal state so the reader can be reused across documents
        self._reset()

        self._extract_title(html_content)
        self._extract_meta_tags(html_content)
        self._extract_open_graph(html_content)
        self._extract_twitter_cards(html_content)
        self._extract_json_ld(html_content)

        # Push the best value found for each metadata field into the document
        self._populate_document(document)

        return {
            'title': self._title,
            'meta_tags': self._meta_tags,
            'open_graph': self._og_tags,
            'twitter_cards': self._twitter_tags,
            'json_ld': self._json_ld
        }

    def _reset(self):
        """Reset internal state for a new extraction."""
        self._title = None
        self._meta_tags = {}
        self._og_tags = {}
        self._twitter_tags = {}
        self._json_ld = {}

    def _extract_title(self, html_content: str):
        """
        Extract the document title from the first <title> element.

        Args:
            html_content: The HTML content to parse
        """
        # DOTALL lets the title span multiple lines; whitespace is collapsed below
        title_match = re.search(r'<title[^>]*>(.*?)</title>', html_content,
                                re.IGNORECASE | re.DOTALL)
        if title_match:
            self._title = self._clean_text(title_match.group(1))

    def _extract_meta_tags(self, html_content: str):
        """
        Extract <meta> tags: name=, http-equiv= and charset= variants.

        Args:
            html_content: The HTML content to parse
        """
        for match in re.finditer(r'<meta\s+([^>]+)>', html_content, re.IGNORECASE):
            attrs = self._parse_attributes(match.group(1))

            name = attrs.get('name', '').lower()
            content = attrs.get('content', '')
            if name and content:
                self._meta_tags[name] = content

            # http-equiv directives are namespaced to avoid clashing with name= keys
            http_equiv = attrs.get('http-equiv', '').lower()
            if http_equiv and content:
                self._meta_tags[f'http-equiv:{http_equiv}'] = content

            charset = attrs.get('charset', '')
            if charset:
                self._meta_tags['charset'] = charset

    def _extract_open_graph(self, html_content: str):
        """
        Extract Open Graph meta tags (<meta property="og:..." content="...">).

        Attribute order and quoting style do not matter: each meta tag's
        attributes are parsed, then property values with an "og:" prefix are
        stored (keyed without the prefix) alongside their content value.

        Args:
            html_content: The HTML content to parse
        """
        for match in re.finditer(r'<meta\s+([^>]+)>', html_content, re.IGNORECASE):
            attrs = self._parse_attributes(match.group(1))
            prop = attrs.get('property', '')
            if prop.lower().startswith('og:'):
                self._og_tags[prop[3:]] = attrs.get('content', '')

    def _extract_twitter_cards(self, html_content: str):
        """
        Extract Twitter Card meta tags (<meta name="twitter:..." content="...">).

        Args:
            html_content: The HTML content to parse
        """
        for match in re.finditer(r'<meta\s+([^>]+)>', html_content, re.IGNORECASE):
            attrs = self._parse_attributes(match.group(1))
            name = attrs.get('name', '')
            if name.lower().startswith('twitter:'):
                self._twitter_tags[name[8:]] = attrs.get('content', '')

    def _extract_json_ld(self, html_content: str):
        """
        Extract JSON-LD structured data from <script type="application/ld+json">.

        Items (and members of top-level arrays) carrying a string @type are
        grouped in self._json_ld by that type name. Invalid JSON and items
        without a usable @type are skipped silently.

        Args:
            html_content: The HTML content to parse
        """
        json_ld_pattern = (r'<script[^>]*type=["\']application/ld\+json["\']'
                           r'[^>]*>(.*?)</script>')
        for match in re.finditer(json_ld_pattern, html_content,
                                 re.IGNORECASE | re.DOTALL):
            try:
                data = json.loads(match.group(1).strip())
            except json.JSONDecodeError:
                # Skip invalid JSON-LD payloads
                continue

            items = data if isinstance(data, list) else [data]
            for item in items:
                if not isinstance(item, dict):
                    continue
                type_name = item.get('@type')
                # @type may be a list or missing; only plain strings are usable keys
                if isinstance(type_name, str):
                    self._json_ld.setdefault(type_name, []).append(item)

    def _populate_document(self, document: Document):
        """
        Populate the document with the best value found for each field.

        Fields with no available value are left untouched.

        Args:
            document: The document to populate
        """
        candidates = [
            (MetadataType.TITLE, self._get_best_title()),
            (MetadataType.DESCRIPTION, self._get_best_description()),
            (MetadataType.AUTHOR, self._get_best_author()),
            (MetadataType.KEYWORDS, self._get_keywords()),
            (MetadataType.LANGUAGE, self._get_language()),
            (MetadataType.COVER_IMAGE, self._get_cover_image()),
            (MetadataType.PUBLISHER, self._get_publisher()),
            (MetadataType.PUBLICATION_DATE, self._get_publication_date()),
        ]
        for meta_type, value in candidates:
            if value:
                document.set_metadata(meta_type, value)

    def _get_best_title(self) -> Optional[str]:
        """Get the best available title from all sources.

        Priority order: Open Graph > Twitter > JSON-LD > meta > HTML <title>.
        """
        if 'title' in self._og_tags:
            return self._og_tags['title']
        if 'title' in self._twitter_tags:
            return self._twitter_tags['title']
        # JSON-LD: 'name' is preferred over 'headline' within each item
        for items in self._json_ld.values():
            for item in items:
                if 'name' in item:
                    return item['name']
                if 'headline' in item:
                    return item['headline']
        for key in ('title', 'og:title', 'twitter:title'):
            if key in self._meta_tags:
                return self._meta_tags[key]
        # Fall back to the HTML <title> element
        return self._title

    def _get_best_description(self) -> Optional[str]:
        """Get the best available description from all sources.

        Priority order: Open Graph > Twitter > meta description > JSON-LD.
        """
        if 'description' in self._og_tags:
            return self._og_tags['description']
        if 'description' in self._twitter_tags:
            return self._twitter_tags['description']
        if 'description' in self._meta_tags:
            return self._meta_tags['description']
        for items in self._json_ld.values():
            for item in items:
                if 'description' in item:
                    return item['description']
        return None

    def _get_best_author(self) -> Optional[str]:
        """Get the best available author: meta tag first, then JSON-LD.

        JSON-LD 'author'/'creator' entries may be objects ({'name': ...})
        or plain strings; both forms are handled.
        """
        if 'author' in self._meta_tags:
            return self._meta_tags['author']
        for items in self._json_ld.values():
            for item in items:
                for key in ('author', 'creator'):
                    if key in item:
                        person = item[key]
                        if isinstance(person, dict) and 'name' in person:
                            return person['name']
                        if isinstance(person, str):
                            return person
        return None

    def _get_keywords(self) -> Optional[str]:
        """Get keywords from meta tags."""
        return self._meta_tags.get('keywords')

    def _get_language(self) -> Optional[str]:
        """Get language from meta tags.

        NOTE(review): the html lang attribute is not consulted; extend here
        if that source is needed.
        """
        return self._meta_tags.get('language')

    def _get_cover_image(self) -> Optional[str]:
        """Get the best available cover image URL from all sources.

        Priority order: Open Graph > Twitter > JSON-LD. JSON-LD images may
        be objects ({'url': ...}) or plain strings.
        """
        if 'image' in self._og_tags:
            return self._og_tags['image']
        if 'image' in self._twitter_tags:
            return self._twitter_tags['image']
        for items in self._json_ld.values():
            for item in items:
                if 'image' in item:
                    image = item['image']
                    if isinstance(image, dict) and 'url' in image:
                        return image['url']
                    if isinstance(image, str):
                        return image
        return None

    def _get_publisher(self) -> Optional[str]:
        """Get publisher from JSON-LD ('publisher' as object or string)."""
        for items in self._json_ld.values():
            for item in items:
                if 'publisher' in item:
                    publisher = item['publisher']
                    if isinstance(publisher, dict) and 'name' in publisher:
                        return publisher['name']
                    if isinstance(publisher, str):
                        return publisher
        return None

    def _get_publication_date(self) -> Optional[str]:
        """Get publication date from JSON-LD ('datePublished' or 'publishDate')."""
        for items in self._json_ld.values():
            for item in items:
                if 'datePublished' in item:
                    return item['datePublished']
                if 'publishDate' in item:
                    return item['publishDate']
        return None

    def _parse_attributes(self, attr_string: str) -> Dict[str, str]:
        """
        Parse HTML attributes from the interior of a tag.

        Supports double-quoted, single-quoted and unquoted values, hyphenated
        attribute names (e.g. http-equiv), and standalone/boolean attributes
        (stored with an empty string value).

        Args:
            attr_string: String containing HTML attributes

        Returns:
            Dictionary mapping lower-cased attribute names to values.
        """
        attrs: Dict[str, str] = {}
        # name="value" | name='value' | name=value  (hyphens allowed in names)
        valued = re.compile(r'([\w-]+)\s*=\s*(?:"([^"]*)"|\'([^\']*)\'|([^\s>]+))')
        for match in valued.finditer(attr_string):
            name = match.group(1).lower()
            value = match.group(2) or match.group(3) or match.group(4) or ''
            attrs[name] = value

        # Scan only what is left after removing name=value pairs, so words
        # inside quoted values are not mistaken for standalone attributes.
        remainder = valued.sub(' ', attr_string)
        for match in re.finditer(r'\b([\w-]+)\b', remainder):
            attrs.setdefault(match.group(1).lower(), '')
        return attrs

    def _clean_text(self, text: str) -> str:
        """
        Clean up text content: collapse whitespace and decode HTML entities.

        Args:
            text: The text to clean

        Returns:
            Cleaned text
        """
        collapsed = re.sub(r'\s+', ' ', text).strip()
        # html.unescape handles named and numeric character references,
        # a superset of the handful of entities decoded previously.
        return html.unescape(collapsed)