pyWebLayout/html_browser.py
Duncan Tourolle 899182152a
Some checks failed
Python CI / test (push) Failing after 4m51s
add some additional tests
2025-06-07 20:16:38 +02:00

643 lines
24 KiB
Python

#!/usr/bin/env python3
"""
Simple HTML Browser using pyWebLayout
This browser can render basic HTML content using the pyWebLayout concrete objects.
It supports text, images, links, forms, and basic styling.
"""
import re
import tkinter as tk
from tkinter import ttk, messagebox, filedialog, simpledialog
from PIL import Image, ImageTk
from typing import Dict, List, Optional, Tuple, Any
import webbrowser
import os
from urllib.parse import urljoin, urlparse
import requests
from io import BytesIO
# Import pyWebLayout components
from pyWebLayout.concrete import (
Page, Container, Box, Text, RenderableImage,
RenderableLink, RenderableButton, RenderableForm, RenderableFormField
)
from pyWebLayout.abstract.functional import (
Link, Button, Form, FormField, LinkType, FormFieldType
)
from pyWebLayout.style.fonts import Font, FontWeight, FontStyle, TextDecoration
from pyWebLayout.style.layout import Alignment
class HTMLParser:
    """Convert a small subset of HTML into pyWebLayout renderable objects.

    The parser is regex/token based (no DOM is built).  It understands text,
    ``b/strong``, ``i/em``, ``u``, ``h1``-``h6``, ``a``, ``img``, ``br`` and
    ``p``; every other tag is ignored while its text content is still
    rendered.

    Attributes:
        font_stack: Stack of fonts; the last entry styles newly created text.
            The first entry is the default font and is never popped.
        current_container: The page created by the most recent call to
            :meth:`parse_html_string` (``None`` before the first parse).
        base_url: Base location used to resolve relative image sources.
    """

    # Size and background colour of every page this parser creates.
    PAGE_SIZE = (800, 1600)
    PAGE_BACKGROUND = (255, 255, 255)

    # Font size applied to each heading level.
    HEADING_SIZES = {'h1': 24, 'h2': 20, 'h3': 18, 'h4': 16, 'h5': 14, 'h6': 12}

    # Tags that push a font onto ``font_stack`` for the text they enclose.
    STYLE_TAGS = frozenset(('b', 'strong', 'i', 'em', 'u') + tuple(HEADING_SIZES))

    # Compiled once; both are used for every tag of every document.
    _TAG_RE = re.compile(r'<(/?)([^>]+)>')
    _ATTR_RE = re.compile(r'([\w:-]+)=(?:"([^"]*)"|\'([^\']*)\'|([^\s>]+))')

    def __init__(self):
        self.font_stack = [Font(font_size=14)]  # Default font
        self.current_container = None
        # Set here so _handle_tag can run before any parse_* call.
        self.base_url = ""

    def parse_html_string(self, html_content: str, base_url: str = "") -> "Page":
        """Parse an HTML string and return the populated page.

        Args:
            html_content: Raw HTML markup.
            base_url: Location used to resolve relative ``<img src>`` values;
                either a URL or a local directory.

        Returns:
            A new page containing the rendered elements of the ``<body>``
            (or of the whole input when there is no ``<body>`` tag).
        """
        page = Page(size=self.PAGE_SIZE, background_color=self.PAGE_BACKGROUND)
        self.current_container = page
        self.base_url = base_url
        # Start every document from the default font only.
        del self.font_stack[1:]

        # Simple regex clean-up (not production-ready, but works for a demo).
        # Tag names are matched case-insensitively so <SCRIPT> is removed too.
        flags = re.DOTALL | re.IGNORECASE
        html_content = re.sub(r'<!--.*?-->', '', html_content, flags=re.DOTALL)
        html_content = re.sub(r'<script\b.*?</script\s*>', '', html_content, flags=flags)
        html_content = re.sub(r'<style\b.*?</style\s*>', '', html_content, flags=flags)

        title_match = re.search(r'<title[^>]*>(.*?)</title>', html_content, flags)
        if title_match:
            # Collapse line breaks/indentation inside a multi-line title.
            page.title = ' '.join(title_match.group(1).split())

        body_match = re.search(r'<body[^>]*>(.*?)</body>', html_content, flags)
        # Without a body tag the whole document is treated as content.
        body_content = body_match.group(1) if body_match else html_content

        self._parse_content(body_content, page)
        return page

    def parse_html_file(self, file_path: str) -> "Page":
        """Parse a local HTML file and return a page.

        The file's directory becomes the base for relative image paths.  This
        method never raises: any failure while reading or parsing yields a
        page that contains a red error message instead.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                html_content = f.read()
            base_url = os.path.dirname(os.path.abspath(file_path))
            return self.parse_html_string(html_content, base_url)
        except Exception as e:
            # Deliberate best-effort: callers always get something to render.
            page = Page(size=self.PAGE_SIZE, background_color=self.PAGE_BACKGROUND)
            error_text = Text(f"Error loading file: {str(e)}", Font(font_size=16, colour=(255, 0, 0)))
            page.add_child(error_text)
            return page

    def _add_text(self, raw_text: str, container: "Container"):
        """Add ``raw_text`` (stripped) to ``container`` in the current font.

        Whitespace-only text is skipped.
        """
        stripped = raw_text.strip()
        if stripped:
            container.add_child(Text(stripped, self.font_stack[-1]))

    def _parse_content(self, content: str, container: "Container"):
        """Tokenize ``content`` and add the resulting elements to ``container``."""
        tokens = self._tokenize_html(content)
        i = 0
        while i < len(tokens):
            token = tokens[i]
            if token['type'] == 'tag':
                # The tag handler may consume several tokens (a styled span).
                i = self._handle_tag_with_content(token, tokens, i, container)
                continue
            self._add_text(token['content'], container)
            i += 1

    def _handle_tag_with_content(self, token, tokens, current_index, container):
        """Handle the tag at ``current_index`` and return the next index to parse.

        Styling tags (see ``STYLE_TAGS``) consume everything up to their
        matching closing tag, rendering it with the pushed font.  All other
        tags are handled on their own and consume one token.
        """
        tag_name = token['name']

        if token['closing']:
            # A closing tag seen here has no opener that consumed it.
            if tag_name in self.STYLE_TAGS and len(self.font_stack) > 1:
                self.font_stack.pop()
            return current_index + 1

        if tag_name not in self.STYLE_TAGS:
            self._handle_tag(token, container)
            return current_index + 1

        # Remember the depth so the stack can be restored exactly, whatever
        # nested (or unbalanced) style tags appear inside this span.
        base_depth = len(self.font_stack)
        self._handle_tag(token, container)  # pushes the font for this tag
        content_end = self._find_matching_closing_tag(tokens, current_index, tag_name)
        try:
            for inner in tokens[current_index + 1:content_end]:
                if inner['type'] == 'text':
                    self._add_text(inner['content'], container)
                elif not inner['closing']:
                    # Nested opening tag (may push a further font).
                    self._handle_tag(inner, container)
                elif (inner['name'] in self.STYLE_TAGS
                        and len(self.font_stack) > base_depth + 1):
                    # Nested closing style tag: drop the nested font but never
                    # the one belonging to the span being processed.
                    self.font_stack.pop()
        finally:
            del self.font_stack[base_depth:]

        # Skip past the closing tag; clamp when the tag was never closed.
        return min(content_end + 1, len(tokens))

    def _find_matching_closing_tag(self, tokens, start_index, tag_name):
        """Return the index of the tag closing the opener at ``start_index``.

        Nested openers of the same name are balanced.  When no matching
        closing tag exists, ``len(tokens)`` is returned.
        """
        open_count = 1
        for index in range(start_index + 1, len(tokens)):
            candidate = tokens[index]
            if candidate['type'] != 'tag' or candidate['name'] != tag_name:
                continue
            open_count += -1 if candidate['closing'] else 1
            if open_count == 0:
                return index
        return len(tokens)

    def _tokenize_html(self, content: str) -> List[Dict]:
        """Split ``content`` into text and tag tokens.

        Returns:
            A list of dicts.  Text tokens are ``{'type': 'text', 'content'}``;
            tag tokens are ``{'type': 'tag', 'name', 'closing', 'attributes',
            'content'}`` where ``name`` is lower-cased (a trailing ``/`` of a
            self-closing tag is dropped) and ``attributes`` maps lower-cased
            attribute names to their string values (``''`` for empty values).
        """
        tokens = []
        last_end = 0

        for match in self._TAG_RE.finditer(content):
            text_content = content[last_end:match.start()]
            if text_content:
                tokens.append({'type': 'text', 'content': text_content})
            last_end = match.end()

            tag_content = match.group(2)
            # Split only once so whitespace inside quoted values survives.
            tag_parts = tag_content.split(None, 1)
            tag_name = tag_parts[0].lower().rstrip('/') if tag_parts else ''
            if not tag_name:
                # Things like "< >" are not tags; keep them as literal text.
                tokens.append({'type': 'text', 'content': match.group(0)})
                continue

            attributes = {}
            if len(tag_parts) > 1:
                for attr_match in self._ATTR_RE.finditer(tag_parts[1]):
                    # Exactly one alternative matched; an empty quoted value
                    # is '' (not None), so test identity rather than truth.
                    attr_value = next(
                        (g for g in attr_match.group(2, 3, 4) if g is not None), '')
                    attributes[attr_match.group(1).lower()] = attr_value

            tokens.append({
                'type': 'tag',
                'name': tag_name,
                'closing': bool(match.group(1)),
                'attributes': attributes,
                'content': tag_content,
            })

        # Text after the last tag.
        if last_end < len(content):
            tokens.append({'type': 'text', 'content': content[last_end:]})

        return tokens

    def _handle_tag(self, token: Dict, container: "Container"):
        """Apply the effect of a single tag token.

        Opening style tags push a derived font onto ``font_stack``; closing
        style tags pop one (the default font is never popped).  ``a``,
        ``img``, ``br`` and ``p`` add an element to ``container``.  Any other
        tag is ignored.
        """
        tag_name = token['name']
        attributes = token['attributes']
        current_font = self.font_stack[-1]

        if token['closing']:
            if tag_name in self.STYLE_TAGS and len(self.font_stack) > 1:
                self.font_stack.pop()
            return

        if tag_name in self.HEADING_SIZES:
            heading_font = current_font.with_size(self.HEADING_SIZES[tag_name])
            self.font_stack.append(heading_font.with_weight(FontWeight.BOLD))

        elif tag_name in ('b', 'strong'):
            self.font_stack.append(current_font.with_weight(FontWeight.BOLD))

        elif tag_name in ('i', 'em'):
            self.font_stack.append(current_font.with_style(FontStyle.ITALIC))

        elif tag_name == 'u':
            self.font_stack.append(current_font.with_decoration(TextDecoration.UNDERLINE))

        elif tag_name == 'a':
            # ``or`` also covers an explicitly empty attribute value.
            href = attributes.get('href') or '#'
            title = attributes.get('title') or href

            # Only absolute http(s) targets are external; fragments and
            # relative paths are both treated as internal.
            if href.lower().startswith(('http://', 'https://')):
                link_type = LinkType.EXTERNAL
            else:
                link_type = LinkType.INTERNAL

            def link_callback(location, **kwargs):
                return f"Navigate to: {location}"

            link = Link(href, link_type, link_callback, title=title)
            link_font = current_font.with_colour((0, 0, 255)).with_decoration(TextDecoration.UNDERLINE)

            # NOTE: the visible link text is the title (or href), not the
            # text between <a> and </a>; that text is rendered separately
            # as ordinary text by the caller.
            container.add_child(RenderableLink(link, title, link_font))

        elif tag_name == 'img':
            src = attributes.get('src', '')
            alt = attributes.get('alt', 'Image')
            width = attributes.get('width')
            height = attributes.get('height')

            if src:
                # Resolve relative sources against a directory or a URL.
                if self.base_url and not src.startswith(('http://', 'https://')):
                    if os.path.isdir(self.base_url):
                        src = os.path.join(self.base_url, src)
                    else:
                        src = urljoin(self.base_url, src)

                try:
                    from pyWebLayout.abstract.block import Image as AbstractImage
                    abstract_img = AbstractImage(src, alt)

                    # Only plain integer dimensions are honoured.
                    max_width = int(width) if width and width.isdigit() else None
                    max_height = int(height) if height and height.isdigit() else None

                    container.add_child(RenderableImage(abstract_img, max_width, max_height))
                except Exception:
                    # Best-effort: a broken image must not abort the page.
                    error_text = Text(f"[Image Error: {alt}]", Font(colour=(255, 0, 0)))
                    container.add_child(error_text)

        elif tag_name == 'br':
            # Line break: a thin spacer adds vertical space.
            container.add_child(Box((0, 0), (1, 10)))

        elif tag_name == 'p':
            # Paragraph: a smaller spacer before the paragraph text.
            container.add_child(Box((0, 0), (1, 5)))

        # div, span and unknown tags add nothing; their text is still parsed.
class BrowserWindow:
    """Main browser window built with Tkinter.

    Attributes:
        root: The Tk root window.
        current_page: The page currently displayed, or ``None``.
        history: URLs/paths visited, oldest first.
        history_index: Position of the current page in ``history`` (-1 when
            the history is empty).
        page_elements: ``(element, (x, y), (w, h))`` tuples for the clickable
            elements of the rendered page.
        photo: Reference to the displayed ``PhotoImage``; kept on the
            instance so the image is not garbage collected.
    """

    def __init__(self):
        self.root = tk.Tk()
        self.root.title("pyWebLayout HTML Browser")
        self.root.geometry("900x700")

        self.current_page = None
        self.history = []
        self.history_index = -1
        # Initialised before the mouse bindings exist so the event handlers
        # are safe even if the first render fails.
        self.page_elements = []
        self.photo = None

        self.setup_ui()

    def setup_ui(self):
        """Create all widgets, bind events and load the welcome page."""
        main_frame = ttk.Frame(self.root)
        main_frame.pack(fill=tk.BOTH, expand=True, padx=5, pady=5)

        # Navigation bar
        nav_frame = ttk.Frame(main_frame)
        nav_frame.pack(fill=tk.X, pady=(0, 5))

        # The buttons previously had empty labels, leaving them unidentifiable.
        self.back_btn = ttk.Button(nav_frame, text="Back", command=self.go_back, state=tk.DISABLED)
        self.back_btn.pack(side=tk.LEFT, padx=(0, 5))

        self.forward_btn = ttk.Button(nav_frame, text="Forward", command=self.go_forward, state=tk.DISABLED)
        self.forward_btn.pack(side=tk.LEFT, padx=(0, 5))

        self.refresh_btn = ttk.Button(nav_frame, text="Reload", command=self.refresh)
        self.refresh_btn.pack(side=tk.LEFT, padx=(0, 10))

        # Address bar
        ttk.Label(nav_frame, text="URL:").pack(side=tk.LEFT)
        self.url_var = tk.StringVar()
        self.url_entry = ttk.Entry(nav_frame, textvariable=self.url_var, width=50)
        self.url_entry.pack(side=tk.LEFT, fill=tk.X, expand=True, padx=(5, 5))
        self.url_entry.bind('<Return>', self.navigate_to_url)

        self.go_btn = ttk.Button(nav_frame, text="Go", command=self.navigate_to_url)
        self.go_btn.pack(side=tk.LEFT, padx=(0, 10))

        # File operations
        self.open_btn = ttk.Button(nav_frame, text="Open File", command=self.open_file)
        self.open_btn.pack(side=tk.LEFT)

        # Scrollable content area
        content_frame = ttk.Frame(main_frame)
        content_frame.pack(fill=tk.BOTH, expand=True)

        self.canvas = tk.Canvas(content_frame, bg='white')
        v_scrollbar = ttk.Scrollbar(content_frame, orient=tk.VERTICAL, command=self.canvas.yview)
        h_scrollbar = ttk.Scrollbar(content_frame, orient=tk.HORIZONTAL, command=self.canvas.xview)
        self.canvas.configure(yscrollcommand=v_scrollbar.set, xscrollcommand=h_scrollbar.set)

        v_scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
        h_scrollbar.pack(side=tk.BOTTOM, fill=tk.X)
        self.canvas.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)

        # Status bar
        self.status_var = tk.StringVar(value="Ready")
        status_bar = ttk.Label(main_frame, textvariable=self.status_var, relief=tk.SUNKEN)
        status_bar.pack(fill=tk.X, pady=(5, 0))

        # Mouse interaction with the rendered page
        self.canvas.bind('<Button-1>', self.on_click)
        self.canvas.bind('<Motion>', self.on_mouse_move)

        self.load_default_page()

    def load_default_page(self):
        """Render the built-in welcome page."""
        html_content = """
        <html>
        <head><title>pyWebLayout Browser - Welcome</title></head>
        <body>
        <h1>Welcome to pyWebLayout Browser</h1>
        <p>This is a simple HTML browser built using pyWebLayout components.</p>
        <h2>Features:</h2>
        <ul>
        <li>Basic HTML rendering</li>
        <li>Text formatting (bold, italic, underline)</li>
        <li>Headers (H1-H6)</li>
        <li>Links (clickable)</li>
        <li>Images</li>
        <li>Forms (basic support)</li>
        </ul>
        <h2>Try these features:</h2>
        <p><b>Bold text</b>, <i>italic text</i>, and <u>underlined text</u></p>
        <p>Sample link: <a href="https://www.example.com" title="External link">Visit Example.com</a></p>
        <h3>File Operations</h3>
        <p>Use the "Open File" button to load local HTML files.</p>
        <p>Or enter a URL in the address bar above.</p>
        </body>
        </html>
        """

        parser = HTMLParser()
        self.current_page = parser.parse_html_string(html_content)
        self.render_page()
        self.status_var.set("Welcome page loaded")

    def navigate_to_url(self, event=None):
        """Load the address-bar URL and record it in the history.

        Args:
            event: Tk event when triggered by a key binding; unused.
        """
        url = self.url_var.get().strip()
        if url:
            self._load_url(url, add_history=True)

    def _load_url(self, url, add_history=True):
        """Fetch, parse and render ``url``; return ``True`` on success.

        ``url`` may be an http(s) URL, a ``file://`` URL or a local path.
        Failures are reported in the status bar and an error dialog.

        Args:
            url: Location to load.
            add_history: Append the location to the history.  History
                navigation and reloads pass ``False`` so that they do not
                truncate or duplicate history entries.
        """
        self.status_var.set(f"Loading {url}...")
        # Repaint the status bar without processing other pending events
        # (a full update() could re-enter this handler).
        self.root.update_idletasks()

        try:
            parser = HTMLParser()
            if url.startswith(('http://', 'https://')):
                response = requests.get(url, timeout=10)
                response.raise_for_status()
                self.current_page = parser.parse_html_string(response.text, url)
            else:
                # Strip only the leading scheme, never later occurrences.
                file_path = url[len('file://'):] if url.startswith('file://') else url
                if not os.path.isfile(file_path):
                    raise FileNotFoundError(f"File not found: {os.path.abspath(file_path)}")
                self.current_page = parser.parse_html_file(file_path)

            if add_history:
                self.add_to_history(url)

            self.render_page()
            self.status_var.set(f"Loaded {url}")
            return True

        except Exception as e:
            # UI boundary: report every failure instead of crashing the app.
            self.status_var.set(f"Error loading {url}: {str(e)}")
            messagebox.showerror("Error", f"Failed to load {url}:\n{str(e)}")
            return False

    def open_file(self):
        """Ask the user for a local HTML file and load it."""
        file_path = filedialog.askopenfilename(
            title="Open HTML File",
            filetypes=[("HTML files", "*.html *.htm"), ("All files", "*.*")]
        )

        if file_path:
            self.url_var.set(file_path)
            self.navigate_to_url()

    def render_page(self):
        """Draw ``current_page`` on the canvas and index its clickable elements."""
        if self.current_page is None:
            return

        self.canvas.delete("all")

        # Render to a PIL image, then convert for Tk.
        page_image = self.current_page.render()
        self.photo = ImageTk.PhotoImage(page_image)

        self.canvas.create_image(0, 0, anchor=tk.NW, image=self.photo)
        self.canvas.configure(scrollregion=self.canvas.bbox("all"))

        self.page_elements = self._get_clickable_elements(self.current_page)

    def _get_clickable_elements(self, container, offset=(0, 0)) -> List[Tuple]:
        """Collect the links and buttons below ``container``.

        Args:
            container: Object whose ``_children`` are inspected; objects
                without that attribute yield an empty list.
            offset: Position of ``container`` that child origins are added to.

        Returns:
            A list of ``(element, (x, y), size)`` tuples.
        """
        elements = []

        for child in getattr(container, '_children', ()):
            if not hasattr(child, '_origin'):
                continue
            child_offset = (offset[0] + child._origin[0], offset[1] + child._origin[1])

            if isinstance(child, (RenderableLink, RenderableButton)):
                elements.append((child, child_offset, child._size))

            if hasattr(child, '_children'):
                elements.extend(self._get_clickable_elements(child, child_offset))

        return elements

    def _element_at(self, x, y):
        """Return the first clickable element containing ``(x, y)``, else ``None``.

        Coordinates are page/canvas coordinates; element bounds are inclusive.
        """
        for element, (left, top), (width, height) in self.page_elements:
            if left <= x <= left + width and top <= y <= top + height:
                return element
        return None

    def on_click(self, event):
        """Activate the link or button under the mouse click, if any."""
        # canvasx/canvasy account for the current scroll position.
        element = self._element_at(self.canvas.canvasx(event.x),
                                   self.canvas.canvasy(event.y))
        if element is None:
            return

        if isinstance(element, RenderableLink):
            result = element._callback()
            if result:
                self.status_var.set(result)

            # External links are handed to the system browser.
            if element._link.link_type == LinkType.EXTERNAL:
                webbrowser.open(element._link.location)

        elif isinstance(element, RenderableButton):
            result = element._callback()
            if result:
                self.status_var.set(f"Button clicked: {result}")

    def on_mouse_move(self, event):
        """Show a hand cursor while the mouse is over a clickable element."""
        element = self._element_at(self.canvas.canvasx(event.x),
                                   self.canvas.canvasy(event.y))
        self.canvas.configure(cursor="arrow" if element is None else "hand2")

    def add_to_history(self, url):
        """Append ``url`` after the current entry, dropping forward history."""
        self.history = self.history[:self.history_index + 1]
        self.history.append(url)
        self.history_index = len(self.history) - 1
        self.update_nav_buttons()

    def update_nav_buttons(self):
        """Enable Back/Forward only when there is somewhere to go."""
        can_go_back = self.history_index > 0
        can_go_forward = self.history_index < len(self.history) - 1
        self.back_btn.configure(state=tk.NORMAL if can_go_back else tk.DISABLED)
        self.forward_btn.configure(state=tk.NORMAL if can_go_forward else tk.DISABLED)

    def _navigate_history(self, index):
        """Load history entry ``index`` without modifying the history list.

        ``history_index`` only moves when the page loads successfully.
        """
        url = self.history[index]
        self.url_var.set(url)
        if self._load_url(url, add_history=False):
            self.history_index = index
            self.update_nav_buttons()

    def go_back(self):
        """Navigate to the previous history entry, if there is one."""
        if self.history_index > 0:
            self._navigate_history(self.history_index - 1)

    def go_forward(self):
        """Navigate to the next history entry, if there is one."""
        if self.history_index < len(self.history) - 1:
            self._navigate_history(self.history_index + 1)

    def refresh(self):
        """Reload the address-bar URL, or the welcome page when it is empty."""
        if self.current_page is None:
            return

        current_url = self.url_var.get().strip()
        if not current_url:
            self.load_default_page()
            return

        # Reloading the current entry must not add a duplicate; a URL that
        # was typed but not yet visited is still recorded.
        is_current_entry = (0 <= self.history_index < len(self.history)
                            and self.history[self.history_index] == current_url)
        self._load_url(current_url, add_history=not is_current_entry)

    def run(self):
        """Start the Tk main loop (blocks until the window is closed)."""
        self.root.mainloop()
def main():
    """Launch the browser window and block until it is closed.

    Any exception raised while creating or running the window is printed
    together with its traceback instead of propagating.
    """
    import traceback

    print("Starting pyWebLayout HTML Browser...")
    try:
        BrowserWindow().run()
    except Exception as error:
        print(f"Error starting browser: {error}")
        traceback.print_exc()


if __name__ == "__main__":
    main()