Coverage for pyWebLayout/io/readers/html_extraction.py: 88%
424 statements
« prev ^ index » next coverage.py v7.11.2, created at 2025-11-12 12:02 +0000
« prev ^ index » next coverage.py v7.11.2, created at 2025-11-12 12:02 +0000
1"""
2HTML extraction module for converting HTML elements to pyWebLayout abstract elements.
4This module provides handler functions for converting HTML elements into the abstract document structure
5used by pyWebLayout, including paragraphs, headings, lists, tables, and inline formatting.
6Each handler function has a robust signature that handles style hints, CSS classes, and attributes.
7"""
9from typing import List, Dict, Any, Optional, Union, Callable, Tuple, NamedTuple
10from bs4 import BeautifulSoup, Tag, NavigableString
11from pyWebLayout.abstract.inline import Word
12from pyWebLayout.abstract.block import (
13 Block,
14 Paragraph,
15 Heading,
16 HeadingLevel,
17 Quote,
18 CodeBlock,
19 HList,
20 ListItem,
21 ListStyle,
22 Table,
23 TableRow,
24 TableCell,
25 HorizontalRule,
26 Image,
27)
28from pyWebLayout.style import Font, FontWeight, FontStyle, TextDecoration
class StyleContext(NamedTuple):
    """
    Read-only bundle of styling state handed to every handler function.

    A context is never modified in place: each ``with_*`` helper and
    ``push_element`` returns a copy in which exactly one field is replaced.
    """

    # Font in effect for text produced under this context.
    font: Font
    # Background colour (RGBA tuple), or None when no background is set.
    background: Optional[Tuple[int, int, int, int]]
    # CSS class names collected so far.
    css_classes: set
    # CSS property -> value pairs collected so far.
    css_styles: Dict[str, str]
    # Attributes of the element this context was built for.
    element_attributes: Dict[str, Any]
    parent_elements: List[str]  # Stack of parent element names
    document: Optional[Any]  # Reference to document for font registry
    base_path: Optional[str] = None  # Base path for resolving relative URLs

    def with_font(self, font: Font) -> "StyleContext":
        """Return a copy of this context that uses ``font``."""
        updated = self._replace(font=font)
        return updated

    def with_background(
        self, background: Optional[Tuple[int, int, int, int]]
    ) -> "StyleContext":
        """Return a copy of this context that uses ``background``."""
        updated = self._replace(background=background)
        return updated

    def with_css_classes(self, css_classes: set) -> "StyleContext":
        """Return a copy of this context carrying ``css_classes``."""
        updated = self._replace(css_classes=css_classes)
        return updated

    def with_css_styles(self, css_styles: Dict[str, str]) -> "StyleContext":
        """Return a copy of this context carrying ``css_styles``."""
        updated = self._replace(css_styles=css_styles)
        return updated

    def with_attributes(self, attributes: Dict[str, Any]) -> "StyleContext":
        """Return a copy of this context carrying ``attributes``."""
        updated = self._replace(element_attributes=attributes)
        return updated

    def push_element(self, element_name: str) -> "StyleContext":
        """Return a copy whose parent stack has ``element_name`` appended."""
        # Concatenation builds a new list, so the stack held by ``self`` is
        # left untouched.
        extended_stack = self.parent_elements + [element_name]
        return self._replace(parent_elements=extended_stack)
def create_base_context(
        base_font: Optional[Font] = None,
        document=None,
        base_path: Optional[str] = None) -> StyleContext:
    """
    Build the root style context that parsing starts from.

    Args:
        base_font: Font to start with. When omitted, the font comes from
            ``document.get_or_create_font()`` if the document offers that
            method, otherwise from a default-constructed ``Font``.
        document: Document instance for font registry
        base_path: Base directory path for resolving relative URLs

    Returns:
        StyleContext with no background, classes, styles, attributes or
        parent elements.
    """
    font = base_font
    if font is None:
        # Prefer the document's registry so fonts are shared, not duplicated.
        has_registry = document and hasattr(document, 'get_or_create_font')
        font = document.get_or_create_font() if has_registry else Font()

    return StyleContext(
        font=font,
        background=None,
        css_classes=set(),
        css_styles={},
        element_attributes={},
        parent_elements=[],
        document=document,
        base_path=base_path,
    )
def apply_element_styling(context: StyleContext, element: Tag) -> StyleContext:
    """
    Apply element-specific styling to context based on HTML element and attributes.

    The returned context carries the element's attributes, the tag pushed onto
    the parent stack, CSS classes and styles merged with the incoming ones,
    and an updated font and background.

    Only the element's *own* inline styles are used when computing the font.
    Styling declared on ancestors already reaches this element through
    ``context.font``; re-applying the ancestors' declarations here would let
    them override this tag's defaults (for example a ``<b>`` nested in a
    ``font-weight: normal`` span would lose its bold weight).

    Args:
        context: Current style context
        element: BeautifulSoup Tag object

    Returns:
        New StyleContext with applied styling
    """
    tag_name = element.name.lower()
    attributes = dict(element.attrs) if element.attrs else {}

    # Merge CSS classes; the attribute may be a string or a list of names.
    css_classes = context.css_classes.copy()
    raw_classes = attributes.get("class")
    if raw_classes:
        css_classes.update(
            raw_classes.split() if isinstance(raw_classes, str) else raw_classes
        )

    # Parse this element's inline declarations and layer them over the
    # declarations collected from ancestors.
    raw_style = attributes.get("style")
    own_styles = parse_inline_styles(raw_style) if isinstance(raw_style, str) else {}
    css_styles = {**context.css_styles, **own_styles}

    new_context = (
        context.with_attributes(attributes)
        .push_element(tag_name)
        .with_css_classes(css_classes)
        .with_css_styles(css_styles)
    )

    # Tag defaults first, then the element's own CSS on top of them.
    font = apply_element_font_styles(
        new_context.font, tag_name, own_styles, new_context)
    new_context = new_context.with_font(font)

    # Background handling still sees the merged declarations.
    background = apply_background_styles(new_context.background, css_styles)
    return new_context.with_background(background)
def parse_inline_styles(style_text: str) -> Dict[str, str]:
    """
    Parse CSS inline styles into dictionary.

    Declarations are separated by ``;`` and split on their first ``:`` so
    values that contain colons are kept whole. Property names are lower-cased;
    values are stripped of surrounding whitespace but otherwise left as
    written. Declarations without a ``:`` or with an empty property name are
    skipped, and a later duplicate property replaces an earlier one.

    Args:
        style_text: CSS style text (e.g., "color: red; font-weight: bold;").
            ``None`` or an empty string yields an empty dictionary.

    Returns:
        Dictionary of CSS property-value pairs
    """
    styles: Dict[str, str] = {}
    if not style_text:
        return styles

    for declaration in style_text.split(";"):
        prop, colon, value = declaration.partition(":")
        prop = prop.strip().lower()
        if not colon or not prop:
            # Not a "property: value" pair; nothing meaningful to store.
            continue
        styles[prop] = value.strip()
    return styles
# Font adjustments implied by an HTML tag, applied before any CSS.
_ELEMENT_FONT_DEFAULTS = {
    "b": {"weight": FontWeight.BOLD},
    "strong": {"weight": FontWeight.BOLD},
    "i": {"style": FontStyle.ITALIC},
    "em": {"style": FontStyle.ITALIC},
    "u": {"decoration": TextDecoration.UNDERLINE},
    "s": {"decoration": TextDecoration.STRIKETHROUGH},
    "del": {"decoration": TextDecoration.STRIKETHROUGH},
    "h1": {"size": 24, "weight": FontWeight.BOLD},
    "h2": {"size": 20, "weight": FontWeight.BOLD},
    "h3": {"size": 18, "weight": FontWeight.BOLD},
    "h4": {"size": 16, "weight": FontWeight.BOLD},
    "h5": {"size": 14, "weight": FontWeight.BOLD},
    "h6": {"size": 12, "weight": FontWeight.BOLD},
}

# Colour keywords understood by the simplified CSS colour parser.
_CSS_NAMED_COLOURS = {
    "black": (0, 0, 0),
    "white": (255, 255, 255),
    "red": (255, 0, 0),
    "green": (0, 255, 0),
    "blue": (0, 0, 255),
}


def _parse_css_font_size(value: str) -> Optional[int]:
    """Return a positive integer size for a ``px``/``pt`` value, else None."""
    text = value.strip().lower()
    if not text.endswith(("px", "pt")):
        return None
    try:
        # int() raises OverflowError for infinities such as "1e999px".
        size = int(float(text[:-2]))
    except (ValueError, OverflowError):
        return None
    return size if size > 0 else None


def _parse_css_colour(value: str) -> Optional[Tuple[int, int, int]]:
    """Return an RGB tuple for a known keyword, ``#rrggbb`` or ``#rgb``, else None."""
    text = value.strip().lower()
    if text in _CSS_NAMED_COLOURS:
        return _CSS_NAMED_COLOURS[text]
    if not text.startswith("#"):
        return None

    digits = text[1:]
    if len(digits) == 3:
        # Shorthand notation: each digit stands for a doubled pair.
        digits = "".join(ch * 2 for ch in digits)
    if len(digits) != 6 or any(ch not in "0123456789abcdef" for ch in digits):
        return None
    return (int(digits[0:2], 16), int(digits[2:4], 16), int(digits[4:6], 16))


def apply_element_font_styles(
        font: Font,
        tag_name: str,
        css_styles: Dict[str, str],
        context: Optional[StyleContext] = None) -> Font:
    """
    Apply font styling based on HTML element and CSS styles.
    Uses document's font registry when available to avoid creating duplicate fonts.

    Starting from the properties of ``font``, the tag's default adjustments
    are applied first and the CSS declarations second, so CSS wins over tag
    defaults. CSS values that are not recognised leave the corresponding
    property unchanged. Font sizes are read from ``px`` and ``pt`` values
    (both taken as the same unit) and must be positive; colours may be one of
    a few keywords, ``#rrggbb`` or ``#rgb``.

    Args:
        font: Current font
        tag_name: HTML tag name
        css_styles: CSS styles dictionary
        context: Style context with document reference for font registry

    Returns:
        Font object with applied styling (either existing or newly created)
    """
    # Start with current font properties
    font_size = font.font_size
    colour = font.colour
    weight = font.weight
    style = font.style
    decoration = font.decoration
    background = font.background
    language = font.language
    font_path = font._font_path

    # Apply element default styles
    defaults = _ELEMENT_FONT_DEFAULTS.get(tag_name, {})
    font_size = defaults.get("size", font_size)
    weight = defaults.get("weight", weight)
    style = defaults.get("style", style)
    decoration = defaults.get("decoration", decoration)

    # Apply CSS styles (override element defaults)
    if "font-size" in css_styles:
        parsed_size = _parse_css_font_size(css_styles["font-size"])
        if parsed_size is not None:
            font_size = parsed_size

    if "font-weight" in css_styles:
        weight_value = css_styles["font-weight"].lower()
        if weight_value in ("bold", "700", "800", "900"):
            weight = FontWeight.BOLD
        elif weight_value in ("normal", "400"):
            weight = FontWeight.NORMAL

    if "font-style" in css_styles:
        style_value = css_styles["font-style"].lower()
        if style_value == "italic":
            style = FontStyle.ITALIC
        elif style_value == "normal":
            style = FontStyle.NORMAL

    if "text-decoration" in css_styles:
        decoration_value = css_styles["text-decoration"].lower()
        if "underline" in decoration_value:
            decoration = TextDecoration.UNDERLINE
        elif "line-through" in decoration_value:
            decoration = TextDecoration.STRIKETHROUGH
        elif "none" in decoration_value:
            decoration = TextDecoration.NONE

    if "color" in css_styles:
        parsed_colour = _parse_css_colour(css_styles["color"])
        if parsed_colour is not None:
            colour = parsed_colour

    document = context.document if context else None

    # Use document's style registry if available to avoid creating duplicate styles
    if document and hasattr(document, 'get_or_create_style'):
        from pyWebLayout.style.abstract_style import FontFamily, FontSize

        # Default family - could be enhanced to detect from font_path
        font_family = FontFamily.SERIF
        if font_size and isinstance(font_size, int):
            font_size_value = font_size
        else:
            font_size_value = FontSize.MEDIUM

        # Create abstract style and register it
        style_id, abstract_style = document.get_or_create_style(
            font_family=font_family,
            font_size=font_size_value,
            font_weight=weight,
            font_style=style,
            text_decoration=decoration,
            color=colour,
            language=language
        )

        # Get the concrete font for this style
        return document.get_font_for_style(abstract_style)

    if document and hasattr(document, 'get_or_create_font'):
        # Fallback to old font registry system
        return document.get_or_create_font(
            font_path=font_path,
            font_size=font_size,
            colour=colour,
            weight=weight,
            style=style,
            decoration=decoration,
            background=background,
            language=language,
            min_hyphenation_width=font.min_hyphenation_width
        )

    # Fallback to creating new font if no document context
    return Font(
        font_path=font_path,
        font_size=font_size,
        colour=colour,
        weight=weight,
        style=style,
        decoration=decoration,
        background=background,
        language=language,
    )
def apply_background_styles(
    current_background: Optional[Tuple[int, int, int, int]], css_styles: Dict[str, str]
) -> Optional[Tuple[int, int, int, int]]:
    """
    Work out the background after taking CSS into account.

    Only ``background-color: transparent`` (compared case-insensitively) has
    an effect: it clears the background. Every other declaration is left
    unparsed and the current background is carried over.

    Args:
        current_background: Current background color (RGBA)
        css_styles: CSS styles dictionary

    Returns:
        None when the CSS asks for a transparent background, otherwise
        ``current_background`` unchanged
    """
    declared = "background-color" in css_styles
    if declared and css_styles["background-color"].lower() == "transparent":
        return None
    # Colour values are not parsed here, so keep what we already have.
    return current_background
def extract_text_content(element: Tag, context: StyleContext) -> List[Word]:
    """
    Extract text content from an element, handling inline formatting and links.

    Direct text is split on whitespace into ``Word`` objects. Known inline
    tags are processed recursively with their own style context. ``<a>`` tags
    with an ``href`` produce one ``LinkedWord`` per word of link text; without
    an ``href`` they are treated like any other inline tag. Any other tag is
    sent through ``process_element`` and the words of resulting paragraphs
    are collected.

    Comments, declarations, doctypes and processing instructions are skipped
    so they never show up as visible words.

    Args:
        element: BeautifulSoup Tag object
        context: Current style context

    Returns:
        List of Word objects (including LinkedWord for hyperlinks)
    """
    from bs4.element import Comment, Declaration, Doctype, ProcessingInstruction
    from pyWebLayout.abstract.inline import LinkedWord
    from pyWebLayout.abstract.functional import LinkType

    # NavigableString subclasses that carry markup rather than visible text.
    non_text_nodes = (Comment, Declaration, Doctype, ProcessingInstruction)
    inline_tags = {
        "span", "strong", "b", "em", "i", "u", "s", "del", "ins", "mark",
        "small", "sub", "sup", "code", "q", "cite", "abbr", "time",
    }

    words = []

    for child in element.children:
        if isinstance(child, NavigableString):
            if isinstance(child, non_text_nodes):
                continue
            # Plain text - split into words
            for word_text in str(child).split():
                words.append(Word(word_text, context.font, context.background))
            continue

        if not isinstance(child, Tag):
            continue

        tag_name = child.name.lower()
        child_context = apply_element_styling(context, child)

        if tag_name == "a":
            href = child.get('href', '')
            if not href:
                # <a> without href - treat as normal text
                words.extend(extract_text_content(child, child_context))
                continue

            # Determine link type based on href
            if href.startswith(('http://', 'https://')):
                link_type = LinkType.EXTERNAL
            elif href.startswith(('javascript:', 'api:')):
                link_type = LinkType.API
            else:
                # Fragment links and everything else count as internal.
                link_type = LinkType.INTERNAL

            title = child.get('title', '') or None

            # Take the text unstripped and split on whitespace: stripping each
            # nested string before joining would glue neighbouring words
            # together ("Hello <b>world</b>" -> "Helloworld").
            for word_text in child.get_text().split():
                words.append(
                    LinkedWord(
                        text=word_text,
                        style=child_context.font,
                        location=href,
                        link_type=link_type,
                        background=child_context.background,
                        title=title,
                    )
                )

        elif tag_name in inline_tags:
            words.extend(extract_text_content(child, child_context))

        else:
            # Block element - shouldn't happen in well-formed HTML but handle
            # gracefully
            child_result = process_element(child, child_context)
            if isinstance(child_result, list):
                paragraphs = [b for b in child_result if isinstance(b, Paragraph)]
            elif isinstance(child_result, Paragraph):
                paragraphs = [child_result]
            else:
                paragraphs = []
            for paragraph in paragraphs:
                for _, word in paragraph.words_iter():
                    words.append(word)

    return words
def process_element(
    element: Tag, context: StyleContext
) -> Union[Block, List[Block], None]:
    """
    Dispatch an HTML element to the handler registered for its tag.

    The tag name is lower-cased and looked up in ``HANDLERS``; tags without
    an entry fall back to ``generic_handler``.

    Args:
        element: BeautifulSoup Tag object
        context: Current style context

    Returns:
        Whatever the chosen handler returns: Block object(s), or None if the
        element should be ignored
    """
    lookup_key = element.name.lower()
    try:
        chosen = HANDLERS[lookup_key]
    except KeyError:
        chosen = generic_handler
    return chosen(element, context)
487# Handler function signatures:
488# All handlers receive (element: Tag, context: StyleContext) ->
489# Union[Block, List[Block], None]
def paragraph_handler(element: Tag, context: StyleContext) -> Union[Paragraph, List[Block], Image]:
    """
    Handle <p> elements.

    Special handling for paragraphs containing images:
    - If the paragraph contains only an image (common in EPUBs), return the image block
    - If the paragraph contains images mixed with text, return the text paragraph
      followed by one block per image
    - Otherwise, return a normal paragraph with text content

    Images are collected from the whole subtree, so an ``<img>`` wrapped in an
    inline element is kept in both the image-only and the mixed case.

    Args:
        element: BeautifulSoup Tag object
        context: Current style context

    Returns:
        A Paragraph, a single Image, or a list of blocks
    """
    # Check if paragraph contains any img tags (including nested ones)
    img_tags = element.find_all('img')

    if not img_tags:
        # No images - normal paragraph handling
        paragraph = Paragraph(context.font)
        for word in extract_text_content(element, context):
            paragraph.add_word(word)
        return paragraph

    # Build one block per image, nested or not, in document order.
    image_blocks = []
    for img_tag in img_tags:
        img_block = image_handler(img_tag, apply_element_styling(context, img_tag))
        if img_block:
            image_blocks.append(img_block)

    if not element.get_text(strip=True):
        # Image-only paragraph - return just the image(s)
        if len(image_blocks) == 1:
            return image_blocks[0]
        return image_blocks if image_blocks else Paragraph(context.font)

    # Mixed content - text comes before images
    blocks = []
    words = extract_text_content(element, context)
    if words:
        paragraph = Paragraph(context.font)
        for word in words:
            paragraph.add_word(word)
        blocks.append(paragraph)
    blocks.extend(image_blocks)

    return blocks if blocks else Paragraph(context.font)
def div_handler(element: Tag, context: StyleContext) -> List[Block]:
    """
    Handle <div> elements - treat as generic container.

    Child tags are processed with their own style context and their blocks
    collected in order. Text placed directly inside the container becomes a
    paragraph of its own instead of being dropped. Comments, declarations,
    doctypes and processing instructions are ignored.

    Args:
        element: BeautifulSoup Tag object
        context: Current style context

    Returns:
        List of blocks produced by the container's children
    """
    from bs4.element import Comment, Declaration, Doctype, ProcessingInstruction

    # NavigableString subclasses that carry markup rather than visible text.
    non_text_nodes = (Comment, Declaration, Doctype, ProcessingInstruction)

    blocks = []
    for child in element.children:
        if isinstance(child, Tag):
            result = process_element(child, apply_element_styling(context, child))
            if not result:
                continue
            if isinstance(result, list):
                blocks.extend(result)
            else:
                blocks.append(result)
        elif isinstance(child, NavigableString) and not isinstance(child, non_text_nodes):
            # Direct text in container - create paragraph
            word_texts = str(child).split()
            if word_texts:
                paragraph = Paragraph(context.font)
                for word_text in word_texts:
                    paragraph.add_word(
                        Word(word_text, context.font, context.background))
                blocks.append(paragraph)
    return blocks
def heading_handler(element: Tag, context: StyleContext) -> Heading:
    """
    Handle <h1>-<h6> elements.

    The heading level follows the tag name; a tag that is not one of
    h1-h6 is treated as level H1. The heading's words come from
    ``extract_text_content``.
    """
    levels_by_tag = {
        "h1": HeadingLevel.H1,
        "h2": HeadingLevel.H2,
        "h3": HeadingLevel.H3,
        "h4": HeadingLevel.H4,
        "h5": HeadingLevel.H5,
        "h6": HeadingLevel.H6,
    }
    tag = element.name.lower()
    level = levels_by_tag[tag] if tag in levels_by_tag else HeadingLevel.H1

    result = Heading(level, context.font)
    for item in extract_text_content(element, context):
        result.add_word(item)
    return result
def blockquote_handler(element: Tag, context: StyleContext) -> Quote:
    """
    Handle <blockquote> elements.

    Child tags are processed with their own style context and their blocks
    added to the quote. Text placed directly inside the blockquote becomes a
    paragraph of the quote instead of being dropped. Comments, declarations,
    doctypes and processing instructions are ignored.

    Args:
        element: BeautifulSoup Tag object
        context: Current style context

    Returns:
        Quote holding the blocks produced by the element's children
    """
    from bs4.element import Comment, Declaration, Doctype, ProcessingInstruction

    # NavigableString subclasses that carry markup rather than visible text.
    non_text_nodes = (Comment, Declaration, Doctype, ProcessingInstruction)

    quote = Quote(context.font)
    for child in element.children:
        if isinstance(child, Tag):
            result = process_element(child, apply_element_styling(context, child))
            if not result:
                continue
            for block in (result if isinstance(result, list) else [result]):
                quote.add_block(block)
        elif isinstance(child, NavigableString) and not isinstance(child, non_text_nodes):
            # Direct text in blockquote - create paragraph
            word_texts = str(child).split()
            if word_texts:
                paragraph = Paragraph(context.font)
                for word_text in word_texts:
                    paragraph.add_word(
                        Word(word_text, context.font, context.background))
                quote.add_block(paragraph)
    return quote
def preformatted_handler(element: Tag, context: StyleContext) -> CodeBlock:
    """
    Handle <pre> elements.

    The language is read from the ``data-language`` attribute (empty string
    when absent). The text of all descendants is concatenated exactly as
    written, so line breaks appear only where the source has a newline or a
    ``<br>`` tag - not at every inline tag boundary. Comments, declarations,
    doctypes and processing instructions are left out.

    Args:
        element: BeautifulSoup Tag object
        context: Current style context

    Returns:
        CodeBlock with one line per line of preformatted text
    """
    from bs4.element import Comment, Declaration, Doctype, ProcessingInstruction

    # NavigableString subclasses that carry markup rather than visible text.
    non_text_nodes = (Comment, Declaration, Doctype, ProcessingInstruction)

    language = context.element_attributes.get("data-language", "")
    code_block = CodeBlock(language)

    # Preserve whitespace and line breaks in preformatted text
    pieces = []
    for node in element.descendants:
        if isinstance(node, Tag):
            if node.name.lower() == "br":
                pieces.append("\n")
        elif isinstance(node, NavigableString) and not isinstance(node, non_text_nodes):
            pieces.append(str(node))

    for line in "".join(pieces).split("\n"):
        code_block.add_line(line)

    return code_block
def code_handler(element: Tag, context: StyleContext) -> Union[CodeBlock, None]:
    """
    Handle <code> elements.

    No block is ever produced here: a <code> directly inside <pre> is covered
    by the <pre> handler, and any other <code> is inline text that is picked
    up during text extraction.
    """
    parents = context.parent_elements
    inside_pre = bool(parents) and parents[-1] == "pre"
    if inside_pre:
        # Will be handled by parent
        return None
    # Inline code - handled during text extraction
    return None
def unordered_list_handler(element: Tag, context: StyleContext) -> HList:
    """
    Handle <ul> elements.

    Only direct <li> children contribute items; each is processed with its
    own style context and added when the result is truthy.
    """
    result_list = HList(ListStyle.UNORDERED, context.font)
    li_children = (
        node for node in element.children
        if isinstance(node, Tag) and node.name.lower() == "li"
    )
    for li in li_children:
        li_context = apply_element_styling(context, li)
        entry = process_element(li, li_context)
        if entry:
            result_list.add_item(entry)
    return result_list
def ordered_list_handler(element: Tag, context: StyleContext) -> HList:
    """
    Handle <ol> elements.

    Only direct <li> children contribute items; each is processed with its
    own style context and added when the result is truthy.
    """
    result_list = HList(ListStyle.ORDERED, context.font)
    li_children = (
        node for node in element.children
        if isinstance(node, Tag) and node.name.lower() == "li"
    )
    for li in li_children:
        li_context = apply_element_styling(context, li)
        entry = process_element(li, li_context)
        if entry:
            result_list.add_item(entry)
    return result_list
def list_item_handler(element: Tag, context: StyleContext) -> ListItem:
    """
    Handle <li> elements.

    Child tags are processed with their own style context and their blocks
    added to the item. Text placed directly inside the item becomes a
    paragraph whose words carry the context's font and background. Comments,
    declarations, doctypes and processing instructions are ignored.

    Args:
        element: BeautifulSoup Tag object
        context: Current style context

    Returns:
        ListItem holding the blocks produced by the element's children
    """
    from bs4.element import Comment, Declaration, Doctype, ProcessingInstruction

    # NavigableString subclasses that carry markup rather than visible text.
    non_text_nodes = (Comment, Declaration, Doctype, ProcessingInstruction)

    list_item = ListItem(None, context.font)

    for child in element.children:
        if isinstance(child, Tag):
            result = process_element(child, apply_element_styling(context, child))
            if not result:
                continue
            for block in (result if isinstance(result, list) else [result]):
                list_item.add_block(block)
        elif isinstance(child, NavigableString) and not isinstance(child, non_text_nodes):
            # Direct text in list item - create paragraph
            word_texts = str(child).split()
            if word_texts:
                paragraph = Paragraph(context.font)
                for word_text in word_texts:
                    paragraph.add_word(
                        Word(word_text, context.font, context.background))
                list_item.add_block(paragraph)

    return list_item
def table_handler(element: Tag, context: StyleContext) -> Table:
    """
    Handle <table> elements.

    The caption and the rows are taken only from this table's own structure:
    a direct <caption> child, direct <tr> children, and <tr> elements that
    are direct children of a <thead>, <tbody> or <tfoot>. Rows and captions
    belonging to a table nested inside a cell are left to that table.

    Args:
        element: BeautifulSoup Tag object
        context: Current style context

    Returns:
        Table with its caption text (or None) and rows
    """
    sections = {"thead": "header", "tbody": "body", "tfoot": "footer"}

    caption = None
    caption_elem = element.find("caption", recursive=False)
    if caption_elem is not None:
        caption = caption_elem.get_text(strip=True)

    table = Table(caption, context.font)

    # Process table rows
    for child in element.children:
        if not isinstance(child, Tag):
            continue
        child_name = child.name.lower()

        if child_name == "tr":
            row = process_element(child, apply_element_styling(context, child))
            if row:
                table.add_row(row)
        elif child_name in sections:
            section = sections[child_name]
            # recursive=False keeps rows of nested tables out of this one.
            for row_elem in child.find_all("tr", recursive=False):
                row = process_element(
                    row_elem, apply_element_styling(context, row_elem))
                if row:
                    table.add_row(row, section)

    return table
def table_row_handler(element: Tag, context: StyleContext) -> TableRow:
    """
    Handle <tr> elements.

    Only direct <td> and <th> children contribute cells; each is processed
    with its own style context and added when the result is truthy.
    """
    table_row = TableRow(context.font)
    cell_tags = (
        node for node in element.children
        if isinstance(node, Tag) and node.name.lower() in ["td", "th"]
    )
    for cell_tag in cell_tags:
        cell_context = apply_element_styling(context, cell_tag)
        built = process_element(cell_tag, cell_context)
        if built:
            table_row.add_cell(built)
    return table_row
def _parse_cell_span(raw_value: Any) -> int:
    """Return ``raw_value`` as an int, or 1 when it is not a valid integer."""
    try:
        return int(raw_value)
    except (TypeError, ValueError):
        return 1


def _build_table_cell(element: Tag, context: StyleContext, is_header: bool) -> TableCell:
    """Build a TableCell from a <td>/<th> element and fill it with its content."""
    from bs4.element import Comment, Declaration, Doctype, ProcessingInstruction

    # NavigableString subclasses that carry markup rather than visible text.
    non_text_nodes = (Comment, Declaration, Doctype, ProcessingInstruction)

    attributes = context.element_attributes
    colspan = _parse_cell_span(attributes.get("colspan", 1))
    rowspan = _parse_cell_span(attributes.get("rowspan", 1))
    cell = TableCell(is_header, colspan, rowspan, context.font)

    # Process cell content
    for child in element.children:
        if isinstance(child, Tag):
            result = process_element(child, apply_element_styling(context, child))
            if not result:
                continue
            for block in (result if isinstance(result, list) else [result]):
                cell.add_block(block)
        elif isinstance(child, NavigableString) and not isinstance(child, non_text_nodes):
            # Direct text in cell - create paragraph
            word_texts = str(child).split()
            if word_texts:
                paragraph = Paragraph(context.font)
                for word_text in word_texts:
                    paragraph.add_word(
                        Word(word_text, context.font, context.background))
                cell.add_block(paragraph)

    return cell


def table_cell_handler(element: Tag, context: StyleContext) -> TableCell:
    """
    Handle <td> elements.

    ``colspan`` and ``rowspan`` come from the context's element attributes;
    a missing or non-integer value counts as 1. Child tags are processed
    with their own style context, and text placed directly inside the cell
    becomes a paragraph. Comments, declarations, doctypes and processing
    instructions are ignored.

    Args:
        element: BeautifulSoup Tag object
        context: Current style context

    Returns:
        Non-header TableCell holding the cell's blocks
    """
    return _build_table_cell(element, context, False)


def table_header_cell_handler(element: Tag, context: StyleContext) -> TableCell:
    """
    Handle <th> elements.

    Works exactly like the <td> handling, except that the resulting cell is
    marked as a header cell.

    Args:
        element: BeautifulSoup Tag object
        context: Current style context

    Returns:
        Header TableCell holding the cell's blocks
    """
    return _build_table_cell(element, context, True)
def horizontal_rule_handler(element: Tag, context: StyleContext) -> HorizontalRule:
    """Handle <hr> elements by producing a fresh HorizontalRule block."""
    # Neither the element nor the context influences the rule.
    rule = HorizontalRule()
    return rule
def line_break_handler(element: Tag, context: StyleContext) -> None:
    """
    Handle <br> elements.

    No block is produced: line breaks are typically handled at the
    paragraph level.
    """
    return
def _has_url_scheme(url: str) -> bool:
    """Return True when ``url`` starts with a URL scheme such as ``http:`` or ``data:``."""
    scheme, colon, _ = url.partition(":")
    # Require at least two characters so a Windows drive letter ("C:") is
    # not mistaken for a scheme.
    return (
        bool(colon)
        and len(scheme) >= 2
        and scheme.isascii()
        and scheme[0].isalpha()
        and all(ch.isalnum() or ch in "+-." for ch in scheme)
    )


def image_handler(element: Tag, context: StyleContext) -> Image:
    """
    Handle <img> elements.

    ``src``, ``alt``, ``width`` and ``height`` are read from the context's
    element attributes. A relative ``src`` is URL-decoded and resolved
    against ``context.base_path`` when one is set. Sources that start with
    ``/`` or with a URL scheme (``http:``, ``https:``, ``data:``, ...) are
    passed through unchanged. ``width`` and ``height`` are parsed
    independently; a value that is not an integer becomes None without
    affecting the other.

    Args:
        element: BeautifulSoup Tag object
        context: Current style context

    Returns:
        Image block for the element
    """
    import os
    import urllib.parse

    attributes = context.element_attributes
    src = attributes.get("src", "")
    alt_text = attributes.get("alt", "")

    # Resolve relative paths if base_path is provided
    is_relative = bool(src) and not src.startswith('/') and not _has_url_scheme(src)
    if context.base_path and is_relative:
        # Parse the src to handle URL-encoded characters
        src_decoded = urllib.parse.unquote(src)
        # Resolve relative path to absolute path
        src = os.path.normpath(os.path.join(context.base_path, src_decoded))

    def parse_dimension(name: str) -> Optional[int]:
        """Return the named attribute as an int, or None if absent/invalid."""
        if name not in attributes:
            return None
        try:
            return int(attributes[name])
        except (TypeError, ValueError):
            return None

    # Parse dimensions if provided
    width = parse_dimension("width")
    height = parse_dimension("height")

    return Image(source=src, alt_text=alt_text, width=width, height=height)
def ignore_handler(element: Tag, context: StyleContext) -> None:
    """Handle elements that should be ignored: no block is produced."""
    return
def generic_handler(element: Tag, context: StyleContext) -> List[Block]:
    """Handle unknown elements by delegating to the <div> container handler."""
    container_blocks = div_handler(element, context)
    return container_blocks
835# Handler registry - maps HTML tag names to handler functions
HANDLERS: Dict[str, Callable[[Tag, StyleContext], Union[Block, List[Block], None]]] = {
    # Block elements
    "p": paragraph_handler,
    "div": div_handler,
    **dict.fromkeys(("h1", "h2", "h3", "h4", "h5", "h6"), heading_handler),
    "blockquote": blockquote_handler,
    "pre": preformatted_handler,
    "code": code_handler,
    "ul": unordered_list_handler,
    "ol": ordered_list_handler,
    "li": list_item_handler,
    "table": table_handler,
    "tr": table_row_handler,
    "td": table_cell_handler,
    "th": table_header_cell_handler,
    "hr": horizontal_rule_handler,
    "br": line_break_handler,
    # Semantic elements (treated as containers)
    **dict.fromkeys(
        ("section", "article", "aside", "nav", "header", "footer", "main", "figure"),
        div_handler,
    ),
    "figcaption": paragraph_handler,
    # Media elements
    "img": image_handler,
    # Inline elements (handled during text extraction)
    **dict.fromkeys(
        (
            "span", "a", "strong", "b", "em", "i", "u", "s", "del", "ins",
            "mark", "small", "sub", "sup", "q", "cite", "abbr", "time",
        ),
        ignore_handler,
    ),
    # Ignored elements
    **dict.fromkeys(
        ("script", "style", "meta", "link", "head", "title"),
        ignore_handler,
    ),
}
def parse_html_string(
    html_string: str, base_font: Optional[Font] = None, document=None, base_path: Optional[str] = None
) -> List[Block]:
    """
    Turn an HTML string into a flat list of Block objects.

    The markup is parsed with BeautifulSoup's "html.parser". When a <body>
    is present its child tags are processed; otherwise the top-level tags of
    the parsed document are. Each tag gets its own style context derived
    from the base context, and falsy handler results are skipped.

    Args:
        html_string: HTML content to parse
        base_font: Base font for styling, defaults to system default
        document: Document instance for font registry to avoid duplicate fonts
        base_path: Base directory path for resolving relative URLs (e.g., image sources)

    Returns:
        List of Block objects representing the document structure
    """
    soup = BeautifulSoup(html_string, "html.parser")
    root_context = create_base_context(base_font, document, base_path)

    # Process the body if it exists, otherwise process all top-level elements
    root = soup.find("body") or soup

    collected = []
    for node in root.children:
        if not isinstance(node, Tag):
            continue
        produced = process_element(node, apply_element_styling(root_context, node))
        if not produced:
            continue
        if isinstance(produced, list):
            collected.extend(produced)
        else:
            collected.append(produced)

    return collected