Coverage for pyWebLayout/io/readers/html_extraction.py: 88%

424 statements  

« prev     ^ index     » next       coverage.py v7.11.2, created at 2025-11-12 12:02 +0000

1""" 

2HTML extraction module for converting HTML elements to pyWebLayout abstract elements. 

3 

4This module provides handler functions for converting HTML elements into the abstract document structure 

5used by pyWebLayout, including paragraphs, headings, lists, tables, and inline formatting. 

6Each handler function has a robust signature that handles style hints, CSS classes, and attributes. 

7""" 

8 

9from typing import List, Dict, Any, Optional, Union, Callable, Tuple, NamedTuple 

10from bs4 import BeautifulSoup, Tag, NavigableString 

11from pyWebLayout.abstract.inline import Word 

12from pyWebLayout.abstract.block import ( 

13 Block, 

14 Paragraph, 

15 Heading, 

16 HeadingLevel, 

17 Quote, 

18 CodeBlock, 

19 HList, 

20 ListItem, 

21 ListStyle, 

22 Table, 

23 TableRow, 

24 TableCell, 

25 HorizontalRule, 

26 Image, 

27) 

28from pyWebLayout.style import Font, FontWeight, FontStyle, TextDecoration 

29 

30 

class StyleContext(NamedTuple):
    """
    Read-only bundle of styling state handed to every handler function.

    Instances are never modified in place: each helper method returns a copy
    with exactly one field replaced.
    """

    # Font in effect for text produced under this context
    font: Font
    # Background colour as an RGBA tuple, or None when no background is set
    background: Optional[Tuple[int, int, int, int]]
    # CSS class names collected so far
    css_classes: set
    # CSS property -> value mapping collected so far
    css_styles: Dict[str, str]
    # Attributes of the element this context was built for
    element_attributes: Dict[str, Any]
    # Stack of element names leading to the current element
    parent_elements: List[str]
    # Document reference used for the font registry (may be None)
    document: Optional[Any]
    # Base path for resolving relative URLs
    base_path: Optional[str] = None

    def with_font(self, font: Font) -> "StyleContext":
        """Return a copy of this context using ``font``."""
        replaced = self._replace(font=font)
        return replaced

    def with_background(
        self, background: Optional[Tuple[int, int, int, int]]
    ) -> "StyleContext":
        """Return a copy of this context using ``background``."""
        replaced = self._replace(background=background)
        return replaced

    def with_css_classes(self, css_classes: set) -> "StyleContext":
        """Return a copy of this context using ``css_classes``."""
        replaced = self._replace(css_classes=css_classes)
        return replaced

    def with_css_styles(self, css_styles: Dict[str, str]) -> "StyleContext":
        """Return a copy of this context using ``css_styles``."""
        replaced = self._replace(css_styles=css_styles)
        return replaced

    def with_attributes(self, attributes: Dict[str, Any]) -> "StyleContext":
        """Return a copy of this context whose element attributes are ``attributes``."""
        replaced = self._replace(element_attributes=attributes)
        return replaced

    def push_element(self, element_name: str) -> "StyleContext":
        """Return a copy of this context with ``element_name`` appended to the stack."""
        # Concatenation builds a new list, so the original stack is untouched.
        extended_stack = self.parent_elements + [element_name]
        return self._replace(parent_elements=extended_stack)

71 

72 

def create_base_context(
        base_font: Optional[Font] = None,
        document=None,
        base_path: Optional[str] = None) -> StyleContext:
    """
    Build the root style context that parsing starts from.

    Args:
        base_font: Font to start with. When omitted, the font comes from
            ``document.get_or_create_font()`` if the document offers that
            method, otherwise from ``Font()``.
        document: Document instance for font registry
        base_path: Base directory path for resolving relative URLs

    Returns:
        StyleContext with no background, no CSS classes or styles, no
        element attributes and an empty parent stack
    """
    font = base_font
    if font is None:
        has_registry = document and hasattr(document, 'get_or_create_font')
        font = document.get_or_create_font() if has_registry else Font()

    return StyleContext(
        font=font,
        background=None,
        css_classes=set(),
        css_styles={},
        element_attributes={},
        parent_elements=[],
        document=document,
        base_path=base_path,
    )

105 

106 

def apply_element_styling(context: StyleContext, element: Tag) -> StyleContext:
    """
    Derive the style context for ``element`` from its parent's context.

    Args:
        context: Current style context
        element: BeautifulSoup Tag object

    Returns:
        New StyleContext carrying the element's attributes, the element name
        pushed on the parent stack, merged CSS classes and inline styles, and
        the resulting font and background
    """
    tag_name = element.name.lower()
    attributes = dict(element.attrs) if element.attrs else {}

    # Classes accumulate: start from the inherited set and add this element's.
    merged_classes = context.css_classes.copy()
    if "class" in attributes:
        raw_classes = attributes["class"]
        # The attribute may be a whitespace-separated string or a sequence.
        merged_classes.update(
            raw_classes.split() if isinstance(raw_classes, str) else raw_classes
        )

    # Inline styles accumulate the same way; this element's declarations win.
    merged_styles = context.css_styles.copy()
    if "style" in attributes:
        merged_styles.update(parse_inline_styles(attributes["style"]))

    styled = (
        context.with_attributes(attributes)
        .push_element(tag_name)
        .with_css_classes(merged_classes)
        .with_css_styles(merged_styles)
    )

    # NOTE(review): merged_styles still contains declarations inherited from
    # ancestors, and they are applied after the tag defaults - confirm that an
    # ancestor's font-weight/font-size is meant to override e.g. <b> or <h1>.
    styled = styled.with_font(
        apply_element_font_styles(styled.font, tag_name, merged_styles, styled)
    )
    return styled.with_background(
        apply_background_styles(styled.background, merged_styles)
    )

152 

153 

def parse_inline_styles(style_text: str) -> Dict[str, str]:
    """
    Parse CSS inline styles into dictionary.

    Declarations are separated by ``;`` and split on the first ``:`` only, so
    values that contain colons (for example URLs) stay intact. Property names
    are lower-cased; values keep their case. Declarations without a colon or
    with an empty property name are skipped.

    Args:
        style_text: CSS style text (e.g., "color: red; font-weight: bold;")

    Returns:
        Dictionary of CSS property-value pairs (empty for empty/None input)
    """
    styles: Dict[str, str] = {}
    if not style_text:
        return styles

    for declaration in style_text.split(";"):
        prop, colon, value = declaration.partition(":")
        prop = prop.strip().lower()
        # Skip fragments such as "" (trailing semicolon) or ": red".
        if not colon or not prop:
            continue
        styles[prop] = value.strip()
    return styles

170 

171 

# Font adjustments implied by an HTML tag name; CSS declarations override these.
_ELEMENT_FONT_STYLES = {
    "b": {"weight": FontWeight.BOLD},
    "strong": {"weight": FontWeight.BOLD},
    "i": {"style": FontStyle.ITALIC},
    "em": {"style": FontStyle.ITALIC},
    "u": {"decoration": TextDecoration.UNDERLINE},
    "s": {"decoration": TextDecoration.STRIKETHROUGH},
    "del": {"decoration": TextDecoration.STRIKETHROUGH},
    "h1": {"size": 24, "weight": FontWeight.BOLD},
    "h2": {"size": 20, "weight": FontWeight.BOLD},
    "h3": {"size": 18, "weight": FontWeight.BOLD},
    "h4": {"size": 16, "weight": FontWeight.BOLD},
    "h5": {"size": 14, "weight": FontWeight.BOLD},
    "h6": {"size": 12, "weight": FontWeight.BOLD},
}

# Colour keywords recognised by the simplified CSS colour parser.
_NAMED_COLOURS = {
    "black": (0, 0, 0),
    "white": (255, 255, 255),
    "red": (255, 0, 0),
    "green": (0, 255, 0),
    "blue": (0, 0, 255),
}


def _parse_css_colour(value: str) -> Optional[Tuple[int, int, int]]:
    """Return an (r, g, b) tuple for a known colour keyword or hex colour, else None."""
    value = value.strip().lower()
    if value in _NAMED_COLOURS:
        return _NAMED_COLOURS[value]
    if not value.startswith("#"):
        return None

    digits = value[1:]
    if len(digits) == 3:
        # CSS shorthand: "#abc" is equivalent to "#aabbcc".
        digits = "".join(ch * 2 for ch in digits)
    if len(digits) != 6:
        return None
    try:
        return tuple(int(digits[i:i + 2], 16) for i in (0, 2, 4))
    except ValueError:
        return None


def apply_element_font_styles(font: Font,
                              tag_name: str,
                              css_styles: Dict[str, str],
                              context: Optional[StyleContext] = None) -> Font:
    """
    Apply font styling based on HTML element and CSS styles.

    The current font's properties are the starting point. Tag defaults
    (bold for <b>, sizes for headings, ...) are applied first, then the CSS
    declarations ``font-size``, ``font-weight``, ``font-style``,
    ``text-decoration`` and ``color`` override them. Unrecognised CSS values
    leave the corresponding property unchanged.

    Args:
        font: Current font
        tag_name: HTML tag name
        css_styles: CSS styles dictionary
        context: Style context with document reference for font registry

    Returns:
        Font obtained from the document's style registry when the document
        offers ``get_or_create_style``, from ``get_or_create_font`` when it
        only offers that, and otherwise a newly constructed Font
    """
    defaults = _ELEMENT_FONT_STYLES.get(tag_name, {})

    # Start with the current font, overlaid with the tag defaults.
    font_size = defaults.get("size", font.font_size)
    colour = font.colour
    weight = defaults.get("weight", font.weight)
    style = defaults.get("style", font.style)
    decoration = defaults.get("decoration", font.decoration)
    background = font.background
    language = font.language
    font_path = font._font_path

    # Apply CSS styles (override element defaults)
    if "font-size" in css_styles:
        # Simplified parsing: px and pt are both taken as the numeric size.
        size_value = css_styles["font-size"].lower()
        if size_value.endswith(("px", "pt")):
            try:
                font_size = int(float(size_value[:-2]))
            except (ValueError, OverflowError):
                # Not a finite number - keep the size chosen so far.
                pass

    if "font-weight" in css_styles:
        weight_value = css_styles["font-weight"].lower()
        if weight_value in ("bold", "700", "800", "900"):
            weight = FontWeight.BOLD
        elif weight_value in ("normal", "400"):
            weight = FontWeight.NORMAL

    if "font-style" in css_styles:
        style_value = css_styles["font-style"].lower()
        if style_value == "italic":
            style = FontStyle.ITALIC
        elif style_value == "normal":
            style = FontStyle.NORMAL

    if "text-decoration" in css_styles:
        decoration_value = css_styles["text-decoration"].lower()
        if "underline" in decoration_value:
            decoration = TextDecoration.UNDERLINE
        elif "line-through" in decoration_value:
            decoration = TextDecoration.STRIKETHROUGH
        elif "none" in decoration_value:
            decoration = TextDecoration.NONE

    if "color" in css_styles:
        parsed_colour = _parse_css_colour(css_styles["color"])
        if parsed_colour is not None:
            colour = parsed_colour

    document = context.document if context else None

    # Use document's style registry if available to avoid creating duplicate styles
    if document and hasattr(document, 'get_or_create_style'):
        from pyWebLayout.style.abstract_style import FontFamily, FontSize

        # Default family - font_path is not inspected to pick one.
        font_family = FontFamily.SERIF
        if font_size and isinstance(font_size, int):
            font_size_value = font_size
        else:
            font_size_value = FontSize.MEDIUM

        # Create abstract style and register it
        style_id, abstract_style = document.get_or_create_style(
            font_family=font_family,
            font_size=font_size_value,
            font_weight=weight,
            font_style=style,
            text_decoration=decoration,
            color=colour,
            language=language
        )

        # Get the concrete font for this style
        return document.get_font_for_style(abstract_style)

    if document and hasattr(document, 'get_or_create_font'):
        # Fallback to old font registry system
        return document.get_or_create_font(
            font_path=font_path,
            font_size=font_size,
            colour=colour,
            weight=weight,
            style=style,
            decoration=decoration,
            background=background,
            language=language,
            min_hyphenation_width=font.min_hyphenation_width
        )

    # Fallback to creating new font if no document context
    return Font(
        font_path=font_path,
        font_size=font_size,
        colour=colour,
        weight=weight,
        style=style,
        decoration=decoration,
        background=background,
        language=language,
    )

340 

341 

def apply_background_styles(
    current_background: Optional[Tuple[int, int, int, int]], css_styles: Dict[str, str]
) -> Optional[Tuple[int, int, int, int]]:
    """
    Work out the background after applying CSS ``background-color``.

    Only the value ``transparent`` (case-insensitive) is acted on; it clears
    the background. Any other value, or no declaration at all, leaves the
    current background as it is - colour values are not parsed here.

    Args:
        current_background: Current background color (RGBA)
        css_styles: CSS styles dictionary

    Returns:
        None when the background is declared transparent, otherwise
        ``current_background`` unchanged
    """
    if "background-color" not in css_styles:
        return current_background

    is_transparent = css_styles["background-color"].lower() == "transparent"
    return None if is_transparent else current_background

362 

363 

# Inline tags whose text is extracted recursively with the tag's styling applied.
_INLINE_TEXT_TAGS = frozenset({
    "span", "strong", "b", "em", "i", "u", "s", "del", "ins", "mark",
    "small", "sub", "sup", "code", "q", "cite", "abbr", "time",
})


def extract_text_content(element: Tag, context: StyleContext) -> List[Word]:
    """
    Extract text content from an element, handling inline formatting and links.

    Direct text is split on whitespace into Word objects. ``<a>`` children
    with an ``href`` produce one LinkedWord per word; other inline children
    are processed recursively with their own styling. Any other child tag is
    passed to ``process_element`` and only the words of resulting Paragraph
    blocks are kept. HTML comments are skipped.

    Args:
        element: BeautifulSoup Tag object
        context: Current style context

    Returns:
        List of Word objects (including LinkedWord for hyperlinks)
    """
    from bs4 import Comment
    from pyWebLayout.abstract.inline import LinkedWord
    from pyWebLayout.abstract.functional import LinkType

    words = []

    for child in element.children:
        if isinstance(child, NavigableString):
            # Comment is a NavigableString subclass; its text is not content.
            if isinstance(child, Comment):
                continue
            words.extend(
                Word(word_text, context.font, context.background)
                for word_text in str(child).split()
            )
            continue

        if not isinstance(child, Tag):
            continue

        tag_name = child.name.lower()
        child_context = apply_element_styling(context, child)
        href = child.get('href', '') if tag_name == "a" else ''

        if tag_name == "a" and href:
            # Determine link type based on href
            if href.startswith(('http://', 'https://')):
                link_type = LinkType.EXTERNAL
            elif href.startswith(('javascript:', 'api:')):
                link_type = LinkType.API
            else:
                # "#fragment" and every other form are treated as internal.
                link_type = LinkType.INTERNAL

            title = child.get('title', '') or None

            # get_text() without strip keeps the whitespace between nested
            # nodes, so "Hello <b>world</b>" yields two words rather than
            # the glued "Helloworld" that get_text(strip=True) produces.
            for word_text in child.get_text().split():
                words.append(LinkedWord(
                    text=word_text,
                    style=child_context.font,
                    location=href,
                    link_type=link_type,
                    background=child_context.background,
                    title=title
                ))
        elif tag_name == "a" or tag_name in _INLINE_TEXT_TAGS:
            # <a> without href is treated like any other inline element.
            words.extend(extract_text_content(child, child_context))
        else:
            # Block element - shouldn't happen in well-formed HTML but handle
            # gracefully by flattening the words of any resulting paragraphs.
            child_result = process_element(child, child_context)
            if isinstance(child_result, Paragraph):
                child_result = [child_result]
            if isinstance(child_result, list):
                for block in child_result:
                    if isinstance(block, Paragraph):
                        words.extend(word for _, word in block.words_iter())

    return words

467 

468 

def process_element(
    element: Tag, context: StyleContext
) -> Union[Block, List[Block], None]:
    """
    Dispatch one HTML element to the handler registered for its tag.

    The lower-cased tag name is looked up in ``HANDLERS``; tags without an
    entry fall back to ``generic_handler``.

    Args:
        element: BeautifulSoup Tag object
        context: Current style context

    Returns:
        Whatever the chosen handler returns: Block object(s) or None if
        element should be ignored
    """
    chosen_handler = HANDLERS.get(element.name.lower(), generic_handler)
    return chosen_handler(element, context)

485 

486 

487# Handler function signatures: 

488# All handlers receive (element: Tag, context: StyleContext) -> 

489# Union[Block, List[Block], None] 

490 

491 

def paragraph_handler(element: Tag, context: StyleContext) -> Union[Paragraph, List[Block], Image]:
    """
    Handle <p> elements.

    Special handling for paragraphs containing images:
    - If the paragraph contains only image(s) (common in EPUBs), return the
      single image block, or a list of image blocks when there are several
    - If the paragraph contains images mixed with text, return a list with the
      text paragraph first, followed by every image in document order
      (including images nested inside inline elements)
    - Otherwise, return a normal paragraph with text content
    """
    # Includes nested <img> tags, in document order.
    img_tags = element.find_all('img')

    if not img_tags:
        # No images - normal paragraph handling
        paragraph = Paragraph(context.font)
        for word in extract_text_content(element, context):
            paragraph.add_word(word)
        return paragraph

    image_blocks = []
    for img_tag in img_tags:
        img_block = image_handler(img_tag, apply_element_styling(context, img_tag))
        if img_block:
            image_blocks.append(img_block)

    if not element.get_text(strip=True):
        # Image-only paragraph - return just the image(s)
        if len(image_blocks) == 1:
            return image_blocks[0]
        return image_blocks if image_blocks else Paragraph(context.font)

    # Mixed content: text is collected into one paragraph placed before the
    # images (extract_text_content ignores the Image blocks it encounters).
    blocks = image_blocks
    words = extract_text_content(element, context)
    if words:
        paragraph = Paragraph(context.font)
        for word in words:
            paragraph.add_word(word)
        blocks = [paragraph] + image_blocks

    return blocks if blocks else Paragraph(context.font)

556 

557 

def div_handler(element: Tag, context: StyleContext) -> List[Block]:
    """
    Handle <div> elements - treat as generic container.

    Each child tag is styled and dispatched through ``process_element``; the
    resulting blocks are returned as one flat list. Child nodes that are not
    tags (direct text) are not processed here.
    """
    collected = []
    for node in element.children:
        if not isinstance(node, Tag):
            continue
        produced = process_element(node, apply_element_styling(context, node))
        if not produced:
            continue
        if isinstance(produced, list):
            collected.extend(produced)
        else:
            collected.append(produced)
    return collected

571 

572 

def heading_handler(element: Tag, context: StyleContext) -> Heading:
    """
    Handle <h1>-<h6> elements.

    The heading level follows the tag name; any other tag name falls back to
    level H1. The heading's words come from ``extract_text_content``.
    """
    levels_by_tag = {
        f"h{number}": getattr(HeadingLevel, f"H{number}")
        for number in range(1, 7)
    }
    level = levels_by_tag.get(element.name.lower(), HeadingLevel.H1)

    heading = Heading(level, context.font)
    for word in extract_text_content(element, context):
        heading.add_word(word)
    return heading

590 

591 

def blockquote_handler(element: Tag, context: StyleContext) -> Quote:
    """
    Handle <blockquote> elements.

    Every child tag is styled and dispatched through ``process_element``;
    each block it yields is added to the quote. Child nodes that are not
    tags are not processed here.
    """
    quote = Quote(context.font)
    for node in element.children:
        if not isinstance(node, Tag):
            continue
        produced = process_element(node, apply_element_styling(context, node))
        if not produced:
            continue
        # Handlers return either one block or a list of blocks.
        for block in (produced if isinstance(produced, list) else [produced]):
            quote.add_block(block)
    return quote

606 

607 

def preformatted_handler(element: Tag, context: StyleContext) -> CodeBlock:
    """
    Handle <pre> elements.

    The code block's language is read from the element's ``data-language``
    attribute (empty string when absent). Text is gathered from all
    descendant text nodes exactly as written, ``<br>`` tags become line
    breaks, and the result is split on newlines into code lines. HTML
    comments are skipped.
    """
    from bs4 import Comment

    language = context.element_attributes.get("data-language", "")
    code_block = CodeBlock(language)

    # Concatenate text nodes without a separator: joining them with "\n"
    # would break a line at every nested inline tag (e.g. highlight spans).
    pieces = []
    for node in element.descendants:
        if isinstance(node, Tag):
            if node.name.lower() == "br":
                pieces.append("\n")
        elif isinstance(node, NavigableString) and not isinstance(node, Comment):
            pieces.append(str(node))

    for line in "".join(pieces).split("\n"):
        code_block.add_line(line)

    return code_block

619 

620 

def code_handler(element: Tag, context: StyleContext) -> Union[CodeBlock, None]:
    """
    Handle <code> elements.

    Always returns None: a <code> element never produces a block of its own.
    Inside <pre> its text is collected by the <pre> handler, and inline code
    is picked up during text extraction.
    """
    return None

629 

630 

def _build_list(element: Tag, context: StyleContext, list_style) -> HList:
    """Build an HList of ``list_style`` from the direct <li> children of ``element``."""
    hlist = HList(list_style, context.font)
    for child in element.children:
        # Only direct <li> children become items; other nodes are skipped.
        if not (isinstance(child, Tag) and child.name.lower() == "li"):
            continue
        item = process_element(child, apply_element_styling(context, child))
        if item:
            hlist.add_item(item)
    return hlist


def unordered_list_handler(element: Tag, context: StyleContext) -> HList:
    """Handle <ul> elements by collecting their <li> children into an unordered list."""
    return _build_list(element, context, ListStyle.UNORDERED)


def ordered_list_handler(element: Tag, context: StyleContext) -> HList:
    """Handle <ol> elements by collecting their <li> children into an ordered list."""
    return _build_list(element, context, ListStyle.ORDERED)

653 

654 

def list_item_handler(element: Tag, context: StyleContext) -> ListItem:
    """
    Handle <li> elements.

    Child tags are dispatched through ``process_element`` and their blocks
    added to the item. Each non-blank direct text node becomes its own
    paragraph of whitespace-separated words.
    """
    item = ListItem(None, context.font)

    for node in element.children:
        if isinstance(node, Tag):
            # NOTE(review): inline tags registered with ignore_handler yield
            # None here, so their text is not added to the item - confirm
            # whether that is intended.
            produced = process_element(node, apply_element_styling(context, node))
            if produced:
                for block in (produced if isinstance(produced, list) else [produced]):
                    item.add_block(block)
        elif isinstance(node, NavigableString):
            # Direct text in list item - create paragraph
            tokens = str(node).split()
            if tokens:
                paragraph = Paragraph(context.font)
                for token in tokens:
                    paragraph.add_word(Word(token, context.font))
                item.add_block(paragraph)

    return item

681 

682 

def table_handler(element: Tag, context: StyleContext) -> Table:
    """
    Handle <table> elements.

    The caption is taken from a direct <caption> child. Direct <tr> children
    are added without a section; rows that are direct children of <thead>,
    <tbody> or <tfoot> are added with section "header", "body" or "footer".
    Rows belonging to tables nested inside cells are left to the cell
    handlers and are not added to this table.
    """
    sections = {"thead": "header", "tbody": "body", "tfoot": "footer"}

    # recursive=False: a nested table's caption must not be picked up here.
    caption_elem = element.find("caption", recursive=False)
    caption = caption_elem.get_text(strip=True) if caption_elem else None

    table = Table(caption, context.font)

    # Process table rows
    for child in element.children:
        if not isinstance(child, Tag):
            continue
        child_name = child.name.lower()

        if child_name == "tr":
            row = process_element(child, apply_element_styling(context, child))
            if row:
                table.add_row(row)
        elif child_name in sections:
            section = sections[child_name]
            # recursive=False: rows of nested tables stay with their own table.
            for row_elem in child.find_all("tr", recursive=False):
                row = process_element(
                    row_elem, apply_element_styling(context, row_elem))
                if row:
                    table.add_row(row, section)

    return table

711 

712 

def table_row_handler(element: Tag, context: StyleContext) -> TableRow:
    """
    Handle <tr> elements.

    Direct <td> and <th> children are dispatched through ``process_element``
    and each resulting cell is added to the row; other children are skipped.
    """
    row = TableRow(context.font)
    cell_tags = (
        node for node in element.children
        if isinstance(node, Tag) and node.name.lower() in ["td", "th"]
    )
    for cell_tag in cell_tags:
        cell = process_element(cell_tag, apply_element_styling(context, cell_tag))
        if cell:
            row.add_cell(cell)
    return row

723 

724 

def _parse_span(raw_value, default: int = 1) -> int:
    """Convert a colspan/rowspan attribute to int, using ``default`` when it is malformed."""
    try:
        return int(raw_value)
    except (TypeError, ValueError):
        # Malformed markup (e.g. colspan="") must not abort the whole parse.
        return default


def _build_table_cell(element: Tag, context: StyleContext, is_header: bool) -> TableCell:
    """Build a TableCell from a <td>/<th> element; ``is_header`` marks header cells."""
    colspan = _parse_span(context.element_attributes.get("colspan", 1))
    rowspan = _parse_span(context.element_attributes.get("rowspan", 1))
    cell = TableCell(is_header, colspan, rowspan, context.font)

    # Process cell content
    for child in element.children:
        if isinstance(child, Tag):
            result = process_element(child, apply_element_styling(context, child))
            if result:
                # Handlers return either one block or a list of blocks.
                for block in (result if isinstance(result, list) else [result]):
                    cell.add_block(block)
        elif isinstance(child, NavigableString):
            # Direct text in cell - create one paragraph per text node
            word_texts = str(child).split()
            if word_texts:
                paragraph = Paragraph(context.font)
                for word_text in word_texts:
                    paragraph.add_word(Word(word_text, context.font))
                cell.add_block(paragraph)

    return cell


def table_cell_handler(element: Tag, context: StyleContext) -> TableCell:
    """
    Handle <td> elements.

    Produces a non-header TableCell. ``colspan``/``rowspan`` are read from
    the element attributes and default to 1 when absent or not an integer.
    """
    return _build_table_cell(element, context, False)


def table_header_cell_handler(element: Tag, context: StyleContext) -> TableCell:
    """
    Handle <th> elements.

    Same as the <td> handling, except that the cell is marked as a header.
    """
    return _build_table_cell(element, context, True)

784 

785 

def horizontal_rule_handler(element: Tag, context: StyleContext) -> HorizontalRule:
    """Handle <hr> elements by returning a new HorizontalRule; both arguments are unused."""
    rule = HorizontalRule()
    return rule

789 

790 

def line_break_handler(element: Tag, context: StyleContext) -> None:
    """
    Handle <br> elements.

    Produces no block: line breaks are typically handled at the paragraph
    level, so this handler always returns None.
    """
    return None

795 

796 

def image_handler(element: Tag, context: StyleContext) -> Image:
    """
    Handle <img> elements.

    ``src``, ``alt``, ``width`` and ``height`` are read from the context's
    element attributes. When the context has a base path, a relative ``src``
    is URL-decoded and resolved against it; sources starting with
    ``http://``, ``https://``, ``/``, ``data:`` or ``file:`` are left as
    they are. Width and height are parsed independently, and a value that is
    missing or not an integer becomes None.
    """
    import os
    import urllib.parse

    attributes = context.element_attributes
    src = attributes.get("src", "")
    alt_text = attributes.get("alt", "")

    # Sources that are already absolute, or not file paths at all (inline
    # data URIs), must not be joined onto the base path.
    non_relative_prefixes = ('http://', 'https://', '/', 'data:', 'file:')
    if context.base_path and src and not src.startswith(non_relative_prefixes):
        # Decode URL-encoded characters before treating src as a path.
        src_decoded = urllib.parse.unquote(src)
        src = os.path.normpath(os.path.join(context.base_path, src_decoded))

    def parse_dimension(name):
        """Return the integer value of attribute ``name``, or None."""
        try:
            return int(attributes[name])
        except (KeyError, TypeError, ValueError):
            return None

    # Parsed separately so an invalid width does not discard a valid height.
    width = parse_dimension("width")
    height = parse_dimension("height")

    return Image(source=src, alt_text=alt_text, width=width, height=height)

823 

824 

def ignore_handler(element: Tag, context: StyleContext) -> None:
    """Handle elements that produce no block of their own; always returns None."""
    return None

828 

829 

def generic_handler(element: Tag, context: StyleContext) -> List[Block]:
    """Handle unknown elements like a <div>: delegate to ``div_handler`` and return its blocks."""
    blocks = div_handler(element, context)
    return blocks

833 

834 

835# Handler registry - maps HTML tag names to handler functions 

HANDLERS: Dict[str, Callable[[Tag, StyleContext], Union[Block, List[Block], None]]] = {
    # Block elements
    "p": paragraph_handler,
    "div": div_handler,
    **dict.fromkeys(("h1", "h2", "h3", "h4", "h5", "h6"), heading_handler),
    "blockquote": blockquote_handler,
    "pre": preformatted_handler,
    "code": code_handler,
    "ul": unordered_list_handler,
    "ol": ordered_list_handler,
    "li": list_item_handler,
    "table": table_handler,
    "tr": table_row_handler,
    "td": table_cell_handler,
    "th": table_header_cell_handler,
    "hr": horizontal_rule_handler,
    "br": line_break_handler,
    # Semantic elements (treated as containers)
    **dict.fromkeys(
        ("section", "article", "aside", "nav", "header", "footer", "main", "figure"),
        div_handler,
    ),
    "figcaption": paragraph_handler,
    # Media elements
    "img": image_handler,
    # Inline elements (handled during text extraction)
    **dict.fromkeys(
        (
            "span", "a", "strong", "b", "em", "i", "u", "s", "del", "ins",
            "mark", "small", "sub", "sup", "q", "cite", "abbr", "time",
        ),
        ignore_handler,
    ),
    # Ignored elements
    **dict.fromkeys(
        ("script", "style", "meta", "link", "head", "title"),
        ignore_handler,
    ),
}

897 

898 

def parse_html_string(
    html_string: str, base_font: Optional[Font] = None, document=None, base_path: Optional[str] = None
) -> List[Block]:
    """
    Convert an HTML string into a flat list of Block objects.

    The markup is parsed with BeautifulSoup's "html.parser". If a <body> is
    found its children are processed, otherwise the top-level nodes of the
    parsed document are. Only tag nodes are dispatched; each is styled from
    the base context and passed to ``process_element``.

    Args:
        html_string: HTML content to parse
        base_font: Base font for styling, defaults to system default
        document: Document instance for font registry to avoid duplicate fonts
        base_path: Base directory path for resolving relative URLs (e.g., image sources)

    Returns:
        List of Block objects representing the document structure
    """
    soup = BeautifulSoup(html_string, "html.parser")
    root_context = create_base_context(base_font, document, base_path)

    # Process the body if it exists, otherwise process all top-level elements
    root_element = soup.find("body") or soup

    blocks = []
    for node in root_element.children:
        if not isinstance(node, Tag):
            continue
        produced = process_element(node, apply_element_styling(root_context, node))
        if not produced:
            continue
        if isinstance(produced, list):
            blocks.extend(produced)
        else:
            blocks.append(produced)

    return blocks