Coverage for pyWebLayout/io/readers/html_extraction.py

1"""

2HTML extraction module for converting HTML elements to pyWebLayout abstract elements.

4This module provides handler functions for converting HTML elements into the abstract document structure

5used by pyWebLayout, including paragraphs, headings, lists, tables, and inline formatting.

6Each handler function has a robust signature that handles style hints, CSS classes, and attributes.

7"""

9from typing import List, Dict, Any, Optional, Union, Callable, Tuple, NamedTuple

10from bs4 import BeautifulSoup, Tag, NavigableString

11from pyWebLayout.abstract.inline import Word

12from pyWebLayout.abstract.block import (

13 Block,

14 Paragraph,

15 Heading,

16 HeadingLevel,

17 Quote,

18 CodeBlock,

19 HList,

20 ListItem,

21 ListStyle,

22 Table,

23 TableRow,

24 TableCell,

25 HorizontalRule,

26 Image,

27)

28from pyWebLayout.style import Font, FontWeight, FontStyle, TextDecoration

class StyleContext(NamedTuple):
    """
    Styling state handed to every handler function.

    Bundles the inherited font and background, the accumulated CSS classes and
    inline styles, the current element's attributes, the stack of ancestor tag
    names, and optional document / base-path references. The ``with_*`` and
    ``push_element`` helpers never modify the instance; they return a copy
    with one field replaced.
    """

    font: Font
    background: Optional[Tuple[int, int, int, int]]
    css_classes: set
    css_styles: Dict[str, str]
    element_attributes: Dict[str, Any]
    parent_elements: List[str]  # Stack of parent element names
    document: Optional[Any]  # Reference to document for font registry
    base_path: Optional[str] = None  # Base path for resolving relative URLs

    def with_font(self, font: Font) -> "StyleContext":
        """Return a copy of this context carrying ``font``."""
        return self._replace(font=font)

    def with_background(
        self, background: Optional[Tuple[int, int, int, int]]
    ) -> "StyleContext":
        """Return a copy of this context carrying ``background``."""
        return self._replace(background=background)

    def with_css_classes(self, css_classes: set) -> "StyleContext":
        """Return a copy of this context carrying ``css_classes``."""
        return self._replace(css_classes=css_classes)

    def with_css_styles(self, css_styles: Dict[str, str]) -> "StyleContext":
        """Return a copy of this context carrying ``css_styles``."""
        return self._replace(css_styles=css_styles)

    def with_attributes(self, attributes: Dict[str, Any]) -> "StyleContext":
        """Return a copy whose ``element_attributes`` is ``attributes``."""
        return self._replace(element_attributes=attributes)

    def push_element(self, element_name: str) -> "StyleContext":
        """Return a copy with ``element_name`` appended to the parent stack."""
        # Build a new list so the stack held by this context is left untouched.
        extended_stack = self.parent_elements + [element_name]
        return self._replace(parent_elements=extended_stack)

def create_base_context(
        base_font: Optional[Font] = None,
        document=None,
        base_path: Optional[str] = None) -> StyleContext:
    """
    Build the root style context used before any element styling is applied.

    Args:
        base_font: Font to start from. When omitted, the document's
            ``get_or_create_font()`` is used if the document offers it,
            otherwise a default-constructed ``Font``.
        document: Document instance for font registry
        base_path: Base directory path for resolving relative URLs

    Returns:
        StyleContext with no background and empty classes, styles,
        attributes and parent stack
    """
    font = base_font
    if font is None:
        # Prefer the document's registry so fonts are shared where possible.
        has_registry = document and hasattr(document, 'get_or_create_font')
        font = document.get_or_create_font() if has_registry else Font()

    initial_state = dict(
        font=font,
        background=None,
        css_classes=set(),
        css_styles={},
        element_attributes={},
        parent_elements=[],
        document=document,
        base_path=base_path,
    )
    return StyleContext(**initial_state)

105

106

def apply_element_styling(context: StyleContext, element: Tag) -> StyleContext:
    """
    Derive the style context for ``element`` from its parent's context.

    The returned context records the element's attributes, pushes its tag name
    onto the parent stack, merges its CSS classes and inline styles into the
    inherited ones, and carries the resulting font and background.

    Only the element's *own* inline declarations are applied on top of the
    tag's default font styling. Inherited declarations are already reflected
    in ``context.font``; re-applying the merged dictionary would let a
    parent's inline style (e.g. ``font-weight: normal``) override the default
    styling of a nested ``<b>`` or ``<h1>``.

    Args:
        context: Current style context
        element: BeautifulSoup Tag object

    Returns:
        New StyleContext with applied styling
    """
    tag_name = element.name.lower()
    attributes = dict(element.attrs) if element.attrs else {}

    # Start with current context
    new_context = context.with_attributes(attributes).push_element(tag_name)

    # Merge CSS classes. The attribute may be a whitespace-separated string
    # or an already-split list, so both shapes are handled.
    css_classes = set(new_context.css_classes)
    class_attr = attributes.get("class")
    if class_attr:
        css_classes.update(
            class_attr.split() if isinstance(class_attr, str) else class_attr
        )
    new_context = new_context.with_css_classes(css_classes)

    # The merged dictionary stays on the context for consumers that need it;
    # the element's own declarations are kept apart for the font.
    own_styles = (
        parse_inline_styles(attributes["style"]) if "style" in attributes else {}
    )
    css_styles = {**new_context.css_styles, **own_styles}
    new_context = new_context.with_css_styles(css_styles)

    # Tag defaults first, then this element's own inline declarations.
    font = apply_element_font_styles(
        new_context.font, tag_name, own_styles, new_context)
    new_context = new_context.with_font(font)

    # Background handling still receives the merged styles, as before.
    background = apply_background_styles(new_context.background, css_styles)
    new_context = new_context.with_background(background)

    return new_context

152

153

def parse_inline_styles(style_text: str) -> Dict[str, str]:
    """
    Parse the contents of an HTML ``style`` attribute into a dictionary.

    Property names are stripped and lower-cased; values are stripped but keep
    their case. A trailing ``!important`` marker is removed from values so
    that callers comparing against plain keywords still see ``bold`` rather
    than ``bold !important``. Declarations without a colon or with an empty
    property name are skipped, and a later declaration of the same property
    overrides an earlier one.

    Args:
        style_text: CSS style text (e.g., "color: red; font-weight: bold;")

    Returns:
        Dictionary of CSS property-value pairs
    """
    styles: Dict[str, str] = {}
    if not style_text:
        return styles

    for declaration in style_text.split(";"):
        # Split on the first colon only; values such as url(http://...) may
        # contain further colons.
        prop, colon, value = declaration.partition(":")
        prop = prop.strip().lower()
        if not colon or not prop:
            continue

        value = value.strip()
        head, bang, tail = value.rpartition("!")
        if bang and tail.strip().lower() == "important":
            value = head.rstrip()

        styles[prop] = value
    return styles

170

171

# Default font adjustments implied by an HTML tag, applied before any CSS.
_ELEMENT_FONT_DEFAULTS = {
    "b": {"weight": FontWeight.BOLD},
    "strong": {"weight": FontWeight.BOLD},
    "i": {"style": FontStyle.ITALIC},
    "em": {"style": FontStyle.ITALIC},
    "u": {"decoration": TextDecoration.UNDERLINE},
    "s": {"decoration": TextDecoration.STRIKETHROUGH},
    "del": {"decoration": TextDecoration.STRIKETHROUGH},
    "h1": {"size": 24, "weight": FontWeight.BOLD},
    "h2": {"size": 20, "weight": FontWeight.BOLD},
    "h3": {"size": 18, "weight": FontWeight.BOLD},
    "h4": {"size": 16, "weight": FontWeight.BOLD},
    "h5": {"size": 14, "weight": FontWeight.BOLD},
    "h6": {"size": 12, "weight": FontWeight.BOLD},
}

# Named CSS colours understood by the simplified colour parser.
_CSS_NAMED_COLOURS = {
    "black": (0, 0, 0),
    "white": (255, 255, 255),
    "red": (255, 0, 0),
    "green": (0, 255, 0),
    "blue": (0, 0, 255),
}


def _parse_css_font_size(value: str) -> Optional[int]:
    """Return the integer size of a ``px``/``pt`` value, or None if unusable."""
    value = value.lower()
    if not value.endswith(("px", "pt")):
        return None
    try:
        # px and pt are treated alike; no unit conversion is attempted.
        return int(float(value[:-2]))
    except (ValueError, OverflowError):
        # OverflowError covers values such as "infpx".
        return None


def _parse_css_colour(value: str) -> Optional[Tuple[int, int, int]]:
    """Return an RGB tuple for a named or ``#rgb``/``#rrggbb`` colour, else None."""
    value = value.lower()
    if value in _CSS_NAMED_COLOURS:
        return _CSS_NAMED_COLOURS[value]
    if not value.startswith("#"):
        return None

    digits = value[1:]
    if len(digits) == 3:
        # Shorthand: each digit is doubled (#f0a -> #ff00aa).
        digits = "".join(ch * 2 for ch in digits)
    if len(digits) != 6 or any(ch not in "0123456789abcdef" for ch in digits):
        return None
    return (int(digits[0:2], 16), int(digits[2:4], 16), int(digits[4:6], 16))


def apply_element_font_styles(font: Font,
                              tag_name: str,
                              css_styles: Dict[str, str],
                              context: Optional[StyleContext] = None) -> Font:
    """
    Apply font styling based on HTML element and CSS styles.

    Starts from the properties of ``font``, applies the defaults implied by
    ``tag_name`` (bold for ``<b>``, sizes for headings, ...) and then lets the
    recognised CSS properties (``font-size``, ``font-weight``, ``font-style``,
    ``text-decoration``, ``color``) override them. Unrecognised or unparseable
    values leave the corresponding property unchanged.

    The resulting font is obtained from the document's style registry when the
    context's document offers ``get_or_create_style``, from its font registry
    when it offers ``get_or_create_font``, and otherwise constructed directly.

    Args:
        font: Current font
        tag_name: HTML tag name
        css_styles: CSS styles dictionary
        context: Style context with document reference for font registry

    Returns:
        Font object with applied styling (either existing or newly created)
    """
    # Start with current font properties
    font_size = font.font_size
    colour = font.colour
    weight = font.weight
    style = font.style
    decoration = font.decoration
    background = font.background
    language = font.language
    font_path = font._font_path

    # Apply element default styles
    defaults = _ELEMENT_FONT_DEFAULTS.get(tag_name, {})
    font_size = defaults.get("size", font_size)
    weight = defaults.get("weight", weight)
    style = defaults.get("style", style)
    decoration = defaults.get("decoration", decoration)

    # Apply CSS styles (override element defaults)
    if "font-size" in css_styles:
        parsed_size = _parse_css_font_size(css_styles["font-size"])
        if parsed_size is not None:
            font_size = parsed_size

    if "font-weight" in css_styles:
        weight_value = css_styles["font-weight"].lower()
        if weight_value in ("bold", "700", "800", "900"):
            weight = FontWeight.BOLD
        elif weight_value in ("normal", "400"):
            weight = FontWeight.NORMAL

    if "font-style" in css_styles:
        style_value = css_styles["font-style"].lower()
        if style_value == "italic":
            style = FontStyle.ITALIC
        elif style_value == "normal":
            style = FontStyle.NORMAL

    if "text-decoration" in css_styles:
        decoration_value = css_styles["text-decoration"].lower()
        # Substring checks, in this priority order, as the value may list
        # several keywords.
        if "underline" in decoration_value:
            decoration = TextDecoration.UNDERLINE
        elif "line-through" in decoration_value:
            decoration = TextDecoration.STRIKETHROUGH
        elif "none" in decoration_value:
            decoration = TextDecoration.NONE

    if "color" in css_styles:
        parsed_colour = _parse_css_colour(css_styles["color"])
        if parsed_colour is not None:
            colour = parsed_colour

    document = context.document if context else None

    # Use document's style registry if available to avoid creating duplicate styles
    if document and hasattr(document, 'get_or_create_style'):
        # Imported here, as in the original code, rather than at module level.
        from pyWebLayout.style.abstract_style import FontFamily, FontSize

        # Default family - could be enhanced to detect from font_path
        font_family = FontFamily.SERIF
        # Only a non-zero integer size is forwarded as-is; anything else
        # falls back to the MEDIUM preset.
        if font_size and isinstance(font_size, int):
            font_size_value = font_size
        else:
            font_size_value = FontSize.MEDIUM

        # Create abstract style and register it
        style_id, abstract_style = document.get_or_create_style(
            font_family=font_family,
            font_size=font_size_value,
            font_weight=weight,
            font_style=style,
            text_decoration=decoration,
            color=colour,
            language=language
        )

        # Get the concrete font for this style
        return document.get_font_for_style(abstract_style)

    if document and hasattr(document, 'get_or_create_font'):
        # Fallback to old font registry system
        return document.get_or_create_font(
            font_path=font_path,
            font_size=font_size,
            colour=colour,
            weight=weight,
            style=style,
            decoration=decoration,
            background=background,
            language=language,
            min_hyphenation_width=font.min_hyphenation_width
        )

    # Fallback to creating new font if no document context.
    # NOTE(review): unlike the registry branch above, min_hyphenation_width is
    # not forwarded here - confirm whether Font() accepts it before adding.
    return Font(
        font_path=font_path,
        font_size=font_size,
        colour=colour,
        weight=weight,
        style=style,
        decoration=decoration,
        background=background,
        language=language,
    )

340

341

def apply_background_styles(
    current_background: Optional[Tuple[int, int, int, int]], css_styles: Dict[str, str]
) -> Optional[Tuple[int, int, int, int]]:
    """
    Resolve the background after considering CSS ``background-color``.

    Only the ``transparent`` keyword (matched case-insensitively) is
    interpreted: it clears the background. Any other value, or no declaration
    at all, leaves the current background unchanged.

    Args:
        current_background: Current background color (RGBA)
        css_styles: CSS styles dictionary

    Returns:
        None when the style requests transparency, otherwise
        ``current_background`` as passed in
    """
    wants_transparent = (
        "background-color" in css_styles
        and css_styles["background-color"].lower() == "transparent"
    )
    # Colour values other than "transparent" are not parsed.
    return None if wants_transparent else current_background

362

363

def extract_text_content(element: Tag, context: StyleContext) -> List[Word]:
    """
    Collect the words of an element's inline content, in document order.

    Text nodes are split on whitespace into ``Word`` objects styled with the
    current context. ``<a>`` elements with a non-empty ``href`` become one
    ``LinkedWord`` per word; other inline elements (and ``<a>`` without
    ``href``) are descended into with their own styling applied. Any other tag
    is passed to ``process_element`` and only the words of resulting
    ``Paragraph`` blocks are kept.

    HTML comments are skipped: BeautifulSoup exposes them as
    ``NavigableString`` subclasses, so without the check their text would be
    emitted as visible words.

    Args:
        element: BeautifulSoup Tag object
        context: Current style context

    Returns:
        List of Word objects (including LinkedWord for hyperlinks)
    """
    from bs4 import Comment
    from pyWebLayout.abstract.inline import LinkedWord
    from pyWebLayout.abstract.functional import LinkType

    inline_tags = frozenset((
        "span", "strong", "b", "em", "i", "u", "s", "del", "ins", "mark",
        "small", "sub", "sup", "code", "q", "cite", "abbr", "time",
    ))

    words = []

    for child in element.children:
        if isinstance(child, Comment):
            continue

        if isinstance(child, NavigableString):
            # str.split() with no arguments drops surrounding whitespace and
            # never yields empty strings.
            words.extend(
                Word(word_text, context.font, context.background)
                for word_text in str(child).split()
            )
            continue

        if not isinstance(child, Tag):
            continue

        tag_name = child.name.lower()
        child_context = apply_element_styling(context, child)
        href = child.get('href', '') if tag_name == "a" else ''

        if href:
            # Determine link type based on href; anything that is neither an
            # absolute http(s) URL nor a javascript:/api: target is internal.
            if href.startswith(('http://', 'https://')):
                link_type = LinkType.EXTERNAL
            elif href.startswith(('javascript:', 'api:')):
                link_type = LinkType.API
            else:
                link_type = LinkType.INTERNAL

            # A separator is required: with strip=True alone, get_text joins
            # the stripped strings of nested tags directly, merging
            # "click <b>here</b>" into "clickhere".
            link_text = child.get_text(separator=" ", strip=True)
            title = child.get('title', '') or None

            for word_text in link_text.split():
                words.append(
                    LinkedWord(
                        text=word_text,
                        style=child_context.font,
                        location=href,
                        link_type=link_type,
                        background=child_context.background,
                        title=title,
                    )
                )
        elif tag_name == "a" or tag_name in inline_tags:
            # <a> without href is treated as normal inline text
            words.extend(extract_text_content(child, child_context))
        else:
            # Block element - shouldn't happen in well-formed HTML but handle
            # gracefully by keeping the words of any paragraphs it yields.
            child_result = process_element(child, child_context)
            candidates = (
                child_result if isinstance(child_result, list) else [child_result]
            )
            for block in candidates:
                if isinstance(block, Paragraph):
                    words.extend(word for _, word in block.words_iter())

    return words

467

468

def process_element(
    element: Tag, context: StyleContext
) -> Union[Block, List[Block], None]:
    """
    Dispatch one HTML element to the handler registered for its tag.

    The lower-cased tag name is looked up in ``HANDLERS``; tags without an
    entry are given to ``generic_handler``.

    Args:
        element: BeautifulSoup Tag object
        context: Current style context

    Returns:
        Whatever the selected handler returns: block object(s), or None if
        the element should be ignored
    """
    chosen_handler = HANDLERS.get(element.name.lower(), generic_handler)
    return chosen_handler(element, context)

485

486

487# Handler function signatures:

488# All handlers receive (element: Tag, context: StyleContext) ->

489# Union[Block, List[Block], None]

490

491

def paragraph_handler(element: Tag, context: StyleContext) -> Union[Paragraph, List[Block], Image]:
    """
    Handle <p> elements.

    Special handling for paragraphs containing images:
    - If the paragraph contains only image(s) (common in EPUBs), return the
      single image block, or a list when there are several
    - If the paragraph contains images mixed with text, return a list with the
      text paragraph first, followed by one block per image
    - Otherwise, return a normal paragraph with text content

    Images are collected with ``find_all`` in both image cases, so an
    ``<img>`` nested inside an inline element (e.g. within ``<a>`` or
    ``<span>``) is kept instead of being dropped.
    """
    # Check if paragraph contains any img tags (including nested ones)
    img_tags = element.find_all('img')

    if not img_tags:
        # No images - normal paragraph handling
        paragraph = Paragraph(context.font)
        for word in extract_text_content(element, context):
            paragraph.add_word(word)
        return paragraph

    # One image block per <img>, in document order.
    image_blocks = []
    for img_tag in img_tags:
        img_block = image_handler(img_tag, apply_element_styling(context, img_tag))
        if img_block:
            image_blocks.append(img_block)

    # get_text(strip=True) is empty exactly when no non-whitespace text exists.
    if not element.get_text(strip=True):
        # Image-only paragraph - return just the image(s)
        if len(image_blocks) == 1:
            return image_blocks[0]
        return image_blocks if image_blocks else Paragraph(context.font)

    # Mixed content - text paragraph first, images after it
    blocks = list(image_blocks)
    words = extract_text_content(element, context)
    if words:
        paragraph = Paragraph(context.font)
        for word in words:
            paragraph.add_word(word)
        blocks.insert(0, paragraph)  # Text comes before images

    return blocks if blocks else Paragraph(context.font)

556

557

def div_handler(element: Tag, context: StyleContext) -> List[Block]:
    """
    Handle <div> elements - treat as generic container.

    Each child tag is styled and processed in turn, and the blocks it yields
    are flattened into a single list. Only Tag children are visited; text
    nodes that are direct children of the container are not converted.
    """
    collected = []
    for node in element.children:
        if not isinstance(node, Tag):
            continue
        outcome = process_element(node, apply_element_styling(context, node))
        if not outcome:
            continue
        # Handlers return either a single block or a list of blocks.
        if isinstance(outcome, list):
            collected.extend(outcome)
        else:
            collected.append(outcome)
    return collected

571

572

def heading_handler(element: Tag, context: StyleContext) -> Heading:
    """
    Handle <h1>-<h6> elements.

    The heading level follows the tag name; any other tag name falls back to
    level H1. The heading's words come from ``extract_text_content``.
    """
    tag_names = ("h1", "h2", "h3", "h4", "h5", "h6")
    levels = (
        HeadingLevel.H1,
        HeadingLevel.H2,
        HeadingLevel.H3,
        HeadingLevel.H4,
        HeadingLevel.H5,
        HeadingLevel.H6,
    )
    level_by_tag = dict(zip(tag_names, levels))

    heading = Heading(
        level_by_tag.get(element.name.lower(), HeadingLevel.H1), context.font
    )
    for word in extract_text_content(element, context):
        heading.add_word(word)
    return heading

590

591

def blockquote_handler(element: Tag, context: StyleContext) -> Quote:
    """
    Handle <blockquote> elements.

    Every child tag is styled and processed, and the resulting block(s) are
    added to the quote. Only Tag children are visited; text nodes that are
    direct children of the blockquote are not converted.
    """
    quote = Quote(context.font)
    for node in element.children:
        if not isinstance(node, Tag):
            continue
        outcome = process_element(node, apply_element_styling(context, node))
        if not outcome:
            continue
        # Normalise single blocks and lists of blocks to one loop.
        produced = outcome if isinstance(outcome, list) else [outcome]
        for block in produced:
            quote.add_block(block)
    return quote

606

607

def preformatted_handler(element: Tag, context: StyleContext) -> CodeBlock:
    """
    Handle <pre> elements.

    The block's language is read from the ``data-language`` attribute in the
    context (empty string when absent). The text of all descendant text nodes
    is concatenated exactly as written and split on newlines, one code line
    per piece. ``<br>`` tags contribute a newline.

    Text nodes are joined without a separator: inserting one between nodes
    would put every inline fragment (e.g. syntax-highlighting ``<span>``s) on
    a line of its own.
    """
    from bs4.element import CData

    language = context.element_attributes.get("data-language", "")
    code_block = CodeBlock(language)

    fragments = []
    for node in element.descendants:
        if isinstance(node, Tag):
            if node.name.lower() == "br":
                fragments.append("\n")
        elif type(node) in (NavigableString, CData):
            # Exact type check: comments and other special string subclasses
            # are not part of the visible text.
            fragments.append(str(node))

    # Preserve whitespace and line breaks in preformatted text
    for line in "".join(fragments).split("\n"):
        code_block.add_line(line)

    return code_block

619

620

def code_handler(element: Tag, context: StyleContext) -> Union[CodeBlock, None]:
    """
    Handle <code> elements.

    Always returns None; this handler never produces a block of its own. A
    ``<code>`` inside ``<pre>`` is covered by ``preformatted_handler``, which
    reads the text of the whole ``<pre>``, and inline ``<code>`` is consumed
    by ``extract_text_content`` during text extraction.
    """
    # Both the nested-in-<pre> and the inline case yield no block, so no
    # inspection of the parent stack is needed.
    return None

629

630

def unordered_list_handler(element: Tag, context: StyleContext) -> HList:
    """
    Handle <ul> elements.

    Builds an unordered ``HList`` from the direct ``<li>`` children; other
    children are skipped.
    """
    hlist = HList(ListStyle.UNORDERED, context.font)
    li_children = (
        node
        for node in element.children
        if isinstance(node, Tag) and node.name.lower() == "li"
    )
    for li in li_children:
        item = process_element(li, apply_element_styling(context, li))
        if item:
            hlist.add_item(item)
    return hlist

641

642

def ordered_list_handler(element: Tag, context: StyleContext) -> HList:
    """
    Handle <ol> elements.

    Builds an ordered ``HList`` from the direct ``<li>`` children; other
    children are skipped.
    """
    hlist = HList(ListStyle.ORDERED, context.font)
    li_children = (
        node
        for node in element.children
        if isinstance(node, Tag) and node.name.lower() == "li"
    )
    for li in li_children:
        item = process_element(li, apply_element_styling(context, li))
        if item:
            hlist.add_item(item)
    return hlist

653

654

def list_item_handler(element: Tag, context: StyleContext) -> ListItem:
    """
    Handle <li> elements.

    Child tags are processed through ``process_element`` and their blocks are
    added to the item. Each non-blank text node that is a direct child
    becomes its own paragraph of words in the context font.
    """
    list_item = ListItem(None, context.font)

    for node in element.children:
        if isinstance(node, Tag):
            outcome = process_element(node, apply_element_styling(context, node))
            if not outcome:
                continue
            produced = outcome if isinstance(outcome, list) else [outcome]
            for block in produced:
                list_item.add_block(block)
        elif isinstance(node, NavigableString):
            # Direct text in list item - create paragraph. split() yields
            # nothing for blank text, so whitespace-only nodes add no block.
            tokens = str(node).split()
            if tokens:
                paragraph = Paragraph(context.font)
                for token in tokens:
                    paragraph.add_word(Word(token, context.font))
                list_item.add_block(paragraph)

    return list_item

681

682

def table_handler(element: Tag, context: StyleContext) -> Table:
    """
    Handle <table> elements.

    Rows that are direct children of the table are added with the table's
    default section; rows directly inside ``<thead>``, ``<tbody>`` and
    ``<tfoot>`` are added with section "header", "body" and "footer".

    The caption and the section rows are looked up among direct children only
    (``recursive=False``), so the rows and caption of a table nested inside a
    cell are not attributed to this table.
    """
    caption = None
    caption_elem = element.find("caption", recursive=False)
    if caption_elem is not None:
        # The separator keeps words of nested inline tags apart.
        caption = caption_elem.get_text(separator=" ", strip=True)

    table = Table(caption, context.font)

    section_names = {"thead": "header", "tbody": "body", "tfoot": "footer"}

    # Process table rows
    for child in element.children:
        if not isinstance(child, Tag):
            continue

        child_name = child.name.lower()
        if child_name == "tr":
            row = process_element(child, apply_element_styling(context, child))
            if row:
                table.add_row(row)
        elif child_name in section_names:
            section = section_names[child_name]
            for row_elem in child.find_all("tr", recursive=False):
                row = process_element(
                    row_elem, apply_element_styling(context, row_elem)
                )
                if row:
                    table.add_row(row, section)

    return table

711

712

def table_row_handler(element: Tag, context: StyleContext) -> TableRow:
    """
    Handle <tr> elements.

    Builds a ``TableRow`` from the direct ``<td>`` / ``<th>`` children; other
    children are skipped.
    """
    row = TableRow(context.font)
    cell_tags = ("td", "th")
    for node in element.children:
        if not (isinstance(node, Tag) and node.name.lower() in cell_tags):
            continue
        cell = process_element(node, apply_element_styling(context, node))
        if cell:
            row.add_cell(cell)
    return row

723

724

def table_cell_handler(element: Tag, context: StyleContext) -> TableCell:
    """
    Handle <td> elements.

    ``colspan`` / ``rowspan`` are read from the context's element attributes;
    a missing or non-integer value falls back to 1 instead of aborting the
    parse. Child tags are processed through ``process_element``, and each
    non-blank direct text node becomes its own paragraph.
    """
    def span(attribute: str) -> int:
        """Read a span attribute, defaulting to 1 when missing or malformed."""
        try:
            return int(context.element_attributes.get(attribute, 1))
        except (TypeError, ValueError):
            return 1

    cell = TableCell(False, span("colspan"), span("rowspan"), context.font)

    # Process cell content
    for child in element.children:
        if isinstance(child, Tag):
            result = process_element(child, apply_element_styling(context, child))
            if not result:
                continue
            for block in (result if isinstance(result, list) else [result]):
                cell.add_block(block)
        elif isinstance(child, NavigableString):
            # Direct text in cell - create paragraph
            word_texts = str(child).split()
            if word_texts:
                paragraph = Paragraph(context.font)
                for word_text in word_texts:
                    paragraph.add_word(Word(word_text, context.font))
                cell.add_block(paragraph)

    return cell

754

755

def table_header_cell_handler(element: Tag, context: StyleContext) -> TableCell:
    """
    Handle <th> elements.

    Same as the ``<td>`` handling except that the cell is created as a header
    cell. ``colspan`` / ``rowspan`` are read from the context's element
    attributes; a missing or non-integer value falls back to 1 instead of
    aborting the parse.
    """
    def span(attribute: str) -> int:
        """Read a span attribute, defaulting to 1 when missing or malformed."""
        try:
            return int(context.element_attributes.get(attribute, 1))
        except (TypeError, ValueError):
            return 1

    cell = TableCell(True, span("colspan"), span("rowspan"), context.font)

    # Process cell content (same as td)
    for child in element.children:
        if isinstance(child, Tag):
            result = process_element(child, apply_element_styling(context, child))
            if not result:
                continue
            for block in (result if isinstance(result, list) else [result]):
                cell.add_block(block)
        elif isinstance(child, NavigableString):
            # Direct text in cell - create paragraph
            word_texts = str(child).split()
            if word_texts:
                paragraph = Paragraph(context.font)
                for word_text in word_texts:
                    paragraph.add_word(Word(word_text, context.font))
                cell.add_block(paragraph)

    return cell

784

785

def horizontal_rule_handler(element: Tag, context: StyleContext) -> HorizontalRule:
    """Handle <hr> elements by producing a new ``HorizontalRule`` block."""
    # Neither the element nor the context influences the rule.
    rule = HorizontalRule()
    return rule

789

790

def line_break_handler(element: Tag, context: StyleContext) -> None:
    """
    Handle <br> elements.

    Produces no block: line breaks are typically handled at the paragraph
    level.
    """
    return None

795

796

def image_handler(element: Tag, context: StyleContext) -> Image:
    """
    Handle <img> elements.

    ``src``, ``alt``, ``width`` and ``height`` are read from the context's
    element attributes. A relative ``src`` is URL-decoded and resolved against
    ``context.base_path`` when one is set; ``http(s)://`` URLs, ``data:`` URIs
    and paths starting with ``/`` are passed through unchanged. Width and
    height are parsed independently, so one malformed dimension does not
    discard the other; a dimension that is missing or not an integer is None.
    """
    import os
    import urllib.parse

    attributes = context.element_attributes
    src = attributes.get("src", "")
    alt_text = attributes.get("alt", "")

    # Resolve relative paths if base_path is provided. URL schemes are
    # case-insensitive, hence the lower-casing for the prefix test only.
    non_relative_prefixes = ('http://', 'https://', '/', 'data:')
    if (
        context.base_path
        and src
        and not src.lower().startswith(non_relative_prefixes)
    ):
        # Parse the src to handle URL-encoded characters
        src_decoded = urllib.parse.unquote(src)
        # Resolve relative path to absolute path
        src = os.path.normpath(os.path.join(context.base_path, src_decoded))

    def dimension(name: str):
        """Return the attribute as an int, or None when absent or malformed."""
        try:
            return int(attributes[name])
        except (KeyError, TypeError, ValueError):
            return None

    return Image(
        source=src,
        alt_text=alt_text,
        width=dimension("width"),
        height=dimension("height"),
    )

823

824

def ignore_handler(element: Tag, context: StyleContext) -> None:
    """
    Handle elements that should be ignored.

    Always returns None, so the element contributes no block.
    """
    return None

828

829

def generic_handler(element: Tag, context: StyleContext) -> List[Block]:
    """
    Handle unknown elements as generic containers.

    Delegates to ``div_handler`` and returns its list of blocks.
    """
    container_blocks = div_handler(element, context)
    return container_blocks

833

834

835# Handler registry - maps HTML tag names to handler functions

HANDLERS: Dict[str, Callable[[Tag, StyleContext], Union[Block, List[Block], None]]] = {
    # Block elements
    "p": paragraph_handler,
    "div": div_handler,
    **dict.fromkeys(("h1", "h2", "h3", "h4", "h5", "h6"), heading_handler),
    "blockquote": blockquote_handler,
    "pre": preformatted_handler,
    "code": code_handler,
    "ul": unordered_list_handler,
    "ol": ordered_list_handler,
    "li": list_item_handler,
    "table": table_handler,
    "tr": table_row_handler,
    "td": table_cell_handler,
    "th": table_header_cell_handler,
    "hr": horizontal_rule_handler,
    "br": line_break_handler,
    # Semantic elements (treated as containers)
    **dict.fromkeys(
        ("section", "article", "aside", "nav", "header", "footer", "main", "figure"),
        div_handler,
    ),
    "figcaption": paragraph_handler,
    # Media elements
    "img": image_handler,
    # Inline elements (handled during text extraction)
    **dict.fromkeys(
        (
            "span", "a", "strong", "b", "em", "i", "u", "s", "del", "ins",
            "mark", "small", "sub", "sup", "q", "cite", "abbr", "time",
        ),
        ignore_handler,
    ),
    # Ignored elements
    **dict.fromkeys(
        ("script", "style", "meta", "link", "head", "title"),
        ignore_handler,
    ),
}

897

898

def parse_html_string(
    html_string: str, base_font: Optional[Font] = None, document=None, base_path: Optional[str] = None
) -> List[Block]:
    """
    Convert an HTML string into a flat list of Block objects.

    The markup is parsed with BeautifulSoup's ``html.parser``. The children of
    ``<body>`` are processed when a body exists, otherwise the top-level nodes
    of the parsed document. Only Tag nodes are converted.

    Args:
        html_string: HTML content to parse
        base_font: Base font for styling, defaults to system default
        document: Document instance for font registry to avoid duplicate fonts
        base_path: Base directory path for resolving relative URLs (e.g., image sources)

    Returns:
        List of Block objects representing the document structure
    """
    soup = BeautifulSoup(html_string, "html.parser")
    root_context = create_base_context(base_font, document, base_path)

    # Process the body if it exists, otherwise process all top-level elements
    root = soup.find("body") or soup

    blocks = []
    for node in root.children:
        if not isinstance(node, Tag):
            continue
        produced = process_element(node, apply_element_styling(root_context, node))
        if not produced:
            continue
        # A handler may return one block or a list of blocks.
        if isinstance(produced, list):
            blocks.extend(produced)
        else:
            blocks.append(produced)

    return blocks

Coverage for pyWebLayout/io/readers/html_extraction.py: 88%

424 statements