#!/usr/bin/env python3
"""
Simple EPUB page renderer tool.

This tool uses the pyWebLayout epub_reader and layout modules to:
1. Load an EPUB file
2. Render the first X pages according to command line arguments
3. Save the pages as PNG images, each with a companion text dump of the
   page content for verification

Usage:
    python epub_page_renderer.py book.epub --pages 5 --output-dir rendered_pages
"""

import os
import sys
import argparse
import traceback
from pathlib import Path
from typing import List

# Add the parent directory to sys.path to import pyWebLayout
sys.path.insert(0, str(Path(__file__).parent.parent))

try:
    from pyWebLayout.io.readers.epub_reader import read_epub
    from pyWebLayout.layout.ereader_layout import BidirectionalLayouter, RenderingPosition
    from pyWebLayout.concrete.page import Page
    from pyWebLayout.style.page_style import PageStyle
    from pyWebLayout.abstract.block import Block
    from PIL import Image, ImageDraw
except ImportError as e:
    print(f"Error importing required modules: {e}")
    print("Make sure pyWebLayout is properly installed and PIL is available")
    sys.exit(1)


def _make_error_image(page: Page, message: str) -> Image.Image:
    """Return a white image of the page's size with ``message`` drawn in red."""
    error_image = Image.new('RGB', page.size, 'white')
    draw = ImageDraw.Draw(error_image)
    draw.text((20, 20), message, fill='red')
    return error_image


def render_page_to_image(page: Page) -> Image.Image:
    """
    Render a Page object to a PIL Image using pyWebLayout's built-in rendering.

    Rendering failures do not propagate: a placeholder image carrying the
    error message is returned instead, so the caller can still save one
    file per page.

    Args:
        page: The Page object to render

    Returns:
        PIL Image object (the rendered page, or an error placeholder)
    """
    try:
        # Use the Page's built-in render method
        rendered_image = page.render()
    except Exception as e:
        print(f"Warning: Error rendering page: {e}")
        return _make_error_image(page, f"Rendering error: {str(e)}")

    if isinstance(rendered_image, Image.Image):
        return rendered_image

    # render() returned something that cannot be saved as an image
    return _make_error_image(page, "Error: Page.render() did not return PIL Image")


def _extract_text_from_paragraph(para_obj) -> str:
    """
    Join the words of a paragraph-like object into a single string.

    Words are read from ``words_iter()`` when the object provides it,
    otherwise from a ``_words`` attribute. Returns "(No text)" when no
    words are found and an error marker string when reading them fails.
    """
    words = []
    try:
        if hasattr(para_obj, 'words_iter') and callable(para_obj.words_iter):
            # words_iter() yields pairs; only the second item (the word) is used
            for _, word in para_obj.words_iter():
                words.append(str(getattr(word, 'text', word)))
        elif hasattr(para_obj, '_words'):
            # Direct access to words list
            for word in para_obj._words:
                words.append(str(getattr(word, 'text', word)))
    except Exception as e:
        return f"(Text extraction error: {str(e)})"

    return ' '.join(words) if words else "(No text)"


def extract_text_from_page(page: Page) -> str:
    """
    Extract text content from a Page object for verification purposes.

    The page's ``_children`` are walked recursively and one output line is
    produced per recognised element (line, paragraph, heading, list, table,
    image, plain text element); anything else is shown as its class name.
    Any exception raised during extraction is reported in the returned text
    together with its traceback rather than being raised.

    Args:
        page: The Page object to extract text from

    Returns:
        String containing the page's text content
    """
    text_lines = ["=== PAGE CONTENT ===", ""]

    try:
        # Imported once per call (not once per visited element); kept inside
        # the try so an import failure is reported in the output text.
        from pyWebLayout.abstract.block import (
            Paragraph, Heading, HList, Table, Image as AbstractImage,
        )
        from pyWebLayout.concrete.text import Line

        def extract_from_element(element, indent_level=0):
            """Append the text representation of ``element`` to text_lines."""
            indent = " " * indent_level

            # Handle Line objects (concrete)
            if isinstance(element, Line):
                text_objects = getattr(element, '_text_objects', None) or []
                line_text = [
                    str(text_obj.text)
                    for text_obj in text_objects
                    if hasattr(text_obj, 'text')
                ]
                if line_text:
                    text_lines.append(f"{indent}{' '.join(line_text)}")

            # Handle abstract block objects
            elif isinstance(element, (Paragraph, Heading)):
                paragraph_text = _extract_text_from_paragraph(element)
                if paragraph_text:
                    block_type = "HEADING" if isinstance(element, Heading) else "PARAGRAPH"
                    text_lines.append(f"{indent}{block_type}: {paragraph_text}")

            elif isinstance(element, HList):
                text_lines.append(f"{indent}LIST:")
                # Best effort: a failure while reading items is noted in the
                # output instead of aborting the whole page dump.
                try:
                    for item in element.items():
                        item_text = _extract_text_from_paragraph(item)
                        if item_text:
                            text_lines.append(f"{indent} - {item_text}")
                except Exception:
                    text_lines.append(f"{indent} (List content extraction failed)")

            elif isinstance(element, Table):
                text_lines.append(f"{indent}[TABLE]")

            elif isinstance(element, AbstractImage):
                alt_text = getattr(element, 'alt_text', '')
                src = getattr(element, 'source', 'Unknown')
                text_lines.append(f"{indent}[IMAGE: {alt_text or src}]")

            # Handle containers with children
            elif hasattr(element, '_children') and element._children:
                for child in element._children:
                    extract_from_element(child, indent_level + 1)

            # Handle text elements
            elif hasattr(element, 'text'):
                text = str(element.text).strip()
                if text:
                    text_lines.append(f"{indent}{text}")

            # Handle other object types by showing their class name
            else:
                class_name = element.__class__.__name__
                text_lines.append(f"{indent}[{class_name}]")

        # Extract text from page children
        if hasattr(page, '_children'):
            for child in page._children:
                extract_from_element(child)

        # If no text was extracted, add a note
        if len(text_lines) <= 2:  # Only header and empty line
            text_lines.append("(No text content found)")

    except Exception as e:
        text_lines.append(f"Error extracting text: {str(e)}")
        text_lines.append(traceback.format_exc())

    return "\n".join(text_lines)


def get_all_blocks_from_book(book) -> List[Block]:
    """
    Extract all blocks from all chapters in the book.

    Chapters without a ``_blocks`` attribute (or with an empty/None one)
    contribute nothing.

    Args:
        book: The Book object from epub_reader

    Returns:
        List of all Block objects, in chapter order
    """
    all_blocks = []
    for chapter in book.chapters:
        all_blocks.extend(getattr(chapter, '_blocks', None) or ())
    return all_blocks


def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command line parser for the renderer."""
    parser = argparse.ArgumentParser(
        description='Render EPUB pages to images using pyWebLayout',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python epub_page_renderer.py book.epub --pages 5
  python epub_page_renderer.py book.epub --pages 10 --output-dir my_output --width 600 --height 800
        """
    )

    parser.add_argument(
        'epub_file',
        help='Path to the EPUB file to render'
    )
    parser.add_argument(
        '--pages', '-p',
        type=int,
        default=5,
        help='Number of pages to render (default: 5)'
    )
    parser.add_argument(
        '--output-dir', '-o',
        default='rendered_pages',
        help='Output directory for rendered images (default: rendered_pages)'
    )
    parser.add_argument(
        '--width', '-w',
        type=int,
        default=800,
        help='Page width in pixels (default: 800)'
    )
    # '-t' because '-h' is taken by argparse's built-in help option
    parser.add_argument(
        '--height', '-t',
        type=int,
        default=1000,
        help='Page height in pixels (default: 1000)'
    )
    parser.add_argument(
        '--margin', '-m',
        type=int,
        default=40,
        help='Page margin in pixels (default: 40)'
    )
    parser.add_argument(
        '--align', '-a',
        choices=['left', 'justify'],
        default='left',
        help='Text alignment: left or justify (default: left)'
    )
    return parser


def main() -> int:
    """
    Main function to handle command line arguments and process the EPUB.

    Returns:
        Process exit code: 0 on success, 1 on invalid arguments or on any
        failure while loading, laying out or rendering the book.
    """
    args = _build_arg_parser().parse_args()

    # Validate arguments
    if not os.path.exists(args.epub_file):
        print(f"Error: EPUB file '{args.epub_file}' not found")
        return 1

    if args.pages <= 0:
        print("Error: Number of pages must be positive")
        return 1

    if args.width <= 0 or args.height <= 0:
        print("Error: Page width and height must be positive")
        return 1

    if args.margin < 0:
        print("Error: Page margin must not be negative")
        return 1

    # Create output directory
    try:
        os.makedirs(args.output_dir, exist_ok=True)
    except OSError as e:
        print(f"Error creating output directory: {e}")
        return 1

    print(f"Loading EPUB file: {args.epub_file}")

    # Load the EPUB file
    try:
        book = read_epub(args.epub_file)
        print(f"Successfully loaded EPUB: {book.get_title() or 'Unknown Title'}")

        # Print book information
        author = book.get_metadata('AUTHOR')
        if author:
            print(f"Author: {author}")

        print(f"Chapters: {len(book.chapters) if hasattr(book, 'chapters') else 'Unknown'}")

    except Exception as e:
        print(f"Error loading EPUB file: {e}")
        traceback.print_exc()
        return 1

    # Extract all blocks from the book
    print("Extracting content blocks...")
    try:
        all_blocks = get_all_blocks_from_book(book)
        print(f"Extracted {len(all_blocks)} content blocks")

        if not all_blocks:
            print("No content blocks found in EPUB. The book might be empty.")
            return 1

        # The alignment is not applied to the blocks here; it is handed to
        # the layouter below as alignment_override.
        from pyWebLayout.style.alignment import Alignment
        alignment = Alignment.JUSTIFY if args.align == 'justify' else Alignment.LEFT
        print(f"Applying {args.align} alignment to all text blocks...")

    except Exception as e:
        print(f"Error extracting blocks: {e}")
        traceback.print_exc()
        return 1

    # Set up page style and layouter
    page_size = (args.width, args.height)
    page_style = PageStyle(
        background_color=(255, 255, 255),
        border_width=args.margin,  # the --margin option is passed as the border width
        border_color=(200, 200, 200),
        padding=(10, 10, 10, 10),  # top, right, bottom, left
        line_spacing=5,
        inter_block_spacing=15
    )

    print(f"Setting up layouter with page size {page_size} and {args.align} alignment")

    try:
        layouter = BidirectionalLayouter(
            blocks=all_blocks,
            page_style=page_style,
            page_size=page_size,
            alignment_override=alignment
        )
    except Exception as e:
        print(f"Error setting up layouter: {e}")
        traceback.print_exc()
        return 1

    # Render pages
    print(f"Rendering up to {args.pages} pages...")

    try:
        pages = []
        current_position = RenderingPosition()  # Start from beginning

        for page_num in range(args.pages):
            print(f"Rendering page {page_num + 1}/{args.pages}...")

            try:
                # Render the page
                page, next_position = layouter.render_page_forward(current_position)
                pages.append(page)

                # Advance before the end-of-document check so the progress
                # report below reflects everything that was rendered,
                # including the final page.
                current_position = next_position

                # Check if we've reached the end of the document
                if current_position.block_index >= len(all_blocks):
                    print(f"Reached end of document after {page_num + 1} pages")
                    break

            except Exception as e:
                print(f"Error rendering page {page_num + 1}: {e}")
                traceback.print_exc()
                break

        if not pages:
            print("No pages were generated.")
            return 1

        print(f"Generated {len(pages)} pages")

        # Save each page to an image and extract text
        for i, page in enumerate(pages):
            print(f"Saving page {i + 1}/{len(pages)}...")

            try:
                # Create image from page using pyWebLayout's built-in rendering
                image = render_page_to_image(page)

                # Save the image
                output_filename = f"page_{i + 1:03d}.png"
                output_path = os.path.join(args.output_dir, output_filename)
                image.save(output_path, 'PNG')

                # Extract and save text content for verification
                page_text = extract_text_from_page(page)
                text_filename = f"page_{i + 1:03d}.txt"
                text_path = os.path.join(args.output_dir, text_filename)
                with open(text_path, 'w', encoding='utf-8') as f:
                    f.write(page_text)

                print(f"Saved: {output_path} and {text_path}")

            except Exception as e:
                # A page that cannot be saved does not stop the remaining pages
                print(f"Error saving page {i + 1}: {e}")
                traceback.print_exc()
                continue

        print(f"\nCompleted! Rendered {len(pages)} pages to {args.output_dir}")

        # Calculate progress through the book (clamped so a position past the
        # last block is still reported as 100%)
        if all_blocks:
            fraction = min(current_position.block_index / len(all_blocks), 1.0)
            progress = fraction * 100
            print(f"Progress through book: {progress:.1f}%")

    except Exception as e:
        print(f"Error during pagination/rendering: {e}")
        traceback.print_exc()
        return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())