# pyWebLayout/scripts/epub_page_renderer.py

#!/usr/bin/env python3
"""
Simple EPUB page renderer tool.

This tool uses the pyWebLayout epub_reader and layout modules to:
1. Load an EPUB file
2. Render the first X pages according to command line arguments
3. Save the pages as PNG images, along with a plain-text dump of each page

Usage:
    python epub_page_renderer.py book.epub --pages 5 --output-dir rendered_pages
"""

import os
import sys
import argparse
from pathlib import Path
from typing import Optional, List

# Add the parent directory to sys.path to import pyWebLayout
sys.path.insert(0, str(Path(__file__).parent.parent))

try:
    from pyWebLayout.io.readers.epub_reader import read_epub
    from pyWebLayout.layout.ereader_layout import BidirectionalLayouter, RenderingPosition
    from pyWebLayout.layout.document_layouter import paragraph_layouter
    from pyWebLayout.concrete.page import Page
    from pyWebLayout.style.page_style import PageStyle
    from pyWebLayout.style.fonts import Font
    from pyWebLayout.abstract.block import Block
    from PIL import Image, ImageDraw
except ImportError as e:
    print(f"Error importing required modules: {e}")
    print("Make sure pyWebLayout is properly installed and PIL is available")
    sys.exit(1)


def render_page_to_image(page: Page) -> Image.Image:
    """
    Produce a PIL image for a page via the page's own ``render()`` method.

    When ``render()`` raises, or returns something that is not a PIL image,
    a white placeholder image of ``page.size`` carrying a red error message
    is returned instead. A warning is additionally printed when rendering
    raised an exception.

    Args:
        page: The Page object to render

    Returns:
        The rendered PIL Image, or a placeholder image describing the failure
    """
    def make_placeholder(message):
        # White canvas matching the page dimensions, with the message in red
        canvas = Image.new('RGB', page.size, 'white')
        ImageDraw.Draw(canvas).text((20, 20), message, fill='red')
        return canvas

    try:
        result = page.render()
        if not isinstance(result, Image.Image):
            # render() succeeded but handed back an unexpected type
            return make_placeholder("Error: Page.render() did not return PIL Image")
        return result

    except Exception as e:
        # Build the placeholder first, then report the problem on the console
        placeholder = make_placeholder(f"Rendering error: {str(e)}")
        print(f"Warning: Error rendering page: {e}")
        return placeholder


def extract_text_from_page(page: Page) -> str:
    """
    Extract text content from a Page object for verification purposes.

    The page's ``_children`` are walked recursively and turned into an
    indented, line-oriented dump: concrete lines are printed as their joined
    text, abstract paragraphs/headings/lists are labelled, tables and images
    are shown as placeholders, and any unrecognised object is shown by class
    name. Failures during extraction never propagate; they are appended to
    the returned text together with a traceback.

    Args:
        page: The Page object to extract text from

    Returns:
        String containing the page's text content (always starts with a
        ``=== PAGE CONTENT ===`` header followed by an empty line)
    """
    text_lines = ["=== PAGE CONTENT ===", ""]
    header_length = len(text_lines)

    def extract_text_from_paragraph(para_obj):
        """Join the words of a paragraph-like object into a single string."""
        try:
            if hasattr(para_obj, 'words_iter') and callable(para_obj.words_iter):
                # words_iter() yields pairs; only the second member is the word
                candidates = [word for _, word in para_obj.words_iter()]
            elif hasattr(para_obj, '_words'):
                # Fall back to direct access to the words list
                candidates = list(para_obj._words)
            else:
                candidates = []
            words = [
                word.text if hasattr(word, 'text') else str(word)
                for word in candidates
            ]
        except Exception as e:
            return f"(Text extraction error: {str(e)})"

        return ' '.join(words) if words else "(No text)"

    def extract_from_element(element, indent_level=0):
        """Append the textual representation of one element to text_lines."""
        indent = "  " * indent_level

        # Handle Line objects (concrete)
        if isinstance(element, Line):
            line_text = [
                str(text_obj.text)
                for text_obj in (getattr(element, '_text_objects', None) or ())
                if hasattr(text_obj, 'text')
            ]
            if line_text:
                text_lines.append(f"{indent}{' '.join(line_text)}")

        # Handle abstract block objects
        elif isinstance(element, (Paragraph, Heading)):
            paragraph_text = extract_text_from_paragraph(element)
            if paragraph_text:
                block_type = "HEADING" if isinstance(element, Heading) else "PARAGRAPH"
                text_lines.append(f"{indent}{block_type}: {paragraph_text}")

        elif isinstance(element, HList):
            text_lines.append(f"{indent}LIST:")
            try:
                for item in element.items():
                    item_text = extract_text_from_paragraph(item)
                    if item_text:
                        text_lines.append(f"{indent}  - {item_text}")
            except Exception:
                # Best effort only; deliberately not a bare ``except`` so that
                # KeyboardInterrupt/SystemExit are not swallowed here.
                text_lines.append(f"{indent}  (List content extraction failed)")

        elif isinstance(element, Table):
            text_lines.append(f"{indent}[TABLE]")

        elif isinstance(element, AbstractImage):
            alt_text = getattr(element, 'alt_text', '')
            src = getattr(element, 'source', 'Unknown')
            text_lines.append(f"{indent}[IMAGE: {alt_text or src}]")

        # Handle containers with children
        elif hasattr(element, '_children') and element._children:
            for child in element._children:
                extract_from_element(child, indent_level + 1)

        # Handle text elements
        elif hasattr(element, 'text'):
            text = str(element.text).strip()
            if text:
                text_lines.append(f"{indent}{text}")

        # Handle other object types by showing their class name
        else:
            text_lines.append(f"{indent}[{element.__class__.__name__}]")

    try:
        children = getattr(page, '_children', None)
        if children:
            # Imported once, and only when there is content to classify,
            # instead of on every recursive call. The abstract Image is
            # aliased so it does not shadow the module-level PIL Image.
            from pyWebLayout.abstract.block import Paragraph, Heading, HList, Table, Image as AbstractImage
            from pyWebLayout.concrete.text import Line

            for child in children:
                extract_from_element(child)

        # If no text was extracted, add a note
        if len(text_lines) <= header_length:
            text_lines.append("(No text content found)")

    except Exception as e:
        text_lines.append(f"Error extracting text: {str(e)}")
        import traceback
        text_lines.append(traceback.format_exc())

    return "\n".join(text_lines)


def get_all_blocks_from_book(book) -> List[Block]:
    """
    Flatten the blocks of every chapter of a book into a single list.

    Chapters are visited in the order given by ``book.chapters``; a chapter
    that has no ``_blocks`` attribute contributes nothing.

    Args:
        book: The Book object from epub_reader

    Returns:
        List of all Block objects, in chapter order
    """
    return [
        block
        for chapter in book.chapters
        if hasattr(chapter, '_blocks')
        for block in chapter._blocks
    ]


def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command line parser for the renderer tool."""
    parser = argparse.ArgumentParser(
        description='Render EPUB pages to images using pyWebLayout',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python epub_page_renderer.py book.epub --pages 5
  python epub_page_renderer.py book.epub --pages 10 --output-dir my_output --width 600 --height 800
        """
    )

    parser.add_argument(
        'epub_file',
        help='Path to the EPUB file to render'
    )
    parser.add_argument(
        '--pages', '-p',
        type=int,
        default=5,
        help='Number of pages to render (default: 5)'
    )
    parser.add_argument(
        '--output-dir', '-o',
        default='rendered_pages',
        help='Output directory for rendered images (default: rendered_pages)'
    )
    parser.add_argument(
        '--width', '-w',
        type=int,
        default=800,
        help='Page width in pixels (default: 800)'
    )
    # '-h' is taken by argparse's built-in help, hence '-t' for height
    parser.add_argument(
        '--height', '-t',
        type=int,
        default=1000,
        help='Page height in pixels (default: 1000)'
    )
    parser.add_argument(
        '--margin', '-m',
        type=int,
        default=40,
        help='Page margin in pixels (default: 40)'
    )
    parser.add_argument(
        '--align', '-a',
        choices=['left', 'justify'],
        default='left',
        help='Text alignment: left or justify (default: left)'
    )
    return parser


def _validate_args(args: argparse.Namespace) -> Optional[str]:
    """Return an error message for invalid arguments, or None if they are fine."""
    if not os.path.exists(args.epub_file):
        return f"Error: EPUB file '{args.epub_file}' not found"
    if args.pages <= 0:
        return "Error: Number of pages must be positive"
    if args.width <= 0 or args.height <= 0:
        return "Error: Page width and height must be positive"
    if args.margin < 0:
        return "Error: Page margin must not be negative"
    return None


def _render_pages(layouter, max_pages: int, total_blocks: int):
    """
    Lay out up to max_pages pages, starting at the beginning of the document.

    Returns a ``(pages, position)`` tuple where ``position`` is the rendering
    position reached after the last successfully rendered page.
    """
    import traceback

    pages = []
    position = RenderingPosition()  # Start from beginning

    for page_num in range(max_pages):
        print(f"Rendering page {page_num + 1}/{max_pages}...")

        try:
            page, next_position = layouter.render_page_forward(position)
            pages.append(page)

            # Advance BEFORE testing for the end of the document, so that the
            # returned position (used for the progress report) also accounts
            # for the final page.
            position = next_position

            if position.block_index >= total_blocks:
                print(f"Reached end of document after {page_num + 1} pages")
                break

        except Exception as e:
            print(f"Error rendering page {page_num + 1}: {e}")
            traceback.print_exc()
            break

    return pages, position


def _save_pages(pages, output_dir: str) -> int:
    """Write each page as PNG plus a text dump; return how many were saved."""
    import traceback

    saved = 0
    total = len(pages)

    for number, page in enumerate(pages, start=1):
        print(f"Saving page {number}/{total}...")

        try:
            # Create image from page using pyWebLayout's built-in rendering
            image = render_page_to_image(page)
            output_path = os.path.join(output_dir, f"page_{number:03d}.png")
            image.save(output_path, 'PNG')

            # Extract and save text content for verification
            page_text = extract_text_from_page(page)
            text_path = os.path.join(output_dir, f"page_{number:03d}.txt")
            with open(text_path, 'w', encoding='utf-8') as f:
                f.write(page_text)

            print(f"Saved: {output_path} and {text_path}")
            saved += 1

        except Exception as e:
            # Best effort: a page that cannot be saved does not stop the rest
            print(f"Error saving page {number}: {e}")
            traceback.print_exc()

    return saved


def main():
    """
    Main function to handle command line arguments and process the EPUB.

    Parses ``sys.argv``, loads the EPUB, lays out up to ``--pages`` pages and
    writes one PNG and one text dump per page into ``--output-dir``.

    Returns:
        Process exit code: 0 on success, 1 on invalid arguments, on any
        loading/layout failure, or when none of the pages could be saved.
    """
    import traceback

    args = _build_arg_parser().parse_args()

    # Validate arguments before touching the file system
    error = _validate_args(args)
    if error is not None:
        print(error)
        return 1

    # Create output directory
    try:
        os.makedirs(args.output_dir, exist_ok=True)
    except OSError as e:
        print(f"Error creating output directory: {e}")
        return 1

    print(f"Loading EPUB file: {args.epub_file}")

    # Load the EPUB file
    try:
        book = read_epub(args.epub_file)
        print(f"Successfully loaded EPUB: {book.get_title() or 'Unknown Title'}")

        # Print book information
        author = book.get_metadata('AUTHOR')
        if author:
            print(f"Author: {author}")

        print(f"Chapters: {len(book.chapters) if hasattr(book, 'chapters') else 'Unknown'}")

    except Exception as e:
        print(f"Error loading EPUB file: {e}")
        traceback.print_exc()
        return 1

    # Extract all blocks from the book
    print("Extracting content blocks...")
    try:
        all_blocks = get_all_blocks_from_book(book)
        print(f"Extracted {len(all_blocks)} content blocks")

        if not all_blocks:
            print("No content blocks found in EPUB. The book might be empty.")
            return 1

        from pyWebLayout.style.alignment import Alignment

        # The alignment is not applied to the blocks here; it is handed to
        # the layouter below as an override.
        alignment = Alignment.JUSTIFY if args.align == 'justify' else Alignment.LEFT
        print(f"Applying {args.align} alignment to all text blocks...")

    except Exception as e:
        print(f"Error extracting blocks: {e}")
        traceback.print_exc()
        return 1

    # Set up page style and layouter
    page_size = (args.width, args.height)
    page_style = PageStyle(
        background_color=(255, 255, 255),
        border_width=args.margin,
        border_color=(200, 200, 200),
        padding=(10, 10, 10, 10),  # top, right, bottom, left
        line_spacing=5,
        inter_block_spacing=15
    )

    print(f"Setting up layouter with page size {page_size} and {args.align} alignment")

    try:
        layouter = BidirectionalLayouter(
            blocks=all_blocks,
            page_style=page_style,
            page_size=page_size,
            alignment_override=alignment
        )
    except Exception as e:
        print(f"Error setting up layouter: {e}")
        traceback.print_exc()
        return 1

    # Render pages
    print(f"Rendering up to {args.pages} pages...")

    try:
        pages, position = _render_pages(layouter, args.pages, len(all_blocks))

        if not pages:
            print("No pages were generated.")
            return 1

        print(f"Generated {len(pages)} pages")

        # Save each page to an image and extract text
        saved_count = _save_pages(pages, args.output_dir)

        print(f"\nCompleted! Rendered {len(pages)} pages to {args.output_dir}")
        if saved_count < len(pages):
            print(f"Warning: {len(pages) - saved_count} page(s) could not be saved")

        # Progress through the book; all_blocks is known to be non-empty here.
        # Clamped so a position past the last block still reads as 100%.
        progress = min(position.block_index / len(all_blocks), 1.0) * 100
        print(f"Progress through book: {progress:.1f}%")

        if saved_count == 0:
            # Nothing usable was produced, so do not report success
            return 1

    except Exception as e:
        print(f"Error during pagination/rendering: {e}")
        traceback.print_exc()
        return 1

    return 0


if __name__ == "__main__":
    # Hand main()'s return value to the shell as the process exit status
    exit_status = main()
    sys.exit(exit_status)