432 lines
14 KiB
Python
Executable File
432 lines
14 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Simple EPUB page renderer tool.
|
|
|
|
This tool uses the pyWebLayout epub_reader and layout modules to:
|
|
1. Load an EPUB file
|
|
2. Render the first X pages according to command line arguments
|
|
3. Save the pages as PNG images
|
|
|
|
Usage:
|
|
python epub_page_renderer.py book.epub --pages 5 --output-dir rendered_pages
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import argparse
|
|
from pathlib import Path
|
|
from typing import List
|
|
|
|
# Add the parent directory to sys.path to import pyWebLayout
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
try:
|
|
from pyWebLayout.io.readers.epub_reader import read_epub
|
|
from pyWebLayout.layout.ereader_layout import BidirectionalLayouter, RenderingPosition
|
|
from pyWebLayout.concrete.page import Page
|
|
from pyWebLayout.style.page_style import PageStyle
|
|
from pyWebLayout.abstract.block import Block
|
|
from PIL import Image, ImageDraw
|
|
except ImportError as e:
|
|
print(f"Error importing required modules: {e}")
|
|
print("Make sure pyWebLayout is properly installed and PIL is available")
|
|
sys.exit(1)
|
|
|
|
|
|
def render_page_to_image(page: Page) -> Image.Image:
    """
    Turn a Page into a PIL Image via the page's own ``render()`` method.

    If ``render()`` raises, or returns something that is not a PIL Image,
    a white placeholder image of ``page.size`` carrying a red error message
    is returned instead, so the caller always receives an image.

    Args:
        page: The Page object to render

    Returns:
        The rendered PIL Image, or a placeholder image describing the failure
    """

    def placeholder(message):
        # White canvas the size of the page with the message drawn in red.
        canvas = Image.new('RGB', page.size, 'white')
        ImageDraw.Draw(canvas).text((20, 20), message, fill='red')
        return canvas

    try:
        result = page.render()
        if not isinstance(result, Image.Image):
            # Built inside the try so a failure here is reported by the
            # handler below, exactly like a failure of render() itself.
            return placeholder("Error: Page.render() did not return PIL Image")
        return result
    except Exception as exc:
        fallback = placeholder(f"Rendering error: {str(exc)}")
        print(f"Warning: Error rendering page: {exc}")
        return fallback
|
|
|
|
|
|
def extract_text_from_page(page: Page) -> str:
    """
    Extract text content from a Page object for verification purposes.

    The result starts with a "=== PAGE CONTENT ===" header and an empty line,
    followed by one entry per element found under ``page._children``; nested
    containers are walked recursively and indented by their depth. Any
    exception raised during the walk is not propagated: its message and
    traceback are appended to the returned text instead.

    Args:
        page: The Page object to extract text from

    Returns:
        String containing the page's text content
    """
    text_lines = ["=== PAGE CONTENT ===", ""]
    header_length = len(text_lines)

    def extract_text_from_paragraph(para_obj):
        """Join the words of a paragraph-like object into a single string."""
        try:
            if hasattr(para_obj, 'words_iter') and callable(para_obj.words_iter):
                # words_iter() yields pairs; only the second item is the word.
                word_source = (word for _, word in para_obj.words_iter())
            elif hasattr(para_obj, '_words'):
                # Direct access to words list
                word_source = para_obj._words
            else:
                word_source = ()

            # Always coerce to str so the join below cannot fail on a word
            # whose ``text`` attribute is not a string.
            words = [
                str(word.text) if hasattr(word, 'text') else str(word)
                for word in word_source
            ]
        except Exception as e:
            return f"(Text extraction error: {str(e)})"

        return ' '.join(words) if words else "(No text)"

    try:
        # Imported once here rather than on every recursive call below.
        from pyWebLayout.abstract.block import Paragraph, Heading, HList, Table, Image as AbstractImage
        from pyWebLayout.concrete.text import Line

        def extract_from_element(element, indent_level=0):
            """Append a textual description of ``element`` to text_lines."""
            indent = " " * indent_level

            # Handle Line objects (concrete)
            if isinstance(element, Line):
                text_objects = getattr(element, '_text_objects', None) or ()
                line_text = [
                    str(text_obj.text)
                    for text_obj in text_objects
                    if hasattr(text_obj, 'text')
                ]
                if line_text:
                    text_lines.append(f"{indent}{' '.join(line_text)}")

            # Handle abstract block objects
            elif isinstance(element, (Paragraph, Heading)):
                paragraph_text = extract_text_from_paragraph(element)
                if paragraph_text:
                    block_type = "HEADING" if isinstance(element, Heading) else "PARAGRAPH"
                    text_lines.append(f"{indent}{block_type}: {paragraph_text}")

            elif isinstance(element, HList):
                text_lines.append(f"{indent}LIST:")
                # Best effort: a failing list must not abort the whole page.
                try:
                    for item in element.items():
                        item_text = extract_text_from_paragraph(item)
                        if item_text:
                            text_lines.append(f"{indent} - {item_text}")
                except Exception:
                    text_lines.append(f"{indent} (List content extraction failed)")

            elif isinstance(element, Table):
                text_lines.append(f"{indent}[TABLE]")

            elif isinstance(element, AbstractImage):
                alt_text = getattr(element, 'alt_text', '')
                src = getattr(element, 'source', 'Unknown')
                text_lines.append(f"{indent}[IMAGE: {alt_text or src}]")

            # Handle containers with children
            elif hasattr(element, '_children') and element._children:
                for child in element._children:
                    extract_from_element(child, indent_level + 1)

            # Handle text elements
            elif hasattr(element, 'text'):
                text = str(element.text).strip()
                if text:
                    text_lines.append(f"{indent}{text}")

            # Handle other object types by showing their class name
            else:
                class_name = element.__class__.__name__
                text_lines.append(f"{indent}[{class_name}]")

        # Extract text from page children
        if hasattr(page, '_children'):
            for child in page._children:
                extract_from_element(child)

        # Nothing beyond the header lines means nothing was extracted.
        if len(text_lines) <= header_length:
            text_lines.append("(No text content found)")

    except Exception as e:
        text_lines.append(f"Error extracting text: {str(e)}")
        import traceback
        text_lines.append(traceback.format_exc())

    return "\n".join(text_lines)
|
|
|
|
|
|
def get_all_blocks_from_book(book) -> List[Block]:
    """
    Collect the blocks of every chapter of the book into one flat list.

    Chapters are visited in the order of ``book.chapters``; a chapter that
    has no ``_blocks`` attribute contributes nothing.

    Args:
        book: The Book object from epub_reader

    Returns:
        List of all Block objects, in chapter order
    """
    return [
        block
        for chapter in book.chapters
        if hasattr(chapter, '_blocks')
        for block in chapter._blocks
    ]
|
|
|
|
|
|
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command line parser for the page renderer."""
    parser = argparse.ArgumentParser(
        description='Render EPUB pages to images using pyWebLayout',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python epub_page_renderer.py book.epub --pages 5
  python epub_page_renderer.py book.epub --pages 10 --output-dir my_output --width 600 --height 800
        """
    )

    parser.add_argument(
        'epub_file',
        help='Path to the EPUB file to render'
    )

    parser.add_argument(
        '--pages', '-p',
        type=int,
        default=5,
        help='Number of pages to render (default: 5)'
    )

    parser.add_argument(
        '--output-dir', '-o',
        default='rendered_pages',
        help='Output directory for rendered images (default: rendered_pages)'
    )

    parser.add_argument(
        '--width', '-w',
        type=int,
        default=800,
        help='Page width in pixels (default: 800)'
    )

    parser.add_argument(
        '--height', '-t',
        type=int,
        default=1000,
        help='Page height in pixels (default: 1000)'
    )

    parser.add_argument(
        '--margin', '-m',
        type=int,
        default=40,
        help='Page margin in pixels (default: 40)'
    )

    parser.add_argument(
        '--align', '-a',
        choices=['left', 'justify'],
        default='left',
        help='Text alignment: left or justify (default: left)'
    )

    return parser


def main():
    """
    Main function to handle command line arguments and process the EPUB.

    Steps: parse and validate the arguments, create the output directory,
    load the EPUB, collect its blocks, lay out up to ``--pages`` pages and
    write each page as ``page_NNN.png`` plus a ``page_NNN.txt`` text dump
    into the output directory.

    Returns:
        0 when the run completed, 1 on invalid arguments or when loading,
        layout setup or pagination failed. A failure to save an individual
        page is reported and skipped; it does not change the return value.
    """
    # Only used for diagnostics in the error paths below.
    import traceback

    args = _build_arg_parser().parse_args()

    # Validate arguments
    if not os.path.exists(args.epub_file):
        print(f"Error: EPUB file '{args.epub_file}' not found")
        return 1

    if args.pages <= 0:
        print("Error: Number of pages must be positive")
        return 1

    if args.width <= 0 or args.height <= 0:
        print("Error: Page width and height must be positive")
        return 1

    if args.margin < 0:
        print("Error: Page margin must not be negative")
        return 1

    # Create output directory
    try:
        os.makedirs(args.output_dir, exist_ok=True)
    except OSError as e:
        print(f"Error creating output directory: {e}")
        return 1

    print(f"Loading EPUB file: {args.epub_file}")

    # Load the EPUB file
    try:
        book = read_epub(args.epub_file)
        print(f"Successfully loaded EPUB: {book.get_title() or 'Unknown Title'}")

        # Print book information
        author = book.get_metadata('AUTHOR')
        if author:
            print(f"Author: {author}")

        print(f"Chapters: {len(book.chapters) if hasattr(book, 'chapters') else 'Unknown'}")

    except Exception as e:
        print(f"Error loading EPUB file: {e}")
        traceback.print_exc()
        return 1

    # Extract all blocks from the book
    print("Extracting content blocks...")
    try:
        all_blocks = get_all_blocks_from_book(book)
        print(f"Extracted {len(all_blocks)} content blocks")

        if not all_blocks:
            print("No content blocks found in EPUB. The book might be empty.")
            return 1

        from pyWebLayout.style.alignment import Alignment

        # The alignment is not applied to the blocks here; it is handed to
        # the layouter below through its alignment_override argument.
        alignment = Alignment.JUSTIFY if args.align == 'justify' else Alignment.LEFT
        print(f"Applying {args.align} alignment to all text blocks...")

    except Exception as e:
        print(f"Error extracting blocks: {e}")
        traceback.print_exc()
        return 1

    # Non-zero from here on: the empty case returned above.
    total_blocks = len(all_blocks)

    # Set up page style and layouter
    page_size = (args.width, args.height)
    page_style = PageStyle(
        background_color=(255, 255, 255),
        border_width=args.margin,  # the --margin value is passed as the border width
        border_color=(200, 200, 200),
        padding=(10, 10, 10, 10),  # top, right, bottom, left
        line_spacing=5,
        inter_block_spacing=15
    )

    print(f"Setting up layouter with page size {page_size} and {args.align} alignment")

    try:
        layouter = BidirectionalLayouter(
            blocks=all_blocks,
            page_style=page_style,
            page_size=page_size,
            alignment_override=alignment
        )
    except Exception as e:
        print(f"Error setting up layouter: {e}")
        traceback.print_exc()
        return 1

    # Render pages
    print(f"Rendering up to {args.pages} pages...")

    try:
        pages = []
        current_position = RenderingPosition()  # Start from beginning

        for page_num in range(args.pages):
            print(f"Rendering page {page_num + 1}/{args.pages}...")

            try:
                page, next_position = layouter.render_page_forward(current_position)
                pages.append(page)

                # Advance before the end-of-document check so that the
                # progress reported at the end reflects the last page
                # actually rendered, including when the book is finished.
                current_position = next_position

                if current_position.block_index >= total_blocks:
                    print(f"Reached end of document after {page_num + 1} pages")
                    break

            except Exception as e:
                # Keep the pages rendered so far and stop paginating.
                print(f"Error rendering page {page_num + 1}: {e}")
                traceback.print_exc()
                break

        if not pages:
            print("No pages were generated.")
            return 1

        print(f"Generated {len(pages)} pages")

        # Save each page to an image and extract text
        for i, page in enumerate(pages):
            print(f"Saving page {i + 1}/{len(pages)}...")

            try:
                # Create image from page using pyWebLayout's built-in rendering
                image = render_page_to_image(page)

                # Save the image
                output_filename = f"page_{i + 1:03d}.png"
                output_path = os.path.join(args.output_dir, output_filename)
                image.save(output_path, 'PNG')

                # Extract and save text content for verification
                page_text = extract_text_from_page(page)
                text_filename = f"page_{i + 1:03d}.txt"
                text_path = os.path.join(args.output_dir, text_filename)
                with open(text_path, 'w', encoding='utf-8') as f:
                    f.write(page_text)

                print(f"Saved: {output_path} and {text_path}")

            except Exception as e:
                # A page that cannot be saved is skipped; the rest continue.
                print(f"Error saving page {i + 1}: {e}")
                traceback.print_exc()
                continue

        print(f"\nCompleted! Rendered {len(pages)} pages to {args.output_dir}")

        # Progress through the book, capped at 100% in case the final
        # position's block index lies past the last block.
        progress = min(current_position.block_index / total_blocks, 1.0) * 100
        print(f"Progress through book: {progress:.1f}%")

    except Exception as e:
        print(f"Error during pagination/rendering: {e}")
        traceback.print_exc()
        return 1

    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the tool and hand main()'s return value to the shell as the
    # process exit status.
    exit_status = main()
    sys.exit(exit_status)
|