# Spaces: Running
import html
import tempfile
import uuid
from io import BytesIO

import gradio as gr
from ebooklib import epub
from PIL import Image

from olmocr.model import process_pdf  # your forked olmocr model
def process_pdf_to_epub(pdf_file, title, author):
    """Convert an uploaded PDF into an EPUB and return the path of the result.

    The PDF is run through ``process_pdf`` (the olmocr OCR + LLM pipeline);
    every page that yields non-blank text becomes one EPUB chapter.

    Args:
        pdf_file: The upload handed over by the Gradio ``File`` input. Either
            an object exposing the on-disk path as ``.name`` or a plain path
            string is accepted.
        title: Book title; blank or missing falls back to
            ``"Untitled Document"``.
        author: Author name; omitted from the metadata when blank.

    Returns:
        Filesystem path (``str``) of the generated ``.epub``. The file is
        created with ``delete=False`` and is not removed by this function.

    Raises:
        gr.Error: If no file was uploaded.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF file before converting.")

    # Accept both a tempfile-like wrapper (``.name``) and a bare path string.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    # Run the OCR + LLM pipeline from olmocr
    print("Starting PDF processing...")
    page_results = process_pdf(pdf_path)

    # Create the EPUB book
    book = epub.EpubBook()
    # A fresh identifier per book; a shared constant would make every
    # generated EPUB claim to be the same publication.
    book.set_identifier(f"urn:uuid:{uuid.uuid4()}")
    clean_title = (title or "").strip()
    book.set_title(clean_title if clean_title else "Untitled Document")
    book.set_language("en")
    clean_author = (author or "").strip()
    if clean_author:
        book.add_author(clean_author)

    # Best-effort cover: only succeeds if Pillow can decode the upload as an
    # image (it has no PDF reader), so failure here is expected and must not
    # abort the conversion. The JPEG is encoded in memory so concurrent
    # requests never share a file in the working directory.
    try:
        with Image.open(pdf_path) as img:
            cover_buffer = BytesIO()
            img.convert("RGB").save(cover_buffer, "JPEG")
        book.set_cover("cover.jpg", cover_buffer.getvalue())
    except Exception as e:
        print("Could not generate cover:", e)

    # Add chapters from pages
    chapters = []
    for page_number, page in enumerate(page_results, start=1):
        # NOTE(review): assumes each page is a dict shaped like
        # {"decoded": {"natural_text": ...}}; None at either level is treated
        # as "no text" -- confirm against process_pdf's actual output.
        text = (page.get("decoded") or {}).get("natural_text") or ""
        if not text.strip():
            continue
        # Escape first so OCR output containing <, > or & cannot break or
        # inject markup, then turn line breaks into <br/>. Done outside the
        # f-string because backslashes in f-string expressions are a syntax
        # error before Python 3.12.
        body = html.escape(text).replace("\n", "<br/>")
        chapter = epub.EpubHtml(
            title=f"Page {page_number}",
            file_name=f"page_{page_number}.xhtml",
            lang="en",
        )
        chapter.content = f"<h1>Page {page_number}</h1><p>{body}</p>"
        book.add_item(chapter)
        chapters.append(chapter)

    # Define spine and table of contents
    book.toc = chapters
    book.spine = ["nav"] + chapters
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    # The Gradio File output needs a path on disk, not an in-memory buffer.
    # Reserve a unique name, close the handle, then let ebooklib write to it.
    with tempfile.NamedTemporaryFile(suffix=".epub", delete=False) as tmp:
        output_path = tmp.name
    epub.write_epub(output_path, book)
    return output_path
# Gradio front end: three inputs, one button, one downloadable result.
# NOTE(review): the nesting below is reconstructed from a flattened source;
# the three input widgets are assumed to belong to the Row.
with gr.Blocks() as demo:
    gr.Markdown("# PDF to EPUB Converter\nPowered by `olmOCR`")

    with gr.Row():
        pdf_input = gr.File(file_types=[".pdf"], label="Upload PDF")
        title = gr.Textbox(placeholder="Optional title", label="EPUB Title")
        author = gr.Textbox(placeholder="Optional author name", label="Author")

    convert_button = gr.Button("Convert to EPUB")
    epub_output = gr.File(file_types=[".epub"], label="Download EPUB")

    # Clicking the button feeds the three inputs to the converter and routes
    # its return value to the download component.
    convert_button.click(
        process_pdf_to_epub,
        [pdf_input, title, author],
        epub_output,
    )

# Start the app only when executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()