olmocr-demo / app.py
leonarb's picture
Fixes EPUB output
d5f7d0d verified
raw
history blame
2.36 kB
import html
import tempfile
from io import BytesIO

import gradio as gr
from ebooklib import epub
from PIL import Image

from olmocr.model import process_pdf # your forked olmocr model
def process_pdf_to_epub(pdf_file, title, author):
    """Convert an uploaded PDF into an EPUB with one chapter per non-empty page.

    The PDF is passed to ``process_pdf``; every page result that carries
    non-blank text becomes an XHTML chapter named after its page number.

    Args:
        pdf_file: Uploaded file object. Only its ``.name`` attribute is read
            and it is used as a filesystem path.
        title: Book title. A falsy value falls back to "Untitled Document".
        author: Author name. No author is recorded when this is falsy.

    Returns:
        A ``BytesIO`` holding the serialized EPUB, rewound to offset 0.
    """
    # Run the OCR + LLM pipeline from olmocr
    print("Starting PDF processing...")
    page_results = process_pdf(pdf_file.name)

    # Create the EPUB book
    book = epub.EpubBook()
    book.set_identifier("id123456")
    book.set_title(title if title else "Untitled Document")
    book.set_language("en")
    if author:
        book.add_author(author)

    # Best-effort cover. The image is encoded in memory instead of going
    # through a fixed "cover.jpg" in the working directory, which would be
    # shared (and overwritten) by concurrent requests and never cleaned up.
    # NOTE(review): this hands the PDF path straight to PIL; if PIL cannot
    # read it the except branch runs and the book simply has no cover.
    try:
        cover_buffer = BytesIO()
        with Image.open(pdf_file.name) as img:
            img.convert("RGB").save(cover_buffer, "JPEG")
        book.set_cover("cover.jpg", cover_buffer.getvalue())
    except Exception as e:
        print("Could not generate cover:", e)

    # Add chapters from pages
    chapters = []
    for page_number, page in enumerate(page_results, start=1):
        # `or` also covers keys that are present but set to None, which
        # a plain .get(key, default) would pass through and crash on.
        decoded = page.get("decoded") or {}
        text = decoded.get("natural_text") or ""
        if not text.strip():
            continue

        # Escape &, < and > so recognized text cannot produce malformed
        # XHTML, then turn line breaks into <br/>. The markup is built outside
        # the f-string because a backslash inside an f-string expression is a
        # SyntaxError before Python 3.12.
        body = html.escape(text, quote=False).replace("\n", "<br/>")

        chapter = epub.EpubHtml(
            title=f"Page {page_number}",
            file_name=f"page_{page_number}.xhtml",
            lang="en",
        )
        chapter.content = f"<h1>Page {page_number}</h1><p>{body}</p>"
        book.add_item(chapter)
        chapters.append(chapter)

    # Define spine and table of contents
    book.toc = chapters
    book.spine = ["nav"] + chapters
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    # Write to in-memory buffer
    output_buffer = BytesIO()
    epub.write_epub(output_buffer, book)
    output_buffer.seek(0)
    # NOTE(review): the return value feeds a gr.File output, which is
    # documented as taking a file path -- confirm a BytesIO is accepted, or
    # write to a tempfile and return its path instead.
    return output_buffer
# UI definition: components are created in display order inside the Blocks
# context, then the button is wired to the conversion function.
with gr.Blocks() as demo:
    gr.Markdown("# PDF to EPUB Converter\nPowered by `olmOCR`")

    # NOTE(review): assumes the upload widget and both text fields share
    # this row -- confirm against the intended layout.
    with gr.Row():
        pdf_input = gr.File(file_types=[".pdf"], label="Upload PDF")
        title = gr.Textbox(placeholder="Optional title", label="EPUB Title")
        author = gr.Textbox(placeholder="Optional author name", label="Author")

    convert_button = gr.Button("Convert to EPUB")
    epub_output = gr.File(file_types=[".epub"], label="Download EPUB")

    # Clicking the button runs the conversion on the three inputs and sends
    # its return value to the download component.
    convert_button.click(
        process_pdf_to_epub,
        inputs=[pdf_input, title, author],
        outputs=epub_output,
    )

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()