Spaces:
Running
Running
File size: 2,355 Bytes
5827499 d5f7d0d af75cff d45f3e7 5827499 d5f7d0d 8be5494 5827499 d5f7d0d 5827499 d5f7d0d fff0f58 5827499 d5f7d0d 5827499 d5f7d0d 5827499 d5f7d0d 5827499 fff0f58 d5f7d0d d45f3e7 d5f7d0d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
import html
import tempfile
import uuid
from io import BytesIO

import gradio as gr
from ebooklib import epub
from olmocr.model import process_pdf # your forked olmocr model
from PIL import Image
def process_pdf_to_epub(pdf_file, title, author):
    """Convert an uploaded PDF into an EPUB file using the olmocr pipeline.

    Every page for which ``process_pdf`` yields non-blank text becomes one
    chapter titled "Page N"; pages without text are skipped.

    Args:
        pdf_file: The uploaded PDF, either a filesystem path string or an
            object exposing that path through a ``name`` attribute.
        title: EPUB title; "Untitled Document" is used when empty.
        author: Author name; left out of the metadata when empty.

    Returns:
        Filesystem path of the generated ``.epub`` file. A path (rather than
        an in-memory buffer) is returned because the result is handed to a
        ``gr.File`` output component.

    Raises:
        gr.Error: If no PDF was supplied.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF file before converting.")
    # Accept both a plain path and a file wrapper that carries ``.name``.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    # Run the OCR + LLM pipeline from olmocr
    print("Starting PDF processing...")
    page_results = process_pdf(pdf_path)

    # Create the EPUB book
    book = epub.EpubBook()
    # Give every generated book its own identifier instead of a shared
    # hard-coded one, so two conversions are never mistaken for the same book.
    book.set_identifier(f"urn:uuid:{uuid.uuid4()}")
    book.set_title(title if title else "Untitled Document")
    book.set_language("en")
    if author:
        book.add_author(author)

    # Best-effort cover: try to decode the upload as an image. The JPEG is
    # built in memory so concurrent requests cannot clobber a shared
    # "cover.jpg" on disk and nothing is left behind in the working directory.
    # NOTE(review): Pillow generally cannot decode PDF input, so this likely
    # falls through to the except branch; a PDF rasterizer would be needed
    # for a real first-page cover.
    try:
        cover_buffer = BytesIO()
        with Image.open(pdf_path) as img:
            img.convert("RGB").save(cover_buffer, "JPEG")
        book.set_cover("cover.jpg", cover_buffer.getvalue())
    except Exception as e:
        # Deliberately broad: a missing cover must never abort the conversion.
        print("Could not generate cover:", e)

    # Add chapters from pages
    chapters = []
    for page_number, page in enumerate(page_results, start=1):
        # Either level may be missing or None (e.g. a page the model could
        # not decode); treat all of those as "no text".
        decoded = page.get("decoded") or {}
        text = decoded.get("natural_text") or ""
        if not text.strip():
            continue
        # Escape markup characters so OCR output containing "<" or "&" cannot
        # break the XHTML, then turn line breaks into <br/>. Doing this outside
        # the f-string also avoids a backslash inside an f-string expression,
        # which is a syntax error before Python 3.12.
        body = html.escape(text).replace("\n", "<br/>")
        chapter = epub.EpubHtml(
            title=f"Page {page_number}",
            file_name=f"page_{page_number}.xhtml",
            lang="en",
        )
        chapter.content = f"<h1>Page {page_number}</h1><p>{body}</p>"
        book.add_item(chapter)
        chapters.append(chapter)

    # Define spine and table of contents
    book.toc = chapters
    book.spine = ["nav"] + chapters
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    # Reserve a unique .epub path, then close the handle before writing so
    # the file can be reopened by name on every platform.
    with tempfile.NamedTemporaryFile(suffix=".epub", delete=False) as tmp:
        output_path = tmp.name
    epub.write_epub(output_path, book)
    return output_path
# Gradio UI: upload a PDF plus optional title/author, then download the EPUB.
# NOTE(review): the source arrived without indentation; the nesting below
# (three inputs inside the Row, button and download slot beneath it) is the
# presumed layout -- confirm against the running app.
with gr.Blocks() as demo:
    gr.Markdown("# PDF to EPUB Converter\nPowered by `olmOCR`")
    with gr.Row():
        pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        title = gr.Textbox(label="EPUB Title", placeholder="Optional title")
        author = gr.Textbox(label="Author", placeholder="Optional author name")
    convert_button = gr.Button("Convert to EPUB")
    epub_output = gr.File(label="Download EPUB", file_types=[".epub"])
    # Clicking the button runs the converter on the three inputs; its return
    # value populates the download component.
    convert_button.click(
        fn=process_pdf_to_epub,
        inputs=[pdf_input, title, author],
        outputs=epub_output
    )
# Start the Gradio server only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    demo.launch()
|