Spaces:
Running
Running
File size: 2,146 Bytes
6a0411c d5f7d0d 6a0411c 5827499 6a0411c 8be5494 6a0411c 5827499 fff0f58 5827499 6a0411c d5f7d0d 6a0411c d5f7d0d 5827499 6a0411c d5f7d0d 6a0411c d5f7d0d 6a0411c a32e7b0 5827499 6a0411c 5827499 fff0f58 6a0411c fff0f58 6a0411c d5f7d0d 6a0411c d5f7d0d 6a0411c d45f3e7 6a0411c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
import html
import os
import shutil
import tempfile
import uuid
from pathlib import Path

import gradio as gr
from ebooklib import epub
from olmocr import process_pdf  # your forked olmocr model
def process_pdf_to_epub(pdf_path, title="Untitled", author="Unknown"):
    """Convert a PDF into an EPUB with one chapter per OCR result.

    The PDF is run through ``process_pdf``; every result that carries
    non-empty ``decoded_content["natural_text"]`` becomes an XHTML chapter
    titled "Page N", where N is the 1-based position of that result.

    Args:
        pdf_path: Path to the PDF as a string or ``os.PathLike``, or an
            upload object that exposes its path as ``.name``.
        title: Title stored in the EPUB metadata; falls back to "Untitled"
            when empty.
        author: Author stored in the EPUB metadata; falls back to "Unknown"
            when empty.

    Returns:
        Filesystem path of the generated ``.epub`` file under ``/tmp``.

    Raises:
        ValueError: If no PDF was supplied, or if no text could be extracted
            from any result.
    """
    # Normalise the input up front so a missing upload fails with a clear
    # message instead of an opaque error from inside the OCR pipeline.
    if isinstance(pdf_path, (str, os.PathLike)):
        source_path = os.fspath(pdf_path)
    else:
        source_path = getattr(pdf_path, "name", None)
    if not source_path:
        raise ValueError("No PDF file provided.")

    print(f"Processing PDF: {source_path}")

    book = epub.EpubBook()
    # Each generated book gets its own identifier rather than a shared constant.
    book.set_identifier(f"urn:uuid:{uuid.uuid4()}")
    book.set_title(title or "Untitled")
    book.set_language("en")
    book.add_author(author or "Unknown")

    chapters = []
    output_dir = tempfile.mkdtemp()
    try:
        results = process_pdf(source_path, output_dir)
        # Iterate inside the try so the scratch directory still exists if
        # `results` is produced lazily.
        for page_number, result in enumerate(results, start=1):
            decoded = result.get("decoded_content") or {}
            text = decoded.get("natural_text") or ""
            if not text:
                continue
            chapter = epub.EpubHtml(
                title=f"Page {page_number}",
                file_name=f"page_{page_number}.xhtml",
                lang="en",
            )
            # Escape markup characters before inserting the text into XHTML;
            # a raw "<" or "&" would make the chapter document ill-formed.
            safe_text = html.escape(text, quote=False).replace("\n", "<br/>")
            chapter.content = f"<h1>Page {page_number}</h1><p>{safe_text}</p>"
            book.add_item(chapter)
            chapters.append(chapter)
            print(f"Processed page {page_number}")
    finally:
        # The scratch directory is not needed once the text has been copied
        # into the book, so do not leave it behind on every conversion.
        shutil.rmtree(output_dir, ignore_errors=True)

    if not chapters:
        raise ValueError("No content extracted from PDF.")

    book.toc = tuple(chapters)
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())
    book.spine = ["nav"] + chapters

    # Reserve a unique file name, then close the handle before ebooklib
    # writes to that path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".epub", dir="/tmp") as tmp:
        epub_path = tmp.name
    epub.write_epub(epub_path, book)
    print(f"EPUB written to {epub_path}")
    return epub_path
# Gradio UI
# The input components are passed to the interface in the same order as the
# parameters of process_pdf_to_epub: (pdf_path, title, author).
file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
title_input = gr.Textbox(label="EPUB Title", value="Untitled")
author_input = gr.Textbox(label="Author", value="Unknown")
output_file = gr.File(label="Download EPUB")

iface = gr.Interface(
    fn=process_pdf_to_epub,
    inputs=[file_input, title_input, author_input],
    outputs=output_file,
    title="PDF to EPUB Converter with olmOCR",
    # The conversion does not set a cover image, so the description only
    # states what the conversion actually produces.
    description=(
        "Upload a PDF to convert it into an EPUB. "
        "Each page with extracted text becomes its own chapter."
    ),
)
if __name__ == "__main__":
    # Launch settings gathered in one place and handed to Gradio together.
    launch_options = {
        "server_name": "0.0.0.0",  # listen on all network interfaces
        "server_port": 7860,
        "share": True,
        "debug": True,
        # Generated EPUB files are written under /tmp, so that directory is
        # passed as an allowed path for the download component.
        "allowed_paths": ["/tmp"],
    }
    iface.launch(**launch_options)
|