File size: 2,146 Bytes
6a0411c
d5f7d0d
6a0411c
 
 
5827499
6a0411c
8be5494
6a0411c
 
 
 
5827499
fff0f58
5827499
6a0411c
d5f7d0d
6a0411c
d5f7d0d
5827499
6a0411c
 
 
 
d5f7d0d
6a0411c
d5f7d0d
6a0411c
 
a32e7b0
5827499
 
 
6a0411c
 
 
 
 
 
5827499
fff0f58
6a0411c
fff0f58
6a0411c
 
 
 
d5f7d0d
6a0411c
 
 
 
d5f7d0d
6a0411c
 
 
 
 
 
 
 
 
d45f3e7
 
6a0411c
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import html
import os
import tempfile
import uuid
from pathlib import Path

import gradio as gr
from ebooklib import epub
from olmocr import process_pdf  # your forked olmocr model

def process_pdf_to_epub(pdf_path, title="Untitled", author="Unknown"):
    """Convert a PDF into an EPUB with one chapter per page of extracted text.

    The PDF is handed to ``process_pdf``. Every result whose
    ``decoded_content["natural_text"]`` is non-empty becomes an XHTML chapter
    titled ``Page N``, where N is the result's 1-based position in the
    sequence returned by ``process_pdf``. Results without text are skipped.

    Args:
        pdf_path: Path to the PDF (``str`` or ``os.PathLike``), or an upload
            object that exposes the path as its ``name`` attribute.
        title: Title stored in the EPUB metadata; falls back to "Untitled"
            when empty.
        author: Author stored in the EPUB metadata; falls back to "Unknown"
            when empty.

    Returns:
        The path of the generated ``.epub`` file, created under ``/tmp``.

    Raises:
        ValueError: If no PDF was supplied, or no text could be extracted.
    """
    if pdf_path is None:
        raise ValueError("No PDF file provided.")
    if not isinstance(pdf_path, (str, os.PathLike)):
        # Upload widgets may pass a temp-file wrapper rather than a path.
        # (Path objects are excluded above: their ``.name`` is only the
        # final path component, not the full path.)
        pdf_path = getattr(pdf_path, "name", pdf_path)

    print(f"Processing PDF: {pdf_path}")

    book = epub.EpubBook()
    # A fresh identifier per book: a shared constant makes distinct EPUBs
    # indistinguishable to reading systems that key on the identifier.
    book.set_identifier(f"urn:uuid:{uuid.uuid4()}")
    book.set_title(title or "Untitled")
    book.set_language("en")
    book.add_author(author or "Unknown")

    chapters = []

    # The scratch directory is only needed while the results are consumed;
    # the context manager removes it afterwards instead of leaking it.
    with tempfile.TemporaryDirectory() as output_dir:
        for i, result in enumerate(process_pdf(pdf_path, output_dir)):
            # ``or {}`` also covers an explicit ``decoded_content: None``.
            decoded = result.get("decoded_content") or {}
            text = decoded.get("natural_text", "")
            if not text:
                continue

            page_number = i + 1
            chapter = epub.EpubHtml(
                title=f"Page {page_number}",
                file_name=f"page_{page_number}.xhtml",
                lang="en",
            )
            # Escape first so characters such as "<" and "&" in the extracted
            # text cannot produce malformed XHTML, then turn newlines into
            # line breaks.
            safe_text = html.escape(text).replace("\n", "<br/>")
            chapter.content = f"<h1>Page {page_number}</h1><p>{safe_text}</p>"
            book.add_item(chapter)
            chapters.append(chapter)

            print(f"Processed page {page_number}")

    if not chapters:
        raise ValueError("No content extracted from PDF.")

    book.toc = tuple(chapters)
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())
    book.spine = ["nav"] + chapters

    # Reserve a unique file name, then close the handle before the EPUB
    # writer opens the same path itself.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".epub", dir="/tmp") as tmp:
        epub_path = tmp.name

    epub.write_epub(epub_path, book)
    print(f"EPUB written to {epub_path}")
    return epub_path

# Gradio UI: a PDF upload plus title/author text boxes in, one EPUB file out.
title_input = gr.Textbox(label="EPUB Title", value="Untitled")
author_input = gr.Textbox(label="Author", value="Unknown")
file_input = gr.File(label="Upload PDF", file_types=[".pdf"])

output_file = gr.File(label="Download EPUB")

iface = gr.Interface(
    fn=process_pdf_to_epub,
    # Order matches the function signature: (pdf_path, title, author).
    inputs=[file_input, title_input, author_input],
    outputs=output_file,
    title="PDF to EPUB Converter with olmOCR",
    # The converter does not set a cover image, so the description only
    # promises what it actually produces: one chapter per page with text.
    description=(
        "Upload a PDF to convert it into an EPUB. "
        "Each page with extracted text becomes a chapter."
    ),
)

if __name__ == "__main__":
    # Launch settings gathered in one mapping so they can be read at a glance.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": True,
        "debug": True,
        # process_pdf_to_epub writes its EPUB under /tmp, so that directory
        # is listed as a path Gradio may serve files from.
        "allowed_paths": ["/tmp"],
    }
    iface.launch(**launch_options)