File size: 2,355 Bytes
5827499
d5f7d0d
af75cff
d45f3e7
5827499
d5f7d0d
8be5494
5827499
d5f7d0d
 
 
5827499
d5f7d0d
fff0f58
5827499
d5f7d0d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5827499
d5f7d0d
 
 
 
 
 
5827499
 
 
d5f7d0d
 
 
5827499
fff0f58
 
d5f7d0d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d45f3e7
 
d5f7d0d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import html
import tempfile
import uuid
from io import BytesIO

import gradio as gr
from ebooklib import epub
from olmocr.model import process_pdf  # your forked olmocr model
from PIL import Image

def process_pdf_to_epub(pdf_file, title, author):
    """Convert an uploaded PDF into an EPUB and return the EPUB's file path.

    Each page returned by ``process_pdf`` that has non-blank text becomes
    one chapter named "Page N"; pages without text are skipped.

    Args:
        pdf_file: The uploaded PDF, either as a filesystem path string or
            as an object exposing the path through a ``.name`` attribute
            (Gradio supplies one or the other depending on its version
            and the ``gr.File`` ``type`` setting).
        title: Book title; falls back to "Untitled Document" when empty.
        author: Author name; no author is recorded when empty.

    Returns:
        Path (``str``) of a newly created temporary ``.epub`` file. The
        file is not removed automatically so that the ``gr.File`` output
        component, which expects a path, can serve it for download.

    Raises:
        ValueError: If no file was uploaded (``pdf_file`` is ``None``).
    """
    if pdf_file is None:
        raise ValueError("No PDF file was uploaded.")

    # Accept both a plain path string and a tempfile-like wrapper.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    # Run the OCR + LLM pipeline from olmocr
    print("Starting PDF processing...")
    page_results = process_pdf(pdf_path)

    # Create the EPUB book
    book = epub.EpubBook()
    # A fresh UUID per book; a constant identifier would make every
    # generated EPUB look like the same publication to reading software.
    book.set_identifier(f"urn:uuid:{uuid.uuid4()}")
    book.set_title(title if title else "Untitled Document")
    book.set_language("en")
    if author:
        book.add_author(author)

    # Best-effort cover. The JPEG is built in memory so concurrent requests
    # cannot clobber a shared "cover.jpg" in the working directory.
    # NOTE(review): Pillow has no PDF decoder, so for a genuine PDF this is
    # expected to fail and the book is simply produced without a cover;
    # rendering page 1 would require a PDF rasterizer.
    try:
        with Image.open(pdf_path) as img:
            cover_buffer = BytesIO()
            img.convert("RGB").save(cover_buffer, "JPEG")
        book.set_cover("cover.jpg", cover_buffer.getvalue())
    except Exception as e:
        print("Could not generate cover:", e)

    # Add chapters from pages
    chapters = []
    for page_number, page in enumerate(page_results, start=1):
        # Both "decoded" and "natural_text" may be missing or None.
        text = (page.get("decoded") or {}).get("natural_text") or ""
        if not text.strip():
            continue
        # Escape before inserting markup so OCR text containing "<" or "&"
        # cannot produce malformed XHTML. The replace is done outside the
        # f-string because a backslash inside an f-string expression is a
        # SyntaxError before Python 3.12.
        body = html.escape(text, quote=False).replace("\n", "<br/>")
        chapter = epub.EpubHtml(
            title=f"Page {page_number}",
            file_name=f"page_{page_number}.xhtml",
            lang="en",
        )
        chapter.content = f"<h1>Page {page_number}</h1><p>{body}</p>"
        book.add_item(chapter)
        chapters.append(chapter)

    # Define spine and table of contents
    book.toc = chapters
    book.spine = ["nav"] + chapters
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    # Reserve a unique .epub path, close the handle, then let ebooklib
    # write to it (closing first keeps this working on Windows).
    with tempfile.NamedTemporaryFile(suffix=".epub", delete=False) as tmp:
        output_path = tmp.name
    epub.write_epub(output_path, book)
    return output_path

# Build the web UI. Components are laid out in the order they are created:
# heading, upload row, metadata fields, convert button, download slot.
with gr.Blocks() as demo:
    gr.Markdown("# PDF to EPUB Converter\nPowered by `olmOCR`")

    # Source document
    with gr.Row():
        pdf_input = gr.File(
            label="Upload PDF",
            file_types=[".pdf"],
        )

    # Optional book metadata
    title = gr.Textbox(
        label="EPUB Title",
        placeholder="Optional title",
    )
    author = gr.Textbox(
        label="Author",
        placeholder="Optional author name",
    )

    # Trigger and result
    convert_button = gr.Button("Convert to EPUB")
    epub_output = gr.File(
        label="Download EPUB",
        file_types=[".epub"],
    )

    # Clicking the button runs the conversion on the three inputs and
    # hands its result to the download component.
    convert_button.click(
        process_pdf_to_epub,
        inputs=[pdf_input, title, author],
        outputs=epub_output,
    )

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()