File size: 4,372 Bytes
e7faf36
 
80deca1
 
 
 
 
 
 
e7faf36
afbaa03
19918ea
afbaa03
8be5494
d45f3e7
8be5494
 
 
 
afbaa03
 
 
1e4eb1b
8be5494
 
afbaa03
8be5494
afbaa03
 
 
 
8be5494
 
afbaa03
 
 
 
8be5494
19918ea
8be5494
 
 
 
 
afbaa03
8be5494
 
 
19918ea
afbaa03
 
 
8be5494
d45f3e7
8be5494
afbaa03
8be5494
 
afbaa03
8be5494
 
d45f3e7
8be5494
afbaa03
 
 
0225b8c
 
afbaa03
 
 
 
f01e8a4
afbaa03
 
0225b8c
afbaa03
f01e8a4
 
 
afbaa03
f01e8a4
afbaa03
 
 
 
0225b8c
afbaa03
 
 
 
 
 
 
f01e8a4
afbaa03
 
 
 
 
f01e8a4
afbaa03
 
f01e8a4
afbaa03
 
 
f01e8a4
d45f3e7
afbaa03
 
 
8be5494
afbaa03
 
 
 
8be5494
f01e8a4
afbaa03
 
 
d45f3e7
 
 
683e2df
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import os

# Redirect the Hugging Face and Torch caches to a directory under /tmp.
# This has to run before the `transformers` / `torch` imports below so that
# the libraries see the environment variables when they are imported.
cache_dir = "/tmp/huggingface_cache"

# Make sure the directory exists, then point both caches at it.
os.makedirs(cache_dir, exist_ok=True)
os.environ.update({"HF_HOME": cache_dir, "TORCH_HOME": cache_dir})

import gradio as gr
import torch
from PyPDF2 import PdfReader
from io import BytesIO
from PIL import Image
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts import build_finetuning_prompt
from olmocr.prompts.anchor import get_anchor_text
from ebooklib import epub
import base64
import tempfile



# Load the olmOCR model once at import time.
# A CUDA device is used when available (with bfloat16 weights); otherwise the
# model stays on the CPU with float32 weights.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "allenai/olmOCR-7B-0225-preview",
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
)
# Inference only: switch to eval mode, then move the weights to the device.
model = model.eval().to(device)

# The processor (chat template, tokenizer and image handling used by
# `ocr_page`) is loaded from the Qwen2-VL instruct checkpoint.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")


def _extract_natural_text(raw_output):
    """Return the page text contained in an olmOCR model response.

    The olmOCR prompt asks the model to answer with a JSON object whose
    ``natural_text`` field holds the transcribed text (see the olmOCR README).
    When ``raw_output`` is such an object, that field is returned (``""`` if it
    is null, which the model uses for pages without text). Anything else -- for
    example JSON cut off by the ``max_new_tokens`` limit -- is returned
    unchanged so no generated text is lost.
    """
    import json

    try:
        payload = json.loads(raw_output)
    except (TypeError, ValueError):
        # Not valid JSON (json.JSONDecodeError is a ValueError): keep raw text.
        return raw_output

    if isinstance(payload, dict) and "natural_text" in payload:
        return payload["natural_text"] or ""
    return raw_output


def ocr_page(pdf_path, page_num):
    """Run olmOCR on a single PDF page and return its text.

    Args:
        pdf_path: Filesystem path of the PDF document.
        page_num: Zero-based page index; ``page_num + 1`` is what gets passed
            to the olmocr rendering / anchor-text helpers.

    Returns:
        The text recognised on the page as a ``str``. If the model response is
        a JSON object with a ``natural_text`` field, only that field is
        returned; otherwise the decoded model output is returned as-is. An
        empty string is returned when nothing was decoded.
    """
    page_number = page_num + 1

    # Render the page to a base64 PNG and collect the anchor text that is
    # embedded into the prompt.
    image_b64 = render_pdf_to_base64png(pdf_path, page_number, target_longest_image_dim=1024)
    anchor_text = get_anchor_text(pdf_path, page_number, pdf_engine="pdfreport", target_length=4000)
    prompt = build_finetuning_prompt(anchor_text)

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
            ],
        }
    ]

    prompt_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    main_image = Image.open(BytesIO(base64.b64decode(image_b64)))
    inputs = processor(text=[prompt_text], images=[main_image], return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            temperature=0.8,
            max_new_tokens=1024,
            do_sample=True,
        )

    # `generate` returns prompt + completion; keep only the newly generated part.
    prompt_len = inputs["input_ids"].shape[1]
    new_tokens = outputs[:, prompt_len:]
    decoded = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
    if not decoded:
        return ""
    return _extract_natural_text(decoded[0])


def convert_pdf_to_epub(pdf_file, title, author, language):
    """Convert a PDF into an EPUB whose chapters are the OCR text of each page.

    Args:
        pdf_file: The PDF to convert. Either a filesystem path (``str`` or
            ``os.PathLike``) or a binary file-like object exposing ``read()``.
        title: Title stored in the EPUB metadata.
        author: Author stored in the EPUB metadata.
        language: Language code stored in the EPUB metadata and on each chapter.

    Returns:
        A ``(epub_path, epub_bytes)`` tuple: the path of the written EPUB file
        and its raw contents.

    Raises:
        ValueError: If no PDF was supplied or the PDF contains no pages.
    """
    import html

    if pdf_file is None:
        raise ValueError("No PDF file was provided.")

    # Path-like inputs are used in place; file-like inputs are copied to a
    # temporary file because the rendering helpers work on paths.
    owns_pdf_copy = False
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = os.fspath(pdf_file)
    else:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
            tmp_pdf.write(pdf_file.read())
            pdf_path = tmp_pdf.name
        owns_pdf_copy = True

    try:
        num_pages = len(PdfReader(pdf_path).pages)
        if num_pages == 0:
            raise ValueError("The PDF contains no pages.")

        # Create EPUB book
        book = epub.EpubBook()
        book.set_title(title)
        book.add_author(author)
        book.set_language(language)

        # Use first page as cover. The renderer returns PNG data, so the cover
        # file gets a .png name to match its actual content.
        cover_image_b64 = render_pdf_to_base64png(pdf_path, 1, target_longest_image_dim=1024)
        book.set_cover("cover.png", base64.b64decode(cover_image_b64))

        # OCR every page into its own chapter.
        chapters = []
        for i in range(num_pages):
            text = ocr_page(pdf_path, i)
            chapter = epub.EpubHtml(title=f"Page {i+1}", file_name=f"page_{i+1}.xhtml", lang=language)
            # Escape the OCR text: characters such as '<' or '&' would
            # otherwise produce malformed XHTML.
            chapter.content = f"<h1>Page {i+1}</h1><p>{html.escape(text)}</p>"
            book.add_item(chapter)
            chapters.append(chapter)

        # Reading order and table of contents both follow the page order.
        book.spine.extend(chapters)
        book.toc = tuple(chapters)

        # Finalize EPUB
        book.add_item(epub.EpubNcx())
        book.add_item(epub.EpubNav())

        # Write into a fresh directory so concurrent conversions cannot
        # overwrite each other's output while keeping the "output.epub" name.
        epub_path = os.path.join(tempfile.mkdtemp(), "output.epub")
        epub.write_epub(epub_path, book, {})
    finally:
        if owns_pdf_copy:
            try:
                os.remove(pdf_path)
            except OSError:
                # Best-effort cleanup; never mask the conversion result/error.
                pass

    with open(epub_path, "rb") as f:
        return epub_path, f.read()


def interface_fn(pdf, title, author, language):
    """Gradio callback: convert the uploaded PDF and return the EPUB's path.

    The conversion also yields the EPUB bytes, but the output component only
    needs the path of the written file, so the bytes are discarded.
    """
    written_path, _ = convert_pdf_to_epub(pdf, title, author, language)
    return written_path


# Gradio UI: one PDF upload plus three metadata text fields in, one
# downloadable EPUB file out. Flagging is turned off.
demo = gr.Interface(
    fn=interface_fn,
    inputs=[
        gr.File(label="Upload PDF", file_types=[".pdf"]),
        gr.Textbox(label="EPUB Title", placeholder="e.g. Understanding AI"),
        gr.Textbox(label="Author", placeholder="e.g. Allen AI"),
        gr.Textbox(label="Language", placeholder="e.g. en", value="en"),
    ],
    outputs=gr.File(label="Download EPUB"),
    title="PDF to EPUB Converter (olmOCR)",
    description=(
        "Upload a PDF to convert it into a structured EPUB. "
        "The first page is used as the cover. "
        "OCR is performed with the olmOCR model."
    ),
    allow_flagging="never",
)

# Start the app (with a public share link) only when run as a script.
if __name__ == "__main__":
    demo.launch(share=True)