File size: 4,968 Bytes
e7faf36
af75cff
 
 
e7faf36
19918ea
af75cff
d45f3e7
af75cff
 
fff0f58
8be5494
af75cff
89a1632
af75cff
 
 
 
8be5494
b3d319d
36bb738
b3d319d
 
 
 
 
 
 
 
 
 
89a1632
 
b3d319d
89a1632
af75cff
8be5494
afbaa03
af75cff
 
afbaa03
 
8be5494
afbaa03
 
 
8be5494
19918ea
af75cff
 
 
 
 
 
 
19918ea
afbaa03
 
 
8be5494
d45f3e7
8be5494
afbaa03
8be5494
 
afbaa03
8be5494
 
d45f3e7
8be5494
afbaa03
 
 
0225b8c
fff0f58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
afbaa03
10b8e9d
afbaa03
a67d3a2
 
 
 
b3d319d
 
 
 
 
 
a67d3a2
 
 
 
 
 
 
 
 
 
 
f01e8a4
afbaa03
af75cff
f01e8a4
d45f3e7
afbaa03
 
8be5494
afbaa03
 
 
 
8be5494
f01e8a4
afbaa03
 
 
d45f3e7
 
 
af75cff
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import base64
import html
import os
import tempfile
from io import BytesIO

import gradio as gr
import torch
from ebooklib import epub
from pdf2image import convert_from_path
from PIL import Image
from PyPDF2 import PdfReader
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

# Set cache and log paths
cache_dir = "/tmp/huggingface_cache"
os.environ["HF_HOME"] = cache_dir
os.environ["TORCH_HOME"] = cache_dir
os.makedirs(cache_dir, exist_ok=True)

# Patch logging to avoid permission errors
import logging
from logging import FileHandler
class SafeFileHandler(FileHandler):
    def __init__(self, filename, mode='a', encoding=None, delay=False, errors=None):
        # Redirect all logs to tmp
        safe_path = os.environ.get("OLMOCR_LOG_PATH", "/tmp/olmocr-pipeline-debug.log")
        super().__init__(safe_path, mode, encoding, delay, errors)
logging.FileHandler = SafeFileHandler

# Now import olmocr
from olmocr.run_ocr import ocr_pdf_to_text
from olmocr.prompts import build_finetuning_prompt
from olmocr.prompts.anchor import get_anchor_text
from olmocr.data.renderpdf import render_pdf_to_base64png

# Load model and processor
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "allenai/olmOCR-7B-0225-preview",
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
).eval().to(device)
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

def ocr_page(pdf_path, page_num):
    """OCR a single PDF page (0-based ``page_num``) with the olmOCR model.

    Renders the page to a PNG, builds the olmOCR fine-tuning prompt from the
    page's anchor text, and returns the generated transcription (empty string
    if decoding yields nothing). Sampling is enabled, so output is stochastic.
    """
    page_one_based = page_num + 1
    image_b64 = render_pdf_to_base64png(pdf_path, page_one_based, target_longest_image_dim=1024)
    anchor_text = get_anchor_text(pdf_path, page_one_based, pdf_engine="pdfreport", target_length=4000)

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": build_finetuning_prompt(anchor_text)},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
            ],
        }
    ]
    chat_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    page_image = Image.open(BytesIO(base64.b64decode(image_b64)))

    model_inputs = processor(text=[chat_text], images=[page_image], return_tensors="pt", padding=True)
    model_inputs = {name: tensor.to(device) for name, tensor in model_inputs.items()}

    with torch.no_grad():
        generated = model.generate(
            **model_inputs,
            temperature=0.8,
            max_new_tokens=1024,
            do_sample=True,
        )

    # Strip the prompt tokens; keep only the newly generated continuation.
    new_tokens = generated[:, model_inputs["input_ids"].shape[1]:]
    texts = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
    return texts[0] if texts else ""

def create_epub_from_text(text, output_path, title, author, language, cover_image):
    """Write a minimal one-chapter EPUB to ``output_path``.

    Args:
        text: Plain OCR text; newlines become paragraph breaks.
        output_path: Destination ``.epub`` path.
        title, author, language: Book metadata (language is a BCP-47 code).
        cover_image: Path to a JPEG used as the cover.
    """
    book = epub.EpubBook()
    book.set_title(title)
    book.set_language(language)
    book.add_author(author)

    # set_cover registers the image in the OPF metadata and creates a cover
    # page (a bare EpubItem would not be recognized as the cover by readers).
    with open(cover_image, "rb") as cover_file:
        book.set_cover("cover.jpg", cover_file.read())

    # Escape the OCR text so stray '<' or '&' cannot break the XHTML, and
    # keep line breaks by emitting one <p> per non-blank line.
    paragraphs = "".join(
        f"<p>{html.escape(line)}</p>" for line in text.splitlines() if line.strip()
    ) or f"<p>{html.escape(text)}</p>"

    chapter = epub.EpubHtml(title="Content", file_name="content.xhtml", lang=language)
    chapter.set_content(f"<html><body><h1>{html.escape(title)}</h1>{paragraphs}</body></html>")
    book.add_item(chapter)

    book.toc = (epub.Link("content.xhtml", "Content", "content"),)
    book.add_item(epub.EpubNcx())  # EPUB2-compatible table of contents
    book.add_item(epub.EpubNav())
    # The spine defines reading order; without it the chapter is unreachable
    # and many readers display an empty book.
    book.spine = ["cover", "nav", chapter]
    epub.write_epub(output_path, book)

def convert_pdf_to_epub(pdf_file, title, author, language):
    """Convert an uploaded PDF into an EPUB.

    The first PDF page is rasterized and used as the cover image; the full
    document text comes from olmOCR.

    Args:
        pdf_file: Uploaded file object exposing a ``.name`` filesystem path.
        title, author, language: EPUB metadata passed through unchanged.

    Returns:
        Tuple ``(epub_path, cover_path)`` of generated temp-file paths.
    """
    tmp_pdf_path = pdf_file.name

    # Render only page 1 — it becomes the cover. Unique temp paths avoid
    # concurrent Gradio requests clobbering each other's output.
    cover_fd, cover_path = tempfile.mkstemp(suffix=".jpg")
    os.close(cover_fd)
    first_page_images = convert_from_path(tmp_pdf_path, first_page=1, last_page=1)
    first_page_images[0].save(cover_path, "JPEG")

    # Use official AllenAI OCR function for the whole document
    ocr_text = ocr_pdf_to_text(
        pdf_path=tmp_pdf_path,
        model=model,
        processor=processor
    )

    epub_fd, epub_path = tempfile.mkstemp(suffix=".epub")
    os.close(epub_fd)
    create_epub_from_text(
        text=ocr_text,
        output_path=epub_path,
        title=title,
        author=author,
        language=language,
        cover_image=cover_path
    )
    return epub_path, cover_path

def interface_fn(pdf, title, author, language):
    """Gradio callback: run the conversion and return only the EPUB path."""
    result = convert_pdf_to_epub(pdf, title, author, language)
    return result[0]

# Single-function Gradio UI: one PDF upload plus three metadata text boxes,
# producing a downloadable EPUB file.
demo = gr.Interface(
    fn=interface_fn,
    inputs=[
        gr.File(label="Upload PDF", file_types=[".pdf"]),
        gr.Textbox(label="EPUB Title", placeholder="e.g. Understanding AI"),
        gr.Textbox(label="Author", placeholder="e.g. Allen AI"),
        gr.Textbox(label="Language", placeholder="e.g. en", value="en"),
    ],
    outputs=gr.File(label="Download EPUB"),
    title="PDF to EPUB Converter (olmOCR)",
    description="Upload a PDF to convert it into a structured EPUB. The first page is used as the cover. OCR is performed with the olmOCR model.",
    allow_flagging="never",
)

if __name__ == "__main__":
    # share=True requests a public Gradio tunnel in addition to localhost.
    demo.launch(share=True)