import os

# Set a writable cache directory for HF before importing transformers /
# huggingface_hub, which resolve HF_HOME when they are imported.
os.environ['HF_HOME'] = '/tmp/.cache/huggingface'

import torch
import base64
from io import BytesIO
from PIL import Image
import gradio as gr
from ebooklib import epub

from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts import build_finetuning_prompt
from olmocr.prompts.anchor import get_anchor_text

# PyPDF2 is only used to count pages; rendering and anchor text come from olmocr.
from PyPDF2 import PdfReader

# Load processor and model
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16
).eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
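# Note: the 7B model in bfloat16 typically needs a GPU with roughly 16 GB of
# memory; it will also run on CPU, just very slowly.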

def extract_text_from_page(pdf_path, page_num):
    # Render image
    image_base64 = render_pdf_to_base64png(pdf_path, page_num, target_longest_image_dim=1024)
    image = Image.open(BytesIO(base64.b64decode(image_base64)))

    # Prompt and input
    anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport", target_length=4000)
    prompt = build_finetuning_prompt(anchor_text)

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
            ],
        }
    ]

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[text], images=[image], return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
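        # Sample a transcription for this page; do_sample=False would make the
        # output deterministic if reproducibility matters more than variety.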
        output = model.generate(
            **inputs,
            temperature=0.8,
            max_new_tokens=256,
            num_return_sequences=1,
            do_sample=True,
        )

    prompt_len = inputs["input_ids"].shape[1]
    new_tokens = output[:, prompt_len:]
    decoded = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0]
    return decoded, (image_base64 if page_num == 1 else None)

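# Optional helper (a sketch, not wired into process_pdf below): olmOCR typically
# returns a JSON string with a "natural_text" field. This pulls that field out,
# falling back to the raw model output if the string is not valid JSON.
def extract_natural_text(decoded):
    import json
    try:
        parsed = json.loads(decoded)
    except json.JSONDecodeError:
        return decoded
    if isinstance(parsed, dict) and parsed.get("natural_text"):
        return parsed["natural_text"]
    return decoded
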
def process_pdf(file, title="Extracted PDF", author="olmOCR", language="en"):
    # gr.File may hand back a filepath string or a tempfile-like object,
    # depending on the Gradio version and the component's type setting.
    file_path = file if isinstance(file, str) else file.name
    reader = PdfReader(file_path)
    num_pages = len(reader.pages)

    all_text = []
    cover_image_data = None

    for page in range(1, num_pages + 1):
        text, cover_image = extract_text_from_page(file_path, page)
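        # The model output is inserted into the XHTML as-is; html.escape() could
        # be applied here if pages are likely to contain markup-like characters.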
        all_text.append(f"<h2>Page {page}</h2><p>{text}</p>")
        if cover_image and not cover_image_data:
            cover_image_data = cover_image  # base64

    # Build EPUB
    book = epub.EpubBook()
    book.set_identifier("id123456")
    book.set_title(title)
    book.set_language(language)
    book.add_author(author)

    # Add cover image
    if cover_image_data:
        cover_bytes = base64.b64decode(cover_image_data)
        book.set_cover("cover.jpg", cover_bytes)

    # Create chapter with all text
    chapter = epub.EpubHtml(title=title, file_name="chap1.xhtml", lang=language)
    chapter.content = f"<h1>{title}</h1>{''.join(all_text)}"
    book.add_item(chapter)

    book.toc = (epub.Link('chap1.xhtml', title, 'chap1'),)
    book.add_item(epub.EpubNav())
    book.add_item(epub.EpubNcx())
    book.spine = ['nav', chapter]

    epub_path = f"/tmp/{title.replace(' ', '_')}.epub"
    epub.write_epub(epub_path, book)

    return epub_path

# Gradio Interface
iface = gr.Interface(
    fn=process_pdf,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Textbox(value="Extracted PDF", label="EPUB Title"),
        gr.Textbox(value="olmOCR", label="Author"),
        gr.Textbox(value="en", label="Language"),
    ],
    outputs=gr.File(label="Download EPUB"),
    title="olmOCR PDF to EPUB (Full PDF + Cover Image)",
    description="Extract text from ALL pages of a PDF and generate an EPUB with the first page as cover.",
    allow_flagging="never"
)

if __name__ == "__main__":
    iface.launch()