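"""Hugging Face Space: olmOCR PDF-to-EPUB.

Renders a chosen page of an uploaded PDF, extracts its text with
allenai/olmOCR-7B-0225-preview, and packages the result as an EPUB file.
"""
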
import os

# Set a writable Hugging Face cache directory *before* importing transformers:
# huggingface_hub reads HF_HOME once, at import time.
os.environ['HF_HOME'] = '/tmp/.cache/huggingface'

import base64
import html
from io import BytesIO

import gradio as gr
import torch
from ebooklib import epub
from PIL import Image
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts import build_finetuning_prompt
from olmocr.prompts.anchor import get_anchor_text
# Load the processor and model. The olmOCR checkpoint is a fine-tune of
# Qwen2-VL-7B-Instruct, so it reuses the base model's processor.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16
).eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
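
# Optional alternative (a sketch, not used here; assumes the `accelerate`
# package is installed): on a memory-constrained GPU, device_map="auto" lets
# the loader place and offload weights instead of the manual .to(device) above:
#   model = Qwen2VLForConditionalGeneration.from_pretrained(
#       "allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16, device_map="auto"
#   )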

def process_pdf(file, page=1, title="Extracted Page", author="olmOCR", language="en"):
    file_path = file.name
    page = int(page)  # gr.Number passes a float; olmocr expects an integer page index

    # Render the requested page to a base64 PNG and build the olmOCR prompt
    # from the PDF's embedded anchor text.
    image_base64 = render_pdf_to_base64png(file_path, page, target_longest_image_dim=1024)
    main_image = Image.open(BytesIO(base64.b64decode(image_base64)))
    anchor_text = get_anchor_text(file_path, page, pdf_engine="pdfreport", target_length=4000)
    prompt = build_finetuning_prompt(anchor_text)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
            ],
        }
    ]

    # Tokenize the chat-formatted prompt together with the page image.
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[text], images=[main_image], return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    # Generate with sampling, then keep only the newly produced tokens
    # (everything after the echoed prompt).
    with torch.no_grad():
        output = model.generate(
            **inputs,
            temperature=0.8,
            max_new_tokens=256,
            num_return_sequences=1,
            do_sample=True,
        )
    prompt_len = inputs["input_ids"].shape[1]
    new_tokens = output[:, prompt_len:]
    decoded = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0]
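
    # Note: olmOCR checkpoints typically emit a JSON object whose
    # "natural_text" field holds the page text; this app embeds the decoded
    # string directly rather than parsing that JSON.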

    # Package the extracted text as a single-chapter EPUB.
    book = epub.EpubBook()
    book.set_identifier("id123456")
    book.set_title(title)
    book.set_language(language)
    book.add_author(author)

    chapter = epub.EpubHtml(title=title, file_name="chap1.xhtml", lang=language)
    # Escape the model output so stray "<" or "&" cannot break the XHTML.
    chapter.content = f"<h1>{html.escape(title)}</h1><p>{html.escape(decoded)}</p>"
    book.add_item(chapter)

    book.toc = (epub.Link('chap1.xhtml', title, 'chap1'),)
    book.add_item(epub.EpubNav())
    book.add_item(epub.EpubNcx())
    book.spine = ['nav', chapter]

    epub_path = f"/tmp/{title.replace(' ', '_')}_page_{page}.epub"
    epub.write_epub(epub_path, book)
    return epub_path
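
# Minimal local smoke test (hypothetical file path; Gradio normally passes the
# uploaded-file object, whose .name attribute holds the path):
#   from types import SimpleNamespace
#   print(process_pdf(SimpleNamespace(name="sample.pdf"), page=1))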

# Gradio interface
iface = gr.Interface(
    fn=process_pdf,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Number(value=1, label="Page Number"),
        gr.Textbox(value="Extracted Page", label="EPUB Title"),
        gr.Textbox(value="olmOCR", label="Author"),
        gr.Textbox(value="en", label="Language"),
    ],
    outputs=gr.File(label="Download EPUB"),
    title="olmOCR PDF to EPUB",
    description="Extract text from a selected page of a PDF and download it as an EPUB file.",
    allow_flagging="never",
)

if __name__ == "__main__":
    iface.launch()