# olmocr-demo / app.py
import os

# Set a writable directory for Hugging Face's cache before transformers is
# imported, so the cache location is picked up when the library initializes.
os.environ["HF_HOME"] = "/tmp/.cache/huggingface"

import base64
from io import BytesIO

import torch
import gradio as gr
from PIL import Image
from ebooklib import epub
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts import build_finetuning_prompt
from olmocr.prompts.anchor import get_anchor_text

# Load the Qwen2-VL processor and the olmOCR fine-tuned model.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16
).eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
def process_pdf(file, page=1, title="Extracted Page", author="olmOCR", language="en"):
    page = int(page)  # Gradio's Number component may pass a float
    file_path = file.name

    # Render the requested page to an image and collect anchor text for the prompt.
    image_base64 = render_pdf_to_base64png(file_path, page, target_longest_image_dim=1024)
    main_image = Image.open(BytesIO(base64.b64decode(image_base64)))
    anchor_text = get_anchor_text(file_path, page, pdf_engine="pdfreport", target_length=4000)
    prompt = build_finetuning_prompt(anchor_text)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
            ],
        }
    ]

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[text], images=[main_image], return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        output = model.generate(
            **inputs,
            temperature=0.8,
            max_new_tokens=256,
            num_return_sequences=1,
            do_sample=True,
        )

    # Strip the prompt tokens and decode only the newly generated text.
    prompt_len = inputs["input_ids"].shape[1]
    new_tokens = output[:, prompt_len:]
    decoded = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0]
    # Build a single-chapter EPUB containing the extracted text.
    book = epub.EpubBook()
    book.set_identifier("id123456")
    book.set_title(title)
    book.set_language(language)
    book.add_author(author)

    chapter = epub.EpubHtml(title=title, file_name="chap1.xhtml", lang=language)
    chapter.content = f"<h1>{title}</h1><p>{decoded}</p>"
    book.add_item(chapter)

    book.toc = (epub.Link("chap1.xhtml", title, "chap1"),)
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())
    book.spine = ["nav", chapter]

    epub_path = f"/tmp/{title.replace(' ', '_')}_page_{page}.epub"
    epub.write_epub(epub_path, book)
    return epub_path
# Gradio interface
iface = gr.Interface(
    fn=process_pdf,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Number(value=1, label="Page Number"),
        gr.Textbox(value="Extracted Page", label="EPUB Title"),
        gr.Textbox(value="olmOCR", label="Author"),
        gr.Textbox(value="en", label="Language"),
    ],
    outputs=gr.File(label="Download EPUB"),
    title="olmOCR PDF to EPUB",
    description="Extract text from a selected page of a PDF and download it as an EPUB file.",
    allow_flagging="never",
)
if __name__ == "__main__":
    iface.launch()
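
# Quick local sanity check (a sketch, not run by the app): process_pdf expects an
# object exposing a .name attribute, as this app's Gradio File input is assumed to
# provide. The path below is a hypothetical example.
#
#     from types import SimpleNamespace
#     epub_path = process_pdf(SimpleNamespace(name="/path/to/sample.pdf"), page=1)
#     print("EPUB written to:", epub_path)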