import torch
import base64
from io import BytesIO
from PIL import Image
import gradio as gr

from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts import build_finetuning_prompt
from olmocr.prompts.anchor import get_anchor_text

# Load processor and model
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16
).eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
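# Note: bfloat16 is a GPU-oriented dtype. On a CPU-only machine it is
# usually safer (if slower) to load with torch_dtype=torch.float32 instead.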

def process_pdf(file, page=1):
    # Gradio has already saved the upload to a temp file; use its path
    file_path = file.name
    page = int(page)  # gr.Number may hand back a float; page indices must be ints

    # Render the selected PDF page to base64 PNG
    image_base64 = render_pdf_to_base64png(file_path, page, target_longest_image_dim=1024)
    main_image = Image.open(BytesIO(base64.b64decode(image_base64)))

    # Extract anchor text from the PDF's text layer and build the prompt
    anchor_text = get_anchor_text(file_path, page, pdf_engine="pdfreport", target_length=4000)
    prompt = build_finetuning_prompt(anchor_text)

    # Construct chat message
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
            ],
        }
    ]

    # Tokenize the prompt and preprocess the page image
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[text], images=[main_image], return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Run model
    with torch.no_grad():
        output = model.generate(
            **inputs,
            temperature=0.8,
            max_new_tokens=256,  # raise this if dense pages come back truncated
            num_return_sequences=1,
            do_sample=True,
        )

    # Decode only the newly generated tokens (drop the echoed prompt)
    prompt_len = inputs["input_ids"].shape[1]
    new_tokens = output[:, prompt_len:]
    decoded = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
    return decoded[0]
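
# process_pdf can also be exercised without the UI: any object exposing a
# .name attribute that points at a PDF works (hypothetical local file):
#
#   from types import SimpleNamespace
#   print(process_pdf(SimpleNamespace(name="sample.pdf"), page=1))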

# Gradio interface
iface = gr.Interface(
    fn=process_pdf,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Number(value=1, label="Page Number"),
    ],
    outputs="text",
    title="olmOCR PDF Text Extractor",
    description="Upload a PDF and select a page to extract text using the olmOCR model.",
)
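
# launch() serves the app locally (http://127.0.0.1:7860 by default); pass
# share=True to launch() if you need a temporary public link for a demo.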

if __name__ == "__main__":
    iface.launch()