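"""Gradio demo: render each page of an uploaded PDF to an image, run a
Qwen2-VL-family vision-language model over it, and return the extracted text.

Assumed dependencies (matching the imports below): gradio, torch, pypdfium2,
and transformers (a version with Qwen2-VL support).
"""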
import gradio as gr
import torch
import pypdfium2
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
# Load model and processor. Qwen2-VL-2B-Instruct is a small default; any
# Qwen2-VL-compatible checkpoint (e.g. an olmOCR release) can be swapped in.
model_name = "Qwen/Qwen2-VL-2B-Instruct"
processor = AutoProcessor.from_pretrained(model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
).to(device)
model.eval()
# Convert a PDF into a list of PIL images (one per page).
def pdf_to_images(pdf_path):
    pdf = pypdfium2.PdfDocument(pdf_path)
    return [page.render().to_pil() for page in pdf]
# Run the vision-language model over each page image and join the results.
def process_pdf(pdf_path):
    images = pdf_to_images(pdf_path)
    results = []
    for image in images:
        # Qwen2-VL expects a chat-formatted prompt with an image placeholder.
        messages = [{
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "Extract all of the text on this page."},
            ],
        }]
        prompt = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=256)
        # Decode only the newly generated tokens, not the echoed prompt.
        new_tokens = outputs[:, inputs["input_ids"].shape[1]:]
        text = processor.batch_decode(new_tokens, skip_special_tokens=True)[0]
        results.append(text.strip())
    return "\n\n".join(results)
# Gradio UI
demo = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(type="filepath", file_types=[".pdf"]),
    outputs="text",
    title="olmOCR PDF Processor",
)
if __name__ == "__main__":
    demo.launch()
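# Note: demo.launch() accepts standard Gradio options such as share=True or
# server_name="0.0.0.0" if the app needs to be reachable from other hosts.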