import gradio as gr
import torch
import pypdfium2
from transformers import Qwen2VLProcessor, Qwen2VLForConditionalGeneration
# Load model and processor (any Qwen2-VL-compatible checkpoint works here)
model_name = "Qwen/Qwen2-VL-2B-Instruct"  # replace with your preferred VL model
processor = Qwen2VLProcessor.from_pretrained(model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
).to(device)
model.eval()
# Convert a PDF to a list of PIL images (one per page)
def pdf_to_images(pdf_path):
    pdf = pypdfium2.PdfDocument(pdf_path)
    return [page.render().to_pil() for page in pdf]
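
# Optional: pypdfium2's render() accepts a scale argument, so
# page.render(scale=2).to_pil() rasterizes at roughly twice the DPI,
# which tends to help OCR on small print at the cost of more memory.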
# Generate text from each page image with the vision-language model
def process_pdf(pdf_path):
    images = pdf_to_images(pdf_path)
    results = []
    for image in images:
        # Qwen2-VL expects a chat-formatted text prompt alongside each image
        messages = [{"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": "Extract all text from this page."},
        ]}]
        prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = processor(text=[prompt], images=[image], return_tensors="pt").to(model.device)
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=256)
        # Trim off the prompt tokens so only newly generated text is decoded
        generated = outputs[:, inputs.input_ids.shape[1]:]
        text = processor.batch_decode(generated, skip_special_tokens=True)[0]
        results.append(text.strip())
    return "\n\n".join(results)
# Gradio UI; type="filepath" hands process_pdf a plain path string
demo = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(type="filepath", file_types=[".pdf"]),
    outputs="text",
    title="olmOCR PDF Processor",
)
if __name__ == "__main__":
    demo.launch()
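
# Quick smoke test without the UI (assumes this file is app.py and that a
# local sample.pdf exists; both names are placeholders):
#     from app import process_pdf
#     print(process_pdf("sample.pdf"))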