Spaces:
Running
Running
File size: 1,505 Bytes
19918ea d45f3e7 0f8f93a 19918ea d45f3e7 0f8f93a d45f3e7 19918ea d45f3e7 19918ea d45f3e7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 |
import gradio as gr
import pypdfium2
import torch
from PIL import Image
from transformers import (
    AutoTokenizer,
    Qwen2VLForConditionalGeneration,
    Qwen2VLImageProcessor,
    Qwen2VLModel,
    Qwen2VLProcessor,
)
# --- Model and processor setup ---
# NOTE(review): the original code loaded Qwen2VLModel (the base model, which has
# no language-modeling head, so .generate() cannot produce text) and pointed at
# "Qwen/Qwen-VL", a Qwen1-era checkpoint incompatible with the Qwen2VL*
# processor classes. Both are fixed below.
model_name = "Qwen/Qwen2-VL-2B-Instruct"  # Qwen2-VL checkpoint matching the Qwen2VL* classes

image_processor = Qwen2VLImageProcessor.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
processor = Qwen2VLProcessor(image_processor=image_processor, tokenizer=tokenizer)

# Half precision on GPU to cut memory; full precision on CPU, where fp16 is slow
# and poorly supported. The original selected fp16 for CUDA but never moved the
# model off the CPU — do that here so model.device is correct downstream.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
)
if torch.cuda.is_available():
    model = model.to("cuda")
model.eval()  # inference only: disables dropout / batchnorm updates
# Convert PDF to list of PIL images (one per page)
def pdf_to_images(pdf_path, scale=1.0):
    """Render each page of a PDF to a PIL image.

    Args:
        pdf_path: Filesystem path to the PDF file.
        scale: Render scale factor; values > 1 rasterize at higher
            resolution, which can improve OCR quality. Defaults to 1.0,
            preserving the original behavior.

    Returns:
        list[PIL.Image.Image]: One image per page, in page order.
    """
    pdf = pypdfium2.PdfDocument(pdf_path)
    try:
        return [page.render(scale=scale).to_pil() for page in pdf]
    finally:
        # The original leaked the document handle; close it explicitly.
        pdf.close()
# Generate text from each image using the vision-language model
def process_pdf(pdf_file):
    """Run the vision-language model over every page of an uploaded PDF.

    Args:
        pdf_file: Either a plain filesystem path (str, as supplied by
            ``gr.File(type="filepath")``) or a legacy Gradio upload object
            exposing the path via ``.name``.

    Returns:
        str: The per-page model outputs, stripped and joined with blank lines.
    """
    # Accept both a path string and a legacy upload wrapper with .name;
    # the original assumed .name and broke on filepath-style inputs.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    results = []
    for image in pdf_to_images(pdf_path):
        # NOTE(review): Qwen2-VL chat checkpoints normally expect a text
        # prompt (chat template) alongside the image; image-only inputs may
        # not drive generation as intended — confirm against the model card.
        inputs = processor(images=image, return_tensors="pt").to(model.device)
        with torch.no_grad():  # inference only; skip autograd bookkeeping
            outputs = model.generate(**inputs, max_new_tokens=256)
        text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
        results.append(text.strip())
    return "\n\n".join(results)
# --- Gradio UI ---
demo = gr.Interface(
    fn=process_pdf,
    # "filepath" hands process_pdf a plain path string. The original
    # type="file" is not an accepted value in Gradio 4.x (must be
    # "filepath" or "binary") and raises at construction time.
    inputs=gr.File(type="filepath", file_types=[".pdf"]),
    outputs="text",
    title="olmOCR PDF Processor",
)

if __name__ == "__main__":
    demo.launch()
|