import gradio as gr
import torch
import pypdfium2
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

# Load model and processor. olmOCR-7B is a Qwen2-VL fine-tune, so it needs the
# Qwen2-VL generation class (the base Qwen2VLModel has no LM head, so its
# .generate() output is meaningless). Per the olmOCR model card, the processor
# is loaded from the base Qwen2-VL checkpoint. Swap in any Qwen2-VL checkpoint
# you prefer.
model_name = "allenai/olmOCR-7B-0225-preview"
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()


# Convert a PDF into a list of PIL images (one per page). scale=2 renders at
# ~144 DPI, which gives the model more legible text than the 72 DPI default.
def pdf_to_images(pdf_path):
    pdf = pypdfium2.PdfDocument(pdf_path)
    return [page.render(scale=2).to_pil() for page in pdf]


# Run each page image through the vision-language model. The official olmOCR
# pipeline builds a richer prompt with PDF metadata; a plain transcription
# instruction is used here for simplicity.
def process_pdf(pdf_path):
    images = pdf_to_images(pdf_path)
    results = []
    for image in images:
        # Qwen2-VL expects a chat-formatted prompt containing an image
        # placeholder; passing images alone to the processor is an error.
        messages = [{
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "Transcribe the text on this page."},
            ],
        }]
        prompt = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = processor(text=[prompt], images=[image], return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=256)
        # Strip the prompt tokens so only the newly generated text is decoded.
        generated = outputs[:, inputs["input_ids"].shape[1]:]
        text = processor.batch_decode(generated, skip_special_tokens=True)[0]
        results.append(text.strip())
    return "\n\n".join(results)


# Gradio UI. type="filepath" hands process_pdf a plain path string; the legacy
# type="file" value (which returned a tempfile wrapper) was removed in Gradio 4.
demo = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(type="filepath", file_types=[".pdf"]),
    outputs="text",
    title="olmOCR PDF Processor",
)

if __name__ == "__main__":
    demo.launch()
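
# A minimal environment sketch for running this script (standard PyPI package
# names; pillow is required because pypdfium2's to_pil() returns PIL images):
#
#   pip install gradio torch transformers pypdfium2 pillow
#   python app.py        # assuming the script is saved as app.py
#
# Note: the 7B checkpoint's float16 weights alone are roughly 15 GB, so a GPU
# with at least that much VRAM is assumed; it will also run on CPU, but slowly.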