"""Gradio demo that asks a Donut DocVQA model a question about a receipt image.

The default question is "What is the total amount?", so the app behaves as a
receipt total extractor.
"""

import re
from typing import Optional

import gradio as gr
from PIL import Image
from transformers import DonutProcessor, VisionEncoderDecoderModel

MODEL_NAME = "naver-clova-ix/donut-base-finetuned-docvqa"
DEFAULT_QUESTION = "What is the total amount?"

# Loaded once at module level so every request reuses the same processor/model.
processor = DonutProcessor.from_pretrained(MODEL_NAME)
model = VisionEncoderDecoderModel.from_pretrained(MODEL_NAME)


def extract_info(image: Optional[Image.Image], question: str = DEFAULT_QUESTION) -> str:
    """Answer *question* about *image* using the Donut DocVQA model.

    Args:
        image: The document image as a PIL image (what ``gr.Image(type="pil")``
            passes in). ``None`` means nothing was uploaded.
        question: Natural-language question to ask about the document. Blank
            values fall back to ``DEFAULT_QUESTION``.

    Returns:
        The model's answer as plain text. If the generated sequence cannot be
        parsed into a question/answer structure, the generated text with its
        markup tags removed is returned instead.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        raise gr.Error("Please upload an image.")

    question = (question or "").strip() or DEFAULT_QUESTION
    tokenizer = processor.tokenizer

    pixel_values = processor(images=image.convert("RGB"), return_tensors="pt").pixel_values

    # The DocVQA checkpoint expects the question wrapped in its task markup;
    # add_special_tokens=False keeps the tokenizer from adding extra BOS/EOS
    # tokens around the hand-built prompt.
    task_prompt = f"<s_docvqa><s_question>{question}</s_question><s_answer>"
    decoder_input_ids = tokenizer(
        task_prompt, add_special_tokens=False, return_tensors="pt"
    ).input_ids

    outputs = model.generate(
        pixel_values,
        decoder_input_ids=decoder_input_ids,
        # Bound generation by the decoder's positional capacity.
        max_length=model.decoder.config.max_position_embeddings,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        use_cache=True,
        bad_words_ids=[[tokenizer.unk_token_id]],
        return_dict_in_generate=True,
    )

    # Keep the markup while decoding so the answer can be separated from the
    # echoed question; only EOS/PAD and the leading task token are stripped.
    sequence = processor.batch_decode(outputs.sequences)[0]
    sequence = sequence.replace(tokenizer.eos_token, "").replace(tokenizer.pad_token, "")
    sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()

    parsed = processor.token2json(sequence)
    answer = parsed.get("answer") if isinstance(parsed, dict) else None
    if isinstance(answer, str):
        return answer.strip()

    # Fallback: no parsable answer field, return the text without tags.
    return re.sub(r"<.*?>", " ", sequence).strip()


demo = gr.Interface(
    fn=extract_info,
    inputs=gr.Image(type="pil"),
    outputs="text",
    title="Receipt Total Extractor",
)

if __name__ == "__main__":
    demo.launch()