"""Gradio demo: run the ``AdamCodd/donut-receipts-extract`` checkpoint on an image.

The model and processor are loaded once at import time; ``extract_info`` is the
callback wired into the Gradio interface.
"""

from typing import Optional

import gradio as gr
from PIL import Image
from transformers import DonutProcessor, VisionEncoderDecoderModel

# Hub id shared by the model and its processor.
MODEL_ID = "AdamCodd/donut-receipts-extract"

# NOTE(review): the decoder prompt is an empty string. Donut checkpoints are
# usually driven by a task start token; confirm the intended prompt against the
# model card -- the value here may have been lost when the file was mangled.
TASK_PROMPT = ""

# Upper bound on the generated sequence length passed to ``model.generate``.
MAX_LENGTH = 512

# NOTE(review): ``use_auth_token=True`` needs a locally stored Hugging Face
# token, and newer transformers releases deprecate it in favour of ``token``.
# Kept as-is to preserve behaviour; revisit when pinning the library version.
model = VisionEncoderDecoderModel.from_pretrained(MODEL_ID, use_auth_token=True)
processor = DonutProcessor.from_pretrained(MODEL_ID, use_auth_token=True)


def extract_info(image: Optional[Image.Image]) -> str:
    """Return the text the model generates for *image*.

    Args:
        image: PIL image supplied by the Gradio ``Image`` input, or ``None``
            when the form is submitted without an image.

    Returns:
        The first decoded sequence, with special tokens stripped.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        # Without this guard, ``image.convert`` raises AttributeError when the
        # form is submitted empty.
        raise gr.Error("Please upload an image first.")

    rgb_image = image.convert("RGB")
    pixel_values = processor(images=rgb_image, return_tensors="pt").pixel_values
    decoder_input_ids = processor.tokenizer(
        TASK_PROMPT, return_tensors="pt"
    ).input_ids

    outputs = model.generate(
        pixel_values,
        decoder_input_ids=decoder_input_ids,
        max_length=MAX_LENGTH,
    )
    # A single image is passed in, so only the first decoded sequence is used.
    return processor.batch_decode(outputs, skip_special_tokens=True)[0]


demo = gr.Interface(fn=extract_info, inputs=gr.Image(type="pil"), outputs="text")

if __name__ == "__main__":
    demo.launch()