"""Gradio demo that runs a Donut model over uploaded invoices and receipts."""

from typing import List, Optional

import torch
from transformers import VisionEncoderDecoderModel, DonutProcessor
from PIL import Image
from pdf2image import convert_from_bytes
import gradio as gr

# Donut model configuration.
MODEL_ID = "mychen76/invoice-and-receipts_donut_v1"

# The model and processor are loaded once, at import time, and shared by
# every call to process_document().
print("Cargando modelo Donut...")
model = VisionEncoderDecoderModel.from_pretrained(MODEL_ID)
processor = DonutProcessor.from_pretrained(MODEL_ID)
model.eval()


def _load_pages(file_path: str) -> List[Image.Image]:
    """Return the images contained in *file_path*.

    A path ending in ``.pdf`` (case-insensitive) is rasterized with
    pdf2image at 300 dpi; any other path is opened with Pillow and
    converted to RGB, yielding a single-element list.
    """
    # Compare case-insensitively so that "SCAN.PDF" is not handed to Pillow.
    if file_path.lower().endswith(".pdf"):
        with open(file_path, "rb") as pdf_file:
            return convert_from_bytes(pdf_file.read(), dpi=300)

    # convert() returns a new, fully loaded image, so the source file can be
    # closed as soon as the conversion is done.
    with Image.open(file_path) as source_image:
        return [source_image.convert("RGB")]


def process_document(file_path: Optional[str]) -> List[str]:
    """Run the Donut model on every page of an uploaded document.

    Args:
        file_path: Path of the uploaded PDF or image. ``None`` or an empty
            string (nothing uploaded) produces an empty result.

    Returns:
        One decoded string per page, in page order.
    """
    if not file_path:
        return []

    results = []
    for page in _load_pages(file_path):
        # Preprocess the page. ``max_patches`` was dropped: it is a
        # Pix2Struct image-processor argument, not a Donut one.
        inputs = processor(page, return_tensors="pt")

        # Generate the prediction without tracking gradients.
        # NOTE(review): no task prompt (decoder_input_ids) or max_length is
        # passed, so generation relies on the checkpoint's own generation
        # settings -- confirm against the model card.
        with torch.no_grad():
            outputs = model.generate(**inputs)

        # Decode the generated token ids into text.
        # NOTE(review): skip_special_tokens=True may strip Donut's field
        # tags if the checkpoint registers them as special tokens; verify
        # before parsing the output into structured fields.
        decoded = processor.batch_decode(outputs, skip_special_tokens=True)[0]
        results.append(decoded)

    return results


# Gradio interface.
iface = gr.Interface(
    fn=process_document,
    inputs=gr.File(label="Sube tu factura o recibo (PDF o imagen)", type="filepath"),
    outputs="json",
    title="Donut OCR - Extracción de datos de facturas",
    description="Sube un PDF o imagen y extrae información estructurada (número de factura, fecha, monto, etc.) utilizando Donut OCR.",
)

# Start the application.
if __name__ == "__main__":
    iface.launch()