rag-trial / app.py
xzerus's picture
Update app.py
08d3750 verified
raw
history blame
1.33 kB
import os
from transformers import AutoProcessor, AutoModelForCausalLM
import gradio as gr
import torch
from PIL import Image
# Load the Hugging Face token from environment variables
hf_token = os.getenv("HF_AUTH_TOKEN")
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load the model and processor with authentication
processor = AutoProcessor.from_pretrained(model_name, use_auth_token=hf_token, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, use_auth_token=hf_token, torch_dtype=torch.float16, trust_remote_code=True).to(device)
# Function to process image and text prompt
def process_image(image, prompt="<ocr>"):
inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=1024)
generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
return generated_text
# Gradio Interface
iface = gr.Interface(
fn=process_image,
inputs=[
gr.Image(type="pil", label="Upload Image"),
gr.Textbox(value="<ocr>", label="Prompt"),
],
outputs="text",
title="OCR with Llama-3.2-11B-Vision-Instruct",
description="Upload an image and input a prompt (e.g., '<ocr>') to extract text.",
)
iface.launch()