import os

import gradio as gr
import torch
from PIL import Image
from transformers import AutoProcessor, MllamaForConditionalGeneration
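
# NOTE (assumption): Llama 3.2 Vision support landed in transformers 4.45, so
# a release at least that new is assumed here.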

# The Llama 3.2 Vision checkpoints are gated on the Hugging Face Hub, so the
# token read from HF_AUTH_TOKEN must belong to an account with access.
hf_token = os.getenv("HF_AUTH_TOKEN")
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Llama 3.2 Vision is an image-text-to-text model, so it loads through
# MllamaForConditionalGeneration rather than AutoModelForCausalLM; the
# deprecated use_auth_token argument is replaced by token.
processor = AutoProcessor.from_pretrained(model_name, token=hf_token)
model = MllamaForConditionalGeneration.from_pretrained(
    model_name, token=hf_token, torch_dtype=torch.float16
).to(device)
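
# Tip (assumes the accelerate package is installed): device_map="auto" in
# from_pretrained can replace the manual .to(device) and shard the weights
# across the available devices.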


def process_image(image, prompt="Extract all the text in this image."):
    # The processor's chat template inserts the <|image|> placeholder and
    # generation header the model expects; a bare task token like "<ocr>"
    # is not part of this model's prompt format.
    messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt}]}]
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(images=image, text=input_text, add_special_tokens=False, return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_new_tokens=1024)
    # Decode only the newly generated tokens so the prompt is not echoed back.
    generated_text = processor.batch_decode(outputs[:, inputs["input_ids"].shape[-1]:], skip_special_tokens=True)[0]
    return generated_text
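
# Standalone usage (hypothetical file name):
#   print(process_image(Image.open("receipt.png")))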


iface = gr.Interface(
    fn=process_image,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(value="Extract all the text in this image.", label="Prompt"),
    ],
    outputs="text",
    title="OCR with Llama-3.2-11B-Vision-Instruct",
    description="Upload an image and enter a prompt to extract the text it contains.",
)
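
# launch() serves the app on a local URL; passing share=True would also create
# a temporary public Gradio link.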
iface.launch()