File size: 1,150 Bytes
3cf4149
7fe122b
3dc1b9e
 
 
3cf4149
 
7fe122b
 
a1c289f
3cf4149
3dc1b9e
3cf4149
 
 
 
 
 
 
 
 
 
 
 
 
3dc1b9e
7fe122b
3dc1b9e
3cf4149
 
 
 
 
 
 
 
 
 
3dc1b9e
 
 
3cf4149
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import torch
from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor
import gradio as gr
from PIL import Image

# Checkpoint identifier shared by the processor and the model below.
model_name = "google/pix2struct-docvqa-large"

# Both objects are built once at import time and reused by process_image().
processor = Pix2StructProcessor.from_pretrained(model_name)
model = Pix2StructForConditionalGeneration.from_pretrained(model_name)

def process_image(image_path, question="What does this image say?"):
    """Ask the Pix2Struct model a question about an image file.

    Args:
        image_path: Path to the image (anything ``PIL.Image.open`` accepts).
            ``None`` is reported as an error instead of being handed to
            Pillow.
        question: Text prompt sent to the model together with the image.
            The default is the question this function has always asked.

    Returns:
        The decoded model output. On any failure, a string starting with
        ``"Error processing image: "`` is returned instead; exceptions are
        never propagated, so the caller always receives text.
    """
    # NOTE(review): the UI may call this with None when nothing was
    # uploaded; answer with a readable message rather than a Pillow error.
    if image_path is None:
        return "Error processing image: no image provided"

    try:
        # The context manager releases the file handle as soon as the pixel
        # data has been copied into the RGB image.
        with Image.open(image_path) as source:
            image = source.convert("RGB")

        # Prepare the model input from the image and the question
        inputs = processor(images=image, text=question, return_tensors="pt")

        # Generate prediction
        output = model.generate(**inputs)

        # Decode the first (only) generated sequence
        return processor.decode(output[0], skip_special_tokens=True)

    except Exception as e:
        # Deliberately broad: this function reports every failure as text.
        return f"Error processing image: {e}"

def predict(image):
    """Gradio callback: pass the received image on to ``process_image``.

    Args:
        image: Value supplied by the Gradio image input.

    Returns:
        Whatever ``process_image`` returns for that input.
    """
    answer = process_image(image)
    return answer

# Gradio app: a single image input (handed to `predict` as a file path)
# and a plain-text output.
iface = gr.Interface(
    title="Image Text Solution",
    fn=predict,
    inputs=gr.Image(type="filepath"),
    outputs="text",
)

# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()