sahalhes committed on
Commit
1da5841
·
1 Parent(s): bc58a1e
Files changed (2) hide show
  1. app.py +33 -0
  2. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
3
+ import torch
4
+ from PIL import Image
5
+
6
# Hugging Face Hub checkpoint that provides the captioning model together
# with its matching image processor and tokenizer.
model_name = "nlpconnect/vit-gpt2-image-captioning"

# Use the GPU when CUDA is available, otherwise run on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load all three components from the same checkpoint; the model is moved to
# the selected device right after loading.
model = VisionEncoderDecoderModel.from_pretrained(model_name).to(device)
feature_extractor = ViTImageProcessor.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
13
+
14
def generate_caption(image):
    """Generate a text caption for an image.

    Args:
        image: The image to caption, or ``None`` when nothing was uploaded.
            The Gradio input in this file is configured with ``type="pil"``.

    Returns:
        The decoded caption string, or the message
        ``"Please upload an image."`` when ``image`` is ``None``.
    """
    if image is None:
        return "Please upload an image."

    # Preprocess into PyTorch tensors and move them to the module-level
    # ``device``.
    encoded = feature_extractor(images=image, return_tensors="pt")
    inputs = encoded.pixel_values.to(device)

    # Beam search with 4 beams; generation length is bounded by max_length=16.
    generated = model.generate(inputs, max_length=16, num_beams=4)

    # Only the first returned sequence is decoded; special tokens are dropped.
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)
24
+
25
# Gradio UI: one image input, handed to the callback as a PIL image, and a
# plain-text output that shows the generated caption.
image_input = gr.Image(type="pil")

demo = gr.Interface(
    fn=generate_caption,
    inputs=image_input,
    outputs="text",
    title="🖼️ Image Caption Generator",
    description="Upload an image and get a caption describing it using a VisionEncoderDecoder model (ViT + GPT-2).",
)

# Launch the app as soon as this script is executed.
demo.launch()
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ transformers
2
+ torch
3
+ gradio
4
+ Pillow