preston-cell committed on
Commit
0bf1b01
·
verified ·
1 Parent(s): 4f56a4b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -37
app.py CHANGED
@@ -1,52 +1,23 @@
1
  import gradio as gr
2
  from transformers import pipeline
3
- from datasets import load_dataset
4
- import torch
5
- import numpy as np
6
 
7
  # Load BLIP model for image captioning
8
  caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
9
 
10
- # Load SpeechT5 model for text-to-speech
11
- synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")
12
-
13
- # Load a speaker embedding
14
- embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
15
- speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
16
-
17
-
18
- def process_image(image):
19
  try:
20
- # Generate caption from the image
21
  caption = caption_model(image)[0]['generated_text']
22
-
23
- # Convert caption to speech
24
- speech = synthesiser(
25
- caption,
26
- forward_params={"speaker_embeddings": speaker_embedding}
27
- )
28
-
29
- # Prepare audio data
30
- audio = np.array(speech["audio"])
31
- rate = speech["sampling_rate"]
32
-
33
- # Return both audio and caption
34
- return (rate, audio), caption
35
-
36
  except Exception as e:
37
- return None, f"Error: {str(e)}"
38
-
39
 
40
- # Gradio Interface
41
  iface = gr.Interface(
42
- fn=process_image,
43
  inputs=gr.Image(type='pil', label="Upload an Image"),
44
- outputs=[
45
- gr.Audio(label="Generated Audio"),
46
- gr.Textbox(label="Generated Caption")
47
- ],
48
- title="SeeSay with SpeechT5",
49
- description="Upload an image to generate a caption and hear it described with SpeechT5's speech synthesis."
50
  )
51
 
52
  iface.launch()
 
1
  import gradio as gr
2
  from transformers import pipeline
 
 
 
3
 
4
# BLIP image-captioning model, exposed as a Hugging Face image-to-text pipeline.
# Loaded once at import time so every request reuses the same model instance.
caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
6
 
7
def generate_caption(image):
    """Produce a text caption for *image* with the BLIP captioning pipeline.

    Args:
        image: A PIL image supplied by the Gradio ``gr.Image`` input.

    Returns:
        The generated caption string on success, or an ``"Error: ..."``
        message string if captioning fails (shown to the user instead of
        crashing the app).
    """
    try:
        predictions = caption_model(image)
        return predictions[0]['generated_text']
    except Exception as err:  # surface the failure in the UI textbox
        return f"Error: {str(err)}"
 
13
 
14
# Wire the captioning function into a minimal Gradio UI: image in, caption out.
image_input = gr.Image(type='pil', label="Upload an Image")
caption_output = gr.Textbox(label="Generated Caption")

iface = gr.Interface(
    fn=generate_caption,
    inputs=image_input,
    outputs=caption_output,
    title="Image Captioning with BLIP",
    description="Quickly generate a caption from your uploaded image.",
)

# Start the Gradio server (blocks until the app is stopped).
iface.launch()