preston-cell committed
Commit 4f56a4b · verified · 1 Parent(s): ed4af8f

Update app.py

Files changed (1):
  app.py +25 -13
app.py CHANGED
@@ -1,27 +1,41 @@
 import gradio as gr
 from transformers import pipeline
+from datasets import load_dataset
+import torch
 import numpy as np
 
-# Load image captioning model
+# Load BLIP model for image captioning
 caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
 
-# Load Whisper TTS model
-speech_model = pipeline("text-to-speech", model="openai/whisper-large-v3-turbo")
+# Load SpeechT5 model for text-to-speech
+synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")
+
+# Load a speaker embedding
+embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+
 
 def process_image(image):
     try:
-        # Step 1: Generate caption from image
+        # Generate caption from the image
         caption = caption_model(image)[0]['generated_text']
 
-        # Step 2: Convert caption to speech using Whisper
-        speech = speech_model(caption)
+        # Convert caption to speech
+        speech = synthesiser(
+            caption,
+            forward_params={"speaker_embeddings": speaker_embedding}
+        )
+
+        # Prepare audio data
         audio = np.array(speech["audio"])
         rate = speech["sampling_rate"]
 
-        # Return both the audio and the caption
-        return (audio, rate), caption
+        # Return both audio and caption
+        return (rate, audio), caption
+
     except Exception as e:
-        return str(e), "Error generating caption or audio."
+        return None, f"Error: {str(e)}"
+
 
 # Gradio Interface
 iface = gr.Interface(
@@ -31,10 +45,8 @@ iface = gr.Interface(
         gr.Audio(label="Generated Audio"),
         gr.Textbox(label="Generated Caption")
     ],
-    title="SeeSay",
-    description="Upload an image to generate a caption and hear it described with speech."
+    title="SeeSay with SpeechT5",
+    description="Upload an image to generate a caption and hear it described with SpeechT5's speech synthesis."
 )
 
 iface.launch()
-
-
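
Why this change works: openai/whisper-large-v3-turbo is a speech-recognition (audio-to-text) model, so the old pipeline("text-to-speech", ...) call could not have synthesized audio. microsoft/speecht5_tts is a genuine TTS checkpoint, but it is multi-speaker: each call must be conditioned on an x-vector speaker embedding passed via forward_params. The return value also flips from (audio, rate) to (rate, audio) because Gradio's gr.Audio component expects a (sampling_rate, data) tuple. Below is a minimal standalone sketch of the new TTS path, assuming transformers, datasets, torch, and sentencepiece are installed; the caption string is a stand-in for BLIP output:

import torch
import numpy as np
from datasets import load_dataset
from transformers import pipeline

# SpeechT5 conditions generation on an x-vector speaker embedding
synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

speech = synthesiser(
    "a dog running through a field of grass",  # stand-in for a BLIP caption
    forward_params={"speaker_embeddings": speaker_embedding},
)

# The pipeline returns {"audio": ndarray, "sampling_rate": int},
# which maps directly onto Gradio's (rate, audio) output tuple
audio = np.array(speech["audio"])
print(speech["sampling_rate"], audio.shape)

Index 7306 is the speaker commonly used in the Hugging Face examples for this dataset; any other row selects a different voice the same way.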