preston-cell committed on
Commit
516718c
·
verified ·
1 Parent(s): 497cd4c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -18
app.py CHANGED
@@ -1,35 +1,39 @@
import gradio as gr

from transformers import pipeline

# NOTE(review): openai/whisper-large-v3-turbo is a speech-*recognition*
# (audio -> text) model; loading it under the "text-to-speech" task is almost
# certainly wrong — confirm and swap in a real TTS checkpoint.
speech_model = pipeline("text-to-speech", model="openai/whisper-large-v3-turbo")

# BLIP model for image captioning (image -> text).
caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")


def generate_caption_and_speech(image):
    """Caption *image* and synthesize speech from that caption.

    Returns ``(audio, caption)`` on success, or ``(error_message, "")``
    if either model call fails.
    """
    try:
        # Generate a caption from the image.
        caption = caption_model(image)[0]['generated_text']

        # Generate speech audio from the caption text.
        speech = speech_model(caption)

        # Return both the audio and the caption.
        # NOTE(review): speech["audio"] is raw audio data, but the gr.Audio
        # output below is declared type="filepath" — likely a mismatch; verify.
        return speech["audio"], caption
    except Exception as e:
        return str(e), ""


# Set up the Gradio interface.
iface = gr.Interface(
    fn=generate_caption_and_speech,
    inputs=gr.Image(type='pil', label="Upload Image"),
    outputs=[
        gr.Audio(type="filepath", label="Generated Audio"),
        "text",  # fixed: original had a stray ')' after "text" — a syntax error
    ],
    title="SeeSay: Image to Speech",
    description="Upload an image to generate a caption and hear it described with speech."
)

iface.launch(share=True)
 
 
import gradio as gr
import torch
from transformers import pipeline

# Image -> text: BLIP captioning model.
captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

# Text -> audio pipeline.
# NOTE(review): openai/whisper-large-v3-turbo is a speech-*recognition*
# (audio -> text) model; requesting it for the "text-to-speech" task looks
# like a mistake — confirm and replace with an actual TTS checkpoint.
pipe = pipeline("text-to-speech", model="openai/whisper-large-v3-turbo")
11
def launch(input_image):
    """Caption an uploaded image and synthesize speech from the caption.

    Parameters
    ----------
    input_image : PIL.Image
        Image supplied by the ``gr.Image(type='pil')`` input component.

    Returns
    -------
    tuple
        ``((sample_rate, audio_array), caption)`` on success — the
        ``(rate, data)`` order is what ``gr.Audio(type="numpy")`` expects.
        On failure, ``(None, error_message)`` so the audio widget stays
        empty instead of being handed an error string.
    """
    try:
        # Step 1: image -> caption text.
        caption = captioner(input_image)[0]['generated_text']

        # Step 2: caption text -> speech audio.
        speech = pipe(caption)
        audio_data = speech['audio']
        sample_rate = speech['sampling_rate']

        # Fixed: gr.Audio(type="numpy") expects (sample_rate, data),
        # not (data, sample_rate) as the original returned.
        return (sample_rate, audio_data), caption
    except Exception as e:
        # Fixed: don't route the error string into the audio output
        # (gr.Audio cannot render a string); show it in the textbox instead.
        return None, f"Error in processing: {e}"
26
# Build and serve the Gradio UI.
image_input = gr.Image(type='pil', label="Upload an Image")
demo_outputs = [
    gr.Audio(type="numpy", label="Generated Audio"),
    gr.Textbox(label="Generated Caption"),
]

iface = gr.Interface(
    fn=launch,
    inputs=image_input,
    outputs=demo_outputs,
    title="Whisper Image to Audio",
    description="Upload an image to generate a caption and hear it described with speech."
)

iface.launch(share=True)