preston-cell committed on
Commit
ed4af8f
·
verified ·
1 Parent(s): f67b2c3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -26
app.py CHANGED
@@ -1,47 +1,40 @@
1
  import gradio as gr
2
  from transformers import pipeline
3
  import numpy as np
4
- import torch
5
 
6
- # Load the image-to-text model
7
- captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
8
 
9
- # Load the Whisper model for speech synthesis
10
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
11
- torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
12
 
13
- model_id = "openai/whisper-large-v3-turbo"
14
- model = pipeline("text-to-speech", model=model_id, device=device)
15
-
16
-
17
- def process_image(input_image):
18
  try:
19
- # Step 1: Generate caption
20
- caption = captioner(input_image)[0]['generated_text']
21
 
22
- # Step 2: Convert caption to speech
23
- speech_output = model(caption)
24
- audio_data = np.array(speech_output["audio"]).astype(np.float32)
25
- sample_rate = speech_output["sampling_rate"]
26
-
27
- return (audio_data, sample_rate), caption
28
 
 
 
29
  except Exception as e:
30
- return str(e)
31
-
32
 
33
- # Create Gradio Interface
34
  iface = gr.Interface(
35
  fn=process_image,
36
- inputs=gr.Image(type='pil', label="Upload Image"),
37
  outputs=[
38
- gr.Audio(label="Generated Speech"),
39
  gr.Textbox(label="Generated Caption")
40
  ],
41
- title="Image to Audio with Caption",
42
  description="Upload an image to generate a caption and hear it described with speech."
43
  )
44
 
45
- iface.launch(share=True)
46
 
47
 
 
1
  import gradio as gr
2
  from transformers import pipeline
3
  import numpy as np
 
4
 
5
# Load the image captioning model (BLIP) once at import time so that every
# request reuses the same pipeline object.
caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

# Load the text-to-speech model.
# NOTE: Whisper checkpoints are speech-to-text (ASR) models and cannot be
# loaded by the "text-to-speech" pipeline, so a real TTS checkpoint is used
# here instead. Swap in another text-to-speech checkpoint if preferred.
speech_model = pipeline("text-to-speech", model="suno/bark-small")
 
10
 
11
def process_image(image):
    """Caption an image and synthesize the caption as speech.

    Args:
        image: The uploaded image; it is passed unchanged to
            ``caption_model``.

    Returns:
        A two-item tuple matching the interface's two outputs (audio, text).
        On success it is ``((sample_rate, samples), caption)``; the audio
        tuple is ordered sample rate first because that is the format
        ``gr.Audio`` accepts. On failure it is ``(None, message)`` so the
        audio player stays empty and the error is shown in the caption box.
    """
    try:
        # Step 1: Generate a caption from the image.
        caption = caption_model(image)[0]['generated_text']

        # Step 2: Convert the caption to speech.
        speech = speech_model(caption)
        # Some TTS models may return audio with a leading singleton batch
        # axis (assumption, TODO confirm for the chosen checkpoint). Squeeze
        # it so Gradio does not read it as one sample with many channels.
        audio = np.squeeze(np.asarray(speech["audio"]))
        rate = speech["sampling_rate"]

        # gr.Audio expects (sample_rate, data), not (data, sample_rate).
        return (rate, audio), caption
    except Exception as e:
        # Top-level UI boundary: report the failure in the textbox rather
        # than handing a non-audio string to the Audio component.
        return None, f"Error generating caption or audio: {e}"
 
25
 
26
# Gradio UI wiring: one image input; the synthesized audio and the caption
# text are the two outputs, in the order process_image returns them.
iface = gr.Interface(
    title="SeeSay",
    description="Upload an image to generate a caption and hear it described with speech.",
    fn=process_image,
    inputs=gr.Image(label="Upload an Image", type="pil"),
    outputs=[
        gr.Audio(label="Generated Audio"),
        gr.Textbox(label="Generated Caption"),
    ],
)

# Start serving the interface.
iface.launch()
39
 
40