preston-cell committed
Commit f67b2c3 · verified · 1 Parent(s): 1b7018a

Update app.py

Files changed (1)
  1. app.py +24 -18
app.py CHANGED
@@ -1,41 +1,47 @@
  import gradio as gr
- import torch
  from transformers import pipeline
+ import numpy as np
+ import torch

- # Load the image captioning pipeline
+ # Load the image-to-text model
  captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

- # Load Whisper model for speech synthesis
- pipe = pipeline("text-to-speech", model="openai/whisper-large-v3-turbo")
+ # Load the Whisper model for speech synthesis
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+ model_id = "openai/whisper-large-v3-turbo"
+ model = pipeline("text-to-speech", model=model_id, device=device)

- def launch(input_image):
+
+ def process_image(input_image):
      try:
          # Step 1: Generate caption
-         out = captioner(input_image)
-         caption = out[0]['generated_text']
-
+         caption = captioner(input_image)[0]['generated_text']

-         # Step 2: Synthesize speech from caption
-         speech = pipe(caption)
-         audio_data = speech['audio']
-         sample_rate = speech['sampling_rate']
+         # Step 2: Convert caption to speech
+         speech_output = model(caption)
+         audio_data = np.array(speech_output["audio"]).astype(np.float32)
+         sample_rate = speech_output["sampling_rate"]

          return (audio_data, sample_rate), caption

      except Exception as e:
-         return str(e), "Error in processing."
+         return str(e)
+

- # Gradio UI
+ # Create Gradio Interface
  iface = gr.Interface(
-     fn=launch,
-     inputs=gr.Image(type='pil', label="Upload an Image"),
+     fn=process_image,
+     inputs=gr.Image(type='pil', label="Upload Image"),
      outputs=[
-         gr.Audio(type="numpy", label="Generated Audio"),
+         gr.Audio(label="Generated Speech"),
          gr.Textbox(label="Generated Caption")
      ],
-     title="Whisper Image to Audio",
+     title="Image to Audio with Caption",
      description="Upload an image to generate a caption and hear it described with speech."
  )

  iface.launch(share=True)
+
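Note: openai/whisper-large-v3-turbo is a speech-recognition (audio-to-text) checkpoint, so building a "text-to-speech" pipeline from it is expected to fail when the model loads; the caption needs a dedicated TTS checkpoint instead. Below is a minimal sketch of the synthesis step, using suno/bark-small purely as an illustrative stand-in checkpoint and a hypothetical synthesize helper:

import numpy as np
from transformers import pipeline

# Assumption: suno/bark-small stands in for any checkpoint that actually
# supports the "text-to-speech" task; Whisper checkpoints do not.
tts = pipeline("text-to-speech", model="suno/bark-small")

def synthesize(caption):
    # The text-to-speech pipeline returns {"audio": ndarray, "sampling_rate": int}
    speech = tts(caption)
    audio = np.asarray(speech["audio"], dtype=np.float32).squeeze()
    # gr.Audio expects numpy audio as (sample_rate, data), so the rate goes first
    return speech["sampling_rate"], audio

Two smaller issues in the committed version: the success path returns (audio_data, sample_rate) while gr.Audio expects the tuple ordered as (sample_rate, data), and the except branch returns a single string although the interface declares two outputs, so Gradio would receive the wrong number of return values on errors.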