preston-cell committed on
Commit
ba2d445
·
verified ·
1 Parent(s): 6668036

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -4
app.py CHANGED
@@ -4,12 +4,29 @@ from transformers import pipeline
4
 
5
  pipe = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
6
 
 
 
 
7
  def launch(input):
8
- out = pipe(input)
9
- return out[0]['generated_text']
 
 
 
 
 
 
 
 
 
 
10
 
11
  iface = gr.Interface(launch,
12
- inputs=gr.Image(type='pil'),
13
- outputs="text")
 
 
 
 
14
 
15
  iface.launch()
 
4
 
5
  pipe = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
6
 
7
+ narrator = pipeline("text-to-speech",
8
+ model="./models/kakao-enterprise/vits-ljs")
9
+
10
  def launch(input):
11
+ # Step 1: Extract text from image
12
+ caption = pipe(input_image)[0]['generated_text']
13
+
14
+ # Step 2: Generate speech from the caption
15
+ audio_output = narrator(caption)
16
+
17
+ # Step 3: Save the audio to a temporary file
18
+ audio_data = audio_output["audio"]
19
+ sampling_rate = audio_output["sampling_rate"]
20
+
21
+ # Gradio expects a tuple: (numpy_array, sampling_rate)
22
+ return (np.array(audio_data), sampling_rate)
23
 
24
  iface = gr.Interface(launch,
25
+ fn=launch,
26
+ inputs=gr.Image(type='pil'),
27
+ outputs=gr.Audio(type="numpy", label="Narrated Output"),
28
+ title="SeeSay",
29
+ description="Upload an image to hear its context narrated aloud."
30
+ )
31
 
32
  iface.launch()