preston-cell committed on
Commit
83cd235
·
verified ·
1 Parent(s): c770d2e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -9
app.py CHANGED
@@ -1,25 +1,32 @@
1
- import gradio as gr
2
  # Use a pipeline as a high-level helper
3
  from transformers import pipeline
4
 
5
- captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
6
- narrator = pipeline("text-to-speech", model="facebook/fastspeech2-en-ljspeech")
 
7
 
8
  def launch(input_image):
9
- caption = captioner(img)[0]["generated_text"]
 
 
 
10
  audio_output = narrator(caption)
11
- return (np.array(audio_output["audio"]), audio_output["sampling_rate"]), caption
 
 
 
 
12
 
13
  # Use dictionary to avoid conflicting argument ordering
14
  iface = gr.Interface(
15
  fn=launch,
16
- inputs=gr.Image(type="pil", label="Upload an image"),
17
  outputs=[
18
  gr.Audio(type="numpy", label="Narrated Audio"),
19
- gr.Textbox(label="Image Caption")
20
  ],
21
- title="🧠 SeeSay",
22
- description="Upload an image to hear its description spoken aloud."
23
  )
24
 
25
  iface.launch(share=True)
 
 
1
  # Use a pipeline as a high-level helper
2
  from transformers import pipeline
3
 
4
+ pipe = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
5
+
6
+ narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
7
 
8
  def launch(input_image):
9
+ # Step 1: Extract caption
10
+ caption = pipe(input_image)[0]["generated_text"]
11
+
12
+ # Step 2: Convert caption to audio
13
  audio_output = narrator(caption)
14
+ audio_array = np.array(audio_output["audio"])
15
+ sample_rate = audio_output["sampling_rate"]
16
+
17
+ # Step 3: Return audio + caption
18
+ return (audio_array, sample_rate), caption
19
 
20
  # Use dictionary to avoid conflicting argument ordering
21
  iface = gr.Interface(
22
  fn=launch,
23
+ inputs=gr.Image(type='pil', label="Upload Image"),
24
  outputs=[
25
  gr.Audio(type="numpy", label="Narrated Audio"),
26
+ gr.Textbox(label="Extracted Caption")
27
  ],
28
+ title="SeeSay",
29
+ description="Upload an image to hear its context narrated aloud."
30
  )
31
 
32
  iface.launch(share=True)