preston-cell committed on
Commit
c770d2e
·
verified ·
1 Parent(s): 9da8728

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -16
app.py CHANGED
@@ -2,32 +2,24 @@ import gradio as gr
2
  # Use a pipeline as a high-level helper
3
  from transformers import pipeline
4
 
5
- pipe = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
6
-
7
- narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
8
 
9
  def launch(input_image):
10
- # Step 1: Extract caption
11
- caption = pipe(input_image)[0]["generated_text"]
12
-
13
- # Step 2: Convert caption to audio
14
  audio_output = narrator(caption)
15
- audio_array = np.array(audio_output["audio"])
16
- sample_rate = audio_output["sampling_rate"]
17
-
18
- # Step 3: Return audio + caption
19
- return (audio_array, sample_rate), caption
20
 
21
  # Use dictionary to avoid conflicting argument ordering
22
  iface = gr.Interface(
23
  fn=launch,
24
- inputs=gr.Image(type='pil', label="Upload Image"),
25
  outputs=[
26
  gr.Audio(type="numpy", label="Narrated Audio"),
27
- gr.Textbox(label="Extracted Caption")
28
  ],
29
- title="SeeSay",
30
- description="Upload an image to hear its context narrated aloud."
31
  )
32
 
33
  iface.launch(share=True)
 
2
  # Use a pipeline as a high-level helper
3
  from transformers import pipeline
4
 
5
+ captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
6
+ narrator = pipeline("text-to-speech", model="facebook/fastspeech2-en-ljspeech")
 
7
 
8
  def launch(input_image):
9
+ caption = captioner(img)[0]["generated_text"]
 
 
 
10
  audio_output = narrator(caption)
11
+ return (np.array(audio_output["audio"]), audio_output["sampling_rate"]), caption
 
 
 
 
12
 
13
  # Use dictionary to avoid conflicting argument ordering
14
  iface = gr.Interface(
15
  fn=launch,
16
+ inputs=gr.Image(type="pil", label="Upload an image"),
17
  outputs=[
18
  gr.Audio(type="numpy", label="Narrated Audio"),
19
+ gr.Textbox(label="Image Caption")
20
  ],
21
+ title="🧠 SeeSay",
22
+ description="Upload an image to hear its description spoken aloud."
23
  )
24
 
25
  iface.launch(share=True)