preston-cell committed
Commit 1129fe7 · verified · 1 Parent(s): 1af6a32

Update app.py

Files changed (1):
  app.py  +17 -20
app.py CHANGED
@@ -1,36 +1,33 @@
 import gradio as gr
 from transformers import pipeline
 
-pipe = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
-
-from phonemizer.backend.espeak.wrapper import EspeakWrapper
+# Captioning model
+captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
 
-EspeakWrapper.set_library('C:\Program Files\eSpeak NG\libespeak-ng.dll')
-
-narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
+# Bark TTS model
+synthesiser = pipeline("text-to-speech", model="suno/bark")
 
 def launch(input_image):
-    # Step 1: Extract caption
-    caption = pipe(input_image)[0]["generated_text"]
+    # Step 1: Generate caption
+    caption = captioner(input_image)[0]['generated_text']
 
-    # Step 2: Convert caption to audio
-    audio_output = narrator(caption)
-    audio_array = np.array(audio_output["audio"])
-    sample_rate = audio_output["sampling_rate"]
+    # Step 2: Synthesize speech from caption
+    speech = synthesiser(caption, forward_params={"do_sample": True})
+    audio = np.array(speech["audio"])
+    rate = speech["sampling_rate"]
 
-    # Step 3: Return audio + caption
-    return (audio_array, sample_rate), caption
+    return (audio, rate), caption
 
-# Use dictionary to avoid conflicting argument ordering
+# Gradio UI
 iface = gr.Interface(
     fn=launch,
-    inputs=gr.Image(type='pil', label="Upload Image"),
+    inputs=gr.Image(type="pil", label="Upload Image"),
     outputs=[
         gr.Audio(type="numpy", label="Narrated Audio"),
-        gr.Textbox(label="Extracted Caption")
+        gr.Textbox(label="Generated Caption")
     ],
-    title="SeeSay",
-    description="Upload an image to hear its context narrated aloud."
+    title="🎙️ SeeSay",
+    description="Upload an image to hear it described with expressive speech."
 )
 
-iface.launch(share=True)
+iface.launch(share = True)
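
Note: even after this commit, app.py calls np.array() without ever importing NumPy, so launch() raises a NameError at runtime; gr.Audio with type="numpy" also expects a (sample_rate, data) tuple rather than the (data, sample_rate) order returned here. A minimal corrected sketch of the synthesis step (the narrate() name is illustrative, and the squeeze() call is an assumption to drop a possible leading batch axis in Bark's output):

import numpy as np
from transformers import pipeline

synthesiser = pipeline("text-to-speech", model="suno/bark")

def narrate(caption):
    # The text-to-speech pipeline returns {"audio": waveform, "sampling_rate": int}.
    speech = synthesiser(caption, forward_params={"do_sample": True})
    audio = np.array(speech["audio"]).squeeze()  # drop a (1, n) batch axis if present
    # gr.Audio(type="numpy") expects (sample_rate, data), in that order.
    return speech["sampling_rate"], audio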