preston-cell committed
Commit 608498c · verified · 1 Parent(s): e35301b

Update app.py

Files changed (1)
  1. app.py +30 -18
app.py CHANGED
@@ -1,34 +1,46 @@
  import gradio as gr
  from transformers import pipeline
  import numpy as np

- # Captioning model
  captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

- # Bark TTS model
- synthesiser = pipeline("text-to-speech", model="suno/bark")

- def launch(input_image):
-     # Step 1: Generate caption
-     caption = captioner(input_image)[0]['generated_text']

-     # Step 2: Synthesize speech from caption
-     speech = synthesiser(caption, forward_params={"do_sample": True})
-     audio = np.array(speech["audio"])
-     rate = speech["sampling_rate"]

-     return (audio, rate), caption

- # Gradio UI
  iface = gr.Interface(
-     fn=launch,
-     inputs=gr.Image(type="pil", label="Upload Image"),
      outputs=[
-         gr.Audio(type="numpy", label="Narrated Audio"),
          gr.Textbox(label="Generated Caption")
      ],
-     title="🎙️ SeeSay",
-     description="Upload an image to hear it described with expressive speech."
  )

- iface.launch(share = True)
 
  import gradio as gr
  from transformers import pipeline
  import numpy as np
+ from generator import load_csm_1b
+ import torchaudio

+ # Load CSM model
+ generator = load_csm_1b(device="cpu")
+
+ # Load image-to-text model
  captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

+ def process_image(input_image):
+     try:
+         # Generate caption
+         caption = captioner(input_image)[0]['generated_text']
+
+         # Generate speech using CSM
+         audio = generator.generate(
+             text=caption,
+             speaker=0,
+             context=[],
+             max_audio_length_ms=10_000,
+         )

+         # Convert the audio tensor to NumPy for Gradio
+         audio_np = audio.unsqueeze(0).cpu().numpy()

+         return (audio_np, generator.sample_rate), caption

+     except Exception as e:
+         return str(e), "Error generating caption or speech."

+ # Set up Gradio UI
  iface = gr.Interface(
+     fn=process_image,
+     inputs=gr.Image(type='pil', label="Upload Image"),
      outputs=[
+         gr.Audio(type="numpy", label="Generated Speech"),
          gr.Textbox(label="Generated Caption")
      ],
+     title="🎙️ SeeSay with CSM",
+     description="Upload an image to generate a caption and hear it narrated using CSM."
  )

+ iface.launch(share=True)
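
The updated file imports torchaudio but never calls it. Below is a minimal sketch of how the new speech step could be exercised on its own, assuming the same generator module and load_csm_1b() that app.py imports (their implementation is not part of this diff); the caption string and output filename are placeholders.

import torchaudio
from generator import load_csm_1b

# Load the CSM generator on CPU, as in the updated app.py.
generator = load_csm_1b(device="cpu")

# Placeholder caption; in the app this string comes from the BLIP captioner.
caption = "a dog running along the beach"

# Same call as in process_image(): single speaker, no context, up to 10 s of audio.
audio = generator.generate(
    text=caption,
    speaker=0,
    context=[],
    max_audio_length_ms=10_000,
)

# generate() returns a 1-D waveform tensor; add a channel dimension to save it.
torchaudio.save("caption.wav", audio.unsqueeze(0).cpu(), generator.sample_rate)

# For gr.Audio(type="numpy"), Gradio documents the value as a
# (sample_rate, data) tuple, so the waveform could also be returned as:
gradio_audio = (generator.sample_rate, audio.cpu().numpy())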