Spaces:

preston-cell
/

image-text-to-text

Sleeping

preston-cell committed on Mar 27

Commit

608498c

verified ·

1 Parent(s): e35301b

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,34 +1,46 @@
 import gradio as gr
 from transformers import pipeline
 import numpy as np
-# Captioning model
 captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
-# Bark TTS model
-synthesiser = pipeline("text-to-speech", model="suno/bark")
-def launch(input_image):
-    # Step 1: Generate caption
-    caption = captioner(input_image)[0]['generated_text']
-    # Step 2: Synthesize speech from caption
-    speech = synthesiser(caption, forward_params={"do_sample": True})
-    audio = np.array(speech["audio"])
-    rate = speech["sampling_rate"]
-    return (audio, rate), caption
-# Gradio UI
 iface = gr.Interface(
-    fn=launch,
-    inputs=gr.Image(type="pil", label="Upload Image"),
     outputs=[
-        gr.Audio(type="numpy", label="Narrated Audio"),
         gr.Textbox(label="Generated Caption")
     ],
-    title="🎙️ SeeSay",
-    description="Upload an image to hear it described with expressive speech."
 )
-iface.launch(share = True)

 import gradio as gr
 from transformers import pipeline
 import numpy as np
+from generator import load_csm_1b
+import torchaudio
+# Load CSM model
+generator = load_csm_1b(device="cpu")
+# Load image-to-text model
 captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
+def process_image(input_image):
+    """Caption an image and narrate the caption with the CSM generator.
+
+    Args:
+        input_image: Image handed over by the Gradio image input.
+
+    Returns:
+        A pair ``(audio, caption)`` for the interface's two outputs.
+        On success ``audio`` is ``(sample_rate, samples)``, the order
+        ``gr.Audio(type="numpy")`` expects. On failure ``audio`` is
+        ``None`` (leaves the audio player empty) and the caption slot
+        carries the error message instead.
+    """
+    try:
+        # Generate caption
+        caption = captioner(input_image)[0]['generated_text']
+        # Generate speech using CSM
+        audio = generator.generate(
+            text=caption,
+            speaker=0,
+            context=[],
+            max_audio_length_ms=10_000,
+        )
+        # Convert the audio tensor to NumPy for Gradio. No unsqueeze here:
+        # Gradio reads a 2-D array as (samples, channels), so a (1, N)
+        # array would be treated as one sample with N channels.
+        # NOTE(review): assumes generate() returns a 1-D waveform - confirm.
+        audio_np = audio.cpu().numpy()
+        # Gradio's numpy audio format is (sample_rate, data), in that order.
+        return (generator.sample_rate, audio_np), caption
+    except Exception as e:
+        # UI boundary: report the failure in the textbox. A plain string is
+        # not a valid value for the Audio output, so return None there.
+        return None, f"Error generating caption or speech: {e}"
+# Set up Gradio UI: one image-upload input wired to process_image, and two
+# outputs listed in the same order as process_image's return pair
+# (audio first, caption text second). launch() is called at import time
+# and requests a public share link.
 iface = gr.Interface(
+    fn=process_image,
+    inputs=gr.Image(type='pil', label="Upload Image"),
     outputs=[
+        gr.Audio(type="numpy", label="Generated Speech"),
         gr.Textbox(label="Generated Caption")
     ],
+    title="🎙️ SeeSay with CSM",
+    description="Upload an image to generate a caption and hear it narrated using CSM."
 )
+iface.launch(share=True)