preston-cell committed
Commit 1129fe7 · verified · 1 Parent(s): 1af6a32

Update app.py

Files changed (1):
  app.py  +17 -20
app.py CHANGED
@@ -1,36 +1,33 @@
 import gradio as gr
 from transformers import pipeline
 
-pipe = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
-
-from phonemizer.backend.espeak.wrapper import EspeakWrapper
+# Captioning model
+captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
 
-EspeakWrapper.set_library('C:\Program Files\eSpeak NG\libespeak-ng.dll')
-
-narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
+# Bark TTS model
+synthesiser = pipeline("text-to-speech", model="suno/bark")
 
 def launch(input_image):
-    # Step 1: Extract caption
-    caption = pipe(input_image)[0]["generated_text"]
+    # Step 1: Generate caption
+    caption = captioner(input_image)[0]['generated_text']
 
-    # Step 2: Convert caption to audio
-    audio_output = narrator(caption)
-    audio_array = np.array(audio_output["audio"])
-    sample_rate = audio_output["sampling_rate"]
+    # Step 2: Synthesize speech from caption
+    speech = synthesiser(caption, forward_params={"do_sample": True})
+    audio = np.array(speech["audio"])
+    rate = speech["sampling_rate"]
 
-    # Step 3: Return audio + caption
-    return (audio_array, sample_rate), caption
+    return (audio, rate), caption
 
-# Use dictionary to avoid conflicting argument ordering
+# Gradio UI
 iface = gr.Interface(
     fn=launch,
-    inputs=gr.Image(type='pil', label="Upload Image"),
+    inputs=gr.Image(type="pil", label="Upload Image"),
     outputs=[
         gr.Audio(type="numpy", label="Narrated Audio"),
-        gr.Textbox(label="Extracted Caption")
+        gr.Textbox(label="Generated Caption")
     ],
-    title="SeeSay",
-    description="Upload an image to hear its context narrated aloud."
+    title="🎙️ SeeSay",
+    description="Upload an image to hear it described with expressive speech."
 )
 
-iface.launch(share=True)
+iface.launch(share = True)
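
Note: even after this commit, app.py calls np.array() without ever importing NumPy, so launch() raises a NameError at runtime; gr.Audio with type="numpy" also expects a (sample_rate, data) tuple rather than the (data, sample_rate) order returned here. A minimal corrected sketch of the synthesis step (the narrate() name is illustrative, and the squeeze() call is an assumption to drop a possible leading batch axis in Bark's output):

import numpy as np
from transformers import pipeline

synthesiser = pipeline("text-to-speech", model="suno/bark")

def narrate(caption):
    # The text-to-speech pipeline returns {"audio": waveform, "sampling_rate": int}.
    speech = synthesiser(caption, forward_params={"do_sample": True})
    audio = np.array(speech["audio"]).squeeze()  # drop a (1, n) batch axis if present
    # gr.Audio(type="numpy") expects (sample_rate, data), in that order.
    return speech["sampling_rate"], audio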