preston-cell committed on
Commit
83cd235
·
verified ·
1 Parent(s): c770d2e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -9
app.py CHANGED
@@ -1,25 +1,32 @@
1
- import gradio as gr
2
  # Use a pipeline as a high-level helper
3
  from transformers import pipeline
4
 
5
- captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
6
- narrator = pipeline("text-to-speech", model="facebook/fastspeech2-en-ljspeech")
 
7
 
8
  def launch(input_image):
9
- caption = captioner(img)[0]["generated_text"]
 
 
 
10
  audio_output = narrator(caption)
11
- return (np.array(audio_output["audio"]), audio_output["sampling_rate"]), caption
 
 
 
 
12
 
13
  # Use dictionary to avoid conflicting argument ordering
14
  iface = gr.Interface(
15
  fn=launch,
16
- inputs=gr.Image(type="pil", label="Upload an image"),
17
  outputs=[
18
  gr.Audio(type="numpy", label="Narrated Audio"),
19
- gr.Textbox(label="Image Caption")
20
  ],
21
- title="🧠 SeeSay",
22
- description="Upload an image to hear its description spoken aloud."
23
  )
24
 
25
  iface.launch(share=True)
 
 
1
  # Use a pipeline as a high-level helper
2
  from transformers import pipeline
3
 
4
+ pipe = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
5
+
6
+ narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
7
 
8
  def launch(input_image):
9
+ # Step 1: Extract caption
10
+ caption = pipe(input_image)[0]["generated_text"]
11
+
12
+ # Step 2: Convert caption to audio
13
  audio_output = narrator(caption)
14
+ audio_array = np.array(audio_output["audio"])
15
+ sample_rate = audio_output["sampling_rate"]
16
+
17
+ # Step 3: Return audio + caption
18
+ return (audio_array, sample_rate), caption
19
 
20
  # Use dictionary to avoid conflicting argument ordering
21
  iface = gr.Interface(
22
  fn=launch,
23
+ inputs=gr.Image(type='pil', label="Upload Image"),
24
  outputs=[
25
  gr.Audio(type="numpy", label="Narrated Audio"),
26
+ gr.Textbox(label="Extracted Caption")
27
  ],
28
+ title="SeeSay",
29
+ description="Upload an image to hear its context narrated aloud."
30
  )
31
 
32
  iface.launch(share=True)