Spaces:

preston-cell
/

image-text-to-text

Sleeping

preston-cell committed on Mar 21

Commit

ba2d445

verified ·

1 Parent(s): 6668036

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -4,12 +4,29 @@ from transformers import pipeline
 pipe = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
 def launch(input):
-    out = pipe(input)
-    return out[0]['generated_text']
 iface = gr.Interface(launch,
-                     inputs=gr.Image(type='pil'),
-                     outputs="text")
 iface.launch()

 pipe = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
+# Text-to-speech pipeline used by launch() to narrate the generated caption.
+# NOTE(review): the model is loaded from a local "./models/..." directory;
+# confirm that directory ships with the Space, otherwise loading will fail.
+narrator = pipeline("text-to-speech",
+                    model="./models/kakao-enterprise/vits-ljs")
 def launch(input):
+    # Step 1: Extract text from image
+    caption = pipe(input_image)[0]['generated_text']
+    # Step 2: Generate speech from the caption
+    audio_output = narrator(caption)
+    # Step 3: Save the audio to a temporary file
+    audio_data = audio_output["audio"]
+    sampling_rate = audio_output["sampling_rate"]
+    # Gradio expects a tuple: (numpy_array, sampling_rate)
+    return (np.array(audio_data), sampling_rate)
 iface = gr.Interface(launch,
+    fn=launch,
+    inputs=gr.Image(type='pil'),
+    outputs=gr.Audio(type="numpy", label="Narrated Output"),
+    title="SeeSay",
+    description="Upload an image to hear its context narrated aloud."
+                    )
 iface.launch()