preston-cell committed on
Commit
516718c
·
verified ·
1 Parent(s): 497cd4c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -18
app.py CHANGED
@@ -1,35 +1,39 @@
import gradio as gr

from transformers import pipeline

# NOTE(review): openai/whisper-large-v3-turbo is a speech-*recognition*
# (audio -> text) model; loading it under the "text-to-speech" task is almost
# certainly wrong — confirm and swap in a real TTS checkpoint.
speech_model = pipeline("text-to-speech", model="openai/whisper-large-v3-turbo")

# BLIP model for image captioning (image -> text).
caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")


def generate_caption_and_speech(image):
    """Caption *image* and synthesize speech from that caption.

    Returns ``(audio, caption)`` on success, or ``(error_message, "")``
    if either model call fails.
    """
    try:
        # Generate a caption from the image.
        caption = caption_model(image)[0]['generated_text']

        # Generate speech audio from the caption text.
        speech = speech_model(caption)

        # Return both the audio and the caption.
        # NOTE(review): speech["audio"] is raw audio data, but the gr.Audio
        # output below is declared type="filepath" — likely a mismatch; verify.
        return speech["audio"], caption
    except Exception as e:
        return str(e), ""


# Set up the Gradio interface.
iface = gr.Interface(
    fn=generate_caption_and_speech,
    inputs=gr.Image(type='pil', label="Upload Image"),
    outputs=[
        gr.Audio(type="filepath", label="Generated Audio"),
        "text",  # fixed: original had a stray ')' after "text" — a syntax error
    ],
    title="SeeSay: Image to Speech",
    description="Upload an image to generate a caption and hear it described with speech."
)

iface.launch(share=True)
 
 
import gradio as gr
import torch
from transformers import pipeline

# Image -> text: BLIP captioning model.
captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

# Text -> audio pipeline.
# NOTE(review): openai/whisper-large-v3-turbo is a speech-*recognition*
# (audio -> text) model; requesting it for the "text-to-speech" task looks
# like a mistake — confirm and replace with an actual TTS checkpoint.
pipe = pipeline("text-to-speech", model="openai/whisper-large-v3-turbo")
11
def launch(input_image):
    """Caption an uploaded image and synthesize speech from the caption.

    Parameters
    ----------
    input_image : PIL.Image
        Image supplied by the ``gr.Image(type='pil')`` input component.

    Returns
    -------
    tuple
        ``((sample_rate, audio_array), caption)`` on success — the
        ``(rate, data)`` order is what ``gr.Audio(type="numpy")`` expects.
        On failure, ``(None, error_message)`` so the audio widget stays
        empty instead of being handed an error string.
    """
    try:
        # Step 1: image -> caption text.
        caption = captioner(input_image)[0]['generated_text']

        # Step 2: caption text -> speech audio.
        speech = pipe(caption)
        audio_data = speech['audio']
        sample_rate = speech['sampling_rate']

        # Fixed: gr.Audio(type="numpy") expects (sample_rate, data),
        # not (data, sample_rate) as the original returned.
        return (sample_rate, audio_data), caption
    except Exception as e:
        # Fixed: don't route the error string into the audio output
        # (gr.Audio cannot render a string); show it in the textbox instead.
        return None, f"Error in processing: {e}"
26
# Build and serve the Gradio UI.
image_input = gr.Image(type='pil', label="Upload an Image")
demo_outputs = [
    gr.Audio(type="numpy", label="Generated Audio"),
    gr.Textbox(label="Generated Caption"),
]

iface = gr.Interface(
    fn=launch,
    inputs=image_input,
    outputs=demo_outputs,
    title="Whisper Image to Audio",
    description="Upload an image to generate a caption and hear it described with speech."
)

iface.launch(share=True)