preston-cell committed
Commit 4f56a4b · verified · 1 Parent(s): ed4af8f

Update app.py

Files changed (1):
  app.py +25 -13
app.py CHANGED
@@ -1,27 +1,41 @@
 import gradio as gr
 from transformers import pipeline
+from datasets import load_dataset
+import torch
 import numpy as np
 
-# Load image captioning model
+# Load BLIP model for image captioning
 caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
 
-# Load Whisper TTS model
-speech_model = pipeline("text-to-speech", model="openai/whisper-large-v3-turbo")
+# Load SpeechT5 model for text-to-speech
+synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")
+
+# Load a speaker embedding
+embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+
 
 def process_image(image):
     try:
-        # Step 1: Generate caption from image
+        # Generate caption from the image
         caption = caption_model(image)[0]['generated_text']
 
-        # Step 2: Convert caption to speech using Whisper
-        speech = speech_model(caption)
+        # Convert caption to speech
+        speech = synthesiser(
+            caption,
+            forward_params={"speaker_embeddings": speaker_embedding}
+        )
+
+        # Prepare audio data
         audio = np.array(speech["audio"])
         rate = speech["sampling_rate"]
 
-        # Return both the audio and the caption
-        return (audio, rate), caption
+        # Return both audio and caption
+        return (rate, audio), caption
+
     except Exception as e:
-        return str(e), "Error generating caption or audio."
+        return None, f"Error: {str(e)}"
+
 
 # Gradio Interface
 iface = gr.Interface(
@@ -31,10 +45,8 @@ iface = gr.Interface(
         gr.Audio(label="Generated Audio"),
         gr.Textbox(label="Generated Caption")
     ],
-    title="SeeSay",
-    description="Upload an image to generate a caption and hear it described with speech."
+    title="SeeSay with SpeechT5",
+    description="Upload an image to generate a caption and hear it described with SpeechT5's speech synthesis."
 )
 
 iface.launch()
-
-
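
Why this change works: openai/whisper-large-v3-turbo is a speech-recognition (audio-to-text) model, so the old pipeline("text-to-speech", ...) call could not have synthesized audio. microsoft/speecht5_tts is a genuine TTS checkpoint, but it is multi-speaker: each call must be conditioned on an x-vector speaker embedding passed via forward_params. The return value also flips from (audio, rate) to (rate, audio) because Gradio's gr.Audio component expects a (sampling_rate, data) tuple. Below is a minimal standalone sketch of the new TTS path, assuming transformers, datasets, torch, and sentencepiece are installed; the caption string is a stand-in for BLIP output:

import torch
import numpy as np
from datasets import load_dataset
from transformers import pipeline

# SpeechT5 conditions generation on an x-vector speaker embedding
synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

speech = synthesiser(
    "a dog running through a field of grass",  # stand-in for a BLIP caption
    forward_params={"speaker_embeddings": speaker_embedding},
)

# The pipeline returns {"audio": ndarray, "sampling_rate": int},
# which maps directly onto Gradio's (rate, audio) output tuple
audio = np.array(speech["audio"])
print(speech["sampling_rate"], audio.shape)

Index 7306 is the speaker commonly used in the Hugging Face examples for this dataset; any other row selects a different voice the same way.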