Spaces:

preston-cell
/

image-text-to-text

Running

preston-cell committed on Mar 27

Commit

ed4af8f

verified ·

1 Parent(s): f67b2c3

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,47 +1,40 @@
 import gradio as gr
 from transformers import pipeline
 import numpy as np
-import torch
-# Load the image-to-text model
-captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
-# Load the Whisper model for speech synthesis
-device = "cuda:0" if torch.cuda.is_available() else "cpu"
-torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-model_id = "openai/whisper-large-v3-turbo"
-model = pipeline("text-to-speech", model=model_id, device=device)
-def process_image(input_image):
     try:
-        # Step 1: Generate caption
-        caption = captioner(input_image)[0]['generated_text']
-        # Step 2: Convert caption to speech
-        speech_output = model(caption)
-        audio_data = np.array(speech_output["audio"]).astype(np.float32)
-        sample_rate = speech_output["sampling_rate"]
-        return (audio_data, sample_rate), caption
     except Exception as e:
-        return str(e)
-# Create Gradio Interface
 iface = gr.Interface(
     fn=process_image,
-    inputs=gr.Image(type='pil', label="Upload Image"),
     outputs=[
-        gr.Audio(label="Generated Speech"),
         gr.Textbox(label="Generated Caption")
     ],
-    title="Image to Audio with Caption",
     description="Upload an image to generate a caption and hear it described with speech."
 )
-iface.launch(share=True)

 import gradio as gr
 from transformers import pipeline
 import numpy as np
+# Load image captioning model
+caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
+# Load the speech model for the text-to-speech step (NOTE: Whisper is a speech-recognition model, not a TTS model)
+speech_model = pipeline("text-to-speech", model="openai/whisper-large-v3-turbo")
+def process_image(image):
     try:
+        # Step 1: Generate caption from image
+        caption = caption_model(image)[0]['generated_text']
+        # Step 2: Convert caption to speech using Whisper
+        speech = speech_model(caption)
+        audio = np.array(speech["audio"])
+        rate = speech["sampling_rate"]
+        # Return both the audio and the caption
+        return (audio, rate), caption
     except Exception as e:
+        return str(e), "Error generating caption or audio."
+# Gradio Interface
 iface = gr.Interface(
     fn=process_image,
+    inputs=gr.Image(type='pil', label="Upload an Image"),
     outputs=[
+        gr.Audio(label="Generated Audio"),
         gr.Textbox(label="Generated Caption")
     ],
+    title="SeeSay",
     description="Upload an image to generate a caption and hear it described with speech."
 )
+iface.launch()