preston-cell committed on
Commit
0bf1b01
·
verified ·
1 Parent(s): 4f56a4b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -37
app.py CHANGED
@@ -1,52 +1,23 @@
1
  import gradio as gr
2
  from transformers import pipeline
3
- from datasets import load_dataset
4
- import torch
5
- import numpy as np
6
 
7
  # Load BLIP model for image captioning
8
  caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
9
 
10
- # Load SpeechT5 model for text-to-speech
11
- synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")
12
-
13
- # Load a speaker embedding
14
- embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
15
- speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
16
-
17
-
18
- def process_image(image):
19
  try:
20
- # Generate caption from the image
21
  caption = caption_model(image)[0]['generated_text']
22
-
23
- # Convert caption to speech
24
- speech = synthesiser(
25
- caption,
26
- forward_params={"speaker_embeddings": speaker_embedding}
27
- )
28
-
29
- # Prepare audio data
30
- audio = np.array(speech["audio"])
31
- rate = speech["sampling_rate"]
32
-
33
- # Return both audio and caption
34
- return (rate, audio), caption
35
-
36
  except Exception as e:
37
- return None, f"Error: {str(e)}"
38
-
39
 
40
- # Gradio Interface
41
  iface = gr.Interface(
42
- fn=process_image,
43
  inputs=gr.Image(type='pil', label="Upload an Image"),
44
- outputs=[
45
- gr.Audio(label="Generated Audio"),
46
- gr.Textbox(label="Generated Caption")
47
- ],
48
- title="SeeSay with SpeechT5",
49
- description="Upload an image to generate a caption and hear it described with SpeechT5's speech synthesis."
50
  )
51
 
52
  iface.launch()
 
1
  import gradio as gr
2
  from transformers import pipeline
 
 
 
3
 
4
# BLIP image-captioning model, exposed as a Hugging Face image-to-text pipeline.
# Loaded once at import time so every request reuses the same model instance.
caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
6
 
7
def generate_caption(image):
    """Produce a text caption for *image* with the BLIP captioning pipeline.

    Args:
        image: A PIL image supplied by the Gradio ``gr.Image`` input.

    Returns:
        The generated caption string on success, or an ``"Error: ..."``
        message string if captioning fails (shown to the user instead of
        crashing the app).
    """
    try:
        predictions = caption_model(image)
        return predictions[0]['generated_text']
    except Exception as err:  # surface the failure in the UI textbox
        return f"Error: {str(err)}"
 
13
 
14
# Wire the captioning function into a minimal Gradio UI: image in, caption out.
image_input = gr.Image(type='pil', label="Upload an Image")
caption_output = gr.Textbox(label="Generated Caption")

iface = gr.Interface(
    fn=generate_caption,
    inputs=image_input,
    outputs=caption_output,
    title="Image Captioning with BLIP",
    description="Quickly generate a caption from your uploaded image.",
)

# Start the Gradio server (blocks until the app is stopped).
iface.launch()