# SeeSay — Hugging Face Space: caption an uploaded image with BLIP and
# narrate the caption aloud using the CSM text-to-speech model.
import gradio as gr
import numpy as np
import torchaudio
from transformers import pipeline

from generator import load_csm_1b

# Load the CSM text-to-speech model on CPU (no GPU assumed for this Space).
generator = load_csm_1b(device="cpu")

# BLIP image-captioning pipeline: PIL image -> short English caption.
captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
def process_image(input_image):
    """Caption *input_image* and synthesize the caption as speech.

    Parameters
    ----------
    input_image : PIL.Image.Image
        Image uploaded through the Gradio UI (``gr.Image(type='pil')``).

    Returns
    -------
    tuple
        ``((sample_rate, waveform), caption)`` on success, where the first
        element is the ``(int, np.ndarray)`` pair that
        ``gr.Audio(type="numpy")`` expects; ``(None, error_message)`` when
        captioning or synthesis fails.
    """
    try:
        # BLIP pipeline returns a list of dicts: [{"generated_text": "..."}].
        caption = captioner(input_image)[0]['generated_text']

        # Synthesize up to 10 s of speech for the caption with CSM.
        audio = generator.generate(
            text=caption,
            speaker=0,
            context=[],
            max_audio_length_ms=10_000,
        )

        # gr.Audio(type="numpy") expects (sample_rate, 1-D waveform).
        # The original returned (waveform, sample_rate) and added a leading
        # batch dim via unsqueeze(0), producing a (1, n) array that Gradio
        # would misread as one frame with n channels.
        audio_np = audio.squeeze().cpu().numpy()
        return (generator.sample_rate, audio_np), caption
    except Exception as e:
        # Return None for the audio slot so Gradio doesn't try to render a
        # string as audio; surface the error text in the caption box instead.
        return None, f"Error generating caption or speech: {e}"
# Set up the Gradio UI: one image input, audio + caption outputs.
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type='pil', label="Upload Image"),
    outputs=[
        gr.Audio(type="numpy", label="Generated Speech"),
        gr.Textbox(label="Generated Caption"),
    ],
    title="🎙️ SeeSay with CSM",
    description="Upload an image to generate a caption and hear it narrated using CSM.",
)

# share=True publishes a temporary public URL (standard for Space demos).
iface.launch(share=True)