# preston-cell's picture
# Update app.py
# 608498c verified
# raw
# history blame
# 1.27 kB
import gradio as gr
from transformers import pipeline
import numpy as np
from generator import load_csm_1b
import torchaudio
# Speech synthesis backend: the CSM generator, requested on the CPU device.
generator = load_csm_1b(device="cpu")

# Captioning backend: a transformers image-to-text pipeline using the
# Salesforce BLIP base captioning checkpoint.
captioner = pipeline(
    task="image-to-text",
    model="Salesforce/blip-image-captioning-base",
)
def process_image(input_image):
    """Caption an image and narrate that caption with the CSM generator.

    Args:
        input_image: The image handed over by the Gradio image input.

    Returns:
        A 2-tuple ``(audio, caption)`` matching the two Gradio outputs.
        On success ``audio`` is ``(sample_rate, samples)`` and ``caption`` is
        the generated caption text. On failure ``audio`` is ``None`` (so the
        audio widget is simply left empty) and ``caption`` holds a readable
        error message that includes the exception text.
    """
    try:
        # The captioning pipeline returns a list of dicts; use the first result.
        caption = captioner(input_image)[0]['generated_text']

        # Synthesize speech for the caption (no conversational context,
        # capped at 10 seconds of audio).
        audio = generator.generate(
            text=caption,
            speaker=0,
            context=[],
            max_audio_length_ms=10_000,
        )

        # Gradio's numpy audio format wants a 1-D (mono) or
        # (samples, channels) array. Adding a leading batch axis would be
        # read as "1 sample, N channels", so drop singleton dims instead.
        # NOTE(review): assumes generate() returns a mono waveform tensor --
        # confirm against the generator implementation.
        samples = audio.detach().squeeze().cpu().numpy()

        # Gradio expects (sample_rate, data), in that order.
        return (generator.sample_rate, samples), caption
    except Exception as e:
        # Best-effort UI handler: never feed an error string to the audio
        # output (it is not valid audio); surface the message in the textbox.
        return None, f"Error generating caption or speech: {e}"
# Build the Gradio interface: one image input, two outputs (speech + caption).
image_input = gr.Image(type="pil", label="Upload Image")
speech_output = gr.Audio(type="numpy", label="Generated Speech")
caption_output = gr.Textbox(label="Generated Caption")

iface = gr.Interface(
    fn=process_image,
    inputs=image_input,
    outputs=[speech_output, caption_output],
    title="🎙️ SeeSay with CSM",
    description="Upload an image to generate a caption and hear it narrated using CSM.",
)

# Start the web app and request a public share link.
iface.launch(share=True)