# Spaces:
# Sleeping
# Sleeping
# File size: 1,274 Bytes
# f30b843 e363033 e35301b 608498c 78c45fd 608498c 1129fe7 83cd235 608498c ba2d445 608498c 83cd235 608498c 83cd235 608498c ba2d445 608498c 520c499 608498c 520c499 608498c 1129fe7 520c499 608498c 5c86456 608498c |
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 |
import gradio as gr
from transformers import pipeline
import numpy as np
from generator import load_csm_1b
import torchaudio
# Speech backend: CSM generator, loaded once at import time and kept on CPU.
generator = load_csm_1b(device="cpu")

# Captioning backend: image-to-text pipeline backed by the BLIP base checkpoint.
captioner = pipeline(
    task="image-to-text",
    model="Salesforce/blip-image-captioning-base",
)
def process_image(input_image):
    """Caption an image and narrate the caption as speech.

    The image is passed to ``captioner``; the first result's
    ``generated_text`` becomes the caption, which is then synthesized with
    ``generator.generate`` (speaker 0, empty context, at most 10 seconds).

    Args:
        input_image: The image to caption, handed to ``captioner`` unchanged.

    Returns:
        A 2-tuple ``(audio, text)`` for the two UI outputs. On success,
        ``audio`` is ``(sample_rate, samples)`` and ``text`` is the caption.
        On any failure, ``audio`` is ``None`` and ``text`` is an error
        message that includes the exception text.
    """
    try:
        # Generate caption
        caption = captioner(input_image)[0]["generated_text"]

        # Generate speech using CSM
        audio = generator.generate(
            text=caption,
            speaker=0,
            context=[],
            max_audio_length_ms=10_000,
        )

        # A numpy-typed Gradio audio output takes (sample_rate, data), with
        # data shaped (samples,) or (samples, channels). Adding a leading
        # axis would be read as one sample with many channels, so the
        # waveform is passed through without unsqueeze.
        # NOTE(review): assumes generate() returns a 1-D waveform tensor -
        # confirm against the generator module.
        samples = audio.detach().cpu().numpy()
        return (generator.sample_rate, samples), caption
    except Exception as e:
        # UI boundary: never put the error string in the audio slot (a str
        # there is treated as a file path). Clear the audio and report the
        # failure through the text output instead.
        return None, f"Error generating caption or speech: {e}"
# Set up Gradio UI: a single image upload is routed through process_image,
# whose two return values fill the audio player and the caption textbox.
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil", label="Upload Image"),
    outputs=[
        gr.Audio(type="numpy", label="Generated Speech"),
        gr.Textbox(label="Generated Caption"),
    ],
    title="🎙️ SeeSay with CSM",
    description="Upload an image to generate a caption and hear it narrated using CSM.",
)

# share=True requests a public share link in addition to the local server.
iface.launch(share=True)