import gradio as gr
from transformers import AutoProcessor, AutoModelForCausalLM, pipeline
import torch
import numpy as np
from pyannote.audio import Pipeline as VAD
import dac
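# Assumed dependencies for this Space (inferred from the imports above, not taken from the
# original file): gradio, torch, numpy, transformers, diffusers, pyannote.audio,
# descript-audio-codec (provides the `dac` package), and the nari-labs Dia package
# (provides `dia.model`, typically installed from its GitHub repository).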
# Load models with proper error handling
def load_models():
    try:
        # Ultravox via transformers (no separate package needed)
        ultra_proc = AutoProcessor.from_pretrained("fixie-ai/ultravox-v0_4", trust_remote_code=True)
        ultra_model = AutoModelForCausalLM.from_pretrained(
            "fixie-ai/ultravox-v0_4", device_map="auto", torch_dtype=torch.float16, trust_remote_code=True
        )
        # Speech emotion recognition via transformers pipeline
        emotion_pipeline = pipeline(
            "audio-classification",
            model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
            device=0 if torch.cuda.is_available() else -1,
        )
        # Audio diffusion (using diffusers instead of torch.hub for HF compatibility);
        # loaded here but not used in process_audio below
        from diffusers import DiffusionPipeline
        diff_pipe = DiffusionPipeline.from_pretrained("teticio/audio-diffusion-instrumental-hiphop-256")
        # Descript Audio Codec (44.1 kHz RVQ codec)
        from dac.utils import load_model as load_dac_model
        rvq = load_dac_model(tag="latest", model_type="44khz")
        rvq.eval()
        if torch.cuda.is_available():
            rvq = rvq.to("cuda")
        # VAD (this pyannote pipeline is gated on the Hub and may require an access token)
        vad = VAD.from_pretrained("pyannote/voice-activity-detection")
        # Dia TTS
        from dia.model import Dia
        dia = Dia.from_pretrained("nari-labs/Dia-1.6B", compute_dtype="float16")
        return ultra_proc, ultra_model, emotion_pipeline, diff_pipe, rvq, vad, dia
    except Exception as e:
        print(f"Error loading models: {e}")
        return None, None, None, None, None, None, None

# Initialize models
ultra_proc, ultra_model, emotion_pipeline, diff_pipe, rvq, vad, dia = load_models()
def process_audio(audio):
    try:
        if audio is None:
            return None, "No audio input provided"
        # Gradio's numpy audio arrives as a (sample_rate, array) tuple; also accept dataset-style dicts
        audio_array = audio[1] if isinstance(audio, tuple) else audio["array"]
        sample_rate = audio[0] if isinstance(audio, tuple) else audio["sampling_rate"]
        # Ensure audio is a float32 mono numpy array (microphone input is typically int16, possibly stereo)
        if torch.is_tensor(audio_array):
            audio_array = audio_array.numpy()
        audio_array = np.asarray(audio_array)
        if audio_array.ndim > 1:
            audio_array = audio_array.mean(axis=1)
        if np.issubdtype(audio_array.dtype, np.integer):
            audio_array = audio_array.astype(np.float32) / np.iinfo(audio_array.dtype).max
        else:
            audio_array = audio_array.astype(np.float32)
        # VAD processing (pyannote expects a (channel, time) float waveform;
        # the segments are computed here but not otherwise used)
        if vad is not None:
            speech_segments = vad({"waveform": torch.from_numpy(audio_array).unsqueeze(0), "sample_rate": sample_rate})
        # Emotion recognition
        emotion_result = "neutral"
        if emotion_pipeline is not None:
            try:
                # Pass a dict so the audio-classification pipeline can resample to the model's rate
                emotion_pred = emotion_pipeline({"raw": audio_array, "sampling_rate": sample_rate})
                emotion_result = emotion_pred[0]["label"] if emotion_pred else "neutral"
            except Exception:
                emotion_result = "neutral"
        # RVQ encode/decode round trip through the Descript Audio Codec
        if rvq is not None:
            try:
                # DAC expects a (batch, 1, time) float tensor at its native 44.1 kHz rate;
                # other sample rates fail its preprocess check and are skipped by the except below
                audio_tensor = torch.from_numpy(audio_array).float().unsqueeze(0).unsqueeze(0)
                if torch.cuda.is_available():
                    audio_tensor = audio_tensor.to("cuda")
                with torch.no_grad():
                    audio_tensor = rvq.preprocess(audio_tensor, sample_rate)
                    z, codes, latents, _, _ = rvq.encode(audio_tensor)
                    decoded_audio = rvq.decode(z)
                audio_array = decoded_audio.squeeze().cpu().numpy()
            except Exception as e:
                print(f"RVQ processing error: {e}")
        # Ultravox generation
        response_text = "I understand your audio input."
        if ultra_proc is not None and ultra_model is not None:
            try:
                # Note: Ultravox is usually driven through transformers.pipeline with a chat
                # "turns" list; this direct processor call may additionally need a text prompt
                # containing the model's audio placeholder token to work as intended.
                inputs = ultra_proc(audio=audio_array, sampling_rate=sample_rate, return_tensors="pt")
                if torch.cuda.is_available():
                    inputs = {k: v.to("cuda") if torch.is_tensor(v) else v for k, v in inputs.items()}
                with torch.no_grad():
                    outputs = ultra_model.generate(**inputs, max_new_tokens=50)
                response_text = ultra_proc.decode(outputs[0], skip_special_tokens=True)
            except Exception as e:
                print(f"Ultravox generation error: {e}")
                response_text = f"Detected emotion: {emotion_result}"
        # TTS generation
        output_audio = None
        output_sample_rate = sample_rate
        if dia is not None:
            try:
                tts_text = f"[emotion:{emotion_result}] {response_text}"
                output_audio = dia.generate(tts_text)
                if torch.is_tensor(output_audio):
                    output_audio = output_audio.cpu().numpy()
                if output_audio is not None:
                    # Dia synthesizes at its own 44.1 kHz rate, independent of the input rate
                    output_sample_rate = 44100
                    # Normalize audio, guarding against an all-silent output
                    peak = np.max(np.abs(output_audio))
                    if peak > 0:
                        output_audio = output_audio / peak * 0.95
            except Exception as e:
                print(f"TTS generation error: {e}")
        return (output_sample_rate, output_audio) if output_audio is not None else None, response_text
    except Exception as e:
        return None, f"Processing error: {str(e)}"
# Create Gradio interface
with gr.Blocks(title="Supernatural Speech AI") as demo:
    gr.Markdown("# Supernatural Speech AI Agent")
    gr.Markdown("Record audio to interact with the AI agent that understands emotions and responds naturally.")
    with gr.Row():
        with gr.Column():
            # Gradio 4+ uses `sources` (a list) instead of the older `source` argument
            audio_input = gr.Audio(sources=["microphone"], type="numpy", label="Record Audio")
            process_btn = gr.Button("Process Audio", variant="primary")
        with gr.Column():
            audio_output = gr.Audio(label="AI Response")
            text_output = gr.Textbox(label="Response Text", lines=3)
    conversation_history = gr.State([])
    process_btn.click(
        fn=process_audio,
        inputs=[audio_input],
        outputs=[audio_output, text_output]
    )

if __name__ == "__main__":
    # Gradio 4+ queue: the queue-wide knob is `default_concurrency_limit`, not `concurrency_limit`
    demo.queue(default_concurrency_limit=20, max_size=50).launch()