import gradio as gr
from transformers import AutoProcessor, AutoModelForCausalLM, pipeline
import torch
import numpy as np
from pyannote.audio import Pipeline as VAD
import dac
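# NOTE: dependency sketch (an assumption, not part of the original file). On a Space these
# imports roughly imply a requirements.txt containing: gradio, torch, numpy, transformers,
# diffusers, pyannote.audio, descript-audio-codec (which provides the `dac` module), and the
# Dia package from the nari-labs/dia repository. Pin versions as needed.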
# Load models with proper error handling
def load_models():
    try:
        # Ultravox via transformers (no separate package needed)
        ultra_proc = AutoProcessor.from_pretrained(
            "fixie-ai/ultravox-v0_4", trust_remote_code=True
        )
        ultra_model = AutoModelForCausalLM.from_pretrained(
            "fixie-ai/ultravox-v0_4",
            device_map="auto",
            torch_dtype=torch.float16,
            trust_remote_code=True,
        )
        # Speech emotion recognition via transformers pipeline
        emotion_pipeline = pipeline(
            "audio-classification",
            model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
            device=0 if torch.cuda.is_available() else -1,
        )
        # Audio diffusion (using diffusers instead of torch.hub for HF compatibility);
        # loaded here but not yet used in process_audio
        from diffusers import DiffusionPipeline
        diff_pipe = DiffusionPipeline.from_pretrained("teticio/audio-diffusion-instrumental-hiphop-256")
        # Descript Audio Codec (RVQ codec, 44.1 kHz variant)
        from dac.utils import load_model as load_dac_model
        rvq = load_dac_model(tag="latest", model_type="44khz")
        rvq.eval()
        if torch.cuda.is_available():
            rvq = rvq.to("cuda")
        # VAD (gated model; see the access-token note below)
        vad = VAD.from_pretrained("pyannote/voice-activity-detection")
        # Dia TTS
        from dia.model import Dia
        dia = Dia.from_pretrained("nari-labs/Dia-1.6B", compute_dtype="float16")
        return ultra_proc, ultra_model, emotion_pipeline, diff_pipe, rvq, vad, dia
    except Exception as e:
        print(f"Error loading models: {e}")
        return None, None, None, None, None, None, None


# Initialize models
ultra_proc, ultra_model, emotion_pipeline, diff_pipe, rvq, vad, dia = load_models()
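# NOTE (assumption): "pyannote/voice-activity-detection" is a gated model on the Hub, so
# loading it on a Space usually requires accepting its terms and passing an access token,
# e.g. read from a Space secret. A minimal sketch:
#
#   import os
#   vad = VAD.from_pretrained(
#       "pyannote/voice-activity-detection",
#       use_auth_token=os.environ.get("HF_TOKEN"),
#   )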
def process_audio(audio):
    try:
        if audio is None:
            return None, "No audio input provided"
        # Gradio's "numpy" audio type is a (sample_rate, array) tuple
        audio_array = audio[1] if isinstance(audio, tuple) else audio["array"]
        sample_rate = audio[0] if isinstance(audio, tuple) else audio["sampling_rate"]
        # Ensure audio is a mono float32 numpy array in [-1, 1]
        # (the microphone component usually delivers int16 samples)
        if torch.is_tensor(audio_array):
            audio_array = audio_array.numpy()
        audio_array = np.asarray(audio_array)
        if audio_array.ndim > 1:
            audio_array = audio_array.mean(axis=1)
        if np.issubdtype(audio_array.dtype, np.integer):
            audio_array = audio_array.astype(np.float32) / np.iinfo(audio_array.dtype).max
        else:
            audio_array = audio_array.astype(np.float32)
        # VAD processing (segments are computed but not used further yet)
        if vad is not None:
            speech_segments = vad(
                {"waveform": torch.from_numpy(audio_array).unsqueeze(0), "sample_rate": sample_rate}
            )
        # Emotion recognition
        emotion_result = "neutral"
        if emotion_pipeline is not None:
            try:
                # The audio-classification pipeline accepts a dict carrying the raw waveform
                # and its sampling rate
                emotion_pred = emotion_pipeline({"raw": audio_array, "sampling_rate": sample_rate})
                emotion_result = emotion_pred[0]["label"] if emotion_pred else "neutral"
            except Exception:
                emotion_result = "neutral"
        # RVQ encode/decode round trip through the Descript Audio Codec
        if rvq is not None:
            try:
                # DAC expects a (batch, channels, time) float tensor
                audio_tensor = torch.from_numpy(audio_array).float().unsqueeze(0).unsqueeze(0)
                if torch.cuda.is_available():
                    audio_tensor = audio_tensor.to("cuda")
                with torch.no_grad():
                    audio_tensor = rvq.preprocess(audio_tensor, sample_rate)
                    # encode() returns (z, codes, latents, commitment_loss, codebook_loss);
                    # decode() takes the continuous latent z
                    z, codes, latents, _, _ = rvq.encode(audio_tensor)
                    decoded_audio = rvq.decode(z)
                if torch.cuda.is_available():
                    decoded_audio = decoded_audio.cpu()
                audio_array = decoded_audio.squeeze().numpy()
            except Exception as e:
                print(f"RVQ processing error: {e}")
        # Ultravox generation
        response_text = "I understand your audio input."
        if ultra_proc is not None and ultra_model is not None:
            try:
                # The Ultravox processor typically also expects a text prompt containing the
                # model's audio placeholder token; without one, the response may not be
                # conditioned on the audio as intended
                inputs = ultra_proc(audio=audio_array, sampling_rate=sample_rate, return_tensors="pt")
                if torch.cuda.is_available():
                    inputs = {k: v.to("cuda") for k, v in inputs.items()}
                with torch.no_grad():
                    outputs = ultra_model.generate(**inputs, max_new_tokens=50)
                response_text = ultra_proc.decode(outputs[0], skip_special_tokens=True)
            except Exception as e:
                print(f"Ultravox generation error: {e}")
                response_text = f"Detected emotion: {emotion_result}"
        # TTS generation
        output_audio = None
        output_sample_rate = sample_rate
        if dia is not None:
            try:
                # Dia expects the text to start with a speaker tag such as [S1];
                # the emotion tag is passed along as a free-form hint
                tts_text = f"[S1] [emotion:{emotion_result}] {response_text}"
                output_audio = dia.generate(tts_text)
                if torch.is_tensor(output_audio):
                    output_audio = output_audio.cpu().numpy()
                # Dia's DAC-based decoder produces 44.1 kHz audio
                output_sample_rate = 44100
                # Normalize audio (guard against an all-zero clip)
                if output_audio is not None:
                    peak = np.max(np.abs(output_audio))
                    if peak > 0:
                        output_audio = output_audio / peak * 0.95
            except Exception as e:
                print(f"TTS generation error: {e}")
        return (output_sample_rate, output_audio) if output_audio is not None else None, response_text
    except Exception as e:
        return None, f"Processing error: {str(e)}"
# Create Gradio interface
with gr.Blocks(title="Supernatural Speech AI") as demo:
    gr.Markdown("# Supernatural Speech AI Agent")
    gr.Markdown("Record audio to interact with the AI agent that understands emotions and responds naturally.")
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(sources=["microphone"], type="numpy", label="Record Audio")
            process_btn = gr.Button("Process Audio", variant="primary")
        with gr.Column():
            audio_output = gr.Audio(label="AI Response")
            text_output = gr.Textbox(label="Response Text", lines=3)
    # Conversation history state (declared but not wired into the click handler yet)
    conversation_history = gr.State([])
    process_btn.click(
        fn=process_audio,
        inputs=[audio_input],
        outputs=[audio_output, text_output],
    )
| if __name__ == "__main__": | |
| demo.queue(concurrency_limit=20, max_size=50).launch() | |