import os

import gradio as gr
import numpy as np
import torch
import torchaudio
from transformers import pipeline
from diffusers import DiffusionPipeline
from pyannote.audio import Pipeline as PyannotePipeline
from dia.model import Dia
from dac.utils import load_model as load_dac_model

# Retrieve HF_TOKEN from the Space's secrets (required for the gated pyannote model)
HF_TOKEN = os.environ["HF_TOKEN"]

# Automatically shard large models across the available GPUs (e.g. 4× L4)
device_map = "auto"

# 1. Load the Descript Audio Codec (RVQ) for discrete audio tokens
rvq = load_dac_model(tag="latest", model_type="44khz")
rvq.eval()
if torch.cuda.is_available():
    rvq = rvq.to("cuda")

# 2. Load voice activity detection via pyannote
vad_pipe = PyannotePipeline.from_pretrained(
    "pyannote/voice-activity-detection",
    use_auth_token=HF_TOKEN,
)

# 3. Load Ultravox (speech-to-text + LLM) via Transformers
ultravox_pipe = pipeline(
    model="fixie-ai/ultravox-v0_4",
    trust_remote_code=True,
    device_map=device_map,
    torch_dtype=torch.float16,
)

# 4. Load the audio diffusion model via Diffusers
diff_pipe = DiffusionPipeline.from_pretrained(
    "teticio/audio-diffusion-instrumental-hiphop-256"
).to("cuda")

# 5. Load Dia TTS. Dia.from_pretrained downloads and places the weights itself;
#    accelerate's load_checkpoint_and_dispatch expects a local checkpoint path and a
#    plain nn.Module, so the meta-weight/multi-GPU dispatch path is not usable here.
dia = Dia.from_pretrained("nari-labs/Dia-1.6B")

DAC_SAMPLE_RATE = 44100  # the 44khz DAC model (and Dia's codec) operate at 44.1 kHz


# Inference function
def process_audio(audio):
    sr, array = audio
    array = array.numpy() if torch.is_tensor(array) else np.asarray(array)

    # Gradio delivers int16 PCM; convert to mono float32 in [-1, 1]
    if np.issubdtype(array.dtype, np.integer):
        array = array.astype(np.float32) / np.iinfo(array.dtype).max
    if array.ndim > 1:
        array = array.mean(axis=1)

    # 2.1 VAD: pyannote expects a (channel, time) waveform tensor plus sample rate
    waveform = torch.from_numpy(array).float().unsqueeze(0)
    _ = vad_pipe({"waveform": waveform, "sample_rate": sr})  # segments not used further here

    # 1.1 RVQ encode/decode round-trip for discrete audio tokens
    #     (the 44khz DAC expects 44.1 kHz input, so resample if the mic rate differs)
    if sr != DAC_SAMPLE_RATE:
        waveform = torchaudio.functional.resample(waveform, sr, DAC_SAMPLE_RATE)
    x = waveform.unsqueeze(0).to("cuda")  # (batch, channel, time)
    with torch.no_grad():
        z, codes, *_ = rvq.encode(x)  # encode returns (z, codes, latents, ...)
        decoded = rvq.decode(z).squeeze().cpu().numpy()

    # 3. Ultravox ASR + LLM to generate the response text
    ultra_out = ultravox_pipe(
        {"audio": decoded, "sampling_rate": DAC_SAMPLE_RATE, "turns": []},
        max_new_tokens=256,
    )
    text = ultra_out.get("text", "") if isinstance(ultra_out, dict) else str(ultra_out)

    # 4. Diffusion-based prosody enhancement (result not used further here, as with VAD)
    _ = diff_pipe(raw_audio=decoded).audios[0, 0]

    # 5. Dia TTS synthesis. Dia's prompt format uses speaker tags such as [S1]/[S2]
    #    rather than an [emotion:...] tag.
    tts = dia.generate(f"[S1] {text}")
    tts_np = tts.squeeze().cpu().numpy() if torch.is_tensor(tts) else np.asarray(tts).squeeze()
    peak = np.max(np.abs(tts_np))
    if peak > 0:
        tts_np = tts_np / peak * 0.95  # peak-normalize to avoid clipping

    # Dia synthesizes at 44.1 kHz, so return that rate rather than the input rate
    return (DAC_SAMPLE_RATE, tts_np), text


# Gradio UI
with gr.Blocks(title="Maya AI 📈") as demo:
    gr.Markdown("## Maya-AI: Supernatural Conversational Agent")
    # Gradio 4.x uses `sources=[...]`; older 3.x releases used `source="microphone"`
    audio_in = gr.Audio(sources=["microphone"], type="numpy", label="Your Voice")
    send_btn = gr.Button("Send")
    audio_out = gr.Audio(label="AI’s Response")
    text_out = gr.Textbox(label="Generated Text")
    send_btn.click(process_audio, inputs=audio_in, outputs=[audio_out, text_out])

if __name__ == "__main__":
    demo.launch()
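

# ---------------------------------------------------------------------------
# Optional local smoke test (not part of the original app). A minimal sketch,
# assuming this file is saved as app.py (hypothetical name) and imported on a
# GPU machine with all models already downloaded:
#     >>> import app; app.maya_smoke_test()
# It feeds a synthetic one-second 440 Hz tone (illustrative values only)
# through process_audio and prints a summary of the result.
# ---------------------------------------------------------------------------
def maya_smoke_test(sample_rate: int = 44100) -> None:
    """Exercise the full pipeline on a synthetic tone without launching Gradio."""
    t = np.linspace(0.0, 1.0, sample_rate, endpoint=False)
    tone = (0.1 * np.sin(2 * np.pi * 440.0 * t)).astype(np.float32)
    (out_sr, out_audio), out_text = process_audio((sample_rate, tone))
    print(f"Output: {out_audio.shape[0]} samples at {out_sr} Hz; text: {out_text!r}")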