import os

import gradio as gr
import numpy as np
import torch
import torchaudio
from transformers import pipeline
from diffusers import DiffusionPipeline
from pyannote.audio import Pipeline as PyannotePipeline
from dia.model import Dia
from dac.utils import load_model as load_dac_model

#-- Configuration
HF_TOKEN = os.environ["HF_TOKEN"]  # Gated model access[2]
device_map = "auto"                # Distribute models across the 4×L4 GPUs[3]

#-- 1. Descript Audio Codec (RVQ)
rvq = load_dac_model(tag="latest", model_type="44khz")  # RVQ encoder/decoder[4]
rvq.eval()
if torch.cuda.is_available():
    rvq = rvq.to("cuda")

#-- 2. Voice Activity Detection via Pyannote
vad_pipe = PyannotePipeline.from_pretrained(
    "pyannote/voice-activity-detection",
    use_auth_token=HF_TOKEN,
)  # Gated VAD pipeline, requires HF_TOKEN[2]

#-- 3. Ultravox ASR+LLM Pipeline
ultravox_pipe = pipeline(
    model="fixie-ai/ultravox-v0_4",
    trust_remote_code=True,
    device_map=device_map,
    torch_dtype=torch.float16,
)  # Custom speech pipeline loaded via trust_remote_code[2]

#-- 4. Audio Diffusion Model (Prosody)
diff_pipe = DiffusionPipeline.from_pretrained(
    "teticio/audio-diffusion-instrumental-hiphop-256",
    torch_dtype=torch.float16,
).to("cuda")  # Diffusers-based load[2]

#-- 5. Dia TTS Model
# The nari-labs Dia package takes a compute_dtype string; it does not accept
# the transformers-style device_map/torch_dtype/trust_remote_code arguments.[2]
dia = Dia.from_pretrained("nari-labs/Dia-1.6B", compute_dtype="float16")
DIA_SAMPLE_RATE = 44100  # Dia synthesizes 44.1 kHz audio

#-- Inference Function
def process_audio(audio):
    sr, arr = audio
    arr = arr.numpy() if torch.is_tensor(arr) else arr
    # Gradio delivers int16 PCM by default; convert to float32 in [-1, 1]
    if np.issubdtype(arr.dtype, np.integer):
        arr = arr.astype(np.float32) / 32768.0
    if arr.ndim > 1:  # stereo -> mono
        arr = arr.mean(axis=1)
    # Resample to the codec's native rate (the 44 kHz DAC model asserts on it)
    if sr != rvq.sample_rate:
        arr = torchaudio.functional.resample(
            torch.from_numpy(arr), sr, rvq.sample_rate
        ).numpy()
        sr = int(rvq.sample_rate)

    # VAD segmentation (pyannote expects a (channels, samples) float tensor);
    # the segmentation result is not used further in this demo
    waveform = torch.tensor(arr, dtype=torch.float32).unsqueeze(0)
    _ = vad_pipe({"waveform": waveform, "sample_rate": sr})

    # RVQ encode/decode round-trip: DAC expects (batch, channels, samples),
    # encode() returns (z, codes, latents, ...), and decode() takes z
    x = torch.tensor(arr, dtype=torch.float32).view(1, 1, -1).to("cuda")
    x = rvq.preprocess(x, sr)
    with torch.no_grad():
        z, codes, latents, _, _ = rvq.encode(x)
        decoded = rvq.decode(z).squeeze().cpu().numpy()

    # Ultravox ASR+LLM -> response text (input keys follow the Ultravox
    # pipeline: "audio", "turns", "sampling_rate")
    turns = [{"role": "system", "content": "You are a friendly, helpful voice assistant."}]
    ultra_out = ultravox_pipe(
        {"audio": decoded, "turns": turns, "sampling_rate": sr},
        max_new_tokens=128,
    )
    text = ultra_out if isinstance(ultra_out, str) else str(ultra_out)

    # Diffusion-based prosody enhancement of the codec output
    # (computed for experimentation; the TTS step below uses only the text)
    pros = diff_pipe(raw_audio=decoded)["audios"][0]

    # Dia TTS synthesis; Dia expects [S1]/[S2] speaker tags and returns a
    # NumPy waveform
    tts = dia.generate(f"[S1] {text}")
    tts_np = np.asarray(tts, dtype=np.float32).squeeze()
    peak = np.max(np.abs(tts_np)) if tts_np.size else 0.0
    if peak > 0:
        tts_np = tts_np / peak * 0.95  # peak-normalize to avoid clipping

    return (DIA_SAMPLE_RATE, tts_np), text

#-- Gradio UI
with gr.Blocks(title="Maya AI 📈") as demo:
    gr.Markdown("## Maya-AI: Supernatural Conversational Agent")
    audio_in = gr.Audio(sources=["microphone"], type="numpy", label="Your Voice")
    send_btn = gr.Button("Send")
    audio_out = gr.Audio(label="AI Response")
    text_out = gr.Textbox(label="Generated Text")
    send_btn.click(process_audio, inputs=audio_in, outputs=[audio_out, text_out])

if __name__ == "__main__":
    demo.launch()
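
#-- Environment notes (assumptions, not verified against pinned versions)
# The imports above map roughly to the installs below; exact package names,
# versions, and the Dia install source are guesses and should be adjusted to
# whatever your deployment actually uses.
#
#   pip install gradio torch torchaudio numpy transformers accelerate diffusers
#   pip install pyannote.audio descript-audio-codec
#   pip install git+https://github.com/nari-labs/dia.git
#
# HF_TOKEN must be exported before launch so the gated pyannote pipeline can
# be downloaded, e.g.:
#
#   export HF_TOKEN=hf_xxx   # placeholder token
#   python app.py            # assumed filename for this script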