import os

import gradio as gr
import numpy as np
import torch
import torchaudio
from transformers import pipeline
from diffusers import DiffusionPipeline
from pyannote.audio import Pipeline as PyannotePipeline
from dia.model import Dia
from dac.utils import load_model as load_dac_model

# Load HF token and configure multi-GPU sharding for the larger models
HF_TOKEN = os.environ["HF_TOKEN"]
device_map = "auto"
device = "cuda" if torch.cuda.is_available() else "cpu"

# 1. Descript Audio Codec (RVQ); the "44khz" variant expects 44.1 kHz mono
rvq = load_dac_model(tag="latest", model_type="44khz")
rvq.eval()
rvq = rvq.to(device)

# 2. Voice Activity Detection via Pyannote
vad_pipe = PyannotePipeline.from_pretrained(
    "pyannote/voice-activity-detection",
    use_auth_token=HF_TOKEN,
)

# 3. Ultravox ASR+LLM (generic transformers pipeline with custom remote code)
ultravox_pipe = pipeline(
    model="fixie-ai/ultravox-v0_4",
    trust_remote_code=True,
    device_map=device_map,
    torch_dtype=torch.float16,
)

# 4. Audio Diffusion (Diffusers loader)
diff_pipe = DiffusionPipeline.from_pretrained(
    "teticio/audio-diffusion-instrumental-hiphop-256",
    torch_dtype=torch.float16,
).to(device)

# 5. Dia TTS. Dia's from_pretrained takes a compute_dtype string rather than
#    the transformers-style device_map/torch_dtype/trust_remote_code kwargs.
dia = Dia.from_pretrained("nari-labs/Dia-1.6B", compute_dtype="float16")

DIA_SAMPLE_RATE = 44100  # Dia generates audio at 44.1 kHz


def process_audio(audio):
    # Gradio's type="numpy" delivers (sample_rate, int16 array); convert to
    # float32 mono in [-1, 1] before handing it to the models.
    sr, array = audio
    array = array.numpy() if torch.is_tensor(array) else array
    if array.ndim > 1:
        array = array.mean(axis=1)  # downmix stereo to mono
    if np.issubdtype(array.dtype, np.integer):
        array = array.astype(np.float32) / np.iinfo(array.dtype).max
    else:
        array = array.astype(np.float32)

    # VAD segmentation (the annotation is currently unused downstream)
    _ = vad_pipe({"waveform": torch.from_numpy(array).unsqueeze(0), "sample_rate": sr})

    # RVQ encode/decode round trip. DAC's encode returns (z, codes, latents,
    # commitment_loss, codebook_loss), and decode takes the continuous z.
    x = torch.from_numpy(array)
    if sr != rvq.sample_rate:
        x = torchaudio.functional.resample(x, sr, rvq.sample_rate)
    x = x[None, None, :].to(device)  # (batch, channel, time)
    with torch.no_grad():
        x = rvq.preprocess(x, rvq.sample_rate)
        z, codes, latents, _, _ = rvq.encode(x)
        decoded = rvq.decode(z).squeeze().cpu().numpy()

    # Ultravox: speech -> text. Its model card feeds 16 kHz audio through a
    # dict with "audio"/"turns"/"sampling_rate" keys.
    audio_16k = torchaudio.functional.resample(
        torch.from_numpy(decoded), rvq.sample_rate, 16000
    ).numpy()
    out = ultravox_pipe(
        {"audio": audio_16k, "turns": [], "sampling_rate": 16000},
        max_new_tokens=128,
    )
    text = out if isinstance(out, str) else out.get("text", "")

    # Diffusion pass over the decoded audio (kept from the original pipeline
    # as a prosody reference; the result is currently unused)
    pros = diff_pipe(raw_audio=decoded).audios[0]

    # Dia TTS synthesis. Dia prompts use speaker tags like [S1]/[S2], and
    # generate() returns a float NumPy array.
    tts_np = np.asarray(dia.generate(f"[S1] {text}")).squeeze()
    peak = np.abs(tts_np).max() if tts_np.size else 0.0
    if peak > 0:
        tts_np = tts_np / peak * 0.95  # peak-normalize to avoid clipping
    return (DIA_SAMPLE_RATE, tts_np), text


with gr.Blocks(title="Maya AI 📈") as demo:
    gr.Markdown("## Maya-AI: Supernatural Conversational Agent")
    # Gradio 4.x takes sources=[...]; the 3.x equivalent was source="microphone"
    audio_in = gr.Audio(sources=["microphone"], type="numpy", label="Your Voice")
    send_btn = gr.Button("Send")
    audio_out = gr.Audio(label="AI Response")
    text_out = gr.Textbox(label="Generated Text")
    send_btn.click(process_audio, inputs=audio_in, outputs=[audio_out, text_out])

if __name__ == "__main__":
    demo.launch()
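

# ---------------------------------------------------------------------------
# Optional smoke test: a minimal sketch (not part of the original app) that
# pushes a one-second 440 Hz sine tone through process_audio the way the
# microphone widget would (int16 PCM plus a sample rate). The name
# _smoke_test and the tone parameters are arbitrary choices for this sketch.
# It is never invoked when the script runs the Gradio UI; assuming this file
# is saved as app.py, run it with: python -c "import app; app._smoke_test()"
# ---------------------------------------------------------------------------
def _smoke_test():
    sr = 44100
    t = np.linspace(0.0, 1.0, sr, endpoint=False)
    tone = (0.5 * np.sin(2 * np.pi * 440.0 * t) * 32767).astype(np.int16)
    (out_sr, out_audio), text = process_audio((sr, tone))
    print(f"text={text!r}; audio: {out_audio.shape[0]} samples @ {out_sr} Hz")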