"""Gradio demo: microphone audio in, synthesized speech and transcript out.

At import time the script loads five models (DAC codec, pyannote VAD,
Ultravox, an audio-diffusion pipeline and Dia TTS). ``process_audio`` chains
them for one recording and is bound to the "Send" button of the UI below.
"""
import os

import gradio as gr
import torch
import numpy as np
from transformers import pipeline
from diffusers import DiffusionPipeline
from pyannote.audio import Pipeline as PyannotePipeline
from dia.model import Dia
from dac.utils import load_model as load_dac_model
from accelerate import init_empty_weights, load_checkpoint_and_dispatch

HF_TOKEN = os.environ["HF_TOKEN"]
device_map = "auto"

# Single source of truth for manual device placement, so that every explicit
# ``.to(...)`` agrees and the script still starts on CPU-only hosts.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Target peak amplitude of the audio returned to the browser.
PEAK_LEVEL = 0.95

# 1. RVQ Codec
rvq = load_dac_model(tag="latest", model_type="44khz")
rvq.eval()
rvq = rvq.to(DEVICE)

# 2. VAD
vad_pipe = PyannotePipeline.from_pretrained(
    "pyannote/voice-activity-detection",
    use_auth_token=HF_TOKEN,
)

# 3. Ultravox
ultravox_pipe = pipeline(
    model="fixie-ai/ultravox-v0_4",
    trust_remote_code=True,
    device_map=device_map,
    torch_dtype=torch.float16,
)

# 4. Audio Diffusion
diff_pipe = DiffusionPipeline.from_pretrained(
    "teticio/audio-diffusion-instrumental-hiphop-256"
).to(DEVICE)

# 5. Dia TTS
# NOTE(review): load_checkpoint_and_dispatch receives a Hub repo id as its
# checkpoint argument, and Dia.from_pretrained is called under
# init_empty_weights -- confirm both against the installed dia/accelerate.
with init_empty_weights():
    dia = Dia.from_pretrained(
        "nari-labs/Dia-1.6B",
        trust_remote_code=True,
    )
dia = load_checkpoint_and_dispatch(
    dia,
    "nari-labs/Dia-1.6B",
    device_map=device_map,
    dtype=torch.float16,
)


def _to_mono_float32(array):
    """Return *array* as a 1-D float32 numpy waveform.

    Integer PCM is scaled by the dtype's maximum value so it lands in roughly
    [-1, 1]; a 2-D input is averaged over its last axis.
    """
    if torch.is_tensor(array):
        samples = array.detach().cpu().numpy()
    else:
        samples = np.asarray(array)

    if np.issubdtype(samples.dtype, np.integer):
        samples = samples.astype(np.float32) / np.iinfo(samples.dtype).max
    else:
        samples = samples.astype(np.float32)

    if samples.ndim > 1:
        # assumes (samples, channels) layout as delivered by gr.Audio -- TODO confirm
        samples = samples.mean(axis=-1)
    return samples


def _peak_normalize(samples, level=PEAK_LEVEL):
    """Scale *samples* so the largest absolute value equals *level*.

    Empty, all-zero or non-finite input is returned unchanged instead of
    being divided by zero.
    """
    if samples.size == 0:
        return samples
    peak = np.max(np.abs(samples))
    if peak == 0 or not np.isfinite(peak):
        return samples
    return samples / peak * level


# Inference
def process_audio(audio):
    """Run one recording through the full model chain.

    Args:
        audio: ``(sample_rate, samples)`` tuple as produced by
            ``gr.Audio(type="numpy")``, or ``None`` when nothing was recorded.

    Returns:
        ``((sample_rate, waveform), text)`` where ``waveform`` is the
        peak-normalized TTS output and ``text`` is the Ultravox response.
        ``(None, "")`` is returned when there is no input audio, and
        ``(None, text)`` when the TTS model produced nothing.
    """
    # Gradio passes None when "Send" is clicked without a recording.
    if audio is None:
        return None, ""

    sr, array = audio
    mono = _to_mono_float32(array)
    if mono.size == 0:
        return None, ""

    # Without no_grad the codec output requires grad and ``.numpy()`` raises.
    with torch.no_grad():
        waveform = torch.from_numpy(mono).unsqueeze(0)  # (channel, time)

        # pyannote takes in-memory audio as a waveform/sample_rate mapping.
        # The VAD result is currently not consumed.
        _ = vad_pipe({"waveform": waveform, "sample_rate": sr})

        x = waveform.unsqueeze(0).to(DEVICE)  # (batch, channel, time)
        encoded = rvq.encode(x)
        # DAC's encode returns a tuple whose first item is what decode expects;
        # a bare tensor is passed through unchanged.
        latents = encoded[0] if isinstance(encoded, (tuple, list)) else encoded
        decoded = rvq.decode(latents).squeeze().cpu().numpy()

        # NOTE(review): payload keys and the ``.get`` on the result are kept
        # from the original; confirm against the Ultravox pipeline's schema.
        ultra_out = ultravox_pipe({"array": decoded, "sampling_rate": sr})
        text = ultra_out.get("text", "")

        # Computed as in the original, but the result is not used downstream.
        _prosody = diff_pipe(raw_audio=decoded)["audios"][0]

        tts = dia.generate(f"[emotion:neutral] {text}")

    if tts is None:
        return None, text

    if torch.is_tensor(tts):
        tts_np = tts.detach().cpu().numpy()
    else:
        tts_np = np.asarray(tts)
    tts_np = np.squeeze(tts_np).astype(np.float32)
    tts_np = _peak_normalize(tts_np)

    # NOTE(review): the input sample rate is reused for the output, as in the
    # original; confirm it matches the TTS model's native rate.
    return (sr, tts_np), text


# UI
with gr.Blocks(title="Maya AI 📈") as demo:
    gr.Markdown("## Maya-AI: Supernatural Conversational Agent")
    # NOTE(review): ``source=`` is the older Gradio keyword; newer releases
    # use ``sources=[...]`` -- confirm against the pinned Gradio version.
    audio_in = gr.Audio(source="microphone", type="numpy")
    send_btn = gr.Button("Send")
    audio_out = gr.Audio()
    text_out = gr.Textbox()
    send_btn.click(
        process_audio,
        inputs=audio_in,
        outputs=[audio_out, text_out],
    )

if __name__ == "__main__":
    demo.launch()