Maya-AI / app.py
Devakumar868's picture
Update app.py
55c39a0 verified
raw
history blame
2.63 kB
import inspect
import os

import gradio as gr
import numpy as np
import torch
from dac.utils import load_model as load_dac_model
from dia.model import Dia
from diffusers import DiffusionPipeline
from pyannote.audio import Pipeline as PyannotePipeline
from transformers import AutoModel, pipeline
# 1. Retrieve HF token and set device mapping
# HF_TOKEN must be present in the environment; a missing variable raises
# KeyError here, before any model is loaded.
HF_TOKEN = os.environ["HF_TOKEN"]
device_map = "auto"  # auto-shard models across 4×L4 GPUs

# Placement and precision for the models that are moved explicitly (those not
# loaded with device_map). Falling back to CPU/float32 keeps start-up from
# crashing on a host without CUDA; on a GPU host the values are the same
# "cuda"/float16 used before.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
TORCH_DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32

print("Loading RVQ Codec...")
rvq = load_dac_model(tag="latest", model_type="44khz")
rvq.eval()
rvq = rvq.to(DEVICE)

print("Loading VAD pipeline...")
vad_pipe = PyannotePipeline.from_pretrained(
    "pyannote/voice-activity-detection",
    use_auth_token=HF_TOKEN
)

print("Loading Ultravox pipeline...")
ultravox_pipe = pipeline(
    model="fixie-ai/ultravox-v0_4",
    trust_remote_code=True,
    device_map=device_map,
    torch_dtype=TORCH_DTYPE
)

print("Loading Audio Diffusion model...")
# Uses the same device decision as the codec above instead of a hard-coded
# "cuda", so both explicit placements agree.
diff_pipe = DiffusionPipeline.from_pretrained(
    "teticio/audio-diffusion-instrumental-hiphop-256",
    torch_dtype=TORCH_DTYPE
).to(DEVICE)

print("Loading Dia TTS (sharded across GPUs)...")
# NOTE(review): Dia's published examples configure the model through
# `compute_dtype` / `device`; confirm that the installed version accepts
# `device_map`, `torch_dtype` and `trust_remote_code` before relying on this.
dia = Dia.from_pretrained(
    "nari-labs/Dia-1.6B",
    device_map=device_map,
    torch_dtype=TORCH_DTYPE,
    trust_remote_code=True
)

print("All models loaded successfully!")
# Sample rate attached to the synthesized waveform. Dia's published examples
# save generated audio at 44.1 kHz, independent of the microphone rate.
# NOTE(review): confirm against the installed Dia version.
_DIA_SAMPLE_RATE = 44100


def _to_mono_float32(samples):
    """Return *samples* as a 1-D float32 array; integer PCM is scaled to ~[-1, 1]."""
    samples = np.asarray(samples)
    if np.issubdtype(samples.dtype, np.integer):
        # Scale by the dtype's full range so int16 microphone PCM becomes
        # floating-point audio rather than values in the tens of thousands.
        full_scale = float(np.iinfo(samples.dtype).max)
        samples = samples.astype(np.float32) / full_scale
    else:
        samples = samples.astype(np.float32)
    if samples.ndim > 1:
        # Assumes a (samples, channels) layout for multi-channel input —
        # TODO confirm against the Gradio version in use.
        samples = samples.mean(axis=1)
    return samples


def _extract_text(result):
    """Pull generated text out of a pipeline result (str, dict, or list of either)."""
    if isinstance(result, str):
        return result
    if isinstance(result, dict):
        return result.get("text", "") or result.get("generated_text", "")
    if isinstance(result, (list, tuple)) and result:
        return _extract_text(result[0])
    return ""


def process_audio(audio):
    """Run one recording through VAD, the RVQ codec, Ultravox, diffusion and Dia.

    Args:
        audio: ``(sample_rate, samples)`` pair, or ``None`` when nothing was
            recorded. ``samples`` may be a NumPy array or a torch tensor.

    Returns:
        ``((sample_rate, waveform), text)`` where ``waveform`` is the
        peak-normalized Dia output and ``text`` is the Ultravox output, or
        ``(None, "")`` when there is no input audio to process.
    """
    # Guard: without a recording there is nothing to unpack or run.
    if audio is None:
        return None, ""

    sr, array = audio
    array = array.numpy() if torch.is_tensor(array) else array
    mono = _to_mono_float32(array)
    if mono.size == 0:
        return None, ""

    # NOTE(review): no resampling happens here; the codec and Ultravox
    # presumably expect specific sample rates — verify against `sr`.

    # 1. Voice activity detection (result is currently not used downstream).
    vad_pipe({"waveform": torch.from_numpy(mono).unsqueeze(0), "sample_rate": sr})

    # 2. RVQ encode/decode
    # Follow the codec's own device rather than assuming "cuda", and disable
    # autograd so the decoded tensor can be converted with .numpy().
    codec_device = next(rvq.parameters()).device
    with torch.no_grad():
        x = torch.from_numpy(mono).reshape(1, 1, -1).to(codec_device)
        # Per the DAC README, encode() returns a tuple whose first element is
        # the quantized latent that decode() expects.
        z, *_ = rvq.encode(x)
        decoded = rvq.decode(z).squeeze().float().cpu().numpy()

    # 3. Ultravox ASR → text
    # The Ultravox model card passes the waveform under the "audio" key.
    out = ultravox_pipe({"audio": decoded, "turns": [], "sampling_rate": sr})
    text = _extract_text(out)

    # 4. Prosody diffusion (output is computed but not yet consumed by step 5).
    diff_pipe(raw_audio=decoded)["audios"][0]

    # 5. Dia TTS synthesis
    tts = dia.generate(f"[emotion:neutral] {text}")
    if torch.is_tensor(tts):
        tts = tts.detach().cpu().numpy()
    tts_np = np.squeeze(np.asarray(tts, dtype=np.float32))

    # Peak-normalize to 0.95, skipping empty or all-zero output so silence
    # does not turn into NaNs through a division by zero.
    peak = float(np.max(np.abs(tts_np))) if tts_np.size else 0.0
    if peak > 0.0:
        tts_np = tts_np / peak * 0.95
    return (_DIA_SAMPLE_RATE, tts_np), text
# Gradio UI
# gr.Audio's microphone selector is spelled `source="microphone"` in older
# Gradio releases and `sources=["microphone"]` in newer ones. Choose the
# keyword the installed version declares so the component builds on both.
_audio_init_params = inspect.signature(gr.Audio.__init__).parameters
_mic_kwargs = (
    {"sources": ["microphone"]}
    if "sources" in _audio_init_params
    else {"source": "microphone"}
)

with gr.Blocks(title="Maya AI 📈") as demo:
    gr.Markdown("## Maya-AI Supernatural Conversational Agent")
    # Input: microphone recording delivered to the handler as NumPy data.
    audio_in = gr.Audio(type="numpy", label="Your Voice", **_mic_kwargs)
    send_btn = gr.Button("Send")
    # Outputs: synthesized reply audio and the generated text.
    audio_out = gr.Audio(label="AI Response")
    text_out = gr.Textbox(label="Generated Text")
    send_btn.click(process_audio, inputs=audio_in, outputs=[audio_out, text_out])

# Only start the web server when executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()