# Maya-AI / app.py
import os
import gradio as gr
import torch
import numpy as np
from transformers import pipeline
from diffusers import DiffusionPipeline
from pyannote.audio import Pipeline as PyannotePipeline
from dia.model import Dia
from dac.utils import load_model as load_dac_model
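
# Dependency sketch (best-guess assumptions from the imports above, not taken from the
# Space's actual requirements.txt): gradio, torch, numpy, transformers, diffusers,
# pyannote.audio, descript-audio-codec (imported as `dac`), and the `dia` package
# from the nari-labs/dia repository.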
# Load HF token and configure multi-GPU sharding
HF_TOKEN = os.environ["HF_TOKEN"]
device_map = "auto"
device = "cuda" if torch.cuda.is_available() else "cpu"

# 1. Descript Audio Codec (RVQ)
rvq = load_dac_model(tag="latest", model_type="44khz")
rvq.eval()
rvq = rvq.to(device)
# 2. Voice Activity Detection via Pyannote
vad_pipe = PyannotePipeline.from_pretrained(
    "pyannote/voice-activity-detection",
    use_auth_token=HF_TOKEN,
)
# 3. Ultravox ASR+LLM (generic pipeline)
ultravox_pipe = pipeline(
    model="fixie-ai/ultravox-v0_4",
    trust_remote_code=True,
    device_map=device_map,
    torch_dtype=torch.float16,
)
# 4. Audio Diffusion (Diffusers loader)
diff_pipe = DiffusionPipeline.from_pretrained(
    "teticio/audio-diffusion-instrumental-hiphop-256",
    torch_dtype=torch.float16,
).to(device)
# 5. Dia TTS with device sharding
dia = Dia.from_pretrained(
    "nari-labs/Dia-1.6B",
    device_map=device_map,
    torch_dtype=torch.float16,
    trust_remote_code=True,
)
def process_audio(audio):
    sr, array = audio
    array = array.numpy() if torch.is_tensor(array) else array
    # Gradio microphone input is usually int16; convert to float32 in [-1, 1]
    if np.issubdtype(array.dtype, np.integer):
        array = array.astype(np.float32) / 32768.0
    if array.ndim > 1:
        array = array.mean(axis=1)  # downmix to mono
    # VAD segmentation (result currently unused)
    _ = vad_pipe({"waveform": torch.tensor(array).unsqueeze(0), "sample_rate": sr})
    # RVQ encode/decode: DAC's encode returns (z, codes, latents, ...); decode takes z
    x = torch.tensor(array).view(1, 1, -1).to(device)
    z, codes, *_ = rvq.encode(x)
    decoded = rvq.decode(z).squeeze().cpu().numpy()
    # Ultravox: speech -> text (the pipeline may return a plain string or a dict)
    out = ultravox_pipe({"array": decoded, "sampling_rate": sr})
    text = out if isinstance(out, str) else out.get("text", "")
    # Diffusion-based prosody (output not used downstream yet)
    pros = diff_pipe(raw_audio=decoded)["audios"][0]
    # Dia TTS synthesis
    tts = dia.generate(f"[emotion:neutral] {text}")
    tts_np = tts.squeeze().cpu().numpy() if torch.is_tensor(tts) else np.asarray(tts).squeeze()
    # Peak-normalize to avoid clipping
    if tts_np.size:
        tts_np = tts_np / np.max(np.abs(tts_np)) * 0.95
    return (sr, tts_np), text
with gr.Blocks(title="Maya AI 📈") as demo:
    gr.Markdown("## Maya-AI: Supernatural Conversational Agent")
    # NOTE: `source=` is the Gradio 3.x argument; Gradio 4.x renamed it to sources=["microphone"]
    audio_in = gr.Audio(source="microphone", type="numpy", label="Your Voice")
    send_btn = gr.Button("Send")
    audio_out = gr.Audio(label="AI Response")
    text_out = gr.Textbox(label="Generated Text")
    send_btn.click(process_audio, inputs=audio_in, outputs=[audio_out, text_out])
if __name__ == "__main__":
    demo.launch()
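
# To run locally (a minimal sketch, assuming the dependencies listed near the top are
# installed and the token has access to the gated pyannote model):
#   HF_TOKEN=<your-token> python app.py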