import os
import gradio as gr
import torch
import torchaudio
import numpy as np
from transformers import pipeline
from diffusers import DiffusionPipeline
from pyannote.audio import Pipeline as PyannotePipeline
from dia.model import Dia
from dac.utils import load_model as load_dac_model

#-- Configuration
HF_TOKEN   = os.environ["HF_TOKEN"]                 # Token for gated model access
device_map = "auto"                                 # Distribute models across the 4×L4 GPUs

#-- 1. Descript Audio Codec (RVQ)
rvq = load_dac_model(tag="latest", model_type="44khz")  # 44.1 kHz RVQ encoder/decoder
rvq.eval()
if torch.cuda.is_available():
    rvq = rvq.to("cuda")
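# DAC expects a float32 waveform shaped (batch, 1, samples) at 44.1 kHz; its
# preprocess() only pads to the codec hop length, so any resampling has to happen first.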

#-- 2. Voice Activity Detection via Pyannote
vad_pipe = PyannotePipeline.from_pretrained(
    "pyannote/voice-activity-detection",
    use_auth_token=HF_TOKEN
)                                                   # Gated VAD pipeline load
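# Produces a pyannote Annotation of speech regions; it could be used to trim
# non-speech before encoding, though process_audio() below does not consume it yet.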

#-- 3. Ultravox ASR+LLM Pipeline
ultravox_pipe = pipeline(
    model="fixie-ai/ultravox-v0_4",
    trust_remote_code=True,
    device_map=device_map,
    torch_dtype=torch.float16
)                                                   # Custom speech-to-text + LLM pipeline
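# Following the model card, the pipeline is called with a dict of
# {"audio", "turns", "sampling_rate"} and returns the generated reply text.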

#-- 4. Audio Diffusion Model (Prosody)
diff_pipe = DiffusionPipeline.from_pretrained(
    "teticio/audio-diffusion-instrumental-hiphop-256",
    torch_dtype=torch.float16
).to("cuda")                                        # Diffusers-based audio diffusion load
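# Mel-spectrogram diffusion model; in this app it only serves as an optional
# prosody reference and its output is not mixed into the final response.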

#-- 5. Dia TTS Model
dia = Dia.from_pretrained("nari-labs/Dia-1.6B")     # Dia manages its own weights and device; it is not a transformers model
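# Dia scripts use [S1]/[S2] speaker tags and optional non-verbal cues such as
# (laughs); see https://github.com/nari-labs/dia for the expected text format.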

#-- Inference Function
@torch.inference_mode()
def process_audio(audio):
    sr, arr = audio
    arr = np.asarray(arr, dtype=np.float32)
    if arr.ndim > 1:                                # Down-mix stereo to mono
        arr = arr.mean(axis=-1)
    if np.max(np.abs(arr)) > 1.0:                   # Gradio records int16 PCM by default
        arr = arr / 32768.0
    wav = torch.from_numpy(arr).unsqueeze(0)        # (channel, time)

    if sr != 44100:                                 # DAC and Dia both operate at 44.1 kHz
        wav = torchaudio.functional.resample(wav, sr, 44100)
        sr = 44100

    # VAD segmentation (speech regions; not yet used downstream)
    _ = vad_pipe({"waveform": wav, "sample_rate": sr})

    # RVQ encode/decode round-trip through DAC
    x = rvq.preprocess(wav.unsqueeze(0).to(next(rvq.parameters()).device), sr)
    z, codes, latents, _, _ = rvq.encode(x)         # encode() returns (z, codes, latents, ...)
    decoded = rvq.decode(z).squeeze().cpu().numpy()

    # Ultravox ASR+LLM -> reply text
    turns = [{"role": "system", "content": "You are a friendly voice assistant."}]
    ultra_out = ultravox_pipe(
        {"audio": decoded, "turns": turns, "sampling_rate": sr}, max_new_tokens=256
    )
    text = ultra_out if isinstance(ultra_out, str) else ultra_out.get("text", "")

    # Diffusion-based prosody reference (computed but not yet mixed into the response)
    pros = diff_pipe(raw_audio=decoded)["audios"][0]

    # Dia TTS synthesis (Dia expects [S1]/[S2] speaker tags)
    tts = dia.generate(f"[S1] {text}")
    tts_np = tts.cpu().numpy() if torch.is_tensor(tts) else np.asarray(tts)
    tts_np = np.squeeze(tts_np).astype(np.float32)
    if tts_np.size:
        tts_np = tts_np / np.max(np.abs(tts_np)) * 0.95

    return (sr, tts_np), text
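
# Local smoke test (hypothetical; assumes a mono "sample.wav" and the soundfile package):
#   import soundfile as sf
#   wav, in_sr = sf.read("sample.wav", dtype="float32")
#   (out_sr, out_wav), reply = process_audio((in_sr, wav))
#   sf.write("reply.wav", out_wav, out_sr)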

#-- Gradio UI
with gr.Blocks(title="Maya AI 📈") as demo:
    gr.Markdown("## Maya-AI: Supernatural Conversational Agent")
    audio_in  = gr.Audio(source="microphone", type="numpy", label="Your Voice")
    send_btn  = gr.Button("Send")
    audio_out = gr.Audio(label="AI Response")
    text_out  = gr.Textbox(label="Generated Text")
    send_btn.click(process_audio, inputs=audio_in, outputs=[audio_out, text_out])

if __name__ == "__main__":
    demo.launch()