import os
import gradio as gr
import torch
import numpy as np
from transformers import pipeline
from diffusers import DiffusionPipeline
from pyannote.audio import Pipeline as PyannotePipeline
from dia.model import Dia
from dac.utils import load_model as load_dac_model
import torchaudio  # resampling helper; installed as a dependency of pyannote.audio
#-- Configuration
HF_TOKEN = os.environ["HF_TOKEN"]  # token for gated models (pyannote VAD)
device_map = "auto"  # shard large models across the available GPUs (e.g. 4x L4)
#-- 1. Descript Audio Codec (RVQ)
rvq = load_dac_model(model_type="44khz", tag="latest")  # RVQ encoder/decoder; native rate is 44.1 kHz
rvq.eval()
if torch.cuda.is_available():
    rvq = rvq.to("cuda")
#-- 2. Voice Activity Detection via pyannote
vad_pipe = PyannotePipeline.from_pretrained(
    "pyannote/voice-activity-detection",
    use_auth_token=HF_TOKEN,  # gated model, requires an accepted license on the Hub
)
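#-- Example: consuming the VAD output (a pyannote Annotation). This helper is a sketch and is
#   not wired into process_audio below; segment boundaries come back in seconds.
def speech_regions(waveform, sample_rate):
    ann = vad_pipe({"waveform": waveform, "sample_rate": sample_rate})
    return [(seg.start, seg.end) for seg in ann.get_timeline().support()]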
#-- 3. Ultravox ASR+LLM pipeline (custom pipeline shipped with the model repo)
ultravox_pipe = pipeline(
    model="fixie-ai/ultravox-v0_4",
    trust_remote_code=True,
    device_map=device_map,
    torch_dtype=torch.float16,
)
#-- 4. Audio Diffusion Model (Prosody)
diff_pipe = DiffusionPipeline.from_pretrained(
    "teticio/audio-diffusion-instrumental-hiphop-256",
    torch_dtype=torch.float16,
).to("cuda")
#-- 5. Dia TTS Model
# Note: dia.model.Dia is a standalone package, not a transformers AutoModel, so it does not
# accept device_map/trust_remote_code; it loads onto a single GPU via its own from_pretrained
# (depending on the installed dia version, a compute_dtype argument may also be available).
dia = Dia.from_pretrained("nari-labs/Dia-1.6B")
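#-- Optional diagnostics: report per-GPU memory once everything is loaded (purely informational,
#   safe to remove; assumes CUDA devices are visible in the Space)
if torch.cuda.is_available():
    for _gpu in range(torch.cuda.device_count()):
        print(f"GPU {_gpu}: {torch.cuda.memory_allocated(_gpu) / 1e9:.1f} GB allocated after model load")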
#-- Inference Function
@torch.inference_mode()
def process_audio(audio):
    sr, arr = audio
    arr = arr.numpy() if torch.is_tensor(arr) else arr
    # Gradio microphone input is typically int16; convert to mono float32 in [-1, 1]
    if arr.dtype != np.float32:
        arr = arr.astype(np.float32) / 32768.0
    if arr.ndim > 1:
        arr = arr.mean(axis=1)
    wav = torch.from_numpy(arr).unsqueeze(0)  # shape (1, time)
    # Resample to DAC's native 44.1 kHz if needed
    if sr != 44100:
        wav = torchaudio.functional.resample(wav, sr, 44100)
        sr = 44100
    # VAD segmentation (segments not used downstream yet)
    _ = vad_pipe({"waveform": wav, "sample_rate": sr})
    # RVQ encode/decode round trip; DAC expects shape (batch, 1, time)
    x = rvq.preprocess(wav.unsqueeze(0).to("cuda"), sr)
    z, codes, *_ = rvq.encode(x)  # encode returns (z, codes, latents, ...)
    decoded = rvq.decode(z).squeeze().cpu().numpy()
    # Ultravox speech -> text; the custom pipeline takes audio plus chat "turns"
    turns = [{"role": "system", "content": "You are a friendly voice assistant."}]
    ultra_out = ultravox_pipe(
        {"audio": decoded, "turns": turns, "sampling_rate": sr}, max_new_tokens=128
    )
    text = ultra_out if isinstance(ultra_out, str) else str(ultra_out)
    # Diffusion-based prosody pass (output not fed into the TTS stage yet)
    _ = diff_pipe(raw_audio=decoded)
    # Dia TTS synthesis; Dia uses [S1]/[S2] speaker tags and emits 44.1 kHz audio
    tts = dia.generate(f"[S1] {text}")
    tts_np = tts.squeeze().cpu().numpy() if torch.is_tensor(tts) else np.asarray(tts)
    if tts_np.size:
        tts_np = tts_np / (np.max(np.abs(tts_np)) + 1e-8) * 0.95
    return (44100, tts_np), text
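#-- Optional local smoke test: run the full pipeline on a WAV file without the UI.
#   Assumes soundfile is available (it is pulled in by the Dia/DAC dependencies) and that a
#   recording named sample.wav sits next to app.py; both are assumptions for illustration.
def _local_smoke_test(path="sample.wav"):
    import soundfile as sf
    arr, in_sr = sf.read(path, dtype="float32")
    (out_sr, out), text = process_audio((in_sr, arr))
    sf.write("response.wav", out, out_sr)
    print("Generated text:", text)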
#-- Gradio UI
with gr.Blocks(title="Maya AI") as demo:
    gr.Markdown("## Maya-AI: Supernatural Conversational Agent")
    # Gradio 4.x renamed the `source` argument to `sources` (a list)
    audio_in = gr.Audio(sources=["microphone"], type="numpy", label="Your Voice")
    send_btn = gr.Button("Send")
    audio_out = gr.Audio(label="AI Response")
    text_out = gr.Textbox(label="Generated Text")
    send_btn.click(process_audio, inputs=audio_in, outputs=[audio_out, text_out])
if __name__ == "__main__":
    demo.launch()