Spaces:
Runtime error
Runtime error
File size: 2,251 Bytes
6e55da8 c5ef34e 653911d 6e55da8 ee439d6 036f56f 1a24747 5adc99b 6e55da8 42e6e01 5adc99b c0e0942 1a24747 c0e0942 036f56f c0e0942 036f56f 42e6e01 1a24747 c0e0942 1a24747 c0e0942 ee439d6 c0e0942 ee439d6 1a24747 c0e0942 1a24747 42e6e01 1a24747 42e6e01 1a24747 c0e0942 653911d 6e55da8 ee439d6 6e55da8 ee439d6 c0e0942 6e55da8 ee439d6 6e55da8 ee439d6 6e55da8 c0e0942 6e55da8 c0e0942 6e55da8 c0e0942 ee439d6 6e55da8 c0e0942 653911d 1a24747 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
import os
import gradio as gr
import torch
import numpy as np
from transformers import pipeline
from diffusers import DiffusionPipeline
from pyannote.audio import Pipeline as PyannotePipeline
from dia.model import Dia
from dac.utils import load_model as load_dac_model
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
# Runtime configuration.
# HF_TOKEN is read eagerly, so a missing variable raises KeyError at startup.
HF_TOKEN = os.environ["HF_TOKEN"]
# Placement value handed to loaders in this file as `device_map`.
device_map = "auto"

# 1. RVQ Codec
# Load the codec, switch it to eval mode, and move it to the GPU only when
# CUDA is actually available.
rvq = load_dac_model(model_type="44khz", tag="latest")
rvq.eval()
if torch.cuda.is_available():
    rvq = rvq.to("cuda")
# 2. VAD
# Voice-activity-detection pipeline, fetched with the Hugging Face token
# held in HF_TOKEN.
_VAD_MODEL_ID = "pyannote/voice-activity-detection"
vad_pipe = PyannotePipeline.from_pretrained(_VAD_MODEL_ID, use_auth_token=HF_TOKEN)
# 3. Ultravox
# Half-precision pipeline placed according to `device_map`.
# NOTE: trust_remote_code=True executes Python shipped in the model
# repository; only keep it for repositories you trust.
ultravox_pipe = pipeline(
    model="fixie-ai/ultravox-v0_4",
    torch_dtype=torch.float16,
    device_map=device_map,
    trust_remote_code=True,
)
# 4. Audio Diffusion
# Load first, then move to the GPU only when one exists. An unconditional
# .to("cuda") raises on CPU-only hosts, and the RVQ codec in this file is
# already placed with this same availability check.
diff_pipe = DiffusionPipeline.from_pretrained(
    "teticio/audio-diffusion-instrumental-hiphop-256"
)
if torch.cuda.is_available():
    diff_pipe = diff_pipe.to("cuda")
# 5. Dia TTS
# Load through Dia's own loader only. The earlier accelerate-based path passed
# the Hub repo id to load_checkpoint_and_dispatch, which expects a local
# checkpoint file or folder, and it built the model under init_empty_weights(),
# a context meant for constructing weightless skeletons rather than for
# from_pretrained-style loading.
# NOTE(review): this drops the explicit float16 / device_map placement and the
# trust_remote_code flag. If the installed Dia release exposes dtype or device
# options on from_pretrained, set them here — confirm against that release's
# signature before adding keyword arguments.
dia = Dia.from_pretrained("nari-labs/Dia-1.6B")
# Inference
def process_audio(audio):
    """Run one recording through the full pipeline and synthesize a spoken reply.

    Stages, in order: voice-activity detection, an RVQ encode/decode round
    trip, Ultravox on the reconstructed audio, audio diffusion, and Dia TTS on
    the text Ultravox produced.

    Args:
        audio: ``(sample_rate, samples)`` pair, or ``None`` when nothing was
            recorded. ``samples`` may be a NumPy array or a torch tensor.

    Returns:
        ``((sample_rate, waveform), text)`` where ``waveform`` is the TTS
        output scaled so its peak magnitude is 0.95 (left unscaled when it is
        empty or all zeros). Returns ``(None, "")`` when ``audio`` is ``None``.
    """
    # Pressing "Send" without a recording delivers None; unpacking it would raise.
    if audio is None:
        return None, ""

    sr, array = audio
    array = array.numpy() if torch.is_tensor(array) else array

    # The VAD result is not consumed yet; the call is kept so the stage still runs.
    _ = vad_pipe(array, sampling_rate=sr)

    # Use the same availability check that decided where the codec lives, so
    # the input tensor and the codec end up on the same device.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Inference only: without no_grad the decoded tensor tracks gradients and
    # .numpy() refuses to convert it.
    with torch.no_grad():
        x = torch.tensor(array).unsqueeze(0).to(device)
        codes = rvq.encode(x)
        decoded = rvq.decode(codes).squeeze().cpu().numpy()

    ultra_out = ultravox_pipe({"array": decoded, "sampling_rate": sr})
    text = ultra_out.get("text", "")

    # NOTE(review): the diffusion output is computed but not used downstream.
    pros = diff_pipe(raw_audio=decoded)["audios"][0]

    tts = dia.generate(f"[emotion:neutral] {text}")
    # Accept either a tensor or an array-like from the TTS model.
    if torch.is_tensor(tts):
        tts = tts.detach().cpu().numpy()
    tts_np = np.asarray(tts).squeeze()

    # Guard the normalization: an empty or silent waveform has no usable peak
    # and dividing by it would raise or fill the output with NaNs.
    peak = np.max(np.abs(tts_np)) if tts_np.size else 0.0
    if peak > 0:
        tts_np = tts_np / peak * 0.95

    # NOTE(review): the reply is tagged with the *input* sample rate; confirm
    # this matches the rate the TTS model synthesizes at.
    return (sr, tts_np), text
# UI
# Single-page Blocks app: a microphone input, a "Send" button, and audio/text
# outputs wired to process_audio.
# NOTE(review): `source="microphone"` is the older gr.Audio keyword; newer
# Gradio releases may expect `sources=[...]` — confirm against the pinned version.
with gr.Blocks(title="Maya AI π") as demo:
    gr.Markdown("## Maya-AI: Supernatural Conversational Agent")
    audio_in = gr.Audio(source="microphone", type="numpy")
    send_btn = gr.Button("Send")
    audio_out = gr.Audio()
    text_out = gr.Textbox()
    # The handler returns (audio, text), matching the order of `outputs`.
    send_btn.click(
        fn=process_audio,
        inputs=audio_in,
        outputs=[audio_out, text_out],
    )

if __name__ == "__main__":
    demo.launch()
|