Spaces:
Runtime error
Runtime error
File size: 3,808 Bytes
6e55da8 c5ef34e 653911d c11bb04 036f56f c11bb04 1a24747 5adc99b 501663c 5adc99b 501663c 1a24747 501663c 036f56f 501663c 036f56f 501663c 1a24747 501663c 1a24747 55c39a0 1a24747 501663c d9c827c 501663c 55c39a0 501663c 55c39a0 501663c 55c39a0 501663c 55c39a0 501663c ee439d6 501663c 653911d 1a24747 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 |
import os
import gradio as gr
import torch
import numpy as np
from transformers import pipeline
from pyannote.audio import Pipeline as PyannotePipeline
from dia.model import Dia
from dac.utils import load_model as load_dac_model
# Environment setup
HF_TOKEN = os.environ["HF_TOKEN"]
device_map = "auto"
print("Loading models...")
# 1. Load RVQ Codec
print("Loading RVQ Codec...")
rvq = load_dac_model(tag="latest", model_type="44khz")
rvq.eval()
if torch.cuda.is_available():
rvq = rvq.to("cuda")
# 2. Load VAD Pipeline
print("Loading VAD...")
vad_pipe = PyannotePipeline.from_pretrained(
"pyannote/voice-activity-detection",
use_auth_token=HF_TOKEN
)
# 3. Load Ultravox Pipeline
print("Loading Ultravox...")
ultravox_pipe = pipeline(
model="fixie-ai/ultravox-v0_4",
trust_remote_code=True,
device_map=device_map,
torch_dtype=torch.float16
)
# 4. Skip Audio Diffusion (causing UNet mismatch)
print("Skipping Audio Diffusion due to compatibility issues...")
diff_pipe = None
# 5. Load Dia TTS (correct method based on current API)
print("Loading Dia TTS...")
dia = Dia.from_pretrained("nari-labs/Dia-1.6B")
print("All models loaded successfully!")
def process_audio(audio):
try:
if audio is None:
return None, "No audio input provided"
sr, array = audio
# Ensure numpy array
if torch.is_tensor(array):
array = array.numpy()
# VAD processing
try:
vad_result = vad_pipe({"waveform": torch.tensor(array).unsqueeze(0), "sample_rate": sr})
except Exception as e:
print(f"VAD processing error: {e}")
# RVQ encode/decode
audio_tensor = torch.tensor(array).unsqueeze(0)
if torch.cuda.is_available():
audio_tensor = audio_tensor.to("cuda")
codes = rvq.encode(audio_tensor)
decoded = rvq.decode(codes).squeeze().cpu().numpy()
# Ultravox ASR + LLM
ultra_out = ultravox_pipe({"array": decoded, "sampling_rate": sr})
text = ultra_out.get("text", "I understand your audio input.")
# Skip diffusion processing due to compatibility issues
prosody_audio = decoded
# Dia TTS generation
tts_output = dia.generate(f"[emotion:neutral] {text}")
# Convert to numpy and normalize
if torch.is_tensor(tts_output):
tts_np = tts_output.squeeze().cpu().numpy()
else:
tts_np = np.array(tts_output)
# Normalize audio output
if len(tts_np) > 0:
tts_np = tts_np / np.max(np.abs(tts_np)) * 0.95
return (sr, tts_np), text
except Exception as e:
print(f"Error in process_audio: {e}")
return None, f"Processing error: {str(e)}"
# Gradio Interface
with gr.Blocks(title="Maya AI π") as demo:
gr.Markdown("# Maya-AI: Supernatural Conversational Agent")
gr.Markdown("Record audio to interact with the AI agent that understands emotions and responds naturally.")
with gr.Row():
with gr.Column():
audio_in = gr.Audio(
source="microphone",
type="numpy",
label="Record Your Voice"
)
send_btn = gr.Button("Send", variant="primary")
with gr.Column():
audio_out = gr.Audio(label="AI Response")
text_out = gr.Textbox(
label="Generated Text",
lines=3,
placeholder="AI response will appear here..."
)
# Event handler
send_btn.click(
fn=process_audio,
inputs=audio_in,
outputs=[audio_out, text_out]
)
if __name__ == "__main__":
demo.launch()
|