import os
import gradio as gr
import torch
import numpy as np
from transformers import pipeline
from pyannote.audio import Pipeline as PyannotePipeline
from dia.model import Dia
from dac.utils import load_model as load_dac_model
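# Third-party requirements implied by the imports above: gradio, torch, numpy,
# transformers, pyannote.audio, descript-audio-codec (dac), and the nari-labs dia package.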

# Environment setup (HF_TOKEN is needed to download the gated pyannote pipeline)
HF_TOKEN = os.environ.get("HF_TOKEN")
if HF_TOKEN is None:
    raise RuntimeError("Set the HF_TOKEN environment variable before starting the app.")
device_map = "auto"

print("Loading models...")

# 1. Load RVQ Codec
print("Loading RVQ Codec...")
rvq = load_dac_model(tag="latest", model_type="44khz")
rvq.eval()
if torch.cuda.is_available():
    rvq = rvq.to("cuda")

# 2. Load VAD Pipeline
print("Loading VAD...")
vad_pipe = PyannotePipeline.from_pretrained(
    "pyannote/voice-activity-detection",
    use_auth_token=HF_TOKEN
)

# 3. Load Ultravox Pipeline
print("Loading Ultravox...")
ultravox_pipe = pipeline(
    model="fixie-ai/ultravox-v0_4",
    trust_remote_code=True,
    device_map=device_map,
    torch_dtype=torch.float16
)

# 4. Skip Audio Diffusion (causing UNet mismatch)
print("Skipping Audio Diffusion due to compatibility issues...")
diff_pipe = None

# 5. Load Dia TTS (correct method based on current API)
print("Loading Dia TTS...")
dia = Dia.from_pretrained("nari-labs/Dia-1.6B")

print("All models loaded successfully!")

def process_audio(audio):
    try:
        if audio is None:
            return None, "No audio input provided"
        
        sr, array = audio

        # Gradio's numpy audio is typically int16 PCM; convert to mono float32 in [-1, 1]
        if torch.is_tensor(array):
            array = array.cpu().numpy()
        array = np.asarray(array, dtype=np.float32)
        if array.ndim > 1:
            array = array.mean(axis=1)
        if np.max(np.abs(array)) > 1.0:
            array = array / 32768.0
        
        # VAD: log detected speech regions (the result is not used further downstream)
        try:
            waveform = torch.tensor(array, dtype=torch.float32).unsqueeze(0)
            vad_result = vad_pipe({"waveform": waveform, "sample_rate": sr})
            print(f"Detected speech regions: {vad_result.get_timeline()}")
        except Exception as e:
            print(f"VAD processing error: {e}")
        
        # RVQ encode/decode round-trip; DAC expects a float tensor of shape [B, 1, T]
        # (the 44 kHz model nominally expects 44.1 kHz input; other rates pass through unresampled here)
        audio_tensor = torch.tensor(array, dtype=torch.float32).view(1, 1, -1)
        if torch.cuda.is_available():
            audio_tensor = audio_tensor.to("cuda")
        with torch.no_grad():
            z, codes, latents, _, _ = rvq.encode(audio_tensor)
            decoded = rvq.decode(z).squeeze().cpu().numpy()
        
        # Ultravox: speech understanding + response generation
        # (the model card's pipeline takes an "audio" key; output may be a string or a dict)
        ultra_out = ultravox_pipe({"audio": decoded, "sampling_rate": sr}, max_new_tokens=128)
        text = ultra_out if isinstance(ultra_out, str) else ultra_out.get("text", "")
        text = text or "I understand your audio input."
        
        # Skip diffusion-based prosody processing due to compatibility issues (currently unused)
        prosody_audio = decoded
        
        # Dia TTS generation (Dia scripts use [S1]/[S2] speaker tags)
        tts_output = dia.generate(f"[S1] {text}")
        
        # Convert to numpy and normalize
        if torch.is_tensor(tts_output):
            tts_np = tts_output.squeeze().cpu().numpy()
        else:
            tts_np = np.array(tts_output)
        
        # Normalize audio output, guarding against an all-zero (silent) generation
        peak = np.max(np.abs(tts_np)) if tts_np.size else 0.0
        if peak > 0:
            tts_np = tts_np / peak * 0.95

        # Dia outputs 44.1 kHz audio regardless of the input sample rate
        return (44100, tts_np), text
        
    except Exception as e:
        print(f"Error in process_audio: {e}")
        return None, f"Processing error: {str(e)}"

# Gradio Interface
with gr.Blocks(title="Maya AI 📈") as demo:
    gr.Markdown("# Maya-AI: Supernatural Conversational Agent")
    gr.Markdown("Record audio to interact with the AI agent that understands emotions and responds naturally.")
    
    with gr.Row():
        with gr.Column():
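            # NOTE: `source=` is the Gradio 3.x argument name; Gradio 4.x expects
            # sources=["microphone"] instead.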
            audio_in = gr.Audio(
                source="microphone", 
                type="numpy", 
                label="Record Your Voice"
            )
            send_btn = gr.Button("Send", variant="primary")
        
        with gr.Column():
            audio_out = gr.Audio(label="AI Response")
            text_out = gr.Textbox(
                label="Generated Text", 
                lines=3,
                placeholder="AI response will appear here..."
            )
    
    # Event handler
    send_btn.click(
        fn=process_audio,
        inputs=audio_in,
        outputs=[audio_out, text_out]
    )

if __name__ == "__main__":
    demo.launch()