import gradio as gr
from transformers import AutoProcessor, AutoModelForCausalLM, pipeline
import torch
import numpy as np
from pyannote.audio import Pipeline as VAD
import dac

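# Pipeline overview (as implemented in process_audio below):
#   1. pyannote VAD flags speech regions in the recording
#   2. a wav2vec2 audio-classification pipeline predicts the speaker's emotion
#   3. the clip is round-tripped through the Descript Audio Codec (RVQ encode/decode)
#   4. Ultravox generates a text response conditioned on the audio
#   5. Dia synthesizes the spoken reply
#
# Approximate pip dependencies implied by the imports: gradio, torch, numpy, transformers,
# diffusers, pyannote.audio, descript-audio-codec (the `dac` module), and the nari-labs Dia
# package (see its repository for install instructions).
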
# Load models with proper error handling
def load_models():
    try:
        # Ultravox via transformers (no separate package needed)
        ultra_proc = AutoProcessor.from_pretrained("fixie-ai/ultravox-v0_4", trust_remote_code=True)
        ultra_model = AutoModelForCausalLM.from_pretrained("fixie-ai/ultravox-v0_4", device_map="auto", torch_dtype=torch.float16, trust_remote_code=True)
        
        # Speech emotion recognition via transformers pipeline
        emotion_pipeline = pipeline("audio-classification", model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition", device=0 if torch.cuda.is_available() else -1)
        
        # Audio diffusion (via diffusers; loaded here but not used in process_audio below)
        from diffusers import DiffusionPipeline
        diff_pipe = DiffusionPipeline.from_pretrained("teticio/audio-diffusion-instrumental-hiphop-256")
        
        # Descript Audio Codec
        from dac.utils import load_model as load_dac_model
        rvq = load_dac_model(tag="latest", model_type="44khz")
        rvq.eval()
        if torch.cuda.is_available():
            rvq = rvq.to("cuda")
        
        # VAD
        vad = VAD.from_pretrained("pyannote/voice-activity-detection")
        
        # Dia TTS
        from dia.model import Dia
        dia = Dia.from_pretrained("nari-labs/Dia-1.6B", compute_dtype="float16")
        
        return ultra_proc, ultra_model, emotion_pipeline, diff_pipe, rvq, vad, dia
        
    except Exception as e:
        print(f"Error loading models: {e}")
        return None, None, None, None, None, None, None

# Initialize models
ultra_proc, ultra_model, emotion_pipeline, diff_pipe, rvq, vad, dia = load_models()

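# NOTE: the fixie-ai/ultravox model card documents a higher-level entry point via the
# transformers pipeline API, which handles prompt formatting internally. A sketch, kept as a
# comment so the explicit processor/model path in process_audio stays the single code path:
#
#   turns = [{"role": "system", "content": "You are a friendly voice assistant."}]
#   ultra_pipe = pipeline(model="fixie-ai/ultravox-v0_4", trust_remote_code=True)
#   reply = ultra_pipe({"audio": audio_array, "turns": turns, "sampling_rate": sample_rate},
#                      max_new_tokens=50)
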
def process_audio(audio):
    try:
        if audio is None:
            return None, "No audio input provided"
        
        # gr.Audio(type="numpy") yields (sample_rate, int16 array); dict inputs are also handled
        sample_rate = audio[0] if isinstance(audio, tuple) else audio["sampling_rate"]
        audio_array = audio[1] if isinstance(audio, tuple) else audio["array"]

        # Normalize to a float32 mono array in [-1, 1] for the downstream models
        if torch.is_tensor(audio_array):
            audio_array = audio_array.numpy()
        if audio_array.ndim > 1:
            audio_array = audio_array.mean(axis=-1)  # downmix stereo to mono
        if np.issubdtype(audio_array.dtype, np.integer):
            audio_array = audio_array / np.iinfo(audio_array.dtype).max
        audio_array = audio_array.astype(np.float32)
        
        # VAD: pyannote expects {"waveform": (channel, time) float tensor, "sample_rate": sr};
        # the segmentation is computed here but not used further downstream
        if vad is not None:
            speech_segments = vad({"waveform": torch.from_numpy(audio_array).unsqueeze(0), "sample_rate": sample_rate})
        
        # Emotion recognition (pass a {"raw": ..., "sampling_rate": ...} dict so the
        # pipeline can resample to the model's expected rate if needed)
        emotion_result = "neutral"
        if emotion_pipeline is not None:
            try:
                emotion_pred = emotion_pipeline({"raw": audio_array, "sampling_rate": sample_rate})
                emotion_result = emotion_pred[0]["label"] if emotion_pred else "neutral"
            except Exception:
                emotion_result = "neutral"
        
        # RVQ round-trip through the Descript Audio Codec. DAC expects (batch, 1, time) input
        # at its 44.1 kHz rate; a mismatched rate falls through to the except and the original
        # audio is kept.
        if rvq is not None:
            try:
                audio_tensor = torch.from_numpy(audio_array).float().unsqueeze(0).unsqueeze(0)
                if torch.cuda.is_available():
                    audio_tensor = audio_tensor.to("cuda")
                audio_tensor = rvq.preprocess(audio_tensor, sample_rate)
                z, codes, latents, _, _ = rvq.encode(audio_tensor)  # encode returns a tuple
                decoded_audio = rvq.decode(z)
                audio_array = decoded_audio.squeeze().cpu().numpy()
            except Exception as e:
                print(f"RVQ processing error: {e}")
        
        # Ultravox generation. The processor is assumed to take a text prompt containing an
        # <|audio|> placeholder alongside the raw audio (matching the model card's chat format);
        # adjust if the checkpoint's processor signature differs.
        response_text = "I understand your audio input."
        if ultra_proc is not None and ultra_model is not None:
            try:
                inputs = ultra_proc(text="<|audio|>", audio=audio_array, sampling_rate=sample_rate, return_tensors="pt")
                if torch.cuda.is_available():
                    inputs = {k: v.to("cuda") for k, v in inputs.items()}

                with torch.no_grad():
                    outputs = ultra_model.generate(**inputs, max_new_tokens=50)
                    response_text = ultra_proc.decode(outputs[0], skip_special_tokens=True)
            except Exception as e:
                print(f"Ultravox generation error: {e}")
                response_text = f"Detected emotion: {emotion_result}"
        
        # TTS generation. Dia prompts use [S1]/[S2] speaker tags; "[emotion:...]" is not a
        # documented Dia control tag, so only the required speaker tag is prepended here.
        output_audio = None
        if dia is not None:
            try:
                tts_text = f"[S1] {response_text}"
                output_audio = dia.generate(tts_text)
                if torch.is_tensor(output_audio):
                    output_audio = output_audio.cpu().numpy()
                # Peak-normalize, guarding against all-zero output
                if output_audio is not None:
                    peak = np.max(np.abs(output_audio))
                    if peak > 0:
                        output_audio = output_audio / peak * 0.95
            except Exception as e:
                print(f"TTS generation error: {e}")
        # Dia synthesizes at 44.1 kHz, which may differ from the input sample rate
        return ((44100, output_audio) if output_audio is not None else None), response_text
        
    except Exception as e:
        return None, f"Processing error: {str(e)}"

# Create Gradio interface
with gr.Blocks(title="Supernatural Speech AI") as demo:
    gr.Markdown("# Supernatural Speech AI Agent")
    gr.Markdown("Record audio to interact with the AI agent that understands emotions and responds naturally.")
    
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(sources=["microphone"], type="numpy", label="Record Audio")
            process_btn = gr.Button("Process Audio", variant="primary")
        
        with gr.Column():
            audio_output = gr.Audio(label="AI Response")
            text_output = gr.Textbox(label="Response Text", lines=3)
    
    conversation_history = gr.State([])  # reserved for multi-turn history; not wired into process_audio yet
    
    process_btn.click(
        fn=process_audio,
        inputs=[audio_input],
        outputs=[audio_output, text_output]
    )

if __name__ == "__main__":
    demo.queue(concurrency_limit=20, max_size=50).launch()
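    # Local testing: run `python app.py` and open the printed URL (http://localhost:7860 by default).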