import os, torch, numpy as np, soundfile as sf
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import nemo.collections.asr as nemo_asr
from TTS.api import TTS
from sklearn.linear_model import LogisticRegression  # for emotion prediction
from datasets import load_dataset

# Configuration
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SAMPLE_RATE = 22050  # nominal playback rate for the Gradio output audio
SEED = 42
torch.manual_seed(SEED); np.random.seed(SEED)

# 1. ASR: Parakeet RNNT
asr = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(
    model_name="nvidia/parakeet-rnnt-1.1b"
).to(DEVICE); asr.eval()

# 2. SER: emotion classifier (placeholder features; a wav2vec2-based sketch follows below)
ds = load_dataset("patrickvonplaten/emotion_speech", split="train[:10%]")  # small sample load
features = ds["audio"]
labels = ds["label"]
# Placeholder feature extraction: random 128-dim vectors stand in for real audio embeddings
X = np.random.rand(len(features), 128); y = np.array(labels)
clf = LogisticRegression().fit(X, y)
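
# Hedged sketch (not part of the original pipeline): one way to replace the random
# feature vectors above with real audio embeddings is to mean-pool wav2vec2 hidden
# states. The model ID "facebook/wav2vec2-base" and the librosa resampling to 16 kHz
# are illustrative assumptions; the helper is defined here but not wired in.
def wav2vec2_features(audio_path):
    from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
    import librosa
    fe = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base")
    w2v = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base").to(DEVICE).eval()
    wav, _ = librosa.load(audio_path, sr=16000, mono=True)  # wav2vec2 expects 16 kHz mono
    inputs = fe(wav, sampling_rate=16000, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        hidden = w2v(**inputs).last_hidden_state  # (1, frames, 768)
    return hidden.mean(dim=1).cpu().numpy()       # (1, 768) pooled embedding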

# 3. NLP: LLaMA-3 (4-bit quantization via bitsandbytes)
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
LLM_ID = "meta-llama/Meta-Llama-3-8B-Instruct"  # gated repo; requires accepted license + HF token
tokenizer = AutoTokenizer.from_pretrained(LLM_ID)
llm = AutoModelForCausalLM.from_pretrained(
    LLM_ID, quantization_config=bnb_config, device_map="auto"
)  # device_map handles placement; .to() is not supported on 4-bit models

# 4. Emotion Prediction: SER → emotion label
def predict_emotion(audio_path):
    # Placeholder: random features stand in for embeddings of the audio at audio_path
    return clf.predict(np.random.rand(1, 128))[0]
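
# Hedged sketch for the "SER → mapping" step: the classifier returns an integer class
# index, while the prompt and TTS expect a readable tag. The names below are a
# hypothetical mapping, not the actual label set of the dataset loaded above.
EMOTION_TAGS = {0: "neutral", 1: "happy", 2: "sad", 3: "angry"}  # hypothetical labels

def emotion_tag(label_idx):
    return EMOTION_TAGS.get(int(label_idx), "neutral")  # fall back to neutral for unseen ids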

# 5. TTS: Dia 1.6B (emotion conditioning not yet wired in; see synthesize below)
tts = TTS("nari-labs/Dia-1.6B", progress_bar=False, gpu=torch.cuda.is_available())

def transcribe(audio):
    sr, data = audio  # gr.Audio(type="numpy") passes a (sample_rate, ndarray) tuple
    sf.write("in.wav", data, sr)
    out = asr.transcribe(["in.wav"])[0]
    return out.text if hasattr(out, "text") else out  # Hypothesis vs. plain str across NeMo versions

def generate_response(text, emo_tag):
    prompt = f"[emotion:{emo_tag}] {text}"
    inputs = tokenizer(prompt, return_tensors="pt").to(llm.device)
    gen = llm.generate(**inputs, max_new_tokens=100, do_sample=True, temperature=0.7)
    new_tokens = gen[0][inputs["input_ids"].shape[-1]:]  # drop the echoed prompt tokens
    return tokenizer.decode(new_tokens, skip_special_tokens=True)
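
# Hedged alternative (an assumption, not the original design): with an instruct-tuned
# Llama-3 checkpoint, the emotion cue can also ride in a system message through the
# tokenizer's chat template rather than a raw "[emotion:...]" prefix.
def generate_response_chat(text, emo_tag):
    messages = [
        {"role": "system", "content": f"Respond empathetically; the user sounds {emo_tag}."},
        {"role": "user", "content": text},
    ]
    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(llm.device)
    gen = llm.generate(input_ids, max_new_tokens=100, do_sample=True, temperature=0.7)
    return tokenizer.decode(gen[0][input_ids.shape[-1]:], skip_special_tokens=True)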

def synthesize(text, emo_tag):
    # emo_tag is currently unused; kept so the signature mirrors generate_response
    wav = tts.tts(text=text, speaker_wav=None, style_wav=None)
    return np.asarray(wav, dtype=np.float32)  # gr.Audio expects an array, not a Python list

def pipeline_fn(audio):
    user_text = transcribe(audio); emo = predict_emotion("in.wav")
    bot_text = generate_response(user_text, emo); wav = synthesize(bot_text, emo)
    return bot_text, (SAMPLE_RATE, wav)

iface = gr.Interface(
    pipeline_fn,
    gr.Audio(sources=["microphone"], type="numpy"),  # Gradio 4.x API (3.x used source="microphone")
    [gr.Textbox(label="Response"), gr.Audio(label="Synthesized speech")],
    title="Emotion-Aware Conversational AI",
)
iface.launch(server_name="0.0.0.0", server_port=7860)
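
# Usage sketch (assumed environment; package names are real, version pins omitted):
#   pip install nemo_toolkit[asr] TTS transformers bitsandbytes datasets gradio soundfile scikit-learn librosa
#   python app.py   # filename assumed; Gradio then serves the UI at http://localhost:7860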