import numpy as np
import torch
import soundfile as sf
import gradio as gr
import nemo.collections.asr as nemo_asr
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from TTS.api import TTS
from sklearn.linear_model import LogisticRegression  # for emotion prediction
from datasets import load_dataset
# Configuration
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SAMPLE_RATE = 22050
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
# 1. ASR: Parakeet RNNT
asr = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(
    model_name="nvidia/parakeet-rnnt-1.1b"
).to(DEVICE)
asr.eval()
# 2. SER: emotion classifier (logistic regression over audio embeddings)
ds = load_dataset("patrickvonplaten/emotion_speech", split="train[:10%]")  # sample load
features = ds["audio"]
labels = ds["label"]
# Placeholder feature extraction: random vectors stand in for real audio
# embeddings (see the wav2vec2 sketch below)
X = np.random.rand(len(features), 128)
y = np.array(labels)
clf = LogisticRegression(max_iter=1000).fit(X, y)
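# Hedged sketch (assumption, not part of the original pipeline): real SER
# features could come from a pretrained wav2vec2 encoder, mean-pooled over
# time. The model name and pooling choice are illustrative; a classifier
# trained on these 768-dim features would replace the 128-dim placeholder above.
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model

w2v_fe = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base")
w2v = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base").to(DEVICE).eval()

def extract_features(waveform, sr=16000):
    # waveform: 1-D float array at 16 kHz, wav2vec2's expected sample rate
    inputs = w2v_fe(waveform, sampling_rate=sr, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        hidden = w2v(**inputs).last_hidden_state  # (1, frames, 768)
    return hidden.mean(dim=1).cpu().numpy()       # mean-pool over time -> (1, 768)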
# 3. NLP: LLaMA-3, 4-bit quantized (smallest Llama 3 release is 8B; it is a
# decoder-only model, so it loads via AutoModelForCausalLM)
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
llm = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B", quantization_config=bnb_config, device_map="auto"
)  # device_map="auto" handles placement; .to(DEVICE) is invalid on 4-bit models
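# Note: meta-llama checkpoints are gated on the Hugging Face Hub; loading them
# requires accepting the license and an authenticated session
# (e.g. `huggingface-cli login`).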
# 4. Emotion Prediction: SER → label mapping
def predict_emotion(audio_path):
    # Placeholder: random features instead of embeddings from audio_path;
    # swap in extract_features(...) above for a real pipeline
    return clf.predict(np.random.rand(1, 128))[0]
# 5. TTS: Dia 1.6B with emotion conditioning (assumes the checkpoint is
# loadable through the Coqui TTS API)
tts = TTS("nari-labs/Dia-1.6B", progress_bar=False, gpu=torch.cuda.is_available())
def transcribe(audio):
    sr, data = audio  # gr.Audio(type="numpy") yields a (sample_rate, samples) tuple
    sf.write("in.wav", data, sr)  # write at the actual capture rate; NeMo resamples on load
    return asr.transcribe(["in.wav"])[0].text
def generate_response(text, emo_tag):
    prompt = f"[emotion:{emo_tag}] {text}"
    inputs = tokenizer(prompt, return_tensors="pt").to(llm.device)
    gen = llm.generate(**inputs, max_new_tokens=100, do_sample=True, temperature=0.7)
    return tokenizer.decode(gen[0], skip_special_tokens=True)
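# Illustrative call: generate_response("I had a rough day.", "sad") samples a
# continuation conditioned on the "[emotion:sad]" prefix. Note the base model
# is not instruction-tuned, so it continues the prompt rather than answering
# it; an -Instruct variant with a chat template would behave more like a bot.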
def synthesize(text, emo_tag):
    # emo_tag is unused here; see synthesize_with_emotion below for one option
    return np.array(tts.tts(text=text, speaker_wav=None, style_wav=None))
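# Hedged sketch (assumption): Dia reads non-verbal cues from the text itself,
# so one way to use emo_tag is to prepend an illustrative cue. This mapping is
# made up for demonstration and is not a documented Dia API.
EMOTION_CUES = {"happy": "(laughs)", "sad": "(sighs)"}

def synthesize_with_emotion(text, emo_tag):
    cue = EMOTION_CUES.get(str(emo_tag), "")
    return np.array(tts.tts(text=f"{cue} {text}".strip()))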
def pipeline_fn(audio):
    user_text = transcribe(audio)
    emo = predict_emotion("in.wav")
    bot_text = generate_response(user_text, emo)
    wav = synthesize(bot_text, emo)
    # Return audio at the synthesizer's own output rate, not the input rate
    return bot_text, (tts.synthesizer.output_sample_rate, wav)
iface = gr.Interface(
    pipeline_fn,
    gr.Audio(source="microphone", type="numpy"),
    [gr.Textbox(), gr.Audio()],
    title="Emotion-Aware Conversational AI",
)
iface.launch(server_name="0.0.0.0", server_port=7860)