import os, torch, numpy as np, soundfile as sf
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import nemo.collections.asr as nemo_asr
from TTS.api import TTS
from sklearn.linear_model import LogisticRegression  # for emotion prediction
from datasets import load_dataset

# Configuration
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SAMPLE_RATE = 22050  # nominal playback rate for the Gradio output audio
SEED = 42
torch.manual_seed(SEED); np.random.seed(SEED)

# 1. ASR: Parakeet RNNT
asr = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(
    model_name="nvidia/parakeet-rnnt-1.1b"
).to(DEVICE); asr.eval()

# 2. SER: emotion classifier (placeholder features; a wav2vec2-based sketch follows below)
ds = load_dataset("patrickvonplaten/emotion_speech", split="train[:10%]")  # small sample load
features = ds["audio"]
labels = ds["label"]
# Placeholder feature extraction: random 128-dim vectors stand in for real audio embeddings
X = np.random.rand(len(features), 128); y = np.array(labels)
clf = LogisticRegression().fit(X, y)
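
# Hedged sketch (not part of the original pipeline): one way to replace the random
# feature vectors above with real audio embeddings is to mean-pool wav2vec2 hidden
# states. The model ID "facebook/wav2vec2-base" and the librosa resampling to 16 kHz
# are illustrative assumptions; the helper is defined here but not wired in.
def wav2vec2_features(audio_path):
    from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
    import librosa
    fe = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base")
    w2v = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base").to(DEVICE).eval()
    wav, _ = librosa.load(audio_path, sr=16000, mono=True)  # wav2vec2 expects 16 kHz mono
    inputs = fe(wav, sampling_rate=16000, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        hidden = w2v(**inputs).last_hidden_state  # (1, frames, 768)
    return hidden.mean(dim=1).cpu().numpy()       # (1, 768) pooled embedding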

# 3. NLP: LLaMA-3 (4-bit quantization via bitsandbytes)
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
LLM_ID = "meta-llama/Meta-Llama-3-8B-Instruct"  # gated repo; requires accepted license + HF token
tokenizer = AutoTokenizer.from_pretrained(LLM_ID)
llm = AutoModelForCausalLM.from_pretrained(
    LLM_ID, quantization_config=bnb_config, device_map="auto"
)  # device_map handles placement; .to() is not supported on 4-bit models

# 4. Emotion Prediction: SER → emotion label
def predict_emotion(audio_path):
    # Placeholder: random features stand in for embeddings of the audio at audio_path
    return clf.predict(np.random.rand(1, 128))[0]
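
# Hedged sketch for the "SER → mapping" step: the classifier returns an integer class
# index, while the prompt and TTS expect a readable tag. The names below are a
# hypothetical mapping, not the actual label set of the dataset loaded above.
EMOTION_TAGS = {0: "neutral", 1: "happy", 2: "sad", 3: "angry"}  # hypothetical labels

def emotion_tag(label_idx):
    return EMOTION_TAGS.get(int(label_idx), "neutral")  # fall back to neutral for unseen ids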

# 5. TTS: Dia 1.6B (emotion conditioning not yet wired in; see synthesize below)
tts = TTS("nari-labs/Dia-1.6B", progress_bar=False, gpu=torch.cuda.is_available())

def transcribe(audio):
    sr, data = audio  # gr.Audio(type="numpy") passes a (sample_rate, ndarray) tuple
    sf.write("in.wav", data, sr)
    out = asr.transcribe(["in.wav"])[0]
    return out.text if hasattr(out, "text") else out  # Hypothesis vs. plain str across NeMo versions

def generate_response(text, emo_tag):
    prompt = f"[emotion:{emo_tag}] {text}"
    inputs = tokenizer(prompt, return_tensors="pt").to(llm.device)
    gen = llm.generate(**inputs, max_new_tokens=100, do_sample=True, temperature=0.7)
    new_tokens = gen[0][inputs["input_ids"].shape[-1]:]  # drop the echoed prompt tokens
    return tokenizer.decode(new_tokens, skip_special_tokens=True)
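
# Hedged alternative (an assumption, not the original design): with an instruct-tuned
# Llama-3 checkpoint, the emotion cue can also ride in a system message through the
# tokenizer's chat template rather than a raw "[emotion:...]" prefix.
def generate_response_chat(text, emo_tag):
    messages = [
        {"role": "system", "content": f"Respond empathetically; the user sounds {emo_tag}."},
        {"role": "user", "content": text},
    ]
    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(llm.device)
    gen = llm.generate(input_ids, max_new_tokens=100, do_sample=True, temperature=0.7)
    return tokenizer.decode(gen[0][input_ids.shape[-1]:], skip_special_tokens=True)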

def synthesize(text, emo_tag):
    # emo_tag is currently unused; kept so the signature mirrors generate_response
    wav = tts.tts(text=text, speaker_wav=None, style_wav=None)
    return np.asarray(wav, dtype=np.float32)  # gr.Audio expects an array, not a Python list

def pipeline_fn(audio):
    user_text = transcribe(audio); emo = predict_emotion("in.wav")
    bot_text = generate_response(user_text, emo); wav = synthesize(bot_text, emo)
    return bot_text, (SAMPLE_RATE, wav)

iface = gr.Interface(
    pipeline_fn,
    gr.Audio(sources=["microphone"], type="numpy"),  # Gradio 4.x API (3.x used source="microphone")
    [gr.Textbox(label="Response"), gr.Audio(label="Synthesized speech")],
    title="Emotion-Aware Conversational AI",
)
iface.launch(server_name="0.0.0.0", server_port=7860)
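
# Usage sketch (assumed environment; package names are real, version pins omitted):
#   pip install nemo_toolkit[asr] TTS transformers bitsandbytes datasets gradio soundfile scikit-learn librosa
#   python app.py   # filename assumed; Gradio then serves the UI at http://localhost:7860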