Update app.py to use GPU for faster processing
app.py CHANGED

@@ -106,6 +106,7 @@ class ChatGroq:
         except requests.exceptions.RequestException as e:
             logging.error(f"ChatGroq API error: {e}")
             return "Interviewer: Due to a system issue, let's move on to another question."
+
 groq_llm = ChatGroq(
     temperature=0.7,
     model_name="llama-3.3-70b-versatile",

@@ -122,26 +123,34 @@ if HF_TOKEN:
 else:
     raise EnvironmentError("Missing HF_TOKEN environment variable.")
 
-
+#Load mistral Model
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 import torch
+print(torch.cuda.is_available())
+
+MODEL_PATH = "mistralai/Mistral-7B-Instruct-v0.3"
+#MODEL_PATH = "tiiuae/falcon-rw-1b"
 
-
-
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.float16,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4"
+)
 
-
+mistral_tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH,use_auth_token=True)
 
-
+judge_llm = AutoModelForCausalLM.from_pretrained(
     MODEL_PATH,
-    torch_dtype=torch.
+    quantization_config=bnb_config,torch_dtype=torch.float16,
     device_map="auto",
-
+    use_auth_token=True
 )
 
-
+judge_pipeline = pipeline(
     "text-generation",
-    model=
-    tokenizer=
+    model=judge_llm,
+    tokenizer=mistral_tokenizer,
     max_new_tokens=128,
     temperature=0.3,
     top_p=0.9,

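One thing to verify in this hunk: bnb_config is built from BitsAndBytesConfig, but that name is not part of the transformers import shown above, and 4-bit loading additionally requires the bitsandbytes package plus a CUDA device at runtime (newer transformers releases also prefer token= over the deprecated use_auth_token=). If BitsAndBytesConfig is not already imported elsewhere in app.py, a minimal sketch of the import this block relies on:

# Sketch only, not part of the commit: the import the 4-bit Mistral load above
# assumes. 4-bit loading also needs the bitsandbytes package and a CUDA GPU.
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
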
@@ -149,9 +158,6 @@ falcon_pipeline = pipeline(
     repetition_penalty=1.1,
 )
 
-# ✅ Test it
-# result = falcon_pipeline("Explain LLMs:")
-# print(result[0]["generated_text"])
 
 # embedding model
 from sentence_transformers import SentenceTransformer

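The lines removed here were the Falcon-era smoke test. An equivalent quick check against the new judge_pipeline could look like the sketch below; the [INST] ... [/INST] wrapper is the Mistral-Instruct prompt convention, and the prompt text itself is only a placeholder:

# Hypothetical smoke test, mirroring the removed Falcon check.
result = judge_pipeline("[INST] Explain LLMs in one sentence. [/INST]")
print(result[0]["generated_text"])
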
@@ -1455,6 +1461,10 @@ import whisper
 from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
 import librosa
 
+import torch
+print(torch.cuda.is_available()) # ✅ Tells you if GPU is available
+
+
 # Bark TTS
 print("🔄 Loading Bark model...")
 model_bark = BarkModel.from_pretrained("suno/bark")

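This hunk only adds the CUDA availability print; Bark itself is still loaded without a device move. Since BarkModel is an ordinary PyTorch module, the same guarded .to() pattern the commit applies to Wav2Vec2 further down would also work here, if Bark inference is meant to run on the GPU as well:

# Sketch: optional GPU placement for Bark (not part of this commit).
model_bark = model_bark.to("cuda" if torch.cuda.is_available() else "cpu")
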
@@ -1481,7 +1491,7 @@ def bark_tts(text):
 
 # Whisper STT
 print("🔄 Loading Whisper model...")
-whisper_model = whisper.load_model("base")
+whisper_model = whisper.load_model("base", device="cuda")
 print("✅ Whisper model loaded")
 def whisper_stt(audio_path):
     if not audio_path or not os.path.exists(audio_path): return ""

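Note that whisper.load_model raises if device="cuda" is requested on a CPU-only machine, so this line only works when the Space actually has GPU hardware. A guarded variant, following the same fallback pattern the commit uses for the Wav2Vec2 model below, would be:

# Sketch: fall back to CPU when no GPU is available (not part of the commit).
whisper_device = "cuda" if torch.cuda.is_available() else "cpu"
whisper_model = whisper.load_model("base", device=whisper_device)
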
@@ -1543,6 +1553,7 @@ local_wav2vec_model_path = "HaniaRuby/speech-emotion-recognition-wav2vec2" # Loc
 print("🔄 Loading Wav2Vec processor and model...")
 wav2vec_processor = Wav2Vec2Processor.from_pretrained(local_wav2vec_model_path)
 wav2vec_model = Wav2Vec2ForSequenceClassification.from_pretrained(local_wav2vec_model_path)
+wav2vec_model = wav2vec_model.to("cuda" if torch.cuda.is_available() else "cpu")
 print("✅ Wav2Vec model loaded")
 wav2vec_model.eval()
 voice_label_map = {

@@ -1555,14 +1566,23 @@ voice_label_map = {
 def analyze_audio_emotion(audio_path):
     print(f"🔍 Analyzing audio emotion for: {audio_path}")
     if not audio_path or not os.path.exists(audio_path): return "neutral"
+
     speech, sr = librosa.load(audio_path, sr=16000)
     inputs = wav2vec_processor(speech, sampling_rate=16000, return_tensors="pt")
+
+    # 🔥 Move model and inputs to GPU
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    wav2vec_model.to(device)
+    inputs = {k: v.to(device) for k, v in inputs.items()}
+
     with torch.no_grad():
         logits = wav2vec_model(**inputs).logits
+
     probs = torch.nn.functional.softmax(logits, dim=-1)
     predicted_id = torch.argmax(probs, dim=-1).item()
     return voice_label_map.get(predicted_id, "neutral")
 
+
 # --- Effective confidence calculation
 def interpret_confidence(voice_label, face_label, answer_score_label, k=0.2):
     emotion_map = {"happy": 0.9, "neutral": 0.6, "surprised": 0.7, "sad": 0.4, "angry": 0.3, "disgust": 0.2, "fear": 0.3, "no_face": 0.5, "unknown": 0.5}

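Since the commit already moves wav2vec_model to the GPU right after loading (previous hunk), the per-call wav2vec_model.to(device) inside analyze_audio_emotion is redundant, though harmless; what has to happen on every call is moving the freshly built input tensors onto the model's device. A slimmer sketch of that inner block:

    # Sketch: reuse the device the model already lives on; only the inputs
    # need to be moved per call.
    device = next(wav2vec_model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        logits = wav2vec_model(**inputs).logits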