Update app.py to use GPU for faster processing
app.py CHANGED
@@ -106,6 +106,7 @@ class ChatGroq:
         except requests.exceptions.RequestException as e:
             logging.error(f"ChatGroq API error: {e}")
             return "Interviewer: Due to a system issue, let's move on to another question."
+
 groq_llm = ChatGroq(
     temperature=0.7,
     model_name="llama-3.3-70b-versatile",
@@ -122,26 +123,34 @@ if HF_TOKEN:
 else:
     raise EnvironmentError("Missing HF_TOKEN environment variable.")
 
-
+#Load mistral Model
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 import torch
+print(torch.cuda.is_available())
+
+MODEL_PATH = "mistralai/Mistral-7B-Instruct-v0.3"
+#MODEL_PATH = "tiiuae/falcon-rw-1b"
 
-
-
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.float16,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4"
+)
 
-
+mistral_tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH,use_auth_token=True)
 
-
+judge_llm = AutoModelForCausalLM.from_pretrained(
     MODEL_PATH,
-    torch_dtype=torch.
+    quantization_config=bnb_config,torch_dtype=torch.float16,
     device_map="auto",
-
+    use_auth_token=True
 )
 
-falcon_pipeline = pipeline(
+judge_pipeline = pipeline(
     "text-generation",
-    model=
-    tokenizer=
+    model=judge_llm,
+    tokenizer=mistral_tokenizer,
     max_new_tokens=128,
     temperature=0.3,
     top_p=0.9,
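For reference, a minimal self-contained sketch of the 4-bit loading pattern this hunk relies on. It assumes bitsandbytes and accelerate are installed, that the environment can access the gated Mistral repository, and that BitsAndBytesConfig is imported from transformers (it is not part of the import line shown in this hunk); variable names are illustrative, not necessarily the ones in app.py.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

MODEL_PATH = "mistralai/Mistral-7B-Instruct-v0.3"

# 4-bit NF4 quantization config; bitsandbytes 4-bit loading typically requires a CUDA GPU.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    quantization_config=bnb_config,
    device_map="auto",  # places layers on the available GPU(s)
)

judge_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=128,
)

Recent transformers releases also prefer token= over the older use_auth_token= argument used in the hunk above.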
@@ -149,9 +158,6 @@ falcon_pipeline = pipeline(
     repetition_penalty=1.1,
 )
 
-# Test it
-# result = falcon_pipeline("Explain LLMs:")
-# print(result[0]["generated_text"])
 
 # embedding model
 from sentence_transformers import SentenceTransformer
@@ -1455,6 +1461,10 @@ import whisper
 from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
 import librosa
 
+import torch
+print(torch.cuda.is_available())  # Tells you if GPU is available
+
+
 # Bark TTS
 print("Loading Bark model...")
 model_bark = BarkModel.from_pretrained("suno/bark")
@@ -1481,7 +1491,7 @@ def bark_tts(text):
 
 # Whisper STT
 print("Loading Whisper model...")
-whisper_model = whisper.load_model("base")
+whisper_model = whisper.load_model("base", device="cuda")
 print("Whisper model loaded")
 def whisper_stt(audio_path):
     if not audio_path or not os.path.exists(audio_path): return ""
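The new load call pins Whisper to CUDA, which raises an error on CPU-only machines. If the app should still start without a GPU, a device-aware variant avoids that hard failure; a minimal sketch, assuming the openai-whisper package:

import torch
import whisper

# Fall back to CPU when no CUDA device is present.
device = "cuda" if torch.cuda.is_available() else "cpu"
whisper_model = whisper.load_model("base", device=device)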
@@ -1543,6 +1553,7 @@ local_wav2vec_model_path = "HaniaRuby/speech-emotion-recognition-wav2vec2" # Loc
 print("Loading Wav2Vec processor and model...")
 wav2vec_processor = Wav2Vec2Processor.from_pretrained(local_wav2vec_model_path)
 wav2vec_model = Wav2Vec2ForSequenceClassification.from_pretrained(local_wav2vec_model_path)
+wav2vec_model = wav2vec_model.to("cuda" if torch.cuda.is_available() else "cpu")
 print("Wav2Vec model loaded")
 wav2vec_model.eval()
 voice_label_map = {
@@ -1555,14 +1566,23 @@ voice_label_map = {
 def analyze_audio_emotion(audio_path):
     print(f"Analyzing audio emotion for: {audio_path}")
     if not audio_path or not os.path.exists(audio_path): return "neutral"
+
     speech, sr = librosa.load(audio_path, sr=16000)
     inputs = wav2vec_processor(speech, sampling_rate=16000, return_tensors="pt")
+
+    # Move model and inputs to GPU
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    wav2vec_model.to(device)
+    inputs = {k: v.to(device) for k, v in inputs.items()}
+
     with torch.no_grad():
         logits = wav2vec_model(**inputs).logits
+
     probs = torch.nn.functional.softmax(logits, dim=-1)
     predicted_id = torch.argmax(probs, dim=-1).item()
     return voice_label_map.get(predicted_id, "neutral")
 
+
 # --- Effective confidence calculation
 def interpret_confidence(voice_label, face_label, answer_score_label, k=0.2):
     emotion_map = {"happy": 0.9, "neutral": 0.6, "surprised": 0.7, "sad": 0.4, "angry": 0.3, "disgust": 0.2, "fear": 0.3, "no_face": 0.5, "unknown": 0.5}
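The change above moves wav2vec_model to the GPU twice: once right after loading (new line 1556) and again on every call to analyze_audio_emotion. A sketch of resolving the device once at load time and reusing it per call, assuming the names defined in the hunks above (prints and extra checks omitted):

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
wav2vec_model = wav2vec_model.to(device)
wav2vec_model.eval()

def analyze_audio_emotion(audio_path):
    speech, sr = librosa.load(audio_path, sr=16000)
    inputs = wav2vec_processor(speech, sampling_rate=16000, return_tensors="pt")
    # Move the input tensors to the same device as the model.
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        logits = wav2vec_model(**inputs).logits
    probs = torch.nn.functional.softmax(logits, dim=-1)
    predicted_id = torch.argmax(probs, dim=-1).item()
    return voice_label_map.get(predicted_id, "neutral")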