Update app.py
app.py CHANGED
@@ -13,9 +13,6 @@ import time
 import numpy as np
 import librosa
 import subprocess
-import pyaudio
-import wave
-import io
 
 # Suppress warnings for a clean console
 logging.getLogger("torch").setLevel(logging.CRITICAL)
@@ -32,7 +29,7 @@ st.set_page_config(layout="wide", page_title="Advanced Voice Emotion Analyzer")
 
 # Interface design
 st.title("🎙️ Advanced Voice Emotion Analyzer")
-st.write("Analyze all emotions from audio
+st.write("Analyze all 27 emotions from uploaded audio with enhanced detection to avoid neutral defaults.")
 
 # Audio Preprocessing
 def make_audio_scarier(audio_path, output_path):
@@ -83,14 +80,27 @@ def perform_audio_emotion_detection(audio_path):
         audio_emotions = ["neutral", "happy", "sad", "angry", "fearful", "surprise", "disgust"]
         emotion_dict = {emotion: float(scores[i]) for i, emotion in enumerate(audio_emotions)}
         top_emotion = audio_emotions[np.argmax(scores)]
-        #
+        # Enhanced boosting based on audio features
         features = extract_audio_features(audio_path)
         if features.get("pitch_mean", 0) < 200 and features.get("energy_mean", 0) > 0.1 and features.get("zcr_mean", 0) > 0.1:
-            emotion_dict["fearful"] = min(1.0, emotion_dict.get("fearful", 0) + 0.
+            emotion_dict["fearful"] = min(1.0, emotion_dict.get("fearful", 0) + 0.4)  # Increased boost
             top_emotion = "fearful" if emotion_dict["fearful"] > emotion_dict[top_emotion] else top_emotion
-        elif features.get("energy_mean", 0) > 0.
-            emotion_dict["angry"] = min(1.0, emotion_dict.get("angry", 0) + 0.
+        elif features.get("energy_mean", 0) > 0.25:  # Stricter threshold
+            emotion_dict["angry"] = min(1.0, emotion_dict.get("angry", 0) + 0.35)
             top_emotion = "angry" if emotion_dict["angry"] > emotion_dict[top_emotion] else top_emotion
+        elif features.get("pitch_mean", 0) > 500 and features.get("energy_mean", 0) < 0.05:
+            emotion_dict["sad"] = min(1.0, emotion_dict.get("sad", 0) + 0.3)
+            top_emotion = "sad" if emotion_dict["sad"] > emotion_dict[top_emotion] else top_emotion
+        elif features.get("energy_mean", 0) > 0.15 and features.get("pitch_mean", 0) > 300:
+            emotion_dict["happy"] = min(1.0, emotion_dict.get("happy", 0) + 0.3)
+            top_emotion = "happy" if emotion_dict["happy"] > emotion_dict[top_emotion] else top_emotion
+        elif features.get("zcr_mean", 0) > 0.15 and features.get("energy_mean", 0) > 0.1:
+            emotion_dict["surprise"] = min(1.0, emotion_dict.get("surprise", 0) + 0.25)
+            top_emotion = "surprise" if emotion_dict["surprise"] > emotion_dict[top_emotion] else top_emotion
+        # Fallback to avoid neutral if score is low
+        if emotion_dict["neutral"] > 0.5 and max([v for k, v in emotion_dict.items() if k != "neutral"]) > 0.3:
+            emotion_dict["neutral"] = max(0.0, emotion_dict["neutral"] - 0.2)  # Reduce neutral weight
+            top_emotion = max(emotion_dict, key=emotion_dict.get)
         return emotion_dict, top_emotion
     except Exception as e:
         st.error(f"Audio emotion detection failed: {str(e)}")
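Because the added branches form an if/elif chain, at most one boost fires per clip, and the order (fearful, then angry, sad, happy, surprise) decides which rule wins when several thresholds are met. Below is a compact, standalone paraphrase of the same logic so the thresholds can be sanity-checked outside the app; the helper name rank_boost and the sample numbers are illustrative, not part of the commit.

    # Standalone paraphrase of the boost chain in this hunk; rank_boost,
    # the sample scores, and the feature values are hypothetical.
    def rank_boost(scores, features):
        scores = dict(scores)  # work on a copy
        pitch = features.get("pitch_mean", 0)
        energy = features.get("energy_mean", 0)
        zcr = features.get("zcr_mean", 0)

        if pitch < 200 and energy > 0.1 and zcr > 0.1:
            scores["fearful"] = min(1.0, scores.get("fearful", 0) + 0.4)
        elif energy > 0.25:
            scores["angry"] = min(1.0, scores.get("angry", 0) + 0.35)
        elif pitch > 500 and energy < 0.05:
            scores["sad"] = min(1.0, scores.get("sad", 0) + 0.3)
        elif energy > 0.15 and pitch > 300:
            scores["happy"] = min(1.0, scores.get("happy", 0) + 0.3)
        elif zcr > 0.15 and energy > 0.1:
            scores["surprise"] = min(1.0, scores.get("surprise", 0) + 0.25)

        # Damp a dominant neutral when a competing emotion is already strong.
        if scores.get("neutral", 0) > 0.5 and max(v for k, v in scores.items() if k != "neutral") > 0.3:
            scores["neutral"] = max(0.0, scores["neutral"] - 0.2)

        return scores, max(scores, key=scores.get)

    # Low pitch with noticeable energy and ZCR: the fear branch fires.
    print(rank_boost({"neutral": 0.6, "fearful": 0.35},
                     {"pitch_mean": 150, "energy_mean": 0.2, "zcr_mean": 0.12}))

For this sample clip the fear branch applies (pitch 150 < 200, energy 0.2 > 0.1, ZCR 0.12 > 0.1): fearful rises to 0.75, neutral is damped to 0.4, and fearful becomes the top emotion instead of neutral.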
@@ -114,6 +124,10 @@ def perform_text_emotion_detection(text):
                     "pride", "realization", "relief", "remorse", "sadness", "surprise", "neutral"]
         emotions_dict = {result['label']: result['score'] for result in results if result['label'] in emotions}
         top_emotion = max(emotions_dict, key=emotions_dict.get)
+        # Reduce neutral influence if other emotions are strong
+        if emotions_dict.get("neutral", 0) > 0.5 and max([v for k, v in emotions_dict.items() if k != "neutral"]) > 0.4:
+            emotions_dict["neutral"] = max(0.0, emotions_dict["neutral"] - 0.15)
+            top_emotion = max(emotions_dict, key=emotions_dict.get)
         return emotions_dict, top_emotion
     except Exception as e:
         st.error(f"Text emotion detection failed: {str(e)}")
@@ -169,50 +183,12 @@ def transcribe_audio(audio_path):
         st.error(f"Transcription failed: {str(e)}")
         return ""
 
-# Python Audio Recording
-def record_audio():
-    CHUNK = 1024
-    FORMAT = pyaudio.paInt16
-    CHANNELS = 1
-    RATE = 16000
-    RECORD_SECONDS = st.slider("Recording duration (seconds)", 1, 30, 5)
-
-    p = pyaudio.PyAudio()
-    stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)
-
-    if st.button("Start Recording"):
-        st.write("Recording...")
-        frames = []
-        for _ in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
-            data = stream.read(CHUNK)
-            frames.append(data)
-        st.write("Recording finished.")
-
-        stream.stop_stream()
-        stream.close()
-        p.terminate()
-
-        temp_file_path = os.path.join(tempfile.gettempdir(), f"recorded_audio_{int(time.time())}.wav")
-        wf = wave.open(temp_file_path, 'wb')
-        wf.setnchannels(CHANNELS)
-        wf.setsampwidth(p.get_sample_size(FORMAT))
-        wf.setframerate(RATE)
-        wf.writeframes(b''.join(frames))
-        wf.close()
-
-        return temp_file_path
-    return None
-
 # Process Audio Files
 def process_audio_file(audio_data):
     temp_dir = tempfile.gettempdir()
     temp_file_path = os.path.join(temp_dir, f"audio_{int(time.time())}.wav")
     with open(temp_file_path, "wb") as f:
-
-            with open(audio_data, "rb") as f_audio:
-                f.write(f_audio.read())
-        else:
-            f.write(audio_data.getvalue())
+        f.write(audio_data.getvalue())
     if not validate_audio(temp_file_path):
         return None
     return temp_file_path
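With the PyAudio recording path removed, process_audio_file no longer needs to branch on whether it received a filesystem path or an in-memory upload: it simply calls getvalue(), which Streamlit's UploadedFile objects provide. A rough way to exercise the simplified function outside the Streamlit UI, assuming the module is importable as app and using io.BytesIO to stand in for an upload (the file name is hypothetical):

    import io
    from app import process_audio_file  # assumption: importing app runs the Streamlit script in bare mode

    with open("sample.wav", "rb") as f:          # any local WAV file
        fake_upload = io.BytesIO(f.read())       # io.BytesIO also exposes getvalue()

    temp_path = process_audio_file(fake_upload)  # writes a temp .wav, then validates it
    print(temp_path)                             # None if validate_audio() rejected the file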
@@ -230,6 +206,7 @@ def display_analysis_results(audio_path):
     audio_emotions, audio_top_emotion = perform_audio_emotion_detection(processed_audio_path)
     st.subheader("Audio-Based Emotion")
     st.write(f"**Dominant Emotion:** {audio_top_emotion} (Score: {audio_emotions.get(audio_top_emotion, 0):.3f})")
+    st.write("Audio Emotions:", audio_emotions)  # Debug output
 
     # Transcription and text emotion detection
     transcribed_text = transcribe_audio(processed_audio_path)
@@ -238,6 +215,7 @@ def display_analysis_results(audio_path):
     if transcribed_text:
         text_emotions, text_top_emotion = perform_text_emotion_detection(transcribed_text)
         st.write(f"**Text-Based Dominant Emotion:** {text_top_emotion} (Score: {text_emotions.get(text_top_emotion, 0):.3f})")
+        st.write("Text Emotions:", text_emotions)  # Debug output
 
         # Combine emotions (prioritize audio, map to 27 emotions)
         emotion_map = {
@@ -256,7 +234,12 @@ def display_analysis_results(audio_path):
         for text_emotion, score in text_emotions.items():
             combined_emotions[text_emotion] = combined_emotions.get(text_emotion, 0) + score * 0.3
 
+        # Avoid neutral if other emotions are competitive
         top_emotion = max(combined_emotions, key=combined_emotions.get)
+        if combined_emotions["neutral"] > 0.5 and max([v for k, v in combined_emotions.items() if k != "neutral"]) > 0.4:
+            combined_emotions["neutral"] = max(0.0, combined_emotions["neutral"] - 0.25)  # Stronger reduction
+            top_emotion = max(combined_emotions, key=combined_emotions.get)
+
         sentiment = "POSITIVE" if top_emotion in ["admiration", "amusement", "approval", "caring", "desire", "excitement",
                                                   "gratitude", "joy", "love", "optimism", "pride", "relief"] else "NEGATIVE" if top_emotion in ["anger", "annoyance", "disappointment", "disapproval", "disgust", "embarrassment", "fear", "grief", "nervousness", "remorse", "sadness"] else "NEUTRAL"
 
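The neutral damping on the combined scores is easiest to see with numbers: when neutral leads but another emotion clears the 0.4 bar, the 0.25 reduction is enough to flip the ranking. A tiny worked example with hypothetical values:

    combined = {"neutral": 0.55, "joy": 0.45}
    if combined["neutral"] > 0.5 and max(v for k, v in combined.items() if k != "neutral") > 0.4:
        combined["neutral"] = max(0.0, combined["neutral"] - 0.25)   # 0.55 -> 0.30
    print(max(combined, key=combined.get))                           # "joy" now wins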
@@ -287,7 +270,7 @@ def display_analysis_results(audio_path):
     st.write(f"**Audio Features:** {extract_audio_features(processed_audio_path)}")
     st.write("""
     **How it works:**
-    - Audio Emotion: Wav2Vec2 detects 7 emotions
+    - Audio Emotion: Wav2Vec2 detects 7 emotions with feature-based boosts.
     - Transcription: Whisper converts audio to text.
     - Text Emotion: RoBERTa refines 27 emotions from text.
     - Sarcasm: Analyzes text for irony.
@@ -301,25 +284,13 @@ def display_analysis_results(audio_path):
 
 # Main App Logic
 def main():
-
-
-
-
-        audio_file = st.file_uploader("Upload audio (wav, mp3, ogg)", type=["wav", "mp3", "ogg"])
-        if audio_file:
-            temp_audio_path = process_audio_file(audio_file)
-            if temp_audio_path:
-                if st.button("Analyze Upload"):
-                    with st.spinner("Analyzing..."):
-                        display_analysis_results(temp_audio_path)
-
-    with tab2:
-        st.header("Record Your Voice")
-        st.write("Record audio to analyze emotions in real-time.")
-        temp_audio_path = record_audio()
+    st.header("Upload Audio File")
+    audio_file = st.file_uploader("Upload audio (wav, mp3, ogg)", type=["wav", "mp3", "ogg"])
+    if audio_file:
+        temp_audio_path = process_audio_file(audio_file)
         if temp_audio_path:
-            if st.button("Analyze
-                with st.spinner("
+            if st.button("Analyze Audio"):
+                with st.spinner("Analyzing..."):
                     display_analysis_results(temp_audio_path)
 
     st.sidebar.header("About")
@@ -329,6 +300,7 @@ def main():
     - Text: SamLowe/roberta-base-go_emotions (27 emotions)
     - Sarcasm: cardiffnlp/twitter-roberta-base-irony
     - Speech: OpenAI Whisper (large-v3)
+    **Note:** Recording is not supported on Hugging Face Spaces; use uploaded files.
     """)
 
 if __name__ == "__main__":