Spaces:

Nick021402
/

Voice2PersonaAI

Running

App Files Files Community

Nick021402 commited on May 23

Commit

5a8efbe

verified ·

1 Parent(s): 441e8aa

Update app.py

Browse files

Files changed (1) hide show

app.py +52 -26

app.py CHANGED Viewed

@@ -9,7 +9,8 @@ from transformers import (
     AutoModelForAudioClassification,
     AutoFeatureExtractor,
     T5ForConditionalGeneration,
-    T5Tokenizer
 )
 import librosa
 import warnings
@@ -23,9 +24,16 @@ stt_tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
 stt_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
 stt_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
-# Emotion Recognition Model
-emotion_feature_extractor = AutoFeatureExtractor.from_pretrained("ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition")
-emotion_model = AutoModelForAudioClassification.from_pretrained("ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition")
 # Personality Generation Model
 personality_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
@@ -33,15 +41,15 @@ personality_model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-b
 print("Models loaded successfully!")
-# Emotion labels mapping
 EMOTION_LABELS = {
     0: "angry",
-    1: "disgust",
-    2: "fear",
-    3: "happy",
-    4: "neutral",
-    5: "sad",
-    6: "surprise"
 }
 def preprocess_audio(audio_path, target_sr=16000):
@@ -82,29 +90,47 @@ def transcribe_audio(audio_path):
         return f"Transcription error: {str(e)}"
 def detect_emotion(audio_path):
-    """Detect emotion from audio using specialized model"""
     try:
         audio, sr = preprocess_audio(audio_path)
         if audio is None:
             return "Error: Could not process audio file", 0.0
-        # Extract features for emotion model
-        inputs = emotion_feature_extractor(audio, sampling_rate=sr, return_tensors="pt", padding=True)
-        # Get emotion predictions
-        with torch.no_grad():
-            outputs = emotion_model(**inputs)
-            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
-        # Get the most likely emotion
-        emotion_id = torch.argmax(predictions, dim=-1).item()
-        confidence = torch.max(predictions).item()
-        emotion_label = EMOTION_LABELS.get(emotion_id, "unknown")
         return emotion_label, confidence
     except Exception as e:
-        return f"Emotion detection error: {str(e)}", 0.0
 def generate_personality(transcription, emotion, confidence):
     """Generate personality description using FLAN-T5"""

     AutoModelForAudioClassification,
     AutoFeatureExtractor,
     T5ForConditionalGeneration,
+    T5Tokenizer,
+    Wav2Vec2ForSequenceClassification
 )
 import librosa
 import warnings
 stt_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
 stt_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
+# Emotion Recognition Model - using a more reliable model
+try:
+    from transformers import Wav2Vec2ForSequenceClassification
+    emotion_feature_extractor = AutoFeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-er")
+    emotion_model = Wav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-er")
+except:
+    # Fallback to a simpler approach using audio features
+    emotion_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
+    emotion_model = None
+    print("Using fallback emotion detection method")
 # Personality Generation Model
 personality_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
 print("Models loaded successfully!")
+# Emotion labels mapping (updated for broader coverage)
 EMOTION_LABELS = {
     0: "angry",
+    1: "happy",
+    2: "sad",
+    3: "neutral",
+    4: "excited",
+    5: "calm",
+    6: "surprised"
 }
 def preprocess_audio(audio_path, target_sr=16000):
         return f"Transcription error: {str(e)}"
 def detect_emotion(audio_path):
+    """Detect emotion from audio using audio features analysis"""
     try:
         audio, sr = preprocess_audio(audio_path)
         if audio is None:
             return "Error: Could not process audio file", 0.0
+        if emotion_model is not None:
+            # Use the wav2vec2 emotion model if available
+            inputs = emotion_feature_extractor(audio, sampling_rate=sr, return_tensors="pt", padding=True)
+            with torch.no_grad():
+                outputs = emotion_model(**inputs)
+                predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
+            emotion_id = torch.argmax(predictions, dim=-1).item()
+            confidence = torch.max(predictions).item()
+            emotion_label = EMOTION_LABELS.get(emotion_id, "neutral")
+        else:
+            # Fallback: Simple audio feature-based emotion detection
+            # Analyze audio characteristics
+            rms_energy = np.sqrt(np.mean(audio**2))
+            zero_crossing_rate = np.mean(librosa.feature.zero_crossing_rate(audio)[0])
+            spectral_centroid = np.mean(librosa.feature.spectral_centroid(audio, sr=sr)[0])
+            # Simple heuristic-based emotion classification
+            if rms_energy > 0.02 and zero_crossing_rate > 0.1:
+                emotion_label = "excited"
+                confidence = 0.75
+            elif rms_energy < 0.005:
+                emotion_label = "calm"
+                confidence = 0.70
+            elif spectral_centroid > 2000:
+                emotion_label = "happy"
+                confidence = 0.65
+            else:
+                emotion_label = "neutral"
+                confidence = 0.60
         return emotion_label, confidence
     except Exception as e:
+        return "neutral", 0.50  # Default fallback
 def generate_personality(transcription, emotion, confidence):
     """Generate personality description using FLAN-T5"""