PMS61 committed
Commit 3a70449 · 1 Parent(s): 8ecb9cb
Files changed (7)
  1. Dockerfile +7 -4
  2. app.py +13 -10
  3. requirements.txt +1 -1
  4. utils/expressions.py +4 -21
  5. utils/models.py +35 -0
  6. utils/transcription.py +17 -54
  7. utils/vocals.py +3 -29
Dockerfile CHANGED
@@ -4,6 +4,9 @@ FROM python:3.11-slim
 # Set the working directory in the container
 WORKDIR /code
 
+# Set the PYTHONPATH environment variable
+ENV PYTHONPATH="/code"
+
 # Install system dependencies required by your project (ffmpeg, opencv)
 RUN apt-get update && apt-get install -y --no-install-recommends \
     build-essential \
@@ -16,9 +19,9 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
 COPY ./requirements.txt /code/requirements.txt
 
 # Install any needed packages specified in requirements.txt
-# We add gunicorn here for a production-ready web server
+# We add gunicorn and gevent here for a production-ready web server
 RUN pip install --no-cache-dir --upgrade pip
-RUN pip install --no-cache-dir -r requirements.txt gunicorn
+RUN pip install --no-cache-dir -r requirements.txt gunicorn gevent
 
 # Copy the rest of the application's code to the working directory
 COPY . /code/
@@ -26,5 +29,5 @@ COPY . /code/
 # Expose the port the app runs on (Hugging Face Spaces default is 7860)
 EXPOSE 7860
 
-# Command to run the application using Gunicorn
-CMD ["gunicorn", "--bind", "0.0.0.0:7860", "--timeout", "600", "app:app"]
+# Command to run the application using Gunicorn with gevent workers
+CMD ["gunicorn", "--bind", "0.0.0.0:7860", "--workers", "4", "--worker-class", "gevent", "--timeout", "600", "app:app"]
app.py CHANGED
@@ -10,17 +10,20 @@ from utils.transcription import speech_to_text_long
 from utils.vocals import predict_emotion
 from utils.vocabulary import evaluate_vocabulary
 from groq import Groq
-from transformers import AutoModelForAudioClassification, AutoFeatureExtractor
 import pandas as pd
 from bson import ObjectId
 import json
 from dotenv import load_dotenv
 from datetime import datetime
+from utils.models import load_models
 
 load_dotenv()
 app = Flask(__name__)
 CORS(app)
 
+# Load models on startup
+models = load_models()
+
 # MongoDB connection
 client = pymongo.MongoClient("mongodb+srv://pmsankheb23:[email protected]/")
 db = client["Eloquence"]
@@ -39,11 +42,6 @@ app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
 if not os.path.exists(UPLOAD_FOLDER):
     os.makedirs(UPLOAD_FOLDER)
 
-model_id = "firdhokk/speech-emotion-recognition-with-openai-whisper-large-v3"
-model = AutoModelForAudioClassification.from_pretrained(model_id)
-feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, do_normalize=True)
-id2label = model.config.id2label
-
 def allowed_file(filename):
     return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
 
@@ -96,10 +94,15 @@ def upload_file():
 
     emotion_analysis = pd.DataFrame()
     if mode == "video":
-        emotion_analysis = analyze_video_emotions(file_path)
-
-    transcription = speech_to_text_long(audio_path)
-    audio_emotion = predict_emotion(audio_path, model, feature_extractor, id2label)
+        emotion_analysis = analyze_video_emotions(file_path, models["fer"])
+
+    transcription = speech_to_text_long(audio_path, models["whisper"])
+    audio_emotion = predict_emotion(
+        audio_path,
+        models["emotion_model"],
+        models["emotion_feature_extractor"],
+        models["emotion_id2label"],
+    )
     vocabulary_report = evaluate_vocabulary(transcription, context)
     scores = generate_scores(transcription, audio_emotion, emotion_analysis)
     speech_report = generate_speech_report(transcription, context, audio_emotion)
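Stripped of the upload and MongoDB plumbing, the pattern this hunk introduces is: build the models once at import time, then hand the relevant entries to the per-request helpers. A minimal sketch of that shape; the /analyze route and JSON field are hypothetical, and only load_models, speech_to_text_long and predict_emotion come from this repo.

```python
# Minimal sketch of the "load once at startup, pass per request" pattern.
# Route name and payload field are illustrative, not the app's real API.
from flask import Flask, request, jsonify

from utils.models import load_models
from utils.transcription import speech_to_text_long
from utils.vocals import predict_emotion

app = Flask(__name__)
models = load_models()  # heavy model loading happens once per worker

@app.route("/analyze", methods=["POST"])  # hypothetical route
def analyze():
    audio_path = request.json["audio_path"]  # hypothetical field
    transcription = speech_to_text_long(audio_path, models["whisper"])
    emotions = predict_emotion(
        audio_path,
        models["emotion_model"],
        models["emotion_feature_extractor"],
        models["emotion_id2label"],
    )
    return jsonify({"transcription": transcription, "emotions": emotions})
```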
requirements.txt CHANGED
@@ -1,4 +1,3 @@
-absl-py
 annotated-types
 anyio
 audeer
@@ -34,6 +33,7 @@ Flask-Cors
 fonttools
 fsspec
 future
+gevent
 groq
 h11
 h5py
utils/expressions.py CHANGED
@@ -1,44 +1,27 @@
-from fer import Video
-from fer import FER
+from fer import Video
 import pandas as pd
 
-def analyze_video_emotions(video_file_path):
+def analyze_video_emotions(video_file_path, face_detector):
     """
     Analyzes the emotions in a given video file and returns a dataframe of scores.
-
-    Args:
-        video_file_path (str): Path to the video file to be analyzed.
-
-    Returns:
-        pd.DataFrame: DataFrame containing the emotion scores.
     """
-    # Initialize the face detector
-    face_detector = FER(mtcnn=True)
-
-    # Input the video for processing
     input_video = Video(video_file_path)
-
-    # Analyze the video
     processing_data = input_video.analyze(face_detector, display=False)
 
-    # Check if any faces were detected
     if not processing_data:
         print("No faces detected in the video.")
-        return pd.DataFrame()  # Return an empty DataFrame if no faces are detected
+        return pd.DataFrame()
 
-    # Convert the results to a DataFrame
     vid_df = input_video.to_pandas(processing_data)
     vid_df = input_video.get_first_face(vid_df)
    vid_df = input_video.get_emotions(vid_df)
 
-    # Calculate the sum of each emotion
     emotions = ['angry', 'disgust', 'fear', 'happy', 'sad', 'surprise', 'neutral']
     emotions_values = [sum(vid_df[emotion]) for emotion in emotions]
 
-    # Create a DataFrame for comparison
     score_comparisons = pd.DataFrame({
         'Human Emotions': [emotion.capitalize() for emotion in emotions],
         'Emotion Value from the Video': emotions_values
     })
 
-    return score_comparisons
+    return score_comparisons
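With the detector injected instead of created inside the function, callers own the FER instance. A minimal usage sketch, assuming a placeholder sample.mp4; FER(mtcnn=True) mirrors the detector that the new utils/models.py (next file) constructs.

```python
# Sketch: calling analyze_video_emotions() with an externally created detector.
# "sample.mp4" is a placeholder path, not a file in the repo.
from fer import FER

from utils.expressions import analyze_video_emotions

detector = FER(mtcnn=True)  # same settings load_models() uses for its "fer" entry
scores = analyze_video_emotions("sample.mp4", detector)
print(scores)  # empty DataFrame when no faces are detected
```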
utils/models.py ADDED
@@ -0,0 +1,35 @@
+import torch
+from transformers import WhisperProcessor, WhisperForConditionalGeneration, AutoModelForAudioClassification, AutoFeatureExtractor
+from fer import FER
+
+def load_models():
+    """
+    Loads all the machine learning models and returns them as a dictionary.
+    """
+    # Whisper model for transcription
+    whisper_model_name = "openai/whisper-base"
+    whisper_processor = WhisperProcessor.from_pretrained(whisper_model_name)
+    whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_name)
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    whisper_model = whisper_model.to(device)
+
+    # Speech emotion recognition model
+    emotion_model_id = "firdhokk/speech-emotion-recognition-with-openai-whisper-large-v3"
+    emotion_model = AutoModelForAudioClassification.from_pretrained(emotion_model_id)
+    emotion_feature_extractor = AutoFeatureExtractor.from_pretrained(emotion_model_id, do_normalize=True)
+    emotion_id2label = emotion_model.config.id2label
+
+    # Facial emotion recognition model
+    fer_detector = FER(mtcnn=True)
+
+    return {
+        "whisper": {
+            "processor": whisper_processor,
+            "model": whisper_model,
+            "device": device,
+        },
+        "emotion_model": emotion_model,
+        "emotion_feature_extractor": emotion_feature_extractor,
+        "emotion_id2label": emotion_id2label,
+        "fer": fer_detector,
+    }
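Several call sites now index into this dictionary, so its shape is worth seeing in one place. A small sanity-check sketch; the commented output simply restates the keys defined above.

```python
# Sketch: inspecting the dictionary returned by load_models().
from utils.models import load_models

models = load_models()
print(sorted(models))               # ['emotion_feature_extractor', 'emotion_id2label', 'emotion_model', 'fer', 'whisper']
print(sorted(models["whisper"]))    # ['device', 'model', 'processor']
print(models["whisper"]["device"])  # 'cuda' when a GPU is visible, otherwise 'cpu'
```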
utils/transcription.py CHANGED
@@ -1,25 +1,11 @@
 import torch
-from transformers import WhisperProcessor, WhisperForConditionalGeneration
 from pydub import AudioSegment
 import soundfile as sf
 import os
 
-model_name = "openai/whisper-base"
-processor = WhisperProcessor.from_pretrained(model_name)
-model = WhisperForConditionalGeneration.from_pretrained(model_name)
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model = model.to(device)
-
 def preprocess_audio(input_audio_path, output_audio_path):
     """
     Converts audio to 16kHz WAV format.
-
-    Args:
-        input_audio_path (str): Path to the input audio file.
-        output_audio_path (str): Path to save the processed audio file.
-
-    Returns:
-        str: Path to the processed audio file.
     """
     audio = AudioSegment.from_file(input_audio_path)
     audio = audio.set_frame_rate(16000).set_channels(1)
@@ -29,59 +15,36 @@ def preprocess_audio(input_audio_path, output_audio_path):
 def split_audio(audio_path, chunk_length_ms=30000):
     """
     Splits audio into chunks of specified length.
-
-    Args:
-        audio_path (str): Path to the audio file.
-        chunk_length_ms (int): Length of each chunk in milliseconds.
-
-    Returns:
-        list: List of audio chunks.
     """
     audio = AudioSegment.from_file(audio_path)
-    chunks = [audio[i : i + chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]
-    return chunks
+    return [audio[i : i + chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]
 
-def transcribe_chunk(audio_chunk, chunk_index):
+def transcribe_chunk(audio_chunk, chunk_index, whisper_models):
     """
-    Transcribes a single audio chunk.
-
-    Args:
-        audio_chunk (AudioSegment): The audio chunk to transcribe.
-        chunk_index (int): Index of the chunk.
-
-    Returns:
-        str: Transcription of the chunk.
+    Transcribes a single audio chunk using the pre-loaded Whisper model.
     """
     temp_path = f"temp_chunk_{chunk_index}.wav"
     audio_chunk.export(temp_path, format="wav")
-    audio, sampling_rate = sf.read(temp_path)
-    inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
-    input_features = inputs.input_features.to(device)
-    predicted_ids = model.generate(input_features)
-    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
-    os.remove(temp_path)  # Clean up temporary file
+
+    audio, _ = sf.read(temp_path)
+    inputs = whisper_models["processor"](audio, sampling_rate=16000, return_tensors="pt")
+    input_features = inputs.input_features.to(whisper_models["device"])
+
+    predicted_ids = whisper_models["model"].generate(input_features)
+    transcription = whisper_models["processor"].batch_decode(predicted_ids, skip_special_tokens=True)[0]
+
+    os.remove(temp_path)
     return transcription
 
-def speech_to_text_long(audio_path):
+def speech_to_text_long(audio_path, whisper_models):
     """
     Transcribes a long audio file by splitting it into chunks.
-
-    Args:
-        audio_path (str): Path to the audio file.
-
-    Returns:
-        str: Full transcription of the audio.
     """
     processed_audio_path = "processed_audio.wav"
     preprocess_audio(audio_path, processed_audio_path)
 
-    # Split audio into chunks
-    chunks = split_audio(processed_audio_path, chunk_length_ms=30000)  # 30 seconds per chunk
-    transcriptions = []
-
-    for idx, chunk in enumerate(chunks):
-        print(f"Transcribing chunk {idx + 1} of {len(chunks)}...")
-        transcription = transcribe_chunk(chunk, idx)
-        transcriptions.append(transcription)
+    chunks = split_audio(processed_audio_path)
+    transcriptions = [transcribe_chunk(chunk, idx, whisper_models) for idx, chunk in enumerate(chunks)]
 
-    return " ".join(transcriptions)
+    os.remove(processed_audio_path)
+    return " ".join(transcriptions)
utils/vocals.py CHANGED
@@ -1,25 +1,10 @@
-from transformers import AutoModelForAudioClassification, AutoFeatureExtractor
 import librosa
 import torch
 import numpy as np
 
-model_id = "firdhokk/speech-emotion-recognition-with-openai-whisper-large-v3"
-model = AutoModelForAudioClassification.from_pretrained(model_id)
-feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, do_normalize=True)
-id2label = model.config.id2label
-
 def preprocess_audio(audio_array, feature_extractor, sampling_rate, max_length=3000):
     """
     Preprocesses audio for emotion prediction.
-
-    Args:
-        audio_array (np.array): The audio data as a numpy array.
-        feature_extractor: The feature extractor for the model.
-        sampling_rate (int): The sampling rate of the audio.
-        max_length (int): Maximum length of the audio features.
-
-    Returns:
-        dict: Preprocessed inputs for the model.
     """
     inputs = feature_extractor(
         audio_array,
@@ -41,22 +26,11 @@ def preprocess_audio(audio_array, feature_extractor, sampling_rate, max_length=3000):
 def predict_emotion(audio_path, model, feature_extractor, id2label, sampling_rate=16000, chunk_duration=8.0):
     """
     Predicts emotions from an audio file.
-
-    Args:
-        audio_path (str): Path to the audio file.
-        model: The emotion prediction model.
-        feature_extractor: The feature extractor for the model.
-        id2label (dict): Mapping from label IDs to emotion names.
-        sampling_rate (int): The sampling rate of the audio.
-        chunk_duration (float): Duration of each chunk in seconds.
-
-    Returns:
-        list: List of dictionaries containing emotion predictions for each chunk.
     """
     audio_array, _ = librosa.load(audio_path, sr=sampling_rate)
     chunk_length = int(sampling_rate * chunk_duration)
     num_chunks = len(audio_array) // chunk_length + int(len(audio_array) % chunk_length > 0)
-
+
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     model = model.to(device)
 
@@ -78,7 +52,7 @@ def predict_emotion(audio_path, model, feature_extractor, id2label, sampling_rate=16000, chunk_duration=8.0):
         logits = outputs.logits
         predicted_id = torch.argmax(logits, dim=-1).item()
         predicted_label = id2label[predicted_id]
-
+
         results.append({"chunk": i + 1, "start_time": start_time, "end_time": end_time, "emotion": predicted_label})
 
-    return results
+    return results
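The same injection style applies here; a usage sketch reusing load_models(), with speech.wav as a placeholder input.

```python
# Sketch: chunk-wise emotion prediction with the models loaded at startup.
# "speech.wav" is a placeholder path.
from utils.models import load_models
from utils.vocals import predict_emotion

models = load_models()
for entry in predict_emotion(
    "speech.wav",
    models["emotion_model"],
    models["emotion_feature_extractor"],
    models["emotion_id2label"],
):
    print(entry["chunk"], entry["start_time"], entry["end_time"], entry["emotion"])
```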