PMS61 committed
Commit 3a70449 · 1 Parent(s): 8ecb9cb
Files changed (7)
  1. Dockerfile +7 -4
  2. app.py +13 -10
  3. requirements.txt +1 -1
  4. utils/expressions.py +4 -21
  5. utils/models.py +35 -0
  6. utils/transcription.py +17 -54
  7. utils/vocals.py +3 -29
Dockerfile CHANGED
@@ -4,6 +4,9 @@ FROM python:3.11-slim
 # Set the working directory in the container
 WORKDIR /code
 
+# Set the PYTHONPATH environment variable
+ENV PYTHONPATH="/code"
+
 # Install system dependencies required by your project (ffmpeg, opencv)
 RUN apt-get update && apt-get install -y --no-install-recommends \
     build-essential \
@@ -16,9 +19,9 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
 COPY ./requirements.txt /code/requirements.txt
 
 # Install any needed packages specified in requirements.txt
-# We add gunicorn here for a production-ready web server
+# We add gunicorn and gevent here for a production-ready web server
 RUN pip install --no-cache-dir --upgrade pip
-RUN pip install --no-cache-dir -r requirements.txt gunicorn
+RUN pip install --no-cache-dir -r requirements.txt gunicorn gevent
 
 # Copy the rest of the application's code to the working directory
 COPY . /code/
@@ -26,5 +29,5 @@ COPY . /code/
 # Expose the port the app runs on (Hugging Face Spaces default is 7860)
 EXPOSE 7860
 
-# Command to run the application using Gunicorn
-CMD ["gunicorn", "--bind", "0.0.0.0:7860", "--timeout", "600", "app:app"]
+# Command to run the application using Gunicorn with gevent workers
+CMD ["gunicorn", "--bind", "0.0.0.0:7860", "--workers", "4", "--worker-class", "gevent", "--timeout", "600", "app:app"]
app.py CHANGED
@@ -10,17 +10,20 @@ from utils.transcription import speech_to_text_long
 from utils.vocals import predict_emotion
 from utils.vocabulary import evaluate_vocabulary
 from groq import Groq
-from transformers import AutoModelForAudioClassification, AutoFeatureExtractor
 import pandas as pd
 from bson import ObjectId
 import json
 from dotenv import load_dotenv
 from datetime import datetime
+from utils.models import load_models
 
 load_dotenv()
 app = Flask(__name__)
 CORS(app)
 
+# Load models on startup
+models = load_models()
+
 # MongoDB connection
 client = pymongo.MongoClient("mongodb+srv://pmsankheb23:[email protected]/")
 db = client["Eloquence"]
@@ -39,11 +42,6 @@ app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
 if not os.path.exists(UPLOAD_FOLDER):
     os.makedirs(UPLOAD_FOLDER)
 
-model_id = "firdhokk/speech-emotion-recognition-with-openai-whisper-large-v3"
-model = AutoModelForAudioClassification.from_pretrained(model_id)
-feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, do_normalize=True)
-id2label = model.config.id2label
-
 def allowed_file(filename):
     return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
 
@@ -96,10 +94,15 @@ def upload_file():
 
     emotion_analysis = pd.DataFrame()
     if mode == "video":
-        emotion_analysis = analyze_video_emotions(file_path)
-
-    transcription = speech_to_text_long(audio_path)
-    audio_emotion = predict_emotion(audio_path, model, feature_extractor, id2label)
+        emotion_analysis = analyze_video_emotions(file_path, models["fer"])
+
+    transcription = speech_to_text_long(audio_path, models["whisper"])
+    audio_emotion = predict_emotion(
+        audio_path,
+        models["emotion_model"],
+        models["emotion_feature_extractor"],
+        models["emotion_id2label"],
+    )
     vocabulary_report = evaluate_vocabulary(transcription, context)
     scores = generate_scores(transcription, audio_emotion, emotion_analysis)
     speech_report = generate_speech_report(transcription, context, audio_emotion)
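Stripped of the upload and MongoDB plumbing, the pattern this hunk introduces is: build the models once at import time, then hand the relevant entries to the per-request helpers. A minimal sketch of that shape; the /analyze route and JSON field are hypothetical, and only load_models, speech_to_text_long and predict_emotion come from this repo.

```python
# Minimal sketch of the "load once at startup, pass per request" pattern.
# Route name and payload field are illustrative, not the app's real API.
from flask import Flask, request, jsonify

from utils.models import load_models
from utils.transcription import speech_to_text_long
from utils.vocals import predict_emotion

app = Flask(__name__)
models = load_models()  # heavy model loading happens once per worker

@app.route("/analyze", methods=["POST"])  # hypothetical route
def analyze():
    audio_path = request.json["audio_path"]  # hypothetical field
    transcription = speech_to_text_long(audio_path, models["whisper"])
    emotions = predict_emotion(
        audio_path,
        models["emotion_model"],
        models["emotion_feature_extractor"],
        models["emotion_id2label"],
    )
    return jsonify({"transcription": transcription, "emotions": emotions})
```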
requirements.txt CHANGED
@@ -1,4 +1,3 @@
-absl-py
 annotated-types
 anyio
 audeer
@@ -34,6 +33,7 @@ Flask-Cors
 fonttools
 fsspec
 future
+gevent
 groq
 h11
 h5py
utils/expressions.py CHANGED
@@ -1,44 +1,27 @@
-from fer import Video
-from fer import FER
+from fer import Video
 import pandas as pd
 
-def analyze_video_emotions(video_file_path):
+def analyze_video_emotions(video_file_path, face_detector):
     """
     Analyzes the emotions in a given video file and returns a dataframe of scores.
-
-    Args:
-        video_file_path (str): Path to the video file to be analyzed.
-
-    Returns:
-        pd.DataFrame: DataFrame containing the emotion scores.
     """
-    # Initialize the face detector
-    face_detector = FER(mtcnn=True)
-
-    # Input the video for processing
     input_video = Video(video_file_path)
-
-    # Analyze the video
     processing_data = input_video.analyze(face_detector, display=False)
 
-    # Check if any faces were detected
     if not processing_data:
         print("No faces detected in the video.")
-        return pd.DataFrame()  # Return an empty DataFrame if no faces are detected
+        return pd.DataFrame()
 
-    # Convert the results to a DataFrame
     vid_df = input_video.to_pandas(processing_data)
     vid_df = input_video.get_first_face(vid_df)
    vid_df = input_video.get_emotions(vid_df)
 
-    # Calculate the sum of each emotion
     emotions = ['angry', 'disgust', 'fear', 'happy', 'sad', 'surprise', 'neutral']
     emotions_values = [sum(vid_df[emotion]) for emotion in emotions]
 
-    # Create a DataFrame for comparison
     score_comparisons = pd.DataFrame({
         'Human Emotions': [emotion.capitalize() for emotion in emotions],
         'Emotion Value from the Video': emotions_values
     })
 
-    return score_comparisons
+    return score_comparisons
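With the detector injected instead of created inside the function, callers own the FER instance. A minimal usage sketch, assuming a placeholder sample.mp4; FER(mtcnn=True) mirrors the detector that the new utils/models.py (next file) constructs.

```python
# Sketch: calling analyze_video_emotions() with an externally created detector.
# "sample.mp4" is a placeholder path, not a file in the repo.
from fer import FER

from utils.expressions import analyze_video_emotions

detector = FER(mtcnn=True)  # same settings load_models() uses for its "fer" entry
scores = analyze_video_emotions("sample.mp4", detector)
print(scores)  # empty DataFrame when no faces are detected
```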
utils/models.py ADDED
@@ -0,0 +1,35 @@
+import torch
+from transformers import WhisperProcessor, WhisperForConditionalGeneration, AutoModelForAudioClassification, AutoFeatureExtractor
+from fer import FER
+
+def load_models():
+    """
+    Loads all the machine learning models and returns them as a dictionary.
+    """
+    # Whisper model for transcription
+    whisper_model_name = "openai/whisper-base"
+    whisper_processor = WhisperProcessor.from_pretrained(whisper_model_name)
+    whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_name)
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    whisper_model = whisper_model.to(device)
+
+    # Speech emotion recognition model
+    emotion_model_id = "firdhokk/speech-emotion-recognition-with-openai-whisper-large-v3"
+    emotion_model = AutoModelForAudioClassification.from_pretrained(emotion_model_id)
+    emotion_feature_extractor = AutoFeatureExtractor.from_pretrained(emotion_model_id, do_normalize=True)
+    emotion_id2label = emotion_model.config.id2label
+
+    # Facial emotion recognition model
+    fer_detector = FER(mtcnn=True)
+
+    return {
+        "whisper": {
+            "processor": whisper_processor,
+            "model": whisper_model,
+            "device": device,
+        },
+        "emotion_model": emotion_model,
+        "emotion_feature_extractor": emotion_feature_extractor,
+        "emotion_id2label": emotion_id2label,
+        "fer": fer_detector,
+    }
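Several call sites now index into this dictionary, so its shape is worth seeing in one place. A small sanity-check sketch; the commented output simply restates the keys defined above.

```python
# Sketch: inspecting the dictionary returned by load_models().
from utils.models import load_models

models = load_models()
print(sorted(models))               # ['emotion_feature_extractor', 'emotion_id2label', 'emotion_model', 'fer', 'whisper']
print(sorted(models["whisper"]))    # ['device', 'model', 'processor']
print(models["whisper"]["device"])  # 'cuda' when a GPU is visible, otherwise 'cpu'
```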
utils/transcription.py CHANGED
@@ -1,25 +1,11 @@
 import torch
-from transformers import WhisperProcessor, WhisperForConditionalGeneration
 from pydub import AudioSegment
 import soundfile as sf
 import os
 
-model_name = "openai/whisper-base"
-processor = WhisperProcessor.from_pretrained(model_name)
-model = WhisperForConditionalGeneration.from_pretrained(model_name)
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model = model.to(device)
-
 def preprocess_audio(input_audio_path, output_audio_path):
     """
     Converts audio to 16kHz WAV format.
-
-    Args:
-        input_audio_path (str): Path to the input audio file.
-        output_audio_path (str): Path to save the processed audio file.
-
-    Returns:
-        str: Path to the processed audio file.
     """
     audio = AudioSegment.from_file(input_audio_path)
     audio = audio.set_frame_rate(16000).set_channels(1)
@@ -29,59 +15,36 @@ def preprocess_audio(input_audio_path, output_audio_path):
 def split_audio(audio_path, chunk_length_ms=30000):
     """
     Splits audio into chunks of specified length.
-
-    Args:
-        audio_path (str): Path to the audio file.
-        chunk_length_ms (int): Length of each chunk in milliseconds.
-
-    Returns:
-        list: List of audio chunks.
     """
     audio = AudioSegment.from_file(audio_path)
-    chunks = [audio[i : i + chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]
-    return chunks
+    return [audio[i : i + chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]
 
-def transcribe_chunk(audio_chunk, chunk_index):
+def transcribe_chunk(audio_chunk, chunk_index, whisper_models):
     """
-    Transcribes a single audio chunk.
-
-    Args:
-        audio_chunk (AudioSegment): The audio chunk to transcribe.
-        chunk_index (int): Index of the chunk.
-
-    Returns:
-        str: Transcription of the chunk.
+    Transcribes a single audio chunk using the pre-loaded Whisper model.
     """
     temp_path = f"temp_chunk_{chunk_index}.wav"
     audio_chunk.export(temp_path, format="wav")
-    audio, sampling_rate = sf.read(temp_path)
-    inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
-    input_features = inputs.input_features.to(device)
-    predicted_ids = model.generate(input_features)
-    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
-    os.remove(temp_path)  # Clean up temporary file
+
+    audio, _ = sf.read(temp_path)
+    inputs = whisper_models["processor"](audio, sampling_rate=16000, return_tensors="pt")
+    input_features = inputs.input_features.to(whisper_models["device"])
+
+    predicted_ids = whisper_models["model"].generate(input_features)
+    transcription = whisper_models["processor"].batch_decode(predicted_ids, skip_special_tokens=True)[0]
+
+    os.remove(temp_path)
     return transcription
 
-def speech_to_text_long(audio_path):
+def speech_to_text_long(audio_path, whisper_models):
     """
     Transcribes a long audio file by splitting it into chunks.
-
-    Args:
-        audio_path (str): Path to the audio file.
-
-    Returns:
-        str: Full transcription of the audio.
     """
     processed_audio_path = "processed_audio.wav"
     preprocess_audio(audio_path, processed_audio_path)
 
-    # Split audio into chunks
-    chunks = split_audio(processed_audio_path, chunk_length_ms=30000)  # 30 seconds per chunk
-    transcriptions = []
-
-    for idx, chunk in enumerate(chunks):
-        print(f"Transcribing chunk {idx + 1} of {len(chunks)}...")
-        transcription = transcribe_chunk(chunk, idx)
-        transcriptions.append(transcription)
+    chunks = split_audio(processed_audio_path)
+    transcriptions = [transcribe_chunk(chunk, idx, whisper_models) for idx, chunk in enumerate(chunks)]
 
-    return " ".join(transcriptions)
+    os.remove(processed_audio_path)
+    return " ".join(transcriptions)
utils/vocals.py CHANGED
@@ -1,25 +1,10 @@
-from transformers import AutoModelForAudioClassification, AutoFeatureExtractor
 import librosa
 import torch
 import numpy as np
 
-model_id = "firdhokk/speech-emotion-recognition-with-openai-whisper-large-v3"
-model = AutoModelForAudioClassification.from_pretrained(model_id)
-feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, do_normalize=True)
-id2label = model.config.id2label
-
 def preprocess_audio(audio_array, feature_extractor, sampling_rate, max_length=3000):
     """
     Preprocesses audio for emotion prediction.
-
-    Args:
-        audio_array (np.array): The audio data as a numpy array.
-        feature_extractor: The feature extractor for the model.
-        sampling_rate (int): The sampling rate of the audio.
-        max_length (int): Maximum length of the audio features.
-
-    Returns:
-        dict: Preprocessed inputs for the model.
     """
     inputs = feature_extractor(
         audio_array,
@@ -41,22 +26,11 @@ def preprocess_audio(audio_array, feature_extractor, sampling_rate, max_length=3000):
 def predict_emotion(audio_path, model, feature_extractor, id2label, sampling_rate=16000, chunk_duration=8.0):
     """
     Predicts emotions from an audio file.
-
-    Args:
-        audio_path (str): Path to the audio file.
-        model: The emotion prediction model.
-        feature_extractor: The feature extractor for the model.
-        id2label (dict): Mapping from label IDs to emotion names.
-        sampling_rate (int): The sampling rate of the audio.
-        chunk_duration (float): Duration of each chunk in seconds.
-
-    Returns:
-        list: List of dictionaries containing emotion predictions for each chunk.
     """
     audio_array, _ = librosa.load(audio_path, sr=sampling_rate)
     chunk_length = int(sampling_rate * chunk_duration)
     num_chunks = len(audio_array) // chunk_length + int(len(audio_array) % chunk_length > 0)
-
+
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     model = model.to(device)
 
@@ -78,7 +52,7 @@ def predict_emotion(audio_path, model, feature_extractor, id2label, sampling_rate=16000, chunk_duration=8.0):
         logits = outputs.logits
         predicted_id = torch.argmax(logits, dim=-1).item()
         predicted_label = id2label[predicted_id]
-
+
         results.append({"chunk": i + 1, "start_time": start_time, "end_time": end_time, "emotion": predicted_label})
 
-    return results
+    return results
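The same injection style applies here; a usage sketch reusing load_models(), with speech.wav as a placeholder input.

```python
# Sketch: chunk-wise emotion prediction with the models loaded at startup.
# "speech.wav" is a placeholder path.
from utils.models import load_models
from utils.vocals import predict_emotion

models = load_models()
for entry in predict_emotion(
    "speech.wav",
    models["emotion_model"],
    models["emotion_feature_extractor"],
    models["emotion_id2label"],
):
    print(entry["chunk"], entry["start_time"], entry["end_time"], entry["emotion"])
```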