PMS61 committed
Commit 3a70449 · 1 Parent(s): 8ecb9cb

fixes

Files changed:
- Dockerfile +7 -4
- app.py +13 -10
- requirements.txt +1 -1
- utils/expressions.py +4 -21
- utils/models.py +35 -0
- utils/transcription.py +17 -54
- utils/vocals.py +3 -29
Dockerfile
CHANGED
@@ -4,6 +4,9 @@ FROM python:3.11-slim
 # Set the working directory in the container
 WORKDIR /code
 
+# Set the PYTHONPATH environment variable
+ENV PYTHONPATH="/code"
+
 # Install system dependencies required by your project (ffmpeg, opencv)
 RUN apt-get update && apt-get install -y --no-install-recommends \
     build-essential \
@@ -16,9 +19,9 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
 COPY ./requirements.txt /code/requirements.txt
 
 # Install any needed packages specified in requirements.txt
-# We add gunicorn here for a production-ready web server
+# We add gunicorn and gevent here for a production-ready web server
 RUN pip install --no-cache-dir --upgrade pip
-RUN pip install --no-cache-dir -r requirements.txt gunicorn
+RUN pip install --no-cache-dir -r requirements.txt gunicorn gevent
 
 # Copy the rest of the application's code to the working directory
 COPY . /code/
@@ -26,5 +29,5 @@ COPY . /code/
 # Expose the port the app runs on (Hugging Face Spaces default is 7860)
 EXPOSE 7860
 
-# Command to run the application using Gunicorn
-CMD ["gunicorn", "--bind", "0.0.0.0:7860", "--timeout", "600", "app:app"]
+# Command to run the application using Gunicorn with gevent workers
+CMD ["gunicorn", "--bind", "0.0.0.0:7860", "--workers", "4", "--worker-class", "gevent", "--timeout", "600", "app:app"]
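The new CMD bakes the Gunicorn settings into the image. As an aside (not part of this commit), the same values could instead live in a gunicorn.conf.py next to app.py; the sketch below only mirrors the flags already used above.

```python
# gunicorn.conf.py -- illustrative only, not part of this commit.
# Mirrors the flags passed in the Dockerfile CMD above.
bind = "0.0.0.0:7860"      # Hugging Face Spaces expects the app on port 7860
workers = 4                # same as --workers 4
worker_class = "gevent"    # cooperative workers for long, I/O-heavy requests
timeout = 600              # allow long transcription/analysis requests
```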
app.py
CHANGED
@@ -10,17 +10,20 @@ from utils.transcription import speech_to_text_long
 from utils.vocals import predict_emotion
 from utils.vocabulary import evaluate_vocabulary
 from groq import Groq
-from transformers import AutoModelForAudioClassification, AutoFeatureExtractor
 import pandas as pd
 from bson import ObjectId
 import json
 from dotenv import load_dotenv
 from datetime import datetime
+from utils.models import load_models
 
 load_dotenv()
 app = Flask(__name__)
 CORS(app)
 
+# Load models on startup
+models = load_models()
+
 # MongoDB connection
 client = pymongo.MongoClient("mongodb+srv://pmsankheb23:[email protected]/")
 db = client["Eloquence"]
@@ -39,11 +42,6 @@ app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
 if not os.path.exists(UPLOAD_FOLDER):
     os.makedirs(UPLOAD_FOLDER)
 
-model_id = "firdhokk/speech-emotion-recognition-with-openai-whisper-large-v3"
-model = AutoModelForAudioClassification.from_pretrained(model_id)
-feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, do_normalize=True)
-id2label = model.config.id2label
-
 def allowed_file(filename):
     return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
 
@@ -96,10 +94,15 @@ def upload_file():
 
     emotion_analysis = pd.DataFrame()
     if mode == "video":
-        emotion_analysis = analyze_video_emotions(file_path)
-
-    transcription = speech_to_text_long(audio_path)
-    audio_emotion = predict_emotion(
+        emotion_analysis = analyze_video_emotions(file_path, models["fer"])
+
+    transcription = speech_to_text_long(audio_path, models["whisper"])
+    audio_emotion = predict_emotion(
+        audio_path,
+        models["emotion_model"],
+        models["emotion_feature_extractor"],
+        models["emotion_id2label"],
+    )
     vocabulary_report = evaluate_vocabulary(transcription, context)
     scores = generate_scores(transcription, audio_emotion, emotion_analysis)
     speech_report = generate_speech_report(transcription, context, audio_emotion)
requirements.txt
CHANGED
@@ -1,4 +1,3 @@
-absl-py
 annotated-types
 anyio
 audeer
@@ -34,6 +33,7 @@ Flask-Cors
 fonttools
 fsspec
 future
+gevent
 groq
 h11
 h5py
utils/expressions.py
CHANGED
@@ -1,44 +1,27 @@
-from fer import Video
-from fer import FER
+from fer import Video
 import pandas as pd
 
-def analyze_video_emotions(video_file_path):
+def analyze_video_emotions(video_file_path, face_detector):
     """
     Analyzes the emotions in a given video file and returns a dataframe of scores.
-
-    Args:
-        video_file_path (str): Path to the video file to be analyzed.
-
-    Returns:
-        pd.DataFrame: DataFrame containing the emotion scores.
     """
-    # Initialize the face detector
-    face_detector = FER(mtcnn=True)
-
-    # Input the video for processing
     input_video = Video(video_file_path)
-
-    # Analyze the video
     processing_data = input_video.analyze(face_detector, display=False)
 
-    # Check if any faces were detected
     if not processing_data:
         print("No faces detected in the video.")
-        return pd.DataFrame()
+        return pd.DataFrame()
 
-    # Convert the results to a DataFrame
     vid_df = input_video.to_pandas(processing_data)
     vid_df = input_video.get_first_face(vid_df)
     vid_df = input_video.get_emotions(vid_df)
 
-    # Calculate the sum of each emotion
    emotions = ['angry', 'disgust', 'fear', 'happy', 'sad', 'surprise', 'neutral']
    emotions_values = [sum(vid_df[emotion]) for emotion in emotions]
 
-    # Create a DataFrame for comparison
     score_comparisons = pd.DataFrame({
         'Human Emotions': [emotion.capitalize() for emotion in emotions],
         'Emotion Value from the Video': emotions_values
     })
 
-    return score_comparisons
+    return score_comparisons
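Because the FER detector is now injected instead of being built inside the function, a standalone caller has to construct one itself. A minimal sketch, assuming a placeholder video path:

```python
from fer import FER

from utils.expressions import analyze_video_emotions

detector = FER(mtcnn=True)  # same constructor call that utils/models.py uses for the "fer" entry
scores = analyze_video_emotions("sample_clip.mp4", detector)  # placeholder video path
if not scores.empty:
    print(scores.to_string(index=False))
```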
utils/models.py
ADDED
@@ -0,0 +1,35 @@
+import torch
+from transformers import WhisperProcessor, WhisperForConditionalGeneration, AutoModelForAudioClassification, AutoFeatureExtractor
+from fer import FER
+
+def load_models():
+    """
+    Loads all the machine learning models and returns them as a dictionary.
+    """
+    # Whisper model for transcription
+    whisper_model_name = "openai/whisper-base"
+    whisper_processor = WhisperProcessor.from_pretrained(whisper_model_name)
+    whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_name)
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    whisper_model = whisper_model.to(device)
+
+    # Speech emotion recognition model
+    emotion_model_id = "firdhokk/speech-emotion-recognition-with-openai-whisper-large-v3"
+    emotion_model = AutoModelForAudioClassification.from_pretrained(emotion_model_id)
+    emotion_feature_extractor = AutoFeatureExtractor.from_pretrained(emotion_model_id, do_normalize=True)
+    emotion_id2label = emotion_model.config.id2label
+
+    # Facial emotion recognition model
+    fer_detector = FER(mtcnn=True)
+
+    return {
+        "whisper": {
+            "processor": whisper_processor,
+            "model": whisper_model,
+            "device": device,
+        },
+        "emotion_model": emotion_model,
+        "emotion_feature_extractor": emotion_feature_extractor,
+        "emotion_id2label": emotion_id2label,
+        "fer": fer_detector,
+    }
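A quick sketch of what load_models() hands back, useful when wiring new call sites; the printed device value simply depends on whether CUDA is visible:

```python
from utils.models import load_models

# Loading is slow the first time (model downloads); do it once per process, as app.py does at import.
models = load_models()

print(sorted(models.keys()))        # ['emotion_feature_extractor', 'emotion_id2label', 'emotion_model', 'fer', 'whisper']
print(models["whisper"]["device"])  # 'cuda' when a GPU is available, otherwise 'cpu'
```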
utils/transcription.py
CHANGED
@@ -1,25 +1,11 @@
 import torch
-from transformers import WhisperProcessor, WhisperForConditionalGeneration
 from pydub import AudioSegment
 import soundfile as sf
 import os
 
-model_name = "openai/whisper-base"
-processor = WhisperProcessor.from_pretrained(model_name)
-model = WhisperForConditionalGeneration.from_pretrained(model_name)
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model = model.to(device)
-
 def preprocess_audio(input_audio_path, output_audio_path):
     """
     Converts audio to 16kHz WAV format.
-
-    Args:
-        input_audio_path (str): Path to the input audio file.
-        output_audio_path (str): Path to save the processed audio file.
-
-    Returns:
-        str: Path to the processed audio file.
     """
     audio = AudioSegment.from_file(input_audio_path)
     audio = audio.set_frame_rate(16000).set_channels(1)
@@ -29,59 +15,36 @@ def preprocess_audio(input_audio_path, output_audio_path):
 def split_audio(audio_path, chunk_length_ms=30000):
     """
     Splits audio into chunks of specified length.
-
-    Args:
-        audio_path (str): Path to the audio file.
-        chunk_length_ms (int): Length of each chunk in milliseconds.
-
-    Returns:
-        list: List of audio chunks.
     """
     audio = AudioSegment.from_file(audio_path)
-
-    return chunks
+    return [audio[i : i + chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]
 
-def transcribe_chunk(audio_chunk, chunk_index):
+def transcribe_chunk(audio_chunk, chunk_index, whisper_models):
     """
-    Transcribes a single audio chunk.
-
-    Args:
-        audio_chunk (AudioSegment): The audio chunk to transcribe.
-        chunk_index (int): Index of the chunk.
-
-    Returns:
-        str: Transcription of the chunk.
+    Transcribes a single audio chunk using the pre-loaded Whisper model.
     """
     temp_path = f"temp_chunk_{chunk_index}.wav"
     audio_chunk.export(temp_path, format="wav")
-
-
-
-
-
-
+
+    audio, _ = sf.read(temp_path)
+    inputs = whisper_models["processor"](audio, sampling_rate=16000, return_tensors="pt")
+    input_features = inputs.input_features.to(whisper_models["device"])
+
+    predicted_ids = whisper_models["model"].generate(input_features)
+    transcription = whisper_models["processor"].batch_decode(predicted_ids, skip_special_tokens=True)[0]
+
+    os.remove(temp_path)
     return transcription
 
-def speech_to_text_long(audio_path):
+def speech_to_text_long(audio_path, whisper_models):
     """
     Transcribes a long audio file by splitting it into chunks.
-
-    Args:
-        audio_path (str): Path to the audio file.
-
-    Returns:
-        str: Full transcription of the audio.
     """
     processed_audio_path = "processed_audio.wav"
     preprocess_audio(audio_path, processed_audio_path)
 
-
-
-    transcriptions = []
-
-    for idx, chunk in enumerate(chunks):
-        print(f"Transcribing chunk {idx + 1} of {len(chunks)}...")
-        transcription = transcribe_chunk(chunk, idx)
-        transcriptions.append(transcription)
+    chunks = split_audio(processed_audio_path)
+    transcriptions = [transcribe_chunk(chunk, idx, whisper_models) for idx, chunk in enumerate(chunks)]
 
-
+    os.remove(processed_audio_path)
+    return " ".join(transcriptions)
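With the module-level Whisper globals gone, the transcription helpers only work when handed the models["whisper"] sub-dictionary. A minimal smoke-test sketch, assuming utils.models is importable and using a synthetic silent clip, so the decoded text is typically empty or near-empty:

```python
from pydub import AudioSegment

from utils.models import load_models
from utils.transcription import transcribe_chunk

models = load_models()
silent = AudioSegment.silent(duration=2000, frame_rate=16000)  # 2 s of mono silence at 16 kHz
print(repr(transcribe_chunk(silent, 0, models["whisper"])))    # expect a short or empty string
```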
utils/vocals.py
CHANGED
@@ -1,25 +1,10 @@
-from transformers import AutoModelForAudioClassification, AutoFeatureExtractor
 import librosa
 import torch
 import numpy as np
 
-model_id = "firdhokk/speech-emotion-recognition-with-openai-whisper-large-v3"
-model = AutoModelForAudioClassification.from_pretrained(model_id)
-feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, do_normalize=True)
-id2label = model.config.id2label
-
 def preprocess_audio(audio_array, feature_extractor, sampling_rate, max_length=3000):
     """
     Preprocesses audio for emotion prediction.
-
-    Args:
-        audio_array (np.array): The audio data as a numpy array.
-        feature_extractor: The feature extractor for the model.
-        sampling_rate (int): The sampling rate of the audio.
-        max_length (int): Maximum length of the audio features.
-
-    Returns:
-        dict: Preprocessed inputs for the model.
     """
     inputs = feature_extractor(
         audio_array,
@@ -41,22 +26,11 @@ def preprocess_audio(audio_array, feature_extractor, sampling_rate, max_length=3000):
 def predict_emotion(audio_path, model, feature_extractor, id2label, sampling_rate=16000, chunk_duration=8.0):
     """
     Predicts emotions from an audio file.
-
-    Args:
-        audio_path (str): Path to the audio file.
-        model: The emotion prediction model.
-        feature_extractor: The feature extractor for the model.
-        id2label (dict): Mapping from label IDs to emotion names.
-        sampling_rate (int): The sampling rate of the audio.
-        chunk_duration (float): Duration of each chunk in seconds.
-
-    Returns:
-        list: List of dictionaries containing emotion predictions for each chunk.
     """
     audio_array, _ = librosa.load(audio_path, sr=sampling_rate)
     chunk_length = int(sampling_rate * chunk_duration)
     num_chunks = len(audio_array) // chunk_length + int(len(audio_array) % chunk_length > 0)
-
+
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     model = model.to(device)
 
@@ -78,7 +52,7 @@ def predict_emotion(audio_path, model, feature_extractor, id2label, sampling_rate=16000, chunk_duration=8.0):
         logits = outputs.logits
         predicted_id = torch.argmax(logits, dim=-1).item()
         predicted_label = id2label[predicted_id]
-
+
         results.append({"chunk": i + 1, "start_time": start_time, "end_time": end_time, "emotion": predicted_label})
 
-    return results
+    return results
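predict_emotion keeps its signature, but the model, feature extractor, and label map now come from load_models() instead of module globals. A usage sketch with a placeholder audio file:

```python
from utils.models import load_models
from utils.vocals import predict_emotion

models = load_models()
results = predict_emotion(
    "speech_sample.wav",                  # placeholder audio path
    models["emotion_model"],
    models["emotion_feature_extractor"],
    models["emotion_id2label"],
)
for entry in results:
    # Each entry holds the chunk index, its time span, and the predicted emotion label.
    print(entry["chunk"], entry["start_time"], entry["end_time"], entry["emotion"])
```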