vitorcalvi committed on
Commit
0073001
·
1 Parent(s): c9de652
Files changed (3) hide show
  1. Dockerfile +3 -0
  2. app.py +17 -31
  3. requirements.txt +1 -1
Dockerfile CHANGED
@@ -8,6 +8,9 @@ WORKDIR /app
8
  COPY requirements.txt .
9
  RUN pip install --no-cache-dir -r requirements.txt
10
 
 
 
 
11
  # Copy the application code.
12
  COPY . .
13
 
 
8
  COPY requirements.txt .
9
  RUN pip install --no-cache-dir -r requirements.txt
10
 
11
+ # Install additional system dependencies for soundfile
12
+ RUN apt-get update && apt-get install -y libsndfile1
13
+
14
  # Copy the application code.
15
  COPY . .
16
 
app.py CHANGED
@@ -1,7 +1,7 @@
1
  from fastapi import FastAPI, File, UploadFile, HTTPException, Form
2
  from fastapi.responses import JSONResponse
3
  from pydantic import BaseModel
4
- import torchaudio
5
  import numpy as np
6
  import tempfile
7
  import os
@@ -12,42 +12,28 @@ warnings.filterwarnings("ignore")
12
  app = FastAPI()
13
 
14
  def extract_audio_features(audio_file_path):
15
- # Load the audio file using torchaudio
16
- waveform, sample_rate = torchaudio.load(audio_file_path)
17
-
18
- # Ensure waveform is mono by averaging channels if necessary
19
- if waveform.shape[0] > 1:
20
- waveform = waveform.mean(dim=0, keepdim=True)
21
-
22
- waveform = waveform.squeeze() # Remove channel dimension if it's 1
23
-
24
- # Extract pitch (fundamental frequency)
25
- pitch_frequencies, voiced_flags, _ = torchaudio.functional.detect_pitch_frequency(
26
- waveform, sample_rate, frame_time=0.01, win_length=1024
27
- )
28
- f0 = pitch_frequencies[voiced_flags > 0]
29
 
30
- # Extract energy
31
- energy = waveform.pow(2).numpy()
32
-
33
- # Extract MFCCs
34
- mfcc_transform = torchaudio.transforms.MFCC(sample_rate=sample_rate, n_mfcc=13)
35
- mfccs = mfcc_transform(waveform.unsqueeze(0)).squeeze(0).numpy()
 
36
 
37
- # Estimate speech rate (simplified)
38
- tempo = torchaudio.functional.estimate_tempo(waveform, sample_rate)
39
- speech_rate = tempo / 60 if tempo is not None else 0
 
40
 
41
- return f0.numpy(), energy, speech_rate, mfccs, waveform.numpy(), sample_rate
42
 
43
  def analyze_voice_stress(audio_file_path):
44
  f0, energy, speech_rate, mfccs, waveform, sample_rate = extract_audio_features(audio_file_path)
45
- if len(f0) == 0:
46
- raise ValueError("Could not extract fundamental frequency from the audio.")
47
- mean_f0 = np.mean(f0)
48
- std_f0 = np.std(f0)
49
- mean_energy = np.mean(energy)
50
- std_energy = np.std(energy)
51
  gender = 'male' if mean_f0 < 165 else 'female'
52
  norm_mean_f0 = 110 if gender == 'male' else 220
53
  norm_std_f0 = 20
 
1
  from fastapi import FastAPI, File, UploadFile, HTTPException, Form
2
  from fastapi.responses import JSONResponse
3
  from pydantic import BaseModel
4
+ import soundfile as sf
5
  import numpy as np
6
  import tempfile
7
  import os
 
12
  app = FastAPI()
13
 
14
  def extract_audio_features(audio_file_path):
15
+ # Load the audio file using soundfile
16
+ waveform, sample_rate = sf.read(audio_file_path)
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
+ # Ensure waveform is a 1D array (mono audio)
19
+ if waveform.ndim > 1:
20
+ waveform = waveform.mean(axis=1)
21
+
22
+ # Calculate basic features (pitch estimation requires a more complex algorithm, but we'll simplify)
23
+ energy = np.mean(waveform ** 2)
24
+ mfccs = np.mean(np.abs(np.fft.fft(waveform)[:13]), axis=0) # Simplified MFCC-like features
25
 
26
+ # Placeholder for speech rate and fundamental frequency
27
+ # Speech rate and pitch extraction would require more complex DSP techniques or external libraries.
28
+ speech_rate = 4.0 # Arbitrary placeholder value for speech rate
29
+ f0 = np.mean(np.abs(np.diff(waveform))) * sample_rate / (2 * np.pi) # Rough pitch estimate
30
 
31
+ return f0, energy, speech_rate, mfccs, waveform, sample_rate
32
 
33
  def analyze_voice_stress(audio_file_path):
34
  f0, energy, speech_rate, mfccs, waveform, sample_rate = extract_audio_features(audio_file_path)
35
+ mean_f0 = f0
36
+ mean_energy = energy
 
 
 
 
37
  gender = 'male' if mean_f0 < 165 else 'female'
38
  norm_mean_f0 = 110 if gender == 'male' else 220
39
  norm_std_f0 = 20
requirements.txt CHANGED
@@ -1,6 +1,6 @@
1
  fastapi
2
  uvicorn
3
- torchaudio
4
  numpy
5
  pydantic
6
  python-multipart
 
1
  fastapi
2
  uvicorn
3
+ soundfile
4
  numpy
5
  pydantic
6
  python-multipart