mulasagg committed
Commit 8031a8f · 1 Parent(s): 29c8799
.gitignore ADDED
@@ -0,0 +1 @@
+ **/__pycache__/
Dockerfile ADDED
@@ -0,0 +1,29 @@
+ # Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
+ # you will also find guides on how best to write your Dockerfile
+
+ FROM python:3.9
+
+ # Create user
+ RUN useradd -m -u 1000 user
+
+ # Install system packages (as root before switching to user)
+ RUN apt-get update && apt-get install -y ffmpeg && rm -rf /var/lib/apt/lists/*
+
+ # Switch to non-root user
+ USER user
+
+ # Set pip install path
+ ENV PATH="/home/user/.local/bin:$PATH"
+
+ # Set working directory
+ WORKDIR /app
+
+ # Install Python dependencies
+ COPY --chown=user ./requirements.txt requirements.txt
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+ # Copy rest of the code
+ COPY --chown=user . /app
+
+ # Run the app
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,10 +1,11 @@
  ---
- title: Voice
- emoji: 📊
- colorFrom: blue
- colorTo: green
+ title: Voice Deploy
+ emoji: 🏢
+ colorFrom: green
+ colorTo: gray
  sdk: docker
  pinned: false
+ license: mit
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,392 @@
+ from fastapi import FastAPI, UploadFile, File, Form, HTTPException
+ from fastapi.responses import JSONResponse
+ from fastapi.middleware.cors import CORSMiddleware
+ import sys
+ import os
+ import shutil
+ import uuid
+
+ # Ensure sibling module fluency is discoverable
+ # sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
+ from fluency.fluency_api import main as analyze_fluency_main
+ from tone_modulation.tone_api import main as analyze_tone_main
+ from vcs.vcs_api import main as analyze_vcs_main
+ from vers.vers_api import main as analyze_vers_main
+ from voice_confidence_score.voice_confidence_api import main as analyze_voice_confidence_main
+ from vps.vps_api import main as analyze_vps_main
+ from ves.ves import calc_voice_engagement_score
+ from transcribe import transcribe_audio
+ from filler_count.filler_score import analyze_fillers
+ # from emotion.emo_predict import predict_emotion
+
+ app = FastAPI()
+
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],  # In production, replace "*" with allowed frontend domains
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ @app.post("/analyze_fluency/")
+ async def analyze_fluency(file: UploadFile):
+     # A Pydantic request model could be added later if stricter validation is needed
+     if not file.filename.endswith(('.wav', '.mp3', '.m4a', '.mp4', '.flac')):
+         raise HTTPException(status_code=400, detail="Invalid file type. Supported formats: .wav, .mp3, .m4a, .mp4, .flac.")
+
+     # Generate a safe temporary file path for the uploaded file; it is deleted after processing
+     temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(file.filename)[1]}"
+     temp_dir = "temp_uploads"
+     temp_filepath = os.path.join(temp_dir, temp_filename)
+     os.makedirs(temp_dir, exist_ok=True)
+
+     try:
+         # Save uploaded file
+         with open(temp_filepath, "wb") as buffer:
+             shutil.copyfileobj(file.file, buffer)
+
+         result = analyze_fluency_main(temp_filepath, model_size="base")
+
+         return JSONResponse(content=result)
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Fluency analysis failed: {str(e)}")
+
+     finally:
+         # Clean up temporary file
+         if os.path.exists(temp_filepath):
+             os.remove(temp_filepath)
+
+ @app.post('/analyze_tone/')
+ async def analyze_tone(file: UploadFile):
+     """
+     Endpoint to analyze the tone of an uploaded audio file (.wav, .mp3, .m4a, .mp4, .flac).
+     """
+     if not file.filename.endswith(('.wav', '.mp3', '.m4a', '.mp4', '.flac')):
+         raise HTTPException(status_code=400, detail="Invalid file type. Supported formats: .wav, .mp3, .m4a, .mp4, .flac.")
+
+     # Generate a safe temporary file path
+     temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(file.filename)[1]}"
+     temp_dir = "temp_uploads"
+     temp_filepath = os.path.join(temp_dir, temp_filename)
+     os.makedirs(temp_dir, exist_ok=True)
+
+     try:
+         # Save uploaded file
+         with open(temp_filepath, "wb") as buffer:
+             shutil.copyfileobj(file.file, buffer)
+
+         # Analyze tone using your custom function
+         result = analyze_tone_main(temp_filepath)
+
+         return JSONResponse(content=result)
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Tone analysis failed: {str(e)}")
+
+     finally:
+         # Clean up temporary file
+         if os.path.exists(temp_filepath):
+             os.remove(temp_filepath)
+
+ @app.post('/analyze_vcs/')
+ async def analyze_vcs(file: UploadFile):
+     """
+     Endpoint to analyze voice clarity of an uploaded audio file (.wav, .mp3, .m4a, .mp4, .flac).
+     """
+     if not file.filename.endswith(('.wav', '.mp3', '.m4a', '.mp4', '.flac')):
+         raise HTTPException(status_code=400, detail="Invalid file type. Supported formats: .wav, .mp3, .m4a, .mp4, .flac.")
+
+     # Generate a safe temporary file path
+     temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(file.filename)[1]}"
+     temp_dir = "temp_uploads"
+     temp_filepath = os.path.join(temp_dir, temp_filename)
+     os.makedirs(temp_dir, exist_ok=True)
+
+     try:
+         # Save uploaded file
+         with open(temp_filepath, "wb") as buffer:
+             shutil.copyfileobj(file.file, buffer)
+
+         # Analyze voice clarity using your custom function
+         result = analyze_vcs_main(temp_filepath)
+
+         return JSONResponse(content=result)
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Voice clarity analysis failed: {str(e)}")
+
+     finally:
+         # Clean up temporary file
+         if os.path.exists(temp_filepath):
+             os.remove(temp_filepath)
+
+ @app.post('/analyze_vers/')
+ async def analyze_vers(file: UploadFile):
+     """
+     Endpoint to analyze VERS of an uploaded audio file (.wav, .mp3, .m4a, .mp4, .flac).
+     """
+     if not file.filename.endswith(('.wav', '.mp3', '.m4a', '.mp4', '.flac')):
+         raise HTTPException(status_code=400, detail="Invalid file type. Supported formats: .wav, .mp3, .m4a, .mp4, .flac.")
+
+     # Generate a safe temporary file path
+     temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(file.filename)[1]}"
+     temp_dir = "temp_uploads"
+     temp_filepath = os.path.join(temp_dir, temp_filename)
+     os.makedirs(temp_dir, exist_ok=True)
+
+     try:
+         # Save uploaded file
+         with open(temp_filepath, "wb") as buffer:
+             shutil.copyfileobj(file.file, buffer)
+
+         # Analyze VERS using your custom function
+         result = analyze_vers_main(temp_filepath)
+
+         return JSONResponse(content=result)
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"VERS analysis failed: {str(e)}")
+
+     finally:
+         # Clean up temporary file
+         if os.path.exists(temp_filepath):
+             os.remove(temp_filepath)
+
+ @app.post('/voice_confidence/')
+ async def analyze_voice_confidence(file: UploadFile):
+     """
+     Endpoint to analyze voice confidence of an uploaded audio file (.wav, .mp3, .m4a, .mp4, .flac).
+     """
+     if not file.filename.endswith(('.wav', '.mp3', '.m4a', '.mp4', '.flac')):
+         raise HTTPException(status_code=400, detail="Invalid file type. Supported formats: .wav, .mp3, .m4a, .mp4, .flac.")
+
+     # Generate a safe temporary file path
+     temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(file.filename)[1]}"
+     temp_dir = "temp_uploads"
+     temp_filepath = os.path.join(temp_dir, temp_filename)
+     os.makedirs(temp_dir, exist_ok=True)
+
+     try:
+         # Save uploaded file
+         with open(temp_filepath, "wb") as buffer:
+             shutil.copyfileobj(file.file, buffer)
+
+         # Analyze voice confidence using your custom function
+         result = analyze_voice_confidence_main(temp_filepath)
+
+         return JSONResponse(content=result)
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Voice confidence analysis failed: {str(e)}")
+
+     finally:
+         # Clean up temporary file
+         if os.path.exists(temp_filepath):
+             os.remove(temp_filepath)
+
+ @app.post('/analyze_vps/')
+ async def analyze_vps(file: UploadFile):
+     """
+     Endpoint to analyze voice pacing score of an uploaded audio file (.wav, .mp3, .m4a, .mp4, .flac).
+     """
+     if not file.filename.endswith(('.wav', '.mp3', '.m4a', '.mp4', '.flac')):
+         raise HTTPException(status_code=400, detail="Invalid file type. Supported formats: .wav, .mp3, .m4a, .mp4, .flac.")
+
+     # Generate a safe temporary file path
+     temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(file.filename)[1]}"
+     temp_dir = "temp_uploads"
+     temp_filepath = os.path.join(temp_dir, temp_filename)
+     os.makedirs(temp_dir, exist_ok=True)
+
+     try:
+         # Save uploaded file
+         with open(temp_filepath, "wb") as buffer:
+             shutil.copyfileobj(file.file, buffer)
+
+         # Analyze voice pacing score using your custom function
+         result = analyze_vps_main(temp_filepath)
+
+         return JSONResponse(content=result)
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Voice pacing score analysis failed: {str(e)}")
+
+     finally:
+         # Clean up temporary file
+         if os.path.exists(temp_filepath):
+             os.remove(temp_filepath)
+
+ @app.post('/voice_engagement_score/')
+ async def analyze_voice_engagement_score(file: UploadFile):
+     """
+     Endpoint to analyze voice engagement score of an uploaded audio file (.wav, .mp3, .m4a, .mp4, .flac).
+     """
+     if not file.filename.endswith(('.wav', '.mp3', '.m4a', '.mp4', '.flac')):
+         raise HTTPException(status_code=400, detail="Invalid file type. Supported formats: .wav, .mp3, .m4a, .mp4, .flac.")
+
+     # Generate a safe temporary file path
+     temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(file.filename)[1]}"
+     temp_dir = "temp_uploads"
+     temp_filepath = os.path.join(temp_dir, temp_filename)
+     os.makedirs(temp_dir, exist_ok=True)
+
+     try:
+         # Save uploaded file
+         with open(temp_filepath, "wb") as buffer:
+             shutil.copyfileobj(file.file, buffer)
+
+         # Analyze voice engagement score using your custom function
+         result = calc_voice_engagement_score(temp_filepath)
+
+         return JSONResponse(content=result)
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Voice engagement score analysis failed: {str(e)}")
+
+     finally:
+         # Clean up temporary file
+         if os.path.exists(temp_filepath):
+             os.remove(temp_filepath)
+
+ @app.post('/analyze_fillers/')
+ async def analyze_fillers_count(file: UploadFile):
+     """
+     Endpoint to analyze filler words in an uploaded audio file (.wav, .mp3, .mp4, .m4a, .flac).
+     """
+     if not file.filename.endswith(('.wav', '.mp3', '.mp4', '.m4a', '.flac')):
+         raise HTTPException(status_code=400, detail="Invalid file type. Supported formats: .wav, .mp3, .mp4, .m4a, .flac.")
+
+     # Generate a safe temporary file path
+     temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(file.filename)[1]}"
+     temp_dir = "temp_uploads"
+     temp_filepath = os.path.join(temp_dir, temp_filename)
+     os.makedirs(temp_dir, exist_ok=True)
+
+     try:
+         # Save uploaded file
+         with open(temp_filepath, "wb") as buffer:
+             shutil.copyfileobj(file.file, buffer)
+
+         # Call the analysis function with the file path
+         result = analyze_fillers(temp_filepath)  # Pass the file path, not the UploadFile object
+
+         return JSONResponse(content=result)
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Filler analysis failed: {str(e)}")
+
+     finally:
+         # Clean up temporary file
+         if os.path.exists(temp_filepath):
+             os.remove(temp_filepath)
+
+
+ import time
+
+
+ @app.post('/transcribe/')
+ async def transcribe(file: UploadFile):
+     """
+     Endpoint to transcribe an uploaded audio file (.wav, .mp3, .mp4, .m4a, .flac).
+     """
+     # Measure time taken to transcribe
+     start_time = time.time()
+     if not file.filename.endswith(('.wav', '.mp3', '.mp4', '.m4a', '.flac')):
+         raise HTTPException(status_code=400, detail="Invalid file type. Supported formats: .wav, .mp3, .mp4, .m4a, .flac.")
+
+     # Generate a safe temporary file path
+     temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(file.filename)[1]}"
+     temp_dir = "temp_uploads"
+     temp_filepath = os.path.join(temp_dir, temp_filename)
+     os.makedirs(temp_dir, exist_ok=True)
+
+     try:
+         # Save uploaded file
+         with open(temp_filepath, "wb") as buffer:
+             shutil.copyfileobj(file.file, buffer)
+
+         # Transcribe using your custom function
+         result = transcribe_audio(temp_filepath, model_size="base")
+         end_time = time.time()
+         transcription_time = end_time - start_time
+         response = {
+             "transcription": result,
+             "transcription_time": transcription_time
+         }
+
+         return JSONResponse(content=response)
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Transcription failed: {str(e)}")
+
+     finally:
+         # Clean up temporary file
+         if os.path.exists(temp_filepath):
+             os.remove(temp_filepath)
+
+ import datetime
+
+ @app.post('/analyze_all/')
+ async def analyze_all(file: UploadFile):
+     """
+     Endpoint to analyze all aspects of an uploaded audio file (.wav, .mp3, .m4a, .mp4, .flac).
+     """
+     print(f"Received request at {datetime.datetime.now()} for file: {file.filename}")
+     if not file.filename.endswith(('.wav', '.mp3', '.m4a', '.mp4', '.flac')):
+         raise HTTPException(status_code=400, detail="Invalid file type. Supported formats: .wav, .mp3, .m4a, .mp4, .flac.")
+
+     # Generate a safe temporary file path
+     temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(file.filename)[1]}"
+     temp_dir = "temp_uploads"
+     temp_filepath = os.path.join(temp_dir, temp_filename)
+     os.makedirs(temp_dir, exist_ok=True)
+
+     try:
+         # Save uploaded file
+         with open(temp_filepath, "wb") as buffer:
+             shutil.copyfileobj(file.file, buffer)
+
+         # Analyze all aspects using your custom functions
+         fluency_result = analyze_fluency_main(temp_filepath, model_size="base")
+         tone_result = analyze_tone_main(temp_filepath)
+         vcs_result = analyze_vcs_main(temp_filepath)
+         vers_result = analyze_vers_main(temp_filepath)
+         voice_confidence_result = analyze_voice_confidence_main(temp_filepath)
+         vps_result = analyze_vps_main(temp_filepath)
+         ves_result = calc_voice_engagement_score(temp_filepath)
+         filler_count = analyze_fillers(temp_filepath)  # Assuming this function returns a dict with filler count
+         transcript, language, _ = transcribe_audio(temp_filepath, "base")  # fix this
+         # emotion = predict_emotion(temp_filepath)
+         avg_score = (fluency_result['fluency_score'] + tone_result['speech_dynamism_score'] + vcs_result['Voice Clarity Sore'] + vers_result['VERS Score'] + voice_confidence_result['voice_confidence_score'] + vps_result['VPS'] + ves_result['ves']) / 7
+
+         # Combine results into a single response
+         combined_result = {
+             "fluency": fluency_result,
+             "tone": tone_result,
+             "vcs": vcs_result,
+             "vers": vers_result,
+             "voice_confidence": voice_confidence_result,
+             "vps": vps_result,
+             "ves": ves_result,
+             "filler_words": filler_count,
+             "transcript": transcript,
+             "Detected Language": language,
+             # "emotion": emotion,
+             "sank_score": avg_score
+         }
+
+         return JSONResponse(content=combined_result)
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}")
+
+     finally:
+         # Clean up temporary file
+         if os.path.exists(temp_filepath):
+             os.remove(temp_filepath)
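For reference, a minimal sketch of how a client could exercise one of these endpoints once the container is running; the base URL, the sample file name, and the use of the requests library are assumptions for illustration, not part of this commit:

    import requests

    base_url = "http://localhost:7860"  # placeholder; substitute the deployed Space URL
    with open("sample.wav", "rb") as f:
        resp = requests.post(f"{base_url}/analyze_all/",
                             files={"file": ("sample.wav", f, "audio/wav")})
    resp.raise_for_status()
    print(resp.json().get("sank_score"))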
emotion/__init__.py ADDED
File without changes
emotion/emo_predict.py ADDED
@@ -0,0 +1,28 @@
+ from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification
+ import librosa
+ import torch
+
+ # Load the feature extractor and model
+ feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("r-f/wav2vec-english-speech-emotion-recognition")
+ model = Wav2Vec2ForSequenceClassification.from_pretrained("r-f/wav2vec-english-speech-emotion-recognition")
+ model.eval()
+
+ def predict_emotion(audio_path):
+     # Load audio (mono, 16kHz)
+     audio, rate = librosa.load(audio_path, sr=16000)
+
+     # Extract features
+     inputs = feature_extractor(audio, sampling_rate=rate, return_tensors="pt", padding=True)
+
+     # Predict emotion
+     with torch.no_grad():
+         outputs = model(**inputs)
+         probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
+         pred_id = torch.argmax(probs, dim=-1).item()
+         emotion = model.config.id2label[pred_id]
+
+     return emotion
+
+ # # Example usage
+ # emotion = predict_emotion(r"D:\Intern\shankh\audio_samples\anga.wav")
+ # print(f"Predicted Emotion: {emotion}")
filler_count/__init__.py ADDED
File without changes
filler_count/filler_score.py ADDED
@@ -0,0 +1,76 @@
+ import re
+ import whisper
+ from pydub import AudioSegment  # For accurate duration calculation
+
+ def analyze_fillers(file_path: str, model_size: str = "base") -> dict:
+     """
+     Analyzes English filler words in audio with proper duration handling.
+     """
+     try:
+         FILLER_WORDS = [
+             "um", "uh", "hmm", "ah", "er", "eh",
+             "umm", "uhh", "mmm", "ahh", "err",
+             "like", "you know", "well", "so", "actually", "basically",
+             "right", "okay", "sort of", "kind of"
+         ]
+
+         # First get accurate duration using pydub
+         audio = AudioSegment.from_file(file_path)
+         duration = len(audio) / 1000  # Convert ms to seconds
+
+         # Then run Whisper transcription
+         model = whisper.load_model(model_size)
+         result = model.transcribe(file_path, word_timestamps=False, fp16=False)
+         transcript = result["text"]
+
+         # Case-insensitive regex matching
+         pattern = r"(?<!\w)(" + "|".join(map(re.escape, FILLER_WORDS)) + r")(?!\w)"
+         matches = re.findall(pattern, transcript, re.IGNORECASE)
+
+         # Count occurrences
+         filler_counts = {}
+         for word in matches:
+             key = word.lower()
+             filler_counts[key] = filler_counts.get(key, 0) + 1
+         total_fillers = sum(filler_counts.values())
+
+         # Calculate rate per minute
+         filler_per_min = (total_fillers / duration) * 60 if duration > 0 else 0
+
+         # Scoring
+         if total_fillers == 0:
+             filler_score = 100
+         elif filler_per_min < 1:
+             filler_score = 90
+         elif filler_per_min < 3:
+             filler_score = 80
+         elif filler_per_min < 5:
+             filler_score = 60
+         elif filler_per_min < 10:
+             filler_score = 40
+         else:
+             filler_score = 20
+
+         # Generate insight
+         top_fillers = sorted(filler_counts.items(), key=lambda x: x[1], reverse=True)[:2]
+
+         if total_fillers == 0:
+             insight = "Excellent! No filler words detected."
+         elif total_fillers <= 2:
+             insight = f"Minimal fillers ({total_fillers} total), mostly '{top_fillers[0][0]}'."
+         elif total_fillers <= 5:
+             examples = ", ".join(f"'{f[0]}'" for f in top_fillers)
+             insight = f"Moderate fillers ({total_fillers} total), mainly {examples}."
+         else:
+             examples = ", ".join(f"'{f[0]}'" for f in top_fillers)
+             insight = f"Excessive fillers ({total_fillers} total), dominated by {examples}."
+
+         return {
+             "filler_counts": filler_counts,
+             "total_fillers": total_fillers,
+             "filler_score": filler_score,
+             "filler_rate_per_min": round(filler_per_min, 1),
+             "insight": insight
+         }
+
+     except Exception as e:
+         raise RuntimeError(f"Analysis failed: {str(e)}")
fluency/__init__.py ADDED
@@ -0,0 +1,13 @@
+ # fluency/__init__.py
+ from .fluency import calc_srs, calculate_pas, calculate_fluency, get_fluency_insight
+ from .filler_analyzer import detect_fillers
+ from .compute_fluency import compute_fluency_score
+
+ __all__ = [
+     'calc_srs',
+     'calculate_pas',
+     'calculate_fluency',
+     'get_fluency_insight',
+     'detect_fillers',
+     'compute_fluency_score'
+ ]
fluency/compute_fluency.py ADDED
@@ -0,0 +1,106 @@
+ """
+ Compute fluency score from audio file using SRS and PAS calculations
+ """
+
+ import librosa
+ import numpy as np
+ from typing import Dict, Any, Union
+ from .fluency import calc_srs, calculate_pas, calculate_fluency, get_fluency_insight
+ from .filler_analyzer import detect_fillers
+
+ def compute_fluency_score(file_path: str, whisper_model) -> Dict[str, Any]:
+     """
+     Compute fluency score and its components from a speech sample.
+
+     Args:
+         file_path (str): Path to the audio file.
+         whisper_model: Transcription model (e.g., OpenAI Whisper or faster-whisper)
+
+     Returns:
+         dict: A dictionary containing fluency score, SRS, PAS, and component scores.
+     """
+     # Transcribe audio
+     result = whisper_model.transcribe(file_path)
+     transcript = result.get("text", "").strip()
+     segments = result.get("segments", [])
+
+     # Validate early
+     if not transcript or not segments:
+         raise ValueError("Empty transcript or segments from Whisper.")
+
+     # Detect filler words
+     filler_count, _ = detect_fillers(transcript)
+
+     # Load audio
+     y, sr = librosa.load(file_path, sr=None)
+     duration = len(y) / sr if sr else 0.0
+     if duration <= 0:
+         raise ValueError("Audio duration invalid or zero.")
+
+     # Calculate pitch variation (in semitones)
+     f0, voiced_flags, voiced_probs = librosa.pyin(
+         y, sr=sr, fmin=80, fmax=400, frame_length=1024, hop_length=256, fill_na=np.nan)
+     voiced_f0 = f0[~np.isnan(f0)]
+     pitch_variation = 0.0
+     if voiced_f0.size > 0:
+         median_f0 = np.nanmedian(voiced_f0)
+         median_f0 = max(median_f0, 1e-6)
+         semitone_diffs = 12 * np.log2(voiced_f0 / median_f0)
+         pitch_variation = float(np.nanstd(semitone_diffs))
+
+     # Analyze pauses
+     long_pause_count = 0
+     if segments:
+         for i in range(len(segments) - 1):
+             pause_dur = segments[i + 1]["start"] - segments[i]["end"]
+             if pause_dur > 1.0:
+                 long_pause_count += 1
+         # Check beginning and end pauses
+         if segments[0]["start"] > 1.0:
+             long_pause_count += 1
+         if duration - segments[-1]["end"] > 1.0:
+             long_pause_count += 1
+
+     # Calculate WPM
+     word_count = len(transcript.split())
+     words_per_min = (word_count / duration) * 60.0 if duration > 0 else 0.0
+
+     # Calculate SRS - Speech Rate Stability
+     srs_score = calc_srs(
+         wpm=words_per_min,
+         filler_count=filler_count,
+         long_pause_count=long_pause_count,
+         pitch_variation=pitch_variation
+     )
+
+     # Calculate PAS - Pause Appropriateness Score
+     pas_result = calculate_pas(
+         transcript=transcript,
+         segments=segments,
+         filler_count=filler_count,
+         duration=duration
+     )
+     pas_score = pas_result["PAS"]
+
+     # Calculate final fluency score
+     fluency_result = calculate_fluency(srs=srs_score, pas=pas_score)
+     fluency_score = fluency_result["score"]
+     insight = get_fluency_insight(fluency_score)
+
+     # Build and return comprehensive result
+     return {
+         "fluency_score": fluency_score,
+         "insight": insight,
+         "SRS": srs_score,
+         "PAS": pas_score,
+         "components": {
+             "wpm": words_per_min,
+             "filler_count": filler_count,
+             "long_pause_count": long_pause_count,
+             "pitch_variation": pitch_variation,
+             "word_count": word_count,
+             "duration": duration,
+             "pas_components": pas_result
+         },
+         "transcript": transcript
+     }
fluency/filler_analyzer.py ADDED
@@ -0,0 +1,100 @@
+ # Define filler words for English, Hindi, Tamil (in both Latin and native scripts)
+ # Mapping each variant to a common label (usually the Latin script for insight reporting)
+ FILLER_VARIANTS = {
+     # English fillers
+     "um": "um", "uh": "uh", "hmm": "hmm", "ah": "ah", "er": "er",
+     "umm": "um", "uhh": "uh", "mmm": "hmm",
+     "like": "like", "you know": "you know", "so": "so", "well": "well",
+     # Hindi fillers (Devanagari and transliteration)
+     "मतलब": "matlab", "matlab": "matlab",
+     "क्या कहते हैं": "kya kehte hain", "kya kehte hain": "kya kehte hain",
+     "वो ना": "wo na", "woh na": "wo na", "wo na": "wo na",
+     "ऐसा है": "aisa hai", "aisa hai": "aisa hai",
+     "हाँ": "haan", "haan": "haan", "हा": "haan",  # "हा" might appear as a shorter "haan"
+     "अच्छा": "acha", "acha": "acha",
+     # Tamil fillers (Tamil script and transliteration)
+     "பாத்தீங்கனா": "paatheenga-na", "paatheenga na": "paatheenga-na", "paatheenga-na": "paatheenga-na",
+     "அப்பரம்": "apparam", "apparam": "apparam",
+     "என்ன": "enna", "enna": "enna"
+ }
+
+ def detect_fillers(transcript):
+     """
+     Detects filler words in the transcript.
+
+     Args:
+         transcript: Full transcript text
+
+     Returns:
+         tuple: (filler_count, filler_occurrences)
+     """
+     transcript_lower = transcript.lower()
+     filler_count = 0
+     # Track which specific fillers were used (for insight examples)
+     filler_occurrences = {}
+
+     for variant, label in FILLER_VARIANTS.items():
+         if variant in transcript_lower:
+             count = transcript_lower.count(variant)
+             if count > 0:
+                 filler_count += count
+                 # Accumulate count for the normalized label
+                 filler_occurrences[label] = filler_occurrences.get(label, 0) + count
+
+     return filler_count, filler_occurrences
+
+ def analyze_filler_words(filler_count, filler_occurrences, duration):
+     """
+     Analyzes filler word usage in speech.
+
+     Args:
+         filler_count: Total count of filler words
+         filler_occurrences: Dictionary of specific filler words and their counts
+         duration: Duration of the audio in seconds
+
+     Returns:
+         dict: Contains the filler words score and insight text
+     """
+     # Extract top examples for insights
+     filler_examples = []
+     if filler_occurrences:
+         # Sort by frequency
+         sorted_fillers = sorted(filler_occurrences.items(), key=lambda x: x[1], reverse=True)
+         for label, count in sorted_fillers[:2]:
+             filler_examples.append(label)
+
+     # Compute fillers per minute as a gauge
+     filler_per_min = (filler_count / duration) * 60.0 if duration > 0 else 0.0
+
+     if filler_count == 0:
+         filler_score = 10
+     elif filler_per_min < 1:
+         filler_score = 9
+     elif filler_per_min < 3:
+         filler_score = 8
+     elif filler_per_min < 5:
+         filler_score = 6
+     elif filler_per_min < 10:
+         filler_score = 4
+     else:
+         filler_score = 2
+
+     filler_score = max(0, filler_score)
+
+     # Generate insight text based on the score and examples
+     if filler_count == 0:
+         insight = "No filler words (um, ah, etc.) were detected, keeping the speech very clear."
+     elif filler_count <= 2:
+         example = filler_examples[0] if filler_examples else "um"
+         insight = f"Only a couple of filler words (e.g., '{example}') were used, which had minimal impact."
+     elif filler_count <= 5:
+         examples = ", ".join(f"'{ex}'" for ex in filler_examples) if filler_examples else "filler words"
+         insight = f"Some filler words {examples} were used occasionally; reducing them could improve clarity."
+     else:
+         examples = ", ".join(f"'{ex}'" for ex in filler_examples) if filler_examples else "'um'"
+         insight = f"Frequent filler words such as {examples} were detected, which can distract the audience and suggest uncertainty."
+
+     return {
+         "score": int(filler_score),
+         "insight": insight
+     }
fluency/fluency.py ADDED
@@ -0,0 +1,149 @@
+
+ import spacy
+ from typing import List, Dict
+
+ def calc_srs(wpm, filler_count, long_pause_count, pitch_variation):
+     """
+     Speech Rate Stability (SRS): Reflects the consistency of the speaker's pace and rhythm.
+
+     Args:
+         wpm (float): Words per minute
+         filler_count (int): Number of filler words ("um", "uh", etc.)
+         long_pause_count (int): Number of pauses longer than 1 second
+         pitch_variation (float): Standard deviation of pitch in semitones
+
+     Returns:
+         float: SRS score between 0-100
+
+     Requires:
+         - Words per Minute Consistency: Regularity in speech speed.
+         - Absence of Sudden Speed Shifts: Smooth transitions without erratic tempo changes.
+     """
+     ideal_wpm = 150
+     wpm_deviation = min(30, abs(wpm - ideal_wpm))  # Cap at 30 WPM deviation
+     wpm_consistency = max(0, 100 - (wpm_deviation * 1.67))  # 100-50 for max deviation
+
+     # Sudden Speech Shift Penalty
+     filler_penalty = min(filler_count / 10, 1.0)
+     pause_penalty = min(long_pause_count / 5, 1.0)
+     pitch_penalty = min(pitch_variation / 3.0, 1.0)  # High variation → unstable
+
+     # Combine into absence of sudden shifts
+     stability = (1 - ((filler_penalty + pause_penalty + pitch_penalty) / 3)) * 100
+
+     # Final SRS Score
+     SRS = (0.45 * wpm_consistency) + (0.55 * stability)
+     return min(100, max(0, SRS))
+
+
+ def calculate_pas(transcript: str, segments: List[Dict], filler_count: int, duration: float) -> Dict[str, float]:
+     """
+     Calculate the Pause Appropriateness Score (PAS) and its components.
+
+     Args:
+         transcript (str): Full transcript text
+         segments (List[Dict]): List of transcript segments with start/end times
+         filler_count (int): Number of filler words detected
+         duration (float): Total duration of audio in seconds
+
+     Returns:
+         Dict[str, float]: Dictionary with NPP, AFW, and PAS scores
+     """
+     if not transcript or not segments or duration <= 0:
+         raise ValueError("Transcript, segments, and duration must be valid")
+
+     nlp = spacy.load("en_core_web_sm")
+     doc = nlp(transcript)
+
+     words = transcript.split()
+     total_words = len(words)
+     if total_words == 0:
+         raise ValueError("No words found in transcript")
+
+     # Calculate Avoidance of Filler Words (AFW)
+     filler_rate = filler_count / total_words if total_words > 0 else 0.0
+     if filler_rate >= 0.10:
+         afw = 0.0
+     elif filler_rate <= 0.0:
+         afw = 100.0
+     else:
+         afw = 100.0 - (filler_rate * 1000)
+     afw = max(0.0, min(100.0, afw))
+
+     # Calculate Natural Pause Placement (NPP)
+     total_pauses = 0
+     natural_pauses = 0
+     segment_texts = [seg["text"].strip() for seg in segments]
+     segment_starts = [seg["start"] for seg in segments]
+     segment_ends = [seg["end"] for seg in segments]
+
+     for i in range(len(segments) - 1):
+         pause_dur = segment_starts[i + 1] - segment_ends[i]
+         if pause_dur > 0.5:
+             total_pauses += 1
+             if segment_texts[i] and segment_texts[i][-1] in ".!?,":
+                 natural_pauses += 1
+
+     # Check initial and final pauses
+     if segment_starts[0] > 0.5:
+         total_pauses += 1
+     if duration - segment_ends[-1] > 0.5:
+         total_pauses += 1
+         if segment_texts[-1] and segment_texts[-1][-1] in ".!?":
+             natural_pauses += 1
+
+     npp = 100.0 if total_pauses == 0 else (natural_pauses / total_pauses) * 100.0
+
+     # Calculate final PAS
+     pas = (0.4 * npp) + (0.6 * afw)
+
+     return {
+         "NPP": npp,
+         "AFW": afw,
+         "PAS": pas
+     }
+
+
+ def calculate_fluency(srs: float, pas: float) -> Dict[str, float]:
+     """
+     Calculate fluency score based on Speech Rate Stability and Pause Appropriateness Score.
+
+     Args:
+         srs (float): Speech Rate Stability score (0-100)
+         pas (float): Pause Appropriateness Score (0-100)
+
+     Returns:
+         Dict[str, float]: Dictionary with fluency score (0-100) and component contributions
+     """
+     # Equal weighting of SRS and PAS for fluency
+     fluency_score = (0.5 * srs) + (0.5 * pas)
+
+     return {
+         "score": fluency_score,
+         "SRS_contribution": 0.5 * srs,
+         "PAS_contribution": 0.5 * pas
+     }
+
+
+ def get_fluency_insight(fluency_score: float) -> str:
+     """
+     Generate insight text based on the fluency score.
+
+     Args:
+         fluency_score (float): The calculated fluency score (0-100)
+
+     Returns:
+         str: Insight text explaining the score
+     """
+     if fluency_score >= 85:
+         return "Excellent fluency with very consistent pacing and natural pauses. Speech flows effortlessly."
+     elif fluency_score >= 70:
+         return "Good fluency with generally stable speech rate and appropriate pauses. Some minor inconsistencies."
+     elif fluency_score >= 50:
+         return "Moderate fluency with occasional disruptions in speech flow. Consider working on pace stability and pause placement."
+     elif fluency_score >= 30:
+         return "Below average fluency with noticeable disruptions. Focus on reducing filler words and maintaining consistent pace."
+     else:
+         return "Speech fluency needs significant improvement. Work on maintaining consistent pace, reducing long pauses, and eliminating filler words."
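As a quick sanity check of the scoring defined above, the following sketch (assuming the package is importable as `fluency`, per its __init__.py) shows how SRS and the combined fluency score respond to an ideal pace with moderate pitch variation:

    from fluency import calc_srs, calculate_fluency

    # 150 WPM (the ideal), no fillers or long pauses, 2 semitones of pitch variation
    srs = calc_srs(wpm=150, filler_count=0, long_pause_count=0, pitch_variation=2.0)
    # wpm_consistency = 100; stability = (1 - (2/3) / 3) * 100 ≈ 77.8
    # SRS = 0.45 * 100 + 0.55 * 77.8 ≈ 87.8
    print(round(srs, 1))

    # With a PAS of 90, fluency is the equal-weight average of the two
    print(calculate_fluency(srs=srs, pas=90.0)["score"])  # ≈ 88.9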
fluency/fluency_api.py ADDED
@@ -0,0 +1,22 @@
+ import whisper
+ from .compute_fluency import compute_fluency_score
+
+ def main(file_path: str, model_size: str = "base") -> dict:
+     try:
+         whisper_model = whisper.load_model(model_size)
+
+         results = compute_fluency_score(file_path, whisper_model)
+
+         # Structure response
+         response = {
+             "fluency_score": round(results['fluency_score'], 2)
+             # "insight": results["insight"],
+             # "SRS": round(results["SRS"], 2),
+             # "PAS": round(results["PAS"], 2),
+             # "transcript": results["transcript"]
+         }
+         return response
+
+     except Exception as e:
+         raise RuntimeError(f"Error during analysis: {str(e)}")
fluency/main.py ADDED
@@ -0,0 +1,49 @@
+ import json
+ import whisper
+ from .compute_fluency import compute_fluency_score
+
+ def main():
+     """
+     Main function to run fluency analysis on audio files
+     """
+     # Fixed parameters - modify these values directly in the code
+     audio_file = r"D:\Intern\shankh\audio_samples\obama_short.wav"  # Path to your audio file
+     model_size = "base"  # Whisper model size (tiny, base, small, medium, large)
+     verbose = True  # Whether to print detailed results
+
+     try:
+         # Load whisper model
+         print(f"Loading Whisper model ({model_size})...")
+         whisper_model = whisper.load_model(model_size)
+
+         # Calculate fluency score
+         print(f"Analyzing fluency for {audio_file}...")
+         results = compute_fluency_score(audio_file, whisper_model)
+
+         # Print summary results
+         print("\nFluency Analysis Results:")
+         print(f"- Fluency Score: {results['fluency_score']:.2f}/100")
+         print(f"- Insight: {results['insight']}")
+         print(f"- Speech Rate Stability (SRS): {results['SRS']:.2f}/100")
+         print(f"- Pause Appropriateness (PAS): {results['PAS']:.2f}/100")
+
+         # Print verbose results if enabled
+         if verbose:
+             print("\nDetailed Metrics:")
+             print(f"- Words per minute: {results['components']['wpm']:.1f}")
+             print(f"- Filler word count: {results['components']['filler_count']}")
+             print(f"- Long pauses: {results['components']['long_pause_count']}")
+             print(f"- Pitch variation: {results['components']['pitch_variation']:.2f} semitones")
+             print(f"- Natural Pause Placement: {results['components']['pas_components']['NPP']:.2f}/100")
+             print(f"- Avoidance of Filler Words: {results['components']['pas_components']['AFW']:.2f}/100")
+
+         # Print first 100 characters of transcript
+         transcript_preview = results['transcript'][:100] + "..." if len(results['transcript']) > 100 else results['transcript']
+         print(f"\nTranscript preview: {transcript_preview}")
+
+     except Exception as e:
+         print(f"Error during analysis: {str(e)}")
+         return 1
+
+ if __name__ == "__main__":
+     exit(main())
requirements.txt ADDED
@@ -0,0 +1,23 @@
+
+ fastapi
+ uvicorn
+ python-multipart
+ pydub
+
+ librosa
+ soundfile
+ pyworld
+ scipy
+
+ openai-whisper==20240930
+ spacy==3.8.5
+ en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl
+ transformers
+ torch
+
+ numpy
+ tqdm
+ requests
+ assemblyai
tone_modulation/__init__.py ADDED
File without changes
tone_modulation/sds.py ADDED
@@ -0,0 +1,385 @@
+
+ import scipy.signal
+ import numpy as np
+ import librosa
+ import pyworld as pw
+
+ # def compute_pitch_variation(file_path):
+ #     # Step 1: Load audio
+ #     y, sr = librosa.load(file_path, sr=None)
+ #     y = y.astype(np.float64)  # pyworld expects float64
+
+ #     # Step 2: Extract pitch (F0)
+ #     _f0, t = pw.dio(y, sr)  # Fast initial pitch estimation
+ #     f0 = pw.stonemask(y, _f0, t, sr)  # Refinement step
+
+ #     # Step 3: Filter voiced frames
+ #     voiced_f0 = f0[f0 > 0]
+
+ #     # Handle empty case
+ #     if voiced_f0.size == 0:
+ #         return {
+ #             "pitch_mean": 0.0,
+ #             "pitch_std": 0.0,
+ #             "pitch_range": 0.0,
+ #             "semitone_std": 0.0,
+ #             "pitch_variation_score": 0.0
+ #         }
+
+ #     # Step 4: Basic statistics
+ #     pitch_mean = np.mean(voiced_f0)
+ #     pitch_std = np.std(voiced_f0)
+ #     pitch_range = np.max(voiced_f0) - np.min(voiced_f0)
+
+ #     print(pitch_mean)
+ #     print(f'voiced_f0: {voiced_f0}')
+ #     # Step 5: Compute semitone-based variation (better for human perception)
+ #     median_f0 = np.median(voiced_f0)
+ #     if median_f0 <= 0:
+ #         median_f0 = 1e-6  # Avoid division by zero
+
+ #     semitone_diffs = 12 * np.log2(voiced_f0 / median_f0)
+ #     semitone_std = np.std(semitone_diffs)
+ #     print(semitone_std)
+
+ #     # Step 6: Scale semitone_std to a 0–100 score (tunable)
+ #     # For example: semitone_std of 0 → 0 score, ≥6 semitones → 100 score
+ #     pitch_variation_score = np.clip((semitone_std / 6.0) * 100, 0, 100)
+
+ #     return {
+ #         "pitch_mean": pitch_mean,
+ #         "pitch_std": pitch_std,
+ #         "pitch_range": pitch_range,
+ #         "semitone_std": semitone_std,
+ #         "pitch_variation_score": pitch_variation_score
+ #     }
+
+ # def compute_intonation_range(file_path):
+ #     # Step 1: Load and prepare audio
+ #     y, sr = librosa.load(file_path, sr=None)
+ #     y = y.astype(np.float64)
+
+ #     # Step 2: Extract F0
+ #     _f0, t = pw.dio(y, sr)
+ #     f0 = pw.stonemask(y, _f0, t, sr)
+
+ #     # Step 3: Filter voiced frames
+ #     voiced_f0 = f0[f0 > 0]
+ #     if voiced_f0.size == 0:
+ #         return 0.0
+
+ #     voiced_f0 = voiced_f0[(voiced_f0 > np.percentile(voiced_f0, 5)) &
+ #                           (voiced_f0 < np.percentile(voiced_f0, 95))]
+
+ #     # Step 4: Compute intonation range (in semitones)
+ #     f0_min = np.min(voiced_f0)
+ #     f0_max = np.max(voiced_f0)
+ #     if f0_min <= 0:
+ #         f0_min = 1e-6  # to avoid log error
+ #     intonation_range = 12 * np.log2(f0_max / f0_min)
+
+ #     # range into scores:
+ #     max_range = 12.0
+ #     normalized = min(intonation_range, max_range) / max_range
+ #     score = normalized * 100
+ #     return round(score, 2), intonation_range
+
+
+ # def compute_pitch_variation(file_path):
+ #     # Step 1: Load audio
+ #     y, sr = librosa.load(file_path, sr=None)
+
+ #     # Step 2: Extract pitch using librosa.pyin (YIN-based)
+ #     f0, voiced_flags, voiced_probs = librosa.pyin(
+ #         y,
+ #         sr=sr,
+ #         fmin=80,
+ #         fmax=400,
+ #         frame_length=1105,
+ #         hop_length=256,
+ #         fill_na=np.nan
+ #     )
+
+ #     # Step 3: Filter voiced frames
+ #     voiced_f0 = f0[~np.isnan(f0)]
+
+ #     voiced_f0 = voiced_f0[
+ #         (voiced_f0 > np.percentile(voiced_f0, 5)) &
+ #         (voiced_f0 < np.percentile(voiced_f0, 95))
+ #     ]
+
+ #     # Handle empty case
+ #     if voiced_f0.size == 0:
+ #         return {
+ #             "pitch_mean": 0.0,
+ #             "pitch_std": 0.0,
+ #             "pitch_range": 0.0,
+ #             "semitone_std": 0.0,
+ #             "pitch_variation_score": 0.0
+ #         }
+
+ #     # Step 4: Basic statistics
+ #     pitch_mean = float(np.mean(voiced_f0))
+ #     pitch_std = float(np.std(voiced_f0))
+ #     pitch_range = float(np.max(voiced_f0) - np.min(voiced_f0))
+
+ #     # Step 5: Compute semitone-based variation
+ #     median_f0 = np.median(voiced_f0)
+ #     if median_f0 <= 0:
+ #         median_f0 = 1e-6
+
+ #     semitone_diffs = 12 * np.log2(voiced_f0 / median_f0)
+ #     semitone_std = float(np.std(semitone_diffs))
+
+ #     # Step 6: Scale to 0–100 score
+ #     pitch_variation_score = float(np.clip((semitone_std / 6.0) * 100, 0, 100))
+ #     return {
+ #         "pitch_mean": pitch_mean,
+ #         "pitch_std": pitch_std,
+ #         "pitch_range": pitch_range,
+ #         "semitone_std": semitone_std,
+ #         "pitch_variation_score": pitch_variation_score
+ #     }
+
+ # def compute_intonation_range(file_path):
+ #     # Step 1: Load and prepare audio
+ #     y, sr = librosa.load(file_path, sr=None)
+
+ #     # Step 2: Extract F0 using librosa.pyin
+ #     f0, voiced_flags, voiced_probs = librosa.pyin(
+ #         y,
+ #         sr=sr,
+ #         fmin=80,
+ #         fmax=400,
+ #         frame_length=1105,  # ensures two periods of fmin fit
+ #         hop_length=256,
+ #         fill_na=np.nan
+ #     )
+
+ #     # Step 3: Filter voiced frames
+ #     voiced_f0 = f0[~np.isnan(f0)]
+ #     if voiced_f0.size == 0:
+ #         return 0.0, 0.0
+
+ #     # Optional: remove outliers (5th to 95th percentile)
+ #     voiced_f0 = voiced_f0[
+ #         (voiced_f0 > np.percentile(voiced_f0, 5)) &
+ #         (voiced_f0 < np.percentile(voiced_f0, 95))
+ #     ]
+
+ #     # Step 4: Compute intonation range in semitones
+ #     f0_min = np.min(voiced_f0)
+ #     f0_max = np.max(voiced_f0)
+ #     if f0_min <= 0:
+ #         f0_min = 1e-6
+
+ #     intonation_range = 12 * np.log2(f0_max / f0_min)
+
+ #     # Step 5: Normalize and convert to score out of 100
+ #     max_range = 12.0  # ~1 octave
+ #     normalized = min(intonation_range, max_range) / max_range
+ #     score = normalized * 100
+
+ #     return round(score, 2), float(intonation_range)
+
+
+ # def compute_speech_rhythm_variability(file_path):
+ #     """
+ #     Computes the speech rhythm variability score from an audio file.
+ #     The method estimates tempo consistency across time using onset intervals.
+
+ #     Returns:
+ #         score (float): Normalized rhythm variability score out of 100.
+ #         raw_std (float): Raw standard deviation of inter-onset intervals.
+ #     """
+ #     # Step 1: Load audio
+ #     y, sr = librosa.load(file_path, sr=None)
+
+ #     # Step 2: Onset detection
+ #     onset_env = librosa.onset.onset_strength(y=y, sr=sr)
+ #     onsets = librosa.onset.onset_detect(onset_envelope=onset_env, sr=sr, units='time')
+
+ #     if len(onsets) < 2:
+ #         return 0.0, 0.0  # Not enough onsets to compute rhythm
+
+ #     # Step 3: Compute inter-onset intervals (IOIs) as rhythm proxy
+ #     iois = np.diff(onsets)
+
+ #     # Optional: Remove outliers (5th–95th percentile)
+ #     ioi_clean = iois[(iois > np.percentile(iois, 5)) & (iois < np.percentile(iois, 95))]
+ #     if len(ioi_clean) < 2:
+ #         return 0.0, 0.0
+
+ #     # Step 4: Compute variability — standard deviation of IOIs
+ #     raw_std = np.std(ioi_clean)
+
+ #     # Step 5: Normalize raw_std to 0–100 score
+ #     # Lower std = more consistent rhythm → higher score
+ #     min_std = 0.05  # near-perfect rhythm (tight pacing)
+ #     max_std = 0.6   # highly irregular rhythm
+
+ #     # Clamp and reverse-score
+ #     clamped_std = np.clip(raw_std, min_std, max_std)
+ #     normalized = 1 - (clamped_std - min_std) / (max_std - min_std)
+ #     score = normalized * 100
+
+ #     return round(score, 2), round(float(raw_std), 4)
+
+
+ # def calc_sds(file_path):
+ #     # sds = 0.35 * pitch_variation + 0.35 * intonation_range + 0.3 * speech_rhythm_variability
+ #     pitch_variation = compute_pitch_variation(file_path)
+ #     intonation_range = compute_intonation_range(file_path)
+ #     speech_rhythm_variability = compute_speech_rhythm_variability(file_path)
+ #     # print(f"Speech Rhythm Variability Score: {speech_rhythm_variability}")
+
+ #     sds = 0.35 * pitch_variation['pitch_variation_score'] + 0.35 * intonation_range[0] + 0.3 * speech_rhythm_variability[0]
+ #     return round(sds, 2)
+
+ # path = r'D:\Intern\shankh\audio_samples\anga.wav'
+
+ # result = calc_sds(path)
+ # print(f"SDS: {result}")
+
+ import numpy as np
+ import librosa
+ import pyworld
+
+ def compute_pitch_variation(file_path):
+     # Step 1: Load audio
+     y, sr = librosa.load(file_path, sr=None)
+
+     # Step 2: Extract pitch using pyworld
+     _f0, t = pyworld.harvest(y.astype(np.float64), sr, f0_floor=80.0, f0_ceil=400.0, frame_period=1000 * 256 / sr)
+     f0 = pyworld.stonemask(y.astype(np.float64), _f0, t, sr)
+
+     # Step 3: Filter voiced frames
+     voiced_f0 = f0[f0 > 0]
+
+     # Remove outliers (5th to 95th percentile)
+     voiced_f0 = voiced_f0[
+         (voiced_f0 > np.percentile(voiced_f0, 5)) &
+         (voiced_f0 < np.percentile(voiced_f0, 95))
+     ]
+
+     if voiced_f0.size == 0:
+         return {
+             "pitch_mean": 0.0,
+             "pitch_std": 0.0,
+             "pitch_range": 0.0,
+             "semitone_std": 0.0,
+             "pitch_variation_score": 0.0
+         }
+
+     # Step 4: Basic statistics
+     pitch_mean = float(np.mean(voiced_f0))
+     pitch_std = float(np.std(voiced_f0))
+     pitch_range = float(np.max(voiced_f0) - np.min(voiced_f0))
+
+     # Step 5: Semitone-based variation
+     median_f0 = np.median(voiced_f0)
+     if median_f0 <= 0:
+         median_f0 = 1e-6
+     semitone_diffs = 12 * np.log2(voiced_f0 / median_f0)
+     semitone_std = float(np.std(semitone_diffs))
+
+     # Step 6: Scaled variation score
+     pitch_variation_score = float(np.clip((semitone_std / 6.0) * 100, 0, 100))
+
+     return {
+         "pitch_mean": pitch_mean,
+         "pitch_std": pitch_std,
+         "pitch_range": pitch_range,
+         "semitone_std": semitone_std,
+         "pitch_variation_score": pitch_variation_score
+     }
+
+
+ def compute_intonation_range(file_path):
+     # Step 1: Load audio
+     y, sr = librosa.load(file_path, sr=None)
+
+     # Step 2: Extract pitch using pyworld
+     _f0, t = pyworld.harvest(y.astype(np.float64), sr, f0_floor=80.0, f0_ceil=400.0, frame_period=1000 * 256 / sr)
+     f0 = pyworld.stonemask(y.astype(np.float64), _f0, t, sr)
+
+     # Step 3: Filter voiced frames
+     voiced_f0 = f0[f0 > 0]
+     if voiced_f0.size == 0:
+         return 0.0, 0.0
+
+     # Remove outliers
+     voiced_f0 = voiced_f0[
+         (voiced_f0 > np.percentile(voiced_f0, 5)) &
+         (voiced_f0 < np.percentile(voiced_f0, 95))
+     ]
+     if voiced_f0.size == 0:
+         return 0.0, 0.0
+
+     # Step 4: Compute intonation range
+     f0_min = np.min(voiced_f0)
+     f0_max = np.max(voiced_f0)
+     if f0_min <= 0:
+         f0_min = 1e-6
+     intonation_range = 12 * np.log2(f0_max / f0_min)
+
+     # Step 5: Normalize
+     max_range = 12.0
+     normalized = min(intonation_range, max_range) / max_range
+     score = normalized * 100
+
+     return round(score, 2), float(intonation_range)
+
+
+ def compute_speech_rhythm_variability(file_path):
+     """
+     Computes the speech rhythm variability score from an audio file.
+     The method estimates tempo consistency across time using onset intervals.
+     """
+     y, sr = librosa.load(file_path, sr=None)
+
+     # Step 2: Onset detection
+     onset_env = librosa.onset.onset_strength(y=y, sr=sr)
+     onsets = librosa.onset.onset_detect(onset_envelope=onset_env, sr=sr, units='time')
+
+     if len(onsets) < 2:
+         return 0.0, 0.0
+
+     iois = np.diff(onsets)
+
+     ioi_clean = iois[(iois > np.percentile(iois, 5)) & (iois < np.percentile(iois, 95))]
+     if len(ioi_clean) < 2:
+         return 0.0, 0.0
+
+     raw_std = np.std(ioi_clean)
+
+     min_std = 0.05
+     max_std = 0.6
+     clamped_std = np.clip(raw_std, min_std, max_std)
+     normalized = 1 - (clamped_std - min_std) / (max_std - min_std)
+     score = normalized * 100
+
+     return round(score, 2), round(float(raw_std), 4)
+
+
+ def calc_sds(file_path):
+     pitch_variation = compute_pitch_variation(file_path)
+     intonation_range = compute_intonation_range(file_path)
+     speech_rhythm_variability = compute_speech_rhythm_variability(file_path)
+
+     sds = 0.35 * pitch_variation['pitch_variation_score'] + \
+           0.35 * intonation_range[0] + \
+           0.3 * speech_rhythm_variability[0]
+
+     return round(sds, 2)
tone_modulation/tone_api.py ADDED
@@ -0,0 +1,23 @@
+
+ from .sds import calc_sds
+
+ import logging
+ logger = logging.getLogger(__name__)
+
+ def main(file_path: str) -> dict:
+     logger.info(f"Starting tone analysis for: {file_path}")
+     try:
+         results = calc_sds(file_path)
+
+         # Structure response
+         response = {
+             "speech_dynamism_score": round(results, 2),
+         }
+         logger.info("Tone analysis complete")
+         return response
+
+     except Exception as e:
+         logger.error(f"Tone analysis failed internally: {e}", exc_info=True)
+         raise RuntimeError(f"Error during analysis: {str(e)}")
transcribe.py ADDED
@@ -0,0 +1,32 @@
+ import assemblyai as aai
+
+ aai.settings.api_key = "2c02e1bdab874068bdcfb2e226f048a4"  # Use env var in production
+
+ def transcribe_audio(file_path: str, model_size=None) -> tuple[str, str, float]:
+     print(f"Transcribing audio file: {file_path} with language detection")
+
+     config = aai.TranscriptionConfig(
+         speech_model=aai.SpeechModel.nano,
+         language_detection=True,
+         language_confidence_threshold=0.4
+     )
+
+     transcriber = aai.Transcriber()
+
+     transcript = transcriber.transcribe(file_path, config)
+
+     if transcript.status == "error":
+         raise RuntimeError(f"Transcription failed: {transcript.error}")
+
+     # Access detected language and confidence from json_response
+     response = transcript.json_response
+     language = response.get("language_code")
+     confidence = response.get("language_confidence")
+
+     result = {
+         "transcript": transcript.text,
+         "language": language,
+         "confidence": confidence
+     }
+
+     return transcript.text, language, confidence
vcs/__init__.py ADDED
File without changes
vcs/compute_vcs.py ADDED
@@ -0,0 +1,117 @@
+ """
+ Compute Voice Clarity Score from audio file
+ """
+
+ import librosa
+ import numpy as np
+ from typing import Dict, Any
+ from .vcs import calculate_voice_clarity_score, get_clarity_insight
+
+ def compute_voice_clarity_score(file_path: str, whisper_model) -> Dict[str, Any]:
+     """
+     Compute Voice Clarity Score and its components from a speech sample.
+
+     Args:
+         file_path (str): Path to the audio file.
+         whisper_model: Transcription model (e.g., OpenAI Whisper or faster-whisper)
+
+     Returns:
+         dict: A dictionary containing Voice Clarity Score and component scores.
+     """
+     # Transcribe audio
+     result = whisper_model.transcribe(file_path)
+     transcript = result.get("text", "").strip()
+     segments = result.get("segments", [])
+
+     # Validate early
+     if not transcript or not segments:
+         raise ValueError("Empty transcript or segments from Whisper.")
+
+     # Load audio
+     y, sr = librosa.load(file_path, sr=None)
+     duration = len(y) / sr if sr else 0.0
+     if duration <= 0:
+         raise ValueError("Audio duration invalid or zero.")
+
+     # Calculate Voice Clarity Score
+     clarity_result = calculate_voice_clarity_score(y, sr, segments)
+
+     # Add transcript to results
+     clarity_result["transcript"] = transcript
+
+     # Add word count and duration info for reference
+     word_count = len(transcript.split())
+     clarity_result["components"]["word_count"] = word_count
+     clarity_result["components"]["duration"] = duration
+
+     return clarity_result
+
+ def analyze_voice_quality(file_path: str, whisper_model) -> Dict[str, Any]:
+     """
+     Comprehensive voice quality analysis including clarity.
+
+     Args:
+         file_path (str): Path to the audio file
+         whisper_model: Transcription model
+
+     Returns:
+         Dict[str, Any]: Complete voice quality analysis
+     """
+     # Get Voice Clarity Score
+     clarity_results = compute_voice_clarity_score(file_path, whisper_model)
+     vcs = clarity_results["VCS"]
+
+     # Load audio for additional analysis
+     y, sr = librosa.load(file_path, sr=None)
+
+     # Calculate additional voice quality metrics
+
+     # Voice stability - based on pitch (F0) stability
+     f0, voiced_flags, voiced_probs = librosa.pyin(
+         y, sr=sr, fmin=80, fmax=400, frame_length=1024, hop_length=256, fill_na=np.nan)
+     voiced_f0 = f0[~np.isnan(f0)]
+
+     pitch_stability = 0.0
+     if voiced_f0.size > 0:
+         # Calculate coefficient of variation (lower is more stable)
+         cv = np.std(voiced_f0) / np.mean(voiced_f0) if np.mean(voiced_f0) > 0 else float('inf')
+         # Convert to score (0-100)
+         pitch_stability = max(0, min(100, 100 - (cv * 100)))
+
+     # Voice resonance - based on spectral bandwidth
+     bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr))
+     # Normalize (ideal range is around 1500-2500 Hz for speech)
+     if bandwidth < 1000:
+         resonance_score = max(0, bandwidth / 1000 * 70)  # Too narrow
+     elif bandwidth <= 2500:
+         resonance_score = 70 + ((bandwidth - 1000) / 1500 * 30)  # Optimal range
+     else:
+         resonance_score = max(0, 100 - ((bandwidth - 2500) / 2500 * 50))  # Too wide
+
+     # Voice strength - based on RMS energy
+     rms = np.mean(librosa.feature.rms(y=y))
+     # Normalize (typical speech RMS values range from 0.01 to 0.2)
+     strength_score = min(100, max(0, rms / 0.2 * 100))
+
+     # Combine additional metrics
+     additional_metrics = {
+         "pitch_stability": pitch_stability,
+         "voice_resonance": resonance_score,
+         "voice_strength": strength_score
+     }
+
+     # Add to results
+     combined_results = {
+         "VCS": vcs,
+         "insight": clarity_results["insight"],
+         "components": {
+             **clarity_results["components"],
+             **additional_metrics
+         },
+         "transcript": clarity_results["transcript"]
+     }
+
+     return combined_results
+
+ # Ensure the functions are exposed when imported
+ __all__ = ['compute_voice_clarity_score', 'analyze_voice_quality']
vcs/main.py ADDED
@@ -0,0 +1,49 @@
1
+ import json
2
+ import whisper
3
+ from .compute_vcs import analyze_voice_quality
4
+
5
+ def main():
6
+ """
7
+ Main function to run voice clarity analysis on audio files
8
+ """
9
+ # Fixed parameters - modify these values directly in the code
10
+ audio_file = r"D:\Intern\shankh\audio_samples\obama_short.wav" # Path to your audio file
11
+ model_size = "base" # Whisper model size (tiny, base, small, medium, large)
12
+ verbose = True # Whether to print detailed results
13
+
14
+ try:
15
+ # Load whisper model
16
+ print(f"Loading Whisper model ({model_size})...")
17
+ whisper_model = whisper.load_model(model_size)
18
+
19
+ # Calculate voice clarity score
20
+ print(f"Analyzing voice clarity for {audio_file}...")
21
+ results = analyze_voice_quality(audio_file, whisper_model)
22
+
23
+ # Print summary results
24
+ print("\nVoice Quality Analysis Results:")
25
+ print(f"- Voice Clarity Score (VCS): {results['VCS']:.2f}/100")
26
+ print(f"- Insight: {results['insight']}")
27
+ print(f"- Articulation: {results['components']['articulation']:.2f}/100")
28
+ print(f"- Enunciation: {results['components']['enunciation']:.2f}/100")
29
+ print(f"- Speech Pause Control: {results['components']['speech_pause_control']:.2f}/100")
30
+
31
+ # Print verbose results if enabled
32
+ if verbose:
33
+ print("\nDetailed Metrics:")
34
+ print(f"- Pitch Stability: {results['components']['pitch_stability']:.2f}/100")
35
+ print(f"- Voice Resonance: {results['components']['voice_resonance']:.2f}/100")
36
+ print(f"- Voice Strength: {results['components']['voice_strength']:.2f}/100")
37
+ print(f"- Word Count: {results['components']['word_count']}")
38
+ print(f"- Duration: {results['components']['duration']:.2f} seconds")
39
+
40
+ # Print first 100 characters of transcript
41
+ transcript_preview = results['transcript'][:100] + "..." if len(results['transcript']) > 100 else results['transcript']
42
+ print(f"\nTranscript preview: {transcript_preview}")
43
+
44
+ except Exception as e:
45
+ print(f"Error during analysis: {str(e)}")
46
+ return 1
47
+
48
+ if __name__ == "__main__":
49
+ exit(main())
vcs/vcs.py ADDED
@@ -0,0 +1,176 @@
1
+ """
2
+ Voice Clarity Score calculation module
3
+ """
4
+
5
+ import librosa
6
+ import numpy as np
7
+ from typing import Dict, Any, List
8
+ import soundfile as sf
9
+
10
+ def calculate_articulation(y: np.ndarray, sr: int) -> float:
11
+ """
12
+ Calculate articulation quality based on spectral contrast.
13
+
14
+ Articulation refers to how clearly individual phonemes are produced.
15
+
16
+ Args:
17
+ y (np.ndarray): Audio signal
18
+ sr (int): Sample rate
19
+
20
+ Returns:
21
+ float: Articulation score (0-100)
22
+ """
23
+ # Extract spectral contrast
24
+ # Higher contrast between peaks and valleys in the spectrum generally correlates with clearer articulation
25
+ S = np.abs(librosa.stft(y))
26
+ contrast = librosa.feature.spectral_contrast(S=S, sr=sr)
27
+
28
+ # Average across frequency bands and frames
29
+ mean_contrast = np.mean(contrast)
30
+
31
+ # Normalize to 0-100 scale (empirically determined range)
32
+ # Typical values range from 10-50 dB
33
+ min_contrast = 10
34
+ max_contrast = 50
35
+ normalized_contrast = min(100, max(0, (mean_contrast - min_contrast) / (max_contrast - min_contrast) * 100))
36
+
37
+ return normalized_contrast
38
+
39
+ def calculate_enunciation(y: np.ndarray, sr: int) -> float:
40
+ """
41
+ Calculate enunciation quality based on formant clarity and spectral flatness.
42
+
43
+ Enunciation is the precision in pronouncing vowels and consonants.
44
+
45
+ Args:
46
+ y (np.ndarray): Audio signal
47
+ sr (int): Sample rate
48
+
49
+ Returns:
50
+ float: Enunciation score (0-100)
51
+ """
52
+ # Compute spectral flatness - lower values indicate clearer formants and better enunciation
53
+ flatness = np.mean(librosa.feature.spectral_flatness(y=y))
54
+
55
+ # Compute spectral centroid - related to "brightness" or articulation clarity
56
+ centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
57
+
58
+ # Normalize flatness (lower is better for speech) - range typically 0.01-0.5
59
+ norm_flatness = max(0, min(100, (0.5 - flatness) / 0.5 * 100))
60
+
61
+ # Normalize centroid (mid-range is better for clear speech) - typically 1000-4000 Hz for clear speech
62
+ ideal_centroid = 2500 # Hz
63
+ centroid_deviation = abs(centroid - ideal_centroid) / 2000 # Normalized by expected deviation
64
+ norm_centroid = max(0, min(100, (1 - centroid_deviation) * 100))
65
+
66
+ # Combine the two metrics (with more weight on flatness)
67
+ enunciation_score = (0.7 * norm_flatness) + (0.3 * norm_centroid)
68
+
69
+ return enunciation_score
70
+
71
+ def calculate_speech_pause_control(segments: List[Dict]) -> float:
72
+ """
73
+ Calculate how effectively pauses are integrated in speech.
74
+
75
+ Speech pause control refers to the natural vs. abrupt pauses in speech.
76
+
77
+ Args:
78
+ segments (List[Dict]): List of transcript segments with timing information
79
+
80
+ Returns:
81
+ float: Speech pause control score (0-100)
82
+ """
83
+ if len(segments) < 2:
84
+ return 100.0 # Not enough segments to evaluate pauses
85
+
86
+ pause_durations = []
87
+ for i in range(len(segments) - 1):
88
+ pause_dur = segments[i + 1]["start"] - segments[i]["end"]
89
+ if pause_dur > 0.05: # Only consider actual pauses
90
+ pause_durations.append(pause_dur)
91
+
92
+ if not pause_durations:
93
+ return 100.0 # No significant pauses detected
94
+
95
+ # Calculate the standard deviation of pause durations
96
+ # More consistent pauses indicate better control
97
+ pause_std = np.std(pause_durations)
98
+
99
+ # Calculate proportion of very long pauses (potentially awkward)
100
+ long_pauses = sum(1 for d in pause_durations if d > 2.0)
101
+ long_pause_ratio = long_pauses / len(pause_durations) if pause_durations else 0
102
+
103
+ # Normalize std dev (lower is better, but not too low)
104
+ # Ideal range is around 0.2-0.5 seconds
105
+ if pause_std < 0.1:
106
+ std_score = 70 # Too consistent might sound robotic
107
+ elif pause_std < 0.5:
108
+ std_score = 100 - ((pause_std - 0.1) / 0.4 * 30) # Scale 70-100
109
+ else:
110
+ std_score = max(0, 70 - ((pause_std - 0.5) / 2.0 * 70)) # Scale down from 70
111
+
112
+ # Penalize for too many long pauses
113
+ long_pause_penalty = long_pause_ratio * 50
114
+
115
+ # Final score
116
+ pause_control_score = max(0, min(100, std_score - long_pause_penalty))
117
+
118
+ return pause_control_score
119
+
120
+ def calculate_voice_clarity_score(y: np.ndarray, sr: int, segments: List[Dict]) -> Dict[str, Any]:
121
+ """
122
+ Calculate the Voice Clarity Score (VCS) and its components.
123
+
124
+ VCS reflects the clarity and intelligibility of speech.
125
+
126
+ Args:
127
+ y (np.ndarray): Audio signal
128
+ sr (int): Sample rate
129
+ segments (List[Dict]): List of transcript segments with timing information
130
+
131
+ Returns:
132
+ Dict[str, Any]: Dictionary with VCS and component scores
133
+ """
134
+ # Calculate component scores
135
+ articulation_score = calculate_articulation(y, sr)
136
+ enunciation_score = calculate_enunciation(y, sr)
137
+ speech_pause_control_score = calculate_speech_pause_control(segments)
138
+
139
+ # Calculate Voice Clarity Score using the formula from the paper
140
+ vcs = (0.45 * articulation_score) + (0.35 * enunciation_score) + (0.2 * speech_pause_control_score)
141
+
142
+ # Create result dictionary
143
+ result = {
144
+ "VCS": vcs,
145
+ "components": {
146
+ "articulation": articulation_score,
147
+ "enunciation": enunciation_score,
148
+ "speech_pause_control": speech_pause_control_score
149
+ }
150
+ }
151
+
152
+ # Add interpretation
153
+ result["insight"] = get_clarity_insight(vcs)
154
+
155
+ return result
156
+
157
+ def get_clarity_insight(vcs: float) -> str:
158
+ """
159
+ Generate insight text based on the Voice Clarity Score.
160
+
161
+ Args:
162
+ vcs (float): Voice Clarity Score (0-100)
163
+
164
+ Returns:
165
+ str: Insight text explaining the score
166
+ """
167
+ if vcs >= 85:
168
+ return "Excellent voice clarity with precise articulation and well-controlled pauses. Speech is highly intelligible and pleasant to listen to."
169
+ elif vcs >= 70:
170
+ return "Good voice clarity with clear pronunciation and generally appropriate pauses. Minor improvements could enhance overall clarity."
171
+ elif vcs >= 50:
172
+ return "Moderate voice clarity with some articulation issues or irregular pauses. Focus on clearer pronunciation and more natural pausing."
173
+ elif vcs >= 30:
174
+ return "Below average clarity with noticeable articulation problems or awkward pausing patterns. Consider speech exercises to improve clarity."
175
+ else:
176
+ return "Speech clarity needs significant improvement. Articulation is unclear and pausing patterns disrupt intelligibility. Speech therapy exercises may be beneficial."
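As a quick sanity check of the weighting in calculate_voice_clarity_score, here is a hand-worked example with assumed component scores (values are illustrative only):

    articulation, enunciation, pause_control = 80.0, 70.0, 90.0
    vcs = 0.45 * articulation + 0.35 * enunciation + 0.2 * pause_control
    # 36.0 + 24.5 + 18.0 = 78.5 -> falls in the "Good voice clarity" band (70-85)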
vcs/vcs_api.py ADDED
@@ -0,0 +1,21 @@
1
+ import whisper
2
+ from .compute_vcs import analyze_voice_quality
3
+
4
+ def main(file_path: str, model_size: str = "base") -> dict:
5
+ try:
6
+
7
+ whisper_model = whisper.load_model(model_size)
8
+
9
+ results = analyze_voice_quality(file_path, whisper_model)
10
+
11
+ # Structure response
12
+ response = {
13
+ "Voice Clarity Score": round(results['VCS'], 2)
14
+ # "Articulation": round(results['components']['articulation'],2),
15
+ # "Enunciation": round(results['components']['enunciation'],2),
16
+ # "Speech Pause Control": round(results['components']['speech_pause_control'],2),
17
+ }
18
+ return response
19
+
20
+ except Exception as e:
21
+ raise RuntimeError(f"Error during analysis: {str(e)}")
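Each *_api module in this commit exposes the same main(file_path, model_size) entry point; a minimal sketch of calling this one directly (the audio path is a placeholder):

    from vcs.vcs_api import main as analyze_vcs

    print(analyze_vcs("sample.wav", model_size="base"))
    # e.g. {"Voice Clarity Score": 78.5}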
vers/__init__.py ADDED
File without changes
vers/compute_vers_score.py ADDED
@@ -0,0 +1,82 @@
1
+ from .vers import calc_vers
2
+ import librosa
3
+ import numpy as np
4
+ import math
5
+ from .filler_analyzer import detect_fillers
6
+ from .find_valence import get_valence_score
7
+
8
+ def compute_vers_score(file_path: str, whisper_model) -> dict:
9
+ """
10
+ Compute VERS (Vocal Emotional Regulation Score) and its components from a speech sample.
11
+ """
12
+ result = whisper_model.transcribe(file_path)
13
+ transcript = result.get("text", "").strip()
14
+ segments = result.get("segments", [])
15
+
16
+
17
+
18
+ # Filler count
19
+ filler_count, _ = detect_fillers(transcript)
20
+
21
+ # Load audio
22
+ y, sr = librosa.load(file_path, sr=None)
23
+ duration = len(y) / sr if sr else 0.0
24
+
25
+ # Volume (RMS)
26
+ rms = librosa.feature.rms(y=y)[0]
27
+ mean_rms = float(np.mean(rms))
28
+ mean_volume_db = 20 * math.log10(mean_rms + 1e-6) if mean_rms > 0 else -80.0
29
+ volume_std = np.std(20 * np.log10(rms + 1e-6))
30
+
31
+ # Max volume
32
+ vol_max = np.max(np.abs(y)) if y.size > 0 else 0.0
33
+ vol_max_db = 20 * math.log10(vol_max + 1e-6) if vol_max > 0 else -80.0
34
+
35
+ # Pitch variation
36
+ f0, voiced_flags, voiced_probs = librosa.pyin(
37
+ y, sr=sr, fmin=80, fmax=400, frame_length=1024, hop_length=256, fill_na=np.nan)
38
+ voiced_f0 = f0[~np.isnan(f0)]
39
+ pitch_variation = 0.0
40
+ if voiced_f0.size > 0:
41
+ median_f0 = np.nanmedian(voiced_f0)
42
+ median_f0 = max(median_f0, 1e-6)
43
+ semitone_diffs = 12 * np.log2(voiced_f0 / median_f0)
44
+ pitch_variation = float(np.nanstd(semitone_diffs))
45
+
46
+ # Pause analysis
47
+ total_speaking_time = 0.0
48
+ long_pause_count = 0
49
+ if segments:
50
+ for seg in segments:
51
+ total_speaking_time += (seg["end"] - seg["start"])
52
+ for i in range(len(segments) - 1):
53
+ pause_dur = segments[i+1]["start"] - segments[i]["end"]
54
+ if pause_dur > 1.0:
55
+ long_pause_count += 1
56
+ first_start = segments[0]["start"]
57
+ last_end = segments[-1]["end"]
58
+ if first_start > 1.0:
59
+ long_pause_count += 1
60
+ if duration - last_end > 1.0:
61
+ long_pause_count += 1
62
+
63
+ # WPM
64
+ words = transcript.split()
65
+ word_count = len(words)
66
+ words_per_min = (word_count / duration) * 60.0 if duration > 0 else 0.0
67
+
68
+
69
+ valence_scores = get_valence_score(file_path)
70
+
71
+ # Calculate VERS
72
+ vers_result = calc_vers(
73
+ filler_count=filler_count,
74
+ long_pause_count=long_pause_count,
75
+ pitch_variation=pitch_variation,
76
+ mean_volume_db=mean_volume_db,
77
+ vol_max_db=vol_max_db,
78
+ wpm=words_per_min,
79
+ volume_std=volume_std,
80
+ valence_scores=valence_scores
81
+ )
82
+ return vers_result
vers/filler_analyzer.py ADDED
@@ -0,0 +1,101 @@
1
+ # Define filler words for English, Hindi, Tamil (in both Latin and native scripts)
2
+ # Mapping each variant to a common label (usually the Latin script for insight reporting)
3
+ FILLER_VARIANTS = {
4
+ # English fillers
5
+ "um": "um", "uh": "uh", "hmm": "hmm", "ah": "ah", "er": "er",
6
+ "umm": "um", "uhh": "uh", "mmm": "hmm",
7
+ "like": "like", "you know": "you know", "so": "so", "well": "well",
8
+ # Hindi fillers (Devanagari and transliteration)
9
+ "मतलब": "matlab", "matlab": "matlab",
10
+ "क्या कहते हैं": "kya kehte hain", "kya kehte hain": "kya kehte hain",
11
+ "वो ना": "wo na", "woh na": "wo na", "wo na": "wo na",
12
+ "ऐसा है": "aisa hai", "aisa hai": "aisa hai",
13
+ "हाँ": "haan", "haan": "haan", "हा": "haan", # "हा" might appear as a shorter "haan"
14
+ "अच्छा": "acha", "acha": "acha",
15
+ # Tamil fillers (Tamil script and transliteration)
16
+ "பாத்தீங்கனா": "paatheenga-na", "paatheenga na": "paatheenga-na", "paatheenga-na": "paatheenga-na",
17
+ "அப்பரம்": "apparam", "apparam": "apparam",
18
+ "என்ன": "enna", "enna": "enna"
19
+ }
20
+
21
+ def detect_fillers(transcript):
22
+ """
23
+ Detects filler words in the transcript.
24
+
25
+ Args:
26
+ transcript: Full transcript text
27
+
28
+ Returns:
29
+ tuple: (filler_count, filler_occurrences)
30
+ """
31
+ transcript_lower = transcript.lower()
32
+ filler_count = 0
33
+ # Track which specific fillers were used (for insight examples)
34
+ filler_occurrences = {}
35
+
36
+ for variant, label in FILLER_VARIANTS.items():
37
+ if variant in transcript_lower:
38
+ count = transcript_lower.count(variant)
39
+ if count > 0:
40
+ filler_count += count
41
+ # Accumulate count for the normalized label
42
+ filler_occurrences[label] = filler_occurrences.get(label, 0) + count
43
+
44
+ return filler_count, filler_occurrences
45
+
46
+
47
+ def analyze_filler_words(filler_count, filler_occurrences, duration):
48
+ """
49
+ Analyzes filler word usage in speech.
50
+
51
+ Args:
52
+ filler_count: Total count of filler words
53
+ filler_occurrences: Dictionary of specific filler words and their counts
54
+ duration: Duration of the audio in seconds
55
+
56
+ Returns:
57
+ dict: Contains the filler words score and insight text
58
+ """
59
+ # Extract top examples for insights
60
+ filler_examples = []
61
+ if filler_occurrences:
62
+ # Sort by frequency
63
+ sorted_fillers = sorted(filler_occurrences.items(), key=lambda x: x[1], reverse=True)
64
+ for label, count in sorted_fillers[:2]:
65
+ filler_examples.append(label)
66
+
67
+ # Compute fillers per minute as a gauge
68
+ filler_per_min = (filler_count / duration) * 60.0 if duration > 0 else 0.0
69
+
70
+ if filler_count == 0:
71
+ filler_score = 10
72
+ elif filler_per_min < 1:
73
+ filler_score = 9
74
+ elif filler_per_min < 3:
75
+ filler_score = 8
76
+ elif filler_per_min < 5:
77
+ filler_score = 6
78
+ elif filler_per_min < 10:
79
+ filler_score = 4
80
+ else:
81
+ filler_score = 2
82
+
83
+ filler_score = max(0, filler_score)
84
+
85
+ # Generate insight text based on the score and examples
86
+ if filler_count == 0:
87
+ insight = "No filler words (um, ah, etc.) were detected, keeping the speech very clear."
88
+ elif filler_count <= 2:
89
+ example = filler_examples[0] if filler_examples else "um"
90
+ insight = f"Only a couple of filler words (e.g., '{example}') were used, which had minimal impact."
91
+ elif filler_count <= 5:
92
+ examples = ", ".join(f"'{ex}'" for ex in filler_examples) if filler_examples else "filler words"
93
+ insight = f"Some filler words {examples} were used occasionally; reducing them could improve clarity."
94
+ else:
95
+ examples = ", ".join(f"'{ex}'" for ex in filler_examples) if filler_examples else "'um'"
96
+ insight = f"Frequent filler words such as {examples} were detected, which can distract the audience and suggest uncertainty."
97
+
98
+ return {
99
+ "score": int(filler_score),
100
+ "insight": insight
101
+ }
vers/find_valence.py ADDED
@@ -0,0 +1,100 @@
1
+ # from transformers.models.wav2vec2 import Wav2Vec2Model, Wav2Vec2FeatureExtractor
2
+ # import torchaudio
3
+ # import torch
4
+ # import torch.nn as nn
5
+
6
+
7
+
8
+ def get_valence_score(file_path):
9
+ # class VADPredictor(nn.Module):
10
+ # """Model to predict VAD Scores"""
11
+ # def __init__(self, pretrained_model_name="facebook/wav2vec2-base-960h", freeze_feature_extractor=True):
12
+ # super(VADPredictor, self).__init__()
13
+
14
+ # self.wav2vec2 = Wav2Vec2Model.from_pretrained(pretrained_model_name)
15
+
16
+ # if freeze_feature_extractor:
17
+ # for param in self.wav2vec2.feature_extractor.parameters():
18
+ # param.requires_grad = False
19
+
20
+ # hidden_size = self.wav2vec2.config.hidden_size
21
+
22
+ # self.valence_layers = nn.Sequential(
23
+ # nn.Linear(hidden_size, 256),
24
+ # nn.ReLU(),
25
+ # nn.Dropout(0.3),
26
+ # nn.Linear(256,64),
27
+ # nn.Linear(64,1)
28
+ # )
29
+ # self.arousal_layers = nn.Sequential(
30
+ # nn.Linear(hidden_size, 256),
31
+ # nn.ReLU(),
32
+ # nn.Dropout(0.3),
33
+ # nn.Linear(256,64),
34
+ # nn.Linear(64,1)
35
+ # )
36
+ # self.dominance_layers = nn.Sequential(
37
+ # nn.Linear(hidden_size, 256),
38
+ # nn.ReLU(),
39
+ # nn.Dropout(0.3),
40
+ # nn.Linear(256,64),
41
+ # nn.Linear(64,1)
42
+ # )
43
+
44
+ # def forward(self, input_values, attention_mask=None):
45
+ # outputs = self.wav2vec2(input_values, attention_mask=attention_mask)
46
+ # last_hidden_state = outputs.last_hidden_state
47
+ # pooled_output = torch.mean(last_hidden_state, dim=1)
48
+
49
+ # valence = self.valence_layers(pooled_output)
50
+ # arousal = self.arousal_layers(pooled_output)
51
+ # dominance = self.dominance_layers(pooled_output)
52
+
53
+ # return {
54
+ # 'valence': valence.squeeze(-1),
55
+ # 'arousal': arousal.squeeze(-1),
56
+ # 'dominance': dominance.squeeze(-1)
57
+ # }
58
+
59
+
60
+ # model = VADPredictor()
61
+ # model.load_state_dict(torch.load(r"D:\Intern\shankh\DUMP\vad_predictor_model.pt", map_location=torch.device("cpu")))
62
+ # model.eval()
63
+
64
+ # feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
65
+
66
+ # # Load and process audio
67
+ # file_path = file_path
68
+ # waveform, sr = torchaudio.load(file_path)
69
+
70
+ # # Convert to mono
71
+ # if waveform.shape[0] > 1:
72
+ # waveform = waveform.mean(dim=0, keepdim=True)
73
+
74
+ # # Resample to 16000 Hz
75
+ # if sr != 16000:
76
+ # resampler = torchaudio.transforms.Resample(sr, 16000)
77
+ # waveform = resampler(waveform)
78
+ # sr = 16000
79
+
80
+ # # Normalize
81
+ # waveform = waveform / waveform.abs().max()
82
+
83
+ # # Parameters
84
+ # segment_sec = 1
85
+ # segment_samples = int(segment_sec * sr)
86
+
87
+ # valence_scores = []
88
+
89
+ # # Inference per segment
90
+ # with torch.no_grad():
91
+ # for start in range(0, waveform.shape[1] - segment_samples + 1, segment_samples):
92
+ # segment = waveform[:, start:start+segment_samples]
93
+ # input_values = feature_extractor(segment.squeeze().numpy(), sampling_rate=16000, return_tensors="pt").input_values
94
+ # output = model(input_values)
95
+ # val = output['valence'].item()
96
+ # valence_scores.append(val)
97
+ valence_scores = 5.0
98
+
99
+ return valence_scores
100
+
vers/main.py ADDED
@@ -0,0 +1,16 @@
1
+
2
+ from .compute_vers_score import compute_vers_score
3
+ import whisper
4
+
5
+
6
+
7
+ whisper_model = whisper.load_model("base")
8
+
9
+ test_result = compute_vers_score(r"D:\Intern\shankh\audio_samples\obama_short.wav", whisper_model)
10
+
11
+ print("VERS Score:", test_result["VERS"])
12
+ print("ESS:", test_result["ESS"])
13
+ print("LCS:", test_result["LCS"])
14
+ print("SRS:", test_result["SRS"])
15
+ print("Insight:", test_result["insight"])
16
+ print("Transcript:", test_result.get("transcript", ""))  # compute_vers_score does not return a transcript, so this may print an empty string
vers/vers.py ADDED
@@ -0,0 +1,118 @@
1
+ import numpy as np
2
+
3
+ def calc_ess(pitch_variation, vol_max_db, mean_volume_db, valence_scores):
4
+ """
5
+ Emotional Stability Score(ESS) : Measures the consistency of the speaker's emotional tone, reflecting their ability to regulate emotions during speech.
6
+
7
+ Requires:
8
+ Tonal Steadiness: The lack of extreme fluctuations in emotional tone.
9
+ Absence of Sudden Loudness Spikes: Indicates controlled expression without abrupt emotional shifts.
10
+ Valence Stability: Consistency in the overall positive or negative tone across the speech.
11
+ """
12
+ # calculate tonal steadiness
13
+ tonal_steadiness = max(0, 100 - (pitch_variation * 10))
14
+
15
+ # calculate loudness spikes
16
+ spike = max(0, vol_max_db - mean_volume_db - 15)
17
+ spike_ratio = min(spike / 30, 1.0) # Normalize with typical loudness range
18
+ stability = 1 - spike_ratio
19
+ loudness_stability = stability * 100
20
+
21
+ # calculate valence stability
22
+ valence_stability = 100 - (np.std(valence_scores) * 20)
23
+
24
+ ESS = (0.45 * float(tonal_steadiness)) + (0.35 * float(loudness_stability)) + (0.2 * float(valence_stability))
25
+ print(f" tonal_steadiness: {tonal_steadiness}, loudness_stability: {loudness_stability}, valence_stability: {valence_stability}")
26
+ return ESS
27
+
28
+ def calc_lcs(volume_std, vol_max_db, mean_volume_db):
29
+ """
30
+ Loudness Control Score (LCS): Evaluates how well the speaker manages volume
31
+
32
+ Requires:
33
+ - Volume Stability: Consistency in speech amplitude.
34
+ - Controlled Emphasis: The ability to modulate loudness smoothly for emphasis rather than abrupt changes.
35
+ """
36
+ vol_stability = max(0, 100 - (volume_std * 5)) # Scale std for speech (5 dB std = 75)
37
+
38
+ # Controlled Emphasis (45%)
39
+ emphasis_spike = max(0, vol_max_db - mean_volume_db - 3)
40
+ spike_ratio = min(emphasis_spike / 15, 1.0) # Normalize to 15 dB range
41
+ emphasis_control = (1 - spike_ratio) * 100
42
+
43
+ # Combine scores
44
+ lcs = 0.55 * vol_stability + 0.45 * emphasis_control
45
+ print(f"vol_stability: {vol_stability}, emphasis_control: {emphasis_control}")
46
+ return min(100, max(0, lcs))
47
+
48
+ def calc_srs(wpm, filler_count, long_pause_count, pitch_variation):
49
+ """
50
+ Speech Rate Stability (SRS): Reflects the consistency of the speaker's pace and rhythm.
51
+
52
+ Requires:
53
+ - Words per Minute Consistency: Regularity in speech speed.
54
+ - Absence of Sudden Speed Shifts: Smooth transitions without erratic tempo changes.
55
+ """
56
+ ideal_wpm = 150
57
+ wpm_deviation = min(30, abs(wpm - ideal_wpm)) # Cap at 30 WPM deviation
58
+ wpm_consistency = max(0, 100 - (wpm_deviation * 1.67)) # 100-50 for max deviation
59
+
60
+ # Sudden Speech Shift Penalty
61
+ filler_penalty = min(filler_count / 10, 1.0)
62
+ pause_penalty = min(long_pause_count / 5, 1.0)
63
+ pitch_penalty = min(pitch_variation / 3.0, 1.0) # High variation → unstable
64
+
65
+ # Combine into absence of sudden shifts
66
+ stability = (1 - ((filler_penalty + pause_penalty + pitch_penalty) / 3)) * 100
67
+
68
+ # Final SRS Score
69
+ SRS = (0.45 * wpm_consistency) + (0.55 * stability)
70
+ print(f"wpm_consistency: {wpm_consistency}, stability: {stability}")
71
+ return min(100, max(0, SRS))
72
+
73
+ def calc_vers(filler_count, long_pause_count, pitch_variation, mean_volume_db, vol_max_db, wpm, volume_std, valence_scores):
74
+ ESS = calc_ess(pitch_variation, vol_max_db, mean_volume_db, valence_scores)
75
+ LCS = calc_lcs(volume_std, vol_max_db, mean_volume_db)
76
+ SRS = calc_srs(wpm, filler_count, long_pause_count, pitch_variation)
77
+
78
+ # Calculate the VERS score using the formula
79
+ VERS = (0.5 * ESS) + (0.3 * LCS) + (0.2 * SRS) # This would be value from 0 to 100
80
+
81
+ if 0 <= VERS < 50:
82
+ insight = """Poor regulation—noticeable swings in tone and uncontrolled
83
+ emotional expression. Feedback: Consider exercises and professional
84
+ coaching to stabilize your emotional delivery."""
85
+ elif VERS >= 50 and VERS < 80:
86
+ insight = """Moderate regulation—occasional fluctuations or abrupt changes.
87
+ Feedback: Work on smoothing out volume changes and maintaining a steady tone."""
88
+ elif VERS >= 80 and VERS <= 100:
89
+ insight = """Excellent regulation—steady tone and controlled volume dynamics.
90
+ Feedback: Continue using techniques that maintain emotional balance."""
91
+ else:
92
+ insight = "Invalid score calculated"
93
+
94
+ return {
95
+ "VERS": int(VERS),
96
+ "ESS": round(ESS, 1),
97
+ "LCS": round(LCS, 1),
98
+ "SRS": round(SRS, 1),
99
+ "insight": insight
100
+ }
101
+
102
+ # # Test input
103
+ # test_result = calc_vers(
104
+ # filler_count=4,
105
+ # long_pause_count=2,
106
+ # pitch_variation=3.2,
107
+ # mean_volume_db=65,
108
+ # vol_max_db=82,
109
+ # wpm=148,
110
+ # volume_std=4.1,
111
+ # valence_scores=[5.2, 5.5, 4.9]
112
+ # )
113
+
114
+ # print("VERS Score:", test_result["VERS"])
115
+ # print("ESS:", test_result["ESS"])
116
+ # print("LCS:", test_result["LCS"])
117
+ # print("SRS:", test_result["SRS"])
118
+ # print("Insight:", test_result["insight"])
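A hand-worked example of the VERS weighting above, with assumed component scores (illustrative only):

    ESS, LCS, SRS = 70.0, 80.0, 60.0
    VERS = (0.5 * ESS) + (0.3 * LCS) + (0.2 * SRS)
    # 35.0 + 24.0 + 12.0 = 71.0 -> "Moderate regulation" insight band (50-80)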
vers/vers_api.py ADDED
@@ -0,0 +1,44 @@
1
+ import whisper
2
+ import numpy as np
3
+ from .compute_vers_score import compute_vers_score
4
+
5
+ def convert_numpy_types(obj):
6
+ """Convert NumPy types to Python native types for JSON serialization."""
7
+ if isinstance(obj, np.integer):
8
+ return int(obj)
9
+ elif isinstance(obj, np.floating):
10
+ return float(obj)
11
+ elif isinstance(obj, np.ndarray):
12
+ return obj.tolist()
13
+ elif isinstance(obj, dict):
14
+ return {k: convert_numpy_types(v) for k, v in obj.items()}
15
+ elif isinstance(obj, list):
16
+ return [convert_numpy_types(i) for i in obj]
17
+ else:
18
+ return obj
19
+
20
+ def main(file_path: str, model_size: str = "base") -> dict:
21
+ try:
22
+ # Load whisper model
23
+ whisper_model = whisper.load_model(model_size)
24
+
25
+ # Compute VERS score
26
+ results = compute_vers_score(file_path, whisper_model)
27
+
28
+ # Convert any NumPy types to native Python types
29
+ results = convert_numpy_types(results)
30
+
31
+ # Structure response with rounded values
32
+ # (using Python's built-in round function which returns Python native float)
33
+ response = {
34
+ "VERS Score": round(results['VERS'], 2)
35
+ # "ESS": round(results['ESS'], 2),
36
+ # "LCS": round(results['LCS'], 2),
37
+ # "SRS": round(results['SRS'], 2),
38
+ # "Insight": results['insight'],
39
+ }
40
+
41
+ return response
42
+
43
+ except Exception as e:
44
+ raise RuntimeError(f"Error during analysis: {str(e)}")
ves/__init__.py ADDED
File without changes
ves/ves.py ADDED
@@ -0,0 +1,26 @@
1
+ # voice engagement score = 0.4 * valence + 0.3 * arousal + 0.3 * SDS
2
+ from tone_modulation.sds import calc_sds
3
+
4
+ def get_valence_and_arousal(file_path):
5
+
6
+ valence = 4.5 # placeholder; this value will come from the model
7
+
8
+ arousal = 3.2 # placeholder; this value will come from the model
9
+
10
+ return valence, arousal
11
+
12
+
13
+
14
+ def calc_voice_engagement_score(file_path):
15
+ valence, arousal = get_valence_and_arousal(file_path)
16
+
17
+ # Calculate SDS
18
+
19
+ sds = calc_sds(file_path)
20
+
21
+ ves = 0.4 * valence + 0.3 * arousal + 0.3 * sds
22
+
23
+ return {
24
+ # "sds": sds,
25
+ "ves": ves
26
+ }
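A hand-worked example of the VES formula, using the placeholder valence/arousal values above and an assumed SDS of 60 (note that valence/arousal and SDS are on different scales here, so SDS dominates the sum):

    valence, arousal, sds = 4.5, 3.2, 60.0
    ves = 0.4 * valence + 0.3 * arousal + 0.3 * sds
    # 1.8 + 0.96 + 18.0 = 20.76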
voice_confidence_score/__init__.py ADDED
File without changes
voice_confidence_score/main.py ADDED
@@ -0,0 +1,11 @@
1
+ from .voice_confidence import calc_voice_confidence_score
2
+ import whisper
3
+
4
+ model_size = "base"
5
+ whisper_model = whisper.load_model(model_size)
6
+
7
+ audio_file = r"D:\Intern\shankh\audio_samples\obama_short.wav"
8
+
9
+ result = calc_voice_confidence_score(audio_file, whisper_model)
10
+
11
+ print(f"Voice Confidence Score: {result:.2f}")
voice_confidence_score/voice_confidence.py ADDED
@@ -0,0 +1,38 @@
1
+ # voice confidence score = 0.4 * dominance + 0.3 * vcs + 0.3 * fluency
2
+
3
+ import whisper
4
+ from fluency.compute_fluency import compute_fluency_score
5
+ from vcs.compute_vcs import analyze_voice_quality
6
+
7
+
8
+ def calc_fluency_score(audio_path, whisper_model):
9
+
10
+ # Calculate fluency score
11
+ print(f"Analyzing fluency for {audio_path}...")
12
+ results = compute_fluency_score(audio_path, whisper_model)
13
+ fluency_score = results['fluency_score']
14
+
15
+ return fluency_score
16
+
17
+ def calc_vcs(audio_path, whisper_model):
18
+
19
+
20
+ # Calculate voice clarity score
21
+ print(f"Analyzing voice clarity for {audio_path}...")
22
+ results = analyze_voice_quality(audio_path, whisper_model)
23
+ vcs = results['VCS']
24
+
25
+ return vcs
26
+
27
+ dominance = 5.6 # placeholder for now; replace with a model-predicted dominance score later
28
+
29
+ def calc_voice_confidence_score(audio_path, model):
30
+
31
+ fluency_score = calc_fluency_score(audio_path, model)
32
+ vcs = calc_vcs(audio_path, model)
33
+
34
+ # Calculate voice confidence score
35
+ voice_confidence_score = 0.4 * dominance + 0.3 * vcs + 0.3 * fluency_score
36
+
37
+ return voice_confidence_score
38
+
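A hand-worked example of the confidence weighting above, using the module-level dominance placeholder and assumed clarity/fluency scores (illustrative only):

    dominance, vcs, fluency_score = 5.6, 75.0, 80.0
    voice_confidence_score = 0.4 * dominance + 0.3 * vcs + 0.3 * fluency_score
    # 2.24 + 22.5 + 24.0 = 48.74 (dominance sits on a much smaller scale, so it contributes little)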
voice_confidence_score/voice_confidence_api.py ADDED
@@ -0,0 +1,16 @@
1
+ import whisper
2
+ from .voice_confidence import calc_voice_confidence_score
3
+
4
+ def main(file_path: str, model_size: str = "base") -> dict:
5
+ try:
6
+ # Load the Whisper model
7
+ whisper_model = whisper.load_model(model_size)
8
+
9
+ # Calculate the voice confidence score
10
+ result = calc_voice_confidence_score(file_path, whisper_model)
11
+
12
+ # Return the result as a dictionary
13
+ return {"voice_confidence_score": round(result, 2)}
14
+ except Exception as e:
15
+ return {"error": str(e)}
16
+
vps/__init__.py ADDED
File without changes
vps/compute_vps_score.py ADDED
@@ -0,0 +1,79 @@
1
+ from .vps import calculate_vps # Your file where calc_srs, calculate_pas, calculate_rcs, calculate_vps live
2
+ import librosa
3
+ import numpy as np
4
+ import math
5
+ from .filler_analyzer import detect_fillers
6
+
7
+ def compute_vps_score(file_path: str, whisper_model) -> dict:
8
+ """
9
+ Compute VPS (Voice Pacing Score) and its components from a speech sample.
10
+
11
+ Args:
12
+ file_path (str): Path to the audio file.
13
+ whisper_model: Transcription model (e.g., OpenAI Whisper or faster-whisper)
14
+
15
+ Returns:
16
+ dict: A dictionary containing VPS, SRS, PAS, RCS, and component scores.
17
+ """
18
+ # Transcribe
19
+ result = whisper_model.transcribe(file_path)
20
+ transcript = result.get("text", "").strip()
21
+ segments = result.get("segments", [])
22
+
23
+ # Validate early
24
+ if not transcript or not segments:
25
+ raise ValueError("Empty transcript or segments from Whisper.")
26
+
27
+ # Filler count
28
+ filler_count, _ = detect_fillers(transcript)
29
+
30
+ # Load audio
31
+ y, sr = librosa.load(file_path, sr=None)
32
+ duration = len(y) / sr if sr else 0.0
33
+ if duration <= 0:
34
+ raise ValueError("Audio duration invalid or zero.")
35
+
36
+ # Pitch variation (in semitones)
37
+ f0, voiced_flags, voiced_probs = librosa.pyin(
38
+ y, sr=sr, fmin=80, fmax=400, frame_length=1024, hop_length=256, fill_na=np.nan)
39
+ voiced_f0 = f0[~np.isnan(f0)]
40
+ pitch_variation = 0.0
41
+ if voiced_f0.size > 0:
42
+ median_f0 = np.nanmedian(voiced_f0)
43
+ median_f0 = max(median_f0, 1e-6)
44
+ semitone_diffs = 12 * np.log2(voiced_f0 / median_f0)
45
+ pitch_variation = float(np.nanstd(semitone_diffs))
46
+
47
+ # Pause analysis
48
+ long_pause_count = 0
49
+ if segments:
50
+ for i in range(len(segments) - 1):
51
+ pause_dur = segments[i + 1]["start"] - segments[i]["end"]
52
+ if pause_dur > 1.0:
53
+ long_pause_count += 1
54
+ # Beginning and end
55
+ if segments[0]["start"] > 1.0:
56
+ long_pause_count += 1
57
+ if duration - segments[-1]["end"] > 1.0:
58
+ long_pause_count += 1
59
+
60
+ # WPM
61
+ word_count = len(transcript.split())
62
+ words_per_min = (word_count / duration) * 60.0 if duration > 0 else 0.0
63
+
64
+ # Calculate VPS and components
65
+ vps_result = calculate_vps(
66
+ transcript=transcript,
67
+ segments=segments,
68
+ filler_count=filler_count,
69
+ duration=duration,
70
+ wpm=words_per_min,
71
+ long_pause_count=long_pause_count,
72
+ pitch_variation=pitch_variation,
73
+ y=y,
74
+ sr=sr
75
+ )
76
+
77
+ # Include transcript optionally
78
+ vps_result["transcript"] = transcript
79
+ return vps_result
vps/filler_analyzer.py ADDED
@@ -0,0 +1,100 @@
1
+ # Define filler words for English, Hindi, Tamil (in both Latin and native scripts)
2
+ # Mapping each variant to a common label (usually the Latin script for insight reporting)
3
+ FILLER_VARIANTS = {
4
+ # English fillers
5
+ "um": "um", "uh": "uh", "hmm": "hmm", "ah": "ah", "er": "er",
6
+ "umm": "um", "uhh": "uh", "mmm": "hmm",
7
+ "like": "like", "you know": "you know", "so": "so", "well": "well",
8
+ # Hindi fillers (Devanagari and transliteration)
9
+ "मतलब": "matlab", "matlab": "matlab",
10
+ "क्या कहते हैं": "kya kehte hain", "kya kehte hain": "kya kehte hain",
11
+ "वो ना": "wo na", "woh na": "wo na", "wo na": "wo na",
12
+ "ऐसा है": "aisa hai", "aisa hai": "aisa hai",
13
+ "हाँ": "haan", "haan": "haan", "हा": "haan", # "हा" might appear as a shorter "haan"
14
+ "अच्छा": "acha", "acha": "acha",
15
+ # Tamil fillers (Tamil script and transliteration)
16
+ "பாத்தீங்கனா": "paatheenga-na", "paatheenga na": "paatheenga-na", "paatheenga-na": "paatheenga-na",
17
+ "அப்பரம்": "apparam", "apparam": "apparam",
18
+ "என்ன": "enna", "enna": "enna"
19
+ }
20
+
21
+ def detect_fillers(transcript):
22
+ """
23
+ Detects filler words in the transcript.
24
+
25
+ Args:
26
+ transcript: Full transcript text
27
+
28
+ Returns:
29
+ tuple: (filler_count, filler_occurrences)
30
+ """
31
+ transcript_lower = transcript.lower()
32
+ filler_count = 0
33
+ # Track which specific fillers were used (for insight examples)
34
+ filler_occurrences = {}
35
+
36
+ for variant, label in FILLER_VARIANTS.items():
37
+ if variant in transcript_lower:
38
+ count = transcript_lower.count(variant)
39
+ if count > 0:
40
+ filler_count += count
41
+ # Accumulate count for the normalized label
42
+ filler_occurrences[label] = filler_occurrences.get(label, 0) + count
43
+
44
+ return filler_count, filler_occurrences
45
+
46
+ def analyze_filler_words(filler_count, filler_occurrences, duration):
47
+ """
48
+ Analyzes filler word usage in speech.
49
+
50
+ Args:
51
+ filler_count: Total count of filler words
52
+ filler_occurrences: Dictionary of specific filler words and their counts
53
+ duration: Duration of the audio in seconds
54
+
55
+ Returns:
56
+ dict: Contains the filler words score and insight text
57
+ """
58
+ # Extract top examples for insights
59
+ filler_examples = []
60
+ if filler_occurrences:
61
+ # Sort by frequency
62
+ sorted_fillers = sorted(filler_occurrences.items(), key=lambda x: x[1], reverse=True)
63
+ for label, count in sorted_fillers[:2]:
64
+ filler_examples.append(label)
65
+
66
+ # Compute fillers per minute as a gauge
67
+ filler_per_min = (filler_count / duration) * 60.0 if duration > 0 else 0.0
68
+
69
+ if filler_count == 0:
70
+ filler_score = 10
71
+ elif filler_per_min < 1:
72
+ filler_score = 9
73
+ elif filler_per_min < 3:
74
+ filler_score = 8
75
+ elif filler_per_min < 5:
76
+ filler_score = 6
77
+ elif filler_per_min < 10:
78
+ filler_score = 4
79
+ else:
80
+ filler_score = 2
81
+
82
+ filler_score = max(0, filler_score)
83
+
84
+ # Generate insight text based on the score and examples
85
+ if filler_count == 0:
86
+ insight = "No filler words (um, ah, etc.) were detected, keeping the speech very clear."
87
+ elif filler_count <= 2:
88
+ example = filler_examples[0] if filler_examples else "um"
89
+ insight = f"Only a couple of filler words (e.g., '{example}') were used, which had minimal impact."
90
+ elif filler_count <= 5:
91
+ examples = ", ".join(f"'{ex}'" for ex in filler_examples) if filler_examples else "filler words"
92
+ insight = f"Some filler words {examples} were used occasionally; reducing them could improve clarity."
93
+ else:
94
+ examples = ", ".join(f"'{ex}'" for ex in filler_examples) if filler_examples else "'um'"
95
+ insight = f"Frequent filler words such as {examples} were detected, which can distract the audience and suggest uncertainty."
96
+
97
+ return {
98
+ "score": int(filler_score),
99
+ "insight": insight
100
+ }
vps/main.py ADDED
@@ -0,0 +1,35 @@
1
+ import whisper
2
+ from .compute_vps_score import compute_vps_score # Ensure this path is correct
3
+
4
+ def main():
5
+ # 🔧 Set your input audio file path here
6
+ audio_path = r"D:\Intern\shankh\audio_samples\obama_short.wav"
7
+
8
+ # 🔧 Choose Whisper model (tiny, base, small, medium, large)
9
+ model_size = "base"
10
+
11
+ print(f"Loading Whisper model: {model_size}")
12
+ whisper_model = whisper.load_model(model_size)
13
+
14
+ print(f"Analyzing audio: {audio_path}")
15
+ try:
16
+ vps_result = compute_vps_score(audio_path, whisper_model)
17
+
18
+ print("\n--- Voice Pacing Score (VPS) ---")
19
+ print(f"VPS Score: {vps_result['VPS']:.2f}")
20
+ print(f" - SRS (Speech Rate Stability): {vps_result['SRS']:.2f}")
21
+ print(f" - PAS (Pause Appropriateness): {vps_result['PAS']:.2f}")
22
+ print(f" - NPP: {vps_result['NPP']:.2f}")
23
+ print(f" - AFW: {vps_result['AFW']:.2f}")
24
+ print(f" - RCS (Rhythm Consistency): {vps_result['RCS']:.2f}")
25
+ print(f" - STR: {vps_result['STR']:.2f}")
26
+ print(f" - STW: {vps_result['STW']:.2f}")
27
+
28
+ print("\nTranscript:")
29
+ print(vps_result["transcript"])
30
+
31
+ except Exception as e:
32
+ print(f"[Error] {e}")
33
+
34
+ if __name__ == "__main__":
35
+ main()
vps/vps.py ADDED
@@ -0,0 +1,185 @@
1
+ from typing import List, Dict
2
+ import librosa
3
+ import numpy as np
4
+ import spacy
5
+ import math
6
+ from .filler_analyzer import detect_fillers
7
+
8
+ def calc_srs(wpm: float, filler_count: int, long_pause_count: int, pitch_variation: float) -> float:
9
+ """
10
+ Speech Rate Stability (SRS): Reflects the consistency of the speaker's pace and rhythm.
11
+ """
12
+ ideal_wpm = 150
13
+ wpm_deviation = min(30, abs(wpm - ideal_wpm))
14
+ wpm_consistency = max(0, 100 - (wpm_deviation * 1.67))
15
+
16
+ filler_penalty = min(filler_count / 10, 1.0)
17
+ pause_penalty = min(long_pause_count / 5, 1.0)
18
+ pitch_penalty = min(pitch_variation / 3.0, 1.0)
19
+
20
+ stability = (1 - ((filler_penalty + pause_penalty + pitch_penalty) / 3)) * 100
21
+ SRS = (0.45 * wpm_consistency) + (0.55 * stability)
22
+ return min(100, max(0, SRS))
23
+
24
+ def calculate_pas(transcript: str, segments: List[Dict], filler_count: int, duration: float) -> Dict[str, float]:
25
+ """
26
+ Calculate the Pause Appropriateness Score (PAS) and its components.
27
+ """
28
+ if not transcript or not segments or duration <= 0:
29
+ raise ValueError("Transcript, segments, and duration must be valid")
30
+
31
+ nlp = spacy.load("en_core_web_sm")
32
+ doc = nlp(transcript)
33
+
34
+ words = transcript.split()
35
+ total_words = len(words)
36
+ if total_words == 0:
37
+ raise ValueError("No words found in transcript")
38
+
39
+ filler_rate = filler_count / total_words if total_words > 0 else 0.0
40
+ if filler_rate >= 0.10:
41
+ afw = 0.0
42
+ elif filler_rate <= 0.0:
43
+ afw = 100.0
44
+ else:
45
+ afw = 100.0 - (filler_rate * 1000)
46
+ afw = max(0.0, min(100.0, afw))
47
+
48
+ total_pauses = 0
49
+ natural_pauses = 0
50
+ segment_texts = [seg["text"].strip() for seg in segments]
51
+ segment_starts = [seg["start"] for seg in segments]
52
+ segment_ends = [seg["end"] for seg in segments]
53
+
54
+ for i in range(len(segments) - 1):
55
+ pause_dur = segment_starts[i + 1] - segment_ends[i]
56
+ if pause_dur > 0.5:
57
+ total_pauses += 1
58
+ if segment_texts[i] and segment_texts[i][-1] in ".!?,":
59
+ natural_pauses += 1
60
+
61
+ if segment_starts[0] > 0.5:
62
+ total_pauses += 1
63
+ if duration - segment_ends[-1] > 0.5:
64
+ total_pauses += 1
65
+ if segment_texts[-1] and segment_texts[-1][-1] in ".!?":
66
+ natural_pauses += 1
67
+
68
+ npp = 100.0 if total_pauses == 0 else (natural_pauses / total_pauses) * 100.0
69
+ pas = (0.4 * npp) + (0.6 * afw)
70
+
71
+ return {
72
+ "NPP": npp,
73
+ "AFW": afw,
74
+ "PAS": pas
75
+ }
76
+
77
+ def calculate_rcs(y: np.ndarray, sr: int, segments: List[Dict], duration: float) -> Dict[str, float]:
78
+ """
79
+ Calculate the Rhythm Consistency Score (RCS) and its components.
80
+ """
81
+ if y.size == 0 or sr <= 0 or duration <= 0 or not segments:
82
+ raise ValueError("Audio signal, sampling rate, duration, and segments must be valid")
83
+
84
+ onset_env = librosa.onset.onset_strength(y=y, sr=sr, hop_length=256)
85
+ onsets = librosa.onset.onset_detect(onset_envelope=onset_env, sr=sr, units='time', hop_length=256)
86
+
87
+ if len(onsets) > 1:
88
+ iois = np.diff(onsets)
89
+ ioi_std = np.std(iois)
90
+ ioi_std = min(max(ioi_std, 0.1), 0.5)
91
+ str_score = 100.0 * (0.5 - ioi_std) / (0.5 - 0.1)
92
+ str_score = max(0.0, min(100.0, str_score))
93
+ else:
94
+ str_score = 100.0
95
+
96
+ total_transitions = 0
97
+ smooth_transitions = 0
98
+ pause_threshold = 0.3
99
+
100
+ for i in range(len(segments) - 1):
101
+ gap = segments[i + 1]["start"] - segments[i]["end"]
102
+ total_transitions += 1
103
+ if gap <= pause_threshold:
104
+ smooth_transitions += 1
105
+
106
+ for segment in segments:
107
+ words = segment["text"].strip().split()
108
+ if len(words) > 1:
109
+ smooth_transitions += len(words) - 1
110
+ total_transitions += len(words) - 1
111
+
112
+ stw = 100.0 if total_transitions == 0 else (smooth_transitions / total_transitions) * 100.0
113
+ rcs = (0.5 * str_score) + (0.5 * stw)
114
+
115
+ return {
116
+ "STR": str_score,
117
+ "STW": stw,
118
+ "RCS": rcs
119
+ }
120
+
121
+ def calculate_vps(
122
+ transcript: str,
123
+ segments: List[Dict],
124
+ filler_count: int,
125
+ duration: float,
126
+ wpm: float,
127
+ long_pause_count: int,
128
+ pitch_variation: float,
129
+ y: np.ndarray,
130
+ sr: int
131
+ ) -> Dict[str, float]:
132
+ """
133
+ Calculate the Voice Pacing Score (VPS) and its components:
134
+ - SRS: Speech Rate Stability Score
135
+ - PAS: Pause Appropriateness Score
136
+ - RCS: Rhythm Consistency Score
137
+ - VPS = (0.5 * SRS) + (0.3 * PAS) + (0.2 * RCS)
138
+
139
+ Args:
140
+ transcript (str): Transcribed text.
141
+ segments (List[Dict]): Whisper model segments with 'start', 'end', 'text'.
142
+ filler_count (int): Number of filler words.
143
+ duration (float): Audio duration (seconds).
144
+ wpm (float): Words per minute.
145
+ long_pause_count (int): Number of long pauses (>1.0s).
146
+ pitch_variation (float): Pitch variation in semitones.
147
+ y (np.ndarray): Audio signal.
148
+ sr (int): Sampling rate.
149
+
150
+ Returns:
151
+ Dict[str, float]: Scores for SRS, PAS, RCS, VPS, and intermediates.
152
+ """
153
+ # Validate inputs
154
+ if not transcript or not segments or duration <= 0 or y.size == 0 or sr <= 0:
155
+ raise ValueError("Invalid inputs")
156
+
157
+ # Calculate SRS
158
+ srs = calc_srs(wpm, filler_count, long_pause_count, pitch_variation)
159
+
160
+ # Calculate PAS
161
+ pas_result = calculate_pas(transcript, segments, filler_count, duration)
162
+ pas = pas_result["PAS"]
163
+ npp = pas_result["NPP"]
164
+ afw = pas_result["AFW"]
165
+
166
+ # Calculate RCS
167
+ rcs_result = calculate_rcs(y, sr, segments, duration)
168
+ rcs = rcs_result["RCS"]
169
+ str_score = rcs_result["STR"]
170
+ stw = rcs_result["STW"]
171
+
172
+ # Calculate VPS
173
+ vps = (0.5 * srs) + (0.3 * pas) + (0.2 * rcs)
174
+ vps = max(0.0, min(100.0, vps))
175
+
176
+ return {
177
+ "SRS": srs,
178
+ "PAS": pas,
179
+ "NPP": npp,
180
+ "AFW": afw,
181
+ "RCS": rcs,
182
+ "STR": str_score,
183
+ "STW": stw,
184
+ "VPS": vps
185
+ }
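A hand-worked example of the VPS weighting with assumed component scores (illustrative only):

    srs, pas, rcs = 75.0, 80.0, 90.0
    vps = (0.5 * srs) + (0.3 * pas) + (0.2 * rcs)
    # 37.5 + 24.0 + 18.0 = 79.5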
vps/vps_api.py ADDED
@@ -0,0 +1,25 @@
1
+ import whisper
2
+ from .compute_vps_score import compute_vps_score
3
+
4
+ def main(file_path: str, model_size: str = "base") -> dict:
5
+ try:
6
+ # Load the Whisper model
7
+ whisper_model = whisper.load_model(model_size)
8
+
9
+ # Calculate the voice pacing score
10
+ result = compute_vps_score(file_path, whisper_model)
11
+
12
+ # Return the result as a dictionary
13
+ return {
14
+ "VPS": result["VPS"]
15
+ # "SRS": result["SRS"],
16
+ # "PAS": result["PAS"],
17
+ # "NPP": result["NPP"],
18
+ # "AFW": result["AFW"],
19
+ # "RCS": result["RCS"],
20
+ # "STR": result["STR"],
21
+ # "STW": result["STW"]
22
+ }
23
+ except Exception as e:
24
+ return {"error": str(e)}
25
+