first
- .gitignore +1 -0
- Dockerfile +29 -0
- README.md +5 -4
- app.py +392 -0
- emotion/__init__.py +0 -0
- emotion/emo_predict.py +28 -0
- filler_count/__init__.py +0 -0
- filler_count/filler_score.py +76 -0
- fluency/__init__.py +13 -0
- fluency/compute_fluency.py +106 -0
- fluency/filler_analyzer.py +100 -0
- fluency/fluency.py +149 -0
- fluency/fluency_api.py +22 -0
- fluency/main.py +49 -0
- requirements.txt +23 -0
- tone_modulation/__init__.py +0 -0
- tone_modulation/sds.py +385 -0
- tone_modulation/tone_api.py +23 -0
- transcribe.py +32 -0
- vcs/__init__.py +0 -0
- vcs/compute_vcs.py +117 -0
- vcs/main.py +49 -0
- vcs/vcs.py +176 -0
- vcs/vcs_api.py +21 -0
- vers/__init__.py +0 -0
- vers/compute_vers_score.py +82 -0
- vers/filler_analyzer.py +101 -0
- vers/find_valence.py +100 -0
- vers/main.py +16 -0
- vers/vers.py +118 -0
- vers/vers_api.py +44 -0
- ves/__init__.py +0 -0
- ves/ves.py +26 -0
- voice_confidence_score/__init__.py +0 -0
- voice_confidence_score/main.py +11 -0
- voice_confidence_score/voice_confidence.py +38 -0
- voice_confidence_score/voice_confidence_api.py +16 -0
- vps/__init__.py +0 -0
- vps/compute_vps_score.py +79 -0
- vps/filler_analyzer.py +100 -0
- vps/main.py +35 -0
- vps/vps.py +185 -0
- vps/vps_api.py +25 -0
.gitignore
ADDED
@@ -0,0 +1 @@
**/__pycache__/
Dockerfile
ADDED
@@ -0,0 +1,29 @@
# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
# you will also find guides on how best to write your Dockerfile

FROM python:3.9

# Create user
RUN useradd -m -u 1000 user

# Install system packages (as root before switching to user)
RUN apt-get update && apt-get install -y ffmpeg && rm -rf /var/lib/apt/lists/*

# Switch to non-root user
USER user

# Set pip install path
ENV PATH="/home/user/.local/bin:$PATH"

# Set working directory
WORKDIR /app

# Install Python dependencies
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Copy rest of the code
COPY --chown=user . /app

# Run the app
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md
CHANGED
@@ -1,10 +1,11 @@
 ---
-title: Voice
-emoji:
-colorFrom:
-colorTo:
+title: Voice Deploy
+emoji: 🏢
+colorFrom: green
+colorTo: gray
 sdk: docker
 pinned: false
+license: mit
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,392 @@
from fastapi import FastAPI, UploadFile, File, Form, HTTPException
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
import sys
import os
import shutil
import uuid

# Ensure sibling module fluency is discoverable
#sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from fluency.fluency_api import main as analyze_fluency_main
from tone_modulation.tone_api import main as analyze_tone_main
from vcs.vcs_api import main as analyze_vcs_main
from vers.vers_api import main as analyze_vers_main
from voice_confidence_score.voice_confidence_api import main as analyze_voice_confidence_main
from vps.vps_api import main as analyze_vps_main
from ves.ves import calc_voice_engagement_score
from transcribe import transcribe_audio
from filler_count.filler_score import analyze_fillers
#from emotion.emo_predict import predict_emotion

app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # In production, replace "*" with allowed frontend domains
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

@app.post("/analyze_fluency/")
async def analyze_fluency(file: UploadFile):
    # A Pydantic request model could be added here later if needed
    if not file.filename.endswith(('.wav', '.mp3', '.m4a', '.mp4', '.flac')):
        raise HTTPException(status_code=400, detail="Invalid file type. Supported formats: .wav, .mp3, .m4a, .mp4, .flac.")

    # Generate a safe temporary file path for the uploaded file; it is deleted after processing
    temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(file.filename)[1]}"
    temp_dir = "temp_uploads"
    temp_filepath = os.path.join(temp_dir, temp_filename)
    os.makedirs(temp_dir, exist_ok=True)

    try:
        # Save uploaded file
        with open(temp_filepath, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        result = analyze_fluency_main(temp_filepath, model_size="base")

        return JSONResponse(content=result)

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Fluency analysis failed: {str(e)}")

    finally:
        # Clean up temporary file
        if os.path.exists(temp_filepath):
            os.remove(temp_filepath)

@app.post('/analyze_tone/')
async def analyze_tone(file: UploadFile):
    """
    Endpoint to analyze tone of an uploaded audio file (.wav, .mp3, .m4a, .mp4, or .flac).
    """
    if not file.filename.endswith(('.wav', '.mp3', '.m4a', '.mp4', '.flac')):
        raise HTTPException(status_code=400, detail="Invalid file type. Supported formats: .wav, .mp3, .m4a, .mp4, .flac.")

    # Generate a safe temporary file path
    temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(file.filename)[1]}"
    temp_dir = "temp_uploads"
    temp_filepath = os.path.join(temp_dir, temp_filename)
    os.makedirs(temp_dir, exist_ok=True)

    try:
        # Save uploaded file
        with open(temp_filepath, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        # Analyze tone using your custom function
        result = analyze_tone_main(temp_filepath)

        return JSONResponse(content=result)

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Tone analysis failed: {str(e)}")

    finally:
        # Clean up temporary file
        if os.path.exists(temp_filepath):
            os.remove(temp_filepath)

@app.post('/analyze_vcs/')
async def analyze_vcs(file: UploadFile):
    """
    Endpoint to analyze voice clarity of an uploaded audio file (.wav, .mp3, .m4a, .mp4, or .flac).
    """
    if not file.filename.endswith(('.wav', '.mp3', '.m4a', '.mp4', '.flac')):
        raise HTTPException(status_code=400, detail="Invalid file type. Supported formats: .wav, .mp3, .m4a, .mp4, .flac.")

    # Generate a safe temporary file path
    temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(file.filename)[1]}"
    temp_dir = "temp_uploads"
    temp_filepath = os.path.join(temp_dir, temp_filename)
    os.makedirs(temp_dir, exist_ok=True)

    try:
        # Save uploaded file
        with open(temp_filepath, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        # Analyze voice clarity using your custom function
        result = analyze_vcs_main(temp_filepath)

        return JSONResponse(content=result)

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Voice clarity analysis failed: {str(e)}")

    finally:
        # Clean up temporary file
        if os.path.exists(temp_filepath):
            os.remove(temp_filepath)

@app.post('/analyze_vers/')
async def analyze_vers(file: UploadFile):
    """
    Endpoint to analyze VERS of an uploaded audio file (.wav, .mp3, .m4a, .mp4, or .flac).
    """
    if not file.filename.endswith(('.wav', '.mp3', '.m4a', '.mp4', '.flac')):
        raise HTTPException(status_code=400, detail="Invalid file type. Supported formats: .wav, .mp3, .m4a, .mp4, .flac.")

    # Generate a safe temporary file path
    temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(file.filename)[1]}"
    temp_dir = "temp_uploads"
    temp_filepath = os.path.join(temp_dir, temp_filename)
    os.makedirs(temp_dir, exist_ok=True)

    try:
        # Save uploaded file
        with open(temp_filepath, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        # Analyze VERS using your custom function
        result = analyze_vers_main(temp_filepath)

        return JSONResponse(content=result)

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"VERS analysis failed: {str(e)}")

    finally:
        # Clean up temporary file
        if os.path.exists(temp_filepath):
            os.remove(temp_filepath)

@app.post('/voice_confidence/')
async def analyze_voice_confidence(file: UploadFile):
    """
    Endpoint to analyze voice confidence of an uploaded audio file (.wav, .mp3, .m4a, .mp4, or .flac).
    """
    if not file.filename.endswith(('.wav', '.mp3', '.m4a', '.mp4', '.flac')):
        raise HTTPException(status_code=400, detail="Invalid file type. Supported formats: .wav, .mp3, .m4a, .mp4, .flac.")

    # Generate a safe temporary file path
    temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(file.filename)[1]}"
    temp_dir = "temp_uploads"
    temp_filepath = os.path.join(temp_dir, temp_filename)
    os.makedirs(temp_dir, exist_ok=True)

    try:
        # Save uploaded file
        with open(temp_filepath, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        # Analyze voice confidence using your custom function
        result = analyze_voice_confidence_main(temp_filepath)

        return JSONResponse(content=result)

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Voice confidence analysis failed: {str(e)}")

    finally:
        # Clean up temporary file
        if os.path.exists(temp_filepath):
            os.remove(temp_filepath)

@app.post('/analyze_vps/')
async def analyze_vps(file: UploadFile):
    """
    Endpoint to analyze voice pacing score of an uploaded audio file (.wav, .mp3, .m4a, .mp4, or .flac).
    """
    if not file.filename.endswith(('.wav', '.mp3', '.m4a', '.mp4', '.flac')):
        raise HTTPException(status_code=400, detail="Invalid file type. Supported formats: .wav, .mp3, .m4a, .mp4, .flac.")

    # Generate a safe temporary file path
    temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(file.filename)[1]}"
    temp_dir = "temp_uploads"
    temp_filepath = os.path.join(temp_dir, temp_filename)
    os.makedirs(temp_dir, exist_ok=True)

    try:
        # Save uploaded file
        with open(temp_filepath, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        # Analyze voice pacing score using your custom function
        result = analyze_vps_main(temp_filepath)

        return JSONResponse(content=result)

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Voice pacing score analysis failed: {str(e)}")

    finally:
        # Clean up temporary file
        if os.path.exists(temp_filepath):
            os.remove(temp_filepath)

@app.post('/voice_engagement_score/')
async def analyze_voice_engagement_score(file: UploadFile):
    """
    Endpoint to analyze voice engagement score of an uploaded audio file (.wav, .mp3, .m4a, .mp4, or .flac).
    """
    if not file.filename.endswith(('.wav', '.mp3', '.m4a', '.mp4', '.flac')):
        raise HTTPException(status_code=400, detail="Invalid file type. Supported formats: .wav, .mp3, .m4a, .mp4, .flac.")

    # Generate a safe temporary file path
    temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(file.filename)[1]}"
    temp_dir = "temp_uploads"
    temp_filepath = os.path.join(temp_dir, temp_filename)
    os.makedirs(temp_dir, exist_ok=True)

    try:
        # Save uploaded file
        with open(temp_filepath, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        # Analyze voice engagement score using your custom function
        result = calc_voice_engagement_score(temp_filepath)

        return JSONResponse(content=result)

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Voice engagement score analysis failed: {str(e)}")

    finally:
        # Clean up temporary file
        if os.path.exists(temp_filepath):
            os.remove(temp_filepath)

@app.post('/analyze_fillers/')
async def analyze_fillers_count(file: UploadFile):
    """
    Endpoint to analyze filler words in an uploaded audio file (.wav, .mp3, .m4a, .mp4, or .flac).
    """
    if not file.filename.endswith(('.wav', '.mp3', '.m4a', '.mp4', '.flac')):
        raise HTTPException(status_code=400, detail="Invalid file type. Supported formats: .wav, .mp3, .m4a, .mp4, .flac.")

    # Generate a safe temporary file path
    temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(file.filename)[1]}"
    temp_dir = "temp_uploads"
    temp_filepath = os.path.join(temp_dir, temp_filename)
    os.makedirs(temp_dir, exist_ok=True)

    try:
        # Save uploaded file
        with open(temp_filepath, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        # Call the analysis function with the file path
        result = analyze_fillers(temp_filepath)  # Pass the file path, not the UploadFile object

        return JSONResponse(content=result)

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Filler analysis failed: {str(e)}")

    finally:
        # Clean up temporary file
        if os.path.exists(temp_filepath):
            os.remove(temp_filepath)


import time


@app.post('/transcribe/')
async def transcribe(file: UploadFile):
    """
    Endpoint to transcribe an uploaded audio file (.wav, .mp3, .m4a, .mp4, or .flac).
    """
    # Track how long transcription takes
    start_time = time.time()
    if not file.filename.endswith(('.wav', '.mp3', '.m4a', '.mp4', '.flac')):
        raise HTTPException(status_code=400, detail="Invalid file type. Supported formats: .wav, .mp3, .m4a, .mp4, .flac.")

    # Generate a safe temporary file path
    temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(file.filename)[1]}"
    temp_dir = "temp_uploads"
    temp_filepath = os.path.join(temp_dir, temp_filename)
    os.makedirs(temp_dir, exist_ok=True)

    try:
        # Save uploaded file
        with open(temp_filepath, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        # Transcribe using your custom function
        result = transcribe_audio(temp_filepath, model_size="base")
        end_time = time.time()
        transcription_time = end_time - start_time
        response = {
            "transcription": result,
            "transcription_time": transcription_time
        }

        return JSONResponse(content=response)

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Transcription failed: {str(e)}")

    finally:
        # Clean up temporary file
        if os.path.exists(temp_filepath):
            os.remove(temp_filepath)

import datetime

@app.post('/analyze_all/')
async def analyze_all(file: UploadFile):
    """
    Endpoint to analyze all aspects of an uploaded audio file (.wav, .mp3, .m4a, .mp4, or .flac).
    """
    print(f"Received request at {datetime.datetime.now()} for file: {file.filename}")
    if not file.filename.endswith(('.wav', '.mp3', '.m4a', '.mp4', '.flac')):
        raise HTTPException(status_code=400, detail="Invalid file type. Supported formats: .wav, .mp3, .m4a, .mp4, .flac.")

    # Generate a safe temporary file path
    temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(file.filename)[1]}"
    temp_dir = "temp_uploads"
    temp_filepath = os.path.join(temp_dir, temp_filename)
    os.makedirs(temp_dir, exist_ok=True)

    try:
        # Save uploaded file
        with open(temp_filepath, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        # Analyze all aspects using your custom functions
        fluency_result = analyze_fluency_main(temp_filepath, model_size="base")
        tone_result = analyze_tone_main(temp_filepath)
        vcs_result = analyze_vcs_main(temp_filepath)
        vers_result = analyze_vers_main(temp_filepath)
        voice_confidence_result = analyze_voice_confidence_main(temp_filepath)
        vps_result = analyze_vps_main(temp_filepath)
        ves_result = calc_voice_engagement_score(temp_filepath)
        filler_count = analyze_fillers(temp_filepath)  # Assuming this function returns a dict with filler count
        transcript, language, _ = transcribe_audio(temp_filepath, "base")  #fix this
        #emotion = predict_emotion(temp_filepath)
        avg_score = (fluency_result['fluency_score'] + tone_result['speech_dynamism_score'] + vcs_result['Voice Clarity Sore'] + vers_result['VERS Score'] + voice_confidence_result['voice_confidence_score'] + vps_result['VPS'] + ves_result['ves']) / 7

        # Combine results into a single response
        combined_result = {
            "fluency": fluency_result,
            "tone": tone_result,
            "vcs": vcs_result,
            "vers": vers_result,
            "voice_confidence": voice_confidence_result,
            "vps": vps_result,
            "ves": ves_result,
            "filler_words": filler_count,
            "transcript": transcript,
            "Detected Language": language,
            #"emotion": emotion,
            "sank_score": avg_score
        }

        return JSONResponse(content=combined_result)

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}")

    finally:
        # Clean up temporary file
        if os.path.exists(temp_filepath):
            os.remove(temp_filepath)
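
For reference, a minimal client sketch (not part of this commit) that exercises the /analyze_all/ endpoint once the container is serving on port 7860; the base URL and sample file name are assumptions for illustration:

import requests

# Hypothetical local deployment URL and sample audio file.
url = "http://localhost:7860/analyze_all/"
with open("sample.wav", "rb") as f:
    # The multipart field name must be "file" to match the UploadFile parameter.
    resp = requests.post(url, files={"file": ("sample.wav", f, "audio/wav")})

resp.raise_for_status()
print(resp.json()["sank_score"])  # averaged score assembled by analyze_all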
emotion/__init__.py
ADDED
File without changes
emotion/emo_predict.py
ADDED
@@ -0,0 +1,28 @@
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification
import librosa
import torch

# Load the feature extractor and model
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("r-f/wav2vec-english-speech-emotion-recognition")
model = Wav2Vec2ForSequenceClassification.from_pretrained("r-f/wav2vec-english-speech-emotion-recognition")
model.eval()

def predict_emotion(audio_path):
    # Load audio (mono, 16kHz)
    audio, rate = librosa.load(audio_path, sr=16000)

    # Extract features
    inputs = feature_extractor(audio, sampling_rate=rate, return_tensors="pt", padding=True)

    # Predict emotion
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        pred_id = torch.argmax(probs, dim=-1).item()
        emotion = model.config.id2label[pred_id]

    return emotion

# # Example usage
# emotion = predict_emotion(r"D:\Intern\shankh\audio_samples\anga.wav")
# print(f"Predicted Emotion: {emotion}")
filler_count/__init__.py
ADDED
File without changes
filler_count/filler_score.py
ADDED
@@ -0,0 +1,76 @@
import re
import whisper
from pydub import AudioSegment  # For accurate duration calculation

def analyze_fillers(file_path: str, model_size: str = "base") -> dict:
    """
    Analyzes English filler words in audio with proper duration handling.
    """
    try:
        FILLER_WORDS = [
            "um", "uh", "hmm", "ah", "er", "eh",
            "umm", "uhh", "mmm", "ahh", "err",
            "like", "you know", "well", "so", "actually", "basically",
            "right", "okay", "sort of", "kind of"
        ]

        # First get accurate duration using pydub
        audio = AudioSegment.from_file(file_path)
        duration = len(audio) / 1000  # Convert ms to seconds

        # Then run Whisper transcription
        model = whisper.load_model(model_size)
        result = model.transcribe(file_path, word_timestamps=False, fp16=False)
        transcript = result["text"]

        # Case-insensitive regex matching
        pattern = r"(?<!\w)(" + "|".join(map(re.escape, FILLER_WORDS)) + r")(?!\w)"
        matches = re.findall(pattern, transcript, re.IGNORECASE)

        # Count occurrences
        filler_counts = {}
        for word in matches:
            key = word.lower()
            filler_counts[key] = filler_counts.get(key, 0) + 1
        total_fillers = sum(filler_counts.values())

        # Calculate rate per minute
        filler_per_min = (total_fillers / duration) * 60 if duration > 0 else 0

        # Scoring
        if total_fillers == 0:
            filler_score = 100
        elif filler_per_min < 1:
            filler_score = 90
        elif filler_per_min < 3:
            filler_score = 80
        elif filler_per_min < 5:
            filler_score = 60
        elif filler_per_min < 10:
            filler_score = 40
        else:
            filler_score = 20

        # Generate insight
        top_fillers = sorted(filler_counts.items(), key=lambda x: x[1], reverse=True)[:2]

        if total_fillers == 0:
            insight = "Excellent! No filler words detected."
        elif total_fillers <= 2:
            insight = f"Minimal fillers ({total_fillers} total), mostly '{top_fillers[0][0]}'."
        elif total_fillers <= 5:
            examples = ", ".join(f"'{f[0]}'" for f in top_fillers)
            insight = f"Moderate fillers ({total_fillers} total), mainly {examples}."
        else:
            examples = ", ".join(f"'{f[0]}'" for f in top_fillers)
            insight = f"Excessive fillers ({total_fillers} total), dominated by {examples}."

        return {
            "filler_counts": filler_counts,
            "total_fillers": total_fillers,
            "filler_score": filler_score,
            "filler_rate_per_min": round(filler_per_min, 1),
        }

    except Exception as e:
        raise RuntimeError(f"Analysis failed: {str(e)}")
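
A small sketch of how the word-boundary pattern above behaves on a made-up transcript, using a trimmed filler list; these counts feed the per-minute rate and the score thresholds:

import re

FILLER_WORDS = ["um", "uh", "like", "you know", "sort of"]
pattern = r"(?<!\w)(" + "|".join(map(re.escape, FILLER_WORDS)) + r")(?!\w)"

text = "Um, I was, like, you know, sort of busy."
matches = re.findall(pattern, text, re.IGNORECASE)
print(matches)  # ['Um', 'like', 'you know', 'sort of'] -> 4 fillers before lower-casing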
fluency/__init__.py
ADDED
@@ -0,0 +1,13 @@
# fluency/__init__.py
from .fluency import calc_srs, calculate_pas, calculate_fluency, get_fluency_insight
from .filler_analyzer import detect_fillers
from .compute_fluency import compute_fluency_score

__all__ = [
    'calc_srs',
    'calculate_pas',
    'calculate_fluency',
    'get_fluency_insight',
    'detect_fillers',
    'compute_fluency_score'
]
fluency/compute_fluency.py
ADDED
@@ -0,0 +1,106 @@
"""
Compute fluency score from audio file using SRS and PAS calculations
"""

import librosa
import numpy as np
from typing import Dict, Any, Union
from .fluency import calc_srs, calculate_pas, calculate_fluency, get_fluency_insight
from .filler_analyzer import detect_fillers

def compute_fluency_score(file_path: str, whisper_model) -> Dict[str, Any]:
    """
    Compute fluency score and its components from a speech sample.

    Args:
        file_path (str): Path to the audio file.
        whisper_model: Transcription model (e.g., OpenAI Whisper or faster-whisper)

    Returns:
        dict: A dictionary containing fluency score, SRS, PAS, and component scores.
    """
    # Transcribe audio
    result = whisper_model.transcribe(file_path)
    transcript = result.get("text", "").strip()
    segments = result.get("segments", [])

    # Validate early
    if not transcript or not segments:
        raise ValueError("Empty transcript or segments from Whisper.")

    # Detect filler words
    filler_count, _ = detect_fillers(transcript)

    # Load audio
    y, sr = librosa.load(file_path, sr=None)
    duration = len(y) / sr if sr else 0.0
    if duration <= 0:
        raise ValueError("Audio duration invalid or zero.")

    # Calculate pitch variation (in semitones)
    f0, voiced_flags, voiced_probs = librosa.pyin(
        y, sr=sr, fmin=80, fmax=400, frame_length=1024, hop_length=256, fill_na=np.nan)
    voiced_f0 = f0[~np.isnan(f0)]
    pitch_variation = 0.0
    if voiced_f0.size > 0:
        median_f0 = np.nanmedian(voiced_f0)
        median_f0 = max(median_f0, 1e-6)
        semitone_diffs = 12 * np.log2(voiced_f0 / median_f0)
        pitch_variation = float(np.nanstd(semitone_diffs))

    # Analyze pauses
    long_pause_count = 0
    if segments:
        for i in range(len(segments) - 1):
            pause_dur = segments[i + 1]["start"] - segments[i]["end"]
            if pause_dur > 1.0:
                long_pause_count += 1
        # Check beginning and end pauses
        if segments[0]["start"] > 1.0:
            long_pause_count += 1
        if duration - segments[-1]["end"] > 1.0:
            long_pause_count += 1

    # Calculate WPM
    word_count = len(transcript.split())
    words_per_min = (word_count / duration) * 60.0 if duration > 0 else 0.0

    # Calculate SRS - Speech Rate Stability
    srs_score = calc_srs(
        wpm=words_per_min,
        filler_count=filler_count,
        long_pause_count=long_pause_count,
        pitch_variation=pitch_variation
    )

    # Calculate PAS - Pause Appropriateness Score
    pas_result = calculate_pas(
        transcript=transcript,
        segments=segments,
        filler_count=filler_count,
        duration=duration
    )
    pas_score = pas_result["PAS"]

    # Calculate final fluency score
    fluency_result = calculate_fluency(srs=srs_score, pas=pas_score)
    fluency_score = fluency_result["score"]
    insight = get_fluency_insight(fluency_score)

    # Build and return comprehensive result
    return {
        "fluency_score": fluency_score,
        "insight": insight,
        "SRS": srs_score,
        "PAS": pas_score,
        "components": {
            "wpm": words_per_min,
            "filler_count": filler_count,
            "long_pause_count": long_pause_count,
            "pitch_variation": pitch_variation,
            "word_count": word_count,
            "duration": duration,
            "pas_components": pas_result
        },
        "transcript": transcript
    }
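
The pause analysis above assumes Whisper-style segments, i.e. a list of dicts with "start", "end", and "text" keys (times in seconds); a minimal illustration with made-up timings:

# Illustrative Whisper-style segments; the values below are made up.
segments = [
    {"start": 0.4, "end": 3.2, "text": " Thanks everyone for joining."},
    {"start": 4.6, "end": 7.9, "text": " Today I want to cover three points."},
]

# Same gap rule as compute_fluency_score: a pause longer than 1 s counts as long.
long_pauses = sum(1 for a, b in zip(segments, segments[1:]) if b["start"] - a["end"] > 1.0)
print(long_pauses)  # 1 -> the 1.4 s gap between the two segments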
fluency/filler_analyzer.py
ADDED
@@ -0,0 +1,100 @@
# Define filler words for English, Hindi, Tamil (in both Latin and native scripts)
# Mapping each variant to a common label (usually the Latin script for insight reporting)
FILLER_VARIANTS = {
    # English fillers
    "um": "um", "uh": "uh", "hmm": "hmm", "ah": "ah", "er": "er",
    "umm": "um", "uhh": "uh", "mmm": "hmm",
    "like": "like", "you know": "you know", "so": "so", "well": "well",
    # Hindi fillers (Devanagari and transliteration)
    "मतलब": "matlab", "matlab": "matlab",
    "क्या कहते हैं": "kya kehte hain", "kya kehte hain": "kya kehte hain",
    "वो ना": "wo na", "woh na": "wo na", "wo na": "wo na",
    "ऐसा है": "aisa hai", "aisa hai": "aisa hai",
    "हाँ": "haan", "haan": "haan", "हा": "haan",  # "हा" might appear as a shorter "haan"
    "अच्छा": "acha", "acha": "acha",
    # Tamil fillers (Tamil script and transliteration)
    "பாத்தீங்கனா": "paatheenga-na", "paatheenga na": "paatheenga-na", "paatheenga-na": "paatheenga-na",
    "அப்பரம்": "apparam", "apparam": "apparam",
    "என்ன": "enna", "enna": "enna"
}

def detect_fillers(transcript):
    """
    Detects filler words in the transcript.

    Args:
        transcript: Full transcript text

    Returns:
        tuple: (filler_count, filler_occurrences)
    """
    transcript_lower = transcript.lower()
    filler_count = 0
    # Track which specific fillers were used (for insight examples)
    filler_occurrences = {}

    for variant, label in FILLER_VARIANTS.items():
        if variant in transcript_lower:
            count = transcript_lower.count(variant)
            if count > 0:
                filler_count += count
                # Accumulate count for the normalized label
                filler_occurrences[label] = filler_occurrences.get(label, 0) + count

    return filler_count, filler_occurrences

def analyze_filler_words(filler_count, filler_occurrences, duration):
    """
    Analyzes filler word usage in speech.

    Args:
        filler_count: Total count of filler words
        filler_occurrences: Dictionary of specific filler words and their counts
        duration: Duration of the audio in seconds

    Returns:
        dict: Contains the filler words score and insight text
    """
    # Extract top examples for insights
    filler_examples = []
    if filler_occurrences:
        # Sort by frequency
        sorted_fillers = sorted(filler_occurrences.items(), key=lambda x: x[1], reverse=True)
        for label, count in sorted_fillers[:2]:
            filler_examples.append(label)

    # Compute fillers per minute as a gauge
    filler_per_min = (filler_count / duration) * 60.0 if duration > 0 else 0.0

    if filler_count == 0:
        filler_score = 10
    elif filler_per_min < 1:
        filler_score = 9
    elif filler_per_min < 3:
        filler_score = 8
    elif filler_per_min < 5:
        filler_score = 6
    elif filler_per_min < 10:
        filler_score = 4
    else:
        filler_score = 2

    filler_score = max(0, filler_score)

    # Generate insight text based on the score and examples
    if filler_count == 0:
        insight = "No filler words (um, ah, etc.) were detected, keeping the speech very clear."
    elif filler_count <= 2:
        example = filler_examples[0] if filler_examples else "um"
        insight = f"Only a couple of filler words (e.g., '{example}') were used, which had minimal impact."
    elif filler_count <= 5:
        examples = ", ".join(f"'{ex}'" for ex in filler_examples) if filler_examples else "filler words"
        insight = f"Some filler words {examples} were used occasionally; reducing them could improve clarity."
    else:
        examples = ", ".join(f"'{ex}'" for ex in filler_examples) if filler_examples else "'um'"
        insight = f"Frequent filler words such as {examples} were detected, which can distract the audience and suggest uncertainty."

    return {
        "score": int(filler_score),
        "insight": insight
    }
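
A quick usage sketch for detect_fillers on a made-up sentence, assuming it is run from the repository root; note that matching is plain substring counting, so results are approximate:

from fluency.filler_analyzer import detect_fillers

count, occurrences = detect_fillers("Well, you know, I was like thinking about it.")
print(count, occurrences)
# Expected roughly: 3 {'like': 1, 'you know': 1, 'well': 1}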
fluency/fluency.py
ADDED
@@ -0,0 +1,149 @@
import spacy
from typing import List, Dict

def calc_srs(wpm, filler_count, long_pause_count, pitch_variation):
    """
    Speech Rate Stability (SRS): Reflects the consistency of the speaker's pace and rhythm.

    Args:
        wpm (float): Words per minute
        filler_count (int): Number of filler words ("um", "uh", etc.)
        long_pause_count (int): Number of pauses longer than 1 second
        pitch_variation (float): Standard deviation of pitch in semitones

    Returns:
        float: SRS score between 0-100

    Requires:
        - Words per Minute Consistency: Regularity in speech speed.
        - Absence of Sudden Speed Shifts: Smooth transitions without erratic tempo changes.
    """
    ideal_wpm = 150
    wpm_deviation = min(30, abs(wpm - ideal_wpm))  # Cap at 30 WPM deviation
    wpm_consistency = max(0, 100 - (wpm_deviation * 1.67))  # 100-50 for max deviation

    # Sudden Speech Shift Penalty
    filler_penalty = min(filler_count / 10, 1.0)
    pause_penalty = min(long_pause_count / 5, 1.0)
    pitch_penalty = min(pitch_variation / 3.0, 1.0)  # High variation → unstable

    # Combine into absence of sudden shifts
    stability = (1 - ((filler_penalty + pause_penalty + pitch_penalty) / 3)) * 100

    # Final SRS Score
    SRS = (0.45 * wpm_consistency) + (0.55 * stability)
    return min(100, max(0, SRS))


def calculate_pas(transcript: str, segments: List[Dict], filler_count: int, duration: float) -> Dict[str, float]:
    """
    Calculate the Pause Appropriateness Score (PAS) and its components.

    Args:
        transcript (str): Full transcript text
        segments (List[Dict]): List of transcript segments with start/end times
        filler_count (int): Number of filler words detected
        duration (float): Total duration of audio in seconds

    Returns:
        Dict[str, float]: Dictionary with NPP, AFW, and PAS scores
    """
    if not transcript or not segments or duration <= 0:
        raise ValueError("Transcript, segments, and duration must be valid")

    nlp = spacy.load("en_core_web_sm")
    doc = nlp(transcript)

    words = transcript.split()
    total_words = len(words)
    if total_words == 0:
        raise ValueError("No words found in transcript")

    # Calculate Avoidance of Filler Words (AFW)
    filler_rate = filler_count / total_words if total_words > 0 else 0.0
    if filler_rate >= 0.10:
        afw = 0.0
    elif filler_rate <= 0.0:
        afw = 100.0
    else:
        afw = 100.0 - (filler_rate * 1000)
        afw = max(0.0, min(100.0, afw))

    # Calculate Natural Pause Placement (NPP)
    total_pauses = 0
    natural_pauses = 0
    segment_texts = [seg["text"].strip() for seg in segments]
    segment_starts = [seg["start"] for seg in segments]
    segment_ends = [seg["end"] for seg in segments]

    for i in range(len(segments) - 1):
        pause_dur = segment_starts[i + 1] - segment_ends[i]
        if pause_dur > 0.5:
            total_pauses += 1
            if segment_texts[i] and segment_texts[i][-1] in ".!?,":
                natural_pauses += 1

    # Check initial and final pauses
    if segment_starts[0] > 0.5:
        total_pauses += 1
    if duration - segment_ends[-1] > 0.5:
        total_pauses += 1
        if segment_texts[-1] and segment_texts[-1][-1] in ".!?":
            natural_pauses += 1

    npp = 100.0 if total_pauses == 0 else (natural_pauses / total_pauses) * 100.0

    # Calculate final PAS
    pas = (0.4 * npp) + (0.6 * afw)

    return {
        "NPP": npp,
        "AFW": afw,
        "PAS": pas
    }


def calculate_fluency(srs: float, pas: float) -> Dict[str, float]:
    """
    Calculate fluency score based on Speech Rate Stability and Pause Appropriateness Score.

    Args:
        srs (float): Speech Rate Stability score (0-100)
        pas (float): Pause Appropriateness Score (0-100)

    Returns:
        Dict[str, float]: Dictionary with fluency score (0-100) and component contributions
    """
    # Equal weighting of SRS and PAS for fluency
    fluency_score = (0.5 * srs) + (0.5 * pas)

    return {
        "score": fluency_score,
        "SRS_contribution": 0.5 * srs,
        "PAS_contribution": 0.5 * pas
    }


def get_fluency_insight(fluency_score: float) -> str:
    """
    Generate insight text based on the fluency score.

    Args:
        fluency_score (float): The calculated fluency score (0-100)

    Returns:
        str: Insight text explaining the score
    """
    if fluency_score >= 85:
        return "Excellent fluency with very consistent pacing and natural pauses. Speech flows effortlessly."
    elif fluency_score >= 70:
        return "Good fluency with generally stable speech rate and appropriate pauses. Some minor inconsistencies."
    elif fluency_score >= 50:
        return "Moderate fluency with occasional disruptions in speech flow. Consider working on pace stability and pause placement."
    elif fluency_score >= 30:
        return "Below average fluency with noticeable disruptions. Focus on reducing filler words and maintaining consistent pace."
    else:
        return "Speech fluency needs significant improvement. Work on maintaining consistent pace, reducing long pauses, and eliminating filler words."
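
To make the 0.45/0.55 weighting in calc_srs concrete, a worked example with made-up inputs (run from the repository root):

from fluency.fluency import calc_srs

# 140 WPM, 4 fillers, 2 long pauses, 1.5 semitones of pitch spread (all made up).
srs = calc_srs(wpm=140, filler_count=4, long_pause_count=2, pitch_variation=1.5)
# wpm_consistency = 100 - 10 * 1.67 = 83.3
# stability = (1 - (0.4 + 0.4 + 0.5) / 3) * 100 ≈ 56.7
# SRS = 0.45 * 83.3 + 0.55 * 56.7 ≈ 68.7
print(round(srs, 1))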
fluency/fluency_api.py
ADDED
@@ -0,0 +1,22 @@
import whisper
from .compute_fluency import compute_fluency_score

def main(file_path: str, model_size: str = "base") -> dict:
    try:
        whisper_model = whisper.load_model(model_size)

        results = compute_fluency_score(file_path, whisper_model)

        # Structure response
        response = {
            "fluency_score": round(results['fluency_score'], 2)
            # "insight": results["insight"],
            # "SRS": round(results["SRS"], 2),
            # "PAS": round(results["PAS"], 2),
            # "transcript": results["transcript"]
        }
        return response

    except Exception as e:
        raise RuntimeError(f"Error during analysis: {str(e)}")
fluency/main.py
ADDED
@@ -0,0 +1,49 @@
import json
import whisper
from .compute_fluency import compute_fluency_score

def main():
    """
    Main function to run fluency analysis on audio files
    """
    # Fixed parameters - modify these values directly in the code
    audio_file = r"D:\Intern\shankh\audio_samples\obama_short.wav"  # Path to your audio file
    model_size = "base"  # Whisper model size (tiny, base, small, medium, large)
    verbose = True  # Whether to print detailed results

    try:
        # Load whisper model
        print(f"Loading Whisper model ({model_size})...")
        whisper_model = whisper.load_model(model_size)

        # Calculate fluency score
        print(f"Analyzing fluency for {audio_file}...")
        results = compute_fluency_score(audio_file, whisper_model)

        # Print summary results
        print("\nFluency Analysis Results:")
        print(f"- Fluency Score: {results['fluency_score']:.2f}/100")
        print(f"- Insight: {results['insight']}")
        print(f"- Speech Rate Stability (SRS): {results['SRS']:.2f}/100")
        print(f"- Pause Appropriateness (PAS): {results['PAS']:.2f}/100")

        # Print verbose results if enabled
        if verbose:
            print("\nDetailed Metrics:")
            print(f"- Words per minute: {results['components']['wpm']:.1f}")
            print(f"- Filler word count: {results['components']['filler_count']}")
            print(f"- Long pauses: {results['components']['long_pause_count']}")
            print(f"- Pitch variation: {results['components']['pitch_variation']:.2f} semitones")
            print(f"- Natural Pause Placement: {results['components']['pas_components']['NPP']:.2f}/100")
            print(f"- Avoidance of Filler Words: {results['components']['pas_components']['AFW']:.2f}/100")

            # Print first 100 characters of transcript
            transcript_preview = results['transcript'][:100] + "..." if len(results['transcript']) > 100 else results['transcript']
            print(f"\nTranscript preview: {transcript_preview}")

    except Exception as e:
        print(f"Error during analysis: {str(e)}")
        return 1

if __name__ == "__main__":
    exit(main())
requirements.txt
ADDED
@@ -0,0 +1,23 @@
fastapi
uvicorn
python-multipart
pydub

librosa
soundfile
pyworld
scipy

openai-whisper==20240930
spacy==3.8.5
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl
transformers
torch

numpy
tqdm
requests
assemblyai
tone_modulation/__init__.py
ADDED
File without changes
tone_modulation/sds.py
ADDED
@@ -0,0 +1,385 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import scipy.signal
|
3 |
+
import numpy as np
|
4 |
+
import librosa
|
5 |
+
import pyworld as pw
|
6 |
+
|
7 |
+
# def compute_pitch_variation(file_path):
|
8 |
+
# # Step 1: Load audio
|
9 |
+
# y, sr = librosa.load(file_path, sr=None)
|
10 |
+
# y = y.astype(np.float64) # pyworld expects float64
|
11 |
+
|
12 |
+
# # Step 2: Extract pitch (F0)
|
13 |
+
# _f0, t = pw.dio(y, sr) # Fast initial pitch estimation
|
14 |
+
# f0 = pw.stonemask(y, _f0, t, sr) # Refinement step
|
15 |
+
|
16 |
+
# # Step 3: Filter voiced frames
|
17 |
+
# voiced_f0 = f0[f0 > 0]
|
18 |
+
|
19 |
+
# # Handle empty case
|
20 |
+
# if voiced_f0.size == 0:
|
21 |
+
# return {
|
22 |
+
# "pitch_mean": 0.0,
|
23 |
+
# "pitch_std": 0.0,
|
24 |
+
# "pitch_range": 0.0,
|
25 |
+
# "semitone_std": 0.0,
|
26 |
+
# "pitch_variation_score": 0.0
|
27 |
+
# }
|
28 |
+
|
29 |
+
# # Step 4: Basic statistics
|
30 |
+
# pitch_mean = np.mean(voiced_f0)
|
31 |
+
# pitch_std = np.std(voiced_f0)
|
32 |
+
# pitch_range = np.max(voiced_f0) - np.min(voiced_f0)
|
33 |
+
|
34 |
+
# print(pitch_mean)
|
35 |
+
# print(f'voiced_f0: {voiced_f0}')
|
36 |
+
# # Step 5: Compute semitone-based variation (better for human perception)
|
37 |
+
# median_f0 = np.median(voiced_f0)
|
38 |
+
# if median_f0 <= 0:
|
39 |
+
# median_f0 = 1e-6 # Avoid division by zero
|
40 |
+
|
41 |
+
# semitone_diffs = 12 * np.log2(voiced_f0 / median_f0)
|
42 |
+
# semitone_std = np.std(semitone_diffs)
|
43 |
+
# print(semitone_std)
|
44 |
+
|
45 |
+
# # Step 6: Scale semitone_std to a 0–100 score (tunable)
|
46 |
+
# # For example: semitone_std of 0 → 0 score, ≥6 semitones → 100 score
|
47 |
+
# pitch_variation_score = np.clip((semitone_std / 6.0) * 100, 0, 100)
|
48 |
+
|
49 |
+
# return {
|
50 |
+
# "pitch_mean": pitch_mean,
|
51 |
+
# "pitch_std": pitch_std,
|
52 |
+
# "pitch_range": pitch_range,
|
53 |
+
# "semitone_std": semitone_std,
|
54 |
+
# "pitch_variation_score": pitch_variation_score
|
55 |
+
# }
|
56 |
+
# def compute_intonation_range(file_path):
|
57 |
+
# # Step 1: Load and prepare audio
|
58 |
+
# y, sr = librosa.load(file_path, sr=None)
|
59 |
+
# y = y.astype(np.float64)
|
60 |
+
|
61 |
+
# # Step 2: Extract F0
|
62 |
+
# _f0, t = pw.dio(y, sr)
|
63 |
+
# f0 = pw.stonemask(y, _f0, t, sr)
|
64 |
+
|
65 |
+
|
66 |
+
|
67 |
+
# # Step 3: Filter voiced frames
|
68 |
+
# voiced_f0 = f0[f0 > 0]
|
69 |
+
# if voiced_f0.size == 0:
|
70 |
+
# return 0.0
|
71 |
+
|
72 |
+
# voiced_f0 = voiced_f0[(voiced_f0 > np.percentile(voiced_f0, 5)) &
|
73 |
+
# (voiced_f0 < np.percentile(voiced_f0, 95))]
|
74 |
+
|
75 |
+
# # Step 4: Compute intonation range (in semitones)
|
76 |
+
# f0_min = np.min(voiced_f0)
|
77 |
+
# f0_max = np.max(voiced_f0)
|
78 |
+
# if f0_min <= 0:
|
79 |
+
# f0_min = 1e-6 # to avoid log error
|
80 |
+
# intonation_range = 12 * np.log2(f0_max / f0_min)
|
81 |
+
|
82 |
+
# # range into scores:
|
83 |
+
|
84 |
+
# max_range = 12.0
|
85 |
+
# normalized = min(intonation_range, max_range) / max_range
|
86 |
+
# score = normalized * 100
|
87 |
+
# return round(score, 2), intonation_range
|
88 |
+
|
89 |
+
|
90 |
+
|
91 |
+
# def compute_pitch_variation(file_path):
|
92 |
+
# # Step 1: Load audio
|
93 |
+
# y, sr = librosa.load(file_path, sr=None)
|
94 |
+
|
95 |
+
# # Step 2: Extract pitch using librosa.pyin (YIN-based)
|
96 |
+
# f0, voiced_flags, voiced_probs = librosa.pyin(
|
97 |
+
# y,
|
98 |
+
# sr=sr,
|
99 |
+
# fmin=80,
|
100 |
+
# fmax=400,
|
101 |
+
# frame_length=1105,
|
102 |
+
# hop_length=256,
|
103 |
+
# fill_na=np.nan
|
104 |
+
# )
|
105 |
+
|
106 |
+
# # Step 3: Filter voiced frames
|
107 |
+
# voiced_f0 = f0[~np.isnan(f0)]
|
108 |
+
|
109 |
+
|
110 |
+
# voiced_f0 = voiced_f0[
|
111 |
+
# (voiced_f0 > np.percentile(voiced_f0, 5)) &
|
112 |
+
# (voiced_f0 < np.percentile(voiced_f0, 95))
|
113 |
+
# ]
|
114 |
+
|
115 |
+
# # Handle empty case
|
116 |
+
# if voiced_f0.size == 0:
|
117 |
+
# return {
|
118 |
+
# "pitch_mean": 0.0,
|
119 |
+
# "pitch_std": 0.0,
|
120 |
+
# "pitch_range": 0.0,
|
121 |
+
# "semitone_std": 0.0,
|
122 |
+
# "pitch_variation_score": 0.0
|
123 |
+
# }
|
124 |
+
|
125 |
+
# # Step 4: Basic statistics
|
126 |
+
# pitch_mean = float(np.mean(voiced_f0))
|
127 |
+
# pitch_std = float(np.std(voiced_f0))
|
128 |
+
# pitch_range = float(np.max(voiced_f0) - np.min(voiced_f0))
|
129 |
+
|
130 |
+
|
131 |
+
# # Step 5: Compute semitone-based variation
|
132 |
+
# median_f0 = np.median(voiced_f0)
|
133 |
+
# if median_f0 <= 0:
|
134 |
+
# median_f0 = 1e-6
|
135 |
+
|
136 |
+
# semitone_diffs = 12 * np.log2(voiced_f0 / median_f0)
|
137 |
+
# semitone_std = float(np.std(semitone_diffs))
|
138 |
+
|
139 |
+
|
140 |
+
# # Step 6: Scale to 0–100 score
|
141 |
+
# pitch_variation_score = float(np.clip((semitone_std / 6.0) * 100, 0, 100))
|
142 |
+
# return {
|
143 |
+
# "pitch_mean": pitch_mean,
|
144 |
+
# "pitch_std": pitch_std,
|
145 |
+
# "pitch_range": pitch_range,
|
146 |
+
# "semitone_std": semitone_std,
|
147 |
+
# "pitch_variation_score": pitch_variation_score
|
148 |
+
# }
|
149 |
+
|
150 |
+
# def compute_intonation_range(file_path):
|
151 |
+
# # Step 1: Load and prepare audio
|
152 |
+
# y, sr = librosa.load(file_path, sr=None)
|
153 |
+
|
154 |
+
# # Step 2: Extract F0 using librosa.pyin
|
155 |
+
# f0, voiced_flags, voiced_probs = librosa.pyin(
|
156 |
+
# y,
|
157 |
+
# sr=sr,
|
158 |
+
# fmin=80,
|
159 |
+
# fmax=400,
|
160 |
+
# frame_length=1105, # ensures two periods of fmin fit
|
161 |
+
# hop_length=256,
|
162 |
+
# fill_na=np.nan
|
163 |
+
# )
|
164 |
+
|
165 |
+
# # Step 3: Filter voiced frames
|
166 |
+
# voiced_f0 = f0[~np.isnan(f0)]
|
167 |
+
# if voiced_f0.size == 0:
|
168 |
+
# return 0.0, 0.0
|
169 |
+
|
170 |
+
# # Optional: remove outliers (5th to 95th percentile)
|
171 |
+
# voiced_f0 = voiced_f0[
|
172 |
+
# (voiced_f0 > np.percentile(voiced_f0, 5)) &
|
173 |
+
# (voiced_f0 < np.percentile(voiced_f0, 95))
|
174 |
+
# ]
|
175 |
+
|
176 |
+
# # Step 4: Compute intonation range in semitones
|
177 |
+
# f0_min = np.min(voiced_f0)
|
178 |
+
# f0_max = np.max(voiced_f0)
|
179 |
+
# if f0_min <= 0:
|
180 |
+
# f0_min = 1e-6
|
181 |
+
|
182 |
+
# intonation_range = 12 * np.log2(f0_max / f0_min)
|
183 |
+
|
184 |
+
# # Step 5: Normalize and convert to score out of 100
|
185 |
+
# max_range = 12.0 # ~1 octave
|
186 |
+
# normalized = min(intonation_range, max_range) / max_range
|
187 |
+
# score = normalized * 100
|
188 |
+
|
189 |
+
# return round(score, 2), float(intonation_range)
|
190 |
+
|
191 |
+
|
192 |
+
|
193 |
+
# def compute_speech_rhythm_variability(file_path):
|
194 |
+
# """
|
195 |
+
# Computes the speech rhythm variability score from an audio file.
|
196 |
+
# The method estimates tempo consistency across time using onset intervals.
|
197 |
+
|
198 |
+
# Returns:
|
199 |
+
# score (float): Normalized rhythm variability score out of 100.
|
200 |
+
# raw_std (float): Raw standard deviation of inter-onset intervals.
|
201 |
+
# """
|
202 |
+
# # Step 1: Load audio
|
203 |
+
# y, sr = librosa.load(file_path, sr=None)
|
204 |
+
|
205 |
+
# # Step 2: Onset detection
|
206 |
+
# onset_env = librosa.onset.onset_strength(y=y, sr=sr)
|
207 |
+
# onsets = librosa.onset.onset_detect(onset_envelope=onset_env, sr=sr, units='time')
|
208 |
+
|
209 |
+
# if len(onsets) < 2:
|
210 |
+
# return 0.0, 0.0 # Not enough onsets to compute rhythm
|
211 |
+
|
212 |
+
# # Step 3: Compute inter-onset intervals (IOIs) as rhythm proxy
|
213 |
+
# iois = np.diff(onsets)
|
214 |
+
|
215 |
+
# # Optional: Remove outliers (5th–95th percentile)
|
216 |
+
# ioi_clean = iois[(iois > np.percentile(iois, 5)) & (iois < np.percentile(iois, 95))]
|
217 |
+
# if len(ioi_clean) < 2:
|
218 |
+
# return 0.0, 0.0
|
219 |
+
|
220 |
+
# # Step 4: Compute variability — standard deviation of IOIs
|
221 |
+
# raw_std = np.std(ioi_clean)
|
222 |
+
|
223 |
+
# # Step 5: Normalize raw_std to 0–100 score
|
224 |
+
# # Lower std = more consistent rhythm → higher score
|
225 |
+
# min_std = 0.05 # near-perfect rhythm (tight pacing)
|
226 |
+
# max_std = 0.6 # highly irregular rhythm
|
227 |
+
|
228 |
+
# # Clamp and reverse-score
|
229 |
+
# clamped_std = np.clip(raw_std, min_std, max_std)
|
230 |
+
# normalized = 1 - (clamped_std - min_std) / (max_std - min_std)
|
231 |
+
# score = normalized * 100
|
232 |
+
|
233 |
+
# return round(score, 2), round(float(raw_std), 4)
|
234 |
+
|
235 |
+
|
236 |
+
# def calc_sds(file_path):
|
237 |
+
|
238 |
+
# # sds = 0.35 * pitch_variation + 0.35 * intonation_range + 0.3 * speech_rhythm_variability
|
239 |
+
|
240 |
+
# pitch_variation = compute_pitch_variation(file_path)
|
241 |
+
# intonation_range = compute_intonation_range(file_path)
|
242 |
+
# speech_rhythm_variability = compute_speech_rhythm_variability(file_path)
|
243 |
+
# # print(f"Speech Rhythm Variability Score: {speech_rhythm_variability}")
|
244 |
+
# # print(f"Speech Rhythm Variability Score: {speech_rhythm_variability}")
|
245 |
+
# # print(f"Speech Rhythm Variability Score: {speech_rhythm_variability}")
|
246 |
+
|
247 |
+
# sds = 0.35 * pitch_variation['pitch_variation_score'] + 0.35 * intonation_range[0] + 0.3 * speech_rhythm_variability[0]
|
248 |
+
# return round(sds, 2)
|
249 |
+
|
250 |
+
# path = r'D:\Intern\shankh\audio_samples\anga.wav'
|
251 |
+
|
252 |
+
# result = calc_sds(path)
|
253 |
+
# print(f"SDS: {result}")
|
254 |
+
|
255 |
+
import numpy as np
|
256 |
+
import librosa
|
257 |
+
import pyworld
|
258 |
+
|
259 |
+
def compute_pitch_variation(file_path):
|
260 |
+
# Step 1: Load audio
|
261 |
+
y, sr = librosa.load(file_path, sr=None)
|
262 |
+
|
263 |
+
# Step 2: Extract pitch using pyworld
|
264 |
+
_f0, t = pyworld.harvest(y.astype(np.float64), sr, f0_floor=80.0, f0_ceil=400.0, frame_period=1000 * 256 / sr)
|
265 |
+
f0 = pyworld.stonemask(y.astype(np.float64), _f0, t, sr)
|
266 |
+
|
267 |
+
# Step 3: Filter voiced frames
|
268 |
+
voiced_f0 = f0[f0 > 0]
if voiced_f0.size == 0:
    # Guard before the percentile filter below, which would fail on an empty array.
    return {"pitch_mean": 0.0, "pitch_std": 0.0, "pitch_range": 0.0, "semitone_std": 0.0, "pitch_variation_score": 0.0}
|
269 |
+
|
270 |
+
# Remove outliers (5th to 95th percentile)
|
271 |
+
voiced_f0 = voiced_f0[
|
272 |
+
(voiced_f0 > np.percentile(voiced_f0, 5)) &
|
273 |
+
(voiced_f0 < np.percentile(voiced_f0, 95))
|
274 |
+
]
|
275 |
+
|
276 |
+
if voiced_f0.size == 0:
|
277 |
+
return {
|
278 |
+
"pitch_mean": 0.0,
|
279 |
+
"pitch_std": 0.0,
|
280 |
+
"pitch_range": 0.0,
|
281 |
+
"semitone_std": 0.0,
|
282 |
+
"pitch_variation_score": 0.0
|
283 |
+
}
|
284 |
+
|
285 |
+
# Step 4: Basic statistics
|
286 |
+
pitch_mean = float(np.mean(voiced_f0))
|
287 |
+
pitch_std = float(np.std(voiced_f0))
|
288 |
+
pitch_range = float(np.max(voiced_f0) - np.min(voiced_f0))
|
289 |
+
|
290 |
+
# Step 5: Semitone-based variation
|
291 |
+
median_f0 = np.median(voiced_f0)
|
292 |
+
if median_f0 <= 0:
|
293 |
+
median_f0 = 1e-6
|
294 |
+
semitone_diffs = 12 * np.log2(voiced_f0 / median_f0)
|
295 |
+
semitone_std = float(np.std(semitone_diffs))
|
296 |
+
|
297 |
+
# Step 6: Scaled variation score
|
298 |
+
pitch_variation_score = float(np.clip((semitone_std / 6.0) * 100, 0, 100))
|
299 |
+
|
300 |
+
return {
|
301 |
+
"pitch_mean": pitch_mean,
|
302 |
+
"pitch_std": pitch_std,
|
303 |
+
"pitch_range": pitch_range,
|
304 |
+
"semitone_std": semitone_std,
|
305 |
+
"pitch_variation_score": pitch_variation_score
|
306 |
+
}
|
307 |
+
|
308 |
+
|
309 |
+
def compute_intonation_range(file_path):
|
310 |
+
# Step 1: Load audio
|
311 |
+
y, sr = librosa.load(file_path, sr=None)
|
312 |
+
|
313 |
+
# Step 2: Extract pitch using pyworld
|
314 |
+
_f0, t = pyworld.harvest(y.astype(np.float64), sr, f0_floor=80.0, f0_ceil=400.0, frame_period=1000 * 256 / sr)
|
315 |
+
f0 = pyworld.stonemask(y.astype(np.float64), _f0, t, sr)
|
316 |
+
|
317 |
+
# Step 3: Filter voiced frames
|
318 |
+
voiced_f0 = f0[f0 > 0]
|
319 |
+
if voiced_f0.size == 0:
|
320 |
+
return 0.0, 0.0
|
321 |
+
|
322 |
+
# Remove outliers
|
323 |
+
voiced_f0 = voiced_f0[
|
324 |
+
(voiced_f0 > np.percentile(voiced_f0, 5)) &
|
325 |
+
(voiced_f0 < np.percentile(voiced_f0, 95))
|
326 |
+
]
|
327 |
+
if voiced_f0.size == 0:
|
328 |
+
return 0.0, 0.0
|
329 |
+
|
330 |
+
# Step 4: Compute intonation range
|
331 |
+
f0_min = np.min(voiced_f0)
|
332 |
+
f0_max = np.max(voiced_f0)
|
333 |
+
if f0_min <= 0:
|
334 |
+
f0_min = 1e-6
|
335 |
+
intonation_range = 12 * np.log2(f0_max / f0_min)
|
336 |
+
|
337 |
+
# Step 5: Normalize
|
338 |
+
max_range = 12.0
|
339 |
+
normalized = min(intonation_range, max_range) / max_range
|
340 |
+
score = normalized * 100
|
341 |
+
|
342 |
+
return round(score, 2), float(intonation_range)
|
343 |
+
|
344 |
+
|
345 |
+
def compute_speech_rhythm_variability(file_path):
|
346 |
+
"""
|
347 |
+
Computes the speech rhythm variability score from an audio file.
|
348 |
+
The method estimates tempo consistency across time using onset intervals.
|
349 |
+
"""
|
350 |
+
y, sr = librosa.load(file_path, sr=None)
|
351 |
+
|
352 |
+
# Step 2: Onset detection
|
353 |
+
onset_env = librosa.onset.onset_strength(y=y, sr=sr)
|
354 |
+
onsets = librosa.onset.onset_detect(onset_envelope=onset_env, sr=sr, units='time')
|
355 |
+
|
356 |
+
if len(onsets) < 2:
|
357 |
+
return 0.0, 0.0
|
358 |
+
|
359 |
+
iois = np.diff(onsets)
|
360 |
+
|
361 |
+
ioi_clean = iois[(iois > np.percentile(iois, 5)) & (iois < np.percentile(iois, 95))]
|
362 |
+
if len(ioi_clean) < 2:
|
363 |
+
return 0.0, 0.0
|
364 |
+
|
365 |
+
raw_std = np.std(ioi_clean)
|
366 |
+
|
367 |
+
min_std = 0.05
|
368 |
+
max_std = 0.6
|
369 |
+
clamped_std = np.clip(raw_std, min_std, max_std)
|
370 |
+
normalized = 1 - (clamped_std - min_std) / (max_std - min_std)
|
371 |
+
score = normalized * 100
|
372 |
+
|
373 |
+
return round(score, 2), round(float(raw_std), 4)
|
374 |
+
|
375 |
+
|
376 |
+
def calc_sds(file_path):
|
377 |
+
pitch_variation = compute_pitch_variation(file_path)
|
378 |
+
intonation_range = compute_intonation_range(file_path)
|
379 |
+
speech_rhythm_variability = compute_speech_rhythm_variability(file_path)
|
380 |
+
|
381 |
+
sds = 0.35 * pitch_variation['pitch_variation_score'] + \
|
382 |
+
0.35 * intonation_range[0] + \
|
383 |
+
0.3 * speech_rhythm_variability[0]
|
384 |
+
|
385 |
+
return round(sds, 2)
|
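For reference, a minimal usage sketch of the module above (the audio path is a placeholder; calc_sds is the function defined in this file):

from tone_modulation.sds import calc_sds

if __name__ == "__main__":
    # Weighted mix: 0.35 pitch variation + 0.35 intonation range + 0.3 rhythm variability
    print(f"SDS: {calc_sds('sample.wav')}")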
tone_modulation/tone_api.py
ADDED
@@ -0,0 +1,23 @@
from .sds import calc_sds

import logging
logger = logging.getLogger(__name__)

def main(file_path: str) -> dict:
    logger.info(f"Starting tone analysis for: {file_path}")
    try:
        results = calc_sds(file_path)

        # Structure response
        response = {
            "speech_dynamism_score": round(results, 2),
        }
        logger.info("Tone analysis complete")
        return response

    except Exception as e:
        logger.error(f"Tone analysis failed internally: {e}", exc_info=True)
        raise RuntimeError(f"Error during analysis: {str(e)}")
transcribe.py
ADDED
@@ -0,0 +1,32 @@
import assemblyai as aai

aai.settings.api_key = "2c02e1bdab874068bdcfb2e226f048a4"  # Use env var in production

def transcribe_audio(file_path: str, model_size=None) -> tuple[str, str, float]:
    print(f"Transcribing audio file: {file_path} with language detection")

    config = aai.TranscriptionConfig(
        speech_model=aai.SpeechModel.nano,
        language_detection=True,
        language_confidence_threshold=0.4
    )

    transcriber = aai.Transcriber()

    transcript = transcriber.transcribe(file_path, config)

    if transcript.status == "error":
        raise RuntimeError(f"Transcription failed: {transcript.error}")

    # Access detected language and confidence from json_response
    response = transcript.json_response
    language = response.get("language_code")
    confidence = response.get("language_confidence")

    result = {  # note: this dict is built but never returned; the tuple below is the actual return value
        "transcript": transcript.text,
        "language": language,
        "confidence": confidence
    }

    return transcript.text, language, confidence
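As the inline comment notes, the key should come from the environment in production rather than being committed. A minimal sketch, assuming an environment variable named ASSEMBLYAI_API_KEY (the variable name is an assumption, not part of this repo):

import os
import assemblyai as aai

# Read the key from the Space's secrets instead of hardcoding it.
aai.settings.api_key = os.environ.get("ASSEMBLYAI_API_KEY", "")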
vcs/__init__.py
ADDED
File without changes
|
vcs/compute_vcs.py
ADDED
@@ -0,0 +1,117 @@
1 |
+
"""
|
2 |
+
Compute Voice Clarity Score from audio file
|
3 |
+
"""
|
4 |
+
|
5 |
+
import librosa
|
6 |
+
import numpy as np
|
7 |
+
from typing import Dict, Any
|
8 |
+
from .vcs import calculate_voice_clarity_score, get_clarity_insight
|
9 |
+
|
10 |
+
def compute_voice_clarity_score(file_path: str, whisper_model) -> Dict[str, Any]:
|
11 |
+
"""
|
12 |
+
Compute Voice Clarity Score and its components from a speech sample.
|
13 |
+
|
14 |
+
Args:
|
15 |
+
file_path (str): Path to the audio file.
|
16 |
+
whisper_model: Transcription model (e.g., OpenAI Whisper or faster-whisper)
|
17 |
+
|
18 |
+
Returns:
|
19 |
+
dict: A dictionary containing Voice Clarity Score and component scores.
|
20 |
+
"""
|
21 |
+
# Transcribe audio
|
22 |
+
result = whisper_model.transcribe(file_path)
|
23 |
+
transcript = result.get("text", "").strip()
|
24 |
+
segments = result.get("segments", [])
|
25 |
+
|
26 |
+
# Validate early
|
27 |
+
if not transcript or not segments:
|
28 |
+
raise ValueError("Empty transcript or segments from Whisper.")
|
29 |
+
|
30 |
+
# Load audio
|
31 |
+
y, sr = librosa.load(file_path, sr=None)
|
32 |
+
duration = len(y) / sr if sr else 0.0
|
33 |
+
if duration <= 0:
|
34 |
+
raise ValueError("Audio duration invalid or zero.")
|
35 |
+
|
36 |
+
# Calculate Voice Clarity Score
|
37 |
+
clarity_result = calculate_voice_clarity_score(y, sr, segments)
|
38 |
+
|
39 |
+
# Add transcript to results
|
40 |
+
clarity_result["transcript"] = transcript
|
41 |
+
|
42 |
+
# Add word count and duration info for reference
|
43 |
+
word_count = len(transcript.split())
|
44 |
+
clarity_result["components"]["word_count"] = word_count
|
45 |
+
clarity_result["components"]["duration"] = duration
|
46 |
+
|
47 |
+
return clarity_result
|
48 |
+
|
49 |
+
def analyze_voice_quality(file_path: str, whisper_model) -> Dict[str, Any]:
|
50 |
+
"""
|
51 |
+
Comprehensive voice quality analysis including clarity.
|
52 |
+
|
53 |
+
Args:
|
54 |
+
file_path (str): Path to the audio file
|
55 |
+
whisper_model: Transcription model
|
56 |
+
|
57 |
+
Returns:
|
58 |
+
Dict[str, Any]: Complete voice quality analysis
|
59 |
+
"""
|
60 |
+
# Get Voice Clarity Score
|
61 |
+
clarity_results = compute_voice_clarity_score(file_path, whisper_model)
|
62 |
+
vcs = clarity_results["VCS"]
|
63 |
+
|
64 |
+
# Load audio for additional analysis
|
65 |
+
y, sr = librosa.load(file_path, sr=None)
|
66 |
+
|
67 |
+
# Calculate additional voice quality metrics
|
68 |
+
|
69 |
+
# Voice stability - based on pitch (F0) stability
|
70 |
+
f0, voiced_flags, voiced_probs = librosa.pyin(
|
71 |
+
y, sr=sr, fmin=80, fmax=400, frame_length=1024, hop_length=256, fill_na=np.nan)
|
72 |
+
voiced_f0 = f0[~np.isnan(f0)]
|
73 |
+
|
74 |
+
pitch_stability = 0.0
|
75 |
+
if voiced_f0.size > 0:
|
76 |
+
# Calculate coefficient of variation (lower is more stable)
|
77 |
+
cv = np.std(voiced_f0) / np.mean(voiced_f0) if np.mean(voiced_f0) > 0 else float('inf')
|
78 |
+
# Convert to score (0-100)
|
79 |
+
pitch_stability = max(0, min(100, 100 - (cv * 100)))
|
80 |
+
|
81 |
+
# Voice resonance - based on spectral bandwidth
|
82 |
+
bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr))
|
83 |
+
# Normalize (ideal range is around 1500-2500 Hz for speech)
|
84 |
+
if bandwidth < 1000:
|
85 |
+
resonance_score = max(0, bandwidth / 1000 * 70) # Too narrow
|
86 |
+
elif bandwidth <= 2500:
|
87 |
+
resonance_score = 70 + ((bandwidth - 1000) / 1500 * 30) # Optimal range
|
88 |
+
else:
|
89 |
+
resonance_score = max(0, 100 - ((bandwidth - 2500) / 2500 * 50)) # Too wide
|
90 |
+
|
91 |
+
# Voice strength - based on RMS energy
|
92 |
+
rms = np.mean(librosa.feature.rms(y=y))
|
93 |
+
# Normalize (typical speech RMS values range from 0.01 to 0.2)
|
94 |
+
strength_score = min(100, max(0, rms / 0.2 * 100))
|
95 |
+
|
96 |
+
# Combine additional metrics
|
97 |
+
additional_metrics = {
|
98 |
+
"pitch_stability": pitch_stability,
|
99 |
+
"voice_resonance": resonance_score,
|
100 |
+
"voice_strength": strength_score
|
101 |
+
}
|
102 |
+
|
103 |
+
# Add to results
|
104 |
+
combined_results = {
|
105 |
+
"VCS": vcs,
|
106 |
+
"insight": clarity_results["insight"],
|
107 |
+
"components": {
|
108 |
+
**clarity_results["components"],
|
109 |
+
**additional_metrics
|
110 |
+
},
|
111 |
+
"transcript": clarity_results["transcript"]
|
112 |
+
}
|
113 |
+
|
114 |
+
return combined_results
|
115 |
+
|
116 |
+
# Ensure the functions are exposed when imported
|
117 |
+
__all__ = ['compute_voice_clarity_score', 'analyze_voice_quality']
|
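For intuition on the pitch-stability metric above: the coefficient of variation of the voiced F0 track is mapped linearly onto a 0-100 score. A small worked sketch with made-up F0 values:

import numpy as np

voiced_f0 = np.array([118.0, 121.0, 125.0, 119.0, 123.0])  # illustrative F0 samples in Hz
cv = np.std(voiced_f0) / np.mean(voiced_f0)                 # ~0.021
pitch_stability = max(0, min(100, 100 - cv * 100))          # ~97.9 -> very steady pitch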
vcs/main.py
ADDED
@@ -0,0 +1,49 @@
1 |
+
import json
|
2 |
+
import whisper
|
3 |
+
from .compute_vcs import analyze_voice_quality
|
4 |
+
|
5 |
+
def main():
|
6 |
+
"""
|
7 |
+
Main function to run voice clarity analysis on audio files
|
8 |
+
"""
|
9 |
+
# Fixed parameters - modify these values directly in the code
|
10 |
+
audio_file = r"D:\Intern\shankh\audio_samples\obama_short.wav" # Path to your audio file
|
11 |
+
model_size = "base" # Whisper model size (tiny, base, small, medium, large)
|
12 |
+
verbose = True # Whether to print detailed results
|
13 |
+
|
14 |
+
try:
|
15 |
+
# Load whisper model
|
16 |
+
print(f"Loading Whisper model ({model_size})...")
|
17 |
+
whisper_model = whisper.load_model(model_size)
|
18 |
+
|
19 |
+
# Calculate voice clarity score
|
20 |
+
print(f"Analyzing voice clarity for {audio_file}...")
|
21 |
+
results = analyze_voice_quality(audio_file, whisper_model)
|
22 |
+
|
23 |
+
# Print summary results
|
24 |
+
print("\nVoice Quality Analysis Results:")
|
25 |
+
print(f"- Voice Clarity Score (VCS): {results['VCS']:.2f}/100")
|
26 |
+
print(f"- Insight: {results['insight']}")
|
27 |
+
print(f"- Articulation: {results['components']['articulation']:.2f}/100")
|
28 |
+
print(f"- Enunciation: {results['components']['enunciation']:.2f}/100")
|
29 |
+
print(f"- Speech Pause Control: {results['components']['speech_pause_control']:.2f}/100")
|
30 |
+
|
31 |
+
# Print verbose results if enabled
|
32 |
+
if verbose:
|
33 |
+
print("\nDetailed Metrics:")
|
34 |
+
print(f"- Pitch Stability: {results['components']['pitch_stability']:.2f}/100")
|
35 |
+
print(f"- Voice Resonance: {results['components']['voice_resonance']:.2f}/100")
|
36 |
+
print(f"- Voice Strength: {results['components']['voice_strength']:.2f}/100")
|
37 |
+
print(f"- Word Count: {results['components']['word_count']}")
|
38 |
+
print(f"- Duration: {results['components']['duration']:.2f} seconds")
|
39 |
+
|
40 |
+
# Print first 100 characters of transcript
|
41 |
+
transcript_preview = results['transcript'][:100] + "..." if len(results['transcript']) > 100 else results['transcript']
|
42 |
+
print(f"\nTranscript preview: {transcript_preview}")
|
43 |
+
|
44 |
+
except Exception as e:
|
45 |
+
print(f"Error during analysis: {str(e)}")
|
46 |
+
return 1
|
47 |
+
|
48 |
+
if __name__ == "__main__":
|
49 |
+
exit(main())
|
vcs/vcs.py
ADDED
@@ -0,0 +1,176 @@
1 |
+
"""
|
2 |
+
Voice Clarity Score calculation module
|
3 |
+
"""
|
4 |
+
|
5 |
+
import librosa
|
6 |
+
import numpy as np
|
7 |
+
from typing import Dict, Any, List
|
8 |
+
import soundfile as sf
|
9 |
+
|
10 |
+
def calculate_articulation(y: np.ndarray, sr: int) -> float:
|
11 |
+
"""
|
12 |
+
Calculate articulation quality based on spectral contrast.
|
13 |
+
|
14 |
+
Articulation refers to how clearly individual phonemes are produced.
|
15 |
+
|
16 |
+
Args:
|
17 |
+
y (np.ndarray): Audio signal
|
18 |
+
sr (int): Sample rate
|
19 |
+
|
20 |
+
Returns:
|
21 |
+
float: Articulation score (0-100)
|
22 |
+
"""
|
23 |
+
# Extract spectral contrast
|
24 |
+
# Higher contrast between peaks and valleys in the spectrum generally correlates with clearer articulation
|
25 |
+
S = np.abs(librosa.stft(y))
|
26 |
+
contrast = librosa.feature.spectral_contrast(S=S, sr=sr)
|
27 |
+
|
28 |
+
# Average across frequency bands and frames
|
29 |
+
mean_contrast = np.mean(contrast)
|
30 |
+
|
31 |
+
# Normalize to 0-100 scale (empirically determined range)
|
32 |
+
# Typical values range from 10-50 dB
|
33 |
+
min_contrast = 10
|
34 |
+
max_contrast = 50
|
35 |
+
normalized_contrast = min(100, max(0, (mean_contrast - min_contrast) / (max_contrast - min_contrast) * 100))
|
36 |
+
|
37 |
+
return normalized_contrast
|
38 |
+
|
39 |
+
def calculate_enunciation(y: np.ndarray, sr: int) -> float:
|
40 |
+
"""
|
41 |
+
Calculate enunciation quality based on formant clarity and spectral flatness.
|
42 |
+
|
43 |
+
Enunciation is the precision in pronouncing vowels and consonants.
|
44 |
+
|
45 |
+
Args:
|
46 |
+
y (np.ndarray): Audio signal
|
47 |
+
sr (int): Sample rate
|
48 |
+
|
49 |
+
Returns:
|
50 |
+
float: Enunciation score (0-100)
|
51 |
+
"""
|
52 |
+
# Compute spectral flatness - lower values indicate clearer formants and better enunciation
|
53 |
+
flatness = np.mean(librosa.feature.spectral_flatness(y=y))
|
54 |
+
|
55 |
+
# Compute spectral centroid - related to "brightness" or articulation clarity
|
56 |
+
centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
|
57 |
+
|
58 |
+
# Normalize flatness (lower is better for speech) - range typically 0.01-0.5
|
59 |
+
norm_flatness = max(0, min(100, (0.5 - flatness) / 0.5 * 100))
|
60 |
+
|
61 |
+
# Normalize centroid (mid-range is better for clear speech) - typically 1000-4000 Hz for clear speech
|
62 |
+
ideal_centroid = 2500 # Hz
|
63 |
+
centroid_deviation = abs(centroid - ideal_centroid) / 2000 # Normalized by expected deviation
|
64 |
+
norm_centroid = max(0, min(100, (1 - centroid_deviation) * 100))
|
65 |
+
|
66 |
+
# Combine the two metrics (with more weight on flatness)
|
67 |
+
enunciation_score = (0.7 * norm_flatness) + (0.3 * norm_centroid)
|
68 |
+
|
69 |
+
return enunciation_score
|
70 |
+
|
71 |
+
def calculate_speech_pause_control(segments: List[Dict]) -> float:
|
72 |
+
"""
|
73 |
+
Calculate how effectively pauses are integrated in speech.
|
74 |
+
|
75 |
+
Speech pause control refers to the natural vs. abrupt pauses in speech.
|
76 |
+
|
77 |
+
Args:
|
78 |
+
segments (List[Dict]): List of transcript segments with timing information
|
79 |
+
|
80 |
+
Returns:
|
81 |
+
float: Speech pause control score (0-100)
|
82 |
+
"""
|
83 |
+
if len(segments) < 2:
|
84 |
+
return 100.0 # Not enough segments to evaluate pauses
|
85 |
+
|
86 |
+
pause_durations = []
|
87 |
+
for i in range(len(segments) - 1):
|
88 |
+
pause_dur = segments[i + 1]["start"] - segments[i]["end"]
|
89 |
+
if pause_dur > 0.05: # Only consider actual pauses
|
90 |
+
pause_durations.append(pause_dur)
|
91 |
+
|
92 |
+
if not pause_durations:
|
93 |
+
return 100.0 # No significant pauses detected
|
94 |
+
|
95 |
+
# Calculate the standard deviation of pause durations
|
96 |
+
# More consistent pauses indicate better control
|
97 |
+
pause_std = np.std(pause_durations)
|
98 |
+
|
99 |
+
# Calculate proportion of very long pauses (potentially awkward)
|
100 |
+
long_pauses = sum(1 for d in pause_durations if d > 2.0)
|
101 |
+
long_pause_ratio = long_pauses / len(pause_durations) if pause_durations else 0
|
102 |
+
|
103 |
+
# Normalize std dev (lower is better, but not too low)
|
104 |
+
# Ideal range is around 0.2-0.5 seconds
|
105 |
+
if pause_std < 0.1:
|
106 |
+
std_score = 70 # Too consistent might sound robotic
|
107 |
+
elif pause_std < 0.5:
|
108 |
+
std_score = 100 - ((pause_std - 0.1) / 0.4 * 30) # Scale 70-100
|
109 |
+
else:
|
110 |
+
std_score = max(0, 70 - ((pause_std - 0.5) / 2.0 * 70)) # Scale down from 70
|
111 |
+
|
112 |
+
# Penalize for too many long pauses
|
113 |
+
long_pause_penalty = long_pause_ratio * 50
|
114 |
+
|
115 |
+
# Final score
|
116 |
+
pause_control_score = max(0, min(100, std_score - long_pause_penalty))
|
117 |
+
|
118 |
+
return pause_control_score
|
119 |
+
|
120 |
+
def calculate_voice_clarity_score(y: np.ndarray, sr: int, segments: List[Dict]) -> Dict[str, Any]:
|
121 |
+
"""
|
122 |
+
Calculate the Voice Clarity Score (VCS) and its components.
|
123 |
+
|
124 |
+
VCS reflects the clarity and intelligibility of speech.
|
125 |
+
|
126 |
+
Args:
|
127 |
+
y (np.ndarray): Audio signal
|
128 |
+
sr (int): Sample rate
|
129 |
+
segments (List[Dict]): List of transcript segments with timing information
|
130 |
+
|
131 |
+
Returns:
|
132 |
+
Dict[str, Any]: Dictionary with VCS and component scores
|
133 |
+
"""
|
134 |
+
# Calculate component scores
|
135 |
+
articulation_score = calculate_articulation(y, sr)
|
136 |
+
enunciation_score = calculate_enunciation(y, sr)
|
137 |
+
speech_pause_control_score = calculate_speech_pause_control(segments)
|
138 |
+
|
139 |
+
# Calculate Voice Clarity Score using the formula from the paper
|
140 |
+
vcs = (0.45 * articulation_score) + (0.35 * enunciation_score) + (0.2 * speech_pause_control_score)
|
141 |
+
|
142 |
+
# Create result dictionary
|
143 |
+
result = {
|
144 |
+
"VCS": vcs,
|
145 |
+
"components": {
|
146 |
+
"articulation": articulation_score,
|
147 |
+
"enunciation": enunciation_score,
|
148 |
+
"speech_pause_control": speech_pause_control_score
|
149 |
+
}
|
150 |
+
}
|
151 |
+
|
152 |
+
# Add interpretation
|
153 |
+
result["insight"] = get_clarity_insight(vcs)
|
154 |
+
|
155 |
+
return result
|
156 |
+
|
157 |
+
def get_clarity_insight(vcs: float) -> str:
|
158 |
+
"""
|
159 |
+
Generate insight text based on the Voice Clarity Score.
|
160 |
+
|
161 |
+
Args:
|
162 |
+
vcs (float): Voice Clarity Score (0-100)
|
163 |
+
|
164 |
+
Returns:
|
165 |
+
str: Insight text explaining the score
|
166 |
+
"""
|
167 |
+
if vcs >= 85:
|
168 |
+
return "Excellent voice clarity with precise articulation and well-controlled pauses. Speech is highly intelligible and pleasant to listen to."
|
169 |
+
elif vcs >= 70:
|
170 |
+
return "Good voice clarity with clear pronunciation and generally appropriate pauses. Minor improvements could enhance overall clarity."
|
171 |
+
elif vcs >= 50:
|
172 |
+
return "Moderate voice clarity with some articulation issues or irregular pauses. Focus on clearer pronunciation and more natural pausing."
|
173 |
+
elif vcs >= 30:
|
174 |
+
return "Below average clarity with noticeable articulation problems or awkward pausing patterns. Consider speech exercises to improve clarity."
|
175 |
+
else:
|
176 |
+
return "Speech clarity needs significant improvement. Articulation is unclear and pausing patterns disrupt intelligibility. Speech therapy exercises may be beneficial."
|
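To make the weighting concrete, a short worked example with illustrative component scores (not taken from a real recording):

articulation = 72.0
enunciation = 65.0
speech_pause_control = 90.0

vcs = 0.45 * articulation + 0.35 * enunciation + 0.2 * speech_pause_control
# 32.4 + 22.75 + 18.0 = 73.15 -> "Good voice clarity" band (70-85) per get_clarity_insight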
vcs/vcs_api.py
ADDED
@@ -0,0 +1,21 @@
import whisper
from .compute_vcs import analyze_voice_quality

def main(file_path: str, model_size: str = "base") -> dict:
    try:
        whisper_model = whisper.load_model(model_size)

        results = analyze_voice_quality(file_path, whisper_model)

        # Structure response
        response = {
            "Voice Clarity Score": round(results['VCS'], 2)
            # "Articulation": round(results['components']['articulation'], 2),
            # "Enunciation": round(results['components']['enunciation'], 2),
            # "Speech Pause Control": round(results['components']['speech_pause_control'], 2),
        }
        return response

    except Exception as e:
        raise RuntimeError(f"Error during analysis: {str(e)}")
vers/__init__.py
ADDED
File without changes
|
vers/compute_vers_score.py
ADDED
@@ -0,0 +1,82 @@
1 |
+
from .vers import calc_vers
|
2 |
+
import librosa
|
3 |
+
import numpy as np
|
4 |
+
import math
|
5 |
+
from .filler_analyzer import detect_fillers
|
6 |
+
from .find_valence import get_valence_score
|
7 |
+
|
8 |
+
def compute_vers_score(file_path: str, whisper_model) -> dict:
|
9 |
+
"""
|
10 |
+
Compute VERS (Vocal Emotional Regulation Score) and its components from a speech sample.
|
11 |
+
"""
|
12 |
+
result = whisper_model.transcribe(file_path)
|
13 |
+
transcript = result.get("text", "").strip()
|
14 |
+
segments = result.get("segments", [])
|
15 |
+
|
16 |
+
|
17 |
+
|
18 |
+
# Filler count
|
19 |
+
filler_count, _ = detect_fillers(transcript)
|
20 |
+
|
21 |
+
# Load audio
|
22 |
+
y, sr = librosa.load(file_path, sr=None)
|
23 |
+
duration = len(y) / sr if sr else 0.0
|
24 |
+
|
25 |
+
# Volume (RMS)
|
26 |
+
rms = librosa.feature.rms(y=y)[0]
|
27 |
+
mean_rms = float(np.mean(rms))
|
28 |
+
mean_volume_db = 20 * math.log10(mean_rms + 1e-6) if mean_rms > 0 else -80.0
|
29 |
+
volume_std = np.std(20 * np.log10(rms + 1e-6))
|
30 |
+
|
31 |
+
# Max volume
|
32 |
+
vol_max = np.max(np.abs(y)) if y.size > 0 else 0.0
|
33 |
+
vol_max_db = 20 * math.log10(vol_max + 1e-6) if vol_max > 0 else -80.0
|
34 |
+
|
35 |
+
# Pitch variation
|
36 |
+
f0, voiced_flags, voiced_probs = librosa.pyin(
|
37 |
+
y, sr=sr, fmin=80, fmax=400, frame_length=1024, hop_length=256, fill_na=np.nan)
|
38 |
+
voiced_f0 = f0[~np.isnan(f0)]
|
39 |
+
pitch_variation = 0.0
|
40 |
+
if voiced_f0.size > 0:
|
41 |
+
median_f0 = np.nanmedian(voiced_f0)
|
42 |
+
median_f0 = max(median_f0, 1e-6)
|
43 |
+
semitone_diffs = 12 * np.log2(voiced_f0 / median_f0)
|
44 |
+
pitch_variation = float(np.nanstd(semitone_diffs))
|
45 |
+
|
46 |
+
# Pause analysis
|
47 |
+
total_speaking_time = 0.0
|
48 |
+
long_pause_count = 0
|
49 |
+
if segments:
|
50 |
+
for seg in segments:
|
51 |
+
total_speaking_time += (seg["end"] - seg["start"])
|
52 |
+
for i in range(len(segments) - 1):
|
53 |
+
pause_dur = segments[i+1]["start"] - segments[i]["end"]
|
54 |
+
if pause_dur > 1.0:
|
55 |
+
long_pause_count += 1
|
56 |
+
first_start = segments[0]["start"]
|
57 |
+
last_end = segments[-1]["end"]
|
58 |
+
if first_start > 1.0:
|
59 |
+
long_pause_count += 1
|
60 |
+
if duration - last_end > 1.0:
|
61 |
+
long_pause_count += 1
|
62 |
+
|
63 |
+
# WPM
|
64 |
+
words = transcript.split()
|
65 |
+
word_count = len(words)
|
66 |
+
words_per_min = (word_count / duration) * 60.0 if duration > 0 else 0.0
|
67 |
+
|
68 |
+
|
69 |
+
valence_scores = get_valence_score(file_path)
|
70 |
+
|
71 |
+
# Calculate VERS
|
72 |
+
vers_result = calc_vers(
|
73 |
+
filler_count=filler_count,
|
74 |
+
long_pause_count=long_pause_count,
|
75 |
+
pitch_variation=pitch_variation,
|
76 |
+
mean_volume_db=mean_volume_db,
|
77 |
+
vol_max_db=vol_max_db,
|
78 |
+
wpm=words_per_min,
|
79 |
+
volume_std=volume_std,
|
80 |
+
valence_scores=valence_scores
|
81 |
+
)
|
82 |
+
vers_result["transcript"] = transcript  # expose the transcript so callers (e.g. vers/main.py) can print it
return vers_result
|
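The volume features above are RMS energies converted to dB relative to full scale; a quick sanity check of that conversion with illustrative RMS values:

import numpy as np

rms = np.array([0.05, 0.08, 0.06])               # per-frame RMS from librosa.feature.rms
mean_rms = float(np.mean(rms))                   # ~0.063
mean_volume_db = 20 * np.log10(mean_rms + 1e-6)  # ~ -24 dBFS (0 dBFS = full-scale signal)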
vers/filler_analyzer.py
ADDED
@@ -0,0 +1,101 @@
1 |
+
# Define filler words for English, Hindi, Tamil (in both Latin and native scripts)
|
2 |
+
# Mapping each variant to a common label (usually the Latin script for insight reporting)
|
3 |
+
FILLER_VARIANTS = {
|
4 |
+
# English fillers
|
5 |
+
"um": "um", "uh": "uh", "hmm": "hmm", "ah": "ah", "er": "er",
|
6 |
+
"umm": "um", "uhh": "uh", "mmm": "hmm",
|
7 |
+
"like": "like", "you know": "you know", "so": "so", "well": "well",
|
8 |
+
# Hindi fillers (Devanagari and transliteration)
|
9 |
+
"मतलब": "matlab", "matlab": "matlab",
|
10 |
+
"क्या कहते हैं": "kya kehte hain", "kya kehte hain": "kya kehte hain",
|
11 |
+
"वो ना": "wo na", "woh na": "wo na", "wo na": "wo na",
|
12 |
+
"ऐसा है": "aisa hai", "aisa hai": "aisa hai",
|
13 |
+
"हाँ": "haan", "haan": "haan", "हा": "haan", # "हा" might appear as a shorter "haan"
|
14 |
+
"अच्छा": "acha", "acha": "acha",
|
15 |
+
# Tamil fillers (Tamil script and transliteration)
|
16 |
+
"பாத்தீங்கனா": "paatheenga-na", "paatheenga na": "paatheenga-na", "paatheenga-na": "paatheenga-na",
|
17 |
+
"அப்பரம்": "apparam", "apparam": "apparam",
|
18 |
+
"என்ன": "enna", "enna": "enna"
|
19 |
+
}
|
20 |
+
|
21 |
+
def detect_fillers(transcript):
|
22 |
+
"""
|
23 |
+
Detects filler words in the transcript.
|
24 |
+
|
25 |
+
Args:
|
26 |
+
transcript: Full transcript text
|
27 |
+
|
28 |
+
Returns:
|
29 |
+
tuple: (filler_count, filler_occurrences)
|
30 |
+
"""
|
31 |
+
transcript_lower = transcript.lower()
|
32 |
+
filler_count = 0
|
33 |
+
# Track which specific fillers were used (for insight examples)
|
34 |
+
filler_occurrences = {}
|
35 |
+
|
36 |
+
for variant, label in FILLER_VARIANTS.items():
|
37 |
+
if variant in transcript_lower:
|
38 |
+
count = transcript_lower.count(variant)
|
39 |
+
if count > 0:
|
40 |
+
filler_count += count
|
41 |
+
# Accumulate count for the normalized label
|
42 |
+
filler_occurrences[label] = filler_occurrences.get(label, 0) + count
|
43 |
+
|
44 |
+
return filler_count, filler_occurrences
|
45 |
+
|
46 |
+
|
47 |
+
def analyze_filler_words(filler_count, filler_occurrences, duration):
|
48 |
+
"""
|
49 |
+
Analyzes filler word usage in speech.
|
50 |
+
|
51 |
+
Args:
|
52 |
+
filler_count: Total count of filler words
|
53 |
+
filler_occurrences: Dictionary of specific filler words and their counts
|
54 |
+
duration: Duration of the audio in seconds
|
55 |
+
|
56 |
+
Returns:
|
57 |
+
dict: Contains the filler words score and insight text
|
58 |
+
"""
|
59 |
+
# Extract top examples for insights
|
60 |
+
filler_examples = []
|
61 |
+
if filler_occurrences:
|
62 |
+
# Sort by frequency
|
63 |
+
sorted_fillers = sorted(filler_occurrences.items(), key=lambda x: x[1], reverse=True)
|
64 |
+
for label, count in sorted_fillers[:2]:
|
65 |
+
filler_examples.append(label)
|
66 |
+
|
67 |
+
# Compute fillers per minute as a gauge
|
68 |
+
filler_per_min = (filler_count / duration) * 60.0 if duration > 0 else 0.0
|
69 |
+
|
70 |
+
if filler_count == 0:
|
71 |
+
filler_score = 10
|
72 |
+
elif filler_per_min < 1:
|
73 |
+
filler_score = 9
|
74 |
+
elif filler_per_min < 3:
|
75 |
+
filler_score = 8
|
76 |
+
elif filler_per_min < 5:
|
77 |
+
filler_score = 6
|
78 |
+
elif filler_per_min < 10:
|
79 |
+
filler_score = 4
|
80 |
+
else:
|
81 |
+
filler_score = 2
|
82 |
+
|
83 |
+
filler_score = max(0, filler_score)
|
84 |
+
|
85 |
+
# Generate insight text based on the score and examples
|
86 |
+
if filler_count == 0:
|
87 |
+
insight = "No filler words (um, ah, etc.) were detected, keeping the speech very clear."
|
88 |
+
elif filler_count <= 2:
|
89 |
+
example = filler_examples[0] if filler_examples else "um"
|
90 |
+
insight = f"Only a couple of filler words (e.g., '{example}') were used, which had minimal impact."
|
91 |
+
elif filler_count <= 5:
|
92 |
+
examples = ", ".join(f"'{ex}'" for ex in filler_examples) if filler_examples else "filler words"
|
93 |
+
insight = f"Some filler words {examples} were used occasionally; reducing them could improve clarity."
|
94 |
+
else:
|
95 |
+
examples = ", ".join(f"'{ex}'" for ex in filler_examples) if filler_examples else "'um'"
|
96 |
+
insight = f"Frequent filler words such as {examples} were detected, which can distract the audience and suggest uncertainty."
|
97 |
+
|
98 |
+
return {
|
99 |
+
"score": int(filler_score),
|
100 |
+
"insight": insight
|
101 |
+
}
|
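One caveat with the detector above: str.count matches substrings, so short fillers such as "so" or "ah" also match inside words like "also" or "ahead", inflating the count. A hedged sketch of a word-boundary variant (not what the module currently does):

import re

def count_filler(transcript_lower: str, variant: str) -> int:
    # \b word boundaries keep "so" from matching inside "also"; adequate for the Latin-script variants.
    return len(re.findall(r"\b" + re.escape(variant) + r"\b", transcript_lower))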
vers/find_valence.py
ADDED
@@ -0,0 +1,100 @@
1 |
+
# from transformers.models.wav2vec2 import Wav2Vec2Model, Wav2Vec2FeatureExtractor
|
2 |
+
# import torchaudio
|
3 |
+
# import torch
|
4 |
+
# import torch.nn as nn
|
5 |
+
|
6 |
+
|
7 |
+
|
8 |
+
def get_valence_score(file_path):
|
9 |
+
# class VADPredictor(nn.Module):
|
10 |
+
# """Model to predict VAD Scores"""
|
11 |
+
# def __init__(self, pretrained_model_name="facebook/wav2vec2-base-960h", freeze_feature_extractor=True):
|
12 |
+
# super(VADPredictor, self).__init__()
|
13 |
+
|
14 |
+
# self.wav2vec2 = Wav2Vec2Model.from_pretrained(pretrained_model_name)
|
15 |
+
|
16 |
+
# if freeze_feature_extractor:
|
17 |
+
# for param in self.wav2vec2.feature_extractor.parameters():
|
18 |
+
# param.requires_grad = False
|
19 |
+
|
20 |
+
# hidden_size = self.wav2vec2.config.hidden_size
|
21 |
+
|
22 |
+
# self.valence_layers = nn.Sequential(
|
23 |
+
# nn.Linear(hidden_size, 256),
|
24 |
+
# nn.ReLU(),
|
25 |
+
# nn.Dropout(0.3),
|
26 |
+
# nn.Linear(256,64),
|
27 |
+
# nn.Linear(64,1)
|
28 |
+
# )
|
29 |
+
# self.arousal_layers = nn.Sequential(
|
30 |
+
# nn.Linear(hidden_size, 256),
|
31 |
+
# nn.ReLU(),
|
32 |
+
# nn.Dropout(0.3),
|
33 |
+
# nn.Linear(256,64),
|
34 |
+
# nn.Linear(64,1)
|
35 |
+
# )
|
36 |
+
# self.dominance_layers = nn.Sequential(
|
37 |
+
# nn.Linear(hidden_size, 256),
|
38 |
+
# nn.ReLU(),
|
39 |
+
# nn.Dropout(0.3),
|
40 |
+
# nn.Linear(256,64),
|
41 |
+
# nn.Linear(64,1)
|
42 |
+
# )
|
43 |
+
|
44 |
+
# def forward(self, input_values, attention_mask=None):
|
45 |
+
# outputs = self.wav2vec2(input_values, attention_mask=attention_mask)
|
46 |
+
# last_hidden_state = outputs.last_hidden_state
|
47 |
+
# pooled_output = torch.mean(last_hidden_state, dim=1)
|
48 |
+
|
49 |
+
# valence = self.valence_layers(pooled_output)
|
50 |
+
# arousal = self.arousal_layers(pooled_output)
|
51 |
+
# dominance = self.dominance_layers(pooled_output)
|
52 |
+
|
53 |
+
# return {
|
54 |
+
# 'valence': valence.squeeze(-1),
|
55 |
+
# 'arousal': arousal.squeeze(-1),
|
56 |
+
# 'dominance': dominance.squeeze(-1)
|
57 |
+
# }
|
58 |
+
|
59 |
+
|
60 |
+
# model = VADPredictor()
|
61 |
+
# model.load_state_dict(torch.load(r"D:\Intern\shankh\DUMP\vad_predictor_model.pt", map_location=torch.device("cpu")))
|
62 |
+
# model.eval()
|
63 |
+
|
64 |
+
# feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
|
65 |
+
|
66 |
+
# # Load and process audio
|
67 |
+
# file_path = file_path
|
68 |
+
# waveform, sr = torchaudio.load(file_path)
|
69 |
+
|
70 |
+
# # Convert to mono
|
71 |
+
# if waveform.shape[0] > 1:
|
72 |
+
# waveform = waveform.mean(dim=0, keepdim=True)
|
73 |
+
|
74 |
+
# # Resample to 16000 Hz
|
75 |
+
# if sr != 16000:
|
76 |
+
# resampler = torchaudio.transforms.Resample(sr, 16000)
|
77 |
+
# waveform = resampler(waveform)
|
78 |
+
# sr = 16000
|
79 |
+
|
80 |
+
# # Normalize
|
81 |
+
# waveform = waveform / waveform.abs().max()
|
82 |
+
|
83 |
+
# # Parameters
|
84 |
+
# segment_sec = 1
|
85 |
+
# segment_samples = int(segment_sec * sr)
|
86 |
+
|
87 |
+
# valence_scores = []
|
88 |
+
|
89 |
+
# # Inference per segment
|
90 |
+
# with torch.no_grad():
|
91 |
+
# for start in range(0, waveform.shape[1] - segment_samples + 1, segment_samples):
|
92 |
+
# segment = waveform[:, start:start+segment_samples]
|
93 |
+
# input_values = feature_extractor(segment.squeeze().numpy(), sampling_rate=16000, return_tensors="pt").input_values
|
94 |
+
# output = model(input_values)
|
95 |
+
# val = output['valence'].item()
|
96 |
+
# valence_scores.append(val)
|
97 |
+
valence_scores = 5.0
|
98 |
+
|
99 |
+
return valence_scores
|
100 |
+
|
vers/main.py
ADDED
@@ -0,0 +1,16 @@
from .compute_vers_score import compute_vers_score
import whisper


whisper_model = whisper.load_model("base")

test_result = compute_vers_score(r"D:\Intern\shankh\audio_samples\obama_short.wav", whisper_model)

print("VERS Score:", test_result["VERS"])
print("ESS:", test_result["ESS"])
print("LCS:", test_result["LCS"])
print("SRS:", test_result["SRS"])
print("Insight:", test_result["insight"])
print("Transcript:", test_result["transcript"])
vers/vers.py
ADDED
@@ -0,0 +1,118 @@
1 |
+
import numpy as np
|
2 |
+
|
3 |
+
def calc_ess(pitch_variation, vol_max_db, mean_volume_db, valence_scores):
|
4 |
+
"""
|
5 |
+
Emotional Stability Score(ESS) : Measures the consistency of the speaker's emotional tone, reflecting their ability to regulate emotions during speech.
|
6 |
+
|
7 |
+
Requires:
|
8 |
+
Tonal Steadiness: The lack of extreme fluctuations in emotional tone.
|
9 |
+
Absence of Sudden Loudness Spikes: Indicates controlled expression without abrupt emotional shifts.
|
10 |
+
Valence Stability: Consistency in the overall positive or negative tone across the speech.
|
11 |
+
"""
|
12 |
+
# calculate tonal steadiness
|
13 |
+
tonal_steadiness = max(0, 100 - (pitch_variation * 10))
|
14 |
+
|
15 |
+
# calculate loudness spikes
|
16 |
+
spike = max(0, vol_max_db - mean_volume_db - 15)
|
17 |
+
spike_ratio = min(spike / 30, 1.0) # Normalize with typical loudness range
|
18 |
+
stability = 1 - spike_ratio
|
19 |
+
loudness_stability = stability * 100
|
20 |
+
|
21 |
+
# calculate valence stability
|
22 |
+
valence_stability = 100 - (np.std(valence_scores) * 20)
|
23 |
+
|
24 |
+
ESS = (0.45 * float(tonal_steadiness)) + (0.35 * float(loudness_stability)) + (0.2 * float(valence_stability))
|
25 |
+
print(f" tonal_steadiness: {tonal_steadiness}, loudness_stability: {loudness_stability}, valence_stability: {valence_stability}")
|
26 |
+
return ESS
|
27 |
+
|
28 |
+
def calc_lcs(volume_std, vol_max_db, mean_volume_db):
|
29 |
+
"""
|
30 |
+
Loudness Control Score (LCS): Evaluates how well the speaker manages volume
|
31 |
+
|
32 |
+
Requires:
|
33 |
+
- Volume Stability: Consistency in speech amplitude.
|
34 |
+
- Controlled Emphasis: The ability to modulate loudness smoothly for emphasis rather than abrupt changes.
|
35 |
+
"""
|
36 |
+
vol_stability = max(0, 100 - (volume_std * 5)) # Scale std for speech (5 dB std = 75)
|
37 |
+
|
38 |
+
# Controlled Emphasis (45%)
|
39 |
+
emphasis_spike = max(0, vol_max_db - mean_volume_db - 3)
|
40 |
+
spike_ratio = min(emphasis_spike / 15, 1.0) # Normalize to 15 dB range
|
41 |
+
emphasis_control = (1 - spike_ratio) * 100
|
42 |
+
|
43 |
+
# Combine scores
|
44 |
+
lcs = 0.55 * vol_stability + 0.45 * emphasis_control
|
45 |
+
print(f"vol_stability: {vol_stability}, emphasis_control: {emphasis_control}")
|
46 |
+
return min(100, max(0, lcs))
|
47 |
+
|
48 |
+
def calc_srs(wpm, filler_count, long_pause_count, pitch_variation):
|
49 |
+
"""
|
50 |
+
Speech Rate Stability (SRS): Reflects the consistency of the speaker's pace and rhythm.
|
51 |
+
|
52 |
+
Requires:
|
53 |
+
- Words per Minute Consistency: Regularity in speech speed.
|
54 |
+
- Absence of Sudden Speed Shifts: Smooth transitions without erratic tempo changes.
|
55 |
+
"""
|
56 |
+
ideal_wpm = 150
|
57 |
+
wpm_deviation = min(30, abs(wpm - ideal_wpm)) # Cap at 30 WPM deviation
|
58 |
+
wpm_consistency = max(0, 100 - (wpm_deviation * 1.67)) # 100-50 for max deviation
|
59 |
+
|
60 |
+
# Sudden Speech Shift Penalty
|
61 |
+
filler_penalty = min(filler_count / 10, 1.0)
|
62 |
+
pause_penalty = min(long_pause_count / 5, 1.0)
|
63 |
+
pitch_penalty = min(pitch_variation / 3.0, 1.0) # High variation → unstable
|
64 |
+
|
65 |
+
# Combine into absence of sudden shifts
|
66 |
+
stability = (1 - ((filler_penalty + pause_penalty + pitch_penalty) / 3)) * 100
|
67 |
+
|
68 |
+
# Final SRS Score
|
69 |
+
SRS = (0.45 * wpm_consistency) + (0.55 * stability)
|
70 |
+
print(f"wpm_consistency: {wpm_consistency}, stability: {stability}")
|
71 |
+
return min(100, max(0, SRS))
|
72 |
+
|
73 |
+
def calc_vers(filler_count, long_pause_count, pitch_variation, mean_volume_db, vol_max_db, wpm, volume_std, valence_scores):
|
74 |
+
ESS = calc_ess(pitch_variation, vol_max_db, mean_volume_db, valence_scores)
|
75 |
+
LCS = calc_lcs(volume_std, vol_max_db, mean_volume_db)
|
76 |
+
SRS = calc_srs(wpm, filler_count, long_pause_count, pitch_variation)
|
77 |
+
|
78 |
+
# Calculate the VERS score using the formula
|
79 |
+
VERS = (0.5 * ESS) + (0.3 * LCS) + (0.2 * SRS) # This would be value from 0 to 100
|
80 |
+
|
81 |
+
if VERS > 0 and VERS < 50:
|
82 |
+
insight = """Poor regulation—noticeable swings in tone and uncontrolled
|
83 |
+
emotional expression. Feedback: Consider exercises and professional
|
84 |
+
coaching to stabilize your emotional delivery."""
|
85 |
+
elif VERS >= 50 and VERS < 80:
|
86 |
+
insight = """Moderate regulation—occasional fluctuations or abrupt changes.
|
87 |
+
Feedback: Work on smoothing out volume changes and maintaining a steady tone."""
|
88 |
+
elif VERS >= 80 and VERS <= 100:
|
89 |
+
insight = """Excellent regulation—steady tone and controlled volume dynamics.
|
90 |
+
Feedback: Continue using techniques that maintain emotional balance."""
|
91 |
+
else:
|
92 |
+
insight = "Invalid score calculated"
|
93 |
+
|
94 |
+
return {
|
95 |
+
"VERS": int(VERS),
|
96 |
+
"ESS": round(ESS, 1),
|
97 |
+
"LCS": round(LCS, 1),
|
98 |
+
"SRS": round(SRS, 1),
|
99 |
+
"insight": insight
|
100 |
+
}
|
101 |
+
|
102 |
+
# # Test input
|
103 |
+
# test_result = calc_vers(
|
104 |
+
# filler_count=4,
|
105 |
+
# long_pause_count=2,
|
106 |
+
# pitch_variation=3.2,
|
107 |
+
# mean_volume_db=65,
|
108 |
+
# vol_max_db=82,
|
109 |
+
# wpm=148,
|
110 |
+
# volume_std=4.1,
|
111 |
+
# valence_scores=[5.2, 5.5, 4.9]
|
112 |
+
# )
|
113 |
+
|
114 |
+
# print("VERS Score:", test_result["VERS"])
|
115 |
+
# print("ESS:", test_result["ESS"])
|
116 |
+
# print("LCS:", test_result["LCS"])
|
117 |
+
# print("SRS:", test_result["SRS"])
|
118 |
+
# print("Insight:", test_result["insight"])
|
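For reference, the top-level weighting with illustrative component values (not from a real recording):

ESS, LCS, SRS = 78.0, 85.0, 60.0
VERS = 0.5 * ESS + 0.3 * LCS + 0.2 * SRS
# 39.0 + 25.5 + 12.0 = 76.5 -> falls in the 50-80 "moderate regulation" band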
vers/vers_api.py
ADDED
@@ -0,0 +1,44 @@
1 |
+
import whisper
|
2 |
+
import numpy as np
|
3 |
+
from .compute_vers_score import compute_vers_score
|
4 |
+
|
5 |
+
def convert_numpy_types(obj):
|
6 |
+
"""Convert NumPy types to Python native types for JSON serialization."""
|
7 |
+
if isinstance(obj, np.integer):
|
8 |
+
return int(obj)
|
9 |
+
elif isinstance(obj, np.floating):
|
10 |
+
return float(obj)
|
11 |
+
elif isinstance(obj, np.ndarray):
|
12 |
+
return obj.tolist()
|
13 |
+
elif isinstance(obj, dict):
|
14 |
+
return {k: convert_numpy_types(v) for k, v in obj.items()}
|
15 |
+
elif isinstance(obj, list):
|
16 |
+
return [convert_numpy_types(i) for i in obj]
|
17 |
+
else:
|
18 |
+
return obj
|
19 |
+
|
20 |
+
def main(file_path: str, model_size: str = "base") -> dict:
|
21 |
+
try:
|
22 |
+
# Load whisper model
|
23 |
+
whisper_model = whisper.load_model(model_size)
|
24 |
+
|
25 |
+
# Compute VERS score
|
26 |
+
results = compute_vers_score(file_path, whisper_model)
|
27 |
+
|
28 |
+
# Convert any NumPy types to native Python types
|
29 |
+
results = convert_numpy_types(results)
|
30 |
+
|
31 |
+
# Structure response with rounded values
|
32 |
+
# (using Python's built-in round function which returns Python native float)
|
33 |
+
response = {
|
34 |
+
"VERS Score": round(results['VERS'], 2)
|
35 |
+
# "ESS": round(results['ESS'], 2),
|
36 |
+
# "LCS": round(results['LCS'], 2),
|
37 |
+
# "SRS": round(results['SRS'], 2),
|
38 |
+
# "Insight": results['insight'],
|
39 |
+
}
|
40 |
+
|
41 |
+
return response
|
42 |
+
|
43 |
+
except Exception as e:
|
44 |
+
raise RuntimeError(f"Error during analysis: {str(e)}")
|
ves/__init__.py
ADDED
File without changes
|
ves/ves.py
ADDED
@@ -0,0 +1,26 @@
# voice engagement score = 0.4 * valence + 0.3 * arousal + 0.3 * SDS
from tone_modulation.sds import calc_sds

def get_valence_and_arousal(file_path):

    valence = 4.5  # we get this from model

    arousal = 3.2  # we get this from model

    return valence, arousal


def calc_voice_engagement_score(file_path):
    valence, arousal = get_valence_and_arousal(file_path)

    # Calculate SDS
    sds = calc_sds(file_path)

    ves = 0.4 * valence + 0.3 * arousal + 0.3 * sds

    return {
        # "sds": sds,
        "ves": ves
    }
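With the placeholder valence and arousal values above, the engagement score is dominated by the SDS term, since SDS is on a 0-100 scale while valence and arousal are single-digit. A quick check with an illustrative SDS value:

valence, arousal, sds = 4.5, 3.2, 62.0
ves = 0.4 * valence + 0.3 * arousal + 0.3 * sds
# 1.8 + 0.96 + 18.6 = 21.36 -> the SDS term contributes most of the score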
voice_confidence_score/__init__.py
ADDED
File without changes
|
voice_confidence_score/main.py
ADDED
@@ -0,0 +1,11 @@
from .voice_confidence import calc_voice_confidence_score
import whisper

model_size = "base"
whisper_model = whisper.load_model(model_size)

audio_file = r"D:\Intern\shankh\audio_samples\obama_short.wav"

result = calc_voice_confidence_score(audio_file, whisper_model)

print(f"Voice Confidence Score: {result:.2f}")
voice_confidence_score/voice_confidence.py
ADDED
@@ -0,0 +1,38 @@
1 |
+
# voice confidence score = 0.4 * dominance + 0.3 * scs + 0.3 * fluency.
|
2 |
+
|
3 |
+
import whisper
|
4 |
+
from fluency.compute_fluency import compute_fluency_score
|
5 |
+
from vcs.compute_vcs import analyze_voice_quality
|
6 |
+
|
7 |
+
|
8 |
+
def calc_fluency_score(audio_path, whisper_model):
|
9 |
+
|
10 |
+
# Calculate fluency score
|
11 |
+
print(f"Analyzing fluency for {audio_path}...")
|
12 |
+
results = compute_fluency_score(audio_path, whisper_model)
|
13 |
+
fluency_score = results['fluency_score']
|
14 |
+
|
15 |
+
return fluency_score
|
16 |
+
|
17 |
+
def calc_vcs(audio_path, whisper_model):
|
18 |
+
|
19 |
+
|
20 |
+
# Calculate voice clarity score
|
21 |
+
print(f"Analyzing voice clarity for {audio_path}...")
|
22 |
+
results = analyze_voice_quality(audio_path, whisper_model)
|
23 |
+
vcs = results['VCS']
|
24 |
+
|
25 |
+
return vcs
|
26 |
+
|
27 |
+
dominance = 5.6 # dummy for now i add later
|
28 |
+
|
29 |
+
def calc_voice_confidence_score(audio_path, model):
|
30 |
+
|
31 |
+
fluency_score = calc_fluency_score(audio_path, model)
|
32 |
+
vcs = calc_vcs(audio_path, model)
|
33 |
+
|
34 |
+
# Calculate voice confidence score
|
35 |
+
voice_confidence_score = 0.4 * dominance + 0.3 * vcs + 0.3 * fluency_score
|
36 |
+
|
37 |
+
return voice_confidence_score
|
38 |
+
|
voice_confidence_score/voice_confidence_api.py
ADDED
@@ -0,0 +1,16 @@
import whisper
from .voice_confidence import calc_voice_confidence_score

def main(file_path: str, model_size: str = "base") -> dict:
    try:
        # Load the Whisper model
        whisper_model = whisper.load_model(model_size)

        # Calculate the voice confidence score
        result = calc_voice_confidence_score(file_path, whisper_model)

        # Return the result as a dictionary
        return {"voice_confidence_score": round(result, 2)}
    except Exception as e:
        return {"error": str(e)}
vps/__init__.py
ADDED
File without changes
|
vps/compute_vps_score.py
ADDED
@@ -0,0 +1,79 @@
1 |
+
from .vps import calculate_vps # Your file where calc_srs, calculate_pas, calculate_rcs, calculate_vps live
|
2 |
+
import librosa
|
3 |
+
import numpy as np
|
4 |
+
import math
|
5 |
+
from .filler_analyzer import detect_fillers
|
6 |
+
|
7 |
+
def compute_vps_score(file_path: str, whisper_model) -> dict:
|
8 |
+
"""
|
9 |
+
Compute VPS (Voice Pacing Score) and its components from a speech sample.
|
10 |
+
|
11 |
+
Args:
|
12 |
+
file_path (str): Path to the audio file.
|
13 |
+
whisper_model: Transcription model (e.g., OpenAI Whisper or faster-whisper)
|
14 |
+
|
15 |
+
Returns:
|
16 |
+
dict: A dictionary containing VPS, SRS, PAS, RCS, and component scores.
|
17 |
+
"""
|
18 |
+
# Transcribe
|
19 |
+
result = whisper_model.transcribe(file_path)
|
20 |
+
transcript = result.get("text", "").strip()
|
21 |
+
segments = result.get("segments", [])
|
22 |
+
|
23 |
+
# Validate early
|
24 |
+
if not transcript or not segments:
|
25 |
+
raise ValueError("Empty transcript or segments from Whisper.")
|
26 |
+
|
27 |
+
# Filler count
|
28 |
+
filler_count, _ = detect_fillers(transcript)
|
29 |
+
|
30 |
+
# Load audio
|
31 |
+
y, sr = librosa.load(file_path, sr=None)
|
32 |
+
duration = len(y) / sr if sr else 0.0
|
33 |
+
if duration <= 0:
|
34 |
+
raise ValueError("Audio duration invalid or zero.")
|
35 |
+
|
36 |
+
# Pitch variation (in semitones)
|
37 |
+
f0, voiced_flags, voiced_probs = librosa.pyin(
|
38 |
+
y, sr=sr, fmin=80, fmax=400, frame_length=1024, hop_length=256, fill_na=np.nan)
|
39 |
+
voiced_f0 = f0[~np.isnan(f0)]
|
40 |
+
pitch_variation = 0.0
|
41 |
+
if voiced_f0.size > 0:
|
42 |
+
median_f0 = np.nanmedian(voiced_f0)
|
43 |
+
median_f0 = max(median_f0, 1e-6)
|
44 |
+
semitone_diffs = 12 * np.log2(voiced_f0 / median_f0)
|
45 |
+
pitch_variation = float(np.nanstd(semitone_diffs))
|
46 |
+
|
47 |
+
# Pause analysis
|
48 |
+
long_pause_count = 0
|
49 |
+
if segments:
|
50 |
+
for i in range(len(segments) - 1):
|
51 |
+
pause_dur = segments[i + 1]["start"] - segments[i]["end"]
|
52 |
+
if pause_dur > 1.0:
|
53 |
+
long_pause_count += 1
|
54 |
+
# Beginning and end
|
55 |
+
if segments[0]["start"] > 1.0:
|
56 |
+
long_pause_count += 1
|
57 |
+
if duration - segments[-1]["end"] > 1.0:
|
58 |
+
long_pause_count += 1
|
59 |
+
|
60 |
+
# WPM
|
61 |
+
word_count = len(transcript.split())
|
62 |
+
words_per_min = (word_count / duration) * 60.0 if duration > 0 else 0.0
|
63 |
+
|
64 |
+
# Calculate VPS and components
|
65 |
+
vps_result = calculate_vps(
|
66 |
+
transcript=transcript,
|
67 |
+
segments=segments,
|
68 |
+
filler_count=filler_count,
|
69 |
+
duration=duration,
|
70 |
+
wpm=words_per_min,
|
71 |
+
long_pause_count=long_pause_count,
|
72 |
+
pitch_variation=pitch_variation,
|
73 |
+
y=y,
|
74 |
+
sr=sr
|
75 |
+
)
|
76 |
+
|
77 |
+
# Include transcript optionally
|
78 |
+
vps_result["transcript"] = transcript
|
79 |
+
return vps_result
|
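The long-pause count above includes gaps between consecutive Whisper segments plus any silence before the first and after the last segment. A compact illustration with made-up segment times:

segments = [{"start": 1.4, "end": 4.0}, {"start": 4.3, "end": 7.9}, {"start": 9.2, "end": 12.0}]
duration = 13.5

gaps = [segments[i + 1]["start"] - segments[i]["end"] for i in range(len(segments) - 1)]
long_pause_count = sum(1 for g in gaps if g > 1.0)           # 1 (the 1.3 s gap)
long_pause_count += segments[0]["start"] > 1.0               # leading silence of 1.4 s
long_pause_count += (duration - segments[-1]["end"]) > 1.0   # trailing silence of 1.5 s
# long_pause_count == 3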
vps/filler_analyzer.py
ADDED
@@ -0,0 +1,100 @@
1 |
+
# Define filler words for English, Hindi, Tamil (in both Latin and native scripts)
|
2 |
+
# Mapping each variant to a common label (usually the Latin script for insight reporting)
|
3 |
+
FILLER_VARIANTS = {
|
4 |
+
# English fillers
|
5 |
+
"um": "um", "uh": "uh", "hmm": "hmm", "ah": "ah", "er": "er",
|
6 |
+
"umm": "um", "uhh": "uh", "mmm": "hmm",
|
7 |
+
"like": "like", "you know": "you know", "so": "so", "well": "well",
|
8 |
+
# Hindi fillers (Devanagari and transliteration)
|
9 |
+
"मतलब": "matlab", "matlab": "matlab",
|
10 |
+
"क्या कहते हैं": "kya kehte hain", "kya kehte hain": "kya kehte hain",
|
11 |
+
"वो ना": "wo na", "woh na": "wo na", "wo na": "wo na",
|
12 |
+
"ऐसा है": "aisa hai", "aisa hai": "aisa hai",
|
13 |
+
"हाँ": "haan", "haan": "haan", "हा": "haan", # "हा" might appear as a shorter "haan"
|
14 |
+
"अच्छा": "acha", "acha": "acha",
|
15 |
+
# Tamil fillers (Tamil script and transliteration)
|
16 |
+
"பாத்தீங்கனா": "paatheenga-na", "paatheenga na": "paatheenga-na", "paatheenga-na": "paatheenga-na",
|
17 |
+
"அப்பரம்": "apparam", "apparam": "apparam",
|
18 |
+
"என்ன": "enna", "enna": "enna"
|
19 |
+
}
|
20 |
+
|
21 |
+
def detect_fillers(transcript):
|
22 |
+
"""
|
23 |
+
Detects filler words in the transcript.
|
24 |
+
|
25 |
+
Args:
|
26 |
+
transcript: Full transcript text
|
27 |
+
|
28 |
+
Returns:
|
29 |
+
tuple: (filler_count, filler_occurrences)
|
30 |
+
"""
|
31 |
+
transcript_lower = transcript.lower()
|
32 |
+
filler_count = 0
|
33 |
+
# Track which specific fillers were used (for insight examples)
|
34 |
+
filler_occurrences = {}
|
35 |
+
|
36 |
+
for variant, label in FILLER_VARIANTS.items():
|
37 |
+
if variant in transcript_lower:
|
38 |
+
count = transcript_lower.count(variant)
|
39 |
+
if count > 0:
|
40 |
+
filler_count += count
|
41 |
+
# Accumulate count for the normalized label
|
42 |
+
filler_occurrences[label] = filler_occurrences.get(label, 0) + count
|
43 |
+
|
44 |
+
return filler_count, filler_occurrences
|
45 |
+
|
46 |
+
def analyze_filler_words(filler_count, filler_occurrences, duration):
|
47 |
+
"""
|
48 |
+
Analyzes filler word usage in speech.
|
49 |
+
|
50 |
+
Args:
|
51 |
+
filler_count: Total count of filler words
|
52 |
+
filler_occurrences: Dictionary of specific filler words and their counts
|
53 |
+
duration: Duration of the audio in seconds
|
54 |
+
|
55 |
+
Returns:
|
56 |
+
dict: Contains the filler words score and insight text
|
57 |
+
"""
|
58 |
+
# Extract top examples for insights
|
59 |
+
filler_examples = []
|
60 |
+
if filler_occurrences:
|
61 |
+
# Sort by frequency
|
62 |
+
sorted_fillers = sorted(filler_occurrences.items(), key=lambda x: x[1], reverse=True)
|
63 |
+
for label, count in sorted_fillers[:2]:
|
64 |
+
filler_examples.append(label)
|
65 |
+
|
66 |
+
# Compute fillers per minute as a gauge
|
67 |
+
filler_per_min = (filler_count / duration) * 60.0 if duration > 0 else 0.0
|
68 |
+
|
69 |
+
if filler_count == 0:
|
70 |
+
filler_score = 10
|
71 |
+
elif filler_per_min < 1:
|
72 |
+
filler_score = 9
|
73 |
+
elif filler_per_min < 3:
|
74 |
+
filler_score = 8
|
75 |
+
elif filler_per_min < 5:
|
76 |
+
filler_score = 6
|
77 |
+
elif filler_per_min < 10:
|
78 |
+
filler_score = 4
|
79 |
+
else:
|
80 |
+
filler_score = 2
|
81 |
+
|
82 |
+
filler_score = max(0, filler_score)
|
83 |
+
|
84 |
+
# Generate insight text based on the score and examples
|
85 |
+
if filler_count == 0:
|
86 |
+
insight = "No filler words (um, ah, etc.) were detected, keeping the speech very clear."
|
87 |
+
elif filler_count <= 2:
|
88 |
+
example = filler_examples[0] if filler_examples else "um"
|
89 |
+
insight = f"Only a couple of filler words (e.g., '{example}') were used, which had minimal impact."
|
90 |
+
elif filler_count <= 5:
|
91 |
+
examples = ", ".join(f"'{ex}'" for ex in filler_examples) if filler_examples else "filler words"
|
92 |
+
insight = f"Some filler words {examples} were used occasionally; reducing them could improve clarity."
|
93 |
+
else:
|
94 |
+
examples = ", ".join(f"'{ex}'" for ex in filler_examples) if filler_examples else "'um'"
|
95 |
+
insight = f"Frequent filler words such as {examples} were detected, which can distract the audience and suggest uncertainty."
|
96 |
+
|
97 |
+
return {
|
98 |
+
"score": int(filler_score),
|
99 |
+
"insight": insight
|
100 |
+
}
|
vps/main.py
ADDED
@@ -0,0 +1,35 @@
import whisper
from .compute_vps_score import compute_vps_score  # Ensure this path is correct

def main():
    # 🔧 Set your input audio file path here
    audio_path = r"D:\Intern\shankh\audio_samples\obama_short.wav"

    # 🔧 Choose Whisper model (tiny, base, small, medium, large)
    model_size = "base"

    print(f"Loading Whisper model: {model_size}")
    whisper_model = whisper.load_model(model_size)

    print(f"Analyzing audio: {audio_path}")
    try:
        vps_result = compute_vps_score(audio_path, whisper_model)

        print("\n--- Voice Pacing Score (VPS) ---")
        print(f"VPS Score: {vps_result['VPS']:.2f}")
        print(f" - SRS (Speech Rate Stability): {vps_result['SRS']:.2f}")
        print(f" - PAS (Pause Appropriateness): {vps_result['PAS']:.2f}")
        print(f" - NPP: {vps_result['NPP']:.2f}")
        print(f" - AFW: {vps_result['AFW']:.2f}")
        print(f" - RCS (Rhythm Consistency): {vps_result['RCS']:.2f}")
        print(f" - STR: {vps_result['STR']:.2f}")
        print(f" - STW: {vps_result['STW']:.2f}")

        print("\nTranscript:")
        print(vps_result["transcript"])

    except Exception as e:
        print(f"[Error] {e}")

if __name__ == "__main__":
    main()
vps/vps.py
ADDED
@@ -0,0 +1,185 @@
from typing import List, Dict
import librosa
import numpy as np
import spacy
import math
from .filler_analyzer import detect_fillers

def calc_srs(wpm: float, filler_count: int, long_pause_count: int, pitch_variation: float) -> float:
    """
    Speech Rate Stability (SRS): Reflects the consistency of the speaker's pace and rhythm.
    """
    ideal_wpm = 150
    wpm_deviation = min(30, abs(wpm - ideal_wpm))
    wpm_consistency = max(0, 100 - (wpm_deviation * 1.67))

    filler_penalty = min(filler_count / 10, 1.0)
    pause_penalty = min(long_pause_count / 5, 1.0)
    pitch_penalty = min(pitch_variation / 3.0, 1.0)

    stability = (1 - ((filler_penalty + pause_penalty + pitch_penalty) / 3)) * 100
    SRS = (0.45 * wpm_consistency) + (0.55 * stability)
    return min(100, max(0, SRS))

def calculate_pas(transcript: str, segments: List[Dict], filler_count: int, duration: float) -> Dict[str, float]:
    """
    Calculate the Pause Appropriateness Score (PAS) and its components.
    """
    if not transcript or not segments or duration <= 0:
        raise ValueError("Transcript, segments, and duration must be valid")

    nlp = spacy.load("en_core_web_sm")
    doc = nlp(transcript)

    words = transcript.split()
    total_words = len(words)
    if total_words == 0:
        raise ValueError("No words found in transcript")

    filler_rate = filler_count / total_words if total_words > 0 else 0.0
    if filler_rate >= 0.10:
        afw = 0.0
    elif filler_rate <= 0.0:
        afw = 100.0
    else:
        afw = 100.0 - (filler_rate * 1000)
        afw = max(0.0, min(100.0, afw))

    total_pauses = 0
    natural_pauses = 0
    segment_texts = [seg["text"].strip() for seg in segments]
    segment_starts = [seg["start"] for seg in segments]
    segment_ends = [seg["end"] for seg in segments]

    for i in range(len(segments) - 1):
        pause_dur = segment_starts[i + 1] - segment_ends[i]
        if pause_dur > 0.5:
            total_pauses += 1
            if segment_texts[i] and segment_texts[i][-1] in ".!?,":
                natural_pauses += 1

    if segment_starts[0] > 0.5:
        total_pauses += 1
    if duration - segment_ends[-1] > 0.5:
        total_pauses += 1
        if segment_texts[-1] and segment_texts[-1][-1] in ".!?":
            natural_pauses += 1

    npp = 100.0 if total_pauses == 0 else (natural_pauses / total_pauses) * 100.0
    pas = (0.4 * npp) + (0.6 * afw)

    return {
        "NPP": npp,
        "AFW": afw,
        "PAS": pas
    }

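(Aside, not part of the file: a numeric illustration of the AFW branch above, with invented numbers. A filler rate of 4% falls in the middle branch, so each percentage point of fillers costs 10 points.)

# Illustrative arithmetic only: 8 fillers in 200 words -> filler_rate = 0.04
filler_rate = 8 / 200
afw = 100.0 - (filler_rate * 1000)   # 100 - 40 = 60.0
# With, say, NPP = 75.0, the pause score would be:
pas = (0.4 * 75.0) + (0.6 * afw)     # 30 + 36 = 66.0
print(afw, pas)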
def calculate_rcs(y: np.ndarray, sr: int, segments: List[Dict], duration: float) -> Dict[str, float]:
    """
    Calculate the Rhythm Consistency Score (RCS) and its components.
    """
    if y.size == 0 or sr <= 0 or duration <= 0 or not segments:
        raise ValueError("Audio signal, sampling rate, duration, and segments must be valid")

    onset_env = librosa.onset.onset_strength(y=y, sr=sr, hop_length=256)
    onsets = librosa.onset.onset_detect(onset_envelope=onset_env, sr=sr, units='time', hop_length=256)

    if len(onsets) > 1:
        iois = np.diff(onsets)
        ioi_std = np.std(iois)
        ioi_std = min(max(ioi_std, 0.1), 0.5)
        str_score = 100.0 * (0.5 - ioi_std) / (0.5 - 0.1)
        str_score = max(0.0, min(100.0, str_score))
    else:
        str_score = 100.0

    total_transitions = 0
    smooth_transitions = 0
    pause_threshold = 0.3

    for i in range(len(segments) - 1):
        gap = segments[i + 1]["start"] - segments[i]["end"]
        total_transitions += 1
        if gap <= pause_threshold:
            smooth_transitions += 1

    for segment in segments:
        words = segment["text"].strip().split()
        if len(words) > 1:
            smooth_transitions += len(words) - 1
            total_transitions += len(words) - 1

    stw = 100.0 if total_transitions == 0 else (smooth_transitions / total_transitions) * 100.0
    rcs = (0.5 * str_score) + (0.5 * stw)

    return {
        "STR": str_score,
        "STW": stw,
        "RCS": rcs
    }

def calculate_vps(
    transcript: str,
    segments: List[Dict],
    filler_count: int,
    duration: float,
    wpm: float,
    long_pause_count: int,
    pitch_variation: float,
    y: np.ndarray,
    sr: int
) -> Dict[str, float]:
    """
    Calculate the Voice Pacing Score (VPS) and its components:
    - SRS: Speech Rate Stability Score
    - PAS: Pause Appropriateness Score
    - RCS: Rhythm Consistency Score
    - VPS = (0.5 * SRS) + (0.3 * PAS) + (0.2 * RCS)

    Args:
        transcript (str): Transcribed text.
        segments (List[Dict]): Whisper model segments with 'start', 'end', 'text'.
        filler_count (int): Number of filler words.
        duration (float): Audio duration (seconds).
        wpm (float): Words per minute.
        long_pause_count (int): Number of long pauses (>1.0s).
        pitch_variation (float): Pitch variation in semitones.
        y (np.ndarray): Audio signal.
        sr (int): Sampling rate.

    Returns:
        Dict[str, float]: Scores for SRS, PAS, RCS, VPS, and intermediates.
    """
    # Validate inputs
    if not transcript or not segments or duration <= 0 or y.size == 0 or sr <= 0:
        raise ValueError("Invalid inputs")

    # Calculate SRS
    srs = calc_srs(wpm, filler_count, long_pause_count, pitch_variation)

    # Calculate PAS
    pas_result = calculate_pas(transcript, segments, filler_count, duration)
    pas = pas_result["PAS"]
    npp = pas_result["NPP"]
    afw = pas_result["AFW"]

    # Calculate RCS
    rcs_result = calculate_rcs(y, sr, segments, duration)
    rcs = rcs_result["RCS"]
    str_score = rcs_result["STR"]
    stw = rcs_result["STW"]

    # Calculate VPS
    vps = (0.5 * srs) + (0.3 * pas) + (0.2 * rcs)
    vps = max(0.0, min(100.0, vps))

    return {
        "SRS": srs,
        "PAS": pas,
        "NPP": npp,
        "AFW": afw,
        "RCS": rcs,
        "STR": str_score,
        "STW": stw,
        "VPS": vps
    }
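To make the final weighting concrete, a short sketch with made-up component scores (illustrative only, not outputs from this code):

# Illustrative arithmetic only: invented component scores plugged into the VPS formula above.
srs, pas, rcs = 80.0, 70.0, 90.0
vps = (0.5 * srs) + (0.3 * pas) + (0.2 * rcs)   # 40.0 + 21.0 + 18.0
print(vps)  # 79.0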
vps/vps_api.py
ADDED
@@ -0,0 +1,25 @@
import whisper
from .compute_vps_score import compute_vps_score

def main(file_path: str, model_size: str = "base") -> dict:
    try:
        # Load the Whisper model
        whisper_model = whisper.load_model(model_size)

        # Calculate the voice pacing score (VPS)
        result = compute_vps_score(file_path, whisper_model)

        # Return the result as a dictionary
        return {
            "VPS": result["VPS"]
            # "SRS": result["SRS"],
            # "PAS": result["PAS"],
            # "NPP": result["NPP"],
            # "AFW": result["AFW"],
            # "RCS": result["RCS"],
            # "STR": result["STR"],
            # "STW": result["STW"]
        }
    except Exception as e:
        return {"error": str(e)}
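For context, a helper like vps_api.main(file_path) is typically exposed behind an HTTP endpoint. A minimal sketch of such wiring, assuming FastAPI, a /vps route name, and temp-file handling that are not shown in this excerpt and may differ from the actual app in this repo:

# Hypothetical FastAPI wiring (illustrative only; the real app wiring may differ).
import shutil
import tempfile

from fastapi import FastAPI, File, UploadFile

from vps.vps_api import main as vps_main

app = FastAPI()

@app.post("/vps")  # route path is an assumption
async def score_vps(file: UploadFile = File(...)):
    # Persist the upload to a temporary file so Whisper/librosa can read it from disk.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        shutil.copyfileobj(file.file, tmp)
        tmp_path = tmp.name
    return vps_main(tmp_path)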