catiR
committed on
Commit
·
e2901c5
1
Parent(s):
0d95ee8
run clustering
Browse files
- scripts/runSQ.py +8 -36
scripts/runSQ.py
CHANGED
|
@@ -192,12 +192,19 @@ def f0_tts(sentence, voices, ttsdir, reaper_path = "REAPER/build/reaper"):
|
|
| 192 |
fpath = f'{ttsdir}{dpath}/{v}.f0'
|
| 193 |
if not os.path.exists(fpath):
|
| 194 |
no_f0.append(v)
|
| 195 |
-
|
|
|
|
|
|
|
|
|
|
| 196 |
if no_f0:
|
| 197 |
print(f'Need to estimate pitch for {len(no_f0)} voices')
|
| 198 |
for v in voices:
|
| 199 |
wav_path = f'{ttsdir}{dpath}/{v}.wav'
|
| 200 |
fpath = f'{ttsdir}{dpath}/{v}.f0'
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
f0_data = estimate_pitch(wav_path, reaper_path)
|
| 202 |
save_pitch(f0_data,fpath)
|
| 203 |
|
|
@@ -251,41 +258,6 @@ def localtest():
|
|
| 251 |
|
| 252 |
# https://colab.research.google.com/drive/1RApnJEocx3-mqdQC2h5SH8vucDkSlQYt?authuser=1#scrollTo=410ecd91fa29bc73
|
| 253 |
|
| 254 |
-
# CLUSTER the humans
|
| 255 |
-
# - read energy and pitch, to alignments
|
| 256 |
-
# - dtw based with selected chunking ? code should exist.
|
| 257 |
-
|
| 258 |
-
# ... experimental variants?
|
| 259 |
-
# ** 1 dimension at a time vs 2 on top of each other
|
| 260 |
-
# ** 25 points resampling (euclidean, kmeans, i guess....) vs all points dtw kmediods
|
| 261 |
-
# +/or maybe some intermediate parts of that??? like 25 points dtw medoids particularly **
|
| 262 |
-
# --different normings for pitch? different settings for energy (tbqh i hope not too much?)
|
| 263 |
-
# TODO '''replacement with a constant low value''' ********
|
| 264 |
-
# errrrrrrrm duration?
|
| 265 |
-
# duration feature vector will have a different length than the others, BUT,
|
| 266 |
-
# besides the single clustering,,
|
| 267 |
-
# i SUPPOSE one could TRY assigning the phone's 'speech rate' value to every frame of the phone, so it doesn't change while the other 2 values do change.... like it would still VAGUELY represent that 2 people elongating the same vowel/syllable are doing similar things with duration while someone eliding that vowel is doing a different durational thing right there?
|
| 268 |
-
# might want to z-score this dimension across ALL speakers tho not within a speaker
|
| 269 |
-
# try doing it both ways at least. bc not sure to what extent i want absolute vs. relative rate info here.
|
| 270 |
-
#(note - unless chengs dur metric is of a kind where only rel makes sense in the first place. idr.)
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
# GRAPH the humans.
|
| 276 |
-
# - probably modify this code a bit to centre on boundary.
|
| 277 |
-
# - idk.
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
# TEST each TTS
|
| 281 |
-
# - structure its features
|
| 282 |
-
# - find its avg dist for each human cluster
|
| 283 |
-
# - find the lowest dist cluster
|
| 284 |
-
# - report the dist for i guess this and all clusters
|
| 285 |
-
# - GRAPH the tts with its best cluster
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
# EVALUATION
|
| 290 |
# - of the tts
|
| 291 |
# - of the method: consistency? coherency / interpretability of 'best' voice across different features; alt. ability to recover good & problematic features from a combined method if that is chosen as the best?
|
|
|
|
| 192 |
fpath = f'{ttsdir}{dpath}/{v}.f0'
|
| 193 |
if not os.path.exists(fpath):
|
| 194 |
no_f0.append(v)
|
| 195 |
+
|
| 196 |
+
ttt = subprocess.run(["ls", "-la", "ttsdir"], capture_output=True, text=True)
|
| 197 |
+
print('LS::', ttt.stdout)
|
| 198 |
+
|
| 199 |
if no_f0:
|
| 200 |
print(f'Need to estimate pitch for {len(no_f0)} voices')
|
| 201 |
for v in voices:
|
| 202 |
wav_path = f'{ttsdir}{dpath}/{v}.wav'
|
| 203 |
fpath = f'{ttsdir}{dpath}/{v}.f0'
|
| 204 |
+
|
| 205 |
+
print(wav_path)
|
| 206 |
+
print(fpath)
|
| 207 |
+
|
| 208 |
f0_data = estimate_pitch(wav_path, reaper_path)
|
| 209 |
save_pitch(f0_data,fpath)
|
| 210 |
|
|
|
|
| 258 |
|
| 259 |
# https://colab.research.google.com/drive/1RApnJEocx3-mqdQC2h5SH8vucDkSlQYt?authuser=1#scrollTo=410ecd91fa29bc73
|
| 260 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 261 |
# EVALUATION
|
| 262 |
# - of the tts
|
| 263 |
# - of the method: consistency? coherency / interpretability of 'best' voice across different features; alt. ability to recover good & problematic features from a combined method if that is chosen as the best?
|