Spaces:
Sleeping
Sleeping
Refactor SVS evaluation and move SingMOS functions to svs_eval.py; add pitch interval and chroma entropy
Browse files- server.py +3 -4
- svs_eval.py +113 -0
- svs_utils.py +0 -17
server.py
CHANGED
@@ -3,9 +3,7 @@ from fastapi.responses import FileResponse, JSONResponse
|
|
3 |
import base64
|
4 |
import argparse
|
5 |
import librosa
|
6 |
-
import torch
|
7 |
import tempfile
|
8 |
-
import os
|
9 |
from transformers import pipeline
|
10 |
import re
|
11 |
from svs_utils import svs_warmup, svs_inference
|
@@ -14,7 +12,8 @@ import soundfile as sf
|
|
14 |
from pypinyin import lazy_pinyin
|
15 |
import jiwer
|
16 |
import librosa
|
17 |
-
from svs_utils import
|
|
|
18 |
|
19 |
app = FastAPI()
|
20 |
|
@@ -49,7 +48,7 @@ config = argparse.Namespace(
|
|
49 |
|
50 |
# load model
|
51 |
svs_model = svs_warmup(config)
|
52 |
-
predictor
|
53 |
sample_rate = 44100
|
54 |
|
55 |
# load dataset for random_select
|
|
|
3 |
import base64
|
4 |
import argparse
|
5 |
import librosa
|
|
|
6 |
import tempfile
|
|
|
7 |
from transformers import pipeline
|
8 |
import re
|
9 |
from svs_utils import svs_warmup, svs_inference
|
|
|
12 |
from pypinyin import lazy_pinyin
|
13 |
import jiwer
|
14 |
import librosa
|
15 |
+
from svs_utils import load_song_database, estimate_sentence_length
|
16 |
+
from svs_eval import singmos_warmup, singmos_evaluation
|
17 |
|
18 |
app = FastAPI()
|
19 |
|
|
|
48 |
|
49 |
# load model
|
50 |
svs_model = svs_warmup(config)
|
51 |
+
predictor = singmos_warmup()
|
52 |
sample_rate = 44100
|
53 |
|
54 |
# load dataset for random_select
|
svs_eval.py
ADDED
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import librosa
|
2 |
+
import pyworld as pw
|
3 |
+
import numpy as np
|
4 |
+
import torch
|
5 |
+
|
6 |
+
|
7 |
+
def singmos_warmup():
    """Load the SingMOS predictor through ``torch.hub``.

    Returns:
        The object produced by ``torch.hub.load`` for the
        ``singing_ssl_mos`` entry point of ``South-Twilight/SingMOS:v0.2.0``.
    """
    hub_repo = "South-Twilight/SingMOS:v0.2.0"
    entry_point = "singing_ssl_mos"
    return torch.hub.load(hub_repo, entry_point, trust_repo=True)
|
12 |
+
|
13 |
+
|
14 |
+
def singmos_evaluation(predictor, wav_info, fs):
    """Score a waveform with the SingMOS predictor.

    Args:
        predictor: Callable invoked as ``predictor(waveform_batch, lengths)``.
        wav_info: Audio samples as a NumPy array sampled at ``fs``
            (presumably a mono 1-D signal -- confirm against callers).
        fs: Sampling rate of ``wav_info`` in Hz.

    Returns:
        Whatever ``predictor`` returns for the single-item batch.
    """
    # The waveform is resampled to 16 kHz before being handed to the predictor.
    resampled = librosa.resample(wav_info, orig_sr=fs, target_sr=16000)
    # Add a leading batch axis so the predictor sees a batch of one.
    batch = torch.from_numpy(resampled).unsqueeze(0)
    lengths = torch.tensor([batch.shape[1]])
    return predictor(batch, lengths)
|
20 |
+
|
21 |
+
|
22 |
+
def pitch_interval_evaluation(y, fs):
    """Measure frame-to-frame pitch movement of a waveform in semitones.

    The f0 contour is estimated with pyworld (``dio`` refined by
    ``stonemask``), converted to MIDI note numbers, and differenced between
    consecutive frames.

    Args:
        y: Audio samples as a NumPy array.
        fs: Sampling rate of ``y`` in Hz.

    Returns:
        Tuple ``(interval_mean, interval_std)``: the mean absolute interval
        and the standard deviation of the signed intervals. Both are
        ``np.nan`` when there are no two adjacent frames with a non-zero f0.
    """
    # Convert once; both pyworld calls receive the same float64 signal.
    samples = y.astype(np.float64)
    _f0, t = pw.dio(samples, fs)
    f0 = pw.stonemask(samples, _f0, t, fs)

    # Frames with f0 == 0 carry no pitch; mark them NaN so they drop out below
    # (and so hz_to_midi is never asked for log2(0)).
    f0[f0 == 0] = np.nan
    midi_f0 = librosa.hz_to_midi(f0)

    if len(midi_f0) < 2:
        return np.nan, np.nan

    # only consider the intervals between notes: any difference that touches
    # a NaN frame is itself NaN and is filtered out here.
    intervals = np.diff(midi_f0)
    intervals = intervals[~np.isnan(intervals)]

    # Without this guard np.mean/np.std would run on an empty array and emit
    # RuntimeWarnings before yielding the same NaN results.
    if intervals.size == 0:
        return np.nan, np.nan

    interval_mean = np.mean(np.abs(intervals))
    interval_std = np.std(intervals)
    return interval_mean, interval_std
|
38 |
+
|
39 |
+
|
40 |
+
def chroma_entropy_evaluation(y, fs):
    """Return the mean per-frame Shannon entropy (in bits) of the chromagram.

    Each chroma frame is normalised to sum to one and treated as a
    distribution over pitch classes; the entropy ``-sum(p * log2(p))`` is
    computed per frame and averaged over all frames.

    Args:
        y: Audio samples as a NumPy array.
        fs: Sampling rate of ``y`` in Hz.

    Returns:
        The mean entropy across frames as a non-negative scalar.
    """
    chroma = librosa.feature.chroma_cqt(y=y, sr=fs)
    # Floor the per-frame totals so silent frames do not divide by zero.
    chroma_sum = np.sum(chroma, axis=0, keepdims=True)
    chroma_sum = np.clip(chroma_sum, 1e-6, None)
    chroma_norm = chroma / chroma_sum
    # Floor the probabilities so log2 never sees zero.
    chroma_norm = np.clip(chroma_norm, 1e-6, 1.0)
    # Shannon entropy carries a leading minus sign; p * log2(p) is <= 0.
    entropy = -np.sum(chroma_norm * np.log2(chroma_norm), axis=0)
    return np.mean(entropy)
|
48 |
+
|
49 |
+
|
50 |
+
if __name__ == "__main__":
    # Command-line entry point: evaluate one wav file and append its metrics
    # as a row of a CSV file.
    import argparse
    from pathlib import Path

    parser = argparse.ArgumentParser()
    # Both options are used unconditionally below, so make argparse reject a
    # missing one instead of failing later on a None value.
    parser.add_argument(
        "--wav_path",
        type=Path,
        required=True,
        help="Path to the wav file",
    )
    parser.add_argument(
        "--results_csv",
        type=Path,
        required=True,
        help="csv file to save the results",
    )

    args = parser.parse_args()

    args.results_csv.parent.mkdir(parents=True, exist_ok=True)

    # sr=None keeps the file's native sampling rate.
    y, fs = librosa.load(args.wav_path, sr=None)

    # warmup
    predictor = singmos_warmup()

    # singmos evaluation
    score = singmos_evaluation(predictor, y, fs)

    # pitch interval evaluation
    interval_mean, interval_std = pitch_interval_evaluation(y, fs)
    # chroma entropy evaluation
    chroma_entropy = chroma_entropy_evaluation(y, fs)

    # # visualize
    # import matplotlib.pyplot as plt
    # import librosa.display
    # chroma = librosa.feature.chroma_cqt(y=y, sr=fs)
    # img = librosa.display.specshow(chroma, y_axis='chroma', x_axis='time')
    # plt.colorbar(img)
    # plt.savefig(args.results_csv.parent / args.wav_path.with_suffix('.png'))

    # save results
    # NOTE(review): `score` is written via str(); if the predictor returns a
    # tensor the CSV cell will hold its repr -- confirm the desired format.
    results = {
        "singmos": score,
        "pitch_interval_mean": interval_mean,
        "pitch_interval_std": interval_std,
        "chroma_entropy": chroma_entropy,
    }

    with open(args.results_csv, "a") as f:
        header = "file," + ",".join(results.keys()) + "\n"
        if f.tell() == 0:
            # Empty (or newly created) file: start it with the header row.
            f.write(header)
        else:
            # Existing file: refuse to append rows under a different header.
            with open(args.results_csv, "r") as f2:
                file_header = f2.readline()
            if file_header != header:
                raise ValueError(
                    f"Header mismatch: {file_header} vs {header}"
                )

        line = ",".join([str(args.wav_path)] + [str(v) for v in results.values()]) + "\n"
        f.write(line)
|
svs_utils.py
CHANGED
@@ -1,9 +1,7 @@
|
|
1 |
import json
|
2 |
import random
|
3 |
|
4 |
-
import librosa
|
5 |
import numpy as np
|
6 |
-
import torch
|
7 |
from espnet2.bin.svs_inference import SingingGenerate
|
8 |
from espnet_model_zoo.downloader import ModelDownloader
|
9 |
|
@@ -227,21 +225,6 @@ def svs_inference(answer_text, svs_model, config, **kwargs):
|
|
227 |
return wav_info
|
228 |
|
229 |
|
230 |
-
def singmos_warmup():
|
231 |
-
predictor = torch.hub.load(
|
232 |
-
"South-Twilight/SingMOS:v0.2.0", "singing_ssl_mos", trust_repo=True
|
233 |
-
)
|
234 |
-
return predictor, "South-Twilight/SingMOS:v0.2.0"
|
235 |
-
|
236 |
-
|
237 |
-
def singmos_evaluation(predictor, wav_info, fs):
|
238 |
-
wav_mos = librosa.resample(wav_info, orig_sr=fs, target_sr=16000)
|
239 |
-
wav_mos = torch.from_numpy(wav_mos).unsqueeze(0)
|
240 |
-
len_mos = torch.tensor([wav_mos.shape[1]])
|
241 |
-
score = predictor(wav_mos, len_mos)
|
242 |
-
return score
|
243 |
-
|
244 |
-
|
245 |
def estimate_sentence_length(query, config, song2note_lengths):
|
246 |
if config.melody_source == "random_select.touhou":
|
247 |
song_name = "touhou"
|
|
|
1 |
import json
|
2 |
import random
|
3 |
|
|
|
4 |
import numpy as np
|
|
|
5 |
from espnet2.bin.svs_inference import SingingGenerate
|
6 |
from espnet_model_zoo.downloader import ModelDownloader
|
7 |
|
|
|
225 |
return wav_info
|
226 |
|
227 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
228 |
def estimate_sentence_length(query, config, song2note_lengths):
|
229 |
if config.melody_source == "random_select.touhou":
|
230 |
song_name = "touhou"
|