jhansss committed on
Commit
1ad80f6
·
1 Parent(s): 427f657

Refactor SVS evaluation and move SingMOS functions to svs_eval.py; add pitch interval and chroma entropy

Browse files
Files changed (3) hide show
  1. server.py +3 -4
  2. svs_eval.py +113 -0
  3. svs_utils.py +0 -17
server.py CHANGED
@@ -3,9 +3,7 @@ from fastapi.responses import FileResponse, JSONResponse
3
  import base64
4
  import argparse
5
  import librosa
6
- import torch
7
  import tempfile
8
- import os
9
  from transformers import pipeline
10
  import re
11
  from svs_utils import svs_warmup, svs_inference
@@ -14,7 +12,8 @@ import soundfile as sf
14
  from pypinyin import lazy_pinyin
15
  import jiwer
16
  import librosa
17
- from svs_utils import singmos_warmup, singmos_evaluation, load_song_database, estimate_sentence_length
 
18
 
19
  app = FastAPI()
20
 
@@ -49,7 +48,7 @@ config = argparse.Namespace(
49
 
50
  # load model
51
  svs_model = svs_warmup(config)
52
- predictor, _ = singmos_warmup()
53
  sample_rate = 44100
54
 
55
  # load dataset for random_select
 
3
  import base64
4
  import argparse
5
  import librosa
 
6
  import tempfile
 
7
  from transformers import pipeline
8
  import re
9
  from svs_utils import svs_warmup, svs_inference
 
12
  from pypinyin import lazy_pinyin
13
  import jiwer
14
  import librosa
15
+ from svs_utils import load_song_database, estimate_sentence_length
16
+ from svs_eval import singmos_warmup, singmos_evaluation
17
 
18
  app = FastAPI()
19
 
 
48
 
49
  # load model
50
  svs_model = svs_warmup(config)
51
+ predictor = singmos_warmup()
52
  sample_rate = 44100
53
 
54
  # load dataset for random_select
svs_eval.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import librosa
2
+ import pyworld as pw
3
+ import numpy as np
4
+ import torch
5
+
6
+
7
def singmos_warmup():
    """Load the SingMOS predictor via ``torch.hub`` and return it.

    The hub repository is pinned to the ``v0.2.0`` tag and loaded with
    ``trust_repo=True``. The call may need network access when the repo is
    not already cached locally.

    Returns:
        The object returned by ``torch.hub.load`` for the
        ``singing_ssl_mos`` entry point.
    """
    hub_repo = "South-Twilight/SingMOS:v0.2.0"
    entry_point = "singing_ssl_mos"
    return torch.hub.load(hub_repo, entry_point, trust_repo=True)
12
+
13
+
14
def singmos_evaluation(predictor, wav_info, fs):
    """Score a waveform with the SingMOS predictor.

    Args:
        predictor: Callable taking ``(waveform_batch, lengths)``, e.g. the
            object returned by ``singmos_warmup``.
        wav_info: NumPy waveform sampled at ``fs``
            (assumes a 1-D mono signal -- TODO confirm with callers).
        fs: Sample rate of ``wav_info`` in Hz.

    Returns:
        Whatever ``predictor`` returns for the single-item batch.
    """
    # The waveform is resampled to 16 kHz before being handed to the predictor.
    resampled = librosa.resample(wav_info, orig_sr=fs, target_sr=16000)
    # Add a leading batch dimension; dimension 1 is then used as the length.
    batch = torch.from_numpy(resampled).unsqueeze(0)
    lengths = torch.tensor([batch.shape[1]])
    return predictor(batch, lengths)
20
+
21
+
22
def pitch_interval_evaluation(y, fs):
    """Summarise frame-to-frame pitch movement of a waveform in semitones.

    F0 is estimated with ``pw.dio`` and refined with ``pw.stonemask``.
    Frames whose F0 is exactly 0 are masked out (presumably unvoiced frames
    -- confirm against the pyworld docs), the remainder is converted to MIDI
    note numbers, and differences between consecutive frames are taken.
    Any difference that touches a masked frame is discarded.

    Args:
        y: NumPy waveform; it is cast to ``float64`` before analysis.
        fs: Sample rate of ``y`` in Hz.

    Returns:
        Tuple ``(interval_mean, interval_std)``: the mean of the absolute
        intervals and the standard deviation of the signed intervals.
        ``(nan, nan)`` is returned when there are fewer than two frames or
        when no pair of adjacent frames is both pitched.
    """
    # Cast once and reuse for both analysis stages.
    signal = y.astype(np.float64)
    coarse_f0, frame_times = pw.dio(signal, fs)
    f0 = pw.stonemask(signal, coarse_f0, frame_times, fs)

    # NaN propagates through hz_to_midi and np.diff, so every interval that
    # involves a zero-F0 frame can be filtered out afterwards.
    f0[f0 == 0] = np.nan
    midi_f0 = librosa.hz_to_midi(f0)

    if len(midi_f0) < 2:
        return np.nan, np.nan

    # only consider the intervals between notes
    intervals = np.diff(midi_f0)
    intervals = intervals[~np.isnan(intervals)]
    if intervals.size == 0:
        # No usable interval: return NaN explicitly rather than letting
        # np.mean / np.std warn about an empty slice.
        return np.nan, np.nan

    interval_mean = np.mean(np.abs(intervals))
    interval_std = np.std(intervals)
    return interval_mean, interval_std
38
+
39
+
40
def chroma_entropy_evaluation(y, fs):
    """Return the mean per-frame Shannon entropy (in bits) of the chromagram.

    Each column of ``librosa.feature.chroma_cqt`` is normalised to sum to 1
    and treated as a distribution over its rows. The entropy
    ``-sum(p * log2(p))`` is computed per column and averaged over columns.

    Args:
        y: NumPy waveform.
        fs: Sample rate of ``y`` in Hz.

    Returns:
        Mean entropy across frames, a non-negative scalar.
    """
    chroma = librosa.feature.chroma_cqt(y=y, sr=fs)
    # Clip the per-frame total so silent frames do not divide by zero.
    frame_totals = np.clip(np.sum(chroma, axis=0, keepdims=True), 1e-6, None)
    # Clip probabilities away from zero so log2 stays finite.
    chroma_norm = np.clip(chroma / frame_totals, 1e-6, 1.0)
    # Shannon entropy carries a leading minus sign; without it the result is
    # the negated entropy (always <= 0).
    entropy = -np.sum(chroma_norm * np.log2(chroma_norm), axis=0)
    return np.mean(entropy)
48
+
49
+
50
if __name__ == "__main__":
    # Command-line entry point: evaluate one wav file and append the metrics
    # as a row to a CSV file, writing the header first if the file is new.
    import argparse
    import csv
    from pathlib import Path

    parser = argparse.ArgumentParser()
    # Both options are dereferenced unconditionally below, so they are
    # required; omitting one now yields a usage error instead of a crash.
    parser.add_argument(
        "--wav_path",
        type=Path,
        required=True,
        help="Path to the wav file",
    )
    parser.add_argument(
        "--results_csv",
        type=Path,
        required=True,
        help="csv file to save the results",
    )
    args = parser.parse_args()

    args.results_csv.parent.mkdir(parents=True, exist_ok=True)

    # sr=None keeps the file's native sample rate.
    y, fs = librosa.load(args.wav_path, sr=None)

    # warmup
    predictor = singmos_warmup()

    # singmos evaluation
    score = singmos_evaluation(predictor, y, fs)

    # pitch interval evaluation
    interval_mean, interval_std = pitch_interval_evaluation(y, fs)
    # chroma entropy evaluation
    chroma_entropy = chroma_entropy_evaluation(y, fs)

    # # visualize
    # import matplotlib.pyplot as plt
    # import librosa.display
    # chroma = librosa.feature.chroma_cqt(y=y, sr=fs)
    # img = librosa.display.specshow(chroma, y_axis='chroma', x_axis='time')
    # plt.colorbar(img)
    # plt.savefig(args.results_csv.parent / args.wav_path.with_suffix('.png'))

    # save results
    # NOTE(review): `score` is written via str(); if the predictor returns a
    # tensor, consider converting it to a plain float first -- confirm the
    # predictor's return type.
    results = {
        "singmos": score,
        "pitch_interval_mean": interval_mean,
        "pitch_interval_std": interval_std,
        "chroma_entropy": chroma_entropy,
    }

    header = "file," + ",".join(results.keys()) + "\n"

    # Decide whether a header is needed from the file size on disk, rather
    # than from tell() on an append-mode handle.
    needs_header = (
        not args.results_csv.exists() or args.results_csv.stat().st_size == 0
    )
    if not needs_header:
        # Refuse to append to a file whose columns do not match.
        with open(args.results_csv, "r") as existing:
            file_header = existing.readline()
        if file_header != header:
            raise ValueError(
                f"Header mismatch: {file_header} vs {header}"
            )

    with open(args.results_csv, "a") as f:
        if needs_header:
            f.write(header)
        # csv.writer quotes any field containing a comma, so the row keeps
        # the same column count as the header.
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow([str(args.wav_path), *results.values()])
svs_utils.py CHANGED
@@ -1,9 +1,7 @@
1
  import json
2
  import random
3
 
4
- import librosa
5
  import numpy as np
6
- import torch
7
  from espnet2.bin.svs_inference import SingingGenerate
8
  from espnet_model_zoo.downloader import ModelDownloader
9
 
@@ -227,21 +225,6 @@ def svs_inference(answer_text, svs_model, config, **kwargs):
227
  return wav_info
228
 
229
 
230
- def singmos_warmup():
231
- predictor = torch.hub.load(
232
- "South-Twilight/SingMOS:v0.2.0", "singing_ssl_mos", trust_repo=True
233
- )
234
- return predictor, "South-Twilight/SingMOS:v0.2.0"
235
-
236
-
237
- def singmos_evaluation(predictor, wav_info, fs):
238
- wav_mos = librosa.resample(wav_info, orig_sr=fs, target_sr=16000)
239
- wav_mos = torch.from_numpy(wav_mos).unsqueeze(0)
240
- len_mos = torch.tensor([wav_mos.shape[1]])
241
- score = predictor(wav_mos, len_mos)
242
- return score
243
-
244
-
245
  def estimate_sentence_length(query, config, song2note_lengths):
246
  if config.melody_source == "random_select.touhou":
247
  song_name = "touhou"
 
1
  import json
2
  import random
3
 
 
4
  import numpy as np
 
5
  from espnet2.bin.svs_inference import SingingGenerate
6
  from espnet_model_zoo.downloader import ModelDownloader
7
 
 
225
  return wav_info
226
 
227
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
  def estimate_sentence_length(query, config, song2note_lengths):
229
  if config.melody_source == "random_select.touhou":
230
  song_name = "touhou"