Han Jionghao committed on
Commit
f98847a
·
unverified ·
2 Parent(s): 810614d 017498a

Merge pull request #2 from HANJionghao/main

Browse files

Move Evaluation to svs_eval.py; Add Score Metrics; Update Model Path

Files changed (5) hide show
  1. requirements.txt +6 -1
  2. server.py +5 -6
  3. svs_eval.py +120 -0
  4. svs_utils.py +17 -19
  5. util.py +2 -2
requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
- git+https://github.com/South-Twilight/espnet==202402
2
  espnet_model_zoo
3
  # pyopenjtalk
4
  datasets
@@ -9,3 +9,8 @@ fastapi
9
  uvicorn
10
  fugashi
11
  pykakasi
 
 
 
 
 
 
1
+ git+https://github.com/espnet/espnet
2
  espnet_model_zoo
3
  # pyopenjtalk
4
  datasets
 
9
  uvicorn
10
  fugashi
11
  pykakasi
12
+ basic-pitch[onnx]
13
+ audiobox_aesthetics
14
+ transformers
15
+ s3prl
16
+ git+https://github.com/sea-turt1e/kanjiconv
server.py CHANGED
@@ -1,9 +1,7 @@
1
  import base64
2
  import argparse
3
  import librosa
4
- import torch
5
  import tempfile
6
- import os
7
  from transformers import pipeline
8
  import re
9
  from svs_utils import svs_warmup, svs_inference
@@ -12,7 +10,8 @@ import soundfile as sf
12
  from pypinyin import lazy_pinyin
13
  import jiwer
14
  import librosa
15
- from svs_utils import singmos_warmup, singmos_evaluation, load_song_database, estimate_sentence_length
 
16
 
17
 
18
  asr_pipeline = pipeline(
@@ -37,7 +36,7 @@ SYSTEM_PROMPT = """
37
 
38
 
39
  config = argparse.Namespace(
40
- model_path="espnet/mixdata_svs_visinger2_spkembed_lang_pretrained",
41
  cache_dir="cache",
42
  device="cuda", # "cpu"
43
  melody_source="random_select.touhou", # "random_select.take_lyric_continuation"
@@ -47,8 +46,8 @@ config = argparse.Namespace(
47
 
48
  # load model
49
  svs_model = svs_warmup(config)
50
- predictor, _ = singmos_warmup()
51
- sample_rate = 48000
52
 
53
  # load dataset for random_select
54
  song2note_lengths, song_db = load_song_database(config)
 
1
  import base64
2
  import argparse
3
  import librosa
 
4
  import tempfile
 
5
  from transformers import pipeline
6
  import re
7
  from svs_utils import svs_warmup, svs_inference
 
10
  from pypinyin import lazy_pinyin
11
  import jiwer
12
  import librosa
13
+ from svs_utils import load_song_database, estimate_sentence_length
14
+ from svs_eval import singmos_warmup, singmos_evaluation
15
 
16
 
17
  asr_pipeline = pipeline(
 
36
 
37
 
38
  config = argparse.Namespace(
39
+ model_path="espnet/mixdata_svs_visinger2_spkemb_lang_pretrained",
40
  cache_dir="cache",
41
  device="cuda", # "cpu"
42
  melody_source="random_select.touhou", # "random_select.take_lyric_continuation"
 
46
 
47
  # load model
48
  svs_model = svs_warmup(config)
49
+ predictor = singmos_warmup()
50
+ sample_rate = 44100
51
 
52
  # load dataset for random_select
53
  song2note_lengths, song_db = load_song_database(config)
svs_eval.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import librosa
2
+ import numpy as np
3
+ import torch
4
+
5
+
6
def singmos_warmup():
    """Load the SingMOS predictor through ``torch.hub``.

    Returns:
        The object returned by ``torch.hub.load`` for the
        ``singing_ssl_mos`` entry point of ``South-Twilight/SingMOS:v0.2.0``.
    """
    hub_repo = "South-Twilight/SingMOS:v0.2.0"
    entry_point = "singing_ssl_mos"
    # trust_repo=True loads the hub repository without the interactive
    # trust confirmation.
    return torch.hub.load(hub_repo, entry_point, trust_repo=True)
11
+
12
+
13
def singmos_evaluation(predictor, wav_info, fs):
    """Score a waveform with the SingMOS predictor.

    Args:
        predictor: Callable invoked as ``predictor(wav, lengths)``.
        wav_info: Waveform as a NumPy array sampled at ``fs``
            (presumably mono / 1-D -- TODO confirm against callers).
        fs: Sampling rate of ``wav_info``.

    Returns:
        Whatever ``predictor`` returns for the single-item batch.
    """
    # The waveform is resampled to 16 kHz before it is handed to the predictor.
    resampled = librosa.resample(wav_info, orig_sr=fs, target_sr=16000)
    # Add a leading batch dimension of size 1.
    batch = torch.from_numpy(resampled)[None, ...]
    lengths = torch.tensor([batch.shape[1]])
    return predictor(batch, lengths)
19
+
20
+
21
def initialize_audiobox_predictor():
    """Create the audiobox-aesthetics predictor.

    The import happens inside the function, so ``audiobox_aesthetics`` is
    only needed when this function is actually called.

    Returns:
        The object returned by ``audiobox_aesthetics.infer.initialize_predictor``.
    """
    from audiobox_aesthetics.infer import initialize_predictor as make_predictor

    return make_predictor()
25
+
26
+
27
def audiobox_aesthetics_evaluation(predictor, audio_path):
    """Run the audiobox-aesthetics predictor on a single audio file.

    Args:
        predictor: Object exposing ``forward(batch)``.
        audio_path: Location of the audio file; converted with ``str``.

    Returns:
        The value returned by ``predictor.forward`` for a one-item batch.
    """
    request = {"path": str(audio_path)}
    batch = [request]
    return predictor.forward(batch)
30
+
31
+
32
def score_extract_warmup():
    """Return basic-pitch's ``predict`` function for use as a score extractor.

    The import happens inside the function, so ``basic_pitch`` is only
    needed when this function is actually called.

    Returns:
        ``basic_pitch.inference.predict``.
    """
    from basic_pitch.inference import predict

    return predict


# Backward-compatible alias: the function was originally published under this
# misspelled name, which existing callers still use.
score_extract_warmpup = score_extract_warmup
36
+
37
+
38
def score_metric_evaluation(score_extractor, audio_path):
    """Compute melody statistics from the notes transcribed for an audio file.

    Args:
        score_extractor: Callable invoked as ``score_extractor(audio_path)``
            and returning a 3-tuple whose second element exposes
            ``instruments`` (each with a ``notes`` list of objects carrying
            a ``pitch`` attribute).
        audio_path: Audio file handed to ``score_extractor``.

    Returns:
        A dict of metrics. The key set depends on the transcription:
        ``{}`` when no notes were found; only ``pitch_range`` when a single
        note was found; otherwise also ``interval_mean``, ``interval_std``,
        ``interval_large_jump_ratio`` and ``dissonance_rate``.

    Raises:
        AssertionError: If the transcription does not contain exactly one
            instrument.
    """
    # Only the MIDI object of the extractor's 3-tuple is used.
    _, midi_data, _ = score_extractor(audio_path)

    instrument_count = len(midi_data.instruments)
    if instrument_count != 1:
        # Raised explicitly instead of using ``assert`` so the check is not
        # stripped under ``python -O``; the exception type is unchanged.
        raise AssertionError(
            f"Detected {instrument_count} instruments for {audio_path}"
        )

    melody = [note.pitch for note in midi_data.instruments[0].notes]
    if not melody:
        print(f"No notes detected in {audio_path}")
        return {}

    metrics = {"pitch_range": max(melody) - min(melody)}

    # Absolute pitch distance between each pair of consecutive notes.
    intervals = [abs(nxt - cur) for cur, nxt in zip(melody, melody[1:])]
    if intervals:
        metrics["interval_mean"] = np.mean(intervals)
        metrics["interval_std"] = np.std(intervals)
        # Share of steps larger than 5.
        metrics["interval_large_jump_ratio"] = np.mean([i > 5 for i in intervals])
        metrics["dissonance_rate"] = compute_dissonance_rate(intervals)
    return metrics
57
+
58
+
59
def compute_dissonance_rate(
    intervals, dissonant_intervals=frozenset({1, 2, 6, 10, 11})
):
    """Return the fraction of intervals that fall in a dissonant class.

    Each interval is reduced modulo 12 before the membership test.

    Args:
        intervals: Iterable of integer intervals (list, NumPy array, ...).
        dissonant_intervals: Container of interval classes (after ``% 12``)
            counted as dissonant. The default is immutable so it cannot be
            altered between calls.

    Returns:
        The mean of the per-interval dissonance flags, or ``np.nan`` when
        ``intervals`` is empty.
    """
    flags = [step % 12 in dissonant_intervals for step in intervals]
    # Test emptiness on the computed list rather than on ``intervals``: the
    # truth value of a multi-element NumPy array is ambiguous and raises.
    if not flags:
        return np.nan
    return np.mean(flags)
62
+
63
+
64
if __name__ == "__main__":
    # Command-line entry point: evaluate one wav file and append the metrics
    # as a single row to a CSV file.
    import argparse
    import csv
    from pathlib import Path

    parser = argparse.ArgumentParser()
    # Both arguments are required: the code below dereferences them
    # unconditionally, so a missing one would otherwise fail on ``None``.
    parser.add_argument(
        "--wav_path",
        type=Path,
        required=True,
        help="Path to the wav file",
    )
    parser.add_argument(
        "--results_csv",
        type=Path,
        required=True,
        help="csv file to save the results",
    )

    args = parser.parse_args()

    args.results_csv.parent.mkdir(parents=True, exist_ok=True)

    # sr=None keeps the file's native sampling rate.
    y, fs = librosa.load(args.wav_path, sr=None)

    # warmup
    predictor = singmos_warmup()
    score_extractor = score_extract_warmpup()
    aesthetic_predictor = initialize_audiobox_predictor()

    # evaluate the audio
    metrics = {}

    # singmos evaluation
    metrics["singmos"] = singmos_evaluation(predictor, y, fs)

    # score metric evaluation
    metrics.update(score_metric_evaluation(score_extractor, args.wav_path))

    # audiobox aesthetics evaluation (first entry of the returned batch)
    aesthetic_results = audiobox_aesthetics_evaluation(
        aesthetic_predictor, args.wav_path
    )
    metrics.update(aesthetic_results[0])

    # save results
    header = ["file", *metrics.keys()]
    row = [str(args.wav_path), *(str(value) for value in metrics.values())]

    # The csv module quotes any field that contains a comma or quote (e.g. a
    # file path, or the text form of a metric value), so one record always
    # stays one row. "\n" keeps the line endings of previously written files.
    with open(args.results_csv, "a", newline="") as f:
        writer = csv.writer(f, lineterminator="\n")
        if f.tell() == 0:
            # New or empty file: start it with the header.
            writer.writerow(header)
        else:
            # Existing file: refuse to append a row whose columns differ.
            with open(args.results_csv, "r", newline="") as f2:
                file_header = next(csv.reader(f2), None)
            if file_header != header:
                raise ValueError(f"Header mismatch: {file_header} vs {header}")
        writer.writerow(row)
svs_utils.py CHANGED
@@ -1,9 +1,7 @@
1
  import json
2
  import random
3
 
4
- import librosa
5
  import numpy as np
6
- import torch
7
  from espnet2.bin.svs_inference import SingingGenerate
8
  from espnet_model_zoo.downloader import ModelDownloader
9
 
@@ -30,6 +28,21 @@ def svs_warmup(config):
30
  model_file=downloaded["model_file"],
31
  device=config.device,
32
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  else:
34
  raise NotImplementedError(f"Model {config.model_path} not supported")
35
  return model
@@ -212,7 +225,7 @@ def svs_inference(answer_text, svs_model, config, **kwargs):
212
  if config.model_path == "espnet/aceopencpop_svs_visinger2_40singer_pretrain":
213
  sid = np.array([int(config.speaker)])
214
  output_dict = svs_model(batch, sids=sid)
215
- elif config.model_path == "espnet/mixdata_svs_visinger2_spkembed_lang_pretrained":
216
  langs = {
217
  "zh": 2,
218
  "jp": 1,
@@ -227,21 +240,6 @@ def svs_inference(answer_text, svs_model, config, **kwargs):
227
  return wav_info
228
 
229
 
230
- def singmos_warmup():
231
- predictor = torch.hub.load(
232
- "South-Twilight/SingMOS:v0.2.0", "singing_ssl_mos", trust_repo=True
233
- )
234
- return predictor, "South-Twilight/SingMOS:v0.2.0"
235
-
236
-
237
- def singmos_evaluation(predictor, wav_info, fs):
238
- wav_mos = librosa.resample(wav_info, orig_sr=fs, target_sr=16000)
239
- wav_mos = torch.from_numpy(wav_mos).unsqueeze(0)
240
- len_mos = torch.tensor([wav_mos.shape[1]])
241
- score = predictor(wav_mos, len_mos)
242
- return score
243
-
244
-
245
  def estimate_sentence_length(query, config, song2note_lengths):
246
  if config.melody_source == "random_select.touhou":
247
  song_name = "touhou"
@@ -376,7 +374,7 @@ if __name__ == "__main__":
376
 
377
  # -------- demo code for generate audio from randomly selected song ---------#
378
  config = argparse.Namespace(
379
- model_path="espnet/mixdata_svs_visinger2_spkembed_lang_pretrained",
380
  cache_dir="cache",
381
  device="cuda", # "cpu"
382
  melody_source="random_select.touhou", #"random_generate" "random_select.take_lyric_continuation", "random_select.touhou"
 
1
  import json
2
  import random
3
 
 
4
  import numpy as np
 
5
  from espnet2.bin.svs_inference import SingingGenerate
6
  from espnet_model_zoo.downloader import ModelDownloader
7
 
 
28
  model_file=downloaded["model_file"],
29
  device=config.device,
30
  )
31
+ dummy_batch = {
32
+ "score": (
33
+ 75, # tempo
34
+ [
35
+ (0.0, 0.25, "r_en", 63.0, "r_en"),
36
+ (0.25, 0.5, "—", 63.0, "en"),
37
+ ],
38
+ ),
39
+ "text": "r en en",
40
+ }
41
+ model(
42
+ dummy_batch,
43
+ lids=np.array([2]),
44
+ spembs=np.load("resource/singer/singer_embedding_ace-2.npy"),
45
+ ) # warmup
46
  else:
47
  raise NotImplementedError(f"Model {config.model_path} not supported")
48
  return model
 
225
  if config.model_path == "espnet/aceopencpop_svs_visinger2_40singer_pretrain":
226
  sid = np.array([int(config.speaker)])
227
  output_dict = svs_model(batch, sids=sid)
228
+ elif config.model_path == "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained":
229
  langs = {
230
  "zh": 2,
231
  "jp": 1,
 
240
  return wav_info
241
 
242
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
243
  def estimate_sentence_length(query, config, song2note_lengths):
244
  if config.melody_source == "random_select.touhou":
245
  song_name = "touhou"
 
374
 
375
  # -------- demo code for generate audio from randomly selected song ---------#
376
  config = argparse.Namespace(
377
+ model_path="espnet/mixdata_svs_visinger2_spkemb_lang_pretrained",
378
  cache_dir="cache",
379
  device="cuda", # "cpu"
380
  melody_source="random_select.touhou", #"random_generate" "random_select.take_lyric_continuation", "random_select.touhou"
util.py CHANGED
@@ -61,7 +61,7 @@ def get_tokenizer(model, lang):
61
  return lambda text: split_pinyin_py(text)
62
  else:
63
  raise ValueError(f"Only support Chinese language for {model}")
64
- elif model == "espnet/mixdata_svs_visinger2_spkembed_lang_pretrained":
65
  if lang == "zh":
66
  with open(os.path.join("resource/all_plans.json"), "r") as f:
67
  all_plan_dict = json.load(f)
@@ -74,7 +74,7 @@ def get_tokenizer(model, lang):
74
  else:
75
  raise ValueError(f"Only support Chinese and Japanese language for {model}")
76
  else:
77
- raise ValueError(f"Only support espnet/aceopencpop_svs_visinger2_40singer_pretrain and espnet/mixdata_svs_visinger2_spkembed_lang_pretrained for now")
78
 
79
 
80
  def get_pinyin(texts):
 
61
  return lambda text: split_pinyin_py(text)
62
  else:
63
  raise ValueError(f"Only support Chinese language for {model}")
64
+ elif model == "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained":
65
  if lang == "zh":
66
  with open(os.path.join("resource/all_plans.json"), "r") as f:
67
  all_plan_dict = json.load(f)
 
74
  else:
75
  raise ValueError(f"Only support Chinese and Japanese language for {model}")
76
  else:
77
+ raise ValueError(f"Only support espnet/aceopencpop_svs_visinger2_40singer_pretrain and espnet/mixdata_svs_visinger2_spkemb_lang_pretrained for now")
78
 
79
 
80
  def get_pinyin(texts):