Spaces:
Sleeping
Sleeping
Merge pull request #2 from HANJionghao/main
Browse files
Move Evaluation to svs_eval.py; Add Score Metrics; Update Model Path
- requirements.txt +6 -1
- server.py +5 -6
- svs_eval.py +120 -0
- svs_utils.py +17 -19
- util.py +2 -2
requirements.txt
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
git+https://github.com/
|
2 |
espnet_model_zoo
|
3 |
# pyopenjtalk
|
4 |
datasets
|
@@ -9,3 +9,8 @@ fastapi
|
|
9 |
uvicorn
|
10 |
fugashi
|
11 |
pykakasi
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
git+https://github.com/espnet/espnet
|
2 |
espnet_model_zoo
|
3 |
# pyopenjtalk
|
4 |
datasets
|
|
|
9 |
uvicorn
|
10 |
fugashi
|
11 |
pykakasi
|
12 |
+
basic-pitch[onnx]
|
13 |
+
audiobox_aesthetics
|
14 |
+
transformers
|
15 |
+
s3prl
|
16 |
+
git+https://github.com/sea-turt1e/kanjiconv
|
server.py
CHANGED
@@ -1,9 +1,7 @@
|
|
1 |
import base64
|
2 |
import argparse
|
3 |
import librosa
|
4 |
-
import torch
|
5 |
import tempfile
|
6 |
-
import os
|
7 |
from transformers import pipeline
|
8 |
import re
|
9 |
from svs_utils import svs_warmup, svs_inference
|
@@ -12,7 +10,8 @@ import soundfile as sf
|
|
12 |
from pypinyin import lazy_pinyin
|
13 |
import jiwer
|
14 |
import librosa
|
15 |
-
from svs_utils import
|
|
|
16 |
|
17 |
|
18 |
asr_pipeline = pipeline(
|
@@ -37,7 +36,7 @@ SYSTEM_PROMPT = """
|
|
37 |
|
38 |
|
39 |
config = argparse.Namespace(
|
40 |
-
model_path="espnet/
|
41 |
cache_dir="cache",
|
42 |
device="cuda", # "cpu"
|
43 |
melody_source="random_select.touhou", # "random_select.take_lyric_continuation"
|
@@ -47,8 +46,8 @@ config = argparse.Namespace(
|
|
47 |
|
48 |
# load model
|
49 |
svs_model = svs_warmup(config)
|
50 |
-
predictor
|
51 |
-
sample_rate =
|
52 |
|
53 |
# load dataset for random_select
|
54 |
song2note_lengths, song_db = load_song_database(config)
|
|
|
1 |
import base64
|
2 |
import argparse
|
3 |
import librosa
|
|
|
4 |
import tempfile
|
|
|
5 |
from transformers import pipeline
|
6 |
import re
|
7 |
from svs_utils import svs_warmup, svs_inference
|
|
|
10 |
from pypinyin import lazy_pinyin
|
11 |
import jiwer
|
12 |
import librosa
|
13 |
+
from svs_utils import load_song_database, estimate_sentence_length
|
14 |
+
from svs_eval import singmos_warmup, singmos_evaluation
|
15 |
|
16 |
|
17 |
asr_pipeline = pipeline(
|
|
|
36 |
|
37 |
|
38 |
config = argparse.Namespace(
|
39 |
+
model_path="espnet/mixdata_svs_visinger2_spkemb_lang_pretrained",
|
40 |
cache_dir="cache",
|
41 |
device="cuda", # "cpu"
|
42 |
melody_source="random_select.touhou", # "random_select.take_lyric_continuation"
|
|
|
46 |
|
47 |
# load model
|
48 |
svs_model = svs_warmup(config)
|
49 |
+
predictor = singmos_warmup()
|
50 |
+
sample_rate = 44100
|
51 |
|
52 |
# load dataset for random_select
|
53 |
song2note_lengths, song_db = load_song_database(config)
|
svs_eval.py
ADDED
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import librosa
|
2 |
+
import numpy as np
|
3 |
+
import torch
|
4 |
+
|
5 |
+
|
6 |
+
def singmos_warmup():
    """Load the SingMOS predictor through ``torch.hub``.

    Fetches the ``singing_ssl_mos`` entry point from the
    ``South-Twilight/SingMOS`` repository at ref ``v0.2.0``. The repository
    is marked as trusted, so ``torch.hub`` does not prompt for confirmation.

    Returns:
        The object returned by ``torch.hub.load`` for that entry point.
    """
    return torch.hub.load(
        repo_or_dir="South-Twilight/SingMOS:v0.2.0",
        model="singing_ssl_mos",
        trust_repo=True,
    )
|
11 |
+
|
12 |
+
|
13 |
+
def singmos_evaluation(predictor, wav_info, fs):
    """Score a waveform with the SingMOS predictor.

    Args:
        predictor: Callable invoked as ``predictor(waveform, lengths)``.
        wav_info: Waveform as a NumPy array sampled at ``fs``.
            NOTE(review): the length is read from axis 1 after a batch axis
            is prepended, so this presumably expects a 1-D mono signal --
            confirm against callers.
        fs: Sampling rate of ``wav_info`` in Hz.

    Returns:
        Whatever ``predictor`` returns for the single-item batch.
    """
    # The predictor is fed 16 kHz audio regardless of the input rate.
    resampled = librosa.resample(wav_info, orig_sr=fs, target_sr=16000)
    batch = torch.from_numpy(resampled).unsqueeze(0)
    lengths = torch.tensor([batch.shape[1]])
    return predictor(batch, lengths)
|
19 |
+
|
20 |
+
|
21 |
+
def initialize_audiobox_predictor():
    """Create the audiobox-aesthetics predictor.

    The import happens inside the function, so ``audiobox_aesthetics`` is
    only required when this evaluation is actually used.

    Returns:
        The object returned by ``audiobox_aesthetics.infer.initialize_predictor()``.
    """
    from audiobox_aesthetics.infer import initialize_predictor as build_predictor

    return build_predictor()
|
25 |
+
|
26 |
+
|
27 |
+
def audiobox_aesthetics_evaluation(predictor, audio_path):
    """Run the aesthetics predictor on a single audio file.

    Args:
        predictor: Object exposing ``forward(list_of_dicts)``.
        audio_path: Path to the audio file; converted with ``str`` before
            being handed to the predictor.

    Returns:
        The value returned by ``predictor.forward`` for a one-item request.
    """
    request = [{"path": str(audio_path)}]
    return predictor.forward(request)
|
30 |
+
|
31 |
+
|
32 |
+
def score_extract_warmpup():
    """Return basic-pitch's ``predict`` function.

    The import happens inside the function, so ``basic_pitch`` is only
    required when score extraction is actually used.
    """
    from basic_pitch.inference import predict as basic_pitch_predict

    return basic_pitch_predict
|
36 |
+
|
37 |
+
|
38 |
+
def score_metric_evaluation(score_extractor, audio_path):
    """Compute melody statistics from the notes transcribed for ``audio_path``.

    Args:
        score_extractor: Callable invoked as ``score_extractor(audio_path)``
            and expected to return ``(model_output, midi_data, note_events)``.
            Only ``midi_data.instruments`` is used here.
        audio_path: Path of the audio file to transcribe.

    Returns:
        dict: Empty when no notes were detected. Otherwise it contains
        ``pitch_range`` and ``dissonance_rate``, plus ``interval_mean``,
        ``interval_std`` and ``interval_large_jump_ratio`` when at least two
        notes were detected.

    Raises:
        AssertionError: If the transcription does not contain exactly one
            instrument.
    """
    _, midi_data, _ = score_extractor(audio_path)
    instruments = midi_data.instruments
    # Raised explicitly instead of via ``assert`` so the check is not
    # stripped when Python runs with ``-O``; the exception type and message
    # are unchanged.
    if len(instruments) != 1:
        raise AssertionError(
            f"Detected {len(instruments)} instruments for {audio_path}"
        )

    melody = [note.pitch for note in instruments[0].notes]
    if not melody:
        print(f"No notes detected in {audio_path}")
        return {}

    # Absolute pitch difference between each pair of consecutive notes, in
    # the order the notes are stored on the instrument.
    intervals = [abs(nxt - cur) for cur, nxt in zip(melody, melody[1:])]

    metrics = {"pitch_range": max(melody) - min(melody)}
    if intervals:
        metrics["interval_mean"] = np.mean(intervals)
        metrics["interval_std"] = np.std(intervals)
        # Fraction of intervals strictly greater than 5.
        metrics["interval_large_jump_ratio"] = np.mean([i > 5 for i in intervals])
    metrics["dissonance_rate"] = compute_dissonance_rate(intervals)
    return metrics
|
57 |
+
|
58 |
+
|
59 |
+
def compute_dissonance_rate(
    intervals, dissonant_intervals=frozenset({1, 2, 6, 10, 11})
):
    """Return the fraction of intervals that are dissonant.

    Each interval is reduced modulo 12 before being looked up in
    ``dissonant_intervals``.

    Args:
        intervals: Iterable of integer intervals (a list, tuple, NumPy array
            or generator).
        dissonant_intervals: Container of interval classes (values in 0..11)
            treated as dissonant. The default is immutable so it cannot be
            altered between calls.

    Returns:
        The mean of the per-interval dissonance flags, or ``nan`` when
        ``intervals`` is empty.
    """
    dissonant = [i % 12 in dissonant_intervals for i in intervals]
    # Test the materialised list rather than the input: truthiness of a NumPy
    # array is ambiguous and a generator is always truthy.
    return np.mean(dissonant) if dissonant else np.nan
|
62 |
+
|
63 |
+
|
64 |
+
if __name__ == "__main__":
    # Command-line entry point: evaluate one wav file with every metric in
    # this module and append the results as one row of a CSV file.
    import argparse
    import csv
    from pathlib import Path

    parser = argparse.ArgumentParser()
    # Both options are dereferenced unconditionally below, so they are
    # required: a missing one is reported by argparse instead of surfacing
    # later as an AttributeError on None.
    parser.add_argument(
        "--wav_path",
        type=Path,
        required=True,
        help="Path to the wav file",
    )
    parser.add_argument(
        "--results_csv",
        type=Path,
        required=True,
        help="csv file to save the results",
    )

    args = parser.parse_args()

    args.results_csv.parent.mkdir(parents=True, exist_ok=True)

    # sr=None keeps the file's native sampling rate.
    y, fs = librosa.load(args.wav_path, sr=None)

    # warmup
    predictor = singmos_warmup()
    score_extractor = score_extract_warmpup()
    aesthetic_predictor = initialize_audiobox_predictor()

    # evaluate the audio
    metrics = {}

    # singmos evaluation
    # NOTE(review): the predictor's return value is written to the CSV via
    # str(); if it is a tensor the cell will hold its repr -- confirm whether
    # a plain float is wanted here.
    metrics["singmos"] = singmos_evaluation(predictor, y, fs)

    # score metric evaluation
    metrics.update(score_metric_evaluation(score_extractor, args.wav_path))

    # audiobox aesthetics evaluation
    aesthetic_results = audiobox_aesthetics_evaluation(
        aesthetic_predictor, args.wav_path
    )
    metrics.update(aesthetic_results[0])

    # save results
    header = ["file", *metrics]
    results_csv = args.results_csv
    is_new_file = not results_csv.exists() or results_csv.stat().st_size == 0

    # Check the existing header before opening the file for append, so the
    # same file is never open for reading and writing at once.
    if not is_new_file:
        with open(results_csv, "r", newline="") as existing:
            file_header = next(csv.reader(existing), [])
        if file_header != header:
            raise ValueError(f"Header mismatch: {file_header} vs {header}")

    # csv.writer quotes fields containing commas or quotes (e.g. unusual
    # file paths), which plain ",".join would silently corrupt.
    with open(results_csv, "a", newline="") as f:
        writer = csv.writer(f, lineterminator="\n")
        if is_new_file:
            writer.writerow(header)
        writer.writerow([str(args.wav_path), *metrics.values()])
|
svs_utils.py
CHANGED
@@ -1,9 +1,7 @@
|
|
1 |
import json
|
2 |
import random
|
3 |
|
4 |
-
import librosa
|
5 |
import numpy as np
|
6 |
-
import torch
|
7 |
from espnet2.bin.svs_inference import SingingGenerate
|
8 |
from espnet_model_zoo.downloader import ModelDownloader
|
9 |
|
@@ -30,6 +28,21 @@ def svs_warmup(config):
|
|
30 |
model_file=downloaded["model_file"],
|
31 |
device=config.device,
|
32 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
else:
|
34 |
raise NotImplementedError(f"Model {config.model_path} not supported")
|
35 |
return model
|
@@ -212,7 +225,7 @@ def svs_inference(answer_text, svs_model, config, **kwargs):
|
|
212 |
if config.model_path == "espnet/aceopencpop_svs_visinger2_40singer_pretrain":
|
213 |
sid = np.array([int(config.speaker)])
|
214 |
output_dict = svs_model(batch, sids=sid)
|
215 |
-
elif config.model_path == "espnet/
|
216 |
langs = {
|
217 |
"zh": 2,
|
218 |
"jp": 1,
|
@@ -227,21 +240,6 @@ def svs_inference(answer_text, svs_model, config, **kwargs):
|
|
227 |
return wav_info
|
228 |
|
229 |
|
230 |
-
def singmos_warmup():
|
231 |
-
predictor = torch.hub.load(
|
232 |
-
"South-Twilight/SingMOS:v0.2.0", "singing_ssl_mos", trust_repo=True
|
233 |
-
)
|
234 |
-
return predictor, "South-Twilight/SingMOS:v0.2.0"
|
235 |
-
|
236 |
-
|
237 |
-
def singmos_evaluation(predictor, wav_info, fs):
|
238 |
-
wav_mos = librosa.resample(wav_info, orig_sr=fs, target_sr=16000)
|
239 |
-
wav_mos = torch.from_numpy(wav_mos).unsqueeze(0)
|
240 |
-
len_mos = torch.tensor([wav_mos.shape[1]])
|
241 |
-
score = predictor(wav_mos, len_mos)
|
242 |
-
return score
|
243 |
-
|
244 |
-
|
245 |
def estimate_sentence_length(query, config, song2note_lengths):
|
246 |
if config.melody_source == "random_select.touhou":
|
247 |
song_name = "touhou"
|
@@ -376,7 +374,7 @@ if __name__ == "__main__":
|
|
376 |
|
377 |
# -------- demo code for generate audio from randomly selected song ---------#
|
378 |
config = argparse.Namespace(
|
379 |
-
model_path="espnet/
|
380 |
cache_dir="cache",
|
381 |
device="cuda", # "cpu"
|
382 |
melody_source="random_select.touhou", #"random_generate" "random_select.take_lyric_continuation", "random_select.touhou"
|
|
|
1 |
import json
|
2 |
import random
|
3 |
|
|
|
4 |
import numpy as np
|
|
|
5 |
from espnet2.bin.svs_inference import SingingGenerate
|
6 |
from espnet_model_zoo.downloader import ModelDownloader
|
7 |
|
|
|
28 |
model_file=downloaded["model_file"],
|
29 |
device=config.device,
|
30 |
)
|
31 |
+
dummy_batch = {
|
32 |
+
"score": (
|
33 |
+
75, # tempo
|
34 |
+
[
|
35 |
+
(0.0, 0.25, "r_en", 63.0, "r_en"),
|
36 |
+
(0.25, 0.5, "—", 63.0, "en"),
|
37 |
+
],
|
38 |
+
),
|
39 |
+
"text": "r en en",
|
40 |
+
}
|
41 |
+
model(
|
42 |
+
dummy_batch,
|
43 |
+
lids=np.array([2]),
|
44 |
+
spembs=np.load("resource/singer/singer_embedding_ace-2.npy"),
|
45 |
+
) # warmup
|
46 |
else:
|
47 |
raise NotImplementedError(f"Model {config.model_path} not supported")
|
48 |
return model
|
|
|
225 |
if config.model_path == "espnet/aceopencpop_svs_visinger2_40singer_pretrain":
|
226 |
sid = np.array([int(config.speaker)])
|
227 |
output_dict = svs_model(batch, sids=sid)
|
228 |
+
elif config.model_path == "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained":
|
229 |
langs = {
|
230 |
"zh": 2,
|
231 |
"jp": 1,
|
|
|
240 |
return wav_info
|
241 |
|
242 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
243 |
def estimate_sentence_length(query, config, song2note_lengths):
|
244 |
if config.melody_source == "random_select.touhou":
|
245 |
song_name = "touhou"
|
|
|
374 |
|
375 |
# -------- demo code for generate audio from randomly selected song ---------#
|
376 |
config = argparse.Namespace(
|
377 |
+
model_path="espnet/mixdata_svs_visinger2_spkemb_lang_pretrained",
|
378 |
cache_dir="cache",
|
379 |
device="cuda", # "cpu"
|
380 |
melody_source="random_select.touhou", #"random_generate" "random_select.take_lyric_continuation", "random_select.touhou"
|
util.py
CHANGED
@@ -61,7 +61,7 @@ def get_tokenizer(model, lang):
|
|
61 |
return lambda text: split_pinyin_py(text)
|
62 |
else:
|
63 |
raise ValueError(f"Only support Chinese language for {model}")
|
64 |
-
elif model == "espnet/
|
65 |
if lang == "zh":
|
66 |
with open(os.path.join("resource/all_plans.json"), "r") as f:
|
67 |
all_plan_dict = json.load(f)
|
@@ -74,7 +74,7 @@ def get_tokenizer(model, lang):
|
|
74 |
else:
|
75 |
raise ValueError(f"Only support Chinese and Japanese language for {model}")
|
76 |
else:
|
77 |
-
raise ValueError(f"Only support espnet/aceopencpop_svs_visinger2_40singer_pretrain and espnet/
|
78 |
|
79 |
|
80 |
def get_pinyin(texts):
|
|
|
61 |
return lambda text: split_pinyin_py(text)
|
62 |
else:
|
63 |
raise ValueError(f"Only support Chinese language for {model}")
|
64 |
+
elif model == "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained":
|
65 |
if lang == "zh":
|
66 |
with open(os.path.join("resource/all_plans.json"), "r") as f:
|
67 |
all_plan_dict = json.load(f)
|
|
|
74 |
else:
|
75 |
raise ValueError(f"Only support Chinese and Japanese language for {model}")
|
76 |
else:
|
77 |
+
raise ValueError(f"Only support espnet/aceopencpop_svs_visinger2_40singer_pretrain and espnet/mixdata_svs_visinger2_spkemb_lang_pretrained for now")
|
78 |
|
79 |
|
80 |
def get_pinyin(texts):
|