Spaces:

jhansss
/

SingingSDS

Sleeping

App Files Community

Han Jionghao committed on May 29

Commit

f98847a

unverified ·

2 Parent(s): 810614d 017498a

Merge pull request #2 from HANJionghao/main

Browse files

Move Evaluation to svs_eval.py; Add Score Metrics; Update Model Path

Files changed (5) hide show

requirements.txt +6 -1
server.py +5 -6
svs_eval.py +120 -0
svs_utils.py +17 -19
util.py +2 -2

requirements.txt CHANGED Viewed

@@ -1,4 +1,4 @@
-git+https://github.com/South-Twilight/espnet==202402
 espnet_model_zoo
 # pyopenjtalk
 datasets
@@ -9,3 +9,8 @@ fastapi
 uvicorn
 fugashi
 pykakasi

+git+https://github.com/espnet/espnet
 espnet_model_zoo
 # pyopenjtalk
 datasets
 uvicorn
 fugashi
 pykakasi
+basic-pitch[onnx]
+audiobox_aesthetics
+transformers
+s3prl
+git+https://github.com/sea-turt1e/kanjiconv

server.py CHANGED Viewed

@@ -1,9 +1,7 @@
 import base64
 import argparse
 import librosa
-import torch
 import tempfile
-import os
 from transformers import pipeline
 import re
 from svs_utils import svs_warmup, svs_inference
@@ -12,7 +10,8 @@ import soundfile as sf
 from pypinyin import lazy_pinyin
 import jiwer
 import librosa
-from svs_utils import singmos_warmup, singmos_evaluation, load_song_database, estimate_sentence_length
 asr_pipeline = pipeline(
@@ -37,7 +36,7 @@ SYSTEM_PROMPT = """
 config = argparse.Namespace(
-    model_path="espnet/mixdata_svs_visinger2_spkembed_lang_pretrained",
     cache_dir="cache",
     device="cuda", # "cpu"
     melody_source="random_select.touhou", # "random_select.take_lyric_continuation"
@@ -47,8 +46,8 @@ config = argparse.Namespace(
 # load model
 svs_model = svs_warmup(config)
-predictor, _ = singmos_warmup()
-sample_rate = 48000
 # load dataset for random_select
 song2note_lengths, song_db = load_song_database(config)

 import base64
 import argparse
 import librosa
 import tempfile
 from transformers import pipeline
 import re
 from svs_utils import svs_warmup, svs_inference
 from pypinyin import lazy_pinyin
 import jiwer
 import librosa
+from svs_utils import load_song_database, estimate_sentence_length
+from svs_eval import singmos_warmup, singmos_evaluation
 asr_pipeline = pipeline(
 config = argparse.Namespace(
+    model_path="espnet/mixdata_svs_visinger2_spkemb_lang_pretrained",
     cache_dir="cache",
     device="cuda", # "cpu"
     melody_source="random_select.touhou", # "random_select.take_lyric_continuation"
 # load model
 svs_model = svs_warmup(config)
+predictor = singmos_warmup()
+sample_rate = 44100
 # load dataset for random_select
 song2note_lengths, song_db = load_song_database(config)

svs_eval.py ADDED Viewed

	@@ -0,0 +1,120 @@

+import librosa
+import numpy as np
+import torch
+def singmos_warmup():
+    predictor = torch.hub.load(
+        "South-Twilight/SingMOS:v0.2.0", "singing_ssl_mos", trust_repo=True
+    )
+    return predictor
+def singmos_evaluation(predictor, wav_info, fs):
+    wav_mos = librosa.resample(wav_info, orig_sr=fs, target_sr=16000)
+    wav_mos = torch.from_numpy(wav_mos).unsqueeze(0)
+    len_mos = torch.tensor([wav_mos.shape[1]])
+    score = predictor(wav_mos, len_mos)
+    return score
+def initialize_audiobox_predictor():
+    from audiobox_aesthetics.infer import initialize_predictor
+    predictor = initialize_predictor()
+    return predictor
+def audiobox_aesthetics_evaluation(predictor, audio_path):
+    score = predictor.forward([{"path": str(audio_path)}])
+    return score
+def score_extract_warmpup():
+    from basic_pitch.inference import predict
+    return predict
+def score_metric_evaluation(score_extractor, audio_path):
+    model_output, midi_data, note_events = score_extractor(audio_path)
+    metrics = {}
+    assert (
+        len(midi_data.instruments) == 1
+    ), f"Detected {len(midi_data.instruments)} instruments for {audio_path}"
+    midi_notes = midi_data.instruments[0].notes
+    melody = [note.pitch for note in midi_notes]
+    if len(melody) == 0:
+        print(f"No notes detected in {audio_path}")
+        return {}
+    intervals = [abs(melody[i + 1] - melody[i]) for i in range(len(melody) - 1)]
+    metrics["pitch_range"] = max(melody) - min(melody)
+    if len(intervals) > 0:
+        metrics["interval_mean"] = np.mean(intervals)
+        metrics["interval_std"] = np.std(intervals)
+        metrics["interval_large_jump_ratio"] = np.mean([i > 5 for i in intervals])
+        metrics["dissonance_rate"] = compute_dissonance_rate(intervals)
+    return metrics
+def compute_dissonance_rate(intervals, dissonant_intervals={1, 2, 6, 10, 11}):
+    dissonant = [i % 12 in dissonant_intervals for i in intervals]
+    return np.mean(dissonant) if intervals else np.nan
+if __name__ == "__main__":
+    import argparse
+    from pathlib import Path
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--wav_path",
+        type=Path,
+        help="Path to the wav file",
+    )
+    parser.add_argument(
+        "--results_csv",
+        type=Path,
+        help="csv file to save the results",
+    )
+    args = parser.parse_args()
+    args.results_csv.parent.mkdir(parents=True, exist_ok=True)
+    y, fs = librosa.load(args.wav_path, sr=None)
+    # warmup
+    predictor = singmos_warmup()
+    score_extractor = score_extract_warmpup()
+    aesthetic_predictor = initialize_audiobox_predictor()
+    # evaluate the audio
+    metrics = {}
+    # singmos evaluation
+    score = singmos_evaluation(predictor, y, fs)
+    metrics["singmos"] = score
+    # score metric evaluation
+    score_results = score_metric_evaluation(score_extractor, args.wav_path)
+    metrics.update(score_results)
+    # audiobox aesthetics evaluation
+    score_results = audiobox_aesthetics_evaluation(aesthetic_predictor, args.wav_path)
+    metrics.update(score_results[0])
+    # save results
+    with open(args.results_csv, "a") as f:
+        header = "file," + ",".join(metrics.keys()) + "\n"
+        if f.tell() == 0:
+            f.write(header)
+        else:
+            with open(args.results_csv, "r") as f2:
+                file_header = f2.readline()
+            if file_header != header:
+                raise ValueError(f"Header mismatch: {file_header} vs {header}")
+        line = (
+            ",".join([str(args.wav_path)] + [str(v) for v in metrics.values()]) + "\n"
+        )
+        f.write(line)

svs_utils.py CHANGED Viewed

@@ -1,9 +1,7 @@
 import json
 import random
-import librosa
 import numpy as np
-import torch
 from espnet2.bin.svs_inference import SingingGenerate
 from espnet_model_zoo.downloader import ModelDownloader
@@ -30,6 +28,21 @@ def svs_warmup(config):
             model_file=downloaded["model_file"],
             device=config.device,
         )
     else:
         raise NotImplementedError(f"Model {config.model_path} not supported")
     return model
@@ -212,7 +225,7 @@ def svs_inference(answer_text, svs_model, config, **kwargs):
     if config.model_path == "espnet/aceopencpop_svs_visinger2_40singer_pretrain":
         sid = np.array([int(config.speaker)])
         output_dict = svs_model(batch, sids=sid)
-    elif config.model_path == "espnet/mixdata_svs_visinger2_spkembed_lang_pretrained":
         langs = {
             "zh": 2,
             "jp": 1,
@@ -227,21 +240,6 @@ def svs_inference(answer_text, svs_model, config, **kwargs):
     return wav_info
-def singmos_warmup():
-    predictor = torch.hub.load(
-        "South-Twilight/SingMOS:v0.2.0", "singing_ssl_mos", trust_repo=True
-    )
-    return predictor, "South-Twilight/SingMOS:v0.2.0"
-def singmos_evaluation(predictor, wav_info, fs):
-    wav_mos = librosa.resample(wav_info, orig_sr=fs, target_sr=16000)
-    wav_mos = torch.from_numpy(wav_mos).unsqueeze(0)
-    len_mos = torch.tensor([wav_mos.shape[1]])
-    score = predictor(wav_mos, len_mos)
-    return score
 def estimate_sentence_length(query, config, song2note_lengths):
     if config.melody_source == "random_select.touhou":
         song_name = "touhou"
@@ -376,7 +374,7 @@ if __name__ == "__main__":
     # -------- demo code for generate audio from randomly selected song ---------#
     config = argparse.Namespace(
-        model_path="espnet/mixdata_svs_visinger2_spkembed_lang_pretrained",
         cache_dir="cache",
         device="cuda", # "cpu"
         melody_source="random_select.touhou", #"random_generate" "random_select.take_lyric_continuation",  "random_select.touhou"

 import json
 import random
 import numpy as np
 from espnet2.bin.svs_inference import SingingGenerate
 from espnet_model_zoo.downloader import ModelDownloader
             model_file=downloaded["model_file"],
             device=config.device,
         )
+        dummy_batch = {
+            "score": (
+                75,  # tempo
+                [
+                    (0.0, 0.25, "r_en", 63.0, "r_en"),
+                    (0.25, 0.5, "—", 63.0, "en"),
+                ],
+            ),
+            "text": "r en en",
+        }
+        model(
+            dummy_batch,
+            lids=np.array([2]),
+            spembs=np.load("resource/singer/singer_embedding_ace-2.npy"),
+        )  # warmup
     else:
         raise NotImplementedError(f"Model {config.model_path} not supported")
     return model
     if config.model_path == "espnet/aceopencpop_svs_visinger2_40singer_pretrain":
         sid = np.array([int(config.speaker)])
         output_dict = svs_model(batch, sids=sid)
+    elif config.model_path == "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained":
         langs = {
             "zh": 2,
             "jp": 1,
     return wav_info
 def estimate_sentence_length(query, config, song2note_lengths):
     if config.melody_source == "random_select.touhou":
         song_name = "touhou"
     # -------- demo code for generate audio from randomly selected song ---------#
     config = argparse.Namespace(
+        model_path="espnet/mixdata_svs_visinger2_spkemb_lang_pretrained",
         cache_dir="cache",
         device="cuda", # "cpu"
         melody_source="random_select.touhou", #"random_generate" "random_select.take_lyric_continuation",  "random_select.touhou"

util.py CHANGED Viewed

@@ -61,7 +61,7 @@ def get_tokenizer(model, lang):
             return lambda text: split_pinyin_py(text)
         else:
             raise ValueError(f"Only support Chinese language for {model}")
-    elif model == "espnet/mixdata_svs_visinger2_spkembed_lang_pretrained":
         if lang == "zh":
             with open(os.path.join("resource/all_plans.json"), "r") as f:
                 all_plan_dict = json.load(f)
@@ -74,7 +74,7 @@ def get_tokenizer(model, lang):
         else:
             raise ValueError(f"Only support Chinese and Japanese language for {model}")
     else:
-        raise ValueError(f"Only support espnet/aceopencpop_svs_visinger2_40singer_pretrain and espnet/mixdata_svs_visinger2_spkembed_lang_pretrained for now")
 def get_pinyin(texts):

             return lambda text: split_pinyin_py(text)
         else:
             raise ValueError(f"Only support Chinese language for {model}")
+    elif model == "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained":
         if lang == "zh":
             with open(os.path.join("resource/all_plans.json"), "r") as f:
                 all_plan_dict = json.load(f)
         else:
             raise ValueError(f"Only support Chinese and Japanese language for {model}")
     else:
+        raise ValueError(f"Only support espnet/aceopencpop_svs_visinger2_40singer_pretrain and espnet/mixdata_svs_visinger2_spkemb_lang_pretrained for now")
 def get_pinyin(texts):