import json
import random
import unicodedata

import numpy as np
from espnet2.bin.svs_inference import SingingGenerate
from espnet_model_zoo.downloader import ModelDownloader
from kanjiconv import KanjiConv

from util import get_pinyin, get_tokenizer, postprocess_phn, preprocess_input

kanji_to_kana = KanjiConv()


def svs_warmup(config):
    """
    Download (if needed) and load the SVS model, then run one dummy forward
    pass so that later calls do not pay the first-inference cost.

    Input: config (dict/namespace) with model_path, cache_dir, device, and
        possibly language/speaker selection.
    Return: the loaded SingingGenerate model, ready for inference.
    """
    if config.model_path.startswith("espnet"):
        espnet_downloader = ModelDownloader(config.cache_dir)
        downloaded = espnet_downloader.download_and_unpack(config.model_path)
        model = SingingGenerate(
            train_config=downloaded["train_config"],
            model_file=downloaded["model_file"],
            device=config.device,
        )
        dummy_batch = {
            "score": (
                75,  # tempo
                [
                    (0.0, 0.25, "r_en", 63.0, "r_en"),
                    (0.25, 0.5, "—", 63.0, "en"),
                ],
            ),
            "text": "r en en",
        }
        # warmup forward pass
        model(
            dummy_batch,
            lids=np.array([2]),
            spembs=np.load("resource/singer/singer_embedding_ace-2.npy"),
        )
    else:
        raise NotImplementedError(f"Model {config.model_path} not supported")
    return model


# small (yoon) kana -> base kana
yoon_map = {
    "ぁ": "あ",
    "ぃ": "い",
    "ぅ": "う",
    "ぇ": "え",
    "ぉ": "お",
    "ゃ": "や",
    "ゅ": "ゆ",
    "ょ": "よ",
    "ゎ": "わ",
}


def replace_chouonpu(hiragana_text):
    """
    Expand the prolonged sound mark 「ー」 into the vowel of the preceding
    kana, since the upstream kana-conversion package does not handle it.
    """
    # kana -> the vowel kana that its prolonged sound becomes
    vowels = {
        "あ": "あ", "い": "い", "う": "う", "え": "え", "お": "う",
        "か": "あ", "き": "い", "く": "う", "け": "え", "こ": "う",
        "さ": "あ", "し": "い", "す": "う", "せ": "え", "そ": "う",
        "た": "あ", "ち": "い", "つ": "う", "て": "え", "と": "う",
        "な": "あ", "に": "い", "ぬ": "う", "ね": "え", "の": "う",
        "は": "あ", "ひ": "い", "ふ": "う", "へ": "え", "ほ": "う",
        "ま": "あ", "み": "い", "む": "う", "め": "え", "も": "う",
        "や": "あ", "ゆ": "う", "よ": "う",
        "ら": "あ", "り": "い", "る": "う", "れ": "え", "ろ": "う",
        "わ": "あ", "を": "う",
    }
    new_text = []
    for i, char in enumerate(hiragana_text):
        if char == "ー" and i > 0:
            prev_char = new_text[-1]
            if prev_char in yoon_map:
                prev_char = yoon_map[prev_char]
            new_text.append(vowels.get(prev_char, prev_char))
        else:
            new_text.append(char)
    return "".join(new_text)


def is_small_kana(kana):
    # e.g. "ょ" -> True, "よ" -> False
    for char in kana:
        name = unicodedata.name(char, "")
        if "SMALL" in name:
            return True
    return False


def kanji_to_SVSDictKana(text):
    """Convert raw text to a hiragana list, one sung syllable per element."""
    hiragana_text = kanji_to_kana.to_hiragana(text.replace(" ", ""))
    hiragana_text_wl = replace_chouonpu(hiragana_text).split(" ")
    final_ls = []
    for subword in hiragana_text_wl:
        sl_prev = 0
        for i in range(len(subword) - 1):
            if sl_prev >= len(subword) - 1:
                break
            sl = sl_prev + 1
            if subword[sl] in yoon_map:
                # keep small (yoon) kana attached to the preceding kana
                final_ls.append(subword[sl_prev:sl + 1])
                sl_prev += 2
            else:
                final_ls.append(subword[sl_prev])
                sl_prev += 1
        final_ls.append(subword[sl_prev])
    return final_ls
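
# A minimal sketch of the kana preprocessing above (the example strings and the
# arrow-annotated results are illustrative, assuming kanji_to_kana.to_hiragana
# passes plain hiragana input through unchanged): replace_chouonpu rewrites the
# prolonged sound mark into the vowel of the preceding kana, and
# kanji_to_SVSDictKana groups small (yoon) kana with their base kana so that
# each list element maps to one sung syllable.
#
#     replace_chouonpu("らーめん")        # -> "らあめん"
#     kanji_to_SVSDictKana("とうきょう")  # -> ["と", "う", "きょ", "う"]
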
def svs_text_preprocessor(model_path, texts, lang):
    """
    Input:
        - model_path (str): used to pick the matching tokenizer
        - texts (str): lyrics in Chinese or Japanese characters
        - lang (str): language tag, "zh" or "jp"
    Output:
        - lyric_ls: lyric list, one syllable per element, e.g. 'k@zhe@zh'
        - sybs: syllable list with "_"-joined phonemes, e.g. 'k@zh_e@zh'
        - labels: flat phoneme list without "_", e.g. 'k@zh'
    """
    if texts is None:
        raise ValueError("texts is None when calling svs_text_preprocessor")

    # preprocess
    if lang == "zh":
        texts = preprocess_input(texts, "")
        text_list = get_pinyin(texts)
    elif lang == "jp":
        text_list = kanji_to_SVSDictKana(texts)
        # texts = preprocess_input(texts, "")
        # text_list = list(texts)

    # text to phoneme
    tokenizer = get_tokenizer(model_path, lang)
    sybs = []  # syllable list
    for text in text_list:
        if text in ("AP", "SP", "-", "——"):
            # pause and continuation symbols pass through untokenized
            rev = [text]
        else:
            rev = tokenizer(text)
        if rev is False:
            raise ValueError(f"text `{text}` is invalid!")
        rev = postprocess_phn(rev, model_path, lang)
        phns = "_".join(rev)
        sybs.append(phns)

    lyric_ls = []
    labels = []
    pre_phn = ""
    for phns in sybs:
        # a continuation symbol repeats the previous phoneme
        if phns in ("-", "——"):
            phns = pre_phn
        phn_list = phns.split("_")
        lyric = "".join(phn_list)
        for phn in phn_list:
            labels.append(phn)
        pre_phn = labels[-1]
        lyric_ls.append(lyric)

    return lyric_ls, sybs, labels


def create_batch_with_randomized_melody(lyric_ls, sybs, labels, config):
    """
    Build a SingingGenerate batch with randomly sampled pitches and durations.

    Input:
        - lyric_ls / sybs / labels: outputs of svs_text_preprocessor
        - config: kept for a uniform batch-builder signature (unused here)
    Output:
        - batch (dict), e.g.
          {'score': (120, [[0, 0.4806, 'n@zhi@zh', 66, 'n@zh_i@zh'],
                           [0.4806, 0.8049, 'k@zhe@zh', 57, 'k@zh_e@zh'],
                           [0.8049, 1.1906, 'm@zhei@zh', 64, 'm@zh_ei@zh']]),
           'text': 'n@zh i@zh k@zh e@zh m@zh ei@zh'}
    """
    tempo = 120
    len_note = len(lyric_ls)
    notes = []
    st = 0
    for id_lyric in range(len_note):
        pitch = random.randint(57, 69)  # midi range
        period = round(random.uniform(0.1, 0.5), 4)  # note duration in seconds
        ed = st + period
        note = [st, ed, lyric_ls[id_lyric], pitch, sybs[id_lyric]]
        st = ed
        notes.append(note)
    phns_str = " ".join(labels)
    batch = {
        "score": (
            int(tempo),
            notes,
        ),
        "text": phns_str,
    }
    return batch


def svs_inference(answer_text, svs_model, config, **kwargs):
    lyric_ls, sybs, labels = svs_text_preprocessor(
        config.model_path, answer_text, config.lang
    )
    if config.melody_source.startswith("random_generate"):
        batch = create_batch_with_randomized_melody(lyric_ls, sybs, labels, config)
    elif config.melody_source.startswith("random_select"):
        segment_iterator = song_segment_iterator(kwargs["song_db"], kwargs["metadata"])
        batch = align_score_and_text(segment_iterator, lyric_ls, sybs, labels, config)
    else:
        raise NotImplementedError(
            f"melody source {config.melody_source} not supported"
        )

    if config.model_path == "espnet/aceopencpop_svs_visinger2_40singer_pretrain":
        sid = np.array([int(config.speaker)])
        output_dict = svs_model(batch, sids=sid)
    elif config.model_path == "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained":
        langs = {
            "zh": 2,
            "jp": 1,
            "en": 2,
        }
        lid = np.array([langs[config.lang]])
        spk_embed = np.load(config.speaker)
        output_dict = svs_model(batch, lids=lid, spembs=spk_embed)
    else:
        raise NotImplementedError(f"Model {config.model_path} not supported")
    wav_info = output_dict["wav"].cpu().numpy()
    return wav_info


def estimate_sentence_length(query, config, song2note_lengths):
    if config.melody_source == "random_select.touhou":
        song_name = "touhou"
        phrase_length = None  # touhou phrases are sampled on the fly
        metadata = {"song_name": song_name}
        return phrase_length, metadata
    if config.melody_source.startswith("random_select"):
        song_name = random.choice(list(song2note_lengths.keys()))
        phrase_length = song2note_lengths[song_name]
        metadata = {"song_name": song_name}
        return phrase_length, metadata
    else:
        raise NotImplementedError(
            f"melody source {config.melody_source} not supported"
        )
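
# For reference, both batch builders (create_batch_with_randomized_melody above
# and align_score_and_text below) emit the structure that SingingGenerate
# consumes: "score" is a (tempo, notes) tuple where each note is
# [start_sec, end_sec, lyric, midi_pitch, syllable], and "text" is the flat
# phoneme string. A hand-built sketch with illustrative values only:
#
#     batch = {
#         "score": (
#             120,  # tempo in BPM
#             [
#                 [0.0, 0.3, "n@zhi@zh", 60, "n@zh_i@zh"],
#                 [0.3, 0.6, "k@zhe@zh", 62, "k@zh_e@zh"],
#             ],
#         ),
#         "text": "n@zh i@zh k@zh e@zh",
#     }
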
def align_score_and_text(segment_iterator, lyric_ls, sybs, labels, config):
    text = []
    lyric_idx = 0
    notes_info = []
    while lyric_idx < len(lyric_ls):
        score = next(segment_iterator)
        for note_start_time, note_end_time, reference_note_lyric, note_midi in zip(
            score["note_start_times"],
            score["note_end_times"],
            score["note_lyrics"],
            score["note_midi"],
        ):
            if reference_note_lyric in ["<AP>", "<SP>"]:
                # keep pause notes (aspiration / silence) from the reference score
                notes_info.append(
                    [
                        note_start_time,
                        note_end_time,
                        reference_note_lyric.strip("<>"),
                        note_midi,
                        reference_note_lyric.strip("<>"),
                    ]
                )
                text.append(reference_note_lyric.strip("<>"))
            elif (
                reference_note_lyric in ["-", "——"]
                and config.melody_source == "random_select.take_lyric_continuation"
            ):
                # a continuation symbol extends the previous syllable
                notes_info.append(
                    [
                        note_start_time,
                        note_end_time,
                        reference_note_lyric,
                        note_midi,
                        text[-1],
                    ]
                )
                text.append(text[-1])
            else:
                # assign the next generated lyric to this note
                notes_info.append(
                    [
                        note_start_time,
                        note_end_time,
                        lyric_ls[lyric_idx],
                        note_midi,
                        sybs[lyric_idx],
                    ]
                )
                text += sybs[lyric_idx].split("_")
                lyric_idx += 1
                if lyric_idx >= len(lyric_ls):
                    break
    batch = {
        "score": (
            score["tempo"],  # assume the tempo is the same for all segments
            notes_info,
        ),
        "text": " ".join(text),
    }
    return batch


def load_list_from_json(json_path):
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    if not isinstance(data, list):
        raise ValueError("The contents of the JSON file are not a list.")
    # rescale note times by 100 / tempo and reshape into score-segment dicts
    data = [
        {
            "tempo": d["tempo"],
            "note_start_times": [n[0] * (100 / d["tempo"]) for n in d["score"]],
            "note_end_times": [n[1] * (100 / d["tempo"]) for n in d["score"]],
            "note_lyrics": ["" for n in d["score"]],
            "note_midi": [n[2] for n in d["score"]],
        }
        for d in data
    ]
    return data


def song_segment_iterator(song_db, metadata):
    song_name = metadata["song_name"]
    if song_name.startswith("kising_"):
        # an iterator that loads segments {song_name}_001, _002, ... in order
        segment_id = 1
        while f"{song_name}_{segment_id:03d}" in song_db.index:
            yield song_db.loc[f"{song_name}_{segment_id:03d}"]
            segment_id += 1
    elif song_name.startswith("touhou"):
        # an iterator that yields randomly chosen touhou phrases
        data = load_list_from_json("data/touhou/note_data.json")
        while True:
            yield random.choice(data)
    else:
        raise NotImplementedError(f"song name {song_name} not supported")


def load_song_database(config):
    from datasets import load_dataset

    song_db = load_dataset(
        "jhansss/kising_score_segments", cache_dir="cache", split="train"
    ).to_pandas()
    song_db.set_index("segment_id", inplace=True)
    if ".take_lyric_continuation" in config.melody_source:
        with open("data/song2word_lengths.json", "r") as f:
            song2note_lengths = json.load(f)
    else:
        with open("data/song2note_lengths.json", "r") as f:
            song2note_lengths = json.load(f)
    return song2note_lengths, song_db
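
# load_list_from_json above expects a layout like the hypothetical sketch
# below: a list of phrases, each with a tempo and a "score" of
# [start, end, midi_pitch] triples; the start/end values are rescaled by
# 100 / tempo, i.e. the stored times appear to be normalized to 100 BPM.
# The actual contents of data/touhou/note_data.json are an assumption here.
#
#     [
#         {
#             "tempo": 120,
#             "score": [[0.0, 0.5, 67], [0.5, 1.0, 69]]
#         },
#         ...
#     ]
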
if __name__ == "__main__":
    import argparse

    import soundfile as sf

    # -------- demo: generate audio from a randomly selected song -------- #
    config = argparse.Namespace(
        model_path="espnet/mixdata_svs_visinger2_spkemb_lang_pretrained",
        cache_dir="cache",
        device="cuda",  # or "cpu"
        # one of: "random_generate", "random_select.take_lyric_continuation",
        # "random_select.touhou"
        melody_source="random_select.touhou",
        lang="zh",
        speaker="resource/singer/singer_embedding_ace-2.npy",
    )

    # load model
    model = svs_warmup(config)

    if config.lang == "zh":
        answer_text = "天气真好\n空气清新\n气温温和\n风和日丽\n天高气爽\n阳光明媚"
    elif config.lang == "jp":
        answer_text = "流れてく時の中ででもけだるさが"
    else:
        print(f"Currently the system does not support {config.lang}")
        exit(1)

    sample_rate = 44100
    if config.melody_source.startswith("random_select"):
        # load song database: jhansss/kising_score_segments
        song2note_lengths, song_db = load_song_database(config)
        # pick a song and get its phrase lengths; the phrase_length info should
        # then be added to the LLM prompt so the LLM answers with fitting lyrics
        phrase_length, metadata = estimate_sentence_length(
            None, config, song2note_lengths
        )
        additional_kwargs = {"song_db": song_db, "metadata": metadata}
    else:
        additional_kwargs = {}

    wav_info = svs_inference(answer_text, model, config, **additional_kwargs)

    # write the waveform to {melody_source}_{lang}.wav
    save_name = config.melody_source
    sf.write(f"{save_name}_{config.lang}.wav", wav_info, samplerate=sample_rate)
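
    # Untested configuration sketches (the singer id below is illustrative):
    # the 40-singer model takes an integer speaker id via sids instead of an
    # x-vector path, and "random_generate" needs no song database at all.
    #
    #     config.model_path = "espnet/aceopencpop_svs_visinger2_40singer_pretrain"
    #     config.speaker = "3"  # integer singer id, converted with int()
    #
    #     config.melody_source = "random_generate"  # pitches sampled in [57, 69]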