Spaces:
Sleeping
Sleeping
syntax updates
Browse files
- svs_utils.py +62 -35
- util.py +3 -14
svs_utils.py
CHANGED
@@ -1,4 +1,9 @@
|
|
1 |
-
from util import
|
|
|
|
|
|
|
|
|
|
|
2 |
from espnet_model_zoo.downloader import ModelDownloader
|
3 |
from espnet2.bin.svs_inference import SingingGenerate
|
4 |
import librosa
|
@@ -7,6 +12,7 @@ import numpy as np
|
|
7 |
import random
|
8 |
import json
|
9 |
from datasets import load_dataset
|
|
|
10 |
# the code below should be in app.py rather than svs_utils.py
|
11 |
# espnet_model_dict = {
|
12 |
# "Model①(Chinese)-zh": "espnet/aceopencpop_svs_visinger2_40singer_pretrain",
|
@@ -55,7 +61,7 @@ def svs_warmup(config):
|
|
55 |
model = SingingGenerate(
|
56 |
train_config=downloaded["train_config"],
|
57 |
model_file=downloaded["model_file"],
|
58 |
-
device=config.device
|
59 |
)
|
60 |
else:
|
61 |
raise NotImplementedError(f"Model {config.model_path} not supported")
|
@@ -63,8 +69,8 @@ def svs_warmup(config):
|
|
63 |
|
64 |
|
65 |
def svs_text_preprocessor(model_path, texts, lang):
|
66 |
-
|
67 |
-
Input:
|
68 |
- model_path (str), for getting the corresponding tokenizer
|
69 |
- texts (str), in Chinese character or Japanese character
|
70 |
- lang (str), language label jp/zh, input if is not espnet model
|
@@ -74,7 +80,7 @@ def svs_text_preprocessor(model_path, texts, lang):
|
|
74 |
- sybs (phn w/ _ list), each element as 'k@zh_e@zh'
|
75 |
- labels (phn w/o _ list), each element as 'k@zh'
|
76 |
|
77 |
-
|
78 |
fs = 44100
|
79 |
|
80 |
if texts is None:
|
@@ -122,22 +128,21 @@ def svs_text_preprocessor(model_path, texts, lang):
|
|
122 |
|
123 |
|
124 |
def svs_get_batch(model_path, answer_text, lang, random_gen=True):
|
125 |
-
|
126 |
-
Input:
|
127 |
- answer_text (str), in Chinese character or Japanese character
|
128 |
- model_path (str), loaded pretrained model name
|
129 |
- lang (str), language label jp/zh, input if is not espnet model
|
130 |
Output:
|
131 |
-
- batch (dict)
|
132 |
|
133 |
-
{'score': (75, [[0, 0.48057527844210024, 'n@zhi@zh', 66, 'n@zh_i@zh'],
|
134 |
-
[0.48057527844210024, 0.8049310140914353, 'k@zhe@zh', 57, 'k@zh_e@zh'],
|
135 |
-
[0.8049310140914353, 1.1905956333296641, 'm@zhei@zh', 64, 'm@zh_ei@zh']]),
|
136 |
'text': 'n@zh i@zh k@zh e@zh m@zh ei@zh'}
|
137 |
-
|
138 |
tempo = 120
|
139 |
-
lyric_ls, sybs, labels = svs_text_preprocessor(
|
140 |
-
model_path, answer_text, lang)
|
141 |
len_note = len(lyric_ls)
|
142 |
notes = []
|
143 |
if random_gen:
|
@@ -146,7 +151,7 @@ def svs_get_batch(model_path, answer_text, lang, random_gen=True):
|
|
146 |
for id_lyric in range(len_note):
|
147 |
pitch = random.randint(57, 69)
|
148 |
period = round(random.uniform(0.1, 0.5), 4)
|
149 |
-
ed = st+period
|
150 |
note = [st, ed, lyric_ls[id_lyric], pitch, sybs[id_lyric]]
|
151 |
st = ed
|
152 |
notes.append(note)
|
@@ -175,8 +180,7 @@ svs = None
|
|
175 |
|
176 |
|
177 |
def svs_inference(model_name, model_svs, answer_text, lang, random_gen=True, fs=44100):
|
178 |
-
batch = svs_get_batch(model_name, answer_text, lang,
|
179 |
-
random_gen=random_gen)
|
180 |
|
181 |
# Infer
|
182 |
spk = "singer1 (male)"
|
@@ -209,7 +213,8 @@ def svs_inference(model_name, model_svs, answer_text, lang, random_gen=True, fs=
|
|
209 |
|
210 |
def singmos_warmup(config):
|
211 |
predictor = torch.hub.load(
|
212 |
-
"South-Twilight/SingMOS:v0.2.0", "singing_ssl_mos", trust_repo=True
|
|
|
213 |
return predictor, "South-Twilight/SingMOS:v0.2.0"
|
214 |
|
215 |
|
@@ -230,8 +235,7 @@ def estimate_sentence_length(query, config, song2note_lengths):
|
|
230 |
metadata = {"song_name": song_name}
|
231 |
return phrase_length, metadata
|
232 |
else:
|
233 |
-
raise NotImplementedError(
|
234 |
-
f"melody source {config.melody_source} not supported")
|
235 |
|
236 |
|
237 |
def align_score_and_text(segment_iterator, lyric_ls, sybs, labels):
|
@@ -240,22 +244,43 @@ def align_score_and_text(segment_iterator, lyric_ls, sybs, labels):
|
|
240 |
notes_info = []
|
241 |
while lyric_idx < len(lyric_ls):
|
242 |
score = next(segment_iterator)
|
243 |
-
for note_start_time, note_end_time, reference_note_lyric, note_midi in zip(
|
|
|
|
|
|
|
|
|
|
|
244 |
if reference_note_lyric in ["<AP>", "<SP>"]:
|
245 |
-
notes_info.append(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
246 |
text.append(reference_note_lyric.strip("<>"))
|
247 |
else:
|
248 |
-
notes_info.append(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
249 |
text += sybs[lyric_idx].split("_")
|
250 |
lyric_idx += 1
|
251 |
if lyric_idx >= len(lyric_ls):
|
252 |
break
|
253 |
batch = {
|
254 |
"score": (
|
255 |
-
score[
|
256 |
-
notes_info
|
257 |
),
|
258 |
-
"text": " ".join(text)
|
259 |
}
|
260 |
return batch
|
261 |
|
@@ -273,15 +298,17 @@ def song_segment_iterator(song_db, metadata):
|
|
273 |
|
274 |
|
275 |
def load_song_database():
|
276 |
-
song_db = load_dataset(
|
277 |
-
|
|
|
278 |
song_db.set_index("segment_id", inplace=True)
|
279 |
|
280 |
with open("data/song2note_lengths.json", "r") as f:
|
281 |
song2note_lengths = json.load(f)
|
282 |
-
return song2note_lengths,song_db
|
|
|
283 |
|
284 |
-
if __name__ ==
|
285 |
import argparse
|
286 |
|
287 |
# -------- demo code for generating audio from a randomly selected song ---------#
|
@@ -300,23 +327,23 @@ if __name__ == '__main__':
|
|
300 |
song2note_lengths, song_db = load_song_database()
|
301 |
|
302 |
# get song_name and phrase_length
|
303 |
-
phrase_length, metadata = estimate_sentence_length(
|
304 |
-
None, config, song2note_lengths)
|
305 |
|
306 |
# then, the phrase_length info should be added to the LLM prompt to get the answer lyrics from the LLM
|
307 |
# e.g. answer_text = "天气真好\n空气清新"
|
308 |
answer_text = "天气真好\n空气清新"
|
309 |
lyric_ls, sybs, labels = svs_text_preprocessor(
|
310 |
-
config.model_path, answer_text, config.lang
|
|
|
311 |
segment_iterator = song_segment_iterator(song_db, metadata)
|
312 |
batch = align_score_and_text(segment_iterator, lyric_ls, sybs, labels)
|
313 |
-
singer_embedding = np.load(
|
314 |
-
singer_embeddings[config.model_path]["singer2 (female)"])
|
315 |
lid = np.array([langs[config.lang]])
|
316 |
output_dict = model(batch, lids=lid, spembs=singer_embedding)
|
317 |
wav_info = output_dict["wav"].cpu().numpy()
|
318 |
# write wav to output_retrieved.wav
|
319 |
import soundfile as sf
|
|
|
320 |
sf.write("output_retrieved.wav", wav_info, samplerate=44100)
|
321 |
|
322 |
# -------- some other processes ---------#
|
|
|
1 |
+
from util import (
|
2 |
+
preprocess_input,
|
3 |
+
postprocess_phn,
|
4 |
+
get_tokenizer,
|
5 |
+
get_pinyin,
|
6 |
+
)
|
7 |
from espnet_model_zoo.downloader import ModelDownloader
|
8 |
from espnet2.bin.svs_inference import SingingGenerate
|
9 |
import librosa
|
|
|
12 |
import random
|
13 |
import json
|
14 |
from datasets import load_dataset
|
15 |
+
|
16 |
# the code below should be in app.py rather than svs_utils.py
|
17 |
# espnet_model_dict = {
|
18 |
# "Model①(Chinese)-zh": "espnet/aceopencpop_svs_visinger2_40singer_pretrain",
|
|
|
61 |
model = SingingGenerate(
|
62 |
train_config=downloaded["train_config"],
|
63 |
model_file=downloaded["model_file"],
|
64 |
+
device=config.device,
|
65 |
)
|
66 |
else:
|
67 |
raise NotImplementedError(f"Model {config.model_path} not supported")
|
|
|
69 |
|
70 |
|
71 |
def svs_text_preprocessor(model_path, texts, lang):
|
72 |
+
"""
|
73 |
+
Input:
|
74 |
- model_path (str), for getting the corresponding tokenizer
|
75 |
- texts (str), in Chinese character or Japanese character
|
76 |
- lang (str), language label jp/zh, input if is not espnet model
|
|
|
80 |
- sybs (phn w/ _ list), each element as 'k@zh_e@zh'
|
81 |
- labels (phn w/o _ list), each element as 'k@zh'
|
82 |
|
83 |
+
"""
|
84 |
fs = 44100
|
85 |
|
86 |
if texts is None:
|
|
|
128 |
|
129 |
|
130 |
def svs_get_batch(model_path, answer_text, lang, random_gen=True):
|
131 |
+
"""
|
132 |
+
Input:
|
133 |
- answer_text (str), in Chinese character or Japanese character
|
134 |
- model_path (str), loaded pretrained model name
|
135 |
- lang (str), language label jp/zh, input if is not espnet model
|
136 |
Output:
|
137 |
+
- batch (dict)
|
138 |
|
139 |
+
{'score': (75, [[0, 0.48057527844210024, 'n@zhi@zh', 66, 'n@zh_i@zh'],
|
140 |
+
[0.48057527844210024, 0.8049310140914353, 'k@zhe@zh', 57, 'k@zh_e@zh'],
|
141 |
+
[0.8049310140914353, 1.1905956333296641, 'm@zhei@zh', 64, 'm@zh_ei@zh']]),
|
142 |
'text': 'n@zh i@zh k@zh e@zh m@zh ei@zh'}
|
143 |
+
"""
|
144 |
tempo = 120
|
145 |
+
lyric_ls, sybs, labels = svs_text_preprocessor(model_path, answer_text, lang)
|
|
|
146 |
len_note = len(lyric_ls)
|
147 |
notes = []
|
148 |
if random_gen:
|
|
|
151 |
for id_lyric in range(len_note):
|
152 |
pitch = random.randint(57, 69)
|
153 |
period = round(random.uniform(0.1, 0.5), 4)
|
154 |
+
ed = st + period
|
155 |
note = [st, ed, lyric_ls[id_lyric], pitch, sybs[id_lyric]]
|
156 |
st = ed
|
157 |
notes.append(note)
|
|
|
180 |
|
181 |
|
182 |
def svs_inference(model_name, model_svs, answer_text, lang, random_gen=True, fs=44100):
|
183 |
+
batch = svs_get_batch(model_name, answer_text, lang, random_gen=random_gen)
|
|
|
184 |
|
185 |
# Infer
|
186 |
spk = "singer1 (male)"
|
|
|
213 |
|
214 |
def singmos_warmup(config):
|
215 |
predictor = torch.hub.load(
|
216 |
+
"South-Twilight/SingMOS:v0.2.0", "singing_ssl_mos", trust_repo=True
|
217 |
+
)
|
218 |
return predictor, "South-Twilight/SingMOS:v0.2.0"
|
219 |
|
220 |
|
|
|
235 |
metadata = {"song_name": song_name}
|
236 |
return phrase_length, metadata
|
237 |
else:
|
238 |
+
raise NotImplementedError(f"melody source {config.melody_source} not supported")
|
|
|
239 |
|
240 |
|
241 |
def align_score_and_text(segment_iterator, lyric_ls, sybs, labels):
|
|
|
244 |
notes_info = []
|
245 |
while lyric_idx < len(lyric_ls):
|
246 |
score = next(segment_iterator)
|
247 |
+
for note_start_time, note_end_time, reference_note_lyric, note_midi in zip(
|
248 |
+
score["note_start_times"],
|
249 |
+
score["note_end_times"],
|
250 |
+
score["note_lyrics"],
|
251 |
+
score["note_midi"],
|
252 |
+
):
|
253 |
if reference_note_lyric in ["<AP>", "<SP>"]:
|
254 |
+
notes_info.append(
|
255 |
+
[
|
256 |
+
note_start_time,
|
257 |
+
note_end_time,
|
258 |
+
reference_note_lyric.strip("<>"),
|
259 |
+
note_midi,
|
260 |
+
reference_note_lyric.strip("<>"),
|
261 |
+
]
|
262 |
+
)
|
263 |
text.append(reference_note_lyric.strip("<>"))
|
264 |
else:
|
265 |
+
notes_info.append(
|
266 |
+
[
|
267 |
+
note_start_time,
|
268 |
+
note_end_time,
|
269 |
+
lyric_ls[lyric_idx],
|
270 |
+
note_midi,
|
271 |
+
sybs[lyric_idx],
|
272 |
+
]
|
273 |
+
)
|
274 |
text += sybs[lyric_idx].split("_")
|
275 |
lyric_idx += 1
|
276 |
if lyric_idx >= len(lyric_ls):
|
277 |
break
|
278 |
batch = {
|
279 |
"score": (
|
280 |
+
score["tempo"], # Assume the tempo is the same for all segments
|
281 |
+
notes_info,
|
282 |
),
|
283 |
+
"text": " ".join(text),
|
284 |
}
|
285 |
return batch
|
286 |
|
|
|
298 |
|
299 |
|
300 |
def load_song_database():
|
301 |
+
song_db = load_dataset(
|
302 |
+
"jhansss/kising_score_segments", cache_dir="cache", split="train"
|
303 |
+
).to_pandas()
|
304 |
song_db.set_index("segment_id", inplace=True)
|
305 |
|
306 |
with open("data/song2note_lengths.json", "r") as f:
|
307 |
song2note_lengths = json.load(f)
|
308 |
+
return song2note_lengths, song_db
|
309 |
+
|
310 |
|
311 |
+
if __name__ == "__main__":
|
312 |
import argparse
|
313 |
|
314 |
# -------- demo code for generating audio from a randomly selected song ---------#
|
|
|
327 |
song2note_lengths, song_db = load_song_database()
|
328 |
|
329 |
# get song_name and phrase_length
|
330 |
+
phrase_length, metadata = estimate_sentence_length(None, config, song2note_lengths)
|
|
|
331 |
|
332 |
# then, the phrase_length info should be added to the LLM prompt to get the answer lyrics from the LLM
|
333 |
# e.g. answer_text = "天气真好\n空气清新"
|
334 |
answer_text = "天气真好\n空气清新"
|
335 |
lyric_ls, sybs, labels = svs_text_preprocessor(
|
336 |
+
config.model_path, answer_text, config.lang
|
337 |
+
)
|
338 |
segment_iterator = song_segment_iterator(song_db, metadata)
|
339 |
batch = align_score_and_text(segment_iterator, lyric_ls, sybs, labels)
|
340 |
+
singer_embedding = np.load(singer_embeddings[config.model_path]["singer2 (female)"])
|
|
|
341 |
lid = np.array([langs[config.lang]])
|
342 |
output_dict = model(batch, lids=lid, spembs=singer_embedding)
|
343 |
wav_info = output_dict["wav"].cpu().numpy()
|
344 |
# write wav to output_retrieved.wav
|
345 |
import soundfile as sf
|
346 |
+
|
347 |
sf.write("output_retrieved.wav", wav_info, samplerate=44100)
|
348 |
|
349 |
# -------- some other processes ---------#
|
util.py
CHANGED
@@ -8,6 +8,7 @@ import pyopenjtalk
|
|
8 |
from resource.pinyin_dict import PINYIN_DICT
|
9 |
from pypinyin import lazy_pinyin
|
10 |
|
|
|
11 |
def preprocess_input(src_str, seg_syb=" "):
|
12 |
src_str = src_str.replace("\n", seg_syb)
|
13 |
src_str = src_str.replace(" ", seg_syb)
|
@@ -29,7 +30,6 @@ def pyopenjtalk_g2p(text) -> List[str]:
|
|
29 |
for warning in w:
|
30 |
if "No phoneme" in str(warning.message):
|
31 |
return False
|
32 |
-
|
33 |
phones = phones.split(" ")
|
34 |
return phones
|
35 |
|
@@ -73,21 +73,10 @@ def get_pinyin(texts):
|
|
73 |
pinyin_list = lazy_pinyin(texts)
|
74 |
text_list = []
|
75 |
for text in pinyin_list:
|
76 |
-
if text[0] == "S" or text[0] == "A" or text[0] ==
|
77 |
-
sp_strs = re.findall(r
|
78 |
for phn in sp_strs:
|
79 |
text_list.append(phn)
|
80 |
else:
|
81 |
text_list.append(text)
|
82 |
return text_list
|
83 |
-
|
84 |
-
|
85 |
-
def load_pitch_dict(file_path = "resource/midi-note.scp"):
|
86 |
-
pitch_dict = {}
|
87 |
-
with open(file_path, "r", encoding="utf-8") as f:
|
88 |
-
for line in f:
|
89 |
-
items = line.strip().split()
|
90 |
-
pitch_dict[items[0]] = int(items[1])
|
91 |
-
pitch_dict[items[1]] = int(items[1])
|
92 |
-
return pitch_dict
|
93 |
-
|
|
|
8 |
from resource.pinyin_dict import PINYIN_DICT
|
9 |
from pypinyin import lazy_pinyin
|
10 |
|
11 |
+
|
12 |
def preprocess_input(src_str, seg_syb=" "):
|
13 |
src_str = src_str.replace("\n", seg_syb)
|
14 |
src_str = src_str.replace(" ", seg_syb)
|
|
|
30 |
for warning in w:
|
31 |
if "No phoneme" in str(warning.message):
|
32 |
return False
|
|
|
33 |
phones = phones.split(" ")
|
34 |
return phones
|
35 |
|
|
|
73 |
pinyin_list = lazy_pinyin(texts)
|
74 |
text_list = []
|
75 |
for text in pinyin_list:
|
76 |
+
if text[0] == "S" or text[0] == "A" or text[0] == "-":
|
77 |
+
sp_strs = re.findall(r"-|AP|SP", text)
|
78 |
for phn in sp_strs:
|
79 |
text_list.append(phn)
|
80 |
else:
|
81 |
text_list.append(text)
|
82 |
return text_list
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|