jhansss committed
Commit 987c46e · 1 parent: 366a231

syntax updates

Files changed (2):
  1. svs_utils.py +62 -35
  2. util.py +3 -14
svs_utils.py CHANGED
@@ -1,4 +1,9 @@
-from util import preprocess_input, postprocess_phn, get_tokenizer, load_pitch_dict, get_pinyin
+from util import (
+    preprocess_input,
+    postprocess_phn,
+    get_tokenizer,
+    get_pinyin,
+)
 from espnet_model_zoo.downloader import ModelDownloader
 from espnet2.bin.svs_inference import SingingGenerate
 import librosa
@@ -7,6 +12,7 @@ import numpy as np
 import random
 import json
 from datasets import load_dataset
+
 # the code below should be in app.py than svs_utils.py
 # espnet_model_dict = {
 #     "Model①(Chinese)-zh": "espnet/aceopencpop_svs_visinger2_40singer_pretrain",
@@ -55,7 +61,7 @@ def svs_warmup(config):
         model = SingingGenerate(
             train_config=downloaded["train_config"],
             model_file=downloaded["model_file"],
-            device=config.device
+            device=config.device,
         )
     else:
         raise NotImplementedError(f"Model {config.model_path} not supported")
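
Aside: this hunk only adds a trailing comma, but it marks the one place the inference device is wired into SingingGenerate. A minimal sketch of a warmup call, assuming a plain namespace config carrying the model_path/device fields read above, and assuming svs_warmup returns the SingingGenerate instance (neither is shown in this diff):

    from types import SimpleNamespace

    # Hypothetical config; field names mirror the attributes svs_warmup reads.
    config = SimpleNamespace(
        model_path="espnet/aceopencpop_svs_visinger2_40singer_pretrain",  # from the commented model dict above
        device="cuda",  # forwarded to SingingGenerate by this hunk
    )
    model = svs_warmup(config)  # assumed to return the SingingGenerate instance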
@@ -63,8 +69,8 @@ def svs_warmup(config):
 
 
 def svs_text_preprocessor(model_path, texts, lang):
-    '''
-    Input:
+    """
+    Input:
     - model_path (str), for getting the corresponding tokenizer
     - texts (str), in Chinese character or Japanese character
     - lang (str), language label jp/zh, input if is not espnet model
@@ -74,7 +80,7 @@ def svs_text_preprocessor(model_path, texts, lang):
     - sybs (phn w/ _ list), each element as 'k@zh_e@zh'
     - labels (phn w/o _ list), each element as 'k@zh'
 
-    '''
+    """
    fs = 44100
 
    if texts is None:
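
Aside: the docstring above fixes the output conventions. A small grounded illustration of how sybs and labels relate, using the docstring's own example values:

    # One element of sybs keeps a syllable's phonemes joined by "_";
    # splitting it yields the corresponding labels entries.
    syb = "k@zh_e@zh"        # sybs element: phonemes of one syllable
    labels = syb.split("_")  # -> ['k@zh', 'e@zh'], the flat per-phoneme form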
@@ -122,22 +128,21 @@ def svs_text_preprocessor(model_path, texts, lang):
 
 
 def svs_get_batch(model_path, answer_text, lang, random_gen=True):
-    '''
-    Input:
+    """
+    Input:
     - answer_text (str), in Chinese character or Japanese character
     - model_path (str), loaded pretrained model name
     - lang (str), language label jp/zh, input if is not espnet model
     Output:
-    - batch (dict)
+    - batch (dict)
 
-    {'score': (75, [[0, 0.48057527844210024, 'n@zhi@zh', 66, 'n@zh_i@zh'],
-    [0.48057527844210024, 0.8049310140914353, 'k@zhe@zh', 57, 'k@zh_e@zh'],
-    [0.8049310140914353, 1.1905956333296641, 'm@zhei@zh', 64, 'm@zh_ei@zh']]),
+    {'score': (75, [[0, 0.48057527844210024, 'n@zhi@zh', 66, 'n@zh_i@zh'],
+    [0.48057527844210024, 0.8049310140914353, 'k@zhe@zh', 57, 'k@zh_e@zh'],
+    [0.8049310140914353, 1.1905956333296641, 'm@zhei@zh', 64, 'm@zh_ei@zh']]),
     'text': 'n@zh i@zh k@zh e@zh m@zh ei@zh'}
-    '''
+    """
    tempo = 120
-    lyric_ls, sybs, labels = svs_text_preprocessor(
-        model_path, answer_text, lang)
+    lyric_ls, sybs, labels = svs_text_preprocessor(model_path, answer_text, lang)
    len_note = len(lyric_ls)
    notes = []
    if random_gen:
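
Aside: the docstring pins down the batch layout: "score" is a (tempo, notes) pair where each note is [start_sec, end_sec, note_lyric, midi_pitch, syllable]. A short sketch of consuming that structure, with values abridged from the docstring:

    batch = {
        "score": (
            75,  # tempo in BPM
            [
                [0.0, 0.4806, "n@zhi@zh", 66, "n@zh_i@zh"],
                [0.4806, 0.8049, "k@zhe@zh", 57, "k@zh_e@zh"],
            ],
        ),
        "text": "n@zh i@zh k@zh e@zh",
    }
    tempo, notes = batch["score"]
    for start, end, lyric, midi, syllable in notes:
        assert end > start  # times are in seconds; midi is a MIDI pitch number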
@@ -146,7 +151,7 @@ def svs_get_batch(model_path, answer_text, lang, random_gen=True):
        for id_lyric in range(len_note):
            pitch = random.randint(57, 69)
            period = round(random.uniform(0.1, 0.5), 4)
-            ed = st+period
+            ed = st + period
            note = [st, ed, lyric_ls[id_lyric], pitch, sybs[id_lyric]]
            st = ed
            notes.append(note)
@@ -175,8 +180,7 @@ svs = None
 
 
 def svs_inference(model_name, model_svs, answer_text, lang, random_gen=True, fs=44100):
-    batch = svs_get_batch(model_name, answer_text, lang,
-                          random_gen=random_gen)
+    batch = svs_get_batch(model_name, answer_text, lang, random_gen=random_gen)
 
    # Infer
    spk = "singer1 (male)"
@@ -209,7 +213,8 @@ def svs_inference(model_name, model_svs, answer_text, lang, random_gen=True, fs=
 
 def singmos_warmup(config):
     predictor = torch.hub.load(
-        "South-Twilight/SingMOS:v0.2.0", "singing_ssl_mos", trust_repo=True)
+        "South-Twilight/SingMOS:v0.2.0", "singing_ssl_mos", trust_repo=True
+    )
     return predictor, "South-Twilight/SingMOS:v0.2.0"
 
 
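Aside: singmos_warmup pulls the SingMOS predictor via torch.hub. A hedged usage sketch; the forward signature and the 16 kHz input are assumptions based on the South-Twilight/SingMOS README and should be checked there:

    import torch
    import librosa

    predictor, tag = singmos_warmup(config)
    wav, _ = librosa.load("output_retrieved.wav", sr=16000)  # assumed 16 kHz input
    wav_t = torch.from_numpy(wav).unsqueeze(0)               # (batch, num_samples)
    lengths = torch.tensor([wav_t.shape[1]])
    mos = predictor(wav_t, lengths)                          # predicted MOS, roughly 1-5
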
@@ -230,8 +235,7 @@ def estimate_sentence_length(query, config, song2note_lengths):
         metadata = {"song_name": song_name}
         return phrase_length, metadata
     else:
-        raise NotImplementedError(
-            f"melody source {config.melody_source} not supported")
+        raise NotImplementedError(f"melody source {config.melody_source} not supported")
 
 
 def align_score_and_text(segment_iterator, lyric_ls, sybs, labels):
@@ -240,22 +244,43 @@ def align_score_and_text(segment_iterator, lyric_ls, sybs, labels):
     notes_info = []
     while lyric_idx < len(lyric_ls):
         score = next(segment_iterator)
-        for note_start_time, note_end_time, reference_note_lyric, note_midi in zip(score['note_start_times'], score['note_end_times'], score["note_lyrics"], score['note_midi']):
+        for note_start_time, note_end_time, reference_note_lyric, note_midi in zip(
+            score["note_start_times"],
+            score["note_end_times"],
+            score["note_lyrics"],
+            score["note_midi"],
+        ):
             if reference_note_lyric in ["<AP>", "<SP>"]:
-                notes_info.append([note_start_time, note_end_time, reference_note_lyric.strip("<>"), note_midi, reference_note_lyric.strip("<>")])
+                notes_info.append(
+                    [
+                        note_start_time,
+                        note_end_time,
+                        reference_note_lyric.strip("<>"),
+                        note_midi,
+                        reference_note_lyric.strip("<>"),
+                    ]
+                )
                 text.append(reference_note_lyric.strip("<>"))
             else:
-                notes_info.append([note_start_time, note_end_time, lyric_ls[lyric_idx], note_midi, sybs[lyric_idx]])
+                notes_info.append(
+                    [
+                        note_start_time,
+                        note_end_time,
+                        lyric_ls[lyric_idx],
+                        note_midi,
+                        sybs[lyric_idx],
+                    ]
+                )
                 text += sybs[lyric_idx].split("_")
                 lyric_idx += 1
                 if lyric_idx >= len(lyric_ls):
                     break
     batch = {
         "score": (
-            score['tempo'],  # Assume the tempo is the same for all segments
-            notes_info
+            score["tempo"],  # Assume the tempo is the same for all segments
+            notes_info,
         ),
-        "text": " ".join(text)
+        "text": " ".join(text),
     }
     return batch
 
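Aside: the loop above consumes one score segment at a time and reads five keys from it. A stub iterator showing the expected per-segment schema (the key names are exactly those read in the hunk; the values here are made up):

    def dummy_segment_iterator():
        # "<AP>"/"<SP>" note lyrics mark breath/silence and keep their own timing.
        yield {
            "tempo": 75,
            "note_start_times": [0.0, 0.48],
            "note_end_times": [0.48, 0.80],
            "note_lyrics": ["<AP>", "placeholder"],
            "note_midi": [0, 57],
        }
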
@@ -273,15 +298,17 @@ def song_segment_iterator(song_db, metadata):
 
 
 def load_song_database():
-    song_db = load_dataset("jhansss/kising_score_segments",
-                           cache_dir="cache")["train"].to_pandas()
+    song_db = load_dataset(
+        "jhansss/kising_score_segments", cache_dir="cache", split="train"
+    ).to_pandas()
     song_db.set_index("segment_id", inplace=True)
 
     with open("data/song2note_lengths.json", "r") as f:
         song2note_lengths = json.load(f)
-    return song2note_lengths,song_db
+    return song2note_lengths, song_db
+
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     import argparse
 
     # -------- demo code for generate audio from randomly selected song ---------#
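
Aside: the rewrite swaps DatasetDict indexing for the split argument; both forms return the same datasets.Dataset. For reference:

    from datasets import load_dataset

    # Old form: download the dataset, then index the resulting DatasetDict.
    ds = load_dataset("jhansss/kising_score_segments", cache_dir="cache")["train"]
    # New form: ask for the split directly and skip the DatasetDict.
    ds = load_dataset("jhansss/kising_score_segments", cache_dir="cache", split="train")
    song_db = ds.to_pandas()  # one row per score segment; indexed by segment_id afterwards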
@@ -300,23 +327,23 @@ if __name__ == '__main__':
     song2note_lengths, song_db = load_song_database()
 
     # get song_name and phrase_length
-    phrase_length, metadata = estimate_sentence_length(
-        None, config, song2note_lengths)
+    phrase_length, metadata = estimate_sentence_length(None, config, song2note_lengths)
 
     # then, phrase_length info should be added to llm prompt, and get the answer lyrics from llm
     # e.g. answer_text = "天气真好\n空气清新"
     answer_text = "天气真好\n空气清新"
     lyric_ls, sybs, labels = svs_text_preprocessor(
-        config.model_path, answer_text, config.lang)
+        config.model_path, answer_text, config.lang
+    )
     segment_iterator = song_segment_iterator(song_db, metadata)
     batch = align_score_and_text(segment_iterator, lyric_ls, sybs, labels)
-    singer_embedding = np.load(
-        singer_embeddings[config.model_path]["singer2 (female)"])
+    singer_embedding = np.load(singer_embeddings[config.model_path]["singer2 (female)"])
     lid = np.array([langs[config.lang]])
     output_dict = model(batch, lids=lid, spembs=singer_embedding)
     wav_info = output_dict["wav"].cpu().numpy()
     # write wav to output_retrieved.wav
     import soundfile as sf
+
     sf.write("output_retrieved.wav", wav_info, samplerate=44100)
 
     # -------- some other processes ---------#
util.py CHANGED
@@ -8,6 +8,7 @@ import pyopenjtalk
 from resource.pinyin_dict import PINYIN_DICT
 from pypinyin import lazy_pinyin
 
+
 def preprocess_input(src_str, seg_syb=" "):
     src_str = src_str.replace("\n", seg_syb)
     src_str = src_str.replace(" ", seg_syb)
@@ -29,7 +30,6 @@ def pyopenjtalk_g2p(text) -> List[str]:
         for warning in w:
             if "No phoneme" in str(warning.message):
                 return False
-
     phones = phones.split(" ")
     return phones
 
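Aside: pyopenjtalk_g2p returns either a phoneme list or False when pyopenjtalk emits a "No phoneme" warning, so callers must check the falsy case. An illustration (the exact phoneme inventory depends on pyopenjtalk):

    phones = pyopenjtalk_g2p("こんにちは")
    # e.g. ['k', 'o', 'N', 'n', 'i', 'ch', 'i', 'w', 'a']
    if phones is False:
        raise ValueError("input could not be converted to phonemes")
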
@@ -73,21 +73,10 @@ def get_pinyin(texts):
     pinyin_list = lazy_pinyin(texts)
     text_list = []
     for text in pinyin_list:
-        if text[0] == "S" or text[0] == "A" or text[0] == '-':
-            sp_strs = re.findall(r'-|AP|SP', text)
+        if text[0] == "S" or text[0] == "A" or text[0] == "-":
+            sp_strs = re.findall(r"-|AP|SP", text)
             for phn in sp_strs:
                 text_list.append(phn)
         else:
             text_list.append(text)
     return text_list
-
-
-def load_pitch_dict(file_path = "resource/midi-note.scp"):
-    pitch_dict = {}
-    with open(file_path, "r", encoding="utf-8") as f:
-        for line in f:
-            items = line.strip().split()
-            pitch_dict[items[0]] = int(items[1])
-            pitch_dict[items[1]] = int(items[1])
-    return pitch_dict
-
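Aside: get_pinyin leans on pypinyin's lazy_pinyin, which passes non-Chinese runs through untouched; the S/A/- branch then splits embedded AP/SP/- markers into their own tokens. An illustrative call, assuming lazy_pinyin keeps a run like "AP" as a single element (pypinyin's default handling of non-hanzi text):

    get_pinyin("天气AP真好")
    # lazy_pinyin -> ['tian', 'qi', 'AP', 'zhen', 'hao']
    # get_pinyin  -> ['tian', 'qi', 'AP', 'zhen', 'hao']   ("AP" re-emitted by the regex)
    get_pinyin("天气-AP真好")
    # a combined marker token such as '-AP' is split into ['-', 'AP']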