Han Jionghao committed on
Commit
93bddf5
·
unverified ·
2 Parent(s): ee7581f 586bf69

Merge branch 'SingingSDS:main' into main

Browse files
Files changed (4) hide show
  1. requirements.txt +1 -0
  2. server.py +4 -3
  3. test_performance.py +3 -3
  4. util.py +31 -8
requirements.txt CHANGED
@@ -13,4 +13,5 @@ basic-pitch[onnx]
13
  audiobox_aesthetics
14
  transformers
15
  s3prl
 
16
  git+https://github.com/sea-turt1e/kanjiconv
 
13
  audiobox_aesthetics
14
  transformers
15
  s3prl
16
+ zhconv
17
  git+https://github.com/sea-turt1e/kanjiconv
server.py CHANGED
@@ -12,6 +12,7 @@ import jiwer
12
  import librosa
13
  from svs_utils import load_song_database, estimate_sentence_length
14
  from svs_eval import singmos_warmup, singmos_evaluation
 
15
 
16
 
17
  asr_pipeline = pipeline(
@@ -144,13 +145,13 @@ def on_click_metrics():
144
  # OWSM ctc + PER
145
  y, sr = librosa.load("tmp/response.wav", sr=16000)
146
  asr_result = asr_pipeline(y, generate_kwargs={"language": "mandarin"} )['text']
147
- hyp_pinin = lazy_pinyin(asr_result)
148
 
149
  with open(f"tmp/llm.txt", "r") as f:
150
  ref = f.read().replace(' ', '')
151
 
152
- ref_pinin = lazy_pinyin(ref)
153
- per = jiwer.wer(" ".join(ref_pinin), " ".join(hyp_pinin))
154
 
155
  audio = librosa.load(f"tmp/response.wav", sr=sample_rate)[0]
156
  singmos = singmos_evaluation(
 
12
  import librosa
13
  from svs_utils import load_song_database, estimate_sentence_length
14
  from svs_eval import singmos_warmup, singmos_evaluation
15
+ from util import get_pinyin
16
 
17
 
18
  asr_pipeline = pipeline(
 
145
  # OWSM ctc + PER
146
  y, sr = librosa.load("tmp/response.wav", sr=16000)
147
  asr_result = asr_pipeline(y, generate_kwargs={"language": "mandarin"} )['text']
148
+ hyp_pinyin = get_pinyin(asr_result)
149
 
150
  with open(f"tmp/llm.txt", "r") as f:
151
  ref = f.read().replace(' ', '')
152
 
153
+ ref_pinyin = get_pinyin(ref)
154
+ per = jiwer.wer(" ".join(ref_pinyin), " ".join(hyp_pinyin))
155
 
156
  audio = librosa.load(f"tmp/response.wav", sr=sample_rate)[0]
157
  singmos = singmos_evaluation(
test_performance.py CHANGED
@@ -118,10 +118,10 @@ def on_click_metrics(audio_path, ref):
118
  asr_result = asr_pipeline(y, generate_kwargs={"language": "mandarin"} )['text']
119
 
120
  # Espnet embedded g2p, but sometimes it will mispronounce polyphonic characters
121
- hyp_pinin = pypinyin_g2p_phone_without_prosody(asr_result)
122
 
123
- ref_pinin = pypinyin_g2p_phone_without_prosody(ref)
124
- per = jiwer.wer(ref_pinin, hyp_pinin)
125
 
126
  audio = librosa.load(audio_path, sr=22050)[0]
127
  singmos = singmos_evaluation(
 
118
  asr_result = asr_pipeline(y, generate_kwargs={"language": "mandarin"} )['text']
119
 
120
  # Espnet embedded g2p, but sometimes it will mispronounce polyphonic characters
121
+ hyp_pinyin = pypinyin_g2p_phone_without_prosody(asr_result)
122
 
123
+ ref_pinyin = pypinyin_g2p_phone_without_prosody(ref)
124
+ per = jiwer.wer(ref_pinyin, hyp_pinyin)
125
 
126
  audio = librosa.load(audio_path, sr=22050)[0]
127
  singmos = singmos_evaluation(
util.py CHANGED
@@ -5,7 +5,8 @@ from typing import List
5
  import re
6
 
7
  from resource.pinyin_dict import PINYIN_DICT
8
- from pypinyin import lazy_pinyin
 
9
 
10
 
11
  def preprocess_input(src_str, seg_syb=" "):
@@ -77,14 +78,36 @@ def get_tokenizer(model, lang):
77
  raise ValueError(f"Only support espnet/aceopencpop_svs_visinger2_40singer_pretrain and espnet/mixdata_svs_visinger2_spkemb_lang_pretrained for now")
78
 
79
 
 
 
 
 
 
 
 
 
80
  def get_pinyin(texts):
81
- pinyin_list = lazy_pinyin(texts)
 
 
 
 
 
 
 
 
 
82
  text_list = []
83
- for text in pinyin_list:
84
- if text[0] == "S" or text[0] == "A" or text[0] == "-":
85
- sp_strs = re.findall(r"-|AP|SP", text)
86
- for phn in sp_strs:
87
- text_list.append(phn)
88
  else:
89
- text_list.append(text)
 
 
 
 
 
90
  return text_list
 
5
  import re
6
 
7
  from resource.pinyin_dict import PINYIN_DICT
8
+ from pypinyin import pinyin, Style
9
+ from zhconv import convert
10
 
11
 
12
  def preprocess_input(src_str, seg_syb=" "):
 
78
  raise ValueError(f"Only support espnet/aceopencpop_svs_visinger2_40singer_pretrain and espnet/mixdata_svs_visinger2_spkemb_lang_pretrained for now")
79
 
80
 
81
+ def is_chinese(char):
82
+ return '\u4e00' <= char <= '\u9fff'
83
+
84
+
85
+ def is_special(block):
86
+ return any(token in block for token in ['-', 'AP', 'SP'])
87
+
88
+
89
  def get_pinyin(texts):
90
+ texts = preprocess_input(texts, seg_syb="")
91
+ blocks = re.compile(r'[\u4e00-\u9fff]|[^\u4e00-\u9fff]+').findall(texts)
92
+
93
+ characters = [block for block in blocks if is_chinese(block)]
94
+ chinese_text = ''.join(characters)
95
+ chinese_text = convert(chinese_text, 'zh-cn')
96
+
97
+ chinese_pinyin = pinyin(chinese_text, style=Style.NORMAL)
98
+ chinese_pinyin = [item[0] for item in chinese_pinyin]
99
+
100
  text_list = []
101
+ pinyin_idx = 0
102
+ for block in blocks:
103
+ if is_chinese(block):
104
+ text_list.append(chinese_pinyin[pinyin_idx])
105
+ pinyin_idx += 1
106
  else:
107
+ if is_special(block):
108
+ specials = re.compile(r"-|AP|SP").findall(block)
109
+ text_list.extend(specials)
110
+ else:
111
+ text_list.append(block)
112
+
113
  return text_list