lanlanliu committed on
Commit
a50183d
·
1 Parent(s): f98847a

Modified the processing of Chinese polyphonic characters

Browse files
Files changed (1) hide show
  1. util.py +28 -8
util.py CHANGED
@@ -5,7 +5,8 @@ from typing import List
5
  import re
6
 
7
  from resource.pinyin_dict import PINYIN_DICT
8
- from pypinyin import lazy_pinyin
 
9
 
10
 
11
  def preprocess_input(src_str, seg_syb=" "):
@@ -77,14 +78,33 @@ def get_tokenizer(model, lang):
77
  raise ValueError(f"Only support espnet/aceopencpop_svs_visinger2_40singer_pretrain and espnet/mixdata_svs_visinger2_spkemb_lang_pretrained for now")
78
 
79
 
 
 
 
 
 
 
 
 
80
  def get_pinyin(texts):
81
- pinyin_list = lazy_pinyin(texts)
 
 
 
 
 
 
 
 
 
 
82
  text_list = []
83
- for text in pinyin_list:
84
- if text[0] == "S" or text[0] == "A" or text[0] == "-":
85
- sp_strs = re.findall(r"-|AP|SP", text)
86
- for phn in sp_strs:
87
- text_list.append(phn)
88
  else:
89
- text_list.append(text)
 
90
  return text_list
 
5
  import re
6
 
7
  from resource.pinyin_dict import PINYIN_DICT
8
+ from pypinyin import pinyin, Style
9
+ from zhconv import convert
10
 
11
 
12
  def preprocess_input(src_str, seg_syb=" "):
 
78
  raise ValueError(f"Only support espnet/aceopencpop_svs_visinger2_40singer_pretrain and espnet/mixdata_svs_visinger2_spkemb_lang_pretrained for now")
79
 
80
 
81
def is_chinese(char):
    """Return True when *char* sorts within the range U+4E00..U+9FFF.

    The check is a plain string comparison, so it is meant for a single
    character; longer strings are compared lexicographically and an empty
    string yields False.
    """
    lower_bound, upper_bound = '\u4e00', '\u9fff'
    if char < lower_bound:
        return False
    return char <= upper_bound
83
+
84
+
85
def is_special(char):
    """Return True when *char* is made up only of the characters -, —, A, P, S.

    The pattern is a character class, so any non-empty combination of those
    characters is accepted (for example "AP", "SP", "——", but also "A" or
    "PS"). An empty string yields False.
    """
    # NOTE(review): if only the whole tokens "-", "——", "AP" and "SP" were
    # meant to match, the class would need to be an alternation - confirm
    # against the callers before changing it.
    matched = re.match(r'^[-——APSP]+$', char)
    if matched is None:
        return False
    return True
87
+
88
+
89
  def get_pinyin(texts):
90
+ texts = preprocess_input(texts, seg_syb="")
91
+ pattern = re.compile(r'[\u4e00-\u9fff]|[^\u4e00-\u9fff]+')
92
+ blocks = pattern.findall(texts)
93
+
94
+ characters = [block for block in blocks if is_chinese(block)]
95
+ chinese_text = ''.join(characters)
96
+ chinese_text = convert(chinese_text, 'zh-cn')
97
+
98
+ chinese_pinyin = pinyin(chinese_text, style=Style.NORMAL)
99
+ chinese_pinyin = [item[0] for item in chinese_pinyin]
100
+
101
  text_list = []
102
+ pinyin_idx = 0
103
+ for block in blocks:
104
+ if is_chinese(block):
105
+ text_list.append(chinese_pinyin[pinyin_idx])
106
+ pinyin_idx += 1
107
  else:
108
+ text_list.append(block)
109
+
110
  return text_list