Yiwen Zhao committed
Commit d191039 · 1 Parent(s): e285e98

update jp preprocess

Files changed (2)
  1. requirements.txt +3 -1
  2. svs_utils.py +85 -3
requirements.txt CHANGED
@@ -6,4 +6,6 @@ torchaudio
  typeguard==4.4.0
  jiwer
  fastapi
- uvicorn
+ uvicorn
+ fugashi
+ pykakasi
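
Note (not part of the commit): fugashi only wraps MeCab, so it needs a dictionary package at runtime; unidic-lite is a common choice and is assumed here, since requirements.txt does not pin one. A minimal sanity check for the two new dependencies, using the same legacy pykakasi calls that svs_utils.py relies on:

    import fugashi
    import pykakasi

    tagger = fugashi.Tagger()   # raises with a hint to install a UniDic dictionary (e.g. unidic-lite) if none is present
    kks = pykakasi.kakasi()
    kks.setMode("K", "H")       # katakana -> hiragana, legacy pykakasi API as used in svs_utils.py
    conv = kks.getConverter()
    print([w.surface for w in tagger("日本語")], conv.do("ニホンゴ"))
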
svs_utils.py CHANGED
@@ -9,6 +9,10 @@ from espnet_model_zoo.downloader import ModelDownloader
  
  from util import get_pinyin, get_tokenizer, postprocess_phn, preprocess_input
  
+ import fugashi
+ import unicodedata
+ import pykakasi
+
  
  def svs_warmup(config):
      """
@@ -29,6 +33,83 @@ def svs_warmup(config):
      return model
  
  
+ yoon_map = {
+     "ぁ": "あ", "ぃ": "い", "ぅ": "う", "ぇ": "え", "ぉ": "お",
+     "ゃ": "や", "ゅ": "ゆ", "ょ": "よ", "ゎ": "わ"
+ }
+
+ def replace_chouonpu(hiragana_text):
+     """ process「ー」since the previous packages didn't support """
+     vowels = {
+         "あ": "あ", "い": "い", "う": "う", "え": "え", "お": "う",
+         "か": "あ", "き": "い", "く": "う", "け": "え", "こ": "う",
+         "さ": "あ", "し": "い", "す": "う", "せ": "え", "そ": "う",
+         "た": "あ", "ち": "い", "つ": "う", "て": "え", "と": "う",
+         "な": "あ", "に": "い", "ぬ": "う", "ね": "え", "の": "う",
+         "は": "あ", "ひ": "い", "ふ": "う", "へ": "え", "ほ": "う",
+         "ま": "あ", "み": "い", "む": "う", "め": "え", "も": "う",
+         "や": "あ", "ゆ": "う", "よ": "う",
+         "ら": "あ", "り": "い", "る": "う", "れ": "え", "ろ": "う",
+         "わ": "あ", "を": "う",
+     }
+
+     new_text = []
+     for i, char in enumerate(hiragana_text):
+         if char == "ー" and i > 0:
+             prev_char = new_text[-1]
+             if prev_char in yoon_map:
+                 prev_char = yoon_map[prev_char]
+             new_text.append(vowels.get(prev_char, prev_char))
+         else:
+             new_text.append(char)
+     return "".join(new_text)
+
+
+ def is_small_kana(kana):  # ょ True よ False
+     for char in kana:
+         name = unicodedata.name(char, "")
+         if "SMALL" in name:
+             return True
+     return False
+
+
+ def kanji_to_SVSDictKana(text):
+     tagger = fugashi.Tagger()
+
+     katagana_text = " ".join(word.feature.pron if word.feature.pron else word.surface for word in tagger(text))
+     print(katagana_text)  # ['トーキョー', 'ダイガク', 'ト', 'キョート', 'ダイガク'] # NOTE(yiwen) the svs predefined dict does not support ー
+
+     kks = pykakasi.kakasi()
+     kks.setMode("K", "H")  # 片仮名 → 平仮名
+     conv = kks.getConverter()
+
+     hiragana_text = " ".join(
+         conv.do(word.feature.pron) if word.feature.pron else word.surface
+         for word in tagger(katagana_text)
+     )
+
+     hiragana_text_wl = replace_chouonpu(hiragana_text).split(" ")  # list
+     # print(f'debug -- hiragana_text {hiragana_text_wl}')
+
+     final_ls = []
+     for subword in hiragana_text_wl:
+         sl_prev = 0
+         for i in range(len(subword)-1):
+             if sl_prev >= len(subword)-1:
+                 break
+             sl = sl_prev + 1
+             if subword[sl] in yoon_map:
+                 final_ls.append(subword[sl_prev:sl+1])
+                 sl_prev += 2
+             else:
+                 final_ls.append(subword[sl_prev])
+                 sl_prev += 1
+         final_ls.append(subword[sl_prev])
+
+     # final_str = " ".join(final_ls)
+     return final_ls
+
+
  def svs_text_preprocessor(model_path, texts, lang):
      """
      Input:
@@ -52,8 +133,9 @@ def svs_text_preprocessor(model_path, texts, lang):
          texts = preprocess_input(texts, "")
          text_list = get_pinyin(texts)
      elif lang == "jp":
-         texts = preprocess_input(texts, "")
-         text_list = list(texts)
+         text_list = kanji_to_SVSDictKana(texts)
+         # texts = preprocess_input(texts, "")
+         # text_list = list(texts)
  
      # text to phoneme
      tokenizer = get_tokenizer(model_path, lang)
@@ -289,7 +371,7 @@ if __name__ == "__main__":
      if config.lang == "zh":
          answer_text = "天气真好\n空气清新\n气温温和\n风和日丽\n天高气爽\n阳光明媚"
      elif config.lang == "jp":
-         answer_text = "せかいでいちばんおひめさま\nそういうあつかい\nこころえてよね"
+         answer_text = "世界で一番お姫様 そういう扱い心得てよね\n私を誰だと思ってるの"
      else:
          print(f"Currently system does not support {config.lang}")
          exit(1)
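
For reference, a minimal sketch (not part of the commit) of how the new Japanese branch of svs_text_preprocessor exercises the helpers added above; the sample inputs are illustrative only, and the exact tokenization depends on the MeCab dictionary installed for fugashi:

    from svs_utils import replace_chouonpu, kanji_to_SVSDictKana

    # chouonpu handling: "ー" is rewritten to the vowel of the preceding kana,
    # with small kana first mapped to their full-size form via yoon_map,
    # e.g. "とーきょー" -> "とうきょう"
    print(replace_chouonpu("とーきょー"))

    # full pipeline: kanji -> katakana reading (fugashi) -> hiragana (pykakasi)
    # -> chouonpu replacement -> a per-kana list in which a kana followed by a
    # small kana (e.g. "きょ") is kept as one element
    print(kanji_to_SVSDictKana("東京大学"))
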