# kemuriririn's picture
# add front end
# 00bfabc
# raw
# history blame
# 2.85 kB
# -*- coding: utf-8 -*-
import traceback
import os
import sys
import re
import re
class TextNormalizer:
    """Normalize Chinese/English text and unify its punctuation.

    Usage: construct, call :meth:`load` once to build the language-specific
    normalizers from the ``tn`` package, then call :meth:`infer` per input.

    Attributes:
        zh_normalizer: Chinese normalizer, ``None`` until :meth:`load` runs.
        en_normalizer: English normalizer, ``None`` until :meth:`load` runs.
        char_rep_map: Punctuation rewrite table applied to normalized text.
    """

    def __init__(self):
        # Both stay None until load() is called; infer() refuses to run
        # before that.
        self.zh_normalizer = None
        self.en_normalizer = None
        # Maps full-width / typographic punctuation (plus a few ASCII marks)
        # onto a small ASCII-oriented set. Keys are real Unicode punctuation;
        # keeping them distinct matters because a dict literal silently keeps
        # only the last of any duplicate keys.
        self.char_rep_map = {
            "：": ",",
            "；": ",",
            ";": ",",
            "，": ",",
            "。": ".",
            "！": "!",
            "？": "?",
            "\n": ".",
            "·": ",",
            "、": ",",
            "...": "…",
            "……": "…",
            "$": ".",
            "“": "'",
            "”": "'",
            '"': "'",
            "‘": "'",
            "’": "'",
            "（": "'",
            "）": "'",
            "(": "'",
            ")": "'",
            "《": "'",
            "》": "'",
            "【": "'",
            "】": "'",
            "[": "'",
            "]": "'",
            "—": "-",
            "～": "-",
            "~": "-",
            "「": "'",
            "」": "'",
            ":": ",",
        }

    def match_email(self, email: str) -> bool:
        """Return True if *email* looks like ``alnum@alnum.letters``.

        Only ASCII letters and digits are accepted around the ``@``, and only
        ASCII letters after the single dot.
        """
        pattern = r'^[a-zA-Z0-9]+@[a-zA-Z0-9]+\.[a-zA-Z]+$'
        return re.match(pattern, email) is not None

    def use_chinese(self, s: str) -> bool:
        """Decide whether *s* should go through the Chinese normalizer.

        Returns True when *s* contains a CJK ideograph (U+4E00-U+9FFF), when
        it contains no ASCII letter at all, or when it is a simple e-mail
        address; otherwise False.
        """
        has_chinese = re.search(r'[\u4e00-\u9fff]', s) is not None
        has_alpha = re.search(r'[a-zA-Z]', s) is not None
        return has_chinese or not has_alpha or self.match_email(s)

    def load(self) -> None:
        """Instantiate the Chinese and English normalizers.

        The ``tn`` imports are done here rather than at module import time,
        so merely importing this module does not require ``tn``.
        """
        from tn.chinese.normalizer import Normalizer as NormalizerZh
        from tn.english.normalizer import Normalizer as NormalizerEn

        self.zh_normalizer = NormalizerZh(remove_interjections=False, remove_erhua=False)
        self.en_normalizer = NormalizerEn()

    def _replace_punctuation(self, text: str) -> str:
        """Rewrite every ``char_rep_map`` key found in *text* in one pass."""
        if not text or not self.char_rep_map:
            # An empty alternation would match the empty string everywhere.
            return text
        # Longest keys first so a multi-character key always wins over any
        # shorter key that happens to be its prefix.
        keys = sorted(self.char_rep_map, key=len, reverse=True)
        pattern = re.compile("|".join(re.escape(key) for key in keys))
        return pattern.sub(lambda m: self.char_rep_map[m.group()], text)

    def infer(self, text: str) -> str:
        """Normalize *text* and unify its punctuation.

        Args:
            text: Raw input text.

        Returns:
            The normalized text with ``char_rep_map`` applied, or ``""`` if
            :meth:`load` has not been called or the normalizer raised (the
            traceback is printed in that case).
        """
        if self.zh_normalizer is None or self.en_normalizer is None:
            print("Error, text normalizer is not initialized !!!")
            return ""
        try:
            normalizer = self.zh_normalizer if self.use_chinese(text) else self.en_normalizer
            result = normalizer.normalize(text)
        except Exception:
            # Best effort: report the failure and fall back to empty output.
            result = ""
            print(traceback.format_exc())
        # Punctuation is unified after normalization so that symbols such as
        # "$" or ":" are still visible to the normalizer.
        return self._replace_punctuation(result)
if __name__ == '__main__':
    # Smoke test. The normalizers must be loaded first; without load(),
    # infer() only prints an error and returns an empty string.
    text_normalizer = TextNormalizer()
    text_normalizer.load()
    print(text_normalizer.infer("2.5平方电线"))