Spaces:
Runtime error
Runtime error
import re | |
from opencc import OpenCC | |
t2s_converter = OpenCC('t2s') | |
s2t_converter = OpenCC('s2t') | |
EMOJI_PATTERN = re.compile( | |
"[" | |
"\U0001F600-\U0001F64F" # Emoticons | |
"]+", flags=re.UNICODE | |
) | |
# 创建一个翻译表,用于替换和移除字符 | |
TRANSLATION_TABLE = str.maketrans({ | |
'-': ' ', # 将 '-' 替换为空格 | |
',': None, | |
'.': None, | |
',': None, | |
'。': None, | |
'!': None, | |
'!': None, | |
'?': None, | |
'?': None, | |
'…': None, | |
';': None, | |
';': None, | |
':': None, | |
':': None, | |
'\u3000': ' ', # 将全角空格替换为空格 | |
}) | |
# 替换括号中的内容,包括中括号和小括号 | |
BACKSLASH_PATTERN = re.compile(r'\(.*?\)|\[.*?\]') | |
SPACE_PATTERN = re.compile('(?<!^)\s+(?!$)') | |
def normalize_text(text, language, strip=True): | |
""" | |
对文本进行标准化处理,去除标点符号,转为小写(如果适用) | |
""" | |
# Step 1: 替换 '-' 为 ' ' 并移除标点符号 | |
text = text.translate(TRANSLATION_TABLE) | |
# Step 2: 移除表情符号 | |
text = EMOJI_PATTERN.sub('', text) | |
# Step 3: 连续空白字符替换为单个空格,首位除外 | |
text = SPACE_PATTERN.sub(' ', text) | |
# Step 4: 去除首尾空白字符(如果需要) | |
if strip: | |
text = text.strip() | |
# Step 5: 转为小写 | |
text = text.lower() | |
# Step 6: 多语言转换 | |
if language == "zh": | |
text = t2s_converter.convert(text) | |
if language == "yue": | |
text = s2t_converter.convert(text) | |
# 其他语言根据需要添加 | |
return text | |