Upload 2 files
- ForkLangSegment/LangSegment.py +538 -0
- ForkLangSegment/__init__.py +8 -0
ForkLangSegment/LangSegment.py
ADDED
@@ -0,0 +1,538 @@
"""
This file bundles language identification functions.

Modifications (fork): Copyright (c) 2021, Adrien Barbaresi.

Original code: Copyright (c) 2011 Marco Lui <[email protected]>.
Based on research by Marco Lui and Tim Baldwin.

See LICENSE file for more info.
https://github.com/adbar/py3langid

Projects:
https://github.com/juntaosun/LangSegment
"""

import re
from collections import defaultdict

# import langid
import py3langid as langid
# pip install py3langid==0.2.2


# -----------------------------------
# Changelog: the new version of the word segmentation is more accurate.
# -----------------------------------


# Word segmentation function:
# automatically identify and split the words (Chinese/English/Japanese/Korean) in an
# article or sentence according to language, making the text more suitable for TTS processing.
# This code is written for front-end multilingual mixed-text annotation and for
# mixed-language training and inference in various TTS projects.
# The output mainly targets Chinese = zh, Japanese = ja, English = en and Korean = ko,
# but up to 97 different languages can be handled in mixed input.


# ===========================================================================================================
# (1) Automatic segmentation: "韩语中的오빠读什么呢?あなたの体育の先生は誰ですか? 此次发布会带来了四款iPhone 15系列机型"
# (2) Manual segmentation:    "你的名字叫<ja>佐々木?<ja>吗?"
# ===========================================================================================================


# Manual segmentation tag specification: <language tag> text content </language tag>
# ===========================================================================================================
# For manual segmentation, tags must appear in pairs, e.g. "<ja>佐々木<ja>" or "<ja>佐々木</ja>".
# Error demonstration: "你的名字叫<ja>佐々木。" A single <ja> tag in a sentence is ignored and not processed.
# ===========================================================================================================

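# A minimal usage sketch (assuming py3langid is installed; see the full demo at
# the bottom of this file). Automatic and manual tagging share the same call:
#
#   from LangSegment import getTexts
#   getTexts("你的名字叫<ja>佐々木?<ja>吗?")
#   # expected shape: [{'lang': 'zh', 'text': '你的名字叫'},
#   #                  {'lang': 'ja', 'text': '佐々木?'},
#   #                  {'lang': 'zh', 'text': '吗?'}]
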
class LangSegment():

    _text_cache = None
    _text_lasts = None
    _text_langs = None
    _text_waits = None  # declared here for completeness; reset in getTexts / _clears
    _lang_count = None
    _lang_eos = None

    # Customizable language matching tags; all of these forms are supported:
    # <zh>你好<zh> , <ja>佐々木</ja> , <en>OK<en> , <ko>오빠</ko>
    SYMBOLS_PATTERN = r'(<([a-zA-Z|-]*)>(.*?)<\/*[a-zA-Z|-]*>)'

    # Language filter group: specify which languages to keep. Languages not in
    # the filter group will be cleared. Combine them to match whatever languages
    # your TTS engine supports.
    Langfilters = ["zh", "en", "ja", "ko"]
    # Abbreviated filters are also supported: simply combine language codes.
    # Example: any of the following combinations can be used for filtering.

    # Langfilters = ["zh"]          # Chinese only
    # Langfilters = ["en"]          # English only
    # Langfilters = ["ja"]          # Japanese only
    # Langfilters = ["ko"]          # Korean only
    # Langfilters = ["zh_ja"]       # mixed Chinese and Japanese
    # Langfilters = ["zh_en"]       # mixed Chinese and English
    # Langfilters = ["ja_en"]       # mixed Japanese and English
    # Langfilters = ["zh_ko"]       # mixed Chinese and Korean
    # Langfilters = ["ja_ko"]       # mixed Japanese and Korean
    # Langfilters = ["en_ko"]       # mixed English and Korean
    # Langfilters = ["zh_ja_en"]    # mixed Chinese, Japanese and English
    # Langfilters = ["zh_ja_en_ko"] # mixed Chinese, Japanese, English and Korean

    # Any other filter combination works the same way.

+
# DEFINITION
|
95 |
+
PARSE_TAG = re.compile(r'(⑥\$\d+[\d]{6,}⑥)')
|
96 |
+
|
97 |
+
@staticmethod
|
98 |
+
def _clears():
|
99 |
+
LangSegment._text_cache = None
|
100 |
+
LangSegment._text_lasts = None
|
101 |
+
LangSegment._text_langs = None
|
102 |
+
LangSegment._text_waits = None
|
103 |
+
LangSegment._lang_count = None
|
104 |
+
LangSegment._lang_eos = None
|
105 |
+
pass
|
106 |
+
|
    @staticmethod
    def _is_english_word(word):
        return bool(re.match(r'^[a-zA-Z]+$', word))

    @staticmethod
    def _is_chinese(word):
        for char in word:
            if '\u4e00' <= char <= '\u9fff':
                return True
        return False

    @staticmethod
    def _is_japanese_kana(word):
        pattern = re.compile(r'[\u3040-\u309F\u30A0-\u30FF]+')
        matches = pattern.findall(word)
        return len(matches) > 0

    @staticmethod
    def _insert_english_uppercase(word):
        # Space out inner capitals so TTS reads e.g. "iPhone" as "i Phone ".
        modified_text = re.sub(r'(?<!\b)([A-Z])', r' \1', word)
        modified_text = modified_text.strip('-')
        return modified_text + " "

    @staticmethod
    def _saveData(words, language: str, text: str):
        # Language word statistics (English is counted at half weight per character).
        lang_count = LangSegment._lang_count
        if lang_count is None:
            lang_count = defaultdict(int)
        if "|" not in language:
            lang_count[language] += int(len(text) // 2) if language == "en" else len(text)
        LangSegment._lang_count = lang_count
        # Merge consecutive segments of the same language, then save the result.
        preData = words[-1] if len(words) > 0 else None
        if preData and (preData["lang"] == language):
            text = preData["text"] + text
            preData["text"] = text
            return preData
        data = {"lang": language, "text": text}
        filters = LangSegment.Langfilters
        if filters is None or len(filters) == 0 or "?" in language or \
           language in filters or language in filters[0] or \
           filters[0] == "*" or filters[0] in "alls-mixs-autos":
            words.append(data)
        return data

    @staticmethod
    def _addwords(words, language, text):
        if text is None or len(text.strip()) == 0:
            return True
        if language is None:
            language = ""
        language = language.lower()
        if language == 'en':
            text = LangSegment._insert_english_uppercase(text)
        # text = re.sub(r'[(())]', ',' , text) # Keep it.
        text_waits = LangSegment._text_waits
        ispre_waits = len(text_waits) > 0
        preResult = text_waits.pop() if ispre_waits else None
        if preResult is None:
            preResult = words[-1] if len(words) > 0 else None
        # Resolve a pending ambiguous segment (lang like "zh|ja") using the new language.
        if preResult and ("|" in preResult["lang"]):
            pre_lang = preResult["lang"]
            if language in pre_lang:
                preResult["lang"] = language = language.split("|")[0]
            else:
                preResult["lang"] = pre_lang.split("|")[0]
            if ispre_waits:
                preResult = LangSegment._saveData(words, preResult["lang"], preResult["text"])
        pre_lang = preResult["lang"] if preResult else None
        if ("|" in language) and (pre_lang and not pre_lang in language and not "…" in language):
            language = language.split("|")[0]
        if "|" in language:
            LangSegment._text_waits.append({"lang": language, "text": text})
        else:
            LangSegment._saveData(words, language, text)
        return False

    @staticmethod
    def _get_prev_data(words):
        data = words[-1] if words and len(words) > 0 else None
        if data:
            return (data["lang"], data["text"])
        return (None, "")

    @staticmethod
    def _match_ending(input, index):
        if input is None or len(input) == 0:
            return False, None
        input = re.sub(r'\s+', '', input)
        if len(input) == 0 or abs(index) > len(input):
            return False, None
        ending_pattern = re.compile(r'([「」“”‘’"\'::。.!!?.?])')
        return ending_pattern.match(input[index]), input[index]

    @staticmethod
    def _cleans_text(cleans_text):
        # Strip everything except word characters before language classification.
        cleans_text = re.sub(r'([^\w]+)', '', cleans_text)
        return cleans_text

    @staticmethod
    def _lang_classify(cleans_text):
        language, *_ = langid.classify(cleans_text)
        return language

    @staticmethod
    def _parse_language(words, segment):
        LANG_JA = "ja"
        LANG_ZH = "zh"
        language = LANG_ZH
        regex_pattern = re.compile(r'([^\w\s]+)')
        lines = regex_pattern.split(segment)
        lines_max = len(lines)
        LANG_EOS = LangSegment._lang_eos
        for index, text in enumerate(lines):
            if len(text) == 0:
                continue
            EOS = index >= (lines_max - 1)
            nextId = index + 1
            nextText = lines[nextId] if not EOS else ""
            nextPunc = len(re.sub(regex_pattern, '', re.sub(r'\n+', '', nextText)).strip()) == 0
            textPunc = len(re.sub(regex_pattern, '', re.sub(r'\n+', '', text)).strip()) == 0
            # Glue punctuation-only chunks onto the following chunk.
            if not EOS and (textPunc or nextPunc):
                lines[nextId] = f'{text}{nextText}'
                continue
            number_tags = re.compile(r'(⑥\d{6,}⑥)')
            cleans_text = re.sub(number_tags, '', text)
            cleans_text = LangSegment._cleans_text(cleans_text)
            language = LangSegment._lang_classify(cleans_text)
            prev_language, prev_text = LangSegment._get_prev_data(words)
            if len(cleans_text) <= 3 and LangSegment._is_chinese(cleans_text):
                if EOS and LANG_EOS:
                    language = LANG_ZH if len(cleans_text) <= 1 else language
                elif LangSegment._is_japanese_kana(cleans_text):
                    language = LANG_JA
                else:
                    # Short CJK runs are ambiguous between zh and ja; mark them
                    # for later resolution (see _addwords).
                    LANG_UNKNOWN = f'{LANG_ZH}|{LANG_JA}'
                    match_end, match_char = LangSegment._match_ending(text, -1)
                    referen = prev_language in LANG_UNKNOWN or LANG_UNKNOWN in prev_language if prev_language else False
                    if match_char in "。.":
                        language = prev_language if referen and len(words) > 0 else language
                    else:
                        language = f"{LANG_UNKNOWN}|…"
            text, *_ = re.subn(number_tags, LangSegment._restore_number, text)
            LangSegment._addwords(words, language, text)

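    # Deferral sketch (illustrative): a short CJK run such as "東京" can classify
    # as either zh or ja, so it is queued in _text_waits with lang "zh|ja|…";
    # the next unambiguous segment passed to _addwords collapses it to one language.
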
    @staticmethod
    def _restore_number(match):
        value = match.group(0)
        text_cache = LangSegment._text_cache
        if value in text_cache:
            process, data = text_cache[value]
            tag, match = data
            value = match
        return value

    @staticmethod
    def _pattern_symbols(item, text):
        if text is None:
            return text
        tag, pattern, process = item
        matches = pattern.findall(text)
        if len(matches) == 1 and "".join(matches[0]) == text:
            return text
        # Replace each match with an indexed placeholder and cache its handler.
        for i, match in enumerate(matches):
            key = f"⑥{tag}{i:06d}⑥"
            text = re.sub(pattern, key, text, count=1)
            LangSegment._text_cache[key] = (process, (tag, match))
        return text

+
@staticmethod
|
260 |
+
def _process_symbol(words,data):
|
261 |
+
tag , match = data
|
262 |
+
language = match[1]
|
263 |
+
text = match[2]
|
264 |
+
LangSegment._addwords(words,language,text)
|
265 |
+
pass
|
266 |
+
|
267 |
+
@staticmethod
|
268 |
+
def _process_english(words,data):
|
269 |
+
tag , match = data
|
270 |
+
text = match[0]
|
271 |
+
language = "en"
|
272 |
+
LangSegment._addwords(words,language,text)
|
273 |
+
pass
|
274 |
+
|
275 |
+
@staticmethod
|
276 |
+
def _process_korean(words,data):
|
277 |
+
tag , match = data
|
278 |
+
text = match[0]
|
279 |
+
language = "ko"
|
280 |
+
LangSegment._addwords(words,language,text)
|
281 |
+
pass
|
282 |
+
|
    @staticmethod
    def _process_quotes(words, data):
        tag, match = data
        text = "".join(match)
        childs = LangSegment.PARSE_TAG.findall(text)
        if len(childs) > 0:
            # The quoted text still contains placeholders; recurse into them.
            LangSegment._process_tags(words, text, False)
        else:
            cleans_text = LangSegment._cleans_text(match[1])
            if len(cleans_text) <= 3:
                LangSegment._parse_language(words, text)
            else:
                language = LangSegment._lang_classify(cleans_text)
                LangSegment._addwords(words, language, text)

    @staticmethod
    def _process_number(words, data):  # "$0" process only
        """
        Numbers alone cannot reliably identify a language, because digits are
        universal across languages. This handler is therefore not executed in
        the default pipeline; it exists for testing only.
        """
        tag, match = data
        language = words[0]["lang"] if len(words) > 0 else "zh"
        text = match
        LangSegment._addwords(words, language, text)

    @staticmethod
    def _process_tags(words, text, root_tag):
        text_cache = LangSegment._text_cache
        segments = re.split(LangSegment.PARSE_TAG, text)
        segments_len = len(segments) - 1
        for index, text in enumerate(segments):
            if root_tag:
                LangSegment._lang_eos = index >= segments_len
            if LangSegment.PARSE_TAG.match(text):
                # A placeholder: dispatch the handler that was cached for it.
                process, data = text_cache[text]
                if process:
                    process(words, data)
            else:
                LangSegment._parse_language(words, text)
        return words

|
328 |
+
def _parse_symbols(text):
|
329 |
+
TAG_NUM = "00" # "00" => default channels , "$0" => testing channel
|
330 |
+
TAG_S1,TAG_P1,TAG_P2,TAG_EN,TAG_KO = "$1" ,"$2" ,"$3" ,"$4" ,"$5"
|
331 |
+
process_list = [
|
332 |
+
( TAG_S1 , re.compile(LangSegment.SYMBOLS_PATTERN) , LangSegment._process_symbol ), # Symbol Tag
|
333 |
+
( TAG_KO , re.compile('(([【《((“‘"\']*(\d+\W*\s*)*[\uac00-\ud7a3]+[\W\s]*)+)') , LangSegment._process_korean ), # Korean words
|
334 |
+
( TAG_NUM , re.compile(r'(\W*\d+\W+\d*\W*\d*)') , LangSegment._process_number ), # Number words, Universal in all languages, Ignore it.
|
335 |
+
( TAG_EN , re.compile(r'(([【《((“‘"\']*[a-zA-Z]+[\W\s]*)+)') , LangSegment._process_english ), # English words
|
336 |
+
( TAG_P1 , re.compile(r'(["\'])(.*?)(\1)') , LangSegment._process_quotes ), # Regular quotes
|
337 |
+
( TAG_P2 , re.compile(r'([\n]*[【《((“‘])([^【《((“‘’”))》】]{3,})([’”))》】][\W\s]*[\n]{,1})') , LangSegment._process_quotes ), # Special quotes, There are left and right.
|
338 |
+
]
|
339 |
+
LangSegment._lang_eos = False
|
340 |
+
text_cache = LangSegment._text_cache = {}
|
341 |
+
for item in process_list:
|
342 |
+
text = LangSegment._pattern_symbols(item , text)
|
343 |
+
words = LangSegment._process_tags([] , text , True)
|
344 |
+
lang_count = LangSegment._lang_count
|
345 |
+
if lang_count and len(lang_count) > 0:
|
346 |
+
lang_count = dict(sorted(lang_count.items(), key=lambda x: x[1], reverse=True))
|
347 |
+
lang_count = list(lang_count.items())
|
348 |
+
LangSegment._lang_count = lang_count
|
349 |
+
return words
|
350 |
+
|
    @staticmethod
    def setfilters(filters):
        # When the filter changes, clear the cache.
        if LangSegment.Langfilters != filters:
            LangSegment._clears()
            LangSegment.Langfilters = filters

    @staticmethod
    def getfilters():
        return LangSegment.Langfilters


    @staticmethod
    def getCounts():
        lang_count = LangSegment._lang_count
        if lang_count is not None:
            return lang_count
        text_langs = LangSegment._text_langs
        if text_langs is None or len(text_langs) == 0:
            return [("zh", 0)]
        lang_counts = defaultdict(int)
        for d in text_langs:
            lang_counts[d['lang']] += int(len(d['text']) // 2) if d['lang'] == "en" else len(d['text'])
        lang_counts = dict(sorted(lang_counts.items(), key=lambda x: x[1], reverse=True))
        lang_counts = list(lang_counts.items())
        LangSegment._lang_count = lang_counts
        return lang_counts

    @staticmethod
    def getTexts(text: str):
        if text is None or len(text.strip()) == 0:
            LangSegment._clears()
            return []
        # Return the cached result when the input is unchanged.
        text_langs = LangSegment._text_langs
        if LangSegment._text_lasts == text and text_langs is not None:
            return text_langs
        # Parse.
        LangSegment._text_waits = []
        LangSegment._lang_count = None
        LangSegment._text_lasts = text
        text = LangSegment._parse_symbols(text)
        LangSegment._text_langs = text
        return text

    @staticmethod
    def classify(text: str):
        return LangSegment.getTexts(text)

def setfilters(filters):
    """
    Language filter group: specify which languages to keep.
    Languages not in the filter group will be cleared.
    Combine them to match whatever languages your TTS engine supports.

    Args:
        filters (list): e.g. ["zh", "en", "ja", "ko"]
    """
    LangSegment.setfilters(filters)

def getfilters():
    """
    Returns the current language filter group.
    Languages not in the filter group are cleared from results.

    Returns:
        list: e.g. ["zh", "en", "ja", "ko"]
    """
    return LangSegment.getfilters()

# @Deprecated: use the shorter setfilters
def setLangfilters(filters):
    """
    Deprecated since 0.1.9: use the shorter setfilters.
    """
    setfilters(filters)

# @Deprecated: use the shorter getfilters
def getLangfilters():
    """
    Deprecated since 0.1.9: use the shorter getfilters.
    """
    return getfilters()

def getTexts(text: str):
    """
    Tokenize multilingual text input by language.

    Args:
        text (str): text content

    Returns:
        list: e.g. [{'lang': 'zh', 'text': '?'}, ...]
              where lang = language code, text = content
    """
    return LangSegment.getTexts(text)

def getCounts():
    """
    Statistics of the segmentation result, sorted by character count per language
    in descending order; useful for determining the main language of the input.

    Returns:
        list: e.g. [('zh', 5), ('ja', 2), ('en', 1)] = [(language, characters incl. punctuation)]
    """
    return LangSegment.getCounts()

def classify(text: str):
    """
    Compatible interface implementation.
    """
    return LangSegment.classify(text)

def printList(langlist):
    """
    Print the list of results.
    """
    print("\n\n===================【Results】===================")
    if langlist is None or len(langlist) == 0:
        print("No content result")
        return
    for line in langlist:
        print(line)



if __name__ == "__main__":

    # -----------------------------------
    # Changelog: the new version of the word segmentation is more accurate.
    # -----------------------------------

    # Input example 1: (contains Japanese and Chinese)
    # text = "“昨日は雨が降った,音楽、映画。。。”你今天学习日语了吗?春は桜の季節です。语种分词是语音合成必不可少的环节。言語分詞は音声合成に欠かせない環節である!"

    # Input example 2: (contains Japanese and Chinese)
    # text = "欢迎来玩。東京,は日本の首都です。欢迎来玩. 太好了!"

    # Input example 3: (contains Japanese and Chinese)
    # text = "明日、私たちは海辺にバカンスに行きます。你会说日语吗:“中国語、話せますか” 你的日语真好啊!"

    # Input example 4: (contains Japanese, Chinese, Korean and English)
    text = "你的名字叫<ja>佐々木?<ja>吗?韩语中的안녕 오빠读什么呢?あなたの体育の先生は誰ですか? 此次发布会带来了四款iPhone 15系列机型和三款Apple Watch等一系列新品,这次的iPad Air采用了LCD屏幕"

    # Run the segmentation (a single call is all a TTS project needs):
    langlist = LangSegment.getTexts(text)
    printList(langlist)


    # Language statistics:
    print("\n===================【Language statistics】===================")
    # Get the per-language counts, sorted by character count in descending order:
    langCounts = LangSegment.getCounts()
    print(langCounts, "\n")

    # The main language of the input (language, characters incl. punctuation):
    lang, count = langCounts[0]
    print(f"The main language of the input = {lang}, characters = {count}")
    print("==================================================\n")


    # Segmentation output: lang = language, text = content
    # ===================【Results】===================
    # {'lang': 'zh', 'text': '你的名字叫'}
    # {'lang': 'ja', 'text': '佐々木?'}
    # {'lang': 'zh', 'text': '吗?韩语中的'}
    # {'lang': 'ko', 'text': '안녕 오빠'}
    # {'lang': 'zh', 'text': '读什么呢?'}
    # {'lang': 'ja', 'text': 'あなたの体育の先生は誰ですか?'}
    # {'lang': 'zh', 'text': ' 此次发布会带来了四款'}
    # {'lang': 'en', 'text': 'i Phone '}
    # {'lang': 'zh', 'text': '15系列机型和三款'}
    # {'lang': 'en', 'text': 'Apple Watch '}
    # {'lang': 'zh', 'text': '等一系列新品,这次的'}
    # {'lang': 'en', 'text': 'i Pad Air '}
    # {'lang': 'zh', 'text': '采用了'}
    # {'lang': 'en', 'text': 'L C D '}
    # {'lang': 'zh', 'text': '屏幕'}

    # ===================【Language statistics】===================
    # [('zh', 51), ('ja', 19), ('en', 18), ('ko', 5)]

    # The main language of the input = zh, characters = 51
    # ==================================================
ForkLangSegment/__init__.py
ADDED
@@ -0,0 +1,8 @@
from .LangSegment import LangSegment, getTexts, classify, getCounts, printList, setLangfilters, getLangfilters, setfilters, getfilters

# release
__version__ = '0.2.0'


# develop
__develop__ = 'dev-0.0.1'
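
# Usage sketch (assuming the fork is imported under this package name):
#   from ForkLangSegment import getTexts
#   getTexts("hello 世界")  # -> a list of {'lang': ..., 'text': ...} segments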