# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from typing import Tuple

import jieba_fast as jieba
from pypinyin import Style
from pypinyin import lazy_pinyin
class ToneSandhi:
    """Mandarin tone-sandhi rules applied on top of jieba segmentation and
    pypinyin finals.

    NOTE(review): every CJK string in this file appears mojibake-encoded
    (UTF-8 viewed through a legacy codec). The character sequences are
    preserved exactly as found -- confirm against the upstream source
    before re-encoding.
    """

    def __init__(self):
        # Words whose final syllable is conventionally read with the
        # neutral tone ("5").
        self.must_neural_tone_words = {
            "้บป็ฆ", "้บปๅฉ", "้ธณ้ธฏ", "้ซ็ฒฑ", "้ชจๅคด", "้ช้ฉผ", "้ฉฌ่", "้ฆ้ฅฐ", "้ฆๅคด", "้ฆ้ฅจ",
            "้ฃ็ญ", "้พไธบ", "้ไผ", "้ๆฐ", "้บๅฅณ", "้จ้", "้ๅคด", "้บ็", "้้", "้ๅ ",
            "้ฅๅ", "้่", "้ๅคด", "้จๅ", "้ฃไน", "้ๅฃซ", "้ ๅ", "่ฟท็ณ", "่ฟ็ดฏ", "่ฟไน",
            "่ฟไธช", "่ฟๆฐ", "่ฟๅป", "่ฝฏๅ", "่ฝฌๆ ", "่ธๅฎ", "่ทณ่ค", "่ทๅคด", "่ถ่ถ", "่ดขไธป",
            "่ฑ่ ", "่ฎฒ็ฉถ", "่ฎฐๆง", "่ฎฐๅท", "่ฎค่ฏ", "่ง็ฉ", "่ง่ฏ", "่ฃ็ผ", "่กฅไธ", "่กฃ่ฃณ",
            "่กฃๆ", "่ก้จ", "่กๅ", "่กๆ", "่กๅฝ", "่ค่", "่่", "่่ท", "่ซ่ฆ", "่ก่",
            "่ๅ", "่ธ่ ", "่ๆก", "่ๅคด", "่่", "่้บป", "่ๆ", "่ๅฆ", "่ๅคด", "่ชๅจ",
            "่่ฏ", "่พๆฐ", "่่ข", "่ๆข", "่ฝ่", "่ณ่", "่ญ่", "่ก่", "่ก็ด", "่กๅ",
            "่ชๆ", "่ฝ่ฏฏ", "่ฝๆ", "่ทๆ", "่ณๆต", "่็ท", "่ๅฎ", "่ๅฉ", "่ๅคด", "่ๅคช",
            "็ฟป่ พ", "็ฝๅฆ", "็ฝๅคด", "็ผ่พ", "็ปๅฎ", "็บข็ซ", "็ดฏ่ต", "็ณจ็ณ", "็ณๆถ", "็ฒพ็ฅ",
            "็ฒฎ้ฃ", "็ฐธ็ฎ", "็ฏฑ็ฌ", "็ฎ่ฎก", "็ฎ็", "็ญๅบ", "็ฌคๅธ", "็ฌ่ฏญ", "็ฌ่ฏ", "็ช็ชฟ",
            "็ชๅ", "็ชๆท", "็จณๅฝ", "็จ็ฝ", "็งฐๅผ", "็งงๆญ", "็งๆฐ", "็งๆ", "็ฆๆฐ", "็ฅๅฎ",
            "็ ๅฐ", "็ ๅคด", "็ณๆฆด", "็ณๅคด", "็ณๅ ", "็ฅ่ฏ", "็ผ็", "็ฏ็ผ", "็จๅทด", "็ๆฏ",
            "็ธๅฃฐ", "็็ฎ", "็ฝๅ", "็ข็พ", "็ๅฟซ", "็็พ", "็็ฉ", "็ๅฟฝ", "็็", "็ๆ",
            "็่", "็ต็ถ", "็ข็ฃจ", "็็", "็ป็", "็ซ็ฐ", "็ไน", "็็ธ", "็ถๅ ", "็นๅก",
            "็ฒๅฃ", "็็ข", "็ๆฅผ", "็ฝๅฟซ", "็ฑไบบ", "็ญ้น", "็ง้ฅผ", "็็ญ", "็็ณ", "็นๅฟ",
            "็ๅธ", "็ฏ็ฌผ", "็ซๅ", "ๆผไบฎ", "ๆปๆบ", "ๆบ่พพ", "ๆธฉๅ", "ๆธ ๆฅ", "ๆถๆฏ", "ๆตชๅคด",
            "ๆดปๆณผ", "ๆฏๆน", "ๆญฃ็ป", "ๆฌบ่ด", "ๆจก็ณ", "ๆงๆฆ", "ๆฃบๆ", "ๆฃๆง", "ๆฃ่ฑ", "ๆ ธๆก",
            "ๆ ๆ ", "ๆด็ซ", "ๆถๅฟ", "ๆๅคด", "ๆๆท", "ๆบ็ต", "ๆฌไบ", "ๆจๅคด", "ๆจๅ ", "ๆๅ",
            "ๆ้ฅผ", "ๆไบฎ", "ๆๅ", "ๆ็ฝ", "ๆถๅ", "ๆฐ้ฒ", "ๆ ไบ", "ๆถๆพ", "ๆถๆ", "ๆ้ฒ",
            "ๆ่ฆ", "ๆๅ", "ๆ็ฒ", "ๆๅคด", "ๆพๆ", "ๆณๅคด", "ๆจๅผ", "ๆ็", "ๆๅผ", "ๆฌไธพ",
            "ๆคๅฃซ", "ๆ่ พ", "ๆซๅธ", "ๆ้", "ๆ็ฎ", "ๆ็น", "ๆๆฎ", "ๆๅฌ", "ๆๅ", "ๆๅฎ",
            "ๆๆ ", "ๆๆ", "ๆๅพ", "ๆ่ฏ", "ๆๆ", "ๆ ๅฝข", "ๆๆง", "ๆช็ฉ", "ๆ้", "ๆไน",
            "ๅฟตๅคด", "ๅฟตๅจ", "ๅฟซๆดป", "ๅฟๆดป", "ๅฟๆฐ", "ๅฟๆ", "ๅพ็ฝช", "ๅผ ็ฝ", "ๅผๅ ", "ๅผ้",
            "ๅบ้ ฌ", "ๅบ็จผ", "ๅนฒไบ", "ๅธฎๆ", "ๅธ็ฏท", "ๅธ็ฝ", "ๅธ็ถ", "ๅธๅ ", "ๅทด็ป", "ๅทดๆ",
            "ๅทฎไบ", "ๅทฅๅคซ", "ๅฒๆฐ", "ๅฑ่ก", "ๅฐพๅทด", "ๅฐ็ท", "ๅฐๆฐ", "ๅฐไผ", "ๅฐๅฐฑ", "ๅฏนๅคด",
            "ๅฏนไป", "ๅฏกๅฆ", "ๅฎถไผ", "ๅฎขๆฐ", "ๅฎๅจ", "ๅฎๅธ", "ๅญฆ้ฎ", "ๅญฆ็", "ๅญๅท", "ๅซๅฆ",
            "ๅชณๅฆ", "ๅชไบบ", "ๅฉๅฎถ", "ๅจๅฎถ", "ๅงๅฑ", "ๅงๅจ", "ๅงๅคซ", "ๅฆฏๅจ", "ๅฆฅๅฝ", "ๅฆ็ฒพ",
            "ๅฅดๆ", "ๅฅณๅฉฟ", "ๅคดๅ", "ๅคช้ณ", "ๅคง็ท", "ๅคงๆน", "ๅคงๆ", "ๅคงๅคซ", "ๅคๅฐ", "ๅคไน",
            "ๅค็ฅ", "ๅฃฎๅฎ", "ๅฐ้", "ๅฐๆน", "ๅจไน", "ๅฐ้พ", "ๅดๅทด", "ๅฑๅ", "ๅๅ", "ๅๅ",
            "ๅๆฌข", "ๅๅ", "ๅๅญ", "ๅ้", "ๅพๆฒซ", "ๅๅทด", "ๅๆฌ ", "ๅๅฆ", "ๅณๅฝ", "ๅๅฐ",
            "ๅ่ฏ", "ๅ็คบ", "ๅซ็ณ", "ๅๅฌ", "ๅๅคด", "ๅๅญ", "ๅๅ ", "ๅๅ", "ๅๅ", "ๅซๅค",
            "ๅฃ่ข", "ๅ้", "ๅๅฎณ", "ๅๆค", "ๅ ่ขฑ", "ๅ ๆถต", "ๅ็งฐ", "ๅคๅฟซ", "ๅจ้", "ๅจๅผน",
            "ๅๅคซ", "ๅๆฐ", "ๅๅคด", "ๅบ็ฌ", "ๅบๆฟ", "ๅซๆญ", "ๅฉ่ฝ", "ๅฉ็ดข", "ๅฉๅฎณ", "ๅๆ",
            "ๅบๆฏ", "ๅๅ", "ๅๅฟซ", "ๅทๆ", "ๅคๆ", "ๅๅคฑ", "ๅ ปๆดป", "ๅ ณ็ณป", "ๅ ็", "ๅ ๅผ",
            "ไพฟๅฎ", "ไฝฟๅค", "ไฝฉๆ", "ไฝๅ", "ไฝ้ข", "ไฝ็ฝฎ", "ไผผ็", "ไผ่ฎก", "ไผๆฏ", "ไปไน",
            "ไบบๅฎถ", "ไบฒๆ", "ไบฒๅฎถ", "ไบคๆ ", "ไบๅฝฉ", "ไบๆ ", "ไนฐๅ", "ไธปๆ", "ไธซๅคด", "ไธงๆฐ",
            "ไธคๅฃ", "ไธ่ฅฟ", "ไธๅฎถ", "ไธๆ ", "ไธ็ฑ", "ไธๅจ", "ไธๆฐด", "ไธๅทด", "ไธๅคด", "ไธๅธ",
            "ไธๅคซ", "ไธไบบ", "ไธ่พ", "้ฃไธช", "่ฉ่จ", "็ถไบฒ", "ๆฏไบฒ", "ๅๅ", "้้ข", "่ดน็จ",
            "ๅคๅฎถ", "็ๅคด", "ไป็ป", "่ๅ", "ๅคงไบบ", "ๆณฅ้ณ ", "ๅนธ็ฆ", "็ๆ", "่ฎกๅ", "ๆ่ พ",
            "่ก็", "ๅงฅ็ท", "็ ง้กพ", "ๅๅ", "ๅไป", "ๅผๅ ", "่่ฑ", "ๅคๅฐ", "ๆๆฒ", "ๅฏ็ข",
            "็ณ่น", "ๅ่ พ", "ๆฅๅค", "้ป่พ", "็็ผ ", "ๅฝๅฐ", "็ข้ช", "ๅๅฑ", "ๆซๆ", "ๆฆ่ฎฐ",
        }
        # Words that look like neutral-tone candidates (reduplications,
        # suffixed nouns) but must keep their full tone.
        self.must_not_neural_tone_words = {
            "็ทๅญ", "ๅฅณๅญ", "ๅๅญ", "ๅๅญ", "้ๅญ", "่ฒๅญ", "็ณๅญ", "็ๅญ", "็ตๅญ", "ไบบไบบ",
            "่่", "ๅนบๅนบ", "ๅนฒๅ", "ๅญฆๅญ", "ๅๅ", "ๆฐๆฐ", "่ข ่ข ", "ๅฑๅฐ", "ไปฅไธ", "ๅจๅๅ",
            "่ฑ่ฑ่่", "็ๅพ", "่ๅฐ", "ๆณๆณ", "็็", "ๆๆ", "ๅตๅญ", "ๆญปๆญป", "ๅๅ", "ๆณๆณ",
            "ไฝผไฝผ", "ๅตๅต", "ๆๆ", "่่", "ๆดๆด", "่่", "่ฝๅฐ", "็ฎๅญ", "ๅฎถๅฎถๆทๆท", "้้",
        }
        # Punctuation set consulted by _yi_sandhi.
        self.punc = "๏ผ๏ผ๏ผใ๏ผ๏ผโโโโ':,;.?!"
# the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041
# e.g.
# word: "ๅฎถ้"
# pos: "s"
# finals: ['ia1', 'i3']
def _neural_sandhi(self, word: str, pos: str, finals: List[str]) -> List[str]:
    """Rewrite *finals* so syllables that should carry the neutral tone end
    in "5".

    Args:
        word: the segmented word (CJK text; mojibake preserved as-is here).
        pos: jieba part-of-speech tag for the word.
        finals: pypinyin FINALS_TONE3 strings, one per character of *word*
            (assumed same length as *word* -- TODO confirm at the caller).
    Returns:
        The (mutated) finals list, possibly rebuilt from a two-way split.
    """
    # Reduplicated characters in n./v./a. words read neutral on the
    # repeated character, unless the word is blacklisted.
    for j, item in enumerate(word):
        if (
            j - 1 >= 0
            and item == word[j - 1]
            and pos[0] in {"n", "v", "a"}
            and word not in self.must_not_neural_tone_words
        ):
            finals[j] = finals[j][:-1] + "5"
    ge_idx = word.find("ไธช")
    # Final character in this particle set reads neutral (presumably
    # sentence-final modal particles -- mojibake obscures the characters).
    if len(word) >= 1 and word[-1] in "ๅงๅขๅๅๅๅปๅๅๅจๅๅฆๅ้ขๆปดๅฉๅๅฝๅฐ่ถๅ่ฏถ":
        finals[-1] = finals[-1][:-1] + "5"
    # Structural (de-type) particles read neutral.
    elif len(word) >= 1 and word[-1] in "็ๅฐๅพ":
        finals[-1] = finals[-1][:-1] + "5"
    # Single-character aspect particles with matching pos tags read neutral.
    elif len(word) == 1 and word in "ไบ็่ฟ" and pos in {"ul", "uz", "ug"}:
        finals[-1] = finals[-1][:-1] + "5"
    # Plural/diminutive suffixes on pronouns/nouns, unless blacklisted.
    elif len(word) > 1 and word[-1] in "ไปฌๅญ" and pos in {"r", "n"} and word not in self.must_not_neural_tone_words:
        finals[-1] = finals[-1][:-1] + "5"
    # Locative suffixes on place-like words (pos s/l/f) read neutral.
    elif len(word) > 1 and word[-1] in "ไธไธ้" and pos in {"s", "l", "f"}:
        finals[-1] = finals[-1][:-1] + "5"
    # Directional complements after a direction character read neutral.
    elif len(word) > 1 and word[-1] in "ๆฅๅป" and word[-2] in "ไธไธ่ฟๅบๅ่ฟ่ตทๅผ":
        finals[-1] = finals[-1][:-1] + "5"
    # The classifier "ไธช" (ge) after a numeral, or standing alone, reads
    # neutral at its own position.
    elif (
        ge_idx >= 1 and (word[ge_idx - 1].isnumeric() or word[ge_idx - 1] in "ๅ ๆไธคๅๅคๅๆดๆฏๅๆฏ")
    ) or word == "ไธช":
        finals[ge_idx] = finals[ge_idx][:-1] + "5"
    else:
        # Fallback: whole word (or its last two characters) found in the
        # neutral-tone dictionary.
        if word in self.must_neural_tone_words or word[-2:] in self.must_neural_tone_words:
            finals[-1] = finals[-1][:-1] + "5"
    # Also apply the dictionary to each half of a two-way split of the word.
    word_list = self._split_word(word)
    finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]]
    # NOTE(review): the loop variable below deliberately shadows *word*,
    # preserved from the original code.
    for i, word in enumerate(word_list):
        # conventional neural in Chinese
        if word in self.must_neural_tone_words or word[-2:] in self.must_neural_tone_words:
            finals_list[i][-1] = finals_list[i][-1][:-1] + "5"
    finals = sum(finals_list, [])
    return finals
def _bu_sandhi(self, word: str, finals: List[str]) -> List[str]: | |
# e.g. ็ไธๆ | |
if len(word) == 3 and word[1] == "ไธ": | |
finals[1] = finals[1][:-1] + "5" | |
else: | |
for i, char in enumerate(word): | |
# "ไธ" before tone4 should be bu2, e.g. ไธๆ | |
if char == "ไธ" and i + 1 < len(word) and finals[i + 1][-1] == "4": | |
finals[i] = finals[i][:-1] + "2" | |
return finals | |
def _yi_sandhi(self, word: str, finals: List[str]) -> List[str]:
    """Tone sandhi for the character "ไธ" (yi).

    Args:
        word: segmented word, possibly containing "ไธ".
        finals: pypinyin FINALS_TONE3 strings, one per character.
    Returns:
        The (possibly mutated) finals list.
    """
    # "ไธ" inside a pure digit sequence keeps its citation tone.
    if word.find("ไธ") != -1 and all([item.isnumeric() for item in word if item != "ไธ"]):
        return finals
    # "ไธ" between reduplicated characters (A-ไธ-A) reads neutral.
    elif len(word) == 3 and word[1] == "ไธ" and word[0] == word[-1]:
        finals[1] = finals[1][:-1] + "5"
    # Ordinal "็ฌฌไธ" keeps tone 1.
    elif word.startswith("็ฌฌไธ"):
        finals[1] = finals[1][:-1] + "1"
    else:
        for i, char in enumerate(word):
            if char == "ไธ" and i + 1 < len(word):
                # "ไธ" before a tone-4 syllable becomes tone 2.
                if finals[i + 1][-1] == "4":
                    finals[i] = finals[i][:-1] + "2"
                # "ไธ" before non-tone-4 becomes tone 4 ...
                else:
                    # ... unless the next character is punctuation, in which
                    # case the tone is left unchanged (the original Chinese
                    # comment said it then keeps tone 1).
                    if word[i + 1] not in self.punc:
                        finals[i] = finals[i][:-1] + "4"
    return finals
def _split_word(self, word: str) -> List[str]:
    """Split *word* into two pieces around its shortest jieba search-mode
    subword.

    Returns [shortest, remainder] when the subword is a prefix of *word*,
    otherwise [remainder, shortest] (the subword is then assumed to sit at
    the end of the word, matching the original implementation).
    """
    # min() with key=len picks the same element as sorting ascending by
    # length and taking the first (both keep the earliest tie).
    shortest = min(jieba.cut_for_search(word), key=len)
    if word.startswith(shortest):
        return [shortest, word[len(shortest):]]
    return [word[: -len(shortest)], shortest]
def _three_sandhi(self, word: str, finals: List[str]) -> List[str]: | |
if len(word) == 2 and self._all_tone_three(finals): | |
finals[0] = finals[0][:-1] + "2" | |
elif len(word) == 3: | |
word_list = self._split_word(word) | |
if self._all_tone_three(finals): | |
# disyllabic + monosyllabic, e.g. ่ๅค/ๅ | |
if len(word_list[0]) == 2: | |
finals[0] = finals[0][:-1] + "2" | |
finals[1] = finals[1][:-1] + "2" | |
# monosyllabic + disyllabic, e.g. ็บธ/่่ | |
elif len(word_list[0]) == 1: | |
finals[1] = finals[1][:-1] + "2" | |
else: | |
finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]] | |
if len(finals_list) == 2: | |
for i, sub in enumerate(finals_list): | |
# e.g. ๆๆ/ไบบ | |
if self._all_tone_three(sub) and len(sub) == 2: | |
finals_list[i][0] = finals_list[i][0][:-1] + "2" | |
# e.g. ๅฅฝ/ๅๆฌข | |
elif ( | |
i == 1 | |
and not self._all_tone_three(sub) | |
and finals_list[i][0][-1] == "3" | |
and finals_list[0][-1][-1] == "3" | |
): | |
finals_list[0][-1] = finals_list[0][-1][:-1] + "2" | |
finals = sum(finals_list, []) | |
# split idiom into two words who's length is 2 | |
elif len(word) == 4: | |
finals_list = [finals[:2], finals[2:]] | |
finals = [] | |
for sub in finals_list: | |
if self._all_tone_three(sub): | |
sub[0] = sub[0][:-1] + "2" | |
finals += sub | |
return finals | |
def _all_tone_three(self, finals: List[str]) -> bool: | |
return all(x[-1] == "3" for x in finals) | |
# merge "ไธ" and the word behind it | |
# if don't merge, "ไธ" sometimes appears alone according to jieba, which may occur sandhi error | |
def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: | |
new_seg = [] | |
last_word = "" | |
for word, pos in seg: | |
if last_word == "ไธ": | |
word = last_word + word | |
if word != "ไธ": | |
new_seg.append((word, pos)) | |
last_word = word[:] | |
if last_word == "ไธ": | |
new_seg.append((last_word, "d")) | |
last_word = "" | |
return new_seg | |
# function 1: merge "ไธ" and reduplication words in it's left and right, e.g. "ๅฌ","ไธ","ๅฌ" ->"ๅฌไธๅฌ" | |
# function 2: merge single "ไธ" and the word behind it | |
# if don't merge, "ไธ" sometimes appears alone according to jieba, which may occur sandhi error | |
# e.g. | |
# input seg: [('ๅฌ', 'v'), ('ไธ', 'm'), ('ๅฌ', 'v')] | |
# output seg: [['ๅฌไธๅฌ', 'v']] | |
def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: | |
new_seg = [] | |
i = 0 | |
# function 1 | |
while i < len(seg): | |
word, pos = seg[i] | |
merged = False | |
if ( | |
i - 1 >= 0 | |
and word == "ไธ" | |
and i + 1 < len(seg) | |
): | |
last = new_seg[-1] if new_seg else seg[i - 1] | |
if last[0] == seg[i + 1][0] and last[1] == "v" and seg[i + 1][1] == "v": | |
combined = last[0] + "ไธ" + seg[i + 1][0] | |
new_seg[-1] = [combined, last[1]] | |
i += 2 | |
merged = True | |
if not merged: | |
new_seg.append([word, pos]) | |
i += 1 | |
seg = new_seg | |
new_seg = [] | |
# function 2 | |
for word, pos in seg: | |
if new_seg and new_seg[-1][0] == "ไธ": | |
new_seg[-1][0] = new_seg[-1][0] + word | |
else: | |
new_seg.append([word, pos]) | |
return new_seg | |
# Merge adjacent words when BOTH consist entirely of third-tone syllables.
def _merge_continuous_three_tones(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    """Merge neighbouring all-tone-3 words (combined length <= 3) so that
    third-tone sandhi later sees them as one unit. Reduplicated words are
    never absorbed because they still need _neural_sandhi."""
    finals_per_word = [
        lazy_pinyin(w, neutral_tone_with_five=True, style=Style.FINALS_TONE3) for (w, _) in seg
    ]
    assert len(finals_per_word) == len(seg)
    absorbed = [False] * len(seg)
    merged = []
    for idx, (word, pos) in enumerate(seg):
        mergeable = (
            idx >= 1
            and self._all_tone_three(finals_per_word[idx - 1])
            and self._all_tone_three(finals_per_word[idx])
            and not absorbed[idx - 1]
        )
        if mergeable and not self._is_reduplication(seg[idx - 1][0]) and len(seg[idx - 1][0]) + len(seg[idx][0]) <= 3:
            merged[-1][0] = merged[-1][0] + seg[idx][0]
            absorbed[idx] = True
        else:
            merged.append([word, pos])
    return merged
def _is_reduplication(self, word: str) -> bool: | |
return len(word) == 2 and word[0] == word[1] | |
# Merge adjacent words when the boundary syllables (last of the first word,
# first of the second) are both third tone.
def _merge_continuous_three_tones_2(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    """Like _merge_continuous_three_tones, but only the two syllables at
    the word boundary must be tone 3. Reduplicated words are never
    absorbed because they still need _neural_sandhi."""
    finals_per_word = [
        lazy_pinyin(w, neutral_tone_with_five=True, style=Style.FINALS_TONE3) for (w, _) in seg
    ]
    assert len(finals_per_word) == len(seg)
    absorbed = [False] * len(seg)
    merged = []
    for idx, (word, pos) in enumerate(seg):
        boundary_three = (
            idx >= 1
            and finals_per_word[idx - 1][-1][-1] == "3"
            and finals_per_word[idx][0][-1] == "3"
            and not absorbed[idx - 1]
        )
        if boundary_three and not self._is_reduplication(seg[idx - 1][0]) and len(seg[idx - 1][0]) + len(seg[idx][0]) <= 3:
            merged[-1][0] = merged[-1][0] + seg[idx][0]
            absorbed[idx] = True
        else:
            merged.append([word, pos])
    return merged
def _merge_er(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: | |
new_seg = [] | |
for i, (word, pos) in enumerate(seg): | |
if i - 1 >= 0 and word == "ๅฟ" and seg[i - 1][0] != "#": | |
new_seg[-1][0] = new_seg[-1][0] + seg[i][0] | |
else: | |
new_seg.append([word, pos]) | |
return new_seg | |
def _merge_reduplication(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: | |
new_seg = [] | |
for i, (word, pos) in enumerate(seg): | |
if new_seg and word == new_seg[-1][0]: | |
new_seg[-1][0] = new_seg[-1][0] + seg[i][0] | |
else: | |
new_seg.append([word, pos]) | |
return new_seg | |
def pre_merge_for_modify(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: | |
seg = self._merge_bu(seg) | |
try: | |
seg = self._merge_yi(seg) | |
except: | |
print("_merge_yi failed") | |
seg = self._merge_reduplication(seg) | |
try: | |
seg = self._merge_continuous_three_tones(seg) | |
except: | |
print("_merge_continuous_three_tones failed") | |
try: | |
seg = self._merge_continuous_three_tones_2(seg) | |
except: | |
print("_merge_continuous_three_tones_2 failed") | |
seg = self._merge_er(seg) | |
return seg | |
def modified_tone(self, word: str, pos: str, finals: List[str]) -> List[str]:
    """Apply every tone-sandhi rule to one word, in the canonical order:
    "ไธ" sandhi, "ไธ" sandhi, neutral-tone rewriting, third-tone sandhi.

    Args:
        word: the segmented word.
        pos: jieba part-of-speech tag.
        finals: pypinyin FINALS_TONE3 strings, one per character.
    Returns:
        The adjusted finals list.
    """
    for rule in (self._bu_sandhi, self._yi_sandhi):
        finals = rule(word, finals)
    finals = self._neural_sandhi(word, pos, finals)
    return self._three_sandhi(word, finals)