# GPT-SoVITS-ProPlus / text / tone_sandhi.py
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Tuple

import jieba_fast as jieba
from pypinyin import Style, lazy_pinyin


class ToneSandhi:
def __init__(self):
self.must_neural_tone_words = {
"้บป็ƒฆ",
"้บปๅˆฉ",
"้ธณ้ธฏ",
"้ซ˜็ฒฑ",
"้ชจๅคด",
"้ช†้ฉผ",
"้ฉฌ่™Ž",
"้ฆ–้ฅฐ",
"้ฆ’ๅคด",
"้ฆ„้ฅจ",
"้ฃŽ็ญ",
"้šพไธบ",
"้˜Ÿไผ",
"้˜”ๆฐ”",
"้—บๅฅณ",
"้—จ้“",
"้”„ๅคด",
"้“บ็›–",
"้“ƒ้“›",
"้“ๅŒ ",
"้’ฅๅŒ™",
"้‡Œ่„Š",
"้‡Œๅคด",
"้ƒจๅˆ†",
"้‚ฃไนˆ",
"้“ๅฃซ",
"้€ ๅŒ–",
"่ฟท็ณŠ",
"่ฟž็ดฏ",
"่ฟ™ไนˆ",
"่ฟ™ไธช",
"่ฟๆฐ”",
"่ฟ‡ๅŽป",
"่ฝฏๅ’Œ",
"่ฝฌๆ‚ ",
"่ธๅฎž",
"่ทณ่šค",
"่ทŸๅคด",
"่ถ”่ถ„",
"่ดขไธป",
"่ฑ†่…",
"่ฎฒ็ฉถ",
"่ฎฐๆ€ง",
"่ฎฐๅท",
"่ฎค่ฏ†",
"่ง„็Ÿฉ",
"่ง่ฏ†",
"่ฃ็ผ",
"่กฅไธ",
"่กฃ่ฃณ",
"่กฃๆœ",
"่ก™้—จ",
"่ก—ๅŠ",
"่กŒๆŽ",
"่กŒๅฝ“",
"่›ค่Ÿ†",
"่˜‘่‡",
"่–„่ท",
"่‘ซ่Šฆ",
"่‘ก่„",
"่ๅœ",
"่ธ่ ",
"่‹—ๆก",
"่‹—ๅคด",
"่‹่‡",
"่Š้บป",
"่ˆ’ๆœ",
"่ˆ’ๅฆ",
"่ˆŒๅคด",
"่‡ชๅœจ",
"่†่ฏ",
"่„พๆฐ”",
"่„‘่ข‹",
"่„Šๆข",
"่ƒฝ่€",
"่ƒณ่†Š",
"่ƒญ่„‚",
"่ƒก่",
"่ƒก็ด",
"่ƒกๅŒ",
"่ชๆ˜Ž",
"่€ฝ่ฏฏ",
"่€ฝๆ",
"่€ทๆ‹‰",
"่€ณๆœต",
"่€็ˆท",
"่€ๅฎž",
"่€ๅฉ†",
"่€ๅคด",
"่€ๅคช",
"็ฟป่…พ",
"็ฝ—ๅ—ฆ",
"็ฝๅคด",
"็ผ–่พ‘",
"็ป“ๅฎž",
"็บข็ซ",
"็ดฏ่ต˜",
"็ณจ็ณŠ",
"็ณŠๆถ‚",
"็ฒพ็ฅž",
"็ฒฎ้ฃŸ",
"็ฐธ็ฎ•",
"็ฏฑ็ฌ†",
"็ฎ—่ฎก",
"็ฎ—็›˜",
"็ญ”ๅบ”",
"็ฌคๅธš",
"็ฌ‘่ฏญ",
"็ฌ‘่ฏ",
"็ชŸ็ชฟ",
"็ชๅ›Š",
"็ช—ๆˆท",
"็จณๅฝ“",
"็จ€็ฝ•",
"็งฐๅ‘ผ",
"็งงๆญŒ",
"็ง€ๆฐ”",
"็ง€ๆ‰",
"็ฆๆฐ”",
"็ฅ–ๅฎ—",
"็ šๅฐ",
"็ ๅคด",
"็Ÿณๆฆด",
"็Ÿณๅคด",
"็ŸณๅŒ ",
"็Ÿฅ่ฏ†",
"็œผ็›",
"็œฏ็ผ",
"็œจๅทด",
"็œ‰ๆฏ›",
"็›ธๅฃฐ",
"็›˜็ฎ—",
"็™ฝๅ‡€",
"็—ข็–พ",
"็—›ๅฟซ",
"็–Ÿ็–พ",
"็–™็˜ฉ",
"็–ๅฟฝ",
"็•œ็”Ÿ",
"็”Ÿๆ„",
"็”˜่”—",
"็ต็ถ",
"็ข็ฃจ",
"็‰็’ƒ",
"็Žป็’ƒ",
"็Žซ็‘ฐ",
"็Ž„ไนŽ",
"็‹็‹ธ",
"็Šถๅ…ƒ",
"็‰นๅŠก",
"็‰ฒๅฃ",
"็‰™็ขœ",
"็‰Œๆฅผ",
"็ˆฝๅฟซ",
"็ˆฑไบบ",
"็ƒญ้—น",
"็ƒง้ฅผ",
"็ƒŸ็ญ’",
"็ƒ‚็ณŠ",
"็‚นๅฟƒ",
"็‚Šๅธš",
"็ฏ็ฌผ",
"็ซๅ€™",
"ๆผ‚ไบฎ",
"ๆป‘ๆบœ",
"ๆบœ่พพ",
"ๆธฉๅ’Œ",
"ๆธ…ๆฅš",
"ๆถˆๆฏ",
"ๆตชๅคด",
"ๆดปๆณผ",
"ๆฏ”ๆ–น",
"ๆญฃ็ป",
"ๆฌบ่ดŸ",
"ๆจก็ณŠ",
"ๆงŸๆฆ”",
"ๆฃบๆ",
"ๆฃ’ๆงŒ",
"ๆฃ‰่Šฑ",
"ๆ ธๆกƒ",
"ๆ …ๆ ",
"ๆŸด็ซ",
"ๆžถๅŠฟ",
"ๆž•ๅคด",
"ๆž‡ๆท",
"ๆœบ็ต",
"ๆœฌไบ‹",
"ๆœจๅคด",
"ๆœจๅŒ ",
"ๆœ‹ๅ‹",
"ๆœˆ้ฅผ",
"ๆœˆไบฎ",
"ๆš–ๅ’Œ",
"ๆ˜Ž็™ฝ",
"ๆ—ถๅ€™",
"ๆ–ฐ้ฒœ",
"ๆ•…ไบ‹",
"ๆ”ถๆ‹พ",
"ๆ”ถๆˆ",
"ๆ้˜ฒ",
"ๆŒ–่‹ฆ",
"ๆŒ‘ๅ‰”",
"ๆŒ‡็”ฒ",
"ๆŒ‡ๅคด",
"ๆ‹พๆއ",
"ๆ‹ณๅคด",
"ๆ‹จๅผ„",
"ๆ‹›็‰Œ",
"ๆ‹›ๅ‘ผ",
"ๆŠฌไธพ",
"ๆŠคๅฃซ",
"ๆŠ˜่…พ",
"ๆ‰ซๅธš",
"ๆ‰“้‡",
"ๆ‰“็ฎ—",
"ๆ‰“็‚น",
"ๆ‰“ๆ‰ฎ",
"ๆ‰“ๅฌ",
"ๆ‰“ๅ‘",
"ๆ‰Žๅฎž",
"ๆ‰ๆ‹…",
"ๆˆ’ๆŒ‡",
"ๆ‡’ๅพ—",
"ๆ„่ฏ†",
"ๆ„ๆ€",
"ๆƒ…ๅฝข",
"ๆ‚Ÿๆ€ง",
"ๆ€ช็‰ฉ",
"ๆ€้‡",
"ๆ€Žไนˆ",
"ๅฟตๅคด",
"ๅฟตๅจ",
"ๅฟซๆดป",
"ๅฟ™ๆดป",
"ๅฟ—ๆฐ”",
"ๅฟƒๆ€",
"ๅพ—็ฝช",
"ๅผ ็ฝ—",
"ๅผŸๅ…„",
"ๅผ€้€š",
"ๅบ”้…ฌ",
"ๅบ„็จผ",
"ๅนฒไบ‹",
"ๅธฎๆ‰‹",
"ๅธ็ฏท",
"ๅธŒ็ฝ•",
"ๅธˆ็ˆถ",
"ๅธˆๅ‚…",
"ๅทด็ป“",
"ๅทดๆŽŒ",
"ๅทฎไบ‹",
"ๅทฅๅคซ",
"ๅฒๆ•ฐ",
"ๅฑ่‚ก",
"ๅฐพๅทด",
"ๅฐ‘็ˆท",
"ๅฐๆฐ”",
"ๅฐไผ™",
"ๅฐ†ๅฐฑ",
"ๅฏนๅคด",
"ๅฏนไป˜",
"ๅฏกๅฆ‡",
"ๅฎถไผ™",
"ๅฎขๆฐ”",
"ๅฎžๅœจ",
"ๅฎ˜ๅธ",
"ๅญฆ้—ฎ",
"ๅญฆ็”Ÿ",
"ๅญ—ๅท",
"ๅซๅฆ†",
"ๅชณๅฆ‡",
"ๅช’ไบบ",
"ๅฉ†ๅฎถ",
"ๅจ˜ๅฎถ",
"ๅง”ๅฑˆ",
"ๅง‘ๅจ˜",
"ๅงๅคซ",
"ๅฆฏๅจŒ",
"ๅฆฅๅฝ“",
"ๅฆ–็ฒพ",
"ๅฅดๆ‰",
"ๅฅณๅฉฟ",
"ๅคดๅ‘",
"ๅคช้˜ณ",
"ๅคง็ˆท",
"ๅคงๆ–น",
"ๅคงๆ„",
"ๅคงๅคซ",
"ๅคšๅฐ‘",
"ๅคšไนˆ",
"ๅค–็”ฅ",
"ๅฃฎๅฎž",
"ๅœฐ้“",
"ๅœฐๆ–น",
"ๅœจไนŽ",
"ๅ›ฐ้šพ",
"ๅ˜ดๅทด",
"ๅ˜ฑๅ’",
"ๅ˜Ÿๅ›”",
"ๅ˜€ๅ’•",
"ๅ–œๆฌข",
"ๅ–‡ๅ˜›",
"ๅ–‡ๅญ",
"ๅ•†้‡",
"ๅ”พๆฒซ",
"ๅ“‘ๅทด",
"ๅ“ˆๆฌ ",
"ๅ“†ๅ—ฆ",
"ๅ’ณๅ—ฝ",
"ๅ’Œๅฐš",
"ๅ‘Š่ฏ‰",
"ๅ‘Š็คบ",
"ๅซ็ณŠ",
"ๅ“ๅ”ฌ",
"ๅŽๅคด",
"ๅๅญ—",
"ๅๅ ‚",
"ๅˆๅŒ",
"ๅ†ๅ–",
"ๅซๅ”ค",
"ๅฃ่ข‹",
"ๅŽš้“",
"ๅމๅฎณ",
"ๅƒๆ–ค",
"ๅŒ…่ขฑ",
"ๅŒ…ๆถต",
"ๅŒ€็งฐ",
"ๅ‹คๅฟซ",
"ๅŠจ้™",
"ๅŠจๅผน",
"ๅŠŸๅคซ",
"ๅŠ›ๆฐ”",
"ๅ‰ๅคด",
"ๅˆบ็Œฌ",
"ๅˆบๆฟ€",
"ๅˆซๆ‰ญ",
"ๅˆฉ่ฝ",
"ๅˆฉ็ดข",
"ๅˆฉๅฎณ",
"ๅˆ†ๆž",
"ๅ‡บๆฏ",
"ๅ‡‘ๅˆ",
"ๅ‡‰ๅฟซ",
"ๅ†ทๆˆ˜",
"ๅ†คๆž‰",
"ๅ†’ๅคฑ",
"ๅ…ปๆดป",
"ๅ…ณ็ณป",
"ๅ…ˆ็”Ÿ",
"ๅ…„ๅผŸ",
"ไพฟๅฎœ",
"ไฝฟๅ”ค",
"ไฝฉๆœ",
"ไฝœๅŠ",
"ไฝ“้ข",
"ไฝ็ฝฎ",
"ไผผ็š„",
"ไผ™่ฎก",
"ไผ‘ๆฏ",
"ไป€ไนˆ",
"ไบบๅฎถ",
"ไบฒๆˆš",
"ไบฒๅฎถ",
"ไบคๆƒ…",
"ไบ‘ๅฝฉ",
"ไบ‹ๆƒ…",
"ไนฐๅ–",
"ไธปๆ„",
"ไธซๅคด",
"ไธงๆฐ”",
"ไธคๅฃ",
"ไธœ่ฅฟ",
"ไธœๅฎถ",
"ไธ–ๆ•…",
"ไธ็”ฑ",
"ไธๅœจ",
"ไธ‹ๆฐด",
"ไธ‹ๅทด",
"ไธŠๅคด",
"ไธŠๅธ",
"ไธˆๅคซ",
"ไธˆไบบ",
"ไธ€่พˆ",
"้‚ฃไธช",
"่ฉ่จ",
"็ˆถไบฒ",
"ๆฏไบฒ",
"ๅ’•ๅ™œ",
"้‚‹้ข",
"่ดน็”จ",
"ๅ†คๅฎถ",
"็”œๅคด",
"ไป‹็ป",
"่’ๅ”",
"ๅคงไบบ",
"ๆณฅ้ณ…",
"ๅนธ็ฆ",
"็†Ÿๆ‚‰",
"่ฎกๅˆ’",
"ๆ‰‘่…พ",
"่œก็ƒ›",
"ๅงฅ็ˆท",
"็…ง้กพ",
"ๅ–‰ๅ’™",
"ๅ‰ไป–",
"ๅผ„ๅ ‚",
"่š‚่šฑ",
"ๅ‡คๅ‡ฐ",
"ๆ‹–ๆฒ“",
"ๅฏ’็ขœ",
"็ณŸ่น‹",
"ๅ€’่…พ",
"ๆŠฅๅค",
"้€ป่พ‘",
"็›˜็ผ ",
"ๅ–ฝๅ•ฐ",
"็‰ข้ชš",
"ๅ’–ๅ–ฑ",
"ๆ‰ซๆŠŠ",
"ๆƒฆ่ฎฐ",
}
self.must_not_neural_tone_words = {
"็”ทๅญ",
"ๅฅณๅญ",
"ๅˆ†ๅญ",
"ๅŽŸๅญ",
"้‡ๅญ",
"่Žฒๅญ",
"็Ÿณๅญ",
"็“œๅญ",
"็”ตๅญ",
"ไบบไบบ",
"่™Ž่™Ž",
"ๅนบๅนบ",
"ๅนฒๅ˜›",
"ๅญฆๅญ",
"ๅ“ˆๅ“ˆ",
"ๆ•ฐๆ•ฐ",
"่ข…่ข…",
"ๅฑ€ๅœฐ",
"ไปฅไธ‹",
"ๅจƒๅ“ˆๅ“ˆ",
"่Šฑ่Šฑ่‰่‰",
"็•™ๅพ—",
"่€•ๅœฐ",
"ๆƒณๆƒณ",
"็†™็†™",
"ๆ”˜ๆ”˜",
"ๅตๅญ",
"ๆญปๆญป",
"ๅ†‰ๅ†‰",
"ๆณๆณ",
"ไฝผไฝผ",
"ๅตๅต",
"ๆ‰“ๆ‰“",
"่€ƒ่€ƒ",
"ๆ•ดๆ•ด",
"่Ž˜่Ž˜",
"่ฝๅœฐ",
"็ฎ—ๅญ",
"ๅฎถๅฎถๆˆทๆˆท",
"้’้’",
}
self.punc = "๏ผš๏ผŒ๏ผ›ใ€‚๏ผŸ๏ผโ€œโ€โ€˜โ€™':,;.?!"
# the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041
# e.g.
# word: "ๅฎถ้‡Œ"
# pos: "s"
# finals: ['ia1', 'i3']
def _neural_sandhi(self, word: str, pos: str, finals: List[str]) -> List[str]:
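        """Retag finals to the neutral tone ("5") where convention demands it:
        reduplications, sentence-final particles, ็š„/ๅœฐ/ๅพ—, aspect markers
        ไบ†/็€/่ฟ‡, suffixes like ไปฌ/ๅญ and locatives, directional ๆฅ/ๅŽป, the
        measure word ไธช, and words in must_neural_tone_words."""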
        # reduplicated nouns, verbs and adjectives take the neutral tone, e.g. ๅฅถๅฅถ, ่ฏ•่ฏ•, ๆ—บๆ—บ
for j, item in enumerate(word):
if (
j - 1 >= 0
and item == word[j - 1]
and pos[0] in {"n", "v", "a"}
and word not in self.must_not_neural_tone_words
):
finals[j] = finals[j][:-1] + "5"
ge_idx = word.find("ไธช")
if len(word) >= 1 and word[-1] in "ๅงๅ‘ขๅ“ˆๅ•Šๅ‘ๅ™ปๅ˜›ๅ–ๅ—จๅ‘ๅ“ฆๅ“’้ขๆปดๅ“ฉๅ“Ÿๅ–ฝๅ•ฐ่€ถๅ–”่ฏถ":
finals[-1] = finals[-1][:-1] + "5"
elif len(word) >= 1 and word[-1] in "็š„ๅœฐๅพ—":
finals[-1] = finals[-1][:-1] + "5"
# e.g. ่ตฐไบ†, ็œ‹็€, ๅŽป่ฟ‡
elif len(word) == 1 and word in "ไบ†็€่ฟ‡" and pos in {"ul", "uz", "ug"}:
finals[-1] = finals[-1][:-1] + "5"
elif len(word) > 1 and word[-1] in "ไปฌๅญ" and pos in {"r", "n"} and word not in self.must_not_neural_tone_words:
finals[-1] = finals[-1][:-1] + "5"
# e.g. ๆกŒไธŠ, ๅœฐไธ‹, ๅฎถ้‡Œ
elif len(word) > 1 and word[-1] in "ไธŠไธ‹้‡Œ" and pos in {"s", "l", "f"}:
finals[-1] = finals[-1][:-1] + "5"
# e.g. ไธŠๆฅ, ไธ‹ๅŽป
elif len(word) > 1 and word[-1] in "ๆฅๅŽป" and word[-2] in "ไธŠไธ‹่ฟ›ๅ‡บๅ›ž่ฟ‡่ตทๅผ€":
finals[-1] = finals[-1][:-1] + "5"
# ไธชๅš้‡่ฏ
elif (
ge_idx >= 1 and (word[ge_idx - 1].isnumeric() or word[ge_idx - 1] in "ๅ‡ ๆœ‰ไธคๅŠๅคšๅ„ๆ•ดๆฏๅšๆ˜ฏ")
) or word == "ไธช":
finals[ge_idx] = finals[ge_idx][:-1] + "5"
else:
if word in self.must_neural_tone_words or word[-2:] in self.must_neural_tone_words:
finals[-1] = finals[-1][:-1] + "5"
word_list = self._split_word(word)
finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]]
for i, word in enumerate(word_list):
            # conventional neutral-tone words in Chinese
if word in self.must_neural_tone_words or word[-2:] in self.must_neural_tone_words:
finals_list[i][-1] = finals_list[i][-1][:-1] + "5"
finals = sum(finals_list, [])
return finals
def _bu_sandhi(self, word: str, finals: List[str]) -> List[str]:
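        """Tone sandhi for "ไธ": neutral tone inside an A-ไธ-A pattern, tone 2
        before a tone-4 syllable, otherwise unchanged."""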
# e.g. ็œ‹ไธๆ‡‚
if len(word) == 3 and word[1] == "ไธ":
finals[1] = finals[1][:-1] + "5"
else:
for i, char in enumerate(word):
# "ไธ" before tone4 should be bu2, e.g. ไธๆ€•
if char == "ไธ" and i + 1 < len(word) and finals[i + 1][-1] == "4":
finals[i] = finals[i][:-1] + "2"
return finals
def _yi_sandhi(self, word: str, finals: List[str]) -> List[str]:
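        """Tone sandhi for "ไธ€": unchanged in number sequences, neutral between
        reduplicated verbs, tone 1 in ordinals, tone 2 before a tone-4
        syllable, and tone 4 before other tones."""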
# "ไธ€" in number sequences, e.g. ไธ€้›ถ้›ถ, ไบŒไธ€้›ถ
if word.find("ไธ€") != -1 and all([item.isnumeric() for item in word if item != "ไธ€"]):
return finals
# "ไธ€" between reduplication words shold be yi5, e.g. ็œ‹ไธ€็œ‹
elif len(word) == 3 and word[1] == "ไธ€" and word[0] == word[-1]:
finals[1] = finals[1][:-1] + "5"
# when "ไธ€" is ordinal word, it should be yi1
elif word.startswith("็ฌฌไธ€"):
finals[1] = finals[1][:-1] + "1"
else:
for i, char in enumerate(word):
if char == "ไธ€" and i + 1 < len(word):
# "ไธ€" before tone4 should be yi2, e.g. ไธ€ๆฎต
if finals[i + 1][-1] == "4":
finals[i] = finals[i][:-1] + "2"
# "ไธ€" before non-tone4 should be yi4, e.g. ไธ€ๅคฉ
else:
# "ไธ€" ๅŽ้ขๅฆ‚ๆžœๆ˜ฏๆ ‡็‚น๏ผŒ่ฟ˜่ฏปไธ€ๅฃฐ
if word[i + 1] not in self.punc:
finals[i] = finals[i][:-1] + "4"
return finals
def _split_word(self, word: str) -> List[str]:
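        """Split a word in two around the shortest subword found by jieba's
        search-mode segmentation, keeping the original character order."""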
word_list = jieba.cut_for_search(word)
        word_list = sorted(word_list, key=len)
first_subword = word_list[0]
first_begin_idx = word.find(first_subword)
if first_begin_idx == 0:
second_subword = word[len(first_subword) :]
new_word_list = [first_subword, second_subword]
else:
second_subword = word[: -len(first_subword)]
new_word_list = [second_subword, first_subword]
return new_word_list
def _three_sandhi(self, word: str, finals: List[str]) -> List[str]:
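        """Third-tone sandhi: a tone-3 syllable is raised to tone 2 before
        another tone-3 syllable, applied according to the word's internal
        split for 3-character words and pairwise for 4-character idioms."""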
if len(word) == 2 and self._all_tone_three(finals):
finals[0] = finals[0][:-1] + "2"
elif len(word) == 3:
word_list = self._split_word(word)
if self._all_tone_three(finals):
# disyllabic + monosyllabic, e.g. ่’™ๅค/ๅŒ…
if len(word_list[0]) == 2:
finals[0] = finals[0][:-1] + "2"
finals[1] = finals[1][:-1] + "2"
# monosyllabic + disyllabic, e.g. ็บธ/่€่™Ž
elif len(word_list[0]) == 1:
finals[1] = finals[1][:-1] + "2"
else:
finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]]
if len(finals_list) == 2:
for i, sub in enumerate(finals_list):
# e.g. ๆ‰€ๆœ‰/ไบบ
if self._all_tone_three(sub) and len(sub) == 2:
finals_list[i][0] = finals_list[i][0][:-1] + "2"
# e.g. ๅฅฝ/ๅ–œๆฌข
elif (
i == 1
and not self._all_tone_three(sub)
and finals_list[i][0][-1] == "3"
and finals_list[0][-1][-1] == "3"
):
finals_list[0][-1] = finals_list[0][-1][:-1] + "2"
finals = sum(finals_list, [])
        # split an idiom into two two-character words
elif len(word) == 4:
finals_list = [finals[:2], finals[2:]]
finals = []
for sub in finals_list:
if self._all_tone_three(sub):
sub[0] = sub[0][:-1] + "2"
finals += sub
return finals
def _all_tone_three(self, finals: List[str]) -> bool:
return all(x[-1] == "3" for x in finals)
# merge "ไธ" and the word behind it
# if don't merge, "ไธ" sometimes appears alone according to jieba, which may occur sandhi error
def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
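        """Prepend a dangling "ไธ" segment to the word that follows it."""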
new_seg = []
last_word = ""
for word, pos in seg:
if last_word == "ไธ":
word = last_word + word
if word != "ไธ":
new_seg.append((word, pos))
last_word = word[:]
if last_word == "ไธ":
new_seg.append((last_word, "d"))
last_word = ""
return new_seg
    # function 1: merge "ไธ€" and the reduplicated words on its left and right, e.g. "ๅฌ","ไธ€","ๅฌ" -> "ๅฌไธ€ๅฌ"
    # function 2: merge a single "ไธ€" and the word behind it
    # if we don't merge, "ไธ€" sometimes appears alone according to jieba, which may cause sandhi errors
# e.g.
# input seg: [('ๅฌ', 'v'), ('ไธ€', 'm'), ('ๅฌ', 'v')]
# output seg: [['ๅฌไธ€ๅฌ', 'v']]
def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
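        """Merge "ไธ€" into V-ไธ€-V reduplications first, then attach any
        remaining "ไธ€" segment to the word that follows it."""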
new_seg = []
i = 0
# function 1
while i < len(seg):
word, pos = seg[i]
merged = False
if (
i - 1 >= 0
and word == "ไธ€"
and i + 1 < len(seg)
):
last = new_seg[-1] if new_seg else seg[i - 1]
if last[0] == seg[i + 1][0] and last[1] == "v" and seg[i + 1][1] == "v":
combined = last[0] + "ไธ€" + seg[i + 1][0]
new_seg[-1] = [combined, last[1]]
i += 2
merged = True
if not merged:
new_seg.append([word, pos])
i += 1
seg = new_seg
new_seg = []
# function 2
for word, pos in seg:
if new_seg and new_seg[-1][0] == "ไธ€":
new_seg[-1][0] = new_seg[-1][0] + word
else:
new_seg.append([word, pos])
return new_seg
    # merge adjacent words that both consist entirely of tone-three syllables
def _merge_continuous_three_tones(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
new_seg = []
sub_finals_list = [
lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) for (word, pos) in seg
]
assert len(sub_finals_list) == len(seg)
merge_last = [False] * len(seg)
for i, (word, pos) in enumerate(seg):
if (
i - 1 >= 0
and self._all_tone_three(sub_finals_list[i - 1])
and self._all_tone_three(sub_finals_list[i])
and not merge_last[i - 1]
):
                # if the previous word is a reduplication, do not merge, because reduplications need _neural_sandhi
if not self._is_reduplication(seg[i - 1][0]) and len(seg[i - 1][0]) + len(seg[i][0]) <= 3:
new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
merge_last[i] = True
else:
new_seg.append([word, pos])
else:
new_seg.append([word, pos])
return new_seg
def _is_reduplication(self, word: str) -> bool:
return len(word) == 2 and word[0] == word[1]
    # merge when the last syllable of the first word and the first syllable of the second word are both tone three
def _merge_continuous_three_tones_2(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
new_seg = []
sub_finals_list = [
lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) for (word, pos) in seg
]
assert len(sub_finals_list) == len(seg)
merge_last = [False] * len(seg)
for i, (word, pos) in enumerate(seg):
if (
i - 1 >= 0
and sub_finals_list[i - 1][-1][-1] == "3"
and sub_finals_list[i][0][-1] == "3"
and not merge_last[i - 1]
):
                # if the previous word is a reduplication, do not merge, because reduplications need _neural_sandhi
if not self._is_reduplication(seg[i - 1][0]) and len(seg[i - 1][0]) + len(seg[i][0]) <= 3:
new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
merge_last[i] = True
else:
new_seg.append([word, pos])
else:
new_seg.append([word, pos])
return new_seg
def _merge_er(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
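        """Attach an erhua "ๅ„ฟ" segment to the preceding word."""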
new_seg = []
for i, (word, pos) in enumerate(seg):
if i - 1 >= 0 and word == "ๅ„ฟ" and seg[i - 1][0] != "#":
new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
else:
new_seg.append([word, pos])
return new_seg
def _merge_reduplication(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
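        """Merge identical adjacent segments into one word, e.g. ๅฅฝ + ๅฅฝ -> ๅฅฝๅฅฝ."""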
new_seg = []
for i, (word, pos) in enumerate(seg):
if new_seg and word == new_seg[-1][0]:
new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
else:
new_seg.append([word, pos])
return new_seg
def pre_merge_for_modify(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
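        """Re-merge jieba's segmentation so the sandhi rules see whole units:
        ไธ/ไธ€ attachment, reduplications, tone-3 runs and ๅ„ฟ suffixes."""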
seg = self._merge_bu(seg)
        try:
            seg = self._merge_yi(seg)
        except Exception as e:
            print(f"_merge_yi failed: {e}")
        seg = self._merge_reduplication(seg)
        try:
            seg = self._merge_continuous_three_tones(seg)
        except Exception as e:
            print(f"_merge_continuous_three_tones failed: {e}")
        try:
            seg = self._merge_continuous_three_tones_2(seg)
        except Exception as e:
            print(f"_merge_continuous_three_tones_2 failed: {e}")
seg = self._merge_er(seg)
return seg
def modified_tone(self, word: str, pos: str, finals: List[str]) -> List[str]:
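        """Apply ไธ, ไธ€, neutral-tone and third-tone sandhi to a word's finals
        and return the modified finals."""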
finals = self._bu_sandhi(word, finals)
finals = self._yi_sandhi(word, finals)
finals = self._neural_sandhi(word, pos, finals)
finals = self._three_sandhi(word, finals)
return finals
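

# Minimal usage sketch (an illustrative addition, not part of the upstream
# file). It assumes jieba_fast mirrors jieba's posseg API (jieba_fast.posseg)
# and builds finals the same way the merge helpers above do, via pypinyin
# with Style.FINALS_TONE3, where a trailing "5" marks the neutral tone.
if __name__ == "__main__":
    import jieba_fast.posseg as psg

    sandhi = ToneSandhi()
    # e.g. ไธ€ in the merged ๅฌไธ€ๅฌ and the last syllables of ๆ˜Ž็™ฝ/ไบ† become tone "5"
    sentence = "ไป–ๅฌไธ€ๅฌๅฐฑๆ˜Ž็™ฝไบ†"
    seg = [(pair.word, pair.flag) for pair in psg.cut(sentence)]
    seg = sandhi.pre_merge_for_modify(seg)
    for word, pos in seg:
        finals = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
        print(word, pos, sandhi.modified_tone(word, pos, finals))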