|
|
|
|
|
|
|
|
|
|
|
|
|
|
import re |
|
|
|
|
|
# Kana -> romaji lookup table, built once at import time instead of on every
# call. Two-character keys are digraphs and must be matched before their first
# character is looked up on its own.
_KANA_TO_ROMAJI = {
    # Katakana: basic syllables
    'ア': 'a', 'イ': 'i', 'ウ': 'u', 'エ': 'e', 'オ': 'o',
    'カ': 'ka', 'キ': 'ki', 'ク': 'ku', 'ケ': 'ke', 'コ': 'ko',
    'サ': 'sa', 'シ': 'shi', 'ス': 'su', 'セ': 'se', 'ソ': 'so',
    'タ': 'ta', 'チ': 'chi', 'ツ': 'tsu', 'テ': 'te', 'ト': 'to',
    'ナ': 'na', 'ニ': 'ni', 'ヌ': 'nu', 'ネ': 'ne', 'ノ': 'no',
    'ハ': 'ha', 'ヒ': 'hi', 'フ': 'fu', 'ヘ': 'he', 'ホ': 'ho',
    'マ': 'ma', 'ミ': 'mi', 'ム': 'mu', 'メ': 'me', 'モ': 'mo',
    'ヤ': 'ya', 'ユ': 'yu', 'ヨ': 'yo',
    'ラ': 'ra', 'リ': 'ri', 'ル': 'ru', 'レ': 're', 'ロ': 'ro',
    'ワ': 'wa', 'ヲ': 'wo', 'ン': 'n',

    # Katakana: voiced / semi-voiced syllables
    'ガ': 'ga', 'ギ': 'gi', 'グ': 'gu', 'ゲ': 'ge', 'ゴ': 'go',
    'ザ': 'za', 'ジ': 'ji', 'ズ': 'zu', 'ゼ': 'ze', 'ゾ': 'zo',
    'ダ': 'da', 'ヂ': 'ji', 'ヅ': 'zu', 'デ': 'de', 'ド': 'do',
    'バ': 'ba', 'ビ': 'bi', 'ブ': 'bu', 'ベ': 'be', 'ボ': 'bo',
    'パ': 'pa', 'ピ': 'pi', 'プ': 'pu', 'ペ': 'pe', 'ポ': 'po',

    # Katakana: digraphs with small ya/yu/yo
    'キャ': 'kya', 'キュ': 'kyu', 'キョ': 'kyo',
    'シャ': 'sha', 'シュ': 'shu', 'ショ': 'sho',
    'チャ': 'cha', 'チュ': 'chu', 'チョ': 'cho',
    'ニャ': 'nya', 'ニュ': 'nyu', 'ニョ': 'nyo',
    'ヒャ': 'hya', 'ヒュ': 'hyu', 'ヒョ': 'hyo',
    'ミャ': 'mya', 'ミュ': 'myu', 'ミョ': 'myo',
    'リャ': 'rya', 'リュ': 'ryu', 'リョ': 'ryo',
    'ギャ': 'gya', 'ギュ': 'gyu', 'ギョ': 'gyo',
    'ジャ': 'ja', 'ジュ': 'ju', 'ジョ': 'jo',
    'ビャ': 'bya', 'ビュ': 'byu', 'ビョ': 'byo',
    'ピャ': 'pya', 'ピュ': 'pyu', 'ピョ': 'pyo',

    # Katakana: sokuon (handled separately in the loop) and stray small kana
    'ッ': '',
    'ャ': 'ya', 'ュ': 'yu', 'ョ': 'yo',

    # Katakana: extended sounds used for loanwords
    'ヴ': 'vu', 'ファ': 'fa', 'フィ': 'fi', 'フェ': 'fe', 'フォ': 'fo',
    'ウィ': 'wi', 'ウェ': 'we', 'ウォ': 'wo',

    # Hiragana: basic syllables
    'あ': 'a', 'い': 'i', 'う': 'u', 'え': 'e', 'お': 'o',
    'か': 'ka', 'き': 'ki', 'く': 'ku', 'け': 'ke', 'こ': 'ko',
    'さ': 'sa', 'し': 'shi', 'す': 'su', 'せ': 'se', 'そ': 'so',
    'た': 'ta', 'ち': 'chi', 'つ': 'tsu', 'て': 'te', 'と': 'to',
    'な': 'na', 'に': 'ni', 'ぬ': 'nu', 'ね': 'ne', 'の': 'no',
    'は': 'ha', 'ひ': 'hi', 'ふ': 'fu', 'へ': 'he', 'ほ': 'ho',
    'ま': 'ma', 'み': 'mi', 'む': 'mu', 'め': 'me', 'も': 'mo',
    'や': 'ya', 'ゆ': 'yu', 'よ': 'yo',
    'ら': 'ra', 'り': 'ri', 'る': 'ru', 'れ': 're', 'ろ': 'ro',
    'わ': 'wa', 'を': 'wo', 'ん': 'n',

    # Hiragana: voiced / semi-voiced syllables
    'が': 'ga', 'ぎ': 'gi', 'ぐ': 'gu', 'げ': 'ge', 'ご': 'go',
    'ざ': 'za', 'じ': 'ji', 'ず': 'zu', 'ぜ': 'ze', 'ぞ': 'zo',
    'だ': 'da', 'ぢ': 'ji', 'づ': 'zu', 'で': 'de', 'ど': 'do',
    'ば': 'ba', 'び': 'bi', 'ぶ': 'bu', 'べ': 'be', 'ぼ': 'bo',
    'ぱ': 'pa', 'ぴ': 'pi', 'ぷ': 'pu', 'ぺ': 'pe', 'ぽ': 'po',

    # Hiragana: digraphs with small ya/yu/yo
    'きゃ': 'kya', 'きゅ': 'kyu', 'きょ': 'kyo',
    'しゃ': 'sha', 'しゅ': 'shu', 'しょ': 'sho',
    'ちゃ': 'cha', 'ちゅ': 'chu', 'ちょ': 'cho',
    'にゃ': 'nya', 'にゅ': 'nyu', 'にょ': 'nyo',
    'ひゃ': 'hya', 'ひゅ': 'hyu', 'ひょ': 'hyo',
    'みゃ': 'mya', 'みゅ': 'myu', 'みょ': 'myo',
    'りゃ': 'rya', 'りゅ': 'ryu', 'りょ': 'ryo',
    'ぎゃ': 'gya', 'ぎゅ': 'gyu', 'ぎょ': 'gyo',
    'じゃ': 'ja', 'じゅ': 'ju', 'じょ': 'jo',
    'びゃ': 'bya', 'びゅ': 'byu', 'びょ': 'byo',
    'ぴゃ': 'pya', 'ぴゅ': 'pyu', 'ぴょ': 'pyo',

    # Hiragana: sokuon (handled separately in the loop) and stray small kana
    'っ': '',
    'ゃ': 'ya', 'ゅ': 'yu', 'ょ': 'yo',

    # Spacing and punctuation
    ' ': ' ',
    '\u3000': ' ',  # ideographic (full-width) space
    '、': ', ', '。': '. ',
}

# Small tsu: doubles the consonant of the syllable that follows it.
_SOKUON = ('っ', 'ッ')


def japanese_to_romaji(japanese_text):
    """
    Convert Japanese hiragana and katakana to romaji (Latin alphabet representation).

    Characters without an entry in the lookup table (kanji, Latin letters,
    digits, ...) are copied to the output unchanged.

    Args:
        japanese_text (str): Text containing hiragana and/or katakana characters

    Returns:
        str: The romaji (Latin alphabet) equivalent
    """
    result = []
    length = len(japanese_text)
    i = 0

    while i < length:
        char = japanese_text[i]
        next_char = japanese_text[i + 1] if i + 1 < length else ''

        # Sokuon: emit the first consonant of the following syllable, then let
        # the next iteration emit that syllable itself. Vowels and 'n' are not
        # doubled. A sokuon at the very end of the text falls through to the
        # table below and contributes nothing.
        if next_char and char in _SOKUON:
            next_romaji = _KANA_TO_ROMAJI.get(next_char, '')
            if next_romaji and next_romaji[0] not in 'aiueon':
                result.append(next_romaji[0])
            i += 1
            continue

        # Digraphs: try every two-character key, not only those ending in
        # small ya/yu/yo, so that entries such as 'ファ' or 'ウェ' are reachable.
        pair = char + next_char
        if next_char and pair in _KANA_TO_ROMAJI:
            result.append(_KANA_TO_ROMAJI[pair])
            i += 2
            continue

        # Single character; unknown characters pass through as-is.
        result.append(_KANA_TO_ROMAJI.get(char, char))
        i += 1

    return ''.join(result)
|
|
|
|
|
def number_to_text(num, ordinal=False):
    """
    Convert a number (int or float) to its English text representation.

    Integers are spelled out as space-separated words ("forty two"). Floats
    are spelled as the integer part, the word "point", then each fractional
    digit individually ("three point one four"). Negative values are prefixed
    with "negative".

    Args:
        num: The number to convert (int or float).
        ordinal (bool): When True and ``num`` is an int, return the ordinal
            form ("twenty first"). Ignored for floats.

    Returns:
        str: Text representation of the number, or the string
        "Input must be a number" when ``num`` is neither int nor float.

    Raises:
        ValueError: If ``num`` is NaN.
        OverflowError: If ``num`` is infinite.
    """
    if not isinstance(num, (int, float)):
        return "Input must be a number"

    is_float = isinstance(num, float)

    if num == 0:
        return "zeroth" if ordinal and not is_float else "zero"

    negative = num < 0
    num = abs(num)

    if is_float:
        # Local import keeps this block self-contained.
        from decimal import Decimal

        # int() raises for NaN / infinity, which cannot be spelled out.
        int_part = int(num)

        # str() of very small or very large floats uses scientific notation
        # ("1e-07"); Decimal expands that into plain positional digits.
        plain = format(Decimal(str(num)), "f")
        fraction_digits = plain.partition(".")[2] or "0"

        result = (
            _int_to_text(int_part)
            + " point "
            + " ".join(_digit_to_text(int(digit)) for digit in fraction_digits)
        )
    else:
        result = _int_to_text(num)
        if ordinal:
            result = _cardinal_to_ordinal(result)

    if negative:
        result = "negative " + result

    return result


# Word tables shared by the helpers below.
_ONES = (
    "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
    "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
    "sixteen", "seventeen", "eighteen", "nineteen",
)
_TENS = (
    "", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy",
    "eighty", "ninety",
)
# Largest scale first; the first scale that fits splits the number.
_SCALES = (
    (1000000000, "billion"),
    (1000000, "million"),
    (1000, "thousand"),
    (100, "hundred"),
)
# Cardinal words whose ordinal is not formed by simply appending "th".
_IRREGULAR_ORDINALS = {
    "one": "first",
    "two": "second",
    "three": "third",
    "five": "fifth",
    "eight": "eighth",
    "nine": "ninth",
    "twelve": "twelfth",
}


def _cardinal_to_ordinal(text):
    """Turn spelled-out cardinal text into an ordinal by rewriting its last word."""
    head, sep, last = text.rpartition(" ")
    if last in _IRREGULAR_ORDINALS:
        last = _IRREGULAR_ORDINALS[last]
    elif last.endswith("y"):
        # twenty -> twentieth
        last = last[:-1] + "ieth"
    else:
        last += "th"
    return head + sep + last


def _int_to_text(num):
    """Helper function to convert a non-negative integer to text.

    Raises:
        ValueError: If ``num`` is negative (callers pass ``abs(num)``).
    """
    if num < 0:
        raise ValueError("_int_to_text expects a non-negative integer")

    if num < 20:
        return _ONES[num]

    if num < 100:
        tens_word = _TENS[num // 10]
        unit = num % 10
        return (tens_word + " " + _ONES[unit]) if unit else tens_word

    for scale, scale_name in _SCALES:
        if num >= scale:
            break

    # num >= 100 here, so the loop always stops on a matching scale. Values of
    # a thousand billion or more recurse on the leading part and read as
    # "one thousand billion".
    leading, remainder = divmod(num, scale)
    text = _int_to_text(leading) + " " + scale_name
    if remainder:
        text += " " + _int_to_text(remainder)
    return text


def _digit_to_text(digit):
    """Convert a single digit (0-9) to text"""
    digits = _ONES[:10]
    return digits[digit]
|
|
|
|
|
|
|
|
# Matches a run of one or more whitespace characters.
_whitespace_re = re.compile(pattern=r"\s+")
|
|
|
|
|
|
|
|
|
|
|
# Per-language list of (compiled pattern, spoken expansion) pairs. Each pattern
# matches the abbreviation at a word boundary followed by a literal period,
# ignoring case.
# NOTE(review): "misess" looks like a misspelling, but it is runtime text and
# is deliberately left unchanged here.
_abbreviations = {
    "en": [
        (re.compile(r"\b" + abbreviation + r"\.", re.IGNORECASE), expansion)
        for abbreviation, expansion in (
            ("mrs", "misess"),
            ("mr", "mister"),
            ("dr", "doctor"),
            ("st", "saint"),
            ("co", "company"),
            ("jr", "junior"),
            ("maj", "major"),
            ("gen", "general"),
            ("drs", "doctors"),
            ("rev", "reverend"),
            ("lt", "lieutenant"),
            ("hon", "honorable"),
            ("sgt", "sergeant"),
            ("capt", "captain"),
            ("esq", "esquire"),
            ("ltd", "limited"),
            ("col", "colonel"),
            ("ft", "fort"),
        )
    ],
}
|
|
|
|
|
|
|
|
def expand_abbreviations_multilingual(text, lang="en"):
    """Replace every known abbreviation for ``lang`` with its expansion.

    Args:
        text (str): Text to process.
        lang (str): Key into ``_abbreviations``.

    Returns:
        str: ``text`` after applying each (pattern, expansion) pair in order.

    Raises:
        KeyError: If ``lang`` has no entry in ``_abbreviations``.
    """
    expanded = text
    for pattern, full_word in _abbreviations[lang]:
        expanded = pattern.sub(full_word, expanded)
    return expanded
|
|
|
|
|
|
|
|
# Per-language list of (compiled pattern, spoken replacement) pairs. Symbols
# are escaped so they match literally; replacements are padded with spaces so
# they never fuse with neighbouring words.
_symbols_multilingual = {
    "en": [
        (re.compile(re.escape(symbol), re.IGNORECASE), spoken)
        for symbol, spoken in (
            ("&", " and "),
            ("@", " at "),
            ("%", " percent "),
            ("#", " hash "),
            ("$", " dollar "),
            ("£", " pound "),
            ("°", " degree "),
        )
    ],
}


def expand_symbols_multilingual(text, lang="en"):
    """Replace known symbols with their spoken form and tidy up spacing.

    Args:
        text (str): Text to process.
        lang (str): Key into ``_symbols_multilingual``.

    Returns:
        str: ``text`` with symbols replaced, runs of spaces collapsed to a
        single space, and leading/trailing whitespace stripped.

    Raises:
        KeyError: If ``lang`` has no entry in ``_symbols_multilingual``.
    """
    for regex, replacement in _symbols_multilingual[lang]:
        text = regex.sub(replacement, text)

    # The padded replacements leave doubled spaces behind ("a & b" becomes
    # "a  and  b"); collapse any run of two or more spaces.
    text = re.sub(r" {2,}", " ", text)
    return text.strip()
|
|
|
|
|
|
|
|
# Ordinal numerals per language: group 1 is the digits, group 2 the suffix.
_ordinal_re = {
    "en": re.compile(r"([0-9]+)(st|nd|rd|th)"),
}
# Any run of digits.
_number_re = re.compile(r"[0-9]+")
# Amounts with a currency symbol directly before or after the digits.
_currency_re = {
    "USD": re.compile(r"((\$[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+\$))"),
    "GBP": re.compile(r"((£[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+£))"),
    "EUR": re.compile(r"(([0-9\.\,]*[0-9]+€)|((€[0-9\.\,]*[0-9]+)))"),
}

# Numbers grouped in thousands with commas and an optional ".ddd" fraction.
_comma_number_re = re.compile(r"\b\d{1,3}(,\d{3})*(\.\d+)?\b")
# Numbers grouped in thousands with dots and an optional ",ddd" fraction.
# The separator dot is escaped so that only a literal "." counts as a
# thousands separator (an unescaped "." would match any character).
_dot_number_re = re.compile(r"\b\d{1,3}(\.\d{3})*(\,\d+)?\b")
# Digits, a single "." or "," separator, then more digits.
_decimal_number_re = re.compile(r"([0-9]+[.,][0-9]+)")
|
|
|
|
|
|
|
|
def _remove_commas(m):
    """Return the full text of match ``m`` with every comma removed."""
    return m.group(0).replace(",", "")
|
|
|
|
|
|
|
|
def _remove_dots(m):
    """Return the full text of match ``m`` with every dot removed."""
    return m.group(0).replace(".", "")
|
|
|
|
|
|
|
|
def _expand_decimal_point(m, lang="en"):
    """Spell out a matched decimal number.

    Group 1 of ``m`` holds the number; a comma is treated as the decimal
    separator. ``lang`` is accepted but not used by this function.
    """
    normalized = m.group(1).replace(",", ".")
    value = float(normalized)
    return number_to_text(value)
|
|
|
|
|
|
|
|
def _expand_currency(m, lang="en", currency="USD"):
    """Spell out a matched currency amount, e.g. "$5" -> "five dollars".

    Args:
        m: Regex match whose full text is an amount with a currency symbol
            attached before or after the digits.
        lang (str): Language code. Accepted for signature compatibility; the
            wording comes from ``number_to_text``, which produces English.
        currency (str): "USD", "GBP" or "EUR"; selects the unit word appended
            to the amount. For any other code no unit word is appended.

    Returns:
        str: The amount in words followed by the unit. Whole amounts are
        spoken as integers ("five dollars", not "five point zero dollars").

    Raises:
        ValueError: If the digits do not form a valid number (e.g. "1.2.3").
    """
    # A comma is read as the decimal separator; everything that is not a digit
    # or a dot (i.e. the currency symbol) is then dropped.
    digits = re.sub(r"[^\d.]", "", m.group(0).replace(",", "."))
    amount = float(digits)

    if amount.is_integer():
        # Pass an int so no spurious " point zero" is spoken.
        spoken = number_to_text(int(amount))
    else:
        spoken = number_to_text(amount)

    unit_words = {
        "USD": ("dollar", "dollars"),
        "GBP": ("pound", "pounds"),
        "EUR": ("euro", "euros"),
    }.get(currency)
    if unit_words is None:
        return spoken

    singular, plural = unit_words
    return spoken + " " + (singular if amount == 1 else plural)
|
|
|
|
|
|
|
|
def _expand_ordinal(m, lang="en"):
    """Spell out a matched ordinal numeral.

    Group 1 of ``m`` holds the digits; they are passed to ``number_to_text``
    with ``ordinal=True``. ``lang`` is accepted but not used by this function.
    """
    value = int(m.group(1))
    return number_to_text(value, ordinal=True)
|
|
|
|
|
|
|
|
def _expand_number(m, lang="en"):
    """Spell out a matched run of digits.

    The full match text is parsed as an int and passed to ``number_to_text``.
    ``lang`` is accepted but not used by this function.
    """
    value = int(m.group(0))
    return number_to_text(value)
|
|
|
|
|
|
|
|
def expand_numbers_multilingual(text, lang="en"):
    """Replace numerals in ``text`` with their spelled-out form.

    Steps, in order: strip thousands separators (commas for "en"/"ru", dots
    otherwise), expand currency amounts, then decimals, ordinals and finally
    any remaining runs of digits.

    Args:
        text (str): Text to process.
        lang (str): Language code; selects the thousands separator and the
            ordinal pattern.

    Returns:
        str: ``text`` with numbers expanded.

    Raises:
        KeyError: If ``lang`` has no entry in ``_ordinal_re``.
    """
    if lang in ("en", "ru"):
        text = _comma_number_re.sub(_remove_commas, text)
    else:
        text = _dot_number_re.sub(_remove_dots, text)

    # Currency expansion is best-effort. Each currency is attempted on its own
    # so that one malformed amount (e.g. "£1.2.3") does not stop the others
    # from being expanded.
    for currency in ("GBP", "USD", "EUR"):
        try:
            text = _currency_re[currency].sub(
                lambda m, currency=currency: _expand_currency(m, lang, currency),
                text,
            )
        except Exception:
            # Leave amounts in this currency unexpanded.
            pass

    text = _decimal_number_re.sub(lambda m: _expand_decimal_point(m, lang), text)
    text = _ordinal_re[lang].sub(lambda m: _expand_ordinal(m, lang), text)
    text = _number_re.sub(lambda m: _expand_number(m, lang), text)
    return text
|
|
|
|
|
|
|
|
def lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered
|
|
|
|
|
|
|
|
def collapse_whitespace(text):
    """Replace every match of ``_whitespace_re`` in ``text`` with a single space."""
    return _whitespace_re.sub(" ", text)
|
|
|
|
|
|
|
|
def multilingual_cleaners(text, lang):
    """Run the full text-cleaning pipeline for ``lang``.

    Removes double quotes, lowercases, expands numbers, abbreviations and
    symbols, and finally collapses whitespace. Each expansion step is
    best-effort: if it raises, the text from before that step is kept and the
    pipeline continues.

    Args:
        text (str): Text to clean.
        lang (str): Language code passed to each expansion step.

    Returns:
        str: The cleaned text.
    """
    text = text.replace('"', "")

    if lang == "tr":
        # Map the Turkish capitals explicitly before lowercasing; in Python,
        # "İ".lower() yields "i" followed by a combining dot, not a plain "i".
        text = text.replace("İ", "i")
        text = text.replace("Ö", "ö")
        text = text.replace("Ü", "ü")

    text = lowercase(text)

    expansion_steps = (
        expand_numbers_multilingual,
        expand_abbreviations_multilingual,
        expand_symbols_multilingual,
    )
    for expand in expansion_steps:
        try:
            text = expand(text, lang)
        except Exception:
            # Deliberately best-effort (for example, a step may have no table
            # for this language). Catching Exception rather than using a bare
            # except lets KeyboardInterrupt and SystemExit propagate.
            pass

    return collapse_whitespace(text)
|
|
|
|
|
|
|
|
def basic_cleaners(text):
    """Minimal pipeline: lowercase the text, then collapse whitespace. No transliteration."""
    return collapse_whitespace(lowercase(text))
|
|
|