Spaces:
Runtime error
Runtime error
| import re | |
| import unicodedata | |
| import regex | |
# Non-ASCII letters that "NFKD" normalization does not separate into a base
# letter plus combining marks, mapped by hand to an ASCII replacement.
# Every replacement follows the case of its key: lowercase letters map to
# lowercase text and uppercase letters map to uppercase text.
ADDITIONAL_DIACRITICS = {
    "œ": "oe",
    "Œ": "OE",
    "ø": "o",
    "Ø": "O",
    "æ": "ae",
    "Æ": "AE",
    "ß": "ss",
    "ẞ": "SS",
    "đ": "d",
    "Đ": "D",
    "ð": "d",
    "Ð": "D",
    "þ": "th",
    "Þ": "TH",  # uppercase thorn: keep the replacement uppercase like its siblings
    "ł": "l",
    "Ł": "L",
}
def remove_symbols_and_diacritics(s: str, keep=""):
    """
    Strip diacritics and turn marks, symbols and punctuation into spaces.

    The input is first decomposed with "NFKD" normalization; each resulting
    character is then handled by the first rule that applies:

    1. characters contained in `keep` are passed through unchanged;
    2. characters listed in ADDITIONAL_DIACRITICS are replaced by their
       manual mapping;
    3. non-spacing marks (category "Mn") are dropped;
    4. any other mark, symbol or punctuation (major category M, S or P)
       becomes a single space;
    5. everything else is kept as is.
    """

    def convert(ch):
        # `keep` is checked against the already-decomposed character.
        if ch in keep:
            return ch
        if ch in ADDITIONAL_DIACRITICS:
            return ADDITIONAL_DIACRITICS[ch]
        category = unicodedata.category(ch)
        if category == "Mn":
            return ""
        if category[0] in "MSP":
            return " "
        return ch

    decomposed = unicodedata.normalize("NFKD", s)
    return "".join(map(convert, decomposed))
def remove_symbols(s: str):
    """
    Turn every mark, symbol and punctuation character into a space.

    The input is composed with "NFKC" normalization first, so letters that
    have a precomposed accented form keep their diacritics.
    """
    pieces = []
    for ch in unicodedata.normalize("NFKC", s):
        major_class = unicodedata.category(ch)[0]
        # M = marks, S = symbols, P = punctuation
        pieces.append(" " if major_class in "MSP" else ch)
    return "".join(pieces)
class BasicTextNormalizer:
    """
    Callable that lowercases text, removes bracketed/parenthesized spans,
    replaces symbols with spaces and collapses runs of whitespace.

    remove_diacritics: when true, cleaning uses remove_symbols_and_diacritics;
        otherwise remove_symbols is used.
    split_letters: when true, the cleaned text is split into `\\X` matches
        (as found by the `regex` module) joined by single spaces.
    """

    def __init__(self, remove_diacritics: bool = False, split_letters: bool = False):
        if remove_diacritics:
            self.clean = remove_symbols_and_diacritics
        else:
            self.clean = remove_symbols
        self.split_letters = split_letters

    def __call__(self, s: str):
        """Return the normalized form of `s`."""
        text = s.lower()

        # First drop <...> / [...] spans, then non-empty (...) spans.
        for pattern in (r"[<\[][^>\]]*[>\]]", r"\(([^)]+?)\)"):
            text = re.sub(pattern, "", text)

        # Lowercase again after cleaning, since the cleaner normalizes the text.
        text = self.clean(text).lower()

        if self.split_letters:
            clusters = regex.findall(r"\X", text, regex.U)
            text = " ".join(clusters)

        # Replace any successive whitespace characters with a single space.
        return re.sub(r"\s+", " ", text)