Spaces:
Runtime error
Runtime error
| import re | |
# Punctuation normalisation table used by clean_text: each key is a symbol to
# replace and each value is its replacement ("" deletes the symbol).
#
# NOTE(review): the non-ASCII keys in the previous copy of this table were
# mis-decoded (UTF-8 bytes shown through a single-byte Greek code page), which
# collapsed several distinct characters into duplicate keys. They are restored
# here as \u escapes so the source stays pure ASCII and cannot be damaged the
# same way again. Entries marked "inferred" lost their distinguishing byte and
# were reconstructed from their position and replacement value -- confirm them
# against the upstream file.
SYMBOLS_MAPPING = {
    "\n": "",
    "\u2026": ".",  # horizontal ellipsis
    "\u201c": "'",  # left double quotation mark (inferred)
    "\u201d": "'",  # right double quotation mark (inferred)
    "\u2018": "'",  # left single quotation mark (inferred)
    "\u2019": "'",  # right single quotation mark (inferred)
    "\u3010": "",  # left black lenticular bracket (inferred)
    "\u3011": "",  # right black lenticular bracket (inferred)
    "[": "",
    "]": "",
    "\uff08": "",  # fullwidth left parenthesis (inferred)
    "\uff09": "",  # fullwidth right parenthesis (inferred)
    "(": "",
    ")": "",
    "\u30fb": "",  # katakana middle dot
    "\u00b7": "",  # middle dot
    "\u300c": "'",  # left corner bracket (inferred)
    "\u300d": "'",  # right corner bracket (inferred)
    "\u300a": "'",  # left double angle bracket (inferred)
    "\u300b": "'",  # right double angle bracket (inferred)
    "\u2014": "",  # em dash (inferred)
    "\uff5e": "",  # fullwidth tilde (inferred)
    "~": "",
    "\uff1a": ",",  # fullwidth colon (inferred)
    "\uff1b": ",",  # fullwidth semicolon (inferred)
    ";": ",",
    ":": ",",
}
# Pattern matching any single key of SYMBOLS_MAPPING.
# Keys are tried longest first: regex alternation takes the first alternative
# that matches at a position, so a key that is a prefix of a longer key would
# otherwise shadow it and the longer entry could never be selected. The sort
# is stable, so keys of equal length keep their order in the mapping.
REPLACE_SYMBOL_REGEX = re.compile(
    "|".join(
        re.escape(key)
        for key in sorted(SYMBOLS_MAPPING, key=len, reverse=True)
    )
)
# Matches one or more consecutive characters that fall in any of the emoji
# code-point ranges listed below; clean_text deletes every match.
EMOJI_REGEX = re.compile(
    "[{}]+".format(
        "".join(
            (
                "\U0001F600-\U0001F64F",  # emoticons
                "\U0001F300-\U0001F5FF",  # symbols & pictographs
                "\U0001F680-\U0001F6FF",  # transport & map symbols
                "\U0001F1E0-\U0001F1FF",  # flags (iOS)
            )
        )
    ),
    flags=re.UNICODE,
)
def clean_text(text):
    """Return ``text`` with punctuation normalised and emojis removed.

    The steps run in this order:

    1. Leading and trailing whitespace is stripped.
    2. Every match of ``REPLACE_SYMBOL_REGEX`` is replaced by the value
       stored for it in ``SYMBOLS_MAPPING``.
    3. Every match of ``EMOJI_REGEX`` is deleted.
    4. Any run of two or more ``.`` / ``,`` characters (mixed runs included)
       is collapsed to the first character of that run.
    """

    def _mapped_symbol(match):
        # The pattern is built from the mapping's keys, so the lookup uses
        # the matched text directly.
        return SYMBOLS_MAPPING[match.group()]

    def _first_of_run(match):
        return match.group()[0]

    trimmed = text.strip()
    normalised = REPLACE_SYMBOL_REGEX.sub(_mapped_symbol, trimmed)
    without_emojis = EMOJI_REGEX.sub("", normalised)
    return re.sub(r"[.,]{2,}", _first_of_run, without_emojis)