```python
import re
import string

import nltk
import pymorphy2
from nltk.tokenize import word_tokenize

nltk.download("punkt")

# Build the analyzer once: pymorphy2.MorphAnalyzer() loads its
# dictionaries on construction, which is too slow to repeat per call.
morph = pymorphy2.MorphAnalyzer()


def clean_text(text: str) -> str:
    text = text.lower()
    # Drop words with a character repeated 3+ times ("нууу", "loool").
    text = re.sub(r"\w*(\w)\1{2,}\w*", "", text)
    # Drop tokens that start with digits (years, counts, "1999г").
    text = re.sub(r"\d+\w*", "", text)
    # Drop bracketed fragments such as "[музыка]".
    text = re.sub(r"\[.*?\]", "", text)
    # Strip ASCII punctuation.
    text = text.translate(str.maketrans("", "", string.punctuation))
    return text


def lemmize_and_tokenize_text(text: str) -> list[str]:
    tokens = word_tokenize(text)
    # parse() orders hypotheses by likelihood; keep the lemma
    # (normal form) of the most probable one.
    lemmas = [morph.parse(token)[0].normal_form for token in tokens]
    return lemmas


def data_preprocessing(text: str) -> list[str]:
    cleaned_text = clean_text(text)
    lemmized_text = lemmize_and_tokenize_text(cleaned_text)
    return lemmized_text
```
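A quick usage sketch: the sample sentence is illustrative, and the printed lemmas are approximate, since the exact output depends on the installed pymorphy2 dictionaries.

```python
# Bracketed noise, the numeric token "1999г", and punctuation are
# stripped by clean_text; the rest is tokenized and lemmatized.
tokens = data_preprocessing("Котики бежали по крышам [шум] в 1999г!!!")
print(tokens)
# Something like: ['котик', 'бежать', 'по', 'крыша', 'в']
```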