import nltk
from nltk.stem import WordNetLemmatizer
import contractions

# NLTK data required on first run: nltk.download('punkt') and nltk.download('wordnet')

def preprocess(X):
    """Expand contractions, lowercase, tokenize, and lemmatize each document in X."""
    lemmatizer = WordNetLemmatizer()
    preprocessed_texts = []
    for doc in X:
        # Expand contractions, e.g. "don't" -> "do not"
        expanded = contractions.fix(doc)
        # Lowercase
        lowered = expanded.lower()
        # Tokenize, lemmatize each token, and rejoin into a single string
        lemmatized = " ".join(
            lemmatizer.lemmatize(word) for word in nltk.word_tokenize(lowered)
        )
        preprocessed_texts.append(lemmatized)
    return preprocessed_texts
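
# A minimal usage sketch, not part of the original function: it assumes the
# `contractions` package is installed and the NLTK 'punkt' and 'wordnet' data
# are available; the sample sentences below are illustrative only.
if __name__ == "__main__":
    sample_docs = [
        "The cats aren't running anymore.",
        "She's written several well-received papers.",
    ]
    # Prints the cleaned documents; for example "cats" is lemmatized to "cat"
    # and "aren't" is expanded to "are not" before tokenization.
    print(preprocess(sample_docs))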