# File size: 512 Bytes (revision 296146e)
import nltk
from nltk.stem import WordNetLemmatizer
import contractions
def preprocess(X):
    """Normalize a collection of text documents.

    Each document is passed through ``contractions.fix`` to expand
    contractions, lowercased, split into tokens with
    ``nltk.word_tokenize``, and every token is run through
    ``WordNetLemmatizer.lemmatize``. The lemmatized tokens are then
    re-joined with single spaces.

    Args:
        X: Iterable of strings, one per document. A bare ``str`` would be
            iterated character by character, so wrap a single document
            in a list.

    Returns:
        A list of preprocessed strings, one per input document, in the
        same order as ``X``.

    Note:
        ``lemmatize`` is called without a part-of-speech argument, so
        NLTK's default applies to every token.
        NOTE(review): the NLTK tokenizer and WordNet data are presumably
        installed elsewhere in the project -- confirm before first use.
    """
    # Bind the per-token / per-document callables once; none of them
    # change between documents, so there is no need to look them up
    # again on every iteration.
    lemmatize = WordNetLemmatizer().lemmatize
    tokenize = nltk.word_tokenize
    expand = contractions.fix

    return [
        " ".join(lemmatize(token) for token in tokenize(expand(doc).lower()))
        for doc in X
    ]