submission-template/tasks/utils/text_preprocessor.py
Terry Zhang
update preprocess structure and model
296146e
raw
history blame contribute delete
512 Bytes
import nltk
from nltk.stem import WordNetLemmatizer
import contractions
def preprocess(X):
    """Normalize each document in *X* and return the results as a list.

    Every document goes through the same pipeline, in this order:

    1. contractions are expanded with ``contractions.fix``;
    2. the text is lowercased;
    3. the text is split with ``nltk.word_tokenize`` and each token is
       passed to ``WordNetLemmatizer.lemmatize`` with its default arguments;
    4. the resulting tokens are re-joined with single spaces.

    Args:
        X: An iterable of documents. Each document is assumed to be a
            ``str`` (it must support ``.lower()`` after contraction
            expansion) -- TODO confirm against callers.

    Returns:
        A list with one normalized string per input document, in input order.
    """
    # NOTE(review): presumably the NLTK tokenizer and WordNet data must
    # already be downloaded for this to run -- verify in the deployment setup.
    wordnet = WordNetLemmatizer()  # one shared instance for all documents

    def _normalize(text):
        # Expand contractions first, then lowercase the expanded text.
        cleaned = contractions.fix(text).lower()
        tokens = nltk.word_tokenize(cleaned)
        return " ".join(map(wordnet.lemmatize, tokens))

    return [_normalize(text) for text in X]