File size: 512 Bytes
296146e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
import nltk
from nltk.stem import WordNetLemmatizer
import contractions



def preprocess(X):
    """Normalize every document in ``X`` and return the results as a list.

    Each document goes through the same pipeline:

    1. contractions are expanded with ``contractions.fix``;
    2. the text is lowercased;
    3. it is split into tokens with ``nltk.word_tokenize``;
    4. every token is passed through ``WordNetLemmatizer.lemmatize``
       (called with the token only, i.e. the lemmatizer's default settings);
    5. the resulting tokens are re-joined with single spaces.

    Args:
        X: Iterable of documents. Each item is handed to
            ``contractions.fix`` and then ``.lower()``, so items are
            presumably plain strings -- confirm against callers.

    Returns:
        A list with one normalized string per input document, in the
        same order as ``X``.
    """
    # One lemmatizer instance is shared across all documents.
    wnl = WordNetLemmatizer()

    def _normalize(text):
        # Expand contractions first, then lowercase, then tokenize.
        tokens = nltk.word_tokenize(contractions.fix(text).lower())
        return " ".join(map(wnl.lemmatize, tokens))

    return [_normalize(text) for text in X]