submission-template / tasks /utils /text_preprocessor.py
Terry Zhang
update code to include tree classifier
3b83e0c
raw
history blame
950 Bytes
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.base import BaseEstimator, TransformerMixin
import nltk
import contractions
# Fetch the NLTK data packages used below at import time: the "punkt_tab"
# tokenizer data and the "wordnet" corpus.
for _resource in ('punkt_tab', 'wordnet'):
    nltk.download(_resource)
# Custom transformer for preprocessing text
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Scikit-learn compatible transformer that normalizes raw text documents.

    Each document is run through ``contractions.fix``, lowercased, tokenized
    with ``nltk.word_tokenize`` and lemmatized token by token; the tokens are
    then re-joined with single spaces. The transformer takes no constructor
    arguments and learns nothing from the data.
    """

    def __init__(self):
        # One lemmatizer instance is shared by every transform() call.
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        """Return the transformer unchanged; there is nothing to fit.

        Args:
            X: Ignored.
            y: Ignored.

        Returns:
            This instance.
        """
        return self

    def transform(self, X):
        """Preprocess every document in ``X``.

        Args:
            X: Iterable of text documents (one string per document).

        Returns:
            A list of preprocessed strings, one per input document, in the
            same order as the input.

        Raises:
            ValueError: If ``X`` is a single string rather than an iterable
                of documents.
        """
        # A bare string is itself iterable, so without this guard it would be
        # silently treated as a sequence of one-character documents.
        if isinstance(X, str):
            raise ValueError(
                "Iterable over raw text documents expected, "
                "string object received."
            )
        return [self._preprocess(doc) for doc in X]

    def _preprocess(self, doc):
        """Expand contractions, lowercase, tokenize and lemmatize one document."""
        expanded = contractions.fix(doc)
        tokens = nltk.word_tokenize(expanded.lower())
        # lemmatize() is called without a part-of-speech tag, so each token is
        # reduced using the lemmatizer's default.
        return " ".join(self.lemmatizer.lemmatize(token) for token in tokens)