from typing import Iterable, List

import contractions
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.base import BaseEstimator, TransformerMixin
# Download required NLTK resources at import time:
#   - 'punkt_tab' is requested for nltk.word_tokenize
#   - 'wordnet' is requested for WordNetLemmatizer
nltk.download('punkt_tab')
nltk.download('wordnet')
# Custom transformer for preprocessing text
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Scikit-learn compatible transformer that normalizes raw text documents.

    For every document the transformer:

    1. expands contractions with ``contractions.fix``,
    2. lowercases the text,
    3. tokenizes it with ``nltk.word_tokenize``,
    4. lemmatizes each token with ``WordNetLemmatizer.lemmatize`` (called
       without a part-of-speech argument, so the lemmatizer's default is
       used), and
    5. joins the lemmatized tokens back together with single spaces.

    The transformer has no hyper-parameters and learns nothing in ``fit``.
    """

    def __init__(self):
        # Helper object created once and reused for every document.
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        """Do nothing and return the instance.

        Args:
            X: Ignored.
            y: Ignored.

        Returns:
            This transformer instance.
        """
        return self

    def transform(self, X: Iterable[str]) -> List[str]:
        """Preprocess every document in ``X``.

        Args:
            X: Iterable of raw text documents (each a ``str``).

        Returns:
            A list with one preprocessed string per input document, in the
            same order as ``X``.

        Raises:
            ValueError: If ``X`` is a single string rather than an iterable
                of documents.
        """
        # A bare string is itself iterable, so without this guard it would be
        # processed character by character and produce garbage silently.
        if isinstance(X, str):
            raise ValueError(
                "Iterable over raw text documents expected, "
                "string object received."
            )
        return [self._preprocess(doc) for doc in X]

    def _preprocess(self, doc: str) -> str:
        """Return the expanded, lowercased and lemmatized form of one document."""
        expanded = contractions.fix(doc)
        lowered = expanded.lower()
        tokens = nltk.word_tokenize(lowered)
        return " ".join(self.lemmatizer.lemmatize(token) for token in tokens)