import nltk
import contractions
from nltk.stem import WordNetLemmatizer
from sklearn.base import BaseEstimator, TransformerMixin

# Download required NLTK resources
nltk.download('punkt_tab')
nltk.download('wordnet')

# Custom transformer for preprocessing text
class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        return self  # Stateless: nothing to learn, just return the instance

    def transform(self, X):
        preprocessed_texts = []
        for doc in X:
            # Expand contractions (e.g. "don't" -> "do not")
            expanded = contractions.fix(doc)
            # Lowercase
            lowered = expanded.lower()
            # Tokenize, lemmatize each token, then rejoin into a single string
            lemmatized = " ".join(
                self.lemmatizer.lemmatize(word)
                for word in nltk.word_tokenize(lowered)
            )
            preprocessed_texts.append(lemmatized)
        return preprocessed_texts
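Because TextPreprocessor implements fit and transform, it can slot directly into a scikit-learn Pipeline ahead of a vectorizer. The snippet below is a minimal usage sketch; the TfidfVectorizer, LogisticRegression classifier, and the toy documents and labels are illustrative assumptions, not part of the original code.

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Hypothetical pipeline: preprocess -> vectorize -> classify
pipeline = Pipeline([
    ("preprocess", TextPreprocessor()),
    ("tfidf", TfidfVectorizer()),
    ("clf", LogisticRegression()),
])

# Toy data purely for illustration
docs = ["I don't like waiting", "She's loving the new release"]
labels = [0, 1]

pipeline.fit(docs, labels)
print(pipeline.predict(["They aren't waiting anymore"]))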