# File size: 950 Bytes
# 3b83e0c
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.base import BaseEstimator, TransformerMixin
import nltk
import contractions
# Fetch the NLTK data packages this module asks for ('punkt_tab' and 'wordnet').
# This runs at import time, once per listed resource, in the order given.
for _nltk_resource in ('punkt_tab', 'wordnet'):
    nltk.download(_nltk_resource)
# Custom transformer for preprocessing text
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that normalizes raw text documents.

    Every document goes through the same pipeline: contractions are
    expanded, the text is lowercased, it is tokenized with
    ``nltk.word_tokenize``, and each token is run through a
    ``WordNetLemmatizer``. The processed tokens are re-joined with single
    spaces.
    """

    def __init__(self):
        # A single lemmatizer instance is reused by every transform() call.
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        """Return this instance unchanged; nothing is learned from X or y."""
        return self

    def _normalize(self, text):
        """Return *text* expanded, lowercased and lemmatized as one string."""
        tokens = nltk.word_tokenize(contractions.fix(text).lower())
        return " ".join(map(self.lemmatizer.lemmatize, tokens))

    def transform(self, X):
        """Return a list holding the preprocessed form of each document in X.

        X is iterated once and each element is handed to
        ``contractions.fix`` (presumably each element is a string -- verify
        against callers). The output list preserves the order of X.
        """
        return [self._normalize(text) for text in X]