File size: 950 Bytes
3b83e0c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.base import BaseEstimator, TransformerMixin
import nltk
import contractions

# Fetch the NLTK data packages needed below, in a fixed order
for _resource in ('punkt_tab', 'wordnet'):
    nltk.download(_resource)

# Custom transformer for preprocessing text
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Transformer that normalizes raw text documents.

    For each document the contractions are expanded, the text is
    lowercased, split into tokens with ``nltk.word_tokenize`` and every
    token is passed through a ``WordNetLemmatizer``. The resulting tokens
    are joined back into one space-separated string.
    """

    def __init__(self):
        # A single lemmatizer is created here and reused for all documents.
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        """Return this instance unchanged; ``X`` and ``y`` are not used."""
        return self

    def transform(self, X):
        """Preprocess every document in ``X``.

        Args:
            X: Iterable of documents; each one is handed to
                ``contractions.fix``.

        Returns:
            A list holding one preprocessed string per input document,
            in the same order as ``X``.
        """
        return [self._normalize(text) for text in X]

    def _normalize(self, text):
        """Expand contractions, lowercase, tokenize and lemmatize one document."""
        lowered_text = contractions.fix(text).lower()
        tokens = nltk.word_tokenize(lowered_text)
        lemmatize = self.lemmatizer.lemmatize
        return " ".join(map(lemmatize, tokens))