Terry Zhang committed on
Commit
b562460
·
1 Parent(s): 4c44667

move preprocessor

Browse files
Files changed (1) hide show
  1. tasks/text.py +32 -31
tasks/text.py CHANGED
@@ -19,37 +19,7 @@ models_description = {
19
  "tfidf_xgb": "TF-IDF vectorizer and XGBoost classifier",
20
  }
21
 
22
- # Textpreprocessor
23
- import nltk
24
- from nltk.stem import WordNetLemmatizer
25
- from sklearn.base import BaseEstimator, TransformerMixin
26
- import nltk
27
- import contractions
28
-
29
- # Download required NLTK resources
30
- nltk.download('punkt_tab')
31
- nltk.download('wordnet')
32
-
33
- # Custom sklearn transformer for preprocessing text
34
- class TextPreprocessor(BaseEstimator, TransformerMixin):
35
- def __init__(self):
36
- self.lemmatizer = WordNetLemmatizer()
37
-
38
- def fit(self, X, y=None):
39
- return self
40
-
41
- def transform(self, X):
42
- preprocessed_texts = []
43
- for doc in X:
44
- # Expand contractions
45
- expanded = contractions.fix(doc)
46
- # Lowercase
47
- lowered = expanded.lower()
48
-
49
- # Tokenize and lemmatize
50
- lemmatized = " ".join([self.lemmatizer.lemmatize(word) for word in nltk.word_tokenize(lowered)])
51
- preprocessed_texts.append(lemmatized)
52
- return preprocessed_texts
53
 
54
 
55
  # Some code borrowed from Nonnormalizable
@@ -61,6 +31,37 @@ def baseline_model(dataset_length: int):
61
  return predictions
62
 
63
  def tree_classifier(test_dataset: dict, model: str):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
  texts = test_dataset["quote"]
66
 
 
19
  "tfidf_xgb": "TF-IDF vectorizer and XGBoost classifier",
20
  }
21
 
22
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
 
25
  # Some code borrowed from Nonnormalizable
 
31
  return predictions
32
 
33
  def tree_classifier(test_dataset: dict, model: str):
34
+ # Textpreprocessor defined in this scope
35
+ import nltk
36
+ from nltk.stem import WordNetLemmatizer
37
+ from sklearn.base import BaseEstimator, TransformerMixin
38
+ import nltk
39
+ import contractions
40
+
41
+ # Download required NLTK resources
42
+ nltk.download('punkt_tab')
43
+ nltk.download('wordnet')
44
+
45
+ # Custom sklearn transformer for preprocessing text
46
+ class TextPreprocessor(BaseEstimator, TransformerMixin):
47
+ def __init__(self):
48
+ self.lemmatizer = WordNetLemmatizer()
49
+
50
+ def fit(self, X, y=None):
51
+ return self
52
+
53
+ def transform(self, X):
54
+ preprocessed_texts = []
55
+ for doc in X:
56
+ # Expand contractions
57
+ expanded = contractions.fix(doc)
58
+ # Lowercase
59
+ lowered = expanded.lower()
60
+
61
+ # Tokenize and lemmatize
62
+ lemmatized = " ".join([self.lemmatizer.lemmatize(word) for word in nltk.word_tokenize(lowered)])
63
+ preprocessed_texts.append(lemmatized)
64
+ return preprocessed_texts
65
 
66
  texts = test_dataset["quote"]
67