Terry Zhang committed on
Commit
4c44667
·
1 Parent(s): 9bcb67c

move text preprocessor into app

Browse files
Files changed (2) hide show
  1. tasks/text.py +34 -2
  2. tasks/utils/text_preprocessor.py +0 -30
tasks/text.py CHANGED
@@ -4,7 +4,6 @@ from datasets import load_dataset
4
  from sklearn.metrics import accuracy_score
5
  import random
6
  from skops.io import load
7
- from .utils.text_preprocessor import TextPreprocessor
8
 
9
 
10
  from .utils.evaluation import TextEvaluationRequest
@@ -20,6 +19,39 @@ models_description = {
20
  "tfidf_xgb": "TF-IDF vectorizer and XGBoost classifier",
21
  }
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  # Some code borrowed from Nonnormalizable
24
 
25
  def baseline_model(dataset_length: int):
@@ -36,7 +68,7 @@ def tree_classifier(test_dataset: dict, model: str):
36
 
37
  model = load(model_path,
38
  trusted=[
39
- 'utils.text_preprocessor.TextPreprocessor',
40
  'nltk.stem.wordnet.WordNetLemmatizer',
41
  'xgboost.core.Booster',
42
  'xgboost.sklearn.XGBClassifier'])
 
4
  from sklearn.metrics import accuracy_score
5
  import random
6
  from skops.io import load
 
7
 
8
 
9
  from .utils.evaluation import TextEvaluationRequest
 
19
  "tfidf_xgb": "TF-IDF vectorizer and XGBoost classifier",
20
  }
21
 
22
+ # Textpreprocessor
23
+ import nltk
24
+ from nltk.stem import WordNetLemmatizer
25
+ from sklearn.base import BaseEstimator, TransformerMixin
26
+ import nltk
27
+ import contractions
28
+
29
+ # Download required NLTK resources
30
+ nltk.download('punkt_tab')
31
+ nltk.download('wordnet')
32
+
33
+ # Custom sklearn transformer for preprocessing text
34
+ class TextPreprocessor(BaseEstimator, TransformerMixin):
35
+ def __init__(self):
36
+ self.lemmatizer = WordNetLemmatizer()
37
+
38
+ def fit(self, X, y=None):
39
+ return self
40
+
41
+ def transform(self, X):
42
+ preprocessed_texts = []
43
+ for doc in X:
44
+ # Expand contractions
45
+ expanded = contractions.fix(doc)
46
+ # Lowercase
47
+ lowered = expanded.lower()
48
+
49
+ # Tokenize and lemmatize
50
+ lemmatized = " ".join([self.lemmatizer.lemmatize(word) for word in nltk.word_tokenize(lowered)])
51
+ preprocessed_texts.append(lemmatized)
52
+ return preprocessed_texts
53
+
54
+
55
  # Some code borrowed from Nonnormalizable
56
 
57
  def baseline_model(dataset_length: int):
 
68
 
69
  model = load(model_path,
70
  trusted=[
71
+ '__main__.TextPreprocessor',
72
  'nltk.stem.wordnet.WordNetLemmatizer',
73
  'xgboost.core.Booster',
74
  'xgboost.sklearn.XGBClassifier'])
tasks/utils/text_preprocessor.py DELETED
@@ -1,30 +0,0 @@
1
- import nltk
2
- from nltk.stem import WordNetLemmatizer
3
- from sklearn.base import BaseEstimator, TransformerMixin
4
- import nltk
5
- import contractions
6
-
7
- # Download required NLTK resources
8
- nltk.download('punkt_tab')
9
- nltk.download('wordnet')
10
-
11
- # Custom transformer for preprocessing text
12
- class TextPreprocessor(BaseEstimator, TransformerMixin):
13
- def __init__(self):
14
- self.lemmatizer = WordNetLemmatizer()
15
-
16
- def fit(self, X, y=None):
17
- return self # Does nothing, just returns the instance
18
-
19
- def transform(self, X):
20
- preprocessed_texts = []
21
- for doc in X:
22
- # Expand contractions
23
- expanded = contractions.fix(doc)
24
- # Lowercase
25
- lowered = expanded.lower()
26
-
27
- # Tokenize and lemmatize
28
- lemmatized = " ".join([self.lemmatizer.lemmatize(word) for word in nltk.word_tokenize(lowered)])
29
- preprocessed_texts.append(lemmatized)
30
- return preprocessed_texts