Terry Zhang committed on
Commit 296146e · 1 Parent(s): b562460

update preprocess structure and model

tasks/text.py CHANGED
@@ -4,10 +4,17 @@ from datasets import load_dataset
 from sklearn.metrics import accuracy_score
 import random
 from skops.io import load
+# Textpreprocessor defined in this scope
+import nltk
+
+# Download required NLTK resources
+nltk.download('punkt_tab')
+nltk.download('wordnet')
 
 
 from .utils.evaluation import TextEvaluationRequest
 from .utils.emissions import tracker, clean_emissions_data, get_space_info
+from .utils.text_preprocessor import preprocess
 
 router = APIRouter()
 
@@ -20,8 +27,6 @@ models_description = {
 }
 
 
-
-
 # Some code borrowed from Nonnormalizable
 
 def baseline_model(dataset_length: int):
@@ -31,46 +36,15 @@ def baseline_model(dataset_length: int):
     return predictions
 
 def tree_classifier(test_dataset: dict, model: str):
-    # Textpreprocessor defined in this scope
-    import nltk
-    from nltk.stem import WordNetLemmatizer
-    from sklearn.base import BaseEstimator, TransformerMixin
-    import nltk
-    import contractions
-
-    # Download required NLTK resources
-    nltk.download('punkt_tab')
-    nltk.download('wordnet')
-
-    # Custom sklearn transformer for preprocessing text
-    class TextPreprocessor(BaseEstimator, TransformerMixin):
-        def __init__(self):
-            self.lemmatizer = WordNetLemmatizer()
-
-        def fit(self, X, y=None):
-            return self
-
-        def transform(self, X):
-            preprocessed_texts = []
-            for doc in X:
-                # Expand contractions
-                expanded = contractions.fix(doc)
-                # Lowercase
-                lowered = expanded.lower()
-
-                # Tokenize and lemmatize
-                lemmatized = " ".join([self.lemmatizer.lemmatize(word) for word in nltk.word_tokenize(lowered)])
-                preprocessed_texts.append(lemmatized)
-            return preprocessed_texts
 
     texts = test_dataset["quote"]
 
+    texts = preprocess(texts)
+
     model_path = f"tasks/text_models/{model}.skops"
 
     model = load(model_path,
                  trusted=[
-                     '__main__.TextPreprocessor',
-                     'nltk.stem.wordnet.WordNetLemmatizer',
                      'xgboost.core.Booster',
                      'xgboost.sklearn.XGBClassifier'])
 
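Note: with the preprocessing moved out of a custom __main__-scoped transformer and into a plain function in tasks/utils/text_preprocessor.py, the skops trusted list shrinks to the XGBoost types alone. A minimal sketch of the resulting load-and-predict path (only the names come from this diff; the final predict() call is an assumption about the saved pipeline's sklearn interface):

    from skops.io import load
    from tasks.utils.text_preprocessor import preprocess

    texts = preprocess(test_dataset["quote"])
    model = load("tasks/text_models/xgb_pipeline.skops",
                 trusted=['xgboost.core.Booster', 'xgboost.sklearn.XGBClassifier'])
    predictions = model.predict(texts)  # assumed: the saved pipeline exposes sklearn's predict()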
 
tasks/text_models/xgb_pipeline.skops CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6c2100f08f614713cd3e19f06e3456f32ef3d3bb23ce4ff2902688c8074bb82e
-size 3277312
+oid sha256:4199bda604eb153a7416ccb0e320dfa31411ed7fa7cb84f710b575b049ff8cfc
+size 3278839
tasks/utils/text_preprocessor.py ADDED
@@ -0,0 +1,18 @@
+import nltk
+from nltk.stem import WordNetLemmatizer
+import contractions
+
+
+
+def preprocess(X):
+    lemmatizer = WordNetLemmatizer()
+    preprocessed_texts = []
+    for doc in X:
+        # Expand contractions
+        expanded = contractions.fix(doc)
+        # Lowercase
+        lowered = expanded.lower()
+        # Tokenize and lemmatize
+        lemmatized = " ".join([lemmatizer.lemmatize(word) for word in nltk.word_tokenize(lowered)])
+        preprocessed_texts.append(lemmatized)
+    return preprocessed_texts
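For reference, a hypothetical round trip through preprocess (the input string is illustrative; the output follows from the contractions → lowercase → NLTK tokenize/lemmatize steps and requires the punkt_tab and wordnet downloads performed in tasks/text.py):

    >>> preprocess(["They're heating up the planet!"])
    ['they are heating up the planet !']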