laureBe committed on
Commit
d248f3d
·
verified ·
1 Parent(s): 9685f7b

Update tasks/text.py

Browse files
Files changed (1) hide show
  1. tasks/text.py +21 -3
tasks/text.py CHANGED
@@ -18,13 +18,18 @@ async def evaluate_text(request: TextEvaluationRequest):
18
  """
19
  Evaluate text classification for climate disinformation detection.
20
 
21
- Current Model: Random Baseline
22
- - Makes random predictions from the label space (0-7)
23
  - Used as a baseline for comparison
24
  """
25
  # Get space info
26
  username, space_url = get_space_info()
27
 
 
 
 
 
 
 
28
  # Define the label mapping
29
  LABEL_MAPPING = {
30
  "0_not_relevant": 0,
@@ -44,9 +49,19 @@ async def evaluate_text(request: TextEvaluationRequest):
44
  dataset = dataset.map(lambda x: {"label": LABEL_MAPPING[x["label"]]})
45
 
46
  # Split dataset
 
47
  train_test = dataset["train"].train_test_split(test_size=request.test_size, seed=request.test_seed)
48
  test_dataset = train_test["test"]
 
 
 
 
 
49
 
 
 
 
 
50
  # Start tracking emissions
51
  tracker.start()
52
  tracker.start_task("inference")
@@ -58,8 +73,11 @@ async def evaluate_text(request: TextEvaluationRequest):
58
 
59
  # Make random predictions (placeholder for actual model inference)
60
  true_labels = test_dataset["label"]
61
- predictions = [random.randint(0, 7) for _ in range(len(true_labels))]
62
 
 
 
 
 
63
  #--------------------------------------------------------------------------------------------
64
  # YOUR MODEL INFERENCE STOPS HERE
65
  #--------------------------------------------------------------------------------------------
 
18
  """
19
  Evaluate text classification for climate disinformation detection.
20
 
21
+ Current Model: Logistic regression
 
22
  - Used as a baseline for comparison
23
  """
24
  # Get space info
25
  username, space_url = get_space_info()
26
 
27
+ from sklearn.linear_model import LogisticRegression
28
+ from sklearn.feature_extraction.text import TfidfVectorizer
29
+ from sklearn.model_selection import train_test_split
30
+ from sklearn import metrics
31
+ from datetime import datetime
32
+
33
  # Define the label mapping
34
  LABEL_MAPPING = {
35
  "0_not_relevant": 0,
 
49
  dataset = dataset.map(lambda x: {"label": LABEL_MAPPING[x["label"]]})
50
 
51
  # Split dataset
52
+ #train_test = dataset.train_test_split(test_size=.33, seed=42)
53
  train_test = dataset["train"].train_test_split(test_size=request.test_size, seed=request.test_seed)
54
  test_dataset = train_test["test"]
55
+
56
+ #test_dataset = train_test["test"]
57
+ #train_dataset = train_test["train"]
58
+
59
+ tfidf_vect = TfidfVectorizer(stop_words = 'english')
60
 
61
+ tfidf_train = tfidf_vect.fit_transform(train_dataset['quote'])
62
+ tfidf_test = tfidf_vect.transform(test_dataset['quote'])
63
+
64
+
65
  # Start tracking emissions
66
  tracker.start()
67
  tracker.start_task("inference")
 
73
 
74
  # Make random predictions (placeholder for actual model inference)
75
  true_labels = test_dataset["label"]
 
76
 
77
+ LR = LogisticRegression(class_weight='balanced', max_iter=20, random_state=1234,
78
+ solver='liblinear')
79
+ LR.fit(pd.DataFrame.sparse.from_spmatrix(tfidf_train), pd.DataFrame(y_train_v))
80
+ predictions=LR.predict(pd.DataFrame.sparse.from_spmatrix(tfidf_test))
81
  #--------------------------------------------------------------------------------------------
82
  # YOUR MODEL INFERENCE STOPS HERE
83
  #--------------------------------------------------------------------------------------------