Zen0 committed on
Commit
d778205
·
verified ·
1 Parent(s): 2b7b5be

Update tasks/text.py

Browse files
Files changed (1) hide show
  1. tasks/text.py +81 -75
tasks/text.py CHANGED
@@ -1,10 +1,8 @@
1
- from transformers import AutoTokenizer, AutoModelForSequenceClassification, MobileBertTokenizerFast
2
-
3
  from fastapi import APIRouter
4
  from datetime import datetime
5
  from datasets import load_dataset
6
  from sklearn.metrics import accuracy_score
7
- import random
8
 
9
  from .utils.evaluation import TextEvaluationRequest
10
  from .utils.emissions import tracker, clean_emissions_data, get_space_info
@@ -12,21 +10,15 @@ from .utils.emissions import tracker, clean_emissions_data, get_space_info
12
  import numpy as np
13
  import torch
14
 
15
-
16
-
17
-
18
  router = APIRouter()
19
 
20
  DESCRIPTION = "FrugalDisinfoHunter Model"
21
  ROUTE = "/text"
22
 
23
- @router.post(ROUTE, tags=["Text Task"],
24
- description=DESCRIPTION)
25
  async def evaluate_text(request: TextEvaluationRequest):
26
  """
27
  Evaluate text classification for climate disinformation detection.
28
-
29
- Current Model: FrugalDisinfoHunter
30
  """
31
  # Get space info
32
  username, space_url = get_space_info()
@@ -57,69 +49,83 @@ async def evaluate_text(request: TextEvaluationRequest):
57
  tracker.start()
58
  tracker.start_task("inference")
59
 
60
- #--------------------------------------------------------------------------------------------
61
- # YOUR MODEL INFERENCE CODE HERE
62
- # Update the code below to replace the random baseline by your model inference within the inference pass where the energy consumption and emissions are tracked.
63
- #--------------------------------------------------------------------------------------------
64
-
65
-
66
- # Model and Tokenizer
67
- # Model and Tokenizer - use the same model name for both
68
- model_name = "Zen0/FrugalDisinfoHunter"
69
- model = AutoModelForSequenceClassification.from_pretrained(
70
- model_name,
71
- num_labels=8,
72
- output_hidden_states=True,
73
- problem_type="single_label_classification"
74
- )
75
- tokenizer = AutoTokenizer.from_pretrained(model_name) # Use the same model_name
76
-
77
- # Tokenize the test data
78
- test_texts = test_dataset["quote"]
79
- inputs = tokenizer(test_texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
80
-
81
- # Move model and inputs to GPU if available
82
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
83
- model.to(device)
84
- inputs = {key: val.to(device) for key, val in inputs.items()}
85
-
86
- # Run inference on the dataset using the model
87
- with torch.no_grad(): # Disable gradient calculations
88
- outputs = model(**inputs)
89
- logits = outputs.logits
90
-
91
- # Get predictions from the logits
92
- predictions = torch.argmax(logits, dim=-1).cpu().numpy()
93
-
94
- true_labels = test_dataset['label']
95
-
96
- #--------------------------------------------------------------------------------------------
97
- # YOUR MODEL INFERENCE STOPS HERE
98
- #--------------------------------------------------------------------------------------------
99
-
100
-
101
- # Stop tracking emissions
102
- emissions_data = tracker.stop_task()
103
-
104
- # Calculate accuracy
105
- accuracy = accuracy_score(true_labels, predictions)
106
-
107
- # Prepare results dictionary
108
- results = {
109
- "username": username,
110
- "space_url": space_url,
111
- "submission_timestamp": datetime.now().isoformat(),
112
- "model_description": DESCRIPTION,
113
- "accuracy": float(accuracy),
114
- "energy_consumed_wh": emissions_data.energy_consumed * 1000,
115
- "emissions_gco2eq": emissions_data.emissions * 1000,
116
- "emissions_data": clean_emissions_data(emissions_data),
117
- "api_route": ROUTE,
118
- "dataset_config": {
119
- "dataset_name": request.dataset_name,
120
- "test_size": request.test_size,
121
- "test_seed": request.test_seed
 
 
 
 
 
 
 
 
 
 
122
  }
123
- }
124
-
125
- return results
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
 
2
  from fastapi import APIRouter
3
  from datetime import datetime
4
  from datasets import load_dataset
5
  from sklearn.metrics import accuracy_score
 
6
 
7
  from .utils.evaluation import TextEvaluationRequest
8
  from .utils.emissions import tracker, clean_emissions_data, get_space_info
 
10
  import numpy as np
11
  import torch
12
 
 
 
 
13
  router = APIRouter()
14
 
15
  DESCRIPTION = "FrugalDisinfoHunter Model"
16
  ROUTE = "/text"
17
 
18
+ @router.post(ROUTE, tags=["Text Task"], description=DESCRIPTION)
 
19
  async def evaluate_text(request: TextEvaluationRequest):
20
  """
21
  Evaluate text classification for climate disinformation detection.
 
 
22
  """
23
  # Get space info
24
  username, space_url = get_space_info()
 
49
  tracker.start()
50
  tracker.start_task("inference")
51
 
52
+ try:
53
+ # Model configuration
54
+ model_name = "Zen0/FrugalDisinfoHunter" # Model path
55
+ tokenizer_name = "google/mobilebert-uncased" # Base MobileBERT tokenizer
56
+ BATCH_SIZE = 32 # Batch size for efficient processing
57
+ MAX_LENGTH = 128 # Maximum sequence length
58
+
59
+ # Initialize model and tokenizer
60
+ model = AutoModelForSequenceClassification.from_pretrained(
61
+ model_name,
62
+ num_labels=8,
63
+ output_hidden_states=True,
64
+ problem_type="single_label_classification"
65
+ )
66
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
67
+
68
+ # Move model to appropriate device
69
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
70
+ model = model.to(device)
71
+ model.eval() # Set model to evaluation mode
72
+
73
+ # Get test texts
74
+ test_texts = test_dataset["quote"]
75
+ predictions = []
76
+
77
+ # Process in batches
78
+ for i in range(0, len(test_texts), BATCH_SIZE):
79
+ batch_texts = test_texts[i:i + BATCH_SIZE]
80
+
81
+ # Tokenize batch
82
+ inputs = tokenizer(
83
+ batch_texts,
84
+ padding=True,
85
+ truncation=True,
86
+ return_tensors="pt",
87
+ max_length=MAX_LENGTH
88
+ )
89
+
90
+ # Move inputs to device
91
+ inputs = {key: val.to(device) for key, val in inputs.items()}
92
+
93
+ # Run inference
94
+ with torch.no_grad():
95
+ outputs = model(**inputs)
96
+ batch_preds = torch.argmax(outputs.logits, dim=1)
97
+ predictions.extend(batch_preds.cpu().numpy())
98
+
99
+ # Get true labels
100
+ true_labels = test_dataset['label']
101
+
102
+ # Stop tracking emissions
103
+ emissions_data = tracker.stop_task()
104
+
105
+ # Calculate accuracy
106
+ accuracy = accuracy_score(true_labels, predictions)
107
+
108
+ # Prepare results dictionary
109
+ results = {
110
+ "username": username,
111
+ "space_url": space_url,
112
+ "submission_timestamp": datetime.now().isoformat(),
113
+ "model_description": DESCRIPTION,
114
+ "accuracy": float(accuracy),
115
+ "energy_consumed_wh": emissions_data.energy_consumed * 1000,
116
+ "emissions_gco2eq": emissions_data.emissions * 1000,
117
+ "emissions_data": clean_emissions_data(emissions_data),
118
+ "api_route": ROUTE,
119
+ "dataset_config": {
120
+ "dataset_name": request.dataset_name,
121
+ "test_size": request.test_size,
122
+ "test_seed": request.test_seed
123
+ }
124
  }
125
+
126
+ return results
127
+
128
+ except Exception as e:
129
+ # Stop tracking in case of error
130
+ tracker.stop_task()
131
+ raise e