Tonic committed on
Commit
485bf3f
·
verified ·
1 Parent(s): 878c369

attempt to remove all bias configurations one last time

Browse files
Files changed (1) hide show
  1. tasks/text.py +29 -74
tasks/text.py CHANGED
@@ -61,40 +61,41 @@ async def evaluate_text(request: TextEvaluationRequest):
61
  # Model and tokenizer paths
62
  model_name = "Tonic/climate-guard-toxic-agent"
63
  tokenizer_name = "answerdotai/ModernBERT-base"
64
-
65
- # Create ModernBERT config with minimal required parameters
66
- config = ModernBertConfig(
67
- vocab_size=50368,
68
- hidden_size=768,
69
- num_hidden_layers=22,
70
- num_attention_heads=12,
71
- intermediate_size=1152,
72
- max_position_embeddings=8192,
73
- layer_norm_eps=1e-5,
74
- num_labels=8,
75
- problem_type="single_label_classification",
76
- classifier_pooling="mean",
77
- model_type="modernbert",
78
- architectures=["ModernBertForSequenceClassification"],
79
- local_attention=128,
80
- global_attn_every_n_layers=3,
81
- position_embedding_type="absolute",
82
- pad_token_id=50283,
83
- bos_token_id=50281,
84
- eos_token_id=50282,
85
- sep_token_id=50282,
86
- cls_token_id=50281,
87
- hidden_activation="gelu",
88
- classifier_activation="gelu"
89
- )
 
90
 
91
  # Load tokenizer
92
  tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
93
 
94
- # Load model with config
95
  model = AutoModelForSequenceClassification.from_pretrained(
96
  model_name,
97
- config=config,
98
  trust_remote_code=True,
99
  ignore_mismatched_sizes=True,
100
  torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
@@ -102,52 +103,6 @@ async def evaluate_text(request: TextEvaluationRequest):
102
 
103
  # Set model to evaluation mode
104
  model.eval()
105
-
106
- # Preprocess function
107
- def preprocess_function(examples):
108
- return tokenizer(
109
- examples["quote"],
110
- padding=False,
111
- truncation=True,
112
- max_length=512,
113
- return_tensors=None
114
- )
115
-
116
- # Tokenize dataset
117
- tokenized_test = test_dataset.map(
118
- preprocess_function,
119
- batched=True,
120
- remove_columns=test_dataset.column_names
121
- )
122
-
123
- # Set format for pytorch
124
- tokenized_test.set_format("torch")
125
-
126
- # Create DataLoader
127
- data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
128
- test_loader = DataLoader(
129
- tokenized_test,
130
- batch_size=16,
131
- collate_fn=data_collator,
132
- shuffle=False
133
- )
134
-
135
- # Get predictions
136
- predictions = []
137
- with torch.no_grad():
138
- for batch in test_loader:
139
- batch = {k: v.to(device) for k, v in batch.items()}
140
- outputs = model(**batch)
141
- preds = torch.argmax(outputs.logits, dim=-1)
142
- predictions.extend(preds.cpu().numpy().tolist())
143
-
144
- # Clean up GPU memory
145
- if torch.cuda.is_available():
146
- torch.cuda.empty_cache()
147
-
148
- except Exception as e:
149
- print(f"Error during model inference: {str(e)}")
150
- raise
151
 
152
  #--------------------------------------------------------------------------------------------
153
  # MODEL INFERENCE ENDS HERE
 
61
  # Model and tokenizer paths
62
  model_name = "Tonic/climate-guard-toxic-agent"
63
  tokenizer_name = "answerdotai/ModernBERT-base"
64
+
65
+ # Define minimal configuration
66
+ config_dict = {
67
+ "_name_or_path": "answerdotai/ModernBERT-base",
68
+ "architectures": ["ModernBertForSequenceClassification"],
69
+ "model_type": "modernbert",
70
+ "vocab_size": 50368,
71
+ "hidden_size": 768,
72
+ "num_hidden_layers": 22,
73
+ "num_attention_heads": 12,
74
+ "intermediate_size": 1152,
75
+ "max_position_embeddings": 8192,
76
+ "position_embedding_type": "absolute",
77
+ "layer_norm_eps": 1e-5,
78
+ "hidden_activation": "gelu",
79
+ "classifier_activation": "gelu",
80
+ "classifier_pooling": "mean",
81
+ "num_labels": 8,
82
+ "pad_token_id": 50283,
83
+ "bos_token_id": 50281,
84
+ "eos_token_id": 50282,
85
+ "sep_token_id": 50282,
86
+ "cls_token_id": 50281,
87
+ "problem_type": "single_label_classification",
88
+ "id2label": {str(i): label for i, label in enumerate(LABEL_MAPPING.keys())},
89
+ "label2id": LABEL_MAPPING
90
+ }
91
 
92
  # Load tokenizer
93
  tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
94
 
95
+ # Load model with minimal config
96
  model = AutoModelForSequenceClassification.from_pretrained(
97
  model_name,
98
+ config_dict=config_dict,
99
  trust_remote_code=True,
100
  ignore_mismatched_sizes=True,
101
  torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
 
103
 
104
  # Set model to evaluation mode
105
  model.eval()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
  #--------------------------------------------------------------------------------------------
108
  # MODEL INFERENCE ENDS HERE