rshakked committed
Commit: fedc8f2
Parent(s): 285a433

fix: reduce batch size and enable gradient checkpointing to prevent GPU OOM crashes

Files changed (1):
  1. train_abuse_model.py (+5 / -8)

train_abuse_model.py
@@ -185,11 +185,8 @@ model = AutoModelForSequenceClassification.from_pretrained(
     problem_type="multi_label_classification"
 ).to(device)  # Move model to GPU
 
-# # Optional: Freeze base model layers (only train classifier head)
-# freeze_base = False
-# if freeze_base:
-#     for name, param in model.bert.named_parameters():
-#         param.requires_grad = False
+# gradient checkpointing helps cut memory use:
+model.gradient_checkpointing_enable()
 
 # Freeze bottom 6 layers of DeBERTa encoder
 for name, param in model.named_parameters():
@@ -215,12 +212,12 @@ test_dataset = AbuseDataset(test_texts, test_labels)
 training_args = TrainingArguments(
     output_dir="./results",
     num_train_epochs=3,
-    per_device_train_batch_size=8,
-    per_device_eval_batch_size=8,
+    per_device_train_batch_size=4,
+    per_device_eval_batch_size=4,
     evaluation_strategy="epoch",
     save_strategy="epoch",
     logging_dir="./logs",
-    logging_steps=10,
+    logging_steps=100,
 )
 
 # Train using HuggingFace Trainer
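
For context, a minimal sketch of how the two changes fit together in the training setup. The dataset variable names (`train_dataset`, `test_dataset`) are assumed from the surrounding script, and the commented-out `gradient_accumulation_steps` line is not part of this commit; it is only shown as an optional way to keep the effective batch size at 8 after halving the per-device batch size.

# Sketch of the post-commit setup (names assumed from train_abuse_model.py;
# gradient_accumulation_steps is illustrative, not part of this commit).
from transformers import Trainer, TrainingArguments

# Recompute activations in the backward pass instead of storing them all:
model.gradient_checkpointing_enable()

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,    # halved from 8 to fit in GPU memory
    per_device_eval_batch_size=4,
    # gradient_accumulation_steps=2,  # optional: restore an effective batch of 8
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=100,                # log less frequently
)

trainer = Trainer(
    model=model,                      # DeBERTa multi-label classifier defined above
    args=training_args,
    train_dataset=train_dataset,      # assumed name, built like test_dataset
    eval_dataset=test_dataset,
)
trainer.train()

Gradient checkpointing trades extra forward-pass compute for a large reduction in activation memory, so together with the smaller batch it addresses the OOM without touching the model itself; if the smaller batch were to hurt convergence, gradient accumulation is the usual way to recover the original effective batch size.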