safe-talk / utils.py
rshakked's picture
fix: make label_columns explicit parameter in label_row_soft
5071ec6
raw
history blame
2.83 kB
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
import torch
from torch.utils.data import Dataset
# Custom Dataset class
class AbuseDataset(Dataset):
    """Torch ``Dataset`` that pairs pre-tokenized texts with float labels.

    Args:
        texts: Texts handed to ``tokenizer`` in a single batch call.
        labels: Sized, indexable collection; ``labels[idx]`` is converted to
            a float tensor when an item is fetched.
        tokenizer: Callable invoked as
            ``tokenizer(texts, truncation=True, padding=True, max_length=512)``.
            Its result must expose ``.items()`` yielding ``(field, values)``
            pairs where ``values[idx]`` is the encoding of example ``idx``.
    """

    def __init__(self, texts, labels, tokenizer):
        # Tokenize the whole corpus once; __getitem__ only slices the result.
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=512)
        self.labels = labels

    def __len__(self):
        """Return the number of examples (one per label entry)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``.

        The dict holds one tensor per tokenizer output field plus a
        ``"labels"`` entry with dtype ``torch.float``.
        """
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item
# Soft-score -> class-name lookup, used across modules
label_map = {0.0: "no", 0.5: "plausibly", 1.0: "yes"}
# Function to map probabilities to 3 classes
# (0.0, 0.5, 1.0) based on thresholds
def map_to_3_classes(prob_array, low, high):
    """Map probabilities to the soft classes 0.0, 0.5 and 1.0.

    Values ``<= low`` become 0.0, values in ``(low, high]`` become 0.5 and
    values ``> high`` become 1.0. NaN compares false everywhere and therefore
    stays 0.0.

    Args:
        prob_array: Array-like of numbers (ndarray, list, tuple, scalar).
        low: Upper bound (inclusive) of the 0.0 class.
        high: Upper bound (inclusive) of the 0.5 class.

    Returns:
        A new ndarray with the same shape as the input. Floating-point input
        keeps its dtype; any other dtype yields float64.
    """
    probs = np.asarray(prob_array)
    # An integer/bool buffer would silently truncate 0.5 to 0, so only reuse
    # the input dtype when it is already floating point.
    out_dtype = probs.dtype if np.issubdtype(probs.dtype, np.floating) else np.float64
    mapped = np.zeros(probs.shape, dtype=out_dtype)
    mapped[(probs > low) & (probs <= high)] = 0.5
    mapped[probs > high] = 1.0
    return mapped
def convert_to_label_strings(array):
    """Convert an array of soft-label scores to a flat list of label strings.

    Args:
        array: Array-like (ndarray, list, tuple, ...) of any shape whose
            values are keys of ``label_map``.

    Returns:
        A list with one label string per element, in flattened order.

    Raises:
        KeyError: If a value is not a key of ``label_map``.
    """
    # Coerce first so plain lists/tuples work too; they have no .flatten().
    values = np.asarray(array).ravel()
    return [label_map[val] for val in values]
def tune_thresholds(probs, true_labels, verbose=True):
    """Grid-search the (low, high) thresholds that maximize macro F1.

    Every ``low`` in ``np.arange(0.2, 0.5, 0.05)`` is combined with every
    ``high`` in ``np.arange(0.55, 0.8, 0.05)``. For each pair ``probs`` is
    bucketed with ``map_to_3_classes`` and scored against ``true_labels``
    using macro-averaged F1 over the classes "no", "plausibly" and "yes".

    Args:
        probs: Predicted probabilities, passed to ``map_to_3_classes``.
        true_labels: Reference soft labels, passed to
            ``convert_to_label_strings``.
        verbose: When true, print the score of every threshold pair.

    Returns:
        Tuple ``(best_low, best_high, best_macro_f1)``. Ties keep the first
        pair found. If no pair scores above 0.0 the initial
        ``(0.0, 0.0, 0.0)`` is returned.
    """
    best_macro_f1 = 0.0
    best_low, best_high = 0.0, 0.0

    # Neither the reference labels nor the inner grid depend on the current
    # thresholds, so build them once instead of once per grid point.
    true_str = convert_to_label_strings(true_labels)
    high_grid = np.arange(0.55, 0.8, 0.05)

    for low in np.arange(0.2, 0.5, 0.05):
        for high in high_grid:
            # Guard against overlapping grids; a band needs low < high.
            if high <= low:
                continue

            pred_soft = map_to_3_classes(probs, low, high)
            pred_str = convert_to_label_strings(pred_soft)

            _, _, f1, _ = precision_recall_fscore_support(
                true_str, pred_str,
                labels=["no", "plausibly", "yes"],
                average="macro",
                zero_division=0
            )

            if verbose:
                print(f"low={low:.2f}, high={high:.2f} -> macro F1={f1:.3f}")

            # Strict comparison: the earliest best-scoring pair wins ties.
            if f1 > best_macro_f1:
                best_macro_f1 = f1
                best_low, best_high = low, high

    return best_low, best_high, best_macro_f1
# Convert label values to soft scores: "yes" = 1.0, "plausibly" = 0.5, others = 0.0
def label_row_soft(row, label_columns):
    """Translate a row's categorical answers into soft scores.

    Each ``row[col]`` is stringified, stripped of surrounding whitespace and
    lower-cased before matching: "yes" gives 1.0, "plausibly" gives 0.5 and
    every other value gives 0.0.

    Args:
        row: Mapping-like object indexable by column name.
        label_columns: Iterable of column names to read, in output order.

    Returns:
        List of floats, one per entry of ``label_columns``.
    """
    soft_scores = {"yes": 1.0, "plausibly": 0.5}
    return [
        soft_scores.get(str(row[col]).strip().lower(), 0.0)
        for col in label_columns
    ]