|
import logging
from datetime import datetime
from pathlib import Path

import numpy as np
import torch
from sklearn.metrics import precision_recall_fscore_support
from torch.utils.data import Dataset
|
|
def save_and_return_prediction(enriched_input: str, predicted_labels: list) -> str:
    """Write the enriched input and predicted labels to a timestamped file and return the file name."""
    results_dir = Path("/home/user/app/results_pred")
    results_dir.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    pred_filepath = results_dir / f"prediction_{timestamp}.txt"

    with open(pred_filepath, "w") as f:
        f.write("===== Enriched Input =====\n")
        f.write(enriched_input + "\n\n")
        f.write("===== Predicted Labels =====\n")
        f.write(", ".join(predicted_labels))

    # Path.name is already a string, so no str() conversion is needed.
    return pred_filepath.name
|
|
def save_and_yield_eval(report: str):
    """Write the evaluation report to a timestamped file, then yield a status message and the report."""
    results_dir = Path("/home/user/app/results_eval")
    results_dir.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    eval_filepath = results_dir / f"eval_report_{timestamp}.txt"

    with open(eval_filepath, "w") as f:
        f.write(report)

    yield f"Evaluation saved to: {eval_filepath.name}"
    yield report
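# Illustrative sketch (not called anywhere in this module): save_and_yield_eval is a
# generator, so a caller has to iterate it before the file is written and the report
# is produced, e.g. in a streaming UI callback.
def _example_consume_eval():
    messages = list(save_and_yield_eval("example report"))
    return messages  # ["Evaluation saved to: eval_report_<timestamp>.txt", "example report"]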
|
|
class AbuseDataset(Dataset):
    """Dataset that tokenizes all texts up front and pairs each one with soft (float) multi-label targets."""

    def __init__(self, texts, labels, tokenizer):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=512)
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item
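# Illustrative sketch (assumption: a Hugging Face tokenizer is what gets passed in, as the
# truncation/padding call above suggests; the checkpoint name is a placeholder). Defined
# here only as an example and never called.
def _example_abuse_dataset():
    from transformers import AutoTokenizer  # assumed available alongside torch

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # placeholder checkpoint
    texts = ["example post one", "example post two"]
    labels = [[1.0, 0.0, 0.5], [0.0, 0.0, 0.0]]  # one soft label per category
    dataset = AbuseDataset(texts, labels, tokenizer)
    return dataset[0]  # dict of input_ids / attention_mask tensors plus a float "labels" tensor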
|
|
label_map = {
    0.0: "no",
    0.5: "plausibly",
    1.0: "yes",
}
|
|
def map_to_3_classes(prob_array, low, high):
    """Map probabilities to 0.0, 0.5, 1.0 using thresholds."""
    mapped = np.zeros_like(prob_array)
    mapped[(prob_array > low) & (prob_array <= high)] = 0.5
    mapped[prob_array > high] = 1.0
    return mapped
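# Quick worked example: with low=0.35 and high=0.65, values at or below 0.35 stay 0.0,
# values in (0.35, 0.65] become 0.5, and values above 0.65 become 1.0. Never called here.
def _example_threshold_mapping():
    probs = np.array([[0.10, 0.40, 0.90]])
    return map_to_3_classes(probs, 0.35, 0.65)  # -> array([[0.0, 0.5, 1.0]])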
|
|
def convert_to_label_strings(array):
    """Convert float label array to list of strings."""
    return [label_map[val] for val in array.flatten()]
|
|
def tune_thresholds(probs, true_labels, verbose=True):
    """Search for best (low, high) thresholds by macro F1 score."""
    best_macro_f1 = 0.0
    best_low, best_high = 0.0, 0.0

    for low in np.arange(0.2, 0.5, 0.05):
        for high in np.arange(0.55, 0.8, 0.05):
            if high <= low:
                continue

            pred_soft = map_to_3_classes(probs, low, high)
            pred_str = convert_to_label_strings(pred_soft)
            true_str = convert_to_label_strings(true_labels)

            _, _, f1, _ = precision_recall_fscore_support(
                true_str, pred_str,
                labels=["no", "plausibly", "yes"],
                average="macro",
                zero_division=0
            )
            if verbose:
                print(f"low={low:.2f}, high={high:.2f} -> macro F1={f1:.3f}")
            if f1 > best_macro_f1:
                best_macro_f1 = f1
                best_low, best_high = low, high

    return best_low, best_high, best_macro_f1
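# Illustrative sketch of a tuning call. The shapes are assumptions: probs holds one sigmoid
# probability per (example, label) pair and true_labels holds gold soft labels drawn from
# {0.0, 0.5, 1.0} with the same shape. Never called in this module.
def _example_tune_thresholds():
    rng = np.random.default_rng(0)
    probs = rng.random((8, 3))                                 # stand-in sigmoid outputs
    true_labels = rng.choice([0.0, 0.5, 1.0], size=(8, 3))     # stand-in gold soft labels
    return tune_thresholds(probs, true_labels, verbose=False)  # (best_low, best_high, best_macro_f1)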
|
|
def label_row_soft(row, label_columns):
    """Convert a row's yes/plausibly/no annotations into a list of soft labels (1.0 / 0.5 / 0.0)."""
    labels = []
    for col in label_columns:
        val = str(row[col]).strip().lower()
        if val == "yes":
            labels.append(1.0)
        elif val == "plausibly":
            labels.append(0.5)
        else:
            labels.append(0.0)
    return labels
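# Illustrative sketch: label_row_soft expects a mapping-like row (e.g. a pandas Series
# from DataFrame.iterrows() or a plain dict); the column names here are placeholders.
def _example_label_row_soft():
    row = {"insult": "Yes", "threat": "plausibly", "dehumanization": "no"}
    return label_row_soft(row, ["insult", "threat", "dehumanization"])  # -> [1.0, 0.5, 0.0]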
|
|