# safe-talk/utils.py
import logging
from datetime import datetime
from pathlib import Path

import numpy as np
import torch
from sklearn.metrics import precision_recall_fscore_support
from torch.utils.data import Dataset
def save_and_return_prediction(enriched_input: str, predicted_labels: list) -> str:
    """Write the enriched input and its predicted labels to a timestamped file."""
    results_dir = Path("/home/user/app/results_pred")
    results_dir.mkdir(parents=True, exist_ok=True)
    # Timestamped filename keeps successive predictions from overwriting each other
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    pred_filepath = results_dir / f"prediction_{timestamp}.txt"
    with open(pred_filepath, "w") as f:
        f.write("===== Enriched Input =====\n")
        f.write(enriched_input + "\n\n")
        f.write("===== Predicted Labels =====\n")
        f.write(", ".join(predicted_labels))
    return pred_filepath.name  # .name is already a str
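
# Usage sketch (illustrative values; the app presumably passes real model output):
#     fname = save_and_return_prediction("text [enriched context]", ["yes", "no"])
#     print(fname)  # e.g. "prediction_20240101_120000.txt"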
# Save and yield evaluation results (a generator, so callers can stream output)
def save_and_yield_eval(report: str):
    """Persist an evaluation report to a timestamped file, then yield a status line and the report."""
    # Create the evaluation results directory if it doesn't exist
    results_dir = Path("/home/user/app/results_eval")
    results_dir.mkdir(parents=True, exist_ok=True)
    # Generate a versioned filename using a timestamp, mirroring save_and_return_prediction
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    eval_filepath = results_dir / f"eval_report_{timestamp}.txt"
    with open(eval_filepath, "w") as f:
        f.write(report)
    yield f"📄 Evaluation saved to: {eval_filepath.name}"
    yield report
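
# Usage sketch: iterate the generator to get the status line first, then the
# full report (e.g. from a streaming UI handler):
#     for chunk in save_and_yield_eval("macro F1: 0.72"):
#         print(chunk)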
# Custom Dataset class for multi-label abuse classification
class AbuseDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        # Tokenize all texts up front; pad/truncate to a fixed 512-token window
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=512)
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Float labels support soft targets (0.0 / 0.5 / 1.0) with BCE-style losses
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item
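
# Construction sketch (assumes a Hugging Face tokenizer; the model name and
# sample values are illustrative):
#     from transformers import AutoTokenizer
#     tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
#     ds = AbuseDataset(["some text"], [[1.0, 0.0, 0.5]], tokenizer)
#     ds[0]  # dict with input_ids, attention_mask, and a float "labels" tensor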
# Label map used across modules
label_map = {
0.0: "no",
0.5: "plausibly",
1.0: "yes"
}
# Map probabilities to three classes (0.0, 0.5, 1.0) based on thresholds
def map_to_3_classes(prob_array, low, high):
    """Map probabilities to 0.0, 0.5, 1.0 using (low, high] thresholds."""
    mapped = np.zeros_like(prob_array)
    mapped[(prob_array > low) & (prob_array <= high)] = 0.5
    mapped[prob_array > high] = 1.0
    return mapped
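
# Example: with low=0.35 and high=0.6,
#     map_to_3_classes(np.array([0.1, 0.4, 0.9]), 0.35, 0.6)
# returns array([0. , 0.5, 1. ]).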
def convert_to_label_strings(array):
    """Convert a float label array to a flat list of label strings."""
    return [label_map[val] for val in array.flatten()]
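
# Example: convert_to_label_strings(np.array([[0.0, 1.0], [0.5, 0.0]]))
# returns ["no", "yes", "plausibly", "no"].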
def tune_thresholds(probs, true_labels, verbose=True):
    """Grid-search (low, high) thresholds that maximize macro F1 over the 3 classes."""
    best_macro_f1 = 0.0
    best_low, best_high = 0.0, 0.0
    # Ground-truth strings don't depend on the thresholds, so convert them once
    true_str = convert_to_label_strings(true_labels)
    for low in np.arange(0.2, 0.5, 0.05):
        for high in np.arange(0.55, 0.8, 0.05):
            if high <= low:
                continue
            pred_str = convert_to_label_strings(map_to_3_classes(probs, low, high))
            _, _, f1, _ = precision_recall_fscore_support(
                true_str, pred_str,
                labels=["no", "plausibly", "yes"],
                average="macro",
                zero_division=0
            )
            if verbose:
                print(f"low={low:.2f}, high={high:.2f} -> macro F1={f1:.3f}")
            if f1 > best_macro_f1:
                best_macro_f1 = f1
                best_low, best_high = low, high
    return best_low, best_high, best_macro_f1
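
# Usage sketch (probs and true_labels are (N, num_labels) float arrays from a
# validation split; the variable names are illustrative):
#     best_low, best_high, best_f1 = tune_thresholds(val_probs, val_labels, verbose=False)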
# Convert label values to soft scores: "yes" -> 1.0, "plausibly" -> 0.5, anything else -> 0.0
def label_row_soft(row, label_columns):
    """Build a soft-label vector for one dataframe row (case- and whitespace-insensitive)."""
    labels = []
    for col in label_columns:
        val = str(row[col]).strip().lower()
        if val == "yes":
            labels.append(1.0)
        elif val == "plausibly":
            labels.append(0.5)
        else:
            labels.append(0.0)
    return labels
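
# Example: for a row like {"insulting": "Yes", "threatening": "plausibly "}
# (column names are hypothetical), label_row_soft(row, ["insulting", "threatening"])
# returns [1.0, 0.5].

# Minimal self-check with synthetic data (illustrative only; real usage feeds
# model probabilities and dataset labels):
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    fake_probs = rng.random((8, 3))                        # stand-in sigmoid outputs
    fake_true = rng.choice([0.0, 0.5, 1.0], size=(8, 3))   # stand-in soft labels
    low, high, f1 = tune_thresholds(fake_probs, fake_true, verbose=False)
    print(f"best low={low:.2f}, high={high:.2f}, macro F1={f1:.3f}")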