import io
import logging
import os
import queue
import threading
import time
from datetime import datetime
from pathlib import Path

import gradio as gr
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_recall_fscore_support

from huggingface_hub import hf_hub_download

import transformers
from transformers import (
    TrainerCallback,
    AutoTokenizer,
    DebertaV2Tokenizer,
    BertTokenizer,
    BertForSequenceClassification,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)

from utils import (
    map_to_3_classes,
    convert_to_label_strings,
    tune_thresholds,
    label_map,
    label_row_soft,
    AbuseDataset,
    save_and_yield_eval,
)
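
# Persistent locations for the trained model, the training log, and the
# evaluation reports this app writes.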
Path("/home/user/app/results_eval").mkdir(parents=True, exist_ok=True) |
|
|
|
PERSIST_DIR = Path("/home/user/app") |
|
MODEL_DIR = PERSIST_DIR / "saved_model" |
|
LOG_FILE = PERSIST_DIR / "training.log" |
|
|
|
|
|
|
|
log_buffer = io.StringIO() |
|
logging.basicConfig( |
|
level=logging.INFO, |
|
format="%(asctime)s - %(levelname)s - %(message)s", |
|
handlers=[ |
|
logging.FileHandler(LOG_FILE), |
|
logging.StreamHandler(log_buffer) |
|
] |
|
) |
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
|
|
logger.info(f"Transformers version: {transformers.__version__}") |
|
|
|
|
|
logger.info("torch.cuda.is_available(): %s", torch.cuda.is_available()) |
|
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") |
|
|
|
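
# Bridges Trainer logging into the Gradio UI: every log event is pushed onto
# a thread-safe queue that the streaming generator in run_training() drains.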
class GradioLoggerCallback(TrainerCallback):
    def __init__(self, gr_queue):
        self.gr_queue = gr_queue

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs:
            msg = f"Step {state.global_step}: {logs}"
            logger.info(msg)
            self.gr_queue.put(msg)
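
# The model emits one sigmoid probability per label. tune_thresholds() (from
# utils) searches for the (low, high) cutoffs that maximize macro F1, and
# map_to_3_classes() buckets each probability into "no" / "plausibly" / "yes".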
def evaluate_model_with_thresholds(trainer, test_dataset):
    """Run full evaluation with automatic threshold tuning."""
    logger.info("Running model predictions...")
    yield "\nRunning model predictions..."

    predictions = trainer.predict(test_dataset)
    probs = torch.sigmoid(torch.tensor(predictions.predictions)).numpy()
    true_soft = np.array(predictions.label_ids)

    logger.info("Tuning thresholds...")
    yield "\nTuning thresholds..."
    best_low, best_high, best_f1 = tune_thresholds(probs, true_soft)

    logger.info(f"✅ Best thresholds: low={best_low:.2f}, high={best_high:.2f} (macro F1={best_f1:.3f})")
    yield f"\n✅ Best thresholds: low={best_low:.2f}, high={best_high:.2f} (macro F1={best_f1:.3f})"

    final_pred_soft = map_to_3_classes(probs, best_low, best_high)
    final_pred_str = convert_to_label_strings(final_pred_soft)
    true_str = convert_to_label_strings(true_soft)

    logger.info("Final Evaluation Report (multi-class per label):")
    yield "\nFinal Evaluation Report (multi-class per label):\n"
    report = classification_report(
        true_str,
        final_pred_str,
        labels=["no", "plausibly", "yes"],
        digits=3,
        zero_division=0,
    )
    logger.info(report)
    yield from save_and_yield_eval(report)

    with open("/home/user/app/results_eval/eval_report.txt", "w") as f:
        f.write(report)

def load_saved_model_and_tokenizer():
    tokenizer = DebertaV2Tokenizer.from_pretrained(MODEL_DIR)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR).to(device)
    return tokenizer, model
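
# Evaluation-only entry point: reuses the persisted model and streams the
# threshold-tuned report without retraining.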
def evaluate_saved_model(progress=gr.Progress(track_tqdm=True)):
    if MODEL_DIR.exists():
        yield "✅ Trained model found! Skipping training...\n"
    else:
        yield "❌ No trained model found. Please train the model first.\n"
        return
    try:
        logger.info("Loading saved model for evaluation...")
        yield "Loading saved model for evaluation...\n"

        tokenizer, model = load_saved_model_and_tokenizer()
        test_dataset = AbuseDataset(test_texts, test_labels, tokenizer)

        trainer = Trainer(
            model=model,
            args=TrainingArguments(
                output_dir="./results_eval",
                per_device_eval_batch_size=4,
                logging_dir="./logs_eval",
                disable_tqdm=True,
            ),
            eval_dataset=test_dataset,
        )

        yield from evaluate_model_with_thresholds(trainer, test_dataset)

        logger.info("Evaluation complete.")
        yield "\n✅ Evaluation complete.\n"

    except Exception as e:
        logger.exception(f"❌ Evaluation failed: {e}")
        yield f"❌ Evaluation failed: {e}\n"
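
# Data loading: fetch the annotated dataset from the Hugging Face Hub and
# build the train/validation/test splits shared by training and evaluation.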
token = os.environ.get("HF_TOKEN")

path = hf_hub_download(
    repo_id="rshakked/abusive-relashionship-stories",
    filename="Abusive Relationship Stories - Technion & MSF.xlsx",
    repo_type="dataset",
    token=token,
)
df = pd.read_excel(path)
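
# Each post carries 29 risk-factor annotations; label_row_soft() (from utils)
# converts a row of them into the soft multi-label target vector the model
# is trained on.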
text_column = "post_body"
label_columns = [
    'emotional_violence', 'physical_violence', 'sexual_violence', 'spiritual_violence',
    'economic_violence', 'past_offenses', 'social_isolation', 'refuses_treatment',
    'suicidal_threats', 'mental_condition', 'daily_activity_control', 'violent_behavior',
    'unemployment', 'substance_use', 'obsessiveness', 'jealousy', 'outbursts',
    'ptsd', 'hard_childhood', 'emotional_dependency', 'prevention_of_care',
    'fear_based_relationship', 'humiliation', 'physical_threats',
    'presence_of_others_in_assault', 'signs_of_injury', 'property_damage',
    'access_to_weapons', 'gaslighting'
]

logger.info("Raw dataset shape: %s", np.shape(df))
df = df[[text_column] + label_columns]
logger.info("Shape after column selection: %s", np.shape(df))
df = df.dropna(subset=[text_column])
logger.info("Shape after dropping empty posts: %s", np.shape(df))

df["label_vector"] = df.apply(lambda row: label_row_soft(row, label_columns), axis=1)
label_matrix = df["label_vector"].tolist()

# 80/20 split into train+val vs. test, then 90/10 into train vs. val.
train_val_texts, test_texts, train_val_labels, test_labels = train_test_split(
    df[text_column].tolist(), label_matrix, test_size=0.2, random_state=42
)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_val_texts, train_val_labels, test_size=0.1, random_state=42
)

model_name = "microsoft/deberta-v3-base"

def run_training(progress=gr.Progress(track_tqdm=True)):
    log_queue = queue.Queue()
    if MODEL_DIR.exists():
        yield "✅ Trained model found! Skipping training...\n"
        yield from evaluate_saved_model()
        return
    yield "Starting training...\n"
    try:
        logger.info("Starting training run...")

        tokenizer = DebertaV2Tokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=len(label_columns),
            problem_type="multi_label_classification",
        ).to(device)

        # Trade compute for memory: recompute activations in the backward pass.
        model.gradient_checkpointing_enable()

        # Freeze the first six encoder layers; only the upper layers and the
        # classification head are fine-tuned.
        for name, param in model.named_parameters():
            if any(f"encoder.layer.{i}." in name for i in range(6)):
                param.requires_grad = False

        train_dataset = AbuseDataset(train_texts, train_labels, tokenizer)
        val_dataset = AbuseDataset(val_texts, val_labels, tokenizer)
        test_dataset = AbuseDataset(test_texts, test_labels, tokenizer)

        training_args = TrainingArguments(
            output_dir="./results",
            num_train_epochs=3,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            logging_dir="./logs",
            logging_steps=500,
            disable_tqdm=True,
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            callbacks=[GradioLoggerCallback(log_queue)],
        )

        logger.info("Training started with %d samples", len(train_dataset))
        yield "Training started...\n"

        progress(0.01)

        # Train in a background thread so this generator can keep streaming
        # logs and progress to the UI; capture any exception so a failed run
        # is not silently followed by saving an untrained model.
        train_error = []

        def background_train():
            try:
                trainer.train()
            except Exception as err:
                train_error.append(err)

        train_thread = threading.Thread(target=background_train)
        train_thread.start()

        # Heuristic progress bar: advance ~1% per second, capped at 98%, until
        # the training thread finishes and the log queue is drained.
        percent = 0
        while train_thread.is_alive() or not log_queue.empty():
            while not log_queue.empty():
                yield log_queue.get()
            if percent < 98:
                percent += 1
            progress(percent / 100)
            time.sleep(1)

        train_thread.join()
        if train_error:
            raise train_error[0]

        progress(1.0)
        yield "✅ Progress: 100%\n"

        MODEL_DIR.mkdir(parents=True, exist_ok=True)
        model.save_pretrained(MODEL_DIR)
        tokenizer.save_pretrained(MODEL_DIR)

        logger.info("Training completed and model saved.")
        yield f"Training complete! Model saved to {MODEL_DIR.resolve()}.\n"

    except Exception as e:
        logger.exception(f"❌ Training failed: {e}")
        yield f"❌ Training failed: {e}\n"

    # Evaluate on the held-out test set if a trainer was created above.
    try:
        if "trainer" in locals():
            yield from evaluate_model_with_thresholds(trainer, test_dataset)
            logger.info("Evaluation completed")
            yield "Evaluation completed\n"
    except Exception as e:
        logger.exception(f"Evaluation failed: {e}")
    return
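
# Publishes the saved model and tokenizer to the Hub repository
# rshakked/abuse-detector-he-en; requires an HF_TOKEN with write access.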
def push_model_to_hub():
    try:
        logger.info("Pushing model to Hugging Face Hub...")
        tokenizer, model = load_saved_model_and_tokenizer()
        model.push_to_hub("rshakked/abuse-detector-he-en", token=token)
        tokenizer.push_to_hub("rshakked/abuse-detector-he-en", token=token)
        return "✅ Model pushed to hub successfully!"
    except Exception as e:
        logger.exception("❌ Failed to push model to hub.")
        return f"❌ Failed to push model: {e}"