rshakked committed on
Commit bc56514 · 1 Parent(s): def4deb

refactor: create predict_pipeline.py and utils.py to modularize app logic


- Created predict_pipeline.py to handle enrichment and inference pipeline
- Added run_prediction_pipeline() with mocked summarization and translation
- Handles uploaded WhatsApp ZIP, merges with description, and runs prediction

- Created utils.py for shared logic used across modules
- Moved AbuseDataset class and label_row_soft function
- Added map_to_3_classes, convert_to_label_strings, and tune_thresholds
- Centralized label_map for consistent mapping

- Updated Gradio UI to import from predict_pipeline
- Improves modularity, reusability, and future maintainability

Files changed (4)
  1. app.py +29 -10
  2. predict_pipline.py +0 -0
  3. train_abuse_model.py +11 -78
  4. utils.py +85 -0
app.py CHANGED
@@ -1,20 +1,39 @@
 import gradio as gr
-from train_abuse_model import run_training, evaluate_saved_model, push_model_to_hub
+from train_abuse_model import (
+    run_training,
+    evaluate_saved_model,
+    push_model_to_hub
+)
+from predict_pipeline import run_prediction_pipeline
+
 
 with gr.Blocks() as demo:
-    gr.Markdown("## 🧠 Abuse Detection Fine-Tuning App")
+    gr.Markdown("## 🧠 Abuse Detection App")
     gr.Markdown("⚠️ Keep this tab open while training or evaluating.")
 
-    with gr.Row():
-        start_btn = gr.Button("🚀 Start Training")
-        eval_btn = gr.Button("🔍 Evaluate Trained Model")
-        push_btn = gr.Button("📤 Push Model to Hub")
-
-    output_box = gr.Textbox(label="Logs", lines=25, interactive=False)
-
-    start_btn.click(fn=run_training, outputs=output_box)
-    eval_btn.click(fn=evaluate_saved_model, outputs=output_box)
-    push_btn.click(fn=push_model_to_hub, outputs=output_box)
+    with gr.Tab("🧪 Train / Evaluate"):
+        with gr.Row():
+            start_btn = gr.Button("🚀 Start Training")
+            eval_btn = gr.Button("🔍 Evaluate Trained Model")
+            push_btn = gr.Button("📤 Push Model to Hub")
+        output_box = gr.Textbox(label="Logs", lines=25, interactive=False)
+        start_btn.click(fn=run_training, outputs=output_box)
+        eval_btn.click(fn=evaluate_saved_model, outputs=output_box)
+        push_btn.click(fn=push_model_to_hub, outputs=output_box)
+
+    with gr.Tab("🔮 Abuse Detection"):
+        desc_input = gr.Textbox(label="📝 Relationship Description", lines=5, placeholder="Write a relationship story here...")
+        chat_upload = gr.File(label="📁 Optional: WhatsApp Chat ZIP (.zip)", file_types=[".zip"])
+        predict_btn = gr.Button("Run Prediction")
+
+        enriched_output = gr.Textbox(label="📎 Enriched Input (Used for Prediction)", lines=8, interactive=False)
+        label_output = gr.Textbox(label="🏷️ Predicted Labels", lines=2, interactive=False)
+
+        predict_btn.click(
+            fn=run_prediction_pipeline,
+            inputs=[desc_input, chat_upload],
+            outputs=[enriched_output, label_output]
+        )
 
 if __name__ == "__main__":
     demo.launch()
predict_pipline.py ADDED
File without changes
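The new predict_pipline.py was committed empty, so run_prediction_pipeline() itself is not visible in this diff. As a minimal sketch of the flow the commit message describes (optional WhatsApp ZIP, mocked summarization and translation, merge with the description, two return values matching the Gradio outputs wired up in app.py above), something along these lines would fit; every helper name here (mock_summarize, mock_translate, extract_chat_text) and the exact behavior are assumptions, not the author's code.

# predict_pipeline.py -- hypothetical sketch; the committed file is empty.
import zipfile


def mock_summarize(text: str) -> str:
    # Placeholder: the commit message says summarization is mocked for now.
    return text[:500]


def mock_translate(text: str) -> str:
    # Placeholder: translation is likewise mocked for now.
    return text


def extract_chat_text(zip_path) -> str:
    # Concatenate all .txt members of an exported WhatsApp chat ZIP.
    parts = []
    with zipfile.ZipFile(zip_path) as zf:
        for name in zf.namelist():
            if name.endswith(".txt"):
                parts.append(zf.read(name).decode("utf-8", errors="ignore"))
    return "\n".join(parts)


def run_prediction_pipeline(desc_input, chat_upload=None):
    # Returns (enriched_text, predicted_labels), matching the two output boxes in app.py.
    chat_text = ""
    if chat_upload is not None:
        # gr.File may hand over a path string or a file-like object with .name
        zip_path = getattr(chat_upload, "name", chat_upload)
        chat_text = extract_chat_text(zip_path)

    enriched = (desc_input or "").strip()
    if chat_text:
        enriched += "\n\n[Chat summary]\n" + mock_translate(mock_summarize(chat_text))

    # Inference stub: a real version would tokenize `enriched`, run the saved model,
    # and map probabilities to strings via map_to_3_classes / label_map from utils.py.
    predicted_labels = "(prediction not implemented in this sketch)"
    return enriched, predicted_labels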
train_abuse_model.py CHANGED
@@ -3,6 +3,7 @@
 
 import logging
 import io
+import os
 import time
 import gradio as gr # ✅ required for progress bar
 from pathlib import Path
@@ -31,6 +32,16 @@ from transformers import (
     TrainingArguments
 )
 
+from utils import (
+    map_to_3_classes,
+    convert_to_label_strings,
+    tune_thresholds,
+    label_map,
+    label_row_soft,
+    AbuseDataset
+)
+
+
 PERSIST_DIR = Path("/home/user/app")
 MODEL_DIR = PERSIST_DIR / "saved_model"
 LOG_FILE = PERSIST_DIR / "training.log"
@@ -56,82 +67,6 @@ logger.info("Transformers version: %s", torch.__version__)
 logger.info("torch.cuda.is_available(): %s", torch.cuda.is_available())
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-# Label mapping for evaluation
-label_map = {0.0: "no", 0.5: "plausibly", 1.0: "yes"}
-
-# Custom Dataset class
-
-class AbuseDataset(Dataset):
-    def __init__(self, texts, labels, tokenizer):
-        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=512)
-        self.labels = labels
-
-    def __len__(self):
-        return len(self.labels)
-
-    def __getitem__(self, idx):
-        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
-        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
-        return item
-    def __getitem__(self, idx):
-        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
-        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
-        return item
-
-
-# Convert label values to soft scores: "yes" = 1.0, "plausibly" = 0.5, others = 0.0
-def label_row_soft(row):
-    labels = []
-    for col in label_columns:
-        val = str(row[col]).strip().lower()
-        if val == "yes":
-            labels.append(1.0)
-        elif val == "plausibly":
-            labels.append(0.5)
-        else:
-            labels.append(0.0)
-    return labels
-
-# Function to map probabilities to 3 classes
-# (0.0, 0.5, 1.0) based on thresholds
-def map_to_3_classes(prob_array, low, high):
-    """Map probabilities to 0.0, 0.5, 1.0 using thresholds."""
-    mapped = np.zeros_like(prob_array)
-    mapped[(prob_array > low) & (prob_array <= high)] = 0.5
-    mapped[prob_array > high] = 1.0
-    return mapped
-
-def convert_to_label_strings(array):
-    """Convert float label array to list of strings."""
-    return [label_map[val] for val in array.flatten()]
-
-def tune_thresholds(probs, true_labels, verbose=True):
-    """Search for best (low, high) thresholds by macro F1 score."""
-    best_macro_f1 = 0.0
-    best_low, best_high = 0.0, 0.0
-
-    for low in np.arange(0.2, 0.5, 0.05):
-        for high in np.arange(0.55, 0.8, 0.05):
-            if high <= low:
-                continue
-
-            pred_soft = map_to_3_classes(probs, low, high)
-            pred_str = convert_to_label_strings(pred_soft)
-            true_str = convert_to_label_strings(true_labels)
-
-            _, _, f1, _ = precision_recall_fscore_support(
-                true_str, pred_str,
-                labels=["no", "plausibly", "yes"],
-                average="macro",
-                zero_division=0
-            )
-            if verbose:
-                logger.info(f"low={low:.2f}, high={high:.2f} -> macro F1={f1:.3f}")
-            if f1 > best_macro_f1:
-                best_macro_f1 = f1
-                best_low, best_high = low, high
-
-    return best_low, best_high, best_macro_f1
 
 def evaluate_model_with_thresholds(trainer, test_dataset):
     """Run full evaluation with automatic threshold tuning."""
@@ -198,8 +133,6 @@ def evaluate_saved_model(progress=gr.Progress(track_tqdm=True)):
         eval_dataset=test_dataset
     )
 
-    label_map = {0.0: "no", 0.5: "plausibly", 1.0: "yes"}
-
    # Re-yield from generator
    for line in evaluate_model_with_thresholds(trainer, test_dataset):
        yield line
utils.py ADDED
@@ -0,0 +1,85 @@
+import numpy as np
+from sklearn.metrics import precision_recall_fscore_support
+import torch
+from torch.utils.data import Dataset
+
+# Custom Dataset class
+class AbuseDataset(Dataset):
+    def __init__(self, texts, labels, tokenizer):
+        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=512)
+        self.labels = labels
+
+    def __len__(self):
+        return len(self.labels)
+
+    def __getitem__(self, idx):
+        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
+        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
+        return item
+    def __getitem__(self, idx):
+        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
+        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
+        return item
+
+
+# Label map used across modules
+label_map = {
+    0.0: "no",
+    0.5: "plausibly",
+    1.0: "yes"
+}
+
+# Function to map probabilities to 3 classes
+# (0.0, 0.5, 1.0) based on thresholds
+def map_to_3_classes(prob_array, low, high):
+    """Map probabilities to 0.0, 0.5, 1.0 using thresholds."""
+    mapped = np.zeros_like(prob_array)
+    mapped[(prob_array > low) & (prob_array <= high)] = 0.5
+    mapped[prob_array > high] = 1.0
+    return mapped
+
+def convert_to_label_strings(array):
+    """Convert float label array to list of strings."""
+    return [label_map[val] for val in array.flatten()]
+
+def tune_thresholds(probs, true_labels, verbose=True):
+    """Search for best (low, high) thresholds by macro F1 score."""
+    best_macro_f1 = 0.0
+    best_low, best_high = 0.0, 0.0
+
+    for low in np.arange(0.2, 0.5, 0.05):
+        for high in np.arange(0.55, 0.8, 0.05):
+            if high <= low:
+                continue
+
+            pred_soft = map_to_3_classes(probs, low, high)
+            pred_str = convert_to_label_strings(pred_soft)
+            true_str = convert_to_label_strings(true_labels)
+
+            _, _, f1, _ = precision_recall_fscore_support(
+                true_str, pred_str,
+                labels=["no", "plausibly", "yes"],
+                average="macro",
+                zero_division=0
+            )
+            if verbose:
+                print(f"low={low:.2f}, high={high:.2f} -> macro F1={f1:.3f}")
+            if f1 > best_macro_f1:
+                best_macro_f1 = f1
+                best_low, best_high = low, high
+
+    return best_low, best_high, best_macro_f1
+
+# Convert label values to soft scores: "yes" = 1.0, "plausibly" = 0.5, others = 0.0
+def label_row_soft(row):
+    labels = []
+    for col in label_columns:
+        val = str(row[col]).strip().lower()
+        if val == "yes":
+            labels.append(1.0)
+        elif val == "plausibly":
+            labels.append(0.5)
+        else:
+            labels.append(0.0)
+    return labels
+
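As a quick illustration of how the new threshold helpers fit together (grid-search (low, high) thresholds by macro F1, then map probabilities to the three soft classes), here is a small, self-contained usage sketch; it assumes utils.py is importable, and the toy arrays are invented for demonstration. Note that label_row_soft refers to a label_columns name that utils.py itself does not define, so the example sticks to the threshold helpers.

import numpy as np
from utils import tune_thresholds, map_to_3_classes, convert_to_label_strings

# Toy data: 4 samples x 2 label columns of sigmoid probabilities,
# with "true" soft labels drawn from {0.0, 0.5, 1.0}.
probs = np.array([[0.10, 0.80], [0.45, 0.30], [0.70, 0.60], [0.20, 0.90]])
true_labels = np.array([[0.0, 1.0], [0.5, 0.0], [1.0, 0.5], [0.0, 1.0]])

low, high, best_f1 = tune_thresholds(probs, true_labels, verbose=False)
preds = map_to_3_classes(probs, low, high)   # values in {0.0, 0.5, 1.0}
print(convert_to_label_strings(preds), f"macro F1={best_f1:.3f}")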