rajatsingh0702 committed on
Commit
adc3811
·
verified ·
1 Parent(s): 509a336

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +496 -496
app.py CHANGED
@@ -1,497 +1,497 @@
1
- import os
2
- import torch
3
- import shutil
4
- import pickle
5
- import zipfile
6
- import matplotlib
7
- import numpy as np
8
- import gradio as gr
9
- import pandas as pd
10
- import torch.nn as nn
11
- import torch.optim as optim
12
- import torch.nn.functional as F
13
- from matplotlib import pyplot as plt
14
- from sklearn.impute import SimpleImputer
15
- from pandas.plotting import andrews_curves
16
- from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, precision_score, recall_score, f1_score
17
- from sklearn.preprocessing import StandardScaler
18
- from torch.utils.data import DataLoader, TensorDataset, Dataset
19
-
20
- ###################################################### Preprocessing #####################################################################
21
def preprocess_dataframe(df, target_column=None, fill_method='mean', drop_na=True, sequence_length=32, test_size=0.2, batch_size = 128):
    """
    Clean a DataFrame and wrap it in DataLoaders for the LSTM autoencoder.

    Steps:
        1. Resolve the target column (ignored when it is not a column of ``df``).
        2. Optionally drop every row that contains a NaN.
        3. Drop non-numeric columns (the target column is always kept).
        4. Impute NaNs that remain in numeric columns using ``fill_method``.
        5. Replace +/-inf with NaN and drop the affected rows.
        6. Standard-scale the features and build the DataLoaders.

    With a target column, rows whose label equals ``'benign'``
    (case-insensitive) form the training set and all other rows form the test
    set; the scaler is fitted on the benign rows only. Without a target
    column every row is used for training and no test loader is built.

    The caller's DataFrame is never modified.

    Args:
        df (pandas.DataFrame): Raw input data.
        target_column (str, optional): Name of the label column. Defaults to None.
        fill_method (str, optional): 'mean', 'median', 'most_frequent' or
            'constant' (constant fills with 0). Only validated when NaNs are
            actually present after step 3. Defaults to 'mean'.
        drop_na (bool, optional): Drop rows with any NaN before imputing. Defaults to True.
        sequence_length (int): Number of consecutive rows per training window.
        test_size (float): Currently unused; kept for backward compatibility.
        batch_size (int): Batch size of the returned DataLoaders.

    Returns:
        dict: with keys
            - 'train_loader': DataLoader yielding (window, window) pairs of
              shape (batch, sequence_length, n_features).
            - 'test_loader': DataLoader yielding (row, row) pairs of the
              non-benign rows, or None when there are none / no target column.
            - 'input_df': feature DataFrame (target column removed).
            - 'target_df': 0/1 Series (0 = benign) or None.
            - 'malinput_df' / 'beninput_df': feature rows per class, or None.
            - 'target_col': resolved target column name or None.
            - 'scaler': the fitted StandardScaler.
            - 'sample': up to 500 rows per class with a 'Label' column, for plotting.

    Raises:
        ValueError: If ``fill_method`` is invalid while NaNs need imputing, or
            if fewer than ``sequence_length`` training rows remain.
    """
    # 1. Target Column Check
    target_col = None
    if target_column:
        if target_column in df.columns:
            target_col = target_column
            print(f"Target column '{target_column}' found.")
        # A missing target column silently falls back to autoencoder mode.
    else:
        print("No target column specified. Treating as autoencoder.")

    # 2. Drop rows with NaNs before fill. Either branch yields a frame that is
    #    private to this function, so later assignments cannot leak to the caller.
    if drop_na:
        print("Dropping rows with any NaN values...")
        df = df.dropna()
    else:
        df = df.copy()

    # 3. Drop Non-Numeric Columns (Except Target)
    columns_to_drop = [
        col for col in df.columns
        if col != target_col and not pd.api.types.is_numeric_dtype(df[col])
    ]
    if columns_to_drop:
        print(f"Dropping non-numeric columns: {columns_to_drop}")
        df = df.drop(columns=columns_to_drop)
    else:
        print("No non-numeric columns found.")

    # 4. Handle Missing Values (only in numeric columns, after dropping)
    numeric_cols = df.select_dtypes(include=np.number).columns
    if df[numeric_cols].isnull().any().any():
        print("Handling missing values...")
        if fill_method not in ('mean', 'median', 'most_frequent', 'constant'):
            raise ValueError("Invalid fill_method. Choose 'mean', 'median', 'most_frequent', or 'constant'.")
        if fill_method == 'constant':
            # 'constant' needs an explicit fill value.
            imputer = SimpleImputer(strategy=fill_method, fill_value=0)
        else:
            imputer = SimpleImputer(strategy=fill_method)
        df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

    # 5. Infinite values cannot be scaled; treat them as missing and drop the rows.
    df = df.replace([np.inf, -np.inf], np.nan).dropna()

    # 6. Split features / labels and draw the plotting sample.
    if target_col:
        inputdf = df.drop(columns=[target_col])
        # str() keeps numeric or mixed-type labels from crashing on .lower().
        outputdf = df[target_col].apply(lambda label: 0 if str(label).strip().lower() == 'benign' else 1)
        malinputdf = inputdf[outputdf == 1]
        beninputdf = inputdf[outputdf == 0]
        sample_size = min(len(beninputdf), len(malinputdf), 500)
        bensample = beninputdf.sample(n=sample_size, random_state=42).assign(Label='Benign')
        malsample = malinputdf.sample(n=sample_size, random_state=42).assign(Label='Malicious')
        sample = pd.concat([bensample, malsample])
        train_frame = beninputdf
    else:
        inputdf = df
        outputdf = None
        malinputdf = None
        beninputdf = None
        sample_size = min(len(inputdf), 500)
        sample = df.sample(n=sample_size, random_state=42).assign(Label="dummy_class")
        train_frame = inputdf

    if len(train_frame) < sequence_length:
        raise ValueError(
            f"Need at least sequence_length={sequence_length} training rows after "
            f"preprocessing, got {len(train_frame)}."
        )

    # The scaler is fitted on the training rows only and reused for the test rows.
    scaler = StandardScaler()
    X_train = torch.tensor(scaler.fit_transform(train_frame.values), dtype=torch.float32)

    class TabularDatasetTest(Dataset):
        """Row-wise dataset: each item is (row, row) for reconstruction loss."""

        def __init__(self, data):
            self.data = data.clone().detach()

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            return self.data[idx], self.data[idx]

    class TabularDatasetTrain(Dataset):
        """Sliding-window dataset: each item is (window, window) of consecutive rows."""

        def __init__(self, data, sequence_length):
            self.data = data.clone().detach()
            self.sequence_length = sequence_length

        def __len__(self):
            return len(self.data) - self.sequence_length + 1

        def __getitem__(self, idx):
            window = self.data[idx:idx + self.sequence_length]
            return window, window

    train_dataset = TabularDatasetTrain(X_train, sequence_length = sequence_length)
    train_DataLoader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    test_Dataloader = None
    if target_col and len(malinputdf) > 0:
        # Skipped for an all-benign dataset: the scaler cannot transform zero rows.
        X_test = torch.tensor(scaler.transform(malinputdf.values), dtype=torch.float32)
        test_dataset = TabularDatasetTest(X_test)
        test_Dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return {
        'train_loader': train_DataLoader,
        'test_loader': test_Dataloader,
        'input_df': inputdf,
        'target_df': outputdf,
        'malinput_df': malinputdf,
        'beninput_df': beninputdf,
        'target_col': target_col,
        'scaler': scaler,
        'sample': sample
    }
181
- ################################################## Model #############################################################################
182
class EncoderRNN(nn.Module):
    """
    Encoder half of the autoencoder.

    A bidirectional LSTM (``lstm1``) with ``int(hidden_size/2)`` units per
    direction feeds, through dropout and ReLU, a second LSTM (``lstm2``) whose
    hidden size is the bottleneck ``int(input_size/2)``.

    Args:
        input_size: Number of features per time step.
        hidden_size: Combined width of the bidirectional layer's output.
        num_layers: Number of stacked layers in each LSTM.
        isCuda: Stored on the instance; not otherwise used by this class.
    """

    def __init__(self, input_size, hidden_size, num_layers, isCuda):
        super().__init__()
        per_direction = int(hidden_size/2)
        bottleneck = int(input_size/2)

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bottleneck_size = bottleneck
        self.isCuda = isCuda

        # Sub-modules are registered in the same order as before so that
        # parameter ordering and seeded initialisation stay unchanged.
        self.lstm1 = nn.LSTM(
            input_size,
            per_direction,
            num_layers,
            batch_first=True,
            bidirectional=True,
        )
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.lstm2 = nn.LSTM(hidden_size, bottleneck, num_layers, batch_first=True)

    def forward(self, inputs):
        """
        Encode a batch-first sequence.

        Returns:
            tuple: (bottleneck sequence from ``lstm2``, the dropout+ReLU
            activations of ``lstm1`` that the decoder reuses as a skip input).
        """
        first_pass, _ = self.lstm1(inputs)
        skip_features = self.relu(self.dropout(first_pass))
        bottleneck_seq, _ = self.lstm2(skip_features)
        return bottleneck_seq, skip_features
202
-
203
class DecoderRNN(nn.Module):
    """
    Decoder half of the autoencoder.

    ``lstm2`` expands the bottleneck sequence back to ``hidden_size``; its
    output (after dropout) is concatenated on the feature axis with the
    encoder's intermediate activations, passed through ReLU, and mapped to
    ``output_size`` features by ``lstm1``.

    Args:
        hidden_size: Width of the expanded representation.
        output_size: Number of reconstructed features per time step.
        num_layers: Number of stacked layers in each LSTM.
        isCuda: Stored on the instance; not otherwise used by this class.
    """

    def __init__(self, hidden_size, output_size, num_layers, isCuda):
        super().__init__()
        bottleneck = int(output_size/2)

        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.bottleneck_size = bottleneck
        self.isCuda = isCuda

        # Registration order (lstm2, relu, dropout, lstm1) is kept as-is.
        self.lstm2 = nn.LSTM(bottleneck, hidden_size, num_layers, batch_first=True)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        # Input is the expanded sequence concatenated with the skip features.
        self.lstm1 = nn.LSTM(2*hidden_size, output_size, num_layers, batch_first=True)

    def forward(self, encoded_input, intermediate_state):
        """Reconstruct the sequence from the bottleneck and the encoder skip features."""
        expanded, _ = self.lstm2(encoded_input)
        merged = torch.cat((self.dropout(expanded), intermediate_state), dim=2)
        reconstruction, _ = self.lstm1(self.relu(merged))
        return reconstruction
224
-
225
class LSTMAE(nn.Module):
    """
    LSTM autoencoder built from ``EncoderRNN`` and ``DecoderRNN``.

    Args:
        input_size: Number of features per time step (also the output width).
        hidden_size: Width of the hidden representation; an odd value is
            rounded up to the next even number because the bidirectional
            encoder layer splits it in two.
        num_layers: Number of stacked layers in every LSTM.
        isCuda: Passed through to the encoder and decoder.
    """

    def __init__(self, input_size, hidden_size, num_layers=1, isCuda="cuda" if torch.cuda.is_available() else "cpu"):
        super().__init__()
        hidden_size = hidden_size if hidden_size % 2 == 0 else hidden_size + 1
        self.encoder = EncoderRNN(input_size, hidden_size, num_layers, isCuda)
        self.decoder = DecoderRNN(hidden_size, input_size, num_layers, isCuda)
        self.initialize_weights()

    def initialize_weights(self):
        """
        Re-initialise every LSTM in the network.

        Input-to-hidden and hidden-to-hidden weight matrices get Xavier-uniform
        initialisation; biases are set to zero. Only ``nn.LSTM`` modules are
        touched.
        """
        for module in self.modules():
            if not isinstance(module, nn.LSTM):
                continue
            for name, param in module.named_parameters():
                if 'weight' in name:
                    # The original test `'ih' or 'hh' in name` was always true
                    # because the non-empty string 'ih' is truthy.
                    if 'ih' in name or 'hh' in name:
                        nn.init.xavier_uniform_(param.data)
                elif 'bias' in name:
                    nn.init.zeros_(param.data)

    def forward(self, input):
        """Encode ``input`` and return its reconstruction."""
        encoded_input, intermediate_state = self.encoder(input)
        return self.decoder(encoded_input, intermediate_state)
251
-
252
-
253
-
254
- ############################################## Andrews Curves ###########################################################################
255
def make_better_andrews_curves(df, class_column, colors=None, plot_title="Andrews Curves",
                               line_width=0.8, transparency=0.5, sample_size=None, legend_loc='best',
                               custom_labels=None, x_axis_ticks=None, x_axis_labels=None,
                               figsize=(10, 6), dpi=300, name = "andrews_curves"):
    """
    Render an Andrews Curves plot with enhanced styling and save it as a PNG.

    The figure is written to ``f"{name}.png"`` and then closed, so repeated
    calls in a long-running process do not accumulate open figures.

    Args:
        df: pandas DataFrame containing the data.
        class_column: Name of the column containing class labels.
        colors: List of colors to use for each class (e.g., ['blue', 'red']). Defaults to matplotlib's defaults if None.
        plot_title: Title of the plot.
        line_width: Width of the lines.
        transparency: Alpha value (transparency) of the lines.
        sample_size: If an integer is provided, a random sample of the data will be used. Useful for large datasets.
        legend_loc: Location of the legend (e.g., 'best', 'upper right', 'lower left').
        custom_labels: A dictionary mapping original class labels to more descriptive labels for the legend.
        x_axis_ticks: A list of tick positions for the x-axis. If None or empty, default ticks are used.
        x_axis_labels: A list of labels for the x-axis ticks. Must be the same length as x_axis_ticks.
        figsize: Tuple specifying the figure size (width, height) in inches.
        dpi: Resolution of the saved image.
        name: Output path without the ".png" extension.
    """
    if sample_size and sample_size < len(df):
        df = df.sample(n=sample_size, random_state=42)  # Sample for faster plotting

    fig = plt.figure(figsize=figsize)  # Set the figure size before plotting
    try:
        # andrews_curves draws on the current axes, i.e. on the figure just created.
        ax = andrews_curves(df, class_column, color=colors)

        plt.title(plot_title, fontsize=16)
        plt.xlabel("t", fontsize=12)
        plt.ylabel("f(t)", fontsize=12)

        for line in ax.get_lines():
            line.set_linewidth(line_width)
            line.set_alpha(transparency)

        # Customize legend
        if custom_labels:
            handles, labels = ax.get_legend_handles_labels()
            # .get() leaves labels without a custom replacement untouched.
            new_labels = [custom_labels.get(label, label) for label in labels]
            ax.legend(handles, new_labels, loc=legend_loc, fontsize=10)
        else:
            plt.legend(loc=legend_loc, fontsize=10)

        # Customize x-axis ticks and labels; the explicit length check also
        # accepts array-like ticks, whose truth value is ambiguous.
        if x_axis_ticks is not None and len(x_axis_ticks) > 0:
            plt.xticks(x_axis_ticks, x_axis_labels)

        plt.grid(False)  # Grid lines are switched off
        plt.tight_layout()  # Adjust layout to prevent labels from overlapping
        plt.savefig(f"{name}.png", dpi=dpi)
    finally:
        plt.close(fig)
308
- ################################################# Model Training ######################################################################
309
def train_model(model, train_loader, test_loader = None, learning_rate=0.001, epochs=10):
    """
    Train ``model`` to reconstruct its input with MSE loss and Adam.

    Batches are moved to the device the model's parameters live on (the
    original code referenced an undefined ``device`` name).

    Args:
        model: ``nn.Module`` mapping a batch to a reconstruction of the same shape.
        train_loader: Sized iterable of ``(inputs, targets)`` batches.
        test_loader: Optional iterable of ``(inputs, targets)`` batches. A
            length-1 sequence axis is inserted at dim 1 before the forward
            pass and removed again afterwards.
        learning_rate: Adam learning rate.
        epochs: Number of passes over ``train_loader``.

    Returns:
        tuple: ``(model, train_loss_df, mse_losses, info)`` where
            - ``train_loss_df`` has one column per epoch ("Epoch N") holding
              the per-batch training losses,
            - ``mse_losses`` holds the last epoch's element-wise squared
              errors averaged over the last dimension (empty if ``epochs`` is 0),
            - ``info`` is the per-epoch log text, one line per epoch.
    """
    criterion = nn.MSELoss()
    info = ""
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    device = next(model.parameters()).device
    train_loss_data = {}
    mse_losses = []  # defined up front so epochs == 0 still returns cleanly

    for epoch in range(epochs):
        model.train()
        is_last_epoch = epoch == epochs - 1
        epoch_train_losses = []
        mse_losses = []

        for inputs, targets in train_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if is_last_epoch:
                # Per-sample reconstruction errors; used later for thresholds.
                mse_loss = F.mse_loss(targets, outputs.detach(), reduction='none')
                mse_losses.extend(mse_loss.mean(dim=-1).tolist())
            epoch_train_losses.append(loss.item())

        # max(..., 1) keeps an empty loader from dividing by zero.
        train_loss = sum(epoch_train_losses) / max(len(epoch_train_losses), 1)

        # Validation
        test_loss = 0.0
        if test_loader:
            model.eval()
            with torch.no_grad():
                for inputs, targets in test_loader:
                    inputs = inputs.to(device)
                    targets = targets.to(device)
                    outputs = model(inputs.unsqueeze(1))
                    test_loss += criterion(outputs.squeeze(1), targets).item()
            test_loss /= len(test_loader)

        epoch_line = f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}"
        info += epoch_line + "\n"
        print(epoch_line)
        train_loss_data[f'Epoch {epoch + 1}'] = epoch_train_losses

    # Wrapping in Series lets epochs with different batch counts share a frame.
    train_loss_df = pd.DataFrame({k: pd.Series(v) for k, v in train_loss_data.items()})
    return model, train_loss_df, mse_losses, info
359
-
360
- #########################################################################################################################################
361
def detect_anomalies(csv_file, sample_choice="Custom Data", data_slicing_percentage=80, epochs=3, threshold_factor=1.0):
    """
    Gradio callback: train on an uploaded CSV or score a bundled sample set.

    * ``sample_choice == "Custom Data"``: load the uploaded CSV, preprocess it,
      train a fresh ``LSTMAE`` and save the model, scaler, dataset, Andrews
      curves, loss curves and the Q1/Q3 of the last-epoch losses.
      ``threshold_factor`` is not used in this branch.
    * any other choice: load ``Data/<sample_choice>``, score it with the
      bundled ``model.pth`` and flag rows whose reconstruction loss exceeds
      ``Q3 + threshold_factor * IQR``; report the metrics and a confusion matrix.

    Everything is written to a freshly created ``Results`` directory, which is
    then zipped into ``Results.zip``.

    Args:
        csv_file: Uploaded file (an object with a ``.name`` path, or a path
            string). Only needed for "Custom Data".
        sample_choice: "Custom Data" or the file name of a bundled sample.
        data_slicing_percentage: Percentage of rows to sample from the dataset.
        epochs: Training epochs for the custom-data branch.
        threshold_factor: Multiplier of the IQR in the anomaly threshold.

    Returns:
        tuple: (summary text, list of image paths, path to the zip archive).

    Raises:
        ValueError: If "Custom Data" is selected but no file was uploaded.
    """
    images = []
    anomaly_summary = ""
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Start every run from an empty output directory.
    if os.path.exists("Results"):
        shutil.rmtree("Results")
    os.mkdir("Results")

    if sample_choice == "Custom Data":
        if csv_file is None:
            raise ValueError("Please upload a CSV file when 'Custom Data' is selected.")
        # Accept both an uploaded-file object and a plain path string.
        csv_path = getattr(csv_file, "name", csv_file)

        anomaly_summary += f"[INFO] Loading Custom Dataset {data_slicing_percentage}%...\n"
        dataframe = pd.read_csv(csv_path).sample(frac=data_slicing_percentage/100, random_state=42).reset_index(drop=True)
        anomaly_summary += "[INFO] Preprocessing Dataset...\n"
        if 'Label' in dataframe.columns:
            processed_data = preprocess_dataframe(dataframe, target_column="Label")
        else:
            processed_data = preprocess_dataframe(dataframe)
            anomaly_summary += "[WARNING] No Label Column Found, Using Unsupervised Learning...\n"

        anomaly_summary += "[INFO] Generating Andrews Curves...\n"
        make_better_andrews_curves(processed_data['sample'], 'Label',
                                   colors=['Blue', 'Red'],
                                   plot_title="Dataset Andrews Curves",
                                   line_width=1.2,
                                   transparency=0.7,
                                   legend_loc='upper right',
                                   figsize=(12, 7),
                                   name = "Results/Dataset_andrews_curves")
        images.append("Results/Dataset_andrews_curves.png")

        model = LSTMAE(len(processed_data["input_df"].columns), 128).to(device)
        anomaly_summary += "[INFO] Training Model...\n"
        _, train_loss_df, mse_losses, info = train_model(model, processed_data['train_loader'], processed_data['test_loader'], epochs=epochs)
        anomaly_summary += info

        anomaly_summary += "[INFO] Saving model, scaler, Dataset Used...\n"
        dataframe.to_csv('Results/Original_dataset.csv', columns=dataframe.columns, index=False)
        with open('Results/scaler.pkl', 'wb') as scaler_file:
            pickle.dump(processed_data['scaler'], scaler_file)
        torch.save(model, 'Results/model.pth')

        anomaly_summary += "[INFO] Generating Loss Curves...\n"
        loss_fig = plt.figure(figsize=(12, 6))
        try:
            for column in train_loss_df.columns:
                plt.plot(train_loss_df[column], label=column)
            plt.xlabel("Batch")
            plt.ylabel("Loss")
            plt.title("Training Loss per Epoch")
            plt.legend()  # One legend entry per epoch
            plt.grid(True)
            plt.tight_layout()
            plt.savefig("Results/loss_curves.png", dpi=300)
        finally:
            plt.close(loss_fig)
        images.append("Results/loss_curves.png")

        # Persist the quartiles of the last-epoch losses alongside the model.
        Q1, Q3 = np.percentile(mse_losses, [25, 75])
        with open('Results/INFO.pkl', 'wb') as info_file:
            pickle.dump({"Q1": Q1, "Q3": Q3}, info_file)

    else:
        # NOTE(review): hard-coded loss quartiles, presumably measured when the
        # bundled model.pth was trained -- confirm before retraining that model.
        Q1, Q3 = 0.19226229563355446, 0.7454282641410828
        IQR = Q3 - Q1
        upper_bound = Q3 + threshold_factor * IQR

        data_path = os.path.join(os.path.abspath('Data'), sample_choice)
        dataframe = pd.read_csv(data_path).sample(frac=data_slicing_percentage/100, random_state=42).reset_index(drop=True)

        anomaly_summary += "[INFO] Saving model, scaler, Dataset Used...\n"
        dataframe.to_csv('Results/Scaled_dataset.csv', columns=dataframe.columns, index=False)
        # scaler.pkl / model.pth ship with the app; never load user-supplied
        # pickles this way, unpickling executes arbitrary code.
        with open('scaler.pkl', 'rb') as scaler_file:
            scaler = pickle.load(scaler_file)
        # The bundled samples are stored scaled, with 'Label' as the last column.
        original_df = scaler.inverse_transform(dataframe.iloc[:, :-1])
        original_df = pd.DataFrame(original_df, columns=dataframe.columns[:-1])
        original_df['Label'] = dataframe['Label']
        original_df.to_csv('Results/Original_dataset.csv', columns=dataframe.columns, index=False)
        shutil.copy('scaler.pkl', 'Results/scaler.pkl')
        shutil.copy('model.pth', 'Results/model.pth')

        # Andrews curves of the dataset
        anomaly_summary += "[INFO] Generating Andrews Curves...\n"
        make_better_andrews_curves(dataframe, 'Label',
                                   colors=['Blue', 'Red'],
                                   plot_title="Dataset Andrews Curves",
                                   line_width=1.2,
                                   transparency=0.7,
                                   legend_loc='upper right',
                                   figsize=(12, 7),
                                   name = "Results/Dataset_andrews_curves")
        images.append("Results/Dataset_andrews_curves.png")

        inputdf = torch.tensor(dataframe.iloc[:, :-1].to_numpy(), dtype=torch.float32, device=device)
        outputdf = dataframe['Label']
        model = torch.load("model.pth", weights_only = False, map_location=device)
        model.eval()
        with torch.no_grad():  # inference only; no autograd graph needed
            outputs = model(inputdf.unsqueeze(1)).squeeze(1)
            mse_loss = F.mse_loss(outputs, inputdf, reduction='none')
            mse_loss_per_data_point = mse_loss.mean(dim=-1)

        anomaly_scores = pd.DataFrame({'Loss': mse_loss_per_data_point.cpu().numpy(), 'Label': outputdf})
        anomaly_scores['Anomaly'] = anomaly_scores['Loss'].apply(lambda x: 1 if x > upper_bound else 0)
        anomaly_scores['Label'] = anomaly_scores['Label'].apply(lambda x: 1 if x == "Malicious" else 0)

        y_true = anomaly_scores['Label']
        y_pred = anomaly_scores['Anomaly']
        # labels=[0, 1] keeps the matrix 2x2 even for single-class samples,
        # which the two display labels below require.
        out_confusion_matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])
        disp = ConfusionMatrixDisplay(confusion_matrix=out_confusion_matrix, display_labels=["Benign","Malignant"])
        disp.plot(cmap=plt.cm.Blues)
        try:
            disp.ax_.set_title('Confusion Matrix')
            disp.figure_.savefig("Results/confusion_matrix.png", dpi=300)
        finally:
            plt.close(disp.figure_)
        images.append("Results/confusion_matrix.png")

        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred, zero_division=0)
        recall = recall_score(y_true, y_pred, zero_division=0)
        f1 = f1_score(y_true, y_pred, zero_division=0)
        anomaly_summary += f"[RESULT] Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}\n"
        anomaly_summary += f"Confusion Matrix:\n{out_confusion_matrix}\n"

    # Bundle everything under Results/ into a single downloadable archive.
    folder_path = "Results"
    with zipfile.ZipFile("Results.zip", 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(folder_path):
            for file in files:
                file_path = os.path.join(root, file)
                zipf.write(file_path, os.path.relpath(file_path, folder_path))

    return anomaly_summary, images, "Results.zip"
473
-
474
# Gradio UI: wires detect_anomalies to a file upload, a dataset selector and
# three sliders, and shows the summary text, the generated plots and the
# downloadable results archive.
iface = gr.Interface(
    fn=detect_anomalies,
    inputs=[
        gr.File(file_types=[".csv"], label="Upload CSV File"),
        gr.Radio(["Benign500.csv", "Malignant500.csv", "Balance1000.csv", "Custom Data"], value="Custom Data", label="Choose Samples or CustomData"),
        gr.Slider(minimum=10, maximum=100, step=10, value=80, label="Data Usage Percentage (Training or Detection)"),
        gr.Slider(minimum=1, maximum=20, step=1, value=3, label="Training Epochs (Default value is 3)"),
        gr.Slider(minimum=0, maximum=5, step=0.5, value=1.5, label="Loss Threshold (x, higher x means high threshold) = Q3 + x*IQR"),
    ],
    outputs=[
        gr.Textbox(label="Anomaly Summary"),
        gr.Gallery(label="Anomaly Plots"),
        "file",
    ],
    title="Your own Anomaly Detector",
    # The description names the artefacts that detect_anomalies actually
    # produces (Results.zip containing model.pth and scaler.pkl).
    description="""
### Fully Unsupervised Anomaly Detection Tool (uses Bidirectional based Autoencoder with skip conn. and Dropout Layers)
##### Download *"Results.zip"* (contains model.pth, scaler.pkl, the datasets and the output images) from the bottom right to get the results.
Upload a *CSV file* (custom anomaly detection; optionally include an output column named "Label"), or use *our trained model*.
"""
)

if __name__ == "__main__":
    iface.launch(debug=False)
 
1
+ import os
2
+ import torch
3
+ import shutil
4
+ import pickle
5
+ import zipfile
6
+ import matplotlib
7
+ import numpy as np
8
+ import gradio as gr
9
+ import pandas as pd
10
+ import torch.nn as nn
11
+ import torch.optim as optim
12
+ import torch.nn.functional as F
13
+ from matplotlib import pyplot as plt
14
+ from sklearn.impute import SimpleImputer
15
+ from pandas.plotting import andrews_curves
16
+ from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, precision_score, recall_score, f1_score
17
+ from sklearn.preprocessing import StandardScaler
18
+ from torch.utils.data import DataLoader, TensorDataset, Dataset
19
+
20
+ ###################################################### Preprocessing #####################################################################
21
def preprocess_dataframe(df, target_column=None, fill_method='mean', drop_na=True, sequence_length=32, test_size=0.2, batch_size = 128):
    """
    Clean a DataFrame and wrap it in DataLoaders for the LSTM autoencoder.

    Steps:
        1. Resolve the target column (ignored when it is not a column of ``df``).
        2. Optionally drop every row that contains a NaN.
        3. Drop non-numeric columns (the target column is always kept).
        4. Impute NaNs that remain in numeric columns using ``fill_method``.
        5. Replace +/-inf with NaN and drop the affected rows.
        6. Standard-scale the features and build the DataLoaders.

    With a target column, rows whose label equals ``'benign'``
    (case-insensitive) form the training set and all other rows form the test
    set; the scaler is fitted on the benign rows only. Without a target
    column every row is used for training and no test loader is built.

    The caller's DataFrame is never modified.

    Args:
        df (pandas.DataFrame): Raw input data.
        target_column (str, optional): Name of the label column. Defaults to None.
        fill_method (str, optional): 'mean', 'median', 'most_frequent' or
            'constant' (constant fills with 0). Only validated when NaNs are
            actually present after step 3. Defaults to 'mean'.
        drop_na (bool, optional): Drop rows with any NaN before imputing. Defaults to True.
        sequence_length (int): Number of consecutive rows per training window.
        test_size (float): Currently unused; kept for backward compatibility.
        batch_size (int): Batch size of the returned DataLoaders.

    Returns:
        dict: with keys
            - 'train_loader': DataLoader yielding (window, window) pairs of
              shape (batch, sequence_length, n_features).
            - 'test_loader': DataLoader yielding (row, row) pairs of the
              non-benign rows, or None when there are none / no target column.
            - 'input_df': feature DataFrame (target column removed).
            - 'target_df': 0/1 Series (0 = benign) or None.
            - 'malinput_df' / 'beninput_df': feature rows per class, or None.
            - 'target_col': resolved target column name or None.
            - 'scaler': the fitted StandardScaler.
            - 'sample': up to 500 rows per class with a 'Label' column, for plotting.

    Raises:
        ValueError: If ``fill_method`` is invalid while NaNs need imputing, or
            if fewer than ``sequence_length`` training rows remain.
    """
    # 1. Target Column Check
    target_col = None
    if target_column:
        if target_column in df.columns:
            target_col = target_column
            print(f"Target column '{target_column}' found.")
        # A missing target column silently falls back to autoencoder mode.
    else:
        print("No target column specified. Treating as autoencoder.")

    # 2. Drop rows with NaNs before fill. Either branch yields a frame that is
    #    private to this function, so later assignments cannot leak to the caller.
    if drop_na:
        print("Dropping rows with any NaN values...")
        df = df.dropna()
    else:
        df = df.copy()

    # 3. Drop Non-Numeric Columns (Except Target)
    columns_to_drop = [
        col for col in df.columns
        if col != target_col and not pd.api.types.is_numeric_dtype(df[col])
    ]
    if columns_to_drop:
        print(f"Dropping non-numeric columns: {columns_to_drop}")
        df = df.drop(columns=columns_to_drop)
    else:
        print("No non-numeric columns found.")

    # 4. Handle Missing Values (only in numeric columns, after dropping)
    numeric_cols = df.select_dtypes(include=np.number).columns
    if df[numeric_cols].isnull().any().any():
        print("Handling missing values...")
        if fill_method not in ('mean', 'median', 'most_frequent', 'constant'):
            raise ValueError("Invalid fill_method. Choose 'mean', 'median', 'most_frequent', or 'constant'.")
        if fill_method == 'constant':
            # 'constant' needs an explicit fill value.
            imputer = SimpleImputer(strategy=fill_method, fill_value=0)
        else:
            imputer = SimpleImputer(strategy=fill_method)
        df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

    # 5. Infinite values cannot be scaled; treat them as missing and drop the rows.
    df = df.replace([np.inf, -np.inf], np.nan).dropna()

    # 6. Split features / labels and draw the plotting sample.
    if target_col:
        inputdf = df.drop(columns=[target_col])
        # str() keeps numeric or mixed-type labels from crashing on .lower().
        outputdf = df[target_col].apply(lambda label: 0 if str(label).strip().lower() == 'benign' else 1)
        malinputdf = inputdf[outputdf == 1]
        beninputdf = inputdf[outputdf == 0]
        sample_size = min(len(beninputdf), len(malinputdf), 500)
        bensample = beninputdf.sample(n=sample_size, random_state=42).assign(Label='Benign')
        malsample = malinputdf.sample(n=sample_size, random_state=42).assign(Label='Malicious')
        sample = pd.concat([bensample, malsample])
        train_frame = beninputdf
    else:
        inputdf = df
        outputdf = None
        malinputdf = None
        beninputdf = None
        sample_size = min(len(inputdf), 500)
        sample = df.sample(n=sample_size, random_state=42).assign(Label="dummy_class")
        train_frame = inputdf

    if len(train_frame) < sequence_length:
        raise ValueError(
            f"Need at least sequence_length={sequence_length} training rows after "
            f"preprocessing, got {len(train_frame)}."
        )

    # The scaler is fitted on the training rows only and reused for the test rows.
    scaler = StandardScaler()
    X_train = torch.tensor(scaler.fit_transform(train_frame.values), dtype=torch.float32)

    class TabularDatasetTest(Dataset):
        """Row-wise dataset: each item is (row, row) for reconstruction loss."""

        def __init__(self, data):
            self.data = data.clone().detach()

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            return self.data[idx], self.data[idx]

    class TabularDatasetTrain(Dataset):
        """Sliding-window dataset: each item is (window, window) of consecutive rows."""

        def __init__(self, data, sequence_length):
            self.data = data.clone().detach()
            self.sequence_length = sequence_length

        def __len__(self):
            return len(self.data) - self.sequence_length + 1

        def __getitem__(self, idx):
            window = self.data[idx:idx + self.sequence_length]
            return window, window

    train_dataset = TabularDatasetTrain(X_train, sequence_length = sequence_length)
    train_DataLoader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    test_Dataloader = None
    if target_col and len(malinputdf) > 0:
        # Skipped for an all-benign dataset: the scaler cannot transform zero rows.
        X_test = torch.tensor(scaler.transform(malinputdf.values), dtype=torch.float32)
        test_dataset = TabularDatasetTest(X_test)
        test_Dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return {
        'train_loader': train_DataLoader,
        'test_loader': test_Dataloader,
        'input_df': inputdf,
        'target_df': outputdf,
        'malinput_df': malinputdf,
        'beninput_df': beninputdf,
        'target_col': target_col,
        'scaler': scaler,
        'sample': sample
    }
181
+ ################################################## Model #############################################################################
182
class EncoderRNN(nn.Module):
    """
    Encoder half of the autoencoder.

    A bidirectional LSTM (``lstm1``) with ``int(hidden_size/2)`` units per
    direction feeds, through dropout and ReLU, a second LSTM (``lstm2``) whose
    hidden size is the bottleneck ``int(input_size/2)``.

    Args:
        input_size: Number of features per time step.
        hidden_size: Combined width of the bidirectional layer's output.
        num_layers: Number of stacked layers in each LSTM.
        isCuda: Stored on the instance; not otherwise used by this class.
    """

    def __init__(self, input_size, hidden_size, num_layers, isCuda):
        super().__init__()
        per_direction = int(hidden_size/2)
        bottleneck = int(input_size/2)

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bottleneck_size = bottleneck
        self.isCuda = isCuda

        # Sub-modules are registered in the same order as before so that
        # parameter ordering and seeded initialisation stay unchanged.
        self.lstm1 = nn.LSTM(
            input_size,
            per_direction,
            num_layers,
            batch_first=True,
            bidirectional=True,
        )
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.lstm2 = nn.LSTM(hidden_size, bottleneck, num_layers, batch_first=True)

    def forward(self, inputs):
        """
        Encode a batch-first sequence.

        Returns:
            tuple: (bottleneck sequence from ``lstm2``, the dropout+ReLU
            activations of ``lstm1`` that the decoder reuses as a skip input).
        """
        first_pass, _ = self.lstm1(inputs)
        skip_features = self.relu(self.dropout(first_pass))
        bottleneck_seq, _ = self.lstm2(skip_features)
        return bottleneck_seq, skip_features
202
+
203
class DecoderRNN(nn.Module):
    """Two-stage LSTM decoder that mirrors the encoder and consumes its skip tensor.

    Args:
        hidden_size: Output width of the first (expanding) LSTM; also the
            expected feature width of the skip tensor passed to ``forward``.
        output_size: Number of features to reconstruct per time step.
        num_layers: Number of stacked layers used by both LSTMs.
        isCuda: Device hint stored on the module; the layers built here do
            not read it.
    """

    def __init__(self, hidden_size, output_size, num_layers, isCuda):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        # Half the feature count, clamped to 1 so a single-feature output does
        # not produce a zero-width bottleneck input.
        self.bottleneck_size = max(1, int(output_size / 2))

        self.isCuda = isCuda
        self.lstm2 = nn.LSTM(self.bottleneck_size, hidden_size, num_layers, batch_first=True)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        # Input is the expanded code concatenated with the skip tensor,
        # hence 2 * hidden_size features.
        self.lstm1 = nn.LSTM(2 * hidden_size, output_size, num_layers, batch_first=True)

    def forward(self, encoded_input, intermediate_state):
        """Reconstruct the sequence from the bottleneck code and the skip tensor.

        Args:
            encoded_input: Bottleneck output of the encoder (batch-first).
            intermediate_state: Skip tensor concatenated on the feature axis
                (dim 2) with the expanded code.

        Returns:
            The output sequence of the final LSTM.
        """
        expanded, _ = self.lstm2(encoded_input)
        merged = torch.cat((self.dropout(expanded), intermediate_state), dim=2)
        merged = self.relu(merged)
        decoded_output, _ = self.lstm1(merged)
        return decoded_output
224
+
225
class LSTMAE(nn.Module):
    """LSTM autoencoder built from ``EncoderRNN`` and ``DecoderRNN``.

    Args:
        input_size: Number of features per time step (also the output width).
        hidden_size: Requested hidden width; rounded up to the next even
            number because the encoder splits it across two directions.
        num_layers: Number of stacked layers in every LSTM.
        isCuda: Device hint forwarded to the encoder and decoder. The default
            is computed once, when the class is defined.
    """

    def __init__(self, input_size, hidden_size, num_layers=1, isCuda="cuda" if torch.cuda.is_available() else "cpu"):
        super(LSTMAE, self).__init__()
        hidden_size = hidden_size if hidden_size % 2 == 0 else hidden_size + 1
        self.encoder = EncoderRNN(input_size, hidden_size, num_layers, isCuda)
        self.decoder = DecoderRNN(hidden_size, input_size, num_layers, isCuda)
        self.initialize_weights()

    def initialize_weights(self):
        """Re-initialise every ``nn.LSTM`` found in this module.

        Input-to-hidden and hidden-to-hidden weight matrices get Xavier
        uniform initialisation; bias vectors are zeroed. Other module types
        are left untouched.
        """
        for m in self.modules():  # Iterate through all modules in the network
            if not isinstance(m, nn.LSTM):
                continue
            for name, param in m.named_parameters():
                # The original test `'ih' or 'hh' in name` was always true;
                # check both substrings explicitly.
                if 'weight' in name and ('ih' in name or 'hh' in name):
                    nn.init.xavier_uniform_(param.data)
                elif 'bias' in name:
                    nn.init.zeros_(param.data)

    def forward(self, input):
        """Run the encoder then the decoder and return the reconstruction."""
        encoded_input, intermediate_state = self.encoder(input)
        decoded_output = self.decoder(encoded_input, intermediate_state)
        return decoded_output
251
+
252
+
253
+
254
+ ############################################## Andrews Curves ###########################################################################
255
def make_better_andrews_curves(df, class_column, colors=None, plot_title="Andrews Curves",
                               line_width=0.8, transparency=0.5, sample_size=None, legend_loc='best',
                               custom_labels=None, x_axis_ticks=None, x_axis_labels=None,
                               figsize=(10, 6), dpi=300, name="andrews_curves"):
    """
    Generates an Andrews Curves plot with enhanced styling and saves it as a PNG.

    Args:
        df: pandas DataFrame containing the data.
        class_column: Name of the column containing class labels.
        colors: List of colors to use for each class (e.g., ['blue', 'red']). Defaults to matplotlib's defaults if None.
        plot_title: Title of the plot.
        line_width: Width of the lines.
        transparency: Alpha value (transparency) of the lines.
        sample_size: If an integer is provided, a random sample of the data will be used. Useful for large datasets.
        legend_loc: Location of the legend (e.g., 'best', 'upper right', 'lower left').
        custom_labels: A dictionary mapping original class labels to more descriptive labels for the legend.
        x_axis_ticks: A list of tick positions for the x-axis. If None or empty, default ticks are used.
        x_axis_labels: A list of labels for the x-axis ticks. Must be the same length as x_axis_ticks.
        figsize: Tuple specifying the figure size (width, height) in inches.
        dpi: Resolution of the saved image.
        name: Output path without extension; the plot is written to ``f"{name}.png"``.

    Returns:
        None. The figure is closed after saving so repeated calls do not
        accumulate open matplotlib figures.
    """
    if sample_size and sample_size < len(df):
        df = df.sample(n=sample_size, random_state=42)  # Sample for faster plotting

    fig = plt.figure(figsize=figsize)  # Set the figure size before plotting
    try:
        ax = andrews_curves(df, class_column, color=colors)  # Draws on the current axes

        plt.title(plot_title, fontsize=16)
        plt.xlabel("t", fontsize=12)
        plt.ylabel("f(t)", fontsize=12)

        for line in ax.get_lines():
            line.set_linewidth(line_width)
            line.set_alpha(transparency)

        # Customize legend
        if custom_labels:
            handles, labels = ax.get_legend_handles_labels()
            # .get() keeps the original label when no replacement is supplied
            new_labels = [custom_labels.get(label, label) for label in labels]
            ax.legend(handles, new_labels, loc=legend_loc, fontsize=10)
        else:
            plt.legend(loc=legend_loc, fontsize=10)

        # Customize x-axis ticks and labels; len() keeps this safe for numpy arrays
        if x_axis_ticks is not None and len(x_axis_ticks) > 0:
            plt.xticks(x_axis_ticks, x_axis_labels)

        plt.grid(False)  # Grid intentionally disabled
        plt.tight_layout()  # Adjust layout to prevent labels from overlapping
        plt.savefig(f"{name}.png", dpi=dpi)
    finally:
        # Release the figure even if plotting or saving fails.
        plt.close(fig)
308
+ ################################################# Model Training ######################################################################
309
def train_model(model, train_loader, test_loader=None, learning_rate=0.001, epochs=10, device="cuda" if torch.cuda.is_available() else "cpu"):
    """Train ``model`` to reconstruct its targets with MSE loss and Adam.

    Args:
        model: Module to train; the caller is expected to have placed it on ``device``.
        train_loader: Iterable of ``(inputs, targets)`` batches.
        test_loader: Optional iterable of ``(inputs, targets)`` batches used
            for a per-epoch validation pass. When falsy, the reported test
            loss is 0.0.
        learning_rate: Adam learning rate.
        epochs: Number of passes over ``train_loader``.
        device: Device each batch is moved to. The default is computed once,
            when the function is defined.

    Returns:
        tuple: ``(model, train_loss_df, mse_losses, info)`` where
        ``train_loss_df`` has one column per epoch holding the per-batch
        losses, ``mse_losses`` holds the per-sample reconstruction errors
        collected during the final epoch (empty when ``epochs`` is 0), and
        ``info`` is the newline-separated per-epoch log.
    """
    criterion = nn.MSELoss()
    info = ""
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    train_loss_data = {}
    # Bound up front so the return below is valid even when epochs == 0.
    mse_losses = []
    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        epoch_train_losses = []
        mse_losses = []
        is_last_epoch = epoch == epochs - 1
        for inputs, targets in train_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if is_last_epoch:
                # Per-sample errors of the final epoch; detached because they
                # are only recorded, never back-propagated.
                mse_loss = F.mse_loss(targets, outputs.detach(), reduction='none')
                mse_losses.extend(mse_loss.mean(dim=-1).tolist())
            batch_loss = loss.item()
            epoch_train_losses.append(batch_loss)
            train_loss += batch_loss
        train_loss /= len(train_loader)

        # Validation
        if test_loader:
            model.eval()
            test_loss = 0.0
            with torch.no_grad():
                for inputs, targets in test_loader:
                    inputs = inputs.to(device)
                    targets = targets.to(device)
                    # NOTE(review): assumes test batches lack the sequence
                    # axis, so one is added and removed here — confirm
                    # against the test dataset class.
                    outputs = model(inputs.unsqueeze(1))
                    loss = criterion(outputs.squeeze(1), targets)
                    test_loss += loss.item()
            test_loss /= len(test_loader)
        else:
            test_loss = 0.0
        epoch_line = f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}"
        info += epoch_line + "\n"
        print(epoch_line)
        train_loss_data[f'Epoch {epoch + 1}'] = epoch_train_losses
    # Wrapping in Series lets columns of different lengths coexist.
    train_loss_df = pd.DataFrame({k: pd.Series(v) for k, v in train_loss_data.items()})
    return model, train_loss_df, mse_losses, info
359
+
360
+ #########################################################################################################################################
361
def _dump_pickle(obj, path):
    """Pickle ``obj`` to ``path`` and close the file handle deterministically."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)


def _zip_directory(folder_path, archive_path):
    """Write every file under ``folder_path`` into ``archive_path`` using folder-relative names."""
    with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(folder_path):
            for file in files:
                file_path = os.path.join(root, file)
                zipf.write(file_path, os.path.relpath(file_path, folder_path))


def _plot_dataset_andrews_curves(frame):
    """Save the dataset Andrews-curves plot under Results/ and return its path."""
    make_better_andrews_curves(frame, 'Label',
                               colors=['Blue', 'Red'],
                               plot_title="Dataset Andrews Curves",
                               line_width=1.2,
                               transparency=0.7,
                               legend_loc='upper right',
                               figsize=(12, 7),
                               name="Results/Dataset_andrews_curves")
    return "Results/Dataset_andrews_curves.png"


def _train_on_custom_data(csv_file, data_slicing_percentage, epochs, device, images):
    """Train a fresh LSTMAE on the uploaded CSV, save artefacts under Results/, and return the log text."""
    # Accept either a tempfile-like upload object (with .name) or a plain path string.
    csv_path = getattr(csv_file, "name", csv_file)
    summary = f"[INFO] Loading Custom Dataset {data_slicing_percentage}%...\n"
    dataframe = pd.read_csv(csv_path).sample(frac=data_slicing_percentage/100, random_state=42).reset_index(drop=True)
    summary += "[INFO] Preprocessing Dataset...\n"
    if 'Label' in dataframe.columns:
        processed_data = preprocess_dataframe(dataframe, target_column="Label")
    else:
        processed_data = preprocess_dataframe(dataframe)
        summary += "[WARNING] No Label Column Found, Using Unsupervised Learning...\n"
    summary += "[INFO] Generating Andrews Curves...\n"
    images.append(_plot_dataset_andrews_curves(processed_data['sample']))

    model = LSTMAE(len(processed_data["input_df"].columns), 128).to(device)
    summary += "[INFO] Training Model...\n"
    # int() guards against a UI slider delivering the epoch count as a float.
    _, train_loss_df, mse_losses, info = train_model(model, processed_data['train_loader'],
                                                     processed_data['test_loader'], epochs=int(epochs))
    summary += info
    summary += "[INFO] Saving model, scaler, Dataset Used...\n"
    dataframe.to_csv('Results/Original_dataset.csv', columns=dataframe.columns, index=False)
    _dump_pickle(processed_data['scaler'], 'Results/scaler.pkl')
    torch.save(model, 'Results/model.pth')

    summary += "[INFO] Generating Loss Curves...\n"
    fig = plt.figure(figsize=(12, 6))
    for column in train_loss_df.columns:
        plt.plot(train_loss_df[column], label=column)
    plt.xlabel("Batch")
    plt.ylabel("Loss")
    plt.title("Training Loss per Epoch")
    plt.legend()  # One legend entry per epoch
    plt.grid(True)
    plt.tight_layout()
    plt.savefig("Results/loss_curves.png", dpi=300)
    plt.close(fig)  # Avoid accumulating open figures across requests
    images.append("Results/loss_curves.png")

    # Quartiles of the final-epoch per-sample losses, saved for later thresholding.
    Q1, Q3 = np.percentile(mse_losses, [25, 75])
    _dump_pickle({"Q1": Q1, "Q3": Q3}, 'Results/INFO.pkl')
    return summary


def _score_bundled_sample(sample_choice, data_slicing_percentage, threshold_factor, device, images):
    """Score a bundled sample CSV with the shipped model, save artefacts under Results/, and return the log text."""
    # Loss quartiles hard-coded for the shipped model.
    Q1, Q3 = 0.19226229563355446, 0.7454282641410828
    IQR = Q3 - Q1
    upper_bound = Q3 + threshold_factor * IQR
    data_path = os.path.join(os.path.abspath('Data'), sample_choice)
    dataframe = pd.read_csv(data_path).sample(frac=data_slicing_percentage/100, random_state=42).reset_index(drop=True)
    summary = "[INFO] Saving model, scaler, Dataset Used...\n"
    dataframe.to_csv('Results/Scaled_dataset.csv', columns=dataframe.columns, index=False)
    with open('scaler.pkl', 'rb') as handle:
        scaler = pickle.load(handle)
    # Every column except the last is treated as a scaled feature; 'Label' is re-attached afterwards.
    original_df = scaler.inverse_transform(dataframe.iloc[:, :-1])
    original_df = pd.DataFrame(original_df, columns=dataframe.columns[:-1])
    original_df['Label'] = dataframe['Label']
    original_df.to_csv('Results/Original_dataset.csv', columns=dataframe.columns, index=False)
    shutil.copy('scaler.pkl', 'Results/scaler.pkl')
    shutil.copy('model.pth', 'Results/model.pth')

    summary += "[INFO] Generating Andrews Curves...\n"
    images.append(_plot_dataset_andrews_curves(dataframe))

    inputdf = torch.tensor(dataframe.iloc[:, :-1].to_numpy(), dtype=torch.float32, device=device)
    outputdf = dataframe['Label']
    # NOTE: weights_only=False unpickles arbitrary objects; only load model files you trust.
    model = torch.load("model.pth", weights_only=False, map_location=device)
    model.eval()
    with torch.no_grad():  # Inference only; no autograd graph needed
        outputs = model(inputdf.unsqueeze(1)).squeeze(1)
        mse_loss = F.mse_loss(outputs, inputdf, reduction='none')
        mse_loss_per_data_point = mse_loss.mean(dim=-1)
    anomaly_scores = pd.DataFrame({'Loss': mse_loss_per_data_point.detach().cpu().numpy(), 'Label': outputdf})
    anomaly_scores['Anomaly'] = anomaly_scores['Loss'].apply(lambda x: 1 if x > upper_bound else 0)
    anomaly_scores['Label'] = anomaly_scores['Label'].apply(lambda x: 1 if x == "Malicious" else 0)

    y_true = anomaly_scores['Label']
    y_pred = anomaly_scores['Anomaly']
    # Fix the label set so the matrix stays 2x2 even when only one class is
    # present (otherwise the two display labels below do not fit).
    out_confusion_matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])
    disp = ConfusionMatrixDisplay(confusion_matrix=out_confusion_matrix, display_labels=["Benign", "Malignant"])
    disp.plot(cmap=plt.cm.Blues)
    plt.title('Confusion Matrix')
    plt.savefig("Results/confusion_matrix.png", dpi=300)
    plt.close(disp.figure_)  # Avoid accumulating open figures across requests
    images.append("Results/confusion_matrix.png")

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    # Trailing newline keeps the confusion-matrix header on its own line.
    summary += f"[RESULT] Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}\n"
    summary += f"Confusion Matrix:\n{out_confusion_matrix}\n"
    return summary


def detect_anomalies(csv_file, sample_choice="Custom Data", data_slicing_percentage=80, epochs=3, threshold_factor=1.0):
    """Gradio callback: train on an uploaded CSV or score a bundled sample.

    Args:
        csv_file: Uploaded file (object with ``.name`` or a path string);
            only read when ``sample_choice`` is "Custom Data".
        sample_choice: "Custom Data" to train a new model on ``csv_file``,
            otherwise the file name of a CSV inside the ``Data`` folder to
            score with the shipped ``model.pth`` / ``scaler.pkl``.
        data_slicing_percentage: Percentage of rows randomly sampled
            (seed 42) from the CSV.
        epochs: Training epochs for the custom-data path.
        threshold_factor: Multiplier ``x`` in the anomaly threshold
            ``Q3 + x * IQR`` used by the bundled-sample path.

    Returns:
        tuple: ``(anomaly_summary, images, "Results.zip")`` — the text log,
        the list of generated image paths, and the archive of the
        ``Results`` folder. If "Custom Data" is chosen without an upload, an
        error message, an empty list and ``None`` are returned instead.
    """
    if sample_choice == "Custom Data" and csv_file is None:
        # Report the problem to the UI instead of failing on `None.name`.
        return "[ERROR] No CSV file uploaded. Upload a CSV or choose one of the bundled samples.\n", [], None

    images = []
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Start every run from an empty Results folder.
    if os.path.exists("Results"):
        shutil.rmtree("Results")
    os.mkdir("Results")

    if sample_choice == "Custom Data":
        anomaly_summary = _train_on_custom_data(csv_file, data_slicing_percentage, epochs, device, images)
    else:
        anomaly_summary = _score_bundled_sample(sample_choice, data_slicing_percentage, threshold_factor, device, images)

    _zip_directory("Results", "Results.zip")
    return anomaly_summary, images, "Results.zip"
473
+
474
# Gradio UI: the five inputs map positionally onto detect_anomalies'
# parameters (csv_file, sample_choice, data_slicing_percentage, epochs,
# threshold_factor); the three outputs match its return tuple.
iface = gr.Interface(
    fn=detect_anomalies,
    inputs=[
        gr.File(file_types=[".csv"], label="Upload CSV File"),
        gr.Radio(["Benign500.csv", "Malignant500.csv", "Balance1000.csv", "Custom Data"], value="Custom Data", label="Choose Samples or CustomData"),
        gr.Slider(minimum=10, maximum=100, step=10, value=80, label="Data Usage Percentage (Training or Detection)"),
        gr.Slider(minimum=1, maximum=20, step=1, value=3, label="Training Epochs (Default value is 3)"),
        gr.Slider(minimum=0, maximum=5, step=0.5, value=1.5, label="Loss Threshold (x, higher x means high threshold) = Q3 + x*IQR"),
    ],
    outputs=[
        gr.Textbox(label="Anomaly Summary"),
        gr.Gallery(label="Anomaly Plots"),
        "file",
    ],
    title="Your own Anomaly Detector",
    # Built from explicit lines so no source indentation leaks into the
    # Markdown. File names match what detect_anomalies actually writes.
    description=(
        "\n"
        "### Fully Unsupervised Anomaly Detection Tool (uses Bidirectional based Autoencoder with skip conn. and Dropout Layers)\n"
        "##### Download *\"Results.zip\"* (contains model.pth, scaler.pkl, dataset images, output images) from the bottom right to get the results.\n"
        "Upload a *CSV file* (Custom Anomalies Detection: use an output column named \"Label\", or omit it for unsupervised training), or use *our trained model*.\n"
    ),
)

if __name__ == "__main__":
    iface.launch(debug=False)