rajatsingh0702 committed on
Commit
a52f3fe
·
verified ·
1 Parent(s): e71eef8

Upload 7 files

Browse files
Files changed (7) hide show
  1. Data/Balance1000.csv +0 -0
  2. Data/Benign500.csv +0 -0
  3. Data/Malignant500.csv +0 -0
  4. app.py +497 -0
  5. model.pth +3 -0
  6. requirements.txt +7 -0
  7. scaler.pkl +3 -0
Data/Balance1000.csv ADDED
The diff for this file is too large to render. See raw diff
 
Data/Benign500.csv ADDED
The diff for this file is too large to render. See raw diff
 
Data/Malignant500.csv ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,497 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import shutil
4
+ import pickle
5
+ import zipfile
6
+ import matplotlib
7
+ import numpy as np
8
+ import gradio as gr
9
+ import pandas as pd
10
+ import torch.nn as nn
11
+ import torch.optim as optim
12
+ import torch.nn.functional as F
13
+ from matplotlib import pyplot as plt
14
+ from sklearn.impute import SimpleImputer
15
+ from pandas.plotting import andrews_curves
16
+ from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, precision_score, recall_score, f1_score
17
+ from sklearn.preprocessing import StandardScaler
18
+ from torch.utils.data import DataLoader, TensorDataset, Dataset
19
+
20
+ ###################################################### Preprocessing #####################################################################
21
def preprocess_dataframe(df, target_column=None, fill_method='mean', drop_na=True, sequence_length=32, test_size=0.2, batch_size = 128):
    """
    Clean a DataFrame and wrap it in DataLoaders for the LSTM autoencoder.

    Steps:
        1. Check whether ``target_column`` exists in ``df``; when it is
           missing or not given, the data is treated as unlabeled.
        2. Optionally drop rows that contain NaN values.
        3. Drop non-numeric columns (the target column is always kept).
        4. Impute NaNs left in numeric columns using ``fill_method``.
        5. Drop rows that contain +/-inf or NaN.
        6. Standardize the features and build the DataLoaders.

    With a target column, labels equal to ``'benign'`` (case-insensitive)
    map to class 0 and every other label maps to class 1. The scaler is
    fitted on the class-0 rows only, the training loader yields sliding
    windows over those rows, and the test loader yields the scaled class-1
    rows one at a time. Without a target column all rows are used for
    training and no test loader is built.

    The caller's DataFrame is never modified.

    Args:
        df (pd.DataFrame): Raw input data.
        target_column (str, optional): Name of the label column. Ignored
            (treated as unlabeled) when the column is not in ``df``.
        fill_method (str, optional): 'mean', 'median', 'most_frequent' or
            'constant' (constant fills with 0). Only consulted when NaNs
            remain after step 3.
        drop_na (bool, optional): Drop rows with any NaN before imputing.
        sequence_length (int): Number of consecutive rows per training window.
        test_size (float): Currently unused; kept for backward compatibility.
        batch_size (int): Batch size for both DataLoaders.

    Returns:
        dict: Keys 'train_loader', 'test_loader', 'input_df', 'target_df',
        'malinput_df', 'beninput_df', 'target_col', 'scaler' and 'sample'.
        'sample' is a frame of at most 500 rows per class with a 'Label'
        column added for plotting. In the unlabeled case 'test_loader',
        'target_df', 'malinput_df', 'beninput_df' and 'target_col' are None.

    Raises:
        ValueError: If ``fill_method`` is invalid while NaNs need imputing,
            or if fewer training rows than ``sequence_length`` remain.
    """
    # 1. Target column check
    target_col = None
    if target_column:
        if target_column in df.columns:
            target_col = target_column
            print(f"Target column '{target_column}' found.")
    else:
        print("No target column specified. Treating as autoencoder.")

    # 2. Drop rows with NaNs before filling. Either branch leaves us with a
    #    private frame so later assignments cannot leak into the caller's data.
    if drop_na:
        print("Dropping rows with any NaN values...")
        df = df.dropna()
    else:
        df = df.copy()

    # 3. Drop non-numeric columns (except the target)
    columns_to_drop = [
        col for col in df.columns
        if col != target_col and not pd.api.types.is_numeric_dtype(df[col])
    ]
    if columns_to_drop:
        print(f"Dropping non-numeric columns: {columns_to_drop}")
        df = df.drop(columns=columns_to_drop)
    else:
        print("No non-numeric columns found.")

    # 4. Handle missing values (numeric columns only)
    numeric_cols = df.select_dtypes(include=np.number).columns
    if df[numeric_cols].isnull().any().any():
        print("Handling missing values...")
        if fill_method not in ('mean', 'median', 'most_frequent', 'constant'):
            raise ValueError("Invalid fill_method. Choose 'mean', 'median', 'most_frequent', or 'constant'.")
        if fill_method == 'constant':
            # A fill value is mandatory for the constant strategy.
            imputer = SimpleImputer(strategy=fill_method, fill_value=0)
        else:
            imputer = SimpleImputer(strategy=fill_method)
        df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

    # 5. Drop rows holding inf or NaN (non in-place: returns new frames)
    df = df.replace([np.inf, -np.inf], np.nan).dropna()

    # 6. Split by class and draw the plotting sample
    if target_col:
        inputdf = df.drop(columns=[target_col])
        # str() keeps non-string labels (e.g. ints) from crashing on .lower()
        outputdf = df[target_col].apply(lambda x: 0 if str(x).lower() == 'benign' else 1)
        malinputdf = inputdf[outputdf == 1]
        beninputdf = inputdf[outputdf == 0]
        sample_size = min(len(beninputdf), len(malinputdf), 500)
        bensample = beninputdf.sample(n=sample_size, random_state=42)
        bensample['Label'] = 'Benign'
        malsample = malinputdf.sample(n=sample_size, random_state=42)
        malsample['Label'] = 'Malicious'
        sample = pd.concat([bensample, malsample])
        train_values = beninputdf.values
    else:
        inputdf = df
        outputdf = malinputdf = beninputdf = None
        sample_size = min(len(inputdf), 500)
        sample = df.sample(n=sample_size, random_state=42)
        sample["Label"] = "dummy_class"
        train_values = inputdf.values

    # The scaler only ever sees the training rows.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(train_values)

    if len(X_train) < sequence_length:
        raise ValueError(
            f"Need at least sequence_length={sequence_length} training rows "
            f"after preprocessing, but only {len(X_train)} remain."
        )

    class TabularDatasetTest(Dataset):
        """Yields single rows as (input, target) pairs."""

        def __init__(self, data):
            self.data = data.clone().detach()

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            return self.data[idx], self.data[idx]

    class TabularDatasetTrain(Dataset):
        """Yields sliding windows of consecutive rows as (input, target) pairs."""

        def __init__(self, data, sequence_length):
            self.data = data.clone().detach()
            self.sequence_length = sequence_length

        def __len__(self):
            # Never negative, even if constructed with too few rows.
            return max(0, len(self.data) - self.sequence_length + 1)

        def __getitem__(self, idx):
            window = self.data[idx:idx + self.sequence_length]
            return window, window

    X_train = torch.tensor(X_train, dtype=torch.float32)
    train_dataset = TabularDatasetTrain(X_train, sequence_length = sequence_length)
    train_DataLoader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    test_Dataloader = None
    if target_col:
        X_test = torch.tensor(scaler.transform(malinputdf.values), dtype=torch.float32)
        test_dataset = TabularDatasetTest(X_test)
        test_Dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return {
        'train_loader': train_DataLoader,
        'test_loader': test_Dataloader,
        'input_df': inputdf,
        'target_df': outputdf,
        'malinput_df': malinputdf,
        'beninput_df': beninputdf,
        'target_col': target_col,
        'scaler': scaler,
        'sample': sample
    }
181
+ ################################################## Model #############################################################################
182
class EncoderRNN(nn.Module):
    """Encoder half of the autoencoder: a bidirectional LSTM feeding a bottleneck LSTM.

    The first LSTM runs in both directions with ``int(hidden_size/2)`` units
    per direction; the second LSTM compresses its (dropout + ReLU) output down
    to ``int(input_size/2)`` features. Both use ``batch_first=True``.
    """

    def __init__(self, input_size, hidden_size, num_layers, isCuda):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bottleneck_size = int(input_size/2)
        self.isCuda = isCuda

        per_direction = int(hidden_size/2)
        self.lstm1 = nn.LSTM(input_size, per_direction, num_layers, batch_first=True, bidirectional = True)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.lstm2 = nn.LSTM(hidden_size, self.bottleneck_size, num_layers, batch_first=True)

    def forward(self, inputs):
        """Return ``(encoded_input, intermediate_state)`` for a batch of sequences.

        ``intermediate_state`` is the activated output of the first LSTM; it is
        returned alongside the bottleneck encoding so a decoder can reuse it.
        """
        first_pass, _ = self.lstm1(inputs)
        intermediate_state = self.relu(self.dropout(first_pass))
        encoded_input, _ = self.lstm2(intermediate_state)
        return encoded_input, intermediate_state
202
+
203
class DecoderRNN(nn.Module):
    """Decoder half of the autoencoder.

    Expands the bottleneck encoding back to ``hidden_size`` features,
    concatenates it with the encoder's intermediate state along the feature
    axis, and maps the result to ``output_size`` features with a final LSTM.
    Both LSTMs use ``batch_first=True``.
    """

    def __init__(self, hidden_size, output_size, num_layers, isCuda):
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.bottleneck_size = int(output_size/2)
        self.isCuda = isCuda

        self.lstm2 = nn.LSTM(self.bottleneck_size, hidden_size, num_layers, batch_first=True)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        # Input is twice hidden_size: expanded encoding + encoder skip features.
        self.lstm1 = nn.LSTM(2*hidden_size, output_size, num_layers, batch_first=True)

    def forward(self, encoded_input, intermediate_state):
        """Reconstruct the sequence from the encoding and the skip features."""
        expanded, _ = self.lstm2(encoded_input)
        merged = torch.cat((self.dropout(expanded), intermediate_state), dim=2)
        decoded_output, _ = self.lstm1(self.relu(merged))
        return decoded_output
224
+
225
class LSTMAE(nn.Module):
    """LSTM autoencoder built from an ``EncoderRNN`` and a ``DecoderRNN``.

    ``hidden_size`` is rounded up to the next even number because the
    encoder splits it across the two directions of its bidirectional LSTM.
    """

    def __init__(self, input_size, hidden_size, num_layers=1, isCuda="cuda" if torch.cuda.is_available() else "cpu"):
        super().__init__()
        hidden_size = hidden_size if hidden_size%2==0 else hidden_size+1
        self.encoder = EncoderRNN(input_size, hidden_size, num_layers, isCuda)
        self.decoder = DecoderRNN(hidden_size, input_size, num_layers, isCuda)
        self.initialize_weights()

    def initialize_weights(self):
        """
        Re-initialize every LSTM in the network: Xavier-uniform for the
        input-to-hidden and hidden-to-hidden weight matrices, zeros for
        the biases. Non-LSTM modules are left untouched.
        """
        for m in self.modules():
            if not isinstance(m, nn.LSTM):
                continue
            for name, param in m.named_parameters():
                # The original test `'ih' or 'hh' in name` was always true;
                # spell out both membership checks explicitly.
                if 'weight_ih' in name or 'weight_hh' in name:
                    nn.init.xavier_uniform_(param.data)
                elif 'bias' in name:
                    nn.init.zeros_(param.data)

    def forward(self, input):
        """Encode ``input`` and return its reconstruction."""
        encoded_input, intermediate_state = self.encoder(input)
        return self.decoder(encoded_input, intermediate_state)
251
+
252
+
253
+
254
+ ############################################## Andrews Curves ###########################################################################
255
def make_better_andrews_curves(df, class_column, colors=None, plot_title="Andrews Curves",
                               line_width=0.8, transparency=0.5, sample_size=None, legend_loc='best',
                               custom_labels=None, x_axis_ticks=None, x_axis_labels=None,
                               figsize=(10, 6), dpi=300, name = "andrews_curves"):
    """
    Render an Andrews Curves plot with custom styling and save it as a PNG.

    The figure is written to ``f"{name}.png"`` and then closed, so repeated
    calls (e.g. from a long-running web app) do not accumulate open figures.

    Args:
        df: pandas DataFrame containing the data.
        class_column: Name of the column containing class labels.
        colors: List of colors, one per class (e.g., ['blue', 'red']). Uses matplotlib's defaults if None.
        plot_title: Title of the plot.
        line_width: Width of the lines.
        transparency: Alpha value (transparency) of the lines.
        sample_size: If given and smaller than len(df), a random sample of that many rows is plotted.
        legend_loc: Location of the legend (e.g., 'best', 'upper right', 'lower left').
        custom_labels: Dict mapping original class labels to legend labels; unmapped labels are kept.
        x_axis_ticks: Tick positions for the x-axis. If None or empty, default ticks are used.
        x_axis_labels: Labels for the x-axis ticks. Must be the same length as x_axis_ticks.
        figsize: Figure size (width, height) in inches.
        dpi: Resolution of the saved image.
        name: Output path without the ".png" extension.

    Returns:
        None
    """
    if sample_size and sample_size < len(df):
        df = df.sample(n=sample_size, random_state=42)  # Sample for faster plotting

    fig, ax = plt.subplots(figsize=figsize)
    try:
        # Draw on our own Axes instead of relying on pyplot's current figure.
        andrews_curves(df, class_column, ax=ax, color=colors)

        ax.set_title(plot_title, fontsize=16)
        ax.set_xlabel("t", fontsize=12)
        ax.set_ylabel("f(t)", fontsize=12)

        for line in ax.get_lines():
            line.set_linewidth(line_width)
            line.set_alpha(transparency)

        # Customize legend
        if custom_labels:
            handles, labels = ax.get_legend_handles_labels()
            new_labels = [custom_labels.get(label, label) for label in labels]
            ax.legend(handles, new_labels, loc=legend_loc, fontsize=10)
        else:
            ax.legend(loc=legend_loc, fontsize=10)

        # Customize x-axis ticks and labels
        if x_axis_ticks is not None and len(x_axis_ticks) > 0:
            ax.set_xticks(x_axis_ticks)
            if x_axis_labels is not None:
                ax.set_xticklabels(x_axis_labels)

        ax.grid(False)      # andrews_curves enables a grid; turn it off
        fig.tight_layout()  # Prevent labels from overlapping
        fig.savefig(f"{name}.png", dpi=dpi)
    finally:
        plt.close(fig)
308
+ ################################################# Model Training ######################################################################
309
def train_model(model, train_loader, test_loader = None, learning_rate=0.001, epochs=10):
    """
    Train ``model`` to reconstruct its inputs using MSE loss and Adam.

    Batches are moved to the device the model's parameters live on, so the
    function no longer depends on a module-level ``device`` variable (which
    this file never defines).

    Args:
        model: The network to train; called as ``model(inputs)``.
        train_loader: Iterable of ``(inputs, targets)`` training batches.
        test_loader: Optional iterable of ``(inputs, targets)`` batches. Each
            input gets a length-1 axis inserted at dim 1 before the forward
            pass and removed afterwards. Skipped when None or empty.
        learning_rate: Adam learning rate.
        epochs: Number of passes over ``train_loader``.

    Returns:
        tuple: ``(model, train_loss_df, mse_losses, info)`` where
        ``train_loss_df`` has one column per epoch ('Epoch N') holding the
        per-batch losses, ``mse_losses`` holds the reconstruction errors
        (squared error averaged over the last axis) collected during the
        final epoch, and ``info`` is the per-epoch log text.
    """
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    info = ""
    train_loss_data = {}
    mse_losses = []  # stays empty (rather than undefined) when epochs == 0

    for epoch in range(epochs):
        model.train()
        is_last_epoch = epoch == epochs - 1
        train_loss = 0.0
        epoch_train_losses = []
        for inputs, targets in train_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if is_last_epoch:
                # Detached: these values are only used for thresholding later.
                per_point = F.mse_loss(outputs.detach(), targets, reduction='none').mean(dim=-1)
                mse_losses.extend(per_point.tolist())
            batch_loss = loss.item()
            epoch_train_losses.append(batch_loss)
            train_loss += batch_loss
        train_loss /= len(train_loader)

        # Validation. Truthiness (not `is not None`) is intentional: an empty
        # loader is skipped instead of dividing by zero below.
        test_loss = 0.0
        if test_loader:
            model.eval()
            with torch.no_grad():
                for inputs, targets in test_loader:
                    inputs = inputs.to(device)
                    targets = targets.to(device)
                    outputs = model(inputs.unsqueeze(1))
                    test_loss += criterion(outputs.squeeze(1), targets).item()
            test_loss /= len(test_loader)

        epoch_line = f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}"
        info += epoch_line + "\n"
        print(epoch_line)
        train_loss_data[f'Epoch {epoch + 1}'] = epoch_train_losses

    train_loss_df = pd.DataFrame({k: pd.Series(v) for k, v in train_loss_data.items()})
    return model, train_loss_df, mse_losses, info
359
+
360
+ #########################################################################################################################################
361
def detect_anomalies(csv_file, sample_choice="Custom Data", data_slicing_percentage=80, epochs=3, threshold_factor=1.0):
    """
    Gradio callback: train on an uploaded CSV, or score a bundled sample set.

    * ``sample_choice == "Custom Data"``: loads the uploaded CSV, preprocesses
      it (using a 'Label' column when present), trains a fresh ``LSTMAE`` and
      stores the model, scaler, dataset, plots and the Q1/Q3 of the final
      epoch's reconstruction errors under ``Results/``.
    * otherwise: loads ``Data/<sample_choice>``, scores it with the bundled
      ``model.pth`` and flags rows whose reconstruction error exceeds
      ``Q3 + threshold_factor * IQR``. The sample CSV is treated as already
      scaled with 'Label' as its last column (it is inverse-transformed with
      ``scaler.pkl`` to produce the "original" dataset).

    The ``Results`` directory is recreated on every call and zipped into
    ``Results.zip``.

    Args:
        csv_file: Uploaded file (path string or object with a ``.name`` path).
            Only used for "Custom Data".
        sample_choice: "Custom Data" or the file name of a bundled sample CSV.
        data_slicing_percentage: Percentage of rows to sample from the data.
        epochs: Training epochs (custom data only).
        threshold_factor: Multiplier on the IQR for the anomaly threshold
            (bundled samples only).

    Returns:
        tuple: ``(anomaly_summary, images, "Results.zip")`` — the log text, a
        list of generated image paths and the archive path.

    Raises:
        gr.Error: If "Custom Data" is selected but no file was uploaded.
    """
    if sample_choice == "Custom Data" and csv_file is None:
        raise gr.Error("Please upload a CSV file or choose one of the sample datasets.")

    images = []
    anomaly_summary = ""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if os.path.exists("Results"):
        shutil.rmtree("Results")
    os.mkdir("Results")

    if sample_choice == "Custom Data":
        # Accept both a plain path and an object exposing the path as `.name`.
        csv_path = getattr(csv_file, "name", csv_file)
        anomaly_summary += f"[INFO] Loading Custom Dataset {data_slicing_percentage}%...\n"
        dataframe = pd.read_csv(csv_path).sample(frac=data_slicing_percentage/100, random_state=42).reset_index(drop=True)
        anomaly_summary += "[INFO] Preprocessing Dataset...\n"
        if 'Label' in dataframe.columns:
            processed_data = preprocess_dataframe(dataframe, target_column="Label")
        else:
            processed_data = preprocess_dataframe(dataframe)
            anomaly_summary += "[WARNING] No Label Column Found, Using Unsupervised Learning...\n"

        anomaly_summary += "[INFO] Generating Andrews Curves...\n"
        make_better_andrews_curves(processed_data['sample'], 'Label',
                                   colors=['Blue', 'Red'],
                                   plot_title="Dataset Andrews Curves",
                                   line_width=1.2,
                                   transparency=0.7,
                                   legend_loc='upper right',
                                   figsize=(12, 7),
                                   name = "Results/Dataset_andrews_curves")
        images.append("Results/Dataset_andrews_curves.png")

        model = LSTMAE(len(processed_data["input_df"].columns),128).to(device)
        anomaly_summary += "[INFO] Training Model...\n"
        _, train_loss_df, mse_losses, info = train_model(model, processed_data['train_loader'], processed_data['test_loader'],epochs=epochs)
        anomaly_summary += info

        anomaly_summary += "[INFO] Saving model, scaler, Dataset Used...\n"
        dataframe.to_csv('Results/Original_dataset.csv', columns=dataframe.columns, index=False)
        with open('Results/scaler.pkl', 'wb') as fh:
            pickle.dump(processed_data['scaler'], fh)
        torch.save(model, 'Results/model.pth')

        anomaly_summary += "[INFO] Generating Loss Curves...\n"
        fig, ax = plt.subplots(figsize=(12, 6))
        for column in train_loss_df.columns:
            ax.plot(train_loss_df[column], label=column)
        ax.set_xlabel("Batch")
        ax.set_ylabel("Loss")
        ax.set_title("Training Loss per Epoch")
        ax.legend()      # one entry per epoch
        ax.grid(True)
        fig.tight_layout()
        fig.savefig("Results/loss_curves.png", dpi=300)
        plt.close(fig)   # do not leak figures across requests
        images.append("Results/loss_curves.png")

        # Quartiles of the training reconstruction error, saved so a
        # threshold can be derived from them later.
        Q1, Q3 = np.percentile(mse_losses, [25, 75])
        with open('Results/INFO.pkl', 'wb') as fh:
            pickle.dump({"Q1": Q1, "Q3": Q3}, fh)

    else:
        # Hard-coded quartiles used with the bundled model
        # (presumably its training reconstruction-error Q1/Q3 — TODO confirm).
        Q1, Q3 = 0.19226229563355446, 0.7454282641410828
        IQR = Q3 - Q1
        upper_bound = Q3 + threshold_factor * IQR

        data_path = os.path.join(os.path.abspath('Data'),sample_choice)
        dataframe = pd.read_csv(data_path).sample(frac=data_slicing_percentage/100, random_state=42).reset_index(drop=True)
        anomaly_summary += "[INFO] Saving model, scaler, Dataset Used...\n"
        dataframe.to_csv('Results/Scaled_dataset.csv', columns=dataframe.columns, index=False)

        # NOTE: pickle / torch.load(weights_only=False) execute arbitrary code
        # from the file; acceptable only because these artifacts ship with the app.
        with open('scaler.pkl', 'rb') as fh:
            scaler = pickle.load(fh)
        features = dataframe.iloc[:, :-1]
        original_df = pd.DataFrame(scaler.inverse_transform(features), columns=dataframe.columns[:-1])
        original_df['Label'] = dataframe['Label']
        original_df.to_csv('Results/Original_dataset.csv', columns=dataframe.columns, index=False)
        shutil.copy('scaler.pkl', 'Results/scaler.pkl')
        shutil.copy('model.pth', 'Results/model.pth')

        anomaly_summary += "[INFO] Generating Andrews Curves...\n"
        make_better_andrews_curves(dataframe, 'Label',
                                   colors=['Blue', 'Red'],
                                   plot_title="Dataset Andrews Curves",
                                   line_width=1.2,
                                   transparency=0.7,
                                   legend_loc='upper right',
                                   figsize=(12, 7),
                                   name = "Results/Dataset_andrews_curves")
        images.append("Results/Dataset_andrews_curves.png")

        inputdf = torch.tensor(features.to_numpy(), dtype=torch.float32, device=device)
        outputdf = dataframe['Label']
        model = torch.load("model.pth",weights_only = False, map_location=device)
        model.eval()
        with torch.no_grad():  # inference only: skip building the autograd graph
            # Each row is scored as a sequence of length 1.
            outputs = model(inputdf.unsqueeze(1)).squeeze(1)
            mse_loss_per_data_point = F.mse_loss(outputs, inputdf, reduction='none').mean(dim=-1)

        anomaly_scores = pd.DataFrame({'Loss': mse_loss_per_data_point.cpu().numpy(), 'Label': outputdf})
        anomaly_scores['Anomaly'] = (anomaly_scores['Loss'] > upper_bound).astype(int)
        anomaly_scores['Label'] = (anomaly_scores['Label'] == "Malicious").astype(int)
        y_true = anomaly_scores['Label']
        y_pred = anomaly_scores['Anomaly']

        # labels=[0, 1] keeps the matrix 2x2 even when only one class occurs,
        # so it always matches the two display labels.
        out_confusion_matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])
        disp = ConfusionMatrixDisplay(confusion_matrix=out_confusion_matrix, display_labels=["Benign","Malignant"])
        disp.plot(cmap=plt.cm.Blues)
        disp.ax_.set_title('Confusion Matrix')
        disp.figure_.savefig("Results/confusion_matrix.png", dpi=300)
        plt.close(disp.figure_)
        images.append("Results/confusion_matrix.png")

        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred, zero_division=0)
        recall = recall_score(y_true, y_pred, zero_division=0)
        f1 = f1_score(y_true, y_pred, zero_division=0)
        anomaly_summary += f"[RESULT] Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}\n"
        anomaly_summary += f"Confusion Matrix:\n{out_confusion_matrix}\n"

    folder_path = "Results"
    with zipfile.ZipFile("Results.zip", 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(folder_path):
            for file in files:
                file_path = os.path.join(root, file)
                zipf.write(file_path, os.path.relpath(file_path, folder_path))

    return anomaly_summary, images, "Results.zip"
473
+
474
# Gradio UI: the five inputs map positionally onto detect_anomalies'
# parameters; the three outputs onto its (summary, images, zip path) result.
iface = gr.Interface(
    fn=detect_anomalies,
    inputs=[
        gr.File(file_types=[".csv"], label="Upload CSV File"),
        gr.Radio(["Benign500.csv", "Malignant500.csv", "Balance1000.csv", "Custom Data"], value="Custom Data", label="Choose Samples or CustomData"),
        gr.Slider(minimum=10, maximum=100, step=10, value=80, label="Data Usage Percentage (Training or Detection)"),
        gr.Slider(minimum=1, maximum=20, step=1, value=3, label="Training Epochs (Default value is 3)"),
        gr.Slider(minimum=0, maximum=5, step=0.5, value=1.5, label="Loss Threshold (x, higher x means high threshold) = Q3 + x*IQR"),
    ],
    outputs=[
        gr.Textbox(label="Anomaly Summary"),
        gr.Gallery(label="Anomaly Plots"),
        "file",
    ],
    title="Your own Anomaly Detector",
    # Description corrected to name the files the app actually produces
    # (Results.zip holding model.pth, scaler.pkl, datasets and plots).
    description="""
### Fully Unsupervised Anomaly Detection Tool (uses a Bidirectional LSTM Autoencoder with skip connections and Dropout layers)
##### Download *"Results.zip"* (contains the model, scaler, datasets and output images) from the bottom right.
Upload a *CSV file* for custom training (optionally with an output column named "Label"), or use *our trained model* on one of the sample datasets.
"""
)

if __name__ == "__main__":
    iface.launch(debug=False)
model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8728659966eeda34b5321cf45505b5ffa1e3f161a40a6b15e87316c2804f8ea6
3
+ size 1173120
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ pandas==2.2.2
2
+ numpy==2.0.2
3
+ torch==2.6
4
+ gradio==5.23.3
5
+ sklearn-pandas==2.2.0
6
+ jsonpickle==4.0.5
7
+ pickleshare==0.7.5
scaler.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fbc2f66ea2123e8415ec89c30132e2ce7efa6f71a02538613fd0d236700f7fcf
3
+ size 2331