Spaces:
Sleeping
Sleeping
Upload 7 files
Browse files- Data/Balance1000.csv +0 -0
- Data/Benign500.csv +0 -0
- Data/Malignant500.csv +0 -0
- app.py +497 -0
- model.pth +3 -0
- requirements.txt +7 -0
- scaler.pkl +3 -0
Data/Balance1000.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
Data/Benign500.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
Data/Malignant500.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
app.py
ADDED
@@ -0,0 +1,497 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import torch
|
3 |
+
import shutil
|
4 |
+
import pickle
|
5 |
+
import zipfile
|
6 |
+
import matplotlib
|
7 |
+
import numpy as np
|
8 |
+
import gradio as gr
|
9 |
+
import pandas as pd
|
10 |
+
import torch.nn as nn
|
11 |
+
import torch.optim as optim
|
12 |
+
import torch.nn.functional as F
|
13 |
+
from matplotlib import pyplot as plt
|
14 |
+
from sklearn.impute import SimpleImputer
|
15 |
+
from pandas.plotting import andrews_curves
|
16 |
+
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, precision_score, recall_score, f1_score
|
17 |
+
from sklearn.preprocessing import StandardScaler
|
18 |
+
from torch.utils.data import DataLoader, TensorDataset, Dataset
|
19 |
+
|
20 |
+
###################################################### Preprocessing #####################################################################
|
21 |
+
def preprocess_dataframe(df, target_column=None, fill_method='mean', drop_na=True, sequence_length=32, test_size=0.2, batch_size = 128):
    """
    Clean a DataFrame and wrap it in DataLoaders for the LSTM autoencoder.

    Steps:
        1. Check whether ``target_column`` exists in ``df``.
        2. Optionally drop rows containing NaN.
        3. Drop non-numeric columns (the target column is kept).
        4. Impute remaining NaNs in numeric columns, then drop rows with
           inf/NaN.
        5. Standard-scale the training rows and build sliding-window
           training sequences.

    When a target column is present, rows whose label equals ``'benign'``
    (case-insensitive) are the training set and every other row is the test
    set. Without a target column all rows are used for training.

    Args:
        df (pd.DataFrame): Input data. It is never modified in place.
        target_column (str, optional): Name of the label column. If missing
            from ``df`` the data is treated as unlabeled.
        fill_method (str): ``SimpleImputer`` strategy: 'mean', 'median',
            'most_frequent' or 'constant' (which fills with 0). Only
            consulted when NaNs are still present, i.e. effectively only
            when ``drop_na`` is False.
        drop_na (bool): Drop rows with any NaN before further processing.
        sequence_length (int): Number of consecutive rows per training window.
        test_size (float): Unused; kept for backward compatibility.
        batch_size (int): Batch size of the returned DataLoaders.

    Returns:
        dict: with keys
            - 'train_loader': DataLoader of (window, window) pairs.
            - 'test_loader': DataLoader of (row, row) pairs for non-benign
              rows, or None when there is no target column or no such rows.
            - 'input_df': feature DataFrame (target column removed).
            - 'target_df': 0/1 Series (0 = benign) or None.
            - 'malinput_df' / 'beninput_df': feature subsets or None.
            - 'target_col': name of the target column or None.
            - 'scaler': StandardScaler fitted on the training rows.
            - 'sample': up to 500 rows per class with a 'Label' column,
              intended for plotting.

    Raises:
        ValueError: On an invalid ``fill_method`` (when imputation is
            needed) or when fewer than ``sequence_length`` training rows
            remain after cleaning.
    """
    # 1. Target column check
    target_col = None
    if target_column:
        if target_column in df.columns:
            target_col = target_column
            print(f"Target column '{target_column}' found.")
    else:
        print("No target column specified. Treating as autoencoder.")

    # 2. Drop rows with NaNs before fill
    if drop_na:
        print("Dropping rows with any NaN values...")
        df = df.dropna()
    else:
        # Work on a copy so the in-place imputation below never touches the
        # caller's DataFrame.
        df = df.copy()

    # 3. Drop non-numeric columns (except the target)
    columns_to_drop = [
        col for col in df.columns
        if col != target_col and not pd.api.types.is_numeric_dtype(df[col])
    ]
    if columns_to_drop:
        print(f"Dropping non-numeric columns: {columns_to_drop}")
        df = df.drop(columns=columns_to_drop)
    else:
        print("No non-numeric columns found.")

    # 4. Handle missing values (numeric columns only)
    numeric_cols = df.select_dtypes(include=np.number).columns
    if df[numeric_cols].isnull().any().any():
        print("Handling missing values...")
        if fill_method not in ('mean', 'median', 'most_frequent', 'constant'):
            raise ValueError("Invalid fill_method. Choose 'mean', 'median', 'most_frequent', or 'constant'.")
        if fill_method == 'constant':
            # A fill value must be supplied for the constant strategy.
            imputer = SimpleImputer(strategy=fill_method, fill_value=0)
        else:
            imputer = SimpleImputer(strategy=fill_method)
        df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

    # Drop rows that still contain NaN or +/-inf.
    df = df.replace([np.inf, -np.inf], np.nan).dropna()

    if target_col:
        inputdf = df.drop(columns=[target_col])
        # str() keeps non-string labels (e.g. integers) from raising.
        outputdf = df[target_col].apply(lambda x: 0 if str(x).lower() == 'benign' else 1)
        malinputdf = inputdf[outputdf == 1]
        beninputdf = inputdf[outputdf == 0]
        sample_size = min(len(beninputdf), len(malinputdf), 500)
        bensample = beninputdf.sample(n=sample_size, random_state=42)
        bensample['Label'] = 'Benign'
        malsample = malinputdf.sample(n=sample_size, random_state=42)
        malsample['Label'] = 'Malicious'
        sample = pd.concat([bensample, malsample])
        train_values = beninputdf.values
    else:
        inputdf = df
        outputdf = None
        malinputdf = None
        beninputdf = None
        sample_size = min(len(inputdf), 500)
        sample = df.sample(n=sample_size, random_state=42)
        sample["Label"] = "dummy_class"
        train_values = inputdf.values

    # Fail early with a readable message instead of a negative dataset length.
    if len(train_values) < sequence_length:
        raise ValueError(
            f"Need at least {sequence_length} training rows after preprocessing, "
            f"got {len(train_values)}."
        )

    scaler = StandardScaler()
    X_train = scaler.fit_transform(train_values)

    class TabularDatasetTest(Dataset):
        """Row-wise dataset yielding (row, row) pairs."""

        def __init__(self, data):
            self.data = data.clone().detach()

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            return self.data[idx], self.data[idx]

    class TabularDatasetTrain(Dataset):
        """Sliding-window dataset yielding (window, window) pairs."""

        def __init__(self, data, sequence_length):
            self.data = data.clone().detach()
            self.sequence_length = sequence_length

        def __len__(self):
            return len(self.data) - self.sequence_length + 1

        def __getitem__(self, idx):
            window = self.data[idx:idx + self.sequence_length]
            return window, window

    X_train = torch.tensor(X_train, dtype=torch.float32)
    train_dataset = TabularDatasetTrain(X_train, sequence_length = sequence_length)
    train_DataLoader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    test_Dataloader = None
    if target_col and len(malinputdf) > 0:
        # The scaler is fitted on benign rows only; test rows are merely transformed.
        X_test = torch.tensor(scaler.transform(malinputdf.values), dtype=torch.float32)
        test_dataset = TabularDatasetTest(X_test)
        test_Dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return {
        'train_loader': train_DataLoader,
        'test_loader': test_Dataloader,
        'input_df': inputdf,
        'target_df': outputdf,
        'malinput_df': malinputdf,
        'beninput_df': beninputdf,
        'target_col': target_col,
        'scaler': scaler,
        'sample': sample
    }
|
181 |
+
################################################## Model #############################################################################
|
182 |
+
class EncoderRNN(nn.Module):
    """
    Two-stage LSTM encoder.

    A bidirectional LSTM (``hidden_size // 2`` units per direction) is
    followed by dropout + ReLU and a second LSTM that compresses to
    ``input_size // 2`` features.

    Args:
        input_size (int): Number of input features per time step.
        hidden_size (int): Width of the intermediate representation. It
            should be even so the concatenated bidirectional output matches
            the input width of the second LSTM.
        num_layers (int): Number of stacked layers in each LSTM.
        isCuda: Stored on the instance; not used by this class.
    """

    def __init__(self, input_size, hidden_size, num_layers, isCuda):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bottleneck_size = input_size // 2

        self.isCuda = isCuda
        self.lstm1 = nn.LSTM(input_size, hidden_size // 2, num_layers, batch_first=True, bidirectional=True)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.lstm2 = nn.LSTM(hidden_size, self.bottleneck_size, num_layers, batch_first=True)

    def forward(self, inputs):
        """
        Encode a batch-first sequence.

        Returns:
            tuple: ``(encoded_input, intermediate_state)`` where
            ``intermediate_state`` is the ReLU(dropout(.)) output of the
            first LSTM and ``encoded_input`` is the output of the second.
        """
        intermediate_state, _ = self.lstm1(inputs)
        intermediate_state = self.relu(self.dropout(intermediate_state))
        encoded_input, _ = self.lstm2(intermediate_state)
        return encoded_input, intermediate_state
|
202 |
+
|
203 |
+
class DecoderRNN(nn.Module):
    """
    Two-stage LSTM decoder with a skip connection.

    The bottleneck sequence is expanded to ``hidden_size`` by a first LSTM,
    concatenated feature-wise with the encoder's intermediate state, and
    mapped to ``output_size`` features by a second LSTM.

    Args:
        hidden_size (int): Width of the expanded representation. The skip
            input must have the same width, because the final LSTM expects
            ``2 * hidden_size`` input features.
        output_size (int): Number of reconstructed features per time step.
        num_layers (int): Number of stacked layers in each LSTM.
        isCuda: Stored on the instance; not used by this class.
    """

    def __init__(self, hidden_size, output_size, num_layers, isCuda):
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.bottleneck_size = output_size // 2

        self.isCuda = isCuda
        self.lstm2 = nn.LSTM(self.bottleneck_size, hidden_size, num_layers, batch_first=True)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.lstm1 = nn.LSTM(2 * hidden_size, output_size, num_layers, batch_first=True)

    def forward(self, encoded_input, intermediate_state):
        """Reconstruct the sequence from the bottleneck and the skip input."""
        expanded, _ = self.lstm2(encoded_input)
        # Skip connection: concatenate along the feature axis (dim=2).
        merged = torch.cat((self.dropout(expanded), intermediate_state), dim=2)
        merged = self.relu(merged)
        decoded_output, _ = self.lstm1(merged)
        return decoded_output
|
224 |
+
|
225 |
+
class LSTMAE(nn.Module):
    """
    LSTM autoencoder built from ``EncoderRNN`` and ``DecoderRNN``.

    Args:
        input_size (int): Number of features per time step.
        hidden_size (int): Intermediate width; odd values are rounded up to
            the next even number because the encoder's bidirectional LSTM
            splits it across two directions.
        num_layers (int): Number of stacked layers in every LSTM.
        isCuda: Passed through to the encoder and decoder.
    """

    def __init__(self, input_size, hidden_size, num_layers=1, isCuda="cuda" if torch.cuda.is_available() else "cpu"):
        super().__init__()
        hidden_size = hidden_size if hidden_size % 2 == 0 else hidden_size + 1
        self.encoder = EncoderRNN(input_size, hidden_size, num_layers, isCuda)
        self.decoder = DecoderRNN(hidden_size, input_size, num_layers, isCuda)
        self.initialize_weights()

    def initialize_weights(self):
        """
        Initialise every LSTM in the network: Xavier-uniform for the
        input-to-hidden and hidden-to-hidden weight matrices, zeros for
        the biases.
        """
        for m in self.modules():
            if isinstance(m, nn.LSTM):
                for name, param in m.named_parameters():
                    if 'weight' in name:
                        # Was `'ih' or 'hh' in name`, which is always true.
                        if 'ih' in name or 'hh' in name:
                            nn.init.xavier_uniform_(param.data)
                    elif 'bias' in name:
                        nn.init.zeros_(param.data)

    def forward(self, input):
        """Return the reconstruction of ``input`` (batch-first sequence)."""
        encoded_input, intermediate_state = self.encoder(input)
        decoded_output = self.decoder(encoded_input, intermediate_state)
        return decoded_output
|
251 |
+
|
252 |
+
|
253 |
+
|
254 |
+
############################################## Andrews Curves ###########################################################################
|
255 |
+
def make_better_andrews_curves(df, class_column, colors=None, plot_title="Andrews Curves",
                               line_width=0.8, transparency=0.5, sample_size=None, legend_loc='best',
                               custom_labels=None, x_axis_ticks=None, x_axis_labels=None,
                               figsize=(10, 6), dpi=300, name = "andrews_curves"):
    """
    Generate a styled Andrews Curves plot and save it as ``{name}.png``.

    Args:
        df: pandas DataFrame containing the data.
        class_column: Name of the column containing class labels.
        colors: List of colors to use for each class (e.g., ['blue', 'red']).
            Defaults to matplotlib's defaults if None.
        plot_title: Title of the plot.
        line_width: Width of the lines.
        transparency: Alpha value (transparency) of the lines.
        sample_size: If given and smaller than ``len(df)``, a random sample of
            that many rows (fixed seed 42) is plotted instead of all rows.
        legend_loc: Location of the legend (e.g., 'best', 'upper right').
        custom_labels: Dict mapping original class labels to legend labels;
            labels missing from the dict are kept unchanged.
        x_axis_ticks: Tick positions for the x-axis. Default ticks are used
            when falsy.
        x_axis_labels: Labels for ``x_axis_ticks`` (same length).
        figsize: Figure size (width, height) in inches.
        dpi: Resolution of the saved image.
        name: Output path without the ``.png`` extension.

    Returns:
        None. The figure is closed after it has been written to disk.
    """
    if sample_size and sample_size < len(df):
        df = df.sample(n=sample_size, random_state=42)  # Sample for faster plotting

    fig = plt.figure(figsize=figsize)  # Becomes the current figure used below
    try:
        ax = andrews_curves(df, class_column, color=colors)

        plt.title(plot_title, fontsize=16)
        plt.xlabel("t", fontsize=12)
        plt.ylabel("f(t)", fontsize=12)

        for line in ax.get_lines():
            line.set_linewidth(line_width)
            line.set_alpha(transparency)

        # Customize legend
        if custom_labels:
            handles, labels = ax.get_legend_handles_labels()
            new_labels = [custom_labels.get(label, label) for label in labels]
            ax.legend(handles, new_labels, loc=legend_loc, fontsize=10)
        else:
            plt.legend(loc=legend_loc, fontsize=10)

        # Customize x-axis ticks and labels
        if x_axis_ticks:
            plt.xticks(x_axis_ticks, x_axis_labels)

        plt.grid(False)  # Grid lines are disabled
        plt.tight_layout()  # Prevent labels from overlapping
        plt.savefig(f"{name}.png", dpi=dpi)
    finally:
        # Release the figure; otherwise every call leaks one open figure
        # in a long-running process.
        plt.close(fig)
|
308 |
+
################################################# Model Training ######################################################################
|
309 |
+
def train_model(model, train_loader, test_loader = None, learning_rate=0.001, epochs=10):
    """
    Train a reconstruction model with MSE loss and the Adam optimizer.

    Args:
        model (nn.Module): Maps a batch-first input to an output of the same
            shape. Batches are moved to the device of the model's parameters.
        train_loader: Iterable of ``(inputs, targets)`` batches.
        test_loader: Optional iterable of ``(inputs, targets)`` batches of
            single rows. A sequence axis of length 1 is inserted before the
            forward pass and removed afterwards.
        learning_rate (float): Adam learning rate.
        epochs (int): Number of passes over ``train_loader``.

    Returns:
        tuple: ``(model, train_loss_df, mse_losses, info)`` where
            - ``train_loss_df`` has one column per epoch ('Epoch N') holding
              the per-batch training loss,
            - ``mse_losses`` holds the element-wise MSE averaged over the
              last axis for every sample seen in the final epoch,
            - ``info`` is the per-epoch log, one line per epoch.
    """
    # `device` used to be an undefined global here; take it from the model.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    info = ""
    train_loss_data = {}
    mse_losses = []  # stays defined (and empty) when epochs == 0

    for epoch in range(epochs):
        model.train()
        is_last_epoch = epoch == epochs - 1
        train_loss = 0.0
        epoch_train_losses = []
        mse_losses = []
        for inputs, targets in train_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if is_last_epoch:
                # Keep per-sample reconstruction errors of the final epoch;
                # the caller derives thresholds from them.
                mse_loss = F.mse_loss(targets, outputs, reduction='none')
                mse_losses.extend(mse_loss.mean(dim=-1).tolist())
            batch_loss = loss.item()
            epoch_train_losses.append(batch_loss)
            train_loss += batch_loss
        train_loss /= max(len(train_loader), 1)

        # Validation
        test_loss = 0.0
        if test_loader:
            model.eval()
            with torch.no_grad():
                for inputs, targets in test_loader:
                    inputs = inputs.to(device)
                    targets = targets.to(device)
                    outputs = model(inputs.unsqueeze(1))
                    test_loss += criterion(outputs.squeeze(1), targets).item()
            test_loss /= len(test_loader)

        epoch_line = f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}"
        info += epoch_line + "\n"
        print(epoch_line)
        train_loss_data[f'Epoch {epoch + 1}'] = epoch_train_losses

    # Wrapping in Series lets columns of unequal length be NaN-padded.
    train_loss_df = pd.DataFrame({k: pd.Series(v) for k, v in train_loss_data.items()})
    return model, train_loss_df, mse_losses, info
|
359 |
+
|
360 |
+
#########################################################################################################################################
|
361 |
+
def detect_anomalies(csv_file, sample_choice="Custom Data", data_slicing_percentage=80, epochs=3, threshold_factor=1.0):
    """
    Gradio callback: train on a custom CSV, or score a bundled sample with
    the pre-trained model.

    All artifacts are written to a fresh ``Results`` directory, which is then
    zipped into ``Results.zip``.

    * ``sample_choice == "Custom Data"``: the uploaded CSV is sub-sampled,
      preprocessed and used to train a new ``LSTMAE``. The model, scaler,
      dataset, Andrews curves, loss curves and the Q1/Q3 of the final-epoch
      reconstruction errors are saved. No anomaly scoring is done in this
      mode and ``threshold_factor`` is not used.
    * otherwise: ``Data/<sample_choice>`` is loaded and scored with
      ``model.pth``. Rows whose reconstruction error exceeds
      ``Q3 + threshold_factor * IQR`` are flagged and compared with the
      ``Label`` column.

    Args:
        csv_file: Uploaded file (an object with a ``.name`` path or a plain
            path string). Only used for "Custom Data".
        sample_choice (str): "Custom Data" or a file name inside ``Data``.
        data_slicing_percentage (int): Percentage of rows to sample (seed 42).
        epochs (int): Training epochs for "Custom Data".
        threshold_factor (float): IQR multiplier for the anomaly threshold.

    Returns:
        tuple: ``(summary_text, image_paths, "Results.zip")``.

    Raises:
        gr.Error: If "Custom Data" is selected but no file was uploaded.
    """
    images = []
    anomaly_summary = ""
    device = "cuda" if torch.cuda.is_available() else "cpu"

    if sample_choice == "Custom Data" and csv_file is None:
        raise gr.Error("Please upload a CSV file or choose one of the bundled samples.")

    if os.path.exists("Results"):
        shutil.rmtree("Results")
    os.mkdir("Results")

    if sample_choice == "Custom Data":
        anomaly_summary += f"[INFO] Loading Custom Dataset {data_slicing_percentage}%...\n"
        csv_path = getattr(csv_file, "name", csv_file)
        dataframe = pd.read_csv(csv_path).sample(frac=data_slicing_percentage/100, random_state=42).reset_index(drop=True)
        anomaly_summary += "[INFO] Preprocessing Dataset...\n"
        if 'Label' in dataframe.columns:
            processed_data = preprocess_dataframe(dataframe, target_column="Label")
        else:
            processed_data = preprocess_dataframe(dataframe)
            anomaly_summary += "[WARNING] No Label Column Found, Using Unsupervised Learning...\n"

        anomaly_summary += "[INFO] Generating Andrews Curves...\n"
        make_better_andrews_curves(processed_data['sample'], 'Label',
                                   colors=['Blue', 'Red'],
                                   plot_title="Dataset Andrews Curves",
                                   line_width=1.2,
                                   transparency=0.7,
                                   legend_loc='upper right',
                                   figsize=(12, 7),
                                   name = "Results/Dataset_andrews_curves")
        images.append("Results/Dataset_andrews_curves.png")

        model = LSTMAE(len(processed_data["input_df"].columns), 128).to(device)
        anomaly_summary += "[INFO] Training Model...\n"
        _, train_loss_df, mse_losses, info = train_model(
            model, processed_data['train_loader'], processed_data['test_loader'], epochs=int(epochs))
        anomaly_summary += info

        anomaly_summary += "[INFO] Saving model, scaler, Dataset Used...\n"
        dataframe.to_csv('Results/Original_dataset.csv', columns=dataframe.columns, index=False)
        with open('Results/scaler.pkl', 'wb') as fh:
            pickle.dump(processed_data['scaler'], fh)
        torch.save(model, 'Results/model.pth')

        anomaly_summary += "[INFO] Generating Loss Curves...\n"
        fig = plt.figure(figsize=(12, 6))
        try:
            for column in train_loss_df.columns:
                plt.plot(train_loss_df[column], label=column)
            plt.xlabel("Batch")
            plt.ylabel("Loss")
            plt.title("Training Loss per Epoch")
            plt.legend()  # One entry per epoch
            plt.grid(True)
            plt.tight_layout()
            plt.savefig("Results/loss_curves.png", dpi=300)
        finally:
            plt.close(fig)
        images.append("Results/loss_curves.png")

        # Store the quartiles of the final-epoch reconstruction error next to
        # the model so a threshold can be derived from them later.
        Q1, Q3 = np.percentile(mse_losses, [25, 75])
        with open('Results/INFO.pkl', 'wb') as fh:
            pickle.dump({"Q1": Q1, "Q3": Q3}, fh)

    else:
        # Hard-coded quartiles that accompany the bundled model.pth.
        Q1, Q3 = 0.19226229563355446, 0.7454282641410828
        IQR = Q3 - Q1
        upper_bound = Q3 + threshold_factor * IQR

        data_path = os.path.join(os.path.abspath('Data'), sample_choice)
        dataframe = pd.read_csv(data_path).sample(frac=data_slicing_percentage/100, random_state=42).reset_index(drop=True)
        anomaly_summary += "[INFO] Saving model, scaler, Dataset Used...\n"
        dataframe.to_csv('Results/Scaled_dataset.csv', columns=dataframe.columns, index=False)

        # NOTE(security): pickle.load and torch.load(weights_only=False)
        # execute arbitrary code from the file; only acceptable here because
        # both files ship with the app and are never user-supplied.
        with open('scaler.pkl', 'rb') as fh:
            scaler = pickle.load(fh)

        # Every column except the last is treated as a (scaled) feature and
        # the last one as the label.
        feature_columns = dataframe.columns[:-1]
        original_df = pd.DataFrame(scaler.inverse_transform(dataframe.iloc[:, :-1]), columns=feature_columns)
        original_df['Label'] = dataframe['Label']
        original_df.to_csv('Results/Original_dataset.csv', columns=dataframe.columns, index=False)
        shutil.copy('scaler.pkl', 'Results/scaler.pkl')
        shutil.copy('model.pth', 'Results/model.pth')

        anomaly_summary += "[INFO] Generating Andrews Curves...\n"
        make_better_andrews_curves(dataframe, 'Label',
                                   colors=['Blue', 'Red'],
                                   plot_title="Dataset Andrews Curves",
                                   line_width=1.2,
                                   transparency=0.7,
                                   legend_loc='upper right',
                                   figsize=(12, 7),
                                   name = "Results/Dataset_andrews_curves")
        images.append("Results/Dataset_andrews_curves.png")

        inputdf = torch.tensor(dataframe.iloc[:, :-1].to_numpy(), dtype=torch.float32, device=device)
        outputdf = dataframe['Label']
        model = torch.load("model.pth", weights_only=False, map_location=device)
        model.eval()
        with torch.no_grad():  # inference only; no autograd graph needed
            outputs = model(inputdf.unsqueeze(1)).squeeze(1)
            mse_loss = F.mse_loss(outputs, inputdf, reduction='none')
            mse_loss_per_data_point = mse_loss.mean(dim=-1)

        anomaly_scores = pd.DataFrame({'Loss': mse_loss_per_data_point.cpu().numpy(), 'Label': outputdf})
        anomaly_scores['Anomaly'] = (anomaly_scores['Loss'] > upper_bound).astype(int)
        # Same rule as preprocess_dataframe: 'benign' (any case) is 0, else 1.
        anomaly_scores['Label'] = anomaly_scores['Label'].apply(lambda x: 0 if str(x).lower() == 'benign' else 1)

        y_true = anomaly_scores['Label']
        y_pred = anomaly_scores['Anomaly']
        # labels=[0, 1] keeps the matrix 2x2 even for single-class samples,
        # which the two display labels below require.
        out_confusion_matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])
        disp = ConfusionMatrixDisplay(confusion_matrix=out_confusion_matrix, display_labels=["Benign", "Malignant"])
        disp.plot(cmap=plt.cm.Blues)
        try:
            plt.title('Confusion Matrix')
            plt.savefig("Results/confusion_matrix.png", dpi=300)
        finally:
            plt.close(disp.figure_)
        images.append("Results/confusion_matrix.png")

        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred, zero_division=0)
        recall = recall_score(y_true, y_pred, zero_division=0)
        f1 = f1_score(y_true, y_pred, zero_division=0)
        anomaly_summary += f"[RESULT] Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}\n"
        anomaly_summary += f"Confusion Matrix:\n{out_confusion_matrix}\n"

    # Bundle everything in Results/ into a single downloadable archive.
    folder_path = "Results"
    with zipfile.ZipFile("Results.zip", 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(folder_path):
            for file in files:
                file_path = os.path.join(root, file)
                zipf.write(file_path, os.path.relpath(file_path, folder_path))

    return anomaly_summary, images, "Results.zip"
|
473 |
+
|
474 |
+
# Gradio UI: the inputs map positionally onto detect_anomalies' parameters
# (csv_file, sample_choice, data_slicing_percentage, epochs, threshold_factor).
iface = gr.Interface(
    fn=detect_anomalies,
    inputs=[
        gr.File(file_types=[".csv"], label="Upload CSV File"),
        gr.Radio(["Benign500.csv", "Malignant500.csv", "Balance1000.csv", "Custom Data"], value="Custom Data", label="Choose Samples or CustomData"),
        gr.Slider(minimum=10, maximum=100, step=10, value=80, label="Data Usage Percentage (Training or Detection)"),
        gr.Slider(minimum=1, maximum=20, step=1, value=3, label="Training Epochs (Default value is 3)"),
        gr.Slider(minimum=0, maximum=5, step=0.5, value=1.5, label="Loss Threshold (x, higher x means high threshold) = Q3 + x*IQR"),
    ],
    outputs=[
        gr.Textbox(label="Anomaly Summary"),
        gr.Gallery(label="Anomaly Plots"),
        "file",
    ],
    title="Your own Anomaly Detector",
    # The description previously named the wrong artifacts ("Result.zip",
    # "model.pkl") and ended in a dangling "or )".
    description="""
### Fully Unsupervised Anomaly Detection Tool (uses Bidirectional based Autoencoder with skip conn. and Dropout Layers)
##### Download *"Results.zip"* (contains model.pth, scaler.pkl, the dataset and output images) from the bottom right.
Upload a *CSV file* for custom anomaly detection (include a "Label" column to train on benign rows only, or omit it to train on all rows), or use *our trained model* on one of the bundled samples.
"""
)

if __name__ == "__main__":
    iface.launch(debug=False)
|
model.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8728659966eeda34b5321cf45505b5ffa1e3f161a40a6b15e87316c2804f8ea6
|
3 |
+
size 1173120
|
requirements.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pandas==2.2.2
|
2 |
+
numpy==2.0.2
|
3 |
+
torch==2.6
|
4 |
+
gradio==5.23.3
|
5 |
+
sklearn-pandas==2.2.0
|
6 |
+
jsonpickle==4.0.5
|
7 |
+
pickleshare==0.7.5
|
scaler.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fbc2f66ea2123e8415ec89c30132e2ce7efa6f71a02538613fd0d236700f7fcf
|
3 |
+
size 2331
|