Spaces:
Sleeping
Sleeping
Create train_model.py
Browse files- train_model.py +52 -0
train_model.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
|
| 3 |
+
import numpy as np
|
| 4 |
+
import pandas as pd
|
| 5 |
+
import tensorflow as tf
|
| 6 |
+
from sklearn.model_selection import train_test_split
|
| 7 |
+
from tensorflow.keras.preprocessing.text import Tokenizer
|
| 8 |
+
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
| 9 |
+
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout, Input
|
| 10 |
+
from tensorflow.keras.models import Model
|
| 11 |
+
from tensorflow.keras import regularizers
|
| 12 |
+
from tensorflow.keras.callbacks import EarlyStopping
|
| 13 |
+
from tensorflow.keras.optimizers import Adam
|
| 14 |
+
|
| 15 |
+
# Load and preprocess data
|
| 16 |
+
data = pd.read_csv("train prompt.csv", sep=',', quoting=3, encoding='ISO-8859-1', on_bad_lines='skip', engine='python')
|
| 17 |
+
data['label'] = data['label'].replace({'valid': 0, 'malicious': 1})
|
| 18 |
+
|
| 19 |
+
X = data['input'].values
|
| 20 |
+
y = data['label'].values
|
| 21 |
+
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
|
| 22 |
+
|
| 23 |
+
# Tokenizer and padding
|
| 24 |
+
tokenizer = Tokenizer(num_words=5000)
|
| 25 |
+
tokenizer.fit_on_texts(X_train)
|
| 26 |
+
|
| 27 |
+
X_train_seq = tokenizer.texts_to_sequences(X_train)
|
| 28 |
+
X_test_seq = tokenizer.texts_to_sequences(X_test)
|
| 29 |
+
|
| 30 |
+
max_length = 100
|
| 31 |
+
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length)
|
| 32 |
+
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length)
|
| 33 |
+
|
| 34 |
+
# Model definition
|
| 35 |
+
input_layer = Input(shape=(max_length,))
|
| 36 |
+
embedding_layer = Embedding(input_dim=5000, output_dim=128, input_length=max_length)(input_layer)
|
| 37 |
+
x = Bidirectional(LSTM(64, return_sequences=True, dropout=0.2, kernel_regularizer=regularizers.l2(0.01)))(embedding_layer)
|
| 38 |
+
x = Dropout(0.3)(x)
|
| 39 |
+
x = Bidirectional(LSTM(64, dropout=0.2, kernel_regularizer=regularizers.l2(0.01)))(x)
|
| 40 |
+
malicious_output = Dense(1, activation='sigmoid')(x)
|
| 41 |
+
|
| 42 |
+
model = Model(inputs=input_layer, outputs=malicious_output)
|
| 43 |
+
optimizer = Adam(learning_rate=0.0001)
|
| 44 |
+
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
|
| 45 |
+
|
| 46 |
+
# Training the model
|
| 47 |
+
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
|
| 48 |
+
model.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_data=(X_test_pad, y_test), callbacks=[early_stopping])
|
| 49 |
+
|
| 50 |
+
# Save the trained model
|
| 51 |
+
model.save("deep_learning_model.h5")
|
| 52 |
+
print("Model saved to deep_learning_model.h5")
|