deepugaur committed on
Commit
148cd14
·
verified ·
1 Parent(s): 92203ec

Update model.h5

Browse files
Files changed (1) hide show
  1. model.h5 +48 -34
model.h5 CHANGED
@@ -1,36 +1,50 @@
1
- # Example model training script
2
- from tensorflow.keras.models import Sequential
3
- from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
 
4
  from tensorflow.keras.preprocessing.text import Tokenizer
5
  from tensorflow.keras.preprocessing.sequence import pad_sequences
6
- import numpy as np
7
- import pickle
8
-
9
- # Sample dataset
10
- texts = ["This is valid", "This is malicious", "Valid text", "Malicious text"]
11
- labels = [0, 1, 0, 1] # 0: Valid, 1: Malicious
12
-
13
- # Tokenization
14
- tokenizer = Tokenizer(num_words=1000)
15
- tokenizer.fit_on_texts(texts)
16
- sequences = tokenizer.texts_to_sequences(texts)
17
- padded_sequences = pad_sequences(sequences, maxlen=50)
18
-
19
- # Save the tokenizer
20
- with open("tokenizer.pkl", "wb") as f:
21
- pickle.dump(tokenizer, f)
22
-
23
- # Model architecture
24
- model = Sequential([
25
- Embedding(input_dim=1000, output_dim=64, input_length=50),
26
- LSTM(64, return_sequences=False),
27
- Dropout(0.5),
28
- Dense(1, activation="sigmoid")
29
- ])
30
-
31
- # Compile and train the model
32
- model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
33
- model.fit(padded_sequences, np.array(labels), epochs=10)
34
-
35
- # Save the model
36
- model.save("model.h5")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ import tensorflow as tf
4
+ from sklearn.model_selection import train_test_split
5
  from tensorflow.keras.preprocessing.text import Tokenizer
6
  from tensorflow.keras.preprocessing.sequence import pad_sequences
7
+ from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout, Input
8
+ from tensorflow.keras.models import Model
9
+ from tensorflow.keras import regularizers
10
+ from tensorflow.keras.callbacks import EarlyStopping
11
+ from tensorflow.keras.optimizers import Adam
12
+
13
+ # Load and preprocess data
14
+ data = pd.read_csv("train prompt.csv", sep=',', quoting=3, encoding='ISO-8859-1', on_bad_lines='skip', engine='python')
15
+ data['label'] = data['label'].replace({'valid': 0, 'malicious': 1})
16
+
17
+ X = data['input'].values
18
+ y = data['label'].values
19
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
20
+
21
+ # Tokenizer and padding
22
+ tokenizer = Tokenizer(num_words=5000)
23
+ tokenizer.fit_on_texts(X_train)
24
+
25
+ X_train_seq = tokenizer.texts_to_sequences(X_train)
26
+ X_test_seq = tokenizer.texts_to_sequences(X_test)
27
+
28
+ max_length = 100
29
+ X_train_pad = pad_sequences(X_train_seq, maxlen=max_length)
30
+ X_test_pad = pad_sequences(X_test_seq, maxlen=max_length)
31
+
32
+ # Model definition
33
+ input_layer = Input(shape=(max_length,))
34
+ embedding_layer = Embedding(input_dim=5000, output_dim=128, input_length=max_length)(input_layer)
35
+ x = Bidirectional(LSTM(64, return_sequences=True, dropout=0.2, kernel_regularizer=regularizers.l2(0.01)))(embedding_layer)
36
+ x = Dropout(0.3)(x)
37
+ x = Bidirectional(LSTM(64, dropout=0.2, kernel_regularizer=regularizers.l2(0.01)))(x)
38
+ malicious_output = Dense(1, activation='sigmoid')(x)
39
+
40
+ model = Model(inputs=input_layer, outputs=malicious_output)
41
+ optimizer = Adam(learning_rate=0.0001)
42
+ model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
43
+
44
+ # Training the model
45
+ early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
46
+ model.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_data=(X_test_pad, y_test), callbacks=[early_stopping])
47
+
48
+ # Save the trained model
49
+ model.save("deep_learning_model.h5")
50
+ print("Model saved to deep_learning_model.h5")