Spaces:

deepugaur
/

prompt_projectapp

Sleeping

App Files Community

deepugaur committed on Nov 23, 2024

Commit

148cd14

verified ·

1 Parent(s): 92203ec

Update model.h5

Browse files

Files changed (1) hide show

model.h5 +48 -34

model.h5 CHANGED Viewed

@@ -1,36 +1,50 @@
-# Example model training script
-from tensorflow.keras.models import Sequential
-from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
 from tensorflow.keras.preprocessing.text import Tokenizer
 from tensorflow.keras.preprocessing.sequence import pad_sequences
-import numpy as np
-import pickle
-# Sample dataset
-texts = ["This is valid", "This is malicious", "Valid text", "Malicious text"]
-labels = [0, 1, 0, 1]  # 0: Valid, 1: Malicious
-# Tokenization
-tokenizer = Tokenizer(num_words=1000)
-tokenizer.fit_on_texts(texts)
-sequences = tokenizer.texts_to_sequences(texts)
-padded_sequences = pad_sequences(sequences, maxlen=50)
-# Save the tokenizer
-with open("tokenizer.pkl", "wb") as f:
-    pickle.dump(tokenizer, f)
-# Model architecture
-model = Sequential([
-    Embedding(input_dim=1000, output_dim=64, input_length=50),
-    LSTM(64, return_sequences=False),
-    Dropout(0.5),
-    Dense(1, activation="sigmoid")
-])
-# Compile and train the model
-model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
-model.fit(padded_sequences, np.array(labels), epochs=10)
-# Save the model
-model.save("model.h5")

+import numpy as np
+import pandas as pd
+import tensorflow as tf
+from sklearn.model_selection import train_test_split
 from tensorflow.keras.preprocessing.text import Tokenizer
 from tensorflow.keras.preprocessing.sequence import pad_sequences
+from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout, Input
+from tensorflow.keras.models import Model
+from tensorflow.keras import regularizers
+from tensorflow.keras.callbacks import EarlyStopping
+from tensorflow.keras.optimizers import Adam
# Training script for a bidirectional-LSTM classifier that scores a prompt as
# valid (0) or malicious (1).

# Shared settings. VOCAB_SIZE feeds both the tokenizer and the embedding
# layer, and MAX_LENGTH feeds both the padding and the model input, so each
# value is defined once to keep the two users in sync.
VOCAB_SIZE = 5000
MAX_LENGTH = 100
LABEL_MAP = {'valid': 0, 'malicious': 1}
DATA_PATH = "train prompt.csv"
MODEL_PATH = "deep_learning_model.h5"
TOKENIZER_PATH = "tokenizer.json"


def prepare_dataset(data):
    """Return ``(texts, labels)`` arrays built from the raw training frame.

    Args:
        data: DataFrame with an ``input`` column (prompt text) and a
            ``label`` column holding ``'valid'`` / ``'malicious'`` or
            values that are already numeric.

    Returns:
        A tuple ``(texts, labels)``: ``texts`` is an object array of ``str``
        and ``labels`` is a ``float32`` array of the same length.

    Rows whose ``input`` is missing, or whose ``label`` cannot be turned
    into a number, are dropped. The CSV is read with quoting disabled and
    bad lines skipped, so such rows can occur, and they would otherwise
    break tokenization or training.
    """
    # Map known label names and pass every other value through untouched, so
    # labels that are already numeric keep working.
    mapped = data['label'].map(lambda value: LABEL_MAP.get(value, value))
    numeric = pd.to_numeric(mapped, errors='coerce')
    keep = numeric.notna() & data['input'].notna()
    texts = data.loc[keep, 'input'].astype(str).to_numpy()
    labels = numeric[keep].to_numpy(dtype='float32')
    return texts, labels


def build_model():
    """Build and compile the two-layer bidirectional LSTM binary classifier."""
    inputs = Input(shape=(MAX_LENGTH,))
    # The sequence length is already fixed by Input(shape=...), so the
    # deprecated `input_length` argument is not passed to Embedding.
    x = Embedding(input_dim=VOCAB_SIZE, output_dim=128)(inputs)
    x = Bidirectional(
        LSTM(64, return_sequences=True, dropout=0.2,
             kernel_regularizer=regularizers.l2(0.01))
    )(x)
    x = Dropout(0.3)(x)
    x = Bidirectional(
        LSTM(64, dropout=0.2, kernel_regularizer=regularizers.l2(0.01))
    )(x)
    malicious_output = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=inputs, outputs=malicious_output)
    model.compile(
        optimizer=Adam(learning_rate=0.0001),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model


def main():
    """Load the CSV, train the classifier and save the model and tokenizer."""
    data = pd.read_csv(
        DATA_PATH, sep=',', quoting=3, encoding='ISO-8859-1',
        on_bad_lines='skip', engine='python',
    )
    X, y = prepare_dataset(data)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Fit the vocabulary on the training split only.
    tokenizer = Tokenizer(num_words=VOCAB_SIZE)
    tokenizer.fit_on_texts(X_train)
    X_train_pad = pad_sequences(
        tokenizer.texts_to_sequences(X_train), maxlen=MAX_LENGTH
    )
    X_test_pad = pad_sequences(
        tokenizer.texts_to_sequences(X_test), maxlen=MAX_LENGTH
    )

    # Persist the fitted vocabulary: the saved model can only be used on text
    # that is encoded with exactly this word index.
    with open(TOKENIZER_PATH, "w", encoding="utf-8") as handle:
        handle.write(tokenizer.to_json())

    model = build_model()
    # NOTE(review): the held-out split doubles as the early-stopping
    # validation set, so it is not an untouched test set.
    early_stopping = EarlyStopping(
        monitor='val_loss', patience=3, restore_best_weights=True
    )
    model.fit(
        X_train_pad, y_train,
        epochs=5, batch_size=32,
        validation_data=(X_test_pad, y_test),
        callbacks=[early_stopping],
    )

    model.save(MODEL_PATH)
    print(f"Model saved to {MODEL_PATH}")


if __name__ == "__main__":
    main()