import dataclasses

import torch
from datasets import Dataset, load_dataset
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# Load the "combined_reduced" configuration of the phishing dataset.
# NOTE(review): trust_remote_code=True lets the dataset repository's loading
# script execute on this machine -- confirm the source is trusted before
# running. Recent `datasets` releases may no longer accept script-based
# datasets; verify against the installed version.
dataset = load_dataset(
    "ealvaradob/phishing-dataset",
    "combined_reduced",
    trust_remote_code=True,
)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible between runs.
df = dataset["train"].to_pandas()
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# preserve_index=False stops the shuffled pandas index from being carried
# over into the datasets as an extra column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# Tokenizer matching the checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased")

def preprocess_data(examples):
    """Tokenize a batch of examples with the module-level tokenizer.

    Args:
        examples: Batch mapping supplied by ``Dataset.map(..., batched=True)``;
            only its ``"text"`` entry is read.

    Returns:
        The tokenizer output for the batch, with every sequence truncated
        and padded to exactly 512 tokens.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )


# Tokenize both splits; batched=True passes many rows per call to
# preprocess_data instead of one row at a time.
tokenized_train = train_dataset.map(preprocess_data, batched=True)
tokenized_test = test_dataset.map(preprocess_data, batched=True)

# Drop the raw strings now that the tokenizer output has been added.
tokenized_train = tokenized_train.remove_columns(["text"])
tokenized_test = tokenized_test.remove_columns(["text"])

# Have the datasets return torch tensors when indexed.
tokenized_train.set_format("torch")
tokenized_test.set_format("torch")

# Pretrained checkpoint loaded with a two-label sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-large-uncased",
    num_labels=2,
)

# transformers renamed `evaluation_strategy` to `eval_strategy`; use whichever
# keyword the installed release actually defines so the script runs on both.
_training_arg_names = {f.name for f in dataclasses.fields(TrainingArguments)}
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in _training_arg_names
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    # Older transformers releases require output_dir; the per-epoch
    # checkpoints requested by save_strategy are written here.
    output_dir="./phishing_checkpoints",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_steps=10,
    # Evaluate on the held-out split at the end of every epoch.
    **{_eval_strategy_key: "epoch"},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)

# Fine-tune the model with the configuration above.
trainer.train()

# Save the model and tokenizer to the same directory so both can be
# reloaded from one path.
model.save_pretrained("./phishing_model")
tokenizer.save_pretrained("./phishing_model")

# Reload the saved artifacts from disk and classify one sample message.
loaded_tokenizer = AutoTokenizer.from_pretrained("./phishing_model")
loaded_model = AutoModelForSequenceClassification.from_pretrained("./phishing_model")

text = "Your account has been compromised, please reset your password now!"
inputs = loaded_tokenizer(
    text,
    return_tensors="pt",
    truncation=True,
    padding=True,
    max_length=512,
)

# Switch to evaluation mode and skip gradient tracking for inference.
loaded_model.eval()
with torch.no_grad():
    outputs = loaded_model(**inputs)
    # Index of the highest-scoring logit is the predicted class id.
    prediction = torch.argmax(outputs.logits, dim=-1).item()

print(f"Predicted label: {prediction}")