Commit · 6eb199d

add bert & xgboost from global

Files changed:
- .gitignore +15 -0
- README.md +49 -0
- app.py +639 -0
- model.py +49 -0
- requirements.txt +12 -0
- xgboost/URLFeatureExtraction.py +382 -0
- xgboost/__init__.py +0 -0
- xgboost/features.py +347 -0
- xgboost_wrapper.py +233 -0
.gitignore
ADDED
@@ -0,0 +1,15 @@
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+.Python
+env/
+venv/
+.venv/
+.env
+*.log
+.DS_Store
+Thumbs.db
+flagged/
+*.joblib
+*.pt
README.md
ADDED
@@ -0,0 +1,49 @@
+---
+title: Phishing Detector
+emoji: 🎣
+colorFrom: red
+colorTo: blue
+sdk: gradio
+sdk_version: 4.39.0
+app_file: app.py
+pinned: false
+---
+
+# Phishing Detector
+
+A comprehensive multi-model phishing detection system using:
+
+## 🤖 Models
+- **DeBERTa + LSTM**: Advanced transformer with attention mechanism (`khoa-done/phishing-detector`)
+- **BERT**: Fine-tuned BERT model (`th1enq/bert_checkpoint`)
+- **XGBoost**: Traditional ML with feature engineering (`th1enq/xgboost_checkpoint`)
+
+## ✨ Features
+- **URL Structure Analysis**: Extracts 30+ features from URL patterns
+- **HTML Content Analysis**: Extracts 43+ features from webpage content
+- **Combined Predictions**: Weighted ensemble of all models
+- **Visual Attention Weights**: See which tokens influence decisions
+- **Real-time Web Scraping**: Fetch and analyze live websites
+- **Multi-tab Interface**: Compare results across different models
+
+## 🚀 Usage
+1. **Enter a URL**: The system fetches the webpage and analyzes both the URL structure and the content
+2. **Enter text**: Direct analysis of suspicious text content
+3. **Compare models**: Use the different tabs to see how each model performs
+
+## 📊 Model Performance
+- **DeBERTa + LSTM**: Best for context understanding, with attention visualization
+- **BERT**: Reliable baseline with robust predictions
+- **XGBoost**: Fast traditional ML approach with feature interpretability
+
+## 🔧 Technical Details
+- All models are loaded from the Hugging Face Hub for easy deployment
+- Feature extraction modules are included for XGBoost functionality
+- Dark-theme-optimized interface with visual analytics
+- Graceful fallbacks if models fail to load
+
+## 🌐 Examples
+Try these URLs to see the system in action:
+- `https://github.com/user/repo` (should be benign)
+- `http://suspicious-phishing-site.example` (simulated phishing)
+- Or paste any suspicious email content for analysis
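The "Combined Predictions" feature is a convex combination of per-model probability pairs. A minimal sketch of the idea (the 0.3/0.7 URL/content weights mirror `combine_predictions` in `app.py` below; the input probability pairs here are made-up examples):

```python
def combine_predictions(url_probs, html_probs, url_weight=0.3, html_weight=0.7):
    """Weighted average of two [benign, phishing] probability pairs."""
    return [
        url_weight * url_probs[0] + html_weight * html_probs[0],  # benign
        url_weight * url_probs[1] + html_weight * html_probs[1],  # phishing
    ]

# Hypothetical outputs: the URL model is unsure, the content model is confident.
print(combine_predictions([0.5, 0.5], [0.1, 0.9]))  # -> [0.22, 0.78]
```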
app.py
ADDED
@@ -0,0 +1,639 @@
+import torch
+import torch.nn.functional as F
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from huggingface_hub import hf_hub_download
+import gradio as gr
+import requests
+import re
+from urllib.parse import urlparse
+from bs4 import BeautifulSoup
+import time
+import joblib
+from xgboost_wrapper import xgboost_detector
+
+# --- import your architecture ---
+# Make sure this file is in the repo (e.g., models/deberta_lstm_classifier.py)
+# and update the import path accordingly.
+from model import DeBERTaLSTMClassifier  # <-- your class
+
+# --------- Config ----------
+REPO_ID = "khoa-done/phishing-detector"   # HF repo that holds the checkpoint
+CKPT_NAME = "deberta_lstm_checkpoint.pt"  # the .pt file name
+MODEL_NAME = "microsoft/deberta-base"     # base tokenizer/backbone
+LABELS = ["benign", "phishing"]           # adjust to your classes
+
+# If your checkpoint contains hyperparams, you can fetch them like:
+# checkpoint.get("config") or checkpoint.get("model_args")
+# and pass into DeBERTaLSTMClassifier(**model_args)
+
+# --------- Load model/tokenizer once (global) ----------
+device = "cuda" if torch.cuda.is_available() else "cpu"
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+
+ckpt_path = hf_hub_download(repo_id=REPO_ID, filename=CKPT_NAME)
+checkpoint = torch.load(ckpt_path, map_location=device)
+
+# If you saved hyperparams in the checkpoint, use them:
+model_args = checkpoint.get("model_args", {})  # e.g., {"hidden_dim": 256, "num_labels": 2, ...}
+model = DeBERTaLSTMClassifier(**model_args)
+
+# Load state dict and handle missing attention layer for older models
+try:
+    model.load_state_dict(checkpoint["model_state_dict"])
+except RuntimeError as e:
+    if "attention" in str(e):
+        # Old model without attention layer - initialize attention layer and load partial state
+        state_dict = checkpoint["model_state_dict"]
+        model_dict = model.state_dict()
+        # Filter out attention layer parameters
+        filtered_dict = {k: v for k, v in state_dict.items() if "attention" not in k}
+        model_dict.update(filtered_dict)
+        model.load_state_dict(model_dict)
+        print("Loaded model without attention layer, using newly initialized attention weights")
+    else:
+        raise e
+
+model.to(device).eval()
+
+# --------- Load BERT model/tokenizer from Hugging Face Hub ----------
+BERT_MODEL_PATH = "th1enq/bert_checkpoint"  # Use Hugging Face Hub model
+bert_tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_PATH)
+bert_model = AutoModelForSequenceClassification.from_pretrained(BERT_MODEL_PATH)
+bert_model.to(device).eval()
+
+# --------- Helper functions ----------
+def is_url(text):
+    """Check if text is a URL"""
+    url_pattern = re.compile(
+        r'^https?://'  # http:// or https://
+        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|'  # domain...
+        r'localhost|'  # localhost...
+        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
+        r'(?::\d+)?'  # optional port
+        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
+    return url_pattern.match(text) is not None
+
+def fetch_html_content(url, timeout=10):
+    """Fetch HTML content from URL"""
+    try:
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+        }
+        response = requests.get(url, headers=headers, timeout=timeout, verify=False)
+        response.raise_for_status()
+        return response.text, response.status_code
+    except requests.exceptions.RequestException as e:
+        return None, f"Request error: {str(e)}"
+    except Exception as e:
+        return None, f"General error: {str(e)}"
+
+def predict_single_text(text, text_type="text"):
+    """Predict for a single text input"""
+    # Tokenize
+    inputs = tokenizer(
+        text,
+        return_tensors="pt",
+        truncation=True,
+        padding=True,
+        max_length=256
+    )
+    # DeBERTa typically doesn't use token_type_ids
+    inputs.pop("token_type_ids", None)
+    # Move to device
+    inputs = {k: v.to(device) for k, v in inputs.items()}
+
+    with torch.no_grad():
+        try:
+            # Try to get predictions with attention weights
+            result = model(**inputs, return_attention=True)
+            if isinstance(result, tuple) and len(result) == 3:
+                logits, attention_weights, deberta_attentions = result
+                has_attention = True
+            else:
+                logits = result
+                has_attention = False
+        except TypeError:
+            # Fallback for older model without return_attention parameter
+            logits = model(**inputs)
+            has_attention = False
+
+    probs = F.softmax(logits, dim=-1).squeeze(0).tolist()
+
+    # Get tokens for visualization
+    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'].squeeze(0).tolist())
+
+    return probs, tokens, has_attention, attention_weights if has_attention else None
+
+def combine_predictions(url_probs, html_probs, url_weight=0.3, html_weight=0.7):
+    """Combine URL and HTML content predictions"""
+    combined_probs = [
+        url_weight * url_probs[0] + html_weight * html_probs[0],  # benign
+        url_weight * url_probs[1] + html_weight * html_probs[1]   # phishing
+    ]
+    return combined_probs
+
+# --------- Inference function ----------
+def predict_fn(text: str):
+    if not text or not text.strip():
+        return {"error": "Please enter a URL or text."}, ""
+
+    # Check if input is URL
+    if is_url(text.strip()):
+        # Process URL
+        url = text.strip()
+
+        # Get prediction for URL itself
+        url_probs, url_tokens, url_has_attention, url_attention = predict_single_text(url, "URL")
+
+        # Try to fetch HTML content
+        html_content, status = fetch_html_content(url)
+
+        if html_content:
+            # Get prediction for HTML content
+            html_probs, html_tokens, html_has_attention, html_attention = predict_single_text(html_content, "HTML")
+
+            # Get XGBoost predictions
+            xgb_result = xgboost_detector.predict_combined(url, html_content)
+
+            # Combine predictions
+            combined_probs = combine_predictions(url_probs, html_probs)
+
+            # Use combined probabilities but show analysis for both
+            probs = combined_probs
+            tokens = url_tokens + ["[SEP]"] + html_tokens[:50]  # Limit HTML tokens for display
+            has_attention = url_has_attention or html_has_attention
+            attention_weights = url_attention if url_has_attention else html_attention
+
+            analysis_type = "Combined URL + HTML Analysis"
+            fetch_status = f"✅ Successfully fetched HTML content (Status: {status})"
+
+            # Add XGBoost analysis if available
+            if xgb_result:
+                analysis_type += " + XGBoost"
+                xgb_info = f" | XGBoost: {'Phishing' if xgb_result['is_phishing'] else 'Benign'} ({xgb_result['probability'][1]:.1%})"
+                fetch_status += xgb_info
+
+        else:
+            # Fallback to URL-only analysis
+            probs = url_probs
+            tokens = url_tokens
+            has_attention = url_has_attention
+            attention_weights = url_attention
+
+            # Get XGBoost URL prediction
+            xgb_result = xgboost_detector.predict_url(url)
+
+            analysis_type = "URL-only Analysis"
+            fetch_status = f"⚠️ Could not fetch HTML content: {status}"
+
+            # Add XGBoost analysis if available
+            if xgb_result:
+                analysis_type += " + XGBoost"
+                xgb_info = f" | XGBoost: {'Phishing' if xgb_result['is_phishing'] else 'Benign'} ({xgb_result['probability'][1]:.1%})"
+                fetch_status += xgb_info
+    else:
+        # Process as regular text
+        probs, tokens, has_attention, attention_weights = predict_single_text(text, "text")
+        analysis_type = "Text Analysis"
+        fetch_status = ""
+
+    # Create detailed analysis
+    predicted_class = "phishing" if probs[1] > probs[0] else "benign"
+    confidence = max(probs)
+
+    detailed_analysis = f"""
+<div style="font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; background: #1e1e1e; padding: 20px; border-radius: 15px;">
+    <div style="background: linear-gradient(135deg, {'#8b0000' if predicted_class == 'phishing' else '#006400'} 0%, {'#dc143c' if predicted_class == 'phishing' else '#228b22'} 100%); padding: 25px; border-radius: 20px; color: white; text-align: center; margin-bottom: 20px; box-shadow: 0 8px 32px rgba(0,0,0,0.5); border: 2px solid {'#ff4444' if predicted_class == 'phishing' else '#44ff44'};">
+        <h2 style="margin: 0 0 10px 0; font-size: 28px; color: white;">🔍 {analysis_type}</h2>
+        <div style="font-size: 36px; font-weight: bold; margin: 10px 0; color: white;">
+            {predicted_class.upper()}
+        </div>
+        <div style="font-size: 18px; color: #f0f0f0;">
+            Confidence: {confidence:.1%}
+        </div>
+        <div style="margin-top: 15px; font-size: 14px; color: #e0e0e0;">
+            {'This appears to be a phishing attempt!' if predicted_class == 'phishing' else '✅ This appears to be legitimate content.'}
+        </div>
+    </div>
+    """
+
+    if fetch_status:
+        detailed_analysis += f"""
+    <div style="background: #2d2d2d; padding: 15px; border-radius: 10px; margin: 15px 0; border-left: 4px solid #4caf50; color: #e0e0e0;">
+        <strong>Fetch Status:</strong> {fetch_status}
+    </div>
+    """
+
+    if has_attention and attention_weights is not None:
+        attention_scores = attention_weights.squeeze(0).tolist()
+
+        token_analysis = []
+        for i, (token, score) in enumerate(zip(tokens, attention_scores)):
+            # More lenient filtering - include more tokens for text analysis
+            if token not in ['[CLS]', '[SEP]', '[PAD]', '<s>', '</s>'] and len(token.strip()) > 0 and score > 0.005:
+                clean_token = token.replace('▁', '').replace('Ġ', '').strip()  # Handle different tokenizer prefixes
+                if clean_token:  # Only add if token has content after cleaning
+                    token_analysis.append({
+                        'token': clean_token,
+                        'importance': score,
+                        'position': i
+                    })
+
+        # Sort by importance
+        token_analysis.sort(key=lambda x: x['importance'], reverse=True)
+
+        detailed_analysis += f"""
+## Top important tokens:
+    <div style="background: #2d2d2d; padding: 15px; border-radius: 10px; margin: 15px 0; border-left: 4px solid #4caf50; color: #e0e0e0;">
+        <strong>Analysis Info:</strong> Found {len(token_analysis)} important tokens out of {len(tokens)} total tokens
+    </div>
+    <div style="font-family: Arial, sans-serif;">
+    """
+
+        for i, token_info in enumerate(token_analysis[:10]):  # Top 10 tokens
+            bar_width = int(token_info['importance'] * 100)
+            color = "#ff4444" if predicted_class == "phishing" else "#44ff44"
+
+            detailed_analysis += f"""
+    <div style="margin: 8px 0; display: flex; align-items: center; background: #2d2d2d; padding: 8px; border-radius: 8px; border-left: 4px solid {color};">
+        <div style="width: 30px; text-align: right; margin-right: 10px; font-weight: bold; color: #ffffff;">
+            {i+1}.
+        </div>
+        <div style="width: 120px; margin-right: 10px; font-weight: bold; color: #e0e0e0; text-align: right;">
+            {token_info['token']}
+        </div>
+        <div style="width: 300px; background-color: #404040; border-radius: 10px; overflow: hidden; margin-right: 10px; border: 1px solid #555;">
+            <div style="width: {bar_width}%; background-color: {color}; height: 20px; border-radius: 10px; transition: width 0.3s ease;"></div>
+        </div>
+        <div style="color: #cccccc; font-size: 12px; font-weight: bold;">
+            {token_info['importance']:.1%}
+        </div>
+    </div>
+    """
+
+        detailed_analysis += "</div>\n"
+
+        detailed_analysis += f"""
+## Detailed analysis:
+    <div style="font-family: Arial, sans-serif; background: linear-gradient(135deg, #1a237e 0%, #3949ab 100%); padding: 20px; border-radius: 15px; color: white; margin: 15px 0; border: 2px solid #3f51b5;">
+        <h3 style="margin: 0 0 15px 0; color: white;">Statistical Overview</h3>
+        <div style="display: grid; grid-template-columns: repeat(2, 1fr); gap: 15px;">
+            <div style="background: rgba(255,255,255,0.1); padding: 15px; border-radius: 10px; border: 1px solid rgba(255,255,255,0.2);">
+                <div style="font-size: 24px; font-weight: bold; color: white;">{len([t for t in tokens if t not in ['[CLS]', '[SEP]', '[PAD]']])}</div>
+                <div style="font-size: 14px; color: #e0e0e0;">Total tokens</div>
+            </div>
+            <div style="background: rgba(255,255,255,0.1); padding: 15px; border-radius: 10px; border: 1px solid rgba(255,255,255,0.2);">
+                <div style="font-size: 24px; font-weight: bold; color: white;">{len([t for t in token_analysis if t['importance'] > 0.05])}</div>
+                <div style="font-size: 14px; color: #e0e0e0;">High impact tokens (>5%)</div>
+            </div>
+        </div>
+    </div>
+    <div style="font-family: Arial, sans-serif; margin: 15px 0; background: #2d2d2d; padding: 20px; border-radius: 15px; border: 1px solid #555;">
+        <h3 style="color: #ffffff; margin-bottom: 15px;">🎯 Prediction Confidence</h3>
+        <div style="display: flex; justify-content: space-between; margin-bottom: 10px;">
+            <span style="font-weight: bold; color: #ff4444;">Phishing</span>
+            <span style="font-weight: bold; color: #44ff44;">Benign</span>
+        </div>
+        <div style="width: 100%; background-color: #404040; border-radius: 25px; overflow: hidden; height: 30px; border: 1px solid #666;">
+            <div style="width: {probs[1]*100:.1f}%; background: linear-gradient(90deg, #ff4444 0%, #ff6666 100%); height: 100%; display: flex; align-items: center; justify-content: center; color: white; font-weight: bold; font-size: 14px;">
+                {probs[1]:.1%}
+            </div>
+        </div>
+        <div style="margin-top: 10px; text-align: center; color: #cccccc; font-size: 14px;">
+            Benign: {probs[0]:.1%}
+        </div>
+    </div>
+    """
+    else:
+        # Fallback analysis without attention weights
+        detailed_analysis += f"""
+    <div style="background: linear-gradient(135deg, #1a237e 0%, #3949ab 100%); padding: 20px; border-radius: 15px; color: white; margin: 15px 0; border: 2px solid #3f51b5;">
+        <h3 style="margin: 0 0 15px 0; color: white;">Basic Analysis</h3>
+        <div style="display: grid; grid-template-columns: repeat(3, 1fr); gap: 15px;">
+            <div style="background: rgba(255,255,255,0.1); padding: 15px; border-radius: 10px; text-align: center; border: 1px solid rgba(255,255,255,0.2);">
+                <div style="font-size: 24px; font-weight: bold; color: white;">{probs[1]:.1%}</div>
+                <div style="font-size: 14px; color: #e0e0e0;">Phishing</div>
+            </div>
+            <div style="background: rgba(255,255,255,0.1); padding: 15px; border-radius: 10px; text-align: center; border: 1px solid rgba(255,255,255,0.2);">
+                <div style="font-size: 24px; font-weight: bold; color: white;">{probs[0]:.1%}</div>
+                <div style="font-size: 14px; color: #e0e0e0;">Benign</div>
+            </div>
+            <div style="background: rgba(255,255,255,0.1); padding: 15px; border-radius: 10px; text-align: center; border: 1px solid rgba(255,255,255,0.2);">
+                <div style="font-size: 24px; font-weight: bold; color: white;">{len([t for t in tokens if t not in ['[CLS]', '[SEP]', '[PAD]']])}</div>
+                <div style="font-size: 14px; color: #e0e0e0;">Tokens</div>
+            </div>
+        </div>
+    </div>
+    <div style="background: #2d2d2d; padding: 20px; border-radius: 15px; margin: 15px 0; border: 1px solid #555;">
+        <h3 style="color: #ffffff; margin: 0 0 15px 0;">🔤 Tokens in text:</h3>
+        <div style="display: flex; flex-wrap: wrap; gap: 8px;">""" + ''.join([f'<span style="background: #404040; color: #64b5f6; padding: 4px 8px; border-radius: 15px; font-size: 12px; border: 1px solid #666;">{token.replace("▁", "")}</span>' for token in tokens if token not in ['[CLS]', '[SEP]', '[PAD]']]) + f"""</div>
+        <div style="margin-top: 15px; padding: 10px; background: #3d2914; border-radius: 8px; border-left: 4px solid #ff9800;">
+            <strong style="color: #ffcc02;">Debug info:</strong> <span style="color: #e0e0e0;">Found {len(tokens)} total tokens, {len([t for t in tokens if t not in ['[CLS]', '[SEP]', '[PAD]']])} content tokens</span>
+        </div>
+    </div>
+    <div style="background: #3d2914; padding: 15px; border-radius: 10px; border-left: 4px solid #ff9800; margin: 15px 0;">
+        <p style="margin: 0; color: #ffcc02; font-size: 14px;">
+            <strong>Note:</strong> Detailed attention weights analysis is not available for the current model.
+        </p>
+    </div>
+    """
+
+    # Build label->prob mapping for Gradio Label output
+    if len(LABELS) == len(probs):
+        prediction_result = {LABELS[i]: float(probs[i]) for i in range(len(LABELS))}
+    else:
+        prediction_result = {f"class_{i}": float(p) for i, p in enumerate(probs)}
+
+    return prediction_result, detailed_analysis
+
+# --------- BERT Model Functions ----------
+def predict_bert_single_text(text):
+    """Predict for a single text input using BERT."""
+    # Tokenize
+    inputs = bert_tokenizer(
+        text,
+        return_tensors="pt",
+        truncation=True,
+        padding=True,
+        max_length=512
+    )
+    # Move to device
+    inputs = {k: v.to(device) for k, v in inputs.items()}
+
+    with torch.no_grad():
+        logits = bert_model(**inputs).logits
+
+    probs = F.softmax(logits, dim=-1).squeeze(0).tolist()
+
+    return probs
+
+def predict_bert_interface_fn(text: str):
+    """Gradio interface function for BERT model."""
+    if not text or not text.strip():
+        return {"error": "Please enter a URL or text."}, ""
+
+    probs = predict_bert_single_text(text)
+
+    # Create detailed analysis
+    predicted_class = "phishing" if probs[1] > probs[0] else "benign"
+    confidence = max(probs)
+
+    detailed_analysis = f"""
+<div style="font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; background: #1e1e1e; padding: 20px; border-radius: 15px;">
+    <div style="background: linear-gradient(135deg, {'#8b0000' if predicted_class == 'phishing' else '#006400'} 0%, {'#dc143c' if predicted_class == 'phishing' else '#228b22'} 100%); padding: 25px; border-radius: 20px; color: white; text-align: center; margin-bottom: 20px; box-shadow: 0 8px 32px rgba(0,0,0,0.5); border: 2px solid {'#ff4444' if predicted_class == 'phishing' else '#44ff44'};">
+        <h2 style="margin: 0 0 10px 0; font-size: 28px; color: white;">🔍 BERT Model Analysis</h2>
+        <div style="font-size: 36px; font-weight: bold; margin: 10px 0; color: white;">
+            {predicted_class.upper()}
+        </div>
+        <div style="font-size: 18px; color: #f0f0f0;">
+            Confidence: {confidence:.1%}
+        </div>
+    </div>
+    <div style="background: #2d2d2d; padding: 20px; border-radius: 15px; margin: 15px 0; border: 1px solid #555;">
+        <h3 style="color: #ffffff; margin-bottom: 15px;">🎯 Prediction Confidence</h3>
+        <div style="display: flex; justify-content: space-between; margin-bottom: 10px;">
+            <span style="font-weight: bold; color: #ff4444;">Phishing</span>
+            <span style="font-weight: bold; color: #44ff44;">Benign</span>
+        </div>
+        <div style="width: 100%; background-color: #404040; border-radius: 25px; overflow: hidden; height: 30px; border: 1px solid #666;">
+            <div style="width: {probs[1]*100:.1f}%; background: linear-gradient(90deg, #ff4444 0%, #ff6666 100%); height: 100%; display: flex; align-items: center; justify-content: center; color: white; font-weight: bold; font-size: 14px;">
+                {probs[1]:.1%}
+            </div>
+        </div>
+        <div style="margin-top: 10px; text-align: center; color: #cccccc; font-size: 14px;">
+            Benign: {probs[0]:.1%}
+        </div>
+    </div>
+</div>
+"""
+    # Build label->prob mapping for Gradio Label output
+    if len(LABELS) == len(probs):
+        prediction_result = {LABELS[i]: float(probs[i]) for i in range(len(LABELS))}
+    else:
+        prediction_result = {f"class_{i}": float(p) for i, p in enumerate(probs)}
+
+    return prediction_result, detailed_analysis
+
+# --------- XGBoost Interface Function ----------
+def predict_xgboost_interface_fn(text: str):
+    """Gradio interface function for XGBoost models."""
+    if not text or not text.strip():
+        return {"error": "Please enter a URL or text."}, ""
+
+    if not xgboost_detector.available:
+        return {"error": "XGBoost models not available"}, "XGBoost models are not properly loaded."
+
+    # Check if input is URL
+    if is_url(text.strip()):
+        url = text.strip()
+
+        # Try to fetch HTML content
+        html_content, status = fetch_html_content(url)
+
+        if html_content:
+            result = xgboost_detector.predict_combined(url, html_content)
+            analysis_type = "Combined URL + HTML XGBoost Analysis"
+            fetch_status = f"✅ Successfully fetched HTML content (Status: {status})"
+        else:
+            result = xgboost_detector.predict_url(url)
+            analysis_type = "URL-only XGBoost Analysis"
+            fetch_status = f"⚠️ Could not fetch HTML content: {status}"
+    else:
+        # For text input, treat as HTML content
+        result = xgboost_detector.predict_html(text)
+        analysis_type = "HTML Content XGBoost Analysis"
+        fetch_status = ""
+
+    if not result:
+        return {"error": "XGBoost prediction failed"}, "Failed to get prediction from XGBoost models."
+
+    predicted_class = "phishing" if result['is_phishing'] else "benign"
+    confidence = max(result['probability'])
+
+    detailed_analysis = f"""
+<div style="font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; background: #1e1e1e; padding: 20px; border-radius: 15px;">
+    <div style="background: linear-gradient(135deg, {'#8b0000' if predicted_class == 'phishing' else '#006400'} 0%, {'#dc143c' if predicted_class == 'phishing' else '#228b22'} 100%); padding: 25px; border-radius: 20px; color: white; text-align: center; margin-bottom: 20px; box-shadow: 0 8px 32px rgba(0,0,0,0.5); border: 2px solid {'#ff4444' if predicted_class == 'phishing' else '#44ff44'};">
+        <h2 style="margin: 0 0 10px 0; font-size: 28px; color: white;">🔍 {analysis_type}</h2>
+        <div style="font-size: 36px; font-weight: bold; margin: 10px 0; color: white;">
+            {predicted_class.upper()}
+        </div>
+        <div style="font-size: 18px; color: #f0f0f0;">
+            Confidence: {confidence:.1%}
+        </div>
+    </div>
+    """
+
+    if fetch_status:
+        detailed_analysis += f"""
+    <div style="background: #2d2d2d; padding: 15px; border-radius: 10px; margin: 15px 0; border-left: 4px solid #4caf50; color: #e0e0e0;">
+        <strong>Fetch Status:</strong> {fetch_status}
+    </div>
+    """
+
+    # Show detailed XGBoost results
+    detailed_analysis += f"""
+    <div style="background: #2d2d2d; padding: 20px; border-radius: 15px; margin: 15px 0; border: 1px solid #555;">
+        <h3 style="color: #ffffff; margin-bottom: 15px;">🎯 XGBoost Prediction Confidence</h3>
+        <div style="display: flex; justify-content: space-between; margin-bottom: 10px;">
+            <span style="font-weight: bold; color: #ff4444;">Phishing</span>
+            <span style="font-weight: bold; color: #44ff44;">Benign</span>
+        </div>
+        <div style="width: 100%; background-color: #404040; border-radius: 25px; overflow: hidden; height: 30px; border: 1px solid #666;">
+            <div style="width: {result['probability'][1]*100:.1f}%; background: linear-gradient(90deg, #ff4444 0%, #ff6666 100%); height: 100%; display: flex; align-items: center; justify-content: center; color: white; font-weight: bold; font-size: 14px;">
+                {result['probability'][1]:.1%}
+            </div>
+        </div>
+        <div style="margin-top: 10px; text-align: center; color: #cccccc; font-size: 14px;">
+            Benign: {result['probability'][0]:.1%}
+        </div>
+    </div>
+    """
+
+    # Show component analysis if available
+    if 'url_result' in result and 'html_result' in result:
+        detailed_analysis += f"""
+    <div style="background: linear-gradient(135deg, #1a237e 0%, #3949ab 100%); padding: 20px; border-radius: 15px; color: white; margin: 15px 0; border: 2px solid #3f51b5;">
+        <h3 style="margin: 0 0 15px 0; color: white;">🔬 Component Analysis</h3>
+        <div style="display: grid; grid-template-columns: repeat(2, 1fr); gap: 15px;">
+            <div style="background: rgba(255,255,255,0.1); padding: 15px; border-radius: 10px; border: 1px solid rgba(255,255,255,0.2);">
+                <div style="font-size: 18px; font-weight: bold; color: white;">URL Analysis</div>
+                <div style="font-size: 24px; font-weight: bold; color: {'#ff6666' if result['url_result']['is_phishing'] else '#66ff66'};">
+                    {'Phishing' if result['url_result']['is_phishing'] else 'Benign'}
+                </div>
+                <div style="font-size: 14px; color: #e0e0e0;">{result['url_result']['probability'][1]:.1%} phishing</div>
+            </div>
+            <div style="background: rgba(255,255,255,0.1); padding: 15px; border-radius: 10px; border: 1px solid rgba(255,255,255,0.2);">
+                <div style="font-size: 18px; font-weight: bold; color: white;">HTML Analysis</div>
+                <div style="font-size: 24px; font-weight: bold; color: {'#ff6666' if result['html_result']['is_phishing'] else '#66ff66'};">
+                    {'Phishing' if result['html_result']['is_phishing'] else 'Benign'}
+                </div>
+                <div style="font-size: 14px; color: #e0e0e0;">{result['html_result']['probability'][1]:.1%} phishing</div>
+            </div>
+        </div>
+    </div>
+    """
+
+    detailed_analysis += "</div>"
+
+    # Build label->prob mapping for Gradio Label output
+    if len(LABELS) == len(result['probability']):
+        prediction_result = {LABELS[i]: float(result['probability'][i]) for i in range(len(LABELS))}
+    else:
+        prediction_result = {f"class_{i}": float(p) for i, p in enumerate(result['probability'])}
+
+    return prediction_result, detailed_analysis
+
+# --------- Gradio UI ----------
+deberta_interface = gr.Interface(
+    fn=predict_fn,
+    inputs=gr.Textbox(label="URL or text", placeholder="Example: http://suspicious-site.example or paste any text"),
+    outputs=[
+        gr.Label(label="Prediction result"),
+        gr.Markdown(label="Detailed token analysis")
+    ],
+    title="Phishing Detector (DeBERTa + LSTM)",
+    description="""
+    Enter a URL or text for analysis.
+    **Features:**
+    - **URL Analysis**: For URLs, the system will fetch the HTML content and combine both URL and content analysis
+    - **Combined Prediction**: Uses a weighted combination of URL structure and webpage content analysis
+    - **Visual Analysis**: Predicts phishing/benign probability with visual charts
+    - **Token Importance**: Displays the most important tokens in classification
+    - **Detailed Insights**: Comprehensive analysis of the impact of each token
+    - **Dark Theme**: Interface with colorful charts optimized for dark themes
+
+    **How it works for URLs:**
+    1. Analyze the URL structure itself
+    2. Fetch the webpage HTML content
+    3. Analyze the webpage content
+    4. Combine both results for the final prediction (30% URL + 70% content)
+    """,
+    examples=[
+        ["http://rendmoiunserviceeee.com"],
+        ["https://www.google.com"],
+        ["Dear customer, your account has been suspended. Click here to verify your identity immediately."],
+        ["https://mail-secure-login-verify.example/path?token=suspicious"],
+        ["http://paypaI-security-update.net/login"],
+        ["Your package has been delivered successfully. Thank you for using our service."],
+        ["https://github.com/user/repo"]
+    ],
+    theme=gr.themes.Soft(),
+    css="""
+    .gradio-container {
+        font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
+        background-color: #1e1e1e !important;
+        color: #ffffff !important;
+    }
+    .dark .gradio-container {
+        background-color: #1e1e1e !important;
+    }
+    /* Dark theme for all components */
+    .block {
+        background-color: #2d2d2d !important;
+        border: 1px solid #444 !important;
+    }
+    .gradio-textbox {
+        background-color: #3d3d3d !important;
+        color: #ffffff !important;
+        border: 1px solid #666 !important;
+    }
+    .gradio-button {
+        background-color: #4a4a4a !important;
+        color: #ffffff !important;
+        border: 1px solid #666 !important;
+    }
+    .gradio-button:hover {
+        background-color: #5a5a5a !important;
+    }
+    """
+)
+
+bert_interface = gr.Interface(
+    fn=predict_bert_interface_fn,
+    inputs=gr.Textbox(label="URL or text", placeholder="Example: http://suspicious-site.example or paste any text"),
+    outputs=[
+        gr.Label(label="Prediction result"),
+        gr.Markdown(label="Detailed analysis")
+    ],
+    title="Phishing Detector (BERT)",
+    description="Enter a URL or text for analysis using the BERT model.",
+    examples=[
+        ["http://rendmoiunserviceeee.com"],
+        ["https://www.google.com"],
+        ["Dear customer, your account has been suspended. Click here to verify your identity immediately."],
+    ]
+)
+
+xgboost_interface = gr.Interface(
+    fn=predict_xgboost_interface_fn,
+    inputs=gr.Textbox(label="URL or HTML content", placeholder="Example: http://suspicious-site.example or paste HTML content"),
+    outputs=[
+        gr.Label(label="Prediction result"),
+        gr.Markdown(label="Detailed analysis")
+    ],
+    title="Phishing Detector (XGBoost)",
+    description="""
+    Enter a URL or HTML content for analysis using XGBoost models.
+    **Features:**
+    - **URL Feature Analysis**: Extracts 30+ features from URL structure
+    - **HTML Feature Analysis**: Extracts 43+ features from HTML content
+    - **Combined Analysis**: For URLs, combines both URL and HTML features
+    - **Fast Prediction**: Traditional ML approach for quick results
+    """,
+    examples=[
+        ["http://rendmoiunserviceeee.com"],
+        ["https://www.google.com"],
+        ["http://paypaI-security-update.net/login"],
+        ["<html><head><title>Urgent Security Alert</title></head><body><form><input type='password'></form></body></html>"],
+    ]
+)
+
+demo = gr.TabbedInterface(
+    [deberta_interface, bert_interface, xgboost_interface],
+    ["DeBERTa + LSTM", "BERT", "XGBoost"]
+)
+
+if __name__ == "__main__":
+    demo.launch()
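A quick way to smoke-test the main interface function outside the UI. Note that importing `app` runs the module-level loading code, so this needs network access to download the checkpoints from the Hub and a working `xgboost_wrapper`; the printed probabilities are illustrative, not real outputs:

```python
# Minimal sketch: call the DeBERTa+LSTM interface function directly.
from app import predict_fn

label_probs, analysis_html = predict_fn("https://github.com/user/repo")
print(label_probs)         # e.g. {"benign": 0.97, "phishing": 0.03} (illustrative)
print(len(analysis_html))  # size of the rendered HTML report
```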
model.py
ADDED
@@ -0,0 +1,49 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import AutoModel
+
+class DeBERTaLSTMClassifier(nn.Module):
+    def __init__(self, hidden_dim=128, num_labels=2):
+        super().__init__()
+
+        self.deberta = AutoModel.from_pretrained("microsoft/deberta-base")
+        for param in self.deberta.parameters():
+            param.requires_grad = False  # freeze DeBERTa (we don't have enough resources to fine-tune the backbone in this model)
+
+        self.lstm = nn.LSTM(
+            input_size=self.deberta.config.hidden_size,
+            hidden_size=hidden_dim,
+            batch_first=True,
+            bidirectional=True
+        )
+
+        self.fc = nn.Linear(hidden_dim * 2, num_labels)
+
+        # Attention layer to compute token importance
+        self.attention = nn.Linear(hidden_dim * 2, 1)
+
+    def forward(self, input_ids, attention_mask, return_attention=False):
+        with torch.no_grad():
+            outputs = self.deberta(input_ids=input_ids, attention_mask=attention_mask, output_attentions=True)
+
+        lstm_out, _ = self.lstm(outputs.last_hidden_state)  # shape: [batch, seq_len, hidden*2]
+
+        if return_attention:
+            # Compute attention weights for each token
+            attention_weights = self.attention(lstm_out)  # [batch, seq_len, 1]
+            attention_weights = F.softmax(attention_weights.squeeze(-1), dim=-1)  # [batch, seq_len]
+
+            # Apply attention mask
+            attention_weights = attention_weights * attention_mask.float()
+            attention_weights = attention_weights / (attention_weights.sum(dim=-1, keepdim=True) + 1e-8)
+
+            # Weighted sum of LSTM outputs
+            attended_output = torch.sum(lstm_out * attention_weights.unsqueeze(-1), dim=1)
+            logits = self.fc(attended_output)
+
+            return logits, attention_weights, outputs.attentions
+        else:
+            final_hidden = lstm_out[:, -1, :]  # last token output (note: with right-padding this may be a pad position)
+            logits = self.fc(final_hidden)
+            return logits
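A minimal sketch of driving the classifier directly. A freshly constructed model has randomly initialized LSTM/attention/fc weights; real use loads the checkpoint as `app.py` does above:

```python
import torch
from transformers import AutoTokenizer
from model import DeBERTaLSTMClassifier

tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-base")
model = DeBERTaLSTMClassifier(hidden_dim=128, num_labels=2).eval()

enc = tokenizer("http://suspicious-site.example/login", return_tensors="pt")
with torch.no_grad():
    # Pass explicit kwargs: forward() does not accept token_type_ids.
    logits, attn_weights, backbone_attns = model(
        input_ids=enc["input_ids"],
        attention_mask=enc["attention_mask"],
        return_attention=True,
    )
print(logits.shape)        # torch.Size([1, 2])
print(attn_weights.shape)  # torch.Size([1, seq_len])
```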
requirements.txt
ADDED
@@ -0,0 +1,12 @@
+torch>=2.0.0
+transformers==4.41.2
+huggingface_hub==0.24.5
+safetensors
+gradio==4.39.0
+requests>=2.25.0
+beautifulsoup4>=4.9.0
+numpy>=1.21.0
+scikit-learn>=1.3.0
+joblib>=1.2.0
+xgboost>=1.7.0
+urllib3>=1.26.0
xgboost/URLFeatureExtraction.py
ADDED
@@ -0,0 +1,382 @@
+# -*- coding: utf-8 -*-
+
+# importing required packages for this section
+from urllib.parse import urlparse, urlencode
+import ipaddress
+import re
+
+"""#### **3.1.1. Domain of the URL**
+Here, we are just extracting the domain present in the URL. This feature doesn't have much significance in the training and may even be dropped while training the model.
+"""
+'''
+# 1.Domain of the URL (Domain)
+def getDomain(url):
+    domain = urlparse(url).netloc
+    if re.match(r"^www\.", domain):
+        domain = domain.replace("www.", "")
+    return domain'''
+
+"""#### **3.1.2. IP Address in the URL**
+
+Checks for the presence of an IP address in the URL. URLs may have an IP address instead of a domain name. If an IP address is used as an alternative to the domain name in the URL, we can be sure that someone is trying to steal personal information with this URL.
+
+If the domain part of the URL has an IP address, the value assigned to this feature is 1 (phishing) or else 0 (legitimate).
+"""
+
+# 2.Checks for IP address in URL (Have_IP)
+def havingIP(url):
+    try:
+        ipaddress.ip_address(url)
+        ip = 1
+    except ValueError:
+        ip = 0
+    return ip
+
+"""#### **3.1.3. "@" Symbol in URL**
+
+Checks for the presence of the '@' symbol in the URL. Using the '@' symbol in a URL leads the browser to ignore everything preceding it, and the real address often follows the '@' symbol.
+
+If the URL has an '@' symbol, the value assigned to this feature is 1 (phishing) or else 0 (legitimate).
+"""
+
+# 3.Checks the presence of @ in URL (Have_At)
+def haveAtSign(url):
+    if "@" in url:
+        at = 1
+    else:
+        at = 0
+    return at
+
+"""#### **3.1.4. Length of URL**
+
+Computes the length of the URL. Phishers can use a long URL to hide the doubtful part in the address bar. In this project, if the length of the URL is greater than or equal to 54 characters, the URL is classified as phishing; otherwise, legitimate.
+
+If the length of the URL >= 54, the value assigned to this feature is 1 (phishing) or else 0 (legitimate).
+"""
+
+# 4.Finding the length of URL and categorizing (URL_Length)
+def getLength(url):
+    if len(url) < 54:
+        length = 0
+    else:
+        length = 1
+    return length
+
+"""#### **3.1.5. Depth of URL**
+
+Computes the depth of the URL. This feature calculates the number of sub-pages in the given URL based on '/'.
+
+The value of this feature is a number that depends on the URL.
+"""
+
+# 5.Gives number of '/' in URL (URL_Depth)
+def getDepth(url):
+    s = urlparse(url).path.split('/')
+    depth = 0
+    for j in range(len(s)):
+        if len(s[j]) != 0:
+            depth = depth + 1
+    return depth
+
+"""#### **3.1.6. Redirection "//" in URL**
+
+Checks the presence of "//" in the URL. The existence of "//" within the URL path means that the user will be redirected to another website. The location of the "//" in the URL is computed: if the URL starts with "http", the "//" should appear in the sixth position; if the URL uses "https", the "//" should appear in the seventh position.
+
+If "//" appears anywhere in the URL apart from after the protocol, the value assigned to this feature is 1 (phishing) or else 0 (legitimate).
+"""
+
+# 6.Checking for redirection '//' in the url (Redirection)
+def redirection(url):
+    pos = url.rfind('//')
+    if pos > 6:
+        if pos > 7:
+            return 1
+        else:
+            return 0
+    else:
+        return 0
+
+"""#### **3.1.7. "http/https" in Domain name**
+
+Checks for the presence of "http/https" in the domain part of the URL. Phishers may add the "https" token to the domain part of a URL in order to trick users.
+
+If the URL has "http/https" in the domain part, the value assigned to this feature is 1 (phishing) or else 0 (legitimate).
+"""
+
+# 7.Existence of "HTTPS" Token in the Domain Part of the URL (https_Domain)
+def httpDomain(url):
+    domain = urlparse(url).netloc
+    if 'https' in domain:
+        return 1
+    else:
+        return 0
+
+"""#### **3.1.8. Using URL Shortening Services "TinyURL"**
+
+URL shortening is a method on the World Wide Web in which a URL may be made considerably shorter and still lead to the required webpage. This is accomplished by means of an HTTP redirect on a short domain name that links to the webpage with the long URL.
+
+If the URL uses a shortening service, the value assigned to this feature is 1 (phishing) or else 0 (legitimate).
+"""
+
+# listing shortening services
+shortening_services = r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|" \
+                      r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|" \
+                      r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|" \
+                      r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|" \
+                      r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|" \
+                      r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|" \
+                      r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|" \
+                      r"tr\.im|link\.zip\.net"
+
+# 8. Checking for Shortening Services in URL (Tiny_URL)
+def tinyURL(url):
+    match = re.search(shortening_services, url)
+    if match:
+        return 1
+    else:
+        return 0
+
+"""#### **3.1.9. Prefix or Suffix "-" in Domain**
+
+Checks for the presence of '-' in the domain part of the URL. The dash symbol is rarely used in legitimate URLs. Phishers tend to add prefixes or suffixes separated by (-) to the domain name so that users feel they are dealing with a legitimate webpage.
+
+If the URL has a '-' symbol in the domain part, the value assigned to this feature is 1 (phishing) or else 0 (legitimate).
+"""
+
+# 9.Checking for Prefix or Suffix Separated by (-) in the Domain (Prefix/Suffix)
+def prefixSuffix(url):
+    if '-' in urlparse(url).netloc:
+        return 1  # phishing
+    else:
+        return 0  # legitimate
+
+"""### **3.2. Domain Based Features:**
+
+Many features can be extracted that come under this category. Of them, the ones below were considered for this project.
+
+* DNS Record
+* Website Traffic
+* Age of Domain
+* End Period of Domain
+
+Each of these features is explained and coded below:
+"""
+
+#!pip install python-whois
+
+# importing required packages for this section
+import re
+from bs4 import BeautifulSoup
+#import whois
+import urllib
+import urllib.request
+from datetime import datetime
+
+"""#### **3.2.1. DNS Record**
+
+For phishing websites, either the claimed identity is not recognized by the WHOIS database or no records are found for the hostname.
+If the DNS record is empty or not found, the value assigned to this feature is 1 (phishing) or else 0 (legitimate).
+"""
+
+# 11.DNS Record availability (DNS_Record)
+# obtained in the featureExtraction function itself
+
+"""#### **3.2.2. Web Traffic**
+
+This feature measures the popularity of the website by determining the number of visitors and the number of pages they visit. However, since phishing websites live for a short period of time, they may not be recognized by the Alexa database (Alexa the Web Information Company, 1996). By reviewing our dataset, we find that in the worst scenarios, legitimate websites ranked among the top 100,000. Furthermore, if the domain has no traffic or is not recognized by the Alexa database, it is classified as "phishing".
+
+If the rank of the domain < 100000, the value of this feature is 1 (phishing) else 0 (legitimate).
+"""
+
+# 12.Web traffic (Web_Traffic)
+# Note: the Alexa data API has since been retired, so this lookup will
+# typically fail and fall back to returning 1.
+def web_traffic(url):
+    try:
+        # Filling the whitespaces in the URL if any
+        url = urllib.parse.quote(url)
+        rank = BeautifulSoup(urllib.request.urlopen("http://data.alexa.com/data?cli=10&dat=s&url=" + url).read(), "xml").find(
+            "REACH")['RANK']
+        rank = int(rank)
+    except TypeError:
+        return 1
+    if rank < 100000:
+        return 1
+    else:
+        return 0
+
+"""#### **3.2.3. Age of Domain**
+
+This feature can be extracted from the WHOIS database. Most phishing websites live for a short period of time. The minimum age of a legitimate domain is considered to be 12 months for this project, though the code below uses a 6-month threshold. Age here is simply the difference between creation and expiration time.
+
+If the age of the domain is less than 6 months, the value of this feature is 1 (phishing) else 0 (legitimate).
+"""
+
+# 13.Survival time of domain: The difference between termination time and creation time (Domain_Age)
+def domainAge(domain_name):
+    creation_date = domain_name.creation_date
+    expiration_date = domain_name.expiration_date
+    if (isinstance(creation_date, str) or isinstance(expiration_date, str)):
+        try:
+            creation_date = datetime.strptime(creation_date, '%Y-%m-%d')
+            expiration_date = datetime.strptime(expiration_date, "%Y-%m-%d")
+        except:
+            return 1
+    if ((expiration_date is None) or (creation_date is None)):
+        return 1
+    elif ((type(expiration_date) is list) or (type(creation_date) is list)):
+        return 1
+    else:
+        ageofdomain = abs((expiration_date - creation_date).days)
+        if ((ageofdomain/30) < 6):
+            age = 1
+        else:
+            age = 0
+    return age
+
+"""#### **3.2.4. End Period of Domain**
+
+This feature can be extracted from the WHOIS database. For this feature, the remaining domain time is calculated as the difference between the expiration time and the current time. The end period considered for a legitimate domain is 6 months or less for this project.
+
+If the end period of the domain > 6 months, the value of this feature is 1 (phishing) else 0 (legitimate).
+"""
+
+# 14.End time of domain: The difference between termination time and current time (Domain_End)
+def domainEnd(domain_name):
+    expiration_date = domain_name.expiration_date
+    if isinstance(expiration_date, str):
+        try:
+            expiration_date = datetime.strptime(expiration_date, "%Y-%m-%d")
+        except:
+            return 1
+    if (expiration_date is None):
+        return 1
+    elif (type(expiration_date) is list):
+        return 1
+    else:
+        today = datetime.now()
+        end = abs((expiration_date - today).days)
+        if ((end/30) < 6):
+            end = 0
+        else:
+            end = 1
+    return end
+
263 |
+
"""## **3.3. HTML and JavaScript based Features**
|
264 |
+
|
265 |
+
Many features can be extracted that come under this category. Out of them, below mentioned were considered for this project.
|
266 |
+
|
267 |
+
* IFrame Redirection
|
268 |
+
* Status Bar Customization
|
269 |
+
* Disabling Right Click
|
270 |
+
* Website Forwarding
|
271 |
+
|
272 |
+
Each of these features are explained and the coded below:
|
273 |
+
"""
|
274 |
+
|
275 |
+
# importing required packages for this section
|
276 |
+
import requests
|
277 |
+
|
278 |
+
"""### **3.3.1. IFrame Redirection**
|
279 |
+
|
280 |
+
IFrame is an HTML tag used to display an additional webpage into one that is currently shown. Phishers can make use of the βiframeβ tag and make it invisible i.e. without frame borders. In this regard, phishers make use of the βframeBorderβ attribute which causes the browser to render a visual delineation.
|
281 |
+
|
282 |
+
If the iframe is empty or repsonse is not found then, the value assigned to this feature is 1 (phishing) or else 0 (legitimate).
|
283 |
+
"""
|
284 |
+
|
285 |
+
# 15. IFrame Redirection (iFrame)
|
286 |
+
def iframe(response):
|
287 |
+
if response == "":
|
288 |
+
return 1
|
289 |
+
else:
|
290 |
+
if re.findall(r"[<iframe>|<frameBorder>]", response.text):
|
291 |
+
return 0
|
292 |
+
else:
|
293 |
+
return 1
|
294 |
+
|
295 |
+
"""### **3.3.2. Status Bar Customization**
|
296 |
+
|
297 |
+
Phishers may use JavaScript to show a fake URL in the status bar to users. To extract this feature, we must dig-out the webpage source code, particularly the βonMouseOverβ event, and check if it makes any changes on the status bar
|
298 |
+
|
299 |
+
If the response is empty or onmouseover is found then, the value assigned to this feature is 1 (phishing) or else 0 (legitimate).
|
300 |
+
"""
|
301 |
+
|
302 |
+
# 16.Checks the effect of mouse over on status bar (Mouse_Over)
def mouseOver(response):
    if response == "":
        return 1
    else:
        if re.findall("<script>.+onmouseover.+</script>", response.text):
            return 1
        else:
            return 0

"""### **3.3.3. Disabling Right Click**
|
313 |
+
|
314 |
+
Phishers use JavaScript to disable the right-click function, so that users cannot view and save the webpage source code. This feature is treated exactly as βUsing onMouseOver to hide the Linkβ. Nonetheless, for this feature, we will search for event βevent.button==2β in the webpage source code and check if the right click is disabled.
|
315 |
+
|
316 |
+
If the response is empty or onmouseover is not found then, the value assigned to this feature is 1 (phishing) or else 0 (legitimate).
|
317 |
+
"""
|
318 |
+
|
319 |
+
# 17.Checks the status of the right click attribute (Right_Click)
def rightClick(response):
    if response == "":
        return 1
    else:
        if re.findall(r"event.button ?== ?2", response.text):
            return 0
        else:
            return 1

"""### **3.3.4. Website Forwarding**
|
330 |
+
The fine line that distinguishes phishing websites from legitimate ones is how many times a website has been redirected. In our dataset, we find that legitimate websites have been redirected one time max. On the other hand, phishing websites containing this feature have been redirected at least 4 times.
|
331 |
+
"""
|
332 |
+
|
333 |
+
# 18.Checks the number of forwardings (Web_Forwards)
def forwarding(response):
    if response == "":
        return 1
    else:
        if len(response.history) <= 2:
            return 0
        else:
            return 1

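# Example (sketch): requests records each redirect hop in response.history,
# which is what forwarding() counts; the URL below is illustrative only.
# r = requests.get("http://example.com")
# print(len(r.history))  # 0 or 1 for most legitimate sites
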
"""## **4. Computing URL Features**
|
344 |
+
|
345 |
+
Create a list and a function that calls the other functions and stores all the features of the URL in the list. We will extract the features of each URL and append to this list.
|
346 |
+
"""
|
347 |
+
|
348 |
+
#Function to extract features
def featureExtraction(url):

    features = []
    #Address bar based features (10)
    #features.append(getDomain(url))
    features.append(havingIP(url))
    features.append(haveAtSign(url))
    features.append(getLength(url))
    features.append(getDepth(url))
    features.append(redirection(url))
    features.append(httpDomain(url))
    features.append(tinyURL(url))
    features.append(prefixSuffix(url))

    # #Domain based features (4)
    # dns = 0
    # try:
    #     domain_name = whois.whois(urlparse(url).netloc)
    # except:
    #     dns = 1

    # features.append(dns)
    # features.append(web_traffic(url))
    # features.append(1 if dns == 1 else domainAge(domain_name))
    # features.append(1 if dns == 1 else domainEnd(domain_name))

    return features

#converting the list to dataframe
feature_names = ['Domain', 'Have_IP', 'Have_At', 'URL_Length', 'URL_Depth', 'Redirection',
                 'https_Domain', 'TinyURL', 'Prefix/Suffix', 'Label']
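
# Example (sketch): turning extracted features into a DataFrame, assuming
# pandas is available. Note that feature_names includes 'Domain' and 'Label',
# while featureExtraction() currently returns only the 8 address-bar features
# (getDomain and the domain-based features are commented out above).
# import pandas as pd
# rows = [featureExtraction(u) for u in ["https://github.com/user/repo"]]
# df = pd.DataFrame(rows, columns=feature_names[1:-1])
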
xgboost/__init__.py
ADDED
File without changes
xgboost/features.py
ADDED
@@ -0,0 +1,347 @@
"""
from bs4 import BeautifulSoup


with open("mini_dataset/6.html") as f:
    test = f.read()

soup = BeautifulSoup(test, "html.parser")
"""


# has_title
def has_title(soup):
    if soup.title is None:
        return 0
    if len(soup.title.text) > 0:
        return 1
    else:
        return 0


# has_input
def has_input(soup):
    if len(soup.find_all("input")):
        return 1
    else:
        return 0


# has_button
def has_button(soup):
    if len(soup.find_all("button")) > 0:
        return 1
    else:
        return 0


# has_image
def has_image(soup):
    if len(soup.find_all("image")) == 0:
        return 0
    else:
        return 1


# has_submit
def has_submit(soup):
    for button in soup.find_all("input"):
        if button.get("type") == "submit":
            return 1
    return 0


# has_link
def has_link(soup):
    if len(soup.find_all("link")) > 0:
        return 1
    else:
        return 0

# has_password
def has_password(soup):
    for input in soup.find_all("input"):
        # check type, name, and id individually (a bare `a or b or c == x`
        # only compares the first truthy attribute)
        if "password" in (input.get("type"), input.get("name"), input.get("id")):
            return 1
    return 0


# has_email_input
def has_email_input(soup):
    for input in soup.find_all("input"):
        if "email" in (input.get("type"), input.get("id"), input.get("name")):
            return 1
    return 0

# has_hidden_element
def has_hidden_element(soup):
    for input in soup.find_all("input"):
        if input.get("type") == "hidden":
            return 1
    return 0


# has_audio
def has_audio(soup):
    if len(soup.find_all("audio")) > 0:
        return 1
    else:
        return 0


# has_video
def has_video(soup):
    if len(soup.find_all("video")) > 0:
        return 1
    else:
        return 0


# number_of_inputs
def number_of_inputs(soup):
    return len(soup.find_all("input"))


# number_of_buttons
def number_of_buttons(soup):
    return len(soup.find_all("button"))

# number_of_images
def number_of_images(soup):
    image_tags = len(soup.find_all("image"))
    count = 0
    for meta in soup.find_all("meta"):
        # compare both attributes explicitly (`a or b == x` only tests b)
        if meta.get("type") == "image" or meta.get("name") == "image":
            count += 1
    return image_tags + count

# number_of_option
def number_of_option(soup):
    return len(soup.find_all("option"))


# number_of_list
def number_of_list(soup):
    return len(soup.find_all("li"))


# number_of_TH
def number_of_TH(soup):
    return len(soup.find_all("th"))


# number_of_TR
def number_of_TR(soup):
    return len(soup.find_all("tr"))


# number_of_href
def number_of_href(soup):
    count = 0
    for link in soup.find_all("link"):
        if link.get("href"):
            count += 1
    return count


# number_of_paragraph
def number_of_paragraph(soup):
    return len(soup.find_all("p"))


# number_of_script
def number_of_script(soup):
    return len(soup.find_all("script"))


# length_of_title
def length_of_title(soup):
    if soup.title is None:
        return 0
    return len(soup.title.text)


"""
|
177 |
+
print("has_title --> ", has_title(soup))
|
178 |
+
print("has_input --> ", has_input(soup))
|
179 |
+
print("has_button --> ", has_button(soup))
|
180 |
+
print("has_image --> ", has_image(soup))
|
181 |
+
print("has_submit --> ", has_submit(soup))
|
182 |
+
print("has_link --> ", has_link(soup))
|
183 |
+
print("has_password --> ", has_password(soup))
|
184 |
+
print("has_email_input --> ", has_email_input(soup))
|
185 |
+
print("has_hidden_element --> ", has_hidden_element(soup))
|
186 |
+
print("has_audio --> ", has_audio(soup))
|
187 |
+
print("has_video --> ", has_video(soup))
|
188 |
+
print("number_of_inputs --> ", number_of_inputs(soup))
|
189 |
+
print("number_of_buttons --> ", number_of_buttons(soup))
|
190 |
+
print("number_of_images --> ", number_of_images(soup))
|
191 |
+
print("number_of_option --> ", number_of_option(soup))
|
192 |
+
print("number_of_list --> ", number_of_list(soup))
|
193 |
+
print("number_of_TH --> ", number_of_TH(soup))
|
194 |
+
print("number_of_TR --> ", number_of_TR(soup))
|
195 |
+
print("number_of_href --> ", number_of_href(soup))
|
196 |
+
print("number_of_paragraph --> ", number_of_paragraph(soup))
|
197 |
+
print("number_of_script --> ", number_of_script(soup))
|
198 |
+
print("length_of_title --> ", length_of_title(soup))
|
199 |
+
|
200 |
+
"""
|
201 |
+
|
202 |
+
|
203 |
+
# has h1
def has_h1(soup):
    if len(soup.find_all("h1")) > 0:
        return 1
    else:
        return 0


# has h2
def has_h2(soup):
    if len(soup.find_all("h2")) > 0:
        return 1
    else:
        return 0


# has h3
def has_h3(soup):
    if len(soup.find_all("h3")) > 0:
        return 1
    else:
        return 0


# length of text
def length_of_text(soup):
    return len(soup.get_text())


# number of clickable button
def number_of_clickable_button(soup):
    count = 0
    for button in soup.find_all("button"):
        if button.get("type") == "button":
            count += 1
    return count


# number of a
def number_of_a(soup):
    return len(soup.find_all("a"))


# number of img
def number_of_img(soup):
    return len(soup.find_all("img"))


# number of div class
def number_of_div(soup):
    return len(soup.find_all("div"))


# number of figures
def number_of_figure(soup):
    return len(soup.find_all("figure"))


# has footer
def has_footer(soup):
    if len(soup.find_all("footer")) > 0:
        return 1
    else:
        return 0


# has form
def has_form(soup):
    if len(soup.find_all("form")) > 0:
        return 1
    else:
        return 0


# has textarea
def has_text_area(soup):
    if len(soup.find_all("textarea")) > 0:
        return 1
    else:
        return 0


# has iframe
def has_iframe(soup):
    if len(soup.find_all("iframe")) > 0:
        return 1
    else:
        return 0


# has text input
def has_text_input(soup):
    for input in soup.find_all("input"):
        if input.get("type") == "text":
            return 1
    return 0


# number of meta
def number_of_meta(soup):
    return len(soup.find_all("meta"))


# has nav
def has_nav(soup):
    if len(soup.find_all("nav")) > 0:
        return 1
    else:
        return 0


# has object
def has_object(soup):
    if len(soup.find_all("object")) > 0:
        return 1
    else:
        return 0


# has picture
def has_picture(soup):
    if len(soup.find_all("picture")) > 0:
        return 1
    else:
        return 0


# number of sources
def number_of_sources(soup):
    return len(soup.find_all("source"))


# number of span
def number_of_span(soup):
    return len(soup.find_all("span"))


# number of table
def number_of_table(soup):
    return len(soup.find_all("table"))
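
# A minimal smoke test for the extractors above (a sketch; the HTML snippet is
# made up for illustration and BeautifulSoup comes from the bs4 package):
# from bs4 import BeautifulSoup
# soup = BeautifulSoup("<html><title>Hi</title><input type='password'></html>", "html.parser")
# print(has_title(soup), has_password(soup), number_of_inputs(soup))  # 1 1 1
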
xgboost_wrapper.py
ADDED
@@ -0,0 +1,233 @@
"""
XGBoost Model Wrapper
This module provides a safe wrapper around the XGBoost models for phishing detection.
Loads models from Hugging Face Hub: th1enq/xgboost_checkpoint
"""

import os
import sys
import joblib
import pickle
from bs4 import BeautifulSoup
from huggingface_hub import hf_hub_download

# Add xgboost directory to path for feature extraction modules
xgboost_dir = os.path.join(os.path.dirname(__file__), 'xgboost')
sys.path.append(xgboost_dir)

try:
    import features as fe
    from URLFeatureExtraction import featureExtraction
    XGBOOST_AVAILABLE = True
except ImportError as e:
    print(f"XGBoost modules not available: {e}")
    XGBOOST_AVAILABLE = False

def load_model_from_hub(repo_id, filename):
    """Load model from Hugging Face Hub"""
    try:
        # Download model from Hugging Face Hub
        model_path = hf_hub_download(repo_id=repo_id, filename=filename)
        return joblib.load(model_path)
    except Exception as e:
        print(f"Failed to load model {filename} from {repo_id}: {e}")
        return None

def load_model_safe(model_path):
    """Safely load a local model, handling version compatibility issues"""
    try:
        # Try loading with joblib first
        return joblib.load(model_path)
    except Exception as e1:
        try:
            # Fall back to pickle
            with open(model_path, 'rb') as f:
                return pickle.load(f)
        except Exception as e2:
            print(f"Failed to load model {model_path}")
            print(f"Joblib error: {e1}")
            print(f"Pickle error: {e2}")
            return None

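# Example (sketch): fetching one checkpoint directly; assumes the repo and
# filename exist on the Hub and that network access is available.
# url_model = load_model_from_hub("th1enq/xgboost_checkpoint", "xgboost_url.joblib")
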
def extract_features_from_html(html_content):
    """Extract features from HTML content for phishing detection"""
    if not XGBOOST_AVAILABLE:
        return None

    try:
        soup = BeautifulSoup(html_content, "html.parser")

        features = [
            fe.has_title(soup),
            fe.has_input(soup),
            fe.has_button(soup),
            fe.has_image(soup),
            fe.has_submit(soup),
            fe.has_link(soup),
            fe.has_password(soup),
            fe.has_email_input(soup),
            fe.has_hidden_element(soup),
            fe.has_audio(soup),
            fe.has_video(soup),
            fe.number_of_inputs(soup),
            fe.number_of_buttons(soup),
            fe.number_of_images(soup),
            fe.number_of_option(soup),
            fe.number_of_list(soup),
            fe.number_of_TH(soup),
            fe.number_of_TR(soup),
            fe.number_of_href(soup),
            fe.number_of_paragraph(soup),
            fe.number_of_script(soup),
            fe.length_of_title(soup),
            fe.has_h1(soup),
            fe.has_h2(soup),
            fe.has_h3(soup),
            fe.length_of_text(soup),
            fe.number_of_clickable_button(soup),
            fe.number_of_a(soup),
            fe.number_of_img(soup),
            fe.number_of_div(soup),
            fe.number_of_figure(soup),
            fe.has_footer(soup),
            fe.has_form(soup),
            fe.has_text_area(soup),
            fe.has_iframe(soup),
            fe.has_text_input(soup),
            fe.number_of_meta(soup),
            fe.has_nav(soup),
            fe.has_object(soup),
            fe.has_picture(soup),
            fe.number_of_sources(soup),
            fe.number_of_span(soup),
            fe.number_of_table(soup)
        ]

        return features
    except Exception as e:
        print(f"Error extracting HTML features: {e}")
        return [0] * 43

def extract_features_from_url(url):
    """Extract features from URL for phishing detection"""
    if not XGBOOST_AVAILABLE:
        return None

    try:
        return featureExtraction(url)
    except Exception as e:
        print(f"Error extracting URL features: {e}")
        return None

class XGBoostPhishingDetector:
    def __init__(self):
        self.html_model = None
        self.url_model = None
        self.available = XGBOOST_AVAILABLE

        if self.available:
            self._load_models()

    def _load_models(self):
        """Load the XGBoost models from Hugging Face Hub"""
        repo_id = "th1enq/xgboost_checkpoint"

        # Try to load from Hugging Face Hub first
        print("🔄 Loading XGBoost models from Hugging Face Hub...")

        self.html_model = load_model_from_hub(repo_id, 'xgboost_html.joblib')
        if self.html_model:
            print("✅ HTML XGBoost model loaded from Hugging Face Hub")
        else:
            print("❌ Failed to load HTML XGBoost model from Hugging Face Hub")
            # Fallback to local file
            html_model_path = os.path.join(xgboost_dir, 'xgboost_html.joblib')
            if os.path.exists(html_model_path):
                self.html_model = load_model_safe(html_model_path)
                print("✅ HTML XGBoost model loaded from local file")

        self.url_model = load_model_from_hub(repo_id, 'xgboost_url.joblib')
        if self.url_model:
            print("✅ URL XGBoost model loaded from Hugging Face Hub")
        else:
            print("❌ Failed to load URL XGBoost model from Hugging Face Hub")
            # Fallback to local file
            url_model_path = os.path.join(xgboost_dir, 'xgboost_url.joblib')
            if os.path.exists(url_model_path):
                self.url_model = load_model_safe(url_model_path)
                print("✅ URL XGBoost model loaded from local file")

    def predict_html(self, html_content):
        """Predict phishing from HTML content"""
        if not self.available or not self.html_model:
            return None

        features = extract_features_from_html(html_content)
        if features is None:
            return None

        try:
            prediction = self.html_model.predict([features])[0]
            probability = self.html_model.predict_proba([features])[0] if hasattr(self.html_model, 'predict_proba') else [1 - prediction, prediction]
            return {
                'prediction': int(prediction),
                'probability': probability,
                'is_phishing': prediction == 1
            }
        except Exception as e:
            print(f"Error predicting HTML: {e}")
            return None

    def predict_url(self, url):
        """Predict phishing from URL"""
        if not self.available or not self.url_model:
            return None

        features = extract_features_from_url(url)
        if features is None:
            return None

        try:
            prediction = self.url_model.predict([features])[0]
            probability = self.url_model.predict_proba([features])[0] if hasattr(self.url_model, 'predict_proba') else [1 - prediction, prediction]
            return {
                'prediction': int(prediction),
                'probability': probability,
                'is_phishing': prediction == 1
            }
        except Exception as e:
            print(f"Error predicting URL: {e}")
            return None

    def predict_combined(self, url, html_content=None, url_weight=0.3, html_weight=0.7):
        """Predict using both URL and HTML analysis"""
        url_result = self.predict_url(url)
        html_result = None

        if html_content:
            html_result = self.predict_html(html_content)

        if url_result and html_result:
            # Combine predictions as a weighted average of class probabilities
            combined_prob = [
                url_weight * url_result['probability'][0] + html_weight * html_result['probability'][0],
                url_weight * url_result['probability'][1] + html_weight * html_result['probability'][1]
            ]
            combined_prediction = 1 if combined_prob[1] > combined_prob[0] else 0

            return {
                'prediction': combined_prediction,
                'probability': combined_prob,
                'is_phishing': combined_prediction == 1,
                'url_result': url_result,
                'html_result': html_result
            }
        elif url_result:
            return url_result
        elif html_result:
            return html_result
        else:
            return None

# Global instance
xgboost_detector = XGBoostPhishingDetector()
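
# Example usage (sketch): combined URL + HTML prediction via the global
# instance; the URL and HTML below are illustrative only.
# result = xgboost_detector.predict_combined(
#     "http://phishy.example/login",
#     html_content="<html><head><title>Login</title></head><body></body></html>",
# )
# if result:
#     print(result['is_phishing'], result['probability'])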