Upload 6 files
- README.md +27 -3
- beast_spam_model.pt +3 -0
- beast_spam_model.safetensors +3 -0
- check_spam.py +34 -0
- model.py +32 -0
- requirements.txt +4 -0
README.md
CHANGED
@@ -1,3 +1,27 @@
# 🧠 Beast Spam Detector

This is a spam detection model built from scratch using PyTorch. It includes:

- Custom tokenizer
- CNN + BiLSTM model
- Trained weights (.pt and .safetensors)
- Easy-to-use inference script

## 📦 Usage

```bash
python check_spam.py
```

Type your email content and press Enter twice to get a prediction.

## 🧠 Model

Built around a custom tokenizer and a CNN + BiLSTM architecture: an embedding layer feeds a 1-D convolution, and a bidirectional LSTM with a sigmoid-activated linear head produces a spam probability. The `.safetensors` weights can be loaded without executing pickled code.
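
For programmatic use, the model can be driven directly from Python, mirroring `check_spam.py`. A minimal sketch, where `training_texts` is a placeholder for the corpus the tokenizer was fitted on at training time (the checkpoint's embedding size depends on the vocabulary, so the two must match):

```python
import torch
from safetensors.torch import load_file
from model import BeastTokenizer, BeastSpamModel

# Rebuild the tokenizer from the training corpus (placeholder name);
# the checkpoint's embedding matrix must match len(tokenizer.word2idx).
tokenizer = BeastTokenizer(training_texts)

model = BeastSpamModel(len(tokenizer.word2idx))
model.load_state_dict(load_file("beast_spam_model.safetensors"))
model.eval()

with torch.no_grad():
    score = model(torch.tensor([tokenizer.encode("win a free prize now")])).item()
print("SPAM" if score > 0.5 else "NOT spam")
```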

## 📄 Files

- `beast_spam_model.pt`: PyTorch weights
- `beast_spam_model.safetensors`: weights in safetensors format
- `model.py`: tokenizer + model definitions
- `check_spam.py`: inference script
|
beast_spam_model.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7f6cb7a302306b414092528a5bd0c0f324715b47bb4ace0ff8a42d489f16c872
size 3290696
beast_spam_model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6851494e1d42d84ee3c1d0d15d110d81892da3160b190b112a7a8460eb52962d
size 216
check_spam.py
ADDED
@@ -0,0 +1,34 @@
import re

import torch
from safetensors.torch import load_file

from model import BeastTokenizer, BeastSpamModel

def predict_spam(text, tokenizer, model):
    # Normalize the text: strip URLs, drop non-word characters,
    # and collapse runs of whitespace.
    cleaned = re.sub(r"\s+", " ", re.sub(r"\W", " ", re.sub(r"http\S+", "", text.lower()))).strip()
    encoded = tokenizer.encode(cleaned)
    tensor = torch.tensor([encoded], dtype=torch.long)  # batch of one
    with torch.no_grad():
        output = model(tensor).item()
    return "🔥 It is SPAM!" if output > 0.5 else "✅ It is NOT spam."

if __name__ == "__main__":
    print("📩 Enter the full email content below (press Enter twice to finish):\n")
    lines = []
    while True:
        line = input()
        if line.strip() == "":
            break
        lines.append(line)
    email = "\n".join(lines)

    # Load tokenizer vocab (manually or from file). NOTE: the vocabulary
    # must match the one used during training, or load_state_dict below
    # will fail with a shape mismatch on the embedding weights.
    texts = ["this is dummy tokenizer data"]
    tokenizer = BeastTokenizer(texts)

    # Load the trained weights from the safetensors checkpoint.
    model = BeastSpamModel(len(tokenizer.word2idx))
    model.load_state_dict(load_file("beast_spam_model.safetensors"))
    model.eval()

    print("\n[🔍] Checking email...")
    print(f"[🧠] Result: {predict_spam(email, tokenizer, model)}")
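The dummy tokenizer above is only a placeholder. A minimal sketch of how the training vocabulary could be persisted and restored instead, assuming a `vocab.json` written at training time (that file is not part of this upload, and `training_texts` is a placeholder for the actual training data):

```python
import json

from model import BeastTokenizer

# At training time: fit the tokenizer on the real corpus and save the vocab.
tokenizer = BeastTokenizer(training_texts)
with open("vocab.json", "w") as f:
    json.dump(tokenizer.word2idx, f)

# At inference time: restore the exact same vocabulary.
tokenizer = BeastTokenizer()
with open("vocab.json") as f:
    tokenizer.word2idx = json.load(f)
```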
model.py
ADDED
@@ -0,0 +1,32 @@
import torch.nn as nn
from collections import Counter

class BeastTokenizer:
    def __init__(self, texts=None, vocab_size=5000):
        # Index 0 is reserved for padding, index 1 for unknown words.
        self.word2idx = {'<PAD>': 0, '<UNK>': 1}
        if texts:
            counter = Counter(word for text in texts for word in text.split())
            common = counter.most_common(vocab_size - 2)
            self.word2idx.update({word: idx + 2 for idx, (word, _) in enumerate(common)})

    def encode(self, text, max_len=100):
        # Map words to indices (unknown -> 1), truncate to max_len,
        # then right-pad with zeros to a fixed length.
        tokens = [self.word2idx.get(word, 1) for word in text.split()]
        return tokens[:max_len] + [0] * (max_len - len(tokens))

class BeastSpamModel(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, hidden_dim=64):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.conv = nn.Conv1d(embed_dim, 128, kernel_size=5, padding=2)
        self.lstm = nn.LSTM(128, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.embedding(x)              # (batch, seq_len, embed_dim)
        x = x.permute(0, 2, 1)             # Conv1d expects (batch, channels, seq_len)
        x = self.conv(x).permute(0, 2, 1)  # back to (batch, seq_len, 128)
        lstm_out, _ = self.lstm(x)         # (batch, seq_len, 2 * hidden_dim)
        out = self.fc(lstm_out[:, -1, :])  # classify from the final time step
        return self.sigmoid(out).squeeze(1)
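A quick shape check of the pipeline above, using a toy corpus (illustrative only; scores from freshly initialized weights are meaningless):

```python
import torch
from model import BeastTokenizer, BeastSpamModel

tok = BeastTokenizer(["free prize click now", "meeting at noon tomorrow"])
model = BeastSpamModel(len(tok.word2idx))

batch = torch.tensor([tok.encode("claim your free prize"),
                      tok.encode("see you at noon")])
print(batch.shape)         # torch.Size([2, 100]) -- padded token ids
print(model(batch).shape)  # torch.Size([2])      -- one probability per text
```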
requirements.txt
ADDED
@@ -0,0 +1,4 @@
torch
safetensors
datasets
scikit-learn