abdullahalioo committed on
Commit
f02a16d
·
verified ·
1 Parent(s): 56074d9

Upload 6 files

Browse files
README.md CHANGED
@@ -1,3 +1,27 @@
1
- ---
2
- license: mit
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🧠 Beast Spam Detector
2
+
3
+ This is a spam detection model built from scratch using PyTorch. It includes:
4
+
5
+ - Custom tokenizer
6
+ - CNN + BiLSTM model
7
+ - Trained weights (.pt and .safetensors)
8
+ - Easy-to-use inference script
9
+
10
+ ## 📦 Usage
11
+
12
+ ```bash
13
+ python check_spam.py
14
+ ```
15
+
16
+ Type your email content and press Enter twice to get a prediction.
17
+
18
+ ## 🧠 Model
19
+
20
+ Built using a custom tokenizer and a CNN + BiLSTM architecture. Safe to use.
21
+
22
+ ## 📁 Files
23
+
24
+ - `beast_spam_model.pt`: PyTorch weights
25
+ - `beast_spam_model.safetensors`: Safe format model
26
+ - `model.py`: Tokenizer + model
27
+ - `check_spam.py`: Inference script
beast_spam_model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f6cb7a302306b414092528a5bd0c0f324715b47bb4ace0ff8a42d489f16c872
3
+ size 3290696
beast_spam_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6851494e1d42d84ee3c1d0d15d110d81892da3160b190b112a7a8460eb52962d
3
+ size 216
check_spam.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import re
3
+ from model import BeastTokenizer, BeastSpamModel
4
+ from safetensors.torch import load_file
5
+
6
def predict_spam(text, tokenizer, model, threshold=0.5):
    """Classify an email body as spam or not spam.

    The text is lower-cased, stripped of URLs (runs starting with ``http``)
    and of non-word characters, whitespace-collapsed, encoded with
    ``tokenizer.encode`` and passed to ``model`` as a batch of one.

    Args:
        text: Raw email content.
        tokenizer: Object whose ``encode(str)`` returns a list of token ids.
        model: Callable that maps a ``(1, seq_len)`` long tensor to a
            single-element tensor holding the spam score.
        threshold: Score strictly above which the text is reported as spam.
            Defaults to 0.5, the previously hard-coded cut-off.

    Returns:
        A human-readable verdict string.
    """
    # Same three-stage normalisation as before, spelled out step by step.
    without_urls = re.sub(r"http\S+", "", text.lower())
    words_only = re.sub(r"\W", " ", without_urls)
    cleaned = re.sub(r"\s+", " ", words_only).strip()

    token_ids = tokenizer.encode(cleaned)
    batch = torch.tensor([token_ids], dtype=torch.long)
    with torch.no_grad():  # inference only; no autograd graph needed
        score = model(batch).item()

    return "🔥 It is SPAM!" if score > threshold else "✅ It is NOT spam."
13
+
14
if __name__ == "__main__":
    # Interactive entry point: read an email from stdin, then classify it.
    print("📩 Enter the full email content below (press Enter twice to finish):\n")
    lines = []
    while True:
        try:
            line = input()
        except EOFError:
            # Piped or redirected input can end without a trailing blank
            # line; treat end-of-input the same as the blank-line terminator.
            break
        if not line.strip():
            break
        lines.append(line)
    email = "\n".join(lines)

    # Load tokenizer vocab (manually or from file)
    # NOTE(review): this vocabulary is built from a single dummy sentence, so
    # it will not match the vocabulary the weights were trained with. Replace
    # it with the training vocabulary before relying on the predictions; the
    # size mismatch is deliberately left to surface in load_state_dict below.
    texts = ["this is dummy tokenizer data"]
    tokenizer = BeastTokenizer(texts)

    # Load model
    model = BeastSpamModel(len(tokenizer.word2idx))
    model.load_state_dict(load_file("beast_spam_model.safetensors"))
    model.eval()  # switch to inference mode

    print("\n[🔍] Checking email...")
    print(f"[🧠] Result: {predict_spam(email, tokenizer, model)}")
model.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from collections import Counter
4
+
5
class BeastTokenizer:
    """Whitespace tokenizer with a frequency-ranked vocabulary.

    Ids 0 and 1 are reserved for ``<PAD>`` and ``<UNK>``. The remaining ids
    are handed out to the most frequent corpus words, most frequent first.
    """

    def __init__(self, texts=None, vocab_size=5000):
        """Build the word-to-id mapping.

        Args:
            texts: Collection of strings to build the vocabulary from.
                ``None`` or an empty collection leaves only the two special
                tokens in the vocabulary.
            vocab_size: Maximum vocabulary size, including the two special
                tokens.
        """
        # ``None`` replaces the former mutable ``[]`` default, which would be
        # shared between every call that omitted the argument.
        self.word2idx = {'<PAD>': 0, '<UNK>': 1}
        if texts:
            counter = Counter(word for text in texts for word in text.split())
            common = counter.most_common(vocab_size - 2)
            # Offset by 2 so learned words never collide with <PAD>/<UNK>.
            self.word2idx.update(
                {word: idx + 2 for idx, (word, _) in enumerate(common)}
            )

    def encode(self, text, max_len=100):
        """Convert ``text`` to exactly ``max_len`` token ids.

        Words missing from the vocabulary map to 1 (``<UNK>``). Longer inputs
        are truncated; shorter ones are right-padded with 0 (``<PAD>``).

        Args:
            text: Whitespace-separated text to encode.
            max_len: Length of the returned id list.

        Returns:
            A list of ``max_len`` integer token ids.
        """
        tokens = [self.word2idx.get(word, 1) for word in text.split()]
        # A negative repeat count yields an empty list, so over-long inputs
        # are simply truncated by the slice.
        return tokens[:max_len] + [0] * (max_len - len(tokens))
16
+
17
class BeastSpamModel(nn.Module):
    """Embedding -> 1-D convolution -> bidirectional LSTM -> sigmoid score.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Width of each token embedding.
        hidden_dim: Hidden size of each LSTM direction.
    """

    def __init__(self, vocab_size, embed_dim=128, hidden_dim=64):
        super().__init__()
        conv_channels = 128  # conv output width, which is also the LSTM input width
        # Index 0 is the padding id, so its embedding stays at zero.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        # kernel_size=5 with padding=2 keeps the sequence length unchanged.
        self.conv = nn.Conv1d(embed_dim, conv_channels, kernel_size=5, padding=2)
        self.lstm = nn.LSTM(
            conv_channels,
            hidden_dim,
            batch_first=True,
            bidirectional=True,
        )
        # Both LSTM directions are concatenated, hence 2 * hidden_dim inputs.
        self.fc = nn.Linear(hidden_dim * 2, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return one score in [0, 1] per batch row.

        Args:
            x: Tensor of token ids, indexed as (batch, sequence).

        Returns:
            A 1-D tensor with one sigmoid score per batch row.
        """
        embedded = self.embedding(x)
        # Conv1d wants channels before the sequence axis; swap, then swap back.
        channels_first = embedded.transpose(1, 2)
        features = self.conv(channels_first).transpose(1, 2)
        sequence_out, _ = self.lstm(features)
        # Only the final time step feeds the classifier head.
        final_step = sequence_out[:, -1, :]
        score = self.sigmoid(self.fc(final_step))
        return score.squeeze(1)
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ torch
2
+ safetensors
3
+ datasets
4
+ scikit-learn