Update README.md
Browse files
README.md
CHANGED
@@ -26,39 +26,28 @@ The model was trained on a multilingual dataset of cybersecurity and non-cyberse
|
|
26 |
|
27 |
```python
|
28 |
from sentence_transformers import SentenceTransformer
|
29 |
-
from sklearn.model_selection import train_test_split
|
30 |
-
from sklearn.preprocessing import LabelEncoder
|
31 |
-
import pandas as pd
|
32 |
-
import joblib
|
33 |
from huggingface_hub import hf_hub_download
|
|
|
34 |
|
35 |
-
# Load
|
36 |
-
df = pd.read_csv("your_dataset.csv") # Requires 'clean_text' and 'label' columns
|
37 |
-
|
38 |
-
# Load the sentence transformer
|
39 |
embedder = SentenceTransformer("intfloat/multilingual-e5-large")
|
40 |
|
41 |
-
#
|
42 |
-
|
43 |
-
|
44 |
-
df["label"],
|
45 |
-
test_size=0.05,
|
46 |
-
stratify=df["label"],
|
47 |
-
random_state=42
|
48 |
-
)
|
49 |
|
50 |
-
#
|
51 |
-
|
52 |
-
|
53 |
-
|
|
|
54 |
|
55 |
-
# Generate
|
56 |
-
|
57 |
-
X_test_emb = embedder.encode(X_test.tolist(), convert_to_numpy=True, show_progress_bar=True)
|
58 |
|
59 |
-
#
|
60 |
-
|
61 |
-
model = joblib.load(model_path)
|
62 |
|
63 |
-
#
|
64 |
-
|
|
|
|
26 |
|
27 |
```python
|
28 |
from sentence_transformers import SentenceTransformer
|
|
|
|
|
|
|
|
|
29 |
from huggingface_hub import hf_hub_download
|
30 |
+
import joblib
|
31 |
|
32 |
+
# 1. Load the embedding model
|
|
|
|
|
|
|
33 |
embedder = SentenceTransformer("intfloat/multilingual-e5-large")
|
34 |
|
35 |
+
# 2. Load the pretrained MLP classifier from Hugging Face Hub (joblib.load unpickles the file, so only load from a repository you trust)
|
36 |
+
model_path = hf_hub_download(repo_id="selfconstruct3d/cybersec_classifier", filename="cybersec_classifier.pkl")
|
37 |
+
model = joblib.load(model_path)
|
|
|
|
|
|
|
|
|
|
|
38 |
|
39 |
+
# 3. Example input texts (can be in English or German)
|
40 |
+
texts = [
|
41 |
+
"A new ransomware attack has affected critical infrastructure in Germany.",
|
42 |
+
"The local sports club hosted its annual summer festival this weekend."
|
43 |
+
]
|
44 |
|
45 |
+
# 4. Generate embeddings
|
46 |
+
embeddings = embedder.encode(texts, convert_to_numpy=True, show_progress_bar=False)
|
|
|
47 |
|
48 |
+
# 5. Predict cybersecurity relevance
|
49 |
+
predictions = model.predict(embeddings)
|
|
|
50 |
|
51 |
+
# 6. Output results
|
52 |
+
for text, label in zip(texts, predictions):
|
53 |
+
print(f"Text: {text}\nPrediction: {label}\n")
|