nyasukun committed on
Commit
9290385
·
1 Parent(s): 2c269a8
Files changed (2) hide show
  1. app.py +414 -0
  2. requirements.txt +20 -0
app.py ADDED
@@ -0,0 +1,414 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math, json
2
+ import gradio as gr
3
+ import torch, pandas as pd
4
+ import matplotlib.pyplot as plt
5
+ import seaborn as sns
6
+ from transformers import AutoTokenizer, AutoModelForCausalLM
7
+
8
# ZeroGPU support
try:
    import spaces
    ZEROGPU_AVAILABLE = True
    print("ZeroGPU support enabled")
except ImportError:
    ZEROGPU_AVAILABLE = False
    print("ZeroGPU not available, running in standard mode")

    def spaces_gpu_decorator(func=None, duration=60):
        """No-op stand-in for ``spaces.GPU`` used when the package is missing.

        Works both as a bare decorator (``@spaces.GPU``) and as a decorator
        factory (``@spaces.GPU(duration=120)``). In either form the decorated
        function is returned unchanged; ``duration`` is accepted and ignored.
        """
        if callable(func):
            # Bare usage: the function itself was passed in.
            return func

        def decorator(inner):
            return inner

        return decorator

    # Minimal object exposing a ``GPU`` attribute so call sites can use
    # ``spaces.GPU`` regardless of whether the real package is installed.
    spaces = type('spaces', (), {'GPU': staticmethod(spaces_gpu_decorator)})
22
+
23
# Checkpoint to load; swap MODEL_NAME to experiment with another causal LM.
MODEL_NAME = "fdtn-ai/Foundation-Sec-8B"
#MODEL_NAME = "sshleifer/tiny-gpt2"

# Load the tokenizer and the model once at import time.
print(f"Loading model: {MODEL_NAME}")
tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto",
)
model = model.eval()  # inference mode; eval() returns the same module

# Report where the weights ended up.
if hasattr(model, 'device'):
    print(f"Model loaded on device: {model.device}")
else:
    device_info = next(model.parameters()).device
    print(f"Model parameters on device: {device_info}")

# Report the CUDA environment.
_cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {_cuda_ready}")
if _cuda_ready:
    print(f"CUDA device count: {torch.cuda.device_count()}")
    print(f"Current CUDA device: {torch.cuda.current_device()}")
    print(f"CUDA device name: {torch.cuda.get_device_name()}")
46
+
47
# Scoring configuration.
# LEN_ALPHA is the exponent applied to the phrase token count when a binary
# association score is divided by n_tokens ** LEN_ALPHA
# (0 = no length correction, 1 = divide by the full token count).
LEN_ALPHA = 0.7

# Sample campaigns for testing.
CAMPAIGN_LIST = ["Operation Aurora", "Dust Storm", "ShadowHammer", "NotPetya", "SolarWinds"]

# Sample threat actors for testing.
ACTOR_LIST = [
    "APT1",
    "APT28",
    "APT33",
    "APT38",
    "FIN8",
]

# Sample ATT&CK techniques, each written as "<technique ID> <name>".
TECHNIQUE_LIST = [
    "T1059 Command and Scripting Interpreter",
    "T1566 Phishing",
    "T1027 Obfuscated/Stored Files",
    "T1036 Masquerading",
    "T1105 Ingress Tool Transfer",
    "T1018 Remote System Discovery",
    "T1568 Dynamic Resolution",
]
70
+
71
+
72
@spaces.GPU(duration=120)
@torch.no_grad()
def phrase_log_prob(prompt, phrase):
    """Return the log probability of ``phrase`` following ``prompt``.

    The prompt (tokenized with the tokenizer's default special tokens) and the
    phrase (tokenized without special tokens) are concatenated and scored in a
    single forward pass. The result is the sum of the log-softmax values the
    model assigns to each phrase token given everything before it.

    Args:
        prompt: Context string.
        phrase: Continuation whose log probability is computed.

    Returns:
        Sum of per-token log probabilities as a Python float; 0.0 when the
        phrase tokenizes to nothing.

    Raises:
        ValueError: If the prompt tokenizes to zero tokens, so there is no
            position from which to predict the first phrase token.
        Exception: Any other error is printed and re-raised unchanged.
    """
    try:
        # Log GPU usage information
        device_info = next(model.parameters()).device
        print(f"Running phrase_log_prob on device: {device_info}")

        device = model.device
        prompt_ids = tok(prompt, return_tensors="pt")["input_ids"].to(device)
        # NOTE(review): the phrase is tokenized on its own, so no separator
        # (e.g. a leading space) is inserted between prompt and phrase.
        # Confirm this is intended for the tokenizer in use.
        phrase_ids = tok(phrase, add_special_tokens=False)["input_ids"]
        if not phrase_ids:
            return 0.0

        n_prompt = prompt_ids.shape[1]
        if n_prompt == 0:
            raise ValueError("prompt produced no tokens; cannot score phrase")

        phrase_tensor = torch.tensor([phrase_ids], device=device)
        full_ids = torch.cat([prompt_ids, phrase_tensor], dim=1)

        # One pass replaces one full forward pass per phrase token.
        logits = model(full_ids).logits[0].float()

        # Logits at position i predict token i + 1, so the rows that predict
        # the phrase tokens start at the last prompt position.
        first = n_prompt - 1
        predictive = logits[first:first + len(phrase_ids)]
        log_probs = torch.log_softmax(predictive, dim=-1)
        token_lp = log_probs.gather(1, phrase_tensor[0].unsqueeze(1)).squeeze(1)
        return token_lp.sum().item()
    except Exception as e:
        print(f"Error in phrase_log_prob: {e}")
        raise  # bare raise keeps the original traceback intact
93
+
94
+
95
def binary_assoc_score(prompt: str, phrase: str, neg="does NOT use", prompt_template="typically uses") -> float:
    """
    Calculate binary association score: p ≈ P(use) / (P(use)+P(not use))
    Applies length normalization to correct for longer phrases.

    The negative prompt is built by replacing ``prompt_template`` with ``neg``
    inside ``prompt``. Both prompts are scored with ``phrase_log_prob`` and
    the difference of the two log probabilities is passed through a logistic
    function, then divided by ``n_tokens ** LEN_ALPHA``.

    Args:
        prompt: Base prompt string (should contain ``prompt_template``)
        phrase: Phrase to evaluate
        neg: Negative template to replace positive template
        prompt_template: Positive template to be replaced

    Returns:
        Length-normalized association score between 0 and 1
    """
    neg_prompt = prompt.replace(prompt_template, neg)
    if neg_prompt == prompt:
        # Without a distinct negative prompt both log-probs are equal and the
        # logistic term degenerates to 0.5, so make the situation visible.
        print(
            f"Warning: negative prompt is identical to positive prompt "
            f"('{prompt_template}' not replaced in '{prompt}')"
        )

    lp_pos = phrase_log_prob(prompt, phrase)
    lp_neg = phrase_log_prob(neg_prompt, phrase)

    # Logistic transformation, written so that math.exp never receives a
    # large positive argument (which would raise OverflowError).
    delta = lp_pos - lp_neg
    if delta >= 0:
        prob = 1.0 / (1.0 + math.exp(-delta))
    else:
        exp_delta = math.exp(delta)
        prob = exp_delta / (1.0 + exp_delta)

    # Length normalization; clamp to one token so an empty phrase cannot
    # trigger a division by zero.
    n_tok = len(tok(phrase, add_special_tokens=False)["input_ids"])
    return prob / (max(n_tok, 1) ** LEN_ALPHA)
118
+
119
+
120
def campaign_actor_associations(campaigns, actors, prompt_template="is associated with",
                                neg_template="is NOT associated with"):
    """Score every Campaign × Actor pair and rank the actors per campaign.

    For each campaign the prompt ``"<campaign> <prompt_template>"`` is built
    and every actor is scored with ``binary_assoc_score``, using
    ``neg_template`` as the negative counterpart of ``prompt_template``.

    Args:
        campaigns: Iterable of campaign names.
        actors: Iterable of actor names.
        prompt_template: Positive association phrase appended to the campaign.
        neg_template: Negative phrase substituted for ``prompt_template``.

    Returns:
        Dict mapping each campaign to a list of ``(actor, score)`` tuples
        sorted by descending score.
    """
    results = {}
    for camp in campaigns:
        # Build the prompt locally; the module defines no
        # CAMPAIGN_ACTOR_PROMPT constant to format.
        prompt_base = f"{camp} {prompt_template}"
        actor_scores = {
            actor: binary_assoc_score(
                prompt_base, actor, neg=neg_template, prompt_template=prompt_template
            )
            for actor in actors
        }

        # Sort by score, highest first.
        results[camp] = sorted(actor_scores.items(), key=lambda x: x[1], reverse=True)

    return results
135
+
136
+
137
def campaign_technique_matrix(campaigns, techniques, prompt_template="typically uses", neg_template="typically does NOT use"):
    """
    Build the Campaign × Technique association matrix from binary scores.

    Each cell is ``binary_assoc_score`` evaluated on the prompt
    ``"<campaign> <prompt_template>"`` and the technique string.

    Args:
        campaigns: List of campaign names
        techniques: List of technique names
        prompt_template: Template for positive association
        neg_template: Template for negative association

    Returns:
        DataFrame with campaigns as rows, techniques as columns, scores as values
    """
    table = {}
    for campaign in campaigns:
        base_prompt = f"{campaign} {prompt_template}"
        scores = {}
        for technique in techniques:
            scores[technique] = binary_assoc_score(
                base_prompt,
                technique,
                neg=neg_template,
                prompt_template=prompt_template,
            )
        table[campaign] = scores
    return pd.DataFrame.from_dict(table, orient="index")
158
+
159
+
160
def campaign_actor_matrix(campaigns, actors, prompt_template="is associated with",
                          neg_template="is NOT associated with"):
    """Generate the Campaign × Actor association matrix using binary scoring.

    Each cell is ``binary_assoc_score`` evaluated on the prompt
    ``"<campaign> <prompt_template>"`` and the actor name, with
    ``neg_template`` as the negative counterpart of ``prompt_template``.

    Args:
        campaigns: Iterable of campaign names.
        actors: Iterable of actor names.
        prompt_template: Positive association phrase appended to the campaign.
        neg_template: Negative phrase substituted for ``prompt_template``.

    Returns:
        DataFrame built with ``orient="index"``: campaigns as rows, actors as
        columns, scores as values.
    """
    rows = {}
    for camp in campaigns:
        # Build the prompt locally; the module defines no
        # CAMPAIGN_ACTOR_PROMPT constant to format.
        prompt_base = f"{camp} {prompt_template}"
        rows[camp] = {
            actor: binary_assoc_score(
                prompt_base, actor, neg=neg_template, prompt_template=prompt_template
            )
            for actor in actors
        }
    return pd.DataFrame.from_dict(rows, orient="index")
170
+
171
+
172
def campaign_actor_probs(campaigns, actors, prompt_template="is conducted by"):
    """
    Build the Campaign × Actor probability matrix via a per-campaign softmax.

    For each campaign, ``phrase_log_prob`` is evaluated for every actor on the
    prompt ``"<campaign> <prompt_template>"``; the resulting log probabilities
    are exponentiated and normalized so that each row sums to 1.

    Args:
        campaigns: List of campaign names
        actors: List of actor names
        prompt_template: Template for actor association prompt

    Returns:
        DataFrame with campaigns as rows, actors as columns, probabilities as values
    """
    table = {}
    for campaign in campaigns:
        context = f"{campaign} {prompt_template}"

        log_probs = []
        for actor in actors:
            log_probs.append(phrase_log_prob(context, actor))

        # Subtract the maximum before exponentiating for numerical stability.
        peak = max(log_probs)
        weights = [math.exp(value - peak) for value in log_probs]
        total = sum(weights)

        table[campaign] = {
            actor: weight / total for actor, weight in zip(actors, weights)
        }
    return pd.DataFrame.from_dict(table, orient="index")
195
+
196
+
197
def generate_actor_heatmap(c_list, a_list, actor_prompt_template):
    """Render the Campaign-Actor probability heatmap as a matplotlib figure.

    Args:
        c_list: Comma-separated campaign names.
        a_list: Comma-separated actor names.
        actor_prompt_template: Phrase forwarded to ``campaign_actor_probs``.

    Returns:
        A matplotlib figure: the heatmap on success, a text-only figure asking
        for input when either list is empty, or a red error message when any
        exception is raised while computing or plotting.
    """
    # NOTE(review): figures are created through pyplot and never closed here;
    # presumably the caller takes ownership - confirm.

    def _message_figure(message, **text_style):
        # Blank 8x6 canvas showing a single centred line of text.
        fig, ax = plt.subplots(figsize=(8, 6))
        ax.text(0.5, 0.5, message, ha='center', va='center', **text_style)
        ax.set_xlim(0, 1)
        ax.set_ylim(0, 1)
        ax.axis('off')
        return fig

    try:
        # Split the comma-separated inputs, dropping blank entries.
        campaigns = [name for name in (part.strip() for part in c_list.split(",")) if name]
        actors = [name for name in (part.strip() for part in a_list.split(",")) if name]

        if not (campaigns and actors):
            return _message_figure('Please enter both Campaigns and Actors', fontsize=16)

        print(f"Processing {len(campaigns)} campaigns and {len(actors)} actors...")
        print(f"Using prompt template: '{actor_prompt_template}'")

        # Report the compute device.
        if not torch.cuda.is_available():
            print("Running on CPU")
        else:
            print(f"GPU computation enabled - Device: {torch.cuda.get_device_name()}")

        # Probability matrix: one row per campaign, one column per actor.
        df_ca = campaign_actor_probs(campaigns, actors, actor_prompt_template)
        print(f"Actor probability matrix shape: {df_ca.shape}")
        print("Actor probability matrix:")
        print(df_ca.round(4))

        # Scale the canvas with the number of columns and rows.
        width = max(8, len(actors)*1.2)
        height = max(6, len(campaigns)*0.8)
        fig, ax = plt.subplots(figsize=(width, height))

        sns.heatmap(
            df_ca,
            annot=True,
            cmap='plasma',
            fmt='.3f',
            cbar_kws={'label': 'P(actor)'},
            ax=ax,
        )

        ax.set_title('Campaign-Actor Probabilities (softmax normalized)',
                     fontsize=14, pad=20)
        ax.set_xlabel('Actor', fontsize=12)
        ax.set_ylabel('Campaign', fontsize=12)

        # Tilt the column labels; keep the row labels horizontal.
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        plt.setp(ax.get_yticklabels(), rotation=0)

        plt.tight_layout()

        print("Actor heatmap generated successfully!")
        return fig

    except Exception as e:
        print(f"Error in generate_actor_heatmap: {e}")
        import traceback
        traceback.print_exc()

        return _message_figure(f'Error occurred: {str(e)}', fontsize=12, color='red')
259
+
260
+
261
def generate_technique_heatmap(c_list, t_list, technique_prompt_template, technique_neg_template):
    """Render the Campaign-Technique association heatmap as a matplotlib figure.

    Args:
        c_list: Comma-separated campaign names.
        t_list: Comma-separated technique names.
        technique_prompt_template: Positive phrase forwarded to
            ``campaign_technique_matrix``.
        technique_neg_template: Negative phrase forwarded to
            ``campaign_technique_matrix``.

    Returns:
        A matplotlib figure: the heatmap on success, a text-only figure asking
        for input when either list is empty, or a red error message when any
        exception is raised while computing or plotting.
    """
    # NOTE(review): figures are created through pyplot and never closed here;
    # presumably the caller takes ownership - confirm.

    def _message_figure(message, **text_style):
        # Blank 8x6 canvas showing a single centred line of text.
        fig, ax = plt.subplots(figsize=(8, 6))
        ax.text(0.5, 0.5, message, ha='center', va='center', **text_style)
        ax.set_xlim(0, 1)
        ax.set_ylim(0, 1)
        ax.axis('off')
        return fig

    try:
        # Split the comma-separated inputs, dropping blank entries.
        campaigns = [name for name in (part.strip() for part in c_list.split(",")) if name]
        techniques = [name for name in (part.strip() for part in t_list.split(",")) if name]

        if not (campaigns and techniques):
            return _message_figure('Please enter both Campaigns and Techniques', fontsize=16)

        print(f"Processing {len(campaigns)} campaigns and {len(techniques)} techniques...")
        print(f"Using prompt templates: '{technique_prompt_template}' / '{technique_neg_template}'")

        # Report the compute device.
        if not torch.cuda.is_available():
            print("Running on CPU")
        else:
            print(f"GPU computation enabled - Device: {torch.cuda.get_device_name()}")

        # Score matrix: one row per campaign, one column per technique.
        df_ct = campaign_technique_matrix(
            campaigns, techniques, technique_prompt_template, technique_neg_template
        )
        print(f"Score matrix shape: {df_ct.shape}")
        print("Score matrix:")
        print(df_ct.round(4))

        # Scale the canvas with the number of columns and rows.
        width = max(8, len(techniques)*1.2)
        height = max(6, len(campaigns)*0.8)
        fig, ax = plt.subplots(figsize=(width, height))

        sns.heatmap(
            df_ct,
            annot=True,
            cmap='viridis',
            fmt='.3f',
            cbar_kws={'label': 'Association Score'},
            ax=ax,
        )

        ax.set_title('Campaign-Technique Associations (len-norm, independent)',
                     fontsize=14, pad=20)
        ax.set_xlabel('Technique', fontsize=12)
        ax.set_ylabel('Campaign', fontsize=12)

        # Tilt the column labels; keep the row labels horizontal.
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        plt.setp(ax.get_yticklabels(), rotation=0)

        plt.tight_layout()

        print("Technique heatmap generated successfully!")
        return fig

    except Exception as e:
        print(f"Error in generate_technique_heatmap: {e}")
        import traceback
        traceback.print_exc()

        return _message_figure(f'Error occurred: {str(e)}', fontsize=12, color='red')
323
+
324
+
325
# Gradio UI: one shared campaign input, then two independent sections
# (actor probabilities and technique association scores), each with its own
# inputs, button and plot.
with gr.Blocks(title="LLM Threat Graph Demo") as demo:
    gr.Markdown("# 🕸️ LLM Threat Association Analysis\n*Visualizing Campaign-Actor-Technique relationships using Language Models*")

    # Common inputs
    with gr.Row():
        campaigns = gr.Textbox(
            # Default comes from the sample constant so the two stay in sync.
            ", ".join(CAMPAIGN_LIST),
            label="Campaigns (comma-separated)",
            placeholder="e.g., Operation Aurora, NotPetya, Stuxnet"
        )

    # Campaign-Actor section (probabilistic)
    gr.Markdown("## 👤 Campaign-Actor Associations")
    gr.Markdown("Visualizing Campaign-Actor relationships with probabilistic heatmaps")

    gr.Markdown("""
    **Calculation Method**: `P(actor | "{campaign} is conducted by") (softmax normalized)`

    1. Calculate `phrase_log_prob("{campaign} is conducted by", actor)` for each Actor
    2. Apply softmax normalization to create probability distribution (probabilities sum to 1.0 per Campaign)
    3. Result: Shows relative likelihood of each Actor conducting each Campaign
    """)

    with gr.Row():
        actor_prompt_template = gr.Textbox(
            "is conducted by",
            label="Actor Prompt Template",
            placeholder="e.g., is conducted by, is attributed to"
        )

    actors = gr.Textbox(
        ", ".join(ACTOR_LIST),
        label="Actors (comma-separated)",
        placeholder="e.g., APT1, Lazarus Group, Cozy Bear"
    )

    btn_actor = gr.Button("Generate Actor Heatmap", variant="primary")
    plot_actor = gr.Plot(label="Campaign-Actor Heatmap")

    btn_actor.click(
        fn=generate_actor_heatmap,
        inputs=[campaigns, actors, actor_prompt_template],
        outputs=plot_actor,
        show_progress=True
    )

    # Campaign-Technique section (independent scoring)
    gr.Markdown("## 🛠️ Campaign-Technique Associations")
    gr.Markdown("Visualizing Campaign-Technique relationships with independent association scores")

    # f-string so the documented exponent always matches LEN_ALPHA; literal
    # braces around "campaign" are doubled to survive formatting.
    gr.Markdown(f"""
    **Calculation Method**: `Binary Association Score (length-normalized, independent)`

    1. For each Technique, calculate:
       - `lp_pos = phrase_log_prob("{{campaign}} typically uses", technique)`
       - `lp_neg = phrase_log_prob("{{campaign}} typically does NOT use", technique)`
    2. Apply logistic transformation: `prob = 1 / (1 + exp(lp_neg - lp_pos))`
    3. Length normalization: `score = prob / (n_tokens^{LEN_ALPHA})` (penalty for longer phrases)
    4. Result: Independent association scores (0-1) for each Campaign-Technique pair
    """)

    with gr.Row():
        technique_prompt_template = gr.Textbox(
            "typically uses",
            label="Technique Prompt Template (positive)",
            placeholder="e.g., typically uses, commonly employs"
        )
        technique_neg_template = gr.Textbox(
            "typically does NOT use",
            label="Technique Prompt Template (negative)",
            placeholder="e.g., typically does NOT use, never employs"
        )

    techniques = gr.Textbox(
        ", ".join(TECHNIQUE_LIST),
        label="Techniques (comma-separated)",
        placeholder="e.g., T1059 Command and Scripting Interpreter, T1566 Phishing"
    )

    btn_technique = gr.Button("Generate Technique Heatmap", variant="primary")
    plot_technique = gr.Plot(label="Campaign-Technique Heatmap")

    btn_technique.click(
        fn=generate_technique_heatmap,
        inputs=[campaigns, techniques, technique_prompt_template, technique_neg_template],
        outputs=plot_technique,
        show_progress=True
    )

# Start the server only when executed as a script, so importing this module
# (tests, tooling) does not launch the app.
if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core dependencies for LLM Threat Association Analysis (ZeroGPU compatible)
2
+ gradio>=4.0.0
3
+ torch==2.4.0
4
+ transformers>=4.30.0
5
+ pandas>=2.0.0
6
+ accelerate>=0.26.0
7
+
8
+ # Visualization dependencies
9
+ matplotlib>=3.7.0
10
+ seaborn>=0.12.0
11
+
12
+ # Additional utilities
13
+ numpy>=1.24.0
14
+
15
+ # ZeroGPU support
16
+ spaces
17
+
18
+ # Optional: GPU acceleration (uncomment if using CUDA)
19
+ # torchaudio>=2.0.0
20
+ # torchvision>=0.15.0