lenawilli committed on
Commit c73a79b · verified · 1 Parent(s): 2d7abbc

Upload 9 files

src/KnowledgeGraphView.py ADDED
@@ -0,0 +1,143 @@
+ import streamlit as st
+ from rdflib import Graph, RDFS, Namespace, URIRef, RDF, Literal
+ from pyvis.network import Network
+ import streamlit.components.v1 as components
+
+ # --- Load RDF ---
+ g = Graph()
+ g.parse("../KnowledgeGraph/gdpr_policy_graph.ttl", format="ttl")
+
+ BASE_URI = "http://example.org/gdpr#"
+ EX = Namespace(BASE_URI)
+
+ # --- UI Layout ---
+ st.set_page_config(layout="wide")
+ st.title("🕸️ GDPR Knowledge Graph Visualizer")
+
+ # --- Get Filter Options ---
+ articles = sorted({str(o).split(" ")[1] for s, p, o in g.triples((None, RDFS.label, None)) if "Article" in str(o)})
+ sections = sorted({
+     str(label)
+     for sec in g.subjects(RDF.type, EX.PolicySection)
+     for label in g.objects(sec, RDFS.label)
+ })
+
+ col1, col2 = st.columns([1, 3])
+ selected_article = col1.selectbox("🔍 Filter by Article Number", ["All"] + articles)
+ selected_section = col2.selectbox("📄 Filter by Policy Section", ["All"] + sections)
+
+ # --- Determine nodes to show based on filters ---
+ def get_related_nodes_for_section(section_label):
+     # Find the PolicySection node
+     matching_sections = [
+         s for s, p, o in g.triples((None, RDFS.label, Literal(section_label)))
+         if (s, RDF.type, EX.PolicySection) in g
+     ]
+     if not matching_sections:
+         return set()
+     sec_node = matching_sections[0]
+
+     clause_nodes = set(o for _, _, o in g.triples((sec_node, EX.relatesToClause, None)))
+     article_nodes = set(o for c in clause_nodes for _, _, o in g.triples((c, EX.partOf, None)))
+     return {sec_node} | clause_nodes | article_nodes
+
+ def get_related_nodes_for_article(article_number):
+     # Find Article node(s)
+     article_nodes = set(
+         s for s, p, o in g.triples((None, RDFS.label, None))
+         if (s, RDF.type, EX.Article) in g and f"Article {article_number}" in str(o)
+     )
+     if not article_nodes:
+         return set()
+     article_node = list(article_nodes)[0]
+
+     # Clauses of the article (subjects of partOf triples pointing at the article)
+     clause_nodes = set(s for s, _, _ in g.triples((None, EX.partOf, article_node)))
+
+     # Find all policy sections relating to these clauses
+     policy_sections = set(
+         s for s, p, o in g.triples((None, EX.relatesToClause, None)) if o in clause_nodes
+     )
+     return {article_node} | clause_nodes | policy_sections
+
+ def expand_with_neighbors(graph, nodes):
+     expanded = set(nodes)
+     for node in nodes:
+         # Outgoing neighbors
+         for _, _, o in graph.triples((node, None, None)):
+             expanded.add(o)
+         # Incoming neighbors
+         for s, _, _ in graph.triples((None, None, node)):
+             expanded.add(s)
+     return expanded
+
+ if selected_section != "All" and selected_article != "All":
+     # Filter by both: intersection of sets
+     nodes_for_section = get_related_nodes_for_section(selected_section)
+     nodes_for_article = get_related_nodes_for_article(selected_article)
+     base_nodes = nodes_for_section & nodes_for_article
+ elif selected_section != "All":
+     base_nodes = get_related_nodes_for_section(selected_section)
+ elif selected_article != "All":
+     base_nodes = get_related_nodes_for_article(selected_article)
+ else:
+     # Show everything
+     base_nodes = set()
+     for s, p, o in g:
+         base_nodes.add(s)
+         base_nodes.add(o)
+
+ nodes_to_show = expand_with_neighbors(g, base_nodes)
+
+ # --- Initialize network ---
+ net = Network(height="750px", width="100%", bgcolor="#ffffff", font_color="black")
+ net.force_atlas_2based(gravity=-30)
+
+ added_nodes = set()
+
+ def get_type(uri):
+     for s, p, o in g.triples((uri, RDFS.label, None)):
+         label = str(o)
+         if label.startswith("Article"):
+             return "article"
+         elif label.startswith("Art."):
+             return "clause"
+         elif label.startswith("Section"):
+             return "section"
+     return "unknown"
+
+ def get_label(g, node):
+     for _, _, label in g.triples((node, RDFS.label, None)):
+         return str(label)
+     return None
+
+ def color_by_type(node_type):
+     return {
+         "article": "#77B5FE",  # blue
+         "clause": "#81C784",   # green
+         "section": "#FFB74D",  # orange
+     }.get(node_type, "#D3D3D3")
+
+ # --- Add nodes ---
+ for node in nodes_to_show:
+     label = get_label(g, node) or (str(node).split("#")[-1] if isinstance(node, URIRef) else str(node))
+     n_type = get_type(node)
+     tooltip = label
+     for val in g.objects(node, EX.similarityScore):
+         tooltip += f"\nSimilarity: {val}"
+     for val in g.objects(node, RDFS.comment):
+         tooltip += f"\n{val}"
+     net.add_node(str(node), label=label, title=tooltip, color=color_by_type(n_type))
+     added_nodes.add(str(node))
+
+ # --- Add edges only between shown nodes ---
+ for s, p, o in g:
+     if s in nodes_to_show and o in nodes_to_show:
+         pred_label = p.split("#")[-1] if isinstance(p, URIRef) else str(p)
+         net.add_edge(str(s), str(o), label=pred_label)
+
+ # --- Render ---
+ net.save_graph("graph.html")
+ with open("graph.html", "r", encoding="utf-8") as f:
+     html = f.read()
+
+ components.html(html, height=780, scrolling=True)
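
Note that KnowledgeGraphView.py loads the Turtle file through a path relative to the current working directory, so it only renders when the app is launched from inside src/. A minimal sketch of a more robust load, assuming the TTL really lives in a sibling KnowledgeGraph/ directory next to src/ (the constant name is illustrative, not part of the committed file):

    from pathlib import Path
    from rdflib import Graph

    # Assumption: repo layout is <root>/src/KnowledgeGraphView.py and <root>/KnowledgeGraph/gdpr_policy_graph.ttl.
    # Resolving relative to this script instead of the working directory lets
    # `streamlit run src/KnowledgeGraphView.py` work from any location.
    TTL_PATH = Path(__file__).resolve().parent.parent / "KnowledgeGraph" / "gdpr_policy_graph.ttl"

    g = Graph()
    g.parse(str(TTL_PATH), format="ttl")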
src/app-st.py ADDED
@@ -0,0 +1,442 @@
+ import streamlit as st
+ import json
+ import numpy as np
+ import joblib
+ from collections import defaultdict
+ from transformers import AutoTokenizer, AutoModel
+ from sklearn.metrics.pairwise import cosine_similarity
+ import torch
+ import re
+ from typing import List, Dict, Any
+ from openai import OpenAI
+ from dotenv import load_dotenv
+ import os
+ from sentence_transformers import SentenceTransformer
+ from rdflib import Graph, Namespace, URIRef, Literal, RDF, RDFS, XSD
+ import networkx as nx
+ from pyvis.network import Network
+ import streamlit.components.v1 as components
+
+ # ---------------------------
+ # LegalBERT-based compliance checker
+ # ---------------------------
+ class GDPRComplianceChecker:
+     def __init__(self, model_name="nlpaueb/bert-base-uncased-eurlex"):
+         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+         self.model = AutoModel.from_pretrained(model_name)
+         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         self.model.to(self.device).eval()
+
+     def get_embeddings(self, texts):
+         embeddings = []
+         for text in texts:
+             inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
+             inputs = {k: v.to(self.device) for k, v in inputs.items()}
+             with torch.no_grad():
+                 output = self.model(**inputs)
+             # Use the [CLS] token vector as the text representation
+             embedding = output.last_hidden_state[:, 0, :].cpu().numpy()
+             embeddings.append(embedding[0])
+         return np.array(embeddings)
+
+     def chunk_policy_text(self, text, chunk_size=500):
+         paragraphs = re.split(r'\n{2,}|\.\s+', text)
+         chunks, current = [], ""
+         for para in paragraphs:
+             if len(current) + len(para) < chunk_size:
+                 current += " " + para
+             else:
+                 chunks.append(current.strip())
+                 current = para
+         if current:
+             chunks.append(current.strip())
+         return [chunk for chunk in chunks if len(chunk) > 50]
+
+     def load_gdpr_articles(self, gdpr_json):
+         gdpr_map, texts = {}, []
+         for article in gdpr_json:
+             number, title = article["article_number"], article["article_title"]
+             body = " ".join([f"{k} {v}" for sec in article["sections"] for k, v in sec.items()])
+             full_text = f"Article {number}: {title}. {body}"
+             gdpr_map[number] = {"title": title, "text": full_text}
+             texts.append(full_text)
+
+         embeddings = self.get_embeddings(texts)
+         return gdpr_map, embeddings
+
+     def calculate_compliance_score(self, policy_text, gdpr_map, gdpr_embeddings):
+         chunks = self.chunk_policy_text(policy_text)
+         if not chunks:
+             return {"error": "Policy has no meaningful chunks."}
+         chunk_embeddings = self.get_embeddings(chunks)
+         sim_matrix = cosine_similarity(gdpr_embeddings, chunk_embeddings)
+
+         article_scores = {}
+         presence_threshold = 0.35
+         total_score, counted_articles = 0, 0
+
+         for i, (art_num, art_data) in enumerate(gdpr_map.items()):
+             max_sim = np.max(sim_matrix[i])
+             best_idx = np.argmax(sim_matrix[i])
+
+             if max_sim < presence_threshold:
+                 continue
+
+             score_pct = min(100, max(0, (max_sim - presence_threshold) / (1 - presence_threshold) * 100))
+             article_scores[art_num] = {
+                 "article_title": art_data["title"],
+                 "compliance_percentage": round(score_pct, 2),
+                 "similarity_score": round(max_sim, 4),
+                 "matched_text_snippet": chunks[best_idx][:300] + "..."
+             }
+             total_score += score_pct
+             counted_articles += 1
+
+         overall = round(total_score / counted_articles, 2) if counted_articles else 0
+         return {
+             "overall_compliance_percentage": overall,
+             "relevant_articles_analyzed": counted_articles,
+             "total_policy_chunks": len(chunks),
+             "article_scores": article_scores
+         }
+
+
+ def chunk_policy_text(text, chunk_size=500):
+     paragraphs = re.split(r'\n{2,}|\.\s+', text)
+     chunks, current = [], ""
+     for para in paragraphs:
+         if len(current) + len(para) < chunk_size:
+             current += " " + para
+         else:
+             chunks.append(current.strip())
+             current = para
+     if current:
+         chunks.append(current.strip())
+     return [chunk for chunk in chunks if len(chunk) > 50]
+
+ def prepare_article_text(article: Dict[str, Any]) -> str:
+     body = " ".join(
+         " ".join(sec.values()) if isinstance(sec, dict) else str(sec)
+         for sec in article.get("sections", [])
+     )
+     return f"Art. {article['article_number']} – {article['article_title']} {body}"
+
+ def get_embedding(text: str) -> List[float]:
+     # If input is a list of strings, clean each string
+     if isinstance(text, list):
+         cleaned_text = [t.replace("\n", " ") for t in text]
+     else:  # single string
+         cleaned_text = text.replace("\n", " ")
+     resp = client.embeddings.create(model=EMBED_MODEL, input=cleaned_text)
+     if isinstance(cleaned_text, list):
+         return [item.embedding for item in resp.data]
+     else:
+         return resp.data[0].embedding
+
+ def rdflib_to_networkx(rdflib_graph):
+     nx_graph = nx.MultiDiGraph()
+     for s, p, o in rdflib_graph:
+         nx_graph.add_edge(str(s), str(o), label=str(p))
+     return nx_graph
+
+ def draw_pyvis_graph(nx_graph):
+     net = Network(height="600px", width="100%", directed=True, notebook=False)
+     net.from_nx(nx_graph)
+     net.repulsion(node_distance=200, central_gravity=0.33, spring_length=100, spring_strength=0.10, damping=0.95)
+     return net
+
+ # ---------------------------
+ # Streamlit interface
+ # ---------------------------
+ st.set_page_config(page_title="GDPR Compliance Checker", layout="wide")
+ st.title("🛡️ GDPR Compliance Checker")
+
+ with st.sidebar:
+     st.header("Upload Files")
+     gdpr_file = st.file_uploader("GDPR JSON File", type=["json"])
+     policy_file = st.file_uploader("Company Policy (.txt)", type=["txt"])
+
+ if gdpr_file and policy_file:
+     model_choice = st.selectbox(
+         "Choose the model to use:",
+         ["Logistic Regression", "MultinomialNB", "LegalBERT (Eurlex)", "SentenceTransformer", "LLM Model", "Knowledge Graphs"]
+     )
+
+     gdpr_data = json.load(gdpr_file)
+     article_title_map = {f"Article {a['article_number']}": a['article_title'] for a in gdpr_data}
+
+     policy_text = policy_file.read().decode("utf-8")
+
+     with st.spinner("Analyzing..."):
+         if model_choice == "LegalBERT (Eurlex)":
+             checker = GDPRComplianceChecker()
+             gdpr_map, gdpr_embeddings = checker.load_gdpr_articles(gdpr_data)
+             result = checker.calculate_compliance_score(policy_text, gdpr_map, gdpr_embeddings)
+
+         elif model_choice in ["Logistic Regression", "MultinomialNB"]:
+             if model_choice == "Logistic Regression":
+                 model = joblib.load("logistic_regression_model.joblib")
+                 vectorizer = joblib.load("logistic_regression_vectorizer.joblib")
+             else:
+                 model = joblib.load("multinomialNB_model.joblib")
+                 vectorizer = joblib.load("multinomialNB_vectorizer.joblib")
+
+             chunks = chunk_policy_text(policy_text)
+             chunks = [c.strip() for c in chunks if len(c.strip()) > 40]
+             X_tfidf = vectorizer.transform(chunks)
+             y_pred = model.predict(X_tfidf)
+             y_proba = model.predict_proba(X_tfidf)
+
+             article_scores = defaultdict(lambda: {
+                 "article_title": "",
+                 "compliance_percentage": 0.0,
+                 "similarity_score": 0.0,
+                 "matched_text_snippet": ""
+             })
+             total_score = 0
+             counted_chunks = 0
+
+             for i, (label, prob_vector) in enumerate(zip(y_pred, y_proba)):
+                 max_prob = max(prob_vector)
+                 if max_prob >= 0.35:
+                     score_pct = min(100.0, max(0.0, (max_prob - 0.35) / (1 - 0.35) * 100))
+                     if score_pct > article_scores[label]["compliance_percentage"]:
+                         article_scores[label]["compliance_percentage"] = score_pct
+                         article_scores[label]["similarity_score"] = round(max_prob, 4)
+                         article_scores[label]["matched_text_snippet"] = chunks[i][:300] + "..."
+                         article_scores[label]["article_title"] = article_title_map.get(label, label)
+                     total_score += score_pct
+                     counted_chunks += 1
+
+             overall = round(total_score / counted_chunks, 2) if counted_chunks else 0
+             result = {
+                 "overall_compliance_percentage": overall,
+                 "relevant_articles_analyzed": len(article_scores),
+                 "total_policy_chunks": len(chunks),
+                 "article_scores": dict(article_scores)
+             }
+
+         elif model_choice == "SentenceTransformer":
+             model = joblib.load("sentence_transformer_model.joblib")
+             gdpr_texts = []
+             gdpr_map = {}
+             for article in gdpr_data:
+                 number, title = article["article_number"], article["article_title"]
+                 body = " ".join([f"{k} {v}" for sec in article["sections"] for k, v in sec.items()])
+                 full_text = f"Article {number}: {title}. {body}"
+                 gdpr_map[number] = {
+                     "title": title,
+                     "text": full_text
+                 }
+                 gdpr_texts.append(full_text)
+
+             gdpr_embeddings = model.encode(gdpr_texts, convert_to_numpy=True)
+
+             chunks = chunk_policy_text(policy_text)
+             chunk_embeddings = model.encode(chunks, convert_to_numpy=True)
+
+             sim_matrix = cosine_similarity(gdpr_embeddings, chunk_embeddings)
+
+             article_scores = {}
+             presence_threshold = 0.35
+             total_score, counted_articles = 0, 0
+
+             for i, (art_num, art_data) in enumerate(gdpr_map.items()):
+                 max_sim = np.max(sim_matrix[i])
+                 best_idx = np.argmax(sim_matrix[i])
+
+                 if max_sim < presence_threshold:
+                     continue
+
+                 score_pct = min(100, max(0, (max_sim - presence_threshold) / (1 - presence_threshold) * 100))
+                 article_scores[art_num] = {
+                     "article_title": art_data["title"],
+                     "compliance_percentage": round(score_pct, 2),
+                     "similarity_score": round(max_sim, 4),
+                     "matched_text_snippet": chunks[best_idx][:300] + "..."
+                 }
+                 total_score += score_pct
+                 counted_articles += 1
+
+             overall = round(total_score / counted_articles, 2) if counted_articles else 0
+             result = {
+                 "overall_compliance_percentage": overall,
+                 "relevant_articles_analyzed": counted_articles,
+                 "total_policy_chunks": len(chunks),
+                 "article_scores": article_scores
+             }
+
+         elif model_choice == "LLM Model":
+             load_dotenv()
+             api_key = os.getenv("OPENAI_API_KEY")
+             client = OpenAI(api_key=api_key)
+             EMBED_MODEL = "text-embedding-3-small"
+             gdpr_embeddings = {}
+             gdpr_map = {}
+             for art in gdpr_data:
+                 number, title = art["article_number"], art["article_title"]
+                 art_text = prepare_article_text(art)
+                 gdpr_embeddings[art["article_number"]] = {
+                     "embedding": get_embedding(art_text),
+                     "title": art["article_title"]
+                 }
+                 gdpr_map[number] = {"title": title, "text": art_text}
+             chunks = chunk_policy_text(policy_text)
+             chunk_embeddings = get_embedding(chunks)
+             gdpr_embedding_vectors = [v["embedding"] for v in gdpr_embeddings.values()]
+             sim_matrix = cosine_similarity(gdpr_embedding_vectors, chunk_embeddings)
+
+             article_scores = {}
+             presence_threshold = 0.35
+             total_score, counted_articles = 0, 0
+
+             for i, (art_num, art_data) in enumerate(gdpr_map.items()):
+                 max_sim = np.max(sim_matrix[i])
+                 best_idx = np.argmax(sim_matrix[i])
+
+                 if max_sim < presence_threshold:
+                     continue
+
+                 score_pct = min(100, max(0, (max_sim - presence_threshold) / (1 - presence_threshold) * 100))
+                 article_scores[art_num] = {
+                     "article_title": art_data["title"],
+                     "compliance_percentage": round(score_pct, 2),
+                     "similarity_score": round(max_sim, 4),
+                     "matched_text_snippet": chunks[best_idx][:300] + "..."
+                 }
+                 total_score += score_pct
+                 counted_articles += 1
+
+             overall = round(total_score / counted_articles, 2) if counted_articles else 0
+             result = {
+                 "overall_compliance_percentage": overall,
+                 "relevant_articles_analyzed": counted_articles,
+                 "total_policy_chunks": len(chunks),
+                 "article_scores": article_scores
+             }
+
+         elif model_choice == "Knowledge Graphs":
+             EMBED_MODEL = "all-MiniLM-L6-v2"
+             model = SentenceTransformer(EMBED_MODEL)
+             TOP_N = 1
+             BASE_URI = "http://example.org/gdpr#"
+             gdpr_embeddings = {}
+             gdpr_map = {}
+             for art in gdpr_data:
+                 number, title = art["article_number"], art["article_title"]
+                 art_text = prepare_article_text(art)
+                 gdpr_embeddings[art["article_number"]] = {
+                     "embedding": model.encode(art_text),
+                     "title": art["article_title"],
+                     "uri": URIRef(f"{BASE_URI}Article{art['article_number']}")
+                 }
+                 gdpr_map[number] = {"title": title, "text": art_text}
+
+             g = Graph()
+             EX = Namespace(BASE_URI)
+             g.bind("ex", EX)
+
+             # Add article nodes
+             for num, info in gdpr_embeddings.items():
+                 g.add((info["uri"], RDF.type, EX.Article))
+                 g.add((info["uri"], RDFS.label, Literal(f"Article {num}: {info['title']}")))
+
+             # Extract GDPR article vectors
+             article_nums = list(gdpr_embeddings.keys())
+             article_vectors = np.array([gdpr_embeddings[num]["embedding"] for num in article_nums])
+
+             # Score tracking
+             total_score = 0
+             counted_sections = 0
+             chunks = chunk_policy_text(policy_text)
+             report = []
+             presence_threshold = 0.35
+
+             # Process each policy chunk
+             for idx, text in enumerate(chunks, start=1):
+                 if not text.strip():
+                     continue
+
+                 # RDF section node
+                 sec_uri = URIRef(f"{BASE_URI}PolicySection{idx}")
+                 g.add((sec_uri, RDF.type, EX.PolicySection))
+                 g.add((sec_uri, RDFS.label, Literal(f"Section {idx}")))
+
+                 # Embed section
+                 sec_emb = model.encode(text)
+
+                 # Similarities to all articles
+                 sims = []
+                 for i, art_num in enumerate(article_nums):
+                     art_emb = article_vectors[i]
+                     sim = cosine_similarity([sec_emb], [art_emb])[0][0]
+                     sims.append({
+                         "article": art_num,
+                         "title": gdpr_embeddings[art_num]["title"],
+                         "similarity": round(sim, 4),
+                         "uri": gdpr_embeddings[art_num]["uri"],
+                         "text": gdpr_map[art_num]["text"]
+                     })
+
+                 # Sort and pick best match
+                 sims.sort(key=lambda x: x["similarity"], reverse=True)
+                 top_match = sims[0]
+
+                 # Threshold filtering
+                 if top_match["similarity"] < presence_threshold:
+                     continue
+
+                 # Compliance score
+                 score_pct = min(100, max(0, (top_match["similarity"] - presence_threshold) / (1 - presence_threshold) * 100))
+
+                 # Add RDF triples
+                 g.add((sec_uri, EX.relatesTo, top_match["uri"]))
+                 g.add((sec_uri, EX.similarityScore, Literal(top_match["similarity"], datatype=XSD.float)))
+
+                 g.serialize(destination="gdpr_policy_graph.ttl", format="turtle")
+
+                 total_score += score_pct
+                 counted_sections += 1
+
+             # Final summary
+             overall = round(total_score / counted_sections, 2) if counted_sections else 0
+             result = {
+                 "overall_compliance_percentage": overall,
+                 "relevant_sections_analyzed": counted_sections,
+                 "total_policy_sections": len(chunks),
+                 "ttl": True
+             }
+
+         else:
+             result = {}
+
+     if result:
+         st.subheader(f"✅ Overall Compliance Score: {result['overall_compliance_percentage']}%")
+         st.markdown("---")
+         st.subheader("📋 Detailed Article Breakdown")
+         ttl_file_path = "gdpr_policy_graph.ttl"
+         if result.get('article_scores'):
+             for art_num, data in sorted(result['article_scores'].items(), key=lambda x: -x[1]['compliance_percentage']):
+                 with st.expander(f"Article {art_num} - {data['article_title']} ({data['compliance_percentage']}%)"):
+                     st.write(f"**Similarity Score**: {data['similarity_score']}")
+                     st.write(f"**Matched Text**:\n\n{data['matched_text_snippet']}")
+         elif result.get("ttl") and os.path.exists(ttl_file_path):
+             st.markdown("---")
+             st.subheader("🧠 Interactive RDF Graph Visualization")
+
+             g = Graph()
+             g.parse(ttl_file_path, format="ttl")
+
+             nx_graph = rdflib_to_networkx(g)
+             net = draw_pyvis_graph(nx_graph)
+
+             # Save the interactive graph temporarily
+             net.save_graph("rdf_graph.html")
+             with open("rdf_graph.html", "r", encoding="utf-8") as f:
+                 HtmlFile = f.read()
+
+             # Display interactive graph inside Streamlit
+             components.html(HtmlFile, height=650, scrolling=True)
+
+         else:
+             st.info("No article scores or RDF graph to display.")
+
+ else:
+     st.info("Please upload both a GDPR JSON file and a company policy text file to begin.")
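
All similarity-based branches above (and the checker class) convert cosine similarity to a compliance percentage the same way: anything below the 0.35 presence threshold is skipped, and the remaining range [0.35, 1.0] is rescaled linearly onto [0, 100]. A small worked example (the similarity value is made up for illustration):

    presence_threshold = 0.35
    max_sim = 0.675  # hypothetical best cosine similarity for one article
    score_pct = min(100, max(0, (max_sim - presence_threshold) / (1 - presence_threshold) * 100))
    # (0.675 - 0.35) / 0.65 * 100 = 50.0 -> reported as a 50% compliance score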
src/gdpr_articles_baseline.json ADDED
The diff for this file is too large to render. See raw diff
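
Since this diff is not rendered, here is the rough shape app-st.py expects for each entry, inferred from the fields the code reads (article_number, article_title, and a sections list of key/value dicts). The values below are illustrative placeholders, not content from the actual file:

    example_article = {
        "article_number": "5",
        "article_title": "Principles relating to processing of personal data",
        "sections": [
            # Assumption: each section dict maps a section identifier to its text
            {"1(a)": "Personal data shall be processed lawfully, fairly and in a transparent manner ..."}
        ]
    }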
 
src/logistic_regression_model.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:091220aebae9dc7e864e5dea7c53cee4257342d759fe88bae43e247e4f75c2dd
+ size 8558559
src/logistic_regression_vectorizer.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:300da132d26d2172ca64ce66dfb522048fc3b0238e93c850c849744c69c7c46c
+ size 255317
src/multinomialNB_model.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:023df78556de03380ff5a50a0c53c9c2f10bb330f84d4ead95a465aec8c5c84e
+ size 17115775
src/multinomialNB_vectorizer.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:300da132d26d2172ca64ce66dfb522048fc3b0238e93c850c849744c69c7c46c
+ size 255317
src/requirements.txt ADDED
@@ -0,0 +1,17 @@
+ pandas
+ scikit-learn
+ nltk
+ joblib
+ streamlit
+ torch
+ transformers
+ numpy
+ matplotlib
+ seaborn
+ sentence_transformers
+ rdflib
+ openai
+ python-dotenv
+ networkx
+ pyvis
src/sentence_transformer_model.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:517b22c8e93043f88e3fcc73d567e88d3ac9da66babd9d061ed0ca30ed58c6fc
+ size 91394608