import streamlit as st
import json
import re

import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity


class GDPRComplianceChecker:
    """Scores a privacy policy against the GDPR articles via embedding similarity."""

    def __init__(self, model_name="nlpaueb/bert-base-uncased-eurlex"):
        # LEGAL-BERT variant pre-trained on EU legislation (EURLEX corpus).
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device).eval()

    def get_embeddings(self, texts):
        """Encode each text into a fixed-size vector via the [CLS] token."""
        embeddings = []
        for text in texts:
            inputs = self.tokenizer(text, return_tensors="pt", truncation=True,
                                    padding=True, max_length=512)
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
            with torch.no_grad():
                output = self.model(**inputs)
            # [CLS] token (position 0) of the last hidden layer as the embedding.
            embedding = output.last_hidden_state[:, 0, :].cpu().numpy()
            embeddings.append(embedding[0])
        return np.array(embeddings)
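    # Hedged sketch (an addition, not part of the original flow): encoding texts
    # one at a time is simple but slow; batching amortizes tokenizer and model
    # overhead. Pooling is kept identical to get_embeddings ([CLS] at position 0).
    def get_embeddings_batched(self, texts, batch_size=16):
        parts = []
        for start in range(0, len(texts), batch_size):
            batch = list(texts[start:start + batch_size])
            inputs = self.tokenizer(batch, return_tensors="pt", truncation=True,
                                    padding=True, max_length=512)
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
            with torch.no_grad():
                output = self.model(**inputs)
            parts.append(output.last_hidden_state[:, 0, :].cpu().numpy())
        if not parts:
            return np.empty((0, self.model.config.hidden_size))
        return np.concatenate(parts, axis=0)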

    def chunk_policy_text(self, text, chunk_size=500):
        """Split policy text on blank lines or sentence ends into ~chunk_size pieces."""
        # Note: splitting on r'\.\s+' consumes the period, so chunks lose their
        # sentence-final punctuation; this is acceptable for embedding purposes.
        paragraphs = re.split(r'\n{2,}|\.\s+', text)
        chunks, current = [], ""
        for para in paragraphs:
            if len(current) + len(para) < chunk_size:
                current += " " + para
            else:
                chunks.append(current.strip())
                current = para
        if current:
            chunks.append(current.strip())
        # Discard fragments too short to carry meaning.
        return [chunk for chunk in chunks if len(chunk) > 50]
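    # For example, with chunk_size=500 a policy of roughly 1,200 characters
    # typically yields two or three chunks here; anything of 50 characters or
    # fewer is dropped by the filter above.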

    def load_gdpr_articles(self, gdpr_json):
        """Flatten each GDPR article into one text and embed them all."""
        gdpr_map, texts = {}, []
        for article in gdpr_json:
            number, title = article["article_number"], article["article_title"]
            body = " ".join(f"{k} {v}" for sec in article["sections"] for k, v in sec.items())
            full_text = f"Article {number}: {title}. {body}"
            gdpr_map[number] = {"title": title, "text": full_text}
            texts.append(full_text)
        embeddings = self.get_embeddings(texts)
        return gdpr_map, embeddings
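    # Input shape expected by load_gdpr_articles (inferred from the parsing
    # above; the example values are illustrative):
    #   [{"article_number": "5",
    #     "article_title": "Principles relating to processing of personal data",
    #     "sections": [{"1": "Personal data shall be processed lawfully ..."}]}]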

    def calculate_compliance_score(self, policy_text, gdpr_map, gdpr_embeddings):
        """Score each GDPR article by its best-matching policy chunk."""
        chunks = self.chunk_policy_text(policy_text)
        if not chunks:
            return {"error": "Policy has no meaningful chunks."}
        chunk_embeddings = self.get_embeddings(chunks)
        # Rows: GDPR articles; columns: policy chunks.
        sim_matrix = cosine_similarity(gdpr_embeddings, chunk_embeddings)

        article_scores = {}
        presence_threshold = 0.35  # below this, the article counts as unaddressed
        total_score, counted_articles = 0, 0

        # dicts preserve insertion order (Python 3.7+), so row i of sim_matrix
        # corresponds to the i-th article inserted in load_gdpr_articles.
        for i, (art_num, art_data) in enumerate(gdpr_map.items()):
            # Cast numpy scalars to Python types so the result stays JSON-serializable.
            max_sim = float(np.max(sim_matrix[i]))
            best_idx = int(np.argmax(sim_matrix[i]))

            if max_sim < presence_threshold:
                continue

            # Rescale [threshold, 1.0] linearly onto [0, 100]:
            # similarity 0.35 -> 0 %, similarity 1.0 -> 100 %.
            score_pct = min(100, max(0, (max_sim - presence_threshold) / (1 - presence_threshold) * 100))
            snippet = chunks[best_idx][:300]
            if len(chunks[best_idx]) > 300:
                snippet += "..."
            article_scores[art_num] = {
                "article_title": art_data["title"],
                "compliance_percentage": round(score_pct, 2),
                "similarity_score": round(max_sim, 4),
                "matched_text_snippet": snippet,
            }
            total_score += score_pct
            counted_articles += 1

        overall = round(total_score / counted_articles, 2) if counted_articles else 0
        return {
            "overall_compliance_percentage": overall,
            "relevant_articles_analyzed": counted_articles,
            "total_policy_chunks": len(chunks),
            "article_scores": article_scores,
        }
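# Hedged sketch (an addition; assumes Streamlit >= 1.18, which provides
# st.cache_resource): Streamlit reruns the whole script on each interaction,
# so caching the checker keeps the BERT weights loaded across reruns instead
# of reloading them every time.
@st.cache_resource
def load_checker():
    return GDPRComplianceChecker()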

st.set_page_config(page_title="GDPR Compliance Checker", layout="wide")
st.title("🛡️ GDPR Compliance Checker")

gdpr_path = "/app/src/gdpr_articles_baseline.json"
policy_path = "/app/src/sephora_com_policy.txt"
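# Hedged guard (an addition, not in the original script): stop with a readable
# message if either input file is missing, rather than an unhandled exception.
import os

if not (os.path.exists(gdpr_path) and os.path.exists(policy_path)):
    st.error(f"Missing input file(s); expected {gdpr_path} and {policy_path}.")
    st.stop()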

with open(gdpr_path, "r", encoding="utf-8") as f:
    gdpr_data = json.load(f)

with open(policy_path, "r", encoding="utf-8") as f:
    policy_text = f.read()

with st.spinner("Analyzing using LegalBERT (EURLEX)..."):
    checker = load_checker()
    gdpr_map, gdpr_embeddings = checker.load_gdpr_articles(gdpr_data)
    result = checker.calculate_compliance_score(policy_text, gdpr_map, gdpr_embeddings)

# calculate_compliance_score signals failure via an "error" key, so check for
# that explicitly; the returned dict itself is always truthy.
if "error" in result:
    st.error(result["error"])
else:
    st.subheader(f"✅ Overall Compliance Score: {result['overall_compliance_percentage']}%")
    st.markdown("---")
    st.subheader("📊 Detailed Article Breakdown")
    for art_num, data in sorted(result['article_scores'].items(),
                                key=lambda x: -x[1]['compliance_percentage']):
        with st.expander(f"Article {art_num} - {data['article_title']} ({data['compliance_percentage']}%)"):
            st.write(f"**Similarity Score**: {data['similarity_score']}")
            st.write(f"**Matched Text**:\n\n{data['matched_text_snippet']}")
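    # Hedged extra (an addition, not in the original app): expose the raw result
    # dict as a JSON download so the scores can be inspected outside Streamlit.
    st.download_button(
        "Download full report (JSON)",
        data=json.dumps(result, indent=2),
        file_name="gdpr_compliance_report.json",
        mime="application/json",
    )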