# GDPR / src/streamlit_app.py
import streamlit as st
import json
import re

import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

# ---------------------------
# LegalBERT-based compliance checker
# ---------------------------
class GDPRComplianceChecker:
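    """Scores a privacy policy against the GDPR articles.

    GDPR articles and policy chunks are embedded with an EUR-Lex-pretrained
    BERT model; each article is scored by the cosine similarity of its
    best-matching policy chunk.
    """
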
    def __init__(self, model_name="nlpaueb/bert-base-uncased-eurlex"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device).eval()

    def get_embeddings(self, texts):
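        """Return one [CLS]-token embedding per input text as a NumPy array."""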
        embeddings = []
        for text in texts:
            inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
            with torch.no_grad():
                output = self.model(**inputs)
            embedding = output.last_hidden_state[:, 0, :].cpu().numpy()
            embeddings.append(embedding[0])
        return np.array(embeddings)

    def chunk_policy_text(self, text, chunk_size=500):
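        """Split the policy on blank lines / sentence ends and greedily pack
        the pieces into ~chunk_size-character chunks, dropping fragments
        shorter than 50 characters."""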
        paragraphs = re.split(r'\n{2,}|\.\s+', text)
        chunks, current = [], ""
        for para in paragraphs:
            if len(current) + len(para) < chunk_size:
                current += " " + para
            else:
                chunks.append(current.strip())
                current = para
        if current:
            chunks.append(current.strip())
        return [chunk for chunk in chunks if len(chunk) > 50]

    def load_gdpr_articles(self, gdpr_json):
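        """Build {article_number: {"title", "text"}} and embed every article."""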
        gdpr_map, texts = {}, []
        for article in gdpr_json:
            number, title = article["article_number"], article["article_title"]
            body = " ".join([f"{k} {v}" for sec in article["sections"] for k, v in sec.items()])
            full_text = f"Article {number}: {title}. {body}"
            gdpr_map[number] = {"title": title, "text": full_text}
            texts.append(full_text)
        embeddings = self.get_embeddings(texts)
        return gdpr_map, embeddings

    def calculate_compliance_score(self, policy_text, gdpr_map, gdpr_embeddings):
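        """Score each GDPR article by its best-matching policy chunk.

        An article whose best cosine similarity falls below the presence
        threshold (0.35) is treated as not addressed; above it, the
        similarity is rescaled linearly onto 0-100 %.
        """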
        chunks = self.chunk_policy_text(policy_text)
        if not chunks:
            return {"error": "Policy has no meaningful chunks."}
        chunk_embeddings = self.get_embeddings(chunks)
        sim_matrix = cosine_similarity(gdpr_embeddings, chunk_embeddings)
        article_scores = {}
        presence_threshold = 0.35
        total_score, counted_articles = 0, 0
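        # Row i of sim_matrix corresponds to the i-th article in gdpr_map:
        # dicts preserve insertion order (Python 3.7+), matching the order in
        # which load_gdpr_articles embedded the articles.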
        for i, (art_num, art_data) in enumerate(gdpr_map.items()):
            max_sim = np.max(sim_matrix[i])
            best_idx = np.argmax(sim_matrix[i])
            if max_sim < presence_threshold:
                continue
            score_pct = min(100, max(0, (max_sim - presence_threshold) / (1 - presence_threshold) * 100))
            article_scores[art_num] = {
                "article_title": art_data["title"],
                "compliance_percentage": round(score_pct, 2),
                "similarity_score": round(max_sim, 4),
                "matched_text_snippet": chunks[best_idx][:300] + "...",
            }
            total_score += score_pct
            counted_articles += 1
        overall = round(total_score / counted_articles, 2) if counted_articles else 0
        return {
            "overall_compliance_percentage": overall,
            "relevant_articles_analyzed": counted_articles,
            "total_policy_chunks": len(chunks),
            "article_scores": article_scores,
        }

# ---------------------------
# Streamlit interface
# ---------------------------
st.set_page_config(page_title="GDPR Compliance Checker", layout="wide")
st.title("🛡️ GDPR Compliance Checker")
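
# Cache the model across Streamlit reruns: Streamlit re-executes this script
# on every interaction, and reloading the BERT weights each time is slow.
# This cached loader is an added sketch (the original script constructed
# GDPRComplianceChecker inline); st.cache_resource memoizes the returned
# object for the lifetime of the server process.
@st.cache_resource
def load_checker():
    return GDPRComplianceChecker()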
# Fixed file paths
gdpr_path = "/app/src/gdpr_articles_baseline.json"
policy_path = "/app/src/sephora_com_policy.txt"
# Load the input data
with open(gdpr_path, "r", encoding="utf-8") as f:
    gdpr_data = json.load(f)
with open(policy_path, "r", encoding="utf-8") as f:
    policy_text = f.read()
# Run the analysis automatically on startup
with st.spinner("Analyzing using LegalBERT (EUR-Lex)..."):
    checker = load_checker()
    gdpr_map, gdpr_embeddings = checker.load_gdpr_articles(gdpr_data)
    result = checker.calculate_compliance_score(policy_text, gdpr_map, gdpr_embeddings)
# Display the results (calculate_compliance_score returns an "error" key
# instead of scores when the policy yields no usable chunks)
if "error" in result:
    st.error(result["error"])
else:
    st.subheader(f"✅ Overall Compliance Score: {result['overall_compliance_percentage']}%")
    st.markdown("---")
    st.subheader("📋 Detailed Article Breakdown")
    for art_num, data in sorted(result["article_scores"].items(), key=lambda x: -x[1]["compliance_percentage"]):
        with st.expander(f"Article {art_num} - {data['article_title']} ({data['compliance_percentage']}%)"):
            st.write(f"**Similarity Score**: {data['similarity_score']}")
            st.write(f"**Matched Text**:\n\n{data['matched_text_snippet']}")