import json
import re

import numpy as np
import streamlit as st
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel


# ---------------------------
# LegalBERT-based compliance checker
# ---------------------------
class GDPRComplianceChecker:
    def __init__(self, model_name="nlpaueb/bert-base-uncased-eurlex"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device).eval()

    def get_embeddings(self, texts):
        """Embed each text as the [CLS] token of the final hidden layer."""
        embeddings = []
        for text in texts:
            inputs = self.tokenizer(text, return_tensors="pt", truncation=True,
                                    padding=True, max_length=512)
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
            with torch.no_grad():
                output = self.model(**inputs)
            embedding = output.last_hidden_state[:, 0, :].cpu().numpy()
            embeddings.append(embedding[0])
        return np.array(embeddings)

    def chunk_policy_text(self, text, chunk_size=500):
        """Split the policy into roughly chunk_size-character chunks on
        paragraph and sentence boundaries; drop fragments of 50 characters
        or fewer."""
        paragraphs = re.split(r'\n{2,}|\.\s+', text)
        chunks, current = [], ""
        for para in paragraphs:
            if len(current) + len(para) < chunk_size:
                current += " " + para
            else:
                chunks.append(current.strip())
                current = para
        if current:
            chunks.append(current.strip())
        return [chunk for chunk in chunks if len(chunk) > 50]

    def load_gdpr_articles(self, gdpr_json):
        """Build a lookup of article metadata and embed each article's full text."""
        gdpr_map, texts = {}, []
        for article in gdpr_json:
            number, title = article["article_number"], article["article_title"]
            body = " ".join(f"{k} {v}" for sec in article["sections"] for k, v in sec.items())
            full_text = f"Article {number}: {title}. {body}"
            gdpr_map[number] = {"title": title, "text": full_text}
            texts.append(full_text)
        embeddings = self.get_embeddings(texts)
        return gdpr_map, embeddings

    def calculate_compliance_score(self, policy_text, gdpr_map, gdpr_embeddings):
        chunks = self.chunk_policy_text(policy_text)
        if not chunks:
            return {"error": "Policy has no meaningful chunks."}
        chunk_embeddings = self.get_embeddings(chunks)
        # Row i corresponds to the i-th article in gdpr_map's insertion
        # order, which matches the order used to build gdpr_embeddings.
        sim_matrix = cosine_similarity(gdpr_embeddings, chunk_embeddings)
        article_scores = {}
        presence_threshold = 0.35  # below this, the article is treated as not addressed
        total_score, counted_articles = 0, 0
        for i, (art_num, art_data) in enumerate(gdpr_map.items()):
            max_sim = np.max(sim_matrix[i])
            best_idx = np.argmax(sim_matrix[i])
            if max_sim < presence_threshold:
                continue
            # Rescale similarity from [presence_threshold, 1] to [0, 100].
            score_pct = min(100, max(0, (max_sim - presence_threshold) / (1 - presence_threshold) * 100))
            article_scores[art_num] = {
                "article_title": art_data["title"],
                "compliance_percentage": round(score_pct, 2),
                "similarity_score": round(float(max_sim), 4),
                "matched_text_snippet": chunks[best_idx][:300] + "...",
            }
            total_score += score_pct
            counted_articles += 1
        overall = round(total_score / counted_articles, 2) if counted_articles else 0
        return {
            "overall_compliance_percentage": overall,
            "relevant_articles_analyzed": counted_articles,
            "total_policy_chunks": len(chunks),
            "article_scores": article_scores,
        }


# ---------------------------
# Streamlit interface
# ---------------------------
st.set_page_config(page_title="GDPR Compliance Checker", layout="wide")
st.title("🛡️ GDPR Compliance Checker")

# Fixed file paths
gdpr_path = "/app/src/gdpr_articles_baseline.json"
policy_path = "/app/src/sephora_com_policy.txt"

# Load the data
with open(gdpr_path, "r", encoding="utf-8") as f:
    gdpr_data = json.load(f)
with open(policy_path, "r", encoding="utf-8") as f:
    policy_text = f.read()

# Run the analysis automatically
with st.spinner("Analyzing using LegalBERT (EURLEX)..."):
    checker = GDPRComplianceChecker()
    gdpr_map, gdpr_embeddings = checker.load_gdpr_articles(gdpr_data)
    result = checker.calculate_compliance_score(policy_text, gdpr_map, gdpr_embeddings)

# Display the results
if "error" in result:
    st.error(result["error"])
else:
    st.subheader(f"✅ Overall Compliance Score: {result['overall_compliance_percentage']}%")
    st.markdown("---")
    st.subheader("📋 Detailed Article Breakdown")
    for art_num, data in sorted(result['article_scores'].items(),
                                key=lambda x: -x[1]['compliance_percentage']):
        with st.expander(f"Article {art_num} - {data['article_title']} ({data['compliance_percentage']}%)"):
            st.write(f"**Similarity Score**: {data['similarity_score']}")
            st.write(f"**Matched Text**:\n\n{data['matched_text_snippet']}")
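
# ---------------------------
# Expected input schema: a sketch inferred from load_gdpr_articles. The
# field names come from the code above; the example values are
# illustrative only. gdpr_articles_baseline.json is assumed to be a JSON
# list of article objects, e.g.:
#
# [
#   {
#     "article_number": "5",
#     "article_title": "Principles relating to processing of personal data",
#     "sections": [
#       {"1(a)": "Personal data shall be processed lawfully, fairly and in a transparent manner ..."}
#     ]
#   }
# ]
#
# sephora_com_policy.txt is assumed to be the privacy policy as plain
# UTF-8 text.
# ---------------------------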
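
# ---------------------------
# Usage sketch: launch the app with Streamlit (replace app.py with the
# actual filename):
#
#   streamlit run app.py
#
# Note that the model and the GDPR embeddings are rebuilt on every
# Streamlit rerun. If your Streamlit version provides st.cache_resource
# (1.18+), a cached loader avoids this, e.g.:
#
#   @st.cache_resource
#   def get_checker():
#       return GDPRComplianceChecker()
# ---------------------------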