Update src/streamlit_app.py
Browse files- src/streamlit_app.py +26 -98
src/streamlit_app.py
CHANGED
@@ -89,107 +89,35 @@ class GDPRComplianceChecker:
|
|
89 |
"article_scores": article_scores
|
90 |
}
|
91 |
|
92 |
-
|
93 |
-
def chunk_policy_text(text, chunk_size=500, min_chunk_len=50):
    """Split a policy document into roughly ``chunk_size``-character chunks.

    The text is first split into candidate paragraphs on blank lines or
    sentence boundaries, then greedily packed into chunks whose length
    stays under ``chunk_size`` characters.

    Args:
        text: Raw policy text to segment.
        chunk_size: Soft upper bound on the length of each packed chunk.
        min_chunk_len: Chunks of this length or shorter are discarded as
            noise (headings, stray fragments). The default of 50 preserves
            the previously hard-coded threshold.

    Returns:
        List of non-trivial text chunks, in document order.
    """
    import re  # local import kept so module-level imports stay untouched

    # Paragraph boundaries: two-or-more newlines, or a sentence-ending
    # period followed by whitespace.
    paragraphs = re.split(r'\n{2,}|\.\s+', text)

    chunks, current = [], ""
    for para in paragraphs:
        # Greedy packing: extend the current chunk while it stays under
        # the size budget, otherwise flush it and start a new one.
        if len(current) + len(para) < chunk_size:
            current += " " + para
        else:
            chunks.append(current.strip())
            current = para
    if current:
        chunks.append(current.strip())

    # Drop very short chunks - typically headings or split artifacts.
    # (An oversized first paragraph flushes an empty string above; the
    # length filter removes it here, as in the original.)
    return [chunk for chunk in chunks if len(chunk) > min_chunk_len]
|
106 |
-
|
107 |
-
|
108 |
# ---------------------------
|
109 |
# Streamlit interface
|
110 |
# ---------------------------
|
111 |
st.set_page_config(page_title="GDPR Compliance Checker", layout="wide")
|
112 |
st.title("π‘οΈ GDPR Compliance Checker")
|
113 |
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
)
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
else:
|
141 |
-
model = joblib.load("multinomialNB_model.joblib")
|
142 |
-
vectorizer = joblib.load("multinomialNB_vectorizer.joblib")
|
143 |
-
|
144 |
-
chunks = chunk_policy_text(policy_text)
|
145 |
-
chunks = [c.strip() for c in chunks if len(c.strip()) > 40]
|
146 |
-
X_tfidf = vectorizer.transform(chunks)
|
147 |
-
y_pred = model.predict(X_tfidf)
|
148 |
-
y_proba = model.predict_proba(X_tfidf)
|
149 |
-
|
150 |
-
article_scores = defaultdict(lambda: {
|
151 |
-
"article_title": "",
|
152 |
-
"compliance_percentage": 0.0,
|
153 |
-
"similarity_score": 0.0,
|
154 |
-
"matched_text_snippet": ""
|
155 |
-
})
|
156 |
-
total_score = 0
|
157 |
-
counted_chunks = 0
|
158 |
-
|
159 |
-
for i, (label, prob_vector) in enumerate(zip(y_pred, y_proba)):
|
160 |
-
max_prob = max(prob_vector)
|
161 |
-
if max_prob >= 0.35:
|
162 |
-
score_pct = min(100.0, max(0.0, (max_prob - 0.35) / (1 - 0.35) * 100))
|
163 |
-
if score_pct > article_scores[label]["compliance_percentage"]:
|
164 |
-
article_scores[label]["compliance_percentage"] = score_pct
|
165 |
-
article_scores[label]["similarity_score"] = round(max_prob, 4)
|
166 |
-
article_scores[label]["matched_text_snippet"] = chunks[i][:300] + "..."
|
167 |
-
article_scores[label]["article_title"] = article_title_map.get(label, label)
|
168 |
-
total_score += score_pct
|
169 |
-
counted_chunks += 1
|
170 |
-
|
171 |
-
overall = round(total_score / counted_chunks, 2) if counted_chunks else 0
|
172 |
-
result = {
|
173 |
-
"overall_compliance_percentage": overall,
|
174 |
-
"relevant_articles_analyzed": len(article_scores),
|
175 |
-
"total_policy_chunks": len(chunks),
|
176 |
-
"article_scores": dict(article_scores)
|
177 |
-
}
|
178 |
-
|
179 |
-
elif model_choice == "Knowledge Graphs":
|
180 |
-
st.warning("Knowledge Graphs model is not implemented yet.")
|
181 |
-
result = {}
|
182 |
-
|
183 |
-
else:
|
184 |
-
result = {}
|
185 |
-
|
186 |
-
if result:
|
187 |
-
st.subheader(f"β
Overall Compliance Score: {result['overall_compliance_percentage']}%")
|
188 |
-
st.markdown("---")
|
189 |
-
st.subheader("π Detailed Article Breakdown")
|
190 |
-
for art_num, data in sorted(result['article_scores'].items(), key=lambda x: -x[1]['compliance_percentage']):
|
191 |
-
with st.expander(f"Article {art_num} - {data['article_title']} ({data['compliance_percentage']}%)"):
|
192 |
-
st.write(f"**Similarity Score**: {data['similarity_score']}")
|
193 |
-
st.write(f"**Matched Text**:\n\n{data['matched_text_snippet']}")
|
194 |
-
else:
|
195 |
-
st.info("Please upload both a GDPR JSON file and a company policy text file to begin.")
|
|
|
89 |
"article_scores": article_scores
|
90 |
}
|
91 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
92 |
# ---------------------------
# Streamlit interface
# ---------------------------
# NOTE(review): emoji glyphs below appear mojibake'd in the scraped
# source ("π‘οΈ", "β", "π") - verify the original UTF-8 emojis before
# shipping; they are reproduced here exactly as found.
st.set_page_config(page_title="GDPR Compliance Checker", layout="wide")
st.title("π‘οΈ GDPR Compliance Checker")

# Fixed input paths: the app analyzes a bundled GDPR baseline against a
# bundled example policy instead of asking the user for uploads.
gdpr_path = "gdpr_articles_baseline.json"
policy_path = "sephora_com_policy.txt"

# Load both inputs up front. Fail with a readable in-app error instead
# of an unhandled traceback when a bundled file is missing from the
# deployment (the previous code crashed on a bare open()).
try:
    with open(gdpr_path, "r", encoding="utf-8") as f:
        gdpr_data = json.load(f)
    with open(policy_path, "r", encoding="utf-8") as f:
        policy_text = f.read()
except FileNotFoundError as err:
    st.error(f"Required input file not found: {err.filename}")
    st.stop()

# Run the analysis automatically on page load.
with st.spinner("Analyzing using LegalBERT (Eurlex)..."):
    checker = GDPRComplianceChecker()
    gdpr_map, gdpr_embeddings = checker.load_gdpr_articles(gdpr_data)
    result = checker.calculate_compliance_score(policy_text, gdpr_map, gdpr_embeddings)

# Render results, highest-scoring article first.
if result:
    st.subheader(f"β Overall Compliance Score: {result['overall_compliance_percentage']}%")
    st.markdown("---")
    st.subheader("π Detailed Article Breakdown")
    for art_num, data in sorted(result['article_scores'].items(),
                                key=lambda x: -x[1]['compliance_percentage']):
        with st.expander(f"Article {art_num} - {data['article_title']} ({data['compliance_percentage']}%)"):
            st.write(f"**Similarity Score**: {data['similarity_score']}")
            st.write(f"**Matched Text**:\n\n{data['matched_text_snippet']}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|