lenawilli committed (verified) · Commit 85ee116 · Parent(s): 9c0bae8

Update src/streamlit_app.py

Files changed (1): src/streamlit_app.py (+26, -98)
src/streamlit_app.py CHANGED
@@ -89,107 +89,35 @@ class GDPRComplianceChecker:
             "article_scores": article_scores
         }
 
-
-def chunk_policy_text(text, chunk_size=500):
-    import re
-    paragraphs = re.split(r'\n{2,}|\.\s+', text)
-    chunks, current = [], ""
-    for para in paragraphs:
-        if len(current) + len(para) < chunk_size:
-            current += " " + para
-        else:
-            chunks.append(current.strip())
-            current = para
-    if current:
-        chunks.append(current.strip())
-    return [chunk for chunk in chunks if len(chunk) > 50]
-
-
 # ---------------------------
 # Streamlit interface
 # ---------------------------
 st.set_page_config(page_title="GDPR Compliance Checker", layout="wide")
 st.title("🛡️ GDPR Compliance Checker")
 
-with st.sidebar:
-    st.header("Upload Files")
-    gdpr_file = st.file_uploader("GDPR JSON File", type=["json"])
-    policy_file = st.file_uploader("Company Policy (.txt)", type=["txt"])
-
-if gdpr_file and policy_file:
-    model_choice = st.selectbox(
-        "Choose the model to use:",
-        ["Logistic Regression", "MultinomialNB", "LegalBERT (Eurlex)", "Knowledge Graphs"]
-    )
-
-    gdpr_data = json.load(gdpr_file)
-    article_title_map = {f"Article {a['article_number']}": a['article_title'] for a in gdpr_data}
-
-    policy_text = policy_file.read().decode("utf-8")
-
-    with st.spinner("Analyzing..."):
-        if model_choice == "LegalBERT (Eurlex)":
-            checker = GDPRComplianceChecker()
-            gdpr_map, gdpr_embeddings = checker.load_gdpr_articles(gdpr_data)
-            result = checker.calculate_compliance_score(policy_text, gdpr_map, gdpr_embeddings)
-
-        elif model_choice in ["Logistic Regression", "MultinomialNB"]:
-            if model_choice == "Logistic Regression":
-                model = joblib.load("logistic_regression_model.joblib")
-                vectorizer = joblib.load("logistic_regression_vectorizer.joblib")
-            else:
-                model = joblib.load("multinomialNB_model.joblib")
-                vectorizer = joblib.load("multinomialNB_vectorizer.joblib")
-
-            chunks = chunk_policy_text(policy_text)
-            chunks = [c.strip() for c in chunks if len(c.strip()) > 40]
-            X_tfidf = vectorizer.transform(chunks)
-            y_pred = model.predict(X_tfidf)
-            y_proba = model.predict_proba(X_tfidf)
-
-            article_scores = defaultdict(lambda: {
-                "article_title": "",
-                "compliance_percentage": 0.0,
-                "similarity_score": 0.0,
-                "matched_text_snippet": ""
-            })
-            total_score = 0
-            counted_chunks = 0
-
-            for i, (label, prob_vector) in enumerate(zip(y_pred, y_proba)):
-                max_prob = max(prob_vector)
-                if max_prob >= 0.35:
-                    score_pct = min(100.0, max(0.0, (max_prob - 0.35) / (1 - 0.35) * 100))
-                    if score_pct > article_scores[label]["compliance_percentage"]:
-                        article_scores[label]["compliance_percentage"] = score_pct
-                        article_scores[label]["similarity_score"] = round(max_prob, 4)
-                        article_scores[label]["matched_text_snippet"] = chunks[i][:300] + "..."
-                        article_scores[label]["article_title"] = article_title_map.get(label, label)
-                    total_score += score_pct
-                    counted_chunks += 1
-
-            overall = round(total_score / counted_chunks, 2) if counted_chunks else 0
-            result = {
-                "overall_compliance_percentage": overall,
-                "relevant_articles_analyzed": len(article_scores),
-                "total_policy_chunks": len(chunks),
-                "article_scores": dict(article_scores)
-            }
-
-        elif model_choice == "Knowledge Graphs":
-            st.warning("Knowledge Graphs model is not implemented yet.")
-            result = {}
-
-        else:
-            result = {}
-
-    if result:
-        st.subheader(f"✅ Overall Compliance Score: {result['overall_compliance_percentage']}%")
-        st.markdown("---")
-        st.subheader("📋 Detailed Article Breakdown")
-        for art_num, data in sorted(result['article_scores'].items(), key=lambda x: -x[1]['compliance_percentage']):
-            with st.expander(f"Article {art_num} - {data['article_title']} ({data['compliance_percentage']}%)"):
-                st.write(f"**Similarity Score**: {data['similarity_score']}")
-                st.write(f"**Matched Text**:\n\n{data['matched_text_snippet']}")
-    else:
-        st.info("Please upload both a GDPR JSON file and a company policy text file to begin.")
+# Fixed file paths
+gdpr_path = "gdpr_articles_baseline.json"
+policy_path = "sephora_com_policy.txt"
+
+# Load the data
+with open(gdpr_path, "r", encoding="utf-8") as f:
+    gdpr_data = json.load(f)
+
+with open(policy_path, "r", encoding="utf-8") as f:
+    policy_text = f.read()
+
+# Automatic analysis
+with st.spinner("Analyzing using LegalBERT (Eurlex)..."):
+    checker = GDPRComplianceChecker()
+    gdpr_map, gdpr_embeddings = checker.load_gdpr_articles(gdpr_data)
+    result = checker.calculate_compliance_score(policy_text, gdpr_map, gdpr_embeddings)
+
+# Display results
+if result:
+    st.subheader(f"✅ Overall Compliance Score: {result['overall_compliance_percentage']}%")
+    st.markdown("---")
+    st.subheader("📋 Detailed Article Breakdown")
+    for art_num, data in sorted(result['article_scores'].items(), key=lambda x: -x[1]['compliance_percentage']):
+        with st.expander(f"Article {art_num} - {data['article_title']} ({data['compliance_percentage']}%)"):
+            st.write(f"**Similarity Score**: {data['similarity_score']}")
+            st.write(f"**Matched Text**:\n\n{data['matched_text_snippet']}")