ludigija committed on
Commit
2c5182f
·
verified ·
1 Parent(s): 048c8c7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +328 -99
app.py CHANGED
@@ -1,105 +1,334 @@
1
  import streamlit as st
 
 
 
 
 
2
  import difflib
3
- from sentence_transformers import SentenceTransformer
4
  from sklearn.feature_extraction.text import TfidfVectorizer
5
  from sklearn.metrics.pairwise import cosine_similarity
6
- from xhtml2pdf import pisa
7
- import base64
8
- import os
9
- from io import BytesIO
10
-
11
- # Load SBERT model
12
- sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
13
-
14
- def compute_sbert_similarity(text1, text2):
15
- emb1 = sbert_model.encode([text1])[0]
16
- emb2 = sbert_model.encode([text2])[0]
17
- score = cosine_similarity([emb1], [emb2])[0][0]
18
- return score
19
-
20
- def compute_tfidf_similarity(text1, text2):
21
- vectorizer = TfidfVectorizer().fit([text1, text2])
22
- vectors = vectorizer.transform([text1, text2])
23
- return cosine_similarity(vectors[0:1], vectors[1:2])[0][0]
24
-
25
- def html_diff(a, b):
26
- differ = difflib.HtmlDiff()
27
- return differ.make_table(a.splitlines(), b.splitlines(), fromdesc='Original', todesc='Modified', context=True, numlines=2)
28
-
29
- def convert_html_to_pdf(source_html):
30
- pdf_file = BytesIO()
31
- pisa_status = pisa.CreatePDF(source_html, dest=pdf_file)
32
- if pisa_status.err:
33
- return None
34
- return pdf_file.getvalue()
35
-
36
- def create_download_link(pdf_data, filename="report.pdf"):
37
- b64 = base64.b64encode(pdf_data).decode()
38
- href = f'<a href="data:application/pdf;base64,{b64}" download="{filename}">Download PDF Report</a>'
39
- return href
40
-
41
- st.set_page_config(layout="wide")
42
- st.title("Advanced Document Comparison Tool with Semantic and Syntactic Analysis")
43
-
44
- col1, col2 = st.columns(2)
45
-
46
- with col1:
47
- uploaded_file1 = st.file_uploader("Upload Original Document", type=["txt", "md"])
48
- with col2:
49
- uploaded_file2 = st.file_uploader("Upload Modified Document", type=["txt", "md"])
50
-
51
- if uploaded_file1 and uploaded_file2:
52
- original_text = uploaded_file1.read().decode("utf-8")
53
- modified_text = uploaded_file2.read().decode("utf-8")
54
-
55
- sbert_score = compute_sbert_similarity(original_text, modified_text)
56
- tfidf_score = compute_tfidf_similarity(original_text, modified_text)
57
-
58
- html_comparison = html_diff(original_text, modified_text)
59
-
60
- st.markdown("### 🔍 Similarity Scores")
61
- st.markdown(f"**SBERT Semantic Similarity:** {sbert_score:.4f}")
62
- st.markdown(f"**TF-IDF Syntactic Similarity:** {tfidf_score:.4f}")
63
-
64
- st.markdown("### 📑 Comparison Result")
65
-
66
- html_report = f'''
67
- <html>
68
- <head>
69
- <style>
70
- .diff {{ font-family: Courier; border: 1px solid #ccc; overflow-x: scroll; }}
71
- .diff th, .diff td {{ padding: 5px; }}
72
- iframe {{ width: 100%; height: 600px; border: none; }}
73
- </style>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  <script>
75
- window.addEventListener("DOMContentLoaded", () => {{
76
- const iframes = document.querySelectorAll("iframe");
77
- if (iframes.length === 2) {{
78
- const syncScroll = (e) => {{
79
- iframes.forEach((frame) => {{
80
- if (frame !== e.target) {{
81
- frame.contentWindow.scrollTo(0, e.target.scrollTop);
82
- }}
83
- }});
84
- }};
85
- iframes.forEach((iframe) => {{
86
- iframe.contentWindow.onscroll = syncScroll;
87
- }});
88
- }}
89
- }});
90
  </script>
91
- </head>
92
- <body>
93
- {html_comparison}
94
- </body>
95
- </html>
96
- '''
97
-
98
- st.components.v1.html(html_report, height=700, scrolling=True)
99
-
100
- if st.button("Generate PDF Report"):
101
- pdf_bytes = convert_html_to_pdf(html_report)
102
- if pdf_bytes:
103
- st.markdown(create_download_link(pdf_bytes), unsafe_allow_html=True)
104
- else:
105
- st.error(" Failed to generate PDF. Check for HTML formatting issues.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
+ from predict import run_prediction
3
+ from io import StringIO
4
+ import PyPDF4
5
+ import docx2txt
6
+ import pdfplumber
7
  import difflib
 
8
  from sklearn.feature_extraction.text import TfidfVectorizer
9
  from sklearn.metrics.pairwise import cosine_similarity
10
+ import streamlit.components.v1 as components
11
+
12
+ # ========== CONFIGURATION ==========
13
st.set_page_config(layout="wide", page_title="Contract Analysis Suite", page_icon="📑")

# Seed the per-session result slots with None when they are missing, so the
# rest of the script can test them for truthiness without a key check.
for _result_key in ('comparison_results', 'analysis_results'):
    if _result_key not in st.session_state:
        st.session_state[_result_key] = None
24
+
25
+ # ========== CACHED DATA LOADING ==========
26
@st.cache_data(show_spinner=False)
def load_questions():
    """Return the non-blank lines of ``data/questions.txt``, stripped.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default text encoding.

    Returns:
        list[str]: One entry per non-blank line, in file order. An empty list
        is returned (after reporting the problem through ``st.error``) when
        the file cannot be opened or decoded.
    """
    try:
        with open('data/questions.txt', encoding='utf-8') as f:
            # Iterate the handle directly instead of materialising readlines().
            return [line.strip() for line in f if line.strip()]
    except (OSError, UnicodeError) as e:
        st.error(f"Error loading questions: {str(e)}")
        return []
34
+
35
@st.cache_data(show_spinner=False)
def load_questions_short():
    """Return the non-blank lines of ``data/questions_short.txt``, stripped.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default text encoding.

    Returns:
        list[str]: One entry per non-blank line, in file order. An empty list
        is returned (after reporting the problem through ``st.error``) when
        the file cannot be opened or decoded.
    """
    try:
        with open('data/questions_short.txt', encoding='utf-8') as f:
            # Iterate the handle directly instead of materialising readlines().
            return [line.strip() for line in f if line.strip()]
    except (OSError, UnicodeError) as e:
        st.error(f"Error loading short questions: {str(e)}")
        return []
43
+
44
+ # ========== UTILITY FUNCTIONS ==========
45
def extract_text_from_pdf(uploaded_file):
    """Extract the text of every page of a PDF with pdfplumber.

    Each page contributes its text followed by a blank-line separator. A page
    that yields no text contributes only the separator, so one empty page no
    longer aborts extraction for the whole document.

    Args:
        uploaded_file: Whatever ``pdfplumber.open`` accepts; the caller in
            this file passes the uploaded file object.

    Returns:
        str: The concatenated page texts, or ``""`` when nothing but
        whitespace was extracted or when extraction raised (the error is
        reported through ``st.error``).
    """
    try:
        with pdfplumber.open(uploaded_file) as pdf:
            page_texts = []
            for page in pdf.pages:
                # extract_text_formatted is treated as optional: use it when
                # the page object provides it, otherwise use extract_text().
                formatted = getattr(page, "extract_text_formatted", None)
                text = formatted() if callable(formatted) else None
                if not text:
                    # NOTE(review): extract_text() may give None or "" for a
                    # page without a text layer (version-dependent - confirm);
                    # `or ""` keeps the concatenation below from raising.
                    text = page.extract_text() or ""
                page_texts.append(text + "\n\n")  # blank line between pages
            full_text = "".join(page_texts)
            return full_text if full_text.strip() else ""
    except Exception as e:
        st.error(f"PDF extraction error: {str(e)}")
        return ""
62
+
63
+
64
+
65
def highlight_differences_words(text1, text2):
    """Build word-level HTML highlighting for two texts.

    Both texts are split on whitespace and aligned with
    ``difflib.SequenceMatcher``. Words are HTML-escaped before being placed
    in the output, because callers render the result as raw HTML.

    Colour scheme (unchanged from the previous implementation):
        * ``#ffcccc`` - word present only in ``text1`` (removed / replaced).
        * ``#ffffcc`` - word in ``text2`` that replaces words of ``text1``.
        * ``#ccffcc`` - word present only in ``text2`` (pure insertion).

    Args:
        text1: Original text.
        text2: Modified text.

    Returns:
        tuple[str, str]: HTML for ``text1`` and HTML for ``text2``. Words are
        joined by single spaces; original line breaks are not preserved
        because the comparison works on whitespace-split words.
    """
    import html  # local import: the module is not imported at file level

    def _mark(word, color):
        # Wrap one escaped word in a coloured inline-block span.
        return (
            f'<span style="background-color:{color}; display: inline-block;">'
            f'{html.escape(word)}</span>'
        )

    words1 = text1.split()
    words2 = text2.split()
    matcher = difflib.SequenceMatcher(None, words1, words2)

    parts1 = []
    parts2 = []
    # Opcodes classify each run as equal / delete / insert / replace, which
    # avoids parsing Differ's textual output (and its "? " hint lines).
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == "equal":
            shared = [html.escape(word) for word in words1[i1:i2]]
            parts1.extend(shared)
            parts2.extend(shared)
        elif tag == "delete":
            parts1.extend(_mark(word, "#ffcccc") for word in words1[i1:i2])
        elif tag == "insert":
            parts2.extend(_mark(word, "#ccffcc") for word in words2[j1:j2])
        else:  # "replace"
            parts1.extend(_mark(word, "#ffcccc") for word in words1[i1:i2])
            parts2.extend(_mark(word, "#ffffcc") for word in words2[j1:j2])

    return " ".join(parts1), " ".join(parts2)
98
def calculate_similarity(text1, text2):
    """Score how similar two texts are, as a percentage.

    The score is the cosine similarity of the two texts' TF-IDF vectors
    multiplied by 100. If vectorisation raises ``ValueError``, the
    character-level ``difflib.SequenceMatcher`` ratio (also multiplied by
    100) is used instead.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        ``0.0`` when either text is empty or whitespace-only, otherwise the
        percentage score.
    """
    both_have_content = bool(text1.strip()) and bool(text2.strip())
    if not both_have_content:
        return 0.0

    documents = [text1, text2]
    try:
        # The pattern keeps single-character tokens as well.
        matrix = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b').fit_transform(documents)
        score = cosine_similarity(matrix[0:1], matrix[1:2])[0][0]
    except ValueError:
        score = difflib.SequenceMatcher(None, text1, text2).ratio()
    return score * 100
109
+
110
def load_contract(file):
    """Read an uploaded document and return its text, stripped.

    The handler is chosen from the file-name extension (case-insensitive):
    ``txt`` is decoded as UTF-8, ``pdf`` goes through
    ``extract_text_from_pdf`` with PyPDF4 as a fallback when that yields
    nothing, and ``docx`` goes through ``docx2txt``.

    Args:
        file: Uploaded file object exposing ``name`` and, depending on the
            type, ``getvalue()`` / a readable binary stream; ``None`` is
            accepted.

    Returns:
        str: The stripped text, or ``""`` when ``file`` is ``None``, the
        extension is unsupported (``st.warning`` is shown), no text was
        found, or reading raised (``st.error`` is shown).
    """
    if file is None:
        return ""

    ext = file.name.rsplit('.', 1)[-1].lower()
    try:
        if ext == 'txt':
            # "utf-8-sig" decodes plain UTF-8 too, but drops a leading BOM;
            # str.strip() would leave it attached to the first word.
            content = file.getvalue().decode("utf-8-sig")
        elif ext == 'pdf':
            content = extract_text_from_pdf(file)
            if not content:
                # Fallback to PyPDF4. Rewind first: the previous extractor
                # has already consumed the stream.
                file.seek(0)
                reader = PyPDF4.PdfFileReader(file)
                page_texts = (page.extractText() for page in reader.pages)
                content = "".join(text + "\n\n" for text in page_texts if text)
        elif ext == 'docx':
            content = docx2txt.process(file)
        else:
            st.warning('Unsupported file type')
            return ""
        return content.strip() if content else ""
    except Exception as e:
        st.error(f"Error loading {ext.upper()} file: {str(e)}")
        return ""
138
+
139
+ # ========== MAIN APP ==========
140
def _pane_html(element_id, inner_html, height_rule):
    """Return the bordered, scrollable monospace <div> used for document panes."""
    return (
        '<div style="border:1px solid #ccc; padding:10px; white-space: pre-wrap; '
        f'font-family: monospace; font-size: 0.9em; {height_rule} overflow-y: auto;" '
        f'id="{element_id}">{inner_html}</div>'
    )


def _scroll_sync_script(first_id, second_id):
    """Return a <script> snippet that mirrors vertical scrolling between two panes."""
    return f"""
    <script>
    (function() {{
        // components.html renders this snippet inside its own iframe, so the
        // panes written with st.markdown live in the parent document.
        var hostDocument = window.parent.document;
        function link(source, target) {{
            source.addEventListener('scroll', function() {{
                target.scrollTop = source.scrollTop;
            }});
        }}
        // Small delay so the panes exist before they are looked up.
        setTimeout(function() {{
            var first = hostDocument.getElementById('{first_id}');
            var second = hostDocument.getElementById('{second_id}');
            if (first && second) {{
                link(first, second);
                link(second, first);
            }}
        }}, 100);
    }})();
    </script>
    """


def main():
    """Render the three-step contract analysis page.

    1. Upload two documents and show their extracted text side by side.
    2. On request, compute a similarity score and word-level highlighting,
       stored in ``st.session_state.comparison_results``.
    3. On request, run the selected question against both documents with
       ``run_prediction`` and store the answers in
       ``st.session_state.analysis_results`` under ``'doc1'`` / ``'doc2'``.

    The function returns early (after showing a message) when the question
    files are unusable, when fewer than two documents are uploaded, or when
    a requested step finds a document without readable text.
    """
    import html  # local import: the module is not imported at file level

    questions = load_questions()
    questions_short = load_questions_short()

    if not questions or not questions_short or len(questions) != len(questions_short):
        st.error("Failed to load questions or questions mismatch. Please check data files.")
        return

    st.title("📑 Contract Analysis Suite")
    st.markdown("""
    Compare documents and analyze legal clauses using AI-powered question answering.
    """)

    # ===== DOCUMENT UPLOAD SECTION =====
    st.header("1. Upload Documents")
    col1, col2 = st.columns(2)

    with col1:
        uploaded_file1 = st.file_uploader(
            "Upload First Document",
            type=["txt", "pdf", "docx"],
            key="file1",
        )
        contract_text1 = load_contract(uploaded_file1) if uploaded_file1 else ""
        doc1_container = st.empty()

    with col2:
        uploaded_file2 = st.file_uploader(
            "Upload Second Document",
            type=["txt", "pdf", "docx"],
            key="file2",
        )
        contract_text2 = load_contract(uploaded_file2) if uploaded_file2 else ""
        doc2_container = st.empty()

    # Show the extracted text. It is escaped because it comes from an
    # uploaded file and is rendered with unsafe_allow_html.
    if uploaded_file1:
        doc1_container.markdown(
            _pane_html("doc1_text", html.escape(contract_text1), "height: 400px;"),
            unsafe_allow_html=True,
        )
    if uploaded_file2:
        doc2_container.markdown(
            _pane_html("doc2_text", html.escape(contract_text2), "height: 400px;"),
            unsafe_allow_html=True,
        )

    # Synchronized scrolling of the initial document panes
    components.html(_scroll_sync_script('doc1_text', 'doc2_text'), height=0)

    if not (uploaded_file1 and uploaded_file2):
        st.warning("Please upload both documents to proceed")
        return

    # ===== DOCUMENT COMPARISON SECTION =====
    st.header("2. Document Comparison")

    with st.expander("Show Document Differences", expanded=True):
        if st.button("Compare Documents"):
            with st.spinner("Analyzing documents..."):
                if not contract_text1.strip() or not contract_text2.strip():
                    st.error("One or both documents appear to be empty or couldn't be read properly")
                    return

                similarity_score = calculate_similarity(contract_text1, contract_text2)
                highlighted_diff1, highlighted_diff2 = highlight_differences_words(
                    contract_text1, contract_text2
                )
                st.session_state.comparison_results = {
                    'similarity_score': similarity_score,
                    'highlighted_diff1': highlighted_diff1,
                    'highlighted_diff2': highlighted_diff2,
                }

        # Display comparison results (kept in session state so they survive reruns)
        comparison = st.session_state.comparison_results
        if comparison:
            st.metric("Document Similarity Score", f"{comparison['similarity_score']:.2f}%")

            if comparison['similarity_score'] <= 70:
                st.warning("Significant differences detected")

            st.markdown("**Visual Difference Highlighting:**")

            col1_diff, col2_diff = st.columns(2)
            with col1_diff:
                st.markdown("### Original Document")
                # The highlighted strings are already HTML, so they are inserted as-is.
                st.markdown(
                    _pane_html("diff1_text", comparison["highlighted_diff1"], "max-height: 500px;"),
                    unsafe_allow_html=True,
                )
            with col2_diff:
                st.markdown("### Modified Document")
                st.markdown(
                    _pane_html("diff2_text", comparison["highlighted_diff2"], "max-height: 500px;"),
                    unsafe_allow_html=True,
                )

            # Synchronized scrolling of the difference panes
            components.html(_scroll_sync_script('diff1_text', 'diff2_text'), height=0)

    # ===== QUESTION ANALYSIS SECTION =====
    st.header("3. Clause Analysis")

    try:
        question_selected = st.selectbox(
            'Select a legal question to analyze:',
            questions_short,
            index=0,
            key="question_select",
        )
        # The short label and the full question share the same position.
        question_idx = questions_short.index(question_selected)
        selected_question = questions[question_idx]
    except Exception as e:
        st.error(f"Error selecting question: {str(e)}")
        return

    if st.button("Analyze Both Documents"):
        if not (contract_text1.strip() and contract_text2.strip()):
            st.error("Please ensure both documents have readable content")
            return

        col1_analysis, col2_analysis = st.columns(2)
        targets = (
            (col1_analysis, "First Document Analysis",
             'Processing first document...', 'doc1', contract_text1),
            (col2_analysis, "Second Document Analysis",
             'Processing second document...', 'doc2', contract_text2),
        )
        for column, heading, spinner_text, result_key, contract_text in targets:
            with column:
                st.subheader(heading)
                with st.spinner(spinner_text):
                    try:
                        predictions = run_prediction(
                            [selected_question],
                            contract_text,
                            'marshmellow77/roberta-base-cuad',
                            n_best_size=5,
                        )
                        answer = predictions.get('0', 'No answer found')
                        result = answer if answer else "No relevant clause found"
                    except Exception as e:
                        # A failure on one document is recorded as its result
                        # so the other document is still analysed.
                        result = f"Analysis failed: {str(e)}"
                    st.session_state.analysis_results = st.session_state.analysis_results or {}
                    st.session_state.analysis_results[result_key] = result

    # Display analysis results
    if st.session_state.analysis_results:
        col1_answer, col2_answer = st.columns(2)
        with col1_answer:
            st.subheader("First Document Analysis")
            st.success(st.session_state.analysis_results.get('doc1', 'No analysis performed yet'))

        with col2_answer:
            st.subheader("Second Document Analysis")
            st.success(st.session_state.analysis_results.get('doc2', 'No analysis performed yet'))


if __name__ == "__main__":
    main()