ludigija committed on
Commit
f623e18
·
verified ·
1 Parent(s): 4665d41

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +132 -221
app.py CHANGED
@@ -7,22 +7,17 @@ import pdfplumber
7
  import difflib
8
  from sklearn.feature_extraction.text import TfidfVectorizer
9
  from sklearn.metrics.pairwise import cosine_similarity
10
- import streamlit.components.v1 as components
11
 
12
- # ========== CONFIGURATION ==========
13
- st.set_page_config(
14
- layout="wide",
15
- page_title="Contract Analysis Suite",
16
- page_icon="📑"
17
- )
18
 
19
- # Initialize session state variables if they don't exist
20
  if 'comparison_results' not in st.session_state:
21
  st.session_state.comparison_results = None
22
  if 'analysis_results' not in st.session_state:
23
  st.session_state.analysis_results = None
24
 
25
- # ========== CACHED DATA LOADING ==========
26
  @st.cache_data(show_spinner=False)
27
  def load_questions():
28
  try:
@@ -41,293 +36,209 @@ def load_questions_short():
41
  st.error(f"Error loading short questions: {str(e)}")
42
  return []
43
 
44
- # ========== UTILITY FUNCTIONS ==========
45
  def extract_text_from_pdf(uploaded_file):
46
  try:
47
  with pdfplumber.open(uploaded_file) as pdf:
48
  full_text = ""
49
  for page in pdf.pages:
50
  try:
51
- text = page.extract_text_formatted() # Try to get formatted text
52
  except AttributeError:
53
  text = page.extract_text()
54
- if text:
55
- full_text += text + "\n\n" # Add page separator
56
- else:
57
- full_text += page.extract_text() + "\n\n"
58
- return full_text if full_text.strip() else ""
59
  except Exception as e:
60
  st.error(f"PDF extraction error: {str(e)}")
61
  return ""
62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
-
65
  def highlight_differences_words(text1, text2):
66
  differ = difflib.Differ()
67
  diff = list(differ.compare(text1.split(), text2.split()))
68
-
69
- highlighted_text1 = ""
70
- highlighted_text2 = ""
71
-
72
  for i, word in enumerate(diff):
73
  if word.startswith("- "):
74
- removed_word = word[2:]
75
- highlighted_text1 += f'<span style="background-color:#ffcccc; display: inline-block;">{removed_word}</span>'
76
- # Check for corresponding addition to highlight as changed
77
- if i + 1 < len(diff) and diff[i + 1].startswith("+ "):
78
- added_word = diff[i + 1][2:]
79
- highlighted_text2 += f'<span style="background-color:#ffffcc; display: inline-block;">{added_word}</span>' # Yellow for changed in text2
80
- diff[i + 1] = ' ' # Consume the addition
81
  else:
82
- highlighted_text2 += " "
83
  elif word.startswith("+ "):
84
- added_word = word[2:]
85
- highlighted_text2 += f'<span style="background-color:#ccffcc; display: inline-block;">{added_word}</span>'
86
- # Check for corresponding removal
87
- if i - 1 >= 0 and diff[i - 1].startswith("- "):
88
- highlighted_text1 += f'<span style="background-color:#ffffcc; display: inline-block;">{diff[i-1][2:]}</span>' # Yellow for changed in text1
89
  diff[i-1] = ' '
90
  else:
91
- highlighted_text1 += " "
92
-
93
  elif word.startswith(" "):
94
- highlighted_text1 += word[2:] + " "
95
- highlighted_text2 += word[2:] + " "
 
 
96
 
97
- return highlighted_text1, highlighted_text2
98
  def calculate_similarity(text1, text2):
99
  if not text1.strip() or not text2.strip():
100
  return 0.0
101
-
102
  try:
103
  vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b')
104
- tfidf_matrix = vectorizer.fit_transform([text1, text2])
105
- similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
106
- return similarity[0][0] * 100
107
- except ValueError:
108
  return difflib.SequenceMatcher(None, text1, text2).ratio() * 100
109
 
110
- def load_contract(file):
111
- if file is None:
112
- return ""
113
-
114
- ext = file.name.split('.')[-1].lower()
115
- try:
116
- if ext == 'txt':
117
- content = StringIO(file.getvalue().decode("utf-8")).read()
118
- elif ext == 'pdf':
119
- content = extract_text_from_pdf(file)
120
- if not content:
121
- # Fallback to PyPDF4
122
- pdfReader = PyPDF4.PdfFileReader(file)
123
- full_text = ""
124
- for page in pdfReader.pages:
125
- text = page.extractText()
126
- if text:
127
- full_text += text + "\n\n"
128
- content = full_text
129
- elif ext == 'docx':
130
- content = docx2txt.process(file)
131
- else:
132
- st.warning('Unsupported file type')
133
- return ""
134
- return content.strip() if content else ""
135
- except Exception as e:
136
- st.error(f"Error loading {ext.upper()} file: {str(e)}")
137
- return ""
138
-
139
  # ========== MAIN APP ==========
140
  def main():
 
 
 
141
  questions = load_questions()
142
  questions_short = load_questions_short()
143
 
144
  if not questions or not questions_short or len(questions) != len(questions_short):
145
- st.error("Failed to load questions or questions mismatch. Please check data files.")
146
  return
147
 
148
- st.title("📑 Contract Analysis Suite")
149
- st.markdown("""
150
- Compare documents and analyze legal clauses using AI-powered question answering.
151
- """)
152
-
153
- # ===== DOCUMENT UPLOAD SECTION =====
154
  st.header("1. Upload Documents")
155
  col1, col2 = st.columns(2)
156
 
157
  with col1:
158
- uploaded_file1 = st.file_uploader(
159
- "Upload First Document",
160
- type=["txt", "pdf", "docx"],
161
- key="file1"
162
- )
163
- contract_text1 = load_contract(uploaded_file1) if uploaded_file1 else ""
164
- doc1_container = st.empty()
165
 
166
  with col2:
167
- uploaded_file2 = st.file_uploader(
168
- "Upload Second Document",
169
- type=["txt", "pdf", "docx"],
170
- key="file2"
171
- )
172
- contract_text2 = load_contract(uploaded_file2) if uploaded_file2 else ""
173
- doc2_container = st.empty()
174
 
175
- # Update document displays with synchronized scrolling
176
- if uploaded_file1:
177
- doc1_content = f'<div style="border:1px solid #ccc; padding:10px; white-space: pre-wrap; font-family: monospace; font-size: 0.9em; height: 400px; overflow-y: auto;" id="doc1_text">{contract_text1}</div>'
178
- doc1_container.markdown(doc1_content, unsafe_allow_html=True)
179
- if uploaded_file2:
180
- doc2_content = f'<div style="border:1px solid #ccc; padding:10px; white-space: pre-wrap; font-family: monospace; font-size: 0.9em; height: 400px; overflow-y: auto;" id="doc2_text">{contract_text2}</div>'
181
- doc2_container.markdown(doc2_content, unsafe_allow_html=True)
182
 
183
- # JavaScript for synchronized scrolling
184
- scroll_script = """
185
- <script>
186
- function syncScroll(id, otherId) {
187
- var element = document.getElementById(id);
188
- var otherElement = document.getElementById(otherId);
189
- if (element && otherElement) {
190
- element.addEventListener('scroll', function() {
191
- otherElement.scrollTop = element.scrollTop;
192
- });
193
- otherElement.addEventListener('scroll', function() {
194
- element.scrollTop = otherElement.scrollTop;
195
- });
196
- }
197
- }
198
- window.onload = function() {
199
- syncScroll('doc1_text', 'doc2_text');
200
- };
201
- </script>
202
- """
203
- components.html(scroll_script, height=0)
204
-
205
- if not (uploaded_file1 and uploaded_file2):
206
- st.warning("Please upload both documents to proceed")
207
  return
208
 
209
- # ===== DOCUMENT COMPARISON SECTION =====
210
  st.header("2. Document Comparison")
211
-
212
  with st.expander("Show Document Differences", expanded=True):
213
  if st.button("Compare Documents"):
214
- with st.spinner("Analyzing documents..."):
215
- if not contract_text1.strip() or not contract_text2.strip():
216
- st.error("One or both documents appear to be empty or couldn't be read properly")
217
- return
218
-
219
- similarity_score = calculate_similarity(contract_text1, contract_text2)
220
-
221
-
222
- highlighted_diff1, highlighted_diff2 = highlight_differences_words(contract_text1, contract_text2)
223
  st.session_state.comparison_results = {
224
- 'similarity_score': similarity_score,
225
- 'highlighted_diff1': highlighted_diff1,
226
- 'highlighted_diff2': highlighted_diff2,
227
-
228
  }
229
 
230
-
231
- # Display comparison results
232
  if st.session_state.comparison_results:
233
- st.metric("Document Similarity Score",
234
- f"{st.session_state.comparison_results['similarity_score']:.2f}%")
235
-
236
- if st.session_state.comparison_results['similarity_score'] <= 70:
237
- st.warning("Significant differences detected")
238
-
239
- st.markdown("**Visual Difference Highlighting:**")
240
-
241
- col1_diff, col2_diff = st.columns(2)
242
- with col1_diff:
243
- st.markdown("### Original Document")
244
- diff1_content = f'<div style="border:1px solid #ccc; padding:10px; white-space: pre-wrap; font-family: monospace; font-size: 0.9em; max-height: 500px; overflow-y: auto;" id="diff1_text">{st.session_state.comparison_results["highlighted_diff1"]}</div>'
245
- st.markdown(diff1_content, unsafe_allow_html=True)
246
- with col2_diff:
247
- st.markdown("### Modified Document")
248
- diff2_content = f'<div style="border:1px solid #ccc; padding:10px; white-space: pre-wrap; font-family: monospace; font-size: 0.9em; max-height: 500px; overflow-y: auto;" id="diff2_text">{st.session_state.comparison_results["highlighted_diff2"]}</div>'
249
- st.markdown(diff2_content, unsafe_allow_html=True)
250
-
251
- # JavaScript for synchronized scrolling of diff panes
252
- diff_scroll_script = """
253
- <script>
254
- function syncDiffScroll(id, otherId) {
255
- var element = document.getElementById(id);
256
- var otherElement = document.getElementById(otherId);
257
- if (element && otherElement) {
258
- element.addEventListener('scroll', function() {
259
- otherElement.scrollTop = element.scrollTop;
260
- });
261
- otherElement.addEventListener('scroll', function() {
262
- element.scrollTop = otherElement.scrollTop;
263
- });
264
- }
265
- }
266
- window.onload = function() {
267
- syncDiffScroll('diff1_text', 'diff2_text');
268
- };
269
- </script>
270
- """
271
- components.html(diff_scroll_script, height=0)
272
-
273
 
274
- # ===== QUESTION ANALYSIS SECTION =====
275
  st.header("3. Clause Analysis")
276
-
277
  try:
278
- question_selected = st.selectbox(
279
- 'Select a legal question to analyze:',
280
- questions_short,
281
- index=0,
282
- key="question_select"
283
- )
284
- question_idx = questions_short.index(question_selected)
285
- selected_question = questions[question_idx]
286
- except Exception as e:
287
- st.error(f"Error selecting question: {str(e)}")
288
  return
289
 
290
  if st.button("Analyze Both Documents"):
291
- if not (contract_text1.strip() and contract_text2.strip()):
292
- st.error("Please ensure both documents have readable content")
293
  return
294
 
295
- col1_analysis, col2_analysis = st.columns(2)
296
 
297
- with col1_analysis:
298
  st.subheader("First Document Analysis")
299
- with st.spinner('Processing first document...'):
300
  try:
301
- predictions1 = run_prediction([selected_question], contract_text1, 'marshmellow77/roberta-base-cuad', n_best_size=5)
302
- answer1 = predictions1.get('0', 'No answer found')
303
  st.session_state.analysis_results = st.session_state.analysis_results or {}
304
- st.session_state.analysis_results['doc1'] = answer1 if answer1 else "No relevant clause found"
305
  except Exception as e:
306
- st.session_state.analysis_results = st.session_state.analysis_results or {}
307
- st.session_state.analysis_results['doc1'] = f"Analysis failed: {str(e)}"
308
 
309
- with col2_analysis:
310
  st.subheader("Second Document Analysis")
311
- with st.spinner('Processing second document...'):
312
  try:
313
- predictions2 = run_prediction([selected_question], contract_text2, 'marshmellow77/roberta-base-cuad', n_best_size=5)
314
- answer2 = predictions2.get('0', 'No answer found')
315
  st.session_state.analysis_results = st.session_state.analysis_results or {}
316
- st.session_state.analysis_results['doc2'] = answer2 if answer2 else "No relevant clause found"
317
  except Exception as e:
318
- st.session_state.analysis_results = st.session_state.analysis_results or {}
319
- st.session_state.analysis_results['doc2'] = f"Analysis failed: {str(e)}"
320
 
321
- # Display analysis results
322
  if st.session_state.analysis_results:
323
- col1_answer, col2_answer = st.columns(2)
324
- with col1_answer:
325
- st.subheader("First Document Analysis")
326
- st.success(st.session_state.analysis_results.get('doc1', 'No analysis performed yet'))
327
-
328
- with col2_answer:
329
- st.subheader("Second Document Analysis")
330
- st.success(st.session_state.analysis_results.get('doc2', 'No analysis performed yet'))
331
 
332
  if __name__ == "__main__":
333
- main()
 
7
  import difflib
8
  from sklearn.feature_extraction.text import TfidfVectorizer
9
  from sklearn.metrics.pairwise import cosine_similarity
 
10
 
11
+ # ========== CONFIG ==========
12
+ st.set_page_config(layout="wide", page_title="Contract Analysis Suite", page_icon="📑")
 
 
 
 
13
 
14
+ # ========== SESSION STATE ==========
15
  if 'comparison_results' not in st.session_state:
16
  st.session_state.comparison_results = None
17
  if 'analysis_results' not in st.session_state:
18
  st.session_state.analysis_results = None
19
 
20
+ # ========== CACHED HELPERS ==========
21
  @st.cache_data(show_spinner=False)
22
  def load_questions():
23
  try:
 
36
  st.error(f"Error loading short questions: {str(e)}")
37
  return []
38
 
39
+ # ========== FILE PARSING ==========
40
  def extract_text_from_pdf(uploaded_file):
41
  try:
42
  with pdfplumber.open(uploaded_file) as pdf:
43
  full_text = ""
44
  for page in pdf.pages:
45
  try:
46
+ text = page.extract_text_formatted()
47
  except AttributeError:
48
  text = page.extract_text()
49
+ full_text += (text or "") + "\n\n"
50
+ return full_text.strip()
 
 
 
51
  except Exception as e:
52
  st.error(f"PDF extraction error: {str(e)}")
53
  return ""
54
 
55
+ def load_contract(file):
56
+ if not file:
57
+ return ""
58
+ try:
59
+ ext = file.name.split('.')[-1].lower()
60
+ if ext == 'txt':
61
+ return StringIO(file.getvalue().decode("utf-8")).read().strip()
62
+ elif ext == 'pdf':
63
+ content = extract_text_from_pdf(file)
64
+ if not content:
65
+ pdfReader = PyPDF4.PdfFileReader(file)
66
+ return "\n\n".join([p.extractText() for p in pdfReader.pages])
67
+ return content
68
+ elif ext == 'docx':
69
+ return docx2txt.process(file).strip()
70
+ else:
71
+ st.warning("Unsupported file type")
72
+ return ""
73
+ except Exception as e:
74
+ st.error(f"Error loading file: {str(e)}")
75
+ return ""
76
 
77
+ # ========== TEXT UTILS ==========
78
  def highlight_differences_words(text1, text2):
79
  differ = difflib.Differ()
80
  diff = list(differ.compare(text1.split(), text2.split()))
81
+ h1, h2 = "", ""
 
 
 
82
  for i, word in enumerate(diff):
83
  if word.startswith("- "):
84
+ w = word[2:]
85
+ h1 += f'<span style="background-color:#ffcccc;">{w}</span> '
86
+ if i+1 < len(diff) and diff[i+1].startswith("+ "):
87
+ h2 += f'<span style="background-color:#ffffcc;">{diff[i+1][2:]}</span> '
88
+ diff[i+1] = ' '
 
 
89
  else:
90
+ h2 += " "
91
  elif word.startswith("+ "):
92
+ w = word[2:]
93
+ h2 += f'<span style="background-color:#ccffcc;">{w}</span> '
94
+ if i-1 >= 0 and diff[i-1].startswith("- "):
95
+ h1 += f'<span style="background-color:#ffffcc;">{diff[i-1][2:]}</span> '
 
96
  diff[i-1] = ' '
97
  else:
98
+ h1 += " "
 
99
  elif word.startswith(" "):
100
+ w = word[2:] + " "
101
+ h1 += w
102
+ h2 += w
103
+ return h1.strip(), h2.strip()
104
 
 
105
  def calculate_similarity(text1, text2):
106
  if not text1.strip() or not text2.strip():
107
  return 0.0
 
108
  try:
109
  vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b')
110
+ tfidf = vectorizer.fit_transform([text1, text2])
111
+ sim = cosine_similarity(tfidf[0:1], tfidf[1:2])
112
+ return sim[0][0] * 100
113
+ except:
114
  return difflib.SequenceMatcher(None, text1, text2).ratio() * 100
115
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  # ========== MAIN APP ==========
117
  def main():
118
+ st.title("📑 Contract Analysis Suite")
119
+ st.markdown("Compare documents and analyze legal clauses using AI-powered tools.")
120
+
121
  questions = load_questions()
122
  questions_short = load_questions_short()
123
 
124
  if not questions or not questions_short or len(questions) != len(questions_short):
125
+ st.error("Questions failed to load properly.")
126
  return
127
 
 
 
 
 
 
 
128
  st.header("1. Upload Documents")
129
  col1, col2 = st.columns(2)
130
 
131
  with col1:
132
+ file1 = st.file_uploader("Upload First Document", type=["txt", "pdf", "docx"], key="file1")
133
+ text1 = load_contract(file1) if file1 else ""
134
+ display1 = st.empty()
 
 
 
 
135
 
136
  with col2:
137
+ file2 = st.file_uploader("Upload Second Document", type=["txt", "pdf", "docx"], key="file2")
138
+ text2 = load_contract(file2) if file2 else ""
139
+ display2 = st.empty()
 
 
 
 
140
 
141
+ if file1:
142
+ display1.text_area("Document 1 Content", value=text1, height=400, key="area1")
143
+ if file2:
144
+ display2.text_area("Document 2 Content", value=text2, height=400, key="area2")
 
 
 
145
 
146
+ if not (file1 and file2):
147
+ st.warning("Please upload both documents.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  return
149
 
 
150
  st.header("2. Document Comparison")
 
151
  with st.expander("Show Document Differences", expanded=True):
152
  if st.button("Compare Documents"):
153
+ with st.spinner("Analyzing..."):
154
+ sim = calculate_similarity(text1, text2)
155
+ diff1, diff2 = highlight_differences_words(text1, text2)
 
 
 
 
 
 
156
  st.session_state.comparison_results = {
157
+ 'similarity': sim,
158
+ 'diff1': diff1,
159
+ 'diff2': diff2,
 
160
  }
161
 
 
 
162
  if st.session_state.comparison_results:
163
+ sim = st.session_state.comparison_results['similarity']
164
+ st.metric("Document Similarity Score", f"{sim:.2f}%")
165
+
166
+ if sim >= 70:
167
+ st.markdown("### Visual Difference Highlighting")
168
+ sync_scroll_script = """
169
+ <script>
170
+ const left = document.getElementById("left");
171
+ const right = document.getElementById("right");
172
+
173
+ left.onscroll = function() {
174
+ right.scrollTop = left.scrollTop;
175
+ };
176
+ right.onscroll = function() {
177
+ left.scrollTop = right.scrollTop;
178
+ };
179
+ </script>
180
+ """
181
+
182
+ html = f"""
183
+ <div style="display: flex; gap: 20px;">
184
+ <div id="left" style="width: 100%; height: 500px; overflow-y: auto; padding: 10px; font-family: monospace; border: 1px solid #ccc;">
185
+ {st.session_state.comparison_results['diff1']}
186
+ </div>
187
+ <div id="right" style="width: 100%; height: 500px; overflow-y: auto; padding: 10px; font-family: monospace; border: 1px solid #ccc;">
188
+ {st.session_state.comparison_results['diff2']}
189
+ </div>
190
+ </div>
191
+ {sync_scroll_script}
192
+ """
193
+ st.markdown(html, unsafe_allow_html=True)
194
+ else:
195
+ st.warning("Similarity below 70%. Skipping visual diff display.")
 
 
 
 
 
 
 
196
 
197
+ # ========== CLAUSE ANALYSIS ==========
198
  st.header("3. Clause Analysis")
 
199
  try:
200
+ question_short = st.selectbox("Select a legal question to analyze:", questions_short)
201
+ idx = questions_short.index(question_short)
202
+ question = questions[idx]
203
+ except:
204
+ st.error("Error selecting question")
 
 
 
 
 
205
  return
206
 
207
  if st.button("Analyze Both Documents"):
208
+ if not (text1.strip() and text2.strip()):
209
+ st.error("Ensure both documents have content.")
210
  return
211
 
212
+ col1, col2 = st.columns(2)
213
 
214
+ with col1:
215
  st.subheader("First Document Analysis")
216
+ with st.spinner("Processing..."):
217
  try:
218
+ ans1 = run_prediction([question], text1, 'marshmellow77/roberta-base-cuad', n_best_size=5).get('0', 'No answer')
 
219
  st.session_state.analysis_results = st.session_state.analysis_results or {}
220
+ st.session_state.analysis_results['doc1'] = ans1
221
  except Exception as e:
222
+ st.session_state.analysis_results['doc1'] = f"Failed: {e}"
 
223
 
224
+ with col2:
225
  st.subheader("Second Document Analysis")
226
+ with st.spinner("Processing..."):
227
  try:
228
+ ans2 = run_prediction([question], text2, 'marshmellow77/roberta-base-cuad', n_best_size=5).get('0', 'No answer')
 
229
  st.session_state.analysis_results = st.session_state.analysis_results or {}
230
+ st.session_state.analysis_results['doc2'] = ans2
231
  except Exception as e:
232
+ st.session_state.analysis_results['doc2'] = f"Failed: {e}"
 
233
 
 
234
  if st.session_state.analysis_results:
235
+ col1, col2 = st.columns(2)
236
+ with col1:
237
+ st.subheader("First Document Result")
238
+ st.success(st.session_state.analysis_results.get('doc1', 'No analysis yet'))
239
+ with col2:
240
+ st.subheader("Second Document Result")
241
+ st.success(st.session_state.analysis_results.get('doc2', 'No analysis yet'))
 
242
 
243
  if __name__ == "__main__":
244
+ main()