ludigija committed on
Commit
e731a1a
·
verified ·
1 Parent(s): e0f7f4f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +78 -113
app.py CHANGED
@@ -7,13 +7,14 @@ import pdfplumber
7
  import difflib
8
  from sklearn.feature_extraction.text import TfidfVectorizer
9
  from sklearn.metrics.pairwise import cosine_similarity
10
- import streamlit.components.v1 as components
 
11
 
12
  # ========== CONFIGURATION ==========
13
  st.set_page_config(
14
  layout="wide",
15
  page_title="Contract Analysis Suite",
16
- page_icon="📑"
17
  )
18
 
19
  # Initialize session state variables if they don't exist
@@ -48,11 +49,11 @@ def extract_text_from_pdf(uploaded_file):
48
  full_text = ""
49
  for page in pdf.pages:
50
  try:
51
- text = page.extract_text_formatted() # Try to get formatted text
52
  except AttributeError:
53
  text = page.extract_text()
54
  if text:
55
- full_text += text + "\n\n" # Add page separator
56
  else:
57
  full_text += page.extract_text() + "\n\n"
58
  return full_text if full_text.strip() else ""
@@ -60,8 +61,6 @@ def extract_text_from_pdf(uploaded_file):
60
  st.error(f"PDF extraction error: {str(e)}")
61
  return ""
62
 
63
-
64
-
65
  def highlight_differences_words(text1, text2):
66
  differ = difflib.Differ()
67
  diff = list(differ.compare(text1.split(), text2.split()))
@@ -73,39 +72,64 @@ def highlight_differences_words(text1, text2):
73
  if word.startswith("- "):
74
  removed_word = word[2:]
75
  highlighted_text1 += f'<span style="background-color:#ffcccc; display: inline-block;">{removed_word}</span>'
76
- # Check for corresponding addition to highlight as changed
77
  if i + 1 < len(diff) and diff[i + 1].startswith("+ "):
78
  added_word = diff[i + 1][2:]
79
- highlighted_text2 += f'<span style="background-color:#ffffcc; display: inline-block;">{added_word}</span>' # Yellow for changed in text2
80
- diff[i + 1] = ' ' # Consume the addition
81
  else:
82
  highlighted_text2 += " "
83
  elif word.startswith("+ "):
84
  added_word = word[2:]
85
  highlighted_text2 += f'<span style="background-color:#ccffcc; display: inline-block;">{added_word}</span>'
86
- # Check for corresponding removal
87
  if i - 1 >= 0 and diff[i - 1].startswith("- "):
88
- highlighted_text1 += f'<span style="background-color:#ffffcc; display: inline-block;">{diff[i-1][2:]}</span>' # Yellow for changed in text1
89
  diff[i-1] = ' '
90
  else:
91
  highlighted_text1 += " "
92
-
93
  elif word.startswith(" "):
94
  highlighted_text1 += word[2:] + " "
95
  highlighted_text2 += word[2:] + " "
96
 
97
  return highlighted_text1, highlighted_text2
 
98
  def calculate_similarity(text1, text2):
99
  if not text1.strip() or not text2.strip():
100
  return 0.0
101
 
102
  try:
103
- vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b')
104
- tfidf_matrix = vectorizer.fit_transform([text1, text2])
105
- similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
106
- return similarity[0][0] * 100
107
- except ValueError:
108
- return difflib.SequenceMatcher(None, text1, text2).ratio() * 100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
  def load_contract(file):
111
  if file is None:
@@ -118,7 +142,6 @@ def load_contract(file):
118
  elif ext == 'pdf':
119
  content = extract_text_from_pdf(file)
120
  if not content:
121
- # Fallback to PyPDF4
122
  pdfReader = PyPDF4.PdfFileReader(file)
123
  full_text = ""
124
  for page in pdfReader.pages:
@@ -145,68 +168,33 @@ def main():
145
  st.error("Failed to load questions or questions mismatch. Please check data files.")
146
  return
147
 
148
- st.title("📑 Contract Analysis Suite")
149
  st.markdown("""
150
  Compare documents and analyze legal clauses using AI-powered question answering.
151
  """)
152
 
153
- # ===== DOCUMENT UPLOAD SECTION =====
154
  st.header("1. Upload Documents")
155
  col1, col2 = st.columns(2)
156
 
157
  with col1:
158
- uploaded_file1 = st.file_uploader(
159
- "Upload First Document",
160
- type=["txt", "pdf", "docx"],
161
- key="file1"
162
- )
163
  contract_text1 = load_contract(uploaded_file1) if uploaded_file1 else ""
164
- doc1_container = st.empty()
165
 
166
  with col2:
167
- uploaded_file2 = st.file_uploader(
168
- "Upload Second Document",
169
- type=["txt", "pdf", "docx"],
170
- key="file2"
171
- )
172
  contract_text2 = load_contract(uploaded_file2) if uploaded_file2 else ""
173
- doc2_container = st.empty()
174
 
175
- # Update document displays (initial content)
176
  if uploaded_file1:
177
- doc1_content = f'<div style="border:1px solid #ccc; padding:10px; white-space: pre-wrap; font-family: monospace; font-size: 0.9em; height: 400px; overflow-y: auto;" id="doc1_text">{contract_text1}</div>'
178
- doc1_container.markdown(doc1_content, unsafe_allow_html=True)
179
  if uploaded_file2:
180
- doc2_content = f'<div style="border:1px solid #ccc; padding:10px; white-space: pre-wrap; font-family: monospace; font-size: 0.9em; height: 400px; overflow-y: auto;" id="doc2_text">{contract_text2}</div>'
181
- doc2_container.markdown(doc2_content, unsafe_allow_html=True)
182
-
183
- # JavaScript for synchronized scrolling of the initial document panes
184
- scroll_script = """
185
- <script>
186
- function syncScroll(id, otherId) {
187
- var element = document.getElementById(id);
188
- var otherElement = document.getElementById(otherId);
189
- if (element && otherElement) {
190
- element.addEventListener('scroll', function() {
191
- otherElement.scrollTop = element.scrollTop;
192
- });
193
- otherElement.addEventListener('scroll', function() {
194
- element.scrollTop = otherElement.scrollTop;
195
- });
196
- }
197
- }
198
- window.onload = function() {
199
- syncScroll('doc1_text', 'doc2_text');
200
- };
201
- </script>
202
- """
203
- components.html(scroll_script, height=0)
204
 
205
  if not (uploaded_file1 and uploaded_file2):
206
  st.warning("Please upload both documents to proceed")
207
  return
208
 
209
- # ===== DOCUMENT COMPARISON SECTION =====
210
  st.header("2. Document Comparison")
211
 
212
  with st.expander("Show Document Differences", expanded=True):
@@ -218,7 +206,6 @@ def main():
218
 
219
  similarity_score = calculate_similarity(contract_text1, contract_text2)
220
 
221
-
222
  highlighted_diff1, highlighted_diff2 = highlight_differences_words(contract_text1, contract_text2)
223
  st.session_state.comparison_results = {
224
  'similarity_score': similarity_score,
@@ -226,61 +213,40 @@ def main():
226
  'highlighted_diff2': highlighted_diff2,
227
  }
228
 
229
-
230
- # Display comparison results
231
  if st.session_state.comparison_results:
232
- st.metric("Document Similarity Score",
233
- f"{st.session_state.comparison_results['similarity_score']:.2f}%")
234
 
235
- if st.session_state.comparison_results['similarity_score'] <= 70:
236
  st.warning("Significant differences detected")
237
 
238
  st.markdown("**Visual Difference Highlighting:**")
239
 
240
- col1_diff, col2_diff = st.columns(2)
241
- with col1_diff:
242
  st.markdown("### Original Document")
243
- diff1_content = f'<div style="border:1px solid #ccc; padding:10px; white-space: pre-wrap; font-family: monospace; font-size: 0.9em; max-height: 500px; overflow-y: auto;" id="diff1_text">{st.session_state.comparison_results["highlighted_diff1"]}</div>'
244
- st.markdown(diff1_content, unsafe_allow_html=True)
245
- with col2_diff:
246
  st.markdown("### Modified Document")
247
- diff2_content = f'<div style="border:1px solid #ccc; padding:10px; white-space: pre-wrap; font-family: monospace; font-size: 0.9em; max-height: 500px; overflow-y: auto;" id="diff2_text">{st.session_state.comparison_results["highlighted_diff2"]}</div>'
248
- st.markdown(diff2_content, unsafe_allow_html=True)
249
-
250
- # JavaScript for synchronized scrolling of the difference panes
251
- diff_scroll_script = """
252
- <script>
253
- function syncDiffScroll(id, otherId) {
254
- var element = document.getElementById(id);
255
- var otherElement = document.getElementById(otherId);
256
- if (element && otherElement) {
257
- element.addEventListener('scroll', function() {
258
- otherElement.scrollTop = element.scrollTop;
259
- });
260
- otherElement.addEventListener('scroll', function() {
261
- element.scrollTop = otherElement.scrollTop;
262
- });
263
- }
264
- }
265
- // Execute this script after the elements are rendered
266
- setTimeout(function() {
267
- syncDiffScroll('diff1_text', 'diff2_text');
268
- }, 200); // Increased delay to ensure rendering
269
- </script>
270
- """
271
- components.html(diff_scroll_script, height=0)
272
-
273
 
274
- # ===== QUESTION ANALYSIS SECTION =====
275
  st.header("3. Clause Analysis")
276
 
277
  try:
278
- question_selected = st.selectbox(
279
- 'Select a legal question to analyze:',
280
- questions_short,
281
- index=0,
282
- key="question_select"
283
- )
284
  question_idx = questions_short.index(question_selected)
285
  selected_question = questions[question_idx]
286
  except Exception as e:
@@ -292,9 +258,9 @@ def main():
292
  st.error("Please ensure both documents have readable content")
293
  return
294
 
295
- col1_analysis, col2_analysis = st.columns(2)
296
 
297
- with col1_analysis:
298
  st.subheader("First Document Analysis")
299
  with st.spinner('Processing first document...'):
300
  try:
@@ -306,7 +272,7 @@ def main():
306
  st.session_state.analysis_results = st.session_state.analysis_results or {}
307
  st.session_state.analysis_results['doc1'] = f"Analysis failed: {str(e)}"
308
 
309
- with col2_analysis:
310
  st.subheader("Second Document Analysis")
311
  with st.spinner('Processing second document...'):
312
  try:
@@ -318,16 +284,15 @@ def main():
318
  st.session_state.analysis_results = st.session_state.analysis_results or {}
319
  st.session_state.analysis_results['doc2'] = f"Analysis failed: {str(e)}"
320
 
321
- # Display analysis results
322
  if st.session_state.analysis_results:
323
- col1_answer, col2_answer = st.columns(2)
324
- with col1_answer:
325
  st.subheader("First Document Analysis")
326
  st.success(st.session_state.analysis_results.get('doc1', 'No analysis performed yet'))
327
 
328
- with col2_answer:
329
  st.subheader("Second Document Analysis")
330
  st.success(st.session_state.analysis_results.get('doc2', 'No analysis performed yet'))
331
 
332
  if __name__ == "__main__":
333
- main()
 
7
  import difflib
8
  from sklearn.feature_extraction.text import TfidfVectorizer
9
  from sklearn.metrics.pairwise import cosine_similarity
10
+ from sentence_transformers import SentenceTransformer, util
11
+ from fpdf import FPDF
12
 
13
  # ========== CONFIGURATION ==========
14
  st.set_page_config(
15
  layout="wide",
16
  page_title="Contract Analysis Suite",
17
+ page_icon="📝"
18
  )
19
 
20
  # Initialize session state variables if they don't exist
 
49
  full_text = ""
50
  for page in pdf.pages:
51
  try:
52
+ text = page.extract_text_formatted()
53
  except AttributeError:
54
  text = page.extract_text()
55
  if text:
56
+ full_text += text + "\n\n"
57
  else:
58
  full_text += page.extract_text() + "\n\n"
59
  return full_text if full_text.strip() else ""
 
61
  st.error(f"PDF extraction error: {str(e)}")
62
  return ""
63
 
 
 
64
  def highlight_differences_words(text1, text2):
65
  differ = difflib.Differ()
66
  diff = list(differ.compare(text1.split(), text2.split()))
 
72
  if word.startswith("- "):
73
  removed_word = word[2:]
74
  highlighted_text1 += f'<span style="background-color:#ffcccc; display: inline-block;">{removed_word}</span>'
 
75
  if i + 1 < len(diff) and diff[i + 1].startswith("+ "):
76
  added_word = diff[i + 1][2:]
77
+ highlighted_text2 += f'<span style="background-color:#ffffcc; display: inline-block;">{added_word}</span>'
78
+ diff[i + 1] = ' '
79
  else:
80
  highlighted_text2 += " "
81
  elif word.startswith("+ "):
82
  added_word = word[2:]
83
  highlighted_text2 += f'<span style="background-color:#ccffcc; display: inline-block;">{added_word}</span>'
 
84
  if i - 1 >= 0 and diff[i - 1].startswith("- "):
85
+ highlighted_text1 += f'<span style="background-color:#ffffcc; display: inline-block;">{diff[i-1][2:]}</span>'
86
  diff[i-1] = ' '
87
  else:
88
  highlighted_text1 += " "
 
89
  elif word.startswith(" "):
90
  highlighted_text1 += word[2:] + " "
91
  highlighted_text2 += word[2:] + " "
92
 
93
  return highlighted_text1, highlighted_text2
94
+
95
  def calculate_similarity(text1, text2):
96
  if not text1.strip() or not text2.strip():
97
  return 0.0
98
 
99
  try:
100
+ model = SentenceTransformer('all-MiniLM-L6-v2')
101
+ embeddings = model.encode([text1, text2], convert_to_tensor=True)
102
+ similarity = util.cos_sim(embeddings[0], embeddings[1])
103
+ return float(similarity.item()) * 100
104
+ except Exception as e:
105
+ st.error(f"Similarity calculation error: {e}")
106
+ return 0.0
107
+
108
+ def generate_pdf_report(similarity_score, doc1, doc2):
109
+ pdf = FPDF()
110
+ pdf.add_page()
111
+ pdf.set_auto_page_break(auto=True, margin=15)
112
+
113
+ pdf.set_font("Arial", 'B', 16)
114
+ pdf.cell(0, 10, "Contract Comparison Report", ln=True, align="C")
115
+
116
+ pdf.set_font("Arial", '', 12)
117
+ pdf.ln(10)
118
+ pdf.multi_cell(0, 10, f"Document Similarity Score: {similarity_score:.2f}%")
119
+
120
+ pdf.ln(5)
121
+ pdf.set_font("Arial", 'B', 12)
122
+ pdf.cell(0, 10, "Document 1 Excerpt:", ln=True)
123
+ pdf.set_font("Arial", '', 10)
124
+ pdf.multi_cell(0, 10, doc1[:1000])
125
+
126
+ pdf.ln(5)
127
+ pdf.set_font("Arial", 'B', 12)
128
+ pdf.cell(0, 10, "Document 2 Excerpt:", ln=True)
129
+ pdf.set_font("Arial", '', 10)
130
+ pdf.multi_cell(0, 10, doc2[:1000])
131
+
132
+ return pdf.output(dest='S').encode('latin1')
133
 
134
  def load_contract(file):
135
  if file is None:
 
142
  elif ext == 'pdf':
143
  content = extract_text_from_pdf(file)
144
  if not content:
 
145
  pdfReader = PyPDF4.PdfFileReader(file)
146
  full_text = ""
147
  for page in pdfReader.pages:
 
168
  st.error("Failed to load questions or questions mismatch. Please check data files.")
169
  return
170
 
171
+ st.title("📝 Contract Analysis Suite")
172
  st.markdown("""
173
  Compare documents and analyze legal clauses using AI-powered question answering.
174
  """)
175
 
 
176
  st.header("1. Upload Documents")
177
  col1, col2 = st.columns(2)
178
 
179
  with col1:
180
+ uploaded_file1 = st.file_uploader("Upload First Document", type=["txt", "pdf", "docx"], key="file1")
 
 
 
 
181
  contract_text1 = load_contract(uploaded_file1) if uploaded_file1 else ""
182
+ doc1_display = st.empty()
183
 
184
  with col2:
185
+ uploaded_file2 = st.file_uploader("Upload Second Document", type=["txt", "pdf", "docx"], key="file2")
 
 
 
 
186
  contract_text2 = load_contract(uploaded_file2) if uploaded_file2 else ""
187
+ doc2_display = st.empty()
188
 
 
189
  if uploaded_file1:
190
+ doc1_display.text_area("Document 1 Content", value=contract_text1, height=400, key="area1")
 
191
  if uploaded_file2:
192
+ doc2_display.text_area("Document 2 Content", value=contract_text2, height=400, key="area2")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
 
194
  if not (uploaded_file1 and uploaded_file2):
195
  st.warning("Please upload both documents to proceed")
196
  return
197
 
 
198
  st.header("2. Document Comparison")
199
 
200
  with st.expander("Show Document Differences", expanded=True):
 
206
 
207
  similarity_score = calculate_similarity(contract_text1, contract_text2)
208
 
 
209
  highlighted_diff1, highlighted_diff2 = highlight_differences_words(contract_text1, contract_text2)
210
  st.session_state.comparison_results = {
211
  'similarity_score': similarity_score,
 
213
  'highlighted_diff2': highlighted_diff2,
214
  }
215
 
 
 
216
  if st.session_state.comparison_results:
217
+ st.metric("Document Similarity Score", f"{st.session_state.comparison_results['similarity_score']:.2f}%")
 
218
 
219
+ if st.session_state.comparison_results['similarity_score'] < 50:
220
  st.warning("Significant differences detected")
221
 
222
  st.markdown("**Visual Difference Highlighting:**")
223
 
224
+ col1, col2 = st.columns(2)
225
+ with col1:
226
  st.markdown("### Original Document")
227
+ st.markdown(f'<div style="border:1px solid #ccc; padding:10px; white-space: pre-wrap; font-family: monospace; font-size: 0.9em; max-height: 500px; overflow-y: auto;">{st.session_state.comparison_results["highlighted_diff1"]}</div>', unsafe_allow_html=True)
228
+ with col2:
 
229
  st.markdown("### Modified Document")
230
+ st.markdown(f'<div style="border:1px solid #ccc; padding:10px; white-space: pre-wrap; font-family: monospace; font-size: 0.9em; max-height: 500px; overflow-y: auto;">{st.session_state.comparison_results["highlighted_diff2"]}</div>', unsafe_allow_html=True)
231
+
232
+ if st.button("Download PDF Report"):
233
+ with st.spinner("Generating report..."):
234
+ pdf_bytes = generate_pdf_report(
235
+ st.session_state.comparison_results['similarity_score'],
236
+ contract_text1,
237
+ contract_text2
238
+ )
239
+ st.download_button(
240
+ label="Click to download PDF",
241
+ data=pdf_bytes,
242
+ file_name="contract_comparison_report.pdf",
243
+ mime="application/pdf"
244
+ )
 
 
 
 
 
 
 
 
 
 
 
245
 
 
246
  st.header("3. Clause Analysis")
247
 
248
  try:
249
+ question_selected = st.selectbox('Select a legal question to analyze:', questions_short, index=0, key="question_select")
 
 
 
 
 
250
  question_idx = questions_short.index(question_selected)
251
  selected_question = questions[question_idx]
252
  except Exception as e:
 
258
  st.error("Please ensure both documents have readable content")
259
  return
260
 
261
+ col1, col2 = st.columns(2)
262
 
263
+ with col1:
264
  st.subheader("First Document Analysis")
265
  with st.spinner('Processing first document...'):
266
  try:
 
272
  st.session_state.analysis_results = st.session_state.analysis_results or {}
273
  st.session_state.analysis_results['doc1'] = f"Analysis failed: {str(e)}"
274
 
275
+ with col2:
276
  st.subheader("Second Document Analysis")
277
  with st.spinner('Processing second document...'):
278
  try:
 
284
  st.session_state.analysis_results = st.session_state.analysis_results or {}
285
  st.session_state.analysis_results['doc2'] = f"Analysis failed: {str(e)}"
286
 
 
287
  if st.session_state.analysis_results:
288
+ col1, col2 = st.columns(2)
289
+ with col1:
290
  st.subheader("First Document Analysis")
291
  st.success(st.session_state.analysis_results.get('doc1', 'No analysis performed yet'))
292
 
293
+ with col2:
294
  st.subheader("Second Document Analysis")
295
  st.success(st.session_state.analysis_results.get('doc2', 'No analysis performed yet'))
296
 
297
  if __name__ == "__main__":
298
+ main()