ludigija committed on
Commit
e0f7f4f
·
verified ·
1 Parent(s): dc1a085

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +325 -123
app.py CHANGED
@@ -1,131 +1,333 @@
1
- import os
2
- import difflib
3
  import streamlit as st
4
- import pandas as pd
5
- from PyPDF4 import PdfReader
6
- from docx import Document
7
- from difflib import HtmlDiff
8
- from fpdf import FPDF
9
- import base64
10
- from streamlit.components.v1 import html
11
-
12
- # Set page config
13
- st.set_page_config(page_title="Contract Analysis Suite", layout="wide")
14
- st.title("📄 Contract Analysis Suite")
15
-
16
- # Session state initialization
17
- if "comparison_results" not in st.session_state:
 
 
 
 
 
18
  st.session_state.comparison_results = None
 
 
19
 
20
- def extract_text_from_pdf(file):
21
- pdf = PdfReader(file)
22
- text = "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())
23
- return text
24
-
25
- def extract_text_from_docx(file):
26
- doc = Document(file)
27
- return "\n".join([para.text for para in doc.paragraphs])
28
-
29
- def get_text(file):
30
- if file.name.endswith(".pdf"):
31
- return extract_text_from_pdf(file)
32
- elif file.name.endswith(".docx"):
33
- return extract_text_from_docx(file)
34
- else:
35
- return file.read().decode("utf-8")
36
-
37
- def compare_documents(text1, text2):
38
- diff = difflib.ndiff(text1.split(), text2.split())
39
- diff1, diff2 = [], []
40
- added, removed, unchanged = 0, 0, 0
41
-
42
- for word in diff:
43
- tag = word[:2]
44
- content = word[2:]
45
- if tag == " ":
46
- diff1.append(content)
47
- diff2.append(content)
48
- unchanged += 1
49
- elif tag == "- ":
50
- diff1.append(f'<span style="background-color: red;">{content}</span>')
51
- removed += 1
52
- elif tag == "+ ":
53
- diff2.append(f'<span style="background-color: lightgreen;">{content}</span>')
54
- added += 1
55
-
56
- total = added + removed + unchanged
57
- similarity_score = 100 * unchanged / total if total > 0 else 0
58
- return " ".join(diff1), " ".join(diff2), similarity_score
59
-
60
- def generate_pdf_report(html1, html2, score):
61
- pdf = FPDF()
62
- pdf.add_page()
63
- pdf.set_font("Arial", size=12)
64
- pdf.multi_cell(0, 10, f"Contract Comparison Report\n\nSimilarity Score: {score:.2f}%\n\nDocument 1 Differences:\n\n", align='L')
65
- pdf.multi_cell(0, 10, html1.replace('<span style="background-color:', '\n[').replace('</span>', ']'), align='L')
66
- pdf.add_page()
67
- pdf.multi_cell(0, 10, "Document 2 Differences:\n\n", align='L')
68
- pdf.multi_cell(0, 10, html2.replace('<span style="background-color:', '\n[').replace('</span>', ']'), align='L')
69
-
70
- file_path = "/tmp/report.pdf"
71
- pdf.output(file_path)
72
- return file_path
73
-
74
- # File upload section
75
- st.sidebar.header("Upload Documents")
76
- file1 = st.sidebar.file_uploader("Choose the original document", type=["pdf", "docx", "txt"], key="file1")
77
- file2 = st.sidebar.file_uploader("Choose the modified document", type=["pdf", "docx", "txt"], key="file2")
78
-
79
- if file1 and file2:
80
- with st.spinner("Analyzing documents..."):
81
- text1 = get_text(file1)
82
- text2 = get_text(file2)
83
-
84
- diff1, diff2, score = compare_documents(text1, text2)
85
- st.session_state.comparison_results = {
86
- "highlighted_diff1": diff1,
87
- "highlighted_diff2": diff2,
88
- "similarity_score": score
89
- }
90
 
91
- if st.session_state.comparison_results:
92
- st.markdown("## Comparison Results")
93
- st.markdown(f"**Similarity Score:** {st.session_state.comparison_results['similarity_score']:.2f}%")
94
-
95
- st.markdown("### Original vs Modified (Synchronized View)")
96
- html(f"""
97
- <div style="display: flex; gap: 10px;">
98
- <div id="panel1" style="flex: 1; border:1px solid #ccc; padding:10px; overflow-y: scroll; max-height:500px; font-family: monospace; font-size: 0.9em; white-space: pre-wrap; background-color: #fdfdfd;">
99
- {st.session_state.comparison_results["highlighted_diff1"]}
100
- </div>
101
- <div id="panel2" style="flex: 1; border:1px solid #ccc; padding:10px; overflow-y: scroll; max-height:500px; font-family: monospace; font-size: 0.9em; white-space: pre-wrap; background-color: #fdfdfd;">
102
- {st.session_state.comparison_results["highlighted_diff2"]}
103
- </div>
104
- </div>
105
 
106
- <script>
107
- const panel1 = document.getElementById('panel1');
108
- const panel2 = document.getElementById('panel2');
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
- function syncScroll(source, target) {{
111
- target.scrollTop = source.scrollTop;
112
- }}
 
113
 
114
- panel1.addEventListener('scroll', () => syncScroll(panel1, panel2));
115
- panel2.addEventListener('scroll', () => syncScroll(panel2, panel1));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  </script>
117
- """, height=520)
118
-
119
- if st.button("📄 Generate PDF Report"):
120
- try:
121
- file_path = generate_pdf_report(
122
- st.session_state.comparison_results["highlighted_diff1"],
123
- st.session_state.comparison_results["highlighted_diff2"],
124
- st.session_state.comparison_results["similarity_score"]
125
- )
126
- with open(file_path, "rb") as f:
127
- base64_pdf = base64.b64encode(f.read()).decode('utf-8')
128
- href = f'<a href="data:application/pdf;base64,{base64_pdf}" download="comparison_report.pdf">📥 Click to Download Report</a>'
129
- st.markdown(href, unsafe_allow_html=True)
130
- except Exception as e:
131
- st.error(f"Failed to generate PDF report: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
+ from predict import run_prediction
3
+ from io import StringIO
4
+ import PyPDF4
5
+ import docx2txt
6
+ import pdfplumber
7
+ import difflib
8
+ from sklearn.feature_extraction.text import TfidfVectorizer
9
+ from sklearn.metrics.pairwise import cosine_similarity
10
+ import streamlit.components.v1 as components
11
+
12
# ========== CONFIGURATION ==========
# Page-level settings: wide layout, browser-tab title and icon.
st.set_page_config(
    layout="wide",
    page_title="Contract Analysis Suite",
    page_icon="📑",
)

# Initialize session state variables if they don't exist, so that later
# truthiness checks on them never hit a missing key.
for _state_key in ("comparison_results", "analysis_results"):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = None
24
 
25
# ========== CACHED DATA LOADING ==========
@st.cache_data(show_spinner=False)
def load_questions():
    """Load the full-length analysis questions, one per non-blank line.

    Reads ``data/questions.txt`` (relative to the working directory), strips
    surrounding whitespace from every line and drops lines that end up empty.

    Returns:
        list[str]: The questions in file order, or an empty list when the
        file cannot be read (the error is reported through ``st.error``).
    """
    # NOTE(review): the function is wrapped in st.cache_data, so the empty
    # fallback is presumably cached too - confirm whether a cache clear is
    # needed after fixing a broken data file.
    try:
        # Explicit encoding so the result does not depend on the host locale.
        with open('data/questions.txt', encoding='utf-8') as f:
            return [line.strip() for line in f if line.strip()]
    except (OSError, UnicodeError) as e:
        st.error(f"Error loading questions: {str(e)}")
        return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
@st.cache_data(show_spinner=False)
def load_questions_short():
    """Load the short question labels, one per non-blank line.

    Reads ``data/questions_short.txt`` (relative to the working directory),
    strips surrounding whitespace from every line and drops lines that end
    up empty.

    Returns:
        list[str]: The labels in file order, or an empty list when the file
        cannot be read (the error is reported through ``st.error``).
    """
    # NOTE(review): the function is wrapped in st.cache_data, so the empty
    # fallback is presumably cached too - confirm whether a cache clear is
    # needed after fixing a broken data file.
    try:
        # Explicit encoding so the result does not depend on the host locale.
        with open('data/questions_short.txt', encoding='utf-8') as f:
            return [line.strip() for line in f if line.strip()]
    except (OSError, UnicodeError) as e:
        st.error(f"Error loading short questions: {str(e)}")
        return []
 
 
 
 
 
 
43
 
44
# ========== UTILITY FUNCTIONS ==========
def extract_text_from_pdf(uploaded_file):
    """Extract the text of every page of a PDF using pdfplumber.

    Args:
        uploaded_file: Path or binary file-like object accepted by
            ``pdfplumber.open``.

    Returns:
        str: The text of all non-empty pages, each followed by a blank line,
        or ``""`` when no text was found or extraction failed (the failure
        is reported through ``st.error``).
    """
    try:
        page_texts = []
        with pdfplumber.open(uploaded_file) as pdf:
            for page in pdf.pages:
                try:
                    # NOTE(review): extract_text_formatted does not appear in
                    # pdfplumber's documented Page API; kept as a best-effort
                    # first attempt with extract_text as the real path.
                    text = page.extract_text_formatted()
                except AttributeError:
                    text = page.extract_text()
                # Pages without a text layer yield None/"": skip them instead
                # of concatenating None, which used to abort the whole file.
                if text:
                    page_texts.append(text)
        # Each page is followed by a blank line acting as page separator.
        full_text = "".join(f"{text}\n\n" for text in page_texts)
        return full_text if full_text.strip() else ""
    except Exception as e:
        st.error(f"PDF extraction error: {str(e)}")
        return ""
62
+
63
+
64
+
65
def highlight_differences_words(text1, text2):
    """Build word-level HTML diffs of two texts for side-by-side display.

    Both texts are split on whitespace (so line breaks and repeated spaces
    are normalised to single spaces) and aligned with
    ``difflib.SequenceMatcher``. Words are HTML-escaped before being
    emitted, so the result is safe to render as raw HTML.

    Colour scheme:
        * ``#ffcccc`` (red)    - words only present in ``text1`` (removed)
        * ``#ccffcc`` (green)  - words only present in ``text2`` (added)
        * ``#ffffcc`` (yellow) - words replaced by different words, marked
          on both sides

    Args:
        text1: Original document text.
        text2: Modified document text.

    Returns:
        tuple[str, str]: HTML for the original and the modified document.
    """
    import html  # local import keeps the block self-contained

    def _mark(words, color):
        # Wrap every word in its own highlighted span.
        return [
            f'<span style="background-color:{color}; display: inline-block;">'
            f'{html.escape(word)}</span>'
            for word in words
        ]

    words1 = text1.split()
    words2 = text2.split()
    parts1, parts2 = [], []

    # Opcodes describe aligned runs directly, which avoids parsing Differ's
    # "- ", "+ " and "? " text lines and pairing them by adjacency.
    matcher = difflib.SequenceMatcher(None, words1, words2)
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == "equal":
            unchanged = [html.escape(word) for word in words1[i1:i2]]
            parts1.extend(unchanged)
            parts2.extend(unchanged)
        elif tag == "delete":
            parts1.extend(_mark(words1[i1:i2], "#ffcccc"))
        elif tag == "insert":
            parts2.extend(_mark(words2[j1:j2], "#ccffcc"))
        else:  # "replace": the run differs on both sides
            parts1.extend(_mark(words1[i1:i2], "#ffffcc"))
            parts2.extend(_mark(words2[j1:j2], "#ffffcc"))

    # Joining with spaces keeps highlighted neighbours from running together.
    return " ".join(parts1), " ".join(parts2)
98
def calculate_similarity(text1, text2):
    """Return the similarity of two texts as a percentage in [0, 100].

    The score is the cosine similarity of the TF-IDF vectors of both texts
    (tokens are runs of word characters, including single characters). If
    the vectorizer raises ``ValueError``, the character-level
    ``difflib.SequenceMatcher`` ratio is used instead.

    Args:
        text1: First document text.
        text2: Second document text.

    Returns:
        float: ``0.0`` when either text is blank, otherwise the similarity
        percentage as a plain Python float clamped to ``[0.0, 100.0]``.
    """
    if not text1.strip() or not text2.strip():
        return 0.0

    try:
        vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b')
        tfidf_matrix = vectorizer.fit_transform([text1, text2])
        ratio = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
    except ValueError:
        # Presumably raised when no tokens are found (e.g. punctuation-only
        # input); fall back to a purely character-based comparison.
        ratio = difflib.SequenceMatcher(None, text1, text2).ratio()

    # Convert the NumPy scalar to a builtin float and absorb rounding drift
    # such as 1.0000000000000002 for identical documents.
    return min(100.0, max(0.0, float(ratio) * 100))
109
+
110
def load_contract(file):
    """Extract plain text from an uploaded ``.txt``, ``.pdf`` or ``.docx`` file.

    The format is chosen from the extension of ``file.name``
    (case-insensitive). PDFs are read with ``extract_text_from_pdf`` first
    and with PyPDF4 as a fallback when that yields nothing.

    Args:
        file: Uploaded file object exposing ``name`` and, for text files,
            ``getvalue()``; may be ``None``.

    Returns:
        str: The stripped document text, or ``""`` when ``file`` is ``None``,
        the extension is unsupported, no text could be extracted, or an
        error occurred (problems are reported through Streamlit).
    """
    if file is None:
        return ""

    ext = file.name.split('.')[-1].lower()
    try:
        if ext == 'txt':
            # utf-8-sig drops a leading BOM; undecodable bytes become U+FFFD
            # instead of rejecting the whole document.
            content = file.getvalue().decode("utf-8-sig", errors="replace")
        elif ext == 'pdf':
            content = extract_text_from_pdf(file)
            if not content:
                # Fallback to PyPDF4. Rewind first: the stream has already
                # been consumed by the pdfplumber attempt above.
                file.seek(0)
                pdf_reader = PyPDF4.PdfFileReader(file)
                page_texts = (page.extractText() for page in pdf_reader.pages)
                content = "".join(f"{text}\n\n" for text in page_texts if text)
        elif ext == 'docx':
            content = docx2txt.process(file)
        else:
            st.warning('Unsupported file type')
            return ""
        return content.strip() if content else ""
    except Exception as e:
        st.error(f"Error loading {ext.upper()} file: {str(e)}")
        return ""
138
+
139
# ========== MAIN APP ==========
# Shared inline style of the scrollable document/diff panes.
_PANE_STYLE = (
    "border:1px solid #ccc; padding:10px; white-space: pre-wrap; "
    "font-family: monospace; font-size: 0.9em; overflow-y: auto;"
)


def _pane_html(inner_html, element_id, size_css):
    """Wrap already-safe HTML in a scrollable pane with the given DOM id."""
    return (
        f'<div style="{_PANE_STYLE} {size_css}" id="{element_id}">'
        f'{inner_html}</div>'
    )


def _scroll_sync_script(first_id, second_id):
    """Return a <script> that mirrors the scroll position of two panes."""
    # components.html renders its content inside an iframe, so the panes
    # (rendered by st.markdown in the main page) are looked up in the parent
    # document. NOTE(review): relies on the iframe being allowed to access
    # window.parent.document - confirm for the deployed Streamlit version.
    return f"""
    <script>
    (function() {{
        var doc = window.parent.document;
        // Delay so that the panes exist before listeners are attached.
        setTimeout(function() {{
            var first = doc.getElementById('{first_id}');
            var second = doc.getElementById('{second_id}');
            if (first && second) {{
                first.addEventListener('scroll', function() {{
                    second.scrollTop = first.scrollTop;
                }});
                second.addEventListener('scroll', function() {{
                    first.scrollTop = second.scrollTop;
                }});
            }}
        }}, 200);
    }})();
    </script>
    """


def _analyze_clause(question, contract_text):
    """Run the QA model for one question on one document; never raises."""
    try:
        predictions = run_prediction(
            [question],
            contract_text,
            'marshmellow77/roberta-base-cuad',
            n_best_size=5,
        )
        answer = predictions.get('0', 'No answer found')
        return answer if answer else "No relevant clause found"
    except Exception as e:
        return f"Analysis failed: {str(e)}"


def main():
    """Render the app: upload, document comparison and clause analysis.

    Stops early (after showing a message) when the question files are
    missing or inconsistent, when fewer than two documents are uploaded, or
    when a document has no readable text. Comparison and analysis results
    are kept in ``st.session_state`` so they survive Streamlit reruns.
    """
    import html  # local import: used to escape raw document text

    questions = load_questions()
    questions_short = load_questions_short()

    if not questions or not questions_short or len(questions) != len(questions_short):
        st.error("Failed to load questions or questions mismatch. Please check data files.")
        return

    st.title("📑 Contract Analysis Suite")
    st.markdown(
        "Compare documents and analyze legal clauses using AI-powered question answering."
    )

    # ===== DOCUMENT UPLOAD SECTION =====
    st.header("1. Upload Documents")
    col1, col2 = st.columns(2)

    with col1:
        uploaded_file1 = st.file_uploader(
            "Upload First Document",
            type=["txt", "pdf", "docx"],
            key="file1",
        )
        contract_text1 = load_contract(uploaded_file1) if uploaded_file1 else ""
        doc1_container = st.empty()

    with col2:
        uploaded_file2 = st.file_uploader(
            "Upload Second Document",
            type=["txt", "pdf", "docx"],
            key="file2",
        )
        contract_text2 = load_contract(uploaded_file2) if uploaded_file2 else ""
        doc2_container = st.empty()

    # Update document displays (initial content). The text comes from user
    # files and is rendered as raw HTML, so it is escaped first.
    if uploaded_file1:
        doc1_container.markdown(
            _pane_html(html.escape(contract_text1), "doc1_text", "height: 400px;"),
            unsafe_allow_html=True,
        )
    if uploaded_file2:
        doc2_container.markdown(
            _pane_html(html.escape(contract_text2), "doc2_text", "height: 400px;"),
            unsafe_allow_html=True,
        )

    # Synchronized scrolling of the initial document panes
    components.html(_scroll_sync_script('doc1_text', 'doc2_text'), height=0)

    if not (uploaded_file1 and uploaded_file2):
        st.warning("Please upload both documents to proceed")
        return

    # ===== DOCUMENT COMPARISON SECTION =====
    st.header("2. Document Comparison")

    with st.expander("Show Document Differences", expanded=True):
        if st.button("Compare Documents"):
            with st.spinner("Analyzing documents..."):
                if not contract_text1.strip() or not contract_text2.strip():
                    st.error("One or both documents appear to be empty or couldn't be read properly")
                    return

                similarity_score = calculate_similarity(contract_text1, contract_text2)
                highlighted_diff1, highlighted_diff2 = highlight_differences_words(
                    contract_text1, contract_text2
                )
                st.session_state.comparison_results = {
                    'similarity_score': similarity_score,
                    'highlighted_diff1': highlighted_diff1,
                    'highlighted_diff2': highlighted_diff2,
                }

        # Display comparison results
        comparison = st.session_state.comparison_results
        if comparison:
            st.metric(
                "Document Similarity Score",
                f"{comparison['similarity_score']:.2f}%",
            )

            if comparison['similarity_score'] <= 70:
                st.warning("Significant differences detected")

            st.markdown("**Visual Difference Highlighting:**")

            col1_diff, col2_diff = st.columns(2)
            with col1_diff:
                st.markdown("### Original Document")
                # The highlighted diffs are HTML already; do not escape again.
                st.markdown(
                    _pane_html(comparison["highlighted_diff1"], "diff1_text", "max-height: 500px;"),
                    unsafe_allow_html=True,
                )
            with col2_diff:
                st.markdown("### Modified Document")
                st.markdown(
                    _pane_html(comparison["highlighted_diff2"], "diff2_text", "max-height: 500px;"),
                    unsafe_allow_html=True,
                )

            # Synchronized scrolling of the difference panes
            components.html(_scroll_sync_script('diff1_text', 'diff2_text'), height=0)

    # ===== QUESTION ANALYSIS SECTION =====
    st.header("3. Clause Analysis")

    try:
        question_selected = st.selectbox(
            'Select a legal question to analyze:',
            questions_short,
            index=0,
            key="question_select",
        )
        # Short labels and full questions are parallel lists (same length,
        # checked above), so the label's position selects the full question.
        question_idx = questions_short.index(question_selected)
        selected_question = questions[question_idx]
    except Exception as e:
        st.error(f"Error selecting question: {str(e)}")
        return

    if st.button("Analyze Both Documents"):
        if not (contract_text1.strip() and contract_text2.strip()):
            st.error("Please ensure both documents have readable content")
            return

        # Results are only stored here and rendered once below, so the
        # section headers are not drawn twice on the run that computes them.
        with st.spinner('Processing first document...'):
            doc1_answer = _analyze_clause(selected_question, contract_text1)
        with st.spinner('Processing second document...'):
            doc2_answer = _analyze_clause(selected_question, contract_text2)
        st.session_state.analysis_results = {'doc1': doc1_answer, 'doc2': doc2_answer}

    # Display analysis results
    if st.session_state.analysis_results:
        col1_answer, col2_answer = st.columns(2)
        with col1_answer:
            st.subheader("First Document Analysis")
            st.success(st.session_state.analysis_results.get('doc1', 'No analysis performed yet'))

        with col2_answer:
            st.subheader("Second Document Analysis")
            st.success(st.session_state.analysis_results.get('doc2', 'No analysis performed yet'))


if __name__ == "__main__":
    main()