ludigija committed on
Commit
dc1a085
·
verified ·
1 Parent(s): 8da2ea9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +123 -325
app.py CHANGED
@@ -1,333 +1,131 @@
1
- import streamlit as st
2
- from predict import run_prediction
3
- from io import StringIO
4
- import PyPDF4
5
- import docx2txt
6
- import pdfplumber
7
  import difflib
8
- from sklearn.feature_extraction.text import TfidfVectorizer
9
- from sklearn.metrics.pairwise import cosine_similarity
10
- import streamlit.components.v1 as components
11
-
12
- # ========== CONFIGURATION ==========
13
- st.set_page_config(
14
- layout="wide",
15
- page_title="Contract Analysis Suite",
16
- page_icon="πŸ“‘"
17
- )
18
-
19
- # Initialize session state variables if they don't exist
20
- if 'comparison_results' not in st.session_state:
 
 
21
  st.session_state.comparison_results = None
22
- if 'analysis_results' not in st.session_state:
23
- st.session_state.analysis_results = None
24
-
25
- # ========== CACHED DATA LOADING ==========
26
- @st.cache_data(show_spinner=False)
27
- def load_questions():
28
- try:
29
- with open('data/questions.txt') as f:
30
- return [q.strip() for q in f.readlines() if q.strip()]
31
- except Exception as e:
32
- st.error(f"Error loading questions: {str(e)}")
33
- return []
34
-
35
- @st.cache_data(show_spinner=False)
36
- def load_questions_short():
37
- try:
38
- with open('data/questions_short.txt') as f:
39
- return [q.strip() for q in f.readlines() if q.strip()]
40
- except Exception as e:
41
- st.error(f"Error loading short questions: {str(e)}")
42
- return []
43
-
44
- # ========== UTILITY FUNCTIONS ==========
45
- def extract_text_from_pdf(uploaded_file):
46
- try:
47
- with pdfplumber.open(uploaded_file) as pdf:
48
- full_text = ""
49
- for page in pdf.pages:
50
- try:
51
- text = page.extract_text_formatted() # Try to get formatted text
52
- except AttributeError:
53
- text = page.extract_text()
54
- if text:
55
- full_text += text + "\n\n" # Add page separator
56
- else:
57
- full_text += page.extract_text() + "\n\n"
58
- return full_text if full_text.strip() else ""
59
- except Exception as e:
60
- st.error(f"PDF extraction error: {str(e)}")
61
- return ""
62
-
63
-
64
-
65
- def highlight_differences_words(text1, text2):
66
- differ = difflib.Differ()
67
- diff = list(differ.compare(text1.split(), text2.split()))
68
-
69
- highlighted_text1 = ""
70
- highlighted_text2 = ""
71
-
72
- for i, word in enumerate(diff):
73
- if word.startswith("- "):
74
- removed_word = word[2:]
75
- highlighted_text1 += f'<span style="background-color:#ffcccc; display: inline-block;">{removed_word}</span>'
76
- # Check for corresponding addition to highlight as changed
77
- if i + 1 < len(diff) and diff[i + 1].startswith("+ "):
78
- added_word = diff[i + 1][2:]
79
- highlighted_text2 += f'<span style="background-color:#ffffcc; display: inline-block;">{added_word}</span>' # Yellow for changed in text2
80
- diff[i + 1] = ' ' # Consume the addition
81
- else:
82
- highlighted_text2 += " "
83
- elif word.startswith("+ "):
84
- added_word = word[2:]
85
- highlighted_text2 += f'<span style="background-color:#ccffcc; display: inline-block;">{added_word}</span>'
86
- # Check for corresponding removal
87
- if i - 1 >= 0 and diff[i - 1].startswith("- "):
88
- highlighted_text1 += f'<span style="background-color:#ffffcc; display: inline-block;">{diff[i-1][2:]}</span>' # Yellow for changed in text1
89
- diff[i-1] = ' '
90
- else:
91
- highlighted_text1 += " "
92
-
93
- elif word.startswith(" "):
94
- highlighted_text1 += word[2:] + " "
95
- highlighted_text2 += word[2:] + " "
96
-
97
- return highlighted_text1, highlighted_text2
98
- def calculate_similarity(text1, text2):
99
- if not text1.strip() or not text2.strip():
100
- return 0.0
101
-
102
- try:
103
- vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b')
104
- tfidf_matrix = vectorizer.fit_transform([text1, text2])
105
- similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
106
- return similarity[0][0] * 100
107
- except ValueError:
108
- return difflib.SequenceMatcher(None, text1, text2).ratio() * 100
109
-
110
- def load_contract(file):
111
- if file is None:
112
- return ""
113
-
114
- ext = file.name.split('.')[-1].lower()
115
- try:
116
- if ext == 'txt':
117
- content = StringIO(file.getvalue().decode("utf-8")).read()
118
- elif ext == 'pdf':
119
- content = extract_text_from_pdf(file)
120
- if not content:
121
- # Fallback to PyPDF4
122
- pdfReader = PyPDF4.PdfFileReader(file)
123
- full_text = ""
124
- for page in pdfReader.pages:
125
- text = page.extractText()
126
- if text:
127
- full_text += text + "\n\n"
128
- content = full_text
129
- elif ext == 'docx':
130
- content = docx2txt.process(file)
131
- else:
132
- st.warning('Unsupported file type')
133
- return ""
134
- return content.strip() if content else ""
135
- except Exception as e:
136
- st.error(f"Error loading {ext.upper()} file: {str(e)}")
137
- return ""
138
-
139
- # ========== MAIN APP ==========
140
- def main():
141
- questions = load_questions()
142
- questions_short = load_questions_short()
143
-
144
- if not questions or not questions_short or len(questions) != len(questions_short):
145
- st.error("Failed to load questions or questions mismatch. Please check data files.")
146
- return
147
-
148
- st.title("πŸ“‘ Contract Analysis Suite")
149
- st.markdown("""
150
- Compare documents and analyze legal clauses using AI-powered question answering.
151
- """)
152
-
153
- # ===== DOCUMENT UPLOAD SECTION =====
154
- st.header("1. Upload Documents")
155
- col1, col2 = st.columns(2)
156
-
157
- with col1:
158
- uploaded_file1 = st.file_uploader(
159
- "Upload First Document",
160
- type=["txt", "pdf", "docx"],
161
- key="file1"
162
- )
163
- contract_text1 = load_contract(uploaded_file1) if uploaded_file1 else ""
164
- doc1_container = st.empty()
165
-
166
- with col2:
167
- uploaded_file2 = st.file_uploader(
168
- "Upload Second Document",
169
- type=["txt", "pdf", "docx"],
170
- key="file2"
171
- )
172
- contract_text2 = load_contract(uploaded_file2) if uploaded_file2 else ""
173
- doc2_container = st.empty()
174
 
175
- # Update document displays (initial content)
176
- if uploaded_file1:
177
- doc1_content = f'<div style="border:1px solid #ccc; padding:10px; white-space: pre-wrap; font-family: monospace; font-size: 0.9em; height: 400px; overflow-y: auto;" id="doc1_text">{contract_text1}</div>'
178
- doc1_container.markdown(doc1_content, unsafe_allow_html=True)
179
- if uploaded_file2:
180
- doc2_content = f'<div style="border:1px solid #ccc; padding:10px; white-space: pre-wrap; font-family: monospace; font-size: 0.9em; height: 400px; overflow-y: auto;" id="doc2_text">{contract_text2}</div>'
181
- doc2_container.markdown(doc2_content, unsafe_allow_html=True)
182
-
183
- # JavaScript for synchronized scrolling of the initial document panes
184
- scroll_script = """
185
- <script>
186
- function syncScroll(id, otherId) {
187
- var element = document.getElementById(id);
188
- var otherElement = document.getElementById(otherId);
189
- if (element && otherElement) {
190
- element.addEventListener('scroll', function() {
191
- otherElement.scrollTop = element.scrollTop;
192
- });
193
- otherElement.addEventListener('scroll', function() {
194
- element.scrollTop = otherElement.scrollTop;
195
- });
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
  }
197
- }
198
- window.onload = function() {
199
- syncScroll('doc1_text', 'doc2_text');
200
- };
201
- </script>
202
- """
203
- components.html(scroll_script, height=0)
204
-
205
- if not (uploaded_file1 and uploaded_file2):
206
- st.warning("Please upload both documents to proceed")
207
- return
208
-
209
- # ===== DOCUMENT COMPARISON SECTION =====
210
- st.header("2. Document Comparison")
211
-
212
- with st.expander("Show Document Differences", expanded=True):
213
- if st.button("Compare Documents"):
214
- with st.spinner("Analyzing documents..."):
215
- if not contract_text1.strip() or not contract_text2.strip():
216
- st.error("One or both documents appear to be empty or couldn't be read properly")
217
- return
218
-
219
- similarity_score = calculate_similarity(contract_text1, contract_text2)
220
-
221
-
222
- highlighted_diff1, highlighted_diff2 = highlight_differences_words(contract_text1, contract_text2)
223
- st.session_state.comparison_results = {
224
- 'similarity_score': similarity_score,
225
- 'highlighted_diff1': highlighted_diff1,
226
- 'highlighted_diff2': highlighted_diff2,
227
- }
228
-
229
-
230
- # Display comparison results
231
- if st.session_state.comparison_results:
232
- st.metric("Document Similarity Score",
233
- f"{st.session_state.comparison_results['similarity_score']:.2f}%")
234
-
235
- if st.session_state.comparison_results['similarity_score'] <= 70:
236
- st.warning("Significant differences detected")
237
 
238
- st.markdown("**Visual Difference Highlighting:**")
 
 
 
 
 
 
 
 
 
 
 
 
 
239
 
240
- col1_diff, col2_diff = st.columns(2)
241
- with col1_diff:
242
- st.markdown("### Original Document")
243
- diff1_content = f'<div style="border:1px solid #ccc; padding:10px; white-space: pre-wrap; font-family: monospace; font-size: 0.9em; max-height: 500px; overflow-y: auto;" id="diff1_text">{st.session_state.comparison_results["highlighted_diff1"]}</div>'
244
- st.markdown(diff1_content, unsafe_allow_html=True)
245
- with col2_diff:
246
- st.markdown("### Modified Document")
247
- diff2_content = f'<div style="border:1px solid #ccc; padding:10px; white-space: pre-wrap; font-family: monospace; font-size: 0.9em; max-height: 500px; overflow-y: auto;" id="diff2_text">{st.session_state.comparison_results["highlighted_diff2"]}</div>'
248
- st.markdown(diff2_content, unsafe_allow_html=True)
249
-
250
- # JavaScript for synchronized scrolling of the difference panes
251
- diff_scroll_script = f"""
252
- <script>
253
- function syncDiffScroll(id, otherId) {
254
- var element = document.getElementById(id);
255
- var otherElement = document.getElementById(otherId);
256
- if (element && otherElement) {
257
- element.addEventListener('scroll', function() {
258
- otherElement.scrollTop = element.scrollTop;
259
- });
260
- otherElement.addEventListener('scroll', function() {
261
- element.scrollTop = otherElement.scrollTop;
262
- });
263
- }
264
- }
265
- // Execute this script after the elements are rendered
266
- setTimeout(function() {
267
- syncDiffScroll('diff1_text', 'diff2_text');
268
- }, 200); // Increased delay to ensure rendering
269
- </script>
270
- """
271
- components.html(diff_scroll_script, height=0)
272
-
273
-
274
- # ===== QUESTION ANALYSIS SECTION =====
275
- st.header("3. Clause Analysis")
276
-
277
- try:
278
- question_selected = st.selectbox(
279
- 'Select a legal question to analyze:',
280
- questions_short,
281
- index=0,
282
- key="question_select"
283
- )
284
- question_idx = questions_short.index(question_selected)
285
- selected_question = questions[question_idx]
286
- except Exception as e:
287
- st.error(f"Error selecting question: {str(e)}")
288
- return
289
-
290
- if st.button("Analyze Both Documents"):
291
- if not (contract_text1.strip() and contract_text2.strip()):
292
- st.error("Please ensure both documents have readable content")
293
- return
294
-
295
- col1_analysis, col2_analysis = st.columns(2)
296
-
297
- with col1_analysis:
298
- st.subheader("First Document Analysis")
299
- with st.spinner('Processing first document...'):
300
- try:
301
- predictions1 = run_prediction([selected_question], contract_text1, 'marshmellow77/roberta-base-cuad', n_best_size=5)
302
- answer1 = predictions1.get('0', 'No answer found')
303
- st.session_state.analysis_results = st.session_state.analysis_results or {}
304
- st.session_state.analysis_results['doc1'] = answer1 if answer1 else "No relevant clause found"
305
- except Exception as e:
306
- st.session_state.analysis_results = st.session_state.analysis_results or {}
307
- st.session_state.analysis_results['doc1'] = f"Analysis failed: {str(e)}"
308
-
309
- with col2_analysis:
310
- st.subheader("Second Document Analysis")
311
- with st.spinner('Processing second document...'):
312
- try:
313
- predictions2 = run_prediction([selected_question], contract_text2, 'marshmellow77/roberta-base-cuad', n_best_size=5)
314
- answer2 = predictions2.get('0', 'No answer found')
315
- st.session_state.analysis_results = st.session_state.analysis_results or {}
316
- st.session_state.analysis_results['doc2'] = answer2 if answer2 else "No relevant clause found"
317
- except Exception as e:
318
- st.session_state.analysis_results = st.session_state.analysis_results or {}
319
- st.session_state.analysis_results['doc2'] = f"Analysis failed: {str(e)}"
320
-
321
- # Display analysis results
322
- if st.session_state.analysis_results:
323
- col1_answer, col2_answer = st.columns(2)
324
- with col1_answer:
325
- st.subheader("First Document Analysis")
326
- st.success(st.session_state.analysis_results.get('doc1', 'No analysis performed yet'))
327
 
328
- with col2_answer:
329
- st.subheader("Second Document Analysis")
330
- st.success(st.session_state.analysis_results.get('doc2', 'No analysis performed yet'))
331
 
332
- if __name__ == "__main__":
333
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
 
 
 
 
 
2
  import difflib
3
+ import streamlit as st
4
+ import pandas as pd
5
+ from PyPDF4 import PdfReader
6
+ from docx import Document
7
+ from difflib import HtmlDiff
8
+ from fpdf import FPDF
9
+ import base64
10
+ from streamlit.components.v1 import html
11
+
12
+ # Set page config
13
+ st.set_page_config(page_title="Contract Analysis Suite", layout="wide")
14
+ st.title("πŸ“„ Contract Analysis Suite")
15
+
16
+ # Session state initialization
17
+ if "comparison_results" not in st.session_state:
18
  st.session_state.comparison_results = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
def extract_text_from_pdf(file):
    """Return the text of every page of a PDF, joined by newlines.

    Args:
        file: PDF source handed unchanged to ``PdfReader`` (the script
            passes a Streamlit upload).

    Returns:
        The per-page texts joined with a newline. Pages whose
        ``extract_text()`` result is empty or ``None`` are skipped, so the
        result is an empty string when no page yields text.
    """
    # NOTE(review): the file imports ``PdfReader`` from ``PyPDF4``; confirm the
    # installed PyPDF4 release actually exposes ``PdfReader`` and
    # ``page.extract_text()`` -- TODO verify against the pinned version.
    pdf = PdfReader(file)
    # Extract each page exactly once: extraction is the expensive step and
    # was previously evaluated twice per page (once to filter, once to join).
    page_texts = (page.extract_text() for page in pdf.pages)
    return "\n".join(text for text in page_texts if text)
24
+
25
def extract_text_from_docx(file):
    """Return the paragraph texts of a .docx document, one per line.

    Args:
        file: Document source handed unchanged to ``Document``.

    Returns:
        The ``text`` of every entry in ``doc.paragraphs`` joined with a
        newline.
    """
    paragraph_texts = []
    for paragraph in Document(file).paragraphs:
        paragraph_texts.append(paragraph.text)
    return "\n".join(paragraph_texts)
28
+
29
def get_text(file):
    """Extract plain text from an uploaded document based on its extension.

    Args:
        file: Object with a ``name`` attribute and a ``read()`` method that
            returns bytes (the script passes Streamlit uploads).

    Returns:
        The document text as ``str``. ``.pdf`` and ``.docx`` names are routed
        to their dedicated extractors; anything else is decoded as UTF-8.
    """
    # Compare extensions case-insensitively so "CONTRACT.PDF" is not
    # mistakenly decoded as UTF-8 text.
    filename = file.name.lower()
    if filename.endswith(".pdf"):
        return extract_text_from_pdf(file)
    if filename.endswith(".docx"):
        return extract_text_from_docx(file)
    # "utf-8-sig" drops a leading byte-order mark, which would otherwise stick
    # to the first word and show up as a spurious difference. Undecodable
    # bytes are replaced instead of raising and aborting the comparison.
    return file.read().decode("utf-8-sig", errors="replace")
36
+
37
def compare_documents(text1, text2):
    """Word-diff two texts and return highlighted HTML plus a similarity score.

    Both texts are split on whitespace and compared with ``difflib.ndiff``.

    Args:
        text1: Original document text.
        text2: Modified document text.

    Returns:
        A ``(diff1, diff2, similarity_score)`` tuple. ``diff1`` is an HTML
        fragment holding the unchanged words plus the words only in
        ``text1`` wrapped in a red ``<span>``. ``diff2`` holds the unchanged
        words plus the words only in ``text2`` wrapped in a light-green
        ``<span>``. ``similarity_score`` is the percentage of unchanged
        entries among all counted diff entries (0 for two empty texts).
    """
    # Local import: the module-level name ``html`` is Streamlit's component
    # helper, so the stdlib module cannot be referenced by that name here.
    from html import escape

    diff1, diff2 = [], []
    added = removed = unchanged = 0

    for entry in difflib.ndiff(text1.split(), text2.split()):
        tag = entry[:2]
        # The words come from uploaded documents and end up inside HTML, so
        # markup characters must be neutralised before embedding.
        content = escape(entry[2:], quote=False)
        # ndiff prefixes unchanged entries with two spaces; the tag is a
        # two-character slice, so it must be compared against two spaces.
        if tag == "  ":
            diff1.append(content)
            diff2.append(content)
            unchanged += 1
        elif tag == "- ":
            diff1.append(f'<span style="background-color: red;">{content}</span>')
            removed += 1
        elif tag == "+ ":
            diff2.append(f'<span style="background-color: lightgreen;">{content}</span>')
            added += 1
        # Any other prefix (ndiff's "? " hint lines) carries no word: skip it.

    total = added + removed + unchanged
    similarity_score = 100 * unchanged / total if total > 0 else 0.0
    return " ".join(diff1), " ".join(diff2), similarity_score
59
+
60
def generate_pdf_report(html1, html2, score):
    """Write a plain-text PDF summary of a comparison and return its path.

    Args:
        html1: Highlighted HTML fragment for the first document.
        html2: Highlighted HTML fragment for the second document.
        score: Similarity percentage, printed with two decimals.

    Returns:
        The path of the written PDF (``/tmp/report.pdf``).
    """
    import re
    from html import unescape

    def _to_report_text(fragment):
        """Turn a highlighted HTML fragment into latin-1-safe plain text."""
        # Replace the *whole* opening tag. Swapping only its prefix left CSS
        # residue such as ' red;">' in front of every highlighted word.
        text = re.sub(r'<span style="background-color:[^>]*>', "\n[", fragment)
        text = text.replace("</span>", "]")
        # Decode HTML entities so the PDF shows the literal characters.
        text = unescape(text)
        # FPDF's built-in fonts such as Arial only cover latin-1. Substitute
        # anything else (e.g. curly quotes) instead of failing on output.
        return text.encode("latin-1", errors="replace").decode("latin-1")

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    pdf.multi_cell(0, 10, f"Contract Comparison Report\n\nSimilarity Score: {score:.2f}%\n\nDocument 1 Differences:\n\n", align='L')
    pdf.multi_cell(0, 10, _to_report_text(html1), align='L')
    # The second document's differences start on their own page.
    pdf.add_page()
    pdf.multi_cell(0, 10, "Document 2 Differences:\n\n", align='L')
    pdf.multi_cell(0, 10, _to_report_text(html2), align='L')

    file_path = "/tmp/report.pdf"
    pdf.output(file_path)
    return file_path
73
+
74
+ # File upload section
75
+ st.sidebar.header("Upload Documents")
76
+ file1 = st.sidebar.file_uploader("Choose the original document", type=["pdf", "docx", "txt"], key="file1")
77
+ file2 = st.sidebar.file_uploader("Choose the modified document", type=["pdf", "docx", "txt"], key="file2")
78
+
79
+ if file1 and file2:
80
+ with st.spinner("Analyzing documents..."):
81
+ text1 = get_text(file1)
82
+ text2 = get_text(file2)
83
+
84
+ diff1, diff2, score = compare_documents(text1, text2)
85
+ st.session_state.comparison_results = {
86
+ "highlighted_diff1": diff1,
87
+ "highlighted_diff2": diff2,
88
+ "similarity_score": score
89
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
+ if st.session_state.comparison_results:
92
+ st.markdown("## Comparison Results")
93
+ st.markdown(f"**Similarity Score:** {st.session_state.comparison_results['similarity_score']:.2f}%")
94
+
95
+ st.markdown("### Original vs Modified (Synchronized View)")
96
+ html(f"""
97
+ <div style="display: flex; gap: 10px;">
98
+ <div id="panel1" style="flex: 1; border:1px solid #ccc; padding:10px; overflow-y: scroll; max-height:500px; font-family: monospace; font-size: 0.9em; white-space: pre-wrap; background-color: #fdfdfd;">
99
+ {st.session_state.comparison_results["highlighted_diff1"]}
100
+ </div>
101
+ <div id="panel2" style="flex: 1; border:1px solid #ccc; padding:10px; overflow-y: scroll; max-height:500px; font-family: monospace; font-size: 0.9em; white-space: pre-wrap; background-color: #fdfdfd;">
102
+ {st.session_state.comparison_results["highlighted_diff2"]}
103
+ </div>
104
+ </div>
105
 
106
+ <script>
107
+ const panel1 = document.getElementById('panel1');
108
+ const panel2 = document.getElementById('panel2');
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
+ function syncScroll(source, target) {{
111
+ target.scrollTop = source.scrollTop;
112
+ }}
113
 
114
+ panel1.addEventListener('scroll', () => syncScroll(panel1, panel2));
115
+ panel2.addEventListener('scroll', () => syncScroll(panel2, panel1));
116
+ </script>
117
+ """, height=520)
118
+
119
+ if st.button("πŸ“„ Generate PDF Report"):
120
+ try:
121
+ file_path = generate_pdf_report(
122
+ st.session_state.comparison_results["highlighted_diff1"],
123
+ st.session_state.comparison_results["highlighted_diff2"],
124
+ st.session_state.comparison_results["similarity_score"]
125
+ )
126
+ with open(file_path, "rb") as f:
127
+ base64_pdf = base64.b64encode(f.read()).decode('utf-8')
128
+ href = f'<a href="data:application/pdf;base64,{base64_pdf}" download="comparison_report.pdf">πŸ“₯ Click to Download Report</a>'
129
+ st.markdown(href, unsafe_allow_html=True)
130
+ except Exception as e:
131
+ st.error(f"Failed to generate PDF report: {e}")