ludigija committed on
Commit
1cb59c0
·
verified ·
1 Parent(s): f623e18

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +100 -239
app.py CHANGED
@@ -1,244 +1,105 @@
1
  import streamlit as st
2
- from predict import run_prediction
3
- from io import StringIO
4
- import PyPDF4
5
- import docx2txt
6
- import pdfplumber
7
  import difflib
 
8
  from sklearn.feature_extraction.text import TfidfVectorizer
9
  from sklearn.metrics.pairwise import cosine_similarity
10
-
11
# ========== CONFIG ==========
# Page-wide settings: wide layout plus the browser tab title and icon.
st.set_page_config(layout="wide", page_title="Contract Analysis Suite", page_icon="📑")

# ========== SESSION STATE ==========
# Seed both result slots with None only when they are absent, so values
# stored by an earlier button click are not overwritten on a script rerun.
for _state_key in ('comparison_results', 'analysis_results'):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = None
19
-
20
- # ========== CACHED HELPERS ==========
21
@st.cache_data(show_spinner=False)
def load_questions():
    """Return the non-blank lines of ``data/questions.txt``, stripped.

    The function is wrapped in ``st.cache_data``, so the file is only read
    when the cache has no entry for it.

    Returns:
        list[str]: One entry per non-empty line, or ``[]`` when the file
        cannot be read or decoded (the error is reported via ``st.error``).
    """
    try:
        # Explicit encoding: the platform default depends on the host locale.
        with open('data/questions.txt', encoding='utf-8') as handle:
            stripped = (line.strip() for line in handle)
            return [line for line in stripped if line]
    except (OSError, UnicodeError) as e:
        st.error(f"Error loading questions: {e}")
        return []
29
-
30
@st.cache_data(show_spinner=False)
def load_questions_short():
    """Return the non-blank lines of ``data/questions_short.txt``, stripped.

    The function is wrapped in ``st.cache_data``, so the file is only read
    when the cache has no entry for it.

    Returns:
        list[str]: One entry per non-empty line, or ``[]`` when the file
        cannot be read or decoded (the error is reported via ``st.error``).
    """
    try:
        # Explicit encoding: the platform default depends on the host locale.
        with open('data/questions_short.txt', encoding='utf-8') as handle:
            stripped = (line.strip() for line in handle)
            return [line for line in stripped if line]
    except (OSError, UnicodeError) as e:
        st.error(f"Error loading short questions: {e}")
        return []
38
-
39
- # ========== FILE PARSING ==========
40
def extract_text_from_pdf(uploaded_file):
    """Extract the text of every page of a PDF using pdfplumber.

    Each page's text (or an empty string when the page yields none) is
    followed by a blank line; the combined text is stripped before return.

    Args:
        uploaded_file: Whatever ``pdfplumber.open`` accepts.

    Returns:
        str: The extracted text, or ``""`` if anything raises (the error is
        reported via ``st.error``).
    """
    try:
        with pdfplumber.open(uploaded_file) as document:
            chunks = []
            for pdf_page in document.pages:
                try:
                    page_text = pdf_page.extract_text_formatted()
                except AttributeError:
                    # Page objects without that method fall back to the
                    # plain extractor.
                    page_text = pdf_page.extract_text()
                chunks.append((page_text or "") + "\n\n")
            return "".join(chunks).strip()
    except Exception as e:
        st.error(f"PDF extraction error: {str(e)}")
        return ""
54
-
55
def load_contract(file):
    """Read an uploaded ``.txt``, ``.pdf`` or ``.docx`` file into plain text.

    Args:
        file: Uploaded file object exposing ``name`` and, for text files,
            ``getvalue()``. A falsy value yields ``""``.

    Returns:
        str: The document text. ``""`` is returned for a missing file, an
        unsupported extension (after ``st.warning``) or any failure while
        reading (after ``st.error``).
    """
    if not file:
        return ""
    try:
        ext = file.name.split('.')[-1].lower()
        if ext == 'txt':
            # "utf-8-sig" drops a leading byte-order mark; with plain "utf-8"
            # the BOM stays in the text because str.strip() does not treat
            # U+FEFF as whitespace.
            return file.getvalue().decode("utf-8-sig").strip()
        if ext == 'pdf':
            content = extract_text_from_pdf(file)
            if content:
                return content
            # pdfplumber produced nothing: retry with PyPDF4.
            reader = PyPDF4.PdfFileReader(file)
            return "\n\n".join(page.extractText() for page in reader.pages)
        if ext == 'docx':
            return docx2txt.process(file).strip()
        st.warning("Unsupported file type")
        return ""
    except Exception as e:
        # UI boundary: report the problem instead of crashing the app.
        st.error(f"Error loading file: {e}")
        return ""
76
-
77
- # ========== TEXT UTILS ==========
78
def highlight_differences_words(text1, text2):
    """Build two HTML strings that highlight word-level differences.

    Both texts are split on whitespace and compared with ``difflib.Differ``.
    A deleted word is shown in red on the left. If the very next diff entry
    is an insertion, that word is shown in yellow on the right as a
    replacement; any other insertion is shown in green. Unchanged words are
    copied to both sides.

    Words come from user documents, so each one is HTML-escaped before being
    embedded; otherwise markup inside a document would be interpreted by the
    browser.

    Args:
        text1: Original text.
        text2: Modified text.

    Returns:
        tuple[str, str]: HTML for the left (original) and right (modified)
        panes, each stripped of surrounding whitespace.
    """
    from html import escape

    entries = list(difflib.Differ().compare(text1.split(), text2.split()))
    left, right = [], []
    paired = False  # True when the current entry was already rendered as a replacement

    for idx, entry in enumerate(entries):
        if paired:
            # Keep the single-space placeholder the original emitted here.
            left.append(" ")
            right.append(" ")
            paired = False
            continue

        marker, word = entry[:2], escape(entry[2:])
        if marker == "- ":
            left.append(f'<span style="background-color:#ffcccc;">{word}</span> ')
            following = entries[idx + 1] if idx + 1 < len(entries) else ""
            if following.startswith("+ "):
                replacement = escape(following[2:])
                right.append(f'<span style="background-color:#ffffcc;">{replacement}</span> ')
                paired = True
            else:
                right.append(" ")
        elif marker == "+ ":
            # An insertion directly after a deletion never reaches this
            # branch (it is consumed above), so this is a pure insertion.
            right.append(f'<span style="background-color:#ccffcc;">{word}</span> ')
            left.append(" ")
        elif marker == "  ":
            left.append(word + " ")
            right.append(word + " ")
        # "? " hint entries from Differ carry no words and are skipped.

    return "".join(left).strip(), "".join(right).strip()
104
-
105
def calculate_similarity(text1, text2):
    """Return the similarity of two texts as a percentage (0-100).

    The score is the cosine similarity of the texts' TF-IDF vectors, using a
    token pattern that also accepts single-character tokens. If that
    computation raises, the ``difflib.SequenceMatcher`` ratio is used
    instead.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        float: ``0.0`` when either text is blank, otherwise the score
        multiplied by 100.
    """
    if not text1.strip() or not text2.strip():
        return 0.0
    try:
        vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b')
        tfidf = vectorizer.fit_transform([text1, text2])
        score = cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0]
        return float(score) * 100
    except Exception:
        # Deliberate best-effort fallback, but no longer a bare ``except:``,
        # which would also swallow KeyboardInterrupt and SystemExit.
        return difflib.SequenceMatcher(None, text1, text2).ratio() * 100
115
-
116
- # ========== MAIN APP ==========
117
def _analyze_into_state(question, text, result_key, heading):
    """Run the clause model on one document and store the answer in session state."""
    st.subheader(heading)
    with st.spinner("Processing..."):
        try:
            prediction = run_prediction(
                [question], text, 'marshmellow77/roberta-base-cuad', n_best_size=5
            )
            outcome = prediction.get('0', 'No answer')
        except Exception as e:
            outcome = f"Failed: {e}"
    st.session_state.analysis_results[result_key] = outcome


def _render_comparison(results):
    """Show the similarity metric and, at 70% or above, the side-by-side diff."""
    sim = results['similarity']
    st.metric("Document Similarity Score", f"{sim:.2f}%")

    if sim < 70:
        st.warning("Similarity below 70%. Skipping visual diff display.")
        return

    st.markdown("### Visual Difference Highlighting")
    pane_style = (
        "width: 100%; height: 500px; overflow-y: auto; padding: 10px; "
        "font-family: monospace; border: 1px solid #ccc;"
    )
    # NOTE(review): a <script> injected through st.markdown may not be
    # executed by the browser, in which case the scroll sync is inert -
    # confirm, and consider st.components.v1.html instead.
    sync_scroll_script = "\n".join([
        '<script>',
        'const left = document.getElementById("left");',
        'const right = document.getElementById("right");',
        'left.onscroll = function() { right.scrollTop = left.scrollTop; };',
        'right.onscroll = function() { left.scrollTop = right.scrollTop; };',
        '</script>',
    ])
    # Lines are joined without leading indentation: Markdown renders lines
    # indented by four or more spaces as a code block.
    html = "\n".join([
        '<div style="display: flex; gap: 20px;">',
        f'<div id="left" style="{pane_style}">',
        results['diff1'],
        '</div>',
        f'<div id="right" style="{pane_style}">',
        results['diff2'],
        '</div>',
        '</div>',
        sync_scroll_script,
    ])
    st.markdown(html, unsafe_allow_html=True)


def main():
    """Render the app: upload two documents, compare them, analyze a clause.

    Steps, in order:
      1. Load the long and short question lists; stop if they are empty or
         differ in length.
      2. Upload two documents and display their text.
      3. On "Compare Documents", store the similarity score and word diff in
         ``st.session_state.comparison_results`` and render them.
      4. On "Analyze Both Documents", run ``run_prediction`` for the selected
         question on each document and store the answers in
         ``st.session_state.analysis_results``.
    """
    st.title("📑 Contract Analysis Suite")
    st.markdown("Compare documents and analyze legal clauses using AI-powered tools.")

    questions = load_questions()
    questions_short = load_questions_short()

    if not questions or not questions_short or len(questions) != len(questions_short):
        st.error("Questions failed to load properly.")
        return

    st.header("1. Upload Documents")
    col1, col2 = st.columns(2)

    with col1:
        file1 = st.file_uploader("Upload First Document", type=["txt", "pdf", "docx"], key="file1")
        text1 = load_contract(file1) if file1 else ""
        display1 = st.empty()

    with col2:
        file2 = st.file_uploader("Upload Second Document", type=["txt", "pdf", "docx"], key="file2")
        text2 = load_contract(file2) if file2 else ""
        display2 = st.empty()

    if file1:
        display1.text_area("Document 1 Content", value=text1, height=400, key="area1")
    if file2:
        display2.text_area("Document 2 Content", value=text2, height=400, key="area2")

    if not (file1 and file2):
        st.warning("Please upload both documents.")
        return

    st.header("2. Document Comparison")
    with st.expander("Show Document Differences", expanded=True):
        if st.button("Compare Documents"):
            with st.spinner("Analyzing..."):
                sim = calculate_similarity(text1, text2)
                diff1, diff2 = highlight_differences_words(text1, text2)
                st.session_state.comparison_results = {
                    'similarity': sim,
                    'diff1': diff1,
                    'diff2': diff2,
                }

        if st.session_state.comparison_results:
            _render_comparison(st.session_state.comparison_results)

    # ========== CLAUSE ANALYSIS ==========
    st.header("3. Clause Analysis")
    try:
        question_short = st.selectbox("Select a legal question to analyze:", questions_short)
        question = questions[questions_short.index(question_short)]
    except (ValueError, IndexError):
        st.error("Error selecting question")
        return

    if st.button("Analyze Both Documents"):
        if not (text1.strip() and text2.strip()):
            st.error("Ensure both documents have content.")
            return

        # Create the result dict before any prediction runs. Previously it
        # was created only after a successful prediction, so the failure
        # path indexed into None and raised TypeError.
        if not st.session_state.analysis_results:
            st.session_state.analysis_results = {}

        col1, col2 = st.columns(2)
        with col1:
            _analyze_into_state(question, text1, 'doc1', "First Document Analysis")
        with col2:
            _analyze_into_state(question, text2, 'doc2', "Second Document Analysis")

    if st.session_state.analysis_results:
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("First Document Result")
            st.success(st.session_state.analysis_results.get('doc1', 'No analysis yet'))
        with col2:
            st.subheader("Second Document Result")
            st.success(st.session_state.analysis_results.get('doc2', 'No analysis yet'))


if __name__ == "__main__":
    main()
 
1
  import streamlit as st
 
 
 
 
 
2
  import difflib
3
+ from sentence_transformers import SentenceTransformer
4
  from sklearn.feature_extraction.text import TfidfVectorizer
5
  from sklearn.metrics.pairwise import cosine_similarity
6
+ from xhtml2pdf import pisa
7
+ import base64
8
+ import os
9
+ from io import BytesIO
10
+
11
@st.cache_resource(show_spinner=False)
def _load_sbert_model():
    """Build the SBERT encoder once and let Streamlit reuse it across reruns.

    Streamlit re-executes the script on every interaction; without caching
    the model would be constructed again each time.
    """
    return SentenceTransformer('all-MiniLM-L6-v2')


# Load SBERT model
sbert_model = _load_sbert_model()
13
+
14
def compute_sbert_similarity(text1, text2):
    """Return the cosine similarity of the SBERT embeddings of two texts.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        float: Cosine similarity of the two embeddings.
    """
    # NOTE(review): sentence encoders usually truncate input beyond a maximum
    # sequence length, so long documents may be compared on their opening
    # part only - confirm for this model.
    # One batched call replaces the two separate encoder passes.
    emb1, emb2 = sbert_model.encode([text1, text2])
    return float(cosine_similarity([emb1], [emb2])[0][0])
19
+
20
def compute_tfidf_similarity(text1, text2):
    """Return the cosine similarity of the TF-IDF vectors of two texts.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        float: ``0.0`` when either text is blank; otherwise the cosine
        similarity of the two TF-IDF vectors. If the vectorizer raises
        ``ValueError``, the ``difflib.SequenceMatcher`` ratio is returned
        instead.
    """
    if not text1.strip() or not text2.strip():
        return 0.0
    try:
        vectors = TfidfVectorizer().fit_transform([text1, text2])
    except ValueError:
        # Raised when no token survives tokenization (empty vocabulary);
        # fall back to a character-level ratio instead of crashing the app.
        return difflib.SequenceMatcher(None, text1, text2).ratio()
    return float(cosine_similarity(vectors[0:1], vectors[1:2])[0][0])
24
+
25
def html_diff(a, b):
    """Return an HTML table comparing two texts line by line.

    Both texts are split with ``str.splitlines`` and passed to
    ``difflib.HtmlDiff.make_table`` in context mode with two context lines.
    The columns are titled "Original" and "Modified".

    Args:
        a: Original text.
        b: Modified text.

    Returns:
        str: The HTML ``<table>`` markup produced by ``make_table``.
    """
    original_lines = a.splitlines()
    modified_lines = b.splitlines()
    table_builder = difflib.HtmlDiff()
    return table_builder.make_table(
        original_lines,
        modified_lines,
        fromdesc='Original',
        todesc='Modified',
        context=True,
        numlines=2,
    )
28
+
29
def convert_html_to_pdf(source_html):
    """Render an HTML string to PDF with xhtml2pdf.

    Args:
        source_html: HTML markup handed to ``pisa.CreatePDF``.

    Returns:
        bytes | None: The PDF bytes, or ``None`` when the conversion status
        reports an error.
    """
    with BytesIO() as buffer:
        status = pisa.CreatePDF(source_html, dest=buffer)
        return None if status.err else buffer.getvalue()
35
+
36
def create_download_link(pdf_data, filename="report.pdf"):
    """Return an HTML anchor that downloads ``pdf_data`` as a PDF.

    The bytes are embedded in the link as a base64 ``data:`` URI.

    Args:
        pdf_data: Raw PDF bytes.
        filename: Name suggested to the browser for the download. It is
            HTML-escaped so quotes or angle brackets cannot break out of
            the ``download`` attribute.

    Returns:
        str: The ``<a>`` element markup.
    """
    from html import escape

    b64 = base64.b64encode(pdf_data).decode("ascii")
    safe_name = escape(filename, quote=True)
    return (
        f'<a href="data:application/pdf;base64,{b64}" '
        f'download="{safe_name}">Download PDF Report</a>'
    )
40
+
41
st.set_page_config(layout="wide")
st.title("Advanced Document Comparison Tool with Semantic and Syntactic Analysis")

# Styles for the embedded report. HtmlDiff.make_table() returns only the
# <table>; the diff_* classes it uses must be styled by the surrounding page,
# otherwise added, changed and removed cells are not coloured.
_REPORT_CSS = """
    .diff { font-family: Courier; border: 1px solid #ccc; overflow-x: scroll; }
    .diff th, .diff td { padding: 5px; }
    .diff_header { background-color: #e0e0e0; }
    td.diff_header { text-align: right; }
    .diff_next { background-color: #c0c0c0; }
    .diff_add { background-color: #aaffaa; }
    .diff_chg { background-color: #ffff77; }
    .diff_sub { background-color: #ffaaaa; }
"""

col1, col2 = st.columns(2)

with col1:
    uploaded_file1 = st.file_uploader("Upload Original Document", type=["txt", "md"])
with col2:
    uploaded_file2 = st.file_uploader("Upload Modified Document", type=["txt", "md"])

if uploaded_file1 and uploaded_file2:
    try:
        # getvalue() returns the whole buffer regardless of the read
        # position; "utf-8-sig" also drops a leading byte-order mark.
        original_text = uploaded_file1.getvalue().decode("utf-8-sig")
        modified_text = uploaded_file2.getvalue().decode("utf-8-sig")
    except UnicodeDecodeError:
        st.error("Both documents must be UTF-8 encoded text files.")
        st.stop()

    if not original_text.strip() or not modified_text.strip():
        # Blank input would make the TF-IDF vectorizer raise.
        st.warning("Both documents must contain text.")
        st.stop()

    sbert_score = compute_sbert_similarity(original_text, modified_text)
    tfidf_score = compute_tfidf_similarity(original_text, modified_text)

    html_comparison = html_diff(original_text, modified_text)

    st.markdown("### 🔍 Similarity Scores")
    st.markdown(f"**SBERT Semantic Similarity:** {sbert_score:.4f}")
    st.markdown(f"**TF-IDF Syntactic Similarity:** {tfidf_score:.4f}")

    st.markdown("### 📑 Comparison Result")

    # The earlier iframe scroll-sync script was removed: the report holds a
    # single table and no iframes, so its "exactly two iframes" guard could
    # never pass.
    html_report = f'''
<html>
<head>
<style>{_REPORT_CSS}</style>
</head>
<body>
{html_comparison}
</body>
</html>
'''

    st.components.v1.html(html_report, height=700, scrolling=True)

    if st.button("Generate PDF Report"):
        pdf_bytes = convert_html_to_pdf(html_report)
        if pdf_bytes:
            st.markdown(create_download_link(pdf_bytes), unsafe_allow_html=True)
        else:
            st.error(" Failed to generate PDF. Check for HTML formatting issues.")