Update app.py
app.py CHANGED
@@ -7,22 +7,17 @@ import pdfplumber
 import difflib
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
-import streamlit.components.v1 as components

-# ==========
-st.set_page_config(
-    layout="wide",
-    page_title="Contract Analysis Suite",
-    page_icon="📑"
-)
+# ========== CONFIG ==========
+st.set_page_config(layout="wide", page_title="Contract Analysis Suite", page_icon="📑")

+# ========== SESSION STATE ==========
 if 'comparison_results' not in st.session_state:
     st.session_state.comparison_results = None
 if 'analysis_results' not in st.session_state:
     st.session_state.analysis_results = None

-# ========== CACHED
+# ========== CACHED HELPERS ==========
 @st.cache_data(show_spinner=False)
 def load_questions():
     try:
@@ -41,293 +36,209 @@ def load_questions_short():
         st.error(f"Error loading short questions: {str(e)}")
         return []

-# ==========
+# ========== FILE PARSING ==========
 def extract_text_from_pdf(uploaded_file):
     try:
         with pdfplumber.open(uploaded_file) as pdf:
             full_text = ""
             for page in pdf.pages:
                 try:
-                    text = page.extract_text_formatted()
+                    text = page.extract_text_formatted()
                 except AttributeError:
                     text = page.extract_text()
-                else:
-                    full_text += page.extract_text() + "\n\n"
-            return full_text if full_text.strip() else ""
+                full_text += (text or "") + "\n\n"
+            return full_text.strip()
     except Exception as e:
         st.error(f"PDF extraction error: {str(e)}")
         return ""

+def load_contract(file):
+    if not file:
+        return ""
+    try:
+        ext = file.name.split('.')[-1].lower()
+        if ext == 'txt':
+            return StringIO(file.getvalue().decode("utf-8")).read().strip()
+        elif ext == 'pdf':
+            content = extract_text_from_pdf(file)
+            if not content:
+                pdfReader = PyPDF4.PdfFileReader(file)
+                return "\n\n".join([p.extractText() for p in pdfReader.pages])
+            return content
+        elif ext == 'docx':
+            return docx2txt.process(file).strip()
+        else:
+            st.warning("Unsupported file type")
+            return ""
+    except Exception as e:
+        st.error(f"Error loading file: {str(e)}")
+        return ""

+# ========== TEXT UTILS ==========
 def highlight_differences_words(text1, text2):
     differ = difflib.Differ()
     diff = list(differ.compare(text1.split(), text2.split()))
-    highlighted_text1 = ""
-    highlighted_text2 = ""
+    h1, h2 = "", ""
     for i, word in enumerate(diff):
         if word.startswith("- "):
-            highlighted_text2 += f'<span style="background-color:#ffffcc; display: inline-block;">{added_word}</span>'  # Yellow for changed in text2
-            diff[i + 1] = ' '  # Consume the addition
+            w = word[2:]
+            h1 += f'<span style="background-color:#ffcccc;">{w}</span> '
+            if i+1 < len(diff) and diff[i+1].startswith("+ "):
+                h2 += f'<span style="background-color:#ffffcc;">{diff[i+1][2:]}</span> '
+                diff[i+1] = ' '
             else:
+                h2 += " "
         elif word.startswith("+ "):
-            highlighted_text1 += f'<span style="background-color:#ffffcc; display: inline-block;">{diff[i-1][2:]}</span>'  # Yellow for changed in text1
+            w = word[2:]
+            h2 += f'<span style="background-color:#ccffcc;">{w}</span> '
+            if i-1 >= 0 and diff[i-1].startswith("- "):
+                h1 += f'<span style="background-color:#ffffcc;">{diff[i-1][2:]}</span> '
                 diff[i-1] = ' '
             else:
+                h1 += " "
         elif word.startswith(" "):
+            w = word[2:] + " "
+            h1 += w
+            h2 += w
+    return h1.strip(), h2.strip()

-    return highlighted_text1, highlighted_text2
 def calculate_similarity(text1, text2):
     if not text1.strip() or not text2.strip():
         return 0.0
     try:
         vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b')
+        tfidf = vectorizer.fit_transform([text1, text2])
+        sim = cosine_similarity(tfidf[0:1], tfidf[1:2])
+        return sim[0][0] * 100
+    except:
         return difflib.SequenceMatcher(None, text1, text2).ratio() * 100

-def load_contract(file):
-    if file is None:
-        return ""
-
-    ext = file.name.split('.')[-1].lower()
-    try:
-        if ext == 'txt':
-            content = StringIO(file.getvalue().decode("utf-8")).read()
-        elif ext == 'pdf':
-            content = extract_text_from_pdf(file)
-            if not content:
-                # Fallback to PyPDF4
-                pdfReader = PyPDF4.PdfFileReader(file)
-                full_text = ""
-                for page in pdfReader.pages:
-                    text = page.extractText()
-                    if text:
-                        full_text += text + "\n\n"
-                content = full_text
-        elif ext == 'docx':
-            content = docx2txt.process(file)
-        else:
-            st.warning('Unsupported file type')
-            return ""
-        return content.strip() if content else ""
-    except Exception as e:
-        st.error(f"Error loading {ext.upper()} file: {str(e)}")
-        return ""
 # ========== MAIN APP ==========
 def main():
+    st.title("📑 Contract Analysis Suite")
+    st.markdown("Compare documents and analyze legal clauses using AI-powered tools.")
+
     questions = load_questions()
     questions_short = load_questions_short()

     if not questions or not questions_short or len(questions) != len(questions_short):
+        st.error("Questions failed to load properly.")
         return

-    st.title("📑 Contract Analysis Suite")
-    st.markdown("""
-    Compare documents and analyze legal clauses using AI-powered question answering.
-    """)
-
-    # ===== DOCUMENT UPLOAD SECTION =====
     st.header("1. Upload Documents")
     col1, col2 = st.columns(2)

     with col1:
-            key="file1"
-        )
-        contract_text1 = load_contract(uploaded_file1) if uploaded_file1 else ""
-        doc1_container = st.empty()
+        file1 = st.file_uploader("Upload First Document", type=["txt", "pdf", "docx"], key="file1")
+        text1 = load_contract(file1) if file1 else ""
+        display1 = st.empty()

     with col2:
-            key="file2"
-        )
-        contract_text2 = load_contract(uploaded_file2) if uploaded_file2 else ""
-        doc2_container = st.empty()
+        file2 = st.file_uploader("Upload Second Document", type=["txt", "pdf", "docx"], key="file2")
+        text2 = load_contract(file2) if file2 else ""
+        display2 = st.empty()

-    if uploaded_file2:
-        doc2_content = f'<div style="border:1px solid #ccc; padding:10px; white-space: pre-wrap; font-family: monospace; font-size: 0.9em; height: 400px; overflow-y: auto;" id="doc2_text">{contract_text2}</div>'
-        doc2_container.markdown(doc2_content, unsafe_allow_html=True)
+    if file1:
+        display1.text_area("Document 1 Content", value=text1, height=400, key="area1")
+    if file2:
+        display2.text_area("Document 2 Content", value=text2, height=400, key="area2")

-    <script>
-    function syncScroll(id, otherId) {
-        var element = document.getElementById(id);
-        var otherElement = document.getElementById(otherId);
-        if (element && otherElement) {
-            element.addEventListener('scroll', function() {
-                otherElement.scrollTop = element.scrollTop;
-            });
-            otherElement.addEventListener('scroll', function() {
-                element.scrollTop = otherElement.scrollTop;
-            });
-        }
-    }
-    window.onload = function() {
-        syncScroll('doc1_text', 'doc2_text');
-    };
-    </script>
-    """
-    components.html(scroll_script, height=0)
-
-    if not (uploaded_file1 and uploaded_file2):
-        st.warning("Please upload both documents to proceed")
+    if not (file1 and file2):
+        st.warning("Please upload both documents.")
         return

-    # ===== DOCUMENT COMPARISON SECTION =====
     st.header("2. Document Comparison")
     with st.expander("Show Document Differences", expanded=True):
         if st.button("Compare Documents"):
-                similarity_score = calculate_similarity(contract_text1, contract_text2)
-                highlighted_diff1, highlighted_diff2 = highlight_differences_words(contract_text1, contract_text2)
+            with st.spinner("Analyzing..."):
+                sim = calculate_similarity(text1, text2)
+                diff1, diff2 = highlight_differences_words(text1, text2)
                 st.session_state.comparison_results = {
+                    'similarity': sim,
+                    'diff1': diff1,
+                    'diff2': diff2,
                 }

-    # Display comparison results
     if st.session_state.comparison_results:
-            window.onload = function() {
-                syncDiffScroll('diff1_text', 'diff2_text');
-            };
-            </script>
-            """
-            components.html(diff_scroll_script, height=0)
+        sim = st.session_state.comparison_results['similarity']
+        st.metric("Document Similarity Score", f"{sim:.2f}%")
+
+        if sim >= 70:
+            st.markdown("### Visual Difference Highlighting")
+            sync_scroll_script = """
+            <script>
+                const left = document.getElementById("left");
+                const right = document.getElementById("right");
+
+                left.onscroll = function() {
+                    right.scrollTop = left.scrollTop;
+                };
+                right.onscroll = function() {
+                    left.scrollTop = right.scrollTop;
+                };
+            </script>
+            """
+
+            html = f"""
+            <div style="display: flex; gap: 20px;">
+                <div id="left" style="width: 100%; height: 500px; overflow-y: auto; padding: 10px; font-family: monospace; border: 1px solid #ccc;">
+                    {st.session_state.comparison_results['diff1']}
+                </div>
+                <div id="right" style="width: 100%; height: 500px; overflow-y: auto; padding: 10px; font-family: monospace; border: 1px solid #ccc;">
+                    {st.session_state.comparison_results['diff2']}
+                </div>
+            </div>
+            {sync_scroll_script}
+            """
+            st.markdown(html, unsafe_allow_html=True)
+        else:
+            st.warning("Similarity below 70%. Skipping visual diff display.")

+    # ========== CLAUSE ANALYSIS ==========
     st.header("3. Clause Analysis")
     try:
-        )
-        question_idx = questions_short.index(question_selected)
-        selected_question = questions[question_idx]
-    except Exception as e:
-        st.error(f"Error selecting question: {str(e)}")
+        question_short = st.selectbox("Select a legal question to analyze:", questions_short)
+        idx = questions_short.index(question_short)
+        question = questions[idx]
+    except:
+        st.error("Error selecting question")
         return

     if st.button("Analyze Both Documents"):
+        if not (text1.strip() and text2.strip()):
+            st.error("Ensure both documents have content.")
             return

+        col1, col2 = st.columns(2)

+        with col1:
             st.subheader("First Document Analysis")
+            with st.spinner("Processing..."):
                 try:
-                    answer1 = predictions1.get('0', 'No answer found')
+                    ans1 = run_prediction([question], text1, 'marshmellow77/roberta-base-cuad', n_best_size=5).get('0', 'No answer')
                     st.session_state.analysis_results = st.session_state.analysis_results or {}
+                    st.session_state.analysis_results['doc1'] = ans1
                 except Exception as e:
-                    st.session_state.analysis_results['doc1'] = f"Analysis failed: {str(e)}"
+                    st.session_state.analysis_results['doc1'] = f"Failed: {e}"

+        with col2:
             st.subheader("Second Document Analysis")
+            with st.spinner("Processing..."):
                 try:
-                    answer2 = predictions2.get('0', 'No answer found')
+                    ans2 = run_prediction([question], text2, 'marshmellow77/roberta-base-cuad', n_best_size=5).get('0', 'No answer')
                     st.session_state.analysis_results = st.session_state.analysis_results or {}
+                    st.session_state.analysis_results['doc2'] = ans2
                 except Exception as e:
-                    st.session_state.analysis_results['doc2'] = f"Analysis failed: {str(e)}"
+                    st.session_state.analysis_results['doc2'] = f"Failed: {e}"

-    # Display analysis results
     if st.session_state.analysis_results:
-            st.success(st.session_state.analysis_results.get('doc2', 'No analysis performed yet'))
+        col1, col2 = st.columns(2)
+        with col1:
+            st.subheader("First Document Result")
+            st.success(st.session_state.analysis_results.get('doc1', 'No analysis yet'))
+        with col2:
+            st.subheader("Second Document Result")
+            st.success(st.session_state.analysis_results.get('doc2', 'No analysis yet'))

 if __name__ == "__main__":
-    main()
+    main()