Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,244 +1,105 @@
|
|
1 |
import streamlit as st
|
2 |
-
from predict import run_prediction
|
3 |
-
from io import StringIO
|
4 |
-
import PyPDF4
|
5 |
-
import docx2txt
|
6 |
-
import pdfplumber
|
7 |
import difflib
|
|
|
8 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
9 |
from sklearn.metrics.pairwise import cosine_similarity
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
return
|
38 |
-
|
39 |
-
|
40 |
-
def
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
else:
|
71 |
-
st.
|
72 |
-
return ""
|
73 |
-
except Exception as e:
|
74 |
-
st.error(f"Error loading file: {str(e)}")
|
75 |
-
return ""
|
76 |
-
|
77 |
-
# ========== TEXT UTILS ==========
|
78 |
-
def highlight_differences_words(text1, text2):
    """Build word-level HTML diffs of two texts.

    Returns a pair of HTML strings: the first text with removed words
    highlighted red (replaced words yellow), and the second text with
    added words highlighted green (replacement words yellow).
    """
    tokens = list(difflib.Differ().compare(text1.split(), text2.split()))
    left = ""
    right = ""
    for idx, entry in enumerate(tokens):
        if entry.startswith("- "):
            left += f'<span style="background-color:#ffcccc;">{entry[2:]}</span> '
            follower = idx + 1
            if follower < len(tokens) and tokens[follower].startswith("+ "):
                # A deletion immediately followed by an insertion is a
                # replacement: paint the incoming word yellow on the right.
                right += f'<span style="background-color:#ffffcc;">{tokens[follower][2:]}</span> '
                tokens[follower] = ' '  # consume so the "+ " branch skips it
            else:
                right += " "
        elif entry.startswith("+ "):
            right += f'<span style="background-color:#ccffcc;">{entry[2:]}</span> '
            leader = idx - 1
            if leader >= 0 and tokens[leader].startswith("- "):
                left += f'<span style="background-color:#ffffcc;">{tokens[leader][2:]}</span> '
                tokens[leader] = ' '
            else:
                left += " "
        elif entry.startswith(" "):
            # Unchanged word (or a consumed replacement marker): emit to both.
            left += entry[2:] + " "
            right += entry[2:] + " "
    return left.strip(), right.strip()
|
104 |
-
|
105 |
-
def calculate_similarity(text1, text2):
    """Return a 0-100 similarity score between two texts.

    Primary path: TF-IDF cosine similarity over word tokens. If
    vectorization fails (e.g. no valid tokens, or sklearn problems),
    falls back to difflib's sequence-matcher ratio.
    """
    # Blank input on either side is defined as zero similarity.
    if not text1.strip() or not text2.strip():
        return 0.0
    try:
        vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b')
        tfidf = vectorizer.fit_transform([text1, text2])
        sim = cosine_similarity(tfidf[0:1], tfidf[1:2])
        return sim[0][0] * 100
    # Bug fix: was a bare `except:`, which also swallowed SystemExit and
    # KeyboardInterrupt. Narrowed to Exception; the fallback behavior
    # for ordinary failures is unchanged.
    except Exception:
        return difflib.SequenceMatcher(None, text1, text2).ratio() * 100
|
115 |
-
|
116 |
-
# ========== MAIN APP ==========
|
117 |
-
def main():
|
118 |
-
st.title("📑 Contract Analysis Suite")
|
119 |
-
st.markdown("Compare documents and analyze legal clauses using AI-powered tools.")
|
120 |
-
|
121 |
-
questions = load_questions()
|
122 |
-
questions_short = load_questions_short()
|
123 |
-
|
124 |
-
if not questions or not questions_short or len(questions) != len(questions_short):
|
125 |
-
st.error("Questions failed to load properly.")
|
126 |
-
return
|
127 |
-
|
128 |
-
st.header("1. Upload Documents")
|
129 |
-
col1, col2 = st.columns(2)
|
130 |
-
|
131 |
-
with col1:
|
132 |
-
file1 = st.file_uploader("Upload First Document", type=["txt", "pdf", "docx"], key="file1")
|
133 |
-
text1 = load_contract(file1) if file1 else ""
|
134 |
-
display1 = st.empty()
|
135 |
-
|
136 |
-
with col2:
|
137 |
-
file2 = st.file_uploader("Upload Second Document", type=["txt", "pdf", "docx"], key="file2")
|
138 |
-
text2 = load_contract(file2) if file2 else ""
|
139 |
-
display2 = st.empty()
|
140 |
-
|
141 |
-
if file1:
|
142 |
-
display1.text_area("Document 1 Content", value=text1, height=400, key="area1")
|
143 |
-
if file2:
|
144 |
-
display2.text_area("Document 2 Content", value=text2, height=400, key="area2")
|
145 |
-
|
146 |
-
if not (file1 and file2):
|
147 |
-
st.warning("Please upload both documents.")
|
148 |
-
return
|
149 |
-
|
150 |
-
st.header("2. Document Comparison")
|
151 |
-
with st.expander("Show Document Differences", expanded=True):
|
152 |
-
if st.button("Compare Documents"):
|
153 |
-
with st.spinner("Analyzing..."):
|
154 |
-
sim = calculate_similarity(text1, text2)
|
155 |
-
diff1, diff2 = highlight_differences_words(text1, text2)
|
156 |
-
st.session_state.comparison_results = {
|
157 |
-
'similarity': sim,
|
158 |
-
'diff1': diff1,
|
159 |
-
'diff2': diff2,
|
160 |
-
}
|
161 |
-
|
162 |
-
if st.session_state.comparison_results:
|
163 |
-
sim = st.session_state.comparison_results['similarity']
|
164 |
-
st.metric("Document Similarity Score", f"{sim:.2f}%")
|
165 |
-
|
166 |
-
if sim >= 70:
|
167 |
-
st.markdown("### Visual Difference Highlighting")
|
168 |
-
sync_scroll_script = """
|
169 |
-
<script>
|
170 |
-
const left = document.getElementById("left");
|
171 |
-
const right = document.getElementById("right");
|
172 |
-
|
173 |
-
left.onscroll = function() {
|
174 |
-
right.scrollTop = left.scrollTop;
|
175 |
-
};
|
176 |
-
right.onscroll = function() {
|
177 |
-
left.scrollTop = right.scrollTop;
|
178 |
-
};
|
179 |
-
</script>
|
180 |
-
"""
|
181 |
-
|
182 |
-
html = f"""
|
183 |
-
<div style="display: flex; gap: 20px;">
|
184 |
-
<div id="left" style="width: 100%; height: 500px; overflow-y: auto; padding: 10px; font-family: monospace; border: 1px solid #ccc;">
|
185 |
-
{st.session_state.comparison_results['diff1']}
|
186 |
-
</div>
|
187 |
-
<div id="right" style="width: 100%; height: 500px; overflow-y: auto; padding: 10px; font-family: monospace; border: 1px solid #ccc;">
|
188 |
-
{st.session_state.comparison_results['diff2']}
|
189 |
-
</div>
|
190 |
-
</div>
|
191 |
-
{sync_scroll_script}
|
192 |
-
"""
|
193 |
-
st.markdown(html, unsafe_allow_html=True)
|
194 |
-
else:
|
195 |
-
st.warning("Similarity below 70%. Skipping visual diff display.")
|
196 |
-
|
197 |
-
# ========== CLAUSE ANALYSIS ==========
|
198 |
-
st.header("3. Clause Analysis")
|
199 |
-
try:
|
200 |
-
question_short = st.selectbox("Select a legal question to analyze:", questions_short)
|
201 |
-
idx = questions_short.index(question_short)
|
202 |
-
question = questions[idx]
|
203 |
-
except:
|
204 |
-
st.error("Error selecting question")
|
205 |
-
return
|
206 |
-
|
207 |
-
if st.button("Analyze Both Documents"):
|
208 |
-
if not (text1.strip() and text2.strip()):
|
209 |
-
st.error("Ensure both documents have content.")
|
210 |
-
return
|
211 |
-
|
212 |
-
col1, col2 = st.columns(2)
|
213 |
-
|
214 |
-
with col1:
|
215 |
-
st.subheader("First Document Analysis")
|
216 |
-
with st.spinner("Processing..."):
|
217 |
-
try:
|
218 |
-
ans1 = run_prediction([question], text1, 'marshmellow77/roberta-base-cuad', n_best_size=5).get('0', 'No answer')
|
219 |
-
st.session_state.analysis_results = st.session_state.analysis_results or {}
|
220 |
-
st.session_state.analysis_results['doc1'] = ans1
|
221 |
-
except Exception as e:
|
222 |
-
st.session_state.analysis_results['doc1'] = f"Failed: {e}"
|
223 |
-
|
224 |
-
with col2:
|
225 |
-
st.subheader("Second Document Analysis")
|
226 |
-
with st.spinner("Processing..."):
|
227 |
-
try:
|
228 |
-
ans2 = run_prediction([question], text2, 'marshmellow77/roberta-base-cuad', n_best_size=5).get('0', 'No answer')
|
229 |
-
st.session_state.analysis_results = st.session_state.analysis_results or {}
|
230 |
-
st.session_state.analysis_results['doc2'] = ans2
|
231 |
-
except Exception as e:
|
232 |
-
st.session_state.analysis_results['doc2'] = f"Failed: {e}"
|
233 |
-
|
234 |
-
if st.session_state.analysis_results:
|
235 |
-
col1, col2 = st.columns(2)
|
236 |
-
with col1:
|
237 |
-
st.subheader("First Document Result")
|
238 |
-
st.success(st.session_state.analysis_results.get('doc1', 'No analysis yet'))
|
239 |
-
with col2:
|
240 |
-
st.subheader("Second Document Result")
|
241 |
-
st.success(st.session_state.analysis_results.get('doc2', 'No analysis yet'))
|
242 |
-
|
243 |
-
if __name__ == "__main__":
|
244 |
-
main()
|
|
|
1 |
import streamlit as st
|
|
|
|
|
|
|
|
|
|
|
2 |
import difflib
|
3 |
+
from sentence_transformers import SentenceTransformer
|
4 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
5 |
from sklearn.metrics.pairwise import cosine_similarity
|
6 |
+
from xhtml2pdf import pisa
|
7 |
+
import base64
|
8 |
+
import os
|
9 |
+
from io import BytesIO
|
10 |
+
|
11 |
+
# Load the pretrained SBERT sentence-embedding model once at import time
# so every similarity call reuses it (first run downloads the weights).
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
|
13 |
+
|
14 |
+
def compute_sbert_similarity(text1, text2):
    """Cosine similarity of the SBERT embeddings of the two texts."""
    vec_a = sbert_model.encode([text1])[0]
    vec_b = sbert_model.encode([text2])[0]
    return cosine_similarity([vec_a], [vec_b])[0][0]
|
19 |
+
|
20 |
+
def compute_tfidf_similarity(text1, text2):
    """Cosine similarity of TF-IDF vectors fitted on the two texts."""
    vectorizer = TfidfVectorizer()
    vectorizer.fit([text1, text2])
    tfidf_pair = vectorizer.transform([text1, text2])
    return cosine_similarity(tfidf_pair[0:1], tfidf_pair[1:2])[0][0]
|
24 |
+
|
25 |
+
def html_diff(a, b):
    """Render a side-by-side HTML diff table of the two texts.

    Shows only changed regions with 2 lines of surrounding context.
    """
    table_builder = difflib.HtmlDiff()
    return table_builder.make_table(
        a.splitlines(),
        b.splitlines(),
        fromdesc='Original',
        todesc='Modified',
        context=True,
        numlines=2,
    )
|
28 |
+
|
29 |
+
def convert_html_to_pdf(source_html):
    """Convert an HTML string to PDF bytes via xhtml2pdf.

    Returns the PDF as bytes, or None if conversion reported an error.
    """
    buffer = BytesIO()
    status = pisa.CreatePDF(source_html, dest=buffer)
    return None if status.err else buffer.getvalue()
|
35 |
+
|
36 |
+
def create_download_link(pdf_data, filename="report.pdf"):
    """Build an HTML anchor that downloads *pdf_data* as *filename*.

    Args:
        pdf_data: Raw PDF bytes, embedded as a base64 data URI.
        filename: Name the browser should save the download as.

    Returns:
        An ``<a>`` tag string for st.markdown(..., unsafe_allow_html=True).
    """
    b64 = base64.b64encode(pdf_data).decode()
    # Bug fix: the download attribute previously hard-coded a placeholder
    # string instead of using the `filename` parameter, so the argument
    # was silently ignored.
    href = f'<a href="data:application/pdf;base64,{b64}" download="{filename}">Download PDF Report</a>'
    return href
|
40 |
+
|
41 |
+
# ---- Page scaffold ----
st.set_page_config(layout="wide")
st.title("Advanced Document Comparison Tool with Semantic and Syntactic Analysis")

upload_left, upload_right = st.columns(2)

with upload_left:
    source_file = st.file_uploader("Upload Original Document", type=["txt", "md"])
with upload_right:
    revised_file = st.file_uploader("Upload Modified Document", type=["txt", "md"])

# Nothing to do until both documents are present.
if source_file and revised_file:
    # Decode each upload once; all downstream steps operate on plain text.
    source_text = source_file.read().decode("utf-8")
    revised_text = revised_file.read().decode("utf-8")

    semantic_score = compute_sbert_similarity(source_text, revised_text)
    syntactic_score = compute_tfidf_similarity(source_text, revised_text)
    diff_table = html_diff(source_text, revised_text)

    st.markdown("### 🔍 Similarity Scores")
    st.markdown(f"**SBERT Semantic Similarity:** {semantic_score:.4f}")
    st.markdown(f"**TF-IDF Syntactic Similarity:** {syntactic_score:.4f}")

    st.markdown("### 📑 Comparison Result")

    # Full HTML page wrapping the diff table. The embedded script keeps a
    # pair of iframes scroll-synced; the same markup is reused verbatim for
    # the PDF export below.
    html_report = f'''
    <html>
    <head>
    <style>
    .diff {{ font-family: Courier; border: 1px solid #ccc; overflow-x: scroll; }}
    .diff th, .diff td {{ padding: 5px; }}
    iframe {{ width: 100%; height: 600px; border: none; }}
    </style>
    <script>
    window.addEventListener("DOMContentLoaded", () => {{
    const iframes = document.querySelectorAll("iframe");
    if (iframes.length === 2) {{
    const syncScroll = (e) => {{
    iframes.forEach((frame) => {{
    if (frame !== e.target) {{
    frame.contentWindow.scrollTo(0, e.target.scrollTop);
    }}
    }});
    }};
    iframes.forEach((iframe) => {{
    iframe.contentWindow.onscroll = syncScroll;
    }});
    }}
    }});
    </script>
    </head>
    <body>
    {diff_table}
    </body>
    </html>
    '''

    st.components.v1.html(html_report, height=700, scrolling=True)

    if st.button("Generate PDF Report"):
        pdf_payload = convert_html_to_pdf(html_report)
        if pdf_payload:
            st.markdown(create_download_link(pdf_payload), unsafe_allow_html=True)
        else:
            st.error("❌ Failed to generate PDF. Check for HTML formatting issues.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|