Spaces:
Sleeping
Sleeping
File size: 9,391 Bytes
7f05983 f623e18 7f05983 f623e18 7f05983 f623e18 7f05983 f623e18 7f05983 446457d f623e18 446457d f623e18 7f05983 f623e18 b4b9a80 f623e18 b4b9a80 7f05983 f623e18 b4b9a80 7f05983 f623e18 b4b9a80 f623e18 7f05983 f623e18 b4b9a80 f623e18 b4b9a80 f623e18 b4b9a80 7f05983 f623e18 7f05983 f623e18 7f05983 f623e18 7f05983 4665d41 7f05983 f623e18 4665d41 7f05983 f623e18 7f05983 f623e18 4665d41 f623e18 7f05983 f623e18 237b16a f623e18 237b16a 4665d41 7f05983 f623e18 7f05983 f623e18 7f05983 f623e18 7f05983 f623e18 7f05983 4665d41 f623e18 4665d41 f623e18 7f05983 f623e18 7f05983 f623e18 7f05983 f623e18 7f05983 f623e18 4665d41 f623e18 7f05983 f623e18 7f05983 f623e18 7f05983 f623e18 7f05983 f623e18 b4b9a80 7f05983 f623e18 7f05983 f623e18 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 |
import streamlit as st
from predict import run_prediction
from io import StringIO
import PyPDF4
import docx2txt
import pdfplumber
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# ========== CONFIG ==========
st.set_page_config(
    page_title="Contract Analysis Suite",
    page_icon="π",
    layout="wide",
)

# ========== SESSION STATE ==========
# Seed the result slots once per session so later code can read them
# unconditionally; an existing value is never overwritten.
for _slot in ('comparison_results', 'analysis_results'):
    if _slot not in st.session_state:
        st.session_state[_slot] = None
# ========== CACHED HELPERS ==========
@st.cache_data(show_spinner=False)
def load_questions():
    """Return the non-blank lines of ``data/questions.txt``, stripped.

    Returns:
        A list of question strings in file order. If the file cannot be
        opened or decoded, an error is shown via ``st.error`` and an empty
        list is returned.
    """
    # NOTE(review): the empty list returned on failure goes through
    # st.cache_data like any other result, so a failed load presumably stays
    # cached until the cache is cleared -- confirm this is acceptable.
    try:
        # Explicit encoding so the result does not depend on the platform's
        # default locale encoding.
        with open('data/questions.txt', encoding='utf-8') as f:
            return [line.strip() for line in f if line.strip()]
    except (OSError, UnicodeError) as e:
        st.error(f"Error loading questions: {str(e)}")
        return []
@st.cache_data(show_spinner=False)
def load_questions_short():
    """Return the non-blank lines of ``data/questions_short.txt``, stripped.

    Returns:
        A list of short question labels in file order. If the file cannot be
        opened or decoded, an error is shown via ``st.error`` and an empty
        list is returned.
    """
    # NOTE(review): the empty list returned on failure goes through
    # st.cache_data like any other result, so a failed load presumably stays
    # cached until the cache is cleared -- confirm this is acceptable.
    try:
        # Explicit encoding so the result does not depend on the platform's
        # default locale encoding.
        with open('data/questions_short.txt', encoding='utf-8') as f:
            return [line.strip() for line in f if line.strip()]
    except (OSError, UnicodeError) as e:
        st.error(f"Error loading short questions: {str(e)}")
        return []
# ========== FILE PARSING ==========
def extract_text_from_pdf(uploaded_file):
    """Extract the text of every page of a PDF using pdfplumber.

    Args:
        uploaded_file: Whatever ``pdfplumber.open`` accepts (the caller in
            this file passes the uploaded file object).

    Returns:
        The page texts, each followed by a blank line, concatenated and
        stripped. On any failure an error is shown via ``st.error`` and an
        empty string is returned.
    """

    def _page_text(page):
        # Try the "formatted" extractor first and fall back to the plain one
        # when the page object does not provide it.
        try:
            return page.extract_text_formatted()
        except AttributeError:
            return page.extract_text()

    try:
        with pdfplumber.open(uploaded_file) as document:
            # A page with no extractable text contributes only its separator.
            chunks = [(_page_text(page) or "") + "\n\n" for page in document.pages]
            return "".join(chunks).strip()
    except Exception as err:
        st.error(f"PDF extraction error: {str(err)}")
        return ""
def load_contract(file):
    """Read an uploaded ``.txt``, ``.pdf`` or ``.docx`` file as plain text.

    Args:
        file: An uploaded-file object exposing ``name`` and, for text files,
            ``getvalue()``; a falsy value means "nothing uploaded".

    Returns:
        The stripped document text. An empty string is returned when no file
        is given, when the extension is unsupported (a warning is shown), or
        when loading fails (an error is shown).
    """
    if not file:
        return ""
    try:
        ext = file.name.split('.')[-1].lower()
        if ext == 'txt':
            # "utf-8-sig" decodes plain UTF-8 identically but also drops a
            # leading byte-order mark, which str.strip() would not remove.
            return file.getvalue().decode("utf-8-sig").strip()
        if ext == 'pdf':
            content = extract_text_from_pdf(file)
            if content:
                return content
            # pdfplumber produced nothing: retry with PyPDF4. Rewind first,
            # because the earlier parser may have moved the stream position.
            file.seek(0)
            pdf_reader = PyPDF4.PdfFileReader(file)
            return "\n\n".join((p.extractText() or "") for p in pdf_reader.pages).strip()
        if ext == 'docx':
            return docx2txt.process(file).strip()
        st.warning("Unsupported file type")
        return ""
    except Exception as e:
        # UI boundary: any parser failure is reported instead of crashing the app.
        st.error(f"Error loading file: {str(e)}")
        return ""
# ========== TEXT UTILS ==========
def highlight_differences_words(text1, text2):
    """Build word-level, HTML-highlighted renderings of two texts.

    Both texts are split on whitespace and compared with ``difflib.Differ``.
    Colour coding of the returned HTML:

    * a word only in ``text1`` is red (``#ffcccc``) on the left;
    * a word only in ``text2`` is green (``#ccffcc``) on the right;
    * a removed word immediately followed by an added word is treated as a
      replacement: red on the left, yellow (``#ffffcc``) on the right;
    * the side that has no counterpart gets a single space placeholder.

    Every word is HTML-escaped, because the result is meant to be rendered
    as raw HTML and the words come from user-supplied documents.

    Args:
        text1: The first (left) text.
        text2: The second (right) text.

    Returns:
        A ``(left_html, right_html)`` tuple of stripped HTML strings.
    """
    from html import escape

    def _span(color, token):
        return f'<span style="background-color:{color};">{escape(token)}</span> '

    # Differ interleaves "? " hint lines between similar entries. They carry
    # no word and would otherwise separate a "- " entry from its "+ "
    # partner, so drop them before pairing. Tokens contain no whitespace, so
    # no real entry can start with "? ".
    entries = [
        entry
        for entry in difflib.Differ().compare(text1.split(), text2.split())
        if not entry.startswith("? ")
    ]

    left, right = [], []
    total = len(entries)
    i = 0
    while i < total:
        entry = entries[i]
        token = entry[2:]
        if entry.startswith("- "):
            left.append(_span("#ffcccc", token))
            if i + 1 < total and entries[i + 1].startswith("+ "):
                # Replacement: consume the paired addition here so it is not
                # rendered a second time as a stand-alone insertion.
                right.append(_span("#ffffcc", entries[i + 1][2:]))
                i += 1
            else:
                right.append(" ")
        elif entry.startswith("+ "):
            right.append(_span("#ccffcc", token))
            left.append(" ")
        else:
            # Unchanged word ("  " prefix): identical on both sides.
            shared = escape(token) + " "
            left.append(shared)
            right.append(shared)
        i += 1
    return "".join(left).strip(), "".join(right).strip()
def calculate_similarity(text1, text2):
    """Return a similarity score between two texts as a percentage.

    The score is the cosine similarity of the two texts' TF-IDF vectors
    (tokens are runs of word characters, single characters included). If
    vectorisation fails, the ``difflib.SequenceMatcher`` ratio of the raw
    strings is used instead.

    Args:
        text1: The first text.
        text2: The second text.

    Returns:
        A float percentage; ``0.0`` when either text is empty or
        whitespace-only.
    """
    if not text1.strip() or not text2.strip():
        return 0.0
    try:
        vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b')
        tfidf = vectorizer.fit_transform([text1, text2])
        score = cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0]
    except Exception:
        # Best-effort fallback. `Exception` (not a bare except) so that
        # KeyboardInterrupt/SystemExit and other BaseException-derived
        # signals are not swallowed.
        score = difflib.SequenceMatcher(None, text1, text2).ratio()
    # Normalise to a plain Python float regardless of which path produced it.
    return float(score) * 100
# ========== MAIN APP ==========
_CUAD_MODEL = 'marshmellow77/roberta-base-cuad'

_DIFF_PANEL_STYLE = (
    "width: 100%; height: 500px; overflow-y: auto; padding: 10px; "
    "font-family: monospace; border: 1px solid #ccc;"
)

# NOTE(review): this script is injected through st.markdown; browsers
# generally do not execute <script> tags inserted that way, so the scroll
# sync may be inert -- confirm in the running app.
_SYNC_SCROLL_SCRIPT = """
<script>
const left = document.getElementById("left");
const right = document.getElementById("right");
left.onscroll = function() {
right.scrollTop = left.scrollTop;
};
right.onscroll = function() {
left.scrollTop = right.scrollTop;
};
</script>
"""


def _build_diff_html(left_html, right_html):
    """Return the two-panel side-by-side diff markup followed by the sync script."""
    # Lines are deliberately unindented: the markup is passed to st.markdown,
    # where leading indentation could change how it is interpreted.
    return "\n".join([
        "",
        '<div style="display: flex; gap: 20px;">',
        f'<div id="left" style="{_DIFF_PANEL_STYLE}">',
        left_html,
        "</div>",
        f'<div id="right" style="{_DIFF_PANEL_STYLE}">',
        right_html,
        "</div>",
        "</div>",
        _SYNC_SCROLL_SCRIPT,
        "",
    ])


def _answer_question(question, text):
    """Run the clause-extraction model on one document; return the answer or a failure note."""
    try:
        return run_prediction([question], text, _CUAD_MODEL, n_best_size=5).get('0', 'No answer')
    except Exception as e:
        return f"Failed: {e}"


def main():
    """Render the Streamlit page: upload, document comparison, clause analysis.

    Flow:
        1. Load the long/short question lists; stop if they are missing or
           differ in length (they are matched by index).
        2. Let the user upload two documents and show their extracted text;
           stop until both are uploaded.
        3. On "Compare Documents", store the similarity score and the
           highlighted diffs in ``st.session_state.comparison_results`` and
           display them (the visual diff only when similarity is >= 70).
        4. On "Analyze Both Documents", run the selected question against
           both texts and store the answers in
           ``st.session_state.analysis_results`` under ``'doc1'``/``'doc2'``.
    """
    st.title("π Contract Analysis Suite")
    st.markdown("Compare documents and analyze legal clauses using AI-powered tools.")

    questions = load_questions()
    questions_short = load_questions_short()
    if not questions or not questions_short or len(questions) != len(questions_short):
        st.error("Questions failed to load properly.")
        return

    st.header("1. Upload Documents")
    col1, col2 = st.columns(2)
    with col1:
        file1 = st.file_uploader("Upload First Document", type=["txt", "pdf", "docx"], key="file1")
        text1 = load_contract(file1) if file1 else ""
        display1 = st.empty()
    with col2:
        file2 = st.file_uploader("Upload Second Document", type=["txt", "pdf", "docx"], key="file2")
        text2 = load_contract(file2) if file2 else ""
        display2 = st.empty()
    # Placeholders are filled only for documents that were actually uploaded.
    if file1:
        display1.text_area("Document 1 Content", value=text1, height=400, key="area1")
    if file2:
        display2.text_area("Document 2 Content", value=text2, height=400, key="area2")
    if not (file1 and file2):
        st.warning("Please upload both documents.")
        return

    st.header("2. Document Comparison")
    with st.expander("Show Document Differences", expanded=True):
        if st.button("Compare Documents"):
            with st.spinner("Analyzing..."):
                sim = calculate_similarity(text1, text2)
                diff1, diff2 = highlight_differences_words(text1, text2)
                st.session_state.comparison_results = {
                    'similarity': sim,
                    'diff1': diff1,
                    'diff2': diff2,
                }
        # Results live in session state so they survive the rerun triggered
        # by any later widget interaction.
        comparison = st.session_state.comparison_results
        if comparison:
            sim = comparison['similarity']
            st.metric("Document Similarity Score", f"{sim:.2f}%")
            if sim >= 70:
                st.markdown("### Visual Difference Highlighting")
                html = _build_diff_html(comparison['diff1'], comparison['diff2'])
                st.markdown(html, unsafe_allow_html=True)
            else:
                st.warning("Similarity below 70%. Skipping visual diff display.")

    # ========== CLAUSE ANALYSIS ==========
    st.header("3. Clause Analysis")
    try:
        question_short = st.selectbox("Select a legal question to analyze:", questions_short)
        idx = questions_short.index(question_short)
        question = questions[idx]
    except Exception:
        # `Exception` rather than a bare except, so BaseException-derived
        # signals are not swallowed here.
        st.error("Error selecting question")
        return

    if st.button("Analyze Both Documents"):
        if not (text1.strip() and text2.strip()):
            st.error("Ensure both documents have content.")
            return
        # Make sure the results dict exists *before* any prediction runs, so
        # a failure on the very first analysis can still be recorded.
        if not st.session_state.analysis_results:
            st.session_state.analysis_results = {}
        col1, col2 = st.columns(2)
        jobs = (
            (col1, "First Document Analysis", 'doc1', text1),
            (col2, "Second Document Analysis", 'doc2', text2),
        )
        for column, title, key, text in jobs:
            with column:
                st.subheader(title)
                with st.spinner("Processing..."):
                    st.session_state.analysis_results[key] = _answer_question(question, text)

    if st.session_state.analysis_results:
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("First Document Result")
            st.success(st.session_state.analysis_results.get('doc1', 'No analysis yet'))
        with col2:
            st.subheader("Second Document Result")
            st.success(st.session_state.analysis_results.get('doc2', 'No analysis yet'))
# Script entry point: build the page only when this file is executed directly.
if __name__ == "__main__":
    main()
|