Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,244 +1,105 @@
|
|
1 |
import streamlit as st
|
2 |
-
from predict import run_prediction
|
3 |
-
from io import StringIO
|
4 |
-
import PyPDF4
|
5 |
-
import docx2txt
|
6 |
-
import pdfplumber
|
7 |
import difflib
|
|
|
8 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
9 |
from sklearn.metrics.pairwise import cosine_similarity
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
return
|
38 |
-
|
39 |
-
|
40 |
-
def
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
else:
|
71 |
-
st.
|
72 |
-
return ""
|
73 |
-
except Exception as e:
|
74 |
-
st.error(f"Error loading file: {str(e)}")
|
75 |
-
return ""
|
76 |
-
|
77 |
-
# ========== TEXT UTILS ==========
|
78 |
-
def highlight_differences_words(text1, text2):
    """Build word-level HTML diffs of two texts.

    Returns a pair of HTML strings: the first text with removed words
    highlighted red (replaced words yellow), and the second text with
    added words highlighted green (replacement words yellow).
    """
    tokens = list(difflib.Differ().compare(text1.split(), text2.split()))
    left = ""
    right = ""
    for idx, entry in enumerate(tokens):
        if entry.startswith("- "):
            left += f'<span style="background-color:#ffcccc;">{entry[2:]}</span> '
            follower = idx + 1
            if follower < len(tokens) and tokens[follower].startswith("+ "):
                # A deletion immediately followed by an insertion is a
                # replacement: paint the incoming word yellow on the right.
                right += f'<span style="background-color:#ffffcc;">{tokens[follower][2:]}</span> '
                tokens[follower] = ' '  # consume so the "+ " branch skips it
            else:
                right += " "
        elif entry.startswith("+ "):
            right += f'<span style="background-color:#ccffcc;">{entry[2:]}</span> '
            leader = idx - 1
            if leader >= 0 and tokens[leader].startswith("- "):
                left += f'<span style="background-color:#ffffcc;">{tokens[leader][2:]}</span> '
                tokens[leader] = ' '
            else:
                left += " "
        elif entry.startswith(" "):
            # Unchanged word (or a consumed replacement marker): emit to both.
            left += entry[2:] + " "
            right += entry[2:] + " "
    return left.strip(), right.strip()
|
104 |
-
|
105 |
-
def calculate_similarity(text1, text2):
    """Return a 0-100 similarity score between two texts.

    Primary path: TF-IDF cosine similarity over word tokens. If
    vectorization fails (e.g. no valid tokens, or sklearn problems),
    falls back to difflib's sequence-matcher ratio.
    """
    # Blank input on either side is defined as zero similarity.
    if not text1.strip() or not text2.strip():
        return 0.0
    try:
        vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b')
        tfidf = vectorizer.fit_transform([text1, text2])
        sim = cosine_similarity(tfidf[0:1], tfidf[1:2])
        return sim[0][0] * 100
    # Bug fix: was a bare `except:`, which also swallowed SystemExit and
    # KeyboardInterrupt. Narrowed to Exception; the fallback behavior
    # for ordinary failures is unchanged.
    except Exception:
        return difflib.SequenceMatcher(None, text1, text2).ratio() * 100
|
115 |
-
|
116 |
-
# ========== MAIN APP ==========
|
117 |
-
def main():
|
118 |
-
st.title("📑 Contract Analysis Suite")
|
119 |
-
st.markdown("Compare documents and analyze legal clauses using AI-powered tools.")
|
120 |
-
|
121 |
-
questions = load_questions()
|
122 |
-
questions_short = load_questions_short()
|
123 |
-
|
124 |
-
if not questions or not questions_short or len(questions) != len(questions_short):
|
125 |
-
st.error("Questions failed to load properly.")
|
126 |
-
return
|
127 |
-
|
128 |
-
st.header("1. Upload Documents")
|
129 |
-
col1, col2 = st.columns(2)
|
130 |
-
|
131 |
-
with col1:
|
132 |
-
file1 = st.file_uploader("Upload First Document", type=["txt", "pdf", "docx"], key="file1")
|
133 |
-
text1 = load_contract(file1) if file1 else ""
|
134 |
-
display1 = st.empty()
|
135 |
-
|
136 |
-
with col2:
|
137 |
-
file2 = st.file_uploader("Upload Second Document", type=["txt", "pdf", "docx"], key="file2")
|
138 |
-
text2 = load_contract(file2) if file2 else ""
|
139 |
-
display2 = st.empty()
|
140 |
-
|
141 |
-
if file1:
|
142 |
-
display1.text_area("Document 1 Content", value=text1, height=400, key="area1")
|
143 |
-
if file2:
|
144 |
-
display2.text_area("Document 2 Content", value=text2, height=400, key="area2")
|
145 |
-
|
146 |
-
if not (file1 and file2):
|
147 |
-
st.warning("Please upload both documents.")
|
148 |
-
return
|
149 |
-
|
150 |
-
st.header("2. Document Comparison")
|
151 |
-
with st.expander("Show Document Differences", expanded=True):
|
152 |
-
if st.button("Compare Documents"):
|
153 |
-
with st.spinner("Analyzing..."):
|
154 |
-
sim = calculate_similarity(text1, text2)
|
155 |
-
diff1, diff2 = highlight_differences_words(text1, text2)
|
156 |
-
st.session_state.comparison_results = {
|
157 |
-
'similarity': sim,
|
158 |
-
'diff1': diff1,
|
159 |
-
'diff2': diff2,
|
160 |
-
}
|
161 |
-
|
162 |
-
if st.session_state.comparison_results:
|
163 |
-
sim = st.session_state.comparison_results['similarity']
|
164 |
-
st.metric("Document Similarity Score", f"{sim:.2f}%")
|
165 |
-
|
166 |
-
if sim >= 70:
|
167 |
-
st.markdown("### Visual Difference Highlighting")
|
168 |
-
sync_scroll_script = """
|
169 |
-
<script>
|
170 |
-
const left = document.getElementById("left");
|
171 |
-
const right = document.getElementById("right");
|
172 |
-
|
173 |
-
left.onscroll = function() {
|
174 |
-
right.scrollTop = left.scrollTop;
|
175 |
-
};
|
176 |
-
right.onscroll = function() {
|
177 |
-
left.scrollTop = right.scrollTop;
|
178 |
-
};
|
179 |
-
</script>
|
180 |
-
"""
|
181 |
-
|
182 |
-
html = f"""
|
183 |
-
<div style="display: flex; gap: 20px;">
|
184 |
-
<div id="left" style="width: 100%; height: 500px; overflow-y: auto; padding: 10px; font-family: monospace; border: 1px solid #ccc;">
|
185 |
-
{st.session_state.comparison_results['diff1']}
|
186 |
-
</div>
|
187 |
-
<div id="right" style="width: 100%; height: 500px; overflow-y: auto; padding: 10px; font-family: monospace; border: 1px solid #ccc;">
|
188 |
-
{st.session_state.comparison_results['diff2']}
|
189 |
-
</div>
|
190 |
-
</div>
|
191 |
-
{sync_scroll_script}
|
192 |
-
"""
|
193 |
-
st.markdown(html, unsafe_allow_html=True)
|
194 |
-
else:
|
195 |
-
st.warning("Similarity below 70%. Skipping visual diff display.")
|
196 |
-
|
197 |
-
# ========== CLAUSE ANALYSIS ==========
|
198 |
-
st.header("3. Clause Analysis")
|
199 |
-
try:
|
200 |
-
question_short = st.selectbox("Select a legal question to analyze:", questions_short)
|
201 |
-
idx = questions_short.index(question_short)
|
202 |
-
question = questions[idx]
|
203 |
-
except:
|
204 |
-
st.error("Error selecting question")
|
205 |
-
return
|
206 |
-
|
207 |
-
if st.button("Analyze Both Documents"):
|
208 |
-
if not (text1.strip() and text2.strip()):
|
209 |
-
st.error("Ensure both documents have content.")
|
210 |
-
return
|
211 |
-
|
212 |
-
col1, col2 = st.columns(2)
|
213 |
-
|
214 |
-
with col1:
|
215 |
-
st.subheader("First Document Analysis")
|
216 |
-
with st.spinner("Processing..."):
|
217 |
-
try:
|
218 |
-
ans1 = run_prediction([question], text1, 'marshmellow77/roberta-base-cuad', n_best_size=5).get('0', 'No answer')
|
219 |
-
st.session_state.analysis_results = st.session_state.analysis_results or {}
|
220 |
-
st.session_state.analysis_results['doc1'] = ans1
|
221 |
-
except Exception as e:
|
222 |
-
st.session_state.analysis_results['doc1'] = f"Failed: {e}"
|
223 |
-
|
224 |
-
with col2:
|
225 |
-
st.subheader("Second Document Analysis")
|
226 |
-
with st.spinner("Processing..."):
|
227 |
-
try:
|
228 |
-
ans2 = run_prediction([question], text2, 'marshmellow77/roberta-base-cuad', n_best_size=5).get('0', 'No answer')
|
229 |
-
st.session_state.analysis_results = st.session_state.analysis_results or {}
|
230 |
-
st.session_state.analysis_results['doc2'] = ans2
|
231 |
-
except Exception as e:
|
232 |
-
st.session_state.analysis_results['doc2'] = f"Failed: {e}"
|
233 |
-
|
234 |
-
if st.session_state.analysis_results:
|
235 |
-
col1, col2 = st.columns(2)
|
236 |
-
with col1:
|
237 |
-
st.subheader("First Document Result")
|
238 |
-
st.success(st.session_state.analysis_results.get('doc1', 'No analysis yet'))
|
239 |
-
with col2:
|
240 |
-
st.subheader("Second Document Result")
|
241 |
-
st.success(st.session_state.analysis_results.get('doc2', 'No analysis yet'))
|
242 |
-
|
243 |
-
if __name__ == "__main__":
|
244 |
-
main()
|
|
|
1 |
import streamlit as st
|
|
|
|
|
|
|
|
|
|
|
2 |
import difflib
|
3 |
+
from sentence_transformers import SentenceTransformer
|
4 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
5 |
from sklearn.metrics.pairwise import cosine_similarity
|
6 |
+
from xhtml2pdf import pisa
|
7 |
+
import base64
|
8 |
+
import os
|
9 |
+
from io import BytesIO
|
10 |
+
|
11 |
+
# Load the pretrained SBERT sentence-embedding model once at import time
# so every similarity call reuses it (first run downloads the weights).
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
|
13 |
+
|
14 |
+
def compute_sbert_similarity(text1, text2):
    """Cosine similarity of the SBERT embeddings of the two texts."""
    vec_a = sbert_model.encode([text1])[0]
    vec_b = sbert_model.encode([text2])[0]
    return cosine_similarity([vec_a], [vec_b])[0][0]
|
19 |
+
|
20 |
+
def compute_tfidf_similarity(text1, text2):
    """Cosine similarity of TF-IDF vectors fitted on the two texts."""
    vectorizer = TfidfVectorizer()
    vectorizer.fit([text1, text2])
    tfidf_pair = vectorizer.transform([text1, text2])
    return cosine_similarity(tfidf_pair[0:1], tfidf_pair[1:2])[0][0]
|
24 |
+
|
25 |
+
def html_diff(a, b):
    """Render a side-by-side HTML diff table of the two texts.

    Shows only changed regions with 2 lines of surrounding context.
    """
    table_builder = difflib.HtmlDiff()
    return table_builder.make_table(
        a.splitlines(),
        b.splitlines(),
        fromdesc='Original',
        todesc='Modified',
        context=True,
        numlines=2,
    )
|
28 |
+
|
29 |
+
def convert_html_to_pdf(source_html):
    """Convert an HTML string to PDF bytes via xhtml2pdf.

    Returns the PDF as bytes, or None if conversion reported an error.
    """
    buffer = BytesIO()
    status = pisa.CreatePDF(source_html, dest=buffer)
    return None if status.err else buffer.getvalue()
|
35 |
+
|
36 |
+
def create_download_link(pdf_data, filename="report.pdf"):
    """Build an HTML anchor that downloads *pdf_data* as *filename*.

    Args:
        pdf_data: Raw PDF bytes, embedded as a base64 data URI.
        filename: Name the browser should save the download as.

    Returns:
        An ``<a>`` tag string for st.markdown(..., unsafe_allow_html=True).
    """
    b64 = base64.b64encode(pdf_data).decode()
    # Bug fix: the download attribute previously hard-coded a placeholder
    # string instead of using the `filename` parameter, so the argument
    # was silently ignored.
    href = f'<a href="data:application/pdf;base64,{b64}" download="{filename}">Download PDF Report</a>'
    return href
|
40 |
+
|
41 |
+
# ---- Page scaffold ----
st.set_page_config(layout="wide")
st.title("Advanced Document Comparison Tool with Semantic and Syntactic Analysis")

upload_left, upload_right = st.columns(2)

with upload_left:
    source_file = st.file_uploader("Upload Original Document", type=["txt", "md"])
with upload_right:
    revised_file = st.file_uploader("Upload Modified Document", type=["txt", "md"])

# Nothing to do until both documents are present.
if source_file and revised_file:
    # Decode each upload once; all downstream steps operate on plain text.
    source_text = source_file.read().decode("utf-8")
    revised_text = revised_file.read().decode("utf-8")

    semantic_score = compute_sbert_similarity(source_text, revised_text)
    syntactic_score = compute_tfidf_similarity(source_text, revised_text)
    diff_table = html_diff(source_text, revised_text)

    st.markdown("### 🔍 Similarity Scores")
    st.markdown(f"**SBERT Semantic Similarity:** {semantic_score:.4f}")
    st.markdown(f"**TF-IDF Syntactic Similarity:** {syntactic_score:.4f}")

    st.markdown("### 📑 Comparison Result")

    # Full HTML page wrapping the diff table. The embedded script keeps a
    # pair of iframes scroll-synced; the same markup is reused verbatim for
    # the PDF export below.
    html_report = f'''
    <html>
    <head>
    <style>
    .diff {{ font-family: Courier; border: 1px solid #ccc; overflow-x: scroll; }}
    .diff th, .diff td {{ padding: 5px; }}
    iframe {{ width: 100%; height: 600px; border: none; }}
    </style>
    <script>
    window.addEventListener("DOMContentLoaded", () => {{
    const iframes = document.querySelectorAll("iframe");
    if (iframes.length === 2) {{
    const syncScroll = (e) => {{
    iframes.forEach((frame) => {{
    if (frame !== e.target) {{
    frame.contentWindow.scrollTo(0, e.target.scrollTop);
    }}
    }});
    }};
    iframes.forEach((iframe) => {{
    iframe.contentWindow.onscroll = syncScroll;
    }});
    }}
    }});
    </script>
    </head>
    <body>
    {diff_table}
    </body>
    </html>
    '''

    st.components.v1.html(html_report, height=700, scrolling=True)

    if st.button("Generate PDF Report"):
        pdf_payload = convert_html_to_pdf(html_report)
        if pdf_payload:
            st.markdown(create_download_link(pdf_payload), unsafe_allow_html=True)
        else:
            st.error("❌ Failed to generate PDF. Check for HTML formatting issues.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|