Spaces:

ludigija
/

crosscheck

Sleeping

App Files Files Community

ludigija committed on Apr 14

Commit

2c5182f

verified ·

1 Parent(s): 048c8c7

Update app.py

Browse files

Files changed (1) hide show

app.py +328 -99

app.py CHANGED Viewed

@@ -1,105 +1,334 @@
 import streamlit as st
 import difflib
-from sentence_transformers import SentenceTransformer
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
-from xhtml2pdf import pisa
-import base64
-import os
-from io import BytesIO
-# Load SBERT model
-sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
-def compute_sbert_similarity(text1, text2):
-    emb1 = sbert_model.encode([text1])[0]
-    emb2 = sbert_model.encode([text2])[0]
-    score = cosine_similarity([emb1], [emb2])[0][0]
-    return score
-def compute_tfidf_similarity(text1, text2):
-    vectorizer = TfidfVectorizer().fit([text1, text2])
-    vectors = vectorizer.transform([text1, text2])
-    return cosine_similarity(vectors[0:1], vectors[1:2])[0][0]
-def html_diff(a, b):
-    differ = difflib.HtmlDiff()
-    return differ.make_table(a.splitlines(), b.splitlines(), fromdesc='Original', todesc='Modified', context=True, numlines=2)
-def convert_html_to_pdf(source_html):
-    pdf_file = BytesIO()
-    pisa_status = pisa.CreatePDF(source_html, dest=pdf_file)
-    if pisa_status.err:
-        return None
-    return pdf_file.getvalue()
-def create_download_link(pdf_data, filename="report.pdf"):
-    b64 = base64.b64encode(pdf_data).decode()
-    href = f'<a href="data:application/pdf;base64,{b64}" download="{filename}">Download PDF Report</a>'
-    return href
-st.set_page_config(layout="wide")
-st.title("Advanced Document Comparison Tool with Semantic and Syntactic Analysis")
-col1, col2 = st.columns(2)
-with col1:
-    uploaded_file1 = st.file_uploader("Upload Original Document", type=["txt", "md"])
-with col2:
-    uploaded_file2 = st.file_uploader("Upload Modified Document", type=["txt", "md"])
-if uploaded_file1 and uploaded_file2:
-    original_text = uploaded_file1.read().decode("utf-8")
-    modified_text = uploaded_file2.read().decode("utf-8")
-    sbert_score = compute_sbert_similarity(original_text, modified_text)
-    tfidf_score = compute_tfidf_similarity(original_text, modified_text)
-    html_comparison = html_diff(original_text, modified_text)
-    st.markdown("### 🔍 Similarity Scores")
-    st.markdown(f"**SBERT Semantic Similarity:** {sbert_score:.4f}")
-    st.markdown(f"**TF-IDF Syntactic Similarity:** {tfidf_score:.4f}")
-    st.markdown("### 📑 Comparison Result")
-    html_report = f'''
-    <html>
-    <head>
-    <style>
-    .diff {{ font-family: Courier; border: 1px solid #ccc; overflow-x: scroll; }}
-    .diff th, .diff td {{ padding: 5px; }}
-    iframe {{ width: 100%; height: 600px; border: none; }}
-    </style>
     <script>
-    window.addEventListener("DOMContentLoaded", () => {{
-        const iframes = document.querySelectorAll("iframe");
-        if (iframes.length === 2) {{
-            const syncScroll = (e) => {{
-                iframes.forEach((frame) => {{
-                    if (frame !== e.target) {{
-                        frame.contentWindow.scrollTo(0, e.target.scrollTop);
-                    }}
-                }});
-            }};
-            iframes.forEach((iframe) => {{
-                iframe.contentWindow.onscroll = syncScroll;
-            }});
-        }}
-    }});
     </script>
-    </head>
-    <body>
-    {html_comparison}
-    </body>
-    </html>
-    '''
-    st.components.v1.html(html_report, height=700, scrolling=True)
-    if st.button("Generate PDF Report"):
-        pdf_bytes = convert_html_to_pdf(html_report)
-        if pdf_bytes:
-            st.markdown(create_download_link(pdf_bytes), unsafe_allow_html=True)
-        else:
-            st.error("❌ Failed to generate PDF. Check for HTML formatting issues.")

 import streamlit as st
+from predict import run_prediction
+from io import StringIO
+import PyPDF4
+import docx2txt
+import pdfplumber
 import difflib
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
+import streamlit.components.v1 as components
+# ========== CONFIGURATION ==========
+st.set_page_config(
+    layout="wide",  # wide layout: the app places two document panes side by side
+    page_title="Contract Analysis Suite",
+    page_icon="📑"
+)
+# Create the two result slots only when absent, so values already held in st.session_state are not overwritten with None
+if 'comparison_results' not in st.session_state:
+    st.session_state.comparison_results = None  # filled by the "Compare Documents" button in main()
+if 'analysis_results' not in st.session_state:
+    st.session_state.analysis_results = None  # filled by the "Analyze Both Documents" button in main()
+# ========== CACHED DATA LOADING ==========
+@st.cache_data(show_spinner=False)
+def load_questions():
+    try:
+        with open('data/questions.txt') as f:
+            return [q.strip() for q in f.readlines() if q.strip()]
+    except Exception as e:
+        st.error(f"Error loading questions: {str(e)}")
+        return []
+@st.cache_data(show_spinner=False)
+def load_questions_short():
+    try:
+        with open('data/questions_short.txt') as f:
+            return [q.strip() for q in f.readlines() if q.strip()]
+    except Exception as e:
+        st.error(f"Error loading short questions: {str(e)}")
+        return []
+# ========== UTILITY FUNCTIONS ==========
+def extract_text_from_pdf(uploaded_file):
+    try:
+        with pdfplumber.open(uploaded_file) as pdf:
+            full_text = ""
+            for page in pdf.pages:
+                try:
+                    text = page.extract_text_formatted()  # Try to get formatted text
+                except AttributeError:
+                    text = page.extract_text()
+                if text:
+                    full_text += text + "\n\n"  # Add page separator
+                else:
+                    full_text += page.extract_text() + "\n\n"
+            return full_text if full_text.strip() else ""
+    except Exception as e:
+        st.error(f"PDF extraction error: {str(e)}")
+        return ""
+def highlight_differences_words(text1, text2):
+    differ = difflib.Differ()
+    diff = list(differ.compare(text1.split(), text2.split()))
+    highlighted_text1 = ""
+    highlighted_text2 = ""
+    for i, word in enumerate(diff):
+        if word.startswith("- "):
+            removed_word = word[2:]
+            highlighted_text1 += f'<span style="background-color:#ffcccc; display: inline-block;">{removed_word}</span>'
+            # Check for corresponding addition to highlight as changed
+            if i + 1 < len(diff) and diff[i + 1].startswith("+ "):
+                added_word = diff[i + 1][2:]
+                highlighted_text2 += f'<span style="background-color:#ffffcc; display: inline-block;">{added_word}</span>'  # Yellow for changed in text2
+                diff[i + 1] = '  '  # Consume the addition
+            else:
+                highlighted_text2 += " "
+        elif word.startswith("+ "):
+            added_word = word[2:]
+            highlighted_text2 += f'<span style="background-color:#ccffcc; display: inline-block;">{added_word}</span>'
+            # Check for corresponding removal
+            if i - 1 >= 0 and diff[i - 1].startswith("- "):
+                highlighted_text1 += f'<span style="background-color:#ffffcc; display: inline-block;">{diff[i-1][2:]}</span>' # Yellow for changed in text1
+                diff[i-1] = '  '
+            else:
+                highlighted_text1 += " "
+        elif word.startswith("  "):
+            highlighted_text1 += word[2:] + " "
+            highlighted_text2 += word[2:] + " "
+    return highlighted_text1, highlighted_text2
+def calculate_similarity(text1, text2):
+    if not text1.strip() or not text2.strip():
+        return 0.0
+    try:
+        vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b')
+        tfidf_matrix = vectorizer.fit_transform([text1, text2])
+        similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
+        return similarity[0][0] * 100
+    except ValueError:
+        return difflib.SequenceMatcher(None, text1, text2).ratio() * 100
+def load_contract(file):
+    if file is None:
+        return ""
+    ext = file.name.split('.')[-1].lower()
+    try:
+        if ext == 'txt':
+            content = StringIO(file.getvalue().decode("utf-8")).read()
+        elif ext == 'pdf':
+            content = extract_text_from_pdf(file)
+            if not content:
+                # Fallback to PyPDF4
+                pdfReader = PyPDF4.PdfFileReader(file)
+                full_text = ""
+                for page in pdfReader.pages:
+                    text = page.extractText()
+                    if text:
+                        full_text += text + "\n\n"
+                content = full_text
+        elif ext == 'docx':
+            content = docx2txt.process(file)
+        else:
+            st.warning('Unsupported file type')
+            return ""
+        return content.strip() if content else ""
+    except Exception as e:
+        st.error(f"Error loading {ext.upper()} file: {str(e)}")
+        return ""
+# ========== MAIN APP ==========
+def main():
+    questions = load_questions()
+    questions_short = load_questions_short()
+    if not questions or not questions_short or len(questions) != len(questions_short):
+        st.error("Failed to load questions or questions mismatch. Please check data files.")
+        return
+    st.title("📑 Contract Analysis Suite")
+    st.markdown("""
+    Compare documents and analyze legal clauses using AI-powered question answering.
+    """)
+    # ===== DOCUMENT UPLOAD SECTION =====
+    st.header("1. Upload Documents")
+    col1, col2 = st.columns(2)
+    with col1:
+        uploaded_file1 = st.file_uploader(
+            "Upload First Document",
+            type=["txt", "pdf", "docx"],
+            key="file1"
+        )
+        contract_text1 = load_contract(uploaded_file1) if uploaded_file1 else ""
+        doc1_container = st.empty()
+    with col2:
+        uploaded_file2 = st.file_uploader(
+            "Upload Second Document",
+            type=["txt", "pdf", "docx"],
+            key="file2"
+        )
+        contract_text2 = load_contract(uploaded_file2) if uploaded_file2 else ""
+        doc2_container = st.empty()
+    # Update document displays (initial content)
+    if uploaded_file1:
+        doc1_content = f'<div style="border:1px solid #ccc; padding:10px; white-space: pre-wrap; font-family: monospace; font-size: 0.9em; height: 400px; overflow-y: auto;" id="doc1_text">{contract_text1}</div>'
+        doc1_container.markdown(doc1_content, unsafe_allow_html=True)
+    if uploaded_file2:
+        doc2_content = f'<div style="border:1px solid #ccc; padding:10px; white-space: pre-wrap; font-family: monospace; font-size: 0.9em; height: 400px; overflow-y: auto;" id="doc2_text">{contract_text2}</div>'
+        doc2_container.markdown(doc2_content, unsafe_allow_html=True)
+    # JavaScript for synchronized scrolling of the initial document panes
+    scroll_script = """
     <script>
+    function syncScroll(id, otherId) {
+        var element = document.getElementById(id);
+        var otherElement = document.getElementById(otherId);
+        if (element && otherElement) {
+            element.addEventListener('scroll', function() {
+                otherElement.scrollTop = element.scrollTop;
+            });
+            otherElement.addEventListener('scroll', function() {
+                element.scrollTop = otherElement.scrollTop;
+            });
+        }
+    }
+    window.onload = function() {
+        syncScroll('doc1_text', 'doc2_text');
+    };
     </script>
+    """
+    components.html(scroll_script, height=0)
+    if not (uploaded_file1 and uploaded_file2):
+        st.warning("Please upload both documents to proceed")
+        return
+    # ===== DOCUMENT COMPARISON SECTION =====
+    st.header("2. Document Comparison")
+    with st.expander("Show Document Differences", expanded=True):
+        if st.button("Compare Documents"):
+            with st.spinner("Analyzing documents..."):
+                if not contract_text1.strip() or not contract_text2.strip():
+                    st.error("One or both documents appear to be empty or couldn't be read properly")
+                    return
+                similarity_score = calculate_similarity(contract_text1, contract_text2)
+                highlighted_diff1, highlighted_diff2 = highlight_differences_words(contract_text1, contract_text2)
+                st.session_state.comparison_results = {
+                    'similarity_score': similarity_score,
+                    'highlighted_diff1': highlighted_diff1,
+                    'highlighted_diff2': highlighted_diff2,
+                }
+        # Display comparison results
+        if st.session_state.comparison_results:
+            st.metric("Document Similarity Score",
+                        f"{st.session_state.comparison_results['similarity_score']:.2f}%")
+            if st.session_state.comparison_results['similarity_score'] <= 70:
+                st.warning("Significant differences detected")
+            st.markdown("**Visual Difference Highlighting:**")
+            col1_diff, col2_diff = st.columns(2)
+            with col1_diff:
+                st.markdown("### Original Document")
+                diff1_content = f'<div style="border:1px solid #ccc; padding:10px; white-space: pre-wrap; font-family: monospace; font-size: 0.9em; max-height: 500px; overflow-y: auto;" id="diff1_text">{st.session_state.comparison_results["highlighted_diff1"]}</div>'
+                st.markdown(diff1_content, unsafe_allow_html=True)
+            with col2_diff:
+                st.markdown("### Modified Document")
+                diff2_content = f'<div style="border:1px solid #ccc; padding:10px; white-space: pre-wrap; font-family: monospace; font-size: 0.9em; max-height: 500px; overflow-y: auto;" id="diff2_text">{st.session_state.comparison_results["highlighted_diff2"]}</div>'
+                st.markdown(diff2_content, unsafe_allow_html=True)
+            # JavaScript for synchronized scrolling of the difference panes
+            diff_scroll_script = """
+            <script>
+            function syncDiffScroll(id, otherId) {
+                var element = document.getElementById(id);
+                var otherElement = document.getElementById(otherId);
+                if (element && otherElement) {
+                    element.addEventListener('scroll', function() {
+                        otherElement.scrollTop = element.scrollTop;
+                    });
+                    otherElement.addEventListener('scroll', function() {
+                        element.scrollTop = otherElement.scrollTop;
+                    });
+                }
+            }
+            // Execute this script after the elements are rendered
+            setTimeout(function() {
+                syncDiffScroll('diff1_text', 'diff2_text');
+            }, 100); // Add a small delay to ensure elements are loaded
+            </script>
+            """
+            components.html(diff_scroll_script, height=0)
+    # ===== QUESTION ANALYSIS SECTION =====
+    st.header("3. Clause Analysis")
+    try:
+        question_selected = st.selectbox(
+            'Select a legal question to analyze:',
+            questions_short,
+            index=0,
+            key="question_select"
+        )
+        question_idx = questions_short.index(question_selected)
+        selected_question = questions[question_idx]
+    except Exception as e:
+        st.error(f"Error selecting question: {str(e)}")
+        return
+    if st.button("Analyze Both Documents"):
+        if not (contract_text1.strip() and contract_text2.strip()):
+            st.error("Please ensure both documents have readable content")
+            return
+        col1_analysis, col2_analysis = st.columns(2)
+        with col1_analysis:
+            st.subheader("First Document Analysis")
+            with st.spinner('Processing first document...'):
+                try:
+                    predictions1 = run_prediction([selected_question], contract_text1, 'marshmellow77/roberta-base-cuad', n_best_size=5)
+                    answer1 = predictions1.get('0', 'No answer found')
+                    st.session_state.analysis_results = st.session_state.analysis_results or {}
+                    st.session_state.analysis_results['doc1'] = answer1 if answer1 else "No relevant clause found"
+                except Exception as e:
+                    st.session_state.analysis_results = st.session_state.analysis_results or {}
+                    st.session_state.analysis_results['doc1'] = f"Analysis failed: {str(e)}"
+        with col2_analysis:
+            st.subheader("Second Document Analysis")
+            with st.spinner('Processing second document...'):
+                try:
+                    predictions2 = run_prediction([selected_question], contract_text2, 'marshmellow77/roberta-base-cuad', n_best_size=5)
+                    answer2 = predictions2.get('0', 'No answer found')
+                    st.session_state.analysis_results = st.session_state.analysis_results or {}
+                    st.session_state.analysis_results['doc2'] = answer2 if answer2 else "No relevant clause found"
+                except Exception as e:
+                    st.session_state.analysis_results = st.session_state.analysis_results or {}
+                    st.session_state.analysis_results['doc2'] = f"Analysis failed: {str(e)}"
+    # Display analysis results
+    if st.session_state.analysis_results:
+        col1_answer, col2_answer = st.columns(2)
+        with col1_answer:
+            st.subheader("First Document Analysis")
+            st.success(st.session_state.analysis_results.get('doc1', 'No analysis performed yet'))
+        with col2_answer:
+            st.subheader("Second Document Analysis")
+            st.success(st.session_state.analysis_results.get('doc2', 'No analysis performed yet'))
+if __name__ == "__main__":
+    main()