Spaces:

ludigija
/

crosscheck

Running

App Files Community

ludigija committed on Apr 14

Commit

e731a1a

verified ·

1 Parent(s): e0f7f4f

Update app.py

Browse files

Files changed (1) hide show

app.py +78 -113

app.py CHANGED Viewed

@@ -7,13 +7,14 @@ import pdfplumber
 import difflib
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
-import streamlit.components.v1 as components
 # ========== CONFIGURATION ==========
 st.set_page_config(
     layout="wide",
     page_title="Contract Analysis Suite",
-    page_icon="📑"
 )
 # Initialize session state variables if they don't exist
@@ -48,11 +49,11 @@ def extract_text_from_pdf(uploaded_file):
             full_text = ""
             for page in pdf.pages:
                 try:
-                    text = page.extract_text_formatted()  # Try to get formatted text
                 except AttributeError:
                     text = page.extract_text()
                 if text:
-                    full_text += text + "\n\n"  # Add page separator
                 else:
                     full_text += page.extract_text() + "\n\n"
             return full_text if full_text.strip() else ""
@@ -60,8 +61,6 @@ def extract_text_from_pdf(uploaded_file):
         st.error(f"PDF extraction error: {str(e)}")
         return ""
 def highlight_differences_words(text1, text2):
     differ = difflib.Differ()
     diff = list(differ.compare(text1.split(), text2.split()))
@@ -73,39 +72,64 @@ def highlight_differences_words(text1, text2):
         if word.startswith("- "):
             removed_word = word[2:]
             highlighted_text1 += f'<span style="background-color:#ffcccc; display: inline-block;">{removed_word}</span>'
-            # Check for corresponding addition to highlight as changed
             if i + 1 < len(diff) and diff[i + 1].startswith("+ "):
                 added_word = diff[i + 1][2:]
-                highlighted_text2 += f'<span style="background-color:#ffffcc; display: inline-block;">{added_word}</span>'  # Yellow for changed in text2
-                diff[i + 1] = '  '  # Consume the addition
             else:
                 highlighted_text2 += " "
         elif word.startswith("+ "):
             added_word = word[2:]
             highlighted_text2 += f'<span style="background-color:#ccffcc; display: inline-block;">{added_word}</span>'
-            # Check for corresponding removal
             if i - 1 >= 0 and diff[i - 1].startswith("- "):
-                highlighted_text1 += f'<span style="background-color:#ffffcc; display: inline-block;">{diff[i-1][2:]}</span>' # Yellow for changed in text1
                 diff[i-1] = '  '
             else:
                 highlighted_text1 += " "
         elif word.startswith("  "):
             highlighted_text1 += word[2:] + " "
             highlighted_text2 += word[2:] + " "
     return highlighted_text1, highlighted_text2
 def calculate_similarity(text1, text2):
     if not text1.strip() or not text2.strip():
         return 0.0
     try:
-        vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b')
-        tfidf_matrix = vectorizer.fit_transform([text1, text2])
-        similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
-        return similarity[0][0] * 100
-    except ValueError:
-        return difflib.SequenceMatcher(None, text1, text2).ratio() * 100
 def load_contract(file):
     if file is None:
@@ -118,7 +142,6 @@ def load_contract(file):
         elif ext == 'pdf':
             content = extract_text_from_pdf(file)
             if not content:
-                # Fallback to PyPDF4
                 pdfReader = PyPDF4.PdfFileReader(file)
                 full_text = ""
                 for page in pdfReader.pages:
@@ -145,68 +168,33 @@ def main():
         st.error("Failed to load questions or questions mismatch. Please check data files.")
         return
-    st.title("📑 Contract Analysis Suite")
     st.markdown("""
     Compare documents and analyze legal clauses using AI-powered question answering.
     """)
-    # ===== DOCUMENT UPLOAD SECTION =====
     st.header("1. Upload Documents")
     col1, col2 = st.columns(2)
     with col1:
-        uploaded_file1 = st.file_uploader(
-            "Upload First Document",
-            type=["txt", "pdf", "docx"],
-            key="file1"
-        )
         contract_text1 = load_contract(uploaded_file1) if uploaded_file1 else ""
-        doc1_container = st.empty()
     with col2:
-        uploaded_file2 = st.file_uploader(
-            "Upload Second Document",
-            type=["txt", "pdf", "docx"],
-            key="file2"
-        )
         contract_text2 = load_contract(uploaded_file2) if uploaded_file2 else ""
-        doc2_container = st.empty()
-    # Update document displays (initial content)
     if uploaded_file1:
-        doc1_content = f'<div style="border:1px solid #ccc; padding:10px; white-space: pre-wrap; font-family: monospace; font-size: 0.9em; height: 400px; overflow-y: auto;" id="doc1_text">{contract_text1}</div>'
-        doc1_container.markdown(doc1_content, unsafe_allow_html=True)
     if uploaded_file2:
-        doc2_content = f'<div style="border:1px solid #ccc; padding:10px; white-space: pre-wrap; font-family: monospace; font-size: 0.9em; height: 400px; overflow-y: auto;" id="doc2_text">{contract_text2}</div>'
-        doc2_container.markdown(doc2_content, unsafe_allow_html=True)
-    # JavaScript for synchronized scrolling of the initial document panes
-    scroll_script = """
-    <script>
-    function syncScroll(id, otherId) {
-        var element = document.getElementById(id);
-        var otherElement = document.getElementById(otherId);
-        if (element && otherElement) {
-            element.addEventListener('scroll', function() {
-                otherElement.scrollTop = element.scrollTop;
-            });
-            otherElement.addEventListener('scroll', function() {
-                element.scrollTop = otherElement.scrollTop;
-            });
-        }
-    }
-    window.onload = function() {
-        syncScroll('doc1_text', 'doc2_text');
-    };
-    </script>
-    """
-    components.html(scroll_script, height=0)
     if not (uploaded_file1 and uploaded_file2):
         st.warning("Please upload both documents to proceed")
         return
-    # ===== DOCUMENT COMPARISON SECTION =====
     st.header("2. Document Comparison")
     with st.expander("Show Document Differences", expanded=True):
@@ -218,7 +206,6 @@ def main():
                 similarity_score = calculate_similarity(contract_text1, contract_text2)
                 highlighted_diff1, highlighted_diff2 = highlight_differences_words(contract_text1, contract_text2)
                 st.session_state.comparison_results = {
                     'similarity_score': similarity_score,
@@ -226,61 +213,40 @@ def main():
                     'highlighted_diff2': highlighted_diff2,
                 }
-        # Display comparison results
         if st.session_state.comparison_results:
-            st.metric("Document Similarity Score",
-                        f"{st.session_state.comparison_results['similarity_score']:.2f}%")
-            if st.session_state.comparison_results['similarity_score'] <= 70:
                 st.warning("Significant differences detected")
             st.markdown("**Visual Difference Highlighting:**")
-            col1_diff, col2_diff = st.columns(2)
-            with col1_diff:
                 st.markdown("### Original Document")
-                diff1_content = f'<div style="border:1px solid #ccc; padding:10px; white-space: pre-wrap; font-family: monospace; font-size: 0.9em; max-height: 500px; overflow-y: auto;" id="diff1_text">{st.session_state.comparison_results["highlighted_diff1"]}</div>'
-                st.markdown(diff1_content, unsafe_allow_html=True)
-            with col2_diff:
                 st.markdown("### Modified Document")
-                diff2_content = f'<div style="border:1px solid #ccc; padding:10px; white-space: pre-wrap; font-family: monospace; font-size: 0.9em; max-height: 500px; overflow-y: auto;" id="diff2_text">{st.session_state.comparison_results["highlighted_diff2"]}</div>'
-                st.markdown(diff2_content, unsafe_allow_html=True)
-            # JavaScript for synchronized scrolling of the difference panes
-            diff_scroll_script = """
-            <script>
-            function syncDiffScroll(id, otherId) {
-                var element = document.getElementById(id);
-                var otherElement = document.getElementById(otherId);
-                if (element && otherElement) {
-                    element.addEventListener('scroll', function() {
-                        otherElement.scrollTop = element.scrollTop;
-                    });
-                    otherElement.addEventListener('scroll', function() {
-                        element.scrollTop = otherElement.scrollTop;
-                    });
-                }
-            }
-            // Execute this script after the elements are rendered
-            setTimeout(function() {
-                syncDiffScroll('diff1_text', 'diff2_text');
-            }, 200); // Increased delay to ensure rendering
-            </script>
-            """
-            components.html(diff_scroll_script, height=0)
-    # ===== QUESTION ANALYSIS SECTION =====
     st.header("3. Clause Analysis")
     try:
-        question_selected = st.selectbox(
-            'Select a legal question to analyze:',
-            questions_short,
-            index=0,
-            key="question_select"
-        )
         question_idx = questions_short.index(question_selected)
         selected_question = questions[question_idx]
     except Exception as e:
@@ -292,9 +258,9 @@ def main():
             st.error("Please ensure both documents have readable content")
             return
-        col1_analysis, col2_analysis = st.columns(2)
-        with col1_analysis:
             st.subheader("First Document Analysis")
             with st.spinner('Processing first document...'):
                 try:
@@ -306,7 +272,7 @@ def main():
                     st.session_state.analysis_results = st.session_state.analysis_results or {}
                     st.session_state.analysis_results['doc1'] = f"Analysis failed: {str(e)}"
-        with col2_analysis:
             st.subheader("Second Document Analysis")
             with st.spinner('Processing second document...'):
                 try:
@@ -318,16 +284,15 @@ def main():
                     st.session_state.analysis_results = st.session_state.analysis_results or {}
                     st.session_state.analysis_results['doc2'] = f"Analysis failed: {str(e)}"
-    # Display analysis results
     if st.session_state.analysis_results:
-        col1_answer, col2_answer = st.columns(2)
-        with col1_answer:
             st.subheader("First Document Analysis")
             st.success(st.session_state.analysis_results.get('doc1', 'No analysis performed yet'))
-        with col2_answer:
             st.subheader("Second Document Analysis")
             st.success(st.session_state.analysis_results.get('doc2', 'No analysis performed yet'))
 if __name__ == "__main__":
-    main()

 import difflib
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
+from sentence_transformers import SentenceTransformer, util
+from fpdf import FPDF
 # ========== CONFIGURATION ==========
 st.set_page_config(
     layout="wide",
     page_title="Contract Analysis Suite",
+    page_icon="📁"
 )
 # Initialize session state variables if they don't exist
             full_text = ""
             for page in pdf.pages:
                 try:
+                    text = page.extract_text_formatted()
                 except AttributeError:
                     text = page.extract_text()
                 if text:
+                    full_text += text + "\n\n"
                 else:
                     full_text += page.extract_text() + "\n\n"
             return full_text if full_text.strip() else ""
         st.error(f"PDF extraction error: {str(e)}")
         return ""
 def highlight_differences_words(text1, text2):
     differ = difflib.Differ()
     diff = list(differ.compare(text1.split(), text2.split()))
         if word.startswith("- "):
             removed_word = word[2:]
             highlighted_text1 += f'<span style="background-color:#ffcccc; display: inline-block;">{removed_word}</span>'
             if i + 1 < len(diff) and diff[i + 1].startswith("+ "):
                 added_word = diff[i + 1][2:]
+                highlighted_text2 += f'<span style="background-color:#ffffcc; display: inline-block;">{added_word}</span>'
+                diff[i + 1] = '  '
             else:
                 highlighted_text2 += " "
         elif word.startswith("+ "):
             added_word = word[2:]
             highlighted_text2 += f'<span style="background-color:#ccffcc; display: inline-block;">{added_word}</span>'
             if i - 1 >= 0 and diff[i - 1].startswith("- "):
+                highlighted_text1 += f'<span style="background-color:#ffffcc; display: inline-block;">{diff[i-1][2:]}</span>'
                 diff[i-1] = '  '
             else:
                 highlighted_text1 += " "
         elif word.startswith("  "):
             highlighted_text1 += word[2:] + " "
             highlighted_text2 += word[2:] + " "
     return highlighted_text1, highlighted_text2
 def calculate_similarity(text1, text2):
     """Return the semantic similarity of two texts as a percentage.

     Both texts are embedded with the ``all-MiniLM-L6-v2`` sentence-transformer
     model and compared with cosine similarity.

     Args:
         text1: Text of the first document.
         text2: Text of the second document.

     Returns:
         float: Cosine similarity multiplied by 100. ``0.0`` is returned when
         either text is missing or whitespace-only, and also when embedding
         fails (the failure is reported to the user through ``st.error``).
     """
     # Treat None like an empty document instead of raising AttributeError
     # before the error handling below is even reached.
     if not (text1 or "").strip() or not (text2 or "").strip():
         return 0.0
     try:
         # Constructing the model is far more expensive than encoding two
         # texts, so build it once and keep it in session state for later
         # comparisons instead of recreating it on every call.
         model_key = "_similarity_model"
         if model_key not in st.session_state:
             st.session_state[model_key] = SentenceTransformer('all-MiniLM-L6-v2')
         model = st.session_state[model_key]
         # NOTE(review): this model presumably truncates long inputs to its
         # maximum sequence length, so for long contracts the score may only
         # reflect the opening of each document - confirm against the model card.
         embeddings = model.encode([text1, text2], convert_to_tensor=True)
         similarity = util.cos_sim(embeddings[0], embeddings[1])
         return float(similarity.item()) * 100
     except Exception as e:
         # Deliberate best-effort: a failed similarity computation must not
         # take the whole page down, so surface the error and report 0.
         st.error(f"Similarity calculation error: {e}")
         return 0.0
def generate_pdf_report(similarity_score, doc1, doc2):
    """Build a short PDF report summarising a document comparison.

    The report contains a title, the similarity score formatted with two
    decimals, and the first 1000 characters of each document.

    Args:
        similarity_score: Similarity percentage to print in the report.
        doc1: Full text of the first document.
        doc2: Full text of the second document.

    Returns:
        bytes: The rendered PDF, suitable for ``st.download_button``.
    """
    def _latin1_safe(text):
        # The report uses the built-in "Arial" font, which FPDF treats as
        # latin-1 only; characters outside that range (typographic quotes,
        # dashes, non-Western scripts) would otherwise make FPDF raise.
        # Unencodable characters are replaced with "?".
        return (text or "").encode("latin-1", "replace").decode("latin-1")

    pdf = FPDF()
    pdf.add_page()
    pdf.set_auto_page_break(auto=True, margin=15)

    pdf.set_font("Arial", 'B', 16)
    pdf.cell(0, 10, "Contract Comparison Report", ln=True, align="C")

    pdf.set_font("Arial", '', 12)
    pdf.ln(10)
    pdf.multi_cell(0, 10, f"Document Similarity Score: {similarity_score:.2f}%")

    excerpts = (
        ("Document 1 Excerpt:", doc1),
        ("Document 2 Excerpt:", doc2),
    )
    for heading, document in excerpts:
        pdf.ln(5)
        pdf.set_font("Arial", 'B', 12)
        pdf.cell(0, 10, heading, ln=True)
        pdf.set_font("Arial", '', 10)
        # Slice first: the replacement is one-for-one, so the excerpt length
        # is the same either way and only 1000 characters need sanitising.
        pdf.multi_cell(0, 10, _latin1_safe((document or "")[:1000]))

    rendered = pdf.output(dest='S')
    # Legacy PyFPDF returns a latin-1 str here, while fpdf2 returns a
    # bytearray; calling .encode() unconditionally breaks on the latter.
    if isinstance(rendered, str):
        return rendered.encode('latin1')
    return bytes(rendered)
 def load_contract(file):
     if file is None:
         elif ext == 'pdf':
             content = extract_text_from_pdf(file)
             if not content:
                 pdfReader = PyPDF4.PdfFileReader(file)
                 full_text = ""
                 for page in pdfReader.pages:
         st.error("Failed to load questions or questions mismatch. Please check data files.")
         return
+    st.title("📁 Contract Analysis Suite")
     st.markdown("""
     Compare documents and analyze legal clauses using AI-powered question answering.
     """)
     st.header("1. Upload Documents")
     col1, col2 = st.columns(2)
     with col1:
+        uploaded_file1 = st.file_uploader("Upload First Document", type=["txt", "pdf", "docx"], key="file1")
         contract_text1 = load_contract(uploaded_file1) if uploaded_file1 else ""
+        doc1_display = st.empty()
     with col2:
+        uploaded_file2 = st.file_uploader("Upload Second Document", type=["txt", "pdf", "docx"], key="file2")
         contract_text2 = load_contract(uploaded_file2) if uploaded_file2 else ""
+        doc2_display = st.empty()
     if uploaded_file1:
+        doc1_display.text_area("Document 1 Content", value=contract_text1, height=400, key="area1")
     if uploaded_file2:
+        doc2_display.text_area("Document 2 Content", value=contract_text2, height=400, key="area2")
     if not (uploaded_file1 and uploaded_file2):
         st.warning("Please upload both documents to proceed")
         return
     st.header("2. Document Comparison")
     with st.expander("Show Document Differences", expanded=True):
                 similarity_score = calculate_similarity(contract_text1, contract_text2)
                 highlighted_diff1, highlighted_diff2 = highlight_differences_words(contract_text1, contract_text2)
                 st.session_state.comparison_results = {
                     'similarity_score': similarity_score,
                     'highlighted_diff2': highlighted_diff2,
                 }
         if st.session_state.comparison_results:
+            st.metric("Document Similarity Score", f"{st.session_state.comparison_results['similarity_score']:.2f}%")
+            if st.session_state.comparison_results['similarity_score'] < 50:
                 st.warning("Significant differences detected")
             st.markdown("**Visual Difference Highlighting:**")
+            col1, col2 = st.columns(2)
+            with col1:
                 st.markdown("### Original Document")
+                st.markdown(f'<div style="border:1px solid #ccc; padding:10px; white-space: pre-wrap; font-family: monospace; font-size: 0.9em; max-height: 500px; overflow-y: auto;">{st.session_state.comparison_results["highlighted_diff1"]}</div>', unsafe_allow_html=True)
+            with col2:
                 st.markdown("### Modified Document")
+                st.markdown(f'<div style="border:1px solid #ccc; padding:10px; white-space: pre-wrap; font-family: monospace; font-size: 0.9em; max-height: 500px; overflow-y: auto;">{st.session_state.comparison_results["highlighted_diff2"]}</div>', unsafe_allow_html=True)
+            if st.button("Download PDF Report"):
+                with st.spinner("Generating report..."):
+                    pdf_bytes = generate_pdf_report(
+                        st.session_state.comparison_results['similarity_score'],
+                        contract_text1,
+                        contract_text2
+                    )
+                    st.download_button(
+                        label="Click to download PDF",
+                        data=pdf_bytes,
+                        file_name="contract_comparison_report.pdf",
+                        mime="application/pdf"
+                    )
     st.header("3. Clause Analysis")
     try:
+        question_selected = st.selectbox('Select a legal question to analyze:', questions_short, index=0, key="question_select")
         question_idx = questions_short.index(question_selected)
         selected_question = questions[question_idx]
     except Exception as e:
             st.error("Please ensure both documents have readable content")
             return
+        col1, col2 = st.columns(2)
+        with col1:
             st.subheader("First Document Analysis")
             with st.spinner('Processing first document...'):
                 try:
                     st.session_state.analysis_results = st.session_state.analysis_results or {}
                     st.session_state.analysis_results['doc1'] = f"Analysis failed: {str(e)}"
+        with col2:
             st.subheader("Second Document Analysis")
             with st.spinner('Processing second document...'):
                 try:
                     st.session_state.analysis_results = st.session_state.analysis_results or {}
                     st.session_state.analysis_results['doc2'] = f"Analysis failed: {str(e)}"
     if st.session_state.analysis_results:
+        col1, col2 = st.columns(2)
+        with col1:
             st.subheader("First Document Analysis")
             st.success(st.session_state.analysis_results.get('doc1', 'No analysis performed yet'))
+        with col2:
             st.subheader("Second Document Analysis")
             st.success(st.session_state.analysis_results.get('doc2', 'No analysis performed yet'))
 if __name__ == "__main__":
+    main()