Spaces:

ludigija
/

crosscheck

Sleeping

App Files Files Community

ludigija committed on Apr 14

Commit

dc1a085

verified ·

1 Parent(s): 8da2ea9

Update app.py

Browse files

Files changed (1) hide show

app.py +123 -325

app.py CHANGED Viewed

@@ -1,333 +1,131 @@
-import streamlit as st
-from predict import run_prediction
-from io import StringIO
-import PyPDF4
-import docx2txt
-import pdfplumber
 import difflib
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.metrics.pairwise import cosine_similarity
-import streamlit.components.v1 as components
-# ========== CONFIGURATION ==========
-st.set_page_config(
-    layout="wide",
-    page_title="Contract Analysis Suite",
-    page_icon="📑"
-)
-# Initialize session state variables if they don't exist
-if 'comparison_results' not in st.session_state:
     st.session_state.comparison_results = None
-if 'analysis_results' not in st.session_state:
-    st.session_state.analysis_results = None
-# ========== CACHED DATA LOADING ==========
-@st.cache_data(show_spinner=False)
-def load_questions():
-    try:
-        with open('data/questions.txt') as f:
-            return [q.strip() for q in f.readlines() if q.strip()]
-    except Exception as e:
-        st.error(f"Error loading questions: {str(e)}")
-        return []
-@st.cache_data(show_spinner=False)
-def load_questions_short():
-    try:
-        with open('data/questions_short.txt') as f:
-            return [q.strip() for q in f.readlines() if q.strip()]
-    except Exception as e:
-        st.error(f"Error loading short questions: {str(e)}")
-        return []
-# ========== UTILITY FUNCTIONS ==========
-def extract_text_from_pdf(uploaded_file):
-    try:
-        with pdfplumber.open(uploaded_file) as pdf:
-            full_text = ""
-            for page in pdf.pages:
-                try:
-                    text = page.extract_text_formatted()  # Try to get formatted text
-                except AttributeError:
-                    text = page.extract_text()
-                if text:
-                    full_text += text + "\n\n"  # Add page separator
-                else:
-                    full_text += page.extract_text() + "\n\n"
-            return full_text if full_text.strip() else ""
-    except Exception as e:
-        st.error(f"PDF extraction error: {str(e)}")
-        return ""
-def highlight_differences_words(text1, text2):
-    differ = difflib.Differ()
-    diff = list(differ.compare(text1.split(), text2.split()))
-    highlighted_text1 = ""
-    highlighted_text2 = ""
-    for i, word in enumerate(diff):
-        if word.startswith("- "):
-            removed_word = word[2:]
-            highlighted_text1 += f'<span style="background-color:#ffcccc; display: inline-block;">{removed_word}</span>'
-            # Check for corresponding addition to highlight as changed
-            if i + 1 < len(diff) and diff[i + 1].startswith("+ "):
-                added_word = diff[i + 1][2:]
-                highlighted_text2 += f'<span style="background-color:#ffffcc; display: inline-block;">{added_word}</span>'  # Yellow for changed in text2
-                diff[i + 1] = '  '  # Consume the addition
-            else:
-                highlighted_text2 += " "
-        elif word.startswith("+ "):
-            added_word = word[2:]
-            highlighted_text2 += f'<span style="background-color:#ccffcc; display: inline-block;">{added_word}</span>'
-            # Check for corresponding removal
-            if i - 1 >= 0 and diff[i - 1].startswith("- "):
-                highlighted_text1 += f'<span style="background-color:#ffffcc; display: inline-block;">{diff[i-1][2:]}</span>' # Yellow for changed in text1
-                diff[i-1] = '  '
-            else:
-                highlighted_text1 += " "
-        elif word.startswith("  "):
-            highlighted_text1 += word[2:] + " "
-            highlighted_text2 += word[2:] + " "
-    return highlighted_text1, highlighted_text2
-def calculate_similarity(text1, text2):
-    if not text1.strip() or not text2.strip():
-        return 0.0
-    try:
-        vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b')
-        tfidf_matrix = vectorizer.fit_transform([text1, text2])
-        similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
-        return similarity[0][0] * 100
-    except ValueError:
-        return difflib.SequenceMatcher(None, text1, text2).ratio() * 100
-def load_contract(file):
-    if file is None:
-        return ""
-    ext = file.name.split('.')[-1].lower()
-    try:
-        if ext == 'txt':
-            content = StringIO(file.getvalue().decode("utf-8")).read()
-        elif ext == 'pdf':
-            content = extract_text_from_pdf(file)
-            if not content:
-                # Fallback to PyPDF4
-                pdfReader = PyPDF4.PdfFileReader(file)
-                full_text = ""
-                for page in pdfReader.pages:
-                    text = page.extractText()
-                    if text:
-                        full_text += text + "\n\n"
-                content = full_text
-        elif ext == 'docx':
-            content = docx2txt.process(file)
-        else:
-            st.warning('Unsupported file type')
-            return ""
-        return content.strip() if content else ""
-    except Exception as e:
-        st.error(f"Error loading {ext.upper()} file: {str(e)}")
-        return ""
-# ========== MAIN APP ==========
-def main():
-    questions = load_questions()
-    questions_short = load_questions_short()
-    if not questions or not questions_short or len(questions) != len(questions_short):
-        st.error("Failed to load questions or questions mismatch. Please check data files.")
-        return
-    st.title("📑 Contract Analysis Suite")
-    st.markdown("""
-    Compare documents and analyze legal clauses using AI-powered question answering.
-    """)
-    # ===== DOCUMENT UPLOAD SECTION =====
-    st.header("1. Upload Documents")
-    col1, col2 = st.columns(2)
-    with col1:
-        uploaded_file1 = st.file_uploader(
-            "Upload First Document",
-            type=["txt", "pdf", "docx"],
-            key="file1"
-        )
-        contract_text1 = load_contract(uploaded_file1) if uploaded_file1 else ""
-        doc1_container = st.empty()
-    with col2:
-        uploaded_file2 = st.file_uploader(
-            "Upload Second Document",
-            type=["txt", "pdf", "docx"],
-            key="file2"
-        )
-        contract_text2 = load_contract(uploaded_file2) if uploaded_file2 else ""
-        doc2_container = st.empty()
-    # Update document displays (initial content)
-    if uploaded_file1:
-        doc1_content = f'<div style="border:1px solid #ccc; padding:10px; white-space: pre-wrap; font-family: monospace; font-size: 0.9em; height: 400px; overflow-y: auto;" id="doc1_text">{contract_text1}</div>'
-        doc1_container.markdown(doc1_content, unsafe_allow_html=True)
-    if uploaded_file2:
-        doc2_content = f'<div style="border:1px solid #ccc; padding:10px; white-space: pre-wrap; font-family: monospace; font-size: 0.9em; height: 400px; overflow-y: auto;" id="doc2_text">{contract_text2}</div>'
-        doc2_container.markdown(doc2_content, unsafe_allow_html=True)
-    # JavaScript for synchronized scrolling of the initial document panes
-    scroll_script = """
-    <script>
-    function syncScroll(id, otherId) {
-        var element = document.getElementById(id);
-        var otherElement = document.getElementById(otherId);
-        if (element && otherElement) {
-            element.addEventListener('scroll', function() {
-                otherElement.scrollTop = element.scrollTop;
-            });
-            otherElement.addEventListener('scroll', function() {
-                element.scrollTop = otherElement.scrollTop;
-            });
         }
-    }
-    window.onload = function() {
-        syncScroll('doc1_text', 'doc2_text');
-    };
-    </script>
-    """
-    components.html(scroll_script, height=0)
-    if not (uploaded_file1 and uploaded_file2):
-        st.warning("Please upload both documents to proceed")
-        return
-    # ===== DOCUMENT COMPARISON SECTION =====
-    st.header("2. Document Comparison")
-    with st.expander("Show Document Differences", expanded=True):
-        if st.button("Compare Documents"):
-            with st.spinner("Analyzing documents..."):
-                if not contract_text1.strip() or not contract_text2.strip():
-                    st.error("One or both documents appear to be empty or couldn't be read properly")
-                    return
-                similarity_score = calculate_similarity(contract_text1, contract_text2)
-                highlighted_diff1, highlighted_diff2 = highlight_differences_words(contract_text1, contract_text2)
-                st.session_state.comparison_results = {
-                    'similarity_score': similarity_score,
-                    'highlighted_diff1': highlighted_diff1,
-                    'highlighted_diff2': highlighted_diff2,
-                }
-        # Display comparison results
-        if st.session_state.comparison_results:
-            st.metric("Document Similarity Score",
-                        f"{st.session_state.comparison_results['similarity_score']:.2f}%")
-            if st.session_state.comparison_results['similarity_score'] <= 70:
-                st.warning("Significant differences detected")
-            st.markdown("**Visual Difference Highlighting:**")
-            col1_diff, col2_diff = st.columns(2)
-            with col1_diff:
-                st.markdown("### Original Document")
-                diff1_content = f'<div style="border:1px solid #ccc; padding:10px; white-space: pre-wrap; font-family: monospace; font-size: 0.9em; max-height: 500px; overflow-y: auto;" id="diff1_text">{st.session_state.comparison_results["highlighted_diff1"]}</div>'
-                st.markdown(diff1_content, unsafe_allow_html=True)
-            with col2_diff:
-                st.markdown("### Modified Document")
-                diff2_content = f'<div style="border:1px solid #ccc; padding:10px; white-space: pre-wrap; font-family: monospace; font-size: 0.9em; max-height: 500px; overflow-y: auto;" id="diff2_text">{st.session_state.comparison_results["highlighted_diff2"]}</div>'
-                st.markdown(diff2_content, unsafe_allow_html=True)
-            # JavaScript for synchronized scrolling of the difference panes
-            diff_scroll_script = f"""
-            <script>
-            function syncDiffScroll(id, otherId) {
-                var element = document.getElementById(id);
-                var otherElement = document.getElementById(otherId);
-                if (element && otherElement) {
-                    element.addEventListener('scroll', function() {
-                        otherElement.scrollTop = element.scrollTop;
-                    });
-                    otherElement.addEventListener('scroll', function() {
-                        element.scrollTop = otherElement.scrollTop;
-                    });
-                }
-            }
-            // Execute this script after the elements are rendered
-            setTimeout(function() {
-                syncDiffScroll('diff1_text', 'diff2_text');
-            }, 200); // Increased delay to ensure rendering
-            </script>
-            """
-            components.html(diff_scroll_script, height=0)
-    # ===== QUESTION ANALYSIS SECTION =====
-    st.header("3. Clause Analysis")
-    try:
-        question_selected = st.selectbox(
-            'Select a legal question to analyze:',
-            questions_short,
-            index=0,
-            key="question_select"
-        )
-        question_idx = questions_short.index(question_selected)
-        selected_question = questions[question_idx]
-    except Exception as e:
-        st.error(f"Error selecting question: {str(e)}")
-        return
-    if st.button("Analyze Both Documents"):
-        if not (contract_text1.strip() and contract_text2.strip()):
-            st.error("Please ensure both documents have readable content")
-            return
-        col1_analysis, col2_analysis = st.columns(2)
-        with col1_analysis:
-            st.subheader("First Document Analysis")
-            with st.spinner('Processing first document...'):
-                try:
-                    predictions1 = run_prediction([selected_question], contract_text1, 'marshmellow77/roberta-base-cuad', n_best_size=5)
-                    answer1 = predictions1.get('0', 'No answer found')
-                    st.session_state.analysis_results = st.session_state.analysis_results or {}
-                    st.session_state.analysis_results['doc1'] = answer1 if answer1 else "No relevant clause found"
-                except Exception as e:
-                    st.session_state.analysis_results = st.session_state.analysis_results or {}
-                    st.session_state.analysis_results['doc1'] = f"Analysis failed: {str(e)}"
-        with col2_analysis:
-            st.subheader("Second Document Analysis")
-            with st.spinner('Processing second document...'):
-                try:
-                    predictions2 = run_prediction([selected_question], contract_text2, 'marshmellow77/roberta-base-cuad', n_best_size=5)
-                    answer2 = predictions2.get('0', 'No answer found')
-                    st.session_state.analysis_results = st.session_state.analysis_results or {}
-                    st.session_state.analysis_results['doc2'] = answer2 if answer2 else "No relevant clause found"
-                except Exception as e:
-                    st.session_state.analysis_results = st.session_state.analysis_results or {}
-                    st.session_state.analysis_results['doc2'] = f"Analysis failed: {str(e)}"
-    # Display analysis results
-    if st.session_state.analysis_results:
-        col1_answer, col2_answer = st.columns(2)
-        with col1_answer:
-            st.subheader("First Document Analysis")
-            st.success(st.session_state.analysis_results.get('doc1', 'No analysis performed yet'))
-        with col2_answer:
-            st.subheader("Second Document Analysis")
-            st.success(st.session_state.analysis_results.get('doc2', 'No analysis performed yet'))
-if __name__ == "__main__":
-    main()

+import os
 import difflib
+import streamlit as st
+import pandas as pd
+from PyPDF4 import PdfReader
+from docx import Document
+from difflib import HtmlDiff
+from fpdf import FPDF
+import base64
+from streamlit.components.v1 import html
+# Page setup: wide layout and title, issued ahead of every other Streamlit
+# call in this script.
+st.set_page_config(page_title="Contract Analysis Suite", layout="wide")
+st.title("📄 Contract Analysis Suite")
+# Session state initialization: create the comparison_results slot on first
+# use so the results section can read it before any comparison has run.
+if "comparison_results" not in st.session_state:
     st.session_state.comparison_results = None
+def extract_text_from_pdf(file):
+    """Return the text of every page of a PDF, joined by newlines.
+
+    Args:
+        file: Path or binary file-like object handed to ``PdfReader``.
+
+    Returns:
+        The page texts joined with a newline; pages that yield no text
+        are skipped.
+    """
+    # NOTE(review): PyPDF4 releases expose ``PdfFileReader``; confirm that
+    # the module-level ``from PyPDF4 import PdfReader`` actually resolves.
+    pdf = PdfReader(file)
+    # Call extract_text() once per page instead of twice (filter + value).
+    page_texts = (page.extract_text() for page in pdf.pages)
+    return "\n".join(text for text in page_texts if text)
+def extract_text_from_docx(file):
+    """Return the paragraphs of a .docx document as newline-separated text."""
+    document = Document(file)
+    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
+    return "\n".join(paragraph_texts)
+def get_text(file):
+    """Extract plain text from an uploaded PDF, DOCX or text file.
+
+    Args:
+        file: Uploaded file object exposing ``name`` and ``read()``.
+
+    Returns:
+        The document text as ``str``. Anything that is not ``.pdf`` or
+        ``.docx`` is decoded as UTF-8 text.
+    """
+    # Compare the extension case-insensitively so "CONTRACT.PDF" is not
+    # mistaken for a plain-text upload.
+    name = file.name.lower()
+    if name.endswith(".pdf"):
+        return extract_text_from_pdf(file)
+    if name.endswith(".docx"):
+        return extract_text_from_docx(file)
+    # "utf-8-sig" drops a leading BOM; errors="replace" keeps files saved in
+    # another encoding readable instead of raising UnicodeDecodeError.
+    return file.read().decode("utf-8-sig", errors="replace")
+def compare_documents(text1, text2):
+    """Word-diff two texts; return highlighted HTML and a similarity score.
+
+    Args:
+        text1: Original document text.
+        text2: Modified document text.
+
+    Returns:
+        Tuple ``(html1, html2, similarity_score)``. ``html1`` holds the
+        words of ``text1`` with removed words on a red background,
+        ``html2`` holds the words of ``text2`` with added words on a
+        light-green background, and ``similarity_score`` is the percentage
+        of counted diff entries that are unchanged (0 when there are none).
+    """
+    # Imported locally under another name: the module-level ``html`` is the
+    # Streamlit component helper, not the standard-library module.
+    from html import escape as escape_html
+
+    diff1, diff2 = [], []
+    added = removed = unchanged = 0
+    for entry in difflib.ndiff(text1.split(), text2.split()):
+        tag = entry[:2]
+        # Document text is untrusted: escape it so "<", ">" and "&" render
+        # literally instead of being parsed as markup.
+        content = escape_html(entry[2:], quote=False)
+        if tag == "  ":
+            diff1.append(content)
+            diff2.append(content)
+            unchanged += 1
+        elif tag == "- ":
+            diff1.append(f'<span style="background-color: red;">{content}</span>')
+            removed += 1
+        elif tag == "+ ":
+            diff2.append(f'<span style="background-color: lightgreen;">{content}</span>')
+            added += 1
+        # Any other entry (ndiff's "? " hint lines) carries no word: skipped.
+    total = added + removed + unchanged
+    similarity_score = 100 * unchanged / total if total > 0 else 0
+    return " ".join(diff1), " ".join(diff2), similarity_score
+def generate_pdf_report(html1, html2, score):
+    """Write the comparison to a PDF file and return the file's path.
+
+    Args:
+        html1: Highlighted HTML fragment for the first document.
+        html2: Highlighted HTML fragment for the second document.
+        score: Similarity percentage, printed with two decimals.
+
+    Returns:
+        Path of the newly written PDF. Each call creates its own temporary
+        file; the caller may delete it once the content has been read.
+    """
+    import re
+    import tempfile
+    from html import unescape
+
+    span_open = re.compile(r'<span style="background-color:[^"]*">')
+
+    def to_report_text(fragment):
+        # Each highlighted span becomes "[word]" on a new line. The whole
+        # opening tag is removed so no CSS residue leaks into the report.
+        text = span_open.sub("\n[", fragment).replace("</span>", "]")
+        # Decode HTML entities back to plain characters.
+        text = unescape(text)
+        # FPDF's built-in fonts are Latin-1 only; substitute "?" for other
+        # characters (e.g. curly quotes) so they cannot abort the report.
+        return text.encode("latin-1", "replace").decode("latin-1")
+
+    pdf = FPDF()
+    pdf.add_page()
+    pdf.set_font("Arial", size=12)
+    pdf.multi_cell(0, 10, f"Contract Comparison Report\n\nSimilarity Score: {score:.2f}%\n\nDocument 1 Differences:\n\n", align='L')
+    pdf.multi_cell(0, 10, to_report_text(html1), align='L')
+    pdf.add_page()
+    pdf.multi_cell(0, 10, "Document 2 Differences:\n\n", align='L')
+    pdf.multi_cell(0, 10, to_report_text(html2), align='L')
+    # Unique file per call so concurrent sessions cannot overwrite or read
+    # each other's report.
+    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as handle:
+        file_path = handle.name
+    pdf.output(file_path)
+    return file_path
+# File upload section
+st.sidebar.header("Upload Documents")
+file1 = st.sidebar.file_uploader("Choose the original document", type=["pdf", "docx", "txt"], key="file1")
+file2 = st.sidebar.file_uploader("Choose the modified document", type=["pdf", "docx", "txt"], key="file2")
+# Compare as soon as both uploads are present.
+if file1 and file2:
+    with st.spinner("Analyzing documents..."):
+        try:
+            text1 = get_text(file1)
+            text2 = get_text(file2)
+            diff1, diff2, score = compare_documents(text1, text2)
+        except Exception as e:
+            # A corrupt or unreadable upload should surface as a message,
+            # not a traceback: drop stale results and end this script run.
+            st.session_state.comparison_results = None
+            st.error(f"Failed to analyze documents: {e}")
+            st.stop()
+        st.session_state.comparison_results = {
+            "highlighted_diff1": diff1,
+            "highlighted_diff2": diff2,
+            "similarity_score": score
         }
+# Render the most recent comparison stored in session state, if any.
+if st.session_state.comparison_results:
+    st.markdown("## Comparison Results")
+    st.markdown(f"**Similarity Score:** {st.session_state.comparison_results['similarity_score']:.2f}%")
+    st.markdown("### Original vs Modified (Synchronized View)")
+    # Two side-by-side scrollable panels holding the highlighted diffs; the
+    # inline script copies scrollTop from whichever panel scrolls to the
+    # other. The JavaScript braces are doubled because this is an f-string.
+    html(f"""
+    <div style="display: flex; gap: 10px;">
+        <div id="panel1" style="flex: 1; border:1px solid #ccc; padding:10px; overflow-y: scroll; max-height:500px; font-family: monospace; font-size: 0.9em; white-space: pre-wrap; background-color: #fdfdfd;">
+            {st.session_state.comparison_results["highlighted_diff1"]}
+        </div>
+        <div id="panel2" style="flex: 1; border:1px solid #ccc; padding:10px; overflow-y: scroll; max-height:500px; font-family: monospace; font-size: 0.9em; white-space: pre-wrap; background-color: #fdfdfd;">
+            {st.session_state.comparison_results["highlighted_diff2"]}
+        </div>
+    </div>
+    <script>
+    const panel1 = document.getElementById('panel1');
+    const panel2 = document.getElementById('panel2');
+    function syncScroll(source, target) {{
+        target.scrollTop = source.scrollTop;
+    }}
+    panel1.addEventListener('scroll', () => syncScroll(panel1, panel2));
+    panel2.addEventListener('scroll', () => syncScroll(panel2, panel1));
+    </script>
+    """, height=520)
+    # On click: build the PDF report, read it back, and offer it as a
+    # base64 data-URI download link.
+    if st.button("📄 Generate PDF Report"):
+        try:
+            file_path = generate_pdf_report(
+                st.session_state.comparison_results["highlighted_diff1"],
+                st.session_state.comparison_results["highlighted_diff2"],
+                st.session_state.comparison_results["similarity_score"]
+            )
+            with open(file_path, "rb") as f:
+                base64_pdf = base64.b64encode(f.read()).decode('utf-8')
+            href = f'<a href="data:application/pdf;base64,{base64_pdf}" download="comparison_report.pdf">📥 Click to Download Report</a>'
+            st.markdown(href, unsafe_allow_html=True)
+        except Exception as e:
+            # Any failure while generating or reading the report is shown
+            # to the user rather than raised.
+            st.error(f"Failed to generate PDF report: {e}")