Spaces:

ludigija
/

CUAD_contract

Running

App Files Files Community

ludigija committed on Apr 1

Commit

66a7f5c

verified ·

1 Parent(s): 383dff9

Update app.py

Browse files

Files changed (1) hide show

app.py +543 -152

app.py CHANGED Viewed

@@ -1,155 +1,546 @@
-import streamlit as st
-from predict import run_prediction
-from io import StringIO
 import json
-st.set_page_config(layout="wide")
-st.cache(show_spinner=False, persist=True)
-def load_questions():
-    questions = []
-    with open('data/questions.txt') as f:
-        questions = f.readlines()
-    # questions = []
-    # for i, q in enumerate(data['data'][0]['paragraphs'][0]['qas']):
-    #     question = data['data'][0]['paragraphs'][0]['qas'][i]['question']
-    #     questions.append(question)
-    return questions
-def load_questions_short():
-    questions_short = []
-    with open('data/questions_short.txt') as f:
-        questions_short = f.readlines()
-    # questions = []
-    # for i, q in enumerate(data['data'][0]['paragraphs'][0]['qas']):
-    #     question = data['data'][0]['paragraphs'][0]['qas'][i]['question']
-    #     questions.append(question)
-    return questions_short
-st.cache(show_spinner=False, persist=True)
-def load_contracts():
-    with open('data/test.json') as json_file:
-        data = json.load(json_file)
-    contracts = []
-    for i, q in enumerate(data['data']):
-        contract = ' '.join(data['data'][i]['paragraphs'][0]['context'].split())
-        contracts.append(contract)
-    return contracts
-questions = load_questions()
-questions_short = load_questions_short()
-# contracts = load_contracts()
-### DEFINE SIDEBAR
-st.sidebar.title("Interactive Contract Analysis")
-st.sidebar.markdown(
-"""
-This model uses a pretrained snapshot trained on the [Atticus](https://www.atticusprojectai.org/) Dataset - CUAD
-Model used for this demo: https://huggingface.co/marshmellow77/roberta-base-cuad
-Related blog posts:
-- https://bit.ly/3pKWICB
-- https://bit.ly/3ETApRO
-"""
-)
-st.sidebar.header("Contract Selection")
-# select contract
-contracts_drop = ['Contract 1', 'Contract 2', 'Contract 3']
-contracts_files = ['contract-1.txt', 'contract-2.txt', 'contract-3.txt']
-contract = st.sidebar.selectbox('Please Select a Contract', contracts_drop)
-idx = contracts_drop.index(contract)
-with open('data/'+contracts_files[idx]) as f:
-    contract_data = f.read()
-# upload contract
-user_upload = st.sidebar.file_uploader('Please upload your own', type=['txt'],
-                                       accept_multiple_files=False)
-# process upload
-if user_upload is not None:
-    print(user_upload.name, user_upload.type)
-    extension = user_upload.name.split('.')[-1].lower()
-    if extension == 'txt':
-        print('text file uploaded')
-         # To convert to a string based IO:
-        stringio = StringIO(user_upload.getvalue().decode("utf-8"))
-        # To read file as string:
-        contract_data = stringio.read()
-    # elif extension == 'pdf':
-    #     import PyPDF4
-    #     try:
-    #         # Extracting Text from PDFs
-    #         pdfReader = PyPDF4.PdfFileReader(user_upload)
-    #         print(pdfReader.numPages)
-    #         contract_data = ''
-    #         for i in range(0, pdfReader.numPages):
-    #
-    #             print(i)
-    #             pageobj = pdfReader.getPage(i)
-    #             contract_data = contract_data + pageobj.extractText()
-    #     except:
-    #         st.warning('Unable to read PDF, please try another file')
-    #
-    # elif extension == 'docx':
-    #     import docx2txt
-    #
-    #     contract_data = docx2txt.process(user_upload)
-    else:
-        st.warning('Unknown uploaded file type, please try again')
-results_drop = ['1', '2', '3']
-number_results = st.sidebar.selectbox('Select number of results', results_drop)
-### DEFINE MAIN PAGE
-st.header("Legal Contract Review Demo")
-st.write("This demo uses the CUAD dataset for Contract Understanding.")
-paragraph = st.text_area(label="Contract", value=contract_data, height=300)
-questions_drop = questions_short
-question_short = st.selectbox('Choose one of the 41 queries from the CUAD dataset:', questions_drop)
-idxq = questions_drop.index(question_short)
-question = questions[idxq]
-if st.button('Analyze'):
-    if (not len(paragraph)==0) and not (len(question)==0):
-        print('getting predictions')
-        with st.spinner(text='Analysis in progress...'):
-            predictions = run_prediction([question], paragraph, 'marshmellow77/roberta-base-cuad',
-                                         n_best_size=5)
-        answer = ""
-        if predictions['0'] == "":
-            answer = 'No answer found in document'
         else:
-            # if number_results == '1':
-            #     answer = f"Answer: {predictions['0']}"
-            #     # st.text_area(label="Answer", value=f"{answer}")
-            # else:
-            answer = ""
-            with open("nbest.json") as jf:
-                data = json.load(jf)
-                for i in range(int(number_results)):
-                    answer += f"Answer {i+1}: {data['0'][i]['text']} -- \n"
-                    answer += f"Probability: {round(data['0'][i]['probability']*100,1)}%\n\n"
-        st.success(answer)
     else:
-        st.write("Unable to call model, please select question and contract")

+import sys
+import pytesseract
+from pdf2image import convert_from_path
+from PIL import Image
+import numpy as np
+import cv2
+import os
+import shutil
+from difflib import SequenceMatcher
+from PyPDF2 import PdfReader
 import json
+import logging
+import argparse
+import hashlib
+from transformers import pipeline
+import torch
+import streamlit as st  # Added Streamlit import
+from io import StringIO
+import docx2txt
+import pdfplumber
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+# Constants
+# Lower/upper per-channel bounds handed to cv2.inRange by
+# detect_colored_regions, which converts each page to BGR channel order first.
+ORANGE_LOWER_BOUND = np.array([0, 120, 240])
+ORANGE_UPPER_BOUND = np.array([239, 247, 255])
+BLUE_LOWER_BOUND = np.array([230, 115, 0])
+BLUE_UPPER_BOUND = np.array([255, 238, 218])
+# Size of the all-ones kernel used for the morphological close of the colour mask.
+KERNEL_SIZE = (35, 35)
+# Margin (in pixels) added around every detected bounding box before cropping.
+EXPAND_BY = 10
+# Similarity ratio at or above which a change is labelled "wording change"
+# rather than "content change" in analyze_differences.
+SIMILARITY_THRESHOLD = 0.7
+FREE_MODEL_NAME = "google/flan-t5-large"  # You can change this
+# Setup argument parser
+# NOTE(review): parse_args() runs at import time and exits the process on
+# unrecognised arguments — confirm this is safe under the Streamlit launcher.
+parser = argparse.ArgumentParser(description="PDF Difference Analyzer")
+parser.add_argument('--log-level', type=str, default='INFO',
+                    help='Set the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)')
+args = parser.parse_args()
+# Setup logging
+# An unknown level name falls back to INFO; the format wraps each record in
+# ANSI escape codes (\033[92m ... \033[0m) to colour it on a terminal.
+logging.basicConfig(level=getattr(logging, args.log_level.upper(), logging.INFO),
+                    format='\033[92m[%(asctime)s] %(levelname)s: %(message)s\033[0m',
+                    datefmt='%Y-%m-%d %H:%M:%S')
+# Check Python path
+logging.debug(f"Python executable: {sys.executable}")
+logging.debug(f"Python version: {sys.version}")
+logging.debug(f"Python path: {sys.path}")
+# Logged unconditionally: it only shows that the module-level imports above did
+# not raise, not that the Tesseract binary itself is usable.
+logging.debug("Tesseract imported successfully!")
+# Initialize the Hugging Face Transformers pipeline
+# NOTE(review): this load happens at module import; if Streamlit re-executes the
+# script on every interaction the model would be reloaded each time — confirm
+# and consider caching the pipeline.
+logging.info(f"Loading free model: {FREE_MODEL_NAME}")
+try:
+    # device index 0 when CUDA is available, otherwise -1 (CPU).
+    device = 0 if torch.cuda.is_available() else -1
+    generator = pipeline('text2text-generation', model=FREE_MODEL_NAME,
+                           device=device)  # Can also use 'question-answering'
+    logging.info(f"Free model {FREE_MODEL_NAME} loaded successfully.")
+except Exception as e:
+    logging.error(
+        f"Error loading the free model: {e}.  The script will attempt to continue, but component name identification will not work.")
+    generator = None  # Set generator to None to prevent further errors
+# ================== UTILITY FUNCTIONS (Modified for Streamlit) ==================
+def extract_text_from_pdf(uploaded_file):
+    """Extract the text of every page of a PDF, one page after another.
+    Args:
+        uploaded_file: Path or binary file-like object accepted by pdfplumber.open.
+    Returns:
+        str: The text of all pages, each page followed by two newlines, or ""
+        when the document holds no text or extraction fails (the failure is
+        reported to the user through st.error).
+    """
+    try:
+        with pdfplumber.open(uploaded_file) as pdf:
+            page_texts = []
+            for page in pdf.pages:
+                text = None
+                # Use a formatted extractor only if this pdfplumber build has one.
+                formatted_extractor = getattr(page, "extract_text_formatted", None)
+                if formatted_extractor is not None:
+                    text = formatted_extractor()
+                if not text:
+                    text = page.extract_text()
+                # A page without a text layer may yield None or ""; coerce it so
+                # one empty page cannot abort extraction of the whole document.
+                page_texts.append((text or "") + "\n\n")  # page separator
+        full_text = "".join(page_texts)
+        return full_text if full_text.strip() else ""
+    except Exception as e:
+        st.error(f"PDF extraction error: {str(e)}")
+        return ""
+def highlight_differences_words(text1, text2):
+    """Build two HTML renderings of a word-level diff between two texts.
+    Both texts are split on whitespace and compared with difflib.Differ.
+    A word only in text1 is shaded #ffcccc on the left. A word only in text2 is
+    shaded #ccffcc on the right. A removal immediately followed by an addition
+    is treated as a replacement: the new word is shaded #ffffcc on the right.
+    Every word is HTML-escaped, since the result is intended to be rendered as
+    raw HTML and the input texts come from uploaded documents.
+    Args:
+        text1 (str): Original text.
+        text2 (str): Modified text.
+    Returns:
+        tuple[str, str]: (html_for_text1, html_for_text2).
+    """
+    # Function-scope imports: the module itself only imports SequenceMatcher
+    # from difflib, so a bare `difflib.Differ` would raise NameError.
+    import html
+    from difflib import Differ
+    def span(color, word):
+        return f'<span style="background-color:{color}; display: inline-block;">{html.escape(word)}</span>'
+    # Drop Differ's "? " hint lines: they carry no words and would sit between
+    # a removal and its replacement, hiding the pair from the look-ahead below.
+    entries = [entry for entry in Differ().compare(text1.split(), text2.split())
+               if not entry.startswith("? ")]
+    left, right = [], []
+    i = 0
+    while i < len(entries):
+        tag, word = entries[i][:2], entries[i][2:]
+        if tag == "- ":
+            left.append(span("#ffcccc", word))
+            if i + 1 < len(entries) and entries[i + 1].startswith("+ "):
+                right.append(span("#ffffcc", entries[i + 1][2:]))
+                # The consumed addition contributes one separator space per side.
+                left.append(" ")
+                right.append(" ")
+                i += 1
+            else:
+                right.append(" ")
+        elif tag == "+ ":
+            right.append(span("#ccffcc", word))
+            left.append(" ")
+        else:
+            # "  " entry: the word is common to both texts.
+            shared = html.escape(word) + " "
+            left.append(shared)
+            right.append(shared)
+        i += 1
+    return "".join(left), "".join(right)
+def calculate_similarity(text1, text2):
+    """Return how similar two texts are, as a percentage.
+    The score is the cosine similarity of the TF-IDF vectors of both texts
+    (tokens are runs of word characters, including single characters). If the
+    vectoriser raises ValueError, the character-level SequenceMatcher ratio is
+    used instead.
+    Args:
+        text1 (str): First text.
+        text2 (str): Second text.
+    Returns:
+        float: Similarity multiplied by 100; 0.0 when either text is blank.
+    """
+    if not text1.strip() or not text2.strip():
+        return 0.0
+    try:
+        vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b')
+        tfidf_matrix = vectorizer.fit_transform([text1, text2])
+        similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
+        return similarity[0][0] * 100
+    except ValueError:
+        # The module imports SequenceMatcher by name (not the difflib module),
+        # so it must be referenced directly here.
+        return SequenceMatcher(None, text1, text2).ratio() * 100
+def load_contract(file):
+    """Read an uploaded txt, pdf or docx file and return its stripped text.
+    Returns "" when no file is given, when the extension is not supported
+    (a warning is shown), or when reading fails (an error is shown).
+    """
+    if file is None:
+        return ""
+    suffix = file.name.rsplit('.', 1)[-1].lower()
+    try:
+        if suffix == 'pdf':
+            content = extract_text_from_pdf(file)
+        elif suffix == 'docx':
+            content = docx2txt.process(file)
+        elif suffix == 'txt':
+            content = file.getvalue().decode("utf-8")
+        else:
+            st.warning('Unsupported file type')
+            return ""
+        if not content:
+            return ""
+        return content.strip()
+    except Exception as e:
+        st.error(f"Error loading {suffix.upper()} file: {str(e)}")
+        return ""
+# ================== OCR and Image Processing Functions ==================
+def convert_pdf_to_images(pdf_path, output_folder):
+    """Render every page of a PDF to an image and save it as a PNG.
+    Args:
+        pdf_path (str): Path of the PDF to render.
+        output_folder (str): Directory receiving page_1.png, page_2.png, ...;
+            it is created if it does not exist yet.
+    Returns:
+        list: The page images in page order, as returned by convert_from_path.
+    """
+    logging.debug(f"Converting PDF to images: {pdf_path}")
+    # image.save fails on a missing directory, so make sure it is there first.
+    os.makedirs(output_folder, exist_ok=True)
+    images = convert_from_path(pdf_path)
+    for page_number, image in enumerate(images, start=1):
+        image.save(os.path.join(output_folder, f"page_{page_number}.png"))
+    logging.debug(f"Converted {len(images)} pages and saved to {output_folder}.")
+    return images
+def detect_colored_regions(image, color, page_num, base_dir):
+    """Find the outer contours of the orange or blue areas of a page image.
+    The raw colour mask and its morphologically closed version are written to
+    <base_dir>/masks as PNG files before the contours are computed.
+    Args:
+        image: Page image; converted to an array and from RGB to BGR order.
+        color (str): 'orange' or 'blue'.
+        page_num: Page number used in log messages and mask file names.
+        base_dir (str): Directory whose "masks" sub-folder receives the masks.
+    Returns:
+        The contours found by cv2.findContours on the closed mask.
+    Raises:
+        ValueError: If color is neither 'orange' nor 'blue'.
+    """
+    logging.debug(f"Detecting {color} regions on page {page_num}.")
+    bgr_pixels = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
+    bounds_by_color = {
+        'orange': (ORANGE_LOWER_BOUND, ORANGE_UPPER_BOUND),
+        'blue': (BLUE_LOWER_BOUND, BLUE_UPPER_BOUND),
+    }
+    if color not in bounds_by_color:
+        raise ValueError("Color not supported")
+    lower_bound, upper_bound = bounds_by_color[color]
+    logging.debug(
+        f"Using lower bound {lower_bound} and upper bound {upper_bound} for color detection.")
+    # Raw mask: pixels whose channels all lie within the bounds.
+    mask = cv2.inRange(bgr_pixels, lower_bound, upper_bound)
+    logging.debug("Mask created. Saving mask for verification.")
+    mask_image_path = os.path.join(base_dir, "masks", f"mask_page_{page_num}.png")
+    Image.fromarray(mask).save(mask_image_path)
+    logging.debug(f"Saved mask to {mask_image_path}")
+    # Closed mask: the close operation bridges gaps smaller than the kernel.
+    closed_mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, np.ones(KERNEL_SIZE, np.uint8))
+    closed_mask_image_path = os.path.join(base_dir, "masks",
+                                         f"closed_mask_page_{page_num}.png")
+    Image.fromarray(closed_mask).save(closed_mask_image_path)
+    logging.debug(f"Saved closed mask to {closed_mask_image_path}")
+    found = cv2.findContours(closed_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    contours, _ = found
+    logging.debug(f"Found {len(contours)} contours.")
+    return contours
+def expand_bounding_box(x, y, w, h, expand_by, image_width, image_height):
+    """Grow a bounding box by expand_by pixels on every side, clipped to the image.
+    Each edge is moved outward independently and then clamped, so a box that
+    touches one border never gains extra margin on the opposite side.
+    Args:
+        x, y: Top-left corner of the box.
+        w, h: Width and height of the box.
+        expand_by: Margin to add on each side.
+        image_width, image_height: Size of the image the box must stay inside.
+    Returns:
+        tuple: (x, y, w, h) of the expanded, clipped box.
+    """
+    left = max(0, x - expand_by)
+    top = max(0, y - expand_by)
+    right = min(image_width, x + w + expand_by)
+    bottom = min(image_height, y + h + expand_by)
+    # Guard against a negative size when the box lies outside the image.
+    return left, top, max(0, right - left), max(0, bottom - top)
+def ocr_image(image):
+    """Run Tesseract on an image (flags --oem 3 --psm 6) and return its text."""
+    logging.debug("Performing OCR on image.")
+    recognised = pytesseract.image_to_string(image, config=r'--oem 3 --psm 6')
+    logging.debug("OCR completed.")
+    return recognised
+def postprocess_ocr_text(text):
+    """Replace known OCR misreadings in text with their intended spelling."""
+    # (misread, intended) pairs, applied in order to the whole text.
+    known_fixes = (
+        ("Clinvar", "ClinVar"),
+    )
+    cleaned = text
+    for misread, intended in known_fixes:
+        cleaned = cleaned.replace(misread, intended)
+    return cleaned
+# ================== Caching Functions ==================
+def get_cache_filename(query):
+    """Return the cache path for query: cache/<md5 hex digest of query>.json."""
+    digest = hashlib.md5(query.encode()).hexdigest()
+    return os.path.join("cache", digest + ".json")
+def read_cache(query):
+    """Return the cached response for query, or None on a cache miss.
+    A cache file that exists but cannot be read or parsed (for example one left
+    half-written by an interrupted run) is logged and treated as a miss, so a
+    damaged cache entry never aborts the caller.
+    Args:
+        query (str): The query whose cached response is wanted.
+    Returns:
+        The decoded JSON value stored for query, or None.
+    """
+    cache_filename = get_cache_filename(query)
+    if not os.path.exists(cache_filename):
+        return None
+    try:
+        with open(cache_filename, "r") as cache_file:
+            return json.load(cache_file)
+    except (OSError, ValueError) as e:
+        # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
+        logging.warning(f"Ignoring unreadable cache file {cache_filename}: {e}")
+        return None
+def write_cache(query, response):
+    """Store response as JSON in the cache file that belongs to query.
+    The "cache" directory is created on demand.
+    """
+    target_path = get_cache_filename(query)
+    os.makedirs("cache", exist_ok=True)
+    with open(target_path, "w") as sink:
+        json.dump(response, sink)
+# ================== Free Model Interaction Function ==================
+def _with_unknown_components(summary_json):
+    """Return copies of the summary items with component_name set to "Unknown"."""
+    return [{**item, "component_name": "Unknown"} for item in summary_json]
+def find_component_name(summary_json, pdf_payload):
+    """
+    Finds the component name using a free model with caching.
+    The model is asked to return the summary with every
+    "__COMPONENT_PLACEHOLDER__" replaced. When the model is not loaded, fails,
+    or answers with something that is not a JSON list, a copy of the summary
+    with every component_name set to "Unknown" is returned instead; the input
+    items are not modified.
+    Args:
+    summary_json (list): Summary of changes.
+    pdf_payload (dict): Payload data from the PDF metadata.
+    Returns:
+    list: Updated summary JSON with component names.
+    """
+    # The query text doubles as the cache key, so it must stay byte-stable.
+    query = f"""
+    Here is a summary of PDF diffing script:
+    {json.dumps(summary_json)}
+    Here is a payload which helped to generate the PDF:
+    {json.dumps(pdf_payload)}
+    VERY IMPORTANT. Give the answer in JSON format of the the summary json structure described above by replacing "__COMPONENT_PLACEHOLDER__" with the name of the `componentName` involved in the diff. Replace with "Unknown" if you unable to recognize the source component.
+    The JSON should be valid and parseable by python's json.loads(...) function
+    DO NOT use any formatting.
+    """
+    if generator is None:
+        logging.warning(
+            "Free model is not loaded, returning original summary.")
+        return _with_unknown_components(summary_json)
+    # Check cache; compare against None so a cached empty list still counts as a hit.
+    cached_response = read_cache(query)
+    if cached_response is not None:
+        logging.debug("Returning cached response.")
+        return cached_response
+    try:
+        response = generator(query, max_length=512)  # Adjust max_length as needed
+        response_text = response[0]['generated_text']
+        logging.debug(f"Response from free model: {response_text}")
+        response_data = json.loads(response_text)  # Parse the generated JSON
+        if not isinstance(response_data, list):
+            # Valid JSON of the wrong shape is as unusable as invalid JSON.
+            raise ValueError("model response is not a JSON list")
+    except Exception as e:
+        logging.error(
+            f"Error getting response from free model: {e}.  Returning original summary")
+        return _with_unknown_components(summary_json)
+    # Write to cache
+    write_cache(query, response_data)
+    return response_data
+# ================== Main Function (Modified for Integration) ==================
+def _ocr_colored_regions(diff_image, source_images, page_num, color, side_dir):
+    """OCR every `color` region of one diff page, cropping from the source PDF page.
+    Regions are located on diff_image, expanded by EXPAND_BY, cropped out of
+    source_images[page_num], saved under <side_dir>/regions and
+    <side_dir>/contours, then passed through OCR and OCR post-processing.
+    Returns:
+        list: One (text, (x, y, w, h)) tuple per detected region.
+    """
+    page_label = page_num + 1
+    image_width, image_height = diff_image.size
+    contours = detect_colored_regions(diff_image, color, page_label, side_dir)
+    logging.debug(f"Merged to {len(contours)} {color} contours.")
+    regions = []
+    for rect_num, cnt in enumerate(contours, start=1):
+        x, y, w, h = cv2.boundingRect(cnt)
+        x, y, w, h = expand_bounding_box(x, y, w, h, EXPAND_BY, image_width,
+                                         image_height)
+        logging.debug(
+            f"{color.capitalize()} Rect {rect_num}: Expanded bounding box (x={x}, y={y}, w={w}, h={h})")
+        # Indexed lazily so a page without regions never touches source_images.
+        roi = source_images[page_num].crop((x, y, x + w, y + h))
+        # Save the detected region to disk
+        region_path = os.path.join(side_dir, "regions",
+                                   f"page_{page_label}_region_{rect_num}.png")
+        roi.save(region_path)
+        logging.debug(f"Saved detected region to {region_path}")
+        # Save the merged region to disk
+        merged_region_path = os.path.join(side_dir, "contours",
+                                          f"page_{page_label}_merged_region_{rect_num}.png")
+        roi.save(merged_region_path)
+        logging.debug(f"Saved merged region to {merged_region_path}")
+        # Perform OCR on the detected region
+        text = postprocess_ocr_text(ocr_image(roi))
+        logging.debug(f"Extracted {color} text: {text}")
+        regions.append((text, (x, y, w, h)))
+    return regions
+def _classify_change(similarity_ratio):
+    """Map a text similarity ratio to the change label used in the summary."""
+    if similarity_ratio == 1.0:
+        return "style change"
+    if similarity_ratio >= SIMILARITY_THRESHOLD:
+        return "wording change"
+    return "content change"
+def analyze_differences(diff_pdf, baseline_pdf, changed_pdf):
+    """
+    Analyzes the differences between the baseline and changed PDFs by detecting and comparing regions with differences.
+    Orange regions of the diff PDF are read from the changed PDF and blue
+    regions from the baseline PDF. The n-th blue region is compared with the
+    n-th orange region; surplus regions on either side are ignored (a warning
+    is logged). Intermediate images are written below "temp_diff_analysis".
+    Relies on setup_output_directories, extract_metadata and compare_texts,
+    which are not defined in this part of the file — presumably provided
+    elsewhere in the module; verify.
+    Args:
+        diff_pdf (str): Path to the diff PDF.
+        baseline_pdf (str): Path to the baseline PDF.
+        changed_pdf (str): Path to the changed PDF.
+    Returns:
+        list: Summary of changes with component names; [] when the baseline
+        PDF has no metadata.
+    """
+    # Setup output directories (using temp dirs)
+    temp_dir = "temp_diff_analysis"
+    os.makedirs(temp_dir, exist_ok=True)
+    setup_output_directories([temp_dir])
+    # Extract metadata from baseline PDF
+    baseline_metadata = extract_metadata(baseline_pdf)
+    if baseline_metadata is None:
+        logging.debug("No metadata found in baseline PDF.")
+        return []  # Return empty list for consistency
+    payload = baseline_metadata["payload"]
+    baseline_dir = os.path.join(temp_dir, "baseline")
+    changed_dir = os.path.join(temp_dir, "changed")
+    # Convert the three PDFs to page images
+    diff_images = convert_pdf_to_images(diff_pdf, os.path.join(temp_dir, "diff_pages"))
+    baseline_images = convert_pdf_to_images(baseline_pdf, os.path.join(baseline_dir, "pages"))
+    changed_images = convert_pdf_to_images(changed_pdf, os.path.join(changed_dir, "pages"))
+    # Each entry: (1-based page number, OCR text, (x, y, w, h)).
+    baseline_regions = []
+    changed_regions = []
+    for page_num, diff_image in enumerate(diff_images):
+        logging.debug(f"Processing page {page_num + 1}/{len(diff_images)}")
+        # Orange marks text as it appears in the changed document
+        for text, box in _ocr_colored_regions(diff_image, changed_images, page_num,
+                                              'orange', changed_dir):
+            changed_regions.append((page_num + 1, text, box))
+        # Blue marks text as it appears in the baseline document
+        for text, box in _ocr_colored_regions(diff_image, baseline_images, page_num,
+                                              'blue', baseline_dir):
+            baseline_regions.append((page_num + 1, text, box))
+    if len(baseline_regions) != len(changed_regions):
+        # zip() below stops at the shorter list; make the dropped regions visible.
+        logging.warning(
+            f"Region count mismatch: {len(baseline_regions)} baseline vs {len(changed_regions)} changed; unmatched regions are ignored.")
+    # Analyze differences
+    changes = []
+    for baseline_region, changed_region in zip(baseline_regions, changed_regions):
+        baseline_page_num, baseline_text, baseline_contour = baseline_region
+        _, changed_text, changed_contour = changed_region
+        similarity_ratio = compare_texts(baseline_text, changed_text)
+        change_type = _classify_change(similarity_ratio)
+        offset = {
+            "x_offset": changed_contour[0] - baseline_contour[0],
+            "y_offset": changed_contour[1] - baseline_contour[1]
+        }
+        changes.append({
+            "page_num": baseline_page_num,
+            "baseline_text": baseline_text.replace("\n", " ").strip(),
+            "changed_text": changed_text.replace("\n", " ").strip(),
+            "type": change_type,
+            # The positional offset is only reported when the text is identical.
+            "offset": offset if change_type == "style change" else None,
+            "component_name": "__COMPONENT_PLACEHOLDER__"
+        })
+    # Call  model to determine component names
+    updated_changes_summary = find_component_name(changes, payload)
+    return updated_changes_summary
+def main():
+    """Main function to run the Streamlit app."""
+    # ... (Load questions - as before)
+    questions = load_questions()
+    questions_short = load_questions_short()
+    if not questions or not questions_short or len(questions) != len(
+            questions_short):
+        st.error(
+            "Failed to load questions or questions mismatch. Please check data files.")
+        return
+    st.title("📑 Contract Analysis Suite")
+    st.markdown(
+        """
+    Compare documents and analyze legal clauses using AI-powered question answering.
+    """)
+    # ===== DOCUMENT UPLOAD SECTION =====
+    st.header("1. Upload Documents")
+    col1, col2 = st.columns(2)
+    with col1:
+        uploaded_file1 = st.file_uploader(
+            "Upload First Document",
+            type=["txt", "pdf", "docx"],
+            key="file1"
+        )
+        contract_text1 = load_contract(uploaded_file1) if uploaded_file1 else ""
+        doc1_display = st.empty()
+    with col2:
+        uploaded_file2 = st.file_uploader(
+            "Upload Second Document",
+            type=["txt", "pdf", "docx"],
+            key="file2"
+        )
+        contract_text2 = load_contract(uploaded_file2) if uploaded_file2 else ""
+        doc2_display = st.empty()
+    # Update document displays
+    if uploaded_file1:
+        doc1_display.text_area("Document 1 Content",
+                            value=contract_text1,
+                            height=400,
+                            key="area1")
+    if uploaded_file2:
+        doc2_display.text_area("Document 2 Content",
+                            value=contract_text2,
+                            height=400,
+                            key="area2")
+    if not (uploaded_file1 and uploaded_file2):
+        st.warning("Please upload both documents to proceed")
+        return
+    # ===== DOCUMENT COMPARISON SECTION =====
+    st.header("2. Document Comparison")
+    with st.expander("Show Document Differences", expanded=True):
+        if st.button("Compare Documents"):
+            with st.spinner("Analyzing documents..."):
+                if not contract_text1.strip() or not contract_text2.strip():
+                    st.error(
+                        "One or both documents appear to be empty or couldn't be read properly")
+                    return
+                similarity_score = calculate_similarity(contract_text1,
+                                                          contract_text2)
+                highlighted_diff1, highlighted_diff2 = highlight_differences_words(
+                    contract_text1, contract_text2)
+                st.session_state.comparison_results = {
+                    'similarity_score': similarity_score,
+                    'highlighted_diff1': highlighted_diff1,
+                    'highlighted_diff2': highlighted_diff2,
+                }
+        # Display comparison results
+        if st.session_state.comparison_results:
+            st.metric("Document Similarity Score",
+                        f"{st.session_state.comparison_results['similarity_score']:.2f}%")
+            if st.session_state.comparison_results['similarity_score'] < 50:
+                st.warning("Significant differences detected")
+            st.markdown("**Visual Difference Highlighting:**")
+            col1, col2 = st.columns(2)
+            with col1:
+                st.markdown("### Original Document")
+                st.markdown(
+                    f'<div style="border:1px solid #ccc; padding:10px; white-space: pre-wrap; font-family: monospace; font-size: 0.9em; max-height: 500px; overflow-y: auto;">{st.session_state.comparison_results["highlighted_diff1"]}</div>',
+                    unsafe_allow_html=True)
+            with col2:
+                st.markdown("### Modified Document")
+                st.markdown(
+                    f'<div style="border:1px solid #ccc; padding:10px; white-space: pre-wrap; font-family: monospace; font-size: 0.9em; max-height: 500px; overflow-y: auto;">{st.session_state.comparison_results["highlighted_diff2"]}</div>',
+                    unsafe_allow_html=True)
+    # ===== QUESTION ANALYSIS SECTION ==