ludigija committed on
Commit
66a7f5c
·
verified ·
1 Parent(s): 383dff9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +543 -152
app.py CHANGED
@@ -1,155 +1,546 @@
1
- import streamlit as st
2
- from predict import run_prediction
3
- from io import StringIO
 
 
 
 
 
 
 
4
  import json
5
-
6
- st.set_page_config(layout="wide")
7
- st.cache(show_spinner=False, persist=True)
8
-
9
-
10
- def load_questions():
11
- questions = []
12
- with open('data/questions.txt') as f:
13
- questions = f.readlines()
14
-
15
- # questions = []
16
- # for i, q in enumerate(data['data'][0]['paragraphs'][0]['qas']):
17
- # question = data['data'][0]['paragraphs'][0]['qas'][i]['question']
18
- # questions.append(question)
19
- return questions
20
-
21
-
22
- def load_questions_short():
23
- questions_short = []
24
- with open('data/questions_short.txt') as f:
25
- questions_short = f.readlines()
26
-
27
- # questions = []
28
- # for i, q in enumerate(data['data'][0]['paragraphs'][0]['qas']):
29
- # question = data['data'][0]['paragraphs'][0]['qas'][i]['question']
30
- # questions.append(question)
31
- return questions_short
32
-
33
-
34
- st.cache(show_spinner=False, persist=True)
35
-
36
-
37
- def load_contracts():
38
- with open('data/test.json') as json_file:
39
- data = json.load(json_file)
40
-
41
- contracts = []
42
- for i, q in enumerate(data['data']):
43
- contract = ' '.join(data['data'][i]['paragraphs'][0]['context'].split())
44
- contracts.append(contract)
45
- return contracts
46
-
47
-
48
- questions = load_questions()
49
- questions_short = load_questions_short()
50
- # contracts = load_contracts()
51
-
52
- ### DEFINE SIDEBAR
53
- st.sidebar.title("Interactive Contract Analysis")
54
- st.sidebar.markdown(
55
- """
56
- This model uses a pretrained snapshot trained on the [Atticus](https://www.atticusprojectai.org/) Dataset - CUAD
57
-
58
- Model used for this demo: https://huggingface.co/marshmellow77/roberta-base-cuad
59
-
60
- Related blog posts:
61
- - https://bit.ly/3pKWICB
62
- - https://bit.ly/3ETApRO
63
- """
64
- )
65
-
66
- st.sidebar.header("Contract Selection")
67
-
68
- # select contract
69
- contracts_drop = ['Contract 1', 'Contract 2', 'Contract 3']
70
- contracts_files = ['contract-1.txt', 'contract-2.txt', 'contract-3.txt']
71
- contract = st.sidebar.selectbox('Please Select a Contract', contracts_drop)
72
-
73
-
74
- idx = contracts_drop.index(contract)
75
- with open('data/'+contracts_files[idx]) as f:
76
- contract_data = f.read()
77
-
78
- # upload contract
79
- user_upload = st.sidebar.file_uploader('Please upload your own', type=['txt'],
80
- accept_multiple_files=False)
81
-
82
-
83
- # process upload
84
- if user_upload is not None:
85
- print(user_upload.name, user_upload.type)
86
- extension = user_upload.name.split('.')[-1].lower()
87
- if extension == 'txt':
88
- print('text file uploaded')
89
- # To convert to a string based IO:
90
- stringio = StringIO(user_upload.getvalue().decode("utf-8"))
91
-
92
- # To read file as string:
93
- contract_data = stringio.read()
94
-
95
- # elif extension == 'pdf':
96
- # import PyPDF4
97
- # try:
98
- # # Extracting Text from PDFs
99
- # pdfReader = PyPDF4.PdfFileReader(user_upload)
100
- # print(pdfReader.numPages)
101
- # contract_data = ''
102
- # for i in range(0, pdfReader.numPages):
103
- #
104
- # print(i)
105
- # pageobj = pdfReader.getPage(i)
106
- # contract_data = contract_data + pageobj.extractText()
107
- # except:
108
- # st.warning('Unable to read PDF, please try another file')
109
- #
110
- # elif extension == 'docx':
111
- # import docx2txt
112
- #
113
- # contract_data = docx2txt.process(user_upload)
114
-
115
- else:
116
- st.warning('Unknown uploaded file type, please try again')
117
-
118
- results_drop = ['1', '2', '3']
119
- number_results = st.sidebar.selectbox('Select number of results', results_drop)
120
-
121
- ### DEFINE MAIN PAGE
122
- st.header("Legal Contract Review Demo")
123
- st.write("This demo uses the CUAD dataset for Contract Understanding.")
124
-
125
- paragraph = st.text_area(label="Contract", value=contract_data, height=300)
126
-
127
- questions_drop = questions_short
128
- question_short = st.selectbox('Choose one of the 41 queries from the CUAD dataset:', questions_drop)
129
- idxq = questions_drop.index(question_short)
130
- question = questions[idxq]
131
-
132
- if st.button('Analyze'):
133
- if (not len(paragraph)==0) and not (len(question)==0):
134
- print('getting predictions')
135
- with st.spinner(text='Analysis in progress...'):
136
- predictions = run_prediction([question], paragraph, 'marshmellow77/roberta-base-cuad',
137
- n_best_size=5)
138
- answer = ""
139
- if predictions['0'] == "":
140
- answer = 'No answer found in document'
141
  else:
142
- # if number_results == '1':
143
- # answer = f"Answer: {predictions['0']}"
144
- # # st.text_area(label="Answer", value=f"{answer}")
145
- # else:
146
- answer = ""
147
- with open("nbest.json") as jf:
148
- data = json.load(jf)
149
- for i in range(int(number_results)):
150
- answer += f"Answer {i+1}: {data['0'][i]['text']} -- \n"
151
- answer += f"Probability: {round(data['0'][i]['probability']*100,1)}%\n\n"
152
- st.success(answer)
153
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  else:
155
- st.write("Unable to call model, please select question and contract")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import pytesseract
3
+ from pdf2image import convert_from_path
4
+ from PIL import Image
5
+ import numpy as np
6
+ import cv2
7
+ import os
8
+ import shutil
9
+ from difflib import SequenceMatcher
10
+ from PyPDF2 import PdfReader
11
  import json
12
+ import logging
13
+ import argparse
14
+ import hashlib
15
+ from transformers import pipeline
16
+ import torch
17
+ import streamlit as st # Added Streamlit import
18
+ from io import StringIO
19
+ import docx2txt
20
+ import pdfplumber
21
+ from sklearn.feature_extraction.text import TfidfVectorizer
22
+ from sklearn.metrics.pairwise import cosine_similarity
23
+
24
# Constants
# The colour bounds are handed to cv2.inRange by detect_colored_regions after
# it converts the page from RGB to BGR, so each triple is ordered (B, G, R).
ORANGE_LOWER_BOUND = np.array([0, 120, 240])
ORANGE_UPPER_BOUND = np.array([239, 247, 255])
BLUE_LOWER_BOUND = np.array([230, 115, 0])
BLUE_UPPER_BOUND = np.array([255, 238, 218])
KERNEL_SIZE = (35, 35)  # structuring element used for the morphological close
EXPAND_BY = 10  # padding (in pixels) added around each detected bounding box
SIMILARITY_THRESHOLD = 0.7  # at/above: "wording change"; below: "content change"
FREE_MODEL_NAME = "google/flan-t5-large"  # You can change this

# Setup argument parser
parser = argparse.ArgumentParser(description="PDF Difference Analyzer")
parser.add_argument('--log-level', type=str, default='INFO',
                    help='Set the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)')
# This runs at import time, so sys.argv may carry options that belong to the
# hosting process rather than to this script. parse_args() would terminate the
# interpreter on any option it does not know; parse_known_args() ignores them.
args, _ignored_argv = parser.parse_known_args()

# Setup logging
logging.basicConfig(level=getattr(logging, args.log_level.upper(), logging.INFO),
                    format='\033[92m[%(asctime)s] %(levelname)s: %(message)s\033[0m',
                    datefmt='%Y-%m-%d %H:%M:%S')

# Check Python path
logging.debug(f"Python executable: {sys.executable}")
logging.debug(f"Python version: {sys.version}")
logging.debug(f"Python path: {sys.path}")

logging.debug("Tesseract imported successfully!")

# Initialize the Hugging Face Transformers pipeline
# NOTE(review): this executes every time the module body runs; if the hosting
# framework re-executes the script per interaction the model is reloaded each
# time -- consider caching the loaded pipeline.
logging.info(f"Loading free model: {FREE_MODEL_NAME}")
try:
    device = 0 if torch.cuda.is_available() else -1  # first GPU if present, else CPU
    generator = pipeline('text2text-generation', model=FREE_MODEL_NAME,
                         device=device)  # Can also use 'question-answering'
    logging.info(f"Free model {FREE_MODEL_NAME} loaded successfully.")
except Exception as e:
    # Best effort: the rest of the script checks `generator is None` and
    # degrades gracefully instead of failing at import.
    logging.error(
        f"Error loading the free model: {e}. The script will attempt to continue, but component name identification will not work.")
    generator = None  # Set generator to None to prevent further errors
63
+
64
+ # ================== UTILITY FUNCTIONS (Modified for Streamlit) ==================
65
def extract_text_from_pdf(uploaded_file):
    """Extract the text of every page of a PDF, handling different extraction methods.

    Args:
        uploaded_file: A path or file-like object accepted by ``pdfplumber.open``.

    Returns:
        str: The page texts, each followed by a blank-line separator, or ``""``
        when nothing but whitespace was extracted or extraction failed (in
        which case the error is also reported through ``st.error``).
    """
    try:
        with pdfplumber.open(uploaded_file) as pdf:
            page_texts = []
            for page in pdf.pages:
                try:
                    text = page.extract_text_formatted()  # Try to get formatted text
                except AttributeError:
                    # Method not available on this page object: use plain extraction.
                    text = None
                if not text:
                    text = page.extract_text()
                # A page without extractable text may yield None or "";
                # concatenating None used to raise TypeError and throw away
                # the text of every other page as well.
                page_texts.append((text or "") + "\n\n")  # Add page separator
            full_text = "".join(page_texts)
            return full_text if full_text.strip() else ""
    except Exception as e:
        st.error(f"PDF extraction error: {str(e)}")
        return ""
83
+
84
+
85
+
86
def highlight_differences_words(text1, text2):
    """Highlight the word-level differences between two texts as HTML.

    Both texts are split on whitespace and compared with ``difflib.Differ``.

    * A word removed from ``text1`` is wrapped in a red span on the left; if
      the very next diff entry is an addition, that word is shown in a yellow
      span on the right, otherwise the right side gets a blank.
    * A word only present in ``text2`` is wrapped in a green span on the
      right and the left side gets a blank.
    * Unchanged words are copied to both sides.

    Every word is HTML-escaped because the result is meant to be rendered as
    raw HTML, and every emitted word or span is followed by a single space.

    Args:
        text1 (str): Original text.
        text2 (str): Modified text.

    Returns:
        tuple[str, str]: HTML fragments for the original and modified text.
    """
    # Imported locally: the module only imports SequenceMatcher from difflib,
    # so the bare name `difflib` is otherwise unbound here.
    import difflib
    import html

    def _span(word, colour):
        return (f'<span style="background-color:{colour}; display: inline-block;">'
                f'{html.escape(word)}</span> ')

    diff = list(difflib.Differ().compare(text1.split(), text2.split()))

    left_parts = []
    right_parts = []
    skip_next = False
    for i, entry in enumerate(diff):
        if skip_next:
            # This addition was already emitted as the replacement of the
            # preceding removal.
            skip_next = False
            continue
        tag, word = entry[:2], entry[2:]
        if tag == "- ":
            left_parts.append(_span(word, "#ffcccc"))
            if i + 1 < len(diff) and diff[i + 1].startswith("+ "):
                right_parts.append(_span(diff[i + 1][2:], "#ffffcc"))
                skip_next = True
            else:
                right_parts.append(" ")
        elif tag == "+ ":
            # A removal directly before an addition always consumes it above,
            # so an addition reaching this branch never has a removal partner.
            right_parts.append(_span(word, "#ccffcc"))
            left_parts.append(" ")
        elif tag == "  ":
            token = html.escape(word) + " "
            left_parts.append(token)
            right_parts.append(token)
        # "? " hint lines produced by Differ carry no words and are skipped.
    return "".join(left_parts), "".join(right_parts)
116
+
117
+
118
def calculate_similarity(text1, text2):
    """Return the similarity of two texts as a percentage.

    The score is the cosine similarity of the TF-IDF vectors of both texts,
    multiplied by 100. If vectorisation raises ``ValueError`` the function
    falls back to ``difflib.SequenceMatcher``'s ratio, also multiplied by 100.

    Args:
        text1 (str): First text.
        text2 (str): Second text.

    Returns:
        float: ``0.0`` when either text is empty or whitespace-only,
        otherwise the similarity score.
    """
    # Local import so the fallback does not depend on a module-level `difflib`
    # name (the file only imports SequenceMatcher from it).
    from difflib import SequenceMatcher

    if not text1.strip() or not text2.strip():
        return 0.0
    try:
        # The token pattern keeps single-character tokens.
        vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b')
        tfidf_matrix = vectorizer.fit_transform([text1, text2])
        similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
        return float(similarity[0][0]) * 100
    except ValueError:
        return SequenceMatcher(None, text1, text2).ratio() * 100
129
+
130
+
131
def load_contract(file):
    """Read an uploaded contract (txt, pdf or docx) and return its text.

    Args:
        file: Uploaded file object exposing ``name`` and ``getvalue()``, or
            ``None``.

    Returns:
        str: The stripped text; ``""`` when no file was given, the extension
        is unsupported (a warning is shown) or reading failed (an error is
        shown).
    """
    if file is None:
        return ""
    # Everything after the last dot, lower-cased, selects the reader.
    ext = file.name.rsplit('.', 1)[-1].lower()
    try:
        if ext == 'txt':
            content = file.getvalue().decode("utf-8")
        elif ext == 'pdf':
            content = extract_text_from_pdf(file)
        elif ext == 'docx':
            content = docx2txt.process(file)
        else:
            st.warning('Unsupported file type')
            return ""
        if not content:
            return ""
        return content.strip()
    except Exception as e:
        st.error(f"Error loading {ext.upper()} file: {str(e)}")
        return ""
150
+
151
+
152
+ # ================== OCR and Image Processing Functions ==================
153
def convert_pdf_to_images(pdf_path, output_folder):
    """Render a PDF to page images and save each one as ``page_<n>.png``.

    Args:
        pdf_path: Path of the PDF passed to ``convert_from_path``.
        output_folder: Directory the PNG files are written to.

    Returns:
        The list of page images returned by ``convert_from_path``.
    """
    logging.debug(f"Converting PDF to images: {pdf_path}")
    pages = convert_from_path(pdf_path)
    # Files are numbered from 1.
    for page_number, page in enumerate(pages, start=1):
        target = os.path.join(output_folder, f"page_{page_number}.png")
        page.save(target)
    logging.debug(f"Converted {len(pages)} pages and saved to {output_folder}.")
    return pages
161
+
162
+
163
def detect_colored_regions(image, color, page_num, base_dir):
    """Find the outer contours of orange or blue areas on a page image.

    The raw colour mask and its morphologically closed version are written to
    ``<base_dir>/masks`` for inspection.

    Args:
        image: Page image; converted to an array and from RGB to BGR.
        color (str): ``'orange'`` or ``'blue'``.
        page_num: Page number used in log messages and mask file names.
        base_dir: Directory containing the ``masks`` sub-directory.

    Returns:
        The contours found by ``cv2.findContours`` on the closed mask.

    Raises:
        ValueError: If ``color`` is neither ``'orange'`` nor ``'blue'``.
    """
    logging.debug(f"Detecting {color} regions on page {page_num}.")
    bgr_pixels = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

    bounds_by_color = {
        'orange': (ORANGE_LOWER_BOUND, ORANGE_UPPER_BOUND),
        'blue': (BLUE_LOWER_BOUND, BLUE_UPPER_BOUND),
    }
    if color not in bounds_by_color:
        raise ValueError("Color not supported")
    lower_bound, upper_bound = bounds_by_color[color]

    logging.debug(
        f"Using lower bound {lower_bound} and upper bound {upper_bound} for color detection.")

    masks_dir = os.path.join(base_dir, "masks")

    mask = cv2.inRange(bgr_pixels, lower_bound, upper_bound)
    logging.debug("Mask created. Saving mask for verification.")
    mask_image_path = os.path.join(masks_dir, f"mask_page_{page_num}.png")
    Image.fromarray(mask).save(mask_image_path)
    logging.debug(f"Saved mask to {mask_image_path}")

    # Closing with a KERNEL_SIZE block of ones joins nearby mask pixels.
    closed_mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE,
                                   np.ones(KERNEL_SIZE, np.uint8))
    closed_mask_image_path = os.path.join(masks_dir,
                                          f"closed_mask_page_{page_num}.png")
    Image.fromarray(closed_mask).save(closed_mask_image_path)
    logging.debug(f"Saved closed mask to {closed_mask_image_path}")

    found = cv2.findContours(closed_mask, cv2.RETR_EXTERNAL,
                             cv2.CHAIN_APPROX_SIMPLE)
    contours = found[0]
    logging.debug(f"Found {len(contours)} contours.")
    return contours
200
+
201
+
202
def expand_bounding_box(x, y, w, h, expand_by, image_width, image_height):
    """Grow a bounding box by ``expand_by`` on every side, clipped to the image.

    Each edge is moved outwards independently and then clamped, so a box that
    touches the left/top border is not widened further on the opposite side
    to make up for the clipped margin.

    Args:
        x, y: Top-left corner of the box.
        w, h: Width and height of the box.
        expand_by: Margin added on each side.
        image_width, image_height: Image size used for clipping.

    Returns:
        tuple: ``(x, y, w, h)`` of the expanded, clipped box.
    """
    left = max(0, x - expand_by)
    top = max(0, y - expand_by)
    right = min(image_width, x + w + expand_by)
    bottom = min(image_height, y + h + expand_by)
    return left, top, right - left, bottom - top
209
+
210
+
211
def ocr_image(image):
    """Run Tesseract OCR on an image and return the recognised text.

    Tesseract is invoked with ``--oem 3 --psm 6``.
    """
    logging.debug("Performing OCR on image.")
    recognised = pytesseract.image_to_string(image, config=r'--oem 3 --psm 6')
    logging.debug("OCR completed.")
    return recognised
218
+
219
+
220
def postprocess_ocr_text(text):
    """Replace known OCR misreadings in ``text`` and return the result."""
    # (misread, intended) pairs, applied in order.
    replacements = (
        ("Clinvar", "ClinVar"),
    )
    cleaned = text
    for misread, intended in replacements:
        cleaned = cleaned.replace(misread, intended)
    return cleaned
228
+
229
+
230
+ # ================== Caching Functions ==================
231
def get_cache_filename(query):
    """Return ``cache/<md5 of query>.json``, the cache path for ``query``."""
    digest = hashlib.md5(query.encode()).hexdigest()
    return os.path.join("cache", digest + ".json")
235
+
236
+
237
def read_cache(query):
    """Return the cached JSON response for ``query``, or ``None`` if absent."""
    path = get_cache_filename(query)
    if not os.path.exists(path):
        return None
    with open(path, "r") as handle:
        return json.load(handle)
244
+
245
+
246
def write_cache(query, response):
    """Store ``response`` as JSON in the cache file belonging to ``query``.

    The ``cache`` directory is created first if it does not exist.
    """
    os.makedirs("cache", exist_ok=True)
    with open(get_cache_filename(query), "w") as handle:
        json.dump(response, handle)
252
+
253
+
254
+ # ================== Free Model Interaction Function ==================
255
def find_component_name(summary_json, pdf_payload):
    """
    Finds the component name using a free model with caching.

    The model is asked to return the summary with every
    "__COMPONENT_PLACEHOLDER__" replaced by a component name. Parsed model
    answers are cached on disk, keyed by the full prompt.

    Args:
        summary_json (list): Summary of changes (list of dicts).
        pdf_payload (dict): Payload data from the PDF metadata.

    Returns:
        list: Updated summary JSON with component names. When the model is
        not loaded, or its answer cannot be obtained or parsed as JSON, the
        original ``summary_json`` is returned with every item's
        ``component_name`` set to "Unknown" (the items are updated in place).
    """
    def _all_unknown():
        # dict.update() returns None, so building a list from its results
        # would hand callers a list of None; update first, then return the
        # summary itself.
        for item in summary_json:
            item["component_name"] = "Unknown"
        return summary_json

    query = f"""
    Here is a summary of PDF diffing script:
    {json.dumps(summary_json)}

    Here is a payload which helped to generate the PDF:
    {json.dumps(pdf_payload)}

    VERY IMPORTANT. Give the answer in JSON format of the the summary json structure described above by replacing "__COMPONENT_PLACEHOLDER__" with the name of the `componentName` involved in the diff. Replace with "Unknown" if you unable to recognize the source component.
    The JSON should be valid and parseable by python's json.loads(...) function
    DO NOT use any formatting.
    """
    if generator is None:
        logging.warning(
            "Free model is not loaded, returning original summary.")
        return _all_unknown()  # Sets all component names to unknown

    # Check cache. Compare against None so that a cached empty result is
    # still treated as a hit.
    cached_response = read_cache(query)
    if cached_response is not None:
        logging.debug("Returning cached response.")
        return cached_response

    try:
        response = generator(query, max_length=512)  # Adjust max_length as needed
        response_text = response[0]['generated_text']
        logging.debug(f"Response from free model: {response_text}")
        response_data = json.loads(response_text)  # Parse the generated JSON
    except Exception as e:
        logging.error(
            f"Error getting response from free model: {e}. Returning original summary")
        return _all_unknown()  # Sets all component names to unknown

    # Write to cache
    write_cache(query, response_data)
    return response_data
303
+
304
+
305
+ # ================== Main Function (Modified for Integration) ==================
306
def _ocr_colored_regions(diff_image, source_images, color, page_index, out_dir):
    """OCR every ``color`` region of one diff page, cropped from the source page.

    Regions are detected on ``diff_image``; each one is cropped out of
    ``source_images[page_index]``, saved under ``out_dir`` and OCR'd.

    Returns:
        list: ``(page_num, text, (x, y, w, h))`` per region, in contour order.
    """
    page_num = page_index + 1
    image_width, image_height = diff_image.size
    contours = detect_colored_regions(diff_image, color, page_num, out_dir)
    logging.debug(f"Merged to {len(contours)} {color} contours.")

    label = color.capitalize()
    regions = []
    for rect_num, cnt in enumerate(contours, start=1):
        x, y, w, h = cv2.boundingRect(cnt)
        x, y, w, h = expand_bounding_box(x, y, w, h, EXPAND_BY, image_width,
                                         image_height)
        logging.debug(
            f"{label} Rect {rect_num}: Expanded bounding box (x={x}, y={y}, w={w}, h={h})")
        # Indexed only when a region exists, so a missing source page matters
        # only for pages that actually contain regions.
        roi = source_images[page_index].crop((x, y, x + w, y + h))

        # Save the detected region to disk
        region_path = os.path.join(out_dir, "regions",
                                   f"page_{page_num}_region_{rect_num}.png")
        roi.save(region_path)
        logging.debug(f"Saved detected region to {region_path}")

        # Save the merged region to disk
        merged_region_path = os.path.join(out_dir, "contours",
                                          f"page_{page_num}_merged_region_{rect_num}.png")
        roi.save(merged_region_path)
        logging.debug(f"Saved merged region to {merged_region_path}")

        # Perform OCR on the detected region
        text = postprocess_ocr_text(ocr_image(roi))
        logging.debug(f"Extracted {color} text: {text}")
        regions.append((page_num, text, (x, y, w, h)))
    return regions


def analyze_differences(diff_pdf, baseline_pdf, changed_pdf):
    """
    Analyzes the differences between the baseline and changed PDFs by detecting and comparing regions with differences.

    Orange regions of the diff PDF are read from the changed PDF, blue regions
    from the baseline PDF. The n-th baseline region is compared with the n-th
    changed region; surplus regions on either side are ignored (a warning is
    logged). Intermediate images are written below ``temp_diff_analysis``.

    Relies on ``setup_output_directories``, ``extract_metadata`` and
    ``compare_texts``, which are expected to be defined elsewhere in this
    module.

    Args:
        diff_pdf (str): Path to the diff PDF.
        baseline_pdf (str): Path to the baseline PDF.
        changed_pdf (str): Path to the changed PDF.

    Returns:
        list: Summary of changes with component names; an empty list when the
        baseline PDF has no metadata.
    """
    # Setup output directories (using temp dirs)
    temp_dir = "temp_diff_analysis"
    os.makedirs(temp_dir, exist_ok=True)
    setup_output_directories([temp_dir])

    # Extract metadata from baseline PDF
    baseline_metadata = extract_metadata(baseline_pdf)
    if baseline_metadata is None:
        logging.debug("No metadata found in baseline PDF.")
        return []  # Return empty list for consistency

    payload = baseline_metadata["payload"]

    baseline_dir = os.path.join(temp_dir, "baseline")
    changed_dir = os.path.join(temp_dir, "changed")

    # Convert the three PDFs to page images
    diff_images = convert_pdf_to_images(diff_pdf, os.path.join(temp_dir, "diff_pages"))
    baseline_images = convert_pdf_to_images(baseline_pdf, os.path.join(baseline_dir, "pages"))
    changed_images = convert_pdf_to_images(changed_pdf, os.path.join(changed_dir, "pages"))

    baseline_regions = []
    changed_regions = []
    for page_index, diff_image in enumerate(diff_images):
        logging.debug(f"Processing page {page_index + 1}/{len(diff_images)}")
        # Orange marks text as it appears in the changed document ...
        changed_regions.extend(_ocr_colored_regions(
            diff_image, changed_images, 'orange', page_index, changed_dir))
        # ... blue marks text as it appears in the baseline document.
        baseline_regions.extend(_ocr_colored_regions(
            diff_image, baseline_images, 'blue', page_index, baseline_dir))

    if len(baseline_regions) != len(changed_regions):
        # zip() below stops at the shorter list; make the dropped regions visible.
        logging.warning(
            f"Region count mismatch: {len(baseline_regions)} baseline vs "
            f"{len(changed_regions)} changed; unmatched regions are ignored.")

    # Analyze differences
    changes = []
    for baseline_region, changed_region in zip(baseline_regions, changed_regions):
        baseline_page_num, baseline_text, baseline_contour = baseline_region
        _changed_page_num, changed_text, changed_contour = changed_region

        similarity_ratio = compare_texts(baseline_text, changed_text)
        offset = {
            "x_offset": changed_contour[0] - baseline_contour[0],
            "y_offset": changed_contour[1] - baseline_contour[1]
        }
        if similarity_ratio == 1.0:
            # Identical text inside a highlighted region: only its look/position moved.
            change_type = "style change"
        elif similarity_ratio >= SIMILARITY_THRESHOLD:
            change_type = "wording change"
        else:
            change_type = "content change"

        changes.append({
            "page_num": baseline_page_num,
            "baseline_text": baseline_text.replace("\n", " ").strip(),
            "changed_text": changed_text.replace("\n", " ").strip(),
            "type": change_type,
            "offset": offset if change_type == "style change" else None,
            "component_name": "__COMPONENT_PLACEHOLDER__"
        })

    # Call model to determine component names
    updated_changes_summary = find_component_name(changes, payload)
    return updated_changes_summary
443
+
444
+
445
+ def main():
446
+ """Main function to run the Streamlit app."""
447
+ # ... (Load questions - as before)
448
+ questions = load_questions()
449
+ questions_short = load_questions_short()
450
+
451
+ if not questions or not questions_short or len(questions) != len(
452
+ questions_short):
453
+ st.error(
454
+ "Failed to load questions or questions mismatch. Please check data files.")
455
+ return
456
+
457
+ st.title("📑 Contract Analysis Suite")
458
+ st.markdown(
459
+ """
460
+ Compare documents and analyze legal clauses using AI-powered question answering.
461
+ """)
462
+
463
+ # ===== DOCUMENT UPLOAD SECTION =====
464
+ st.header("1. Upload Documents")
465
+ col1, col2 = st.columns(2)
466
+
467
+ with col1:
468
+ uploaded_file1 = st.file_uploader(
469
+ "Upload First Document",
470
+ type=["txt", "pdf", "docx"],
471
+ key="file1"
472
+ )
473
+ contract_text1 = load_contract(uploaded_file1) if uploaded_file1 else ""
474
+ doc1_display = st.empty()
475
+
476
+ with col2:
477
+ uploaded_file2 = st.file_uploader(
478
+ "Upload Second Document",
479
+ type=["txt", "pdf", "docx"],
480
+ key="file2"
481
+ )
482
+ contract_text2 = load_contract(uploaded_file2) if uploaded_file2 else ""
483
+ doc2_display = st.empty()
484
+
485
+ # Update document displays
486
+ if uploaded_file1:
487
+ doc1_display.text_area("Document 1 Content",
488
+ value=contract_text1,
489
+ height=400,
490
+ key="area1")
491
+ if uploaded_file2:
492
+ doc2_display.text_area("Document 2 Content",
493
+ value=contract_text2,
494
+ height=400,
495
+ key="area2")
496
+
497
+ if not (uploaded_file1 and uploaded_file2):
498
+ st.warning("Please upload both documents to proceed")
499
+ return
500
+
501
+ # ===== DOCUMENT COMPARISON SECTION =====
502
+ st.header("2. Document Comparison")
503
+
504
+ with st.expander("Show Document Differences", expanded=True):
505
+ if st.button("Compare Documents"):
506
+ with st.spinner("Analyzing documents..."):
507
+ if not contract_text1.strip() or not contract_text2.strip():
508
+ st.error(
509
+ "One or both documents appear to be empty or couldn't be read properly")
510
+ return
511
+
512
+ similarity_score = calculate_similarity(contract_text1,
513
+ contract_text2)
514
+
515
+ highlighted_diff1, highlighted_diff2 = highlight_differences_words(
516
+ contract_text1, contract_text2)
517
+ st.session_state.comparison_results = {
518
+ 'similarity_score': similarity_score,
519
+ 'highlighted_diff1': highlighted_diff1,
520
+ 'highlighted_diff2': highlighted_diff2,
521
+
522
+ }
523
+
524
+ # Display comparison results
525
+ if st.session_state.comparison_results:
526
+ st.metric("Document Similarity Score",
527
+ f"{st.session_state.comparison_results['similarity_score']:.2f}%")
528
+
529
+ if st.session_state.comparison_results['similarity_score'] < 50:
530
+ st.warning("Significant differences detected")
531
+
532
+ st.markdown("**Visual Difference Highlighting:**")
533
+
534
+ col1, col2 = st.columns(2)
535
+ with col1:
536
+ st.markdown("### Original Document")
537
+ st.markdown(
538
+ f'<div style="border:1px solid #ccc; padding:10px; white-space: pre-wrap; font-family: monospace; font-size: 0.9em; max-height: 500px; overflow-y: auto;">{st.session_state.comparison_results["highlighted_diff1"]}</div>',
539
+ unsafe_allow_html=True)
540
+ with col2:
541
+ st.markdown("### Modified Document")
542
+ st.markdown(
543
+ f'<div style="border:1px solid #ccc; padding:10px; white-space: pre-wrap; font-family: monospace; font-size: 0.9em; max-height: 500px; overflow-y: auto;">{st.session_state.comparison_results["highlighted_diff2"]}</div>',
544
+ unsafe_allow_html=True)
545
+
546
+ # ===== QUESTION ANALYSIS SECTION ==