Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,155 +1,546 @@
|
|
1 |
-
import
|
2 |
-
|
3 |
-
from
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
import json
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
#
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
predictions = run_prediction([question], paragraph, 'marshmellow77/roberta-base-cuad',
|
137 |
-
n_best_size=5)
|
138 |
-
answer = ""
|
139 |
-
if predictions['0'] == "":
|
140 |
-
answer = 'No answer found in document'
|
141 |
else:
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
154 |
else:
|
155 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
import pytesseract
|
3 |
+
from pdf2image import convert_from_path
|
4 |
+
from PIL import Image
|
5 |
+
import numpy as np
|
6 |
+
import cv2
|
7 |
+
import os
|
8 |
+
import shutil
|
9 |
+
from difflib import SequenceMatcher
|
10 |
+
from PyPDF2 import PdfReader
|
11 |
import json
|
12 |
+
import logging
|
13 |
+
import argparse
|
14 |
+
import hashlib
|
15 |
+
from transformers import pipeline
|
16 |
+
import torch
|
17 |
+
import streamlit as st # Added Streamlit import
|
18 |
+
from io import StringIO
|
19 |
+
import docx2txt
|
20 |
+
import pdfplumber
|
21 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
22 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
23 |
+
|
24 |
+
# Constants
# The colour bounds are passed to cv2.inRange on an image that
# detect_colored_regions has converted to BGR, so each triple is (B, G, R).
ORANGE_LOWER_BOUND = np.array([0, 120, 240])
ORANGE_UPPER_BOUND = np.array([239, 247, 255])
BLUE_LOWER_BOUND = np.array([230, 115, 0])
BLUE_UPPER_BOUND = np.array([255, 238, 218])
KERNEL_SIZE = (35, 35)  # structuring element for the morphological close
EXPAND_BY = 10  # pixels added around each detected bounding box
SIMILARITY_THRESHOLD = 0.7  # at or above this a change is a "wording change"
FREE_MODEL_NAME = "google/flan-t5-large"  # You can change this

# Setup argument parser
parser = argparse.ArgumentParser(description="PDF Difference Analyzer")
parser.add_argument('--log-level', type=str, default='INFO',
                    help='Set the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)')
# parse_known_args instead of parse_args: this module is also loaded by
# launchers that put their own options on argv, and parse_args() would
# terminate the process on the first option it does not recognise.
args, _unknown_args = parser.parse_known_args()

# Setup logging (an unrecognised level name falls back to INFO)
logging.basicConfig(level=getattr(logging, args.log_level.upper(), logging.INFO),
                    format='\033[92m[%(asctime)s] %(levelname)s: %(message)s\033[0m',
                    datefmt='%Y-%m-%d %H:%M:%S')

# Check Python path
logging.debug(f"Python executable: {sys.executable}")
logging.debug(f"Python version: {sys.version}")
logging.debug(f"Python path: {sys.path}")

logging.debug("Tesseract imported successfully!")

# Initialize the Hugging Face Transformers pipeline
# NOTE(review): this runs at import time, so the model is loaded every time
# the module is executed — consider caching the loaded pipeline.
logging.info(f"Loading free model: {FREE_MODEL_NAME}")
try:
    device = 0 if torch.cuda.is_available() else -1  # GPU 0 if present, else CPU
    generator = pipeline('text2text-generation', model=FREE_MODEL_NAME,
                         device=device)  # Can also use 'question-answering'
    logging.info(f"Free model {FREE_MODEL_NAME} loaded successfully.")
except Exception as e:
    # Deliberate best-effort: the rest of the module still works without
    # the model; find_component_name checks for generator being None.
    logging.error(
        f"Error loading the free model: {e}. The script will attempt to continue, but component name identification will not work.")
    generator = None  # Set generator to None to prevent further errors
|
63 |
+
|
64 |
+
# ================== UTILITY FUNCTIONS (Modified for Streamlit) ==================
|
65 |
+
def extract_text_from_pdf(uploaded_file):
    """Extract the text of every page of a PDF, separated by blank lines.

    Args:
        uploaded_file: Path or file-like object accepted by ``pdfplumber.open``.

    Returns:
        str: Page texts joined with a blank line after each page, or ``""``
        when nothing could be extracted or an error occurred (the error is
        reported through ``st.error``).
    """
    try:
        page_texts = []
        with pdfplumber.open(uploaded_file) as pdf:
            for page in pdf.pages:
                # Prefer a formatted extraction when the page object offers
                # one; otherwise use the plain extraction.
                try:
                    text = page.extract_text_formatted()
                except AttributeError:
                    text = None
                if not text:
                    text = page.extract_text()
                # A page without text may yield None; treat it as empty so a
                # single blank page cannot abort the whole document.
                page_texts.append((text or "") + "\n\n")
        full_text = "".join(page_texts)
        return full_text if full_text.strip() else ""
    except Exception as e:
        st.error(f"PDF extraction error: {str(e)}")
        return ""
|
83 |
+
|
84 |
+
|
85 |
+
|
86 |
+
def highlight_differences_words(text1, text2):
    """Highlight word-level differences between two texts as HTML.

    Words are compared with ``difflib.Differ``. A removed word immediately
    followed by an added word is treated as a replacement (red on the left,
    yellow on the right); a lone removal is red on the left; a lone addition
    is green on the right. Unchanged words are emitted on both sides.

    Args:
        text1 (str): Original text.
        text2 (str): Modified text.

    Returns:
        tuple[str, str]: HTML fragments for the original and modified text.
        All document words are HTML-escaped, because the fragments are
        rendered as raw HTML by the caller.
    """
    # Local imports: the file only imports SequenceMatcher from difflib, so
    # the bare name ``difflib`` is not otherwise in scope.
    import difflib
    from html import escape

    def _span(color, word):
        return (f'<span style="background-color:{color}; display: inline-block;">'
                f'{escape(word)}</span>')

    diff = list(difflib.Differ().compare(text1.split(), text2.split()))

    left_parts = []
    right_parts = []
    consumed_as_replacement = False

    for i, entry in enumerate(diff):
        if consumed_as_replacement:
            # This "+ " entry was already rendered as the right-hand half of
            # a replacement; emit only the separating space on both sides.
            consumed_as_replacement = False
            left_parts.append(" ")
            right_parts.append(" ")
        elif entry.startswith("- "):
            left_parts.append(_span("#ffcccc", entry[2:]))
            if i + 1 < len(diff) and diff[i + 1].startswith("+ "):
                right_parts.append(_span("#ffffcc", diff[i + 1][2:]))
                consumed_as_replacement = True
            else:
                right_parts.append(" ")
        elif entry.startswith("+ "):
            right_parts.append(_span("#ccffcc", entry[2:]))
            left_parts.append(" ")
        elif entry.startswith(" "):
            word = escape(entry[2:])
            left_parts.append(word + " ")
            right_parts.append(word + " ")
        # "? " hint lines produced by Differ carry no words and are skipped.

    return "".join(left_parts), "".join(right_parts)
|
116 |
+
|
117 |
+
|
118 |
+
def calculate_similarity(text1, text2):
    """Return the similarity of two texts as a percentage.

    Uses TF-IDF vectors and cosine similarity; if vectorisation raises
    ``ValueError`` the character-level ``SequenceMatcher`` ratio is used
    instead.

    Args:
        text1 (str): First text.
        text2 (str): Second text.

    Returns:
        float: Similarity scaled by 100; ``0.0`` if either text is blank.
    """
    if not text1.strip() or not text2.strip():
        return 0.0
    try:
        vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b')
        tfidf_matrix = vectorizer.fit_transform([text1, text2])
        similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
        return float(similarity[0][0]) * 100
    except ValueError:
        # Local import: only SequenceMatcher is imported from difflib at file
        # level, so the bare name ``difflib`` is not otherwise in scope.
        import difflib
        return difflib.SequenceMatcher(None, text1, text2).ratio() * 100
|
129 |
+
|
130 |
+
|
131 |
+
def load_contract(file):
    """Load contract text from an uploaded file (txt, pdf, docx).

    Args:
        file: Uploaded file object exposing ``name`` and, for text files,
            ``getvalue()``; may be ``None``.

    Returns:
        str: The stripped document text, or ``""`` when ``file`` is ``None``,
        the extension is unsupported, the document is empty, or loading
        failed (failures are reported through ``st.error``).
    """
    if file is None:
        return ""
    ext = file.name.split('.')[-1].lower()
    try:
        if ext == 'txt':
            content = file.getvalue().decode("utf-8")
        elif ext == 'pdf':
            content = extract_text_from_pdf(file)
        elif ext == 'docx':
            content = docx2txt.process(file)
        else:
            st.warning('Unsupported file type')
            return ""
        return content.strip() if content else ""
    except Exception as e:
        st.error(f"Error loading {ext.upper()} file: {str(e)}")
        return ""
|
150 |
+
|
151 |
+
|
152 |
+
# ================== OCR and Image Processing Functions ==================
|
153 |
+
def convert_pdf_to_images(pdf_path, output_folder):
    """Render each page of a PDF to an image and save it as a PNG.

    Args:
        pdf_path (str): Path of the PDF to convert.
        output_folder (str): Directory that receives ``page_<n>.png`` files
            (1-based); created if it does not exist.

    Returns:
        list: The page images returned by ``convert_from_path``, in page order.
    """
    logging.debug(f"Converting PDF to images: {pdf_path}")
    # Saving fails with FileNotFoundError if the target directory is missing.
    os.makedirs(output_folder, exist_ok=True)
    images = convert_from_path(pdf_path)
    for page_no, image in enumerate(images, start=1):
        image.save(os.path.join(output_folder, f"page_{page_no}.png"))
    logging.debug(f"Converted {len(images)} pages and saved to {output_folder}.")
    return images
|
161 |
+
|
162 |
+
|
163 |
+
def detect_colored_regions(image, color, page_num, base_dir):
    """Detect regions of a given highlight colour in a page image.

    The raw colour mask and its morphologically closed version are written
    to ``<base_dir>/masks`` for inspection.

    Args:
        image: RGB page image (convertible with ``np.array``).
        color (str): ``'orange'`` or ``'blue'``.
        page_num (int): Page number used in the mask file names.
        base_dir (str): Directory under which the ``masks`` folder lives.

    Returns:
        The external contours found in the closed mask.

    Raises:
        ValueError: If ``color`` is not supported.
    """
    logging.debug(f"Detecting {color} regions on page {page_num}.")

    # Validate the colour before doing any image work.
    if color == 'orange':
        lower_bound = ORANGE_LOWER_BOUND
        upper_bound = ORANGE_UPPER_BOUND
    elif color == 'blue':
        lower_bound = BLUE_LOWER_BOUND
        upper_bound = BLUE_UPPER_BOUND
    else:
        raise ValueError("Color not supported")

    img_np = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

    logging.debug(
        f"Using lower bound {lower_bound} and upper bound {upper_bound} for color detection.")

    masks_dir = os.path.join(base_dir, "masks")
    os.makedirs(masks_dir, exist_ok=True)  # saving fails if the folder is missing

    mask = cv2.inRange(img_np, lower_bound, upper_bound)
    logging.debug("Mask created. Saving mask for verification.")
    mask_image_path = os.path.join(masks_dir, f"mask_page_{page_num}.png")
    Image.fromarray(mask).save(mask_image_path)
    logging.debug(f"Saved mask to {mask_image_path}")

    # Closing merges nearby highlighted fragments into one blob.
    kernel = np.ones(KERNEL_SIZE, np.uint8)
    closed_mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel)
    closed_mask_image_path = os.path.join(masks_dir,
                                          f"closed_mask_page_{page_num}.png")
    Image.fromarray(closed_mask).save(closed_mask_image_path)
    logging.debug(f"Saved closed mask to {closed_mask_image_path}")

    # findContours returns (contours, hierarchy) on OpenCV 4 and
    # (image, contours, hierarchy) on OpenCV 3; [-2] is contours in both.
    contours = cv2.findContours(
        closed_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    logging.debug(f"Found {len(contours)} contours.")
    return contours
|
200 |
+
|
201 |
+
|
202 |
+
def expand_bounding_box(x, y, w, h, expand_by, image_width, image_height):
    """Grow a bounding box by ``expand_by`` pixels per side, clipped to the image.

    Each edge is moved outwards independently and then clamped, so a box
    touching one border is not over-extended on the opposite side.

    Args:
        x, y (int): Top-left corner of the box.
        w, h (int): Width and height of the box.
        expand_by (int): Pixels to add on every side.
        image_width, image_height (int): Image dimensions used for clipping.

    Returns:
        tuple[int, int, int, int]: The expanded ``(x, y, w, h)``.
    """
    left = max(0, x - expand_by)
    top = max(0, y - expand_by)
    right = min(image_width, x + w + expand_by)
    bottom = min(image_height, y + h + expand_by)
    return left, top, max(0, right - left), max(0, bottom - top)
|
209 |
+
|
210 |
+
|
211 |
+
def ocr_image(image):
    """Run Tesseract OCR on an image and return the recognised text.

    Args:
        image: Image accepted by ``pytesseract.image_to_string``.

    Returns:
        str: The text recognised in the image.
    """
    logging.debug("Performing OCR on image.")
    # Options are passed to Tesseract verbatim.
    custom_config = r'--oem 3 --psm 6'
    text = pytesseract.image_to_string(image, config=custom_config)
    logging.debug("OCR completed.")
    return text
|
218 |
+
|
219 |
+
|
220 |
+
def postprocess_ocr_text(text):
    """Correct known OCR misreadings in extracted text.

    Args:
        text (str): Raw OCR output.

    Returns:
        str: ``text`` with every known misreading replaced.
    """
    # Maps a misread token to its correct spelling; extend as needed.
    corrections = {
        "Clinvar": "ClinVar",
    }
    for wrong, correct in corrections.items():
        text = text.replace(wrong, correct)
    return text
|
228 |
+
|
229 |
+
|
230 |
+
# ================== Caching Functions ==================
|
231 |
+
def get_cache_filename(query):
    """Return the cache file path for a query.

    Args:
        query (str): The query text.

    Returns:
        str: ``cache/<md5 of the UTF-8 encoded query>.json``.
    """
    # MD5 is used only to derive a file name, not for security.
    query_hash = hashlib.md5(query.encode("utf-8")).hexdigest()
    return os.path.join("cache", f"{query_hash}.json")


def read_cache(query):
    """Read the cached response for a query.

    Args:
        query (str): The query text.

    Returns:
        The decoded JSON value, or ``None`` if no usable cache entry exists
        (missing file, unreadable file, or invalid JSON).
    """
    cache_filename = get_cache_filename(query)
    try:
        with open(cache_filename, "r", encoding="utf-8") as cache_file:
            return json.load(cache_file)
    except FileNotFoundError:
        return None
    except (OSError, ValueError) as exc:
        # A corrupt or unreadable entry is treated as a cache miss rather
        # than crashing the caller; json.JSONDecodeError is a ValueError.
        logging.warning(f"Ignoring unreadable cache file {cache_filename}: {exc}")
        return None


def write_cache(query, response):
    """Write a response to the cache for a query.

    Args:
        query (str): The query text.
        response: JSON-serialisable value to store.
    """
    os.makedirs("cache", exist_ok=True)
    cache_filename = get_cache_filename(query)
    with open(cache_filename, "w", encoding="utf-8") as cache_file:
        json.dump(response, cache_file)
|
252 |
+
|
253 |
+
|
254 |
+
# ================== Free Model Interaction Function ==================
|
255 |
+
def find_component_name(summary_json, pdf_payload):
    """
    Finds the component name using a free model with caching.

    Args:
        summary_json (list): Summary of changes (list of dicts).
        pdf_payload (dict): Payload data from the PDF metadata.

    Returns:
        list: Updated summary JSON with component names. When the model is
        unavailable or its answer cannot be parsed, the original
        ``summary_json`` is returned with every ``component_name`` set to
        ``"Unknown"`` (the items are updated in place).
    """
    def _mark_all_unknown():
        # dict.update returns None, so the items must be updated in a loop
        # and the list itself returned.
        for item in summary_json:
            item.update({"component_name": "Unknown"})
        return summary_json

    query = f"""
Here is a summary of PDF diffing script:
{json.dumps(summary_json)}

Here is a payload which helped to generate the PDF:
{json.dumps(pdf_payload)}

VERY IMPORTANT. Give the answer in JSON format of the the summary json structure described above by replacing "__COMPONENT_PLACEHOLDER__" with the name of the `componentName` involved in the diff. Replace with "Unknown" if you unable to recognize the source component.
The JSON should be valid and parseable by python's json.loads(...) function
DO NOT use any formatting.
"""
    if generator is None:
        logging.warning(
            "Free model is not loaded, returning original summary.")
        return _mark_all_unknown()  # Sets all component names to unknown

    # Check cache. Compare against None so that a cached empty result is
    # still a hit.
    cached_response = read_cache(query)
    if cached_response is not None:
        logging.debug("Returning cached response.")
        return cached_response

    try:
        response = generator(query, max_length=512)  # Adjust max_length as needed
        response_text = response[0]['generated_text']
        logging.debug(f"Response from free model: {response_text}")
        response_data = json.loads(response_text)  # Parse the generated JSON
    except Exception as e:
        # Deliberate best-effort: any model or parsing failure degrades to
        # "Unknown" component names instead of aborting the analysis.
        logging.error(
            f"Error getting response from free model: {e}. Returning original summary")
        return _mark_all_unknown()  # Sets all component names to unknown

    # Write to cache
    write_cache(query, response_data)
    return response_data
|
303 |
+
|
304 |
+
|
305 |
+
# ================== Main Function (Modified for Integration) ==================
|
306 |
+
def analyze_differences(diff_pdf, baseline_pdf, changed_pdf):
    """
    Analyzes the differences between the baseline and changed PDFs by detecting and comparing regions with differences.

    Orange regions of the diff PDF are cropped from the changed PDF, blue
    regions from the baseline PDF; each crop is OCR'd and the i-th baseline
    text is compared with the i-th changed text.

    Args:
        diff_pdf (str): Path to the diff PDF.
        baseline_pdf (str): Path to the baseline PDF.
        changed_pdf (str): Path to the changed PDF.

    Returns:
        list: Summary of changes with component names; empty if the baseline
        PDF has no metadata.
    """
    # Setup output directories (using temp dirs)
    temp_dir = "temp_diff_analysis"
    os.makedirs(temp_dir, exist_ok=True)
    # NOTE(review): setup_output_directories, extract_metadata and
    # compare_texts are not defined in the visible part of this file —
    # confirm they exist before relying on this function.
    setup_output_directories([temp_dir])

    # Extract metadata from baseline PDF
    baseline_metadata = extract_metadata(baseline_pdf)
    if baseline_metadata is None:
        logging.debug("No metadata found in baseline PDF.")
        return []  # Return empty list for consistency

    payload = baseline_metadata["payload"]

    changed_dir = os.path.join(temp_dir, "changed")
    baseline_dir = os.path.join(temp_dir, "baseline")

    # Convert the three PDFs to page images
    diff_images = convert_pdf_to_images(diff_pdf, os.path.join(temp_dir, "diff_pages"))
    baseline_images = convert_pdf_to_images(baseline_pdf, os.path.join(baseline_dir, "pages"))
    changed_images = convert_pdf_to_images(changed_pdf, os.path.join(changed_dir, "pages"))

    changes = []
    baseline_texts = []
    changed_texts = []
    baseline_contours = []
    changed_contours = []

    for page_num, diff_image in enumerate(diff_images):
        page_no = page_num + 1
        logging.debug(f"Processing page {page_no}/{len(diff_images)}")

        # The diff PDF may have more pages than either source document;
        # indexing past the end would raise IndexError.
        if page_num >= len(changed_images) or page_num >= len(baseline_images):
            logging.warning(
                f"Skipping page {page_no}: no matching page in baseline/changed PDF.")
            continue

        # Orange regions mark content of the changed document
        for text, box in _ocr_colored_regions(
                diff_image, changed_images[page_num], 'orange', page_no, changed_dir):
            changed_texts.append((page_no, text))
            changed_contours.append(box)

        # Blue regions mark content of the baseline document
        for text, box in _ocr_colored_regions(
                diff_image, baseline_images[page_num], 'blue', page_no, baseline_dir):
            baseline_texts.append((page_no, text))
            baseline_contours.append(box)

    if len(baseline_texts) != len(changed_texts):
        # zip() below silently drops the unpaired tail; make that visible.
        logging.warning(
            f"Region count mismatch: {len(baseline_texts)} baseline vs "
            f"{len(changed_texts)} changed; unpaired regions are ignored.")

    # Analyze differences
    paired = zip(baseline_texts, changed_texts, baseline_contours, changed_contours)
    for (baseline_page_num, baseline_text), (_changed_page_num, changed_text), \
            baseline_contour, changed_contour in paired:
        similarity_ratio = compare_texts(baseline_text, changed_text)
        offset = {
            "x_offset": changed_contour[0] - baseline_contour[0],
            "y_offset": changed_contour[1] - baseline_contour[1]
        }
        if similarity_ratio == 1.0:
            # Identical text but flagged in the diff: only position/style moved.
            change_type = "style change"
        elif similarity_ratio >= SIMILARITY_THRESHOLD:
            change_type = "wording change"
        else:
            change_type = "content change"

        changes.append({
            "page_num": baseline_page_num,
            "baseline_text": baseline_text.replace("\n", " ").strip(),
            "changed_text": changed_text.replace("\n", " ").strip(),
            "type": change_type,
            "offset": offset if change_type == "style change" else None,
            "component_name": "__COMPONENT_PLACEHOLDER__"
        })

    # Call model to determine component names
    updated_changes_summary = find_component_name(changes, payload)
    return updated_changes_summary


def _ocr_colored_regions(diff_image, source_image, color, page_no, side_dir):
    """OCR every ``color`` region of ``diff_image``, cropping from ``source_image``.

    Each crop is saved under ``<side_dir>/regions`` and ``<side_dir>/contours``.

    Returns:
        list: ``(text, (x, y, w, h))`` for each detected region, in contour order.
    """
    image_width, image_height = diff_image.size
    contours = detect_colored_regions(diff_image, color, page_no, side_dir)
    logging.debug(f"Merged to {len(contours)} {color} contours.")

    regions_dir = os.path.join(side_dir, "regions")
    contours_dir = os.path.join(side_dir, "contours")
    os.makedirs(regions_dir, exist_ok=True)
    os.makedirs(contours_dir, exist_ok=True)

    results = []
    for rect_no, cnt in enumerate(contours, start=1):
        x, y, w, h = cv2.boundingRect(cnt)
        x, y, w, h = expand_bounding_box(x, y, w, h, EXPAND_BY, image_width,
                                         image_height)
        logging.debug(
            f"{color.capitalize()} Rect {rect_no}: Expanded bounding box (x={x}, y={y}, w={w}, h={h})")
        roi = source_image.crop((x, y, x + w, y + h))

        # Save the detected region to disk
        region_path = os.path.join(regions_dir,
                                   f"page_{page_no}_region_{rect_no}.png")
        roi.save(region_path)
        logging.debug(f"Saved detected region to {region_path}")

        # Save the merged region to disk
        merged_region_path = os.path.join(
            contours_dir, f"page_{page_no}_merged_region_{rect_no}.png")
        roi.save(merged_region_path)
        logging.debug(f"Saved merged region to {merged_region_path}")

        # Perform OCR on the detected region
        text = postprocess_ocr_text(ocr_image(roi))
        logging.debug(f"Extracted {color} text: {text}")
        results.append((text, (x, y, w, h)))
    return results
|
443 |
+
|
444 |
+
|
445 |
+
def main():
    """Main function to run the Streamlit app.

    Renders the document upload section and the document comparison section.
    Comparison results are kept in ``st.session_state.comparison_results``.
    """
    # NOTE(review): load_questions and load_questions_short are not defined
    # in the visible part of this file — confirm they exist.
    questions = load_questions()
    questions_short = load_questions_short()

    if not questions or not questions_short or len(questions) != len(
            questions_short):
        st.error(
            "Failed to load questions or questions mismatch. Please check data files.")
        return

    st.title("📑 Contract Analysis Suite")
    st.markdown(
        """
        Compare documents and analyze legal clauses using AI-powered question answering.
        """)

    # ===== DOCUMENT UPLOAD SECTION =====
    st.header("1. Upload Documents")
    col1, col2 = st.columns(2)

    with col1:
        uploaded_file1 = st.file_uploader(
            "Upload First Document",
            type=["txt", "pdf", "docx"],
            key="file1"
        )
        contract_text1 = load_contract(uploaded_file1) if uploaded_file1 else ""
        doc1_display = st.empty()

    with col2:
        uploaded_file2 = st.file_uploader(
            "Upload Second Document",
            type=["txt", "pdf", "docx"],
            key="file2"
        )
        contract_text2 = load_contract(uploaded_file2) if uploaded_file2 else ""
        doc2_display = st.empty()

    # Update document displays
    if uploaded_file1:
        doc1_display.text_area("Document 1 Content",
                               value=contract_text1,
                               height=400,
                               key="area1")
    if uploaded_file2:
        doc2_display.text_area("Document 2 Content",
                               value=contract_text2,
                               height=400,
                               key="area2")

    if not (uploaded_file1 and uploaded_file2):
        st.warning("Please upload both documents to proceed")
        return

    # ===== DOCUMENT COMPARISON SECTION =====
    st.header("2. Document Comparison")

    # The results key is read below before the button has ever been pressed;
    # reading an unset session_state attribute raises, so initialise it first.
    if "comparison_results" not in st.session_state:
        st.session_state.comparison_results = None

    with st.expander("Show Document Differences", expanded=True):
        if st.button("Compare Documents"):
            with st.spinner("Analyzing documents..."):
                if not contract_text1.strip() or not contract_text2.strip():
                    st.error(
                        "One or both documents appear to be empty or couldn't be read properly")
                    return

                similarity_score = calculate_similarity(contract_text1,
                                                        contract_text2)

                highlighted_diff1, highlighted_diff2 = highlight_differences_words(
                    contract_text1, contract_text2)
                st.session_state.comparison_results = {
                    'similarity_score': similarity_score,
                    'highlighted_diff1': highlighted_diff1,
                    'highlighted_diff2': highlighted_diff2,
                }

        # Display comparison results (persisted across reruns)
        results = st.session_state.comparison_results
        if results:
            st.metric("Document Similarity Score",
                      f"{results['similarity_score']:.2f}%")

            if results['similarity_score'] < 50:
                st.warning("Significant differences detected")

            st.markdown("**Visual Difference Highlighting:**")

            col1, col2 = st.columns(2)
            with col1:
                st.markdown("### Original Document")
                st.markdown(
                    f'<div style="border:1px solid #ccc; padding:10px; white-space: pre-wrap; font-family: monospace; font-size: 0.9em; max-height: 500px; overflow-y: auto;">{results["highlighted_diff1"]}</div>',
                    unsafe_allow_html=True)
            with col2:
                st.markdown("### Modified Document")
                st.markdown(
                    f'<div style="border:1px solid #ccc; padding:10px; white-space: pre-wrap; font-family: monospace; font-size: 0.9em; max-height: 500px; overflow-y: auto;">{results["highlighted_diff2"]}</div>',
                    unsafe_allow_html=True)
|
545 |
+
|
546 |
+
# ===== QUESTION ANALYSIS SECTION ==
|