ludigija committed on
Commit
446457d
·
verified ·
1 Parent(s): c1d1ac0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -7
app.py CHANGED
@@ -44,8 +44,17 @@ def load_questions_short():
44
  def extract_text_from_pdf(uploaded_file):
45
  try:
46
  with pdfplumber.open(uploaded_file) as pdf:
47
- text = "\n".join(page.extract_text() or "" for page in pdf.pages)
48
- return text if text.strip() else ""
 
 
 
 
 
 
 
 
 
49
  except Exception as e:
50
  st.error(f"PDF extraction error: {str(e)}")
51
  return ""
@@ -110,7 +119,12 @@ def load_contract(file):
110
  if not content:
111
  # Fallback to PyPDF4
112
  pdfReader = PyPDF4.PdfFileReader(file)
113
- content = '\n'.join([pdfReader.getPage(i).extractText() for i in range(pdfReader.numPages)])
 
 
 
 
 
114
  elif ext == 'docx':
115
  content = docx2txt.process(file)
116
  else:
@@ -161,12 +175,12 @@ def main():
161
  if uploaded_file1:
162
  doc1_display.text_area("Document 1 Content",
163
  value=contract_text1,
164
- height=200,
165
  key="area1")
166
  if uploaded_file2:
167
  doc2_display.text_area("Document 2 Content",
168
  value=contract_text2,
169
- height=200,
170
  key="area2")
171
 
172
  if not (uploaded_file1 and uploaded_file2):
@@ -208,10 +222,10 @@ def main():
208
  col1, col2 = st.columns(2)
209
  with col1:
210
  st.markdown("### Original Document")
211
- st.markdown(f'<div style="border:1px solid #ccc; padding:10px; white-space: pre-wrap; font-family: monospace; font-size: 0.9em;">{st.session_state.comparison_results["highlighted_diff1"]}</div>', unsafe_allow_html=True)
212
  with col2:
213
  st.markdown("### Modified Document")
214
- st.markdown(f'<div style="border:1px solid #ccc; padding:10px; white-space: pre-wrap; font-family: monospace; font-size: 0.9em;">{st.session_state.comparison_results["highlighted_diff2"]}</div>', unsafe_allow_html=True)
215
 
216
 
217
  # ===== QUESTION ANALYSIS SECTION =====
 
44
  def extract_text_from_pdf(uploaded_file):
45
  try:
46
  with pdfplumber.open(uploaded_file) as pdf:
47
+ full_text = ""
48
+ for page in pdf.pages:
49
+ try:
50
+ text = page.extract_text_formatted() # Try to get formatted text
51
+ except AttributeError:
52
+ text = page.extract_text()
53
+ if text:
54
+ full_text += text + "\n\n" # Add page separator
55
+ else:
56
+ full_text += page.extract_text() + "\n\n"
57
+ return full_text if full_text.strip() else ""
58
  except Exception as e:
59
  st.error(f"PDF extraction error: {str(e)}")
60
  return ""
 
119
  if not content:
120
  # Fallback to PyPDF4
121
  pdfReader = PyPDF4.PdfFileReader(file)
122
+ full_text = ""
123
+ for page in pdfReader.pages:
124
+ text = page.extractText()
125
+ if text:
126
+ full_text += text + "\n\n"
127
+ content = full_text
128
  elif ext == 'docx':
129
  content = docx2txt.process(file)
130
  else:
 
175
  if uploaded_file1:
176
  doc1_display.text_area("Document 1 Content",
177
  value=contract_text1,
178
+ height=400, # Increased height for larger display
179
  key="area1")
180
  if uploaded_file2:
181
  doc2_display.text_area("Document 2 Content",
182
  value=contract_text2,
183
+ height=400, # Increased height for larger display
184
  key="area2")
185
 
186
  if not (uploaded_file1 and uploaded_file2):
 
222
  col1, col2 = st.columns(2)
223
  with col1:
224
  st.markdown("### Original Document")
225
+ st.markdown(f'<div style="border:1px solid #ccc; padding:10px; white-space: pre-wrap; font-family: monospace; font-size: 0.9em; max-height: 500px; overflow-y: auto;">{st.session_state.comparison_results["highlighted_diff1"]}</div>', unsafe_allow_html=True)
226
  with col2:
227
  st.markdown("### Modified Document")
228
+ st.markdown(f'<div style="border:1px solid #ccc; padding:10px; white-space: pre-wrap; font-family: monospace; font-size: 0.9em; max-height: 500px; overflow-y: auto;">{st.session_state.comparison_results["highlighted_diff2"]}</div>', unsafe_allow_html=True)
229
 
230
 
231
  # ===== QUESTION ANALYSIS SECTION =====