milwright committed on
Commit
ef3661a
·
1 Parent(s): 88d3e04

Fix 'Process Document Again' button to only appear for the current document

Browse files
Files changed (1) hide show
  1. app.py +175 -26
app.py CHANGED
@@ -58,7 +58,7 @@ def convert_pdf_to_images(pdf_bytes, dpi=150, rotation=0):
58
  return []
59
 
60
  # Cache preprocessed images for better performance
61
- @st.cache_data(ttl=24*3600, show_spinner=False) # Cache for 24 hours
62
  def preprocess_image(image_bytes, preprocessing_options):
63
  """Preprocess image with selected options optimized for historical document OCR quality"""
64
  # Setup basic console logging
@@ -175,7 +175,7 @@ def preprocess_image(image_bytes, preprocessing_options):
175
 
176
  # Cache OCR results in memory to speed up repeated processing
177
  @st.cache_data(ttl=24*3600, max_entries=20, show_spinner=False)
178
- def process_file_cached(file_path, file_type, use_vision, file_size_mb, cache_key):
179
  """Cached version of OCR processing to reuse results"""
180
  # Initialize OCR processor
181
  processor = StructuredOCR()
@@ -241,6 +241,8 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
241
  with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as tmp:
242
  tmp.write(file_bytes)
243
  temp_path = tmp.name
 
 
244
 
245
  # Get PDF rotation value if available and file is a PDF
246
  pdf_rotation_value = pdf_rotation if 'pdf_rotation' in locals() and file_type == "pdf" else 0
@@ -284,11 +286,34 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
284
  # Generate cache key
285
  import hashlib
286
  file_hash = hashlib.md5(file_bytes).hexdigest()
287
- cache_key = f"{file_hash}_{file_type}_{use_vision}_{pdf_rotation_value}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
 
289
  # Process with cached function if possible
290
  try:
291
- result = process_file_cached(temp_path, file_type, use_vision, file_size_mb, cache_key)
292
  progress_bar.progress(90)
293
  status_text.markdown('<div class="processing-status-container">Finalizing results...</div>', unsafe_allow_html=True)
294
  except Exception as e:
@@ -343,7 +368,12 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
343
  # Clean up original temp file and use the processed one
344
  if os.path.exists(temp_path):
345
  os.unlink(temp_path)
 
 
 
346
  temp_path = proc_tmp.name
 
 
347
  progress_bar.progress(30)
348
  else:
349
  progress_bar.progress(30)
@@ -377,19 +407,37 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
377
  # Add pdf_rotation to cache key if present
378
  pdf_rotation_value = pdf_rotation if 'pdf_rotation' in locals() else 0
379
  file_hash = hashlib.md5(open(temp_path, 'rb').read()).hexdigest()
380
- cache_key = f"{file_hash}_{file_type}_{use_vision}_{pdf_rotation_value}"
 
 
 
 
 
 
 
 
 
 
 
 
 
381
 
382
  progress_bar.progress(50)
383
  # Check if we have custom instructions
384
  has_custom_prompt = 'custom_prompt' in locals() and custom_prompt and len(str(custom_prompt).strip()) > 0
 
 
385
  if has_custom_prompt:
386
  status_text.markdown('<div class="processing-status-container">Processing document with custom instructions...</div>', unsafe_allow_html=True)
 
 
 
387
  else:
388
  status_text.markdown('<div class="processing-status-container">Processing document with OCR...</div>', unsafe_allow_html=True)
389
 
390
  # Process the file using cached function if possible
391
  try:
392
- result = process_file_cached(temp_path, file_type, use_vision, file_size_mb, cache_key)
393
  progress_bar.progress(80)
394
  status_text.markdown('<div class="processing-status-container">Analyzing document structure...</div>', unsafe_allow_html=True)
395
  progress_bar.progress(90)
@@ -471,6 +519,40 @@ except ImportError:
471
  if 'previous_results' not in st.session_state:
472
  st.session_state.previous_results = []
473
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
474
  # Create main layout with tabs and columns
475
  main_tab1, main_tab2, main_tab3 = st.tabs(["Document Processing", "Previous Results", "About"])
476
 
@@ -1243,32 +1325,47 @@ with main_tab1:
1243
  </style>
1244
  """, unsafe_allow_html=True)
1245
 
1246
- # Use uploaded_file or sample_document if available
 
 
 
 
1247
  if 'sample_document' in st.session_state and st.session_state.sample_document is not None:
1248
  # Use the sample document
1249
  uploaded_file = st.session_state.sample_document
1250
- # Add a notice about using sample document with better style
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1251
  st.markdown(
1252
  f"""
1253
  <div style="background-color: #D4EDDA; color: #155724; padding: 10px;
1254
  border-radius: 4px; border-left: 5px solid #155724; margin-bottom: 10px;">
1255
  <div style="display: flex; justify-content: space-between; align-items: center;">
1256
- <span style="font-weight: bold;">Sample Document: {uploaded_file.name}</span>
1257
  </div>
1258
  </div>
1259
  """,
1260
  unsafe_allow_html=True
1261
  )
1262
-
1263
- # Set auto-process flag in session state if this is a newly loaded sample
1264
- if st.session_state.sample_just_loaded:
1265
- st.session_state.auto_process_sample = True
1266
- # Mark that this is a sample document being processed
1267
- st.session_state.sample_document_processed = True
1268
- st.session_state.sample_just_loaded = False
1269
-
1270
- # Clear sample document after use to avoid interference with future uploads
1271
- st.session_state.sample_document = None
1272
 
1273
  if uploaded_file is not None:
1274
  # Check file size (cap at 50MB)
@@ -1278,6 +1375,12 @@ with main_tab1:
1278
  with left_col:
1279
  st.error(f"File too large ({file_size_mb:.1f} MB). Maximum file size is 50MB.")
1280
  st.stop()
 
 
 
 
 
 
1281
 
1282
  file_ext = Path(uploaded_file.name).suffix.lower()
1283
 
@@ -1288,6 +1391,32 @@ with main_tab1:
1288
  # Make the button more clear about its function
1289
  if st.session_state.processed_document_active:
1290
  process_button = st.button("Process Document Again")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1291
  else:
1292
  process_button = st.button("Process Document")
1293
 
@@ -1392,6 +1521,8 @@ with main_tab1:
1392
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
1393
  tmp.write(uploaded_file.getvalue())
1394
  temp_path = tmp.name
 
 
1395
 
1396
  # Apply PDF rotation if specified
1397
  pdf_rotation_value = pdf_rotation if 'pdf_rotation' in locals() else 0
@@ -1451,6 +1582,9 @@ with main_tab1:
1451
  # Clean up temp file
1452
  if os.path.exists(temp_path):
1453
  os.unlink(temp_path)
 
 
 
1454
 
1455
  except Exception as e:
1456
  # If anything fails, revert to standard processing
@@ -1460,6 +1594,9 @@ with main_tab1:
1460
  # For non-PDF files, use normal processing with custom prompt
1461
  # Save the uploaded file to a temporary file with preprocessing
1462
  with tempfile.NamedTemporaryFile(delete=False, suffix=Path(uploaded_file.name).suffix) as tmp:
 
 
 
1463
  # Apply preprocessing if any options are selected
1464
  if any(preprocessing_options.values()):
1465
  # Apply performance mode settings
@@ -1473,7 +1610,6 @@ with main_tab1:
1473
  tmp.write(processed_bytes)
1474
  else:
1475
  tmp.write(uploaded_file.getvalue())
1476
- temp_path = tmp.name
1477
 
1478
  # Show progress
1479
  with progress_placeholder.container():
@@ -1522,6 +1658,9 @@ with main_tab1:
1522
  # Clean up temporary file
1523
  if os.path.exists(temp_path):
1524
  try:
 
 
 
1525
  os.unlink(temp_path)
1526
  except:
1527
  pass
@@ -2104,6 +2243,10 @@ with main_tab1:
2104
  # Set processed_document_active to True when a new document is processed
2105
  st.session_state.processed_document_active = True
2106
 
 
 
 
 
2107
  # Button styling is now handled by global CSS
2108
 
2109
  # Display success message with close button for dismissing processed documents
@@ -2114,12 +2257,18 @@ with main_tab1:
2114
  # Close button styling is now handled by global CSS
2115
 
2116
  if st.button("✕ Close Document", key="close_document_button", help="Clear current document and start over"):
2117
- # Clear the session state
2118
- st.session_state.processed_document_active = False
2119
- # Reset any active document data
2120
- if 'current_result' in st.session_state:
2121
- del st.session_state.current_result
2122
- # Rerun to reset the page
 
 
 
 
 
 
2123
  st.rerun()
2124
 
2125
  # Store the result in the previous results list
 
58
  return []
59
 
60
  # Cache preprocessed images for better performance
61
+ @st.cache_data(ttl=24*3600, show_spinner=False, hash_funcs={dict: lambda x: str(sorted(x.items()))}) # Cache for 24 hours
62
  def preprocess_image(image_bytes, preprocessing_options):
63
  """Preprocess image with selected options optimized for historical document OCR quality"""
64
  # Setup basic console logging
 
175
 
176
  # Cache OCR results in memory to speed up repeated processing
177
  @st.cache_data(ttl=24*3600, max_entries=20, show_spinner=False)
178
+ def process_file_cached(file_path, file_type, use_vision, file_size_mb, cache_key, preprocessing_options_hash=None):
179
  """Cached version of OCR processing to reuse results"""
180
  # Initialize OCR processor
181
  processor = StructuredOCR()
 
241
  with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as tmp:
242
  tmp.write(file_bytes)
243
  temp_path = tmp.name
244
+ # Track temporary file for cleanup
245
+ st.session_state.temp_file_paths.append(temp_path)
246
 
247
  # Get PDF rotation value if available and file is a PDF
248
  pdf_rotation_value = pdf_rotation if 'pdf_rotation' in locals() and file_type == "pdf" else 0
 
286
  # Generate cache key
287
  import hashlib
288
  file_hash = hashlib.md5(file_bytes).hexdigest()
289
+
290
+ # Include preprocessing options in cache key if available
291
+ preprocessing_options_hash = ""
292
+ if 'preprocessing_options' in locals() and preprocessing_options:
293
+ # Add pdf_rotation to preprocessing options to ensure it's part of the cache key
294
+ if pdf_rotation_value != 0:
295
+ preprocessing_options_with_rotation = preprocessing_options.copy()
296
+ preprocessing_options_with_rotation['pdf_rotation'] = pdf_rotation_value
297
+ preprocessing_str = str(sorted(preprocessing_options_with_rotation.items()))
298
+ else:
299
+ preprocessing_str = str(sorted(preprocessing_options.items()))
300
+ preprocessing_options_hash = hashlib.md5(preprocessing_str.encode()).hexdigest()
301
+ elif pdf_rotation_value != 0:
302
+ # If no preprocessing options but we have rotation, include that in the hash
303
+ preprocessing_options_hash = hashlib.md5(f"pdf_rotation_{pdf_rotation_value}".encode()).hexdigest()
304
+
305
+ cache_key = f"{file_hash}_{file_type}_{use_vision}_{preprocessing_options_hash}"
306
+
307
+ # Check if we have custom prompt to include in cache key
308
+ has_custom_prompt = 'custom_prompt' in locals() and custom_prompt and len(str(custom_prompt).strip()) > 0
309
+ if has_custom_prompt:
310
+ # Update cache key to include custom prompt hash
311
+ custom_prompt_hash = hashlib.md5(str(custom_prompt).encode()).hexdigest()
312
+ cache_key = f"{cache_key}_{custom_prompt_hash}"
313
 
314
  # Process with cached function if possible
315
  try:
316
+ result = process_file_cached(temp_path, file_type, use_vision, file_size_mb, cache_key, preprocessing_options_hash)
317
  progress_bar.progress(90)
318
  status_text.markdown('<div class="processing-status-container">Finalizing results...</div>', unsafe_allow_html=True)
319
  except Exception as e:
 
368
  # Clean up original temp file and use the processed one
369
  if os.path.exists(temp_path):
370
  os.unlink(temp_path)
371
+ # Remove original temp path from tracking list
372
+ if temp_path in st.session_state.temp_file_paths:
373
+ st.session_state.temp_file_paths.remove(temp_path)
374
  temp_path = proc_tmp.name
375
+ # Track new temporary file for cleanup
376
+ st.session_state.temp_file_paths.append(temp_path)
377
  progress_bar.progress(30)
378
  else:
379
  progress_bar.progress(30)
 
407
  # Add pdf_rotation to cache key if present
408
  pdf_rotation_value = pdf_rotation if 'pdf_rotation' in locals() else 0
409
  file_hash = hashlib.md5(open(temp_path, 'rb').read()).hexdigest()
410
+
411
+ # Include preprocessing options in cache key to ensure reprocessing when options change
412
+ preprocessing_options_hash = ""
413
+ if preprocessing_options:
414
+ # Add pdf_rotation to preprocessing options to ensure it's part of the cache key
415
+ if pdf_rotation_value != 0:
416
+ preprocessing_options_with_rotation = preprocessing_options.copy()
417
+ preprocessing_options_with_rotation['pdf_rotation'] = pdf_rotation_value
418
+ preprocessing_str = str(sorted(preprocessing_options_with_rotation.items()))
419
+ else:
420
+ preprocessing_str = str(sorted(preprocessing_options.items()))
421
+ preprocessing_options_hash = hashlib.md5(preprocessing_str.encode()).hexdigest()
422
+
423
+ cache_key = f"{file_hash}_{file_type}_{use_vision}_{preprocessing_options_hash}"
424
 
425
  progress_bar.progress(50)
426
  # Check if we have custom instructions
427
  has_custom_prompt = 'custom_prompt' in locals() and custom_prompt and len(str(custom_prompt).strip()) > 0
428
+
429
+ # If we have custom instructions, include them in cache key
430
  if has_custom_prompt:
431
  status_text.markdown('<div class="processing-status-container">Processing document with custom instructions...</div>', unsafe_allow_html=True)
432
+ # Update cache key to include custom prompt hash
433
+ custom_prompt_hash = hashlib.md5(str(custom_prompt).encode()).hexdigest()
434
+ cache_key = f"{cache_key}_{custom_prompt_hash}"
435
  else:
436
  status_text.markdown('<div class="processing-status-container">Processing document with OCR...</div>', unsafe_allow_html=True)
437
 
438
  # Process the file using cached function if possible
439
  try:
440
+ result = process_file_cached(temp_path, file_type, use_vision, file_size_mb, cache_key, preprocessing_options_hash)
441
  progress_bar.progress(80)
442
  status_text.markdown('<div class="processing-status-container">Analyzing document structure...</div>', unsafe_allow_html=True)
443
  progress_bar.progress(90)
 
519
  if 'previous_results' not in st.session_state:
520
  st.session_state.previous_results = []
521
 
522
+ # Initialize temp file tracking
523
+ if 'temp_file_paths' not in st.session_state:
524
+ st.session_state.temp_file_paths = []
525
+
526
+ # Initialize last processed file tracking to fix "Process Document Again" button
527
+ if 'last_processed_file' not in st.session_state:
528
+ st.session_state.last_processed_file = None
529
+
530
+ # Check if we need to perform a complete reset (coming from "Close Document" button)
531
+ if 'perform_reset' in st.session_state and st.session_state.perform_reset:
532
+ # List of all session state keys that should be reset, except previous_results
533
+ reset_keys = [key for key in list(st.session_state.keys())
534
+ if key != 'previous_results']
535
+
536
+ # Remove all keys except previous_results
537
+ for key in reset_keys:
538
+ if key == 'perform_reset':
539
+ st.session_state[key] = False # Clear this flag
540
+ else:
541
+ st.session_state.pop(key, None)
542
+
543
+ # Reinitialize required session state variables
544
+ st.session_state.auto_process_sample = False
545
+ st.session_state.sample_just_loaded = False
546
+ st.session_state.processed_document_active = False
547
+ st.session_state.sample_document_processed = False
548
+ st.session_state.last_processed_file = None
549
+
550
+ # Explicitly reset document-related variables
551
+ st.session_state.sample_document = None
552
+ st.session_state.original_sample_bytes = None
553
+ st.session_state.original_sample_name = None
554
+ st.session_state.is_sample_document = False
555
+
556
  # Create main layout with tabs and columns
557
  main_tab1, main_tab2, main_tab3 = st.tabs(["Document Processing", "Previous Results", "About"])
558
 
 
1325
  </style>
1326
  """, unsafe_allow_html=True)
1327
 
1328
+ # Check if we're using a sample document (either newly loaded or from session state)
1329
+ using_sample_document = False
1330
+ sample_document_name = None
1331
+
1332
+ # Check for newly loaded sample document
1333
  if 'sample_document' in st.session_state and st.session_state.sample_document is not None:
1334
  # Use the sample document
1335
  uploaded_file = st.session_state.sample_document
1336
+ using_sample_document = True
1337
+ sample_document_name = uploaded_file.name
1338
+
1339
+ # Set auto-process flag in session state if this is a newly loaded sample
1340
+ if st.session_state.sample_just_loaded:
1341
+ st.session_state.auto_process_sample = True
1342
+ # Mark that this is a sample document being processed
1343
+ st.session_state.sample_document_processed = True
1344
+ st.session_state.sample_just_loaded = False
1345
+
1346
+ # Store sample document bytes in a separate session state variable for potential reprocessing
1347
+ st.session_state.original_sample_bytes = uploaded_file.getvalue()
1348
+ st.session_state.original_sample_name = uploaded_file.name
1349
+ st.session_state.is_sample_document = True
1350
+
1351
+ # Check for reprocessing of previously loaded sample
1352
+ elif 'is_sample_document' in st.session_state and st.session_state.is_sample_document:
1353
+ using_sample_document = True
1354
+ sample_document_name = st.session_state.original_sample_name if 'original_sample_name' in st.session_state else "Sample Document"
1355
+
1356
+ # Display sample document notice if using a sample document
1357
+ if using_sample_document:
1358
  st.markdown(
1359
  f"""
1360
  <div style="background-color: #D4EDDA; color: #155724; padding: 10px;
1361
  border-radius: 4px; border-left: 5px solid #155724; margin-bottom: 10px;">
1362
  <div style="display: flex; justify-content: space-between; align-items: center;">
1363
+ <span style="font-weight: bold;">Sample Document: {sample_document_name}</span>
1364
  </div>
1365
  </div>
1366
  """,
1367
  unsafe_allow_html=True
1368
  )
 
 
 
 
 
 
 
 
 
 
1369
 
1370
  if uploaded_file is not None:
1371
  # Check file size (cap at 50MB)
 
1375
  with left_col:
1376
  st.error(f"File too large ({file_size_mb:.1f} MB). Maximum file size is 50MB.")
1377
  st.stop()
1378
+
1379
+ # Check if this is a new file (different from the last processed file)
1380
+ current_file_identifier = f"{uploaded_file.name}_{len(uploaded_file.getvalue())}"
1381
+ if st.session_state.last_processed_file != current_file_identifier:
1382
+ # Reset processed_document_active if a new file is uploaded
1383
+ st.session_state.processed_document_active = False
1384
 
1385
  file_ext = Path(uploaded_file.name).suffix.lower()
1386
 
 
1391
  # Make the button more clear about its function
1392
  if st.session_state.processed_document_active:
1393
  process_button = st.button("Process Document Again")
1394
+
1395
+ # If process button is clicked and we're dealing with a sample document that was cleared
1396
+ if process_button and uploaded_file is None and 'original_sample_bytes' in st.session_state:
1397
+ # Recreate the uploaded file from stored bytes
1398
+ from io import BytesIO
1399
+ import mimetypes
1400
+
1401
+ # Determine mime type based on file extension
1402
+ file_ext = os.path.splitext(st.session_state.original_sample_name)[1].lower()
1403
+ if file_ext == '.pdf':
1404
+ mime_type = 'application/pdf'
1405
+ elif file_ext in ['.jpg', '.jpeg']:
1406
+ mime_type = 'image/jpeg'
1407
+ elif file_ext == '.png':
1408
+ mime_type = 'image/png'
1409
+ else:
1410
+ mime_type = mimetypes.guess_type(st.session_state.original_sample_name)[0] or 'application/octet-stream'
1411
+
1412
+ # Create a synthetic file-like object with the same interface as UploadedFile
1413
+ uploaded_file = type('obj', (object,), {
1414
+ 'name': st.session_state.original_sample_name,
1415
+ 'getvalue': lambda: st.session_state.original_sample_bytes,
1416
+ 'read': lambda: st.session_state.original_sample_bytes,
1417
+ 'seek': lambda x: None,
1418
+ 'type': mime_type
1419
+ })
1420
  else:
1421
  process_button = st.button("Process Document")
1422
 
 
1521
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
1522
  tmp.write(uploaded_file.getvalue())
1523
  temp_path = tmp.name
1524
+ # Track temporary file for cleanup
1525
+ st.session_state.temp_file_paths.append(temp_path)
1526
 
1527
  # Apply PDF rotation if specified
1528
  pdf_rotation_value = pdf_rotation if 'pdf_rotation' in locals() else 0
 
1582
  # Clean up temp file
1583
  if os.path.exists(temp_path):
1584
  os.unlink(temp_path)
1585
+ # Remove from tracking list
1586
+ if temp_path in st.session_state.temp_file_paths:
1587
+ st.session_state.temp_file_paths.remove(temp_path)
1588
 
1589
  except Exception as e:
1590
  # If anything fails, revert to standard processing
 
1594
  # For non-PDF files, use normal processing with custom prompt
1595
  # Save the uploaded file to a temporary file with preprocessing
1596
  with tempfile.NamedTemporaryFile(delete=False, suffix=Path(uploaded_file.name).suffix) as tmp:
1597
+ temp_path = tmp.name
1598
+ # Track temporary file for cleanup
1599
+ st.session_state.temp_file_paths.append(temp_path)
1600
  # Apply preprocessing if any options are selected
1601
  if any(preprocessing_options.values()):
1602
  # Apply performance mode settings
 
1610
  tmp.write(processed_bytes)
1611
  else:
1612
  tmp.write(uploaded_file.getvalue())
 
1613
 
1614
  # Show progress
1615
  with progress_placeholder.container():
 
1658
  # Clean up temporary file
1659
  if os.path.exists(temp_path):
1660
  try:
1661
+ # Remove from tracking list
1662
+ if temp_path in st.session_state.temp_file_paths:
1663
+ st.session_state.temp_file_paths.remove(temp_path)
1664
  os.unlink(temp_path)
1665
  except:
1666
  pass
 
2243
  # Set processed_document_active to True when a new document is processed
2244
  st.session_state.processed_document_active = True
2245
 
2246
+ # Store information about this processed file to track when new files are uploaded
2247
+ if uploaded_file is not None:
2248
+ st.session_state.last_processed_file = f"{uploaded_file.name}_{len(uploaded_file.getvalue())}"
2249
+
2250
  # Button styling is now handled by global CSS
2251
 
2252
  # Display success message with close button for dismissing processed documents
 
2257
  # Close button styling is now handled by global CSS
2258
 
2259
  if st.button("✕ Close Document", key="close_document_button", help="Clear current document and start over"):
2260
+ # Create a special flag to signal a complete reset on the next rerun
2261
+ st.session_state.perform_reset = True
2262
+
2263
+ # Clean up any temporary files
2264
+ if 'temp_file_paths' in st.session_state:
2265
+ for temp_path in st.session_state.temp_file_paths:
2266
+ try:
2267
+ if os.path.exists(temp_path):
2268
+ os.remove(temp_path)
2269
+ except Exception:
2270
+ pass # Ignore errors in cleanup
2271
+
2272
  st.rerun()
2273
 
2274
  # Store the result in the previous results list