rongo1 commited on
Commit
3139aea
·
1 Parent(s): 9d9da11
.gitattributes DELETED
@@ -1,35 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.gitignore CHANGED
@@ -14,7 +14,7 @@ venv/
14
  # Google Drive authentication files
15
  token.pickle
16
  google_token_base64.txt
17
-
18
  # IDE
19
  .vscode/
20
  .idea/
 
14
  # Google Drive authentication files
15
  token.pickle
16
  google_token_base64.txt
17
+ convert_token_to_base64.py
18
  # IDE
19
  .vscode/
20
  .idea/
app.py CHANGED
@@ -13,7 +13,16 @@ import sys
13
  import tempfile
14
 
15
  # Import Google Drive functionality
16
- from google_funcs import get_drive_service, upload_excel_to_exports_folder, upload_image_to_images_folder, list_files_in_folder
 
 
 
 
 
 
 
 
 
17
 
18
  # Configure logging
19
  logging.basicConfig(
@@ -449,28 +458,63 @@ def process_business_cards(images, model_name="gemini-2.5-flash", save_images=Tr
449
  current_filename = f"current_run_{timestamp}.xlsx"
450
  cumulative_filename = "all_business_cards_total.xlsx"
451
 
452
- # Try to download existing cumulative data from Google Drive
453
  logger.info("Checking for existing cumulative file in Google Drive")
 
 
454
  try:
455
- # List files in exports folder to find existing cumulative file
456
- exports_files = list_files_in_folder(drive_service, "1k5iP4egzLrGJwnHkMhxt9bAkaCiieojO")
457
- cumulative_file = None
458
- for file in exports_files:
459
- if file['name'] == 'all_business_cards_total.xlsx':
460
- cumulative_file = file
461
- break
462
 
463
- if cumulative_file:
464
- logger.info("Existing cumulative file found in Google Drive")
465
- # For now, we'll just use current data since downloading and merging is complex
466
- # In production, you'd want to implement Google Drive file download
467
- cumulative_df = current_df
468
- logger.info("Using current data only (Google Drive download not implemented yet)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
469
  else:
470
  logger.info("No existing cumulative file found, using current data only")
471
  cumulative_df = current_df
 
472
  except Exception as e:
473
- logger.warning(f"Could not check for existing data in Google Drive: {e}")
 
474
  cumulative_df = current_df
475
 
476
  # Write current run Excel file
@@ -545,7 +589,14 @@ def process_business_cards(images, model_name="gemini-2.5-flash", save_images=Tr
545
 
546
  summary += f"📁 Current run file: {current_filename} (uploaded to Google Drive)\n"
547
  summary += f"📊 Total cumulative file: {cumulative_filename} (uploaded to Google Drive)\n"
548
- summary += f"📊 Total cards in database: {len(cumulative_df)}\n\n"
 
 
 
 
 
 
 
549
 
550
  # Add Google Drive links
551
  summary += "🔗 Google Drive Links:\n"
 
13
  import tempfile
14
 
15
  # Import Google Drive functionality
16
+ from google_funcs import (
17
+ get_drive_service,
18
+ upload_excel_to_exports_folder,
19
+ upload_image_to_images_folder,
20
+ list_files_in_folder,
21
+ download_file_from_drive,
22
+ get_existing_cumulative_file,
23
+ cleanup_duplicate_cumulative_files,
24
+ delete_file_from_drive
25
+ )
26
 
27
  # Configure logging
28
  logging.basicConfig(
 
458
  current_filename = f"current_run_{timestamp}.xlsx"
459
  cumulative_filename = "all_business_cards_total.xlsx"
460
 
461
+ # Download and merge existing cumulative data from Google Drive
462
  logger.info("Checking for existing cumulative file in Google Drive")
463
+ cumulative_df = current_df # Default to current data
464
+
465
  try:
466
+ # Clean up any duplicate cumulative files first
467
+ duplicates_removed = cleanup_duplicate_cumulative_files(drive_service)
468
+ if duplicates_removed > 0:
469
+ logger.info(f"Cleaned up {duplicates_removed} duplicate cumulative files")
 
 
 
470
 
471
+ # Get the existing cumulative file
472
+ existing_file = get_existing_cumulative_file(drive_service)
473
+
474
+ if existing_file:
475
+ logger.info(f"Existing cumulative file found: {existing_file['name']} (ID: {existing_file['id']})")
476
+
477
+ # Create temporary file for download
478
+ with tempfile.NamedTemporaryFile(suffix='.xlsx', delete=False) as existing_temp:
479
+ existing_temp_path = existing_temp.name
480
+
481
+ # Download existing file
482
+ if download_file_from_drive(drive_service, existing_file['id'], existing_temp_path):
483
+ logger.info("Successfully downloaded existing cumulative file")
484
+
485
+ try:
486
+ # Read existing data
487
+ existing_df = pd.read_excel(existing_temp_path)
488
+ logger.info(f"Loaded existing data: {len(existing_df)} rows")
489
+
490
+ # Merge with current data
491
+ cumulative_df = pd.concat([existing_df, current_df], ignore_index=True)
492
+ logger.info(f"Merged data: {len(existing_df)} existing + {len(current_df)} new = {len(cumulative_df)} total rows")
493
+
494
+ # Delete the old file from Google Drive since we'll upload a new one
495
+ delete_file_from_drive(drive_service, existing_file['id'])
496
+ logger.info("Deleted old cumulative file from Google Drive")
497
+
498
+ except Exception as e:
499
+ logger.error(f"Failed to read existing Excel file: {e}")
500
+ logger.info("Using current data only")
501
+ cumulative_df = current_df
502
+ finally:
503
+ # Clean up temporary file
504
+ try:
505
+ os.unlink(existing_temp_path)
506
+ except:
507
+ pass
508
+ else:
509
+ logger.warning("Failed to download existing cumulative file, using current data only")
510
+ cumulative_df = current_df
511
  else:
512
  logger.info("No existing cumulative file found, using current data only")
513
  cumulative_df = current_df
514
+
515
  except Exception as e:
516
+ logger.warning(f"Error handling existing cumulative data: {e}")
517
+ logger.info("Using current data only")
518
  cumulative_df = current_df
519
 
520
  # Write current run Excel file
 
589
 
590
  summary += f"📁 Current run file: {current_filename} (uploaded to Google Drive)\n"
591
  summary += f"📊 Total cumulative file: {cumulative_filename} (uploaded to Google Drive)\n"
592
+ summary += f"📊 Total cards in database: {len(cumulative_df)}\n"
593
+
594
+ # Add cleanup information
595
+ if 'duplicates_removed' in locals() and duplicates_removed > 0:
596
+ summary += f"🧹 Cleaned up {duplicates_removed} duplicate cumulative files\n"
597
+ if 'old_runs_removed' in locals() and old_runs_removed > 0:
598
+ summary += f"🧹 Cleaned up {old_runs_removed} old current run files\n"
599
+ summary += "\n"
600
 
601
  # Add Google Drive links
602
  summary += "🔗 Google Drive Links:\n"
business_card_exports/.gitkeep DELETED
@@ -1,2 +0,0 @@
1
- # This file ensures the business_card_exports directory is created and tracked by git
2
- # Excel files with extracted business card data will be saved here
 
 
 
business_cards/.gitkeep DELETED
@@ -1,2 +0,0 @@
1
- # This file ensures the business_cards directory is created and tracked by git
2
- # Business card images will be saved here when the app runs
 
 
 
google_funcs.py CHANGED
@@ -4,7 +4,7 @@ import base64
4
  from google.auth.transport.requests import Request
5
  from google_auth_oauthlib.flow import InstalledAppFlow
6
  from googleapiclient.discovery import build
7
- from googleapiclient.http import MediaFileUpload, MediaIoBaseUpload
8
  import io
9
  from pathlib import Path
10
  import logging
@@ -13,11 +13,10 @@ logger = logging.getLogger(__name__)
13
 
14
  # --- CONFIGURATION ---
15
  # Get credentials from environment variables
16
- # CLIENT_ID = os.getenv("GOOGLE_CLIENT_ID")
17
- # CLIENT_SECRET = os.getenv("GOOGLE_CLIENT_SECRET")
 
18
 
19
- CLIENT_ID = "204189841641-rdes6684mk1ekl30kfoaecjq0gnf0qvq.apps.googleusercontent.com"
20
- CLIENT_SECRET = "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"
21
 
22
  # Google Drive folder IDs
23
  EXPORTS_FOLDER_ID = "1k5iP4egzLrGJwnHkMhxt9bAkaCiieojO" # For Excel exports
@@ -34,8 +33,7 @@ def get_drive_service():
34
  # --- NEW CODE FOR DEPLOYMENT ENVIRONMENTS ---
35
  # If token file doesn't exist, try to create it from environment variable
36
  if not os.path.exists(TOKEN_PICKLE_FILE):
37
- # encoded_token = os.environ.get('GOOGLE_TOKEN_BASE64')
38
- encoded_token = "gASVyQMAAAAAAACMGWdvb2dsZS5vYXV0aDIuY3JlZGVudGlhbHOUjAtDcmVkZW50aWFsc5STlCmBlH2UKIwFdG9rZW6UjN55YTI5LmEwQVMzSDZOeW9FRnJtbmpXLUxTWkF1c3N0aU9lLUtxR1pSQmlsRDhVWGNtSEhJd1lCLXlXOTZEQ2NnclhPdDBHOWlSdnpOMUxlNUZwa25YazlYeTRvaXJLZktJb0tsNzRhU0NpTVVXWDNTV0h1LTNKM3IzLW1IUk1wdVg3bzVrS2U5Q3pFdUVMSUs1VHp4cEdHdzZsYWhNbDZsdW1UNnFrcDBEMkh1VC1fYUNnWUtBZnNTQVJBU0ZRSEdYMk1peXNudFF3bFVCdHRUblIxb0NMSzhaQTAxNzWUjAZleHBpcnmUjAhkYXRldGltZZSMCGRhdGV0aW1llJOUQwoH6QcPFzsxAOfTlIWUUpSMEV9xdW90YV9wcm9qZWN0X2lklE6MD190cnVzdF9ib3VuZGFyeZROjBBfdW5pdmVyc2VfZG9tYWlulIwOZ29vZ2xlYXBpcy5jb22UjBlfdXNlX25vbl9ibG9ja2luZ19yZWZyZXNolImMB19zY29wZXOUXZSMKmh0dHBzOi8vd3d3Lmdvb2dsZWFwaXMuY29tL2F1dGgvZHJpdmUuZmlsZZRhjA9fZGVmYXVsdF9zY29wZXOUTowOX3JlZnJlc2hfdG9rZW6UjGcxLy8wY3JmV2NzZ1FVbjZzQ2dZSUFSQUFHQXdTTndGLUw5SXI4bG1qd1ZvaHluSkFtRFFoZmhuOE14dUwwSkhIQklodl8xanpsYjdfeEFrTHlNTC1ybENPUm96a3RSa3BuUjNWcXFBlIwJX2lkX3Rva2VulE6MD19ncmFudGVkX3Njb3Blc5RdlIwqaHR0cHM6Ly93d3cuZ29vZ2xlYXBpcy5jb20vYXV0aC9kcml2ZS5maWxllGGMCl90b2tlbl91cmmUjCNodHRwczovL29hdXRoMi5nb29nbGVhcGlzLmNvbS90b2tlbpSMCl9jbGllbnRfaWSUjEgyMDQxODk4NDE2NDEtcmRlczY2ODRtazFla2wzMGtmb2FlY2pxMGduZjBxdnEuYXBwcy5nb29nbGV1c2VyY29udGVudC5jb22UjA5fY2xpZW50X3NlY3JldJSMI0dPQ1NQWC1CUnRKVHAwemtydkM2Q2JMRUl3LWdqZGhMMjJylIwLX3JhcHRfdG9rZW6UTowWX2VuYWJsZV9yZWF1dGhfcmVmcmVzaJSJjAhfYWNjb3VudJSMAJR1Yi4="
39
  if encoded_token:
40
  logger.info("Found token in environment variable. Recreating token.pickle file.")
41
  try:
@@ -182,6 +180,117 @@ def list_files_in_folder(service, folder_id, max_results=100):
182
  logger.error(f"Failed to list files in folder {folder_id}: {e}")
183
  return []
184
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  if __name__ == '__main__':
186
  # Test the Google Drive connection
187
  try:
 
4
  from google.auth.transport.requests import Request
5
  from google_auth_oauthlib.flow import InstalledAppFlow
6
  from googleapiclient.discovery import build
7
+ from googleapiclient.http import MediaFileUpload, MediaIoBaseUpload, MediaIoBaseDownload
8
  import io
9
  from pathlib import Path
10
  import logging
 
13
 
14
  # --- CONFIGURATION ---
15
  # Get credentials from environment variables
16
+ CLIENT_ID = os.getenv("GOOGLE_CLIENT_ID")
17
+ CLIENT_SECRET = os.getenv("GOOGLE_CLIENT_SECRET")
18
+
19
 
 
 
20
 
21
  # Google Drive folder IDs
22
  EXPORTS_FOLDER_ID = "1k5iP4egzLrGJwnHkMhxt9bAkaCiieojO" # For Excel exports
 
33
  # --- NEW CODE FOR DEPLOYMENT ENVIRONMENTS ---
34
  # If token file doesn't exist, try to create it from environment variable
35
  if not os.path.exists(TOKEN_PICKLE_FILE):
36
+ encoded_token = os.environ.get('GOOGLE_TOKEN_BASE64')
 
37
  if encoded_token:
38
  logger.info("Found token in environment variable. Recreating token.pickle file.")
39
  try:
 
180
  logger.error(f"Failed to list files in folder {folder_id}: {e}")
181
  return []
182
 
183
def download_file_from_drive(service, file_id, file_path):
    """Download a file from Google Drive to a local path.

    Args:
        service: Authenticated Google Drive API service object.
        file_id: ID of the Drive file to download.
        file_path: Local filesystem path the file is written to.

    Returns:
        True on success, False on any failure (the error is logged,
        never raised to the caller).
    """
    try:
        request = service.files().get_media(fileId=file_id)

        # 'with' guarantees the local file handle is closed even if a
        # chunk download raises mid-way.
        with open(file_path, 'wb') as local_file:
            downloader = MediaIoBaseDownload(local_file, request)
            done = False
            # next_chunk() returns (status, done); idiomatic truthiness
            # test instead of the original `while done is False`.
            while not done:
                _status, done = downloader.next_chunk()

        logger.info(f"Successfully downloaded file {file_id} to {file_path}")
        return True
    except Exception as e:
        logger.error(f"Failed to download file {file_id}: {e}")
        return False
199
+
200
def delete_file_from_drive(service, file_id):
    """Permanently remove a file from Google Drive by its ID.

    Logs the outcome either way and reports success via the return
    value; no exception is propagated to the caller.

    Args:
        service: Authenticated Google Drive API service object.
        file_id: ID of the Drive file to delete.

    Returns:
        True when the deletion succeeded, False otherwise.
    """
    try:
        delete_request = service.files().delete(fileId=file_id)
        delete_request.execute()
        logger.info(f"Successfully deleted file {file_id} from Google Drive")
        return True
    except Exception as e:
        logger.error(f"Failed to delete file {file_id}: {e}")
        return False
209
+
210
def get_existing_cumulative_file(service):
    """Find and return the existing cumulative Excel file from exports folder.

    Scans the exports folder for files named
    'all_business_cards_total.xlsx' and, when several exist, picks the
    most recently created one.

    Args:
        service: Authenticated Google Drive API service object.

    Returns:
        The metadata dict of the most recent cumulative file, or None
        when no such file exists or the lookup fails (errors are logged).
    """
    try:
        exports_files = list_files_in_folder(service, EXPORTS_FOLDER_ID)
        # Comprehension instead of the original manual append loop.
        cumulative_files = [
            file for file in exports_files
            if file['name'] == 'all_business_cards_total.xlsx'
        ]

        if cumulative_files:
            logger.info(f"Found {len(cumulative_files)} cumulative files")
            # Return the most recent one (by creation time).
            # NOTE(review): assumes every entry returned by
            # list_files_in_folder carries a 'createdTime' field — confirm
            # the fields requested in that helper.
            most_recent = max(cumulative_files, key=lambda x: x['createdTime'])
            logger.info(f"Most recent cumulative file: {most_recent['name']} (ID: {most_recent['id']})")
            return most_recent
        else:
            logger.info("No existing cumulative file found")
            return None

    except Exception as e:
        logger.error(f"Failed to get existing cumulative file: {e}")
        return None
233
+
234
def cleanup_duplicate_cumulative_files(service):
    """Remove duplicate cumulative files, keeping only the most recent one.

    Args:
        service: Authenticated Google Drive API service object.

    Returns:
        The number of duplicate files deleted; 0 when there is at most
        one cumulative file or when the cleanup fails (errors are
        logged, never raised).
    """
    try:
        exports_files = list_files_in_folder(service, EXPORTS_FOLDER_ID)
        # Comprehension instead of the original manual append loop.
        cumulative_files = [
            file for file in exports_files
            if file['name'] == 'all_business_cards_total.xlsx'
        ]

        # Guard clause: nothing to do with zero or one copy.
        if len(cumulative_files) <= 1:
            logger.info("No duplicate cumulative files found")
            return 0

        logger.info(f"Found {len(cumulative_files)} duplicate cumulative files, cleaning up...")
        # Sort newest-first so index 0 is the copy we keep.
        cumulative_files.sort(key=lambda x: x['createdTime'], reverse=True)
        files_to_delete = cumulative_files[1:]  # All except the most recent

        for file in files_to_delete:
            logger.info(f"Deleting duplicate file: {file['name']} (ID: {file['id']})")
            delete_file_from_drive(service, file['id'])

        logger.info(f"Cleaned up {len(files_to_delete)} duplicate files")
        return len(files_to_delete)

    except Exception as e:
        logger.error(f"Failed to cleanup duplicate files: {e}")
        return 0
263
+
264
def cleanup_old_current_run_files(service, keep_count=5):
    """Clean up old current run files, keeping only the most recent ones.

    Args:
        service: Authenticated Google Drive API service object.
        keep_count: How many of the newest 'current_run_*.xlsx' files
            to retain (default 5).

    Returns:
        The number of files deleted; 0 when no cleanup was needed or an
        error occurred (errors are logged, never raised).
    """
    try:
        folder_contents = list_files_in_folder(service, EXPORTS_FOLDER_ID)
        current_run_files = [
            entry for entry in folder_contents
            if entry['name'].startswith('current_run_') and entry['name'].endswith('.xlsx')
        ]

        if len(current_run_files) <= keep_count:
            logger.info(f"Found {len(current_run_files)} current run files, no cleanup needed")
            return 0

        logger.info(f"Found {len(current_run_files)} current run files, keeping {keep_count} most recent...")
        # Newest first; everything past keep_count is stale.
        current_run_files.sort(key=lambda x: x['createdTime'], reverse=True)
        stale_files = current_run_files[keep_count:]

        for entry in stale_files:
            logger.info(f"Deleting old current run file: {entry['name']} (ID: {entry['id']})")
            delete_file_from_drive(service, entry['id'])

        logger.info(f"Cleaned up {len(stale_files)} old current run files")
        return len(stale_files)

    except Exception as e:
        logger.error(f"Failed to cleanup old current run files: {e}")
        return 0
293
+
294
  if __name__ == '__main__':
295
  # Test the Google Drive connection
296
  try: