rongo1 commited on
Commit
3139aea
·
1 Parent(s): 9d9da11
.gitattributes DELETED
@@ -1,35 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.gitignore CHANGED
@@ -14,7 +14,7 @@ venv/
14
  # Google Drive authentication files
15
  token.pickle
16
  google_token_base64.txt
17
-
18
  # IDE
19
  .vscode/
20
  .idea/
 
14
  # Google Drive authentication files
15
  token.pickle
16
  google_token_base64.txt
17
+ convert_token_to_base64.py
18
  # IDE
19
  .vscode/
20
  .idea/
app.py CHANGED
@@ -13,7 +13,16 @@ import sys
13
  import tempfile
14
 
15
  # Import Google Drive functionality
16
- from google_funcs import get_drive_service, upload_excel_to_exports_folder, upload_image_to_images_folder, list_files_in_folder
 
 
 
 
 
 
 
 
 
17
 
18
  # Configure logging
19
  logging.basicConfig(
@@ -449,28 +458,63 @@ def process_business_cards(images, model_name="gemini-2.5-flash", save_images=Tr
449
  current_filename = f"current_run_{timestamp}.xlsx"
450
  cumulative_filename = "all_business_cards_total.xlsx"
451
 
452
- # Try to download existing cumulative data from Google Drive
453
  logger.info("Checking for existing cumulative file in Google Drive")
 
 
454
  try:
455
- # List files in exports folder to find existing cumulative file
456
- exports_files = list_files_in_folder(drive_service, "1k5iP4egzLrGJwnHkMhxt9bAkaCiieojO")
457
- cumulative_file = None
458
- for file in exports_files:
459
- if file['name'] == 'all_business_cards_total.xlsx':
460
- cumulative_file = file
461
- break
462
 
463
- if cumulative_file:
464
- logger.info("Existing cumulative file found in Google Drive")
465
- # For now, we'll just use current data since downloading and merging is complex
466
- # In production, you'd want to implement Google Drive file download
467
- cumulative_df = current_df
468
- logger.info("Using current data only (Google Drive download not implemented yet)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
469
  else:
470
  logger.info("No existing cumulative file found, using current data only")
471
  cumulative_df = current_df
 
472
  except Exception as e:
473
- logger.warning(f"Could not check for existing data in Google Drive: {e}")
 
474
  cumulative_df = current_df
475
 
476
  # Write current run Excel file
@@ -545,7 +589,14 @@ def process_business_cards(images, model_name="gemini-2.5-flash", save_images=Tr
545
 
546
  summary += f"📁 Current run file: {current_filename} (uploaded to Google Drive)\n"
547
  summary += f"📊 Total cumulative file: {cumulative_filename} (uploaded to Google Drive)\n"
548
- summary += f"📊 Total cards in database: {len(cumulative_df)}\n\n"
 
 
 
 
 
 
 
549
 
550
  # Add Google Drive links
551
  summary += "🔗 Google Drive Links:\n"
 
13
  import tempfile
14
 
15
  # Import Google Drive functionality
16
+ from google_funcs import (
17
+ get_drive_service,
18
+ upload_excel_to_exports_folder,
19
+ upload_image_to_images_folder,
20
+ list_files_in_folder,
21
+ download_file_from_drive,
22
+ get_existing_cumulative_file,
23
+ cleanup_duplicate_cumulative_files,
24
+ delete_file_from_drive
25
+ )
26
 
27
  # Configure logging
28
  logging.basicConfig(
 
458
  current_filename = f"current_run_{timestamp}.xlsx"
459
  cumulative_filename = "all_business_cards_total.xlsx"
460
 
461
+ # Download and merge existing cumulative data from Google Drive
462
  logger.info("Checking for existing cumulative file in Google Drive")
463
+ cumulative_df = current_df # Default to current data
464
+
465
  try:
466
+ # Clean up any duplicate cumulative files first
467
+ duplicates_removed = cleanup_duplicate_cumulative_files(drive_service)
468
+ if duplicates_removed > 0:
469
+ logger.info(f"Cleaned up {duplicates_removed} duplicate cumulative files")
 
 
 
470
 
471
+ # Get the existing cumulative file
472
+ existing_file = get_existing_cumulative_file(drive_service)
473
+
474
+ if existing_file:
475
+ logger.info(f"Existing cumulative file found: {existing_file['name']} (ID: {existing_file['id']})")
476
+
477
+ # Create temporary file for download
478
+ with tempfile.NamedTemporaryFile(suffix='.xlsx', delete=False) as existing_temp:
479
+ existing_temp_path = existing_temp.name
480
+
481
+ # Download existing file
482
+ if download_file_from_drive(drive_service, existing_file['id'], existing_temp_path):
483
+ logger.info("Successfully downloaded existing cumulative file")
484
+
485
+ try:
486
+ # Read existing data
487
+ existing_df = pd.read_excel(existing_temp_path)
488
+ logger.info(f"Loaded existing data: {len(existing_df)} rows")
489
+
490
+ # Merge with current data
491
+ cumulative_df = pd.concat([existing_df, current_df], ignore_index=True)
492
+ logger.info(f"Merged data: {len(existing_df)} existing + {len(current_df)} new = {len(cumulative_df)} total rows")
493
+
494
+ # Delete the old file from Google Drive since we'll upload a new one
495
+ delete_file_from_drive(drive_service, existing_file['id'])
496
+ logger.info("Deleted old cumulative file from Google Drive")
497
+
498
+ except Exception as e:
499
+ logger.error(f"Failed to read existing Excel file: {e}")
500
+ logger.info("Using current data only")
501
+ cumulative_df = current_df
502
+ finally:
503
+ # Clean up temporary file
504
+ try:
505
+ os.unlink(existing_temp_path)
506
+ except:
507
+ pass
508
+ else:
509
+ logger.warning("Failed to download existing cumulative file, using current data only")
510
+ cumulative_df = current_df
511
  else:
512
  logger.info("No existing cumulative file found, using current data only")
513
  cumulative_df = current_df
514
+
515
  except Exception as e:
516
+ logger.warning(f"Error handling existing cumulative data: {e}")
517
+ logger.info("Using current data only")
518
  cumulative_df = current_df
519
 
520
  # Write current run Excel file
 
589
 
590
  summary += f"📁 Current run file: {current_filename} (uploaded to Google Drive)\n"
591
  summary += f"📊 Total cumulative file: {cumulative_filename} (uploaded to Google Drive)\n"
592
+ summary += f"📊 Total cards in database: {len(cumulative_df)}\n"
593
+
594
+ # Add cleanup information
595
+ if 'duplicates_removed' in locals() and duplicates_removed > 0:
596
+ summary += f"🧹 Cleaned up {duplicates_removed} duplicate cumulative files\n"
597
+ if 'old_runs_removed' in locals() and old_runs_removed > 0:
598
+ summary += f"🧹 Cleaned up {old_runs_removed} old current run files\n"
599
+ summary += "\n"
600
 
601
  # Add Google Drive links
602
  summary += "🔗 Google Drive Links:\n"
business_card_exports/.gitkeep DELETED
@@ -1,2 +0,0 @@
1
- # This file ensures the business_card_exports directory is created and tracked by git
2
- # Excel files with extracted business card data will be saved here
 
 
 
business_cards/.gitkeep DELETED
@@ -1,2 +0,0 @@
1
- # This file ensures the business_cards directory is created and tracked by git
2
- # Business card images will be saved here when the app runs
 
 
 
google_funcs.py CHANGED
@@ -4,7 +4,7 @@ import base64
4
  from google.auth.transport.requests import Request
5
  from google_auth_oauthlib.flow import InstalledAppFlow
6
  from googleapiclient.discovery import build
7
- from googleapiclient.http import MediaFileUpload, MediaIoBaseUpload
8
  import io
9
  from pathlib import Path
10
  import logging
@@ -13,11 +13,10 @@ logger = logging.getLogger(__name__)
13
 
14
  # --- CONFIGURATION ---
15
  # Get credentials from environment variables
16
- # CLIENT_ID = os.getenv("GOOGLE_CLIENT_ID")
17
- # CLIENT_SECRET = os.getenv("GOOGLE_CLIENT_SECRET")
 
18
 
19
- CLIENT_ID = "204189841641-rdes6684mk1ekl30kfoaecjq0gnf0qvq.apps.googleusercontent.com"
20
- CLIENT_SECRET = "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"
21
 
22
  # Google Drive folder IDs
23
  EXPORTS_FOLDER_ID = "1k5iP4egzLrGJwnHkMhxt9bAkaCiieojO" # For Excel exports
@@ -34,8 +33,7 @@ def get_drive_service():
34
  # --- NEW CODE FOR DEPLOYMENT ENVIRONMENTS ---
35
  # If token file doesn't exist, try to create it from environment variable
36
  if not os.path.exists(TOKEN_PICKLE_FILE):
37
- # encoded_token = os.environ.get('GOOGLE_TOKEN_BASE64')
38
- encoded_token = "gASVyQMAAAAAAACMGWdvb2dsZS5vYXV0aDIuY3JlZGVudGlhbHOUjAtDcmVkZW50aWFsc5STlCmBlH2UKIwFdG9rZW6UjN55YTI5LmEwQVMzSDZOeW9FRnJtbmpXLUxTWkF1c3N0aU9lLUtxR1pSQmlsRDhVWGNtSEhJd1lCLXlXOTZEQ2NnclhPdDBHOWlSdnpOMUxlNUZwa25YazlYeTRvaXJLZktJb0tsNzRhU0NpTVVXWDNTV0h1LTNKM3IzLW1IUk1wdVg3bzVrS2U5Q3pFdUVMSUs1VHp4cEdHdzZsYWhNbDZsdW1UNnFrcDBEMkh1VC1fYUNnWUtBZnNTQVJBU0ZRSEdYMk1peXNudFF3bFVCdHRUblIxb0NMSzhaQTAxNzWUjAZleHBpcnmUjAhkYXRldGltZZSMCGRhdGV0aW1llJOUQwoH6QcPFzsxAOfTlIWUUpSMEV9xdW90YV9wcm9qZWN0X2lklE6MD190cnVzdF9ib3VuZGFyeZROjBBfdW5pdmVyc2VfZG9tYWlulIwOZ29vZ2xlYXBpcy5jb22UjBlfdXNlX25vbl9ibG9ja2luZ19yZWZyZXNolImMB19zY29wZXOUXZSMKmh0dHBzOi8vd3d3Lmdvb2dsZWFwaXMuY29tL2F1dGgvZHJpdmUuZmlsZZRhjA9fZGVmYXVsdF9zY29wZXOUTowOX3JlZnJlc2hfdG9rZW6UjGcxLy8wY3JmV2NzZ1FVbjZzQ2dZSUFSQUFHQXdTTndGLUw5SXI4bG1qd1ZvaHluSkFtRFFoZmhuOE14dUwwSkhIQklodl8xanpsYjdfeEFrTHlNTC1ybENPUm96a3RSa3BuUjNWcXFBlIwJX2lkX3Rva2VulE6MD19ncmFudGVkX3Njb3Blc5RdlIwqaHR0cHM6Ly93d3cuZ29vZ2xlYXBpcy5jb20vYXV0aC9kcml2ZS5maWxllGGMCl90b2tlbl91cmmUjCNodHRwczovL29hdXRoMi5nb29nbGVhcGlzLmNvbS90b2tlbpSMCl9jbGllbnRfaWSUjEgyMDQxODk4NDE2NDEtcmRlczY2ODRtazFla2wzMGtmb2FlY2pxMGduZjBxdnEuYXBwcy5nb29nbGV1c2VyY29udGVudC5jb22UjA5fY2xpZW50X3NlY3JldJSMI0dPQ1NQWC1CUnRKVHAwemtydkM2Q2JMRUl3LWdqZGhMMjJylIwLX3JhcHRfdG9rZW6UTowWX2VuYWJsZV9yZWF1dGhfcmVmcmVzaJSJjAhfYWNjb3VudJSMAJR1Yi4="
39
  if encoded_token:
40
  logger.info("Found token in environment variable. Recreating token.pickle file.")
41
  try:
@@ -182,6 +180,117 @@ def list_files_in_folder(service, folder_id, max_results=100):
182
  logger.error(f"Failed to list files in folder {folder_id}: {e}")
183
  return []
184
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  if __name__ == '__main__':
186
  # Test the Google Drive connection
187
  try:
 
4
  from google.auth.transport.requests import Request
5
  from google_auth_oauthlib.flow import InstalledAppFlow
6
  from googleapiclient.discovery import build
7
+ from googleapiclient.http import MediaFileUpload, MediaIoBaseUpload, MediaIoBaseDownload
8
  import io
9
  from pathlib import Path
10
  import logging
 
13
 
14
  # --- CONFIGURATION ---
15
  # Get credentials from environment variables
16
+ CLIENT_ID = os.getenv("GOOGLE_CLIENT_ID")
17
+ CLIENT_SECRET = os.getenv("GOOGLE_CLIENT_SECRET")
18
+
19
 
 
 
20
 
21
  # Google Drive folder IDs
22
  EXPORTS_FOLDER_ID = "1k5iP4egzLrGJwnHkMhxt9bAkaCiieojO" # For Excel exports
 
33
  # --- NEW CODE FOR DEPLOYMENT ENVIRONMENTS ---
34
  # If token file doesn't exist, try to create it from environment variable
35
  if not os.path.exists(TOKEN_PICKLE_FILE):
36
+ encoded_token = os.environ.get('GOOGLE_TOKEN_BASE64')
 
37
  if encoded_token:
38
  logger.info("Found token in environment variable. Recreating token.pickle file.")
39
  try:
 
180
  logger.error(f"Failed to list files in folder {folder_id}: {e}")
181
  return []
182
 
183
def download_file_from_drive(service, file_id, file_path):
    """Download a file from Google Drive to a local path.

    Args:
        service: Authenticated Google Drive API service object.
        file_id: ID of the Drive file to download.
        file_path: Local filesystem path the file is written to.

    Returns:
        True on success, False on any failure (the error is logged,
        never raised to the caller).
    """
    try:
        request = service.files().get_media(fileId=file_id)

        # 'with' guarantees the local file handle is closed even if a
        # chunk download raises mid-way.
        with open(file_path, 'wb') as local_file:
            downloader = MediaIoBaseDownload(local_file, request)
            done = False
            # next_chunk() returns (status, done); idiomatic truthiness
            # test instead of the original `while done is False`.
            while not done:
                _status, done = downloader.next_chunk()

        logger.info(f"Successfully downloaded file {file_id} to {file_path}")
        return True
    except Exception as e:
        logger.error(f"Failed to download file {file_id}: {e}")
        return False
199
+
200
def delete_file_from_drive(service, file_id):
    """Permanently remove a file from Google Drive by its ID.

    Logs the outcome either way and reports success via the return
    value; no exception is propagated to the caller.

    Args:
        service: Authenticated Google Drive API service object.
        file_id: ID of the Drive file to delete.

    Returns:
        True when the deletion succeeded, False otherwise.
    """
    try:
        delete_request = service.files().delete(fileId=file_id)
        delete_request.execute()
        logger.info(f"Successfully deleted file {file_id} from Google Drive")
        return True
    except Exception as e:
        logger.error(f"Failed to delete file {file_id}: {e}")
        return False
209
+
210
def get_existing_cumulative_file(service):
    """Find and return the existing cumulative Excel file from exports folder.

    Scans the exports folder for files named
    'all_business_cards_total.xlsx' and, when several exist, picks the
    most recently created one.

    Args:
        service: Authenticated Google Drive API service object.

    Returns:
        The metadata dict of the most recent cumulative file, or None
        when no such file exists or the lookup fails (errors are logged).
    """
    try:
        exports_files = list_files_in_folder(service, EXPORTS_FOLDER_ID)
        # Comprehension instead of the original manual append loop.
        cumulative_files = [
            file for file in exports_files
            if file['name'] == 'all_business_cards_total.xlsx'
        ]

        if cumulative_files:
            logger.info(f"Found {len(cumulative_files)} cumulative files")
            # Return the most recent one (by creation time).
            # NOTE(review): assumes every entry returned by
            # list_files_in_folder carries a 'createdTime' field — confirm
            # the fields requested in that helper.
            most_recent = max(cumulative_files, key=lambda x: x['createdTime'])
            logger.info(f"Most recent cumulative file: {most_recent['name']} (ID: {most_recent['id']})")
            return most_recent
        else:
            logger.info("No existing cumulative file found")
            return None

    except Exception as e:
        logger.error(f"Failed to get existing cumulative file: {e}")
        return None
233
+
234
def cleanup_duplicate_cumulative_files(service):
    """Remove duplicate cumulative files, keeping only the most recent one.

    Args:
        service: Authenticated Google Drive API service object.

    Returns:
        The number of duplicate files deleted; 0 when there is at most
        one cumulative file or when the cleanup fails (errors are
        logged, never raised).
    """
    try:
        exports_files = list_files_in_folder(service, EXPORTS_FOLDER_ID)
        # Comprehension instead of the original manual append loop.
        cumulative_files = [
            file for file in exports_files
            if file['name'] == 'all_business_cards_total.xlsx'
        ]

        # Guard clause: nothing to do with zero or one copy.
        if len(cumulative_files) <= 1:
            logger.info("No duplicate cumulative files found")
            return 0

        logger.info(f"Found {len(cumulative_files)} duplicate cumulative files, cleaning up...")
        # Sort newest-first so index 0 is the copy we keep.
        cumulative_files.sort(key=lambda x: x['createdTime'], reverse=True)
        files_to_delete = cumulative_files[1:]  # All except the most recent

        for file in files_to_delete:
            logger.info(f"Deleting duplicate file: {file['name']} (ID: {file['id']})")
            delete_file_from_drive(service, file['id'])

        logger.info(f"Cleaned up {len(files_to_delete)} duplicate files")
        return len(files_to_delete)

    except Exception as e:
        logger.error(f"Failed to cleanup duplicate files: {e}")
        return 0
263
+
264
def cleanup_old_current_run_files(service, keep_count=5):
    """Clean up old current run files, keeping only the most recent ones.

    Args:
        service: Authenticated Google Drive API service object.
        keep_count: How many of the newest 'current_run_*.xlsx' files
            to retain (default 5).

    Returns:
        The number of files deleted; 0 when no cleanup was needed or an
        error occurred (errors are logged, never raised).
    """
    try:
        folder_contents = list_files_in_folder(service, EXPORTS_FOLDER_ID)
        current_run_files = [
            entry for entry in folder_contents
            if entry['name'].startswith('current_run_') and entry['name'].endswith('.xlsx')
        ]

        if len(current_run_files) <= keep_count:
            logger.info(f"Found {len(current_run_files)} current run files, no cleanup needed")
            return 0

        logger.info(f"Found {len(current_run_files)} current run files, keeping {keep_count} most recent...")
        # Newest first; everything past keep_count is stale.
        current_run_files.sort(key=lambda x: x['createdTime'], reverse=True)
        stale_files = current_run_files[keep_count:]

        for entry in stale_files:
            logger.info(f"Deleting old current run file: {entry['name']} (ID: {entry['id']})")
            delete_file_from_drive(service, entry['id'])

        logger.info(f"Cleaned up {len(stale_files)} old current run files")
        return len(stale_files)

    except Exception as e:
        logger.error(f"Failed to cleanup old current run files: {e}")
        return 0
293
+
294
  if __name__ == '__main__':
295
  # Test the Google Drive connection
296
  try: