rongo1 committed on
Commit
dae9b98
·
1 Parent(s): 46dec01

feat: added Google Drive support

Browse files
Files changed (4) hide show
  1. app.py +153 -275
  2. env.example +11 -6
  3. google.py +175 -0
  4. requirements.txt +3 -1
app.py CHANGED
@@ -10,11 +10,12 @@ import io
10
  import base64
11
  import logging
12
  import sys
13
- import shutil
14
- from huggingface_hub import HfApi
 
 
15
 
16
  # Configure logging
17
- # Simplified logging for cloud deployment
18
  logging.basicConfig(
19
  level=logging.INFO,
20
  format='%(asctime)s - %(levelname)s - %(funcName)s:%(lineno)d - %(message)s',
@@ -30,72 +31,63 @@ gemini_api_key = os.getenv("Gemini_API")
30
  if not gemini_api_key:
31
  logger.error("Gemini_API environment variable not found!")
32
  logger.error("Please set the Gemini_API environment variable with your Google Gemini API key")
33
- logger.error("For Hugging Face Spaces: Add it as a Repository Secret in Space Settings")
34
- raise ValueError("❌ Gemini_API environment variable is required. Please set it in your environment or Hugging Face Space secrets.")
35
 
36
  genai.configure(api_key=gemini_api_key)
37
  logger.info("Gemini API configured successfully")
38
 
39
- # Configure Hugging Face Hub API for file persistence
40
- space_repo_id = "polygraf-ai/business_card_extractor" # Your Space repo
41
- hf_token = os.getenv("HF_TOKEN") # Optional - only needed for Hub uploads
42
-
43
- if hf_token:
44
- hf_api = HfApi(token=hf_token)
45
- logger.info("Hugging Face Hub API configured for file uploads")
46
- logger.info(f"Space repo: {space_repo_id}")
47
- else:
48
- hf_api = None
49
- logger.info("HF_TOKEN not found - files will be saved to persistent storage only")
50
- logger.info("To enable Hub uploads: Set HF_TOKEN in Space secrets (optional)")
51
-
52
- # Create output directories
53
- logger.info("Setting up output directories")
54
- # Try /data for persistent storage, fallback to current directory
55
  try:
56
- # Check if /data is available (persistent storage enabled)
57
- data_path = Path("/data")
58
- if data_path.exists() and os.access(data_path, os.W_OK):
59
- BASE_PATH = data_path
60
- logger.info("Using /data directory for persistent storage")
61
- else:
62
- raise PermissionError("No access to /data")
63
- except (PermissionError, OSError):
64
- # Fallback to current directory for free tier
65
- BASE_PATH = Path(".")
66
- logger.info("Using current directory (files will be ephemeral without persistent storage)")
67
-
68
- output_dir = BASE_PATH / "business_card_exports"
69
- images_dir = BASE_PATH / "business_cards"
70
- output_dir.mkdir(parents=True, exist_ok=True)
71
- images_dir.mkdir(parents=True, exist_ok=True)
72
- logger.info(f"Export directory created/verified: {output_dir}")
73
- logger.info(f"Images directory created/verified: {images_dir}")
74
 
75
  # Log startup
76
- logger.info("Business Card Data Extractor starting up")
77
- logger.info(f"Working directory: {os.getcwd()}")
78
- logger.info(f"Export directory: {output_dir.absolute()}")
79
- logger.info(f"Images directory: {images_dir.absolute()}")
80
 
81
- def upload_to_hub(file_path, path_in_repo, commit_message="Update files"):
82
- """Upload a file to the Hugging Face Hub repository"""
83
- if hf_api: # Only need API token, repo ID is hardcoded
84
- try:
85
- logger.info(f"Uploading {file_path} to Hub at {path_in_repo}")
86
- hf_api.upload_file(
87
- path_or_fileobj=str(file_path),
88
- path_in_repo=path_in_repo,
89
- repo_id=space_repo_id,
90
- repo_type="space",
91
- commit_message=commit_message
92
- )
93
- logger.info(f"Successfully uploaded {file_path} to Hub")
94
- return True
95
- except Exception as e:
96
- logger.error(f"Failed to upload {file_path} to Hub: {e}")
97
- return False
98
- return False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
 
100
  def extract_business_card_data_batch(images, filenames, model_name="gemini-2.5-flash"):
101
  """Extract data from multiple business card images in a single API call"""
@@ -277,6 +269,7 @@ def process_business_cards(images, model_name="gemini-2.5-flash", save_images=Tr
277
  # Load and group images into batches of 5
278
  loaded_images = []
279
  filenames = []
 
280
 
281
  logger.info(f"Loading {len(images)} images")
282
  for idx, image_path in enumerate(images):
@@ -302,10 +295,9 @@ def process_business_cards(images, model_name="gemini-2.5-flash", save_images=Tr
302
 
303
  logger.info(f"Successfully loaded {len(loaded_images)} out of {len(images)} images")
304
 
305
- # Save images if requested
306
- saved_image_paths = []
307
  if save_images and loaded_images:
308
- logger.info(f"Saving {len(loaded_images)} images to business_cards directory")
309
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
310
 
311
  for i, (image, filename) in enumerate(zip(loaded_images, filenames)):
@@ -315,33 +307,27 @@ def process_business_cards(images, model_name="gemini-2.5-flash", save_images=Tr
315
  if not ext:
316
  ext = '.png'
317
  unique_filename = f"{timestamp}_{i+1:03d}_{name}{ext}"
318
- image_path = images_dir / unique_filename
319
 
320
- # Save the image
321
- image.save(image_path)
322
- saved_image_paths.append(str(image_path))
323
- logger.debug(f"Saved image {i+1}: {unique_filename}")
324
 
325
- # Upload image to Hub
326
- upload_to_hub(
327
- image_path,
328
- f"images/{unique_filename}",
329
- f"Add business card image {unique_filename}"
330
- )
 
 
 
331
 
332
  except Exception as e:
333
  logger.error(f"Failed to save image {filename}: {e}")
 
334
 
335
- logger.info(f"Successfully saved {len(saved_image_paths)} images")
336
-
337
- # List directory contents for debugging
338
- try:
339
- images_list = list(images_dir.iterdir())
340
- logger.info(f"Images directory contains {len(images_list)} files")
341
- for img_file in images_list[-5:]: # Show last 5 files
342
- logger.debug(f" - {img_file.name}")
343
- except Exception as e:
344
- logger.error(f"Error listing images directory: {e}")
345
 
346
  # Group into batches
347
  logger.info(f"Grouping {len(loaded_images)} images into batches of {batch_size}")
@@ -377,13 +363,13 @@ def process_business_cards(images, model_name="gemini-2.5-flash", save_images=Tr
377
  data['processed_date'] = timestamp
378
  logger.debug(f"Added timestamp {timestamp} to {card_filename}")
379
 
380
- # Add saved image path if images were saved
381
  global_index = batch_idx * batch_size + i
382
- if save_images and global_index < len(saved_image_paths):
383
- data['saved_image_path'] = saved_image_paths[global_index]
384
- logger.debug(f"Added saved image path for {card_filename}: {saved_image_paths[global_index]}")
385
  else:
386
- data['saved_image_path'] = None
387
 
388
  # Handle multiple values (emails, phones) by joining with commas
389
  list_fields_processed = []
@@ -454,138 +440,96 @@ def process_business_cards(images, model_name="gemini-2.5-flash", save_images=Tr
454
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
455
  logger.debug(f"Generated timestamp: {timestamp}")
456
 
457
- # Create current run file
458
- current_filename = output_dir / f"current_run_{timestamp}.xlsx"
459
- logger.info(f"Current run file will be saved as: {current_filename}")
 
 
460
 
461
- # Load existing cumulative data if it exists
462
- cumulative_filename = output_dir / "all_business_cards_total.xlsx"
463
- logger.info(f"Checking for existing cumulative file: {cumulative_filename}")
464
 
465
- if cumulative_filename.exists():
466
- logger.info("Existing cumulative file found, loading and merging data")
467
- try:
468
- existing_df = pd.read_excel(cumulative_filename)
469
- logger.info(f"Loaded existing data: {len(existing_df)} rows")
470
- # Append new data to existing
471
- cumulative_df = pd.concat([existing_df, current_df], ignore_index=True)
472
- logger.info(f"Merged data: {len(cumulative_df)} total rows ({len(existing_df)} existing + {len(current_df)} new)")
473
- except Exception as e:
474
- error_msg = f"Warning: Could not load existing data: {e}"
475
- logger.warning(error_msg)
476
- print(error_msg)
 
 
 
477
  cumulative_df = current_df
478
- logger.info("Using current data only for cumulative file")
479
- else:
480
- logger.info("No existing cumulative file found, using current data only")
 
 
 
481
  cumulative_df = current_df
482
 
483
  # Write current run Excel file
484
- logger.info(f"Writing current run Excel file: {current_filename}")
485
  try:
486
- # Ensure the file path is a Path object
487
- current_filename = Path(current_filename)
488
- with pd.ExcelWriter(current_filename, engine='openpyxl') as writer:
489
  current_df.to_excel(writer, index=False, sheet_name='Current Run')
490
  logger.debug(f"Written {len(current_df)} rows to 'Current Run' sheet")
491
 
492
  # Auto-adjust column widths
493
  logger.debug("Auto-adjusting column widths for current run file")
494
  worksheet = writer.sheets['Current Run']
495
- adjusted_columns = []
496
  for column in current_df:
497
  column_length = max(current_df[column].astype(str).map(len).max(), len(column))
498
  col_idx = current_df.columns.get_loc(column)
499
  final_width = min(column_length + 2, 50)
500
  worksheet.column_dimensions[chr(65 + col_idx)].width = final_width
501
- adjusted_columns.append(f"{column}:{final_width}")
502
- logger.debug(f"Adjusted column widths: {adjusted_columns}")
503
 
504
- logger.info(f"Current run Excel file saved successfully: {current_filename}")
505
 
506
- # Upload current run file to Hub
507
- upload_to_hub(
508
- current_filename,
509
- f"exports/{current_filename.name}",
510
- f"Add current run export {current_filename.name}"
511
- )
512
 
513
- # Create a manifest file for the current run directory
514
- manifest_filename = output_dir / f"current_run_{timestamp}_manifest.txt"
515
- try:
516
- with open(manifest_filename, "w") as f:
517
- f.write(f"Current Run Directory: {output_dir}\n")
518
- f.write(f"Images Directory: {images_dir}\n")
519
- f.write(f"Current Run File: {current_filename}\n")
520
- f.write(f"Cumulative File: {cumulative_filename}\n")
521
- f.write(f"Total Cards in Database: {len(cumulative_df)}\n")
522
- f.write(f"Total Images Saved: {len(saved_image_paths) if saved_image_paths else 0}\n")
523
- f.write(f"Total API Calls Made: {len(image_batches)}\n")
524
- f.write(f"Model Used: {model_name}\n")
525
- f.write(f"Save Images: {save_images}\n")
526
- f.write(f"Processing Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
527
- f.write(f"Session ID: {timestamp}\n")
528
- f.write(f"Errors: {len(errors) if 'errors' in locals() else 0}\n")
529
- logger.info(f"Manifest file created: {manifest_filename}")
530
- except Exception as e:
531
- logger.error(f"Failed to create manifest file: {e}")
532
  except Exception as e:
533
- logger.error(f"Failed to write current run Excel file: {e}")
534
  raise
535
 
536
  # Write cumulative Excel file
537
- logger.info(f"Writing cumulative Excel file: {cumulative_filename}")
538
  try:
539
- # Ensure the file path is a Path object
540
- cumulative_filename = Path(cumulative_filename)
541
- with pd.ExcelWriter(cumulative_filename, engine='openpyxl') as writer:
542
  cumulative_df.to_excel(writer, index=False, sheet_name='All Business Cards')
543
  logger.debug(f"Written {len(cumulative_df)} rows to 'All Business Cards' sheet")
544
 
545
  # Auto-adjust column widths
546
  logger.debug("Auto-adjusting column widths for cumulative file")
547
  worksheet = writer.sheets['All Business Cards']
548
- adjusted_columns = []
549
  for column in cumulative_df:
550
  column_length = max(cumulative_df[column].astype(str).map(len).max(), len(column))
551
  col_idx = cumulative_df.columns.get_loc(column)
552
  final_width = min(column_length + 2, 50)
553
  worksheet.column_dimensions[chr(65 + col_idx)].width = final_width
554
- adjusted_columns.append(f"{column}:{final_width}")
555
- logger.debug(f"Adjusted column widths: {adjusted_columns}")
556
 
557
- logger.info(f"Cumulative Excel file saved successfully: {cumulative_filename}")
558
 
559
- # Upload cumulative file to Hub
560
- upload_to_hub(
561
- cumulative_filename,
562
- f"exports/{cumulative_filename.name}",
563
- f"Update cumulative database - {len(cumulative_df)} total cards"
564
- )
565
 
566
- # Create a manifest file for the cumulative directory
567
- manifest_filename = output_dir / f"all_business_cards_total_{timestamp}_manifest.txt"
568
- try:
569
- with open(manifest_filename, "w") as f:
570
- f.write(f"All Business Cards Directory: {output_dir}\n")
571
- f.write(f"Images Directory: {images_dir}\n")
572
- f.write(f"Current Run File: {current_filename}\n")
573
- f.write(f"Cumulative File: {cumulative_filename}\n")
574
- f.write(f"Total Cards in Database: {len(cumulative_df)}\n")
575
- f.write(f"Total Images Saved: {len(saved_image_paths) if saved_image_paths else 0}\n")
576
- f.write(f"Total API Calls Made: {len(image_batches)}\n")
577
- f.write(f"Model Used: {model_name}\n")
578
- f.write(f"Save Images: {save_images}\n")
579
- f.write(f"Processing Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
580
- f.write(f"Session ID: {timestamp}\n")
581
- f.write(f"Errors: {len(errors) if 'errors' in locals() else 0}\n")
582
- logger.info(f"Manifest file created: {manifest_filename}")
583
- except Exception as e:
584
- logger.error(f"Failed to create manifest file: {e}")
585
  except Exception as e:
586
- logger.error(f"Failed to write cumulative Excel file: {e}")
587
  raise
588
 
 
 
 
589
  # Create summary message
590
  logger.info("Creating summary message")
591
  num_batches = len(image_batches) if 'image_batches' in locals() else 1
@@ -594,20 +538,23 @@ def process_business_cards(images, model_name="gemini-2.5-flash", save_images=Tr
594
  summary += f"⚡ API calls made: {num_batches} (instead of {len(all_data)})\n"
595
 
596
  if save_images:
597
- num_saved = len(saved_image_paths) if 'saved_image_paths' in locals() else 0
598
- summary += f"💾 Images saved: {num_saved} cards saved to business_cards folder\n\n"
599
  else:
600
- summary += f"💾 Images saved: No (save option was disabled)\n\n"
601
 
602
- summary += f"📁 Current run file: {current_filename.name}\n"
603
- summary += f"📁 Total cumulative file: {cumulative_filename.name}\n"
604
  summary += f"📊 Total cards in database: {len(cumulative_df)}\n\n"
605
 
606
- # Add note about file locations
607
- summary += "📌 File Locations:\n"
608
- summary += f" - Excel files: business_card_exports/ folder\n"
609
- summary += f" - Images: business_cards/ folder\n"
610
- summary += f" - Check FOLDER_CONTENTS.txt for complete file listing\n\n"
 
 
 
611
 
612
  if errors:
613
  logger.warning(f"Encountered {len(errors)} errors during processing")
@@ -625,78 +572,8 @@ def process_business_cards(images, model_name="gemini-2.5-flash", save_images=Tr
625
  logger.info("Business card processing session completed successfully")
626
  logger.info(f"Session summary - Cards: {len(all_data)}, Batches: {num_batches}, API calls: {num_batches}, Total DB size: {len(cumulative_df)}")
627
 
628
- # Verify files exist and return as strings for Gradio
629
- if current_filename.exists():
630
- logger.info(f"Current file exists: {current_filename}")
631
- else:
632
- logger.error(f"Current file NOT found: {current_filename}")
633
-
634
- if cumulative_filename.exists():
635
- logger.info(f"Cumulative file exists: {cumulative_filename}")
636
- else:
637
- logger.error(f"Cumulative file NOT found: {cumulative_filename}")
638
-
639
- # List export directory contents for debugging
640
- try:
641
- export_list = list(output_dir.iterdir())
642
- logger.info(f"Export directory contains {len(export_list)} files")
643
- for exp_file in export_list[-5:]: # Show last 5 files
644
- logger.debug(f" - {exp_file.name}")
645
- except Exception as e:
646
- logger.error(f"Error listing export directory: {e}")
647
-
648
- # List root directory Excel/ZIP files for debugging
649
- try:
650
- root_files = [f for f in Path(".").iterdir() if f.suffix in ['.xlsx', '.zip'] and f.is_file()]
651
- logger.info(f"Root directory contains {len(root_files)} Excel/ZIP files")
652
- for root_file in root_files[-5:]: # Show last 5 files
653
- logger.info(f" - {root_file.name} ({root_file.stat().st_size} bytes)")
654
- except Exception as e:
655
- logger.error(f"Error listing root directory: {e}")
656
-
657
- # Create a directory listing file in root showing folder contents
658
- try:
659
- with open("FOLDER_CONTENTS.txt", "w") as f:
660
- f.write("=== BUSINESS CARD ANALYZER - FOLDER CONTENTS ===\n")
661
- f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
662
-
663
- # List business_card_exports folder
664
- f.write("📁 business_card_exports/\n")
665
- f.write("-" * 50 + "\n")
666
- if output_dir.exists():
667
- files = sorted(output_dir.iterdir())
668
- for file in files:
669
- if file.is_file():
670
- size = file.stat().st_size
671
- f.write(f" 📄 {file.name} ({size:,} bytes)\n")
672
- else:
673
- f.write(" [Folder does not exist]\n")
674
- f.write(f"\nTotal files: {len(list(output_dir.glob('*')))}\n\n")
675
-
676
- # List business_cards folder
677
- f.write("📁 business_cards/\n")
678
- f.write("-" * 50 + "\n")
679
- if images_dir.exists():
680
- files = sorted(images_dir.iterdir())
681
- for file in files:
682
- if file.is_file():
683
- size = file.stat().st_size
684
- f.write(f" 🖼️ {file.name} ({size:,} bytes)\n")
685
- else:
686
- f.write(" [Folder does not exist]\n")
687
- f.write(f"\nTotal files: {len(list(images_dir.glob('*')))}\n\n")
688
-
689
- # Add latest processing summary
690
- f.write("📊 Latest Processing Summary\n")
691
- f.write("-" * 50 + "\n")
692
- f.write(summary)
693
-
694
- logger.info("Created FOLDER_CONTENTS.txt in root directory")
695
- except Exception as e:
696
- logger.error(f"Failed to create folder contents file: {e}")
697
-
698
- # Return string paths for Gradio File components
699
- return str(current_filename), str(cumulative_filename), summary, preview_df
700
 
701
  # Create Gradio interface
702
  logger.info("Creating Gradio interface")
@@ -712,20 +589,21 @@ with gr.Blocks(title="Business Card Data Extractor") as demo:
712
  - 📁 **Current Run**: Contains only the cards you just processed
713
  - 📊 **Total Database**: Contains ALL cards ever processed (cumulative)
714
 
715
- **💾 File Storage:**
716
- - 📂 Excel files: `business_card_exports/` folder
717
- - 🖼️ Images: `business_cards/` folder (if save option enabled)
718
- - 🔄 **Persistent storage**: Enable in Space settings for files to survive restarts
719
- - 🌐 **Hub uploads**: Files auto-uploaded to repository (if `HF_TOKEN` configured)
720
 
721
  **📌 File Access:**
722
- - ⬇️ Download directly from interface buttons
723
- - 📁 Check Space's Files tab (if persistent storage enabled)
724
- - 🌐 Hub repository: `exports/` and `images/` folders (if token set)
725
- - 📋 `FOLDER_CONTENTS.txt` shows complete file listing
726
 
727
- **⚙️ Optional Hub Integration:**
728
- - Set `HF_TOKEN` in Space secrets to upload files to `polygraf-ai/business_card_extractor`
 
729
  """
730
  )
731
 
@@ -784,10 +662,10 @@ with gr.Blocks(title="Business Card Data Extractor") as demo:
784
  - 📄 **Data Extraction**: Names, emails, phone numbers, addresses, and more
785
  - 📞 **Smart Combination**: Multiple emails/phones combined with commas
786
  - 🏠 **Address Merging**: All phone types and address fields combined
787
- - 💾 **Smart Storage**: Auto-detects persistent storage, falls back gracefully
788
- - 🌐 **Hub Integration**: Auto-upload to repository (optional)
789
  - 📊 **Dual Output**: Current run + cumulative database files
790
- - 📝 **Full Tracking**: Processing date, filename, image path, and AI model used
791
  - 🎯 **One Row Per Card**: Each business card becomes one spreadsheet row
792
  """
793
  )
 
10
  import base64
11
  import logging
12
  import sys
13
+ import tempfile
14
+
15
+ # Import Google Drive functionality
16
+ from google import get_drive_service, upload_excel_to_exports_folder, upload_image_to_images_folder, list_files_in_folder
17
 
18
  # Configure logging
 
19
  logging.basicConfig(
20
  level=logging.INFO,
21
  format='%(asctime)s - %(levelname)s - %(funcName)s:%(lineno)d - %(message)s',
 
31
  if not gemini_api_key:
32
  logger.error("Gemini_API environment variable not found!")
33
  logger.error("Please set the Gemini_API environment variable with your Google Gemini API key")
34
+ raise ValueError(" Gemini_API environment variable is required. Please set it in your environment.")
 
35
 
36
  genai.configure(api_key=gemini_api_key)
37
  logger.info("Gemini API configured successfully")
38
 
39
+ # Initialize Google Drive service
40
+ logger.info("Initializing Google Drive service")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  try:
42
+ drive_service = get_drive_service()
43
+ logger.info("Google Drive service initialized successfully")
44
+ except Exception as e:
45
+ logger.error(f"Failed to initialize Google Drive service: {e}")
46
+ logger.error("Please ensure GOOGLE_CLIENT_ID and GOOGLE_CLIENT_SECRET environment variables are set")
47
+ raise ValueError("❌ Google Drive credentials are required. Please set GOOGLE_CLIENT_ID and GOOGLE_CLIENT_SECRET environment variables.")
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
  # Log startup
50
+ logger.info("Business Card Data Extractor starting up with Google Drive storage")
 
 
 
51
 
52
+ def upload_to_google_drive(file_path, is_excel=False, filename=None):
53
+ """Upload a file to Google Drive"""
54
+ try:
55
+ if is_excel:
56
+ logger.info(f"Uploading Excel file to Google Drive: {filename or file_path}")
57
+ result = upload_excel_to_exports_folder(drive_service, file_path=file_path, filename=filename)
58
+ else:
59
+ logger.info(f"Uploading image file to Google Drive: {filename or file_path}")
60
+ result = upload_image_to_images_folder(drive_service, file_path=file_path, filename=filename)
61
+
62
+ if result:
63
+ logger.info(f"Successfully uploaded to Google Drive: {result['webViewLink']}")
64
+ return result
65
+ else:
66
+ logger.error("Failed to upload to Google Drive")
67
+ return None
68
+ except Exception as e:
69
+ logger.error(f"Failed to upload to Google Drive: {e}")
70
+ return None
71
+
72
+ def upload_bytes_to_google_drive(file_data, filename, is_excel=False):
73
+ """Upload file data (bytes) to Google Drive"""
74
+ try:
75
+ if is_excel:
76
+ logger.info(f"Uploading Excel data to Google Drive: {filename}")
77
+ result = upload_excel_to_exports_folder(drive_service, file_data=file_data, filename=filename)
78
+ else:
79
+ logger.info(f"Uploading image data to Google Drive: {filename}")
80
+ result = upload_image_to_images_folder(drive_service, file_data=file_data, filename=filename)
81
+
82
+ if result:
83
+ logger.info(f"Successfully uploaded to Google Drive: {result['webViewLink']}")
84
+ return result
85
+ else:
86
+ logger.error("Failed to upload to Google Drive")
87
+ return None
88
+ except Exception as e:
89
+ logger.error(f"Failed to upload to Google Drive: {e}")
90
+ return None
91
 
92
  def extract_business_card_data_batch(images, filenames, model_name="gemini-2.5-flash"):
93
  """Extract data from multiple business card images in a single API call"""
 
269
  # Load and group images into batches of 5
270
  loaded_images = []
271
  filenames = []
272
+ uploaded_image_links = []
273
 
274
  logger.info(f"Loading {len(images)} images")
275
  for idx, image_path in enumerate(images):
 
295
 
296
  logger.info(f"Successfully loaded {len(loaded_images)} out of {len(images)} images")
297
 
298
+ # Save images to Google Drive if requested
 
299
  if save_images and loaded_images:
300
+ logger.info(f"Saving {len(loaded_images)} images to Google Drive")
301
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
302
 
303
  for i, (image, filename) in enumerate(zip(loaded_images, filenames)):
 
307
  if not ext:
308
  ext = '.png'
309
  unique_filename = f"{timestamp}_{i+1:03d}_{name}{ext}"
 
310
 
311
+ # Convert image to bytes
312
+ img_buffer = io.BytesIO()
313
+ image.save(img_buffer, format='PNG')
314
+ img_bytes = img_buffer.getvalue()
315
 
316
+ # Upload to Google Drive
317
+ result = upload_bytes_to_google_drive(img_bytes, unique_filename, is_excel=False)
318
+
319
+ if result:
320
+ uploaded_image_links.append(result['webViewLink'])
321
+ logger.debug(f"Saved image {i+1}: {unique_filename}")
322
+ else:
323
+ uploaded_image_links.append(None)
324
+ logger.error(f"Failed to upload image {unique_filename}")
325
 
326
  except Exception as e:
327
  logger.error(f"Failed to save image {filename}: {e}")
328
+ uploaded_image_links.append(None)
329
 
330
+ logger.info(f"Successfully uploaded {sum(1 for link in uploaded_image_links if link)} images to Google Drive")
 
 
 
 
 
 
 
 
 
331
 
332
  # Group into batches
333
  logger.info(f"Grouping {len(loaded_images)} images into batches of {batch_size}")
 
363
  data['processed_date'] = timestamp
364
  logger.debug(f"Added timestamp {timestamp} to {card_filename}")
365
 
366
+ # Add Google Drive image link if images were saved
367
  global_index = batch_idx * batch_size + i
368
+ if save_images and global_index < len(uploaded_image_links) and uploaded_image_links[global_index]:
369
+ data['google_drive_image_link'] = uploaded_image_links[global_index]
370
+ logger.debug(f"Added Google Drive image link for {card_filename}: {uploaded_image_links[global_index]}")
371
  else:
372
+ data['google_drive_image_link'] = None
373
 
374
  # Handle multiple values (emails, phones) by joining with commas
375
  list_fields_processed = []
 
440
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
441
  logger.debug(f"Generated timestamp: {timestamp}")
442
 
443
+ # Create temporary files for Excel generation
444
+ with tempfile.NamedTemporaryFile(suffix='.xlsx', delete=False) as current_temp:
445
+ current_temp_path = current_temp.name
446
+ with tempfile.NamedTemporaryFile(suffix='.xlsx', delete=False) as cumulative_temp:
447
+ cumulative_temp_path = cumulative_temp.name
448
 
449
+ current_filename = f"current_run_{timestamp}.xlsx"
450
+ cumulative_filename = "all_business_cards_total.xlsx"
 
451
 
452
+ # Try to download existing cumulative data from Google Drive
453
+ logger.info("Checking for existing cumulative file in Google Drive")
454
+ try:
455
+ # List files in exports folder to find existing cumulative file
456
+ exports_files = list_files_in_folder(drive_service, "1k5iP4egzLrGJwnHkMhxt9bAkaCiieojO")
457
+ cumulative_file = None
458
+ for file in exports_files:
459
+ if file['name'] == 'all_business_cards_total.xlsx':
460
+ cumulative_file = file
461
+ break
462
+
463
+ if cumulative_file:
464
+ logger.info("Existing cumulative file found in Google Drive")
465
+ # For now, we'll just use current data since downloading and merging is complex
466
+ # In production, you'd want to implement Google Drive file download
467
  cumulative_df = current_df
468
+ logger.info("Using current data only (Google Drive download not implemented yet)")
469
+ else:
470
+ logger.info("No existing cumulative file found, using current data only")
471
+ cumulative_df = current_df
472
+ except Exception as e:
473
+ logger.warning(f"Could not check for existing data in Google Drive: {e}")
474
  cumulative_df = current_df
475
 
476
  # Write current run Excel file
477
+ logger.info(f"Creating current run Excel file: {current_filename}")
478
  try:
479
+ with pd.ExcelWriter(current_temp_path, engine='openpyxl') as writer:
 
 
480
  current_df.to_excel(writer, index=False, sheet_name='Current Run')
481
  logger.debug(f"Written {len(current_df)} rows to 'Current Run' sheet")
482
 
483
  # Auto-adjust column widths
484
  logger.debug("Auto-adjusting column widths for current run file")
485
  worksheet = writer.sheets['Current Run']
 
486
  for column in current_df:
487
  column_length = max(current_df[column].astype(str).map(len).max(), len(column))
488
  col_idx = current_df.columns.get_loc(column)
489
  final_width = min(column_length + 2, 50)
490
  worksheet.column_dimensions[chr(65 + col_idx)].width = final_width
 
 
491
 
492
+ logger.info(f"Current run Excel file created locally")
493
 
494
+ # Upload current run file to Google Drive
495
+ current_result = upload_to_google_drive(current_temp_path, is_excel=True, filename=current_filename)
496
+ if current_result:
497
+ logger.info(f"Current run file uploaded to Google Drive: {current_result['webViewLink']}")
 
 
498
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
499
  except Exception as e:
500
+ logger.error(f"Failed to create current run Excel file: {e}")
501
  raise
502
 
503
  # Write cumulative Excel file
504
+ logger.info(f"Creating cumulative Excel file: {cumulative_filename}")
505
  try:
506
+ with pd.ExcelWriter(cumulative_temp_path, engine='openpyxl') as writer:
 
 
507
  cumulative_df.to_excel(writer, index=False, sheet_name='All Business Cards')
508
  logger.debug(f"Written {len(cumulative_df)} rows to 'All Business Cards' sheet")
509
 
510
  # Auto-adjust column widths
511
  logger.debug("Auto-adjusting column widths for cumulative file")
512
  worksheet = writer.sheets['All Business Cards']
 
513
  for column in cumulative_df:
514
  column_length = max(cumulative_df[column].astype(str).map(len).max(), len(column))
515
  col_idx = cumulative_df.columns.get_loc(column)
516
  final_width = min(column_length + 2, 50)
517
  worksheet.column_dimensions[chr(65 + col_idx)].width = final_width
 
 
518
 
519
+ logger.info(f"Cumulative Excel file created locally")
520
 
521
+ # Upload cumulative file to Google Drive
522
+ cumulative_result = upload_to_google_drive(cumulative_temp_path, is_excel=True, filename=cumulative_filename)
523
+ if cumulative_result:
524
+ logger.info(f"Cumulative file uploaded to Google Drive: {cumulative_result['webViewLink']}")
 
 
525
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
526
  except Exception as e:
527
+ logger.error(f"Failed to create cumulative Excel file: {e}")
528
  raise
529
 
530
+ # Note: Don't delete temp files here - Gradio needs them for download
531
+ # Gradio will handle cleanup automatically
532
+
533
  # Create summary message
534
  logger.info("Creating summary message")
535
  num_batches = len(image_batches) if 'image_batches' in locals() else 1
 
538
  summary += f"⚡ API calls made: {num_batches} (instead of {len(all_data)})\n"
539
 
540
  if save_images:
541
+ num_uploaded = sum(1 for link in uploaded_image_links if link) if 'uploaded_image_links' in locals() else 0
542
+ summary += f"💾 Images uploaded to Google Drive: {num_uploaded} cards\n\n"
543
  else:
544
+ summary += f"💾 Images uploaded to Google Drive: No (save option was disabled)\n\n"
545
 
546
+ summary += f"📁 Current run file: {current_filename} (uploaded to Google Drive)\n"
547
+ summary += f"📊 Total cumulative file: {cumulative_filename} (uploaded to Google Drive)\n"
548
  summary += f"📊 Total cards in database: {len(cumulative_df)}\n\n"
549
 
550
+ # Add Google Drive links
551
+ summary += "🔗 Google Drive Links:\n"
552
+ if 'current_result' in locals() and current_result:
553
+ summary += f" 📄 Current Run: {current_result['webViewLink']}\n"
554
+ if 'cumulative_result' in locals() and cumulative_result:
555
+ summary += f" 📊 Total Database: {cumulative_result['webViewLink']}\n"
556
+ summary += f" 📁 Exports Folder: https://drive.google.com/drive/folders/1k5iP4egzLrGJwnHkMhxt9bAkaCiieojO\n"
557
+ summary += f" 🖼️ Images Folder: https://drive.google.com/drive/folders/1gd280IqcAzpAFTPeYsZjoBUOU9S7Zx3c\n\n"
558
 
559
  if errors:
560
  logger.warning(f"Encountered {len(errors)} errors during processing")
 
572
  logger.info("Business card processing session completed successfully")
573
  logger.info(f"Session summary - Cards: {len(all_data)}, Batches: {num_batches}, API calls: {num_batches}, Total DB size: {len(cumulative_df)}")
574
 
575
+ # Return the temporary file paths for download (Gradio will handle the download)
576
+ return current_temp_path, cumulative_temp_path, summary, preview_df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
577
 
578
  # Create Gradio interface
579
  logger.info("Creating Gradio interface")
 
589
  - 📁 **Current Run**: Contains only the cards you just processed
590
  - 📊 **Total Database**: Contains ALL cards ever processed (cumulative)
591
 
592
+ **☁️ Google Drive Storage:**
593
+ - 📂 Excel files: Automatically uploaded to Google Drive exports folder
594
+ - 🖼️ Images: Uploaded to Google Drive images folder (if save option enabled)
595
+ - 🔗 **Direct Links**: Access files directly through provided Google Drive links
596
+ - 📁 **Organized Folders**: Separate folders for exports and images
597
 
598
  **📌 File Access:**
599
+ - ⬇️ Download directly from interface buttons (temporary copies)
600
+ - 🔗 Access permanent files via Google Drive links in results
601
+ - 📁 **Exports Folder**: https://drive.google.com/drive/folders/1k5iP4egzLrGJwnHkMhxt9bAkaCiieojO
602
+ - 🖼️ **Images Folder**: https://drive.google.com/drive/folders/1gd280IqcAzpAFTPeYsZjoBUOU9S7Zx3c
603
 
604
+ **⚙️ Google Drive Integration:**
605
+ - Requires `GOOGLE_CLIENT_ID` and `GOOGLE_CLIENT_SECRET` environment variables
606
+ - Files are automatically uploaded and organized in predefined folders
607
  """
608
  )
609
 
 
662
  - 📄 **Data Extraction**: Names, emails, phone numbers, addresses, and more
663
  - 📞 **Smart Combination**: Multiple emails/phones combined with commas
664
  - 🏠 **Address Merging**: All phone types and address fields combined
665
+ - ☁️ **Google Drive Storage**: Automatic upload to organized Drive folders
666
+ - 🔗 **Direct Links**: Instant access to files via Google Drive URLs
667
  - 📊 **Dual Output**: Current run + cumulative database files
668
+ - 📝 **Full Tracking**: Processing date, filename, Google Drive links, and AI model used
669
  - 🎯 **One Row Per Card**: Each business card becomes one spreadsheet row
670
  """
671
  )
env.example CHANGED
@@ -3,14 +3,19 @@
3
 
4
  # Google Gemini API Key (Required)
5
  # Get your key from: https://aistudio.google.com/
6
- # For Hugging Face Spaces: Add this as a Repository Secret named "Gemini_API"
7
  Gemini_API=your_gemini_api_key_here
8
 
9
- # Hugging Face Hub Integration (Optional - for file uploads to repository)
10
- # Get your token from: https://huggingface.co/settings/tokens
11
- # For Hugging Face Spaces: Add this as a Repository Secret
12
- HF_TOKEN=your_huggingface_token_here
 
 
 
 
13
 
14
  # Examples:
15
  # Gemini_API=AIzaSyBxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
16
- # HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
 
 
3
 
4
  # Google Gemini API Key (Required)
5
  # Get your key from: https://aistudio.google.com/
6
+ # For deployment: Add this as an environment variable named "Gemini_API"
7
  Gemini_API=your_gemini_api_key_here
8
 
9
+ # Google Drive API Credentials (Required - for file storage)
10
+ # Get these from Google Cloud Console:
11
+ # 1. Create a project at https://console.cloud.google.com/
12
+ # 2. Enable Google Drive API
13
+ # 3. Create OAuth 2.0 credentials (Desktop application)
14
+ # 4. Download the JSON and extract client_id and client_secret
15
+ GOOGLE_CLIENT_ID=your_google_client_id_here
16
+ GOOGLE_CLIENT_SECRET=your_google_client_secret_here
17
 
18
  # Examples:
19
  # Gemini_API=AIzaSyBxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
20
+ # GOOGLE_CLIENT_ID=1234567890-abcdefghijklmnopqrstuvwxyz.apps.googleusercontent.com
21
+ # GOOGLE_CLIENT_SECRET=GOCSPX-xxxxxxxxxxxxxxxxxxxxxxxx
google.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pickle
3
+ from google.auth.transport.requests import Request
4
+ from google_auth_oauthlib.flow import InstalledAppFlow
5
+ from googleapiclient.discovery import build
6
+ from googleapiclient.http import MediaFileUpload, MediaIoBaseUpload
7
+ import io
8
+ from pathlib import Path
9
+ import logging
10
+
11
# Module-level logger; handlers and levels are left to whoever imports this module.
logger = logging.getLogger(__name__)

# --- CONFIGURATION ---
# OAuth client credentials, read from the environment once at import time.
# Either value is None when its variable is unset.
CLIENT_ID = os.environ.get("GOOGLE_CLIENT_ID")
CLIENT_SECRET = os.environ.get("GOOGLE_CLIENT_SECRET")

# Google Drive destination folders, identified by folder ID.
EXPORTS_FOLDER_ID = "1k5iP4egzLrGJwnHkMhxt9bAkaCiieojO"  # Excel exports
IMAGES_FOLDER_ID = "1gd280IqcAzpAFTPeYsZjoBUOU9S7Zx3c"  # Business card images

# OAuth scopes requested when authorizing.
# NOTE(review): drive.file is a per-file scope -- confirm it grants write access
# to the pre-existing folders above.
SCOPES = ['https://www.googleapis.com/auth/drive.file']

# Local file in which pickled credentials are cached between runs.
TOKEN_PICKLE_FILE = 'token.pickle'
25
+
26
def get_drive_service():
    """Authenticate with Google and return a Drive v3 service object.

    Cached credentials are loaded from ``TOKEN_PICKLE_FILE`` when that file
    exists. If they are missing or not valid, they are refreshed (when expired
    and a refresh token is present) or obtained through an interactive OAuth
    flow served on a local port. Newly obtained or refreshed credentials are
    written back to ``TOKEN_PICKLE_FILE``.

    Returns:
        The object returned by ``build('drive', 'v3', credentials=creds)``.

    Raises:
        ValueError: If an interactive login is required but
            ``GOOGLE_CLIENT_ID`` or ``GOOGLE_CLIENT_SECRET`` is not set.
    """
    creds = None
    if os.path.exists(TOKEN_PICKLE_FILE):
        # SECURITY: unpickling runs code embedded in the file. Only ever load a
        # token file written by this application, never one from another source.
        try:
            with open(TOKEN_PICKLE_FILE, 'rb') as token:
                creds = pickle.load(token)
        except (OSError, EOFError, pickle.UnpicklingError,
                AttributeError, ImportError, IndexError) as e:
            # A truncated or incompatible cache must not block authentication;
            # discard it and fall through to a fresh login.
            logger.warning(f"Ignoring unreadable token cache {TOKEN_PICKLE_FILE}: {e}")
            creds = None

        # Guard against a cache that unpickles to something that is not a
        # credentials object (it would otherwise fail on `.valid` below).
        if creds is not None and not hasattr(creds, 'valid'):
            logger.warning(f"Ignoring unexpected content in token cache {TOKEN_PICKLE_FILE}")
            creds = None

    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            if not CLIENT_ID or not CLIENT_SECRET:
                raise ValueError("GOOGLE_CLIENT_ID and GOOGLE_CLIENT_SECRET environment variables are required")

            # Use a client_config dictionary instead of a client_secret.json file.
            client_config = {
                "installed": {
                    "client_id": CLIENT_ID,
                    "client_secret": CLIENT_SECRET,
                    "auth_uri": "https://accounts.google.com/o/oauth2/auth",
                    "token_uri": "https://oauth2.googleapis.com/token",
                    "redirect_uris": ["http://localhost"]
                }
            }
            flow = InstalledAppFlow.from_client_config(client_config, SCOPES)
            # port=0 lets the OS choose a free port for the OAuth redirect.
            creds = flow.run_local_server(port=0)

        # Save the credentials for the next run. The file holds a refresh
        # token, so create it readable/writable by the owner only.
        fd = os.open(TOKEN_PICKLE_FILE, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        with os.fdopen(fd, 'wb') as token:
            pickle.dump(creds, token)

    return build('drive', 'v3', credentials=creds)
60
+
61
def upload_file_to_drive(service, file_path=None, file_data=None, filename=None, folder_id=None, mimetype='application/octet-stream'):
    """
    Upload a file to Google Drive, optionally into a specific folder.

    The content comes from ``file_path`` when that path exists on disk;
    otherwise from ``file_data`` (which then requires ``filename``).

    Args:
        service: Google Drive service object
        file_path: Path to local file (for file uploads)
        file_data: Bytes data (for in-memory uploads)
        filename: Name for the file in Drive; defaults to the basename of
            ``file_path`` for path uploads
        folder_id: ID of the target folder; when falsy no parent is set
        mimetype: MIME type of the file

    Returns:
        dict: ``{'id', 'name', 'webViewLink'}`` for the created file, or None
        if the arguments were insufficient or the upload failed (failures are
        logged, not raised).
    """
    try:
        if file_path and os.path.exists(file_path):
            # Upload from local file
            if not filename:
                filename = os.path.basename(file_path)
            media = MediaFileUpload(file_path, mimetype=mimetype, resumable=True)
            logger.info(f"Uploading file from path: {file_path}")
        elif file_data and filename:
            # Upload from bytes data
            media = MediaIoBaseUpload(io.BytesIO(file_data), mimetype=mimetype, resumable=True)
            logger.info(f"Uploading file from memory: {filename}")
        else:
            if file_path:
                # Say why a supplied path was rejected instead of only
                # emitting the generic message below.
                logger.error(f"File path does not exist: {file_path}")
            logger.error("Either file_path or (file_data + filename) must be provided")
            return None

        # Define the file's metadata. Omit 'parents' entirely when no folder
        # was requested rather than sending an empty list.
        file_metadata = {'name': filename}
        if folder_id:
            file_metadata['parents'] = [folder_id]

        logger.info(f"Uploading '{filename}' to Google Drive folder {folder_id}")

        # Execute the upload request
        file = service.files().create(
            body=file_metadata,
            media_body=media,
            fields='id, webViewLink, name'
        ).execute()

        logger.info("✅ File uploaded successfully!")
        logger.info(f"   File ID: {file.get('id')}")
        logger.info(f"   File Name: {file.get('name')}")
        logger.info(f"   View Link: {file.get('webViewLink')}")

        return {
            'id': file.get('id'),
            'name': file.get('name'),
            'webViewLink': file.get('webViewLink')
        }

    except Exception as e:
        # Best-effort by design: callers receive None on failure. Log with the
        # traceback so the cause stays diagnosable.
        logger.exception(f"Failed to upload file to Google Drive: {e}")
        return None
121
+
122
def upload_excel_to_exports_folder(service, file_path=None, file_data=None, filename=None):
    """Send an Excel workbook to the Drive exports folder.

    Delegates to ``upload_file_to_drive`` with ``EXPORTS_FOLDER_ID`` as the
    target folder and the .xlsx MIME type, and returns whatever it returns.
    """
    xlsx_mimetype = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
    upload_kwargs = {
        'file_path': file_path,
        'file_data': file_data,
        'filename': filename,
        'folder_id': EXPORTS_FOLDER_ID,
        'mimetype': xlsx_mimetype,
    }
    return upload_file_to_drive(service, **upload_kwargs)
132
+
133
def upload_image_to_images_folder(service, file_path=None, file_data=None, filename=None, mimetype='image/png'):
    """Send an image to the Drive images folder.

    Delegates to ``upload_file_to_drive`` with ``IMAGES_FOLDER_ID`` as the
    target folder and the caller's ``mimetype`` (PNG by default), and returns
    whatever it returns.
    """
    upload_kwargs = {
        'file_path': file_path,
        'file_data': file_data,
        'filename': filename,
        'folder_id': IMAGES_FOLDER_ID,
        'mimetype': mimetype,
    }
    return upload_file_to_drive(service, **upload_kwargs)
143
+
144
def list_files_in_folder(service, folder_id, max_results=100):
    """List files whose parent is a specific Google Drive folder.

    Args:
        service: Google Drive service object (Drive API v3).
        folder_id: ID of the folder to list.
        max_results: Maximum number of files to return.

    Returns:
        list: Up to ``max_results`` file dicts with the fields ``id``,
        ``name``, ``size``, ``createdTime`` and ``webViewLink``; an empty list
        if the request fails (the failure is logged, not raised).
    """
    try:
        query = f"'{folder_id}' in parents"
        files = []
        page_token = None

        # Drive v3 names the page-size parameter `pageSize` (`maxResults` is
        # the v2 name and is rejected by the v3 client). Follow
        # nextPageToken until enough files are collected or pages run out.
        while len(files) < max_results:
            response = service.files().list(
                q=query,
                pageSize=min(max_results - len(files), 1000),
                pageToken=page_token,
                fields="nextPageToken, files(id, name, size, createdTime, webViewLink)"
            ).execute()

            files.extend(response.get('files', []))
            page_token = response.get('nextPageToken')
            if not page_token:
                break

        # A server may return more than requested on the final page; trim.
        files = files[:max_results]
        logger.info(f"Found {len(files)} files in folder {folder_id}")
        return files
    except Exception as e:
        logger.error(f"Failed to list files in folder {folder_id}: {e}")
        return []
160
+
161
if __name__ == '__main__':
    # Manual smoke test of the Google Drive connection: authenticate, then
    # count the files visible in both configured folders.
    #
    # NOTE(review): per the commit this module is named google.py; a module of
    # that name on sys.path can shadow the `google` namespace package that the
    # imports at the top of this file rely on -- confirm the imports resolve,
    # and consider renaming the module.

    # This module never configures logging itself, so without this call the
    # INFO-level success message below would be dropped when run as a script.
    logging.basicConfig(level=logging.INFO)

    try:
        drive_service = get_drive_service()
        logger.info("Google Drive service initialized successfully")

        # List files in both folders to verify access
        exports_files = list_files_in_folder(drive_service, EXPORTS_FOLDER_ID)
        images_files = list_files_in_folder(drive_service, IMAGES_FOLDER_ID)

        print(f"Exports folder contains {len(exports_files)} files")
        print(f"Images folder contains {len(images_files)} files")

    except Exception as e:
        logger.error(f"Failed to initialize Google Drive: {e}")
requirements.txt CHANGED
@@ -4,4 +4,6 @@ google-generativeai==0.8.0
4
  pandas==2.1.4
5
  openpyxl==3.1.2
6
  Pillow==10.2.0
7
- huggingface_hub>=0.19.0
 
 
 
4
  pandas==2.1.4
5
  openpyxl==3.1.2
6
  Pillow==10.2.0
7
+ google-auth==2.23.4
8
+ google-auth-oauthlib==1.1.0
9
+ google-api-python-client==2.108.0