rongo1 committed on
Commit
6ba44e8
·
1 Parent(s): 94cacbb
Files changed (3) hide show
  1. app.py +77 -16
  2. env.example +8 -2
  3. requirements.txt +2 -1
app.py CHANGED
@@ -11,6 +11,7 @@ import base64
11
  import logging
12
  import sys
13
  import shutil
 
14
 
15
  # Configure logging
16
  # Simplified logging for cloud deployment
@@ -35,13 +36,27 @@ if not gemini_api_key:
35
  genai.configure(api_key=gemini_api_key)
36
  logger.info("Gemini API configured successfully")
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  # Create output directories
39
  logger.info("Setting up output directories")
40
- # Use absolute paths to avoid ambiguity in different environments
41
- output_dir = Path("business_card_exports").resolve()
42
- images_dir = Path("business_cards").resolve()
43
- output_dir.mkdir(exist_ok=True)
44
- images_dir.mkdir(exist_ok=True)
 
45
  logger.info(f"Export directory created/verified: {output_dir}")
46
  logger.info(f"Images directory created/verified: {images_dir}")
47
 
@@ -51,6 +66,25 @@ logger.info(f"Working directory: {os.getcwd()}")
51
  logger.info(f"Export directory: {output_dir.absolute()}")
52
  logger.info(f"Images directory: {images_dir.absolute()}")
53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  def extract_business_card_data_batch(images, filenames, model_name="gemini-2.5-flash"):
55
  """Extract data from multiple business card images in a single API call"""
56
 
@@ -276,6 +310,13 @@ def process_business_cards(images, model_name="gemini-2.5-flash", save_images=Tr
276
  saved_image_paths.append(str(image_path))
277
  logger.debug(f"Saved image {i+1}: {unique_filename}")
278
 
 
 
 
 
 
 
 
279
  except Exception as e:
280
  logger.error(f"Failed to save image {filename}: {e}")
281
 
@@ -450,6 +491,13 @@ def process_business_cards(images, model_name="gemini-2.5-flash", save_images=Tr
450
 
451
  logger.info(f"Current run Excel file saved successfully: {current_filename}")
452
 
 
 
 
 
 
 
 
453
  # Create a manifest file for the current run directory
454
  manifest_filename = output_dir / f"current_run_{timestamp}_manifest.txt"
455
  try:
@@ -465,7 +513,7 @@ def process_business_cards(images, model_name="gemini-2.5-flash", save_images=Tr
465
  f.write(f"Save Images: {save_images}\n")
466
  f.write(f"Processing Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
467
  f.write(f"Session ID: {timestamp}\n")
468
- f.write(f"Session Summary: {summary}\n") # Use the summary variable from the previous run
469
  logger.info(f"Manifest file created: {manifest_filename}")
470
  except Exception as e:
471
  logger.error(f"Failed to create manifest file: {e}")
@@ -496,6 +544,13 @@ def process_business_cards(images, model_name="gemini-2.5-flash", save_images=Tr
496
 
497
  logger.info(f"Cumulative Excel file saved successfully: {cumulative_filename}")
498
 
 
 
 
 
 
 
 
499
  # Create a manifest file for the cumulative directory
500
  manifest_filename = output_dir / f"all_business_cards_total_{timestamp}_manifest.txt"
501
  try:
@@ -511,7 +566,7 @@ def process_business_cards(images, model_name="gemini-2.5-flash", save_images=Tr
511
  f.write(f"Save Images: {save_images}\n")
512
  f.write(f"Processing Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
513
  f.write(f"Session ID: {timestamp}\n")
514
- f.write(f"Session Summary: {summary}\n") # Use the summary variable from the previous run
515
  logger.info(f"Manifest file created: {manifest_filename}")
516
  except Exception as e:
517
  logger.error(f"Failed to create manifest file: {e}")
@@ -645,15 +700,20 @@ with gr.Blocks(title="Business Card Data Extractor") as demo:
645
  - πŸ“ **Current Run**: Contains only the cards you just processed
646
  - 📊 **Total Database**: Contains ALL cards ever processed (cumulative)
647
 
648
- **Image Storage:**
649
- - 💾 **Optional**: Save uploaded images to business_cards folder
650
- - πŸ“ **Tracking**: Image file paths included in Excel database
 
 
 
 
 
 
 
651
 
652
- **📌 Finding Your Files in Hugging Face Spaces:**
653
- - All files are saved in the `business_card_exports` and `business_cards` folders
654
- - Check `FOLDER_CONTENTS.txt` in the Files tab to see a complete list of all files
655
- - Excel files are in `business_card_exports/`
656
- - Images are in `business_cards/` (if save option enabled)
657
  """
658
  )
659
 
@@ -712,7 +772,8 @@ with gr.Blocks(title="Business Card Data Extractor") as demo:
712
  - 📄 **Data Extraction**: Names, emails, phone numbers, addresses, and more
713
  - 📞 **Smart Combination**: Multiple emails/phones combined with commas
714
  - 🏠 **Address Merging**: All phone types and address fields combined
715
- - 💾 **Image Storage**: Optionally save images to business_cards folder
 
716
  - 📊 **Dual Output**: Current run + cumulative database files
717
  - πŸ“ **Full Tracking**: Processing date, filename, image path, and AI model used
718
  - 🎯 **One Row Per Card**: Each business card becomes one spreadsheet row
 
11
  import logging
12
  import sys
13
  import shutil
14
+ from huggingface_hub import HfApi
15
 
16
  # Configure logging
17
  # Simplified logging for cloud deployment
 
36
  genai.configure(api_key=gemini_api_key)
37
  logger.info("Gemini API configured successfully")
38
 
39
+ # Configure Hugging Face Hub API for file persistence
40
+ space_repo_id = "polygraf-ai/business_card_extractor" # Your Space repo
41
+ hf_token = os.getenv("HF_TOKEN") # Optional - only needed for Hub uploads
42
+
43
+ if hf_token:
44
+ hf_api = HfApi(token=hf_token)
45
+ logger.info("Hugging Face Hub API configured for file uploads")
46
+ logger.info(f"Space repo: {space_repo_id}")
47
+ else:
48
+ hf_api = None
49
+ logger.info("HF_TOKEN not found - files will be saved to persistent storage only")
50
+ logger.info("To enable Hub uploads: Set HF_TOKEN in Space secrets (optional)")
51
+
52
  # Create output directories
53
  logger.info("Setting up output directories")
54
+ # Use /data for persistent storage in Hugging Face Spaces
55
+ BASE_PATH = Path("/data")
56
+ output_dir = BASE_PATH / "business_card_exports"
57
+ images_dir = BASE_PATH / "business_cards"
58
+ output_dir.mkdir(parents=True, exist_ok=True)
59
+ images_dir.mkdir(parents=True, exist_ok=True)
60
  logger.info(f"Export directory created/verified: {output_dir}")
61
  logger.info(f"Images directory created/verified: {images_dir}")
62
 
 
66
  logger.info(f"Export directory: {output_dir.absolute()}")
67
  logger.info(f"Images directory: {images_dir.absolute()}")
68
 
69
+ def upload_to_hub(file_path, path_in_repo, commit_message="Update files"):
70
+ """Upload a file to the Hugging Face Hub repository"""
71
+ if hf_api: # Only need API token, repo ID is hardcoded
72
+ try:
73
+ logger.info(f"Uploading {file_path} to Hub at {path_in_repo}")
74
+ hf_api.upload_file(
75
+ path_or_fileobj=str(file_path),
76
+ path_in_repo=path_in_repo,
77
+ repo_id=space_repo_id,
78
+ repo_type="space",
79
+ commit_message=commit_message
80
+ )
81
+ logger.info(f"Successfully uploaded {file_path} to Hub")
82
+ return True
83
+ except Exception as e:
84
+ logger.error(f"Failed to upload {file_path} to Hub: {e}")
85
+ return False
86
+ return False
87
+
88
  def extract_business_card_data_batch(images, filenames, model_name="gemini-2.5-flash"):
89
  """Extract data from multiple business card images in a single API call"""
90
 
 
310
  saved_image_paths.append(str(image_path))
311
  logger.debug(f"Saved image {i+1}: {unique_filename}")
312
 
313
+ # Upload image to Hub
314
+ upload_to_hub(
315
+ image_path,
316
+ f"images/{unique_filename}",
317
+ f"Add business card image {unique_filename}"
318
+ )
319
+
320
  except Exception as e:
321
  logger.error(f"Failed to save image {filename}: {e}")
322
 
 
491
 
492
  logger.info(f"Current run Excel file saved successfully: {current_filename}")
493
 
494
+ # Upload current run file to Hub
495
+ upload_to_hub(
496
+ current_filename,
497
+ f"exports/{current_filename.name}",
498
+ f"Add current run export {current_filename.name}"
499
+ )
500
+
501
  # Create a manifest file for the current run directory
502
  manifest_filename = output_dir / f"current_run_{timestamp}_manifest.txt"
503
  try:
 
513
  f.write(f"Save Images: {save_images}\n")
514
  f.write(f"Processing Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
515
  f.write(f"Session ID: {timestamp}\n")
516
+ f.write(f"Errors: {len(errors) if 'errors' in locals() else 0}\n")
517
  logger.info(f"Manifest file created: {manifest_filename}")
518
  except Exception as e:
519
  logger.error(f"Failed to create manifest file: {e}")
 
544
 
545
  logger.info(f"Cumulative Excel file saved successfully: {cumulative_filename}")
546
 
547
+ # Upload cumulative file to Hub
548
+ upload_to_hub(
549
+ cumulative_filename,
550
+ f"exports/{cumulative_filename.name}",
551
+ f"Update cumulative database - {len(cumulative_df)} total cards"
552
+ )
553
+
554
  # Create a manifest file for the cumulative directory
555
  manifest_filename = output_dir / f"all_business_cards_total_{timestamp}_manifest.txt"
556
  try:
 
566
  f.write(f"Save Images: {save_images}\n")
567
  f.write(f"Processing Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
568
  f.write(f"Session ID: {timestamp}\n")
569
+ f.write(f"Errors: {len(errors) if 'errors' in locals() else 0}\n")
570
  logger.info(f"Manifest file created: {manifest_filename}")
571
  except Exception as e:
572
  logger.error(f"Failed to create manifest file: {e}")
 
700
  - πŸ“ **Current Run**: Contains only the cards you just processed
701
  - 📊 **Total Database**: Contains ALL cards ever processed (cumulative)
702
 
703
+ **🔄 Persistent Storage & Hub Integration:**
704
+ - 💾 Files saved to `/data` directory (persistent across restarts)
705
+ - 🌐 Automatically uploaded to Hub repository (if configured)
706
+ - 📂 Excel files: `/data/business_card_exports/`
707
+ - πŸ–ΌοΈ Images: `/data/business_cards/` (if save option enabled)
708
+
709
+ **📌 File Access:**
710
+ - Download directly from interface or check Space's Files tab
711
+ - Hub uploads available at: `exports/` and `images/` folders in repo
712
+ - Check `FOLDER_CONTENTS.txt` for complete file listing
713
 
714
+ **βš™οΈ Optional Hub Integration:**
715
+ - Set `HF_TOKEN` in Space secrets to enable file uploads to repository
716
+ - Files will be uploaded to `polygraf-ai/business_card_extractor`
 
 
717
  """
718
  )
719
 
 
772
  - 📄 **Data Extraction**: Names, emails, phone numbers, addresses, and more
773
  - 📞 **Smart Combination**: Multiple emails/phones combined with commas
774
  - 🏠 **Address Merging**: All phone types and address fields combined
775
+ - 💾 **Persistent Storage**: Files saved to `/data` directory (survives restarts)
776
+ - 🌐 **Hub Integration**: Auto-upload to repository (optional)
777
  - 📊 **Dual Output**: Current run + cumulative database files
778
  - πŸ“ **Full Tracking**: Processing date, filename, image path, and AI model used
779
  - 🎯 **One Row Per Card**: Each business card becomes one spreadsheet row
env.example CHANGED
@@ -6,5 +6,11 @@
6
  # For Hugging Face Spaces: Add this as a Repository Secret named "Gemini_API"
7
  Gemini_API=your_gemini_api_key_here
8
 
9
- # Example:
10
- # Gemini_API=AIzaSyBxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
 
 
 
 
 
 
 
6
  # For Hugging Face Spaces: Add this as a Repository Secret named "Gemini_API"
7
  Gemini_API=your_gemini_api_key_here
8
 
9
+ # Hugging Face Hub Integration (Optional - for file uploads to repository)
10
+ # Get your token from: https://huggingface.co/settings/tokens
11
+ # For Hugging Face Spaces: Add this as a Repository Secret
12
+ HF_TOKEN=your_huggingface_token_here
13
+
14
+ # Examples:
15
+ # Gemini_API=AIzaSyBxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
16
+ # HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
requirements.txt CHANGED
@@ -3,4 +3,5 @@ pydantic==2.10.6
3
  google-generativeai==0.8.0
4
  pandas==2.1.4
5
  openpyxl==3.1.2
6
- Pillow==10.2.0
 
 
3
  google-generativeai==0.8.0
4
  pandas==2.1.4
5
  openpyxl==3.1.2
6
+ Pillow==10.2.0
7
+ huggingface_hub>=0.19.0