rongo1
committed on
Commit
·
6ba44e8
1
Parent(s):
94cacbb
fix
Browse files- app.py +77 -16
- env.example +8 -2
- requirements.txt +2 -1
app.py
CHANGED
@@ -11,6 +11,7 @@ import base64
|
|
11 |
import logging
|
12 |
import sys
|
13 |
import shutil
|
|
|
14 |
|
15 |
# Configure logging
|
16 |
# Simplified logging for cloud deployment
|
@@ -35,13 +36,27 @@ if not gemini_api_key:
|
|
35 |
genai.configure(api_key=gemini_api_key)
|
36 |
logger.info("Gemini API configured successfully")
|
37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
# Create output directories
|
39 |
logger.info("Setting up output directories")
|
40 |
-
# Use
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
|
|
45 |
logger.info(f"Export directory created/verified: {output_dir}")
|
46 |
logger.info(f"Images directory created/verified: {images_dir}")
|
47 |
|
@@ -51,6 +66,25 @@ logger.info(f"Working directory: {os.getcwd()}")
|
|
51 |
logger.info(f"Export directory: {output_dir.absolute()}")
|
52 |
logger.info(f"Images directory: {images_dir.absolute()}")
|
53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
def extract_business_card_data_batch(images, filenames, model_name="gemini-2.5-flash"):
|
55 |
"""Extract data from multiple business card images in a single API call"""
|
56 |
|
@@ -276,6 +310,13 @@ def process_business_cards(images, model_name="gemini-2.5-flash", save_images=Tr
|
|
276 |
saved_image_paths.append(str(image_path))
|
277 |
logger.debug(f"Saved image {i+1}: {unique_filename}")
|
278 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
279 |
except Exception as e:
|
280 |
logger.error(f"Failed to save image {filename}: {e}")
|
281 |
|
@@ -450,6 +491,13 @@ def process_business_cards(images, model_name="gemini-2.5-flash", save_images=Tr
|
|
450 |
|
451 |
logger.info(f"Current run Excel file saved successfully: {current_filename}")
|
452 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
453 |
# Create a manifest file for the current run directory
|
454 |
manifest_filename = output_dir / f"current_run_{timestamp}_manifest.txt"
|
455 |
try:
|
@@ -465,7 +513,7 @@ def process_business_cards(images, model_name="gemini-2.5-flash", save_images=Tr
|
|
465 |
f.write(f"Save Images: {save_images}\n")
|
466 |
f.write(f"Processing Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
|
467 |
f.write(f"Session ID: {timestamp}\n")
|
468 |
-
f.write(f"
|
469 |
logger.info(f"Manifest file created: {manifest_filename}")
|
470 |
except Exception as e:
|
471 |
logger.error(f"Failed to create manifest file: {e}")
|
@@ -496,6 +544,13 @@ def process_business_cards(images, model_name="gemini-2.5-flash", save_images=Tr
|
|
496 |
|
497 |
logger.info(f"Cumulative Excel file saved successfully: {cumulative_filename}")
|
498 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
499 |
# Create a manifest file for the cumulative directory
|
500 |
manifest_filename = output_dir / f"all_business_cards_total_{timestamp}_manifest.txt"
|
501 |
try:
|
@@ -511,7 +566,7 @@ def process_business_cards(images, model_name="gemini-2.5-flash", save_images=Tr
|
|
511 |
f.write(f"Save Images: {save_images}\n")
|
512 |
f.write(f"Processing Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
|
513 |
f.write(f"Session ID: {timestamp}\n")
|
514 |
-
f.write(f"
|
515 |
logger.info(f"Manifest file created: {manifest_filename}")
|
516 |
except Exception as e:
|
517 |
logger.error(f"Failed to create manifest file: {e}")
|
@@ -645,15 +700,20 @@ with gr.Blocks(title="Business Card Data Extractor") as demo:
|
|
645 |
- π **Current Run**: Contains only the cards you just processed
|
646 |
- π **Total Database**: Contains ALL cards ever processed (cumulative)
|
647 |
|
648 |
-
|
649 |
-
- 💾
|
650 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
651 |
|
652 |
-
|
653 |
-
-
|
654 |
-
-
|
655 |
-
- Excel files are in `business_card_exports/`
|
656 |
-
- Images are in `business_cards/` (if save option enabled)
|
657 |
"""
|
658 |
)
|
659 |
|
@@ -712,7 +772,8 @@ with gr.Blocks(title="Business Card Data Extractor") as demo:
|
|
712 |
- π **Data Extraction**: Names, emails, phone numbers, addresses, and more
|
713 |
- π **Smart Combination**: Multiple emails/phones combined with commas
|
714 |
- π **Address Merging**: All phone types and address fields combined
|
715 |
-
- 💾 **
|
|
|
716 |
- π **Dual Output**: Current run + cumulative database files
|
717 |
- π **Full Tracking**: Processing date, filename, image path, and AI model used
|
718 |
- 🎯 **One Row Per Card**: Each business card becomes one spreadsheet row
|
|
|
11 |
import logging
|
12 |
import sys
|
13 |
import shutil
|
14 |
+
from huggingface_hub import HfApi
|
15 |
|
16 |
# Configure logging
|
17 |
# Simplified logging for cloud deployment
|
|
|
36 |
genai.configure(api_key=gemini_api_key)
|
37 |
logger.info("Gemini API configured successfully")
|
38 |
|
39 |
+
# Configure Hugging Face Hub API for file persistence
|
40 |
+
space_repo_id = "polygraf-ai/business_card_extractor" # Your Space repo
|
41 |
+
hf_token = os.getenv("HF_TOKEN") # Optional - only needed for Hub uploads
|
42 |
+
|
43 |
+
if hf_token:
|
44 |
+
hf_api = HfApi(token=hf_token)
|
45 |
+
logger.info("Hugging Face Hub API configured for file uploads")
|
46 |
+
logger.info(f"Space repo: {space_repo_id}")
|
47 |
+
else:
|
48 |
+
hf_api = None
|
49 |
+
logger.info("HF_TOKEN not found - files will be saved to persistent storage only")
|
50 |
+
logger.info("To enable Hub uploads: Set HF_TOKEN in Space secrets (optional)")
|
51 |
+
|
52 |
# Create output directories
|
53 |
logger.info("Setting up output directories")
|
54 |
+
# Use /data for persistent storage in Hugging Face Spaces
|
55 |
+
BASE_PATH = Path("/data")
|
56 |
+
output_dir = BASE_PATH / "business_card_exports"
|
57 |
+
images_dir = BASE_PATH / "business_cards"
|
58 |
+
output_dir.mkdir(parents=True, exist_ok=True)
|
59 |
+
images_dir.mkdir(parents=True, exist_ok=True)
|
60 |
logger.info(f"Export directory created/verified: {output_dir}")
|
61 |
logger.info(f"Images directory created/verified: {images_dir}")
|
62 |
|
|
|
66 |
logger.info(f"Export directory: {output_dir.absolute()}")
|
67 |
logger.info(f"Images directory: {images_dir.absolute()}")
|
68 |
|
69 |
+
def upload_to_hub(file_path, path_in_repo, commit_message="Update files"):
|
70 |
+
"""Upload a file to the Hugging Face Hub repository"""
|
71 |
+
if hf_api: # Only need API token, repo ID is hardcoded
|
72 |
+
try:
|
73 |
+
logger.info(f"Uploading {file_path} to Hub at {path_in_repo}")
|
74 |
+
hf_api.upload_file(
|
75 |
+
path_or_fileobj=str(file_path),
|
76 |
+
path_in_repo=path_in_repo,
|
77 |
+
repo_id=space_repo_id,
|
78 |
+
repo_type="space",
|
79 |
+
commit_message=commit_message
|
80 |
+
)
|
81 |
+
logger.info(f"Successfully uploaded {file_path} to Hub")
|
82 |
+
return True
|
83 |
+
except Exception as e:
|
84 |
+
logger.error(f"Failed to upload {file_path} to Hub: {e}")
|
85 |
+
return False
|
86 |
+
return False
|
87 |
+
|
88 |
def extract_business_card_data_batch(images, filenames, model_name="gemini-2.5-flash"):
|
89 |
"""Extract data from multiple business card images in a single API call"""
|
90 |
|
|
|
310 |
saved_image_paths.append(str(image_path))
|
311 |
logger.debug(f"Saved image {i+1}: {unique_filename}")
|
312 |
|
313 |
+
# Upload image to Hub
|
314 |
+
upload_to_hub(
|
315 |
+
image_path,
|
316 |
+
f"images/{unique_filename}",
|
317 |
+
f"Add business card image {unique_filename}"
|
318 |
+
)
|
319 |
+
|
320 |
except Exception as e:
|
321 |
logger.error(f"Failed to save image {filename}: {e}")
|
322 |
|
|
|
491 |
|
492 |
logger.info(f"Current run Excel file saved successfully: {current_filename}")
|
493 |
|
494 |
+
# Upload current run file to Hub
|
495 |
+
upload_to_hub(
|
496 |
+
current_filename,
|
497 |
+
f"exports/{current_filename.name}",
|
498 |
+
f"Add current run export {current_filename.name}"
|
499 |
+
)
|
500 |
+
|
501 |
# Create a manifest file for the current run directory
|
502 |
manifest_filename = output_dir / f"current_run_{timestamp}_manifest.txt"
|
503 |
try:
|
|
|
513 |
f.write(f"Save Images: {save_images}\n")
|
514 |
f.write(f"Processing Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
|
515 |
f.write(f"Session ID: {timestamp}\n")
|
516 |
+
f.write(f"Errors: {len(errors) if 'errors' in locals() else 0}\n")
|
517 |
logger.info(f"Manifest file created: {manifest_filename}")
|
518 |
except Exception as e:
|
519 |
logger.error(f"Failed to create manifest file: {e}")
|
|
|
544 |
|
545 |
logger.info(f"Cumulative Excel file saved successfully: {cumulative_filename}")
|
546 |
|
547 |
+
# Upload cumulative file to Hub
|
548 |
+
upload_to_hub(
|
549 |
+
cumulative_filename,
|
550 |
+
f"exports/{cumulative_filename.name}",
|
551 |
+
f"Update cumulative database - {len(cumulative_df)} total cards"
|
552 |
+
)
|
553 |
+
|
554 |
# Create a manifest file for the cumulative directory
|
555 |
manifest_filename = output_dir / f"all_business_cards_total_{timestamp}_manifest.txt"
|
556 |
try:
|
|
|
566 |
f.write(f"Save Images: {save_images}\n")
|
567 |
f.write(f"Processing Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
|
568 |
f.write(f"Session ID: {timestamp}\n")
|
569 |
+
f.write(f"Errors: {len(errors) if 'errors' in locals() else 0}\n")
|
570 |
logger.info(f"Manifest file created: {manifest_filename}")
|
571 |
except Exception as e:
|
572 |
logger.error(f"Failed to create manifest file: {e}")
|
|
|
700 |
- π **Current Run**: Contains only the cards you just processed
|
701 |
- π **Total Database**: Contains ALL cards ever processed (cumulative)
|
702 |
|
703 |
+
**π Persistent Storage & Hub Integration:**
|
704 |
+
- 💾 Files saved to `/data` directory (persistent across restarts)
|
705 |
+
- π Automatically uploaded to Hub repository (if configured)
|
706 |
+
- π Excel files: `/data/business_card_exports/`
|
707 |
+
- 🖼️ Images: `/data/business_cards/` (if save option enabled)
|
708 |
+
|
709 |
+
**π File Access:**
|
710 |
+
- Download directly from interface or check Space's Files tab
|
711 |
+
- Hub uploads available at: `exports/` and `images/` folders in repo
|
712 |
+
- Check `FOLDER_CONTENTS.txt` for complete file listing
|
713 |
|
714 |
+
**⚙️ Optional Hub Integration:**
|
715 |
+
- Set `HF_TOKEN` in Space secrets to enable file uploads to repository
|
716 |
+
- Files will be uploaded to `polygraf-ai/business_card_extractor`
|
|
|
|
|
717 |
"""
|
718 |
)
|
719 |
|
|
|
772 |
- π **Data Extraction**: Names, emails, phone numbers, addresses, and more
|
773 |
- π **Smart Combination**: Multiple emails/phones combined with commas
|
774 |
- π **Address Merging**: All phone types and address fields combined
|
775 |
+
- 💾 **Persistent Storage**: Files saved to `/data` directory (survives restarts)
|
776 |
+
- π **Hub Integration**: Auto-upload to repository (optional)
|
777 |
- π **Dual Output**: Current run + cumulative database files
|
778 |
- π **Full Tracking**: Processing date, filename, image path, and AI model used
|
779 |
- 🎯 **One Row Per Card**: Each business card becomes one spreadsheet row
|
env.example
CHANGED
@@ -6,5 +6,11 @@
|
|
6 |
# For Hugging Face Spaces: Add this as a Repository Secret named "Gemini_API"
|
7 |
Gemini_API=your_gemini_api_key_here
|
8 |
|
9 |
-
#
|
10 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
# For Hugging Face Spaces: Add this as a Repository Secret named "Gemini_API"
|
7 |
Gemini_API=your_gemini_api_key_here
|
8 |
|
9 |
+
# Hugging Face Hub Integration (Optional - for file uploads to repository)
|
10 |
+
# Get your token from: https://huggingface.co/settings/tokens
|
11 |
+
# For Hugging Face Spaces: Add this as a Repository Secret
|
12 |
+
HF_TOKEN=your_huggingface_token_here
|
13 |
+
|
14 |
+
# Examples:
|
15 |
+
# Gemini_API=AIzaSyBxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
|
16 |
+
# HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
|
requirements.txt
CHANGED
@@ -3,4 +3,5 @@ pydantic==2.10.6
|
|
3 |
google-generativeai==0.8.0
|
4 |
pandas==2.1.4
|
5 |
openpyxl==3.1.2
|
6 |
-
Pillow==10.2.0
|
|
|
|
3 |
google-generativeai==0.8.0
|
4 |
pandas==2.1.4
|
5 |
openpyxl==3.1.2
|
6 |
+
Pillow==10.2.0
|
7 |
+
huggingface_hub>=0.19.0
|