File size: 36,438 Bytes
29d41e1
 
 
 
 
 
 
 
 
 
 
 
dae9b98
 
50ef2fd
 
 
 
 
 
 
 
 
dae9b98
3139aea
 
 
 
 
 
 
 
 
 
29d41e1
 
 
 
 
 
 
 
 
 
 
87132f1
 
29d41e1
 
 
87132f1
dae9b98
29d41e1
 
87132f1
29d41e1
dae9b98
 
46dec01
dae9b98
 
 
 
 
 
29d41e1
 
dae9b98
29d41e1
dae9b98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6ba44e8
29d41e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87132f1
29d41e1
 
 
 
 
 
 
 
 
 
 
87132f1
29d41e1
87132f1
29d41e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d6f9d21
657f105
29d41e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d6f9d21
657f105
29d41e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50ef2fd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29d41e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dae9b98
29d41e1
 
 
 
 
 
 
 
 
 
 
 
 
 
50ef2fd
 
 
 
29d41e1
50ef2fd
29d41e1
 
 
 
 
 
 
 
dae9b98
29d41e1
dae9b98
29d41e1
 
 
 
 
 
 
 
 
 
dae9b98
 
 
 
29d41e1
dae9b98
 
 
 
 
 
 
 
 
6ba44e8
29d41e1
 
dae9b98
29d41e1
dae9b98
29d41e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dae9b98
29d41e1
dae9b98
 
 
29d41e1
dae9b98
29d41e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dae9b98
 
 
 
 
29d41e1
dae9b98
 
29d41e1
3139aea
dae9b98
3139aea
 
dae9b98
3139aea
 
 
 
dae9b98
3139aea
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dae9b98
 
 
3139aea
dae9b98
3139aea
 
29d41e1
 
 
dae9b98
29d41e1
dae9b98
29d41e1
 
 
 
 
 
 
 
 
 
 
 
dae9b98
d9decab
dae9b98
 
 
 
6ba44e8
29d41e1
dae9b98
29d41e1
 
 
dae9b98
29d41e1
dae9b98
29d41e1
 
 
 
 
 
 
 
 
 
 
 
dae9b98
d9decab
dae9b98
 
 
 
6ba44e8
29d41e1
dae9b98
29d41e1
 
dae9b98
 
 
29d41e1
 
 
 
657f105
87132f1
29d41e1
 
 
dae9b98
 
29d41e1
dae9b98
29d41e1
dae9b98
 
3139aea
 
 
 
 
 
 
 
29d41e1
dae9b98
 
 
 
 
 
 
 
d9decab
29d41e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dae9b98
 
29d41e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dae9b98
 
 
 
 
6ba44e8
 
dae9b98
 
 
 
d9decab
dae9b98
 
 
29d41e1
 
 
 
 
50ef2fd
 
 
 
 
29d41e1
 
 
50ef2fd
29d41e1
 
 
657f105
2b81079
1fe0c11
29d41e1
 
 
 
1fe0c11
29d41e1
 
 
 
 
 
 
 
 
1fe0c11
29d41e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
657f105
29d41e1
 
 
 
dae9b98
 
29d41e1
dae9b98
29d41e1
 
 
 
0c68bd6
 
1e9afab
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
import gradio as gr
import google.generativeai as genai
import json
import pandas as pd
from datetime import datetime
import os
from pathlib import Path
from PIL import Image
import io
import base64
import logging
import sys
import tempfile

# Import and register HEIF support.
# pillow-heif is an optional dependency: when it is importable its opener is
# registered with Pillow, otherwise the app continues without HEIF/HEIC support.
# HEIF_SUPPORTED records which of the two happened.
try:
    from pillow_heif import register_heif_opener
    register_heif_opener()
    HEIF_SUPPORTED = True
except ImportError:
    HEIF_SUPPORTED = False
    # Log through a named logger instead of the module-level logging.warning():
    # that convenience function implicitly calls logging.basicConfig() when the
    # root logger has no handlers, which would turn the explicit
    # logging.basicConfig(level=INFO, ...) call further down this module into a
    # no-op and silently drop INFO messages and the custom format.
    logging.getLogger(__name__).warning("pillow-heif not installed. HEIF/HEIC support disabled.")

# Import Google Drive functionality
from google_funcs import (
    get_drive_service, 
    upload_excel_to_exports_folder, 
    upload_image_to_images_folder, 
    list_files_in_folder,
    download_file_from_drive,
    get_existing_cumulative_file,
    cleanup_duplicate_cumulative_files,
    delete_file_from_drive
)

# Configure logging: INFO and above go to stdout, tagged with the emitting
# function name and line number.
# NOTE: logging.basicConfig() does nothing if the root logger already has
# handlers at this point.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(funcName)s:%(lineno)d - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger(__name__)

# Configure AI API. The key is read from the "Gemini_API" environment variable;
# startup is aborted when it is missing or empty.
logger.info("Configuring AI API")
gemini_api_key = os.getenv("Gemini_API")
if not gemini_api_key:
    logger.error("Gemini_API environment variable not found!")
    logger.error("Please set the Gemini_API environment variable with your AI API key")
    raise ValueError("❌ Gemini_API environment variable is required. Please set it in your environment.")

genai.configure(api_key=gemini_api_key)
logger.info("AI API configured successfully")

# Initialize Google Drive service. `drive_service` is the module-level handle
# the upload helpers below pass to the google_funcs functions.
logger.info("Initializing Google Drive service")
try:
    drive_service = get_drive_service()
    logger.info("Google Drive service initialized successfully")
except Exception as e:
    logger.error(f"Failed to initialize Google Drive service: {e}")
    logger.error("Please ensure GOOGLE_CLIENT_ID and GOOGLE_CLIENT_SECRET environment variables are set")
    # Chain the original error so the traceback shows it as the direct cause
    # of the startup failure.
    raise ValueError("❌ Google Drive credentials are required. Please set GOOGLE_CLIENT_ID and GOOGLE_CLIENT_SECRET environment variables.") from e

# Log startup
logger.info("Business Card Data Extractor starting up with Google Drive storage")

def upload_to_google_drive(file_path, is_excel=False, filename=None):
    """Send a file on disk to Google Drive.

    Excel files go through ``upload_excel_to_exports_folder``; everything else
    goes through ``upload_image_to_images_folder``. Both are called with the
    module-level ``drive_service``.

    Args:
        file_path: Path of the local file to upload.
        is_excel: Selects the Excel uploader when true, the image uploader otherwise.
        filename: Optional name forwarded to the uploader; also used in log
            messages in preference to ``file_path``.

    Returns:
        The uploader's result when it is truthy, otherwise ``None``. Any
        exception (including a result without a ``'webViewLink'`` key) is
        logged and also yields ``None``.
    """
    try:
        # Log first, then pick the uploader for this file kind.
        if is_excel:
            logger.info(f"Uploading Excel file to Google Drive: {filename or file_path}")
            uploader = upload_excel_to_exports_folder
        else:
            logger.info(f"Uploading image file to Google Drive: {filename or file_path}")
            uploader = upload_image_to_images_folder

        result = uploader(drive_service, file_path=file_path, filename=filename)

        # Guard clause: a falsy result means the uploader reported failure.
        if not result:
            logger.error("Failed to upload to Google Drive")
            return None

        logger.info(f"Successfully uploaded to Google Drive: {result['webViewLink']}")
        return result
    except Exception as e:
        logger.error(f"Failed to upload to Google Drive: {e}")
        return None

def upload_bytes_to_google_drive(file_data, filename, is_excel=False):
    """Send in-memory file content to Google Drive.

    Excel data goes through ``upload_excel_to_exports_folder``; everything else
    goes through ``upload_image_to_images_folder``. Both are called with the
    module-level ``drive_service``.

    Args:
        file_data: The file content, forwarded as ``file_data`` to the uploader.
        filename: Name forwarded to the uploader and used in log messages.
        is_excel: Selects the Excel uploader when true, the image uploader otherwise.

    Returns:
        The uploader's result when it is truthy, otherwise ``None``. Any
        exception (including a result without a ``'webViewLink'`` key) is
        logged and also yields ``None``.
    """
    try:
        # Log first, then pick the uploader for this data kind.
        if is_excel:
            logger.info(f"Uploading Excel data to Google Drive: {filename}")
            uploader = upload_excel_to_exports_folder
        else:
            logger.info(f"Uploading image data to Google Drive: {filename}")
            uploader = upload_image_to_images_folder

        result = uploader(drive_service, file_data=file_data, filename=filename)

        # Guard clause: a falsy result means the uploader reported failure.
        if not result:
            logger.error("Failed to upload to Google Drive")
            return None

        logger.info(f"Successfully uploaded to Google Drive: {result['webViewLink']}")
        return result
    except Exception as e:
        logger.error(f"Failed to upload to Google Drive: {e}")
        return None

def _strip_code_fence(text):
    """Return *text* without a surrounding Markdown code fence, if present."""
    cleaned = text.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned[3:]
        # The opening fence may carry a language tag such as ```json.
        if cleaned[:4].lower() == "json":
            cleaned = cleaned[4:]
        logger.debug("Removed ``` prefix")
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
        logger.debug("Removed ``` suffix")
    return cleaned.strip()


def _attach_card_metadata(response_data, filenames, model_name):
    """Wrap a non-list response in a list and tag each card with method and filename."""
    if not isinstance(response_data, list):
        logger.debug("Response is not an array, converting to array")
        response_data = [response_data]

    logger.info(f"Response contains {len(response_data)} extracted card data objects")

    # Use user-friendly model name for Excel
    method_label = "Speed-Optimized model" if "flash" in model_name else "Accuracy-Optimized model"

    logger.debug("Adding metadata to extracted data")
    for i, data in enumerate(response_data):
        data['method'] = method_label
        # The response may hold more entries than filenames were supplied.
        if i < len(filenames):
            data['filename'] = filenames[i]
            logger.debug(f"Added metadata to card {i+1}: {filenames[i]}")
    return response_data


def extract_business_card_data_batch(images, filenames, model_name="gemini-2.5-flash"):
    """Extract data from multiple business card images in a single API call.

    The prompt templates are read from ``prompts/prompt.txt`` and
    ``prompts/system_prompt.txt``, every image is PNG-encoded and sent to the
    model together with a batch prompt, and the JSON answer is parsed. If the
    raw answer is not valid JSON, a surrounding Markdown code fence is stripped
    and parsing is retried once.

    Args:
        images: Image objects exposing ``save(buffer, format="PNG")``
            (presumably PIL images - confirm against callers).
        filenames: Names matched to the response entries by position; entries
            beyond ``len(filenames)`` get no ``'filename'`` key.
        model_name: Model identifier passed to ``genai.GenerativeModel``. Names
            containing ``"flash"`` are labelled "Speed-Optimized model", all
            others "Accuracy-Optimized model".

    Returns:
        A list of parsed card objects, each with a ``'method'`` key and, where
        a filename is available, a ``'filename'`` key. An empty ``images``
        sequence returns ``[]`` without loading prompts or calling the API.

    Raises:
        FileNotFoundError: A prompt file is missing.
        json.JSONDecodeError: The response is not valid JSON even after
            code-fence removal.
        Exception: Errors from model configuration, image encoding or the API
            call are logged and re-raised unchanged.
    """
    logger.info(f"Starting batch extraction for {len(images)} images using model: {model_name}")
    logger.debug(f"Filenames in batch: {filenames}")

    # Nothing to extract: avoid a pointless (and billable) API round trip.
    if not images:
        logger.warning("No images supplied for batch extraction; skipping API call")
        return []

    # Load prompts
    logger.debug("Loading prompt templates")
    try:
        with open("prompts/prompt.txt", "r", encoding="utf-8") as f:
            prompt_template = f.read()
        logger.debug(f"Loaded prompt template ({len(prompt_template)} characters)")

        with open("prompts/system_prompt.txt", "r", encoding="utf-8") as f:
            system_prompt = f.read()
        logger.debug(f"Loaded system prompt ({len(system_prompt)} characters)")
    except FileNotFoundError as e:
        logger.error(f"Failed to load prompt files: {e}")
        raise

    # Configure model
    logger.debug(f"Configuring AI model: {model_name}")
    generation_config = {
        "temperature": 0.1,
        "response_mime_type": "application/json"
    }

    try:
        model = genai.GenerativeModel(
            model_name=model_name,
            generation_config=generation_config,
            system_instruction=system_prompt
        )
        logger.debug("AI model configured successfully")
    except Exception as e:
        logger.error(f"Failed to configure AI model: {e}")
        raise

    # Prepare multiple images for the model
    logger.debug("Preparing content parts for API request")
    content_parts = []

    # Add the prompt first
    batch_prompt = f"""
{prompt_template}

I'm sending you {len(images)} business card images. Please extract the data from each card and return a JSON array with {len(images)} objects. Each object should contain the extracted data for one business card in the same order as the images.

Return format: [card1_data, card2_data, card3_data, ...]
    """
    content_parts.append(batch_prompt)
    logger.debug(f"Added batch prompt ({len(batch_prompt)} characters)")

    # Add each image, preceded by a text label so the model can keep the order
    logger.debug("Converting and adding images to request")
    for i, image in enumerate(images):
        try:
            buffered = io.BytesIO()
            image.save(buffered, format="PNG")
            img_base64 = base64.b64encode(buffered.getvalue()).decode()

            image_part = {
                "mime_type": "image/png",
                "data": img_base64
            }
            content_parts.append(f"Business Card {i+1}:")
            content_parts.append(image_part)
            logger.debug(f"Added image {i+1} ({len(img_base64)} base64 characters)")
        except Exception as e:
            logger.error(f"Failed to process image {i+1} ({filenames[i] if i < len(filenames) else 'unknown'}): {e}")
            raise

    # Generate content
    logger.info(f"Making API call to {model_name} with {len(content_parts)} content parts")
    try:
        response = model.generate_content(content_parts)
        # Read the text once and treat a missing body as empty, so the logging
        # below cannot fail on len(None).
        # NOTE(review): the SDK's .text accessor may itself raise for blocked or
        # empty responses - that case is still reported by the handler below.
        response_text = response.text or ""
        logger.info(f"API call successful. Response length: {len(response_text)} characters")
        preview = f"{response_text[:500]}..." if len(response_text) > 500 else response_text
        logger.debug(f"Raw response: {preview}")
    except Exception as e:
        logger.error(f"API call failed: {e}")
        raise

    # Parse response
    logger.debug("Parsing JSON response")
    was_cleaned = False
    try:
        response_data = json.loads(response_text)
        logger.info("Successfully parsed JSON response")
    except json.JSONDecodeError as e:
        logger.warning(f"Initial JSON parsing failed: {e}. Attempting to clean response.")
        # Try to clean the response (models sometimes wrap JSON in a code fence)
        text = _strip_code_fence(response_text)
        try:
            response_data = json.loads(text)
        except json.JSONDecodeError as e2:
            logger.error(f"Failed to parse even cleaned JSON response: {e2}")
            logger.error(f"Cleaned text: {text[:1000]}...")
            raise
        logger.info("Successfully parsed cleaned JSON response")
        was_cleaned = True

    # Shared post-processing for both the direct and the cleaned parse path
    response_data = _attach_card_metadata(response_data, filenames, model_name)

    if len(response_data) != len(images):
        logger.warning(
            f"Model returned {len(response_data)} card objects for {len(images)} images; "
            "filename assignment may be misaligned"
        )

    if was_cleaned:
        logger.info(f"Batch extraction completed successfully after cleaning for {len(response_data)} cards")
    else:
        logger.info(f"Batch extraction completed successfully for {len(response_data)} cards")
    return response_data

def extract_business_card_data(image, model_name="gemini-2.5-flash"):
    """Legacy single-image entry point built on the batch extractor.

    Wraps ``image`` in a one-element batch labelled ``"single_card"`` and
    delegates to ``extract_business_card_data_batch``.

    Args:
        image: The business card image to process.
        model_name: Model identifier forwarded to the batch extractor.

    Returns:
        The first element of the batch result, or ``None`` when the batch
        result is empty or otherwise falsy.
    """
    logger.debug(f"Single card extraction called with model: {model_name}")
    cards = extract_business_card_data_batch([image], ["single_card"], model_name)

    # Guard clause for the empty case keeps the success path unindented.
    if not cards:
        logger.warning("Single card extraction returned no results")
        return None

    logger.debug("Single card extraction successful")
    return cards[0]

def convert_image_for_processing(image, filename):
    """Re-encode an image as an RGB JPEG held in memory.

    Args:
        image: Image object exposing ``mode``, ``convert`` and ``save``.
        filename: Used only in the warning logged when conversion fails.

    Returns:
        The image re-opened from an in-memory JPEG (quality 95). If any step
        raises, a warning is logged and the image held at that moment is
        returned: the input itself, or its RGB-converted copy when the mode
        conversion had already succeeded.
    """
    # Tracks the image to hand back if a later step fails.
    current = image
    try:
        # HEIF images might be in different modes; normalise anything that is not RGB.
        if current.mode != 'RGB':
            current = current.convert('RGB')

        # Round-trip through an in-memory JPEG and rewind before re-opening.
        jpeg_stream = io.BytesIO()
        current.save(jpeg_stream, format='JPEG', quality=95)
        jpeg_stream.seek(0)
        return Image.open(jpeg_stream)
    except Exception as e:
        logger.warning(f"Could not convert {filename}: {str(e)}. Using original image.")
        return current

def _excel_column_letter(col_idx):
    """Return the Excel column label for a zero-based index (0 -> 'A', 26 -> 'AA')."""
    letters = ""
    remaining = col_idx + 1
    while remaining > 0:
        remaining, offset = divmod(remaining - 1, 26)
        letters = chr(65 + offset) + letters
    return letters

def _remove_temp_file(path):
    """Best-effort removal of a temporary file; failures are only logged."""
    try:
        os.unlink(path)
    except OSError as e:
        logger.debug(f"Could not remove temporary file {path}: {e}")

def _write_excel_autofit(df, path, sheet_name, label):
    """Write *df* to a single-sheet workbook at *path* and size each column to its content."""
    with pd.ExcelWriter(path, engine='openpyxl') as writer:
        df.to_excel(writer, index=False, sheet_name=sheet_name)
        logger.debug(f"Written {len(df)} rows to '{sheet_name}' sheet")

        # Auto-adjust column widths (capped at 50 characters)
        logger.debug(f"Auto-adjusting column widths for {label} file")
        worksheet = writer.sheets[sheet_name]
        # Iterate by position so duplicate or non-string column names are safe.
        for col_idx, column in enumerate(df.columns):
            longest_value = df.iloc[:, col_idx].astype(str).map(len).max()
            column_length = max(longest_value, len(str(column)))
            final_width = min(column_length + 2, 50)
            worksheet.column_dimensions[_excel_column_letter(col_idx)].width = final_width

def _normalize_card_fields(data, card_filename):
    """Flatten list values and fold mobile_phones/street into phones/address, in place."""
    # Handle multiple values (emails, phones) by joining with commas.
    # Only existing keys are reassigned, so mutating during iteration is safe.
    list_fields_processed = []
    for key, value in data.items():
        if isinstance(value, list):
            original_count = len(value)
            data[key] = ', '.join(str(v) for v in value)
            list_fields_processed.append(f"{key}({original_count})")
            logger.debug(f"Combined {original_count} {key} values for {card_filename}")

    if list_fields_processed:
        logger.debug(f"List fields processed for {card_filename}: {list_fields_processed}")

    # Combine phone fields if they exist separately
    if data.get('mobile_phones'):
        logger.debug(f"Combining phone fields for {card_filename}")
        if data.get('phones'):
            existing_phones = str(data['phones'])
            mobile_phones = str(data['mobile_phones'])
            combined = [p for p in [existing_phones, mobile_phones] if p and p != 'null']
            data['phones'] = ', '.join(combined)
            logger.debug(f"Combined phones for {card_filename}: {data['phones']}")
        else:
            data['phones'] = data['mobile_phones']
            logger.debug(f"Used mobile phones as phones for {card_filename}: {data['phones']}")
        del data['mobile_phones']  # Remove separate mobile field

    # Combine address fields if they exist separately
    if data.get('street'):
        logger.debug(f"Combining address fields for {card_filename}")
        if data.get('address'):
            # If both exist and differ, prefix the address with the street
            if str(data['street']) != str(data['address']) and data['street'] != 'null':
                original_address = data['address']
                data['address'] = f"{data['street']}, {data['address']}"
                logger.debug(f"Combined address for {card_filename}: '{data['street']}' + '{original_address}' = '{data['address']}'")
        else:
            data['address'] = data['street']
            logger.debug(f"Used street as address for {card_filename}: {data['address']}")
        del data['street']  # Remove separate street field

def process_business_cards(images, model_name="gemini-2.5-flash", save_images=True):
    """Process business card images and produce current-run and cumulative Excel files.

    Images are loaded, optionally uploaded to Google Drive, sent to the batch
    extractor in groups of five, normalised into one row per card, merged with
    the existing cumulative workbook from Google Drive (if any), and written to
    two workbooks that are uploaded back to Google Drive.

    Args:
        images: Sequence of image file paths (``str``) or already-opened image
            objects. Falsy input short-circuits with a user-facing message.
        model_name: Model identifier passed to the batch extractor; names
            containing "flash" are reported as the "Speed-Optimized model".
        save_images: When true, each loaded image is uploaded to Google Drive
            and its link is stored under ``google_drive_image_link``.

    Returns:
        A 4-tuple ``(current_path, cumulative_path, summary, preview_df)``:
        paths of the two temporary ``.xlsx`` files, a human-readable summary,
        and the first 10 rows of the current run. When there is nothing to
        process, or nothing could be extracted, the tuple is
        ``(None, None, message, None)``.

    Raises:
        Exception: Re-raised when creating or uploading either Excel file fails.
    """
    logger.info("Starting business card processing session")
    logger.info(f"Number of images received: {len(images) if images else 0}")
    logger.info(f"Model selected: {model_name}")
    logger.info(f"Save images option: {save_images}")

    if not images:
        logger.warning("No images provided for processing")
        return None, None, "Please upload at least one business card image.", None

    all_data = []
    errors = []

    # Prepare images for batch processing
    logger.info("Preparing images for batch processing")
    image_batches = []
    filename_batches = []
    batch_size = 5
    logger.debug(f"Using batch size: {batch_size}")

    # loaded_images, filenames and (when saving) uploaded_image_links are
    # index-aligned; the per-card Drive link lookup below relies on that.
    loaded_images = []
    filenames = []
    uploaded_image_links = []

    logger.info(f"Loading {len(images)} images")
    for idx, image_path in enumerate(images):
        try:
            # Load image
            if isinstance(image_path, str):
                logger.debug(f"Loading image {idx+1}: {image_path}")
                image = Image.open(image_path)
                filename = os.path.basename(image_path)
            else:
                logger.debug(f"Using direct image object {idx+1}")
                image = image_path
                filename = f"image_{idx+1}.png"

            # Convert image for better compatibility (especially for HEIF/HEIC)
            converted_image = convert_image_for_processing(image, filename)

            loaded_images.append(converted_image)
            filenames.append(filename)
            logger.debug(f"Successfully loaded image {idx+1}: {filename} (size: {converted_image.size})")

        except Exception as e:
            error_msg = f"Error loading {image_path}: {str(e)}"
            logger.error(error_msg)
            errors.append(error_msg)

    logger.info(f"Successfully loaded {len(loaded_images)} out of {len(images)} images")

    # Save images to Google Drive if requested
    if save_images and loaded_images:
        logger.info(f"Saving {len(loaded_images)} images to Google Drive")
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        for i, (image, filename) in enumerate(zip(loaded_images, filenames)):
            try:
                # Create unique filename with timestamp
                name, ext = os.path.splitext(filename)
                if not ext:
                    ext = '.png'
                unique_filename = f"{timestamp}_{i+1:03d}_{name}{ext}"

                # NOTE(review): the bytes are always PNG-encoded even when the
                # original extension (kept in unique_filename) is not .png.
                img_buffer = io.BytesIO()
                image.save(img_buffer, format='PNG')
                img_bytes = img_buffer.getvalue()

                # Upload to Google Drive
                result = upload_bytes_to_google_drive(img_bytes, unique_filename, is_excel=False)

                if result:
                    uploaded_image_links.append(result['webViewLink'])
                    logger.debug(f"Saved image {i+1}: {unique_filename}")
                else:
                    uploaded_image_links.append(None)
                    logger.error(f"Failed to upload image {unique_filename}")

            except Exception as e:
                # Append a placeholder so the list stays aligned with loaded_images.
                logger.error(f"Failed to save image {filename}: {e}")
                uploaded_image_links.append(None)

        logger.info(f"Successfully uploaded {sum(1 for link in uploaded_image_links if link)} images to Google Drive")

    # Group into batches
    logger.info(f"Grouping {len(loaded_images)} images into batches of {batch_size}")
    for i in range(0, len(loaded_images), batch_size):
        batch_images = loaded_images[i:i + batch_size]
        batch_filenames = filenames[i:i + batch_size]
        image_batches.append(batch_images)
        filename_batches.append(batch_filenames)
        logger.debug(f"Created batch {len(image_batches)} with {len(batch_images)} images: {batch_filenames}")

    logger.info(f"Created {len(image_batches)} batches for processing")

    # Process each batch; a failing batch is recorded and the rest continue.
    logger.info(f"Starting processing of {len(image_batches)} batches")
    for batch_idx, (batch_images, batch_filenames) in enumerate(zip(image_batches, filename_batches)):
        try:
            logger.info(f"Processing batch {batch_idx + 1}/{len(image_batches)} ({len(batch_images)} cards)")
            print(f"Processing batch {batch_idx + 1}/{len(image_batches)} ({len(batch_images)} cards)")

            # Extract data for the entire batch
            logger.debug(f"Calling batch extraction for batch {batch_idx + 1}")
            batch_data = extract_business_card_data_batch(batch_images, batch_filenames, model_name)
            logger.info(f"Batch {batch_idx + 1} extraction completed, got {len(batch_data)} results")

            # Process each card's data in the batch
            logger.debug(f"Processing individual card data for batch {batch_idx + 1}")
            for i, data in enumerate(batch_data):
                card_filename = batch_filenames[i] if i < len(batch_filenames) else f"card_{i+1}"
                logger.debug(f"Processing card data for: {card_filename}")

                # Add timestamp to data
                processed_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                data['processed_date'] = processed_at
                logger.debug(f"Added timestamp {processed_at} to {card_filename}")

                # Add Google Drive image link if images were saved
                global_index = batch_idx * batch_size + i
                if save_images and global_index < len(uploaded_image_links) and uploaded_image_links[global_index]:
                    data['google_drive_image_link'] = uploaded_image_links[global_index]
                    logger.debug(f"Added Google Drive image link for {card_filename}: {uploaded_image_links[global_index]}")
                else:
                    data['google_drive_image_link'] = None

                _normalize_card_fields(data, card_filename)

                all_data.append(data)
                logger.debug(f"Added processed data for {card_filename} to results (total: {len(all_data)})")

            logger.info(f"Completed processing batch {batch_idx + 1}, total cards processed so far: {len(all_data)}")

        except Exception as e:
            batch_filenames_str = ', '.join(batch_filenames)
            error_msg = f"Error processing batch {batch_idx + 1} ({batch_filenames_str}): {str(e)}"
            logger.error(error_msg)
            errors.append(error_msg)

    if not all_data:
        logger.warning("No data could be extracted from any images")
        error_summary = "No data could be extracted from the images.\n" + "\n".join(errors)
        return None, None, error_summary, None

    logger.info(f"Successfully extracted data from {len(all_data)} business cards")

    # Create DataFrame for current run
    logger.info("Creating DataFrame for current run")
    current_df = pd.DataFrame(all_data)
    logger.debug(f"Current run DataFrame created with {len(current_df)} rows and {len(current_df.columns)} columns")
    logger.debug(f"Columns: {list(current_df.columns)}")

    # Generate timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    logger.debug(f"Generated timestamp: {timestamp}")

    # Reserve temporary file paths for Excel generation. delete=False because
    # the paths are returned to the caller for download.
    with tempfile.NamedTemporaryFile(suffix='.xlsx', delete=False) as current_temp:
        current_temp_path = current_temp.name
    with tempfile.NamedTemporaryFile(suffix='.xlsx', delete=False) as cumulative_temp:
        cumulative_temp_path = cumulative_temp.name

    current_filename = f"current_run_{timestamp}.xlsx"
    cumulative_filename = "all_business_cards_total.xlsx"

    # Download and merge existing cumulative data from Google Drive
    logger.info("Checking for existing cumulative file in Google Drive")
    cumulative_df = current_df  # Default to current data
    duplicates_removed = 0

    try:
        # Clean up any duplicate cumulative files first
        duplicates_removed = cleanup_duplicate_cumulative_files(drive_service)
        if duplicates_removed > 0:
            logger.info(f"Cleaned up {duplicates_removed} duplicate cumulative files")

        # Get the existing cumulative file
        existing_file = get_existing_cumulative_file(drive_service)

        if existing_file:
            logger.info(f"Existing cumulative file found: {existing_file['name']} (ID: {existing_file['id']})")

            # Create temporary file for download
            with tempfile.NamedTemporaryFile(suffix='.xlsx', delete=False) as existing_temp:
                existing_temp_path = existing_temp.name

            try:
                # Download existing file
                if download_file_from_drive(drive_service, existing_file['id'], existing_temp_path):
                    logger.info("Successfully downloaded existing cumulative file")

                    merged = False
                    try:
                        # Read existing data
                        existing_df = pd.read_excel(existing_temp_path)
                        logger.info(f"Loaded existing data: {len(existing_df)} rows")

                        # Merge with current data
                        cumulative_df = pd.concat([existing_df, current_df], ignore_index=True)
                        logger.info(f"Merged data: {len(existing_df)} existing + {len(current_df)} new = {len(cumulative_df)} total rows")
                        merged = True
                    except Exception as e:
                        logger.error(f"Failed to read existing Excel file: {e}")
                        logger.info("Using current data only")
                        cumulative_df = current_df

                    if merged:
                        # NOTE(review): the old file is removed before the new
                        # one is uploaded; a failed upload below would leave
                        # Drive without a cumulative file.
                        try:
                            delete_file_from_drive(drive_service, existing_file['id'])
                            logger.info("Deleted old cumulative file from Google Drive")
                        except Exception as e:
                            # Keep the merged rows: a failed delete must not
                            # drop previously recorded cards from the upload.
                            logger.warning(f"Failed to delete old cumulative file from Google Drive: {e}")
                else:
                    logger.warning("Failed to download existing cumulative file, using current data only")
                    cumulative_df = current_df
            finally:
                # Always remove the download scratch file, including when the
                # download itself failed or raised.
                _remove_temp_file(existing_temp_path)
        else:
            logger.info("No existing cumulative file found, using current data only")
            cumulative_df = current_df

    except Exception as e:
        logger.warning(f"Error handling existing cumulative data: {e}")
        logger.info("Using current data only")
        cumulative_df = current_df

    # Write current run Excel file
    logger.info(f"Creating current run Excel file: {current_filename}")
    try:
        _write_excel_autofit(current_df, current_temp_path, 'Current Run', 'current run')
        logger.info("Current run Excel file created locally")

        # Upload current run file to Google Drive
        current_result = upload_to_google_drive(current_temp_path, is_excel=True, filename=current_filename)
        if current_result:
            logger.info(f"Current run file uploaded to Google Drive: {current_result['webViewLink']}")

    except Exception as e:
        logger.error(f"Failed to create current run Excel file: {e}")
        # Nothing will be returned to the caller, so drop both scratch files.
        _remove_temp_file(current_temp_path)
        _remove_temp_file(cumulative_temp_path)
        raise

    # Write cumulative Excel file
    logger.info(f"Creating cumulative Excel file: {cumulative_filename}")
    try:
        _write_excel_autofit(cumulative_df, cumulative_temp_path, 'All Business Cards', 'cumulative')
        logger.info("Cumulative Excel file created locally")

        # Upload cumulative file to Google Drive
        cumulative_result = upload_to_google_drive(cumulative_temp_path, is_excel=True, filename=cumulative_filename)
        if cumulative_result:
            logger.info(f"Cumulative file uploaded to Google Drive: {cumulative_result['webViewLink']}")

    except Exception as e:
        logger.error(f"Failed to create cumulative Excel file: {e}")
        _remove_temp_file(current_temp_path)
        _remove_temp_file(cumulative_temp_path)
        raise

    # Note: on success the temp files are intentionally kept - their paths are
    # returned so the UI can offer them for download.

    # Create summary message
    logger.info("Creating summary message")
    num_batches = len(image_batches)
    summary = f"Successfully processed {len(all_data)} business card(s) in {num_batches} batch(es) of up to {batch_size} cards.\n"
    model_display = "Speed-Optimized model" if "flash" in model_name else "Accuracy-Optimized model"
    summary += f"πŸ€– AI Model used: {model_display}\n"
    summary += f"⚑ API calls made: {num_batches} (instead of {len(all_data)})\n"

    if save_images:
        num_uploaded = sum(1 for link in uploaded_image_links if link)
        summary += f"πŸ’Ύ Images uploaded to Google Drive: {num_uploaded} cards\n\n"
    else:
        summary += f"πŸ’Ύ Images uploaded to Google Drive: No (save option was disabled)\n\n"

    summary += f"πŸ“ Current run file: {current_filename} (uploaded to Google Drive)\n"
    summary += f"πŸ“Š Total cumulative file: {cumulative_filename} (uploaded to Google Drive)\n"
    summary += f"πŸ“Š Total cards in database: {len(cumulative_df)}\n"

    # Add cleanup information
    if duplicates_removed and duplicates_removed > 0:
        summary += f"🧹 Cleaned up {duplicates_removed} duplicate cumulative files\n"
    summary += "\n"

    # Add Google Drive links
    summary += "πŸ”— Google Drive Links:\n"
    if current_result:
        summary += f"   πŸ“„ Current Run: {current_result['webViewLink']}\n"
    if cumulative_result:
        summary += f"   πŸ“Š Total Database: {cumulative_result['webViewLink']}\n"
    summary += f"   πŸ“ Exports Folder: https://drive.google.com/drive/folders/1k5iP4egzLrGJwnHkMhxt9bAkaCiieojO\n"
    summary += f"   πŸ–ΌοΈ Images Folder: https://drive.google.com/drive/folders/1gd280IqcAzpAFTPeYsZjoBUOU9S7Zx3c\n\n"

    if errors:
        logger.warning(f"Encountered {len(errors)} errors during processing")
        summary += "Errors encountered:\n" + "\n".join(errors)
        for error in errors:
            logger.warning(f"Processing error: {error}")
    else:
        logger.info("No errors encountered during processing")

    # Display preview of current run
    logger.debug("Creating preview DataFrame")
    preview_df = current_df.head(10)
    logger.debug(f"Preview contains {len(preview_df)} rows")

    logger.info("Business card processing session completed successfully")
    logger.info(f"Session summary - Cards: {len(all_data)}, Batches: {num_batches}, API calls: {num_batches}, Total DB size: {len(cumulative_df)}")

    # Return the temporary file paths for download (Gradio will handle the download)
    return current_temp_path, cumulative_temp_path, summary, preview_df

# Create Gradio interface
# Layout: an intro Markdown panel, then a two-column row (inputs on the left,
# download/status outputs on the right), a preview table, and a feature list.
# Components are created in display order, so statement order matters here.
logger.info("Creating Gradio interface")
with gr.Blocks(title="Business Card Data Extractor") as demo:
    # Introductory help text shown at the top of the page.
    gr.Markdown(
        """
        # Business Card Data Extractor
        
        Upload business card images to extract contact information and export to Excel.
        Cards are processed in batches of 5 for efficiency (fewer API calls, lower cost).
        
        **Two files are generated:**
        - πŸ“ **Current Run**: Contains only the cards you just processed
        - πŸ“Š **Total Database**: Contains ALL cards ever processed (cumulative)
        
        **☁️ Google Drive Storage:**
        - πŸ“‚ Excel files: Automatically uploaded to Google Drive exports folder
        - πŸ–ΌοΈ Images: Uploaded to Google Drive images folder (if save option enabled)
        - πŸ”— **Direct Links**: Access files directly through provided Google Drive links
        - πŸ“ **Organized Folders**: Separate folders for exports and images
        
        **πŸ“Œ File Access:**
        - ⬇️ Download directly from interface buttons (temporary copies)
        - πŸ”— Access permanent files via Google Drive links in results
        - πŸ“ **Exports Folder**: https://drive.google.com/drive/folders/1k5iP4egzLrGJwnHkMhxt9bAkaCiieojO
        - πŸ–ΌοΈ **Images Folder**: https://drive.google.com/drive/folders/1gd280IqcAzpAFTPeYsZjoBUOU9S7Zx3c
        
        **βš™οΈ Google Drive Integration:**
        - Requires `GOOGLE_CLIENT_ID` and `GOOGLE_CLIENT_SECRET` environment variables
        - Files are automatically uploaded and organized in predefined folders
        """
    )
    
    with gr.Row():
        # Left column: user inputs.
        with gr.Column():
            # Define supported file types including phone formats
            supported_types = [".jpg", ".jpeg", ".png", ".webp", ".bmp"]
            # HEIF/HEIC uploads are only offered when HEIF_SUPPORTED (set
            # elsewhere in this file) is truthy.
            if HEIF_SUPPORTED:
                supported_types.extend([".heif", ".heic"])
            
            image_input = gr.File(
                label="Upload Business Cards",
                file_count="multiple",
                file_types=supported_types
            )
            
            # Each choice is a (display label, model id) pair; the model id is
            # what gets passed on as model_name. Default is the "pro" model.
            model_selector = gr.Dropdown(
                choices=[("Accuracy-Optimized model", "gemini-2.5-pro"), ("Speed-Optimized model", "gemini-2.5-flash")],
                value="gemini-2.5-pro",
                label="AI Model Selection"
            )
            
            save_images_checkbox = gr.Checkbox(
                value=True,
                label="Save Business Card Images"
            )
            
            process_btn = gr.Button("Process Business Cards", variant="primary")
        
        # Right column: downloadable results and the status/summary text.
        with gr.Column():
            current_file = gr.File(label="πŸ“ Download Current Run")
            total_file = gr.File(label="πŸ“Š Download Total Database") 
            status_output = gr.Textbox(label="Processing Status", lines=5)
    
    preview_output = gr.Dataframe(label="Data Preview (Current Run)")
    
    # Wrapper function for better error handling and logging
    def process_with_logging(images, model_name, save_images):
        """Run process_business_cards and convert any exception into a UI message.

        Args:
            images: Value of the file-upload component (may be empty/None).
            model_name: Model id selected in the dropdown.
            save_images: State of the "Save Business Card Images" checkbox.

        Returns:
            The 4-tuple returned by process_business_cards, or
            (None, None, error_message, None) if it raised.
        """
        try:
            logger.info(f"Gradio interface initiated processing request")
            logger.debug(f"Request parameters - Images: {len(images) if images else 0}, Model: {model_name}, Save Images: {save_images}")
            return process_business_cards(images, model_name, save_images)
        except Exception as e:
            # Top-level UI boundary: report the failure in the status box
            # instead of letting the exception escape the handler.
            logger.error(f"Unexpected error in Gradio processing: {e}")
            error_msg = f"An unexpected error occurred: {str(e)}\nPlease check the logs for more details."
            return None, None, error_msg, None

    # Handle processing
    # The outputs list is positional: it matches the 4-tuple
    # (current file path, cumulative file path, summary, preview) returned above.
    process_btn.click(
        fn=process_with_logging,
        inputs=[image_input, model_selector, save_images_checkbox],
        outputs=[current_file, total_file, status_output, preview_output]
    )
    
    # Static feature list rendered below the preview table.
    gr.Markdown(
        """
        ## Features:
        - πŸ€– **Model Selection**: Choose between Speed-Optimized model (fast) or Accuracy-Optimized model (accurate)
        - ⚑ **Batch Processing**: Processes 5 cards per API call for efficiency
        - πŸ“„ **Data Extraction**: Names, emails, phone numbers, addresses, and more
        - πŸ“ž **Smart Combination**: Multiple emails/phones combined with commas
        - 🏠 **Address Merging**: All phone types and address fields combined
        - ☁️ **Google Drive Storage**: Automatic upload to organized Drive folders
        - πŸ”— **Direct Links**: Instant access to files via Google Drive URLs
        - πŸ“Š **Dual Output**: Current run + cumulative database files
        - πŸ“ **Full Tracking**: Processing date, filename, Google Drive links, and AI model used
        - 🎯 **One Row Per Card**: Each business card becomes one spreadsheet row
        """
    )

# Launch for Hugging Face Spaces deployment
logger.info("Starting Gradio demo")

# Optional login password, read from the SPACE_PASSWORD environment variable
hf_space_password = os.getenv("SPACE_PASSWORD")

if not hf_space_password:
    # No (or empty) password configured: start the app without authentication
    logger.warning("SPACE_PASSWORD not set - launching without password protection")
    demo.launch()
else:
    # Password configured: require login as "user" with that password
    logger.info("Launching with password protection enabled")
    demo.launch(auth=("user", hf_space_password))