File size: 8,044 Bytes
85ad390
 
 
bcde0da
85ad390
 
 
 
 
1552b06
bcde0da
85ad390
 
 
bcde0da
1552b06
 
 
85ad390
4f13b31
85ad390
 
4f13b31
85ad390
 
 
 
 
 
 
1552b06
 
 
 
 
85ad390
1552b06
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85ad390
 
 
 
 
9cc14fb
 
85ad390
 
 
 
 
 
 
 
 
9cc14fb
4f13b31
 
 
 
 
85ad390
9cc14fb
 
 
 
 
 
 
 
 
 
 
85ad390
9cc14fb
 
85ad390
 
9cc14fb
85ad390
9cc14fb
 
 
 
 
 
85ad390
4f13b31
9cc14fb
85ad390
9cc14fb
 
 
 
85ad390
9cc14fb
85ad390
9cc14fb
85ad390
9cc14fb
 
85ad390
 
9cc14fb
85ad390
9cc14fb
4f13b31
 
 
 
 
 
 
 
 
 
85ad390
 
4f13b31
85ad390
9cc14fb
85ad390
 
 
9cc14fb
85ad390
 
 
 
 
 
 
 
9cc14fb
 
85ad390
 
 
 
9cc14fb
 
85ad390
1552b06
 
 
 
 
 
 
 
 
 
 
 
85ad390
1552b06
 
 
 
 
 
 
 
 
 
 
 
 
4f13b31
 
8ee319e
1552b06
 
9cc14fb
 
 
85ad390
1552b06
 
 
 
 
 
 
 
 
 
 
85ad390
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
import json
import logging
import os

import gradio as gr
import pandas as pd
from datasets import Dataset, concatenate_datasets, load_dataset
from PIL import Image
from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive
from tqdm import tqdm

# Configure module-wide logging at INFO so pipeline progress is visible.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Optional OAuth client secrets injected via the environment
# (e.g. Hugging Face Spaces secrets); None when running locally without it.
GOOGLE_CREDENTIALS = os.environ.get('GOOGLE_CREDENTIALS')

class DatasetManager:
    """Download card images from Google Drive and sync them into a
    Hugging Face dataset.

    Typical flow:
        1. authenticate_drive()
        2. download_and_rename_files(folder_id, naming_convention)
        3. update_huggingface_dataset(renamed_files)
    """

    def __init__(self, local_images_dir="downloaded_cards"):
        # Directory where downloaded images are staged before upload.
        self.local_images_dir = local_images_dir
        # GoogleDrive handle; set by authenticate_drive(), None until then.
        self.drive = None
        self.dataset_name = "GotThatData/sports-cards"

        # Create local directory if it doesn't exist
        os.makedirs(local_images_dir, exist_ok=True)

    def authenticate_drive(self):
        """Authenticate with Google Drive.

        Returns:
            tuple[bool, str]: success flag and a human-readable message.
        """
        try:
            # Materialize client_secrets.json from the env var when deployed
            # so no OAuth secrets need to live in the repository.
            if GOOGLE_CREDENTIALS:
                with open('client_secrets.json', 'w') as f:
                    f.write(GOOGLE_CREDENTIALS)

            gauth = GoogleAuth()
            # Request a refresh token so saved credentials survive expiry.
            gauth.settings['get_refresh_token'] = True
            gauth.LoadCredentialsFile("mycreds.txt")

            if gauth.credentials is None:
                # No saved credentials: interactive auth. Local runs get a
                # browser flow; Spaces (SPACE_ID set) must use the CLI flow.
                if os.getenv('SPACE_ID') is None:
                    gauth.LocalWebserverAuth()
                else:
                    gauth.CommandLineAuth()
            elif gauth.access_token_expired:
                gauth.Refresh()
            else:
                gauth.Authorize()

            # Persist credentials so subsequent runs skip interactive auth.
            gauth.SaveCredentialsFile("mycreds.txt")

            self.drive = GoogleDrive(gauth)
            return True, "Successfully authenticated with Google Drive"
        except Exception as e:
            return False, f"Authentication failed: {str(e)}"

    def download_and_rename_files(self, drive_folder_id, naming_convention):
        """Download image files from a Drive folder (or a single file) and
        rename them sequentially, continuing any existing dataset numbering.

        Args:
            drive_folder_id: Drive ID of a folder, or of a single file.
            naming_convention: filename prefix, e.g. "sports_card".

        Returns:
            tuple[bool, str, list[dict]]: success flag, status message, and
            one metadata dict per successfully verified image.
        """
        if not self.drive:
            return False, "Google Drive not authenticated", []

        try:
            # List files in the folder
            query = f"'{drive_folder_id}' in parents and trashed=false"
            file_list = self.drive.ListFile({'q': query}).GetList()

            if not file_list:
                # The ID may point at a single file rather than a folder.
                file = self.drive.CreateFile({'id': drive_folder_id})
                if file:
                    file_list = [file]
                else:
                    return False, "No files found with the specified ID", []

            # Continue numbering after images already in the hub dataset.
            try:
                existing_dataset = load_dataset(self.dataset_name)
                logger.info(f"Loaded existing dataset: {self.dataset_name}")
                start_index = len(existing_dataset['train']) if 'train' in existing_dataset else 0
            except Exception as e:
                logger.info(f"No existing dataset found, starting fresh: {str(e)}")
                start_index = 0

            renamed_files = []
            # Count only images (not every listed file) so the sequential
            # numbering has no gaps when the folder holds non-image files.
            image_count = 0
            for file in tqdm(file_list, desc="Downloading files"):
                if not file['mimeType'].startswith('image/'):
                    continue
                image_count += 1
                new_filename = f"{naming_convention}_{start_index + image_count}.jpg"
                file_path = os.path.join(self.local_images_dir, new_filename)

                # Download file
                file.GetContentFile(file_path)

                # Verify the download is a readable image; delete corrupt files.
                try:
                    with Image.open(file_path) as img:
                        img.verify()
                    renamed_files.append({
                        'file_path': file_path,
                        'original_name': file['title'],
                        'new_name': new_filename,
                        'image': file_path
                    })
                except Exception as e:
                    logger.error(f"Error processing image {file['title']}: {str(e)}")
                    if os.path.exists(file_path):
                        os.remove(file_path)

            return True, f"Successfully processed {len(renamed_files)} images", renamed_files
        except Exception as e:
            return False, f"Error downloading files: {str(e)}", []

    def update_huggingface_dataset(self, renamed_files):
        """Append newly downloaded images to the sports-cards dataset.

        Args:
            renamed_files: metadata dicts from download_and_rename_files().

        Returns:
            tuple[bool, str]: success flag and status message.
        """
        # Guard: don't push (and possibly overwrite the split with) an
        # empty dataset when nothing was downloaded.
        if not renamed_files:
            return True, "No new images to add to the dataset"

        try:
            # Create a DataFrame with the file information
            df = pd.DataFrame(renamed_files)

            # Create a Hugging Face Dataset
            new_dataset = Dataset.from_pandas(df)

            try:
                # Append to the existing train split when one exists.
                existing_dataset = load_dataset(self.dataset_name)
                if 'train' in existing_dataset:
                    # NOTE: requires `concatenate_datasets` imported from
                    # `datasets` at the top of the file.
                    new_dataset = concatenate_datasets([existing_dataset['train'], new_dataset])
            except Exception:
                logger.info("Creating new dataset")

            # Push to Hugging Face Hub
            new_dataset.push_to_hub(self.dataset_name, split="train")

            return True, f"Successfully updated dataset '{self.dataset_name}' with {len(renamed_files)} new images"
        except Exception as e:
            return False, f"Error updating Hugging Face dataset: {str(e)}"

def process_pipeline(folder_id, naming_convention):
    """Run the full workflow: authenticate, download/rename, push to hub.

    Args:
        folder_id: Google Drive folder or file ID.
        naming_convention: filename prefix for the renamed images.

    Returns:
        str: status text suitable for display in the UI.
    """
    manager = DatasetManager()

    # Step 1: Google Drive authentication — bail out with the error message.
    authenticated, auth_message = manager.authenticate_drive()
    if not authenticated:
        return auth_message

    # Step 2: fetch and rename the images locally.
    downloaded, download_message, files = manager.download_and_rename_files(
        folder_id, naming_convention
    )
    if not downloaded:
        return download_message

    # Step 3: sync to the Hugging Face dataset; report both step messages.
    _, hf_message = manager.update_huggingface_dataset(files)
    return f"{download_message}\n{hf_message}"

# Authentication status interface
def check_auth_status():
    """Return a one-line emoji status for the saved Drive credentials."""
    try:
        gauth = GoogleAuth()
        gauth.LoadCredentialsFile("mycreds.txt")
        # Valid only when credentials exist AND the token is still fresh.
        has_valid_creds = (
            gauth.credentials is not None and not gauth.access_token_expired
        )
        if has_valid_creds:
            return "🟒 Authenticated with Google Drive"
        return "πŸ”΄ Not authenticated with Google Drive"
    except Exception as e:
        return f"πŸ”΄ Error checking authentication: {str(e)}"

# Gradio interface
# Gradio interface — widgets are laid out in creation order, so the
# statement order below defines the page layout.
with gr.Blocks() as demo:
    gr.Markdown("# Sports Cards Dataset Processor")
    gr.Markdown("Download card images from Google Drive and add them to the sports-cards dataset")
    
    # Authentication status — evaluated once at app build time, not live.
    auth_status = gr.Textbox(
        label="Authentication Status",
        value=check_auth_status(),
        interactive=False
    )
    
    with gr.Row():
        # NOTE(review): the default value looks like a specific Drive folder
        # ID baked in for convenience — confirm it should ship as a default.
        folder_id = gr.Textbox(
            label="Google Drive File/Folder ID",
            placeholder="Enter the ID from your Google Drive URL",
            value="151VOxPO91mg0C3ORiioGUd4hogzP1ujm"
        )
        naming = gr.Textbox(
            label="Naming Convention",
            placeholder="e.g., sports_card",
            value="sports_card"
        )
    
    # Process button and output
    process_btn = gr.Button("Process Images")
    output = gr.Textbox(label="Status")
    
    # Wire the button to the pipeline; its return string fills the status box.
    process_btn.click(
        fn=process_pipeline,
        inputs=[folder_id, naming],
        outputs=output
    )

# Launch the Gradio app only when run as a script (import-safe for tooling).
if __name__ == "__main__":
    demo.launch()