Ali Mohsin committed
Commit 1f07471 · 1 Parent(s): 4619bfc

Optimizations: second attempt


Refactor the dataset preparation logic so setup requires manual intervention. Startup now only checks for existing images and splits and reports what it finds; it never downloads data or starts training automatically. Error handling in the manual preparation path is also improved.
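For reference, the new startup behaviour reduces to a pure check. Below is a minimal standalone sketch; the paths, split filenames, and states mirror the diff that follows, but `dataset_state` itself is a hypothetical helper, not a function in this repo:

```python
# Minimal sketch of the new startup check: detect state, never prepare.
# Nothing here touches the network or starts training.
import os

def dataset_state(root: str) -> str:
    """Classify a local Polyvore checkout the way _background_bootstrap now does."""
    images_dir = os.path.join(root, "images")
    splits_dir = os.path.join(root, "splits")
    has_images = os.path.isdir(images_dir) and any(os.listdir(images_dir))
    has_splits = (
        os.path.isfile(os.path.join(splits_dir, "train.json")) or
        os.path.isfile(os.path.join(splits_dir, "outfit_triplets_train.json"))
    )
    if has_images and has_splits:
        return "ready"
    if has_images:
        return "images-only"   # splits can be built from the Advanced Training tab
    return "not-prepared"      # use the "Download & Prepare Dataset" button

print(dataset_state(os.path.join(os.getcwd(), "data", "Polyvore")))
```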

Files changed (2):
  1. app.py +48 -69
  2. utils/data_fetch.py +17 -8
app.py CHANGED
@@ -270,12 +270,12 @@ def _background_bootstrap():
     global BOOT_STATUS
     global DATASET_ROOT
     try:
-        # Check if dataset root exists and has basic structure
+        # Only check if dataset exists - DO NOT prepare it automatically
         root = os.path.abspath(os.path.join(os.getcwd(), "data", "Polyvore"))
         images_dir = os.path.join(root, "images")
         splits_dir = os.path.join(root, "splits")
 
-        # Only check dataset if images directory doesn't exist
+        # Check if dataset already exists
         has_images = os.path.isdir(images_dir) and any(os.listdir(images_dir))
         has_splits = (
             os.path.isfile(os.path.join(splits_dir, "train.json")) or
@@ -283,62 +283,19 @@ def _background_bootstrap():
         )
 
         if has_images and has_splits:
-            print("✅ Dataset and splits already prepared, skipping startup preparation")
+            print("✅ Dataset and splits already prepared")
             DATASET_ROOT = root
             BOOT_STATUS = "ready"
-            return
-
-        # Only prepare dataset if images are missing
-        if not has_images:
-            BOOT_STATUS = "preparing-dataset"
-            ds_root = ensure_dataset_ready()
-            DATASET_ROOT = ds_root
-            if not ds_root:
-                BOOT_STATUS = "dataset-not-prepared"
-                return
-        else:
+        elif has_images:
+            print("✅ Dataset images exist, but splits may be missing (use Advanced Training to prepare)")
             DATASET_ROOT = root
-            print("✅ Dataset images already exist, skipping extraction")
-
-        # Only prepare splits if missing
-        if not has_splits:
-            BOOT_STATUS = "creating-splits"
-            os.makedirs(splits_dir, exist_ok=True)
-            from scripts.prepare_polyvore import main as prepare_main
-            os.environ.setdefault("PYTHONWARNINGS", "ignore")
-            import sys
-            argv_bak = sys.argv
-            try:
-                # Use official splits from nondisjoint/ and disjoint/ folders with default size limit (500 samples for faster training)
-                sys.argv = ["prepare_polyvore.py", "--root", DATASET_ROOT, "--max_samples", "500"]
-                prepare_main()
-            finally:
-                sys.argv = argv_bak
+            BOOT_STATUS = "ready"
         else:
-            print("✅ Splits already prepared, skipping")
-
-        # Train if checkpoints are absent
-        export_dir = os.getenv("EXPORT_DIR", "models/exports")
-        os.makedirs(export_dir, exist_ok=True)
-        resnet_ckpt = os.path.join(export_dir, "resnet_item_embedder_best.pth")
-        vit_ckpt = os.path.join(export_dir, "vit_outfit_model_best.pth")
-        import subprocess
-        if not os.path.exists(resnet_ckpt):
-            BOOT_STATUS = "training-resnet"
-            subprocess.run([
-                "python", "train_resnet.py", "--data_root", ds_root, "--epochs", "3",
-                "--batch_size", "4", "--lr", "1e-3", "--early_stopping_patience", "3",
-                "--out", os.path.join(export_dir, "resnet_item_embedder.pth")
-            ], check=False)
-        if not os.path.exists(vit_ckpt):
-            BOOT_STATUS = "training-vit"
-            subprocess.run([
-                "python", "train_vit_triplet.py", "--data_root", ds_root, "--epochs", "10",
-                "--batch_size", "4", "--lr", "5e-4", "--early_stopping_patience", "5",
-                "--max_samples", "5000", "--triplet_margin", "0.5", "--gradient_clip", "1.0",
-                "--warmup_epochs", "2", "--export", os.path.join(export_dir, "vit_outfit_model.pth")
-            ], check=False)
-        service.reload_models()
+            print("ℹ️ Dataset not prepared. Use 'Download & Prepare Dataset' button in Advanced Training tab if needed.")
+            DATASET_ROOT = None
+            BOOT_STATUS = "ready"  # System is ready, just dataset not prepared
+
+        # NO automatic training - models should be pre-trained or trained manually via UI
         BOOT_STATUS = "ready"
     except Exception as e:
         BOOT_STATUS = f"error: {e}"
@@ -1462,7 +1419,7 @@ with gr.Blocks(fill_height=True, title="Dressify - Advanced Outfit Recommendatio
 
     # Dataset Preparation Section
     with gr.Accordion("📦 Dataset Preparation (Optional)", open=False):
-        gr.Markdown("**Note**: Dataset is automatically prepared on first startup. Use this only if you need to re-download or re-extract the dataset.")
+        gr.Markdown("**Note**: Dataset preparation is now manual only. Click the button below to download and prepare the dataset when needed.")
        with gr.Row():
            prepare_dataset_btn = gr.Button("📥 Download & Prepare Dataset", variant="secondary")
            prepare_status = gr.Textbox(label="Dataset Preparation Status", value="Dataset will be prepared if missing", interactive=False)
@@ -1472,29 +1429,51 @@ with gr.Blocks(fill_height=True, title="Dressify - Advanced Outfit Recommendatio
         global DATASET_ROOT, BOOT_STATUS
         try:
             BOOT_STATUS = "preparing-dataset"
-            ds_root = ensure_dataset_ready()
+
+            # Check if dataset already exists
+            root = os.path.abspath(os.path.join(os.getcwd(), "data", "Polyvore"))
+            images_dir = os.path.join(root, "images")
+            has_images = os.path.isdir(images_dir) and any(os.listdir(images_dir))
+
+            if has_images:
+                print("✅ Images already exist, skipping download/extraction")
+                ds_root = root
+            else:
+                print("📥 Downloading and extracting dataset...")
+                ds_root = ensure_dataset_ready()
+
             DATASET_ROOT = ds_root
             if not ds_root:
                 BOOT_STATUS = "dataset-not-prepared"
                 return "❌ Failed to prepare dataset"
 
-            # Prepare splits
+            # Prepare splits if missing
             splits_dir = os.path.join(ds_root, "splits")
-            os.makedirs(splits_dir, exist_ok=True)
-            from scripts.prepare_polyvore import main as prepare_main
-            os.environ.setdefault("PYTHONWARNINGS", "ignore")
-            import sys
-            argv_bak = sys.argv
-            try:
-                sys.argv = ["prepare_polyvore.py", "--root", ds_root, "--max_samples", "500"]
-                prepare_main()
+            has_splits = (
+                os.path.isfile(os.path.join(splits_dir, "train.json")) or
+                os.path.isfile(os.path.join(splits_dir, "outfit_triplets_train.json"))
+            )
+
+            if not has_splits:
+                os.makedirs(splits_dir, exist_ok=True)
+                from scripts.prepare_polyvore import main as prepare_main
+                os.environ.setdefault("PYTHONWARNINGS", "ignore")
+                import sys
+                argv_bak = sys.argv
+                try:
+                    sys.argv = ["prepare_polyvore.py", "--root", ds_root, "--max_samples", "500"]
+                    prepare_main()
+                    BOOT_STATUS = "ready"
+                    return "✅ Dataset and splits prepared successfully!"
+                finally:
+                    sys.argv = argv_bak
+            else:
                 BOOT_STATUS = "ready"
-                return "✅ Dataset prepared successfully!"
-            finally:
-                sys.argv = argv_bak
+                return "✅ Dataset already prepared (images and splits exist)"
         except Exception as e:
             BOOT_STATUS = "error"
-            return f"❌ Error: {str(e)}"
+            import traceback
+            return f"❌ Error: {str(e)}\n{traceback.format_exc()}"
 
        prepare_dataset_btn.click(fn=prepare_dataset_manual, inputs=[], outputs=prepare_status)
 
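Both the old and new `prepare_dataset_manual` handlers drive `scripts/prepare_polyvore.py` in-process by temporarily swapping `sys.argv` around a call to its `main()`. Here is a self-contained sketch of that pattern, with the save/restore factored into a context manager; the context manager is my own illustration, not code from this commit:

```python
# Sketch of the argv-swap pattern used in prepare_dataset_manual above:
# an argparse-based main() is called in-process with synthetic CLI arguments,
# and sys.argv is restored even if main() raises.
import sys
from contextlib import contextmanager

@contextmanager
def fake_argv(*args: str):
    """Temporarily replace sys.argv so a CLI-style main() parses `args`."""
    argv_bak = sys.argv
    sys.argv = list(args)
    try:
        yield
    finally:
        sys.argv = argv_bak  # mirrors the try/finally in the diff

# Usage, mirroring the handler:
#   with fake_argv("prepare_polyvore.py", "--root", ds_root, "--max_samples", "500"):
#       prepare_main()
```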
utils/data_fetch.py CHANGED
@@ -75,14 +75,8 @@ def ensure_dataset_ready() -> Optional[str]:
     has_metadata = all(os.path.exists(os.path.join(root, f)) for f in metadata_files)
 
     if has_images and has_metadata:
-        print("✅ Dataset already complete")
+        print("✅ Dataset already complete - skipping download and extraction")
         return root
-
-    # If images are already present, skip extraction
-    if not has_images:
-        _unzip_images_if_needed(root)
-    else:
-        print("✅ Images already extracted, skipping extraction")
 
     # Download the HF dataset snapshot into root
     try:
@@ -126,7 +120,8 @@ def ensure_dataset_ready() -> Optional[str]:
         )
     )
 
-    if need_download or not has_images:
+    # Only download if images are missing
+    if not has_images:
         print("🚀 Starting download...")
         snapshot_download(
             "Stylique/Polyvore",
@@ -137,6 +132,20 @@ def ensure_dataset_ready() -> Optional[str]:
             ignore_patterns=ignore,
         )
         print("✅ Download completed")
+        # Extract images after download
+        _unzip_images_if_needed(root)
+    elif not has_metadata:
+        # Only download metadata if images exist but metadata is missing
+        print("📥 Downloading missing metadata files...")
+        snapshot_download(
+            "Stylique/Polyvore",
+            repo_type="dataset",
+            local_dir=root,
+            local_dir_use_symlinks=False,
+            allow_patterns=["polyvore_item_metadata.json", "polyvore_outfit_titles.json", "categories.csv"],
+            ignore_patterns=ignore,
+        )
+        print("✅ Metadata download completed")
     else:
         print("✅ All required files already present")
 
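The new `elif not has_metadata:` branch leans on the `allow_patterns` filter of `huggingface_hub.snapshot_download` to fetch only the three metadata files instead of a full snapshot. A stripped-down sketch of that call follows; the helper name is illustrative, and the retry/error handling plus the deprecated `local_dir_use_symlinks` flag from the real function are omitted:

```python
# Sketch of the metadata-only download path: allow_patterns restricts the
# snapshot to the named files, so the large image archive is never fetched.
from huggingface_hub import snapshot_download

METADATA_FILES = [
    "polyvore_item_metadata.json",
    "polyvore_outfit_titles.json",
    "categories.csv",
]

def fetch_metadata_only(root: str) -> None:
    """Download just the Polyvore metadata files into `root`."""
    snapshot_download(
        "Stylique/Polyvore",
        repo_type="dataset",
        local_dir=root,
        allow_patterns=METADATA_FILES,
    )

# fetch_metadata_only("data/Polyvore")
```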