Ali Mohsin committed · 1f07471
Parent(s): 4619bfc

Optimizations number two try
Refactor dataset preparation logic to require manual intervention for setup. Update checks for existing images and splits, improving user feedback during the process. Ensure no automatic training occurs, and enhance error handling for dataset preparation.
- app.py +48 -69
- utils/data_fetch.py +17 -8
app.py
CHANGED
@@ -270,12 +270,12 @@ def _background_bootstrap():
     global BOOT_STATUS
     global DATASET_ROOT
     try:
-        # …
+        # Only check if dataset exists - DO NOT prepare it automatically
         root = os.path.abspath(os.path.join(os.getcwd(), "data", "Polyvore"))
         images_dir = os.path.join(root, "images")
         splits_dir = os.path.join(root, "splits")
 
-        # …
+        # Check if dataset already exists
        has_images = os.path.isdir(images_dir) and any(os.listdir(images_dir))
         has_splits = (
             os.path.isfile(os.path.join(splits_dir, "train.json")) or
@@ -283,62 +283,19 @@ def _background_bootstrap():
         )
 
         if has_images and has_splits:
-            print("✅ Dataset and splits already prepared…
+            print("✅ Dataset and splits already prepared")
             DATASET_ROOT = root
             BOOT_STATUS = "ready"
-
-
-        # Only prepare dataset if images are missing
-        if not has_images:
-            BOOT_STATUS = "preparing-dataset"
-            ds_root = ensure_dataset_ready()
-            DATASET_ROOT = ds_root
-            if not ds_root:
-                BOOT_STATUS = "dataset-not-prepared"
-                return
-        else:
+        elif has_images:
+            print("✅ Dataset images exist, but splits may be missing (use Advanced Training to prepare)")
             DATASET_ROOT = root
-
-
-        # Only prepare splits if missing
-        if not has_splits:
-            BOOT_STATUS = "creating-splits"
-            os.makedirs(splits_dir, exist_ok=True)
-            from scripts.prepare_polyvore import main as prepare_main
-            os.environ.setdefault("PYTHONWARNINGS", "ignore")
-            import sys
-            argv_bak = sys.argv
-            try:
-                # Use official splits from nondisjoint/ and disjoint/ folders with default size limit (500 samples for faster training)
-                sys.argv = ["prepare_polyvore.py", "--root", DATASET_ROOT, "--max_samples", "500"]
-                prepare_main()
-            finally:
-                sys.argv = argv_bak
+            BOOT_STATUS = "ready"
         else:
-            print("…
-
-
-
-
-        resnet_ckpt = os.path.join(export_dir, "resnet_item_embedder_best.pth")
-        vit_ckpt = os.path.join(export_dir, "vit_outfit_model_best.pth")
-        import subprocess
-        if not os.path.exists(resnet_ckpt):
-            BOOT_STATUS = "training-resnet"
-            subprocess.run([
-                "python", "train_resnet.py", "--data_root", ds_root, "--epochs", "3",
-                "--batch_size", "4", "--lr", "1e-3", "--early_stopping_patience", "3",
-                "--out", os.path.join(export_dir, "resnet_item_embedder.pth")
-            ], check=False)
-        if not os.path.exists(vit_ckpt):
-            BOOT_STATUS = "training-vit"
-            subprocess.run([
-                "python", "train_vit_triplet.py", "--data_root", ds_root, "--epochs", "10",
-                "--batch_size", "4", "--lr", "5e-4", "--early_stopping_patience", "5",
-                "--max_samples", "5000", "--triplet_margin", "0.5", "--gradient_clip", "1.0",
-                "--warmup_epochs", "2", "--export", os.path.join(export_dir, "vit_outfit_model.pth")
-            ], check=False)
-        service.reload_models()
+            print("ℹ️ Dataset not prepared. Use 'Download & Prepare Dataset' button in Advanced Training tab if needed.")
+            DATASET_ROOT = None
+            BOOT_STATUS = "ready"  # System is ready, just dataset not prepared
+
+        # NO automatic training - models should be pre-trained or trained manually via UI
         BOOT_STATUS = "ready"
     except Exception as e:
         BOOT_STATUS = f"error: {e}"
@@ -1462,7 +1419,7 @@ with gr.Blocks(fill_height=True, title="Dressify - Advanced Outfit Recommendatio
 
     # Dataset Preparation Section
     with gr.Accordion("📦 Dataset Preparation (Optional)", open=False):
-        gr.Markdown("**Note**: Dataset is …
+        gr.Markdown("**Note**: Dataset preparation is now manual only. Click the button below to download and prepare the dataset when needed.")
         with gr.Row():
             prepare_dataset_btn = gr.Button("📥 Download & Prepare Dataset", variant="secondary")
             prepare_status = gr.Textbox(label="Dataset Preparation Status", value="Dataset will be prepared if missing", interactive=False)
@@ -1472,29 +1429,51 @@
         global DATASET_ROOT, BOOT_STATUS
         try:
            BOOT_STATUS = "preparing-dataset"
-            ds_root = ensure_dataset_ready()
+
+            # Check if dataset already exists
+            root = os.path.abspath(os.path.join(os.getcwd(), "data", "Polyvore"))
+            images_dir = os.path.join(root, "images")
+            has_images = os.path.isdir(images_dir) and any(os.listdir(images_dir))
+
+            if has_images:
+                print("✅ Images already exist, skipping download/extraction")
+                ds_root = root
+            else:
+                print("📥 Downloading and extracting dataset...")
+                ds_root = ensure_dataset_ready()
+
             DATASET_ROOT = ds_root
             if not ds_root:
                 BOOT_STATUS = "dataset-not-prepared"
                 return "❌ Failed to prepare dataset"
 
-            # Prepare splits
+            # Prepare splits if missing
             splits_dir = os.path.join(ds_root, "splits")
-            os.makedirs(splits_dir, exist_ok=True)
-            from scripts.prepare_polyvore import main as prepare_main
-            os.environ.setdefault("PYTHONWARNINGS", "ignore")
-            import sys
-            argv_bak = sys.argv
-            try:
-                sys.argv = ["prepare_polyvore.py", "--root", ds_root, "--max_samples", "500"]
-                prepare_main()
+            has_splits = (
+                os.path.isfile(os.path.join(splits_dir, "train.json")) or
+                os.path.isfile(os.path.join(splits_dir, "outfit_triplets_train.json"))
+            )
+
+            if not has_splits:
+                os.makedirs(splits_dir, exist_ok=True)
+                from scripts.prepare_polyvore import main as prepare_main
+                os.environ.setdefault("PYTHONWARNINGS", "ignore")
+                import sys
+                argv_bak = sys.argv
+                try:
+                    sys.argv = ["prepare_polyvore.py", "--root", ds_root, "--max_samples", "500"]
+                    prepare_main()
+                    BOOT_STATUS = "ready"
+                    return "✅ Dataset and splits prepared successfully!"
+                finally:
+                    sys.argv = argv_bak
+            else:
                 BOOT_STATUS = "ready"
-                return "✅ Dataset prepared…
-            finally:
-                sys.argv = argv_bak
+                return "✅ Dataset already prepared (images and splits exist)"
         except Exception as e:
             BOOT_STATUS = "error"
-            …
+            import traceback
+            return f"❌ Error: {str(e)}\n{traceback.format_exc()}"
 
     prepare_dataset_btn.click(fn=prepare_dataset_manual, inputs=[], outputs=prepare_status)
 
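The net effect of the `_background_bootstrap` hunks above is that startup becomes a side-effect-free status check: nothing is downloaded, extracted, or trained. A minimal sketch of that logic as a standalone function (the name `check_dataset_state` is hypothetical; paths and split filenames mirror the diff):

import os

def check_dataset_state(base: str = "data/Polyvore"):
    """Report dataset state without downloading, extracting, or training."""
    root = os.path.abspath(base)
    images_dir = os.path.join(root, "images")
    splits_dir = os.path.join(root, "splits")

    has_images = os.path.isdir(images_dir) and any(os.listdir(images_dir))
    has_splits = any(
        os.path.isfile(os.path.join(splits_dir, name))
        for name in ("train.json", "outfit_triplets_train.json")
    )

    if has_images and has_splits:
        return root, "ready"   # everything in place
    if has_images:
        return root, "ready"   # splits can be built later from the UI
    return None, "ready"       # nothing prepared; setup is manual now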
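The UI hunks hinge on Gradio's event wiring: `Button.click` runs the handler and routes its return string into the status `Textbox`. A stripped-down sketch of that pattern (the handler body here is a placeholder, not the app's real logic):

import gradio as gr

def prepare_dataset_manual() -> str:
    # The real handler downloads, extracts, and builds splits;
    # whatever string it returns is shown in the status box.
    return "✅ Dataset prepared"

with gr.Blocks() as demo:
    prepare_dataset_btn = gr.Button("📥 Download & Prepare Dataset", variant="secondary")
    prepare_status = gr.Textbox(label="Dataset Preparation Status", interactive=False)
    prepare_dataset_btn.click(fn=prepare_dataset_manual, inputs=[], outputs=prepare_status)

demo.launch()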
utils/data_fetch.py
CHANGED
@@ -75,14 +75,8 @@ def ensure_dataset_ready() -> Optional[str]:
     has_metadata = all(os.path.exists(os.path.join(root, f)) for f in metadata_files)
 
     if has_images and has_metadata:
-        print("✅ Dataset already complete")
+        print("✅ Dataset already complete - skipping download and extraction")
         return root
-
-    # If images are already present, skip extraction
-    if not has_images:
-        _unzip_images_if_needed(root)
-    else:
-        print("✅ Images already extracted, skipping extraction")
 
     # Download the HF dataset snapshot into root
     try:
@@ -126,7 +120,8 @@ def ensure_dataset_ready() -> Optional[str]:
         )
     )
 
-    if …
+    # Only download if images are missing
+    if not has_images:
         print("🚀 Starting download...")
         snapshot_download(
             "Stylique/Polyvore",
@@ -137,6 +132,20 @@ def ensure_dataset_ready() -> Optional[str]:
             ignore_patterns=ignore,
         )
         print("✅ Download completed")
+        # Extract images after download
+        _unzip_images_if_needed(root)
+    elif not has_metadata:
+        # Only download metadata if images exist but metadata is missing
+        print("📥 Downloading missing metadata files...")
+        snapshot_download(
+            "Stylique/Polyvore",
+            repo_type="dataset",
+            local_dir=root,
+            local_dir_use_symlinks=False,
+            allow_patterns=["polyvore_item_metadata.json", "polyvore_outfit_titles.json", "categories.csv"],
+            ignore_patterns=ignore,
+        )
+        print("✅ Metadata download completed")
     else:
         print("✅ All required files already present")
 
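After this change, `ensure_dataset_ready` downloads only what is missing: a full snapshot when images are absent, a metadata-only fetch via `allow_patterns` when images already exist, and nothing otherwise. A hedged sketch of the branch structure (`fetch_missing` is a hypothetical distillation, assuming `huggingface_hub` and the same dataset repo):

from huggingface_hub import snapshot_download

def fetch_missing(root: str, has_images: bool, has_metadata: bool) -> None:
    if not has_images:
        # Full snapshot: images archive plus metadata; extraction happens afterwards.
        snapshot_download("Stylique/Polyvore", repo_type="dataset", local_dir=root)
    elif not has_metadata:
        # Images exist; pull only the three metadata files.
        snapshot_download(
            "Stylique/Polyvore",
            repo_type="dataset",
            local_dir=root,
            allow_patterns=["polyvore_item_metadata.json", "polyvore_outfit_titles.json", "categories.csv"],
        )
    else:
        print("✅ All required files already present")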