|
import sys |
|
from pathlib import Path |
|
|
|
sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) |
|
import argparse |
|
import os |
|
|
|
from jean_zay.launch import JeanZayExperiment |
|
|
|
|
|
def parse_mode() -> argparse.Namespace:
    """Parse command-line options for the YFCC → webdataset packaging launcher.

    Returns:
        argparse.Namespace with attributes: launch (bool), src_csv_dir,
        src_images_dir, dest (str or None), num_samples_per_tar (int),
        batch_size (int).
    """
    parser = argparse.ArgumentParser(
        # NOTE: previous description ("Extract embeddings ... using DINOv2")
        # did not match this script, which packs YFCC samples into webdataset
        # tar shards via data/to_webdataset/yfcc_to_wds.py.
        description="Pack the YFCC dataset into webdataset tar shards"
    )
    parser.add_argument(
        "--launch",
        action="store_true",
        help="Launch the experiment",
    )
    parser.add_argument("--src_csv_dir", help="path to source csv directory")
    parser.add_argument("--src_images_dir", help="path to source images directory")
    parser.add_argument("--dest", help="path to destination")
    parser.add_argument(
        "--num_samples_per_tar",
        help="number of samples per tar",
        type=int,
        default=10000,
    )
    parser.add_argument("--batch_size", help="batch size", type=int, default=256)
    args = parser.parse_args()

    return args
|
|
|
|
|
args = parse_mode()

# One SLURM array task per CSV shard found in the source directory.
number_of_jobs = len(list(Path(args.src_csv_dir).glob("*.csv")))

cmd_modifiers = []
exps = []

# Plain strings: the previous f-strings had no placeholders (ruff F541).
exp_name = "yfcc_preprocessing"
job_name = "yfcc_preprocessing"
jz_exp = JeanZayExperiment(
    exp_name,
    job_name,
    slurm_array_nb_jobs=number_of_jobs,
    cmd_path="data/to_webdataset/yfcc_to_wds.py",
    num_nodes=1,
    num_gpus_per_node=1,
    qos="t3",
    account="syq",
    gpu_type="a100",
    time="1:30:00",
)

exps.append(jz_exp)

trainer_modifiers = {}

# CLI flags forwarded verbatim to the worker script; SLURM_ARRAY_TASK_ID is
# expanded by the shell at job time so each array task processes one shard.
exp_modifier = {
    "--src_csv_dir": args.src_csv_dir,
    "--src_images_dir": args.src_images_dir,
    "--dest": args.dest,
    "--num_samples_per_tar": args.num_samples_per_tar,
    "--job_offset": "${SLURM_ARRAY_TASK_ID}",
    "--batch_size": args.batch_size,
}

cmd_modifiers.append(dict(trainer_modifiers, **exp_modifier))


if __name__ == "__main__":
    for exp, cmd_modifier in zip(exps, cmd_modifiers):
        exp.build_cmd(cmd_modifier)
        # Idiomatic truthiness check instead of `== True` (PEP 8).
        if args.launch:
            exp.launch()
|
|