File size: 11,916 Bytes
e75a247
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
# Evaluate a given training run name at the given steps, taking into account the changing of the training runs

import sys
import pickle
import wandb
import argparse
import os

from src.utils.paths import get_path
from src.utils.wandb_utils import get_run_initial_steps, get_run_step_direct, get_run_step_ckpt, get_steps_from_file, get_run_by_name

parser = argparse.ArgumentParser()
parser.add_argument("--tag", "-tag", type=str, required=False, default="")
parser.add_argument("--input", "-input", type=str, required=False, default="Feb26_2025_E1000_N500_noPartonFilter_C_F") # --input Feb26_2025_E1000_N500_full
parser.add_argument("--clustering-suffix", "-c", type=str, required=False, default="") #  -c MinSamples0
parser.add_argument("--no-submit", "-ns", action="store_true") # do not submit the slurm job
parser.add_argument("--submit-AKX", "-AKX", action="store_true")
parser.add_argument("--submit-AK8", "-AK8", action="store_true")
parser.add_argument("--parton-level", "-pl", action="store_true") # To be used together with 'fastjet_jets' and --submit-AKX
parser.add_argument("--gen-level", "-gl", action="store_true")
parser.add_argument("--overwrite", "-ow", action="store_true") # overwrite the slurm job if it exists
parser.add_argument("--pt-cutoff-jet", "-pt", type=float, default=100.0, help="pt cutoff for what is considered a jet")
parser.add_argument("--high-eta-only", "-he", action="store_true", help="Only evaluate high eta jets (eta > 1.5)")
parser.add_argument("--low-eta-only", "-le",  action="store_true", help="Only evaluate low eta jets (eta < 1.5)")
parser.add_argument("--ds-cap", "-ds", type=int, default=10000, help="dataset cap ")

args = parser.parse_args()
api = wandb.Api()

DSCAP = args.ds_cap

def get_eval_run_names(tag):
    # from the api, get all the runs with the tag that are finished
    runs = api.runs(
        path="fcc_ml/svj_clustering",
        filters={"tags": {"$in": [tag.strip()]}}
    )
    return [run.name for run in runs if run.state == "finished"], [run.config for run in runs if run.state == "finished"]

def get_log_number(tag):
    numbers = set()
    for file in os.listdir("jobs/slurm_files"):
        if tag in file:
            numbers.add(int(file.split("_")[-1].split(".")[0]))
    if len(numbers) == 0:
        return 0
    return max(list(numbers)) + 1

def get_slurm_file_text_AKX(tag, log_number):
    bindings = "-B /t3home/gkrzmanc/ -B /work/gkrzmanc/"
    partition = "standard"
    account = "t3"
    d = "jobs/logs/{}".format(tag)
    err = d + "_{}_CPUerr.txt".format(log_number)
    log = d + "_{}_CPUlog.txt".format(log_number)
    suffix_pl = "--parton-level" if args.parton_level else ""
    suffix_gl = "--gen-level" if args.gen_level else ""
    pl_folder = "_PL" if args.parton_level else ""
    gl_folder = "_GL" if args.gen_level else ""
    if args.pt_cutoff_jet != 100.0:
        pt_cutoff_suffix = f"_pt_{args.pt_cutoff_jet}"
        pt_cutoff_suffix_cmd = " --pt-jet-cutoff {}".format(args.pt_cutoff_jet)
    else:
        pt_cutoff_suffix = ""
        pt_cutoff_suffix_cmd = ""
    if args.high_eta_only:
        pt_cutoff_suffix += "_high_eta"
        pt_cutoff_suffix_cmd += " --high-eta-only"
    elif args.low_eta_only:
        pt_cutoff_suffix += "_low_eta"
        pt_cutoff_suffix_cmd += " --low-eta-only"
    file = f"""#!/bin/bash
#SBATCH --partition={partition}           # Specify the partition
#SBATCH --account={account}                  # Specify the account
#SBATCH --mem=25000                   # Request 10GB of memory
#SBATCH --time=06:00:00               # Set the time limit to 1 hour
#SBATCH --job-name=SVJan_AKX{pl_folder}{gl_folder}_{str(log_number)}  # Name the job
#SBATCH --error={err}         # Redirect stderr to a log file
#SBATCH --output={log}         # Redirect stderr to a log file
#SBATCH --mail-type=FAIL
#SBATCH [email protected]
source env.sh
export APPTAINER_TMPDIR=/work/gkrzmanc/singularity_tmp
export APPTAINER_CACHEDIR=/work/gkrzmanc/singularity_cache
nvidia-smi
srun singularity exec {bindings} docker://gkrz/lgatr:v3 python -m scripts.analysis.count_matched_quarks --input {args.input} --output {args.input}/batch_eval_2k/{tag}{pt_cutoff_suffix}/AKX{pl_folder}{gl_folder} --jets-object fastjet_jets {suffix_pl} {suffix_gl} --dataset-cap {DSCAP}  {pt_cutoff_suffix_cmd}
    """
    return file

def get_slurm_file_text_AK(tag, log_number):
    bindings = "-B /t3home/gkrzmanc/ -B /work/gkrzmanc/"
    partition = "standard"
    account = "t3"
    d = "jobs/logs/{}".format(tag)
    err = d + "_{}_CPUerr.txt".format(log_number)
    log = d + "_{}_CPUlog.txt".format(log_number)
    file = f"""#!/bin/bash
#SBATCH --partition={partition}           # Specify the partition
#SBATCH --account={account}                  # Specify the account
#SBATCH --mem=25000                   # Request 10GB of memory
#SBATCH --time=02:00:00               # Set the time limit to 1 hour
#SBATCH --job-name=SVJan  # Name the job
#SBATCH --error={err}         # Redirect stderr to a log file
#SBATCH --output={log}         # Redirect stderr to a log file
#SBATCH --mail-type=END,FAIL
#SBATCH [email protected]
source env.sh
export APPTAINER_TMPDIR=/work/gkrzmanc/singularity_tmp
export APPTAINER_CACHEDIR=/work/gkrzmanc/singularity_cache

nvidia-smi
srun singularity exec {bindings} docker://gkrz/lgatr:v3 python -m scripts.analysis.count_matched_quarks --input {args.input} --output {args.input}/batch_eval_2k/{tag}/AK8  --dataset-cap 1500  
srun singularity exec {bindings} docker://gkrz/lgatr:v3 python -m scripts.analysis.count_matched_quarks --input {args.input} --output {args.input}/batch_eval_2k/{tag}/AK8_GenJets --jets-object genjets --dataset-cap {DSCAP}
    """
    return file

def get_slurm_file_text(tag, eval_job_name, log_number, aug_suffix = ""):
    bindings = "-B /t3home/gkrzmanc/ -B /work/gkrzmanc/  -B /pnfs/psi.ch/cms/trivcat/store/user/gkrzmanc/ "
    partition = "standard"
    account = "t3"
    d = "jobs/logs/{}".format(tag)
    err = d + "_{}_CPUerr.txt".format(log_number)
    log = d + "_{}_CPUlog.txt".format(log_number)
    clust_suffix = ""
    if args.clustering_suffix != "":
        clust_suffix = f" --clustering-suffix {args.clustering_suffix}"
    pt_cutoff_suffix_cmd = f" --pt-jet-cutoff {args.pt_cutoff_jet}"
    pt_cutoff_suffix = ""
    if args.pt_cutoff_jet != 100.0:
        pt_cutoff_suffix = f"_pt_{args.pt_cutoff_jet}"
    if args.high_eta_only:
        pt_cutoff_suffix += "_high_eta"
        #aug_suffix += " --high-eta-only"
    elif args.low_eta_only:
        pt_cutoff_suffix += "_low_eta"
        #aug_suffix += " --low-eta-only"
    file = f"""#!/bin/bash
#SBATCH --partition={partition}           # Specify the partition
#SBATCH --account={account}               # Specify the account
#SBATCH --mem=25000                   # Request 10GB of memory
#SBATCH --time=02:00:00               # Set the time limit to 1 hour
#SBATCH --job-name=SVJ_CPU_{eval_job_name}_{str(log_number)}  # Name the job
#SBATCH --error={err}         # Redirect stderr to a log file
#SBATCH --output={log}         # Redirect stderr to a log file
#SBATCH --mail-type=FAIL
#SBATCH [email protected]
source env.sh
export APPTAINER_TMPDIR=/work/gkrzmanc/singularity_tmp
export APPTAINER_CACHEDIR=/work/gkrzmanc/singularity_cache
nvidia-smi
srun singularity exec {bindings} docker://gkrz/lgatr:v3 python -m scripts.analysis.count_matched_quarks --input {args.input} --output {args.input}/batch_eval_2k/{tag}{pt_cutoff_suffix}/{eval_job_name}{args.clustering_suffix} --eval-dir train/{eval_job_name} --jets-object model_jets --dataset-cap {DSCAP} {aug_suffix} {clust_suffix} {pt_cutoff_suffix_cmd}
    """
    return file

runs, run_config = get_eval_run_names(args.tag)
print("RUNS:", runs)


if args.submit_AK8:
   # Submit also ak and ak8
    if not os.path.exists("jobs/slurm_files"):
        os.makedirs("jobs/slurm_files")
    if not os.path.exists("jobs/logs"):
        os.makedirs("jobs/logs")
    log_number = get_log_number(args.tag)
    slurm_file_text = get_slurm_file_text_AK(args.tag, log_number)
    # write the file to jobs/slurm_files
    with open("jobs/slurm_files/evalCPU_{}_{}.slurm".format(args.tag, log_number), "w") as f:
        f.write(slurm_file_text)
        print("Wrote file to jobs/slurm_files/evalCPU_{}_{}.slurm".format(args.tag, log_number))
    if not args.no_submit:
        os.system("sbatch jobs/slurm_files/evalCPU_{}_{}.slurm".format(args.tag, log_number))
    print("---- Submitted AK8 run -----")
    sys.exit(0)

def extract_n_events(filename):
    if not os.path.exists(filename):
        return -1
    content = open(filename).read().strip()
    try:
        return int(content)
    except:
        return -1


if args.submit_AKX:
    # Submit also AKX
    if not os.path.exists("jobs/slurm_files"):
        os.makedirs("jobs/slurm_files")
    if not os.path.exists("jobs/logs"):
        os.makedirs("jobs/logs")
    log_number = get_log_number(args.tag)
    slurm_file_text = get_slurm_file_text_AKX(args.tag, log_number)
    # write the file to jobs/slurm_files
    with open("jobs/slurm_files/evalCPU_{}_{}.slurm".format(args.tag, log_number), "w") as f:
        f.write(slurm_file_text)
        print("Wrote file to jobs/slurm_files/evalCPU_{}_{}.slurm".format(args.tag, log_number))
    if not args.no_submit:
        os.system("sbatch jobs/slurm_files/evalCPU_{}_{}.slurm".format(args.tag, log_number))
    print("---- Submitted AKX run -----")
    sys.exit(0)

for i, run in enumerate(runs):
    #if get_run_by_name(run).state != "finished":
    #    print("Run not finished (failed or still in progress) - skipping", run)
    #    continue

    conf = get_run_by_name(run).config
    if( conf.get("parton_level") or conf.get("gen_level")) and args.pt_cutoff_jet != 100.0:
        print("Skipping run", run, "because it is parton level or gen level and pt cutoff is not 100.0")
        continue
    aug_soft_p = conf.get("augment_soft_particles", False)
    if aug_soft_p:
        aug_suffix = "-aug-soft"
    else:
        aug_suffix = ""
    if not os.path.exists("jobs/slurm_files"):
        os.makedirs("jobs/slurm_files")
    if not os.path.exists("jobs/logs"):
        os.makedirs("jobs/logs")
    log_number = get_log_number(args.tag)
    pt_cutoff_suffix = ""
    if args.pt_cutoff_jet != 100.0:
        pt_cutoff_suffix = f"_pt_{args.pt_cutoff_jet}"
    if args.high_eta_only:
        pt_cutoff_suffix += "_high_eta"
        aug_suffix += " --high-eta-only"
    elif args.low_eta_only:
        pt_cutoff_suffix += "_low_eta"
        aug_suffix += " --low-eta-only"
    slurm_file_text = get_slurm_file_text(args.tag, run, log_number, aug_suffix)
    rel_path_save = f"{args.input}/batch_eval_2k/{args.tag}{pt_cutoff_suffix}/{run}{args.clustering_suffix}"
    rel_path_save = get_path(rel_path_save, "results")
    if not os.path.exists(rel_path_save):
        os.makedirs(rel_path_save)
    #if evaluated(rel_path_save):
    n_events = extract_n_events(os.path.join(rel_path_save, "count_matched_quarks", "n_events.txt"))
    if os.path.exists(os.path.join(rel_path_save, "count_matched_quarks", "n_events.txt")) and not args.overwrite and n_events > 0:
        print("Skipping", run, "because this file exists:", os.path.join(rel_path_save, "count_matched_quarks", "n_events.txt"))
        continue
    else:
        print("Evaluating", run)
    # save run config here
    with open(f"{rel_path_save}/run_config.pkl", "wb") as f:
        pickle.dump(run_config[i], f)
    # write the file to jobs/slurm_files
    with open("jobs/slurm_files/evalCPU_{}_{}.slurm".format(args.tag, log_number), "w") as f:
        f.write(slurm_file_text)
        print("Wrote file to jobs/slurm_files/evalCPU_{}_{}.slurm".format(args.tag, log_number))
    if not args.no_submit:
        os.system("sbatch jobs/slurm_files/evalCPU_{}_{}.slurm".format(args.tag, log_number))