import pickle
import os
from src.utils.paths import get_path
from src.utils.utils import CPU_Unpickler
import argparse
from src.jetfinder.clustering import get_clustering_labels
import optuna
from src.dataset.dataset import EventDataset
from src.evaluation.clustering_metrics import compute_f1_score
import torch

import warnings
warnings.filterwarnings("ignore")

# filename = get_path("/work/gkrzmanc/jetclustering/results/train/Test_betaPt_BC_2025_01_03_15_07_14/eval_0.pkl", "results")
# for rinv=0.7, see /work/gkrzmanc/jetclustering/results/train/Test_betaPt_BC_rinv07_2025_01_03_15_38_58
# Keeping the clustering script here for now so that it stays separate from GPU-heavy tasks like inference (the clustering step may change frequently).
# Search space (see objective below): min_cluster_size in [2, 20], min_samples in [0, 10], epsilon in [0.01, 0.5]

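# CLI flags select which components of the model output are clustered and how the
# coordinates are preprocessed beforehand (presumably HDBSCAN-style clustering, given
# the min_cluster_size / min_samples / epsilon parameters tuned below).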
parser = argparse.ArgumentParser()
parser.add_argument("--input", type=str, required=True)
parser.add_argument("--dataset", type=int, required=False, default=11) # Which dataset to optimize on
parser.add_argument("--dataset-cap", type=int, required=False, default=-1)
parser.add_argument("--spatial-components-only", "-spatial-only", action="store_true")
parser.add_argument("--lorentz-cos-sim", action="store_true")
parser.add_argument("--cos-sim", action="store_true")
parser.add_argument("--normalize", action="store_true")
# Example flags: --input train/  --dataset-cap 1000 --spatial-components-only
args = parser.parse_args()
path = get_path(args.input, "results")
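# Encode the chosen preprocessing flags into a filename suffix so that different
# configurations write to separate study logs. Note: the assignments below overwrite
# each other, so only the last matching flag determines the suffix.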
suffix = ""
if args.spatial_components_only:
    suffix = "_sp_comp_only"
#if args.lorentz_norm:
#    suffix = "_lorentz_norm"
if args.lorentz_cos_sim:
    suffix = "_lorentz_cos_sim"
if args.cos_sim:
    suffix = "_cos_sim"
if args.normalize:
    suffix = "_norm"

study_file = os.path.join(path, "clustering_tuning_{}{}.log".format(args.dataset, suffix))

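# Journal-file storage appends each trial to the log file, so an interrupted or
# finished study can be reloaded and extended by re-running this script.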
study_exists = os.path.exists(study_file)
storage = optuna.storages.JournalStorage(
    optuna.storages.journal.JournalFileBackend(study_file)
)

if study_exists:
    study = optuna.load_study(storage=storage, study_name="clustering")
else:
    study = optuna.create_study(storage=storage, study_name="clustering", direction="maximize")

eval_result_file = os.path.join(path, "eval_{}.pkl".format(args.dataset))
with open(eval_result_file, "rb") as f:
    eval_result = CPU_Unpickler(f).load()

dataset_cap = args.dataset_cap


def objective(trial):
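    """Run clustering with this trial's parameters (the labels are cached to disk per
    parameter combination) and return the resulting F1 score."""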
    min_clust_size = trial.suggest_int("min_cluster_size", 2, 20)
    min_samples = trial.suggest_int("min_samples", 0, 10)
    epsilon = trial.suggest_float("epsilon", 0.01, 0.5)
    print("Starting trial with parameters:", trial.params)
    suffix = "{}-{}-{}".format(min_clust_size, min_samples, epsilon)
    if args.spatial_components_only:
        suffix = "sp-" + suffix
    #if args.lorentz_norm:
    #    suffix = "ln-" + suffix
    if args.cos_sim:
        suffix = "cs-" + suffix
    if args.lorentz_cos_sim:
        suffix = "lcs-" + suffix
    if args.normalize:
        suffix = "norm-" + suffix
    clustering_file = os.path.join(path, "clustering_{}_{}.pkl".format(suffix, args.dataset))
    if not os.path.exists(clustering_file):
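        # Select which prediction columns to cluster: 4-column outputs use the first
        # three, wider outputs use either the spatial block (columns 1:4) or the first
        # four columns, depending on the flags.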
        if eval_result["pred"].shape[1] == 4:
            coords = eval_result["pred"][:, :3]
        else:
            if args.spatial_components_only or args.cos_sim:
                coords = eval_result["pred"][:, 1:4]
            else:
                coords = eval_result["pred"][:, :4]
        event_idx = eval_result["event_idx"]
        if dataset_cap > 0:
            filt = event_idx < dataset_cap
            event_idx = event_idx[filt]
            coords = coords[filt]
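        # Project coordinates onto the unit sphere so that distances reflect direction
        # only (used by the cosine-similarity and normalized variants).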
        if args.cos_sim or args.normalize:
            coords = coords / torch.norm(coords, dim=1, keepdim=True)
        labels = get_clustering_labels(coords, event_idx, min_cluster_size=min_clust_size,
                                       min_samples=min_samples, epsilon=epsilon, bar=True,
                                       lorentz_cos_sim=args.lorentz_cos_sim,
                                       cos_sim=args.cos_sim)
        with open(clustering_file, "wb") as f:
            pickle.dump(labels, f)
        print("Clustering saved to", clustering_file)
    #else:
    #    labels = pickle.load(open(clustering_file, "rb"))
    print("Dataset:", eval_result["filename"])
    dataset = EventDataset.from_directory(eval_result["filename"],
                                          model_clusters_file=clustering_file,
                                          model_output_file=eval_result_file,
                                          include_model_jets_unfiltered=True, parton_level=True, aug_soft=True)
    score = compute_f1_score(dataset, dataset_cap=dataset_cap)
    print("F1 score for", suffix, ":", score)
    return score

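# Each trial is appended to the journal file, so the 100-trial budget can be split
# across multiple invocations of this script.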
study.optimize(objective, n_trials=100)
print(f"Best params is {study.best_params} with value {study.best_value}")