import argparse
import os
import pickle
import warnings

import optuna
import torch

from src.dataset.dataset import EventDataset
from src.evaluation.clustering_metrics import compute_f1_score
from src.jetfinder.clustering import get_clustering_labels
from src.utils.paths import get_path
from src.utils.utils import CPU_Unpickler

warnings.filterwarnings("ignore")

# filename = get_path("/work/gkrzmanc/jetclustering/results/train/Test_betaPt_BC_2025_01_03_15_07_14/eval_0.pkl", "results")
# For rinv=0.7, see /work/gkrzmanc/jetclustering/results/train/Test_betaPt_BC_rinv07_2025_01_03_15_38_58
# Keeping the clustering script here for now so that it stays separate from the
# GPU-heavy tasks like inference (the clustering may change frequently...).
# parameters: min-cluster-size: [5, 30]
#             min-samples: [2, 30]
#             epsilon: [0.01, 0.5]

parser = argparse.ArgumentParser()
parser.add_argument("--input", type=str, required=True)
parser.add_argument("--dataset", type=int, required=False, default=11)  # which dataset to optimize on
parser.add_argument("--dataset-cap", type=int, required=False, default=-1)
parser.add_argument("--spatial-components-only", "-spatial-only", action="store_true")
parser.add_argument("--lorentz-cos-sim", action="store_true")
parser.add_argument("--cos-sim", action="store_true")
parser.add_argument("--normalize", action="store_true")
# Example: --input train/ --dataset-cap 1000 --spatial-components-only
args = parser.parse_args()

path = get_path(args.input, "results")

# Tag the study file with the chosen preprocessing variant.
# Note that the flags are treated as mutually exclusive here: the last matching one wins.
suffix = ""
if args.spatial_components_only:
    suffix = "_sp_comp_only"
# if args.lorentz_norm:
#     suffix = "_lorentz_norm"
if args.lorentz_cos_sim:
    suffix = "_lorentz_cos_sim"
if args.cos_sim:
    suffix = "_cos_sim"
if args.normalize:
    suffix = "_norm"

# A journal-file storage lets the study be resumed and appended across runs.
study_file = os.path.join(path, "clustering_tuning_{}{}.log".format(args.dataset, suffix))
study_exists = os.path.exists(study_file)
storage = optuna.storages.JournalStorage(
    optuna.storages.journal.JournalFileBackend(study_file)
)
if study_exists:
    study = optuna.load_study(storage=storage, study_name="clustering")
else:
    study = optuna.create_study(storage=storage, study_name="clustering", direction="maximize")

eval_result_file = os.path.join(path, "eval_{}.pkl".format(args.dataset))
with open(eval_result_file, "rb") as f:
    eval_result = CPU_Unpickler(f).load()
dataset_cap = args.dataset_cap


def objective(trial):
    # One trial: cluster the cached model outputs with the sampled parameters,
    # then score the resulting jets with the F1 metric.
    min_clust_size = trial.suggest_int("min_cluster_size", 2, 20)
    min_samples = trial.suggest_int("min_samples", 0, 10)
    epsilon = trial.suggest_float("epsilon", 0.01, 0.5)  # suggest_uniform is deprecated in Optuna 3+
    print("Starting trial with parameters:", trial.params)
    # Unlike the study-file suffix above, these prefixes are cumulative.
    suffix = "{}-{}-{}".format(min_clust_size, min_samples, epsilon)
    if args.spatial_components_only:
        suffix = "sp-" + suffix
    # if args.lorentz_norm:
    #     suffix = "ln-" + suffix
    if args.cos_sim:
        suffix = "cs-" + suffix
    if args.lorentz_cos_sim:
        suffix = "lcs-" + suffix
    if args.normalize:
        suffix = "norm-" + suffix
    clustering_file = os.path.join(path, "clustering_{}_{}.pkl".format(suffix, args.dataset))
    if not os.path.exists(clustering_file):
        # Select the clustering coordinates depending on the prediction layout.
        if eval_result["pred"].shape[1] == 4:
            coords = eval_result["pred"][:, :3]
        else:
            if args.spatial_components_only or args.cos_sim:
                coords = eval_result["pred"][:, 1:4]
            else:
                coords = eval_result["pred"][:, :4]
        event_idx = eval_result["event_idx"]
        if dataset_cap > 0:
            filt = event_idx < dataset_cap
            event_idx = event_idx[filt]
            coords = coords[filt]
        if args.cos_sim or args.normalize:
            # Project the embeddings onto the unit sphere.
            coords = coords / torch.norm(coords, dim=1, keepdim=True)
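        # Assumption (not confirmed in this file): get_clustering_labels runs a
        # per-event HDBSCAN fit, so the tuned parameters map onto HDBSCAN's
        # min_cluster_size, min_samples and cluster_selection_epsilon.
        # Illustrative sketch of that mapping for a single event:
        #   import hdbscan
        #   clusterer = hdbscan.HDBSCAN(min_cluster_size=min_clust_size,
        #                               min_samples=min_samples,
        #                               cluster_selection_epsilon=epsilon)
        #   labels = clusterer.fit_predict(coords_of_one_event)  # hypothetical input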
        labels = get_clustering_labels(coords, event_idx, min_cluster_size=min_clust_size,
                                       min_samples=min_samples, epsilon=epsilon, bar=True,
                                       lorentz_cos_sim=args.lorentz_cos_sim, cos_sim=args.cos_sim)
        with open(clustering_file, "wb") as f:
            pickle.dump(labels, f)
        print("Clustering saved to", clustering_file)
    # else:
    #     labels = pickle.load(open(clustering_file, "rb"))
    print("Dataset:", eval_result["filename"])
    dataset = EventDataset.from_directory(eval_result["filename"], model_clusters_file=clustering_file,
                                          model_output_file=eval_result_file, include_model_jets_unfiltered=True,
                                          parton_level=True, aug_soft=True)
    score = compute_f1_score(dataset, dataset_cap=dataset_cap)
    print("F1 score for", suffix, ":", score)
    return score


study.optimize(objective, n_trials=100)
print(f"Best params: {study.best_params} (best value: {study.best_value})")
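# The journal storage persists across runs, so a finished (or interrupted) study
# can be inspected later without re-running any trials. Minimal sketch, assuming
# the same study_file path as above:
#   storage = optuna.storages.JournalStorage(
#       optuna.storages.journal.JournalFileBackend(study_file))
#   study = optuna.load_study(storage=storage, study_name="clustering")
#   print(study.best_trial.params, study.best_trial.value)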