{
 "cells": [
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-04-10T09:06:33.918985Z",
     "start_time": "2025-04-10T09:06:30.413364Z"
    }
   },
   "cell_type": "code",
   "source": [
    "import os\n",
    "import ast\n",
    "import glob\n",
    "import functools\n",
    "import math\n",
    "import torch\n",
    "\n",
    "from torch.utils.data import DataLoader\n",
    "from src.logger.logger import _logger, _configLogger\n",
    "from src.dataset.dataset import EventDatasetCollection, EventDataset\n",
    "from src.utils.import_tools import import_module\n",
    "from src.dataset.functions_graph import graph_batch_func\n",
    "from src.dataset.functions_data import concat_events\n",
    "from src.utils.paths import get_path\n"
   ],
   "id": "e3d819c470939dc",
   "outputs": [],
   "execution_count": 1
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-04-10T09:06:34.311191Z",
     "start_time": "2025-04-10T09:06:34.004349Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Directory with the preprocessed events of the sample inspected in this notebook.\n",
    "# NOTE(review): absolute, machine-specific path; point it at your own copy when running elsewhere.\n",
    "TRAIN_DATA_DIR = \"/work/gkrzmanc/jetclustering/preprocessed_data/scouting_PFNano_signals1/SVJ_hadronic_std3/s-channel_mMed-1100_mDark-20_rinv-0.3_alpha-peak_13TeV-pythia8_n-2000\"\n",
    "\n",
    "# Fail early with an explicit message if the dataset directory is missing.\n",
    "if not os.path.isdir(TRAIN_DATA_DIR):\n",
    "    raise FileNotFoundError(f\"Preprocessed dataset directory not found: {TRAIN_DATA_DIR}\")\n",
    "\n",
    "train_data = EventDataset.from_directory(TRAIN_DATA_DIR, mmap=True)\n",
    "print(\"N events:\", len(train_data))\n",
    "\n",
    "# Batches of 8 events are merged into a single object by concat_events.\n",
    "train_loader = DataLoader(\n",
    "    train_data,\n",
    "    batch_size=8,\n",
    "    drop_last=True,\n",
    "    pin_memory=True,\n",
    "    num_workers=1,\n",
    "    collate_fn=concat_events,\n",
    "    persistent_workers=True,  # boolean flag: keep the worker process alive between passes\n",
    ")\n"
   ],
   "id": "8c937e192df126b9",
   "outputs": [],
   "execution_count": 2
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-19T17:19:10.625599Z",
     "start_time": "2024-12-19T17:19:10.457272Z"
    }
   },
   "cell_type": "code",
   "source": "b = next(iter(train_loader))",
   "id": "6ddf6f2f4c0f3c37",
   "outputs": [],
   "execution_count": 3
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-19T17:19:20.974583Z",
     "start_time": "2024-12-19T17:19:20.965173Z"
    }
   },
   "cell_type": "code",
   "source": "b.pfcands.pid.unique()",
   "id": 
"5f2a6492abfac378", "outputs": [ { "data": { "text/plain": [ "tensor([-211., -13., -11., 13., 22., 130., 211.])" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 5 }, { "metadata": { "ExecuteTime": { "end_time": "2024-12-18T14:37:13.995401Z", "start_time": "2024-12-18T14:37:13.986881Z" } }, "cell_type": "code", "source": "dir(b)", "id": "df59e9660ebabc72", "outputs": [ { "data": { "text/plain": [ "['MET',\n", " '__class__',\n", " '__delattr__',\n", " '__dict__',\n", " '__dir__',\n", " '__doc__',\n", " '__eq__',\n", " '__format__',\n", " '__ge__',\n", " '__getattribute__',\n", " '__gt__',\n", " '__hash__',\n", " '__init__',\n", " '__init_subclass__',\n", " '__le__',\n", " '__len__',\n", " '__lt__',\n", " '__module__',\n", " '__ne__',\n", " '__new__',\n", " '__reduce__',\n", " '__reduce_ex__',\n", " '__repr__',\n", " '__setattr__',\n", " '__sizeof__',\n", " '__str__',\n", " '__subclasshook__',\n", " '__weakref__',\n", " 'evt_collections',\n", " 'fatjets',\n", " 'genjets',\n", " 'init_attrs',\n", " 'jets',\n", " 'matrix_element_gen_particles',\n", " 'n_events',\n", " 'offline_pfcands',\n", " 'pfcands',\n", " 'serialize',\n", " 'special_pfcands']" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 5 }, { "metadata": { "ExecuteTime": { "end_time": "2024-12-18T14:37:14.126468Z", "start_time": "2024-12-18T14:37:14.116496Z" } }, "cell_type": "code", "source": "len(b.pfcands), b.pfcands.batch_number", "id": "d286a98188187c6", "outputs": [ { "data": { "text/plain": [ "(2023, tensor([ 0, 331, 636, 821, 1044, 1404, 1619, 1787, 2023]))" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 6 }, { "metadata": { "ExecuteTime": { "end_time": "2024-12-18T14:41:22.295628Z", "start_time": "2024-12-18T14:41:22.277888Z" } }, "cell_type": "code", "source": [ "def get_idx_for_event(obj, i):\n", " return obj.batch_number[i], 
obj.batch_number[i+1]\n", "\n",
    "def get_labels(pfcands, event=None, R=0.8):\n",
    "    \"\"\"Label each PF candidate with the index of its nearest matrix-element gen particle.\n",
    "\n",
    "    For every event, the distance in the (eta, phi) plane between each candidate and each\n",
    "    entry of ``event.matrix_element_gen_particles`` is computed. A candidate receives the\n",
    "    within-event index of the closest particle, or -1 if that particle is farther than ``R``\n",
    "    (or if the event has no gen particles at all).\n",
    "\n",
    "    Args:\n",
    "        pfcands: collection exposing ``eta``/``phi`` tensors (assumed 1-D) and the\n",
    "            ``batch_number`` offsets consumed by ``get_idx_for_event``.\n",
    "        event: batch providing ``matrix_element_gen_particles`` and ``len()`` (number of\n",
    "            events). Defaults to the notebook-level batch ``b`` so existing calls keep working.\n",
    "        R: matching radius in the (eta, phi) plane.\n",
    "\n",
    "    Returns:\n",
    "        A ``torch.long`` tensor of length ``len(pfcands)``.\n",
    "    \"\"\"\n",
    "    if event is None:\n",
    "        event = b  # backward compatible: fall back to the batch loaded earlier in the notebook\n",
    "    quarks = event.matrix_element_gen_particles\n",
    "    labels = torch.zeros(len(pfcands)).long()\n",
    "    for i in range(len(event)):\n",
    "        q_start, q_end = get_idx_for_event(quarks, i)\n",
    "        dq_eta = quarks.eta[q_start:q_end]\n",
    "        dq_phi = quarks.phi[q_start:q_end]\n",
    "        # dq_pt = quarks.pt[q_start:q_end] # Maybe we can somehow weigh the loss by pt?\n",
    "        s, e = get_idx_for_event(pfcands, i)\n",
    "        if dq_eta.numel() == 0:\n",
    "            # Nothing to match against in this event: min() over an empty axis would raise.\n",
    "            labels[s:e] = -1\n",
    "            continue\n",
    "        # Pairwise differences, shape (n_pfcands_in_event, n_gen_particles_in_event).\n",
    "        d_eta = pfcands.eta[s:e].unsqueeze(1) - dq_eta.unsqueeze(0)\n",
    "        d_phi = pfcands.phi[s:e].unsqueeze(1) - dq_phi.unsqueeze(0)\n",
    "        # Wrap the azimuthal difference into [-pi, pi) so that objects on opposite sides of\n",
    "        # the phi seam are treated as neighbours (assumes phi is given in radians).\n",
    "        d_phi = torch.remainder(d_phi + math.pi, 2 * math.pi) - math.pi\n",
    "        dist_matrix = torch.sqrt(d_eta ** 2 + d_phi ** 2)\n",
    "        closest_quark_dist, closest_quark_idx = dist_matrix.min(dim=1)\n",
    "        closest_quark_idx[closest_quark_dist > R] = -1\n",
    "        labels[s:e] = closest_quark_idx\n",
    "    return labels"
   ],
   "id": "1311033e4b878a6b",
   "outputs": [],
   "execution_count": 22
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-29T11:28:01.367425Z",
     "start_time": "2024-12-29T11:28:01.348995Z"
    }
   },
   "cell_type": "code",
   "source": "",
   "id": "878ebfeda196f1ea",
   "outputs": [],
   "execution_count": 1
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "import os\n",
    "from src.dataset.dataset import SimpleIterDataset, EventDataset\n",
    "from src.utils.utils import to_filelist\n",
    "from src.utils.paths import get_path\n",
    "\n",
    "class Args:\n",
    "    \"\"\"Plain attribute container handed to ``to_filelist`` and used to configure ``SimpleIterDataset`` below.\n",
    "\n",
    "    NOTE(review): presumably mirrors the options of the training script - confirm against it.\n",
    "    \"\"\"\n",
    "    def __init__(self):\n",
    "        # NOTE(review): `datasets` is not defined anywhere in this notebook; it must be\n",
    "        # assigned before this cell is run, otherwise Args() raises NameError.\n",
    "        self.data_train = datasets\n",
    "        self.data_val = datasets\n",
    "        #self.data_train = files_train\n",
    "        self.data_config = \"config_files/config_jets_1_delphes.yaml\"\n",
    "        self.extra_selection = None\n",
    "        self.train_val_split = 1.0\n",
    "        self.data_fraction = 1\n",
    "        self.file_fraction = 1\n",
    "        self.fetch_by_files = False\n",
    "        self.fetch_step = 1\n",
    "        self.steps_per_epoch = None\n",
    "        self.in_memory = False\n",
    "        self.local_rank = None\n",
    "        self.copy_inputs = False\n",
    "        self.no_remake_weights = False\n",
    "        self.batch_size = 10\n",
    "        self.num_workers = 0\n",
    "        self.demo = False\n",
    "        self.laplace = False\n",
    "        self.diffs = False\n",
    "        self.class_edges = False\n",
    "args = Args()\n",
    "# The training range runs from 0 up to the configured train/val split value.\n",
    "train_range = (0, args.train_val_split)\n",
    "train_file_dict, train_files = to_filelist(args, 'train')\n",
    "# NOTE(review): this rebinds `train_data`, which an earlier cell set to an EventDataset.\n",
    "train_data = SimpleIterDataset(train_file_dict, args.data_config, for_training=True,\n",
    "                               extra_selection=args.extra_selection,\n",
    "                               remake_weights=True,\n",
    "                               load_range_and_fraction=(train_range, args.data_fraction),\n",
    "                               file_fraction=args.file_fraction,\n",
    "                               fetch_by_files=args.fetch_by_files,\n",
    "                               fetch_step=args.fetch_step,\n",
    "                               infinity_mode=False,\n",
    "                               in_memory=args.in_memory,\n",
    "                               async_load=False,\n",
    "                               name='train', jets=True)\n",
    "iterator = iter(train_data)"
   ],
   "id": "7475933352961325"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "",
   "id": "5759f10b89b33d82"
  }
 ],
 "metadata": {
  "kernelspec": {
   "name": "python3",
   "language": "python",
   "display_name": "Python 3 (ipykernel)"
  }
 },
 "nbformat": 5,
 "nbformat_minor": 9
}