UniversalAlgorithmic committed on
Commit 09a2af4 (verified)
1 Parent(s): d8e11b0

Upload 178 files

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +1 -0
  2. hpo-examples/audio-classification/ac/README.md +88 -0
  3. hpo-examples/audio-classification/ac/all_results.json +13 -0
  4. hpo-examples/audio-classification/ac/config.json +147 -0
  5. hpo-examples/audio-classification/ac/eval_results.json +8 -0
  6. hpo-examples/audio-classification/ac/model.safetensors +3 -0
  7. hpo-examples/audio-classification/ac/preprocessor_config.json +9 -0
  8. hpo-examples/audio-classification/ac/runs/May15_03-06-03_cs-Precision-7960-Tower/events.out.tfevents.1747292768.cs-Precision-7960-Tower.146737.0 +3 -0
  9. hpo-examples/audio-classification/ac/runs/May15_03-06-03_cs-Precision-7960-Tower/events.out.tfevents.1747293535.cs-Precision-7960-Tower.146737.1 +3 -0
  10. hpo-examples/audio-classification/ac/train_results.json +8 -0
  11. hpo-examples/audio-classification/ac/trainer_state.json +1598 -0
  12. hpo-examples/audio-classification/ac/training_args.bin +3 -0
  13. hpo-examples/audio-classification/requirements.txt +5 -0
  14. hpo-examples/audio-classification/run.sh +30 -0
  15. hpo-examples/audio-classification/run_audio_classification.py +462 -0
  16. hpo-examples/audio-classification/trplib.py +1181 -0
  17. hpo-examples/image-classification/__pycache__/presets.cpython-310.pyc +0 -0
  18. hpo-examples/image-classification/__pycache__/sampler.cpython-310.pyc +0 -0
  19. hpo-examples/image-classification/__pycache__/transforms.cpython-310.pyc +0 -0
  20. hpo-examples/image-classification/__pycache__/trplib.cpython-310.pyc +0 -0
  21. hpo-examples/image-classification/__pycache__/utils.cpython-310.pyc +0 -0
  22. hpo-examples/image-classification/efficientnet_v2_m/model_7.pth +3 -0
  23. hpo-examples/image-classification/mobilenetv2/model_32.pth +3 -0
  24. hpo-examples/image-classification/presets.py +71 -0
  25. hpo-examples/image-classification/resnet50/model_35.pth +3 -0
  26. hpo-examples/image-classification/run.sh +49 -0
  27. hpo-examples/image-classification/sampler.py +62 -0
  28. hpo-examples/image-classification/train.py +524 -0
  29. hpo-examples/image-classification/train_quantization.py +265 -0
  30. hpo-examples/image-classification/transforms.py +183 -0
  31. hpo-examples/image-classification/trplib.py +1181 -0
  32. hpo-examples/image-classification/utils.py +465 -0
  33. hpo-examples/image-classification/vit_b_16/model_4.pth +3 -0
  34. hpo-examples/question-answering/qa/README.md +55 -0
  35. hpo-examples/question-answering/qa/all_results.json +15 -0
  36. hpo-examples/question-answering/qa/config.json +26 -0
  37. hpo-examples/question-answering/qa/eval_nbest_predictions.json +3 -0
  38. hpo-examples/question-answering/qa/eval_predictions.json +0 -0
  39. hpo-examples/question-answering/qa/eval_results.json +9 -0
  40. hpo-examples/question-answering/qa/model.safetensors +3 -0
  41. hpo-examples/question-answering/qa/runs/May15_03-24-14_cs-Precision-7960-Tower/events.out.tfevents.1747293859.cs-Precision-7960-Tower.147971.0 +3 -0
  42. hpo-examples/question-answering/qa/runs/May15_03-24-14_cs-Precision-7960-Tower/events.out.tfevents.1747297197.cs-Precision-7960-Tower.147971.1 +3 -0
  43. hpo-examples/question-answering/qa/special_tokens_map.json +7 -0
  44. hpo-examples/question-answering/qa/tokenizer.json +0 -0
  45. hpo-examples/question-answering/qa/tokenizer_config.json +56 -0
  46. hpo-examples/question-answering/qa/train_results.json +9 -0
  47. hpo-examples/question-answering/qa/trainer_state.json +245 -0
  48. hpo-examples/question-answering/qa/training_args.bin +3 -0
  49. hpo-examples/question-answering/qa/vocab.txt +0 -0
  50. hpo-examples/question-answering/requirements.txt +4 -0
.gitattributes CHANGED
@@ -37,3 +37,4 @@ qa/eval_nbest_predictions.json filter=lfs diff=lfs merge=lfs -text
37
  qa/sequential-policy-gradient.pdf filter=lfs diff=lfs merge=lfs -text
38
  sequential-policy-gradient.png filter=lfs diff=lfs merge=lfs -text
39
  examples/question-answering/qa/eval_nbest_predictions.json filter=lfs diff=lfs merge=lfs -text
40
+ hpo-examples/question-answering/qa/eval_nbest_predictions.json filter=lfs diff=lfs merge=lfs -text
hpo-examples/audio-classification/ac/README.md ADDED
@@ -0,0 +1,88 @@
1
+ ---
2
+ library_name: transformers
3
+ license: apache-2.0
4
+ base_model: facebook/wav2vec2-base
5
+ tags:
6
+ - audio-classification
7
+ - generated_from_trainer
8
+ datasets:
9
+ - superb
10
+ metrics:
11
+ - accuracy
12
+ model-index:
13
+ - name: wav2vec2-base-ft-keyword-spotting
14
+ results:
15
+ - task:
16
+ name: Audio Classification
17
+ type: audio-classification
18
+ dataset:
19
+ name: superb
20
+ type: superb
21
+ config: ks
22
+ split: validation
23
+ args: ks
24
+ metrics:
25
+ - name: Accuracy
26
+ type: accuracy
27
+ value: 0.9826419535157399
28
+ ---
29
+
30
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
31
+ should probably proofread and complete it, then remove this comment. -->
32
+
33
+ # wav2vec2-base-ft-keyword-spotting
34
+
35
+ This model is a fine-tuned version of [facebook/wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base) on the superb dataset.
36
+ It achieves the following results on the evaluation set:
37
+ - Loss: 0.0954
38
+ - Accuracy: 0.9826
39
+
40
+ ## Model description
41
+
42
+ More information needed
43
+
44
+ ## Intended uses & limitations
45
+
46
+ More information needed
47
+
48
+ ## Training and evaluation data
49
+
50
+ More information needed
51
+
52
+ ## Training procedure
53
+
54
+ ### Training hyperparameters
55
+
56
+ The following hyperparameters were used during training:
57
+ - learning_rate: 3e-05
58
+ - train_batch_size: 48
59
+ - eval_batch_size: 32
60
+ - seed: 0
61
+ - gradient_accumulation_steps: 4
62
+ - total_train_batch_size: 192
63
+ - optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
64
+ - lr_scheduler_type: linear
65
+ - lr_scheduler_warmup_ratio: 0.1
66
+ - num_epochs: 8.0
67
+ - mixed_precision_training: Native AMP
68
+
69
+ ### Training results
70
+
71
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy |
72
+ |:-------------:|:------:|:----:|:---------------:|:--------:|
73
+ | 1.3624 | 1.0 | 267 | 1.1959 | 0.6546 |
74
+ | 0.3854 | 2.0 | 534 | 0.2675 | 0.9734 |
75
+ | 0.2473 | 3.0 | 801 | 0.1461 | 0.9768 |
76
+ | 0.1997 | 4.0 | 1068 | 0.1088 | 0.9804 |
77
+ | 0.1723 | 5.0 | 1335 | 0.0954 | 0.9826 |
78
+ | 0.1442 | 6.0 | 1602 | 0.0927 | 0.9813 |
79
+ | 0.1397 | 7.0 | 1869 | 0.0892 | 0.9812 |
80
+ | 0.1368 | 7.9728 | 2128 | 0.0896 | 0.9812 |
81
+
82
+
83
+ ### Framework versions
84
+
85
+ - Transformers 4.49.0
86
+ - Pytorch 2.6.0+cu118
87
+ - Datasets 3.3.1
88
+ - Tokenizers 0.21.0
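
(Not part of the uploaded files.) A minimal inference sketch for the checkpoint described in this model card, using the standard `transformers` audio-classification pipeline. The local path and the `sample.wav` filename are assumptions for illustration; any 16 kHz audio clip or a Hub model id would work the same way.

```python
# Illustrative only: load the fine-tuned keyword-spotting checkpoint
# uploaded under hpo-examples/audio-classification/ac and classify a clip.
from transformers import pipeline

classifier = pipeline(
    "audio-classification",
    model="hpo-examples/audio-classification/ac",  # local checkpoint dir (or a Hub model id)
)

# preprocessor_config.json specifies sampling_rate: 16000, so the input
# audio should be 16 kHz mono.
predictions = classifier("sample.wav", top_k=3)  # "sample.wav" is a placeholder
for p in predictions:
    print(f"{p['label']}: {p['score']:.3f}")
```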
hpo-examples/audio-classification/ac/all_results.json ADDED
@@ -0,0 +1,13 @@
1
+ {
2
+ "epoch": 7.972769953051643,
3
+ "eval_accuracy": 0.9826419535157399,
4
+ "eval_loss": 0.09542840719223022,
5
+ "eval_runtime": 5.5538,
6
+ "eval_samples_per_second": 1224.023,
7
+ "eval_steps_per_second": 38.352,
8
+ "total_flos": 3.767900833756416e+18,
9
+ "train_loss": 0.5178930132572812,
10
+ "train_runtime": 756.2923,
11
+ "train_samples_per_second": 540.468,
12
+ "train_steps_per_second": 2.814
13
+ }
hpo-examples/audio-classification/ac/config.json ADDED
@@ -0,0 +1,147 @@
1
+ {
2
+ "_name_or_path": "facebook/wav2vec2-base",
3
+ "activation_dropout": 0.0,
4
+ "adapter_attn_dim": null,
5
+ "adapter_kernel_size": 3,
6
+ "adapter_stride": 2,
7
+ "add_adapter": false,
8
+ "apply_spec_augment": true,
9
+ "architectures": [
10
+ "Wav2Vec2ForSequenceClassification"
11
+ ],
12
+ "attention_dropout": 0.1,
13
+ "bos_token_id": 1,
14
+ "classifier_proj_size": 256,
15
+ "codevector_dim": 256,
16
+ "contrastive_logits_temperature": 0.1,
17
+ "conv_bias": false,
18
+ "conv_dim": [
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512,
25
+ 512
26
+ ],
27
+ "conv_kernel": [
28
+ 10,
29
+ 3,
30
+ 3,
31
+ 3,
32
+ 3,
33
+ 2,
34
+ 2
35
+ ],
36
+ "conv_stride": [
37
+ 5,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2,
43
+ 2
44
+ ],
45
+ "ctc_loss_reduction": "sum",
46
+ "ctc_zero_infinity": false,
47
+ "diversity_loss_weight": 0.1,
48
+ "do_stable_layer_norm": false,
49
+ "eos_token_id": 2,
50
+ "feat_extract_activation": "gelu",
51
+ "feat_extract_norm": "group",
52
+ "feat_proj_dropout": 0.1,
53
+ "feat_quantizer_dropout": 0.0,
54
+ "final_dropout": 0.0,
55
+ "finetuning_task": "audio-classification",
56
+ "freeze_feat_extract_train": true,
57
+ "hidden_act": "gelu",
58
+ "hidden_dropout": 0.1,
59
+ "hidden_size": 768,
60
+ "id2label": {
61
+ "0": "yes",
62
+ "1": "no",
63
+ "10": "_silence_",
64
+ "11": "_unknown_",
65
+ "2": "up",
66
+ "3": "down",
67
+ "4": "left",
68
+ "5": "right",
69
+ "6": "on",
70
+ "7": "off",
71
+ "8": "stop",
72
+ "9": "go"
73
+ },
74
+ "initializer_range": 0.02,
75
+ "intermediate_size": 3072,
76
+ "label2id": {
77
+ "_silence_": "10",
78
+ "_unknown_": "11",
79
+ "down": "3",
80
+ "go": "9",
81
+ "left": "4",
82
+ "no": "1",
83
+ "off": "7",
84
+ "on": "6",
85
+ "right": "5",
86
+ "stop": "8",
87
+ "up": "2",
88
+ "yes": "0"
89
+ },
90
+ "layer_norm_eps": 1e-05,
91
+ "layerdrop": 0.0,
92
+ "mask_channel_length": 10,
93
+ "mask_channel_min_space": 1,
94
+ "mask_channel_other": 0.0,
95
+ "mask_channel_prob": 0.0,
96
+ "mask_channel_selection": "static",
97
+ "mask_feature_length": 10,
98
+ "mask_feature_min_masks": 0,
99
+ "mask_feature_prob": 0.0,
100
+ "mask_time_length": 10,
101
+ "mask_time_min_masks": 2,
102
+ "mask_time_min_space": 1,
103
+ "mask_time_other": 0.0,
104
+ "mask_time_prob": 0.05,
105
+ "mask_time_selection": "static",
106
+ "model_type": "wav2vec2",
107
+ "no_mask_channel_overlap": false,
108
+ "no_mask_time_overlap": false,
109
+ "num_adapter_layers": 3,
110
+ "num_attention_heads": 12,
111
+ "num_codevector_groups": 2,
112
+ "num_codevectors_per_group": 320,
113
+ "num_conv_pos_embedding_groups": 16,
114
+ "num_conv_pos_embeddings": 128,
115
+ "num_feat_extract_layers": 7,
116
+ "num_hidden_layers": 12,
117
+ "num_negatives": 100,
118
+ "output_hidden_size": 768,
119
+ "pad_token_id": 0,
120
+ "proj_codevector_dim": 256,
121
+ "tdnn_dilation": [
122
+ 1,
123
+ 2,
124
+ 3,
125
+ 1,
126
+ 1
127
+ ],
128
+ "tdnn_dim": [
129
+ 512,
130
+ 512,
131
+ 512,
132
+ 512,
133
+ 1500
134
+ ],
135
+ "tdnn_kernel": [
136
+ 5,
137
+ 3,
138
+ 3,
139
+ 1,
140
+ 1
141
+ ],
142
+ "torch_dtype": "float32",
143
+ "transformers_version": "4.49.0",
144
+ "use_weighted_layer_sum": false,
145
+ "vocab_size": 32,
146
+ "xvector_output_dim": 512
147
+ }
hpo-examples/audio-classification/ac/eval_results.json ADDED
@@ -0,0 +1,8 @@
1
+ {
2
+ "epoch": 7.972769953051643,
3
+ "eval_accuracy": 0.9826419535157399,
4
+ "eval_loss": 0.09542840719223022,
5
+ "eval_runtime": 5.5538,
6
+ "eval_samples_per_second": 1224.023,
7
+ "eval_steps_per_second": 38.352
8
+ }
hpo-examples/audio-classification/ac/model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6e4f1c85d883f3e41ebfab4cd7752ab2e6d6b968b847795be22e1e0662657a3
3
+ size 385400352
hpo-examples/audio-classification/ac/preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0.0,
7
+ "return_attention_mask": false,
8
+ "sampling_rate": 16000
9
+ }
hpo-examples/audio-classification/ac/runs/May15_03-06-03_cs-Precision-7960-Tower/events.out.tfevents.1747292768.cs-Precision-7960-Tower.146737.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:75015326d15cb1786d8788b228e95bd7e952bad9e8e8be4ed02764cc17c8464c
3
+ size 54953
hpo-examples/audio-classification/ac/runs/May15_03-06-03_cs-Precision-7960-Tower/events.out.tfevents.1747293535.cs-Precision-7960-Tower.146737.1 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:215a99aa1f96db505ff53265c988d18ddcf70f16a1f13e713316a0354b768356
3
+ size 411
hpo-examples/audio-classification/ac/train_results.json ADDED
@@ -0,0 +1,8 @@
1
+ {
2
+ "epoch": 7.972769953051643,
3
+ "total_flos": 3.767900833756416e+18,
4
+ "train_loss": 0.5178930132572812,
5
+ "train_runtime": 756.2923,
6
+ "train_samples_per_second": 540.468,
7
+ "train_steps_per_second": 2.814
8
+ }
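
(Illustrative sanity check, not part of the upload.) The throughput figures in train_results.json are internally consistent with the batch configuration reported in the model card (train_batch_size 48 with 4 gradient-accumulation steps, i.e. an effective batch of 192). The short sketch below only re-derives those numbers from the logged values; the "~51k clips per epoch" reading of the superb keyword-spotting training split is an interpretation, not a logged value.

```python
# Re-derive effective batch size and per-epoch sample count from train_results.json.
train_runtime = 756.2923          # seconds
samples_per_second = 540.468
steps_per_second = 2.814
epochs = 7.972769953051643

total_samples = train_runtime * samples_per_second   # ~408,750 samples processed
total_steps = train_runtime * steps_per_second       # ~2,128 optimizer steps
effective_batch = total_samples / total_steps        # ~192 = 48 * 4 grad-accum
samples_per_epoch = total_samples / epochs           # ~51k clips per epoch

print(round(total_steps), round(effective_batch), round(samples_per_epoch))
```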
hpo-examples/audio-classification/ac/trainer_state.json ADDED
@@ -0,0 +1,1598 @@
1
+ {
2
+ "best_metric": 0.9826419535157399,
3
+ "best_model_checkpoint": "wav2vec2-base-ft-keyword-spotting/checkpoint-1335",
4
+ "epoch": 7.972769953051643,
5
+ "eval_steps": 500,
6
+ "global_step": 2128,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.03755868544600939,
13
+ "grad_norm": 2.0377416610717773,
14
+ "learning_rate": 1.4084507042253521e-06,
15
+ "loss": 3.8687,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.07511737089201878,
20
+ "grad_norm": 3.055781602859497,
21
+ "learning_rate": 2.8169014084507042e-06,
22
+ "loss": 4.1156,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.11267605633802817,
27
+ "grad_norm": 3.383268356323242,
28
+ "learning_rate": 4.225352112676057e-06,
29
+ "loss": 4.0885,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.15023474178403756,
34
+ "grad_norm": 3.8566606044769287,
35
+ "learning_rate": 5.6338028169014084e-06,
36
+ "loss": 3.9316,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.18779342723004694,
41
+ "grad_norm": 5.065456867218018,
42
+ "learning_rate": 7.042253521126761e-06,
43
+ "loss": 3.6474,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.22535211267605634,
48
+ "grad_norm": 5.89341926574707,
49
+ "learning_rate": 8.450704225352114e-06,
50
+ "loss": 3.2124,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.26291079812206575,
55
+ "grad_norm": 5.9929399490356445,
56
+ "learning_rate": 9.859154929577466e-06,
57
+ "loss": 2.756,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.3004694835680751,
62
+ "grad_norm": 5.689433574676514,
63
+ "learning_rate": 1.1267605633802817e-05,
64
+ "loss": 2.4596,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.3380281690140845,
69
+ "grad_norm": 4.89589262008667,
70
+ "learning_rate": 1.267605633802817e-05,
71
+ "loss": 2.2638,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.3755868544600939,
76
+ "grad_norm": 4.8666839599609375,
77
+ "learning_rate": 1.4084507042253522e-05,
78
+ "loss": 2.1166,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.4131455399061033,
83
+ "grad_norm": 4.466708660125732,
84
+ "learning_rate": 1.5492957746478876e-05,
85
+ "loss": 2.0048,
86
+ "step": 110
87
+ },
88
+ {
89
+ "epoch": 0.4507042253521127,
90
+ "grad_norm": 3.676050901412964,
91
+ "learning_rate": 1.6901408450704228e-05,
92
+ "loss": 1.9138,
93
+ "step": 120
94
+ },
95
+ {
96
+ "epoch": 0.48826291079812206,
97
+ "grad_norm": 2.183825731277466,
98
+ "learning_rate": 1.830985915492958e-05,
99
+ "loss": 1.863,
100
+ "step": 130
101
+ },
102
+ {
103
+ "epoch": 0.5258215962441315,
104
+ "grad_norm": 2.075413465499878,
105
+ "learning_rate": 1.9718309859154933e-05,
106
+ "loss": 1.7616,
107
+ "step": 140
108
+ },
109
+ {
110
+ "epoch": 0.5633802816901409,
111
+ "grad_norm": 0.8534318208694458,
112
+ "learning_rate": 2.112676056338028e-05,
113
+ "loss": 1.7185,
114
+ "step": 150
115
+ },
116
+ {
117
+ "epoch": 0.6009389671361502,
118
+ "grad_norm": 0.9039830565452576,
119
+ "learning_rate": 2.2535211267605634e-05,
120
+ "loss": 1.8054,
121
+ "step": 160
122
+ },
123
+ {
124
+ "epoch": 0.6384976525821596,
125
+ "grad_norm": 1.32124662399292,
126
+ "learning_rate": 2.3943661971830986e-05,
127
+ "loss": 1.7367,
128
+ "step": 170
129
+ },
130
+ {
131
+ "epoch": 0.676056338028169,
132
+ "grad_norm": 1.232069969177246,
133
+ "learning_rate": 2.535211267605634e-05,
134
+ "loss": 1.7423,
135
+ "step": 180
136
+ },
137
+ {
138
+ "epoch": 0.7136150234741784,
139
+ "grad_norm": 1.9570960998535156,
140
+ "learning_rate": 2.676056338028169e-05,
141
+ "loss": 1.6132,
142
+ "step": 190
143
+ },
144
+ {
145
+ "epoch": 0.7511737089201878,
146
+ "grad_norm": 2.4463119506835938,
147
+ "learning_rate": 2.8169014084507043e-05,
148
+ "loss": 1.6099,
149
+ "step": 200
150
+ },
151
+ {
152
+ "epoch": 0.7887323943661971,
153
+ "grad_norm": 6.601908206939697,
154
+ "learning_rate": 2.9577464788732395e-05,
155
+ "loss": 1.6043,
156
+ "step": 210
157
+ },
158
+ {
159
+ "epoch": 0.8262910798122066,
160
+ "grad_norm": 3.225101947784424,
161
+ "learning_rate": 2.989033942558747e-05,
162
+ "loss": 1.5621,
163
+ "step": 220
164
+ },
165
+ {
166
+ "epoch": 0.863849765258216,
167
+ "grad_norm": 3.698263645172119,
168
+ "learning_rate": 2.9733681462140994e-05,
169
+ "loss": 1.514,
170
+ "step": 230
171
+ },
172
+ {
173
+ "epoch": 0.9014084507042254,
174
+ "grad_norm": 5.209756374359131,
175
+ "learning_rate": 2.9577023498694518e-05,
176
+ "loss": 1.4532,
177
+ "step": 240
178
+ },
179
+ {
180
+ "epoch": 0.9389671361502347,
181
+ "grad_norm": 2.1304848194122314,
182
+ "learning_rate": 2.9420365535248042e-05,
183
+ "loss": 1.4312,
184
+ "step": 250
185
+ },
186
+ {
187
+ "epoch": 0.9765258215962441,
188
+ "grad_norm": 4.837350368499756,
189
+ "learning_rate": 2.926370757180157e-05,
190
+ "loss": 1.3624,
191
+ "step": 260
192
+ },
193
+ {
194
+ "epoch": 1.0,
195
+ "eval_accuracy": 0.6546042953809944,
196
+ "eval_loss": 1.19585382938385,
197
+ "eval_runtime": 4.9178,
198
+ "eval_samples_per_second": 1382.328,
199
+ "eval_steps_per_second": 43.312,
200
+ "step": 267
201
+ },
202
+ {
203
+ "epoch": 1.0112676056338028,
204
+ "grad_norm": 4.779292106628418,
205
+ "learning_rate": 2.9107049608355094e-05,
206
+ "loss": 1.2541,
207
+ "step": 270
208
+ },
209
+ {
210
+ "epoch": 1.0488262910798123,
211
+ "grad_norm": 3.60760498046875,
212
+ "learning_rate": 2.8950391644908618e-05,
213
+ "loss": 1.2271,
214
+ "step": 280
215
+ },
216
+ {
217
+ "epoch": 1.0863849765258216,
218
+ "grad_norm": 2.3788599967956543,
219
+ "learning_rate": 2.8793733681462142e-05,
220
+ "loss": 1.2335,
221
+ "step": 290
222
+ },
223
+ {
224
+ "epoch": 1.123943661971831,
225
+ "grad_norm": 3.353325843811035,
226
+ "learning_rate": 2.8637075718015666e-05,
227
+ "loss": 1.1613,
228
+ "step": 300
229
+ },
230
+ {
231
+ "epoch": 1.1615023474178403,
232
+ "grad_norm": 4.326411247253418,
233
+ "learning_rate": 2.8480417754569193e-05,
234
+ "loss": 1.0754,
235
+ "step": 310
236
+ },
237
+ {
238
+ "epoch": 1.1990610328638498,
239
+ "grad_norm": 3.1939706802368164,
240
+ "learning_rate": 2.8323759791122717e-05,
241
+ "loss": 1.0353,
242
+ "step": 320
243
+ },
244
+ {
245
+ "epoch": 1.236619718309859,
246
+ "grad_norm": 2.8827011585235596,
247
+ "learning_rate": 2.816710182767624e-05,
248
+ "loss": 0.9806,
249
+ "step": 330
250
+ },
251
+ {
252
+ "epoch": 1.2741784037558685,
253
+ "grad_norm": 3.910698652267456,
254
+ "learning_rate": 2.8010443864229766e-05,
255
+ "loss": 1.0813,
256
+ "step": 340
257
+ },
258
+ {
259
+ "epoch": 1.3117370892018778,
260
+ "grad_norm": 3.5916378498077393,
261
+ "learning_rate": 2.7853785900783293e-05,
262
+ "loss": 0.9792,
263
+ "step": 350
264
+ },
265
+ {
266
+ "epoch": 1.3492957746478873,
267
+ "grad_norm": 2.6981167793273926,
268
+ "learning_rate": 2.7697127937336817e-05,
269
+ "loss": 0.9231,
270
+ "step": 360
271
+ },
272
+ {
273
+ "epoch": 1.3868544600938968,
274
+ "grad_norm": 5.702897071838379,
275
+ "learning_rate": 2.754046997389034e-05,
276
+ "loss": 0.9435,
277
+ "step": 370
278
+ },
279
+ {
280
+ "epoch": 1.424413145539906,
281
+ "grad_norm": 4.622363090515137,
282
+ "learning_rate": 2.7383812010443865e-05,
283
+ "loss": 0.8449,
284
+ "step": 380
285
+ },
286
+ {
287
+ "epoch": 1.4619718309859155,
288
+ "grad_norm": 2.2103636264801025,
289
+ "learning_rate": 2.7227154046997393e-05,
290
+ "loss": 0.7713,
291
+ "step": 390
292
+ },
293
+ {
294
+ "epoch": 1.4995305164319248,
295
+ "grad_norm": 4.545182228088379,
296
+ "learning_rate": 2.7070496083550917e-05,
297
+ "loss": 0.7719,
298
+ "step": 400
299
+ },
300
+ {
301
+ "epoch": 1.5370892018779343,
302
+ "grad_norm": 6.883026599884033,
303
+ "learning_rate": 2.691383812010444e-05,
304
+ "loss": 0.7564,
305
+ "step": 410
306
+ },
307
+ {
308
+ "epoch": 1.5746478873239438,
309
+ "grad_norm": 4.770920276641846,
310
+ "learning_rate": 2.6757180156657965e-05,
311
+ "loss": 0.6994,
312
+ "step": 420
313
+ },
314
+ {
315
+ "epoch": 1.612206572769953,
316
+ "grad_norm": 4.413459300994873,
317
+ "learning_rate": 2.660052219321149e-05,
318
+ "loss": 0.6313,
319
+ "step": 430
320
+ },
321
+ {
322
+ "epoch": 1.6497652582159623,
323
+ "grad_norm": 2.0261390209198,
324
+ "learning_rate": 2.6443864229765013e-05,
325
+ "loss": 0.6017,
326
+ "step": 440
327
+ },
328
+ {
329
+ "epoch": 1.6873239436619718,
330
+ "grad_norm": 5.67121696472168,
331
+ "learning_rate": 2.6287206266318537e-05,
332
+ "loss": 0.5792,
333
+ "step": 450
334
+ },
335
+ {
336
+ "epoch": 1.7248826291079813,
337
+ "grad_norm": 2.573594808578491,
338
+ "learning_rate": 2.6146214099216712e-05,
339
+ "loss": 0.545,
340
+ "step": 460
341
+ },
342
+ {
343
+ "epoch": 1.7624413145539906,
344
+ "grad_norm": 4.145854949951172,
345
+ "learning_rate": 2.5989556135770236e-05,
346
+ "loss": 0.4907,
347
+ "step": 470
348
+ },
349
+ {
350
+ "epoch": 1.8,
351
+ "grad_norm": 1.7418975830078125,
352
+ "learning_rate": 2.583289817232376e-05,
353
+ "loss": 0.485,
354
+ "step": 480
355
+ },
356
+ {
357
+ "epoch": 1.8375586854460093,
358
+ "grad_norm": 4.651867866516113,
359
+ "learning_rate": 2.5676240208877287e-05,
360
+ "loss": 0.4572,
361
+ "step": 490
362
+ },
363
+ {
364
+ "epoch": 1.8751173708920188,
365
+ "grad_norm": 4.849829196929932,
366
+ "learning_rate": 2.551958224543081e-05,
367
+ "loss": 0.4864,
368
+ "step": 500
369
+ },
370
+ {
371
+ "epoch": 1.9126760563380283,
372
+ "grad_norm": 2.631229877471924,
373
+ "learning_rate": 2.5362924281984335e-05,
374
+ "loss": 0.4035,
375
+ "step": 510
376
+ },
377
+ {
378
+ "epoch": 1.9502347417840376,
379
+ "grad_norm": 5.099828243255615,
380
+ "learning_rate": 2.520626631853786e-05,
381
+ "loss": 0.3818,
382
+ "step": 520
383
+ },
384
+ {
385
+ "epoch": 1.9877934272300468,
386
+ "grad_norm": 3.25174617767334,
387
+ "learning_rate": 2.5049608355091387e-05,
388
+ "loss": 0.3854,
389
+ "step": 530
390
+ },
391
+ {
392
+ "epoch": 2.0,
393
+ "eval_accuracy": 0.9733745219182113,
394
+ "eval_loss": 0.2675245702266693,
395
+ "eval_runtime": 5.0739,
396
+ "eval_samples_per_second": 1339.801,
397
+ "eval_steps_per_second": 41.98,
398
+ "step": 534
399
+ },
400
+ {
401
+ "epoch": 2.0225352112676056,
402
+ "grad_norm": 4.533545017242432,
403
+ "learning_rate": 2.489295039164491e-05,
404
+ "loss": 0.3589,
405
+ "step": 540
406
+ },
407
+ {
408
+ "epoch": 2.060093896713615,
409
+ "grad_norm": 4.4245991706848145,
410
+ "learning_rate": 2.4736292428198435e-05,
411
+ "loss": 0.378,
412
+ "step": 550
413
+ },
414
+ {
415
+ "epoch": 2.0976525821596246,
416
+ "grad_norm": 5.778880596160889,
417
+ "learning_rate": 2.457963446475196e-05,
418
+ "loss": 0.3653,
419
+ "step": 560
420
+ },
421
+ {
422
+ "epoch": 2.1352112676056336,
423
+ "grad_norm": 3.5573890209198,
424
+ "learning_rate": 2.4422976501305487e-05,
425
+ "loss": 0.3107,
426
+ "step": 570
427
+ },
428
+ {
429
+ "epoch": 2.172769953051643,
430
+ "grad_norm": 3.655824899673462,
431
+ "learning_rate": 2.426631853785901e-05,
432
+ "loss": 0.3405,
433
+ "step": 580
434
+ },
435
+ {
436
+ "epoch": 2.2103286384976526,
437
+ "grad_norm": 2.430022954940796,
438
+ "learning_rate": 2.4109660574412535e-05,
439
+ "loss": 0.3298,
440
+ "step": 590
441
+ },
442
+ {
443
+ "epoch": 2.247887323943662,
444
+ "grad_norm": 2.9207568168640137,
445
+ "learning_rate": 2.3953002610966055e-05,
446
+ "loss": 0.3022,
447
+ "step": 600
448
+ },
449
+ {
450
+ "epoch": 2.2854460093896716,
451
+ "grad_norm": 4.8787007331848145,
452
+ "learning_rate": 2.3796344647519583e-05,
453
+ "loss": 0.3991,
454
+ "step": 610
455
+ },
456
+ {
457
+ "epoch": 2.3230046948356806,
458
+ "grad_norm": 3.0268468856811523,
459
+ "learning_rate": 2.3639686684073107e-05,
460
+ "loss": 0.3159,
461
+ "step": 620
462
+ },
463
+ {
464
+ "epoch": 2.36056338028169,
465
+ "grad_norm": 2.6611557006835938,
466
+ "learning_rate": 2.348302872062663e-05,
467
+ "loss": 0.2868,
468
+ "step": 630
469
+ },
470
+ {
471
+ "epoch": 2.3981220657276996,
472
+ "grad_norm": 2.485551595687866,
473
+ "learning_rate": 2.3326370757180155e-05,
474
+ "loss": 0.3032,
475
+ "step": 640
476
+ },
477
+ {
478
+ "epoch": 2.435680751173709,
479
+ "grad_norm": 4.556153297424316,
480
+ "learning_rate": 2.316971279373368e-05,
481
+ "loss": 0.2985,
482
+ "step": 650
483
+ },
484
+ {
485
+ "epoch": 2.473239436619718,
486
+ "grad_norm": 5.270796298980713,
487
+ "learning_rate": 2.3013054830287207e-05,
488
+ "loss": 0.2839,
489
+ "step": 660
490
+ },
491
+ {
492
+ "epoch": 2.5107981220657276,
493
+ "grad_norm": 3.347005844116211,
494
+ "learning_rate": 2.285639686684073e-05,
495
+ "loss": 0.2871,
496
+ "step": 670
497
+ },
498
+ {
499
+ "epoch": 2.548356807511737,
500
+ "grad_norm": 5.236591815948486,
501
+ "learning_rate": 2.2699738903394255e-05,
502
+ "loss": 0.3028,
503
+ "step": 680
504
+ },
505
+ {
506
+ "epoch": 2.5859154929577466,
507
+ "grad_norm": 2.995059013366699,
508
+ "learning_rate": 2.254308093994778e-05,
509
+ "loss": 0.2537,
510
+ "step": 690
511
+ },
512
+ {
513
+ "epoch": 2.6234741784037556,
514
+ "grad_norm": 2.805640459060669,
515
+ "learning_rate": 2.2386422976501306e-05,
516
+ "loss": 0.297,
517
+ "step": 700
518
+ },
519
+ {
520
+ "epoch": 2.661032863849765,
521
+ "grad_norm": 3.0646071434020996,
522
+ "learning_rate": 2.222976501305483e-05,
523
+ "loss": 0.2453,
524
+ "step": 710
525
+ },
526
+ {
527
+ "epoch": 2.6985915492957746,
528
+ "grad_norm": 3.6719613075256348,
529
+ "learning_rate": 2.2073107049608354e-05,
530
+ "loss": 0.2655,
531
+ "step": 720
532
+ },
533
+ {
534
+ "epoch": 2.736150234741784,
535
+ "grad_norm": 3.2248122692108154,
536
+ "learning_rate": 2.191644908616188e-05,
537
+ "loss": 0.2297,
538
+ "step": 730
539
+ },
540
+ {
541
+ "epoch": 2.7737089201877936,
542
+ "grad_norm": 3.769843578338623,
543
+ "learning_rate": 2.1759791122715406e-05,
544
+ "loss": 0.2548,
545
+ "step": 740
546
+ },
547
+ {
548
+ "epoch": 2.8112676056338026,
549
+ "grad_norm": 3.6679906845092773,
550
+ "learning_rate": 2.160313315926893e-05,
551
+ "loss": 0.2836,
552
+ "step": 750
553
+ },
554
+ {
555
+ "epoch": 2.848826291079812,
556
+ "grad_norm": 1.6924936771392822,
557
+ "learning_rate": 2.1446475195822454e-05,
558
+ "loss": 0.2555,
559
+ "step": 760
560
+ },
561
+ {
562
+ "epoch": 2.8863849765258216,
563
+ "grad_norm": 2.1275901794433594,
564
+ "learning_rate": 2.1289817232375978e-05,
565
+ "loss": 0.2334,
566
+ "step": 770
567
+ },
568
+ {
569
+ "epoch": 2.923943661971831,
570
+ "grad_norm": 6.528135299682617,
571
+ "learning_rate": 2.1133159268929506e-05,
572
+ "loss": 0.2544,
573
+ "step": 780
574
+ },
575
+ {
576
+ "epoch": 2.9615023474178406,
577
+ "grad_norm": 2.4497199058532715,
578
+ "learning_rate": 2.097650130548303e-05,
579
+ "loss": 0.2628,
580
+ "step": 790
581
+ },
582
+ {
583
+ "epoch": 2.9990610328638496,
584
+ "grad_norm": 2.278947591781616,
585
+ "learning_rate": 2.0819843342036554e-05,
586
+ "loss": 0.2473,
587
+ "step": 800
588
+ },
589
+ {
590
+ "epoch": 3.0,
591
+ "eval_accuracy": 0.9767578699617535,
592
+ "eval_loss": 0.1461225152015686,
593
+ "eval_runtime": 5.0057,
594
+ "eval_samples_per_second": 1358.045,
595
+ "eval_steps_per_second": 42.551,
596
+ "step": 801
597
+ },
598
+ {
599
+ "epoch": 3.0338028169014084,
600
+ "grad_norm": 3.1185402870178223,
601
+ "learning_rate": 2.0663185378590078e-05,
602
+ "loss": 0.2245,
603
+ "step": 810
604
+ },
605
+ {
606
+ "epoch": 3.071361502347418,
607
+ "grad_norm": 2.456102132797241,
608
+ "learning_rate": 2.0506527415143602e-05,
609
+ "loss": 0.2423,
610
+ "step": 820
611
+ },
612
+ {
613
+ "epoch": 3.1089201877934274,
614
+ "grad_norm": 2.9463231563568115,
615
+ "learning_rate": 2.034986945169713e-05,
616
+ "loss": 0.2274,
617
+ "step": 830
618
+ },
619
+ {
620
+ "epoch": 3.1464788732394364,
621
+ "grad_norm": 3.5940473079681396,
622
+ "learning_rate": 2.0193211488250653e-05,
623
+ "loss": 0.2368,
624
+ "step": 840
625
+ },
626
+ {
627
+ "epoch": 3.184037558685446,
628
+ "grad_norm": 4.721577167510986,
629
+ "learning_rate": 2.0036553524804177e-05,
630
+ "loss": 0.2554,
631
+ "step": 850
632
+ },
633
+ {
634
+ "epoch": 3.2215962441314554,
635
+ "grad_norm": 2.496495485305786,
636
+ "learning_rate": 1.98798955613577e-05,
637
+ "loss": 0.2363,
638
+ "step": 860
639
+ },
640
+ {
641
+ "epoch": 3.259154929577465,
642
+ "grad_norm": 3.0665740966796875,
643
+ "learning_rate": 1.972323759791123e-05,
644
+ "loss": 0.2248,
645
+ "step": 870
646
+ },
647
+ {
648
+ "epoch": 3.2967136150234744,
649
+ "grad_norm": 4.336172580718994,
650
+ "learning_rate": 1.9566579634464753e-05,
651
+ "loss": 0.1922,
652
+ "step": 880
653
+ },
654
+ {
655
+ "epoch": 3.3342723004694834,
656
+ "grad_norm": 4.110763072967529,
657
+ "learning_rate": 1.9409921671018277e-05,
658
+ "loss": 0.1965,
659
+ "step": 890
660
+ },
661
+ {
662
+ "epoch": 3.371830985915493,
663
+ "grad_norm": 1.9457247257232666,
664
+ "learning_rate": 1.92532637075718e-05,
665
+ "loss": 0.2258,
666
+ "step": 900
667
+ },
668
+ {
669
+ "epoch": 3.4093896713615024,
670
+ "grad_norm": 2.719369411468506,
671
+ "learning_rate": 1.909660574412533e-05,
672
+ "loss": 0.2184,
673
+ "step": 910
674
+ },
675
+ {
676
+ "epoch": 3.446948356807512,
677
+ "grad_norm": 3.438279151916504,
678
+ "learning_rate": 1.8939947780678853e-05,
679
+ "loss": 0.1964,
680
+ "step": 920
681
+ },
682
+ {
683
+ "epoch": 3.4845070422535214,
684
+ "grad_norm": 3.2813045978546143,
685
+ "learning_rate": 1.8783289817232377e-05,
686
+ "loss": 0.2348,
687
+ "step": 930
688
+ },
689
+ {
690
+ "epoch": 3.5220657276995304,
691
+ "grad_norm": 4.151478290557861,
692
+ "learning_rate": 1.86266318537859e-05,
693
+ "loss": 0.2004,
694
+ "step": 940
695
+ },
696
+ {
697
+ "epoch": 3.55962441314554,
698
+ "grad_norm": 3.4271771907806396,
699
+ "learning_rate": 1.8469973890339425e-05,
700
+ "loss": 0.2039,
701
+ "step": 950
702
+ },
703
+ {
704
+ "epoch": 3.5971830985915494,
705
+ "grad_norm": 4.0341901779174805,
706
+ "learning_rate": 1.8313315926892952e-05,
707
+ "loss": 0.1997,
708
+ "step": 960
709
+ },
710
+ {
711
+ "epoch": 3.6347417840375584,
712
+ "grad_norm": 4.762091636657715,
713
+ "learning_rate": 1.8156657963446476e-05,
714
+ "loss": 0.2153,
715
+ "step": 970
716
+ },
717
+ {
718
+ "epoch": 3.672300469483568,
719
+ "grad_norm": 3.3214402198791504,
720
+ "learning_rate": 1.8e-05,
721
+ "loss": 0.1801,
722
+ "step": 980
723
+ },
724
+ {
725
+ "epoch": 3.7098591549295774,
726
+ "grad_norm": 3.84503173828125,
727
+ "learning_rate": 1.7843342036553525e-05,
728
+ "loss": 0.2106,
729
+ "step": 990
730
+ },
731
+ {
732
+ "epoch": 3.747417840375587,
733
+ "grad_norm": 3.303781747817993,
734
+ "learning_rate": 1.7686684073107052e-05,
735
+ "loss": 0.1965,
736
+ "step": 1000
737
+ },
738
+ {
739
+ "epoch": 3.7849765258215964,
740
+ "grad_norm": 2.691159248352051,
741
+ "learning_rate": 1.7530026109660576e-05,
742
+ "loss": 0.193,
743
+ "step": 1010
744
+ },
745
+ {
746
+ "epoch": 3.8225352112676054,
747
+ "grad_norm": 4.134768009185791,
748
+ "learning_rate": 1.73733681462141e-05,
749
+ "loss": 0.1908,
750
+ "step": 1020
751
+ },
752
+ {
753
+ "epoch": 3.860093896713615,
754
+ "grad_norm": 2.9195241928100586,
755
+ "learning_rate": 1.7216710182767624e-05,
756
+ "loss": 0.1886,
757
+ "step": 1030
758
+ },
759
+ {
760
+ "epoch": 3.8976525821596244,
761
+ "grad_norm": 3.795133352279663,
762
+ "learning_rate": 1.706005221932115e-05,
763
+ "loss": 0.2007,
764
+ "step": 1040
765
+ },
766
+ {
767
+ "epoch": 3.935211267605634,
768
+ "grad_norm": 3.9436607360839844,
769
+ "learning_rate": 1.6903394255874676e-05,
770
+ "loss": 0.1834,
771
+ "step": 1050
772
+ },
773
+ {
774
+ "epoch": 3.9727699530516434,
775
+ "grad_norm": 3.4115564823150635,
776
+ "learning_rate": 1.67467362924282e-05,
777
+ "loss": 0.1997,
778
+ "step": 1060
779
+ },
780
+ {
781
+ "epoch": 4.0,
782
+ "eval_accuracy": 0.980435422182995,
783
+ "eval_loss": 0.10877315700054169,
784
+ "eval_runtime": 4.9191,
785
+ "eval_samples_per_second": 1381.955,
786
+ "eval_steps_per_second": 43.3,
787
+ "step": 1068
788
+ },
789
+ {
790
+ "epoch": 4.007511737089202,
791
+ "grad_norm": 5.121041774749756,
792
+ "learning_rate": 1.6590078328981724e-05,
793
+ "loss": 0.1785,
794
+ "step": 1070
795
+ },
796
+ {
797
+ "epoch": 4.045070422535211,
798
+ "grad_norm": 2.908527374267578,
799
+ "learning_rate": 1.643342036553525e-05,
800
+ "loss": 0.1678,
801
+ "step": 1080
802
+ },
803
+ {
804
+ "epoch": 4.08262910798122,
805
+ "grad_norm": 1.9687402248382568,
806
+ "learning_rate": 1.6276762402088775e-05,
807
+ "loss": 0.192,
808
+ "step": 1090
809
+ },
810
+ {
811
+ "epoch": 4.12018779342723,
812
+ "grad_norm": 2.722937822341919,
813
+ "learning_rate": 1.61201044386423e-05,
814
+ "loss": 0.1983,
815
+ "step": 1100
816
+ },
817
+ {
818
+ "epoch": 4.157746478873239,
819
+ "grad_norm": 2.3741490840911865,
820
+ "learning_rate": 1.5963446475195823e-05,
821
+ "loss": 0.2145,
822
+ "step": 1110
823
+ },
824
+ {
825
+ "epoch": 4.195305164319249,
826
+ "grad_norm": 2.653414011001587,
827
+ "learning_rate": 1.5806788511749348e-05,
828
+ "loss": 0.1701,
829
+ "step": 1120
830
+ },
831
+ {
832
+ "epoch": 4.232863849765258,
833
+ "grad_norm": 3.444087266921997,
834
+ "learning_rate": 1.5650130548302875e-05,
835
+ "loss": 0.2047,
836
+ "step": 1130
837
+ },
838
+ {
839
+ "epoch": 4.270422535211267,
840
+ "grad_norm": 2.024235486984253,
841
+ "learning_rate": 1.54934725848564e-05,
842
+ "loss": 0.1817,
843
+ "step": 1140
844
+ },
845
+ {
846
+ "epoch": 4.307981220657277,
847
+ "grad_norm": 2.742171049118042,
848
+ "learning_rate": 1.533681462140992e-05,
849
+ "loss": 0.1723,
850
+ "step": 1150
851
+ },
852
+ {
853
+ "epoch": 4.345539906103286,
854
+ "grad_norm": 3.3700480461120605,
855
+ "learning_rate": 1.5180156657963446e-05,
856
+ "loss": 0.17,
857
+ "step": 1160
858
+ },
859
+ {
860
+ "epoch": 4.383098591549296,
861
+ "grad_norm": 2.552915573120117,
862
+ "learning_rate": 1.5023498694516973e-05,
863
+ "loss": 0.1802,
864
+ "step": 1170
865
+ },
866
+ {
867
+ "epoch": 4.420657276995305,
868
+ "grad_norm": 3.3317511081695557,
869
+ "learning_rate": 1.4866840731070497e-05,
870
+ "loss": 0.1933,
871
+ "step": 1180
872
+ },
873
+ {
874
+ "epoch": 4.458215962441314,
875
+ "grad_norm": 1.9266548156738281,
876
+ "learning_rate": 1.4710182767624021e-05,
877
+ "loss": 0.1739,
878
+ "step": 1190
879
+ },
880
+ {
881
+ "epoch": 4.495774647887324,
882
+ "grad_norm": 2.1459243297576904,
883
+ "learning_rate": 1.4553524804177547e-05,
884
+ "loss": 0.1599,
885
+ "step": 1200
886
+ },
887
+ {
888
+ "epoch": 4.533333333333333,
889
+ "grad_norm": 3.9314770698547363,
890
+ "learning_rate": 1.4396866840731071e-05,
891
+ "loss": 0.1958,
892
+ "step": 1210
893
+ },
894
+ {
895
+ "epoch": 4.570892018779343,
896
+ "grad_norm": 2.6377363204956055,
897
+ "learning_rate": 1.4240208877284597e-05,
898
+ "loss": 0.1604,
899
+ "step": 1220
900
+ },
901
+ {
902
+ "epoch": 4.608450704225352,
903
+ "grad_norm": 2.810866594314575,
904
+ "learning_rate": 1.408355091383812e-05,
905
+ "loss": 0.1495,
906
+ "step": 1230
907
+ },
908
+ {
909
+ "epoch": 4.646009389671361,
910
+ "grad_norm": 2.2084455490112305,
911
+ "learning_rate": 1.3926892950391646e-05,
912
+ "loss": 0.185,
913
+ "step": 1240
914
+ },
915
+ {
916
+ "epoch": 4.683568075117371,
917
+ "grad_norm": 2.7217283248901367,
918
+ "learning_rate": 1.377023498694517e-05,
919
+ "loss": 0.1757,
920
+ "step": 1250
921
+ },
922
+ {
923
+ "epoch": 4.72112676056338,
924
+ "grad_norm": 3.075267791748047,
925
+ "learning_rate": 1.3613577023498696e-05,
926
+ "loss": 0.1814,
927
+ "step": 1260
928
+ },
929
+ {
930
+ "epoch": 4.758685446009389,
931
+ "grad_norm": 3.2452406883239746,
932
+ "learning_rate": 1.345691906005222e-05,
933
+ "loss": 0.1622,
934
+ "step": 1270
935
+ },
936
+ {
937
+ "epoch": 4.796244131455399,
938
+ "grad_norm": 2.712754487991333,
939
+ "learning_rate": 1.3300261096605744e-05,
940
+ "loss": 0.1714,
941
+ "step": 1280
942
+ },
943
+ {
944
+ "epoch": 4.833802816901408,
945
+ "grad_norm": 1.6795600652694702,
946
+ "learning_rate": 1.3143603133159269e-05,
947
+ "loss": 0.1519,
948
+ "step": 1290
949
+ },
950
+ {
951
+ "epoch": 4.871361502347418,
952
+ "grad_norm": 3.9085493087768555,
953
+ "learning_rate": 1.2986945169712793e-05,
954
+ "loss": 0.1758,
955
+ "step": 1300
956
+ },
957
+ {
958
+ "epoch": 4.908920187793427,
959
+ "grad_norm": 3.529478073120117,
960
+ "learning_rate": 1.2830287206266318e-05,
961
+ "loss": 0.1549,
962
+ "step": 1310
963
+ },
964
+ {
965
+ "epoch": 4.946478873239436,
966
+ "grad_norm": 2.559157609939575,
967
+ "learning_rate": 1.2673629242819842e-05,
968
+ "loss": 0.1824,
969
+ "step": 1320
970
+ },
971
+ {
972
+ "epoch": 4.984037558685446,
973
+ "grad_norm": 2.2350497245788574,
974
+ "learning_rate": 1.2516971279373368e-05,
975
+ "loss": 0.1723,
976
+ "step": 1330
977
+ },
978
+ {
979
+ "epoch": 5.0,
980
+ "eval_accuracy": 0.9826419535157399,
981
+ "eval_loss": 0.09542840719223022,
982
+ "eval_runtime": 5.0389,
983
+ "eval_samples_per_second": 1349.105,
984
+ "eval_steps_per_second": 42.271,
985
+ "step": 1335
986
+ },
987
+ {
988
+ "epoch": 5.018779342723005,
989
+ "grad_norm": 2.5073907375335693,
990
+ "learning_rate": 1.2360313315926892e-05,
991
+ "loss": 0.1401,
992
+ "step": 1340
993
+ },
994
+ {
995
+ "epoch": 5.056338028169014,
996
+ "grad_norm": 4.696757793426514,
997
+ "learning_rate": 1.2203655352480418e-05,
998
+ "loss": 0.1801,
999
+ "step": 1350
1000
+ },
1001
+ {
1002
+ "epoch": 5.093896713615023,
1003
+ "grad_norm": 1.2180489301681519,
1004
+ "learning_rate": 1.2046997389033942e-05,
1005
+ "loss": 0.1335,
1006
+ "step": 1360
1007
+ },
1008
+ {
1009
+ "epoch": 5.131455399061033,
1010
+ "grad_norm": 0.887860119342804,
1011
+ "learning_rate": 1.1890339425587468e-05,
1012
+ "loss": 0.1479,
1013
+ "step": 1370
1014
+ },
1015
+ {
1016
+ "epoch": 5.169014084507042,
1017
+ "grad_norm": 3.6347432136535645,
1018
+ "learning_rate": 1.1733681462140992e-05,
1019
+ "loss": 0.1575,
1020
+ "step": 1380
1021
+ },
1022
+ {
1023
+ "epoch": 5.206572769953052,
1024
+ "grad_norm": 2.901700496673584,
1025
+ "learning_rate": 1.1577023498694518e-05,
1026
+ "loss": 0.1367,
1027
+ "step": 1390
1028
+ },
1029
+ {
1030
+ "epoch": 5.244131455399061,
1031
+ "grad_norm": 2.6395390033721924,
1032
+ "learning_rate": 1.1420365535248042e-05,
1033
+ "loss": 0.144,
1034
+ "step": 1400
1035
+ },
1036
+ {
1037
+ "epoch": 5.28169014084507,
1038
+ "grad_norm": 3.923652172088623,
1039
+ "learning_rate": 1.1263707571801567e-05,
1040
+ "loss": 0.1576,
1041
+ "step": 1410
1042
+ },
1043
+ {
1044
+ "epoch": 5.31924882629108,
1045
+ "grad_norm": 2.290224313735962,
1046
+ "learning_rate": 1.1107049608355092e-05,
1047
+ "loss": 0.16,
1048
+ "step": 1420
1049
+ },
1050
+ {
1051
+ "epoch": 5.356807511737089,
1052
+ "grad_norm": 2.332317590713501,
1053
+ "learning_rate": 1.0950391644908617e-05,
1054
+ "loss": 0.1505,
1055
+ "step": 1430
1056
+ },
1057
+ {
1058
+ "epoch": 5.394366197183099,
1059
+ "grad_norm": 3.474155902862549,
1060
+ "learning_rate": 1.0793733681462141e-05,
1061
+ "loss": 0.1828,
1062
+ "step": 1440
1063
+ },
1064
+ {
1065
+ "epoch": 5.431924882629108,
1066
+ "grad_norm": 2.5219180583953857,
1067
+ "learning_rate": 1.0637075718015665e-05,
1068
+ "loss": 0.1563,
1069
+ "step": 1450
1070
+ },
1071
+ {
1072
+ "epoch": 5.469483568075117,
1073
+ "grad_norm": 4.863851547241211,
1074
+ "learning_rate": 1.0480417754569191e-05,
1075
+ "loss": 0.1308,
1076
+ "step": 1460
1077
+ },
1078
+ {
1079
+ "epoch": 5.507042253521127,
1080
+ "grad_norm": 4.817688941955566,
1081
+ "learning_rate": 1.0323759791122715e-05,
1082
+ "loss": 0.1757,
1083
+ "step": 1470
1084
+ },
1085
+ {
1086
+ "epoch": 5.544600938967136,
1087
+ "grad_norm": 3.194732189178467,
1088
+ "learning_rate": 1.0167101827676241e-05,
1089
+ "loss": 0.1577,
1090
+ "step": 1480
1091
+ },
1092
+ {
1093
+ "epoch": 5.582159624413146,
1094
+ "grad_norm": 3.6605474948883057,
1095
+ "learning_rate": 1.0010443864229765e-05,
1096
+ "loss": 0.2044,
1097
+ "step": 1490
1098
+ },
1099
+ {
1100
+ "epoch": 5.619718309859155,
1101
+ "grad_norm": 2.427701473236084,
1102
+ "learning_rate": 9.853785900783291e-06,
1103
+ "loss": 0.1574,
1104
+ "step": 1500
1105
+ },
1106
+ {
1107
+ "epoch": 5.657276995305164,
1108
+ "grad_norm": 2.8025519847869873,
1109
+ "learning_rate": 9.697127937336815e-06,
1110
+ "loss": 0.188,
1111
+ "step": 1510
1112
+ },
1113
+ {
1114
+ "epoch": 5.694835680751174,
1115
+ "grad_norm": 2.042407989501953,
1116
+ "learning_rate": 9.54046997389034e-06,
1117
+ "loss": 0.1639,
1118
+ "step": 1520
1119
+ },
1120
+ {
1121
+ "epoch": 5.732394366197183,
1122
+ "grad_norm": 4.5383477210998535,
1123
+ "learning_rate": 9.383812010443865e-06,
1124
+ "loss": 0.1641,
1125
+ "step": 1530
1126
+ },
1127
+ {
1128
+ "epoch": 5.769953051643192,
1129
+ "grad_norm": 2.919588804244995,
1130
+ "learning_rate": 9.22715404699739e-06,
1131
+ "loss": 0.1374,
1132
+ "step": 1540
1133
+ },
1134
+ {
1135
+ "epoch": 5.807511737089202,
1136
+ "grad_norm": 2.4344029426574707,
1137
+ "learning_rate": 9.070496083550915e-06,
1138
+ "loss": 0.1711,
1139
+ "step": 1550
1140
+ },
1141
+ {
1142
+ "epoch": 5.845070422535211,
1143
+ "grad_norm": 1.5614906549453735,
1144
+ "learning_rate": 8.913838120104439e-06,
1145
+ "loss": 0.1624,
1146
+ "step": 1560
1147
+ },
1148
+ {
1149
+ "epoch": 5.882629107981221,
1150
+ "grad_norm": 3.0189967155456543,
1151
+ "learning_rate": 8.757180156657963e-06,
1152
+ "loss": 0.1691,
1153
+ "step": 1570
1154
+ },
1155
+ {
1156
+ "epoch": 5.92018779342723,
1157
+ "grad_norm": 2.44000506401062,
1158
+ "learning_rate": 8.600522193211488e-06,
1159
+ "loss": 0.1513,
1160
+ "step": 1580
1161
+ },
1162
+ {
1163
+ "epoch": 5.957746478873239,
1164
+ "grad_norm": 2.4327423572540283,
1165
+ "learning_rate": 8.443864229765013e-06,
1166
+ "loss": 0.1538,
1167
+ "step": 1590
1168
+ },
1169
+ {
1170
+ "epoch": 5.995305164319249,
1171
+ "grad_norm": 2.1192240715026855,
1172
+ "learning_rate": 8.287206266318538e-06,
1173
+ "loss": 0.1442,
1174
+ "step": 1600
1175
+ },
1176
+ {
1177
+ "epoch": 6.0,
1178
+ "eval_accuracy": 0.981318034716093,
1179
+ "eval_loss": 0.09270217269659042,
1180
+ "eval_runtime": 4.8524,
1181
+ "eval_samples_per_second": 1400.961,
1182
+ "eval_steps_per_second": 43.896,
1183
+ "step": 1602
1184
+ },
1185
+ {
1186
+ "epoch": 6.030046948356808,
1187
+ "grad_norm": 1.8678548336029053,
1188
+ "learning_rate": 8.130548302872062e-06,
1189
+ "loss": 0.1328,
1190
+ "step": 1610
1191
+ },
1192
+ {
1193
+ "epoch": 6.067605633802817,
1194
+ "grad_norm": 3.0712783336639404,
1195
+ "learning_rate": 7.973890339425586e-06,
1196
+ "loss": 0.1543,
1197
+ "step": 1620
1198
+ },
1199
+ {
1200
+ "epoch": 6.105164319248826,
1201
+ "grad_norm": 4.49588680267334,
1202
+ "learning_rate": 7.817232375979112e-06,
1203
+ "loss": 0.1452,
1204
+ "step": 1630
1205
+ },
1206
+ {
1207
+ "epoch": 6.142723004694836,
1208
+ "grad_norm": 3.9594759941101074,
1209
+ "learning_rate": 7.660574412532636e-06,
1210
+ "loss": 0.1513,
1211
+ "step": 1640
1212
+ },
1213
+ {
1214
+ "epoch": 6.180281690140845,
1215
+ "grad_norm": 2.528153657913208,
1216
+ "learning_rate": 7.503916449086162e-06,
1217
+ "loss": 0.1589,
1218
+ "step": 1650
1219
+ },
1220
+ {
1221
+ "epoch": 6.217840375586855,
1222
+ "grad_norm": 2.159458875656128,
1223
+ "learning_rate": 7.347258485639687e-06,
1224
+ "loss": 0.1443,
1225
+ "step": 1660
1226
+ },
1227
+ {
1228
+ "epoch": 6.255399061032864,
1229
+ "grad_norm": 2.098022222518921,
1230
+ "learning_rate": 7.190600522193212e-06,
1231
+ "loss": 0.1564,
1232
+ "step": 1670
1233
+ },
1234
+ {
1235
+ "epoch": 6.292957746478873,
1236
+ "grad_norm": 1.993698239326477,
1237
+ "learning_rate": 7.033942558746737e-06,
1238
+ "loss": 0.1401,
1239
+ "step": 1680
1240
+ },
1241
+ {
1242
+ "epoch": 6.330516431924883,
1243
+ "grad_norm": 2.2639145851135254,
1244
+ "learning_rate": 6.877284595300262e-06,
1245
+ "loss": 0.1452,
1246
+ "step": 1690
1247
+ },
1248
+ {
1249
+ "epoch": 6.368075117370892,
1250
+ "grad_norm": 2.5003936290740967,
1251
+ "learning_rate": 6.720626631853786e-06,
1252
+ "loss": 0.1439,
1253
+ "step": 1700
1254
+ },
1255
+ {
1256
+ "epoch": 6.405633802816902,
1257
+ "grad_norm": 2.0841052532196045,
1258
+ "learning_rate": 6.563968668407311e-06,
1259
+ "loss": 0.1438,
1260
+ "step": 1710
1261
+ },
1262
+ {
1263
+ "epoch": 6.443192488262911,
1264
+ "grad_norm": 3.550182819366455,
1265
+ "learning_rate": 6.4073107049608355e-06,
1266
+ "loss": 0.1433,
1267
+ "step": 1720
1268
+ },
1269
+ {
1270
+ "epoch": 6.48075117370892,
1271
+ "grad_norm": 1.4857251644134521,
1272
+ "learning_rate": 6.2506527415143605e-06,
1273
+ "loss": 0.1404,
1274
+ "step": 1730
1275
+ },
1276
+ {
1277
+ "epoch": 6.51830985915493,
1278
+ "grad_norm": 3.503309726715088,
1279
+ "learning_rate": 6.093994778067885e-06,
1280
+ "loss": 0.1493,
1281
+ "step": 1740
1282
+ },
1283
+ {
1284
+ "epoch": 6.555868544600939,
1285
+ "grad_norm": 3.59545636177063,
1286
+ "learning_rate": 5.93733681462141e-06,
1287
+ "loss": 0.1563,
1288
+ "step": 1750
1289
+ },
1290
+ {
1291
+ "epoch": 6.593427230046949,
1292
+ "grad_norm": 2.879582405090332,
1293
+ "learning_rate": 5.780678851174934e-06,
1294
+ "loss": 0.122,
1295
+ "step": 1760
1296
+ },
1297
+ {
1298
+ "epoch": 6.630985915492958,
1299
+ "grad_norm": 1.7240543365478516,
1300
+ "learning_rate": 5.624020887728459e-06,
1301
+ "loss": 0.1404,
1302
+ "step": 1770
1303
+ },
1304
+ {
1305
+ "epoch": 6.668544600938967,
1306
+ "grad_norm": 3.0438528060913086,
1307
+ "learning_rate": 5.467362924281984e-06,
1308
+ "loss": 0.1432,
1309
+ "step": 1780
1310
+ },
1311
+ {
1312
+ "epoch": 6.706103286384977,
1313
+ "grad_norm": 2.496366024017334,
1314
+ "learning_rate": 5.310704960835509e-06,
1315
+ "loss": 0.1277,
1316
+ "step": 1790
1317
+ },
1318
+ {
1319
+ "epoch": 6.743661971830986,
1320
+ "grad_norm": 1.7166277170181274,
1321
+ "learning_rate": 5.154046997389034e-06,
1322
+ "loss": 0.143,
1323
+ "step": 1800
1324
+ },
1325
+ {
1326
+ "epoch": 6.781220657276995,
1327
+ "grad_norm": 2.4547784328460693,
1328
+ "learning_rate": 4.997389033942559e-06,
1329
+ "loss": 0.1198,
1330
+ "step": 1810
1331
+ },
1332
+ {
1333
+ "epoch": 6.818779342723005,
1334
+ "grad_norm": 2.604220390319824,
1335
+ "learning_rate": 4.840731070496084e-06,
1336
+ "loss": 0.1705,
1337
+ "step": 1820
1338
+ },
1339
+ {
1340
+ "epoch": 6.856338028169014,
1341
+ "grad_norm": 2.7237601280212402,
1342
+ "learning_rate": 4.684073107049609e-06,
1343
+ "loss": 0.1506,
1344
+ "step": 1830
1345
+ },
1346
+ {
1347
+ "epoch": 6.893896713615024,
1348
+ "grad_norm": 2.638058662414551,
1349
+ "learning_rate": 4.527415143603134e-06,
1350
+ "loss": 0.154,
1351
+ "step": 1840
1352
+ },
1353
+ {
1354
+ "epoch": 6.931455399061033,
1355
+ "grad_norm": 3.8382205963134766,
1356
+ "learning_rate": 4.3707571801566586e-06,
1357
+ "loss": 0.1553,
1358
+ "step": 1850
1359
+ },
1360
+ {
1361
+ "epoch": 6.969014084507043,
1362
+ "grad_norm": 2.071164131164551,
1363
+ "learning_rate": 4.2140992167101835e-06,
1364
+ "loss": 0.1397,
1365
+ "step": 1860
1366
+ },
1367
+ {
1368
+ "epoch": 7.0,
1369
+ "eval_accuracy": 0.9811709326272433,
1370
+ "eval_loss": 0.08920056372880936,
1371
+ "eval_runtime": 4.9166,
1372
+ "eval_samples_per_second": 1382.662,
1373
+ "eval_steps_per_second": 43.323,
1374
+ "step": 1869
1375
+ },
1376
+ {
1377
+ "epoch": 7.003755868544601,
1378
+ "grad_norm": 2.5346381664276123,
1379
+ "learning_rate": 4.0574412532637075e-06,
1380
+ "loss": 0.1296,
1381
+ "step": 1870
1382
+ },
1383
+ {
1384
+ "epoch": 7.041314553990611,
1385
+ "grad_norm": 2.575307846069336,
1386
+ "learning_rate": 3.9007832898172325e-06,
1387
+ "loss": 0.1389,
1388
+ "step": 1880
1389
+ },
1390
+ {
1391
+ "epoch": 7.07887323943662,
1392
+ "grad_norm": 2.0408527851104736,
1393
+ "learning_rate": 3.7441253263707574e-06,
1394
+ "loss": 0.1521,
1395
+ "step": 1890
1396
+ },
1397
+ {
1398
+ "epoch": 7.1164319248826295,
1399
+ "grad_norm": 3.2742061614990234,
1400
+ "learning_rate": 3.5874673629242823e-06,
1401
+ "loss": 0.1342,
1402
+ "step": 1900
1403
+ },
1404
+ {
1405
+ "epoch": 7.153990610328639,
1406
+ "grad_norm": 1.4502960443496704,
1407
+ "learning_rate": 3.4308093994778068e-06,
1408
+ "loss": 0.1204,
1409
+ "step": 1910
1410
+ },
1411
+ {
1412
+ "epoch": 7.191549295774648,
1413
+ "grad_norm": 3.7600743770599365,
1414
+ "learning_rate": 3.2741514360313317e-06,
1415
+ "loss": 0.1431,
1416
+ "step": 1920
1417
+ },
1418
+ {
1419
+ "epoch": 7.229107981220658,
1420
+ "grad_norm": 2.7332417964935303,
1421
+ "learning_rate": 3.1174934725848566e-06,
1422
+ "loss": 0.1281,
1423
+ "step": 1930
1424
+ },
1425
+ {
1426
+ "epoch": 7.266666666666667,
1427
+ "grad_norm": 2.6618921756744385,
1428
+ "learning_rate": 2.960835509138381e-06,
1429
+ "loss": 0.141,
1430
+ "step": 1940
1431
+ },
1432
+ {
1433
+ "epoch": 7.304225352112676,
1434
+ "grad_norm": 3.625688314437866,
1435
+ "learning_rate": 2.804177545691906e-06,
1436
+ "loss": 0.1455,
1437
+ "step": 1950
1438
+ },
1439
+ {
1440
+ "epoch": 7.341784037558686,
1441
+ "grad_norm": 2.0667765140533447,
1442
+ "learning_rate": 2.647519582245431e-06,
1443
+ "loss": 0.1359,
1444
+ "step": 1960
1445
+ },
1446
+ {
1447
+ "epoch": 7.379342723004695,
1448
+ "grad_norm": 2.369652509689331,
1449
+ "learning_rate": 2.490861618798956e-06,
1450
+ "loss": 0.1295,
1451
+ "step": 1970
1452
+ },
1453
+ {
1454
+ "epoch": 7.416901408450705,
1455
+ "grad_norm": 3.836838722229004,
1456
+ "learning_rate": 2.3342036553524807e-06,
1457
+ "loss": 0.1489,
1458
+ "step": 1980
1459
+ },
1460
+ {
1461
+ "epoch": 7.454460093896714,
1462
+ "grad_norm": 3.3261311054229736,
1463
+ "learning_rate": 2.1775456919060052e-06,
1464
+ "loss": 0.1289,
1465
+ "step": 1990
1466
+ },
1467
+ {
1468
+ "epoch": 7.492018779342723,
1469
+ "grad_norm": 2.6514954566955566,
1470
+ "learning_rate": 2.0208877284595297e-06,
1471
+ "loss": 0.1185,
1472
+ "step": 2000
1473
+ },
1474
+ {
1475
+ "epoch": 7.529577464788733,
1476
+ "grad_norm": 2.1017005443573,
1477
+ "learning_rate": 1.8642297650130548e-06,
1478
+ "loss": 0.1472,
1479
+ "step": 2010
1480
+ },
1481
+ {
1482
+ "epoch": 7.567136150234742,
1483
+ "grad_norm": 2.5104258060455322,
1484
+ "learning_rate": 1.7075718015665795e-06,
1485
+ "loss": 0.1467,
1486
+ "step": 2020
1487
+ },
1488
+ {
1489
+ "epoch": 7.6046948356807516,
1490
+ "grad_norm": 1.7915935516357422,
1491
+ "learning_rate": 1.5509138381201045e-06,
1492
+ "loss": 0.1212,
1493
+ "step": 2030
1494
+ },
1495
+ {
1496
+ "epoch": 7.642253521126761,
1497
+ "grad_norm": 2.4937989711761475,
1498
+ "learning_rate": 1.3942558746736294e-06,
1499
+ "loss": 0.1395,
1500
+ "step": 2040
1501
+ },
1502
+ {
1503
+ "epoch": 7.67981220657277,
1504
+ "grad_norm": 2.758594274520874,
1505
+ "learning_rate": 1.237597911227154e-06,
1506
+ "loss": 0.1361,
1507
+ "step": 2050
1508
+ },
1509
+ {
1510
+ "epoch": 7.71737089201878,
1511
+ "grad_norm": 2.291672468185425,
1512
+ "learning_rate": 1.0809399477806788e-06,
1513
+ "loss": 0.1182,
1514
+ "step": 2060
1515
+ },
1516
+ {
1517
+ "epoch": 7.754929577464789,
1518
+ "grad_norm": 1.944736361503601,
1519
+ "learning_rate": 9.242819843342037e-07,
1520
+ "loss": 0.1307,
1521
+ "step": 2070
1522
+ },
1523
+ {
1524
+ "epoch": 7.792488262910798,
1525
+ "grad_norm": 1.448411226272583,
1526
+ "learning_rate": 7.676240208877285e-07,
1527
+ "loss": 0.1407,
1528
+ "step": 2080
1529
+ },
1530
+ {
1531
+ "epoch": 7.830046948356808,
1532
+ "grad_norm": 3.276000499725342,
1533
+ "learning_rate": 6.109660574412533e-07,
1534
+ "loss": 0.1361,
1535
+ "step": 2090
1536
+ },
1537
+ {
1538
+ "epoch": 7.867605633802817,
1539
+ "grad_norm": 3.627788543701172,
1540
+ "learning_rate": 4.5430809399477806e-07,
1541
+ "loss": 0.131,
1542
+ "step": 2100
1543
+ },
1544
+ {
1545
+ "epoch": 7.905164319248827,
1546
+ "grad_norm": 1.2533661127090454,
1547
+ "learning_rate": 2.9765013054830287e-07,
1548
+ "loss": 0.1245,
1549
+ "step": 2110
1550
+ },
1551
+ {
1552
+ "epoch": 7.942723004694836,
1553
+ "grad_norm": 1.472484827041626,
1554
+ "learning_rate": 1.409921671018277e-07,
1555
+ "loss": 0.1368,
1556
+ "step": 2120
1557
+ },
1558
+ {
1559
+ "epoch": 7.972769953051643,
1560
+ "eval_accuracy": 0.9811709326272433,
1561
+ "eval_loss": 0.08957477658987045,
1562
+ "eval_runtime": 5.418,
1563
+ "eval_samples_per_second": 1254.697,
1564
+ "eval_steps_per_second": 39.313,
1565
+ "step": 2128
1566
+ },
1567
+ {
1568
+ "epoch": 7.972769953051643,
1569
+ "step": 2128,
1570
+ "total_flos": 3.767900833756416e+18,
1571
+ "train_loss": 0.5178930132572812,
1572
+ "train_runtime": 756.2923,
1573
+ "train_samples_per_second": 540.468,
1574
+ "train_steps_per_second": 2.814
1575
+ }
1576
+ ],
1577
+ "logging_steps": 10,
1578
+ "max_steps": 2128,
1579
+ "num_input_tokens_seen": 0,
1580
+ "num_train_epochs": 8,
1581
+ "save_steps": 500,
1582
+ "stateful_callbacks": {
1583
+ "TrainerControl": {
1584
+ "args": {
1585
+ "should_epoch_stop": false,
1586
+ "should_evaluate": false,
1587
+ "should_log": false,
1588
+ "should_save": true,
1589
+ "should_training_stop": true
1590
+ },
1591
+ "attributes": {}
1592
+ }
1593
+ },
1594
+ "total_flos": 3.767900833756416e+18,
1595
+ "train_batch_size": 48,
1596
+ "trial_name": null,
1597
+ "trial_params": null
1598
+ }
hpo-examples/audio-classification/ac/training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11e71a8faf8833e9bfb138217263fb0518314e3b8597902b752b2bc9dd143942
3
+ size 5368
hpo-examples/audio-classification/requirements.txt ADDED
@@ -0,0 +1,5 @@
1
+ datasets>=1.14.0
2
+ evaluate
3
+ librosa
4
+ torchaudio
5
+ torch>=1.6
hpo-examples/audio-classification/run.sh ADDED
@@ -0,0 +1,30 @@
1
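+ # Fine-tunes facebook/wav2vec2-base for keyword spotting on the SUPERB "ks" config.
+ # The final line turns on TRP: the flags map to the apply_trp(model, depths, p, lambdas)
+ # call in run_audio_classification.py, backed by trplib.py.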
+ CUDA_VISIBLE_DEVICES=0 python run_audio_classification.py \
2
+ --model_name_or_path facebook/wav2vec2-base \
3
+ --dataset_name superb \
4
+ --dataset_config_name ks \
5
+ --trust_remote_code \
6
+ --output_dir wav2vec2-base-ft-keyword-spotting \
7
+ --overwrite_output_dir \
8
+ --remove_unused_columns False \
9
+ --do_train \
10
+ --do_eval \
11
+ --fp16 \
12
+ --learning_rate 3e-5 \
13
+ --max_length_seconds 1 \
14
+ --attention_mask False \
15
+ --warmup_ratio 0.1 \
16
+ --num_train_epochs 8 \
17
+ --per_device_train_batch_size 64 \
18
+ --gradient_accumulation_steps 4 \
19
+ --per_device_eval_batch_size 32 \
20
+ --dataloader_num_workers 4 \
21
+ --logging_strategy steps \
22
+ --logging_steps 10 \
23
+ --eval_strategy epoch \
24
+ --save_strategy epoch \
25
+ --load_best_model_at_end True \
26
+ --metric_for_best_model accuracy \
27
+ --save_total_limit 3 \
28
+ --seed 0 \
29
+ --push_to_hub \
30
+ --apply-trp --trp-depths 1 --trp-p 0.1 --trp-lambdas 0.4 0.2 0.1
hpo-examples/audio-classification/run_audio_classification.py ADDED
@@ -0,0 +1,462 @@
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2021 The HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import logging
18
+ import os
19
+ import sys
20
+ import warnings
21
+ from dataclasses import dataclass, field
22
+ from random import randint
23
+ from typing import Optional, List
24
+
25
+ import datasets
26
+ import evaluate
27
+ import numpy as np
28
+ from datasets import DatasetDict, load_dataset
29
+
30
+ import transformers
31
+ from transformers import (
32
+ AutoConfig,
33
+ AutoFeatureExtractor,
34
+ AutoModelForAudioClassification,
35
+ HfArgumentParser,
36
+ Trainer,
37
+ TrainingArguments,
38
+ set_seed,
39
+ )
40
+ from transformers.trainer_utils import get_last_checkpoint
41
+ from transformers.utils import check_min_version, send_example_telemetry
42
+ from transformers.utils.versions import require_version
43
+
44
+
45
+ from trplib import apply_trp
46
+
47
+
48
+ logger = logging.getLogger(__name__)
49
+
50
+ # # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
51
+ # check_min_version("4.50.0.dev0")
52
+
53
+ require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt")
54
+
55
+
56
+ def random_subsample(wav: np.ndarray, max_length: float, sample_rate: int = 16000):
57
+ """Randomly sample chunks of `max_length` seconds from the input audio"""
58
+ sample_length = int(round(sample_rate * max_length))
59
+ if len(wav) <= sample_length:
60
+ return wav
61
+ random_offset = randint(0, len(wav) - sample_length - 1)
62
+ return wav[random_offset : random_offset + sample_length]
63
+
64
+
65
+ @dataclass
66
+ class DataTrainingArguments:
67
+ """
68
+ Arguments pertaining to what data we are going to input our model for training and eval.
69
+ Using `HfArgumentParser` we can turn this class
70
+ into argparse arguments to be able to specify them on
71
+ the command line.
72
+ """
73
+
74
+ dataset_name: Optional[str] = field(default=None, metadata={"help": "Name of a dataset from the datasets package"})
75
+ dataset_config_name: Optional[str] = field(
76
+ default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
77
+ )
78
+ train_file: Optional[str] = field(
79
+ default=None, metadata={"help": "A file containing the training audio paths and labels."}
80
+ )
81
+ eval_file: Optional[str] = field(
82
+ default=None, metadata={"help": "A file containing the validation audio paths and labels."}
83
+ )
84
+ train_split_name: str = field(
85
+ default="train",
86
+ metadata={
87
+ "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
88
+ },
89
+ )
90
+ eval_split_name: str = field(
91
+ default="validation",
92
+ metadata={
93
+ "help": (
94
+ "The name of the training data set split to use (via the datasets library). Defaults to 'validation'"
95
+ )
96
+ },
97
+ )
98
+ audio_column_name: str = field(
99
+ default="audio",
100
+ metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
101
+ )
102
+ label_column_name: str = field(
103
+ default="label", metadata={"help": "The name of the dataset column containing the labels. Defaults to 'label'"}
104
+ )
105
+ max_train_samples: Optional[int] = field(
106
+ default=None,
107
+ metadata={
108
+ "help": (
109
+ "For debugging purposes or quicker training, truncate the number of training examples to this "
110
+ "value if set."
111
+ )
112
+ },
113
+ )
114
+ max_eval_samples: Optional[int] = field(
115
+ default=None,
116
+ metadata={
117
+ "help": (
118
+ "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
119
+ "value if set."
120
+ )
121
+ },
122
+ )
123
+ max_length_seconds: float = field(
124
+ default=20,
125
+ metadata={"help": "Audio clips will be randomly cut to this length during training if the value is set."},
126
+ )
127
+
128
+
129
+ @dataclass
130
+ class ModelArguments:
131
+ """
132
+ Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
133
+ """
134
+
135
+ model_name_or_path: str = field(
136
+ default="facebook/wav2vec2-base",
137
+ metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"},
138
+ )
139
+ config_name: Optional[str] = field(
140
+ default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
141
+ )
142
+ cache_dir: Optional[str] = field(
143
+ default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from the Hub"}
144
+ )
145
+ model_revision: str = field(
146
+ default="main",
147
+ metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
148
+ )
149
+ feature_extractor_name: Optional[str] = field(
150
+ default=None, metadata={"help": "Name or path of preprocessor config."}
151
+ )
152
+ freeze_feature_encoder: bool = field(
153
+ default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
154
+ )
155
+ attention_mask: bool = field(
156
+ default=True, metadata={"help": "Whether to generate an attention mask in the feature extractor."}
157
+ )
158
+ token: str = field(
159
+ default=None,
160
+ metadata={
161
+ "help": (
162
+ "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
163
+ "generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
164
+ )
165
+ },
166
+ )
167
+ trust_remote_code: bool = field(
168
+ default=False,
169
+ metadata={
170
+ "help": (
171
+ "Whether to trust the execution of code from datasets/models defined on the Hub."
172
+ " This option should only be set to `True` for repositories you trust and in which you have read the"
173
+ " code, as it will execute code present on the Hub on your local machine."
174
+ )
175
+ },
176
+ )
177
+ freeze_feature_extractor: Optional[bool] = field(
178
+ default=None, metadata={"help": "Whether to freeze the feature extractor layers of the model."}
179
+ )
180
+ ignore_mismatched_sizes: bool = field(
181
+ default=False,
182
+ metadata={"help": "Will enable to load a pretrained model whose head dimensions are different."},
183
+ )
184
+
185
+ apply_trp: Optional[bool] = field(
186
+ default=False,
187
+ metadata={"help": "Whether to apply trp or not."},
188
+ )
189
+ trp_depths: Optional[int] = field(
190
+ default=1,
191
+ metadata={
192
+ "help": "TRP depth value."
193
+ },
194
+ )
195
+ trp_p: Optional[float] = field(
196
+ default=0.1,
197
+ metadata={
198
+ "help": "TRP p value."
199
+ },
200
+ )
201
+ trp_lambdas: Optional[List[float]] = field(
202
+ default_factory=lambda: [0.4, 0.2, 0.1],
203
+ metadata={
204
+ "help": "TRP lambda values (list of floats)."
205
+ },
206
+ )
207
+
208
+ def __post_init__(self):
209
+ if not self.freeze_feature_extractor and self.freeze_feature_encoder:
210
+ warnings.warn(
211
+ "The argument `--freeze_feature_extractor` is deprecated and "
212
+ "will be removed in a future version. Use `--freeze_feature_encoder` "
213
+ "instead. Setting `freeze_feature_encoder==True`.",
214
+ FutureWarning,
215
+ )
216
+ if self.freeze_feature_extractor and not self.freeze_feature_encoder:
217
+ raise ValueError(
218
+ "The argument `--freeze_feature_extractor` is deprecated and "
219
+ "should not be used in combination with `--freeze_feature_encoder`. "
220
+ "Only make use of `--freeze_feature_encoder`."
221
+ )
222
+
223
+
224
+ def main():
225
+ # See all possible arguments in src/transformers/training_args.py
226
+ # or by passing the --help flag to this script.
227
+ # We now keep distinct sets of args, for a cleaner separation of concerns.
228
+
229
+ parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
230
+ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
231
+ # If we pass only one argument to the script and it's the path to a json file,
232
+ # let's parse it to get our arguments.
233
+ model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
234
+ else:
235
+ model_args, data_args, training_args = parser.parse_args_into_dataclasses()
236
+
237
+ # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
238
+ # information sent is the one passed as arguments along with your Python/PyTorch versions.
239
+ send_example_telemetry("run_audio_classification", model_args, data_args)
240
+
241
+ # Setup logging
242
+ logging.basicConfig(
243
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
244
+ datefmt="%m/%d/%Y %H:%M:%S",
245
+ handlers=[logging.StreamHandler(sys.stdout)],
246
+ )
247
+
248
+ if training_args.should_log:
249
+ # The default of training_args.log_level is passive, so we set log level at info here to have that default.
250
+ transformers.utils.logging.set_verbosity_info()
251
+
252
+ log_level = training_args.get_process_log_level()
253
+ logger.setLevel(log_level)
254
+ transformers.utils.logging.set_verbosity(log_level)
255
+ transformers.utils.logging.enable_default_handler()
256
+ transformers.utils.logging.enable_explicit_format()
257
+
258
+ # Log on each process the small summary:
259
+ logger.warning(
260
+ f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
261
+ + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
262
+ )
263
+ logger.info(f"Training/evaluation parameters {training_args}")
264
+
265
+ # Set seed before initializing model.
266
+ set_seed(training_args.seed)
267
+
268
+ # Detecting last checkpoint.
269
+ last_checkpoint = None
270
+ if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
271
+ last_checkpoint = get_last_checkpoint(training_args.output_dir)
272
+ if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
273
+ raise ValueError(
274
+ f"Output directory ({training_args.output_dir}) already exists and is not empty. "
275
+ "Use --overwrite_output_dir to train from scratch."
276
+ )
277
+ elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
278
+ logger.info(
279
+ f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
280
+ "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
281
+ )
282
+
283
+ # Initialize our dataset and prepare it for the audio classification task.
284
+ raw_datasets = DatasetDict()
285
+ raw_datasets["train"] = load_dataset(
286
+ data_args.dataset_name,
287
+ data_args.dataset_config_name,
288
+ split=data_args.train_split_name,
289
+ token=model_args.token,
290
+ trust_remote_code=model_args.trust_remote_code,
291
+ )
292
+ raw_datasets["eval"] = load_dataset(
293
+ data_args.dataset_name,
294
+ data_args.dataset_config_name,
295
+ split=data_args.eval_split_name,
296
+ token=model_args.token,
297
+ trust_remote_code=model_args.trust_remote_code,
298
+ )
299
+
300
+ if data_args.audio_column_name not in raw_datasets["train"].column_names:
301
+ raise ValueError(
302
+ f"--audio_column_name {data_args.audio_column_name} not found in dataset '{data_args.dataset_name}'. "
303
+ "Make sure to set `--audio_column_name` to the correct audio column - one of "
304
+ f"{', '.join(raw_datasets['train'].column_names)}."
305
+ )
306
+
307
+ if data_args.label_column_name not in raw_datasets["train"].column_names:
308
+ raise ValueError(
309
+ f"--label_column_name {data_args.label_column_name} not found in dataset '{data_args.dataset_name}'. "
310
+ "Make sure to set `--label_column_name` to the correct text column - one of "
311
+ f"{', '.join(raw_datasets['train'].column_names)}."
312
+ )
313
+
314
+ # Setting `return_attention_mask=True` is the way to get a correctly masked mean-pooling over
315
+ # transformer outputs in the classifier, but it doesn't always lead to better accuracy
316
+ feature_extractor = AutoFeatureExtractor.from_pretrained(
317
+ model_args.feature_extractor_name or model_args.model_name_or_path,
318
+ return_attention_mask=model_args.attention_mask,
319
+ cache_dir=model_args.cache_dir,
320
+ revision=model_args.model_revision,
321
+ token=model_args.token,
322
+ trust_remote_code=model_args.trust_remote_code,
323
+ )
324
+
325
+ # `datasets` takes care of automatically loading and resampling the audio,
326
+ # so we just need to set the correct target sampling rate.
327
+ raw_datasets = raw_datasets.cast_column(
328
+ data_args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate)
329
+ )
330
+
331
+ model_input_name = feature_extractor.model_input_names[0]
332
+
333
+ def train_transforms(batch):
334
+ """Apply train_transforms across a batch."""
335
+ subsampled_wavs = []
336
+ for audio in batch[data_args.audio_column_name]:
337
+ wav = random_subsample(
338
+ audio["array"], max_length=data_args.max_length_seconds, sample_rate=feature_extractor.sampling_rate
339
+ )
340
+ subsampled_wavs.append(wav)
341
+ inputs = feature_extractor(subsampled_wavs, sampling_rate=feature_extractor.sampling_rate)
342
+ output_batch = {model_input_name: inputs.get(model_input_name)}
343
+ output_batch["labels"] = list(batch[data_args.label_column_name])
344
+
345
+ return output_batch
346
+
347
+ def val_transforms(batch):
348
+ """Apply val_transforms across a batch."""
349
+ wavs = [audio["array"] for audio in batch[data_args.audio_column_name]]
350
+ inputs = feature_extractor(wavs, sampling_rate=feature_extractor.sampling_rate)
351
+ output_batch = {model_input_name: inputs.get(model_input_name)}
352
+ output_batch["labels"] = list(batch[data_args.label_column_name])
353
+
354
+ return output_batch
355
+
356
+ # Prepare label mappings.
357
+ # We'll include these in the model's config to get human readable labels in the Inference API.
358
+ labels = raw_datasets["train"].features[data_args.label_column_name].names
359
+ label2id, id2label = {}, {}
360
+ for i, label in enumerate(labels):
361
+ label2id[label] = str(i)
362
+ id2label[str(i)] = label
363
+
364
+ # Load the accuracy metric from the datasets package
365
+ metric = evaluate.load("accuracy", cache_dir=model_args.cache_dir)
366
+
367
+ # Define our compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with
368
+ # `predictions` and `label_ids` fields) and has to return a dictionary string to float.
369
+ def compute_metrics(eval_pred):
370
+ """Computes accuracy on a batch of predictions"""
371
+ predictions = np.argmax(eval_pred.predictions, axis=1)
372
+ return metric.compute(predictions=predictions, references=eval_pred.label_ids)
373
+
374
+ config = AutoConfig.from_pretrained(
375
+ model_args.config_name or model_args.model_name_or_path,
376
+ num_labels=len(labels),
377
+ label2id=label2id,
378
+ id2label=id2label,
379
+ finetuning_task="audio-classification",
380
+ cache_dir=model_args.cache_dir,
381
+ revision=model_args.model_revision,
382
+ token=model_args.token,
383
+ trust_remote_code=model_args.trust_remote_code,
384
+ )
385
+ model = AutoModelForAudioClassification.from_pretrained(
386
+ model_args.model_name_or_path,
387
+ from_tf=bool(".ckpt" in model_args.model_name_or_path),
388
+ config=config,
389
+ cache_dir=model_args.cache_dir,
390
+ revision=model_args.model_revision,
391
+ token=model_args.token,
392
+ trust_remote_code=model_args.trust_remote_code,
393
+ ignore_mismatched_sizes=model_args.ignore_mismatched_sizes,
394
+ )
395
+
396
+ # freeze the convolutional waveform encoder
397
+ if model_args.freeze_feature_encoder:
398
+ model.freeze_feature_encoder()
399
+
400
+ if model_args.apply_trp:
401
+ model = apply_trp(model, depths=model_args.trp_depths, p=model_args.trp_p, lambdas=model_args.trp_lambdas)
402
+
403
+ if training_args.do_train:
404
+ if data_args.max_train_samples is not None:
405
+ raw_datasets["train"] = (
406
+ raw_datasets["train"].shuffle(seed=training_args.seed).select(range(data_args.max_train_samples))
407
+ )
408
+ # Set the training transforms
409
+ raw_datasets["train"].set_transform(train_transforms, output_all_columns=False)
410
+
411
+ if training_args.do_eval:
412
+ if data_args.max_eval_samples is not None:
413
+ raw_datasets["eval"] = (
414
+ raw_datasets["eval"].shuffle(seed=training_args.seed).select(range(data_args.max_eval_samples))
415
+ )
416
+ # Set the validation transforms
417
+ raw_datasets["eval"].set_transform(val_transforms, output_all_columns=False)
418
+
419
+ # Initialize our trainer
420
+ trainer = Trainer(
421
+ model=model,
422
+ args=training_args,
423
+ train_dataset=raw_datasets["train"] if training_args.do_train else None,
424
+ eval_dataset=raw_datasets["eval"] if training_args.do_eval else None,
425
+ compute_metrics=compute_metrics,
426
+ processing_class=feature_extractor,
427
+ )
428
+
429
+ # Training
430
+ if training_args.do_train:
431
+ checkpoint = None
432
+ if training_args.resume_from_checkpoint is not None:
433
+ checkpoint = training_args.resume_from_checkpoint
434
+ elif last_checkpoint is not None:
435
+ checkpoint = last_checkpoint
436
+ train_result = trainer.train(resume_from_checkpoint=checkpoint)
437
+ trainer.save_model()
438
+ trainer.log_metrics("train", train_result.metrics)
439
+ trainer.save_metrics("train", train_result.metrics)
440
+ trainer.save_state()
441
+
442
+ # Evaluation
443
+ if training_args.do_eval:
444
+ metrics = trainer.evaluate()
445
+ trainer.log_metrics("eval", metrics)
446
+ trainer.save_metrics("eval", metrics)
447
+
448
+ # Write model card and (optionally) push to hub
449
+ kwargs = {
450
+ "finetuned_from": model_args.model_name_or_path,
451
+ "tasks": "audio-classification",
452
+ "dataset": data_args.dataset_name,
453
+ "tags": ["audio-classification"],
454
+ }
455
+ if training_args.push_to_hub:
456
+ trainer.push_to_hub(**kwargs)
457
+ else:
458
+ trainer.create_model_card(**kwargs)
459
+
460
+
461
+ if __name__ == "__main__":
462
+ main()
hpo-examples/audio-classification/trplib.py ADDED
@@ -0,0 +1,1181 @@
1
+ import torch
2
+ from torch import nn, Tensor
3
+ from torch.nn import functional as F
4
+
5
+ from torchvision.models.mobilenetv2 import MobileNetV2
6
+ from torchvision.models.resnet import ResNet
7
+ from torchvision.models.efficientnet import EfficientNet
8
+ from torchvision.models.vision_transformer import VisionTransformer
9
+ from torchvision.models.segmentation.fcn import FCN
10
+ from torchvision.models.segmentation.deeplabv3 import DeepLabV3
11
+
12
+ import transformers
13
+ from transformers.modeling_outputs import SequenceClassifierOutput, QuestionAnsweringModelOutput, CausalLMOutput, Seq2SeqLMOutput
14
+
15
+ from typing import Optional, Tuple, List, Union, Callable
16
+ from collections import OrderedDict
17
+ import types
18
+
19
+
20
+ def trp_criterion(trp_blocks: nn.ModuleList, shared_head: Callable, criterion: Callable, lambdas: List[float], hidden_states: Tensor, logits: Tensor, targets: Tensor, loss_normalization=False):
21
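+ """
+ Combine the task loss with the TRP replica losses.
+
+ The base loss is computed from `logits`. Each block in `trp_blocks` then refines the
+ hidden states in turn, `shared_head` maps every refined state back to logits, and the
+ resulting replica losses are added with weights `lambdas`. The mask returned by
+ `criterion` restricts each replica loss to samples the previous prediction got right.
+ With `loss_normalization`, the total is rescaled by exp(base_loss) / exp(total_loss),
+ computed without gradient.
+ """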
+ loss, mask = criterion(logits, targets)
22
+ if loss_normalization:
23
+ coeff = loss.detach()
24
+
25
+ embeds = [hidden_states]
26
+ predictions = []
27
+ for k, c in enumerate(lambdas):
28
+ embeds.append(trp_blocks[k](embeds[-1]))
29
+ predictions.append(shared_head(embeds[-1]))
30
+ replica_loss, mask = criterion(predictions[-1], targets, mask)
31
+ loss += c * replica_loss
32
+
33
+ if loss_normalization:
34
+ with torch.no_grad():
35
+ coeff = torch.exp(coeff) / torch.exp(loss.detach())
36
+ loss = coeff * loss
37
+
38
+ return loss
39
+
40
+
41
+ class TPBlock(nn.Module):
42
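+ """
+ Residual refinement block: dropout on the input followed by `depths` zero-initialized
+ Linear+ReLU layers whose output is added back, so the block behaves as an identity
+ mapping at initialization (up to dropout). `dim` selects the feature axis when the
+ features are not on the last dimension.
+ """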
+ def __init__(self, depths: int, in_features: int, p: float, dim=-1):
43
+ super(TPBlock, self).__init__()
44
+
45
+ self.dropout = nn.Dropout(p)
46
+
47
+ self.cdim = dim
48
+
49
+ blocks = []
50
+ for _ in range(depths):
51
+ blocks.append(nn.Linear(in_features, in_features))
52
+ nn.init.constant_(blocks[-1].weight, 0.0)
53
+ nn.init.constant_(blocks[-1].bias, 0.0)
54
+ blocks.append(nn.ReLU())
55
+ self.blocks = nn.Sequential(*blocks)
56
+
57
+ def forward(self, x):
58
+ x = self.dropout(x)
59
+ if self.cdim == -1:
60
+ x = x + self.blocks(x)
61
+ else:
62
+ x = x + torch.movedim(self.blocks(torch.movedim(x, self.cdim, -1)), -1, self.cdim)
63
+ return x
64
+
65
+
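+ # Usage sketch (hypothetical shapes, for illustration only):
+ # block = TPBlock(depths=1, in_features=768, p=0.1)
+ # h = torch.randn(8, 49, 768) # [batch, sequence, features]
+ # block(h).shape == h.shape # the residual block preserves the input shape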
66
+ class Config:
67
+ @staticmethod
68
+ def gen_criterion(*args, **kwargs):
69
+ def func(input, target, mask=None):
70
+ """
71
+ Args:
72
+ input (Tensor): Input tensor.
73
+ target (Tensor): Target labels.
74
+
75
+ Returns:
76
+ loss (Tensor): Scalar tensor representing the loss.
77
+ mask (Tensor): Boolean mask tensor with the same shape of target.
78
+ """
79
+ pass
80
+ return func
81
+
82
+ @staticmethod
83
+ def gen_shared_head(*args, **kwargs):
84
+ def func(hidden_states):
85
+ """
86
+ Args:
87
+ hidden_states (Tensor): Hidden States tensor.
88
+
89
+ Returns:
90
+ logits (Tensor): Logits tensor.
91
+ """
92
+ pass
93
+ return func
94
+
95
+ @staticmethod
96
+ def forward(*args, **kwargs):
97
+ pass
98
+
99
+
100
+ # Wav2Vec2 for Audio Classification
101
+ class Wav2Vec2ForSequenceClassificationConfig(Config):
102
+ _HIDDEN_STATES_START_POSITION = 2
103
+
104
+ @staticmethod
105
+ def gen_criterion():
106
+ def func(input, target, mask=None):
107
+ """
108
+ Args:
109
+ input (Tensor): Input tensor of shape [B, C].
110
+ target (Tensor): Target labels of shape [B].
111
+
112
+ Returns:
113
+ loss (Tensor): Scalar tensor representing the loss.
114
+ mask (Tensor): Boolean mask tensor of shape [B].
115
+ """
116
+ if mask is None:
117
+ mask = torch.ones_like(target, dtype=torch.float32, device=target.device)
118
+
119
+ unmasked_loss = F.cross_entropy(input, target, reduction="none")
120
+ loss = torch.sum(mask * unmasked_loss) / (torch.sum(mask) + 1e-6)
121
+
122
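+ # shrink the mask to the samples this head classified correctly; the next replica loss only counts those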
+ with torch.no_grad():
123
+ mask = mask * torch.eq(torch.argmax(input, dim=1), target).to(input.dtype)
124
+
125
+ return loss, mask
126
+ return func
127
+
128
+ @staticmethod
129
+ def gen_shared_head(self, attention_mask):
130
+ def func(hidden_states):
131
+ """
132
+ Args:
133
+ hidden_states (Tensor): Hidden States of shape [B, L, hidden_units].
134
+
135
+ Returns:
136
+ logits (Tensor): Logits tensor of shape [B, C].
137
+ """
138
+ _hidden_states = self.projector(hidden_states)
139
+ if attention_mask is None:
140
+ pooled_output = _hidden_states.mean(dim=1)
141
+ else:
142
+ padding_mask = self._get_feature_vector_attention_mask(_hidden_states.shape[1], attention_mask)
143
+ expand_padding_mask = padding_mask.unsqueeze(-1).repeat(1, 1, _hidden_states.shape[2])
144
+ _hidden_states[~expand_padding_mask] = 0.0
145
+ pooled_output = _hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)
146
+
147
+ logits = self.classifier(pooled_output)
148
+ return logits
149
+ return func
150
+
151
+ @staticmethod
152
+ def gen_forward(lambdas, loss_normalization=False):
153
+ def func(
154
+ self,
155
+ input_values: Optional[torch.Tensor],
156
+ attention_mask: Optional[torch.Tensor] = None,
157
+ output_attentions: Optional[bool] = None,
158
+ output_hidden_states: Optional[bool] = None,
159
+ return_dict: Optional[bool] = None,
160
+ labels: Optional[torch.Tensor] = None,
161
+ ) -> Union[Tuple, SequenceClassifierOutput]:
162
+ r"""
163
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
164
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
165
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
166
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
167
+ """
168
+
169
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
170
+ output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
171
+
172
+ outputs = self.wav2vec2(
173
+ input_values,
174
+ attention_mask=attention_mask,
175
+ output_attentions=output_attentions,
176
+ output_hidden_states=output_hidden_states,
177
+ return_dict=return_dict,
178
+ )
179
+
180
+ if self.config.use_weighted_layer_sum:
181
+ hidden_states = outputs[Wav2Vec2ForSequenceClassificationConfig._HIDDEN_STATES_START_POSITION]
182
+ hidden_states = torch.stack(hidden_states, dim=1)
183
+ norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
184
+ hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
185
+ else:
186
+ hidden_states = outputs[0]
187
+
188
+ _hidden_states = self.projector(hidden_states)
189
+ if attention_mask is None:
190
+ pooled_output = _hidden_states.mean(dim=1)
191
+ else:
192
+ padding_mask = self._get_feature_vector_attention_mask(_hidden_states.shape[1], attention_mask)
193
+ expand_padding_mask = padding_mask.unsqueeze(-1).repeat(1, 1, _hidden_states.shape[2])
194
+ _hidden_states[~expand_padding_mask] = 0.0
195
+ pooled_output = _hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)
196
+
197
+ logits = self.classifier(pooled_output)
198
+
199
+ loss = None
200
+ if labels is not None:
201
+ shared_head = Wav2Vec2ForSequenceClassificationConfig.gen_shared_head(self, attention_mask)
202
+ criterion = Wav2Vec2ForSequenceClassificationConfig.gen_criterion()
203
+ loss = trp_criterion(self.trp_blocks, shared_head, criterion, lambdas, hidden_states, logits.view(-1, self.config.num_labels), labels.view(-1), loss_normalization) # NOTE: Apply TRP!
204
+
205
+ if not return_dict:
206
+ output = (logits,) + outputs[Wav2Vec2ForSequenceClassificationConfig._HIDDEN_STATES_START_POSITION:]
207
+ return ((loss,) + output) if loss is not None else output
208
+
209
+ return SequenceClassifierOutput(
210
+ loss=loss,
211
+ logits=logits,
212
+ hidden_states=outputs.hidden_states,
213
+ attentions=outputs.attentions,
214
+ )
215
+ return func
216
+
217
+
218
+ # MobileNetV2 for Image Classification
219
+ class MobileNetV2Config(Config):
220
+ @staticmethod
221
+ def gen_criterion(label_smoothing=0.0, top_k=1):
222
+ def func(input, target, mask=None):
223
+ """
224
+ Args:
225
+ input (Tensor): Input tensor of shape [B, C].
226
+ target (Tensor): Target labels of shape [B] or [B, C].
227
+
228
+ Returns:
229
+ loss (Tensor): Scalar tensor representing the loss.
230
+ mask (Tensor): Boolean mask tensor of shape [B].
231
+ """
232
+ label = torch.argmax(target, dim=1) if label_smoothing > 0.0 else target
233
+
234
+ unmasked_loss = F.cross_entropy(input, label, reduction="none", label_smoothing=label_smoothing)
235
+ if mask is None:
236
+ mask = torch.ones_like(unmasked_loss, dtype=torch.float32, device=target.device)
237
+ loss = torch.sum(mask * unmasked_loss) / (torch.sum(mask) + 1e-6)
238
+
239
+ with torch.no_grad():
240
+ topk_values, topk_indices = torch.topk(input, top_k, dim=-1)
241
+ mask = mask * torch.eq(topk_indices, label[:, None]).any(dim=-1).to(input.dtype)
242
+
243
+ return loss, mask
244
+ return func
245
+
246
+ @staticmethod
247
+ def gen_shared_head(self):
248
+ def func(x):
249
+ """
250
+ Args:
251
+ x (Tensor): Hidden States tensor of shape [B, hidden_units].
252
+
253
+ Returns:
254
+ logits (Tensor): Logits tensor of shape [B, C].
255
+ """
256
+ logits = self.classifier(x)
257
+ return logits
258
+ return func
259
+
260
+ @staticmethod
261
+ def gen_forward(lambdas, loss_normalization=True, label_smoothing=0.0, top_k=1):
262
+ def func(self, images: Tensor, targets=None):
263
+ x = self.features(images)
264
+ x = nn.functional.adaptive_avg_pool2d(x, (1, 1))
265
+ x = torch.flatten(x, 1)
266
+ logits = self.classifier(x)
267
+
268
+ if self.training:
269
+ torch._assert(targets is not None, "targets should not be none when in training mode")
270
+ shared_head = MobileNetV2Config.gen_shared_head(self)
271
+ criterion = MobileNetV2Config.gen_criterion(label_smoothing, top_k)
272
+ loss = trp_criterion(self.trp_blocks, shared_head, criterion, lambdas, x, logits, targets, loss_normalization)
273
+ return logits, loss
274
+ return logits
275
+ return func
276
+
277
+
278
+ # ResNet for Image Classification
279
+ class ResNetConfig(MobileNetV2Config):
280
+ @staticmethod
281
+ def gen_shared_head(self):
282
+ def func(x):
283
+ """
284
+ Args:
285
+ x (Tensor): Hidden States tensor of shape [B, hidden_units].
286
+
287
+ Returns:
288
+ logits (Tensor): Logits tensor of shape [B, C].
289
+ """
290
+ logits = self.fc(x)
291
+ return logits
292
+ return func
293
+
294
+ @staticmethod
295
+ def gen_forward(lambdas, loss_normalization=True, label_smoothing=0.0, top_k=1):
296
+ def func(self, images: Tensor, targets=None):
297
+ x = self.conv1(images)
298
+ x = self.bn1(x)
299
+ x = self.relu(x)
300
+ x = self.maxpool(x)
301
+
302
+ x = self.layer1(x)
303
+ x = self.layer2(x)
304
+ x = self.layer3(x)
305
+ x = self.layer4(x)
306
+
307
+ x = self.avgpool(x)
308
+ x = torch.flatten(x, 1)
309
+ logits = self.fc(x)
310
+
311
+ if self.training:
312
+ torch._assert(targets is not None, "targets should not be none when in training mode")
313
+ shared_head = ResNetConfig.gen_shared_head(self)
314
+ criterion = ResNetConfig.gen_criterion(label_smoothing, top_k)
315
+ loss = trp_criterion(self.trp_blocks, shared_head, criterion, lambdas, x, logits, targets, loss_normalization)
316
+ return logits, loss
317
+ return logits
318
+ return func
319
+
320
+
321
+ # EfficientNet for Image Classification
322
+ class EfficientNetConfig(MobileNetV2Config):
323
+ @staticmethod
324
+ def gen_shared_head(self):
325
+ def func(x):
326
+ """
327
+ Args:
328
+ x (Tensor): Hidden States tensor of shape [B, hidden_units].
329
+
330
+ Returns:
331
+ logits (Tensor): Logits tensor of shape [B, C].
332
+ """
333
+ logits = self.classifier(x)
334
+ return logits
335
+ return func
336
+
337
+ @staticmethod
338
+ def gen_forward(lambdas, loss_normalization=True, label_smoothing=0.0, top_k=1):
339
+ def func(self, images: Tensor, targets=None):
340
+ x = self.features(images)
341
+ x = self.avgpool(x)
342
+ x = torch.flatten(x, 1)
343
+ logits = self.classifier(x)
344
+
345
+ if self.training:
346
+ torch._assert(targets is not None, "targets should not be none when in training mode")
347
+ shared_head = EfficientNetConfig.gen_shared_head(self)
348
+ criterion = EfficientNetConfig.gen_criterion(label_smoothing, top_k)
349
+ loss = trp_criterion(self.trp_blocks, shared_head, criterion, lambdas, x, logits, targets, loss_normalization)
350
+ return logits, loss
351
+ return logits
352
+ return func
353
+
354
+
355
+ # ViT for Image Classification
356
+ class VisionTransformerConfig(MobileNetV2Config):
357
+ @staticmethod
358
+ def gen_shared_head(self):
359
+ def func(x):
360
+ """
361
+ Args:
362
+ x (Tensor): Hidden States tensor of shape [B, hidden_units].
363
+
364
+ Returns:
365
+ logits (Tensor): Logits tensor of shape [B, C].
366
+ """
367
+ logits = self.heads(x)
368
+ return logits
369
+ return func
370
+
371
+ @staticmethod
372
+ def gen_forward(lambdas, loss_normalization=True, label_smoothing=0.0, top_k=1):
373
+ def func(self, images: Tensor, targets=None):
374
+ x = self._process_input(images)
375
+ n = x.shape[0]
376
+ batch_class_token = self.class_token.expand(n, -1, -1)
377
+ x = torch.cat([batch_class_token, x], dim=1)
378
+ x = self.encoder(x)
379
+ x = x[:, 0]
380
+
381
+ logits = self.heads(x)
382
+
383
+ if self.training:
384
+ torch._assert(targets is not None, "targets should not be none when in training mode")
385
+ shared_head = VisionTransformerConfig.gen_shared_head(self)
386
+ criterion = VisionTransformerConfig.gen_criterion(label_smoothing, top_k)
387
+ loss = trp_criterion(self.trp_blocks, shared_head, criterion, lambdas, x, logits, targets, loss_normalization)
388
+ return logits, loss
389
+ return logits
390
+ return func
391
+
392
+
393
+ # Bert for Question Answering
394
+ class BertForQuestionAnsweringConfig(Config):
395
+ @staticmethod
396
+ def gen_criterion(top_k=1):
397
+ def func(input, target: List[Tensor], mask=None):
398
+ """
399
+ Args:
400
+ input (Tensor): Input tensor of shape [B, C, 2].
401
+ target (List[Tensor]):
402
+ Start Positions of shape [B].
403
+ End Positions of shape [B].
404
+
405
+ Returns:
406
+ loss (Tensor): Scalar tensor representing the loss.
407
+ mask (Tensor): Boolean mask tensor of shape [B].
408
+ """
409
+ start_positions, end_positions = target
410
+
411
+ if mask is None:
412
+ mask = torch.ones_like(start_positions, dtype=torch.float32, device=start_positions.device)
413
+
414
+ start_logits, end_logits = input.split(1, dim=-1)
415
+ start_logits = start_logits.squeeze(-1).contiguous()
416
+ end_logits = end_logits.squeeze(-1).contiguous()
417
+
418
+ # If we are on multi-GPU, the split adds a dimension; squeeze it
419
+ if len(start_positions.size()) > 1:
420
+ start_positions = start_positions.squeeze(-1)
421
+ if len(end_positions.size()) > 1:
422
+ end_positions = end_positions.squeeze(-1)
423
+ # sometimes the start/end positions are outside our model inputs, we ignore these terms
424
+ ignored_index = start_logits.size(1)
425
+ start_positions = start_positions.clamp(0, ignored_index)
426
+ end_positions = end_positions.clamp(0, ignored_index)
427
+
428
+ masked_start_losses = F.cross_entropy(start_logits, start_positions, ignore_index=ignored_index, reduction="none")
429
+ start_loss = torch.sum(mask * masked_start_losses) / (torch.sum(mask) + 1e-6)
430
+ masked_end_losses = F.cross_entropy(end_logits, end_positions, ignore_index=ignored_index, reduction="none")
431
+ end_loss = torch.sum(mask * masked_end_losses) / (torch.sum(mask) + 1e-6)
432
+
433
+ with torch.no_grad():
434
+ topk_values, topk_indices = torch.topk(start_logits, top_k, dim=1)
435
+ mask = mask * torch.eq(topk_indices, start_positions[:, None]).any(dim=1).to(start_logits.dtype)
436
+ topk_values, topk_indices = torch.topk(end_logits, top_k, dim=1)
437
+ mask = mask * torch.eq(topk_indices, end_positions[:, None]).any(dim=1).to(end_logits.dtype)
438
+
439
+ return (start_loss + end_loss) / 2, mask
440
+ return func
441
+
442
+ @staticmethod
443
+ def gen_shared_head(self):
444
+ def func(hidden_states):
445
+ """
446
+ Args:
447
+ hidden_states (Tensor): Hidden States of shape [B, C, hidden_units].
448
+
449
+ Returns:
450
+ logits (Tensor): Logits tensor of shape [B, C, 2].
451
+ """
452
+ logits = self.qa_outputs(hidden_states)
453
+ return logits
454
+ return func
455
+
456
+ @staticmethod
457
+ def gen_forward(lambdas, loss_normalization=True, top_k=1):
458
+ def func(
459
+ self,
460
+ input_ids: Optional[torch.Tensor] = None,
461
+ attention_mask: Optional[torch.Tensor] = None,
462
+ token_type_ids: Optional[torch.Tensor] = None,
463
+ position_ids: Optional[torch.Tensor] = None,
464
+ head_mask: Optional[torch.Tensor] = None,
465
+ inputs_embeds: Optional[torch.Tensor] = None,
466
+ start_positions: Optional[torch.Tensor] = None,
467
+ end_positions: Optional[torch.Tensor] = None,
468
+ output_attentions: Optional[bool] = None,
469
+ output_hidden_states: Optional[bool] = None,
470
+ return_dict: Optional[bool] = None,
471
+ ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
472
+ r"""
473
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
474
+ Labels for position (index) of the start of the labelled span for computing the token classification loss.
475
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
476
+ are not taken into account for computing the loss.
477
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
478
+ Labels for position (index) of the end of the labelled span for computing the token classification loss.
479
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
480
+ are not taken into account for computing the loss.
481
+ """
482
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
483
+
484
+ outputs = self.bert(
485
+ input_ids,
486
+ attention_mask=attention_mask,
487
+ token_type_ids=token_type_ids,
488
+ position_ids=position_ids,
489
+ head_mask=head_mask,
490
+ inputs_embeds=inputs_embeds,
491
+ output_attentions=output_attentions,
492
+ output_hidden_states=output_hidden_states,
493
+ return_dict=return_dict,
494
+ )
495
+
496
+ sequence_output = outputs[0]
497
+
498
+ logits = self.qa_outputs(sequence_output)
499
+ start_logits, end_logits = logits.split(1, dim=-1)
500
+ start_logits = start_logits.squeeze(-1).contiguous()
501
+ end_logits = end_logits.squeeze(-1).contiguous()
502
+
503
+ total_loss = None
504
+ if start_positions is not None and end_positions is not None:
505
+ shared_head = BertForQuestionAnsweringConfig.gen_shared_head(self)
506
+ criterion = BertForQuestionAnsweringConfig.gen_criterion()
507
+ total_loss = trp_criterion(self.trp_blocks, shared_head, criterion, lambdas, sequence_output, logits, [start_positions, end_positions], loss_normalization) # NOTE: Apply TRP!
508
+
509
+ if not return_dict:
510
+ output = (start_logits, end_logits) + outputs[2:]
511
+ return ((total_loss,) + output) if total_loss is not None else output
512
+
513
+ return QuestionAnsweringModelOutput(
514
+ loss=total_loss,
515
+ start_logits=start_logits,
516
+ end_logits=end_logits,
517
+ hidden_states=outputs.hidden_states,
518
+ attentions=outputs.attentions,
519
+ )
520
+ return func
521
+
522
+
523
+ # FCN for Semantic Segmentation
524
+ class FCNConfig(Config):
525
+ @staticmethod
526
+ def gen_criterion(top_k=1):
527
+ def func(input, target, mask=None):
528
+ """
529
+ Args:
530
+ input (Tensor): Input tensor of shape [B, C, H, W].
531
+ target (Tensor): Target labels of shape [B, H, W].
532
+
533
+ Returns:
534
+ loss (Tensor): Scalar tensor representing the loss.
535
+ mask (Tensor): Boolean mask tensor of shape [B, H, W].
536
+ """
537
+ if mask is None:
538
+ mask = torch.ones_like(target, dtype=torch.float32, device=target.device)
539
+
540
+ masked_loss = F.cross_entropy(input, target, ignore_index=255, reduction="none")
541
+ loss = torch.sum(mask * masked_loss) / (torch.sum(mask) + 1e-6)
542
+
543
+ with torch.no_grad():
544
+ topk_values, topk_indices = torch.topk(input, top_k, dim=1)
545
+ mask = mask * torch.eq(topk_indices, target[:, None, :, :]).any(dim=1).to(input.dtype)
546
+ # mask = mask * torch.eq(torch.argmax(x, dim=1), target).to(x.dtype)
547
+
548
+ return loss, mask
549
+ return func
550
+
551
+ @staticmethod
552
+ def gen_out_shared_head(self, input_shape):
553
+ def func(features):
554
+ """
555
+ Args:
556
+ features (Tensor): features tensor of shape [B, hidden_units, H, W].
557
+
558
+ Returns:
559
+ result (Tensors): result tensor of shape [B, C, H, W].
560
+ """
561
+ x = self.classifier(features)
562
+ result = F.interpolate(x, size=input_shape, mode="bilinear", align_corners=False)
563
+ return result
564
+ return func
565
+
566
+ @staticmethod
567
+ def gen_aux_shared_head(self, input_shape):
568
+ def func(features):
569
+ """
570
+ Args:
571
+ features (Tensor): features tensor of shape [B, hidden_units, H, W].
572
+
573
+ Returns:
574
+ result (Tensors): result tensor of shape [B, C, H, W].
575
+ """
576
+ x = self.aux_classifier(features)
577
+ result = F.interpolate(x, size=input_shape, mode="bilinear", align_corners=False)
578
+ return result
579
+ return func
580
+
581
+ @staticmethod
582
+ def gen_forward(lambdas, loss_normalization=True, top_k=1):
583
+ def func(self, images: Tensor, targets=None):
584
+ input_shape = images.shape[-2:]
585
+ # contract: features is a dict of tensors
586
+ features = self.backbone(images)
587
+
588
+ result = OrderedDict()
589
+ x = features["out"]
590
+ x = self.classifier(x)
591
+ x = F.interpolate(x, size=input_shape, mode="bilinear", align_corners=False)
592
+ result["out"] = x
593
+
594
+ if self.aux_classifier is not None:
595
+ x = features["aux"]
596
+ x = self.aux_classifier(x)
597
+ x = F.interpolate(x, size=input_shape, mode="bilinear", align_corners=False)
598
+ result["aux"] = x
599
+
600
+ if self.training:
601
+ torch._assert(targets is not None, "targets should not be none when in training mode")
602
+ out_shared_head = FCNConfig.gen_out_shared_head(self, input_shape)
603
+ aux_shared_head = FCNConfig.gen_aux_shared_head(self, input_shape)
604
+ criterion = FCNConfig.gen_criterion(top_k)
605
+ out_loss = trp_criterion(self.out_trp_blocks, out_shared_head, criterion, lambdas, features["out"], result["out"], targets, loss_normalization)
606
+ aux_loss = trp_criterion(self.aux_trp_blocks, aux_shared_head, criterion, lambdas, features["aux"], result["aux"], targets, loss_normalization)
607
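+ # weight the auxiliary head at 0.5, as in the torchvision segmentation reference recipe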
+ loss = out_loss + 0.5 * aux_loss
608
+ return result, loss
609
+ return result
610
+ return func
611
+
612
+
613
+ # DeepLabV3 for Semantic Segmentation
614
+ class DeepLabV3Config(FCNConfig):
615
+ pass
616
+
617
+
618
+ # Bert for Text Classification
619
+ class BertForSequenceClassificationConfig(Config):
620
+ @staticmethod
621
+ def gen_criterion():
622
+ def func(input, target, mask=None):
623
+ """
624
+ Args:
625
+ input (Tensor): Input tensor of shape [B, C].
626
+ target (Tensor): Target labels of shape [B].
627
+
628
+ Returns:
629
+ loss (Tensor): Scalar tensor representing the loss.
630
+ mask (Tensor): Boolean mask tensor of shape [B].
631
+ """
632
+ if mask is None:
633
+ mask = torch.ones_like(target, dtype=torch.float32, device=target.device)
634
+
635
+ unmasked_loss = F.cross_entropy(input, target, reduction="none")
636
+ loss = torch.sum(mask * unmasked_loss) / (torch.sum(mask) + 1e-6)
637
+
638
+ with torch.no_grad():
639
+ mask = mask * torch.eq(torch.argmax(input, dim=1), target).to(input.dtype)
640
+
641
+ return loss, mask
642
+ return func
643
+
644
+ @staticmethod
645
+ def gen_shared_head(self):
646
+ def func(hidden_states):
647
+ """
648
+ Args:
649
+ hidden_states (Tensor): Hidden States of shape [B, hidden_units].
650
+
651
+ Returns:
652
+ logits (Tensor): Logits tensor of shape [B, C].
653
+ """
654
+ logits = self.classifier(hidden_states)
655
+ return logits
656
+ return func
657
+
658
+ @staticmethod
659
+ def gen_forward(lambdas, loss_normalization=False):
660
+ def func(
661
+ self,
662
+ input_ids: Optional[torch.Tensor] = None,
663
+ attention_mask: Optional[torch.Tensor] = None,
664
+ token_type_ids: Optional[torch.Tensor] = None,
665
+ position_ids: Optional[torch.Tensor] = None,
666
+ head_mask: Optional[torch.Tensor] = None,
667
+ inputs_embeds: Optional[torch.Tensor] = None,
668
+ labels: Optional[torch.Tensor] = None,
669
+ output_attentions: Optional[bool] = None,
670
+ output_hidden_states: Optional[bool] = None,
671
+ return_dict: Optional[bool] = None,
672
+ ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
673
+ r"""
674
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
675
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
676
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
677
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
678
+ """
679
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
680
+
681
+ outputs = self.bert(
682
+ input_ids,
683
+ attention_mask=attention_mask,
684
+ token_type_ids=token_type_ids,
685
+ position_ids=position_ids,
686
+ head_mask=head_mask,
687
+ inputs_embeds=inputs_embeds,
688
+ output_attentions=output_attentions,
689
+ output_hidden_states=output_hidden_states,
690
+ return_dict=return_dict,
691
+ )
692
+
693
+ pooled_output = outputs[1]
694
+
695
+ pooled_output = self.dropout(pooled_output)
696
+ logits = self.classifier(pooled_output)
697
+
698
+ loss = None
699
+ if labels is not None:
700
+ assert self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int) # TODO: remove this
701
+ if self.config.problem_type is None:
702
+ if self.num_labels == 1:
703
+ self.config.problem_type = "regression"
704
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
705
+ self.config.problem_type = "single_label_classification"
706
+ else:
707
+ self.config.problem_type = "multi_label_classification"
708
+
709
+ if self.config.problem_type == "regression":
710
+ if self.num_labels == 1:
711
+ loss = F.mse_loss(logits.squeeze(), labels.squeeze())
712
+ else:
713
+ loss = F.mse_loss(logits, labels)
714
+ elif self.config.problem_type == "single_label_classification":
715
+ shared_head = BertForSequenceClassificationConfig.gen_shared_head(self)
716
+ criterion = BertForSequenceClassificationConfig.gen_criterion()
717
+ loss = trp_criterion(self.trp_blocks, shared_head, criterion, lambdas, pooled_output, logits, labels, loss_normalization)
718
+ elif self.config.problem_type == "multi_label_classification":
719
+ loss = F.binary_cross_entropy_with_logits(logits, labels)
720
+ if not return_dict:
721
+ output = (logits,) + outputs[2:]
722
+ return ((loss,) + output) if loss is not None else output
723
+
724
+ return SequenceClassifierOutput(
725
+ loss=loss,
726
+ logits=logits,
727
+ hidden_states=outputs.hidden_states,
728
+ attentions=outputs.attentions,
729
+ )
730
+ return func
731
+
732
+
733
+ # Roberta for Text Classification
734
+ class RobertaForSequenceClassificationConfig(BertForSequenceClassificationConfig):
735
+ @staticmethod
736
+ def gen_shared_head(self):
737
+ def func(hidden_states):
738
+ """
739
+ Args:
740
+ hidden_states (Tensor): Hidden States of shape [B, hidden_units].
741
+
742
+ Returns:
743
+ logits (Tensor): Logits tensor of shape [B, C].
744
+ """
745
+ logits = self.classifier(hidden_states)
746
+ return logits
747
+ return func
748
+
749
+ @staticmethod
750
+ def gen_forward(lambdas, loss_normalization=False):
751
+ def func(
752
+ self,
753
+ input_ids: Optional[torch.LongTensor] = None,
754
+ attention_mask: Optional[torch.FloatTensor] = None,
755
+ token_type_ids: Optional[torch.LongTensor] = None,
756
+ position_ids: Optional[torch.LongTensor] = None,
757
+ head_mask: Optional[torch.FloatTensor] = None,
758
+ inputs_embeds: Optional[torch.FloatTensor] = None,
759
+ labels: Optional[torch.LongTensor] = None,
760
+ output_attentions: Optional[bool] = None,
761
+ output_hidden_states: Optional[bool] = None,
762
+ return_dict: Optional[bool] = None,
763
+ ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
764
+ r"""
765
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
766
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
767
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
768
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
769
+ """
770
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
771
+
772
+ outputs = self.roberta(
773
+ input_ids,
774
+ attention_mask=attention_mask,
775
+ token_type_ids=token_type_ids,
776
+ position_ids=position_ids,
777
+ head_mask=head_mask,
778
+ inputs_embeds=inputs_embeds,
779
+ output_attentions=output_attentions,
780
+ output_hidden_states=output_hidden_states,
781
+ return_dict=return_dict,
782
+ )
783
+ sequence_output = outputs[0]
784
+ logits = self.classifier(sequence_output)
785
+
786
+ loss = None
787
+ if labels is not None:
788
+ assert self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int) # TODO: remove this
789
+ # move labels to correct device to enable model parallelism
790
+ labels = labels.to(logits.device)
791
+ if self.config.problem_type is None:
792
+ if self.num_labels == 1:
793
+ self.config.problem_type = "regression"
794
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
795
+ self.config.problem_type = "single_label_classification"
796
+ else:
797
+ self.config.problem_type = "multi_label_classification"
798
+
799
+ if self.config.problem_type == "regression":
800
+ if self.num_labels == 1:
801
+ loss = F.mse_loss(logits.squeeze(), labels.squeeze())
802
+ else:
803
+ loss = F.mse_loss(logits, labels)
804
+ elif self.config.problem_type == "single_label_classification":
805
+ shared_head = BertForSequenceClassificationConfig.gen_shared_head(self)
806
+ criterion = BertForSequenceClassificationConfig.gen_criterion()
807
+ loss = trp_criterion(self.trp_blocks, shared_head, criterion, lambdas, sequence_output, logits, labels, loss_normalization)
808
+ elif self.config.problem_type == "multi_label_classification":
809
+ loss = F.binary_cross_entropy_with_logits(logits, labels)
810
+
811
+ if not return_dict:
812
+ output = (logits,) + outputs[2:]
813
+ return ((loss,) + output) if loss is not None else output
814
+
815
+ return SequenceClassifierOutput(
816
+ loss=loss,
817
+ logits=logits,
818
+ hidden_states=outputs.hidden_states,
819
+ attentions=outputs.attentions,
820
+ )
821
+ return func
822
+
823
+
824
+ # Wav2Vec2 for Speech Recognition
825
+ class Wav2Vec2ForCTCConfig(Config):
826
+ _HIDDEN_STATES_START_POSITION = 2
827
+
828
+ @staticmethod
829
+ def greedy_decode_ctc(
830
+ log_probs: torch.Tensor,
831
+ input_lengths: torch.Tensor,
832
+ blank_token_id: int,
833
+ target_lengths: torch.Tensor
834
+ ):
835
+ """
836
+ Convert logits to flattened predictions that match the shape of flattened_targets.
837
+
838
+ Args:
839
+ log_probs: [B, L, V] - log-softmax output
840
+ input_lengths: [B] - actual length of each input
841
+ blank_token_id: int - index of blank token
842
+ target_lengths: [B] - used to determine how many predictions to keep per sample
843
+
844
+ Returns:
845
+ flattened_predictions: 1D tensor, same total length as sum(target_lengths)
846
+ """
847
+ batch_size = log_probs.size(0)
848
+ decoded_all = []
849
+
850
+ predicted_ids = log_probs.argmax(dim=-1) # [B, L]
851
+
852
+ for i in range(batch_size):
853
+ pred = predicted_ids[i][:input_lengths[i]] # [Li]
854
+ prev = None
855
+ decoded = []
856
+ for token in pred:
857
+ token = token.item()
858
+ if token != blank_token_id and token != prev:
859
+ decoded.append(token)
860
+ prev = token
861
+ # Trim or pad to match target_lengths[i]
862
+ tgt_len = target_lengths[i].item()
863
+ if len(decoded) >= tgt_len:
864
+ decoded = decoded[:tgt_len]
865
+ else:
866
+ decoded = decoded + [blank_token_id] * (tgt_len - len(decoded)) # pad with blank
867
+ decoded_all.extend(decoded)
868
+
869
+ return torch.tensor(decoded_all, dtype=torch.long, device=log_probs.device) # shape: [sum(target_lengths)]
870
+
871
+ @staticmethod
872
+ def gen_criterion(input_lengths: Tensor, pad_token_id: int, ctc_zero_infinity: bool):
873
+ def func(logits: Tensor, labels: Tensor, mask=None):
874
+ """
875
+ Args:
876
+ logits (Tensor): Logits of shape [B, L, V]; log-softmax is applied internally.
877
+ labels (Tensor): Padded target ids of shape [B, L']; negative entries are treated as padding.
878
+
879
+ Returns:
880
+ loss (Tensor): Scalar tensor representing the loss.
881
+ mask (Tensor): Boolean mask tensor of shape [B].
882
+ """
883
+ if mask is None:
884
+ mask = torch.ones_like(input_lengths, dtype=torch.float32, device=input_lengths.device)
885
+
886
+ log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)
887
+ labels_mask = labels >= 0
888
+ target_lengths = labels_mask.sum(-1)
889
+ flattened_targets = labels.masked_select(labels_mask)
890
+ with torch.backends.cudnn.flags(enabled=False):
891
+ masked_losses = nn.functional.ctc_loss(log_probs, flattened_targets, input_lengths, target_lengths, blank=pad_token_id, reduction="none", zero_infinity=ctc_zero_infinity)
892
+ loss = torch.sum(mask * masked_losses) / (torch.sum(mask) + 1e-6)
893
+
894
+ with torch.no_grad():
895
+ thres = 0.5
896
+ flattened_predictions = Wav2Vec2ForCTCConfig.greedy_decode_ctc(
897
+ log_probs.transpose(0, 1), # [B, T, V]
898
+ input_lengths=input_lengths,
899
+ blank_token_id=pad_token_id,
900
+ target_lengths=target_lengths
901
+ )
902
+ token_wise_mask = torch.eq(flattened_predictions, flattened_targets).to(flattened_targets.dtype)
903
+ segment_ids = torch.arange(len(target_lengths), device=target_lengths.device).repeat_interleave(target_lengths)
904
+ sequence_wise_mask = torch.zeros(len(target_lengths), dtype=target_lengths.dtype, device=token_wise_mask.device).scatter_add(0, segment_ids, token_wise_mask)
905
+ mask = mask * torch.ge(sequence_wise_mask, thres * target_lengths).to(flattened_targets.dtype)
906
+
907
+ return loss, mask
908
+ return func
909
+
910
+ @staticmethod
911
+ def gen_shared_head(self):
912
+ def func(hidden_states):
913
+ """
914
+ Args:
915
+ hidden_states (Tensor): Hidden States of shape [B, L, hidden_units].
916
+
917
+ Returns:
918
+ logits (Tensor): Logits tensor of shape [B, L, vocab_size].
919
+ """
920
+ logits = self.lm_head(hidden_states)
921
+ # log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)
922
+ return logits
923
+ return func
924
+
925
+ @staticmethod
926
+ def gen_forward(lambdas, loss_normalization=False):
927
+ def func(
928
+ self,
929
+ input_values: Optional[torch.Tensor],
930
+ attention_mask: Optional[torch.Tensor] = None,
931
+ output_attentions: Optional[bool] = None,
932
+ output_hidden_states: Optional[bool] = None,
933
+ return_dict: Optional[bool] = None,
934
+ labels: Optional[torch.Tensor] = None,
935
+ ) -> Union[Tuple, CausalLMOutput]:
936
+ r"""
937
+ labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
938
+ Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
939
+ the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
940
+ All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
941
+ config.vocab_size - 1]`.
942
+ """
943
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
944
+
945
+ if labels is not None and labels.max() >= self.config.vocab_size:
946
+ raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
947
+
948
+ outputs = self.wav2vec2(
949
+ input_values,
950
+ attention_mask=attention_mask,
951
+ output_attentions=output_attentions,
952
+ output_hidden_states=output_hidden_states,
953
+ return_dict=return_dict,
954
+ )
955
+
956
+ hidden_states = outputs[0]
957
+ hidden_states = self.dropout(hidden_states)
958
+
959
+ logits = self.lm_head(hidden_states)
960
+
961
+ loss = None
962
+ if labels is not None:
963
+ # retrieve loss input_lengths from attention_mask
964
+ attention_mask = (
965
+ attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
966
+ )
967
+ input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
968
+ shared_head = Wav2Vec2ForCTCConfig.gen_shared_head(self)
969
+ criterion = Wav2Vec2ForCTCConfig.gen_criterion(input_lengths, self.config.pad_token_id, self.config.ctc_zero_infinity)
970
+ loss = trp_criterion(self.trp_blocks, shared_head, criterion, lambdas, hidden_states, logits, labels, loss_normalization) # NOTE: Apply TRP!
971
+
972
+ if not return_dict:
973
+ output = (logits,) + outputs[Wav2Vec2ForCTCConfig._HIDDEN_STATES_START_POSITION:]
974
+ return ((loss,) + output) if loss is not None else output
975
+
976
+ return CausalLMOutput(
977
+ loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
978
+ )
979
+ return func
980
+
981
+
982
+ # MBart for Translation
983
+ class MBartForConditionalGenerationConfig(Config):
984
+ @staticmethod
985
+ def gen_criterion(vocab_size: int, top_k=1):
986
+ def func(logits, labels, mask=None):
987
+ """
988
+ Args:
989
+ logits (Tensor): Logits tensor of shape [B, L, V].
990
+ labels (Tensor): Target labels of shape [B, L].
991
+
992
+ Returns:
993
+ loss (Tensor): Scalar tensor representing the loss.
994
+ mask (Tensor): 0/1 mask tensor of shape [B*L] (one entry per target token).
995
+ """
996
+ if mask is None:
997
+ mask = torch.ones_like(labels.view(-1), dtype=torch.float32, device=labels.device)
998
+
999
+ masked_losses = F.cross_entropy(logits.view(-1, vocab_size), labels.view(-1), reduction="none")
1000
+ loss = torch.sum(mask * masked_losses) / (torch.sum(mask) + 1e-6)
1001
+
1002
+ with torch.no_grad():
1003
+ topk_values, topk_indices = torch.topk(logits.view(-1, vocab_size), top_k, dim=1)
1004
+ mask = mask * torch.eq(topk_indices, labels.view(-1, 1)).any(dim=1).to(logits.dtype)
1005
+
1006
+ return loss, mask
1007
+ return func
1008
+
1009
+ @staticmethod
1010
+ def gen_shared_head(self):
1011
+ def func(hidden_states):
1012
+ """
1013
+ Args:
1014
+ hidden_states (Tensor): Hidden States of shape [B, L, hidden_units].
1015
+
1016
+ Returns:
1017
+ logits (Tensor): Logits tensor of shape [B, L, vocab_size].
1018
+ """
1019
+ logits = self.lm_head(hidden_states) + self.final_logits_bias
1020
+ return logits
1021
+ return func
1022
+
1023
+ @staticmethod
1024
+ def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int):
1025
+ """
1026
+ Shift input ids one token to the right, and wrap the last non-pad token (the <LID> token). Note that MBart does not
1027
+ have a single `decoder_start_token_id` in contrast to other Bart-like models.
1028
+ """
1029
+ prev_output_tokens = input_ids.clone()
1030
+
1031
+ if pad_token_id is None:
1032
+ raise ValueError("self.model.config.pad_token_id has to be defined.")
1033
+ # replace possible -100 values in labels by `pad_token_id`
1034
+ prev_output_tokens.masked_fill_(prev_output_tokens == -100, pad_token_id)
1035
+
1036
+ index_of_eos = (prev_output_tokens.ne(pad_token_id).sum(dim=1) - 1).unsqueeze(-1)
1037
+ decoder_start_tokens = prev_output_tokens.gather(1, index_of_eos).squeeze()
1038
+ prev_output_tokens[:, 1:] = prev_output_tokens[:, :-1].clone()
1039
+ prev_output_tokens[:, 0] = decoder_start_tokens
1040
+
1041
+ return prev_output_tokens
1042
+
1043
+ @staticmethod
1044
+ def gen_forward(lambdas, loss_normalization=False):
1045
+ def func(
1046
+ self,
1047
+ input_ids: torch.LongTensor = None,
1048
+ attention_mask: Optional[torch.Tensor] = None,
1049
+ decoder_input_ids: Optional[torch.LongTensor] = None,
1050
+ decoder_attention_mask: Optional[torch.LongTensor] = None,
1051
+ head_mask: Optional[torch.Tensor] = None,
1052
+ decoder_head_mask: Optional[torch.Tensor] = None,
1053
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
1054
+ encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
1055
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
1056
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1057
+ decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
1058
+ labels: Optional[torch.LongTensor] = None,
1059
+ use_cache: Optional[bool] = None,
1060
+ output_attentions: Optional[bool] = None,
1061
+ output_hidden_states: Optional[bool] = None,
1062
+ return_dict: Optional[bool] = None,
1063
+ ) -> Union[Seq2SeqLMOutput, Tuple[torch.FloatTensor]]:
1064
+ r"""
1065
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1066
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
1067
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
1068
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1069
+
1070
+ Returns:
1071
+
1072
+ """
1073
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1074
+
1075
+ if labels is not None:
1076
+ # if use_cache:
1077
+ # logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
1078
+ use_cache = False
1079
+ if decoder_input_ids is None and decoder_inputs_embeds is None:
1080
+ decoder_input_ids = MBartForConditionalGenerationConfig.shift_tokens_right(labels, self.config.pad_token_id)
1081
+
1082
+ outputs = self.model(
1083
+ input_ids,
1084
+ attention_mask=attention_mask,
1085
+ decoder_input_ids=decoder_input_ids,
1086
+ encoder_outputs=encoder_outputs,
1087
+ decoder_attention_mask=decoder_attention_mask,
1088
+ head_mask=head_mask,
1089
+ decoder_head_mask=decoder_head_mask,
1090
+ cross_attn_head_mask=cross_attn_head_mask,
1091
+ past_key_values=past_key_values,
1092
+ inputs_embeds=inputs_embeds,
1093
+ decoder_inputs_embeds=decoder_inputs_embeds,
1094
+ use_cache=use_cache,
1095
+ output_attentions=output_attentions,
1096
+ output_hidden_states=output_hidden_states,
1097
+ return_dict=return_dict,
1098
+ )
1099
+ lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias
1100
+
1101
+ masked_lm_loss = None
1102
+ if labels is not None:
1103
+ shared_head = MBartForConditionalGenerationConfig.gen_shared_head(self)
1104
+ criterion = MBartForConditionalGenerationConfig.gen_criterion(self.config.vocab_size)
1105
+ masked_lm_loss = trp_criterion(self.trp_blocks, shared_head, criterion, lambdas, outputs[0], lm_logits, labels, loss_normalization)
1106
+
1107
+ if not return_dict:
1108
+ output = (lm_logits,) + outputs[1:]
1109
+ return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
1110
+
1111
+ return Seq2SeqLMOutput(
1112
+ loss=masked_lm_loss,
1113
+ logits=lm_logits,
1114
+ past_key_values=outputs.past_key_values,
1115
+ decoder_hidden_states=outputs.decoder_hidden_states,
1116
+ decoder_attentions=outputs.decoder_attentions,
1117
+ cross_attentions=outputs.cross_attentions,
1118
+ encoder_last_hidden_state=outputs.encoder_last_hidden_state,
1119
+ encoder_hidden_states=outputs.encoder_hidden_states,
1120
+ encoder_attentions=outputs.encoder_attentions,
1121
+ )
1122
+ return func
1123
+
1124
+
1125
+ def apply_trp(model, depths: int, p: float, lambdas: List[float], **kwargs):
1126
+ if isinstance(model, transformers.Wav2Vec2ForSequenceClassification):
1127
+ print("✅ Applying TRP to Wav2Vec2 for Audio Classification...")
1128
+ model.trp_blocks = torch.nn.ModuleList([TPBlock(depths, 768, p) for _ in lambdas])
1129
+ model.forward = types.MethodType(Wav2Vec2ForSequenceClassificationConfig.gen_forward(lambdas, False), model)
1130
+ elif isinstance(model, MobileNetV2):
1131
+ print("✅ Applying TRP to MobileNetV2 for Image Classification...")
1132
+ model.trp_blocks = torch.nn.ModuleList([TPBlock(depths, 1280, p) for _ in lambdas])
1133
+ model.forward = types.MethodType(MobileNetV2Config.gen_forward(lambdas, True, label_smoothing=0.0, top_k=1), model)
1134
+ elif isinstance(model, ResNet):
1135
+ print("✅ Applying TRP to ResNet for Image Classification...")
1136
+ model.trp_blocks = torch.nn.ModuleList([TPBlock(depths, 2048, p) for _ in lambdas])
1137
+ model.forward = types.MethodType(ResNetConfig.gen_forward(lambdas, True, label_smoothing=0.0, top_k=1), model)
1138
+ elif isinstance(model, EfficientNet):
1139
+ print("✅ Applying TRP to EfficientNet for Image Classification...")
1140
+ model.trp_blocks = torch.nn.ModuleList([TPBlock(depths, 1280, p) for _ in lambdas])
1141
+ model.forward = types.MethodType(EfficientNetConfig.gen_forward(lambdas, True, label_smoothing=kwargs["label_smoothing"], top_k=1), model)
1142
+ elif isinstance(model, VisionTransformer):
1143
+ print("✅ Applying TRP to VisionTransformer for Image Classification...")
1144
+ model.trp_blocks = torch.nn.ModuleList([TPBlock(depths, 768, p) for _ in lambdas])
1145
+ model.forward = types.MethodType(VisionTransformerConfig.gen_forward(lambdas, True, label_smoothing=kwargs["label_smoothing"], top_k=1), model)
1146
+ elif isinstance(model, transformers.BertForQuestionAnswering):
1147
+ print("✅ Applying TRP to Bert for Question Answering...")
1148
+ model.trp_blocks = torch.nn.ModuleList([TPBlock(depths, 768, p) for _ in lambdas])
1149
+ model.forward = types.MethodType(BertForQuestionAnsweringConfig.gen_forward(lambdas, True, 1), model)
1150
+ elif isinstance(model, FCN):
1151
+ print("✅ Applying TRP to FCN for Semantic Segmentation...")
1152
+ model.out_trp_blocks = torch.nn.ModuleList([TPBlock(depths, 2048, p, dim=1) for _ in lambdas])
1153
+ model.aux_trp_blocks = torch.nn.ModuleList([TPBlock(depths, 1024, p, dim=1) for _ in lambdas])
1154
+ model.forward = types.MethodType(FCNConfig.gen_forward(lambdas, True, 1), model)
1155
+ elif isinstance(model, DeepLabV3):
1156
+ print("✅ Applying TRP to DeepLabV3 for Semantic Segmentation...")
1157
+ model.out_trp_blocks = torch.nn.ModuleList([TPBlock(depths, 2048, p, dim=1) for _ in lambdas])
1158
+ model.aux_trp_blocks = torch.nn.ModuleList([TPBlock(depths, 1024, p, dim=1) for _ in lambdas])
1159
+ model.forward = types.MethodType(DeepLabV3Config.gen_forward(lambdas, True, 1), model)
1160
+ elif isinstance(model, transformers.BertForSequenceClassification):
1161
+ print("✅ Applying TRP to Bert for Text Classification...")
1162
+ model.trp_blocks = torch.nn.ModuleList([TPBlock(depths, 768, p) for _ in lambdas])
1163
+ model.forward = types.MethodType(BertForSequenceClassificationConfig.gen_forward(lambdas, False), model)
1164
+ elif isinstance(model, transformers.RobertaForSequenceClassification):
1165
+ print("✅ Applying TRP to Roberta for Text Classification...")
1166
+ model.trp_blocks = torch.nn.ModuleList([TPBlock(depths, 768, p) for _ in lambdas])
1167
+ model.forward = types.MethodType(RobertaForSequenceClassificationConfig.gen_forward(lambdas, False), model)
1168
+ elif isinstance(model, transformers.Wav2Vec2ForCTC):
1169
+ print("✅ Applying TRP to Wav2Vec2 for Speech Recognition...")
1170
+ model.trp_blocks = torch.nn.ModuleList([TPBlock(depths, 1024, p) for _ in lambdas])
1171
+ model.forward = types.MethodType(Wav2Vec2ForCTCConfig.gen_forward(lambdas, False), model)
1172
+ elif isinstance(model, transformers.MBartForConditionalGeneration):
1173
+ print("✅ Applying TRP to MBart for Translation...")
1174
+ model.trp_blocks = torch.nn.ModuleList([TPBlock(depths, 1024, p) for _ in lambdas])
1175
+ model.forward = types.MethodType(MBartForConditionalGenerationConfig.gen_forward(lambdas, False), model)
1176
+ else:
1177
+ torch._assert(
1178
+ isinstance(model, transformers.Wav2Vec2ForSequenceClassification),
1179
+ "The model should be an object of [`Wav2Vec2ForSequenceClassification`].")
1180
+
1181
+ return model
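For orientation, here is a minimal usage sketch of apply_trp on a torchvision ResNet-50. The depth/p/lambda values simply mirror run.sh and are not tuned, and the (logits, loss) return during training reflects how train.py calls the patched forward:

import torch
import torchvision
from trplib import apply_trp

# Build a plain ResNet-50 and attach TRP blocks (hyperparameters mirror run.sh).
model = torchvision.models.get_model("resnet50", weights="ResNet50_Weights.IMAGENET1K_V1")
model = apply_trp(model, depths=1, p=0.2, lambdas=[0.4, 0.2, 0.1])

# With targets the patched forward returns (logits, loss), as consumed in train.py;
# without targets it behaves like a regular forward pass.
model.train()
images = torch.randn(2, 3, 224, 224)
targets = torch.randint(0, 1000, (2,))
logits, loss = model(images, targets)
loss.backward()
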
hpo-examples/image-classification/__pycache__/presets.cpython-310.pyc ADDED
Binary file (2.31 kB). View file
 
hpo-examples/image-classification/__pycache__/sampler.cpython-310.pyc ADDED
Binary file (2.41 kB). View file
 
hpo-examples/image-classification/__pycache__/transforms.cpython-310.pyc ADDED
Binary file (5.29 kB). View file
 
hpo-examples/image-classification/__pycache__/trplib.cpython-310.pyc ADDED
Binary file (37.5 kB). View file
 
hpo-examples/image-classification/__pycache__/utils.cpython-310.pyc ADDED
Binary file (14.9 kB). View file
 
hpo-examples/image-classification/efficientnet_v2_m/model_7.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f4b45a082517a3e60498e92710f5a97b5869515db52e536d9c92a3b68ae4e8f
3
+ size 454515355
hpo-examples/image-classification/mobilenetv2/model_32.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f240e3b954e1b1786878733c3045125461fa67bcb6ce50a200d5ec6e46081bc7
3
+ size 48002008
hpo-examples/image-classification/presets.py ADDED
@@ -0,0 +1,71 @@
1
+ import torch
2
+ from torchvision.transforms import autoaugment, transforms
3
+ from torchvision.transforms.functional import InterpolationMode
4
+
5
+
6
+ class ClassificationPresetTrain:
7
+ def __init__(
8
+ self,
9
+ *,
10
+ crop_size,
11
+ mean=(0.485, 0.456, 0.406),
12
+ std=(0.229, 0.224, 0.225),
13
+ interpolation=InterpolationMode.BILINEAR,
14
+ hflip_prob=0.5,
15
+ auto_augment_policy=None,
16
+ ra_magnitude=9,
17
+ augmix_severity=3,
18
+ random_erase_prob=0.0,
19
+ ):
20
+ trans = [transforms.RandomResizedCrop(crop_size, interpolation=interpolation)]
21
+ if hflip_prob > 0:
22
+ trans.append(transforms.RandomHorizontalFlip(hflip_prob))
23
+ if auto_augment_policy is not None:
24
+ if auto_augment_policy == "ra":
25
+ trans.append(autoaugment.RandAugment(interpolation=interpolation, magnitude=ra_magnitude))
26
+ elif auto_augment_policy == "ta_wide":
27
+ trans.append(autoaugment.TrivialAugmentWide(interpolation=interpolation))
28
+ elif auto_augment_policy == "augmix":
29
+ trans.append(autoaugment.AugMix(interpolation=interpolation, severity=augmix_severity))
30
+ else:
31
+ aa_policy = autoaugment.AutoAugmentPolicy(auto_augment_policy)
32
+ trans.append(autoaugment.AutoAugment(policy=aa_policy, interpolation=interpolation))
33
+ trans.extend(
34
+ [
35
+ transforms.PILToTensor(),
36
+ transforms.ConvertImageDtype(torch.float),
37
+ transforms.Normalize(mean=mean, std=std),
38
+ ]
39
+ )
40
+ if random_erase_prob > 0:
41
+ trans.append(transforms.RandomErasing(p=random_erase_prob))
42
+
43
+ self.transforms = transforms.Compose(trans)
44
+
45
+ def __call__(self, img):
46
+ return self.transforms(img)
47
+
48
+
49
+ class ClassificationPresetEval:
50
+ def __init__(
51
+ self,
52
+ *,
53
+ crop_size,
54
+ resize_size=256,
55
+ mean=(0.485, 0.456, 0.406),
56
+ std=(0.229, 0.224, 0.225),
57
+ interpolation=InterpolationMode.BILINEAR,
58
+ ):
59
+
60
+ self.transforms = transforms.Compose(
61
+ [
62
+ transforms.Resize(resize_size, interpolation=interpolation),
63
+ transforms.CenterCrop(crop_size),
64
+ transforms.PILToTensor(),
65
+ transforms.ConvertImageDtype(torch.float),
66
+ transforms.Normalize(mean=mean, std=std),
67
+ ]
68
+ )
69
+
70
+ def __call__(self, img):
71
+ return self.transforms(img)
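A short sketch of how these presets plug into an ImageFolder pipeline; the paths and sizes below are placeholders, and load_data in train.py wires up the same pieces from the script arguments:

import torchvision
from presets import ClassificationPresetTrain, ClassificationPresetEval

# Placeholder sizes; train.py takes them from --train-crop-size, --val-resize-size, etc.
train_tf = ClassificationPresetTrain(crop_size=224, auto_augment_policy="ta_wide", random_erase_prob=0.1)
val_tf = ClassificationPresetEval(crop_size=224, resize_size=256)

# Placeholder dataset roots.
train_ds = torchvision.datasets.ImageFolder("/path/to/imagenet/train", train_tf)
val_ds = torchvision.datasets.ImageFolder("/path/to/imagenet/val", val_tf)
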
hpo-examples/image-classification/resnet50/model_35.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b304e1309e3143b9e7109d5b7263369e4e93c949dde9bab44b3d3b193d16361
3
+ size 255177167
hpo-examples/image-classification/run.sh ADDED
@@ -0,0 +1,49 @@
1
+ # ✅ --lr 0.00002 Acc@1 71.878 Acc@5 90.286 -> Acc@1 72.104 Acc@5 90.316 (with normalization)
2
+ torchrun --nproc_per_node=4 train.py\
3
+ --data-path /home/cs/Documents/datasets/imagenet\
4
+ --model mobilenet_v2 --output-dir mobilenet_v2 --weights MobileNet_V2_Weights.IMAGENET1K_V1\
5
+ --batch-size 192 --epochs 40 --lr 0.0004 --lr-step-size 10 --lr-gamma 0.5 --wd 0.00004 --apply-trp --trp-depths 1 --trp-p 0.15 --trp-lambdas 0.4 0.2 0.1
6
+ # torchrun --nproc_per_node=4 train.py\
7
+ # --data-path /home/cs/Documents/datasets/imagenet\
8
+ # --model mobilenet_v2 --resume mobilenet_v2/model_32.pth --test-only
9
+
10
+
11
+ # ✅ --lr 0.0002 Acc@1 76.130 Acc@5 92.862 -> Acc@1 77.234 Acc@5 93.322 (with normalization)
12
+ torchrun --nproc_per_node=4 train.py\
13
+ --data-path /home/cs/Documents/datasets/imagenet\
14
+ --model resnet50 --output-dir resnet50 --weights ResNet50_Weights.IMAGENET1K_V1\
15
+ --batch-size 64 --epochs 40 --lr 0.0004 --lr-step-size 10 --lr-gamma 0.5 --print-freq 100\
16
+ --apply-trp --trp-depths 1 --trp-p 0.2 --trp-lambdas 0.4 0.2 0.1
17
+ # torchrun --nproc_per_node=4 train.py\
18
+ # --data-path /home/cs/Documents/datasets/imagenet\
19
+ # --model resnet50 --resume resnet50/model_35.pth --test-only
20
+
21
+
22
+ # ✅ Test: Acc@1 85.218 Acc@5 97.208
23
+ torchrun --nproc_per_node=4 train.py \
24
+ --data-path /home/cs/Documents/datasets/imagenet\
25
+ --model efficientnet_v2_m --output-dir efficientnet_v2_m --weights EfficientNet_V2_M_Weights.IMAGENET1K_V1\
26
+ --epochs 10 --batch-size 64 --lr 5e-9 --lr-scheduler cosineannealinglr --weight-decay 0.00002 \
27
+ --lr-warmup-method constant --lr-warmup-epochs 8 --lr-warmup-decay 0. \
28
+ --auto-augment ta_wide --random-erase 0.1 --label-smoothing 0.1 --mixup-alpha 0.2 --cutmix-alpha 1.0 --norm-weight-decay 0.0 \
29
+ --train-crop-size 384 --val-crop-size 480 --val-resize-size 480 --ra-sampler --ra-reps 4 --print-freq 100\
30
+ --apply-trp --trp-depths 1 --trp-p 0.2 --trp-lambdas 0.4 0.2 0.1
31
+ # torchrun --nproc_per_node=4 train.py\
32
+ # --data-path /home/cs/Documents/datasets/imagenet\
33
+ # --model efficientnet_v2_m --resume efficientnet_v2_m/model_7.pth --test-only\
34
+ # --val-crop-size 480 --val-resize-size 480
35
+
36
+
37
+ # ✅ Test: Acc@1 81.092 Acc@5 95.304
38
+ torchrun --nproc_per_node=4 train.py\
39
+ --data-path /home/cs/Documents/datasets/imagenet\
40
+ --model vit_b_16 --output-dir vit_b_16 --weights ViT_B_16_Weights.IMAGENET1K_V1\
41
+ --epochs 5 --batch-size 196 --opt adamw --lr 5e-9 --lr-scheduler cosineannealinglr --wd 0.3\
42
+ --lr-warmup-method constant --lr-warmup-epochs 3 --lr-warmup-decay 0. \
43
+ --amp --label-smoothing 0.11 --mixup-alpha 0.2 --auto-augment ra --clip-grad-norm 1 --cutmix-alpha 1.0\
44
+ --apply-trp --trp-depths 1 --trp-p 0.1 --trp-lambdas 0.4 0.2 0.1 --print-freq 100
45
+ # torchrun --nproc_per_node=4 train.py\
46
+ # --data-path /home/cs/Documents/datasets/imagenet\
47
+ # --model vit_b_16 --resume vit_b_16/model_4.pth --test-only
48
+
49
+
hpo-examples/image-classification/sampler.py ADDED
@@ -0,0 +1,62 @@
1
+ import math
2
+
3
+ import torch
4
+ import torch.distributed as dist
5
+
6
+
7
+ class RASampler(torch.utils.data.Sampler):
8
+ """Sampler that restricts data loading to a subset of the dataset for distributed,
9
+ with repeated augmentation.
10
+ It ensures that different each augmented version of a sample will be visible to a
11
+ different process (GPU).
12
+ Heavily based on 'torch.utils.data.DistributedSampler'.
13
+
14
+ This is borrowed from the DeiT Repo:
15
+ https://github.com/facebookresearch/deit/blob/main/samplers.py
16
+ """
17
+
18
+ def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True, seed=0, repetitions=3):
19
+ if num_replicas is None:
20
+ if not dist.is_available():
21
+ raise RuntimeError("Requires distributed package to be available!")
22
+ num_replicas = dist.get_world_size()
23
+ if rank is None:
24
+ if not dist.is_available():
25
+ raise RuntimeError("Requires distributed package to be available!")
26
+ rank = dist.get_rank()
27
+ self.dataset = dataset
28
+ self.num_replicas = num_replicas
29
+ self.rank = rank
30
+ self.epoch = 0
31
+ self.num_samples = int(math.ceil(len(self.dataset) * float(repetitions) / self.num_replicas))
32
+ self.total_size = self.num_samples * self.num_replicas
33
+ self.num_selected_samples = int(math.floor(len(self.dataset) // 256 * 256 / self.num_replicas))
34
+ self.shuffle = shuffle
35
+ self.seed = seed
36
+ self.repetitions = repetitions
37
+
38
+ def __iter__(self):
39
+ if self.shuffle:
40
+ # Deterministically shuffle based on epoch
41
+ g = torch.Generator()
42
+ g.manual_seed(self.seed + self.epoch)
43
+ indices = torch.randperm(len(self.dataset), generator=g).tolist()
44
+ else:
45
+ indices = list(range(len(self.dataset)))
46
+
47
+ # Add extra samples to make it evenly divisible
48
+ indices = [ele for ele in indices for i in range(self.repetitions)]
49
+ indices += indices[: (self.total_size - len(indices))]
50
+ assert len(indices) == self.total_size
51
+
52
+ # Subsample
53
+ indices = indices[self.rank : self.total_size : self.num_replicas]
54
+ assert len(indices) == self.num_samples
55
+
56
+ return iter(indices[: self.num_selected_samples])
57
+
58
+ def __len__(self):
59
+ return self.num_selected_samples
60
+
61
+ def set_epoch(self, epoch):
62
+ self.epoch = epoch
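A sketch of how RASampler stands in for DistributedSampler when --ra-sampler is set. It assumes the process group has already been initialized (e.g. by torchrun) and reuses the placeholder train_ds from the preset sketch above:

import torch
from sampler import RASampler

# Rank and world size are read from the initialized process group.
train_sampler = RASampler(train_ds, shuffle=True, repetitions=4)
loader = torch.utils.data.DataLoader(
    train_ds, batch_size=64, sampler=train_sampler, num_workers=8, pin_memory=True
)

for epoch in range(3):  # illustrative epoch count
    train_sampler.set_epoch(epoch)  # deterministic per-epoch reshuffle
    for images, targets in loader:
        pass  # training step goes here
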
hpo-examples/image-classification/train.py ADDED
@@ -0,0 +1,524 @@
1
+ import datetime
2
+ import os
3
+ import time
4
+ import warnings
5
+
6
+ import presets
7
+ import torch
8
+ import torch.utils.data
9
+ import torchvision
10
+ import transforms
11
+ import utils
12
+ from sampler import RASampler
13
+ from torch import nn
14
+ from torch.utils.data.dataloader import default_collate
15
+ from torchvision.transforms.functional import InterpolationMode
16
+
17
+ from trplib import apply_trp
18
+
19
+
20
+ def train_one_epoch(model, criterion, optimizer, data_loader, device, epoch, args, model_ema=None, scaler=None):
21
+ model.train()
22
+ metric_logger = utils.MetricLogger(delimiter=" ")
23
+ metric_logger.add_meter("lr", utils.SmoothedValue(window_size=1, fmt="{value}"))
24
+ metric_logger.add_meter("img/s", utils.SmoothedValue(window_size=10, fmt="{value}"))
25
+
26
+ header = f"Epoch: [{epoch}]"
27
+ for i, (image, target) in enumerate(metric_logger.log_every(data_loader, args.print_freq, header)):
28
+ start_time = time.time()
29
+ image, target = image.to(device), target.to(device)
30
+ with torch.amp.autocast("cuda", enabled=scaler is not None):
31
+ # output = model(image)
32
+ # loss = criterion(output, target)
33
+ output, loss = model(image, target)
34
+
35
+ optimizer.zero_grad()
36
+ if scaler is not None:
37
+ scaler.scale(loss).backward()
38
+ if args.clip_grad_norm is not None:
39
+ # we should unscale the gradients of the optimizer's assigned params if we do gradient clipping
40
+ scaler.unscale_(optimizer)
41
+ nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad_norm)
42
+ scaler.step(optimizer)
43
+ scaler.update()
44
+ else:
45
+ loss.backward()
46
+ if args.clip_grad_norm is not None:
47
+ nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad_norm)
48
+ optimizer.step()
49
+
50
+ if model_ema and i % args.model_ema_steps == 0:
51
+ model_ema.update_parameters(model)
52
+ if epoch < args.lr_warmup_epochs:
53
+ # Reset ema buffer to keep copying weights during warmup period
54
+ model_ema.n_averaged.fill_(0)
55
+
56
+ acc1, acc5 = utils.accuracy(output, target, topk=(1, 5))
57
+ batch_size = image.shape[0]
58
+ metric_logger.update(loss=loss.item(), lr=optimizer.param_groups[0]["lr"])
59
+ metric_logger.meters["acc1"].update(acc1.item(), n=batch_size)
60
+ metric_logger.meters["acc5"].update(acc5.item(), n=batch_size)
61
+ metric_logger.meters["img/s"].update(batch_size / (time.time() - start_time))
62
+
63
+
64
+ def evaluate(model, criterion, data_loader, device, print_freq=100, log_suffix=""):
65
+ model.eval()
66
+ metric_logger = utils.MetricLogger(delimiter=" ")
67
+ header = f"Test: {log_suffix}"
68
+
69
+ num_processed_samples = 0
70
+ with torch.inference_mode():
71
+ for image, target in metric_logger.log_every(data_loader, print_freq, header):
72
+ image = image.to(device, non_blocking=True)
73
+ target = target.to(device, non_blocking=True)
74
+ output = model(image)
75
+ loss = criterion(output, target)
76
+
77
+ acc1, acc5 = utils.accuracy(output, target, topk=(1, 5))
78
+ # FIXME need to take into account that the datasets
79
+ # could have been padded in distributed setup
80
+ batch_size = image.shape[0]
81
+ metric_logger.update(loss=loss.item())
82
+ metric_logger.meters["acc1"].update(acc1.item(), n=batch_size)
83
+ metric_logger.meters["acc5"].update(acc5.item(), n=batch_size)
84
+ num_processed_samples += batch_size
85
+ # gather the stats from all processes
86
+
87
+ num_processed_samples = utils.reduce_across_processes(num_processed_samples)
88
+ if (
89
+ hasattr(data_loader.dataset, "__len__")
90
+ and len(data_loader.dataset) != num_processed_samples
91
+ and torch.distributed.get_rank() == 0
92
+ ):
93
+ # See FIXME above
94
+ warnings.warn(
95
+ f"It looks like the dataset has {len(data_loader.dataset)} samples, but {num_processed_samples} "
96
+ "samples were used for the validation, which might bias the results. "
97
+ "Try adjusting the batch size and / or the world size. "
98
+ "Setting the world size to 1 is always a safe bet."
99
+ )
100
+
101
+ metric_logger.synchronize_between_processes()
102
+
103
+ print(f"{header} Acc@1 {metric_logger.acc1.global_avg:.3f} Acc@5 {metric_logger.acc5.global_avg:.3f}")
104
+ return metric_logger.acc1.global_avg
105
+
106
+
107
+ def _get_cache_path(filepath):
108
+ import hashlib
109
+
110
+ h = hashlib.sha1(filepath.encode()).hexdigest()
111
+ cache_path = os.path.join("~", ".torch", "vision", "datasets", "imagefolder", h[:10] + ".pt")
112
+ cache_path = os.path.expanduser(cache_path)
113
+ return cache_path
114
+
115
+
116
+ def load_data(traindir, valdir, args):
117
+ # Data loading code
118
+ print("Loading data")
119
+ val_resize_size, val_crop_size, train_crop_size = (
120
+ args.val_resize_size,
121
+ args.val_crop_size,
122
+ args.train_crop_size,
123
+ )
124
+ interpolation = InterpolationMode(args.interpolation)
125
+
126
+ print("Loading training data")
127
+ st = time.time()
128
+ cache_path = _get_cache_path(traindir)
129
+ if args.cache_dataset and os.path.exists(cache_path):
130
+ # Attention, as the transforms are also cached!
131
+ print(f"Loading dataset_train from {cache_path}")
132
+ dataset, _ = torch.load(cache_path)
133
+ else:
134
+ auto_augment_policy = getattr(args, "auto_augment", None)
135
+ random_erase_prob = getattr(args, "random_erase", 0.0)
136
+ ra_magnitude = args.ra_magnitude
137
+ augmix_severity = args.augmix_severity
138
+ dataset = torchvision.datasets.ImageFolder(
139
+ traindir,
140
+ presets.ClassificationPresetTrain(
141
+ crop_size=train_crop_size,
142
+ interpolation=interpolation,
143
+ auto_augment_policy=auto_augment_policy,
144
+ random_erase_prob=random_erase_prob,
145
+ ra_magnitude=ra_magnitude,
146
+ augmix_severity=augmix_severity,
147
+ ),
148
+ )
149
+ if args.cache_dataset:
150
+ print(f"Saving dataset_train to {cache_path}")
151
+ utils.mkdir(os.path.dirname(cache_path))
152
+ utils.save_on_master((dataset, traindir), cache_path)
153
+ print("Took", time.time() - st)
154
+
155
+ print("Loading validation data")
156
+ cache_path = _get_cache_path(valdir)
157
+ if args.cache_dataset and os.path.exists(cache_path):
158
+ # Attention, as the transforms are also cached!
159
+ print(f"Loading dataset_test from {cache_path}")
160
+ dataset_test, _ = torch.load(cache_path)
161
+ else:
162
+ if args.weights and args.test_only:
163
+ weights = torchvision.models.get_weight(args.weights)
164
+ preprocessing = weights.transforms()
165
+ else:
166
+ preprocessing = presets.ClassificationPresetEval(
167
+ crop_size=val_crop_size, resize_size=val_resize_size, interpolation=interpolation
168
+ )
169
+
170
+ dataset_test = torchvision.datasets.ImageFolder(
171
+ valdir,
172
+ preprocessing,
173
+ )
174
+ if args.cache_dataset:
175
+ print(f"Saving dataset_test to {cache_path}")
176
+ utils.mkdir(os.path.dirname(cache_path))
177
+ utils.save_on_master((dataset_test, valdir), cache_path)
178
+
179
+ print("Creating data loaders")
180
+ if args.distributed:
181
+ if hasattr(args, "ra_sampler") and args.ra_sampler:
182
+ train_sampler = RASampler(dataset, shuffle=True, repetitions=args.ra_reps)
183
+ else:
184
+ train_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
185
+ test_sampler = torch.utils.data.distributed.DistributedSampler(dataset_test, shuffle=False)
186
+ else:
187
+ train_sampler = torch.utils.data.RandomSampler(dataset)
188
+ test_sampler = torch.utils.data.SequentialSampler(dataset_test)
189
+
190
+ return dataset, dataset_test, train_sampler, test_sampler
191
+
192
+
193
+ def main(args):
194
+ if args.output_dir:
195
+ utils.mkdir(args.output_dir)
196
+
197
+ utils.init_distributed_mode(args)
198
+ print(args)
199
+
200
+ device = torch.device(args.device)
201
+
202
+ if args.use_deterministic_algorithms:
203
+ torch.backends.cudnn.benchmark = False
204
+ torch.use_deterministic_algorithms(True)
205
+ else:
206
+ torch.backends.cudnn.benchmark = True
207
+
208
+ train_dir = os.path.join(args.data_path, "train")
209
+ val_dir = os.path.join(args.data_path, "val")
210
+ dataset, dataset_test, train_sampler, test_sampler = load_data(train_dir, val_dir, args)
211
+
212
+ collate_fn = None
213
+ num_classes = len(dataset.classes)
214
+ mixup_transforms = []
215
+ if args.mixup_alpha > 0.0:
216
+ mixup_transforms.append(transforms.RandomMixup(num_classes, p=1.0, alpha=args.mixup_alpha))
217
+ if args.cutmix_alpha > 0.0:
218
+ mixup_transforms.append(transforms.RandomCutmix(num_classes, p=1.0, alpha=args.cutmix_alpha))
219
+ if mixup_transforms:
220
+ mixupcutmix = torchvision.transforms.RandomChoice(mixup_transforms)
221
+
222
+ def collate_fn(batch):
223
+ return mixupcutmix(*default_collate(batch))
224
+
225
+ data_loader = torch.utils.data.DataLoader(
226
+ dataset,
227
+ batch_size=args.batch_size,
228
+ sampler=train_sampler,
229
+ num_workers=args.workers,
230
+ pin_memory=True,
231
+ collate_fn=collate_fn,
232
+ )
233
+ data_loader_test = torch.utils.data.DataLoader(
234
+ dataset_test, batch_size=8, sampler=test_sampler, num_workers=args.workers, pin_memory=True
235
+ )
236
+
237
+ print("Creating model")
238
+ model = torchvision.models.get_model(args.model, weights=args.weights, num_classes=num_classes)
239
+ if args.apply_trp:
240
+ model = apply_trp(model, args.trp_depths, args.trp_p, args.trp_lambdas, label_smoothing=args.label_smoothing)
241
+ model.to(device)
242
+
243
+ if args.distributed and args.sync_bn:
244
+ model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
245
+
246
+ criterion = nn.CrossEntropyLoss(label_smoothing=args.label_smoothing)
247
+
248
+ custom_keys_weight_decay = []
249
+ if args.bias_weight_decay is not None:
250
+ custom_keys_weight_decay.append(("bias", args.bias_weight_decay))
251
+ if args.transformer_embedding_decay is not None:
252
+ for key in ["class_token", "position_embedding", "relative_position_bias_table"]:
253
+ custom_keys_weight_decay.append((key, args.transformer_embedding_decay))
254
+ parameters = utils.set_weight_decay(
255
+ model,
256
+ args.weight_decay,
257
+ norm_weight_decay=args.norm_weight_decay,
258
+ custom_keys_weight_decay=custom_keys_weight_decay if len(custom_keys_weight_decay) > 0 else None,
259
+ )
260
+
261
+ opt_name = args.opt.lower()
262
+ if opt_name.startswith("sgd"):
263
+ optimizer = torch.optim.SGD(
264
+ parameters,
265
+ lr=args.lr,
266
+ momentum=args.momentum,
267
+ weight_decay=args.weight_decay,
268
+ nesterov="nesterov" in opt_name,
269
+ )
270
+ elif opt_name == "rmsprop":
271
+ optimizer = torch.optim.RMSprop(
272
+ parameters, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay, eps=0.0316, alpha=0.9
273
+ )
274
+ elif opt_name == "adamw":
275
+ optimizer = torch.optim.AdamW(parameters, lr=args.lr, weight_decay=args.weight_decay)
276
+ else:
277
+ raise RuntimeError(f"Invalid optimizer {args.opt}. Only SGD, RMSprop and AdamW are supported.")
278
+
279
+ scaler = torch.amp.GradScaler("cuda") if args.amp else None
280
+
281
+ args.lr_scheduler = args.lr_scheduler.lower()
282
+ if args.lr_scheduler == "steplr":
283
+ main_lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma)
284
+ elif args.lr_scheduler == "cosineannealinglr":
285
+ main_lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
286
+ optimizer, T_max=args.epochs - args.lr_warmup_epochs, eta_min=args.lr_min
287
+ )
288
+ elif args.lr_scheduler == "exponentiallr":
289
+ main_lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=args.lr_gamma)
290
+ else:
291
+ raise RuntimeError(
292
+ f"Invalid lr scheduler '{args.lr_scheduler}'. Only StepLR, CosineAnnealingLR and ExponentialLR "
293
+ "are supported."
294
+ )
295
+
296
+ if args.lr_warmup_epochs > 0:
297
+ if args.lr_warmup_method == "linear":
298
+ warmup_lr_scheduler = torch.optim.lr_scheduler.LinearLR(
299
+ optimizer, start_factor=args.lr_warmup_decay, total_iters=args.lr_warmup_epochs
300
+ )
301
+ elif args.lr_warmup_method == "constant":
302
+ warmup_lr_scheduler = torch.optim.lr_scheduler.ConstantLR(
303
+ optimizer, factor=args.lr_warmup_decay, total_iters=args.lr_warmup_epochs
304
+ )
305
+ else:
306
+ raise RuntimeError(
307
+ f"Invalid warmup lr method '{args.lr_warmup_method}'. Only linear and constant are supported."
308
+ )
309
+ lr_scheduler = torch.optim.lr_scheduler.SequentialLR(
310
+ optimizer, schedulers=[warmup_lr_scheduler, main_lr_scheduler], milestones=[args.lr_warmup_epochs]
311
+ )
312
+ else:
313
+ lr_scheduler = main_lr_scheduler
314
+
315
+ model_without_ddp = model
316
+ if args.distributed:
317
+ model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
318
+ model_without_ddp = model.module
319
+
320
+ model_ema = None
321
+ if args.model_ema:
322
+ # Decay adjustment that aims to keep the decay independent from other hyper-parameters originally proposed at:
323
+ # https://github.com/facebookresearch/pycls/blob/f8cd9627/pycls/core/net.py#L123
324
+ #
325
+ # total_ema_updates = (Dataset_size / n_GPUs) * epochs / (batch_size_per_gpu * EMA_steps)
326
+ # We consider constant = Dataset_size for a given dataset/setup and ommit it. Thus:
327
+ # adjust = 1 / total_ema_updates ~= n_GPUs * batch_size_per_gpu * EMA_steps / epochs
328
+ adjust = args.world_size * args.batch_size * args.model_ema_steps / args.epochs
329
+ alpha = 1.0 - args.model_ema_decay
330
+ alpha = min(1.0, alpha * adjust)
331
+ model_ema = utils.ExponentialMovingAverage(model_without_ddp, device=device, decay=1.0 - alpha)
332
+
333
+ if args.resume:
334
+ checkpoint = torch.load(args.resume, map_location="cpu", weights_only=False)
335
+ model_without_ddp.load_state_dict(checkpoint["model"])
336
+ if not args.test_only:
337
+ optimizer.load_state_dict(checkpoint["optimizer"])
338
+ lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
339
+ args.start_epoch = checkpoint["epoch"] + 1
340
+ if model_ema:
341
+ model_ema.load_state_dict(checkpoint["model_ema"])
342
+ if scaler:
343
+ scaler.load_state_dict(checkpoint["scaler"])
344
+
345
+ if args.test_only:
346
+ # We disable the cudnn benchmarking because it can noticeably affect the accuracy
347
+ torch.backends.cudnn.benchmark = False
348
+ torch.backends.cudnn.deterministic = True
349
+ if model_ema:
350
+ evaluate(model_ema, criterion, data_loader_test, device=device, log_suffix="EMA")
351
+ else:
352
+ evaluate(model, criterion, data_loader_test, device=device)
353
+ return
354
+
355
+ print("Start training")
356
+ start_time = time.time()
357
+ for epoch in range(args.start_epoch, args.epochs):
358
+ if args.distributed:
359
+ train_sampler.set_epoch(epoch)
360
+ train_one_epoch(model, criterion, optimizer, data_loader, device, epoch, args, model_ema, scaler)
361
+ lr_scheduler.step()
362
+ evaluate(model, criterion, data_loader_test, device=device)
363
+ if model_ema:
364
+ evaluate(model_ema, criterion, data_loader_test, device=device, log_suffix="EMA")
365
+ if args.output_dir:
366
+ checkpoint = {
367
+ "model": model_without_ddp.state_dict() if not args.apply_trp else {k: v for k, v in model_without_ddp.state_dict().items() if not k.startswith("trp_blocks")}, # NOTE: remove TRP heads
368
+ "optimizer": optimizer.state_dict(),
369
+ "lr_scheduler": lr_scheduler.state_dict(),
370
+ "epoch": epoch,
371
+ "args": args,
372
+ }
373
+ if model_ema:
374
+ checkpoint["model_ema"] = model_ema.state_dict() if not args.apply_trp else {k: v for k, v in model_ema.state_dict().items() if not k.startswith("trp_blocks")} # NOTE: remove TRP heads
375
+ if scaler:
376
+ checkpoint["scaler"] = scaler.state_dict()
377
+ utils.save_on_master(checkpoint, os.path.join(args.output_dir, f"model_{epoch}.pth"))
378
+ utils.save_on_master(checkpoint, os.path.join(args.output_dir, "checkpoint.pth"))
379
+
380
+ total_time = time.time() - start_time
381
+ total_time_str = str(datetime.timedelta(seconds=int(total_time)))
382
+ print(f"Training time {total_time_str}")
383
+
384
+
385
+ def get_args_parser(add_help=True):
386
+ import argparse
387
+
388
+ parser = argparse.ArgumentParser(description="PyTorch Classification Training", add_help=add_help)
389
+
390
+ parser.add_argument("--data-path", default="/datasets01/imagenet_full_size/061417/", type=str, help="dataset path")
391
+ parser.add_argument("--model", default="resnet18", type=str, help="model name")
392
+ parser.add_argument("--device", default="cuda", type=str, help="device (Use cuda or cpu Default: cuda)")
393
+ parser.add_argument(
394
+ "-b", "--batch-size", default=32, type=int, help="images per gpu, the total batch size is $NGPU x batch_size"
395
+ )
396
+ parser.add_argument("--epochs", default=90, type=int, metavar="N", help="number of total epochs to run")
397
+ parser.add_argument(
398
+ "-j", "--workers", default=16, type=int, metavar="N", help="number of data loading workers (default: 16)"
399
+ )
400
+ parser.add_argument("--opt", default="sgd", type=str, help="optimizer")
401
+ parser.add_argument("--lr", default=0.1, type=float, help="initial learning rate")
402
+ parser.add_argument("--momentum", default=0.9, type=float, metavar="M", help="momentum")
403
+ parser.add_argument(
404
+ "--wd",
405
+ "--weight-decay",
406
+ default=1e-4,
407
+ type=float,
408
+ metavar="W",
409
+ help="weight decay (default: 1e-4)",
410
+ dest="weight_decay",
411
+ )
412
+ parser.add_argument(
413
+ "--norm-weight-decay",
414
+ default=None,
415
+ type=float,
416
+ help="weight decay for Normalization layers (default: None, same value as --wd)",
417
+ )
418
+ parser.add_argument(
419
+ "--bias-weight-decay",
420
+ default=None,
421
+ type=float,
422
+ help="weight decay for bias parameters of all layers (default: None, same value as --wd)",
423
+ )
424
+ parser.add_argument(
425
+ "--transformer-embedding-decay",
426
+ default=None,
427
+ type=float,
428
+ help="weight decay for embedding parameters for vision transformer models (default: None, same value as --wd)",
429
+ )
430
+ parser.add_argument(
431
+ "--label-smoothing", default=0.0, type=float, help="label smoothing (default: 0.0)", dest="label_smoothing"
432
+ )
433
+ parser.add_argument("--mixup-alpha", default=0.0, type=float, help="mixup alpha (default: 0.0)")
434
+ parser.add_argument("--cutmix-alpha", default=0.0, type=float, help="cutmix alpha (default: 0.0)")
435
+ parser.add_argument("--lr-scheduler", default="steplr", type=str, help="the lr scheduler (default: steplr)")
436
+ parser.add_argument("--lr-warmup-epochs", default=0, type=int, help="the number of epochs to warmup (default: 0)")
437
+ parser.add_argument(
438
+ "--lr-warmup-method", default="constant", type=str, help="the warmup method (default: constant)"
439
+ )
440
+ parser.add_argument("--lr-warmup-decay", default=0.01, type=float, help="the decay for lr")
441
+ parser.add_argument("--lr-step-size", default=30, type=int, help="decrease lr every step-size epochs")
442
+ parser.add_argument("--lr-gamma", default=0.1, type=float, help="decrease lr by a factor of lr-gamma")
443
+ parser.add_argument("--lr-min", default=0.0, type=float, help="minimum lr of lr schedule (default: 0.0)")
444
+ parser.add_argument("--print-freq", default=10, type=int, help="print frequency")
445
+ parser.add_argument("--output-dir", default=".", type=str, help="path to save outputs")
446
+ parser.add_argument("--resume", default="", type=str, help="path of checkpoint")
447
+ parser.add_argument("--start-epoch", default=0, type=int, metavar="N", help="start epoch")
448
+ parser.add_argument(
449
+ "--cache-dataset",
450
+ dest="cache_dataset",
451
+ help="Cache the datasets for quicker initialization. It also serializes the transforms",
452
+ action="store_true",
453
+ )
454
+ parser.add_argument(
455
+ "--sync-bn",
456
+ dest="sync_bn",
457
+ help="Use sync batch norm",
458
+ action="store_true",
459
+ )
460
+ parser.add_argument(
461
+ "--test-only",
462
+ dest="test_only",
463
+ help="Only test the model",
464
+ action="store_true",
465
+ )
466
+ parser.add_argument("--auto-augment", default=None, type=str, help="auto augment policy (default: None)")
467
+ parser.add_argument("--ra-magnitude", default=9, type=int, help="magnitude of auto augment policy")
468
+ parser.add_argument("--augmix-severity", default=3, type=int, help="severity of augmix policy")
469
+ parser.add_argument("--random-erase", default=0.0, type=float, help="random erasing probability (default: 0.0)")
470
+
471
+ # Mixed precision training parameters
472
+ parser.add_argument("--amp", action="store_true", help="Use torch.cuda.amp for mixed precision training")
473
+
474
+ # distributed training parameters
475
+ parser.add_argument("--world-size", default=1, type=int, help="number of distributed processes")
476
+ parser.add_argument("--dist-url", default="env://", type=str, help="url used to set up distributed training")
477
+ parser.add_argument(
478
+ "--model-ema", action="store_true", help="enable tracking Exponential Moving Average of model parameters"
479
+ )
480
+ parser.add_argument(
481
+ "--model-ema-steps",
482
+ type=int,
483
+ default=32,
484
+ help="the number of iterations that controls how often to update the EMA model (default: 32)",
485
+ )
486
+ parser.add_argument(
487
+ "--model-ema-decay",
488
+ type=float,
489
+ default=0.99998,
490
+ help="decay factor for Exponential Moving Average of model parameters (default: 0.99998)",
491
+ )
492
+ parser.add_argument(
493
+ "--use-deterministic-algorithms", action="store_true", help="Forces the use of deterministic algorithms only."
494
+ )
495
+ parser.add_argument(
496
+ "--interpolation", default="bilinear", type=str, help="the interpolation method (default: bilinear)"
497
+ )
498
+ parser.add_argument(
499
+ "--val-resize-size", default=256, type=int, help="the resize size used for validation (default: 256)"
500
+ )
501
+ parser.add_argument(
502
+ "--val-crop-size", default=224, type=int, help="the central crop size used for validation (default: 224)"
503
+ )
504
+ parser.add_argument(
505
+ "--train-crop-size", default=224, type=int, help="the random crop size used for training (default: 224)"
506
+ )
507
+ parser.add_argument("--clip-grad-norm", default=None, type=float, help="the maximum gradient norm (default None)")
508
+ parser.add_argument("--ra-sampler", action="store_true", help="whether to use Repeated Augmentation in training")
509
+ parser.add_argument(
510
+ "--ra-reps", default=3, type=int, help="number of repetitions for Repeated Augmentation (default: 3)"
511
+ )
512
+ parser.add_argument("--weights", default=None, type=str, help="the weights enum name to load")
513
+
514
+ parser.add_argument("--apply-trp", action="store_true", help="enable applying trp")
515
+ parser.add_argument("--trp-depths", type=int, help="trp depth")
516
+ parser.add_argument("--trp-p", type=float, help="trp p")
517
+ parser.add_argument("--trp-lambdas", nargs="+", type=float, help="trp lambdas")
518
+
519
+ return parser
520
+
521
+
522
+ if __name__ == "__main__":
523
+ args = get_args_parser().parse_args()
524
+ main(args)
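The EMA decay adjustment in main() is easier to follow with numbers plugged in. A sketch using the parser defaults (--model-ema-steps 32, --model-ema-decay 0.99998, --batch-size 32, --epochs 90) and an assumed world size of 4:

# adjust ~= n_GPUs * batch_size_per_gpu * EMA_steps / epochs
world_size, batch_size, ema_steps, epochs = 4, 32, 32, 90
model_ema_decay = 0.99998

adjust = world_size * batch_size * ema_steps / epochs    # ~45.5
alpha = min(1.0, (1.0 - model_ema_decay) * adjust)       # ~0.00091
effective_decay = 1.0 - alpha                            # ~0.99909, passed to ExponentialMovingAverage
print(f"adjust={adjust:.1f}, effective EMA decay={effective_decay:.5f}")
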
hpo-examples/image-classification/train_quantization.py ADDED
@@ -0,0 +1,265 @@
1
+ import copy
2
+ import datetime
3
+ import os
4
+ import time
5
+
6
+ import torch
7
+ import torch.ao.quantization
8
+ import torch.utils.data
9
+ import torchvision
10
+ import utils
11
+ from torch import nn
12
+ from train import evaluate, load_data, train_one_epoch
13
+
14
+
15
+ def main(args):
16
+ if args.output_dir:
17
+ utils.mkdir(args.output_dir)
18
+
19
+ utils.init_distributed_mode(args)
20
+ print(args)
21
+
22
+ if args.post_training_quantize and args.distributed:
23
+ raise RuntimeError("Post training quantization example should not be performed on distributed mode")
24
+
25
+ # Set backend engine to ensure that quantized model runs on the correct kernels
26
+ if args.backend not in torch.backends.quantized.supported_engines:
27
+ raise RuntimeError("Quantized backend not supported: " + str(args.backend))
28
+ torch.backends.quantized.engine = args.backend
29
+
30
+ device = torch.device(args.device)
31
+ torch.backends.cudnn.benchmark = True
32
+
33
+ # Data loading code
34
+ print("Loading data")
35
+ train_dir = os.path.join(args.data_path, "train")
36
+ val_dir = os.path.join(args.data_path, "val")
37
+
38
+ dataset, dataset_test, train_sampler, test_sampler = load_data(train_dir, val_dir, args)
39
+ data_loader = torch.utils.data.DataLoader(
40
+ dataset, batch_size=args.batch_size, sampler=train_sampler, num_workers=args.workers, pin_memory=True
41
+ )
42
+
43
+ data_loader_test = torch.utils.data.DataLoader(
44
+ dataset_test, batch_size=args.eval_batch_size, sampler=test_sampler, num_workers=args.workers, pin_memory=True
45
+ )
46
+
47
+ print("Creating model", args.model)
48
+ # when training quantized models, we always start from a pre-trained fp32 reference model
49
+ prefix = "quantized_"
50
+ model_name = args.model
51
+ if not model_name.startswith(prefix):
52
+ model_name = prefix + model_name
53
+ model = torchvision.models.get_model(model_name, weights=args.weights, quantize=args.test_only)
54
+ model.to(device)
55
+
56
+ if not (args.test_only or args.post_training_quantize):
57
+ model.fuse_model(is_qat=True)
58
+ model.qconfig = torch.ao.quantization.get_default_qat_qconfig(args.backend)
59
+ torch.ao.quantization.prepare_qat(model, inplace=True)
60
+
61
+ if args.distributed and args.sync_bn:
62
+ model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
63
+
64
+ optimizer = torch.optim.SGD(
65
+ model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay
66
+ )
67
+
68
+ lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma)
69
+
70
+ criterion = nn.CrossEntropyLoss()
71
+ model_without_ddp = model
72
+ if args.distributed:
73
+ model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
74
+ model_without_ddp = model.module
75
+
76
+ if args.resume:
77
+ checkpoint = torch.load(args.resume, map_location="cpu")
78
+ model_without_ddp.load_state_dict(checkpoint["model"])
79
+ optimizer.load_state_dict(checkpoint["optimizer"])
80
+ lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
81
+ args.start_epoch = checkpoint["epoch"] + 1
82
+
83
+ if args.post_training_quantize:
84
+ # perform calibration on a subset of the training dataset
85
+ # for that, create a subset of the training dataset
86
+ ds = torch.utils.data.Subset(dataset, indices=list(range(args.batch_size * args.num_calibration_batches)))
87
+ data_loader_calibration = torch.utils.data.DataLoader(
88
+ ds, batch_size=args.batch_size, shuffle=False, num_workers=args.workers, pin_memory=True
89
+ )
90
+ model.eval()
91
+ model.fuse_model(is_qat=False)
92
+ model.qconfig = torch.ao.quantization.get_default_qconfig(args.backend)
93
+ torch.ao.quantization.prepare(model, inplace=True)
94
+ # Calibrate first
95
+ print("Calibrating")
96
+ evaluate(model, criterion, data_loader_calibration, device=device, print_freq=1)
97
+ torch.ao.quantization.convert(model, inplace=True)
98
+ if args.output_dir:
99
+ print("Saving quantized model")
100
+ if utils.is_main_process():
101
+ torch.save(model.state_dict(), os.path.join(args.output_dir, "quantized_post_train_model.pth"))
102
+ print("Evaluating post-training quantized model")
103
+ evaluate(model, criterion, data_loader_test, device=device)
104
+ return
105
+
106
+ if args.test_only:
107
+ evaluate(model, criterion, data_loader_test, device=device)
108
+ return
109
+
110
+ model.apply(torch.ao.quantization.enable_observer)
111
+ model.apply(torch.ao.quantization.enable_fake_quant)
112
+ start_time = time.time()
113
+ for epoch in range(args.start_epoch, args.epochs):
114
+ if args.distributed:
115
+ train_sampler.set_epoch(epoch)
116
+ print("Starting training for epoch", epoch)
117
+ train_one_epoch(model, criterion, optimizer, data_loader, device, epoch, args)
118
+ lr_scheduler.step()
119
+ with torch.inference_mode():
120
+ if epoch >= args.num_observer_update_epochs:
121
+ print("Disabling observer for subseq epochs, epoch = ", epoch)
122
+ model.apply(torch.ao.quantization.disable_observer)
123
+ if epoch >= args.num_batch_norm_update_epochs:
124
+ print("Freezing BN for subseq epochs, epoch = ", epoch)
125
+ model.apply(torch.nn.intrinsic.qat.freeze_bn_stats)
126
+ print("Evaluate QAT model")
127
+
128
+ evaluate(model, criterion, data_loader_test, device=device, log_suffix="QAT")
129
+ quantized_eval_model = copy.deepcopy(model_without_ddp)
130
+ quantized_eval_model.eval()
131
+ quantized_eval_model.to(torch.device("cpu"))
132
+ torch.ao.quantization.convert(quantized_eval_model, inplace=True)
133
+
134
+ print("Evaluate Quantized model")
135
+ evaluate(quantized_eval_model, criterion, data_loader_test, device=torch.device("cpu"))
136
+
137
+ model.train()
138
+
139
+ if args.output_dir:
140
+ checkpoint = {
141
+ "model": model_without_ddp.state_dict(),
142
+ "eval_model": quantized_eval_model.state_dict(),
143
+ "optimizer": optimizer.state_dict(),
144
+ "lr_scheduler": lr_scheduler.state_dict(),
145
+ "epoch": epoch,
146
+ "args": args,
147
+ }
148
+ utils.save_on_master(checkpoint, os.path.join(args.output_dir, f"model_{epoch}.pth"))
149
+ utils.save_on_master(checkpoint, os.path.join(args.output_dir, "checkpoint.pth"))
150
+ print("Saving models after epoch ", epoch)
151
+
152
+ total_time = time.time() - start_time
153
+ total_time_str = str(datetime.timedelta(seconds=int(total_time)))
154
+ print(f"Training time {total_time_str}")
155
+
156
+
157
+ def get_args_parser(add_help=True):
158
+ import argparse
159
+
160
+ parser = argparse.ArgumentParser(description="PyTorch Quantized Classification Training", add_help=add_help)
161
+
162
+ parser.add_argument("--data-path", default="/datasets01/imagenet_full_size/061417/", type=str, help="dataset path")
163
+ parser.add_argument("--model", default="mobilenet_v2", type=str, help="model name")
164
+ parser.add_argument("--backend", default="qnnpack", type=str, help="fbgemm or qnnpack")
165
+ parser.add_argument("--device", default="cuda", type=str, help="device (Use cuda or cpu Default: cuda)")
166
+
167
+ parser.add_argument(
168
+ "-b", "--batch-size", default=32, type=int, help="images per gpu, the total batch size is $NGPU x batch_size"
169
+ )
170
+ parser.add_argument("--eval-batch-size", default=128, type=int, help="batch size for evaluation")
171
+ parser.add_argument("--epochs", default=90, type=int, metavar="N", help="number of total epochs to run")
172
+ parser.add_argument(
173
+ "--num-observer-update-epochs",
174
+ default=4,
175
+ type=int,
176
+ metavar="N",
177
+ help="number of total epochs to update observers",
178
+ )
179
+ parser.add_argument(
180
+ "--num-batch-norm-update-epochs",
181
+ default=3,
182
+ type=int,
183
+ metavar="N",
184
+ help="number of total epochs to update batch norm stats",
185
+ )
186
+ parser.add_argument(
187
+ "--num-calibration-batches",
188
+ default=32,
189
+ type=int,
190
+ metavar="N",
191
+ help="number of batches of training set for \
192
+ observer calibration ",
193
+ )
194
+
195
+ parser.add_argument(
196
+ "-j", "--workers", default=16, type=int, metavar="N", help="number of data loading workers (default: 16)"
197
+ )
198
+ parser.add_argument("--lr", default=0.0001, type=float, help="initial learning rate")
199
+ parser.add_argument("--momentum", default=0.9, type=float, metavar="M", help="momentum")
200
+ parser.add_argument(
201
+ "--wd",
202
+ "--weight-decay",
203
+ default=1e-4,
204
+ type=float,
205
+ metavar="W",
206
+ help="weight decay (default: 1e-4)",
207
+ dest="weight_decay",
208
+ )
209
+ parser.add_argument("--lr-step-size", default=30, type=int, help="decrease lr every step-size epochs")
210
+ parser.add_argument("--lr-gamma", default=0.1, type=float, help="decrease lr by a factor of lr-gamma")
211
+ parser.add_argument("--print-freq", default=10, type=int, help="print frequency")
212
+ parser.add_argument("--output-dir", default=".", type=str, help="path to save outputs")
213
+ parser.add_argument("--resume", default="", type=str, help="path of checkpoint")
214
+ parser.add_argument("--start-epoch", default=0, type=int, metavar="N", help="start epoch")
215
+ parser.add_argument(
216
+ "--cache-dataset",
217
+ dest="cache_dataset",
218
+ help="Cache the datasets for quicker initialization. \
219
+ It also serializes the transforms",
220
+ action="store_true",
221
+ )
222
+ parser.add_argument(
223
+ "--sync-bn",
224
+ dest="sync_bn",
225
+ help="Use sync batch norm",
226
+ action="store_true",
227
+ )
228
+ parser.add_argument(
229
+ "--test-only",
230
+ dest="test_only",
231
+ help="Only test the model",
232
+ action="store_true",
233
+ )
234
+ parser.add_argument(
235
+ "--post-training-quantize",
236
+ dest="post_training_quantize",
237
+ help="Post training quantize the model",
238
+ action="store_true",
239
+ )
240
+
241
+ # distributed training parameters
242
+ parser.add_argument("--world-size", default=1, type=int, help="number of distributed processes")
243
+ parser.add_argument("--dist-url", default="env://", type=str, help="url used to set up distributed training")
244
+
245
+ parser.add_argument(
246
+ "--interpolation", default="bilinear", type=str, help="the interpolation method (default: bilinear)"
247
+ )
248
+ parser.add_argument(
249
+ "--val-resize-size", default=256, type=int, help="the resize size used for validation (default: 256)"
250
+ )
251
+ parser.add_argument(
252
+ "--val-crop-size", default=224, type=int, help="the central crop size used for validation (default: 224)"
253
+ )
254
+ parser.add_argument(
255
+ "--train-crop-size", default=224, type=int, help="the random crop size used for training (default: 224)"
256
+ )
257
+ parser.add_argument("--clip-grad-norm", default=None, type=float, help="the maximum gradient norm (default None)")
258
+ parser.add_argument("--weights", default=None, type=str, help="the weights enum name to load")
259
+
260
+ return parser
261
+
262
+
263
+ if __name__ == "__main__":
264
+ args = get_args_parser().parse_args()
265
+ main(args)
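
For reference, the post-training-quantization branch above (fuse, set qconfig, prepare, calibrate, convert) can be exercised in isolation. The snippet below is a minimal sketch of that eager-mode flow using a stock quantizable torchvision model and random calibration batches; the model choice, backend, and shapes are illustrative assumptions, not values from this script.

```python
# Minimal eager-mode post-training static quantization sketch (illustrative only).
import torch
import torchvision

model = torchvision.models.quantization.mobilenet_v2(weights=None, quantize=False)
model.eval()
model.fuse_model(is_qat=False)                       # fuse Conv+BN+ReLU modules
model.qconfig = torch.ao.quantization.get_default_qconfig("fbgemm")
torch.ao.quantization.prepare(model, inplace=True)   # attach observers

with torch.inference_mode():                         # calibration pass on sample data
    for _ in range(8):
        model(torch.randn(8, 3, 224, 224))

torch.ao.quantization.convert(model, inplace=True)   # swap in quantized modules
print(model(torch.randn(1, 3, 224, 224)).shape)      # int8 inference on CPU
```

The script above wraps the same calls behind `--post-training-quantize`, calibrating on a subset of the training set instead of random data.
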
hpo-examples/image-classification/transforms.py ADDED
@@ -0,0 +1,183 @@
1
+ import math
2
+ from typing import Tuple
3
+
4
+ import torch
5
+ from torch import Tensor
6
+ from torchvision.transforms import functional as F
7
+
8
+
9
+ class RandomMixup(torch.nn.Module):
10
+ """Randomly apply Mixup to the provided batch and targets.
11
+ The class implements the data augmentations as described in the paper
12
+ `"mixup: Beyond Empirical Risk Minimization" <https://arxiv.org/abs/1710.09412>`_.
13
+
14
+ Args:
15
+ num_classes (int): number of classes used for one-hot encoding.
16
+ p (float): probability of the batch being transformed. Default value is 0.5.
17
+ alpha (float): hyperparameter of the Beta distribution used for mixup.
18
+ Default value is 1.0.
19
+ inplace (bool): boolean to make this transform inplace. Default set to False.
20
+ """
21
+
22
+ def __init__(self, num_classes: int, p: float = 0.5, alpha: float = 1.0, inplace: bool = False) -> None:
23
+ super().__init__()
24
+
25
+ if num_classes < 1:
26
+ raise ValueError(
27
+ f"Please provide a valid positive value for the num_classes. Got num_classes={num_classes}"
28
+ )
29
+
30
+ if alpha <= 0:
31
+ raise ValueError("Alpha param can't be zero.")
32
+
33
+ self.num_classes = num_classes
34
+ self.p = p
35
+ self.alpha = alpha
36
+ self.inplace = inplace
37
+
38
+ def forward(self, batch: Tensor, target: Tensor) -> Tuple[Tensor, Tensor]:
39
+ """
40
+ Args:
41
+ batch (Tensor): Float tensor of size (B, C, H, W)
42
+ target (Tensor): Integer tensor of size (B, )
43
+
44
+ Returns:
45
+ Tensor: Randomly transformed batch.
46
+ """
47
+ if batch.ndim != 4:
48
+ raise ValueError(f"Batch ndim should be 4. Got {batch.ndim}")
49
+ if target.ndim != 1:
50
+ raise ValueError(f"Target ndim should be 1. Got {target.ndim}")
51
+ if not batch.is_floating_point():
52
+ raise TypeError(f"Batch dtype should be a float tensor. Got {batch.dtype}.")
53
+ if target.dtype != torch.int64:
54
+ raise TypeError(f"Target dtype should be torch.int64. Got {target.dtype}")
55
+
56
+ if not self.inplace:
57
+ batch = batch.clone()
58
+ target = target.clone()
59
+
60
+ if target.ndim == 1:
61
+ target = torch.nn.functional.one_hot(target, num_classes=self.num_classes).to(dtype=batch.dtype)
62
+
63
+ if torch.rand(1).item() >= self.p:
64
+ return batch, target
65
+
66
+ # It's faster to roll the batch by one instead of shuffling it to create image pairs
67
+ batch_rolled = batch.roll(1, 0)
68
+ target_rolled = target.roll(1, 0)
69
+
70
+ # Implemented as on mixup paper, page 3.
71
+ lambda_param = float(torch._sample_dirichlet(torch.tensor([self.alpha, self.alpha]))[0])
72
+ batch_rolled.mul_(1.0 - lambda_param)
73
+ batch.mul_(lambda_param).add_(batch_rolled)
74
+
75
+ target_rolled.mul_(1.0 - lambda_param)
76
+ target.mul_(lambda_param).add_(target_rolled)
77
+
78
+ return batch, target
79
+
80
+ def __repr__(self) -> str:
81
+ s = (
82
+ f"{self.__class__.__name__}("
83
+ f"num_classes={self.num_classes}"
84
+ f", p={self.p}"
85
+ f", alpha={self.alpha}"
86
+ f", inplace={self.inplace}"
87
+ f")"
88
+ )
89
+ return s
90
+
91
+
92
+ class RandomCutmix(torch.nn.Module):
93
+ """Randomly apply Cutmix to the provided batch and targets.
94
+ The class implements the data augmentations as described in the paper
95
+ `"CutMix: Regularization Strategy to Train Strong Classifiers with Localizable Features"
96
+ <https://arxiv.org/abs/1905.04899>`_.
97
+
98
+ Args:
99
+ num_classes (int): number of classes used for one-hot encoding.
100
+ p (float): probability of the batch being transformed. Default value is 0.5.
101
+ alpha (float): hyperparameter of the Beta distribution used for cutmix.
102
+ Default value is 1.0.
103
+ inplace (bool): boolean to make this transform inplace. Default set to False.
104
+ """
105
+
106
+ def __init__(self, num_classes: int, p: float = 0.5, alpha: float = 1.0, inplace: bool = False) -> None:
107
+ super().__init__()
108
+ if num_classes < 1:
109
+ raise ValueError("Please provide a valid positive value for the num_classes.")
110
+ if alpha <= 0:
111
+ raise ValueError("Alpha param can't be zero.")
112
+
113
+ self.num_classes = num_classes
114
+ self.p = p
115
+ self.alpha = alpha
116
+ self.inplace = inplace
117
+
118
+ def forward(self, batch: Tensor, target: Tensor) -> Tuple[Tensor, Tensor]:
119
+ """
120
+ Args:
121
+ batch (Tensor): Float tensor of size (B, C, H, W)
122
+ target (Tensor): Integer tensor of size (B, )
123
+
124
+ Returns:
125
+ Tensor: Randomly transformed batch.
126
+ """
127
+ if batch.ndim != 4:
128
+ raise ValueError(f"Batch ndim should be 4. Got {batch.ndim}")
129
+ if target.ndim != 1:
130
+ raise ValueError(f"Target ndim should be 1. Got {target.ndim}")
131
+ if not batch.is_floating_point():
132
+ raise TypeError(f"Batch dtype should be a float tensor. Got {batch.dtype}.")
133
+ if target.dtype != torch.int64:
134
+ raise TypeError(f"Target dtype should be torch.int64. Got {target.dtype}")
135
+
136
+ if not self.inplace:
137
+ batch = batch.clone()
138
+ target = target.clone()
139
+
140
+ if target.ndim == 1:
141
+ target = torch.nn.functional.one_hot(target, num_classes=self.num_classes).to(dtype=batch.dtype)
142
+
143
+ if torch.rand(1).item() >= self.p:
144
+ return batch, target
145
+
146
+ # It's faster to roll the batch by one instead of shuffling it to create image pairs
147
+ batch_rolled = batch.roll(1, 0)
148
+ target_rolled = target.roll(1, 0)
149
+
150
+ # Implemented as on cutmix paper, page 12 (with minor corrections on typos).
151
+ lambda_param = float(torch._sample_dirichlet(torch.tensor([self.alpha, self.alpha]))[0])
152
+ _, H, W = F.get_dimensions(batch)
153
+
154
+ r_x = torch.randint(W, (1,))
155
+ r_y = torch.randint(H, (1,))
156
+
157
+ r = 0.5 * math.sqrt(1.0 - lambda_param)
158
+ r_w_half = int(r * W)
159
+ r_h_half = int(r * H)
160
+
161
+ x1 = int(torch.clamp(r_x - r_w_half, min=0))
162
+ y1 = int(torch.clamp(r_y - r_h_half, min=0))
163
+ x2 = int(torch.clamp(r_x + r_w_half, max=W))
164
+ y2 = int(torch.clamp(r_y + r_h_half, max=H))
165
+
166
+ batch[:, :, y1:y2, x1:x2] = batch_rolled[:, :, y1:y2, x1:x2]
167
+ lambda_param = float(1.0 - (x2 - x1) * (y2 - y1) / (W * H))
168
+
169
+ target_rolled.mul_(1.0 - lambda_param)
170
+ target.mul_(lambda_param).add_(target_rolled)
171
+
172
+ return batch, target
173
+
174
+ def __repr__(self) -> str:
175
+ s = (
176
+ f"{self.__class__.__name__}("
177
+ f"num_classes={self.num_classes}"
178
+ f", p={self.p}"
179
+ f", alpha={self.alpha}"
180
+ f", inplace={self.inplace}"
181
+ f")"
182
+ )
183
+ return s
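
Because RandomMixup and RandomCutmix consume whole batches, they are typically applied in the DataLoader's `collate_fn` rather than per sample. The sketch below shows one way to wire them up; the dataset, class count, and hyperparameters are placeholders rather than values taken from this repository.

```python
# Hypothetical wiring of the batch-level transforms above into a DataLoader.
import torch
from torch.utils.data import DataLoader, TensorDataset
from torchvision.transforms import RandomChoice
from transforms import RandomMixup, RandomCutmix  # the module added in this commit

num_classes = 10  # placeholder
mixupcutmix = RandomChoice([
    RandomMixup(num_classes, p=1.0, alpha=0.2),
    RandomCutmix(num_classes, p=1.0, alpha=1.0),
])

def collate_fn(batch):
    # default_collate stacks samples into (images, targets); then mix the whole batch.
    return mixupcutmix(*torch.utils.data.default_collate(batch))

dataset = TensorDataset(torch.randn(64, 3, 32, 32), torch.randint(0, num_classes, (64,)))
loader = DataLoader(dataset, batch_size=16, collate_fn=collate_fn)
images, targets = next(iter(loader))
print(images.shape, targets.shape)  # targets become soft labels of shape [B, num_classes]
```

After mixing, the targets are soft one-hot vectors, so the training loss must accept probability targets (e.g. cross-entropy with soft labels).
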
hpo-examples/image-classification/trplib.py ADDED
@@ -0,0 +1,1181 @@
1
+ import torch
2
+ from torch import nn, Tensor
3
+ from torch.nn import functional as F
4
+
5
+ from torchvision.models.mobilenetv2 import MobileNetV2
6
+ from torchvision.models.resnet import ResNet
7
+ from torchvision.models.efficientnet import EfficientNet
8
+ from torchvision.models.vision_transformer import VisionTransformer
9
+ from torchvision.models.segmentation.fcn import FCN
10
+ from torchvision.models.segmentation.deeplabv3 import DeepLabV3
11
+
12
+ import transformers
13
+ from transformers.modeling_outputs import SequenceClassifierOutput, QuestionAnsweringModelOutput, CausalLMOutput, Seq2SeqLMOutput
14
+
15
+ from typing import Optional, Tuple, List, Union, Callable
16
+ from collections import OrderedDict
17
+ import types
18
+
19
+
20
+ def trp_criterion(trp_blocks: nn.ModuleList, shared_head: Callable, criterion: Callable, lambdas: List[float], hidden_states: Tensor, logits: Tensor, targets: Tensor, loss_normalization=False):
21
+ loss, mask = criterion(logits, targets)
22
+ if loss_normalization:
23
+ coeff = loss.detach()
24
+
25
+ embeds = [hidden_states]
26
+ predictions = []
27
+ for k, c in enumerate(lambdas):
28
+ embeds.append(trp_blocks[k](embeds[-1]))
29
+ predictions.append(shared_head(embeds[-1]))
30
+ replica_loss, mask = criterion(predictions[-1], targets, mask)
31
+ loss += c * replica_loss
32
+
33
+ if loss_normalization:
34
+ with torch.no_grad():
35
+ coeff = torch.exp(coeff) / torch.exp(loss.detach())
36
+ loss = coeff * loss
37
+
38
+ return loss
39
+
40
+
41
+ class TPBlock(nn.Module):
42
+ def __init__(self, depths: int, in_features: int, p: float, dim=-1):
43
+ super(TPBlock, self).__init__()
44
+
45
+ self.dropout = nn.Dropout(p)
46
+
47
+ self.cdim = dim
48
+
49
+ blocks = []
50
+ for _ in range(depths):
51
+ blocks.append(nn.Linear(in_features, in_features))
52
+ nn.init.constant_(blocks[-1].weight, 0.0)
53
+ nn.init.constant_(blocks[-1].bias, 0.0)
54
+ blocks.append(nn.ReLU())
55
+ self.blocks = nn.Sequential(*blocks)
56
+
57
+ def forward(self, x):
58
+ x = self.dropout(x)
59
+ if self.cdim == -1:
60
+ x = x + self.blocks(x)
61
+ else:
62
+ x = x + torch.movedim(self.blocks(torch.movedim(x, self.cdim, -1)), -1, self.cdim)
63
+ return x
64
+
65
+
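
Together, `trp_criterion` and `TPBlock` implement the TRP objective: each `TPBlock` is a dropout followed by a zero-initialized residual MLP (so it starts as an identity map), and the criterion adds one replica loss per block through the shared task head, giving roughly `L = L_0 + sum_k lambda_k * L_k`, optionally rescaled by `exp(L_0) / exp(L)` when `loss_normalization` is set; the running mask restricts each replica loss to samples the previous prediction already classified correctly. The toy example below only illustrates the call pattern, assuming the definitions above are in scope; the linear head, masked cross-entropy, dimensions, and lambdas are made-up stand-ins.

```python
# Illustrative-only use of TPBlock + trp_criterion with a toy linear classifier.
import torch
import torch.nn.functional as F

hidden, num_classes, lambdas = 16, 4, [0.5, 0.25]
head = torch.nn.Linear(hidden, num_classes)
trp_blocks = torch.nn.ModuleList([TPBlock(depths=1, in_features=hidden, p=0.1) for _ in lambdas])

def criterion(logits, target, mask=None):
    # Masked cross-entropy returning (loss, updated mask), as trp_criterion expects.
    if mask is None:
        mask = torch.ones_like(target, dtype=torch.float32)
    per_sample = F.cross_entropy(logits, target, reduction="none")
    loss = (mask * per_sample).sum() / (mask.sum() + 1e-6)
    with torch.no_grad():
        mask = mask * (logits.argmax(dim=1) == target).float()
    return loss, mask

x = torch.randn(8, hidden)                    # stand-in for pooled hidden states
y = torch.randint(0, num_classes, (8,))
logits = head(x)
loss = trp_criterion(trp_blocks, head, criterion, lambdas, x, logits, y)
loss.backward()
```
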
66
+ class Config:
67
+ @staticmethod
68
+ def gen_criterion(*args, **kwargs):
69
+ def func(input, target, mask=None):
70
+ """
71
+ Args:
72
+ input (Tensor): Input tensor.
73
+ target (Tensor): Target labels.
74
+
75
+ Returns:
76
+ loss (Tensor): Scalar tensor representing the loss.
77
+ mask (Tensor): Boolean mask tensor with the same shape of target.
78
+ """
79
+ pass
80
+ return func
81
+
82
+ @staticmethod
83
+ def gen_shared_head(*args, **kwargs):
84
+ def func(hidden_states):
85
+ """
86
+ Args:
87
+ hidden_states (Tensor): Hidden States tensor.
88
+
89
+ Returns:
90
+ logits (Tensor): Logits tensor.
91
+ """
92
+ pass
93
+ return func
94
+
95
+ @staticmethod
96
+ def forward(*args, **kwargs):
97
+ pass
98
+
99
+
100
+ # Wav2Vec2 for Audio Classification
101
+ class Wav2Vec2ForSequenceClassificationConfig(Config):
102
+ _HIDDEN_STATES_START_POSITION = 2
103
+
104
+ @staticmethod
105
+ def gen_criterion():
106
+ def func(input, target, mask=None):
107
+ """
108
+ Args:
109
+ input (Tensor): Input tensor of shape [B, C].
110
+ target (Tensor): Target labels of shape [B].
111
+
112
+ Returns:
113
+ loss (Tensor): Scalar tensor representing the loss.
114
+ mask (Tensor): Boolean mask tensor of shape [B].
115
+ """
116
+ if mask is None:
117
+ mask = torch.ones_like(target, dtype=torch.float32, device=target.device)
118
+
119
+ unmasked_loss = F.cross_entropy(input, target, reduction="none")
120
+ loss = torch.sum(mask * unmasked_loss) / (torch.sum(mask) + 1e-6)
121
+
122
+ with torch.no_grad():
123
+ mask = mask * torch.eq(torch.argmax(input, dim=1), target).to(input.dtype)
124
+
125
+ return loss, mask
126
+ return func
127
+
128
+ @staticmethod
129
+ def gen_shared_head(self, attention_mask):
130
+ def func(hidden_states):
131
+ """
132
+ Args:
133
+ hidden_states (Tensor): Hidden States of shape [B, L, hidden_units].
134
+
135
+ Returns:
136
+ logits (Tensor): Logits tensor of shape [B, C].
137
+ """
138
+ _hidden_states = self.projector(hidden_states)
139
+ if attention_mask is None:
140
+ pooled_output = _hidden_states.mean(dim=1)
141
+ else:
142
+ padding_mask = self._get_feature_vector_attention_mask(_hidden_states.shape[1], attention_mask)
143
+ expand_padding_mask = padding_mask.unsqueeze(-1).repeat(1, 1, _hidden_states.shape[2])
144
+ _hidden_states[~expand_padding_mask] = 0.0
145
+ pooled_output = _hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)
146
+
147
+ logits = self.classifier(pooled_output)
148
+ return logits
149
+ return func
150
+
151
+ @staticmethod
152
+ def gen_forward(lambdas, loss_normalization=False):
153
+ def func(
154
+ self,
155
+ input_values: Optional[torch.Tensor],
156
+ attention_mask: Optional[torch.Tensor] = None,
157
+ output_attentions: Optional[bool] = None,
158
+ output_hidden_states: Optional[bool] = None,
159
+ return_dict: Optional[bool] = None,
160
+ labels: Optional[torch.Tensor] = None,
161
+ ) -> Union[Tuple, SequenceClassifierOutput]:
162
+ r"""
163
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
164
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
165
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
166
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
167
+ """
168
+
169
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
170
+ output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
171
+
172
+ outputs = self.wav2vec2(
173
+ input_values,
174
+ attention_mask=attention_mask,
175
+ output_attentions=output_attentions,
176
+ output_hidden_states=output_hidden_states,
177
+ return_dict=return_dict,
178
+ )
179
+
180
+ if self.config.use_weighted_layer_sum:
181
+ hidden_states = outputs[Wav2Vec2ForSequenceClassificationConfig._HIDDEN_STATES_START_POSITION]
182
+ hidden_states = torch.stack(hidden_states, dim=1)
183
+ norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
184
+ hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
185
+ else:
186
+ hidden_states = outputs[0]
187
+
188
+ _hidden_states = self.projector(hidden_states)
189
+ if attention_mask is None:
190
+ pooled_output = _hidden_states.mean(dim=1)
191
+ else:
192
+ padding_mask = self._get_feature_vector_attention_mask(_hidden_states.shape[1], attention_mask)
193
+ expand_padding_mask = padding_mask.unsqueeze(-1).repeat(1, 1, _hidden_states.shape[2])
194
+ _hidden_states[~expand_padding_mask] = 0.0
195
+ pooled_output = _hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)
196
+
197
+ logits = self.classifier(pooled_output)
198
+
199
+ loss = None
200
+ if labels is not None:
201
+ shared_head = Wav2Vec2ForSequenceClassificationConfig.gen_shared_head(self, attention_mask)
202
+ criterion = Wav2Vec2ForSequenceClassificationConfig.gen_criterion()
203
+ loss = trp_criterion(self.trp_blocks, shared_head, criterion, lambdas, hidden_states, logits.view(-1, self.config.num_labels), labels.view(-1), loss_normalization) # NOTE: Apply TRP!
204
+
205
+ if not return_dict:
206
+ output = (logits,) + outputs[Wav2Vec2ForSequenceClassificationConfig._HIDDEN_STATES_START_POSITION:]
207
+ return ((loss,) + output) if loss is not None else output
208
+
209
+ return SequenceClassifierOutput(
210
+ loss=loss,
211
+ logits=logits,
212
+ hidden_states=outputs.hidden_states,
213
+ attentions=outputs.attentions,
214
+ )
215
+ return func
216
+
217
+
218
+ # MobileNetV2 for Image Classification
219
+ class MobileNetV2Config(Config):
220
+ @staticmethod
221
+ def gen_criterion(label_smoothing=0.0, top_k=1):
222
+ def func(input, target, mask=None):
223
+ """
224
+ Args:
225
+ input (Tensor): Input tensor of shape [B, C].
226
+ target (Tensor): Target labels of shape [B] or [B, C].
227
+
228
+ Returns:
229
+ loss (Tensor): Scalar tensor representing the loss.
230
+ mask (Tensor): Boolean mask tensor of shape [B].
231
+ """
232
+ label = torch.argmax(target, dim=1) if label_smoothing > 0.0 else target
233
+
234
+ unmasked_loss = F.cross_entropy(input, label, reduction="none", label_smoothing=label_smoothing)
235
+ if mask is None:
236
+ mask = torch.ones_like(unmasked_loss, dtype=torch.float32, device=target.device)
237
+ loss = torch.sum(mask * unmasked_loss) / (torch.sum(mask) + 1e-6)
238
+
239
+ with torch.no_grad():
240
+ topk_values, topk_indices = torch.topk(input, top_k, dim=-1)
241
+ mask = mask * torch.eq(topk_indices, label[:, None]).any(dim=-1).to(input.dtype)
242
+
243
+ return loss, mask
244
+ return func
245
+
246
+ @staticmethod
247
+ def gen_shared_head(self):
248
+ def func(x):
249
+ """
250
+ Args:
251
+ x (Tensor): Hidden States tensor of shape [B, hidden_units].
252
+
253
+ Returns:
254
+ logits (Tensor): Logits tensor of shape [B, C].
255
+ """
256
+ logits = self.classifier(x)
257
+ return logits
258
+ return func
259
+
260
+ @staticmethod
261
+ def gen_forward(lambdas, loss_normalization=True, label_smoothing=0.0, top_k=1):
262
+ def func(self, images: Tensor, targets=None):
263
+ x = self.features(images)
264
+ x = nn.functional.adaptive_avg_pool2d(x, (1, 1))
265
+ x = torch.flatten(x, 1)
266
+ logits = self.classifier(x)
267
+
268
+ if self.training:
269
+ torch._assert(targets is not None, "targets should not be none when in training mode")
270
+ shared_head = MobileNetV2Config.gen_shared_head(self)
271
+ criterion = MobileNetV2Config.gen_criterion(label_smoothing, top_k)
272
+ loss = trp_criterion(self.trp_blocks, shared_head, criterion, lambdas, x, logits, targets, loss_normalization)
273
+ return logits, loss
274
+ return logits
275
+ return func
276
+
277
+
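
Each `*Config` class bundles the three pieces that `apply_trp` (defined at the end of this file) binds onto a model: a masked criterion, a shared-head closure, and a replacement `forward` attached via `types.MethodType`. The sketch below shows that binding for `MobileNetV2Config` on a stock torchvision MobileNetV2, assuming the definitions in this file are in scope; the lambdas, dropout probability, and class count are illustrative, and 1280 is simply MobileNetV2's final feature width.

```python
# Sketch: binding the TRP forward onto a stock torchvision MobileNetV2.
import types
import torch
import torchvision

model = torchvision.models.mobilenet_v2(num_classes=10)
lambdas = [0.5, 0.25]  # illustrative replica-loss weights
model.trp_blocks = torch.nn.ModuleList(
    [TPBlock(depths=1, in_features=1280, p=0.1) for _ in lambdas]
)
model.forward = types.MethodType(
    MobileNetV2Config.gen_forward(lambdas, loss_normalization=True, label_smoothing=0.0, top_k=1),
    model,
)

model.train()
images = torch.randn(4, 3, 224, 224)
targets = torch.randint(0, 10, (4,))
logits, loss = model(images, targets)   # training mode returns (logits, loss)
model.eval()
logits = model(images)                  # eval mode returns logits only
```
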
278
+ # ResNet for Image Classification
279
+ class ResNetConfig(MobileNetV2Config):
280
+ @staticmethod
281
+ def gen_shared_head(self):
282
+ def func(x):
283
+ """
284
+ Args:
285
+ x (Tensor): Hidden States tensor of shape [B, hidden_units].
286
+
287
+ Returns:
288
+ logits (Tensor): Logits tensor of shape [B, C].
289
+ """
290
+ logits = self.fc(x)
291
+ return logits
292
+ return func
293
+
294
+ @staticmethod
295
+ def gen_forward(lambdas, loss_normalization=True, label_smoothing=0.0, top_k=1):
296
+ def func(self, images: Tensor, targets=None):
297
+ x = self.conv1(images)
298
+ x = self.bn1(x)
299
+ x = self.relu(x)
300
+ x = self.maxpool(x)
301
+
302
+ x = self.layer1(x)
303
+ x = self.layer2(x)
304
+ x = self.layer3(x)
305
+ x = self.layer4(x)
306
+
307
+ x = self.avgpool(x)
308
+ x = torch.flatten(x, 1)
309
+ logits = self.fc(x)
310
+
311
+ if self.training:
312
+ torch._assert(targets is not None, "targets should not be none when in training mode")
313
+ shared_head = ResNetConfig.gen_shared_head(self)
314
+ criterion = ResNetConfig.gen_criterion(label_smoothing, top_k)
315
+ loss = trp_criterion(self.trp_blocks, shared_head, criterion, lambdas, x, logits, targets, loss_normalization)
316
+ return logits, loss
317
+ return logits
318
+ return func
319
+
320
+
321
+ # EfficientNet for Image Classification
322
+ class EfficientNetConfig(MobileNetV2Config):
323
+ @staticmethod
324
+ def gen_shared_head(self):
325
+ def func(x):
326
+ """
327
+ Args:
328
+ x (Tensor): Hidden States tensor of shape [B, hidden_units].
329
+
330
+ Returns:
331
+ logits (Tensor): Logits tensor of shape [B, C].
332
+ """
333
+ logits = self.classifier(x)
334
+ return logits
335
+ return func
336
+
337
+ @staticmethod
338
+ def gen_forward(lambdas, loss_normalization=True, label_smoothing=0.0, top_k=1):
339
+ def func(self, images: Tensor, targets=None):
340
+ x = self.features(images)
341
+ x = self.avgpool(x)
342
+ x = torch.flatten(x, 1)
343
+ logits = self.classifier(x)
344
+
345
+ if self.training:
346
+ torch._assert(targets is not None, "targets should not be none when in training mode")
347
+ shared_head = EfficientNetConfig.gen_shared_head(self)
348
+ criterion = EfficientNetConfig.gen_criterion(label_smoothing, top_k)
349
+ loss = trp_criterion(self.trp_blocks, shared_head, criterion, lambdas, x, logits, targets, loss_normalization)
350
+ return logits, loss
351
+ return logits
352
+ return func
353
+
354
+
355
+ # ViT for Image Classification
356
+ class VisionTransformerConfig(MobileNetV2Config):
357
+ @staticmethod
358
+ def gen_shared_head(self):
359
+ def func(x):
360
+ """
361
+ Args:
362
+ x (Tensor): Hidden States tensor of shape [B, hidden_units].
363
+
364
+ Returns:
365
+ logits (Tensor): Logits tensor of shape [B, C].
366
+ """
367
+ logits = self.heads(x)
368
+ return logits
369
+ return func
370
+
371
+ @staticmethod
372
+ def gen_forward(lambdas, loss_normalization=True, label_smoothing=0.0, top_k=1):
373
+ def func(self, images: Tensor, targets=None):
374
+ x = self._process_input(images)
375
+ n = x.shape[0]
376
+ batch_class_token = self.class_token.expand(n, -1, -1)
377
+ x = torch.cat([batch_class_token, x], dim=1)
378
+ x = self.encoder(x)
379
+ x = x[:, 0]
380
+
381
+ logits = self.heads(x)
382
+
383
+ if self.training:
384
+ torch._assert(targets is not None, "targets should not be none when in training mode")
385
+ shared_head = VisionTransformerConfig.gen_shared_head(self)
386
+ criterion = VisionTransformerConfig.gen_criterion(label_smoothing, top_k)
387
+ loss = trp_criterion(self.trp_blocks, shared_head, criterion, lambdas, x, logits, targets, loss_normalization)
388
+ return logits, loss
389
+ return logits
390
+ return func
391
+
392
+
393
+ # Bert for Question Answering
394
+ class BertForQuestionAnsweringConfig(Config):
395
+ @staticmethod
396
+ def gen_criterion(top_k=1):
397
+ def func(input, target: List[Tensor], mask=None):
398
+ """
399
+ Args:
400
+ input (Tensor): Input tensor of shape [B, C, 2].
401
+ target (List[Tensor]):
402
+ Start Positions of shape [B].
403
+ End Positions of shape [B].
404
+
405
+ Returns:
406
+ loss (Tensor): Scalar tensor representing the loss.
407
+ mask (Tensor): Boolean mask tensor of shape [B].
408
+ """
409
+ start_positions, end_positions = target
410
+
411
+ if mask is None:
412
+ mask = torch.ones_like(start_positions, dtype=torch.float32, device=start_positions.device)
413
+
414
+ start_logits, end_logits = input.split(1, dim=-1)
415
+ start_logits = start_logits.squeeze(-1).contiguous()
416
+ end_logits = end_logits.squeeze(-1).contiguous()
417
+
418
+ # If we are on multi-GPU, split add a dimension
419
+ if len(start_positions.size()) > 1:
420
+ start_positions = start_positions.squeeze(-1)
421
+ if len(end_positions.size()) > 1:
422
+ end_positions = end_positions.squeeze(-1)
423
+ # sometimes the start/end positions are outside our model inputs, we ignore these terms
424
+ ignored_index = start_logits.size(1)
425
+ start_positions = start_positions.clamp(0, ignored_index)
426
+ end_positions = end_positions.clamp(0, ignored_index)
427
+
428
+ masked_start_losses = F.cross_entropy(start_logits, start_positions, ignore_index=ignored_index, reduction="none")
429
+ start_loss = torch.sum(mask * masked_start_losses) / (torch.sum(mask) + 1e-6)
430
+ masked_end_losses = F.cross_entropy(end_logits, end_positions, ignore_index=ignored_index, reduction="none")
431
+ end_loss = torch.sum(mask * masked_end_losses) / (torch.sum(mask) + 1e-6)
432
+
433
+ with torch.no_grad():
434
+ topk_values, topk_indices = torch.topk(start_logits, top_k, dim=1)
435
+ mask = mask * torch.eq(topk_indices, start_positions[:, None]).any(dim=1).to(start_logits.dtype)
436
+ topk_values, topk_indices = torch.topk(end_logits, top_k, dim=1)
437
+ mask = mask * torch.eq(topk_indices, end_positions[:, None]).any(dim=1).to(end_logits.dtype)
438
+
439
+ return (start_loss + end_loss) / 2, mask
440
+ return func
441
+
442
+ @staticmethod
443
+ def gen_shared_head(self):
444
+ def func(hidden_states):
445
+ """
446
+ Args:
447
+ hidden_states (Tensor): Hidden States of shape [B, C, hidden_units].
448
+
449
+ Returns:
450
+ logits (Tensor): Logits tensor of shape [B, C, 2].
451
+ """
452
+ logits = self.qa_outputs(hidden_states)
453
+ return logits
454
+ return func
455
+
456
+ @staticmethod
457
+ def gen_forward(lambdas, loss_normalization=True, top_k=1):
458
+ def func(
459
+ self,
460
+ input_ids: Optional[torch.Tensor] = None,
461
+ attention_mask: Optional[torch.Tensor] = None,
462
+ token_type_ids: Optional[torch.Tensor] = None,
463
+ position_ids: Optional[torch.Tensor] = None,
464
+ head_mask: Optional[torch.Tensor] = None,
465
+ inputs_embeds: Optional[torch.Tensor] = None,
466
+ start_positions: Optional[torch.Tensor] = None,
467
+ end_positions: Optional[torch.Tensor] = None,
468
+ output_attentions: Optional[bool] = None,
469
+ output_hidden_states: Optional[bool] = None,
470
+ return_dict: Optional[bool] = None,
471
+ ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
472
+ r"""
473
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
474
+ Labels for position (index) of the start of the labelled span for computing the token classification loss.
475
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
476
+ are not taken into account for computing the loss.
477
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
478
+ Labels for position (index) of the end of the labelled span for computing the token classification loss.
479
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
480
+ are not taken into account for computing the loss.
481
+ """
482
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
483
+
484
+ outputs = self.bert(
485
+ input_ids,
486
+ attention_mask=attention_mask,
487
+ token_type_ids=token_type_ids,
488
+ position_ids=position_ids,
489
+ head_mask=head_mask,
490
+ inputs_embeds=inputs_embeds,
491
+ output_attentions=output_attentions,
492
+ output_hidden_states=output_hidden_states,
493
+ return_dict=return_dict,
494
+ )
495
+
496
+ sequence_output = outputs[0]
497
+
498
+ logits = self.qa_outputs(sequence_output)
499
+ start_logits, end_logits = logits.split(1, dim=-1)
500
+ start_logits = start_logits.squeeze(-1).contiguous()
501
+ end_logits = end_logits.squeeze(-1).contiguous()
502
+
503
+ total_loss = None
504
+ if start_positions is not None and end_positions is not None:
505
+ shared_head = BertForQuestionAnsweringConfig.gen_shared_head(self)
506
+ criterion = BertForQuestionAnsweringConfig.gen_criterion()
507
+ total_loss = trp_criterion(self.trp_blocks, shared_head, criterion, lambdas, sequence_output, logits, [start_positions, end_positions], loss_normalization) # NOTE: Apply TRP!
508
+
509
+ if not return_dict:
510
+ output = (start_logits, end_logits) + outputs[2:]
511
+ return ((total_loss,) + output) if total_loss is not None else output
512
+
513
+ return QuestionAnsweringModelOutput(
514
+ loss=total_loss,
515
+ start_logits=start_logits,
516
+ end_logits=end_logits,
517
+ hidden_states=outputs.hidden_states,
518
+ attentions=outputs.attentions,
519
+ )
520
+ return func
521
+
522
+
523
+ # FCN for Semantic Segmentation
524
+ class FCNConfig(Config):
525
+ @staticmethod
526
+ def gen_criterion(top_k=1):
527
+ def func(input, target, mask=None):
528
+ """
529
+ Args:
530
+ input (Tensor): Input tensor of shape [B, C, H, W].
531
+ target (Tensor): Target labels of shape [B, H, W].
532
+
533
+ Returns:
534
+ loss (Tensor): Scalar tensor representing the loss.
535
+ mask (Tensor): Boolean mask tensor of shape [B, H, W].
536
+ """
537
+ if mask is None:
538
+ mask = torch.ones_like(target, dtype=torch.float32, device=target.device)
539
+
540
+ masked_loss = F.cross_entropy(input, target, ignore_index=255, reduction="none")
541
+ loss = torch.sum(mask * masked_loss) / (torch.sum(mask) + 1e-6)
542
+
543
+ with torch.no_grad():
544
+ topk_values, topk_indices = torch.topk(input, top_k, dim=1)
545
+ mask = mask * torch.eq(topk_indices, target[:, None, :, :]).any(dim=1).to(input.dtype)
546
+ # mask = mask * torch.eq(torch.argmax(x, dim=1), target).to(x.dtype)
547
+
548
+ return loss, mask
549
+ return func
550
+
551
+ @staticmethod
552
+ def gen_out_shared_head(self, input_shape):
553
+ def func(features):
554
+ """
555
+ Args:
556
+ features (Tensor): features tensor of shape [B, hidden_units, H, W].
557
+
558
+ Returns:
559
+ result (Tensor): Result tensor of shape [B, C, H, W].
560
+ """
561
+ x = self.classifier(features)
562
+ result = F.interpolate(x, size=input_shape, mode="bilinear", align_corners=False)
563
+ return result
564
+ return func
565
+
566
+ @staticmethod
567
+ def gen_aux_shared_head(self, input_shape):
568
+ def func(features):
569
+ """
570
+ Args:
571
+ features (Tensor): features tensor of shape [B, hidden_units, H, W].
572
+
573
+ Returns:
574
+ result (Tensor): Result tensor of shape [B, C, H, W].
575
+ """
576
+ x = self.aux_classifier(features)
577
+ result = F.interpolate(x, size=input_shape, mode="bilinear", align_corners=False)
578
+ return result
579
+ return func
580
+
581
+ @staticmethod
582
+ def gen_forward(lambdas, loss_normalization=True, top_k=1):
583
+ def func(self, images: Tensor, targets=None):
584
+ input_shape = images.shape[-2:]
585
+ # contract: features is a dict of tensors
586
+ features = self.backbone(images)
587
+
588
+ result = OrderedDict()
589
+ x = features["out"]
590
+ x = self.classifier(x)
591
+ x = F.interpolate(x, size=input_shape, mode="bilinear", align_corners=False)
592
+ result["out"] = x
593
+
594
+ if self.aux_classifier is not None:
595
+ x = features["aux"]
596
+ x = self.aux_classifier(x)
597
+ x = F.interpolate(x, size=input_shape, mode="bilinear", align_corners=False)
598
+ result["aux"] = x
599
+
600
+ if self.training:
601
+ torch._assert(targets is not None, "targets should not be none when in training mode")
602
+ out_shared_head = FCNConfig.gen_out_shared_head(self, input_shape)
603
+ aux_shared_head = FCNConfig.gen_aux_shared_head(self, input_shape)
604
+ criterion = FCNConfig.gen_criterion(top_k)
605
+ out_loss = trp_criterion(self.out_trp_blocks, out_shared_head, criterion, lambdas, features["out"], result["out"], targets, loss_normalization)
606
+ aux_loss = trp_criterion(self.aux_trp_blocks, aux_shared_head, criterion, lambdas, features["aux"], result["aux"], targets, loss_normalization)
607
+ loss = out_loss + 0.5 * aux_loss
608
+ return result, loss
609
+ return result
610
+ return func
611
+
612
+
613
+ # DeepLabV3Config for Semantic Segmentation
614
+ class DeepLabV3Config(FCNConfig):
615
+ pass
616
+
617
+
618
+ # Bert for Text Classification
619
+ class BertForSequenceClassificationConfig(Config):
620
+ @staticmethod
621
+ def gen_criterion():
622
+ def func(input, target, mask=None):
623
+ """
624
+ Args:
625
+ input (Tensor): Input tensor of shape [B, C].
626
+ target (Tensor): Target labels of shape [B].
627
+
628
+ Returns:
629
+ loss (Tensor): Scalar tensor representing the loss.
630
+ mask (Tensor): Boolean mask tensor of shape [B].
631
+ """
632
+ if mask is None:
633
+ mask = torch.ones_like(target, dtype=torch.float32, device=target.device)
634
+
635
+ unmasked_loss = F.cross_entropy(input, target, reduction="none")
636
+ loss = torch.sum(mask * unmasked_loss) / (torch.sum(mask) + 1e-6)
637
+
638
+ with torch.no_grad():
639
+ mask = mask * torch.eq(torch.argmax(input, dim=1), target).to(input.dtype)
640
+
641
+ return loss, mask
642
+ return func
643
+
644
+ @staticmethod
645
+ def gen_shared_head(self):
646
+ def func(hidden_states):
647
+ """
648
+ Args:
649
+ hidden_states (Tensor): Hidden States of shape [B, hidden_units].
650
+
651
+ Returns:
652
+ logits (Tensor): Logits tensor of shape [B, C].
653
+ """
654
+ logits = self.classifier(hidden_states)
655
+ return logits
656
+ return func
657
+
658
+ @staticmethod
659
+ def gen_forward(lambdas, loss_normalization=False):
660
+ def func(
661
+ self,
662
+ input_ids: Optional[torch.Tensor] = None,
663
+ attention_mask: Optional[torch.Tensor] = None,
664
+ token_type_ids: Optional[torch.Tensor] = None,
665
+ position_ids: Optional[torch.Tensor] = None,
666
+ head_mask: Optional[torch.Tensor] = None,
667
+ inputs_embeds: Optional[torch.Tensor] = None,
668
+ labels: Optional[torch.Tensor] = None,
669
+ output_attentions: Optional[bool] = None,
670
+ output_hidden_states: Optional[bool] = None,
671
+ return_dict: Optional[bool] = None,
672
+ ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
673
+ r"""
674
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
675
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
676
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
677
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
678
+ """
679
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
680
+
681
+ outputs = self.bert(
682
+ input_ids,
683
+ attention_mask=attention_mask,
684
+ token_type_ids=token_type_ids,
685
+ position_ids=position_ids,
686
+ head_mask=head_mask,
687
+ inputs_embeds=inputs_embeds,
688
+ output_attentions=output_attentions,
689
+ output_hidden_states=output_hidden_states,
690
+ return_dict=return_dict,
691
+ )
692
+
693
+ pooled_output = outputs[1]
694
+
695
+ pooled_output = self.dropout(pooled_output)
696
+ logits = self.classifier(pooled_output)
697
+
698
+ loss = None
699
+ if labels is not None:
700
+ assert self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int) # TODO: remove this
701
+ if self.config.problem_type is None:
702
+ if self.num_labels == 1:
703
+ self.config.problem_type = "regression"
704
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
705
+ self.config.problem_type = "single_label_classification"
706
+ else:
707
+ self.config.problem_type = "multi_label_classification"
708
+
709
+ if self.config.problem_type == "regression":
710
+ if self.num_labels == 1:
711
+ loss = F.mse_loss(logits.squeeze(), labels.squeeze())
712
+ else:
713
+ loss = F.mse_loss(logits, labels)
714
+ elif self.config.problem_type == "single_label_classification":
715
+ shared_head = BertForSequenceClassificationConfig.gen_shared_head(self)
716
+ criterion = BertForSequenceClassificationConfig.gen_criterion()
717
+ loss = trp_criterion(self.trp_blocks, shared_head, criterion, lambdas, pooled_output, logits, labels, loss_normalization)
718
+ elif self.config.problem_type == "multi_label_classification":
719
+ loss = F.binary_cross_entropy_with_logits(logits, labels)
720
+ if not return_dict:
721
+ output = (logits,) + outputs[2:]
722
+ return ((loss,) + output) if loss is not None else output
723
+
724
+ return SequenceClassifierOutput(
725
+ loss=loss,
726
+ logits=logits,
727
+ hidden_states=outputs.hidden_states,
728
+ attentions=outputs.attentions,
729
+ )
730
+ return func
731
+
732
+
733
+ # Roberta for Text Classification
734
+ class RobertaForSequenceClassificationConfig(BertForSequenceClassificationConfig):
735
+ @staticmethod
736
+ def gen_shared_head(self):
737
+ def func(hidden_states):
738
+ """
739
+ Args:
740
+ hidden_states (Tensor): Hidden States of shape [B, hidden_units].
741
+
742
+ Returns:
743
+ logits (Tensor): Logits tensor of shape [B, C].
744
+ """
745
+ logits = self.classifier(hidden_states)
746
+ return logits
747
+ return func
748
+
749
+ @staticmethod
750
+ def gen_forward(lambdas, loss_normalization=False):
751
+ def func(
752
+ self,
753
+ input_ids: Optional[torch.LongTensor] = None,
754
+ attention_mask: Optional[torch.FloatTensor] = None,
755
+ token_type_ids: Optional[torch.LongTensor] = None,
756
+ position_ids: Optional[torch.LongTensor] = None,
757
+ head_mask: Optional[torch.FloatTensor] = None,
758
+ inputs_embeds: Optional[torch.FloatTensor] = None,
759
+ labels: Optional[torch.LongTensor] = None,
760
+ output_attentions: Optional[bool] = None,
761
+ output_hidden_states: Optional[bool] = None,
762
+ return_dict: Optional[bool] = None,
763
+ ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
764
+ r"""
765
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
766
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
767
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
768
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
769
+ """
770
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
771
+
772
+ outputs = self.roberta(
773
+ input_ids,
774
+ attention_mask=attention_mask,
775
+ token_type_ids=token_type_ids,
776
+ position_ids=position_ids,
777
+ head_mask=head_mask,
778
+ inputs_embeds=inputs_embeds,
779
+ output_attentions=output_attentions,
780
+ output_hidden_states=output_hidden_states,
781
+ return_dict=return_dict,
782
+ )
783
+ sequence_output = outputs[0]
784
+ logits = self.classifier(sequence_output)
785
+
786
+ loss = None
787
+ if labels is not None:
788
+ assert self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int) # TODO: remove this
789
+ # move labels to correct device to enable model parallelism
790
+ labels = labels.to(logits.device)
791
+ if self.config.problem_type is None:
792
+ if self.num_labels == 1:
793
+ self.config.problem_type = "regression"
794
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
795
+ self.config.problem_type = "single_label_classification"
796
+ else:
797
+ self.config.problem_type = "multi_label_classification"
798
+
799
+ if self.config.problem_type == "regression":
800
+ if self.num_labels == 1:
801
+ loss = F.mse_loss(logits.squeeze(), labels.squeeze())
802
+ else:
803
+ loss = F.mse_loss(logits, labels)
804
+ elif self.config.problem_type == "single_label_classification":
805
+ shared_head = BertForSequenceClassificationConfig.gen_shared_head(self)
806
+ criterion = BertForSequenceClassificationConfig.gen_criterion()
807
+ loss = trp_criterion(self.trp_blocks, shared_head, criterion, lambdas, sequence_output, logits, labels, loss_normalization)
808
+ elif self.config.problem_type == "multi_label_classification":
809
+ loss = F.binary_cross_entropy_with_logits(logits, labels)
810
+
811
+ if not return_dict:
812
+ output = (logits,) + outputs[2:]
813
+ return ((loss,) + output) if loss is not None else output
814
+
815
+ return SequenceClassifierOutput(
816
+ loss=loss,
817
+ logits=logits,
818
+ hidden_states=outputs.hidden_states,
819
+ attentions=outputs.attentions,
820
+ )
821
+ return func
822
+
823
+
824
+ # Wav2Vec2 for Speech Recognition
825
+ class Wav2Vec2ForCTCConfig(Config):
826
+ _HIDDEN_STATES_START_POSITION = 2
827
+
828
+ @staticmethod
829
+ def greedy_decode_ctc(
830
+ log_probs: torch.Tensor,
831
+ input_lengths: torch.Tensor,
832
+ blank_token_id: int,
833
+ target_lengths: torch.Tensor
834
+ ):
835
+ """
836
+ Convert logits to flattened predictions that match the shape of flattened_targets.
837
+
838
+ Args:
839
+ log_probs: [B, L, V] - log-softmax output
840
+ input_lengths: [B] - actual length of each input
841
+ blank_token_id: int - index of blank token
842
+ target_lengths: [B] - used to determine how many predictions to keep per sample
843
+
844
+ Returns:
845
+ flattened_predictions: 1D tensor, same total length as sum(target_lengths)
846
+ """
847
+ batch_size = log_probs.size(0)
848
+ decoded_all = []
849
+
850
+ predicted_ids = log_probs.argmax(dim=-1) # [B, L]
851
+
852
+ for i in range(batch_size):
853
+ pred = predicted_ids[i][:input_lengths[i]] # [Li]
854
+ prev = None
855
+ decoded = []
856
+ for token in pred:
857
+ token = token.item()
858
+ if token != blank_token_id and token != prev:
859
+ decoded.append(token)
860
+ prev = token
861
+ # Trim or pad to match target_lengths[i]
862
+ tgt_len = target_lengths[i].item()
863
+ if len(decoded) >= tgt_len:
864
+ decoded = decoded[:tgt_len]
865
+ else:
866
+ decoded = decoded + [blank_token_id] * (tgt_len - len(decoded)) # pad with blank
867
+ decoded_all.extend(decoded)
868
+
869
+ return torch.tensor(decoded_all, dtype=torch.long, device=log_probs.device) # shape: [sum(target_lengths)]
870
+
871
+ @staticmethod
872
+ def gen_criterion(input_lengths: Tensor, pad_token_id: int, ctc_zero_infinity: bool):
873
+ def func(logits: Tensor, labels: Tensor, mask=None):
874
+ """
875
+ Args:
876
+ logits (Tensor): Logits tensor of shape [B, L, V]; log-softmax is applied internally.
877
+ labels (Tensor): Flattened Targets of shape [B, L'].
878
+
879
+ Returns:
880
+ loss (Tensor): Scalar tensor representing the loss.
881
+ mask (Tensor): Boolean mask tensor of shape [B].
882
+ """
883
+ if mask is None:
884
+ mask = torch.ones_like(input_lengths, dtype=torch.float32, device=input_lengths.device)
885
+
886
+ log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)
887
+ labels_mask = labels >= 0
888
+ target_lengths = labels_mask.sum(-1)
889
+ flattened_targets = labels.masked_select(labels_mask)
890
+ with torch.backends.cudnn.flags(enabled=False):
891
+ masked_losses = nn.functional.ctc_loss(log_probs, flattened_targets, input_lengths, target_lengths, blank=pad_token_id, reduction="none", zero_infinity=ctc_zero_infinity)
892
+ loss = torch.sum(mask * masked_losses) / (torch.sum(mask) + 1e-6)
893
+
894
+ with torch.no_grad():
895
+ thres = 0.5
896
+ flattened_predictions = Wav2Vec2ForCTCConfig.greedy_decode_ctc(
897
+ log_probs.transpose(0, 1), # [B, T, V]
898
+ input_lengths=input_lengths,
899
+ blank_token_id=pad_token_id,
900
+ target_lengths=target_lengths
901
+ )
902
+ token_wise_mask = torch.eq(flattened_predictions, flattened_targets).to(flattened_targets.dtype)
903
+ segment_ids = torch.arange(len(target_lengths), device=target_lengths.device).repeat_interleave(target_lengths)
904
+ sequence_wise_mask = torch.zeros(len(target_lengths), dtype=target_lengths.dtype, device=token_wise_mask.device).scatter_add(0, segment_ids, token_wise_mask)
905
+ mask = mask * torch.ge(sequence_wise_mask, thres * target_lengths).to(flattened_targets.dtype)
906
+
907
+ return loss, mask
908
+ return func
909
+
910
+ @staticmethod
911
+ def gen_shared_head(self):
912
+ def func(hidden_states):
913
+ """
914
+ Args:
915
+ hidden_states (Tensor): Hidden States of shape [B, L, hidden_units].
916
+
917
+ Returns:
918
+ logits (Tensor): Logits tensor of shape [B, L, V].
919
+ """
920
+ logits = self.lm_head(hidden_states)
921
+ # log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)
922
+ return logits
923
+ return func
924
+
925
+ @staticmethod
926
+ def gen_forward(lambdas, loss_normalization=False):
927
+ def func(
928
+ self,
929
+ input_values: Optional[torch.Tensor],
930
+ attention_mask: Optional[torch.Tensor] = None,
931
+ output_attentions: Optional[bool] = None,
932
+ output_hidden_states: Optional[bool] = None,
933
+ return_dict: Optional[bool] = None,
934
+ labels: Optional[torch.Tensor] = None,
935
+ ) -> Union[Tuple, CausalLMOutput]:
936
+ r"""
937
+ labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
938
+ Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
939
+ the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
940
+ All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
941
+ config.vocab_size - 1]`.
942
+ """
943
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
944
+
945
+ if labels is not None and labels.max() >= self.config.vocab_size:
946
+ raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
947
+
948
+ outputs = self.wav2vec2(
949
+ input_values,
950
+ attention_mask=attention_mask,
951
+ output_attentions=output_attentions,
952
+ output_hidden_states=output_hidden_states,
953
+ return_dict=return_dict,
954
+ )
955
+
956
+ hidden_states = outputs[0]
957
+ hidden_states = self.dropout(hidden_states)
958
+
959
+ logits = self.lm_head(hidden_states)
960
+
961
+ loss = None
962
+ if labels is not None:
963
+ # retrieve loss input_lengths from attention_mask
964
+ attention_mask = (
965
+ attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
966
+ )
967
+ input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
968
+ shared_head = Wav2Vec2ForCTCConfig.gen_shared_head(self)
969
+ criterion = Wav2Vec2ForCTCConfig.gen_criterion(input_lengths, self.config.pad_token_id, self.config.ctc_zero_infinity)
970
+ loss = trp_criterion(self.trp_blocks, shared_head, criterion, lambdas, hidden_states, logits, labels, loss_normalization) # NOTE: Apply TRP!
971
+
972
+ if not return_dict:
973
+ output = (logits,) + outputs[Wav2Vec2ForCTCConfig._HIDDEN_STATES_START_POSITION:]
974
+ return ((loss,) + output) if loss is not None else output
975
+
976
+ return CausalLMOutput(
977
+ loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
978
+ )
979
+ return func
980
+
981
+
982
+ # MBart for Translation
983
+ class MBartForConditionalGenerationConfig(Config):
984
+ @staticmethod
985
+ def gen_criterion(vocab_size: int, top_k=1):
986
+ def func(logits, labels, mask=None):
987
+ """
988
+ Args:
989
+ logits (Tensor): Logits tensor of shape [B, L, V].
990
+ labels (Tensor): Target labels of shape [B, L].
991
+
992
+ Returns:
993
+ loss (Tensor): Scalar tensor representing the loss.
994
+ mask (Tensor): Boolean mask tensor of shape [B].
995
+ """
996
+ if mask is None:
997
+ mask = torch.ones_like(labels.view(-1), dtype=torch.float32, device=labels.device)
998
+
999
+ masked_losses = F.cross_entropy(logits.view(-1, vocab_size), labels.view(-1), reduction="none")
1000
+ loss = torch.sum(mask * masked_losses) / (torch.sum(mask) + 1e-6)
1001
+
1002
+ with torch.no_grad():
1003
+ topk_values, topk_indices = torch.topk(logits.view(-1, vocab_size), top_k, dim=1)
1004
+ mask = mask * torch.eq(topk_indices, labels.view(-1, 1)).any(dim=1).to(logits.dtype)
1005
+
1006
+ return loss, mask
1007
+ return func
1008
+
1009
+ @staticmethod
1010
+ def gen_shared_head(self):
1011
+ def func(hidden_states):
1012
+ """
1013
+ Args:
1014
+ hidden_states (Tensor): Hidden States of shape [B, L, hidden_units].
1015
+
1016
+ Returns:
1017
+ logits (Tensor): Logits tensor of shape [B, L, V].
1018
+ """
1019
+ logits = self.lm_head(hidden_states) + self.final_logits_bias
1020
+ return logits
1021
+ return func
1022
+
1023
+ @staticmethod
1024
+ def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int):
1025
+ """
1026
+ Shift input ids one token to the right, and wrap the last non pad token (the <LID> token) Note that MBart does not
1027
+ have a single `decoder_start_token_id` in contrast to other Bart-like models.
1028
+ """
1029
+ prev_output_tokens = input_ids.clone()
1030
+
1031
+ if pad_token_id is None:
1032
+ raise ValueError("self.model.config.pad_token_id has to be defined.")
1033
+ # replace possible -100 values in labels by `pad_token_id`
1034
+ prev_output_tokens.masked_fill_(prev_output_tokens == -100, pad_token_id)
1035
+
1036
+ index_of_eos = (prev_output_tokens.ne(pad_token_id).sum(dim=1) - 1).unsqueeze(-1)
1037
+ decoder_start_tokens = prev_output_tokens.gather(1, index_of_eos).squeeze()
1038
+ prev_output_tokens[:, 1:] = prev_output_tokens[:, :-1].clone()
1039
+ prev_output_tokens[:, 0] = decoder_start_tokens
1040
+
1041
+ return prev_output_tokens
1042
+
1043
+ @staticmethod
1044
+ def gen_forward(lambdas, loss_normalization=False):
1045
+ def func(
1046
+ self,
1047
+ input_ids: torch.LongTensor = None,
1048
+ attention_mask: Optional[torch.Tensor] = None,
1049
+ decoder_input_ids: Optional[torch.LongTensor] = None,
1050
+ decoder_attention_mask: Optional[torch.LongTensor] = None,
1051
+ head_mask: Optional[torch.Tensor] = None,
1052
+ decoder_head_mask: Optional[torch.Tensor] = None,
1053
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
1054
+ encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
1055
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
1056
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1057
+ decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
1058
+ labels: Optional[torch.LongTensor] = None,
1059
+ use_cache: Optional[bool] = None,
1060
+ output_attentions: Optional[bool] = None,
1061
+ output_hidden_states: Optional[bool] = None,
1062
+ return_dict: Optional[bool] = None,
1063
+ ) -> Union[Seq2SeqLMOutput, Tuple[torch.FloatTensor]]:
1064
+ r"""
1065
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1066
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
1067
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
1068
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1069
+
1070
+ Returns:
1071
+
1072
+ """
1073
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1074
+
1075
+ if labels is not None:
1076
+ # if use_cache:
1077
+ # logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
1078
+ use_cache = False
1079
+ if decoder_input_ids is None and decoder_inputs_embeds is None:
1080
+ decoder_input_ids = MBartForConditionalGenerationConfig.shift_tokens_right(labels, self.config.pad_token_id)
1081
+
1082
+ outputs = self.model(
1083
+ input_ids,
1084
+ attention_mask=attention_mask,
1085
+ decoder_input_ids=decoder_input_ids,
1086
+ encoder_outputs=encoder_outputs,
1087
+ decoder_attention_mask=decoder_attention_mask,
1088
+ head_mask=head_mask,
1089
+ decoder_head_mask=decoder_head_mask,
1090
+ cross_attn_head_mask=cross_attn_head_mask,
1091
+ past_key_values=past_key_values,
1092
+ inputs_embeds=inputs_embeds,
1093
+ decoder_inputs_embeds=decoder_inputs_embeds,
1094
+ use_cache=use_cache,
1095
+ output_attentions=output_attentions,
1096
+ output_hidden_states=output_hidden_states,
1097
+ return_dict=return_dict,
1098
+ )
1099
+ lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias
1100
+
1101
+ masked_lm_loss = None
1102
+ if labels is not None:
1103
+ shared_head = MBartForConditionalGenerationConfig.gen_shared_head(self)
1104
+ criterion = MBartForConditionalGenerationConfig.gen_criterion(self.config.vocab_size)
1105
+ masked_lm_loss = trp_criterion(self.trp_blocks, shared_head, criterion, lambdas, outputs[0], lm_logits, labels, loss_normalization)
1106
+
1107
+ if not return_dict:
1108
+ output = (lm_logits,) + outputs[1:]
1109
+ return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
1110
+
1111
+ return Seq2SeqLMOutput(
1112
+ loss=masked_lm_loss,
1113
+ logits=lm_logits,
1114
+ past_key_values=outputs.past_key_values,
1115
+ decoder_hidden_states=outputs.decoder_hidden_states,
1116
+ decoder_attentions=outputs.decoder_attentions,
1117
+ cross_attentions=outputs.cross_attentions,
1118
+ encoder_last_hidden_state=outputs.encoder_last_hidden_state,
1119
+ encoder_hidden_states=outputs.encoder_hidden_states,
1120
+ encoder_attentions=outputs.encoder_attentions,
1121
+ )
1122
+ return func
1123
+
1124
+
1125
+ def apply_trp(model, depths: int, p: float, lambdas: List[float], **kwargs):
1126
+ if isinstance(model, transformers.Wav2Vec2ForSequenceClassification):
1127
+ print("✅ Applying TRP to Wav2Vec2 for Audio Classification...")
1128
+ model.trp_blocks = torch.nn.ModuleList([TPBlock(depths, 768, p) for _ in lambdas])
1129
+ model.forward = types.MethodType(Wav2Vec2ForSequenceClassificationConfig.gen_forward(lambdas, False), model)
1130
+ elif isinstance(model, MobileNetV2):
1131
+ print("✅ Applying TRP to MobileNetV2 for Image Classification...")
1132
+ model.trp_blocks = torch.nn.ModuleList([TPBlock(depths, 1280, p) for _ in lambdas])
1133
+ model.forward = types.MethodType(MobileNetV2Config.gen_forward(lambdas, True, label_smoothing=kwargs["label_smoothing"], top_k=1), model)
1134
+ elif isinstance(model, ResNet):
1135
+ print("✅ Applying TRP to ResNet for Image Classification...")
1136
+ model.trp_blocks = torch.nn.ModuleList([TPBlock(depths, 2048, p) for _ in lambdas])
1137
+ model.forward = types.MethodType(ResNetConfig.gen_forward(lambdas, True, label_smoothing=kwargs["label_smoothing"], top_k=1), model)
1138
+ elif isinstance(model, EfficientNet):
1139
+ print("✅ Applying TRP to EfficientNet for Image Classification...")
1140
+ model.trp_blocks = torch.nn.ModuleList([TPBlock(depths, 1280, p) for _ in lambdas])
1141
+ model.forward = types.MethodType(EfficientNetConfig.gen_forward(lambdas, True, label_smoothing=kwargs["label_smoothing"], top_k=1), model)
1142
+ elif isinstance(model, VisionTransformer):
1143
+ print("✅ Applying TRP to VisionTransformer for Image Classification...")
1144
+ model.trp_blocks = torch.nn.ModuleList([TPBlock(depths, 768, p) for _ in lambdas])
1145
+ model.forward = types.MethodType(VisionTransformerConfig.gen_forward(lambdas, True, label_smoothing=kwargs["label_smoothing"], top_k=1), model)
1146
+ elif isinstance(model, transformers.BertForQuestionAnswering):
1147
+ print("✅ Applying TRP to Bert for Question Answering...")
1148
+ model.trp_blocks = torch.nn.ModuleList([TPBlock(depths, 768, p) for _ in lambdas])
1149
+ model.forward = types.MethodType(BertForQuestionAnsweringConfig.gen_forward(lambdas, True, 1), model)
1150
+ elif isinstance(model, FCN):
1151
+ print("✅ Applying TRP to FCN for Semantic Segmentation...")
1152
+ model.out_trp_blocks = torch.nn.ModuleList([TPBlock(depths, 2048, p, dim=1) for _ in lambdas])
1153
+ model.aux_trp_blocks = torch.nn.ModuleList([TPBlock(depths, 1024, p, dim=1) for _ in lambdas])
1154
+ model.forward = types.MethodType(FCNConfig.gen_forward(lambdas, True, 1), model)
1155
+ elif isinstance(model, DeepLabV3):
1156
+ print("✅ Applying TRP to DeepLabV3 for Semantic Segmentation...")
1157
+ model.out_trp_blocks = torch.nn.ModuleList([TPBlock(depths, 2048, p, dim=1) for _ in lambdas])
1158
+ model.aux_trp_blocks = torch.nn.ModuleList([TPBlock(depths, 1024, p, dim=1) for _ in lambdas])
1159
+ model.forward = types.MethodType(DeepLabV3Config.gen_forward(lambdas, True, 1), model)
1160
+ elif isinstance(model, transformers.BertForSequenceClassification):
1161
+ print("✅ Applying TRP to Bert for Text Classification...")
1162
+ model.trp_blocks = torch.nn.ModuleList([TPBlock(depths, 768, p) for _ in lambdas])
1163
+ model.forward = types.MethodType(BertForSequenceClassificationConfig.gen_forward(lambdas, False), model)
1164
+ elif isinstance(model, transformers.RobertaForSequenceClassification):
1165
+ print("✅ Applying TRP to Roberta for Text Classification...")
1166
+ model.trp_blocks = torch.nn.ModuleList([TPBlock(depths, 768, p) for _ in lambdas])
1167
+ model.forward = types.MethodType(RobertaForSequenceClassificationConfig.gen_forward(lambdas, False), model)
1168
+ elif isinstance(model, transformers.Wav2Vec2ForCTC):
1169
+ print("✅ Applying TRP to Wav2Vec2 for Speech Recognition...")
1170
+ model.trp_blocks = torch.nn.ModuleList([TPBlock(depths, 1024, p) for _ in lambdas])
1171
+ model.forward = types.MethodType(Wav2Vec2ForCTCConfig.gen_forward(lambdas, False), model)
1172
+ elif isinstance(model, transformers.MBartForConditionalGeneration):
1173
+ print("✅ Applying TRP to MBart for Translation...")
1174
+ model.trp_blocks = torch.nn.ModuleList([TPBlock(depths, 1024, p) for _ in lambdas])
1175
+ model.forward = types.MethodType(MBartForConditionalGenerationConfig.gen_forward(lambdas, False), model)
1176
+ else:
1177
+ torch._assert(
1178
+ False,
1179
+ f"`apply_trp` does not support models of type {type(model).__name__}; see the architectures handled above.")
1180
+
1181
+ return model
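
A minimal usage sketch (illustrative only, not code from this repository): based on the `apply_trp` signature above, wrapping a torchvision ResNet-50 for image classification might look like the following. The `depths`, `p`, `lambdas`, and `label_smoothing` values are placeholder assumptions, and the import path is assumed from the `trplib.py` files shipped alongside each example.

    import torchvision
    from trplib import apply_trp  # assumed import; trplib.py sits next to each training script

    model = torchvision.models.resnet50(weights=None)
    # apply_trp attaches one TPBlock per lambda and rebinds model.forward
    # to the TRP-aware forward generated above; values here are illustrative.
    model = apply_trp(model, depths=2, p=0.1, lambdas=[0.4, 0.2, 0.1], label_smoothing=0.0)
    # During training the patched forward is expected to return the
    # TRP-regularized loss; the returned object is still the same model instance.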
hpo-examples/image-classification/utils.py ADDED
@@ -0,0 +1,465 @@
1
+ import copy
2
+ import datetime
3
+ import errno
4
+ import hashlib
5
+ import os
6
+ import time
7
+ from collections import defaultdict, deque, OrderedDict
8
+ from typing import List, Optional, Tuple
9
+
10
+ import torch
11
+ import torch.distributed as dist
12
+
13
+
14
+ class SmoothedValue:
15
+ """Track a series of values and provide access to smoothed values over a
16
+ window or the global series average.
17
+ """
18
+
19
+ def __init__(self, window_size=20, fmt=None):
20
+ if fmt is None:
21
+ fmt = "{median:.4f} ({global_avg:.4f})"
22
+ self.deque = deque(maxlen=window_size)
23
+ self.total = 0.0
24
+ self.count = 0
25
+ self.fmt = fmt
26
+
27
+ def update(self, value, n=1):
28
+ self.deque.append(value)
29
+ self.count += n
30
+ self.total += value * n
31
+
32
+ def synchronize_between_processes(self):
33
+ """
34
+ Warning: does not synchronize the deque!
35
+ """
36
+ t = reduce_across_processes([self.count, self.total])
37
+ t = t.tolist()
38
+ self.count = int(t[0])
39
+ self.total = t[1]
40
+
41
+ @property
42
+ def median(self):
43
+ d = torch.tensor(list(self.deque))
44
+ return d.median().item()
45
+
46
+ @property
47
+ def avg(self):
48
+ d = torch.tensor(list(self.deque), dtype=torch.float32)
49
+ return d.mean().item()
50
+
51
+ @property
52
+ def global_avg(self):
53
+ return self.total / self.count
54
+
55
+ @property
56
+ def max(self):
57
+ return max(self.deque)
58
+
59
+ @property
60
+ def value(self):
61
+ return self.deque[-1]
62
+
63
+ def __str__(self):
64
+ return self.fmt.format(
65
+ median=self.median, avg=self.avg, global_avg=self.global_avg, max=self.max, value=self.value
66
+ )
67
+
68
+
69
+ class MetricLogger:
70
+ def __init__(self, delimiter="\t"):
71
+ self.meters = defaultdict(SmoothedValue)
72
+ self.delimiter = delimiter
73
+
74
+ def update(self, **kwargs):
75
+ for k, v in kwargs.items():
76
+ if isinstance(v, torch.Tensor):
77
+ v = v.item()
78
+ assert isinstance(v, (float, int))
79
+ self.meters[k].update(v)
80
+
81
+ def __getattr__(self, attr):
82
+ if attr in self.meters:
83
+ return self.meters[attr]
84
+ if attr in self.__dict__:
85
+ return self.__dict__[attr]
86
+ raise AttributeError(f"'{type(self).__name__}' object has no attribute '{attr}'")
87
+
88
+ def __str__(self):
89
+ loss_str = []
90
+ for name, meter in self.meters.items():
91
+ loss_str.append(f"{name}: {str(meter)}")
92
+ return self.delimiter.join(loss_str)
93
+
94
+ def synchronize_between_processes(self):
95
+ for meter in self.meters.values():
96
+ meter.synchronize_between_processes()
97
+
98
+ def add_meter(self, name, meter):
99
+ self.meters[name] = meter
100
+
101
+ def log_every(self, iterable, print_freq, header=None):
102
+ i = 0
103
+ if not header:
104
+ header = ""
105
+ start_time = time.time()
106
+ end = time.time()
107
+ iter_time = SmoothedValue(fmt="{avg:.4f}")
108
+ data_time = SmoothedValue(fmt="{avg:.4f}")
109
+ space_fmt = ":" + str(len(str(len(iterable)))) + "d"
110
+ if torch.cuda.is_available():
111
+ log_msg = self.delimiter.join(
112
+ [
113
+ header,
114
+ "[{0" + space_fmt + "}/{1}]",
115
+ "eta: {eta}",
116
+ "{meters}",
117
+ "time: {time}",
118
+ "data: {data}",
119
+ "max mem: {memory:.0f}",
120
+ ]
121
+ )
122
+ else:
123
+ log_msg = self.delimiter.join(
124
+ [header, "[{0" + space_fmt + "}/{1}]", "eta: {eta}", "{meters}", "time: {time}", "data: {data}"]
125
+ )
126
+ MB = 1024.0 * 1024.0
127
+ for obj in iterable:
128
+ data_time.update(time.time() - end)
129
+ yield obj
130
+ iter_time.update(time.time() - end)
131
+ if i % print_freq == 0:
132
+ eta_seconds = iter_time.global_avg * (len(iterable) - i)
133
+ eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
134
+ if torch.cuda.is_available():
135
+ print(
136
+ log_msg.format(
137
+ i,
138
+ len(iterable),
139
+ eta=eta_string,
140
+ meters=str(self),
141
+ time=str(iter_time),
142
+ data=str(data_time),
143
+ memory=torch.cuda.max_memory_allocated() / MB,
144
+ )
145
+ )
146
+ else:
147
+ print(
148
+ log_msg.format(
149
+ i, len(iterable), eta=eta_string, meters=str(self), time=str(iter_time), data=str(data_time)
150
+ )
151
+ )
152
+ i += 1
153
+ end = time.time()
154
+ total_time = time.time() - start_time
155
+ total_time_str = str(datetime.timedelta(seconds=int(total_time)))
156
+ print(f"{header} Total time: {total_time_str}")
157
+
158
+
159
+ class ExponentialMovingAverage(torch.optim.swa_utils.AveragedModel):
160
+ """Maintains moving averages of model parameters using an exponential decay.
161
+ ``ema_avg = decay * avg_model_param + (1 - decay) * model_param``
162
+ `torch.optim.swa_utils.AveragedModel <https://pytorch.org/docs/stable/optim.html#custom-averaging-strategies>`_
163
+ is used to compute the EMA.
164
+ """
165
+
166
+ def __init__(self, model, decay, device="cpu"):
167
+ def ema_avg(avg_model_param, model_param, num_averaged):
168
+ return decay * avg_model_param + (1 - decay) * model_param
169
+
170
+ super().__init__(model, device, ema_avg, use_buffers=True)
171
+
172
+
173
+ def accuracy(output, target, topk=(1,)):
174
+ """Computes the accuracy over the k top predictions for the specified values of k"""
175
+ with torch.inference_mode():
176
+ maxk = max(topk)
177
+ batch_size = target.size(0)
178
+ if target.ndim == 2:
179
+ target = target.max(dim=1)[1]
180
+
181
+ _, pred = output.topk(maxk, 1, True, True)
182
+ pred = pred.t()
183
+ correct = pred.eq(target[None])
184
+
185
+ res = []
186
+ for k in topk:
187
+ correct_k = correct[:k].flatten().sum(dtype=torch.float32)
188
+ res.append(correct_k * (100.0 / batch_size))
189
+ return res
190
+
191
+
192
+ def mkdir(path):
193
+ try:
194
+ os.makedirs(path)
195
+ except OSError as e:
196
+ if e.errno != errno.EEXIST:
197
+ raise
198
+
199
+
200
+ def setup_for_distributed(is_master):
201
+ """
202
+ This function disables printing when not in master process
203
+ """
204
+ import builtins as __builtin__
205
+
206
+ builtin_print = __builtin__.print
207
+
208
+ def print(*args, **kwargs):
209
+ force = kwargs.pop("force", False)
210
+ if is_master or force:
211
+ builtin_print(*args, **kwargs)
212
+
213
+ __builtin__.print = print
214
+
215
+
216
+ def is_dist_avail_and_initialized():
217
+ if not dist.is_available():
218
+ return False
219
+ if not dist.is_initialized():
220
+ return False
221
+ return True
222
+
223
+
224
+ def get_world_size():
225
+ if not is_dist_avail_and_initialized():
226
+ return 1
227
+ return dist.get_world_size()
228
+
229
+
230
+ def get_rank():
231
+ if not is_dist_avail_and_initialized():
232
+ return 0
233
+ return dist.get_rank()
234
+
235
+
236
+ def is_main_process():
237
+ return get_rank() == 0
238
+
239
+
240
+ def save_on_master(*args, **kwargs):
241
+ if is_main_process():
242
+ torch.save(*args, **kwargs)
243
+
244
+
245
+ def init_distributed_mode(args):
246
+ if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
247
+ args.rank = int(os.environ["RANK"])
248
+ args.world_size = int(os.environ["WORLD_SIZE"])
249
+ args.gpu = int(os.environ["LOCAL_RANK"])
250
+ elif "SLURM_PROCID" in os.environ:
251
+ args.rank = int(os.environ["SLURM_PROCID"])
252
+ args.gpu = args.rank % torch.cuda.device_count()
253
+ elif hasattr(args, "rank"):
254
+ pass
255
+ else:
256
+ print("Not using distributed mode")
257
+ args.distributed = False
258
+ return
259
+
260
+ args.distributed = True
261
+
262
+ torch.cuda.set_device(args.gpu)
263
+ args.dist_backend = "nccl"
264
+ print(f"| distributed init (rank {args.rank}): {args.dist_url}", flush=True)
265
+ torch.distributed.init_process_group(
266
+ backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank
267
+ )
268
+ torch.distributed.barrier()
269
+ setup_for_distributed(args.rank == 0)
270
+
271
+
272
+ def average_checkpoints(inputs):
273
+ """Loads checkpoints from inputs and returns a model with averaged weights. Original implementation taken from:
274
+ https://github.com/pytorch/fairseq/blob/a48f235636557b8d3bc4922a6fa90f3a0fa57955/scripts/average_checkpoints.py#L16
275
+
276
+ Args:
277
+ inputs (List[str]): An iterable of string paths of checkpoints to load from.
278
+ Returns:
279
+ A dict of string keys mapping to various values. The 'model' key
280
+ from the returned dict should correspond to an OrderedDict mapping
281
+ string parameter names to torch Tensors.
282
+ """
283
+ params_dict = OrderedDict()
284
+ params_keys = None
285
+ new_state = None
286
+ num_models = len(inputs)
287
+ for fpath in inputs:
288
+ with open(fpath, "rb") as f:
289
+ state = torch.load(
290
+ f,
291
+ map_location=(lambda s, _: torch.serialization.default_restore_location(s, "cpu")),
292
+ )
293
+ # Copies over the settings from the first checkpoint
294
+ if new_state is None:
295
+ new_state = state
296
+ model_params = state["model"]
297
+ model_params_keys = list(model_params.keys())
298
+ if params_keys is None:
299
+ params_keys = model_params_keys
300
+ elif params_keys != model_params_keys:
301
+ raise KeyError(
302
+ f"For checkpoint {f}, expected list of params: {params_keys}, but found: {model_params_keys}"
303
+ )
304
+ for k in params_keys:
305
+ p = model_params[k]
306
+ if isinstance(p, torch.HalfTensor):
307
+ p = p.float()
308
+ if k not in params_dict:
309
+ params_dict[k] = p.clone()
310
+ # NOTE: clone() is needed in case of p is a shared parameter
311
+ else:
312
+ params_dict[k] += p
313
+ averaged_params = OrderedDict()
314
+ for k, v in params_dict.items():
315
+ averaged_params[k] = v
316
+ if averaged_params[k].is_floating_point():
317
+ averaged_params[k].div_(num_models)
318
+ else:
319
+ averaged_params[k] //= num_models
320
+ new_state["model"] = averaged_params
321
+ return new_state
322
+
323
+
324
+ def store_model_weights(model, checkpoint_path, checkpoint_key="model", strict=True):
325
+ """
326
+ This method can be used to prepare weights files for new models. It receives as
327
+ input a model architecture and a checkpoint from the training script and produces
328
+ a file with the weights ready for release.
329
+
330
+ Examples:
331
+ from torchvision import models as M
332
+
333
+ # Classification
334
+ model = M.mobilenet_v3_large(weights=None)
335
+ print(store_model_weights(model, './class.pth'))
336
+
337
+ # Quantized Classification
338
+ model = M.quantization.mobilenet_v3_large(weights=None, quantize=False)
339
+ model.fuse_model(is_qat=True)
340
+ model.qconfig = torch.ao.quantization.get_default_qat_qconfig('qnnpack')
341
+ _ = torch.ao.quantization.prepare_qat(model, inplace=True)
342
+ print(store_model_weights(model, './qat.pth'))
343
+
344
+ # Object Detection
345
+ model = M.detection.fasterrcnn_mobilenet_v3_large_fpn(weights=None, weights_backbone=None)
346
+ print(store_model_weights(model, './obj.pth'))
347
+
348
+ # Segmentation
349
+ model = M.segmentation.deeplabv3_mobilenet_v3_large(weights=None, weights_backbone=None, aux_loss=True)
350
+ print(store_model_weights(model, './segm.pth', strict=False))
351
+
352
+ Args:
353
+ model (pytorch.nn.Module): The model on which the weights will be loaded for validation purposes.
354
+ checkpoint_path (str): The path of the checkpoint we will load.
355
+ checkpoint_key (str, optional): The key of the checkpoint where the model weights are stored.
356
+ Default: "model".
357
+ strict (bool): whether to strictly enforce that the keys
358
+ in :attr:`state_dict` match the keys returned by this module's
359
+ :meth:`~torch.nn.Module.state_dict` function. Default: ``True``
360
+
361
+ Returns:
362
+ output_path (str): The location where the weights are saved.
363
+ """
364
+ # Store the new model next to the checkpoint_path
365
+ checkpoint_path = os.path.abspath(checkpoint_path)
366
+ output_dir = os.path.dirname(checkpoint_path)
367
+
368
+ # Deep copy to avoid side-effects on the model object.
369
+ model = copy.deepcopy(model)
370
+ checkpoint = torch.load(checkpoint_path, map_location="cpu")
371
+
372
+ # Load the weights to the model to validate that everything works
373
+ # and remove unnecessary weights (such as auxiliaries, etc)
374
+ if checkpoint_key == "model_ema":
375
+ del checkpoint[checkpoint_key]["n_averaged"]
376
+ torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(checkpoint[checkpoint_key], "module.")
377
+ model.load_state_dict(checkpoint[checkpoint_key], strict=strict)
378
+
379
+ tmp_path = os.path.join(output_dir, str(model.__hash__()))
380
+ torch.save(model.state_dict(), tmp_path)
381
+
382
+ sha256_hash = hashlib.sha256()
383
+ with open(tmp_path, "rb") as f:
384
+ # Read and update hash string value in blocks of 4K
385
+ for byte_block in iter(lambda: f.read(4096), b""):
386
+ sha256_hash.update(byte_block)
387
+ hh = sha256_hash.hexdigest()
388
+
389
+ output_path = os.path.join(output_dir, "weights-" + str(hh[:8]) + ".pth")
390
+ os.replace(tmp_path, output_path)
391
+
392
+ return output_path
393
+
394
+
395
+ def reduce_across_processes(val):
396
+ if not is_dist_avail_and_initialized():
397
+ # nothing to sync, but we still convert to tensor for consistency with the distributed case.
398
+ return torch.tensor(val)
399
+
400
+ t = torch.tensor(val, device="cuda")
401
+ dist.barrier()
402
+ dist.all_reduce(t)
403
+ return t
404
+
405
+
406
+ def set_weight_decay(
407
+ model: torch.nn.Module,
408
+ weight_decay: float,
409
+ norm_weight_decay: Optional[float] = None,
410
+ norm_classes: Optional[List[type]] = None,
411
+ custom_keys_weight_decay: Optional[List[Tuple[str, float]]] = None,
412
+ ):
413
+ if not norm_classes:
414
+ norm_classes = [
415
+ torch.nn.modules.batchnorm._BatchNorm,
416
+ torch.nn.LayerNorm,
417
+ torch.nn.GroupNorm,
418
+ torch.nn.modules.instancenorm._InstanceNorm,
419
+ torch.nn.LocalResponseNorm,
420
+ ]
421
+ norm_classes = tuple(norm_classes)
422
+
423
+ params = {
424
+ "other": [],
425
+ "norm": [],
426
+ }
427
+ params_weight_decay = {
428
+ "other": weight_decay,
429
+ "norm": norm_weight_decay,
430
+ }
431
+ custom_keys = []
432
+ if custom_keys_weight_decay is not None:
433
+ for key, weight_decay in custom_keys_weight_decay:
434
+ params[key] = []
435
+ params_weight_decay[key] = weight_decay
436
+ custom_keys.append(key)
437
+
438
+ def _add_params(module, prefix=""):
439
+ for name, p in module.named_parameters(recurse=False):
440
+ if not p.requires_grad:
441
+ continue
442
+ is_custom_key = False
443
+ for key in custom_keys:
444
+ target_name = f"{prefix}.{name}" if prefix != "" and "." in key else name
445
+ if key == target_name:
446
+ params[key].append(p)
447
+ is_custom_key = True
448
+ break
449
+ if not is_custom_key:
450
+ if norm_weight_decay is not None and isinstance(module, norm_classes):
451
+ params["norm"].append(p)
452
+ else:
453
+ params["other"].append(p)
454
+
455
+ for child_name, child_module in module.named_children():
456
+ child_prefix = f"{prefix}.{child_name}" if prefix != "" else child_name
457
+ _add_params(child_module, prefix=child_prefix)
458
+
459
+ _add_params(model)
460
+
461
+ param_groups = []
462
+ for key in params:
463
+ if len(params[key]) > 0:
464
+ param_groups.append({"params": params[key], "weight_decay": params_weight_decay[key]})
465
+ return param_groups
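
A minimal sketch (assumed values, not from this repository) of how `set_weight_decay` above can feed an optimizer while excluding normalization layers from weight decay:

    import torch
    import torchvision
    from utils import set_weight_decay  # assumed import of the module above

    model = torchvision.models.resnet50(weights=None)
    param_groups = set_weight_decay(
        model,
        weight_decay=1e-4,       # decay applied to ordinary weights
        norm_weight_decay=0.0,   # BatchNorm/LayerNorm/GroupNorm parameters get no decay
    )
    optimizer = torch.optim.SGD(param_groups, lr=0.1, momentum=0.9)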
hpo-examples/image-classification/vit_b_16/model_4.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29f1bd991a3f27f7982e8700f31332cb94c4783b83e240fc71f1eca03d4eb468
3
+ size 1053172110
hpo-examples/question-answering/qa/README.md ADDED
@@ -0,0 +1,55 @@
1
+ ---
2
+ library_name: transformers
3
+ license: apache-2.0
4
+ base_model: google-bert/bert-base-uncased
5
+ tags:
6
+ - generated_from_trainer
7
+ datasets:
8
+ - squad
9
+ model-index:
10
+ - name: baseline
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # baseline
18
+
19
+ This model is a fine-tuned version of [google-bert/bert-base-uncased](https://huggingface.co/google-bert/bert-base-uncased) on the squad dataset. It achieves an exact-match score of 81.49 and an F1 of 88.63 on the SQuAD evaluation set (see `eval_results.json`).
20
+
21
+ ## Model description
22
+
23
+ More information needed
24
+
25
+ ## Intended uses & limitations
26
+
27
+ More information needed
28
+
29
+ ## Training and evaluation data
30
+
31
+ More information needed
32
+
33
+ ## Training procedure
34
+
35
+ ### Training hyperparameters
36
+
37
+ The following hyperparameters were used during training:
38
+ - learning_rate: 3e-05
39
+ - train_batch_size: 12
40
+ - eval_batch_size: 8
41
+ - seed: 42
42
+ - optimizer: adamw_torch with betas=(0.9, 0.999) and epsilon=1e-08 (no additional optimizer arguments)
43
+ - lr_scheduler_type: linear
44
+ - num_epochs: 2.0
45
+
46
+ ### Training results
47
+
48
+
49
+
50
+ ### Framework versions
51
+
52
+ - Transformers 4.49.0
53
+ - Pytorch 2.6.0+cu118
54
+ - Datasets 3.3.1
55
+ - Tokenizers 0.21.0
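
For reference, the hyperparameters listed in the README above map roughly onto the following `transformers.TrainingArguments`; this is a sketch only, and `output_dir` is a placeholder rather than a value taken from this repository.

    from transformers import TrainingArguments

    training_args = TrainingArguments(
        output_dir="qa",                  # placeholder path
        learning_rate=3e-5,
        per_device_train_batch_size=12,
        per_device_eval_batch_size=8,
        seed=42,
        optim="adamw_torch",
        adam_beta1=0.9,
        adam_beta2=0.999,
        adam_epsilon=1e-8,
        lr_scheduler_type="linear",
        num_train_epochs=2.0,
    )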
hpo-examples/question-answering/qa/all_results.json ADDED
@@ -0,0 +1,15 @@
1
+ {
2
+ "epoch": 2.0,
3
+ "eval_exact_match": 81.49479659413434,
4
+ "eval_f1": 88.62945564424126,
5
+ "eval_runtime": 61.0301,
6
+ "eval_samples": 10784,
7
+ "eval_samples_per_second": 176.7,
8
+ "eval_steps_per_second": 22.087,
9
+ "total_flos": 3.541929151120589e+16,
10
+ "train_loss": 1.148573803161563,
11
+ "train_runtime": 3245.3985,
12
+ "train_samples": 88524,
13
+ "train_samples_per_second": 54.554,
14
+ "train_steps_per_second": 4.546
15
+ }
hpo-examples/question-answering/qa/config.json ADDED
@@ -0,0 +1,26 @@
1
+ {
2
+ "_name_or_path": "google-bert/bert-base-uncased",
3
+ "architectures": [
4
+ "BertForQuestionAnswering"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-12,
15
+ "max_position_embeddings": 512,
16
+ "model_type": "bert",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 0,
20
+ "position_embedding_type": "absolute",
21
+ "torch_dtype": "float32",
22
+ "transformers_version": "4.49.0",
23
+ "type_vocab_size": 2,
24
+ "use_cache": true,
25
+ "vocab_size": 30522
26
+ }
hpo-examples/question-answering/qa/eval_nbest_predictions.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b8d44953cbe0ce20d1d1b62b72e7adba18bf1dc81d055492e22bfa21ff46657
3
+ size 49596120
hpo-examples/question-answering/qa/eval_predictions.json ADDED
The diff for this file is too large to render. See raw diff
 
hpo-examples/question-answering/qa/eval_results.json ADDED
@@ -0,0 +1,9 @@
1
+ {
2
+ "epoch": 2.0,
3
+ "eval_exact_match": 81.49479659413434,
4
+ "eval_f1": 88.62945564424126,
5
+ "eval_runtime": 61.0301,
6
+ "eval_samples": 10784,
7
+ "eval_samples_per_second": 176.7,
8
+ "eval_steps_per_second": 22.087
9
+ }
hpo-examples/question-answering/qa/model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38003bd65e4bfa70dd16886f29af7ab00d1aa0ae4de191b0a7de4d7883d17dde
3
+ size 442683784
hpo-examples/question-answering/qa/runs/May15_03-24-14_cs-Precision-7960-Tower/events.out.tfevents.1747293859.cs-Precision-7960-Tower.147971.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36bfca6273a2422943de7b634cf75efd69b8e92079abe84df9e9c9e026d497f6
3
+ size 11535
hpo-examples/question-answering/qa/runs/May15_03-24-14_cs-Precision-7960-Tower/events.out.tfevents.1747297197.cs-Precision-7960-Tower.147971.1 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:259c79a03ba9c522b1fd728e92dae5cfc31c6cd73b2377d124749c83a0163910
3
+ size 412
hpo-examples/question-answering/qa/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
hpo-examples/question-answering/qa/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
hpo-examples/question-answering/qa/tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": true,
47
+ "extra_special_tokens": {},
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 512,
50
+ "pad_token": "[PAD]",
51
+ "sep_token": "[SEP]",
52
+ "strip_accents": null,
53
+ "tokenize_chinese_chars": true,
54
+ "tokenizer_class": "BertTokenizer",
55
+ "unk_token": "[UNK]"
56
+ }
hpo-examples/question-answering/qa/train_results.json ADDED
@@ -0,0 +1,9 @@
1
+ {
2
+ "epoch": 2.0,
3
+ "total_flos": 3.541929151120589e+16,
4
+ "train_loss": 1.148573803161563,
5
+ "train_runtime": 3245.3985,
6
+ "train_samples": 88524,
7
+ "train_samples_per_second": 54.554,
8
+ "train_steps_per_second": 4.546
9
+ }
hpo-examples/question-answering/qa/trainer_state.json ADDED
@@ -0,0 +1,245 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.0,
5
+ "eval_steps": 500,
6
+ "global_step": 14754,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.06777822963264199,
13
+ "grad_norm": 31.397275924682617,
14
+ "learning_rate": 2.8983326555510372e-05,
15
+ "loss": 2.7299,
16
+ "step": 500
17
+ },
18
+ {
19
+ "epoch": 0.13555645926528398,
20
+ "grad_norm": 25.8492431640625,
21
+ "learning_rate": 2.796665311102074e-05,
22
+ "loss": 1.752,
23
+ "step": 1000
24
+ },
25
+ {
26
+ "epoch": 0.203334688897926,
27
+ "grad_norm": 29.627431869506836,
28
+ "learning_rate": 2.694997966653111e-05,
29
+ "loss": 1.5588,
30
+ "step": 1500
31
+ },
32
+ {
33
+ "epoch": 0.27111291853056796,
34
+ "grad_norm": 21.147193908691406,
35
+ "learning_rate": 2.593330622204148e-05,
36
+ "loss": 1.5014,
37
+ "step": 2000
38
+ },
39
+ {
40
+ "epoch": 0.33889114816321,
41
+ "grad_norm": 17.81966781616211,
42
+ "learning_rate": 2.491663277755185e-05,
43
+ "loss": 1.4768,
44
+ "step": 2500
45
+ },
46
+ {
47
+ "epoch": 0.406669377795852,
48
+ "grad_norm": 20.26822853088379,
49
+ "learning_rate": 2.389995933306222e-05,
50
+ "loss": 1.4064,
51
+ "step": 3000
52
+ },
53
+ {
54
+ "epoch": 0.47444760742849396,
55
+ "grad_norm": 16.216028213500977,
56
+ "learning_rate": 2.288328588857259e-05,
57
+ "loss": 1.3502,
58
+ "step": 3500
59
+ },
60
+ {
61
+ "epoch": 0.5422258370611359,
62
+ "grad_norm": 17.930505752563477,
63
+ "learning_rate": 2.1866612444082963e-05,
64
+ "loss": 1.3101,
65
+ "step": 4000
66
+ },
67
+ {
68
+ "epoch": 0.6100040666937779,
69
+ "grad_norm": 26.499574661254883,
70
+ "learning_rate": 2.084993899959333e-05,
71
+ "loss": 1.2922,
72
+ "step": 4500
73
+ },
74
+ {
75
+ "epoch": 0.67778229632642,
76
+ "grad_norm": 26.83368492126465,
77
+ "learning_rate": 1.9833265555103702e-05,
78
+ "loss": 1.3053,
79
+ "step": 5000
80
+ },
81
+ {
82
+ "epoch": 0.745560525959062,
83
+ "grad_norm": 22.85872459411621,
84
+ "learning_rate": 1.8816592110614073e-05,
85
+ "loss": 1.2555,
86
+ "step": 5500
87
+ },
88
+ {
89
+ "epoch": 0.813338755591704,
90
+ "grad_norm": 23.48080825805664,
91
+ "learning_rate": 1.779991866612444e-05,
92
+ "loss": 1.2068,
93
+ "step": 6000
94
+ },
95
+ {
96
+ "epoch": 0.8811169852243459,
97
+ "grad_norm": 20.919252395629883,
98
+ "learning_rate": 1.6783245221634812e-05,
99
+ "loss": 1.1991,
100
+ "step": 6500
101
+ },
102
+ {
103
+ "epoch": 0.9488952148569879,
104
+ "grad_norm": 23.9005126953125,
105
+ "learning_rate": 1.576657177714518e-05,
106
+ "loss": 1.2156,
107
+ "step": 7000
108
+ },
109
+ {
110
+ "epoch": 1.01667344448963,
111
+ "grad_norm": 22.660743713378906,
112
+ "learning_rate": 1.4749898332655551e-05,
113
+ "loss": 1.0827,
114
+ "step": 7500
115
+ },
116
+ {
117
+ "epoch": 1.0844516741222718,
118
+ "grad_norm": 25.28419303894043,
119
+ "learning_rate": 1.373322488816592e-05,
120
+ "loss": 0.8481,
121
+ "step": 8000
122
+ },
123
+ {
124
+ "epoch": 1.152229903754914,
125
+ "grad_norm": 14.510698318481445,
126
+ "learning_rate": 1.271655144367629e-05,
127
+ "loss": 0.872,
128
+ "step": 8500
129
+ },
130
+ {
131
+ "epoch": 1.2200081333875559,
132
+ "grad_norm": 29.12289810180664,
133
+ "learning_rate": 1.1699877999186661e-05,
134
+ "loss": 0.8375,
135
+ "step": 9000
136
+ },
137
+ {
138
+ "epoch": 1.287786363020198,
139
+ "grad_norm": 19.038454055786133,
140
+ "learning_rate": 1.0683204554697033e-05,
141
+ "loss": 0.8464,
142
+ "step": 9500
143
+ },
144
+ {
145
+ "epoch": 1.35556459265284,
146
+ "grad_norm": 21.09101676940918,
147
+ "learning_rate": 9.666531110207402e-06,
148
+ "loss": 0.8746,
149
+ "step": 10000
150
+ },
151
+ {
152
+ "epoch": 1.4233428222854818,
153
+ "grad_norm": 20.79250144958496,
154
+ "learning_rate": 8.649857665717772e-06,
155
+ "loss": 0.8776,
156
+ "step": 10500
157
+ },
158
+ {
159
+ "epoch": 1.491121051918124,
160
+ "grad_norm": 21.217571258544922,
161
+ "learning_rate": 7.633184221228141e-06,
162
+ "loss": 0.8523,
163
+ "step": 11000
164
+ },
165
+ {
166
+ "epoch": 1.5588992815507658,
167
+ "grad_norm": 15.557079315185547,
168
+ "learning_rate": 6.616510776738511e-06,
169
+ "loss": 0.8387,
170
+ "step": 11500
171
+ },
172
+ {
173
+ "epoch": 1.626677511183408,
174
+ "grad_norm": 14.53345012664795,
175
+ "learning_rate": 5.5998373322488825e-06,
176
+ "loss": 0.8377,
177
+ "step": 12000
178
+ },
179
+ {
180
+ "epoch": 1.6944557408160499,
181
+ "grad_norm": 26.921611785888672,
182
+ "learning_rate": 4.583163887759252e-06,
183
+ "loss": 0.8449,
184
+ "step": 12500
185
+ },
186
+ {
187
+ "epoch": 1.7622339704486918,
188
+ "grad_norm": 12.789366722106934,
189
+ "learning_rate": 3.566490443269622e-06,
190
+ "loss": 0.8547,
191
+ "step": 13000
192
+ },
193
+ {
194
+ "epoch": 1.830012200081334,
195
+ "grad_norm": 37.19759750366211,
196
+ "learning_rate": 2.549816998779992e-06,
197
+ "loss": 0.818,
198
+ "step": 13500
199
+ },
200
+ {
201
+ "epoch": 1.8977904297139758,
202
+ "grad_norm": 14.62682819366455,
203
+ "learning_rate": 1.533143554290362e-06,
204
+ "loss": 0.8128,
205
+ "step": 14000
206
+ },
207
+ {
208
+ "epoch": 1.965568659346618,
209
+ "grad_norm": 21.051790237426758,
210
+ "learning_rate": 5.164701098007319e-07,
211
+ "loss": 0.8115,
212
+ "step": 14500
213
+ },
214
+ {
215
+ "epoch": 2.0,
216
+ "step": 14754,
217
+ "total_flos": 3.541929151120589e+16,
218
+ "train_loss": 1.148573803161563,
219
+ "train_runtime": 3245.3985,
220
+ "train_samples_per_second": 54.554,
221
+ "train_steps_per_second": 4.546
222
+ }
223
+ ],
224
+ "logging_steps": 500,
225
+ "max_steps": 14754,
226
+ "num_input_tokens_seen": 0,
227
+ "num_train_epochs": 2,
228
+ "save_steps": 500,
229
+ "stateful_callbacks": {
230
+ "TrainerControl": {
231
+ "args": {
232
+ "should_epoch_stop": false,
233
+ "should_evaluate": false,
234
+ "should_log": false,
235
+ "should_save": true,
236
+ "should_training_stop": true
237
+ },
238
+ "attributes": {}
239
+ }
240
+ },
241
+ "total_flos": 3.541929151120589e+16,
242
+ "train_batch_size": 12,
243
+ "trial_name": null,
244
+ "trial_params": null
245
+ }
hpo-examples/question-answering/qa/training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe8e61ba1ca1cb106ca9adca5e9262fa9a262238814728a69256855c78c32f51
3
+ size 5304
hpo-examples/question-answering/qa/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
hpo-examples/question-answering/requirements.txt ADDED
@@ -0,0 +1,4 @@
1
+ accelerate >= 0.12.0
2
+ datasets >= 1.8.0
3
+ torch >= 1.3.0
4
+ evaluate