File size: 2,864 Bytes
64094d4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# Base configuration. The &DEFAULT anchor lets the other sections of this file
# pull these values in with a `<<: *DEFAULT` merge key and override only the
# keys they list. Booleans carry an explicit `!!bool` tag to force their type.
default: &DEFAULT

  # --- data ---
  # Paths below are relative; what they are resolved against is decided by the
  # consuming code, which is not visible here.
  exp_dir: carnatic/ckpts
  metadata_labeled_path: labeled_small_wav_metadata.json
  metadata_unlabeled_path: unlabeled_data/unlabeled_mp3_metadata.json
  num_files_per_raga_path: metadata_small.json
  num_classes: 150
  # presumably the fraction of the dataset to use (1 = all) — TODO confirm
  use_frac: 1
  use_unlabeled_data: !!bool False
  labeled_data_dir: labeled_data_small
  # presumably seconds per clip and samples per second — TODO confirm units
  clip_length: 30
  sample_rate: 8000
  normalize: !!bool True

  # --- training ---
  batch_size: 16
  num_data_workers: 16
  n_epochs: 100
  lr: 0.001
  class_imbalance_weights: !!bool False
  # presumably an early-stopping patience counted in epochs — TODO confirm
  patience: 10
  # presumably the share of labeled data used for training — TODO confirm
  train_frac: 0.8
  
  # --- model ---
  # Other sections of this file override `model` with 'resnet' or 'wav2vec'.
  model: 'base'
  n_input: 2 #stereo
  stride: 16
  n_channel: 32
  max_pool_every: 1
  
  # --- logging ---
  save_checkpoint: !!bool False
  # NOTE(review): plain-text credential checked into the config. Consider
  # rotating it and supplying it outside version control (wandb can read the
  # WANDB_API_KEY environment variable) — confirm how the loader uses this key.
  wandb_api_key: f7892f37dd96b5f1da5c85a410300bb661f3c4de
  log_to_wandb: !!bool False
  

# Variant of the base config that points at the "0.7" labeled-data files.
# Every key not listed here is inherited from DEFAULT through the merge key.
# NOTE(review): what "0.7" denotes is not stated in this file — confirm.
default_0.7: &DEFAULT_0.7

  <<: *DEFAULT
  metadata_labeled_path: labeled_0.7_wav_metadata.json
  num_files_per_raga_path: metadata_0.7.json
  labeled_data_dir: labeled_data_0.7

# Variant of the base config that points at the "0.9" labeled-data files.
# Besides the data paths it also overrides train_frac (0.8 -> 0.85) and
# num_classes (150 -> 200); everything else is inherited from DEFAULT.
# NOTE(review): what "0.9" denotes is not stated in this file — confirm.
default_0.9: &DEFAULT_0.9

  <<: *DEFAULT
  metadata_labeled_path: labeled_0.9_wav_metadata.json
  num_files_per_raga_path: metadata_0.9.json
  labeled_data_dir: labeled_data_0.9
  train_frac: 0.85
  num_classes: 200

  
# ResNet model on the base ("small") dataset settings. Inherits DEFAULT,
# switches `model` to 'resnet', adds n_blocks and widens n_channel (32 -> 128).
# The &RESNET anchor is not referenced anywhere in the visible part of the file.
resnet: &RESNET

  <<: *DEFAULT
  model: 'resnet'
  n_blocks: 5 #for resnet
  n_channel: 128

# ResNet model on the "0.7" dataset variant. Inherits DEFAULT_0.7 and
# overrides the model selection, block count and channel width.
resnet_0.7: &RESNET_0.7

  <<: *DEFAULT_0.7
  model: 'resnet'
  n_blocks: 10 #for resnet
  n_channel: 300
  # Same value as the one inherited from DEFAULT via DEFAULT_0.7; restated here.
  num_classes: 150

# ResNet model on the "0.9" dataset variant. Inherits DEFAULT_0.9 (including
# its num_classes: 200 and train_frac: 0.85) and overrides the model settings.
resnet_0.9: &RESNET_0.9

  <<: *DEFAULT_0.9
  model: 'resnet'
  n_blocks: 10 #for resnet
  n_channel: 350
  # NOTE(review): the value 1 equals the inherited default, while the trailing
  # comment says "every other" block, which might imply 2 — confirm the intent.
  max_pool_every: 1 #downsample every other res block


# wav2vec model on the "0.7" dataset variant. Inherits DEFAULT_0.7 and
# overrides the model selection and the input channel count, then adds the
# feature-extractor / transformer-encoder settings below.
wav2vec_0.7: &WAV2VEC_0.7
  <<: *DEFAULT_0.7

  model: 'wav2vec'
  n_input: 1 #mono

  #transformer parameters (this config leads to around 29M params)
  # NOTE(review): the key names match the keyword arguments of torchaudio's
  # wav2vec2_model — presumably they are forwarded to it; confirm in the model code.
  extractor_mode: "layer_norm"
  # Written as YAML null: the bare word None is loaded as the string "None",
  # not as a null/None value. Hardcoded for now, fix this at some point.
  extractor_conv_layer_config: null
  extractor_conv_bias: !!bool True
  encoder_embed_dim: 512
  encoder_projection_dropout: 0
  encoder_pos_conv_kernel: 3
  encoder_pos_conv_groups: 32
  encoder_num_layers: 12
  encoder_num_heads: 16
  encoder_attention_dropout: 0
  encoder_ff_interm_features: 1024
  encoder_ff_interm_dropout: 0
  encoder_dropout: 0
  encoder_layer_norm_first: !!bool True
  encoder_layer_drop: 0


# wav2vec model on the "0.9" dataset variant. Inherits DEFAULT_0.9 (including
# its num_classes: 200 and train_frac: 0.85) and overrides the model selection
# and the input channel count, then adds the extractor / encoder settings below.
wav2vec_0.9: &WAV2VEC_0.9
  <<: *DEFAULT_0.9

  model: 'wav2vec'
  n_input: 1 #mono

  #transformer parameters (this config leads to around 29M params)
  # NOTE(review): the key names match the keyword arguments of torchaudio's
  # wav2vec2_model — presumably they are forwarded to it; confirm in the model code.
  extractor_mode: "layer_norm"
  # Written as YAML null: the bare word None is loaded as the string "None",
  # not as a null/None value. Hardcoded for now, fix this at some point.
  extractor_conv_layer_config: null
  extractor_conv_bias: !!bool True
  encoder_embed_dim: 512
  encoder_projection_dropout: 0
  encoder_pos_conv_kernel: 3
  encoder_pos_conv_groups: 32
  encoder_num_layers: 12
  encoder_num_heads: 16
  encoder_attention_dropout: 0
  encoder_ff_interm_features: 1024
  encoder_ff_interm_dropout: 0
  encoder_dropout: 0
  encoder_layer_norm_first: !!bool True
  encoder_layer_drop: 0