# Template config: set dump_dir, data.root_dir, data.preprocess_dir, and the tokenizer path
# (data.tokenizer_args.init_kwargs.bpe_tokenizer_path) before running.
# Evals can be enabled by uncommenting their config.
# Example launch:
# python -m launchers.stool config=apps/main/configs/debug.yaml nodes=8 account=fair_amaia_cw_codegen qos=lowest

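# Run settings: output directory, run name, total training steps, optional probe frequency, and RNG seed.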
dump_dir: /tmp/
name: "debug"
steps: 100_000
probe_freq: null
seed: 777
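
# Optimizer and LR schedule: peak learning rate, warmup steps, minimum-LR ratio, and gradient clipping.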
optim:
  lr: 4e-04
  warmup: 500
  lr_min_ratio: 0.1
  clip: 10.0

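# Distributed training: fully sharded FSDP, bf16 model weights, TF32 matmuls disabled, tensor-parallel size 1.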
distributed:
  fsdp_type: full_shard
  model_dtype: bf16
  matmul_allow_tf32: false
  selective_activation_checkpointing: false
  tp_size: 1

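# BLT model hyperparameters (approximate description): byte-level local encoder/decoder (dim 256) around a
# latent transformer (dim 512), space-based patching, hash n-gram byte embeddings for the encoder, and
# cross-attention between byte and patch representations.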
model:
  n_heads: 8
  dim: 512
  vocab_size: 260
  dim_token: 256
  patch_size: 6
  patching_mode: "space"
  tie_local_encoder_decoder_logits: false
  patch_in_forward: false
  max_encoder_seq_length: 12288
  pad_to_max_length: true
  patching_threshold: 3.1439168453216553
  encoder_hash_byte_group_size: [4]
  encoder_hash_byte_group_vocab: 50002
  encoder_hash_byte_group_nb_functions: 3
  encoder_enable_byte_ngrams: false
  cross_attn_encoder: true # assuming cross_attention is true
  cross_attn_decoder: true # assuming cross_attention is true
  cross_attn_window_encoder: 512
  cross_attn_window_decoder: 512
  dim_local_encoder: 256
  dim_local_decoder: 256
  cross_attn_k: 8
  cross_attn_nheads: 4
  cross_attn_all_layers_decoder: true
  cross_attn_all_layers_encoder: true
  cross_attn_use_flex_attention: true
  cross_attn_init_by_pooling: true
  log_patch_lengths: true
  non_linearity: "swiglu"
  use_rope: true
  recompute_fc1_out: false
  recompute_fc3_out: false
  recompute_attn: false
  custom_bwd: false
  layer_ckpt: "none"
  use_local_encoder_transformer: true
  init_use_gaussian: true
  init_use_depth: "current"
  attn_impl: "xformers"
  attn_bias_type: "block_causal"
  alpha_depth: "disabled"
  max_length: 256
  local_attention_window_len: 512
  max_seqlen: 12288
  downsampling_by_pooling: "max"

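# Data pipeline: source mixture weights, batch size, prefetching, byte sequence length, async loading,
# and the BLT tokenizer settings; all ??? fields must be filled in before training.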
data:
  root_dir: ???
  sources:
    dclm_baseline_1.0: 1.0
  batch_size: 2
  prefetch_size: 64
  seq_len: 4096
  load_async: true
  preprocess_dir: ???
  tokenizer_args:
    name: blt
    init_kwargs:
      bpe_tokenizer_path: ???

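# Profiler disabled for this debug run.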
profiling:
  run: false

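# Checkpointing: save every 500 steps keeping the last 3; eval-tagged checkpoints every 1000 steps (keep: -1).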
checkpoint:
  dump:
    every: 500
    keep: 3
  eval:
    every: 1000
    keep: -1

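# Log metrics every 10 steps.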
logging:
  freq: 10

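# Evaluation runs on 8 GPUs; eval is disabled (null) until an eval config is provided (see the note at the top of the file).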
eval_on_gpus: 8
eval: null