mtasic85 committed
Commit e15928d · 1 Parent(s): 78df5e9

pretrain core dataset

Files changed (2)
  1. README.md +2 -2
  2. scripts/pretrain-core-model.yaml +1 -1
README.md CHANGED
@@ -53,8 +53,8 @@ time python -B prepare_core_datasets.py
 ```
 
 ```
-i=0, min_len=0, max_len=1048576, block_size=2049, chunk_size=16392000, len(dataset)=3134311, len(dataset) * block_size=6422203239
-Total number of tokens in the optimized dataset '../core-data-0-0-1048576-2049-8000' is 6422203239
+i=0, min_len=0, max_len=1048576, block_size=4097, chunk_size=16388000, len(dataset)=1567386, len(dataset) * block_size=6421580442
+Total number of tokens in the optimized dataset '../core-data-0-0-1048576-4097-4000' is 6421580442
 ```
 
 ```bash
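The updated totals in the log lines above are simply the product of the packed-sequence count and the block size; a quick sanity check in plain Python (not part of the repo's scripts):

```python
# Total tokens = number of packed sequences * block size,
# matching the figures reported in the README log lines.
old_total = 3134311 * 2049  # previous dataset, block_size 2049
new_total = 1567386 * 4097  # new dataset, block_size 4097
assert old_total == 6422203239
assert new_total == 6421580442
```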
scripts/pretrain-core-model.yaml CHANGED
@@ -70,7 +70,7 @@ train:
   epochs:
 
   # Total number of tokens to train on (type: Optional[int], default: 3000000000000)
-  max_tokens: 6422203239
+  max_tokens: 6421580442
 
   # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
   max_steps:
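With `max_tokens` pinned to the total token count of the optimized dataset, the run covers the data roughly once, and the implied number of optimizer steps follows from the global batch size and block size. A rough estimate with assumed values (the batch size below is illustrative, not taken from this config):

```python
# Rough optimizer-step estimate for max_tokens = 6421580442.
# The global batch size is a hypothetical value, not read from the YAML.
max_tokens = 6_421_580_442
block_size = 4_097         # sequence length of the packed dataset
global_batch_size = 512    # assumed for illustration

tokens_per_step = global_batch_size * block_size  # 2,097,664 tokens per step
approx_steps = max_tokens // tokens_per_step
print(approx_steps)  # ~3061 steps under these assumptions
```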