Spaces:
Runtime error
Runtime error
2022-04-30 02:06:29,560 ---------------------------------------------------------------------------------------------------- | |
2022-04-30 02:06:29,563 Model: "SequenceTagger( | |
(embeddings): TransformerWordEmbeddings( | |
(model): BertModel( | |
(embeddings): BertEmbeddings( | |
(word_embeddings): Embedding(21128, 768, padding_idx=0) | |
(position_embeddings): Embedding(512, 768) | |
(token_type_embeddings): Embedding(2, 768) | |
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) | |
(dropout): Dropout(p=0.1, inplace=False) | |
) | |
(encoder): BertEncoder( | |
(layer): ModuleList( | |
(0): BertLayer( | |
(attention): BertAttention( | |
(self): BertSelfAttention( | |
(query): Linear(in_features=768, out_features=768, bias=True) | |
(key): Linear(in_features=768, out_features=768, bias=True) | |
(value): Linear(in_features=768, out_features=768, bias=True) | |
(dropout): Dropout(p=0.1, inplace=False) | |
) | |
(output): BertSelfOutput( | |
(dense): Linear(in_features=768, out_features=768, bias=True) | |
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) | |
(dropout): Dropout(p=0.1, inplace=False) | |
) | |
) | |
(intermediate): BertIntermediate( | |
(dense): Linear(in_features=768, out_features=3072, bias=True) | |
(intermediate_act_fn): GELUActivation() | |
) | |
(output): BertOutput( | |
(dense): Linear(in_features=3072, out_features=768, bias=True) | |
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) | |
(dropout): Dropout(p=0.1, inplace=False) | |
) | |
) | |
(1): BertLayer( | |
(attention): BertAttention( | |
(self): BertSelfAttention( | |
(query): Linear(in_features=768, out_features=768, bias=True) | |
(key): Linear(in_features=768, out_features=768, bias=True) | |
(value): Linear(in_features=768, out_features=768, bias=True) | |
(dropout): Dropout(p=0.1, inplace=False) | |
) | |
(output): BertSelfOutput( | |
(dense): Linear(in_features=768, out_features=768, bias=True) | |
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) | |
(dropout): Dropout(p=0.1, inplace=False) | |
) | |
) | |
(intermediate): BertIntermediate( | |
(dense): Linear(in_features=768, out_features=3072, bias=True) | |
(intermediate_act_fn): GELUActivation() | |
) | |
(output): BertOutput( | |
(dense): Linear(in_features=3072, out_features=768, bias=True) | |
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) | |
(dropout): Dropout(p=0.1, inplace=False) | |
) | |
) | |
(2): BertLayer( | |
(attention): BertAttention( | |
(self): BertSelfAttention( | |
(query): Linear(in_features=768, out_features=768, bias=True) | |
(key): Linear(in_features=768, out_features=768, bias=True) | |
(value): Linear(in_features=768, out_features=768, bias=True) | |
(dropout): Dropout(p=0.1, inplace=False) | |
) | |
(output): BertSelfOutput( | |
(dense): Linear(in_features=768, out_features=768, bias=True) | |
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) | |
(dropout): Dropout(p=0.1, inplace=False) | |
) | |
) | |
(intermediate): BertIntermediate( | |
(dense): Linear(in_features=768, out_features=3072, bias=True) | |
(intermediate_act_fn): GELUActivation() | |
) | |
(output): BertOutput( | |
(dense): Linear(in_features=3072, out_features=768, bias=True) | |
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) | |
(dropout): Dropout(p=0.1, inplace=False) | |
) | |
) | |
(3): BertLayer( | |
(attention): BertAttention( | |
(self): BertSelfAttention( | |
(query): Linear(in_features=768, out_features=768, bias=True) | |
(key): Linear(in_features=768, out_features=768, bias=True) | |
(value): Linear(in_features=768, out_features=768, bias=True) | |
(dropout): Dropout(p=0.1, inplace=False) | |
) | |
(output): BertSelfOutput( | |
(dense): Linear(in_features=768, out_features=768, bias=True) | |
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) | |
(dropout): Dropout(p=0.1, inplace=False) | |
) | |
) | |
(intermediate): BertIntermediate( | |
(dense): Linear(in_features=768, out_features=3072, bias=True) | |
(intermediate_act_fn): GELUActivation() | |
) | |
(output): BertOutput( | |
(dense): Linear(in_features=3072, out_features=768, bias=True) | |
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) | |
(dropout): Dropout(p=0.1, inplace=False) | |
) | |
) | |
(4): BertLayer( | |
(attention): BertAttention( | |
(self): BertSelfAttention( | |
(query): Linear(in_features=768, out_features=768, bias=True) | |
(key): Linear(in_features=768, out_features=768, bias=True) | |
(value): Linear(in_features=768, out_features=768, bias=True) | |
(dropout): Dropout(p=0.1, inplace=False) | |
) | |
(output): BertSelfOutput( | |
(dense): Linear(in_features=768, out_features=768, bias=True) | |
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) | |
(dropout): Dropout(p=0.1, inplace=False) | |
) | |
) | |
(intermediate): BertIntermediate( | |
(dense): Linear(in_features=768, out_features=3072, bias=True) | |
(intermediate_act_fn): GELUActivation() | |
) | |
(output): BertOutput( | |
(dense): Linear(in_features=3072, out_features=768, bias=True) | |
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) | |
(dropout): Dropout(p=0.1, inplace=False) | |
) | |
) | |
(5): BertLayer( | |
(attention): BertAttention( | |
(self): BertSelfAttention( | |
(query): Linear(in_features=768, out_features=768, bias=True) | |
(key): Linear(in_features=768, out_features=768, bias=True) | |
(value): Linear(in_features=768, out_features=768, bias=True) | |
(dropout): Dropout(p=0.1, inplace=False) | |
) | |
(output): BertSelfOutput( | |
(dense): Linear(in_features=768, out_features=768, bias=True) | |
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) | |
(dropout): Dropout(p=0.1, inplace=False) | |
) | |
) | |
(intermediate): BertIntermediate( | |
(dense): Linear(in_features=768, out_features=3072, bias=True) | |
(intermediate_act_fn): GELUActivation() | |
) | |
(output): BertOutput( | |
(dense): Linear(in_features=3072, out_features=768, bias=True) | |
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) | |
(dropout): Dropout(p=0.1, inplace=False) | |
) | |
) | |
(6): BertLayer( | |
(attention): BertAttention( | |
(self): BertSelfAttention( | |
(query): Linear(in_features=768, out_features=768, bias=True) | |
(key): Linear(in_features=768, out_features=768, bias=True) | |
(value): Linear(in_features=768, out_features=768, bias=True) | |
(dropout): Dropout(p=0.1, inplace=False) | |
) | |
(output): BertSelfOutput( | |
(dense): Linear(in_features=768, out_features=768, bias=True) | |
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) | |
(dropout): Dropout(p=0.1, inplace=False) | |
) | |
) | |
(intermediate): BertIntermediate( | |
(dense): Linear(in_features=768, out_features=3072, bias=True) | |
(intermediate_act_fn): GELUActivation() | |
) | |
(output): BertOutput( | |
(dense): Linear(in_features=3072, out_features=768, bias=True) | |
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) | |
(dropout): Dropout(p=0.1, inplace=False) | |
) | |
) | |
(7): BertLayer( | |
(attention): BertAttention( | |
(self): BertSelfAttention( | |
(query): Linear(in_features=768, out_features=768, bias=True) | |
(key): Linear(in_features=768, out_features=768, bias=True) | |
(value): Linear(in_features=768, out_features=768, bias=True) | |
(dropout): Dropout(p=0.1, inplace=False) | |
) | |
(output): BertSelfOutput( | |
(dense): Linear(in_features=768, out_features=768, bias=True) | |
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) | |
(dropout): Dropout(p=0.1, inplace=False) | |
) | |
) | |
(intermediate): BertIntermediate( | |
(dense): Linear(in_features=768, out_features=3072, bias=True) | |
(intermediate_act_fn): GELUActivation() | |
) | |
(output): BertOutput( | |
(dense): Linear(in_features=3072, out_features=768, bias=True) | |
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) | |
(dropout): Dropout(p=0.1, inplace=False) | |
) | |
) | |
(8): BertLayer( | |
(attention): BertAttention( | |
(self): BertSelfAttention( | |
(query): Linear(in_features=768, out_features=768, bias=True) | |
(key): Linear(in_features=768, out_features=768, bias=True) | |
(value): Linear(in_features=768, out_features=768, bias=True) | |
(dropout): Dropout(p=0.1, inplace=False) | |
) | |
(output): BertSelfOutput( | |
(dense): Linear(in_features=768, out_features=768, bias=True) | |
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) | |
(dropout): Dropout(p=0.1, inplace=False) | |
) | |
) | |
(intermediate): BertIntermediate( | |
(dense): Linear(in_features=768, out_features=3072, bias=True) | |
(intermediate_act_fn): GELUActivation() | |
) | |
(output): BertOutput( | |
(dense): Linear(in_features=3072, out_features=768, bias=True) | |
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) | |
(dropout): Dropout(p=0.1, inplace=False) | |
) | |
) | |
(9): BertLayer( | |
(attention): BertAttention( | |
(self): BertSelfAttention( | |
(query): Linear(in_features=768, out_features=768, bias=True) | |
(key): Linear(in_features=768, out_features=768, bias=True) | |
(value): Linear(in_features=768, out_features=768, bias=True) | |
(dropout): Dropout(p=0.1, inplace=False) | |
) | |
(output): BertSelfOutput( | |
(dense): Linear(in_features=768, out_features=768, bias=True) | |
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) | |
(dropout): Dropout(p=0.1, inplace=False) | |
) | |
) | |
(intermediate): BertIntermediate( | |
(dense): Linear(in_features=768, out_features=3072, bias=True) | |
(intermediate_act_fn): GELUActivation() | |
) | |
(output): BertOutput( | |
(dense): Linear(in_features=3072, out_features=768, bias=True) | |
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) | |
(dropout): Dropout(p=0.1, inplace=False) | |
) | |
) | |
(10): BertLayer( | |
(attention): BertAttention( | |
(self): BertSelfAttention( | |
(query): Linear(in_features=768, out_features=768, bias=True) | |
(key): Linear(in_features=768, out_features=768, bias=True) | |
(value): Linear(in_features=768, out_features=768, bias=True) | |
(dropout): Dropout(p=0.1, inplace=False) | |
) | |
(output): BertSelfOutput( | |
(dense): Linear(in_features=768, out_features=768, bias=True) | |
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) | |
(dropout): Dropout(p=0.1, inplace=False) | |
) | |
) | |
(intermediate): BertIntermediate( | |
(dense): Linear(in_features=768, out_features=3072, bias=True) | |
(intermediate_act_fn): GELUActivation() | |
) | |
(output): BertOutput( | |
(dense): Linear(in_features=3072, out_features=768, bias=True) | |
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) | |
(dropout): Dropout(p=0.1, inplace=False) | |
) | |
) | |
(11): BertLayer( | |
(attention): BertAttention( | |
(self): BertSelfAttention( | |
(query): Linear(in_features=768, out_features=768, bias=True) | |
(key): Linear(in_features=768, out_features=768, bias=True) | |
(value): Linear(in_features=768, out_features=768, bias=True) | |
(dropout): Dropout(p=0.1, inplace=False) | |
) | |
(output): BertSelfOutput( | |
(dense): Linear(in_features=768, out_features=768, bias=True) | |
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) | |
(dropout): Dropout(p=0.1, inplace=False) | |
) | |
) | |
(intermediate): BertIntermediate( | |
(dense): Linear(in_features=768, out_features=3072, bias=True) | |
(intermediate_act_fn): GELUActivation() | |
) | |
(output): BertOutput( | |
(dense): Linear(in_features=3072, out_features=768, bias=True) | |
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) | |
(dropout): Dropout(p=0.1, inplace=False) | |
) | |
) | |
) | |
) | |
(pooler): BertPooler( | |
(dense): Linear(in_features=768, out_features=768, bias=True) | |
(activation): Tanh() | |
) | |
) | |
) | |
(word_dropout): WordDropout(p=0.05) | |
(locked_dropout): LockedDropout(p=0.5) | |
(embedding2nn): Linear(in_features=768, out_features=768, bias=True) | |
(rnn): LSTM(768, 256, batch_first=True, bidirectional=True) | |
(linear): Linear(in_features=512, out_features=5, bias=True) | |
(loss_function): CrossEntropyLoss() | |
)" | |
2022-04-30 02:06:29,565 ---------------------------------------------------------------------------------------------------- | |
2022-04-30 02:06:29,566 Corpus: "Corpus: 8010 train + 2670 dev + 2670 test sentences" | |
2022-04-30 02:06:29,567 ---------------------------------------------------------------------------------------------------- | |
2022-04-30 02:06:29,567 Parameters: | |
2022-04-30 02:06:29,568 - learning_rate: "0.010000" | |
2022-04-30 02:06:29,569 - mini_batch_size: "4" | |
2022-04-30 02:06:29,570 - patience: "3" | |
2022-04-30 02:06:29,571 - anneal_factor: "0.5" | |
2022-04-30 02:06:29,571 - max_epochs: "5" | |
2022-04-30 02:06:29,572 - shuffle: "False" | |
2022-04-30 02:06:29,573 - train_with_dev: "False" | |
2022-04-30 02:06:29,574 - batch_growth_annealing: "False" | |
2022-04-30 02:06:29,575 ---------------------------------------------------------------------------------------------------- | |
2022-04-30 02:06:29,575 Model training base path: "squad_qst_ext_ask" | |
2022-04-30 02:06:29,576 ---------------------------------------------------------------------------------------------------- | |
2022-04-30 02:06:29,577 Device: cuda:0 | |
2022-04-30 02:06:29,578 ---------------------------------------------------------------------------------------------------- | |
2022-04-30 02:06:29,578 Embeddings storage mode: cpu | |
2022-04-30 02:06:29,579 ---------------------------------------------------------------------------------------------------- | |
2022-04-30 02:06:51,308 epoch 1 - iter 200/2003 - loss 0.30899966 - samples/sec: 36.85 - lr: 0.010000 | |
2022-04-30 02:07:12,758 epoch 1 - iter 400/2003 - loss 0.17167131 - samples/sec: 37.33 - lr: 0.010000 | |
2022-04-30 02:07:33,991 epoch 1 - iter 600/2003 - loss 0.12144460 - samples/sec: 37.71 - lr: 0.010000 | |
2022-04-30 02:07:54,841 epoch 1 - iter 800/2003 - loss 0.09428936 - samples/sec: 38.40 - lr: 0.010000 | |
2022-04-30 02:08:15,951 epoch 1 - iter 1000/2003 - loss 0.07690232 - samples/sec: 37.93 - lr: 0.010000 | |
2022-04-30 02:08:36,969 epoch 1 - iter 1200/2003 - loss 0.06530437 - samples/sec: 38.09 - lr: 0.010000 | |
2022-04-30 02:08:57,656 epoch 1 - iter 1400/2003 - loss 0.05648796 - samples/sec: 38.70 - lr: 0.010000 | |
2022-04-30 02:09:18,255 epoch 1 - iter 1600/2003 - loss 0.04988396 - samples/sec: 38.87 - lr: 0.010000 | |
2022-04-30 02:09:39,176 epoch 1 - iter 1800/2003 - loss 0.04459321 - samples/sec: 38.27 - lr: 0.010000 | |
2022-04-30 02:09:59,865 epoch 1 - iter 2000/2003 - loss 0.04081647 - samples/sec: 38.70 - lr: 0.010000 | |
2022-04-30 02:10:00,136 ---------------------------------------------------------------------------------------------------- | |
2022-04-30 02:10:00,137 EPOCH 1 done: loss 0.0408 - lr 0.010000 | |
2022-04-30 02:10:24,802 Evaluating as a multi-label problem: False | |
2022-04-30 02:10:24,831 DEV : loss 0.001589686726219952 - f1-score (micro avg) 0.9996 | |
2022-04-30 02:10:25,108 BAD EPOCHS (no improvement): 0 | |
2022-04-30 02:10:25,117 saving best model | |
2022-04-30 02:10:25,914 ---------------------------------------------------------------------------------------------------- | |
2022-04-30 02:10:48,401 epoch 2 - iter 200/2003 - loss 0.00235252 - samples/sec: 35.61 - lr: 0.010000 | |
2022-04-30 02:11:10,750 epoch 2 - iter 400/2003 - loss 0.00250680 - samples/sec: 35.83 - lr: 0.010000 | |
2022-04-30 02:11:33,084 epoch 2 - iter 600/2003 - loss 0.00397226 - samples/sec: 35.85 - lr: 0.010000 | |