Upload folder using huggingface_hub
Browse files- .gitattributes +1 -0
- README.md +42 -0
- added_tokens.json +3 -0
- config.json +29 -0
- pytorch_model.bin +3 -0
- special_tokens_map.json +33 -0
- tokenizer.json +3 -0
- tokenizer.model +3 -0
- tokenizer_config.json +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
language:
|
3 |
+
- sk
|
4 |
+
tags:
|
5 |
+
- HPLT
|
6 |
+
- decoder
|
7 |
+
license: apache-2.0
|
8 |
+
datasets:
|
9 |
+
- HPLT/HPLT2.0_cleaned
|
10 |
+
---
|
11 |
+
|
12 |
+
# HPLT v2.0 - Cleaned - Slovak
|
13 |
+
|
14 |
+
<img src="https://hplt-project.org/_next/static/media/logo-hplt.d5e16ca5.svg" width=12.5%>
|
15 |
+
|
16 |
+
|
17 |
+
|
18 |
+
This is one of the decoder-only language models trained on [HPLT2.0_cleaned](https://huggingface.co/datasets/HPLT/HPLT2.0_cleaned).
|
19 |
+
|
20 |
+
All the HPLT decoder-only models use the same hyper-parameters, roughly following the llama architecture with 2.15B parameters in total:
|
21 |
+
- hidden size: 2048
|
22 |
+
- attention heads: 32
|
23 |
+
- layers: 24
|
24 |
+
- sequence length: 2048
|
25 |
+
## Intermediate checkpoints
|
26 |
+
|
27 |
+
We are releasing intermediate checkpoints for each model at intervals of every 1000 training steps in separate branches. The naming convention is `checkpoint_00xxxx00`: for example, `checkpoint_0005000`. The checkpoints range from checkpoint_0001000 to checkpoint_0047684 and the latter is in the main branch.
|
28 |
+
|
29 |
+
|
30 |
+
## Cite us
|
31 |
+
|
32 |
+
```bibtex
|
33 |
+
@misc{burchell2025expandedmassivemultilingualdataset,
|
34 |
+
title={An Expanded Massive Multilingual Dataset for High-Performance Language Technologies},
|
35 |
+
author={Laurie Burchell and Ona de Gibert and Nikolay Arefyev and Mikko Aulamo and Marta Bañón and Pinzhen Chen and Mariia Fedorova and Liane Guillou and Barry Haddow and Jan Hajič and Jindřich Helcl and Erik Henriksson and Mateusz Klimaszewski and Ville Komulainen and Andrey Kutuzov and Joona Kytöniemi and Veronika Laippala and Petter Mæhlum and Bhavitvya Malik and Farrokh Mehryary and Vladislav Mikhailov and Nikita Moghe and Amanda Myntti and Dayyán O'Brien and Stephan Oepen and Proyag Pal and Jousia Piha and Sampo Pyysalo and Gema Ramírez-Sánchez and David Samuel and Pavel Stepachev and Jörg Tiedemann and Dušan Variš and Tereza Vojtěchová and Jaume Zaragoza-Bernabeu},
|
36 |
+
year={2025},
|
37 |
+
eprint={2503.10267},
|
38 |
+
archivePrefix={arXiv},
|
39 |
+
primaryClass={cs.CL},
|
40 |
+
url={https://arxiv.org/abs/2503.10267},
|
41 |
+
}
|
42 |
+
```
|
added_tokens.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"<image_soft_token>": 262144
|
3 |
+
}
|
config.json
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"architectures": [
|
3 |
+
"LlamaForCausalLM"
|
4 |
+
],
|
5 |
+
"attention_bias": false,
|
6 |
+
"attention_dropout": 0.0,
|
7 |
+
"bos_token_id": 1,
|
8 |
+
"eos_token_id": 2,
|
9 |
+
"head_dim": 64,
|
10 |
+
"hidden_act": "silu",
|
11 |
+
"hidden_size": 2048,
|
12 |
+
"initializer_range": 0.02,
|
13 |
+
"intermediate_size": 8192,
|
14 |
+
"max_position_embeddings": 2048,
|
15 |
+
"mlp_bias": false,
|
16 |
+
"model_type": "llama",
|
17 |
+
"num_attention_heads": 32,
|
18 |
+
"num_hidden_layers": 24,
|
19 |
+
"num_key_value_heads": 32,
|
20 |
+
"pretraining_tp": 1,
|
21 |
+
"rms_norm_eps": 1e-05,
|
22 |
+
"rope_scaling": null,
|
23 |
+
"rope_theta": 10000,
|
24 |
+
"tie_word_embeddings": true,
|
25 |
+
"torch_dtype": "bfloat16",
|
26 |
+
"transformers_version": "4.48.2",
|
27 |
+
"use_cache": true,
|
28 |
+
"vocab_size": 262272
|
29 |
+
}
|
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:589e5ba14454a80eb2b6dca8b928f1898857ab5a80e5c6fd7ce00f104f4c83d6
|
3 |
+
size 4295753374
|
special_tokens_map.json
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"boi_token": "<start_of_image>",
|
3 |
+
"bos_token": {
|
4 |
+
"content": "<bos>",
|
5 |
+
"lstrip": false,
|
6 |
+
"normalized": false,
|
7 |
+
"rstrip": false,
|
8 |
+
"single_word": false
|
9 |
+
},
|
10 |
+
"eoi_token": "<end_of_image>",
|
11 |
+
"eos_token": {
|
12 |
+
"content": "<eos>",
|
13 |
+
"lstrip": false,
|
14 |
+
"normalized": false,
|
15 |
+
"rstrip": false,
|
16 |
+
"single_word": false
|
17 |
+
},
|
18 |
+
"image_token": "<image_soft_token>",
|
19 |
+
"pad_token": {
|
20 |
+
"content": "<pad>",
|
21 |
+
"lstrip": false,
|
22 |
+
"normalized": false,
|
23 |
+
"rstrip": false,
|
24 |
+
"single_word": false
|
25 |
+
},
|
26 |
+
"unk_token": {
|
27 |
+
"content": "<unk>",
|
28 |
+
"lstrip": false,
|
29 |
+
"normalized": false,
|
30 |
+
"rstrip": false,
|
31 |
+
"single_word": false
|
32 |
+
}
|
33 |
+
}
|
tokenizer.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4667f2089529e8e7657cfb6d1c19910ae71ff5f28aa7ab2ff2763330affad795
|
3 |
+
size 33384568
|
tokenizer.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
|
3 |
+
size 4689074
|
tokenizer_config.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|