Update tokenize_and_upload_mistral.py
Browse files
tokenize_and_upload_mistral.py
CHANGED
@@ -17,8 +17,8 @@ HF_TOKEN = os.getenv("HF_TOKEN")
|
|
17 |
SOURCE_DATASET_ID = "UcsTurkey/turkish-general-culture-chunks"
|
18 |
TRAIN_TARGET_DATASET_ID = "UcsTurkey/turkish-general-culture-tokenized"
|
19 |
BUFFER_SIZE = 5
|
20 |
-
START_CHUNK_NUMBER =
|
21 |
-
PROCESS_CHUNK_COUNT =
|
22 |
|
23 |
CHUNK_FOLDER = "/data/chunks"
|
24 |
PARQUET_FOLDER = "/data/tokenized_chunks"
|
|
|
17 |
SOURCE_DATASET_ID = "UcsTurkey/turkish-general-culture-chunks"
|
18 |
TRAIN_TARGET_DATASET_ID = "UcsTurkey/turkish-general-culture-tokenized"
|
19 |
BUFFER_SIZE = 5
|
20 |
+
START_CHUNK_NUMBER = 776
|
21 |
+
PROCESS_CHUNK_COUNT = 2
|
22 |
|
23 |
CHUNK_FOLDER = "/data/chunks"
|
24 |
PARQUET_FOLDER = "/data/tokenized_chunks"
|