load explicit splits on datasets (#1652)
Browse files
src/axolotl/utils/data/sft.py
CHANGED
@@ -308,12 +308,16 @@ def load_tokenized_prepared_datasets(
 308 |                         "unhandled dataset load: local path exists, but is neither a directory or a file"
 309 |                     )
 310 |             elif ds_from_hub:
 311 |                 ds = load_dataset(
 312 |                     config_dataset.path,
 313 |                     name=config_dataset.name,
 314 |                     streaming=False,
 315 |                     data_files=config_dataset.data_files,
 316 |                     token=use_auth_token,
 317 |                 )
 318 |             elif ds_from_cloud and remote_file_system:
 319 |                 if remote_file_system.isdir(config_dataset.path):
 308 |                         "unhandled dataset load: local path exists, but is neither a directory or a file"
 309 |                     )
 310 |             elif ds_from_hub:
 311 | +               load_ds_kwargs = {}
 312 | +               if config_dataset.split:
 313 | +                   load_ds_kwargs = {"split": config_dataset.split}
 314 |                 ds = load_dataset(
 315 |                     config_dataset.path,
 316 |                     name=config_dataset.name,
 317 |                     streaming=False,
 318 |                     data_files=config_dataset.data_files,
 319 |                     token=use_auth_token,
 320 | +                   **load_ds_kwargs,
 321 |                 )
 322 |             elif ds_from_cloud and remote_file_system:
 323 |                 if remote_file_system.isdir(config_dataset.path):