Support loading data files from a local directory
Browse files
ref: https://huggingface.co/docs/datasets/v2.13.0/en/package_reference/loading_methods#datasets.load_dataset.path
- src/axolotl/utils/data.py +20 -7
src/axolotl/utils/data.py
CHANGED
|
@@ -102,13 +102,26 @@ def load_tokenized_prepared_datasets(
|
|
| 102 |
pass
|
| 103 |
|
| 104 |
# prefer local dataset, even if hub exists
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
elif ds_from_hub:
|
| 113 |
if d.data_files:
|
| 114 |
ds = load_dataset(
|
|
|
|
| 102 |
pass
|
| 103 |
|
| 104 |
# prefer local dataset, even if hub exists
|
| 105 |
+
local_path = Path(d.path)
|
| 106 |
+
if local_path.exists():
|
| 107 |
+
if local_path.is_dir():
|
| 108 |
+
ds = load_dataset(
|
| 109 |
+
d.path,
|
| 110 |
+
data_files=d.data_files,
|
| 111 |
+
streaming=False,
|
| 112 |
+
split=None,
|
| 113 |
+
)
|
| 114 |
+
elif local_path.is_file():
|
| 115 |
+
ds = load_dataset(
|
| 116 |
+
"json",
|
| 117 |
+
data_files=d.path,
|
| 118 |
+
streaming=False,
|
| 119 |
+
split=None,
|
| 120 |
+
)
|
| 121 |
+
else:
|
| 122 |
+
raise ValueError(
|
| 123 |
+
"unhandled dataset load: local path exists, but is neither a directory or a file"
|
| 124 |
+
)
|
| 125 |
elif ds_from_hub:
|
| 126 |
if d.data_files:
|
| 127 |
ds = load_dataset(
|