Commit 88089e8
Parent(s): 168a7a0

Add ability to pass 'name' argument to load_dataset

Changed file: src/axolotl/utils/data.py (+13, -14)
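For context: in `datasets.load_dataset`, `name` selects a dataset configuration (subset) within a repository; the diff below threads a per-dataset `name` value through every `load_dataset` call in `load_tokenized_prepared_datasets`. A minimal sketch of that part of the `datasets` API, using an illustrative dataset and configuration name:

from datasets import load_dataset

# `name` picks a configuration (subset) of the dataset repository;
# leaving it as None loads the default configuration instead.
ds = load_dataset(
    "wikitext",                # dataset repository on the Hugging Face Hub
    name="wikitext-2-raw-v1",  # configuration selected via `name`
    split="train",
)
print(ds)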
src/axolotl/utils/data.py
@@ -94,6 +94,7 @@ def load_tokenized_prepared_datasets(
         try:
             load_dataset(
                 d.path,
+                name=d.name,
                 streaming=True,
                 use_auth_token=use_auth_token,
             )
@@ -107,6 +108,7 @@ def load_tokenized_prepared_datasets(
             if local_path.is_dir():
                 ds = load_dataset(
                     d.path,
+                    name=d.name,
                     data_files=d.data_files,
                     streaming=False,
                     split=None,
@@ -114,6 +116,7 @@ def load_tokenized_prepared_datasets(
             elif local_path.is_file():
                 ds = load_dataset(
                     "json",
+                    name=d.name,
                     data_files=d.path,
                     streaming=False,
                     split=None,
@@ -123,26 +126,22 @@ def load_tokenized_prepared_datasets(
                     "unhandled dataset load: local path exists, but is neither a directory or a file"
                 )
         elif ds_from_hub:
-            if d.data_files:
-                ds = load_dataset(
-                    d.path,
-                    streaming=False,
-                    data_files=d.data_files,
-                    use_auth_token=use_auth_token,
-                )
-            else:
-                ds = load_dataset(
-                    d.path,
-                    streaming=False,
-                    use_auth_token=use_auth_token,
-                )
+            ds = load_dataset(
+                d.path,
+                name=d.name,
+                streaming=False,
+                data_files=d.data_files,
+                use_auth_token=use_auth_token,
+            )
         else:
             fp = hf_hub_download(
                 repo_id=d.path,
                 repo_type="dataset",
                 filename=d.data_files,
             )
-            ds = load_dataset("json", data_files=fp, streaming=False, split=None)
+            ds = load_dataset(
+                "json", name=d.name, data_files=fp, streaming=False, split=None
+            )
         if not ds:
             raise ValueError("unhandled dataset load")
         # support for using a subset of the data
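Usage note on the reworked Hub branch: `name` defaults to None in `datasets.load_dataset`, so dataset entries that do not set a name keep their previous behavior. A minimal sketch of the resulting call pattern, with `types.SimpleNamespace` standing in for axolotl's dataset config object (the real config class is not shown in this diff):

from types import SimpleNamespace

from datasets import load_dataset

# Illustrative stand-in for a dataset entry carrying the attributes used above;
# axolotl's actual config object may have more fields.
d = SimpleNamespace(path="squad", name=None, data_files=None)

# Mirrors the updated `elif ds_from_hub:` branch (minus the auth token):
# the configuration name is forwarded, and name=None falls back to the
# dataset's default configuration.
ds = load_dataset(
    d.path,
    name=d.name,
    streaming=False,
    data_files=d.data_files,
)
print(ds)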