Upload loaders.py with huggingface_hub
Browse files- loaders.py +32 -13
loaders.py
CHANGED
|
@@ -36,9 +36,12 @@ from tqdm import tqdm
|
|
| 36 |
|
| 37 |
from .logging_utils import get_logger
|
| 38 |
from .operator import SourceOperator
|
|
|
|
| 39 |
from .stream import MultiStream, Stream
|
| 40 |
|
| 41 |
logger = get_logger()
|
|
|
|
|
|
|
| 42 |
try:
|
| 43 |
import ibm_boto3
|
| 44 |
|
|
@@ -88,16 +91,23 @@ class LoadHF(Loader):
|
|
| 88 |
NotImplementedError
|
| 89 |
): # streaming is not supported for zipped files so we load without streaming
|
| 90 |
with tempfile.TemporaryDirectory() as dir_to_be_deleted:
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
if self.split is None:
|
| 102 |
for split in dataset.keys():
|
| 103 |
dataset[split] = dataset[split].to_iterable_dataset()
|
|
@@ -268,9 +278,18 @@ class LoadFromIBMCloud(Loader):
|
|
| 268 |
if self.data_dir is not None
|
| 269 |
else data_file
|
| 270 |
)
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 274 |
|
| 275 |
if isinstance(self.data_files, list):
|
| 276 |
dataset = hf_load_dataset(local_dir, streaming=False)
|
|
|
|
| 36 |
|
| 37 |
from .logging_utils import get_logger
|
| 38 |
from .operator import SourceOperator
|
| 39 |
+
from .settings_utils import get_settings
|
| 40 |
from .stream import MultiStream, Stream
|
| 41 |
|
| 42 |
logger = get_logger()
|
| 43 |
+
settings = get_settings()
|
| 44 |
+
|
| 45 |
try:
|
| 46 |
import ibm_boto3
|
| 47 |
|
|
|
|
| 91 |
NotImplementedError
|
| 92 |
): # streaming is not supported for zipped files so we load without streaming
|
| 93 |
with tempfile.TemporaryDirectory() as dir_to_be_deleted:
|
| 94 |
+
try:
|
| 95 |
+
dataset = hf_load_dataset(
|
| 96 |
+
self.path,
|
| 97 |
+
name=self.name,
|
| 98 |
+
data_dir=self.data_dir,
|
| 99 |
+
data_files=self.data_files,
|
| 100 |
+
streaming=False,
|
| 101 |
+
keep_in_memory=True,
|
| 102 |
+
cache_dir=dir_to_be_deleted,
|
| 103 |
+
split=self.split,
|
| 104 |
+
trust_remote_code=settings.allow_unverified_code,
|
| 105 |
+
)
|
| 106 |
+
except ValueError as e:
|
| 107 |
+
if "trust_remote_code" in str(e):
|
| 108 |
+
raise ValueError(
|
| 109 |
+
f"{self.__class__.__name__} cannot run remote code from huggingface without setting unitxt.settings.allow_unverified_code=True or by setting environment variable: UNITXT_ALLOW_UNVERIFIED_CODE."
|
| 110 |
+
) from e
|
| 111 |
if self.split is None:
|
| 112 |
for split in dataset.keys():
|
| 113 |
dataset[split] = dataset[split].to_iterable_dataset()
|
|
|
|
| 278 |
if self.data_dir is not None
|
| 279 |
else data_file
|
| 280 |
)
|
| 281 |
+
with tempfile.NamedTemporaryFile() as temp_file:
|
| 282 |
+
# Download to a temporary file in same file partition, and then do an atomic move
|
| 283 |
+
self._download_from_cos(
|
| 284 |
+
cos,
|
| 285 |
+
self.bucket_name,
|
| 286 |
+
object_key,
|
| 287 |
+
local_dir + "/" + os.path.basename(temp_file.name),
|
| 288 |
+
)
|
| 289 |
+
os.rename(
|
| 290 |
+
local_dir + "/" + os.path.basename(temp_file.name),
|
| 291 |
+
local_dir + "/" + data_file,
|
| 292 |
+
)
|
| 293 |
|
| 294 |
if isinstance(self.data_files, list):
|
| 295 |
dataset = hf_load_dataset(local_dir, streaming=False)
|