Spaces:
Runtime error
Runtime error
import json | |
import os | |
import pytest | |
import fsspec | |
from fsspec.implementations.local import LocalFileSystem | |
from fsspec.implementations.reference import ( | |
LazyReferenceMapper, | |
ReferenceFileSystem, | |
ReferenceNotReachable, | |
) | |
from fsspec.tests.conftest import data, realfile, reset_files, server, win # noqa: F401 | |
def test_simple(server): # noqa: F811 | |
refs = { | |
"a": b"data", | |
"b": (realfile, 0, 5), | |
"c": (realfile, 1, 5), | |
"d": b"base64:aGVsbG8=", | |
} | |
h = fsspec.filesystem("http") | |
fs = fsspec.filesystem("reference", fo=refs, fs=h) | |
assert fs.cat("a") == b"data" | |
assert fs.cat("b") == data[:5] | |
assert fs.cat("c") == data[1 : 1 + 5] | |
assert fs.cat("d") == b"hello" | |
with fs.open("d", "rt") as f: | |
assert f.read(2) == "he" | |
def test_target_options(m): | |
m.pipe("data/0", b"hello") | |
refs = {"a": ["memory://data/0"]} | |
fn = "memory://refs.json.gz" | |
with fsspec.open(fn, "wt", compression="gzip") as f: | |
json.dump(refs, f) | |
fs = fsspec.filesystem("reference", fo=fn, target_options={"compression": "gzip"}) | |
assert fs.cat("a") == b"hello" | |
def test_ls(server): # noqa: F811 | |
refs = {"a": b"data", "b": (realfile, 0, 5), "c/d": (realfile, 1, 6)} | |
h = fsspec.filesystem("http") | |
fs = fsspec.filesystem("reference", fo=refs, fs=h) | |
assert fs.ls("", detail=False) == ["a", "b", "c"] | |
assert {"name": "c", "type": "directory", "size": 0} in fs.ls("", detail=True) | |
assert fs.find("") == ["a", "b", "c/d"] | |
assert fs.find("", withdirs=True) == ["a", "b", "c", "c/d"] | |
assert fs.find("c", detail=True) == { | |
"c/d": {"name": "c/d", "size": 6, "type": "file"} | |
} | |
def test_nested_dirs_ls(): | |
# issue #1430 | |
refs = {"a": "A", "B/C/b": "B", "B/C/d": "d", "B/_": "_"} | |
fs = fsspec.filesystem("reference", fo=refs) | |
assert len(fs.ls("")) == 2 | |
assert {e["name"] for e in fs.ls("")} == {"a", "B"} | |
assert len(fs.ls("B")) == 2 | |
assert {e["name"] for e in fs.ls("B")} == {"B/C", "B/_"} | |
def test_info(server): # noqa: F811 | |
refs = { | |
"a": b"data", | |
"b": (realfile, 0, 5), | |
"c/d": (realfile, 1, 6), | |
"e": (realfile,), | |
} | |
h = fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true"}) | |
fs = fsspec.filesystem("reference", fo=refs, fs=h) | |
assert fs.size("a") == 4 | |
assert fs.size("b") == 5 | |
assert fs.size("c/d") == 6 | |
assert fs.info("e")["size"] == len(data) | |
def test_mutable(server, m): | |
refs = { | |
"a": b"data", | |
"b": (realfile, 0, 5), | |
"c/d": (realfile, 1, 6), | |
"e": (realfile,), | |
} | |
h = fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true"}) | |
fs = fsspec.filesystem("reference", fo=refs, fs=h) | |
fs.rm("a") | |
assert not fs.exists("a") | |
bin_data = b"bin data" | |
fs.pipe("aa", bin_data) | |
assert fs.cat("aa") == bin_data | |
fs.save_json("memory://refs.json") | |
assert m.exists("refs.json") | |
fs = fsspec.filesystem("reference", fo="memory://refs.json", remote_protocol="http") | |
assert not fs.exists("a") | |
assert fs.cat("aa") == bin_data | |
def test_put_get(tmpdir): | |
d1 = f"{tmpdir}/d1" | |
os.mkdir(d1) | |
with open(f"{d1}/a", "wb") as f: | |
f.write(b"1") | |
with open(f"{d1}/b", "wb") as f: | |
f.write(b"2") | |
d2 = f"{tmpdir}/d2" | |
fs = fsspec.filesystem("reference", fo={}, remote_protocol="file") | |
fs.put(d1, "out", recursive=True) | |
fs.get("out", d2, recursive=True) | |
assert open(f"{d2}/a", "rb").read() == b"1" | |
assert open(f"{d2}/b", "rb").read() == b"2" | |
def test_put_get_single(tmpdir): | |
d1 = f"{tmpdir}/f1" | |
d2 = f"{tmpdir}/f2" | |
with open(d1, "wb") as f: | |
f.write(b"1") | |
# skip instance cache since this is the same kwargs as previous test | |
fs = fsspec.filesystem( | |
"reference", fo={}, remote_protocol="file", skip_instance_cache=True | |
) | |
fs.put_file(d1, "out") | |
fs.get_file("out", d2) | |
assert open(d2, "rb").read() == b"1" | |
fs.pipe({"hi": b"data"}) | |
assert fs.cat("hi") == b"data" | |
def test_defaults(server): # noqa: F811 | |
refs = {"a": b"data", "b": (None, 0, 5)} | |
fs = fsspec.filesystem( | |
"reference", | |
fo=refs, | |
target_protocol="http", | |
target=realfile, | |
remote_protocol="http", | |
) | |
assert fs.cat("a") == b"data" | |
assert fs.cat("b") == data[:5] | |
jdata = """{ | |
"metadata": { | |
".zattrs": { | |
"Conventions": "UGRID-0.9.0" | |
}, | |
".zgroup": { | |
"zarr_format": 2 | |
}, | |
"adcirc_mesh/.zarray": { | |
"chunks": [ | |
1 | |
], | |
"dtype": "<i4", | |
"shape": [ | |
1 | |
], | |
"zarr_format": 2 | |
}, | |
"adcirc_mesh/.zattrs": { | |
"_ARRAY_DIMENSIONS": [ | |
"mesh" | |
], | |
"cf_role": "mesh_topology" | |
}, | |
"adcirc_mesh/.zchunkstore": { | |
"adcirc_mesh/0": { | |
"offset": 8928, | |
"size": 4 | |
}, | |
"source": { | |
"array_name": "/adcirc_mesh", | |
"uri": "https://url" | |
} | |
} | |
}, | |
"zarr_consolidated_format": 1 | |
} | |
""" | |
def test_spec1_expand(): | |
pytest.importorskip("jinja2") | |
in_data = { | |
"version": 1, | |
"templates": {"u": "server.domain/path", "f": "{{c}}"}, | |
"gen": [ | |
{ | |
"key": "gen_key{{i}}", | |
"url": "http://{{u}}_{{i}}", | |
"offset": "{{(i + 1) * 1000}}", | |
"length": "1000", | |
"dimensions": {"i": {"stop": 5}}, | |
}, | |
{ | |
"key": "gen_key{{i}}", | |
"url": "http://{{u}}_{{i}}", | |
"dimensions": {"i": {"start": 5, "stop": 7}}, | |
}, | |
], | |
"refs": { | |
"key0": "data", | |
"key1": ["http://target_url", 10000, 100], | |
"key2": ["http://{{u}}", 10000, 100], | |
"key3": ["http://{{f(c='text')}}", 10000, 100], | |
"key4": ["http://target_url"], | |
}, | |
} | |
fs = fsspec.filesystem( | |
"reference", fo=in_data, target_protocol="http", simple_templates=False | |
) | |
assert fs.references == { | |
"key0": "data", | |
"key1": ["http://target_url", 10000, 100], | |
"key2": ["http://server.domain/path", 10000, 100], | |
"key3": ["http://text", 10000, 100], | |
"key4": ["http://target_url"], | |
"gen_key0": ["http://server.domain/path_0", 1000, 1000], | |
"gen_key1": ["http://server.domain/path_1", 2000, 1000], | |
"gen_key2": ["http://server.domain/path_2", 3000, 1000], | |
"gen_key3": ["http://server.domain/path_3", 4000, 1000], | |
"gen_key4": ["http://server.domain/path_4", 5000, 1000], | |
"gen_key5": ["http://server.domain/path_5"], | |
"gen_key6": ["http://server.domain/path_6"], | |
} | |
def test_spec1_expand_simple(): | |
pytest.importorskip("jinja2") | |
in_data = { | |
"version": 1, | |
"templates": {"u": "server.domain/path"}, | |
"refs": { | |
"key0": "base64:ZGF0YQ==", | |
"key2": ["http://{{u}}", 10000, 100], | |
"key4": ["http://target_url"], | |
}, | |
} | |
fs = fsspec.filesystem("reference", fo=in_data, target_protocol="http") | |
assert fs.references["key2"] == ["http://server.domain/path", 10000, 100] | |
fs = fsspec.filesystem( | |
"reference", | |
fo=in_data, | |
target_protocol="http", | |
template_overrides={"u": "not.org/p"}, | |
) | |
assert fs.references["key2"] == ["http://not.org/p", 10000, 100] | |
assert fs.cat("key0") == b"data" | |
def test_spec1_gen_variants(): | |
pytest.importorskip("jinja2") | |
with pytest.raises(ValueError): | |
missing_length_spec = { | |
"version": 1, | |
"templates": {"u": "server.domain/path"}, | |
"gen": [ | |
{ | |
"key": "gen_key{{i}}", | |
"url": "http://{{u}}_{{i}}", | |
"offset": "{{(i + 1) * 1000}}", | |
"dimensions": {"i": {"stop": 2}}, | |
}, | |
], | |
} | |
fsspec.filesystem("reference", fo=missing_length_spec, target_protocol="http") | |
with pytest.raises(ValueError): | |
missing_offset_spec = { | |
"version": 1, | |
"templates": {"u": "server.domain/path"}, | |
"gen": [ | |
{ | |
"key": "gen_key{{i}}", | |
"url": "http://{{u}}_{{i}}", | |
"length": "1000", | |
"dimensions": {"i": {"stop": 2}}, | |
}, | |
], | |
} | |
fsspec.filesystem("reference", fo=missing_offset_spec, target_protocol="http") | |
url_only_gen_spec = { | |
"version": 1, | |
"templates": {"u": "server.domain/path"}, | |
"gen": [ | |
{ | |
"key": "gen_key{{i}}", | |
"url": "http://{{u}}_{{i}}", | |
"dimensions": {"i": {"stop": 2}}, | |
}, | |
], | |
} | |
fs = fsspec.filesystem("reference", fo=url_only_gen_spec, target_protocol="http") | |
assert fs.references == { | |
"gen_key0": ["http://server.domain/path_0"], | |
"gen_key1": ["http://server.domain/path_1"], | |
} | |
def test_empty(): | |
pytest.importorskip("jinja2") | |
fs = fsspec.filesystem("reference", fo={"version": 1}, target_protocol="http") | |
assert fs.references == {} | |
def test_get_sync(tmpdir): | |
localfs = LocalFileSystem() | |
real = tmpdir / "file" | |
real.write_binary(b"0123456789") | |
refs = {"a": b"data", "b": (str(real), 0, 5), "c/d": (str(real), 1, 6)} | |
fs = fsspec.filesystem("reference", fo=refs, fs=localfs) | |
fs.get("a", str(tmpdir / "a")) | |
assert (tmpdir / "a").read_binary() == b"data" | |
fs.get("b", str(tmpdir / "b")) | |
assert (tmpdir / "b").read_binary() == b"01234" | |
fs.get("c/d", str(tmpdir / "d")) | |
assert (tmpdir / "d").read_binary() == b"123456" | |
fs.get("c", str(tmpdir / "c"), recursive=True) | |
assert (tmpdir / "c").isdir() | |
assert (tmpdir / "c" / "d").read_binary() == b"123456" | |
def test_multi_fs_provided(m, tmpdir): | |
localfs = LocalFileSystem() | |
real = tmpdir / "file" | |
real.write_binary(b"0123456789") | |
m.pipe("afile", b"hello") | |
# local URLs are file:// by default | |
refs = { | |
"a": b"data", | |
"b": (f"file://{real}", 0, 5), | |
"c/d": (f"file://{real}", 1, 6), | |
"c/e": ["memory://afile"], | |
} | |
fs = fsspec.filesystem("reference", fo=refs, fs={"file": localfs, "memory": m}) | |
assert fs.cat("c/e") == b"hello" | |
assert fs.cat(["c/e", "a", "b"]) == { | |
"a": b"data", | |
"b": b"01234", | |
"c/e": b"hello", | |
} | |
def test_multi_fs_created(m, tmpdir): | |
real = tmpdir / "file" | |
real.write_binary(b"0123456789") | |
m.pipe("afile", b"hello") | |
# local URLs are file:// by default | |
refs = { | |
"a": b"data", | |
"b": (f"file://{real}", 0, 5), | |
"c/d": (f"file://{real}", 1, 6), | |
"c/e": ["memory://afile"], | |
} | |
fs = fsspec.filesystem("reference", fo=refs, fs={"file": {}, "memory": {}}) | |
assert fs.cat("c/e") == b"hello" | |
assert fs.cat(["c/e", "a", "b"]) == { | |
"a": b"data", | |
"b": b"01234", | |
"c/e": b"hello", | |
} | |
def test_missing_nonasync(m): | |
zarr = pytest.importorskip("zarr") | |
zarray = { | |
"chunks": [1], | |
"compressor": None, | |
"dtype": "<f8", | |
"fill_value": "NaN", | |
"filters": [], | |
"order": "C", | |
"shape": [10], | |
"zarr_format": 2, | |
} | |
refs = {".zarray": json.dumps(zarray)} | |
m = fsspec.get_mapper("reference://", fo=refs, remote_protocol="memory") | |
a = zarr.open_array(m) | |
assert str(a[0]) == "nan" | |
def test_fss_has_defaults(m): | |
fs = fsspec.filesystem("reference", fo={}) | |
assert None in fs.fss | |
fs = fsspec.filesystem("reference", fo={}, remote_protocol="memory") | |
assert fs.fss[None].protocol == "memory" | |
assert fs.fss["memory"].protocol == "memory" | |
fs = fsspec.filesystem("reference", fs=m, fo={}) | |
assert fs.fss[None] is m | |
fs = fsspec.filesystem("reference", fs={"memory": m}, fo={}) | |
assert fs.fss["memory"] is m | |
assert fs.fss[None].protocol == ("file", "local") | |
fs = fsspec.filesystem("reference", fs={None: m}, fo={}) | |
assert fs.fss[None] is m | |
fs = fsspec.filesystem("reference", fo={"key": ["memory://a"]}) | |
assert fs.fss[None] is fs.fss["memory"] | |
fs = fsspec.filesystem("reference", fo={"key": ["memory://a"], "blah": ["path"]}) | |
assert fs.fss[None] is fs.fss["memory"] | |
def test_merging(m): | |
m.pipe("/a", b"test data") | |
other = b"other test data" | |
m.pipe("/b", other) | |
fs = fsspec.filesystem( | |
"reference", | |
fo={ | |
"a": ["memory://a", 1, 1], | |
"b": ["memory://a", 2, 1], | |
"c": ["memory://b"], | |
"d": ["memory://b", 4, 6], | |
}, | |
) | |
out = fs.cat(["a", "b", "c", "d"]) | |
assert out == {"a": b"e", "b": b"s", "c": other, "d": other[4:10]} | |
def test_cat_file_ranges(m): | |
other = b"other test data" | |
m.pipe("/b", other) | |
fs = fsspec.filesystem( | |
"reference", | |
fo={ | |
"c": ["memory://b"], | |
"d": ["memory://b", 4, 6], | |
}, | |
) | |
assert fs.cat_file("c") == other | |
assert fs.cat_file("c", start=1) == other[1:] | |
assert fs.cat_file("c", start=-5) == other[-5:] | |
assert fs.cat_file("c", 1, -5) == other[1:-5] | |
assert fs.cat_file("d") == other[4:10] | |
assert fs.cat_file("d", start=1) == other[4:10][1:] | |
assert fs.cat_file("d", start=-5) == other[4:10][-5:] | |
assert fs.cat_file("d", 1, -3) == other[4:10][1:-3] | |
def test_cat_missing(m, fo): | |
other = b"other test data" | |
m.pipe("/b", other) | |
fs = fsspec.filesystem( | |
"reference", | |
fo=fo, | |
) | |
with pytest.raises(FileNotFoundError): | |
fs.cat("notafile") | |
with pytest.raises(FileNotFoundError): | |
fs.cat(["notone", "nottwo"]) | |
mapper = fs.get_mapper("") | |
with pytest.raises(KeyError): | |
mapper["notakey"] | |
with pytest.raises(KeyError): | |
mapper.getitems(["notone", "nottwo"]) | |
with pytest.raises(ReferenceNotReachable) as ex: | |
fs.cat("d") | |
assert ex.value.__cause__ | |
out = fs.cat("d", on_error="return") | |
assert isinstance(out, ReferenceNotReachable) | |
with pytest.raises(ReferenceNotReachable) as e: | |
mapper["d"] | |
assert '"d"' in str(e.value) | |
assert "//unknown" in str(e.value) | |
with pytest.raises(ReferenceNotReachable): | |
mapper.getitems(["c", "d"]) | |
out = mapper.getitems(["c", "d"], on_error="return") | |
assert isinstance(out["d"], ReferenceNotReachable) | |
out = fs.cat(["notone", "c", "d"], on_error="return") | |
assert isinstance(out["notone"], FileNotFoundError) | |
assert out["c"] == other | |
assert isinstance(out["d"], ReferenceNotReachable) | |
out = mapper.getitems(["c", "d"], on_error="omit") | |
assert list(out) == ["c"] | |
def test_df_single(m): | |
pd = pytest.importorskip("pandas") | |
pytest.importorskip("fastparquet") | |
data = b"data0data1data2" | |
m.pipe({"data": data}) | |
df = pd.DataFrame( | |
{ | |
"path": [None, "memory://data", "memory://data"], | |
"offset": [0, 0, 4], | |
"size": [0, 0, 4], | |
"raw": [b"raw", None, None], | |
} | |
) | |
df.to_parquet("memory://stuff/refs.0.parq") | |
m.pipe( | |
".zmetadata", | |
b"""{ | |
"metadata": { | |
".zgroup": { | |
"zarr_format": 2 | |
}, | |
"stuff/.zarray": { | |
"chunks": [1], | |
"compressor": null, | |
"dtype": "i8", | |
"filters": null, | |
"shape": [3], | |
"zarr_format": 2 | |
} | |
}, | |
"zarr_consolidated_format": 1, | |
"record_size": 10 | |
} | |
""", | |
) | |
fs = ReferenceFileSystem(fo="memory:///", remote_protocol="memory") | |
allfiles = fs.find("") | |
assert ".zmetadata" in allfiles | |
assert ".zgroup" in allfiles | |
assert "stuff/2" in allfiles | |
assert fs.cat("stuff/0") == b"raw" | |
assert fs.cat("stuff/1") == data | |
assert fs.cat("stuff/2") == data[4:8] | |
def test_df_multi(m): | |
pd = pytest.importorskip("pandas") | |
pytest.importorskip("fastparquet") | |
data = b"data0data1data2" | |
m.pipe({"data": data}) | |
df0 = pd.DataFrame( | |
{ | |
"path": [None, "memory://data", "memory://data"], | |
"offset": [0, 0, 4], | |
"size": [0, 0, 4], | |
"raw": [b"raw1", None, None], | |
} | |
) | |
df0.to_parquet("memory://stuff/refs.0.parq") | |
df1 = pd.DataFrame( | |
{ | |
"path": [None, "memory://data", "memory://data"], | |
"offset": [0, 0, 2], | |
"size": [0, 0, 2], | |
"raw": [b"raw2", None, None], | |
} | |
) | |
df1.to_parquet("memory://stuff/refs.1.parq") | |
m.pipe( | |
".zmetadata", | |
b"""{ | |
"metadata": { | |
".zgroup": { | |
"zarr_format": 2 | |
}, | |
"stuff/.zarray": { | |
"chunks": [1], | |
"compressor": null, | |
"dtype": "i8", | |
"filters": null, | |
"shape": [6], | |
"zarr_format": 2 | |
} | |
}, | |
"zarr_consolidated_format": 1, | |
"record_size": 3 | |
} | |
""", | |
) | |
fs = ReferenceFileSystem( | |
fo="memory:///", remote_protocol="memory", skip_instance_cache=True | |
) | |
allfiles = fs.find("") | |
assert ".zmetadata" in allfiles | |
assert ".zgroup" in allfiles | |
assert "stuff/2" in allfiles | |
assert "stuff/4" in allfiles | |
assert fs.cat("stuff/0") == b"raw1" | |
assert fs.cat("stuff/1") == data | |
assert fs.cat("stuff/2") == data[4:8] | |
assert fs.cat("stuff/3") == b"raw2" | |
assert fs.cat("stuff/4") == data | |
assert fs.cat("stuff/5") == data[2:4] | |
def test_mapping_getitems(m): | |
m.pipe({"a": b"A", "b": b"B"}) | |
refs = { | |
"a": ["a"], | |
"b": ["b"], | |
} | |
h = fsspec.filesystem("memory") | |
fs = fsspec.filesystem("reference", fo=refs, fs=h) | |
mapping = fs.get_mapper("") | |
assert mapping.getitems(["b", "a"]) == {"a": b"A", "b": b"B"} | |
def test_cached(m, tmpdir): | |
fn = f"{tmpdir}/ref.json" | |
m.pipe({"a": b"A", "b": b"B"}) | |
m.pipe("ref.json", b"""{"a": ["a"], "b": ["b"]}""") | |
fs = fsspec.filesystem( | |
"reference", | |
fo="simplecache::memory://ref.json", | |
fs=m, | |
target_options={"cache_storage": str(tmpdir), "same_names": True}, | |
) | |
assert fs.cat("a") == b"A" | |
assert os.path.exists(fn) | |
# truncate original file to show we are loading from the cached version | |
m.pipe("ref.json", b"") | |
fs = fsspec.filesystem( | |
"reference", | |
fo="simplecache::memory://ref.json", | |
fs=m, | |
target_options={"cache_storage": str(tmpdir), "same_names": True}, | |
skip_instance_cache=True, | |
) | |
assert fs.cat("a") == b"A" | |
def lazy_refs(m): | |
zarr = pytest.importorskip("zarr") | |
l = LazyReferenceMapper.create("memory://refs", fs=m) | |
g = zarr.open(l, mode="w") | |
g.create_dataset(name="data", shape=(100,), chunks=(10,), dtype="int64") | |
return l | |
def test_append_parquet(lazy_refs, m): | |
pytest.importorskip("kerchunk") | |
with pytest.raises(KeyError): | |
lazy_refs["data/0"] | |
lazy_refs["data/0"] = b"data" | |
assert lazy_refs["data/0"] == b"data" | |
lazy_refs.flush() | |
lazy2 = LazyReferenceMapper("memory://refs", fs=m) | |
assert lazy2["data/0"] == b"data" | |
with pytest.raises(KeyError): | |
lazy_refs["data/1"] | |
lazy2["data/1"] = b"Bdata" | |
assert lazy2["data/1"] == b"Bdata" | |
lazy2.flush() | |
lazy2 = LazyReferenceMapper("memory://refs", fs=m) | |
assert lazy2["data/0"] == b"data" | |
assert lazy2["data/1"] == b"Bdata" | |
lazy2["data/1"] = b"Adata" | |
del lazy2["data/0"] | |
assert lazy2["data/1"] == b"Adata" | |
assert "data/0" not in lazy2 | |
lazy2.flush() | |
lazy2 = LazyReferenceMapper("memory://refs", fs=m) | |
with pytest.raises(KeyError): | |
lazy2["data/0"] | |
assert lazy2["data/1"] == b"Adata" | |