"""Build GPS-tagged train/test CSV splits from the YFCC100M metadata dump.

Pipeline (all paths are relative to the current working directory):

1. Read the tab-separated metadata file and keep only the columns needed
   downstream.
2. Keep rows that have both longitude and latitude and whose ``media_type``
   is 0 (presumably "photo" rather than "video" -- confirm against the
   dataset documentation).
3. Left-join the per-photo hash from the separate hash file.
4. Split on the photo ids listed in ``yfcc_4k_ids.txt``: listed ids form the
   test set, everything else the training set.
5. Write the test set to a single TSV file and the training set (restricted
   to ``accuracy >= 12``) to a directory of partitioned TSV files.
"""

import dask
import dask.dataframe as dd
from dask.diagnostics import ProgressBar

DATASET_DIR = "../datasets/YFCC100M"
METADATA_PATH = f"{DATASET_DIR}/yfcc100m_dataset"
HASH_PATH = f"{DATASET_DIR}/yfcc100m_hash"
TEST_IDS_PATH = f"{DATASET_DIR}/yfcc_4k_ids.txt"
TEST_OUTPUT_PATH = f"{DATASET_DIR}/yfcc_4k_dataset_with_gps.csv"
TRAIN_OUTPUT_GLOB = f"{DATASET_DIR}/yfcc100m_dataset_with_gps_train/*.csv"

# Training rows below this ``accuracy`` value are discarded.
# NOTE(review): the unit/scale of ``accuracy`` is not visible here -- confirm.
MIN_TRAIN_ACCURACY = 12

# Target number of rows per output file of the training split.
TRAIN_ROWS_PER_PARTITION = 100_000

# The metadata file is read without a header row; columns are named
# positionally in this order.
METADATA_COLUMNS = [
    "photo_id",
    "user_nsid",
    "user_nickname",
    "date_taken",
    "date_uploaded",
    "capture_device",
    "title",
    "description",
    "user_tags",
    "machine_tags",
    "longitude",
    "latitude",
    "accuracy",
    "page_url",
    "download_url",
    "license_name",
    "license_url",
    "server_id",
    "farm_id",
    "secret",
    "secret_original",
    "extension",
    "media_type",
]

# Explicit dtypes so identifier-like columns stay strings and the numeric
# columns can hold missing values (float).
METADATA_DTYPES = {
    "photo_id": str,
    "user_nsid": str,
    "user_nickname": str,
    "user_tags": str,
    "machine_tags": str,
    "longitude": float,
    "latitude": float,
    "accuracy": float,
    "server_id": str,
    "farm_id": str,
    "secret": str,
    "secret_original": str,
    "extension": str,
    "media_type": float,
}

# Columns needed for filtering and for the output files.
SELECTED_COLUMNS = [
    "photo_id",
    "longitude",
    "latitude",
    "accuracy",
    "extension",
    "download_url",
    "media_type",
]

# ProgressBar only reports on computations started inside this block, so the
# calls that actually trigger work (``len`` and ``to_csv``) must stay in it.
with ProgressBar():
    ddf = dd.read_csv(
        METADATA_PATH,
        names=METADATA_COLUMNS,
        dtype=METADATA_DTYPES,
        sep="\t",
    )
    ddf = ddf[SELECTED_COLUMNS]

    filtered_ddf = ddf[
        ddf["longitude"].notnull()
        & ddf["latitude"].notnull()
        & (ddf["media_type"] == 0)
    ]
    # ``media_type`` is constant after the filter above, so drop it from the
    # frame that is actually written out. (Deleting it from ``ddf`` at this
    # point would not affect ``filtered_ddf``, which was already derived.)
    filtered_ddf = filtered_ddf.drop(columns=["media_type"])

    hash_ddf = dd.read_csv(
        HASH_PATH,
        names=["photo_id", "hash"],
        dtype={"photo_id": str, "hash": str},
        sep="\t",
    )
    # Left join: rows without a matching hash are kept with a missing value.
    filtered_ddf = filtered_ddf.merge(hash_ddf, on="photo_id", how="left")

    # One photo id per line; ids are compared as strings, matching the
    # ``photo_id`` dtype above.
    with open(TEST_IDS_PATH, "r", encoding="utf-8") as f:
        test_photo_ids = set(f.read().splitlines())

    is_test = filtered_ddf["photo_id"].isin(test_photo_ids)
    test_ddf = filtered_ddf[is_test]
    train_ddf = filtered_ddf[~is_test]

    # The accuracy threshold is applied to the training split only.
    train_ddf = train_ddf[train_ddf["accuracy"] >= MIN_TRAIN_ACCURACY]

    test_ddf.to_csv(
        TEST_OUTPUT_PATH,
        sep="\t",
        index=False,
        single_file=True,
    )

    # ``len`` runs the whole lazy pipeline once just to count rows; the
    # ``to_csv`` below then runs it again to produce the files.
    train_ddf = train_ddf.repartition(
        npartitions=len(train_ddf) // TRAIN_ROWS_PER_PARTITION + 1
    )
    train_ddf.to_csv(
        TRAIN_OUTPUT_GLOB,
        sep="\t",
        index=False,
        single_file=False,
    )