|
import hydra |
|
import numpy as np |
|
import pandas as pd |
|
from os.path import join, dirname |
|
import matplotlib.pyplot as plt |
|
import torch |
|
|
|
|
|
class QuadTree(object):
    """Recursive quadtree over rows that carry "latitude" and "longitude" columns.

    Each node covers the axis-aligned rectangle ``[mins, maxs]`` (both given as
    ``(latitude, longitude)`` pairs) and keeps the rows handed to it. A node is
    split at the midpoint of its rectangle into up to four children while
    ``depth`` is positive and the node holds at least ``do_split`` rows;
    quadrants that receive no row get no child node.
    """

    def __init__(self, data, mins=None, maxs=None, id="", depth=3, do_split=1000):
        """Build the node and, when the split criteria are met, its children.

        Args:
            data: Table with "latitude" and "longitude" columns; it must
                support ``data[[...]].to_numpy()``, boolean-mask selection,
                ``.shape`` and ``.copy()`` (the script passes a pandas
                DataFrame).
            mins: ``(latitude, longitude)`` lower corner of the node. When
                ``None`` it is taken as the column-wise minimum of ``data``.
            maxs: ``(latitude, longitude)`` upper corner of the node. When
                ``None`` it is taken as the column-wise maximum of ``data``.
            id: String identifier of the node. Children append "0", "1", "2"
                or "3" to their parent's identifier.
            depth: Number of further split levels allowed below this node.
            do_split: Minimum number of rows a node needs to be split.
        """
        self.id = id
        self.data = data

        if mins is None:
            mins = data[["latitude", "longitude"]].to_numpy().min(0)
        if maxs is None:
            maxs = data[["latitude", "longitude"]].to_numpy().max(0)

        self.mins = np.asarray(mins)
        self.maxs = np.asarray(maxs)
        self.sizes = self.maxs - self.mins
        self.children = []

        lat_lo, lon_lo = self.mins
        lat_hi, lon_hi = self.maxs
        lat_mid, lon_mid = 0.5 * (self.mins + self.maxs)

        # Leaf node: either the depth budget is spent or there are too few rows.
        if depth <= 0 or len(self.data) < do_split:
            return

        latitudes = data["latitude"]
        longitudes = data["longitude"]
        # The upper halves use an explicit ">=" (rather than the negation of
        # "<") so that rows with NaN coordinates fall into no quadrant at all.
        lat_below, lat_above = latitudes < lat_mid, latitudes >= lat_mid
        lon_below, lon_above = longitudes < lon_mid, longitudes >= lon_mid

        # (identifier suffix, row selector, lower corner, upper corner)
        quadrants = (
            ("0", lat_below & lon_below, [lat_lo, lon_lo], [lat_mid, lon_mid]),
            ("1", lat_below & lon_above, [lat_lo, lon_mid], [lat_mid, lon_hi]),
            ("2", lat_above & lon_below, [lat_mid, lon_lo], [lat_hi, lon_mid]),
            ("3", lat_above & lon_above, [lat_mid, lon_mid], [lat_hi, lon_hi]),
        )
        for suffix, selector, lower, upper in quadrants:
            subset = data[selector]
            if subset.shape[0] > 0:
                self.children.append(
                    QuadTree(
                        subset,
                        lower,
                        upper,
                        id + suffix,
                        depth - 1,
                        do_split=do_split,
                    )
                )

    def unwrap(self):
        """Collect the leaves below (and including) this node.

        Returns:
            A dict mapping each leaf identifier to ``[mins, maxs, rows]``,
            where ``rows`` is a copy of the leaf's data. Leaves appear in
            depth-first order, children being visited in the order "0" to "3".
        """
        if not self.children:
            return {self.id: [self.mins, self.maxs, self.data.copy()]}

        leaves = {}
        for child in self.children:
            leaves.update(child.unwrap())
        return leaves
|
|
|
|
|
def extract(qt, name_new_column):
    """Flatten a quadtree into numbered cells.

    Args:
        qt: Object whose ``unwrap()`` returns a dict mapping a leaf identifier
            to ``[(min_lat, min_lon), (max_lat, max_lon), rows]``, where
            ``rows`` is a table with "latitude" and "longitude" columns.
        name_new_column: Name of the column that receives the cell index.

    Returns:
        A tuple ``(boundaries, data, id_to_quad)``:

        * ``boundaries`` maps each cell index (a plain ``int``, numbered in
          the order ``unwrap()`` lists the leaves) to the tuple
          ``(min_lat, min_lon, max_lat, max_lon, mean_lat, mean_lon)`` of
          plain floats; the means are taken over the rows of the cell.
        * ``data`` is the concatenation of all leaf tables, each with
          ``name_new_column`` set to its cell index.
        * ``id_to_quad`` is a NumPy array of the leaf identifiers, positioned
          by cell index.
    """
    cluster = qt.unwrap()
    id_to_quad = np.array(list(cluster))

    boundaries, frames = {}, []
    for index, (lower, upper, points) in enumerate(cluster.values()):
        min_lat, min_lon = lower
        max_lat, max_lon = upper
        # The leaf table is labelled in place; no extra copy is made here.
        points[name_new_column] = index
        frames.append(points)
        boundaries[index] = (
            float(min_lat),
            float(min_lon),
            float(max_lat),
            float(max_lon),
            float(points["latitude"].mean()),
            float(points["longitude"].mean()),
        )

    data = pd.concat(frames)
    return boundaries, data, id_to_quad
|
|
|
|
|
def vizu(name_new_column, df_train, boundaries):
    """Save two diagnostic figures describing the cell assignment.

    Writes ``{name_new_column}_distrib.png`` (log-scale histogram of the cell
    indices) and ``{name_new_column}_map.png`` (longitude/latitude scatter
    coloured per cell) using relative paths.

    Args:
        name_new_column: Column of ``df_train`` holding the integer cell
            index; also used as the prefix of both file names.
        df_train: Table with "latitude", "longitude" and ``name_new_column``
            columns.
        boundaries: Sized collection with one entry per cell; only its length
            is used.
    """
    n_cells = len(boundaries)
    cell_ids = df_train[name_new_column]

    plt.hist(cell_ids, bins=n_cells)
    plt.xlabel("Cluster ID")
    plt.ylabel("Number of images")
    plt.title("Cluster distribution")
    plt.yscale("log")
    plt.savefig(f"{name_new_column}_distrib.png")
    plt.clf()

    # Cell indices are remapped through a random permutation before being
    # used as colour values, so consecutive indices do not get adjacent
    # colours of the colormap.
    shuffled_colors = np.random.permutation(n_cells)
    plt.scatter(
        df_train["longitude"].to_numpy(),
        df_train["latitude"].to_numpy(),
        c=shuffled_colors[cell_ids.to_numpy()],
        cmap="tab20",
        s=0.1,
        alpha=0.5,
    )
    plt.xlabel("Longitude")
    plt.ylabel("Latitude")
    plt.title("Quadtree map")
    plt.savefig(f"{name_new_column}_map.png")
    # Clear the current figure like after the histogram, so later pyplot calls
    # in the same process do not draw on top of the scatter plot.
    plt.clf()
|
|
|
|
|
def _assign_to_cells(lat, lon, boundaries):
    """Return, for every (lat, lon) point, the position of its cell in ``boundaries``.

    Args:
        lat: 1-D NumPy array of latitudes.
        lon: 1-D NumPy array of longitudes, same length as ``lat``.
        boundaries: DataFrame with "min_lat", "max_lat", "min_lon" and
            "max_lon" columns, one row per cell.

    Returns:
        Integer NumPy array with one cell position per point.
    """
    min_lat = boundaries["min_lat"].to_numpy()[None, :]
    max_lat = boundaries["max_lat"].to_numpy()[None, :]
    min_lon = boundaries["min_lon"].to_numpy()[None, :]
    max_lon = boundaries["max_lon"].to_numpy()[None, :]
    lat = lat[:, None]
    lon = lon[:, None]

    # Half-open rectangles (lower edge included, upper edge excluded): the
    # same convention the quadtree uses when it splits the training rows.
    inside = (lat >= min_lat) & (lat < max_lat) & (lon >= min_lon) & (lon < max_lon)
    assigned = np.argmax(inside, axis=1)

    # argmax over an all-False row returns 0, which would silently label every
    # unmatched point (outer edge, or a quadrant that had no training row and
    # therefore no cell) as cell 0. Send those to the nearest rectangle.
    unmatched = ~inside.any(axis=1)
    if unmatched.any():
        u_lat = lat[unmatched]
        u_lon = lon[unmatched]
        gap_lat = np.maximum(np.maximum(min_lat - u_lat, u_lat - max_lat), 0.0)
        gap_lon = np.maximum(np.maximum(min_lon - u_lon, u_lon - max_lon), 0.0)
        assigned[unmatched] = np.argmin(gap_lat**2 + gap_lon**2, axis=1)
    return assigned


@hydra.main(
    config_path="../configs/scripts",
    config_name="enrich-metadata-quadtree",
    version_base=None,
)
def main(cfg):
    """Label the train and test CSVs with quadtree cell indices.

    Reads ``cfg.data_dir``, ``cfg.depth``, ``cfg.do_split`` and
    ``cfg.overwrite_csv``. Steps:

    1. Build a quadtree on ``<data_dir>/osv5m/train.csv`` and add the column
       ``quadtree_<depth>_<do_split>`` holding each row's cell index.
    2. Save the two diagnostic figures and write the cell table to
       ``quadtree_<depth>_<do_split>.csv``.
    3. Assign every row of ``test.csv`` to a cell from its coordinates.
    4. Save the scaled cell centroids and the cell-index-to-leaf-identifier
       array as ``.pt`` files next to the CSVs.
    5. If ``cfg.overwrite_csv`` is set, write both CSVs back in place.
    """
    data_path = join(cfg.data_dir, "osv5m")
    name_new_column = f"quadtree_{cfg.depth}_{cfg.do_split}"

    # Train split: build the tree and label every row with its cell.
    train_fp = join(data_path, "train.csv")
    df_train = pd.read_csv(train_fp)

    qt = QuadTree(df_train, depth=cfg.depth, do_split=cfg.do_split)
    boundaries, df_train, id_to_quad = extract(qt, name_new_column)

    vizu(name_new_column, df_train, boundaries)

    # One row per cell, indexed by cell number.
    boundaries = pd.DataFrame.from_dict(
        boundaries,
        orient="index",
        columns=["min_lat", "min_lon", "max_lat", "max_lon", "mean_lat", "mean_lon"],
    )
    boundaries.to_csv(f"{name_new_column}.csv", index_label="cluster_id")

    # Test split: look every row up in the cells built from the train split.
    test_fp = join(data_path, "test.csv")
    df_test = pd.read_csv(test_fp)
    df_test[name_new_column] = _assign_to_cells(
        df_test["latitude"].to_numpy(),
        df_test["longitude"].to_numpy(),
        boundaries,
    )

    # Cell centroids, with latitude divided by 90 and longitude by 180.
    lat = torch.tensor(boundaries["mean_lat"].to_numpy())
    lon = torch.tensor(boundaries["mean_lon"].to_numpy())
    coord = torch.stack([lat / 90, lon / 180], dim=-1)
    torch.save(
        coord, join(data_path, f"index_to_gps_quadtree_{cfg.depth}_{cfg.do_split}.pt")
    )

    torch.save(id_to_quad, join(data_path, f"id_to_quad_{cfg.depth}_{cfg.do_split}.pt"))

    if cfg.overwrite_csv:
        df_train.to_csv(train_fp, index=False)
        df_test.to_csv(test_fp, index=False)
|
|
|
|
|
# Run the preprocessing only when the file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    main()
|
|