import argparse
import sys
import random
import time

from omegaconf import open_dict
import matplotlib.pyplot as plt

sys.path.extend([".", ".."])

from generate_ply_sequence import get_cam_k
from point_utils import read_calib, generate_point_grid, get_fov_mask
from gen_voxelgrid_npy import save_as_voxel_ply, remove_invisible

import logging
from pathlib import Path
import subprocess
import yaml
import cv2
import os
import numpy as np
from tqdm import tqdm
import pickle
import torch
from torch import nn
import torch.nn.functional as F
from hydra import compose, initialize
from sscbench_dataset import SSCBenchDataset
from scipy.optimize import linear_sum_assignment
import torchvision
RELOAD_DATASET = True
DATASET_LENGTH = 10
FULL_EVAL = True
SAMPLE_EVERY = None
SAMPLE_OFFSET = 2
SAMPLE_RANGE = None

SIZE = 51.2  # can be: 51.2, 25.6, 12.8
SIZES = (12.8, 25.6, 51.2)
VOXEL_SIZE = 0.2  # must satisfy: 0.2 % VOXEL_SIZE == 0

USE_ADDITIONAL_INVALIDS = True

TEST_ALPHA_CUTOFFS = False
SEARCH_VALUES = [10e-1, 10e-2, 10e-3, 10e-4, 10e-5, 10e-6, 10e-7]

SIGMA_CUTOFF = 0.2
USE_ALPHA_WEIGHTING = True
USE_GROW = True

CREATE_SIGMA_TRADEOFF_PLOT = True
SIGMA_VALUES = [1, 0.5, 0.25, 0.1, 0.05, 0.025, 0.01, 0.005, 0.0025, 0.001]

PLOT_ALL_IMAGES = False

GENERATE_PLY_FILES = False
PLY_ONLY_FOV = True
PLY_IDS = [300, 400, 470]
OUTPUT_PATH = Path("<PATH-OUTPUT>")
PLY_SIZES = [25.6, 51.2]

GENERATE_STATISTICS = False

# For ply generation:
# USE_ADDITIONAL_INVALIDS = False
# USE_GROW = False
# GENERATE_PLY_FILES = True
os.system("nvidia-smi") | |
device = f'cuda:0' | |
# DO NOT TOUCH OR YOU WILL BREAK RUNS (should be None) | |
gpu_id = None | |
if gpu_id is not None: | |
print("GPU ID: " + str(gpu_id)) | |
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" | |
os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id) | |
if torch.cuda.is_available(): | |
torch.backends.cudnn.enabled = True | |
torch.backends.cudnn.benchmark = True | |
torch.backends.cudnn.deterministic = True | |
logging.basicConfig(level=logging.INFO) | |


def main():
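    """Evaluate an S4C or SceneDINO checkpoint on SSCBench (KITTI-360, sequence 09)
    and optionally export .ply voxel grids for visualization."""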
    parser = argparse.ArgumentParser("SSCBenchmark Output generation")
    parser.add_argument("--sscbench_data_root", "-ssc", type=str)
    parser.add_argument("--voxel_gt_path", "-vgt", type=str)
    parser.add_argument("--resolution", "-r", default=(192, 640))
    parser.add_argument("--checkpoint", "-cp", type=str, required=True)
    parser.add_argument("--full", "-f", action="store_true")
    parser.add_argument("--mode", "-m", default="s4c")
    parser.add_argument("--ply_checkname", "-p", default="none")
    args = parser.parse_args()
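    # Example invocation (script name and paths are placeholders):
    #   python <this_script>.py -ssc <sscbench-root> -vgt <voxel-gt-path> \
    #       -cp <checkpoint>.pt -m scenedino -p <run-name> -f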
    sscbench_data_root = args.sscbench_data_root
    voxel_gt_path = args.voxel_gt_path
    resolution = args.resolution
    cp_path = args.checkpoint
    full_evaluation = args.full
    mode = args.mode
    ply_checkname = args.ply_checkname
    if FULL_EVAL:
        full_evaluation = True

    if GENERATE_PLY_FILES:
        assert (not USE_GROW) and (not USE_ADDITIONAL_INVALIDS)  # and VOXEL_SIZE == 0.1

        # make the necessary dirs
        for size in PLY_SIZES:
            if not os.path.exists(OUTPUT_PATH / ply_checkname / str(int(size))):
                os.makedirs(OUTPUT_PATH / ply_checkname / str(int(size)))
        if not os.path.exists(OUTPUT_PATH / ply_checkname):
            os.makedirs(OUTPUT_PATH / ply_checkname)
logging.info(f"Using a sigma cutoff of {SIGMA_CUTOFF}") | |
logging.info("Setting up dataset") | |
with open("label_maps.yaml", "r") as f: | |
label_maps = yaml.safe_load(f) | |
# pickle the dataset so we don't have to wait all the time | |
if os.path.isfile("dataset.pkl") and not RELOAD_DATASET: | |
logging.info("Loading dataset from dataset.pkl file.") | |
with open("dataset.pkl", "rb") as f: | |
dataset = pickle.load(f) | |
else: | |
logging.info("Generating the dataset and dumping it to dataset.pkl") | |
dataset = SSCBenchDataset( | |
data_path=sscbench_data_root, | |
voxel_gt_path=voxel_gt_path, | |
sequences=(9,), | |
target_image_size=resolution, | |
return_stereo=False, | |
frame_count=1, | |
color_aug=False, | |
load_fisheye=True, | |
fisheye_offset=10, | |
) | |
if DATASET_LENGTH and not full_evaluation: | |
dataset.length = DATASET_LENGTH | |
with open("dataset.pkl", 'wb') as f: | |
pickle.dump(dataset, f) | |
logging.info("Setting up the model...") | |
config_path = "exp_kitti_360" | |
cp_path = Path(cp_path) | |
if cp_path.suffix == ".pt": | |
cp_root_path = cp_path.parent | |
else: | |
cp_root_path = cp_path | |
cp_path = next(cp_root_path.glob("training*.pt")) | |
bts_dino_config_path = "training_config.yaml" | |
PRODUCE_FEAT_VIS = GENERATE_PLY_FILES and mode.startswith("scenedino") | |
    prediction_mode = None
    if mode == "s4c":
        from models.bts.model import BTSNet
        from models.common.render import NeRFRenderer

        initialize(version_base=None, config_path="../../../configs", job_name="gen_sscbench_outputs")
        config = compose(config_name=config_path, overrides=[])

        logging.info("Loading checkpoint")
        cp = torch.load(cp_path, map_location=device)

        with open_dict(config):
            config["renderer"]["hard_alpha_cap"] = True
            config["model_conf"]["code_mode"] = "z"
            # config["model_conf"]["z_near"] = 8
            config["model_conf"]["mlp_coarse"]["n_blocks"] = 0
            config["model_conf"]["mlp_coarse"]["d_hidden"] = 64
            config["model_conf"]["encoder"]["d_out"] = 64
            config["model_conf"]["encoder"]["type"] = "monodepth2"
            config["model_conf"]["grid_learn_empty"] = False
            config["model_conf"]["sample_color"] = True

            # stuff for segmentation
            config["model_conf"]["segmentation_mode"] = "panoptic_deeplab"

        net = BTSNet(config["model_conf"])
        net.sample_color = False

        renderer = NeRFRenderer.from_conf(config["renderer"])
        renderer = renderer.bind_parallel(net, gpus=None).eval()
        renderer.renderer.n_coarse = 64
        renderer.renderer.lindisp = True

        class _Wrapper(nn.Module):
            def __init__(self):
                super().__init__()
                self.renderer = renderer

        _wrapper = _Wrapper()
        _wrapper.load_state_dict(cp["model"], strict=False)

        renderer.to(device)
        renderer.eval()
    elif mode.startswith("scenedino"):
        from scenedino.models import make_model as dino_bts_make_model
        from scenedino.renderer.nerf import NeRFRenderer as dino_bts_NeRFRenderer
        from scenedino.common.ray_sampler import ImageRaySampler as dino_bts_ImageRaySampler

        bts_dino_parent_relative = Path("../../../../")
        bts_dino_parent_absolute = str(bts_dino_parent_relative.resolve())

        initialize(version_base=None,
                   config_path=str(bts_dino_parent_relative / cp_root_path.relative_to(bts_dino_parent_absolute)),
                   job_name="gen_sscbench_outputs")
        config = compose(config_name=bts_dino_config_path, overrides=[])

        logging.info("Loading checkpoint")
        cp = torch.load(cp_path, map_location=device)

        net = dino_bts_make_model(config["model"], config["downstream"])

        renderer = dino_bts_NeRFRenderer.from_conf(config["renderer"])
        renderer.hard_alpha_cap = False
        renderer = renderer.bind_parallel(net, gpus=None).eval()

        class _Wrapper(nn.Module):
            def __init__(self):
                super().__init__()
                self.renderer = renderer

        _wrapper = _Wrapper()
        _wrapper.load_state_dict(cp, strict=False)  # _wrapper.load_state_dict(cp["model"], strict=False)

        renderer.to(device)
        renderer.eval()

        height, width = config["dataset"]["image_size"]
        ray_sampler = dino_bts_ImageRaySampler(z_near=3, z_far=80, width=width, height=height)

        if mode == "scenedino_linear":
            prediction_mode = "direct_linear"
        elif mode == "scenedino_direct_cluster":
            prediction_mode = "direct_kmeans"
        else:
            prediction_mode = "stego_kmeans"
    else:
        raise NotImplementedError()
logging.info("Loading the Lidar to Camera matrices...") | |
calib = read_calib() | |
T_velo_2_cam = calib["Tr"] | |
logging.info("Generating the point cloud...") | |
pts, _ = generate_point_grid(vox_origin=np.array([0, -25.6, -2]), | |
scene_size=(51.2, 51.2, 6.4), | |
voxel_size=VOXEL_SIZE, | |
cam_E=T_velo_2_cam, | |
cam_k=get_cam_k()) | |
fov_mask = get_fov_mask() | |
pts = torch.tensor(pts).to(device).reshape(1, -1, 3).float() | |
fov_mask = fov_mask.reshape(256, 256, 32) | |
logging.info("Setting up folders...") | |
downsample_factor = int(0.2 // VOXEL_SIZE) | |
results = {} | |
for size in SIZES: | |
results[size] = { | |
"tp": 0, | |
"fp": 0, | |
"tn": 0, | |
"fn": 0, | |
"tp_seg": np.zeros(15), | |
"fp_seg": np.zeros(15), | |
"tn_seg": np.zeros(15), | |
"fn_seg": np.zeros(15), | |
"confusion_seg": np.zeros((16, 16)), | |
"tp_recall_seg": np.zeros(15), | |
"sum_recall_seg": np.zeros(15), | |
} | |
# for the sigma tradeoff plots | |
trade_off_values = np.zeros([len(SIGMA_VALUES), 4]) | |
cutoff_results = {i: {sv: {"tp":0, "fp": 0, "tn": 0, "fn": 0} for sv in SEARCH_VALUES} for i in range(1, 16)} | |
pbar = tqdm(range(len(dataset))) | |
# Randomly select indices without replacement | |
# dataset_size = len(dataset) | |
# subset_size = dataset_size // 10 | |
# subset_indices = random.sample(range(dataset_size), subset_size) | |
# pbar = tqdm(subset_indices) | |
images = {"ids": [], "images": []} | |
ids = [125, 280, 960, 1000, 1150, 1325, 2300, 3175, 3750, 4300, 5155, 5475, 5750, 6475, 6525, 6670, 6775, 7500, 7860, 8000, 8350, 9000, 9350, 10975] | |
ids = [60, 250, 455, 690, 835, 2235, 2385, 2495, 3385, 4235, 4360, 4550, 4875, 5550, 6035, 7010, 7110, 8575, 9010, 9410, 11260, 11460, 11885] | |
# for our statistics | |
tframeIds = [] | |
tinval = [] | |
ttp = [] | |
tfp = [] | |
ttn = [] | |
tfn = [] | |
# plot_image_at_frame_id(dataset, 952) | |
    for i in pbar:
        if SAMPLE_EVERY:
            if (i - SAMPLE_OFFSET) % SAMPLE_EVERY != 0:
                continue

        sequence, id, is_right = dataset._datapoints[i]
        if SAMPLE_RANGE:
            if id not in SAMPLE_RANGE:
                continue
        if GENERATE_PLY_FILES and id not in PLY_IDS:
            continue
        if GENERATE_STATISTICS:
            tframeIds.append(id)

        data = dataset[i]

        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
        torch.cuda.synchronize()
        start_time = time.time()

        # predict and downsample the sigmas
        sigmas, segs, dino = downsample_and_predict(data, net, pts, downsample_factor, prediction_mode, vis=GENERATE_PLY_FILES, feat_vis=PRODUCE_FEAT_VIS)

        torch.cuda.synchronize()
        inference_time = time.time() - start_time
        memory_used = torch.cuda.max_memory_allocated(device) / 1024 ** 2  # in MB
        num_params = sum(p.numel() for key, p in net.named_parameters() if not key.startswith("encoder.gt_encoder"))
        # print(f"Inference time: {inference_time:.6f} seconds")
        # print(f"Memory used: {memory_used:.2f} MB")
        # print(f"Number of parameters: {num_params:,}")

        # convert both to the right label format
        segs = convert_voxels(segs, label_maps["cityscapes_to_label"])
        target = convert_voxels(data["voxel_gt"][0].astype(int), label_maps["sscbench_to_label"])

        is_occupied_seg = torch.Tensor(sigmas > SIGMA_CUTOFF).to(torch.bool)
        is_occupied_seg = remove_invisible(is_occupied_seg)
        is_occupied_seg[segs == 0] = False

        images = torch.stack([torch.Tensor(_img) for _img in data["imgs"]], dim=0).cuda()
        if PRODUCE_FEAT_VIS:
            dino = calculate_pca(dino, is_occupied_seg, net)
            dino = (255 * dino).astype(int)

            poses = torch.stack([torch.Tensor(_pose) for _pose in data["poses"]], dim=0).unsqueeze(0).cuda()
            projs = torch.stack([torch.Tensor(_proj) for _proj in data["projs"]], dim=0).unsqueeze(0).cuda()
            poses = torch.inverse(poses[:, :1]) @ poses

            all_rays, _ = ray_sampler.sample(None, poses, projs)
            render_dict = renderer(all_rays[:, :], want_weights=True, want_alphas=True)
            render_dict = ray_sampler.reconstruct(render_dict)

            dino_features = net.encoder.expand_dim(render_dict["coarse"]["dino_features"]).squeeze()
            dino_gt = net.encoder.gt_encoder(images / 2 + 0.5)[-1].permute(0, 2, 3, 1)
            dino_gt = F.normalize(dino_gt, dim=-1)

            dino_rgb_vis = torch.clamp(net.encoder.transform_visualization(dino_features.cpu()), min=-0.5, max=0.5) + 0.5
            dino_rgb_vis_gt = torch.clamp(net.encoder.transform_visualization(dino_gt.cpu()), min=-0.5, max=0.5) + 0.5
            dino_rgb_vis_gt = dino_rgb_vis_gt.repeat_interleave(8, 1).repeat_interleave(8, 2)
        if PLOT_ALL_IMAGES:
            plot_buffer["ids"].append(id)
            plot_buffer["images"].append(((data["imgs"][0] + 1) / 2).permute(1, 2, 0))
            if len(plot_buffer["ids"]) == 6:
                plot_images(plot_buffer)
                plot_buffer = {"images": [], "ids": []}

        # print(f"Image_Id: {id}")
        #
        # plt.imshow(((data["imgs"][0] + 1) / 2).permute(1, 2, 0))
        # plt.show()
        #
        # out_dict = {"sigmas": sigmas, "segs": segs.copy(), "gt": target, "fov_mask": fov_mask}
        #
        # with open(f"plots10_40/{id:06d}.pkl", "wb") as f:
        #     pickle.dump(out_dict, f)
        if GENERATE_PLY_FILES:
            _segs = segs.copy()
            _target = target.copy()
            if PRODUCE_FEAT_VIS:
                _dino = dino.copy()

            mask = target != 255
            if PLY_ONLY_FOV:
                mask = mask & fov_mask
            seg_mask = mask.copy()
            for dim in range(seg_mask.ndim):
                seg_mask = np.repeat(seg_mask, downsample_factor, axis=dim)

            # _segs[~seg_mask] = 0
            # _dino[~seg_mask] = 0
            _target[~mask] = 0

            is_occupied_seg = is_occupied_seg.logical_and(torch.Tensor(fov_mask))
            # is_occupied_seg = torch.tensor(_segs > 0)
            is_occupied_gt = torch.tensor(_target > 0)

            full_num_voxels = int(SIZE // VOXEL_SIZE)

            for idx in range(images.size(0)):
                # note: `size` is still bound to the last element of SIZES (51.2) from the results-init loop above
                torchvision.utils.save_image(((images[idx] + 1) / 2), OUTPUT_PATH / ply_checkname / str(int(size)) / f"{id:06d}_image_{idx}.png")
                if PRODUCE_FEAT_VIS:
                    torchvision.utils.save_image(dino_rgb_vis[idx].permute(2, 0, 1), OUTPUT_PATH / ply_checkname / str(int(size)) / f"{id:06d}_features_{idx}.png")
                    torchvision.utils.save_image(dino_rgb_vis_gt[idx].permute(2, 0, 1), OUTPUT_PATH / ply_checkname / str(int(size)) / f"{id:06d}_features_gt_{idx}.png")
            images = None

            for size in PLY_SIZES:
                num_voxels = int(size // 0.2)
                save_as_voxel_ply(OUTPUT_PATH / ply_checkname / str(int(size)) / f"{id:06d}_gt.ply",
                                  is_occupied_gt[:num_voxels, (128 - num_voxels // 2):(128 + num_voxels // 2), :],
                                  voxel_size=0.2,
                                  classes=torch.tensor(_target[:num_voxels, (128 - num_voxels // 2):(128 + num_voxels // 2), :]))
                num_voxels = int(size // VOXEL_SIZE)
                save_as_voxel_ply(OUTPUT_PATH / ply_checkname / str(int(size)) / f"{id:06d}.ply",
                                  is_occupied_seg[:num_voxels, (full_num_voxels // 2 - num_voxels // 2):(full_num_voxels // 2 + num_voxels // 2), :],
                                  size=(num_voxels, num_voxels, num_voxels // 8),
                                  voxel_size=VOXEL_SIZE,
                                  classes=torch.tensor(_segs[:num_voxels, (full_num_voxels // 2 - num_voxels // 2):(full_num_voxels // 2 + num_voxels // 2), :]))
                if PRODUCE_FEAT_VIS:
                    save_as_voxel_ply(OUTPUT_PATH / ply_checkname / str(int(size)) / f"{id:06d}_feat.ply",
                                      is_occupied_seg[:num_voxels, (full_num_voxels // 2 - num_voxels // 2):(full_num_voxels // 2 + num_voxels // 2), :],
                                      size=(num_voxels, num_voxels, num_voxels // 8),
                                      voxel_size=VOXEL_SIZE,
                                      colors=torch.tensor(_dino[:num_voxels, (full_num_voxels // 2 - num_voxels // 2):(full_num_voxels // 2 + num_voxels // 2), :]))
            continue
        if USE_ADDITIONAL_INVALIDS:
            invalids = identify_additional_invalids(target)
            # logging.info(np.mean(invalids))
            target[invalids == 1] = 255
            if GENERATE_STATISTICS:
                tinval.append(np.mean(invalids))

        # test and summarize different alpha cutoffs
        if TEST_ALPHA_CUTOFFS:
            for label_i in range(1, 16):  # renamed from `i` to avoid shadowing the loop index
                for search_value in SEARCH_VALUES:
                    _tmp = segs.copy()
                    _tmp[np.logical_and(segs == label_i, sigmas < search_value)] = 0
                    _tp_seg, _fp_seg, _tn_seg, _fn_seg, _ = compute_occupancy_numbers_segmentation(
                        y_pred=_tmp, y_true=target, fov_mask=fov_mask, labels=label_maps["labels"])
                    cutoff_results[label_i][search_value]["tp"] += _tp_seg[label_i - 1]
                    cutoff_results[label_i][search_value]["fp"] += _fp_seg[label_i - 1]
                    cutoff_results[label_i][search_value]["tn"] += _tn_seg[label_i - 1]
                    cutoff_results[label_i][search_value]["fn"] += _fn_seg[label_i - 1]

        if CREATE_SIGMA_TRADEOFF_PLOT:
            for sigma_i, val in enumerate(SIGMA_VALUES):
                _tmp = segs.copy()
                _tmp[sigmas < val] = 0
                _tp, _fp, _tn, _fn = compute_occupancy_numbers(y_pred=_tmp, y_true=target, fov_mask=fov_mask)
                trade_off_values[sigma_i] += np.array([_tp, _fp, _tn, _fn])
segs[sigmas < SIGMA_CUTOFF] = 0 | |
for size in SIZES: | |
num_voxels = int(size // 0.2) | |
# resize to right scene size | |
_segs = segs[:num_voxels, (128 - num_voxels//2):(128 + num_voxels//2), :] | |
_target = target[:num_voxels, (128 - num_voxels//2):(128 + num_voxels//2), :] | |
_fov_mask = fov_mask[:num_voxels, (128 - num_voxels // 2):(128 + num_voxels // 2), :] | |
_tp, _fp, _tn, _fn = compute_occupancy_numbers(y_pred=_segs, y_true=_target, fov_mask=_fov_mask) | |
_tp_seg, _fp_seg, _tn_seg, _fn_seg, _confusion_seg = compute_occupancy_numbers_segmentation( | |
y_pred=_segs, y_true=_target, fov_mask=_fov_mask, labels=label_maps["labels"]) | |
_tp_rec_seg, _sum_rec_seg = compute_occupancy_recall_segmentation( | |
y_pred=_segs, y_true=_target, fov_mask=_fov_mask, labels=label_maps["labels"]) | |
if size == 51.2 and GENERATE_STATISTICS: | |
ttp += [_tp] | |
tfp += [_fp] | |
ttn += [_fn] | |
tfn += [_fn] | |
results[size]["tp"] += _tp | |
results[size]["fp"] += _fp | |
results[size]["tn"] += _tn | |
results[size]["fn"] += _fn | |
results[size]["tp_seg"] += _tp_seg | |
results[size]["fp_seg"] += _fp_seg | |
results[size]["tn_seg"] += _tn_seg | |
results[size]["fn_seg"] += _fn_seg | |
results[size]["confusion_seg"] += _confusion_seg | |
results[size]["tp_recall_seg"] += _tp_rec_seg | |
results[size]["sum_recall_seg"] += _sum_rec_seg | |
recall = results[size]["tp"] / (results[size]["tp"] + results[size]["fn"]) | |
precision = results[size]["tp"] / (results[size]["tp"] + results[size]["fp"]) | |
iou = results[size]["tp"] / (results[size]["tp"] + results[size]["fp"] + results[size]["fn"]) | |
pbar.set_postfix_str(f"IoU: {iou*100:.2f} Prec: {precision*100:.2f} Rec: {recall*100:.2f}") | |
result_str = "" | |
for mode in ["direct", "hungarian"]: | |
results_table = np.zeros((19, 3), dtype=np.float32) | |
if mode == "hungarian": | |
assignments = linear_sum_assignment(results[51.2]["confusion_seg"], maximize=True) # Hungarian matching on full range | |
# Here we compute all the metrics | |
for size_i, size in enumerate(SIZES): | |
recall = results[size]["tp"] / (results[size]["tp"] + results[size]["fn"]) | |
precision = results[size]["tp"] / (results[size]["tp"] + results[size]["fp"]) | |
iou = results[size]["tp"] / (results[size]["tp"] + results[size]["fp"] + results[size]["fn"]) | |
results_table[0, size_i] = iou | |
results_table[1, size_i] = precision | |
results_table[2, size_i] = recall | |
# logging.info(f"#" * 50) | |
# logging.info(f"Results for size {size}. ") | |
# logging.info(f"#" * 50) | |
# logging.info("Occupancy metrics") | |
# logging.info(f"Recall: {recall*100:.2f}%") | |
# logging.info(f"Precision: {precision*100:.2f}%") | |
# logging.info(f"IoU: {iou*100:.2f}") | |
# recall_seg = results[size]["tp_seg"] / (results[size]["tp_seg"] + results[size]["fn_seg"]) | |
# precision_seg = results[size]["tp_seg"] / (results[size]["tp_seg"] + results[size]["fp_seg"]) | |
# iou_seg = results[size]["tp_seg"] / (results[size]["tp_seg"] + results[size]["fp_seg"] + results[size]["fn_seg"]) | |
# mean_iou = np.mean(np.nan_to_num(iou_seg)) | |
# Calculate hungarian matching | |
confusion_matrix = results[size]["confusion_seg"] | |
if mode == "hungarian": | |
confusion_matrix = confusion_matrix[np.argsort(assignments[1]), :] | |
confusion_matrix_tp = np.diag(confusion_matrix) | |
confusion_matrix_denom = confusion_matrix.sum(0) + confusion_matrix.sum(1) - confusion_matrix_tp | |
confusion_matrix_per_class_iou = confusion_matrix_tp[1:] / confusion_matrix_denom[1:] | |
confusion_matrix_miou = np.mean(np.nan_to_num(confusion_matrix_per_class_iou)) | |
# occupancy_recall_seg = results[size]["tp_recall_seg"] / results[size]["sum_recall_seg"] | |
weights = label_maps["weights"] | |
weights_val = np.array(list(weights.values())) | |
weighted_mean_iou = np.sum(weights_val * np.nan_to_num(confusion_matrix_per_class_iou)) / np.sum(weights_val) | |
results_table[3, size_i] = confusion_matrix_miou | |
results_table[4:, size_i] = confusion_matrix_per_class_iou | |
row_labels = [ | |
"IoU", "Precision", "Recall", | |
"mIoU", "car", "bicycle", "motorcycle", "truck", "other-vehicle", "person", | |
"road", "sidewalk", "building", "fence", "vegetation", "terrain", "pole", | |
"traffic-sign", "other-object" | |
] | |
column_headers = ["12.8m", "25.6m", "51.2m"] | |
result_str += f"\n# Benchmark Results for '{ply_checkname}' / Mode: {mode}\n" | |
result_str += "\n| | " + " | ".join(column_headers) + " |\n" | |
result_str += "|---------------|-------|-------|-------|\n" | |
for i in range(len(row_labels)): | |
row_values = results_table[i] | |
row_str = f"| {row_labels[i]:<13} | " + " | ".join(f"{v * 100:5.2f}" for v in row_values) + " |\n" | |
result_str += row_str | |
if i == 2: | |
result_str += "|---------------|-------|-------|-------|\n" | |
result_str += "\n" | |
if mode == "hungarian": | |
result_str += f"Reassignment: {np.argsort(assignments[1])}\n" | |
result_str += f"Mean IoU: {confusion_matrix_miou * 100:.2f}\n" | |
result_str += f"Weighted Mean IoU: {weighted_mean_iou * 100:.2f}\n\n" | |
print(result_str) | |
    if not GENERATE_PLY_FILES:
        os.makedirs(OUTPUT_PATH / ply_checkname, exist_ok=True)  # dir is otherwise only created for ply export
        with open(OUTPUT_PATH / ply_checkname / "results.md", "w") as file:
            file.write(result_str)
    if TEST_ALPHA_CUTOFFS:
        cutoff_metrics = \
            {i: {sv: {"precision": np.nan_to_num(100 * cutoff_results[i][sv]["tp"] / (cutoff_results[i][sv]["tp"] + cutoff_results[i][sv]["fp"])),
                      "recall": np.nan_to_num(100 * cutoff_results[i][sv]["tp"] / (cutoff_results[i][sv]["tp"] + cutoff_results[i][sv]["fn"])),
                      "IoU": np.nan_to_num(100 * cutoff_results[i][sv]["tp"] / (cutoff_results[i][sv]["tp"] + cutoff_results[i][sv]["fn"] + cutoff_results[i][sv]["fp"]))}
                 for sv in SEARCH_VALUES} for i in range(1, 16)}
        best_values = {i: SEARCH_VALUES[torch.argmax(torch.tensor([cutoff_metrics[i][sv]["IoU"] for sv in SEARCH_VALUES]))] for i in range(1, 16)}
        print(best_values)
    if CREATE_SIGMA_TRADEOFF_PLOT:
        plt.figure(figsize=(10, 8))
        plt.xlabel("Precision")
        plt.ylabel("Recall")
        plt.xlim([10, 70])
        # plt.ylim([0, 100])
        for i, val in enumerate(SIGMA_VALUES):
            tp, fp, tn, fn = trade_off_values[i]
            prec = 100 * tp / (tp + fp)
            recall = 100 * tp / (tp + fn)
            plt.scatter(prec, recall)
            plt.annotate(f"Sigma: {val}; IoU: {100 * tp / (tp + fp + fn):.2f}", (prec, recall))

        identifier = os.path.basename(cp_path)
        if FULL_EVAL:
            path = f"figures/inv{USE_ADDITIONAL_INVALIDS}_{VOXEL_SIZE:.1f}_mp{USE_GROW}_{identifier}.png"
        else:
            path = f"figures/inv{USE_ADDITIONAL_INVALIDS}_{DATASET_LENGTH}_{VOXEL_SIZE:.1f}_mp{USE_GROW}_{identifier}.png"
        if os.path.isfile(path):
            os.remove(path)
        plt.savefig(path)
        plt.show()
    if GENERATE_STATISTICS:
        statistics_raw = {"frameId": tframeIds, "TP": ttp, "FP": tfp, "TN": ttn, "FN": tfn, "invalids": tinval}
        with open("stats.pkl", "wb") as f:
            pickle.dump(statistics_raw, f)
        logging.info("Saved the statistics for further analysis.")


def downsample_and_predict(data, net, pts, factor, prediction_mode, vis=False, feat_vis=False):
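    """Query the network over the full voxel grid in 128x128x32 chunks and pool the
    results down to the 256x256x32 output grid (0.2m voxels).

    Density is max-pooled; segmentation scores are alpha- (or sigma-)weighted and
    avg-pooled before the argmax. With vis=True the full-resolution grid is kept
    for .ply export; `dino` is only filled when feat_vis is set, otherwise None.
    """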
    pts = pts.reshape(256 * factor, 256 * factor, 32 * factor, 3)

    if vis:
        sigmas = torch.zeros(256 * factor, 256 * factor, 32 * factor).numpy()
        segs = torch.zeros(256 * factor, 256 * factor, 32 * factor).numpy()
        if feat_vis:
            dino = torch.zeros(256 * factor, 256 * factor, 32 * factor, 768).numpy()
        else:
            dino = None
    else:
        sigmas = torch.zeros(256, 256, 32).numpy()
        segs = torch.zeros(256, 256, 32).numpy()
        dino = None

    chunk_size_x = chunk_size_y = 128
    chunk_size_z = 32
    n_chunks_x = int(256 * factor / chunk_size_x)
    n_chunks_y = int(256 * factor / chunk_size_y)
    n_chunks_z = int(32 * factor / chunk_size_z)

    if vis:
        factor = 1

    b_x = chunk_size_x // factor  # size of the mini blocks
    b_y = chunk_size_y // factor
    b_z = chunk_size_z // factor

    # Changed for efficiency: encode once per frame instead of once per chunk
    images = torch.stack(data["imgs"], dim=0).unsqueeze(0).to(device).float()
    poses = torch.tensor(np.stack(data["poses"], 0)).unsqueeze(0).to(device).float()
    projs = torch.tensor(np.stack(data["projs"], 0)).unsqueeze(0).to(device).float()
    poses = torch.inverse(poses[:, :1]) @ poses
    extra_args = {"images_alt": images * 0.5 + 0.5}
    net.compute_grid_transforms(projs, poses)

    torch.cuda.synchronize()
    encoding_start_time = time.time()
    net.encode(images, projs, poses, ids_encoder=[0], ids_render=[0], **extra_args)
    torch.cuda.synchronize()
    encoding_time = time.time() - encoding_start_time
    # print(f" - Encoding time: {encoding_time:.6f} seconds")

    net.set_scale(0)

    for i in range(n_chunks_x):
        for j in range(n_chunks_y):
            for k in range(n_chunks_z):
                pts_block = pts[i * chunk_size_x:(i + 1) * chunk_size_x, j * chunk_size_y:(j + 1) * chunk_size_y, k * chunk_size_z:(k + 1) * chunk_size_z]
                sigmas_block, segs_block, dino_feat_block = predict_grid(data, net, pts_block, prediction_mode)
                sigmas_block = sigmas_block.reshape(chunk_size_x, chunk_size_y, chunk_size_z)
                segs_block = segs_block.reshape(chunk_size_x, chunk_size_y, chunk_size_z, 19)
                if feat_vis:
                    dino_feat_block = dino_feat_block.reshape(chunk_size_x, chunk_size_y, chunk_size_z, dino_feat_block.size(-1))

                if USE_ALPHA_WEIGHTING:
                    alphas = 1 - torch.exp(-VOXEL_SIZE * sigmas_block)
                    segs_block = (alphas.unsqueeze(-1) * segs_block).unsqueeze(0)
                else:
                    segs_block = (sigmas_block.unsqueeze(-1) * segs_block).unsqueeze(0)

                if vis:
                    sigmas_block = sigmas_block.detach().cpu().numpy()
                    segs_pool = torch.argmax(segs_block, dim=-1).detach().cpu().numpy()
                    if feat_vis:
                        dino_feat_block = dino_feat_block.detach().cpu().numpy()
                else:
                    segs_pool_list = [F.avg_pool3d(segs_block[..., c], kernel_size=factor, stride=factor, padding=0)
                                      for c in range(segs_block.shape[-1])]
                    segs_pool = torch.stack(segs_pool_list, dim=-1).unsqueeze(0)
                    segs_pool = torch.argmax(segs_pool, dim=-1).detach().cpu().numpy()

                    # pool the observations
                    sigmas_block = F.max_pool3d(sigmas_block.unsqueeze(0), kernel_size=factor, stride=factor, padding=0).squeeze(0).detach().cpu().numpy()

                sigmas[i * b_x:(i + 1) * b_x, j * b_y:(j + 1) * b_y, b_z * k:b_z * (k + 1)] = sigmas_block
                segs[i * b_x:(i + 1) * b_x, j * b_y:(j + 1) * b_y, b_z * k:b_z * (k + 1)] = segs_pool
                if feat_vis:
                    dino[i * b_x:(i + 1) * b_x, j * b_y:(j + 1) * b_y, b_z * k:b_z * (k + 1), :] = dino_feat_block

    torch.cuda.empty_cache()

    if USE_GROW:
        sigmas = F.max_pool3d(torch.tensor(sigmas).unsqueeze(0), kernel_size=3, stride=1, padding=1).squeeze(0).numpy()

    return sigmas, segs, dino


def calculate_pca(dino, is_occupied_seg, net):
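    """Fit the encoder's PCA visualization on the features of occupied voxels, then
    map all voxel features to RGB colors in [0, 1]."""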
    dino = torch.Tensor(dino)
    visible_dino = dino[is_occupied_seg]
    # print(net.encoder.visualization.batch_rgb_mean, net.encoder.visualization.batch_rgb_comp)
    net.encoder.fit_visualization(visible_dino.flatten(0, -2), refit=True)
    return torch.clamp(net.encoder.transform_visualization(dino), min=-0.5, max=0.5).cpu().numpy() + 0.5


def use_custom_maxpool(_sigmas):
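    """Slow reference implementation of a 6-neighborhood (face-adjacent) max pool
    over the 256x256x32 sigma grid with zero padding; unused in the main flow."""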
    sigmas = torch.zeros(258, 258, 34)
    sigmas[1:257, 1:257, 1:33] = torch.tensor(_sigmas)
    sigmas_pooled = torch.zeros(256, 256, 32)
    for i in range(256):
        for j in range(256):
            for k in range(32):
                sigmas_pooled[i, j, k] = max(sigmas[i + 1, j + 1, k + 1],
                                             sigmas[i, j + 1, k + 1], sigmas[i + 1, j, k + 1], sigmas[i + 1, j + 1, k],
                                             sigmas[i + 2, j + 1, k + 1], sigmas[i + 1, j + 2, k + 1], sigmas[i + 1, j + 1, k + 2])
    return sigmas_pooled


def plot_images(images_dict):
    """The images dict should contain six images and six corresponding frame ids."""
    images = images_dict["images"]
    ids = images_dict["ids"]
    fig, axes = plt.subplots(3, 2, figsize=(10, 6))
    axes = axes.flatten()
    for i, img in enumerate(images):
        axes[i].imshow(img)
        axes[i].axis("off")
        axes[i].set_title(f"FrameId: {ids[i]}")
    plt.subplots_adjust(wspace=0.01, hspace=0.01)
    plt.show()


def plot_image_at_frame_id(dataset, frame_id):
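    """Show the input image of the first datapoint whose frame id matches frame_id."""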
    for i in range(len(dataset)):
        sequence, id, is_right = dataset._datapoints[i]
        if id == frame_id:
            data = dataset[i]
            plt.figure(figsize=(10, 4))
            plt.imshow(((data["imgs"][0] + 1) / 2).permute(1, 2, 0))
            plt.gca().set_axis_off()
            plt.show()
            return


def identify_additional_invalids(target):
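    """Heuristically flag extra invalid voxels: empty voxels in the lower part of the
    grid (z < 7) whose z-column contains no labeled occupied voxel below them."""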
    # Note: the NumPy implementation is a bit faster here (about 0.1 seconds per iteration)
    _t = np.concatenate([np.zeros([256, 256, 1]), target], axis=2)
    invalids = np.cumsum(np.logical_and(_t != 255, _t != 0), axis=2)[:, :, :32] == 0
    # Torch equivalent:
    # _t = torch.cat([torch.zeros([256, 256, 1], device=device, dtype=torch.int32), torch.tensor(target, dtype=torch.int32).to(device)], dim=2)
    # invalids = torch.cumsum((_t != 255) & (_t != 0), axis=2)[:, :, :32] == 0

    # height cut-off (z > 6 ==> not invalid)
    invalids[:, :, 7:] = 0
    # only empty voxels matter
    invalids[target != 0] = 0
    # return invalids.cpu().numpy()  (torch variant)
    return invalids


def predict_grid(data_batch, net, points, prediction_mode):
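    """Evaluate the already-encoded network at the given query points and return
    per-point densities, segmentation scores, and DINO features (encoding happens
    once per frame in downsample_and_predict)."""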
    # Removed for efficiency (encoding now happens once per frame in downsample_and_predict):
    # images = torch.stack(data_batch["imgs"], dim=0).unsqueeze(0).to(device).float()
    # poses = torch.tensor(np.stack(data_batch["poses"], 0)).unsqueeze(0).to(device).float()
    # projs = torch.tensor(np.stack(data_batch["projs"], 0)).unsqueeze(0).to(device).float()
    # poses = torch.inverse(poses[:, :1]) @ poses
    # extra_args = {"images_alt": images * 0.5 + 0.5}
    # net.compute_grid_transforms(projs, poses)
    # net.encode(images, projs, poses, ids_encoder=[0], ids_render=[0], **extra_args)
    # net.set_scale(0)
    # q_pts = get_pts(X_RANGE, Y_RANGE, Z_RANGE, p_res[1], p_res_y, p_res[0])
    # q_pts = q_pts.to(device).reshape(1, -1, 3)
    # _, invalid, sigmas = net.forward(q_pts)

    points = points.reshape(1, -1, 3)
    if prediction_mode is not None:
        dino_feat, invalid, sigmas, segs = net.forward(points, predict_segmentation=True, prediction_mode=prediction_mode)
    else:
        dino_feat, invalid, sigmas, segs = net.forward(points, predict_segmentation=True)
    return sigmas, segs, dino_feat


def convert_voxels(arr, map_dict):
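    """Remap voxel labels element-wise through map_dict."""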
    f = np.vectorize(map_dict.__getitem__)
    return f(arr)


def compute_occupancy_numbers_segmentation(y_pred, y_true, fov_mask, labels):
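    """Per-class TP/FP/TN/FN (15 classes) plus the 16x16 confusion matrix, computed
    over voxels that are valid (label != 255) and inside the camera FOV."""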
    label_ids = list(labels.keys())[1:]
    mask = y_true != 255
    mask = np.logical_and(mask, fov_mask)
    mask = mask.flatten()
    y_pred = y_pred.flatten()[mask]
    y_true = y_true.flatten()[mask]

    tp = np.zeros(len(label_ids))
    fp = np.zeros(len(label_ids))
    fn = np.zeros(len(label_ids))
    tn = np.zeros(len(label_ids))
    for label_id in label_ids:
        tp[label_id - 1] = np.sum(np.logical_and(y_true == label_id, y_pred == label_id))
        fp[label_id - 1] = np.sum(np.logical_and(y_true != label_id, y_pred == label_id))
        fn[label_id - 1] = np.sum(np.logical_and(y_true == label_id, y_pred != label_id))
        tn[label_id - 1] = np.sum(np.logical_and(y_true != label_id, y_pred != label_id))

    dim_conf = len(label_ids) + 1
    bincount_values = dim_conf * y_true + y_pred
    confusion_matrix = np.bincount(bincount_values, minlength=dim_conf * dim_conf).reshape(dim_conf, dim_conf)
    return tp, fp, tn, fn, confusion_matrix


def compute_occupancy_recall_segmentation(y_pred, y_true, fov_mask, labels):
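    """Per-class counts for occupancy recall: how many ground-truth voxels of each
    class exist, and how many of them are predicted occupied (by any class)."""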
    label_ids = list(labels.keys())[1:]
    mask = y_true != 255
    mask = np.logical_and(mask, fov_mask)
    mask = mask.flatten()
    y_pred = y_pred.flatten()[mask]
    y_true = y_true.flatten()[mask]

    tp = np.zeros(len(label_ids))
    total = np.zeros(len(label_ids))  # renamed from `sum` to avoid shadowing the builtin
    for label_id in label_ids:
        tp[label_id - 1] = np.sum(np.logical_and(y_true == label_id, y_pred > 0))
        total[label_id - 1] = np.sum(y_true == label_id)
    return tp, total


def compute_occupancy_numbers(y_pred, y_true, fov_mask):
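    """Class-agnostic occupancy TP/FP/TN/FN over valid (label != 255) voxels inside the FOV."""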
    mask = y_true != 255
    mask = np.logical_and(mask, fov_mask)
    mask = mask.flatten()
    y_pred = y_pred.flatten()
    y_true = y_true.flatten()
    occ_true = y_true[mask] > 0
    occ_pred = y_pred[mask] > 0
    tp = np.sum(np.logical_and(occ_true == 1, occ_pred == 1))
    fp = np.sum(np.logical_and(occ_true == 0, occ_pred == 1))
    fn = np.sum(np.logical_and(occ_true == 1, occ_pred == 0))
    tn = np.sum(np.logical_and(occ_true == 0, occ_pred == 0))
    return tp, fp, tn, fn


if __name__ == "__main__":
    # with torch.cuda.amp.autocast(dtype=torch.float16):
    with torch.no_grad():
        main()