Spaces:

luisoala
/

raw2logit

Runtime error

willis

cleanup

712050d almost 4 years ago

13.1 kB

	"""
	Utilities for other scripts
	"""

	import os
	import shutil

	import random

	import torch
	import mlflow
	from mlflow.tracking import MlflowClient
	import numpy as np

	from IPython.display import display, Markdown

	from b2sdk.v1 import *

	import argparse

	from torch import nn


	def str2bool(string):
	return string == 'True'


	def np2torch(nparray):
	"""Convert numpy array to torch tensor
	For array with more than 3 channels, it is better to use an input array in the format BxHxWxC

	Args:
	numpy array (ndarray) BxHxWxC
	Returns:
	torch tensor (tensor) BxCxHxW"""

	tensor = torch.Tensor(nparray)

	if tensor.ndim == 2:
	return tensor
	if tensor.ndim == 3:
	height, width, channels = tensor.shape
	if channels <= 3: # Single image with more channels (HxWxC)
	return tensor.permute(2, 0, 1)

	if tensor.ndim == 4: # More images with more channels (BxHxWxC)
	return tensor.permute(0, 3, 1, 2)

	return tensor


	def torch2np(torchtensor):
	"""Convert torch tensor to numpy array
	For tensor with more than 3 channels or batch, it is better to use an input tensor in the format BxCxHxW

	Args:
	torch tensor (tensor) BxCxHxW
	Returns:
	numpy array (ndarray) BxHxWxC"""

	ndarray = torchtensor.detach().cpu().numpy().astype(np.float32)

	if ndarray.ndim == 3: # Single image with more channels (CxHxW)
	channels, height, width = ndarray.shape
	if channels <= 3:
	return ndarray.transpose(1, 2, 0)

	if ndarray.ndim == 4: # More images with more channels (BxCxHxW)
	return ndarray.transpose(0, 2, 3, 1)

	return ndarray


	def set_random_seed(seed):
	np.random.seed(seed) # cpu vars
	torch.manual_seed(seed) # cpu vars
	random.seed(seed) # Python
	if torch.cuda.is_available():
	torch.cuda.manual_seed(seed)
	torch.cuda.manual_seed_all(seed) # gpu vars
	torch.backends.cudnn.deterministic = True # needed
	torch.backends.cudnn.benchmark = False


	def normalize(img):
	"""Normalize images

	Args:
	imgs (ndarray): image to normalize --> size: (Height,Width,Channels)
	Returns:
	normalized (ndarray): normalized image
	mu (ndarray): mean
	sigma (ndarray): standard deviation
	"""

	img = img.astype(float)

	if len(img.shape) == 2:
	img = img[:, :, np.newaxis]

	height, width, channels = img.shape

	mu, sigma = np.empty(channels), np.empty(channels)

	for ch in range(channels):
	temp_mu = img[:, :, ch].mean()
	temp_sigma = img[:, :, ch].std()

	img[:, :, ch] = (img[:, :, ch] - temp_mu) / (temp_sigma + 1e-4)

	mu[ch] = temp_mu
	sigma[ch] = temp_sigma

	return img, mu, sigma


	def b2_list_files(folder=''):
	bucket = get_b2_bucket()
	for file_info, _ in bucket.ls(folder, show_versions=False):
	print(file_info.file_name)


	def get_b2_bucket():
	bucket_name = 'perturbed-minds'
	application_key_id = '003d6b042de536a0000000008'
	application_key = 'K003HMNxnoa91Dy9c0V8JVCKNUnwR9U'
	info = InMemoryAccountInfo()
	b2_api = B2Api(info)
	b2_api.authorize_account('production', application_key_id, application_key)
	bucket = b2_api.get_bucket_by_name(bucket_name)
	return bucket


	def b2_download_folder(b2_dir, local_dir, force_download=False, mirror_folder=True):
	"""Downloads a folder from the b2 bucket and optionally cleans
	up files that are no longer on the server

	Args:
	b2_dir (str): path to folder on the b2 server
	local_dir (str): path to folder on the local machine
	force_download (bool, optional): force the download, if set to `False`,
	files with matching names on the local machine will be skipped
	mirror_folder (bool, optional): if set to `True`, files that are found in
	the local directory, but are not on the server will be deleted
	"""
	bucket = get_b2_bucket()

	if not os.path.exists(local_dir):
	os.makedirs(local_dir)
	elif not force_download:
	return

	download_files = [file_info.file_name.split(b2_dir + '/')[-1]
	for file_info, _ in bucket.ls(b2_dir, show_versions=False)]

	for file_name in download_files:
	if file_name.endswith('/.bzEmpty'): # subdirectory, download recursively
	subdir = file_name.replace('/.bzEmpty', '')
	if len(subdir) > 0:
	b2_subdir = os.path.join(b2_dir, subdir)
	local_subdir = os.path.join(local_dir, subdir)
	if b2_subdir != b2_dir:
	b2_download_folder(b2_subdir, local_subdir, force_download=force_download,
	mirror_folder=mirror_folder)
	else: # file
	b2_file = os.path.join(b2_dir, file_name)
	local_file = os.path.join(local_dir, file_name)
	if not os.path.exists(local_file) or force_download:
	print(f"downloading b2://{b2_file} -> {local_file}")
	bucket.download_file_by_name(b2_file, DownloadDestLocalFile(local_file))

	if mirror_folder: # remove all files that are not on the b2 server anymore
	for i, file in enumerate(download_files):
	if file.endswith('/.bzEmpty'): # subdirectory, download recursively
	download_files[i] = file.replace('/.bzEmpty', '')
	for file_name in os.listdir(local_dir):
	if file_name not in download_files:
	local_file = os.path.join(local_dir, file_name)
	print(f"deleting {local_file}")
	if os.path.isdir(local_file):
	shutil.rmtree(local_file)
	else:
	os.remove(local_file)


	def get_name(obj):
	return obj.__name__ if hasattr(obj, '__name__') else type(obj).__name__


	def get_mlflow_model_by_name(experiment_name, run_name,
	tracking_uri="http://deplo-mlflo-1ssxo94f973sj-890390d809901dbf.elb.eu-central-1.amazonaws.com",
	download_model=True):
	# 0. mlflow basics
	mlflow.set_tracking_uri(tracking_uri)
	os.environ["AWS_ACCESS_KEY_ID"] = "#TODO: add your AWS access key if you want to write your results to our collaborative lab server"
	os.environ["AWS_SECRET_ACCESS_KEY"] = "#TODO: add your AWS seceret access key if you want to write your results to our collaborative lab server"

	# # 1. use get_experiment_by_name to get experiment objec
	experiment = mlflow.get_experiment_by_name(experiment_name)

	# # 2. use search_runs with experiment_id for string search query
	if os.path.isfile('cache/runs_names.pkl'):
	runs = pd.read_pickle('cache/runs_names.pkl')
	if runs['tags.mlflow.runName'][runs['tags.mlflow.runName'] == run_name].empty:
	# returns a pandas data frame where each row is a run (if several exist under that name)
	runs = fetch_runs_list_mlflow(experiment)
	else:
	# returns a pandas data frame where each row is a run (if several exist under that name)
	runs = fetch_runs_list_mlflow(experiment)

	# 3. get the selected run between all runs inside the selected experiment
	run = runs.loc[runs['tags.mlflow.runName'] == run_name]

	# 4. check if there is only a run with that name
	assert len(run) == 1, "More runs with this name"
	index_run = run.index[0]
	artifact_uri = run.loc[index_run, 'artifact_uri']

	# 5. load state_dict of your run
	state_dict = mlflow.pytorch.load_state_dict(artifact_uri)

	# 6. load model of your run
	DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
	# model = mlflow.pytorch.load_model(os.path.join(
	# artifact_uri, "model"), map_location=torch.device(DEVICE))
	model = fetch_from_mlflow(os.path.join(
	artifact_uri, "model"), use_cache=True, download_model=download_model)

	return state_dict, model


	def data_loader_mean_and_std(data_loader, transform=None):
	means = []
	stds = []
	for x, y in data_loader:
	if transform is not None:
	x = transform(x)
	means.append(x.mean(dim=(0, 2, 3)).unsqueeze(0))
	stds.append(x.std(dim=(0, 2, 3)).unsqueeze(0))
	return torch.cat(means).mean(dim=0), torch.cat(stds).mean(dim=0)


	def fetch_runs_list_mlflow(experiment):
	runs = mlflow.search_runs(experiment.experiment_id)
	runs.to_pickle('cache/runs_names.pkl') # where to save it, usually as a .pkl
	return runs


	def fetch_from_mlflow(uri, type='', use_cache=True, download_model=True):
	cache_loc = os.path.join('cache', uri.split('//')[1]) + '.pt'
	if use_cache and os.path.exists(cache_loc):
	print(f'loading cached model from {cache_loc} ...')
	model = torch.load(cache_loc)
	else:
	print(f'fetching model from {uri} ...')
	model = mlflow.pytorch.load_model(uri)
	os.makedirs(os.path.dirname(cache_loc), exist_ok=True)
	if download_model:
	torch.save(model, cache_loc, pickle_module=mlflow.pytorch.pickle_module)
	if type == 'processor':
	processor = model.processor
	model.processor = None
	del model # free up memory space
	return processor
	if type == 'classifier':
	classifier = model.classifier
	model.classifier = None
	del model # free up memory space
	return classifier
	return model


	def display_mlflow_run_info(run):
	uri = mlflow.get_tracking_uri()
	experiment_id = run.info.experiment_id
	experiment_name = mlflow.get_experiment(experiment_id).name
	run_id = run.info.run_id
	run_name = run.data.tags['mlflow.runName']
	experiment_url = f'{uri}/#/experiments/{experiment_id}'
	run_url = f'{experiment_url}/runs/{run_id}'

	print(f'view results at {run_url}')
	display(Markdown(
	f"[<a href='{experiment_url}'>experiment {experiment_id} '{experiment_name}'</a>]"
	f" > "
	f"[<a href='{run_url}'>run '{run_name}' {run_id}</a>]"
	))
	print('')


	def get_train_test_indices_drone(df, frac, seed=None):
	""" Split indices of a DataFrame with binary and balanced labels into balanced subindices

	Args:
	df (pd.DataFrame): {0,1}-labeled data
	frac (float): fraction of indicies in first subset
	random_seed (int): random seed used as random state in np.random and as argument for random.seed()
	Returns:
	train_indices (torch.tensor): balanced subset of indices corresponding to rows in the DataFrame
	test_indices (torch.tensor): balanced subset of indices corresponding to rows in the DataFrame
	"""

	split_idx = int(len(df) * frac / 2)
	df_with = df[df['label'] == 1]
	df_without = df[df['label'] == 0]

	np.random.seed(seed)
	df_with_train = df_with.sample(n=split_idx, random_state=seed)
	df_with_test = df_with.drop(df_with_train.index)

	df_without_train = df_without.sample(n=split_idx, random_state=seed)
	df_without_test = df_without.drop(df_without_train.index)

	train_indices = list(df_without_train.index) + list(df_with_train.index)
	test_indices = list(df_without_test.index) + list(df_with_test.index)

	""""
	print('fraction of 1-label in train set: {}'.format(len(df_with_train)/(len(df_with_train) + len(df_without_train))))
	print('fraction of 1-label in test set: {}'.format(len(df_with_test)/(len(df_with_test) + len(df_with_test))))
	"""

	return train_indices, test_indices


	# def smp_get_loss(loss):
	# if loss == "Dice":
	# return smp.losses.DiceLoss(mode='binary', from_logits=True)
	# if loss == "BCE":
	# return nn.BCELoss()
	# elif loss == "BCEWithLogits":
	# return smp.losses.BCEWithLogitsLoss()
	# elif loss == "DicyBCE":
	# from pytorch_toolbelt import losses as ptbl
	# return ptbl.JointLoss(ptbl.DiceLoss(mode='binary', from_logits=False),
	# nn.BCELoss(),
	# first_weight=args.dice_weight,
	# second_weight=args.bce_weight)


	# Adversarial setup

	def l2_regularization(x, y):
	return ((x - y) ** 2).sum()


	class AuxLoss(nn.Module):
	def __init__(self, loss_aux, processor_adv, processor_default, weight=1):
	super().__init__()
	self.loss_aux = loss_aux
	self.weight = weight
	self.processor_adv = processor_adv
	self.processor_default = processor_default

	def forward(self, x):
	with torch.no_grad():
	x_reference = self.processor_default(x)
	x_processed = self.processor.buffer['processed_rgb']
	return self.weight * self.loss_aux(x_reference, x_processed)


	class WeightedLoss(nn.Module):
	def __init__(self, loss, weight=1):
	super().__init__()
	self.loss = loss
	self.weight = weight

	def forward(self, x, y):
	return self.weight * self.loss(x, y)

	def __repr__(self):
	return f'{self.weight} * {get_name(self.loss)}'