# Upload-page artifact (commented out so the module parses):
# bndl's picture
# Upload 115 files
# 4f5540c
import math
from operator import index
import numpy as np
import torch
import torch_geometric
from typing import Dict, Iterable, Callable, Tuple
from polymerlearn.utils import make_like_batch
from polymerlearn.utils.graph_prep import get_AG_info
from polymerlearn.explain.custom_gcam import LayerGradCam
# Source: https://medium.com/the-dl/how-to-use-pytorch-hooks-5041d777f904
class FeatureExtractor(torch.nn.Module):
    '''
    Wraps a model and records the outputs of selected intermediate layers
    using forward hooks.
    Source: https://medium.com/the-dl/how-to-use-pytorch-hooks-5041d777f904
    '''
    def __init__(self, model: torch.nn.Module, use_mono: bool = False):
        super().__init__()
        self.model = model
        self.use_mono = use_mono
        # Mono models expose a single shared 'sage' layer that runs twice per
        # forward pass (acid pass then glycol pass); otherwise the model has
        # separate 'Asage' and 'Gsage' layers.
        self.layers = ['sage'] if self.use_mono else ['Asage', 'Gsage']
        if self.use_mono:
            # None is the "not yet captured" sentinel used by the mono hook.
            self._features = {layer: None for layer in ['Asage', 'Gsage']}
        else:
            self._features = {layer: torch.empty(0) for layer in self.layers}
        module_map = dict(self.model.named_modules())
        for layer_id in self.layers:
            # The hook fires on every forward call of this layer and stashes
            # its output in self._features.
            module_map[layer_id].register_forward_hook(
                self.save_outputs_hook(layer_id))

    def save_outputs_hook(self, layer_id: str) -> Callable:
        '''
        Build the hook that stores a layer's output under the right key.
        '''
        if self.use_mono:
            # Shared layer: the first call in a forward pass is the acid
            # representation, the second is the glycol representation.
            def fn(_, __, output):
                if self._features['Asage'] is None:
                    self._features['Asage'] = output
                else:
                    self._features['Gsage'] = output
        else:
            def fn(_, __, output):
                self._features[layer_id] = output
        return fn

    def forward(self, input_tup) -> Dict[str, torch.Tensor]:
        '''
        Run the wrapped model on ``*input_tup`` and return the captured
        per-layer outputs keyed by 'Asage'/'Gsage'.
        '''
        _ = self.model(*input_tup)
        if not self.use_mono:
            return self._features
        # Mono mode: hand back the captured pair and reset the sentinels so
        # the next forward pass starts fresh.
        captured = self._features
        self._features = {layer: None for layer in ['Asage', 'Gsage']}
        return captured
def parse_batches(
        batch: torch_geometric.data.Batch,
        add_test: torch.Tensor):
    '''
    Flatten a polymer sample into the positional tuple the GNN expects.

    Args:
        batch: Combined acid/glycol data; split into per-family batches by
            ``make_like_batch``.
        add_test: Additional (non-graph) features for the sample; may be a
            tensor, array, or sequence.

    Returns:
        Tuple of (A_X, A_edge_index, A_batch, G_X, G_edge_index, G_batch,
        additional features as a float32 tensor).
    '''
    Abatch, Gbatch = make_like_batch(batch)
    # torch.as_tensor avoids the copy (and the UserWarning) that
    # torch.tensor(...) emits when add_test is already a tensor.
    return (Abatch.x,
            Abatch.edge_index,
            Abatch.batch,
            Gbatch.x,
            Gbatch.edge_index,
            Gbatch.batch,
            torch.as_tensor(add_test, dtype=torch.float32))
def index_to_batch_mapper(batch, ratio = 0.5):
    '''
    Build a reverse lookup from row indices of a SAGPool output back to the
    index of the original input sample each retained node came from.

    Args:
        batch: 1-D tensor assigning each node to a sample index.
        ratio (float): SAGPool keep ratio; each sample keeps
            ``ceil(node_count * ratio)`` nodes.

    Returns:
        dict mapping pooled-row index -> original sample index.
    '''
    n_samples = int(max(batch).item()) + 1
    # Nodes kept per sample after pooling.
    kept_counts = [math.ceil(torch.sum(batch == s).item() * ratio)
                   for s in range(n_samples)]
    ind_map = {}
    start = 0
    for sample_idx, count in enumerate(kept_counts):
        for row in range(start, start + count):
            ind_map[row] = sample_idx
        start += count
    return ind_map
# Aggregators applied across the hidden dimension of per-molecule score
# matrices. Defined with `def` rather than lambda assignment (PEP 8 E731).
def dim1_sum(t: torch.Tensor) -> torch.Tensor:
    '''Sum of ``t`` along dim 1 (one value per row).'''
    return torch.sum(t, dim=1)


def dim1_L1norm(t: torch.Tensor) -> torch.Tensor:
    '''L1 norm of ``t`` along dim 1 (one value per row).'''
    return torch.norm(t, p=1, dim=1)
class PolymerGNNExplainer:
    '''
    Explainer for the PolymerGNN. Uses Grad CAM (custom Captum LayerGradCam)
    for attributions plus intermediate SAGE-layer features captured with
    FeatureExtractor to map attributions back through the max pooling.
    '''
    def __init__(self, model: torch.nn.Module, explain_layer = 'fc1',
            pool_ratio = 0.5, use_mono: bool = False):
        '''
        Args:
            model: Trained PolymerGNN to explain.
            explain_layer (str): Attribute name of the model layer whose
                input receives the Grad CAM attribution.
            pool_ratio (float): SAGPool keep ratio; must match the model's
                pooling so pooled indices map back to the right samples.
            use_mono (bool): Whether the model shares one SAGE layer for
                acids and glycols.
        '''
        self.model = model
        self.explain_layer = explain_layer
        self.ratio = pool_ratio
        self.use_mono = use_mono
        self.gcam = LayerGradCam(model, getattr(model, explain_layer))
        self.extractor = FeatureExtractor(model, use_mono = self.use_mono)

    def get_attribution(self,
            batch: Tuple,
            add_test: torch.Tensor,
            mol_rep_agg = dim1_sum):
        '''
        Get explanation for a given sample from the dataset on the model.

        ..note:: Assumes max pooling. Would need to implement another
            expansion to work backwards through another pooling method.

        Args:
            batch: One sample's combined acid/glycol batch.
            add_test: Additional (non-graph) features for the sample.
            mol_rep_agg (callable or None): Aggregator applied over the
                hidden dimension of each molecule's score matrix; identity
                when None.

        Returns:
            dict with keys 'A' (acid scores), 'G' (glycol scores) and
            'add' (attributions of the additional features).
        '''
        # Parse the batch into the positional input tuple for the GNN.
        batches_tup = parse_batches(batch, add_test)
        input_tup = tuple([batches_tup[j] for j in range(1, len(batches_tup))])

        if mol_rep_agg is None:
            mol_rep_agg = lambda x: x

        # Compute the attribution from captum (w.r.t. explain_layer's input).
        attribution = self.gcam.attribute(
            batches_tup[0],
            additional_forward_args = input_tup,
            attribute_to_layer_input = True
        )

        # Get intermediate SAGE features in a feedforward step.
        features = self.extractor(batches_tup)

        def attr_scores(key = 'A', hc = 32):
            # hc is the hidden-channel width of the SAGE output. It is both
            # the offset of the glycol slice within the attribution vector
            # and the per-molecule score width (previously hard-coded as 32,
            # which silently ignored this parameter).
            bind = 2 if key == 'A' else -2  # Location of the batch vector in batches_tup
            add_to_bottom = 0 if key == 'A' else hc
            # Map pooled node indices back to input samples.
            ind_map = index_to_batch_mapper(batches_tup[bind], ratio = self.ratio)
            # Which captured layer to read, e.g. 'Asage' / 'Gsage'.
            str_key = '{}sage'.format(key)
            # Argmax over nodes per hidden channel: which node survived the
            # max pooling for each channel.
            feat_argmax = torch.argmax(features[str_key][0], dim = 0)
            # Expand scores backward from the max pooling: each channel's
            # attribution is credited to the molecule whose node won the max.
            scores = torch.zeros((len(set(ind_map.values())), hc))
            for j in range(feat_argmax.shape[0]):
                score_ind = ind_map[feat_argmax[j].item()]
                scores[score_ind, j] = attribution[add_to_bottom + j]
            return scores

        # Aggregate molecular representations together in scores:
        scores = {
            'A': mol_rep_agg(attr_scores('A')).detach().clone(),
            'G': mol_rep_agg(attr_scores('G')).detach().clone()
        }

        # Score individual additional attributes (last num_add entries of
        # the attribution vector):
        num_add = add_test.shape[0]
        scores['add'] = attribution[-num_add:].detach().clone()
        return scores

    def get_testing_explanation(self,
            dataset,
            test_inds = None,
            add_data_keys = ('Mw', 'AN', 'OHN', '%TMP')):
        '''
        Explain every sample in a test split and summarize per-name scores.

        Args:
            dataset: Dataset object from which to extract.
            test_inds (list of ints, optional): If given, extracts testing
                data from the dataset with respect to the indices.
            add_data_keys (sequence of str): Names for the additional
                features; should match their order/length. (Tuple default
                avoids a mutable default argument; it is only read.)

        Returns:
            Tuple of (per-sample score dicts, acid-name -> scores,
            glycol-name -> scores, additional-name -> scores).
        '''
        if test_inds is None:
            test_batch, Ytest, add_test = dataset.get_test()
            test_inds = dataset.test_mask
        else:
            test_batch = dataset.make_dataloader_by_mask(test_inds)
            Ytest = np.array(dataset.get_Y_by_mask(test_inds))
            add_test = dataset.get_additional_by_mask(test_inds)

        exp_summary = []
        # Summary accumulators for acid/glycol/additional scores by name.
        acid_key = {a:[] for a in dataset.acid_names}
        glycol_key = {g:[] for g in dataset.glycol_names}
        additional_key = {a:[] for a in add_data_keys}

        acids, glycols, _, _ = get_AG_info(dataset.data)

        for i in range(Ytest.shape[0]):
            # L1 norm across hidden channels -> one score per molecule.
            scores = self.get_attribution(test_batch[i], add_test[i], mol_rep_agg=dim1_L1norm)
            Ti = test_inds[i]
            scores['table_ind'] = Ti
            for a in range(len(acids[Ti])):
                # Single-acid samples yield a 0-dim aggregate; index otherwise.
                Ascore = scores['A'].item() if len(acids[Ti]) == 1 else scores['A'][a].item()
                acid_key[acids[Ti][a]].append(Ascore)
            for g in range(len(glycols[Ti])):
                Gscore = scores['G'].item() if len(glycols[Ti]) == 1 else scores['G'][g].item()
                glycol_key[glycols[Ti][g]].append(Gscore)
            # Assign attributions to additional elements (negative index =
            # position from the end, equivalent to j here since scores['add']
            # has length len(add_data_keys)):
            for j in range(len(add_data_keys)):
                v = scores['add'][j - len(add_data_keys)].item()
                scores[add_data_keys[j]] = v
                additional_key[add_data_keys[j]].append(v)
            exp_summary.append(scores)

        return exp_summary, acid_key, glycol_key, additional_key