| # Copyright 2021 DeepMind Technologies Limited | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| """Modules and code used in the core part of AlphaFold. | |
| The structure generation code is in 'folding.py'. | |
| """ | |
| import functools | |
| from alphafold.common import residue_constants | |
| from alphafold.model import all_atom | |
| from alphafold.model import common_modules | |
| from alphafold.model import folding | |
| from alphafold.model import layer_stack | |
| from alphafold.model import lddt | |
| from alphafold.model import mapping | |
| from alphafold.model import prng | |
| from alphafold.model import quat_affine | |
| from alphafold.model import utils | |
| import haiku as hk | |
| import jax | |
| import jax.numpy as jnp | |
| from alphafold.model.r3 import Rigids, Rots, Vecs | |
| def softmax_cross_entropy(logits, labels): | |
| """Computes softmax cross entropy given logits and one-hot class labels.""" | |
| loss = -jnp.sum(labels * jax.nn.log_softmax(logits), axis=-1) | |
| return jnp.asarray(loss) | |
| def sigmoid_cross_entropy(logits, labels): | |
| """Computes sigmoid cross entropy given logits and multiple class labels.""" | |
| log_p = jax.nn.log_sigmoid(logits) | |
| # log(1 - sigmoid(x)) = log_sigmoid(-x), the latter is more numerically stable | |
| log_not_p = jax.nn.log_sigmoid(-logits) | |
| loss = -labels * log_p - (1. - labels) * log_not_p | |
| return jnp.asarray(loss) | |
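# Illustrative usage (not part of the original module): a minimal sketch of the
# two cross-entropy helpers above. softmax_cross_entropy expects one-hot labels
# over the class axis and returns one value per example; sigmoid_cross_entropy
# expects independent per-class {0, 1} targets and returns one value per class.
def _demo_cross_entropy():
  logits = jnp.array([[2.0, -1.0, 0.5]])                 # [batch=1, classes=3]
  one_hot_labels = jax.nn.one_hot(jnp.array([0]), 3)     # [1, 3]
  multi_labels = jnp.array([[1.0, 0.0, 1.0]])            # [1, 3]
  ce = softmax_cross_entropy(logits=logits, labels=one_hot_labels)  # shape [1]
  bce = sigmoid_cross_entropy(logits=logits, labels=multi_labels)   # shape [1, 3]
  return ce, bce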
| def apply_dropout(*, tensor, safe_key, rate, is_training, broadcast_dim=None): | |
| """Applies dropout to a tensor.""" | |
| if is_training: # and rate != 0.0: | |
| shape = list(tensor.shape) | |
| if broadcast_dim is not None: | |
| shape[broadcast_dim] = 1 | |
| keep_rate = 1.0 - rate | |
| keep = jax.random.bernoulli(safe_key.get(), keep_rate, shape=shape) | |
| return keep * tensor / keep_rate | |
| else: | |
| return tensor | |
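# Illustrative usage (not part of the original module): apply_dropout with a
# shared ("broadcast") mask. With broadcast_dim=0 the same mask is reused along
# the first axis, matching the per-row shared dropout used by dropout_wrapper;
# surviving entries are rescaled by 1 / keep_rate, and is_training=False is a
# no-op.
def _demo_apply_dropout():
  x = jnp.ones((4, 8, 16))
  dropped = apply_dropout(tensor=x,
                          safe_key=prng.SafeKey(jax.random.PRNGKey(0)),
                          rate=0.25, is_training=True, broadcast_dim=0)
  passthrough = apply_dropout(tensor=x,
                              safe_key=prng.SafeKey(jax.random.PRNGKey(1)),
                              rate=0.25, is_training=False)  # returned unchanged
  return dropped, passthrough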
| def dropout_wrapper(module, | |
| input_act, | |
| mask, | |
| safe_key, | |
| global_config, | |
| output_act=None, | |
| is_training=True, | |
| scale_rate=1.0, | |
| **kwargs): | |
| """Applies module + dropout + residual update.""" | |
| if output_act is None: | |
| output_act = input_act | |
| gc = global_config | |
| residual = module(input_act, mask, is_training=is_training, **kwargs) | |
| dropout_rate = 0.0 if gc.deterministic else module.config.dropout_rate | |
| if module.config.shared_dropout: | |
| if module.config.orientation == 'per_row': | |
| broadcast_dim = 0 | |
| else: | |
| broadcast_dim = 1 | |
| else: | |
| broadcast_dim = None | |
| residual = apply_dropout(tensor=residual, | |
| safe_key=safe_key, | |
| rate=dropout_rate * scale_rate, | |
| is_training=is_training, | |
| broadcast_dim=broadcast_dim) | |
| new_act = output_act + residual | |
| return new_act | |
| def create_extra_msa_feature(batch): | |
| """Expand extra_msa into 1hot and concat with other extra msa features. | |
| We do this as late as possible as the one_hot extra msa can be very large. | |
| Arguments: | |
| batch: a dictionary with the following keys: | |
| * 'extra_msa': [N_extra_seq, N_res] MSA that wasn't selected as a cluster | |
| centre. Note that this is not one-hot encoded. | |
| * 'extra_has_deletion': [N_extra_seq, N_res] Whether there is a deletion to | |
| the left of each position in the extra MSA. | |
| * 'extra_deletion_value': [N_extra_seq, N_res] The number of deletions to | |
| the left of each position in the extra MSA. | |
| Returns: | |
| Concatenated tensor of extra MSA features. | |
| """ | |
| # 23 = 20 amino acids + 'X' for unknown + gap + bert mask | |
| msa_1hot = jax.nn.one_hot(batch['extra_msa'], 23) | |
| msa_feat = [msa_1hot, | |
| jnp.expand_dims(batch['extra_has_deletion'], axis=-1), | |
| jnp.expand_dims(batch['extra_deletion_value'], axis=-1)] | |
| return jnp.concatenate(msa_feat, axis=-1) | |
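# Illustrative usage (not part of the original module): a toy batch for
# create_extra_msa_feature. With 23 residue classes plus the two deletion
# features, each position ends up with 25 channels.
def _demo_create_extra_msa_feature():
  n_extra_seq, n_res = 5, 7
  batch = {
      'extra_msa': jnp.zeros((n_extra_seq, n_res), dtype=jnp.int32),
      'extra_has_deletion': jnp.zeros((n_extra_seq, n_res)),
      'extra_deletion_value': jnp.zeros((n_extra_seq, n_res)),
  }
  feat = create_extra_msa_feature(batch)   # [n_extra_seq, n_res, 25]
  return feat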
| class AlphaFoldIteration(hk.Module): | |
| """A single recycling iteration of AlphaFold architecture. | |
| Computes ensembled (averaged) representations from the provided features. | |
| These representations are then passed to the various heads | |
| that have been requested by the configuration file. Each head also returns a | |
| loss which is combined as a weighted sum to produce the total loss. | |
| Jumper et al. (2021) Suppl. Alg. 2 "Inference" lines 3-22 | |
| """ | |
| def __init__(self, config, global_config, name='alphafold_iteration'): | |
| super().__init__(name=name) | |
| self.config = config | |
| self.global_config = global_config | |
| def __call__(self, | |
| ensembled_batch, | |
| non_ensembled_batch, | |
| is_training, | |
| compute_loss=False, | |
| ensemble_representations=False, | |
| return_representations=False): | |
| num_ensemble = jnp.asarray(ensembled_batch['seq_length'].shape[0]) | |
| if not ensemble_representations: | |
| assert ensembled_batch['seq_length'].shape[0] == 1 | |
| def slice_batch(i): | |
| b = {k: v[i] for k, v in ensembled_batch.items()} | |
| b.update(non_ensembled_batch) | |
| return b | |
| # Compute representations for each batch element and average. | |
| evoformer_module = EmbeddingsAndEvoformer( | |
| self.config.embeddings_and_evoformer, self.global_config) | |
| batch0 = slice_batch(0) | |
| representations = evoformer_module(batch0, is_training) | |
| # MSA representations are not ensembled so | |
| # we don't pass the tensor into the loop. | |
| msa_representation = representations['msa'] | |
| del representations['msa'] | |
| # Average the representations (except MSA) over the batch dimension. | |
| if ensemble_representations: | |
| def body(x): | |
| """Add one element to the representations ensemble.""" | |
| i, current_representations = x | |
| feats = slice_batch(i) | |
| representations_update = evoformer_module( | |
| feats, is_training) | |
| new_representations = {} | |
| for k in current_representations: | |
| new_representations[k] = ( | |
| current_representations[k] + representations_update[k]) | |
| return i+1, new_representations | |
| if hk.running_init(): | |
| # When initializing the Haiku module, run one iteration of the | |
| # while_loop to initialize the Haiku modules used in `body`. | |
| _, representations = body((1, representations)) | |
| else: | |
| _, representations = hk.while_loop( | |
| lambda x: x[0] < num_ensemble, | |
| body, | |
| (1, representations)) | |
| for k in representations: | |
| if k != 'msa': | |
| representations[k] /= num_ensemble.astype(representations[k].dtype) | |
| representations['msa'] = msa_representation | |
| batch = batch0 # We are not ensembled from here on. | |
| if jnp.issubdtype(ensembled_batch['aatype'].dtype, jnp.integer): | |
| _, num_residues = ensembled_batch['aatype'].shape | |
| else: | |
| _, num_residues, _ = ensembled_batch['aatype'].shape | |
| if self.config.use_struct: | |
| struct_module = folding.StructureModule | |
| else: | |
| struct_module = folding.dummy | |
| heads = {} | |
| for head_name, head_config in sorted(self.config.heads.items()): | |
| if not head_config.weight: | |
| continue # Do not instantiate zero-weight heads. | |
| head_factory = { | |
| 'masked_msa': MaskedMsaHead, | |
| 'distogram': DistogramHead, | |
| 'structure_module': functools.partial(struct_module, compute_loss=compute_loss), | |
| 'predicted_lddt': PredictedLDDTHead, | |
| 'predicted_aligned_error': PredictedAlignedErrorHead, | |
| 'experimentally_resolved': ExperimentallyResolvedHead, | |
| }[head_name] | |
| heads[head_name] = (head_config, | |
| head_factory(head_config, self.global_config)) | |
| total_loss = 0. | |
| ret = {} | |
| ret['representations'] = representations | |
| def loss(module, head_config, ret, name, filter_ret=True): | |
| if filter_ret: | |
| value = ret[name] | |
| else: | |
| value = ret | |
| loss_output = module.loss(value, batch) | |
| ret[name].update(loss_output) | |
| loss = head_config.weight * ret[name]['loss'] | |
| return loss | |
| for name, (head_config, module) in heads.items(): | |
| # Skip PredictedLDDTHead and PredictedAlignedErrorHead until | |
| # StructureModule is executed. | |
| if name in ('predicted_lddt', 'predicted_aligned_error'): | |
| continue | |
| else: | |
| ret[name] = module(representations, batch, is_training) | |
| if 'representations' in ret[name]: | |
| # Extra representations from the head. Used by the structure module | |
| # to provide activations for the PredictedLDDTHead. | |
| representations.update(ret[name].pop('representations')) | |
| if compute_loss: | |
| total_loss += loss(module, head_config, ret, name) | |
| if self.config.use_struct: | |
| if self.config.heads.get('predicted_lddt.weight', 0.0): | |
| # Add PredictedLDDTHead after StructureModule executes. | |
| name = 'predicted_lddt' | |
| # Feed all previous results to give access to structure_module result. | |
| head_config, module = heads[name] | |
| ret[name] = module(representations, batch, is_training) | |
| if compute_loss: | |
| total_loss += loss(module, head_config, ret, name, filter_ret=False) | |
| if ('predicted_aligned_error' in self.config.heads | |
| and self.config.heads.get('predicted_aligned_error.weight', 0.0)): | |
| # Add PredictedAlignedErrorHead after StructureModule executes. | |
| name = 'predicted_aligned_error' | |
| # Feed all previous results to give access to structure_module result. | |
| head_config, module = heads[name] | |
| ret[name] = module(representations, batch, is_training) | |
| if compute_loss: | |
| total_loss += loss(module, head_config, ret, name, filter_ret=False) | |
| if compute_loss: | |
| return ret, total_loss | |
| else: | |
| return ret | |
| class AlphaFold(hk.Module): | |
| """AlphaFold model with recycling. | |
| Jumper et al. (2021) Suppl. Alg. 2 "Inference" | |
| """ | |
| def __init__(self, config, name='alphafold'): | |
| super().__init__(name=name) | |
| self.config = config | |
| self.global_config = config.global_config | |
| def __call__( | |
| self, | |
| batch, | |
| is_training, | |
| compute_loss=False, | |
| ensemble_representations=False, | |
| return_representations=False): | |
| """Run the AlphaFold model. | |
| Arguments: | |
| batch: Dictionary with inputs to the AlphaFold model. | |
| is_training: Whether the system is in training or inference mode. | |
| compute_loss: Whether to compute losses (requires extra features | |
| to be present in the batch and knowing the true structure). | |
| ensemble_representations: Whether to use ensembling of representations. | |
| return_representations: Whether to also return the intermediate | |
| representations. | |
| Returns: | |
| When compute_loss is True: | |
| a tuple of loss and output of AlphaFoldIteration. | |
| When compute_loss is False: | |
| just output of AlphaFoldIteration. | |
| The output of AlphaFoldIteration is a nested dictionary containing | |
| predictions from the various heads. | |
| """ | |
| if "scale_rate" not in batch: | |
| batch["scale_rate"] = jnp.ones((1,)) | |
| impl = AlphaFoldIteration(self.config, self.global_config) | |
| if jnp.issubdtype(batch['aatype'].dtype, jnp.integer): | |
| batch_size, num_residues = batch['aatype'].shape | |
| else: | |
| batch_size, num_residues, _ = batch['aatype'].shape | |
| def get_prev(ret): | |
| new_prev = { | |
| 'prev_msa_first_row': ret['representations']['msa_first_row'], | |
| 'prev_pair': ret['representations']['pair'], | |
| 'prev_dgram': ret["distogram"]["logits"], | |
| } | |
| if self.config.use_struct: | |
| new_prev.update({'prev_pos': ret['structure_module']['final_atom_positions'], | |
| 'prev_plddt': ret["predicted_lddt"]["logits"]}) | |
| if "predicted_aligned_error" in ret: | |
| new_prev["prev_pae"] = ret["predicted_aligned_error"]["logits"] | |
| if not self.config.backprop_recycle: | |
| for k in ["prev_pos","prev_msa_first_row","prev_pair"]: | |
| if k in new_prev: | |
| new_prev[k] = jax.lax.stop_gradient(new_prev[k]) | |
| return new_prev | |
| def do_call(prev, | |
| recycle_idx, | |
| compute_loss=compute_loss): | |
| if self.config.resample_msa_in_recycling: | |
| num_ensemble = batch_size // (self.config.num_recycle + 1) | |
| def slice_recycle_idx(x): | |
| start = recycle_idx * num_ensemble | |
| size = num_ensemble | |
| return jax.lax.dynamic_slice_in_dim(x, start, size, axis=0) | |
| ensembled_batch = jax.tree_map(slice_recycle_idx, batch) | |
| else: | |
| num_ensemble = batch_size | |
| ensembled_batch = batch | |
| non_ensembled_batch = jax.tree_map(lambda x: x, prev) | |
| return impl(ensembled_batch=ensembled_batch, | |
| non_ensembled_batch=non_ensembled_batch, | |
| is_training=is_training, | |
| compute_loss=compute_loss, | |
| ensemble_representations=ensemble_representations) | |
| emb_config = self.config.embeddings_and_evoformer | |
| prev = { | |
| 'prev_msa_first_row': jnp.zeros([num_residues, emb_config.msa_channel]), | |
| 'prev_pair': jnp.zeros([num_residues, num_residues, emb_config.pair_channel]), | |
| 'prev_dgram': jnp.zeros([num_residues, num_residues, 64]), | |
| } | |
| if self.config.use_struct: | |
| prev.update({'prev_pos': jnp.zeros([num_residues, residue_constants.atom_type_num, 3]), | |
| 'prev_plddt': jnp.zeros([num_residues, 50]), | |
| 'prev_pae': jnp.zeros([num_residues, num_residues, 64])}) | |
| for k in ["pos","msa_first_row","pair","dgram"]: | |
| if f"init_{k}" in batch: prev[f"prev_{k}"] = batch[f"init_{k}"][0] | |
| if self.config.num_recycle: | |
| if 'num_iter_recycling' in batch: | |
| # Training time: num_iter_recycling is in batch. | |
| # The value for each ensemble batch is the same, so arbitrarily taking | |
| # 0-th. | |
| num_iter = batch['num_iter_recycling'][0] | |
| # Add insurance that we will not run more | |
| # recyclings than the model is configured to run. | |
| num_iter = jnp.minimum(num_iter, self.config.num_recycle) | |
| else: | |
| # Eval mode or tests: use the maximum number of iterations. | |
| num_iter = self.config.num_recycle | |
| def add_prev(p,p_): | |
| p_["prev_dgram"] += p["prev_dgram"] | |
| if self.config.use_struct: | |
| p_["prev_plddt"] += p["prev_plddt"] | |
| p_["prev_pae"] += p["prev_pae"] | |
| return p_ | |
| ############################################################## | |
| def body(p, i): | |
| p_ = get_prev(do_call(p, recycle_idx=i, compute_loss=False)) | |
| if self.config.add_prev: | |
| p_ = add_prev(p, p_) | |
| return p_, None | |
| if hk.running_init(): | |
| prev,_ = body(prev, 0) | |
| else: | |
| prev,_ = hk.scan(body, prev, jnp.arange(num_iter)) | |
| ############################################################## | |
| else: | |
| num_iter = 0 | |
| ret = do_call(prev=prev, recycle_idx=num_iter) | |
| if self.config.add_prev: | |
| prev_ = get_prev(ret) | |
| if compute_loss: | |
| ret = ret[0], [ret[1]] | |
| if not return_representations: | |
| del (ret[0] if compute_loss else ret)['representations'] # pytype: disable=unsupported-operands | |
| if self.config.add_prev and num_iter > 0: | |
| prev_ = add_prev(prev, prev_) | |
| ret["distogram"]["logits"] = prev_["prev_dgram"]/(num_iter+1) | |
| if self.config.use_struct: | |
| ret["predicted_lddt"]["logits"] = prev_["prev_plddt"]/(num_iter+1) | |
| if "predicted_aligned_error" in ret: | |
| ret["predicted_aligned_error"]["logits"] = prev_["prev_pae"]/(num_iter+1) | |
| return ret | |
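# Illustrative sketch (not part of the original module) of how AlphaFold above
# is typically wrapped with Haiku for inference. The config object and its
# `.model` attribute are assumptions here; in the AlphaFold codebase such a
# config comes from alphafold.model.config.
def _make_alphafold_fns(model_config):
  def _forward(batch, is_training):
    return AlphaFold(model_config.model)(
        batch,
        is_training=is_training,
        compute_loss=False,
        ensemble_representations=False)
  transformed = hk.transform(_forward)
  # params = transformed.init(jax.random.PRNGKey(0), batch, is_training=False)
  # outputs = transformed.apply(params, jax.random.PRNGKey(0), batch,
  #                             is_training=False)
  return transformed.init, transformed.apply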
| class TemplatePairStack(hk.Module): | |
| """Pair stack for the templates. | |
| Jumper et al. (2021) Suppl. Alg. 16 "TemplatePairStack" | |
| """ | |
| def __init__(self, config, global_config, name='template_pair_stack'): | |
| super().__init__(name=name) | |
| self.config = config | |
| self.global_config = global_config | |
| def __call__(self, pair_act, pair_mask, is_training, safe_key=None, scale_rate=1.0): | |
| """Builds TemplatePairStack module. | |
| Arguments: | |
| pair_act: Pair activations for single template, shape [N_res, N_res, c_t]. | |
| pair_mask: Pair mask, shape [N_res, N_res]. | |
| is_training: Whether the module is in training mode. | |
| safe_key: Safe key object encapsulating the random number generation key. | |
| scale_rate: Multiplier applied to the configured dropout rate. | |
| Returns: | |
| Updated pair_act, shape [N_res, N_res, c_t]. | |
| """ | |
| if safe_key is None: | |
| safe_key = prng.SafeKey(hk.next_rng_key()) | |
| gc = self.global_config | |
| c = self.config | |
| if not c.num_block: | |
| return pair_act | |
| def block(x): | |
| """One block of the template pair stack.""" | |
| pair_act, safe_key = x | |
| dropout_wrapper_fn = functools.partial( | |
| dropout_wrapper, is_training=is_training, global_config=gc, scale_rate=scale_rate) | |
| safe_key, *sub_keys = safe_key.split(6) | |
| sub_keys = iter(sub_keys) | |
| pair_act = dropout_wrapper_fn( | |
| TriangleAttention(c.triangle_attention_starting_node, gc, | |
| name='triangle_attention_starting_node'), | |
| pair_act, | |
| pair_mask, | |
| next(sub_keys)) | |
| pair_act = dropout_wrapper_fn( | |
| TriangleAttention(c.triangle_attention_ending_node, gc, | |
| name='triangle_attention_ending_node'), | |
| pair_act, | |
| pair_mask, | |
| next(sub_keys)) | |
| pair_act = dropout_wrapper_fn( | |
| TriangleMultiplication(c.triangle_multiplication_outgoing, gc, | |
| name='triangle_multiplication_outgoing'), | |
| pair_act, | |
| pair_mask, | |
| next(sub_keys)) | |
| pair_act = dropout_wrapper_fn( | |
| TriangleMultiplication(c.triangle_multiplication_incoming, gc, | |
| name='triangle_multiplication_incoming'), | |
| pair_act, | |
| pair_mask, | |
| next(sub_keys)) | |
| pair_act = dropout_wrapper_fn( | |
| Transition(c.pair_transition, gc, name='pair_transition'), | |
| pair_act, | |
| pair_mask, | |
| next(sub_keys)) | |
| return pair_act, safe_key | |
| if gc.use_remat: | |
| block = hk.remat(block) | |
| res_stack = layer_stack.layer_stack(c.num_block)(block) | |
| pair_act, safe_key = res_stack((pair_act, safe_key)) | |
| return pair_act | |
| class Transition(hk.Module): | |
| """Transition layer. | |
| Jumper et al. (2021) Suppl. Alg. 9 "MSATransition" | |
| Jumper et al. (2021) Suppl. Alg. 15 "PairTransition" | |
| """ | |
| def __init__(self, config, global_config, name='transition_block'): | |
| super().__init__(name=name) | |
| self.config = config | |
| self.global_config = global_config | |
| def __call__(self, act, mask, is_training=True): | |
| """Builds Transition module. | |
| Arguments: | |
| act: A tensor of queries of size [batch_size, N_res, N_channel]. | |
| mask: A tensor denoting the mask of size [batch_size, N_res]. | |
| is_training: Whether the module is in training mode. | |
| Returns: | |
| A float32 tensor of size [batch_size, N_res, N_channel]. | |
| """ | |
| _, _, nc = act.shape | |
| num_intermediate = int(nc * self.config.num_intermediate_factor) | |
| mask = jnp.expand_dims(mask, axis=-1) | |
| act = hk.LayerNorm( | |
| axis=[-1], | |
| create_scale=True, | |
| create_offset=True, | |
| name='input_layer_norm')( | |
| act) | |
| transition_module = hk.Sequential([ | |
| common_modules.Linear( | |
| num_intermediate, | |
| initializer='relu', | |
| name='transition1'), jax.nn.relu, | |
| common_modules.Linear( | |
| nc, | |
| initializer=utils.final_init(self.global_config), | |
| name='transition2') | |
| ]) | |
| act = mapping.inference_subbatch( | |
| transition_module, | |
| self.global_config.subbatch_size, | |
| batched_args=[act], | |
| nonbatched_args=[], | |
| low_memory=not is_training) | |
| return act | |
| def glorot_uniform(): | |
| return hk.initializers.VarianceScaling(scale=1.0, | |
| mode='fan_avg', | |
| distribution='uniform') | |
| class Attention(hk.Module): | |
| """Multihead attention.""" | |
| def __init__(self, config, global_config, output_dim, name='attention'): | |
| super().__init__(name=name) | |
| self.config = config | |
| self.global_config = global_config | |
| self.output_dim = output_dim | |
| def __call__(self, q_data, m_data, bias, nonbatched_bias=None): | |
| """Builds Attention module. | |
| Arguments: | |
| q_data: A tensor of queries, shape [batch_size, N_queries, q_channels]. | |
| m_data: A tensor of memories from which the keys and values are | |
| projected, shape [batch_size, N_keys, m_channels]. | |
| bias: A bias for the attention, broadcastable to shape | |
| [batch_size, num_head, N_queries, N_keys]. | |
| nonbatched_bias: Shared bias, shape [N_queries, N_keys]. | |
| Returns: | |
| A float32 tensor of shape [batch_size, N_queries, output_dim]. | |
| """ | |
| # Sensible default for when the config keys are missing | |
| key_dim = self.config.get('key_dim', int(q_data.shape[-1])) | |
| value_dim = self.config.get('value_dim', int(m_data.shape[-1])) | |
| num_head = self.config.num_head | |
| assert key_dim % num_head == 0 | |
| assert value_dim % num_head == 0 | |
| key_dim = key_dim // num_head | |
| value_dim = value_dim // num_head | |
| q_weights = hk.get_parameter( | |
| 'query_w', shape=(q_data.shape[-1], num_head, key_dim), | |
| init=glorot_uniform()) | |
| k_weights = hk.get_parameter( | |
| 'key_w', shape=(m_data.shape[-1], num_head, key_dim), | |
| init=glorot_uniform()) | |
| v_weights = hk.get_parameter( | |
| 'value_w', shape=(m_data.shape[-1], num_head, value_dim), | |
| init=glorot_uniform()) | |
| q = jnp.einsum('bqa,ahc->bqhc', q_data, q_weights) * key_dim**(-0.5) | |
| k = jnp.einsum('bka,ahc->bkhc', m_data, k_weights) | |
| v = jnp.einsum('bka,ahc->bkhc', m_data, v_weights) | |
| logits = jnp.einsum('bqhc,bkhc->bhqk', q, k) + bias | |
| if nonbatched_bias is not None: | |
| logits += jnp.expand_dims(nonbatched_bias, axis=0) | |
| weights = jax.nn.softmax(logits) | |
| weighted_avg = jnp.einsum('bhqk,bkhc->bqhc', weights, v) | |
| if self.global_config.zero_init: | |
| init = hk.initializers.Constant(0.0) | |
| else: | |
| init = glorot_uniform() | |
| if self.config.gating: | |
| gating_weights = hk.get_parameter( | |
| 'gating_w', | |
| shape=(q_data.shape[-1], num_head, value_dim), | |
| init=hk.initializers.Constant(0.0)) | |
| gating_bias = hk.get_parameter( | |
| 'gating_b', | |
| shape=(num_head, value_dim), | |
| init=hk.initializers.Constant(1.0)) | |
| gate_values = jnp.einsum('bqc, chv->bqhv', q_data, | |
| gating_weights) + gating_bias | |
| gate_values = jax.nn.sigmoid(gate_values) | |
| weighted_avg *= gate_values | |
| o_weights = hk.get_parameter( | |
| 'output_w', shape=(num_head, value_dim, self.output_dim), | |
| init=init) | |
| o_bias = hk.get_parameter('output_b', shape=(self.output_dim,), | |
| init=hk.initializers.Constant(0.0)) | |
| output = jnp.einsum('bqhc,hco->bqo', weighted_avg, o_weights) + o_bias | |
| return output | |
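# Illustrative usage (not part of the original module): running the Attention
# module under hk.transform with a minimal hand-built config. The config keys
# below (num_head, gating, zero_init) mirror the fields read by Attention; the
# concrete values are arbitrary.
def _demo_attention():
  import ml_collections  # used only for this illustrative config
  cfg = ml_collections.ConfigDict({'num_head': 4, 'gating': True})
  gcfg = ml_collections.ConfigDict({'zero_init': True})

  def forward(q_data, m_data, bias):
    return Attention(cfg, gcfg, output_dim=32)(q_data, m_data, bias)

  fwd = hk.transform(forward)
  q_data = jnp.zeros((2, 8, 32))   # [batch, N_queries, q_channels]
  m_data = jnp.zeros((2, 10, 32))  # [batch, N_keys, m_channels]
  mask = jnp.ones((2, 10))
  bias = (1e9 * (mask - 1.))[:, None, None, :]  # broadcast over heads/queries
  params = fwd.init(jax.random.PRNGKey(0), q_data, m_data, bias)
  out = fwd.apply(params, None, q_data, m_data, bias)  # [2, 8, 32]
  return out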
| class GlobalAttention(hk.Module): | |
| """Global attention. | |
| Jumper et al. (2021) Suppl. Alg. 19 "MSAColumnGlobalAttention" lines 2-7 | |
| """ | |
| def __init__(self, config, global_config, output_dim, name='attention'): | |
| super().__init__(name=name) | |
| self.config = config | |
| self.global_config = global_config | |
| self.output_dim = output_dim | |
| def __call__(self, q_data, m_data, q_mask, bias): | |
| """Builds GlobalAttention module. | |
| Arguments: | |
| q_data: A tensor of queries with size [batch_size, N_queries, | |
| q_channels] | |
| m_data: A tensor of memories from which the keys and values are | |
| projected. Size [batch_size, N_keys, m_channels] | |
| q_mask: A binary mask for q_data with zeros in the padded sequence | |
| elements and ones otherwise. Size [batch_size, N_queries, q_channels] | |
| (or broadcastable to this shape). | |
| bias: A bias for the attention (unused here; the bias is recomputed from | |
| q_mask below). | |
| Returns: | |
| A float32 tensor of size [batch_size, N_queries, output_dim]. | |
| """ | |
| # Sensible default for when the config keys are missing | |
| key_dim = self.config.get('key_dim', int(q_data.shape[-1])) | |
| value_dim = self.config.get('value_dim', int(m_data.shape[-1])) | |
| num_head = self.config.num_head | |
| assert key_dim % num_head == 0 | |
| assert value_dim % num_head == 0 | |
| key_dim = key_dim // num_head | |
| value_dim = value_dim // num_head | |
| q_weights = hk.get_parameter( | |
| 'query_w', shape=(q_data.shape[-1], num_head, key_dim), | |
| init=glorot_uniform()) | |
| k_weights = hk.get_parameter( | |
| 'key_w', shape=(m_data.shape[-1], key_dim), | |
| init=glorot_uniform()) | |
| v_weights = hk.get_parameter( | |
| 'value_w', shape=(m_data.shape[-1], value_dim), | |
| init=glorot_uniform()) | |
| v = jnp.einsum('bka,ac->bkc', m_data, v_weights) | |
| q_avg = utils.mask_mean(q_mask, q_data, axis=1) | |
| q = jnp.einsum('ba,ahc->bhc', q_avg, q_weights) * key_dim**(-0.5) | |
| k = jnp.einsum('bka,ac->bkc', m_data, k_weights) | |
| bias = (1e9 * (q_mask[:, None, :, 0] - 1.)) | |
| logits = jnp.einsum('bhc,bkc->bhk', q, k) + bias | |
| weights = jax.nn.softmax(logits) | |
| weighted_avg = jnp.einsum('bhk,bkc->bhc', weights, v) | |
| if self.global_config.zero_init: | |
| init = hk.initializers.Constant(0.0) | |
| else: | |
| init = glorot_uniform() | |
| o_weights = hk.get_parameter( | |
| 'output_w', shape=(num_head, value_dim, self.output_dim), | |
| init=init) | |
| o_bias = hk.get_parameter('output_b', shape=(self.output_dim,), | |
| init=hk.initializers.Constant(0.0)) | |
| if self.config.gating: | |
| gating_weights = hk.get_parameter( | |
| 'gating_w', | |
| shape=(q_data.shape[-1], num_head, value_dim), | |
| init=hk.initializers.Constant(0.0)) | |
| gating_bias = hk.get_parameter( | |
| 'gating_b', | |
| shape=(num_head, value_dim), | |
| init=hk.initializers.Constant(1.0)) | |
| gate_values = jnp.einsum('bqc, chv->bqhv', q_data, gating_weights) | |
| gate_values = jax.nn.sigmoid(gate_values + gating_bias) | |
| weighted_avg = weighted_avg[:, None] * gate_values | |
| output = jnp.einsum('bqhc,hco->bqo', weighted_avg, o_weights) + o_bias | |
| else: | |
| output = jnp.einsum('bhc,hco->bo', weighted_avg, o_weights) + o_bias | |
| output = output[:, None] | |
| return output | |
| class MSARowAttentionWithPairBias(hk.Module): | |
| """MSA per-row attention biased by the pair representation. | |
| Jumper et al. (2021) Suppl. Alg. 7 "MSARowAttentionWithPairBias" | |
| """ | |
| def __init__(self, config, global_config, | |
| name='msa_row_attention_with_pair_bias'): | |
| super().__init__(name=name) | |
| self.config = config | |
| self.global_config = global_config | |
| def __call__(self, | |
| msa_act, | |
| msa_mask, | |
| pair_act, | |
| is_training=False): | |
| """Builds MSARowAttentionWithPairBias module. | |
| Arguments: | |
| msa_act: [N_seq, N_res, c_m] MSA representation. | |
| msa_mask: [N_seq, N_res] mask of non-padded regions. | |
| pair_act: [N_res, N_res, c_z] pair representation. | |
| is_training: Whether the module is in training mode. | |
| Returns: | |
| Update to msa_act, shape [N_seq, N_res, c_m]. | |
| """ | |
| c = self.config | |
| assert len(msa_act.shape) == 3 | |
| assert len(msa_mask.shape) == 2 | |
| assert c.orientation == 'per_row' | |
| bias = (1e9 * (msa_mask - 1.))[:, None, None, :] | |
| assert len(bias.shape) == 4 | |
| msa_act = hk.LayerNorm( | |
| axis=[-1], create_scale=True, create_offset=True, name='query_norm')( | |
| msa_act) | |
| pair_act = hk.LayerNorm( | |
| axis=[-1], | |
| create_scale=True, | |
| create_offset=True, | |
| name='feat_2d_norm')( | |
| pair_act) | |
| init_factor = 1. / jnp.sqrt(int(pair_act.shape[-1])) | |
| weights = hk.get_parameter( | |
| 'feat_2d_weights', | |
| shape=(pair_act.shape[-1], c.num_head), | |
| init=hk.initializers.RandomNormal(stddev=init_factor)) | |
| nonbatched_bias = jnp.einsum('qkc,ch->hqk', pair_act, weights) | |
| attn_mod = Attention( | |
| c, self.global_config, msa_act.shape[-1]) | |
| msa_act = mapping.inference_subbatch( | |
| attn_mod, | |
| self.global_config.subbatch_size, | |
| batched_args=[msa_act, msa_act, bias], | |
| nonbatched_args=[nonbatched_bias], | |
| low_memory=not is_training) | |
| return msa_act | |
| class MSAColumnAttention(hk.Module): | |
| """MSA per-column attention. | |
| Jumper et al. (2021) Suppl. Alg. 8 "MSAColumnAttention" | |
| """ | |
| def __init__(self, config, global_config, name='msa_column_attention'): | |
| super().__init__(name=name) | |
| self.config = config | |
| self.global_config = global_config | |
| def __call__(self, | |
| msa_act, | |
| msa_mask, | |
| is_training=False): | |
| """Builds MSAColumnAttention module. | |
| Arguments: | |
| msa_act: [N_seq, N_res, c_m] MSA representation. | |
| msa_mask: [N_seq, N_res] mask of non-padded regions. | |
| is_training: Whether the module is in training mode. | |
| Returns: | |
| Update to msa_act, shape [N_seq, N_res, c_m] | |
| """ | |
| c = self.config | |
| assert len(msa_act.shape) == 3 | |
| assert len(msa_mask.shape) == 2 | |
| assert c.orientation == 'per_column' | |
| msa_act = jnp.swapaxes(msa_act, -2, -3) | |
| msa_mask = jnp.swapaxes(msa_mask, -1, -2) | |
| bias = (1e9 * (msa_mask - 1.))[:, None, None, :] | |
| assert len(bias.shape) == 4 | |
| msa_act = hk.LayerNorm( | |
| axis=[-1], create_scale=True, create_offset=True, name='query_norm')( | |
| msa_act) | |
| attn_mod = Attention( | |
| c, self.global_config, msa_act.shape[-1]) | |
| msa_act = mapping.inference_subbatch( | |
| attn_mod, | |
| self.global_config.subbatch_size, | |
| batched_args=[msa_act, msa_act, bias], | |
| nonbatched_args=[], | |
| low_memory=not is_training) | |
| msa_act = jnp.swapaxes(msa_act, -2, -3) | |
| return msa_act | |
| class MSAColumnGlobalAttention(hk.Module): | |
| """MSA per-column global attention. | |
| Jumper et al. (2021) Suppl. Alg. 19 "MSAColumnGlobalAttention" | |
| """ | |
| def __init__(self, config, global_config, name='msa_column_global_attention'): | |
| super().__init__(name=name) | |
| self.config = config | |
| self.global_config = global_config | |
| def __call__(self, | |
| msa_act, | |
| msa_mask, | |
| is_training=False): | |
| """Builds MSAColumnGlobalAttention module. | |
| Arguments: | |
| msa_act: [N_seq, N_res, c_m] MSA representation. | |
| msa_mask: [N_seq, N_res] mask of non-padded regions. | |
| is_training: Whether the module is in training mode. | |
| Returns: | |
| Update to msa_act, shape [N_seq, N_res, c_m]. | |
| """ | |
| c = self.config | |
| assert len(msa_act.shape) == 3 | |
| assert len(msa_mask.shape) == 2 | |
| assert c.orientation == 'per_column' | |
| msa_act = jnp.swapaxes(msa_act, -2, -3) | |
| msa_mask = jnp.swapaxes(msa_mask, -1, -2) | |
| bias = (1e9 * (msa_mask - 1.))[:, None, None, :] | |
| assert len(bias.shape) == 4 | |
| msa_act = hk.LayerNorm( | |
| axis=[-1], create_scale=True, create_offset=True, name='query_norm')( | |
| msa_act) | |
| attn_mod = GlobalAttention( | |
| c, self.global_config, msa_act.shape[-1], | |
| name='attention') | |
| # [N_seq, N_res, 1] | |
| msa_mask = jnp.expand_dims(msa_mask, axis=-1) | |
| msa_act = mapping.inference_subbatch( | |
| attn_mod, | |
| self.global_config.subbatch_size, | |
| batched_args=[msa_act, msa_act, msa_mask, bias], | |
| nonbatched_args=[], | |
| low_memory=not is_training) | |
| msa_act = jnp.swapaxes(msa_act, -2, -3) | |
| return msa_act | |
| class TriangleAttention(hk.Module): | |
| """Triangle Attention. | |
| Jumper et al. (2021) Suppl. Alg. 13 "TriangleAttentionStartingNode" | |
| Jumper et al. (2021) Suppl. Alg. 14 "TriangleAttentionEndingNode" | |
| """ | |
| def __init__(self, config, global_config, name='triangle_attention'): | |
| super().__init__(name=name) | |
| self.config = config | |
| self.global_config = global_config | |
| def __call__(self, pair_act, pair_mask, is_training=False): | |
| """Builds TriangleAttention module. | |
| Arguments: | |
| pair_act: [N_res, N_res, c_z] pair activations tensor | |
| pair_mask: [N_res, N_res] mask of non-padded regions in the tensor. | |
| is_training: Whether the module is in training mode. | |
| Returns: | |
| Update to pair_act, shape [N_res, N_res, c_z]. | |
| """ | |
| c = self.config | |
| assert len(pair_act.shape) == 3 | |
| assert len(pair_mask.shape) == 2 | |
| assert c.orientation in ['per_row', 'per_column'] | |
| if c.orientation == 'per_column': | |
| pair_act = jnp.swapaxes(pair_act, -2, -3) | |
| pair_mask = jnp.swapaxes(pair_mask, -1, -2) | |
| bias = (1e9 * (pair_mask - 1.))[:, None, None, :] | |
| assert len(bias.shape) == 4 | |
| pair_act = hk.LayerNorm( | |
| axis=[-1], create_scale=True, create_offset=True, name='query_norm')( | |
| pair_act) | |
| init_factor = 1. / jnp.sqrt(int(pair_act.shape[-1])) | |
| weights = hk.get_parameter( | |
| 'feat_2d_weights', | |
| shape=(pair_act.shape[-1], c.num_head), | |
| init=hk.initializers.RandomNormal(stddev=init_factor)) | |
| nonbatched_bias = jnp.einsum('qkc,ch->hqk', pair_act, weights) | |
| attn_mod = Attention( | |
| c, self.global_config, pair_act.shape[-1]) | |
| pair_act = mapping.inference_subbatch( | |
| attn_mod, | |
| self.global_config.subbatch_size, | |
| batched_args=[pair_act, pair_act, bias], | |
| nonbatched_args=[nonbatched_bias], | |
| low_memory=not is_training) | |
| if c.orientation == 'per_column': | |
| pair_act = jnp.swapaxes(pair_act, -2, -3) | |
| return pair_act | |
| class MaskedMsaHead(hk.Module): | |
| """Head to predict MSA at the masked locations. | |
| The MaskedMsaHead employs a BERT-style objective to reconstruct a masked | |
| version of the full MSA, based on a linear projection of | |
| the MSA representation. | |
| Jumper et al. (2021) Suppl. Sec. 1.9.9 "Masked MSA prediction" | |
| """ | |
| def __init__(self, config, global_config, name='masked_msa_head'): | |
| super().__init__(name=name) | |
| self.config = config | |
| self.global_config = global_config | |
| def __call__(self, representations, batch, is_training): | |
| """Builds MaskedMsaHead module. | |
| Arguments: | |
| representations: Dictionary of representations, must contain: | |
| * 'msa': MSA representation, shape [N_seq, N_res, c_m]. | |
| batch: Batch, unused. | |
| is_training: Whether the module is in training mode. | |
| Returns: | |
| Dictionary containing: | |
| * 'logits': logits of shape [N_seq, N_res, N_aatype] with | |
| (unnormalized) log probabilities of the predicted aatype at each position. | |
| """ | |
| del batch | |
| logits = common_modules.Linear( | |
| self.config.num_output, | |
| initializer=utils.final_init(self.global_config), | |
| name='logits')( | |
| representations['msa']) | |
| return dict(logits=logits) | |
| def loss(self, value, batch): | |
| errors = softmax_cross_entropy( | |
| labels=jax.nn.one_hot(batch['true_msa'], num_classes=23), | |
| logits=value['logits']) | |
| loss = (jnp.sum(errors * batch['bert_mask'], axis=(-2, -1)) / | |
| (1e-8 + jnp.sum(batch['bert_mask'], axis=(-2, -1)))) | |
| return {'loss': loss} | |
| class PredictedLDDTHead(hk.Module): | |
| """Head to predict the per-residue LDDT to be used as a confidence measure. | |
| Jumper et al. (2021) Suppl. Sec. 1.9.6 "Model confidence prediction (pLDDT)" | |
| Jumper et al. (2021) Suppl. Alg. 29 "predictPerResidueLDDT_Ca" | |
| """ | |
| def __init__(self, config, global_config, name='predicted_lddt_head'): | |
| super().__init__(name=name) | |
| self.config = config | |
| self.global_config = global_config | |
| def __call__(self, representations, batch, is_training): | |
| """Builds ExperimentallyResolvedHead module. | |
| Arguments: | |
| representations: Dictionary of representations, must contain: | |
| * 'structure_module': Single representation from the structure module, | |
| shape [N_res, c_s]. | |
| batch: Batch, unused. | |
| is_training: Whether the module is in training mode. | |
| Returns: | |
| Dictionary containing : | |
| * 'logits': logits of shape [N_res, N_bins] with | |
| (unnormalized) log probabilities of binned predicted lDDT. | |
| """ | |
| act = representations['structure_module'] | |
| act = hk.LayerNorm( | |
| axis=[-1], | |
| create_scale=True, | |
| create_offset=True, | |
| name='input_layer_norm')( | |
| act) | |
| act = common_modules.Linear( | |
| self.config.num_channels, | |
| initializer='relu', | |
| name='act_0')( | |
| act) | |
| act = jax.nn.relu(act) | |
| act = common_modules.Linear( | |
| self.config.num_channels, | |
| initializer='relu', | |
| name='act_1')( | |
| act) | |
| act = jax.nn.relu(act) | |
| logits = common_modules.Linear( | |
| self.config.num_bins, | |
| initializer=utils.final_init(self.global_config), | |
| name='logits')( | |
| act) | |
| # Shape (batch_size, num_res, num_bins) | |
| return dict(logits=logits) | |
| def loss(self, value, batch): | |
| # Shape (num_res, 37, 3) | |
| pred_all_atom_pos = value['structure_module']['final_atom_positions'] | |
| # Shape (num_res, 37, 3) | |
| true_all_atom_pos = batch['all_atom_positions'] | |
| # Shape (num_res, 37) | |
| all_atom_mask = batch['all_atom_mask'] | |
| # Shape (num_res,) | |
| lddt_ca = lddt.lddt( | |
| # Shape (batch_size, num_res, 3) | |
| predicted_points=pred_all_atom_pos[None, :, 1, :], | |
| # Shape (batch_size, num_res, 3) | |
| true_points=true_all_atom_pos[None, :, 1, :], | |
| # Shape (batch_size, num_res, 1) | |
| true_points_mask=all_atom_mask[None, :, 1:2].astype(jnp.float32), | |
| cutoff=15., | |
| per_residue=True)[0] | |
| lddt_ca = jax.lax.stop_gradient(lddt_ca) | |
| num_bins = self.config.num_bins | |
| bin_index = jnp.floor(lddt_ca * num_bins).astype(jnp.int32) | |
| # protect against out of range for lddt_ca == 1 | |
| bin_index = jnp.minimum(bin_index, num_bins - 1) | |
| lddt_ca_one_hot = jax.nn.one_hot(bin_index, num_classes=num_bins) | |
| # Shape (num_res, num_channel) | |
| logits = value['predicted_lddt']['logits'] | |
| errors = softmax_cross_entropy(labels=lddt_ca_one_hot, logits=logits) | |
| # Shape (num_res,) | |
| mask_ca = all_atom_mask[:, residue_constants.atom_order['CA']] | |
| mask_ca = mask_ca.astype(jnp.float32) | |
| loss = jnp.sum(errors * mask_ca) / (jnp.sum(mask_ca) + 1e-8) | |
| if self.config.filter_by_resolution: | |
| # NMR & distillation have resolution = 0 | |
| loss *= ((batch['resolution'] >= self.config.min_resolution) | |
| & (batch['resolution'] <= self.config.max_resolution)).astype( | |
| jnp.float32) | |
| output = {'loss': loss} | |
| return output | |
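# Illustrative post-processing (not part of the original module): one common
# way to turn the per-residue pLDDT logits produced by the head above into a
# confidence in [0, 100], i.e. the expectation over equally sized bins on
# [0, 1] scaled by 100. The exact post-processing used downstream lives
# outside this file.
def _plddt_from_logits(logits):
  num_bins = logits.shape[-1]
  bin_width = 1.0 / num_bins
  bin_centers = (jnp.arange(num_bins) + 0.5) * bin_width   # [num_bins]
  probs = jax.nn.softmax(logits, axis=-1)                  # [N_res, num_bins]
  return 100.0 * jnp.sum(probs * bin_centers, axis=-1)     # [N_res]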
| class PredictedAlignedErrorHead(hk.Module): | |
| """Head to predict the distance errors in the backbone alignment frames. | |
| Can be used to compute predicted TM-Score. | |
| Jumper et al. (2021) Suppl. Sec. 1.9.7 "TM-score prediction" | |
| """ | |
| def __init__(self, config, global_config, | |
| name='predicted_aligned_error_head'): | |
| super().__init__(name=name) | |
| self.config = config | |
| self.global_config = global_config | |
| def __call__(self, representations, batch, is_training): | |
| """Builds PredictedAlignedErrorHead module. | |
| Arguments: | |
| representations: Dictionary of representations, must contain: | |
| * 'pair': pair representation, shape [N_res, N_res, c_z]. | |
| batch: Batch, unused. | |
| is_training: Whether the module is in training mode. | |
| Returns: | |
| Dictionary containing: | |
| * logits: logits for aligned error, shape [N_res, N_res, N_bins]. | |
| * bin_breaks: array containing bin breaks, shape [N_bins - 1]. | |
| """ | |
| act = representations['pair'] | |
| # Shape (num_res, num_res, num_bins) | |
| logits = common_modules.Linear( | |
| self.config.num_bins, | |
| initializer=utils.final_init(self.global_config), | |
| name='logits')(act) | |
| # Shape (num_bins,) | |
| breaks = jnp.linspace( | |
| 0., self.config.max_error_bin, self.config.num_bins - 1) | |
| return dict(logits=logits, breaks=breaks) | |
| def loss(self, value, batch): | |
| # Shape (num_res, 7) | |
| predicted_affine = quat_affine.QuatAffine.from_tensor( | |
| value['structure_module']['final_affines']) | |
| # Shape (num_res, 7) | |
| true_affine = quat_affine.QuatAffine.from_tensor( | |
| batch['backbone_affine_tensor']) | |
| # Shape (num_res) | |
| mask = batch['backbone_affine_mask'] | |
| # Shape (num_res, num_res) | |
| square_mask = mask[:, None] * mask[None, :] | |
| num_bins = self.config.num_bins | |
| # (1, num_bins - 1) | |
| breaks = value['predicted_aligned_error']['breaks'] | |
| # (1, num_bins) | |
| logits = value['predicted_aligned_error']['logits'] | |
| # Compute the squared error for each alignment. | |
| def _local_frame_points(affine): | |
| points = [jnp.expand_dims(x, axis=-2) for x in affine.translation] | |
| return affine.invert_point(points, extra_dims=1) | |
| error_dist2_xyz = [ | |
| jnp.square(a - b) | |
| for a, b in zip(_local_frame_points(predicted_affine), | |
| _local_frame_points(true_affine))] | |
| error_dist2 = sum(error_dist2_xyz) | |
| # Shape (num_res, num_res) | |
| # First num_res are alignment frames, second num_res are the residues. | |
| error_dist2 = jax.lax.stop_gradient(error_dist2) | |
| sq_breaks = jnp.square(breaks) | |
| true_bins = jnp.sum(( | |
| error_dist2[..., None] > sq_breaks).astype(jnp.int32), axis=-1) | |
| errors = softmax_cross_entropy( | |
| labels=jax.nn.one_hot(true_bins, num_bins, axis=-1), logits=logits) | |
| loss = (jnp.sum(errors * square_mask, axis=(-2, -1)) / | |
| (1e-8 + jnp.sum(square_mask, axis=(-2, -1)))) | |
| if self.config.filter_by_resolution: | |
| # NMR & distillation have resolution = 0 | |
| loss *= ((batch['resolution'] >= self.config.min_resolution) | |
| & (batch['resolution'] <= self.config.max_resolution)).astype( | |
| jnp.float32) | |
| output = {'loss': loss} | |
| return output | |
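# Illustrative post-processing (not part of the original module): a sketch of
# turning the aligned-error logits and bin breaks returned by the head above
# into an expected aligned error per residue pair. The bin-center construction
# (midpoints plus one extrapolated center for the open-ended last bin) is an
# assumption about the downstream convention.
def _expected_aligned_error(logits, breaks):
  step = breaks[1] - breaks[0]
  bin_centers = breaks + step / 2.0                        # [num_bins - 1]
  bin_centers = jnp.concatenate(
      [bin_centers, bin_centers[-1:] + step], axis=0)      # [num_bins]
  probs = jax.nn.softmax(logits, axis=-1)                  # [N_res, N_res, num_bins]
  return jnp.sum(probs * bin_centers, axis=-1)             # [N_res, N_res]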
| class ExperimentallyResolvedHead(hk.Module): | |
| """Predicts if an atom is experimentally resolved in a high-res structure. | |
| Only trained on high-resolution X-ray crystals & cryo-EM. | |
| Jumper et al. (2021) Suppl. Sec. 1.9.10 '"Experimentally resolved" prediction' | |
| """ | |
| def __init__(self, config, global_config, | |
| name='experimentally_resolved_head'): | |
| super().__init__(name=name) | |
| self.config = config | |
| self.global_config = global_config | |
| def __call__(self, representations, batch, is_training): | |
| """Builds ExperimentallyResolvedHead module. | |
| Arguments: | |
| representations: Dictionary of representations, must contain: | |
| * 'single': Single representation, shape [N_res, c_s]. | |
| batch: Batch, unused. | |
| is_training: Whether the module is in training mode. | |
| Returns: | |
| Dictionary containing: | |
| * 'logits': logits of shape [N_res, 37], | |
| log probability that an atom is resolved in atom37 representation, | |
| can be converted to probability by applying sigmoid. | |
| """ | |
| logits = common_modules.Linear( | |
| 37, # atom_exists.shape[-1] | |
| initializer=utils.final_init(self.global_config), | |
| name='logits')(representations['single']) | |
| return dict(logits=logits) | |
| def loss(self, value, batch): | |
| logits = value['logits'] | |
| assert len(logits.shape) == 2 | |
| # Does the atom appear in the amino acid? | |
| atom_exists = batch['atom37_atom_exists'] | |
| # Is the atom resolved in the experiment? Subset of atom_exists, | |
| # *except for OXT* | |
| all_atom_mask = batch['all_atom_mask'].astype(jnp.float32) | |
| xent = sigmoid_cross_entropy(labels=all_atom_mask, logits=logits) | |
| loss = jnp.sum(xent * atom_exists) / (1e-8 + jnp.sum(atom_exists)) | |
| if self.config.filter_by_resolution: | |
| # NMR & distillation examples have resolution = 0. | |
| loss *= ((batch['resolution'] >= self.config.min_resolution) | |
| & (batch['resolution'] <= self.config.max_resolution)).astype( | |
| jnp.float32) | |
| output = {'loss': loss} | |
| return output | |
| class TriangleMultiplication(hk.Module): | |
| """Triangle multiplication layer ("outgoing" or "incoming"). | |
| Jumper et al. (2021) Suppl. Alg. 11 "TriangleMultiplicationOutgoing" | |
| Jumper et al. (2021) Suppl. Alg. 12 "TriangleMultiplicationIncoming" | |
| """ | |
| def __init__(self, config, global_config, name='triangle_multiplication'): | |
| super().__init__(name=name) | |
| self.config = config | |
| self.global_config = global_config | |
| def __call__(self, act, mask, is_training=True): | |
| """Builds TriangleMultiplication module. | |
| Arguments: | |
| act: Pair activations, shape [N_res, N_res, c_z] | |
| mask: Pair mask, shape [N_res, N_res]. | |
| is_training: Whether the module is in training mode. | |
| Returns: | |
| Outputs, same shape/type as act. | |
| """ | |
| del is_training | |
| c = self.config | |
| gc = self.global_config | |
| mask = mask[..., None] | |
| act = hk.LayerNorm(axis=[-1], create_scale=True, create_offset=True, | |
| name='layer_norm_input')(act) | |
| input_act = act | |
| left_projection = common_modules.Linear( | |
| c.num_intermediate_channel, | |
| name='left_projection') | |
| left_proj_act = mask * left_projection(act) | |
| right_projection = common_modules.Linear( | |
| c.num_intermediate_channel, | |
| name='right_projection') | |
| right_proj_act = mask * right_projection(act) | |
| left_gate_values = jax.nn.sigmoid(common_modules.Linear( | |
| c.num_intermediate_channel, | |
| bias_init=1., | |
| initializer=utils.final_init(gc), | |
| name='left_gate')(act)) | |
| right_gate_values = jax.nn.sigmoid(common_modules.Linear( | |
| c.num_intermediate_channel, | |
| bias_init=1., | |
| initializer=utils.final_init(gc), | |
| name='right_gate')(act)) | |
| left_proj_act *= left_gate_values | |
| right_proj_act *= right_gate_values | |
| # "Outgoing" edges equation: 'ikc,jkc->ijc' | |
| # "Incoming" edges equation: 'kjc,kic->ijc' | |
| # Note on the Suppl. Alg. 11 & 12 notation: | |
| # For the "outgoing" edges, a = left_proj_act and b = right_proj_act | |
| # For the "incoming" edges, it's swapped: | |
| # b = left_proj_act and a = right_proj_act | |
| act = jnp.einsum(c.equation, left_proj_act, right_proj_act) | |
| act = hk.LayerNorm( | |
| axis=[-1], | |
| create_scale=True, | |
| create_offset=True, | |
| name='center_layer_norm')( | |
| act) | |
| output_channel = int(input_act.shape[-1]) | |
| act = common_modules.Linear( | |
| output_channel, | |
| initializer=utils.final_init(gc), | |
| name='output_projection')(act) | |
| gate_values = jax.nn.sigmoid(common_modules.Linear( | |
| output_channel, | |
| bias_init=1., | |
| initializer=utils.final_init(gc), | |
| name='gating_linear')(input_act)) | |
| act *= gate_values | |
| return act | |
| class DistogramHead(hk.Module): | |
| """Head to predict a distogram. | |
| Jumper et al. (2021) Suppl. Sec. 1.9.8 "Distogram prediction" | |
| """ | |
| def __init__(self, config, global_config, name='distogram_head'): | |
| super().__init__(name=name) | |
| self.config = config | |
| self.global_config = global_config | |
| def __call__(self, representations, batch, is_training): | |
| """Builds DistogramHead module. | |
| Arguments: | |
| representations: Dictionary of representations, must contain: | |
| * 'pair': pair representation, shape [N_res, N_res, c_z]. | |
| batch: Batch, unused. | |
| is_training: Whether the module is in training mode. | |
| Returns: | |
| Dictionary containing: | |
| * logits: logits for distogram, shape [N_res, N_res, N_bins]. | |
| * bin_breaks: array containing bin breaks, shape [N_bins - 1,]. | |
| """ | |
| half_logits = common_modules.Linear( | |
| self.config.num_bins, | |
| initializer=utils.final_init(self.global_config), | |
| name='half_logits')( | |
| representations['pair']) | |
| logits = half_logits + jnp.swapaxes(half_logits, -2, -3) | |
| breaks = jnp.linspace(self.config.first_break, self.config.last_break, | |
| self.config.num_bins - 1) | |
| return dict(logits=logits, bin_edges=breaks) | |
| def loss(self, value, batch): | |
| return _distogram_log_loss(value['logits'], value['bin_edges'], | |
| batch, self.config.num_bins) | |
| def _distogram_log_loss(logits, bin_edges, batch, num_bins): | |
| """Log loss of a distogram.""" | |
| assert len(logits.shape) == 3 | |
| positions = batch['pseudo_beta'] | |
| mask = batch['pseudo_beta_mask'] | |
| assert positions.shape[-1] == 3 | |
| sq_breaks = jnp.square(bin_edges) | |
| dist2 = jnp.sum( | |
| jnp.square( | |
| jnp.expand_dims(positions, axis=-2) - | |
| jnp.expand_dims(positions, axis=-3)), | |
| axis=-1, | |
| keepdims=True) | |
| true_bins = jnp.sum(dist2 > sq_breaks, axis=-1) | |
| errors = softmax_cross_entropy( | |
| labels=jax.nn.one_hot(true_bins, num_bins), logits=logits) | |
| square_mask = jnp.expand_dims(mask, axis=-2) * jnp.expand_dims(mask, axis=-1) | |
| avg_error = ( | |
| jnp.sum(errors * square_mask, axis=(-2, -1)) / | |
| (1e-6 + jnp.sum(square_mask, axis=(-2, -1)))) | |
| dist2 = dist2[..., 0] | |
| return dict(loss=avg_error, true_dist=jnp.sqrt(1e-6 + dist2)) | |
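# Illustrative usage (not part of the original module): calling
# _distogram_log_loss on a toy batch. The bin edge values are arbitrary here;
# in practice they come from the distogram head config (first_break/last_break).
def _demo_distogram_loss():
  n_res, num_bins = 6, 64
  logits = jnp.zeros((n_res, n_res, num_bins))
  bin_edges = jnp.linspace(2.0, 22.0, num_bins - 1)
  batch = {
      'pseudo_beta': jnp.zeros((n_res, 3)),
      'pseudo_beta_mask': jnp.ones((n_res,)),
  }
  out = _distogram_log_loss(logits, bin_edges, batch, num_bins)
  # out['loss'] is a scalar; out['true_dist'] has shape [n_res, n_res].
  return out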
| class OuterProductMean(hk.Module): | |
| """Computes mean outer product. | |
| Jumper et al. (2021) Suppl. Alg. 10 "OuterProductMean" | |
| """ | |
| def __init__(self, | |
| config, | |
| global_config, | |
| num_output_channel, | |
| name='outer_product_mean'): | |
| super().__init__(name=name) | |
| self.global_config = global_config | |
| self.config = config | |
| self.num_output_channel = num_output_channel | |
| def __call__(self, act, mask, is_training=True): | |
| """Builds OuterProductMean module. | |
| Arguments: | |
| act: MSA representation, shape [N_seq, N_res, c_m]. | |
| mask: MSA mask, shape [N_seq, N_res]. | |
| is_training: Whether the module is in training mode. | |
| Returns: | |
| Update to pair representation, shape [N_res, N_res, c_z]. | |
| """ | |
| gc = self.global_config | |
| c = self.config | |
| mask = mask[..., None] | |
| act = hk.LayerNorm([-1], True, True, name='layer_norm_input')(act) | |
| left_act = mask * common_modules.Linear( | |
| c.num_outer_channel, | |
| initializer='linear', | |
| name='left_projection')( | |
| act) | |
| right_act = mask * common_modules.Linear( | |
| c.num_outer_channel, | |
| initializer='linear', | |
| name='right_projection')( | |
| act) | |
| if gc.zero_init: | |
| init_w = hk.initializers.Constant(0.0) | |
| else: | |
| init_w = hk.initializers.VarianceScaling(scale=2., mode='fan_in') | |
| output_w = hk.get_parameter( | |
| 'output_w', | |
| shape=(c.num_outer_channel, c.num_outer_channel, | |
| self.num_output_channel), | |
| init=init_w) | |
| output_b = hk.get_parameter( | |
| 'output_b', shape=(self.num_output_channel,), | |
| init=hk.initializers.Constant(0.0)) | |
| def compute_chunk(left_act): | |
| # This is equivalent to | |
| # | |
| # act = jnp.einsum('abc,ade->dceb', left_act, right_act) | |
| # act = jnp.einsum('dceb,cef->bdf', act, output_w) + output_b | |
| # | |
| # but faster. | |
| left_act = jnp.transpose(left_act, [0, 2, 1]) | |
| act = jnp.einsum('acb,ade->dceb', left_act, right_act) | |
| act = jnp.einsum('dceb,cef->dbf', act, output_w) + output_b | |
| return jnp.transpose(act, [1, 0, 2]) | |
| act = mapping.inference_subbatch( | |
| compute_chunk, | |
| c.chunk_size, | |
| batched_args=[left_act], | |
| nonbatched_args=[], | |
| low_memory=True, | |
| input_subbatch_dim=1, | |
| output_subbatch_dim=0) | |
| epsilon = 1e-3 | |
| norm = jnp.einsum('abc,adc->bdc', mask, mask) | |
| act /= epsilon + norm | |
| return act | |
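# Standalone check (illustrative only) of the einsum identity quoted in the
# compute_chunk comment above: transposing left_act and rearranging the two
# contractions reproduces the naive double einsum.
def _check_outer_product_identity():
  k1, k2, k3 = jax.random.split(jax.random.PRNGKey(0), 3)
  left_act = jax.random.normal(k1, (4, 6, 5))    # [N_seq, N_res, c]
  right_act = jax.random.normal(k2, (4, 6, 5))   # [N_seq, N_res, c]
  output_w = jax.random.normal(k3, (5, 5, 7))    # [c, c, num_output_channel]
  naive = jnp.einsum('abc,ade->dceb', left_act, right_act)
  naive = jnp.einsum('dceb,cef->bdf', naive, output_w)
  fast = jnp.einsum('acb,ade->dceb',
                    jnp.transpose(left_act, [0, 2, 1]), right_act)
  fast = jnp.transpose(jnp.einsum('dceb,cef->dbf', fast, output_w), [1, 0, 2])
  return jnp.allclose(naive, fast, atol=1e-5)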
| def dgram_from_positions(positions, num_bins, min_bin, max_bin): | |
| """Compute distogram from amino acid positions. | |
| Arguments: | |
| positions: [N_res, 3] Position coordinates. | |
| num_bins: The number of bins in the distogram. | |
| min_bin: The left edge of the first bin. | |
| max_bin: The left edge of the final bin. The final bin catches | |
| everything larger than `max_bin`. | |
| Returns: | |
| Distogram with the specified number of bins. | |
| """ | |
| def squared_difference(x, y): | |
| return jnp.square(x - y) | |
| lower_breaks = jnp.linspace(min_bin, max_bin, num_bins) | |
| lower_breaks = jnp.square(lower_breaks) | |
| upper_breaks = jnp.concatenate([lower_breaks[1:],jnp.array([1e8], dtype=jnp.float32)], axis=-1) | |
| dist2 = jnp.sum( | |
| squared_difference( | |
| jnp.expand_dims(positions, axis=-2), | |
| jnp.expand_dims(positions, axis=-3)), | |
| axis=-1, keepdims=True) | |
| return ((dist2 > lower_breaks).astype(jnp.float32) * (dist2 < upper_breaks).astype(jnp.float32)) | |
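# Illustrative usage (not part of the original module): three points 0, 5 and
# 10 Angstrom apart along x. Each off-diagonal pair selects exactly one
# distance bin; pairs closer than min_bin (here the diagonal) select none, and
# pairs beyond max_bin fall into the open-ended last bin. The bin parameters
# below are example values, not a claim about any particular config.
def _demo_dgram_from_positions():
  positions = jnp.array([[0., 0., 0.], [5., 0., 0.], [10., 0., 0.]])
  dgram = dgram_from_positions(positions, num_bins=15,
                               min_bin=3.25, max_bin=20.75)
  return dgram   # [3, 3, 15]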
| def dgram_from_positions_soft(positions, num_bins, min_bin, max_bin, temp=2.0): | |
| '''Soft (differentiable) positions-to-distogram converter.''' | |
| lower_breaks = jnp.append(-1e8,jnp.linspace(min_bin, max_bin, num_bins)) | |
| upper_breaks = jnp.append(lower_breaks[1:],1e8) | |
| dist = jnp.sqrt(jnp.square(positions[...,:,None,:] - positions[...,None,:,:]).sum(-1,keepdims=True) + 1e-8) | |
| o = jax.nn.sigmoid((dist - lower_breaks)/temp) * jax.nn.sigmoid((upper_breaks - dist)/temp) | |
| o = o/(o.sum(-1,keepdims=True) + 1e-8) | |
| return o[...,1:] | |
| def pseudo_beta_fn(aatype, all_atom_positions, all_atom_masks): | |
| """Create pseudo beta features.""" | |
| ca_idx = residue_constants.atom_order['CA'] | |
| cb_idx = residue_constants.atom_order['CB'] | |
| if jnp.issubdtype(aatype.dtype, jnp.integer): | |
| is_gly = jnp.equal(aatype, residue_constants.restype_order['G']) | |
| is_gly_tile = jnp.tile(is_gly[..., None], [1] * len(is_gly.shape) + [3]) | |
| pseudo_beta = jnp.where(is_gly_tile, all_atom_positions[..., ca_idx, :], all_atom_positions[..., cb_idx, :]) | |
| if all_atom_masks is not None: | |
| pseudo_beta_mask = jnp.where(is_gly, all_atom_masks[..., ca_idx], all_atom_masks[..., cb_idx]) | |
| pseudo_beta_mask = pseudo_beta_mask.astype(jnp.float32) | |
| return pseudo_beta, pseudo_beta_mask | |
| else: | |
| return pseudo_beta | |
| else: | |
| is_gly = aatype[...,residue_constants.restype_order['G']] | |
| ca_pos = all_atom_positions[...,ca_idx,:] | |
| cb_pos = all_atom_positions[...,cb_idx,:] | |
| pseudo_beta = is_gly[...,None] * ca_pos + (1-is_gly[...,None]) * cb_pos | |
| if all_atom_masks is not None: | |
| ca_mask = all_atom_masks[...,ca_idx] | |
| cb_mask = all_atom_masks[...,cb_idx] | |
| pseudo_beta_mask = is_gly * ca_mask + (1-is_gly) * cb_mask | |
| return pseudo_beta, pseudo_beta_mask | |
| else: | |
| return pseudo_beta | |
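# Illustrative usage (not part of the original module): with integer aatype,
# pseudo_beta_fn picks the CA position for glycine and the CB position for
# every other residue type.
def _demo_pseudo_beta():
  aatype = jnp.array([residue_constants.restype_order['G'],
                      residue_constants.restype_order['A']])
  positions = jnp.zeros((2, residue_constants.atom_type_num, 3))
  masks = jnp.ones((2, residue_constants.atom_type_num))
  pseudo_beta, pseudo_beta_mask = pseudo_beta_fn(aatype, positions, masks)
  return pseudo_beta, pseudo_beta_mask   # [2, 3], [2]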
| class EvoformerIteration(hk.Module): | |
| """Single iteration (block) of Evoformer stack. | |
| Jumper et al. (2021) Suppl. Alg. 6 "EvoformerStack" lines 2-10 | |
| """ | |
| def __init__(self, config, global_config, is_extra_msa, | |
| name='evoformer_iteration'): | |
| super().__init__(name=name) | |
| self.config = config | |
| self.global_config = global_config | |
| self.is_extra_msa = is_extra_msa | |
| def __call__(self, activations, masks, is_training=True, safe_key=None, scale_rate=1.0): | |
| """Builds EvoformerIteration module. | |
| Arguments: | |
| activations: Dictionary containing activations: | |
| * 'msa': MSA activations, shape [N_seq, N_res, c_m]. | |
| * 'pair': pair activations, shape [N_res, N_res, c_z]. | |
| masks: Dictionary of masks: | |
| * 'msa': MSA mask, shape [N_seq, N_res]. | |
| * 'pair': pair mask, shape [N_res, N_res]. | |
| is_training: Whether the module is in training mode. | |
| safe_key: prng.SafeKey encapsulating rng key. | |
| scale_rate: Multiplier applied to the configured dropout rate. | |
| Returns: | |
| Outputs, same shape/type as act. | |
| """ | |
| c = self.config | |
| gc = self.global_config | |
| msa_act, pair_act = activations['msa'], activations['pair'] | |
| if safe_key is None: | |
| safe_key = prng.SafeKey(hk.next_rng_key()) | |
| msa_mask, pair_mask = masks['msa'], masks['pair'] | |
| dropout_wrapper_fn = functools.partial( | |
| dropout_wrapper, | |
| is_training=is_training, | |
| global_config=gc, | |
| scale_rate=scale_rate) | |
| safe_key, *sub_keys = safe_key.split(10) | |
| sub_keys = iter(sub_keys) | |
| msa_act = dropout_wrapper_fn( | |
| MSARowAttentionWithPairBias( | |
| c.msa_row_attention_with_pair_bias, gc, | |
| name='msa_row_attention_with_pair_bias'), | |
| msa_act, | |
| msa_mask, | |
| safe_key=next(sub_keys), | |
| pair_act=pair_act) | |
| if not self.is_extra_msa: | |
| attn_mod = MSAColumnAttention( | |
| c.msa_column_attention, gc, name='msa_column_attention') | |
| else: | |
| attn_mod = MSAColumnGlobalAttention( | |
| c.msa_column_attention, gc, name='msa_column_global_attention') | |
| msa_act = dropout_wrapper_fn( | |
| attn_mod, | |
| msa_act, | |
| msa_mask, | |
| safe_key=next(sub_keys)) | |
| msa_act = dropout_wrapper_fn( | |
| Transition(c.msa_transition, gc, name='msa_transition'), | |
| msa_act, | |
| msa_mask, | |
| safe_key=next(sub_keys)) | |
| pair_act = dropout_wrapper_fn( | |
| OuterProductMean( | |
| config=c.outer_product_mean, | |
| global_config=self.global_config, | |
| num_output_channel=int(pair_act.shape[-1]), | |
| name='outer_product_mean'), | |
| msa_act, | |
| msa_mask, | |
| safe_key=next(sub_keys), | |
| output_act=pair_act) | |
| pair_act = dropout_wrapper_fn( | |
| TriangleMultiplication(c.triangle_multiplication_outgoing, gc, | |
| name='triangle_multiplication_outgoing'), | |
| pair_act, | |
| pair_mask, | |
| safe_key=next(sub_keys)) | |
| pair_act = dropout_wrapper_fn( | |
| TriangleMultiplication(c.triangle_multiplication_incoming, gc, | |
| name='triangle_multiplication_incoming'), | |
| pair_act, | |
| pair_mask, | |
| safe_key=next(sub_keys)) | |
| pair_act = dropout_wrapper_fn( | |
| TriangleAttention(c.triangle_attention_starting_node, gc, | |
| name='triangle_attention_starting_node'), | |
| pair_act, | |
| pair_mask, | |
| safe_key=next(sub_keys)) | |
| pair_act = dropout_wrapper_fn( | |
| TriangleAttention(c.triangle_attention_ending_node, gc, | |
| name='triangle_attention_ending_node'), | |
| pair_act, | |
| pair_mask, | |
| safe_key=next(sub_keys)) | |
| pair_act = dropout_wrapper_fn( | |
| Transition(c.pair_transition, gc, name='pair_transition'), | |
| pair_act, | |
| pair_mask, | |
| safe_key=next(sub_keys)) | |
| return {'msa': msa_act, 'pair': pair_act} | |
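| # Minimal usage sketch (not part of the model; assumes `cfg` is the standard | |
| # AlphaFold model config with the sub-module fields referenced above): | |
| # | |
| #   def block_fn(activations, masks): | |
| #     block = EvoformerIteration(cfg.model.embeddings_and_evoformer.evoformer, | |
| #                                cfg.model.global_config, is_extra_msa=False) | |
| #     return block(activations, masks, is_training=False) | |
| #   block_fn = hk.transform(block_fn) | |
| #   params = block_fn.init(jax.random.PRNGKey(0), activations, masks) | |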
| class EmbeddingsAndEvoformer(hk.Module): | |
| """Embeds the input data and runs Evoformer. | |
| Produces the MSA, single and pair representations. | |
| Jumper et al. (2021) Suppl. Alg. 2 "Inference" line 5-18 | |
| """ | |
| def __init__(self, config, global_config, name='evoformer'): | |
| super().__init__(name=name) | |
| self.config = config | |
| self.global_config = global_config | |
| def __call__(self, batch, is_training, safe_key=None): | |
| c = self.config | |
| gc = self.global_config | |
| if safe_key is None: | |
| safe_key = prng.SafeKey(hk.next_rng_key()) | |
| # Embed clustered MSA. | |
| # Jumper et al. (2021) Suppl. Alg. 2 "Inference" line 5 | |
| # Jumper et al. (2021) Suppl. Alg. 3 "InputEmbedder" | |
| preprocess_1d = common_modules.Linear( | |
| c.msa_channel, name='preprocess_1d')( | |
| batch['target_feat']) | |
| preprocess_msa = common_modules.Linear( | |
| c.msa_channel, name='preprocess_msa')( | |
| batch['msa_feat']) | |
| msa_activations = jnp.expand_dims(preprocess_1d, axis=0) + preprocess_msa | |
| left_single = common_modules.Linear( | |
| c.pair_channel, name='left_single')( | |
| batch['target_feat']) | |
| right_single = common_modules.Linear( | |
| c.pair_channel, name='right_single')( | |
| batch['target_feat']) | |
| pair_activations = left_single[:, None] + right_single[None] | |
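| # Outer sum of the left/right single embeddings initialises the | |
| # [N_res, N_res, pair_channel] pair representation. | |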
| mask_2d = batch['seq_mask'][:, None] * batch['seq_mask'][None, :] | |
| # Inject previous outputs for recycling. | |
| # Jumper et al. (2021) Suppl. Alg. 2 "Inference" line 6 | |
| # Jumper et al. (2021) Suppl. Alg. 32 "RecyclingEmbedder" | |
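| # One of 'prev_pos' (predicted coordinates) or 'prev_dgram' (predicted | |
| # distogram) is expected to be present in the batch; the resulting distogram | |
| # is embedded into the pair activations below. | |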
| if "prev_pos" in batch: | |
| # use predicted position input | |
| prev_pseudo_beta = pseudo_beta_fn(batch['aatype'], batch['prev_pos'], None) | |
| if c.backprop_dgram: | |
| dgram = dgram_from_positions_soft(prev_pseudo_beta, temp=c.backprop_dgram_temp, **c.prev_pos) | |
| else: | |
| dgram = dgram_from_positions(prev_pseudo_beta, **c.prev_pos) | |
| elif 'prev_dgram' in batch: | |
| # use predicted distogram input (from Sergey) | |
| dgram = jax.nn.softmax(batch["prev_dgram"]) | |
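| # Pool the 64-bin predicted distogram down to the coarser binning used by the | |
| # recycling embedder (assumed 15 bins, matching c.prev_pos): groups of four | |
| # consecutive bins are summed, and the shortest-distance bins fall into | |
| # column 0, which is zeroed out. | |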
| dgram_map = jax.nn.one_hot(jnp.repeat(jnp.append(0, jnp.arange(15)), 4), 15).at[:, 0].set(0) | |
| dgram = dgram @ dgram_map | |
| pair_activations += common_modules.Linear(c.pair_channel, name='prev_pos_linear')(dgram) | |
| if c.recycle_features: | |
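| # Recycle the layer-normalized first MSA row and pair representation | |
| # produced by the previous recycling iteration. | |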
| if 'prev_msa_first_row' in batch: | |
| prev_msa_first_row = hk.LayerNorm(axis=[-1], create_scale=True, create_offset=True, name='prev_msa_first_row_norm')(batch['prev_msa_first_row']) | |
| msa_activations = msa_activations.at[0].add(prev_msa_first_row) | |
| if 'prev_pair' in batch: | |
| pair_activations += hk.LayerNorm(axis=[-1], create_scale=True, create_offset=True, name='prev_pair_norm')(batch['prev_pair']) | |
| # Relative position encoding. | |
| # Jumper et al. (2021) Suppl. Alg. 4 "relpos" | |
| # Jumper et al. (2021) Suppl. Alg. 5 "one_hot" | |
| if c.max_relative_feature: | |
| # Add one-hot-encoded clipped residue distances to the pair activations. | |
| if "rel_pos" in batch: | |
| rel_pos = batch['rel_pos'] | |
| else: | |
| if "offset" in batch: | |
| offset = batch['offset'] | |
| else: | |
| pos = batch['residue_index'] | |
| offset = pos[:, None] - pos[None, :] | |
| rel_pos = jax.nn.one_hot( | |
| jnp.clip( | |
| offset + c.max_relative_feature, | |
| a_min=0, | |
| a_max=2 * c.max_relative_feature), | |
| 2 * c.max_relative_feature + 1) | |
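| # E.g. with max_relative_feature = 32 this yields 65 bins: an offset of -40 | |
| # clips to bin 0, an offset of 0 maps to bin 32, and an offset of +40 clips | |
| # to bin 64. | |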
| pair_activations += common_modules.Linear(c.pair_channel, name='pair_activiations')(rel_pos) | |
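| # NB: the Linear module name 'pair_activiations' above is misspelled in the | |
| # original AlphaFold source; it is kept verbatim so pretrained parameters | |
| # load correctly. | |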
| # Embed templates into the pair activations. | |
| # Jumper et al. (2021) Suppl. Alg. 2 "Inference" lines 9-13 | |
| if c.template.enabled: | |
| template_batch = {k: batch[k] for k in batch if k.startswith('template_')} | |
| template_pair_representation = TemplateEmbedding(c.template, gc)( | |
| pair_activations, | |
| template_batch, | |
| mask_2d, | |
| is_training=is_training, | |
| scale_rate=batch["scale_rate"]) | |
| pair_activations += template_pair_representation | |
| # Embed extra MSA features. | |
| # Jumper et al. (2021) Suppl. Alg. 2 "Inference" lines 14-16 | |
| extra_msa_feat = create_extra_msa_feature(batch) | |
| extra_msa_activations = common_modules.Linear( | |
| c.extra_msa_channel, | |
| name='extra_msa_activations')( | |
| extra_msa_feat) | |
| # Extra MSA Stack. | |
| # Jumper et al. (2021) Suppl. Alg. 18 "ExtraMsaStack" | |
| extra_msa_stack_input = { | |
| 'msa': extra_msa_activations, | |
| 'pair': pair_activations, | |
| } | |
| extra_msa_stack_iteration = EvoformerIteration( | |
| c.evoformer, gc, is_extra_msa=True, name='extra_msa_stack') | |
| def extra_msa_stack_fn(x): | |
| act, safe_key = x | |
| safe_key, safe_subkey = safe_key.split() | |
| extra_evoformer_output = extra_msa_stack_iteration( | |
| activations=act, | |
| masks={ | |
| 'msa': batch['extra_msa_mask'], | |
| 'pair': mask_2d | |
| }, | |
| is_training=is_training, | |
| safe_key=safe_subkey, scale_rate=batch["scale_rate"]) | |
| return (extra_evoformer_output, safe_key) | |
| if gc.use_remat: | |
| extra_msa_stack_fn = hk.remat(extra_msa_stack_fn) | |
| extra_msa_stack = layer_stack.layer_stack( | |
| c.extra_msa_stack_num_block)( | |
| extra_msa_stack_fn) | |
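| # layer_stack creates `extra_msa_stack_num_block` sets of block parameters | |
| # (stacked along a leading axis) and applies the block that many times via a | |
| # scan, avoiding Python-level unrolling of the loop. | |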
| extra_msa_output, safe_key = extra_msa_stack( | |
| (extra_msa_stack_input, safe_key)) | |
| pair_activations = extra_msa_output['pair'] | |
| evoformer_input = { | |
| 'msa': msa_activations, | |
| 'pair': pair_activations, | |
| } | |
| evoformer_masks = {'msa': batch['msa_mask'], 'pair': mask_2d} | |
| #################################################################### | |
| #################################################################### | |
| # Append num_templ rows to msa_activations with template embeddings. | |
| # Jumper et al. (2021) Suppl. Alg. 2 "Inference" lines 7-8 | |
| if c.template.enabled and c.template.embed_torsion_angles: | |
| if jnp.issubdtype(batch['template_aatype'].dtype, jnp.integer): | |
| num_templ, num_res = batch['template_aatype'].shape | |
| # Embed the templates aatypes. | |
| aatype = batch['template_aatype'] | |
| aatype_one_hot = jax.nn.one_hot(batch['template_aatype'], 22, axis=-1) | |
| else: | |
| num_templ, num_res, _ = batch['template_aatype'].shape | |
| aatype = batch['template_aatype'].argmax(-1) | |
| aatype_one_hot = batch['template_aatype'] | |
| # Embed the templates aatype, torsion angles and masks. | |
| # Shape (templates, residues, msa_channels) | |
| ret = all_atom.atom37_to_torsion_angles( | |
| aatype=aatype, | |
| all_atom_pos=batch['template_all_atom_positions'], | |
| all_atom_mask=batch['template_all_atom_masks'], | |
| # Ensure consistent behaviour during testing: | |
| placeholder_for_undefined=not gc.zero_init) | |
| template_features = jnp.concatenate([ | |
| aatype_one_hot, | |
| jnp.reshape(ret['torsion_angles_sin_cos'], [num_templ, num_res, 14]), | |
| jnp.reshape(ret['alt_torsion_angles_sin_cos'], [num_templ, num_res, 14]), | |
| ret['torsion_angles_mask']], axis=-1) | |
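| # Per-residue template features: 22 (aatype one-hot) + 14 + 14 (sin/cos of | |
| # the 7 torsion angles and their alternatives) + 7 (torsion masks) = 57 | |
| # channels. | |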
| template_activations = common_modules.Linear( | |
| c.msa_channel, | |
| initializer='relu', | |
| name='template_single_embedding')(template_features) | |
| template_activations = jax.nn.relu(template_activations) | |
| template_activations = common_modules.Linear( | |
| c.msa_channel, | |
| initializer='relu', | |
| name='template_projection')(template_activations) | |
| # Concatenate the templates to the msa. | |
| evoformer_input['msa'] = jnp.concatenate([evoformer_input['msa'], template_activations], axis=0) | |
| # Concatenate templates masks to the msa masks. | |
| # Use mask from the psi angle, as it only depends on the backbone atoms | |
| # from a single residue. | |
| torsion_angle_mask = ret['torsion_angles_mask'][:, :, 2] | |
| torsion_angle_mask = torsion_angle_mask.astype(evoformer_masks['msa'].dtype) | |
| evoformer_masks['msa'] = jnp.concatenate([evoformer_masks['msa'], torsion_angle_mask], axis=0) | |
| #################################################################### | |
| #################################################################### | |
| # Main trunk of the network | |
| # Jumper et al. (2021) Suppl. Alg. 2 "Inference" lines 17-18 | |
| evoformer_iteration = EvoformerIteration( | |
| c.evoformer, gc, is_extra_msa=False, name='evoformer_iteration') | |
| def evoformer_fn(x): | |
| act, safe_key = x | |
| safe_key, safe_subkey = safe_key.split() | |
| evoformer_output = evoformer_iteration( | |
| activations=act, | |
| masks=evoformer_masks, | |
| is_training=is_training, | |
| safe_key=safe_subkey, scale_rate=batch["scale_rate"]) | |
| return (evoformer_output, safe_key) | |
| if gc.use_remat: | |
| evoformer_fn = hk.remat(evoformer_fn) | |
| evoformer_stack = layer_stack.layer_stack(c.evoformer_num_block)(evoformer_fn) | |
| evoformer_output, safe_key = evoformer_stack((evoformer_input, safe_key)) | |
| msa_activations = evoformer_output['msa'] | |
| pair_activations = evoformer_output['pair'] | |
| single_activations = common_modules.Linear( | |
| c.seq_channel, name='single_activations')(msa_activations[0]) | |
| num_sequences = batch['msa_feat'].shape[0] | |
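| # Output shapes (N_seq = clustered MSA sequences, N_res = residues): | |
| #   'single':        [N_res, c.seq_channel] | |
| #   'pair':          [N_res, N_res, c.pair_channel] | |
| #   'msa':           [N_seq, N_res, c.msa_channel]  (template rows removed) | |
| #   'msa_first_row': [N_res, c.msa_channel] | |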
| output = { | |
| 'single': single_activations, | |
| 'pair': pair_activations, | |
| # Crop away template rows such that they are not used in MaskedMsaHead. | |
| 'msa': msa_activations[:num_sequences, :, :], | |
| 'msa_first_row': msa_activations[0], | |
| } | |
| return output | |
| #################################################################### | |
| #################################################################### | |
| class SingleTemplateEmbedding(hk.Module): | |
| """Embeds a single template. | |
| Jumper et al. (2021) Suppl. Alg. 2 "Inference" lines 9+11 | |
| """ | |
| def __init__(self, config, global_config, name='single_template_embedding'): | |
| super().__init__(name=name) | |
| self.config = config | |
| self.global_config = global_config | |
| def __call__(self, query_embedding, batch, mask_2d, is_training, scale_rate=1.0): | |
| """Build the single template embedding. | |
| Arguments: | |
| query_embedding: Query pair representation, shape [N_res, N_res, c_z]. | |
| batch: A batch of template features (note the template dimension has been | |
| stripped out as this module only runs over a single template). | |
| mask_2d: Padding mask (Note: this doesn't care if a template exists, | |
| unlike the template_pseudo_beta_mask). | |
| is_training: Whether the module is in training mode. | |
| scale_rate: Multiplier applied to the dropout rate in the template pair stack. | |
| Returns: | |
| A template embedding [N_res, N_res, c_z]. | |
| """ | |
| assert mask_2d.dtype == query_embedding.dtype | |
| dtype = query_embedding.dtype | |
| num_res = batch['template_aatype'].shape[0] | |
| num_channels = (self.config.template_pair_stack | |
| .triangle_attention_ending_node.value_dim) | |
| template_mask = batch['template_pseudo_beta_mask'] | |
| template_mask_2d = template_mask[:, None] * template_mask[None, :] | |
| template_mask_2d = template_mask_2d.astype(dtype) | |
| if "template_dgram" in batch: | |
| template_dgram = batch["template_dgram"] | |
| else: | |
| if self.config.backprop_dgram: | |
| template_dgram = dgram_from_positions_soft(batch['template_pseudo_beta'], | |
| temp=self.config.backprop_dgram_temp, | |
| **self.config.dgram_features) | |
| else: | |
| template_dgram = dgram_from_positions(batch['template_pseudo_beta'], | |
| **self.config.dgram_features) | |
| template_dgram = template_dgram.astype(dtype) | |
| to_concat = [template_dgram, template_mask_2d[:, :, None]] | |
| if jnp.issubdtype(batch['template_aatype'].dtype, jnp.integer): | |
| aatype = jax.nn.one_hot(batch['template_aatype'], 22, axis=-1, dtype=dtype) | |
| else: | |
| aatype = batch['template_aatype'] | |
| to_concat.append(jnp.tile(aatype[None, :, :], [num_res, 1, 1])) | |
| to_concat.append(jnp.tile(aatype[:, None, :], [1, num_res, 1])) | |
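| # Tile the per-residue aatype one-hot along both residue axes so that the | |
| # pair feature at (i, j) carries the amino-acid identities of residue j | |
| # (first tile) and residue i (second tile). | |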
| # Backbone affine mask: whether the residue has C, CA, N | |
| # (the template mask defined above only considers pseudo CB). | |
| n, ca, c = [residue_constants.atom_order[a] for a in ('N', 'CA', 'C')] | |
| template_mask = ( | |
| batch['template_all_atom_masks'][..., n] * | |
| batch['template_all_atom_masks'][..., ca] * | |
| batch['template_all_atom_masks'][..., c]) | |
| template_mask_2d = template_mask[:, None] * template_mask[None, :] | |
| # compute unit_vector (not used by default) | |
| if self.config.use_template_unit_vector: | |
| rot, trans = quat_affine.make_transform_from_reference( | |
| n_xyz=batch['template_all_atom_positions'][:, n], | |
| ca_xyz=batch['template_all_atom_positions'][:, ca], | |
| c_xyz=batch['template_all_atom_positions'][:, c]) | |
| affines = quat_affine.QuatAffine( | |
| quaternion=quat_affine.rot_to_quat(rot, unstack_inputs=True), | |
| translation=trans, | |
| rotation=rot, | |
| unstack_inputs=True) | |
| points = [jnp.expand_dims(x, axis=-2) for x in affines.translation] | |
| affine_vec = affines.invert_point(points, extra_dims=1) | |
| inv_distance_scalar = jax.lax.rsqrt(1e-6 + sum([jnp.square(x) for x in affine_vec])) | |
| inv_distance_scalar *= template_mask_2d.astype(inv_distance_scalar.dtype) | |
| unit_vector = [(x * inv_distance_scalar)[..., None] for x in affine_vec] | |
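| # Each unit vector is the CA-CA displacement of a residue pair expressed in | |
| # the local backbone frame of one of the residues, normalized to unit length | |
| # and zeroed where the backbone mask is missing. | |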
| else: | |
| unit_vector = [jnp.zeros((num_res, num_res, 1))] * 3 | |
| unit_vector = [x.astype(dtype) for x in unit_vector] | |
| to_concat.extend(unit_vector) | |
| template_mask_2d = template_mask_2d.astype(dtype) | |
| to_concat.append(template_mask_2d[..., None]) | |
| act = jnp.concatenate(to_concat, axis=-1) | |
| # Mask out non-template regions so we don't get arbitrary values in the | |
| # distogram for these regions. | |
| act *= template_mask_2d[..., None] | |
| # Jumper et al. (2021) Suppl. Alg. 2 "Inference" line 9 | |
| act = common_modules.Linear( | |
| num_channels, | |
| initializer='relu', | |
| name='embedding2d')(act) | |
| # Jumper et al. (2021) Suppl. Alg. 2 "Inference" line 11 | |
| act = TemplatePairStack( | |
| self.config.template_pair_stack, self.global_config)(act, mask_2d, is_training, scale_rate=scale_rate) | |
| act = hk.LayerNorm(axis=[-1], create_scale=True, create_offset=True, name='output_layer_norm')(act) | |
| return act | |
| class TemplateEmbedding(hk.Module): | |
| """Embeds a set of templates. | |
| Jumper et al. (2021) Suppl. Alg. 2 "Inference" lines 9-12 | |
| Jumper et al. (2021) Suppl. Alg. 17 "TemplatePointwiseAttention" | |
| """ | |
| def __init__(self, config, global_config, name='template_embedding'): | |
| super().__init__(name=name) | |
| self.config = config | |
| self.global_config = global_config | |
| def __call__(self, query_embedding, template_batch, mask_2d, is_training, scale_rate=1.0): | |
| """Build TemplateEmbedding module. | |
| Arguments: | |
| query_embedding: Query pair representation, shape [N_res, N_res, c_z]. | |
| template_batch: A batch of template features. | |
| mask_2d: Padding mask (Note: this doesn't care if a template exists, | |
| unlike the template_pseudo_beta_mask). | |
| is_training: Whether the module is in training mode. | |
| scale_rate: Multiplier passed to each SingleTemplateEmbedding to scale its dropout rate. | |
| Returns: | |
| A template embedding [N_res, N_res, c_z]. | |
| """ | |
| num_templates = template_batch['template_mask'].shape[0] | |
| num_channels = (self.config.template_pair_stack | |
| .triangle_attention_ending_node.value_dim) | |
| num_res = query_embedding.shape[0] | |
| dtype = query_embedding.dtype | |
| template_mask = template_batch['template_mask'] | |
| template_mask = template_mask.astype(dtype) | |
| query_num_channels = query_embedding.shape[-1] | |
| # Make sure the weights are shared across templates by constructing the | |
| # embedder here. | |
| # Jumper et al. (2021) Suppl. Alg. 2 "Inference" lines 9-12 | |
| template_embedder = SingleTemplateEmbedding(self.config, self.global_config) | |
| def map_fn(batch): | |
| return template_embedder(query_embedding, batch, mask_2d, is_training, scale_rate=scale_rate) | |
| template_pair_representation = mapping.sharded_map(map_fn, in_axes=0)(template_batch) | |
| # Cross attend from the query to the templates along the residue | |
| # dimension by flattening everything else into the batch dimension. | |
| # Jumper et al. (2021) Suppl. Alg. 17 "TemplatePointwiseAttention" | |
| flat_query = jnp.reshape(query_embedding, [num_res * num_res, 1, query_num_channels]) | |
| flat_templates = jnp.reshape( | |
| jnp.transpose(template_pair_representation, [1, 2, 0, 3]), | |
| [num_res * num_res, num_templates, num_channels]) | |
| bias = (1e9 * (template_mask[None, None, None, :] - 1.)) | |
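| # Templates that are absent (template_mask == 0) receive a large negative | |
| # attention bias and are effectively ignored by the softmax. | |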
| template_pointwise_attention_module = Attention( | |
| self.config.attention, self.global_config, query_num_channels) | |
| nonbatched_args = [bias] | |
| batched_args = [flat_query, flat_templates] | |
| embedding = mapping.inference_subbatch( | |
| template_pointwise_attention_module, | |
| self.config.subbatch_size, | |
| batched_args=batched_args, | |
| nonbatched_args=nonbatched_args, | |
| low_memory=not is_training) | |
| embedding = jnp.reshape(embedding, [num_res, num_res, query_num_channels]) | |
| # No gradients if no templates. | |
| embedding *= (jnp.sum(template_mask) > 0.).astype(embedding.dtype) | |
| return embedding | |
| #################################################################### | |