import math
import torch
import torch.nn as nn
# NOTE: for the gzlabel contable_gpu env, whose older PyTorch lacks the `batch_first`
# argument of nn.MultiheadAttention, a local copy of the class from a newer PyTorch
# release can be vendored here and used below in place of torch.nn.MultiheadAttention.
class PositionalEncoding(nn.Module):
    """Implement the sinusoidal positional encoding (PE) function."""
    def __init__(self, d_model, dropout, max_len=5000):
        # e.g. d_model=512, dropout=0.1.
        # max_len=5000 precomputes positional encodings for sequences of up to
        # 5000 positions; in practice 100 or 200 is usually more than enough.
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        # (max_len, d_model) matrix holding one d_model-dimensional encoding
        # vector for each of the max_len positions.
        position = torch.arange(0, max_len).unsqueeze(1)
        # (max_len,) -> (max_len, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2) *
                             -(math.log(10000.0) / d_model))
        # d_model // 2 frequency terms (indices 0, 2, ..., d_model - 2), shared by sin and cos.
        pe[:, 0::2] = torch.sin(position * div_term)  # even feature indices
        pe[:, 1::2] = torch.cos(position * div_term)  # odd feature indices
        pe = pe.unsqueeze(0)
        # (max_len, d_model) -> (1, max_len, d_model), leaving room for the batch dimension.
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        # Takes the embedding output x and adds the (non-trainable) positional
        # encoding pe to it. For example, if x is a (30, 10, 512) tensor
        # (batch size 30, sequence length 10, embedding dim 512), the second
        # term is (1, min(10, max_len), 512) = (1, 10, 512), which broadcasts
        # to (30, 10, 512) so that all 30 sequences in the batch receive the
        # same positional encoding.
        return self.dropout(x)  # apply dropout once more
# Note: the positional encoding is fixed, not learned, so this class has no trainable parameters.
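# Usage sketch (illustrative only; the sizes below are assumptions, not values
# taken from the original training setup):
#   >>> pos_enc = PositionalEncoding(d_model=128, dropout=0.1)
#   >>> x = torch.zeros(2, 10, 128)   # (batch, seq_len, d_model)
#   >>> pos_enc(x).shape              # shape unchanged; PE is broadcast over the batch
#   torch.Size([2, 10, 128])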
class TwoTrackAttention(nn.Module):
    """One two-track update block: self-attention over the track being updated and
    cross-attention from the other track share a single residual connection and
    LayerNorm, followed by a feed-forward sublayer with its own residual and LayerNorm."""
    def __init__(self, d_attn, n_head, d_ff=512, dropout=0.1) -> None:
        super().__init__()
        self.self_attn = torch.nn.MultiheadAttention(
            d_attn, n_head,
            dropout=dropout,
            batch_first=True  # NOTE: the PyTorch version on the gzbl env does not have this argument
        )
self.dropout_self = nn.Dropout(dropout)
        self.cross_attn = torch.nn.MultiheadAttention(
            d_attn, n_head,
            dropout=dropout,
            batch_first=True
        )
self.dropout_cross = nn.Dropout(dropout)
self.norm1 = nn.LayerNorm(d_attn)
self.ff1 = nn.Linear(d_attn, d_ff)
self.dropout_ff = nn.Dropout(dropout)
self.ff2 = nn.Linear(d_ff, d_attn)
self.norm2 = nn.LayerNorm(d_attn)
self.dropout = nn.Dropout(dropout)
self.activation = nn.ReLU()
# self.s_query = nn.Linear(d_attn,d_attn)
# self.s_key = nn.Linear(d_attn,d_attn)
# self.s_value = nn.Linear(d_attn,d_attn)
#
# self.c_query = nn.Linear(d_attn,d_attn)
# self.c_key = nn.Linear(d_attn,d_attn)
# self.c_value = nn.Linear(d_attn,d_attn)
def forward(self, obj_update, obj_message):
        self_update = self.self_attn(
            query=obj_update,
            key=obj_update,
            value=obj_update
        )[0]
        cross_update = self.cross_attn(
            query=obj_update,   # [1, 299, 128]
            key=obj_message,    # [1, 74, 128]
            value=obj_message   # [1, 74, 128]
        )[0]
# [torch.Size([1, 299, 128]), torch.Size([1, 74, 128]), torch.Size([1, 74, 128])]
obj_update = obj_update + self.dropout_self(self_update) + self.dropout_cross(cross_update)
obj_update = self.norm1(obj_update)
ff_update = self.ff2(self.dropout_ff(self.activation(self.ff1(obj_update))))
obj_update = obj_update + self.dropout(ff_update)
obj_update = self.norm2(obj_update)
return obj_update
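# Usage sketch (illustrative; the shapes mirror the comments above and are not prescriptive):
#   >>> block = TwoTrackAttention(d_attn=128, n_head=4)
#   >>> upd = block(torch.randn(1, 299, 128), torch.randn(1, 74, 128))
#   >>> upd.shape
#   torch.Size([1, 299, 128])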
class SymertricTwoTrackAttention(nn.Module):
    def __init__(self, d_attn, n_head, d_ff=512, dropout=0.1, sync=False) -> None:
        super().__init__()
        self.tta1 = TwoTrackAttention(d_attn, n_head, d_ff, dropout)
        self.tta2 = TwoTrackAttention(d_attn, n_head, d_ff, dropout)
        self.sync = sync

    def forward(self, obj_1, obj_2):
        if self.sync:
            # Synchronous update: both tracks attend to the original version of the other track.
            return self.tta1(obj_1, obj_2), self.tta2(obj_2, obj_1)
        else:
            # Sequential update: obj_2 attends to the already-updated obj_1.
            obj_1 = self.tta1(obj_1, obj_2)
            obj_2 = self.tta2(obj_2, obj_1)
            return obj_1, obj_2
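# Usage sketch (illustrative; d_attn/n_head values are assumptions):
#   >>> pair_block = SymertricTwoTrackAttention(d_attn=128, n_head=4, sync=True)
#   >>> a, b = torch.randn(1, 299, 128), torch.randn(1, 74, 128)
#   >>> a2, b2 = pair_block(a, b)   # a2: [1, 299, 128], b2: [1, 74, 128]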
class LinearFF(nn.Module):
def __init__(self, d_in, d_out, dropout=0.1) -> None:
super().__init__()
self.emb = nn.Linear(d_in, d_out)
self.norm = nn.LayerNorm(d_out)
self.dropout = nn.Dropout(dropout)
self.activation = nn.ReLU()
    def forward(self, f_in):
        # Input features arrive channel-first as [B, d_in, L]; move the feature
        # dimension last ([B, L, d_in]) before the linear projection.
        f_in = f_in.permute(0, 2, 1)
        return self.norm(self.dropout(self.activation(self.emb(f_in))))
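# Usage sketch (illustrative sizes, assumed rather than taken from the original config):
#   >>> ff = LinearFF(d_in=1024, d_out=128)
#   >>> ff(torch.randn(1, 1024, 299)).shape   # channel-first input [B, d_in, L]
#   torch.Size([1, 299, 128])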
class ProteinRNAInteraction(nn.Module):
    def __init__(self, d_pro, d_rna, n_layers, d_attn, n_head=4, d_ff=512, dropout=0.1, sync=False) -> None:
        super().__init__()
        print('sync update ProteinRNAInteraction', sync)
        self.pro_emb = LinearFF(d_pro, d_attn)
        self.pro_rna = LinearFF(d_rna, d_attn)
        self.pro_pos = PositionalEncoding(d_attn, dropout)
        self.rna_pos = PositionalEncoding(d_attn, dropout)
        self.layers = nn.ModuleList([
            SymertricTwoTrackAttention(d_attn, n_head, d_ff, dropout, sync=sync) for _ in range(n_layers)
        ])
        self.pred = nn.Linear(d_attn, 1)
        # self.pred = nn.Linear(2*d_attn, 1)
        self.sigmoid = nn.Sigmoid()
def forward(self, f_pro, f_rna):
# print(f_pro.shape)
# print(f_pro.device)
f_pro = self.pro_emb(f_pro)
f_rna = self.pro_rna(f_rna)
f_pro = self.pro_pos(f_pro)
f_rna = self.rna_pos(f_rna)
for layer in self.layers:
f_pro, f_rna = layer(f_pro, f_rna)
        f_pro = f_pro.unsqueeze(2)  # [B, L, 1, D]
        f_rna = f_rna.unsqueeze(1)  # [B, 1, R, D]
        # Broadcasting the element-wise product gives [B, L, R, D]; the final
        # linear + sigmoid maps each protein/RNA position pair to a probability.
        prob = self.sigmoid(self.pred(f_rna.mul(f_pro)))  # [B, L, R, 1]
        return prob
# f_pro = f_pro.unsqueeze(2) # [1, 299, 1, 128]
# f_rna = f_rna.unsqueeze(1) # [1, 1, 74, 128]
# f_pro = f_pro.repeat(1, 1, f_rna.shape[2], 1) # [B, L, R, D]
# f_rna = f_rna.repeat(1, f_pro.shape[1], 1, 1) # [B, L, R, D]
#
# # prob = self.pred(f_rna.mul(f_pro))
# prob = self.pred(torch.cat([f_pro, f_rna], -1))
# # print(prob.max(),prob.min(),prob.mean())
# prob = torch.sigmoid(prob)
# # prob = self.sigmoid(prob)
# # prob = self.sigmoid(self.pred(torch.cat([f_pro, f_rna], -1))) # pred : -0.06, 0.619
# return prob
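# A minimal smoke test, assuming illustrative input sizes (d_pro=1024, d_rna=640 and
# the sequence lengths below are assumptions, not the values used in the original experiments).
if __name__ == "__main__":
    model = ProteinRNAInteraction(d_pro=1024, d_rna=640, n_layers=2, d_attn=128)
    f_pro = torch.randn(1, 1024, 299)   # [B, d_pro, L_protein], channel-first as LinearFF expects
    f_rna = torch.randn(1, 640, 74)     # [B, d_rna, L_rna]
    prob = model(f_pro, f_rna)
    print(prob.shape)                   # expected: torch.Size([1, 299, 74, 1])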