julse committed on
Commit
82d55c6
·
verified ·
1 Parent(s): 4d203ee

Upload 23 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ example/outputs/8DMB_W.8DMB_P_0_evaluate.png filter=lfs diff=lfs merge=lfs -text
37
+ example/outputs/8DMB_W.8DMB_P_0_prob.png filter=lfs diff=lfs merge=lfs -text
RNA_protein/.DS_Store ADDED
Binary file (6.15 kB). View file
 
RNA_protein/model/.DS_Store ADDED
Binary file (6.15 kB). View file
 
RNA_protein/model/atn_gz.py ADDED
@@ -0,0 +1,342 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ # from torch.nn import Module
6
+ # # for gzlabel contable_gpu env
7
+ # class MultiheadAttention(Module):
8
+ # r"""Allows the model to jointly attend to information
9
+ # from different representation subspaces.
10
+ # See `Attention Is All You Need <https://arxiv.org/abs/1706.03762>`_.
11
+ #
12
+ # .. math::
13
+ # \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O
14
+ #
15
+ # where :math:`head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`.
16
+ #
17
+ # Args:
18
+ # embed_dim: Total dimension of the model.
19
+ # num_heads: Number of parallel attention heads. Note that ``embed_dim`` will be split
20
+ # across ``num_heads`` (i.e. each head will have dimension ``embed_dim // num_heads``).
21
+ # dropout: Dropout probability on ``attn_output_weights``. Default: ``0.0`` (no dropout).
22
+ # bias: If specified, adds bias to input / output projection layers. Default: ``True``.
23
+ # add_bias_kv: If specified, adds bias to the key and value sequences at dim=0. Default: ``False``.
24
+ # add_zero_attn: If specified, adds a new batch of zeros to the key and value sequences at dim=1.
25
+ # Default: ``False``.
26
+ # kdim: Total number of features for keys. Default: ``None`` (uses ``kdim=embed_dim``).
27
+ # vdim: Total number of features for values. Default: ``None`` (uses ``vdim=embed_dim``).
28
+ # batch_first: If ``True``, then the input and output tensors are provided
29
+ # as (batch, seq, feature). Default: ``False`` (seq, batch, feature).
30
+ #
31
+ # Examples::
32
+ #
33
+ # >>> multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
34
+ # >>> attn_output, attn_output_weights = multihead_attn(query, key, value)
35
+ # """
36
+ # __constants__ = ['batch_first']
37
+ # bias_k: Optional[torch.Tensor]
38
+ # bias_v: Optional[torch.Tensor]
39
+ #
40
+ # def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bias_kv=False, add_zero_attn=False,
41
+ # kdim=None, vdim=None, batch_first=False, device=None, dtype=None) -> None:
42
+ # factory_kwargs = {'device': device, 'dtype': dtype}
43
+ # super(MultiheadAttention, self).__init__()
44
+ # self.embed_dim = embed_dim
45
+ # self.kdim = kdim if kdim is not None else embed_dim
46
+ # self.vdim = vdim if vdim is not None else embed_dim
47
+ # self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
48
+ #
49
+ # self.num_heads = num_heads
50
+ # self.dropout = dropout
51
+ # self.batch_first = batch_first
52
+ # self.head_dim = embed_dim // num_heads
53
+ # assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
54
+ #
55
+ # if self._qkv_same_embed_dim is False:
56
+ # self.q_proj_weight = Parameter(torch.empty((embed_dim, embed_dim), **factory_kwargs))
57
+ # self.k_proj_weight = Parameter(torch.empty((embed_dim, self.kdim), **factory_kwargs))
58
+ # self.v_proj_weight = Parameter(torch.empty((embed_dim, self.vdim), **factory_kwargs))
59
+ # self.register_parameter('in_proj_weight', None)
60
+ # else:
61
+ # self.in_proj_weight = Parameter(torch.empty((3 * embed_dim, embed_dim), **factory_kwargs))
62
+ # self.register_parameter('q_proj_weight', None)
63
+ # self.register_parameter('k_proj_weight', None)
64
+ # self.register_parameter('v_proj_weight', None)
65
+ #
66
+ # if bias:
67
+ # self.in_proj_bias = Parameter(torch.empty(3 * embed_dim, **factory_kwargs))
68
+ # else:
69
+ # self.register_parameter('in_proj_bias', None)
70
+ # self.out_proj = NonDynamicallyQuantizableLinear(embed_dim, embed_dim, bias=bias, **factory_kwargs)
71
+ #
72
+ # if add_bias_kv:
73
+ # self.bias_k = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs))
74
+ # self.bias_v = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs))
75
+ # else:
76
+ # self.bias_k = self.bias_v = None
77
+ #
78
+ # self.add_zero_attn = add_zero_attn
79
+ #
80
+ # self._reset_parameters()
81
+ #
82
+ # def _reset_parameters(self):
83
+ # if self._qkv_same_embed_dim:
84
+ # xavier_uniform_(self.in_proj_weight)
85
+ # else:
86
+ # xavier_uniform_(self.q_proj_weight)
87
+ # xavier_uniform_(self.k_proj_weight)
88
+ # xavier_uniform_(self.v_proj_weight)
89
+ #
90
+ # if self.in_proj_bias is not None:
91
+ # constant_(self.in_proj_bias, 0.)
92
+ # constant_(self.out_proj.bias, 0.)
93
+ # if self.bias_k is not None:
94
+ # xavier_normal_(self.bias_k)
95
+ # if self.bias_v is not None:
96
+ # xavier_normal_(self.bias_v)
97
+ #
98
+ # def __setstate__(self, state):
99
+ # # Support loading old MultiheadAttention checkpoints generated by v1.1.0
100
+ # if '_qkv_same_embed_dim' not in state:
101
+ # state['_qkv_same_embed_dim'] = True
102
+ #
103
+ # super(MultiheadAttention, self).__setstate__(state)
104
+ #
105
+ # def forward(self, query: Tensor, key: Tensor, value: Tensor, key_padding_mask: Optional[Tensor] = None,
106
+ # need_weights: bool = True, attn_mask: Optional[Tensor] = None) -> Tuple[Tensor, Optional[Tensor]]:
107
+ # r"""
108
+ # Args:
109
+ # query: Query embeddings of shape :math:`(L, N, E_q)` when ``batch_first=False`` or :math:`(N, L, E_q)`
110
+ # when ``batch_first=True``, where :math:`L` is the target sequence length, :math:`N` is the batch size,
111
+ # and :math:`E_q` is the query embedding dimension ``embed_dim``. Queries are compared against
112
+ # key-value pairs to produce the output. See "Attention Is All You Need" for more details.
113
+ # key: Key embeddings of shape :math:`(S, N, E_k)` when ``batch_first=False`` or :math:`(N, S, E_k)` when
114
+ # ``batch_first=True``, where :math:`S` is the source sequence length, :math:`N` is the batch size, and
115
+ # :math:`E_k` is the key embedding dimension ``kdim``. See "Attention Is All You Need" for more details.
116
+ # value: Value embeddings of shape :math:`(S, N, E_v)` when ``batch_first=False`` or :math:`(N, S, E_v)` when
117
+ # ``batch_first=True``, where :math:`S` is the source sequence length, :math:`N` is the batch size, and
118
+ # :math:`E_v` is the value embedding dimension ``vdim``. See "Attention Is All You Need" for more details.
119
+ # key_padding_mask: If specified, a mask of shape :math:`(N, S)` indicating which elements within ``key``
120
+ # to ignore for the purpose of attention (i.e. treat as "padding"). Binary and byte masks are supported.
121
+ # For a binary mask, a ``True`` value indicates that the corresponding ``key`` value will be ignored for
122
+ # the purpose of attention. For a byte mask, a non-zero value indicates that the corresponding ``key``
123
+ # value will be ignored.
124
+ # need_weights: If specified, returns ``attn_output_weights`` in addition to ``attn_outputs``.
125
+ # Default: ``True``.
126
+ # attn_mask: If specified, a 2D or 3D mask preventing attention to certain positions. Must be of shape
127
+ # :math:`(L, S)` or :math:`(N\cdot\text{num\_heads}, L, S)`, where :math:`N` is the batch size,
128
+ # :math:`L` is the target sequence length, and :math:`S` is the source sequence length. A 2D mask will be
129
+ # broadcasted across the batch while a 3D mask allows for a different mask for each entry in the batch.
130
+ # Binary, byte, and float masks are supported. For a binary mask, a ``True`` value indicates that the
131
+ # corresponding position is not allowed to attend. For a byte mask, a non-zero value indicates that the
132
+ # corresponding position is not allowed to attend. For a float mask, the mask values will be added to
133
+ # the attention weight.
134
+ #
135
+ # Outputs:
136
+ # - **attn_output** - Attention outputs of shape :math:`(L, N, E)` when ``batch_first=False`` or
137
+ # :math:`(N, L, E)` when ``batch_first=True``, where :math:`L` is the target sequence length, :math:`N` is
138
+ # the batch size, and :math:`E` is the embedding dimension ``embed_dim``.
139
+ # - **attn_output_weights** - Attention output weights of shape :math:`(N, L, S)`, where :math:`N` is the batch
140
+ # size, :math:`L` is the target sequence length, and :math:`S` is the source sequence length. Only returned
141
+ # when ``need_weights=True``.
142
+ # """
143
+ # if self.batch_first:
144
+ # query, key, value = [x.transpose(1, 0) for x in (query, key, value)]
145
+ #
146
+ # if not self._qkv_same_embed_dim:
147
+ # attn_output, attn_output_weights = F.multi_head_attention_forward(
148
+ # query, key, value, self.embed_dim, self.num_heads,
149
+ # self.in_proj_weight, self.in_proj_bias,
150
+ # self.bias_k, self.bias_v, self.add_zero_attn,
151
+ # self.dropout, self.out_proj.weight, self.out_proj.bias,
152
+ # training=self.training,
153
+ # key_padding_mask=key_padding_mask, need_weights=need_weights,
154
+ # attn_mask=attn_mask, use_separate_proj_weight=True,
155
+ # q_proj_weight=self.q_proj_weight, k_proj_weight=self.k_proj_weight,
156
+ # v_proj_weight=self.v_proj_weight)
157
+ # else:
158
+ # attn_output, attn_output_weights = F.multi_head_attention_forward(
159
+ # query, key, value, self.embed_dim, self.num_heads,
160
+ # self.in_proj_weight, self.in_proj_bias,
161
+ # self.bias_k, self.bias_v, self.add_zero_attn,
162
+ # self.dropout, self.out_proj.weight, self.out_proj.bias,
163
+ # training=self.training,
164
+ # key_padding_mask=key_padding_mask, need_weights=need_weights,
165
+ # attn_mask=attn_mask)
166
+ # if self.batch_first:
167
+ # return attn_output.transpose(1, 0), attn_output_weights
168
+ # else:
169
+ # return attn_output, attn_output_weights
170
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch-first tensor.

    A ``(1, max_len, d_model)`` table is computed once in ``__init__`` and
    stored as the buffer ``pe``; the module therefore has no trainable
    parameters. Even feature indices hold ``sin`` terms and odd feature
    indices hold ``cos`` terms.

    Args:
        d_model: Size of the feature (last) dimension. Odd values are
            supported: the trailing feature then only receives a ``sin`` term.
        dropout: Dropout probability applied after the encoding is added.
        max_len: Number of positions to precompute. Inputs must not be
            longer than this along dimension 1.
    """

    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)  # (max_len, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2) *
                             -(math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe[:, 0::2] = torch.sin(angles)  # even feature indices
        # There are only d_model // 2 odd feature indices. Trimming the
        # angles keeps odd d_model from failing with a shape mismatch and is
        # a no-op when d_model is even.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])  # odd feature indices

        # (max_len, d_model) -> (1, max_len, d_model): leading axis broadcasts
        # over the batch.
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:, :x.size(1)])``.

        ``x`` is indexed as (batch, sequence, feature): the table is sliced to
        ``x.size(1)`` positions and broadcast over dimension 0, so every
        sequence in the batch receives the same encoding.
        """
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)
204
class TwoTrackAttention(nn.Module):
    """Update one sequence with self-attention plus cross-attention.

    The block attends from ``obj_update`` to itself and from ``obj_update``
    to ``obj_message``, adds both results to the input (residual), applies
    layer normalisation, and then runs a two-layer ReLU feed-forward network
    with a second residual connection and layer normalisation.

    Args:
        d_attn: Feature size of both input sequences.
        n_head: Number of attention heads.
        d_ff: Hidden size of the feed-forward network.
        dropout: Dropout probability used in the attention layers and on
            every residual branch.
    """

    def __init__(self, d_attn, n_head, d_ff=512, dropout=0.1) -> None:
        super().__init__()

        # NOTE: per the original author, the PyTorch build on the "gzbl"
        # machine does not support the `batch_first` argument.
        self.self_attn = nn.MultiheadAttention(
            d_attn, n_head, dropout=dropout, batch_first=True)
        self.dropout_self = nn.Dropout(dropout)

        self.cross_attn = nn.MultiheadAttention(
            d_attn, n_head, dropout=dropout, batch_first=True)
        self.dropout_cross = nn.Dropout(dropout)

        self.norm1 = nn.LayerNorm(d_attn)

        # Position-wise feed-forward network: d_attn -> d_ff -> d_attn.
        self.ff1 = nn.Linear(d_attn, d_ff)
        self.dropout_ff = nn.Dropout(dropout)
        self.ff2 = nn.Linear(d_ff, d_attn)

        self.norm2 = nn.LayerNorm(d_attn)
        self.dropout = nn.Dropout(dropout)

        self.activation = nn.ReLU()

    def forward(self, obj_update, obj_message):
        """Return the updated ``obj_update`` (same shape as the input).

        Both tensors are batch-first; the original author's example shapes
        are ``[1, 299, 128]`` for ``obj_update`` and ``[1, 74, 128]`` for
        ``obj_message``.
        """
        # Only the attention outputs are needed; the weights are discarded.
        self_update, _ = self.self_attn(obj_update, obj_update, obj_update)
        cross_update, _ = self.cross_attn(obj_update, obj_message, obj_message)

        attended = (obj_update
                    + self.dropout_self(self_update)
                    + self.dropout_cross(cross_update))
        attended = self.norm1(attended)

        hidden = self.activation(self.ff1(attended))
        ff_update = self.ff2(self.dropout_ff(hidden))

        return self.norm2(attended + self.dropout(ff_update))
263
+
264
+
265
class SymertricTwoTrackAttention(nn.Module):
    """Update two sequences against each other with two TwoTrackAttention blocks.

    Args:
        d_attn, n_head, d_ff, dropout: Forwarded to both ``TwoTrackAttention``
            blocks.
        sync: When true, both tracks are updated from the *original* inputs.
            When false, the second track attends to the already-updated
            first track.
    """

    def __init__(self, d_attn, n_head, d_ff=512, dropout=0.1, sync=False) -> None:
        super().__init__()
        self.tta1 = TwoTrackAttention(d_attn, n_head, d_ff, dropout)
        self.tta2 = TwoTrackAttention(d_attn, n_head, d_ff, dropout)
        self.sync = sync

    def forward(self, obj_1, obj_2):
        """Return the pair ``(updated obj_1, updated obj_2)``."""
        updated_1 = self.tta1(obj_1, obj_2)
        # Track 1 is always computed first; only the message seen by track 2
        # depends on the mode.
        message_for_2 = obj_1 if self.sync else updated_1
        updated_2 = self.tta2(obj_2, message_for_2)
        return updated_1, updated_2
278
+
279
+
280
class LinearFF(nn.Module):
    """Project channel-first features to ``d_out`` and normalise them.

    Args:
        d_in: Number of input channels (size of dimension 1 of the input).
        d_out: Size of the projected feature dimension.
        dropout: Dropout probability applied after the ReLU.
    """

    def __init__(self, d_in, d_out, dropout=0.1) -> None:
        super().__init__()
        self.emb = nn.Linear(d_in, d_out)
        self.norm = nn.LayerNorm(d_out)
        self.dropout = nn.Dropout(dropout)
        self.activation = nn.ReLU()

    def forward(self, f_in):
        """Map a 3-D tensor ``(d0, d_in, d2)`` to ``(d0, d2, d_out)``."""
        # Swap the last two axes so the linear layer acts on the channel axis.
        channels_last = f_in.permute(0, 2, 1)
        projected = self.emb(channels_last)
        activated = self.activation(projected)
        return self.norm(self.dropout(activated))
291
+
292
+
293
class ProteinRNAInteraction(nn.Module):
    """Predict a pairwise protein-residue / RNA-nucleotide probability map.

    Both inputs are embedded with ``LinearFF``, given positional encodings,
    refined by ``n_layers`` ``SymertricTwoTrackAttention`` blocks, and finally
    combined position-by-position with an element-wise product that a linear
    layer and a sigmoid turn into one probability per pair.

    Args:
        d_pro: Channel count of the protein input.
        d_rna: Channel count of the RNA input.
        n_layers: Number of symmetric attention blocks.
        d_attn: Shared feature size after embedding.
        n_head: Attention heads per block.
        d_ff: Feed-forward hidden size inside each block.
        dropout: Dropout for the positional encodings and attention blocks
            (the two ``LinearFF`` embeddings keep their own default).
        sync: Forwarded to every ``SymertricTwoTrackAttention`` block.
    """

    def __init__(self, d_pro, d_rna, n_layers, d_attn, n_head=4, d_ff=512, dropout=0.1, sync=False) -> None:
        super().__init__()
        print('sync update ProteinRNAInteraction',sync)

        # NOTE: despite its name, `pro_rna` embeds the RNA input; the attribute
        # name is kept as-is.
        self.pro_emb = LinearFF(d_pro, d_attn)
        self.pro_rna = LinearFF(d_rna, d_attn)

        self.pro_pos = PositionalEncoding(d_attn, dropout)
        self.rna_pos = PositionalEncoding(d_attn, dropout)

        blocks = []
        for _ in range(n_layers):
            blocks.append(
                SymertricTwoTrackAttention(d_attn, n_head, d_ff, dropout, sync=sync))
        self.layers = nn.ModuleList(blocks)

        self.pred = nn.Linear(d_attn, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, f_pro, f_rna):
        """Return pairwise probabilities; the last axis has size 1.

        The original author annotates the broadcast product as
        ``[B, L, R, D]`` (batch, protein length, RNA length, features).
        """
        # Embed both inputs first, then add positions (same order as before so
        # dropout draws happen in the same sequence during training).
        pro = self.pro_emb(f_pro)
        rna = self.pro_rna(f_rna)

        pro = self.pro_pos(pro)
        rna = self.rna_pos(rna)

        for block in self.layers:
            pro, rna = block(pro, rna)

        # Insert singleton axes so the product broadcasts over every
        # (protein position, RNA position) pair.
        pro_pairs = pro.unsqueeze(2)
        rna_pairs = rna.unsqueeze(1)
        logits = self.pred(rna_pairs.mul(pro_pairs))
        return self.sigmoid(logits)
328
+
329
+
330
+ # f_pro = f_pro.unsqueeze(2) # [1, 299, 1, 128]
331
+ # f_rna = f_rna.unsqueeze(1) # [1, 1, 74, 128]
332
+ # f_pro = f_pro.repeat(1, 1, f_rna.shape[2], 1) # [B, L, R, D]
333
+ # f_rna = f_rna.repeat(1, f_pro.shape[1], 1, 1) # [B, L, R, D]
334
+ #
335
+ # # prob = self.pred(f_rna.mul(f_pro))
336
+ # prob = self.pred(torch.cat([f_pro, f_rna], -1))
337
+ # # print(prob.max(),prob.min(),prob.mean())
338
+ # prob = torch.sigmoid(prob)
339
+ # # prob = self.sigmoid(prob)
340
+ # # prob = self.sigmoid(self.pred(torch.cat([f_pro, f_rna], -1))) # pred : -0.06, 0.619
341
+ # return prob
342
+
RPcontact_pipline.sh ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/bash

# RPcontact pipeline driver.
#
# Creates the working directories under <dirout>/_0_process, writes the
# RNA/protein combination table from the comma-separated input file, submits
# the ERNIE-RNA and ESM2 embedding jobs through SLURM (srun), waits for both,
# and finally runs process_rna_protein.py to obtain the contact maps.
#
# All path expansions are quoted so directories containing spaces or glob
# characters are handled correctly.

# Check the number of arguments
if [ "$#" -ne 4 ]; then
    echo "Usage: $0 <fin_fasta> <dirout> <esm2_env_path> <ernie_rna_env_path>"
    exit 1
fi

fin_fasta=$1
dirout=$2
esm2_env_path=$3
ernie_rna_env_path=$4

# Derived paths
WDIR=$dirout
rna_fasta=$WDIR/_0_process/rna_sequences.fasta
pro_fasta=$WDIR/_0_process/protein_sequences.fasta
fcombinations=$WDIR/_0_process/combinations.csv
finfo=$WDIR/_0_process/info.csv

current_path=$WDIR/_0_process/

# Create the required directories
mkdir -p "$current_path"
mkdir -p "$current_path/ernie_rna_emb"
mkdir -p "$current_path/esm2_emb"
mkdir -p "$current_path/rpcontact"
mkdir -p "$current_path/no_constrained"
mkdir -p "$current_path/constrained"

# Write the combinations file.
# Input columns : rna_id,rna_seq,pro_id,pro_seq,rna_len,pro_len
# Output columns: rna_id.pro_id,rna_seq,pro_seq,rna_len,pro_len
# Splitting with IFS avoids the unquoted `echo $line | cut` pipeline, which
# word-split and glob-expanded the line. `_` absorbs any extra columns.
while IFS=, read -r rna_id rna_seq pro_id pro_seq rna_len pro_len _; do
    echo "$rna_id.$pro_id,$rna_seq,$pro_seq,$rna_len,$pro_len" >> "$fcombinations"
done < "$fin_fasta"

# Print summary information.
# NOTE(review): $rna_fasta and $pro_fasta are not written by this script;
# they are expected to exist already - confirm against the caller.
# In combinations.csv the RNA length is column 4 and the protein length is
# column 5 (the two ids were merged into column 1 above).
echo "Done. RNA sequences are in $rna_fasta, protein sequences are in $pro_fasta, and combinations are in $fcombinations."
echo "RNA count: $(wc -l < "$rna_fasta"), RNA max length: $(awk -F',' '{print $4}' "$fcombinations" | sort -nr | head -n 1), RNA min length: $(awk -F',' '{print $4}' "$fcombinations" | sort -n | head -n 1), total fragments: $(wc -l < "$fcombinations")"
echo "Protein count: $(wc -l < "$pro_fasta"), Protein max length: $(awk -F',' '{print $5}' "$fcombinations" | sort -nr | head -n 1), Protein min length: $(awk -F',' '{print $5}' "$fcombinations" | sort -n | head -n 1), total fragments: $(wc -l < "$fcombinations")"
echo "Sequence length longer than 1000 were truncated and kept head and tail with the length of 1000, sliding 500 as step, 1000 as window"

# ERNIE-RNA embedding
ERNIE_RNA_script="cd /public/home/jiang_jiuhong/soft/ERNIE-RNA/
$ernie_rna_env_path/miniconda3/envs/ERNIE-RNA/bin/python extract_embedding_jh.py --seqs_path='$rna_fasta' --save_path='$current_path/ernie_rna_emb/' --device=cpu"

echo "$ERNIE_RNA_script" > "$current_path/ernie_rna_emb.sh"
chmod +x "$current_path/ernie_rna_emb.sh"

nohup srun -p hebhcnormal01 -c 32 sh "$current_path/ernie_rna_emb.sh" > "$current_path/log_ernie_rna_emb.txt" 2>&1 &

# ESM2 embedding (path arguments are single-quoted inside the generated
# script, matching the ERNIE-RNA script above)
ESM2_script="cd /public/home/jiang_jiuhong/code/esm/
$esm2_env_path/miniconda3/envs/esm2_env/bin/python scripts/extract.py esm2_t48_15B_UR50D '$pro_fasta' '$current_path/esm2_emb/' --repr_layers 48 --include mean per_tok"

echo "$ESM2_script" > "$current_path/esm2_emb.sh"
chmod +x "$current_path/esm2_emb.sh"

nohup srun -p hebhcnormal01 -c 32 sh "$current_path/esm2_emb.sh" > "$current_path/log_esm2_emb.txt" 2>&1 &

# Wait for both embedding jobs to finish
wait

# Run RPcontact to obtain the contact map
python process_rna_protein.py --rna_fasta="$rna_fasta" --pro_fasta="$pro_fasta" --csv="$fcombinations" --WDIR="$WDIR" --out="$dirout"
71
+
app.py ADDED
@@ -0,0 +1,661 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import numpy as np
4
+ import random
5
+ import tempfile
6
+ import os
7
+ import zipfile
8
+ import io
9
+ from Bio import SeqIO
10
+ import torch
11
+ from sklearn.preprocessing import OneHotEncoder
12
+ import plotly.graph_objects as go
13
+
14
+
15
class RPContactPredictor:
    """Load a trained RNA-protein contact model and run predictions on CPU."""

    def __init__(self, model_path='./weight/model_roc_0_56=0.779.pt'):
        """Initialize RNA-protein contact predictor.

        Args:
            model_path: Path to a checkpoint holding the whole pickled model
                object (it is called and ``.eval()``-ed directly).

        SECURITY NOTE: unpickling a full model executes arbitrary code from
        the checkpoint; only load checkpoints from a trusted source.
        """
        device = torch.device('cpu')
        try:
            # A whole pickled module cannot be read in "weights only" mode,
            # which newer PyTorch releases enable by default.
            self.model = torch.load(model_path, map_location=device, weights_only=False)
        except TypeError:
            # Older PyTorch releases do not know the `weights_only` keyword.
            self.model = torch.load(model_path, map_location=device)
        self.model.eval()
        self.seed_everything()

    def seed_everything(self, seed=2022):
        """Set random seed for reproducibility"""
        random.seed(seed)
        os.environ['PYTHONHASHSEED'] = str(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    def one_hot_encode(self, sequences, alpha='ACGU'):
        """One-hot encode a biological sequence.

        Args:
            sequences: Iterable of single-character symbols (e.g. a string).
            alpha: Alphabet the encoder is fitted on.

        Returns:
            A dense array with one row per symbol and one column per alphabet
            symbol. Symbols outside ``alpha`` become all-zero rows because the
            encoder is created with ``handle_unknown='ignore'``.
            NOTE(review): scikit-learn orders the columns by its fitted
            categories, presumably sorted rather than in ``alpha`` order -
            confirm before relying on column positions.
        """
        sequences_array = np.array(list(sequences)).reshape(-1, 1)
        label = np.array(list(alpha)).reshape(-1, 1)
        enc = OneHotEncoder(handle_unknown='ignore')
        enc.fit(label)
        seq_encode = enc.transform(sequences_array).toarray()
        return seq_encode

    def contact_partner_constrained(self, prob_matrix, colmax=12, rowmax=24):
        """Keep only entries that rank highly in both their row and column.

        An entry survives when it is among the ``rowmax`` largest values of
        its row AND among the ``colmax`` largest values of its column; every
        other entry is set to 0.

        Args:
            prob_matrix: 2-D probability matrix.
            colmax: Number of top entries kept per column.
            rowmax: Number of top entries kept per row.

        Returns:
            A matrix of the same shape with non-surviving entries zeroed.
        """
        # Sorting the negated matrix yields indices in descending value order.
        row_max_indices = np.argsort(-prob_matrix, axis=1)[:, :rowmax]
        row_max_mask = np.zeros_like(prob_matrix)
        row_max_mask[np.arange(prob_matrix.shape[0])[:, np.newaxis], row_max_indices] = 1

        col_max_indices = np.argsort(-prob_matrix, axis=0)[:colmax, :]
        col_max_mask = np.zeros_like(prob_matrix)
        col_max_mask[col_max_indices, np.arange(prob_matrix.shape[1])] = 1

        mask = np.logical_and(row_max_mask, col_max_mask).astype(np.float32)
        prob_matrix = np.where(mask == 1, prob_matrix, 0)
        return prob_matrix

    def read_fasta(self, fasta_content):
        """Parse the first record of FASTA text into RNA and protein parts.

        The record id must look like ``<rna_id>.<protein_id>`` and the
        sequence like ``<rna_seq>.<protein_seq>``.

        Args:
            fasta_content: FASTA text (already decoded to ``str``).

        Returns:
            ``{'rna': (rna_id, rna_seq), 'protein': (protein_id, protein_seq)}``
            for the first record, or an empty dict when the text contains no
            record.

        Raises:
            ValueError: If the id or the sequence does not contain exactly one
                ``.`` separator.
        """
        sequences = {}
        # Parse straight from memory: no temporary file, and no dependence on
        # the platform's default file encoding.
        for record in SeqIO.parse(io.StringIO(fasta_content), 'fasta'):
            id_parts = record.id.split('.')
            seq_parts = str(record.seq).split('.')
            if len(id_parts) != 2:
                raise ValueError(
                    f"FASTA id '{record.id}' must have the form '<rna_id>.<protein_id>'")
            if len(seq_parts) != 2:
                raise ValueError(
                    "FASTA sequence must have the form '<rna_seq>.<protein_seq>'")
            sequences = {
                'rna': (id_parts[0], seq_parts[0]),
                'protein': (id_parts[1], seq_parts[1])
            }
            break  # only the first record is used

        return sequences

    def predict_contact(self, rna_seq, protein_seq):
        """Predict the RNA-protein contact matrix.

        Args:
            rna_seq: RNA sequence over ``ACGU``.
            protein_seq: Protein sequence over the 20 standard residues.

        Returns:
            A 2-D array scaled to the range [0, 1] after the contact-partner
            constraint was applied. Callers index rows by RNA position and
            columns by protein position.
        """
        # Encode sequences
        rna_oh = self.one_hot_encode(rna_seq, alpha='ACGU')
        pro_oh = self.one_hot_encode(protein_seq, alpha='GAVLIFWYDNEKQMSTCPHR')

        # Prepare input tensors: add a batch axis, then move the alphabet
        # axis in front of the length axis.
        x_rna = torch.from_numpy(np.expand_dims(rna_oh, 0)).transpose(-1, -2).float()
        x_pro = torch.from_numpy(np.expand_dims(pro_oh, 0)).transpose(-1, -2).float()

        # Run prediction
        with torch.no_grad():
            outputs = self.model(x_pro, x_rna)

        # Drop the trailing singleton axis and swap the two sequence axes.
        outputs = torch.squeeze(outputs, -1).permute(0, 2, 1)
        contact_matrix = outputs[0].cpu().numpy()

        # Apply constraints and min-max normalization (epsilon avoids 0/0)
        contact_matrix = self.contact_partner_constrained(contact_matrix)
        contact_matrix = (contact_matrix - contact_matrix.min()) / (contact_matrix.max() - contact_matrix.min() + 1e-8)

        return contact_matrix
101
+
102
+
103
def create_heatmap(contact_matrix, rna_labels, protein_labels, rna_name, protein_name, Threshold=0.0):
    """Build an interactive Plotly heatmap of the contact matrix.

    Values below ``Threshold`` are displayed as 0; the input matrix itself is
    left untouched.

    Args:
        contact_matrix: 2-D array of probabilities (rows follow
            ``rna_labels``, columns follow ``protein_labels``).
        rna_labels: Tick labels for the y axis.
        protein_labels: Tick labels for the x axis.
        rna_name: RNA name shown in the title and the y-axis caption.
        protein_name: Protein name shown in the title and the x-axis caption.
        Threshold: Minimum value that is still displayed.

    Returns:
        The configured ``plotly.graph_objects.Figure``.
    """
    # Work on a copy so the caller's matrix is not modified.
    filtered_matrix = contact_matrix.copy()
    below_threshold = filtered_matrix < Threshold
    filtered_matrix[below_threshold] = 0

    heat_trace = go.Heatmap(
        z=filtered_matrix,
        x=protein_labels,
        y=rna_labels,
        colorscale='Reds',
        showscale=True,
        colorbar=dict(title="Predicted Probability"),
        hovertemplate='RNA: %{y}<br>Protein: %{x}<br>Probability: %{z:.4f}<extra></extra>'
    )
    fig = go.Figure(data=heat_trace)

    title_config = {
        'text': f"{rna_name} vs {protein_name} (Threshold ≥ {Threshold:.3f})",
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    }
    fig.update_layout(
        title=title_config,
        xaxis_title=f"Protein Residues ({protein_name})",
        yaxis_title=f"RNA Nucleotides ({rna_name})",
        width=800,
        height=600,
        font=dict(size=12)
    )

    return fig
134
+
135
+
136
def get_contact_pairs(contact_matrix, rna_labels, protein_labels, Threshold=0.0):
    """Return the contact pairs at or above a threshold, strongest first.

    A pair is listed when its probability is ``>= Threshold`` and strictly
    positive. The inclusive comparison matches the heatmap, which keeps every
    value that is not below the threshold, so the table and the plot show the
    same contacts. Zero entries are never listed, so ``Threshold=0.0`` still
    returns only real contacts.

    Args:
        contact_matrix: 2-D array of probabilities (rows follow
            ``rna_labels``, columns follow ``protein_labels``).
        rna_labels: Row labels.
        protein_labels: Column labels.
        Threshold: Minimum probability a listed pair must reach.

    Returns:
        A DataFrame with columns ``RNA``, ``Protein`` and ``Probability``,
        sorted by ``Probability`` in descending order.
    """
    df = pd.DataFrame(contact_matrix, index=rna_labels, columns=protein_labels)
    # Long format: one row per (RNA label, protein label) pair.
    df_stacked = df.stack().reset_index()
    df_stacked.columns = ['RNA', 'Protein', 'Probability']

    probability = df_stacked['Probability']
    keep = (probability >= Threshold) & (probability > 0)
    return df_stacked[keep].sort_values('Probability', ascending=False)
143
+
144
+
145
def _safe_filename_part(name):
    """Return ``name`` reduced to characters that are safe inside a file name."""
    import re  # local import: `re` is not among this file's top-level imports

    # Path separators and other unusual characters become '_', so a
    # user-supplied id can never point outside the output directory.
    cleaned = re.sub(r'[^A-Za-z0-9._-]+', '_', str(name)).strip('.')
    return cleaned or 'unnamed'


def create_download_files(contact_matrix, rna_labels, protein_labels, rna_name, protein_name):
    """Write the result CSV files and bundle them into a ZIP archive.

    Two files are created in a fresh temporary directory: the full matrix
    (``*_heatmap.csv``) and the list of non-zero contact pairs
    (``*_contact_pairs.csv``). Both are added to ``*_results.zip``.

    ``rna_name`` and ``protein_name`` may come from an uploaded FASTA header,
    so they are sanitised before being used in file names.

    Args:
        contact_matrix: 2-D array of probabilities.
        rna_labels: Row labels of the matrix.
        protein_labels: Column labels of the matrix.
        rna_name: RNA identifier used in the file names.
        protein_name: Protein identifier used in the file names.

    Returns:
        Path of the ZIP archive. The temporary directory is intentionally
        left in place because the caller serves the archive from it.
    """
    prefix = f"{_safe_filename_part(rna_name)}_{_safe_filename_part(protein_name)}"

    # Create temporary directory
    temp_dir = tempfile.mkdtemp()

    # Save heatmap raw data
    heatmap_df = pd.DataFrame(contact_matrix, index=rna_labels, columns=protein_labels)
    heatmap_file = os.path.join(temp_dir, f"{prefix}_heatmap.csv")
    heatmap_df.to_csv(heatmap_file, index=True)

    # Save contact pairs list (threshold 0.0 keeps every non-zero contact)
    pairs_df = get_contact_pairs(contact_matrix, rna_labels, protein_labels, Threshold=0.0)
    pairs_file = os.path.join(temp_dir, f"{prefix}_contact_pairs.csv")
    pairs_df.to_csv(pairs_file, index=False)

    # Create ZIP file; members are stored under their base names only
    zip_path = os.path.join(temp_dir, f"{prefix}_results.zip")
    with zipfile.ZipFile(zip_path, 'w') as zipf:
        zipf.write(heatmap_file, os.path.basename(heatmap_file))
        zipf.write(pairs_file, os.path.basename(pairs_file))

    return zip_path
167
+
168
+
169
def process_prediction(fasta_file, rna_sequence, protein_sequence, input_method):
    """Process a prediction request and return the initial UI results.

    Args:
        fasta_file: Uploaded FASTA content.
            NOTE(review): ``.decode('utf-8')`` is called on it, so it is
            assumed to be ``bytes`` - confirm against the upload component.
        rna_sequence: RNA sequence typed by the user.
        protein_sequence: Protein sequence typed by the user.
        input_method: ``"Upload FASTA File"`` to use ``fasta_file`` (when one
            is present); any other value uses the typed sequences.

    Returns:
        A 7-tuple ``(status, heatmap, contact_pairs, contact_info,
        download_file, result_state, slider_update)``. On failure ``status``
        holds the error message and every other item is ``None``.
    """
    missing_input = "❌ Please upload a FASTA file or enter RNA and protein sequences"
    if not fasta_file and not (rna_sequence and protein_sequence):
        return missing_input, None, None, None, None, None, None

    try:
        # Process input
        if input_method == "Upload FASTA File" and fasta_file:
            fasta_content = fasta_file.decode('utf-8')
            sequences = predictor.read_fasta(fasta_content)
            if not sequences:
                # read_fasta returns an empty dict when no record is present.
                return ("❌ No sequence record found in the uploaded FASTA file",
                        None, None, None, None, None, None)
        else:
            # Typed input: drop all whitespace (pasted multi-line sequences)
            # and accept lower-case letters.
            rna_text = "".join((rna_sequence or "").split()).upper()
            protein_text = "".join((protein_sequence or "").split()).upper()
            if not (rna_text and protein_text):
                return missing_input, None, None, None, None, None, None
            sequences = {
                'rna': ('RNA', rna_text),
                'protein': ('Protein', protein_text)
            }

        rna_id, rna_seq = sequences['rna']
        protein_id, protein_seq = sequences['protein']

        # Validate sequences
        if len(set(rna_seq) - set('ACGU')) > 0:
            return f"❌ RNA sequence contains invalid characters: {set(rna_seq) - set('ACGU')}",None, None, None, None, None, None
        if len(set(protein_seq) - set('GAVLIFWYDNEKQMSTCPHR')) > 0:
            return f"❌ Protein sequence contains invalid characters: {set(protein_seq) - set('GAVLIFWYDNEKQMSTCPHR')}",None, None, None, None, None, None

        # Run contact prediction
        contact_matrix = predictor.predict_contact(rna_seq, protein_seq)

        # Generate residue labels such as "A1", "C2", ...
        rna_labels = [f'{nt}{i + 1}' for i, nt in enumerate(rna_seq)]
        protein_labels = [f'{aa}{i + 1}' for i, aa in enumerate(protein_seq)]

        # Default threshold: the smallest non-zero value in the matrix
        non_zero_values = contact_matrix[contact_matrix > 0]
        default_threshold = float(np.min(non_zero_values)) if len(non_zero_values) > 0 else 0.0
        max_threshold = float(np.max(contact_matrix))

        # Create initial heatmap with default Threshold
        heatmap = create_heatmap(contact_matrix, rna_labels, protein_labels, rna_id, protein_id, default_threshold)

        # Create initial contact pairs table
        contact_pairs = get_contact_pairs(contact_matrix, rna_labels, protein_labels, default_threshold)

        # Create download file
        download_file = create_download_files(contact_matrix, rna_labels, protein_labels, rna_id, protein_id)

        # Prepare status message
        status = "✅ Prediction completed!\n"
        status += f"RNA length: {len(rna_seq)}\n"
        status += f"Protein length: {len(protein_seq)}\n"
        status += f"Total predicted contacts: {len(contact_pairs)}"

        # State kept so the view can be redrawn when the threshold changes
        result_state = {
            'contact_matrix': contact_matrix,
            'rna_labels': rna_labels,
            'protein_labels': protein_labels,
            'rna_id': rna_id,
            'protein_id': protein_id
        }

        # Update slider configuration. When minimum and maximum coincide
        # (no contact, or a single distinct value) the span is 0, so a small
        # positive step is used instead of an invalid step of 0.
        threshold_span = max_threshold - default_threshold
        slider_step = threshold_span / 100 if threshold_span > 0 else 0.01
        slider_update = gr.update(
            minimum=default_threshold,
            maximum=max_threshold,
            value=default_threshold,
            step=slider_step,
            visible=True
        )

        # Create contact pairs info
        contact_info = f"📊 Found {len(contact_pairs)} contacts (Threshold ≥ {default_threshold:.3f})"

        return status, heatmap, contact_pairs, contact_info, download_file, result_state, slider_update

    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing.
        return f"❌ Prediction failed: {str(e)}", None, None, None, None, None, None
247
+
248
def update_results_with_threshold(Threshold, result_state):
    """Redraw the heatmap and contact table for a new threshold.

    Args:
        Threshold: Threshold selected by the user.
        result_state: State dict stored by the prediction step (keys
            ``contact_matrix``, ``rna_labels``, ``protein_labels``,
            ``rna_id``, ``protein_id``), or ``None`` before any prediction.

    Returns:
        ``(heatmap, contact_pairs, contact_info)``; three ``None`` values when
        there is no stored result.
    """
    if result_state is None:
        return None, None, None

    matrix = result_state['contact_matrix']
    rna_labels = result_state['rna_labels']
    protein_labels = result_state['protein_labels']

    # Updated heatmap
    heatmap = create_heatmap(
        matrix, rna_labels, protein_labels,
        result_state['rna_id'], result_state['protein_id'],
        Threshold,
    )

    # Updated contact pairs table
    contact_pairs = get_contact_pairs(matrix, rna_labels, protein_labels, Threshold)

    # Summary line shown above the table
    pair_count = len(contact_pairs)
    contact_info = f"📊 Found {pair_count} contacts (Probability ≥ {Threshold:.3f})"

    return heatmap, contact_pairs, contact_info
275
+
276
+
277
def reset_threshold(result_state):
    """Reset the threshold slider to the smallest positive value in the matrix.

    Args:
        result_state: Dict stored by the prediction step; only its
            ``'contact_matrix'`` entry is read. ``None`` if no prediction
            has been run yet.

    Returns:
        A ``gr.update(...)`` object for the threshold slider. Without a
        stored result only the value is reset to ``0.0``; otherwise the
        slider range is set to [min positive value, max positive value]
        and the value to the minimum.
    """
    if result_state is None:
        return gr.update(value=0.0)

    contact_matrix = result_state['contact_matrix']
    non_zero_values = contact_matrix[contact_matrix > 0]

    if len(non_zero_values) > 0:
        default_threshold = float(np.min(non_zero_values))
        max_threshold = float(np.max(non_zero_values))
    else:
        # np.max raises ValueError on an empty array, so when the matrix has
        # no positive entry fall back to the slider's initial 0.0-1.0 range
        # instead of crashing the reset button.
        default_threshold = 0.0
        max_threshold = 1.0

    # Return the slider update object.
    return gr.update(
        minimum=default_threshold,
        maximum=max_threshold,
        value=default_threshold,
        interactive=True)
295
+
296
+
297
def load_example_data(fasta_input, rna_input, protein_input):
    """Choose the input-method radio value after an example row is clicked.

    Args:
        fasta_input: Example FASTA value; only tested against ``None``.
        rna_input: Accepted to match the example inputs; not used here.
        protein_input: Accepted to match the example inputs; not used here.

    Returns:
        A ``gr.update`` setting the value to "Upload FASTA File" when a
        FASTA value is present, otherwise "Enter Sequences Directly".
    """
    # A non-None FASTA value selects the upload mode.
    method = "Upload FASTA File" if fasta_input is not None else "Enter Sequences Directly"
    return gr.update(value=method)
303
+ def create_interface():
304
+ """Create Gradio interface with threshold control"""
305
+ custom_css = """
306
+ .gradio-dataframe {
307
+ background: white !important;
308
+ border: 1px solid #e0e0e0;
309
+ border-radius: 8px;
310
+ box-shadow: 0 2px 4px rgba(0,0,0,0.05);
311
+ }
312
+ .dataframe-container {
313
+ padding: 12px;
314
+ background: white;
315
+ border-radius: 8px;
316
+ box-shadow: 0 2px 4px rgba(0,0,0,0.05);
317
+ }
318
+ .contact-info {
319
+ font-size: 14px;
320
+ font-weight: 500;
321
+ margin-bottom: 8px;
322
+ color: #4a5568;
323
+ }
324
+ """
325
+
326
+ with gr.Blocks(title="RNA-Protein Contact Prediction Tool",
327
+ theme=gr.themes.Soft(primary_hue="blue", secondary_hue="teal"),
328
+ css=custom_css) as app:
329
+ gr.Markdown("""
330
+ <center>
331
+
332
+ # 🧬 RPcontact: RNA-Protein Contact Prediction
333
+ **Direct Nucleotide–Residue Contact Prediction from Primary Sequences**
334
+
335
+ [Paper](https://www.biorxiv.org/content/10.1101/2025.06.02.657171v1.full) |
336
+ [Code](https://github.com/rpcontact) |
337
+ [Demo](https://julse-rpcontact.hf.space/)
338
+
339
+ </center>
340
+
341
+
342
+ > RPcontact predicts direct nucleotide-residue contacts between RNA and protein sequences.
343
+ Leveraging **ERNIE-RNA** for RNA and **ESM-2** for protein modeling, the method provides high-resolution insights into RNA-protein interactions at the atomic level.
344
+ <br><br>Current Demo (auROC 0.779 on VL-49) is optimized for limited CPU environments using efficient one-hot encoding<br>
345
+ Advanced Model (auROC 0.845 on VL-49), the Embedding-based approach will be released upon paper publication ([contact us](mailto:[email protected]) for early access)
346
+
347
+ """)
348
+ with gr.Tab("🔬 Contact Prediction"):
349
+ with gr.Row():
350
+ with gr.Column(scale=1):
351
+ gr.Markdown("## ⚙️ Input Options")
352
+ with gr.Group(elem_classes="input-group"):
353
+ input_method = gr.Radio(
354
+ choices=["Upload FASTA File", "Enter Sequences Directly"],
355
+ value="Upload FASTA File",
356
+ label="Input Method"
357
+ )
358
+
359
+ fasta_input = gr.File(
360
+ label="FASTA File",
361
+ file_types=['.fasta', '.fa', '.txt'],
362
+ type='binary'
363
+ )
364
+
365
+ rna_input = gr.Textbox(
366
+ label="RNA Sequence",
367
+ placeholder="Enter RNA sequence (use A,C,G,U)",
368
+ lines=3,
369
+ visible=False
370
+ )
371
+
372
+ protein_input = gr.Textbox(
373
+ label="Protein Sequence",
374
+ placeholder="Enter protein sequence (standard amino acid codes)",
375
+ lines=3,
376
+ visible=False
377
+ )
378
+
379
+ # Example data
380
+ gr.Examples(
381
+ examples=[
382
+ ["./example/inputs/8DMB_W.8DMB_P.fasta", "GGGCCUUAUUAAAUGACUUC", "MDVPRKMETRRNLRRARRYRK"],
383
+ ],
384
+ inputs=[fasta_input, rna_input, protein_input],
385
+ outputs=[input_method],
386
+ label="📋 Example Data (click to load)",
387
+ run_on_click=True,
388
+ fn = load_example_data
389
+ )
390
+
391
+
392
+
393
+ # Submit button at the bottom of input column
394
+ predict_btn = gr.Button("🚀 Run Prediction", variant="primary", size="lg")
395
+
396
+ # Status output
397
+ status_output = gr.Textbox(label="Prediction Status", lines=5)
398
+
399
+
400
+
401
+ with gr.Column(scale=2):
402
+ # Results section - initially hidden
403
+ gr.Markdown("""
404
+ ## 📊 Results
405
+ """)
406
+ # Threshold control section
407
+ with gr.Row():
408
+ threshold_slider = gr.Slider(
409
+ label="Contact Probability Threshold",
410
+ minimum=0.0,
411
+ maximum=1.0,
412
+ value=0.0,
413
+ step=0.001,
414
+ visible=True,
415
+ interactive=True
416
+ )
417
+ reset_btn = gr.Button("Reset to Default", size="sm")
418
+ gr.Markdown("""
419
+ ### 🎯Contact Map
420
+ """)
421
+ # Heatmap display
422
+ heatmap_plot = gr.Plot(label='Contact Map')
423
+
424
+ # Contact pairs table with info header
425
+ gr.Markdown("### 🎯Contact Pairs")
426
+ contact_info = gr.Markdown("", elem_classes="contact-info")
427
+ contact_table = gr.Dataframe(
428
+ headers=["RNA", "Protein", "Probability"],
429
+ datatype=["str", "str", "number"],
430
+ row_count=15,
431
+ interactive=False,
432
+ elem_classes="gradio-dataframe"
433
+ )
434
+
435
+ # Download button
436
+ download_btn = gr.File(
437
+ label="📥 Download Results Package",
438
+ visible=True
439
+ )
440
+
441
+ # User Guide tab remains unchanged
442
+ with gr.Tab("📖 User Guide"):
443
+ # ... (unchanged user guide content) ...
444
+ gr.Markdown("""
445
+ # 📖 Comprehensive User Guide
446
+
447
+ ## 🎯 Overview
448
+
449
+ This tool predicts direct contacts between nucleotides in RNA sequences and residues in protein sequences using a deep learning model based on ERNIE-RNA and ESM-2 embeddings. The tool provides:
450
+
451
+ - **Interactive contact matrix visualization** with adjustable probability thresholds
452
+ - **Detailed contact pairs list** sorted by prediction confidence
453
+ - **Downloadable results** in CSV and ZIP formats
454
+ - **Real-time threshold filtering** for result exploration
455
+
456
+ ## 📋 Input Formats
457
+
458
+ ### Method 1: FASTA File Upload
459
+
460
+ Upload a FASTA file containing both RNA and protein sequences in the following format:
461
+
462
+ ```
463
+ >RNA_ID.PROTEIN_ID
464
+ RNA_SEQUENCE.PROTEIN_SEQUENCE
465
+ ```
466
+
467
+ **Example:**
468
+ ```
469
+ >8DMB_W.8DMB_P
470
+ GGGCCUUAUUAAAUGACUUC.MDVPRKMETRRNLRRARRYRK
471
+ ```
472
+
473
+ ### Method 2: Direct Sequence Input
474
+
475
+ Enter RNA and protein sequences directly in the respective text boxes:
476
+
477
+ - **RNA Sequence**: Use standard nucleotide codes (A, U, G, C)
478
+ - **Protein Sequence**: Use standard single-letter amino acid codes (GAVLIFWYDNEKQMSTCPHR)
479
+
480
+ ## 🔬 Understanding Results
481
+
482
+ ### Contact Heatmap
483
+
484
+ - **X-axis**: Protein residue positions (e.g., M1, D2, V3...)
485
+ - **Y-axis**: RNA nucleotide positions (e.g., G1, G2, G3...)
486
+ - **Color Intensity**: Contact probability (0.0 to 1.0)
487
+ - **Red Colors**: Higher contact probability
488
+ - **White/Light**: Lower or no contact probability
489
+
490
+ ### Contact Pairs Table
491
+
492
+ Lists all predicted contacts above the selected threshold, showing:
493
+ - **RNA**: Nucleotide position and type
494
+ - **Protein**: Residue position and type
495
+ - **Probability**: Contact prediction confidence (0.0-1.0)
496
+
497
+ ### Threshold Control
498
+
499
+ Use the **Contact Probability Threshold** slider to:
500
+ - Filter contacts by minimum probability
501
+ - Focus on high-confidence predictions
502
+ - Explore different confidence levels
503
+ - Click **"Reset to Default"** to return to the minimum non-zero value
504
+
505
+ ## 📥 Download Options
506
+
507
+ The results package (ZIP file) contains:
508
+
509
+ 1. **`*_heatmap.csv`**: Complete contact probability matrix
510
+ - Rows: RNA nucleotides
511
+ - Columns: Protein residues
512
+ - Values: Contact probabilities
513
+
514
+ 2. **`*_contact_pairs.csv`**: All contact pairs above zero probability
515
+ - RNA: Nucleotide identifier
516
+ - Protein: Residue identifier
517
+ - Probability: Contact prediction score
518
+
519
+ ## ⚡ Performance Guidelines
520
+
521
+ - **Processing Time**: Scales quadratically with sequence length
522
+
523
+ ### Quality Considerations
524
+ - Higher probabilities indicate more confident predictions
525
+ - Consider biological context when interpreting results
526
+ - Cross-validate important contacts with experimental data
527
+
528
+ ## 🔧 Troubleshooting
529
+
530
+ ### Common Issues
531
+
532
+ **Invalid Characters Error:**
533
+ - RNA: Only A, U, G, C are allowed
534
+ - Protein: Only standard 20 amino acids are supported
535
+ - Check for lowercase letters, numbers, or special characters
536
+
537
+ **File Format Error:**
538
+ - Ensure FASTA format: `>ID\\nSEQUENCE`
539
+ - Use period (.) to separate RNA and protein sequences
540
+ - Check file encoding (UTF-8 recommended)
541
+
542
+ **Empty Results:**
543
+ - Very short sequences may produce no significant contacts
544
+ - Try lowering the probability threshold
545
+ - Verify sequence quality and biological relevance
546
+
547
+ ## 📊 Interpretation Guidelines
548
+
549
+ ### High-Confidence Predictions (≥0.7)
550
+ - Strong likelihood of direct contact
551
+ - Priority targets for experimental validation
552
+ - Suitable for structural modeling constraints
553
+
554
+ ### Medium-Confidence Predictions (0.3-0.7)
555
+ - Moderate likelihood of interaction
556
+ - Consider in context with other evidence
557
+ - Useful for identifying interaction regions
558
+
559
+ ### Low-Confidence Predictions (<0.3)
560
+ - May represent weak or indirect interactions
561
+ - Use with caution for biological interpretation
562
+ - Good for exploratory analysis
563
+
564
+ ## 🔬 Technical Details
565
+
566
+ ### Model Architecture
567
+ - Based on attention mechanisms and transformer models
568
+ - Trained on experimentally validated RNA-protein complexes
569
+ - Uses one-hot encoding for sequence representation
570
+ - Applies contact partner constraints for biological realism
571
+
572
+ ### Validation Metrics
573
+ - Cross-validated on diverse RNA-protein complex datasets
574
+ - Performance metrics available in the original publication
575
+ - Benchmarked against existing prediction methods
576
+
577
+ ### 📊 Difference between current demo and final model
578
+ | Model Type | Checkpoint File | auROC (VL-49) | LLM embeddings |
579
+ |---------------------|---------------------------|---------------|-------------------|
580
+ | OH + RP_Emb (final) | `model_roc_0_38=0.845.pt` | 0.845 | ✓ |
581
+ | OH (demo) | `model_roc_0_56=0.779.pt` | 0.779 | ✗ |
582
+
583
+ ## 📚 Citation & Contact
584
+
585
+ If you use this tool in your research, please cite:
586
+
587
+ **Jiang, J., Zhang, X., Zhan, J., Miao, Z., & Zhou, Y. (2025). RPcontact: Improved prediction of RNA-protein contacts using RNA and protein language models. bioRxiv, 2025-06.**
588
+
589
+ ### Contact Information
590
+ For technical issues, feature requests, or collaboration inquiries, please contact the development team.
591
+
592
+ - **Primary Contact**: Jiuhong Jiang
593
+ - **Email**: [email protected]
594
+ - **Institution**: ShanghaiTech University, Shanghai, China
595
+ ---
596
+
597
+ <p align="center"><em>Making RNA-protein interaction prediction accessible and accurate for the research community.</em></p>
598
+
599
+ """)
600
+
601
+ # Hidden state to store prediction results
602
+ result_state = gr.State()
603
+
604
+ # Event handlers
605
def toggle_inputs(method):
    """Show the FASTA uploader or the two sequence text boxes.

    Returns three ``gr.update`` objects in the order
    (fasta_input, rna_input, protein_input).
    """
    use_fasta = method == "Upload FASTA File"
    show_text = not use_fasta
    return (
        gr.update(visible=use_fasta),
        gr.update(visible=show_text),
        gr.update(visible=show_text),
    )
611
+
612
+ # Input method change
613
+ input_method.change(
614
+ fn=toggle_inputs,
615
+ inputs=[input_method],
616
+ outputs=[fasta_input, rna_input, protein_input]
617
+ )
618
+
619
+ # Prediction button
620
+ predict_btn.click(
621
+ fn=process_prediction,
622
+ inputs=[fasta_input, rna_input, protein_input, input_method],
623
+ outputs=[
624
+ status_output,
625
+ heatmap_plot,
626
+ contact_table,
627
+ contact_info,
628
+ download_btn,
629
+ result_state,
630
+ threshold_slider
631
+ ]
632
+ )
633
+
634
+ # Threshold slider change
635
+ threshold_slider.change(
636
+ fn=update_results_with_threshold,
637
+ inputs=[threshold_slider, result_state],
638
+ outputs=[heatmap_plot, contact_table, contact_info]
639
+ )
640
+
641
+ # Reset button
642
+ reset_btn.click(
643
+ fn=reset_threshold,
644
+ inputs=[result_state],
645
+ outputs=[threshold_slider]
646
+ )
647
+
648
+ return app
649
+
650
+
651
# Initialize predictor
# NOTE(review): this sits outside the __main__ guard, so the predictor is
# constructed whenever the module is imported, not only when run as a script.
predictor = RPContactPredictor()

if __name__ == "__main__":
    # Build the Gradio Blocks app and serve it.
    app = create_interface()
    app.launch(
        server_name="0.0.0.0",  # presumably binds all interfaces (needed in containers) - confirm
        server_port=7860,
        share=False,
        debug=True
    )
benchmark/.DS_Store ADDED
Binary file (6.15 kB). View file
 
benchmark/readme.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ The results predicted by all methods on TS_nt will be available for download after the paper is accepted by a journal.
evaluate.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ # Created by: [email protected]
4
+ # des : evaluate RPcontact
5
+ import glob
6
+ import os
7
+ import pickle
8
+ import random
9
+ from argparse import ArgumentParser
10
+ import matplotlib.pyplot as plt
11
+ import pandas as pd
12
+
13
+ import torch
14
+ from Bio import SeqIO
15
+ from sklearn.preprocessing import OneHotEncoder
16
+ import numpy as np
17
+
18
+ from predict import check_path, one_hot_encode, get_bin_pred, doSavePredict
19
+
20
+
21
def get_bin_label(df_label,distance_cutoff):
    """Binarize a distance table: 1 where the value is below the cutoff, else 0.

    Entries that compare False against the cutoff (including NaN) become 0.
    """
    return (df_label < distance_cutoff).astype(int)
25
+
26
def view_evaluate_contact_prob(df_label, bin_pred,ax=None,markersize=5):
    """Scatter-plot predicted contacts against ground truth at 8, 5 and 3.5 cutoffs.

    Args:
        df_label: 2-D table of label values compared against the cutoffs
            8, 5 and 3.5; ``.T.isna()`` is called on it, so it is presumably
            a pandas DataFrame of distances - TODO confirm.
        bin_pred: Binary prediction with the same shape as ``df_label``
            (assumed 0/1 integers so that ``label + pred == 2`` marks a
            true positive - verify against ``get_bin_pred``).
        ax: Axes to draw on; when ``None`` the ``pyplot`` module itself is
            used as the drawing target.
        markersize: Marker size passed to every ``plot`` call.

    Returns:
        Tuple ``(tp_summary, confusing_matrix)`` where ``tp_summary`` is the
        three true-positive counts joined by ``/`` in the order
        3.5 / 5 / 8, and ``confusing_matrix`` holds 1 for label < 8 and
        2 for cells counted as true positive at any cutoff.
    """
    confusing_matrix = np.zeros_like(df_label)
    r, p = confusing_matrix.shape
    if ax is None:
        # Fall back to the pyplot state machine (module-level xlim/ylim).
        ax = plt
        ax.xlim([-2, p + 2])
        ax.ylim([-2, r + 2])
        # plt.xticks(rotation=90)
    else:

        ax.set_xlim([-2, p + 2])
        ax.set_ylim([-2, r + 2])
        # plt.setp(ax.get_xticklabels(), rotation=90)
        ax.set_title('performance')

    colors = [
        '#f5e0c4',  # lightblue for FP
        # '#aaa6ce','#66609c','k',# light purple, dark purple,black, for Ground truth
        '#b0d9db','#61b3b6','k',# light purple, dark purple,black, for Ground truth
        '#ecbbd8','#9d4e7d','r' # for TP

    ]
    # True-positive counts, appended in the order 8, 5, 3.5.
    tps = []
    bin_label = df_label<8
    # pred - label == 1 means predicted contact without a label below 8.
    temp = bin_pred - bin_label
    # np.where on the transpose yields (column, row) so columns run along x.
    fn = ax.plot(*np.where(temp.T == 1), ".", c=colors[0], markersize=markersize,label='False Positive')[0]
    # Draw the NaN-valued data points in gray.
    oc = ax.plot(*np.where(df_label.T.isna()), ".", c='gray', markersize=markersize, label='Missing in PDB')[0]
    confusing_matrix[bin_label == 1] = 1 #ground truth
    oc = ax.plot(*np.where(bin_label.T == 1), ".", c=colors[1],markersize=markersize, label='Ground truth (8Å)')[0]
    # label + pred == 2 means both label and prediction are positive.
    temp = bin_label + bin_pred
    tps.append(len(confusing_matrix[np.where(temp == 2)]))
    confusing_matrix[np.where(temp == 2)] = 2 # TP : blue
    tp = ax.plot(*np.where(temp.T == 2), "o", c=colors[4],markersize=markersize, label='True Positive (8Å)')[0]
    tp.set_markerfacecolor(colors[1])
    tp.set_markeredgecolor(colors[4])

    # Same overlay for the 5 cutoff, drawn on top of the 8 layer.
    bin_label = df_label<5
    temp = bin_label + bin_pred
    tps.append(len(confusing_matrix[np.where(temp == 2)]))

    oc = ax.plot(*np.where(bin_label.T == 1), ".", c=colors[2],markersize=markersize, label='Ground truth (5Å)')[0]
    confusing_matrix[np.where(temp == 2)] = 2 # TP : blue
    tp = ax.plot(*np.where(temp.T == 2), "o", c=colors[5],markersize=markersize, label='True Positive (5Å)')[0]
    tp.set_markerfacecolor(colors[2])
    tp.set_markeredgecolor(colors[5])
    # Same overlay for the 3.5 cutoff, drawn last.
    bin_label = df_label<3.5
    oc = ax.plot(*np.where(bin_label.T == 1), ".", c=colors[3],markersize=markersize, label='Ground truth (3.5Å)')[0]
    temp = bin_label + bin_pred
    tps.append(len(confusing_matrix[np.where(temp == 2)]))

    confusing_matrix[np.where(temp == 2)] = 2 # TP : blue
    tp = ax.plot(*np.where(temp.T == 2), "o", c=colors[6],markersize=markersize, label='True Positive (3.5Å)')[0]
    tp.set_markerfacecolor(colors[3])
    tp.set_markeredgecolor(colors[6])

    # ax.legend()
    # plt.show()
    # tp = len(confusing_matrix[np.where(temp == 2)])
    # Prints the true-positive count for the last (3.5) cutoff.
    print(len(confusing_matrix[np.where(temp == 2)]))
    # Reverse so the summary reads 3.5 / 5 / 8.
    return '/'.join([str(e) for e in tps[::-1]]),confusing_matrix
87
def seed_everything(seed=2022):
    """Seed Python, NumPy and PyTorch RNGs and set the cuDNN determinism flags.

    Args:
        seed: Integer seed applied to ``random``, ``PYTHONHASHSEED``,
            ``numpy.random``, ``torch`` and ``torch.cuda``.
    """
    print('seed_everything to ',seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Results are reproducible across program runs, but successive random
    # draws within one run still differ from each other.
    # https://blog.csdn.net/qq_42951560/article/details/112174334
    for apply_seed in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    # Mini-batch lengths keep changing, so the benchmark optimization would
    # mostly waste time.
    cudnn.benchmark = False
96
+
97
+
98
def _str2bool(value):
    """Convert a command-line token such as 'True'/'false'/'1'/'no' to a bool."""
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    # The empty string stays False, as it was under the previous type=bool.
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    # argparse reports a ValueError from a type callable as a usage error.
    raise ValueError(f'expected a boolean value, got {value!r}')


def getParam():
    """Parse the command-line options of the evaluation script.

    Returns:
        The ``argparse`` namespace with ``rootdir``, ``fasta``, ``out``,
        ``ffeat``, ``fmodel``, ``device``, ``flabel`` and ``draw``.
    """
    parser = ArgumentParser()
    # data
    parser.add_argument('--rootdir', default='',
                        type=str)
    parser.add_argument('--fasta', default='./example/inputs/8DMB_W.8DMB_P.fasta',
                        type=str)
    parser.add_argument('--out', default='./example/outputs/',
                        type=str)
    parser.add_argument('--ffeat', default='./example/inputs/{pdbid}.pickle',
                        type=str)
    parser.add_argument('--fmodel', default='./weight/model_roc_0_38=0.845.pt',
                        type=str)
    parser.add_argument('--device', default='cpu',
                        type=str)
    parser.add_argument('--flabel', default='./example/inputs/{pdbid}.pickle',
                        type=str)
    # type=bool would turn every non-empty string (including "False") into
    # True, so the flag could never be switched off from the command line.
    parser.add_argument('--draw', default=True,
                        type=_str2bool)
    args = parser.parse_args()
    return args
119
+ if __name__ == '__main__':
120
+ args = getParam()
121
+ rootdir = args.rootdir
122
+ fasta = args.fasta
123
+ ffeat = args.ffeat
124
+ fmodel = args.fmodel
125
+ device = args.device
126
+ flabel = args.flabel
127
+ draw = args.draw
128
+ out = args.out
129
+ check_path(out)
130
+
131
+ # pdbid = fasta.rsplit('/',1)[0].split('.')[0]
132
+ seed_everything(seed=2022)
133
+ models = [(model_path,torch.load(model_path, map_location=torch.device(device))) for model_path in glob.glob(fmodel)]
134
+ print('loading existed model', fmodel)
135
+ with torch.no_grad():
136
+ for pdbid,seq in [(record.id,record.seq) for record in SeqIO.parse(fasta,'fasta')]:
137
+ rnaid,proid= pdbid.split('.')
138
+ rnaseq,proseq= seq.split('.')
139
+
140
+ with open(ffeat.format_map({'pdbid':rnaid}),'rb') as f:
141
+ rna_emb = pickle.load(f)
142
+ with open(ffeat.format_map({'pdbid':proid}),'rb') as f:
143
+ pro_emb = pickle.load(f)
144
+
145
+ rna_oh = one_hot_encode(rnaseq, alpha='ACGU')
146
+ pro_oh = one_hot_encode(proseq, alpha='GAVLIFWYDNEKQMSTCPHR')
147
+
148
+ # mask = np.ones((emb.shape[0],1)) # mask missing nt when evaluate the model
149
+ x_train = np.concatenate([rna_oh,rna_emb],axis=1)
150
+ x_train = np.expand_dims(x_train,0)
151
+ x_train = torch.from_numpy(x_train).transpose(-1,-2)
152
+ x_train = x_train.to(device, dtype=torch.float)
153
+ x_rna = x_train
154
+
155
+ x_train = np.concatenate([pro_oh, pro_emb], axis=1)
156
+ x_train = np.expand_dims(x_train, 0)
157
+ x_train = torch.from_numpy(x_train).transpose(-1, -2)
158
+ x_train = x_train.to(device, dtype=torch.float)
159
+ x_pro = x_train
160
+
161
+ print('input data shape for rna and protein:',x_rna.shape,x_pro.shape)
162
+
163
+ x_rna = x_rna.to(device, dtype=torch.float32)
164
+ x_pro = x_pro.to(device, dtype=torch.float32)
165
+ plt.figure(figsize=(20, 15))
166
+ for i,(model_path,model) in enumerate(models):
167
+ model.eval()
168
+ outputs = model(x_pro, x_rna) # [1, 299, 74, 1]
169
+ # print('outputs,',outputs.device)
170
+ outputs = torch.squeeze(outputs, -1)
171
+ outputs = outputs.permute(0, 2, 1)
172
+
173
+ df_pred = outputs[0].cpu().detach().numpy()
174
+ # seq = data._seq[pdbid] if pdbid in data._seq else None
175
+ des = f'predict by {__file__}\n#{model_path}'
176
+ doSavePredict(pdbid, {'rna':rnaseq,'protein':proseq}, df_pred,
177
+ out,
178
+ des
179
+ )
180
+ top = sum(df_pred.shape)
181
+ df_pred = pd.DataFrame(df_pred)
182
+ threshold = df_pred.stack().nlargest(top).iloc[-1]
183
+ if draw:
184
+ with open(flabel.format_map({'pdbid': pdbid}), 'rb') as f:
185
+ df_label = pickle.load(f)
186
+ df_label = df_label.squeeze()
187
+ bin_pred = get_bin_pred(df_pred, threshold=threshold)
188
+ view_evaluate_contact_prob(df_label, bin_pred, ax=None)
189
+ plt.title(f'Predicted contact map of {pdbid}\nPredidcted by RPcontact, top L=r+p')
190
+ plt.xlabel(proid)
191
+ plt.ylabel(rnaid)
192
+ handles, labels = plt.gca().get_legend_handles_labels()
193
+
194
+ plt.legend(handles, labels, bbox_to_anchor=(1.05, 1), loc='upper left', ncol=1, borderaxespad=1,
195
+ frameon=False)
196
+ # 设置坐标轴的相同缩放
197
+ ax = plt.gca()
198
+ ax.set_aspect('equal')
199
+ plt.tight_layout()
200
+ plt.savefig(f'{out}/{pdbid}_{i}_evaluate.png',dpi=900)
201
+ plt.show()
202
+ print(f'predict {pdbid} with {len(seq)} nts')
203
+
204
+
205
+
206
+
207
+
208
+
209
+
example/inputs/8DMB_W.8DMB_P.fasta ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ >8DMB_W.8DMB_P
2
+ GGGCCUUAUUAAAUGACUUCUCGUCAACCACCCCUGACUGAAGUCAGAGGCUUGCUUCUGGCCUGAGUUGGGGGCCCGGUUUGGCGGGGCCGGGGGCAACUGGCUGACCAGGCGGCCCGGUUCGCCGGGCAGGGGUCCGCGGGGCUACCAAGGACUUCCGGGUGUUUCGCCAGCCCGGACUAUCUCCGGCAGAACCGCUCAAUGCCGCGGCCGGCCAAGACCGGCCUAAGCCCUGCGGACAGCGCCGAGGCGACAAUCACUCCGAAAGGAGGCCGUGUAUCGGC.MGSSHHHHHHSSGLVPRGSHMASWSHPQFEKGGGSGGGSGGSAWSHPQFEKMSDSEVNQEAKPEVKPEVKPETHINLKVSDGSSEIFFKIKKTTPLRRLMEAFAKRQGKEMDSLRFLYDGIRIQADQTPEDLDMEDNDIIEAHREQIGGSMSTSITRVPVVGVDGRPLMPTTPRKARLLIRDGLAVPRRNKLGLFYIQMLRPVGTRTQPVALAVDPGAKYDGVAVASHRRVELRAMVFLPDDVPRKMETRRNLRRARRYRKTPRRPARFDNRRRKGYWLAPTQRFKVEARLKVVRELCRIYPVQLIVTEDVRFNHARDRNGKYFSTVEIGKTLTYREYRKLAELRLVEVSETDAWRERFGLEKRTERKCEQVPETHANDAAAMLMGVTGCAHNPAAPFFVWRRLRYARRSLFRQNPQKDGVRPRFGGTANGGFFRKGDWVEAEKAGKVYRGWVCGLPTETTKLVGVADADGKRIGQFSPKKVRLLARSTGFSWKEVAAHSSPEVSKGEELFTGVVPILVELDGDVNGHKFSVRGEGEGDATNGKLTLKFICTTGKLPVPWPTLVTTLTYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTISFKDDGTYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNFNSHNVYITADKQKNGIKANFKIRHNVEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSKLSKDPNEKRDHMVLLEFVTAAGITLGMDELYK
example/inputs/readme.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ sequence of RNA and protein: 8DMB_W.8DMB_P.fasta
2
+
3
+ rna embedding from ERNIE-RNA: 8DMB_W.pickle
4
+ protein embedding from esm2: 8DMB_P.pickle
5
+
6
+ Label is needed in evaluate mode: 8DMB_W.8DMB_P.pickle
example/outputs/8DMB_W.8DMB_P.txt ADDED
The diff for this file is too large to render. See raw diff
 
example/outputs/8DMB_W.8DMB_P_0_binary.png ADDED
example/outputs/8DMB_W.8DMB_P_0_evaluate.png ADDED

Git LFS Details

  • SHA256: 5bce3945fc152fd81839e3225dddd0aff0f2f4a794affc96a4e66c2fa36ff194
  • Pointer size: 132 Bytes
  • Size of remote file: 1.97 MB
example/outputs/8DMB_W.8DMB_P_0_prob.png ADDED

Git LFS Details

  • SHA256: 554f9f4913f0f951289d573f8c83e0522786928125949e6146c4514054848bb5
  • Pointer size: 131 Bytes
  • Size of remote file: 819 kB
example/outputs/8DMB_W.8DMB_P_topL.txt ADDED
@@ -0,0 +1,1026 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ rna protein pred
2
+ U35 R17 1.00000
3
+ A37 R17 0.99522
4
+ A8 R17 0.98054
5
+ U7 R17 0.97008
6
+ U39 R17 0.95614
7
+ C34 R17 0.95496
8
+ C33 R17 0.94333
9
+ U9 R17 0.94329
10
+ A27 R17 0.94328
11
+ G36 R17 0.94095
12
+ U6 R17 0.94029
13
+ G1 R17 0.93609
14
+ C38 R17 0.93235
15
+ A26 R17 0.91772
16
+ C32 R17 0.91758
17
+ A30 R17 0.91663
18
+ G2 R17 0.90895
19
+ U10 R17 0.90053
20
+ C28 R17 0.89455
21
+ C31 R17 0.89061
22
+ A41 R17 0.88997
23
+ U44 R17 0.88463
24
+ C29 R17 0.88146
25
+ G3 R17 0.87936
26
+ A42 R17 0.87429
27
+ A16 R17 0.87417
28
+ C5 R17 0.87406
29
+ A11 R17 0.87328
30
+ U14 R17 0.87179
31
+ G40 R17 0.87162
32
+ C4 R17 0.87014
33
+ A13 R17 0.86685
34
+ A12 R17 0.86145
35
+ U18 R17 0.85383
36
+ U19 R17 0.84186
37
+ A46 R17 0.84029
38
+ C45 R17 0.83834
39
+ G15 R17 0.83245
40
+ G43 R17 0.83169
41
+ C17 R17 0.82836
42
+ C25 R17 0.82813
43
+ U24 R17 0.81132
44
+ U82 R17 0.80720
45
+ U21 R17 0.79809
46
+ G72 R17 0.79460
47
+ U35 R299 0.79412
48
+ C20 R17 0.79060
49
+ U81 R17 0.79039
50
+ U35 H5 0.78897
51
+ G71 R17 0.78883
52
+ A37 R299 0.78653
53
+ G73 R17 0.78649
54
+ U35 K31 0.78543
55
+ A37 H5 0.78424
56
+ A8 R299 0.78149
57
+ A37 K31 0.77866
58
+ G83 R17 0.77861
59
+ U35 R295 0.77697
60
+ A8 K31 0.77684
61
+ U64 R17 0.77534
62
+ G74 R17 0.77356
63
+ U35 H6 0.77221
64
+ U7 R299 0.77158
65
+ U80 R17 0.77144
66
+ U69 R17 0.77060
67
+ A8 H5 0.76959
68
+ G70 R17 0.76760
69
+ A37 R295 0.76733
70
+ A37 H6 0.76665
71
+ C75 R17 0.76648
72
+ A66 R17 0.76631
73
+ U7 K31 0.76610
74
+ U121 R17 0.76137
75
+ G84 R17 0.76003
76
+ A8 R295 0.75984
77
+ U35 R274 0.75935
78
+ U7 H5 0.75871
79
+ C90 R17 0.75856
80
+ C76 R17 0.75854
81
+ U35 R424 0.75682
82
+ U35 R422 0.75666
83
+ C284 R17 0.75530
84
+ C34 R299 0.75520
85
+ C22 R17 0.75505
86
+ A37 R274 0.75487
87
+ C91 R17 0.75449
88
+ C34 H5 0.75313
89
+ U35 H10 0.75301
90
+ U9 R299 0.75242
91
+ U68 R17 0.75214
92
+ G23 R17 0.75203
93
+ C77 R17 0.75059
94
+ A37 R424 0.75047
95
+ G47 R17 0.75025
96
+ U35 H7 0.75020
97
+ U7 R295 0.74986
98
+ A8 H6 0.74975
99
+ U39 H5 0.74969
100
+ A37 H10 0.74899
101
+ U9 K31 0.74829
102
+ C34 K31 0.74809
103
+ A37 R422 0.74766
104
+ U35 H9 0.74712
105
+ G89 R17 0.74704
106
+ G36 R299 0.74701
107
+ U122 R17 0.74670
108
+ U6 R299 0.74564
109
+ A37 H7 0.74483
110
+ U35 R268 0.74468
111
+ A27 R299 0.74466
112
+ U39 R299 0.74453
113
+ G65 R17 0.74445
114
+ A27 H5 0.74397
115
+ U276 R17 0.74369
116
+ A37 H9 0.74357
117
+ C33 R299 0.74245
118
+ G36 H5 0.74204
119
+ C33 H5 0.74197
120
+ A259 R17 0.74120
121
+ U6 K31 0.74112
122
+ U35 H8 0.74110
123
+ U39 K31 0.74108
124
+ G92 R17 0.74087
125
+ U9 H5 0.74076
126
+ G120 R17 0.74047
127
+ A8 R424 0.74030
128
+ A266 R17 0.74023
129
+ G78 R17 0.73972
130
+ G36 K31 0.73956
131
+ C85 R17 0.73934
132
+ U7 H6 0.73909
133
+ A37 R268 0.73878
134
+ A8 R274 0.73852
135
+ C34 R295 0.73834
136
+ A48 R17 0.73822
137
+ G79 R17 0.73819
138
+ A8 R422 0.73817
139
+ A27 K31 0.73719
140
+ A37 H8 0.73675
141
+ A267 R17 0.73649
142
+ C34 H6 0.73613
143
+ U35 R413 0.73563
144
+ U35 R157 0.73516
145
+ G1 K31 0.73493
146
+ A8 H10 0.73452
147
+ C33 K31 0.73437
148
+ U35 R273 0.73416
149
+ U9 R295 0.73407
150
+ U35 R174 0.73368
151
+ C38 H5 0.73352
152
+ U6 H5 0.73301
153
+ U35 R718 0.73299
154
+ U39 H6 0.73272
155
+ U257 R17 0.73268
156
+ G1 R299 0.73203
157
+ U278 R17 0.73174
158
+ C63 R17 0.73152
159
+ U35 R181 0.73144
160
+ U7 R424 0.73140
161
+ A37 R413 0.73104
162
+ C38 R299 0.73102
163
+ G1 H5 0.73062
164
+ G119 R17 0.73057
165
+ U35 R166 0.73039
166
+ U53 R17 0.73035
167
+ A248 R17 0.73015
168
+ G36 R295 0.72986
169
+ G88 R17 0.72984
170
+ A265 R17 0.72977
171
+ U7 R274 0.72945
172
+ A37 R273 0.72934
173
+ U39 R295 0.72919
174
+ G93 R17 0.72905
175
+ U6 R295 0.72784
176
+ U7 R422 0.72771
177
+ A27 R295 0.72754
178
+ A37 R157 0.72727
179
+ A37 R174 0.72716
180
+ U165 R17 0.72699
181
+ U7 H10 0.72676
182
+ C38 K31 0.72650
183
+ A27 H6 0.72631
184
+ A8 H9 0.72628
185
+ G36 H6 0.72625
186
+ A8 H7 0.72621
187
+ G127 R17 0.72597
188
+ A37 R181 0.72553
189
+ A26 R299 0.72531
190
+ U35 R435 0.72526
191
+ A37 R718 0.72524
192
+ C33 R295 0.72510
193
+ C33 H6 0.72443
194
+ G67 R17 0.72392
195
+ U56 R17 0.72388
196
+ A37 R166 0.72374
197
+ U52 R17 0.72372
198
+ U9 H6 0.72342
199
+ C258 R17 0.72304
200
+ A26 H5 0.72242
201
+ A8 R413 0.72192
202
+ G128 R17 0.72185
203
+ G283 R17 0.72176
204
+ C34 R274 0.72168
205
+ A256 R17 0.72132
206
+ A30 R299 0.72115
207
+ A8 R268 0.72099
208
+ G86 R17 0.72079
209
+ C32 R299 0.72024
210
+ G94 R17 0.72003
211
+ C32 H5 0.71998
212
+ C34 H10 0.71962
213
+ A30 H5 0.71913
214
+ U57 R17 0.71885
215
+ C34 R424 0.71867
216
+ U7 H9 0.71865
217
+ U39 R274 0.71854
218
+ G87 R17 0.71831
219
+ C126 R17 0.71809
220
+ A8 H8 0.71788
221
+ A279 R17 0.71784
222
+ U10 R299 0.71742
223
+ C118 R17 0.71739
224
+ G277 R17 0.71732
225
+ A26 K31 0.71674
226
+ C34 R422 0.71666
227
+ U7 H7 0.71660
228
+ U39 H10 0.71650
229
+ C38 H6 0.71631
230
+ G275 R17 0.71624
231
+ U6 H6 0.71602
232
+ C274 R17 0.71602
233
+ A37 R435 0.71587
234
+ A8 R157 0.71582
235
+ G1 R295 0.71576
236
+ A270 R17 0.71541
237
+ A8 R174 0.71528
238
+ U35 R450 0.71475
239
+ G1 H6 0.71464
240
+ C34 H7 0.71439
241
+ U9 R422 0.71433
242
+ U166 R17 0.71415
243
+ U9 R424 0.71394
244
+ G36 R274 0.71380
245
+ U10 K31 0.71368
246
+ U35 R284 0.71344
247
+ A8 R718 0.71313
248
+ C38 R295 0.71305
249
+ C34 H9 0.71296
250
+ A8 R273 0.71281
251
+ G2 K31 0.71278
252
+ U7 R268 0.71255
253
+ A30 K31 0.71245
254
+ G95 R17 0.71245
255
+ U7 R413 0.71243
256
+ C273 R17 0.71210
257
+ C32 K31 0.71152
258
+ U39 H7 0.71128
259
+ U39 R424 0.71127
260
+ C123 R17 0.71125
261
+ A8 R181 0.71124
262
+ G36 R424 0.71123
263
+ U39 H9 0.71091
264
+ G36 R422 0.71068
265
+ U9 R274 0.71061
266
+ C33 H10 0.71047
267
+ A8 R166 0.71034
268
+ C33 R274 0.71021
269
+ A37 R450 0.71017
270
+ G2 R299 0.70961
271
+ U6 R424 0.70957
272
+ A30 R295 0.70955
273
+ U7 H8 0.70951
274
+ A27 R274 0.70919
275
+ A255 R17 0.70913
276
+ A27 H10 0.70895
277
+ A27 R424 0.70860
278
+ C34 R268 0.70840
279
+ G2 H5 0.70803
280
+ G36 H10 0.70787
281
+ U35 R272 0.70781
282
+ A154 R17 0.70779
283
+ C62 R17 0.70761
284
+ C55 R17 0.70729
285
+ C33 R424 0.70694
286
+ U10 H5 0.70665
287
+ U6 R422 0.70664
288
+ A26 R295 0.70652
289
+ U7 R157 0.70640
290
+ G54 R17 0.70638
291
+ A8 R435 0.70636
292
+ U7 R174 0.70636
293
+ A98 R17 0.70630
294
+ U6 R274 0.70621
295
+ U39 R422 0.70617
296
+ C125 R17 0.70606
297
+ C34 H8 0.70602
298
+ C28 H5 0.70588
299
+ A30 H6 0.70581
300
+ G36 H7 0.70579
301
+ C28 R299 0.70566
302
+ U35 R264 0.70560
303
+ A37 R284 0.70521
304
+ U35 R290 0.70492
305
+ U9 H10 0.70487
306
+ C117 R17 0.70451
307
+ A26 H6 0.70419
308
+ A27 H7 0.70414
309
+ U7 R273 0.70408
310
+ U156 R17 0.70408
311
+ C260 R17 0.70401
312
+ U6 H10 0.70393
313
+ C32 R295 0.70392
314
+ U39 R268 0.70387
315
+ U157 R17 0.70382
316
+ U35 R177 0.70363
317
+ U39 H8 0.70363
318
+ C33 H9 0.70360
319
+ U59 R17 0.70351
320
+ A27 R422 0.70344
321
+ C32 H6 0.70343
322
+ A99 R17 0.70337
323
+ U261 R17 0.70331
324
+ G96 R17 0.70329
325
+ U7 R718 0.70318
326
+ G129 R17 0.70318
327
+ G36 H9 0.70296
328
+ C33 H7 0.70292
329
+ G272 R17 0.70285
330
+ C38 R274 0.70272
331
+ A27 H9 0.70270
332
+ A110 R17 0.70270
333
+ U35 R265 0.70257
334
+ C33 R422 0.70249
335
+ U10 R295 0.70240
336
+ U7 R166 0.70156
337
+ U9 H7 0.70150
338
+ U7 R181 0.70127
339
+ G124 R17 0.70124
340
+ A37 R272 0.70119
341
+ G1 H10 0.70118
342
+ C38 H10 0.70072
343
+ C28 K31 0.70046
344
+ U280 R17 0.70043
345
+ G247 R17 0.69989
346
+ G268 R17 0.69960
347
+ C31 R299 0.69928
348
+ C34 R413 0.69913
349
+ U101 R17 0.69905
350
+ G36 R268 0.69892
351
+ U9 H9 0.69863
352
+ A37 R264 0.69861
353
+ C34 R174 0.69840
354
+ C31 H5 0.69836
355
+ A8 R450 0.69811
356
+ U35 R533 0.69801
357
+ C34 R157 0.69801
358
+ G1 R274 0.69794
359
+ U6 H9 0.69771
360
+ C33 R268 0.69748
361
+ G1 R424 0.69745
362
+ G36 H8 0.69745
363
+ A37 R290 0.69737
364
+ U39 R413 0.69721
365
+ U7 R435 0.69717
366
+ U203 R17 0.69700
367
+ G2 R295 0.69683
368
+ U35 R599 0.69674
369
+ C38 R424 0.69672
370
+ A16 R299 0.69634
371
+ C34 R273 0.69631
372
+ A37 R265 0.69628
373
+ C116 R17 0.69622
374
+ A41 H5 0.69614
375
+ U6 H7 0.69608
376
+ A27 H8 0.69574
377
+ A253 R17 0.69571
378
+ C34 R181 0.69564
379
+ A27 R268 0.69559
380
+ U163 R17 0.69558
381
+ C33 H8 0.69556
382
+ U9 R413 0.69533
383
+ C38 H7 0.69524
384
+ G1 H9 0.69524
385
+ C38 H9 0.69513
386
+ G271 R17 0.69472
387
+ C29 R299 0.69470
388
+ C97 R17 0.69462
389
+ A37 R177 0.69460
390
+ G164 R17 0.69453
391
+ G1 H7 0.69448
392
+ A16 K31 0.69425
393
+ C29 H5 0.69401
394
+ U35 R576 0.69376
395
+ G2 H6 0.69373
396
+ C34 R718 0.69339
397
+ U9 R268 0.69330
398
+ A11 R299 0.69328
399
+ G249 R17 0.69323
400
+ G282 R17 0.69297
401
+ U44 R299 0.69288
402
+ C34 R166 0.69284
403
+ A37 R533 0.69265
404
+ C28 R295 0.69257
405
+ U9 H8 0.69256
406
+ G1 R422 0.69252
407
+ A41 K31 0.69252
408
+ C38 R422 0.69207
409
+ A41 R299 0.69201
410
+ U10 H6 0.69183
411
+ U39 R174 0.69180
412
+ U44 H5 0.69169
413
+ U39 R273 0.69149
414
+ C28 H6 0.69129
415
+ A30 R424 0.69123
416
+ G36 R413 0.69115
417
+ A27 R413 0.69098
418
+ C31 K31 0.69096
419
+ A16 H5 0.69090
420
+ C58 R17 0.69089
421
+ G269 R17 0.69089
422
+ A202 R17 0.69080
423
+ G210 R17 0.69071
424
+ U9 R157 0.69063
425
+ U39 R181 0.69056
426
+ G3 K31 0.69052
427
+ U39 R157 0.69052
428
+ A37 R599 0.69052
429
+ C32 H10 0.69043
430
+ G36 R273 0.69041
431
+ A30 R274 0.69039
432
+ C51 R17 0.69028
433
+ A37 R576 0.69024
434
+ C5 R299 0.69021
435
+ G36 R157 0.69014
436
+ U6 R268 0.69006
437
+ U6 R413 0.69005
438
+ A8 R284 0.68991
439
+ A27 R174 0.68987
440
+ A26 R424 0.68980
441
+ U6 H8 0.68978
442
+ A11 K31 0.68977
443
+ U7 R450 0.68949
444
+ C5 K31 0.68938
445
+ C32 R274 0.68936
446
+ U44 K31 0.68920
447
+ A30 R422 0.68918
448
+ U167 R17 0.68915
449
+ A26 R274 0.68915
450
+ A26 H10 0.68915
451
+ C29 K31 0.68896
452
+ G36 R718 0.68890
453
+ G115 R17 0.68861
454
+ U9 R718 0.68859
455
+ U9 R174 0.68858
456
+ C211 R17 0.68839
457
+ G36 R174 0.68823
458
+ C33 R174 0.68819
459
+ A30 H10 0.68810
460
+ G3 R299 0.68804
461
+ C33 R413 0.68792
462
+ C38 H8 0.68789
463
+ A30 H7 0.68781
464
+ G1 H8 0.68766
465
+ C38 R268 0.68764
466
+ A27 R157 0.68756
467
+ A27 R181 0.68730
468
+ U14 R299 0.68723
469
+ C32 R424 0.68715
470
+ C34 R435 0.68684
471
+ G36 R181 0.68663
472
+ U9 R273 0.68643
473
+ A8 R290 0.68614
474
+ G61 R17 0.68601
475
+ G36 R166 0.68600
476
+ U6 R157 0.68559
477
+ C31 R295 0.68552
478
+ C4 K31 0.68542
479
+ U14 K31 0.68540
480
+ A30 H9 0.68539
481
+ U9 R181 0.68539
482
+ C33 R157 0.68533
483
+ U9 R166 0.68525
484
+ C246 R17 0.68509
485
+ U14 H5 0.68505
486
+ C100 R17 0.68500
487
+ A8 R272 0.68497
488
+ A41 R295 0.68487
489
+ C32 H9 0.68474
490
+ G209 R17 0.68466
491
+ U10 R422 0.68462
492
+ U39 R166 0.68459
493
+ A11 H5 0.68454
494
+ C33 R181 0.68449
495
+ U6 R174 0.68435
496
+ A16 R295 0.68432
497
+ C33 R273 0.68407
498
+ G3 H5 0.68397
499
+ C4 R299 0.68391
500
+ C31 H6 0.68389
501
+ A8 R177 0.68387
502
+ G264 R17 0.68386
503
+ C29 R295 0.68381
504
+ G49 R17 0.68377
505
+ G114 R17 0.68368
506
+ A42 H5 0.68356
507
+ A41 H6 0.68353
508
+ U6 R718 0.68344
509
+ C155 R17 0.68343
510
+ A27 R273 0.68340
511
+ C32 H7 0.68340
512
+ U10 R424 0.68327
513
+ A26 R422 0.68317
514
+ A131 R17 0.68312
515
+ A13 R299 0.68294
516
+ U39 R718 0.68293
517
+ A26 H9 0.68273
518
+ G1 R413 0.68258
519
+ U9 R435 0.68258
520
+ A26 H7 0.68249
521
+ C109 R17 0.68244
522
+ G111 R17 0.68244
523
+ U18 R299 0.68243
524
+ G40 H5 0.68211
525
+ A42 R299 0.68204
526
+ U6 R273 0.68203
527
+ A27 R166 0.68196
528
+ A42 K31 0.68186
529
+ C32 R422 0.68184
530
+ G1 R268 0.68184
531
+ C113 R17 0.68161
532
+ C281 R17 0.68156
533
+ C130 R17 0.68150
534
+ C29 H6 0.68121
535
+ A151 R17 0.68109
536
+ U6 R166 0.68093
537
+ C33 R166 0.68088
538
+ G36 R435 0.68084
539
+ A12 R299 0.68083
540
+ C5 H5 0.68079
541
+ C38 R413 0.68073
542
+ U7 R284 0.68070
543
+ A30 H8 0.68064
544
+ A8 R599 0.68041
545
+ A13 K31 0.68027
546
+ C254 R17 0.68000
547
+ G250 R17 0.67975
548
+ A11 R295 0.67963
549
+ A13 H5 0.67963
550
+ C33 R718 0.67962
551
+ G112 R17 0.67936
552
+ G2 H10 0.67927
553
+ A147 R17 0.67891
554
+ C34 R450 0.67874
555
+ G60 R17 0.67872
556
+ C158 R17 0.67852
557
+ U44 R295 0.67847
558
+ U6 R181 0.67847
559
+ G3 R295 0.67834
560
+ A27 R718 0.67816
561
+ A8 R264 0.67801
562
+ U10 R274 0.67791
563
+ C5 R295 0.67782
564
+ G50 R17 0.67772
565
+ C212 R17 0.67760
566
+ A12 K31 0.67741
567
+ C38 R273 0.67737
568
+ C4 H5 0.67735
569
+ G1 R157 0.67714
570
+ G2 R424 0.67700
571
+ C32 H8 0.67696
572
+ U6 R435 0.67696
573
+ G2 R274 0.67693
574
+ A30 R268 0.67690
575
+ G40 R299 0.67683
576
+ C32 R268 0.67674
577
+ G40 K31 0.67667
578
+ A16 H6 0.67658
579
+ U7 R290 0.67644
580
+ C38 R174 0.67631
581
+ U7 R272 0.67612
582
+ A8 R576 0.67598
583
+ G153 R17 0.67578
584
+ A26 R268 0.67576
585
+ A27 R435 0.67568
586
+ A241 R17 0.67566
587
+ A8 R533 0.67551
588
+ U35 H26 0.67546
589
+ C38 R157 0.67523
590
+ U39 R435 0.67522
591
+ C38 R181 0.67516
592
+ U44 H6 0.67511
593
+ A26 H8 0.67503
594
+ U18 K31 0.67495
595
+ C262 R17 0.67485
596
+ A42 R295 0.67477
597
+ G1 R174 0.67476
598
+ C263 R17 0.67471
599
+ A12 H5 0.67470
600
+ G2 H7 0.67467
601
+ U7 R177 0.67463
602
+ C4 R295 0.67445
603
+ C34 R284 0.67445
604
+ U39 R450 0.67443
605
+ A8 R265 0.67413
606
+ G2 H9 0.67411
607
+ G2 R422 0.67399
608
+ A150 R17 0.67399
609
+ G1 R181 0.67342
610
+ G204 R17 0.67340
611
+ C251 R17 0.67340
612
+ U18 H5 0.67339
613
+ U14 R295 0.67328
614
+ C208 R17 0.67320
615
+ C33 R435 0.67320
616
+ U35 R144 0.67314
617
+ C28 R274 0.67247
618
+ C28 R424 0.67245
619
+ C28 H10 0.67242
620
+ U10 H7 0.67222
621
+ G1 R273 0.67215
622
+ U35 R408 0.67201
623
+ U7 R599 0.67188
624
+ U10 H10 0.67184
625
+ G3 H6 0.67159
626
+ G36 R450 0.67154
627
+ C28 H7 0.67151
628
+ A30 R413 0.67145
629
+ U9 R450 0.67142
630
+ A42 H6 0.67117
631
+ A26 R413 0.67110
632
+ A30 R174 0.67108
633
+ C38 R166 0.67077
634
+ A26 R174 0.67071
635
+ A11 H6 0.67061
636
+ A172 R17 0.67031
637
+ G36 R284 0.67024
638
+ C28 R422 0.67012
639
+ U19 R299 0.67007
640
+ A30 R157 0.67004
641
+ C34 R272 0.66993
642
+ C159 R17 0.66975
643
+ C31 H10 0.66944
644
+ C38 R718 0.66940
645
+ C31 R274 0.66912
646
+ U14 H6 0.66911
647
+ A37 H26 0.66909
648
+ G1 R166 0.66893
649
+ A239 R17 0.66881
650
+ C32 R174 0.66873
651
+ C31 R424 0.66870
652
+ A13 R295 0.66843
653
+ C34 R264 0.66840
654
+ G40 H6 0.66825
655
+ G1 R718 0.66813
656
+ C32 R413 0.66808
657
+ C28 H9 0.66805
658
+ G252 R17 0.66799
659
+ U7 R264 0.66794
660
+ U6 R450 0.66794
661
+ U10 H9 0.66791
662
+ G102 R17 0.66783
663
+ A41 R274 0.66766
664
+ G2 H8 0.66764
665
+ C108 R17 0.66753
666
+ U9 R284 0.66742
667
+ C33 R450 0.66738
668
+ C5 H6 0.66737
669
+ A26 R157 0.66737
670
+ A26 R181 0.66724
671
+ U35 R260 0.66722
672
+ C34 R177 0.66718
673
+ U7 R576 0.66711
674
+ G152 R17 0.66688
675
+ A12 R295 0.66683
676
+ A30 R718 0.66682
677
+ A37 R144 0.66674
678
+ A30 R181 0.66670
679
+ G40 R295 0.66669
680
+ A30 R273 0.66622
681
+ C34 R290 0.66614
682
+ A201 R17 0.66610
683
+ U18 R295 0.66599
684
+ A41 H10 0.66599
685
+ C205 R17 0.66598
686
+ A30 R166 0.66594
687
+ U39 R284 0.66575
688
+ A27 R450 0.66575
689
+ C31 H7 0.66568
690
+ A41 H7 0.66565
691
+ U7 R533 0.66562
692
+ C31 H9 0.66559
693
+ C4 H6 0.66539
694
+ C34 R265 0.66529
695
+ G36 R272 0.66527
696
+ U146 R17 0.66521
697
+ C32 R157 0.66502
698
+ U7 R265 0.66495
699
+ G213 R17 0.66471
700
+ C31 R422 0.66471
701
+ A16 R424 0.66458
702
+ U9 R290 0.66427
703
+ C32 R181 0.66427
704
+ A26 R273 0.66408
705
+ U10 R413 0.66408
706
+ U44 R274 0.66407
707
+ C29 R424 0.66405
708
+ A13 H6 0.66398
709
+ A37 R408 0.66393
710
+ U136 R17 0.66392
711
+ C32 R273 0.66378
712
+ A26 R166 0.66378
713
+ U10 H8 0.66363
714
+ G36 R290 0.66362
715
+ A41 R424 0.66359
716
+ U39 R264 0.66337
717
+ A16 R422 0.66336
718
+ C29 H7 0.66321
719
+ C29 R274 0.66318
720
+ C28 H8 0.66312
721
+ U44 H10 0.66311
722
+ C29 R422 0.66298
723
+ A41 H9 0.66277
724
+ G36 R264 0.66277
725
+ C34 R533 0.66259
726
+ A11 R424 0.66254
727
+ G2 R413 0.66253
728
+ A11 R422 0.66248
729
+ U39 R272 0.66246
730
+ C245 R17 0.66218
731
+ C29 H10 0.66217
732
+ C17 R299 0.66211
733
+ U6 R284 0.66208
734
+ C38 R435 0.66200
735
+ U19 H5 0.66194
736
+ G1 R435 0.66178
737
+ A41 R422 0.66169
738
+ C32 R166 0.66146
739
+ G2 R268 0.66144
740
+ A107 R17 0.66142
741
+ U10 R268 0.66129
742
+ C34 R599 0.66113
743
+ U39 R265 0.66110
744
+ U19 K31 0.66089
745
+ U9 R272 0.66067
746
+ U10 R157 0.66053
747
+ G36 R265 0.66031
748
+ C33 R284 0.66028
749
+ U9 R177 0.66024
750
+ A8 H26 0.66023
751
+ A37 R260 0.66022
752
+ A12 H6 0.66019
753
+ A30 R435 0.66009
754
+ C38 R450 0.65997
755
+ G36 R177 0.65988
756
+ U10 R718 0.65977
757
+ G207 R17 0.65968
758
+ C32 R718 0.65964
759
+ C206 R17 0.65939
760
+ C148 R17 0.65939
761
+ C31 H8 0.65939
762
+ C29 H9 0.65931
763
+ C28 R268 0.65927
764
+ G1 R450 0.65916
765
+ A26 R718 0.65907
766
+ G15 R299 0.65903
767
+ C17 K31 0.65898
768
+ C5 R424 0.65887
769
+ C34 R576 0.65852
770
+ U39 R290 0.65848
771
+ U39 R533 0.65839
772
+ G15 K31 0.65838
773
+ A46 R299 0.65825
774
+ A41 H8 0.65818
775
+ A27 R264 0.65806
776
+ U18 H6 0.65796
777
+ C5 R422 0.65788
778
+ U10 R174 0.65772
779
+ G2 R157 0.65769
780
+ U44 R424 0.65765
781
+ G3 R424 0.65760
782
+ A16 H7 0.65755
783
+ A16 H10 0.65754
784
+ A27 R284 0.65749
785
+ C174 R17 0.65749
786
+ U35 R409 0.65746
787
+ U44 H9 0.65707
788
+ G160 R17 0.65706
789
+ A27 R177 0.65697
790
+ C175 R17 0.65696
791
+ A27 R290 0.65689
792
+ G3 R422 0.65686
793
+ U6 R290 0.65676
794
+ A26 R435 0.65669
795
+ C45 R299 0.65668
796
+ G162 R17 0.65665
797
+ U9 R599 0.65652
798
+ A42 R274 0.65652
799
+ C33 R272 0.65644
800
+ U39 R177 0.65639
801
+ U14 R424 0.65636
802
+ C31 R268 0.65632
803
+ U39 R576 0.65632
804
+ C45 H5 0.65622
805
+ A27 R272 0.65619
806
+ G3 H10 0.65619
807
+ G214 R17 0.65616
808
+ U10 R166 0.65613
809
+ G15 H5 0.65608
810
+ U9 R264 0.65608
811
+ A11 R274 0.65595
812
+ A16 R274 0.65588
813
+ A46 H5 0.65585
814
+ G3 R274 0.65566
815
+ U44 H7 0.65552
816
+ C240 R17 0.65546
817
+ U6 R272 0.65538
818
+ C33 R264 0.65535
819
+ C29 H8 0.65533
820
+ A8 R408 0.65530
821
+ C28 R413 0.65528
822
+ C17 H5 0.65528
823
+ U10 R273 0.65519
824
+ A41 R268 0.65517
825
+ C25 R299 0.65515
826
+ U35 R257 0.65510
827
+ G2 R174 0.65507
828
+ C149 R17 0.65506
829
+ U35 R671 0.65501
830
+ G36 R533 0.65498
831
+ A42 H10 0.65485
832
+ C5 R274 0.65471
833
+ U10 R181 0.65456
834
+ C33 R177 0.65449
835
+ G36 R599 0.65442
836
+ A27 R265 0.65438
837
+ C4 R422 0.65432
838
+ C4 R424 0.65426
839
+ G3 H7 0.65421
840
+ U6 R177 0.65419
841
+ A42 H7 0.65410
842
+ U39 R599 0.65404
843
+ C5 H10 0.65399
844
+ C45 K31 0.65391
845
+ C28 R174 0.65389
846
+ A46 K31 0.65387
847
+ U35 R405 0.65386
848
+ A16 H9 0.65380
849
+ C28 R157 0.65370
850
+ A217 R17 0.65351
851
+ U10 R435 0.65346
852
+ A42 R424 0.65343
853
+ C32 R435 0.65326
854
+ U14 H10 0.65315
855
+ A13 R424 0.65313
856
+ U44 R422 0.65311
857
+ G173 R17 0.65295
858
+ G2 R181 0.65277
859
+ C33 R265 0.65272
860
+ C176 R17 0.65265
861
+ U9 R265 0.65256
862
+ G40 R274 0.65249
863
+ U6 R599 0.65240
864
+ A11 H7 0.65238
865
+ G3 H9 0.65230
866
+ G36 R576 0.65211
867
+ C38 R284 0.65193
868
+ A42 H9 0.65187
869
+ U19 R295 0.65175
870
+ G40 H10 0.65173
871
+ G2 R273 0.65171
872
+ U7 H26 0.65167
873
+ C28 R181 0.65162
874
+ U35 R258 0.65156
875
+ A42 R422 0.65156
876
+ A11 H10 0.65154
877
+ U44 R268 0.65151
878
+ G161 R17 0.65149
879
+ A12 R424 0.65138
880
+ C33 R290 0.65125
881
+ U14 R422 0.65119
882
+ C33 R533 0.65083
883
+ A41 R413 0.65081
884
+ A27 R599 0.65072
885
+ A27 R533 0.65067
886
+ U9 R533 0.65060
887
+ G43 R299 0.65053
888
+ C4 R274 0.65052
889
+ A37 R671 0.65044
890
+ C4 H10 0.65034
891
+ C25 H5 0.65029
892
+ C5 H7 0.65024
893
+ A8 R144 0.65020
894
+ A27 R576 0.65012
895
+ C33 R599 0.65007
896
+ C215 R17 0.65000
897
+ C29 R268 0.65000
898
+ G43 H5 0.64999
899
+ G2 R718 0.64996
900
+ C5 H9 0.64987
901
+ A37 R409 0.64987
902
+ C31 R174 0.64975
903
+ U14 R274 0.64973
904
+ G1 R284 0.64959
905
+ U9 R576 0.64950
906
+ C38 R272 0.64949
907
+ G40 H7 0.64948
908
+ U14 H7 0.64942
909
+ C31 R413 0.64942
910
+ G132 R17 0.64941
911
+ A16 H8 0.64938
912
+ C168 R17 0.64936
913
+ U44 H8 0.64931
914
+ G43 K31 0.64920
915
+ G2 R166 0.64906
916
+ A37 R405 0.64895
917
+ A30 R450 0.64893
918
+ U18 R424 0.64892
919
+ C17 R295 0.64892
920
+ C4 H7 0.64882
921
+ G103 R17 0.64877
922
+ A11 H9 0.64876
923
+ C28 R273 0.64859
924
+ A218 R17 0.64858
925
+ A12 R422 0.64858
926
+ U14 H9 0.64854
927
+ A13 H10 0.64838
928
+ C38 R264 0.64828
929
+ A220 R17 0.64820
930
+ G135 R17 0.64810
931
+ A13 R422 0.64809
932
+ U35 R312 0.64803
933
+ U6 R264 0.64798
934
+ C25 K31 0.64791
935
+ G134 R17 0.64790
936
+ C137 R17 0.64778
937
+ A41 R157 0.64758
938
+ G40 H9 0.64754
939
+ C32 R450 0.64751
940
+ A30 R284 0.64749
941
+ C33 R576 0.64731
942
+ A42 H8 0.64723
943
+ G3 H8 0.64722
944
+ C31 R157 0.64721
945
+ C28 R166 0.64721
946
+ A26 R450 0.64713
947
+ U18 R422 0.64710
948
+ G133 R17 0.64706
949
+ C4 H9 0.64697
950
+ G1 R290 0.64695
951
+ G15 R295 0.64691
952
+ G40 R424 0.64685
953
+ A13 R274 0.64667
954
+ A37 R257 0.64648
955
+ U44 R413 0.64647
956
+ U6 R265 0.64646
957
+ A16 R268 0.64613
958
+ C38 R265 0.64600
959
+ C29 R413 0.64588
960
+ U6 R576 0.64563
961
+ U7 R408 0.64554
962
+ C38 R290 0.64553
963
+ A16 R413 0.64546
964
+ U19 H6 0.64539
965
+ G242 R17 0.64531
966
+ C31 R181 0.64518
967
+ C29 R157 0.64517
968
+ U35 R122 0.64506
969
+ C28 R718 0.64506
970
+ A13 H7 0.64495
971
+ C31 R273 0.64495
972
+ C29 R174 0.64478
973
+ A8 R260 0.64476
974
+ A11 H8 0.64469
975
+ A12 R274 0.64462
976
+ A41 R174 0.64454
977
+ A42 R268 0.64451
978
+ A13 H9 0.64444
979
+ G1 R272 0.64435
980
+ U6 R533 0.64431
981
+ G244 R17 0.64429
982
+ U35 K555 0.64417
983
+ C5 H8 0.64410
984
+ A16 R157 0.64403
985
+ C138 R17 0.64398
986
+ A46 R295 0.64386
987
+ C31 R166 0.64376
988
+ G40 R422 0.64359
989
+ G2 R435 0.64338
990
+ A41 R273 0.64334
991
+ C38 R533 0.64333
992
+ A12 H10 0.64318
993
+ A11 R413 0.64316
994
+ C38 R177 0.64302
995
+ C31 R718 0.64275
996
+ C243 R17 0.64273
997
+ C28 R435 0.64270
998
+ A30 R264 0.64235
999
+ G1 R264 0.64234
1000
+ U14 H8 0.64227
1001
+ A16 R174 0.64222
1002
+ C4 H8 0.64221
1003
+ U24 R299 0.64218
1004
+ A41 R181 0.64217
1005
+ G1 R576 0.64217
1006
+ A37 R258 0.64214
1007
+ A8 R671 0.64211
1008
+ G40 H8 0.64207
1009
+ A12 H7 0.64204
1010
+ G3 R413 0.64204
1011
+ C29 R181 0.64174
1012
+ A30 R265 0.64171
1013
+ A37 R312 0.64164
1014
+ G15 H6 0.64163
1015
+ U18 R274 0.64158
1016
+ C38 R576 0.64152
1017
+ C17 H6 0.64149
1018
+ C34 H26 0.64146
1019
+ C216 R17 0.64135
1020
+ U18 H10 0.64126
1021
+ G1 R177 0.64123
1022
+ C45 R295 0.64115
1023
+ G1 R599 0.64105
1024
+ U44 R157 0.64093
1025
+ U7 R144 0.64088
1026
+ G177 R17 0.64087
example/outputs/predict_scores.csv ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pdbid contact_score
2
+ 8DMB_W.8DMB_P 0.29469
3
+ pdbid contact_score
4
+ 8DMB_W.8DMB_P 0.29469
5
+ pdbid contact_score
6
+ 8DMB_W.8DMB_P 712.49723
7
+ pdbid contact_score
8
+ 8DMB_W.8DMB_P 712.49723
9
+ pdbid contact_score
10
+ 8DMB_W.8DMB_P 712.49723
11
+ pdbid contact_score
12
+ 8DMB_W.8DMB_P 712.49723
13
+ pdbid contact_score
14
+ 8DMB_W.8DMB_P 6.98915
15
+ pdbid contact_score
16
+ 8DMB_W.8DMB_P 6.98915
17
+ pdbid contact_score
18
+ 8DMB_W.8DMB_P 6.98915
19
+ pdbid contact_score
20
+ 8DMB_W.8DMB_P 712.49723
predict.py ADDED
@@ -0,0 +1,318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ # Created by: [email protected]
4
+ # des : evaluate RPcontact
5
+ import glob
6
+ import pickle
7
+ import random
8
+ import re
9
+ from argparse import ArgumentParser
10
+ import matplotlib.pyplot as plt
11
+
12
+ import torch
13
+ from Bio import SeqIO
14
+ from sklearn.preprocessing import OneHotEncoder
15
+ import numpy as np
16
+ import os
17
+ import pandas as pd
18
class bcolors:
    """ANSI escape sequences for colouring and styling terminal output."""

    # Bold foreground colours.
    RED = "\033[1;31m"
    BLUE = "\033[1;34m"
    CYAN = "\033[1;36m"
    # Regular-weight green.
    GREEN = "\033[0;32m"
    # Attribute switches: reset, bold, reverse video.
    RESET = "\033[0;0m"
    BOLD = "\033[;1m"
    REVERSE = "\033[;7m"
26
+
27
+
28
def check_path(dirout,file=False):
    """Make sure an output directory exists, creating missing parents.

    Args:
        dirout: Directory to create, or a file path when ``file`` is true.
        file: When true, ``dirout`` names a file and only its parent
            directory is created.

    Filesystem errors are reported on stdout rather than raised, which keeps
    the function's original never-raises behaviour.
    """
    if file:
        # dirname() yields '' for a bare file name; the previous
        # rsplit('/', 1)[0] returned the file name itself, so a directory
        # was created where the file was supposed to go.
        dirout = os.path.dirname(dirout)
    if not dirout:
        # Nothing to create: the target lives in the current directory.
        return
    if os.path.isdir(dirout):
        return
    print('make dir '+dirout)
    try:
        # exist_ok=True makes a concurrent creation by another process
        # harmless without hiding unrelated failures.
        os.makedirs(dirout, exist_ok=True)
    except OSError as err:
        print(f'could not create {dirout}: {err}')
36
+
37
def load_label_pred(fin_label,fin_pred):
    """Load a pickled label object and the matching prediction table.

    Args:
        fin_label: Path to a pickle file holding the label object. It must
            support ``.squeeze()``.
        fin_pred: Path to a tab-separated prediction table. Lines starting
            with ``#`` are skipped and the first column is used as the index.

    Returns:
        ``(df_label, df_pred)``. When the label is a DataFrame, rows and
        columns that are entirely NaN are dropped from it and the prediction
        table is restricted to the remaining positions. Prediction labels are
        rewritten as ``<token><1-based position>``.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only open label
    # files that come from a trusted source.
    with open(fin_label, 'rb') as f:
        df_label = pickle.load(f)
    df_label = df_label.squeeze()
    df_pred = pd.read_table(fin_pred, comment='#', index_col=[0])
    # isinstance also accepts DataFrame subclasses, unlike type(...) == ...
    if isinstance(df_label, pd.DataFrame):
        df_pred.index = df_label.index
        df_pred.columns = df_label.columns
        # Drop rows that contain only missing values.
        df_label = df_label.dropna(how='all')

        # Drop columns that contain only missing values.
        df_label = df_label.dropna(axis=1, how='all')
        df_pred = df_pred.loc[df_label.index, df_label.columns]
    # Labels with exactly two dots keep their last dot-separated field,
    # all other labels keep their first one.
    keep = -1 if df_pred.columns[0].count('.')==2 else 0
    df_pred.columns = [e.split('.')[keep] + str(i+1) for i, e in enumerate(df_pred.columns)]
    df_pred.index = [e.split('.')[keep] + str(i+1) for i, e in enumerate(df_pred.index)]
    return df_label,df_pred
57
def doSavePredict(_id,seq,predict,fout,des):
    """Write a predicted contact matrix and its top-L ranking to ``fout``.

    Args:
        _id: Identifier used as the output file stem.
        seq: Mapping with ``'rna'`` and ``'protein'`` sequences, e.g.
            ``{'protein': 'KKGVG...', 'rna': 'GGGGCC...'}``. When falsy, only
            the unlabelled matrix is written.
        predict: 2-D score matrix accepted by ``pd.DataFrame``. When ``seq``
            is given, rows are labelled by RNA residues and columns by
            protein residues.
        fout: Output directory; created if missing.
        des: Free-text description stored in the first comment line.

    Outputs:
        ``<fout>/<_id>.txt``: tab-separated matrix with 5-decimal floats.
        ``<fout>/<_id>_topL.txt``: the ``rows + columns`` highest-scoring
        pairs. Only written when ``seq`` is given.
    """
    check_path(fout)
    df = pd.DataFrame(predict)
    # os.path.join keeps the files inside fout even without a trailing slash.
    matrix_path = os.path.join(fout, f'{_id}.txt')
    if not seq:
        df.to_csv(matrix_path,sep='\t',mode='w',float_format='%.5f')
        # Without sequences there are no residue labels for a top-L table.
        return

    df.columns = list(seq['protein'])
    df.index = list(seq['rna'])
    with open(matrix_path,'w') as f:
        f.write(f'#{des}\n')
        f.write(f"# row =rna:{seq['rna']}\n")
        f.write(f"# col=protein:{seq['protein']}\n")
    # Append only after the header file is closed, so the buffered comment
    # lines are always on disk before the matrix.
    df.to_csv(matrix_path,sep='\t',mode='a',float_format='%.5f')

    # Relabel with residue letter + 1-based position for the ranked list.
    df.columns = [f'{elem}{index+1}' for index,elem in enumerate(seq['protein'])]
    df.index = [f'{elem}{index+1}' for index,elem in enumerate(seq['rna'])]
    df = get_top_l_triplets(df, sum(df.shape))
    df.to_csv(os.path.join(fout, f'{_id}_topL.txt'),sep='\t',mode='w',float_format='%.5f',index=False)
77
+
78
+
79
+
80
def get_top_l_triplets(df_pred, L):
    """Return the ``L`` highest-scoring cells of a score matrix in long form.

    Args:
        df_pred: pandas DataFrame whose index labels the rows and whose
            columns label the columns of the score matrix.
        L: int, number of cells to keep.

    Returns:
        DataFrame with columns ``rna`` (row label), ``protein`` (column
        label) and ``pred`` (score), sorted by descending score and cut to
        the first ``L`` rows.
    """
    long_form = df_pred.stack().reset_index()
    long_form.columns = ['rna', 'protein', 'pred']
    ranked = long_form.sort_values(by='pred', ascending=False)
    return ranked.head(L)
95
+
96
def doSavePredict_single(_id,seq,predict_rsa,fout,des,pred_asa=None):
    """Convert per-residue RSA and ASA values and optionally save them.

    Args:
        _id: Identifier used as the output file stem.
        seq: RNA sequence (string or iterable of one-letter codes).
        predict_rsa: Relative values, one per residue. Ignored and
            recomputed when ``pred_asa`` is given.
        fout: Output directory. Nothing is written when falsy.
        des: Description written as the header line of the output file.
        pred_asa: Optional absolute values. When ``None`` they are derived
            as ``predict_rsa * scale`` with the per-base scale table below.

    Returns:
        ``(pred_asa, predict_rsa)``.

    Raises:
        SystemExit: If any absolute value is exactly 0.
    """
    if fout:
        # Only prepare the directory when something will be written.
        check_path(fout)
    BASES = 'AUCG'
    asa_std = [400, 350, 350, 400]
    dict_rnam1_ASA = dict(zip(BASES, asa_std))
    sequence = re.sub(r"[T]", "U", ''.join(seq))
    # Unknown characters are replaced so a prediction can still be produced.
    # NOTE(review): one base is drawn per call and reused for every unknown
    # character - confirm this is intended.
    sequence = re.sub(r"[^AGCU]", BASES[random.randint(0, 3)], sequence)
    ASA_scale = np.array([dict_rnam1_ASA[i] for i in sequence])

    if pred_asa is None:
        pred_asa = np.multiply(predict_rsa, ASA_scale).T
    else:
        predict_rsa = pred_asa/ASA_scale
    col1 = np.arange(1, len(seq) + 1)[None, :]   # 1-based residue index
    col2 = np.array(list(seq))[None, :]          # residue letter
    col3 = pred_asa
    col4 = predict_rsa
    if np.any(col3 == 0):
        # raise SystemExit is what exit() did, without relying on the
        # site-injected builtin.
        raise SystemExit(f'error in predict\t {_id},{seq}')
    temp = np.vstack((np.char.mod('%d', col1), col2, np.char.mod('%.2f', col3), np.char.mod('%.3f', col4))).T
    if fout:
        np.savetxt(os.path.join(fout, f'{_id}.txt'), (temp), delimiter='\t\t', fmt="%s",
                   header=f'#{des}',
                   comments='')

    return pred_asa,predict_rsa
121
+
122
def one_hot_encode(sequences,alpha='ACGU'):
    """One-hot encode a sequence over the alphabet ``alpha``.

    Args:
        sequences: Iterable of one-letter symbols, e.g. a string.
        alpha: Allowed symbols. Output columns follow the *sorted* unique
            symbols of ``alpha``, not the order given. This matches the
            category order of the scikit-learn encoder used previously.

    Returns:
        ``float64`` array of shape ``(len(sequences), n_unique_symbols)``.
        Symbols outside ``alpha`` give an all-zero row. An empty sequence
        gives an array with zero rows.
    """
    print(sequences)
    # np.unique sorts and de-duplicates, so column order is reproduced
    # without fitting an encoder on every call.
    categories = np.unique(np.array(list(alpha), dtype=str))
    symbols = np.array(list(sequences), dtype=str).reshape(-1, 1)
    # Broadcasting (n, 1) against (1, k) marks the matching category per row.
    seq_encode = (symbols == categories[None, :]).astype(np.float64)
    return (seq_encode)
131
+
132
def get_bin_pred(df_pred,threshold):
    """Binarise a score table: 1 where score >= ``threshold``, else 0.

    Args:
        df_pred: pandas object exposing ``.values``.
        threshold: Inclusive cut-off.

    Returns:
        Integer NumPy array with the same shape as ``df_pred.values``.
    """
    return np.greater_equal(df_pred.values, threshold).astype(int)
136
+
137
def seed_everything(seed=2022):
    """Seed Python, NumPy and PyTorch RNGs and make cuDNN deterministic.

    Args:
        seed: Integer seed applied to every generator and exported as
            ``PYTHONHASHSEED``.
    """
    print('seed_everything to ',seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Seeding makes separate runs reproducible; successive draws within one
    # run still differ from each other.
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)
    # Mini-batch lengths keep changing, so cuDNN autotuning is switched off
    # because it would mostly waste time.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
146
+
147
+
148
def contact_partner_constrained(prob_matrix, colmax=12, rowmax=24):
    """Zero every entry that is not among the best partners of its row and column.

    An entry is kept only if it is one of the ``rowmax`` largest values of
    its row AND one of the ``colmax`` largest values of its column.

    Args:
        prob_matrix: 2-D NumPy array of scores.
        colmax: Number of top entries retained per column.
        rowmax: Number of top entries retained per row.

    Returns:
        Array shaped like ``prob_matrix`` with rejected entries set to 0.
    """
    negated = -prob_matrix  # ascending argsort of the negation = descending order

    best_per_row = np.argsort(negated, axis=1)[:, :rowmax]
    row_keep = np.zeros_like(prob_matrix)
    np.put_along_axis(row_keep, best_per_row, 1, axis=1)

    best_per_col = np.argsort(negated, axis=0)[:colmax, :]
    col_keep = np.zeros_like(prob_matrix)
    np.put_along_axis(col_keep, best_per_col, 1, axis=0)

    keep = np.logical_and(row_keep, col_keep)
    return np.where(keep, prob_matrix, 0)
161
+ def getParam():
162
+ parser = ArgumentParser()
163
+ # data
164
+ parser.add_argument('--rootdir', default='',
165
+ type=str)
166
+ parser.add_argument('--fasta', default='./example/inputs/8DMB_W.8DMB_P.fasta',
167
+ type=str)
168
+ parser.add_argument('--out', default='./example/outputs/',
169
+ type=str)
170
+ parser.add_argument('--ffeat', default='./example/inputs/{pdbid}.pickle',
171
+ type=str)
172
+ parser.add_argument('--fmodel', default='./weight/model_roc_0_38=0.845.pt',
173
+ type=str)
174
+ parser.add_argument('--device', default='cpu',
175
+ type=str)
176
+ parser.add_argument('--draw',action='store_true',default=True)
177
+ parser.add_argument('--constrained',action='store_true',default=True)
178
+ args = parser.parse_args()
179
+ return args
180
+ if __name__ == '__main__':
181
+ args = getParam()
182
+ rootdir = args.rootdir
183
+ fasta = args.fasta
184
+ ffeat = args.ffeat
185
+ fmodel = args.fmodel
186
+ device = args.device
187
+ out = args.out
188
+ draw = args.draw
189
+ check_path(out)
190
+
191
+ # pdbid = fasta.rsplit('/',1)[0].split('.')[0]
192
+ seed_everything(seed=2022)
193
+
194
+ models = [(model_path,torch.load(model_path, map_location=torch.device(device))) for model_path in glob.glob(fmodel)]
195
+ # models = [(model_path,torch.load(model_path, map_location=torch.device(device))) for model_path in glob.glob(fmodel)]
196
+ print('loading existed model', fmodel)
197
+ with torch.no_grad():
198
+ for pdbid,seq in [(record.id,record.seq) for record in SeqIO.parse(fasta,'fasta')]:
199
+ rnaid,proid= pdbid.split('.')
200
+ rnaseq,proseq= seq.split('.')
201
+
202
+ with open(ffeat.format_map({'pdbid':rnaid}),'rb') as f:
203
+ rna_emb = pickle.load(f)
204
+ with open(ffeat.format_map({'pdbid':proid}),'rb') as f:
205
+ pro_emb = pickle.load(f)
206
+
207
+ rna_oh = one_hot_encode(rnaseq, alpha='ACGU')
208
+ pro_oh = one_hot_encode(proseq, alpha='GAVLIFWYDNEKQMSTCPHR')
209
+
210
+ # mask = np.ones((emb.shape[0],1)) # mask missing nt when evaluate the model
211
+ x_train = np.concatenate([rna_oh,rna_emb],axis=1)
212
+ x_train = np.expand_dims(x_train,0)
213
+ x_train = torch.from_numpy(x_train).transpose(-1,-2)
214
+ x_train = x_train.to(device, dtype=torch.float)
215
+ x_rna = x_train
216
+
217
+ x_train = np.concatenate([pro_oh, pro_emb], axis=1)
218
+ x_train = np.expand_dims(x_train, 0)
219
+ x_train = torch.from_numpy(x_train).transpose(-1, -2)
220
+ x_train = x_train.to(device, dtype=torch.float)
221
+ x_pro = x_train
222
+
223
+ print('input data shape for rna and protein:',x_rna.shape,x_pro.shape)
224
+
225
+ x_rna = x_rna.to(device, dtype=torch.float32)
226
+ x_pro = x_pro.to(device, dtype=torch.float32)
227
+
228
+
229
+ ###########
230
+
231
+ predict_scores = []
232
+
233
+ #######
234
+ for i,(model_path,model) in enumerate(models):
235
+ model.eval()
236
+ outputs = model(x_pro, x_rna) # [1, 299, 74, 1]
237
+ # print('outputs,',outputs.device)
238
+ outputs = torch.squeeze(outputs, -1)
239
+ outputs = outputs.permute(0, 2, 1)
240
+
241
+ df_pred = outputs[0].cpu().detach().numpy()
242
+
243
+ # Apply constraints and normalization
244
+ if args.constrained:contact_matrix = contact_partner_constrained(df_pred)
245
+ contact_matrix = (contact_matrix - contact_matrix.min()) / (
246
+ contact_matrix.max() - contact_matrix.min() + 1e-8)
247
+
248
+ # seq = data._seq[pdbid] if pdbid in data._seq else None
249
+ des = f'predict by {__file__}\n#{model_path}'
250
+ doSavePredict(pdbid, {'rna':rnaseq,'protein':proseq}, df_pred,
251
+ out,
252
+ des
253
+ )
254
+
255
+ tmp = df_pred.flatten()
256
+ tmp.sort()
257
+ score = sum(tmp[::-1][:sum(df_pred.shape)])
258
+ predict_scores.append((pdbid, score))
259
+ print('pdbid',pdbid,score) # 这个score是否和label中contact的个数有correlation?
260
+
261
+ if draw:
262
+ plt.figure(figsize=(20, 15))
263
+ top = sum(df_pred.shape)
264
+ df_pred = pd.DataFrame(df_pred)
265
+ threshold = df_pred.stack().nlargest(top).iloc[-1]
266
+ bin_pred = get_bin_pred(df_pred,threshold=threshold)
267
+
268
+ import seaborn as sns
269
+ sns.heatmap(df_pred,mask=bin_pred,cbar_kws={"shrink": 0.5},cmap='coolwarm',vmin=0,vmax=1)
270
+ plt.title(f'Predicted contact map of {pdbid}\nPredidcted by RPcontact, top L=r+p')
271
+ plt.xlabel(proid)
272
+ plt.ylabel(rnaid)
273
+ handles, labels = plt.gca().get_legend_handles_labels()
274
+
275
+ plt.legend(handles, labels, bbox_to_anchor=(1.05, 1), loc='upper left', ncol=1, borderaxespad=1,
276
+ frameon=False)
277
+ # 设置坐标轴的相同缩放
278
+ ax = plt.gca()
279
+ ax.set_aspect('equal')
280
+ plt.tight_layout()
281
+ plt.savefig(f'{out}/{pdbid}_{i}_prob.png',dpi=300)
282
+ plt.show()
283
+
284
+ plt.clf()
285
+ ax = plt.gca()
286
+ tp = \
287
+ ax.plot(*np.where(bin_pred.T==1), ".", c='r',markersize=1, label='Predicted contact')[
288
+ 0]
289
+ tp.set_markerfacecolor('w')
290
+ tp.set_markeredgecolor('r')
291
+ h,w = bin_pred.shape
292
+ plt.xlim([0,w])
293
+ plt.ylim([0,h])
294
+ plt.title(f'Predicted contact map of {pdbid}\nPredidcted by RPcontact, top L=r+p')
295
+ plt.xlabel(proid)
296
+ plt.ylabel(rnaid)
297
+ handles, labels = plt.gca().get_legend_handles_labels()
298
+
299
+ plt.legend(handles, labels, bbox_to_anchor=(1.05, 1), loc='upper left', ncol=1, borderaxespad=1,
300
+ frameon=False)
301
+
302
+ # 设置坐标轴的相同缩放
303
+ ax.set_aspect('equal')
304
+ plt.tight_layout()
305
+ plt.savefig(f'{out}/{pdbid}_{i}_binary.png',dpi=300)
306
+ plt.show()
307
+
308
+
309
+ print(f'predict {pdbid} with {len(seq)} nts')
310
+
311
+ df = pd.DataFrame(predict_scores, columns=['pdbid', 'contact_score'])
312
+ df.to_csv(args.out + '/predict_scores.csv',index=False, sep='\t', mode='a', float_format='%.5f')
313
+
314
+
315
+
316
+
317
+
318
+
predict_batch.py ADDED
@@ -0,0 +1,312 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ # Created by: [email protected]
4
+ # des : batch prediction of RNA-protein contacts with RPcontact
5
+ import glob
6
+ import pickle
7
+ import random
8
+ import re
9
+ from argparse import ArgumentParser
10
+ import matplotlib.pyplot as plt
11
+
12
+ import torch
13
+ from Bio import SeqIO
14
+ from sklearn.preprocessing import OneHotEncoder
15
+ import numpy as np
16
+ import os
17
+ import pandas as pd
18
class bcolors:
    """ANSI escape sequences used to colour or reset terminal output."""

    # Bold foreground colours
    RED = "\033[1;31m"
    BLUE = "\033[1;34m"
    CYAN = "\033[1;36m"
    # Regular (non-bold) green
    GREEN = "\033[0;32m"
    # Attribute switches
    RESET = "\033[0;0m"
    BOLD = "\033[;1m"
    REVERSE = "\033[;7m"
26
+
27
+
28
def check_path(dirout, file=False):
    """Create the directory ``dirout`` (with any missing parents) if needed.

    Args:
        dirout: Directory path, or a file path when ``file`` is true.
        file: When true, ``dirout`` is treated as a file path and only its
            parent directory is created.
    """
    if file:
        # os.path.dirname yields '' for a bare file name, whereas
        # rsplit('/', 1)[0] would return the file name itself and a
        # directory with that name would be created by mistake.
        dirout = os.path.dirname(dirout)
    if not dirout:
        # The target lives in the current directory: nothing to create.
        return
    if not os.path.exists(dirout):
        print('make dir ' + dirout)
        # exist_ok keeps this safe when another process creates the same
        # directory between the existence check and this call; any other
        # failure (e.g. missing permissions) is no longer hidden.
        os.makedirs(dirout, exist_ok=True)
36
+
37
def load_label_pred(fin_label, fin_pred):
    """Load a pickled label object and a tab-separated prediction table.

    Args:
        fin_label: Path to the pickled label object.
        fin_pred: Path to the prediction table; lines starting with '#'
            are skipped and the first column is used as the index.

    Returns:
        Tuple ``(df_label, df_pred)``. ``df_pred`` has its row and column
        names rewritten to ``<token><1-based position>``.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only pass
    # label files that come from a trusted source.
    with open(fin_label, 'rb') as f:
        df_label = pickle.load(f)
    df_label = df_label.squeeze()
    df_pred = pd.read_table(fin_pred, comment='#', index_col=[0])
    if isinstance(df_label, pd.DataFrame):
        # Give the prediction the label's axes so both can be filtered alike.
        df_pred.index = df_label.index
        df_pred.columns = df_label.columns
        # Drop rows that are entirely NaN
        df_label = df_label.dropna(how='all')
        # Drop columns that are entirely NaN
        df_label = df_label.dropna(axis=1, how='all')
        df_pred = df_pred.loc[df_label.index, df_label.columns]
    # Names containing exactly two dots keep their last dot-separated token;
    # all other names keep their first token.
    keep = 0
    if df_pred.columns[0].count('.') == 2:
        keep = -1
    df_pred.columns = [e.split('.')[keep] + str(i + 1) for i, e in enumerate(df_pred.columns)]
    df_pred.index = [e.split('.')[keep] + str(i + 1) for i, e in enumerate(df_pred.index)]
    return df_label, df_pred
57
def doSavePredict(_id, seq, predict, fout, des):
    """Write a predicted contact matrix and its top-L entries to ``fout``.

    Two files are produced inside the directory ``fout``:
    ``<_id>.txt`` with the full matrix and ``<_id>_topL.txt`` with the
    highest-scoring (rna, protein, pred) rows, where L = rows + columns.

    Args:
        _id: Identifier used as the output file stem.
        seq: Mapping with keys 'rna' and 'protein' holding the sequences,
            e.g. {'protein': 'KKGVG...', 'rna': 'GGGGC...'}; when falsy the
            matrix is written with default integer labels and no header.
        predict: 2-D score matrix (rows labelled from the RNA sequence,
            columns from the protein sequence).
        fout: Output directory; created when missing.
        des: Free-text description written as the first '#' comment line.
    """
    check_path(fout)
    df = pd.DataFrame(predict)
    # os.path.join keeps the files inside ``fout`` whether or not the caller
    # passed a trailing path separator.
    fmatrix = os.path.join(fout, f'{_id}.txt')
    if not seq:
        df.to_csv(fmatrix, sep='\t', mode='w', float_format='%.5f')
    else:
        df.columns = [f'{elem}{index+1}' for index, elem in enumerate(seq['protein'])]
        df.index = [f'{elem}{index+1}' for index, elem in enumerate(seq['rna'])]
        # Write the '#' header first, then append the matrix below it.
        with open(fmatrix, 'w') as f:
            f.write(f'#{des}\n')
            f.write(f"# row =rna:{seq['rna']}\n")
            f.write(f"# col=protein:{seq['protein']}\n")
        df.to_csv(fmatrix, sep='\t', mode='a', float_format='%.5f')

    df = get_top_l_triplets(df, sum(df.shape))
    df.to_csv(os.path.join(fout, f'{_id}_topL.txt'), sep='\t', mode='w',
              float_format='%.5f', index=False)
75
+
76
+
77
+
78
def get_top_l_triplets(df_pred, L):
    """Return the L highest-scoring cells of a contact matrix in long form.

    Args:
        df_pred: pandas DataFrame holding the contact matrix.
        L: int, number of cells to keep.

    Returns:
        DataFrame with columns 'rna' (row label), 'protein' (column label)
        and 'pred' (cell value), sorted by 'pred' in descending order and
        truncated to the first L rows.
    """
    long_form = df_pred.stack().reset_index()
    long_form.columns = ['rna', 'protein', 'pred']
    ranked = long_form.sort_values(by='pred', ascending=False)
    return ranked.head(L)
93
+
94
def doSavePredict_single(_id, seq, predict_rsa, fout, des, pred_asa=None):
    """Convert between per-nucleotide RSA and ASA and optionally save a table.

    ASA is RSA multiplied by a per-base scale (A=400, U=350, C=350, G=400).
    When ``pred_asa`` is given, RSA is recomputed from it instead.

    Args:
        _id: Identifier used as the output file stem.
        seq: RNA sequence (string or iterable of single characters).
        predict_rsa: Predicted RSA values; ignored when ``pred_asa`` is given.
        fout: Output directory; when falsy nothing is written to disk.
        des: Description written as the header line of the table.
        pred_asa: Optional ASA values to start from.

    Returns:
        Tuple ``(pred_asa, predict_rsa)``.

    Raises:
        SystemExit: if any ASA value is exactly zero.
    """
    if fout:
        # Only touch the filesystem when an output directory was requested.
        check_path(fout)
    BASES = 'AUCG'
    asa_std = [400, 350, 350, 400]
    dict_rnam1_ASA = dict(zip(BASES, asa_std))
    sequence = re.sub(r"[T]", "U", ''.join(seq))
    # Any other character is replaced by a random base so a prediction can
    # still be scaled.
    # NOTE(review): the base is drawn once per call, so every unknown
    # character in the sequence gets the same replacement -- confirm intended.
    sequence = re.sub(r"[^AGCU]", BASES[random.randint(0, 3)], sequence)
    ASA_scale = np.array([dict_rnam1_ASA[i] for i in sequence])

    if pred_asa is None:
        pred_asa = np.multiply(predict_rsa, ASA_scale).T
    else:
        predict_rsa = pred_asa / ASA_scale
    col1 = np.arange(1, len(seq) + 1)[None, :]  # 1-based positions
    col2 = np.array(list(seq))[None, :]  # original (unsubstituted) symbols
    col3 = pred_asa
    col4 = predict_rsa
    if np.any(col3 == 0):
        # SystemExit is what the site-provided exit() raised; raising it
        # directly also works where the site module is not loaded.
        raise SystemExit(f'error in predict\t {_id},{seq}')
    temp = np.vstack((np.char.mod('%d', col1), col2,
                      np.char.mod('%.2f', col3), np.char.mod('%.3f', col4))).T
    if fout:
        np.savetxt(os.path.join(fout, f'{_id}.txt'), temp, delimiter='\t\t', fmt="%s",
                   header=f'#{des}',
                   comments='')

    return pred_asa, predict_rsa
119
+
120
def one_hot_encode(sequences, alpha='ACGU'):
    """One-hot encode a sequence over the alphabet ``alpha``.

    Args:
        sequences: Sequence to encode; each character becomes one row.
        alpha: String whose characters form the alphabet the encoder is
            fitted on.

    Returns:
        Dense array with one row per input character, as produced by
        ``OneHotEncoder.transform(...).toarray()``.
    """
    # NOTE(review): the column order is decided by the fitted encoder and may
    # differ from the character order of ``alpha`` -- confirm before relying
    # on column positions.
    encoder = OneHotEncoder(handle_unknown='ignore')
    encoder.fit(np.array(list(alpha)).reshape(-1, 1))
    residues = np.array(list(sequences)).reshape(-1, 1)
    return encoder.transform(residues).toarray()
129
+
130
def get_bin_pred(df_pred, threshold):
    """Binarise a score table: 1 where the value is >= ``threshold``, else 0.

    Args:
        df_pred: Object exposing ``.values`` (e.g. a pandas DataFrame).
        threshold: Inclusive cut-off.

    Returns:
        Integer array with the same shape as ``df_pred.values``.
    """
    return (df_pred.values >= threshold).astype(int)
134
+
135
def seed_everything(seed=2022):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Integer seed applied to every generator.
    """
    print('seed_everything to ',seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Same seed for each random number generator used by the pipeline.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    # Benchmark mode is switched off because mini-batch lengths keep
    # changing, which makes that optimisation a waste of time.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
144
def getParam():
    """Parse the command-line options of the batch prediction script.

    Returns:
        argparse namespace with the string options listed below plus the
        boolean flag ``draw``.
    """
    # (flag, default) pairs; every one of them is parsed as a string.
    string_options = (
        ('--rootdir', ''),
        ('--rna_fasta', './example/inputs_batch/rna.fasta'),
        ('--pro_fasta', './example/inputs_batch/protein.fasta'),
        ('--csv', './example/inputs_batch/pairs.csv'),
        ('--col', '_id'),
        ('--out', './example/outputs_batch/'),
        ('--ffeat', './example/inputs_batch/embedding/{element}/{pdbid}.pickle'),
        ('--fmodel', './weight/model_roc_0_38=0.845.pt'),
        ('--device', 'cpu'),
    )
    parser = ArgumentParser()
    for flag, default in string_options:
        parser.add_argument(flag, default=default, type=str)
    parser.add_argument('--draw', action='store_true')
    return parser.parse_args()
170
if __name__ == '__main__':
    # Batch entry point: for every RNA/protein pair listed in the CSV, load
    # the sequences and pre-computed embeddings, run each loaded model, save
    # the contact matrix and collect one summary score per pair and model.
    args = getParam()
    rootdir = args.rootdir  # parsed for CLI compatibility; not used below
    csv = args.csv
    col = args.col
    rna_fasta = args.rna_fasta
    pro_fasta = args.pro_fasta
    ffeat = args.ffeat
    fmodel = args.fmodel
    device = args.device
    out = args.out
    draw = args.draw
    check_path(out)

    seed_everything(seed=2022)

    # NOTE(security): torch.load unpickles the file; only load model weights
    # obtained from a trusted source.
    models = [(model_path, torch.load(model_path, map_location=torch.device(device)))
              for model_path in glob.glob(fmodel)]
    if not models:
        # Without this check an unmatched --fmodel pattern would silently
        # produce an empty score table.
        raise SystemExit(f'no model file matches {fmodel}')
    print('loading existed model', fmodel)
    with torch.no_grad():
        rna_dict = {record.id: str(record.seq) for record in SeqIO.parse(rna_fasta, 'fasta')}
        pro_dict = {record.id: str(record.seq) for record in SeqIO.parse(pro_fasta, 'fasta')}

        df = pd.read_csv(csv)
        predict_scores = []
        for pdbid in df[col]:
            # Pair identifiers have the form '<rna id>.<protein id>'.
            rnaid, proid = pdbid.split('.')

            rnaseq, proseq = rna_dict[rnaid], pro_dict[proid]

            # NOTE(security): pickle.load can execute arbitrary code; the
            # embedding files must come from a trusted source.
            with open(ffeat.format_map({'pdbid': rnaid, 'element': 'rna'}), 'rb') as f:
                rna_emb = pickle.load(f)
            with open(ffeat.format_map({'pdbid': proid, 'element': 'protein'}), 'rb') as f:
                pro_emb = pickle.load(f)

            rna_oh = one_hot_encode(rnaseq.replace('T', 'U'), alpha='ACGU')
            pro_oh = one_hot_encode(proseq, alpha='GAVLIFWYDNEKQMSTCPHR')

            # Concatenate one-hot and embedding features per residue, add a
            # leading batch axis, then swap the last two axes before moving
            # the tensor to the target device as float32.
            x_rna = np.expand_dims(np.concatenate([rna_oh, rna_emb], axis=1), 0)
            x_rna = torch.from_numpy(x_rna).transpose(-1, -2).to(device, dtype=torch.float)

            x_pro = np.expand_dims(np.concatenate([pro_oh, pro_emb], axis=1), 0)
            x_pro = torch.from_numpy(x_pro).transpose(-1, -2).to(device, dtype=torch.float)

            for i, (model_path, model) in enumerate(models):
                model.eval()
                outputs = model(x_pro, x_rna)  # e.g. [1, 299, 74, 1]
                outputs = torch.squeeze(outputs, -1)
                outputs = outputs.permute(0, 2, 1)

                df_pred = outputs[0].cpu().detach().numpy()
                des = f'predict by {__file__}\n#{model_path}'
                doSavePredict(pdbid, {'rna': rnaseq, 'protein': proseq}, df_pred,
                              out,
                              des
                              )

                # Summary score: sum of the L largest entries, with
                # L = number of rows + number of columns.
                tmp = df_pred.flatten()
                tmp.sort()
                score = sum(tmp[::-1][:sum(df_pred.shape)])
                predict_scores.append((pdbid, score))
                print(pdbid, score)

                if draw:
                    plt.figure(figsize=(20, 15))
                    top = sum(df_pred.shape)
                    df_pred = pd.DataFrame(df_pred)
                    # The L-th largest value is the cut-off for a predicted contact.
                    threshold = df_pred.stack().nlargest(top).iloc[-1]
                    bin_pred = get_bin_pred(df_pred, threshold=threshold)

                    import seaborn as sns
                    # NOTE(review): seaborn's mask hides the cells where the
                    # mask is true, so the top-L cells are left blank in this
                    # heatmap -- confirm this is intended.
                    sns.heatmap(df_pred, mask=bin_pred)
                    plt.title(f'Predicted contact map of {pdbid}\nPredicted by RPcontact, top L=r+p')
                    plt.xlabel(proid)
                    plt.ylabel(rnaid)
                    handles, labels = plt.gca().get_legend_handles_labels()

                    plt.legend(handles, labels, bbox_to_anchor=(1.05, 1), loc='upper left', ncol=1, borderaxespad=1,
                               frameon=False)
                    # Use the same scale on both axes
                    ax = plt.gca()
                    ax.set_aspect('equal')
                    plt.tight_layout()
                    plt.savefig(f'{out}/{pdbid}_{i}_prob.png', dpi=300)
                    plt.show()

                    plt.clf()
                    ax = plt.gca()
                    tp = \
                        ax.plot(*np.where(bin_pred.T == 1), ".", c='r', markersize=1, label='Predicted contact')[
                            0]
                    tp.set_markerfacecolor('w')
                    tp.set_markeredgecolor('r')
                    h, w = bin_pred.shape
                    plt.xlim([0, w])
                    plt.ylim([0, h])
                    plt.title(f'Predicted contact map of {pdbid}\nPredicted by RPcontact, top L=r+p')
                    plt.xlabel(proid)
                    plt.ylabel(rnaid)
                    handles, labels = plt.gca().get_legend_handles_labels()

                    plt.legend(handles, labels, bbox_to_anchor=(1.05, 1), loc='upper left', ncol=1, borderaxespad=1,
                               frameon=False)

                    # Use the same scale on both axes
                    ax.set_aspect('equal')
                    plt.tight_layout()
                    plt.savefig(f'{out}/{pdbid}_{i}_binary.png', dpi=300)
                    plt.show()
                    # Release the figure; otherwise one stays open per pair
                    # and model for the whole batch run.
                    plt.close()

            # Report the RNA length of the pair just processed.
            print(f'predict {pdbid} with {len(rnaseq)} nts')

    df_scores = pd.DataFrame(predict_scores, columns=['pdbid', 'contact_score'])
    df_scores.to_csv(os.path.join(out, 'predict_scores.tsv'), index=False, sep='\t', mode='w',
                     float_format='%.5f')
307
+
308
+
309
+
310
+
311
+
312
+
readme.md ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <p align="center">
2
+ <img src="https://raw.githubusercontent.com/JulseJiang/RPcontact/main/example/logo.png" alt="RPcontact Logo" width="120"/>
3
+ </p>
4
+
5
+ # RPcontact: RNA-Protein Contact Prediction
6
+
7
+ **Improved prediction of RNA-protein contacts using RNA and protein language models**
8
+
9
+ [Paper](https://www.biorxiv.org/content/10.1101/2025.06.02.657171v1.full)
10
+ [Code](https://github.com/rpcontact)
11
+ [Demo](https://julse-rpcontact.hf.space/)
12
+
13
+
14
+ ---
15
+
16
+ ## Overview
17
+
18
+ RPcontact is a novel computational tool for accurately predicting RNA-protein contacts, addressing a fundamental challenge in understanding molecular biology processes such as transcription, splicing, and translation. Traditional methods are limited by the scarcity of RNA-protein complex structures and the constraints of experimental techniques. While recent deep learning approaches like AlphaFold 3 and RoseTTAFoldNA have made progress, they still rely heavily on homologous templates.
19
+
20
+ RPcontact overcomes these limitations by leveraging large language models specifically designed for RNA ([ERNIE-RNA](https://github.com/Bruce-ywj/ERNIE-RNA)) and proteins ([ESM-2](https://github.com/facebookresearch/esm)). Trained exclusively on ribosomal RNA-protein complexes, RPcontact delivers robust and generalized performance, accurately predicting contacts in both dimeric and multimeric non-rRNA-protein complexes. Benchmark results show that RPcontact significantly outperforms binary contacts inferred from models like AlphaFold 3 and RoseTTAFoldNA, making it a valuable tool for structure and function prediction in RNA-protein research.
21
+
22
+ ---
23
+
24
+ ## Quick Start
25
+
26
+ ### Requirements
27
+
28
+ | Dependency | Recommended Version |
29
+ |-------------|--------------------|
30
+ | Python | ≥ 3.8 |
31
+ | PyTorch | 1.13.1 |
32
+ | fair-esm | 1.0.2 |
33
+
34
+ Install dependencies (example):
35
+ ```bash
36
+ pip install numpy pandas matplotlib biopython scikit-learn
37
+ pip install torch==1.13.1
38
+ pip install fair-esm==1.0.2
39
+ ```
40
+
41
+ ---
42
+
43
+ ### Script Overview
44
+
45
+ | Script | Function | Example Command |
46
+ |-------------------|-------------------------------------|---------------------------------|
47
+ | predict.py | Single RNA-protein pair contact prediction | `python predict.py` |
48
+ | predict_batch.py | Batch RNA-protein pairs contact prediction | `python predict_batch.py` |
49
+ | evaluate.py | Evaluation and visualization | `python evaluate.py` |
50
+ | app.py | Launch web-based demo interface (requires gradio) | `python app.py` |
51
+
52
+ ---
53
+
54
+ ### Data Preparation
55
+
56
+ - RNA/protein sequences: FASTA format
57
+ - Embedding features: pickle format
58
+ - For batch prediction: provide a CSV file for pairing info
59
+
60
+ ---
61
+
62
+ ### Typical Usage
63
+
64
+ **Single pair prediction:**
65
+ ```bash
66
+ python predict.py --fasta your_sequence.fasta --out output_dir/
67
+ ```
68
+
69
+ **Batch prediction:**
70
+ ```bash
71
+ python predict_batch.py --rna_fasta rna.fasta --pro_fasta protein.fasta --csv pairs.csv --out output_dir/
72
+ ```
73
+
74
+ **Evaluation:**
75
+ ```bash
76
+ python evaluate.py --fasta your_sequence.fasta --out eval_dir/ --flabel true_labels.pickle
77
+ ```
78
+
79
+ ---
80
+
81
+ ### Common Parameters
82
+
83
+ | Parameter | Description |
84
+ |---------------|--------------------------------------------------------|
85
+ | --fasta | Input FASTA file (for single prediction) |
86
+ | --rna_fasta | RNA FASTA file (for batch prediction) |
87
+ | --pro_fasta | Protein FASTA file (for batch prediction) |
88
+ | --csv | RNA-protein pairing info CSV (for batch prediction) |
89
+ | --ffeat | Precomputed embedding feature file (pickle format) |
90
+ | --fmodel | Pretrained model file path |
91
+ | --out | Output directory |
92
+ | --flabel | True label file (for evaluation) |
93
+ | --device | Specify device (e.g., cpu or cuda:0) |
94
+ | --draw | Whether to visualize results |
95
+
96
+ ---
97
+
98
+ ## Output Interpretation
99
+
100
+ - The prediction output is a contact probability matrix for each RNA-protein pair. Higher scores indicate a higher probability of interaction.
101
+ - The evaluation script provides accuracy and other metrics, as well as visualization.
102
+
103
+ ---
104
+
105
+ ## Contact & Citation
106
+
107
+ Questions or suggestions? Contact:
108
+
109
+ - Jiuhong Jiang
110
+ - Email: [email protected]
111
+
112
+ If you find this project helpful, please cite our manuscript.
113
+ - Jiang, J., Zhang, X., Zhan, J., Miao, Z., & Zhou, Y. (2025). RPcontact: Improved prediction of RNA-protein contacts using RNA and protein language models. bioRxiv, 2025-06.
114
+ ---
115
+
116
+ <p align="center"><em>Make RNA-protein contact prediction easier and more accurate!</em></p>
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ Bio==1.8.0
2
+ biopython==1.81
3
+ gradio==5.35.0
4
+ matplotlib==3.5.1
5
+ numpy==1.24.4
6
+ pandas==1.5.3
7
+ plotly==5.24.1
8
+ scikit_learn==1.2.1
9
+ seaborn==0.13.2
10
+ torch==2.4.1
third_part_tool/ernie_rna/readme.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://github.com/Bruce-ywj/ERNIE-RNA
third_part_tool/esm2/readme.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Install ESM-2 by following the instructions at https://github.com/facebookresearch/esm/
2
+
3
+ Use this pretrained model: esm2_t48_15B_UR50D
4
+
weight/readme.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ model_roc_0_38=0.845.pt for OH+RP_Emb with data augmentation
2
+ model_roc_0_56=0.779.pt for OH with data augmentation
3
+
4
+ The model weights will be available for download after the paper is accepted by a journal.