Spaces:

julse
/

RPcontact

Running

App Files Files Community

julse committed on Jul 2

Commit

82d55c6

verified ·

1 Parent(s): 4d203ee

Upload 23 files

Browse files

Files changed (24) hide show

.gitattributes +2 -0
RNA_protein/.DS_Store +0 -0
RNA_protein/model/.DS_Store +0 -0
RNA_protein/model/atn_gz.py +342 -0
RPcontact_pipline.sh +71 -0
app.py +661 -0
benchmark/.DS_Store +0 -0
benchmark/readme.txt +1 -0
evaluate.py +209 -0
example/inputs/8DMB_W.8DMB_P.fasta +2 -0
example/inputs/readme.txt +6 -0
example/outputs/8DMB_W.8DMB_P.txt +0 -0
example/outputs/8DMB_W.8DMB_P_0_binary.png +0 -0
example/outputs/8DMB_W.8DMB_P_0_evaluate.png +3 -0
example/outputs/8DMB_W.8DMB_P_0_prob.png +3 -0
example/outputs/8DMB_W.8DMB_P_topL.txt +1026 -0
example/outputs/predict_scores.csv +20 -0
predict.py +318 -0
predict_batch.py +312 -0
readme.md +116 -0
requirements.txt +10 -0
third_part_tool/ernie_rna/readme.txt +1 -0
third_part_tool/esm2/readme.txt +4 -0
weight/readme.txt +4 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+example/outputs/8DMB_W.8DMB_P_0_evaluate.png filter=lfs diff=lfs merge=lfs -text
+example/outputs/8DMB_W.8DMB_P_0_prob.png filter=lfs diff=lfs merge=lfs -text

RNA_protein/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

RNA_protein/model/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

RNA_protein/model/atn_gz.py ADDED Viewed

	@@ -0,0 +1,342 @@

+import math
+import torch
+import torch.nn as nn
+# from torch.nn import Module
+# # for gzlabel contable_gpu env
+# class MultiheadAttention(Module):
+#     r"""Allows the model to jointly attend to information
+#     from different representation subspaces.
+#     See `Attention Is All You Need <https://arxiv.org/abs/1706.03762>`_.
+#
+#     .. math::
+#         \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O
+#
+#     where :math:`head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`.
+#
+#     Args:
+#         embed_dim: Total dimension of the model.
+#         num_heads: Number of parallel attention heads. Note that ``embed_dim`` will be split
+#             across ``num_heads`` (i.e. each head will have dimension ``embed_dim // num_heads``).
+#         dropout: Dropout probability on ``attn_output_weights``. Default: ``0.0`` (no dropout).
+#         bias: If specified, adds bias to input / output projection layers. Default: ``True``.
+#         add_bias_kv: If specified, adds bias to the key and value sequences at dim=0. Default: ``False``.
+#         add_zero_attn: If specified, adds a new batch of zeros to the key and value sequences at dim=1.
+#             Default: ``False``.
+#         kdim: Total number of features for keys. Default: ``None`` (uses ``kdim=embed_dim``).
+#         vdim: Total number of features for values. Default: ``None`` (uses ``vdim=embed_dim``).
+#         batch_first: If ``True``, then the input and output tensors are provided
+#             as (batch, seq, feature). Default: ``False`` (seq, batch, feature).
+#
+#     Examples::
+#
+#         >>> multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
+#         >>> attn_output, attn_output_weights = multihead_attn(query, key, value)
+#     """
+#     __constants__ = ['batch_first']
+#     bias_k: Optional[torch.Tensor]
+#     bias_v: Optional[torch.Tensor]
+#
+#     def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bias_kv=False, add_zero_attn=False,
+#                  kdim=None, vdim=None, batch_first=False, device=None, dtype=None) -> None:
+#         factory_kwargs = {'device': device, 'dtype': dtype}
+#         super(MultiheadAttention, self).__init__()
+#         self.embed_dim = embed_dim
+#         self.kdim = kdim if kdim is not None else embed_dim
+#         self.vdim = vdim if vdim is not None else embed_dim
+#         self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
+#
+#         self.num_heads = num_heads
+#         self.dropout = dropout
+#         self.batch_first = batch_first
+#         self.head_dim = embed_dim // num_heads
+#         assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
+#
+#         if self._qkv_same_embed_dim is False:
+#             self.q_proj_weight = Parameter(torch.empty((embed_dim, embed_dim), **factory_kwargs))
+#             self.k_proj_weight = Parameter(torch.empty((embed_dim, self.kdim), **factory_kwargs))
+#             self.v_proj_weight = Parameter(torch.empty((embed_dim, self.vdim), **factory_kwargs))
+#             self.register_parameter('in_proj_weight', None)
+#         else:
+#             self.in_proj_weight = Parameter(torch.empty((3 * embed_dim, embed_dim), **factory_kwargs))
+#             self.register_parameter('q_proj_weight', None)
+#             self.register_parameter('k_proj_weight', None)
+#             self.register_parameter('v_proj_weight', None)
+#
+#         if bias:
+#             self.in_proj_bias = Parameter(torch.empty(3 * embed_dim, **factory_kwargs))
+#         else:
+#             self.register_parameter('in_proj_bias', None)
+#         self.out_proj = NonDynamicallyQuantizableLinear(embed_dim, embed_dim, bias=bias, **factory_kwargs)
+#
+#         if add_bias_kv:
+#             self.bias_k = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs))
+#             self.bias_v = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs))
+#         else:
+#             self.bias_k = self.bias_v = None
+#
+#         self.add_zero_attn = add_zero_attn
+#
+#         self._reset_parameters()
+#
+#     def _reset_parameters(self):
+#         if self._qkv_same_embed_dim:
+#             xavier_uniform_(self.in_proj_weight)
+#         else:
+#             xavier_uniform_(self.q_proj_weight)
+#             xavier_uniform_(self.k_proj_weight)
+#             xavier_uniform_(self.v_proj_weight)
+#
+#         if self.in_proj_bias is not None:
+#             constant_(self.in_proj_bias, 0.)
+#             constant_(self.out_proj.bias, 0.)
+#         if self.bias_k is not None:
+#             xavier_normal_(self.bias_k)
+#         if self.bias_v is not None:
+#             xavier_normal_(self.bias_v)
+#
+#     def __setstate__(self, state):
+#         # Support loading old MultiheadAttention checkpoints generated by v1.1.0
+#         if '_qkv_same_embed_dim' not in state:
+#             state['_qkv_same_embed_dim'] = True
+#
+#         super(MultiheadAttention, self).__setstate__(state)
+#
+#     def forward(self, query: Tensor, key: Tensor, value: Tensor, key_padding_mask: Optional[Tensor] = None,
+#                 need_weights: bool = True, attn_mask: Optional[Tensor] = None) -> Tuple[Tensor, Optional[Tensor]]:
+#         r"""
+#     Args:
+#         query: Query embeddings of shape :math:`(L, N, E_q)` when ``batch_first=False`` or :math:`(N, L, E_q)`
+#             when ``batch_first=True``, where :math:`L` is the target sequence length, :math:`N` is the batch size,
+#             and :math:`E_q` is the query embedding dimension ``embed_dim``. Queries are compared against
+#             key-value pairs to produce the output. See "Attention Is All You Need" for more details.
+#         key: Key embeddings of shape :math:`(S, N, E_k)` when ``batch_first=False`` or :math:`(N, S, E_k)` when
+#             ``batch_first=True``, where :math:`S` is the source sequence length, :math:`N` is the batch size, and
+#             :math:`E_k` is the key embedding dimension ``kdim``. See "Attention Is All You Need" for more details.
+#         value: Value embeddings of shape :math:`(S, N, E_v)` when ``batch_first=False`` or :math:`(N, S, E_v)` when
+#             ``batch_first=True``, where :math:`S` is the source sequence length, :math:`N` is the batch size, and
+#             :math:`E_v` is the value embedding dimension ``vdim``. See "Attention Is All You Need" for more details.
+#         key_padding_mask: If specified, a mask of shape :math:`(N, S)` indicating which elements within ``key``
+#             to ignore for the purpose of attention (i.e. treat as "padding"). Binary and byte masks are supported.
+#             For a binary mask, a ``True`` value indicates that the corresponding ``key`` value will be ignored for
+#             the purpose of attention. For a byte mask, a non-zero value indicates that the corresponding ``key``
+#             value will be ignored.
+#         need_weights: If specified, returns ``attn_output_weights`` in addition to ``attn_outputs``.
+#             Default: ``True``.
+#         attn_mask: If specified, a 2D or 3D mask preventing attention to certain positions. Must be of shape
+#             :math:`(L, S)` or :math:`(N\cdot\text{num\_heads}, L, S)`, where :math:`N` is the batch size,
+#             :math:`L` is the target sequence length, and :math:`S` is the source sequence length. A 2D mask will be
+#             broadcasted across the batch while a 3D mask allows for a different mask for each entry in the batch.
+#             Binary, byte, and float masks are supported. For a binary mask, a ``True`` value indicates that the
+#             corresponding position is not allowed to attend. For a byte mask, a non-zero value indicates that the
+#             corresponding position is not allowed to attend. For a float mask, the mask values will be added to
+#             the attention weight.
+#
+#     Outputs:
+#         - **attn_output** - Attention outputs of shape :math:`(L, N, E)` when ``batch_first=False`` or
+#           :math:`(N, L, E)` when ``batch_first=True``, where :math:`L` is the target sequence length, :math:`N` is
+#           the batch size, and :math:`E` is the embedding dimension ``embed_dim``.
+#         - **attn_output_weights** - Attention output weights of shape :math:`(N, L, S)`, where :math:`N` is the batch
+#           size, :math:`L` is the target sequence length, and :math:`S` is the source sequence length. Only returned
+#           when ``need_weights=True``.
+#         """
+#         if self.batch_first:
+#             query, key, value = [x.transpose(1, 0) for x in (query, key, value)]
+#
+#         if not self._qkv_same_embed_dim:
+#             attn_output, attn_output_weights = F.multi_head_attention_forward(
+#                 query, key, value, self.embed_dim, self.num_heads,
+#                 self.in_proj_weight, self.in_proj_bias,
+#                 self.bias_k, self.bias_v, self.add_zero_attn,
+#                 self.dropout, self.out_proj.weight, self.out_proj.bias,
+#                 training=self.training,
+#                 key_padding_mask=key_padding_mask, need_weights=need_weights,
+#                 attn_mask=attn_mask, use_separate_proj_weight=True,
+#                 q_proj_weight=self.q_proj_weight, k_proj_weight=self.k_proj_weight,
+#                 v_proj_weight=self.v_proj_weight)
+#         else:
+#             attn_output, attn_output_weights = F.multi_head_attention_forward(
+#                 query, key, value, self.embed_dim, self.num_heads,
+#                 self.in_proj_weight, self.in_proj_bias,
+#                 self.bias_k, self.bias_v, self.add_zero_attn,
+#                 self.dropout, self.out_proj.weight, self.out_proj.bias,
+#                 training=self.training,
+#                 key_padding_mask=key_padding_mask, need_weights=need_weights,
+#                 attn_mask=attn_mask)
+#         if self.batch_first:
+#             return attn_output.transpose(1, 0), attn_output_weights
+#         else:
+#             return attn_output, attn_output_weights
+class PositionalEncoding(nn.Module):
+  """Add fixed sinusoidal position encodings to a batch-first input, then apply dropout.
+
+  The encoding table is computed once in ``__init__`` and registered as the
+  buffer ``pe`` of shape ``(1, max_len, d_model)``. It is not a parameter, so
+  this module has nothing to train.
+
+  Args:
+    d_model: Size of the last (feature) dimension of the input. Odd values are supported.
+    dropout: Dropout probability applied after the encodings are added.
+    max_len: Number of positions to precompute. Longer inputs are rejected in ``forward``.
+  """
+  def __init__(self, d_model, dropout, max_len=5000):
+    super(PositionalEncoding, self).__init__()
+    self.dropout = nn.Dropout(p=dropout)
+    # Compute the positional encodings once in log space.
+    pe = torch.zeros(max_len, d_model)  # one d_model-sized row per position
+    position = torch.arange(0, max_len).unsqueeze(1)  # (max_len,) -> (max_len, 1)
+    # One frequency per even feature index, i.e. ceil(d_model / 2) values.
+    div_term = torch.exp(torch.arange(0, d_model, 2) *
+      -(math.log(10000.0) / d_model))
+    pe[:, 0::2] = torch.sin(position * div_term)  # even feature indices
+    # An odd d_model has one fewer odd index than even indices, so trim the
+    # frequencies to match. The slice is a no-op when d_model is even.
+    pe[:, 1::2] = torch.cos(position * div_term[:d_model // 2])  # odd feature indices
+    pe = pe.unsqueeze(0)  # (max_len, d_model) -> (1, max_len, d_model); dim 0 broadcasts over the batch
+    self.register_buffer('pe', pe)
+  def forward(self, x):
+    """Return ``dropout(x + pe[:, :x.size(1)])``.
+
+    Args:
+      x: Tensor whose dim 1 is the sequence position and whose last dim is ``d_model``.
+
+    Raises:
+      RuntimeError: If ``x.size(1)`` exceeds the number of precomputed positions.
+    """
+    seq_len = x.size(1)
+    if seq_len > self.pe.size(1):
+      # Previously this surfaced as an opaque broadcasting RuntimeError from the
+      # addition below. The exception type is kept, only the message is clearer.
+      raise RuntimeError(
+        f"sequence length {seq_len} exceeds max_len={self.pe.size(1)} of PositionalEncoding")
+    # The same encoding slice is broadcast to every sequence in the batch.
+    x = x + self.pe[:, :seq_len]
+    return self.dropout(x)
+# Note: the positional encodings are fixed and never updated, so this class has no trainable parameters.
+class TwoTrackAttention(nn.Module):
+    """Update one sequence with self-attention plus cross-attention to a second sequence.
+
+    ``forward`` adds both attention results to the input as residuals, applies
+    LayerNorm, then a ReLU feed-forward block with its own residual and LayerNorm.
+    Both attention modules are built with ``batch_first=True``.
+
+    Args:
+        d_attn: Embedding size used by both attention modules and both LayerNorms.
+        n_head: Number of attention heads.
+        d_ff: Hidden width of the feed-forward block.
+        dropout: Dropout probability used inside attention and on every residual branch.
+    """
+    def __init__(self, d_attn, n_head, d_ff=512, dropout=0.1) -> None:
+        super().__init__()
+        # Sub-modules are created in a fixed order so state-dict keys and
+        # seeded initialisation stay stable.
+        # Original author's note: the PyTorch version on the "gzbl" environment
+        # does not have the batch_first argument.
+        attn_kwargs = dict(dropout=dropout, batch_first=True)
+        self.self_attn = torch.nn.MultiheadAttention(d_attn, n_head, **attn_kwargs)
+        self.dropout_self = nn.Dropout(dropout)
+        self.cross_attn = torch.nn.MultiheadAttention(d_attn, n_head, **attn_kwargs)
+        self.dropout_cross = nn.Dropout(dropout)
+        self.norm1 = nn.LayerNorm(d_attn)
+        self.ff1 = nn.Linear(d_attn, d_ff)
+        self.dropout_ff = nn.Dropout(dropout)
+        self.ff2 = nn.Linear(d_ff, d_attn)
+        self.norm2 = nn.LayerNorm(d_attn)
+        self.dropout = nn.Dropout(dropout)
+        self.activation = nn.ReLU()
+    def forward(self, obj_update, obj_message):
+        """Return ``obj_update`` refined by itself and by ``obj_message``.
+
+        Args:
+            obj_update: Sequence being updated; used as query for both attentions.
+            obj_message: Sequence providing keys and values for the cross-attention.
+        """
+        # Only the attended values are used; the attention weights are discarded.
+        attended_self, _ = self.self_attn(query=obj_update, key=obj_update, value=obj_update)
+        attended_cross, _ = self.cross_attn(query=obj_update, key=obj_message, value=obj_message)
+        # Residual connection over both attention branches, then normalise.
+        hidden = obj_update + self.dropout_self(attended_self)
+        hidden = hidden + self.dropout_cross(attended_cross)
+        hidden = self.norm1(hidden)
+        # Position-wise feed-forward block with its own residual connection.
+        expanded = self.activation(self.ff1(hidden))
+        projected = self.ff2(self.dropout_ff(expanded))
+        hidden = hidden + self.dropout(projected)
+        return self.norm2(hidden)
+class SymertricTwoTrackAttention(nn.Module):
+    """Pair of ``TwoTrackAttention`` blocks that update two sequences against each other.
+
+    Args:
+        d_attn, n_head, d_ff, dropout: Forwarded unchanged to both ``TwoTrackAttention`` blocks.
+        sync: If true, both updates read the *original* inputs. Otherwise the
+            second update reads the already-updated first sequence.
+    """
+    def __init__(self, d_attn, n_head, d_ff=512, dropout=0.1,sync = False) -> None:
+        super().__init__()
+        self.tta1 = TwoTrackAttention(d_attn, n_head, d_ff, dropout)
+        self.tta2 = TwoTrackAttention(d_attn, n_head, d_ff, dropout)
+        self.sync = sync
+    def forward(self, obj_1, obj_2):
+        """Return the updated ``(obj_1, obj_2)`` pair."""
+        updated_1 = self.tta1(obj_1, obj_2)
+        # In sync mode the second track still sees the untouched obj_1;
+        # otherwise it sees the freshly updated one.
+        message_for_2 = obj_1 if self.sync else updated_1
+        updated_2 = self.tta2(obj_2, message_for_2)
+        return updated_1, updated_2
+class LinearFF(nn.Module):
+    """Project channel-first features to ``d_out`` with Linear -> ReLU -> Dropout -> LayerNorm.
+
+    Args:
+        d_in: Size of dim 1 of the input (it becomes the last dim after the permute).
+        d_out: Output feature size.
+        dropout: Dropout probability applied after the activation.
+    """
+    def __init__(self, d_in, d_out, dropout=0.1) -> None:
+        super().__init__()
+        self.emb = nn.Linear(d_in, d_out)
+        self.norm = nn.LayerNorm(d_out)
+        self.dropout = nn.Dropout(dropout)
+        self.activation = nn.ReLU()
+    def forward(self, f_in):
+        """Swap dims 1 and 2 of ``f_in``, then embed, activate, drop out and normalise."""
+        channels_last = f_in.permute(0, 2, 1)
+        embedded = self.emb(channels_last)
+        activated = self.activation(embedded)
+        return self.norm(self.dropout(activated))
+class ProteinRNAInteraction(nn.Module):
+    """Predict a pairwise protein-by-RNA probability map from per-position features.
+
+    Both inputs are embedded with ``LinearFF``, given positional encodings,
+    passed through ``n_layers`` ``SymertricTwoTrackAttention`` layers, combined
+    by element-wise product for every (protein position, RNA position) pair and
+    scored by a linear layer followed by a sigmoid.
+
+    Args:
+        d_pro: Feature size of the protein input (dim 1 of ``f_pro``).
+        d_rna: Feature size of the RNA input (dim 1 of ``f_rna``).
+        n_layers: Number of symmetric two-track attention layers.
+        d_attn: Shared embedding size.
+        n_head, d_ff, dropout, sync: Forwarded to every attention layer.
+    """
+    def __init__(self, d_pro, d_rna, n_layers, d_attn, n_head=4, d_ff=512, dropout=0.1,sync=False) -> None:
+        super().__init__()
+        print('sync update ProteinRNAInteraction', sync)
+        self.pro_emb = LinearFF(d_pro, d_attn)
+        # Despite its name, ``pro_rna`` embeds the RNA features. The attribute
+        # name is kept as is.
+        self.pro_rna = LinearFF(d_rna, d_attn)
+        self.pro_pos = PositionalEncoding(d_attn, dropout)
+        self.rna_pos = PositionalEncoding(d_attn, dropout)
+        self.layers = nn.ModuleList()
+        for _ in range(n_layers):
+            self.layers.append(
+                SymertricTwoTrackAttention(d_attn, n_head, d_ff, dropout, sync=sync))
+        self.pred = nn.Linear(d_attn, 1)
+        self.sigmoid = nn.Sigmoid()
+    def forward(self, f_pro, f_rna):
+        """Return sigmoid scores with one entry per (protein position, RNA position) pair.
+
+        The result keeps a trailing singleton dimension from the final linear layer.
+        """
+        pro = self.pro_pos(self.pro_emb(f_pro))
+        rna = self.rna_pos(self.pro_rna(f_rna))
+        for block in self.layers:
+            pro, rna = block(pro, rna)
+        # Insert singleton axes so the product broadcasts over every
+        # protein/RNA position pair: [B, L, 1, D] * [B, 1, R, D] -> [B, L, R, D].
+        pro = pro.unsqueeze(2)
+        rna = rna.unsqueeze(1)
+        pair_features = rna.mul(pro)
+        return self.sigmoid(self.pred(pair_features))
+        # An earlier variant, left disabled by the author, concatenated the two
+        # repeated tensors and scored them with nn.Linear(2 * d_attn, 1).

RPcontact_pipline.sh ADDED Viewed

	@@ -0,0 +1,71 @@

+#!/usr/bin/bash
+# RPcontact pipeline: build the RNA/protein combination table, launch the
+# ERNIE-RNA and ESM-2 embedding jobs through srun, wait for them, then run the
+# contact-map prediction script.
+# Check the number of arguments
+if [ "$#" -ne 4 ]; then
+    echo "Usage: $0 <fin_fasta> <dirout> <esm2_env_path> <ernie_rna_env_path>"
+    exit 1
+fi
+fin_fasta=$1
+dirout=$2
+esm2_env_path=$3
+ernie_rna_env_path=$4
+# Derived paths
+WDIR=$dirout
+rna_fasta=$WDIR/_0_process/rna_sequences.fasta
+pro_fasta=$WDIR/_0_process/protein_sequences.fasta
+fcombinations=$WDIR/_0_process/combinations.csv
+finfo=$WDIR/_0_process/info.csv
+current_path=$WDIR/_0_process/
+# Create the required directories
+mkdir -p "$current_path"
+mkdir -p "$current_path/ernie_rna_emb"
+mkdir -p "$current_path/esm2_emb"
+mkdir -p "$current_path/rpcontact"
+mkdir -p "$current_path/no_constrained"
+mkdir -p "$current_path/constrained"
+# Write the combinations file. Each input line is read as comma-separated
+# fields: rna_id, rna_seq, pro_id, pro_seq, rna_len, pro_len. Anything after
+# the sixth field is discarded. Splitting with `read` avoids the word-splitting
+# and globbing that an unquoted `echo $line | cut` would apply to the data.
+# The output has five columns: id, rna_seq, pro_seq, rna_len, pro_len.
+# NOTE(review): rows are appended, so re-running into the same <dirout>
+# duplicates them - confirm whether the file should be truncated first.
+while IFS=',' read -r rna_id rna_seq pro_id pro_seq rna_len pro_len _; do
+    printf '%s.%s,%s,%s,%s,%s\n' "$rna_id" "$pro_id" "$rna_seq" "$pro_seq" "$rna_len" "$pro_len" >> "$fcombinations"
+done < "$fin_fasta"
+# Print summary information
+# NOTE(review): nothing in this script writes $rna_fasta or $pro_fasta, so they
+# are presumably produced by an earlier step - verify before relying on the
+# counts below or on the embedding jobs.
+echo "Done. RNA sequences are in $rna_fasta, protein sequences are in $pro_fasta, and combinations are in $fcombinations."
+# In the combinations file the RNA length is column 4 and the protein length is column 5.
+echo "RNA count: $(wc -l < "$rna_fasta"), RNA max length: $(awk -F',' '{print $4}' "$fcombinations" | sort -nr | head -n 1), RNA min length: $(awk -F',' '{print $4}' "$fcombinations" | sort -n | head -n 1), total fragments: $(wc -l < "$fcombinations")"
+echo "Protein count: $(wc -l < "$pro_fasta"), Protein max length: $(awk -F',' '{print $5}' "$fcombinations" | sort -nr | head -n 1), Protein min length: $(awk -F',' '{print $5}' "$fcombinations" | sort -n | head -n 1), total fragments: $(wc -l < "$fcombinations")"
+echo "Sequence length longer than 1000 were truncated and kept head and tail with the length of 1000, sliding 500 as step, 1000 as window"
+# ERNIE-RNA embeddings
+# NOTE(review): the ERNIE-RNA and ESM checkout directories below are hard-coded
+# absolute paths from the author's cluster.
+ERNIE_RNA_script="cd /public/home/jiang_jiuhong/soft/ERNIE-RNA/
+$ernie_rna_env_path/miniconda3/envs/ERNIE-RNA/bin/python extract_embedding_jh.py --seqs_path='$rna_fasta' --save_path='$current_path/ernie_rna_emb/' --device=cpu"
+echo "$ERNIE_RNA_script" > "$current_path/ernie_rna_emb.sh"
+chmod +x "$current_path/ernie_rna_emb.sh"
+nohup srun -p hebhcnormal01 -c 32 sh "$current_path/ernie_rna_emb.sh" > "$current_path/log_ernie_rna_emb.txt" 2>&1 &
+# ESM2 embeddings
+ESM2_script="cd /public/home/jiang_jiuhong/code/esm/
+$esm2_env_path/miniconda3/envs/esm2_env/bin/python scripts/extract.py esm2_t48_15B_UR50D $pro_fasta $current_path/esm2_emb/ --repr_layers 48 --include mean per_tok"
+echo "$ESM2_script" > "$current_path/esm2_emb.sh"
+chmod +x "$current_path/esm2_emb.sh"
+nohup srun -p hebhcnormal01 -c 32 sh "$current_path/esm2_emb.sh" > "$current_path/log_esm2_emb.txt" 2>&1 &
+# Wait for both background embedding jobs to finish
+wait
+# Run RPcontact to obtain the contact map
+python process_rna_protein.py --rna_fasta="$rna_fasta" --pro_fasta="$pro_fasta" --csv="$fcombinations" --WDIR="$WDIR" --out="$dirout"

app.py ADDED Viewed

	@@ -0,0 +1,661 @@

+import gradio as gr
+import pandas as pd
+import numpy as np
+import random
+import tempfile
+import os
+import zipfile
+import io
+from Bio import SeqIO
+import torch
+from sklearn.preprocessing import OneHotEncoder
+import plotly.graph_objects as go
+class RPContactPredictor:
+    """Load a pickled contact model and predict an RNA-by-protein contact matrix on CPU."""
+    def __init__(self, model_path='./weight/model_roc_0_56=0.779.pt'):
+        """Load the model from ``model_path``, switch it to eval mode and seed the RNGs.
+
+        The checkpoint is used as a complete module (``.eval()`` is called on
+        the loaded object), so it must be unpickled in full.
+        """
+        cpu = torch.device('cpu')
+        try:
+            # PyTorch >= 2.6 defaults to weights_only=True, which refuses to
+            # unpickle a whole module, so the flag is passed explicitly.
+            # SECURITY: full unpickling executes code embedded in the file;
+            # only load checkpoints from a trusted source.
+            self.model = torch.load(model_path, map_location=cpu, weights_only=False)
+        except TypeError:
+            # Older PyTorch releases do not accept the weights_only keyword.
+            self.model = torch.load(model_path, map_location=cpu)
+        self.model.eval()
+        self.seed_everything()
+    def seed_everything(self, seed=2022):
+        """Set random seed for reproducibility"""
+        random.seed(seed)
+        os.environ['PYTHONHASHSEED'] = str(seed)
+        np.random.seed(seed)
+        torch.manual_seed(seed)
+        torch.cuda.manual_seed(seed)
+        torch.backends.cudnn.deterministic = True
+        torch.backends.cudnn.benchmark = False
+    def one_hot_encode(self, sequences, alpha='ACGU'):
+        """One-hot encode a sequence string against the alphabet ``alpha``.
+
+        Returns an array with one row per character of ``sequences``. Characters
+        outside ``alpha`` are encoded as all-zero rows (``handle_unknown='ignore'``).
+        """
+        sequences_array = np.array(list(sequences)).reshape(-1, 1)
+        label = np.array(list(alpha)).reshape(-1, 1)
+        enc = OneHotEncoder(handle_unknown='ignore')
+        enc.fit(label)
+        seq_encode = enc.transform(sequences_array).toarray()
+        return seq_encode
+    def contact_partner_constrained(self, prob_matrix, colmax=12, rowmax=24):
+        """Keep only entries that rank within the top ``rowmax`` of their row AND the top ``colmax`` of their column.
+
+        All other entries are set to 0. The input array is not modified.
+        """
+        # Mask of the rowmax largest entries in every row.
+        row_max_indices = np.argsort(-prob_matrix, axis=1)[:, :rowmax]
+        row_max_mask = np.zeros_like(prob_matrix)
+        row_max_mask[np.arange(prob_matrix.shape[0])[:, np.newaxis], row_max_indices] = 1
+        # Mask of the colmax largest entries in every column.
+        col_max_indices = np.argsort(-prob_matrix, axis=0)[:colmax, :]
+        col_max_mask = np.zeros_like(prob_matrix)
+        col_max_mask[col_max_indices, np.arange(prob_matrix.shape[1])] = 1
+        mask = np.logical_and(row_max_mask, col_max_mask).astype(np.float32)
+        prob_matrix = np.where(mask == 1, prob_matrix, 0)
+        return prob_matrix
+    def read_fasta(self, fasta_content):
+        """Parse the first record of FASTA text into RNA and protein entries.
+
+        The record id must look like ``<rna_id>.<protein_id>`` and the sequence
+        like ``<rna_seq>.<protein_seq>``. Only the first record is used.
+
+        Returns:
+            ``{'rna': (rna_id, rna_seq), 'protein': (protein_id, protein_seq)}``
+
+        Raises:
+            ValueError: If there is no record, or the id or sequence does not
+                contain exactly one ``.`` separator.
+        """
+        # Parse straight from memory; no temporary file is needed.
+        record = next(SeqIO.parse(io.StringIO(fasta_content), 'fasta'), None)
+        if record is None:
+            raise ValueError("no FASTA record found in the input")
+        id_parts = record.id.split('.')
+        seq_parts = str(record.seq).split('.')
+        if len(id_parts) != 2 or len(seq_parts) != 2:
+            raise ValueError(
+                "FASTA record must use '<rna_id>.<protein_id>' as id and "
+                "'<rna_seq>.<protein_seq>' as sequence")
+        return {
+            'rna': (id_parts[0], seq_parts[0]),
+            'protein': (id_parts[1], seq_parts[1])
+        }
+    def predict_contact(self, rna_seq, protein_seq):
+        """Predict the contact matrix for one RNA/protein pair.
+
+        Returns a 2-D array with one row per RNA position and one column per
+        protein position, constrained by ``contact_partner_constrained`` and
+        min-max scaled.
+        """
+        # Encode sequences
+        rna_oh = self.one_hot_encode(rna_seq, alpha='ACGU')
+        pro_oh = self.one_hot_encode(protein_seq, alpha='GAVLIFWYDNEKQMSTCPHR')
+        # Prepare input tensors: add a batch axis and move the one-hot axis before the length axis
+        x_rna = torch.from_numpy(np.expand_dims(rna_oh, 0)).transpose(-1, -2).float()
+        x_pro = torch.from_numpy(np.expand_dims(pro_oh, 0)).transpose(-1, -2).float()
+        # Run prediction
+        with torch.no_grad():
+            outputs = self.model(x_pro, x_rna)
+        # Drop the trailing singleton axis and put the RNA axis first
+        outputs = torch.squeeze(outputs, -1).permute(0, 2, 1)
+        contact_matrix = outputs[0].cpu().numpy()
+        # Apply constraints and normalization (epsilon guards a constant matrix)
+        contact_matrix = self.contact_partner_constrained(contact_matrix)
+        contact_matrix = (contact_matrix - contact_matrix.min()) / (contact_matrix.max() - contact_matrix.min() + 1e-8)
+        return contact_matrix
+def create_heatmap(contact_matrix, rna_labels, protein_labels, rna_name, protein_name, Threshold=0.0):
+    """Build a Plotly heatmap of the contact matrix with values below ``Threshold`` zeroed.
+
+    Rows (y axis) are labelled with ``rna_labels`` and columns (x axis) with
+    ``protein_labels``. The input matrix is copied, not modified.
+    """
+    # Work on a copy so the caller's matrix keeps its sub-threshold values.
+    visible = contact_matrix.copy()
+    visible[visible < Threshold] = 0
+    hover_text = 'RNA: %{y}<br>Protein: %{x}<br>Probability: %{z:.4f}<extra></extra>'
+    trace = go.Heatmap(
+        z=visible,
+        x=protein_labels,
+        y=rna_labels,
+        colorscale='Reds',
+        showscale=True,
+        colorbar=dict(title="Predicted Probability"),
+        hovertemplate=hover_text
+    )
+    fig = go.Figure(data=trace)
+    title_spec = {
+        'text': f"{rna_name} vs {protein_name} (Threshold ≥ {Threshold:.3f})",
+        'x': 0.5,
+        'xanchor': 'center',
+        'yanchor': 'top'
+    }
+    fig.update_layout(
+        title=title_spec,
+        xaxis_title=f"Protein Residues ({protein_name})",
+        yaxis_title=f"RNA Nucleotides ({rna_name})",
+        width=800,
+        height=600,
+        font=dict(size=12)
+    )
+    return fig
+def get_contact_pairs(contact_matrix, rna_labels, protein_labels, Threshold=0.0):
+    """Return the non-zero contacts with probability at or above ``Threshold``.
+
+    Args:
+        contact_matrix: 2-D array with one row per RNA label and one column per protein label.
+        rna_labels: Row labels.
+        protein_labels: Column labels.
+        Threshold: Inclusive lower bound on the probability.
+
+    Returns:
+        DataFrame with columns ``RNA``, ``Protein`` and ``Probability``, sorted by
+        descending probability.
+    """
+    df = pd.DataFrame(contact_matrix, index=rna_labels, columns=protein_labels)
+    df_stacked = df.stack().reset_index()
+    df_stacked.columns = ['RNA', 'Protein', 'Probability']
+    probability = df_stacked['Probability']
+    # The bound is inclusive so that a contact exactly at the threshold is
+    # listed, matching the "≥" wording of the labels built from this table.
+    # Zero entries are always excluded, so Threshold=0.0 still lists only
+    # actual contacts, not every cell of the matrix.
+    keep = (probability >= Threshold) & (probability > 0)
+    df_filtered = df_stacked[keep].sort_values('Probability', ascending=False)
+    return df_filtered
+def create_download_files(contact_matrix, rna_labels, protein_labels, rna_name, protein_name):
+    """Write the heatmap CSV and the contact-pair CSV, bundle them in a ZIP and return its path.
+
+    The files are created in a fresh temporary directory that is left on disk
+    so the returned path stays valid after this function returns.
+
+    Args:
+        contact_matrix: 2-D array with one row per RNA label and one column per protein label.
+        rna_labels: Row labels.
+        protein_labels: Column labels.
+        rna_name: RNA identifier used in the file names.
+        protein_name: Protein identifier used in the file names.
+    """
+    import re  # imported locally so the file-level import block is unchanged
+    def _safe_name(name):
+        # The names can come from an uploaded FASTA header. Replace anything
+        # that is not a plain file-name character so the result cannot contain
+        # path separators or escape the temporary directory.
+        return re.sub(r'[^A-Za-z0-9._-]+', '_', str(name)) or 'seq'
+    prefix = f"{_safe_name(rna_name)}_{_safe_name(protein_name)}"
+    # Create temporary directory
+    temp_dir = tempfile.mkdtemp()
+    # Save heatmap raw data
+    heatmap_df = pd.DataFrame(contact_matrix, index=rna_labels, columns=protein_labels)
+    heatmap_file = os.path.join(temp_dir, f"{prefix}_heatmap.csv")
+    heatmap_df.to_csv(heatmap_file, index=True)
+    # Save contact pairs list
+    pairs_df = get_contact_pairs(contact_matrix, rna_labels, protein_labels, Threshold=0.0)
+    pairs_file = os.path.join(temp_dir, f"{prefix}_contact_pairs.csv")
+    pairs_df.to_csv(pairs_file, index=False)
+    # Create ZIP file. DEFLATE is requested explicitly because the default
+    # ZIP_STORED would archive the CSV text uncompressed.
+    zip_path = os.path.join(temp_dir, f"{prefix}_results.zip")
+    with zipfile.ZipFile(zip_path, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
+        zipf.write(heatmap_file, os.path.basename(heatmap_file))
+        zipf.write(pairs_file, os.path.basename(pairs_file))
+    return zip_path
+def process_prediction(fasta_file, rna_sequence, protein_sequence, input_method):
+    """Run one prediction request and build every UI output.
+
+    Args:
+        fasta_file: Uploaded FASTA content; it is decoded with ``.decode('utf-8')``.
+        rna_sequence: RNA sequence typed by the user.
+        protein_sequence: Protein sequence typed by the user.
+        input_method: ``"Upload FASTA File"`` selects the upload; any other value
+            selects the typed sequences.
+
+    Returns:
+        A 7-tuple ``(status, heatmap, contact_pairs, contact_info, download_file,
+        result_state, slider_update)``. On failure ``status`` holds the error
+        message and the other six items are ``None``.
+    """
+    missing_input = "❌ Please upload a FASTA file or enter RNA and protein sequences"
+    rna_alphabet = set('ACGU')
+    protein_alphabet = set('GAVLIFWYDNEKQMSTCPHR')
+    def _failure(message):
+        # Every failure path returns the message plus six empty outputs.
+        return message, None, None, None, None, None, None
+    def _clean(sequence):
+        # Pasted sequences often contain line breaks, spaces or lowercase
+        # letters. Normalise them so they are not rejected as invalid.
+        return "".join(sequence.split()).upper()
+    if not fasta_file and not (rna_sequence and protein_sequence):
+        return _failure(missing_input)
+    try:
+        # Process input
+        if input_method == "Upload FASTA File" and fasta_file:
+            fasta_content = fasta_file.decode('utf-8')
+            sequences = predictor.read_fasta(fasta_content)
+        else:
+            # A file was uploaded but direct entry is selected without both
+            # sequences: report it clearly instead of failing later on None.
+            if not (rna_sequence and protein_sequence):
+                return _failure(missing_input)
+            # Create sequences from text input
+            sequences = {
+                'rna': ('RNA', rna_sequence),
+                'protein': ('Protein', protein_sequence)
+            }
+        rna_id, rna_seq = sequences['rna']
+        protein_id, protein_seq = sequences['protein']
+        rna_seq = _clean(rna_seq)
+        protein_seq = _clean(protein_seq)
+        # Validate sequences
+        bad_rna = set(rna_seq) - rna_alphabet
+        if bad_rna:
+            return _failure(f"❌ RNA sequence contains invalid characters: {bad_rna}")
+        bad_protein = set(protein_seq) - protein_alphabet
+        if bad_protein:
+            return _failure(f"❌ Protein sequence contains invalid characters: {bad_protein}")
+        # Run contact prediction
+        contact_matrix = predictor.predict_contact(rna_seq, protein_seq)
+        # Generate residue labels (letter followed by 1-based position)
+        rna_labels = [f'{nt}{i + 1}' for i, nt in enumerate(rna_seq)]
+        protein_labels = [f'{aa}{i + 1}' for i, aa in enumerate(protein_seq)]
+        # Calculate default Threshold (minimum non-zero value)
+        non_zero_values = contact_matrix[contact_matrix > 0]
+        default_threshold = float(np.min(non_zero_values)) if len(non_zero_values) > 0 else 0.0
+        max_threshold = float(np.max(contact_matrix))
+        # Create initial heatmap with default Threshold
+        heatmap = create_heatmap(contact_matrix, rna_labels, protein_labels, rna_id, protein_id, default_threshold)
+        # Create initial contact pairs table
+        contact_pairs = get_contact_pairs(contact_matrix, rna_labels, protein_labels, default_threshold)
+        # Create download file
+        download_file = create_download_files(contact_matrix, rna_labels, protein_labels, rna_id, protein_id)
+        # Prepare status message
+        status = "✅ Prediction completed!\n"
+        status += f"RNA length: {len(rna_seq)}\n"
+        status += f"Protein length: {len(protein_seq)}\n"
+        status += f"Total predicted contacts: {len(contact_pairs)}"
+        # Prepare result state for threshold updates
+        result_state = {
+            'contact_matrix': contact_matrix,
+            'rna_labels': rna_labels,
+            'protein_labels': protein_labels,
+            'rna_id': rna_id,
+            'protein_id': protein_id
+        }
+        # Update slider configuration. When all contacts share one value the
+        # range is empty, so fall back to a fixed step; a step of 0 is not valid.
+        threshold_span = max_threshold - default_threshold
+        slider_update = gr.update(
+            minimum=default_threshold,
+            maximum=max_threshold,
+            value=default_threshold,
+            step=threshold_span / 100 if threshold_span > 0 else 0.01,
+            visible=True
+        )
+        # Create contact pairs info
+        contact_info = f"📊 Found {len(contact_pairs)} contacts (Threshold ≥ {default_threshold:.3f})"
+        return status, heatmap, contact_pairs, contact_info, download_file, result_state, slider_update
+    except Exception as e:
+        # UI boundary: report any failure to the user instead of crashing the app.
+        return _failure(f"❌ Prediction failed: {str(e)}")
+def update_results_with_threshold(Threshold, result_state):
+    """Rebuild the heatmap, contact table and summary line for a new threshold.
+
+    Returns ``(None, None, None)`` when no prediction state is available.
+    """
+    if result_state is None:
+        return None, None, None
+    matrix = result_state['contact_matrix']
+    rna_labels = result_state['rna_labels']
+    protein_labels = result_state['protein_labels']
+    # Create updated heatmap
+    heatmap = create_heatmap(
+        matrix, rna_labels, protein_labels,
+        result_state['rna_id'], result_state['protein_id'],
+        Threshold
+    )
+    # Create updated contact pairs table
+    contact_pairs = get_contact_pairs(matrix, rna_labels, protein_labels, Threshold)
+    # Create contact pairs info
+    n_contacts = len(contact_pairs)
+    contact_info = f"📊 Found {n_contacts} contacts (Probability ≥ {Threshold:.3f})"
+    return heatmap, contact_pairs, contact_info
+def reset_threshold(result_state):
+    """Return a slider update that resets the threshold to the smallest non-zero contact value.
+
+    With no prediction state the slider value is simply set to 0.0. If the
+    matrix has no positive entries, both slider bounds fall back to 0.0.
+    """
+    if result_state is None:
+        return gr.update(value=0.0)
+    contact_matrix = result_state['contact_matrix']
+    non_zero_values = contact_matrix[contact_matrix > 0]
+    if len(non_zero_values) > 0:
+        default_threshold = float(np.min(non_zero_values))
+        max_threshold = float(np.max(non_zero_values))
+    else:
+        # np.max raises ValueError on an empty selection, so use a fixed bound.
+        default_threshold = 0.0
+        max_threshold = 0.0
+    # Return the slider update object
+    return gr.update(
+        minimum=default_threshold,
+        maximum=max_threshold,
+        value=default_threshold,
+        interactive=True)
+def load_example_data(fasta_input, rna_input, protein_input):
+    # 如果fasta有值（非空），则返回"Upload FASTA File"
+    if fasta_input is not None:
+        return gr.update(value="Upload FASTA File")
+    else:
+        return gr.update(value="Enter Sequences Directly")
+def create_interface():
+    """Build the Gradio Blocks UI and wire up its event handlers.
+    The app has a "Contact Prediction" tab (inputs on the left; threshold
+    slider, heatmap, contact table and download on the right) and a static
+    "User Guide" tab. Prediction results are kept in a ``gr.State`` so the
+    threshold callbacks can read them.
+    Returns:
+        The ``gr.Blocks`` app; the caller is responsible for launching it.
+    """
+    # CSS for the contact table and the summary line above it.
+    custom_css = """
+    .gradio-dataframe {
+        background: white !important;
+        border: 1px solid #e0e0e0;
+        border-radius: 8px;
+        box-shadow: 0 2px 4px rgba(0,0,0,0.05);
+    }
+    .dataframe-container {
+        padding: 12px;
+        background: white;
+        border-radius: 8px;
+        box-shadow: 0 2px 4px rgba(0,0,0,0.05);
+    }
+    .contact-info {
+        font-size: 14px;
+        font-weight: 500;
+        margin-bottom: 8px;
+        color: #4a5568;
+    }
+    """
+    with gr.Blocks(title="RNA-Protein Contact Prediction Tool",
+                   theme=gr.themes.Soft(primary_hue="blue", secondary_hue="teal"),
+                   css=custom_css) as app:
+        # Page header: title, links and a short description of the demo model.
+        gr.Markdown("""
+   <center>
+# 🧬 RPcontact: RNA-Protein Contact Prediction
+**Direct Nucleotide–Residue Contact Prediction from Primary Sequences**
+[Paper](https://www.biorxiv.org/content/10.1101/2025.06.02.657171v1.full) |
+[Code](https://github.com/rpcontact) |
+[Demo](https://julse-rpcontact.hf.space/)
+</center>
+> RPcontact predicts direct nucleotide-residue contacts between RNA and protein sequences.
+Leveraging **ERNIE-RNA** for RNA and **ESM-2** for protein modeling, the method provides high-resolution insights into RNA-protein interactions at the atomic level.
+<br><br>Current Demo (auROC 0.779 on VL-49) is optimized for limited CPU environments using efficient one-hot encoding<br>
+    Advanced Model (auROC 0.845 on VL-49), the Embedding-based approach will be released upon paper publication ([contact us](mailto:[email protected]) for early access)
+        """)
+        with gr.Tab("🔬 Contact Prediction"):
+            with gr.Row():
+                with gr.Column(scale=1):
+                    gr.Markdown("## ⚙️ Input Options")
+                    with gr.Group(elem_classes="input-group"):
+                        # FASTA upload is the default method; the two sequence
+                        # text boxes start hidden and are shown by toggle_inputs.
+                        input_method = gr.Radio(
+                            choices=["Upload FASTA File", "Enter Sequences Directly"],
+                            value="Upload FASTA File",
+                            label="Input Method"
+                        )
+                        fasta_input = gr.File(
+                            label="FASTA File",
+                            file_types=['.fasta', '.fa', '.txt'],
+                            type='binary'
+                        )
+                        rna_input = gr.Textbox(
+                            label="RNA Sequence",
+                            placeholder="Enter RNA sequence (use A,C,G,U)",
+                            lines=3,
+                            visible=False
+                        )
+                        protein_input = gr.Textbox(
+                            label="Protein Sequence",
+                            placeholder="Enter protein sequence (standard amino acid codes)",
+                            lines=3,
+                            visible=False
+                        )
+                    # Example data: clicking the example fills the three inputs and
+                    # runs load_example_data to set the input-method radio.
+                    gr.Examples(
+                        examples=[
+                            ["./example/inputs/8DMB_W.8DMB_P.fasta", "GGGCCUUAUUAAAUGACUUC", "MDVPRKMETRRNLRRARRYRK"],
+                        ],
+                        inputs=[fasta_input, rna_input, protein_input],
+                        outputs=[input_method],
+                        label="📋 Example Data (click to load)",
+                        run_on_click=True,
+                        fn = load_example_data
+                    )
+                    # Submit button at the bottom of input column
+                    predict_btn = gr.Button("🚀 Run Prediction", variant="primary", size="lg")
+                    # Status output
+                    status_output = gr.Textbox(label="Prediction Status", lines=5)
+                with gr.Column(scale=2):
+                    # Results section (the components below are created visible)
+                    gr.Markdown("""
+                    ## 📊 Results
+                    """)
+                    # Threshold control section; the slider starts with a 0.0-1.0
+                    # range and is an output of the prediction and reset handlers.
+                    with gr.Row():
+                        threshold_slider = gr.Slider(
+                            label="Contact Probability Threshold",
+                            minimum=0.0,
+                            maximum=1.0,
+                            value=0.0,
+                            step=0.001,
+                            visible=True,
+                            interactive=True
+                        )
+                        reset_btn = gr.Button("Reset to Default", size="sm")
+                    gr.Markdown("""
+                    ### 🎯Contact Map
+                    """)
+                    # Heatmap display
+                    heatmap_plot = gr.Plot(label='Contact Map')
+                    # Contact pairs table with info header
+                    gr.Markdown("### 🎯Contact Pairs")
+                    contact_info = gr.Markdown("", elem_classes="contact-info")
+                    contact_table = gr.Dataframe(
+                        headers=["RNA", "Protein", "Probability"],
+                        datatype=["str", "str", "number"],
+                        row_count=15,
+                        interactive=False,
+                        elem_classes="gradio-dataframe"
+                    )
+                    # Download component for the results file
+                    download_btn = gr.File(
+                        label="📥 Download Results Package",
+                        visible=True
+                    )
+        # User Guide tab: a single static markdown document
+        with gr.Tab("📖 User Guide"):
+            gr.Markdown("""
+            # 📖 Comprehensive User Guide
+            ## 🎯 Overview
+            This tool predicts direct contacts between nucleotides in RNA sequences and residues in protein sequences using a deep learning model based on ERNIE-RNA and ESM-2 embeddings. The tool provides:
+            - **Interactive contact matrix visualization** with adjustable probability thresholds
+            - **Detailed contact pairs list** sorted by prediction confidence
+            - **Downloadable results** in CSV and ZIP formats
+            - **Real-time threshold filtering** for result exploration
+            ## 📋 Input Formats
+            ### Method 1: FASTA File Upload
+            Upload a FASTA file containing both RNA and protein sequences in the following format:
+            ```
+            >RNA_ID.PROTEIN_ID
+            RNA_SEQUENCE.PROTEIN_SEQUENCE
+            ```
+            **Example:**
+            ```
+            >8DMB_W.8DMB_P
+            GGGCCUUAUUAAAUGACUUC.MDVPRKMETRRNLRRARRYRK
+            ```
+            ### Method 2: Direct Sequence Input
+            Enter RNA and protein sequences directly in the respective text boxes:
+            - **RNA Sequence**: Use standard nucleotide codes (A, U, G, C)
+            - **Protein Sequence**: Use standard single-letter amino acid codes (GAVLIFWYDNEKQMSTCPHR)
+            ## 🔬 Understanding Results
+            ### Contact Heatmap
+            - **X-axis**: Protein residue positions (e.g., M1, D2, V3...)
+            - **Y-axis**: RNA nucleotide positions (e.g., G1, G2, G3...)
+            - **Color Intensity**: Contact probability (0.0 to 1.0)
+            - **Red Colors**: Higher contact probability
+            - **White/Light**: Lower or no contact probability
+            ### Contact Pairs Table
+            Lists all predicted contacts above the selected threshold, showing:
+            - **RNA**: Nucleotide position and type
+            - **Protein**: Residue position and type
+            - **Probability**: Contact prediction confidence (0.0-1.0)
+            ### Threshold Control
+            Use the **Contact Probability Threshold** slider to:
+            - Filter contacts by minimum probability
+            - Focus on high-confidence predictions
+            - Explore different confidence levels
+            - Click **"Reset to Default"** to return to the minimum non-zero value
+            ## 📥 Download Options
+            The results package (ZIP file) contains:
+            1. **`*_heatmap.csv`**: Complete contact probability matrix
+                - Rows: RNA nucleotides
+                - Columns: Protein residues
+                - Values: Contact probabilities
+            2. **`*_contact_pairs.csv`**: All contact pairs above zero probability
+                - RNA: Nucleotide identifier
+                - Protein: Residue identifier
+                - Probability: Contact prediction score
+            ## ⚡ Performance Guidelines
+            - **Processing Time**: Scales quadratically with sequence length
+            ### Quality Considerations
+            - Higher probabilities indicate more confident predictions
+            - Consider biological context when interpreting results
+            - Cross-validate important contacts with experimental data
+            ## 🔧 Troubleshooting
+            ### Common Issues
+            **Invalid Characters Error:**
+            - RNA: Only A, U, G, C are allowed
+            - Protein: Only standard 20 amino acids are supported
+            - Check for lowercase letters, numbers, or special characters
+            **File Format Error:**
+            - Ensure FASTA format: `>ID\\nSEQUENCE`
+            - Use period (.) to separate RNA and protein sequences
+            - Check file encoding (UTF-8 recommended)
+            **Empty Results:**
+            - Very short sequences may produce no significant contacts
+            - Try lowering the probability threshold
+            - Verify sequence quality and biological relevance
+            ## 📊 Interpretation Guidelines
+            ### High-Confidence Predictions (≥0.7)
+            - Strong likelihood of direct contact
+            - Priority targets for experimental validation
+            - Suitable for structural modeling constraints
+            ### Medium-Confidence Predictions (0.3-0.7)
+            - Moderate likelihood of interaction
+            - Consider in context with other evidence
+            - Useful for identifying interaction regions
+            ### Low-Confidence Predictions (<0.3)
+            - May represent weak or indirect interactions
+            - Use with caution for biological interpretation
+            - Good for exploratory analysis
+            ## 🔬 Technical Details
+            ### Model Architecture
+            - Based on attention mechanisms and transformer models
+            - Trained on experimentally validated RNA-protein complexes
+            - Uses one-hot encoding for sequence representation
+            - Applies contact partner constraints for biological realism
+            ### Validation Metrics
+            - Cross-validated on diverse RNA-protein complex datasets
+            - Performance metrics available in the original publication
+            - Benchmarked against existing prediction methods
+            ### 📊 Difference between current demo and final model
+            | Model Type          | Checkpoint File           | auROC (VL-49) | LLM embeddings |
+            |---------------------|---------------------------|---------------|-------------------|
+            | OH + RP_Emb (final)         | `model_roc_0_38=0.845.pt` | 0.845         | ✓                |
+            | OH (demo)                  | `model_roc_0_56=0.779.pt` | 0.779         | ✗                |
+            ## 📚 Citation & Contact
+            If you use this tool in your research, please cite:
+            **Jiang, J., Zhang, X., Zhan, J., Miao, Z., & Zhou, Y. (2025). RPcontact: Improved prediction of RNA-protein contacts using RNA and protein language models. bioRxiv, 2025-06.**
+            ### Contact Information
+            For technical issues, feature requests, or collaboration inquiries, please contact the development team.
+            - **Primary Contact**: Jiuhong Jiang
+            - **Email**: [email protected]
+            - **Institution**: ShanghaiTech University, Shanghai, China
+            ---
+            <p align="center"><em>Making RNA-protein interaction prediction accessible and accurate for the research community.</em></p>
+            """)
+        # Hidden state to store prediction results
+        result_state = gr.State()
+        # Event handlers
+        def toggle_inputs(method):
+            """Toggle input visibility based on selected method"""
+            # Returned updates are ordered as (fasta_input, rna_input, protein_input).
+            if method == "Upload FASTA File":
+                return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
+            else:
+                return gr.update(visible=False), gr.update(visible=True), gr.update(visible=True)
+        # Input method change
+        input_method.change(
+            fn=toggle_inputs,
+            inputs=[input_method],
+            outputs=[fasta_input, rna_input, protein_input]
+        )
+        # Prediction button: the seven outputs below match, in order, the
+        # seven values returned by process_prediction.
+        predict_btn.click(
+            fn=process_prediction,
+            inputs=[fasta_input, rna_input, protein_input, input_method],
+            outputs=[
+                status_output,
+                heatmap_plot,
+                contact_table,
+                contact_info,
+                download_btn,
+                result_state,
+                threshold_slider
+            ]
+        )
+        # Threshold slider change: redraw heatmap, table and summary line
+        threshold_slider.change(
+            fn=update_results_with_threshold,
+            inputs=[threshold_slider, result_state],
+            outputs=[heatmap_plot, contact_table, contact_info]
+        )
+        # Reset button: recompute the slider range/value from the stored results
+        reset_btn.click(
+            fn=reset_threshold,
+            inputs=[result_state],
+            outputs=[threshold_slider]
+        )
+    return app
+# Initialize predictor at import time (module-level instance).
+predictor = RPContactPredictor()
+if __name__ == "__main__":
+    # Build the UI and serve it on all network interfaces, port 7860,
+    # without a public share link and with Gradio debug mode on.
+    app = create_interface()
+    app.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False,
+        debug=True
+    )

benchmark/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

benchmark/readme.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ The results predicted by all methods on TS_nt will be available for download after the paper is accepted by a journal.

evaluate.py ADDED Viewed

	@@ -0,0 +1,209 @@

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Created by: [email protected]
+# des : evaluate RPcontact
+import glob
+import os
+import pickle
+import random
+from argparse import ArgumentParser
+import matplotlib.pyplot as plt
+import pandas as pd
+import torch
+from Bio import SeqIO
+from sklearn.preprocessing import OneHotEncoder
+import numpy as np
+from predict import check_path, one_hot_encode, get_bin_pred, doSavePredict
+def get_bin_label(df_label,distance_cutoff):
+    bin_label = df_label < distance_cutoff
+    bin_label = bin_label.astype(int)
+    return bin_label
+def view_evaluate_contact_prob(df_label, bin_pred,ax=None,markersize=5):
+    """Plot predicted contacts against ground truth at 8, 5 and 3.5 Å cutoffs.
+    Args:
+        df_label: Distance matrix; ``.T.isna()`` is called on it, so presumably
+            a pandas DataFrame (rows vs. columns meaning not shown here; the
+            caller labels y as RNA and x as protein).
+        bin_pred: Binary prediction matrix combined element-wise with the
+            thresholded labels.
+        ax: Matplotlib axes to draw on; when None the ``plt`` module is used
+            (i.e. the current figure/axes).
+        markersize: Marker size for every plotted point.
+    Returns:
+        ``(tp_summary, confusing_matrix)`` where ``tp_summary`` is the string
+        "TP@3.5/TP@5/TP@8" and ``confusing_matrix`` holds 1 for ground-truth
+        contacts (8 Å) and 2 for true positives.
+    """
+    confusing_matrix = np.zeros_like(df_label)
+    r, p = confusing_matrix.shape
+    if ax is None:
+        ax = plt
+        ax.xlim([-2, p + 2])
+        ax.ylim([-2, r + 2])
+    else:
+        ax.set_xlim([-2, p + 2])
+        ax.set_ylim([-2, r + 2])
+        ax.set_title('performance')
+    colors = [
+              '#f5e0c4', # [0] false positives
+            # alternative ground-truth palette: '#aaa6ce','#66609c','k'
+            '#b0d9db','#61b3b6','k',# [1..3] ground truth at 8 Å, 5 Å, 3.5 Å
+        '#ecbbd8','#9d4e7d','r' # [4..6] true-positive marker edge at 8 Å, 5 Å, 3.5 Å
+    ]
+    tps = []
+    # --- 8 Å cutoff ---
+    bin_label = df_label<8
+    temp = bin_pred - bin_label
+    # pred - label == 1 means predicted contact without a true contact (false positive).
+    fn = ax.plot(*np.where(temp.T == 1), ".", c=colors[0], markersize=markersize,label='False Positive')[0]
+    # Plot positions whose label is NaN in gray.
+    oc = ax.plot(*np.where(df_label.T.isna()), ".", c='gray', markersize=markersize, label='Missing in PDB')[0]
+    confusing_matrix[bin_label == 1] = 1  # mark ground-truth contacts
+    oc = ax.plot(*np.where(bin_label.T == 1), ".", c=colors[1],markersize=markersize, label='Ground truth (8Å)')[0]
+    # label + pred == 2 means both are 1 (true positive).
+    temp = bin_label + bin_pred
+    tps.append(len(confusing_matrix[np.where(temp == 2)]))
+    confusing_matrix[np.where(temp == 2)] = 2  # mark true positives
+    tp = ax.plot(*np.where(temp.T == 2), "o", c=colors[4],markersize=markersize, label='True Positive (8Å)')[0]
+    tp.set_markerfacecolor(colors[1])
+    tp.set_markeredgecolor(colors[4])
+    # --- 5 Å cutoff ---
+    bin_label = df_label<5
+    temp = bin_label + bin_pred
+    tps.append(len(confusing_matrix[np.where(temp == 2)]))
+    oc = ax.plot(*np.where(bin_label.T == 1), ".", c=colors[2],markersize=markersize, label='Ground truth (5Å)')[0]
+    confusing_matrix[np.where(temp == 2)] = 2  # mark true positives
+    tp = ax.plot(*np.where(temp.T == 2), "o", c=colors[5],markersize=markersize, label='True Positive (5Å)')[0]
+    tp.set_markerfacecolor(colors[2])
+    tp.set_markeredgecolor(colors[5])
+    # --- 3.5 Å cutoff ---
+    bin_label = df_label<3.5
+    oc = ax.plot(*np.where(bin_label.T == 1), ".", c=colors[3],markersize=markersize, label='Ground truth (3.5Å)')[0]
+    temp = bin_label + bin_pred
+    tps.append(len(confusing_matrix[np.where(temp == 2)]))
+    confusing_matrix[np.where(temp == 2)] = 2  # mark true positives
+    tp = ax.plot(*np.where(temp.T == 2), "o", c=colors[6],markersize=markersize, label='True Positive (3.5Å)')[0]
+    tp.set_markerfacecolor(colors[3])
+    tp.set_markeredgecolor(colors[6])
+    # Legend and plt.show() are left to the caller.
+    # Prints the true-positive count for the last (3.5 Å) cutoff.
+    print(len(confusing_matrix[np.where(temp == 2)]))
+    # tps was filled in 8/5/3.5 order; reverse so the string reads 3.5/5/8.
+    return '/'.join([str(e) for e in tps[::-1]]),confusing_matrix
+def seed_everything(seed=2022):
+    print('seed_everything to ',seed)
+    random.seed(seed)
+    os.environ['PYTHONHASHSEED'] = str(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed) # 程序每次运行结果一致，但是程序中多次生成随机数每次不一致 # https://blog.csdn.net/qq_42951560/article/details/112174334
+    torch.cuda.manual_seed(seed)
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False # minbatch的长度一直在变化，这个优化比较浪费时间
+def getParam():
+    parser = ArgumentParser()
+    # data
+    parser.add_argument('--rootdir', default='',
+                        type=str)
+    parser.add_argument('--fasta', default='./example/inputs/8DMB_W.8DMB_P.fasta',
+                        type=str)
+    parser.add_argument('--out', default='./example/outputs/',
+                        type=str)
+    parser.add_argument('--ffeat', default='./example/inputs/{pdbid}.pickle',
+                        type=str)
+    parser.add_argument('--fmodel', default='./weight/model_roc_0_38=0.845.pt',
+                        type=str)
+    parser.add_argument('--device', default='cpu',
+                        type=str)
+    parser.add_argument('--flabel', default='./example/inputs/{pdbid}.pickle',
+                        type=str)
+    parser.add_argument('--draw', default=True,
+                        type=bool)
+    args = parser.parse_args()
+    return args
+if __name__ == '__main__':
+    args = getParam()
+    rootdir = args.rootdir
+    fasta = args.fasta
+    ffeat = args.ffeat
+    fmodel = args.fmodel
+    device = args.device
+    flabel = args.flabel
+    draw = args.draw
+    out = args.out
+    check_path(out)
+    # pdbid = fasta.rsplit('/',1)[0].split('.')[0]
+    seed_everything(seed=2022)
+    models = [(model_path,torch.load(model_path, map_location=torch.device(device))) for model_path in glob.glob(fmodel)]
+    print('loading existed model', fmodel)
+    with torch.no_grad():
+        for pdbid,seq in [(record.id,record.seq) for record in SeqIO.parse(fasta,'fasta')]:
+            rnaid,proid= pdbid.split('.')
+            rnaseq,proseq= seq.split('.')
+            with open(ffeat.format_map({'pdbid':rnaid}),'rb') as f:
+                rna_emb = pickle.load(f)
+            with open(ffeat.format_map({'pdbid':proid}),'rb') as f:
+                pro_emb = pickle.load(f)
+            rna_oh = one_hot_encode(rnaseq, alpha='ACGU')
+            pro_oh = one_hot_encode(proseq, alpha='GAVLIFWYDNEKQMSTCPHR')
+            # mask = np.ones((emb.shape[0],1)) # mask missing nt when evaluate the model
+            x_train = np.concatenate([rna_oh,rna_emb],axis=1)
+            x_train = np.expand_dims(x_train,0)
+            x_train = torch.from_numpy(x_train).transpose(-1,-2)
+            x_train = x_train.to(device, dtype=torch.float)
+            x_rna = x_train
+            x_train = np.concatenate([pro_oh, pro_emb], axis=1)
+            x_train = np.expand_dims(x_train, 0)
+            x_train = torch.from_numpy(x_train).transpose(-1, -2)
+            x_train = x_train.to(device, dtype=torch.float)
+            x_pro = x_train
+            print('input data shape for rna and protein:',x_rna.shape,x_pro.shape)
+            x_rna = x_rna.to(device, dtype=torch.float32)
+            x_pro = x_pro.to(device, dtype=torch.float32)
+            plt.figure(figsize=(20, 15))
+            for i,(model_path,model) in enumerate(models):
+                model.eval()
+                outputs = model(x_pro, x_rna)  # [1, 299, 74, 1]
+                # print('outputs,',outputs.device)
+                outputs = torch.squeeze(outputs, -1)
+                outputs = outputs.permute(0, 2, 1)
+                df_pred = outputs[0].cpu().detach().numpy()
+                # seq = data._seq[pdbid] if pdbid in data._seq else None
+                des = f'predict by {__file__}\n#{model_path}'
+                doSavePredict(pdbid, {'rna':rnaseq,'protein':proseq}, df_pred,
+                                   out,
+                                   des
+                                   )
+                top = sum(df_pred.shape)
+                df_pred = pd.DataFrame(df_pred)
+                threshold = df_pred.stack().nlargest(top).iloc[-1]
+                if draw:
+                    with open(flabel.format_map({'pdbid': pdbid}), 'rb') as f:
+                        df_label = pickle.load(f)
+                    df_label = df_label.squeeze()
+                    bin_pred = get_bin_pred(df_pred, threshold=threshold)
+                    view_evaluate_contact_prob(df_label, bin_pred, ax=None)
+                    plt.title(f'Predicted contact map of {pdbid}\nPredidcted by RPcontact, top L=r+p')
+                    plt.xlabel(proid)
+                    plt.ylabel(rnaid)
+                    handles, labels = plt.gca().get_legend_handles_labels()
+                    plt.legend(handles, labels, bbox_to_anchor=(1.05, 1), loc='upper left', ncol=1, borderaxespad=1,
+                               frameon=False)
+                    # 设置坐标轴的相同缩放
+                    ax = plt.gca()
+                    ax.set_aspect('equal')
+                    plt.tight_layout()
+                    plt.savefig(f'{out}/{pdbid}_{i}_evaluate.png',dpi=900)
+                    plt.show()
+                print(f'predict {pdbid} with {len(seq)} nts')

example/inputs/8DMB_W.8DMB_P.fasta ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ >8DMB_W.8DMB_P
2	+ GGGCCUUAUUAAAUGACUUCUCGUCAACCACCCCUGACUGAAGUCAGAGGCUUGCUUCUGGCCUGAGUUGGGGGCCCGGUUUGGCGGGGCCGGGGGCAACUGGCUGACCAGGCGGCCCGGUUCGCCGGGCAGGGGUCCGCGGGGCUACCAAGGACUUCCGGGUGUUUCGCCAGCCCGGACUAUCUCCGGCAGAACCGCUCAAUGCCGCGGCCGGCCAAGACCGGCCUAAGCCCUGCGGACAGCGCCGAGGCGACAAUCACUCCGAAAGGAGGCCGUGUAUCGGC.MGSSHHHHHHSSGLVPRGSHMASWSHPQFEKGGGSGGGSGGSAWSHPQFEKMSDSEVNQEAKPEVKPEVKPETHINLKVSDGSSEIFFKIKKTTPLRRLMEAFAKRQGKEMDSLRFLYDGIRIQADQTPEDLDMEDNDIIEAHREQIGGSMSTSITRVPVVGVDGRPLMPTTPRKARLLIRDGLAVPRRNKLGLFYIQMLRPVGTRTQPVALAVDPGAKYDGVAVASHRRVELRAMVFLPDDVPRKMETRRNLRRARRYRKTPRRPARFDNRRRKGYWLAPTQRFKVEARLKVVRELCRIYPVQLIVTEDVRFNHARDRNGKYFSTVEIGKTLTYREYRKLAELRLVEVSETDAWRERFGLEKRTERKCEQVPETHANDAAAMLMGVTGCAHNPAAPFFVWRRLRYARRSLFRQNPQKDGVRPRFGGTANGGFFRKGDWVEAEKAGKVYRGWVCGLPTETTKLVGVADADGKRIGQFSPKKVRLLARSTGFSWKEVAAHSSPEVSKGEELFTGVVPILVELDGDVNGHKFSVRGEGEGDATNGKLTLKFICTTGKLPVPWPTLVTTLTYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTISFKDDGTYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNFNSHNVYITADKQKNGIKANFKIRHNVEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSKLSKDPNEKRDHMVLLEFVTAAGITLGMDELYK

example/inputs/readme.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+sequence of RNA and protein:		8DMB_W.8DMB_P.fasta
+rna embedding from ERNIE-RNA: 		8DMB_W.pickle
+protein embedding from esm2: 		8DMB_P.pickle
+Label is needed in the evaluate mode:	8DMB_W.8DMB_P.pickle

example/outputs/8DMB_W.8DMB_P.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

example/outputs/8DMB_W.8DMB_P_0_binary.png ADDED Viewed

example/outputs/8DMB_W.8DMB_P_0_evaluate.png ADDED Viewed

Git LFS Details

SHA256: 5bce3945fc152fd81839e3225dddd0aff0f2f4a794affc96a4e66c2fa36ff194
Pointer size: 132 Bytes
Size of remote file: 1.97 MB

example/outputs/8DMB_W.8DMB_P_0_prob.png ADDED Viewed

Git LFS Details

SHA256: 554f9f4913f0f951289d573f8c83e0522786928125949e6146c4514054848bb5
Pointer size: 131 Bytes
Size of remote file: 819 kB

example/outputs/8DMB_W.8DMB_P_topL.txt ADDED Viewed

	@@ -0,0 +1,1026 @@

+rna	protein	pred
+U35	R17	1.00000
+A37	R17	0.99522
+A8	R17	0.98054
+U7	R17	0.97008
+U39	R17	0.95614
+C34	R17	0.95496
+C33	R17	0.94333
+U9	R17	0.94329
+A27	R17	0.94328
+G36	R17	0.94095
+U6	R17	0.94029
+G1	R17	0.93609
+C38	R17	0.93235
+A26	R17	0.91772
+C32	R17	0.91758
+A30	R17	0.91663
+G2	R17	0.90895
+U10	R17	0.90053
+C28	R17	0.89455
+C31	R17	0.89061
+A41	R17	0.88997
+U44	R17	0.88463
+C29	R17	0.88146
+G3	R17	0.87936
+A42	R17	0.87429
+A16	R17	0.87417
+C5	R17	0.87406
+A11	R17	0.87328
+U14	R17	0.87179
+G40	R17	0.87162
+C4	R17	0.87014
+A13	R17	0.86685
+A12	R17	0.86145
+U18	R17	0.85383
+U19	R17	0.84186
+A46	R17	0.84029
+C45	R17	0.83834
+G15	R17	0.83245
+G43	R17	0.83169
+C17	R17	0.82836
+C25	R17	0.82813
+U24	R17	0.81132
+U82	R17	0.80720
+U21	R17	0.79809
+G72	R17	0.79460
+U35	R299	0.79412
+C20	R17	0.79060
+U81	R17	0.79039
+U35	H5	0.78897
+G71	R17	0.78883
+A37	R299	0.78653
+G73	R17	0.78649
+U35	K31	0.78543
+A37	H5	0.78424
+A8	R299	0.78149
+A37	K31	0.77866
+G83	R17	0.77861
+U35	R295	0.77697
+A8	K31	0.77684
+U64	R17	0.77534
+G74	R17	0.77356
+U35	H6	0.77221
+U7	R299	0.77158
+U80	R17	0.77144
+U69	R17	0.77060
+A8	H5	0.76959
+G70	R17	0.76760
+A37	R295	0.76733
+A37	H6	0.76665
+C75	R17	0.76648
+A66	R17	0.76631
+U7	K31	0.76610
+U121	R17	0.76137
+G84	R17	0.76003
+A8	R295	0.75984
+U35	R274	0.75935
+U7	H5	0.75871
+C90	R17	0.75856
+C76	R17	0.75854
+U35	R424	0.75682
+U35	R422	0.75666
+C284	R17	0.75530
+C34	R299	0.75520
+C22	R17	0.75505
+A37	R274	0.75487
+C91	R17	0.75449
+C34	H5	0.75313
+U35	H10	0.75301
+U9	R299	0.75242
+U68	R17	0.75214
+G23	R17	0.75203
+C77	R17	0.75059
+A37	R424	0.75047
+G47	R17	0.75025
+U35	H7	0.75020
+U7	R295	0.74986
+A8	H6	0.74975
+U39	H5	0.74969
+A37	H10	0.74899
+U9	K31	0.74829
+C34	K31	0.74809
+A37	R422	0.74766
+U35	H9	0.74712
+G89	R17	0.74704
+G36	R299	0.74701
+U122	R17	0.74670
+U6	R299	0.74564
+A37	H7	0.74483
+U35	R268	0.74468
+A27	R299	0.74466
+U39	R299	0.74453
+G65	R17	0.74445
+A27	H5	0.74397
+U276	R17	0.74369
+A37	H9	0.74357
+C33	R299	0.74245
+G36	H5	0.74204
+C33	H5	0.74197
+A259	R17	0.74120
+U6	K31	0.74112
+U35	H8	0.74110
+U39	K31	0.74108
+G92	R17	0.74087
+U9	H5	0.74076
+G120	R17	0.74047
+A8	R424	0.74030
+A266	R17	0.74023
+G78	R17	0.73972
+G36	K31	0.73956
+C85	R17	0.73934
+U7	H6	0.73909
+A37	R268	0.73878
+A8	R274	0.73852
+C34	R295	0.73834
+A48	R17	0.73822
+G79	R17	0.73819
+A8	R422	0.73817
+A27	K31	0.73719
+A37	H8	0.73675
+A267	R17	0.73649
+C34	H6	0.73613
+U35	R413	0.73563
+U35	R157	0.73516
+G1	K31	0.73493
+A8	H10	0.73452
+C33	K31	0.73437
+U35	R273	0.73416
+U9	R295	0.73407
+U35	R174	0.73368
+C38	H5	0.73352
+U6	H5	0.73301
+U35	R718	0.73299
+U39	H6	0.73272
+U257	R17	0.73268
+G1	R299	0.73203
+U278	R17	0.73174
+C63	R17	0.73152
+U35	R181	0.73144
+U7	R424	0.73140
+A37	R413	0.73104
+C38	R299	0.73102
+G1	H5	0.73062
+G119	R17	0.73057
+U35	R166	0.73039
+U53	R17	0.73035
+A248	R17	0.73015
+G36	R295	0.72986
+G88	R17	0.72984
+A265	R17	0.72977
+U7	R274	0.72945
+A37	R273	0.72934
+U39	R295	0.72919
+G93	R17	0.72905
+U6	R295	0.72784
+U7	R422	0.72771
+A27	R295	0.72754
+A37	R157	0.72727
+A37	R174	0.72716
+U165	R17	0.72699
+U7	H10	0.72676
+C38	K31	0.72650
+A27	H6	0.72631
+A8	H9	0.72628
+G36	H6	0.72625
+A8	H7	0.72621
+G127	R17	0.72597
+A37	R181	0.72553
+A26	R299	0.72531
+U35	R435	0.72526
+A37	R718	0.72524
+C33	R295	0.72510
+C33	H6	0.72443
+G67	R17	0.72392
+U56	R17	0.72388
+A37	R166	0.72374
+U52	R17	0.72372
+U9	H6	0.72342
+C258	R17	0.72304
+A26	H5	0.72242
+A8	R413	0.72192
+G128	R17	0.72185
+G283	R17	0.72176
+C34	R274	0.72168
+A256	R17	0.72132
+A30	R299	0.72115
+A8	R268	0.72099
+G86	R17	0.72079
+C32	R299	0.72024
+G94	R17	0.72003
+C32	H5	0.71998
+C34	H10	0.71962
+A30	H5	0.71913
+U57	R17	0.71885
+C34	R424	0.71867
+U7	H9	0.71865
+U39	R274	0.71854
+G87	R17	0.71831
+C126	R17	0.71809
+A8	H8	0.71788
+A279	R17	0.71784
+U10	R299	0.71742
+C118	R17	0.71739
+G277	R17	0.71732
+A26	K31	0.71674
+C34	R422	0.71666
+U7	H7	0.71660
+U39	H10	0.71650
+C38	H6	0.71631
+G275	R17	0.71624
+U6	H6	0.71602
+C274	R17	0.71602
+A37	R435	0.71587
+A8	R157	0.71582
+G1	R295	0.71576
+A270	R17	0.71541
+A8	R174	0.71528
+U35	R450	0.71475
+G1	H6	0.71464
+C34	H7	0.71439
+U9	R422	0.71433
+U166	R17	0.71415
+U9	R424	0.71394
+G36	R274	0.71380
+U10	K31	0.71368
+U35	R284	0.71344
+A8	R718	0.71313
+C38	R295	0.71305
+C34	H9	0.71296
+A8	R273	0.71281
+G2	K31	0.71278
+U7	R268	0.71255
+A30	K31	0.71245
+G95	R17	0.71245
+U7	R413	0.71243
+C273	R17	0.71210
+C32	K31	0.71152
+U39	H7	0.71128
+U39	R424	0.71127
+C123	R17	0.71125
+A8	R181	0.71124
+G36	R424	0.71123
+U39	H9	0.71091
+G36	R422	0.71068
+U9	R274	0.71061
+C33	H10	0.71047
+A8	R166	0.71034
+C33	R274	0.71021
+A37	R450	0.71017
+G2	R299	0.70961
+U6	R424	0.70957
+A30	R295	0.70955
+U7	H8	0.70951
+A27	R274	0.70919
+A255	R17	0.70913
+A27	H10	0.70895
+A27	R424	0.70860
+C34	R268	0.70840
+G2	H5	0.70803
+G36	H10	0.70787
+U35	R272	0.70781
+A154	R17	0.70779
+C62	R17	0.70761
+C55	R17	0.70729
+C33	R424	0.70694
+U10	H5	0.70665
+U6	R422	0.70664
+A26	R295	0.70652
+U7	R157	0.70640
+G54	R17	0.70638
+A8	R435	0.70636
+U7	R174	0.70636
+A98	R17	0.70630
+U6	R274	0.70621
+U39	R422	0.70617
+C125	R17	0.70606
+C34	H8	0.70602
+C28	H5	0.70588
+A30	H6	0.70581
+G36	H7	0.70579
+C28	R299	0.70566
+U35	R264	0.70560
+A37	R284	0.70521
+U35	R290	0.70492
+U9	H10	0.70487
+C117	R17	0.70451
+A26	H6	0.70419
+A27	H7	0.70414
+U7	R273	0.70408
+U156	R17	0.70408
+C260	R17	0.70401
+U6	H10	0.70393
+C32	R295	0.70392
+U39	R268	0.70387
+U157	R17	0.70382
+U35	R177	0.70363
+U39	H8	0.70363
+C33	H9	0.70360
+U59	R17	0.70351
+A27	R422	0.70344
+C32	H6	0.70343
+A99	R17	0.70337
+U261	R17	0.70331
+G96	R17	0.70329
+U7	R718	0.70318
+G129	R17	0.70318
+G36	H9	0.70296
+C33	H7	0.70292
+G272	R17	0.70285
+C38	R274	0.70272
+A27	H9	0.70270
+A110	R17	0.70270
+U35	R265	0.70257
+C33	R422	0.70249
+U10	R295	0.70240
+U7	R166	0.70156
+U9	H7	0.70150
+U7	R181	0.70127
+G124	R17	0.70124
+A37	R272	0.70119
+G1	H10	0.70118
+C38	H10	0.70072
+C28	K31	0.70046
+U280	R17	0.70043
+G247	R17	0.69989
+G268	R17	0.69960
+C31	R299	0.69928
+C34	R413	0.69913
+U101	R17	0.69905
+G36	R268	0.69892
+U9	H9	0.69863
+A37	R264	0.69861
+C34	R174	0.69840
+C31	H5	0.69836
+A8	R450	0.69811
+U35	R533	0.69801
+C34	R157	0.69801
+G1	R274	0.69794
+U6	H9	0.69771
+C33	R268	0.69748
+G1	R424	0.69745
+G36	H8	0.69745
+A37	R290	0.69737
+U39	R413	0.69721
+U7	R435	0.69717
+U203	R17	0.69700
+G2	R295	0.69683
+U35	R599	0.69674
+C38	R424	0.69672
+A16	R299	0.69634
+C34	R273	0.69631
+A37	R265	0.69628
+C116	R17	0.69622
+A41	H5	0.69614
+U6	H7	0.69608
+A27	H8	0.69574
+A253	R17	0.69571
+C34	R181	0.69564
+A27	R268	0.69559
+U163	R17	0.69558
+C33	H8	0.69556
+U9	R413	0.69533
+C38	H7	0.69524
+G1	H9	0.69524
+C38	H9	0.69513
+G271	R17	0.69472
+C29	R299	0.69470
+C97	R17	0.69462
+A37	R177	0.69460
+G164	R17	0.69453
+G1	H7	0.69448
+A16	K31	0.69425
+C29	H5	0.69401
+U35	R576	0.69376
+G2	H6	0.69373
+C34	R718	0.69339
+U9	R268	0.69330
+A11	R299	0.69328
+G249	R17	0.69323
+G282	R17	0.69297
+U44	R299	0.69288
+C34	R166	0.69284
+A37	R533	0.69265
+C28	R295	0.69257
+U9	H8	0.69256
+G1	R422	0.69252
+A41	K31	0.69252
+C38	R422	0.69207
+A41	R299	0.69201
+U10	H6	0.69183
+U39	R174	0.69180
+U44	H5	0.69169
+U39	R273	0.69149
+C28	H6	0.69129
+A30	R424	0.69123
+G36	R413	0.69115
+A27	R413	0.69098
+C31	K31	0.69096
+A16	H5	0.69090
+C58	R17	0.69089
+G269	R17	0.69089
+A202	R17	0.69080
+G210	R17	0.69071
+U9	R157	0.69063
+U39	R181	0.69056
+G3	K31	0.69052
+U39	R157	0.69052
+A37	R599	0.69052
+C32	H10	0.69043
+G36	R273	0.69041
+A30	R274	0.69039
+C51	R17	0.69028
+A37	R576	0.69024
+C5	R299	0.69021
+G36	R157	0.69014
+U6	R268	0.69006
+U6	R413	0.69005
+A8	R284	0.68991
+A27	R174	0.68987
+A26	R424	0.68980
+U6	H8	0.68978
+A11	K31	0.68977
+U7	R450	0.68949
+C5	K31	0.68938
+C32	R274	0.68936
+U44	K31	0.68920
+A30	R422	0.68918
+U167	R17	0.68915
+A26	R274	0.68915
+A26	H10	0.68915
+C29	K31	0.68896
+G36	R718	0.68890
+G115	R17	0.68861
+U9	R718	0.68859
+U9	R174	0.68858
+C211	R17	0.68839
+G36	R174	0.68823
+C33	R174	0.68819
+A30	H10	0.68810
+G3	R299	0.68804
+C33	R413	0.68792
+C38	H8	0.68789
+A30	H7	0.68781
+G1	H8	0.68766
+C38	R268	0.68764
+A27	R157	0.68756
+A27	R181	0.68730
+U14	R299	0.68723
+C32	R424	0.68715
+C34	R435	0.68684
+G36	R181	0.68663
+U9	R273	0.68643
+A8	R290	0.68614
+G61	R17	0.68601
+G36	R166	0.68600
+U6	R157	0.68559
+C31	R295	0.68552
+C4	K31	0.68542
+U14	K31	0.68540
+A30	H9	0.68539
+U9	R181	0.68539
+C33	R157	0.68533
+U9	R166	0.68525
+C246	R17	0.68509
+U14	H5	0.68505
+C100	R17	0.68500
+A8	R272	0.68497
+A41	R295	0.68487
+C32	H9	0.68474
+G209	R17	0.68466
+U10	R422	0.68462
+U39	R166	0.68459
+A11	H5	0.68454
+C33	R181	0.68449
+U6	R174	0.68435
+A16	R295	0.68432
+C33	R273	0.68407
+G3	H5	0.68397
+C4	R299	0.68391
+C31	H6	0.68389
+A8	R177	0.68387
+G264	R17	0.68386
+C29	R295	0.68381
+G49	R17	0.68377
+G114	R17	0.68368
+A42	H5	0.68356
+A41	H6	0.68353
+U6	R718	0.68344
+C155	R17	0.68343
+A27	R273	0.68340
+C32	H7	0.68340
+U10	R424	0.68327
+A26	R422	0.68317
+A131	R17	0.68312
+A13	R299	0.68294
+U39	R718	0.68293
+A26	H9	0.68273
+G1	R413	0.68258
+U9	R435	0.68258
+A26	H7	0.68249
+C109	R17	0.68244
+G111	R17	0.68244
+U18	R299	0.68243
+G40	H5	0.68211
+A42	R299	0.68204
+U6	R273	0.68203
+A27	R166	0.68196
+A42	K31	0.68186
+C32	R422	0.68184
+G1	R268	0.68184
+C113	R17	0.68161
+C281	R17	0.68156
+C130	R17	0.68150
+C29	H6	0.68121
+A151	R17	0.68109
+U6	R166	0.68093
+C33	R166	0.68088
+G36	R435	0.68084
+A12	R299	0.68083
+C5	H5	0.68079
+C38	R413	0.68073
+U7	R284	0.68070
+A30	H8	0.68064
+A8	R599	0.68041
+A13	K31	0.68027
+C254	R17	0.68000
+G250	R17	0.67975
+A11	R295	0.67963
+A13	H5	0.67963
+C33	R718	0.67962
+G112	R17	0.67936
+G2	H10	0.67927
+A147	R17	0.67891
+C34	R450	0.67874
+G60	R17	0.67872
+C158	R17	0.67852
+U44	R295	0.67847
+U6	R181	0.67847
+G3	R295	0.67834
+A27	R718	0.67816
+A8	R264	0.67801
+U10	R274	0.67791
+C5	R295	0.67782
+G50	R17	0.67772
+C212	R17	0.67760
+A12	K31	0.67741
+C38	R273	0.67737
+C4	H5	0.67735
+G1	R157	0.67714
+G2	R424	0.67700
+C32	H8	0.67696
+U6	R435	0.67696
+G2	R274	0.67693
+A30	R268	0.67690
+G40	R299	0.67683
+C32	R268	0.67674
+G40	K31	0.67667
+A16	H6	0.67658
+U7	R290	0.67644
+C38	R174	0.67631
+U7	R272	0.67612
+A8	R576	0.67598
+G153	R17	0.67578
+A26	R268	0.67576
+A27	R435	0.67568
+A241	R17	0.67566
+A8	R533	0.67551
+U35	H26	0.67546
+C38	R157	0.67523
+U39	R435	0.67522
+C38	R181	0.67516
+U44	H6	0.67511
+A26	H8	0.67503
+U18	K31	0.67495
+C262	R17	0.67485
+A42	R295	0.67477
+G1	R174	0.67476
+C263	R17	0.67471
+A12	H5	0.67470
+G2	H7	0.67467
+U7	R177	0.67463
+C4	R295	0.67445
+C34	R284	0.67445
+U39	R450	0.67443
+A8	R265	0.67413
+G2	H9	0.67411
+G2	R422	0.67399
+A150	R17	0.67399
+G1	R181	0.67342
+G204	R17	0.67340
+C251	R17	0.67340
+U18	H5	0.67339
+U14	R295	0.67328
+C208	R17	0.67320
+C33	R435	0.67320
+U35	R144	0.67314
+C28	R274	0.67247
+C28	R424	0.67245
+C28	H10	0.67242
+U10	H7	0.67222
+G1	R273	0.67215
+U35	R408	0.67201
+U7	R599	0.67188
+U10	H10	0.67184
+G3	H6	0.67159
+G36	R450	0.67154
+C28	H7	0.67151
+A30	R413	0.67145
+U9	R450	0.67142
+A42	H6	0.67117
+A26	R413	0.67110
+A30	R174	0.67108
+C38	R166	0.67077
+A26	R174	0.67071
+A11	H6	0.67061
+A172	R17	0.67031
+G36	R284	0.67024
+C28	R422	0.67012
+U19	R299	0.67007
+A30	R157	0.67004
+C34	R272	0.66993
+C159	R17	0.66975
+C31	H10	0.66944
+C38	R718	0.66940
+C31	R274	0.66912
+U14	H6	0.66911
+A37	H26	0.66909
+G1	R166	0.66893
+A239	R17	0.66881
+C32	R174	0.66873
+C31	R424	0.66870
+A13	R295	0.66843
+C34	R264	0.66840
+G40	H6	0.66825
+G1	R718	0.66813
+C32	R413	0.66808
+C28	H9	0.66805
+G252	R17	0.66799
+U7	R264	0.66794
+U6	R450	0.66794
+U10	H9	0.66791
+G102	R17	0.66783
+A41	R274	0.66766
+G2	H8	0.66764
+C108	R17	0.66753
+U9	R284	0.66742
+C33	R450	0.66738
+C5	H6	0.66737
+A26	R157	0.66737
+A26	R181	0.66724
+U35	R260	0.66722
+C34	R177	0.66718
+U7	R576	0.66711
+G152	R17	0.66688
+A12	R295	0.66683
+A30	R718	0.66682
+A37	R144	0.66674
+A30	R181	0.66670
+G40	R295	0.66669
+A30	R273	0.66622
+C34	R290	0.66614
+A201	R17	0.66610
+U18	R295	0.66599
+A41	H10	0.66599
+C205	R17	0.66598
+A30	R166	0.66594
+U39	R284	0.66575
+A27	R450	0.66575
+C31	H7	0.66568
+A41	H7	0.66565
+U7	R533	0.66562
+C31	H9	0.66559
+C4	H6	0.66539
+C34	R265	0.66529
+G36	R272	0.66527
+U146	R17	0.66521
+C32	R157	0.66502
+U7	R265	0.66495
+G213	R17	0.66471
+C31	R422	0.66471
+A16	R424	0.66458
+U9	R290	0.66427
+C32	R181	0.66427
+A26	R273	0.66408
+U10	R413	0.66408
+U44	R274	0.66407
+C29	R424	0.66405
+A13	H6	0.66398
+A37	R408	0.66393
+U136	R17	0.66392
+C32	R273	0.66378
+A26	R166	0.66378
+U10	H8	0.66363
+G36	R290	0.66362
+A41	R424	0.66359
+U39	R264	0.66337
+A16	R422	0.66336
+C29	H7	0.66321
+C29	R274	0.66318
+C28	H8	0.66312
+U44	H10	0.66311
+C29	R422	0.66298
+A41	H9	0.66277
+G36	R264	0.66277
+C34	R533	0.66259
+A11	R424	0.66254
+G2	R413	0.66253
+A11	R422	0.66248
+U39	R272	0.66246
+C245	R17	0.66218
+C29	H10	0.66217
+C17	R299	0.66211
+U6	R284	0.66208
+C38	R435	0.66200
+U19	H5	0.66194
+G1	R435	0.66178
+A41	R422	0.66169
+C32	R166	0.66146
+G2	R268	0.66144
+A107	R17	0.66142
+U10	R268	0.66129
+C34	R599	0.66113
+U39	R265	0.66110
+U19	K31	0.66089
+U9	R272	0.66067
+U10	R157	0.66053
+G36	R265	0.66031
+C33	R284	0.66028
+U9	R177	0.66024
+A8	H26	0.66023
+A37	R260	0.66022
+A12	H6	0.66019
+A30	R435	0.66009
+C38	R450	0.65997
+G36	R177	0.65988
+U10	R718	0.65977
+G207	R17	0.65968
+C32	R718	0.65964
+C206	R17	0.65939
+C148	R17	0.65939
+C31	H8	0.65939
+C29	H9	0.65931
+C28	R268	0.65927
+G1	R450	0.65916
+A26	R718	0.65907
+G15	R299	0.65903
+C17	K31	0.65898
+C5	R424	0.65887
+C34	R576	0.65852
+U39	R290	0.65848
+U39	R533	0.65839
+G15	K31	0.65838
+A46	R299	0.65825
+A41	H8	0.65818
+A27	R264	0.65806
+U18	H6	0.65796
+C5	R422	0.65788
+U10	R174	0.65772
+G2	R157	0.65769
+U44	R424	0.65765
+G3	R424	0.65760
+A16	H7	0.65755
+A16	H10	0.65754
+A27	R284	0.65749
+C174	R17	0.65749
+U35	R409	0.65746
+U44	H9	0.65707
+G160	R17	0.65706
+A27	R177	0.65697
+C175	R17	0.65696
+A27	R290	0.65689
+G3	R422	0.65686
+U6	R290	0.65676
+A26	R435	0.65669
+C45	R299	0.65668
+G162	R17	0.65665
+U9	R599	0.65652
+A42	R274	0.65652
+C33	R272	0.65644
+U39	R177	0.65639
+U14	R424	0.65636
+C31	R268	0.65632
+U39	R576	0.65632
+C45	H5	0.65622
+A27	R272	0.65619
+G3	H10	0.65619
+G214	R17	0.65616
+U10	R166	0.65613
+G15	H5	0.65608
+U9	R264	0.65608
+A11	R274	0.65595
+A16	R274	0.65588
+A46	H5	0.65585
+G3	R274	0.65566
+U44	H7	0.65552
+C240	R17	0.65546
+U6	R272	0.65538
+C33	R264	0.65535
+C29	H8	0.65533
+A8	R408	0.65530
+C28	R413	0.65528
+C17	H5	0.65528
+U10	R273	0.65519
+A41	R268	0.65517
+C25	R299	0.65515
+U35	R257	0.65510
+G2	R174	0.65507
+C149	R17	0.65506
+U35	R671	0.65501
+G36	R533	0.65498
+A42	H10	0.65485
+C5	R274	0.65471
+U10	R181	0.65456
+C33	R177	0.65449
+G36	R599	0.65442
+A27	R265	0.65438
+C4	R422	0.65432
+C4	R424	0.65426
+G3	H7	0.65421
+U6	R177	0.65419
+A42	H7	0.65410
+U39	R599	0.65404
+C5	H10	0.65399
+C45	K31	0.65391
+C28	R174	0.65389
+A46	K31	0.65387
+U35	R405	0.65386
+A16	H9	0.65380
+C28	R157	0.65370
+A217	R17	0.65351
+U10	R435	0.65346
+A42	R424	0.65343
+C32	R435	0.65326
+U14	H10	0.65315
+A13	R424	0.65313
+U44	R422	0.65311
+G173	R17	0.65295
+G2	R181	0.65277
+C33	R265	0.65272
+C176	R17	0.65265
+U9	R265	0.65256
+G40	R274	0.65249
+U6	R599	0.65240
+A11	H7	0.65238
+G3	H9	0.65230
+G36	R576	0.65211
+C38	R284	0.65193
+A42	H9	0.65187
+U19	R295	0.65175
+G40	H10	0.65173
+G2	R273	0.65171
+U7	H26	0.65167
+C28	R181	0.65162
+U35	R258	0.65156
+A42	R422	0.65156
+A11	H10	0.65154
+U44	R268	0.65151
+G161	R17	0.65149
+A12	R424	0.65138
+C33	R290	0.65125
+U14	R422	0.65119
+C33	R533	0.65083
+A41	R413	0.65081
+A27	R599	0.65072
+A27	R533	0.65067
+U9	R533	0.65060
+G43	R299	0.65053
+C4	R274	0.65052
+A37	R671	0.65044
+C4	H10	0.65034
+C25	H5	0.65029
+C5	H7	0.65024
+A8	R144	0.65020
+A27	R576	0.65012
+C33	R599	0.65007
+C215	R17	0.65000
+C29	R268	0.65000
+G43	H5	0.64999
+G2	R718	0.64996
+C5	H9	0.64987
+A37	R409	0.64987
+C31	R174	0.64975
+U14	R274	0.64973
+G1	R284	0.64959
+U9	R576	0.64950
+C38	R272	0.64949
+G40	H7	0.64948
+U14	H7	0.64942
+C31	R413	0.64942
+G132	R17	0.64941
+A16	H8	0.64938
+C168	R17	0.64936
+U44	H8	0.64931
+G43	K31	0.64920
+G2	R166	0.64906
+A37	R405	0.64895
+A30	R450	0.64893
+U18	R424	0.64892
+C17	R295	0.64892
+C4	H7	0.64882
+G103	R17	0.64877
+A11	H9	0.64876
+C28	R273	0.64859
+A218	R17	0.64858
+A12	R422	0.64858
+U14	H9	0.64854
+A13	H10	0.64838
+C38	R264	0.64828
+A220	R17	0.64820
+G135	R17	0.64810
+A13	R422	0.64809
+U35	R312	0.64803
+U6	R264	0.64798
+C25	K31	0.64791
+G134	R17	0.64790
+C137	R17	0.64778
+A41	R157	0.64758
+G40	H9	0.64754
+C32	R450	0.64751
+A30	R284	0.64749
+C33	R576	0.64731
+A42	H8	0.64723
+G3	H8	0.64722
+C31	R157	0.64721
+C28	R166	0.64721
+A26	R450	0.64713
+U18	R422	0.64710
+G133	R17	0.64706
+C4	H9	0.64697
+G1	R290	0.64695
+G15	R295	0.64691
+G40	R424	0.64685
+A13	R274	0.64667
+A37	R257	0.64648
+U44	R413	0.64647
+U6	R265	0.64646
+A16	R268	0.64613
+C38	R265	0.64600
+C29	R413	0.64588
+U6	R576	0.64563
+U7	R408	0.64554
+C38	R290	0.64553
+A16	R413	0.64546
+U19	H6	0.64539
+G242	R17	0.64531
+C31	R181	0.64518
+C29	R157	0.64517
+U35	R122	0.64506
+C28	R718	0.64506
+A13	H7	0.64495
+C31	R273	0.64495
+C29	R174	0.64478
+A8	R260	0.64476
+A11	H8	0.64469
+A12	R274	0.64462
+A41	R174	0.64454
+A42	R268	0.64451
+A13	H9	0.64444
+G1	R272	0.64435
+U6	R533	0.64431
+G244	R17	0.64429
+U35	K555	0.64417
+C5	H8	0.64410
+A16	R157	0.64403
+C138	R17	0.64398
+A46	R295	0.64386
+C31	R166	0.64376
+G40	R422	0.64359
+G2	R435	0.64338
+A41	R273	0.64334
+C38	R533	0.64333
+A12	H10	0.64318
+A11	R413	0.64316
+C38	R177	0.64302
+C31	R718	0.64275
+C243	R17	0.64273
+C28	R435	0.64270
+A30	R264	0.64235
+G1	R264	0.64234
+U14	H8	0.64227
+A16	R174	0.64222
+C4	H8	0.64221
+U24	R299	0.64218
+A41	R181	0.64217
+G1	R576	0.64217
+A37	R258	0.64214
+A8	R671	0.64211
+G40	H8	0.64207
+A12	H7	0.64204
+G3	R413	0.64204
+C29	R181	0.64174
+A30	R265	0.64171
+A37	R312	0.64164
+G15	H6	0.64163
+U18	R274	0.64158
+C38	R576	0.64152
+C17	H6	0.64149
+C34	H26	0.64146
+C216	R17	0.64135
+U18	H10	0.64126
+G1	R177	0.64123
+C45	R295	0.64115
+G1	R599	0.64105
+U44	R157	0.64093
+U7	R144	0.64088
+G177	R17	0.64087

example/outputs/predict_scores.csv ADDED Viewed

	@@ -0,0 +1,20 @@

+pdbid	contact_score
+8DMB_W.8DMB_P	0.29469
+pdbid	contact_score
+8DMB_W.8DMB_P	0.29469
+pdbid	contact_score
+8DMB_W.8DMB_P	712.49723
+pdbid	contact_score
+8DMB_W.8DMB_P	712.49723
+pdbid	contact_score
+8DMB_W.8DMB_P	712.49723
+pdbid	contact_score
+8DMB_W.8DMB_P	712.49723
+pdbid	contact_score
+8DMB_W.8DMB_P	6.98915
+pdbid	contact_score
+8DMB_W.8DMB_P	6.98915
+pdbid	contact_score
+8DMB_W.8DMB_P	6.98915
+pdbid	contact_score
+8DMB_W.8DMB_P	712.49723

predict.py ADDED Viewed

	@@ -0,0 +1,318 @@

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Created by: [email protected]
+# des : predict RNA-protein contact maps with RPcontact
+import glob
+import pickle
+import random
+import re
+from argparse import ArgumentParser
+import matplotlib.pyplot as plt
+import torch
+from Bio import SeqIO
+from sklearn.preprocessing import OneHotEncoder
+import numpy as np
+import os
+import pandas as pd
+class bcolors:
+    RED   = "\033[1;31m"
+    BLUE  = "\033[1;34m"
+    CYAN  = "\033[1;36m"
+    GREEN = "\033[0;32m"
+    RESET = "\033[0;0m"
+    BOLD    = "\033[;1m"
+    REVERSE = "\033[;7m"
+def check_path(dirout,file=False):
+    if file:dirout = dirout.rsplit('/',1)[0]
+    try:
+        if not os.path.exists(dirout):
+            print('make dir '+dirout)
+            os.makedirs(dirout)
+    except:
+        print(f'{dirout} have been made by other process')
+def load_label_pred(fin_label,fin_pred):
+    with open(fin_label, 'rb') as f:
+        df_label = pickle.load(f)
+    df_label = df_label.squeeze()
+    df_pred = pd.read_table(fin_pred, comment='#', index_col=[0])
+    if type(df_label) == pd.DataFrame:
+        df_pred.index = df_label.index
+        df_pred.columns = df_label.columns
+        # 删除包含空值的行
+        df_label = df_label.dropna(how='all')
+        # 删除包含空值的列
+        df_label = df_label.dropna(axis=1, how='all')
+        df_pred = df_pred.loc[df_label.index, df_label.columns]
+    keep=0
+    if df_pred.columns[0].count('.')==2:
+        keep=-1
+    df_pred.columns = [e.split('.')[keep] + str(i+1) for i, e in enumerate(df_pred.columns)]
+    df_pred.index = [e.split('.')[keep] + str(i+1) for i, e in enumerate(df_pred.index)]
+    return df_label,df_pred
+def doSavePredict(_id,seq,predict,fout,des):
+    # seq = {'protein': 'KKGVGSTKNGRDSEAKRLGAKRADGQFVTGGSILYRQRGTKIYPGENVGRGGDDTLFAKIDGTVKFERFGRDRKKVSVYPV',
+    #  'rna': 'GGGGCCUUAGCUCAGGGGAGAGCGCCUGCUUUGCACGCAGGAGGCAGCGGUUCGAUCCCGCUAGGCUCCACCA'}
+    check_path(fout)
+    df = pd.DataFrame(predict)
+    if not seq:df.to_csv(fout+ f'{_id}.txt',sep='\t',mode='w',float_format='%.5f')
+    else:
+        df.columns = list(seq['protein'])
+        df.index = list(seq['rna'])
+        with open(fout+ f'{_id}.txt','w') as f:
+            f.write(f'#{des}\n')
+            f.write(f"# row =rna:{seq['rna']}\n")
+            f.write(f"# col=protein:{seq['protein']}\n")
+        # df.to_csv(fout+ f'{_id}.txt',sep='\t',mode='a',float_format='%.3f',index=None,header=None)
+        df.to_csv(fout+ f'{_id}.txt',sep='\t',mode='a',float_format='%.5f')
+        df.columns = [f'{elem}{index+1}' for index,elem in enumerate(seq['protein'])]
+        df.index = [f'{elem}{index+1}' for index,elem in enumerate(seq['rna'])]
+        df = get_top_l_triplets(df, sum(df.shape))
+        df.to_csv(fout+ f'{_id}_topL.txt',sep='\t',mode='w',float_format='%.5f',index=False)
+def get_top_l_triplets(df_pred, L):
+    """
+    从Pandas DataFrame矩阵中提取值最大的前L个三元组。
+    参数:
+    - matrix_df: Pandas DataFrame，表示接触矩阵。
+    - L: int，要提取的三元组的数量。
+    返回:
+    - top_l_triplets: 列表，包含前L个三元组，每个三元组格式为(row_index, col_index, value)。
+    """
+    df = df_pred.stack().reset_index()
+    df.columns = ['rna', 'protein', 'pred']
+    df = df.sort_values(by='pred', ascending=False).head(L)
+    return df
+def doSavePredict_single(_id,seq,predict_rsa,fout,des,pred_asa=None):
+    """Convert between per-residue ``predict_rsa`` and ``pred_asa`` values and save them as a table.
+    Parameters:
+    - _id: str, stem of the output file name.
+    - seq: iterable of residue characters.
+    - predict_rsa: array of per-residue values, used when ``pred_asa`` is None.
+      NOTE(review): presumably relative surface accessibility, 1-D with one value per
+      residue - confirm against the caller.
+    - fout: str, output directory prefix; when falsy nothing is written.
+    - des: str, description placed after '#' in the file header.
+    - pred_asa: optional array; when given, ``predict_rsa`` is recomputed from it.
+    Returns:
+    - (pred_asa, predict_rsa)
+    Terminates the process through ``exit`` when any ``pred_asa`` value equals 0.
+    """
+    check_path(fout)
+    BASES = 'AUCG'
+    # per-base scale factors paired with BASES (A=400, U=350, C=350, G=400)
+    asa_std = [400, 350, 350, 400]
+    dict_rnam1_ASA = dict(zip(BASES, asa_std))
+    sequence = re.sub(r"[T]", "U", ''.join(seq))
+    # NOTE(review): random.randint is evaluated once, so every non-AGCU character in
+    # this call is replaced by the same randomly chosen base.
+    sequence = re.sub(r"[^AGCU]", BASES[random.randint(0, 3)], sequence) # other characters are randomly substituted so a prediction can still be produced
+    ASA_scale = np.array([dict_rnam1_ASA[i] for i in sequence])
+    if pred_asa is None:
+        pred_asa = np.multiply(predict_rsa, ASA_scale).T
+    else:
+        predict_rsa = pred_asa/ASA_scale
+    # table columns: 1-based position, residue letter, pred_asa, predict_rsa
+    col1 = np.array([i + 1 for i, I in enumerate(seq)])[None, :]
+    col2 = np.array([I for i, I in enumerate(seq)])[None, :]
+    col3 = pred_asa
+    col4 = predict_rsa
+    # a value of exactly 0 is treated as a failed prediction and aborts the program
+    if len(col3[col3 == 0]):
+        exit(f'error in predict\t {_id},{seq}')
+    temp = np.vstack((np.char.mod('%d', col1), col2, np.char.mod('%.2f', col3), np.char.mod('%.3f', col4))).T
+    if fout:np.savetxt(fout + f'{_id}.txt', (temp), delimiter='\t\t', fmt="%s",
+               header=f'#{des}',
+               comments='')
+    return pred_asa,predict_rsa
+def one_hot_encode(sequences,alpha='ACGU'):
+    print(sequences)
+    sequences_arry = np.array(list(sequences)).reshape(-1, 1)
+    lable = np.array(list(alpha)).reshape(-1, 1)
+    enc = OneHotEncoder(handle_unknown='ignore')
+    enc.fit(lable)
+    seq_encode = enc.transform(sequences_arry).toarray()
+    # print(seq_encode.shape)
+    return (seq_encode)
+def get_bin_pred(df_pred,threshold):
+    bin_pred = df_pred.values >= threshold
+    bin_pred = bin_pred.astype(int)
+    return bin_pred
+def seed_everything(seed=2022):
+    print('seed_everything to ',seed)
+    random.seed(seed)
+    os.environ['PYTHONHASHSEED'] = str(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed) # 程序每次运行结果一致，但是程序中多次生成随机数每次不一致 # https://blog.csdn.net/qq_42951560/article/details/112174334
+    torch.cuda.manual_seed(seed)
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False # minbatch的长度一直在变化，这个优化比较浪费时间
+def contact_partner_constrained(prob_matrix, colmax=12, rowmax=24):
+    """Apply contact partner constraints to probability matrix"""
+    row_max_indices = np.argsort(-prob_matrix, axis=1)[:, :rowmax]
+    row_max_mask = np.zeros_like(prob_matrix)
+    row_max_mask[np.arange(prob_matrix.shape[0])[:, np.newaxis], row_max_indices] = 1
+    col_max_indices = np.argsort(-prob_matrix, axis=0)[:colmax, :]
+    col_max_mask = np.zeros_like(prob_matrix)
+    col_max_mask[col_max_indices, np.arange(prob_matrix.shape[1])] = 1
+    mask = np.logical_and(row_max_mask, col_max_mask).astype(np.float32)
+    prob_matrix = np.where(mask == 1, prob_matrix, 0)
+    return prob_matrix
+def getParam():
+    parser = ArgumentParser()
+    # data
+    parser.add_argument('--rootdir', default='',
+                        type=str)
+    parser.add_argument('--fasta', default='./example/inputs/8DMB_W.8DMB_P.fasta',
+                        type=str)
+    parser.add_argument('--out', default='./example/outputs/',
+                        type=str)
+    parser.add_argument('--ffeat', default='./example/inputs/{pdbid}.pickle',
+                        type=str)
+    parser.add_argument('--fmodel', default='./weight/model_roc_0_38=0.845.pt',
+                        type=str)
+    parser.add_argument('--device', default='cpu',
+                        type=str)
+    parser.add_argument('--draw',action='store_true',default=True)
+    parser.add_argument('--constrained',action='store_true',default=True)
+    args = parser.parse_args()
+    return args
+if __name__ == '__main__':
+    # Script entry point: load the model(s), then for every FASTA record predict the
+    # RNA-protein contact map, save it, and optionally draw it.
+    args = getParam()
+    rootdir = args.rootdir
+    fasta = args.fasta
+    ffeat = args.ffeat
+    fmodel = args.fmodel
+    device = args.device
+    out = args.out
+    draw = args.draw
+    check_path(out)
+    # pdbid = fasta.rsplit('/',1)[0].split('.')[0]
+    seed_everything(seed=2022)
+    # --fmodel is used as a glob pattern; every matching file is loaded with torch.load.
+    # NOTE(review): torch.load unpickles the file - only load trusted weight files.
+    models = [(model_path,torch.load(model_path, map_location=torch.device(device))) for model_path in glob.glob(fmodel)]
+    # models = [(model_path,torch.load(model_path, map_location=torch.device(device))) for model_path in glob.glob(fmodel)]
+    print('loading existed model', fmodel)
+    with torch.no_grad():
+        for pdbid,seq in [(record.id,record.seq) for record in SeqIO.parse(fasta,'fasta')]:
+            # record id and sequence must each contain exactly one '.', separating the
+            # RNA part from the protein part; otherwise the unpacking raises ValueError
+            rnaid,proid= pdbid.split('.')
+            rnaseq,proseq= seq.split('.')
+            # --ffeat is a path template; '{pdbid}' is replaced by the RNA / protein id
+            with open(ffeat.format_map({'pdbid':rnaid}),'rb') as f:
+                rna_emb = pickle.load(f)
+            with open(ffeat.format_map({'pdbid':proid}),'rb') as f:
+                pro_emb = pickle.load(f)
+            rna_oh = one_hot_encode(rnaseq, alpha='ACGU')
+            pro_oh = one_hot_encode(proseq, alpha='GAVLIFWYDNEKQMSTCPHR')
+            # mask = np.ones((emb.shape[0],1)) # mask missing nt when evaluate the model
+            # one-hot and embedding are concatenated along axis 1, a leading axis is added
+            # and the last two axes are swapped
+            # (assumes embeddings are (length, dim) - TODO confirm)
+            x_train = np.concatenate([rna_oh,rna_emb],axis=1)
+            x_train = np.expand_dims(x_train,0)
+            x_train = torch.from_numpy(x_train).transpose(-1,-2)
+            x_train = x_train.to(device, dtype=torch.float)
+            x_rna = x_train
+            x_train = np.concatenate([pro_oh, pro_emb], axis=1)
+            x_train = np.expand_dims(x_train, 0)
+            x_train = torch.from_numpy(x_train).transpose(-1, -2)
+            x_train = x_train.to(device, dtype=torch.float)
+            x_pro = x_train
+            print('input data shape for rna and protein:',x_rna.shape,x_pro.shape)
+            x_rna = x_rna.to(device, dtype=torch.float32)
+            x_pro = x_pro.to(device, dtype=torch.float32)
+            ###########
+            # scores are collected per record: the list is reset for every FASTA entry
+            predict_scores = []
+            #######
+            for i,(model_path,model) in enumerate(models):
+                model.eval()
+                outputs = model(x_pro, x_rna)  # [1, 299, 74, 1]
+                # print('outputs,',outputs.device)
+                outputs = torch.squeeze(outputs, -1)
+                outputs = outputs.permute(0, 2, 1)
+                df_pred = outputs[0].cpu().detach().numpy()
+                # Apply constraints and normalization
+                # NOTE(review): contact_matrix is computed here but not used afterwards in
+                # this block; the raw df_pred is what gets saved, scored and plotted. If
+                # args.constrained were False, the normalisation line would fail because
+                # contact_matrix would be unbound.
+                if args.constrained:contact_matrix = contact_partner_constrained(df_pred)
+                contact_matrix = (contact_matrix - contact_matrix.min()) / (
+                            contact_matrix.max() - contact_matrix.min() + 1e-8)
+                # seq = data._seq[pdbid] if pdbid in data._seq else None
+                des = f'predict by {__file__}\n#{model_path}'
+                # every model writes to the same <pdbid> file names, so with several
+                # models the last one overwrites the earlier results
+                doSavePredict(pdbid, {'rna':rnaseq,'protein':proseq}, df_pred,
+                                   out,
+                                   des
+                                   )
+                # contact score = sum of the (n_rna + n_protein) largest predictions
+                tmp = df_pred.flatten()
+                tmp.sort()
+                score = sum(tmp[::-1][:sum(df_pred.shape)])
+                predict_scores.append((pdbid, score))
+                print('pdbid',pdbid,score) # does this score correlate with the number of contacts in the label?
+                if draw:
+                    plt.figure(figsize=(20, 15))
+                    # binarise at the value of the top-th largest prediction (top = rows + columns)
+                    top = sum(df_pred.shape)
+                    df_pred = pd.DataFrame(df_pred)
+                    threshold = df_pred.stack().nlargest(top).iloc[-1]
+                    bin_pred = get_bin_pred(df_pred,threshold=threshold)
+                    import seaborn as sns
+                    # NOTE(review): bin_pred is passed as the heatmap mask; seaborn presumably
+                    # hides cells where the mask is true, i.e. the top predictions - confirm.
+                    sns.heatmap(df_pred,mask=bin_pred,cbar_kws={"shrink": 0.5},cmap='coolwarm',vmin=0,vmax=1)
+                    plt.title(f'Predicted contact map of {pdbid}\nPredidcted by RPcontact, top L=r+p')
+                    plt.xlabel(proid)
+                    plt.ylabel(rnaid)
+                    handles, labels = plt.gca().get_legend_handles_labels()
+                    plt.legend(handles, labels, bbox_to_anchor=(1.05, 1), loc='upper left', ncol=1, borderaxespad=1,
+                            frameon=False)
+                    # use the same scale on both axes
+                    ax = plt.gca()
+                    ax.set_aspect('equal')
+                    plt.tight_layout()
+                    plt.savefig(f'{out}/{pdbid}_{i}_prob.png',dpi=300)
+                    plt.show()
+                    # second figure: scatter plot of the binarised (top-L) contacts
+                    plt.clf()
+                    ax = plt.gca()
+                    tp = \
+                    ax.plot(*np.where(bin_pred.T==1), ".", c='r',markersize=1, label='Predicted contact')[
+                        0]
+                    tp.set_markerfacecolor('w')
+                    tp.set_markeredgecolor('r')
+                    h,w = bin_pred.shape
+                    plt.xlim([0,w])
+                    plt.ylim([0,h])
+                    plt.title(f'Predicted contact map of {pdbid}\nPredidcted by RPcontact, top L=r+p')
+                    plt.xlabel(proid)
+                    plt.ylabel(rnaid)
+                    handles, labels = plt.gca().get_legend_handles_labels()
+                    plt.legend(handles, labels, bbox_to_anchor=(1.05, 1), loc='upper left', ncol=1, borderaxespad=1,
+                            frameon=False)
+                    # use the same scale on both axes
+                    ax.set_aspect('equal')
+                    plt.tight_layout()
+                    plt.savefig(f'{out}/{pdbid}_{i}_binary.png',dpi=300)
+                    plt.show()
+                    # NOTE(review): len(seq) covers the whole record (RNA, '.' and protein),
+                    # not only the nucleotides
+                    print(f'predict {pdbid} with {len(seq)} nts')
+            # NOTE(review): appended with mode='a' and the default header, so the header
+            # row is written again on every append to predict_scores.csv
+            df = pd.DataFrame(predict_scores, columns=['pdbid', 'contact_score'])
+            df.to_csv(args.out + '/predict_scores.csv',index=False, sep='\t', mode='a', float_format='%.5f')

predict_batch.py ADDED Viewed

	@@ -0,0 +1,312 @@

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Created by: [email protected]
+# des : predict RNA-protein contact maps with RPcontact for a batch of pairs
+import glob
+import pickle
+import random
+import re
+from argparse import ArgumentParser
+import matplotlib.pyplot as plt
+import torch
+from Bio import SeqIO
+from sklearn.preprocessing import OneHotEncoder
+import numpy as np
+import os
+import pandas as pd
+class bcolors:
+    RED   = "\033[1;31m"
+    BLUE  = "\033[1;34m"
+    CYAN  = "\033[1;36m"
+    GREEN = "\033[0;32m"
+    RESET = "\033[0;0m"
+    BOLD    = "\033[;1m"
+    REVERSE = "\033[;7m"
+def check_path(dirout,file=False):
+    if file:dirout = dirout.rsplit('/',1)[0]
+    try:
+        if not os.path.exists(dirout):
+            print('make dir '+dirout)
+            os.makedirs(dirout)
+    except:
+        print(f'{dirout} have been made by other process')
+def load_label_pred(fin_label,fin_pred):
+    with open(fin_label, 'rb') as f:
+        df_label = pickle.load(f)
+    df_label = df_label.squeeze()
+    df_pred = pd.read_table(fin_pred, comment='#', index_col=[0])
+    if type(df_label) == pd.DataFrame:
+        df_pred.index = df_label.index
+        df_pred.columns = df_label.columns
+        # 删除包含空值的行
+        df_label = df_label.dropna(how='all')
+        # 删除包含空值的列
+        df_label = df_label.dropna(axis=1, how='all')
+        df_pred = df_pred.loc[df_label.index, df_label.columns]
+    keep=0
+    if df_pred.columns[0].count('.')==2:
+        keep=-1
+    df_pred.columns = [e.split('.')[keep] + str(i+1) for i, e in enumerate(df_pred.columns)]
+    df_pred.index = [e.split('.')[keep] + str(i+1) for i, e in enumerate(df_pred.index)]
+    return df_label,df_pred
+def doSavePredict(_id,seq,predict,fout,des):
+    # seq = {'protein': 'KKGVGSTKNGRDSEAKRLGAKRADGQFVTGGSILYRQRGTKIYPGENVGRGGDDTLFAKIDGTVKFERFGRDRKKVSVYPV',
+    #  'rna': 'GGGGCCUUAGCUCAGGGGAGAGCGCCUGCUUUGCACGCAGGAGGCAGCGGUUCGAUCCCGCUAGGCUCCACCA'}
+    check_path(fout)
+    df = pd.DataFrame(predict)
+    if not seq:df.to_csv(fout+ f'{_id}.txt',sep='\t',mode='w',float_format='%.5f')
+    else:
+        df.columns = [f'{elem}{index+1}' for index,elem in enumerate(seq['protein'])]
+        df.index = [f'{elem}{index+1}' for index,elem in enumerate(seq['rna'])]
+        with open(fout+ f'{_id}.txt','w') as f:
+            f.write(f'#{des}\n')
+            f.write(f"# row =rna:{seq['rna']}\n")
+            f.write(f"# col=protein:{seq['protein']}\n")
+        # df.to_csv(fout+ f'{_id}.txt',sep='\t',mode='a',float_format='%.3f',index=None,header=None)
+        df.to_csv(fout+ f'{_id}.txt',sep='\t',mode='a',float_format='%.5f')
+        df = get_top_l_triplets(df, sum(df.shape))
+        df.to_csv(fout+ f'{_id}_topL.txt',sep='\t',mode='w',float_format='%.5f',index=False)
+def get_top_l_triplets(df_pred, L):
+    """
+    从Pandas DataFrame矩阵中提取值最大的前L个三元组。
+    参数:
+    - matrix_df: Pandas DataFrame，表示接触矩阵。
+    - L: int，要提取的三元组的数量。
+    返回:
+    - top_l_triplets: 列表，包含前L个三元组，每个三元组格式为(row_index, col_index, value)。
+    """
+    df = df_pred.stack().reset_index()
+    df.columns = ['rna', 'protein', 'pred']
+    df = df.sort_values(by='pred', ascending=False).head(L)
+    return df
+def doSavePredict_single(_id,seq,predict_rsa,fout,des,pred_asa=None):
+    """Convert between per-residue ``predict_rsa`` and ``pred_asa`` values and save them as a table.
+    Parameters:
+    - _id: str, stem of the output file name.
+    - seq: iterable of residue characters.
+    - predict_rsa: array of per-residue values, used when ``pred_asa`` is None.
+      NOTE(review): presumably relative surface accessibility, 1-D with one value per
+      residue - confirm against the caller.
+    - fout: str, output directory prefix; when falsy nothing is written.
+    - des: str, description placed after '#' in the file header.
+    - pred_asa: optional array; when given, ``predict_rsa`` is recomputed from it.
+    Returns:
+    - (pred_asa, predict_rsa)
+    Terminates the process through ``exit`` when any ``pred_asa`` value equals 0.
+    """
+    check_path(fout)
+    BASES = 'AUCG'
+    # per-base scale factors paired with BASES (A=400, U=350, C=350, G=400)
+    asa_std = [400, 350, 350, 400]
+    dict_rnam1_ASA = dict(zip(BASES, asa_std))
+    sequence = re.sub(r"[T]", "U", ''.join(seq))
+    # NOTE(review): random.randint is evaluated once, so every non-AGCU character in
+    # this call is replaced by the same randomly chosen base.
+    sequence = re.sub(r"[^AGCU]", BASES[random.randint(0, 3)], sequence) # other characters are randomly substituted so a prediction can still be produced
+    ASA_scale = np.array([dict_rnam1_ASA[i] for i in sequence])
+    if pred_asa is None:
+        pred_asa = np.multiply(predict_rsa, ASA_scale).T
+    else:
+        predict_rsa = pred_asa/ASA_scale
+    # table columns: 1-based position, residue letter, pred_asa, predict_rsa
+    col1 = np.array([i + 1 for i, I in enumerate(seq)])[None, :]
+    col2 = np.array([I for i, I in enumerate(seq)])[None, :]
+    col3 = pred_asa
+    col4 = predict_rsa
+    # a value of exactly 0 is treated as a failed prediction and aborts the program
+    if len(col3[col3 == 0]):
+        exit(f'error in predict\t {_id},{seq}')
+    temp = np.vstack((np.char.mod('%d', col1), col2, np.char.mod('%.2f', col3), np.char.mod('%.3f', col4))).T
+    if fout:np.savetxt(fout + f'{_id}.txt', (temp), delimiter='\t\t', fmt="%s",
+               header=f'#{des}',
+               comments='')
+    return pred_asa,predict_rsa
+def one_hot_encode(sequences,alpha='ACGU'):
+    # print(sequences)
+    sequences_arry = np.array(list(sequences)).reshape(-1, 1)
+    lable = np.array(list(alpha)).reshape(-1, 1)
+    enc = OneHotEncoder(handle_unknown='ignore')
+    enc.fit(lable)
+    seq_encode = enc.transform(sequences_arry).toarray()
+    # print(seq_encode.shape)
+    return (seq_encode)
+def get_bin_pred(df_pred,threshold):
+    bin_pred = df_pred.values >= threshold
+    bin_pred = bin_pred.astype(int)
+    return bin_pred
+def seed_everything(seed=2022):
+    print('seed_everything to ',seed)
+    random.seed(seed)
+    os.environ['PYTHONHASHSEED'] = str(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed) # 程序每次运行结果一致，但是程序中多次生成随机数每次不一致 # https://blog.csdn.net/qq_42951560/article/details/112174334
+    torch.cuda.manual_seed(seed)
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False # minbatch的长度一直在变化，这个优化比较浪费时间
+def getParam():
+    parser = ArgumentParser()
+    # data
+    parser.add_argument('--rootdir', default='',
+                        type=str)
+    parser.add_argument('--rna_fasta', default='./example/inputs_batch/rna.fasta',
+                        type=str)
+    parser.add_argument('--pro_fasta', default='./example/inputs_batch/protein.fasta',
+                        type=str)
+    parser.add_argument('--csv', default='./example/inputs_batch/pairs.csv',
+                        type=str)
+    parser.add_argument('--col', default='_id',
+                        type=str)
+    parser.add_argument('--out', default='./example/outputs_batch/',
+                        type=str)
+    parser.add_argument('--ffeat', default='./example/inputs_batch/embedding/{element}/{pdbid}.pickle',
+                        type=str)
+    parser.add_argument('--fmodel', default='./weight/model_roc_0_38=0.845.pt',
+                        type=str)
+    parser.add_argument('--device', default='cpu',
+                        type=str)
+    parser.add_argument('--draw', action='store_true')
+    args = parser.parse_args()
+    return args
+if __name__ == '__main__':
+    args = getParam()
+    rootdir = args.rootdir
+    csv = args.csv
+    col = args.col
+    rna_fasta = args.rna_fasta
+    pro_fasta = args.pro_fasta
+    ffeat = args.ffeat
+    fmodel = args.fmodel
+    device = args.device
+    out = args.out
+    draw = args.draw
+    check_path(out)
+    # pdbid = fasta.rsplit('/',1)[0].split('.')[0]
+    seed_everything(seed=2022)
+    models = [(model_path,torch.load(model_path, map_location=torch.device(device))) for model_path in glob.glob(fmodel)]
+    # models = [(model_path,torch.load(model_path, map_location=torch.device(device))) for model_path in glob.glob(fmodel)]
+    print('loading existed model', fmodel)
+    with torch.no_grad():
+        rna_dict = {}
+        for pdbid, seq in [(record.id, record.seq) for record in SeqIO.parse(rna_fasta, 'fasta')]:
+            rna_dict[pdbid]=str(seq)
+        pro_dict = {}
+        for pdbid, seq in [(record.id, record.seq) for record in SeqIO.parse(pro_fasta, 'fasta')]:
+            pro_dict[pdbid]=str(seq)
+        df = pd.read_csv(csv)
+        predict_scores = []
+        for pdbid in df[col]:
+            # pdbcode,r,p = pdbid.split('_')
+            # rnaid = f'{pdbcode}_{r}'
+            # proid = f'{pdbcode}_{p}'
+            rnaid,proid = pdbid.split('.')
+            rnaseq,proseq= rna_dict[rnaid],pro_dict[proid]
+            with open(ffeat.format_map({'pdbid':rnaid,'element':'rna'}),'rb') as f:
+                rna_emb = pickle.load(f)
+            with open(ffeat.format_map({'pdbid':proid,'element':'protein'}),'rb') as f:
+                pro_emb = pickle.load(f)
+            rna_oh = one_hot_encode(rnaseq.replace('T','U'), alpha='ACGU')
+            pro_oh = one_hot_encode(proseq, alpha='GAVLIFWYDNEKQMSTCPHR')
+            # mask = np.ones((emb.shape[0],1)) # mask missing nt when evaluate the model
+            x_train = np.concatenate([rna_oh,rna_emb],axis=1)
+            x_train = np.expand_dims(x_train,0)
+            x_train = torch.from_numpy(x_train).transpose(-1,-2)
+            x_train = x_train.to(device, dtype=torch.float)
+            x_rna = x_train
+            x_train = np.concatenate([pro_oh, pro_emb], axis=1)
+            x_train = np.expand_dims(x_train, 0)
+            x_train = torch.from_numpy(x_train).transpose(-1, -2)
+            x_train = x_train.to(device, dtype=torch.float)
+            x_pro = x_train
+            # print('input data shape for rna and protein:',x_rna.shape,x_pro.shape)
+            x_rna = x_rna.to(device, dtype=torch.float32)
+            x_pro = x_pro.to(device, dtype=torch.float32)
+            for i,(model_path,model) in enumerate(models):
+                model.eval()
+                outputs = model(x_pro, x_rna)  # [1, 299, 74, 1]
+                # print('outputs,',outputs.device)
+                outputs = torch.squeeze(outputs, -1)
+                outputs = outputs.permute(0, 2, 1)
+                df_pred = outputs[0].cpu().detach().numpy()
+                des = f'predict by {__file__}\n#{model_path}'
+                doSavePredict(pdbid, {'rna':rnaseq,'protein':proseq}, df_pred,
+                                   out,
+                                   des
+                                   )
+                tmp = df_pred.flatten()
+                tmp.sort()
+                score = sum(tmp[::-1][:sum(df_pred.shape)])
+                predict_scores.append((pdbid, score))
+                print(pdbid,score)
+            if draw:
+                plt.figure(figsize=(20, 15))
+                top = sum(df_pred.shape)
+                df_pred = pd.DataFrame(df_pred)
+                threshold = df_pred.stack().nlargest(top).iloc[-1]
+                bin_pred = get_bin_pred(df_pred,threshold=threshold)
+                import seaborn as sns
+                sns.heatmap(df_pred,mask=bin_pred)
+                plt.title(f'Predicted contact map of {pdbid}\nPredidcted by RPcontact, top L=r+p')
+                plt.xlabel(proid)
+                plt.ylabel(rnaid)
+                handles, labels = plt.gca().get_legend_handles_labels()
+                plt.legend(handles, labels, bbox_to_anchor=(1.05, 1), loc='upper left', ncol=1, borderaxespad=1,
+                        frameon=False)
+                # 设置坐标轴的相同缩放
+                ax = plt.gca()
+                ax.set_aspect('equal')
+                plt.tight_layout()
+                plt.savefig(f'{out}/{pdbid}_{i}_prob.png',dpi=300)
+                plt.show()
+                plt.clf()
+                ax = plt.gca()
+                tp = \
+                ax.plot(*np.where(bin_pred.T==1), ".", c='r',markersize=1, label='Predicted contact')[
+                    0]
+                tp.set_markerfacecolor('w')
+                tp.set_markeredgecolor('r')
+                h,w = bin_pred.shape
+                plt.xlim([0,w])
+                plt.ylim([0,h])
+                plt.title(f'Predicted contact map of {pdbid}\nPredidcted by RPcontact, top L=r+p')
+                plt.xlabel(proid)
+                plt.ylabel(rnaid)
+                handles, labels = plt.gca().get_legend_handles_labels()
+                plt.legend(handles, labels, bbox_to_anchor=(1.05, 1), loc='upper left', ncol=1, borderaxespad=1,
+                        frameon=False)
+                # 设置坐标轴的相同缩放
+                ax.set_aspect('equal')
+                plt.tight_layout()
+                plt.savefig(f'{out}/{pdbid}_{i}_binary.png',dpi=300)
+                plt.show()
+                print(f'predict {pdbid} with {len(seq)} nts')
+            df = pd.DataFrame(predict_scores, columns=['pdbid', 'contact_score'])
+            df.to_csv(args.out + '/predict_scores.tsv',index=False, sep='\t', mode='w', float_format='%.5f')

readme.md ADDED Viewed

	@@ -0,0 +1,116 @@

+<p align="center">
+  <img src="https://raw.githubusercontent.com/JulseJiang/RPcontact/main/example/logo.png" alt="RPcontact Logo" width="120"/>
+</p>
+# RPcontact: RNA-Protein Contact Prediction
+**Improved prediction of RNA-protein contacts using RNA and protein language models**
+[Paper](https://www.biorxiv.org/content/10.1101/2025.06.02.657171v1.full)
+[Code](https://github.com/rpcontact)
+[Demo](https://julse-rpcontact.hf.space/)
+---
+## Overview
+RPcontact is a novel computational tool for accurately predicting RNA-protein contacts, addressing a fundamental challenge in understanding molecular biology processes such as transcription, splicing, and translation. Traditional methods are limited by the scarcity of RNA-protein complex structures and the constraints of experimental techniques. While recent deep learning approaches like AlphaFold 3 and RoseTTAFoldNA have made progress, they still rely heavily on homologous templates.
+RPcontact overcomes these limitations by leveraging large language models specifically designed for RNA ([ERNIE-RNA](https://github.com/Bruce-ywj/ERNIE-RNA)) and proteins ([ESM-2](https://github.com/facebookresearch/esm)). Trained exclusively on ribosomal RNA-protein complexes, RPcontact delivers robust and generalized performance, accurately predicting contacts in both dimeric and multimeric non-rRNA-protein complexes. Benchmark results show that RPcontact significantly outperforms binary contacts inferred from models like AlphaFold 3 and RoseTTAFoldNA, making it a valuable tool for structure and function prediction in RNA-protein research.
+---
+## Quick Start
+### Requirements
+| Dependency  | Recommended Version |
+|-------------|--------------------|
+| Python      | ≥ 3.8              |
+| PyTorch     | 1.13.1             |
+| fair-esm    | 1.0.2              |
+Install dependencies (example):
+```bash
+pip install numpy pandas matplotlib biopython scikit-learn
+pip install torch==1.13.1
+pip install fair-esm==1.0.2
+```
+---
+### Script Overview
+| Script            | Function                            | Example Command                 |
+|-------------------|-------------------------------------|---------------------------------|
+| predict.py        | Single RNA-protein pair contact prediction  | `python predict.py`             |
+| predict_batch.py  | Batch RNA-protein pairs contact prediction   | `python predict_batch.py`       |
+| evaluate.py       | Evaluation and visualization        | `python evaluate.py`            |
+| app.py            | Launch the web-based demo interface (requires gradio) | `python app.py`            |
+---
+### Data Preparation
+- RNA/protein sequences: FASTA format
+- Embedding features: pickle format
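+- For batch prediction: provide a CSV file whose pair-ID column (default `_id`, set with `--col`) lists each pair as `<rna_id>.<protein_id>`, using the record IDs from the RNA and protein FASTA files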
+---
+### Typical Usage
+**Single pair prediction:**
+```bash
+python predict.py --fasta your_sequence.fasta --out output_dir/
+```
+**Batch prediction:**
+```bash
+python predict_batch.py --rna_fasta rna.fasta --pro_fasta protein.fasta --csv pairs.csv --out output_dir/
+```
+**Evaluation:**
+```bash
+python evaluate.py --fasta your_sequence.fasta --out eval_dir/ --flabel true_labels.pickle
+```
+---
+### Common Parameters
+| Parameter     | Description                                             |
+|---------------|--------------------------------------------------------|
+| --fasta       | Input FASTA file (for single prediction)               |
+| --rna_fasta   | RNA FASTA file (for batch prediction)                  |
+| --pro_fasta   | Protein FASTA file (for batch prediction)              |
+| --csv         | RNA-protein pairing info CSV (for batch prediction)    |
+| --ffeat       | Precomputed embedding feature file (pickle format)     |
+| --fmodel      | Pretrained model file path                             |
+| --out         | Output directory                                       |
+| --flabel      | True label file (for evaluation)                       |
+| --device      | Specify device (e.g., cpu or cuda:0)                   |
+| --draw        | Flag; when given, contact-map figures are saved as PNG  |
+---
+## Output Interpretation
+- The prediction output is a contact probability matrix for each RNA-protein pair. Higher scores indicate a higher probability of interaction.
+- The evaluation script provides accuracy and other metrics, as well as visualization.
+---
+## Contact & Citation
+Questions or suggestions? Contact:
+- Jiuhong Jiang
+- Email: [email protected]
+If you find this project helpful, please cite our manuscript.
+- Jiang, J., Zhang, X., Zhan, J., Miao, Z., & Zhou, Y. (2025). RPcontact: Improved prediction of RNA-protein contacts using RNA and protein language models. bioRxiv, 2025-06.
+---
+<p align="center"><em>Make RNA-protein contact prediction easier and more accurate!</em></p>

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+Bio==1.8.0
+biopython==1.81
+gradio==5.35.0
+matplotlib==3.5.1
+numpy==1.24.4
+pandas==1.5.3
+plotly==5.24.1
+scikit_learn==1.2.1
+seaborn==0.13.2
+torch==2.4.1

third_part_tool/ernie_rna/readme.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ https://github.com/Bruce-ywj/ERNIE-RNA

third_part_tool/esm2/readme.txt ADDED Viewed

	@@ -0,0 +1,4 @@


1	+ Install ESM-2 following the instructions at https://github.com/facebookresearch/esm/
2	+
3	+ Use this pretrained model: esm2_t48_15B_UR50D
4	+

weight/readme.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+model_roc_0_38=0.845.pt for OH+RP_Emb with data augmentation
+model_roc_0_56=0.779.pt for OH with data augmentation
+The model weights will be available for download after the paper is accepted by a journal.