File size: 2,942 Bytes
82d55c6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/usr/bin/env bash
#
# Driver for the RNA-protein preprocessing / embedding pipeline.
#
# Usage:
#   <script> <fin_fasta> <dirout> <esm2_env_path> <ernie_rna_env_path>
#
# Arguments:
#   fin_fasta           input file; it is read line by line as comma-separated
#                       records when the combinations file is written
#   dirout              output directory; all intermediate files are placed
#                       under <dirout>/_0_process/
#   esm2_env_path       prefix containing miniconda3/envs/esm2_env/bin/python
#   ernie_rna_env_path  prefix containing miniconda3/envs/ERNIE-RNA/bin/python

# Require exactly four positional arguments. The usage text is a diagnostic,
# so it goes to stderr and the script exits with a non-zero status.
if [[ "$#" -ne 4 ]]; then
    echo "Usage: $0 <fin_fasta> <dirout> <esm2_env_path> <ernie_rna_env_path>" >&2
    exit 1
fi

fin_fasta=$1
dirout=$2
esm2_env_path=$3
ernie_rna_env_path=$4

# Derived paths: every intermediate file lives under <dirout>/_0_process/.
# An empty <dirout> still passes the argument-count check, but would make all
# of the paths below resolve under "/", so reject it explicitly.
if [[ -z "$dirout" ]]; then
    echo "Error: <dirout> must not be empty" >&2
    exit 1
fi

WDIR=$dirout
rna_fasta=$WDIR/_0_process/rna_sequences.fasta
pro_fasta=$WDIR/_0_process/protein_sequences.fasta
fcombinations=$WDIR/_0_process/combinations.csv
finfo=$WDIR/_0_process/info.csv

# Note: keeps its trailing slash; later paths are built as "$current_path/<name>".
current_path=$WDIR/_0_process/

# Create the working directory and its sub-directories in a single call.
# Expansions are quoted so an output path containing spaces works, and the
# script stops here if creation fails because every later step writes into
# these directories.
if ! mkdir -p -- \
    "$current_path" \
    "$current_path/ernie_rna_emb" \
    "$current_path/esm2_emb" \
    "$current_path/rpcontact" \
    "$current_path/no_constrained" \
    "$current_path/constrained"; then
    echo "Error: cannot create working directories under $current_path" >&2
    exit 1
fi

# Write the combinations file.

#######################################
# Convert comma-separated input records into combination rows.
#
# Each input line is expected to hold six comma-separated fields:
#   rna_id,rna_seq,pro_id,pro_seq,rna_len,pro_len
# and is written as:
#   <rna_id>.<pro_id>,<rna_seq>,<pro_seq>,<rna_len>,<pro_len>
# Any fields beyond the sixth are ignored.
#
# Arguments: $1 - input file
#            $2 - output CSV (rows are APPENDED)
# Returns:   0 on success, non-zero if the input cannot be read or the
#            output cannot be opened
#######################################
write_combinations() {
    local infile=$1 outfile=$2
    local line rna_id rna_seq pro_id pro_seq rna_len pro_len _extra

    if [[ ! -r "$infile" ]]; then
        echo "Error: cannot read input file '$infile'" >&2
        return 1
    fi

    # "|| [[ -n $line ]]" keeps a final record that lacks a trailing newline.
    while IFS= read -r line || [[ -n "$line" ]]; do
        line=${line%$'\r'}              # tolerate CRLF line endings
        [[ -z "$line" ]] && continue    # blank lines carry no record

        # Split with the shell itself instead of six `cut` processes per line.
        # The line is never expanded unquoted, so characters such as '*' in a
        # sequence cannot be glob-expanded or word-split.
        IFS=, read -r rna_id rna_seq pro_id pro_seq rna_len pro_len _extra <<< "$line"

        printf '%s.%s,%s,%s,%s,%s\n' \
            "$rna_id" "$pro_id" "$rna_seq" "$pro_seq" "$rna_len" "$pro_len"
    done < "$infile" >> "$outfile"
    # NOTE(review): rows are appended, so re-running with the same <dirout>
    # duplicates them — confirm whether the file should be truncated first.
}

# A failure is reported on stderr; as before, the script then carries on.
write_combinations "$fin_fasta" "$fcombinations"

# Print summary information.

# Print the first value of one comma-separated column after a numeric sort.
#   $1 - CSV file
#   $2 - 1-based column number
#   $3 - sort flag: -n yields the smallest value, -nr the largest
_summary_column_extreme() {
    awk -F',' -v col="$2" '{print $col}' "$1" | sort "$3" | head -n 1
}

#######################################
# Print a short report about the prepared input files.
#
# The combinations file is written by this script with five columns:
#   id, RNA sequence, protein sequence, RNA length, protein length
# so RNA lengths are column 4 and protein lengths are column 5.
#
# Arguments: $1 - RNA FASTA path
#            $2 - protein FASTA path
#            $3 - combinations CSV path
# Outputs:   four lines on stdout
#######################################
print_summary() {
    local rna_file=$1 pro_file=$2 combos=$3
    local total
    total=$(wc -l < "$combos")

    echo "Done. RNA sequences are in $rna_file, protein sequences are in $pro_file, and combinations are in $combos."
    # NOTE(review): the "count" values are line counts of the FASTA files, not
    # numbers of FASTA records — confirm this matches the files' layout.
    echo "RNA count: $(wc -l < "$rna_file"), RNA max length: $(_summary_column_extreme "$combos" 4 -nr), RNA min length: $(_summary_column_extreme "$combos" 4 -n), total fragments: $total"
    echo "Protein count: $(wc -l < "$pro_file"), Protein max length: $(_summary_column_extreme "$combos" 5 -nr), Protein min length: $(_summary_column_extreme "$combos" 5 -n), total fragments: $total"
    echo "Sequence length longer than 1000 were truncated and kept head and tail with the length of 1000, sliding 500 as step, 1000 as window"
}

print_summary "$rna_fasta" "$pro_fasta" "$fcombinations"

# Embedding extraction (ERNIE-RNA and ESM2) followed by the RPcontact step.
# Both embedding jobs are written to small job scripts, launched in the
# background through srun, and awaited individually so a failure is noticed.

# ERNIE-RNA embedding.
# The generated script aborts if the cd fails, so python is never started from
# an unexpected directory. Paths are single-quoted inside the generated script.
ERNIE_RNA_script="cd /public/home/jiang_jiuhong/soft/ERNIE-RNA/ || exit 1
'$ernie_rna_env_path/miniconda3/envs/ERNIE-RNA/bin/python' extract_embedding_jh.py --seqs_path='$rna_fasta' --save_path='$current_path/ernie_rna_emb/' --device=cpu"

if ! printf '%s\n' "$ERNIE_RNA_script" > "$current_path/ernie_rna_emb.sh"; then
    echo "Error: cannot write $current_path/ernie_rna_emb.sh" >&2
    exit 1
fi
chmod +x "$current_path/ernie_rna_emb.sh"

nohup srun -p hebhcnormal01 -c 32 sh "$current_path/ernie_rna_emb.sh" > "$current_path/log_ernie_rna_emb.txt" 2>&1 &
ernie_rna_pid=$!

# ESM2 embedding.
# Same structure as above; the FASTA and output paths are quoted in the
# generated script, matching the ERNIE-RNA command.
ESM2_script="cd /public/home/jiang_jiuhong/code/esm/ || exit 1
'$esm2_env_path/miniconda3/envs/esm2_env/bin/python' scripts/extract.py esm2_t48_15B_UR50D '$pro_fasta' '$current_path/esm2_emb/' --repr_layers 48 --include mean per_tok"

if ! printf '%s\n' "$ESM2_script" > "$current_path/esm2_emb.sh"; then
    echo "Error: cannot write $current_path/esm2_emb.sh" >&2
    exit 1
fi
chmod +x "$current_path/esm2_emb.sh"

nohup srun -p hebhcnormal01 -c 32 sh "$current_path/esm2_emb.sh" > "$current_path/log_esm2_emb.txt" 2>&1 &
esm2_pid=$!

# Wait for both embedding jobs. Waiting on each PID (rather than a bare
# `wait`) exposes the job's exit status, so failures are not silently ignored.
embedding_failed=0
if ! wait "$ernie_rna_pid"; then
    echo "Error: ERNIE-RNA embedding job failed; see $current_path/log_ernie_rna_emb.txt" >&2
    embedding_failed=1
fi
if ! wait "$esm2_pid"; then
    echo "Error: ESM2 embedding job failed; see $current_path/log_esm2_emb.txt" >&2
    embedding_failed=1
fi
if [[ "$embedding_failed" -ne 0 ]]; then
    exit 1
fi

# Run RPcontact to obtain the contact map.
# NOTE(review): process_rna_protein.py is resolved relative to the current
# working directory — confirm the script is always launched from there.
python process_rna_protein.py --rna_fasta="$rna_fasta" --pro_fasta="$pro_fasta" --csv="$fcombinations" --WDIR="$WDIR" --out="$dirout"