File size: 2,942 Bytes
82d55c6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
#!/usr/bin/env bash
# Builds a combinations file from the input records, launches the ERNIE-RNA
# and ESM2 embedding jobs, then runs process_rna_protein.py.
#
# Usage: <script> <fin_fasta> <dirout> <esm2_env_path> <ernie_rna_env_path>
#   fin_fasta           input file, read line by line as comma-separated records
#   dirout              output directory; also used as the working directory
#   esm2_env_path       directory that contains miniconda3/envs/esm2_env
#   ernie_rna_env_path  directory that contains miniconda3/envs/ERNIE-RNA

# Check the argument count; diagnostics go to stderr so they never mix with
# regular output.
if [[ "$#" -ne 4 ]]; then
  echo "Usage: $0 <fin_fasta> <dirout> <esm2_env_path> <ernie_rna_env_path>" >&2
  exit 1
fi

fin_fasta=$1
dirout=$2
esm2_env_path=$3
ernie_rna_env_path=$4

# Fail early: without a readable input nothing downstream can work, and the
# later steps would otherwise still create directories and submit jobs.
if [[ ! -r "$fin_fasta" ]]; then
  echo "Error: input file '$fin_fasta' does not exist or is not readable" >&2
  exit 1
fi
# Derived paths: every intermediate file lives under <dirout>/_0_process.
WDIR=$dirout
rna_fasta=$WDIR/_0_process/rna_sequences.fasta
pro_fasta=$WDIR/_0_process/protein_sequences.fasta
fcombinations=$WDIR/_0_process/combinations.csv
# NOTE(review): finfo is not referenced anywhere else in the visible script.
finfo=$WDIR/_0_process/info.csv
# The trailing slash is kept so paths derived from this value stay identical
# to what the script has always produced.
current_path=$WDIR/_0_process/

# Create the required directories in one call. Paths are quoted so a
# directory name containing spaces or glob characters is handled correctly,
# and the script stops if the tree cannot be created.
if ! mkdir -p -- \
  "$current_path" \
  "$current_path/ernie_rna_emb" \
  "$current_path/esm2_emb" \
  "$current_path/rpcontact" \
  "$current_path/no_constrained" \
  "$current_path/constrained"; then
  echo "Error: cannot create working directories under '$current_path'" >&2
  exit 1
fi
# Write the combinations file.
#######################################
# Converts comma-separated input records of the form
#   rna_id,rna_seq,pro_id,pro_seq,rna_len,pro_len
# into rows of the form
#   rna_id.pro_id,rna_seq,pro_seq,rna_len,pro_len
# and appends them to the destination file.
# Arguments:
#   $1 - input file
#   $2 - output CSV (appended to)
# Returns:
#   non-zero if either file cannot be opened
#######################################
write_combinations() {
  local src=$1 dest=$2
  local record rna_id rna_seq pro_id pro_seq rna_len pro_len _rest

  # "|| [[ -n ... ]]" keeps a final line that lacks a trailing newline.
  while IFS= read -r record || [[ -n "$record" ]]; do
    # Blank lines carry no record; skip them instead of emitting ".,,,,".
    [[ -n "$record" ]] || continue
    # Split with the shell itself: no per-field echo|cut forks, and field
    # contents are never word-split or glob-expanded. Fields beyond the
    # sixth land in _rest and are ignored.
    IFS=, read -r rna_id rna_seq pro_id pro_seq rna_len pro_len _rest <<< "$record"
    printf '%s.%s,%s,%s,%s,%s\n' \
      "$rna_id" "$pro_id" "$rna_seq" "$pro_seq" "$rna_len" "$pro_len"
  done < "$src" >> "$dest"
}

# NOTE(review): output is appended, so re-running into the same <dirout>
# duplicates rows — confirm whether the file should be truncated first.
write_combinations "$fin_fasta" "$fcombinations"
# Print summary information.
#######################################
# Prints the largest or smallest numeric value of one CSV column.
# Arguments:
#   $1 - "max" or "min"
#   $2 - 1-based column number
#   $3 - comma-separated file
# Outputs:
#   the selected value on stdout (nothing for an empty file)
#######################################
csv_column_extreme() {
  local which=$1 column=$2 csv=$3
  local sort_flag=-n
  if [[ "$which" == max ]]; then
    sort_flag=-nr
  fi
  awk -F',' -v col="$column" '{ print $col }' "$csv" | sort "$sort_flag" | head -n 1
}

#######################################
# Prints the run summary.
# Rows of the combinations file are written by this script as
#   id,rna_seq,pro_seq,rna_len,pro_len
# so the RNA length is column 4 and the protein length is column 5.
# Arguments:
#   $1 - RNA FASTA path
#   $2 - protein FASTA path
#   $3 - combinations CSV path
#######################################
print_summary() {
  local rna_fa=$1 pro_fa=$2 combos=$3
  local fragments rna_max rna_min pro_max pro_min

  fragments=$(wc -l < "$combos")
  rna_max=$(csv_column_extreme max 4 "$combos")
  rna_min=$(csv_column_extreme min 4 "$combos")
  pro_max=$(csv_column_extreme max 5 "$combos")
  pro_min=$(csv_column_extreme min 5 "$combos")

  echo "Done. RNA sequences are in $rna_fa, protein sequences are in $pro_fa, and combinations are in $combos."
  # NOTE(review): the two "count" values are line counts of the FASTA files;
  # if those files carry ">" header lines this over-counts records — confirm.
  echo "RNA count: $(wc -l < "$rna_fa"), RNA max length: $rna_max, RNA min length: $rna_min, total fragments: $fragments"
  echo "Protein count: $(wc -l < "$pro_fa"), Protein max length: $pro_max, Protein min length: $pro_min, total fragments: $fragments"
  echo "Sequence length longer than 1000 were truncated and kept head and tail with the length of 1000, sliding 500 as step, 1000 as window"
}

print_summary "$rna_fasta" "$pro_fasta" "$fcombinations"
#######################################
# Prints an absolute form of a path. Relative paths are prefixed with the
# current working directory; absolute paths are printed unchanged.
# Arguments:
#   $1 - path
#######################################
abs_path() {
  case "$1" in
    /*) printf '%s\n' "$1" ;;
    *) printf '%s/%s\n' "$PWD" "$1" ;;
  esac
}

#######################################
# Writes a job script and starts it in the background through srun.
# The script is saved as <current_path>/<name>.sh and all job output goes to
# <current_path>/log_<name>.txt. After a successful return, $! in the caller
# holds the PID of the background srun (the function runs in the same shell).
# Globals:
#   current_path (read)
# Arguments:
#   $1 - job name
#   $2 - job script text
# Returns:
#   non-zero if the job script cannot be written
#######################################
launch_embedding_job() {
  local job_name=$1 job_body=$2
  local job_script="$current_path/$job_name.sh"

  printf '%s\n' "$job_body" > "$job_script" || return 1
  chmod +x "$job_script"
  nohup srun -p hebhcnormal01 -c 32 sh "$job_script" \
    > "$current_path/log_$job_name.txt" 2>&1 &
}

# The generated scripts cd into the tool directories before running, so any
# path handed to them has to be absolute to stay valid after the cd.
work_dir_abs=$(abs_path "$current_path")
rna_fasta_abs=$(abs_path "$rna_fasta")
pro_fasta_abs=$(abs_path "$pro_fasta")
ernie_rna_python=$(abs_path "$ernie_rna_env_path/miniconda3/envs/ERNIE-RNA/bin/python")
esm2_python=$(abs_path "$esm2_env_path/miniconda3/envs/esm2_env/bin/python")

# ERNIE-RNA embedding. "|| exit 1" stops the job if the tool directory is
# missing instead of running python from the wrong directory.
ERNIE_RNA_script="cd /public/home/jiang_jiuhong/soft/ERNIE-RNA/ || exit 1
'$ernie_rna_python' extract_embedding_jh.py --seqs_path='$rna_fasta_abs' --save_path='$work_dir_abs/ernie_rna_emb/' --device=cpu"
launch_embedding_job ernie_rna_emb "$ERNIE_RNA_script" || exit 1
ernie_rna_pid=$!

# ESM2 embedding. Paths are single-quoted in the generated script, matching
# the ERNIE-RNA script, so paths containing spaces survive.
ESM2_script="cd /public/home/jiang_jiuhong/code/esm/ || exit 1
'$esm2_python' scripts/extract.py esm2_t48_15B_UR50D '$pro_fasta_abs' '$work_dir_abs/esm2_emb/' --repr_layers 48 --include mean per_tok"
launch_embedding_job esm2_emb "$ESM2_script" || exit 1
esm2_pid=$!

# Wait for both embedding jobs. A bare "wait" always reports success, so each
# PID is waited on separately to learn whether its job actually succeeded.
embedding_failed=0
if ! wait "$ernie_rna_pid"; then
  echo "Error: ERNIE-RNA embedding job failed; see $current_path/log_ernie_rna_emb.txt" >&2
  embedding_failed=1
fi
if ! wait "$esm2_pid"; then
  echo "Error: ESM2 embedding job failed; see $current_path/log_esm2_emb.txt" >&2
  embedding_failed=1
fi
if (( embedding_failed )); then
  exit 1
fi

# Run RPcontact to obtain the contact map.
# NOTE(review): process_rna_protein.py is looked up relative to the caller's
# working directory — confirm that is intended.
python process_rna_protein.py --rna_fasta="$rna_fasta" --pro_fasta="$pro_fasta" --csv="$fcombinations" --WDIR="$WDIR" --out="$dirout"
|