Update feature_extract.py
feature_extract.py  CHANGED  (+2 -32)
@@ -6,19 +6,11 @@ import random
 import pandas as pd
 from Bio.SeqUtils.ProtParam import ProteinAnalysis
 from sklearn.model_selection import train_test_split
-# from sklearn.preprocessing import StandardScaler  # StandardScaler is no longer used
 from sklearn.preprocessing import RobustScaler  # import RobustScaler
 import torch
 from transformers import T5EncoderModel, T5Tokenizer
 
-
-# compute_amino_acid_composition, compute_reducing_aa_ratio,
-# compute_physicochemical_properties, compute_electronic_features,
-# compute_dimer_frequency, positional_encoding, perturb_sequence,
-# generate_adversarial_samples and extract_features are identical to the versions provided earlier.
-# Their code is omitted here for brevity; make sure they are complete in your file.
-# You can copy these functions from an earlier log or from your local file.
-# Below is the modified prepare_features function, plus placeholders for the other functions.
+
 
 class ProtT5Model:
     """
@@ -76,10 +68,6 @@ class ProtT5Model:
             return np.zeros((1, 1024), dtype=np.float32)
         return emb
 
-# --- (all other feature-extraction helper functions from your previous version belong here) ---
-# load_fasta, load_fasta_with_labels, compute_amino_acid_composition, ... extract_features
-# For completeness, copy these functions here from your local feature_extract.py file.
-# Below is a simplified placeholder for each; replace them with the actual functions.
 
 def load_fasta(fasta_file):
     # (your load_fasta implementation)
@@ -99,7 +87,6 @@ def load_fasta(fasta_file):
     return sequences
 
 def load_fasta_with_labels(fasta_file):
-    # (your load_fasta_with_labels implementation)
     sequences, labels = [], []
     try:
         with open(fasta_file, 'r') as f:
@@ -123,7 +110,6 @@ def load_fasta_with_labels(fasta_file):
 
 def compute_amino_acid_composition(seq):
     if not seq: return {aa: 0.0 for aa in "ACDEFGHIKLMNPQRSTVWY"}
-    # (your compute_amino_acid_composition implementation)
     amino_acids = "ACDEFGHIKLMNPQRSTVWY"
     seq_len = len(seq)
     return {aa: seq.upper().count(aa) / seq_len for aa in amino_acids}
@@ -131,7 +117,6 @@ def compute_amino_acid_composition(seq):
 
 def compute_reducing_aa_ratio(seq):
     if not seq: return 0.0
-    # (your compute_reducing_aa_ratio implementation)
     reducing = ['C', 'M', 'W']
     return sum(seq.upper().count(aa) for aa in reducing) / len(seq) if len(seq) > 0 else 0.0
 
@@ -146,7 +131,6 @@ def compute_physicochemical_properties(seq):
 
 def compute_electronic_features(seq):
     if not seq: return 0.0, 0.0
-    # (your compute_electronic_features implementation)
     electronegativity = {'A':1.8,'C':2.5,'D':3.0,'E':3.2,'F':2.8,'G':1.6,'H':2.4,'I':4.5,'K':3.0,'L':4.2,'M':4.5,'N':2.0,'P':3.5,'Q':3.5,'R':2.5,'S':1.8,'T':2.5,'V':4.0,'W':5.0,'Y':4.0}
     values = [electronegativity.get(aa.upper(), 2.5) for aa in seq]
     avg_val = sum(values) / len(values) if values else 2.5
@@ -155,7 +139,6 @@ def compute_electronic_features(seq):
 
 def compute_dimer_frequency(seq):
     if len(seq) < 2: return np.zeros(400)  # 20*20
-    # (your compute_dimer_frequency implementation)
     amino_acids = "ACDEFGHIKLMNPQRSTVWY"
     dimer_counts = {aa1+aa2: 0 for aa1 in amino_acids for aa2 in amino_acids}
     for i in range(len(seq) - 1):
@@ -166,12 +149,7 @@ def compute_dimer_frequency(seq):
     return np.array([dimer_counts[d] for d in sorted(dimer_counts.keys())])
 
 
-def positional_encoding(seq_len_actual, L_fixed=29, d_model=16):
-    # (your positional_encoding implementation)
-    # This PE is fixed length, not dependent on actual seq len if L_fixed is used.
-    # For random short sequences, this fixed PE might be an issue.
-    # A more dynamic PE or no PE for very short sequences might be better.
-    # However, to match current model input, we keep it.
+def positional_encoding(seq_len_actual, L_fixed=29, d_model=16):
     pos_enc = np.zeros((L_fixed, d_model))
     for pos in range(L_fixed):
         for i in range(d_model):
@@ -194,9 +172,6 @@ def perturb_sequence(seq, perturb_rate=0.1, critical=['C', 'M', 'W']):
 def extract_features(seq, prott5_model_instance, L_fixed=29, d_model_pe=16):  # Renamed d_model to d_model_pe
     if not seq or not isinstance(seq, str) or len(seq) == 0:
         print(f"Warning: extract_features received an empty or invalid sequence. Returning zero features.")
-        # Return a zero vector matching the expected feature dimension:
-        # 1024 (protT5) + 20 (aac) + 1 (red_ratio) + 3 (phys) + 2 (elec) + 400 (dimer) + L_fixed*d_model_pe (pos_enc)
-        # Example: 1024 + 20 + 1 + 3 + 2 + 400 + 29*16 = 1024 + 20 + 1 + 3 + 2 + 400 + 464 = 1914
         return np.zeros(1024 + 20 + 1 + 3 + 2 + 400 + (L_fixed * d_model_pe))
 
 
@@ -280,7 +255,6 @@ def prepare_features(neg_fasta, pos_fasta, prott5_model_path, additional_params=
 
 
     # --- **Key change: use RobustScaler** ---
-    # scaler = StandardScaler()  # the original StandardScaler
     scaler = RobustScaler()
     print("Using RobustScaler for feature normalization.")
 
@@ -303,10 +277,6 @@ if __name__ == "__main__":
     if not os.path.exists(pos_fasta_test):
         with open(pos_fasta_test, "w") as f: f.write(">pos1\nAOPPEPTIDE\n>pos2\nTRYTRYTRY\n")
 
-    # To let ProtT5Model load, a minimal transformers model directory structure would have to be mocked,
-    # usually containing at least config.json, pytorch_model.bin (or tf_model.h5), tokenizer_config.json, spiece.model.
-    # Here we only create the directory, so actual loading may fail unless the transformers library
-    # can download the model by name, or you provide a real local ProtT5 model path.
    if not os.listdir(prott5_path_test):  # if the directory is empty
        print(f"Warning: {prott5_path_test} is empty. ProtT5Model may try to download the model from the HuggingFace Hub.")
        print(f"Make sure you have downloaded Rostlab/ProstT5-XL-UniRef50 or a similar model to this path, or use its HuggingFace name.")
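The only substantive change in this commit is the switch from StandardScaler to RobustScaler inside prepare_features. A minimal sketch of that normalization step follows, assuming stacked feature matrices of the kind extract_features produces (1914 columns per the comment removed above); the array shapes and variable names here are illustrative placeholders, not code from the file.

import numpy as np
from sklearn.preprocessing import RobustScaler

# Hypothetical stacked feature matrices, one row per peptide, 1914 features per row.
X_train = np.random.rand(200, 1914)
X_test = np.random.rand(50, 1914)

# RobustScaler centers each feature on its median and scales by the interquartile range,
# so a few extreme embedding values or dimer counts distort the scaling far less than
# the mean/std statistics used by StandardScaler.
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)  # fit statistics on the training split only
X_test_scaled = scaler.transform(X_test)        # reuse the same median/IQR on held-out data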
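The warning in the last hunk concerns loading a ProtT5 checkpoint from a local directory. The sketch below shows the usual from_pretrained pattern with the T5EncoderModel and T5Tokenizer classes already imported in this file; the local path and the device handling are assumptions for illustration, not code from this repository.

import torch
from transformers import T5EncoderModel, T5Tokenizer

# Hypothetical local checkpoint directory; it should contain config.json, the model weights,
# tokenizer_config.json and spiece.model. A HuggingFace model name can be passed instead,
# in which case the files are downloaded from the Hub.
model_path = "./prott5_model"

tokenizer = T5Tokenizer.from_pretrained(model_path, do_lower_case=False)
model = T5EncoderModel.from_pretrained(model_path)
model.eval()  # inference only; feature extraction needs no gradients

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)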