Update app.py
app.py
CHANGED
@@ -16,10 +16,7 @@
 # │   └── final_generator_model.pth
 # ├── prott5/
 # │   └── model/
-# │
-# │       ├── pytorch_model.bin (The base ProtT5 model from Rostlab)
-# │       ├── finetuned_prott5.bin (Your fine-tuned feature extractor weights)
-# │       └── ... (other tokenizer files)
+# │       └── finetuned_prott5.bin (Your fine-tuned feature extractor weights)
 # └── requirements.txt
 
 import os
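The layout comment above pins the Space to fixed relative paths. As a purely illustrative sanity check (not part of this commit; the paths are taken from the layout comment and the checkpoint constants further down in the diff), one could verify the expected artifacts before loading anything:

import os

# Hypothetical startup check: paths come from the layout comment and the
# constants defined later in app.py; adjust if the repo structure differs.
EXPECTED_FILES = [
    "generator_checkpoints_v3.6/final_generator_model.pth",
    "prott5/model/finetuned_prott5.bin",
    "requirements.txt",
]

for path in EXPECTED_FILES:
    status = "found" if os.path.exists(path) else "MISSING"
    print(f"{path}: {status}")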
@@ -50,25 +47,27 @@ VOCAB_SIZE = len(token2id)
 
 
 # --- Feature Extractor Model Class (For ProtT5) ---
-# This class
+# MODIFIED: This class now loads the base model from the Hugging Face Hub ID
+# and then applies your local fine-tuned weights.
 class FeatureProtT5Model:
-    def __init__(self,
+    def __init__(self, base_model_id, finetuned_weights_path=None):
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         print(f"Initializing ProtT5 for feature extraction on device: {self.device}")
-
-        # Load the base model architecture and tokenizer from the
-
-        self.
-
+
+        # Load the base model architecture and tokenizer directly from the Hub ID.
+        print(f"Loading base model and tokenizer from '{base_model_id}'...")
+        self.tokenizer = transformers.T5Tokenizer.from_pretrained(base_model_id, do_lower_case=False)
+        self.model = transformers.T5EncoderModel.from_pretrained(base_model_id)
+
         # If a path to a fine-tuned weights file is provided, load and apply those weights.
         if finetuned_weights_path and os.path.exists(finetuned_weights_path):
-            print(f"Applying fine-tuned weights from: {finetuned_weights_path}")
+            print(f"Applying local fine-tuned weights from: {finetuned_weights_path}")
             state_dict = torch.load(finetuned_weights_path, map_location=self.device)
             self.model.load_state_dict(state_dict, strict=False)
             print("Successfully applied fine-tuned weights.")
         else:
             print("Warning: Fine-tuned weights not found or not provided. Using base ProtT5 weights.")
-
+
         self.model.to(self.device)
         self.model.eval()
 
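With the new constructor, the base checkpoint is pulled from the Hub and the local fine-tuned weights are layered on top. A minimal usage sketch, assuming it runs inside app.py where FeatureProtT5Model is defined, and following the standard ProtT5 embedding recipe (rare residues mapped to X, space-separated tokens, mean-pooled encoder states); the peptide string is just an example:

import re
import torch

extractor = FeatureProtT5Model(
    base_model_id="Rostlab/prot_t5_xl_uniref50",
    finetuned_weights_path="prott5/model/finetuned_prott5.bin",
)

seq = "MKWVTFISLLFLFSSAYS"                                    # example peptide
spaced = " ".join(re.sub(r"[UZOB]", "X", seq))                # ProtT5 expects space-separated residues
enc = extractor.tokenizer(spaced, return_tensors="pt").to(extractor.device)
with torch.no_grad():
    hidden = extractor.model(**enc).last_hidden_state         # (1, tokens, 1024)
embedding = hidden.mean(dim=1)                                 # (1, 1024) pooled embedding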
@@ -82,10 +81,10 @@ class AntioxidantPredictor(nn.Module):
         self.handcrafted_dim = input_dim - self.prott5_dim
         self.seq_len = 16
         self.prott5_feature_dim = 64 # 16 * 64 = 1024
-
+
         encoder_layer = nn.TransformerEncoderLayer(d_model=self.prott5_feature_dim, nhead=transformer_heads, dropout=transformer_dropout, batch_first=True)
         self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=transformer_layers)
-
+
         fused_dim = self.prott5_feature_dim + self.handcrafted_dim
         self.fusion_fc = nn.Sequential(nn.Linear(fused_dim, 1024), nn.ReLU(), nn.Dropout(0.3), nn.Linear(1024, 512), nn.ReLU(), nn.Dropout(0.3))
         self.classifier = nn.Sequential(nn.Linear(512, 256), nn.ReLU(), nn.Dropout(0.3), nn.Linear(256, 1))
@@ -96,17 +95,17 @@ class AntioxidantPredictor(nn.Module):
         # The input 'x' is a flat 1914-dim vector from extract_features()
         prot_t5_features = x[:, :self.prott5_dim]
         handcrafted_features = x[:, self.prott5_dim:]
-
+
         # Reshape the first 1024 features back into a sequence representation
         prot_t5_seq = prot_t5_features.view(batch_size, self.seq_len, self.prott5_feature_dim)
-
+
         encoded_seq = self.transformer_encoder(prot_t5_seq)
         refined_prott5 = encoded_seq.mean(dim=1)
-
+
         fused_features = torch.cat([refined_prott5, handcrafted_features], dim=1)
         fused_output = self.fusion_fc(fused_features)
         logits = self.classifier(fused_output)
-
+
         return logits / self.temperature
 
     def get_temperature(self):
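The forward pass splits the flat 1914-dim input into a 1024-dim ProtT5 block, reshaped to a 16 x 64 sequence for the Transformer encoder, and an 890-dim handcrafted block that is concatenated back in after pooling. A standalone shape check with dummy tensors (nhead and num_layers below are placeholder values, not the app's configuration):

import torch
import torch.nn as nn

batch_size, prott5_dim = 4, 1024
handcrafted_dim = 1914 - prott5_dim                           # 890 handcrafted features
seq_len, feat_dim = 16, 64                                    # 16 * 64 = 1024

x = torch.randn(batch_size, prott5_dim + handcrafted_dim)     # flat 1914-dim input
prot_t5_seq = x[:, :prott5_dim].view(batch_size, seq_len, feat_dim)   # (4, 16, 64)

encoder_layer = nn.TransformerEncoderLayer(d_model=feat_dim, nhead=4, batch_first=True)
encoder = nn.TransformerEncoder(encoder_layer, num_layers=2)
refined = encoder(prot_t5_seq).mean(dim=1)                    # (4, 64) pooled ProtT5 block

fused = torch.cat([refined, x[:, prott5_dim:]], dim=1)        # (4, 64 + 890) = (4, 954)
print(prot_t5_seq.shape, refined.shape, fused.shape)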
@@ -123,13 +122,13 @@ class ProtT5Generator(nn.Module):
         self.vocab_size = vocab_size
         self.eos_token_id = token2id["<EOS>"]
         self.pad_token_id = token2id["<PAD>"]
-
+
     def forward(self, input_ids):
         embeddings = self.embed_tokens(input_ids)
         encoder_output = self.encoder(embeddings)
         logits = self.lm_head(encoder_output)
         return logits
-
+
     def sample(self, batch_size, max_length=20, device="cpu", temperature=2.5, min_decoded_length=3):
         start_token = torch.randint(2, self.vocab_size, (batch_size, 1), device=device)
         generated = start_token
@@ -138,7 +137,7 @@ class ProtT5Generator(nn.Module):
             next_logits = logits[:, -1, :] / temperature
             if generated.size(1) < min_decoded_length:
                 next_logits[:, self.eos_token_id] = -float("inf")
-
+
             probs = torch.softmax(next_logits, dim=-1)
             next_token = torch.multinomial(probs, num_samples=1)
             generated = torch.cat((generated, next_token), dim=1)
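The sampling loop scales the last-step logits by the temperature, blocks <EOS> until a minimum length is reached, and then draws the next token from the softmax distribution. The same step in isolation with toy values (the vocabulary size and <EOS> id are placeholders, not the app's token2id mapping):

import torch

vocab_size, eos_token_id = 25, 1                          # placeholder vocabulary
temperature, min_decoded_length = 2.5, 3
generated = torch.randint(2, vocab_size, (2, 1))          # batch of 2, one start token each

logits = torch.randn(2, generated.size(1), vocab_size)    # stand-in for the model's output
next_logits = logits[:, -1, :] / temperature              # temperature-scaled last step
if generated.size(1) < min_decoded_length:
    next_logits[:, eos_token_id] = -float("inf")          # forbid an early <EOS>

probs = torch.softmax(next_logits, dim=-1)
next_token = torch.multinomial(probs, num_samples=1)      # one sampled token per sequence
generated = torch.cat((generated, next_token), dim=1)     # append and repeat until <EOS>/max_length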
@@ -208,7 +207,9 @@ try:
     PREDICTOR_CHECKPOINT_PATH = "checkpoints/final_rl_model_logitp0.1_calibrated_FINETUNED_PROTT5.pth"
     SCALER_PATH = "checkpoints/scaler_FINETUNED_PROTT5.pkl"
     GENERATOR_CHECKPOINT_PATH = "generator_checkpoints_v3.6/final_generator_model.pth"
-
+
+    # Define the base model ID from the Hub and the path to your local fine-tuned weights.
+    PROTT5_BASE_MODEL_ID = "Rostlab/prot_t5_xl_uniref50"
     FINETUNED_PROTT5_FOR_FEATURES_PATH = "prott5/model/finetuned_prott5.bin"
 
     # --- Load Predictor Model ---
@@ -223,8 +224,9 @@ try:
     print(f"Loading Scaler from: {SCALER_PATH}")
     SCALER = joblib.load(SCALER_PATH)
     print("Loading ProtT5 Feature Extractor...")
+    # Pass the Hub ID to the updated class to load the base model.
     PROTT5_EXTRACTOR = FeatureProtT5Model(
-
+        base_model_id=PROTT5_BASE_MODEL_ID,
         finetuned_weights_path=FINETUNED_PROTT5_FOR_FEATURES_PATH
     )
     print("✅ Scaler and Feature Extractor loaded.")
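Because the fine-tuned weights are applied with strict=False, any keys that do not match the base model are silently skipped. When a Space mixes a Hub checkpoint with local fine-tuned weights like this, it can be worth logging how much actually matched; a small illustrative check, standalone and outside the class, using the same model ID and path as the diff:

import torch
import transformers

model = transformers.T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_uniref50")
state_dict = torch.load("prott5/model/finetuned_prott5.bin", map_location="cpu")

result = model.load_state_dict(state_dict, strict=False)   # returns missing/unexpected keys
print(f"missing keys: {len(result.missing_keys)}, unexpected keys: {len(result.unexpected_keys)}")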