chshan committed
Commit 456a63e · verified · 1 parent: d8d52cc

Update app.py

Files changed (1): app.py (+43, −52)

app.py CHANGED
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-

 # app.py - RLAnOxPeptide Gradio Web Application
-# Final version incorporating user feedback on generator logic and UI controls.
+# Final version updated to use a LoRA-finetuned model for feature extraction.

 import os
 import torch
@@ -16,6 +16,9 @@ from tqdm import tqdm
 import transformers
 import time

+# NEW DEPENDENCY: peft library for LoRA
+from peft import PeftModel
+
 # Suppress verbose logging from transformers
 transformers.logging.set_verbosity_error()

@@ -32,26 +35,27 @@ id2token = {i: t for t, i in token2id.items()}
 VOCAB_SIZE = len(token2id)


-# --- Feature Extractor Model Class (For ProtT5) ---
-class FeatureProtT5Model:
-    def __init__(self, base_model_id, finetuned_weights_path=None):
+# --- LoRA Feature Extractor Model Class ---
+# ✅ REPLACED: This new class handles loading the base model and attaching the LoRA adapter.
+class LoRAProtT5Extractor:
+    def __init__(self, base_model_id, lora_adapter_path):
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        print(f"Initializing ProtT5 for feature extraction on device: {self.device}")
-
-        print(f"Loading base model and tokenizer from '{base_model_id}'...")
-        self.tokenizer = transformers.T5Tokenizer.from_pretrained(base_model_id, do_lower_case=False)
-        self.model = transformers.T5EncoderModel.from_pretrained(base_model_id)
-
-        if finetuned_weights_path and os.path.exists(finetuned_weights_path):
-            print(f"Applying local fine-tuned weights from: {finetuned_weights_path}")
-            state_dict = torch.load(finetuned_weights_path, map_location=self.device)
-            self.model.load_state_dict(state_dict, strict=False)
-            print("Successfully applied fine-tuned weights.")
-        else:
-            print("Warning: Fine-tuned weights not found or not provided. Using base ProtT5 weights.")
-
-        self.model.to(self.device)
+        print(f"Initializing LoRA-enhanced ProtT5 on device: {self.device}")
+
+        print(f" - Loading base model and tokenizer from '{base_model_id}'...")
+        base_model = transformers.T5EncoderModel.from_pretrained(base_model_id)
+        self.tokenizer = transformers.T5Tokenizer.from_pretrained(base_model_id)
+
+        if not os.path.exists(lora_adapter_path):
+            raise FileNotFoundError(f"Error: LoRA adapter directory not found at: {lora_adapter_path}")
+
+        print(f" - Loading and applying LoRA adapter from: {lora_adapter_path}")
+        lora_model = PeftModel.from_pretrained(base_model, lora_adapter_path)
+
+        print(" - Merging LoRA weights for faster inference...")
+        self.model = lora_model.merge_and_unload().to(self.device)
         self.model.eval()
+        print(" - LoRA-enhanced feature extractor is ready.")

     def encode(self, sequence):
         if not sequence or not isinstance(sequence, str):
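The core of this commit is the load-then-merge pattern in the new `__init__`. As a minimal standalone sketch (the model ID and adapter path are the ones this commit configures; assumes the `peft` package is installed):

```python
import transformers
from peft import PeftModel

# Load the frozen base encoder, attach the LoRA adapter, then fold the
# low-rank deltas into the base weights so inference needs no peft wrapper.
base = transformers.T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_uniref50")
model = PeftModel.from_pretrained(base, "./lora_finetuned_prott5")
model = model.merge_and_unload()  # returns a plain T5EncoderModel
model.eval()
```

`merge_and_unload()` trades adapter flexibility for speed: the merged model runs like an ordinary T5 encoder, which is why the class keeps only the merged result.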
@@ -67,7 +71,7 @@ class FeatureProtT5Model:
         emb_np = embedding.squeeze(0).cpu().numpy()
         return emb_np if emb_np.shape[0] > 0 else np.zeros((1, 1024), dtype=np.float32)

-# --- Predictor Model Architecture ---
+# --- Predictor Model Architecture (Unchanged) ---
 class AntioxidantPredictor(nn.Module):
     def __init__(self, input_dim=1914, transformer_layers=3, transformer_heads=4, transformer_dropout=0.1):
         super(AntioxidantPredictor, self).__init__()
@@ -99,7 +103,7 @@ class AntioxidantPredictor(nn.Module):
     def get_temperature(self):
         return self.temperature.item()

-# --- Generator Model Architecture ---
+# --- Generator Model Architecture (Unchanged) ---
 class ProtT5Generator(nn.Module):
     def __init__(self, vocab_size, embed_dim=512, num_layers=6, num_heads=8, dropout=0.1):
         super(ProtT5Generator, self).__init__()
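The `get_temperature()` accessor above implies the predictor is calibrated by temperature scaling. A hedged sketch of that standard scheme — dividing the logits by the learned scalar before the sigmoid is an assumption about the forward pass, which this hunk does not show:

```python
import torch

def calibrated_probability(logits: torch.Tensor, temperature: float) -> torch.Tensor:
    # Temperature scaling: soften logits by a learned scalar T > 0.
    # T > 1 pulls probabilities toward 0.5 without moving the decision boundary.
    return torch.sigmoid(logits / temperature)
```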
@@ -141,17 +145,16 @@ class ProtT5Generator(nn.Module):
         sequences.append(seq)
     return sequences

-# --- CRITICAL DEPENDENCY: feature_extract.py ---
+# --- CRITICAL DEPENDENCY: feature_extract.py (Unchanged) ---
 try:
     from feature_extract import extract_features
 except ImportError:
     raise gr.Error("Fatal Error: `feature_extract.py` not found. This file is required. Please upload it to your repository.")

-# --- Clustering Logic ---
+# --- Clustering Logic (Unchanged) ---
 def cluster_sequences(generator, sequences, num_clusters, device):
     if not sequences or len(sequences) < num_clusters:
         return sequences[:num_clusters]
-
     with torch.no_grad():
         token_ids_list = []
         max_len = max((len(seq) for seq in sequences), default=0) + 2
@@ -160,13 +163,11 @@ def cluster_sequences(generator, sequences, num_clusters, device):
             ids = [np.random.randint(2, VOCAB_SIZE)] + ids
             ids += [token2id["<PAD>"]] * (max_len - len(ids))
             token_ids_list.append(ids)
-
         input_ids = torch.tensor(token_ids_list, dtype=torch.long, device=device)
         embeddings = generator.embed_tokens(input_ids)
         mask = (input_ids != token2id["<PAD>"]).unsqueeze(-1).float()
         seq_embeds = (embeddings * mask).sum(dim=1) / (mask.sum(dim=1) + 1e-9)
         seq_embeds_np = seq_embeds.cpu().numpy()
-
         kmeans = KMeans(n_clusters=int(num_clusters), random_state=42, n_init='auto').fit(seq_embeds_np)
         representatives = []
         for i in range(int(num_clusters)):
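The function above masks out `<PAD>` positions, mean-pools the token embeddings, clusters the pooled vectors, and keeps the sequence nearest each centroid. The selection step in isolation, as a self-contained sketch (`pick_representatives` is an illustrative name, not a function in the app):

```python
import numpy as np
from sklearn.cluster import KMeans

def pick_representatives(embeddings: np.ndarray, k: int) -> list[int]:
    """Return the index of the point nearest each of k cluster centroids."""
    km = KMeans(n_clusters=k, random_state=42, n_init="auto").fit(embeddings)
    reps = []
    for i in range(k):
        members = np.where(km.labels_ == i)[0]
        if len(members) == 0:
            continue  # K-means can leave a cluster empty; skip it
        dists = np.linalg.norm(embeddings[members] - km.cluster_centers_[i], axis=1)
        reps.append(int(members[np.argmin(dists)]))
    return reps
```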
@@ -191,10 +192,12 @@ try:
     PREDICTOR_CHECKPOINT_PATH = "checkpoints/final_rl_model_logitp0.1_calibrated_FINETUNED_PROTT5.pth"
     SCALER_PATH = "checkpoints/scaler_FINETUNED_PROTT5.pkl"
     GENERATOR_CHECKPOINT_PATH = "generator_checkpoints_v3.6/final_generator_model.pth"
+
+    # ✅ UPDATED: Define paths for LoRA-based loading
     PROTT5_BASE_MODEL_ID = "Rostlab/prot_t5_xl_uniref50"
-    FINETUNED_PROTT5_FOR_FEATURES_PATH = "prott5/model/finetuned_prott5.bin"
+    LORA_ADAPTER_PATH = "./lora_finetuned_prott5"  # Assumes LoRA files are in this directory

-    # --- Load Predictor Model ---
+    # --- Load Predictor Model (Head) ---
     print(f"Loading Predictor from: {PREDICTOR_CHECKPOINT_PATH}")
     PREDICTOR_MODEL = AntioxidantPredictor(input_dim=1914)
     PREDICTOR_MODEL.load_state_dict(torch.load(PREDICTOR_CHECKPOINT_PATH, map_location=DEVICE))
@@ -202,13 +205,14 @@ try:
     PREDICTOR_MODEL.eval()
     print(f"✅ Predictor model loaded (Temp: {PREDICTOR_MODEL.get_temperature():.4f}).")

-    # --- Load Scaler & Feature Extractor ---
+    # --- Load Scaler & LoRA Feature Extractor ---
     print(f"Loading Scaler from: {SCALER_PATH}")
     SCALER = joblib.load(SCALER_PATH)
-    print("Loading ProtT5 Feature Extractor...")
-    PROTT5_EXTRACTOR = FeatureProtT5Model(
+    print("Loading LoRA-enhanced ProtT5 Feature Extractor...")
+    # UPDATED: Instantiate the new LoRA extractor class
+    PROTT5_EXTRACTOR = LoRAProtT5Extractor(
         base_model_id=PROTT5_BASE_MODEL_ID,
-        finetuned_weights_path=FINETUNED_PROTT5_FOR_FEATURES_PATH
+        lora_adapter_path=LORA_ADAPTER_PATH
     )
     print("✅ Scaler and Feature Extractor loaded.")

@@ -227,7 +231,7 @@ except Exception as e:
     raise gr.Error(f"A required model or file could not be loaded. Please check your repository file structure and paths. Error details: {e}")

 # --------------------------------------------------------------------------
-# SECTION 3: WRAPPER FUNCTIONS FOR GRADIO UI
+# SECTION 3: WRAPPER FUNCTIONS FOR GRADIO UI (Unchanged logic)
 # --------------------------------------------------------------------------

 def predict_peptide_wrapper(sequence_str):
@@ -235,6 +239,8 @@ def predict_peptide_wrapper(sequence_str):
         return "0.0000", "Error: Please enter a valid peptide sequence using standard amino acids (ACDEFGHIKLMNPQRSTVWY)."

     try:
+        # This function call remains the same because the PROTT5_EXTRACTOR object,
+        # despite its new internal logic, provides the same interface.
         features = extract_features(sequence_str.upper(), PROTT5_EXTRACTOR, L_fixed=29, d_model_pe=16)
         scaled_features = SCALER.transform(features.reshape(1, -1))

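The added comment leans on duck typing: `extract_features` never checks the extractor's class, it only calls its `encode` method. Expressed as an illustrative `typing.Protocol` (not code that exists in the app):

```python
from typing import Protocol
import numpy as np

class SequenceEncoder(Protocol):
    # Any extractor passed to extract_features just needs this method,
    # returning a (sequence_length, 1024) embedding array, as both the old
    # FeatureProtT5Model and the new LoRAProtT5Extractor do.
    def encode(self, sequence: str) -> np.ndarray: ...
```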
@@ -251,29 +257,23 @@ def predict_peptide_wrapper(sequence_str):
         return "N/A", f"An error occurred during prediction: {e}"

 def generate_peptide_wrapper(num_to_generate, min_len, max_len, temperature, diversity_factor, progress=gr.Progress()):
-    """
-    Handles the full generation-validation-clustering pipeline with a loop to ensure
-    the target number of peptides is generated.
-    """
     num_to_generate = int(num_to_generate)
     min_len = int(min_len)
     max_len = int(max_len)

-    # Safety check for length
     if min_len > max_len:
         gr.Warning("Minimum Length cannot be greater than Maximum Length. Adjusting min_len = max_len.")
         min_len = max_len

     try:
-        validated_pool = {} # Use a dictionary to store unique sequences and their probabilities
+        validated_pool = {}
         attempts = 0
-        max_attempts = 20 # Safety break to prevent infinite loops
-        generation_batch_size = 200 # Number of sequences to generate in each attempt
+        max_attempts = 20
+        generation_batch_size = 200

         while len(validated_pool) < num_to_generate and attempts < max_attempts:
             progress(len(validated_pool) / num_to_generate, desc=f"Found {len(validated_pool)} / {num_to_generate} peptides. (Attempt {attempts+1}/{max_attempts})")

-            # Generate a batch of candidate sequences
             with torch.no_grad():
                 generated_tokens = GENERATOR_MODEL.sample(
                     batch_size=generation_batch_size, max_length=max_len, device=DEVICE,
@@ -281,21 +281,18 @@ def generate_peptide_wrapper(num_to_generate, min_len, max_len, temperature, diversity_factor, progress=gr.Progress()):
                 )
             decoded_sequences = GENERATOR_MODEL.decode(generated_tokens)

-            # Filter for length and uniqueness
             new_candidates = []
             for seq in decoded_sequences:
                 if min_len <= len(seq) <= max_len:
                     if seq not in validated_pool:
                         new_candidates.append(seq)

-            # Validate the new, unique candidates
             for seq in new_candidates:
                 prob_str, _ = predict_peptide_wrapper(seq)
                 try:
                     prob = float(prob_str)
                     if prob > 0.90:
                         validated_pool[seq] = prob
-                        # Check if we have reached the target
                         if len(validated_pool) >= num_to_generate:
                             break
                 except (ValueError, TypeError):
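Stripped of the Gradio progress plumbing and the length filter, the two hunks above implement a sample-until-quota loop over a deduplicating pool. A condensed, self-contained sketch — `sample_batch` and `score` are placeholders standing in for `GENERATOR_MODEL.sample`/`decode` and the calibrated predictor:

```python
from typing import Callable, Dict, List

def fill_quota(sample_batch: Callable[[], List[str]],
               score: Callable[[str], float],
               target: int, threshold: float = 0.90,
               max_attempts: int = 20) -> Dict[str, float]:
    pool: Dict[str, float] = {}  # dict keys deduplicate sequences
    attempts = 0
    while len(pool) < target and attempts < max_attempts:
        for seq in sample_batch():
            if seq in pool:
                continue
            p = score(seq)
            if p > threshold:
                pool[seq] = p
                if len(pool) >= target:
                    break
        attempts += 1  # safety break prevents an infinite loop
    return pool
```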
@@ -311,13 +308,9 @@ def generate_peptide_wrapper(num_to_generate, min_len, max_len, temperature, diversity_factor, progress=gr.Progress()):
         if not validated_pool:
             return pd.DataFrame([{"Sequence": "Could not generate any high-activity peptides (>0.9 prob) with the current settings. Try different parameters.", "Predicted Probability": "N/A"}])

-        # --- Final Processing ---
         high_quality_sequences = list(validated_pool.keys())
-
-        # Cluster to ensure diversity, selecting up to the target number
         final_diverse_seqs = cluster_sequences(GENERATOR_MODEL, high_quality_sequences, num_to_generate, DEVICE)

-        # Format final results into a DataFrame
         final_results = [(seq, f"{validated_pool[seq]:.4f}") for seq in final_diverse_seqs]
         final_results.sort(key=lambda x: float(x[1]), reverse=True)

@@ -328,7 +321,7 @@ def generate_peptide_wrapper(num_to_generate, min_len, max_len, temperature, diversity_factor, progress=gr.Progress()):
         return pd.DataFrame([{"Sequence": f"An error occurred during generation: {e}", "Predicted Probability": "N/A"}])

 # --------------------------------------------------------------------------
-# SECTION 4: GRADIO UI CONSTRUCTION
+# SECTION 4: GRADIO UI CONSTRUCTION (Unchanged)
 # --------------------------------------------------------------------------
 with gr.Blocks(theme=gr.themes.Soft(), title="RLAnOxPeptide") as demo:
     gr.Markdown("# RLAnOxPeptide: Intelligent Peptide Design and Prediction")
@@ -364,7 +357,6 @@ with gr.Blocks(theme=gr.themes.Soft(), title="RLAnOxPeptide") as demo:
             with gr.Column():
                 with gr.Row():
                     num_input = gr.Slider(minimum=5, maximum=50, value=10, step=1, label="Number of Final Peptides to Generate")
-                    # ✅ MODIFIED: Length sliders both have a range of 2-20
                     min_len_input = gr.Slider(minimum=2, maximum=20, value=3, step=1, label="Minimum Length")
                     max_len_input = gr.Slider(minimum=2, maximum=20, value=20, step=1, label="Maximum Length")
                 with gr.Row():
@@ -374,10 +366,9 @@ with gr.Blocks(theme=gr.themes.Soft(), title="RLAnOxPeptide") as demo:
     generate_button = gr.Button("Generate Peptides", variant="primary")
     results_output = gr.DataFrame(headers=["Sequence", "Predicted Probability"], label="Generated & Validated Peptides (>90% Probability)", wrap=True)

-    # ✅ ADDED: Dynamic linking of min and max length sliders for better UX
     def update_min_len_range(max_len):
         return gr.Slider(maximum=max_len)
-    max_len_input.change(fn=update_min_len_range, inputs=max_len_input, outputs=min_len_input)
+    max_len_input.change(fn=update_min_len_range, inputs=max_len_input, outputs=max_len_input)

     def update_max_len_range(min_len):
         return gr.Slider(minimum=min_len)
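One caution on this last hunk: the re-added `.change()` call now routes `update_min_len_range` back to `max_len_input` itself rather than to `min_len_input` as before, which would cap the max slider at its own current value instead of constraining the min slider. A minimal standalone sketch of two-way slider linking in which each callback targets the other slider (the usual intent):

```python
import gradio as gr

with gr.Blocks() as demo:
    min_len = gr.Slider(2, 20, value=3, step=1, label="Minimum Length")
    max_len = gr.Slider(2, 20, value=20, step=1, label="Maximum Length")
    # When max changes, cap the min slider's range at the new max.
    max_len.change(lambda m: gr.Slider(maximum=m), inputs=max_len, outputs=min_len)
    # When min changes, floor the max slider's range at the new min.
    min_len.change(lambda m: gr.Slider(minimum=m), inputs=min_len, outputs=max_len)

# demo.launch()  # uncomment to run locally
```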
 