Spaces:

chshan
/

RLAnOxPeptide

Sleeping

App Files Files Community

chshan committed on Jul 20

Commit

6f96910

verified ·

1 Parent(s): d1b4723

Update app.py

Browse files

Files changed (1) hide show

app.py +128 -100

app.py CHANGED Viewed

@@ -1,5 +1,26 @@
-# app.py - RLAnOxPeptide Gradio Web Application (FINAL CORRECTED VERSION - Robust Loading)
 import os
 import torch
@@ -12,14 +33,14 @@ from sklearn.cluster import KMeans
 from tqdm import tqdm
 import transformers
-# Suppress verbose logging from transformers
 transformers.logging.set_verbosity_error()
 # --------------------------------------------------------------------------
 # SECTION 1: CORE CLASS AND FUNCTION DEFINITIONS
 # --------------------------------------------------------------------------
-# --- Vocabulary Definition ---
 AMINO_ACIDS = "ACDEFGHIKLMNPQRSTVWY"
 token2id = {aa: i + 2 for i, aa in enumerate(AMINO_ACIDS)}
 token2id["<PAD>"] = 0
@@ -27,71 +48,71 @@ token2id["<EOS>"] = 1
 id2token = {i: t for t, i in token2id.items()}
 VOCAB_SIZE = len(token2id)
-# --- ROBUST FeatureProtT5Model Class for Feature Extraction ---
 class FeatureProtT5Model:
     def __init__(self, model_dir_path, finetuned_weights_path=None):
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        print(f"Initializing ProtT5 from base directory: {model_dir_path}")
-        # Step 1: Load the base model architecture and tokenizer from the directory.
-        # This step requires the original pytorch_model.bin to be in the model_dir_path.
         self.tokenizer = transformers.T5Tokenizer.from_pretrained(model_dir_path, do_lower_case=False)
         self.model = transformers.T5EncoderModel.from_pretrained(model_dir_path)
-        # Step 2: If a separate fine-tuned weights file is provided, load it.
         if finetuned_weights_path and os.path.exists(finetuned_weights_path):
-            print(f"Loading and applying fine-tuned weights from: {finetuned_weights_path}")
-            # Load the state_dict from your specific fine-tuned file
             state_dict = torch.load(finetuned_weights_path, map_location=self.device)
-            # Use strict=False because the fine-tuned model may only contain encoder weights
             self.model.load_state_dict(state_dict, strict=False)
-            print("Successfully applied fine-tuned weights to the model.")
         else:
-            print("Warning: Fine-tuned weights file not provided or not found. Using the base ProtT5 model weights.")
         self.model.to(self.device)
         self.model.eval()
-    def encode(self, sequence):
-        if not sequence or not isinstance(sequence, str):
-            return np.zeros((1, 1024), dtype=np.float32)
-        seq_spaced = " ".join(list(sequence))
-        encoded_input = self.tokenizer(seq_spaced, return_tensors='pt', padding=True, truncation=True, max_length=1022)
-        encoded_input = {k: v.to(self.device) for k, v in encoded_input.items()}
-        with torch.no_grad():
-            embedding = self.model(**encoded_input).last_hidden_state
-        emb = embedding.squeeze(0).cpu().numpy()
-        return emb if emb.shape[0] > 0 else np.zeros((1, 1024), dtype=np.float32)
 # --- Predictor Model Architecture ---
 class AntioxidantPredictor(nn.Module):
-    def __init__(self, input_dim, transformer_layers=3, transformer_heads=4, transformer_dropout=0.1):
         super(AntioxidantPredictor, self).__init__()
         self.prott5_dim = 1024
         self.handcrafted_dim = input_dim - self.prott5_dim
         self.seq_len = 16
-        self.prott5_feature_dim = 64
         encoder_layer = nn.TransformerEncoderLayer(d_model=self.prott5_feature_dim, nhead=transformer_heads, dropout=transformer_dropout, batch_first=True)
         self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=transformer_layers)
         fused_dim = self.prott5_feature_dim + self.handcrafted_dim
         self.fusion_fc = nn.Sequential(nn.Linear(fused_dim, 1024), nn.ReLU(), nn.Dropout(0.3), nn.Linear(1024, 512), nn.ReLU(), nn.Dropout(0.3))
         self.classifier = nn.Sequential(nn.Linear(512, 256), nn.ReLU(), nn.Dropout(0.3), nn.Linear(256, 1))
         self.temperature = nn.Parameter(torch.ones(1), requires_grad=False)
-    def forward(self, x, *args):
         batch_size = x.size(0)
         prot_t5_features = x[:, :self.prott5_dim]
         handcrafted_features = x[:, self.prott5_dim:]
         prot_t5_seq = prot_t5_features.view(batch_size, self.seq_len, self.prott5_feature_dim)
         encoded_seq = self.transformer_encoder(prot_t5_seq)
         refined_prott5 = encoded_seq.mean(dim=1)
         fused_features = torch.cat([refined_prott5, handcrafted_features], dim=1)
-        fused_features = self.fusion_fc(fused_features)
-        logits = self.classifier(fused_features)
         return logits / self.temperature
-    def set_temperature(self, temp_value, device): self.temperature = nn.Parameter(torch.tensor([temp_value], device=device), requires_grad=False)
-    def get_temperature(self): return self.temperature.item()
-# --- Generator Model Architecture (Copied VERBATIM from your generator.py) ---
 class ProtT5Generator(nn.Module):
     def __init__(self, vocab_size, embed_dim=512, num_layers=6, num_heads=8, dropout=0.1):
         super(ProtT5Generator, self).__init__()
@@ -117,49 +138,50 @@ class ProtT5Generator(nn.Module):
             next_logits = logits[:, -1, :] / temperature
             if generated.size(1) < min_decoded_length:
                 next_logits[:, self.eos_token_id] = -float("inf")
             probs = torch.softmax(next_logits, dim=-1)
             next_token = torch.multinomial(probs, num_samples=1)
             generated = torch.cat((generated, next_token), dim=1)
-            if (generated == self.eos_token_id).any(dim=1).all():
-                break
         return generated
     def decode(self, token_ids_batch):
-        seqs = []
         for ids_tensor in token_ids_batch:
             seq = ""
-            for token_id in ids_tensor.tolist()[1:]: # Skip start token
                 if token_id == self.eos_token_id: break
                 if token_id == self.pad_token_id: continue
-                seq += id2token.get(token_id, "?")
-            seqs.append(seq)
-        return seqs
-# --- Feature Extraction (needs feature_extract.py in the same directory) ---
 try:
-    from feature_extract import ProtT5Model as FeatureProtT5Model, extract_features
 except ImportError:
-    raise gr.Error("Failed to import feature_extract.py. Ensure it is in the same directory.")
 # --- Clustering Logic (from generator.py) ---
 def cluster_sequences(generator, sequences, num_clusters, device):
     if not sequences or len(sequences) < num_clusters:
         return sequences[:num_clusters]
     with torch.no_grad():
         token_ids_list = []
-        max_len = max(len(seq) for seq in sequences) + 2
         for seq in sequences:
             ids = [token2id.get(aa, 0) for aa in seq] + [generator.eos_token_id]
-            ids = [np.random.randint(2, VOCAB_SIZE)] + ids
             ids += [token2id["<PAD>"]] * (max_len - len(ids))
             token_ids_list.append(ids)
         input_ids = torch.tensor(token_ids_list, dtype=torch.long, device=device)
         embeddings = generator.embed_tokens(input_ids)
         mask = (input_ids != token2id["<PAD>"]).unsqueeze(-1).float()
-        embeddings = embeddings * mask
-        lengths = mask.sum(dim=1)
-        seq_embeds = embeddings.sum(dim=1) / (lengths + 1e-9)
         seq_embeds_np = seq_embeds.cpu().numpy()
     kmeans = KMeans(n_clusters=int(num_clusters), random_state=42, n_init='auto').fit(seq_embeds_np)
@@ -175,67 +197,67 @@ def cluster_sequences(generator, sequences, num_clusters, device):
     return representatives
 # --------------------------------------------------------------------------
-# SECTION 2: GLOBAL MODEL LOADING
 # --------------------------------------------------------------------------
-print("Loading all models and dependencies...")
-DEVICE = "cpu"
 try:
-    # --- Define file paths (!! CHECK THESE PATHS !!) ---
     PREDICTOR_CHECKPOINT_PATH = "checkpoints/final_rl_model_logitp0.1_calibrated_FINETUNED_PROTT5.pth"
     SCALER_PATH = "checkpoints/scaler_FINETUNED_PROTT5.pkl"
     GENERATOR_CHECKPOINT_PATH = "generator_checkpoints_v3.6/final_generator_model.pth"
     PROTT5_BASE_MODEL_PATH = "prott5/model/"
-    # This path is now used by the FeatureProtT5Model to load the fine-tuned weights
     FINETUNED_PROTT5_FOR_FEATURES_PATH = "prott5/model/finetuned_prott5.bin"
-    # --- Load Predictor ---
-    print("Loading Predictor Model...")
-    # Initialize the correct class
-    PREDICTOR_MODEL = AntioxidantPredictor(
-        input_dim=1914, transformer_layers=3, transformer_heads=4, transformer_dropout=0.1
-    )
-    # Load the state dict that matches this class
     PREDICTOR_MODEL.load_state_dict(torch.load(PREDICTOR_CHECKPOINT_PATH, map_location=DEVICE))
     PREDICTOR_MODEL.to(DEVICE)
     PREDICTOR_MODEL.eval()
     print(f"✅ Predictor model loaded (Temp: {PREDICTOR_MODEL.get_temperature():.4f}).")
     # --- Load Scaler & Feature Extractor ---
-    print("Loading Scaler and Feature Extractor...")
     SCALER = joblib.load(SCALER_PATH)
     PROTT5_EXTRACTOR = FeatureProtT5Model(
-        model_path=PROTT5_BASE_MODEL_PATH,
-        finetuned_model_file=FINETUNED_PROTT5_FOR_FEATURES_PATH
     )
     print("✅ Scaler and Feature Extractor loaded.")
-    # --- Load Generator ---
-    print("Loading Generator Model...")
-    GENERATOR_MODEL = ProtT5Generator(
-        vocab_size=VOCAB_SIZE, embed_dim=512, num_layers=6, num_heads=8, dropout=0.1
-    )
     GENERATOR_MODEL.load_state_dict(torch.load(GENERATOR_CHECKPOINT_PATH, map_location=DEVICE))
     GENERATOR_MODEL.to(DEVICE)
     GENERATOR_MODEL.eval()
     print("✅ Generator model loaded.")
-    print("\n--- All models loaded successfully! Gradio app is ready. ---\n")
 except Exception as e:
-    print(f"💥 FATAL ERROR: Failed to load a model or dependency file: {e}")
-    raise gr.Error(f"Model or dependency loading failed! Check file paths and integrity. Error: {e}")
 # --------------------------------------------------------------------------
-# SECTION 3: WRAPPER FUNCTIONS FOR GRADIO
 # --------------------------------------------------------------------------
 def predict_peptide_wrapper(sequence_str):
     if not sequence_str or not isinstance(sequence_str, str) or any(c not in AMINO_ACIDS for c in sequence_str.upper()):
-        return "0.0000", "Error: Please enter a valid sequence with standard amino acids."
     try:
-        # These L_fixed and d_model_pe values are from your predictor.py args
-        features = extract_features(sequence_str, PROTT5_EXTRACTOR, L_fixed=29, d_model_pe=16)
         scaled_features = SCALER.transform(features.reshape(1, -1))
         with torch.no_grad():
@@ -247,21 +269,22 @@ def predict_peptide_wrapper(sequence_str):
         return f"{probability:.4f}", classification
     except Exception as e:
-        print(f"Prediction error for sequence '{sequence_str}': {e}")
-        return "N/A", f"An error occurred during processing: {e}"
 def generate_peptide_wrapper(num_to_generate, min_len, max_len, temperature, diversity_factor, progress=gr.Progress(track_tqdm=True)):
-    # This logic is a direct adaptation of your generator.py main function
     num_to_generate = int(num_to_generate)
     min_len = int(min_len)
     max_len = int(max_len)
     try:
-        # Step 1: Generate a pool of unique sequences
         target_pool_size = int(num_to_generate * diversity_factor)
         unique_seqs = set()
-        with tqdm(total=target_pool_size, desc="Generating candidate sequences") as pbar:
             while len(unique_seqs) < target_pool_size:
                 batch_size = max(1, (target_pool_size - len(unique_seqs)))
                 with torch.no_grad():
@@ -269,19 +292,19 @@ def generate_peptide_wrapper(num_to_generate, min_len, max_len, temperature, div
                         batch_size=batch_size, max_length=max_len, device=DEVICE,
                         temperature=temperature, min_decoded_length=min_len
                     )
-                decoded = GENERATOR_MODEL.decode(generated_tokens.cpu())
                 initial_count = len(unique_seqs)
-                for seq in decoded:
                     if min_len <= len(seq) <= max_len:
                         unique_seqs.add(seq)
                 pbar.update(len(unique_seqs) - initial_count)
         candidate_seqs = list(unique_seqs)
-        # Step 2: Validate the generated sequences
         validated_pool = {}
-        for seq in tqdm(candidate_seqs, desc="Validating generated sequences"):
             prob_str, _ = predict_peptide_wrapper(seq)
             try:
                 prob = float(prob_str)
@@ -291,40 +314,41 @@ def generate_peptide_wrapper(num_to_generate, min_len, max_len, temperature, div
                 continue
         if not validated_pool:
-            return pd.DataFrame([{"Sequence": "No high-activity peptides (>0.9 prob) were generated.", "Predicted Probability": "N/A"}])
         high_quality_sequences = list(validated_pool.keys())
-        # Step 3: Cluster to ensure diversity
-        progress(1.0, desc="Clustering for diversity...")
         final_diverse_seqs = cluster_sequences(GENERATOR_MODEL, high_quality_sequences, num_to_generate, DEVICE)
-        # Step 4: Format final results
         final_results = [(seq, f"{validated_pool[seq]:.4f}") for seq in final_diverse_seqs]
         final_results.sort(key=lambda x: float(x[1]), reverse=True)
         return pd.DataFrame(final_results, columns=["Sequence", "Predicted Probability"])
     except Exception as e:
-        print(f"Generation error: {e}")
-        return pd.DataFrame([{"Sequence": f"An error occurred: {e}", "Predicted Probability": "N/A"}])
 # --------------------------------------------------------------------------
 # SECTION 4: GRADIO UI CONSTRUCTION
 # --------------------------------------------------------------------------
 with gr.Blocks(theme=gr.themes.Soft(), title="RLAnOxPeptide") as demo:
-    gr.Markdown("# RLAnOxPeptide: Intelligent Peptide Design and Prediction Platform")
     gr.Markdown("An integrated framework combining reinforcement learning and a Transformer model for the efficient prediction and innovative design of antioxidant peptides.")
     with gr.Tabs():
         with gr.TabItem("Peptide Activity Predictor"):
             gr.Markdown("### Enter an amino acid sequence to predict its antioxidant activity.")
             with gr.Row():
                 peptide_input = gr.Textbox(label="Peptide Sequence", placeholder="e.g., WHYHDYKY", scale=3)
                 predict_button = gr.Button("Predict", variant="primary", scale=1)
             with gr.Row():
-                probability_output = gr.Textbox(label="Predicted Probability")
-                class_output = gr.Textbox(label="Predicted Class")
             predict_button.click(
                 fn=predict_peptide_wrapper,
@@ -332,23 +356,27 @@ with gr.Blocks(theme=gr.themes.Soft(), title="RLAnOxPeptide") as demo:
                 outputs=[probability_output, class_output]
             )
             gr.Examples(
-                examples=[["WHYHDYKY"], ["YPGG"], ["LVLHEHGGN"]],
-                inputs=peptide_input
             )
         with gr.TabItem("Novel Sequence Generator"):
             gr.Markdown("### Set parameters to generate novel, high-activity antioxidant peptides.")
             with gr.Column():
                 with gr.Row():
-                    num_input = gr.Slider(minimum=1, maximum=50, value=10, step=1, label="Number of Final Peptides to Generate")
-                    min_len_input = gr.Slider(minimum=2, maximum=10, value=3, step=1, label="Minimum Length")
                     max_len_input = gr.Slider(minimum=10, maximum=20, value=20, step=1, label="Maximum Length")
                 with gr.Row():
                     temp_input = gr.Slider(minimum=0.5, maximum=3.0, value=2.5, step=0.1, label="Temperature (Higher = More random)")
-                    diversity_input = gr.Slider(minimum=1.0, maximum=3.0, value=1.2, step=0.1, label="Diversity Factor (Higher = Larger initial pool for clustering)")
             generate_button = gr.Button("Generate Peptides", variant="primary")
-            results_output = gr.DataFrame(headers=["Sequence", "Predicted Probability"], label="Generated & Validated Peptides", wrap=True)
             generate_button.click(
                 fn=generate_peptide_wrapper,

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# app.py - RLAnOxPeptide Gradio Web Application
+# This script combines logic from predictor.py, generator.py, and the original app.py
+# into a single, self-contained file for a Hugging Face Space.
+#
+# REQUIRED FILE STRUCTURE IN HUGGING FACE REPO:
+# .
+# ├── app.py                  (This file)
+# ├── feature_extract.py      (CRITICAL: This file with your `extract_features` function MUST be present)
+# ├── checkpoints/
+# │   ├── final_rl_model_logitp0.1_calibrated_FINETUNED_PROTT5.pth
+# │   └── scaler_FINETUNED_PROTT5.pkl
+# ├── generator_checkpoints_v3.6/
+# │   └── final_generator_model.pth
+# ├── prott5/
+# │   └── model/
+# │       ├── config.json
+# │       ├── pytorch_model.bin  (The base ProtT5 model from Rostlab)
+# │       ├── finetuned_prott5.bin (Your fine-tuned feature extractor weights)
+# │       └── ... (other tokenizer files)
+# └── requirements.txt
 import os
 import torch
 from tqdm import tqdm
 import transformers
+# Suppress verbose logging from transformers, which can clutter the app logs
 transformers.logging.set_verbosity_error()
 # --------------------------------------------------------------------------
 # SECTION 1: CORE CLASS AND FUNCTION DEFINITIONS
 # --------------------------------------------------------------------------
+# --- Vocabulary Definition (Consistent across all scripts) ---
 AMINO_ACIDS = "ACDEFGHIKLMNPQRSTVWY"
 token2id = {aa: i + 2 for i, aa in enumerate(AMINO_ACIDS)}
 token2id["<PAD>"] = 0
 id2token = {i: t for t, i in token2id.items()}
 VOCAB_SIZE = len(token2id)
+# --- Feature Extractor Model Class (For ProtT5) ---
+# This class robustly loads the base ProtT5 model and applies your fine-tuned weights.
 class FeatureProtT5Model:
     def __init__(self, model_dir_path, finetuned_weights_path=None):
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        print(f"Initializing ProtT5 for feature extraction on device: {self.device}")
+        # Load the base model architecture and tokenizer from the specified directory.
         self.tokenizer = transformers.T5Tokenizer.from_pretrained(model_dir_path, do_lower_case=False)
         self.model = transformers.T5EncoderModel.from_pretrained(model_dir_path)
+        # If a path to a fine-tuned weights file is provided, load and apply those weights.
         if finetuned_weights_path and os.path.exists(finetuned_weights_path):
+            print(f"Applying fine-tuned weights from: {finetuned_weights_path}")
             state_dict = torch.load(finetuned_weights_path, map_location=self.device)
             self.model.load_state_dict(state_dict, strict=False)
+            print("Successfully applied fine-tuned weights.")
         else:
+            print("Warning: Fine-tuned weights not found or not provided. Using base ProtT5 weights.")
         self.model.to(self.device)
         self.model.eval()
 # --- Predictor Model Architecture ---
+# This is the antioxidant activity predictor model. Its architecture must
+# exactly match the architecture used to save the checkpoint file.
 class AntioxidantPredictor(nn.Module):
+    def __init__(self, input_dim=1914, transformer_layers=3, transformer_heads=4, transformer_dropout=0.1):
         super(AntioxidantPredictor, self).__init__()
         self.prott5_dim = 1024
         self.handcrafted_dim = input_dim - self.prott5_dim
         self.seq_len = 16
+        self.prott5_feature_dim = 64 # 16 * 64 = 1024
         encoder_layer = nn.TransformerEncoderLayer(d_model=self.prott5_feature_dim, nhead=transformer_heads, dropout=transformer_dropout, batch_first=True)
         self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=transformer_layers)
         fused_dim = self.prott5_feature_dim + self.handcrafted_dim
         self.fusion_fc = nn.Sequential(nn.Linear(fused_dim, 1024), nn.ReLU(), nn.Dropout(0.3), nn.Linear(1024, 512), nn.ReLU(), nn.Dropout(0.3))
         self.classifier = nn.Sequential(nn.Linear(512, 256), nn.ReLU(), nn.Dropout(0.3), nn.Linear(256, 1))
         self.temperature = nn.Parameter(torch.ones(1), requires_grad=False)
+    def forward(self, x):
         batch_size = x.size(0)
+        # The input 'x' is a flat 1914-dim vector from extract_features()
         prot_t5_features = x[:, :self.prott5_dim]
         handcrafted_features = x[:, self.prott5_dim:]
+        # Reshape the first 1024 features back into a sequence representation
         prot_t5_seq = prot_t5_features.view(batch_size, self.seq_len, self.prott5_feature_dim)
         encoded_seq = self.transformer_encoder(prot_t5_seq)
         refined_prott5 = encoded_seq.mean(dim=1)
         fused_features = torch.cat([refined_prott5, handcrafted_features], dim=1)
+        fused_output = self.fusion_fc(fused_features)
+        logits = self.classifier(fused_output)
         return logits / self.temperature
+    def get_temperature(self):
+        return self.temperature.item()
+# --- Generator Model Architecture (from generator.py) ---
 class ProtT5Generator(nn.Module):
     def __init__(self, vocab_size, embed_dim=512, num_layers=6, num_heads=8, dropout=0.1):
         super(ProtT5Generator, self).__init__()
             next_logits = logits[:, -1, :] / temperature
             if generated.size(1) < min_decoded_length:
                 next_logits[:, self.eos_token_id] = -float("inf")
             probs = torch.softmax(next_logits, dim=-1)
             next_token = torch.multinomial(probs, num_samples=1)
             generated = torch.cat((generated, next_token), dim=1)
         return generated
     def decode(self, token_ids_batch):
+        sequences = []
         for ids_tensor in token_ids_batch:
             seq = ""
+            for token_id in ids_tensor.tolist()[1:]: # Skip the random start token
                 if token_id == self.eos_token_id: break
                 if token_id == self.pad_token_id: continue
+                seq += id2token.get(token_id, "")
+            sequences.append(seq)
+        return sequences
+# --- CRITICAL DEPENDENCY: feature_extract.py ---
+# This application requires a function named `extract_features` to convert a peptide
+# sequence into a 1914-dimensional feature vector for the prediction model.
+# This function must be defined in a file named `feature_extract.py` in the repository root.
 try:
+    from feature_extract import extract_features
 except ImportError:
+    raise gr.Error("Fatal Error: `feature_extract.py` not found. This file is required for the application to run. Please upload it to your repository.")
 # --- Clustering Logic (from generator.py) ---
 def cluster_sequences(generator, sequences, num_clusters, device):
     if not sequences or len(sequences) < num_clusters:
         return sequences[:num_clusters]
     with torch.no_grad():
         token_ids_list = []
+        max_len = max((len(seq) for seq in sequences), default=0) + 2
         for seq in sequences:
             ids = [token2id.get(aa, 0) for aa in seq] + [generator.eos_token_id]
+            ids = [np.random.randint(2, VOCAB_SIZE)] + ids # Prepend a start token
             ids += [token2id["<PAD>"]] * (max_len - len(ids))
             token_ids_list.append(ids)
         input_ids = torch.tensor(token_ids_list, dtype=torch.long, device=device)
         embeddings = generator.embed_tokens(input_ids)
         mask = (input_ids != token2id["<PAD>"]).unsqueeze(-1).float()
+        seq_embeds = (embeddings * mask).sum(dim=1) / (mask.sum(dim=1) + 1e-9)
         seq_embeds_np = seq_embeds.cpu().numpy()
     kmeans = KMeans(n_clusters=int(num_clusters), random_state=42, n_init='auto').fit(seq_embeds_np)
     return representatives
 # --------------------------------------------------------------------------
+# SECTION 2: GLOBAL MODEL AND DEPENDENCY LOADING
 # --------------------------------------------------------------------------
+print("--- Starting Application: Loading all models and dependencies ---")
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 try:
+    # --- Define file paths relative to the repository root ---
     PREDICTOR_CHECKPOINT_PATH = "checkpoints/final_rl_model_logitp0.1_calibrated_FINETUNED_PROTT5.pth"
     SCALER_PATH = "checkpoints/scaler_FINETUNED_PROTT5.pkl"
     GENERATOR_CHECKPOINT_PATH = "generator_checkpoints_v3.6/final_generator_model.pth"
     PROTT5_BASE_MODEL_PATH = "prott5/model/"
     FINETUNED_PROTT5_FOR_FEATURES_PATH = "prott5/model/finetuned_prott5.bin"
+    # --- Load Predictor Model ---
+    print(f"Loading Predictor from: {PREDICTOR_CHECKPOINT_PATH}")
+    PREDICTOR_MODEL = AntioxidantPredictor(input_dim=1914)
     PREDICTOR_MODEL.load_state_dict(torch.load(PREDICTOR_CHECKPOINT_PATH, map_location=DEVICE))
     PREDICTOR_MODEL.to(DEVICE)
     PREDICTOR_MODEL.eval()
     print(f"✅ Predictor model loaded (Temp: {PREDICTOR_MODEL.get_temperature():.4f}).")
     # --- Load Scaler & Feature Extractor ---
+    print(f"Loading Scaler from: {SCALER_PATH}")
     SCALER = joblib.load(SCALER_PATH)
+    print("Loading ProtT5 Feature Extractor...")
     PROTT5_EXTRACTOR = FeatureProtT5Model(
+        model_dir_path=PROTT5_BASE_MODEL_PATH,
+        finetuned_weights_path=FINETUNED_PROTT5_FOR_FEATURES_PATH
     )
     print("✅ Scaler and Feature Extractor loaded.")
+    # --- Load Generator Model ---
+    print(f"Loading Generator from: {GENERATOR_CHECKPOINT_PATH}")
+    GENERATOR_MODEL = ProtT5Generator(vocab_size=VOCAB_SIZE)
     GENERATOR_MODEL.load_state_dict(torch.load(GENERATOR_CHECKPOINT_PATH, map_location=DEVICE))
     GENERATOR_MODEL.to(DEVICE)
     GENERATOR_MODEL.eval()
     print("✅ Generator model loaded.")
+    print("\n--- All models loaded! Gradio app is ready. ---\n")
 except Exception as e:
+    print(f"💥 FATAL ERROR during model loading: {e}")
+    raise gr.Error(f"A required model or file could not be loaded. Please check your repository file structure and paths. Error details: {e}")
 # --------------------------------------------------------------------------
+# SECTION 3: WRAPPER FUNCTIONS FOR GRADIO UI
 # --------------------------------------------------------------------------
 def predict_peptide_wrapper(sequence_str):
+    """Handles the prediction for a single peptide sequence from the UI."""
     if not sequence_str or not isinstance(sequence_str, str) or any(c not in AMINO_ACIDS for c in sequence_str.upper()):
+        return "0.0000", "Error: Please enter a valid peptide sequence using standard amino acids (ACDEFGHIKLMNPQRSTVWY)."
     try:
+        # Use the imported extract_features function.
+        # The L_fixed and d_model_pe values are taken from your original predictor.py arguments.
+        features = extract_features(sequence_str.upper(), PROTT5_EXTRACTOR, L_fixed=29, d_model_pe=16)
+        # Scale the features using the loaded scaler
         scaled_features = SCALER.transform(features.reshape(1, -1))
         with torch.no_grad():
         return f"{probability:.4f}", classification
     except Exception as e:
+        print(f"Prediction Error for sequence '{sequence_str}': {e}")
+        return "N/A", f"An error occurred during prediction: {e}"
 def generate_peptide_wrapper(num_to_generate, min_len, max_len, temperature, diversity_factor, progress=gr.Progress(track_tqdm=True)):
+    """Handles the full generation-validation-clustering pipeline."""
     num_to_generate = int(num_to_generate)
     min_len = int(min_len)
     max_len = int(max_len)
     try:
+        # Step 1: Generate a large, unique pool of candidate sequences
         target_pool_size = int(num_to_generate * diversity_factor)
         unique_seqs = set()
+        pbar_desc = "Step 1/3: Generating candidate sequences"
+        with tqdm(total=target_pool_size, desc=pbar_desc) as pbar:
             while len(unique_seqs) < target_pool_size:
                 batch_size = max(1, (target_pool_size - len(unique_seqs)))
                 with torch.no_grad():
                         batch_size=batch_size, max_length=max_len, device=DEVICE,
                         temperature=temperature, min_decoded_length=min_len
                     )
+                decoded_sequences = GENERATOR_MODEL.decode(generated_tokens)
                 initial_count = len(unique_seqs)
+                for seq in decoded_sequences:
                     if min_len <= len(seq) <= max_len:
                         unique_seqs.add(seq)
                 pbar.update(len(unique_seqs) - initial_count)
         candidate_seqs = list(unique_seqs)
+        # Step 2: Validate the generated sequences and filter for high probability
         validated_pool = {}
+        for seq in tqdm(candidate_seqs, desc="Step 2/3: Validating generated sequences"):
             prob_str, _ = predict_peptide_wrapper(seq)
             try:
                 prob = float(prob_str)
                 continue
         if not validated_pool:
+            return pd.DataFrame([{"Sequence": "No high-activity peptides (>0.9 prob) were generated. Try increasing the Diversity Factor or changing the Temperature.", "Predicted Probability": "N/A"}])
         high_quality_sequences = list(validated_pool.keys())
+        # Step 3: Cluster to ensure diversity in the final set
+        progress(1.0, desc="Step 3/3: Clustering for diversity...")
         final_diverse_seqs = cluster_sequences(GENERATOR_MODEL, high_quality_sequences, num_to_generate, DEVICE)
+        # Step 4: Format final results into a DataFrame
         final_results = [(seq, f"{validated_pool[seq]:.4f}") for seq in final_diverse_seqs]
         final_results.sort(key=lambda x: float(x[1]), reverse=True)
         return pd.DataFrame(final_results, columns=["Sequence", "Predicted Probability"])
     except Exception as e:
+        print(f"Generation Pipeline Error: {e}")
+        return pd.DataFrame([{"Sequence": f"An error occurred during generation: {e}", "Predicted Probability": "N/A"}])
 # --------------------------------------------------------------------------
 # SECTION 4: GRADIO UI CONSTRUCTION
 # --------------------------------------------------------------------------
 with gr.Blocks(theme=gr.themes.Soft(), title="RLAnOxPeptide") as demo:
+    gr.Markdown("# RLAnOxPeptide: Intelligent Peptide Design and Prediction")
     gr.Markdown("An integrated framework combining reinforcement learning and a Transformer model for the efficient prediction and innovative design of antioxidant peptides.")
     with gr.Tabs():
+        # --- PREDICTION TAB ---
         with gr.TabItem("Peptide Activity Predictor"):
             gr.Markdown("### Enter an amino acid sequence to predict its antioxidant activity.")
             with gr.Row():
                 peptide_input = gr.Textbox(label="Peptide Sequence", placeholder="e.g., WHYHDYKY", scale=3)
                 predict_button = gr.Button("Predict", variant="primary", scale=1)
             with gr.Row():
+                probability_output = gr.Textbox(label="Predicted Probability", interactive=False)
+                class_output = gr.Textbox(label="Predicted Class", interactive=False)
             predict_button.click(
                 fn=predict_peptide_wrapper,
                 outputs=[probability_output, class_output]
             )
             gr.Examples(
+                examples=[["WHYHDYKY"], ["YPGG"], ["LVLHEHGGN"], ["WKYG"]],
+                inputs=peptide_input,
+                fn=predict_peptide_wrapper,
+                outputs=[probability_output, class_output],
+                cache_examples=True
             )
+        # --- GENERATION TAB ---
         with gr.TabItem("Novel Sequence Generator"):
             gr.Markdown("### Set parameters to generate novel, high-activity antioxidant peptides.")
             with gr.Column():
                 with gr.Row():
+                    num_input = gr.Slider(minimum=5, maximum=50, value=10, step=1, label="Number of Final Peptides to Generate")
+                    min_len_input = gr.Slider(minimum=3, maximum=10, value=3, step=1, label="Minimum Length")
                     max_len_input = gr.Slider(minimum=10, maximum=20, value=20, step=1, label="Maximum Length")
                 with gr.Row():
                     temp_input = gr.Slider(minimum=0.5, maximum=3.0, value=2.5, step=0.1, label="Temperature (Higher = More random)")
+                    diversity_input = gr.Slider(minimum=1.1, maximum=5.0, value=1.5, step=0.1, label="Diversity Factor (Larger initial pool for clustering)")
             generate_button = gr.Button("Generate Peptides", variant="primary")
+            results_output = gr.DataFrame(headers=["Sequence", "Predicted Probability"], label="Generated & Validated Peptides (>90% Probability)", wrap=True)
             generate_button.click(
                 fn=generate_peptide_wrapper,