Spaces:

nonzeroexit
/

AMP-Classifier

Running

App Files Files Community

nonzeroexit committed on May 28

Commit

febb4a6

verified ·

1 Parent(s): adaeb14

Update app.py

Browse files

Files changed (1) hide show

app.py +258 -88

app.py CHANGED Viewed

@@ -8,21 +8,37 @@ import torch
 from transformers import BertTokenizer, BertModel
 from lime.lime_tabular import LimeTabularExplainer
 from math import expm1
 # Load AMP Classifier
-model = joblib.load("RF.joblib")
-scaler = joblib.load("norm (4).joblib")
 # Load ProtBert
-tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
-protbert_model = BertModel.from_pretrained("Rostlab/prot_bert")
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-protbert_model = protbert_model.to(device).eval()
-# Full list of selected features
 selected_features = ["_SolventAccessibilityC3", "_SecondaryStrC1", "_SecondaryStrC3", "_ChargeC1", "_PolarityC1",
     "_NormalizedVDWVC1", "_HydrophobicityC3", "_SecondaryStrT23", "_PolarizabilityD1001", "_PolarizabilityD2001",
-    "_PolarizabilityD3001", "_SolventAccessibilityD1001", "_SolventAccessibilityD2001", "_SolventAccessibilityD3001",
     "_SecondaryStrD1001", "_SecondaryStrD1075", "_SecondaryStrD2001", "_SecondaryStrD3001", "_ChargeD1001",
     "_ChargeD1025", "_ChargeD2001", "_ChargeD3075", "_ChargeD3100", "_PolarityD1001", "_PolarityD1050",
     "_PolarityD2001", "_PolarityD3001", "_NormalizedVDWVD1001", "_NormalizedVDWVD2001", "_NormalizedVDWVD2025",
@@ -48,98 +64,252 @@ selected_features = ["_SolventAccessibilityC3", "_SecondaryStrC1", "_SecondarySt
     "APAAC15", "APAAC18", "APAAC19", "APAAC24"]
 # LIME Explainer Setup
-sample_data = np.random.rand(100, len(selected_features))
 explainer = LimeTabularExplainer(
     training_data=sample_data,
     feature_names=selected_features,
-    class_names=["AMP", "Non-AMP"],
     mode="classification"
 )
-def extract_features(sequence):
-    sequence = ''.join([aa for aa in sequence.upper() if aa in "ACDEFGHIKLMNPQRSTVWY"])
-    if len(sequence) < 10:
-        return "Error: Sequence too short."
-    dipeptide_features = AAComposition.CalculateAADipeptideComposition(sequence)
-    filtered_dipeptide_features = {k: dipeptide_features[k] for k in list(dipeptide_features.keys())[:420]}
-    ctd_features = CTD.CalculateCTD(sequence)
-    auto_features = Autocorrelation.CalculateAutoTotal(sequence)
-    pseudo_features = PseudoAAC.GetAPseudoAAC(sequence, lamda=9)
-    all_features_dict = {}
-    all_features_dict.update(ctd_features)
-    all_features_dict.update(filtered_dipeptide_features)
-    all_features_dict.update(auto_features)
-    all_features_dict.update(pseudo_features)
-    feature_df_all = pd.DataFrame([all_features_dict])
-    normalized_array = scaler.transform(feature_df_all.values)
-    normalized_df = pd.DataFrame(normalized_array, columns=feature_df_all.columns)
-    if not set(selected_features).issubset(set(normalized_df.columns)):
-        return "Error: Some selected features are missing from computed features."
-    selected_df = normalized_df[selected_features].fillna(0)
-    return selected_df.values
-def predictmic(sequence):
-    sequence = ''.join([aa for aa in sequence.upper() if aa in "ACDEFGHIKLMNPQRSTVWY"])
-    if len(sequence) < 10:
-        return {"Error": "Sequence too short or invalid."}
-    seq_spaced = ' '.join(list(sequence))
-    tokens = tokenizer(seq_spaced, return_tensors="pt", padding='max_length', truncation=True, max_length=512)
-    tokens = {k: v.to(device) for k, v in tokens.items()}
-    with torch.no_grad():
-        outputs = protbert_model(**tokens)
-        embedding = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy().reshape(1, -1)
     bacteria_config = {
-        "E.coli": {"model": "coli_xgboost_model.pkl", "scaler": "coli_scaler.pkl", "pca": None},
-        "S.aureus": {"model": "aur_xgboost_model.pkl", "scaler": "aur_scaler.pkl", "pca": None},
-        "P.aeruginosa": {"model": "arg_xgboost_model.pkl", "scaler": "arg_scaler.pkl", "pca": None},
-        "K.Pneumonia": {"model": "pne_mlp_model.pkl", "scaler": "pne_scaler.pkl", "pca": "pne_pca.pkl"}
     }
     mic_results = {}
-    for bacterium, cfg in bacteria_config.items():
         try:
-            scaler = joblib.load(cfg["scaler"])
-            scaled = scaler.transform(embedding)
-            transformed = joblib.load(cfg["pca"]).transform(scaled) if cfg["pca"] else scaled
-            model = joblib.load(cfg["model"])
-            mic_log = model.predict(transformed)[0]
             mic = round(expm1(mic_log), 3)
-            mic_results[bacterium] = mic
         except Exception as e:
-            mic_results[bacterium] = f"Error: {str(e)}"
     return mic_results
-def full_prediction(sequence):
-    features = extract_features(sequence)
-    if isinstance(features, str):
-        return features
-    prediction = model.predict(features)[0]
-    probabilities = model.predict_proba(features)[0]
-    amp_result = "Antimicrobial Peptide (AMP)" if prediction == 0 else "Non-AMP"
-    confidence = round(probabilities[0 if prediction == 0 else 1] * 100, 2)
-    result = f"Prediction: {amp_result}\nConfidence: {confidence}%\n"
-    if prediction == 0:
-        mic_values = predictmic(sequence)
-        result += "\nPredicted MIC Values (\u00b5M):\n"
-        for org, mic in mic_values.items():
-            result += f"- {org}: {mic}\n"
-    else:
-        result += "\nMIC prediction skipped for Non-AMP sequences.\n"
-    explanation = explainer.explain_instance(
-        data_row=features[0],
-        predict_fn=model.predict_proba,
-        num_features=10
-    )
-    result += "\nTop Features Influencing Prediction:\n"
-    for feat, weight in explanation.as_list():
-        result += f"- {feat}: {round(weight, 4)}\n"
-    return result
-iface = gr.Interface(
-    fn=full_prediction,
-    inputs=gr.Textbox(label="Enter Protein Sequence"),
-    outputs=gr.Textbox(label="Results"),
-    title="AMP & MIC Predictor + LIME Explanation",
-    description="Paste an amino acid sequence (\u226510 characters). Get AMP classification, MIC predictions, and LIME interpretability insights."
-)
-iface.launch(share=True)

 from transformers import BertTokenizer, BertModel
 from lime.lime_tabular import LimeTabularExplainer
 from math import expm1
+import matplotlib.pyplot as plt
+import io
+import base64
+import os
+# --- Configuration and Model Loading ---
+# All model artifacts are resolved relative to the directory containing this file.
+MODEL_DIR = os.path.dirname(os.path.abspath(__file__))
 # Load AMP Classifier
+# `model` and `scaler` are module-level globals used by the feature-extraction and classification functions.
+try:
+    model = joblib.load(os.path.join(MODEL_DIR, "RF.joblib"))
+    scaler = joblib.load(os.path.join(MODEL_DIR, "norm (4).joblib"))
+except FileNotFoundError as e:
+    # Chain the original exception so the missing path stays visible in the traceback.
+    raise gr.Error(f"Classifier model or scaler not found: {e}. Make sure RF.joblib and norm (4).joblib are in the {MODEL_DIR} directory.") from e
+except Exception as e:
+    raise gr.Error(f"Error loading classifier components: {e}") from e
 # Load ProtBert
+# The tokenizer/model pair is moved to GPU when one is available and put in eval mode for inference.
+try:
+    tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
+    protbert_model = BertModel.from_pretrained("Rostlab/prot_bert")
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    protbert_model = protbert_model.to(device).eval()
+except Exception as e:
+    raise gr.Error(f"Error loading ProtBert model/tokenizer: {e}. Check internet connection or model availability.") from e
+# Full list of selected features (as provided in the original code).
+# Names must match the computed descriptor names exactly: extract_features() zero-fills any name it cannot find.
 selected_features = ["_SolventAccessibilityC3", "_SecondaryStrC1", "_SecondaryStrC3", "_ChargeC1", "_PolarityC1",
     "_NormalizedVDWVC1", "_HydrophobicityC3", "_SecondaryStrT23", "_PolarizabilityD1001", "_PolarizabilityD2001",
+    "_PolarizabilityD3001", "_SolventAccessibilityD1001", "_SolventAccessibilityD2001", "_SolventAccessibilityD3001",
     "_SecondaryStrD1001", "_SecondaryStrD1075", "_SecondaryStrD2001", "_SecondaryStrD3001", "_ChargeD1001",
     "_ChargeD1025", "_ChargeD2001", "_ChargeD3075", "_ChargeD3100", "_PolarityD1001", "_PolarityD1050",
     "_PolarityD2001", "_PolarityD3001", "_NormalizedVDWVD1001", "_NormalizedVDWVD2001", "_NormalizedVDWVD2025",
     "APAAC15", "APAAC18", "APAAC19", "APAAC24"]
 # LIME Explainer Setup
+# Background data for LIME: 500 rows of uniform random values, one column per selected feature.
+# This is a placeholder; a saved sample of real scaled training features could be loaded here instead,
+# e.g. np.load(os.path.join(MODEL_DIR, 'sample_training_features_scaled.npy')).
+sample_data = np.random.rand(500, len(selected_features))
 explainer = LimeTabularExplainer(
     training_data=sample_data,
     feature_names=selected_features,
+    class_names=["AMP", "Non-AMP"], # Assuming 0 is AMP, 1 is Non-AMP as per model prediction
     mode="classification"
 )
+# --- Feature Extraction Function ---
+def extract_features(sequence: str) -> np.ndarray:
+    """
+    Extracts the scaled classifier features from an amino acid sequence.
+    Characters other than the 20 standard amino-acid letters are dropped before any
+    descriptor is computed. The complete descriptor row is scaled with the module-level
+    `scaler` first and only then reduced to `selected_features`.
+    Args:
+        sequence (str): The amino acid sequence.
+    Returns:
+        np.ndarray: A scaled 2D numpy array of selected features (1, num_features).
+            Selected features absent from the computed descriptors are filled with 0.
+    Raises:
+        gr.Error: If the cleaned sequence is not 10-100 residues long or feature extraction fails.
+    """
+    cleaned_sequence = ''.join([aa for aa in sequence.upper() if aa in "ACDEFGHIKLMNPQRSTVWY"])
+    if not (10 <= len(cleaned_sequence) <= 100):
+        raise gr.Error(f"Invalid sequence length ({len(cleaned_sequence)}). Must be between 10 and 100 characters and contain only standard amino acids.")
+    try:
+        composition = AAComposition.CalculateAADipeptideComposition(cleaned_sequence)
+        # Keep only the first 420 composition descriptors, as the previous revision of this function did.
+        dipeptide_features = {key: composition[key] for key in list(composition)[:420]}
+        ctd_features = CTD.CalculateCTD(cleaned_sequence)
+        auto_features = Autocorrelation.CalculateAutoTotal(cleaned_sequence)
+        pseudo_features = PseudoAAC.GetAPseudoAAC(cleaned_sequence, lamda=9)
+        # Insertion order defines the column order handed to the scaler: CTD, composition, autocorrelation, pseudo-AAC.
+        all_features_dict = {}
+        all_features_dict.update(ctd_features)
+        all_features_dict.update(dipeptide_features)
+        all_features_dict.update(auto_features)
+        all_features_dict.update(pseudo_features)
+        feature_df_all = pd.DataFrame([all_features_dict])
+        # NOTE(review): the scaler is presumably fitted on the full descriptor row -- the previous
+        # revision scaled before selecting columns -- so scale first, select second. Confirm against training.
+        normalized_array = scaler.transform(feature_df_all.values)
+        normalized_df = pd.DataFrame(normalized_array, columns=feature_df_all.columns)
+        selected_df = normalized_df.reindex(columns=selected_features, fill_value=0).fillna(0)
+        return selected_df.values
+    except Exception as e:
+        raise gr.Error(f"Feature extraction failed: {e}. Ensure sequence is valid and Propy dependencies are met.") from e
+# --- MIC Prediction Function ---
+def predictmic(sequence: str, selected_bacteria_keys: list) -> dict:
+    """
+    Predicts Minimum Inhibitory Concentration (MIC) for selected bacteria using ProtBert embeddings.
+    The sequence is cleaned and length-checked, embedded once with ProtBert (mean over the
+    token axis of the last hidden state), and that embedding is passed through each selected
+    bacterium's scaler, optional PCA and regression model.
+    Args:
+        sequence (str): The amino acid sequence.
+        selected_bacteria_keys (list): List of keys for bacteria to predict MIC for (e.g., ['e_coli', 'p_aeruginosa']).
+            None or an empty list yields an empty result.
+    Returns:
+        dict: A dictionary where keys are bacterium keys and values are predicted MICs in µM.
+              Returns error messages for individual bacteria if prediction fails.
+    Raises:
+        gr.Error: If ProtBert embedding fails or sequence is invalid.
+    """
+    cleaned_sequence = ''.join([aa for aa in sequence.upper() if aa in "ACDEFGHIKLMNPQRSTVWY"])
+    if not (10 <= len(cleaned_sequence) <= 100):
+        raise gr.Error(f"Invalid sequence length for MIC prediction ({len(cleaned_sequence)}). Must be between 10 and 100 characters.")
+    # Nothing selected (or None from an API caller): skip the ProtBert forward pass entirely.
+    if not selected_bacteria_keys:
+        return {}
+    # The tokenizer is fed the residues separated by single spaces.
+    seq_spaced = ' '.join(cleaned_sequence)
+    try:
+        tokens = tokenizer(seq_spaced, return_tensors="pt", padding='max_length', truncation=True, max_length=512)
+        tokens = {k: v.to(device) for k, v in tokens.items()}
+        with torch.no_grad():
+            outputs = protbert_model(**tokens)
+            embedding = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy().reshape(1, -1)
+    except Exception as e:
+        raise gr.Error(f"Error generating ProtBert embedding: {e}. Check sequence format or model availability.") from e
 bacteria_config = {
+        "e_coli": {"display_name": "E.coli", "model": "coli_xgboost_model.pkl", "scaler": "coli_scaler.pkl", "pca": None},
+        "p_aeruginosa": {"display_name": "P. aeruginosa", "model": "arg_xgboost_model.pkl", "scaler": "arg_scaler.pkl", "pca": None},
+        "s_aureus": {"display_name": "S. aureus", "model": "aur_xgboost_model.pkl", "scaler": "aur_scaler.pkl", "pca": None},
+        "k_pneumoniae": {"display_name": "K. pneumoniae", "model": "pne_mlp_model.pkl", "scaler": "pne_scaler.pkl", "pca": "pne_pca.pkl"}
     }
     mic_results = {}
+    for bacterium_key in selected_bacteria_keys:
+        cfg = bacteria_config.get(bacterium_key)
+        if not cfg:
+            mic_results[bacterium_key] = "Error: Invalid bacterium key provided."
+            continue
         try:
+            # Artifacts are read from disk on every call; a failure only affects this bacterium's entry.
+            mic_scaler = joblib.load(os.path.join(MODEL_DIR, cfg["scaler"]))
+            scaled_embedding = mic_scaler.transform(embedding)
+            transformed_embedding = scaled_embedding
+            if cfg["pca"]:
+                mic_pca = joblib.load(os.path.join(MODEL_DIR, cfg["pca"]))
+                transformed_embedding = mic_pca.transform(scaled_embedding)
+            mic_model = joblib.load(os.path.join(MODEL_DIR, cfg["model"]))
+            # The model output is mapped back with expm1 before rounding to three decimals.
+            mic_log = mic_model.predict(transformed_embedding)[0]
             mic = round(expm1(mic_log), 3)
+            mic_results[bacterium_key] = mic
+        except FileNotFoundError as e:
+            mic_results[bacterium_key] = f"Model file not found for {cfg['display_name']}: {e}"
         except Exception as e:
+            mic_results[bacterium_key] = f"Prediction error for {cfg['display_name']}: {e}"
     return mic_results
+# --- LIME Plot Generation Helper ---
+def generate_lime_plot_base64(explanation_list: list) -> str:
+    """
+    Generates a LIME explanation plot and returns it as a base64 encoded PNG string.
+    Bars are ordered by absolute weight (largest on top); positive weights are drawn
+    green, all others red.
+    Args:
+        explanation_list (list): The output from LimeExplanation.as_list(), i.e. (label, weight) pairs.
+    Returns:
+        str: Base64 encoded PNG image string, or "" when the list is empty.
+    """
+    if not explanation_list:
+        return ""
+    features = [item[0] for item in explanation_list]
+    weights = [item[1] for item in explanation_list]
+    sorted_indices = np.argsort(np.abs(weights))[::-1]
+    features_sorted = [features[i] for i in sorted_indices]
+    weights_sorted = [weights[i] for i in sorted_indices]
+    y_pos = np.arange(len(features_sorted))
+    colors = ['green' if w > 0 else 'red' for w in weights_sorted]
+    fig, ax = plt.subplots(figsize=(10, 6))
+    try:
+        # Draw through the explicit fig/ax handles rather than pyplot's "current figure",
+        # so a figure opened elsewhere in the meantime cannot be the one that gets saved.
+        ax.barh(y_pos, weights_sorted, align='center', color=colors)
+        ax.set_yticks(y_pos)
+        ax.set_yticklabels(features_sorted, fontsize=10)
+        ax.invert_yaxis()
+        ax.set_xlabel('Contribution to Prediction (LIME Weight)', fontsize=12)
+        ax.set_title('Top Features Influencing Prediction (LIME)', fontsize=14)
+        ax.axvline(0, color='grey', linestyle='--', linewidth=0.8)
+        ax.grid(axis='x', linestyle=':', alpha=0.7)
+        buf = io.BytesIO()
+        fig.savefig(buf, format='png', bbox_inches='tight', dpi=150)
+        return base64.b64encode(buf.getvalue()).decode('utf-8')
+    finally:
+        # Always release the figure, including when rendering or saving raises.
+        plt.close(fig)
+# --- Gradio API Endpoints ---
+def _split_lime_rule(rule: str) -> tuple:
+    """Splits one LIME rule string into (feature name, condition text)."""
+    # Two-sided rules ("0.10 < APAAC4 <= 0.23") start with a number, so look for a known
+    # feature name past the first token; the whole rule is kept as the condition in that case.
+    for token in rule.split()[1:]:
+        if token in selected_features:
+            return token, rule.strip()
+    # One-sided rules ("APAAC4 <= 0.23") and anything unrecognised: first token is the name.
+    parts = rule.split(" ", 1)
+    return parts[0], (parts[1].strip() if len(parts) > 1 else "")
+def classify_and_interpret_amp(sequence: str) -> dict:
+    """
+    Gradio API endpoint for AMP classification and interpretability (LIME).
+    Extracts features, runs the classifier, builds a LIME explanation with the ten
+    strongest features, and packages everything for the frontend.
+    Args:
+        sequence (str): The amino acid sequence.
+    Returns:
+        dict: "label" (str), "confidence" (float, probability of the predicted class),
+            "shap_plot_base64" (base64 PNG of the LIME bar chart) and "top_features"
+            (list of {"feature", "condition", "value"} entries).
+    Raises:
+        gr.Error: For invalid input (propagated unchanged) or any unexpected failure.
+    """
+    try:
+        features = extract_features(sequence)
+        prediction_class_idx = model.predict(features)[0]
+        probabilities = model.predict_proba(features)[0]
+        amp_label = "AMP (Positive)" if prediction_class_idx == 0 else "Non-AMP"
+        confidence = probabilities[prediction_class_idx]
+        explanation = explainer.explain_instance(
+            data_row=features[0],
+            predict_fn=model.predict_proba,
+            num_features=10
+        )
+        # Materialise the explanation once; it feeds both the table and the plot.
+        explanation_items = explanation.as_list()
+        top_features = []
+        for feat_str, weight in explanation_items:
+            feature_name, condition = _split_lime_rule(feat_str)
+            top_features.append({
+                "feature": feature_name,
+                "condition": condition,
+                "value": round(weight, 4)
+            })
+        lime_plot_base64_str = generate_lime_plot_base64(explanation_items)
+        return {
+            "label": amp_label,
+            "confidence": float(confidence),
+            # Key name kept as-is for the frontend; the image is the LIME plot.
+            "shap_plot_base64": lime_plot_base64_str,
+            "top_features": top_features
+        }
+    except gr.Error:
+        # Already user-facing; re-raise untouched.
+        raise
+    except Exception as e:
+        raise gr.Error(f"An unexpected error occurred during AMP classification: {e}") from e
+def get_mic_predictions_api(sequence: str, selected_bacteria_keys: list) -> dict:
+    """
+    Gradio API endpoint for MIC prediction.
+    Delegates to `predictmic` and hands its result dictionary straight back. A gr.Error
+    from the delegate passes through unchanged; any other exception is converted to gr.Error.
+    """
+    try:
+        return predictmic(sequence, selected_bacteria_keys)
+    except gr.Error as user_facing_error:
+        raise user_facing_error
+    except Exception as e:
+        raise gr.Error(f"An unexpected error occurred during MIC prediction API call: {e}")
+# --- Gradio Interface Definition ---
+# Two tabs, each wiring one test button to one named API endpoint ("predict" and "predict_mic").
+with gr.Blocks() as demo:
+    gr.Markdown("# EPIC-AMP Platform Backend API")
+    gr.Markdown("This Gradio application provides the backend services for the EPIC-AMP frontend.")
+    with gr.Tab("AMP Classification & Interpretability API"):
+        gr.Markdown("### `/predict` Endpoint (AMP Classification, Confidence, LIME Plot, Top Features)")
+        gr.Markdown("Input an amino acid sequence (10-100 AAs) to get classification details.")
+        sequence_input_amp = gr.Textbox(label="Amino Acid Sequence", lines=5, placeholder="Enter sequence here...")
+        amp_api_output = gr.Json(label="AMP Prediction Details JSON Output")
+        gr.Button("Test Classification").click(
+            fn=classify_and_interpret_amp,
+            inputs=[sequence_input_amp],
+            outputs=[amp_api_output],
+            api_name="predict"
+        )
+    with gr.Tab("MIC Prediction API"):
+        gr.Markdown("### `/predict_mic` Endpoint (MIC Values)")
+        gr.Markdown("Input an amino acid sequence (only if classified as AMP) and select bacteria to get predicted MIC values.")
+        sequence_input_mic = gr.Textbox(label="Amino Acid Sequence", lines=5, placeholder="Enter AMP sequence for MIC prediction...")
+        # Choices are the backend keys that predictmic() looks up in its bacteria configuration.
+        mic_bacteria_checkboxes = gr.CheckboxGroup(
+            choices=["e_coli", "p_aeruginosa", "s_aureus", "k_pneumoniae"],
+            label="Select Bacteria for MIC Prediction (keys for backend)"
+        )
+        mic_api_output = gr.Json(label="MIC Prediction JSON Output")
+        gr.Button("Test MIC Prediction").click(
+            fn=get_mic_predictions_api,
+            inputs=[sequence_input_mic, mic_bacteria_checkboxes],
+            outputs=[mic_api_output],
+            api_name="predict_mic"
+        )
+# Queueing is enabled via Blocks.queue(); the `enable_queue` keyword of launch() is deprecated
+# and is not accepted by newer Gradio releases.
+demo.queue().launch(share=True, show_api=True)