AMP-Classifier / app.py
nonzeroexit's picture
Update app.py
81bcfb3 verified
raw
history blame
5.72 kB
import gradio as gr
import joblib
import numpy as np
import pandas as pd
from propy import AAComposition, Autocorrelation, CTD, PseudoAAC
from sklearn.preprocessing import MinMaxScaler
# Load the pre-trained model and scaler
model = joblib.load("RF.joblib")
scaler = joblib.load("norm (1).joblib")
# Define the list of selected features (IMPORTANT: Keep this consistent with training)
selected_features = [
"_SolventAccessibilityC3", "_SecondaryStrC1", "_SecondaryStrC3", "_ChargeC1", "_PolarityC1",
"_NormalizedVDWVC1", "_HydrophobicityC3", "_SecondaryStrT23", "_PolarizabilityD1001",
"_PolarizabilityD2001", "_PolarizabilityD3001", "_SolventAccessibilityD1001",
"_SolventAccessibilityD2001", "_SolventAccessibilityD3001", "_SecondaryStrD1001",
"_SecondaryStrD1075", "_SecondaryStrD2001", "_SecondaryStrD3001", "_ChargeD1001",
"_ChargeD1025", "_ChargeD2001", "_ChargeD3075", "_ChargeD3100", "_PolarityD1001",
"_PolarityD1050", "_PolarityD2001", "_PolarityD3001", "_NormalizedVDWVD1001",
"_NormalizedVDWVD2001", "_NormalizedVDWVD2025", "_NormalizedVDWVD2050", "_NormalizedVDWVD3001",
"_HydrophobicityD1001", "_HydrophobicityD2001", "_HydrophobicityD3001", "_HydrophobicityD3025",
"A", "R", "D", "C", "E", "Q", "H", "I", "M", "P", "Y", "V",
"AR", "AV", "RC", "RL", "RV", "CR", "CC", "CL", "CK", "EE", "EI", "EL",
"HC", "IA", "IL", "IV", "LA", "LC", "LE", "LI", "LT", "LV", "KC", "MA",
"MS", "SC", "TC", "TV", "YC", "VC", "VE", "VL", "VK", "VV",
"MoreauBrotoAuto_FreeEnergy30", "MoranAuto_Hydrophobicity2", "MoranAuto_Hydrophobicity4",
"GearyAuto_Hydrophobicity20", "GearyAuto_Hydrophobicity24", "GearyAuto_Hydrophobicity26",
"GearyAuto_Hydrophobicity27", "GearyAuto_Hydrophobicity28", "GearyAuto_Hydrophobicity29",
"GearyAuto_Hydrophobicity30", "GearyAuto_AvFlexibility22", "GearyAuto_AvFlexibility26",
"GearyAuto_AvFlexibility27", "GearyAuto_AvFlexibility28", "GearyAuto_AvFlexibility29",
"GearyAuto_AvFlexibility30", "GearyAuto_Polarizability22", "GearyAuto_Polarizability24",
"GearyAuto_Polarizability25", "GearyAuto_Polarizability27", "GearyAuto_Polarizability28",
"GearyAuto_Polarizability29", "GearyAuto_Polarizability30", "GearyAuto_FreeEnergy24",
"GearyAuto_FreeEnergy25", "GearyAuto_FreeEnergy30", "GearyAuto_ResidueASA21",
"GearyAuto_ResidueASA22", "GearyAuto_ResidueASA23", "GearyAuto_ResidueASA24",
"GearyAuto_ResidueASA30", "GearyAuto_ResidueVol21", "GearyAuto_ResidueVol24",
"GearyAuto_ResidueVol25", "GearyAuto_ResidueVol26", "GearyAuto_ResidueVol28",
"GearyAuto_ResidueVol29", "GearyAuto_ResidueVol30", "GearyAuto_Steric18",
"GearyAuto_Steric21", "GearyAuto_Steric26", "GearyAuto_Steric27", "GearyAuto_Steric28",
"GearyAuto_Steric29", "GearyAuto_Steric30", "GearyAuto_Mutability23", "GearyAuto_Mutability25",
"GearyAuto_Mutability26", "GearyAuto_Mutability27", "GearyAuto_Mutability28",
"GearyAuto_Mutability29", "GearyAuto_Mutability30", "APAAC1", "APAAC4", "APAAC5",
"APAAC6", "APAAC8", "APAAC9", "APAAC12", "APAAC13", "APAAC15", "APAAC18", "APAAC19",
"APAAC24"
]
def extract_features(sequence):
"""Extracts features from a protein sequence and returns them as a NumPy array."""
try:
# Calculate features from different ProPy modules
comp_features = AAComposition.CalculateAAComposition(sequence)
auto_features = Autocorrelation.CalculateAutoTotal(sequence)
ctd_features = CTD.CalculateCTD(sequence)
pseudo_features = PseudoAAC.GetAPseudoAAC(sequence) # Use default parameters
# Combine all features into a single dictionary
all_features = {**comp_features, **auto_features, **ctd_features, **pseudo_features}
#print(len(all_features)) # debugging
# Convert to DataFrame, selecting only the required features
all_features_df = pd.DataFrame([all_features])
all_features_df = all_features_df[selected_features]
# Normalize the features using the pre-fitted scaler
normalized_features = scaler.transform(all_features_df)
return normalized_features
except ZeroDivisionError:
print("Error: Division by zero encountered in feature calculation. Check your input sequence.")
return None # Or handle appropriately
except KeyError as e:
print(f"Error: Missing feature {e}. Check feature name consistency and ProPy version.")
return None # Or handle appropriately
except Exception as e:
print(f"An unexpected error occurred during feature extraction: {e}")
return None # Or handle appropriately
def predict(sequence):
"""Predicts whether the input sequence is an AMP and returns the prediction."""
features = extract_features(sequence)
# Check if feature extraction was successful
if features is None:
return "Error: Could not extract features. Please check the input sequence."
# No need to reshape here; extract_features already returns the correct shape
prediction = model.predict(features)[0]
probabilities = model.predict_proba(features)[0]
# Determine output string based on prediction
if prediction == 0:
return f"{probabilities[0] * 100:.2f}% chance of being an Antimicrobial Peptide (AMP)"
else:
return f"{probabilities[1] * 100:.2f}% chance of being Non-AMP"
# Gradio interface setup
iface = gr.Interface(
fn=predict,
inputs=gr.Textbox(label="Enter Protein Sequence"),
outputs=gr.Label(label="Prediction"),
title="AMP Classifier",
description="Enter an amino acid sequence (e.g., FLPVLAGGL) to predict whether it's an antimicrobial peptide (AMP) or not."
)
iface.launch(share=True)