AMP-Classifier / app.py
nonzeroexit's picture
Update app.py
fc7380a verified
raw
history blame
3.65 kB
import gradio as gr
import joblib
import numpy as np
import pandas as pd
from propy import AAComposition
from sklearn.preprocessing import MinMaxScaler
# Load the persisted artifacts once, at import time, so every request reuses them.
# `model` is used below through predict() / predict_proba() and `scaler` through
# transform(); presumably a fitted scikit-learn SVM classifier and MinMaxScaler
# -- confirm against the training pipeline.
# NOTE(review): joblib.load unpickles the file, so only ship trusted artifacts.
model = joblib.load("SVM.joblib")
scaler = joblib.load("norm.joblib")
# Names of the composition descriptors the classifier consumes (per the original
# author's note, the features selected during training): the 20 single-residue
# compositions followed by a subset of the dipeptide compositions.
# Order is significant: extract_features() emits values in exactly this order,
# so it must stay in sync with the column order the model expects.
selected_features = [
"A", "R", "N", "D", "C", "E", "Q", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V",
"AA", "AR", "AN", "AD", "AC", "AE", "AQ", "AG", "AI", "AL", "AK", "AF", "AP", "AS", "AT", "AY", "AV",
"RA", "RR", "RN", "RD", "RC", "RE", "RQ", "RG", "RH", "RI", "RL", "RK", "RM", "RF", "RS", "RT", "RY", "RV",
"NA", "NR", "ND", "NC", "NE", "NG", "NI", "NL", "NK", "NP",
"DA", "DR", "DN", "DD", "DC", "DE", "DQ", "DG", "DI", "DL", "DK", "DP", "DS", "DT", "DV",
"CA", "CR", "CN", "CD", "CC", "CE", "CG", "CH", "CI", "CL", "CK", "CF", "CP", "CS", "CT", "CY", "CV",
"EA", "ER", "EN", "ED", "EC", "EE", "EQ", "EG", "EI", "EL", "EK", "EP", "ES", "ET", "EV",
"QA", "QR", "QC", "QG", "QL", "QK", "QP", "QT", "QV",
"GA", "GR", "GD", "GC", "GE", "GQ", "GG", "GI", "GL", "GK", "GF", "GP", "GS", "GW", "GY", "GV",
"HC", "HG", "HL", "HK", "HP",
"IA", "IR", "ID", "IC", "IE", "II", "IL", "IK", "IF", "IP", "IS", "IT", "IV",
"LA", "LR", "LN", "LD", "LC", "LE", "LQ", "LG", "LI", "LL", "LK", "LM", "LF", "LP", "LS", "LT", "LV",
"KA", "KR", "KN", "KD", "KC", "KE", "KQ", "KG", "KH", "KI", "KL", "KK", "KM", "KF", "KP", "KS", "KT", "KV",
"MA", "ME", "MI", "ML", "MK", "MF", "MP", "MS", "MT", "MV",
"FR", "FC", "FQ", "FG", "FI", "FL", "FF", "FS", "FT", "FY", "FV",
"PA", "PR", "PD", "PC", "PE", "PG", "PL", "PK", "PS", "PV",
"SA", "SR", "SD", "SC", "SE", "SG", "SH", "SI", "SL", "SK", "SF", "SP", "SS", "ST", "SY", "SV",
"TA", "TR", "TN", "TC", "TE", "TG", "TI", "TL", "TK", "TF", "TP", "TS", "TT", "TV",
"WC",
"YR", "YD", "YC", "YG", "YL", "YS", "YV",
"VA", "VR", "VD", "VC", "VE", "VQ", "VG", "VI", "VL", "VK", "VP", "VS", "VT", "VY", "VV"
]
def extract_features(sequence):
    """Build the normalized, model-ready feature column for one peptide.

    The sequence is passed to propy's ``CalculateAADipeptideComposition``; the
    first 420 descriptors it returns are scaled with the module-level
    ``scaler``, and the entries named in ``selected_features`` are kept, in
    that order.

    Args:
        sequence: Amino-acid sequence as a string. Surrounding whitespace is
            ignored and letters are upper-cased before descriptors are
            computed.

    Returns:
        A 2-D NumPy array of shape ``(n_selected, 1)``; ``predict`` transposes
        it into a single-row sample.

    Raises:
        ValueError: If ``sequence`` is empty after stripping whitespace.
    """
    sequence = sequence.strip().upper()
    if not sequence:
        # Fail with a clear message instead of letting the descriptor
        # computation blow up on a zero-length sequence.
        raise ValueError("Protein sequence must not be empty.")

    all_features = AAComposition.CalculateAADipeptideComposition(sequence)

    # Only the first 420 descriptors are fed to the scaler (presumably the 20
    # single-residue plus 400 dipeptide compositions -- confirm against the
    # installed propy version and the fitted scaler).
    feature_names = list(all_features)[:420]
    raw_row = np.array([all_features[name] for name in feature_names]).reshape(1, -1)
    normalized_row = scaler.transform(raw_row).flatten()

    # Look each selected descriptor up by NAME. Indexing the scaled vector by
    # the descriptor's position inside `selected_features` pairs names with the
    # wrong values as soon as the two orderings diverge.
    normalized_by_name = dict(zip(feature_names, normalized_row))
    selected_values = [
        normalized_by_name[name]
        for name in selected_features
        if name in normalized_by_name
    ]
    return np.array(selected_values).reshape(-1, 1)
def predict(sequence):
    """Classify a peptide sequence and describe the result as text.

    Label ``0`` and the first ``predict_proba`` column are treated as the AMP
    class; anything else is reported as Non-AMP using the second column.
    NOTE(review): this assumes ``model.classes_`` is ordered ``[0, 1]`` with
    0 meaning AMP -- confirm against the training labels.

    Args:
        sequence: Amino-acid sequence string entered by the user.

    Returns:
        A one-line message containing the probability of the predicted class.
    """
    # extract_features() returns a column; the model wants one row per sample.
    sample = extract_features(sequence).T

    label = model.predict(sample)[0]
    class_probs = model.predict_proba(sample)[0]
    amp_prob, non_amp_prob = class_probs[0], class_probs[1]

    if label == 0:
        return f"⚑ {amp_prob * 100:.2f}% chance of being an Antimicrobial Peptide (AMP)"
    return f"❌ {non_amp_prob * 100:.2f}% chance of being Non-AMP"
# Gradio interface: a single free-text input wired to predict(), with the
# returned message shown in a label component.
_sequence_input = gr.Textbox(label="Enter Protein Sequence")
_prediction_output = gr.Label(label="Prediction")

iface = gr.Interface(
    fn=predict,
    inputs=_sequence_input,
    outputs=_prediction_output,
    title="AMP Classifier",
    description="Enter an amino acid sequence to predict whether it's an antimicrobial peptide (AMP) or not.",
)

# share=True asks Gradio for a public share link in addition to the local server.
iface.launch(share=True)