Spaces:
Running
Running
File size: 5,511 Bytes
85c36de 942bf87 51a3749 ea9a1bf e199881 51a3749 248a61c e199881 248ff12 942bf87 248a61c e199881 11e1095 dc9275e 3b84715 7d97f16 8319384 c63f76d aa6838a c63f76d 8319384 248a61c aa6838a 8319384 c63f76d aa6838a 8319384 c63f76d 7d97f16 8319384 c63f76d 7d97f16 a359627 7d97f16 a359627 aa6838a 7d97f16 c63f76d 7d97f16 c63f76d 9748994 85c36de 248a61c 9f51e97 7d97f16 81bcfb3 e199881 c9a939f 81bcfb3 248a61c 85c36de 248a61c 85c36de 81bcfb3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 |
import gradio as gr
import joblib
import numpy as np
import pandas as pd
from propy import AAComposition, Autocorrelation, CTD, PseudoAAC
from sklearn.preprocessing import MinMaxScaler
# Deserialize the two joblib artifacts this app depends on: the classifier
# queried by predict() and the scaler applied inside extract_features().
model, scaler = (
    joblib.load(artifact_file)
    for artifact_file in ("RF.joblib", "norm (1).joblib")
)
# Names of the descriptors that make up the model input, listed in the exact
# order extract_features() places them into the feature vector.
# KEEP THIS CONSISTENT: presumably both the membership and the order must match
# the columns the scaler and model were fitted on — verify against the training
# pipeline before adding, removing or reordering entries.
selected_features = [
"_SolventAccessibilityC3", "_SecondaryStrC1", "_SecondaryStrC3", "_ChargeC1", "_PolarityC1",
"_NormalizedVDWVC1", "_HydrophobicityC3", "_SecondaryStrT23", "_PolarizabilityD1001",
"_PolarizabilityD2001", "_PolarizabilityD3001", "_SolventAccessibilityD1001",
"_SolventAccessibilityD2001", "_SolventAccessibilityD3001", "_SecondaryStrD1001",
"_SecondaryStrD1075", "_SecondaryStrD2001", "_SecondaryStrD3001", "_ChargeD1001",
"_ChargeD1025", "_ChargeD2001", "_ChargeD3075", "_ChargeD3100", "_PolarityD1001",
"_PolarityD1050", "_PolarityD2001", "_PolarityD3001", "_NormalizedVDWVD1001",
"_NormalizedVDWVD2001", "_NormalizedVDWVD2025", "_NormalizedVDWVD2050", "_NormalizedVDWVD3001",
"_HydrophobicityD1001", "_HydrophobicityD2001", "_HydrophobicityD3001", "_HydrophobicityD3025",
"A", "R", "D", "C", "E", "Q", "H", "I", "M", "P", "Y", "V",
"AR", "AV", "RC", "RL", "RV", "CR", "CC", "CL", "CK", "EE", "EI", "EL",
"HC", "IA", "IL", "IV", "LA", "LC", "LE", "LI", "LT", "LV", "KC", "MA",
"MS", "SC", "TC", "TV", "YC", "VC", "VE", "VL", "VK", "VV",
"MoreauBrotoAuto_FreeEnergy30", "MoranAuto_Hydrophobicity2", "MoranAuto_Hydrophobicity4",
"GearyAuto_Hydrophobicity20", "GearyAuto_Hydrophobicity24", "GearyAuto_Hydrophobicity26",
"GearyAuto_Hydrophobicity27", "GearyAuto_Hydrophobicity28", "GearyAuto_Hydrophobicity29",
"GearyAuto_Hydrophobicity30", "GearyAuto_AvFlexibility22", "GearyAuto_AvFlexibility26",
"GearyAuto_AvFlexibility27", "GearyAuto_AvFlexibility28", "GearyAuto_AvFlexibility29",
"GearyAuto_AvFlexibility30", "GearyAuto_Polarizability22", "GearyAuto_Polarizability24",
"GearyAuto_Polarizability25", "GearyAuto_Polarizability27", "GearyAuto_Polarizability28",
"GearyAuto_Polarizability29", "GearyAuto_Polarizability30", "GearyAuto_FreeEnergy24",
"GearyAuto_FreeEnergy25", "GearyAuto_FreeEnergy30", "GearyAuto_ResidueASA21",
"GearyAuto_ResidueASA22", "GearyAuto_ResidueASA23", "GearyAuto_ResidueASA24",
"GearyAuto_ResidueASA30", "GearyAuto_ResidueVol21", "GearyAuto_ResidueVol24",
"GearyAuto_ResidueVol25", "GearyAuto_ResidueVol26", "GearyAuto_ResidueVol28",
"GearyAuto_ResidueVol29", "GearyAuto_ResidueVol30", "GearyAuto_Steric18",
"GearyAuto_Steric21", "GearyAuto_Steric26", "GearyAuto_Steric27", "GearyAuto_Steric28",
"GearyAuto_Steric29", "GearyAuto_Steric30", "GearyAuto_Mutability23", "GearyAuto_Mutability25",
"GearyAuto_Mutability26", "GearyAuto_Mutability27", "GearyAuto_Mutability28",
"GearyAuto_Mutability29", "GearyAuto_Mutability30", "APAAC1", "APAAC4", "APAAC5",
"APAAC6", "APAAC8", "APAAC9", "APAAC12", "APAAC13", "APAAC15", "APAAC18", "APAAC19",
"APAAC24"
]
# One-letter codes of the 20 standard amino acids; anything else is rejected
# before the descriptor calculators are called.
_STANDARD_AMINO_ACIDS = frozenset("ACDEFGHIKLMNPQRSTVWY")


def extract_features(sequence):
    """Build the scaled model-input row for a single protein sequence.

    The raw text is cleaned (all whitespace removed, letters upper-cased) and
    validated. Then the dipeptide-composition, autocorrelation, CTD and
    amphiphilic pseudo-amino-acid descriptors are computed, the entries named
    in ``selected_features`` are collected in that order, and the row is
    passed through the module-level ``scaler``.

    Args:
        sequence: Amino-acid sequence in one-letter code. Surrounding or
            embedded whitespace and lower-case letters are tolerated.

    Returns:
        The output of ``scaler.transform`` for a single-row input on success.
        On invalid input, a message string starting with ``"Error:"`` is
        returned instead (callers detect failure via that prefix).
    """
    if not isinstance(sequence, str):
        return "Error: Protein sequence must be provided as text."

    # Pasted sequences often contain spaces/newlines or lower-case letters;
    # normalise them so the length check counts residues only.
    cleaned = "".join(sequence.split()).upper()

    if len(cleaned) <= 9:
        return "Error: Protein sequence must be longer than 9 amino acids to extract features (for lamda=9)."

    unsupported = sorted(set(cleaned) - _STANDARD_AMINO_ACIDS)
    if unsupported:
        return (
            "Error: Protein sequence contains unsupported characters: "
            f"{', '.join(unsupported)}. Use only the 20 standard amino acids."
        )

    # Merge every descriptor family into one name -> value mapping.
    all_features_dict = {}
    all_features_dict.update(AAComposition.CalculateAADipeptideComposition(cleaned))
    all_features_dict.update(Autocorrelation.CalculateAutoTotal(cleaned))
    all_features_dict.update(CTD.CalculateCTD(cleaned))
    all_features_dict.update(PseudoAAC.GetAPseudoAAC(cleaned, lamda=9))

    # Keep the vector length fixed: a descriptor that was not produced is
    # padded with 0 and reported, rather than shifting later columns.
    missing_features = [
        name for name in selected_features if name not in all_features_dict
    ]
    if missing_features:
        print(f"Warning: The following features were missing from extraction and padded with 0: {missing_features}")

    ordered_feature_values = [
        all_features_dict.get(name, 0) for name in selected_features
    ]

    # Single sample -> shape (1, n_features), as expected by scaler.transform.
    feature_array = np.array(ordered_feature_values).reshape(1, -1)
    return scaler.transform(feature_array)
def predict(sequence):
    """Classify one sequence and describe the result as a percentage string.

    Feature extraction is delegated to ``extract_features``; if it hands back
    an ``"Error:"`` message, that message is returned unchanged. Otherwise the
    module-level ``model`` is asked for both the predicted class and the class
    probabilities, and the probability of the predicted class is reported.

    NOTE(review): class 0 is reported as AMP and class 1 as Non-AMP, and the
    probabilities are indexed positionally — confirm this matches the label
    encoding and ``classes_`` order used in training.
    """
    model_input = extract_features(sequence)

    # extract_features signals failure with a message string, not an exception.
    extraction_failed = isinstance(model_input, str) and model_input.startswith("Error:")
    if extraction_failed:
        return model_input

    predicted_class = model.predict(model_input)[0]
    class_probs = model.predict_proba(model_input)[0]

    if predicted_class != 0:
        return f"{class_probs[1] * 100:.2f}% chance of being Non-AMP"
    return f"{class_probs[0] * 100:.2f}% chance of being an Antimicrobial Peptide (AMP)"
# Gradio interface: one textbox in, one label out, served by predict().
iface = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(label="Enter Protein Sequence"),
    outputs=gr.Label(label="Prediction"),
    title="AMP Classifier",
    # The example shown to users must be longer than 9 residues; anything
    # shorter is rejected by the length check in extract_features().
    description=(
        "Enter an amino acid sequence longer than 9 residues "
        "(e.g., GIGKFLHSAKKFGKAFVGEIMNS) to predict AMP."
    ),
)
iface.launch(share=True)