Spaces:
Running
Running
File size: 5,220 Bytes
85c36de 942bf87 51a3749 ea9a1bf e199881 51a3749 248a61c e199881 248ff12 942bf87 248a61c e199881 11e1095 dc9275e 3b84715 aa6838a c63f76d aa6838a c63f76d aa6838a 248a61c aa6838a c63f76d aa6838a c63f76d aa6838a c63f76d a359627 aa6838a a359627 aa6838a c63f76d 9748994 85c36de 248a61c 9f51e97 81bcfb3 248a61c 81bcfb3 e199881 c9a939f 81bcfb3 248a61c 85c36de 248a61c 85c36de 81bcfb3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
import gradio as gr
import joblib
import numpy as np
import pandas as pd
from propy import AAComposition, Autocorrelation, CTD, PseudoAAC
from sklearn.preprocessing import MinMaxScaler
# Load model and scaler
model = joblib.load("RF.joblib")
scaler = joblib.load("norm (1).joblib")
# Feature list (KEEP THIS CONSISTENT)
selected_features = [
"_SolventAccessibilityC3", "_SecondaryStrC1", "_SecondaryStrC3", "_ChargeC1", "_PolarityC1",
"_NormalizedVDWVC1", "_HydrophobicityC3", "_SecondaryStrT23", "_PolarizabilityD1001",
"_PolarizabilityD2001", "_PolarizabilityD3001", "_SolventAccessibilityD1001",
"_SolventAccessibilityD2001", "_SolventAccessibilityD3001", "_SecondaryStrD1001",
"_SecondaryStrD1075", "_SecondaryStrD2001", "_SecondaryStrD3001", "_ChargeD1001",
"_ChargeD1025", "_ChargeD2001", "_ChargeD3075", "_ChargeD3100", "_PolarityD1001",
"_PolarityD1050", "_PolarityD2001", "_PolarityD3001", "_NormalizedVDWVD1001",
"_NormalizedVDWVD2001", "_NormalizedVDWVD2025", "_NormalizedVDWVD2050", "_NormalizedVDWVD3001",
"_HydrophobicityD1001", "_HydrophobicityD2001", "_HydrophobicityD3001", "_HydrophobicityD3025",
"A", "R", "D", "C", "E", "Q", "H", "I", "M", "P", "Y", "V",
"AR", "AV", "RC", "RL", "RV", "CR", "CC", "CL", "CK", "EE", "EI", "EL",
"HC", "IA", "IL", "IV", "LA", "LC", "LE", "LI", "LT", "LV", "KC", "MA",
"MS", "SC", "TC", "TV", "YC", "VC", "VE", "VL", "VK", "VV",
"MoreauBrotoAuto_FreeEnergy30", "MoranAuto_Hydrophobicity2", "MoranAuto_Hydrophobicity4",
"GearyAuto_Hydrophobicity20", "GearyAuto_Hydrophobicity24", "GearyAuto_Hydrophobicity26",
"GearyAuto_Hydrophobicity27", "GearyAuto_Hydrophobicity28", "GearyAuto_Hydrophobicity29",
"GearyAuto_Hydrophobicity30", "GearyAuto_AvFlexibility22", "GearyAuto_AvFlexibility26",
"GearyAuto_AvFlexibility27", "GearyAuto_AvFlexibility28", "GearyAuto_AvFlexibility29",
"GearyAuto_AvFlexibility30", "GearyAuto_Polarizability22", "GearyAuto_Polarizability24",
"GearyAuto_Polarizability25", "GearyAuto_Polarizability27", "GearyAuto_Polarizability28",
"GearyAuto_Polarizability29", "GearyAuto_Polarizability30", "GearyAuto_FreeEnergy24",
"GearyAuto_FreeEnergy25", "GearyAuto_FreeEnergy30", "GearyAuto_ResidueASA21",
"GearyAuto_ResidueASA22", "GearyAuto_ResidueASA23", "GearyAuto_ResidueASA24",
"GearyAuto_ResidueASA30", "GearyAuto_ResidueVol21", "GearyAuto_ResidueVol24",
"GearyAuto_ResidueVol25", "GearyAuto_ResidueVol26", "GearyAuto_ResidueVol28",
"GearyAuto_ResidueVol29", "GearyAuto_ResidueVol30", "GearyAuto_Steric18",
"GearyAuto_Steric21", "GearyAuto_Steric26", "GearyAuto_Steric27", "GearyAuto_Steric28",
"GearyAuto_Steric29", "GearyAuto_Steric30", "GearyAuto_Mutability23", "GearyAuto_Mutability25",
"GearyAuto_Mutability26", "GearyAuto_Mutability27", "GearyAuto_Mutability28",
"GearyAuto_Mutability29", "GearyAuto_Mutability30", "APAAC1", "APAAC4", "APAAC5",
"APAAC6", "APAAC8", "APAAC9", "APAAC12", "APAAC13", "APAAC15", "APAAC18", "APAAC19",
"APAAC24"
]
def extract_features(sequence):
"""Extract selected features and normalize them."""
if len(sequence) < 3: # Ensure sequence is long enough
return None # Return None if sequence is too short
all_features_dict = {}
dipeptide_features = AAComposition.CalculateAADipeptideComposition(sequence)
all_features_dict.update(dipeptide_features) # Use update instead of reassignment
auto_features = Autocorrelation.CalculateAutoTotal(sequence)
all_features_dict.update(auto_features) # Use update
ctd_features = CTD.CalculateCTD(sequence)
all_features_dict.update(ctd_features) # Use update
pseudo_features = PseudoAAC.GetAPseudoAAC(sequence)
all_features_dict.update(pseudo_features) # Use update
feature_values = list(all_features_dict.values()) # Use all_features_dict
feature_array = np.array(feature_values).reshape(-1, 1)
normalized_features = scaler.transform(feature_array.T)
normalized_features = normalized_features.flatten()
selected_feature_dict = {}
for i, feature in enumerate(selected_features):
if feature in all_features_dict: # Use all_features_dict
selected_feature_dict[feature] = normalized_features[i]
selected_feature_df = pd.DataFrame([selected_feature_dict])
selected_feature_array = selected_feature_df.T.to_numpy()
return selected_feature_array
def predict(sequence):
"""Predicts whether the input sequence is an AMP."""
features = extract_features(sequence)
if features is None:
return "Error: Could not extract features."
prediction = model.predict(features)[0]
probabilities = model.predict_proba(features)[0]
if prediction == 0:
return f"{probabilities[0] * 100:.2f}% chance of being an Antimicrobial Peptide (AMP)"
else:
return f"{probabilities[1] * 100:.2f}% chance of being Non-AMP"
# Gradio interface
iface = gr.Interface(
fn=predict,
inputs=gr.Textbox(label="Enter Protein Sequence"),
outputs=gr.Label(label="Prediction"),
title="AMP Classifier",
description="Enter an amino acid sequence (e.g., FLPVLAGGL) to predict AMP."
)
iface.launch(share=True) |