File size: 5,715 Bytes
85c36de
942bf87
51a3749
ea9a1bf
e199881
51a3749
 
81bcfb3
e199881
248ff12
942bf87
81bcfb3
e199881
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11e1095
dc9275e
f4d6f55
 
3b84715
81bcfb3
9748994
81bcfb3
9748994
 
 
81bcfb3
98a1e1e
81bcfb3
 
 
98a1e1e
81bcfb3
 
9748994
 
81bcfb3
 
9748994
e199881
9748994
81bcfb3
9748994
81bcfb3
 
 
 
 
9748994
81bcfb3
 
85c36de
8efdc57
47bb3e1
85c36de
81bcfb3
9f51e97
81bcfb3
 
 
 
 
 
e199881
 
c9a939f
81bcfb3
 
 
 
 
 
85c36de
81bcfb3
85c36de
 
 
 
 
81bcfb3
85c36de
 
81bcfb3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import gradio as gr
import joblib
import numpy as np
import pandas as pd
from propy import AAComposition, Autocorrelation, CTD, PseudoAAC
from sklearn.preprocessing import MinMaxScaler

# Load the pre-trained model and scaler
# Both objects are loaded once, at import time, and then shared as module
# globals by extract_features() (scaler) and predict() (model).
# NOTE(review): joblib.load unpickles the file contents, so these two
# artifacts must come from a trusted source.
model = joblib.load("RF.joblib")  # classifier; called via .predict / .predict_proba
scaler = joblib.load("norm (1).joblib")  # pre-fitted scaler; called via .transform

# Define the list of selected features (IMPORTANT: Keep this consistent with training)
# Each entry is a key looked up in the merged descriptor dict built by
# extract_features(); the list is used there to select DataFrame columns, so
# its order is the column order handed to scaler.transform().
# NOTE(review): presumably this must match, name for name and in the same
# order, the columns the scaler and model were fitted on -- confirm against
# the training code before editing.
selected_features =  [
    "_SolventAccessibilityC3", "_SecondaryStrC1", "_SecondaryStrC3", "_ChargeC1", "_PolarityC1",
    "_NormalizedVDWVC1", "_HydrophobicityC3", "_SecondaryStrT23", "_PolarizabilityD1001",
    "_PolarizabilityD2001", "_PolarizabilityD3001", "_SolventAccessibilityD1001",
    "_SolventAccessibilityD2001", "_SolventAccessibilityD3001", "_SecondaryStrD1001",
    "_SecondaryStrD1075", "_SecondaryStrD2001", "_SecondaryStrD3001", "_ChargeD1001",
    "_ChargeD1025", "_ChargeD2001", "_ChargeD3075", "_ChargeD3100", "_PolarityD1001",
    "_PolarityD1050", "_PolarityD2001", "_PolarityD3001", "_NormalizedVDWVD1001",
    "_NormalizedVDWVD2001", "_NormalizedVDWVD2025", "_NormalizedVDWVD2050", "_NormalizedVDWVD3001",
    "_HydrophobicityD1001", "_HydrophobicityD2001", "_HydrophobicityD3001", "_HydrophobicityD3025",
    "A", "R", "D", "C", "E", "Q", "H", "I", "M", "P", "Y", "V",
    "AR", "AV", "RC", "RL", "RV", "CR", "CC", "CL", "CK", "EE", "EI", "EL",
    "HC", "IA", "IL", "IV", "LA", "LC", "LE", "LI", "LT", "LV", "KC", "MA",
    "MS", "SC", "TC", "TV", "YC", "VC", "VE", "VL", "VK", "VV",
    "MoreauBrotoAuto_FreeEnergy30", "MoranAuto_Hydrophobicity2", "MoranAuto_Hydrophobicity4",
    "GearyAuto_Hydrophobicity20", "GearyAuto_Hydrophobicity24", "GearyAuto_Hydrophobicity26",
    "GearyAuto_Hydrophobicity27", "GearyAuto_Hydrophobicity28", "GearyAuto_Hydrophobicity29",
    "GearyAuto_Hydrophobicity30", "GearyAuto_AvFlexibility22", "GearyAuto_AvFlexibility26",
    "GearyAuto_AvFlexibility27", "GearyAuto_AvFlexibility28", "GearyAuto_AvFlexibility29",
    "GearyAuto_AvFlexibility30", "GearyAuto_Polarizability22", "GearyAuto_Polarizability24",
    "GearyAuto_Polarizability25", "GearyAuto_Polarizability27", "GearyAuto_Polarizability28",
    "GearyAuto_Polarizability29", "GearyAuto_Polarizability30", "GearyAuto_FreeEnergy24",
    "GearyAuto_FreeEnergy25", "GearyAuto_FreeEnergy30", "GearyAuto_ResidueASA21",
    "GearyAuto_ResidueASA22", "GearyAuto_ResidueASA23", "GearyAuto_ResidueASA24",
    "GearyAuto_ResidueASA30", "GearyAuto_ResidueVol21", "GearyAuto_ResidueVol24",
    "GearyAuto_ResidueVol25", "GearyAuto_ResidueVol26", "GearyAuto_ResidueVol28",
    "GearyAuto_ResidueVol29", "GearyAuto_ResidueVol30", "GearyAuto_Steric18",
    "GearyAuto_Steric21", "GearyAuto_Steric26", "GearyAuto_Steric27", "GearyAuto_Steric28",
    "GearyAuto_Steric29", "GearyAuto_Steric30", "GearyAuto_Mutability23", "GearyAuto_Mutability25",
    "GearyAuto_Mutability26", "GearyAuto_Mutability27", "GearyAuto_Mutability28",
    "GearyAuto_Mutability29", "GearyAuto_Mutability30", "APAAC1", "APAAC4", "APAAC5",
    "APAAC6", "APAAC8", "APAAC9", "APAAC12", "APAAC13", "APAAC15", "APAAC18", "APAAC19",
    "APAAC24"
]



def extract_features(sequence):
    """Compute the scaled feature row for a single protein sequence.

    The input is cleaned first (all whitespace removed, letters upper-cased)
    because it arrives from a free-text box, while the propy calculators are
    given plain one-letter residue codes.  Five propy descriptor sets are then
    computed, merged into one dict, reduced to the columns named in
    ``selected_features`` (in that order) and passed through the pre-fitted
    ``scaler``.

    Args:
        sequence: Amino-acid sequence as a string of one-letter codes.

    Returns:
        The result of ``scaler.transform`` on a one-row DataFrame, or ``None``
        when the input is not a non-empty string or any step fails.  Failures
        are reported with ``print`` and are never raised to the caller.
    """
    if not isinstance(sequence, str):
        print("Error: Input sequence must be a string.")
        return None

    # Pasted sequences often carry spaces, line breaks or lower-case letters;
    # normalise them so they are not treated as residues.
    sequence = "".join(sequence.split()).upper()
    if not sequence:
        print("Error: Input sequence is empty.")
        return None

    try:
        # Calculate features from different ProPy modules
        comp_features = AAComposition.CalculateAAComposition(sequence)
        # selected_features contains two-letter (dipeptide) keys such as "AR".
        # Per the propy docs CalculateAAComposition only yields the 20
        # single-residue keys, so the dipeptide composition is computed
        # explicitly; otherwise the column selection below raises KeyError.
        dipeptide_features = AAComposition.CalculateDipeptideComposition(sequence)
        auto_features = Autocorrelation.CalculateAutoTotal(sequence)
        ctd_features = CTD.CalculateCTD(sequence)
        pseudo_features = PseudoAAC.GetAPseudoAAC(sequence)  # Use default parameters

        # Combine all features into a single dictionary
        all_features = {
            **comp_features,
            **dipeptide_features,
            **auto_features,
            **ctd_features,
            **pseudo_features,
        }

        # One-row DataFrame restricted to (and ordered by) the selected features
        all_features_df = pd.DataFrame([all_features])[selected_features]

        # Normalize the features using the pre-fitted scaler
        return scaler.transform(all_features_df)

    except ZeroDivisionError:
        print("Error: Division by zero encountered in feature calculation.  Check your input sequence.")
    except KeyError as e:
        print(f"Error: Missing feature {e}.  Check feature name consistency and ProPy version.")
    except Exception as e:
        # Deliberate catch-all: the UI callback expects None rather than a crash
        print(f"An unexpected error occurred during feature extraction: {e}")

    return None



def predict(sequence):
    """Classify a protein sequence and describe the result as a sentence.

    Args:
        sequence: Protein sequence text, forwarded unchanged to
            ``extract_features``.

    Returns:
        A message string.  If ``extract_features`` returns ``None`` this is a
        fixed error message.  Otherwise it is the probability, as a percentage
        with two decimals, of the predicted class: "AMP" wording when the
        model predicts ``0``, "Non-AMP" wording for any other prediction.
    """
    feature_matrix = extract_features(sequence)

    # Guard clause: feature extraction signals failure with None
    if feature_matrix is None:
        return "Error: Could not extract features. Please check the input sequence."

    # feature_matrix is passed to the model as-is; [0] takes the first (only
    # expected) entry of each result
    predicted_class = model.predict(feature_matrix)[0]
    class_probs = model.predict_proba(feature_matrix)[0]

    if predicted_class != 0:
        return f"{class_probs[1] * 100:.2f}% chance of being Non-AMP"
    return f"{class_probs[0] * 100:.2f}% chance of being an Antimicrobial Peptide (AMP)"


# Gradio interface setup: a single text box feeds predict(), whose returned
# string is shown in a label component.
sequence_input = gr.Textbox(label="Enter Protein Sequence")
prediction_output = gr.Label(label="Prediction")

iface = gr.Interface(
    fn=predict,
    inputs=sequence_input,
    outputs=prediction_output,
    title="AMP Classifier",
    description="Enter an amino acid sequence (e.g., FLPVLAGGL) to predict whether it's an antimicrobial peptide (AMP) or not.",
)

# Start the app as soon as the module is executed; share=True requests a
# public share link from Gradio.
iface.launch(share=True)