nonzeroexit committed on
Commit
e199881
·
verified ·
1 Parent(s): 472e9a6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -44
app.py CHANGED
@@ -2,66 +2,80 @@ import gradio as gr
2
  import joblib
3
  import numpy as np
4
  import pandas as pd
5
- from propy import AAComposition
6
  from sklearn.preprocessing import MinMaxScaler
7
 
8
- # Load trained model and scaler
9
- model = joblib.load("SVM.joblib")
10
  scaler = joblib.load("norm.joblib")
11
 
12
- # Selected features used in training
13
- selected_features = [
14
- "A", "R", "N", "D", "C", "E", "Q", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V",
15
- "AA", "AR", "AN", "AD", "AC", "AE", "AQ", "AG", "AI", "AL", "AK", "AF", "AP", "AS", "AT", "AY", "AV",
16
- "RA", "RR", "RN", "RD", "RC", "RE", "RQ", "RG", "RH", "RI", "RL", "RK", "RM", "RF", "RS", "RT", "RY", "RV",
17
- "NA", "NR", "ND", "NC", "NE", "NG", "NI", "NL", "NK", "NP",
18
- "DA", "DR", "DN", "DD", "DC", "DE", "DQ", "DG", "DI", "DL", "DK", "DP", "DS", "DT", "DV",
19
- "CA", "CR", "CN", "CD", "CC", "CE", "CG", "CH", "CI", "CL", "CK", "CF", "CP", "CS", "CT", "CY", "CV",
20
- "EA", "ER", "EN", "ED", "EC", "EE", "EQ", "EG", "EI", "EL", "EK", "EP", "ES", "ET", "EV",
21
- "QA", "QR", "QC", "QG", "QL", "QK", "QP", "QT", "QV",
22
- "GA", "GR", "GD", "GC", "GE", "GQ", "GG", "GI", "GL", "GK", "GF", "GP", "GS", "GW", "GY", "GV",
23
- "HC", "HG", "HL", "HK", "HP",
24
- "IA", "IR", "ID", "IC", "IE", "II", "IL", "IK", "IF", "IP", "IS", "IT", "IV",
25
- "LA", "LR", "LN", "LD", "LC", "LE", "LQ", "LG", "LI", "LL", "LK", "LM", "LF", "LP", "LS", "LT", "LV",
26
- "KA", "KR", "KN", "KD", "KC", "KE", "KQ", "KG", "KH", "KI", "KL", "KK", "KM", "KF", "KP", "KS", "KT", "KV",
27
- "MA", "ME", "MI", "ML", "MK", "MF", "MP", "MS", "MT", "MV",
28
- "FR", "FC", "FQ", "FG", "FI", "FL", "FF", "FS", "FT", "FY", "FV",
29
- "PA", "PR", "PD", "PC", "PE", "PG", "PL", "PK", "PS", "PV",
30
- "SA", "SR", "SD", "SC", "SE", "SG", "SH", "SI", "SL", "SK", "SF", "SP", "SS", "ST", "SY", "SV",
31
- "TA", "TR", "TN", "TC", "TE", "TG", "TI", "TL", "TK", "TF", "TP", "TS", "TT", "TV",
32
- "WC",
33
- "YR", "YD", "YC", "YG", "YL", "YS", "YV",
34
- "VA", "VR", "VD", "VC", "VE", "VQ", "VG", "VI", "VL", "VK", "VP", "VS", "VT", "VY", "VV"
 
 
 
 
 
 
 
 
 
 
35
  ]
36
 
37
  def extract_features(sequence):
38
- """Extract selected features and normalize them."""
39
- all_features = AAComposition.CalculateAADipeptideComposition(sequence)
40
- feature_values = list(all_features.values())
41
- feature_array = np.array(feature_values).reshape(-1, 1)
42
- feature_array = feature_array[: 420] # Ensure we only use 420 features
43
- normalized_features = scaler.transform(feature_array.T)
44
- normalized_features = normalized_features.flatten()
 
 
 
 
 
45
 
46
  # Select features that match training data
47
- selected_feature_dict = {feature: normalized_features[i] for i, feature in enumerate(selected_features)
48
- if feature in all_features}
49
- selected_feature_df = pd.DataFrame([selected_feature_dict])
50
- selected_feature_array = selected_feature_df.T.to_numpy()
51
 
52
- return selected_feature_array
53
 
54
  def predict(sequence):
55
  """Predict if the sequence is an AMP or not."""
56
  features = extract_features(sequence)
57
- prediction = model.predict(features.T)[0]
58
- probabilities = model.predict_proba(features.T)[0]
59
- prob_amp = probabilities[0]
60
- prob_non_amp = probabilities[1]
 
61
 
62
- return f"{prob_amp * 100:.2f}% chance of being an Antimicrobial Peptide (AMP)" if prediction == 0 else f"{prob_non_amp * 100:.2f}% chance of being Non-AMP"
63
 
64
- # Gradio interface
65
  iface = gr.Interface(
66
  fn=predict,
67
  inputs=gr.Textbox(label="Enter Protein Sequence"),
 
2
  import joblib
3
  import numpy as np
4
  import pandas as pd
5
+ from propy import AAComposition, Autocorrelation, CTD, PseudoAAC
6
  from sklearn.preprocessing import MinMaxScaler
7
 
8
+ model = joblib.load("RF.joblib")
 
9
  scaler = joblib.load("norm.joblib")
10
 
11
+ selected_features = [
12
+ "_SolventAccessibilityC3", "_SecondaryStrC1", "_SecondaryStrC3", "_ChargeC1", "_PolarityC1",
13
+ "_NormalizedVDWVC1", "_HydrophobicityC3", "_SecondaryStrT23", "_PolarizabilityD1001",
14
+ "_PolarizabilityD2001", "_PolarizabilityD3001", "_SolventAccessibilityD1001",
15
+ "_SolventAccessibilityD2001", "_SolventAccessibilityD3001", "_SecondaryStrD1001",
16
+ "_SecondaryStrD1075", "_SecondaryStrD2001", "_SecondaryStrD3001", "_ChargeD1001",
17
+ "_ChargeD1025", "_ChargeD2001", "_ChargeD3075", "_ChargeD3100", "_PolarityD1001",
18
+ "_PolarityD1050", "_PolarityD2001", "_PolarityD3001", "_NormalizedVDWVD1001",
19
+ "_NormalizedVDWVD2001", "_NormalizedVDWVD2025", "_NormalizedVDWVD2050", "_NormalizedVDWVD3001",
20
+ "_HydrophobicityD1001", "_HydrophobicityD2001", "_HydrophobicityD3001", "_HydrophobicityD3025",
21
+ "A", "R", "D", "C", "E", "Q", "H", "I", "M", "P", "Y", "V",
22
+ "AR", "AV", "RC", "RL", "RV", "CR", "CC", "CL", "CK", "EE", "EI", "EL",
23
+ "HC", "IA", "IL", "IV", "LA", "LC", "LE", "LI", "LT", "LV", "KC", "MA",
24
+ "MS", "SC", "TC", "TV", "YC", "VC", "VE", "VL", "VK", "VV",
25
+ "MoreauBrotoAuto_FreeEnergy30", "MoranAuto_Hydrophobicity2", "MoranAuto_Hydrophobicity4",
26
+ "GearyAuto_Hydrophobicity20", "GearyAuto_Hydrophobicity24", "GearyAuto_Hydrophobicity26",
27
+ "GearyAuto_Hydrophobicity27", "GearyAuto_Hydrophobicity28", "GearyAuto_Hydrophobicity29",
28
+ "GearyAuto_Hydrophobicity30", "GearyAuto_AvFlexibility22", "GearyAuto_AvFlexibility26",
29
+ "GearyAuto_AvFlexibility27", "GearyAuto_AvFlexibility28", "GearyAuto_AvFlexibility29",
30
+ "GearyAuto_AvFlexibility30", "GearyAuto_Polarizability22", "GearyAuto_Polarizability24",
31
+ "GearyAuto_Polarizability25", "GearyAuto_Polarizability27", "GearyAuto_Polarizability28",
32
+ "GearyAuto_Polarizability29", "GearyAuto_Polarizability30", "GearyAuto_FreeEnergy24",
33
+ "GearyAuto_FreeEnergy25", "GearyAuto_FreeEnergy30", "GearyAuto_ResidueASA21",
34
+ "GearyAuto_ResidueASA22", "GearyAuto_ResidueASA23", "GearyAuto_ResidueASA24",
35
+ "GearyAuto_ResidueASA30", "GearyAuto_ResidueVol21", "GearyAuto_ResidueVol24",
36
+ "GearyAuto_ResidueVol25", "GearyAuto_ResidueVol26", "GearyAuto_ResidueVol28",
37
+ "GearyAuto_ResidueVol29", "GearyAuto_ResidueVol30", "GearyAuto_Steric18",
38
+ "GearyAuto_Steric21", "GearyAuto_Steric26", "GearyAuto_Steric27", "GearyAuto_Steric28",
39
+ "GearyAuto_Steric29", "GearyAuto_Steric30", "GearyAuto_Mutability23", "GearyAuto_Mutability25",
40
+ "GearyAuto_Mutability26", "GearyAuto_Mutability27", "GearyAuto_Mutability28",
41
+ "GearyAuto_Mutability29", "GearyAuto_Mutability30", "APAAC1", "APAAC4", "APAAC5",
42
+ "APAAC6", "APAAC8", "APAAC9", "APAAC12", "APAAC13", "APAAC15", "APAAC18", "APAAC19",
43
+ "APAAC24"
44
  ]
45
 
# One-letter codes of the 20 standard amino acids accepted as input.
_AMINO_ACIDS = frozenset("ACDEFGHIKLMNPQRSTVWY")


def extract_features(sequence):
    """Compute the selected propy descriptors for a sequence and scale them.

    The sequence is normalised (all whitespace removed, upper-cased) and
    validated before any descriptor is computed, so malformed input fails
    with a clear message instead of an error from inside propy.

    Args:
        sequence: Protein sequence in one-letter amino acid codes.

    Returns:
        The output of ``scaler.transform`` for a single-row frame whose
        columns are ``selected_features``, in that order.

    Raises:
        ValueError: If the sequence is empty or contains characters other
            than the 20 standard amino acid letters.
        KeyError: If propy did not produce one of ``selected_features``.
    """
    cleaned = "".join((sequence or "").split()).upper()
    if not cleaned:
        raise ValueError("Protein sequence is empty.")
    unknown = sorted(set(cleaned) - _AMINO_ACIDS)
    if unknown:
        raise ValueError(
            "Protein sequence contains unsupported character(s): "
            + ", ".join(unknown)
        )

    # NOTE(review): very short peptides may still be rejected by the
    # autocorrelation / pseudo-AAC descriptors — confirm the minimum length.
    aa_features = AAComposition.CalculateAADipeptideComposition(cleaned)
    auto_features = Autocorrelation.CalculateAutoTotal(cleaned)
    ctd_features = CTD.CalculateCTD(cleaned)
    pseaac_features = PseudoAAC.GetAPseudoAAC(cleaned, lamda=9)

    # Later dicts win on duplicate keys, same merge order as before.
    all_features = {
        **aa_features,
        **auto_features,
        **ctd_features,
        **pseaac_features,
    }

    # Fail early, naming the missing descriptors, rather than letting pandas
    # raise a generic "not in index" error.
    missing = [name for name in selected_features if name not in all_features]
    if missing:
        raise KeyError(
            f"{len(missing)} selected feature(s) were not produced by propy, "
            f"e.g. {missing[:5]}"
        )

    # Keep only the selected features, in the order of selected_features.
    feature_df = pd.DataFrame(
        [[all_features[name] for name in selected_features]],
        columns=selected_features,
    )

    # Normalize
    normalized_features = scaler.transform(feature_df)

    return normalized_features
67
 
# One-letter codes of the 20 standard amino acids accepted as input.
_ALLOWED_RESIDUES = frozenset("ACDEFGHIKLMNPQRSTVWY")


def predict(sequence):
    """Predict if the sequence is an AMP or not.

    The input is normalised (whitespace removed, upper-cased) and validated
    first, so empty or malformed input produces a readable message instead
    of an exception from the feature pipeline.

    Args:
        sequence: Protein sequence typed by the user, in one-letter codes.

    Returns:
        A string with the probability of the predicted class, or a message
        explaining why the input was rejected.
    """
    cleaned = "".join((sequence or "").split()).upper()
    if not cleaned:
        return "Please enter a protein sequence."
    unknown = sorted(set(cleaned) - _ALLOWED_RESIDUES)
    if unknown:
        return (
            "Invalid sequence: unsupported character(s) "
            + ", ".join(unknown)
            + ". Use only the 20 standard amino acid letters."
        )

    features = extract_features(cleaned)
    prediction = model.predict(features)[0]
    probabilities = model.predict_proba(features)[0]

    # predict_proba columns follow model.classes_, so look the column up
    # instead of assuming the classes are ordered [0, 1].
    class_labels = list(model.classes_)
    confidence = probabilities[class_labels.index(prediction)]

    # NOTE(review): label 0 is treated as AMP, as in the previous version —
    # confirm against the labels used for training.
    if prediction == 0:
        return f"{confidence * 100:.2f}% chance of being an Antimicrobial Peptide (AMP)"
    return f"{confidence * 100:.2f}% chance of being Non-AMP"
78
 
 
79
  iface = gr.Interface(
80
  fn=predict,
81
  inputs=gr.Textbox(label="Enter Protein Sequence"),