nonzeroexit committed on
Commit
248a61c
·
verified ·
1 Parent(s): 81bcfb3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -35
app.py CHANGED
@@ -5,12 +5,12 @@ import pandas as pd
5
  from propy import AAComposition, Autocorrelation, CTD, PseudoAAC
6
  from sklearn.preprocessing import MinMaxScaler
7
 
8
- # Load the pre-trained model and scaler
9
  model = joblib.load("RF.joblib")
10
  scaler = joblib.load("norm (1).joblib")
11
 
12
- # Define the list of selected features (IMPORTANT: Keep this consistent with training)
13
- selected_features = [
14
  "_SolventAccessibilityC3", "_SecondaryStrC1", "_SecondaryStrC3", "_ChargeC1", "_PolarityC1",
15
  "_NormalizedVDWVC1", "_HydrophobicityC3", "_SecondaryStrT23", "_PolarizabilityD1001",
16
  "_PolarizabilityD2001", "_PolarizabilityD3001", "_SolventAccessibilityD1001",
@@ -45,69 +45,64 @@ selected_features = [
45
  "APAAC24"
46
  ]
47
 
48
-
49
-
50
  def extract_features(sequence):
51
- """Extracts features from a protein sequence and returns them as a NumPy array."""
52
  try:
53
- # Calculate features from different ProPy modules
54
- comp_features = AAComposition.CalculateAAComposition(sequence)
 
 
 
 
55
  auto_features = Autocorrelation.CalculateAutoTotal(sequence)
56
  ctd_features = CTD.CalculateCTD(sequence)
57
- pseudo_features = PseudoAAC.GetAPseudoAAC(sequence) # Use default parameters
58
-
59
- # Combine all features into a single dictionary
60
- all_features = {**comp_features, **auto_features, **ctd_features, **pseudo_features}
61
- #print(len(all_features)) # debugging
62
 
63
- # Convert to DataFrame, selecting only the required features
64
  all_features_df = pd.DataFrame([all_features])
65
- all_features_df = all_features_df[selected_features]
66
 
 
 
 
 
 
 
67
 
68
- # Normalize the features using the pre-fitted scaler
69
- normalized_features = scaler.transform(all_features_df)
70
 
 
 
71
  return normalized_features
72
 
73
- except ZeroDivisionError:
74
- print("Error: Division by zero encountered in feature calculation. Check your input sequence.")
75
- return None # Or handle appropriately
76
- except KeyError as e:
77
- print(f"Error: Missing feature {e}. Check feature name consistency and ProPy version.")
78
- return None # Or handle appropriately
79
  except Exception as e:
80
- print(f"An unexpected error occurred during feature extraction: {e}")
81
- return None # Or handle appropriately
82
-
83
 
84
 
85
  def predict(sequence):
86
- """Predicts whether the input sequence is an AMP and returns the prediction."""
87
  features = extract_features(sequence)
88
-
89
- # Check if feature extraction was successful
90
  if features is None:
91
- return "Error: Could not extract features. Please check the input sequence."
92
 
93
- # No need to reshape here; extract_features already returns the correct shape
94
  prediction = model.predict(features)[0]
95
  probabilities = model.predict_proba(features)[0]
96
 
97
- # Determine output string based on prediction
98
  if prediction == 0:
99
  return f"{probabilities[0] * 100:.2f}% chance of being an Antimicrobial Peptide (AMP)"
100
  else:
101
  return f"{probabilities[1] * 100:.2f}% chance of being Non-AMP"
102
 
103
-
104
- # Gradio interface setup
105
  iface = gr.Interface(
106
  fn=predict,
107
  inputs=gr.Textbox(label="Enter Protein Sequence"),
108
  outputs=gr.Label(label="Prediction"),
109
  title="AMP Classifier",
110
- description="Enter an amino acid sequence (e.g., FLPVLAGGL) to predict whether it's an antimicrobial peptide (AMP) or not."
111
  )
112
 
113
  iface.launch(share=True)
 
5
  from propy import AAComposition, Autocorrelation, CTD, PseudoAAC
6
  from sklearn.preprocessing import MinMaxScaler
7
 
8
+ # Load model and scaler
9
  model = joblib.load("RF.joblib")
10
  scaler = joblib.load("norm (1).joblib")
11
 
12
+ # Feature list (KEEP THIS CONSISTENT)
13
+ selected_features = [
14
  "_SolventAccessibilityC3", "_SecondaryStrC1", "_SecondaryStrC3", "_ChargeC1", "_PolarityC1",
15
  "_NormalizedVDWVC1", "_HydrophobicityC3", "_SecondaryStrT23", "_PolarizabilityD1001",
16
  "_PolarizabilityD2001", "_PolarizabilityD3001", "_SolventAccessibilityD1001",
 
45
  "APAAC24"
46
  ]
47
 
 
 
def extract_features(sequence):
    """Compute, align, and normalize the descriptor row for one sequence.

    Dipeptide composition, total autocorrelation, CTD and
    ``PseudoAAC.GetAPseudoAAC`` descriptors are computed with propy, merged
    into a single-row DataFrame, restricted to ``selected_features`` (in
    that order), and passed through the pre-fitted ``scaler``.

    Args:
        sequence: Amino-acid sequence string handed to the propy calculators.

    Returns:
        The result of ``scaler.transform`` on the aligned single-row frame,
        or ``None`` if any step raised an exception.
    """
    try:
        # 1. Compute every descriptor family (same call order as before).
        dipeptide_features = AAComposition.CalculateAADipeptideComposition(sequence)
        auto_features = Autocorrelation.CalculateAutoTotal(sequence)
        ctd_features = CTD.CalculateCTD(sequence)
        pseudo_features = PseudoAAC.GetAPseudoAAC(sequence)

        # On a key collision the later mapping wins, so dipeptide values
        # take priority over the other families.
        all_features = {
            **auto_features,
            **ctd_features,
            **pseudo_features,
            **dipeptide_features,
        }

        # One row holding every computed descriptor.
        all_features_df = pd.DataFrame([all_features])

        # 2. Align to the training feature order.
        # reindex keeps the single row. The previous approach built an empty
        # (zero-row) frame and called DataFrame.update on it, which aligns on
        # the empty index and therefore never copied any values.
        missing_features = [
            col for col in selected_features if col not in all_features_df.columns
        ]
        if missing_features:
            print(f"Warning: features not computed, filled with 0: {missing_features}")
        aligned_df = all_features_df.reindex(columns=selected_features).fillna(0)

        # 3. Normalize with the pre-fitted scaler.
        normalized_features = scaler.transform(aligned_df)
        return normalized_features

    except (ZeroDivisionError, KeyError, TypeError, ValueError) as e:
        print(f"Error during feature extraction: {e}")
        return None
    except Exception as e:
        # Top-level boundary for the UI callback: report and signal failure.
        print(f"An unexpected error occurred: {e}")
        return None
 
83
 
84
 
def predict(sequence):
    """Classify a sequence and describe the outcome as a percentage string.

    Args:
        sequence: Sequence text forwarded unchanged to ``extract_features``.

    Returns:
        An error message when ``extract_features`` returns ``None``.
        Otherwise a string reporting the probability at index 0 as AMP when
        the predicted label equals 0, or the probability at index 1 as
        Non-AMP for any other label.
    """
    feature_matrix = extract_features(sequence)
    if feature_matrix is None:
        return "Error: Could not extract features."

    # Both calls operate on a single row; take its only result.
    predicted_label = model.predict(feature_matrix)[0]
    class_probabilities = model.predict_proba(feature_matrix)[0]

    if predicted_label != 0:
        return f"{class_probabilities[1] * 100:.2f}% chance of being Non-AMP"
    return f"{class_probabilities[0] * 100:.2f}% chance of being an Antimicrobial Peptide (AMP)"
98
 
# Web UI: one text box in, one label out, wired to predict().
iface = gr.Interface(
    predict,
    gr.Textbox(label="Enter Protein Sequence"),
    gr.Label(label="Prediction"),
    title="AMP Classifier",
    description="Enter an amino acid sequence (e.g., FLPVLAGGL) to predict AMP.",
)

# Start the app; share=True is forwarded to Gradio's launch().
iface.launch(share=True)