Spaces:

nonzeroexit
/

AMP-Classifier

Running

App Files Files Community

nonzeroexit committed on Mar 7

Commit

248a61c

verified ·

1 Parent(s): 81bcfb3

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -35

app.py CHANGED Viewed

@@ -5,12 +5,12 @@ import pandas as pd
 from propy import AAComposition, Autocorrelation, CTD, PseudoAAC
 from sklearn.preprocessing import MinMaxScaler
-# Load the pre-trained model and scaler
 model = joblib.load("RF.joblib")
 scaler = joblib.load("norm (1).joblib")
-# Define the list of selected features (IMPORTANT: Keep this consistent with training)
-selected_features =  [
     "_SolventAccessibilityC3", "_SecondaryStrC1", "_SecondaryStrC3", "_ChargeC1", "_PolarityC1",
     "_NormalizedVDWVC1", "_HydrophobicityC3", "_SecondaryStrT23", "_PolarizabilityD1001",
     "_PolarizabilityD2001", "_PolarizabilityD3001", "_SolventAccessibilityD1001",
@@ -45,69 +45,64 @@ selected_features =  [
     "APAAC24"
 ]
 def extract_features(sequence):
-    """Extracts features from a protein sequence and returns them as a NumPy array."""
     try:
-        # Calculate features from different ProPy modules
-        comp_features = AAComposition.CalculateAAComposition(sequence)
         auto_features = Autocorrelation.CalculateAutoTotal(sequence)
         ctd_features = CTD.CalculateCTD(sequence)
-        pseudo_features = PseudoAAC.GetAPseudoAAC(sequence)  # Use default parameters
-        # Combine all features into a single dictionary
-        all_features = {**comp_features, **auto_features, **ctd_features, **pseudo_features}
-        #print(len(all_features)) # debugging
-        # Convert to DataFrame, selecting only the required features
         all_features_df = pd.DataFrame([all_features])
-        all_features_df = all_features_df[selected_features]
-        # Normalize the features using the pre-fitted scaler
-        normalized_features = scaler.transform(all_features_df)
         return normalized_features
-    except ZeroDivisionError:
-        print("Error: Division by zero encountered in feature calculation.  Check your input sequence.")
-        return None  # Or handle appropriately
-    except KeyError as e:
-        print(f"Error: Missing feature {e}.  Check feature name consistency and ProPy version.")
-        return None # Or handle appropriately
     except Exception as e:
-        print(f"An unexpected error occurred during feature extraction: {e}")
-        return None  # Or handle appropriately
 def predict(sequence):
-    """Predicts whether the input sequence is an AMP and returns the prediction."""
     features = extract_features(sequence)
-    # Check if feature extraction was successful
     if features is None:
-        return "Error: Could not extract features. Please check the input sequence."
-    # No need to reshape here; extract_features already returns the correct shape
     prediction = model.predict(features)[0]
     probabilities = model.predict_proba(features)[0]
-    # Determine output string based on prediction
     if prediction == 0:
         return f"{probabilities[0] * 100:.2f}% chance of being an Antimicrobial Peptide (AMP)"
     else:
         return f"{probabilities[1] * 100:.2f}% chance of being Non-AMP"
-# Gradio interface setup
 iface = gr.Interface(
     fn=predict,
     inputs=gr.Textbox(label="Enter Protein Sequence"),
     outputs=gr.Label(label="Prediction"),
     title="AMP Classifier",
-    description="Enter an amino acid sequence (e.g., FLPVLAGGL) to predict whether it's an antimicrobial peptide (AMP) or not."
 )
 iface.launch(share=True)

 from propy import AAComposition, Autocorrelation, CTD, PseudoAAC
 from sklearn.preprocessing import MinMaxScaler
+# Load model and scaler
 model = joblib.load("RF.joblib")
 scaler = joblib.load("norm (1).joblib")
+# Feature list (KEEP THIS CONSISTENT with the features used in training)
+selected_features = [
     "_SolventAccessibilityC3", "_SecondaryStrC1", "_SecondaryStrC3", "_ChargeC1", "_PolarityC1",
     "_NormalizedVDWVC1", "_HydrophobicityC3", "_SecondaryStrT23", "_PolarizabilityD1001",
     "_PolarizabilityD2001", "_PolarizabilityD3001", "_SolventAccessibilityD1001",
     "APAAC24"
 ]
 def extract_features(sequence):
     """Build the normalized single-row feature matrix for one sequence.

     Descriptors from the propy autocorrelation, CTD, amphiphilic pseudo
     amino acid and dipeptide-composition calculators are merged into one
     mapping, projected onto ``selected_features`` (same names, same order)
     and passed through the pre-fitted ``scaler``.

     Args:
         sequence: Amino-acid sequence as a string.

     Returns:
         The result of ``scaler.transform`` on a one-row frame, or ``None``
         when the input is empty or any step raises.
     """
     try:
         # Text-box input often carries stray surrounding whitespace.
         sequence = sequence.strip()
         if not sequence:
             print("Error during feature extraction: empty sequence")
             return None

         # Later mappings win on key collisions, so dipeptide values take
         # priority over the other descriptor families.
         descriptors = {
             **Autocorrelation.CalculateAutoTotal(sequence),
             **CTD.CalculateCTD(sequence),
             **PseudoAAC.GetAPseudoAAC(sequence),
             **AAComposition.CalculateAADipeptideComposition(sequence),
         }

         # Zero-filling a feature the model relies on can skew the
         # prediction, so make the substitution visible in the logs.
         missing = [name for name in selected_features if name not in descriptors]
         if missing:
             print(f"Warning: {len(missing)} selected feature(s) were not computed and are set to 0: {missing}")

         # reindex() keeps the single data row while forcing the column set
         # and order to match selected_features. Building an empty frame and
         # calling update() on it copies nothing, because update() aligns on
         # row labels and an empty frame has none.
         feature_row = (
             pd.DataFrame([descriptors])
             .reindex(columns=selected_features, fill_value=0)
             .fillna(0)
         )
         return scaler.transform(feature_row)
     except (ZeroDivisionError, KeyError, TypeError, ValueError) as e:
         print(f"Error during feature extraction: {e}")
         return None
     except Exception as e:
         # Boundary catch: predict() relies on None to report a failure
         # instead of crashing the web app.
         print(f"An unexpected error occurred: {e}")
         return None
 def predict(sequence):
     """Classify one sequence and describe the outcome as a percentage string.

     Args:
         sequence: Amino-acid sequence forwarded to ``extract_features``.

     Returns:
         An error message when feature extraction yields ``None``; otherwise
         a string giving the probability of the predicted class.
     """
     feature_matrix = extract_features(sequence)
     if feature_matrix is None:
         return "Error: Could not extract features."

     predicted_class = model.predict(feature_matrix)[0]
     class_probabilities = model.predict_proba(feature_matrix)[0]

     # Label 0 is reported as AMP; anything else as Non-AMP.
     # NOTE(review): indexes 0/1 assume model.classes_ is ordered [0, 1] - confirm.
     if predicted_class != 0:
         return f"{class_probabilities[1] * 100:.2f}% chance of being Non-AMP"
     return f"{class_probabilities[0] * 100:.2f}% chance of being an Antimicrobial Peptide (AMP)"
+# Gradio interface
 iface = gr.Interface(
     title="AMP Classifier",
     description="Enter an amino acid sequence (e.g., FLPVLAGGL) to predict AMP.",
     fn=predict,
     # One text box in, one label out; the input component is still
     # constructed before the output component.
     inputs=gr.Textbox(label="Enter Protein Sequence"),
     outputs=gr.Label(label="Prediction"),
 )
 # Start serving the interface; share=True is forwarded to launch().
 iface.launch(share=True)