Spaces:

ShivamKum4r
/

Drug-Toxicity-Prediction

Sleeping

App Files Community

ShivamKum4r committed 22 days ago

Commit

227ad73

verified ·

1 Parent(s): db6c0b7

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +100 -91

src/streamlit_app.py CHANGED Viewed

@@ -16,9 +16,9 @@ import plotly.express as px
 from rdkit.Chem import Draw
 from torch_geometric.data import Batch
 from rdkit.Chem import Descriptors
 import time
 # ------------------- Models -------------------
@@ -114,6 +114,16 @@ with msg_threshold.container():
 # ------------------- Utility Functions -------------------
 fp_gen = GetMorganGenerator(radius=2, fpSize=1024)
 def get_molecule_info(mol):
     return {
         "Formula": Chem.rdMolDescriptors.CalcMolFormula(mol),
@@ -122,8 +132,6 @@ def get_molecule_info(mol):
         "Bonds": mol.GetNumBonds()
     }
 def predict_gcn(smiles):
     graph = smiles_to_graph(smiles)
     if graph is None:
@@ -177,6 +185,8 @@ def smiles_to_graph(smiles, label=None):
     return data
 # def predict_gcn(smiles):
 #     graph = smiles_to_graph(smiles)
 #     if graph is None or graph.x.size(0) == 0:
@@ -199,26 +209,19 @@ def smiles_to_graph(smiles, label=None):
 # df['mol'] = df['smiles'].apply(Chem.MolFromSmiles)
 # df = df[df['mol'].notna()].reset_index(drop=True)
-df = pd.read_csv("tox21.csv")[['smiles', 'SR-HSE']].dropna()
 df = df[df['SR-HSE'].isin([0, 1])].reset_index(drop=True)
 # ✅ Filter invalid or unprocessable SMILES
 def is_valid_graph(smi):
-    mol = Chem.MolFromSmiles(smi)
-    return mol is not None and smiles_to_graph(smi) is not None
 df = df[df['smiles'].apply(is_valid_graph)].reset_index(drop=True)
 def create_graph_dataset(smiles_list, labels):
-    data_list = []
-    for smi, label in zip(smiles_list, labels):
-        data = smiles_to_graph(smi, label)
-        if data:
-            data_list.append(data)
-    return data_list
 graph_data = create_graph_dataset(df['smiles'], df['SR-HSE'])
 test_loader = DataLoader(graph_data, batch_size=32)
@@ -237,16 +240,15 @@ def plot_distribution(df, model_type, input_prob=None):
 # ------------------- Prediction Cache -------------------
 @st.cache_data(show_spinner="Generating predictions...")
 def predict_fp(smiles):
     try:
-        mol = Chem.MolFromSmiles(smiles)
-        if mol is None:
             return "Invalid SMILES", 0.0
         fp = fp_gen.GetFingerprint(mol)
-        fp_array = np.array(fp).reshape(1, -1)
         with torch.no_grad():
-            logits = fp_model(torch.tensor(fp_array).float())
             prob = torch.sigmoid(logits).item()
         return ("Toxic" if prob > 0.5 else "Non-toxic"), prob
     except Exception as e:
@@ -298,6 +300,7 @@ tab1, tab2 = st.tabs(["🔬 Fingerprint Model", "🧬 GCN Model"])
 with tab1:
     st.subheader("Fingerprint-based Prediction")
     with st.form("fp_form"):
         smiles_fp = st.text_input("Enter SMILES", "CCO")
         show_debug_fp = st.checkbox("🐞 Show Debug Info (raw score/logit)", key="fp_debug")
@@ -305,35 +308,42 @@ with tab1:
     if predict_btn:
         with st.spinner("Predicting..."):
-            mol = Chem.MolFromSmiles(smiles_fp)
-            if mol:
-                fp = fp_gen.GetFingerprint(mol)
-                arr = np.array(fp).reshape(1, -1)
-                tensor = torch.tensor(arr).float()
-                with torch.no_grad():
-                    output = fp_model(tensor)
-                    prob = torch.sigmoid(output).item()
-                    raw_score = output.item()
-                    label = "Toxic" if prob > 0.5 else "Non-toxic"
-                    color = "red" if label == "Toxic" else "green"
-                st.markdown(f"<h4>🧾 Prediction: <span style='color:{color}'>{label}</span> — <code>{prob:.3f}</code></h4>", unsafe_allow_html=True)
-                if show_debug_fp:
-                    st.code(f"📉 Raw Logit: {raw_score:.4f}", language='text')
-                    st.markdown("#### Fingerprint Vector (First 20 bits)")
-                    st.code(str(arr[0][:20]) + " ...", language="text")
-                st.image(Draw.MolToImage(mol), caption="Molecular Structure", width=250)
-                info = get_molecule_info(mol)
-                st.markdown("### Molecule Info:")
-                for k, v in info.items():
-                    st.markdown(f"**{k}:** {v}")
-                st.plotly_chart(plot_distribution(df, 'fp', prob), use_container_width=True)
-            else:
                 st.error("❌ Invalid SMILES input. Please check your string.")
     with st.expander("📌 Example SMILES to Try"):
         st.markdown("""
@@ -360,12 +370,12 @@ with tab1:
         else:
             st.info("Fingerprint model predictions not available.")
 with tab2:
     st.subheader("Graph Neural Network Prediction")
     SUPPORTED_ATOMS = {1, 6, 7, 8, 9, 16, 17, 35, 53}  # H, C, N, O, F, S, Cl, Br, I
     def is_supported(mol):
         return all(atom.GetAtomicNum() in SUPPORTED_ATOMS for atom in mol.GetAtoms())
@@ -376,48 +386,48 @@ with tab2:
     if gcn_btn:
         with st.spinner("Predicting..."):
-            mol = Chem.MolFromSmiles(smiles_gcn)
-            if mol is None:
                 st.error("❌ Invalid SMILES: could not parse molecule.")
-            elif not is_supported(mol):
-                st.error("⚠️ This molecule contains unsupported atoms (e.g. Sn, P, etc.). GCN model only supports common organic elements.")
             else:
-                graph = smiles_to_graph(smiles_gcn)
-                if graph is None:
-                    st.error("❌ SMILES is valid but could not be converted to graph. Possibly malformed structure.")
                 else:
-                    batch = Batch.from_data_list([graph])
-                    with torch.no_grad():
-                        out = gcn_model(batch)
-                        prob = torch.sigmoid(out).item()
-                        raw_score = out.item()
-                        label = "Toxic" if prob > best_threshold else "Non-toxic"
-                        color = "red" if label == "Toxic" else "green"
-                    st.markdown(f"<h4>🧾 GCN Prediction: <span style='color:{color}'>{label}</span> — <code>{prob:.3f}</code></h4>", unsafe_allow_html=True)
-                    if show_debug:
-                        st.code(f"📉 Raw Logit: {raw_score:.4f}", language='text')
-                    st.image(Draw.MolToImage(mol), caption="Molecular Structure", width=250)
-                    def get_molecule_info(mol):
-                        return {
-                            "Molecular Weight": round(Chem.Descriptors.MolWt(mol), 2),
-                            "LogP": round(Chem.Crippen.MolLogP(mol), 2),
-                            "Num H-Bond Donors": Chem.Lipinski.NumHDonors(mol),
-                            "Num H-Bond Acceptors": Chem.Lipinski.NumHAcceptors(mol),
-                            "TPSA": round(Chem.rdMolDescriptors.CalcTPSA(mol), 2),
-                            "Num Rotatable Bonds": Chem.Lipinski.NumRotatableBonds(mol)
-                        }
-                    info = get_molecule_info(mol)
-                    st.markdown("### Molecule Info:")
-                    for k, v in info.items():
-                        st.markdown(f"**{k}:** {v}")
-                    st.plotly_chart(plot_distribution(df, 'gcn', prob), use_container_width=True)
     with st.expander("📌 Example SMILES to Try"):
         st.markdown("""
@@ -441,19 +451,18 @@ with tab2:
             st.info("Predictions not available yet.")
     with st.expander("🧪 Top 5 Toxic Predictions from Test Set"):
-        if 'gcn_prob' in df:
             def is_valid_gcn(smi):
                 mol = Chem.MolFromSmiles(smi)
                 return mol is not None and is_supported(mol) and smiles_to_graph(smi) is not None
             top_toxic = df[df['gcn_prob'] > best_threshold].copy()
             top_toxic = top_toxic[top_toxic['smiles'].apply(is_valid_gcn)]
-            top_toxic = top_toxic.sort_values('gcn_prob', ascending=False).head(5)
-            if not top_toxic.empty:
                 st.table(top_toxic[['smiles', 'gcn_prob']].rename(columns={'gcn_prob': 'Predicted Probability'}))
             else:
                 st.info("No valid top predictions available.")
         else:
             st.info("GCN model predictions not available.")

 from rdkit.Chem import Draw
 from torch_geometric.data import Batch
 from rdkit.Chem import Descriptors
 import time
+from rdkit import RDLogger
+RDLogger.DisableLog('rdApp.*')
 # ------------------- Models -------------------
 # ------------------- Utility Functions -------------------
 fp_gen = GetMorganGenerator(radius=2, fpSize=1024)
+def is_valid_smiles(smiles):
+    try:
+        mol = Chem.MolFromSmiles(smiles)
+        if mol is None or mol.GetNumAtoms() == 0:
+            return False
+        Chem.SanitizeMol(mol)  # Force check for chemical correctness
+        return True
+    except:
+        return False
 def get_molecule_info(mol):
     return {
         "Formula": Chem.rdMolDescriptors.CalcMolFormula(mol),
         "Bonds": mol.GetNumBonds()
     }
 def predict_gcn(smiles):
     graph = smiles_to_graph(smiles)
     if graph is None:
     return data
 # def predict_gcn(smiles):
 #     graph = smiles_to_graph(smiles)
 #     if graph is None or graph.x.size(0) == 0:
 # df['mol'] = df['smiles'].apply(Chem.MolFromSmiles)
 # df = df[df['mol'].notna()].reset_index(drop=True)
+df = pd.read_csv("tox21.csv")[['smiles', 'SR-HSE']].dropna() #src changes
 df = df[df['SR-HSE'].isin([0, 1])].reset_index(drop=True)
 # ✅ Filter invalid or unprocessable SMILES
 def is_valid_graph(smi):
+    return is_valid_smiles(smi) and smiles_to_graph(smi) is not None
 df = df[df['smiles'].apply(is_valid_graph)].reset_index(drop=True)
 def create_graph_dataset(smiles_list, labels):
+    return [smiles_to_graph(smi, label) for smi, label in zip(smiles_list, labels) if smiles_to_graph(smi, label)]
 graph_data = create_graph_dataset(df['smiles'], df['SR-HSE'])
 test_loader = DataLoader(graph_data, batch_size=32)
 # ------------------- Prediction Cache -------------------
 @st.cache_data(show_spinner="Generating predictions...")
 def predict_fp(smiles):
     try:
+        if not is_valid_smiles(smiles):
             return "Invalid SMILES", 0.0
+        mol = Chem.MolFromSmiles(smiles)
         fp = fp_gen.GetFingerprint(mol)
+        arr = np.array(fp).reshape(1, -1)
         with torch.no_grad():
+            logits = fp_model(torch.tensor(arr).float())
             prob = torch.sigmoid(logits).item()
         return ("Toxic" if prob > 0.5 else "Non-toxic"), prob
     except Exception as e:
 with tab1:
     st.subheader("Fingerprint-based Prediction")
     with st.form("fp_form"):
         smiles_fp = st.text_input("Enter SMILES", "CCO")
         show_debug_fp = st.checkbox("🐞 Show Debug Info (raw score/logit)", key="fp_debug")
     if predict_btn:
         with st.spinner("Predicting..."):
+            if not is_valid_smiles(smiles_fp):
                 st.error("❌ Invalid SMILES input. Please check your string.")
+            else:
+                try:
+                    mol = Chem.MolFromSmiles(smiles_fp)
+                    fp = fp_gen.GetFingerprint(mol)
+                    arr = np.array(fp).reshape(1, -1)
+                    tensor = torch.tensor(arr).float().to("cpu")
+                    fp_model.to("cpu")  # Ensure model is on CPU
+                    with torch.no_grad():
+                        output = fp_model(tensor)
+                        prob = torch.sigmoid(output).item()
+                        raw_score = output.item()
+                        label = "Toxic" if prob > 0.5 else "Non-toxic"
+                        color = "red" if label == "Toxic" else "green"
+                    st.markdown(f"<h4>🧾 Prediction: <span style='color:{color}'>{label}</span> — <code>{prob:.3f}</code></h4>", unsafe_allow_html=True)
+                    if show_debug_fp:
+                        st.code(f"📉 Raw Logit: {raw_score:.4f}", language='text')
+                        st.markdown("#### Fingerprint Vector (First 20 bits)")
+                        st.code(str(arr[0][:20]) + " ...", language="text")
+                    st.image(Draw.MolToImage(mol), caption="Molecular Structure", width=250)
+                    info = get_molecule_info(mol)
+                    st.markdown("### Molecule Info:")
+                    for k, v in info.items():
+                        st.markdown(f"**{k}:** {v}")
+                    st.plotly_chart(plot_distribution(df, 'fp', prob), use_container_width=True)
+                except Exception as e:
+                    st.error(f"Prediction error: {str(e)}")
     with st.expander("📌 Example SMILES to Try"):
         st.markdown("""
         else:
             st.info("Fingerprint model predictions not available.")
 with tab2:
     st.subheader("Graph Neural Network Prediction")
     SUPPORTED_ATOMS = {1, 6, 7, 8, 9, 16, 17, 35, 53}  # H, C, N, O, F, S, Cl, Br, I
     def is_supported(mol):
         return all(atom.GetAtomicNum() in SUPPORTED_ATOMS for atom in mol.GetAtoms())
     if gcn_btn:
         with st.spinner("Predicting..."):
+            if not is_valid_smiles(smiles_gcn):
                 st.error("❌ Invalid SMILES: could not parse molecule.")
             else:
+                mol = Chem.MolFromSmiles(smiles_gcn)
+                if not is_supported(mol):
+                    st.error("⚠️ This molecule contains unsupported atoms (e.g. Sn, P, etc.). GCN model only supports common organic elements.")
                 else:
+                    graph = smiles_to_graph(smiles_gcn)
+                    if graph is None:
+                        st.error("❌ SMILES is valid but could not be converted to graph. Possibly malformed structure.")
+                    else:
+                        batch = Batch.from_data_list([graph])
+                        with torch.no_grad():
+                            out = gcn_model(batch)
+                            prob = torch.sigmoid(out).item()
+                            raw_score = out.item()
+                            label = "Toxic" if prob > best_threshold else "Non-toxic"
+                            color = "red" if label == "Toxic" else "green"
+                        st.markdown(f"<h4>🧾 GCN Prediction: <span style='color:{color}'>{label}</span> — <code>{prob:.3f}</code></h4>", unsafe_allow_html=True)
+                        if show_debug:
+                            st.code(f"📉 Raw Logit: {raw_score:.4f}", language='text')
+                        st.image(Draw.MolToImage(mol), caption="Molecular Structure", width=250)
+                        def get_molecule_info(mol):
+                            """Return a dict of RDKit descriptor values for `mol`, keyed by display label."""
+                            # NOTE(review): this definition reuses the name of the earlier
+                            # get_molecule_info (Formula/Bonds) and appears to rebind it once
+                            # this branch runs — presumably intentional; confirm.
+                            return {
+                                "Molecular Weight": round(Chem.Descriptors.MolWt(mol), 2),
+                                "LogP": round(Chem.Crippen.MolLogP(mol), 2),
+                                "Num H-Bond Donors": Chem.Lipinski.NumHDonors(mol),
+                                "Num H-Bond Acceptors": Chem.Lipinski.NumHAcceptors(mol),
+                                "TPSA": round(Chem.rdMolDescriptors.CalcTPSA(mol), 2),
+                                "Num Rotatable Bonds": Chem.Lipinski.NumRotatableBonds(mol)
+                            }
+                        info = get_molecule_info(mol)
+                        st.markdown("### Molecule Info:")
+                        for k, v in info.items():
+                            st.markdown(f"**{k}:** {v}")
+                        st.plotly_chart(plot_distribution(df, 'gcn', prob), use_container_width=True)
     with st.expander("📌 Example SMILES to Try"):
         st.markdown("""
             st.info("Predictions not available yet.")
     with st.expander("🧪 Top 5 Toxic Predictions from Test Set"):
+        if 'gcn_prob' in df.columns:
             def is_valid_gcn(smi):
                 mol = Chem.MolFromSmiles(smi)
                 return mol is not None and is_supported(mol) and smiles_to_graph(smi) is not None
             top_toxic = df[df['gcn_prob'] > best_threshold].copy()
             top_toxic = top_toxic[top_toxic['smiles'].apply(is_valid_gcn)]
+            if not top_toxic.empty and 'gcn_prob' in top_toxic.columns:
+                top_toxic = top_toxic.sort_values('gcn_prob', ascending=False).head(5)
                 st.table(top_toxic[['smiles', 'gcn_prob']].rename(columns={'gcn_prob': 'Predicted Probability'}))
             else:
                 st.info("No valid top predictions available.")
         else:
             st.info("GCN model predictions not available.")