Spaces:

sandl
/

private_polymer_compound_prediction

Sleeping

App Files Files Community

bndl commited on Jul 21, 2023

Commit

9e770a8

1 Parent(s): e64a9a7

Update inference_polymers_gnn.py

Browse files

Files changed (1) hide show

inference_polymers_gnn.py +91 -51

inference_polymers_gnn.py CHANGED Viewed

@@ -5,69 +5,109 @@ from pathlib import Path
 sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
 import torch
 import pandas as pd
-from polymerlearn.utils import GraphDataset, get_Tg_add
-from polymerlearn.models.gnn import PolymerGNN_Tg
 from polymerlearn.utils import make_like_batch
-def convert_to_graphdataset(df, acid_positions = (0, 12), glycol_positions = (13, 25)):
-    """
-    Takes a dataframe with the input data and converts it to a graph dataset for the prediction
-    For the Tg the model takes the proportions of acid and glycols and the log of the Mw (PS) (no other parameters)
-    """
-    targets = ["Tg"]
     # Need to pass the target column to do the preprocessing - not sure why it is needed - must be clarified
     for col in targets:
         df[col] = 0
     df_duplicated = pd.concat([df, df]) # Because must specify a test set - this must be changed in the code
-    add_duplicated = get_Tg_add(df_duplicated)
     print(f"Inference to be done on data of size {df.shape}")
-    graph_data = GraphDataset(
-        data = df_duplicated,
-        structure_dir = './Structures/AG/xyz',
-        Y_target=targets,
-        test_size = 0.5,
-        add_features = add_duplicated,
-        ac = acid_positions,
-        gc = glycol_positions
-    )
-    test_data, Ytest, add_test = graph_data.get_test()
-    print(f"After data preprocessing, inference on {len(test_data)}")
-    n_predictions = len(Ytest)
-    return test_data, add_test, n_predictions
-def predict_from_graph(input_graph_data, input_additional_features, n_predictions, model_path="../polymerlearn/data_models/tg_model_test.pth"):
-    model = PolymerGNN_Tg(
-        input_feat= 6,         # How many input features on each node; don't change this
-        hidden_channels= 32,   # How many intermediate dimensions to use in model
-                                # Can change this ^^
-        num_additional= 1      # How many additional resin properties to include in the prediction
-                                # Corresponds to the number in get_IV_add
-    )
-    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-    state = torch.load(model_path, map_location=device)
-    model.load_state_dict(state)
-    model.eval()
-    predictions = []
-    print(f"Prediction done on {n_predictions}")
-    with torch.no_grad():
-        for i in range(n_predictions):
-            batch_like_tup = make_like_batch(input_graph_data[i])
-            pred = model(*batch_like_tup, input_additional_features[i]).item()
-            predictions.append(pred)
-    return predictions
-def predict(df, acid_positions = (0, 12), glycol_positions = (13, 25), model_path="../polymerlearn/data_models/tg_model_test.pth"):
-    test_data, add_test, n_predictions = convert_to_graphdataset(df, acid_positions=acid_positions, glycol_positions=glycol_positions)
-    predictions = predict_from_graph(test_data, add_test, n_predictions, model_path=model_path)
-    return predictions

 sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
 import torch
 import pandas as pd
+from polymerlearn.utils import GraphDataset, get_Tg_add, get_IV_add
+from polymerlearn.models.gnn import PolymerGNN_Tg, PolymerGNN_IV
 from polymerlearn.utils import make_like_batch
+# def convert_to_graphdataset(df, acid_positions = (0, 12), glycol_positions = (13, 25)):
+#     """
+#     Takes a dataframe with the input data and converts it to a graph dataset for the prediction
+#     For the Tg the model takes the proportions of acid and glycols and the log of the Mw (PS) (no other parameters)
+#     """
+#     targets = ["Tg", "IV"]
+#     # Need to pass the target column to do the preprocessing - not sure why it is needed - must be clarified
+#     for col in targets:
+#         df[col] = 0
+#     df_duplicated = pd.concat([df, df]) # Because must specify a test set - this must be changed in the code
+#     add_duplicated = get_Tg_add(df_duplicated)
+#     print(f"Inference to be done on data of size {df.shape}")
+#     graph_data = GraphDataset(
+#         data = df_duplicated,
+#         structure_dir = './Structures/AG/xyz',
+#         Y_target=targets,
+#         test_size = 0.5,
+#         add_features = add_duplicated,
+#         ac = acid_positions,
+#         gc = glycol_positions
+#     )
+#     test_data, Ytest, add_test = graph_data.get_test()
+#     print(f"After data preprocessing, inference on {len(test_data)}")
+#     n_predictions = len(Ytest)
+#     return test_data, add_test, n_predictions
+def predict(df, model_path="../polymerlearn/data_models/", acid_positions = (0, 12), glycol_positions = (13, 25)):
+    targets = ["Tg", "IV"]
     # Need to pass the target column to do the preprocessing - not sure why it is needed - must be clarified
     for col in targets:
         df[col] = 0
     df_duplicated = pd.concat([df, df]) # Because must specify a test set - this must be changed in the code
+    add_features = {"Tg": get_Tg_add(df_duplicated), "IV": get_IV_add(df_duplicated)}
     print(f"Inference to be done on data of size {df.shape}")
+    print("Tg", add_features["Tg"].shape)
+    print("Tg", add_features["IV"].shape)
+    pred_all = []
+    for pred in targets:
+        graph_data = GraphDataset(
+            data = df_duplicated,
+            structure_dir = './Structures/AG/xyz',
+            Y_target=targets,
+            test_size = 0.5,
+            add_features = add_features[pred],
+            ac = acid_positions,
+            gc = glycol_positions
+        )
+        test_data, Ytest, add_test = graph_data.get_test()
+        print(f"After data preprocessing, inference on {len(test_data)}")
+        n_predictions = len(Ytest)
+        if pred == "Tg":
+            model = PolymerGNN_Tg(
+                input_feat= 6,         # How many input features on each node; don't change this
+                hidden_channels= 32,   # How many intermediate dimensions to use in model
+                                        # Can change this ^^
+                num_additional= add_features[pred].shape[1]     # How many additional resin properties to include in the prediction
+                                        # Corresponds to the number in get_IV_add
+            )
+        else:
+            model = PolymerGNN_IV(
+                input_feat= 6,         # How many input features on each node; don't change this
+                hidden_channels= 32,   # How many intermediate dimensions to use in model
+                                        # Can change this ^^
+                num_additional= add_features[pred].shape[1]     # How many additional resin properties to include in the prediction
+                                        # Corresponds to the number in get_IV_add
+            )
+        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+        state = torch.load(os.path.join(model_path, f"{pred}_model.pth"), map_location=device)
+        model.load_state_dict(state)
+        model.eval()
+        predictions = []
+        print(f"Prediction done on {n_predictions}")
+        with torch.no_grad():
+            for i in range(n_predictions):
+                batch_like_tup = make_like_batch(test_data[i])
+                pred = model(*batch_like_tup, add_test[i]).item()
+                predictions.append(pred)
+        pred_all.append(predictions)
+    return pred_all
+# def predict(df, acid_positions = (0, 12), glycol_positions = (13, 25), model_path="../polymerlearn/data_models/tg_model_test.pth"):
+#     test_data, add_test, n_predictions = convert_to_graphdataset(df, acid_positions=acid_positions, glycol_positions=glycol_positions)
+#     predictions = predict_from_graph(test_data, add_test, n_predictions, model_path=model_path)
+#     return predictions