AlienChen committed on
Commit
b07c84e
·
verified ·
1 Parent(s): edb3c00

Create compute_class_weights.py

Browse files
Files changed (1) hide show
  1. dataset/compute_class_weights.py +47 -0
dataset/compute_class_weights.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import ast
3
+ from sklearn.model_selection import train_test_split
4
+ import numpy as np
5
+ import torch.nn.functional as F
6
+ import torch
7
+ from torch.utils.data import Dataset
8
+ from datasets import Dataset as HFDataset, DatasetDict
9
+ from transformers import AutoTokenizer
10
+ import pdb
11
+
12
def main():
    """Compute balanced binary class weights (binding vs. non-binding
    residues) over the training split of the dataset.

    Reads ``dataset_drop_500.csv``, counts binding-site positions and total
    sequence lengths per row, reproduces the 60/20/20 train/val/test split
    used elsewhere (random_state=42), and prints the inverse-frequency
    weights ``N_total / (2 * N_class)`` for the binding and non-binding
    classes computed on the training rows only.

    Raises:
        ValueError: if the training split contains only one class, which
            would make a class weight undefined (division by zero).
    """
    data = pd.read_csv('dataset_drop_500.csv')

    print(len(data))

    binding_sites = data['mutTarget_motifs'].tolist()
    targets = data['Target'].tolist()

    # No need for padding the first position of binding sites for class
    # weight calculations.
    # 'mutTarget_motifs' is stored as a stringified Python literal
    # (presumably a list of positions) — parse it, then reduce each row to
    # counts: number of binding-site positions vs. total sequence length.
    binding_sites = [ast.literal_eval(binding_site) for binding_site in binding_sites]
    binding_sites = [len(binding_site) for binding_site in binding_sites]
    targets = [len(seq) for seq in targets]

    # 60/20/20 split: hold out 20% for test, then 0.25 of the remaining
    # 80% (= 20% overall) for validation. Fixed seed keeps the split
    # reproducible across runs.
    train_val_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
    train_data, val_data = train_test_split(train_val_data, test_size=0.25, random_state=42)

    # NOTE(review): positional indexing into the Python lists below assumes
    # the DataFrame carries the default RangeIndex 0..n-1, which is true
    # straight after read_csv.
    train_index = train_data.index.to_numpy()
    print(len(train_index))

    train_binding_dataset = [binding_sites[i] for i in train_index]
    train_targets = [targets[i] for i in train_index]

    # Inverse-frequency weighting: w_c = N_total / (2 * N_c), so the two
    # class weights average to 1 on a balanced dataset.
    num_binding_sites = sum(train_binding_dataset)
    num_total = sum(train_targets)
    num_non_binding_sites = num_total - num_binding_sites
    if num_binding_sites == 0 or num_non_binding_sites == 0:
        raise ValueError(
            "Training split contains only one class; class weights are undefined."
        )
    weight_for_binding = num_total / (2 * num_binding_sites)
    weight_for_non_binding = num_total / (2 * num_non_binding_sites)

    print(weight_for_binding, weight_for_non_binding)


if __name__ == "__main__":
    main()