dexay committed on
Commit
371b679
·
1 Parent(s): e27afc9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +212 -210
app.py CHANGED
@@ -6,221 +6,223 @@ from transformers import pipeline, TokenClassificationPipeline, BertForTokenCla
6
  st.header("Knowledge extraction on Endocrine disruptors")
7
  st.write("This tool lets you extract relation triples concerning interactions between: endocrine disrupting chemicals, hormones, receptors and cancers.")
8
  st.write("It is the result of an end of studies project within ESI school and dedicated to biomedical researchers looking to extract precise information about the subject without digging into long publications.")
9
- x = st.text_area('Entre you text on EDCs:')
10
 
 
 
 
11
 
12
- #model.to("cpu")
13
- tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-large-cased-v1.1", truncation = True, padding=True, model_max_length=512,)
14
- model_checkpoint = BertForTokenClassification.from_pretrained("dexay/Ner2HgF", )
15
-
16
-
17
- model_re = AutoModelForSequenceClassification.from_pretrained("dexay/reDs3others", )
18
- token_classifier = pipeline("token-classification", tokenizer = tokenizer,model=model_checkpoint, )
19
-
20
- st.text("Knowledge extraction is in progress ...")
21
-
22
- if x and x[-1] not in ".?:":
23
- x += "."
24
-
25
- biotext = x
26
-
27
- #split document or text into sentences
28
-
29
- lstbiotext = []
30
-
31
- flag = 0
32
- tempsen = ""
33
- for e in biotext:
34
- tempsen += e
35
- if e=="(":
36
- flag = 1
37
- if e==")":
38
- flag = 0
39
- if (e =="." or e =="?" or e ==":" ) and flag == 0 :
40
- lstbiotext += [tempsen.strip()]
41
- tempsen = ""
42
-
43
- ddata = lstbiotext
44
-
45
- #tokenized_dat = tokenize_function(ddata)
46
-
47
- az = token_classifier(ddata)
48
-
49
-
50
- #code to convert NER output to RE input compatible format
51
-
52
- #tg_inorder are decoding of labels on which the model was fine tuned on
53
-
54
- tg_inorder = ['O',
55
- 'B-HORMONE',
56
- 'B-EXP_PER',
57
- 'I-HORMONE',
58
- 'I-CANCER',
59
- 'I-EDC',
60
- 'B-RECEPTOR',
61
- 'B-CANCER',
62
- 'I-RECEPTOR',
63
- 'B-EDC',
64
- 'PAD']
65
-
66
- lstSentEnc = []
67
- lstSentbilbl = []
68
- lstSentEnt = []
69
- for itsent in az:
70
-
71
- sentaz = itsent
72
- ph = []
73
- phl = []
74
- for e in sentaz:
75
- if e["word"][0]=="#" and len(ph)!=0:
76
- ph[-1]+= e["word"][2:]
77
- else:
78
- ph += [e["word"]]
79
- phl += [e["entity"]]
80
-
81
-
82
- phltr = []
83
- for e in phl:
84
- phltr += [tg_inorder[int(e[-1])] if len(e)==7 else tg_inorder[int(e[-2:])]]
85
 
86
-
87
- nwph = []
88
- nwphltr = []
89
  flag = 0
90
- for i in range(len(phltr)-2):
91
- if phltr[i]=="O" and flag != 3 :
92
- nwph += [ph[i]]
93
- nwphltr += [phltr[i]]
94
- continue
95
- elif flag == 3:
96
- nwph[-1] += " "+ph[i]
97
- flag = 1
98
- continue
99
- elif phltr[i][2:]==phltr[i+1][2:] and phltr[i+1][0]=="I" and flag == 0:
100
- nwph += [ph[i]]
101
- nwphltr += [phltr[i]]
102
- flag = 1
103
- continue
104
- elif phltr[i][2:]==phltr[i+1][2:] and phltr[i+1][0]=="I" and flag == 1:
105
- nwph[-1] += " "+ph[i]
106
- continue
107
- # xox with flag == 3
108
- elif phltr[i][2:]==phltr[i+2][2:] and phltr[i+1]=="O" and phltr[i+2][0]=="I" and flag == 0:
109
- nwph += [ph[i]]
110
- nwphltr += [phltr[i]]
111
- flag = 3
112
- continue
113
- elif phltr[i][2:]==phltr[i+2][2:] and phltr[i+1]=="O" and phltr[i+2][0]=="I" and flag == 1:
114
- nwph[-1] += " "+ph[i]
115
- flag = 3
116
- continue
117
- #\ xox
118
- elif flag == 1:
119
- nwph[-1] += " "+ph[i]
120
- flag = 0
121
- continue
122
- else :
123
- nwph += [ph[i]]
124
- nwphltr += [phltr[i]]
125
- continue
126
-
127
-
128
- # nwph,nwphltr,len(nwph),len(nwphltr)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
 
130
-
131
- if nwphltr.count("O") <= len(nwphltr)-2:
132
- for i in range(len(nwph)-1):
133
- if nwphltr[i] != "O":
134
- for j in range(i,len(nwph)):
135
- if nwphltr[j] != "O" and nwphltr[j] != nwphltr[i] and {nwphltr[j], nwphltr[i]} != {"B-CANCER","B-RECEPTOR"}:
136
- sen2ad = ""
137
- for g in range(i):
138
- sen2ad += nwph[g]+" "
139
- sen2ad += "<e1>"+nwph[i]+"</e1> "
140
-
141
- for t in range(i+1,j):
142
- sen2ad += nwph[t]+" "
143
- sen2ad += "<e2>"+nwph[j]+"</e2>"
144
- if j<len(nwph):
145
- for l in range(j+1,len(nwph)):
146
- sen2ad += " "+nwph[l]
147
- lstSentEnc += [sen2ad]
148
- lstSentbilbl += [[nwphltr[i],nwphltr[j]]]
149
- lstSentEnt += [[nwph[i],nwph[j]]]
150
-
151
-
152
-
153
- #lstSentEnc,lstSentEnt,lstSentbilbl
154
-
155
- st.text("Entities detected, Next: Relation detection ...")
156
-
157
-
158
- # Relation extraction part
159
-
160
- token_classifier = pipeline("text-classification", tokenizer = tokenizer,model=model_re,
161
- )
162
-
163
- rrdata = lstSentEnc
164
-
165
-
166
-
167
- outre = token_classifier(rrdata)
168
-
169
-
170
- trLABELS = ['INCREASE_RISK(e1,e2)',
171
- 'SPEED_UP(e2,e1)',
172
- 'DECREASE_ACTIVITY(e1,e2)',
173
- 'NO_ASSOCIATION(e1,e2)',
174
- 'DECREASE(e1,e2)',
175
- 'BLOCK(e1,e2)',
176
- 'CAUSE(e1,e2)',
177
- 'ACTIVATE(e2,e1)',
178
- 'DEVELOP(e2,e1)',
179
- 'ALTER(e1,e2)',
180
- 'INCREASE_RISK(e2,e1)',
181
- 'SPEED_UP(e1,e2)',
182
- 'INTERFER(e1,e2)',
183
- 'DECREASE(e2,e1)',
184
- 'NO_ASSOCIATION(e2,e1)',
185
- 'INCREASE(e2,e1)',
186
- 'INTERFER(e2,e1)',
187
- 'ACTIVATE(e1,e2)',
188
- 'INCREASE(e1,e2)',
189
- 'MIMIC(e1,e2)',
190
- 'MIMIC(e2,e1)',
191
- 'BLOCK(e2,e1)',
192
- 'other',
193
- 'BIND(e2,e1)',
194
- 'INCREASE_ACTIVITY(e2,e1)',
195
- 'ALTER(e2,e1)',
196
- 'CAUSE(e2,e1)',
197
- 'BIND(e1,e2)',
198
- 'DEVELOP(e1,e2)',
199
- 'DECREASE_ACTIVITY(e2,e1)']
200
-
201
-
202
-
203
- outrelbl = []
204
- for e in outre:
205
- outrelbl += [trLABELS[int(e['label'][-1])] if len(e["label"])==7 else trLABELS[int(e['label'][-2:])] ]
206
-
207
- for i in range(len(outrelbl)):
208
- if "(e2,e1)" in outrelbl[i]:
209
- lstSentbilbl[i][0],lstSentbilbl[i][1] = lstSentbilbl[i][1],lstSentbilbl[i][0]
210
- lstSentEnt[i][0],lstSentEnt[i][1] = lstSentEnt[i][1],lstSentEnt[i][0]
211
-
212
-
213
- edccan = []
214
-
215
-
216
- for i in range(len(outrelbl)):
217
- if outrelbl[i] != "other":
218
- edccan += [[lstSentEnc[i],lstSentEnt[i][0], lstSentEnt[i][1],lstSentbilbl[i][0]+" "+outrelbl[i][:-7]+" "+lstSentbilbl[i][1]]]
219
-
220
- edccandf = pd.DataFrame(edccan, columns= ["Sentence", "Entity 1", "Entity 2", "Relation"] )
221
-
222
- if x:
223
- out = token_classifier(x)
224
  st.table(edccandf)
225
 
226
 
 
6
  st.header("Knowledge extraction on Endocrine disruptors")
7
  st.write("This tool lets you extract relation triples concerning interactions between: endocrine disrupting chemicals, hormones, receptors and cancers.")
8
  st.write("It is the result of an end of studies project within ESI school and dedicated to biomedical researchers looking to extract precise information about the subject without digging into long publications.")
 
9
 
10
+ form = st.form(key='my-form')
11
+ x = form.text_input('Enter your text')
12
+ submit = form.form_submit_button('Submit')
13
 
14
+ if submit and len(x) != 0:
15
+ #model.to("cpu")
16
+ tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-large-cased-v1.1", truncation = True, padding=True, model_max_length=512,)
17
+ model_checkpoint = BertForTokenClassification.from_pretrained("dexay/Ner2HgF", )
18
+
19
+
20
+ model_re = AutoModelForSequenceClassification.from_pretrained("dexay/reDs3others", )
21
+ token_classifier = pipeline("token-classification", tokenizer = tokenizer,model=model_checkpoint, )
22
+
23
+ st.text("Knowledge extraction is in progress ...")
24
+
25
+ if x[-1] not in ".?:":
26
+ x += "."
27
+
28
+ biotext = x
29
+
30
+ #split document or text into sentences
31
+
32
+ lstbiotext = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
 
 
 
34
  flag = 0
35
+ tempsen = ""
36
+ for e in biotext:
37
+ tempsen += e
38
+ if e=="(":
39
+ flag = 1
40
+ if e==")":
41
+ flag = 0
42
+ if (e =="." or e =="?" or e ==":" ) and flag == 0 :
43
+ lstbiotext += [tempsen.strip()]
44
+ tempsen = ""
45
+
46
+ ddata = lstbiotext
47
+
48
+ #tokenized_dat = tokenize_function(ddata)
49
+
50
+ az = token_classifier(ddata)
51
+
52
+
53
+ # convert the NER output into the input format expected by the RE model
54
+
55
+ # tg_inorder decodes the label indices on which the model was fine-tuned
56
+
57
+ tg_inorder = ['O',
58
+ 'B-HORMONE',
59
+ 'B-EXP_PER',
60
+ 'I-HORMONE',
61
+ 'I-CANCER',
62
+ 'I-EDC',
63
+ 'B-RECEPTOR',
64
+ 'B-CANCER',
65
+ 'I-RECEPTOR',
66
+ 'B-EDC',
67
+ 'PAD']
68
+
69
+ lstSentEnc = []
70
+ lstSentbilbl = []
71
+ lstSentEnt = []
72
+ for itsent in az:
73
+
74
+ sentaz = itsent
75
+ ph = []
76
+ phl = []
77
+ for e in sentaz:
78
+ if e["word"][0]=="#" and len(ph)!=0:
79
+ ph[-1]+= e["word"][2:]
80
+ else:
81
+ ph += [e["word"]]
82
+ phl += [e["entity"]]
83
+
84
+
85
+ phltr = []
86
+ for e in phl:
87
+ phltr += [tg_inorder[int(e[-1])] if len(e)==7 else tg_inorder[int(e[-2:])]]
88
+
89
+
90
+ nwph = []
91
+ nwphltr = []
92
+ flag = 0
93
+ for i in range(len(phltr)-2):
94
+ if phltr[i]=="O" and flag != 3 :
95
+ nwph += [ph[i]]
96
+ nwphltr += [phltr[i]]
97
+ continue
98
+ elif flag == 3:
99
+ nwph[-1] += " "+ph[i]
100
+ flag = 1
101
+ continue
102
+ elif phltr[i][2:]==phltr[i+1][2:] and phltr[i+1][0]=="I" and flag == 0:
103
+ nwph += [ph[i]]
104
+ nwphltr += [phltr[i]]
105
+ flag = 1
106
+ continue
107
+ elif phltr[i][2:]==phltr[i+1][2:] and phltr[i+1][0]=="I" and flag == 1:
108
+ nwph[-1] += " "+ph[i]
109
+ continue
110
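+ # x-O-x pattern: two same-type entity tokens separated by a single "O" token; flag == 3 makes the next iteration merge that "O" token into the current entity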
111
+ elif phltr[i][2:]==phltr[i+2][2:] and phltr[i+1]=="O" and phltr[i+2][0]=="I" and flag == 0:
112
+ nwph += [ph[i]]
113
+ nwphltr += [phltr[i]]
114
+ flag = 3
115
+ continue
116
+ elif phltr[i][2:]==phltr[i+2][2:] and phltr[i+1]=="O" and phltr[i+2][0]=="I" and flag == 1:
117
+ nwph[-1] += " "+ph[i]
118
+ flag = 3
119
+ continue
120
+ #\ xox
121
+ elif flag == 1:
122
+ nwph[-1] += " "+ph[i]
123
+ flag = 0
124
+ continue
125
+ else :
126
+ nwph += [ph[i]]
127
+ nwphltr += [phltr[i]]
128
+ continue
129
+
130
+
131
+ # nwph,nwphltr,len(nwph),len(nwphltr)
132
+
133
+
134
+ if nwphltr.count("O") <= len(nwphltr)-2:
135
+ for i in range(len(nwph)-1):
136
+ if nwphltr[i] != "O":
137
+ for j in range(i,len(nwph)):
138
+ if nwphltr[j] != "O" and nwphltr[j] != nwphltr[i] and {nwphltr[j], nwphltr[i]} != {"B-CANCER","B-RECEPTOR"}:
139
+ sen2ad = ""
140
+ for g in range(i):
141
+ sen2ad += nwph[g]+" "
142
+ sen2ad += "<e1>"+nwph[i]+"</e1> "
143
+
144
+ for t in range(i+1,j):
145
+ sen2ad += nwph[t]+" "
146
+ sen2ad += "<e2>"+nwph[j]+"</e2>"
147
+ if j<len(nwph):
148
+ for l in range(j+1,len(nwph)):
149
+ sen2ad += " "+nwph[l]
150
+ lstSentEnc += [sen2ad]
151
+ lstSentbilbl += [[nwphltr[i],nwphltr[j]]]
152
+ lstSentEnt += [[nwph[i],nwph[j]]]
153
+
154
+
155
+
156
+ #lstSentEnc,lstSentEnt,lstSentbilbl
157
+
158
+ st.text("Entities detected, Next: Relation detection ...")
159
+
160
+
161
+ # Relation extraction part
162
+
163
+ token_classifier = pipeline("text-classification", tokenizer = tokenizer,model=model_re,
164
+ )
165
+
166
+ rrdata = lstSentEnc
167
+
168
+
169
+
170
+ outre = token_classifier(rrdata)
171
+
172
+
173
+ trLABELS = ['INCREASE_RISK(e1,e2)',
174
+ 'SPEED_UP(e2,e1)',
175
+ 'DECREASE_ACTIVITY(e1,e2)',
176
+ 'NO_ASSOCIATION(e1,e2)',
177
+ 'DECREASE(e1,e2)',
178
+ 'BLOCK(e1,e2)',
179
+ 'CAUSE(e1,e2)',
180
+ 'ACTIVATE(e2,e1)',
181
+ 'DEVELOP(e2,e1)',
182
+ 'ALTER(e1,e2)',
183
+ 'INCREASE_RISK(e2,e1)',
184
+ 'SPEED_UP(e1,e2)',
185
+ 'INTERFER(e1,e2)',
186
+ 'DECREASE(e2,e1)',
187
+ 'NO_ASSOCIATION(e2,e1)',
188
+ 'INCREASE(e2,e1)',
189
+ 'INTERFER(e2,e1)',
190
+ 'ACTIVATE(e1,e2)',
191
+ 'INCREASE(e1,e2)',
192
+ 'MIMIC(e1,e2)',
193
+ 'MIMIC(e2,e1)',
194
+ 'BLOCK(e2,e1)',
195
+ 'other',
196
+ 'BIND(e2,e1)',
197
+ 'INCREASE_ACTIVITY(e2,e1)',
198
+ 'ALTER(e2,e1)',
199
+ 'CAUSE(e2,e1)',
200
+ 'BIND(e1,e2)',
201
+ 'DEVELOP(e1,e2)',
202
+ 'DECREASE_ACTIVITY(e2,e1)']
203
+
204
+
205
+
206
+ outrelbl = []
207
+ for e in outre:
208
+ outrelbl += [trLABELS[int(e['label'][-1])] if len(e["label"])==7 else trLABELS[int(e['label'][-2:])] ]
209
+
210
+ for i in range(len(outrelbl)):
211
+ if "(e2,e1)" in outrelbl[i]:
212
+ lstSentbilbl[i][0],lstSentbilbl[i][1] = lstSentbilbl[i][1],lstSentbilbl[i][0]
213
+ lstSentEnt[i][0],lstSentEnt[i][1] = lstSentEnt[i][1],lstSentEnt[i][0]
214
+
215
+
216
+ edccan = []
217
+
218
+
219
+ for i in range(len(outrelbl)):
220
+ if outrelbl[i] != "other":
221
+ edccan += [[lstSentEnc[i],lstSentEnt[i][0], lstSentEnt[i][1],lstSentbilbl[i][0]+" "+outrelbl[i][:-7]+" "+lstSentbilbl[i][1]]]
222
+
223
+ edccandf = pd.DataFrame(edccan, columns= ["Sentence", "Entity 1", "Entity 2", "Relation"] )
224
+
225
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
  st.table(edccandf)
227
 
228