import streamlit as st
import pandas as pd
from transformers import (
    pipeline,
    BertForTokenClassification,
    AutoTokenizer,
    AutoModelForSequenceClassification,
)

st.header("Knowledge extraction on endocrine disruptors")
st.write("This tool lets you extract relation triples describing interactions between endocrine-disrupting chemicals, hormones, receptors, and cancers.")
st.write("It is the result of an end-of-studies project at ESI school, aimed at biomedical researchers who want to extract precise information on the subject without digging through long publications.")

form = st.form(key='my-form')
x = form.text_area('Enter text', height=275)
submit = form.form_submit_button('Submit')
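# Pipeline overview: 1) split the input into sentences, 2) tag entities with a
# token-classification model, 3) merge multi-token entities, 4) mark each
# entity pair with <e1>/<e2>, 5) classify the relation for each pair,
# 6) display the extracted triples.
# Example input (hypothetical): "Bisphenol A exposure increases the risk of
# breast cancer."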

if submit and len(x) != 0:

    st.text("Execution is in progress ...")

    # BioBERT tokenizer shared by the NER and relation-extraction models.
    tokenizer = AutoTokenizer.from_pretrained(
        "dmis-lab/biobert-large-cased-v1.1",
        truncation=True,
        padding=True,
        model_max_length=512,
    )
    # Fine-tuned NER model and relation-classification model.
    model_checkpoint = BertForTokenClassification.from_pretrained("dexay/Ner2HgF")
    model_re = AutoModelForSequenceClassification.from_pretrained("dexay/reDs3others")

    token_classifier = pipeline("token-classification", tokenizer=tokenizer, model=model_checkpoint)

    # Ensure the text ends with a sentence delimiter so the splitter below
    # flushes the last sentence.
    if x[-1] not in ".?:":
        x += "."

    biotext = x

    # Split the text into sentences on ".", "?" or ":", ignoring delimiters
    # that occur inside parentheses.
    lstbiotext = []
    flag = 0
    tempsen = ""
    for e in biotext:
        tempsen += e
        if e == "(":
            flag = 1
        if e == ")":
            flag = 0
        if (e == "." or e == "?" or e == ":") and flag == 0:
            lstbiotext += [tempsen.strip()]
            tempsen = ""

    ddata = lstbiotext

    # Run token-level NER over every sentence.
    az = token_classifier(ddata)
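    # `az` holds one list of token predictions per sentence; each entry is a
    # dict like {'word': '##ol', 'entity': 'LABEL_3', 'score': 0.98,
    # 'index': 5, 'start': 12, 'end': 14} (values are illustrative).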

    # NER tags in the order the model indexes them; the pipeline reports
    # entities as "LABEL_<i>", which this list maps back to BIO tags.
    tg_inorder = ['O',
                  'B-HORMONE',
                  'B-EXP_PER',
                  'I-HORMONE',
                  'I-CANCER',
                  'I-EDC',
                  'B-RECEPTOR',
                  'B-CANCER',
                  'I-RECEPTOR',
                  'B-EDC',
                  'PAD']
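    # Example: a token tagged 'LABEL_4' by the pipeline decodes to 'I-CANCER'.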

    lstSentEnc = []
    lstSentbilbl = []
    lstSentEnt = []
    for itsent in az:

        sentaz = itsent
        ph = []
        phl = []
        # Re-attach WordPiece subword tokens ("##xyz") to the previous word.
        for e in sentaz:
            if e["word"].startswith("##") and len(ph) != 0:
                ph[-1] += e["word"][2:]
            else:
                ph += [e["word"]]
                phl += [e["entity"]]

        # Decode "LABEL_<i>" into a BIO tag: one trailing digit when the
        # label string is 7 characters long ("LABEL_4"), two otherwise.
        phltr = []
        for e in phl:
            phltr += [tg_inorder[int(e[-1])] if len(e) == 7 else tg_inorder[int(e[-2:])]]

        # Merge multi-token entities into single phrases. flag == 0: outside
        # an entity; flag == 1: inside an entity being extended; flag == 3:
        # the entity is interrupted by exactly one "O" token.
        nwph = []
        nwphltr = []
        flag = 0
        for i in range(len(phltr)):
            # Look ahead safely; past the end of the sentence the tag is "O".
            nxt = phltr[i + 1] if i + 1 < len(phltr) else "O"
            nxt2 = phltr[i + 2] if i + 2 < len(phltr) else "O"
            if phltr[i] == "O" and flag != 3:
                nwph += [ph[i]]
                nwphltr += [phltr[i]]
            elif flag == 3:
                nwph[-1] += " " + ph[i]
                flag = 1
            elif phltr[i][2:] == nxt[2:] and nxt[0] == "I" and flag == 0:
                nwph += [ph[i]]
                nwphltr += [phltr[i]]
                flag = 1
            elif phltr[i][2:] == nxt[2:] and nxt[0] == "I" and flag == 1:
                nwph[-1] += " " + ph[i]
            elif phltr[i][2:] == nxt2[2:] and nxt == "O" and nxt2[0] == "I" and flag == 0:
                nwph += [ph[i]]
                nwphltr += [phltr[i]]
                flag = 3
            elif phltr[i][2:] == nxt2[2:] and nxt == "O" and nxt2[0] == "I" and flag == 1:
                nwph[-1] += " " + ph[i]
                flag = 3
            elif flag == 1:
                nwph[-1] += " " + ph[i]
                flag = 0
            else:
                nwph += [ph[i]]
                nwphltr += [phltr[i]]
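        # Example: "breast"/B-CANCER followed by "cancer"/I-CANCER collapses
        # into the single entity "breast cancer" tagged B-CANCER.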

        # Keep sentences with at least two tagged entities, then emit one
        # marked-up copy per valid entity pair: the first entity is wrapped in
        # <e1></e1> and the second in <e2></e2>, the format fed to the relation
        # classifier below. Same-type and CANCER-RECEPTOR pairs are skipped.
        if nwphltr.count("O") <= len(nwphltr) - 2:
            for i in range(len(nwph) - 1):
                if nwphltr[i] != "O":
                    for j in range(i + 1, len(nwph)):
                        if nwphltr[j] != "O" and nwphltr[j] != nwphltr[i] and {nwphltr[j], nwphltr[i]} != {"B-CANCER", "B-RECEPTOR"}:
                            sen2ad = ""
                            for g in range(i):
                                sen2ad += nwph[g] + " "
                            sen2ad += "<e1>" + nwph[i] + "</e1> "
                            for t in range(i + 1, j):
                                sen2ad += nwph[t] + " "
                            sen2ad += "<e2>" + nwph[j] + "</e2>"
                            for l in range(j + 1, len(nwph)):
                                sen2ad += " " + nwph[l]
                            lstSentEnc += [sen2ad]
                            lstSentbilbl += [[nwphltr[i], nwphltr[j]]]
                            lstSentEnt += [[nwph[i], nwph[j]]]
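        # Example marked-up sentence (hypothetical): "<e1>Bisphenol A</e1>
        # exposure increases the risk of <e2>breast cancer</e2>."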

    st.text("Entities detected.")
    st.text("")
    st.text("Next: Relation detection ...")

    # Sentence-level relation classification over the entity-marked sentences.
    relation_classifier = pipeline("text-classification", tokenizer=tokenizer, model=model_re)

    rrdata = lstSentEnc

    outre = relation_classifier(rrdata)
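    # `outre` is a list with one dict per marked-up sentence, e.g.
    # {'label': 'LABEL_12', 'score': 0.91} (values are illustrative).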

    # Relation labels in the order the model indexes them ("LABEL_<i>").
    # The (e1,e2)/(e2,e1) suffix encodes the direction of the relation.
    trLABELS = ['INCREASE_RISK(e1,e2)',
                'SPEED_UP(e2,e1)',
                'DECREASE_ACTIVITY(e1,e2)',
                'NO_ASSOCIATION(e1,e2)',
                'DECREASE(e1,e2)',
                'BLOCK(e1,e2)',
                'CAUSE(e1,e2)',
                'ACTIVATE(e2,e1)',
                'DEVELOP(e2,e1)',
                'ALTER(e1,e2)',
                'INCREASE_RISK(e2,e1)',
                'SPEED_UP(e1,e2)',
                'INTERFER(e1,e2)',
                'DECREASE(e2,e1)',
                'NO_ASSOCIATION(e2,e1)',
                'INCREASE(e2,e1)',
                'INTERFER(e2,e1)',
                'ACTIVATE(e1,e2)',
                'INCREASE(e1,e2)',
                'MIMIC(e1,e2)',
                'MIMIC(e2,e1)',
                'BLOCK(e2,e1)',
                'other',
                'BIND(e2,e1)',
                'INCREASE_ACTIVITY(e2,e1)',
                'ALTER(e2,e1)',
                'CAUSE(e2,e1)',
                'BIND(e1,e2)',
                'DEVELOP(e1,e2)',
                'DECREASE_ACTIVITY(e2,e1)']
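    # Example: 'LABEL_0' decodes to 'INCREASE_RISK(e1,e2)', i.e. entity 1
    # increases the risk of entity 2.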

    # Decode "LABEL_<i>" into a relation string, as for the NER tags above.
    outrelbl = []
    for e in outre:
        outrelbl += [trLABELS[int(e['label'][-1])] if len(e['label']) == 7 else trLABELS[int(e['label'][-2:])]]

    # Normalise direction: for (e2,e1) relations, swap the two entities so
    # that entity 1 is always the subject of the relation.
    for i in range(len(outrelbl)):
        if "(e2,e1)" in outrelbl[i]:
            lstSentbilbl[i][0], lstSentbilbl[i][1] = lstSentbilbl[i][1], lstSentbilbl[i][0]
            lstSentEnt[i][0], lstSentEnt[i][1] = lstSentEnt[i][1], lstSentEnt[i][0]

    # Collect the extracted triples, dropping pairs classified as "other".
    # Each row shows the marked sentence, both entities with their types
    # (BIO prefix stripped), and the relation name without its direction
    # suffix. Note: entity 2's type comes from lstSentbilbl[i][1].
    edccan = []
    for i in range(len(outrelbl)):
        if outrelbl[i] != "other":
            edccan += [[lstSentEnc[i],
                        lstSentEnt[i][0] + " [" + lstSentbilbl[i][0][2:] + "]",
                        lstSentEnt[i][1] + " [" + lstSentbilbl[i][1][2:] + "]",
                        outrelbl[i][:-7]]]

    edccandf = pd.DataFrame(edccan, columns=["Sentence", "Entity 1", "Entity 2", "Relation"])

    st.table(edccandf)
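    # Example row (hypothetical): Entity 1 "Bisphenol A [EDC]", Entity 2
    # "breast cancer [CANCER]", Relation "INCREASE_RISK".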