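"""EDC_IE / app.py

Streamlit demo: run biomedical NER (BioBERT fine-tuned as dexay/Ner2HgF) over
user-supplied text, merge the predicted word pieces into entity spans (EDC,
HORMONE, CANCER, RECEPTOR, EXP_PER), and encode each entity pair as an
<e1>/<e2>-tagged sentence ready for a downstream relation-extraction model.
"""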
import streamlit as st
from transformers import pipeline, BertForTokenClassification, AutoTokenizer
x = st.text_area("Enter biomedical text")

# Load the BioBERT tokenizer and the NER head fine-tuned as dexay/Ner2HgF.
tokenizer = AutoTokenizer.from_pretrained(
    "dmis-lab/biobert-large-cased-v1.1",
    truncation=True,
    padding=True,
    model_max_length=512,
)
model_checkpoint = BertForTokenClassification.from_pretrained("dexay/Ner2HgF")
token_classifier = pipeline("token-classification", tokenizer=tokenizer, model=model_checkpoint)

biotext = x
# Split the document into sentences on ".", "?" or ":", ignoring terminators
# that fall inside parentheses (flag is set between "(" and ")").
lstbiotext = []
flag = 0
tempsen = ""
for e in biotext:
    tempsen += e
    if e == "(":
        flag = 1
    if e == ")":
        flag = 0
    if (e == "." or e == "?" or e == ":") and flag == 0:
        lstbiotext += [tempsen.strip()]
        tempsen = ""
if tempsen.strip():
    # Keep any trailing text that does not end with a terminator.
    lstbiotext += [tempsen.strip()]
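# Illustrative example:
#   "BPA binds ER (Fig. 2). It may promote tumors."
#   -> ["BPA binds ER (Fig. 2).", "It may promote tumors."]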
ddata = lstbiotext
az = token_classifier(ddata)
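# az holds one list of predictions per sentence; each prediction is a dict
# with (among others) "word" — the word piece, possibly prefixed "##" — and
# "entity" — a raw label id such as "LABEL_3", decoded via tg_inorder below.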
# Convert the NER output into a relation-extraction (RE) compatible format.
# tg_inorder decodes the label ids the model was fine-tuned with:
tg_inorder = ['O',
'B-HORMONE',
'B-EXP_PER',
'I-HORMONE',
'I-CANCER',
'I-EDC',
'B-RECEPTOR',
'B-CANCER',
'I-RECEPTOR',
'B-EDC',
'PAD']
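# e.g. "LABEL_3" -> tg_inorder[3] == "I-HORMONE", "LABEL_10" -> "PAD"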
lstSentEnc = []
lstSentbilbl = []
lstSentEnt = []
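# For every entity pair found: lstSentEnc collects the <e1>/<e2>-tagged
# sentence, lstSentbilbl the two entity labels, lstSentEnt the two surface forms.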
for sentaz in az:
    # Re-assemble WordPiece tokens: a "##xx" piece is glued onto the previous
    # word; one label is kept per reassembled word.
    ph = []
    phl = []
    for e in sentaz:
        if e["word"][0] == "#" and len(ph) != 0:
            ph[-1] += e["word"][2:]
        else:
            ph += [e["word"]]
            phl += [e["entity"]]
    # Decode "LABEL_k" ids to tag names (one digit if len == 7, else two).
    phltr = []
    for e in phl:
        phltr += [tg_inorder[int(e[-1])] if len(e) == 7 else tg_inorder[int(e[-2:])]]
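    # Merge multi-word entities: consecutive B-/I- tokens of the same type are
    # joined into one span; `flag` tracks whether we are inside a span (1) or
    # bridging a single stray "O" between matching tags (3). Note the i+1/i+2
    # lookahead: the loop runs to len(phltr) - 2, so the sentence's final two
    # tokens are never appended to nwph.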
    nwph = []
    nwphltr = []
    flag = 0
    for i in range(len(phltr) - 2):
        if phltr[i] == "O" and flag != 3:
            nwph += [ph[i]]
            nwphltr += [phltr[i]]
            continue
        elif flag == 3:
            nwph[-1] += " " + ph[i]
            flag = 1
            continue
        elif phltr[i][2:] == phltr[i + 1][2:] and phltr[i + 1][0] == "I" and flag == 0:
            nwph += [ph[i]]
            nwphltr += [phltr[i]]
            flag = 1
            continue
        elif phltr[i][2:] == phltr[i + 1][2:] and phltr[i + 1][0] == "I" and flag == 1:
            nwph[-1] += " " + ph[i]
            continue
        # X-O-X: one stray "O" inside an entity span, bridged with flag == 3
        elif phltr[i][2:] == phltr[i + 2][2:] and phltr[i + 1] == "O" and phltr[i + 2][0] == "I" and flag == 0:
            nwph += [ph[i]]
            nwphltr += [phltr[i]]
            flag = 3
            continue
        elif phltr[i][2:] == phltr[i + 2][2:] and phltr[i + 1] == "O" and phltr[i + 2][0] == "I" and flag == 1:
            nwph[-1] += " " + ph[i]
            flag = 3
            continue
        # end of X-O-X handling
        elif flag == 1:
            nwph[-1] += " " + ph[i]
            flag = 0
            continue
        else:
            nwph += [ph[i]]
            nwphltr += [phltr[i]]
            continue
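    # Keep only sentences with at least two entity tokens; for every pair of
    # differently-labelled entities (except the CANCER/RECEPTOR combination),
    # emit an RE input with the pair marked up, e.g. (hypothetical):
    #   "<e1>BPA</e1> activates <e2>estrogen receptor</e2> ."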
    if nwphltr.count("O") <= len(nwphltr) - 2:
        for i in range(len(nwph) - 1):
            if nwphltr[i] != "O":
                for j in range(i + 1, len(nwph)):
                    if nwphltr[j] != "O" and nwphltr[j] != nwphltr[i] and {nwphltr[j], nwphltr[i]} != {"B-CANCER", "B-RECEPTOR"}:
                        # Rebuild the sentence with the pair marked <e1>...</e1>, <e2>...</e2>.
                        sen2ad = ""
                        for g in range(i):
                            sen2ad += nwph[g] + " "
                        sen2ad += "<e1>" + nwph[i] + "</e1> "
                        for t in range(i + 1, j):
                            sen2ad += nwph[t] + " "
                        sen2ad += "<e2>" + nwph[j] + "</e2>"
                        for l in range(j + 1, len(nwph)):
                            sen2ad += " " + nwph[l]
                        lstSentEnc += [sen2ad]
                        lstSentbilbl += [[nwphltr[i], nwphltr[j]]]
                        lstSentEnt += [[nwph[i], nwph[j]]]
# Display the RE-ready, entity-tagged sentences.
if x:
    st.markdown(lstSentEnc)