|
import streamlit as st |
|
import transformers |
|
from transformers import pipeline, TokenClassificationPipeline, BertForTokenClassification , AutoTokenizer |
|
|
|
# Free-text input box: the biomedical text to run NER over.
x = st.text_area('enter')
|
|
|
|
|
# Biomedical BERT tokenizer + NER model fine-tuned for the tag set listed in
# tg_inorder below, wired into a token-classification pipeline.
# NOTE(review): `truncation`/`padding` passed to from_pretrained are stored as
# init kwargs, not applied at tokenization time — inputs longer than
# model_max_length are presumably not actually truncated by the pipeline;
# confirm, and pass them at call time if needed.
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-large-cased-v1.1", truncation = True, padding=True, model_max_length=512,)

model_checkpoint = BertForTokenClassification.from_pretrained("dexay/Ner2HgF", )

# NOTE(review): Streamlit re-runs this script on every widget interaction, so
# the tokenizer and model are re-loaded each time; consider caching the
# pipeline (e.g. st.cache_resource) if the installed Streamlit supports it.
token_classifier = pipeline("token-classification", tokenizer = tokenizer,model=model_checkpoint, )
|
|
|
|
|
biotext = x |
|
|
|
|
|
|
|
lstbiotext = [] |
|
|
|
flag = 0 |
|
tempsen = "" |
|
for e in biotext: |
|
tempsen += e |
|
if e=="(": |
|
flag = 1 |
|
if e==")": |
|
flag = 0 |
|
if (e =="." or e =="?" or e ==":" ) and flag == 0 : |
|
lstbiotext += [tempsen.strip()] |
|
tempsen = "" |
|
|
|
ddata = lstbiotext |
|
|
|
|
|
|
|
# Run NER over all sentence chunks in one batch. `az` is one result per
# sentence: a list of token dicts carrying "word" and "entity" keys (the only
# keys read below).
# NOTE(review): if the input produced no sentences, ddata is [] — confirm the
# pipeline tolerates an empty batch.
az = token_classifier(ddata)
|
|
|
|
|
|
|
|
|
|
|
|
|
# Tag names indexed by the model's numeric label id ("LABEL_0" ->
# tg_inorder[0], ...). Used below to decode the pipeline's "entity" strings.
# NOTE(review): this ordering must match the id2label config of the
# dexay/Ner2HgF checkpoint — verify against the model config.
tg_inorder = ['I-RECEPTOR',
              'O',
              'B-RECEPTOR',
              'B-EDC',
              'I-EXP_PER',
              'B-EXP_PER',
              'I-CANCER',
              'I-EDC',
              'B-HORMONE',
              'I-HORMONE',
              'B-QUANTITY',
              'B-EXP_DUR',
              'I-QUANTITY',
              'B-CANCER',
              'PAD']
|
|
|
# --- Pair up detected entities into relation-candidate sentences ------------
# For each sentence's token-classification output:
#   1. merge WordPiece sub-tokens ("##xyz") back into whole words,
#   2. decode numeric "LABEL_n" tags via tg_inorder,
#   3. merge multi-word entities into single phrases,
#   4. emit one copy of the sentence per entity pair, wrapping the first
#      entity in <e1>...</e1> and the second in <e2>...</e2>.
lstSentEnc = []    # sentences annotated with <e1>/<e2> markers
lstSentbilbl = []  # [label_of_e1, label_of_e2] per emitted sentence
lstSentEnt = []    # [text_of_e1, text_of_e2] per emitted sentence
for itsent in az:

    sentaz = itsent
    ph = []   # reconstructed whole words
    phl = []  # raw entity label per word (label of the word's first piece)
    for e in sentaz:
        # "##"-prefixed pieces are WordPiece continuations: glue them onto
        # the previous word; only the first piece's label is kept, which
        # keeps ph and phl index-aligned.
        if e["word"][0]=="#" and len(ph)!=0:
            ph[-1]+= e["word"][2:]
        else:
            ph += [e["word"]]
            phl += [e["entity"]]

    # Decode "LABEL_<id>" strings to tag names. len==7 means a single-digit
    # id ("LABEL_3" -> last char); otherwise the id is the last two chars.
    phltr = []
    for e in phl:
        phltr += [tg_inorder[int(e[-1])] if len(e)==7 else tg_inorder[int(e[-2:])]]

    # Merge consecutive words of one entity into a single phrase.
    # flag: 0 = outside an entity, 1 = inside a multi-word entity,
    #       3 = bridging one "O"-tagged word sitting between two pieces of
    #           the same entity.
    # NOTE(review): the range stops 2 short because of the i+1/i+2 lookahead,
    # so the sentence's last two words are never copied into nwph — confirm
    # whether that tail-drop is intended.
    nwph = []     # merged phrases
    nwphltr = []  # one label per merged phrase
    flag = 0
    for i in range(len(phltr)-2):
        # Plain "O" word (and not bridging): copy through unchanged.
        if phltr[i]=="O" and flag != 3 :
            nwph += [ph[i]]
            nwphltr += [phltr[i]]
            continue
        # Bridged word: append to the phrase being built, then continue it.
        elif flag == 3:
            nwph[-1] += " "+ph[i]
            flag = 1
            continue
        # Start of a multi-word entity (next word continues it with "I-").
        elif phltr[i][2:]==phltr[i+1][2:] and phltr[i+1][0]=="I" and flag == 0:
            nwph += [ph[i]]
            nwphltr += [phltr[i]]
            flag = 1
            continue
        # Middle of a multi-word entity: keep appending.
        elif phltr[i][2:]==phltr[i+1][2:] and phltr[i+1][0]=="I" and flag == 1:
            nwph[-1] += " "+ph[i]
            continue

        # Entity resumes after a single "O" word two positions ahead:
        # start (flag 0) or extend (flag 1) the phrase and mark the bridge.
        elif phltr[i][2:]==phltr[i+2][2:] and phltr[i+1]=="O" and phltr[i+2][0]=="I" and flag == 0:
            nwph += [ph[i]]
            nwphltr += [phltr[i]]
            flag = 3
            continue
        elif phltr[i][2:]==phltr[i+2][2:] and phltr[i+1]=="O" and phltr[i+2][0]=="I" and flag == 1:
            nwph[-1] += " "+ph[i]
            flag = 3
            continue

        # Last word of an in-progress entity: close it out.
        elif flag == 1:
            nwph[-1] += " "+ph[i]
            flag = 0
            continue
        # Anything else: copy the word through as its own phrase.
        else :
            nwph += [ph[i]]
            nwphltr += [phltr[i]]
            continue

    # Emit a candidate sentence for each pair of differently-labelled
    # entities, skipping the CANCER/RECEPTOR combination. The count() guard
    # requires at least two non-"O" phrases in the sentence.
    if nwphltr.count("O") <= len(nwphltr)-2:
        for i in range(len(nwph)-1):
            if nwphltr[i] != "O":
                for j in range(i,len(nwph)):
                    if nwphltr[j] != "O" and nwphltr[j] != nwphltr[i] and {nwphltr[j], nwphltr[i]} != {"B-CANCER","B-RECEPTOR"}:
                        # Rebuild the sentence with <e1>/<e2> around the
                        # i-th and j-th phrases.
                        sen2ad = ""
                        for g in range(i):
                            sen2ad += nwph[g]+" "
                        sen2ad += "<e1>"+nwph[i]+"</e1> "

                        for t in range(i+1,j):
                            sen2ad += nwph[t]+" "
                        sen2ad += "<e2>"+nwph[j]+"</e2>"
                        # j < len(nwph) is always true here (j comes from
                        # range(i, len(nwph))); kept as-is.
                        if j<len(nwph):
                            for l in range(j+1,len(nwph)):
                                sen2ad += " "+nwph[l]
                        lstSentEnc += [sen2ad]
                        lstSentbilbl += [[nwphltr[i],nwphltr[j]]]
                        lstSentEnt += [[nwph[i],nwph[j]]]
|
|
|
|
|
|
|
|
|
|
|
# Display the <e1>/<e2>-annotated relation-candidate sentences built above.
if x:
    # BUG FIX: the original also ran `out = token_classifier(x)` here — a
    # second full model inference whose result was never read anywhere (the
    # entities were already extracted sentence-by-sentence above). The dead
    # call is removed.
    st.markdown(lstSentEnc)
|
|
|
|
|
|
|
|