# Spaces: dexay / EDC_IE
# Runtime error
# File size: 5,191 Bytes

import streamlit as st
import pandas as pd
import transformers
from transformers import  pipeline, TokenClassificationPipeline, BertForTokenClassification , AutoTokenizer , TextClassificationPipeline , AutoModelForSequenceClassification

# Text supplied by the user; everything below operates on this string.
x = st.text_area('enter')

# Tokenizer shared by the NER pipeline and the relation-extraction pipeline.
# NOTE(review): truncation/padding are kept exactly as before; they look like
# call-time options rather than constructor options -- confirm they have an effect.
tokenizer = AutoTokenizer.from_pretrained(
    "dmis-lab/biobert-large-cased-v1.1",
    truncation=True,
    padding=True,
    model_max_length=512,
)

# Token-classification (NER) checkpoint.
model_checkpoint = BertForTokenClassification.from_pretrained("dexay/Ner2HgF")

# Sequence-classification checkpoint used for relation extraction.
# Only the checkpoint name is passed here: truncation / padding /
# model_max_length are tokenizer options, and from_pretrained() hands keyword
# arguments it does not recognise to the model constructor, which rejects
# them with a TypeError.
model_re = AutoModelForSequenceClassification.from_pretrained("dexay/reDs3others")

# NER pipeline: yields one dict per token with (at least) "word" and "entity".
token_classifier = pipeline(
    "token-classification",
    tokenizer=tokenizer,
    model=model_checkpoint,
)


biotext = x


def _split_sentences(text):
    """Split *text* into sentences on '.', '?' and ':'.

    A terminator that appears after an opening parenthesis and before the next
    closing one does not end a sentence. Each sentence keeps its terminator
    and is stripped of surrounding whitespace. Text remaining after the last
    terminator is returned as a final sentence rather than being discarded, so
    input without closing punctuation is still processed.
    """
    sentences = []
    current = []
    in_parens = False
    for ch in text:
        current.append(ch)
        if ch == "(":
            in_parens = True
        elif ch == ")":
            in_parens = False
        elif ch in ".?:" and not in_parens:
            sentences.append("".join(current).strip())
            current = []
    # Flush whatever follows the last terminator (skip pure whitespace).
    tail = "".join(current).strip()
    if tail:
        sentences.append(tail)
    return sentences


# Split the document into sentences; NER runs on the list of sentences.
lstbiotext = _split_sentences(biotext)

ddata = lstbiotext

# With no sentences (e.g. an empty text box) there is nothing to tag, so the
# model call is skipped and downstream code sees an empty result list.
az = token_classifier(ddata) if ddata else []


# Convert the NER output into the input format expected by the RE model.

# tg_inorder decodes the label indices the model was fine-tuned on:
# position k in this list is the tag name for label index k.
tg_inorder = [
    "O",
    "B-HORMONE",
    "B-EXP_PER",
    "I-HORMONE",
    "I-CANCER",
    "I-EDC",
    "B-RECEPTOR",
    "B-CANCER",
    "I-RECEPTOR",
    "B-EDC",
    "PAD",
]

def _merge_wordpieces(tokens):
    """Rebuild whole words from one sentence of NER pipeline output.

    A token whose text starts with the "##" continuation prefix is glued onto
    the previous word (without the prefix); every other token starts a new
    word and contributes that word's raw label. Returns (words, raw_labels),
    two lists of equal length.
    """
    words = []
    raw_labels = []
    for tok in tokens:
        piece = tok["word"]
        if piece.startswith("##") and words:
            words[-1] += piece[2:]
        else:
            words.append(piece)
            raw_labels.append(tok["entity"])
    return words, raw_labels


def _decode_tags(raw_labels):
    """Translate raw pipeline labels into tg_inorder tag names.

    A 7-character label carries a one-digit index in its last character; any
    other label is read as carrying a two-digit index in its last two.
    """
    return [
        tg_inorder[int(lbl[-1] if len(lbl) == 7 else lbl[-2:])]
        for lbl in raw_labels
    ]


def _group_entities(words, tags):
    """Collapse the words of each entity mention into a single item.

    Walks *words*/*tags* left to right and returns (items, item_tags). A word
    tagged "O" becomes its own item. A word followed by an "I-" tag of the
    same entity type starts or extends a mention, and an "O" word sandwiched
    between two words of the same type (entity, "O", "I-"entity) is absorbed
    into that mention. A mention keeps the tag of its first word.

    The look-ahead is padded with "O", so the last two words of the sentence
    are processed as well instead of being dropped.
    """
    items = []
    item_tags = []
    # state: 0 = outside a mention, 1 = inside a mention,
    #        3 = the next word is a gap word to absorb into the open mention.
    state = 0
    count = len(tags)
    for i, (word, tag) in enumerate(zip(words, tags)):
        after = tags[i + 1] if i + 1 < count else "O"
        after2 = tags[i + 2] if i + 2 < count else "O"
        continues = tag[2:] == after[2:] and after[0] == "I"
        resumes_after_gap = (
            tag[2:] == after2[2:] and after == "O" and after2[0] == "I"
        )

        if tag == "O" and state != 3:
            items.append(word)
            item_tags.append(tag)
        elif state == 3:
            # Gap word between two parts of the same mention.
            items[-1] += " " + word
            state = 1
        elif continues and state == 0:
            items.append(word)
            item_tags.append(tag)
            state = 1
        elif continues and state == 1:
            items[-1] += " " + word
        elif resumes_after_gap and state == 0:
            items.append(word)
            item_tags.append(tag)
            state = 3
        elif resumes_after_gap and state == 1:
            items[-1] += " " + word
            state = 3
        elif state == 1:
            # Last word of the open mention.
            items[-1] += " " + word
            state = 0
        else:
            items.append(word)
            item_tags.append(tag)
    return items, item_tags


def _mark_entity_pairs(items, item_tags):
    """Build one marked sentence per ordered pair of differently-tagged entities.

    For every pair i < j of non-"O" items whose tags differ (the
    B-CANCER / B-RECEPTOR combination is skipped), the sentence is rebuilt
    with item i wrapped in <e1>...</e1> and item j in <e2>...</e2>.
    Returns a list of (sentence, [tag_i, tag_j], [item_i, item_j]) tuples.
    """
    pairs = []
    if item_tags.count("O") > len(item_tags) - 2:
        # Fewer than two non-"O" items: no pair can be formed.
        return pairs
    for i in range(len(items) - 1):
        if item_tags[i] == "O":
            continue
        for j in range(i + 1, len(items)):
            if item_tags[j] == "O" or item_tags[j] == item_tags[i]:
                continue
            if {item_tags[j], item_tags[i]} == {"B-CANCER", "B-RECEPTOR"}:
                continue
            marked = " ".join(
                items[:i]
                + ["<e1>" + items[i] + "</e1>"]
                + items[i + 1:j]
                + ["<e2>" + items[j] + "</e2>"]
                + items[j + 1:]
            )
            pairs.append(
                (marked, [item_tags[i], item_tags[j]], [items[i], items[j]])
            )
    return pairs


# Parallel lists, one entry per candidate entity pair:
lstSentEnc = []    # sentence with the pair marked by <e1>/<e2>
lstSentbilbl = []  # [tag of e1, tag of e2]
lstSentEnt = []    # [text of e1, text of e2]
for itsent in az:
    ph, phl = _merge_wordpieces(itsent)
    phltr = _decode_tags(phl)
    nwph, nwphltr = _group_entities(ph, phltr)
    for sen2ad, pair_tags, pair_entities in _mark_entity_pairs(nwph, nwphltr):
        lstSentEnc.append(sen2ad)
        lstSentbilbl.append(pair_tags)
        lstSentEnt.append(pair_entities)
      


#lstSentEnc,lstSentEnt,lstSentbilbl

# Relation extraction: classify every entity-pair sentence built above.
rrdata = lstSentEnc

# NOTE: this rebinds token_classifier, which until now held the NER pipeline,
# to a text-classification pipeline over the relation-extraction model.
token_classifier = pipeline(
    "text-classification",
    model=model_re,
    tokenizer=tokenizer,
)

outre = token_classifier(rrdata)


# Relation names indexed by the class index predicted by the RE model.
# The "(e1,e2)" / "(e2,e1)" suffix records the direction of the relation
# between the two marked entities; "other" has no suffix.
trLABELS = [
    "INCREASE_RISK(e1,e2)",
    "SPEED_UP(e2,e1)",
    "DECREASE_ACTIVITY(e1,e2)",
    "NO_ASSOCIATION(e1,e2)",
    "DECREASE(e1,e2)",
    "BLOCK(e1,e2)",
    "CAUSE(e1,e2)",
    "ACTIVATE(e2,e1)",
    "DEVELOP(e2,e1)",
    "ALTER(e1,e2)",
    "INCREASE_RISK(e2,e1)",
    "SPEED_UP(e1,e2)",
    "INTERFER(e1,e2)",
    "DECREASE(e2,e1)",
    "NO_ASSOCIATION(e2,e1)",
    "INCREASE(e2,e1)",
    "INTERFER(e2,e1)",
    "ACTIVATE(e1,e2)",
    "INCREASE(e1,e2)",
    "MIMIC(e1,e2)",
    "MIMIC(e2,e1)",
    "BLOCK(e2,e1)",
    "other",
    "BIND(e2,e1)",
    "INCREASE_ACTIVITY(e2,e1)",
    "ALTER(e2,e1)",
    "CAUSE(e2,e1)",
    "BIND(e1,e2)",
    "DEVELOP(e1,e2)",
    "DECREASE_ACTIVITY(e2,e1)",
]



# Decode each RE prediction into its relation name from trLABELS.
# A 7-character label carries a one-digit class index in its last character;
# any other label is read as carrying a two-digit index in its last two.
outrelbl = []
for prediction in outre:
    raw_label = prediction['label']
    if len(raw_label) == 7:
        index_text = raw_label[-1]
    else:
        index_text = raw_label[-2:]
    outrelbl.append(trLABELS[int(index_text)])

# A prediction whose name contains "(e2,e1)" has its entity pair (tags and
# text) swapped in place -- presumably so the first stored entity is always
# the relation's first argument.
for relation, pair_tags, pair_entities in zip(outrelbl, lstSentbilbl, lstSentEnt):
    if "(e2,e1)" in relation:
        pair_tags.reverse()
        pair_entities.reverse()


# One table row per predicted relation. "other" predictions are left out: the
# previous filter kept *only* "other", whose name becomes empty once the
# 7-character direction suffix is stripped, so every row had a blank relation.
edccan = []
for sentence, relation, pair_tags, pair_entities in zip(
    lstSentEnc, outrelbl, lstSentbilbl, lstSentEnt
):
    if relation == "other":
        continue
    relation_name = relation[:-7]  # drop the "(e1,e2)" / "(e2,e1)" suffix
    edccan.append([
        sentence,
        pair_entities[0],
        pair_entities[1],
        pair_tags[0] + " " + relation_name + " " + pair_tags[1],
    ])

edccandf = pd.DataFrame(edccan, columns=["Sentence", "Entity 1", "Entity 2", "Relation"])

# Show the table only when the user has entered text. The former extra call
# `out = token_classifier(x)` was removed: its result was never used and it
# ran the relation classifier on the raw, unsplit input text.
if x:
    st.table(edccandf)