import streamlit as st
from transformers import (
    pipeline,
    AutoTokenizer,
    BertForTokenClassification,
    AutoModelForSequenceClassification,
    TextClassificationPipeline,
)

x = st.text_area("Enter biomedical text:")
# model.to("cpu")

# Shared BioBERT tokenizer. model_max_length caps inputs at 512 tokens;
# truncation/padding are call-time arguments, not from_pretrained arguments,
# so they are applied where the tokenizer (or a pipeline) is actually called.
tokenizer = AutoTokenizer.from_pretrained(
    "dmis-lab/biobert-large-cased-v1.1", model_max_length=512
)

# NER model (entity tagging) and RE model (relation classification).
model_checkpoint = BertForTokenClassification.from_pretrained("dexay/Ner2HgF")
model_re = AutoModelForSequenceClassification.from_pretrained("dexay/reDs3others")

token_classifier = pipeline(
    "token-classification", tokenizer=tokenizer, model=model_checkpoint
)

biotext = x

# Split the document into sentences on ".", "?" or ":", but never inside
# parentheses (flag tracks whether we are between "(" and ")").
lstbiotext = []
flag = 0
tempsen = ""
for e in biotext:
    tempsen += e
    if e == "(":
        flag = 1
    if e == ")":
        flag = 0
    if (e == "." or e == "?" or e == ":") and flag == 0:
        lstbiotext += [tempsen.strip()]
        tempsen = ""

ddata = lstbiotext
# tokenized_dat = tokenize_function(ddata)
az = token_classifier(ddata)

# Code to convert NER output to an RE-input-compatible format.
# tg_inorder decodes the label ids the NER model was fine-tuned on.
tg_inorder = ['O', 'B-HORMONE', 'B-EXP_PER', 'I-HORMONE', 'I-CANCER', 'I-EDC',
              'B-RECEPTOR', 'B-CANCER', 'I-RECEPTOR', 'B-EDC', 'PAD']

lstSentEnc = []
lstSentbilbl = []
lstSentEnt = []
for itsent in az:
    sentaz = itsent

    # Merge WordPiece subwords ("##...") back into whole words, keeping one
    # label per merged word so ph and phl stay aligned.
    ph = []
    phl = []
    for e in sentaz:
        if e["word"][0] == "#" and len(ph) != 0:
            ph[-1] += e["word"][2:]
        else:
            ph += [e["word"]]
            phl += [e["entity"]]

    # Decode "LABEL_n" strings into BIO tags: a 7-char label ("LABEL_3")
    # carries a one-digit id, anything longer a two-digit id.
    phltr = []
    for e in phl:
        phltr += [tg_inorder[int(e[-1])] if len(e) == 7 else tg_inorder[int(e[-2:])]]

    # Group consecutive B-/I- tokens of the same type into multi-word
    # entities. flag == 1: currently inside an entity; flag == 3: an
    # "x O x" pattern, i.e. an entity interrupted by a single O token.
    # Because of the i+2 lookahead, the last two tokens are never emitted.
    nwph = []
    nwphltr = []
    flag = 0
    for i in range(len(phltr) - 2):
        if phltr[i] == "O" and flag != 3:
            nwph += [ph[i]]
            nwphltr += [phltr[i]]
            continue
        elif flag == 3:
            nwph[-1] += " " + ph[i]
            flag = 1
            continue
        elif phltr[i][2:] == phltr[i + 1][2:] and phltr[i + 1][0] == "I" and flag == 0:
            nwph += [ph[i]]
            nwphltr += [phltr[i]]
            flag = 1
            continue
        elif phltr[i][2:] == phltr[i + 1][2:] and phltr[i + 1][0] == "I" and flag == 1:
            nwph[-1] += " " + ph[i]
            continue
        # "x O x" with flag == 3: the same entity resumes after one O token
        elif phltr[i][2:] == phltr[i + 2][2:] and phltr[i + 1] == "O" and phltr[i + 2][0] == "I" and flag == 0:
            nwph += [ph[i]]
            nwphltr += [phltr[i]]
            flag = 3
            continue
        elif phltr[i][2:] == phltr[i + 2][2:] and phltr[i + 1] == "O" and phltr[i + 2][0] == "I" and flag == 1:
            nwph[-1] += " " + ph[i]
            flag = 3
            continue
        # end of "x O x"
        elif flag == 1:
            nwph[-1] += " " + ph[i]
            flag = 0
            continue
        else:
            nwph += [ph[i]]
            nwphltr += [phltr[i]]
            continue
    # nwph, nwphltr, len(nwph), len(nwphltr)

    # Keep only sentences with at least two entity mentions.
    if nwphltr.count("O") <= len(nwphltr) - 2:
        for i in range(len(nwph) - 1):
            if nwphltr[i] != "O":
                for j in range(i, len(nwph)):
                    # Pair two differently-typed entities, skipping the
                    # CANCER/RECEPTOR combination.
                    if nwphltr[j] != "O" and nwphltr[j] != nwphltr[i] and {nwphltr[j], nwphltr[i]} != {"B-CANCER", "B-RECEPTOR"}:
                        # Rebuild the sentence with the two candidate entities
                        # wrapped in marker tags. NOTE: the angle-bracket tags
                        # were stripped from the source (leaving bare ""+...+""
                        # concatenations); "<e1>"/"<e2>" is an assumed
                        # reconstruction of the usual RE marker format.
                        sen2ad = ""
                        for g in range(i):
                            sen2ad += nwph[g] + " "
                        sen2ad += "<e1>" + nwph[i] + "</e1> "
                        for t in range(i + 1, j):
                            sen2ad += nwph[t] + " "
                        sen2ad += "<e2>" + nwph[j] + "</e2>"
                        # NOTE: the source is truncated at "if j"; what follows
                        # is an assumed completion that appends the trailing
                        # words and records the encoded pair.
                        if j < len(nwph) - 1:
                            for l in range(j + 1, len(nwph)):
                                sen2ad += " " + nwph[l]
                        lstSentEnc += [sen2ad]
                        lstSentbilbl += [[nwphltr[i], nwphltr[j]]]
                        lstSentEnt += [[nwph[i], nwph[j]]]
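# ---------------------------------------------------------------------------
# The section ends before the RE step, but the loaded `model_re` and the
# imported TextClassificationPipeline suggest the marked sentences in
# lstSentEnc are scored for relations next. A minimal sketch of that step
# follows, assuming this pairing of model and tokenizer; the predicted label
# names come from the model's config, and the st.table display is an
# illustration, not the author's confirmed code.
re_classifier = TextClassificationPipeline(
    model=model_re, tokenizer=tokenizer, truncation=True, padding=True
)

if lstSentEnc:
    predictions = re_classifier(lstSentEnc)
    # Show each candidate entity pair alongside its predicted relation.
    st.table(
        [
            {
                "entity 1": pair[0],
                "entity 2": pair[1],
                "relation": pred["label"],
                "score": round(pred["score"], 3),
            }
            for pair, pred in zip(lstSentEnt, predictions)
        ]
    )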