|
import streamlit as st |
|
import transformers |
|
from transformers import pipeline, TokenClassificationPipeline, BertForTokenClassification , AutoTokenizer |
|
|
|
# Free-text input box: the biomedical text to run NER over.
x = st.text_area('enter')
|
|
|
|
|
# Biomedical BERT tokenizer + NER model fine-tuned for the tag set listed in
# tg_inorder below, wired into a token-classification pipeline.
# NOTE(review): `truncation`/`padding` passed to from_pretrained are stored as
# init kwargs, not applied at tokenization time — inputs longer than
# model_max_length are presumably not actually truncated by the pipeline;
# confirm, and pass them at call time if needed.
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-large-cased-v1.1", truncation = True, padding=True, model_max_length=512,)

model_checkpoint = BertForTokenClassification.from_pretrained("dexay/Ner2HgF", )

# NOTE(review): Streamlit re-runs this script on every widget interaction, so
# the tokenizer and model are re-loaded each time; consider caching the
# pipeline (e.g. st.cache_resource) if the installed Streamlit supports it.
token_classifier = pipeline("token-classification", tokenizer = tokenizer,model=model_checkpoint, )
|
|
|
|
|
biotext = x |
|
|
|
|
|
|
|
lstbiotext = [] |
|
|
|
flag = 0 |
|
tempsen = "" |
|
for e in biotext: |
|
tempsen += e |
|
if e=="(": |
|
flag = 1 |
|
if e==")": |
|
flag = 0 |
|
if (e =="." or e =="?" or e ==":" ) and flag == 0 : |
|
lstbiotext += [tempsen.strip()] |
|
tempsen = "" |
|
|
|
ddata = lstbiotext |
|
|
|
|
|
|
|
# Run NER over all sentence chunks in one batch. `az` is one result per
# sentence: a list of token dicts carrying "word" and "entity" keys (the only
# keys read below).
# NOTE(review): if the input produced no sentences, ddata is [] — confirm the
# pipeline tolerates an empty batch.
az = token_classifier(ddata)
|
|
|
|
|
|
|
|
|
|
|
|
|
# Tag names indexed by the model's numeric label id ("LABEL_0" ->
# tg_inorder[0], ...). Used below to decode the pipeline's "entity" strings.
# NOTE(review): this ordering must match the id2label config of the
# dexay/Ner2HgF checkpoint — verify against the model config.
tg_inorder = ['I-RECEPTOR',
              'O',
              'B-RECEPTOR',
              'B-EDC',
              'I-EXP_PER',
              'B-EXP_PER',
              'I-CANCER',
              'I-EDC',
              'B-HORMONE',
              'I-HORMONE',
              'B-QUANTITY',
              'B-EXP_DUR',
              'I-QUANTITY',
              'B-CANCER',
              'PAD']
|
|
|
# --- Pair up detected entities into relation-candidate sentences ------------
# For each sentence's token-classification output:
#   1. merge WordPiece sub-tokens ("##xyz") back into whole words,
#   2. decode numeric "LABEL_n" tags via tg_inorder,
#   3. merge multi-word entities into single phrases,
#   4. emit one copy of the sentence per entity pair, wrapping the first
#      entity in <e1>...</e1> and the second in <e2>...</e2>.
lstSentEnc = []    # sentences annotated with <e1>/<e2> markers
lstSentbilbl = []  # [label_of_e1, label_of_e2] per emitted sentence
lstSentEnt = []    # [text_of_e1, text_of_e2] per emitted sentence
for itsent in az:

    sentaz = itsent
    ph = []   # reconstructed whole words
    phl = []  # raw entity label per word (label of the word's first piece)
    for e in sentaz:
        # "##"-prefixed pieces are WordPiece continuations: glue them onto
        # the previous word; only the first piece's label is kept, which
        # keeps ph and phl index-aligned.
        if e["word"][0]=="#" and len(ph)!=0:
            ph[-1]+= e["word"][2:]
        else:
            ph += [e["word"]]
            phl += [e["entity"]]

    # Decode "LABEL_<id>" strings to tag names. len==7 means a single-digit
    # id ("LABEL_3" -> last char); otherwise the id is the last two chars.
    phltr = []
    for e in phl:
        phltr += [tg_inorder[int(e[-1])] if len(e)==7 else tg_inorder[int(e[-2:])]]

    # Merge consecutive words of one entity into a single phrase.
    # flag: 0 = outside an entity, 1 = inside a multi-word entity,
    #       3 = bridging one "O"-tagged word sitting between two pieces of
    #           the same entity.
    # NOTE(review): the range stops 2 short because of the i+1/i+2 lookahead,
    # so the sentence's last two words are never copied into nwph — confirm
    # whether that tail-drop is intended.
    nwph = []     # merged phrases
    nwphltr = []  # one label per merged phrase
    flag = 0
    for i in range(len(phltr)-2):
        # Plain "O" word (and not bridging): copy through unchanged.
        if phltr[i]=="O" and flag != 3 :
            nwph += [ph[i]]
            nwphltr += [phltr[i]]
            continue
        # Bridged word: append to the phrase being built, then continue it.
        elif flag == 3:
            nwph[-1] += " "+ph[i]
            flag = 1
            continue
        # Start of a multi-word entity (next word continues it with "I-").
        elif phltr[i][2:]==phltr[i+1][2:] and phltr[i+1][0]=="I" and flag == 0:
            nwph += [ph[i]]
            nwphltr += [phltr[i]]
            flag = 1
            continue
        # Middle of a multi-word entity: keep appending.
        elif phltr[i][2:]==phltr[i+1][2:] and phltr[i+1][0]=="I" and flag == 1:
            nwph[-1] += " "+ph[i]
            continue

        # Entity resumes after a single "O" word two positions ahead:
        # start (flag 0) or extend (flag 1) the phrase and mark the bridge.
        elif phltr[i][2:]==phltr[i+2][2:] and phltr[i+1]=="O" and phltr[i+2][0]=="I" and flag == 0:
            nwph += [ph[i]]
            nwphltr += [phltr[i]]
            flag = 3
            continue
        elif phltr[i][2:]==phltr[i+2][2:] and phltr[i+1]=="O" and phltr[i+2][0]=="I" and flag == 1:
            nwph[-1] += " "+ph[i]
            flag = 3
            continue

        # Last word of an in-progress entity: close it out.
        elif flag == 1:
            nwph[-1] += " "+ph[i]
            flag = 0
            continue
        # Anything else: copy the word through as its own phrase.
        else :
            nwph += [ph[i]]
            nwphltr += [phltr[i]]
            continue

    # Emit a candidate sentence for each pair of differently-labelled
    # entities, skipping the CANCER/RECEPTOR combination. The count() guard
    # requires at least two non-"O" phrases in the sentence.
    if nwphltr.count("O") <= len(nwphltr)-2:
        for i in range(len(nwph)-1):
            if nwphltr[i] != "O":
                for j in range(i,len(nwph)):
                    if nwphltr[j] != "O" and nwphltr[j] != nwphltr[i] and {nwphltr[j], nwphltr[i]} != {"B-CANCER","B-RECEPTOR"}:
                        # Rebuild the sentence with <e1>/<e2> around the
                        # i-th and j-th phrases.
                        sen2ad = ""
                        for g in range(i):
                            sen2ad += nwph[g]+" "
                        sen2ad += "<e1>"+nwph[i]+"</e1> "

                        for t in range(i+1,j):
                            sen2ad += nwph[t]+" "
                        sen2ad += "<e2>"+nwph[j]+"</e2>"
                        # j < len(nwph) is always true here (j comes from
                        # range(i, len(nwph))); kept as-is.
                        if j<len(nwph):
                            for l in range(j+1,len(nwph)):
                                sen2ad += " "+nwph[l]
                        lstSentEnc += [sen2ad]
                        lstSentbilbl += [[nwphltr[i],nwphltr[j]]]
                        lstSentEnt += [[nwph[i],nwph[j]]]
|
|
|
|
|
|
|
|
|
|
|
# Display the <e1>/<e2>-annotated relation-candidate sentences built above.
if x:
    # BUG FIX: the original also ran `out = token_classifier(x)` here — a
    # second full model inference whose result was never read anywhere (the
    # entities were already extracted sentence-by-sentence above). The dead
    # call is removed.
    st.markdown(lstSentEnc)
|
|
|
|
|
|
|
|