Spaces:

dexay
/

EDC_IE

Runtime error

App Files Files Community

dexay committed on Jun 24, 2022

Commit

371b679

1 Parent(s): e27afc9

Update app.py

Browse files

Files changed (1) hide show

app.py +212 -210

app.py CHANGED Viewed

@@ -6,221 +6,223 @@ from transformers import  pipeline, TokenClassificationPipeline, BertForTokenCla
 st.header("Knowledge extraction on Endocrine disruptors")
 st.write("This tool lets you extract relation triples concerning interactions between: endocrine disrupting chemicals, hormones, receptors and cancers.")
 st.write("It is the result of an end of studies project within ESI school and dedicated to biomedical researchers looking to extract precise information about the subject without digging into long publications.")
-x = st.text_area('Entre you text on EDCs:')
-#model.to("cpu")
-tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-large-cased-v1.1", truncation = True, padding=True, model_max_length=512,)
-model_checkpoint = BertForTokenClassification.from_pretrained("dexay/Ner2HgF", )
-model_re = AutoModelForSequenceClassification.from_pretrained("dexay/reDs3others", )
-token_classifier = pipeline("token-classification", tokenizer = tokenizer,model=model_checkpoint,  )
-st.text("Knowledge extraction is in progress ...")
-if x and x[-1] not in ".?:":
-  x += "."
-biotext = x
-#split document or text into sentences
-lstbiotext = []
-flag = 0
-tempsen = ""
-for e in biotext:
-  tempsen += e
-  if e=="(":
-      flag = 1
-  if e==")":
-      flag = 0
-  if (e =="." or e =="?" or e ==":" ) and flag == 0 :
-      lstbiotext += [tempsen.strip()]
-      tempsen = ""
-ddata = lstbiotext
-#tokenized_dat = tokenize_function(ddata)
-az = token_classifier(ddata)
-#code to convert NER output to  RE input compatible format
-#tg_inorder are decoding of labels on which the model was fine tuned on
-tg_inorder = ['O',
- 'B-HORMONE',
- 'B-EXP_PER',
- 'I-HORMONE',
- 'I-CANCER',
- 'I-EDC',
- 'B-RECEPTOR',
- 'B-CANCER',
- 'I-RECEPTOR',
- 'B-EDC',
- 'PAD']
-lstSentEnc = []
-lstSentbilbl = []
-lstSentEnt = []
-for itsent in az:
-  sentaz = itsent
-  ph = []
-  phl = []
-  for e in sentaz:
-    if e["word"][0]=="#" and len(ph)!=0:
-      ph[-1]+= e["word"][2:]
-    else:
-      ph += [e["word"]]
-      phl += [e["entity"]]
-  phltr = []
-  for e in phl:
-    phltr += [tg_inorder[int(e[-1])] if len(e)==7 else  tg_inorder[int(e[-2:])]]
-  nwph = []
-  nwphltr = []
   flag = 0
-  for i in range(len(phltr)-2):
-    if phltr[i]=="O" and flag != 3 :
-      nwph += [ph[i]]
-      nwphltr += [phltr[i]]
-      continue
-    elif flag == 3:
-      nwph[-1] += " "+ph[i]
-      flag = 1
-      continue
-    elif phltr[i][2:]==phltr[i+1][2:] and phltr[i+1][0]=="I" and flag == 0:
-      nwph += [ph[i]]
-      nwphltr += [phltr[i]]
-      flag = 1
-      continue
-    elif phltr[i][2:]==phltr[i+1][2:] and phltr[i+1][0]=="I" and flag == 1:
-      nwph[-1] += " "+ph[i]
-      continue
-# xox with flag == 3
-    elif phltr[i][2:]==phltr[i+2][2:] and phltr[i+1]=="O" and phltr[i+2][0]=="I" and flag == 0:
-      nwph += [ph[i]]
-      nwphltr += [phltr[i]]
-      flag = 3
-      continue
-    elif phltr[i][2:]==phltr[i+2][2:] and phltr[i+1]=="O" and phltr[i+2][0]=="I" and flag == 1:
-      nwph[-1] += " "+ph[i]
-      flag = 3
-      continue
-#\ xox
-    elif flag == 1:
-      nwph[-1] += " "+ph[i]
-      flag = 0
-      continue
-    else :
-      nwph += [ph[i]]
-      nwphltr += [phltr[i]]
-      continue
-  # nwph,nwphltr,len(nwph),len(nwphltr)
-  if nwphltr.count("O") <= len(nwphltr)-2:
-    for i in range(len(nwph)-1):
-      if nwphltr[i] != "O":
-        for j in range(i,len(nwph)):
-          if nwphltr[j] != "O" and nwphltr[j] != nwphltr[i] and {nwphltr[j], nwphltr[i]} != {"B-CANCER","B-RECEPTOR"}:
-            sen2ad = ""
-            for g in range(i):
-              sen2ad += nwph[g]+" "
-            sen2ad += "<e1>"+nwph[i]+"</e1> "
-            for t in range(i+1,j):
-              sen2ad += nwph[t]+" "
-            sen2ad += "<e2>"+nwph[j]+"</e2>"
-            if j<len(nwph):
-              for l in range(j+1,len(nwph)):
-                sen2ad += " "+nwph[l]
-            lstSentEnc += [sen2ad]
-            lstSentbilbl += [[nwphltr[i],nwphltr[j]]]
-            lstSentEnt += [[nwph[i],nwph[j]]]
-#lstSentEnc,lstSentEnt,lstSentbilbl
-st.text("Entities detected, Next: Relation detection ...")
-# Relation extraction part
-token_classifier = pipeline("text-classification", tokenizer = tokenizer,model=model_re,
-)
-rrdata = lstSentEnc
-outre = token_classifier(rrdata)
-trLABELS = ['INCREASE_RISK(e1,e2)',
- 'SPEED_UP(e2,e1)',
- 'DECREASE_ACTIVITY(e1,e2)',
- 'NO_ASSOCIATION(e1,e2)',
- 'DECREASE(e1,e2)',
- 'BLOCK(e1,e2)',
- 'CAUSE(e1,e2)',
- 'ACTIVATE(e2,e1)',
- 'DEVELOP(e2,e1)',
- 'ALTER(e1,e2)',
- 'INCREASE_RISK(e2,e1)',
- 'SPEED_UP(e1,e2)',
- 'INTERFER(e1,e2)',
- 'DECREASE(e2,e1)',
- 'NO_ASSOCIATION(e2,e1)',
- 'INCREASE(e2,e1)',
- 'INTERFER(e2,e1)',
- 'ACTIVATE(e1,e2)',
- 'INCREASE(e1,e2)',
- 'MIMIC(e1,e2)',
- 'MIMIC(e2,e1)',
- 'BLOCK(e2,e1)',
- 'other',
- 'BIND(e2,e1)',
- 'INCREASE_ACTIVITY(e2,e1)',
- 'ALTER(e2,e1)',
- 'CAUSE(e2,e1)',
- 'BIND(e1,e2)',
- 'DEVELOP(e1,e2)',
- 'DECREASE_ACTIVITY(e2,e1)']
-outrelbl = []
-for e in outre:
-  outrelbl += [trLABELS[int(e['label'][-1])] if len(e["label"])==7 else trLABELS[int(e['label'][-2:])] ]
-for i in range(len(outrelbl)):
-  if "(e2,e1)" in outrelbl[i]:
-    lstSentbilbl[i][0],lstSentbilbl[i][1] = lstSentbilbl[i][1],lstSentbilbl[i][0]
-    lstSentEnt[i][0],lstSentEnt[i][1] = lstSentEnt[i][1],lstSentEnt[i][0]
-edccan = []
-for i in range(len(outrelbl)):
-  if outrelbl[i] != "other":
-    edccan += [[lstSentEnc[i],lstSentEnt[i][0], lstSentEnt[i][1],lstSentbilbl[i][0]+" "+outrelbl[i][:-7]+" "+lstSentbilbl[i][1]]]
-edccandf = pd.DataFrame(edccan, columns= ["Sentence", "Entity 1", "Entity 2", "Relation"] )
-if x:
-  out = token_classifier(x)
   st.table(edccandf)

 st.header("Knowledge extraction on Endocrine disruptors")
 st.write("This tool lets you extract relation triples concerning interactions between: endocrine disrupting chemicals, hormones, receptors and cancers.")
 st.write("It is the result of an end of studies project within ESI school and dedicated to biomedical researchers looking to extract precise information about the subject without digging into long publications.")
+form = st.form(key='my-form')
+x = form.text_input('Enter your text')
+submit = form.form_submit_button('Submit')
+if submit and len(x) != 0:
+  #model.to("cpu")
+  tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-large-cased-v1.1", truncation = True, padding=True, model_max_length=512,)
+  model_checkpoint = BertForTokenClassification.from_pretrained("dexay/Ner2HgF", )
+  model_re = AutoModelForSequenceClassification.from_pretrained("dexay/reDs3others", )
+  token_classifier = pipeline("token-classification", tokenizer = tokenizer,model=model_checkpoint,  )
+  st.text("Knowledge extraction is in progress ...")
+  if x[-1] not in ".?:":
+    x += "."
+  biotext = x
+  #split document or text into sentences
+  lstbiotext = []
   flag = 0
+  tempsen = ""
+  for e in biotext:
+    tempsen += e
+    if e=="(":
+        flag = 1
+    if e==")":
+        flag = 0
+    if (e =="." or e =="?" or e ==":" ) and flag == 0 :
+        lstbiotext += [tempsen.strip()]
+        tempsen = ""
+  ddata = lstbiotext
+  #tokenized_dat = tokenize_function(ddata)
+  az = token_classifier(ddata)
+  #code to convert NER output to  RE input compatible format
+  #tg_inorder are decoding of labels on which the model was fine tuned on
+  tg_inorder = ['O',
+   'B-HORMONE',
+   'B-EXP_PER',
+   'I-HORMONE',
+   'I-CANCER',
+   'I-EDC',
+   'B-RECEPTOR',
+   'B-CANCER',
+   'I-RECEPTOR',
+   'B-EDC',
+   'PAD']
+  lstSentEnc = []
+  lstSentbilbl = []
+  lstSentEnt = []
+  for itsent in az:
+    sentaz = itsent
+    ph = []
+    phl = []
+    for e in sentaz:
+      if e["word"][0]=="#" and len(ph)!=0:
+        ph[-1]+= e["word"][2:]
+      else:
+        ph += [e["word"]]
+        phl += [e["entity"]]
+    phltr = []
+    for e in phl:
+      phltr += [tg_inorder[int(e[-1])] if len(e)==7 else  tg_inorder[int(e[-2:])]]
+    nwph = []
+    nwphltr = []
+    flag = 0
+    for i in range(len(phltr)-2):
+      if phltr[i]=="O" and flag != 3 :
+        nwph += [ph[i]]
+        nwphltr += [phltr[i]]
+        continue
+      elif flag == 3:
+        nwph[-1] += " "+ph[i]
+        flag = 1
+        continue
+      elif phltr[i][2:]==phltr[i+1][2:] and phltr[i+1][0]=="I" and flag == 0:
+        nwph += [ph[i]]
+        nwphltr += [phltr[i]]
+        flag = 1
+        continue
+      elif phltr[i][2:]==phltr[i+1][2:] and phltr[i+1][0]=="I" and flag == 1:
+        nwph[-1] += " "+ph[i]
+        continue
+  # xox with flag == 3
+      elif phltr[i][2:]==phltr[i+2][2:] and phltr[i+1]=="O" and phltr[i+2][0]=="I" and flag == 0:
+        nwph += [ph[i]]
+        nwphltr += [phltr[i]]
+        flag = 3
+        continue
+      elif phltr[i][2:]==phltr[i+2][2:] and phltr[i+1]=="O" and phltr[i+2][0]=="I" and flag == 1:
+        nwph[-1] += " "+ph[i]
+        flag = 3
+        continue
+  #\ xox
+      elif flag == 1:
+        nwph[-1] += " "+ph[i]
+        flag = 0
+        continue
+      else :
+        nwph += [ph[i]]
+        nwphltr += [phltr[i]]
+        continue
+    # nwph,nwphltr,len(nwph),len(nwphltr)
+    if nwphltr.count("O") <= len(nwphltr)-2:
+      for i in range(len(nwph)-1):
+        if nwphltr[i] != "O":
+          for j in range(i,len(nwph)):
+            if nwphltr[j] != "O" and nwphltr[j] != nwphltr[i] and {nwphltr[j], nwphltr[i]} != {"B-CANCER","B-RECEPTOR"}:
+              sen2ad = ""
+              for g in range(i):
+                sen2ad += nwph[g]+" "
+              sen2ad += "<e1>"+nwph[i]+"</e1> "
+              for t in range(i+1,j):
+                sen2ad += nwph[t]+" "
+              sen2ad += "<e2>"+nwph[j]+"</e2>"
+              if j<len(nwph):
+                for l in range(j+1,len(nwph)):
+                  sen2ad += " "+nwph[l]
+              lstSentEnc += [sen2ad]
+              lstSentbilbl += [[nwphltr[i],nwphltr[j]]]
+              lstSentEnt += [[nwph[i],nwph[j]]]
+  #lstSentEnc,lstSentEnt,lstSentbilbl
+  st.text("Entities detected, Next: Relation detection ...")
+  # Relation extraction part
+  token_classifier = pipeline("text-classification", tokenizer = tokenizer,model=model_re,
+  )
+  rrdata = lstSentEnc
+  outre = token_classifier(rrdata)
+  trLABELS = ['INCREASE_RISK(e1,e2)',
+   'SPEED_UP(e2,e1)',
+   'DECREASE_ACTIVITY(e1,e2)',
+   'NO_ASSOCIATION(e1,e2)',
+   'DECREASE(e1,e2)',
+   'BLOCK(e1,e2)',
+   'CAUSE(e1,e2)',
+   'ACTIVATE(e2,e1)',
+   'DEVELOP(e2,e1)',
+   'ALTER(e1,e2)',
+   'INCREASE_RISK(e2,e1)',
+   'SPEED_UP(e1,e2)',
+   'INTERFER(e1,e2)',
+   'DECREASE(e2,e1)',
+   'NO_ASSOCIATION(e2,e1)',
+   'INCREASE(e2,e1)',
+   'INTERFER(e2,e1)',
+   'ACTIVATE(e1,e2)',
+   'INCREASE(e1,e2)',
+   'MIMIC(e1,e2)',
+   'MIMIC(e2,e1)',
+   'BLOCK(e2,e1)',
+   'other',
+   'BIND(e2,e1)',
+   'INCREASE_ACTIVITY(e2,e1)',
+   'ALTER(e2,e1)',
+   'CAUSE(e2,e1)',
+   'BIND(e1,e2)',
+   'DEVELOP(e1,e2)',
+   'DECREASE_ACTIVITY(e2,e1)']
+  outrelbl = []
+  for e in outre:
+    outrelbl += [trLABELS[int(e['label'][-1])] if len(e["label"])==7 else trLABELS[int(e['label'][-2:])] ]
+  for i in range(len(outrelbl)):
+    if "(e2,e1)" in outrelbl[i]:
+      lstSentbilbl[i][0],lstSentbilbl[i][1] = lstSentbilbl[i][1],lstSentbilbl[i][0]
+      lstSentEnt[i][0],lstSentEnt[i][1] = lstSentEnt[i][1],lstSentEnt[i][0]
+  edccan = []
+  for i in range(len(outrelbl)):
+    if outrelbl[i] != "other":
+      edccan += [[lstSentEnc[i],lstSentEnt[i][0], lstSentEnt[i][1],lstSentbilbl[i][0]+" "+outrelbl[i][:-7]+" "+lstSentbilbl[i][1]]]
+  edccandf = pd.DataFrame(edccan, columns= ["Sentence", "Entity 1", "Entity 2", "Relation"] )
   st.table(edccandf)