File size: 4,271 Bytes
adf4ac7
 
062527a
adf4ac7
58ef0b0
df89742
 
58ef0b0
50e8f91
dcb64e4
e010ab9
 
 
dcb64e4
 
 
 
b8f9703
dcb64e4
 
3cf2a36
 
 
58ef0b0
 
3cf2a36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83f7a4f
 
3cf2a36
 
83f7a4f
3cf2a36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83f7a4f
3cf2a36
 
83f7a4f
3cf2a36
adf4ac7
96898b2
 
 
 
 
 
e010ab9
dcb64e4
44854e8
08b8ceb
e010ab9
dcb64e4
 
 
 
3cf2a36
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import streamlit as st
from transformers import pipeline
from ipymarkup import format_span_box_markup

# Streamlit demo comparing two token-classification (NER) checkpoints on the
# same input text: the fine-tuned MendoBERT model and the base model.

# Tokenizer shared by both checkpoints, and the on-disk location of each model.
TOKENIZER_NAME = "indolem/indobert-base-uncased"
MENDOBERT_PATH = "/home/user/app/mendobert/"
BASE_MODEL_PATH = "/home/user/app/base-model/"

# One button is rendered per example; clicking it copies the text into the
# input box.
EXAMPLE_TEXTS = (
    'Aspartylglucosaminuria (AGU) adalah gangguan metabolisme glikoprotein langka.',
    'Mutasi germ - line dari gen BRCA1 membuat wanita cenderung mengalami kanker payudara dini dengan mengorbankan fungsi presumtif gen sebagai penekan tumor.',
)
DEFAULT_TEXT = 'Enter your texts here...'
TEXT_AREA_KEY = "ner_input_text"

# Marker the tokenizer puts in front of a word piece that continues the
# previous token.
SUBWORD_PREFIX = "##"

# Raw classifier labels mapped to the short tags shown in the UI.
OUTSIDE_TAG = "O"
LABEL_TO_TAG = {"LABEL_0": OUTSIDE_TAG, "LABEL_1": "B", "LABEL_2": "I"}


def merge_wordpieces(tokens):
    """Merge ``##`` continuation pieces into the word they belong to.

    Args:
        tokens: Iterable of per-token prediction dicts carrying ``word``,
            ``start``, ``entity``, ``index`` and ``score``, and optionally
            ``end``.

    Returns:
        A new list with one dict per whole word. A merged word keeps the
        ``entity``, ``index`` and ``score`` of its first piece; ``end`` and
        ``word`` are extended by every continuation piece. The input dicts
        are not modified.
    """
    words = []
    for token in tokens:
        piece = token["word"]
        continues_previous = piece.startswith(SUBWORD_PREFIX)
        surface = piece.replace(SUBWORD_PREFIX, "")

        # Prefer the offset reported with the prediction: the token string can
        # differ in length from the source text (e.g. "[UNK]"). Fall back to
        # start + length only when no end offset is supplied.
        end = token.get("end")
        if end is None:
            end = token["start"] + len(surface)

        if continues_previous and words:
            words[-1]["end"] = end
            words[-1]["word"] += surface
        else:
            # A continuation piece with nothing before it is treated as a new
            # word instead of indexing into an empty list.
            words.append({
                "start": token["start"],
                "end": end,
                "entity": token["entity"],
                "index": token["index"],
                "score": token["score"],
                "word": surface if continues_previous else piece,
            })
    return words


def extract_entities(text, tokens):
    """Collect the words tagged as part of an entity.

    Args:
        text: The string the predictions refer to; offsets index into it.
        tokens: Per-token prediction dicts, as accepted by
            ``merge_wordpieces``.

    Returns:
        A ``(spans, descriptions)`` pair. ``spans`` is a list of
        ``(start, end, tag)`` tuples and ``descriptions`` the matching
        human-readable strings. Words tagged ``O`` are omitted; labels missing
        from ``LABEL_TO_TAG`` are passed through unchanged.
    """
    spans = []
    descriptions = []
    for word in merge_wordpieces(tokens):
        tag = LABEL_TO_TAG.get(word["entity"], word["entity"])
        if tag == OUTSIDE_TAG:
            continue
        start, end = word["start"], word["end"]
        spans.append((start, end, tag))
        descriptions.append(
            f"Entity: {tag}, Start:{start}, End:{end}, word:{text[start:end]}"
        )
    return spans, descriptions


def load_ner_pipeline(model_path):
    """Build the NER pipeline for the checkpoint stored at ``model_path``."""
    return pipeline("ner", model=model_path, tokenizer=TOKENIZER_NAME)


def render_predictions(title, text, tokens):
    """Render one model's section: heading, entity list and highlighted text."""
    spans, descriptions = extract_entities(text, tokens)
    st.divider()
    st.subheader(title)
    st.json(descriptions)
    st.markdown(''.join(format_span_box_markup(text, spans)), unsafe_allow_html=True)


def main():
    """Draw the page and run both models on the current input text."""
    # Load the pre-trained NER model
    # Streamlit re-executes this script on every interaction; caching the
    # loader keeps both models in memory instead of rebuilding them each time.
    cached_loader = st.cache_resource(load_ner_pipeline)
    model = cached_loader(MENDOBERT_PATH)
    basemodel = cached_loader(BASE_MODEL_PATH)

    st.title(':blue[MendoBERT] - Named Entity Recognition Model :sunglasses:')

    # The input box is a single keyed widget backed by session state, so text
    # typed by the user survives reruns. The example buttons write into that
    # state before the widget is created in this run.
    if TEXT_AREA_KEY not in st.session_state:
        st.session_state[TEXT_AREA_KEY] = DEFAULT_TEXT
    with st.container():
        for example in EXAMPLE_TEXTS:
            if st.button(example, use_container_width=True):
                st.session_state[TEXT_AREA_KEY] = example
    text = st.text_area('Enter some text: ', key=TEXT_AREA_KEY)

    if text:
        render_predictions('MendoBERT', text, model(text))
        render_predictions('IndoLEM', text, basemodel(text))


if __name__ == "__main__":
    main()