Update pages/type_text.py
Browse files- pages/type_text.py +33 -30
pages/type_text.py
CHANGED
@@ -9,7 +9,7 @@ from sentence_transformers import SentenceTransformer, util
|
|
9 |
import os
|
10 |
os.getenv("HF_TOKEN")
|
11 |
|
12 |
-
st.header("
|
13 |
st.subheader("Select specific Chapter for quicker results")
|
14 |
#df_chapters = pd.read_csv("SBS_V2_0/Chapter_Index_Rows.csv")
|
15 |
df_chapters = pd.read_csv("SBS_V2_0/Chapter_Index_Rows_with_total.csv")
|
@@ -105,7 +105,7 @@ numMAPPINGS_input = 5
|
|
105 |
#st.button("Clear text", on_click=on_click)
|
106 |
|
107 |
|
108 |
-
## Define the
|
109 |
st_models = {
|
110 |
'original model for general domain, fastest: all-MiniLM-L6-v2': 'all-MiniLM-L6-v2',
|
111 |
'fine-tuned model for medical domain: all-MiniLM-L6-v2': 'all-MiniLM-L6-v2',
|
@@ -119,34 +119,13 @@ st_models = {
|
|
119 |
#model = SentenceTransformer('sentence-transformers/msmarco-bert-base-dot-v5')
|
120 |
#model = SentenceTransformer('clips/mfaq')
|
121 |
|
122 |
-
## Create the select box
|
123 |
selected_st_model = st.selectbox('Choose a Sentence Transformer model:', list(st_models.keys()))
|
124 |
st.write("Current selection:", selected_st_model)
|
125 |
|
126 |
-
## Get the selected model
|
127 |
SentTrans_model = st_models[selected_st_model]
|
128 |
-
|
129 |
-
## Use the model...
|
130 |
-
@st.cache_resource
|
131 |
-
def load_model():
|
132 |
-
model = SentenceTransformer(SentTrans_model)
|
133 |
-
return model
|
134 |
-
model = load_model()
|
135 |
|
136 |
-
#mapSBS_button = st.button("Map to SBS codes", on_click=on_click, key="user_clickedSBS")
|
137 |
-
|
138 |
-
INTdesc_embedding = model.encode(INTdesc_input)
|
139 |
-
|
140 |
-
# Semantic search, Compute cosine similarity between INTdesc_embedding and SBS descriptions
|
141 |
-
|
142 |
-
SBScorpus_embeddings = model.encode(SBScorpus)
|
143 |
-
|
144 |
-
#if len(chapter_rows_indexes_list) >1:
|
145 |
-
if INTdesc_input is not None:
|
146 |
-
#my_model_results = pipeline("ner", model= "checkpoint-92")
|
147 |
-
HF_model_results = util.semantic_search(INTdesc_embedding, SBScorpus_embeddings)
|
148 |
-
HF_model_results_sorted = sorted(HF_model_results, key=lambda x: x[1], reverse=True)
|
149 |
-
HF_model_results_displayed = HF_model_results_sorted[0:numMAPPINGS_input]
|
150 |
|
151 |
## Define the Reasoning models
|
152 |
rs_models = {
|
@@ -156,25 +135,49 @@ rs_models = {
|
|
156 |
'fine-tuned model for medical domain: Qwen/Qwen2-1.5B-Instruct': 'Qwen/Qwen2-1.5B-Instruct',
|
157 |
}
|
158 |
|
159 |
-
## Create the select box
|
160 |
selected_rs_model = st.selectbox('Choose a Reasoning model:', list(rs_models.keys()))
|
161 |
st.write("Current selection:", selected_rs_model)
|
162 |
|
163 |
-
## Get the selected model
|
164 |
Reasoning_model = rs_models[selected_rs_model]
|
165 |
-
|
166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
167 |
@st.cache_resource
|
168 |
def load_pipe():
|
169 |
pipe = pipeline("text-generation", model=Reasoning_model, device_map=device,) # device_map="auto", torch_dtype=torch.bfloat16
|
170 |
return pipe
|
171 |
pipe = load_pipe()
|
172 |
|
173 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
174 |
|
175 |
dictA = {"Score": [], "SBS Code": [], "SBS Description V2.0": []}
|
176 |
dfALL = pd.DataFrame.from_dict(dictA)
|
177 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
178 |
if INTdesc_input is not None and mapSBS_button == True:
|
179 |
for i, result in enumerate(HF_model_results_displayed):
|
180 |
dictA.update({"Score": "%.4f" % result[0]["score"], "SBS Code": df_SBS.loc[df_SBS["Long_Description"] == SBScorpus[result[0]["corpus_id"]],"SBS_Code_Hyphenated"].values[0], "SBS Description V2.0": SBScorpus[result[0]["corpus_id"]]})
|
|
|
9 |
import os
|
10 |
os.getenv("HF_TOKEN")
|
11 |
|
12 |
+
st.header("Map internal descriptions to SBS codes using Sentence Transformer + Reasoning Models")
|
13 |
st.subheader("Select specific Chapter for quicker results")
|
14 |
#df_chapters = pd.read_csv("SBS_V2_0/Chapter_Index_Rows.csv")
|
15 |
df_chapters = pd.read_csv("SBS_V2_0/Chapter_Index_Rows_with_total.csv")
|
|
|
105 |
#st.button("Clear text", on_click=on_click)
|
106 |
|
107 |
|
108 |
+
## Define the Sentence Transformer models
|
109 |
st_models = {
|
110 |
'original model for general domain, fastest: all-MiniLM-L6-v2': 'all-MiniLM-L6-v2',
|
111 |
'fine-tuned model for medical domain: all-MiniLM-L6-v2': 'all-MiniLM-L6-v2',
|
|
|
119 |
#model = SentenceTransformer('sentence-transformers/msmarco-bert-base-dot-v5')
|
120 |
#model = SentenceTransformer('clips/mfaq')
|
121 |
|
122 |
+
## Create the select Sentence Transformer box
|
123 |
selected_st_model = st.selectbox('Choose a Sentence Transformer model:', list(st_models.keys()))
|
124 |
st.write("Current selection:", selected_st_model)
|
125 |
|
126 |
+
## Get the selected SentTrans model
|
127 |
SentTrans_model = st_models[selected_st_model]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
128 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
129 |
|
130 |
## Define the Reasoning models
|
131 |
rs_models = {
|
|
|
135 |
'fine-tuned model for medical domain: Qwen/Qwen2-1.5B-Instruct': 'Qwen/Qwen2-1.5B-Instruct',
|
136 |
}
|
137 |
|
138 |
+
## Create the select Reasoning box
|
139 |
selected_rs_model = st.selectbox('Choose a Reasoning model:', list(rs_models.keys()))
|
140 |
st.write("Current selection:", selected_rs_model)
|
141 |
|
142 |
+
## Get the selected Reasoning model
|
143 |
Reasoning_model = rs_models[selected_rs_model]
|
144 |
+
|
145 |
+
|
146 |
+
## Load the Sentence Transformer model ...
|
147 |
+
@st.cache_resource
def load_model(model_name=None):
    """Load a SentenceTransformer and cache it per model name.

    ``st.cache_resource`` builds its cache key from the function's
    arguments. The previous zero-argument version read the global
    ``SentTrans_model`` inside the body, so the first model ever loaded
    was returned for every later selection. Passing the name as an
    argument makes each selected model its own cache entry.

    Args:
        model_name: Model identifier handed to ``SentenceTransformer``.
            When omitted, falls back to the module-level
            ``SentTrans_model`` (kept so existing zero-argument calls
            still work; such calls all share one cache entry).

    Returns:
        The loaded ``SentenceTransformer`` instance.
    """
    if model_name is None:
        model_name = SentTrans_model
    return SentenceTransformer(model_name)


# Pass the current selection explicitly so it takes part in the cache key.
model = load_model(SentTrans_model)
|
152 |
+
|
153 |
+
## Load the Reasoning model as pipeline ...
|
154 |
@st.cache_resource
def load_pipe(model_name=None):
    """Build a text-generation pipeline and cache it per model name.

    ``st.cache_resource`` builds its cache key from the function's
    arguments. The previous zero-argument version read the global
    ``Reasoning_model`` inside the body, so switching the reasoning model
    in the select box kept returning the first cached pipeline. Passing
    the name as an argument makes each selected model its own cache entry.

    Args:
        model_name: Model identifier handed to ``pipeline``. When omitted,
            falls back to the module-level ``Reasoning_model`` (kept so
            existing zero-argument calls still work; such calls all share
            one cache entry).

    Returns:
        The ``pipeline("text-generation", ...)`` object.
    """
    if model_name is None:
        model_name = Reasoning_model
    # Alternative settings kept from the original:
    # device_map="auto", torch_dtype=torch.bfloat16
    return pipeline("text-generation", model=model_name, device_map=device)


# Pass the current selection explicitly so it takes part in the cache key.
pipe = load_pipe(Reasoning_model)
|
159 |
|
160 |
+
|
161 |
+
#mapSBS_button = st.button("Map to SBS codes", on_click=on_click, key="user_clickedSBS")
|
162 |
+
mapSBS_button = st.button("Map to SBS codes") #, key="user_clickedSBS")
|
163 |
+
|
164 |
+
INTdesc_embedding = model.encode(INTdesc_input)
|
165 |
+
|
166 |
+
# Semantic search, Compute cosine similarity between INTdesc_embedding and SBS descriptions
|
167 |
+
|
168 |
+
SBScorpus_embeddings = model.encode(SBScorpus)
|
169 |
|
170 |
dictA = {"Score": [], "SBS Code": [], "SBS Description V2.0": []}
|
171 |
dfALL = pd.DataFrame.from_dict(dictA)
|
172 |
|
173 |
+
|
174 |
+
# Run the semantic search only when a description is present and the
# "Map to SBS codes" button was clicked. The original condition
# (`... and if st.button(...):`) was a syntax error; `mapSBS_button` already
# holds the value returned by st.button("Map to SBS codes") and is the same
# flag used by the result-rendering guard that follows this block.
if INTdesc_input is not None and mapSBS_button:
    #my_model_results = pipeline("ner", model= "checkpoint-92")
    HF_model_results = util.semantic_search(INTdesc_embedding, SBScorpus_embeddings)
    HF_model_results_sorted = sorted(HF_model_results, key=lambda x: x[1], reverse=True)
    # Keep at most numMAPPINGS_input entries for display.
    HF_model_results_displayed = HF_model_results_sorted[0:numMAPPINGS_input]
|
179 |
+
|
180 |
+
|
181 |
if INTdesc_input is not None and mapSBS_button == True:
|
182 |
for i, result in enumerate(HF_model_results_displayed):
|
183 |
dictA.update({"Score": "%.4f" % result[0]["score"], "SBS Code": df_SBS.loc[df_SBS["Long_Description"] == SBScorpus[result[0]["corpus_id"]],"SBS_Code_Hyphenated"].values[0], "SBS Description V2.0": SBScorpus[result[0]["corpus_id"]]})
|