georad committed on
Commit
82d00d1
·
verified ·
1 Parent(s): 7ce2192

Update pages/type_text.py

Browse files
Files changed (1) hide show
  1. pages/type_text.py +69 -64
pages/type_text.py CHANGED
@@ -1,27 +1,26 @@
1
- import streamlit as st
2
  import streamlit.components.v1 as components
3
  import pandas as pd
4
  from io import StringIO
5
  import json
6
  import torch
7
  from transformers import pipeline # AutoTokenizer, AutoModelForCausalLM, AutoModelForTokenClassification
8
- from sentence_transformers import SentenceTransformer, util
9
  import time
10
  import os
11
 
12
- os.getenv("HF_TOKEN")
 
 
 
13
 
14
  def get_device_map() -> str:
15
  return 'cuda' if torch.cuda.is_available() else 'cpu'
 
16
  device = get_device_map() # 'cpu'
17
 
18
  def on_click():
19
- st.session_state.user_input = ""
20
-
21
- def make_spinner(text = "In progress..."):
22
- with st.spinner(text):
23
- yield
24
-
25
 
26
  # JavaScript to scroll the dummy element into view
27
  scroll_script = """
@@ -47,16 +46,18 @@ def convert_json(df:pd.DataFrame):
47
  #st.json(json_string, expanded=True)
48
  return json_string
49
 
50
- #df_chapters = pd.read_csv("SBS_V2_0/Chapter_Index_Rows.csv")
51
- df_chapters = pd.read_csv("SBS_V2_0/Chapter_Index_Rows_with_total.csv")
52
 
53
  startrowindex_list = df_chapters["from_row_index"].tolist()
54
  endrowindex_list = df_chapters["to_row_index"].tolist()
 
55
  allchapters_rows_list = []
56
  for s, e in zip(startrowindex_list, endrowindex_list):
57
  eachchapter_rows_list = list(range(s,e))
58
  allchapters_rows_list.append(eachchapter_rows_list)
59
- df_chapters['range_of_rows'] = allchapters_rows_list
 
60
 
61
  def dataframe_with_selections(df_chapters: pd.DataFrame, init_value: bool = False) -> pd.DataFrame:
62
  df_with_selections = df_chapters.copy()
@@ -77,52 +78,58 @@ def dataframe_with_selections(df_chapters: pd.DataFrame, init_value: bool = Fals
77
  if "selected_chapters" not in st.session_state:
78
  st.session_state['selected_chapters'] = []
79
  st.session_state['selected_rows'] = []
 
80
  selected_chapters_list = st.session_state.selected_chapters
81
  selected_rows_list = st.session_state.selected_rows
82
 
83
  selected_chapters = dataframe_with_selections(df_chapters)
 
84
  #st.write("Your selection:")
85
- #st.write(selected_chapters)
 
86
  #chapter_start_row_index = selected_chapters['from_row_index']
87
  #chapter_end_row_index = selected_chapters['to_row_index']
88
  chapter_rows_indexes_list = selected_chapters['range_of_rows'].tolist()
89
  #st.write("CHAPTER START ROW INDEX: ", chapter_start_row_index)
90
  #st.write("CHAPTER END ROW INDEX: ", chapter_end_row_index)
91
  #st.write("CHAPTER ROWS INDEXES LIST: ", chapter_rows_indexes_list)
92
- combined_chapters_rows_indexes_list = [0]
 
93
  for item in chapter_rows_indexes_list:
94
  combined_chapters_rows_indexes_list.extend(item)
95
 
96
- if len(combined_chapters_rows_indexes_list) == 1:
97
- st.warning("Please select at least one chapter above")
98
  #st.write("COMBINED CHAPTERS ROWS INDEXES LIST: ", combined_chapters_rows_indexes_list)
 
99
  df_SBS = pd.read_csv("SBS_V2_0/Code_Sheet.csv", header=0, skip_blank_lines=False, skiprows = lambda x: x not in combined_chapters_rows_indexes_list)
100
  #st.write(df_SBS.head(5))
 
101
  SBScorpus = df_SBS['Long_Description'].values.tolist()
102
 
103
  dictA = {"Score": [], "SBS Code": [], "SBS Description V2.0": []}
104
  dfALL = pd.DataFrame.from_dict(dictA)
105
 
106
- col1, col2 = st.columns([4,1])
107
- with col1:
108
- INTdesc_input = st.text_input("Type internal description", placeholder="Type internal description here", label_visibility="collapsed", key="user_input")
109
- #INTdesc_input = st.text_input(r"$\textsf{\Large Type internal description here}$", label_visibility="collapsed", key="user_input")
 
110
  with col2:
111
  col2.button("Remove text", on_click=on_click)
112
 
113
-
114
  ## Define the Sentence Transformer models
115
  st_models = {
116
- '(higher speed) original model for general domain: all-MiniLM-L6-v2': 'all-MiniLM-L6-v2',
117
- '(high performance) original model for general domain: all-mpnet-base-v2': 'all-mpnet-base-v2',
118
- '(expected in future) fine-tuned model for medical domain: all-MiniLM-L6-v2': 'all-MiniLM-L6-v2',
119
  '(expected in future) fine-tuned model for medical domain: all-mpnet-base-v2': 'all-mpnet-base-v2',
120
  }
121
 
122
  #model = SentenceTransformer('all-MiniLM-L6-v2') # fastest
123
  #model = SentenceTransformer('all-mpnet-base-v2') # best performance
124
  #model = SentenceTransformers('all-distilroberta-v1')
125
- #model = SentenceTransformer('sentence-transformers/msmarco-bert-base-dot-v5')
126
  #model = SentenceTransformer('clips/mfaq')
127
 
128
  ## Create the select Sentence Transformer box
@@ -132,16 +139,15 @@ selected_st_model = st.selectbox('Current selected Sentence Transformer model:',
132
  ## Get the selected SentTrans model
133
  SentTrans_model = st_models[selected_st_model]
134
 
135
-
136
  ## Define the Reasoning models
137
  rs_models = {
138
- '(medium speed) original model for general domain: meta-llama/Llama-3.2-1B-Instruct': 'meta-llama/Llama-3.2-1B-Instruct',
139
- '(slower speed) original model for general domain: Qwen/Qwen2-1.5B-Instruct': 'Qwen/Qwen2-1.5B-Instruct',
140
  '(medium speed) original model for general domain: EpistemeAI/ReasoningCore-1B-r1-0': 'EpistemeAI/ReasoningCore-1B-r1-0',
141
- '(expected in future) fine-tuned model for medical domain: meta-llama/Llama-3.2-1B-Instruct': 'meta-llama/Llama-3.2-1B-Instruct',
142
  '(expected in future) fine-tuned model for medical domain: Qwen/Qwen2-1.5B-Instruct': 'Qwen/Qwen2-1.5B-Instruct',
143
  }
144
-
145
  ## Create the select Reasoning box
146
  selected_rs_model = st.selectbox('Current selected Reasoning model:', list(rs_models.keys())) # or 'Choose a Reasoning Model'
147
  #st.write("Current selection:", selected_rs_model)
@@ -154,24 +160,28 @@ Reasoning_model = rs_models[selected_rs_model]
154
  def load_model():
155
  model = SentenceTransformer(SentTrans_model)
156
  return model
157
- model = load_model()
 
158
 
159
  ## Load the Reasoning model as pipeline ...
160
  @st.cache_resource
161
  def load_pipe():
162
- pipe = pipeline("text-generation", model=Reasoning_model, device_map=device,) # device_map="auto", torch_dtype=torch.bfloat16
163
- return pipe
 
164
  pipe = load_pipe()
165
 
166
  # Semantic search, Compute cosine similarity between INTdesc_embedding and SBS descriptions
167
  INTdesc_embedding = model.encode(INTdesc_input)
168
- SBScorpus_embeddings = model.encode(SBScorpus)
 
169
  numMAPPINGS_input = 5
170
 
171
- if INTdesc_input is not None and st.button(":blue[Map to SBS codes]", key="run_st_model"):
172
- HF_model_results = util.semantic_search(INTdesc_embedding, SBScorpus_embeddings)
173
- HF_model_results_sorted = sorted(HF_model_results, key=lambda x: x[1], reverse=True)
174
- HF_model_results_displayed = HF_model_results_sorted[0:numMAPPINGS_input]
 
175
 
176
  for i, result in enumerate(HF_model_results_displayed):
177
  dictA.update({"Score": "%.4f" % result[0]["score"], "SBS Code": df_SBS.loc[df_SBS["Long_Description"] == SBScorpus[result[0]["corpus_id"]],"SBS_Code_Hyphenated"].values[0], "SBS Description V2.0": SBScorpus[result[0]["corpus_id"]]})
@@ -184,45 +194,40 @@ if INTdesc_input is not None and st.button(":blue[Map to SBS codes]", key="run_s
184
  dfALL = pd.concat([dfALL, pd.DataFrame([dictA])], ignore_index=True)
185
  dictA.update({"Score": "%.4f" % result[4]["score"], "SBS Code": df_SBS.loc[df_SBS["Long_Description"] == SBScorpus[result[4]["corpus_id"]],"SBS_Code_Hyphenated"].values[0], "SBS Description V2.0": SBScorpus[result[4]["corpus_id"]]})
186
  dfALL = pd.concat([dfALL, pd.DataFrame([dictA])], ignore_index=True)
187
-
188
  st.dataframe(data=dfALL, hide_index=True)
189
- #st.markdown('<div id="bottom"></div>', unsafe_allow_html=True)
190
- #components.html(scroll_script, height=0, width=0)
191
 
192
- display_format = "ask REASONING MODEL: Which, if any, of the following SBS descriptions corresponds best to " + INTdesc_input +"? "
193
  #st.write(display_format)
194
- question = "Which one, if any, of the following Saudi Billing System descriptions A, B, C, D, or E corresponds best to " + INTdesc_input +"? "
195
- shortlist = [SBScorpus[result[0]["corpus_id"]], SBScorpus[result[1]["corpus_id"]], SBScorpus[result[2]["corpus_id"]], SBScorpus[result[3]["corpus_id"]], SBScorpus[result[4]["corpus_id"]]]
196
  prompt = question + " " +"A: "+ shortlist[0] + " " +"B: " + shortlist[1] + " " + "C: " + shortlist[2] + " " + "D: " + shortlist[3] + " " + "E: " + shortlist[4]
197
  #st.write(prompt)
198
-
199
  messages = [
200
  {"role": "system", "content": "You are a knowledgable AI assistant who always answers truthfully and precisely!"},
201
  {"role": "user", "content": prompt},
202
  ]
203
-
204
- status_text = st.empty()
205
- status_text.warning("It may take several minutes for Reasoning Model to analyze above 5 options and output results below")
206
- st.write("")
207
- #runningToggle(True)
208
-
209
- outputs = pipe(
210
- messages,
211
- max_new_tokens=256,
212
- )
213
- st.write(outputs[0]["generated_text"][-1]["content"])
214
- #st.markdown('<div id="bottom"></div>', unsafe_allow_html=True)
215
- #components.html(scroll_script, height=0, width=0)
216
-
217
  bs, b1, b2, b3, bLast = st.columns([0.75, 1.5, 1.5, 1.5, 0.75])
218
  with b1:
219
- #csvbutton = download_button(results, "results.csv", "📥 Download .csv")
220
  csvbutton = st.download_button(label="📥 Download .csv", data=convert_df(dfALL), file_name= "results.csv", mime='text/csv', key='csv_b')
221
  with b2:
222
- #textbutton = download_button(results, "results.txt", "📥 Download .txt")
223
  textbutton = st.download_button(label="📥 Download .txt", data=convert_df(dfALL), file_name= "results.text", mime='text/plain', key='text_b')
224
  with b3:
225
- #jsonbutton = download_button(results, "results.json", "📥 Download .json")
226
- jsonbutton = st.download_button(label="📥 Download .json", data=convert_json(dfALL), file_name= "results.json", mime='application/json', key='json_b')
 
227
  st.markdown('<div id="bottom"></div>', unsafe_allow_html=True)
228
- components.html(scroll_script, height=0, width=0)
 
 
 
1
+ import streamlit as st
2
  import streamlit.components.v1 as components
3
  import pandas as pd
4
  from io import StringIO
5
  import json
6
  import torch
7
  from transformers import pipeline # AutoTokenizer, AutoModelForCausalLM, AutoModelForTokenClassification
8
+ from sentence_transformers import SentenceTransformer, util
9
  import time
10
  import os
11
 
12
+ # It's better practice to configure Streamlit page settings at the top
13
+ st.set_page_config(layout="wide")
14
+
15
+ os.getenv("HF_TOKEN")
16
 
17
  def get_device_map() -> str:
18
  return 'cuda' if torch.cuda.is_available() else 'cpu'
19
+
20
  device = get_device_map() # 'cpu'
21
 
22
  def on_click():
23
+ st.session_state.user_input = ""
 
 
 
 
 
24
 
25
  # JavaScript to scroll the dummy element into view
26
  scroll_script = """
 
46
  #st.json(json_string, expanded=True)
47
  return json_string
48
 
49
+ #df_chapters = pd.read_csv("SBS_V2_0/Chapter_Index_Rows.csv")
50
+ df_chapters = pd.read_csv("SBS_V2_0/Chapter_Index_Rows_with_total.csv")
51
 
52
  startrowindex_list = df_chapters["from_row_index"].tolist()
53
  endrowindex_list = df_chapters["to_row_index"].tolist()
54
+
55
  allchapters_rows_list = []
56
  for s, e in zip(startrowindex_list, endrowindex_list):
57
  eachchapter_rows_list = list(range(s,e))
58
  allchapters_rows_list.append(eachchapter_rows_list)
59
+
60
+ df_chapters['range_of_rows'] = allchapters_rows_list
61
 
62
  def dataframe_with_selections(df_chapters: pd.DataFrame, init_value: bool = False) -> pd.DataFrame:
63
  df_with_selections = df_chapters.copy()
 
78
  if "selected_chapters" not in st.session_state:
79
  st.session_state['selected_chapters'] = []
80
  st.session_state['selected_rows'] = []
81
+
82
  selected_chapters_list = st.session_state.selected_chapters
83
  selected_rows_list = st.session_state.selected_rows
84
 
85
  selected_chapters = dataframe_with_selections(df_chapters)
86
+
87
  #st.write("Your selection:")
88
+ #st.write(selected_chapters)
89
+
90
  #chapter_start_row_index = selected_chapters['from_row_index']
91
  #chapter_end_row_index = selected_chapters['to_row_index']
92
  chapter_rows_indexes_list = selected_chapters['range_of_rows'].tolist()
93
  #st.write("CHAPTER START ROW INDEX: ", chapter_start_row_index)
94
  #st.write("CHAPTER END ROW INDEX: ", chapter_end_row_index)
95
  #st.write("CHAPTER ROWS INDEXES LIST: ", chapter_rows_indexes_list)
96
+
97
+ combined_chapters_rows_indexes_list = [0]
98
  for item in chapter_rows_indexes_list:
99
  combined_chapters_rows_indexes_list.extend(item)
100
 
101
+ if len(combined_chapters_rows_indexes_list) == 1:
102
+ st.warning("Please select at least one chapter above")
103
  #st.write("COMBINED CHAPTERS ROWS INDEXES LIST: ", combined_chapters_rows_indexes_list)
104
+
105
  df_SBS = pd.read_csv("SBS_V2_0/Code_Sheet.csv", header=0, skip_blank_lines=False, skiprows = lambda x: x not in combined_chapters_rows_indexes_list)
106
  #st.write(df_SBS.head(5))
107
+
108
  SBScorpus = df_SBS['Long_Description'].values.tolist()
109
 
110
  dictA = {"Score": [], "SBS Code": [], "SBS Description V2.0": []}
111
  dfALL = pd.DataFrame.from_dict(dictA)
112
 
113
+ col1, col2 = st.columns([4,1])
114
+ with col1:
115
+ INTdesc_input = st.text_input("Type internal description", placeholder="Type internal description here", label_visibility="collapsed", key="user_input")
116
+ #INTdesc_input = st.text_input(r"$\textsf{\Large Type internal description here}$", label_visibility="collapsed", key="user_input")
117
+
118
  with col2:
119
  col2.button("Remove text", on_click=on_click)
120
 
 
121
  ## Define the Sentence Transformer models
122
  st_models = {
123
+ '(higher speed) original model for general domain: all-MiniLM-L6-v2': 'all-MiniLM-L6-v2',
124
+ '(high performance) original model for general domain: all-mpnet-base-v2': 'all-mpnet-base-v2',
125
+ '(expected in future) fine-tuned model for medical domain: all-MiniLM-L6-v2': 'all-MiniLM-L6-v2',
126
  '(expected in future) fine-tuned model for medical domain: all-mpnet-base-v2': 'all-mpnet-base-v2',
127
  }
128
 
129
  #model = SentenceTransformer('all-MiniLM-L6-v2') # fastest
130
  #model = SentenceTransformer('all-mpnet-base-v2') # best performance
131
  #model = SentenceTransformers('all-distilroberta-v1')
132
+ #model = SentenceTransformer('sentence-transformers/msmarco-bert-base-dot-v5')
133
  #model = SentenceTransformer('clips/mfaq')
134
 
135
  ## Create the select Sentence Transformer box
 
139
  ## Get the selected SentTrans model
140
  SentTrans_model = st_models[selected_st_model]
141
 
 
142
  ## Define the Reasoning models
143
  rs_models = {
144
+ '(medium speed) original model for general domain: meta-llama/Llama-3.2-1B-Instruct': 'meta-llama/Llama-3.2-1B-Instruct',
145
+ '(slower speed) original model for general domain: Qwen/Qwen2-1.5B-Instruct': 'Qwen/Qwen2-1.5B-Instruct',
146
  '(medium speed) original model for general domain: EpistemeAI/ReasoningCore-1B-r1-0': 'EpistemeAI/ReasoningCore-1B-r1-0',
147
+ '(expected in future) fine-tuned model for medical domain: meta-llama/Llama-3.2-1B-Instruct': 'meta-llama/Llama-3.2-1B-Instruct',
148
  '(expected in future) fine-tuned model for medical domain: Qwen/Qwen2-1.5B-Instruct': 'Qwen/Qwen2-1.5B-Instruct',
149
  }
150
+
151
  ## Create the select Reasoning box
152
  selected_rs_model = st.selectbox('Current selected Reasoning model:', list(rs_models.keys())) # or 'Choose a Reasoning Model'
153
  #st.write("Current selection:", selected_rs_model)
 
160
  def load_model():
161
  model = SentenceTransformer(SentTrans_model)
162
  return model
163
+
164
+ model = load_model()
165
 
166
  ## Load the Reasoning model as pipeline ...
167
  @st.cache_resource
168
  def load_pipe():
169
+ pipe = pipeline("text-generation", model=Reasoning_model, device_map=device,) # device_map="auto", torch_dtype=torch.bfloat16
170
+ return pipe
171
+
172
  pipe = load_pipe()
173
 
174
  # Semantic search, Compute cosine similarity between INTdesc_embedding and SBS descriptions
175
  INTdesc_embedding = model.encode(INTdesc_input)
176
+ SBScorpus_embeddings = model.encode(SBScorpus)
177
+
178
  numMAPPINGS_input = 5
179
 
180
+ if INTdesc_input and st.button(":blue[Map to SBS codes]", key="run_st_model"): # Added check for INTdesc_input not being empty
181
+ with st.spinner("Running Sentence Transformer model..."): # Added spinner for ST model
182
+ HF_model_results = util.semantic_search(INTdesc_embedding, SBScorpus_embeddings)
183
+ HF_model_results_sorted = sorted(HF_model_results, key=lambda x: x[1], reverse=True)
184
+ HF_model_results_displayed = HF_model_results_sorted[0:numMAPPINGS_input]
185
 
186
  for i, result in enumerate(HF_model_results_displayed):
187
  dictA.update({"Score": "%.4f" % result[0]["score"], "SBS Code": df_SBS.loc[df_SBS["Long_Description"] == SBScorpus[result[0]["corpus_id"]],"SBS_Code_Hyphenated"].values[0], "SBS Description V2.0": SBScorpus[result[0]["corpus_id"]]})
 
194
  dfALL = pd.concat([dfALL, pd.DataFrame([dictA])], ignore_index=True)
195
  dictA.update({"Score": "%.4f" % result[4]["score"], "SBS Code": df_SBS.loc[df_SBS["Long_Description"] == SBScorpus[result[4]["corpus_id"]],"SBS_Code_Hyphenated"].values[0], "SBS Description V2.0": SBScorpus[result[4]["corpus_id"]]})
196
  dfALL = pd.concat([dfALL, pd.DataFrame([dictA])], ignore_index=True)
197
+
198
  st.dataframe(data=dfALL, hide_index=True)
 
 
199
 
200
+ display_format = "ask REASONING MODEL: Which, if any, of the following SBS descriptions corresponds best to " + INTdesc_input +"? "
201
  #st.write(display_format)
202
+ question = "Which one, if any, of the following Saudi Billing System descriptions A, B, C, D, or E corresponds best to " + INTdesc_input +"? "
203
+ shortlist = [SBScorpus[result[0]["corpus_id"]], SBScorpus[result[1]["corpus_id"]], SBScorpus[result[2]["corpus_id"]], SBScorpus[result[3]["corpus_id"]], SBScorpus[result[4]["corpus_id"]]]
204
  prompt = question + " " +"A: "+ shortlist[0] + " " +"B: " + shortlist[1] + " " + "C: " + shortlist[2] + " " + "D: " + shortlist[3] + " " + "E: " + shortlist[4]
205
  #st.write(prompt)
 
206
  messages = [
207
  {"role": "system", "content": "You are a knowledgable AI assistant who always answers truthfully and precisely!"},
208
  {"role": "user", "content": prompt},
209
  ]
210
+ # The old status_text warning is replaced by the st.spinner around the pipeline call below
211
+ st.write("") # Add some space before the Reasoning Model output
212
+ st.subheader("Reasoning Model Output:") # Added a subheader for clarity
213
+ with st.spinner("Running Reasoning Model..."): # Added spinner for Reasoning model
214
+ outputs = pipe(
215
+ messages,
216
+ max_new_tokens=256,
217
+ )
218
+ st.write(outputs[0]["generated_text"][-1]["content"])
219
+
220
+ # Download buttons
 
 
 
221
  bs, b1, b2, b3, bLast = st.columns([0.75, 1.5, 1.5, 1.5, 0.75])
222
  with b1:
 
223
  csvbutton = st.download_button(label="📥 Download .csv", data=convert_df(dfALL), file_name= "results.csv", mime='text/csv', key='csv_b')
224
  with b2:
 
225
  textbutton = st.download_button(label="📥 Download .txt", data=convert_df(dfALL), file_name= "results.text", mime='text/plain', key='text_b')
226
  with b3:
227
+ jsonbutton = st.download_button(label="📥 Download .json", data=convert_json(dfALL), file_name= "results.json", mime='application/json', key='json_b')
228
+
229
+ # Add a dummy element at the bottom AFTER all content is rendered
230
  st.markdown('<div id="bottom"></div>', unsafe_allow_html=True)
231
+
232
+ # Inject the scroll script AFTER the dummy element is added
233
+ components.html(scroll_script, height=0, width=0)  # html() has no unsafe_allow_html parameter (that is an st.markdown option)