Spaces:

georad
/

sbsmapper

Running

App Files Files Community

georad committed on May 16

Commit

82d00d1

verified ·

1 Parent(s): 7ce2192

Update pages/type_text.py

Browse files

Files changed (1) hide show

pages/type_text.py +69 -64

pages/type_text.py CHANGED Viewed

@@ -1,27 +1,26 @@
-import streamlit as st
 import streamlit.components.v1 as components
 import pandas as pd
 from io import StringIO
 import json
 import torch
 from transformers import pipeline # AutoTokenizer, AutoModelForCausalLM, AutoModelForTokenClassification
-from sentence_transformers import SentenceTransformer, util
 import time
 import os
-os.getenv("HF_TOKEN")
 def get_device_map() -> str:
     return 'cuda' if torch.cuda.is_available() else 'cpu'
 device = get_device_map()  # 'cpu'
 def on_click():
-    st.session_state.user_input = ""
-def make_spinner(text = "In progress..."):
-    with st.spinner(text):
-        yield
 # JavaScript to scroll the dummy element into view
 scroll_script = """
@@ -47,16 +46,18 @@ def convert_json(df:pd.DataFrame):
     #st.json(json_string, expanded=True)
     return json_string
-#df_chapters = pd.read_csv("SBS_V2_0/Chapter_Index_Rows.csv")
-df_chapters = pd.read_csv("SBS_V2_0/Chapter_Index_Rows_with_total.csv")
 startrowindex_list = df_chapters["from_row_index"].tolist()
 endrowindex_list = df_chapters["to_row_index"].tolist()
 allchapters_rows_list = []
 for s, e in zip(startrowindex_list, endrowindex_list):
     eachchapter_rows_list = list(range(s,e))
     allchapters_rows_list.append(eachchapter_rows_list)
-df_chapters['range_of_rows'] = allchapters_rows_list
 def dataframe_with_selections(df_chapters: pd.DataFrame, init_value: bool = False) -> pd.DataFrame:
     df_with_selections = df_chapters.copy()
@@ -77,52 +78,58 @@ def dataframe_with_selections(df_chapters: pd.DataFrame, init_value: bool = Fals
 if "selected_chapters" not in st.session_state:
     st.session_state['selected_chapters'] = []
     st.session_state['selected_rows'] = []
 selected_chapters_list = st.session_state.selected_chapters
 selected_rows_list = st.session_state.selected_rows
 selected_chapters = dataframe_with_selections(df_chapters)
 #st.write("Your selection:")
-#st.write(selected_chapters)
 #chapter_start_row_index = selected_chapters['from_row_index']
 #chapter_end_row_index = selected_chapters['to_row_index']
 chapter_rows_indexes_list = selected_chapters['range_of_rows'].tolist()
 #st.write("CHAPTER START ROW INDEX: ", chapter_start_row_index)
 #st.write("CHAPTER END ROW INDEX: ", chapter_end_row_index)
 #st.write("CHAPTER ROWS INDEXES LIST: ", chapter_rows_indexes_list)
-combined_chapters_rows_indexes_list = [0]
 for item in chapter_rows_indexes_list:
     combined_chapters_rows_indexes_list.extend(item)
-if len(combined_chapters_rows_indexes_list) == 1:
-    st.warning("Please select at least one chapter above")
 #st.write("COMBINED CHAPTERS ROWS INDEXES LIST: ", combined_chapters_rows_indexes_list)
 df_SBS = pd.read_csv("SBS_V2_0/Code_Sheet.csv", header=0, skip_blank_lines=False, skiprows = lambda x: x not in combined_chapters_rows_indexes_list)
 #st.write(df_SBS.head(5))
 SBScorpus = df_SBS['Long_Description'].values.tolist()
 dictA = {"Score": [], "SBS Code": [], "SBS Description V2.0": []}
 dfALL = pd.DataFrame.from_dict(dictA)
-col1, col2 = st.columns([4,1])
-with col1:
-    INTdesc_input = st.text_input("Type internal description", placeholder="Type internal description here", label_visibility="collapsed", key="user_input")
-    #INTdesc_input = st.text_input(r"$\textsf{\Large Type internal description here}$", label_visibility="collapsed", key="user_input")
 with col2:
     col2.button("Remove text", on_click=on_click)
 ## Define the Sentence Transformer models
 st_models = {
-    '(higher speed) original model for general domain: all-MiniLM-L6-v2': 'all-MiniLM-L6-v2',
-    '(high performance) original model for general domain: all-mpnet-base-v2': 'all-mpnet-base-v2',
-    '(expected in future) fine-tuned model for medical domain: all-MiniLM-L6-v2': 'all-MiniLM-L6-v2',
     '(expected in future) fine-tuned model for medical domain: all-mpnet-base-v2': 'all-mpnet-base-v2',
 }
 #model = SentenceTransformer('all-MiniLM-L6-v2') # fastest
 #model = SentenceTransformer('all-mpnet-base-v2') # best performance
 #model = SentenceTransformers('all-distilroberta-v1')
-#model = SentenceTransformer('sentence-transformers/msmarco-bert-base-dot-v5')
 #model = SentenceTransformer('clips/mfaq')
 ## Create the select Sentence Transformer box
@@ -132,16 +139,15 @@ selected_st_model = st.selectbox('Current selected Sentence Transformer model:',
 ## Get the selected SentTrans model
 SentTrans_model = st_models[selected_st_model]
 ## Define the Reasoning models
 rs_models = {
-    '(medium speed) original model for general domain: meta-llama/Llama-3.2-1B-Instruct': 'meta-llama/Llama-3.2-1B-Instruct',
-    '(slower speed) original model for general domain: Qwen/Qwen2-1.5B-Instruct': 'Qwen/Qwen2-1.5B-Instruct',
     '(medium speed) original model for general domain: EpistemeAI/ReasoningCore-1B-r1-0': 'EpistemeAI/ReasoningCore-1B-r1-0',
-    '(expected in future) fine-tuned model for medical domain: meta-llama/Llama-3.2-1B-Instruct': 'meta-llama/Llama-3.2-1B-Instruct',
     '(expected in future) fine-tuned model for medical domain: Qwen/Qwen2-1.5B-Instruct': 'Qwen/Qwen2-1.5B-Instruct',
 }
 ## Create the select Reasoning box
 selected_rs_model = st.selectbox('Current selected Reasoning model:', list(rs_models.keys())) # or 'Choose a Reasoning Model'
 #st.write("Current selection:", selected_rs_model)
@@ -154,24 +160,28 @@ Reasoning_model = rs_models[selected_rs_model]
 def load_model():
     model = SentenceTransformer(SentTrans_model)
     return model
-model = load_model()
 ## Load the Reasoning model as pipeline ...
 @st.cache_resource
 def load_pipe():
-    pipe = pipeline("text-generation", model=Reasoning_model, device_map=device,) # device_map="auto", torch_dtype=torch.bfloat16
-    return pipe
 pipe = load_pipe()
 # Semantic search, Compute cosine similarity between INTdesc_embedding and SBS descriptions
 INTdesc_embedding = model.encode(INTdesc_input)
-SBScorpus_embeddings = model.encode(SBScorpus)
 numMAPPINGS_input = 5
-if INTdesc_input is not None and st.button(":blue[Map to SBS codes]", key="run_st_model"):
-    HF_model_results = util.semantic_search(INTdesc_embedding, SBScorpus_embeddings)
-    HF_model_results_sorted = sorted(HF_model_results, key=lambda x: x[1], reverse=True)
-    HF_model_results_displayed = HF_model_results_sorted[0:numMAPPINGS_input]
     for i, result in enumerate(HF_model_results_displayed):
         dictA.update({"Score": "%.4f" % result[0]["score"], "SBS Code": df_SBS.loc[df_SBS["Long_Description"] == SBScorpus[result[0]["corpus_id"]],"SBS_Code_Hyphenated"].values[0], "SBS Description V2.0": SBScorpus[result[0]["corpus_id"]]})
@@ -184,45 +194,40 @@ if INTdesc_input is not None and st.button(":blue[Map to SBS codes]", key="run_s
         dfALL = pd.concat([dfALL, pd.DataFrame([dictA])], ignore_index=True)
         dictA.update({"Score": "%.4f" % result[4]["score"], "SBS Code": df_SBS.loc[df_SBS["Long_Description"] == SBScorpus[result[4]["corpus_id"]],"SBS_Code_Hyphenated"].values[0], "SBS Description V2.0": SBScorpus[result[4]["corpus_id"]]})
         dfALL = pd.concat([dfALL, pd.DataFrame([dictA])], ignore_index=True)
     st.dataframe(data=dfALL, hide_index=True)
-    #st.markdown('<div id="bottom"></div>', unsafe_allow_html=True)
-    #components.html(scroll_script, height=0, width=0)
-    display_format = "ask REASONING MODEL: Which, if any, of the following SBS descriptions corresponds best to " + INTdesc_input +"? "
     #st.write(display_format)
-    question = "Which one, if any, of the following Saudi Billing System descriptions A, B, C, D, or E corresponds best to " + INTdesc_input +"? "
-    shortlist = [SBScorpus[result[0]["corpus_id"]], SBScorpus[result[1]["corpus_id"]], SBScorpus[result[2]["corpus_id"]], SBScorpus[result[3]["corpus_id"]], SBScorpus[result[4]["corpus_id"]]]
     prompt = question + " " +"A: "+ shortlist[0] + " " +"B: " + shortlist[1] + " " + "C: " + shortlist[2] + " " + "D: " + shortlist[3] + " " + "E: " + shortlist[4]
     #st.write(prompt)
     messages = [
     {"role": "system", "content": "You are a knowledgable AI assistant who always answers truthfully and precisely!"},
     {"role": "user", "content": prompt},
     ]
-    status_text = st.empty()
-    status_text.warning("It may take several minutes for Reasoning Model to analyze above 5 options and output results below")
-    st.write("")
-    #runningToggle(True)
-    outputs = pipe(
-        messages,
-        max_new_tokens=256,
-    )
-    st.write(outputs[0]["generated_text"][-1]["content"])
-    #st.markdown('<div id="bottom"></div>', unsafe_allow_html=True)
-    #components.html(scroll_script, height=0, width=0)
     bs, b1, b2, b3, bLast = st.columns([0.75, 1.5, 1.5, 1.5, 0.75])
     with b1:
-        #csvbutton = download_button(results, "results.csv", "📥 Download .csv")
         csvbutton = st.download_button(label="📥 Download .csv", data=convert_df(dfALL), file_name= "results.csv", mime='text/csv', key='csv_b')
     with b2:
-        #textbutton = download_button(results, "results.txt", "📥 Download .txt")
         textbutton = st.download_button(label="📥 Download .txt", data=convert_df(dfALL), file_name= "results.text", mime='text/plain',  key='text_b')
     with b3:
-        #jsonbutton = download_button(results, "results.json", "📥 Download .json")
-        jsonbutton = st.download_button(label="📥 Download .json", data=convert_json(dfALL), file_name= "results.json", mime='application/json',  key='json_b')
     st.markdown('<div id="bottom"></div>', unsafe_allow_html=True)
-    components.html(scroll_script, height=0, width=0)

+import streamlit as st
 import streamlit.components.v1 as components
 import pandas as pd
 from io import StringIO
 import json
 import torch
 from transformers import pipeline # AutoTokenizer, AutoModelForCausalLM, AutoModelForTokenClassification
+from sentence_transformers import SentenceTransformer, util
 import time
 import os
+# It's better practice to configure Streamlit page settings at the top
+st.set_page_config(layout="wide")
+os.getenv("HF_TOKEN")
 def get_device_map() -> str:
     return 'cuda' if torch.cuda.is_available() else 'cpu'
 device = get_device_map()  # 'cpu'
 def on_click():
+    st.session_state.user_input = ""
 # JavaScript to scroll the dummy element into view
 scroll_script = """
     #st.json(json_string, expanded=True)
     return json_string
+#df_chapters = pd.read_csv("SBS_V2_0/Chapter_Index_Rows.csv")
+df_chapters = pd.read_csv("SBS_V2_0/Chapter_Index_Rows_with_total.csv")
 startrowindex_list = df_chapters["from_row_index"].tolist()
 endrowindex_list = df_chapters["to_row_index"].tolist()
 allchapters_rows_list = []
 for s, e in zip(startrowindex_list, endrowindex_list):
     eachchapter_rows_list = list(range(s,e))
     allchapters_rows_list.append(eachchapter_rows_list)
+df_chapters['range_of_rows'] = allchapters_rows_list
 def dataframe_with_selections(df_chapters: pd.DataFrame, init_value: bool = False) -> pd.DataFrame:
     df_with_selections = df_chapters.copy()
 if "selected_chapters" not in st.session_state:
     st.session_state['selected_chapters'] = []
     st.session_state['selected_rows'] = []
 selected_chapters_list = st.session_state.selected_chapters
 selected_rows_list = st.session_state.selected_rows
 selected_chapters = dataframe_with_selections(df_chapters)
 #st.write("Your selection:")
+#st.write(selected_chapters)
 #chapter_start_row_index = selected_chapters['from_row_index']
 #chapter_end_row_index = selected_chapters['to_row_index']
 chapter_rows_indexes_list = selected_chapters['range_of_rows'].tolist()
 #st.write("CHAPTER START ROW INDEX: ", chapter_start_row_index)
 #st.write("CHAPTER END ROW INDEX: ", chapter_end_row_index)
 #st.write("CHAPTER ROWS INDEXES LIST: ", chapter_rows_indexes_list)
+combined_chapters_rows_indexes_list = [0]
 for item in chapter_rows_indexes_list:
     combined_chapters_rows_indexes_list.extend(item)
+if len(combined_chapters_rows_indexes_list) == 1:
+     st.warning("Please select at least one chapter above")
 #st.write("COMBINED CHAPTERS ROWS INDEXES LIST: ", combined_chapters_rows_indexes_list)
 df_SBS = pd.read_csv("SBS_V2_0/Code_Sheet.csv", header=0, skip_blank_lines=False, skiprows = lambda x: x not in combined_chapters_rows_indexes_list)
 #st.write(df_SBS.head(5))
 SBScorpus = df_SBS['Long_Description'].values.tolist()
 dictA = {"Score": [], "SBS Code": [], "SBS Description V2.0": []}
 dfALL = pd.DataFrame.from_dict(dictA)
+col1, col2 = st.columns([4,1])
+with col1:
+     INTdesc_input = st.text_input("Type internal description", placeholder="Type internal description here", label_visibility="collapsed", key="user_input")
+     #INTdesc_input = st.text_input(r"$\textsf{\Large Type internal description here}$", label_visibility="collapsed", key="user_input")
 with col2:
     col2.button("Remove text", on_click=on_click)
 ## Define the Sentence Transformer models
 st_models = {
+    '(higher speed) original model for general domain: all-MiniLM-L6-v2': 'all-MiniLM-L6-v2',
+    '(high performance) original model for general domain: all-mpnet-base-v2': 'all-mpnet-base-v2',
+    '(expected in future) fine-tuned model for medical domain: all-MiniLM-L6-v2': 'all-MiniLM-L6-v2',
     '(expected in future) fine-tuned model for medical domain: all-mpnet-base-v2': 'all-mpnet-base-v2',
 }
 #model = SentenceTransformer('all-MiniLM-L6-v2') # fastest
 #model = SentenceTransformer('all-mpnet-base-v2') # best performance
 #model = SentenceTransformers('all-distilroberta-v1')
+#model = SentenceTransformer('sentence-transformers/msmarco-bert-base-dot-v5')
 #model = SentenceTransformer('clips/mfaq')
 ## Create the select Sentence Transformer box
 ## Get the selected SentTrans model
 SentTrans_model = st_models[selected_st_model]
 ## Define the Reasoning models
 rs_models = {
+    '(medium speed) original model for general domain: meta-llama/Llama-3.2-1B-Instruct': 'meta-llama/Llama-3.2-1B-Instruct',
+    '(slower speed) original model for general domain: Qwen/Qwen2-1.5B-Instruct': 'Qwen/Qwen2-1.5B-Instruct',
     '(medium speed) original model for general domain: EpistemeAI/ReasoningCore-1B-r1-0': 'EpistemeAI/ReasoningCore-1B-r1-0',
+    '(expected in future) fine-tuned model for medical domain: meta-llama/Llama-3.2-1B-Instruct': 'meta-llama/Llama-3.2-1B-Instruct',
     '(expected in future) fine-tuned model for medical domain: Qwen/Qwen2-1.5B-Instruct': 'Qwen/Qwen2-1.5B-Instruct',
 }
 ## Create the select Reasoning box
 selected_rs_model = st.selectbox('Current selected Reasoning model:', list(rs_models.keys())) # or 'Choose a Reasoning Model'
 #st.write("Current selection:", selected_rs_model)
 def load_model():
     model = SentenceTransformer(SentTrans_model)
     return model
+model = load_model()
 ## Load the Reasoning model as pipeline ...
 @st.cache_resource
 def load_pipe():
+    pipe = pipeline("text-generation", model=Reasoning_model, device_map=device,) # device_map="auto", torch_dtype=torch.bfloat16
+    return pipe
 pipe = load_pipe()
 # Semantic search, Compute cosine similarity between INTdesc_embedding and SBS descriptions
 INTdesc_embedding = model.encode(INTdesc_input)
+SBScorpus_embeddings = model.encode(SBScorpus)
 numMAPPINGS_input = 5
+if INTdesc_input and st.button(":blue[Map to SBS codes]", key="run_st_model"): # Added check for INTdesc_input not being empty
+    with st.spinner("Running Sentence Transformer model..."): # Added spinner for ST model
+        HF_model_results = util.semantic_search(INTdesc_embedding, SBScorpus_embeddings)
+        HF_model_results_sorted = sorted(HF_model_results, key=lambda x: x[1], reverse=True)
+        HF_model_results_displayed = HF_model_results_sorted[0:numMAPPINGS_input]
     for i, result in enumerate(HF_model_results_displayed):
         dictA.update({"Score": "%.4f" % result[0]["score"], "SBS Code": df_SBS.loc[df_SBS["Long_Description"] == SBScorpus[result[0]["corpus_id"]],"SBS_Code_Hyphenated"].values[0], "SBS Description V2.0": SBScorpus[result[0]["corpus_id"]]})
         dfALL = pd.concat([dfALL, pd.DataFrame([dictA])], ignore_index=True)
         dictA.update({"Score": "%.4f" % result[4]["score"], "SBS Code": df_SBS.loc[df_SBS["Long_Description"] == SBScorpus[result[4]["corpus_id"]],"SBS_Code_Hyphenated"].values[0], "SBS Description V2.0": SBScorpus[result[4]["corpus_id"]]})
         dfALL = pd.concat([dfALL, pd.DataFrame([dictA])], ignore_index=True)
     st.dataframe(data=dfALL, hide_index=True)
+    display_format = "ask REASONING MODEL: Which, if any, of the following SBS descriptions corresponds best to " + INTdesc_input +"? "
     #st.write(display_format)
+    question = "Which one, if any, of the following Saudi Billing System descriptions A, B, C, D, or E corresponds best to " + INTdesc_input +"? "
+    shortlist = [SBScorpus[result[0]["corpus_id"]], SBScorpus[result[1]["corpus_id"]], SBScorpus[result[2]["corpus_id"]], SBScorpus[result[3]["corpus_id"]], SBScorpus[result[4]["corpus_id"]]]
     prompt = question + " " +"A: "+ shortlist[0] + " " +"B: " + shortlist[1] + " " + "C: " + shortlist[2] + " " + "D: " + shortlist[3] + " " + "E: " + shortlist[4]
     #st.write(prompt)
     messages = [
     {"role": "system", "content": "You are a knowledgable AI assistant who always answers truthfully and precisely!"},
     {"role": "user", "content": prompt},
     ]
+    # The old status_text warning was removed; the st.spinner context below now signals that the Reasoning model is running
+    st.write("") # Add some space before the Reasoning Model output
+    st.subheader("Reasoning Model Output:") # Added a subheader for clarity
+    with st.spinner("Running Reasoning Model..."): # Added spinner for Reasoning model
+        outputs = pipe(
+            messages,
+            max_new_tokens=256,
+        )
+    # Display the "content" of the last entry in the first output's "generated_text"
+    st.write(outputs[0]["generated_text"][-1]["content"])
+    # Download buttons
     bs, b1, b2, b3, bLast = st.columns([0.75, 1.5, 1.5, 1.5, 0.75])
     with b1:
         csvbutton = st.download_button(label="📥 Download .csv", data=convert_df(dfALL), file_name= "results.csv", mime='text/csv', key='csv_b')
     with b2:
         textbutton = st.download_button(label="📥 Download .txt", data=convert_df(dfALL), file_name= "results.text", mime='text/plain',  key='text_b')
     with b3:
+        jsonbutton = st.download_button(label="📥 Download .json", data=convert_json(dfALL), file_name= "results.json", mime='application/json',  key='json_b')
+    # Add a dummy element at the bottom AFTER all content is rendered
     st.markdown('<div id="bottom"></div>', unsafe_allow_html=True)
+    # Inject the scroll script AFTER the dummy element is added.
+    # components.html() has no unsafe_allow_html parameter (that keyword belongs to st.markdown);
+    # passing it raises TypeError, so only the size arguments are given.
+    components.html(scroll_script, height=0, width=0)