import awesome_streamlit as ast
import streamlit as st
import pandas as pd
from io import StringIO
import json

#import torch
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM  #AutoModelForTokenClassification
from sentence_transformers import SentenceTransformer, util
#import lmdeploy
#import turbomind as tm

from backend.utils import get_current_ram_usage, ga
#import backend.aragpt
import backend.home
import backend.processor
#import backend.sa
#import backend.qa

import os

HF_TOKEN = os.getenv("HF_TOKEN")  # needed for gated models such as meta-llama

st.set_page_config(
    page_title="TEST",
    page_icon="📖",
    initial_sidebar_state="expanded",
    layout="wide",
)
ga(st.__file__)

PAGES = {
    "Home": backend.home,
    "Demo": backend.processor,  # original referenced the undefined name `Demo`; backend.processor assumed
    "About": backend.home,
}

st.sidebar.title("SBSmapper")
selection = st.sidebar.radio("Pages", list(PAGES.keys()))
page = PAGES[selection]
#with st.spinner(f"Loading {selection} ..."):
#    ast.shared.components.write_page(page)

st.sidebar.header("Info")
st.sidebar.write("Project by JA RAD")
#st.sidebar.write(
#    "Pre-trained models are available on [HF Hub](https://huggingface.co/)"
#)
#st.sidebar.write(
#    "Models source code available on [GitHub](https://github.com/)"
#)
#st.sidebar.write(
#    "App source code available on [GitHub](https://github.com/)"
#)

if st.sidebar.checkbox("Show RAM usage"):
    ram = get_current_ram_usage()
    st.sidebar.write("RAM usage: {:.2f}/{:.2f} GB".format(ram[0], ram[1]))


def on_click():
    st.session_state.user_input = ""


#@st.cache
def convert_df(df: pd.DataFrame):
    return df.to_csv(index=False).encode("utf-8")


#@st.cache
def convert_json(df: pd.DataFrame):
    result = df.to_json(orient="index")
    parsed = json.loads(result)
    json_string = json.dumps(parsed)
    #st.json(json_string, expanded=True)
    return json_string


#st.title("📘SBS mapper")

INTdesc_input = st.text_input("Type internal description and hit Enter", key="user_input")

createSBScodes, right_column = st.columns(2)
createSBScodes_clicked = createSBScodes.button("Map to SBS codes", key="user_createSBScodes")
right_column.button("Reset", on_click=on_click)

numMAPPINGS_input = 5
#numMAPPINGS_input = st.text_input("Type number of mappings and hit Enter", key="user_input_numMAPPINGS")
#st.button("Clear text", on_click=on_click)

model = SentenceTransformer("all-MiniLM-L6-v2")  # fastest
#model = SentenceTransformer("all-mpnet-base-v2")  # best performance
#model = SentenceTransformer("all-distilroberta-v1")
#model = SentenceTransformer("sentence-transformers/msmarco-bert-base-dot-v5")
#model = SentenceTransformer("clips/mfaq")

INTdesc_embedding = model.encode(INTdesc_input)

# Semantic search: cosine similarity between the internal description and all
# SBS long descriptions in the selected chapter.
#df_SBS = pd.read_csv("SBS_V2_Table.csv", index_col="SBS_Code", usecols=["Long_Description"])  # na_values=['NA']
#df_SBS = pd.read_csv("SBS_V2_Table.csv", usecols=["SBS_Code_Hyphenated", "Long_Description"])
from_line = 7727  # Imaging services chapter start, adjust as needed
to_line = 8239    # Imaging services chapter end, adjust as needed
nrows = to_line - from_line + 1
skiprows = list(range(1, from_line - 1))
df_SBS = pd.read_csv("SBS_V2_Table.csv", header=0, skip_blank_lines=False, skiprows=skiprows, nrows=nrows)
#st.write(df_SBS.head(5))

SBScorpus = df_SBS["Long_Description"].values.tolist()
SBScorpus_embeddings = model.encode(SBScorpus)

#my_model_results = pipeline("ner", model="checkpoint-92")
# util.semantic_search returns one list of hits per query, each hit a dict with
# "corpus_id" and "score", already sorted by descending cosine score, so no
# extra sorting is needed; keep the top hits for the single query.
HF_model_results = util.semantic_search(INTdesc_embedding, SBScorpus_embeddings)
HF_model_results_displayed = HF_model_results[0][:numMAPPINGS_input]
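
# Optional alternative (sketch, not wired in): cache the embedding model and the
# corpus embeddings across Streamlit reruns instead of recomputing them on every
# interaction. Assumes Streamlit >= 1.18, where st.cache_resource/st.cache_data
# replaced the deprecated st.cache used above; function names are illustrative.
#
#@st.cache_resource
#def load_model(name="all-MiniLM-L6-v2"):
#    return SentenceTransformer(name)
#
#@st.cache_data
#def embed_corpus(corpus):
#    return load_model().encode(corpus)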
model_id = "meta-llama/Llama-3.2-1B-Instruct"
pipe = pipeline(
    "text-generation",
    model=model_id,
    device_map="auto",
    token=HF_TOKEN,  # gated repo: authenticate with the HF_TOKEN read above
)  # torch_dtype=torch.bfloat16

col1, col2, col3 = st.columns([1, 1, 2.5])
col1.subheader("Score")
col2.subheader("SBS code")
col3.subheader("SBS description V2.0")

dictA = {"Score": [], "SBS Code": [], "SBS Description V2.0": []}

if INTdesc_input and createSBScodes_clicked:
    with st.container():
        # One row per retrieved hit; replaces five copy-pasted blocks that
        # indexed result[0] .. result[4] by hand.
        for hit in HF_model_results_displayed:
            score = "%.4f" % hit["score"]
            description = SBScorpus[hit["corpus_id"]]
            code = df_SBS.loc[df_SBS["Long_Description"] == description, "SBS_Code_Hyphenated"].values[0]
            col1.write(score)
            col2.write(code)
            col3.write(description)
            dictA["Score"].append(score)
            dictA["SBS Code"].append(code)
            dictA["SBS Description V2.0"].append(description)
    dfA = pd.DataFrame.from_dict(dictA)

    display_format = "ask REASONING MODEL: Which, if any, of the above Saudi Billing System descriptions corresponds best to " + INTdesc_input + "? "
    st.write(display_format)
    question = "Which, if any, of the below Saudi Billing System descriptions corresponds best to " + INTdesc_input + "? "
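    # Note: passing a list of {"role", "content"} dicts to a text-generation
    # pipeline applies the model's chat template (recent transformers releases);
    # outputs[0]["generated_text"] is then the full message list, so the
    # [-1]["content"] below is the assistant's reply.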
" shortlist = [SBScorpus[result[0]["corpus_id"]], SBScorpus[result[1]["corpus_id"]], SBScorpus[result[2]["corpus_id"]], SBScorpus[result[3]["corpus_id"]], SBScorpus[result[4]["corpus_id"]]] prompt = [question + " " + shortlist[0] + " " + shortlist[1] + " " + shortlist[2] + " " + shortlist[3] + " " + shortlist[4]] #st.write(prompt) messages = [ {"role": "system", "content": "You are a knowledgable AI assistant who always answers truthfully and precisely!"}, {"role": "user", "content": prompt}, ] outputs = pipe( messages, max_new_tokens=256, ) st.write(outputs[0]["generated_text"][-1]["content"]) bs, b1, b2, b3, bLast = st.columns([0.75, 1.5, 1.5, 1.5, 0.75]) with b1: #csvbutton = download_button(results, "results.csv", "📥 Download .csv") csvbutton = st.download_button(label="📥 Download .csv", data=convert_df(dfA), file_name= "results.csv", mime='text/csv', key='csv_b') with b2: #textbutton = download_button(results, "results.txt", "📥 Download .txt") textbutton = st.download_button(label="📥 Download .txt", data=convert_df(dfA), file_name= "results.text", mime='text/plain', key='text_b') with b3: #jsonbutton = download_button(results, "results.json", "📥 Download .json") jsonbutton = st.download_button(label="📥 Download .json", data=convert_json(dfA), file_name= "results.json", mime='application/json', key='json_b')