georad committed on
Commit
9993024
·
verified ·
1 Parent(s): 29245d2

Upload type_text_v10.py

Browse files
Files changed (1) hide show
  1. pages/type_text_v10.py +126 -0
pages/type_text_v10.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
import os
from io import StringIO

import pandas as pd
import streamlit as st
import torch
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline  # AutoTokenizer, AutoModelForCausalLM, AutoModelForTokenClassification
10
# Hugging Face access token taken from the environment (None when unset).
# The original statement discarded the return value, which made it a no-op;
# binding it keeps the lookup visible and reusable.
# NOTE(review): presumably needed for the gated Llama checkpoint loaded later;
# the Hugging Face libraries are documented to read HF_TOKEN from the
# environment themselves - confirm before passing it explicitly.
HF_TOKEN = os.getenv("HF_TOKEN")
11
+
12
def get_device_map() -> str:
    """Return ``'cuda'`` when torch reports an available CUDA device, else ``'cpu'``."""
    return 'cuda' if torch.cuda.is_available() else 'cpu'


# Resolved once at import time and passed to the text-generation pipeline.
device = get_device_map()  # 'cpu'
15
+
16
def on_click():
    """Button callback: blank the ``user_input`` entry in Streamlit session state.

    ``user_input`` is the key of the description text box, so this clears
    the text the user typed.
    """
    st.session_state.user_input = ""
18
+
19
def convert_df(df: pd.DataFrame) -> bytes:
    """Serialise *df* to CSV (no index column) and return it as UTF-8 bytes.

    The bytes are handed to ``st.download_button`` as the file payload.
    """
    return df.to_csv(index=False).encode('utf-8')
22
+
23
def convert_json(df: pd.DataFrame) -> str:
    """Return *df* as a JSON string keyed by row index (``orient="index"``).

    The pandas output is parsed and re-serialised with the standard ``json``
    module, so the returned text uses ``json.dumps`` formatting rather than
    pandas' own.
    """
    index_oriented = df.to_json(orient="index")
    return json.dumps(json.loads(index_oriented))
30
+
31
# Free-text description to map; keyed so the Reset callback can clear it.
INTdesc_input = st.text_input("Type internal description", key="user_input")

# Two side-by-side buttons: run the mapping / clear the text box.
createSBScodes, right_column = st.columns(2)
createSBScodes_clicked = createSBScodes.button("Map to SBS codes", key="user_createSBScodes")
right_column.button("Reset", on_click=on_click)

# Number of candidate SBS descriptions to show (fixed for now).
numMAPPINGS_input = 5
#numMAPPINGS_input = st.text_input("Type number of mappings and hit Enter", key="user_input_numMAPPINGS")
40
+
41
@st.cache_resource
def load_model():
    """Load the sentence-embedding model once and share it across reruns.

    Other checkpoints that were tried in place of the current one:
    'all-mpnet-base-v2' (best performance), 'all-distilroberta-v1',
    'sentence-transformers/msmarco-bert-base-dot-v5', 'clips/mfaq'.
    """
    return SentenceTransformer('all-MiniLM-L6-v2')  # fastest


model = load_model()
52
+
53
@st.cache_data
def load_sbs_table(path: str, from_line: int, to_line: int) -> pd.DataFrame:
    """Read the requested row window of the SBS code table.

    Cached so the CSV is parsed once instead of on every Streamlit rerun.
    The first file row is the header; ``to_line - from_line + 1`` rows are
    read after skipping rows ``1 .. from_line - 2``.
    """
    nrows = to_line - from_line + 1
    # NOTE(review): this skip window is kept exactly as it was; it is empty
    # for from_line <= 2 and may be off by one for larger values - confirm.
    skiprows = list(range(1, from_line - 1))
    return pd.read_csv(path, header=0, skip_blank_lines=False, skiprows=skiprows, nrows=nrows)


@st.cache_data
def embed_corpus(descriptions: list):
    """Encode the corpus descriptions with the shared sentence model.

    Cached on the description list: previously every description was
    re-encoded on each rerun (every keystroke or button click).
    """
    return model.encode(descriptions)


INTdesc_embedding = model.encode(INTdesc_input)

# Semantic search: cosine similarity between the query and every SBS description.
from_line = 0  # Imaging services chapter start, adjust as needed
to_line = 10080  # Imaging services chapter end, adjust as needed
df_SBS = load_sbs_table("SBS_V2_0/Code_Table.csv", from_line, to_line)

# Corpus position i corresponds to row i of df_SBS.
SBScorpus = df_SBS['Long_Description'].values.tolist()
SBScorpus_embeddings = embed_corpus(SBScorpus)

# One hit list per query, each already ordered by decreasing score.
HF_model_results = util.semantic_search(INTdesc_embedding, SBScorpus_embeddings)
# The former sorted(..., key=lambda x: x[1]) keyed on a hit dict: a no-op for a
# single query and a TypeError for several, so the search order is kept as is.
HF_model_results_sorted = HF_model_results
HF_model_results_displayed = HF_model_results_sorted[0:numMAPPINGS_input]
73
+
74
@st.cache_resource
def load_pipe():
    """Build the Llama text-generation pipeline once and reuse it across reruns.

    The model is placed on the module-level ``device`` ('cuda' or 'cpu').
    """
    return pipeline(
        "text-generation",
        model="meta-llama/Llama-3.2-1B-Instruct",
        device_map=device,
    )  # alternatives: device_map="auto", torch_dtype=torch.bfloat16


pipe = load_pipe()
81
+
82
# Result table shown to the user and offered for download.
RESULT_COLUMNS = ["Score", "SBS Code", "SBS Description V2.0"]
dfALL = pd.DataFrame(columns=RESULT_COLUMNS)

# Only run when the button was pressed AND there is real text to map
# (the text box yields "" rather than None when left empty).
# NOTE(review): indentation was lost in this copy of the file; everything
# below is nested under this guard because it depends on the search hits.
if createSBScodes_clicked and (INTdesc_input or "").strip():
    # A single query was searched, so the first hit list is the only one.
    hits = HF_model_results_displayed[0][:numMAPPINGS_input] if HF_model_results_displayed else []

    rows = []
    for hit in hits:
        # corpus_id is the position in SBScorpus, which was built row-by-row
        # from df_SBS, so a positional lookup is exact even when two codes
        # share the same description text.
        sbs_row = df_SBS.iloc[hit["corpus_id"]]
        rows.append({
            "Score": f"{hit['score']:.4f}",
            "SBS Code": sbs_row["SBS_Code_Hyphenated"],
            "SBS Description V2.0": sbs_row["Long_Description"],
        })

    if not rows:
        st.warning("No matching SBS descriptions found.")
    else:
        dfALL = pd.DataFrame(rows, columns=RESULT_COLUMNS)
        st.dataframe(data=dfALL, hide_index=True)

        # Ask the LLM to pick the best candidate among the labelled shortlist.
        question = "Which one, if any, of the below Saudi Billing System descriptions A, B, C, D, or E corresponds best to " + INTdesc_input + "? "
        shortlist = [row["SBS Description V2.0"] for row in rows]
        # Labels stop at E to match the wording of the question.
        options = " ".join(f"{label}: {text}" for label, text in zip("ABCDE", shortlist))
        prompt = question + " " + options
        st.write(prompt)

        messages = [
            {"role": "system", "content": "You are a knowledgable AI assistant who always answers truthfully and precisely!"},
            {"role": "user", "content": prompt},
        ]
        outputs = pipe(
            messages,
            max_new_tokens=256,
        )
        # The last chat message of the generated conversation is the model's reply.
        st.write(outputs[0]["generated_text"][-1]["content"])

        # Download buttons in the three middle columns; outer columns are spacers.
        bs, b1, b2, b3, bLast = st.columns([0.75, 1.5, 1.5, 1.5, 0.75])
        with b1:
            csvbutton = st.download_button(label="📥 Download .csv", data=convert_df(dfALL), file_name="results.csv", mime='text/csv', key='csv_b')
        with b2:
            # Same CSV payload served as plain text; file name now matches the ".txt" label.
            textbutton = st.download_button(label="📥 Download .txt", data=convert_df(dfALL), file_name="results.txt", mime='text/plain', key='text_b')
        with b3:
            jsonbutton = st.download_button(label="📥 Download .json", data=convert_json(dfALL), file_name="results.json", mime='application/json', key='json_b')