georad committed on
Commit
56a715e
·
verified ·
1 Parent(s): 53b9fb4

Delete pages/demo_type_text.py

Browse files
Files changed (1) hide show
  1. pages/demo_type_text.py +0 -205
pages/demo_type_text.py DELETED
@@ -1,205 +0,0 @@
1
- import streamlit as st
2
- import pandas as pd
3
- from io import StringIO
4
- import json
5
- import torch
6
- from transformers import pipeline # AutoTokenizer, AutoModelForCausalLM, AutoModelForTokenClassification
7
- from sentence_transformers import SentenceTransformer, util
8
-
9
- import os
10
- os.getenv("HF_TOKEN")
11
-
12
- #for k, v in st.session_state.items():
13
- # st.session_state[k] = v
14
-
15
- #st.title("📘Map internal description to SBS codes V2.0")
16
- #st.subheader("Select specific Chapter for quicker results")
17
- #df_chapters = pd.read_csv("SBS_V2_0/Chapter_Index_Rows.csv")
18
-
19
- #startrowindex_list = df_chapters["from_row_index"].tolist()
20
- #endrowindex_list = df_chapters["to_row_index"].tolist()
21
- #allchapters_rows_list = []
22
- #for s, e in zip(startrowindex_list, endrowindex_list):
23
- # eachchapter_rows_list = list(range(s,e))
24
- # allchapters_rows_list.append(eachchapter_rows_list)
25
- #f_chapters['range_of_rows'] = allchapters_rows_list
26
-
27
def dataframe_with_selections(df_chapters: pd.DataFrame, init_value: bool = False) -> pd.DataFrame:
    """Show ``df_chapters`` with a checkbox column and return the ticked rows.

    A temporary boolean "Select" column is inserted at position 0 of a copy
    of the input, the copy is rendered with ``st.data_editor`` (all original
    columns disabled, so only the checkbox can be edited), and the rows the
    user ticked are returned with the temporary column removed again.

    Args:
        df_chapters: Table to display; it is copied, not modified.
        init_value: Initial state of every checkbox.

    Returns:
        The rows whose "Select" checkbox is ticked, without that column.
    """
    selectable = df_chapters.copy()
    selectable.insert(0, "Select", init_value)

    # Only the checkbox column is editable; the data columns stay read-only.
    checkbox_config = {"Select": st.column_config.CheckboxColumn(required=True)}
    edited = st.data_editor(
        selectable,
        hide_index=True,
        column_config=checkbox_config,
        disabled=df_chapters.columns,
    )

    # Boolean-mask on the helper column, then discard it.
    ticked = edited[edited["Select"]]
    return ticked.drop(columns="Select")
42
-
43
- #if "selected_chapters" not in st.session_state:
44
- # st.session_state['selected_chapters'] = []
45
- # st.session_state['selected_rows'] = []
46
- #selected_chapters_list = st.session_state.selected_chapters
47
- #if "selected_rows" not in st.session_state:
48
- # st.session_state['selected_rows'] = []
49
- #selected_rows_list = st.session_state.selected_rows
50
-
51
- #selected_chapters = dataframe_with_selections(df_chapters)
52
- #st.write("Your selection:")
53
- #st.write(selected_chapters)
54
- #selected_rows = dataframe_with_selections(df_chapters)
55
- #st.write("Your selection:")
56
- #st.write(selected_rows)
57
-
58
- #selected_chapters_list = selected_chapters.iloc[:,0].tolist()
59
- #st.write("SELECTED CHAPTERS: ", selected_chapters_list)
60
- #selected_rows_list = selected_chapters.iloc[:,6].tolist()
61
- #st.write("SELECTED ROWS: ", selected_rows_list)
62
-
63
- #if selected_chapters is not None:
64
- # st.session_state.selected_chapters = selected_chapters_list
65
- # st.session_state.selected_rows = selected_rows_list
66
-
67
-
68
-
69
-
70
- #selected_chapters_floatlist = list(st.session_state.items())[0][1]
71
- #selected_chapters_intlist = [int(i) for i in selected_chapters_floatlist]
72
- #st.write("SELECTED CHAPTERS: ", selected_chapters_intlist)
73
- #for item in st.session_state.items():
74
- # st.write("IIIIIIIII: ", item)
75
- #selected_rows_list = list(st.session_state.items())[1][1]
76
- #st.write("SELECTED ROWS: ", selected_rows_list)
77
-
78
def get_device_map() -> str:
    """Return 'cuda' when torch reports a usable CUDA device, else 'cpu'."""
    if torch.cuda.is_available():
        return 'cuda'
    return 'cpu'


# Device string handed to the text-generation pipeline further down.
device = get_device_map()  # 'cpu'
81
-
82
def on_click():
    """Button callback: blank the session-state entry keyed "user_input"."""
    st.session_state["user_input"] = ""
84
-
85
- #@st.cache
86
def convert_df(df: pd.DataFrame):
    """Serialize ``df`` to CSV (index column omitted) as UTF-8 bytes."""
    csv_text = df.to_csv(index=False)
    return bytes(csv_text, encoding='utf-8')
88
-
89
- #@st.cache
90
def convert_json(df: pd.DataFrame):
    """Return ``df`` as a JSON string keyed by row index.

    The frame is serialized with ``orient="index"``, parsed, and dumped again
    with ``json.dumps``, so the returned text uses the json module's default
    formatting.
    """
    rows_by_index = json.loads(df.to_json(orient="index"))
    return json.dumps(rows_by_index)
96
-
97
# Number of candidate SBS mappings to show for a description.
numMAPPINGS_input = 5

# Free-text description to map; stored in session state under "user_input",
# the same key the Reset callback clears.
INTdesc_input = st.text_input(label="Type internal description", key="user_input")

# Two equal-width columns: the mapping trigger on the left, Reset on the right.
createSBScodes, right_column = st.columns(spec=2)
createSBScodes_clicked = createSBScodes.button(label="Map to SBS codes", key="user_createSBScodes")
right_column.button(label="Reset", on_click=on_click)
104
- #numMAPPINGS_input = st.text_input("Type number of mappings", key="user_input_numMAPPINGS")
105
- #st.button("Clear text", on_click=on_click)
106
-
107
@st.cache_resource
def load_model():
    """Build the sentence-embedding model used for semantic search.

    Decorated with ``st.cache_resource`` so the model object is reused
    instead of being rebuilt on each call.

    Checkpoints tried by the original author:
      * 'all-MiniLM-L6-v2' (fastest, currently used)
      * 'all-mpnet-base-v2' (best performance)
      * 'all-distilroberta-v1'
      * 'sentence-transformers/msmarco-bert-base-dot-v5'
      * 'clips/mfaq'
    """
    return SentenceTransformer('all-MiniLM-L6-v2')


model = load_model()

# Embedding of the description the user typed.
INTdesc_embedding = model.encode(INTdesc_input)
118
-
119
- # Semantic search, Compute cosine similarity between all pairs of SBS descriptions
120
-
121
- #df_allchaps = pd.read_csv("SBS_V2_0/Chapter_Index_Rows.csv", usecols=["Chapter", "from_row_index", "to_row_index"])
122
- #st.dataframe(df_allchaps)
123
- #df_selectedchaps = df.loc[df['City'] == 'Chicago']
124
-
125
- #dict_allchaps = df_allchaps.to_dict(orient='index')
126
- #st.write("ALL CHAPTERS: ", dict_allchaps)
127
- #for chapter in dict_allchaps.get("Chapter"):
128
- # st.write(chapter)
129
-
130
- selected_rows_list = []
131
-
132
- #if len(selected_rows_list) == 0:
133
- # st.warning("Please select at least one chapter")
134
- # selected_rows_list = [0, 10080]
135
- #st.write("SELECTED ROWS: ", selected_rows_list)
136
- #df_SBS = pd.read_csv("SBS_V2_0/Code_Table.csv", header=0, skip_blank_lines=False, skiprows = lambda x: x not in selected_rows_list)
137
-
138
- #df_SBS = pd.read_csv("SBS_V2_0/Code_Table.csv", index_col="SBS_Code", usecols=["Long_Description"]) # na_values=['NA']
139
- #df_SBS = pd.read_csv("SBS_V2_0/Code_Table.csv", usecols=["SBS_Code_Hyphenated","Long_Description"])
140
# Window of the SBS code table to search.
from_row_index = 0  # Imaging services chapter start, adjust as needed
to_row_index = 10080  # Imaging services chapter end, adjust as needed
nrows = to_row_index - from_row_index + 1
# NOTE(review): for a non-zero from_row_index this skips file lines
# 1 .. from_row_index - 2, which looks off by one or two depending on whether
# the index counts data rows or file lines. It is an empty range (no effect)
# for the current value 0 -- confirm the convention before changing the window.
skiprows = list(range(1, from_row_index - 1))


@st.cache_data
def _load_sbs_table(skiprows: list, nrows: int) -> pd.DataFrame:
    """Read the requested window of the SBS code table (cached per window).

    Streamlit re-executes the whole script on every interaction; caching keeps
    the CSV from being re-parsed each time. The cache is keyed on the
    arguments only, so clear it if the CSV file itself changes.
    """
    return pd.read_csv(
        "SBS_V2_0/Code_Table.csv",
        header=0,
        skip_blank_lines=False,
        skiprows=skiprows,
        nrows=nrows,
    )


@st.cache_data
def _embed_corpus(corpus: list):
    """Encode the SBS descriptions once per distinct corpus.

    Without caching, every rerun re-encoded the full corpus even though it
    does not depend on the user's input.
    """
    return model.encode(corpus)


df_SBS = _load_sbs_table(skiprows, nrows)
st.write(df_SBS.head(5))

# Corpus order matches df_SBS row order, so a hit's corpus_id is also the
# positional row index in df_SBS.
SBScorpus = df_SBS['Long_Description'].values.tolist()
SBScorpus_embeddings = _embed_corpus(SBScorpus)

# semantic_search returns one list of hits per query, each already ordered by
# decreasing score. The previous extra sorted(..., key=lambda x: x[1]) sorted
# the *outer* per-query list using a hit dict as key: a no-op for a single
# query and a TypeError for several, so it is dropped. The variable names and
# shapes below are kept so downstream code is unaffected.
HF_model_results = util.semantic_search(INTdesc_embedding, SBScorpus_embeddings)
HF_model_results_sorted = HF_model_results
HF_model_results_displayed = HF_model_results_sorted[0:numMAPPINGS_input]
154
-
155
@st.cache_resource
def load_pipe():
    """Build the text-generation pipeline used to pick the best candidate.

    Decorated with ``st.cache_resource`` so the pipeline object is reused
    instead of being rebuilt on each call. The model is placed on the
    module-level ``device``. An alternative checkpoint tried by the original
    author was "Qwen/Qwen2-1.5B-Instruct".
    """
    return pipeline(
        "text-generation",
        model="meta-llama/Llama-3.2-1B-Instruct",
        device_map=device,
    )


pipe = load_pipe()
161
-
162
# Column layout of the results table; dfALL starts out empty with these columns.
dictA = {"Score": [], "SBS Code": [], "SBS Description V2.0": []}
dfALL = pd.DataFrame.from_dict(dictA)

# NOTE(review): the indentation of this section was reconstructed from a
# whitespace-stripped diff; everything from the results table to the download
# buttons is assumed to live under the button-click condition -- confirm.
query_text = (INTdesc_input or "").strip()
if createSBScodes_clicked and not query_text:
    # st.text_input yields an empty string (not None) when nothing was typed,
    # so the old `is not None` test let empty queries through.
    st.warning("Please type an internal description first.")
elif createSBScodes_clicked:
    # One query was submitted, so only the first per-query hit list is used;
    # honour numMAPPINGS_input instead of five hard-coded positions.
    hits = HF_model_results_displayed[0][:numMAPPINGS_input] if HF_model_results_displayed else []

    # corpus_id is the position in SBScorpus, which was built from df_SBS in
    # row order, so the code is fetched positionally. Matching on description
    # text returned the first row's code whenever descriptions were duplicated.
    rows = [
        {
            "Score": "%.4f" % hit["score"],
            "SBS Code": df_SBS["SBS_Code_Hyphenated"].iloc[hit["corpus_id"]],
            "SBS Description V2.0": SBScorpus[hit["corpus_id"]],
        }
        for hit in hits
    ]
    dfALL = pd.DataFrame(rows, columns=list(dictA))
    st.dataframe(data=dfALL, hide_index=True)

    shortlist = [row["SBS Description V2.0"] for row in rows]
    if shortlist:
        # Label candidates A, B, C, ...; with five hits the prompt text is
        # identical to the previous hard-coded "A, B, C, D, or E" wording.
        labels = [chr(ord("A") + position) for position in range(len(shortlist))]
        if len(labels) > 1:
            label_text = ", ".join(labels[:-1]) + ", or " + labels[-1]
        else:
            label_text = labels[0]
        question = (
            "Which one, if any, of the following Saudi Billing System descriptions "
            + label_text
            + " corresponds best to "
            + INTdesc_input
            + "? "
        )
        options = " ".join(label + ": " + text for label, text in zip(labels, shortlist))
        prompt = question + " " + options
        st.write(prompt)

        messages = [
            {"role": "system", "content": "You are a knowledgable AI assistant who always answers truthfully and precisely!"},
            {"role": "user", "content": prompt},
        ]
        outputs = pipe(
            messages,
            max_new_tokens=256,
        )
        # The pipeline returns the whole chat; the last message is the reply.
        st.write(outputs[0]["generated_text"][-1]["content"])

    # Outer columns are spacers; the three inner ones hold the download buttons.
    bs, b1, b2, b3, bLast = st.columns([0.75, 1.5, 1.5, 1.5, 0.75])
    with b1:
        csvbutton = st.download_button(label="📥 Download .csv", data=convert_df(dfALL), file_name="results.csv", mime='text/csv', key='csv_b')
    with b2:
        # File name now matches the ".txt" label (was "results.text").
        textbutton = st.download_button(label="📥 Download .txt", data=convert_df(dfALL), file_name="results.txt", mime='text/plain', key='text_b')
    with b3:
        jsonbutton = st.download_button(label="📥 Download .json", data=convert_json(dfALL), file_name="results.json", mime='application/json', key='json_b')
205
-