georad committed on
Commit
11ac955
·
verified ·
1 Parent(s): c87168b

Delete pages/type_text.py

Browse files
Files changed (1) hide show
  1. pages/type_text.py +0 -145
pages/type_text.py DELETED
@@ -1,145 +0,0 @@
1
- import streamlit as st
2
- import pandas as pd
3
- from io import StringIO
4
- import json
5
- import torch
6
- from transformers import pipeline # AutoTokenizer, AutoModelForCausalLM, AutoModelForTokenClassification
7
- from sentence_transformers import SentenceTransformer, util
8
-
9
- import os
10
# Keep the Hugging Face access token read from the environment. The bare
# os.getenv() call discarded its result, so it had no effect at all.
# NOTE(review): nothing visible in this file passes the token on explicitly;
# presumably the Hugging Face libraries read HF_TOKEN themselves — confirm.
HF_TOKEN = os.getenv("HF_TOKEN")
11
-
12
- #for k, v in st.session_state.items():
13
- # st.session_state[k] = v
14
-
15
- #selected_chapters_floatlist = list(st.session_state.items())[0][1]
16
- #selected_chapters_intlist = [int(i) for i in selected_chapters_floatlist]
17
- #st.write("SELECTED CHAPTERS: ", selected_chapters_intlist)
18
# The row selection is read positionally: it is the value of the SECOND entry
# in st.session_state.
# NOTE(review): this depends on the insertion order of session-state keys set
# elsewhere in the app; a named key would be safer — confirm the key name.
_session_entries = list(st.session_state.items())
# Fall back to an empty selection instead of raising IndexError when the page
# is opened before session state holds two entries.
selected_rows_list = _session_entries[1][1] if len(_session_entries) > 1 else []
st.write("SELECTED ROWS: ", selected_rows_list)
20
-
21
def get_device_map() -> str:
    """Return ``'cuda'`` when torch reports an available CUDA device, else ``'cpu'``."""
    if torch.cuda.is_available():
        return 'cuda'
    return 'cpu'


# Device string resolved once at script start.
device = get_device_map()
24
-
25
def on_click():
    """Button callback: reset the ``user_input`` session-state entry to an empty string."""
    st.session_state["user_input"] = ""
27
-
28
- #@st.cache
29
def convert_df(df: pd.DataFrame):
    """Serialise *df* to CSV without the index column and return it as UTF-8 bytes."""
    csv_text = df.to_csv(index=False)
    return csv_text.encode('utf-8')
31
-
32
- #@st.cache
33
def convert_json(df: pd.DataFrame):
    """Return *df* as a JSON string keyed by row index.

    The frame is serialised with ``orient="index"``, parsed, and re-serialised
    with ``json.dumps``, so the returned text uses the standard library's
    default separators and escaping.
    """
    by_index = json.loads(df.to_json(orient="index"))
    return json.dumps(by_index)
39
-
40
# Free-text description typed by the user. The widget is bound to the
# "user_input" session key, which the Reset callback clears.
INTdesc_input = st.text_input(label="Type internal description", key="user_input")

# Two side-by-side columns: the mapping trigger on the left, Reset on the right.
createSBScodes, right_column = st.columns(2)
createSBScodes_clicked = createSBScodes.button(
    label="Map to SBS codes",
    key="user_createSBScodes",
)
right_column.button(label="Reset", on_click=on_click)

# Number of candidate mappings to show.
numMAPPINGS_input = 5
47
- #numMAPPINGS_input = st.text_input("Type number of mappings", key="user_input_numMAPPINGS")
48
- #st.button("Clear text", on_click=on_click)
49
-
50
@st.cache_resource
def load_model():
    """Build the sentence-embedding model used for the semantic search.

    Decorated with ``st.cache_resource``; the checkpoint in use is the one the
    original author marked as "fastest". Other checkpoints previously tried:
    'all-mpnet-base-v2' (marked "best performance"), 'all-distilroberta-v1',
    'sentence-transformers/msmarco-bert-base-dot-v5' and 'clips/mfaq'.
    """
    return SentenceTransformer('all-MiniLM-L6-v2')


model = load_model()

# Embedding of whatever is currently typed into the description box.
INTdesc_embedding = model.encode(INTdesc_input)
61
-
62
- # Semantic search, Compute cosine similarity between all pairs of SBS descriptions
63
-
64
- #df_allchaps = pd.read_csv("SBS_V2_0/Chapter_Index_Rows.csv", usecols=["Chapter", "from_row_index", "to_row_index"])
65
- #st.dataframe(df_allchaps)
66
- #df_selectedchaps = df.loc[df['City'] == 'Chicago']
67
-
68
- #dict_allchaps = df_allchaps.to_dict(orient='index')
69
- #st.write("ALL CHAPTERS: ", dict_allchaps)
70
- #for chapter in dict_allchaps.get("Chapter"):
71
- # st.write(chapter)
72
-
73
# Fall back to a default selection when nothing was selected upstream.
# NOTE(review): indentation was lost in the source; the second st.write is
# assumed to belong to the fallback branch — confirm.
# NOTE(review): [0, 10080] keeps only the header and line 10080; if the whole
# table (a range) was intended, this default needs changing — confirm.
if len(selected_rows_list) == 0:
    st.warning("Please select at least one chapter")
    selected_rows_list = [0, 10080]
    st.write("SELECTED ROWS: ", selected_rows_list)

# read_csv calls the skiprows callable once per file line, so membership is
# tested against a set built once rather than a list. Line 0 (the header) is
# always kept: without it the column names used later would not exist.
_rows_to_keep = {0, *selected_rows_list}
df_SBS = pd.read_csv(
    "SBS_V2_0/Code_Table.csv",
    header=0,
    skip_blank_lines=False,
    skiprows=lambda x: x not in _rows_to_keep,
)
78
-
79
- #df_SBS = pd.read_csv("SBS_V2_0/Code_Table.csv", index_col="SBS_Code", usecols=["Long_Description"]) # na_values=['NA']
80
- #df_SBS = pd.read_csv("SBS_V2_0/Code_Table.csv", usecols=["SBS_Code_Hyphenated","Long_Description"])
81
- #from_row_index = 7725 # Imaging services chapter start, adjust as needed
82
- #to_row_index = 8239 # Imaging services chapter end, adjust as needed
83
- #nrows = to_row_index - from_row_index + 1
84
- #skiprows = list(range(1,from_row_index - 1))
85
- #df_SBS = pd.read_csv("SBS_V2_0/Code_Table.csv", header=0, skip_blank_lines=False, skiprows=skiprows, nrows=nrows)
86
- #st.write(df_SBS.head(5))
87
-
88
# Search corpus: the long descriptions of the loaded SBS rows, in row order,
# together with their sentence embeddings.
_long_descriptions = df_SBS['Long_Description']
SBScorpus = _long_descriptions.values.tolist()
SBScorpus_embeddings = model.encode(SBScorpus)
90
-
91
- #my_model_results = pipeline("ner", model= "checkpoint-92")
92
# semantic_search returns one list of hits per query, each hit a dict with
# "corpus_id" and "score", already ordered by decreasing score. Requesting
# top_k directly makes numMAPPINGS_input the actual limit.
HF_model_results = util.semantic_search(
    INTdesc_embedding,
    SBScorpus_embeddings,
    top_k=numMAPPINGS_input,
)
# The previous sorted()/slice pair operated on the outer per-query list, not
# on the hits, so it never ordered or limited anything. The names are kept
# and the outer shape (one hit list per query) is unchanged.
HF_model_results_sorted = HF_model_results
HF_model_results_displayed = [
    hits[:numMAPPINGS_input] for hits in HF_model_results_sorted
]
95
-
96
@st.cache_resource
def load_pipe():
    """Build the text-generation pipeline used to pick the best candidate.

    Decorated with ``st.cache_resource``. The model is placed via
    ``device_map=device``. "Qwen/Qwen2-1.5B-Instruct" was previously tried as
    an alternative checkpoint.
    """
    return pipeline(
        "text-generation",
        model="meta-llama/Llama-3.2-1B-Instruct",
        device_map=device,
    )


pipe = load_pipe()
102
-
103
# Column headers of the result table, in display order.
RESULT_COLUMNS = ("Score", "SBS Code", "SBS Description V2.0")


def _build_prompt(description: str, shortlist: list) -> str:
    """Return the multiple-choice question put to the language model.

    Each shortlisted description gets a letter (A, B, C, ...). For five
    entries the text matches the previous hard-coded "A, B, C, D, or E"
    wording exactly; shorter or longer shortlists adapt the letter list.
    """
    labels = [chr(ord("A") + position) for position in range(len(shortlist))]
    if len(labels) > 2:
        listed = ", ".join(labels[:-1]) + ", or " + labels[-1]
    else:
        listed = " or ".join(labels)
    question = (
        "Which one, if any, of the following Saudi Billing System descriptions "
        + listed + " corresponds best to " + description + "? "
    )
    options = " ".join(
        f"{label}: {text}" for label, text in zip(labels, shortlist)
    )
    return question + " " + options


# Empty template and empty result frame, so dfALL exists even before a search.
dictA = {column: [] for column in RESULT_COLUMNS}
dfALL = pd.DataFrame.from_dict(dictA)

# NOTE(review): indentation was lost in the source; everything from the table
# to the model answer is assumed to run only after the button click — confirm.
if createSBScodes_clicked and not (INTdesc_input or "").strip():
    # A text input yields "" rather than None, so blank input is checked here.
    st.warning("Please type an internal description before mapping")
elif createSBScodes_clicked:
    # One query was embedded, so only the first hit list is relevant.
    hits = []
    if len(HF_model_results_displayed) > 0:
        hits = list(HF_model_results_displayed[0])[:numMAPPINGS_input]

    # corpus_id is the position in SBScorpus, which was built from df_SBS in
    # row order; a positional lookup stays correct for duplicate descriptions.
    rows = [
        {
            "Score": "%.4f" % hit["score"],
            "SBS Code": df_SBS["SBS_Code_Hyphenated"].iloc[hit["corpus_id"]],
            "SBS Description V2.0": SBScorpus[hit["corpus_id"]],
        }
        for hit in hits
    ]
    # Build the frame in one step rather than concatenating onto an empty one.
    dfALL = pd.DataFrame(rows, columns=list(RESULT_COLUMNS))
    st.dataframe(data=dfALL, hide_index=True)

    if rows:
        shortlist = [row["SBS Description V2.0"] for row in rows]
        prompt = _build_prompt(INTdesc_input, shortlist)
        st.write(prompt)

        messages = [
            {"role": "system", "content": "You are a knowledgable AI assistant who always answers truthfully and precisely!"},
            {"role": "user", "content": prompt},
        ]
        outputs = pipe(
            messages,
            max_new_tokens=256,
        )
        # The last message of the generated conversation is the model's reply.
        st.write(outputs[0]["generated_text"][-1]["content"])
    else:
        st.warning("No SBS descriptions were found for the selected rows")
135
-
136
# Download row: three buttons in the middle columns, with narrower spacer
# columns (bs, bLast) on either side.
# NOTE(review): indentation was lost in the source; this row is written at top
# level, so the buttons also render (with an empty table) before any search.
# If they were nested under the click handler originally, re-indent — confirm.
bs, b1, b2, b3, bLast = st.columns([0.75, 1.5, 1.5, 1.5, 0.75])
with b1:
    csvbutton = st.download_button(
        label="📥 Download .csv",
        data=convert_df(dfALL),
        file_name="results.csv",
        mime='text/csv',
        key='csv_b',
    )
with b2:
    # The plain-text download carries the same CSV bytes; the file is now
    # named with the ".txt" extension that the label promises.
    textbutton = st.download_button(
        label="📥 Download .txt",
        data=convert_df(dfALL),
        file_name="results.txt",
        mime='text/plain',
        key='text_b',
    )
with b3:
    jsonbutton = st.download_button(
        label="📥 Download .json",
        data=convert_json(dfALL),
        file_name="results.json",
        mime='application/json',
        key='json_b',
    )