georad committed on
Commit
76922b0
·
verified ·
1 Parent(s): 3b28ab5

Delete type_text_v7.py

Files changed (1)
  1. type_text_v7.py +0 -150
type_text_v7.py DELETED
@@ -1,150 +0,0 @@
- import streamlit as st
- import pandas as pd
- from io import StringIO
- import json
- import torch
- from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM #AutoModelForTokenClassification
- from sentence_transformers import SentenceTransformer, util
- #import lmdeploy
- #import turbomind as tm
-
- import os
- os.getenv("HF_TOKEN")
-
- def get_device_map() -> str:
-     return 'cuda' if torch.cuda.is_available() else 'cpu'
- device = get_device_map() # 'cpu'
-
- def on_click():
-     st.session_state.user_input = ""
-
- #@st.cache
- def convert_df(df:pd.DataFrame):
-     return df.to_csv(index=False).encode('utf-8')
-
- #@st.cache
- def convert_json(df:pd.DataFrame):
-     result = df.to_json(orient="index")
-     parsed = json.loads(result)
-     json_string = json.dumps(parsed)
-     #st.json(json_string, expanded=True)
-     return json_string
-
- #st.title("📘SBS mapper")
-
- INTdesc_input = st.text_input("Type internal description and hit Enter", key="user_input")
-
- createSBScodes, right_column = st.columns(2)
- createSBScodes_clicked = createSBScodes.button("Map to SBS codes", key="user_createSBScodes")
- right_column.button("Reset", on_click=on_click)
-
- numMAPPINGS_input = 5
- #numMAPPINGS_input = st.text_input("Type number of mappings and hit Enter", key="user_input_numMAPPINGS")
- #st.button("Clear text", on_click=on_click)
-
- @st.cache_resource
- def load_model():
-     #st.markdown("Sentence Transformer")
-     model = SentenceTransformer('all-MiniLM-L6-v2') # fastest
-     #st.success("Loaded model!")
-     #st.write("Turning on evaluation mode...")
-     #model.eval()
-     #st.write("Here's the model:")
-     return model
- #model = SentenceTransformer('all-MiniLM-L6-v2') # fastest; loaded through load_model() below
- #model = SentenceTransformer('all-mpnet-base-v2') # best performance
- #model = SentenceTransformer('all-distilroberta-v1')
- #model = SentenceTransformer('sentence-transformers/msmarco-bert-base-dot-v5')
- #model = SentenceTransformer('clips/mfaq')
- model = load_model()
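- # st.cache_resource keeps a single SentenceTransformer instance in memory across Streamlit
- # reruns, so the embedding model is not re-initialised on every user interaction.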
-
- INTdesc_embedding = model.encode(INTdesc_input)
-
- # Semantic search: compute cosine similarity between the internal description and every SBS description
-
- #df_SBS = pd.read_csv("SBS_V2_Table.csv", index_col="SBS_Code", usecols=["Long_Description"]) # na_values=['NA']
- #df_SBS = pd.read_csv("SBS_V2_Table.csv", usecols=["SBS_Code_Hyphenated","Long_Description"])
- from_line = 0 # Imaging services chapter start, adjust as needed
- to_line = 10080 # Imaging services chapter end, adjust as needed
- nrows = to_line - from_line + 1
- skiprows = list(range(1, from_line - 1))
- df_SBS = pd.read_csv("SBS_V2_Table.csv", header=0, skip_blank_lines=False, skiprows=skiprows, nrows=nrows)
- #st.write(df_SBS.head(5))
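- # SBS_V2_Table.csv is expected to contain at least the "SBS_Code_Hyphenated" and
- # "Long_Description" columns used below; from_line/to_line pick the row window to load.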
-
- SBScorpus = df_SBS['Long_Description'].values.tolist()
- SBScorpus_embeddings = model.encode(SBScorpus)
-
- #my_model_results = pipeline("ner", model="checkpoint-92")
- HF_model_results = util.semantic_search(INTdesc_embedding, SBScorpus_embeddings)
- HF_model_results_sorted = sorted(HF_model_results, key=lambda x: x[1], reverse=True)
- HF_model_results_displayed = HF_model_results_sorted[0:numMAPPINGS_input]
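- # util.semantic_search returns one hit list per query embedding; each hit is a dict with
- # "corpus_id" and "score", already sorted by score. With a single query there is only one
- # hit list, so result[0]..result[4] below are its top-5 matches.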
-
- model_id = "meta-llama/Llama-3.2-1B-Instruct"
- pipe = pipeline("text-generation", model=model_id, device_map=device,) # device_map="auto", torch_dtype=torch.bfloat16
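- # meta-llama/Llama-3.2-1B-Instruct is a gated checkpoint, so the HF_TOKEN read above must
- # grant access; this pipeline later picks the best match among the top-5 SBS descriptions.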
-
-
- col1, col2, col3 = st.columns([1,1,2.5])
- col1.subheader("Score")
- col2.subheader("SBS code")
- col3.subheader("SBS description V2.0")
-
- dictA = {"Score": [], "SBS Code": [], "SBS Description V2.0": []}
-
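- # dictA accumulates one row (score, SBS code, SBS description) per displayed mapping and is
- # later converted to the dfA DataFrame that feeds the download buttons at the bottom of the page.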
- if INTdesc_input and createSBScodes_clicked:
-     #for i, result in enumerate(HF_model_results_displayed):
-     for result in HF_model_results_displayed:
-         with st.container():
-             col1.write("%.4f" % result[0]["score"])
-             col2.write(df_SBS.loc[df_SBS["Long_Description"] == SBScorpus[result[0]["corpus_id"]],"SBS_Code_Hyphenated"].values[0])
-             col3.write(SBScorpus[result[0]["corpus_id"]])
-             dictA["Score"].append("%.4f" % result[0]["score"]), dictA["SBS Code"].append(df_SBS.loc[df_SBS["Long_Description"] == SBScorpus[result[0]["corpus_id"]],"SBS_Code_Hyphenated"].values[0]), dictA["SBS Description V2.0"].append(SBScorpus[result[0]["corpus_id"]])
-
-             col1.write("%.4f" % result[1]["score"])
-             col2.write(df_SBS.loc[df_SBS["Long_Description"] == SBScorpus[result[1]["corpus_id"]],"SBS_Code_Hyphenated"].values[0])
-             col3.write(SBScorpus[result[1]["corpus_id"]])
-             dictA["Score"].append("%.4f" % result[1]["score"]), dictA["SBS Code"].append(df_SBS.loc[df_SBS["Long_Description"] == SBScorpus[result[1]["corpus_id"]],"SBS_Code_Hyphenated"].values[0]), dictA["SBS Description V2.0"].append(SBScorpus[result[1]["corpus_id"]])
-
-             col1.write("%.4f" % result[2]["score"])
-             col2.write(df_SBS.loc[df_SBS["Long_Description"] == SBScorpus[result[2]["corpus_id"]],"SBS_Code_Hyphenated"].values[0])
-             col3.write(SBScorpus[result[2]["corpus_id"]])
-             dictA["Score"].append("%.4f" % result[2]["score"]), dictA["SBS Code"].append(df_SBS.loc[df_SBS["Long_Description"] == SBScorpus[result[2]["corpus_id"]],"SBS_Code_Hyphenated"].values[0]), dictA["SBS Description V2.0"].append(SBScorpus[result[2]["corpus_id"]])
-
-             col1.write("%.4f" % result[3]["score"])
-             col2.write(df_SBS.loc[df_SBS["Long_Description"] == SBScorpus[result[3]["corpus_id"]],"SBS_Code_Hyphenated"].values[0])
-             col3.write(SBScorpus[result[3]["corpus_id"]])
-             dictA["Score"].append("%.4f" % result[3]["score"]), dictA["SBS Code"].append(df_SBS.loc[df_SBS["Long_Description"] == SBScorpus[result[3]["corpus_id"]],"SBS_Code_Hyphenated"].values[0]), dictA["SBS Description V2.0"].append(SBScorpus[result[3]["corpus_id"]])
-
-             col1.write("%.4f" % result[4]["score"])
-             col2.write(df_SBS.loc[df_SBS["Long_Description"] == SBScorpus[result[4]["corpus_id"]],"SBS_Code_Hyphenated"].values[0])
-             col3.write(SBScorpus[result[4]["corpus_id"]])
-             dictA["Score"].append("%.4f" % result[4]["score"]), dictA["SBS Code"].append(df_SBS.loc[df_SBS["Long_Description"] == SBScorpus[result[4]["corpus_id"]],"SBS_Code_Hyphenated"].values[0]), dictA["SBS Description V2.0"].append(SBScorpus[result[4]["corpus_id"]])
-
-     dfA = pd.DataFrame.from_dict(dictA)
-
-     display_format = "ask REASONING MODEL: Which, if any, of the above Saudi Billing System descriptions corresponds best to " + INTdesc_input + "? "
-     st.write(display_format)
-     question = "Which, if any, of the below Saudi Billing System descriptions corresponds best to " + INTdesc_input + "? "
-     shortlist = [SBScorpus[result[0]["corpus_id"]], SBScorpus[result[1]["corpus_id"]], SBScorpus[result[2]["corpus_id"]], SBScorpus[result[3]["corpus_id"]], SBScorpus[result[4]["corpus_id"]]]
-     prompt = question + " " + shortlist[0] + " " + shortlist[1] + " " + shortlist[2] + " " + shortlist[3] + " " + shortlist[4]
-     #st.write(prompt)
-
-     messages = [
-         {"role": "system", "content": "You are a knowledgeable AI assistant who always answers truthfully and precisely!"},
-         {"role": "user", "content": prompt},
-     ]
-     outputs = pipe(
-         messages,
-         max_new_tokens=256,
-     )
-     st.write(outputs[0]["generated_text"][-1]["content"])
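-     # With chat-style input, the text-generation pipeline returns the full message list in
-     # outputs[0]["generated_text"]; the last entry is the assistant reply rendered above.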
-
-     bs, b1, b2, b3, bLast = st.columns([0.75, 1.5, 1.5, 1.5, 0.75])
-     with b1:
-         #csvbutton = download_button(results, "results.csv", "📥 Download .csv")
-         csvbutton = st.download_button(label="📥 Download .csv", data=convert_df(dfA), file_name="results.csv", mime='text/csv', key='csv_b')
-     with b2:
-         #textbutton = download_button(results, "results.txt", "📥 Download .txt")
-         textbutton = st.download_button(label="📥 Download .txt", data=convert_df(dfA), file_name="results.txt", mime='text/plain', key='text_b')
-     with b3:
-         #jsonbutton = download_button(results, "results.json", "📥 Download .json")
-         jsonbutton = st.download_button(label="📥 Download .json", data=convert_json(dfA), file_name="results.json", mime='application/json', key='json_b')
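-     # All three buttons export the same dfA mapping table; the .txt download reuses the CSV
-     # bytes from convert_df, while the .json download uses the index-oriented convert_json output.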