georad committed on
Commit
e93f60e
·
verified ·
1 Parent(s): 9993024

Delete pages/type_text_v11.py

Browse files
Files changed (1) hide show
  1. pages/type_text_v11.py +0 -133
pages/type_text_v11.py DELETED
@@ -1,133 +0,0 @@
1
- import streamlit as st
2
- import pandas as pd
3
- from io import StringIO
4
- import json
5
- import torch
6
- from transformers import pipeline # AutoTokenizer, AutoModelForCausalLM, AutoModelForTokenClassification
7
- from sentence_transformers import SentenceTransformer, util
8
- #from "/home/user/app/pages/chapter_index.py" import selected_chapters_list
9
-
10
- import os
11
# The return value is discarded, so this call has no effect on the script.
# NOTE(review): presumably a leftover check - confirm whether the token has to
# be read explicitly anywhere on this page.
os.getenv("HF_TOKEN")

# Keys owned by the button widgets created further down this page. Streamlit
# does not allow values for button-type widgets to be set through the Session
# State API, so these keys are left out of the copy-back loop below.
_BUTTON_KEYS = {"user_createSBScodes", "csv_b", "text_b", "json_b"}

# Write every stored value back to itself (presumably so that values set on
# other pages are kept alive when this page runs - TODO confirm). A snapshot
# of the items is taken first so the loop never iterates a changing mapping.
for k, v in list(st.session_state.items()):
    if k not in _BUTTON_KEYS:
        st.session_state[k] = v

# The chapter selection is taken from the FIRST session-state entry.
# NOTE(review): this relies on entry order; presumably the chapter page stores
# its selection before anything else. Reading it by key would be safer.
_state_items = list(st.session_state.items())
# An empty session state (page opened before any selection) yields no chapters
# instead of an IndexError.
selected_chapters_floatlist = _state_items[0][1] if _state_items else []
selected_chapters_list = [int(i) for i in selected_chapters_floatlist]
17
-
18
def get_device_map() -> str:
    """Return ``'cuda'`` when torch reports an available CUDA device, else ``'cpu'``."""
    if torch.cuda.is_available():
        return 'cuda'
    return 'cpu'


# Device string used when the text-generation pipeline is built.
device = get_device_map()
21
-
22
def on_click():
    """Blank the ``user_input`` session-state value (callback of the Reset button)."""
    st.session_state["user_input"] = ""
24
-
25
def convert_df(df: pd.DataFrame):
    """Serialize *df* to CSV text without the index column and return UTF-8 bytes."""
    csv_text = df.to_csv(index=False)
    return csv_text.encode('utf-8')
28
-
29
def convert_json(df: pd.DataFrame):
    """Return *df* as a JSON string keyed by row index.

    The frame is serialized with ``orient="index"``, parsed back into Python
    objects and dumped again with the standard ``json`` module, so the
    returned text uses ``json.dumps``'s default formatting.
    """
    rows_by_index = json.loads(df.to_json(orient="index"))
    return json.dumps(rows_by_index)
36
-
37
# Free-text box for the internal description; its value is stored in session
# state under the "user_input" key.
INTdesc_input = st.text_input(
    "Type internal description and hit Enter",
    key="user_input",
)

# Two side-by-side columns: the left one starts the mapping, the right one
# clears the text box through the on_click callback.
createSBScodes, right_column = st.columns(2)
createSBScodes_clicked = createSBScodes.button(
    "Map to SBS codes",
    key="user_createSBScodes",
)
right_column.button("Reset", on_click=on_click)

# Number of candidate mappings to keep; fixed here rather than typed in by the user.
numMAPPINGS_input = 5

st.write("SELECTED CHAPTERS: ", selected_chapters_list)
48
-
49
@st.cache_resource
def load_model():
    """Load the sentence-embedding model used for semantic search.

    ``all-MiniLM-L6-v2`` is the choice the author marked as fastest. The
    alternatives noted alongside it are ``all-mpnet-base-v2`` (marked as best
    performance), ``all-distilroberta-v1``,
    ``sentence-transformers/msmarco-bert-base-dot-v5`` and ``clips/mfaq``.
    """
    return SentenceTransformer('all-MiniLM-L6-v2')


model = load_model()
58
-
59
-
60
-
61
# Embed the text currently in the description box.
INTdesc_embedding = model.encode(INTdesc_input)

# Window of the code table to load.
from_line = 0      # Imaging services chapter start, adjust as needed
to_line = 10080    # Imaging services chapter end, adjust as needed
nrows = to_line - from_line + 1
# For from_line = 0 this range is empty, i.e. nothing is skipped.
skiprows = list(range(1, from_line - 1))


@st.cache_data
def _load_sbs_table(path, skiprows, nrows):
    """Read the requested window of the SBS code table.

    Cached so the CSV is parsed once per distinct argument set instead of on
    every Streamlit rerun.
    """
    return pd.read_csv(
        path,
        header=0,
        skip_blank_lines=False,
        skiprows=skiprows,
        nrows=nrows,
    )


@st.cache_resource
def _encode_corpus(descriptions):
    """Embed the corpus descriptions with the shared sentence model.

    Cached on the descriptions themselves, so the whole corpus is encoded once
    rather than on every rerun. The returned array is shared between reruns
    and must not be modified in place.
    """
    return model.encode(list(descriptions))


df_SBS = _load_sbs_table("SBS_V2_0/Code_Table.csv", skiprows, nrows)

# The corpus keeps the row order of df_SBS, so a corpus position is also a
# row position in the table.
SBScorpus = df_SBS['Long_Description'].values.tolist()
SBScorpus_embeddings = _encode_corpus(tuple(SBScorpus))

# Semantic search: cosine similarity between the description and every SBS
# description. The result holds one hit list per query, and each hit list is
# already ordered by decreasing score, so no further sorting is needed (the
# previous sort acted on the per-query outer list and ordered nothing).
HF_model_results = util.semantic_search(INTdesc_embedding, SBScorpus_embeddings)
HF_model_results_sorted = list(HF_model_results)
HF_model_results_displayed = HF_model_results_sorted[0:numMAPPINGS_input]
83
-
84
@st.cache_resource
def load_pipe():
    """Build the text-generation pipeline for ``Qwen/Qwen2-1.5B-Instruct``.

    The model is placed according to the module-level ``device`` string.
    ``meta-llama/Llama-3.2-1B-Instruct`` is noted by the author as an
    alternative model.
    """
    return pipeline(
        "text-generation",
        model="Qwen/Qwen2-1.5B-Instruct",
        device_map=device,
    )


pipe = load_pipe()
90
-
91
def _hit_to_row(hit):
    """Turn one semantic-search hit into a row of the result table."""
    corpus_id = hit["corpus_id"]
    return {
        "Score": "%.4f" % hit["score"],
        # SBScorpus is built from df_SBS in row order, so corpus_id is also the
        # row position. A positional lookup returns the right code even when
        # several codes share the same description text.
        "SBS Code": df_SBS["SBS_Code_Hyphenated"].iloc[corpus_id],
        "SBS Description V2.0": SBScorpus[corpus_id],
    }


def _build_prompt(description, shortlist):
    """Compose the multiple-choice question sent to the language model.

    Options are labelled A, B, C, ... in shortlist order; the label list in
    the question follows the number of options actually available.
    """
    labels = [chr(ord("A") + i) for i in range(len(shortlist))]
    if len(labels) > 2:
        label_text = ", ".join(labels[:-1]) + ", or " + labels[-1]
    else:
        label_text = " or ".join(labels)
    question = (
        "Which one, if any, of the following Saudi Billing System descriptions "
        + label_text
        + " corresponds best to "
        + description
        + "? "
    )
    options = " ".join(
        label + ": " + text for label, text in zip(labels, shortlist)
    )
    return question + " " + options


# Column template of the result table; dfALL starts out empty.
dictA = {"Score": [], "SBS Code": [], "SBS Description V2.0": []}
dfALL = pd.DataFrame.from_dict(dictA)

# A text input yields an empty string rather than None when nothing was typed,
# so the check is on non-blank content.
has_description = bool(INTdesc_input and INTdesc_input.strip())

# NOTE(review): the nesting below was reconstructed from a listing without
# indentation - confirm that the table, the model answer and the download
# buttons all belong inside this branch.
if has_description and createSBScodes_clicked:
    # One hit list per query; a single description was searched, so use the
    # first list and keep at most numMAPPINGS_input hits.
    if HF_model_results_displayed:
        hits = HF_model_results_displayed[0][:numMAPPINGS_input]
    else:
        hits = []

    dfALL = pd.DataFrame([_hit_to_row(hit) for hit in hits], columns=list(dictA))
    st.dataframe(data=dfALL, hide_index=True)

    if hits:
        shortlist = [SBScorpus[hit["corpus_id"]] for hit in hits]
        prompt = _build_prompt(INTdesc_input, shortlist)
        st.write(prompt)

        messages = [
            {"role": "system", "content": "You are a knowledgable AI assistant who always answers truthfully and precisely!"},
            {"role": "user", "content": prompt},
        ]
        outputs = pipe(
            messages,
            max_new_tokens=256,
        )
        # The last message of the returned conversation is the model's reply.
        st.write(outputs[0]["generated_text"][-1]["content"])

    # Three download buttons in the middle columns; the outer columns are spacers.
    bs, b1, b2, b3, bLast = st.columns([0.75, 1.5, 1.5, 1.5, 0.75])
    with b1:
        csvbutton = st.download_button(
            label="📥 Download .csv",
            data=convert_df(dfALL),
            file_name="results.csv",
            mime='text/csv',
            key='csv_b',
        )
    with b2:
        # The plain-text download carries the same CSV content; the file name
        # now matches the ".txt" shown on the button.
        textbutton = st.download_button(
            label="📥 Download .txt",
            data=convert_df(dfALL),
            file_name="results.txt",
            mime='text/plain',
            key='text_b',
        )
    with b3:
        jsonbutton = st.download_button(
            label="📥 Download .json",
            data=convert_json(dfALL),
            file_name="results.json",
            mime='application/json',
            key='json_b',
        )