Commit d9f47da
Parent: ce243a5

Adding Pipeline class.

Files changed:
- app.py (+9 -8)
- mi_clase.py (+127 -1)
app.py CHANGED
@@ -1,5 +1,6 @@
 import streamlit as st
 from mi_clase import persona
+from mi_clase import pipeline
 st.title("Ask your scientific question!")
 expected_format = "What is color?\nA)Is a name.\nB)Is something horrible.\nC)I don't know.\nD)You should ask someone else.\nE)Ask in a pyshic book."
 txt = st.text_area(
@@ -19,15 +20,15 @@ try:
     q = lista[0]
 
     mi_dict= {
-        "
-        "
-        "
-        "
-        "
-        "
+        "prompt":q,
+        "A":a,
+        "B":b,
+        "C":c,
+        "D":d,
+        "E":e
     }
-
+    answer = pipeline.give_the_best_answer(mi_dict)
     st.write(mi_dict)
-    st.write(
+    st.write(answer)
 except:
     st.error("Your question doesn't have the required format. Please, correct it.")
mi_clase.py CHANGED
@@ -6,4 +6,130 @@ class Persona:
     def get_nomber(self):
         return self.nombre
 
-persona = Persona("josue",33)
+persona = Persona("josue",33)
+
+
+# os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+import torch
+import pandas as pd  # needed for pd.DataFrame.from_dict below
+from transformers import AutoModelForMultipleChoice
+from transformers import AutoTokenizer
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from transformers import AutoTokenizer, AutoModel
+
+QUERY_MODEL = "/kaggle/input/bge-small-faiss/"
+GENERATOR_MODEL = "/kaggle/input/training-model-2/model_v2"
+DEVICE = "cpu"  # cpu or cuda
+
+class Pipeline:
+
+    # ---- init class
+
+    def __init__(self):
+        # Multiple-choice model and its tokenizer (the "generator").
+        self.model = AutoModelForMultipleChoice.from_pretrained(GENERATOR_MODEL)
+        self.tokenizer = AutoTokenizer.from_pretrained(GENERATOR_MODEL)
+        # Sentence-embedding model used by the retriever.
+        self.semModel = AutoModel.from_pretrained(QUERY_MODEL)
+        self.semTokenizer = AutoTokenizer.from_pretrained(QUERY_MODEL)
+        self.device = torch.device(DEVICE)
+
+        self.semModel.to(self.device)
+        self.model.to(self.device)
+
+    # ---- utils functions
+
+    def convert_to_letter(self, a):
+        # Map a choice index (0-4) to its answer letter.
+        if a == 0:
+            return "A"
+        if a == 1:
+            return "B"
+        if a == 2:
+            return "C"
+        if a == 3:
+            return "D"
+        if a == 4:
+            return "E"
+
+    def filter_stopwords(self, example_sent):
+        # (Not called anywhere in the pipeline yet.)
+        stop_words = set(stopwords.words('english'))
+        word_tokens = word_tokenize(example_sent)
+        filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
+        return " ".join(filtered_sentence)
+
+    def cls_pooling(self, model_output):
+        return model_output.pooler_output  # last_hidden_state[:, 0]
+
+    def get_embeddings(self, text_list):
+        encoded_input = self.semTokenizer(
+            text_list, padding=True, truncation=True, return_tensors="pt", max_length=512
+        )
+        encoded_input = {k: v.to(self.device) for k, v in encoded_input.items()}
+        model_output = self.semModel(**encoded_input)
+        return self.cls_pooling(model_output)
+
+    # ---- retriever
+
+    def get_context_from_text(self, question):
+        question_embedding = self.get_embeddings([question]).cpu().detach().numpy()
+        # datasetx (a FAISS-indexed dataset) is assumed to be loaded elsewhere;
+        # it is not defined in this file.
+        scores, samples = datasetx.get_nearest_examples(
+            "embeddings", question_embedding, k=5
+        )
+        samples_df = pd.DataFrame.from_dict(samples)
+        samples_df["scores"] = scores
+        samples_df.sort_values("scores", ascending=False, inplace=True)
+        contexts = ""
+        # aux_row = ""
+        for _, row in samples_df.iterrows():
+            contexts = contexts + f"=={row.section}== {row.text} "
+
+            # if aux_row == {row.title}:
+            #     contexts = contexts + f"=={row.section}== {row.text}"
+            # else:
+            #     contexts = contexts + f"==={row.title}=== =={row.section}== {row.text}"
+            #     aux_row = {row.title}
+        return contexts
+
+    # ---- generator
+
+    # [CLS] context #### question? [SEP] answer [SEP]
+    def create_tokens(self, question_and_options, context):
+        question = question_and_options["prompt"]
+        candidate1 = "#### " + question + " [SEP] " + question_and_options["A"] + " [SEP]"
+        candidate2 = "#### " + question + " [SEP] " + question_and_options["B"] + " [SEP]"
+        candidate3 = "#### " + question + " [SEP] " + question_and_options["C"] + " [SEP]"
+        candidate4 = "#### " + question + " [SEP] " + question_and_options["D"] + " [SEP]"
+        candidate5 = "#### " + question + " [SEP] " + question_and_options["E"] + " [SEP]"
+        prompt = "[CLS]" + context
+
+        inputs = self.tokenizer([
+            [prompt, candidate1],
+            [prompt, candidate2],
+            [prompt, candidate3],
+            [prompt, candidate4],
+            [prompt, candidate5]
+        ], return_tensors="pt", padding=True, truncation="only_first", max_length=512, add_special_tokens=False)
+        # Dummy label; only the logits are used at inference time.
+        labels = torch.tensor(0).unsqueeze(0)
+        return (inputs, labels)
+
+    def infer_answer(self, mi_tupla):
+        (inputs, labels) = mi_tupla
+
+        inputs = {k: v.to(self.device) for k, v in inputs.items()}
+        labels = labels.to(self.device)
+
+        # unsqueeze adds the batch dimension: (1, 5 choices, seq_len).
+        outputs = self.model(**{k: v.unsqueeze(0) for k, v in inputs.items()}, labels=labels)
+        logits = outputs.logits
+        _, topk_indices = torch.topk(logits, k=3, dim=1)
+        # predicted_class = logits.argmax().item()
+        return topk_indices
+
+    # ---- retriever + generator
+
+    def give_the_best_answer(self, dict_with_all_the_info):
+        a = self.get_context_from_text(dict_with_all_the_info["prompt"])
+        b = self.create_tokens(dict_with_all_the_info, a)
+        c = self.infer_answer(b)
+        d = self.convert_to_letter(int(c[0][0]))
+        return d
+
+pipeline = Pipeline()
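Taken together, the commit wires a retrieve-then-rank flow: get_context_from_text embeds the question and pulls the top-5 passages from the FAISS index, create_tokens builds one [CLS] context #### question [SEP] option [SEP] pair per answer option, and infer_answer lets the multiple-choice head score all five pairs. A minimal usage sketch follows; the dict keys match what app.py now builds, and the question text (taken from expected_format) is illustrative only:

    # Hypothetical call, mirroring the new code in app.py.
    mi_dict = {
        "prompt": "What is color?",
        "A": "Is a name.",
        "B": "Is something horrible.",
        "C": "I don't know.",
        "D": "You should ask someone else.",
        "E": "Ask in a pyshic book.",
    }
    print(pipeline.give_the_best_answer(mi_dict))  # prints one of "A".."E"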