# Simple standalone example class.
class Persona:
    def __init__(self, nombre, edad):
        self.nombre = nombre
        self.edad = edad

    def get_nombre(self):
        return self.nombre

persona = Persona("josue", 33)
# import os
# os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import torch
import pandas as pd
from transformers import AutoModel, AutoModelForMultipleChoice, AutoTokenizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# NLTK data used by filter_stopwords; download once with:
# nltk.download('stopwords'); nltk.download('punkt')

GENERATOR_MODEL = "JosueElias/pipeline_generator_model"
GENERATOR_TOKENIZER = "JosueElias/pipeline_generator_tokenizer"
QUERY_MODEL = "JosueElias/pipeline_query_model"
QUERY_TOKENIZER = "JosueElias/pipeline_query_tokenizer"
DEVICE = "cpu"  # "cpu" or "cuda"
class Pipeline:
    #---- init class
    def __init__(self):
        # Generator: multiple-choice model that scores each candidate answer.
        self.model = AutoModelForMultipleChoice.from_pretrained(GENERATOR_MODEL)
        self.tokenizer = AutoTokenizer.from_pretrained(GENERATOR_TOKENIZER)
        # Retriever: sentence-embedding model used for semantic search.
        self.semModel = AutoModel.from_pretrained(QUERY_MODEL)
        self.semTokenizer = AutoTokenizer.from_pretrained(QUERY_TOKENIZER)
        self.device = torch.device(DEVICE)
        self.semModel.to(self.device)
        self.model.to(self.device)
    #---- utils functions
    def convert_to_letter(self, a):
        # Map an option index (0-4) to its answer letter.
        return {0: "A", 1: "B", 2: "C", 3: "D", 4: "E"}.get(a)
    def filter_stopwords(self, example_sent):
        # Drop English stopwords from a sentence.
        stop_words = set(stopwords.words('english'))
        word_tokens = word_tokenize(example_sent)
        filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
        return " ".join(filtered_sentence)
    def cls_pooling(self, model_output):
        # Use the model's pooled output as the sentence embedding
        # (alternative: model_output.last_hidden_state[:, 0]).
        return model_output.pooler_output
    def get_embeddings(self, text_list):
        encoded_input = self.semTokenizer(
            text_list, padding=True, truncation=True, return_tensors="pt", max_length=512
        )
        encoded_input = {k: v.to(self.device) for k, v in encoded_input.items()}
        with torch.no_grad():  # inference only, no gradients needed
            model_output = self.semModel(**encoded_input)
        return self.cls_pooling(model_output)
    #---- retriever
    def get_context_from_text(self, question):
        # Embed the question and retrieve the 5 nearest passages from the
        # FAISS-indexed dataset `datasetx` (see the build_datasetx sketch above).
        question_embedding = self.get_embeddings([question]).detach().cpu().numpy()
        scores, samples = datasetx.get_nearest_examples(
            "embeddings", question_embedding, k=5
        )
        samples_df = pd.DataFrame.from_dict(samples)
        samples_df["scores"] = scores
        samples_df.sort_values("scores", ascending=False, inplace=True)
        # Concatenate the retrieved passages into a single context string.
        contexts = ""
        for _, row in samples_df.iterrows():
            contexts = contexts + f"=={row.section}== {row.text} "
        return contexts
    #---- generator
    # Input layout: [CLS] context #### question? [SEP] answer [SEP]
    def create_tokens(self, question_and_options, context):
        question = question_and_options["prompt"]
        # Build one candidate sequence per answer option (A-E).
        candidates = [
            "#### " + question + " [SEP] " + question_and_options[letter] + " [SEP]"
            for letter in ["A", "B", "C", "D", "E"]
        ]
        prompt = "[CLS]" + context
        # truncation="only_first" truncates the context, never the candidate.
        inputs = self.tokenizer(
            [[prompt, candidate] for candidate in candidates],
            return_tensors="pt", padding=True, truncation="only_first",
            max_length=512, add_special_tokens=False,
        )
        # Dummy label (the true answer is unknown at inference time).
        labels = torch.tensor(0).unsqueeze(0)
        return (inputs, labels)
    def infer_answer(self, mi_tupla):
        (inputs, labels) = mi_tupla
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        labels = labels.to(self.device)
        # Add a batch dimension: the model expects (batch, num_choices, seq_len).
        outputs = self.model(**{k: v.unsqueeze(0) for k, v in inputs.items()}, labels=labels)
        logits = outputs.logits
        # Indices of the 3 highest-scoring options, best first.
        _, topk_indices = torch.topk(logits, k=3, dim=1)
        return topk_indices
    #---- retriever + generator
    def give_the_best_answer(self, dict_with_all_the_info):
        # Retrieve context, score every option, and return the best answer letter.
        context = self.get_context_from_text(dict_with_all_the_info["prompt"])
        tokens = self.create_tokens(dict_with_all_the_info, context)
        topk_indices = self.infer_answer(tokens)
        return self.convert_to_letter(int(topk_indices[0][0]))
pipeline = Pipeline()
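
# Example usage (illustrative sketch; the corpus and question dict below are
# not part of the original code). `datasetx` must exist as a module-level
# variable before give_the_best_answer is called:
#
#   corpus = [{"title": "Mars", "section": "Overview",
#              "text": "Mars is often called the Red Planet."}]
#   datasetx = build_datasetx(pipeline, corpus)
#   question = {"prompt": "Which planet is known as the Red Planet?",
#               "A": "Venus", "B": "Mars", "C": "Jupiter",
#               "D": "Saturn", "E": "Mercury"}
#   print(pipeline.give_the_best_answer(question))  # prints one of "A"-"E"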