harithapliyal committed on
Commit 081ac3f · verified · 1 Parent(s): a5503af

Update app.py

Files changed (1)
  1. app.py +116 -2
app.py CHANGED
@@ -1,4 +1,118 @@
+ # Dependencies (install via requirements.txt, since '!pip install' notebook
+ # magics are not valid Python inside app.py): transformers, evaluate,
+ # SentencePiece, sentence-transformers
+
+ from sentence_transformers import SentenceTransformer
+ from sentence_transformers import util
  import streamlit as st
 
- x = st.slider('Select a value')
- st.write(x, 'squared is', x * x)
+ import pandas as pd
+ import torch
+ import ast
+
+ QA_VECTOR = '07.2-HBQA_QA_Vector_multi-qa-mpnet-base-dot-v1.csv'
+
+ # Load example questions
+ df_qa = pd.read_csv(QA_VECTOR)
+ example_qa = df_qa['Question'].sample(5).to_list()
+
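+ # Note: .sample(5) redraws on every Streamlit rerun, so the options below
+ # reshuffle after each interaction; pass random_state for stable choices
+ # (see also the caching note after the diff)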
+ question = st.selectbox(
+     'Please select a question',
+     example_qa)
+
+ st.write('Your question:', question)
+
+ CHUNK_VECTOR = '04.3-HBQA_Chunk_Vector_multi-qa-mpnet-base-dot-v1.csv'
+
+ df_chunk = pd.read_csv(CHUNK_VECTOR)
+
+ # Load the embedding model
+ embmodelname, embmodelshort, embmodelname1 = 'multi-qa-mpnet-base-dot-v1', 'mpnet', '_multi-qa-mpnet-base-dot-v1'
+ embmodel = SentenceTransformer(embmodelname)
+
+ # Embed the selected question
+ def get_ques_vector(ques):
+     return embmodel.encode(ques)
+
+ question_embedding = get_ques_vector(question)
+
+ # Load all chunk vectors into memory; the CSV stores each vector as a
+ # stringified list, so parse it back with ast.literal_eval
+ chunk_id = df_chunk['Chunk_Id'].to_list()
+
+ chunk_vector = [ast.literal_eval(df_chunk['ChunkVector'+embmodelname1][i]) for i in range(df_chunk.shape[0])]
+ chunk_vector = torch.tensor(chunk_vector)
+
+ # Retrieve the top-k chunks for the question
+ top_k = 5
+ hits = util.semantic_search(question_embedding, chunk_vector, top_k=top_k)
+
+ predictedId = [item['corpus_id'] for item in hits[0]]  # corpus_id is the row index into chunk_vector
+ predicted_Docid = [df_chunk.loc[i, "Chunk_Id"] for i in predictedId]  # map row index to Chunk_Id
+ print(predicted_Docid)
+
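+ # Note: util.semantic_search scores with cosine similarity by default;
+ # multi-qa-mpnet-base-dot-v1 was trained for dot-product scoring, so
+ # score_function=util.dot_score is likely the better match here, e.g.
+ # util.semantic_search(question_embedding, chunk_vector, top_k=top_k,
+ #                      score_function=util.dot_score)
+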
+ # Load the answer-prediction model
+ from transformers import T5ForConditionalGeneration, T5Tokenizer
+
+ # NOTE: Colab Drive path; point this at the model folder available to
+ # the deployed app
+ MODEL_FOLDER = '/content/drive/MyDrive/HBQA/t5small-30epoch'
+
+ # Load the corresponding tokenizer
+ tokenizer = T5Tokenizer.from_pretrained(MODEL_FOLDER)
+
+ # Load the fine-tuned T5 model
+ model = T5ForConditionalGeneration.from_pretrained(MODEL_FOLDER)
+ DEVICE = 'cpu'
+ model.to(DEVICE)
+
+ # Predict an answer from a single context chunk
+ def predict_answer(context, question, tokenizer, model):
+     Q_LEN = 1500
+
+     inputs = tokenizer(question, context, max_length=Q_LEN, padding="max_length",
+                        truncation=True, add_special_tokens=True)
+
+     input_ids = torch.tensor(inputs["input_ids"], dtype=torch.long).to(DEVICE).unsqueeze(0)
+     attention_mask = torch.tensor(inputs["attention_mask"], dtype=torch.long).to(DEVICE).unsqueeze(0)
+
+     with torch.no_grad():
+         outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=100)
+
+     predicted_answer = tokenizer.decode(outputs.flatten(), skip_special_tokens=True)
+
+     # Treat empty or degenerate generations as "no answer"
+     if len(predicted_answer) < 3:
+         predicted_answer = "xxx"
+     elif predicted_answer[0:5] in ('[CLS]', '[SEP]') or predicted_answer[0:3] == '<s>':
+         predicted_answer = "xxx"
+     return predicted_answer
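+
+ # Strategy: answer each retrieved chunk independently, then re-ask the
+ # model over the concatenated per-chunk answers for the final prediction.
+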
+ verbose = False
+
+ ids = []  # initialize so `ids` is defined even when no documents qualify
+ if len(predicted_Docid) > 3:
+     ids = predicted_Docid
+ else:
+     st.write("Sorry, no document found.")
+
+ ans = []
+ for cid in ids:  # `cid` rather than `id`, which shadows the builtin
+     cond = df_chunk["Chunk_Id"] == cid
+     chunk = df_chunk.loc[cond]["Chunk"].values[0]
+
+     pred_ans = predict_answer(chunk, question, tokenizer, model)
+     ans.append(pred_ans)
+
+     if verbose:
+         print("Pred Ans :", pred_ans)
+
+ # Final prediction over the joined per-chunk answers; the bare expression
+ # `pred_finalans` was notebook residue and displayed nothing in a script
+ pred_finalans = predict_answer(" ".join(ans), question, tokenizer, model)
+ st.write('Answer:', pred_finalans)
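
A note on the pattern above: Streamlit re-executes app.py from top to bottom on every widget interaction, so the CSV parsing and both model loads run again on each rerun. A minimal caching sketch, assuming Streamlit >= 1.18 (file and column names are taken from the diff; the helper names are illustrative):

import ast
import pandas as pd
import streamlit as st
import torch
from sentence_transformers import SentenceTransformer

@st.cache_resource
def load_embmodel(name='multi-qa-mpnet-base-dot-v1'):
    # Heavyweight, unpicklable resource: loaded once per process and
    # shared across sessions and reruns
    return SentenceTransformer(name)

@st.cache_data
def load_chunks(csv_path, vector_col):
    # Parse the stringified vectors once; later reruns hit the cache
    df = pd.read_csv(csv_path)
    vectors = torch.tensor([ast.literal_eval(v) for v in df[vector_col]])
    return df, vectors

embmodel = load_embmodel()
df_chunk, chunk_vector = load_chunks(
    '04.3-HBQA_Chunk_Vector_multi-qa-mpnet-base-dot-v1.csv',
    'ChunkVector_multi-qa-mpnet-base-dot-v1')

The T5 tokenizer and model loads would benefit from the same @st.cache_resource treatment.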