Update app.py
app.py
CHANGED
@@ -1,4 +1,118 @@
+# Note: `!pip install` is notebook (IPython/Colab) syntax and fails when
+# app.py runs as a plain script; on a Streamlit deployment these packages
+# would normally be listed in requirements.txt instead.
+# !pip install transformers
+# !pip install -Uq evaluate
+# !pip install -Uq SentencePiece
+# !pip install -Uq sentence-transformers
+
+from sentence_transformers import SentenceTransformer
+from sentence_transformers import util
 import streamlit as st
 
-
-
+import pandas as pd
+import torch
+import ast
+
+QA_VECTOR = '07.2-HBQA_QA_Vector_multi-qa-mpnet-base-dot-v1.csv'
+
+# @title Load Example Questions
+df_qa = pd.read_csv(QA_VECTOR)
+example_qa = df_qa['Question'].sample(5).to_list()
+
+question = st.selectbox(
+    'Please Select a Question',
+    example_qa)
+
+st.write('Your Question:', question)
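+# Reviewer note: `sample(5)` draws a fresh set of questions on every Streamlit
+# rerun, so the options reshuffle each time a widget changes; stashing the
+# sample in st.session_state would keep them stable across interactions.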
+
+CHUNK_VECTOR = '04.3-HBQA_Chunk_Vector_multi-qa-mpnet-base-dot-v1.csv'
+
+df_chunk = pd.read_csv(CHUNK_VECTOR)
+
+# print(df_qa.shape)
+# df_qa.head(1)
+# df_chunk.head(1)
+# df_chunk.shape
+
+# @title Load Embedding Model
+embmodelname, embmodelshort, embmodelname1 = 'multi-qa-mpnet-base-dot-v1', 'mpnet', '_multi-qa-mpnet-base-dot-v1'
+embmodel = SentenceTransformer(embmodelname)
+
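+# SentenceTransformer fetches multi-qa-mpnet-base-dot-v1 from the Hugging Face
+# Hub on first run, so the initial page load is slower than later ones;
+# wrapping the load in @st.cache_resource would avoid reloading the model on
+# every rerun.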
+# @title Create Question Embedding
+def get_ques_vector(ques):
+    Question_Embeddings = embmodel.encode(ques)
+    return Question_Embeddings
+
+question_embedding = get_ques_vector(question)
+
+# @title Load all chunk_vectors into memory
+chunk_id = df_chunk['Chunk_Id'].to_list()
+
+chunk_vector = [ast.literal_eval(df_chunk['ChunkVector'+embmodelname1][i]) for i in range(df_chunk.shape[0])]
+# chunk_vector = [df_chunk['ChunkVector'+embmodelname1][i] for i in range(df_chunk.shape[0])]
+chunk_vector = torch.tensor(chunk_vector)
+
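+# The CSV stores each chunk embedding as a stringified Python list, so
+# ast.literal_eval turns every row back into numbers before they are stacked
+# into a single (num_chunks x dim) tensor.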
+# @title Predict Chunk Id for Question
+top_k = 5
+hits = util.semantic_search(question_embedding, chunk_vector, top_k=top_k)
+
+predictedId = [item['corpus_id'] for item in hits[0]]  # each hit carries corpus_id, the row index into chunk_vector
+predicted_Docid = [df_chunk.loc[i, "Chunk_Id"] for i in predictedId]  # map each row index back to its Chunk_Id
+print(predicted_Docid)
+
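+# Reviewer note: util.semantic_search scores the question against every chunk
+# vector and defaults to cosine similarity; since multi-qa-mpnet-base-dot-v1
+# was trained for dot-product scoring, passing score_function=util.dot_score
+# may match the retriever's training objective more closely.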
+# @title Load Prediction Model
+from transformers import T5ForConditionalGeneration, T5Tokenizer
+MODEL_FOLDER = '/content/drive/MyDrive/HBQA/t5small-30epoch'
+
+# Load the corresponding tokenizer
+tokenizer = T5Tokenizer.from_pretrained(MODEL_FOLDER)
+
+# Load the pre-trained T5 model
+model = T5ForConditionalGeneration.from_pretrained(MODEL_FOLDER)
+DEVICE = 'cpu'
+model.to(DEVICE)
+
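+# Reviewer note: this is a Colab Google Drive path; on a hosted Space the
+# fine-tuned t5small-30epoch folder would need to be committed to the repo or
+# downloaded at startup for from_pretrained to find it.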
+# @title Predict Answer
+
+def predict_answer(context, question, tokenizer, model):
+    Q_LEN = 1500
+
+    # if predmodel_name=="t5":
+    inputs = tokenizer(question, context, max_length=Q_LEN, padding="max_length", truncation=True, add_special_tokens=True)
+
+    input_ids = torch.tensor(inputs["input_ids"], dtype=torch.long).to(DEVICE).unsqueeze(0)
+    attention_mask = torch.tensor(inputs["attention_mask"], dtype=torch.long).to(DEVICE).unsqueeze(0)
+
+    with torch.no_grad():
+        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=100)
+
+    predicted_answer = tokenizer.decode(outputs.flatten(), skip_special_tokens=True)
+
+    if len(predicted_answer) < 3:
+        predicted_answer = "xxx"
+    elif predicted_answer[0:5] == '[CLS]' or predicted_answer[0:5] == '[SEP]' or predicted_answer[0:3] == '<s>':
+        predicted_answer = "xxx"
+    return predicted_answer
+
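+# "xxx" serves as a sentinel for unusable generations (too short, or starting
+# with a stray special token); the [CLS]/[SEP] checks look like leftovers from
+# a BERT-style reader, since T5 does not emit those tokens.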
+verbose = False
+
+if len(predicted_Docid) > 3:
+    ids = predicted_Docid
+else:
+    ids = []  # avoids a NameError in the loop below when retrieval comes back empty
+    st.write("Sorry, no document found")
+
+ans = []
+for id in ids:
+    # print(ques_id, id)
+    cond = df_chunk["Chunk_Id"] == id
+    chunk = df_chunk.loc[cond]["Chunk"].values[0]
+
+    # print(chunk[:20])
+    pred_ans = predict_answer(chunk, question, tokenizer, model)
+    ans.append(pred_ans)
+
+    if verbose:
+        print("Pred Ans :", pred_ans)
+
+# Final prediction over all the joined per-chunk answers.
+pred_finalans = predict_answer(" ".join(ans), question, tokenizer, model)
+st.write('Answer:', pred_finalans)
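+# Reviewer note: the app answers the question once per retrieved chunk, then
+# runs the model a second time over the concatenated per-chunk answers to
+# produce the single response shown to the user.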