Update app.py
app.py
CHANGED
@@ -1,4 +1,118 @@
+# Note: `!pip install` is notebook (IPython/Colab) syntax and fails when
+# app.py runs as a plain script; on a Streamlit deployment these packages
+# would normally be listed in requirements.txt instead.
+# !pip install transformers
+# !pip install -Uq evaluate
+# !pip install -Uq SentencePiece
+# !pip install -Uq sentence-transformers
+
+from sentence_transformers import SentenceTransformer
+from sentence_transformers import util
 import streamlit as st
 
-
-
+import pandas as pd
+import torch
+import ast
+
+QA_VECTOR = '07.2-HBQA_QA_Vector_multi-qa-mpnet-base-dot-v1.csv'
+
+# @title Load Example Questions
+df_qa = pd.read_csv(QA_VECTOR)
+example_qa = df_qa['Question'].sample(5).to_list()
+
+question = st.selectbox(
+    'Please Select a Question',
+    example_qa)
+
+st.write('Your Question:', question)
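+# Reviewer note: `sample(5)` draws a fresh set of questions on every Streamlit
+# rerun, so the options reshuffle each time a widget changes; stashing the
+# sample in st.session_state would keep them stable across interactions.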
+
+CHUNK_VECTOR = '04.3-HBQA_Chunk_Vector_multi-qa-mpnet-base-dot-v1.csv'
+
+df_chunk = pd.read_csv(CHUNK_VECTOR)
+
+# print(df_qa.shape)
+# df_qa.head(1)
+# df_chunk.head(1)
+# df_chunk.shape
+
+# @title Load Embedding Model
+embmodelname, embmodelshort, embmodelname1 = 'multi-qa-mpnet-base-dot-v1', 'mpnet', '_multi-qa-mpnet-base-dot-v1'
+embmodel = SentenceTransformer(embmodelname)
+
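+# SentenceTransformer fetches multi-qa-mpnet-base-dot-v1 from the Hugging Face
+# Hub on first run, so the initial page load is slower than later ones;
+# wrapping the load in @st.cache_resource would avoid reloading the model on
+# every rerun.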
+# @title Create Question Embedding
+def get_ques_vector(ques):
+    Question_Embeddings = embmodel.encode(ques)
+    return Question_Embeddings
+
+question_embedding = get_ques_vector(question)
+
+# @title Load all chunk_vectors into memory
+chunk_id = df_chunk['Chunk_Id'].to_list()
+
+chunk_vector = [ast.literal_eval(df_chunk['ChunkVector'+embmodelname1][i]) for i in range(df_chunk.shape[0])]
+# chunk_vector = [df_chunk['ChunkVector'+embmodelname1][i] for i in range(df_chunk.shape[0])]
+chunk_vector = torch.tensor(chunk_vector)
+
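+# The CSV stores each chunk embedding as a stringified Python list, so
+# ast.literal_eval turns every row back into numbers before they are stacked
+# into a single (num_chunks x dim) tensor.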
+# @title Predict Chunk Id for Question
+top_k = 5
+hits = util.semantic_search(question_embedding, chunk_vector, top_k=top_k)
+
+predictedId = [item['corpus_id'] for item in hits[0]]  # each hit carries corpus_id, the row index into chunk_vector
+predicted_Docid = [df_chunk.loc[i, "Chunk_Id"] for i in predictedId]  # map each row index back to its Chunk_Id
+print(predicted_Docid)
+
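+# Reviewer note: util.semantic_search scores the question against every chunk
+# vector and defaults to cosine similarity; since multi-qa-mpnet-base-dot-v1
+# was trained for dot-product scoring, passing score_function=util.dot_score
+# may match the retriever's training objective more closely.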
+# @title Load Prediction Model
+from transformers import T5ForConditionalGeneration, T5Tokenizer
+MODEL_FOLDER = '/content/drive/MyDrive/HBQA/t5small-30epoch'
+
+# Load the corresponding tokenizer
+tokenizer = T5Tokenizer.from_pretrained(MODEL_FOLDER)
+
+# Load the pre-trained T5 model
+model = T5ForConditionalGeneration.from_pretrained(MODEL_FOLDER)
+DEVICE = 'cpu'
+model.to(DEVICE)
+
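+# Reviewer note: this is a Colab Google Drive path; on a hosted Space the
+# fine-tuned t5small-30epoch folder would need to be committed to the repo or
+# downloaded at startup for from_pretrained to find it.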
+# @title Predict Answer
+
+def predict_answer(context, question, tokenizer, model):
+    Q_LEN = 1500
+
+    # if predmodel_name=="t5":
+    inputs = tokenizer(question, context, max_length=Q_LEN, padding="max_length", truncation=True, add_special_tokens=True)
+
+    input_ids = torch.tensor(inputs["input_ids"], dtype=torch.long).to(DEVICE).unsqueeze(0)
+    attention_mask = torch.tensor(inputs["attention_mask"], dtype=torch.long).to(DEVICE).unsqueeze(0)
+
+    with torch.no_grad():
+        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=100)
+
+    predicted_answer = tokenizer.decode(outputs.flatten(), skip_special_tokens=True)
+
+    if len(predicted_answer) < 3:
+        predicted_answer = "xxx"
+    elif predicted_answer[0:5] == '[CLS]' or predicted_answer[0:5] == '[SEP]' or predicted_answer[0:3] == '<s>':
+        predicted_answer = "xxx"
+    return predicted_answer
+
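+# "xxx" serves as a sentinel for unusable generations (too short, or starting
+# with a stray special token); the [CLS]/[SEP] checks look like leftovers from
+# a BERT-style reader, since T5 does not emit those tokens.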
+verbose = False
+
+if len(predicted_Docid) > 3:
+    ids = predicted_Docid
+else:
+    ids = []  # avoids a NameError in the loop below when retrieval comes back empty
+    st.write("Sorry, no document found")
+
+ans = []
+for id in ids:
+    # print(ques_id, id)
+    cond = df_chunk["Chunk_Id"] == id
+    chunk = df_chunk.loc[cond]["Chunk"].values[0]
+
+    # print(chunk[:20])
+    pred_ans = predict_answer(chunk, question, tokenizer, model)
+    ans.append(pred_ans)
+
+    if verbose:
+        print("Pred Ans :", pred_ans)
+
+# Final prediction over all the joined per-chunk answers.
+pred_finalans = predict_answer(" ".join(ans), question, tokenizer, model)
+st.write('Answer:', pred_finalans)
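+# Reviewer note: the app answers the question once per retrieved chunk, then
+# runs the model a second time over the concatenated per-chunk answers to
+# produce the single response shown to the user.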