Walelign committed on
Commit
a32625f
·
verified ·
1 Parent(s): e395743

Update chatbot_utils.py

Browse files
Files changed (1) hide show
  1. chatbot_utils.py +27 -7
chatbot_utils.py CHANGED
@@ -1,26 +1,46 @@
1
-
2
  import pandas as pd
3
  import faiss
4
  from sentence_transformers import SentenceTransformer
5
  import numpy as np
 
6
 
7
  class AmharicChatbot:
8
- def __init__(self, csv_path):
9
  self.df = pd.read_csv(csv_path)
10
  self.model = SentenceTransformer("intfloat/multilingual-e5-small")
 
11
  self.build_index()
12
 
13
  def build_index(self):
 
14
  self.embeddings = self.model.encode(
15
  ["passage: " + q for q in self.df["question"].tolist()],
16
  show_progress_bar=True
17
- )
18
  self.index = faiss.IndexFlatL2(self.embeddings.shape[1])
19
- self.index.add(np.array(self.embeddings))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
- def get_answer(self, query, top_k=3):
22
- query_embedding = self.model.encode([f"query: {query}"])
23
- D, I = self.index.search(np.array(query_embedding), top_k)
 
24
  results = []
25
  for idx in I[0]:
26
  question = self.df.iloc[idx]["question"]
 
 
1
  import pandas as pd
2
  import faiss
3
  from sentence_transformers import SentenceTransformer
4
  import numpy as np
5
+ from sklearn.metrics.pairwise import cosine_similarity
6
 
7
  class AmharicChatbot:
8
+ def __init__(self, csv_path, threshold=0.70):
9
  self.df = pd.read_csv(csv_path)
10
  self.model = SentenceTransformer("intfloat/multilingual-e5-small")
11
+ self.threshold = threshold
12
  self.build_index()
13
 
14
  def build_index(self):
15
+ # Encode questions using the E5 small model; "passage:" prefix for context
16
  self.embeddings = self.model.encode(
17
  ["passage: " + q for q in self.df["question"].tolist()],
18
  show_progress_bar=True
19
+ ).astype("float32")
20
  self.index = faiss.IndexFlatL2(self.embeddings.shape[1])
21
+ self.index.add(self.embeddings)
22
+
23
+ def get_answer(self, user_question, k=1):
24
+ # Encode the user question with "query:" prefix for best retrieval
25
+ user_embedding = self.model.encode([f"query: {user_question}"])[0].astype("float32")
26
+ D, I = self.index.search(np.array([user_embedding]), k)
27
+
28
+ top_idx = I[0][0]
29
+ top_question = self.df.iloc[top_idx]["question"]
30
+ top_embedding = self.model.encode([f"passage: {top_question}"])[0]
31
+
32
+ # Cosine similarity score between user and top retrieved question
33
+ score = cosine_similarity([user_embedding], [top_embedding])[0][0]
34
+
35
+ if score < self.threshold:
36
+ return "__OUT_OF_SCOPE__"
37
+
38
+ return self.df.iloc[top_idx]["answer"]
39
 
40
+ # Optional: retrieve top-K relevant Q&A pairs for further use
41
+ def get_top_k(self, user_question, k=3):
42
+ user_embedding = self.model.encode([f"query: {user_question}"])[0].astype("float32")
43
+ D, I = self.index.search(np.array([user_embedding]), k)
44
  results = []
45
  for idx in I[0]:
46
  question = self.df.iloc[idx]["question"]