Update chatbot_utils.py
chatbot_utils.py  +27 -7
chatbot_utils.py
CHANGED
@@ -1,26 +1,46 @@
-
 import pandas as pd
 import faiss
 from sentence_transformers import SentenceTransformer
 import numpy as np
+from sklearn.metrics.pairwise import cosine_similarity
 
 class AmharicChatbot:
-    def __init__(self, csv_path):
+    def __init__(self, csv_path, threshold=0.70):
         self.df = pd.read_csv(csv_path)
         self.model = SentenceTransformer("intfloat/multilingual-e5-small")
+        self.threshold = threshold
         self.build_index()
 
     def build_index(self):
+        # Encode questions using the E5 small model; "passage:" prefix for context
         self.embeddings = self.model.encode(
             ["passage: " + q for q in self.df["question"].tolist()],
             show_progress_bar=True
-        )
+        ).astype("float32")
         self.index = faiss.IndexFlatL2(self.embeddings.shape[1])
-        self.index.add(
+        self.index.add(self.embeddings)
+
+    def get_answer(self, user_question, k=1):
+        # Encode the user question with "query:" prefix for best retrieval
+        user_embedding = self.model.encode([f"query: {user_question}"])[0].astype("float32")
+        D, I = self.index.search(np.array([user_embedding]), k)
+
+        top_idx = I[0][0]
+        top_question = self.df.iloc[top_idx]["question"]
+        top_embedding = self.model.encode([f"passage: {top_question}"])[0]
+
+        # Cosine similarity score between user and top retrieved question
+        score = cosine_similarity([user_embedding], [top_embedding])[0][0]
+
+        if score < self.threshold:
+            return "__OUT_OF_SCOPE__"
+
+        return self.df.iloc[top_idx]["answer"]
 
-
-
-
+    # Optional: retrieve top-K relevant Q&A pairs for further use
+    def get_top_k(self, user_question, k=3):
+        user_embedding = self.model.encode([f"query: {user_question}"])[0].astype("float32")
+        D, I = self.index.search(np.array([user_embedding]), k)
         results = []
         for idx in I[0]:
             question = self.df.iloc[idx]["question"]