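# NOTE: "Spaces: Sleeping" was page-header text captured together with this file (apparently a hosting-status banner); it is not part of the program.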
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
class AmharicChatbot:
    """Retrieval-based FAQ chatbot backed by a CSV of question/answer pairs.

    The CSV must provide a ``question`` and an ``answer`` column. Every stored
    question is embedded once with the ``intfloat/multilingual-e5-small``
    sentence-transformer model and added to a FAISS ``IndexFlatL2`` index.
    At query time the user's question is embedded, its nearest stored
    question is looked up, and the matching answer is returned if the cosine
    similarity between the two embeddings reaches ``threshold``.

    Attributes:
        df: DataFrame loaded from ``csv_path``.
        model: The loaded ``SentenceTransformer`` encoder.
        threshold: Minimum cosine similarity for an answer to be returned.
        embeddings: float32 array holding one embedding per row of ``df``.
        index: FAISS index built over ``embeddings``.
    """

    # Sentinel returned by get_answer() when no stored question matches
    # closely enough (or the index has no entry to offer).
    OUT_OF_SCOPE = "__OUT_OF_SCOPE__"

    def __init__(self, csv_path, threshold=0.70):
        """Load the Q&A table, load the encoder, and build the search index.

        Args:
            csv_path: Path (or anything ``pandas.read_csv`` accepts) of the
                CSV containing ``question`` and ``answer`` columns.
            threshold: Minimum cosine similarity between the user question
                and the best stored question for its answer to be returned.
        """
        self.df = pd.read_csv(csv_path)
        self.model = SentenceTransformer("intfloat/multilingual-e5-small")
        self.threshold = threshold
        self.build_index()

    def build_index(self):
        """Embed every stored question and (re)build the FAISS index.

        Sets ``self.embeddings`` and ``self.index``. Call again after
        changing ``self.df`` to refresh the index.
        """
        # Stored questions are encoded with the "passage: " prefix; user
        # questions get the "query: " prefix (see _encode_query).
        passages = ["passage: " + q for q in self.df["question"].tolist()]
        self.embeddings = self.model.encode(
            passages,
            show_progress_bar=True,
        ).astype("float32")
        self.index = faiss.IndexFlatL2(self.embeddings.shape[1])
        self.index.add(self.embeddings)

    def _encode_query(self, user_question):
        """Return the user question's embedding as a (1, dim) float32 array."""
        vectors = self.model.encode([f"query: {user_question}"])
        # FAISS expects a contiguous float32 matrix with one row per query.
        return np.ascontiguousarray(vectors, dtype="float32").reshape(1, -1)

    @staticmethod
    def _cosine(a, b):
        """Return the cosine similarity of two 1-D vectors (0.0 if either is zero)."""
        norm_product = float(np.linalg.norm(a)) * float(np.linalg.norm(b))
        if norm_product == 0.0:
            return 0.0
        return float(np.dot(a, b)) / norm_product

    def get_answer(self, user_question, k=1):
        """Return the answer of the stored question closest to ``user_question``.

        Args:
            user_question: Free-text question from the user.
            k: Number of neighbours requested from the index; only the
                closest one is used.

        Returns:
            The ``answer`` cell of the best-matching row, or
            ``"__OUT_OF_SCOPE__"`` when the index yields no neighbour or the
            best match scores below ``self.threshold``.
        """
        query = self._encode_query(user_question)
        _, neighbours = self.index.search(query, k)
        top_idx = int(neighbours[0][0])

        # FAISS reports -1 when it has no neighbour to return (e.g. an empty
        # index); without this guard iloc[-1] would pick the last row.
        if top_idx < 0:
            return self.OUT_OF_SCOPE

        # The stored vector is the encoding of the same "passage: <question>"
        # text, so reuse it rather than running the model a second time.
        score = self._cosine(query[0], self.embeddings[top_idx])
        if score < self.threshold:
            return self.OUT_OF_SCOPE
        return self.df.iloc[top_idx]["answer"]

    def get_top_k(self, user_question, k=3):
        """Return up to ``k`` nearest stored ``(question, answer)`` pairs.

        No similarity threshold is applied here.

        Args:
            user_question: Free-text question from the user.
            k: Maximum number of pairs to return.

        Returns:
            A list of ``(question, answer)`` tuples in the order returned by
            the index. It is shorter than ``k`` when the index holds fewer
            than ``k`` entries.
        """
        query = self._encode_query(user_question)
        _, neighbours = self.index.search(query, k)

        results = []
        for idx in neighbours[0]:
            # Skip FAISS's -1 padding for missing neighbours; indexing with
            # it would silently return the last row as a bogus hit.
            if idx < 0:
                continue
            row = self.df.iloc[int(idx)]
            results.append((row["question"], row["answer"]))
        return results