import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


class AmharicChatbot:
    """Retrieval-based FAQ bot backed by sentence embeddings and a FAISS index.

    The CSV at ``csv_path`` is read with ``pandas.read_csv`` and must provide a
    ``question`` column (embedded and indexed) and an ``answer`` column
    (returned to the caller). A user question is answered with the ``answer``
    of the closest indexed ``question`` when their cosine similarity reaches
    ``threshold``; otherwise ``OUT_OF_SCOPE`` is returned.
    """

    # Sentinel returned when no stored question is similar enough.
    OUT_OF_SCOPE = "__OUT_OF_SCOPE__"

    def __init__(self, csv_path, threshold=0.80):
        """Load the Q/A table, load the embedding model and build the index.

        Args:
            csv_path: Path (or anything ``pandas.read_csv`` accepts) of the
                CSV holding the ``question`` and ``answer`` columns.
            threshold: Minimum cosine similarity for a match to be accepted.
        """
        self.df = pd.read_csv(csv_path)
        self.model = SentenceTransformer("intfloat/multilingual-e5-small")
        self.threshold = threshold
        self.build_index()

    @staticmethod
    def _unit_rows(matrix):
        """Return ``matrix`` with every row scaled to unit L2 norm (float32).

        All-zero rows are left as zeros instead of becoming NaN.
        """
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        norms[norms == 0] = 1.0  # avoid 0/0 -> NaN for degenerate rows
        return (matrix / norms).astype("float32")

    def build_index(self):
        """Embed every stored question and (re)build the FAISS index.

        Sets ``self.embeddings`` (one unit-length float32 row per row of
        ``self.df``) and ``self.index`` (a ``faiss.IndexFlatL2`` over them).
        """
        questions = self.df["question"].tolist()
        if questions:
            raw = self.model.encode(
                ["passage: " + q for q in questions],
                show_progress_bar=True,
            )
            raw = np.asarray(raw, dtype="float32")
        else:
            # encode([]) yields an array without a second dimension, so ask
            # the model for its output size to build a valid empty index.
            dim = self.model.get_sentence_embedding_dimension()
            raw = np.zeros((0, dim), dtype="float32")

        # On unit vectors the L2 ordering used by IndexFlatL2 is identical to
        # the cosine ordering used for the acceptance threshold in get_answer.
        self.embeddings = self._unit_rows(raw)
        self.index = faiss.IndexFlatL2(self.embeddings.shape[1])
        if len(self.embeddings):
            self.index.add(self.embeddings)

    def get_answer(self, user_question, k=1):
        """Return the stored answer that best matches ``user_question``.

        Args:
            user_question: The question text to look up.
            k: Number of neighbours requested from the index. Only the
                closest one is used to pick the answer.

        Returns:
            The ``answer`` cell of the best-matching row, or
            ``OUT_OF_SCOPE`` (``"__OUT_OF_SCOPE__"``) when the question is
            blank, the index has no match, or the best cosine similarity is
            below ``self.threshold``.
        """
        if user_question is None or not str(user_question).strip():
            return self.OUT_OF_SCOPE

        query = self.model.encode([f"query: {user_question}"])
        query = self._unit_rows(np.asarray(query, dtype="float32"))

        _, neighbours = self.index.search(query, k)
        top_idx = int(neighbours[0][0])
        # FAISS pads missing results with label -1 (e.g. empty index); the
        # result row is never empty, so this is the real "nothing found" test.
        if top_idx < 0:
            return self.OUT_OF_SCOPE

        best_match = self.embeddings[top_idx : top_idx + 1]
        score = cosine_similarity(query, best_match)[0][0]
        # Written as "not >=" so that a NaN score is rejected as well.
        if not score >= self.threshold:
            return self.OUT_OF_SCOPE
        return self.df.iloc[top_idx]["answer"]