File size: 2,017 Bytes
9fcadef
 
 
 
a32625f
9fcadef
 
a32625f
9fcadef
 
a32625f
9fcadef
 
 
a32625f
9fcadef
 
 
a32625f
9fcadef
a32625f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9fcadef
a32625f
 
 
 
9fcadef
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

class AmharicChatbot:
    """Retrieval-based FAQ bot backed by a CSV of question/answer pairs.

    The CSV must provide a ``question`` and an ``answer`` column. Every stored
    question is embedded once with the ``intfloat/multilingual-e5-small``
    sentence-transformer and placed in a flat FAISS L2 index. A user question
    is answered by looking up its nearest stored question and returning the
    paired answer, provided the two are similar enough.
    """

    def __init__(self, csv_path, threshold=0.70):
        """Load the Q&A table, load the embedding model and build the index.

        Args:
            csv_path: Path (or anything ``pandas.read_csv`` accepts) of the
                CSV holding the ``question`` and ``answer`` columns.
            threshold: Minimum cosine similarity between the user question and
                the best stored question for ``get_answer`` to return an
                answer instead of the out-of-scope marker.
        """
        self.df = pd.read_csv(csv_path)
        self.model = SentenceTransformer("intfloat/multilingual-e5-small")
        self.threshold = threshold
        self.build_index()

    def build_index(self):
        """Embed all stored questions and (re)build the FAISS index.

        Sets ``self.embeddings`` (one float32 row per row of ``self.df``, in
        the same order) and ``self.index``.
        """
        # Stored questions are encoded with the "passage: " prefix; user
        # questions use "query: " (see _encode_query).
        passages = ["passage: " + q for q in self.df["question"].tolist()]
        self.embeddings = self.model.encode(
            passages,
            show_progress_bar=True,
        ).astype("float32")
        self.index = faiss.IndexFlatL2(self.embeddings.shape[1])
        self.index.add(self.embeddings)

    def _encode_query(self, user_question):
        """Return the float32 embedding (1-D) of a "query: "-prefixed question."""
        return self.model.encode([f"query: {user_question}"])[0].astype("float32")

    def get_answer(self, user_question, k=1):
        """Return the answer paired with the stored question closest to the input.

        Args:
            user_question: Free-text question from the user.
            k: Number of neighbours requested from the index. Only the first
                (closest) hit is used to pick the answer.

        Returns:
            The ``answer`` value of the best-matching row, or the string
            ``"__OUT_OF_SCOPE__"`` when the cosine similarity between the user
            question and that row's question is below ``self.threshold`` (or
            when the index returned no neighbour at all).
        """
        user_embedding = self._encode_query(user_question)
        _, neighbours = self.index.search(np.array([user_embedding]), k)

        top_idx = int(neighbours[0][0])
        if top_idx < 0:
            # FAISS reports "no neighbour found" as label -1; without this
            # guard, iloc[-1] would silently pick the last row.
            return "__OUT_OF_SCOPE__"

        # Reuse the embedding computed in build_index instead of running the
        # model a second time on the retrieved question for every query.
        top_embedding = self.embeddings[top_idx]

        # Cosine similarity score between user and top retrieved question
        score = cosine_similarity([user_embedding], [top_embedding])[0][0]
        if score < self.threshold:
            return "__OUT_OF_SCOPE__"

        return self.df.iloc[top_idx]["answer"]

    # Optional: retrieve top-K relevant Q&A pairs for further use
    def get_top_k(self, user_question, k=3):
        """Return up to ``k`` nearest stored ``(question, answer)`` pairs.

        Args:
            user_question: Free-text question from the user.
            k: Maximum number of pairs to return.

        Returns:
            A list of ``(question, answer)`` tuples in the order the index
            returned them. The list is shorter than ``k`` when the index holds
            fewer than ``k`` entries. No similarity threshold is applied.
        """
        user_embedding = self._encode_query(user_question)
        _, neighbours = self.index.search(np.array([user_embedding]), k)

        results = []
        for idx in neighbours[0]:
            if idx < 0:
                # -1 is FAISS padding for a missing neighbour, not a row
                # position; iloc[-1] would duplicate the last row.
                continue
            row = self.df.iloc[int(idx)]
            results.append((row["question"], row["answer"]))
        return results