File size: 1,467 Bytes
54c32b3
9fcadef
 
 
 
a32625f
9fcadef
 
0e5c52a
9fcadef
 
a32625f
9fcadef
 
 
 
 
 
a32625f
9fcadef
a32625f
 
 
 
 
0e5c52a
 
 
 
a32625f
0e5c52a
 
 
 
 
 
a32625f
0e5c52a
a32625f
 
0e5c52a
a32625f
0e5c52a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44

import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

class AmharicChatbot:
    """Retrieval-based FAQ chatbot.

    Matches a user question against a CSV of question/answer pairs using
    multilingual-e5 sentence embeddings and a FAISS index. Returns the
    stored answer for the closest question, or the sentinel
    "__OUT_OF_SCOPE__" when the best match is below the similarity
    threshold.
    """

    def __init__(self, csv_path, threshold=0.80):
        """
        Args:
            csv_path: path to a CSV with "question" and "answer" columns.
            threshold: minimum cosine similarity (in [0, 1] for normalized
                embeddings) required to accept a match. Default 0.80.
        """
        self.df = pd.read_csv(csv_path)
        # e5-family models expect "query: " / "passage: " prefixes on inputs.
        self.model = SentenceTransformer("intfloat/multilingual-e5-small")
        self.threshold = threshold
        self.build_index()

    def build_index(self):
        """Embed all stored questions and build a FAISS inner-product index.

        Embeddings are L2-normalized once here, so the inner product the
        index returns IS the cosine similarity. This makes retrieval and the
        threshold test use the same metric — previously retrieval ranked by
        unnormalized L2 distance while the threshold was cosine, and the two
        can disagree on which neighbor is closest.
        """
        self.embeddings = self.model.encode(
            ["passage: " + q for q in self.df["question"].tolist()],
            show_progress_bar=True,
        ).astype("float32")
        faiss.normalize_L2(self.embeddings)  # in-place row normalization
        self.index = faiss.IndexFlatIP(self.embeddings.shape[1])
        self.index.add(self.embeddings)

    def get_answer(self, user_question, k=1):
        """Return the stored answer for the FAQ question closest to the input.

        Args:
            user_question: the user's question text.
            k: number of neighbors to retrieve (only the top hit is used).

        Returns:
            The matched answer string, or "__OUT_OF_SCOPE__" when nothing is
            close enough (score below threshold) or the index has no valid hit.
        """
        user_embedding = self.model.encode(
            [f"query: {user_question}"]
        ).astype("float32")
        faiss.normalize_L2(user_embedding)
        D, I = self.index.search(user_embedding, k)

        # FAISS pads missing results with index -1; treat those as no match.
        # (Checking len(I[0]) alone never fires, and indexing with -1 would
        # silently pick the LAST stored question.)
        if len(I[0]) == 0 or I[0][0] < 0:
            return "__OUT_OF_SCOPE__"

        top_idx = I[0][0]
        # With normalized vectors, the inner-product score from the index is
        # exactly the cosine similarity — no separate recomputation needed.
        score = float(D[0][0])

        if score < self.threshold:
            return "__OUT_OF_SCOPE__"

        return self.df.iloc[top_idx]["answer"]