import json

import numpy as np
import ollama
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

class EvaluationSystem:
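    """Evaluation helpers for the RAG pipeline: embedding-based retrieval
    relevance and answer similarity, stored user feedback, and an
    LLM-as-a-judge evaluation via Ollama."""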
    def __init__(self, data_processor, database_handler):
        self.data_processor = data_processor
        self.db_handler = database_handler

    def relevance_scoring(self, query, retrieved_docs, top_k=5):
        # Mean cosine similarity between the query and the top_k most similar retrieved documents.
        if not retrieved_docs:
            return 0.0
        query_embedding = self.data_processor.embedding_model.encode(query)
        doc_embeddings = [self.data_processor.embedding_model.encode(doc['content']) for doc in retrieved_docs]

        similarities = cosine_similarity([query_embedding], doc_embeddings)[0]
        return float(np.mean(sorted(similarities, reverse=True)[:top_k]))

    def answer_similarity(self, generated_answer, reference_answer):
        # Cosine similarity between the generated answer and the reference answer embeddings.
        gen_embedding = self.data_processor.embedding_model.encode(generated_answer)
        ref_embedding = self.data_processor.embedding_model.encode(reference_answer)
        return float(cosine_similarity([gen_embedding], [ref_embedding])[0][0])

    def human_evaluation(self, video_id, query):
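        # Average of stored user feedback for this (video_id, query) pair; 0 if none recorded.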
        with self.db_handler.conn:
            cursor = self.db_handler.conn.cursor()
            cursor.execute('''
                SELECT AVG(feedback) FROM user_feedback
                WHERE video_id = ? AND query = ?
            ''', (video_id, query))
            result = cursor.fetchone()
            return result[0] if result[0] is not None else 0

    def evaluate_rag_performance(self, rag_system, test_queries, reference_answers, index_name):
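        # Run the RAG system over paired test queries / reference answers and
        # average the three metrics above (retrieval relevance, answer similarity,
        # stored human feedback).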
        relevance_scores = []
        similarity_scores = []
        human_scores = []

        for query, reference in zip(test_queries, reference_answers):
            retrieved_docs = rag_system.data_processor.search(query, num_results=5, method='hybrid', index_name=index_name)
            generated_answer, _ = rag_system.query(query, search_method='hybrid', index_name=index_name)

            relevance_scores.append(self.relevance_scoring(query, retrieved_docs))
            similarity_scores.append(self.answer_similarity(generated_answer, reference))
            human_scores.append(self.human_evaluation(index_name, query))  # Assuming index_name can be used as video_id

        return {
            "avg_relevance_score": np.mean(relevance_scores),
            "avg_similarity_score": np.mean(similarity_scores),
            "avg_human_score": np.mean(human_scores)
        }

    def llm_as_judge(self, question, generated_answer, prompt_template):
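        # Ask a local Ollama model to grade the generated answer; the reply is
        # expected to be JSON (parse failures are caught and return None).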
        prompt = prompt_template.format(question=question, answer_llm=generated_answer)
        
        try:
            response = ollama.chat(
                model='phi3.5',
                messages=[{"role": "user", "content": prompt}]
            )
            evaluation = json.loads(response['message']['content'])
            return evaluation
        except Exception as e:
            print(f"Error in LLM evaluation: {str(e)}")
            return None
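
    # Illustrative judge prompt template (an assumption, not necessarily this
    # project's actual template): llm_as_judge() fills it with `question` and
    # `answer_llm`, and evaluate_rag() expects the model to reply with JSON
    # containing "Relevance" and "Explanation" keys, e.g.:
    #
    #   Question: {question}
    #   Generated answer: {answer_llm}
    #   Reply with JSON only:
    #   {{"Relevance": "RELEVANT | PARTLY_RELEVANT | NON_RELEVANT",
    #     "Explanation": "<one-sentence justification>"}}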

    def evaluate_rag(self, rag_system, ground_truth_file, sample_size=200, prompt_template=None):
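        # Sample up to `sample_size` rows from a ground-truth CSV (expected columns:
        # question, video_id, and optionally reference_answer) and score each
        # generated answer with the LLM judge, or with cosine similarity if no
        # prompt template is provided.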
        try:
            ground_truth = pd.read_csv(ground_truth_file)
        except FileNotFoundError:
            print("Ground truth file not found. Please generate ground truth data first.")
            return None

        sample = ground_truth.sample(n=min(sample_size, len(ground_truth)), random_state=1)
        evaluations = []

        for _, row in sample.iterrows():
            question = row['question']
            video_id = row['video_id']
            
            index_name = self.db_handler.get_elasticsearch_index_by_youtube_id(video_id)
            
            if not index_name:
                print(f"No index found for video {video_id}. Skipping this question.")
                continue

            try:
                answer_llm, _ = rag_system.query(question, search_method='hybrid', index_name=index_name)
            except ValueError as e:
                print(f"Error querying RAG system: {str(e)}")
                continue

            if prompt_template:
                evaluation = self.llm_as_judge(question, answer_llm, prompt_template)
                if evaluation:
                    evaluations.append((
                        str(video_id),
                        str(question),
                        str(answer_llm),
                        str(evaluation.get('Relevance', 'UNKNOWN')),
                        str(evaluation.get('Explanation', 'No explanation provided'))
                    ))
            else:
                # Fallback to cosine similarity if no prompt template is provided
                similarity = self.answer_similarity(answer_llm, row.get('reference_answer', ''))
                evaluations.append((
                    str(video_id),
                    str(question),
                    str(answer_llm),
                    f"Similarity: {similarity}",
                    "Cosine similarity used for evaluation"
                ))

        return evaluations
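

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only; the component class names below are
# assumptions, not part of this module). EvaluationSystem expects a data
# processor exposing `embedding_model.encode()` and `search()`, a database
# handler exposing a sqlite3-style `conn` plus
# `get_elasticsearch_index_by_youtube_id()`, and a RAG system exposing
# `query()`.
#
#   processor = DataProcessor()
#   db_handler = DatabaseHandler("app.db")
#   rag = RAGSystem(processor, db_handler)
#   evaluator = EvaluationSystem(processor, db_handler)
#
#   metrics = evaluator.evaluate_rag_performance(
#       rag,
#       test_queries=["What is this video about?"],
#       reference_answers=["The video introduces the project."],
#       index_name="video_abc123",
#   )
#   print(metrics)
# ---------------------------------------------------------------------------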