File size: 3,567 Bytes
5e8a58c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# Import deepeval's base LLM class, evaluation metrics, and test-case type
from deepeval.models.base_model import DeepEvalBaseLLM
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric, ContextualRelevancyMetric, HallucinationMetric, BiasMetric, ToxicityMetric
from deepeval.test_case import LLMTestCase

class LLM(DeepEvalBaseLLM):
    """Expose a callable model through the ``DeepEvalBaseLLM`` interface.

    The wrapped ``model`` is invoked directly with the prompt, so it must be
    callable as ``model(prompt)``.
    """

    def __init__(self, model, model_name):
        """Store the wrapped model object and the name reported for it."""
        self.model = model
        self.model_name = model_name

    def load_model(self):
        """Return the model object supplied at construction time."""
        return self.model

    def generate(self, prompt: str) -> str:
        """Call the loaded model with ``prompt`` and return its result."""
        runner = self.load_model()
        return runner(prompt)

    async def a_generate(self, prompt: str) -> str:
        """Async entry point; delegates to the synchronous ``generate``."""
        # No awaiting happens here: the model call runs inline.
        return self.generate(prompt)

    def get_model_name(self):
        """Return the name supplied at construction time."""
        return self.model_name
    
def eval_answer_relevancy_metric(llm: LLM, question: str, answer: str, context: list):
    """Measure ``AnswerRelevancyMetric`` for one question/answer pair.

    Args:
        llm: Model wrapper passed to the metric as its ``model``.
        question: Used as the test case ``input``.
        answer: Used as the test case ``actual_output``.
        context: Used as the test case ``retrieval_context``.

    Returns:
        The metric's ``score`` attribute after ``measure`` has run.
    """
    metric = AnswerRelevancyMetric(model=llm, threshold=0.5, include_reason=True)
    metric.measure(
        LLMTestCase(input=question, actual_output=answer, retrieval_context=context)
    )
    return metric.score

def eval_faithfulness_metric(llm: LLM, question: str, answer: str, context: list):
    """Measure ``FaithfulnessMetric`` for one question/answer pair.

    Args:
        llm: Model wrapper passed to the metric as its ``model``.
        question: Used as the test case ``input``.
        answer: Used as the test case ``actual_output``.
        context: Used as the test case ``retrieval_context``.

    Returns:
        The metric's ``score`` attribute after ``measure`` has run.
    """
    metric = FaithfulnessMetric(model=llm, threshold=0.5, include_reason=True)
    metric.measure(
        LLMTestCase(input=question, actual_output=answer, retrieval_context=context)
    )
    return metric.score

def eval_contextual_relevancy_metric(llm: LLM, question: str, answer: str, context: list):
    """Measure ``ContextualRelevancyMetric`` for one question/answer pair.

    Unlike the other helpers in this module, this metric is built with
    ``include_reason=False``.

    Args:
        llm: Model wrapper passed to the metric as its ``model``.
        question: Used as the test case ``input``.
        answer: Used as the test case ``actual_output``.
        context: Used as the test case ``retrieval_context``.

    Returns:
        The metric's ``score`` attribute after ``measure`` has run.
    """
    metric = ContextualRelevancyMetric(model=llm, threshold=0.5, include_reason=False)
    metric.measure(
        LLMTestCase(input=question, actual_output=answer, retrieval_context=context)
    )
    return metric.score

def eval_hallucination_metric(llm: LLM, question: str, answer: str, context: list):
    """Measure ``HallucinationMetric`` for one question/answer pair.

    Args:
        llm: Model wrapper passed to the metric as its ``model``.
        question: Used as the test case ``input``.
        answer: Used as the test case ``actual_output``.
        context: Used as the test case ``context`` (not ``retrieval_context``).

    Returns:
        The metric's ``score`` attribute after ``measure`` has run.
    """
    metric = HallucinationMetric(model=llm, threshold=0.5, include_reason=True)
    metric.measure(
        LLMTestCase(input=question, actual_output=answer, context=context)
    )
    return metric.score

def eval_bias_metric(llm: LLM, question: str, answer: str):
    """Measure ``BiasMetric`` for one question/answer pair.

    Args:
        llm: Model wrapper passed to the metric as its ``model``.
        question: Used as the test case ``input``.
        answer: Used as the test case ``actual_output``.

    Returns:
        The metric's ``score`` attribute after ``measure`` has run.
    """
    metric = BiasMetric(model=llm, threshold=0.5, include_reason=True)
    metric.measure(LLMTestCase(input=question, actual_output=answer))
    return metric.score

def eval_toxicity_metric(llm: LLM, question: str, answer: str):
    """Measure ``ToxicityMetric`` for one question/answer pair.

    Args:
        llm: Model wrapper passed to the metric as its ``model``.
        question: Used as the test case ``input``.
        answer: Used as the test case ``actual_output``.

    Returns:
        The metric's ``score`` attribute after ``measure`` has run.
    """
    metric = ToxicityMetric(model=llm, threshold=0.5, include_reason=True)
    metric.measure(LLMTestCase(input=question, actual_output=answer))
    return metric.score

def eval_rag_metrics(llm: LLM, question: str, answer: str, context: list) -> dict:
    """Run the enabled RAG metrics and collect their scores by metric name.

    Args:
        llm: Model wrapper forwarded to each metric helper.
        question: The question forwarded to each metric helper.
        answer: The answer forwarded to each metric helper.
        context: The context list forwarded to each metric helper.

    Returns:
        A dict mapping metric name to the score returned by its helper, in
        the order the helpers are run.
    """
    # Hallucination, bias and toxicity scoring are currently disabled; their
    # helpers (eval_hallucination_metric, eval_bias_metric,
    # eval_toxicity_metric) are defined in this module but not called here.
    enabled = (
        ("AnswerRelevancyMetric", eval_answer_relevancy_metric),
        ("FaithfulnessMetric", eval_faithfulness_metric),
        ("ContextualRelevancyMetric", eval_contextual_relevancy_metric),
    )
    return {
        label: evaluate(llm, question, answer, context)
        for label, evaluate in enabled
    }