# imports
from deepeval.models.base_model import DeepEvalBaseLLM
from deepeval.metrics import (
    AnswerRelevancyMetric,
    FaithfulnessMetric,
    ContextualRelevancyMetric,
    HallucinationMetric,
    BiasMetric,
    ToxicityMetric,
)
from deepeval.test_case import LLMTestCase
class LLM(DeepEvalBaseLLM):
    """Thin DeepEval wrapper around any callable model that maps a prompt string to a completion string."""

    def __init__(self, model, model_name):
        self.model = model
        self.model_name = model_name

    def load_model(self):
        return self.model

    def generate(self, prompt: str) -> str:
        model = self.load_model()
        return model(prompt)

    async def a_generate(self, prompt: str) -> str:
        # DeepEval calls this for async evaluation; fall back to the sync path.
        return self.generate(prompt)

    def get_model_name(self):
        return self.model_name
def eval_answer_relevancy_metric(llm: LLM, question: str, answer: str, context: list):
    """Score how relevant the answer is to the question."""
    answer_relevancy_metric = AnswerRelevancyMetric(model=llm, threshold=0.5, include_reason=True)
    test_case = LLMTestCase(
        input=question,
        actual_output=answer,
        retrieval_context=context,
    )
    answer_relevancy_metric.measure(test_case)
    return answer_relevancy_metric.score
def eval_faithfulness_metric(llm: LLM, question: str, answer: str, context: list):
    """Score whether the answer stays faithful to the retrieval context."""
    faithfulness_metric = FaithfulnessMetric(model=llm, threshold=0.5, include_reason=True)
    test_case = LLMTestCase(
        input=question,
        actual_output=answer,
        retrieval_context=context,
    )
    faithfulness_metric.measure(test_case)
    return faithfulness_metric.score
def eval_contextual_relevancy_metric(llm: LLM, question: str, answer: str, context: list):
    """Score how relevant the retrieval context is to the question."""
    contextual_relevancy_metric = ContextualRelevancyMetric(model=llm, threshold=0.5, include_reason=False)
    test_case = LLMTestCase(
        input=question,
        actual_output=answer,
        retrieval_context=context,
    )
    contextual_relevancy_metric.measure(test_case)
    return contextual_relevancy_metric.score
def eval_hallucination_metric(llm: LLM, question: str, answer: str, context: list):
    """Score whether the answer contradicts the given context (note: this metric takes `context`, not `retrieval_context`)."""
    hallucination_metric = HallucinationMetric(model=llm, threshold=0.5, include_reason=True)
    test_case = LLMTestCase(
        input=question,
        actual_output=answer,
        context=context,
    )
    hallucination_metric.measure(test_case)
    return hallucination_metric.score
def eval_bias_metric(llm: LLM, question: str, answer: str):
    """Score the answer for biased statements."""
    bias_metric = BiasMetric(model=llm, threshold=0.5, include_reason=True)
    test_case = LLMTestCase(
        input=question,
        actual_output=answer,
    )
    bias_metric.measure(test_case)
    return bias_metric.score
def eval_toxicity_metric(llm: LLM, question: str, answer: str):
    """Score the answer for toxic content."""
    toxicity_metric = ToxicityMetric(model=llm, threshold=0.5, include_reason=True)
    test_case = LLMTestCase(
        input=question,
        actual_output=answer,
    )
    toxicity_metric.measure(test_case)
    return toxicity_metric.score
def eval_rag_metrics(llm: LLM, question: str, answer: str, context: list) -> dict:
    """Run the core RAG metrics and return a metric-name -> score mapping."""
    return {
        "AnswerRelevancyMetric": eval_answer_relevancy_metric(llm, question, answer, context),
        "FaithfulnessMetric": eval_faithfulness_metric(llm, question, answer, context),
        "ContextualRelevancyMetric": eval_contextual_relevancy_metric(llm, question, answer, context),
        # "HallucinationMetric": eval_hallucination_metric(llm, question, answer, context),
        # "BiasMetric": eval_bias_metric(llm, question, answer),
        # "ToxicityMetric": eval_toxicity_metric(llm, question, answer),
    }