from deepeval.models.base_model import DeepEvalBaseLLM
from deepeval.metrics import (
    AnswerRelevancyMetric,
    FaithfulnessMetric,
    ContextualRelevancyMetric,
    HallucinationMetric,
    BiasMetric,
    ToxicityMetric,
)
from deepeval.test_case import LLMTestCase


class LLM(DeepEvalBaseLLM):
    """Custom DeepEval judge model: wraps any callable that maps a prompt string to a response string."""

    def __init__(self, model, model_name):
        self.model = model
        self.model_name = model_name

    def load_model(self):
        return self.model

    def generate(self, prompt: str) -> str:
        model = self.load_model()
        return model(prompt)

    async def a_generate(self, prompt: str) -> str:
        # DeepEval calls this during async evaluation; fall back to the synchronous path.
        return self.generate(prompt)

    def get_model_name(self):
        return self.model_name
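
# The wrapper above only requires `model` to be a plain `prompt -> str` callable, so any
# judge backend can be adapted with a small closure. A minimal sketch, assuming the
# OpenAI Python client (>=1.0) as the judge -- an assumption, not a dependency of this module:
#
#     from openai import OpenAI
#     client = OpenAI()
#
#     def judge(prompt: str) -> str:
#         resp = client.chat.completions.create(
#             model="gpt-4o-mini",
#             messages=[{"role": "user", "content": prompt}],
#         )
#         return resp.choices[0].message.content
#
#     llm = LLM(model=judge, model_name="gpt-4o-mini")

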
def eval_answer_relevancy_metric(llm: LLM, question: str, answer: str, context: list):
    answer_relevancy_metric = AnswerRelevancyMetric(model=llm, threshold=0.5, include_reason=True)
    test_case = LLMTestCase(
        input=question,
        actual_output=answer,
        retrieval_context=context,
    )

    answer_relevancy_metric.measure(test_case)
    return answer_relevancy_metric.score


def eval_faithfulness_metric(llm: LLM, question: str, answer: str, context: list):
    faithfulness_metric = FaithfulnessMetric(model=llm, threshold=0.5, include_reason=True)
    test_case = LLMTestCase(
        input=question,
        actual_output=answer,
        retrieval_context=context,
    )

    faithfulness_metric.measure(test_case)
    return faithfulness_metric.score


def eval_contextual_relevancy_metric(llm: LLM, question: str, answer: str, context: list):
    contextual_relevancy_metric = ContextualRelevancyMetric(model=llm, threshold=0.5, include_reason=False)
    test_case = LLMTestCase(
        input=question,
        actual_output=answer,
        retrieval_context=context,
    )

    contextual_relevancy_metric.measure(test_case)
    return contextual_relevancy_metric.score


def eval_hallucination_metric(llm: LLM, question: str, answer: str, context: list):
    hallucination_metric = HallucinationMetric(model=llm, threshold=0.5, include_reason=True)
    # HallucinationMetric evaluates against `context` (ground-truth documents),
    # unlike the RAG metrics above, which use `retrieval_context`.
    test_case = LLMTestCase(
        input=question,
        actual_output=answer,
        context=context,
    )

    hallucination_metric.measure(test_case)
    return hallucination_metric.score


def eval_bias_metric(llm: LLM, question: str, answer: str):
    bias_metric = BiasMetric(model=llm, threshold=0.5, include_reason=True)
    test_case = LLMTestCase(
        input=question,
        actual_output=answer,
    )

    bias_metric.measure(test_case)
    return bias_metric.score


def eval_toxicity_metric(llm: LLM, question: str, answer: str):
    toxicity_metric = ToxicityMetric(model=llm, threshold=0.5, include_reason=True)
    test_case = LLMTestCase(
        input=question,
        actual_output=answer,
    )

    toxicity_metric.measure(test_case)
    return toxicity_metric.score


def eval_rag_metrics(llm: LLM, question: str, answer: str, context: list) -> dict:
    """Run the RAG evaluation triad (answer relevancy, faithfulness, contextual relevancy)."""
    return {
        "AnswerRelevancyMetric": eval_answer_relevancy_metric(llm, question, answer, context),
        "FaithfulnessMetric": eval_faithfulness_metric(llm, question, answer, context),
        "ContextualRelevancyMetric": eval_contextual_relevancy_metric(llm, question, answer, context),
    }
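

# A minimal end-to-end sketch, assuming an OpenAI chat model served through LangChain as the
# judge (`langchain-openai` plus an OPENAI_API_KEY are assumptions, not requirements of this
# module); any `prompt -> str` callable works in its place.
if __name__ == "__main__":
    from langchain_openai import ChatOpenAI

    chat = ChatOpenAI(model="gpt-4o-mini")  # example judge model; swap in your own

    def judge(prompt: str) -> str:
        # LangChain chat models return a message object; unwrap it to plain text.
        return chat.invoke(prompt).content

    llm = LLM(model=judge, model_name="gpt-4o-mini")
    scores = eval_rag_metrics(
        llm,
        question="What is the capital of France?",
        answer="Paris is the capital of France.",
        context=["Paris is the capital and largest city of France."],
    )
    print(scores)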