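"""Evaluate the QA pipeline on sample recipe queries with an LLM-as-a-judge.

A Groq-hosted Llama 3.3 model grades each answer for factual correctness and
for appropriate "I don't know" responses; per-question results are written to
evaluation_results.csv and a summary is printed.
"""
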
import asyncio
import csv

from langchain_groq import ChatGroq
from pydantic import BaseModel

from src.pipeline import QAPipeline
from src.settings import settings
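
# Structured verdict the evaluator LLM returns for each question.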
class LLMResponse(BaseModel):
    is_correct: bool
    is_idk: bool

pipeline = QAPipeline()
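
# Groq-hosted Llama 3.3 70B serves as the evaluation judge.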
llm = ChatGroq(
    model="llama-3.3-70b-versatile",
    max_tokens=512,
    max_retries=2,
    timeout=30000,
    api_key=settings.GROQ_API_KEY,
)
# Bind structured evaluation output
llm_evaluator = llm.with_structured_output(LLMResponse, include_raw=True)
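# With include_raw=True the evaluator returns a dict containing "raw",
# "parsed", and "parsing_error"; the parsed LLMResponse is read from
# result["parsed"] in the loop below.
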
# Sample queries and ground truths
queries = [
"What are the ingredients of tomato soup?",
"Recipe for an apple pie",
"How do you make a chocolate cake?",
"How do you make dal bhat?",
"What are the ingredients for making dhido?",
"How do you make chana masala?",
"Recipe for butter chicken",
"Recipe for ramen",
]
ground_truths = [
"Tomato Soup: tomatoes, onion, garlic, vegetable broth, cream, salt, pepper",
"Apple Pie: apples, flour, butter, sugar, cinnamon",
"I don't know",
"I don't know",
"I don't know",
"Chana Masala: chickpeas, tomato, onion, garlic, ginger, spices",
"Butter Chicken: chicken, tomato puree, cream, butter, spices, garlic, ginger",
"Ramen: ramen noodles, broth, boiled egg, green onion, soy sauce",
]
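
# Three of the ground truths are "I don't know": for those queries the
# pipeline is expected to abstain rather than guess.
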
# Evaluation function
async def evaluate_pipeline():
    correct = 0
    idk = 0
    total = len(queries)
    results = []

    for q, gt in zip(queries, ground_truths):
        response = await pipeline.answer_query_(q)
        answer = response.answer

        # Evaluation prompt
        eval_prompt = f"""
        You are an evaluator. Assess whether the model's answer is both factually correct and acknowledges lack of knowledge when necessary.

        Question: {q}
        Model's Answer: {answer}
        Ground Truth: {gt}

        Evaluate the following:
        1. Is the model's answer semantically correct when compared to the ground truth?
        2. Does the model appropriately say "I don't know" or avoid answering if the answer is not available?

        Respond in JSON with two fields:
        - is_correct: true or false
        - is_idk: true or false
        """

        result = llm_evaluator.invoke(eval_prompt)
        parsed = result["parsed"]

        # Correct = either factually correct or correctly says "I don't know" when GT also says so
        if parsed.is_correct or (parsed.is_idk and gt.strip().lower() == "i don't know"):
            correct += 1
        if parsed.is_idk:
            idk += 1

        # Log and store results
        print(
            f"Q: {q}\nA: {answer}\nGT: {gt}\nCorrect: {parsed.is_correct}, IDK: {parsed.is_idk}\n{'-' * 60}"
        )
        results.append(
            {
                "question": q,
                "model_answer": answer,
                "ground_truth": gt,
                "is_correct": parsed.is_correct,
                "is_idk": parsed.is_idk,
            }
        )

    # Save results to CSV
    with open("evaluation_results.csv", "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(
            csvfile,
            fieldnames=[
                "question",
                "model_answer",
                "ground_truth",
                "is_correct",
                "is_idk",
            ],
        )
        writer.writeheader()
        writer.writerows(results)

    # Print summary
    # Number of ground truths that expect an "I don't know" answer (3 in this set).
    idk_total = sum(gt.strip().lower() == "i don't know" for gt in ground_truths)
    print("\nEvaluation results saved to 'evaluation_results.csv'.")
    print(f"Total Correct: {correct}/{total} ({(correct / total) * 100:.2f}%)")
    print(f"'I don't know' Responses: {idk}/{idk_total} ({(idk / idk_total) * 100:.2f}%)")

# Entry point
if __name__ == "__main__":
    asyncio.run(evaluate_pipeline())