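"""Evaluate QAPipeline answers on a small recipe QA set.

A ChatGroq model acts as an LLM judge: for each query it checks whether the
pipeline's answer is semantically correct against the ground truth and whether
the pipeline says "I don't know" when it should. Results are printed and saved
to evaluation_results.csv.
"""
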
import asyncio
import csv
from langchain_groq import ChatGroq
from pydantic import BaseModel
from src.pipeline import QAPipeline
from src.settings import settings


class LLMResponse(BaseModel):
    """Structured verdict returned by the evaluator LLM for one answer."""

    is_correct: bool  # the answer is semantically consistent with the ground truth
    is_idk: bool  # the answer acknowledges not knowing


pipeline = QAPipeline()

llm = ChatGroq(
    model="llama-3.3-70b-versatile",
    max_tokens=512,
    max_retries=2,
    timeout=30,  # request timeout, in seconds
    api_key=settings.GROQ_API_KEY,
)

# Have the evaluator LLM return an LLMResponse; include_raw=True also keeps the
# raw message so parsing failures can be inspected.
llm_evaluator = llm.with_structured_output(LLMResponse, include_raw=True)

# Sample queries and their expected answers; a ground truth of "I don't know"
# marks a question the pipeline is not expected to be able to answer.
queries = [
    "What are the ingredients of tomato soup?",
    "Recipe for an apple pie",
    "How do you make a chocolate cake?",
    "How do you make dal bhat?",
    "What are the ingredients for making dhido?",
    "How do you make chana masala?",
    "Recipe for butter chicken",
    "Recipe for ramen",
]

ground_truths = [
    "Tomato Soup: tomatoes, onion, garlic, vegetable broth, cream, salt, pepper",
    "Apple Pie: apples, flour, butter, sugar, cinnamon",
    "I don't know",
    "I don't know",
    "I don't know",
    "Chana Masala: chickpeas, tomato, onion, garlic, ginger, spices",
    "Butter Chicken: chicken, tomato puree, cream, butter, spices, garlic, ginger",
    "Ramen: ramen noodles, broth, boiled egg, green onion, soy sauce",
]

# Evaluation function
async def evaluate_pipeline():
    correct = 0
    idk = 0
    total = len(queries)
    results = []

    for q, gt in zip(queries, ground_truths):
        response = await pipeline.answer_query_(q)
        answer = response.answer

        # Evaluation prompt
        eval_prompt = f"""
You are an evaluator. Assess whether the model's answer is factually correct and whether it appropriately acknowledges a lack of knowledge when necessary.

Question: {q}
Model's Answer: {answer}
Ground Truth: {gt}

Evaluate the following:
1. Is the model's answer semantically correct when compared to the ground truth?
2. Does the model appropriately say "I don't know" or avoid answering if the answer is not available?

Respond in JSON with two fields:
- is_correct: true or false
- is_idk: true or false
"""

        # Use the async API so the judge call does not block the event loop
        result = await llm_evaluator.ainvoke(eval_prompt)
        parsed = result["parsed"]
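        # Note: with include_raw=True the result dict also carries a "parsing_error"
        # key, and "parsed" is None when the structured output could not be parsed;
        # skip such cases rather than crashing below.
        if parsed is None:
            print(f"Skipping {q!r}: evaluator output could not be parsed ({result['parsing_error']})")
            continue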

        # Correct = either factually correct or correctly says "I don't know" when GT also says so
        if parsed.is_correct or (parsed.is_idk and gt.strip().lower() == "i don't know"):
            correct += 1
        if parsed.is_idk:
            idk += 1

        # Log and store results
        print(
            f"Q: {q}\nA: {answer}\nGT: {gt}\nCorrect: {parsed.is_correct}, IDK: {parsed.is_idk}\n{'-' * 60}"
        )

        results.append(
            {
                "question": q,
                "model_answer": answer,
                "ground_truth": gt,
                "is_correct": parsed.is_correct,
                "is_idk": parsed.is_idk,
            }
        )

    # Save results to CSV
    with open("evaluation_results.csv", "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(
            csvfile,
            fieldnames=[
                "question",
                "model_answer",
                "ground_truth",
                "is_correct",
                "is_idk",
            ],
        )
        writer.writeheader()
        writer.writerows(results)

    # Print summary
    print(f"\nEvaluation results saved to 'evaluation_results.csv'.")
    print(f"Total Correct: {correct}/{total} ({(correct / total) * 100:.2f}%)")
    print(f"'I don't know' Responses: {idk}/{3} ({(idk / 3) * 100:.2f}%)") #here 3 because there a re 3 total i dont know response 

# Entry point
if __name__ == "__main__":
    asyncio.run(evaluate_pipeline())