File size: 2,599 Bytes
f5776d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89

import dsp
import tqdm
import pandas as pd

try:
    from IPython.display import display as ipython_display
except ImportError:
    ipython_display = print
from dsp.utils import EM, F1, HotPotF1


def evaluateRetrieval(fn, dev, metric=None):
    data = []

    for example in tqdm.tqdm(dev):
        question = example.question
        prediction = fn(question)

        d = dict(example)

        # d['prediction'] = prediction.answer
        d['correct'] =  dsp.passage_match(prediction.context, example.answer)
        data.append(d)

    df = pd.DataFrame(data)

    percentage = round(100.0 * df['correct'].sum() / len(dev), 1)
    print(f"Answered {df['correct'].sum()} / {len(dev)} ({percentage}%) correctly.")
    df['correct'] = df['correct'].apply(lambda x: 'βœ”οΈ' if x else '❌')

    pd.options.display.max_colwidth = None
    ipython_display(df.style.set_table_styles([{'selector': 'th', 'props': [('text-align', 'left')]}, {'selector': 'td', 'props': [('text-align', 'left')]}]))


def evaluateAnswer(fn, dev, metric=EM):
    data = []

    for example in tqdm.tqdm(dev):
        question = example.question
        prediction = fn(question)

        d = dict(example)

        pred = prediction.answer

        d['prediction'] = pred
        d['correct'] = metric(pred, example.answer)
        data.append(d)

    df = pd.DataFrame(data)

    percentage = round(100.0 * df['correct'].sum() / len(dev), 1)
    print(f"Answered {df['correct'].sum()} / {len(dev)} ({percentage}%) correctly.")
    df['correct'] = df['correct'].apply(lambda x: 'βœ”οΈ' if x else '❌')

    pd.options.display.max_colwidth = None
    ipython_display(df.style.set_table_styles([{'selector': 'th', 'props': [('text-align', 'left')]}, {'selector': 'td', 'props': [('text-align', 'left')]}]))



def evaluate(fn, dev, metric=EM):
    data = []

    for example in tqdm.tqdm(dev):
        question = example.question
        prediction = fn(question)

        d = dict(example)

        pred = prediction#.answer

        d['prediction'] = pred
        d['correct'] = metric(pred, example.answer)
        data.append(d)

    df = pd.DataFrame(data)

    percentage = round(100.0 * df['correct'].sum() / len(dev), 1)
    print(f"Answered {df['correct'].sum()} / {len(dev)} ({percentage}%) correctly.")
    df['correct'] = df['correct'].apply(lambda x: 'βœ”οΈ' if x else '❌')

    pd.options.display.max_colwidth = None
    ipython_display(df.style.set_table_styles([{'selector': 'th', 'props': [('text-align', 'left')]}, {'selector': 'td', 'props': [('text-align', 'left')]}]))

    return percentage