File size: 3,791 Bytes
aea4c4e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
"""CTFAIA 2024 dataset."""

import json
import os

import datasets

_CITATION = """ """

_DESCRIPTION = """ """

_HOMEPAGE = ""

_LICENSE = ""

_NAMES = [
    "2024_all",
    "2024_level1",
    "2024_level2",
    "2024_level3",
]

YEAR_TO_LEVELS = {"2024": [1, 2, 3]}

separator = "_"


class CTFAIA_dataset(datasets.GeneratorBasedBuilder):
    VERSION = datasets.Version("0.0.1")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(name=name, version=version, description=name)
        for name, version in zip(_NAMES, [VERSION] * len(_NAMES))
    ]

    DEFAULT_CONFIG_NAME = "2024_all"

    def _info(self):
        features = datasets.Features(
            {
                "task_name": datasets.Value("string"),
                "Question": datasets.Value("string"),
                "url": datasets.Value("string"),
                "Level": datasets.Value("string"),
                "Final answer": datasets.Value("string"),  # ? for test values
                "Total score": datasets.Value("string"),
                "prompt": datasets.Value("string"),
                "type": datasets.Value("string"),
                "score": datasets.Sequence({
                    "question": datasets.Value("string"),
                    "score": datasets.Value("string"),
                    "answer": datasets.Sequence(datasets.Value("string"))
                }),
                "Annex": datasets.Value("string"),
                "Annex_path": datasets.Value("string"),  # generated here
                "Annotator Metadata": {
                    "Reference URL": datasets.Value("string"),
                    "Steps": datasets.Sequence(datasets.Value("string")),
                    "Optional Tools": datasets.Sequence(datasets.Value("string")),
                }  # "",
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        year, level_name = self.config.name.split(separator)
        if level_name == "all":
            levels = YEAR_TO_LEVELS[year]
        else:
            level_name = int(level_name.split("level")[1])
            levels = [level_name]
        print(year, level_name)
        output = []
        for split in ["test", "validation"]:
            root_file = dl_manager.download(os.path.join(year, split, "metadata.jsonl"))
            test_attached_files = {"": ""}
            with open(root_file, "r", encoding="utf-8") as f:
                for line in f:
                    cur_line = json.loads(line)
                    if cur_line["Level"] in levels and cur_line["Annex"] != "":
                        attached_file_name = cur_line["Annex"]
                        attached_file = dl_manager.download(os.path.join(year, split, attached_file_name))
                        test_attached_files[attached_file_name] = attached_file

            output.append(
                datasets.SplitGenerator(
                    name=getattr(datasets.Split, split.upper()),
                    gen_kwargs={"root_file": root_file, "attached_files": test_attached_files, "levels": levels},
                )
            )
        return output

    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
    def _generate_examples(self, root_file: str, attached_files: dict, levels: list[int]):
        with open(root_file, "r", encoding="utf-8") as f:
            for key, line in enumerate(f):
                cur_line = json.loads(line)
                if cur_line["Level"] in levels:
                    cur_line["Annex_path"] = attached_files[cur_line["Annex"]]
                    yield key, cur_line