File size: 4,576 Bytes
7ccf9d4
6f0c9ff
6454c0e
3adea5e
f05dc8f
3adea5e
089a447
133c6d8
7ccf9d4
 
6454c0e
5289522
089a447
ea047ad
133c6d8
ea047ad
bae4131
 
 
ea047ad
c272faa
089a447
 
 
 
c272faa
089a447
ea047ad
bae4131
 
ea047ad
089a447
ea047ad
 
 
6454c0e
 
 
b0d8978
3adea5e
 
6454c0e
 
ea047ad
c272faa
ea047ad
 
089a447
c272faa
 
 
6454c0e
 
ea047ad
6454c0e
c272faa
257cf03
c272faa
 
257cf03
 
6454c0e
257cf03
 
6454c0e
 
 
089a447
ea047ad
 
 
 
c272faa
ea047ad
6454c0e
ea047ad
3d76e98
ea047ad
 
 
 
 
 
6454c0e
ea047ad
 
089a447
c272faa
 
 
089a447
6454c0e
bae4131
089a447
f05dc8f
 
 
 
 
 
 
 
 
6f0c9ff
 
 
f05dc8f
 
 
6f0c9ff
 
 
f05dc8f
133c6d8
f05dc8f
 
133c6d8
7ccf9d4
089a447
133c6d8
7ccf9d4
133c6d8
 
 
7ccf9d4
089a447
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
from loguru import logger
from ruamel.yaml import YAML

from yourbench_space import PATH
from yourbench_space.utils import to_commentable_yaml


def generate_base_config(hf_org: str, hf_dataset_name: str, session_uid: str):
    """Build the default pipeline configuration as a plain dictionary.

    Args:
        hf_org: Hugging Face organization that will own the dataset.
        hf_dataset_name: Name of the dataset to create/update on the Hub.
        session_uid: Unique session identifier used to namespace on-disk paths.

    Returns:
        A nested dict covering Hub settings, the model list, per-stage model
        roles, and every pipeline stage with its default parameters.
    """
    # All session artifacts live under a per-session directory.
    session_root = f"{PATH}/{session_uid}"
    ingested_dir = f"{session_root}/ingested"

    # Hub-related settings; the token is resolved from the environment.
    hf_configuration = {
        "token": "$HF_TOKEN",
        "hf_organization": hf_org,
        "private": True,
        "hf_dataset_name": hf_dataset_name,
        "concat_if_exist": False,
    }

    # Models available to the pipeline stages.
    model_list = [
        {
            "model_name": "Qwen/Qwen2.5-VL-72B-Instruct",
            "provider": "nebius",
            "max_concurrent_requests": 32,
        },
        {
            "model_name": "Qwen/Qwen2.5-72B-Instruct",
            "provider": "nebius",
            "max_concurrent_requests": 32,
        },
    ]

    # Which model serves each pipeline stage.
    model_roles = {
        "ingestion": ["Qwen/Qwen2.5-VL-72B-Instruct"],
        "summarization": ["Qwen/Qwen2.5-72B-Instruct"],
        "chunking": ["intfloat/multilingual-e5-large-instruct"],
        "single_shot_question_generation": ["Qwen/Qwen2.5-72B-Instruct"],
        "multi_hop_question_generation": ["Qwen/Qwen2.5-72B-Instruct"],
    }

    # Per-stage configuration; every stage is enabled by default.
    pipeline = {
        "ingestion": {
            "run": True,
            "source_documents_dir": f"{session_root}/uploaded_files/",
            "output_dir": ingested_dir,
        },
        "upload_ingest_to_hub": {
            "run": True,
            # Reads from the ingestion stage's output directory.
            "source_documents_dir": ingested_dir,
        },
        "summarization": {
            "run": True,
            "max_tokens": 16384,
            "token_overlap": 64,
            "encoding_name": "cl100k_base",
        },
        "chunking": {
            "run": True,
            "chunking_configuration": {
                "chunking_mode": "fast_chunking",
                "l_max_tokens": 512,
                "token_overlap": 64,
                "encoding_name": "cl100k_base",
                "l_min_tokens": 256,
                "tau_threshold": 0.3,
                "h_min": 2,
                "h_max": 5,
                "num_multihops_factor": 1,
            },
        },
        "single_shot_question_generation": {
            "run": True,
            "additional_instructions": "Generate questions to test a curious adult",
            "chunk_sampling": {
                "mode": "count",
                "value": 5,
                "random_seed": 49,
            },
        },
        "multi_hop_question_generation": {
            "run": True,
            "additional_instructions": "Generate questions to test a curious adult",
            "chunk_sampling": {
                "mode": "percentage",
                "value": 0.3,
                "random_seed": 42,
            },
        },
        "lighteval": {
            "run": True,
        },
        "citation_score_filtering": {
            "run": True,
        },
    }

    return {
        "hf_configuration": hf_configuration,
        "model_list": model_list,
        "model_roles": model_roles,
        "pipeline": pipeline,
    }


def save_yaml_file(config: dict, path: str):
    """Saves the given config dictionary to a YAML file with helpful comments.

    Args:
        config: Plain configuration dict (as produced by ``generate_base_config``).
        path: Destination file path for the YAML output.

    Returns:
        The same ``path`` that was written, for convenient chaining/logging.
    """
    yaml = YAML()
    yaml.indent(mapping=2, sequence=4, offset=2)

    # Convert plain dicts into ruamel comment-capable containers so we can
    # attach inline comments below.
    config_cm = to_commentable_yaml(config)

    # Now we can add inline comments for the fields users most often adjust.
    ingestion = config_cm["pipeline"]["ingestion"]
    ingestion.yaml_set_comment_before_after_key(
        "source_documents_dir", before="⚠️ Change this path to match your local directory"
    )
    ingestion.yaml_set_comment_before_after_key("output_dir", before="⚠️ This is where ingested data will be saved")

    upload = config_cm["pipeline"]["upload_ingest_to_hub"]
    upload.yaml_set_comment_before_after_key(
        "source_documents_dir", before="⚠️ Same as output_dir from ingestion — adjust as needed"
    )

    # Explicit UTF-8: the comments above contain non-ASCII characters ("⚠️"),
    # which would raise UnicodeEncodeError under locale default encodings
    # such as cp1252 on Windows.
    with open(path, "w", encoding="utf-8") as file:
        yaml.dump(config_cm, file)

    return path


def generate_and_save_config(hf_org: str, hf_name: str, session_uid: str, config_path: str):
    """Generates and saves the YAML configuration file.

    Args:
        hf_org: Hugging Face organization name.
        hf_name: Dataset name on the Hub.
        session_uid: Session identifier used to namespace working directories.
        config_path: Destination path for the generated YAML config.

    Returns:
        The path the configuration was saved to.
    """
    logger.debug(f"Generating config with org: {hf_org}, dataset name: {hf_name}")
    saved_path = save_yaml_file(generate_base_config(hf_org, hf_name, session_uid), config_path)
    logger.success(f"Config saved at: {saved_path}")
    return saved_path