Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 4,576 Bytes
7ccf9d4 6f0c9ff 6454c0e 3adea5e f05dc8f 3adea5e 089a447 133c6d8 7ccf9d4 6454c0e 5289522 089a447 ea047ad 133c6d8 ea047ad bae4131 ea047ad c272faa 089a447 c272faa 089a447 ea047ad bae4131 ea047ad 089a447 ea047ad 6454c0e b0d8978 3adea5e 6454c0e ea047ad c272faa ea047ad 089a447 c272faa 6454c0e ea047ad 6454c0e c272faa 257cf03 c272faa 257cf03 6454c0e 257cf03 6454c0e 089a447 ea047ad c272faa ea047ad 6454c0e ea047ad 3d76e98 ea047ad 6454c0e ea047ad 089a447 c272faa 089a447 6454c0e bae4131 089a447 f05dc8f 6f0c9ff f05dc8f 6f0c9ff f05dc8f 133c6d8 f05dc8f 133c6d8 7ccf9d4 089a447 133c6d8 7ccf9d4 133c6d8 7ccf9d4 089a447 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 |
from loguru import logger
from ruamel.yaml import YAML
from yourbench_space import PATH
from yourbench_space.utils import to_commentable_yaml
def generate_base_config(hf_org: str, hf_dataset_name: str, session_uid: str):
    """Build the default configuration dictionary for one session.

    The result has four top-level sections, in this order:
    ``hf_configuration``, ``model_list``, ``model_roles`` and ``pipeline``.
    Every pipeline stage is enabled (``"run": True``).

    Args:
        hf_org: Stored as ``hf_configuration.hf_organization``.
        hf_dataset_name: Stored as ``hf_configuration.hf_dataset_name``.
        session_uid: Path component used to build the per-session
            directories under ``PATH``.

    Returns:
        A freshly built, nested ``dict`` of plain Python values.
    """
    # Model identifiers referenced from both the model list and the role map.
    vision_model = "Qwen/Qwen2.5-VL-72B-Instruct"
    text_model = "Qwen/Qwen2.5-72B-Instruct"
    embedding_model = "intfloat/multilingual-e5-large-instruct"

    # Values shared by several pipeline stages.
    tokenizer_name = "cl100k_base"
    question_instructions = "Generate questions to test a curious adult"

    # Per-session directories, all rooted at PATH/<session_uid>.
    session_root = f"{PATH}/{session_uid}"
    uploads_dir = f"{session_root}/uploaded_files/"
    ingested_dir = f"{session_root}/ingested"

    hf_configuration = {
        # Literal "$HF_TOKEN" string; presumably resolved by the consumer
        # of the config -- TODO confirm.
        "token": "$HF_TOKEN",
        "hf_organization": hf_org,
        "private": True,
        "hf_dataset_name": hf_dataset_name,
        "concat_if_exist": False,
    }

    # One entry per hosted model, all with the same provider settings.
    model_list = [
        {
            "model_name": model_name,
            "provider": "nebius",
            "max_concurrent_requests": 32,
        }
        for model_name in (vision_model, text_model)
    ]

    # Each role gets its own list object so nothing is shared between roles.
    model_roles = {
        "ingestion": [vision_model],
        "summarization": [text_model],
        "chunking": [embedding_model],
        "single_shot_question_generation": [text_model],
        "multi_hop_question_generation": [text_model],
    }

    ingestion_stage = {
        "run": True,
        "source_documents_dir": uploads_dir,
        "output_dir": ingested_dir,
    }

    # Reads from the directory that the ingestion stage writes to.
    upload_stage = {
        "run": True,
        "source_documents_dir": ingested_dir,
    }

    summarization_stage = {
        "run": True,
        "max_tokens": 16384,
        "token_overlap": 64,
        "encoding_name": tokenizer_name,
    }

    chunking_stage = {
        "run": True,
        "chunking_configuration": {
            "chunking_mode": "fast_chunking",
            "l_max_tokens": 512,
            "token_overlap": 64,
            "encoding_name": tokenizer_name,
            "l_min_tokens": 256,
            "tau_threshold": 0.3,
            "h_min": 2,
            "h_max": 5,
            "num_multihops_factor": 1,
        },
    }

    single_shot_stage = {
        "run": True,
        "additional_instructions": question_instructions,
        "chunk_sampling": {
            "mode": "count",
            "value": 5,
            "random_seed": 49,
        },
    }

    multi_hop_stage = {
        "run": True,
        "additional_instructions": question_instructions,
        "chunk_sampling": {
            "mode": "percentage",
            "value": 0.3,
            "random_seed": 42,
        },
    }

    # Insertion order here is the order in which stages appear in the output.
    pipeline = {
        "ingestion": ingestion_stage,
        "upload_ingest_to_hub": upload_stage,
        "summarization": summarization_stage,
        "chunking": chunking_stage,
        "single_shot_question_generation": single_shot_stage,
        "multi_hop_question_generation": multi_hop_stage,
        "lighteval": {"run": True},
        "citation_score_filtering": {"run": True},
    }

    return {
        "hf_configuration": hf_configuration,
        "model_list": model_list,
        "model_roles": model_roles,
        "pipeline": pipeline,
    }
def save_yaml_file(config: dict, path: str):
    """Write ``config`` to ``path`` as YAML, with warning comments on path keys.

    Comments are attached before ``source_documents_dir`` and ``output_dir``
    in ``pipeline.ingestion`` and before ``source_documents_dir`` in
    ``pipeline.upload_ingest_to_hub``.

    Args:
        config: Configuration mapping. It must contain the
            ``pipeline.ingestion`` and ``pipeline.upload_ingest_to_hub``
            sections; a missing section raises ``KeyError``.
        path: Destination file. It is opened in write mode, so an existing
            file is overwritten.

    Returns:
        The ``path`` argument, unchanged.
    """
    yaml = YAML()
    yaml.indent(mapping=2, sequence=4, offset=2)

    # NOTE(review): to_commentable_yaml is defined elsewhere. Its result must
    # provide yaml_set_comment_before_after_key on nested mappings
    # (presumably ruamel CommentedMap objects) -- confirm against the helper.
    config_cm = to_commentable_yaml(config)

    # Attach the warnings as comments placed *before* the relevant keys.
    ingestion = config_cm["pipeline"]["ingestion"]
    ingestion.yaml_set_comment_before_after_key(
        "source_documents_dir", before="⚠️ Change this path to match your local directory"
    )
    ingestion.yaml_set_comment_before_after_key("output_dir", before="⚠️ This is where ingested data will be saved")

    upload = config_cm["pipeline"]["upload_ingest_to_hub"]
    upload.yaml_set_comment_before_after_key(
        "source_documents_dir", before="⚠️ Same as output_dir from ingestion — adjust as needed"
    )

    # The comments above contain non-ASCII characters. Pin UTF-8 so the write
    # does not depend on the platform's locale encoding, which could raise
    # UnicodeEncodeError on a non-UTF-8 default such as cp1252.
    with open(path, "w", encoding="utf-8") as file:
        yaml.dump(config_cm, file)

    return path
def generate_and_save_config(hf_org: str, hf_name: str, session_uid: str, config_path: str):
    """Build the base configuration and write it to ``config_path`` as YAML.

    Args:
        hf_org: Passed to ``generate_base_config`` as the organization.
        hf_name: Passed to ``generate_base_config`` as the dataset name.
        session_uid: Passed to ``generate_base_config`` as the session id.
        config_path: Destination handed to ``save_yaml_file``.

    Returns:
        Whatever ``save_yaml_file`` returns for the written file.
    """
    logger.debug(f"Generating config with org: {hf_org}, dataset name: {hf_name}")

    # Build and persist in one step; keep only the resulting location.
    saved_to = save_yaml_file(
        generate_base_config(hf_org, hf_name, session_uid),
        config_path,
    )

    logger.success(f"Config saved at: {saved_to}")
    return saved_to
|