from loguru import logger
from ruamel.yaml import YAML

from yourbench_space import PATH
from yourbench_space.utils import to_commentable_yaml


def generate_base_config(hf_org: str, hf_dataset_name: str, session_uid: str):
    """Creates the base config dictionary"""
    return {
        "hf_configuration": {
            "token": "$HF_TOKEN",
            "hf_organization": hf_org,
            "private": True,
            "hf_dataset_name": hf_dataset_name,
            "concat_if_exist": False,
        },
        "model_list": [
            {
                "model_name": "Qwen/Qwen2.5-VL-72B-Instruct",
                "provider": "nebius",
                "max_concurrent_requests": 32,
            },
            {
                "model_name": "Qwen/Qwen2.5-72B-Instruct",
                "provider": "nebius",
                "max_concurrent_requests": 32,
            },
        ],
        "model_roles": {
            "ingestion": ["Qwen/Qwen2.5-VL-72B-Instruct"],
            "summarization": ["Qwen/Qwen2.5-72B-Instruct"],
            "chunking": ["intfloat/multilingual-e5-large-instruct"],
            "single_shot_question_generation": ["Qwen/Qwen2.5-72B-Instruct"],
            "multi_hop_question_generation": ["Qwen/Qwen2.5-72B-Instruct"],
        },
        "pipeline": {
            "ingestion": {
                "run": True,
                "source_documents_dir": f"{PATH}/{session_uid}/uploaded_files/",
                "output_dir": f"{PATH}/{session_uid}/ingested",
            },
            "upload_ingest_to_hub": {
                "run": True,
                "source_documents_dir": f"{PATH}/{session_uid}/ingested",
            },
            "summarization": {
                "run": True,
                "max_tokens": 16384,
                "token_overlap": 64,
                "encoding_name": "cl100k_base",
            },
            "chunking": {
                "run": True,
                "chunking_configuration": {
                    "chunking_mode": "fast_chunking",
                    "l_max_tokens": 512,
                    "token_overlap": 64,
                    "encoding_name": "cl100k_base",
                    "l_min_tokens": 256,
                    "tau_threshold": 0.3,
                    "h_min": 2,
                    "h_max": 5,
                    "num_multihops_factor": 1,
                },
            },
            "single_shot_question_generation": {
                "run": True,
                "additional_instructions": "Generate questions to test a curious adult",
                "chunk_sampling": {
                    "mode": "count",
                    "value": 5,
                    "random_seed": 49,
                },
            },
            "multi_hop_question_generation": {
                "run": True,
                "additional_instructions": "Generate questions to test a curious adult",
                "chunk_sampling": {
                    "mode": "percentage",
                    "value": 0.3,
                    "random_seed": 42,
                },
            },
            "lighteval": {
                "run": True,
            },
            "citation_score_filtering": {
                "run": True,
            },
        },
    }


def save_yaml_file(config: dict, path: str):
    """Saves the given config dictionary to a YAML file with helpful comments."""
    yaml = YAML()
    yaml.indent(mapping=2, sequence=4, offset=2)

    config_cm = to_commentable_yaml(config)

    # Now we can add inline comments
    ingestion = config_cm["pipeline"]["ingestion"]
    ingestion.yaml_set_comment_before_after_key(
        "source_documents_dir", before="⚠️ Change this path to match your local directory"
    )
    ingestion.yaml_set_comment_before_after_key("output_dir", before="⚠️ This is where ingested data will be saved")

    upload = config_cm["pipeline"]["upload_ingest_to_hub"]
    upload.yaml_set_comment_before_after_key(
        "source_documents_dir", before="⚠️ Same as output_dir from ingestion — adjust as needed"
    )

    with open(path, "w") as file:
        yaml.dump(config_cm, file)

    return path


def generate_and_save_config(hf_org: str, hf_name: str, session_uid: str, config_path: str):
    """Generates and saves the YAML configuration file"""
    logger.debug(f"Generating config with org: {hf_org}, dataset name: {hf_name}")
    config = generate_base_config(hf_org, hf_name, session_uid)
    file_path = save_yaml_file(config, config_path)
    logger.success(f"Config saved at: {file_path}")
    return file_path
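

# Usage sketch (illustrative, not part of the original module): a minimal example
# of how these helpers might be wired together. The organization, dataset name,
# and config output path below are hypothetical placeholder values, not values
# taken from the real application.
if __name__ == "__main__":
    import uuid

    # Each session gets its own directory under PATH, mirroring how the
    # generated config references {PATH}/{session_uid}/... paths above.
    session_uid = str(uuid.uuid4())
    generate_and_save_config(
        hf_org="my-hf-org",
        hf_name="my-dataset",
        session_uid=session_uid,
        config_path=f"{PATH}/{session_uid}/config.yaml",
    )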