#!/usr/bin/env python3
import datetime
import os
import subprocess
import tempfile
from pathlib import Path

import pandas as pd
import torch
import yaml
from huggingface_hub import HfApi, hf_hub_download, login
from huggingface_hub.utils import EntryNotFoundError
from lm_eval import evaluator
from lm_eval.models.huggingface import HFLM
from peft import PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
)

CONFIGS = []

# ───── Load all configs ─────
if Path("adapters.yaml").exists():
    with open("adapters.yaml") as f:
        CONFIGS.extend(yaml.safe_load(f)["adapters"])
for yml in Path("manifests").glob("*.yaml"):
    with open(yml) as f:
        CONFIGS.append(yaml.safe_load(f))
if not CONFIGS:
    raise RuntimeError("No adapter configs found in adapters.yaml or manifests/")

# ───── Hugging Face auth ─────
token = os.getenv("HF_TOKEN")
if not token or token == "***":
    raise RuntimeError("HF_TOKEN secret is missing.")
login(token)
DATASET_REPO = os.environ["HF_DATASET_REPO"]
api = HfApi()

# ───── Evaluate each adapter ─────
all_rows = []
METRICS_TO_KEEP = {"acc", "accuracy", "acc_stderr", "f1", "exact_match"}

for cfg in CONFIGS:
    base_model_id = cfg["base_model"]
    adapter_repo = cfg["adapter_repo"]
    adapter_type = cfg.get("adapter_type", "LoRA")
    tasks = cfg["tasks"]

    print(f"\nLoading base model: {base_model_id}")
    tokenizer = AutoTokenizer.from_pretrained(base_model_id, use_fast=True)

    # Try a causal-LM head first; fall back to a sequence-classification head.
    # Catch specific exceptions rather than a bare `except:`, which would also
    # swallow KeyboardInterrupt and mask unrelated failures.
    try:
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_id, trust_remote_code=True, use_safetensors=True
        )
        is_encoder = False
    except (OSError, ValueError):
        base_model = AutoModelForSequenceClassification.from_pretrained(
            base_model_id, trust_remote_code=True, use_safetensors=True
        )
        is_encoder = True

    # Merge adapter weights into the base model so lm-eval sees a plain checkpoint.
    peft_model = PeftModel.from_pretrained(base_model, adapter_repo)
    merged_model = peft_model.merge_and_unload()

    device = "cuda" if torch.cuda.is_available() else "cpu"
    merged_model.to(device)
    merged_model.eval()
    if torch.cuda.is_available():
        # Reset the peak counter so memory is reported per adapter, not cumulatively.
        torch.cuda.reset_peak_memory_stats(device)

    with tempfile.TemporaryDirectory() as td:
        merged_model.save_pretrained(td)
        tokenizer.save_pretrained(td)
        hf_lm = HFLM(
            pretrained=td,
            batch_size=16 if is_encoder else 8,
            device=device,
        )
        # Evaluate inside the `with` block so the temp checkpoint still exists.
        res = evaluator.simple_evaluate(model=hf_lm, tasks=tasks)

    meta = {
        "model_id": adapter_repo,
        "adapter_type": adapter_type,
        "trainable_params": cfg.get("trainable_params"),
        "peak_gpu_mem_mb": (
            torch.cuda.max_memory_allocated(device) // 1024**2
            if torch.cuda.is_available()
            else None
        ),
        # datetime.utcnow() is deprecated; use an explicit UTC timezone.
        "run_date": datetime.datetime.now(datetime.timezone.utc).isoformat(
            timespec="seconds"
        ),
        "commit_sha": subprocess.check_output(["git", "rev-parse", "HEAD"])
        .strip()
        .decode(),
    }

    for task, scores in res["results"].items():
        for metric, value in scores.items():
            if metric not in METRICS_TO_KEEP:
                continue
            all_rows.append({**meta, "task": task, "metric": metric, "value": value})

    # Release GPU memory before loading the next adapter.
    del base_model, peft_model, merged_model, hf_lm
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# ───── Merge and upload results ─────
df_new = pd.DataFrame(all_rows)

with tempfile.TemporaryDirectory() as tmp:
    try:
        current_path = hf_hub_download(
            repo_id=DATASET_REPO,
            filename="data/peft_bench.parquet",
            repo_type="dataset",
            local_dir=tmp,
        )
        df_existing = pd.read_parquet(current_path)
    except EntryNotFoundError:
        # First run: the dataset repo has no parquet file yet.
        df_existing = pd.DataFrame()

df_combined = pd.concat([df_existing, df_new], ignore_index=True)
df_combined["value"] = pd.to_numeric(df_combined["value"], errors="coerce")
df_combined = df_combined.sort_values("run_date")
# The original printed "after dedup" without ever deduplicating. Keep the
# newest row when the same commit re-reports a metric (e.g. a retried CI job);
# the subset key is a judgment call, so adjust it if a different retention
# policy is wanted.
df_combined = df_combined.drop_duplicates(
    subset=["model_id", "task", "metric", "commit_sha"], keep="last"
)

print("Existing rows:", len(df_existing))
print("New rows:", len(df_new))
print("Combined rows (pre-dedup):", len(df_existing) + len(df_new))
print("Final rows (after dedup):", len(df_combined))

out = Path("peft_bench.parquet")
df_combined.to_parquet(out, index=False)
api.upload_file(
    path_or_fileobj=out,
path_in_repo="data/peft_bench.parquet", repo_id=DATASET_REPO, repo_type="dataset", commit_message=f"Add {len(CONFIGS)} new adapter run(s)", )