Mdrnfox commited on
Commit
bba84bb
Β·
verified Β·
1 Parent(s): 1d0493c

Create run_eval.py

Browse files
Files changed (1) hide show
  1. run_eval.py +122 -0
run_eval.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import datetime, os, subprocess, tempfile
3
+ from pathlib import Path
4
+
5
+ import pandas as pd, yaml, torch
6
+ from huggingface_hub import HfApi, login, hf_hub_download
7
+ from lm_eval import evaluator
8
+ from lm_eval.models.huggingface import HFLM
9
+ from peft import PeftModel
10
+ from transformers import (
11
+ AutoModelForCausalLM,
12
+ AutoModelForSequenceClassification,
13
+ AutoTokenizer,
14
+ )
15
+
16
# ───── Load all configs ─────
# Adapter configs come from two places: the optional top-level adapters.yaml
# (key "adapters" holds a list) and one config per *.yaml file in manifests/.
CONFIGS = []

adapters_path = Path("adapters.yaml")
if adapters_path.exists():
    # Context manager closes the handle deterministically; the original
    # `yaml.safe_load(open(...))` leaked the file descriptor.
    with adapters_path.open() as fh:
        CONFIGS.extend(yaml.safe_load(fh)["adapters"])

for yml in Path("manifests").glob("*.yaml"):
    with yml.open() as fh:
        CONFIGS.append(yaml.safe_load(fh))

if not CONFIGS:
    raise RuntimeError("No adapter configs found in adapters.yaml or manifests/")
27
+
28
# ───── Hugging Face auth ─────
# Fail fast when the token is absent, empty, or masked; "***" is the
# placeholder CI systems substitute for a redacted secret.
hf_token = os.getenv("HF_TOKEN")
if hf_token in (None, "", "***"):
    raise RuntimeError("HF_TOKEN secret is missing.")
login(hf_token)

# Target dataset repo for the benchmark results; required env var.
DATASET_REPO = os.environ["HF_DATASET_REPO"]
api = HfApi()
36
+
37
# ───── Evaluate each adapter ─────
# One row per (adapter, task, metric) is accumulated into all_rows.
all_rows = []
METRICS_TO_KEEP = {"acc", "accuracy", "acc_stderr", "f1", "exact_match"}

for cfg in CONFIGS:
    base_model_id = cfg["base_model"]
    adapter_repo = cfg["adapter_repo"]
    adapter_type = cfg.get("adapter_type", "LoRA")
    tasks = cfg["tasks"]

    print(f"\n📦 Loading base model: {base_model_id}")
    tokenizer = AutoTokenizer.from_pretrained(base_model_id, use_fast=True)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    if torch.cuda.is_available():
        # Reset the peak counter so peak_gpu_mem_mb reflects only THIS
        # adapter's run; without this the metric reported the maximum
        # across all previously evaluated configs as well.
        torch.cuda.reset_peak_memory_stats()

    # Try a causal-LM head first; fall back to sequence classification.
    try:
        base_model = AutoModelForCausalLM.from_pretrained(base_model_id)
        is_encoder = False
    except Exception:
        # Narrowed from a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit.
        base_model = AutoModelForSequenceClassification.from_pretrained(base_model_id)
        is_encoder = True

    # Merge the adapter weights into the base model so lm-eval can load a
    # plain HF checkpoint from disk.
    peft_model = PeftModel.from_pretrained(base_model, adapter_repo)
    merged_model = peft_model.merge_and_unload()
    merged_model.to(device)
    merged_model.eval()

    with tempfile.TemporaryDirectory() as td:
        merged_model.save_pretrained(td)
        tokenizer.save_pretrained(td)

        hf_lm = HFLM(
            pretrained=td,
            batch_size=16 if is_encoder else 8,
            device=device,
        )
        res = evaluator.simple_evaluate(model=hf_lm, tasks=tasks)

    # Metadata shared by every row of this run.
    # NOTE(review): utcnow() is deprecated in Python 3.12 but kept here so the
    # run_date format stays byte-compatible with rows already in the dataset.
    meta = {
        "model_id": adapter_repo,
        "adapter_type": adapter_type,
        "trainable_params": cfg.get("trainable_params"),
        "peak_gpu_mem_mb": torch.cuda.max_memory_allocated(device) // 1024**2 if torch.cuda.is_available() else None,
        "run_date": datetime.datetime.utcnow().isoformat(timespec="seconds"),
        "commit_sha": subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode(),
    }

    # Flatten lm-eval's nested {task: {metric: value}} results, keeping only
    # the whitelisted metrics.
    for task, scores in res["results"].items():
        for metric, value in scores.items():
            if metric not in METRICS_TO_KEEP:
                continue
            all_rows.append({**meta, "task": task, "metric": metric, "value": value})
89
+
90
# ───── Merge and upload results ─────
df_new = pd.DataFrame(all_rows)

PARQUET_IN_REPO = "data/peft_bench.parquet"

# On the very first run the dataset file does not exist yet and
# hf_hub_download would raise; check before downloading and fall back to
# the fresh rows alone.
if api.file_exists(DATASET_REPO, PARQUET_IN_REPO, repo_type="dataset"):
    with tempfile.TemporaryDirectory() as tmp:
        current_path = hf_hub_download(
            repo_id=DATASET_REPO,
            filename=PARQUET_IN_REPO,
            repo_type="dataset",
            cache_dir=tmp,
            local_dir=tmp,
            local_dir_use_symlinks=False,
        )
        # Read while the temp dir (and thus the downloaded file) still exists.
        df_existing = pd.read_parquet(current_path)
    df_combined = pd.concat([df_existing, df_new], ignore_index=True)
else:
    df_combined = df_new

# Keep only the latest result per (model_id, task, metric). run_date is an
# ISO-8601 string, so lexicographic sort order matches chronological order.
df_combined = (
    df_combined
    .sort_values("run_date")
    .drop_duplicates(subset=["model_id", "task", "metric"], keep="last")
)

# Normalize metric values; anything non-numeric becomes NaN rather than
# poisoning the column dtype.
df_combined["value"] = pd.to_numeric(df_combined["value"], errors="coerce")

out = Path("peft_bench.parquet")
df_combined.to_parquet(out, index=False)

api.upload_file(
    path_or_fileobj=out,
    path_in_repo=PARQUET_IN_REPO,
    repo_id=DATASET_REPO,
    repo_type="dataset",
    commit_message=f"Add {len(CONFIGS)} new adapter run(s)",
)