#!/usr/bin/env python3
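"""Evaluate PEFT adapters with lm-eval-harness and publish the scores.

For every adapter listed in adapters.yaml or manifests/*.yaml, this script loads the
base model, applies and merges the adapter, runs the configured lm-eval tasks, and
appends one row per (task, metric) to data/peft_bench.parquet in a results dataset repo.
"""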
import datetime, os, subprocess, tempfile
from pathlib import Path
import gc
import pandas as pd, yaml, torch
from huggingface_hub import HfApi, login, hf_hub_download, model_info
from lm_eval import evaluator
from lm_eval.models.huggingface import HFLM
from peft import PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
)
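
# Non-standard-library packages used above: torch, transformers, peft,
# lm_eval (lm-eval-harness), pandas, pyyaml, huggingface_hub.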
CONFIGS = []
# ───── Load all configs ─────
if Path("adapters.yaml").exists():
    CONFIGS.extend(yaml.safe_load(open("adapters.yaml"))["adapters"])
for yml in Path("manifests").glob("*.yaml"):
    CONFIGS.append(yaml.safe_load(open(yml)))
if not CONFIGS:
    raise RuntimeError("No adapter configs found in adapters.yaml or manifests/")
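
# Each config entry is expected to provide the keys read in the loop below.
# An illustrative example (repo ids and numbers are placeholders, not real artifacts):
#
#   base_model: meta-llama/Llama-2-7b-hf     # required
#   adapter_repo: your-org/llama2-lora-demo  # required; must contain adapter_config.json
#   adapter_type: LoRA                       # optional, defaults to "LoRA"
#   tasks: [hellaswag, arc_easy]             # required; lm-eval task names
#   trainable_params: 4194304                # optional; recorded as metadata only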
# ───── Hugging Face auth ─────
token = os.getenv("HF_TOKEN")
if not token or token == "***":
    raise RuntimeError("HF_TOKEN secret is missing.")
login(token)
DATASET_REPO = os.environ["HF_DATASET_REPO"]
api = HfApi()
all_rows = []
# ───── Safe tokenizer loading ─────
def load_tokenizer(model_id: str):
    """Load a tokenizer, falling back to the slow implementation if the fast one fails."""
    try:
        return AutoTokenizer.from_pretrained(model_id, use_fast=True)
    except Exception as e1:
        print(f"Fast tokenizer failed for {model_id}: {e1}")
        # Fall back to the slow (Python) tokenizer, e.g. when the fast conversion is unavailable.
        try:
            return AutoTokenizer.from_pretrained(model_id, use_fast=False)
        except Exception as e2:
            raise RuntimeError(f"Failed to load tokenizer for {model_id}: {e2}") from e2
# ───── Evaluate each adapter ─────
for cfg in CONFIGS:
    base_model_id = cfg["base_model"]
    adapter_repo = cfg["adapter_repo"]
    adapter_type = cfg.get("adapter_type", "LoRA")
    tasks = cfg["tasks"]

    # Reset the CUDA peak-memory counter so `peak_gpu_mem_mb` below reflects this
    # adapter only, not the running maximum across all adapters processed so far.
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()

    print(f"\nLoading base model: {base_model_id}")
    tokenizer = load_tokenizer(base_model_id)
    if "llama" in base_model_id.lower():
        try:
            tokenizer.legacy = False
        except Exception:
            pass
    try:
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_id,
            device_map="auto",
            torch_dtype=torch.float16,
            trust_remote_code=True,
            use_safetensors=True,
        )
        is_encoder = False
        print("Loaded as Causal LM")
    except Exception as e:
        print(f"⚠️ Failed to load causal LM: {e}")
        base_model = AutoModelForSequenceClassification.from_pretrained(
            base_model_id,
            device_map="auto",
            torch_dtype=torch.float16,
            trust_remote_code=True,
            use_safetensors=True,
        )
        is_encoder = True
        print("Loaded as Sequence Classification model")
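
    # `is_encoder` is only consulted below to choose a larger eval batch size
    # for sequence-classification checkpoints.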
    try:
        info = model_info(adapter_repo)
        files = [f.rfilename for f in info.siblings]
        if "adapter_config.json" not in files:
            print(f"{adapter_repo} is not a valid PEFT adapter (missing adapter_config.json)")
            continue
    except Exception as e:
        print(f"Failed to inspect adapter {adapter_repo}: {e}")
        continue

    try:
        peft_model = PeftModel.from_pretrained(
            base_model,
            adapter_repo,
            device_map="auto",
            torch_dtype=torch.float16,
        )
        merged_model = peft_model.merge_and_unload()
    except Exception as e:
        print(f"Failed to apply adapter {adapter_repo}: {e}")
        continue

    merged_model.eval()
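
    # merge_and_unload() has folded the adapter weights into the base weights, so the
    # merged model can be saved and re-loaded by lm-eval's HFLM as a plain
    # Hugging Face checkpoint, with no PEFT wrapper needed at eval time.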
    with tempfile.TemporaryDirectory() as td:
        merged_model.save_pretrained(td)
        tokenizer.save_pretrained(td)

        # Verify tokenizer object
        if not hasattr(tokenizer, "vocab_size"):
            print("Invalid tokenizer loaded. Skipping.")
            continue

        device = "cuda" if torch.cuda.is_available() else "cpu"
        hf_lm = HFLM(
            pretrained=td,
            batch_size=8 if not is_encoder else 16,
            device=device,
        )
        try:
            res = evaluator.simple_evaluate(model=hf_lm, tasks=tasks)
            print(f"Raw results for {adapter_repo}: {res}")
            if not res.get("results"):
                print(f"Empty results - likely a task or model compatibility issue for: {adapter_repo}")
                continue
            print(f"\nEvaluation raw result for {adapter_repo}:")
            print(res.get("results", {}))

            # Release the model weights before the next adapter; `res` is all we still need.
            del merged_model
            del peft_model
            del base_model
            del tokenizer
            del hf_lm
            gc.collect()
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()
        except Exception as e:
            print(f"Evaluation failed for {adapter_repo}: {e}")
            continue
    meta = {
        "model_id": adapter_repo,
        "adapter_type": adapter_type,
        "trainable_params": cfg.get("trainable_params"),
        "peak_gpu_mem_mb": torch.cuda.max_memory_allocated() // 1024**2 if torch.cuda.is_available() else None,
        "run_date": datetime.datetime.utcnow().isoformat(timespec="seconds"),
        "commit_sha": subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode(),
    }

    count_before = len(all_rows)
    for task, scores in res["results"].items():
        for metric, value in scores.items():
            if value is None:
                continue
            # lm-eval keys look like "acc,none"; split into metric name and aggregation.
            metric_name, _, aggregation = metric.partition(",")
            all_rows.append({
                **meta,
                "task": task,
                "metric": metric_name,
                "aggregation": aggregation or None,
                "value": value,
            })
    print(f"{len(all_rows) - count_before} rows added for {adapter_repo}")
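
    # Illustrative shape of one appended row (all values made up):
    #   {"model_id": "your-org/llama2-lora-demo", "adapter_type": "LoRA",
    #    "trainable_params": 4194304, "peak_gpu_mem_mb": 15210,
    #    "run_date": "2024-05-01T12:00:00", "commit_sha": "<git sha>",
    #    "task": "hellaswag", "metric": "acc_norm", "aggregation": "none", "value": 0.62}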
# ───── Merge and upload results ─────
df_new = pd.DataFrame(all_rows)
with tempfile.TemporaryDirectory() as tmp:
    # Assumes data/peft_bench.parquet already exists in the dataset repo.
    current_path = hf_hub_download(
        repo_id=DATASET_REPO,
        filename="data/peft_bench.parquet",
        repo_type="dataset",
        cache_dir=tmp,
        local_dir=tmp,
        local_dir_use_symlinks=False,
    )
    df_existing = pd.read_parquet(current_path)
    df_combined = pd.concat([df_existing, df_new], ignore_index=True)
    df_combined = df_combined.sort_values("run_date")
    df_combined["value"] = pd.to_numeric(df_combined["value"], errors="coerce")
print("\nFinal new results:")
print(df_new[["model_id", "task", "metric", "aggregation", "value"]])
out = Path("peft_bench.parquet")
df_combined.to_parquet(out, index=False)
api.upload_file(
    path_or_fileobj=out,
    path_in_repo="data/peft_bench.parquet",
    repo_id=DATASET_REPO,
    repo_type="dataset",
    commit_message=f"Add {len(CONFIGS)} new adapter run(s)",
)
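
# Assumed runtime setup (only partially checked above):
#   * HF_TOKEN        - token with write access to the results dataset repo
#   * HF_DATASET_REPO - dataset repo id that already contains data/peft_bench.parquet
#   * a git checkout, since the commit SHA is captured via `git rev-parse HEAD`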