Inference slows down after LoRA adapter merge of a fine-tuned model

#15
by MinHyeong - opened

I am measuring model performance on the TriviaQA benchmark, using Qwen2.5-7B as the base model.

When the base model is evaluated with lm-evaluation-harness, it takes about 1 hour to 1 hour 30 minutes, but the LoRA-merged model takes 8 to 9 hours under the same conditions and with the same batch size.
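For reference, both runs are invoked roughly like this (a minimal sketch using the lm-evaluation-harness Python API; the batch size and model args here are illustrative, not my exact settings):

import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=Qwen/Qwen2.5-7B",  # or the merged repo for the second run
    tasks=["triviaqa"],
    batch_size=8,  # illustrative; the same batch size is used for both runs
)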
Why is this happening? Here is the code I used to merge the model.
At first I suspected that fine-tuning with gradient checkpointing had left use_cache disabled, so I took the base model's config file and saved it over the merged model's config, but the slowdown is still the same.
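This is roughly how I compared the two configs (a minimal sketch; it assumes the base checkpoint is Qwen/Qwen2.5-7B and the merged repo is the one uploaded by the code below):

from transformers import AutoConfig

base_cfg = AutoConfig.from_pretrained("Qwen/Qwen2.5-7B")
merged_cfg = AutoConfig.from_pretrained("MinHyeong/qwen_logit")
for key in ("use_cache", "torch_dtype"):
    print(key, getattr(base_cfg, key, None), getattr(merged_cfg, key, None))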

code ---------------------

import logging
import os
import tempfile

from huggingface_hub import HfApi, login
from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

lora_model_path = "/nas/home/mhlee/_0ung/output/qwen_64_128_logit/checkpoint-1748"
repo_id = "MinHyeong/qwen_logit"

try:
    # Authenticate with the Hub if a token is available
    hf_token = os.getenv("HF_TOKEN")
    if hf_token:
        login(token=hf_token)
    else:
        logger.warning("HF_TOKEN is not set.")

logger.info("PEFT...")
peft_config = PeftConfig.from_pretrained(lora_model_path)
base_model_name_or_path = peft_config.base_model_name_or_path
logger.info(f"base model: {base_model_name_or_path}")

logger.info(f"base model loding: {base_model_name_or_path}")
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name_or_path,
    trust_remote_code=True
)
logger.info(f"base model config: {base_model.config.to_json_string()}")

logger.info("base model token...")
tokenizer = AutoTokenizer.from_pretrained(
    base_model_name_or_path,
    trust_remote_code=True
)

#PEFT model and adpater
logger.info("LoRA...")
model = PeftModel.from_pretrained(base_model, lora_model_path)

# Merge
logger.info("Merge...")
model = model.merge_and_unload()

    with tempfile.TemporaryDirectory() as temp_dir:
        logger.info(f"Temporary directory: {temp_dir}")

        # Save the merged model as sharded safetensors (config.json is written automatically)
        logger.info(f"Saving merged model to {temp_dir}...")
        model.save_pretrained(
            temp_dir,
            safe_serialization=True,
            max_shard_size="10GB"
        )
        logger.info("Finished saving 10GB shards")
        tokenizer.save_pretrained(temp_dir)

        # Sanity-check that the shards, index, and config were all written
        saved_files = os.listdir(temp_dir)
        logger.info(f"File list: {saved_files}")
        has_sharded_files = any(f.startswith("model-") and f.endswith(".safetensors") for f in saved_files)
        has_index_file = "model.safetensors.index.json" in saved_files
        has_config_file = "config.json" in saved_files
        if not (has_sharded_files and has_index_file and has_config_file):
            raise FileNotFoundError("Sharded safetensors were not saved correctly.")

        # Create the target repo (no-op if it already exists)
        api = HfApi()
        logger.info(f"Creating repo: {repo_id}")
        api.create_repo(
            repo_id=repo_id,
            repo_type="model",
            exist_ok=True,
            token=hf_token
        )
        logger.info(f"Done: {repo_id}")

        # Upload everything except stray .bin files
        logger.info(f"Uploading to the Hugging Face Hub: {repo_id}")
        api.upload_folder(
            folder_path=temp_dir,
            repo_id=repo_id,
            repo_type="model",
            commit_message="Upload vanilla merged LoRA model with base model properties",
            ignore_patterns=["*.bin"],
        )
        logger.info("Done!")

except Exception as e:
    logger.error(f"Error: {str(e)}")
    import traceback
    logger.error(traceback.format_exc())
    raise
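For completeness, this is how I double-check the dtype when re-loading the merged checkpoint (a minimal sketch; since no torch_dtype is passed above, the merged weights may be saved in float32, and running inference in float32 instead of bf16 could by itself account for a large slowdown):

from transformers import AutoModelForCausalLM

merged = AutoModelForCausalLM.from_pretrained("MinHyeong/qwen_logit")
print(next(merged.parameters()).dtype)  # torch.bfloat16 would match the base model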
