"""Model adapter registration."""
import math
import os
import re
import sys
from typing import Dict, List, Optional
import warnings
if sys.version_info >= (3, 9):
from functools import cache
else:
from functools import lru_cache as cache
import accelerate
import psutil
import torch
from transformers import (
AutoConfig,
AutoModel,
AutoModelForCausalLM,
AutoModelForSeq2SeqLM,
AutoTokenizer,
LlamaTokenizer,
LlamaForCausalLM,
T5Tokenizer,
)
from fastchat.constants import CPU_ISA
from fastchat.conversation import Conversation, get_conv_template
from fastchat.model.compression import load_compress_model
from fastchat.model.llama_condense_monkey_patch import replace_llama_with_condense
from fastchat.model.model_chatglm import generate_stream_chatglm
from fastchat.model.model_codet5p import generate_stream_codet5p
from fastchat.model.model_falcon import generate_stream_falcon
from fastchat.model.model_exllama import generate_stream_exllama
from fastchat.model.model_xfastertransformer import generate_stream_xft
from fastchat.model.monkey_patch_non_inplace import (
replace_llama_attn_with_non_inplace_operations,
)
from fastchat.modules.awq import AWQConfig, load_awq_quantized
from fastchat.modules.exllama import ExllamaConfig, load_exllama_model
from fastchat.modules.xfastertransformer import load_xft_model, XftConfig
from fastchat.modules.gptq import GptqConfig, load_gptq_quantized
from fastchat.utils import get_gpu_memory
# Check an environment variable to decide whether we should share Peft model
# weights. When false, we treat all Peft models as separate.
peft_share_base_weights = (
os.environ.get("PEFT_SHARE_BASE_WEIGHTS", "false").lower() == "true"
)
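# Illustrative note (added for clarity, not in the original file): this flag is
# read once at import time, so it must be set in the environment before the
# serving process starts, e.g.
#
#   PEFT_SHARE_BASE_WEIGHTS=true python3 -m fastchat.serve.model_worker --model-path /path/to/peft-adapter
#
# The model_worker entry point shown here is an assumption about how this module
# is typically launched; adapt it to your own launcher.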
ANTHROPIC_MODEL_LIST = (
"claude-1",
"claude-2",
"claude-instant-1",
)
class BaseModelAdapter:
"""The base and the default model adapter."""
use_fast_tokenizer = False
def match(self, model_path: str):
return True
def load_model(self, model_path: str, from_pretrained_kwargs: dict):
revision = from_pretrained_kwargs.get("revision", "main")
try:
tokenizer = AutoTokenizer.from_pretrained(
model_path,
use_fast=self.use_fast_tokenizer,
revision=revision,
trust_remote_code=True,
)
except TypeError:
tokenizer = AutoTokenizer.from_pretrained(
model_path, use_fast=False, revision=revision, trust_remote_code=True
)
try:
model = AutoModelForCausalLM.from_pretrained(
model_path,
low_cpu_mem_usage=True,
trust_remote_code=True,
use_flash_attention_2=True,
**from_pretrained_kwargs,
)
        except Exception:  # Fall back when FlashAttention-2 is unavailable
model = AutoModelForCausalLM.from_pretrained(
model_path,
low_cpu_mem_usage=True,
trust_remote_code=True,
use_flash_attention_2=False,
**from_pretrained_kwargs,
)
# model = AutoModel.from_pretrained(
# model_path,
# low_cpu_mem_usage=True,
# trust_remote_code=True,
# **from_pretrained_kwargs,
# )
return model, tokenizer
def load_compress_model(self, model_path, device, torch_dtype, revision="main"):
return load_compress_model(
model_path,
device,
torch_dtype,
use_fast=self.use_fast_tokenizer,
revision=revision,
)
def get_default_conv_template(self, model_path: str) -> Conversation:
        if "megrez" in model_path.lower():
            model_path = "megrez"
        elif "minicpm" in model_path.lower():
            model_path = "minicpm"
return get_conv_template(model_path.lower())
# A global registry for all model adapters
# TODO (lmzheng): make it a priority queue.
model_adapters: List[BaseModelAdapter] = []
def register_model_adapter(cls):
"""Register a model adapter."""
model_adapters.append(cls())
@cache
def get_model_adapter(
    model_path: str, model_name: Optional[str] = None
) -> BaseModelAdapter:
"""Get a model adapter for a model_path."""
    model_path_basename = (
        model_name if model_name else os.path.basename(os.path.normpath(model_path))
    )
# Try the basename of model_path at first
for adapter in model_adapters:
        if adapter.match(model_path_basename) and type(adapter) is not BaseModelAdapter:
print(f"Matching model adapter: {adapter}")
return adapter
model_path = model_path if not model_name else model_name
# Then try the full path
for adapter in model_adapters:
if adapter.match(model_path):
print(f"Using model adapter: {adapter}")
return adapter
raise ValueError(f"No valid model adapter for {model_path}")
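# Illustrative sketch (added for clarity, not in the original file): adapters are
# tried in registration order, first against the basename of the path and then
# against the full path. With a hypothetical local checkpoint directory:
#
#   adapter = get_model_adapter("/data/checkpoints/vicuna-7b-v1.5-ft")
#   # -> VicunaAdapter, because "vicuna" is in the basename and VicunaAdapter is
#   #    registered before the fallback BaseModelAdapter (see the bottom of this file).
#
# get_model_adapter is wrapped in @cache, so repeated lookups for the same
# (model_path, model_name) pair return the same adapter instance.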
def raise_warning_for_incompatible_cpu_offloading_configuration(
device: str, load_8bit: bool, cpu_offloading: bool
):
if cpu_offloading:
if not load_8bit:
warnings.warn(
"The cpu-offloading feature can only be used while also using 8-bit-quantization.\n"
"Use '--load-8bit' to enable 8-bit-quantization\n"
"Continuing without cpu-offloading enabled\n"
)
return False
if not "linux" in sys.platform:
warnings.warn(
"CPU-offloading is only supported on linux-systems due to the limited compatability with the bitsandbytes-package\n"
"Continuing without cpu-offloading enabled\n"
)
return False
if device != "cuda":
warnings.warn(
"CPU-offloading is only enabled when using CUDA-devices\n"
"Continuing without cpu-offloading enabled\n"
)
return False
return cpu_offloading
def load_model(
model_path: str,
device: str = "cuda",
num_gpus: int = 1,
max_gpu_memory: Optional[str] = None,
dtype: Optional[torch.dtype] = None,
load_8bit: bool = False,
cpu_offloading: bool = False,
gptq_config: Optional[GptqConfig] = None,
awq_config: Optional[AWQConfig] = None,
exllama_config: Optional[ExllamaConfig] = None,
xft_config: Optional[XftConfig] = None,
revision: str = "main",
debug: bool = False,
    model_name: Optional[str] = None,
):
"""Load a model from Hugging Face."""
# get model adapter
adapter = get_model_adapter(model_path, model_name)
# Handle device mapping
cpu_offloading = raise_warning_for_incompatible_cpu_offloading_configuration(
device, load_8bit, cpu_offloading
)
if device == "cpu":
# kwargs = {"torch_dtype": torch.float32}
kwargs = {"torch_dtype": torch.float16}
if CPU_ISA in ["avx512_bf16", "amx"]:
try:
import intel_extension_for_pytorch as ipex
kwargs = {"torch_dtype": torch.bfloat16}
except ImportError:
warnings.warn(
"Intel Extension for PyTorch is not installed, it can be installed to accelerate cpu inference"
)
elif device == "cuda":
# kwargs = {"torch_dtype": torch.float16}
kwargs = {"torch_dtype": torch.bfloat16}
if num_gpus != 1:
kwargs["device_map"] = "auto"
if max_gpu_memory is None:
                kwargs[
                    "device_map"
                ] = "sequential"  # Important when the GPUs have different VRAM sizes
available_gpu_memory = get_gpu_memory(num_gpus)
kwargs["max_memory"] = {
i: str(int(available_gpu_memory[i] * 0.85)) + "GiB"
for i in range(num_gpus)
}
else:
kwargs["max_memory"] = {i: max_gpu_memory for i in range(num_gpus)}
elif device == "mps":
kwargs = {"torch_dtype": torch.float16}
# Avoid bugs in mps backend by not using in-place operations.
replace_llama_attn_with_non_inplace_operations()
elif device == "xpu":
kwargs = {"torch_dtype": torch.bfloat16}
        # Try to load ipex; although it looks unused, importing it enables XPU support in torch.
try:
import intel_extension_for_pytorch as ipex
except ImportError:
warnings.warn(
"Intel Extension for PyTorch is not installed, but is required for xpu inference."
)
elif device == "npu":
kwargs = {"torch_dtype": torch.float16}
        # Try to load torch_npu; although it looks unused, importing it enables Ascend NPU support in torch.
try:
import torch_npu
except ImportError:
warnings.warn("Ascend Extension for PyTorch is not installed.")
else:
raise ValueError(f"Invalid device: {device}")
if cpu_offloading:
# raises an error on incompatible platforms
from transformers import BitsAndBytesConfig
if "max_memory" in kwargs:
kwargs["max_memory"]["cpu"] = (
                str(math.floor(psutil.virtual_memory().available / 2**20)) + "MiB"
)
kwargs["quantization_config"] = BitsAndBytesConfig(
load_in_8bit_fp32_cpu_offload=cpu_offloading
)
kwargs["load_in_8bit"] = load_8bit
elif load_8bit:
if num_gpus != 1:
warnings.warn(
"8-bit quantization is not supported for multi-gpu inference."
)
else:
model, tokenizer = adapter.load_compress_model(
model_path=model_path,
device=device,
torch_dtype=kwargs["torch_dtype"],
revision=revision,
)
if debug:
print(model)
return model, tokenizer
elif awq_config and awq_config.wbits < 16:
assert (
awq_config.wbits == 4
), "Currently we only support 4-bit inference for AWQ."
model, tokenizer = load_awq_quantized(model_path, awq_config, device)
if num_gpus != 1:
device_map = accelerate.infer_auto_device_map(
model,
max_memory=kwargs["max_memory"],
no_split_module_classes=[
"OPTDecoderLayer",
"LlamaDecoderLayer",
"BloomBlock",
"MPTBlock",
"DecoderLayer",
],
)
model = accelerate.dispatch_model(
model, device_map=device_map, offload_buffers=True
)
else:
model.to(device)
return model, tokenizer
elif gptq_config and gptq_config.wbits < 16:
model, tokenizer = load_gptq_quantized(model_path, gptq_config)
if num_gpus != 1:
device_map = accelerate.infer_auto_device_map(
model,
max_memory=kwargs["max_memory"],
no_split_module_classes=["LlamaDecoderLayer"],
)
model = accelerate.dispatch_model(
model, device_map=device_map, offload_buffers=True
)
else:
model.to(device)
return model, tokenizer
elif exllama_config:
model, tokenizer = load_exllama_model(model_path, exllama_config)
return model, tokenizer
elif xft_config:
model, tokenizer = load_xft_model(model_path, xft_config)
return model, tokenizer
kwargs["revision"] = revision
if dtype is not None: # Overwrite dtype if it is provided in the arguments.
kwargs["torch_dtype"] = dtype
# Load model
model, tokenizer = adapter.load_model(model_path, kwargs)
if (
device == "cpu"
and kwargs["torch_dtype"] is torch.bfloat16
and CPU_ISA is not None
):
model = ipex.optimize(model, dtype=kwargs["torch_dtype"])
if (device == "cuda" and num_gpus == 1 and not cpu_offloading) or device in (
"mps",
"xpu",
"npu",
):
model.to(device)
if device == "xpu":
model = torch.xpu.optimize(model, dtype=kwargs["torch_dtype"], inplace=True)
if debug:
print(model)
return model, tokenizer
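# Illustrative sketch of a typical call (added for clarity; the sizes below are
# placeholders, not values taken from this file):
#
#   model, tokenizer = load_model(
#       "lmsys/vicuna-7b-v1.5",
#       device="cuda",
#       num_gpus=2,
#       max_gpu_memory="13GiB",  # per-GPU cap; omit to auto-allocate ~85% of free VRAM
#   )
#
# With num_gpus > 1 and no max_gpu_memory, weights are placed with a "sequential"
# device map using the measured free memory of each GPU, as implemented above.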
def get_conversation_template(model_path: str) -> Conversation:
"""Get the default conversation template."""
adapter = get_model_adapter(model_path)
return adapter.get_default_conv_template(model_path)
def get_generate_stream_function(model: torch.nn.Module, model_path: str):
"""Get the generate_stream function for inference."""
from fastchat.serve.inference import generate_stream
model_type = str(type(model)).lower()
is_chatglm = "chatglm" in model_type
is_falcon = "rwforcausallm" in model_type
is_codet5p = "codet5p" in model_type
is_peft = "peft" in model_type
is_exllama = "exllama" in model_type
is_xft = "xft" in model_type
if is_chatglm:
return generate_stream_chatglm
elif is_falcon:
return generate_stream_falcon
elif is_codet5p:
return generate_stream_codet5p
elif is_exllama:
return generate_stream_exllama
elif is_xft:
return generate_stream_xft
elif peft_share_base_weights and is_peft:
# Return a curried stream function that loads the right adapter
# according to the model_name available in this context. This ensures
# the right weights are available.
@torch.inference_mode()
def generate_stream_peft(
model,
tokenizer,
params: Dict,
device: str,
context_len: int,
stream_interval: int = 2,
judge_sent_end: bool = False,
):
model.set_adapter(model_path)
for x in generate_stream(
model,
tokenizer,
params,
device,
context_len,
stream_interval,
judge_sent_end,
):
yield x
return generate_stream_peft
else:
return generate_stream
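# Illustrative sketch (added for clarity, not in the original file): the returned
# function yields partial outputs as dicts. The parameter keys shown below follow
# the generate_stream implementation in fastchat.serve.inference; treat them as
# an assumption and check that module for the exact, version-dependent set.
#
#   generate_stream_func = get_generate_stream_function(model, model_path)
#   params = {"prompt": prompt, "temperature": 0.7, "max_new_tokens": 512}
#   for output in generate_stream_func(model, tokenizer, params, device, context_len=2048):
#       print(output["text"], end="", flush=True)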
def add_model_args(parser):
parser.add_argument(
"--model-path",
type=str,
default="lmsys/vicuna-7b-v1.5",
help="The path to the weights. This can be a local folder or a Hugging Face repo ID.",
)
parser.add_argument(
"--revision",
type=str,
default="main",
help="Hugging Face Hub model revision identifier",
)
parser.add_argument(
"--device",
type=str,
choices=["cpu", "cuda", "mps", "xpu", "npu"],
default="cuda",
help="The device type",
)
parser.add_argument(
"--gpus",
type=str,
default=None,
help="A single GPU like 1 or multiple GPUs like 0,2",
)
parser.add_argument("--num-gpus", type=int, default=1)
parser.add_argument(
"--max-gpu-memory",
type=str,
help="The maximum memory per GPU for storing model weights. Use a string like '13Gib'",
)
parser.add_argument(
"--dtype",
type=str,
choices=["float32", "float16", "bfloat16"],
help="Override the default dtype. If not set, it will use float16 on GPU and float32 on CPU.",
default=None,
)
parser.add_argument(
"--load-8bit", action="store_true", help="Use 8-bit quantization"
)
parser.add_argument(
"--cpu-offloading",
action="store_true",
help="Only when using 8-bit quantization: Offload excess weights to the CPU that don't fit on the GPU",
)
parser.add_argument(
"--gptq-ckpt",
type=str,
default=None,
help="Used for GPTQ. The path to the local GPTQ checkpoint.",
)
parser.add_argument(
"--gptq-wbits",
type=int,
default=16,
choices=[2, 3, 4, 8, 16],
help="Used for GPTQ. #bits to use for quantization",
)
parser.add_argument(
"--gptq-groupsize",
type=int,
default=-1,
help="Used for GPTQ. Groupsize to use for quantization; default uses full row.",
)
parser.add_argument(
"--gptq-act-order",
action="store_true",
help="Used for GPTQ. Whether to apply the activation order GPTQ heuristic",
)
parser.add_argument(
"--awq-ckpt",
type=str,
default=None,
help="Used for AWQ. Load quantized model. The path to the local AWQ checkpoint.",
)
parser.add_argument(
"--awq-wbits",
type=int,
default=16,
choices=[4, 16],
help="Used for AWQ. #bits to use for AWQ quantization",
)
parser.add_argument(
"--awq-groupsize",
type=int,
default=-1,
help="Used for AWQ. Groupsize to use for AWQ quantization; default uses full row.",
)
parser.add_argument(
"--enable-exllama",
action="store_true",
help="Used for exllamabv2. Enable exllamaV2 inference framework.",
)
parser.add_argument(
"--exllama-max-seq-len",
type=int,
default=4096,
help="Used for exllamabv2. Max sequence length to use for exllamav2 framework; default 4096 sequence length.",
)
parser.add_argument(
"--exllama-gpu-split",
type=str,
default=None,
help="Used for exllamabv2. Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7",
)
parser.add_argument(
"--enable-xft",
action="store_true",
help="Used for xFasterTransformer Enable xFasterTransformer inference framework.",
)
parser.add_argument(
"--xft-max-seq-len",
type=int,
default=4096,
help="Used for xFasterTransformer. Max sequence length to use for xFasterTransformer framework; default 4096 sequence length.",
)
parser.add_argument(
"--xft-dtype",
type=str,
choices=["fp16", "bf16", "int8", "bf16_fp16", "bf16_int8"],
help="Override the default dtype. If not set, it will use bfloat16 for first token and float16 next tokens on CPU.",
default=None,
)
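# Illustrative sketch (added for clarity, not in the original file): wiring these
# flags into load_model. Only the plain arguments are shown; the quantization
# configs (gptq_config, awq_config, exllama_config, xft_config) can be built from
# the corresponding flags, as done in FastChat's serving entry points.
#
#   import argparse
#   parser = argparse.ArgumentParser()
#   add_model_args(parser)
#   args = parser.parse_args()
#   model, tokenizer = load_model(
#       args.model_path,
#       device=args.device,
#       num_gpus=args.num_gpus,
#       max_gpu_memory=args.max_gpu_memory,
#       load_8bit=args.load_8bit,
#       cpu_offloading=args.cpu_offloading,
#       revision=args.revision,
#   )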
def remove_parent_directory_name(model_path):
"""Remove parent directory name."""
if model_path[-1] == "/":
model_path = model_path[:-1]
return model_path.split("/")[-1]
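# Illustrative examples (added for clarity): remove_parent_directory_name keeps
# only the last path component, e.g.
#   "lmsys/vicuna-7b-v1.5"  -> "vicuna-7b-v1.5"
#   "/models/vicuna-13b/"   -> "vicuna-13b"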
peft_model_cache = {}
class PeftModelAdapter:
"""Loads any "peft" model and it's base model."""
def match(self, model_path: str):
"""Accepts any model path with "peft" in the name"""
if os.path.exists(os.path.join(model_path, "adapter_config.json")):
return True
return "peft" in model_path.lower()
def load_model(self, model_path: str, from_pretrained_kwargs: dict):
"""Loads the base model then the (peft) adapter weights"""
from peft import PeftConfig, PeftModel
config = PeftConfig.from_pretrained(model_path)
base_model_path = config.base_model_name_or_path
if "peft" in base_model_path:
raise ValueError(
f"PeftModelAdapter cannot load a base model with 'peft' in the name: {config.base_model_name_or_path}"
)
# Basic proof of concept for loading peft adapters that share the base
# weights. This is pretty messy because Peft re-writes the underlying
# base model and internally stores a map of adapter layers.
# So, to make this work we:
        # 1. Cache the first peft model loaded for a given base model.
# 2. Call `load_model` for any follow on Peft models.
# 3. Make sure we load the adapters by the model_path. Why? This is
# what's accessible during inference time.
# 4. In get_generate_stream_function, make sure we load the right
        # adapter before doing inference. This *should* be safe when calls
        # are blocked by the same semaphore.
if peft_share_base_weights:
if base_model_path in peft_model_cache:
model, tokenizer = peft_model_cache[base_model_path]
# Super important: make sure we use model_path as the
# `adapter_name`.
model.load_adapter(model_path, adapter_name=model_path)
else:
base_adapter = get_model_adapter(base_model_path)
base_model, tokenizer = base_adapter.load_model(
base_model_path, from_pretrained_kwargs
)
# Super important: make sure we use model_path as the
# `adapter_name`.
model = PeftModel.from_pretrained(
base_model, model_path, adapter_name=model_path
)
peft_model_cache[base_model_path] = (model, tokenizer)
return model, tokenizer
# In the normal case, load up the base model weights again.
base_adapter = get_model_adapter(base_model_path)
base_model, tokenizer = base_adapter.load_model(
base_model_path, from_pretrained_kwargs
)
model = PeftModel.from_pretrained(base_model, model_path)
return model, tokenizer
def get_default_conv_template(self, model_path: str) -> Conversation:
"""Uses the conv template of the base model"""
from peft import PeftConfig, PeftModel
config = PeftConfig.from_pretrained(model_path)
if "peft" in config.base_model_name_or_path:
raise ValueError(
f"PeftModelAdapter cannot load a base model with 'peft' in the name: {config.base_model_name_or_path}"
)
base_model_path = config.base_model_name_or_path
base_adapter = get_model_adapter(base_model_path)
return base_adapter.get_default_conv_template(config.base_model_name_or_path)
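# Illustrative sketch (added for clarity; the adapter paths and kwargs are
# placeholders): with PEFT_SHARE_BASE_WEIGHTS=true, two adapters that point at
# the same base model share one copy of the base weights.
#
#   kwargs = {"torch_dtype": torch.float16}
#   m1, tok = PeftModelAdapter().load_model("/adapters/peft-sql", kwargs)
#   m2, _ = PeftModelAdapter().load_model("/adapters/peft-chat", kwargs)
#   # m1 is m2 -> True. The second call only attaches another adapter, keyed by
#   # its model_path, to the cached PeftModel; generate_stream_peft() later calls
#   # model.set_adapter(model_path) to select the right adapter per request.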
class DeepseekChatAdapter(BaseModelAdapter):
"""The model adapter for deepseek-ai's chat models"""
    # Note: this model requires tokenizers >= 0.13.3 because the tokenizer class is LlamaTokenizerFast.
def match(self, model_path: str):
return "deepseek" in model_path.lower() and "chat" in model_path.lower()
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("deepseek")
    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            trust_remote_code=True,
            device_map="sequential",
            torch_dtype=torch.bfloat16,
            max_memory=from_pretrained_kwargs.get("max_memory"),
            attn_implementation="flash_attention_2",  # or "eager" if flash-attn is unavailable
        )
        tokenizer = AutoTokenizer.from_pretrained(
            model_path, trust_remote_code=True
        )
        return model, tokenizer
class VicunaAdapter(BaseModelAdapter):
"Model adapter for Vicuna models (e.g., lmsys/vicuna-7b-v1.5)" ""
use_fast_tokenizer = False
def match(self, model_path: str):
return "vicuna" in model_path.lower()
def load_model(self, model_path: str, from_pretrained_kwargs: dict):
revision = from_pretrained_kwargs.get("revision", "main")
tokenizer = AutoTokenizer.from_pretrained(
model_path, use_fast=self.use_fast_tokenizer, revision=revision
)
model = AutoModelForCausalLM.from_pretrained(
model_path,
low_cpu_mem_usage=True,
use_flash_attention_2=True,
**from_pretrained_kwargs,
)
self.raise_warning_for_old_weights(model)
return model, tokenizer
def get_default_conv_template(self, model_path: str) -> Conversation:
if "v0" in remove_parent_directory_name(model_path):
return get_conv_template("one_shot")
return get_conv_template("vicuna_v1.1")
def raise_warning_for_old_weights(self, model):
if isinstance(model, LlamaForCausalLM) and model.model.vocab_size > 32000:
warnings.warn(
"\nYou are probably using the old Vicuna-v0 model, "
"which will generate unexpected results with the "
"current fastchat.\nYou can try one of the following methods:\n"
"1. Upgrade your weights to the new Vicuna-v1.3: https://github.com/lm-sys/FastChat#vicuna-weights.\n"
"2. Use the old conversation template by `python3 -m fastchat.serve.cli --model-path /path/to/vicuna-v0 --conv-template one_shot`\n"
"3. Downgrade fschat to fschat==0.1.10 (Not recommended).\n"
)
class AiroborosAdapter(BaseModelAdapter):
"""The model adapter for jondurbin/airoboros-*"""
def match(self, model_path: str):
if re.search(r"airoboros|spicyboros", model_path, re.I):
return True
return False
def get_default_conv_template(self, model_path: str) -> Conversation:
if "-3." in model_path or "-3p" in model_path:
return get_conv_template("airoboros_v3")
if "spicyboros" in model_path or re.search(r"-(2\.[2-9]+)", model_path):
return get_conv_template("airoboros_v2")
return get_conv_template("airoboros_v1")
def load_model(self, model_path: str, from_pretrained_kwargs: dict):
if "mpt" not in model_path.lower():
return super().load_model(model_path, from_pretrained_kwargs)
model = AutoModelForCausalLM.from_pretrained(
model_path,
low_cpu_mem_usage=True,
trust_remote_code=True,
max_seq_len=8192,
**from_pretrained_kwargs,
)
tokenizer = AutoTokenizer.from_pretrained(
model_path, trust_remote_code=True, use_fast=True
)
return model, tokenizer
class Zhinao360Adapter(BaseModelAdapter):
def match(self, model_path: str):
return "360zhinao" in model_path.lower()
    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        from transformers import GenerationConfig

        tokenizer = AutoTokenizer.from_pretrained(
            model_path, trust_remote_code=True
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_path, trust_remote_code=True
        )
        # Attach the generation config to the model instead of returning a third
        # value, so the (model, tokenizer) signature matches the other adapters
        # and the unpacking in load_model() above.
        model.generation_config = GenerationConfig.from_pretrained(
            model_path, trust_remote_code=True
        )
        return model, tokenizer
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("360zhinao")
class LongChatAdapter(BaseModelAdapter):
"Model adapter for LongChat models (e.g., lmsys/longchat-7b-16k)."
use_fast_tokenizer = False
def match(self, model_path: str):
return "longchat" in model_path.lower()
def load_model(self, model_path: str, from_pretrained_kwargs: dict):
revision = from_pretrained_kwargs.get("revision", "main")
# Apply monkey patch, TODO(Dacheng): Add flash attention support
config = AutoConfig.from_pretrained(model_path, revision=revision)
replace_llama_with_condense(config.rope_scaling["factor"])
tokenizer = AutoTokenizer.from_pretrained(
model_path, use_fast=self.use_fast_tokenizer, revision=revision
)
model = AutoModelForCausalLM.from_pretrained(
model_path,
low_cpu_mem_usage=True,
**from_pretrained_kwargs,
)
return model, tokenizer
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("vicuna_v1.1")
class GoogleT5Adapter(BaseModelAdapter):
"""The model adapter for google/Flan based models, such as Salesforce/codet5p-6b, lmsys/fastchat-t5-3b-v1.0, flan-t5-*, flan-ul2"""
def match(self, model_path: str):
return any(
model_str in model_path.lower()
for model_str in ["flan-", "fastchat-t5", "codet5p"]
)
def load_model(self, model_path: str, from_pretrained_kwargs: dict):
revision = from_pretrained_kwargs.get("revision", "main")
tokenizer = T5Tokenizer.from_pretrained(model_path, revision=revision)
model = AutoModelForSeq2SeqLM.from_pretrained(
model_path,
low_cpu_mem_usage=True,
trust_remote_code=True,
**from_pretrained_kwargs,
)
return model, tokenizer
class KoalaAdapter(BaseModelAdapter):
"""The model adapter for Koala"""
use_fast_tokenizer = False
def match(self, model_path: str):
return "koala" in model_path.lower()
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("koala_v1")
class AlpacaAdapter(BaseModelAdapter):
"""The model adapter for Alpaca"""
use_fast_tokenizer = False
def match(self, model_path: str):
return "alpaca" in model_path.lower()
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("alpaca")
class ChatGLMAdapter(BaseModelAdapter):
"""The model adapter for THUDM/chatglm-6b, THUDM/chatglm2-6b"""
def match(self, model_path: str):
return "chatglm" in model_path.lower()
def load_model(self, model_path: str, from_pretrained_kwargs: dict):
revision = from_pretrained_kwargs.get("revision", "main")
if "chatglm3" in model_path.lower():
tokenizer = AutoTokenizer.from_pretrained(
model_path,
encode_special_tokens=True,
trust_remote_code=True,
revision=revision,
)
else:
tokenizer = AutoTokenizer.from_pretrained(
model_path, trust_remote_code=True, revision=revision
)
model = AutoModel.from_pretrained(
model_path, trust_remote_code=True, **from_pretrained_kwargs
)
return model, tokenizer
def get_default_conv_template(self, model_path: str) -> Conversation:
model_path = model_path.lower()
if "chatglm2" in model_path.lower():
return get_conv_template("chatglm2")
if "chatglm3" in model_path.lower():
return get_conv_template("chatglm3")
return get_conv_template("chatglm")
class CodeGeexAdapter(BaseModelAdapter):
"""The model adapter for THUDM/codegeex-6b, THUDM/codegeex2-6b"""
def match(self, model_path: str):
return "codegeex" in model_path.lower()
def load_model(self, model_path: str, from_pretrained_kwargs: dict):
revision = from_pretrained_kwargs.get("revision", "main")
tokenizer = AutoTokenizer.from_pretrained(
model_path, trust_remote_code=True, revision=revision
)
model = AutoModel.from_pretrained(
model_path, trust_remote_code=True, **from_pretrained_kwargs
)
return model, tokenizer
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("codegeex")
class DollyV2Adapter(BaseModelAdapter):
"""The model adapter for databricks/dolly-v2-12b"""
def match(self, model_path: str):
return "dolly-v2" in model_path.lower()
def load_model(self, model_path: str, from_pretrained_kwargs: dict):
revision = from_pretrained_kwargs.get("revision", "main")
tokenizer = AutoTokenizer.from_pretrained(model_path, revision=revision)
model = AutoModelForCausalLM.from_pretrained(
model_path,
low_cpu_mem_usage=True,
**from_pretrained_kwargs,
)
# 50277 means "### End"
tokenizer.eos_token_id = 50277
model.config.eos_token_id = tokenizer.eos_token_id
model.config.pad_token_id = tokenizer.pad_token_id
return model, tokenizer
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("dolly_v2")
class OasstPythiaAdapter(BaseModelAdapter):
"""The model adapter for OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"""
def match(self, model_path: str):
model_path = model_path.lower()
return "oasst" in model_path and "pythia" in model_path
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("oasst_pythia")
def load_model(self, model_path: str, from_pretrained_kwargs: dict):
model, tokenizer = super().load_model(model_path, from_pretrained_kwargs)
model.config.eos_token_id = tokenizer.eos_token_id
model.config.pad_token_id = tokenizer.pad_token_id
return model, tokenizer
class OasstLLaMAAdapter(BaseModelAdapter):
"""The model adapter for OpenAssistant/oasst-sft-7-llama-30b"""
use_fast_tokenizer = False
def match(self, model_path: str):
model_path = model_path.lower()
if "openassistant-sft-7-llama-30b-hf" in model_path:
return True
return "oasst" in model_path and "pythia" not in model_path
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("oasst_llama")
class OpenChat35Adapter(BaseModelAdapter):
"""The model adapter for OpenChat 3.5 (e.g. openchat/openchat_3.5)"""
def match(self, model_path: str):
return "openchat" in model_path.lower() and "3.5" in model_path.lower()
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("openchat_3.5")
class PythiaAdapter(BaseModelAdapter):
"""The model adapter for any EleutherAI/pythia model"""
def match(self, model_path: str):
return "pythia" in model_path.lower()
def load_model(self, model_path: str, from_pretrained_kwargs: dict):
model, tokenizer = super().load_model(model_path, from_pretrained_kwargs)
model.config.eos_token_id = tokenizer.eos_token_id
model.config.pad_token_id = tokenizer.pad_token_id
return model, tokenizer
class StableLMAdapter(BaseModelAdapter):
"""The model adapter for StabilityAI/stablelm-tuned-alpha-7b"""
def match(self, model_path: str):
return "stablelm" in model_path.lower()
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("stablelm")
class MPTAdapter(BaseModelAdapter):
"""The model adapter for MPT series (mosaicml/mpt-7b-chat, mosaicml/mpt-30b-chat)"""
def match(self, model_path: str):
model_path = model_path.lower()
return "mpt" in model_path and not "airoboros" in model_path
def load_model(self, model_path: str, from_pretrained_kwargs: dict):
revision = from_pretrained_kwargs.get("revision", "main")
model = AutoModelForCausalLM.from_pretrained(
model_path,
low_cpu_mem_usage=True,
trust_remote_code=True,
max_seq_len=8192,
**from_pretrained_kwargs,
)
tokenizer = AutoTokenizer.from_pretrained(
model_path, trust_remote_code=True, revision=revision
)
model.config.eos_token_id = tokenizer.eos_token_id
model.config.pad_token_id = tokenizer.pad_token_id
return model, tokenizer
def get_default_conv_template(self, model_path: str) -> Conversation:
model_path = model_path.lower()
if "mpt-7b-chat" in model_path:
return get_conv_template("mpt-7b-chat")
elif "mpt-30b-chat" in model_path:
return get_conv_template("mpt-30b-chat")
elif "mpt-30b-instruct" in model_path:
return get_conv_template("mpt-30b-instruct")
else:
print(
"Warning: Loading base MPT model with `zero_shot` conversation configuration. "
"If this is not desired, inspect model configurations and names."
)
return get_conv_template("zero_shot")
class BaizeAdapter(BaseModelAdapter):
"""The model adapter for project-baize/baize-v2-7b"""
use_fast_tokenizer = False
def match(self, model_path: str):
return "baize" in model_path.lower()
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("baize")
class RwkvAdapter(BaseModelAdapter):
"""The model adapter for BlinkDL/RWKV-4-Raven"""
def match(self, model_path: str):
return "rwkv-4" in model_path.lower()
def load_model(self, model_path: str, from_pretrained_kwargs: dict):
from fastchat.model.rwkv_model import RwkvModel
model = RwkvModel(model_path)
revision = from_pretrained_kwargs.get("revision", "main")
tokenizer = AutoTokenizer.from_pretrained(
"EleutherAI/pythia-160m", revision=revision
)
return model, tokenizer
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("rwkv")
class OpenBuddyAdapter(BaseModelAdapter):
"""The model adapter for OpenBuddy/openbuddy-7b-v1.1-bf16-enc"""
use_fast_tokenizer = False
def match(self, model_path: str):
return "openbuddy" in model_path.lower()
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("openbuddy")
class PhoenixAdapter(BaseModelAdapter):
"""The model adapter for FreedomIntelligence/phoenix-inst-chat-7b"""
def match(self, model_path: str):
return "phoenix" in model_path.lower()
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("phoenix")
class ReaLMAdapter(BaseModelAdapter):
"""The model adapter for FreedomIntelligence/ReaLM-7b"""
def match(self, model_path: str):
return "ReaLM" in model_path
def load_model(self, model_path: str, from_pretrained_kwargs: dict):
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs
)
return model, tokenizer
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("ReaLM-7b-v1")
class ChatGPTAdapter(BaseModelAdapter):
"""The model adapter for ChatGPT"""
def match(self, model_path: str):
return model_path in (
"gpt-3.5-turbo",
"gpt-3.5-turbo-1106",
"gpt-4",
"gpt-4-turbo",
)
def load_model(self, model_path: str, from_pretrained_kwargs: dict):
raise NotImplementedError()
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("chatgpt")
class AzureOpenAIAdapter(BaseModelAdapter):
"""The model adapter for Azure OpenAI"""
def match(self, model_path: str):
return model_path in ("azure-gpt-35-turbo", "azure-gpt-4")
def load_model(self, model_path: str, from_pretrained_kwargs: dict):
raise NotImplementedError()
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("chatgpt")
class ClaudeAdapter(BaseModelAdapter):
"""The model adapter for Claude"""
def match(self, model_path: str):
return model_path in ANTHROPIC_MODEL_LIST
def load_model(self, model_path: str, from_pretrained_kwargs: dict):
raise NotImplementedError()
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("claude")
class BardAdapter(BaseModelAdapter):
"""The model adapter for Bard"""
def match(self, model_path: str):
return model_path == "bard"
def load_model(self, model_path: str, from_pretrained_kwargs: dict):
raise NotImplementedError()
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("bard")
class PaLM2Adapter(BaseModelAdapter):
"""The model adapter for PaLM2"""
def match(self, model_path: str):
return model_path == "palm-2"
def load_model(self, model_path: str, from_pretrained_kwargs: dict):
raise NotImplementedError()
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("bard")
class BiLLaAdapter(BaseModelAdapter):
"""The model adapter for Neutralzz/BiLLa-7B-SFT"""
def match(self, model_path: str):
return "billa" in model_path.lower()
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("billa")
class RedPajamaINCITEAdapter(BaseModelAdapter):
"""The model adapter for togethercomputer/RedPajama-INCITE-7B-Chat"""
def match(self, model_path: str):
return "redpajama-incite" in model_path.lower()
def load_model(self, model_path: str, from_pretrained_kwargs: dict):
revision = from_pretrained_kwargs.get("revision", "main")
tokenizer = AutoTokenizer.from_pretrained(model_path, revision=revision)
model = AutoModelForCausalLM.from_pretrained(
model_path,
low_cpu_mem_usage=True,
**from_pretrained_kwargs,
)
return model, tokenizer
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("redpajama-incite")
class H2OGPTAdapter(BaseModelAdapter):
"""The model adapter for h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b"""
use_fast_tokenizer = False
def match(self, model_path: str):
return "h2ogpt" in model_path.lower()
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("h2ogpt")
class RobinAdapter(BaseModelAdapter):
"""The model adapter for LMFlow/Full-Robin-7b-v2"""
use_fast_tokenizer = False
def match(self, model_path: str):
return "robin" in model_path.lower()
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("Robin")
class SnoozyAdapter(BaseModelAdapter):
"""The model adapter for nomic-ai/gpt4all-13b-snoozy"""
use_fast_tokenizer = False
def match(self, model_path: str):
model_path = model_path.lower()
return "gpt4all" in model_path and "snoozy" in model_path
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("snoozy")
class WizardLMAdapter(BaseModelAdapter):
"""The model adapter for WizardLM/WizardLM-13B-V1.0"""
use_fast_tokenizer = False
def match(self, model_path: str):
return "wizardlm" in model_path.lower()
def get_default_conv_template(self, model_path: str) -> Conversation:
model_path = model_path.lower()
if "13b" in model_path or "30b" in model_path or "70b" in model_path:
return get_conv_template("vicuna_v1.1")
else:
# TODO: use the recommended template for 7B
# (https://huggingface.co/WizardLM/WizardLM-13B-V1.0)
return get_conv_template("one_shot")
class ManticoreAdapter(BaseModelAdapter):
"""The model adapter for openaccess-ai-collective/manticore-13b-chat-pyg"""
use_fast_tokenizer = False
def match(self, model_path: str):
return "manticore" in model_path.lower()
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("manticore")
class GuanacoAdapter(BaseModelAdapter):
"""The model adapter for timdettmers/guanaco-33b-merged"""
use_fast_tokenizer = False
def match(self, model_path: str):
return "guanaco" in model_path.lower()
def load_model(self, model_path: str, from_pretrained_kwargs: dict):
revision = from_pretrained_kwargs.get("revision", "main")
tokenizer = AutoTokenizer.from_pretrained(
model_path, use_fast=self.use_fast_tokenizer, revision=revision
)
model = AutoModelForCausalLM.from_pretrained(
model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs
)
# Fix a bug in tokenizer config
tokenizer.eos_token_id = model.config.eos_token_id
return model, tokenizer
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("zero_shot")
class ChangGPTAdapter(BaseModelAdapter):
"""The model adapter for lcw99/polyglot-ko-12.8b-chang-instruct-chat"""
def match(self, model_path: str):
model_path = model_path.lower()
return "polyglot" in model_path and "chang" in model_path
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("polyglot_changgpt")
class CamelAdapter(BaseModelAdapter):
"""The model adapter for camel-ai/CAMEL-13B-Combined-Data"""
use_fast_tokenizer = False
def match(self, model_path: str):
return "camel" in model_path.lower()
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("vicuna_v1.1")
class TuluAdapter(BaseModelAdapter):
"""The model adapter for allenai/tulu-30b"""
use_fast_tokenizer = False
def match(self, model_path: str):
return "tulu" in model_path.lower()
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("tulu")
class FalconAdapter(BaseModelAdapter):
"""The model adapter for tiiuae/falcon-40b"""
def match(self, model_path: str):
return "falcon" in model_path.lower() and "chat" not in model_path.lower()
def load_model(self, model_path: str, from_pretrained_kwargs: dict):
revision = from_pretrained_kwargs.get("revision", "main")
        # bf16 is strongly suggested, as recommended by the Falcon authors
tokenizer = AutoTokenizer.from_pretrained(model_path, revision=revision)
model = AutoModelForCausalLM.from_pretrained(
model_path,
low_cpu_mem_usage=True,
trust_remote_code=True,
**from_pretrained_kwargs,
)
        # The Falcon tokenizer config and special tokens map do not define a pad token.
        # Set `pad_token_id` to 9, which corresponds to the special token '>>SUFFIX<<'.
tokenizer.pad_token_id = 9
return model, tokenizer
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("falcon")
class FalconChatAdapter(BaseModelAdapter):
def match(self, model_path: str):
return "falcon" in model_path.lower() and "chat" in model_path.lower()
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("falcon-chat")
class TigerBotAdapter(BaseModelAdapter):
"""The model adapter for TigerResearch/tigerbot-7b-sft"""
def match(self, model_path: str):
return "tigerbot" in model_path.lower()
def load_model(self, model_path: str, from_pretrained_kwargs: dict):
revision = from_pretrained_kwargs.get("revision", "main")
tokenizer = AutoTokenizer.from_pretrained(
model_path,
trust_remote_code=True,
revision=revision,
)
model = AutoModelForCausalLM.from_pretrained(
model_path,
trust_remote_code=True,
low_cpu_mem_usage=True,
**from_pretrained_kwargs,
)
return model, tokenizer
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("tigerbot")
class BaichuanAdapter(BaseModelAdapter):
"""The model adapter for Baichuan models (e.g., baichuan-inc/Baichuan-7B)"""
def match(self, model_path: str):
return "baichuan" in model_path.lower()
def load_model(self, model_path: str, from_pretrained_kwargs: dict):
revision = from_pretrained_kwargs.get("revision", "main")
tokenizer = AutoTokenizer.from_pretrained(
model_path, trust_remote_code=True, revision=revision
)
model = AutoModelForCausalLM.from_pretrained(
model_path,
trust_remote_code=True,
low_cpu_mem_usage=True,
**from_pretrained_kwargs,
)
return model, tokenizer
def get_default_conv_template(self, model_path: str) -> Conversation:
# for Baichuan-13B-Chat
if "chat" in model_path.lower():
if "baichuan2" in model_path.lower():
return get_conv_template("baichuan2-chat")
return get_conv_template("baichuan-chat")
return get_conv_template("zero_shot")
class XGenAdapter(BaseModelAdapter):
"""The model adapter for Salesforce/xgen-7b"""
def match(self, model_path: str):
return "xgen" in model_path.lower()
def load_model(self, model_path: str, from_pretrained_kwargs: dict):
revision = from_pretrained_kwargs.get("revision", "main")
model = AutoModelForCausalLM.from_pretrained(
model_path,
low_cpu_mem_usage=True,
trust_remote_code=True,
**from_pretrained_kwargs,
)
tokenizer = AutoTokenizer.from_pretrained(
model_path, trust_remote_code=True, revision=revision
)
model.config.eos_token_id = 50256
return model, tokenizer
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("xgen")
class NousHermesAdapter(BaseModelAdapter):
"""The model adapter for NousResearch/Nous-Hermes-13b"""
use_fast_tokenizer = False
def match(self, model_path: str):
return "nous-hermes" in model_path.lower()
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("alpaca")
class InternLMChatAdapter(BaseModelAdapter):
"""The model adapter for internlm/internlm-chat-7b"""
def match(self, model_path: str):
return "internlm-chat" in model_path.lower()
def load_model(self, model_path: str, from_pretrained_kwargs: dict):
revision = from_pretrained_kwargs.get("revision", "main")
model = AutoModelForCausalLM.from_pretrained(
model_path,
low_cpu_mem_usage=True,
trust_remote_code=True,
**from_pretrained_kwargs,
)
model = model.eval()
if "8k" in model_path.lower():
model.config.max_sequence_length = 8192
tokenizer = AutoTokenizer.from_pretrained(
model_path, trust_remote_code=True, revision=revision
)
return model, tokenizer
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("internlm-chat")
class StarChatAdapter(BaseModelAdapter):
"""The model adapter for HuggingFaceH4/starchat-beta"""
def match(self, model_path: str):
return "starchat" in model_path.lower()
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("starchat")
class MistralAdapter(BaseModelAdapter):
"""The model adapter for Mistral AI models"""
def match(self, model_path: str):
return "mistral" in model_path.lower()
def load_model(self, model_path: str, from_pretrained_kwargs: dict):
model, tokenizer = super().load_model(model_path, from_pretrained_kwargs)
model.config.eos_token_id = tokenizer.eos_token_id
model.config.pad_token_id = tokenizer.pad_token_id
return model, tokenizer
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("mistral")
class Llama2Adapter(BaseModelAdapter):
"""The model adapter for Llama-2 (e.g., meta-llama/Llama-2-7b-hf)"""
def match(self, model_path: str):
return "llama-2" in model_path.lower()
def load_model(self, model_path: str, from_pretrained_kwargs: dict):
model, tokenizer = super().load_model(model_path, from_pretrained_kwargs)
model.config.eos_token_id = tokenizer.eos_token_id
model.config.pad_token_id = tokenizer.pad_token_id
return model, tokenizer
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("llama-2")
class CuteGPTAdapter(BaseModelAdapter):
"""The model adapter for CuteGPT"""
def match(self, model_path: str):
return "cutegpt" in model_path.lower()
def load_model(self, model_path: str, from_pretrained_kwargs: dict):
tokenizer = LlamaTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs
)
tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids("<end>")
model.config.eos_token_id = tokenizer.eos_token_id
model.config.pad_token_id = tokenizer.eos_token_id
return model, tokenizer
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("cutegpt")
class OpenOrcaAdapter(BaseModelAdapter):
"""Model adapter for Open-Orca models which may use different prompt templates
- (e.g. Open-Orca/OpenOrcaxOpenChat-Preview2-13B, Open-Orca/Mistral-7B-OpenOrca)
- `OpenOrcaxOpenChat-Preview2-13B` uses their "OpenChat Llama2 V1" prompt template.
- [Open-Orca/OpenOrcaxOpenChat-Preview2-13B #Prompt Template](https://huggingface.co/Open-Orca/OpenOrcaxOpenChat-Preview2-13B#prompt-template)
- `Mistral-7B-OpenOrca` uses the [OpenAI's Chat Markup Language (ChatML)](https://github.com/openai/openai-python/blob/main/chatml.md)
format, with <|im_start|> and <|im_end|> tokens added to support this.
- [Open-Orca/Mistral-7B-OpenOrca #Prompt Template](https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca#prompt-template)
"""
use_fast_tokenizer = False
def match(self, model_path: str):
return (
"mistral-7b-openorca" in model_path.lower()
or "openorca" in model_path.lower()
)
def load_model(self, model_path: str, from_pretrained_kwargs: dict):
revision = from_pretrained_kwargs.get("revision", "main")
tokenizer = AutoTokenizer.from_pretrained(
model_path, use_fast=self.use_fast_tokenizer, revision=revision
)
model = AutoModelForCausalLM.from_pretrained(
model_path,
low_cpu_mem_usage=True,
**from_pretrained_kwargs,
).eval()
return model, tokenizer
def get_default_conv_template(self, model_path: str) -> Conversation:
if "mistral-7b-openorca" in model_path.lower():
return get_conv_template("mistral-7b-openorca")
return get_conv_template("open-orca")
class WizardCoderAdapter(BaseModelAdapter):
"""The model adapter for WizardCoder (e.g., WizardLM/WizardCoder-Python-34B-V1.0)"""
use_fast_tokenizer = False
def match(self, model_path: str):
return "wizardcoder" in model_path.lower()
def get_default_conv_template(self, model_path: str) -> Conversation:
# Same as Alpaca, see :
# https://github.com/nlpxucan/WizardLM/blob/main/WizardCoder/src/inference_wizardcoder.py#L60
return get_conv_template("alpaca")
class QwenChatAdapter(BaseModelAdapter):
"""The model adapter for Qwen/Qwen-7B-Chat
    To run this model, you need to install flash attention first:
``` bash
git clone https://github.com/Dao-AILab/flash-attention
cd flash-attention && pip install .
pip install csrc/layer_norm
pip install csrc/rotary
```
    Since flash-attn 2.0, the following renames happened:
- `flash_attn_unpadded_func` -> `flash_attn_varlen_func`
- `flash_attn_unpadded_qkvpacked_func` -> `flash_attn_varlen_qkvpacked_func`
- `flash_attn_unpadded_kvpacked_func` -> `flash_attn_varlen_kvpacked_func`
    You may need to revise the import at https://huggingface.co/Qwen/Qwen-7B-Chat/blob/main/modeling_qwen.py#L69
    to `from flash_attn.flash_attn_interface import flash_attn_varlen_func as flash_attn_unpadded_func`.
"""
def match(self, model_path: str):
return "qwen" in model_path.lower()
def float_set(self, config, option):
config.bf16 = False
config.fp16 = False
config.fp32 = False
if option == "bf16":
config.bf16 = True
elif option == "fp16":
config.fp16 = True
elif option == "fp32":
config.fp32 = True
else:
print("Invalid option. Please choose one from 'bf16', 'fp16' and 'fp32'.")
def load_model(self, model_path: str, from_pretrained_kwargs: dict):
from transformers.generation import GenerationConfig
revision = from_pretrained_kwargs.get("revision", "main")
config = AutoConfig.from_pretrained(
model_path,
trust_remote_code=True,
)
        # NOTE: if you use an old version of the model file, uncomment the line below.
# config.use_flash_attn = False
self.float_set(config, "fp16")
generation_config = GenerationConfig.from_pretrained(
model_path, trust_remote_code=True
)
model = AutoModelForCausalLM.from_pretrained(
model_path,
config=config,
low_cpu_mem_usage=True,
trust_remote_code=True,
**from_pretrained_kwargs,
).eval()
if hasattr(model.config, "use_dynamic_ntk") and model.config.use_dynamic_ntk:
model.config.max_sequence_length = 16384
tokenizer = AutoTokenizer.from_pretrained(
model_path, trust_remote_code=True, revision=revision
)
tokenizer.eos_token_id = config.eos_token_id
tokenizer.bos_token_id = config.bos_token_id
tokenizer.pad_token_id = generation_config.pad_token_id
model.config.eos_token_id = tokenizer.eos_token_id
model.config.bos_token_id = tokenizer.bos_token_id
model.config.pad_token_id = tokenizer.pad_token_id
return model, tokenizer
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("qwen-7b-chat")
class BGEAdapter(BaseModelAdapter):
"""The model adapter for BGE (e.g., BAAI/bge-large-en-v1.5)"""
use_fast_tokenizer = False
def match(self, model_path: str):
return "bge" in model_path.lower()
def load_model(self, model_path: str, from_pretrained_kwargs: dict):
revision = from_pretrained_kwargs.get("revision", "main")
model = AutoModel.from_pretrained(
model_path,
**from_pretrained_kwargs,
)
tokenizer = AutoTokenizer.from_pretrained(
model_path, trust_remote_code=True, revision=revision
)
if hasattr(model.config, "max_position_embeddings") and hasattr(
tokenizer, "model_max_length"
):
model.config.max_sequence_length = min(
model.config.max_position_embeddings, tokenizer.model_max_length
)
return model, tokenizer
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("one_shot")
class E5Adapter(BaseModelAdapter):
"""The model adapter for E5 (e.g., intfloat/e5-large-v2)"""
use_fast_tokenizer = False
def match(self, model_path: str):
return "e5-" in model_path.lower() and 'megrez' not in model_path.lower()
def load_model(self, model_path: str, from_pretrained_kwargs: dict):
revision = from_pretrained_kwargs.get("revision", "main")
model = AutoModel.from_pretrained(
model_path,
**from_pretrained_kwargs,
)
tokenizer = AutoTokenizer.from_pretrained(
model_path, trust_remote_code=True, revision=revision
)
if hasattr(model.config, "max_position_embeddings") and hasattr(
tokenizer, "model_max_length"
):
model.config.max_sequence_length = min(
model.config.max_position_embeddings, tokenizer.model_max_length
)
return model, tokenizer
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("one_shot")
class AquilaChatAdapter(BaseModelAdapter):
"""The model adapter for BAAI/Aquila
Now supports:
- BAAI/AquilaChat-7B
- BAAI/AquilaChat2-7B
- BAAI/AquilaChat2-34B
"""
def match(self, model_path: str):
return "aquila" in model_path.lower()
def load_model(self, model_path: str, from_pretrained_kwargs: dict):
revision = from_pretrained_kwargs.get("revision", "main")
model = AutoModelForCausalLM.from_pretrained(
model_path,
low_cpu_mem_usage=True,
trust_remote_code=True,
**from_pretrained_kwargs,
)
model = model.eval()
tokenizer = AutoTokenizer.from_pretrained(
model_path, trust_remote_code=True, revision=revision
)
return model, tokenizer
def get_default_conv_template(self, model_path: str) -> Conversation:
model_path = model_path.lower()
# See: https://huggingface.co/BAAI/AquilaChat2-34B/blob/4608b75855334b93329a771aee03869dbf7d88cc/predict.py#L347
if "aquilachat2" in model_path:
if "16k" in model_path:
return get_conv_template("aquila")
elif "34b" in model_path:
return get_conv_template("aquila-legacy")
else:
return get_conv_template("aquila-v1")
else:
return get_conv_template("aquila-chat")
class Lamma2ChineseAdapter(BaseModelAdapter):
"""The model adapter for FlagAlpha/LLama2-Chinese sft"""
def match(self, model_path: str):
return "llama2-chinese" in model_path.lower()
def load_model(self, model_path: str, from_pretrained_kwargs: dict):
revision = from_pretrained_kwargs.get("revision", "main")
tokenizer = AutoTokenizer.from_pretrained(
model_path,
trust_remote_code=True,
revision=revision,
)
model = AutoModelForCausalLM.from_pretrained(
model_path,
trust_remote_code=True,
low_cpu_mem_usage=True,
**from_pretrained_kwargs,
)
return model, tokenizer
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("llama2-chinese")
class VigogneAdapter(BaseModelAdapter):
"""The model adapter for vigogne (e.g., bofenghuang/vigogne-2-7b-chat)"""
use_fast_tokenizer = False
def match(self, model_path: str):
return bool(re.search(r"vigogne|vigostral", model_path, re.I))
def load_model(self, model_path: str, from_pretrained_kwargs: dict):
revision = from_pretrained_kwargs.get("revision", "main")
tokenizer = AutoTokenizer.from_pretrained(
model_path,
use_fast=self.use_fast_tokenizer,
trust_remote_code=True,
revision=revision,
)
model = AutoModelForCausalLM.from_pretrained(
model_path,
trust_remote_code=True,
low_cpu_mem_usage=True,
**from_pretrained_kwargs,
).eval()
return model, tokenizer
def get_default_conv_template(self, model_path: str) -> Conversation:
if "chat" in model_path.lower():
if "vigostral" in model_path.lower():
return get_conv_template("vigogne_chat_v3")
return get_conv_template("vigogne_chat_v2")
return get_conv_template("vigogne_instruct")
class OpenLLaMaOpenInstructAdapter(BaseModelAdapter):
"""The model adapter for OpenLLaMa-Open-Instruct (e.g., VMware/open-llama-7b-open-instruct)"""
use_fast_tokenizer = False
def match(self, model_path: str):
return (
"open-llama" in model_path.lower() and "open-instruct" in model_path.lower()
)
def load_model(self, model_path: str, from_pretrained_kwargs: dict):
revision = from_pretrained_kwargs.get("revision", "main")
tokenizer = AutoTokenizer.from_pretrained(
model_path,
use_fast=self.use_fast_tokenizer,
trust_remote_code=True,
revision=revision,
)
model = AutoModelForCausalLM.from_pretrained(
model_path,
trust_remote_code=True,
low_cpu_mem_usage=True,
**from_pretrained_kwargs,
).eval()
return model, tokenizer
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("alpaca")
class CodeLlamaAdapter(BaseModelAdapter):
"""The model adapter for CodeLlama (e.g., codellama/CodeLlama-34b-hf)"""
def match(self, model_path: str):
return "codellama" in model_path.lower()
def load_model(self, model_path: str, from_pretrained_kwargs: dict):
model, tokenizer = super().load_model(model_path, from_pretrained_kwargs)
model.config.eos_token_id = tokenizer.eos_token_id
model.config.pad_token_id = tokenizer.pad_token_id
return model, tokenizer
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("llama-2")
class PhindCodeLlamaAdapter(CodeLlamaAdapter):
"""The model adapter for Phind-CodeLlama (e.g., Phind/Phind-CodeLlama-34B-v2)"""
def match(self, model_path: str):
return "phind-codellama-" in model_path.lower()
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("phind")
class Llama2ChangAdapter(Llama2Adapter):
"""The model adapter for Llama2-ko-chang (e.g., lcw99/llama2-ko-chang-instruct-chat)"""
def match(self, model_path: str):
return "llama2-ko-chang" in model_path.lower()
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("polyglot_changgpt")
class ZephyrAdapter(BaseModelAdapter):
"""The model adapter for Zephyr (e.g. HuggingFaceH4/zephyr-7b-alpha)"""
def match(self, model_path: str):
return "zephyr" in model_path.lower()
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("zephyr")
class XwinLMAdapter(BaseModelAdapter):
"""The model adapter for Xwin-LM V0.1 and V0.2 series of models(e.g., Xwin-LM/Xwin-LM-70B-V0.1)"""
# use_fast_tokenizer = False
def match(self, model_path: str):
return "xwin-lm" in model_path.lower()
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("vicuna_v1.1")
class LemurAdapter(BaseModelAdapter):
"""The model adapter for OpenLemur/lemur-70b-chat-v1"""
use_fast_tokenizer = False
def match(self, model_path: str):
return "lemur-70b-chat" in model_path.lower()
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("lemur-70b-chat")
class PygmalionAdapter(BaseModelAdapter):
"""The model adapter for Pygmalion/Metharme series of models(e.g., PygmalionAI/mythalion-13b)"""
# use_fast_tokenizer = False
def match(self, model_path: str):
return bool(
re.search(r"pygmalion|mythalion|metharme", model_path.lower(), re.I)
)
def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("metharme")
# Note: the registration order matters.
# The one registered earlier has a higher matching priority.
register_model_adapter(PeftModelAdapter)
register_model_adapter(DeepseekChatAdapter)
register_model_adapter(VicunaAdapter)
register_model_adapter(AiroborosAdapter)
register_model_adapter(LongChatAdapter)
register_model_adapter(GoogleT5Adapter)
register_model_adapter(KoalaAdapter)
register_model_adapter(AlpacaAdapter)
register_model_adapter(ChatGLMAdapter)
register_model_adapter(CodeGeexAdapter)
register_model_adapter(DollyV2Adapter)
register_model_adapter(OasstPythiaAdapter)
register_model_adapter(OasstLLaMAAdapter)
register_model_adapter(OpenChat35Adapter)
register_model_adapter(StableLMAdapter)
register_model_adapter(BaizeAdapter)
register_model_adapter(RwkvAdapter)
register_model_adapter(OpenBuddyAdapter)
register_model_adapter(PhoenixAdapter)
register_model_adapter(BardAdapter)
register_model_adapter(PaLM2Adapter)
register_model_adapter(ChatGPTAdapter)
register_model_adapter(AzureOpenAIAdapter)
register_model_adapter(ClaudeAdapter)
register_model_adapter(MPTAdapter)
register_model_adapter(BiLLaAdapter)
register_model_adapter(RedPajamaINCITEAdapter)
register_model_adapter(H2OGPTAdapter)
register_model_adapter(RobinAdapter)
register_model_adapter(SnoozyAdapter)
register_model_adapter(WizardLMAdapter)
register_model_adapter(ManticoreAdapter)
register_model_adapter(GuanacoAdapter)
register_model_adapter(CamelAdapter)
register_model_adapter(ChangGPTAdapter)
register_model_adapter(TuluAdapter)
register_model_adapter(FalconChatAdapter)
register_model_adapter(FalconAdapter)
register_model_adapter(TigerBotAdapter)
register_model_adapter(BaichuanAdapter)
register_model_adapter(XGenAdapter)
register_model_adapter(NousHermesAdapter)
register_model_adapter(PythiaAdapter)
register_model_adapter(InternLMChatAdapter)
register_model_adapter(StarChatAdapter)
register_model_adapter(Llama2Adapter)
register_model_adapter(CuteGPTAdapter)
register_model_adapter(OpenOrcaAdapter)
register_model_adapter(MistralAdapter)
register_model_adapter(WizardCoderAdapter)
register_model_adapter(QwenChatAdapter)
register_model_adapter(AquilaChatAdapter)
register_model_adapter(BGEAdapter)
register_model_adapter(E5Adapter)
register_model_adapter(Lamma2ChineseAdapter)
register_model_adapter(VigogneAdapter)
register_model_adapter(OpenLLaMaOpenInstructAdapter)
register_model_adapter(ReaLMAdapter)
register_model_adapter(PhindCodeLlamaAdapter)
register_model_adapter(CodeLlamaAdapter)
register_model_adapter(Llama2ChangAdapter)
register_model_adapter(ZephyrAdapter)
register_model_adapter(XwinLMAdapter)
register_model_adapter(LemurAdapter)
register_model_adapter(PygmalionAdapter)
register_model_adapter(Zhinao360Adapter)
# After all adapters, try the default base adapter.
register_model_adapter(BaseModelAdapter)