tokenization fixes
src/axolotl/prompt_strategies/alpaca_chat.py
ADDED

@@ -0,0 +1,8 @@
+from axolotl.prompt_tokenizers import AlpacaPromptTokenizingStrategy
+from axolotl.prompters import AlpacaPrompter, PromptStyle
+
+
+def load(tokenizer, cfg):
+    return AlpacaPromptTokenizingStrategy(
+        AlpacaPrompter(PromptStyle.chat), tokenizer, cfg.train_on_inputs, cfg.sequence_len
+    )
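The new module only exposes a load(tokenizer, cfg) factory; the dispatcher it plugs into (axolotl.prompt_strategies.load, imported by utils/data.py below) is not part of this commit. A plausible sketch of such a dispatcher, assuming it lives in src/axolotl/prompt_strategies/__init__.py and resolves the dataset type string to a submodule by name (the real implementation may differ):

# Hypothetical sketch only -- the actual axolotl.prompt_strategies.load is not shown in this diff.
import importlib


def load(strategy, tokenizer, cfg):
    # "alpaca_chat" would resolve to prompt_strategies/alpaca_chat.py; legacy types such as
    # "alpaca:chat" simply fail to import here and fall through to the elif chain in
    # load_tokenized_prepared_datasets.
    try:
        mod = importlib.import_module(f".{strategy}", "axolotl.prompt_strategies")
        return mod.load(tokenizer, cfg)
    except (ModuleNotFoundError, AttributeError):
        return None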
src/axolotl/prompt_tokenizers.py
CHANGED

@@ -38,14 +38,14 @@ class PromptTokenizingStrategy(abc.ABC):
     @functools.cache
     def _get_user_token(self):
         id_or_ids = self.tokenizer.convert_tokens_to_ids("<|USER|>")
-        if
+        if isinstance(id_or_ids, (int,)):
             return id_or_ids
         return False

     @functools.cache
     def _get_assistant_token(self):
         id_or_ids = self.tokenizer.convert_tokens_to_ids("<|ASSISTANT|>")
-        if
+        if isinstance(id_or_ids, (int,)):
             return id_or_ids
         return False
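The isinstance guard makes these cached helpers return either a single integer id, when the marker token really is in the vocabulary (for example because it was added through cfg.tokens in utils/models.py below), or False, which callers can test with a plain truthiness check. A rough illustration, with the checkpoint name as a placeholder:

# Illustration only; the checkpoint is a placeholder and ids vary by tokenizer.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")    # placeholder checkpoint
tokenizer.add_tokens(["<|USER|>", "<|ASSISTANT|>"])  # mirrors what cfg.tokens sets up

user_id = tokenizer.convert_tokens_to_ids("<|USER|>")
print(isinstance(user_id, int))  # True once the token exists in the vocab
# For a token that was never added, the lookup may return None or an unknown-token id
# depending on the tokenizer, so the explicit int check (with a False fallback) keeps
# the callers' simple `if user_token:` test meaningful.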
@@ -272,15 +272,16 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
                 # this is still the user query, we should
                 res = self._tokenize(part.strip(), add_eos_token=False, strip_bos_token=True)
                 if user_token:
-                    res = [user_token, *res]
+                    res["input_ids"] = [user_token, *res["input_ids"]]
                 # everything from this is masked out from the labels
                 labels = [ IGNORE_TOKEN_ID ] * len(res["input_ids"])
             elif part[0] == "ASSISTANT:":
+                # TODO label assistant token/tokens w/ IGNORE_TOKEN_ID
                 part = part[0] + part[1] if not assistant_token else part[1]
                 # this should be the assistent response, should end with an eos token
                 res = self._tokenize(part.strip(), add_eos_token=True, strip_bos_token=True)
                 if assistant_token:
-                    res = [assistant_token, *res]
+                    res["input_ids"] = [assistant_token, *res["input_ids"]]
                 # not masked out from labels
                 labels = copy.deepcopy(res["input_ids"])
             else:
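The old lines replaced the whole res mapping with a plain list. Since _tokenize returns the usual tokenizer output (roughly {"input_ids": [...], "attention_mask": [...]}), that broke the len(res["input_ids"]) and copy.deepcopy(res["input_ids"]) lines that follow. A minimal before/after sketch with made-up token ids:

# Made-up ids; IGNORE_TOKEN_ID is assumed to be the usual -100 loss-mask value.
IGNORE_TOKEN_ID = -100
user_token = 32001  # hypothetical id for <|USER|>
res = {"input_ids": [319, 626], "attention_mask": [1, 1]}

# old: res = [user_token, *res]  -- unpacking a dict yields its keys, so res became
# [32001, "input_ids", "attention_mask"] and the later res["input_ids"] raised TypeError.

# fixed: prepend the marker id to the token ids only
res["input_ids"] = [user_token, *res["input_ids"]]
labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])  # the user turn stays masked out of the loss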
src/axolotl/utils/data.py
CHANGED

@@ -12,6 +12,7 @@ from datasets import (
 from huggingface_hub import hf_hub_download

 from axolotl.datasets import TokenizedPromptDataset, ConstantLengthDataset
+from axolotl.prompt_strategies import load
 from axolotl.prompt_tokenizers import (
     AlpacaPromptTokenizingStrategy,
     GPTeacherPromptTokenizingStrategy,
@@ -94,10 +95,13 @@ def load_tokenized_prepared_datasets(tokenizer, cfg, default_dataset_prepared_path):
         if not ds:
             raise Exception("unhandled dataset load")
         d_type = d.type
-        d_type_split =
+        d_type_split = d_type.split(":")
         d_base_type = d_type_split[0]
         d_prompt_style = d_type_split[1] if len(d_type_split) > 1 else None
-        if
+        if (ds_strategy := load(d.type, tokenizer, cfg)):
+            ds_wrapper = TokenizedPromptDataset(ds_strategy, ds["train"])
+            datasets.append(ds_wrapper)
+        elif d_base_type == "alpaca":
             ds_strategy = AlpacaPromptTokenizingStrategy(
                 AlpacaPrompter(d_prompt_style), tokenizer, cfg.train_on_inputs, cfg.sequence_len
             )
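A dataset's type string now does double duty: it is offered to the pluggable strategy loader first, and only when that returns nothing does the code fall back to splitting it into a base type and prompt style for the legacy handlers. Illustrative values only (the dataset path is a placeholder):

# A dataset entry in the YAML config might look like:
#   datasets:
#     - path: some/dataset        # placeholder
#       type: alpaca:chat
d_type = "alpaca:chat"
d_type_split = d_type.split(":")
d_base_type = d_type_split[0]                                        # "alpaca"
d_prompt_style = d_type_split[1] if len(d_type_split) > 1 else None  # "chat"

# Dispatch order: a matching prompt_strategies module wins; otherwise the
# elif chain keyed on d_base_type (e.g. "alpaca") handles the dataset.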
src/axolotl/utils/models.py
CHANGED

@@ -220,7 +220,7 @@ def load_model(
     for k, v in cfg.special_tokens.items():
         tokenizer.add_special_tokens({k: v})
     if cfg.tokens:
-        tokenizer.add_tokens(cfg.tokens)
+        tokenizer.add_tokens(list(cfg.tokens))

     embeddings_len = math.ceil(len(tokenizer) / 32) * 32
     model.resize_token_embeddings(embeddings_len)
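cfg.tokens holds the extra marker tokens from the YAML config (alongside the cfg.special_tokens mapping looped over above); wrapping it in list(...) hands tokenizer.add_tokens a plain list regardless of what sequence type the config wrapper returns. The embedding matrix is then resized, rounded up to a multiple of 32. A small sketch of that rounding, with an illustrative vocab size:

# Illustrative numbers only: e.g. a 32000-token vocab plus the two marker tokens above.
import math

new_vocab_size = 32000 + 2
embeddings_len = math.ceil(new_vocab_size / 32) * 32
print(embeddings_len)  # 32032 -- padded up to the next multiple of 32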