AkashDataScience committed on
Commit
1287e6c
·
1 Parent(s): c563cac

Updated encoding

Browse files
Files changed (1) hide show
  1. language_bpe/bpe_tokenizer.py +1 -1
language_bpe/bpe_tokenizer.py CHANGED
@@ -142,7 +142,7 @@ class BPETokenizer(Tokenizer):
142
  vocab.update({idx: bytes(list(chr(value).encode('utf-8'))) for idx,value in zip(range(256, 384), range(2304, 2432))})
143
  for index in tqdm(range(256, 384)):
144
  pair = list(vocab[index])
145
- ids = [merge_hindi(chunk_ids, pair, index) for chunk_ids in ids]
146
  chunk_ids = self._encode_chunk(ids)
147
  ids_list.extend(chunk_ids)
148
  return ids_list
 
142
  vocab.update({idx: bytes(list(chr(value).encode('utf-8'))) for idx,value in zip(range(256, 384), range(2304, 2432))})
143
  for index in tqdm(range(256, 384)):
144
  pair = list(vocab[index])
145
+ ids = merge_hindi(ids, pair, index)
146
  chunk_ids = self._encode_chunk(ids)
147
  ids_list.extend(chunk_ids)
148
  return ids_list