Spaces:
Sleeping
Sleeping
Commit
·
1287e6c
1
Parent(s):
c563cac
Updated encoding
Browse files
language_bpe/bpe_tokenizer.py
CHANGED
@@ -142,7 +142,7 @@ class BPETokenizer(Tokenizer):
|
|
142 |
vocab.update({idx: bytes(list(chr(value).encode('utf-8'))) for idx,value in zip(range(256, 384), range(2304, 2432))})
|
143 |
for index in tqdm(range(256, 384)):
|
144 |
pair = list(vocab[index])
|
145 |
-
ids =
|
146 |
chunk_ids = self._encode_chunk(ids)
|
147 |
ids_list.extend(chunk_ids)
|
148 |
return ids_list
|
|
|
142 |
vocab.update({idx: bytes(list(chr(value).encode('utf-8'))) for idx,value in zip(range(256, 384), range(2304, 2432))})
|
143 |
for index in tqdm(range(256, 384)):
|
144 |
pair = list(vocab[index])
|
145 |
+
ids = merge_hindi(ids, pair, index)
|
146 |
chunk_ids = self._encode_chunk(ids)
|
147 |
ids_list.extend(chunk_ids)
|
148 |
return ids_list
|