Spaces:

AkashDataScience
/

languageBPE

Sleeping

AkashDataScience committed on Jun 27, 2024

Commit

f049fd3

1 Parent(s): 7d4468d

Minor change

Files changed (2) hide show

app.py CHANGED Viewed

@@ -29,8 +29,8 @@ demo = gr.Interface(
         ],
     outputs = [
         gr.Label(label="Token count"),
-        gr.HighlightedText(label="Sentence after tokenization", show_inline_category=False),
-        gr.Textbox(label="Encoding", type="text")
         ],
     title = title,
     description = description,

         ],
     outputs = [
         gr.Label(label="Token count"),
+        gr.HighlightedText(label="Sentence", show_inline_category=False),
+        gr.HighlightedText(label="Encoding", tshow_inline_category=False)
         ],
     title = title,
     description = description,

language_bpe/bpe_tokenizer.py CHANGED Viewed

@@ -42,7 +42,7 @@ class BPETokenizer(Tokenizer):
         vocab.update({idx: bytes(list(chr(value).encode('utf-8'))) for idx,value in zip(range(256, 384), range(2304, 2432))})
         print("Merging hindi characters in single token")
-        for index in tqdm(range(256, 384)):
             pair = list(vocab[index])
             ids = [merge_hindi(chunk_ids, pair, index) for chunk_ids in ids]

         vocab.update({idx: bytes(list(chr(value).encode('utf-8'))) for idx,value in zip(range(256, 384), range(2304, 2432))})
         print("Merging hindi characters in single token")
+        for index in range(256, 384):
             pair = list(vocab[index])
             ids = [merge_hindi(chunk_ids, pair, index) for chunk_ids in ids]