Shriti09 committed on
Commit
35a7290
·
1 Parent(s): a908e8a

Committing BPE to hugging face

Browse files
Files changed (2) hide show
  1. app.py +46 -0
  2. bpe_vocab_5000.json +0 -0
app.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
from pathlib import Path

import gradio as gr
3
+
4
class BPETokenizer:
    """Tokenizer that segments text using a pre-trained BPE vocabulary.

    The vocabulary is read from a JSON file and kept in ``self.vocab``.
    Iterating the loaded object must yield the token strings (for example a
    list of tokens, or a mapping whose keys are the tokens).
    """

    def __init__(self, vocab_path):
        """Load the vocabulary stored as JSON at ``vocab_path``.

        Raises:
            OSError: if the file cannot be opened.
            json.JSONDecodeError: if the file does not contain valid JSON.
        """
        with open(vocab_path, 'r', encoding='utf-8') as f:
            self.vocab = json.load(f)
        self._index_vocab()

    def _index_vocab(self):
        """Build the token lookup set and record the longest token length."""
        # ``tok.split() == [tok]`` is true only for non-empty strings that
        # contain no whitespace.  Empty tokens would match everywhere, and
        # tokens with whitespace can never match because encode() splits the
        # input on whitespace first.
        self._tokens = {
            tok for tok in self.vocab
            if isinstance(tok, str) and tok.split() == [tok]
        }
        self._max_token_len = max(map(len, self._tokens), default=0)

    def _longest_match(self, word, pos):
        """Return the length of the longest vocab token at ``word[pos:]``, or 0."""
        longest_possible = min(self._max_token_len, len(word) - pos)
        for length in range(longest_possible, 0, -1):
            if word[pos:pos + length] in self._tokens:
                return length
        return 0

    def encode(self, text):
        """Encode a piece of text into BPE tokens.

        The text is split on whitespace, and every word is scanned left to
        right, always taking the longest vocabulary token that starts at the
        current position.  A token that has been matched is never split
        again by a shorter token.  Consecutive characters that match no
        vocabulary token are kept together as a single piece.

        Args:
            text: the string to tokenize.

        Returns:
            A list of token strings; empty when ``text`` is empty or contains
            only whitespace.
        """
        pieces = []
        for word in text.split():
            pos = 0
            unmatched_start = None  # start index of the pending unmatched run
            while pos < len(word):
                match_len = self._longest_match(word, pos)
                if not match_len:
                    if unmatched_start is None:
                        unmatched_start = pos
                    pos += 1
                    continue
                # Flush the unmatched characters collected before this token.
                if unmatched_start is not None:
                    pieces.append(word[unmatched_start:pos])
                    unmatched_start = None
                pieces.append(word[pos:pos + match_len])
                pos += match_len
            if unmatched_start is not None:
                pieces.append(word[unmatched_start:])
        return pieces
15
+
16
# Load the pre-trained tokenizer.
# The vocabulary file is resolved next to this script, so start-up does not
# depend on the current working directory of the process.
vocab_path = str(Path(__file__).resolve().parent / "bpe_vocab_5000.json")
bpe_tokenizer = BPETokenizer(vocab_path)
19
+
20
+ # Gradio Functions
21
def encode_text(text):
    """Encode user-provided text with the pre-trained tokenizer.

    Args:
        text: the raw contents of the input box.

    Returns:
        The tokens joined with " | " so the boundaries are visible, or a
        prompt asking for input when nothing but whitespace was supplied.
    """
    # Guard against a missing value as well as blank text: calling .strip()
    # on None would raise AttributeError instead of showing the prompt.
    if text is None or not text.strip():
        return "Please enter some text to encode."  # Handle empty input
    tokens = bpe_tokenizer.encode(text)
    return " | ".join(tokens)  # Use a separator to display tokens clearly
27
+
28
+ # Gradio Interface
29
# Build the UI: a heading, a short description, an input/output row and a
# button that sends the input text through encode_text.
with gr.Blocks() as demo:
    gr.Markdown("# Bengali BPE Tokenizer")
    gr.Markdown(
        """
        This app encodes Bengali text into Byte Pair Encoding (BPE) tokens using a pre-trained tokenizer.
        Enter Bengali text below and press "Encode" to view the tokenized output.
        """
    )

    with gr.Row():
        input_text = gr.TextArea(label="Enter Bengali Text to Encode", lines=5, placeholder="Type Bengali text here...")
        output_tokens = gr.Textbox(label="Encoded Tokens", lines=5, interactive=False)

    encode_button = gr.Button("Encode")
    encode_button.click(encode_text, inputs=input_text, outputs=output_tokens)

# Launch the app only when executed as a script, so importing this module
# (for example to reuse the tokenizer or `demo`) does not start a server.
if __name__ == "__main__":
    demo.launch()
bpe_vocab_5000.json ADDED
The diff for this file is too large to render. See raw diff