Spaces:

Shriti09
/

BengaliBytePairTokenizer

Sleeping

App Files Files Community

Shriti09 committed on Jan 14

Commit

35a7290

1 Parent(s): a908e8a

Committing BPE to Hugging Face

Browse files

Files changed (2) hide show

app.py +46 -0
bpe_vocab_5000.json +0 -0

app.py ADDED Viewed

	@@ -0,0 +1,46 @@

+import gradio as gr
+import json
class BPETokenizer:
    """Longest-token-first text segmenter backed by a pre-trained BPE vocabulary.

    The vocabulary is read from a JSON file. Iterating over the loaded object
    must yield the token strings (for example a JSON array of tokens, or a
    JSON object whose keys are the tokens).
    """

    def __init__(self, vocab_path):
        """Load the pre-trained vocabulary.

        Args:
            vocab_path: Path to a UTF-8 encoded JSON vocabulary file.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file does not contain valid JSON.
        """
        with open(vocab_path, 'r', encoding='utf-8') as f:
            self.vocab = json.load(f)

    def encode(self, text: str) -> list:
        """Encode a piece of text into BPE tokens.

        The text is first cut on whitespace. Vocabulary tokens are then
        applied from longest to shortest (ties keep vocabulary order). Once a
        span has been recognised as a token it is frozen, so a shorter token
        can never break up a longer one that was matched earlier. Characters
        that match no vocabulary token are returned as their own pieces.

        Because whitespace always separates output tokens, vocabulary entries
        that contain whitespace never match; empty entries are ignored.

        Args:
            text: The text to tokenize.

        Returns:
            The tokens in reading order; an empty list for empty or
            whitespace-only input.
        """
        # Each entry is (piece, frozen). Frozen pieces are finished tokens.
        pieces = [(chunk, False) for chunk in text.split()]

        # Skip empty entries: splitting on "" is an error and carries no meaning.
        ordered_tokens = sorted(
            (token for token in self.vocab if token), key=len, reverse=True
        )

        for token in ordered_tokens:
            updated = []
            for piece, frozen in pieces:
                if frozen or token not in piece:
                    updated.append((piece, frozen))
                    continue
                # Cut the piece around every occurrence of the token, freezing
                # the occurrences and leaving the remainders open for shorter
                # tokens.
                for index, remainder in enumerate(piece.split(token)):
                    if index:
                        updated.append((token, True))
                    if remainder:
                        updated.append((remainder, False))
            pieces = updated

        return [piece for piece, _ in pieces]
# Build the single tokenizer instance used by the request handler, reading
# the vocabulary JSON from the path below.
vocab_path = "bpe_vocab_5000.json"
bpe_tokenizer = BPETokenizer(vocab_path=vocab_path)
+# Gradio Functions
def encode_text(text):
    """Encode user-provided text with the pre-trained tokenizer.

    Args:
        text: The text to encode. ``None`` is treated like empty input so
            that a handler call without a value does not raise.

    Returns:
        The tokens joined with ``" | "``, or a prompt asking for input when
        the text is missing, empty, or whitespace-only.
    """
    # Guard against None as well as blank strings before calling .strip().
    if not text or not text.strip():
        return "Please enter some text to encode."  # Handle empty input
    tokens = bpe_tokenizer.encode(text)
    return " | ".join(tokens)  # Use a separator to display tokens clearly
+# Gradio Interface
with gr.Blocks() as demo:
    # Page heading followed by a short usage note.
    gr.Markdown("# Bengali BPE Tokenizer")
    gr.Markdown(
        """
        This app encodes Bengali text into Byte Pair Encoding (BPE) tokens using a pre-trained tokenizer.
        Enter Bengali text below and press "Encode" to view the tokenized output.
        """
    )

    # Input area and read-only result box, declared together inside one row.
    with gr.Row():
        input_text = gr.TextArea(
            lines=5,
            label="Enter Bengali Text to Encode",
            placeholder="Type Bengali text here...",
        )
        output_tokens = gr.Textbox(
            lines=5,
            label="Encoded Tokens",
            interactive=False,
        )

    # Clicking the button runs encode_text on the input and shows the result.
    encode_button = gr.Button("Encode")
    encode_button.click(fn=encode_text, inputs=input_text, outputs=output_tokens)

# Start the Gradio server for the interface defined above.
demo.launch()

bpe_vocab_5000.json ADDED Viewed

The diff for this file is too large to render. See raw diff