Spaces:
Sleeping
Sleeping
import gradio as gr | |
import json | |
class BPETokenizer:
    """Vocabulary-driven tokenizer that splits text into known sub-word tokens.

    The vocabulary is loaded from a JSON file. Iterating the loaded object
    must yield token strings (for example a list of tokens, or a mapping
    whose keys are the tokens).
    """

    def __init__(self, vocab_path):
        """Load the pre-trained vocabulary.

        Args:
            vocab_path: Path to a UTF-8 encoded JSON vocabulary file.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file does not contain valid JSON.
        """
        with open(vocab_path, 'r', encoding='utf-8') as f:
            self.vocab = json.load(f)

    def encode(self, text):
        """Encode a piece of text into tokens using greedy longest-match.

        The text is scanned left to right. At each position the longest
        vocabulary token starting there is emitted. Whitespace separates
        tokens and is never emitted. Consecutive characters that no
        vocabulary token covers are grouped into a single token.

        Matching in one pass (instead of running ``str.replace`` once per
        vocabulary entry) keeps a longer token intact even when shorter
        tokens contained in it are also part of the vocabulary.

        Args:
            text: The string to tokenize.

        Returns:
            A list of token strings in order of appearance; empty when the
            text is empty or contains only whitespace.
        """
        # Empty entries can never advance the scan, and entries containing
        # whitespace can never match because whitespace is a separator.
        known = {
            token for token in self.vocab
            if token and not any(ch.isspace() for ch in token)
        }
        longest = max(map(len, known), default=0)

        tokens = []
        unmatched = []  # current run of characters not covered by the vocabulary

        def flush_unmatched():
            if unmatched:
                tokens.append("".join(unmatched))
                unmatched.clear()

        pos, end = 0, len(text)
        while pos < end:
            char = text[pos]
            if char.isspace():
                flush_unmatched()
                pos += 1
                continue
            # Try the longest candidate first so longer tokens win.
            for size in range(min(longest, end - pos), 0, -1):
                piece = text[pos:pos + size]
                if piece in known:
                    flush_unmatched()
                    tokens.append(piece)
                    pos += size
                    break
            else:
                unmatched.append(char)
                pos += 1
        flush_unmatched()
        return tokens
# Build the shared tokenizer once at import time; the relative path is
# resolved against the current working directory.
vocab_path = "bpe_vocab_5000.json"
bpe_tokenizer = BPETokenizer(vocab_path)
# Gradio Functions | |
def encode_text(text):
    """Encode user-provided text with the pre-trained tokenizer.

    Args:
        text: Raw text from the input box; may be empty or ``None``.

    Returns:
        The tokens joined with " | " so they are easy to tell apart, or a
        prompt message when there is nothing to encode.
    """
    # Treat a missing value like blank input instead of raising
    # AttributeError on ``None.strip()``.
    if text is None or not text.strip():
        return "Please enter some text to encode."
    tokens = bpe_tokenizer.encode(text)
    return " | ".join(tokens)
# Gradio Interface
# NOTE(review): indentation was lost in the captured source; this layout
# assumes only the two text boxes sit inside the Row and the button is placed
# below it -- confirm against the deployed app.
with gr.Blocks() as demo:
    gr.Markdown("# Bengali BPE Tokenizer")
    gr.Markdown(
        "\n"
        "This app encodes Bengali text into Byte Pair Encoding (BPE) tokens using a pre-trained tokenizer.\n"
        'Enter Bengali text below and press "Encode" to view the tokenized output.\n'
    )
    with gr.Row():
        input_text = gr.TextArea(
            label="Enter Bengali Text to Encode",
            lines=5,
            placeholder="Type Bengali text here...",
        )
        output_tokens = gr.Textbox(
            label="Encoded Tokens",
            lines=5,
            interactive=False,
        )
    encode_button = gr.Button("Encode")
    # Clicking the button sends the input text through encode_text and shows
    # the result in the read-only output box.
    encode_button.click(encode_text, inputs=input_text, outputs=output_tokens)

# Launch the app
demo.launch(share=True)