# Shriti09's picture
# corrected package name
# 17a53a3
import json
import re

import gradio as gr
class BPETokenizer:
    """Greedy longest-match tokenizer driven by a pre-trained vocabulary.

    The vocabulary is loaded from a JSON file. Iterating the loaded object
    must yield the token strings (for example a JSON array of strings, or a
    JSON object whose keys are the tokens).
    """

    def __init__(self, vocab_path):
        """Load the vocabulary from disk.

        Args:
            vocab_path: Path to a UTF-8 encoded JSON vocabulary file.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file does not contain valid JSON.
        """
        with open(vocab_path, 'r', encoding='utf-8') as f:
            self.vocab = json.load(f)

    def encode(self, text):
        """Split ``text`` into vocabulary tokens using greedy longest match.

        The text is scanned once, left to right. At each position the longest
        vocabulary token that matches is isolated as its own token. Characters
        not covered by any token are kept, grouped into whitespace-delimited
        chunks. Whitespace itself never appears in the output.

        Args:
            text: The string to tokenize.

        Returns:
            A list of token strings in order of appearance.
        """
        # Longest first: regex alternation picks the first alternative that
        # matches, so this ordering yields longest-match behaviour. Empty
        # tokens are dropped because they would match at every position.
        candidates = sorted(
            (token for token in self.vocab if token), key=len, reverse=True
        )
        if not candidates:
            return text.split()

        # A single pass is essential: replacing token by token would let a
        # short token re-split a longer token that was already isolated.
        pattern = re.compile("|".join(re.escape(token) for token in candidates))
        spaced = pattern.sub(lambda match: f" {match.group(0)} ", text)
        return spaced.split()
# Load the pre-trained tokenizer
# Vocabulary file consumed by the shared tokenizer instance below.
vocab_path = "bpe_vocab_5000.json"
# Module-level tokenizer, constructed at import time from that file.
bpe_tokenizer = BPETokenizer(vocab_path=vocab_path)
# Gradio Functions
def encode_text(text):
    """Tokenize user-supplied text and format the tokens for display.

    Args:
        text: The raw text from the input component. ``None`` is accepted
            and treated like blank input.

    Returns:
        The tokens joined with ``" | "``, or a prompt message when the input
        is missing or contains only whitespace.
    """
    # Check for None before calling .strip() so a missing value yields the
    # prompt message instead of raising AttributeError.
    if text is None or not text.strip():
        return "Please enter some text to encode."

    tokens = bpe_tokenizer.encode(text)
    # A visible separator makes the token boundaries obvious in the output box.
    return " | ".join(tokens)
# Gradio Interface
# Assemble the page: title, description, input/output boxes and the button.
# NOTE(review): the source lost its indentation; the nesting below (both text
# boxes inside the Row, button and click handler directly under Blocks) is a
# reconstruction -- confirm against the deployed layout.
with gr.Blocks() as demo:
    gr.Markdown("# Bengali BPE Tokenizer")
    gr.Markdown(
        """
This app encodes Bengali text into Byte Pair Encoding (BPE) tokens using a pre-trained tokenizer.
Enter Bengali text below and press "Encode" to view the tokenized output.
"""
    )

    # Text input and read-only token output share one row.
    with gr.Row():
        input_text = gr.TextArea(
            label="Enter Bengali Text to Encode",
            lines=5,
            placeholder="Type Bengali text here...",
        )
        output_tokens = gr.Textbox(
            label="Encoded Tokens",
            lines=5,
            interactive=False,
        )

    # Pressing the button feeds the input text through encode_text and
    # writes the result into the output box.
    encode_button = gr.Button("Encode")
    encode_button.click(
        fn=encode_text,
        inputs=input_text,
        outputs=output_tokens,
    )

# Start the app; runs at import time, with sharing enabled.
demo.launch(share=True)