Spaces:
Sleeping
Sleeping
File size: 1,727 Bytes
35a7290 17a53a3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 |
import json
import re

import gradio as gr
class BPETokenizer:
    """Greedy longest-match tokenizer backed by a pre-trained vocabulary file.

    The vocabulary is loaded from JSON. Iterating the loaded object must yield
    the token strings (for example a JSON array of strings, or a JSON object
    whose keys are the tokens).
    """

    def __init__(self, vocab_path):
        """Load the vocabulary and prepare the token matcher.

        Args:
            vocab_path: Path to a UTF-8 encoded JSON vocabulary file.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file does not contain valid JSON.
        """
        with open(vocab_path, 'r', encoding='utf-8') as f:
            self.vocab = json.load(f)
        # Compile once here so encode() does not rebuild the matcher per call.
        self._pattern = self._compile_pattern(self.vocab)

    @staticmethod
    def _compile_pattern(vocab):
        """Return a regex matching any non-empty vocab token, or None if there are none."""
        # Longest tokens come first: regex alternation tries alternatives in
        # order, so this yields the longest vocabulary match at each position.
        # Empty tokens are skipped because they would match between every
        # character and shred the text.
        tokens = sorted((token for token in vocab if token), key=len, reverse=True)
        if not tokens:
            return None
        return re.compile("|".join(re.escape(token) for token in tokens))

    def encode(self, text):
        """Encode a piece of text into BPE tokens.

        The text is scanned once from left to right. At each position the
        longest matching vocabulary token is isolated; text that matches no
        token is kept and split on whitespace.

        Args:
            text: The string to tokenize.

        Returns:
            A list of token strings in their order of appearance.
        """
        if self._pattern is None:
            return text.split()
        # A single substitution pass means a token that has been isolated is
        # never re-split by a shorter token contained inside it, which is what
        # happened with one str.replace() call per vocabulary entry.
        spaced = self._pattern.sub(lambda match: f" {match.group(0)} ", text)
        return spaced.split()
# Build the single shared tokenizer instance from the vocabulary JSON file.
# The path is relative, so it is resolved against the current working directory.
vocab_path = "bpe_vocab_5000.json"
bpe_tokenizer = BPETokenizer(vocab_path=vocab_path)
# Gradio Functions
def encode_text(text):
    """Encode user-provided text with the pre-trained tokenizer.

    Args:
        text: Raw text from the input box; may be None, empty, or only
            whitespace.

    Returns:
        The tokens joined with " | " so token boundaries are visible, or a
        prompt message when there is nothing to encode.
    """
    # Treat a missing value (None) like empty input instead of letting
    # .strip() raise AttributeError.
    if text is None or not text.strip():
        return "Please enter some text to encode."  # Handle empty input
    tokens = bpe_tokenizer.encode(text)
    return " | ".join(tokens)  # Use a separator to display tokens clearly
# Gradio Interface
with gr.Blocks() as demo:
    # Page title followed by a short description of the app.
    gr.Markdown(value="# Bengali BPE Tokenizer")
    gr.Markdown(
        value=(
            "\n"
            "This app encodes Bengali text into Byte Pair Encoding (BPE) tokens using a pre-trained tokenizer.\n"
            'Enter Bengali text below and press "Encode" to view the tokenized output.\n'
        )
    )

    # Input and output boxes share one row.
    # NOTE(review): the original indentation was lost; the button is assumed
    # to sit below the row rather than inside it - confirm against the
    # deployed layout.
    with gr.Row():
        input_text = gr.TextArea(
            label="Enter Bengali Text to Encode",
            placeholder="Type Bengali text here...",
            lines=5,
        )
        output_tokens = gr.Textbox(
            label="Encoded Tokens",
            interactive=False,
            lines=5,
        )

    # Clicking the button runs encode_text on the input box and writes the
    # result to the output box.
    encode_button = gr.Button(value="Encode")
    encode_button.click(fn=encode_text, inputs=input_text, outputs=output_tokens)

# Launch the app; share=True asks Gradio to create a public share link.
demo.launch(share=True)
|