Spaces:
Sleeping
Sleeping
File size: 3,053 Bytes
6582b49 30a712b 656f8ed 30a712b 6582b49 97e6401 6582b49 30a712b 6582b49 97e6401 7cd1a0a 97e6401 30a712b 97e6401 6582b49 97e6401 7cd1a0a 97e6401 30a712b 6582b49 97e6401 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 |
import gradio as gr
from tokenizer import HindiTokenizer
# Create the shared tokenizer at import time and load its BPE vocabulary
# from the file named below (a relative path).
_BPE_VOCAB_FILE = "hindi_bpe_vocab.model"
tokenizer = HindiTokenizer()
tokenizer.load_bpe_vocab(_BPE_VOCAB_FILE)
def encode_text(hindi_text):
    """
    Return the token IDs produced by the module-level tokenizer for the
    given Hindi text.
    """
    return tokenizer.encode(hindi_text)
def encode_text_with_compression(hindi_text):
    """
    Encode Hindi text into token IDs and report the compression ratio.

    The ratio is the UTF-8 byte length of the input divided by the number
    of token IDs produced, formatted with two decimal places.

    Args:
        hindi_text: Text to encode.

    Returns:
        A tuple ``(token_ids, ratio)``: ``token_ids`` is the value returned
        by ``tokenizer.encode`` and ``ratio`` is a string such as ``"3.50"``.
        The ratio is ``"0.00"`` when the input is empty or when no tokens
        are produced.
    """
    token_ids = tokenizer.encode(hindi_text)
    byte_count = len(hindi_text.encode("utf-8"))
    token_count = len(token_ids)

    # Guard on the divisor rather than the numerator: if the tokenizer ever
    # yields no tokens for non-empty text, dividing would raise
    # ZeroDivisionError.
    compression_ratio = byte_count / token_count if token_count else 0
    return token_ids, f"{compression_ratio:.2f}"
def decode_tokens(token_ids):
    """
    Decode a textual list of token IDs into Hindi text.

    Accepts comma-separated integers, optionally wrapped in square brackets
    and surrounded by whitespace, e.g. ``"1, 2, 3"`` or ``" [1, 2, 3] "``.
    Empty items (such as the one left by a trailing comma) are ignored.

    Args:
        token_ids: String holding the token IDs.

    Returns:
        The value returned by ``tokenizer.decode`` for the parsed IDs; an
        empty string when the input contains no IDs; or a message starting
        with ``"Error in processing token IDs:"`` when the input cannot be
        parsed.
    """
    try:
        # Trim whitespace before the brackets; otherwise " [1, 2] " keeps
        # its brackets and int() rejects "[1".
        body = token_ids.strip().strip("[]")
        parsed_ids = [int(piece) for piece in body.split(",") if piece.strip()]
    except (AttributeError, ValueError) as e:
        # AttributeError: input is not a string; ValueError: non-integer item.
        return f"Error in processing token IDs: {e}"

    if not parsed_ids:
        # Blank input: nothing to decode, so skip the tokenizer call.
        return ""
    return tokenizer.decode(parsed_ids)
# Gradio interface: one row holding an encode column and a decode column.
# NOTE(review): the nesting of the `with` blocks below was reconstructed from
# the order of the components — confirm against the intended layout.
with gr.Blocks() as app:
    gr.Markdown("## Hindi Tokenizer Encoder-Decoder")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Encode Hindi Text to Token IDs")
            hindi_text_input = gr.Textbox(label="Enter Hindi Text")
            token_ids_output = gr.Textbox(label="Token IDs (Encoded)", interactive=False)
            compression_ratio_output = gr.Textbox(label="Compression Ratio", interactive=False)
            encode_button = gr.Button("Encode")
            # Example inputs for the encoder, wired to the same function and
            # outputs as the Encode button.
            encode_example = gr.Examples(
                examples=["मेरा भारत महान॥", "आपका घर कितनी दूर है?", "स्वतंत्रता दिवस", "द क्विक ब्राउन फॉक्स जम्प्स ओवर ए लेज़ी डॉग।"],
                inputs=hindi_text_input,
                outputs=[token_ids_output, compression_ratio_output],
                fn=encode_text_with_compression
            )

        with gr.Column():
            gr.Markdown("### Decode Token IDs to Hindi Text")
            token_ids_input = gr.Textbox(label="Enter Token IDs (comma-separated or list)")
            decoded_text_output = gr.Textbox(label="Decoded Hindi Text", interactive=False)
            decode_button = gr.Button("Decode")

    # Event wiring is registered while the Blocks context is still open.
    encode_button.click(
        encode_text_with_compression,
        inputs=hindi_text_input,
        outputs=[token_ids_output, compression_ratio_output]
    )
    decode_button.click(decode_tokens, inputs=token_ids_input, outputs=decoded_text_output)

app.launch()