Spaces:

piyushgrover
/

HindiTokenizer

Sleeping

App Files Files Community

HindiTokenizer / app.py

piyushgrover

Update app.py

656f8ed verified 7 months ago

raw

history blame contribute delete

3.05 kB

	import gradio as gr
	from tokenizer import HindiTokenizer

	# Build the shared tokenizer once at import time and load its BPE vocabulary
	# from the model file that ships next to this script.
	BPE_VOCAB_PATH = "hindi_bpe_vocab.model"
	tokenizer = HindiTokenizer()
	tokenizer.load_bpe_vocab(BPE_VOCAB_PATH)

	def encode_text(hindi_text):
	    """
	    Returns the token IDs the shared tokenizer produces for the given Hindi text.
	    """
	    return tokenizer.encode(hindi_text)

	def encode_text_with_compression(hindi_text):
	    """
	    Encodes the given Hindi text into token IDs and calculates the compression ratio.

	    The compression ratio is the number of UTF-8 bytes in the input divided by
	    the number of token IDs produced for it.

	    Args:
	        hindi_text: The text to encode.

	    Returns:
	        A tuple ``(token_ids, ratio)`` where ``token_ids`` is whatever the
	        tokenizer's ``encode`` returned and ``ratio`` is the compression ratio
	        formatted with two decimal places ("0.00" when no tokens were produced).
	    """
	    token_ids = tokenizer.encode(hindi_text)

	    # Size of the original text in bytes and number of tokens it became.
	    text_byte_length = len(hindi_text.encode("utf-8"))
	    token_id_length = len(token_ids)

	    # Guard on the divisor itself: non-empty text that yields no tokens must
	    # not raise ZeroDivisionError. Empty text still reports a ratio of 0.
	    if token_id_length > 0:
	        compression_ratio = text_byte_length / token_id_length
	    else:
	        compression_ratio = 0

	    return token_ids, f"{compression_ratio:.2f}"

	def decode_tokens(token_ids):
	    """
	    Decodes a textual list of token IDs into Hindi text.

	    Args:
	        token_ids: A string of integers separated by commas, optionally wrapped
	            in square brackets, e.g. "[12, 7, 301]" or "12,7,301". Surrounding
	            whitespace and empty entries (such as a trailing comma) are ignored.

	    Returns:
	        The text returned by the tokenizer's ``decode`` for the parsed IDs, an
	        empty string when the input contains no IDs, or a message starting with
	        "Error in processing token IDs:" when the input cannot be parsed.
	    """
	    try:
	        # Strip whitespace first so brackets padded with spaces are still removed.
	        cleaned = token_ids.strip().strip("[]")
	        # Skip blank pieces so "", "[]" and trailing commas do not reach int().
	        parsed_ids = [int(piece) for piece in cleaned.split(",") if piece.strip()]
	    except (AttributeError, ValueError) as e:
	        # AttributeError: input is not a string; ValueError: a piece is not an integer.
	        return f"Error in processing token IDs: {e}"

	    # Nothing to decode: return an empty result instead of a parse error.
	    if not parsed_ids:
	        return ""

	    return tokenizer.decode(parsed_ids)


	# Gradio interface: an encoder column and a decoder column side by side.
	with gr.Blocks() as app:
	    gr.Markdown("## Hindi Tokenizer Encoder-Decoder")

	    with gr.Row():
	        # Encoder column: Hindi text in, token IDs and compression ratio out.
	        with gr.Column():
	            gr.Markdown("### Encode Hindi Text to Token IDs")
	            hindi_text_input = gr.Textbox(label="Enter Hindi Text")
	            token_ids_output = gr.Textbox(
	                label="Token IDs (Encoded)",
	                interactive=False,
	            )
	            compression_ratio_output = gr.Textbox(
	                label="Compression Ratio",
	                interactive=False,
	            )
	            encode_button = gr.Button("Encode")

	            # Sample sentences the user can pick to fill the encoder input.
	            sample_sentences = [
	                "मेरा भारत महान॥",
	                "आपका घर कितनी दूर है?",
	                "स्वतंत्रता दिवस",
	                "द क्विक ब्राउन फॉक्स जम्प्स ओवर ए लेज़ी डॉग।",
	            ]
	            encode_example = gr.Examples(
	                examples=sample_sentences,
	                inputs=hindi_text_input,
	                outputs=[token_ids_output, compression_ratio_output],
	                fn=encode_text_with_compression,
	            )

	        # Decoder column: token IDs in, Hindi text out.
	        with gr.Column():
	            gr.Markdown("### Decode Token IDs to Hindi Text")
	            token_ids_input = gr.Textbox(label="Enter Token IDs (comma-separated or list)")
	            decoded_text_output = gr.Textbox(label="Decoded Hindi Text", interactive=False)
	            decode_button = gr.Button("Decode")

	    # Wire the buttons to their handlers (still inside the Blocks context).
	    encode_button.click(
	        fn=encode_text_with_compression,
	        inputs=hindi_text_input,
	        outputs=[token_ids_output, compression_ratio_output],
	    )
	    decode_button.click(
	        fn=decode_tokens,
	        inputs=token_ids_input,
	        outputs=decoded_text_output,
	    )

	app.launch()