Spaces:

AkashDataScience
/

languageBPE

Sleeping

App Files Files Community

languageBPE / app.py

AkashDataScience

Updated examples

f5c5cfa about 1 year ago

raw

history blame contribute delete

3.19 kB

	import torch
	import random
	import gradio as gr
	from language_bpe import BPETokenizer

	def _load_tokenizer(model_path):
	    """Return a new BPETokenizer after calling its load() with model_path."""
	    loaded = BPETokenizer()
	    loaded.load(model_path)
	    return loaded


	# One tokenizer instance per model file.
	hinglish_tokenizer = _load_tokenizer('models/hinglish_5000.model')
	hindi_tokenizer = _load_tokenizer('models/hindi_5000.model')
	english_tokenizer = _load_tokenizer('models/english_5000.model')

	# Maps the tokenizer name received by inference() to the loaded tokenizer.
	tokenizer_dict = {
	    "Hinglish_5k": hinglish_tokenizer,
	    "Hindi_5k": hindi_tokenizer,
	    "English_5k": english_tokenizer,
	}

	def inference(input_text, tokenizer):
	    """Tokenize input_text with the tokenizer registered under the given name.

	    Args:
	        input_text: Text handed to the selected tokenizer's encode_ordinary().
	        tokenizer: Key into tokenizer_dict selecting which tokenizer to use.

	    Returns:
	        A 3-tuple of:
	          * the number of tokens produced,
	          * a list of (decoded token text, token id as str) pairs,
	          * a list of (token id as str, token id as str) pairs.

	    Raises:
	        KeyError: If tokenizer is not a key of tokenizer_dict.
	    """
	    active = tokenizer_dict[tokenizer]
	    token_ids = active.encode_ordinary(input_text)

	    # Decode every id on its own so each token's text is available separately.
	    pieces = [active.decode([token_id]) for token_id in token_ids]
	    id_labels = [str(token_id) for token_id in token_ids]

	    color_sentence = list(zip(pieces, id_labels))
	    color_encoding = [(label, label) for label in id_labels]
	    return len(token_ids), color_sentence, color_encoding

	title = "Bilingual Tokenizer"
	description = "A simple Gradio interface to see tokenization of Hindi and English(Hinglish) text"

	# Sample sentences, grouped under the tokenizer name each one is paired with.
	_sample_texts = {
	    "Hinglish_5k": [
	        "I can't believe this is the eighth time I've smashed open my piggy bank on the same day! मैं विश्वास नहीं कर सकता कि यह आठवीं बार है जब मैंने उसी दिन अपना गुल्लक तोड़ा है!",
	        "Please wait outside of the house. कृपया घर के बाहर प्रतीक्षा करें.",
	        "He said he was not there yesterday; however, many people saw him there. उन्होंने कहा कि वह कल वहां नहीं थे; हालाँकि, कई लोगों ने उसे वहाँ देखा।",
	    ],
	    "Hindi_5k": [
	        "कृपया घर के बाहर प्रतीक्षा करें.",
	        "उन्होंने कहा कि वह कल वहां नहीं थे; हालाँकि, कई लोगों ने उसे वहाँ देखा।",
	        "गुलाबी जिराफ और लाल हाथियों को दिखाने के लिए दरवाजा खुला।",
	    ],
	    "English_5k": [
	        "He said he was not there yesterday; however, many people saw him there.",
	        "The door swung open to reveal pink giraffes and red elephants.",
	        "There's enough glass in my cupboard to build an undersea aquarium.",
	    ],
	}

	# Flatten into [text, tokenizer name] rows, keeping the group order above.
	examples = [
	    [sample, name]
	    for name, samples in _sample_texts.items()
	    for sample in samples
	]

	# Input components, listed in the order inference() takes its arguments.
	_text_box = gr.Textbox(
	    label="Enter any sentence in Hindi, English or both language",
	    type="text",
	)
	_tokenizer_picker = gr.Dropdown(
	    label="Tokenizer",
	    choices=["Hinglish_5k", "Hindi_5k", "English_5k"],
	    value="Hinglish_5k",
	)

	# Output components, listed in the order inference() returns its values.
	_token_count = gr.Label(label="Token count")
	_sentence_view = gr.HighlightedText(label="Sentence", show_inline_category=False)
	_encoding_view = gr.HighlightedText(label="Encoding", show_inline_category=False)

	demo = gr.Interface(
	    inference,
	    inputs=[_text_box, _tokenizer_picker],
	    outputs=[_token_count, _sentence_view, _encoding_view],
	    title=title,
	    description=description,
	    examples=examples,
	)
	demo.launch()