"""Gradio demo that visualizes BPE tokenization of Hindi / English (Hinglish) text.

Loads three pre-trained 5000-merge BPE tokenizer models and exposes a small
web UI that shows, for an input sentence, the token count plus the sentence
and its token ids colour-coded per token.
"""

import torch  # NOTE(review): unused in this file — presumably required by language_bpe; confirm before removing
import random  # NOTE(review): unused in this file — kept to avoid breaking hidden dependencies
import gradio as gr
from language_bpe import BPETokenizer


def _load_tokenizer(model_path):
    """Load and return a BPETokenizer from a saved ``.model`` file."""
    tokenizer = BPETokenizer()
    tokenizer.load(model_path)
    return tokenizer


# Dropdown label -> tokenizer. Models are loaded once at startup; the
# (label, file stem) pairs replace three copy-pasted load stanzas.
tokenizer_dict = {
    label: _load_tokenizer(f"models/{stem}_5000.model")
    for label, stem in [
        ("Hinglish_5k", "hinglish"),
        ("Hindi_5k", "hindi"),
        ("English_5k", "english"),
    ]
}


def inference(input_text, tokenizer):
    """Tokenize ``input_text`` with the tokenizer selected in the dropdown.

    Args:
        input_text: Sentence to tokenize (Hindi, English, or mixed).
        tokenizer: Key into ``tokenizer_dict`` ("Hinglish_5k", "Hindi_5k",
            or "English_5k").

    Returns:
        A 3-tuple of (token count,
        [(decoded token text, token id as str), ...] for HighlightedText,
        [(token id as str, token id as str), ...] for HighlightedText).
    """
    # Hoist the dict lookup: one lookup total instead of one per token.
    tok = tokenizer_dict[tokenizer]
    encoding = tok.encode_ordinary(input_text)

    color_sentence = []
    color_encoding = []
    for token_id in encoding:
        label = str(token_id)
        # Decode each id individually so every token gets its own highlight.
        color_sentence.append((tok.decode([token_id]), label))
        color_encoding.append((label, label))

    return len(encoding), color_sentence, color_encoding


title = "Bilingual Tokenizer"
description = "A simple Gradio interface to see tokenization of Hindi and English(Hinglish) text"
examples = [
    ["He walked into the basement with the horror movie from the night before playing in his head.", "Hinglish_5k"],
    ["Henry couldn't decide if he was an auto mechanic or a priest.", "Hinglish_5k"],
    ["Poison ivy grew through the fence they said was impenetrable.", "Hinglish_5k"],
]

demo = gr.Interface(
    inference,
    inputs=[
        gr.Textbox(label="Enter any sentence in Hindi, English or both language", type="text"),
        gr.Dropdown(label="Tokenizer", choices=["Hinglish_5k", "Hindi_5k", "English_5k"], value="Hinglish_5k"),
    ],
    outputs=[
        gr.Label(label="Token count"),
        gr.HighlightedText(label="Sentence", show_inline_category=False),
        gr.HighlightedText(label="Encoding", show_inline_category=False),
    ],
    title=title,
    description=description,
    examples=examples,
)

# Guard the launch so importing this module (e.g. for tests) does not
# start the web server as a side effect.
if __name__ == "__main__":
    demo.launch()