import random

import gradio as gr
from language_bpe import BPETokenizer

# Load the pretrained bilingual BPE tokenizer model.
tokenizer = BPETokenizer()
tokenizer.load('models/english_5000.model')


def inference(input_text):
    # Encode the input text into BPE token ids (without special tokens).
    encoding = tokenizer.encode_ordinary(input_text)
    # Decode each token id on its own so the token boundaries stay visible.
    sentence = [tokenizer.decode([x]) for x in encoding]

    # Wrap each token in a randomly chosen ANSI background colour (codes 40-47)
    # so adjacent tokens can be told apart.
    color_sentence = ""
    for word in sentence:
        background_color = random.randint(40, 47)
        color_sentence += f"\033[0;37;{background_color}m {word} "
    color_sentence = color_sentence.strip()

    return len(encoding), color_sentence, encoding


title = "Bilingual tokenizer"
description = "A simple Gradio interface to see the tokenization of Hindi and English (Hinglish) text"
examples = [
    ["He walked into the basement with the horror movie from the night before playing in his head."],
    ["Henry couldn't decide if he was an auto mechanic or a priest."],
    ["Poison ivy grew through the fence they said was impenetrable."],
]

demo = gr.Interface(
    inference,
    inputs=[
        gr.Textbox(label="Enter a sentence in Hindi, English, or both", type="text"),
    ],
    outputs=[
        gr.Label(label="Token count"),
        gr.Textbox(label="Sentence after tokenization", type="text"),
        gr.Textbox(label="Encoding", type="text"),
    ],
    title=title,
    description=description,
    examples=examples,
)

demo.launch()
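
# A minimal usage sketch (assuming 'models/english_5000.model' exists and the
# language_bpe package is installed): inference() can also be called directly,
# without the Gradio UI, and returns the token count, the ANSI-coloured token
# string, and the raw list of token ids.
#
#   token_count, colored_tokens, ids = inference("Hello world")
#   print(token_count)      # number of BPE tokens
#   print(colored_tokens)   # each token on a random ANSI background colour
#   print(ids)              # list of integer token ids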