import gradio as gr
from language_bpe import BPETokenizer

# Load the trained BPE tokenizer model from disk.
tokenizer = BPETokenizer()
tokenizer.load('models/english_5000.model')
def inference(input_text):
    # Encode the text into BPE token ids, then decode each id back into
    # its string piece so every token can be shown next to its id.
    encoding = tokenizer.encode_ordinary(input_text)
    sentence = [tokenizer.decode([x]) for x in encoding]
    color_sentence = []
    color_encoding = []
    for word, encode in zip(sentence, encoding):
        # HighlightedText takes (text, category) pairs; using the token id
        # as the category gives each distinct token its own highlight color.
        color_sentence.append((word, str(encode)))
        color_encoding.append((str(encode), str(encode)))
    return len(encoding), color_sentence, color_encoding
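
# Illustrative (commented-out) sanity check of the return shape; the token
# ids shown here are made up for the example, not actual output of the
# english_5000 model:
#   count, words, ids = inference("hello world")
#   count -> 3
#   words -> [("he", "42"), ("llo", "317"), (" world", "905")]
#   ids   -> [("42", "42"), ("317", "317"), ("905", "905")]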
title = "Bilingual Tokenizer" | |
description = "A simple Gradio interface to see tokenization of Hindi and English(Hinglish) text" | |
examples = [["He walked into the basement with the horror movie from the night before playing in his head."], | |
["Henry couldn't decide if he was an auto mechanic or a priest."], | |
["Poison ivy grew through the fence they said was impenetrable."], | |
] | |
demo = gr.Interface(
    inference,
    inputs=[
        gr.Textbox(label="Enter a sentence in Hindi, English, or both languages", type="text"),
    ],
    outputs=[
        gr.Label(label="Token count"),
        gr.HighlightedText(label="Sentence", show_inline_category=False),
        gr.HighlightedText(label="Encoding", show_inline_category=False),
    ],
    title=title,
    description=description,
    examples=examples,
)

demo.launch()
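
# When running locally rather than on Hugging Face Spaces, a public link can
# be requested with Gradio's standard share flag:
#   demo.launch(share=True)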