import gradio as gr
from language_bpe import BPETokenizer

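# Load the trained BPE tokenizer (merges and vocabulary) from disk.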
tokenizer = BPETokenizer()
tokenizer.load('models/english_5000.model')

def inference(input_text):
    # Encode the input text into BPE token ids.
    encoding = tokenizer.encode_ordinary(input_text)
    # Decode each token id individually so the token boundaries are visible
    # in the output, e.g. "He | walked | into | ...".
    tokens = [tokenizer.decode([token_id]) for token_id in encoding]
    tokenized_sentence = " | ".join(tokens)
    return len(encoding), tokenized_sentence, encoding

title = "Bilingual Tokenizer"
description = "A simple Gradio interface to see tokenization of Hindi and English(Hinglish) text"
examples = [["He walked into the basement with the horror movie from the night before playing in his head."], 
           ["Henry couldn't decide if he was an auto mechanic or a priest."], 
           ["Poison ivy grew through the fence they said was impenetrable."], 
           ]
demo = gr.Interface(
    inference,
    inputs=[
        gr.Textbox(label="Enter a sentence in Hindi, English, or a mix of both", type="text"),
    ],
    outputs=[
        gr.Label(label="Token count"),
        gr.Textbox(label="Sentence split into tokens", type="text"),
        gr.Textbox(label="Encoding (token ids)", type="text"),
    ],
    title=title,
    description=description,
    examples=examples,
)
demo.launch()
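# Note: demo.launch(share=True) would additionally create a temporary public share link.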