Spaces:

AkashDataScience
/

languageBPE

Sleeping

File size: 5,523 Bytes

41dfb3a
e40da07
41dfb3a
 
 
5581e99
 
41dfb3a
5581e99
 
 
 
 
 
 
 
 
 
 
035182f
7d4468d
 
 
49c6fa3
7d4468d
41dfb3a
af1568e
41dfb3a
fcbd881
 
 
fcb599a
 
fcbd881
 
 
fcb599a
 
fcbd881
 
 
fcb599a
fcbd881
41dfb3a
 
 
 
 
5581e99
41dfb3a
 
812bb26
f049fd3
2f70820
41dfb3a

import torch
import random
import gradio as gr
from language_bpe import BPETokenizer

def _load_tokenizer(model_path):
    """Build a BPETokenizer and load the model file at ``model_path`` into it."""
    loaded = BPETokenizer()
    loaded.load(model_path)
    return loaded


# One tokenizer instance per model file, created and loaded in turn.
hinglish_tokenizer = _load_tokenizer('models/hinglish_5000.model')
hindi_tokenizer = _load_tokenizer('models/hindi_5000.model')
english_tokenizer = _load_tokenizer('models/english_5000.model')

# Registry from tokenizer name to the loaded tokenizer instance.
tokenizer_dict = {
    "Hinglish_5k": hinglish_tokenizer,
    "Hindi_5k": hindi_tokenizer,
    "English_5k": english_tokenizer,
}

def inference(input_text, tokenizer):
    """Tokenize ``input_text`` with the tokenizer registered under ``tokenizer``.

    Args:
        input_text: Text to encode.
        tokenizer: Key into the module-level ``tokenizer_dict``.

    Returns:
        A 3-tuple ``(count, pieces, ids)``: the number of token ids produced,
        a list of ``(decoded_piece, id_string)`` pairs with one entry per
        token id, and a list of ``(id_string, id_string)`` pairs in the same
        order.

    Raises:
        KeyError: If ``tokenizer`` is not a key of ``tokenizer_dict``.
    """
    bpe = tokenizer_dict[tokenizer]
    token_ids = bpe.encode_ordinary(input_text)

    # Each id is decoded on its own so every piece lines up with exactly one id.
    pieces = [bpe.decode([token_id]) for token_id in token_ids]
    id_labels = [str(token_id) for token_id in token_ids]

    color_sentence = list(zip(pieces, id_labels))
    color_encoding = [(label, label) for label in id_labels]
    return len(token_ids), color_sentence, color_encoding

title = "Bilingual Tokenizer"
description = "A simple Gradio interface to see tokenization of Hindi and English(Hinglish) text"

# Sample sentences; each one is paired with every tokenizer name below.
_EXAMPLE_SENTENCES = (
    "I can't believe this is the eighth time I've smashed open my piggy bank on the same day! मैं विश्वास नहीं कर सकता कि यह आठवीं बार है जब मैंने उसी दिन अपना गुल्लक तोड़ा है!",
    "Please wait outside of the house. कृपया घर के बाहर प्रतीक्षा करें.",
    "He said he was not there yesterday; however, many people saw him there. उन्होंने कहा कि वह कल वहां नहीं थे; हालाँकि, कई लोगों ने उसे वहाँ देखा।",
    "The door swung open to reveal pink giraffes and red elephants. गुलाबी जिराफ और लाल हाथियों को दिखाने के लिए दरवाजा खुला।",
    "There's enough glass in my cupboard to build an undersea aquarium. समुद्र के अंदर एक मछलीघर बनाने के लिए मेरी अलमारी में पर्याप्त ग्लास हैं।",
)
_EXAMPLE_TOKENIZERS = ("Hinglish_5k", "Hindi_5k", "English_5k")

# Rows are grouped by tokenizer name: every sentence for the first name,
# then every sentence for the second, then the third.
examples = [
    [sentence, tokenizer_name]
    for tokenizer_name in _EXAMPLE_TOKENIZERS
    for sentence in _EXAMPLE_SENTENCES
]
# Input widgets: a free-text box and a tokenizer selector.
_demo_inputs = [
    gr.Textbox(label="Enter any sentence in Hindi, English or both language", type="text"),
    gr.Dropdown(
        label="Tokenizer",
        choices=["Hinglish_5k", "Hindi_5k", "English_5k"],
        value="Hinglish_5k",
    ),
]

# Output widgets: inference returns three values and there is one widget for each.
_demo_outputs = [
    gr.Label(label="Token count"),
    gr.HighlightedText(label="Sentence", show_inline_category=False),
    gr.HighlightedText(label="Encoding", show_inline_category=False),
]

demo = gr.Interface(
    fn=inference,
    inputs=_demo_inputs,
    outputs=_demo_outputs,
    title=title,
    description=description,
    examples=examples,
)
demo.launch()