"""Gradio demo that visualizes BPE tokenization of Hindi / English (Hinglish) text.

Loads three pre-trained 5000-merge BPE tokenizer models and exposes a small
web UI that shows, for an input sentence, the token count plus the sentence
and its token ids colour-coded per token.
"""

import torch  # NOTE(review): unused in this file — presumably required by language_bpe; confirm before removing
import random  # NOTE(review): unused in this file — kept to avoid breaking hidden dependencies
import gradio as gr
from language_bpe import BPETokenizer


def _load_tokenizer(model_path):
    """Load and return a BPETokenizer from a saved ``.model`` file."""
    tokenizer = BPETokenizer()
    tokenizer.load(model_path)
    return tokenizer


# Dropdown label -> tokenizer. Models are loaded once at startup; the
# (label, file stem) pairs replace three copy-pasted load stanzas.
tokenizer_dict = {
    label: _load_tokenizer(f"models/{stem}_5000.model")
    for label, stem in [
        ("Hinglish_5k", "hinglish"),
        ("Hindi_5k", "hindi"),
        ("English_5k", "english"),
    ]
}


def inference(input_text, tokenizer):
    """Tokenize ``input_text`` with the tokenizer selected in the dropdown.

    Args:
        input_text: Sentence to tokenize (Hindi, English, or mixed).
        tokenizer: Key into ``tokenizer_dict`` ("Hinglish_5k", "Hindi_5k",
            or "English_5k").

    Returns:
        A 3-tuple of (token count,
        [(decoded token text, token id as str), ...] for HighlightedText,
        [(token id as str, token id as str), ...] for HighlightedText).
    """
    # Hoist the dict lookup: one lookup total instead of one per token.
    tok = tokenizer_dict[tokenizer]
    encoding = tok.encode_ordinary(input_text)

    color_sentence = []
    color_encoding = []
    for token_id in encoding:
        label = str(token_id)
        # Decode each id individually so every token gets its own highlight.
        color_sentence.append((tok.decode([token_id]), label))
        color_encoding.append((label, label))

    return len(encoding), color_sentence, color_encoding


title = "Bilingual Tokenizer"
description = "A simple Gradio interface to see tokenization of Hindi and English(Hinglish) text"
examples = [
    ["He walked into the basement with the horror movie from the night before playing in his head.", "Hinglish_5k"],
    ["Henry couldn't decide if he was an auto mechanic or a priest.", "Hinglish_5k"],
    ["Poison ivy grew through the fence they said was impenetrable.", "Hinglish_5k"],
]

demo = gr.Interface(
    inference,
    inputs=[
        gr.Textbox(label="Enter any sentence in Hindi, English or both language", type="text"),
        gr.Dropdown(label="Tokenizer", choices=["Hinglish_5k", "Hindi_5k", "English_5k"], value="Hinglish_5k"),
    ],
    outputs=[
        gr.Label(label="Token count"),
        gr.HighlightedText(label="Sentence", show_inline_category=False),
        gr.HighlightedText(label="Encoding", show_inline_category=False),
    ],
    title=title,
    description=description,
    examples=examples,
)

# Guard the launch so importing this module (e.g. for tests) does not
# start the web server as a side effect.
if __name__ == "__main__":
    demo.launch()