"""Gradio app that compares token counts of the same text across popular LLM tokenizers."""
from functools import lru_cache

import gradio as gr
from transformers import AutoTokenizer
# Display name -> Hugging Face Hub repo id whose tokenizer is used for counting.
# Several entries reuse one repo for a model family that shares a tokenizer
# (e.g. "LLaMa-1/LLaMa-2", "Phi-1/Phi-2").
MODELS = {
    "LLaMa-1/LLaMa-2": "TheBloke/Llama-2-7B-fp16",
    "LLaMa-3": "unsloth/llama-3-8b",
    "Mistral": "mistral-community/Mistral-7B-v0.2",
    "GPT-2/GPT-J": "openai-community/gpt2",
    "GPT-NeoX": "EleutherAI/gpt-neox-20b",
    "Falcon": "tiiuae/falcon-7b",
    "Phi-1/Phi-2": "microsoft/phi-2",
    "Phi-3": "microsoft/Phi-3-mini-4k-instruct",
    "T5": "google/flan-t5-xxl",
    "Gemma": "alpindale/gemma-2b",
    "Command-R": "CohereForAI/c4ai-command-r-plus",
    "Qwen/Qwen1.5": "Qwen/Qwen1.5-7B",
    "CodeQwen": "Qwen/CodeQwen1.5-7B",
    "RWKV-v4": "RWKV/rwkv-4-14b-pile",
    "RWKV-v5/RWKV-v6": "RWKV/v5-EagleX-v2-7B-HF",
    "DeepSeek-LLM": "deepseek-ai/deepseek-llm-7b-base",
    "DeepSeek-V2": "deepseek-ai/DeepSeek-V2",
}
@lru_cache(maxsize=None)
def _get_tokenizer(model_id):
    """Load and memoize the tokenizer for *model_id*.

    Without caching, every UI request re-downloaded/re-built all tokenizers,
    which is slow and hits the network each time. The key space is bounded by
    MODELS, so an unbounded cache cannot grow past len(MODELS) entries.
    NOTE(review): trust_remote_code=True executes repo-provided code
    (required for e.g. DeepSeek-V2); the repo list is fixed, but be aware.
    """
    return AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)


def tokenize(input_text):
    """Count the tokens each tokenizer in MODELS produces for *input_text*.

    Returns a newline-separated report of "model: token_count" lines,
    sorted by token count in descending order.
    """
    results = {}
    for model_name, model_id in MODELS.items():
        tokenizer = _get_tokenizer(model_id)
        # add_special_tokens=True includes BOS/EOS etc., matching real prompt cost.
        results[model_name] = len(
            tokenizer(input_text, add_special_tokens=True)["input_ids"]
        )
    # Most token-hungry tokenizer first.
    sorted_results = sorted(results.items(), key=lambda item: item[1], reverse=True)
    return "\n".join(f"{model}: {tokens}" for model, tokens in sorted_results)
if __name__ == "__main__":
    # lines=len(MODELS) sizes the input box to roughly match the height of
    # the per-model output listing.
    iface = gr.Interface(
        fn=tokenize,
        inputs=gr.Textbox(label="Input Text", lines=len(MODELS)),
        outputs="text",
    )
    iface.launch()