xzuyn committed on
Commit
24e4af8
·
verified ·
1 Parent(s): 23a5098

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -51
app.py CHANGED
@@ -2,63 +2,38 @@ from transformers import AutoTokenizer
2
  import gradio as gr
3
 
4
 
5
def tokenize(input_text):
    """Count the tokens produced for *input_text* by each model tokenizer.

    Returns a newline-separated string of "model: token_count" lines,
    sorted by token count in descending order.
    """
    # NOTE(review): the *_tokenizer globals are created in the __main__
    # guard below, so this function only works when the file is run as a
    # script — confirm no other module imports and calls it directly.
    llama_tokens = len(llama_tokenizer(input_text, add_special_tokens=True)["input_ids"])
    llama3_tokens = len(llama3_tokenizer(input_text, add_special_tokens=True)["input_ids"])
    mistral_tokens = len(mistral_tokenizer(input_text, add_special_tokens=True)["input_ids"])
    gpt2_tokens = len(gpt2_tokenizer(input_text, add_special_tokens=True)["input_ids"])
    gpt_neox_tokens = len(gpt_neox_tokenizer(input_text, add_special_tokens=True)["input_ids"])
    falcon_tokens = len(falcon_tokenizer(input_text, add_special_tokens=True)["input_ids"])
    phi2_tokens = len(phi2_tokenizer(input_text, add_special_tokens=True)["input_ids"])
    phi3_tokens = len(phi3_tokenizer(input_text, add_special_tokens=True)["input_ids"])
    t5_tokens = len(t5_tokenizer(input_text, add_special_tokens=True)["input_ids"])
    gemma_tokens = len(gemma_tokenizer(input_text, add_special_tokens=True)["input_ids"])
    command_r_tokens = len(command_r_tokenizer(input_text, add_special_tokens=True)["input_ids"])
    qwen_tokens = len(qwen_tokenizer(input_text, add_special_tokens=True)["input_ids"])
    codeqwen_tokens = len(codeqwen_tokenizer(input_text, add_special_tokens=True)["input_ids"])
    rwkv4_tokens = len(rwkv4_tokenizer(input_text, add_special_tokens=True)["input_ids"])
    rwkv5_tokens = len(rwkv5_tokenizer(input_text, add_special_tokens=True)["input_ids"])

    # Display-name -> token-count mapping used for the sorted report below.
    results = {
        "LLaMa-1/LLaMa-2": llama_tokens,
        "LLaMa-3": llama3_tokens,
        "Mistral": mistral_tokens,
        "GPT-2/GPT-J": gpt2_tokens,
        "GPT-NeoX": gpt_neox_tokens,
        "Falcon": falcon_tokens,
        "Phi-1/Phi-2": phi2_tokens,
        "Phi-3": phi3_tokens,
        "T5": t5_tokens,
        "Gemma": gemma_tokens,
        "Command-R": command_r_tokens,
        "Qwen/Qwen1.5": qwen_tokens,
        "CodeQwen": codeqwen_tokens,
        "RWKV-v4": rwkv4_tokens,
        "RWKV-v5/RWKV-v6": rwkv5_tokens
    }

    # Sort the results in descending order based on token length
    sorted_results = sorted(results.items(), key=lambda x: x[1], reverse=True)

    return "\n".join([f"{model}: {tokens}" for model, tokens in sorted_results])
44
 
45
 
46
if __name__ == "__main__":
    # Load every tokenizer once at startup so each tokenize() call only
    # runs tokenization, not a Hub download.
    llama_tokenizer = AutoTokenizer.from_pretrained("TheBloke/Llama-2-7B-fp16")
    llama3_tokenizer = AutoTokenizer.from_pretrained("unsloth/llama-3-8b")
    mistral_tokenizer = AutoTokenizer.from_pretrained("mistral-community/Mistral-7B-v0.2")
    gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2")
    gpt_neox_tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
    falcon_tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b")
    phi2_tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")
    phi3_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
    t5_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xxl")
    gemma_tokenizer = AutoTokenizer.from_pretrained("alpindale/gemma-2b")
    command_r_tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-plus")
    qwen_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-7B")
    codeqwen_tokenizer = AutoTokenizer.from_pretrained("Qwen/CodeQwen1.5-7B")
    # The RWKV repos ship custom tokenizer code, hence trust_remote_code.
    rwkv4_tokenizer = AutoTokenizer.from_pretrained("RWKV/rwkv-4-14b-pile", trust_remote_code=True)
    rwkv5_tokenizer = AutoTokenizer.from_pretrained("RWKV/v5-EagleX-v2-7B-HF", trust_remote_code=True)

    iface = gr.Interface(fn=tokenize, inputs=gr.Textbox(label="Input Text", lines=15), outputs="text")
    iface.launch()
 
2
  import gradio as gr
3
 
4
 
5
# Define tokenizer models
# Maps a human-readable display name -> the Hugging Face Hub repo id
# whose tokenizer is used to count tokens for that model family.
MODELS = {
    "LLaMa-1/LLaMa-2": "TheBloke/Llama-2-7B-fp16",
    "LLaMa-3": "unsloth/llama-3-8b",
    "Mistral": "mistral-community/Mistral-7B-v0.2",
    "GPT-2/GPT-J": "gpt2",
    "GPT-NeoX": "EleutherAI/gpt-neox-20b",
    "Falcon": "tiiuae/falcon-7b",
    "Phi-1/Phi-2": "microsoft/phi-2",
    "Phi-3": "microsoft/Phi-3-mini-4k-instruct",
    "T5": "google/flan-t5-xxl",
    "Gemma": "alpindale/gemma-2b",
    "Command-R": "CohereForAI/c4ai-command-r-plus",
    "Qwen/Qwen1.5": "Qwen/Qwen1.5-7B",
    "CodeQwen": "Qwen/CodeQwen1.5-7B",
    "RWKV-v4": "RWKV/rwkv-4-14b-pile",
    "RWKV-v5/RWKV-v6": "RWKV/v5-EagleX-v2-7B-HF"
}
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
def tokenize(input_text):
    """Count the tokens produced for *input_text* by each tokenizer in MODELS.

    Returns a newline-separated string of "model: token_count" lines,
    sorted by token count in descending order.
    """
    # AutoTokenizer.from_pretrained hits the Hugging Face Hub / local cache
    # and rebuilds the tokenizer object; doing that for all models on every
    # request makes each call very slow. Memoize each tokenizer on first use.
    if not hasattr(tokenize, "_tokenizer_cache"):
        tokenize._tokenizer_cache = {}

    results = {}
    for model_name, repo_id in MODELS.items():
        tokenizer = tokenize._tokenizer_cache.get(repo_id)
        if tokenizer is None:
            # trust_remote_code is required for the RWKV repos' custom
            # tokenizer code; it is a no-op for the other repos.
            tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
            tokenize._tokenizer_cache[repo_id] = tokenizer
        results[model_name] = len(tokenizer(input_text, add_special_tokens=True)["input_ids"])

    # Sort the results in descending order based on token length
    sorted_results = sorted(results.items(), key=lambda x: x[1], reverse=True)

    return "\n".join([f"{model}: {tokens}" for model, tokens in sorted_results])
35
 
36
 
37
if __name__ == "__main__":
    # The textbox height is a UI choice for the *input* text, so keep it an
    # explicit constant rather than len(MODELS): the two values matching was
    # coincidental, and the height should not drift as models are added.
    iface = gr.Interface(
        fn=tokenize,
        inputs=gr.Textbox(label="Input Text", lines=15),
        outputs="text",
    )
    iface.launch()