Spaces:
Running
Running
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,562 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from transformers import AutoTokenizer
|
3 |
+
import pandas as pd
|
4 |
+
import re
|
5 |
+
from datetime import datetime
|
6 |
+
from huggingface_hub import HfApi, DatasetCard, DatasetCardData, create_repo
|
7 |
+
from gradio_huggingfacehub_search import HuggingfaceHubSearch
|
8 |
+
import os
|
9 |
+
import tempfile
|
10 |
+
|
11 |
+
# --- Configuration ---
# Hub access token read from the environment; None when HF_TOKEN is unset.
HF_TOKEN = os.getenv("HF_TOKEN")
# Dataset repository that stores the leaderboard; overridable via DATASET_REPO.
DATASET_REPO_ID = os.getenv("DATASET_REPO", "Lyte/tokenizer-leaderboard")
# Name of the leaderboard CSV file inside that dataset repository.
DATASET_FILE_NAME = "leaderboard.csv"
|
15 |
+
|
16 |
+
PREDEFINED_TEXT = '''
|
17 |
+
import gradio as gr
|
18 |
+
from transformers import AutoTokenizer
|
19 |
+
import pandas as pd
|
20 |
+
import re
|
21 |
+
from datetime import datetime
|
22 |
+
from huggingface_hub import HfApi, DatasetCard, DatasetCardData, create_repo
|
23 |
+
from gradio_huggingfacehub_search import HuggingfaceHubSearch
|
24 |
+
import os
|
25 |
+
import tempfile
|
26 |
+
|
27 |
+
# --- Configuration ---
|
28 |
+
HF_TOKEN = os.getenv("HF_TOKEN")
|
29 |
+
DATASET_REPO_ID = os.getenv("DATASET_REPO", "Lyte/tokenizer-leaderboard")
|
30 |
+
DATASET_FILE_NAME = "leaderboard.csv"
|
31 |
+
|
32 |
+
PREDEFINED_TEXT = """
|
33 |
+
The quick brown fox jumps over 12 lazy dogs! 🐕🦺
|
34 |
+
Special characters: #@%^&*()_+-=[]{}|;:'",.<>/?\\~
|
35 |
+
Code samples:
|
36 |
+
- Python: def hello(): print("Hello World! 2023")
|
37 |
+
- HTML: <div class="container" id="main">Content</div>
|
38 |
+
- JSON: {"key": "value", "numbers": [1, 2, 3.14]}
|
39 |
+
Math equations: E = mc² → 3×10⁸ m/s
|
40 |
+
Multilingual text: 速い茶色の狐が怠惰な犬を飛び越える 😸
|
41 |
+
Emojis: 👍🎉🚀❤️🔥
|
42 |
+
Mixed casing: OpenAI's GPT-4 vs gpt-3.5-turbo
|
43 |
+
"""
|
44 |
+
|
45 |
+
WORD_COUNT = len(re.findall(r'\S+', PREDEFINED_TEXT))
|
46 |
+
LEADERBOARD_COLUMNS = [
|
47 |
+
"Model ID", "Token Count", "Vocab Size",
|
48 |
+
"Tokens/Word", "Chars/Token", "Timestamp"
|
49 |
+
]
|
50 |
+
|
51 |
+
# --- Hugging Face Hub Functions ---
|
52 |
+
def create_huggingface_dataset():
|
53 |
+
"""Creates the dataset repository on the Hub if it doesn't exist."""
|
54 |
+
try:
|
55 |
+
api = HfApi(token=HF_TOKEN)
|
56 |
+
create_repo(repo_id=DATASET_REPO_ID, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
|
57 |
+
|
58 |
+
card_data = DatasetCardData(
|
59 |
+
language="en",
|
60 |
+
license="mit",
|
61 |
+
size_categories=["1K<n<10K"],
|
62 |
+
tags=["tokenizer", "leaderboard", "performance", "gradio"],
|
63 |
+
)
|
64 |
+
card = DatasetCard.from_template(
|
65 |
+
card_data,
|
66 |
+
template_path=None,
|
67 |
+
Title="Tokenizer Leaderboard",
|
68 |
+
Description="A leaderboard of tokenizer performance based on various metrics.",
|
69 |
+
How_to_use="The leaderboard data is stored in a CSV file named 'leaderboard.csv'.",
|
70 |
+
)
|
71 |
+
card.push_to_hub(repo_id=DATASET_REPO_ID, token=HF_TOKEN)
|
72 |
+
print(f"Dataset repository '{DATASET_REPO_ID}' created (or already exists).")
|
73 |
+
|
74 |
+
except Exception as e:
|
75 |
+
print(f"Error creating dataset repository: {e}")
|
76 |
+
raise
|
77 |
+
|
78 |
+
def load_leaderboard_from_hub():
|
79 |
+
"""Loads the leaderboard data from the Hugging Face Hub as a pandas DataFrame."""
|
80 |
+
try:
|
81 |
+
api = HfApi(token=HF_TOKEN)
|
82 |
+
dataset_path = api.dataset_info(repo_id=DATASET_REPO_ID, token=HF_TOKEN).siblings
|
83 |
+
csv_file_info = next((file for file in dataset_path if file.rfilename == DATASET_FILE_NAME), None)
|
84 |
+
|
85 |
+
if csv_file_info is None:
|
86 |
+
print(f"'{DATASET_FILE_NAME}' not found in '{DATASET_REPO_ID}'. Returning an empty DataFrame")
|
87 |
+
return pd.DataFrame(columns=LEADERBOARD_COLUMNS)
|
88 |
+
|
89 |
+
file_path = api.hf_hub_download(repo_id=DATASET_REPO_ID, filename=DATASET_FILE_NAME, repo_type="dataset")
|
90 |
+
df = pd.read_csv(file_path)
|
91 |
+
df = df.sort_values(by="Token Count", ascending=True)
|
92 |
+
df["Tokens/Word"] = df["Tokens/Word"].round(2)
|
93 |
+
df["Chars/Token"] = df["Chars/Token"].round(2)
|
94 |
+
return df
|
95 |
+
|
96 |
+
except Exception as e:
|
97 |
+
print(f"Error loading leaderboard from Hugging Face Hub: {e}")
|
98 |
+
return pd.DataFrame(columns=LEADERBOARD_COLUMNS)
|
99 |
+
|
100 |
+
def push_leaderboard_to_hub(df):
|
101 |
+
"""Pushes the updated leaderboard DataFrame to the Hugging Face Hub."""
|
102 |
+
try:
|
103 |
+
with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix=".csv") as tmpfile:
|
104 |
+
df.to_csv(tmpfile.name, index=False)
|
105 |
+
tmp_path = tmpfile.name
|
106 |
+
|
107 |
+
api = HfApi(token=HF_TOKEN)
|
108 |
+
api.upload_file(
|
109 |
+
path_or_fileobj=tmp_path,
|
110 |
+
path_in_repo=DATASET_FILE_NAME,
|
111 |
+
repo_id=DATASET_REPO_ID,
|
112 |
+
repo_type="dataset",
|
113 |
+
token=HF_TOKEN,
|
114 |
+
commit_message="Update leaderboard"
|
115 |
+
)
|
116 |
+
os.remove(tmp_path)
|
117 |
+
|
118 |
+
print(f"Leaderboard updated and pushed to {DATASET_REPO_ID}")
|
119 |
+
|
120 |
+
except Exception as e:
|
121 |
+
print(f"Error pushing leaderboard to Hugging Face Hub: {e}")
|
122 |
+
raise
|
123 |
+
|
124 |
+
|
125 |
+
# --- Utility Functions ---
|
126 |
+
|
127 |
+
def get_tokenizer_stats(model_id, text):
|
128 |
+
if not model_id:
|
129 |
+
raise ValueError("No model ID provided")
|
130 |
+
try:
|
131 |
+
tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN, trust_remote_code=True)
|
132 |
+
tokens = tokenizer.encode(text, add_special_tokens=False)
|
133 |
+
text_length = len(text)
|
134 |
+
return {
|
135 |
+
"token_count": len(tokens),
|
136 |
+
"vocab_size": tokenizer.vocab_size,
|
137 |
+
"token_word_ratio": round(len(tokens) / WORD_COUNT, 2),
|
138 |
+
"chars_per_token": round(text_length / len(tokens), 2) if tokens else 0
|
139 |
+
}
|
140 |
+
except Exception as e:
|
141 |
+
raise RuntimeError(f"Failed to load tokenizer or encode text: {str(e)}") from e
|
142 |
+
|
143 |
+
def is_model_in_leaderboard(df, model_id):
|
144 |
+
return model_id in df["Model ID"].values
|
145 |
+
|
146 |
+
def add_to_leaderboard(model_id):
|
147 |
+
if not model_id:
|
148 |
+
return "❌ Error: No model ID provided"
|
149 |
+
df = load_leaderboard_from_hub()
|
150 |
+
if is_model_in_leaderboard(df, model_id):
|
151 |
+
return "⚠️ Model already in leaderboard"
|
152 |
+
try:
|
153 |
+
stats = get_tokenizer_stats(model_id, PREDEFINED_TEXT)
|
154 |
+
new_row = pd.DataFrame([{
|
155 |
+
"Model ID": model_id,
|
156 |
+
"Token Count": stats["token_count"],
|
157 |
+
"Vocab Size": stats["vocab_size"],
|
158 |
+
"Tokens/Word": stats["token_word_ratio"],
|
159 |
+
"Chars/Token": stats["chars_per_token"],
|
160 |
+
"Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
161 |
+
}])
|
162 |
+
updated_df = pd.concat([df, new_row], ignore_index=True)
|
163 |
+
push_leaderboard_to_hub(updated_df)
|
164 |
+
return "✅ Added to leaderboard!"
|
165 |
+
except Exception as e:
|
166 |
+
return f"❌ Error: {str(e)}"
|
167 |
+
|
168 |
+
def analyze_tokenizer(model_id, text):
|
169 |
+
if not model_id:
|
170 |
+
return "❌ Error: Please select or enter a model ID"
|
171 |
+
try:
|
172 |
+
stats = get_tokenizer_stats(model_id, text)
|
173 |
+
return (
|
174 |
+
f"Token Count: {stats['token_count']}\n"
|
175 |
+
f"Vocab Size: {stats['vocab_size']}\n"
|
176 |
+
f"Tokens/Word: {stats['token_word_ratio']:.2f}\n"
|
177 |
+
f"Chars/Token: {stats['chars_per_token']:.2f}"
|
178 |
+
)
|
179 |
+
except Exception as e:
|
180 |
+
return f"❌ Analysis Failed: {str(e)}"
|
181 |
+
|
182 |
+
def compare_tokenizers(model_ids_str, use_standard_text):
|
183 |
+
try:
|
184 |
+
model_list = [mid.strip() for mid in model_ids_str.split(',') if mid.strip()]
|
185 |
+
if not model_list:
|
186 |
+
return pd.DataFrame({"Error": ["No models provided"]})
|
187 |
+
results = []
|
188 |
+
for model_id in model_list:
|
189 |
+
try:
|
190 |
+
stats = get_tokenizer_stats(model_id, PREDEFINED_TEXT)
|
191 |
+
results.append({
|
192 |
+
"Model ID": model_id,
|
193 |
+
"Tokens": stats["token_count"],
|
194 |
+
"Vocab Size": stats["vocab_size"],
|
195 |
+
"Tokens/Word": f"{stats['token_word_ratio']:.2f}",
|
196 |
+
"Chars/Token": f"{stats['chars_per_token']:.2f}",
|
197 |
+
"Status": "✅ Success"
|
198 |
+
})
|
199 |
+
except Exception as e:
|
200 |
+
results.append({
|
201 |
+
"Model ID": model_id,
|
202 |
+
"Tokens": "-",
|
203 |
+
"Vocab Size": "-",
|
204 |
+
"Tokens/Word": "-",
|
205 |
+
"Chars/Token": "-",
|
206 |
+
"Status": f"❌ {str(e)}"
|
207 |
+
})
|
208 |
+
return pd.DataFrame(results)
|
209 |
+
except Exception as e:
|
210 |
+
return pd.DataFrame({"Error": [str(e)]})
|
211 |
+
|
212 |
+
def get_leaderboard_for_download():
|
213 |
+
"""Loads, prepares, and returns a Gradio File object for download."""
|
214 |
+
try:
|
215 |
+
df = load_leaderboard_from_hub()
|
216 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmpfile:
|
217 |
+
df.to_csv(tmpfile.name, index=False)
|
218 |
+
# Return a Gradio File object, NOT just the path
|
219 |
+
return gr.File(value=tmpfile.name, label="Download CSV")
|
220 |
+
except Exception as e:
|
221 |
+
print(f"Error preparing file for download: {e}")
|
222 |
+
return None
|
223 |
+
|
224 |
+
|
225 |
+
def initial_benchmark_run():
|
226 |
+
try:
|
227 |
+
print("Starting initial benchmark run...")
|
228 |
+
default_models = [
|
229 |
+
"deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
|
230 |
+
"Qwen/Qwen2.5-7B-Instruct-1M",
|
231 |
+
"simplescaling/s1.1-32B",
|
232 |
+
"Xenova/gpt-4o",
|
233 |
+
"microsoft/phi-4",
|
234 |
+
"deepseek-ai/DeepSeek-R1",
|
235 |
+
"google/gemma-2-27b-it",
|
236 |
+
"HuggingFaceTB/SmolLM2-135M-Instruct",
|
237 |
+
"mistralai/Mistral-7B-Instruct-v0.3",
|
238 |
+
"tomg-group-umd/huginn-0125",
|
239 |
+
"microsoft/Phi-3.5-mini-instruct",
|
240 |
+
"openai-community/gpt2"
|
241 |
+
]
|
242 |
+
df = load_leaderboard_from_hub()
|
243 |
+
for model_id in default_models:
|
244 |
+
try:
|
245 |
+
if not is_model_in_leaderboard(df, model_id):
|
246 |
+
print(f"Benchmarking {model_id}...")
|
247 |
+
result = add_to_leaderboard(model_id)
|
248 |
+
print(f"Result for {model_id}: {result}")
|
249 |
+
else:
|
250 |
+
print(f"{model_id} already in leaderboard, skipping.")
|
251 |
+
except Exception as e:
|
252 |
+
print(f"Error benchmarking {model_id}: {str(e)}")
|
253 |
+
print("Initial benchmarking complete.")
|
254 |
+
except Exception as e:
|
255 |
+
print(f"Fatal error in initial benchmark: {str(e)}")
|
256 |
+
|
257 |
+
# --- Gradio Interface ---
|
258 |
+
with gr.Blocks(title="Tokenizers Leaderboard", theme=gr.themes.Soft()) as iface:
|
259 |
+
gr.Markdown("# 🏆 Tokenizers Leaderboard")
|
260 |
+
|
261 |
+
with gr.Tab("Analyze"):
|
262 |
+
gr.Markdown("## Single Tokenizer Analysis")
|
263 |
+
with gr.Row():
|
264 |
+
model_search = HuggingfaceHubSearch(label="Search Models", placeholder="Search Hugging Face models...", search_type="model")
|
265 |
+
custom_model = gr.Textbox(label="Direct Model ID", placeholder="e.g.: mistralai/Mistral-7B-Instruct-v0.3", max_lines=1)
|
266 |
+
model_id = gr.Textbox(visible=False)
|
267 |
+
gr.Markdown("### Input Text")
|
268 |
+
text_input = gr.Textbox(lines=5, value=PREDEFINED_TEXT, label="Analysis Text")
|
269 |
+
with gr.Row():
|
270 |
+
analyze_btn = gr.Button("Analyze", variant="primary")
|
271 |
+
add_btn = gr.Button("Add to Leaderboard")
|
272 |
+
analysis_output = gr.Textbox(label="Results", interactive=False)
|
273 |
+
model_search.change(lambda x: x, model_search, model_id)
|
274 |
+
custom_model.change(lambda x: x, custom_model, model_id)
|
275 |
+
analyze_btn.click(analyze_tokenizer, [model_id, text_input], analysis_output)
|
276 |
+
add_event = add_btn.click(add_to_leaderboard, model_id, analysis_output)
|
277 |
+
|
278 |
+
with gr.Tab("Compare"):
|
279 |
+
gr.Markdown("## Multi-Model Comparison")
|
280 |
+
gr.Markdown(f"**Standard Text:** `{PREDEFINED_TEXT[:80]}...`")
|
281 |
+
model_ids = gr.Textbox(label="Model IDs (comma-separated)", placeholder="Enter models: meta-llama/Llama-2-7b, google/gemma-7b, ...")
|
282 |
+
compare_btn = gr.Button("Compare Models", variant="primary")
|
283 |
+
comparison_table = gr.DataFrame(label="Results", interactive=False)
|
284 |
+
compare_btn.click(compare_tokenizers, [model_ids, gr.Checkbox(value=True, visible=False)], comparison_table)
|
285 |
+
|
286 |
+
with gr.Tab("Leaderboard"):
|
287 |
+
gr.Markdown("## Performance Leaderboard")
|
288 |
+
with gr.Row():
|
289 |
+
download_btn = gr.DownloadButton(label="Download CSV", value="tokenizer_leaderboard.csv")
|
290 |
+
leaderboard_table = gr.DataFrame(label="Top Tokenizers", headers=LEADERBOARD_COLUMNS, interactive=False,
|
291 |
+
datatype=["str", "number", "number", "number", "number", "str"])
|
292 |
+
|
293 |
+
# Connect the download button to the function that prepares the CSV
|
294 |
+
download_btn.click(get_leaderboard_for_download, inputs=[], outputs=download_btn)
|
295 |
+
|
296 |
+
iface.load(fn=load_leaderboard_from_hub, outputs=leaderboard_table)
|
297 |
+
add_event.then(load_leaderboard_from_hub, None, leaderboard_table)
|
298 |
+
|
299 |
+
|
300 |
+
create_huggingface_dataset()
|
301 |
+
initial_benchmark_run()
|
302 |
+
iface.launch()
|
303 |
+
'''
|
304 |
+
|
305 |
+
# Number of whitespace-delimited words in PREDEFINED_TEXT.
WORD_COUNT = len(re.findall(r'\S+', PREDEFINED_TEXT))
# Column names (and their order) used for the leaderboard DataFrame / CSV.
LEADERBOARD_COLUMNS = [
    "Model ID", "Token Count", "Vocab Size",
    "Tokens/Word", "Chars/Token", "Timestamp"
]
|
310 |
+
|
311 |
+
# --- Hugging Face Hub Functions ---
|
312 |
+
def create_huggingface_dataset():
    """Create the leaderboard dataset repository on the Hub and push its card.

    The repository named by ``DATASET_REPO_ID`` is created with
    ``exist_ok=True``, so calling this when the repository already exists is
    not an error. The dataset card is pushed on every call.

    Raises:
        Exception: Any error raised while creating the repository or pushing
            the card is printed and then re-raised unchanged.
    """
    try:
        create_repo(repo_id=DATASET_REPO_ID, token=HF_TOKEN, repo_type="dataset", exist_ok=True)

        card_data = DatasetCardData(
            language="en",
            license="mit",
            size_categories=["1K<n<10K"],
            tags=["tokenizer", "leaderboard", "performance", "gradio"],
        )
        card = DatasetCard.from_template(
            card_data,
            template_path=None,
            Title="Tokenizer Leaderboard",
            Description="A leaderboard of tokenizer performance based on various metrics.",
            How_to_use="The leaderboard data is stored in a CSV file named 'leaderboard.csv'.",
        )
        card.push_to_hub(repo_id=DATASET_REPO_ID, token=HF_TOKEN)
        print(f"Dataset repository '{DATASET_REPO_ID}' created (or already exists).")

    except Exception as e:
        print(f"Error creating dataset repository: {e}")
        raise
|
337 |
+
|
338 |
+
def load_leaderboard_from_hub():
    """Fetch the leaderboard CSV from the Hub and return it as a DataFrame.

    Returns:
        The stored leaderboard sorted by ascending "Token Count", with the
        "Tokens/Word" and "Chars/Token" columns rounded to two decimals.
        When the CSV is not listed in the dataset repository, or when any
        step fails, an empty DataFrame with ``LEADERBOARD_COLUMNS`` is
        returned instead (the failure is only printed).
    """
    try:
        hub = HfApi(token=HF_TOKEN)
        repo_files = hub.dataset_info(repo_id=DATASET_REPO_ID, token=HF_TOKEN).siblings
        has_csv = any(entry.rfilename == DATASET_FILE_NAME for entry in repo_files)

        if not has_csv:
            print(f"'{DATASET_FILE_NAME}' not found in '{DATASET_REPO_ID}'. Returning an empty DataFrame")
            return pd.DataFrame(columns=LEADERBOARD_COLUMNS)

        local_csv = hub.hf_hub_download(repo_id=DATASET_REPO_ID, filename=DATASET_FILE_NAME, repo_type="dataset")
        leaderboard = pd.read_csv(local_csv).sort_values(by="Token Count", ascending=True)
        for ratio_column in ("Tokens/Word", "Chars/Token"):
            leaderboard[ratio_column] = leaderboard[ratio_column].round(2)
        return leaderboard

    except Exception as e:
        print(f"Error loading leaderboard from Hugging Face Hub: {e}")
        return pd.DataFrame(columns=LEADERBOARD_COLUMNS)
|
359 |
+
|
360 |
+
def push_leaderboard_to_hub(df):
    """Upload the leaderboard DataFrame to the Hub as ``DATASET_FILE_NAME``.

    The frame is serialised to a temporary CSV file, uploaded to the dataset
    repository ``DATASET_REPO_ID``, and the temporary file is always removed
    afterwards, including when the upload fails.

    Args:
        df: DataFrame to store; written without its index.

    Raises:
        Exception: Any serialisation or upload error is printed and then
            re-raised unchanged.
    """
    try:
        # Only reserve a unique path here: the handle is closed before the
        # CSV is written, because reopening a still-open NamedTemporaryFile
        # by name is not supported on every platform.
        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix=".csv") as tmpfile:
            tmp_path = tmpfile.name

        try:
            df.to_csv(tmp_path, index=False)

            api = HfApi(token=HF_TOKEN)
            api.upload_file(
                path_or_fileobj=tmp_path,
                path_in_repo=DATASET_FILE_NAME,
                repo_id=DATASET_REPO_ID,
                repo_type="dataset",
                token=HF_TOKEN,
                commit_message="Update leaderboard"
            )
        finally:
            # delete=False means nothing else cleans this file up.
            os.remove(tmp_path)

        print(f"Leaderboard updated and pushed to {DATASET_REPO_ID}")

    except Exception as e:
        print(f"Error pushing leaderboard to Hugging Face Hub: {e}")
        raise
|
383 |
+
|
384 |
+
|
385 |
+
# --- Utility Functions ---
|
386 |
+
|
387 |
+
def get_tokenizer_stats(model_id, text):
    """Tokenize *text* with the tokenizer of *model_id* and return basic metrics.

    Args:
        model_id: Hub model identifier whose tokenizer is loaded.
        text: Text to encode (encoded without special tokens).

    Returns:
        A dict with the keys ``token_count``, ``vocab_size``,
        ``token_word_ratio`` (tokens per whitespace-delimited word of *text*)
        and ``chars_per_token``. Both ratios are rounded to two decimals and
        are 0 when their denominator would be zero.

    Raises:
        ValueError: If *model_id* is empty.
        RuntimeError: If loading the tokenizer or encoding the text fails;
            the original exception is chained as the cause.
    """
    if not model_id:
        raise ValueError("No model ID provided")
    try:
        # NOTE(security): trust_remote_code=True lets the referenced repository
        # run its own Python code in this process, and model_id can come
        # straight from user input. Left unchanged because some tokenizers may
        # require it -- review before exposing this publicly.
        tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN, trust_remote_code=True)
        tokens = tokenizer.encode(text, add_special_tokens=False)
        token_count = len(tokens)
        # Count the words of the text actually analysed, so the ratio is also
        # correct for custom input and not only for the predefined text.
        word_count = len(re.findall(r'\S+', text))
        return {
            "token_count": token_count,
            "vocab_size": tokenizer.vocab_size,
            "token_word_ratio": round(token_count / word_count, 2) if word_count else 0,
            "chars_per_token": round(len(text) / token_count, 2) if token_count else 0
        }
    except Exception as e:
        raise RuntimeError(f"Failed to load tokenizer or encode text: {str(e)}") from e
|
402 |
+
|
403 |
+
def is_model_in_leaderboard(df, model_id):
    """Return True when *model_id* occurs in the "Model ID" column of *df*."""
    known_ids = df["Model ID"].tolist()
    return model_id in known_ids
|
405 |
+
|
406 |
+
def add_to_leaderboard(model_id):
    """Benchmark *model_id* on the predefined text and store it on the leaderboard.

    Args:
        model_id: Hub model identifier; surrounding whitespace is ignored.

    Returns:
        A status string for the UI: success, "already in leaderboard", or an
        error message. Failures are reported through the string, not raised.
    """
    # Normalise first so padded or whitespace-only input behaves like the
    # comma-separated IDs in compare_tokenizers, which are stripped as well.
    model_id = (model_id or "").strip()
    if not model_id:
        return "❌ Error: No model ID provided"
    try:
        # NOTE(review): load_leaderboard_from_hub returns an empty frame on any
        # error, so a transient load failure followed by a successful push
        # would replace the stored CSV with a single row.
        df = load_leaderboard_from_hub()
        if is_model_in_leaderboard(df, model_id):
            return "⚠️ Model already in leaderboard"
        stats = get_tokenizer_stats(model_id, PREDEFINED_TEXT)
        new_row = pd.DataFrame([{
            "Model ID": model_id,
            "Token Count": stats["token_count"],
            "Vocab Size": stats["vocab_size"],
            "Tokens/Word": stats["token_word_ratio"],
            "Chars/Token": stats["chars_per_token"],
            "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }])
        # Concatenating with an empty frame is deprecated in recent pandas and
        # can degrade column dtypes; the new row alone is the whole table then.
        updated_df = new_row if df.empty else pd.concat([df, new_row], ignore_index=True)
        push_leaderboard_to_hub(updated_df)
        return "✅ Added to leaderboard!"
    except Exception as e:
        return f"❌ Error: {str(e)}"
|
427 |
+
|
428 |
+
def analyze_tokenizer(model_id, text):
    """Return a human-readable tokenizer report for *model_id* on *text*.

    Args:
        model_id: Hub model identifier; surrounding whitespace is ignored.
        text: Text to analyse.

    Returns:
        A four-line summary (token count, vocab size, tokens/word,
        chars/token), or an error message string when no model ID was given
        or the analysis failed.
    """
    # Strip so that a padded or whitespace-only ID is handled here and is
    # not passed on to the tokenizer loader.
    model_id = (model_id or "").strip()
    if not model_id:
        return "❌ Error: Please select or enter a model ID"
    try:
        stats = get_tokenizer_stats(model_id, text)
        return (
            f"Token Count: {stats['token_count']}\n"
            f"Vocab Size: {stats['vocab_size']}\n"
            f"Tokens/Word: {stats['token_word_ratio']:.2f}\n"
            f"Chars/Token: {stats['chars_per_token']:.2f}"
        )
    except Exception as e:
        return f"❌ Analysis Failed: {str(e)}"
|
441 |
+
|
442 |
+
def compare_tokenizers(model_ids_str, use_standard_text):
    """Benchmark several tokenizers on the predefined text.

    Args:
        model_ids_str: Comma-separated model IDs; blanks are ignored.
        use_standard_text: Accepted for the UI wiring; not read here.

    Returns:
        A DataFrame with one row per requested model (metrics plus a status
        column; failed models get "-" placeholders), or a single-column
        "Error" DataFrame when no model was given or something unexpected
        went wrong.
    """
    def _row_for(model_id):
        # One result row; a failure for this model never aborts the others.
        try:
            stats = get_tokenizer_stats(model_id, PREDEFINED_TEXT)
            return {
                "Model ID": model_id,
                "Tokens": stats["token_count"],
                "Vocab Size": stats["vocab_size"],
                "Tokens/Word": f"{stats['token_word_ratio']:.2f}",
                "Chars/Token": f"{stats['chars_per_token']:.2f}",
                "Status": "✅ Success"
            }
        except Exception as e:
            return {
                "Model ID": model_id,
                "Tokens": "-",
                "Vocab Size": "-",
                "Tokens/Word": "-",
                "Chars/Token": "-",
                "Status": f"❌ {str(e)}"
            }

    try:
        stripped = (part.strip() for part in model_ids_str.split(','))
        requested = [name for name in stripped if name]
        if not requested:
            return pd.DataFrame({"Error": ["No models provided"]})
        return pd.DataFrame([_row_for(name) for name in requested])
    except Exception as e:
        return pd.DataFrame({"Error": [str(e)]})
|
471 |
+
|
472 |
+
def get_leaderboard_for_download():
    """Write the current leaderboard to a temporary CSV and wrap it for Gradio.

    Returns:
        A ``gr.File`` pointing at the freshly written CSV, or ``None`` when
        loading or writing failed (the error is only printed).
    """
    try:
        df = load_leaderboard_from_hub()
        # Only reserve a unique path here: the handle is closed before the
        # CSV is written, because reopening a still-open NamedTemporaryFile
        # by name is not supported on every platform. The file is left on
        # disk on purpose (delete=False) so it can still be served after
        # this function returns.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmpfile:
            csv_path = tmpfile.name
        df.to_csv(csv_path, index=False)
        # Return a Gradio File object, NOT just the path
        return gr.File(value=csv_path, label="Download CSV")
    except Exception as e:
        print(f"Error preparing file for download: {e}")
        return None
|
483 |
+
|
484 |
+
|
485 |
+
def initial_benchmark_run():
    """Seed the leaderboard with a fixed list of default models.

    Each default model that is not yet present in the leaderboard loaded at
    the start of the run is added via ``add_to_leaderboard``. Progress and
    errors are printed; nothing is raised and nothing is returned.
    """
    try:
        print("Starting initial benchmark run...")
        default_models = (
            "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
            "Qwen/Qwen2.5-7B-Instruct-1M",
            "simplescaling/s1.1-32B",
            "Xenova/gpt-4o",
            "microsoft/phi-4",
            "deepseek-ai/DeepSeek-R1",
            "google/gemma-2-27b-it",
            "HuggingFaceTB/SmolLM2-135M-Instruct",
            "mistralai/Mistral-7B-Instruct-v0.3",
            "tomg-group-umd/huginn-0125",
            "microsoft/Phi-3.5-mini-instruct",
            "openai-community/gpt2",
        )
        existing = load_leaderboard_from_hub()
        for model_id in default_models:
            # A failure for one model must not stop the remaining ones.
            try:
                if is_model_in_leaderboard(existing, model_id):
                    print(f"{model_id} already in leaderboard, skipping.")
                    continue
                print(f"Benchmarking {model_id}...")
                outcome = add_to_leaderboard(model_id)
                print(f"Result for {model_id}: {outcome}")
            except Exception as e:
                print(f"Error benchmarking {model_id}: {str(e)}")
        print("Initial benchmarking complete.")
    except Exception as e:
        print(f"Fatal error in initial benchmark: {str(e)}")
|
516 |
+
|
517 |
+
# --- Gradio Interface ---
|
518 |
+
# Builds the three-tab UI (Analyze / Compare / Leaderboard), wires the event
# handlers, then creates the dataset repo, seeds the leaderboard and launches.
# NOTE(review): the nesting below was reconstructed from a source whose
# indentation was lost -- confirm which components belong inside each
# gr.Row() before relying on the exact layout.
with gr.Blocks(title="Tokenizers Leaderboard", theme=gr.themes.Soft()) as iface:
    gr.Markdown("# 🏆 Tokenizers Leaderboard")

    with gr.Tab("Analyze"):
        gr.Markdown("## Single Tokenizer Analysis")
        with gr.Row():
            model_search = HuggingfaceHubSearch(label="Search Models", placeholder="Search Hugging Face models...", search_type="model")
            custom_model = gr.Textbox(label="Direct Model ID", placeholder="e.g.: mistralai/Mistral-7B-Instruct-v0.3", max_lines=1)
        # Hidden field holding the effective model ID; it mirrors whichever of
        # the two inputs above changed last (see the .change handlers below).
        model_id = gr.Textbox(visible=False)
        gr.Markdown("### Input Text")
        text_input = gr.Textbox(lines=5, value=PREDEFINED_TEXT, label="Analysis Text")
        with gr.Row():
            analyze_btn = gr.Button("Analyze", variant="primary")
            add_btn = gr.Button("Add to Leaderboard")
        analysis_output = gr.Textbox(label="Results", interactive=False)
        model_search.change(lambda x: x, model_search, model_id)
        custom_model.change(lambda x: x, custom_model, model_id)
        analyze_btn.click(analyze_tokenizer, [model_id, text_input], analysis_output)
        # Kept in a variable so the leaderboard refresh can be chained after it.
        add_event = add_btn.click(add_to_leaderboard, model_id, analysis_output)

    with gr.Tab("Compare"):
        gr.Markdown("## Multi-Model Comparison")
        gr.Markdown(f"**Standard Text:** `{PREDEFINED_TEXT[:80]}...`")
        model_ids = gr.Textbox(label="Model IDs (comma-separated)", placeholder="Enter models: meta-llama/Llama-2-7b, google/gemma-7b, ...")
        compare_btn = gr.Button("Compare Models", variant="primary")
        comparison_table = gr.DataFrame(label="Results", interactive=False)
        # The invisible checkbox only supplies the use_standard_text argument.
        compare_btn.click(compare_tokenizers, [model_ids, gr.Checkbox(value=True, visible=False)], comparison_table)

    with gr.Tab("Leaderboard"):
        gr.Markdown("## Performance Leaderboard")
        with gr.Row():
            download_btn = gr.DownloadButton(label="Download CSV", value="tokenizer_leaderboard.csv")
        leaderboard_table = gr.DataFrame(label="Top Tokenizers", headers=LEADERBOARD_COLUMNS, interactive=False,
                                         datatype=["str", "number", "number", "number", "number", "str"])

        # Connect the download button to the function that prepares the CSV
        download_btn.click(get_leaderboard_for_download, inputs=[], outputs=download_btn)

    # Fill the leaderboard table when the page loads, and refresh it after
    # every "Add to Leaderboard" click.
    iface.load(fn=load_leaderboard_from_hub, outputs=leaderboard_table)
    add_event.then(load_leaderboard_from_hub, None, leaderboard_table)


# Startup sequence: make sure the dataset repo exists, seed the default
# models, then start the app.
create_huggingface_dataset()
initial_benchmark_run()
iface.launch()
|