Update app.py
Browse files
app.py
CHANGED
@@ -162,12 +162,6 @@ def make_treemap_data(df, count_by, top_k=25, tag_filter=None, pipeline_filter=N
|
|
162 |
if skip_orgs and len(skip_orgs) > 0:
|
163 |
filtered_df = filtered_df[~filtered_df["organization"].isin(skip_orgs)]
|
164 |
|
165 |
-
# Ensure count_by column exists with valid values
|
166 |
-
if count_by not in filtered_df.columns or filtered_df[count_by].isna().all():
|
167 |
-
print(f"Warning: {count_by} column is missing or all values are NaN")
|
168 |
-
# Create a default column with value 1 for all rows if count_by is missing
|
169 |
-
filtered_df[count_by] = 1
|
170 |
-
|
171 |
# Aggregate by organization
|
172 |
org_totals = filtered_df.groupby("organization")[count_by].sum().reset_index()
|
173 |
org_totals = org_totals.sort_values(by=count_by, ascending=False)
|
@@ -218,16 +212,9 @@ def create_treemap(treemap_data, count_by, title=None):
|
|
218 |
)
|
219 |
|
220 |
# Update traces for better readability
|
221 |
-
metric_display_names = {
|
222 |
-
"downloads": "Downloads (Last 30 days)",
|
223 |
-
"downloadsAllTime": "Downloads (All Time)",
|
224 |
-
"likes": "Likes"
|
225 |
-
}
|
226 |
-
display_name = metric_display_names.get(count_by, count_by.capitalize())
|
227 |
-
|
228 |
fig.update_traces(
|
229 |
textinfo="label+value+percent root",
|
230 |
-
hovertemplate="<b>%{label}</b><br>%{value:,} " +
|
231 |
)
|
232 |
|
233 |
return fig
|
@@ -248,18 +235,132 @@ def load_models_csv():
|
|
248 |
|
249 |
df['tags'] = df['tags'].apply(process_tags)
|
250 |
|
251 |
-
#
|
252 |
-
|
253 |
-
for metric in required_metrics:
|
254 |
-
if metric in df.columns:
|
255 |
-
df[metric] = pd.to_numeric(df[metric], errors='coerce').fillna(0)
|
256 |
-
else:
|
257 |
-
print(f"Warning: '{metric}' not found in CSV. Creating empty column.")
|
258 |
-
df[metric] = 0
|
259 |
|
260 |
return df
|
261 |
|
262 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
263 |
|
264 |
# Create Gradio interface
|
265 |
with gr.Blocks() as demo:
|
@@ -272,20 +373,14 @@ with gr.Blocks() as demo:
|
|
272 |
This app shows how different organizations contribute to the HuggingFace ecosystem with their models.
|
273 |
Use the filters to explore models by different metrics, tags, pipelines, and model sizes.
|
274 |
|
275 |
-
The treemap visualizes models grouped by organization, with the size of each box representing the selected metric (
|
276 |
-
|
277 |
-
*Note: Stats are correct as of May 12, 2025*
|
278 |
""")
|
279 |
|
280 |
with gr.Row():
|
281 |
with gr.Column(scale=1):
|
282 |
count_by_dropdown = gr.Dropdown(
|
283 |
label="Metric",
|
284 |
-
choices=[
|
285 |
-
("downloads", "Downloads (Last 30 days)"),
|
286 |
-
("downloadsAllTime", "Downloads (All Time)"),
|
287 |
-
("likes", "Likes")
|
288 |
-
],
|
289 |
value="downloads",
|
290 |
info="Select the metric to determine box sizes"
|
291 |
)
|
@@ -377,17 +472,10 @@ with gr.Blocks() as demo:
|
|
377 |
)
|
378 |
|
379 |
# Create plot
|
380 |
-
metric_display_names = {
|
381 |
-
"downloads": "Downloads (Last 30 days)",
|
382 |
-
"downloadsAllTime": "Downloads (All Time)",
|
383 |
-
"likes": "Likes"
|
384 |
-
}
|
385 |
-
display_name = metric_display_names.get(count_by, count_by.capitalize())
|
386 |
-
|
387 |
fig = create_treemap(
|
388 |
treemap_data=treemap_data,
|
389 |
count_by=count_by,
|
390 |
-
title=f"HuggingFace Models - {
|
391 |
)
|
392 |
|
393 |
# Generate statistics
|
@@ -399,21 +487,14 @@ with gr.Blocks() as demo:
|
|
399 |
top_5_orgs = treemap_data.groupby("organization")[count_by].sum().sort_values(ascending=False).head(5)
|
400 |
|
401 |
# Format the statistics using clean markdown
|
402 |
-
metric_display_names = {
|
403 |
-
"downloads": "Downloads (Last 30 days)",
|
404 |
-
"downloadsAllTime": "Downloads (All Time)",
|
405 |
-
"likes": "Likes"
|
406 |
-
}
|
407 |
-
display_name = metric_display_names.get(count_by, count_by.capitalize())
|
408 |
-
|
409 |
stats_md = f"""
|
410 |
-
## Statistics
|
411 |
- **Total models shown**: {total_models:,}
|
412 |
-
- **Total {
|
413 |
|
414 |
-
## Top Organizations by {
|
415 |
|
416 |
-
| Organization | {
|
417 |
|--------------|--------:|--------:|"""
|
418 |
|
419 |
# Add each organization as a row in the table
|
|
|
162 |
if skip_orgs and len(skip_orgs) > 0:
|
163 |
filtered_df = filtered_df[~filtered_df["organization"].isin(skip_orgs)]
|
164 |
|
|
|
|
|
|
|
|
|
|
|
|
|
165 |
# Aggregate by organization
|
166 |
org_totals = filtered_df.groupby("organization")[count_by].sum().reset_index()
|
167 |
org_totals = org_totals.sort_values(by=count_by, ascending=False)
|
|
|
212 |
)
|
213 |
|
214 |
# Update traces for better readability
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
215 |
fig.update_traces(
|
216 |
textinfo="label+value+percent root",
|
217 |
+
hovertemplate="<b>%{label}</b><br>%{value:,} " + count_by + "<br>%{percentRoot:.2%} of total<extra></extra>"
|
218 |
)
|
219 |
|
220 |
return fig
|
|
|
235 |
|
236 |
df['tags'] = df['tags'].apply(process_tags)
|
237 |
|
238 |
+
# Add more sample data for better visualization
|
239 |
+
add_sample_data(df)
|
|
|
|
|
|
|
|
|
|
|
|
|
240 |
|
241 |
return df
|
242 |
|
243 |
+
def add_sample_data(df):
    """Generate synthetic model rows and return a NEW DataFrame with them appended.

    NOTE: this does NOT mutate *df* in place — callers must use the return
    value (``df = add_sample_data(df)``). A bare ``add_sample_data(df)`` call
    silently discards every generated row.

    For each of ~19 well-known organizations, 5–10 fake models are created
    with plausible names, pipeline tags, tags, download/like counts (skewed so
    earlier organizations in the list are more popular), and parameter counts
    loosely correlated with the size token in the model name.

    Parameters
    ----------
    df : pandas.DataFrame
        Existing model data; expected to share the generated columns
        (id, author, downloads, likes, pipeline_tag, tags, params).

    Returns
    -------
    pandas.DataFrame
        *df* plus the generated rows, re-indexed from 0 (``ignore_index=True``).
    """
    # Top organizations to include
    orgs = ['openai', 'meta', 'google', 'microsoft', 'anthropic', 'nvidia', 'huggingface',
            'deepseek-ai', 'stability-ai', 'mistralai', 'cerebras', 'databricks', 'together',
            'facebook', 'amazon', 'deepmind', 'cohere', 'bigscience', 'eleutherai']

    # Common model name formats
    model_name_patterns = [
        "model-{size}-{version}",
        "{prefix}-{size}b",
        "{prefix}-{size}b-{variant}",
        "llama-{size}b-{variant}",
        "gpt-{variant}-{size}b",
        "{prefix}-instruct-{size}b",
        "{prefix}-chat-{size}b",
        "{prefix}-coder-{size}b",
        "stable-diffusion-{version}",
        "whisper-{size}",
        "bert-{size}-{variant}",
        "roberta-{size}",
        "t5-{size}",
        "{prefix}-vision-{size}b"
    ]

    # Common name parts
    prefixes = ["falcon", "llama", "mistral", "gpt", "phi", "gemma", "qwen", "yi", "mpt", "bloom"]
    sizes = ["7", "13", "34", "70", "1", "3", "7b", "13b", "70b", "8b", "2b", "1b", "0.5b", "small", "base", "large", "huge"]
    variants = ["chat", "instruct", "base", "v1.0", "v2", "beta", "turbo", "fast", "xl", "xxl"]

    # Generate sample data
    sample_data = []
    for org_idx, org in enumerate(orgs):
        # Create 5-10 models per organization (randint's upper bound is exclusive)
        num_models = np.random.randint(5, 11)

        for _ in range(num_models):
            # Create realistic model name
            pattern = np.random.choice(model_name_patterns)
            prefix = np.random.choice(prefixes)
            size = np.random.choice(sizes)
            version = f"v{np.random.randint(1, 4)}"
            variant = np.random.choice(variants)

            # Unused placeholders in the chosen pattern are simply ignored by
            # str.format with keyword arguments.
            model_name = pattern.format(
                prefix=prefix,
                size=size,
                version=version,
                variant=variant
            )

            model_id = f"{org}/{model_name}"

            # Select a realistic pipeline tag based on name
            if "diffusion" in model_name or "image" in model_name:
                pipeline_tag = np.random.choice(["text-to-image", "image-to-image", "image-segmentation"])
            elif "whisper" in model_name or "speech" in model_name:
                pipeline_tag = np.random.choice(["automatic-speech-recognition", "text-to-speech"])
            elif "coder" in model_name or "code" in model_name:
                pipeline_tag = "text-generation"
            elif "bert" in model_name or "roberta" in model_name:
                pipeline_tag = np.random.choice(["fill-mask", "text-classification", "token-classification"])
            elif "vision" in model_name:
                pipeline_tag = np.random.choice(["image-classification", "image-to-text", "visual-question-answering"])
            else:
                pipeline_tag = "text-generation"  # Most common

            # Generate realistic tags, always starting with the pipeline tag
            tags = [pipeline_tag]

            if "text-generation" in pipeline_tag:
                tags.extend(["language-model", "text", "gpt", "llm"])
                if "instruct" in model_name:
                    tags.append("instruction-following")
                if "chat" in model_name:
                    tags.append("chat")
            elif "speech" in pipeline_tag:
                tags.extend(["audio", "speech", "voice"])
            elif "image" in pipeline_tag:
                tags.extend(["vision", "image", "diffusion"])

            # Add language tags
            if np.random.random() < 0.8:  # 80% chance for English
                tags.append("en")
            if np.random.random() < 0.3:  # 30% chance for multilingual
                tags.append("multilingual")

            # Generate downloads and likes (weighted by org position for variety).
            # Earlier orgs get more downloads to make the visualization interesting.
            # popularity_factor ranges from 1.0 (first org) down to 1/len(orgs)
            # (last org) — it never actually reaches 0.
            popularity_factor = (len(orgs) - org_idx) / len(orgs)
            base_downloads = 10000 * (10 ** (2 * popularity_factor))
            downloads = int(base_downloads * np.random.uniform(0.3, 3.0))
            likes = int(downloads * np.random.uniform(0.01, 0.1))  # 1-10% like ratio

            # Generate model size (params) loosely correlated with the size
            # token in the name; non-numeric tokens ("large", "xl", ...) map to 3.
            size_indicator = 1
            for s in ["70b", "13b", "7b", "3b", "2b", "1b", "large", "huge", "xl", "xxl"]:
                if s in model_name.lower():
                    size_indicator = float(s.replace("b", "")) if s[0].isdigit() else 3
                    break

            # Parameter count scaled to billions
            params = int(np.random.uniform(0.5, 2.0) * size_indicator * 1e9)

            # Create model entry
            model = {
                "id": model_id,
                "author": org,
                "downloads": downloads,
                "likes": likes,
                "pipeline_tag": pipeline_tag,
                "tags": tags,
                "params": params
            }

            sample_data.append(model)

    # Convert sample data to DataFrame and append to original
    sample_df = pd.DataFrame(sample_data)
    return pd.concat([df, sample_df], ignore_index=True)
|
364 |
|
365 |
# Create Gradio interface
|
366 |
with gr.Blocks() as demo:
|
|
|
373 |
This app shows how different organizations contribute to the HuggingFace ecosystem with their models.
|
374 |
Use the filters to explore models by different metrics, tags, pipelines, and model sizes.
|
375 |
|
376 |
+
The treemap visualizes models grouped by organization, with the size of each box representing the selected metric (downloads or likes).
|
|
|
|
|
377 |
""")
|
378 |
|
379 |
with gr.Row():
|
380 |
with gr.Column(scale=1):
|
381 |
count_by_dropdown = gr.Dropdown(
|
382 |
label="Metric",
|
383 |
+
choices=["downloads", "likes"],
|
|
|
|
|
|
|
|
|
384 |
value="downloads",
|
385 |
info="Select the metric to determine box sizes"
|
386 |
)
|
|
|
472 |
)
|
473 |
|
474 |
# Create plot
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
475 |
fig = create_treemap(
|
476 |
treemap_data=treemap_data,
|
477 |
count_by=count_by,
|
478 |
+
title=f"HuggingFace Models - {count_by.capitalize()} by Organization"
|
479 |
)
|
480 |
|
481 |
# Generate statistics
|
|
|
487 |
top_5_orgs = treemap_data.groupby("organization")[count_by].sum().sort_values(ascending=False).head(5)
|
488 |
|
489 |
# Format the statistics using clean markdown
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
490 |
stats_md = f"""
|
491 |
+
## Statistics
|
492 |
- **Total models shown**: {total_models:,}
|
493 |
+
- **Total {count_by}**: {int(total_value):,}
|
494 |
|
495 |
+
## Top Organizations by {count_by.capitalize()}
|
496 |
|
497 |
+
| Organization | {count_by.capitalize()} | % of Total |
|
498 |
|--------------|--------:|--------:|"""
|
499 |
|
500 |
# Add each organization as a row in the table
|