evijit HF Staff committed on
Commit
3043125
·
verified ·
1 Parent(s): 18ef775

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +131 -50
app.py CHANGED
@@ -162,12 +162,6 @@ def make_treemap_data(df, count_by, top_k=25, tag_filter=None, pipeline_filter=N
162
  if skip_orgs and len(skip_orgs) > 0:
163
  filtered_df = filtered_df[~filtered_df["organization"].isin(skip_orgs)]
164
 
165
- # Ensure count_by column exists with valid values
166
- if count_by not in filtered_df.columns or filtered_df[count_by].isna().all():
167
- print(f"Warning: {count_by} column is missing or all values are NaN")
168
- # Create a default column with value 1 for all rows if count_by is missing
169
- filtered_df[count_by] = 1
170
-
171
  # Aggregate by organization
172
  org_totals = filtered_df.groupby("organization")[count_by].sum().reset_index()
173
  org_totals = org_totals.sort_values(by=count_by, ascending=False)
@@ -218,16 +212,9 @@ def create_treemap(treemap_data, count_by, title=None):
218
  )
219
 
220
  # Update traces for better readability
221
- metric_display_names = {
222
- "downloads": "Downloads (Last 30 days)",
223
- "downloadsAllTime": "Downloads (All Time)",
224
- "likes": "Likes"
225
- }
226
- display_name = metric_display_names.get(count_by, count_by.capitalize())
227
-
228
  fig.update_traces(
229
  textinfo="label+value+percent root",
230
- hovertemplate="<b>%{label}</b><br>%{value:,} " + display_name + "<br>%{percentRoot:.2%} of total<extra></extra>"
231
  )
232
 
233
  return fig
@@ -248,18 +235,132 @@ def load_models_csv():
248
 
249
  df['tags'] = df['tags'].apply(process_tags)
250
 
251
- # Ensure all required metrics are present and convert to numeric
252
- required_metrics = ['downloads', 'likes', 'downloadsAllTime']
253
- for metric in required_metrics:
254
- if metric in df.columns:
255
- df[metric] = pd.to_numeric(df[metric], errors='coerce').fillna(0)
256
- else:
257
- print(f"Warning: '{metric}' not found in CSV. Creating empty column.")
258
- df[metric] = 0
259
 
260
  return df
261
 
262
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
263
 
264
  # Create Gradio interface
265
  with gr.Blocks() as demo:
@@ -272,20 +373,14 @@ with gr.Blocks() as demo:
272
  This app shows how different organizations contribute to the HuggingFace ecosystem with their models.
273
  Use the filters to explore models by different metrics, tags, pipelines, and model sizes.
274
 
275
- The treemap visualizes models grouped by organization, with the size of each box representing the selected metric (Downloads, Likes).
276
-
277
- *Note: Stats are correct as of May 12, 2025*
278
  """)
279
 
280
  with gr.Row():
281
  with gr.Column(scale=1):
282
  count_by_dropdown = gr.Dropdown(
283
  label="Metric",
284
- choices=[
285
- ("downloads", "Downloads (Last 30 days)"),
286
- ("downloadsAllTime", "Downloads (All Time)"),
287
- ("likes", "Likes")
288
- ],
289
  value="downloads",
290
  info="Select the metric to determine box sizes"
291
  )
@@ -377,17 +472,10 @@ with gr.Blocks() as demo:
377
  )
378
 
379
  # Create plot
380
- metric_display_names = {
381
- "downloads": "Downloads (Last 30 days)",
382
- "downloadsAllTime": "Downloads (All Time)",
383
- "likes": "Likes"
384
- }
385
- display_name = metric_display_names.get(count_by, count_by.capitalize())
386
-
387
  fig = create_treemap(
388
  treemap_data=treemap_data,
389
  count_by=count_by,
390
- title=f"HuggingFace Models - {display_name} by Organization"
391
  )
392
 
393
  # Generate statistics
@@ -399,21 +487,14 @@ with gr.Blocks() as demo:
399
  top_5_orgs = treemap_data.groupby("organization")[count_by].sum().sort_values(ascending=False).head(5)
400
 
401
  # Format the statistics using clean markdown
402
- metric_display_names = {
403
- "downloads": "Downloads (Last 30 days)",
404
- "downloadsAllTime": "Downloads (All Time)",
405
- "likes": "Likes"
406
- }
407
- display_name = metric_display_names.get(count_by, count_by.capitalize())
408
-
409
  stats_md = f"""
410
- ## Statistics as of May 12, 2025
411
  - **Total models shown**: {total_models:,}
412
- - **Total {display_name}**: {int(total_value):,}
413
 
414
- ## Top Organizations by {display_name}
415
 
416
- | Organization | {display_name} | % of Total |
417
  |--------------|--------:|--------:|"""
418
 
419
  # Add each organization as a row in the table
 
162
  if skip_orgs and len(skip_orgs) > 0:
163
  filtered_df = filtered_df[~filtered_df["organization"].isin(skip_orgs)]
164
 
 
 
 
 
 
 
165
  # Aggregate by organization
166
  org_totals = filtered_df.groupby("organization")[count_by].sum().reset_index()
167
  org_totals = org_totals.sort_values(by=count_by, ascending=False)
 
212
  )
213
 
214
  # Update traces for better readability
 
 
 
 
 
 
 
215
  fig.update_traces(
216
  textinfo="label+value+percent root",
217
+ hovertemplate="<b>%{label}</b><br>%{value:,} " + count_by + "<br>%{percentRoot:.2%} of total<extra></extra>"
218
  )
219
 
220
  return fig
 
235
 
236
  df['tags'] = df['tags'].apply(process_tags)
237
 
238
+ # Add more sample data for better visualization
239
+ add_sample_data(df)
 
 
 
 
 
 
240
 
241
  return df
242
 
243
def add_sample_data(df, seed=None):
    """Return a new DataFrame: ``df`` plus synthetic model rows.

    ``df`` itself is NOT modified. The combined frame is built with
    ``pd.concat`` and returned, so callers must use the return value
    (``df = add_sample_data(df)``); calling this function and discarding
    the result has no effect.

    Each synthetic row has the keys ``id``, ``author``, ``downloads``,
    ``likes``, ``pipeline_tag``, ``tags`` and ``params``.

    NOTE(review): the rows carry ``author`` but no ``organization`` key.
    If downstream code groups by ``organization``, confirm that column is
    derived after this function runs, otherwise these rows are dropped.

    Args:
        df: Existing models DataFrame to extend.
        seed: Optional integer seed. When given, a private
            ``np.random.RandomState(seed)`` is used so the output is
            reproducible; when ``None`` (default) the global ``np.random``
            state is used.

    Returns:
        A new DataFrame containing the rows of ``df`` followed by 5-10
        synthetic rows for each of the hard-coded organizations, with a
        fresh integer index.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Organizations to fabricate models for; earlier entries are made
    # more popular below.
    orgs = [
        'openai', 'meta', 'google', 'microsoft', 'anthropic', 'nvidia',
        'huggingface', 'deepseek-ai', 'stability-ai', 'mistralai',
        'cerebras', 'databricks', 'together', 'facebook', 'amazon',
        'deepmind', 'cohere', 'bigscience', 'eleutherai',
    ]

    model_name_patterns = [
        "model-{size}-{version}",
        "{prefix}-{size}b",
        "{prefix}-{size}b-{variant}",
        "llama-{size}b-{variant}",
        "gpt-{variant}-{size}b",
        "{prefix}-instruct-{size}b",
        "{prefix}-chat-{size}b",
        "{prefix}-coder-{size}b",
        "stable-diffusion-{version}",
        "whisper-{size}",
        "bert-{size}-{variant}",
        "roberta-{size}",
        "t5-{size}",
        "{prefix}-vision-{size}b",
    ]

    prefixes = ["falcon", "llama", "mistral", "gpt", "phi", "gemma", "qwen", "yi", "mpt", "bloom"]
    variants = ["chat", "instruct", "base", "v1.0", "v2", "beta", "turbo", "fast", "xl", "xxl"]

    # Sizes are split into two pools so a pattern that already appends
    # "b" never receives "7b" or "small" (which produced "7bb"/"smallb").
    billions = ["0.5", "1", "2", "3", "7", "8", "13", "34", "70"]
    standalone_sizes = ["small", "base", "large", "huge"] + [f"{n}b" for n in billions]

    sample_data = []
    for org_idx, org in enumerate(orgs):
        # Ranges from 1.0 for the first org down to 1/len(orgs) for the
        # last one (it never reaches 0).
        popularity_factor = (len(orgs) - org_idx) / len(orgs)
        base_downloads = 10000 * (10 ** (2 * popularity_factor))

        # 5-10 models per organization (randint's upper bound is exclusive).
        for _ in range(rng.randint(5, 11)):
            pattern = str(rng.choice(model_name_patterns))
            size_pool = billions if "{size}b" in pattern else standalone_sizes
            model_name = pattern.format(
                prefix=str(rng.choice(prefixes)),
                size=str(rng.choice(size_pool)),
                version=f"v{rng.randint(1, 4)}",
                variant=str(rng.choice(variants)),
            )

            pipeline_tag = _pick_pipeline_tag(model_name, rng)
            tags = _build_tags(model_name, pipeline_tag, rng)

            downloads = int(base_downloads * rng.uniform(0.3, 3.0))
            likes = int(downloads * rng.uniform(0.01, 0.1))  # 1-10% like ratio

            # Scaled by 1e9 per "b" in the name, so this looks like a
            # parameter count rather than a byte size.
            params = int(rng.uniform(0.5, 2.0) * _size_indicator(model_name) * 1e9)

            sample_data.append({
                "id": f"{org}/{model_name}",
                "author": org,
                "downloads": downloads,
                "likes": likes,
                "pipeline_tag": pipeline_tag,
                "tags": tags,
                "params": params,
            })

    sample_df = pd.DataFrame(sample_data)
    return pd.concat([df, sample_df], ignore_index=True)


def _pick_pipeline_tag(model_name, rng):
    """Choose a pipeline tag that is plausible for ``model_name``."""
    if "diffusion" in model_name or "image" in model_name:
        return str(rng.choice(["text-to-image", "image-to-image", "image-segmentation"]))
    if "whisper" in model_name or "speech" in model_name:
        return str(rng.choice(["automatic-speech-recognition", "text-to-speech"]))
    if "coder" in model_name or "code" in model_name:
        return "text-generation"
    if "bert" in model_name or "roberta" in model_name:
        return str(rng.choice(["fill-mask", "text-classification", "token-classification"]))
    if "vision" in model_name:
        return str(rng.choice(["image-classification", "image-to-text", "visual-question-answering"]))
    return "text-generation"  # most common fallback


def _build_tags(model_name, pipeline_tag, rng):
    """Build the tag list: pipeline tag first, then topical and language tags."""
    tags = [pipeline_tag]

    if "text-generation" in pipeline_tag:
        tags.extend(["language-model", "text", "gpt", "llm"])
        if "instruct" in model_name:
            tags.append("instruction-following")
        if "chat" in model_name:
            tags.append("chat")
    elif "speech" in pipeline_tag:
        tags.extend(["audio", "speech", "voice"])
    elif "image" in pipeline_tag:
        tags.extend(["vision", "image", "diffusion"])

    if rng.random() < 0.8:  # 80% chance for English
        tags.append("en")
    if rng.random() < 0.3:  # 30% chance for multilingual
        tags.append("multilingual")
    return tags


def _size_indicator(model_name):
    """Return the size multiplier (in billions) implied by ``model_name``."""
    import re  # local import: the file's import block is not visible here

    lowered = model_name.lower()
    # Any "<number>b" token counts, so 8b, 34b and 0.5b are recognised
    # as well as the common 7b/13b/70b.
    match = re.search(r"(\d+(?:\.\d+)?)b(?![a-z0-9])", lowered)
    if match:
        return float(match.group(1))
    if any(marker in lowered for marker in ("large", "huge", "xl", "xxl")):
        return 3
    return 1
364
 
365
  # Create Gradio interface
366
  with gr.Blocks() as demo:
 
373
  This app shows how different organizations contribute to the HuggingFace ecosystem with their models.
374
  Use the filters to explore models by different metrics, tags, pipelines, and model sizes.
375
 
376
+ The treemap visualizes models grouped by organization, with the size of each box representing the selected metric (downloads or likes).
 
 
377
  """)
378
 
379
  with gr.Row():
380
  with gr.Column(scale=1):
381
  count_by_dropdown = gr.Dropdown(
382
  label="Metric",
383
+ choices=["downloads", "likes"],
 
 
 
 
384
  value="downloads",
385
  info="Select the metric to determine box sizes"
386
  )
 
472
  )
473
 
474
  # Create plot
 
 
 
 
 
 
 
475
  fig = create_treemap(
476
  treemap_data=treemap_data,
477
  count_by=count_by,
478
+ title=f"HuggingFace Models - {count_by.capitalize()} by Organization"
479
  )
480
 
481
  # Generate statistics
 
487
  top_5_orgs = treemap_data.groupby("organization")[count_by].sum().sort_values(ascending=False).head(5)
488
 
489
  # Format the statistics using clean markdown
 
 
 
 
 
 
 
490
  stats_md = f"""
491
+ ## Statistics
492
  - **Total models shown**: {total_models:,}
493
+ - **Total {count_by}**: {int(total_value):,}
494
 
495
+ ## Top Organizations by {count_by.capitalize()}
496
 
497
+ | Organization | {count_by.capitalize()} | % of Total |
498
  |--------------|--------:|--------:|"""
499
 
500
  # Add each organization as a row in the table