evijit HF Staff committed on
Commit 98b7de8 · verified · 1 Parent(s): 723854d

Update app.py

Files changed (1):
  1. app.py +75 -204

app.py CHANGED
@@ -2,13 +2,11 @@ import json
 import gradio as gr
 import pandas as pd
 import plotly.express as px
-import pyarrow.parquet as pq
 import os
-import requests
-from io import BytesIO
 import numpy as np
+import io
 
-# Define pipeline tags from the provided code
+# Define pipeline tags
 PIPELINE_TAGS = [
     'text-generation',
     'text-to-image',
@@ -59,61 +57,63 @@ MODEL_SIZE_RANGES = {
     "XX-Large (>50GB)": (50, float('inf'))
 }
 
-# Filter functions for tags - keeping the same from provided code
-def is_audio_speech(model_dict):
-    tags = model_dict.get("tags", [])
-    pipeline_tag = model_dict.get("pipeline_tag", "")
+# Filter functions for tags
+def is_audio_speech(row):
+    tags = row.get("tags", [])
+    pipeline_tag = row.get("pipeline_tag", "")
 
     return (pipeline_tag and ("audio" in pipeline_tag.lower() or "speech" in pipeline_tag.lower())) or \
            any("audio" in tag.lower() for tag in tags) or \
            any("speech" in tag.lower() for tag in tags)
 
-def is_music(model_dict):
-    tags = model_dict.get("tags", [])
+def is_music(row):
+    tags = row.get("tags", [])
     return any("music" in tag.lower() for tag in tags)
 
-def is_robotics(model_dict):
-    tags = model_dict.get("tags", [])
+def is_robotics(row):
+    tags = row.get("tags", [])
     return any("robot" in tag.lower() for tag in tags)
 
-def is_biomed(model_dict):
-    tags = model_dict.get("tags", [])
+def is_biomed(row):
+    tags = row.get("tags", [])
     return any("bio" in tag.lower() for tag in tags) or \
            any("medic" in tag.lower() for tag in tags)
 
-def is_timeseries(model_dict):
-    tags = model_dict.get("tags", [])
+def is_timeseries(row):
+    tags = row.get("tags", [])
     return any("series" in tag.lower() for tag in tags)
 
-def is_science(model_dict):
-    tags = model_dict.get("tags", [])
+def is_science(row):
+    tags = row.get("tags", [])
     return any("science" in tag.lower() and "bigscience" not in tag for tag in tags)
 
-def is_video(model_dict):
-    tags = model_dict.get("tags", [])
+def is_video(row):
+    tags = row.get("tags", [])
     return any("video" in tag.lower() for tag in tags)
 
-def is_image(model_dict):
-    tags = model_dict.get("tags", [])
+def is_image(row):
+    tags = row.get("tags", [])
     return any("image" in tag.lower() for tag in tags)
 
-def is_text(model_dict):
-    tags = model_dict.get("tags", [])
+def is_text(row):
+    tags = row.get("tags", [])
     return any("text" in tag.lower() for tag in tags)
 
 # Add model size filter function
-def is_in_size_range(model_dict, size_range):
+def is_in_size_range(row, size_range):
     if size_range is None:
         return True
 
     min_size, max_size = MODEL_SIZE_RANGES[size_range]
 
-    # Get model size in GB from safetensors total (if available)
-    safetensors = model_dict.get("safetensors", None)
-    if safetensors and isinstance(safetensors, dict) and "total" in safetensors:
-        # Convert bytes to GB
-        size_gb = safetensors["total"] / (1024 * 1024 * 1024)
-        return min_size <= size_gb < max_size
+    # Get model size in GB from params column
+    if "params" in row and pd.notna(row["params"]):
+        try:
+            # Convert to GB (assuming params are in bytes or scientific notation)
+            size_gb = float(row["params"]) / (1024 * 1024 * 1024)
+            return min_size <= size_gb < max_size
+        except (ValueError, TypeError):
+            return False
 
     return False
 
@@ -198,7 +198,8 @@ def create_treemap(treemap_data, count_by, title=None):
         treemap_data,
         path=["root", "organization", "id"],
        values=count_by,
-        title=title or f"HuggingFace Models - {count_by.capitalize()} by Organization"
+        title=title or f"HuggingFace Models - {count_by.capitalize()} by Organization",
+        color_discrete_sequence=px.colors.qualitative.Plotly
     )
 
     # Update layout
@@ -214,133 +215,34 @@ def create_treemap(treemap_data, count_by, title=None):
 
     return fig
 
-def download_with_progress(url, progress=None):
-    """Download a file with progress tracking"""
-    try:
-        response = requests.get(url, stream=True)
-        total_size = int(response.headers.get('content-length', 0))
-        block_size = 1024  # 1 Kibibyte
-        data = BytesIO()
-
-        if total_size == 0:
-            # If content length is unknown, we can't show accurate progress
-            if progress is not None:
-                progress(0, "Starting download...")
-
-            for chunk in response.iter_content(block_size):
-                data.write(chunk)
-                if progress is not None:
-                    progress(0, f"Downloading... (unknown size)")
-        else:
-            downloaded = 0
-            for chunk in response.iter_content(block_size):
-                downloaded += len(chunk)
-                data.write(chunk)
-                if progress is not None:
-                    percent = int(100 * downloaded / total_size)
-                    progress(percent / 100, f"Downloading... {percent}% ({downloaded//(1024*1024)}MB/{total_size//(1024*1024)}MB)")
-
-        return data.getvalue()
-    except Exception as e:
-        print(f"Error in download_with_progress: {e}")
-        raise
-
-def update_progress(progress_obj, value, description):
-    """Safely update progress with error handling"""
-    try:
-        if progress_obj is not None:
-            progress_obj(value, description)
-    except Exception as e:
-        print(f"Error updating progress: {e}")
-
-def download_and_process_models(progress=None):
-    """Download and process the models data from HuggingFace dataset with progress tracking"""
-    try:
-        # Create a cache directory
-        if not os.path.exists('data'):
-            os.makedirs('data')
-
-        # Check if we have cached data
-        if os.path.exists('data/processed_models.parquet'):
-            update_progress(progress, 1.0, "Loading from cache...")
-            print("Loading models from cache...")
-            df = pd.read_parquet('data/processed_models.parquet')
-            return df
 
-        # URL to the models.parquet file
-        url = "https://huggingface.co/datasets/cfahlgren1/hub-stats/resolve/main/models.parquet"
-
-        update_progress(progress, 0.0, "Starting download...")
-        print(f"Downloading models data from {url}...")
-
-        try:
-            # Download with progress tracking
-            file_content = download_with_progress(url, progress)
-
-            update_progress(progress, 0.9, "Parsing parquet file...")
-
-            # Read the parquet file
-            table = pq.read_table(BytesIO(file_content))
-            df = table.to_pandas()
-
-            print(f"Downloaded {len(df)} models")
-
-            update_progress(progress, 0.95, "Processing data...")
-
-            # Process the safetensors column if it's a string (JSON)
-            if 'safetensors' in df.columns:
-                def parse_safetensors(val):
-                    if isinstance(val, str):
-                        try:
-                            return json.loads(val)
-                        except:
-                            return None
-                    return val
-
-                df['safetensors'] = df['safetensors'].apply(parse_safetensors)
-
-            # Process the tags column if needed
-            if 'tags' in df.columns and len(df) > 0 and not isinstance(df['tags'].iloc[0], list):
-                def parse_tags(val):
-                    if isinstance(val, str):
-                        try:
-                            return json.loads(val)
-                        except:
-                            return []
-                    return val if isinstance(val, list) else []
-
-                df['tags'] = df['tags'].apply(parse_tags)
-
-            # Cache the processed data
-            update_progress(progress, 0.98, "Saving to cache...")
-            df.to_parquet('data/processed_models.parquet')
-
-            update_progress(progress, 1.0, "Data ready!")
-
-            return df
-
-        except Exception as download_error:
-            print(f"Download failed: {download_error}")
-            update_progress(progress, 0.5, "Download failed, generating sample data...")
-            return create_sample_data(progress)
 
-    except Exception as e:
-        print(f"Error downloading or processing data: {e}")
-        update_progress(progress, 1.0, "Using sample data (error occurred)")
-        # Return sample data for testing if real data unavailable
-        return create_sample_data(progress)
-
-def create_sample_data(progress=None):
-    """Create sample data for testing when real data is unavailable"""
-    print("Creating sample data for testing...")
 
-    if progress:
-        progress(0.3, "Creating sample data...")
 
-    # Sample organizations
+def load_models_csv():
+    # Read the CSV file
+    df = pd.read_csv('models.csv')
+
+    # Process the tags column
+    def process_tags(tags_str):
+        if pd.isna(tags_str):
+            return []
+
+        # Clean the string and convert to a list
+        tags_str = tags_str.strip("[]").replace("'", "")
+        tags = [tag.strip() for tag in tags_str.split() if tag.strip()]
+        return tags
+
+    df['tags'] = df['tags'].apply(process_tags)
+
+    # Add more sample data for better visualization
+    add_sample_data(df)
+
+    return df
+
+def add_sample_data(df):
+    """Add more sample data to make the visualization more interesting"""
+    # Top organizations to include
     orgs = ['openai', 'meta', 'google', 'microsoft', 'anthropic', 'nvidia', 'huggingface',
             'deepseek-ai', 'stability-ai', 'mistralai', 'cerebras', 'databricks', 'together',
-            'facebook', 'amazon', 'deepmind', 'cohere', 'nvidia', 'bigscience', 'eleutherai']
+            'facebook', 'amazon', 'deepmind', 'cohere', 'bigscience', 'eleutherai']
 
     # Common model name formats
     model_name_patterns = [
@@ -366,13 +268,10 @@ def create_sample_data(progress=None):
     variants = ["chat", "instruct", "base", "v1.0", "v2", "beta", "turbo", "fast", "xl", "xxl"]
 
     # Generate sample data
-    data = []
-    total_models = sum(np.random.randint(5, 20) for _ in orgs)
-    models_created = 0
-
+    sample_data = []
     for org_idx, org in enumerate(orgs):
-        # Create 5-20 models per organization
-        num_models = np.random.randint(5, 20)
+        # Create 5-10 models per organization
+        num_models = np.random.randint(5, 11)
 
         for i in range(num_models):
             # Create realistic model name
@@ -428,11 +327,11 @@ def create_sample_data(progress=None):
             # Generate downloads and likes (weighted by org position for variety)
             # Earlier orgs get more downloads to make the visualization interesting
             popularity_factor = (len(orgs) - org_idx) / len(orgs)  # 1.0 to 0.0
-            base_downloads = 1000 * (10 ** (2 * popularity_factor))
+            base_downloads = 10000 * (10 ** (2 * popularity_factor))
             downloads = int(base_downloads * np.random.uniform(0.3, 3.0))
             likes = int(downloads * np.random.uniform(0.01, 0.1))  # 1-10% like ratio
 
-            # Generate model size (in bytes for safetensors total)
+            # Generate model size (in bytes for params)
             # Model size should correlate somewhat with the size in the name
             size_indicator = 1
             for s in ["70b", "13b", "7b", "3b", "2b", "1b", "large", "huge", "xl", "xxl"]:
@@ -440,53 +339,31 @@ def create_sample_data(progress=None):
                     size_indicator = float(s.replace("b", "")) if s[0].isdigit() else 3
                     break
 
-            # Size in GB, then convert to bytes
-            size_gb = np.random.uniform(0.1, 2.0) * size_indicator
-            if size_gb > 50:  # Cap at 100GB
-                size_gb = min(size_gb, 100)
-            size_bytes = int(size_gb * 1e9)
+            # Size in bytes
+            params = int(np.random.uniform(0.5, 2.0) * size_indicator * 1e9)
 
             # Create model entry
             model = {
                 "id": model_id,
+                "author": org,
                 "downloads": downloads,
-                "downloadsAllTime": int(downloads * np.random.uniform(1.5, 3.0)),  # All-time higher than recent
                 "likes": likes,
                 "pipeline_tag": pipeline_tag,
                 "tags": tags,
-                "safetensors": {"total": size_bytes}
+                "params": params
             }
 
-            data.append(model)
-            models_created += 1
-
-            if progress and i % 5 == 0:
-                progress(0.3 + 0.6 * (models_created / total_models), f"Created {models_created}/{total_models} sample models...")
-
-    # Convert to DataFrame
-    df = pd.DataFrame(data)
-
-    if progress:
-        progress(0.95, "Finalizing sample data...")
-
-    return df
+            sample_data.append(model)
+
+    # Convert sample data to DataFrame and append to original
+    sample_df = pd.DataFrame(sample_data)
+    return pd.concat([df, sample_df], ignore_index=True)
 
 # Create Gradio interface
 with gr.Blocks() as demo:
     models_data = gr.State()  # To store loaded data
 
-    # Loading screen components
-    with gr.Row(visible=True) as loading_screen:
-        with gr.Column(scale=1):
-            gr.Markdown("""
-            # HuggingFace Models TreeMap Visualization
-
-            Loading data... This might take a moment.
-            """)
-            data_loading_progress = gr.Progress()
-
-    # Main application components (initially hidden)
-    with gr.Row(visible=False) as main_app:
+    with gr.Row():
         gr.Markdown("""
         # HuggingFace Models TreeMap Visualization
 
@@ -496,11 +373,11 @@ with gr.Blocks() as demo:
     The treemap visualizes models grouped by organization, with the size of each box representing the selected metric (downloads or likes).
     """)
 
-    with gr.Row(visible=False) as control_panel:
+    with gr.Row():
        with gr.Column(scale=1):
             count_by_dropdown = gr.Dropdown(
                 label="Metric",
-                choices=["downloads", "downloadsAllTime", "likes"],
+                choices=["downloads", "likes"],
                 value="downloads",
                 info="Select the metric to determine box sizes"
             )
@@ -532,7 +409,7 @@ with gr.Blocks() as demo:
                 label="Model Size Filter",
                 choices=["None"] + list(MODEL_SIZE_RANGES.keys()),
                 value="None",
-                info="Filter models by their size (in safetensors['total'])"
+                info="Filter models by their size (using params column)"
             )
 
             top_k_slider = gr.Slider(
@@ -623,17 +500,11 @@ with gr.Blocks() as demo:
         outputs=[tag_filter_dropdown, pipeline_filter_dropdown]
     )
 
-    def load_data_with_progress(progress=gr.Progress()):
-        """Load data with progress tracking and update UI visibility"""
-        data_df = download_and_process_models(progress)
-        # Return both the data and the visibility updates
-        return data_df, gr.update(visible=False), gr.update(visible=True), gr.update(visible=True)
-
-    # Load data once at startup with progress bar
+    # Load data once at startup
     demo.load(
-        fn=load_data_with_progress,
+        fn=load_models_csv,
        inputs=[],
-        outputs=[models_data, loading_screen, main_app, control_panel]
+        outputs=[models_data]
     )
 
     # Button click event to generate plot
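For reference, the new process_tags helper reverses the stringified tag lists stored in models.csv. A minimal sketch of its behavior, assuming the CSV serializes tags as bracketed, quoted, space-separated strings (which is what the strip/replace cleanup implies; the sample cell below is hypothetical):

import pandas as pd

def process_tags(tags_str):
    # NaN cells (models with no tags) become an empty list
    if pd.isna(tags_str):
        return []
    # Drop the surrounding brackets and quote marks, then split on whitespace
    tags_str = tags_str.strip("[]").replace("'", "")
    return [tag.strip() for tag in tags_str.split() if tag.strip()]

print(process_tags("['text-generation' 'transformers' 'en']"))
# -> ['text-generation', 'transformers', 'en']

Note that a comma-separated cell such as "['a', 'b']" would come out as ['a,', 'b'] under this whitespace split, so the parser assumes space-separated entries.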
 
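The reworked is_in_size_range reads the flat params column instead of the removed nested safetensors dict. A small self-contained sketch of the bucketing math, reusing the function from the diff and assuming params holds a raw byte count (consistent with the sample generator's params = int(... * 1e9)); only the size bucket visible in the diff is reproduced here:

import pandas as pd

MODEL_SIZE_RANGES = {
    # only the bucket shown in the diff; the app defines the smaller buckets above it
    "XX-Large (>50GB)": (50, float('inf')),
}

def is_in_size_range(row, size_range):
    if size_range is None:
        return True
    min_size, max_size = MODEL_SIZE_RANGES[size_range]
    if "params" in row and pd.notna(row["params"]):
        try:
            # bytes -> GB (binary gigabytes)
            size_gb = float(row["params"]) / (1024 * 1024 * 1024)
            return min_size <= size_gb < max_size
        except (ValueError, TypeError):
            return False
    return False

print(is_in_size_range({"params": 70e9}, "XX-Large (>50GB)"))  # True: 70e9 bytes is ~65.2 GB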
 
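To make the bumped download weighting concrete: popularity_factor runs from 1.0 for the first organization down to about 0.05 for the last, so base_downloads now spans roughly 12.7K to 1M before the random 0.3-3.0 multiplier. A quick check with the 19 orgs in the new list:

orgs_count = 19  # length of the orgs list in add_sample_data
for org_idx in (0, orgs_count - 1):
    popularity_factor = (orgs_count - org_idx) / orgs_count
    base_downloads = 10000 * (10 ** (2 * popularity_factor))
    print(org_idx, int(base_downloads))
# 0 1000000
# 18 12742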
 
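One caveat worth flagging: add_sample_data returns a new frame from pd.concat rather than mutating df in place, so load_models_csv as committed drops the generated rows when it calls add_sample_data(df) without rebinding. A caller that wants the appended samples would need to rebind the result:

df = pd.read_csv('models.csv')   # assumes a local models.csv, as in load_models_csv
df = add_sample_data(df)         # rebind: pd.concat returns a new DataFrame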