|
import json |
|
import gradio as gr |
|
import pandas as pd |
|
import plotly.express as px |
|
import os |
|
import numpy as np |
|
import io |
|
|
|
|
|
# Pipeline tags offered in the "Select Pipeline Tag" dropdown, in display order.
PIPELINE_TAGS = [
    "text-generation",
    "text-to-image",
    "text-classification",
    "text2text-generation",
    "audio-to-audio",
    "feature-extraction",
    "image-classification",
    "translation",
    "reinforcement-learning",
    "fill-mask",
    "text-to-speech",
    "automatic-speech-recognition",
    "image-text-to-text",
    "token-classification",
    "sentence-similarity",
    "question-answering",
    "image-feature-extraction",
    "summarization",
    "zero-shot-image-classification",
    "object-detection",
    "image-segmentation",
    "image-to-image",
    "image-to-text",
    "audio-classification",
    "visual-question-answering",
    "text-to-video",
    "zero-shot-classification",
    "depth-estimation",
    "text-ranking",
    "image-to-video",
    "multiple-choice",
    "unconditional-image-generation",
    "video-classification",
    "text-to-audio",
    "time-series-forecasting",
    "any-to-any",
    "video-text-to-text",
    "table-question-answering",
]
|
|
|
|
|
# Size buckets offered in the "Model Size Filter" dropdown.
# Each label maps to (lower bound, upper bound); is_in_size_range treats the
# lower bound as inclusive and the upper bound as exclusive.
MODEL_SIZE_RANGES = {
    "Small (<1GB)": (0, 1),
    "Medium (1-5GB)": (1, 5),
    "Large (5-20GB)": (5, 20),
    "X-Large (20-50GB)": (20, 50),
    "XX-Large (>50GB)": (50, float("inf")),
}
|
|
|
|
|
def is_audio_speech(row):
    """Return True when *row* describes an audio or speech model.

    A row matches when its ``pipeline_tag`` or any entry of its ``tags``
    contains "audio" or "speech" (case-insensitive).

    Args:
        row: Mapping-like record (dict or pandas Series) providing ``get``.

    Returns:
        bool: True on a match, False otherwise.
    """
    keywords = ("audio", "speech")

    # pandas represents a missing pipeline_tag as NaN, a float that is truthy
    # but has no .lower(); only real strings are inspected.
    pipeline_tag = row.get("pipeline_tag", "")
    if isinstance(pipeline_tag, str):
        lowered_pipeline = pipeline_tag.lower()
        if any(keyword in lowered_pipeline for keyword in keywords):
            return True

    tags = row.get("tags", [])
    return any(keyword in tag.lower() for keyword in keywords for tag in tags)
|
|
|
def is_music(row):
    """Return True when any tag of *row* contains "music" (case-insensitive)."""
    for label in row.get("tags", []):
        if "music" in label.lower():
            return True
    return False
|
|
|
def is_robotics(row):
    """Return True when any tag of *row* contains "robot" (case-insensitive)."""
    lowered_tags = (label.lower() for label in row.get("tags", []))
    return any("robot" in label for label in lowered_tags)
|
|
|
def is_biomed(row):
    """Return True when any tag of *row* contains "bio" or "medic" (case-insensitive)."""
    labels = row.get("tags", [])
    # One full pass per keyword, "bio" first, mirroring the evaluation order
    # of two chained any() calls.
    for keyword in ("bio", "medic"):
        if any(keyword in label.lower() for label in labels):
            return True
    return False
|
|
|
def is_timeseries(row):
    """Return True when any tag of *row* contains "series" (case-insensitive)."""
    for label in row.get("tags", []):
        if "series" in label.lower():
            return True
    return False
|
|
|
def is_science(row):
    """Return True when *row* carries a science-related tag.

    A tag matches when it contains "science" (case-insensitive) but is not a
    "bigscience" tag, which this check deliberately excludes.

    Args:
        row: Mapping-like record (dict or pandas Series) providing ``get``.

    Returns:
        bool: True on a match, False otherwise.
    """
    for tag in row.get("tags", []):
        # Lower-case once so both tests are case-insensitive; previously the
        # exclusion was checked on the raw tag and missed e.g. "BigScience".
        lowered = tag.lower()
        if "science" in lowered and "bigscience" not in lowered:
            return True
    return False
|
|
|
def is_video(row):
    """Return True when any tag of *row* contains "video" (case-insensitive)."""
    lowered_tags = (label.lower() for label in row.get("tags", []))
    return any("video" in label for label in lowered_tags)
|
|
|
def is_image(row):
    """Return True when any tag of *row* contains "image" (case-insensitive)."""
    for label in row.get("tags", []):
        if "image" in label.lower():
            return True
    return False
|
|
|
def is_text(row):
    """Return True when any tag of *row* contains "text" (case-insensitive)."""
    lowered_tags = (label.lower() for label in row.get("tags", []))
    return any("text" in label for label in lowered_tags)
|
|
|
|
|
def is_in_size_range(row, size_range):
    """Tell whether the ``params`` value of *row* falls inside a size bucket.

    Args:
        row: Record supporting ``in`` and item access (e.g. a pandas Series).
        size_range: A key of ``MODEL_SIZE_RANGES``, or None to accept every row.

    Returns:
        bool: True when no range is requested, or when ``params`` divided by
        1024**3 lies in ``[lower, upper)`` of the bucket. False when ``params``
        is absent, NA, or cannot be converted to a float.

    Raises:
        KeyError: If *size_range* is not None and not a known bucket label.
    """
    if size_range is None:
        return True

    lower_bound, upper_bound = MODEL_SIZE_RANGES[size_range]

    has_params = "params" in row and pd.notna(row["params"])
    if not has_params:
        return False

    try:
        size_gb = float(row["params"]) / (1024 * 1024 * 1024)
    except (ValueError, TypeError):
        return False
    return lower_bound <= size_gb < upper_bound
|
|
|
# Category label shown in the "Select Tag" dropdown -> row predicate that
# implements it. Insertion order is the order the dropdown lists the choices.
TAG_FILTER_FUNCS = dict(
    [
        ("Audio & Speech", is_audio_speech),
        ("Time series", is_timeseries),
        ("Robotics", is_robotics),
        ("Music", is_music),
        ("Video", is_video),
        ("Images", is_image),
        ("Text", is_text),
        ("Biomedical", is_biomed),
        ("Sciences", is_science),
    ]
)
|
|
|
def extract_org_from_id(model_id):
    """Return the organization prefix of a model ID.

    The organization is everything before the first "/". IDs without a "/"
    are reported as "unaffiliated".
    """
    has_namespace = "/" in model_id
    return model_id.split("/", 1)[0] if has_namespace else "unaffiliated"
|
|
|
def _filter_rows(frame, predicate):
    """Keep the rows of *frame* for which ``predicate(row)`` is truthy."""
    # DataFrame.apply(axis=1) on a frame without rows does not yield a boolean
    # Series, so indexing with its result would not filter rows; an empty
    # frame has nothing to filter anyway.
    if len(frame) == 0:
        return frame
    return frame[frame.apply(predicate, axis=1)]


def make_treemap_data(df, count_by, top_k=25, tag_filter=None, pipeline_filter=None, size_filter=None):
    """Process DataFrame into treemap format with filters applied.

    Args:
        df: Models table; must provide the ``id`` and *count_by* columns, plus
            ``pipeline_tag`` when *pipeline_filter* is given and whatever the
            selected tag/size predicates read.
        count_by: Name of the numeric column that sizes the treemap boxes.
        top_k: Number of organizations (ranked by summed *count_by*) to keep.
        tag_filter: Key of ``TAG_FILTER_FUNCS``; ignored when falsy or unknown.
        pipeline_filter: Exact ``pipeline_tag`` value to keep; ignored when falsy.
        size_filter: Key of ``MODEL_SIZE_RANGES``; ignored when falsy or unknown.

    Returns:
        pandas.DataFrame: Columns ``id``, ``organization``, *count_by* (numeric,
        unparseable values replaced by 0) and a constant ``root`` column equal
        to "models". Empty when nothing matches the filters.
    """
    filtered_df = df

    if tag_filter and tag_filter in TAG_FILTER_FUNCS:
        filtered_df = _filter_rows(filtered_df, TAG_FILTER_FUNCS[tag_filter])

    if pipeline_filter:
        filtered_df = filtered_df[filtered_df["pipeline_tag"] == pipeline_filter]

    if size_filter and size_filter in MODEL_SIZE_RANGES:
        filtered_df = _filter_rows(
            filtered_df, lambda row: is_in_size_range(row, size_filter)
        )

    # Work on an independent copy so the column assignments below neither
    # touch the caller's frame nor write into a slice of it.
    filtered_df = filtered_df.copy()
    filtered_df["organization"] = filtered_df["id"].apply(extract_org_from_id)

    # Coerce the metric before aggregating so the ranking is computed on
    # numbers even if the column was loaded as text.
    filtered_df[count_by] = pd.to_numeric(filtered_df[count_by], errors="coerce").fillna(0)

    org_totals = (
        filtered_df.groupby("organization")[count_by]
        .sum()
        .reset_index()
        .sort_values(by=count_by, ascending=False)
    )
    # int() because UI sliders may deliver the count as a float.
    top_orgs = org_totals.head(int(top_k))["organization"].tolist()

    in_top_orgs = filtered_df["organization"].isin(top_orgs)
    treemap_data = filtered_df.loc[in_top_orgs, ["id", "organization", count_by]].copy()

    # Single shared root so every organization hangs under one treemap node.
    treemap_data["root"] = "models"

    return treemap_data
|
|
|
def create_treemap(treemap_data, count_by, title=None):
    """Build the Plotly treemap figure for the prepared data.

    Args:
        treemap_data: DataFrame with ``root``, ``organization``, ``id`` and
            *count_by* columns.
        count_by: Column used for the box sizes; also shown in the hover text.
        title: Figure title; a default derived from *count_by* is used when falsy.

    Returns:
        The figure returned by ``px.treemap``. When *treemap_data* is empty, a
        one-box placeholder figure carrying a "no data" message.
    """
    plot_margin = dict(t=50, l=25, r=25, b=25)

    if treemap_data.empty:
        placeholder = px.treemap(
            names=["No data matches the selected filters"],
            values=[1],
        )
        placeholder.update_layout(
            title="No data matches the selected filters",
            margin=plot_margin,
        )
        return placeholder

    if not title:
        title = f"HuggingFace Models - {count_by.capitalize()} by Organization"

    figure = px.treemap(
        treemap_data,
        path=["root", "organization", "id"],
        values=count_by,
        title=title,
        color_discrete_sequence=px.colors.qualitative.Plotly,
    )
    figure.update_layout(margin=plot_margin)

    hover_text = "<b>%{label}</b><br>%{value:,} " + count_by + "<br>%{percentRoot:.2%} of total<extra></extra>"
    figure.update_traces(
        textinfo="label+value+percent root",
        hovertemplate=hover_text,
    )

    return figure
|
|
|
def load_models_csv():
    """Load ``models.csv`` and return it with parsed tags plus sample models.

    The file is read from the current working directory. Its ``tags`` column,
    stored as the text form of a list, is converted to real lists of strings.

    Returns:
        pandas.DataFrame: The CSV rows followed by the rows produced by
        ``add_sample_data``.
    """
    df = pd.read_csv("models.csv")

    def process_tags(tags_str):
        """Convert the stringified tag list of one CSV cell to a list of tags."""
        if pd.isna(tags_str):
            return []

        cleaned = tags_str.strip("[]").replace("'", "")
        # str.split() already drops empty pieces and surrounding whitespace.
        # Separator commas are stripped as well so a comma-separated list repr
        # ("['a', 'b']") yields clean tags instead of "a,".
        pieces = (piece.strip(",") for piece in cleaned.split())
        return [piece for piece in pieces if piece]

    df["tags"] = df["tags"].apply(process_tags)

    # add_sample_data builds and returns a new frame; it does not modify df in
    # place, so its result must be used (it was previously discarded).
    return add_sample_data(df)
|
|
|
def add_sample_data(df):
    """Return *df* extended with randomly generated sample models.

    For each organization in a fixed list, between 5 and 10 models are
    generated with a random name, pipeline tag, tags, downloads, likes and
    parameter count. Organizations earlier in the list receive larger download
    figures. *df* itself is not modified; the combined frame is returned.

    All randomness comes from the global ``np.random`` state.
    """
    orgs = [
        'openai', 'meta', 'google', 'microsoft', 'anthropic', 'nvidia', 'huggingface',
        'deepseek-ai', 'stability-ai', 'mistralai', 'cerebras', 'databricks', 'together',
        'facebook', 'amazon', 'deepmind', 'cohere', 'bigscience', 'eleutherai',
    ]

    model_name_patterns = [
        "model-{size}-{version}",
        "{prefix}-{size}b",
        "{prefix}-{size}b-{variant}",
        "llama-{size}b-{variant}",
        "gpt-{variant}-{size}b",
        "{prefix}-instruct-{size}b",
        "{prefix}-chat-{size}b",
        "{prefix}-coder-{size}b",
        "stable-diffusion-{version}",
        "whisper-{size}",
        "bert-{size}-{variant}",
        "roberta-{size}",
        "t5-{size}",
        "{prefix}-vision-{size}b",
    ]

    prefixes = ["falcon", "llama", "mistral", "gpt", "phi", "gemma", "qwen", "yi", "mpt", "bloom"]
    sizes = ["7", "13", "34", "70", "1", "3", "7b", "13b", "70b", "8b", "2b", "1b", "0.5b", "small", "base", "large", "huge"]
    variants = ["chat", "instruct", "base", "v1.0", "v2", "beta", "turbo", "fast", "xl", "xxl"]

    # (name keywords, candidate pipeline tags); the first rule whose keyword
    # occurs in the model name wins. None means "text-generation" without a
    # random draw.
    pipeline_rules = [
        (("diffusion", "image"), ["text-to-image", "image-to-image", "image-segmentation"]),
        (("whisper", "speech"), ["automatic-speech-recognition", "text-to-speech"]),
        (("coder", "code"), None),
        (("bert", "roberta"), ["fill-mask", "text-classification", "token-classification"]),
        (("vision",), ["image-classification", "image-to-text", "visual-question-answering"]),
    ]

    # Checked in this order; the first marker found in the name decides the size.
    size_markers = ["70b", "13b", "7b", "3b", "2b", "1b", "large", "huge", "xl", "xxl"]

    org_count = len(orgs)
    generated = []

    for rank, org in enumerate(orgs):
        for _ in range(np.random.randint(5, 11)):
            # The order of the random draws is kept fixed so a seeded run
            # reproduces the same models.
            pattern = np.random.choice(model_name_patterns)
            prefix = np.random.choice(prefixes)
            size = np.random.choice(sizes)
            version = f"v{np.random.randint(1, 4)}"
            variant = np.random.choice(variants)

            model_name = pattern.format(
                prefix=prefix, size=size, version=version, variant=variant
            )

            pipeline_tag = "text-generation"
            for keywords, candidates in pipeline_rules:
                if any(keyword in model_name for keyword in keywords):
                    if candidates is not None:
                        pipeline_tag = np.random.choice(candidates)
                    break

            tags = [pipeline_tag]
            if "text-generation" in pipeline_tag:
                tags += ["language-model", "text", "gpt", "llm"]
                if "instruct" in model_name:
                    tags.append("instruction-following")
                if "chat" in model_name:
                    tags.append("chat")
            elif "speech" in pipeline_tag:
                tags += ["audio", "speech", "voice"]
            elif "image" in pipeline_tag:
                tags += ["vision", "image", "diffusion"]

            # Language tags: one independent draw per tag.
            for probability, language_tag in ((0.8, "en"), (0.3, "multilingual")):
                if np.random.random() < probability:
                    tags.append(language_tag)

            # Earlier organizations get a popularity factor closer to 1 and
            # therefore larger download counts.
            popularity_factor = (org_count - rank) / org_count
            base_downloads = 10000 * (10 ** (2 * popularity_factor))
            downloads = int(base_downloads * np.random.uniform(0.3, 3.0))
            likes = int(downloads * np.random.uniform(0.01, 0.1))

            lowered_name = model_name.lower()
            marker = next((m for m in size_markers if m in lowered_name), None)
            if marker is None:
                size_indicator = 1
            elif marker[0].isdigit():
                size_indicator = float(marker.replace("b", ""))
            else:
                size_indicator = 3

            params = int(np.random.uniform(0.5, 2.0) * size_indicator * 1e9)

            generated.append(
                {
                    "id": f"{org}/{model_name}",
                    "author": org,
                    "downloads": downloads,
                    "likes": likes,
                    "pipeline_tag": pipeline_tag,
                    "tags": tags,
                    "params": params,
                }
            )

    return pd.concat([df, pd.DataFrame(generated)], ignore_index=True)
|
|
|
|
|
with gr.Blocks() as demo:
    # Per-session holder for the DataFrame produced by load_models_csv.
    models_data = gr.State()

    with gr.Row():
        gr.Markdown("""
        # HuggingFace Models TreeMap Visualization

        This app shows how different organizations contribute to the HuggingFace ecosystem with their models.
        Use the filters to explore models by different metrics, tags, pipelines, and model sizes.

        The treemap visualizes models grouped by organization, with the size of each box representing the selected metric (downloads or likes).
        """)

    with gr.Row():
        with gr.Column(scale=1):
            count_by_dropdown = gr.Dropdown(
                label="Metric",
                choices=["downloads", "likes"],
                value="downloads",
                info="Select the metric to determine box sizes",
            )

            filter_choice_radio = gr.Radio(
                label="Filter Type",
                choices=["None", "Tag Filter", "Pipeline Filter"],
                value="None",
                info="Choose how to filter the models",
            )

            # The two filter dropdowns start hidden; update_filter_visibility
            # reveals the one matching the selected filter type.
            tag_filter_dropdown = gr.Dropdown(
                label="Select Tag",
                choices=list(TAG_FILTER_FUNCS.keys()),
                value=None,
                visible=False,
                info="Filter models by domain/category",
            )

            pipeline_filter_dropdown = gr.Dropdown(
                label="Select Pipeline Tag",
                choices=PIPELINE_TAGS,
                value=None,
                visible=False,
                info="Filter models by specific pipeline",
            )

            size_filter_dropdown = gr.Dropdown(
                label="Model Size Filter",
                choices=["None"] + list(MODEL_SIZE_RANGES.keys()),
                value="None",
                info="Filter models by their size (using params column)",
            )

            top_k_slider = gr.Slider(
                label="Number of Top Organizations",
                minimum=5,
                maximum=50,
                value=25,
                step=5,
                info="Number of top organizations to include",
            )

            generate_plot_button = gr.Button("Generate Plot", variant="primary")

        with gr.Column(scale=3):
            plot_output = gr.Plot()
            stats_output = gr.Markdown("*Generate a plot to see statistics*")

    def generate_plot_on_click(count_by, filter_choice, tag_filter, pipeline_filter, size_filter, top_k, data_df):
        """Build the treemap figure and the statistics text for the current inputs.

        Args:
            count_by: Metric column name ("downloads" or "likes").
            filter_choice: "None", "Tag Filter" or "Pipeline Filter"; decides
                which of *tag_filter* / *pipeline_filter* is honoured.
            tag_filter: Selected tag category (used only with "Tag Filter").
            pipeline_filter: Selected pipeline tag (used only with "Pipeline Filter").
            size_filter: Size bucket label, or the string "None" for no size filter.
            top_k: Number of top organizations to show.
            data_df: The loaded models DataFrame, or None if loading has not happened.

        Returns:
            tuple: ``(figure, markdown_text)``; ``(None, error_message)`` when
            no data is available.
        """
        print(f"Generating plot with: Metric={count_by}, Filter={filter_choice}, Tag={tag_filter}, Pipeline={pipeline_filter}, Size={size_filter}, Top K={top_k}")

        if data_df is None or len(data_df) == 0:
            return None, "Error: No data available. Please try again."

        # Only the filter matching the chosen filter type is applied; the size
        # filter is independent of that choice.
        selected_tag_filter = tag_filter if filter_choice == "Tag Filter" else None
        selected_pipeline_filter = pipeline_filter if filter_choice == "Pipeline Filter" else None
        selected_size_filter = size_filter if size_filter != "None" else None

        treemap_data = make_treemap_data(
            df=data_df,
            count_by=count_by,
            top_k=top_k,
            tag_filter=selected_tag_filter,
            pipeline_filter=selected_pipeline_filter,
            size_filter=selected_size_filter,
        )

        fig = create_treemap(
            treemap_data=treemap_data,
            count_by=count_by,
            title=f"HuggingFace Models - {count_by.capitalize()} by Organization",
        )

        if treemap_data.empty:
            return fig, "No data matches the selected filters."

        total_models = len(treemap_data)
        total_value = treemap_data[count_by].sum()
        top_5_orgs = (
            treemap_data.groupby("organization")[count_by]
            .sum()
            .sort_values(ascending=False)
            .head(5)
        )

        stats_lines = [
            "",
            "### Statistics",
            f"- **Total models shown**: {total_models:,}",
            f"- **Total {count_by}**: {total_value:,}",
            "",
            "### Top 5 Organizations",
            f"| Organization | {count_by.capitalize()} | % of Total |",
            "| --- | --- | --- |",
        ]
        for org, value in top_5_orgs.items():
            # A metric that sums to zero (e.g. no likes at all) would make the
            # share undefined; report 0% instead of dividing by zero.
            percentage = (value / total_value) * 100 if total_value else 0.0
            stats_lines.append(f"| {org} | {value:,} | {percentage:.2f}% |")

        stats_md = "\n".join(stats_lines) + "\n"
        return fig, stats_md

    def update_filter_visibility(filter_choice):
        """Show the dropdown that belongs to *filter_choice* and hide the other.

        Returns:
            tuple: Updates for (tag dropdown, pipeline dropdown).
        """
        return (
            gr.update(visible=filter_choice == "Tag Filter"),
            gr.update(visible=filter_choice == "Pipeline Filter"),
        )

    filter_choice_radio.change(
        fn=update_filter_visibility,
        inputs=[filter_choice_radio],
        outputs=[tag_filter_dropdown, pipeline_filter_dropdown],
    )

    # Load the data into the session state when the page is opened.
    demo.load(
        fn=load_models_csv,
        inputs=[],
        outputs=[models_data],
    )

    generate_plot_button.click(
        fn=generate_plot_on_click,
        inputs=[
            count_by_dropdown,
            filter_choice_radio,
            tag_filter_dropdown,
            pipeline_filter_dropdown,
            size_filter_dropdown,
            top_k_slider,
            models_data,
        ],
        outputs=[plot_output, stats_output],
    )


if __name__ == "__main__":
    demo.launch()