OrgStats

Running

App Files Files Community

evijit HF Staff committed 6 days ago

Commit

caa5704

verified ·

1 Parent(s): 4d0a8a3

Update app.py

Browse files

Files changed (1) hide show

app.py +222 -63

app.py CHANGED Viewed

@@ -2,7 +2,13 @@ import json
 import gradio as gr
 import pandas as pd
 import plotly.express as px
 PIPELINE_TAGS = [
  'text-generation',
  'text-to-image',
@@ -44,6 +50,16 @@ PIPELINE_TAGS = [
  'table-question-answering',
 ]
 def is_audio_speech(repo_dct):
     res = (repo_dct.get("pipeline_tag", None) and "audio" in repo_dct.get("pipeline_tag", "").lower()) or \
         (repo_dct.get("pipeline_tag", None) and "speech" in repo_dct.get("pipeline_tag", "").lower()) or \
@@ -84,6 +100,21 @@ def is_text(repo_dct):
     res = (repo_dct.get("tags", None) and any("text" in tag.lower() for tag in repo_dct.get("tags", [])))
     return res
 TAG_FILTER_FUNCS = {
     "Audio & Speech": is_audio_speech,
     "Time series": is_timeseries,
@@ -96,79 +127,211 @@ TAG_FILTER_FUNCS = {
     "Sciences": is_science,
 }
-def make_org_stats(repo_type, count_by, org_stats, top_k=20, filter_func=None):
-    assert count_by in ["likes", "downloads", "downloads_all"]
-    assert repo_type in ["all", "datasets", "models"]
-    repos = ["datasets", "models"] if repo_type == "all" else [repo_type]
-    if filter_func is None:
-        filter_func = lambda x: True
     sorted_stats = sorted(
         [(
-            author,
-            sum(dct[count_by] for dct in author_dct[repo] if filter_func(dct))
-        ) for repo in repos for author, author_dct in org_stats.items()],
-        key=lambda x:x[1],
         reverse=True,
     )
     res = sorted_stats[:top_k] + [("Others...", sum(st for auth, st in sorted_stats[top_k:]))]
     total_st = sum(st for o, st in res)
     res_plot_df = []
     for org, st in res:
         if org == "Others...":
-            res_plot_df += [("Others...", "other", st * 100 / total_st)]
         else:
-            for repo in repos:
-                for dct in org_stats[org][repo]:
-                    if filter_func(dct):
-                        res_plot_df += [(org, dct["id"], dct[count_by] * 100 / total_st)]
-    return ([(o, 100 * st / total_st) for o, st in res if st > 0], res_plot_df)
-def make_figure(count_by, repo_type, org_stats, tag_filter=None, pipeline_filter=None):
-    assert count_by in ["downloads", "likes", "downloads_all"]
-    assert repo_type in ["all", "models", "datasets"]
-    assert tag_filter is None or pipeline_filter is None
     filter_func = None
     if tag_filter:
         filter_func = TAG_FILTER_FUNCS[tag_filter]
-    if pipeline_filter:
         filter_func = lambda dct: dct.get("pipeline_tag", None) and dct.get("pipeline_tag", "") == pipeline_filter
-    _, res_plot_df = make_org_stats(repo_type, count_by, org_stats, top_k=25, filter_func=filter_func)
     df = pd.DataFrame(
         dict(
             organizations=[o for o, _, _ in res_plot_df],
-            repo=[r for _, r, _ in res_plot_df],
             stats=[s for _, _, s in res_plot_df],
         )
     )
-    df[repo_type] = repo_type # in order to have a single root node
-    fig = px.treemap(df, path=[repo_type, 'organizations', 'repo'], values='stats')
     fig.update_layout(
-        treemapcolorway = ["pink" for _ in range(len(res_plot_df))],
-        margin = dict(t=50, l=25, r=25, b=25)
     )
     return fig
 with gr.Blocks() as demo:
-    org_stats_data = gr.State(value=None)  # To store loaded data
     with gr.Row():
         gr.Markdown("""
-            ## Hugging Face Organization Stats
-            This app shows how different organizations are contributing to different aspects of the open AI ecosystem.
-            Use the dropdowns on the left to select repository types, metrics, and optionally tags representing topics or modalities of interest.
         """)
     with gr.Row():
         with gr.Column(scale=1):
-            repo_type_dropdown = gr.Dropdown(
-                label="Repository Type",
-                choices=["all", "models", "datasets"],
-                value="all"
-            )
             count_by_dropdown = gr.Dropdown(
                 label="Metric",
-                choices=["downloads", "likes", "downloads_all"],
                 value="downloads"
             )
@@ -184,47 +347,50 @@ with gr.Blocks() as demo:
                 value=None,
                 visible=False
             )
             pipeline_filter_dropdown = gr.Dropdown(
                 label="Select Pipeline Tag",
                 choices=PIPELINE_TAGS,
                 value=None,
                 visible=False
             )
             generate_plot_button = gr.Button("Generate Plot")
         with gr.Column(scale=3):
             plot_output = gr.Plot()
-    def generate_plot_on_click(repo_type, count_by, filter_choice, tag_filter, pipeline_filter, data):
-        # Print the current state of the input variables
-        print(f"Generating plot with the following inputs:")
-        print(f"  Repository Type: {repo_type}")
-        print(f"  Metric (Count By): {count_by}")
-        print(f"  Filter Choice: {filter_choice}")
-        if filter_choice == "Tag Filter":
-            print(f"  Tag Filter: {tag_filter}")
-        elif filter_choice == "Pipeline Filter":
-            print(f"  Pipeline Filter: {pipeline_filter}")
         if data is None:
             print("Error: Data not loaded yet.")
             return None
         selected_tag_filter = None
         selected_pipeline_filter = None
         if filter_choice == "Tag Filter":
             selected_tag_filter = tag_filter
         elif filter_choice == "Pipeline Filter":
             selected_pipeline_filter = pipeline_filter
         fig = make_figure(
             count_by=count_by,
-            repo_type=repo_type,
             org_stats=data,
             tag_filter=selected_tag_filter,
-            pipeline_filter=selected_pipeline_filter
         )
         return fig
@@ -233,7 +399,7 @@ with gr.Blocks() as demo:
             return gr.update(visible=True), gr.update(visible=False)
         elif filter_choice == "Pipeline Filter":
             return gr.update(visible=False), gr.update(visible=True)
-        else: # "None"
             return gr.update(visible=False), gr.update(visible=False)
     filter_choice_radio.change(
@@ -243,33 +409,26 @@ with gr.Blocks() as demo:
     )
     # Load data once at startup
-    def load_org_data():
-        print("Loading organization statistics data...")
-        loaded_org_stats = json.load(open("org_to_artifacts_2l_stats.json"))
-        print("Data loaded successfully.")
-        return loaded_org_stats
     demo.load(
-        fn=load_org_data,
-        inputs=[], # No inputs needed to just load data
-        outputs=[org_stats_data] # Only output to the state
     )
     # Button click event to generate plot
     generate_plot_button.click(
         fn=generate_plot_on_click,
         inputs=[
-            repo_type_dropdown,
             count_by_dropdown,
             filter_choice_radio,
             tag_filter_dropdown,
             pipeline_filter_dropdown,
-            org_stats_data
         ],
         outputs=[plot_output]
     )
 if __name__ == "__main__":
-    # org_stats = json.load(open("org_to_artifacts_2l_stats.json")) # Data loading handled by demo.load
     demo.launch()

 import gradio as gr
 import pandas as pd
 import plotly.express as px
+import pyarrow.parquet as pq
+import os
+import requests
+from io import BytesIO
+import math
+# Define pipeline tags (keeping the same ones from the provided code)
 PIPELINE_TAGS = [
  'text-generation',
  'text-to-image',
  'table-question-answering',
 ]
# Model size buckets: dropdown label -> (lower bound, upper bound), both in GB.
# The pairs are listed smallest to largest; the last bucket has no upper limit.
MODEL_SIZE_RANGES = dict([
    ("Small (<1GB)", (0, 1)),
    ("Medium (1-5GB)", (1, 5)),
    ("Large (5-20GB)", (5, 20)),
    ("X-Large (20-50GB)", (20, 50)),
    ("XX-Large (>50GB)", (50, float('inf'))),
])
+# Filter functions for tags - keeping the same from provided code
 def is_audio_speech(repo_dct):
     res = (repo_dct.get("pipeline_tag", None) and "audio" in repo_dct.get("pipeline_tag", "").lower()) or \
         (repo_dct.get("pipeline_tag", None) and "speech" in repo_dct.get("pipeline_tag", "").lower()) or \
     res = (repo_dct.get("tags", None) and any("text" in tag.lower() for tag in repo_dct.get("tags", [])))
     return res
+# Add model size filter function
def is_in_size_range(repo_dct, size_range):
    """Tell whether a model's safetensors size falls inside a size bucket.

    Args:
        repo_dct: Model record. Only ``repo_dct["safetensors"]["total"]`` is
            read; it is converted from bytes to GB before comparing.
        size_range: ``None`` to disable size filtering, a label that is a key
            of ``MODEL_SIZE_RANGES``, or an explicit ``(min_gb, max_gb)`` pair.

    Returns:
        ``True`` when ``size_range`` is ``None`` or when
        ``min_gb <= size_in_gb < max_gb``. ``False`` when the record has no
        usable safetensors total (missing, not a dict, ``None`` or ``0``).

    Raises:
        KeyError: if ``size_range`` is a string that is not a key of
            ``MODEL_SIZE_RANGES``.
    """
    if size_range is None:
        return True

    # Labels are resolved through the shared table; anything else is taken to
    # be an explicit (min_gb, max_gb) pair supplied by the caller.
    if isinstance(size_range, str):
        min_size, max_size = MODEL_SIZE_RANGES[size_range]
    else:
        min_size, max_size = size_range

    # Guard against records whose "safetensors" value is present but is not a
    # mapping (calling .get() on it would otherwise raise AttributeError).
    safetensors = repo_dct.get("safetensors")
    if not isinstance(safetensors, dict):
        return False

    total_bytes = safetensors.get("total")
    if not total_bytes:
        # Unknown (or zero) size: the model cannot be placed in any bucket.
        return False

    size_gb = total_bytes / (1024 ** 3)
    # Lower bound inclusive, upper bound exclusive.
    return min_size <= size_gb < max_size
 TAG_FILTER_FUNCS = {
     "Audio & Speech": is_audio_speech,
     "Time series": is_timeseries,
     "Sciences": is_science,
 }
def make_org_stats(count_by, org_stats, top_k=20, filter_func=None, size_range=None):
    """Rank organizations by a metric and prepare rows for the treemap.

    Args:
        count_by: Metric key read from every model record; must be
            ``"likes"`` or ``"downloads"``.
        org_stats: Mapping of organization id -> list of model records. Each
            record is indexed with ``count_by`` and ``"id"``.
        top_k: Number of highest-ranked organizations kept individually; the
            remainder is summed into a single ``"Others..."`` entry.
        filter_func: Optional predicate applied to each model record.
        size_range: Optional size bucket forwarded to ``is_in_size_range``.

    Returns:
        A pair ``(shares, rows)``:
        ``shares`` is a list of ``(organization, percent)`` for entries whose
        total is positive; ``rows`` is a list of
        ``(organization, model id, percent)`` tuples, ending with the
        ``("Others...", "other", percent)`` row. Percentages are relative to
        the grand total, and are ``0`` when that total is not positive.
    """
    assert count_by in ["likes", "downloads"]

    def keep(entry):
        # Both checks are always evaluated; a missing filter counts as a pass.
        tag_ok = True if not filter_func else filter_func(entry)
        size_ok = True if not size_range else is_in_size_range(entry, size_range)
        return tag_ok and size_ok

    # Per-organization totals over the models that survive the filters,
    # ordered from largest to smallest (ties keep their original order).
    per_org = [
        (org_id, sum(entry[count_by] for entry in entries if keep(entry)))
        for org_id, entries in org_stats.items()
    ]
    per_org.sort(key=lambda pair: pair[1], reverse=True)

    # Everything beyond the first top_k organizations is lumped together.
    leaders, remainder = per_org[:top_k], per_org[top_k:]
    ranked = leaders + [("Others...", sum(score for _, score in remainder))]
    grand_total = sum(score for _, score in ranked)

    def as_percent(value):
        # Avoid dividing by zero when no model matched the filters.
        if grand_total > 0:
            return value * 100 / grand_total
        return 0

    rows = []
    for org_id, score in ranked:
        if org_id == "Others...":
            rows.append(("Others...", "other", as_percent(score)))
            continue
        rows.extend(
            (org_id, entry["id"], as_percent(entry[count_by]))
            for entry in org_stats[org_id]
            if keep(entry)
        )

    shares = [(org_id, as_percent(score)) for org_id, score in ranked if score > 0]
    return (shares, rows)
def make_figure(count_by, org_stats, tag_filter=None, pipeline_filter=None, size_range=None):
    """Build the organization/model treemap for the selected metric and filters.

    Args:
        count_by: ``"downloads"`` or ``"likes"``.
        org_stats: Mapping of organization id -> list of model records, passed
            through to ``make_org_stats``.
        tag_filter: Optional key of ``TAG_FILTER_FUNCS``. Takes precedence
            over ``pipeline_filter`` when both are given.
        pipeline_filter: Optional pipeline tag a model's ``"pipeline_tag"``
            must equal.
        size_range: Optional size bucket passed through to ``make_org_stats``.

    Returns:
        The figure produced by ``px.treemap``, with its margins adjusted.
    """
    assert count_by in ["downloads", "likes"]

    def matches_pipeline(dct):
        # Falsy (missing/empty) pipeline tags never match.
        tag = dct.get("pipeline_tag", None)
        return tag and tag == pipeline_filter

    def accept_all(dct):
        return True

    # Pick exactly one model predicate: tag filter first, then pipeline filter,
    # otherwise keep every model.
    if tag_filter:
        filter_func = TAG_FILTER_FUNCS[tag_filter]
    elif pipeline_filter:
        filter_func = matches_pipeline
    else:
        filter_func = accept_all

    # Only the per-model rows are needed here; the per-organization summary
    # returned alongside them is ignored.
    _, rows = make_org_stats(
        count_by,
        org_stats,
        top_k=25,
        filter_func=filter_func,
        size_range=size_range,
    )

    # Transpose the (organization, model, share) rows into three columns.
    org_col = [row[0] for row in rows]
    model_col = [row[1] for row in rows]
    share_col = [row[2] for row in rows]
    df = pd.DataFrame({"organizations": org_col, "model": model_col, "stats": share_col})
    df["models"] = "models"  # constant column used as the single root node

    fig = px.treemap(
        df,
        path=["models", "organizations", "model"],
        values="stats",
        title=f"HuggingFace Models - {count_by.capitalize()} by Organization",
    )
    fig.update_layout(margin={"t": 50, "l": 25, "r": 25, "b": 25})
    return fig
def _to_builtin(value):
    """Return ``value.tolist()`` when the object offers it, else ``value`` unchanged.

    Used to turn array-like or numpy-scalar cell values into plain Python
    objects that ``json.dump`` can serialize.
    """
    to_list = getattr(value, "tolist", None)
    return to_list() if callable(to_list) else value


def _safetensors_entry(raw):
    """Extract ``{"total": ...}`` from a raw safetensors cell, or return ``None``.

    ``raw`` may be a dict or a JSON string encoding a dict; anything else, or a
    value without a ``"total"`` key, yields ``None``.
    """
    if isinstance(raw, str):
        try:
            raw = json.loads(raw)
        except ValueError:
            # Not valid JSON: treat the size as unknown rather than failing.
            return None
    if isinstance(raw, dict) and 'total' in raw:
        return {"total": _to_builtin(raw['total'])}
    return None


def download_and_process_models():
    """Download and process the models data from HuggingFace dataset.

    Loads ``data/processed_models.json`` when it exists. Otherwise downloads
    ``models.parquet``, groups the rows by the organization prefix of the
    model id (the part before the first ``/``; ids without one go under
    ``"unaffiliated"``), writes the result to the cache file and returns it.

    Returns:
        dict mapping organization id -> list of model entries with the keys
        ``id``, ``downloads``, ``likes``, ``pipeline_tag``, ``tags`` and,
        when a total size is available, ``safetensors``.
        If anything fails, the error is printed and the result of
        ``create_sample_data()`` is returned instead.
    """
    cache_dir = 'data'
    cache_path = 'data/processed_models.json'
    # (connect, read) timeouts in seconds so a stalled connection cannot hang
    # the app start-up forever.
    request_timeout = (10, 120)
    columns = ('id', 'downloads', 'likes', 'pipeline_tag', 'tags', 'safetensors')

    try:
        # Create a cache directory
        os.makedirs(cache_dir, exist_ok=True)

        # Check if we have cached data
        if os.path.exists(cache_path):
            print("Loading from cache...")
            with open(cache_path, 'r') as f:
                return json.load(f)

        # URL to the models.parquet file
        url = "https://huggingface.co/datasets/cfahlgren1/hub-stats/resolve/main/models.parquet"
        print(f"Downloading models data from {url}...")
        response = requests.get(url, timeout=request_timeout)
        if response.status_code != 200:
            raise Exception(f"Failed to download data: HTTP {response.status_code}")

        # Read the parquet file
        table = pq.read_table(BytesIO(response.content))
        df = table.to_pandas()
        print(f"Downloaded {len(df)} models")

        # Iterate only the columns that are used, as plain tuples: this avoids
        # building a Series per row, which is slow on a large table.
        present = [name for name in columns if name in df.columns]
        org_stats = {}
        for values in df[present].itertuples(index=False, name=None):
            row = dict(zip(present, values))
            model_id = row['id']
            if not isinstance(model_id, str):
                # A missing/non-text id cannot be attributed to an organization.
                continue

            # Extract the organization part of the model ID
            org_id = model_id.split('/')[0] if '/' in model_id else "unaffiliated"

            # Create model entry with needed fields. Values are converted to
            # plain Python objects because list-typed parquet columns
            # presumably arrive as numpy arrays, which json.dump rejects and
            # which cannot be truth-tested by the tag filters.
            model_entry = {
                "id": model_id,
                "downloads": _to_builtin(row.get('downloads', 0)),
                "likes": _to_builtin(row.get('likes', 0)),
                "pipeline_tag": row.get('pipeline_tag'),
                "tags": _to_builtin(row.get('tags', [])),
            }

            # Add safetensors information if available
            safetensors = _safetensors_entry(row.get('safetensors'))
            if safetensors is not None:
                model_entry["safetensors"] = safetensors

            # Add to organization stats
            org_stats.setdefault(org_id, []).append(model_entry)

        # Cache the processed data. The file is written under a temporary name
        # and renamed, so an interrupted write never leaves a truncated cache,
        # and a caching failure does not throw away the freshly built data.
        try:
            tmp_path = cache_path + '.tmp'
            with open(tmp_path, 'w') as f:
                json.dump(org_stats, f)
            os.replace(tmp_path, cache_path)
        except (OSError, TypeError, ValueError) as cache_error:
            print(f"Warning: could not write cache file: {cache_error}")

        return org_stats

    except Exception as e:
        print(f"Error downloading or processing data: {e}")
        # Return sample data for testing if real data unavailable
        return create_sample_data()
def create_sample_data(pipeline_tags=None):
    """Create sample data for testing when real data is unavailable.

    The data is deterministic: every organization gets five models, the
    download count depends only on the organization's position in the list,
    and the model size depends only on the model's position within its
    organization.

    Args:
        pipeline_tags: Optional sequence of pipeline tags to cycle through.
            Defaults to the module-level ``PIPELINE_TAGS``.

    Returns:
        dict mapping organization id -> list of model entries with the keys
        ``id``, ``downloads``, ``likes``, ``pipeline_tag``, ``tags`` and
        ``safetensors``.
    """
    print("Creating sample data for testing...")
    if pipeline_tags is None:
        pipeline_tags = PIPELINE_TAGS

    sample_orgs = ['openai', 'meta', 'google', 'microsoft', 'anthropic', 'stability', 'huggingface']
    num_models = 5  # Each org has 5 sample models

    org_stats = {}
    # enumerate() supplies the organization's position directly; the previous
    # org_stats.keys().index(org) fails because dict views have no .index().
    for org_idx, org in enumerate(sample_orgs):
        # Downloads cycle through three magnitudes: 1e3, 1e4, 1e5.
        downloads = int(1000 * (10 ** (org_idx % 3)))
        likes = int(downloads * 0.05)  # 5% like rate

        models = []
        for i in range(num_models):
            # Cycle through the available pipeline tags.
            pipeline_tag = pipeline_tags[i % len(pipeline_tags)]
            # Size in bytes cycles through three magnitudes: 1e8, 1e9, 1e10.
            model_size = (10**8) * (10 ** (i % 3))
            models.append({
                "id": f"{org}/model-{i+1}",
                "downloads": downloads,
                "likes": likes,
                "pipeline_tag": pipeline_tag,
                "tags": [pipeline_tag, "sample-data"],
                "safetensors": {"total": model_size},
            })
        org_stats[org] = models

    return org_stats
+# Create Gradio interface
 with gr.Blocks() as demo:
+    models_data = gr.State(value=None)  # To store loaded data
     with gr.Row():
         gr.Markdown("""
+            ## HuggingFace Models TreeMap
+            This app shows how different organizations contribute to the HuggingFace ecosystem with their models.
+            Use the filters to explore models by different metrics, tags, pipelines, and model sizes.
         """)
     with gr.Row():
         with gr.Column(scale=1):
             count_by_dropdown = gr.Dropdown(
                 label="Metric",
+                choices=["downloads", "likes"],
                 value="downloads"
             )
                 value=None,
                 visible=False
             )
             pipeline_filter_dropdown = gr.Dropdown(
                 label="Select Pipeline Tag",
                 choices=PIPELINE_TAGS,
                 value=None,
                 visible=False
             )
+            size_filter_dropdown = gr.Dropdown(
+                label="Model Size Filter",
+                choices=["None"] + list(MODEL_SIZE_RANGES.keys()),
+                value="None"
+            )
             generate_plot_button = gr.Button("Generate Plot")
         with gr.Column(scale=3):
             plot_output = gr.Plot()
+    def generate_plot_on_click(count_by, filter_choice, tag_filter, pipeline_filter, size_filter, data):
+        print(f"Generating plot with: Metric={count_by}, Filter={filter_choice}, Tag={tag_filter}, Pipeline={pipeline_filter}, Size={size_filter}")
         if data is None:
             print("Error: Data not loaded yet.")
             return None
         selected_tag_filter = None
         selected_pipeline_filter = None
+        selected_size_filter = None
         if filter_choice == "Tag Filter":
             selected_tag_filter = tag_filter
         elif filter_choice == "Pipeline Filter":
             selected_pipeline_filter = pipeline_filter
+        if size_filter != "None":
+            selected_size_filter = size_filter
         fig = make_figure(
             count_by=count_by,
             org_stats=data,
             tag_filter=selected_tag_filter,
+            pipeline_filter=selected_pipeline_filter,
+            size_range=selected_size_filter
         )
         return fig
             return gr.update(visible=True), gr.update(visible=False)
         elif filter_choice == "Pipeline Filter":
             return gr.update(visible=False), gr.update(visible=True)
+        else:  # "None"
             return gr.update(visible=False), gr.update(visible=False)
     filter_choice_radio.change(
     )
     # Load data once at startup
     demo.load(
+        fn=download_and_process_models,
+        inputs=[],
+        outputs=[models_data]
     )
     # Button click event to generate plot
     generate_plot_button.click(
         fn=generate_plot_on_click,
         inputs=[
             count_by_dropdown,
             filter_choice_radio,
             tag_filter_dropdown,
             pipeline_filter_dropdown,
+            size_filter_dropdown,
+            models_data
         ],
         outputs=[plot_output]
     )
 if __name__ == "__main__":
     demo.launch()