OrgStats

Running

App Files Files Community

evijit HF Staff committed 19 days ago

Commit

57e108c

verified ·

1 Parent(s): f973e99

Update app.py

Browse files

Files changed (1) hide show

app.py +101 -52

app.py CHANGED Viewed

@@ -239,18 +239,19 @@ def load_models_csv():
 # Create Gradio interface
 with gr.Blocks() as demo:
-    models_data = gr.State()  # To store loaded data
     with gr.Row():
         gr.Markdown("""
             # HuggingFace Models TreeMap Visualization
             This app shows how different organizations contribute to the HuggingFace ecosystem with their models.
             Use the filters to explore models by different metrics, tags, pipelines, and model sizes.
-            The treemap visualizes models grouped by organization, with the size of each box representing the selected metric (downloads or likes).
         """)
     with gr.Row():
         with gr.Column(scale=1):
             count_by_dropdown = gr.Dropdown(
@@ -263,14 +264,14 @@ with gr.Blocks() as demo:
                 value="downloads",
                 info="Select the metric to determine box sizes"
             )
             filter_choice_radio = gr.Radio(
                 label="Filter Type",
                 choices=["None", "Tag Filter", "Pipeline Filter"],
                 value="None",
                 info="Choose how to filter the models"
             )
             tag_filter_dropdown = gr.Dropdown(
                 label="Select Tag",
                 choices=list(TAG_FILTER_FUNCS.keys()),
@@ -278,7 +279,7 @@ with gr.Blocks() as demo:
                 visible=False,
                 info="Filter models by domain/category"
             )
             pipeline_filter_dropdown = gr.Dropdown(
                 label="Select Pipeline Tag",
                 choices=PIPELINE_TAGS,
@@ -286,7 +287,7 @@ with gr.Blocks() as demo:
                 visible=False,
                 info="Filter models by specific pipeline"
             )
             size_filter_dropdown = gr.Dropdown(
                 label="Model Size Filter",
                 choices=["None"] + list(MODEL_SIZE_RANGES.keys()),
@@ -302,26 +303,63 @@ with gr.Blocks() as demo:
                 step=5,
                 info="Number of top organizations to include"
             )
             skip_orgs_textbox = gr.Textbox(
                 label="Organizations to Skip (comma-separated)",
                 placeholder="e.g., OpenAI, Google",
-                value="TheBloke, MaziyarPanahi, unsloth, modularai, Gensyn, bartowski",
-                info="Enter names of organizations to exclude from the visualization"
             )
-            generate_plot_button = gr.Button("Generate Plot", variant="primary")
         with gr.Column(scale=3):
             plot_output = gr.Plot()
             stats_output = gr.Markdown("*Generate a plot to see statistics*")
     def generate_plot_on_click(count_by, filter_choice, tag_filter, pipeline_filter, size_filter, top_k, skip_orgs_text, data_df):
-        print(f"Generating plot with: Metric={count_by}, Filter={filter_choice}, Tag={tag_filter}, Pipeline={pipeline_filter}, Size={size_filter}, Top K={top_k}")
-        if data_df is None or len(data_df) == 0:
-            return None, "Error: No data available. Please try again."
         selected_tag_filter = None
         selected_pipeline_filter = None
         selected_size_filter = None
@@ -330,17 +368,14 @@ with gr.Blocks() as demo:
             selected_tag_filter = tag_filter
         elif filter_choice == "Pipeline Filter":
             selected_pipeline_filter = pipeline_filter
         if size_filter != "None":
             selected_size_filter = size_filter
-        # Process skip organizations list
         skip_orgs = []
         if skip_orgs_text and skip_orgs_text.strip():
             skip_orgs = [org.strip() for org in skip_orgs_text.split(',') if org.strip()]
-            print(f"Skipping organizations: {skip_orgs}")
-        # Process data for treemap
         treemap_data = make_treemap_data(
             df=data_df,
             count_by=count_by,
@@ -350,64 +385,77 @@ with gr.Blocks() as demo:
             size_filter=selected_size_filter,
             skip_orgs=skip_orgs
         )
-        # Create plot
         fig = create_treemap(
             treemap_data=treemap_data,
             count_by=count_by,
-            title=f"HuggingFace Models - {count_by.replace('AllTime', ' (All Time)').capitalize()} by Organization"
         )
-        # Generate statistics
         if treemap_data.empty:
             stats_md = "No data matches the selected filters."
         else:
             total_models = len(treemap_data)
             total_value = treemap_data[count_by].sum()
             top_5_orgs = treemap_data.groupby("organization")[count_by].sum().sort_values(ascending=False).head(5)
-            # Format the statistics using clean markdown
             stats_md = f"""
 ## Statistics
 - **Total models shown**: {total_models:,}
 - **Total {count_by}**: {int(total_value):,}
 ## Top Organizations by {count_by.capitalize()}
 | Organization | {count_by.capitalize()} | % of Total |
-|--------------|--------:|--------:|"""
-            # Add each organization as a row in the table
             for org, value in top_5_orgs.items():
                 percentage = (value / total_value) * 100
-                stats_md += f"\n| {org} | {int(value):,} | {percentage:.2f}% |"
             # Add note about skipped organizations if any
             if skip_orgs:
-                stats_md += f"\n\n*Note: {len(skip_orgs)} organization(s) excluded: {', '.join(skip_orgs)}*"
         return fig, stats_md
-    def update_filter_visibility(filter_choice):
-        if filter_choice == "Tag Filter":
-            return gr.update(visible=True), gr.update(visible=False)
-        elif filter_choice == "Pipeline Filter":
-            return gr.update(visible=False), gr.update(visible=True)
-        else:  # "None"
-            return gr.update(visible=False), gr.update(visible=False)
-    filter_choice_radio.change(
-        fn=update_filter_visibility,
-        inputs=[filter_choice_radio],
-        outputs=[tag_filter_dropdown, pipeline_filter_dropdown]
-    )
-    # Load data once at startup
     demo.load(
         fn=load_models_csv,
         inputs=[],
-        outputs=[models_data]
     )
-    # Button click event to generate plot
     generate_plot_button.click(
         fn=generate_plot_on_click,
         inputs=[
@@ -424,5 +472,6 @@ with gr.Blocks() as demo:
     )
 if __name__ == "__main__":
     demo.launch()

 # Create Gradio interface
 with gr.Blocks() as demo:
+    models_data = gr.State()
+    loading_complete = gr.State(False)  # Flag to indicate data load completion
     with gr.Row():
         gr.Markdown("""
             # HuggingFace Models TreeMap Visualization
             This app shows how different organizations contribute to the HuggingFace ecosystem with their models.
             Use the filters to explore models by different metrics, tags, pipelines, and model sizes.
+            The treemap visualizes models grouped by organization, with the size of each box representing the selected metric.
         """)
     with gr.Row():
         with gr.Column(scale=1):
             count_by_dropdown = gr.Dropdown(
                 value="downloads",
                 info="Select the metric to determine box sizes"
             )
             filter_choice_radio = gr.Radio(
                 label="Filter Type",
                 choices=["None", "Tag Filter", "Pipeline Filter"],
                 value="None",
                 info="Choose how to filter the models"
             )
             tag_filter_dropdown = gr.Dropdown(
                 label="Select Tag",
                 choices=list(TAG_FILTER_FUNCS.keys()),
                 visible=False,
                 info="Filter models by domain/category"
             )
             pipeline_filter_dropdown = gr.Dropdown(
                 label="Select Pipeline Tag",
                 choices=PIPELINE_TAGS,
                 visible=False,
                 info="Filter models by specific pipeline"
             )
             size_filter_dropdown = gr.Dropdown(
                 label="Model Size Filter",
                 choices=["None"] + list(MODEL_SIZE_RANGES.keys()),
                 step=5,
                 info="Number of top organizations to include"
             )
             skip_orgs_textbox = gr.Textbox(
                 label="Organizations to Skip (comma-separated)",
                 placeholder="e.g., OpenAI, Google",
+                value="TheBloke, MaziyarPanahi, unsloth, modularai, Gensyn, bartowski"
             )
+            generate_plot_button = gr.Button("Generate Plot", variant="primary", interactive=False)
         with gr.Column(scale=3):
             plot_output = gr.Plot()
             stats_output = gr.Markdown("*Generate a plot to see statistics*")
+    # Updated load function returning both the data and loading flag
+    def load_models_csv():
+        df = pd.read_csv('models.csv')
+        def process_tags(tags_str):
+            if pd.isna(tags_str):
+                return []
+            tags_str = tags_str.strip("[]").replace("'", "")
+            tags = [tag.strip() for tag in tags_str.split() if tag.strip()]
+            return tags
+        df['tags'] = df['tags'].apply(process_tags)
+        return df, True
+    # Button enablement after data load
+    def enable_plot_button(loaded):
+        return gr.update(interactive=loaded)
+    loading_complete.change(
+        fn=enable_plot_button,
+        inputs=[loading_complete],
+        outputs=[generate_plot_button]
+    )
+    # Show/hide tag/pipeline dropdown
+    def update_filter_visibility(filter_choice):
+        if filter_choice == "Tag Filter":
+            return gr.update(visible=True), gr.update(visible=False)
+        elif filter_choice == "Pipeline Filter":
+            return gr.update(visible=False), gr.update(visible=True)
+        else:
+            return gr.update(visible=False), gr.update(visible=False)
+    filter_choice_radio.change(
+        fn=update_filter_visibility,
+        inputs=[filter_choice_radio],
+        outputs=[tag_filter_dropdown, pipeline_filter_dropdown]
+    )
+    # Main generate function
     def generate_plot_on_click(count_by, filter_choice, tag_filter, pipeline_filter, size_filter, top_k, skip_orgs_text, data_df):
+        if data_df is None or not isinstance(data_df, pd.DataFrame) or data_df.empty:
+            return None, "Error: Data is still loading. Please wait a moment and try again."
         selected_tag_filter = None
         selected_pipeline_filter = None
         selected_size_filter = None
             selected_tag_filter = tag_filter
         elif filter_choice == "Pipeline Filter":
             selected_pipeline_filter = pipeline_filter
         if size_filter != "None":
             selected_size_filter = size_filter
         skip_orgs = []
         if skip_orgs_text and skip_orgs_text.strip():
             skip_orgs = [org.strip() for org in skip_orgs_text.split(',') if org.strip()]
         treemap_data = make_treemap_data(
             df=data_df,
             count_by=count_by,
             size_filter=selected_size_filter,
             skip_orgs=skip_orgs
         )
+        title_labels = {
+            "downloads": "Downloads (last 30 days)",
+            "downloadsAllTime": "Downloads (All Time)",
+            "likes": "Likes"
+        }
+        title_text = f"HuggingFace Models - {title_labels.get(count_by, count_by)} by Organization"
         fig = create_treemap(
             treemap_data=treemap_data,
             count_by=count_by,
+            title=title_text
         )
         if treemap_data.empty:
             stats_md = "No data matches the selected filters."
         else:
             total_models = len(treemap_data)
             total_value = treemap_data[count_by].sum()
+            # Get top 5 organizations
             top_5_orgs = treemap_data.groupby("organization")[count_by].sum().sort_values(ascending=False).head(5)
+            # Get top 5 individual models
+            top_5_models = treemap_data[["id", count_by]].sort_values(by=count_by, ascending=False).head(5)
+            # Create statistics section
             stats_md = f"""
 ## Statistics
 - **Total models shown**: {total_models:,}
 - **Total {count_by}**: {int(total_value):,}
 ## Top Organizations by {count_by.capitalize()}
 | Organization | {count_by.capitalize()} | % of Total |
+|--------------|-------------:|----------:|
+"""
+            # Add top organizations to the table
             for org, value in top_5_orgs.items():
                 percentage = (value / total_value) * 100
+                stats_md += f"| {org} | {int(value):,} | {percentage:.2f}% |\n"
+            # Add the top models table
+            stats_md += f"""
+## Top Models by {count_by.capitalize()}
+| Model | {count_by.capitalize()} | % of Total |
+|-------|-------------:|----------:|
+"""
+            # Add top models to the table
+            for _, row in top_5_models.iterrows():
+                model_id = row["id"]
+                value = row[count_by]
+                percentage = (value / total_value) * 100
+                stats_md += f"| {model_id} | {int(value):,} | {percentage:.2f}% |\n"
             # Add note about skipped organizations if any
             if skip_orgs:
+                stats_md += f"\n*Note: {len(skip_orgs)} organization(s) excluded: {', '.join(skip_orgs)}*"
         return fig, stats_md
+    # Load data at startup
     demo.load(
         fn=load_models_csv,
         inputs=[],
+        outputs=[models_data, loading_complete]
     )
     generate_plot_button.click(
         fn=generate_plot_on_click,
         inputs=[
     )
 if __name__ == "__main__":
     demo.launch()