"""Gradio app tracking Jupyter notebooks hosted on the Hugging Face Hub.

Counts ``.ipynb`` files across models and Spaces, appends daily snapshots to
Hub datasets, and displays the results as plots and tables.
"""
import os
from datetime import date
from functools import lru_cache
from pathlib import Path
from typing import Optional, Set, Tuple

import gradio as gr
import plotly.express as px
import polars as pl
from datasets import Dataset, load_dataset
from huggingface_hub import list_models, list_spaces
from toolz import concat

HF_TOKEN = os.getenv("HF_TOKEN")
assert HF_TOKEN, "HF_TOKEN environment variable must be set"


def yield_models(exclude_users: Optional[Set[str]] = None):
    """Yield ("model", info) pairs from the Hub, optionally excluding some users."""
    for model in list_models(full=True):
        if (
            exclude_users is not None
            and model.author is not None
            and model.author in exclude_users
        ):
            continue
        yield "model", model


def yield_spaces(exclude_users: Optional[Set[str]] = None):
    """Yield ("space", info) pairs from the Hub, optionally excluding some users."""
    for space in list_spaces(full=True):
        if (
            exclude_users is not None
            and space.author
            and space.author in exclude_users
        ):
            continue
        yield "space", space


def yield_notebooks_counts(exclude_users: Optional[Set[str]] = None):
    """Yield per-repo notebook counts, stamped with today's date."""
    for repo_type, repo in concat(
        [
            yield_models(exclude_users=exclude_users),
            yield_spaces(exclude_users=exclude_users),
        ]
    ):
        files = (f.rfilename for f in repo.siblings)
        if jupyter_notebooks := [f for f in files if Path(f).suffix == ".ipynb"]:
            yield {
                "date": date.today(),
                "repo_type": repo_type,
                "repo_id": repo.id,
                "repo_notebook_count": len(jupyter_notebooks),
            }


def yield_notebooks(exclude_users: Optional[Set[str]] = None):
    """Yield repos containing notebooks, with the notebook filenames and likes."""
    for repo_type, repo in concat(
        [
            yield_models(exclude_users=exclude_users),
            yield_spaces(exclude_users=exclude_users),
        ]
    ):
        files = (f.rfilename for f in repo.siblings)
        if jupyter_notebooks := [f for f in files if Path(f).suffix == ".ipynb"]:
            yield {
                "repo_type": repo_type,
                "repo_id": repo.id,
                "repo_notebook_count": len(jupyter_notebooks),
                "jupyter_notebooks": jupyter_notebooks,
                "likes": repo.likes,
            }


@lru_cache
def _get_top_liked_repos_with_notebooks(
    exclude_users: Optional[Tuple[str, ...]] = None,
):
    df = pl.LazyFrame(yield_notebooks(exclude_users=exclude_users))
    return df.sort("likes", descending=True).collect()


def get_top_liked_repos_with_notebooks(exclude_users: Optional[Set[str]] = None):
    # lru_cache needs hashable arguments; sort the set for a stable cache key.
    exclude_users = tuple(sorted(exclude_users)) if exclude_users else None
    return _get_top_liked_repos_with_notebooks(exclude_users)


def get_top_k_notebooks_by_repo_type(repo_type: str = "space", k: int = 50):
    df = get_top_liked_repos_with_notebooks({"gradio"})
    return df.filter(pl.col("repo_type") == repo_type).head(k).to_pandas()


def raw_current_notebook_dataframe():
    df = pl.DataFrame(
        yield_notebooks_counts(exclude_users={"gradio", "gradio-pr-deploys"})
    )
    return df.to_pandas()


def update_stats():
    df = pl.LazyFrame(
        yield_notebooks_counts(exclude_users={"gradio", "gradio-pr-deploys"})
    )
    # Split "user/repo" ids into separate user and repo_id columns.
    df = (
        df.with_columns(pl.col("repo_id").str.split_exact("/", 1))
        .unnest("repo_id")
        .rename({"field_0": "user", "field_1": "repo_id"})
    )
    # Append today's raw counts to the historical raw dataset and push it back.
    previous_raw_df = pl.DataFrame(
        load_dataset(
            "davanstrien/notebooks_on_the_hub_raw",
            split="train",
            verification_mode="no_checks",
        ).data.table
    )
    final_raw_df = pl.concat([previous_raw_df, df.collect()]).unique()
    Dataset(final_raw_df.to_arrow()).push_to_hub(
        "davanstrien/notebooks_on_the_hub_raw", token=HF_TOKEN
    )
    by_user_count = (
        df.groupby("user")
        .agg(pl.col("repo_notebook_count").sum())
        .sort("repo_notebook_count", descending=True)
        .collect()
    )
    # Mean notebooks per user (computed but not currently used downstream).
    by_user_count.mean().select(
        pl.col("repo_notebook_count").alias("mean notebooks per user")
    )
    ds = Dataset(by_user_count.to_arrow())
    ds.push_to_hub("davanstrien/notebooks_by_user", token=HF_TOKEN)
    grouped = df.groupby("repo_type").agg(pl.col("repo_notebook_count").sum())
    final_df = grouped.with_columns(pl.lit(date.today()).alias("date")).collect()
    previous_df = pl.DataFrame(
        load_dataset(
            "davanstrien/notebooks_by_repo_type",
            split="train",
            verification_mode="no_checks",
        ).data.table
    )
    final_df = pl.concat([previous_df, final_df]).unique()
    # Keep only the latest row per date for each repo type.
    spaces = final_df.filter(pl.col("repo_type") == "space").unique(
        subset=["date"], keep="last"
    )
    models = final_df.filter(pl.col("repo_type") == "model").unique(
        subset=["date"], keep="last"
    )
    final_df = pl.concat([spaces, models]).unique()
    Dataset(final_df.to_arrow()).push_to_hub(
        "davanstrien/notebooks_by_repo_type", token=HF_TOKEN
    )
    final_df = final_df.sort("date")
    pandas_df = final_df.to_pandas()
    # final_df.to_pandas().set_index("date", drop=True).sort_index()
    return pandas_df, final_df, final_raw_df


with gr.Blocks() as demo:
    with gr.Tab("Notebooks on the Hub stats"):
        gr.Markdown("# Notebooks on the Hub (updated daily)")
        pandas_df, final_df, final_raw_df = update_stats()
        gr.Markdown("## Notebooks on the Hub over time")
        gr.Plot(
            px.line(pandas_df, x="date", y="repo_notebook_count", color="repo_type")
        )
        gr.Markdown("## Notebooks on the Hub (total by date)")
        gr.DataFrame(
            final_df.select(pl.col(["date", "repo_notebook_count"]))
            .groupby("date")
            .sum()
            .sort("date")
            .to_pandas()
        )
        gr.Markdown("## Top Repos by likes with notebooks")
        gr.Markdown("#### Top 10 Spaces")
        gr.DataFrame(
            get_top_k_notebooks_by_repo_type("space", 10)[["repo_id", "likes"]]
        )
        gr.Markdown("#### Top 10 Models")
        gr.DataFrame(
            get_top_k_notebooks_by_repo_type("model", 10)[["repo_id", "likes"]]
        )
        # repo_type.update(get_top_k_notebooks_by_repo_type, [repo_type, k], [df])
    with gr.Tab("raw data"):
        gr.Markdown("## Notebooks on the Hub raw data")
        gr.DataFrame(final_raw_df.to_pandas())

demo.launch(debug=True)