In [67]:
from huggingface_hub import list_models, list_spaces
from pathlib import Path
from toolz import concat
from datasets import Dataset
import polars as pl
from datetime import date
from datetime import date, timedelta
from datasets import load_dataset
import plotly.express as px
import os

In [None]:
# Hub access token, read from the environment (None when HF_TOKEN is unset).
HF_TOKEN = os.environ.get("HF_TOKEN")

In [23]:
def yield_models():
    """Lazily yield ``("model", info)`` for every item returned by ``list_models(full=True)``."""
    yield from (("model", info) for info in list_models(full=True))

In [24]:
def yield_spaces():
    """Lazily yield ``("space", info)`` for every item returned by ``list_spaces(full=True)``."""
    yield from (("space", info) for info in list_spaces(full=True))

In [25]:
def yield_notebooks():
    """Yield one summary record per model or Space repo that contains notebooks.

    Walks every model and then every Space (via ``yield_models`` and
    ``yield_spaces``) and counts the files whose suffix is ``.ipynb``.
    Repos without any notebook are skipped.

    Yields:
        dict: with keys ``date`` (snapshot date), ``repo_type`` (``"model"``
        or ``"space"``), ``repo_id`` and ``repo_notebook_count``.
    """
    # Resolve the date once so every record of a single crawl carries the same
    # snapshot date, even if the crawl runs past midnight.
    snapshot_date = date.today()
    for repo_type, repo in concat([yield_models(), yield_spaces()]):
        # ``siblings`` is optional on the Hub info objects; treat a missing
        # file listing as "no files" instead of raising TypeError.
        notebook_count = sum(
            1
            for sibling in (repo.siblings or [])
            if Path(sibling.rfilename).suffix == ".ipynb"
        )
        if notebook_count:
            yield {
                "date": snapshot_date,
                "repo_type": repo_type,
                "repo_id": repo.id,
                "repo_notebook_count": notebook_count,
            }

In [26]:
# Wrap the notebook records produced by yield_notebooks() in a polars LazyFrame.
# NOTE(review): presumably the generator is exhausted right here (i.e. the whole
# Hub listing is fetched in this cell) — confirm against the polars version in use.
df = pl.LazyFrame(yield_notebooks())

In [27]:
# Split "repo_id" on "/" into a two-field struct, unnest it, and rename the
# struct fields: "user" is the part before the slash, "repo_id" the part after.
# NOTE(review): this rebinds `df`, so re-running the cell operates on the
# already-split frame — restart and run from the top instead. Ids without a "/"
# would presumably end up with a null "repo_id" — confirm.
df = (
    df.with_columns(pl.col("repo_id").str.split_exact("/", 1))
    .unnest("repo_id")
    .rename({"field_0": "user", "field_1": "repo_id"})
)

In [28]:
# Total notebook count per user, largest first; .collect() runs the lazy query.
by_user_count = (
    df.groupby("user")
    .agg(pl.sum("repo_notebook_count"))
    .sort(by="repo_notebook_count", descending=True)
    .collect()
)

In [29]:
# Display the per-user notebook totals.
by_user_count

user,repo_notebook_count
str,i64
"""gradio-pr-depl…",1798
"""gradio""",414
"""sgoodfriend""",240
"""merve""",63
"""chrisjay""",62
"""infinitejoy""",32
"""fabricius""",29
"""aammari""",26
"""flax-community…",24
"""rajesh1729""",24


In [30]:
# Summary statistics of the per-user notebook totals.
by_user_count.describe()

describe,user,repo_notebook_count
str,str,f64
"""count""","""1540""",1540.0
"""null_count""","""0""",0.0
"""mean""",,3.787013
"""std""",,47.455407
"""min""","""007aneesh""",1.0
"""max""","""zinoubm""",1798.0
"""median""",,1.0


In [31]:
# Average of the per-user notebook totals, shown as a one-cell frame.
by_user_count.select(
    pl.col("repo_notebook_count").mean().alias("mean notebooks  per user")
)

mean notebooks per user
f64
3.787013


In [32]:
# Wrap the per-user totals (converted to an Arrow table) in a `datasets.Dataset`.
ds = Dataset(by_user_count.to_arrow())
ds

Dataset({
    features: ['user', 'repo_notebook_count'],
    num_rows: 1540
})

In [33]:
# Publish the per-user totals to the Hub dataset repo.
ds.push_to_hub(
    repo_id="davanstrien/notebooks_by_user",
    token=HF_TOKEN,
)

Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 617.08ba/s]
Upload 1 LFS files: 100%|██████████| 1/1 [00:00<00:00,  1.05it/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:02<00:00,  2.39s/it]
Deleting unused files from dataset repository: 100%|██████████| 1/1 [00:00<00:00,  2.01it/s]
Downloading metadata: 100%|██████████| 406/406 [00:00<00:00, 126kB/s]
Updating downloaded metadata with the new split.


In [34]:
# Notebook totals per repo type; `df` is lazy, so nothing is computed until
# .collect() is called on the result.
grouped = (
    df.groupby("repo_type")
    .agg(pl.sum("repo_notebook_count"))
)

In [35]:
# Stamp the per-repo-type totals with today's date and run the lazy query.
final_df = (
    grouped
    .with_columns(pl.lit(date.today()).alias("date"))
    .collect()
)
final_df

repo_type,repo_notebook_count,date
str,i64,date
"""space""",4443,2023-03-30
"""model""",1389,2023-03-30


In [36]:
# Load the "train" split of the previously published per-repo-type dataset and
# wrap its underlying Arrow table in a polars DataFrame.
previous_df = pl.DataFrame(
    load_dataset("davanstrien/notebooks_by_repo_type", split="train").data.table
)
previous_df

Downloading readme: 100%|██████████| 441/441 [00:00<00:00, 130kB/s]


Downloading and preparing dataset None/None to /Users/davanstrien/.cache/huggingface/datasets/davanstrien___parquet/davanstrien--notebooks_by_repo_type-1004c11b0535dac5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 1.87k/1.87k [00:00<00:00, 705kB/s]
Downloading data files: 100%|██████████| 1/1 [00:01<00:00,  1.71s/it]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 786.78it/s]
                                                                    

Dataset parquet downloaded and prepared to /Users/davanstrien/.cache/huggingface/datasets/davanstrien___parquet/davanstrien--notebooks_by_repo_type-1004c11b0535dac5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.




repo_type,repo_notebook_count,date
str,i64,date
"""space""",3956,2023-03-27
"""model""",1346,2023-03-27
"""model""",1348,2023-03-28
"""space""",4386,2023-03-28
"""space""",4422,2023-03-28
"""space""",4579,2023-03-29
"""model""",1384,2023-03-29


In [37]:
# Append today's rows to the stored history and drop exact duplicate rows
# (presumably to absorb re-runs with unchanged counts).
final_df = pl.concat([previous_df, final_df]).unique()

In [57]:
# Keep a single row per date for each repo type, so several runs on the same
# day collapse to one data point.
# NOTE(review): which of the same-day rows survives depends on polars'
# `unique` defaults (keep / maintain_order) — pin them if the choice matters.
spaces = final_df.filter(pl.col("repo_type") == "space").unique(subset=["date"])
models = final_df.filter(pl.col("repo_type") == "model").unique(subset=["date"])

In [58]:
# Recombine the de-duplicated space and model rows into one frame.
final_df = pl.concat([spaces, models]).unique()

In [62]:
# Order the history chronologically and display it.
final_df = final_df.sort("date")
final_df

repo_type,repo_notebook_count,date
str,i64,date
"""space""",3956,2023-03-27
"""model""",1346,2023-03-27
"""space""",4386,2023-03-28
"""model""",1348,2023-03-28
"""space""",4579,2023-03-29
"""model""",1384,2023-03-29
"""space""",4443,2023-03-30
"""model""",1389,2023-03-30


In [55]:
# NOTE(review): leftover exploration — this only builds and displays a GroupBy
# object; nothing is aggregated or assigned, so the cell can be dropped.
final_df.groupby("repo_type")

<polars.dataframe.groupby.GroupBy at 0x2a1035d10>

In [63]:
# Publish the per-repo-type history back to the Hub dataset repo.
Dataset(final_df.to_arrow()).push_to_hub(
    repo_id="davanstrien/notebooks_by_repo_type",
    token=HF_TOKEN,
)

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 730.46ba/s]
Upload 1 LFS files: 100%|██████████| 1/1 [00:00<00:00,  1.26it/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:02<00:00,  2.14s/it]
Deleting unused files from dataset repository: 100%|██████████| 1/1 [00:00<00:00,  1.87it/s]
Downloading metadata: 100%|██████████| 441/441 [00:00<00:00, 173kB/s]
Updating downloaded metadata with the new split.


In [73]:
# Sort by date and convert to a pandas DataFrame (used by the plotting cell).
final_df = final_df.sort("date")
pandas_df = final_df.to_pandas()

In [65]:
# final_df.to_pandas().set_index("date", drop=True).sort_index()

Unnamed: 0_level_0,repo_type,repo_notebook_count
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-03-27,space,3956
2023-03-27,model,1346
2023-03-28,space,4386
2023-03-28,model,1348
2023-03-29,space,4579
2023-03-29,model,1384
2023-03-30,space,4443
2023-03-30,model,1389


In [66]:
# Plot the notebook count over time, one line per repo type. The title and
# human-readable labels let the figure stand on its own when skimmed.
px.line(
    pandas_df,
    x="date",
    y="repo_notebook_count",
    color="repo_type",
    title="Jupyter notebooks on the Hugging Face Hub over time",
    labels={
        "date": "Date",
        "repo_notebook_count": "Notebook count",
        "repo_type": "Repo type",
    },
)