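"""Track Jupyter notebooks hosted on the Hugging Face Hub.

Counts .ipynb files across every model and Space repo, publishes the daily
totals to two Hub datasets, and serves a Gradio dashboard of the trend
over time.
"""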
import gradio as gr

from huggingface_hub import list_models, list_spaces
from pathlib import Path
from toolz import concat
from datasets import Dataset
import polars as pl
from datetime import date
from datasets import load_dataset
import plotly.express as px
import os

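# A token with write access is required to push the stats datasets below.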
HF_TOKEN = os.getenv("HF_TOKEN")
assert HF_TOKEN, "HF_TOKEN must be set; a write token is needed to push datasets"


def yield_models():
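    """Yield ("model", info) for every model repo, with file listings (full=True)."""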
    for model in list_models(full=True):
        yield "model", model


def yield_spaces():
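    """Yield ("space", info) for every Space repo, with file listings (full=True)."""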
    for space in list_spaces(full=True):
        yield "space", space


def yield_notebooks():
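    """Yield one record per repo that contains at least one .ipynb file."""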
    for repo_type, repo in concat([yield_models(), yield_spaces()]):
        files = (f.rfilename for f in repo.siblings)
        if notebook_files := [f for f in files if Path(f).suffix == ".ipynb"]:
            yield {
                "date": date.today(),
                "repo_type": repo_type,
                "repo_id": repo.id,
                "repo_notebook_count": len(notebook_files),
            }


def update_stats():
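    """Recount notebooks across the Hub, push the refreshed stats datasets,
    and return the history as (pandas DataFrame, polars DataFrame)."""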
    df = pl.LazyFrame(yield_notebooks())

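    # Split "user/repo_name" ids into separate user and repo columns.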
    df = (
        df.with_columns(pl.col("repo_id").str.split_exact("/", 1))
        .unnest("repo_id")
        .rename({"field_0": "user", "field_1": "repo_id"})
    )
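    # Total notebook count per user, most prolific users first.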
    by_user_count = (
        df.group_by("user")
        .agg(pl.col("repo_notebook_count").sum())
        .sort("repo_notebook_count", descending=True)
        .collect()
    )

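    # Quick summary statistic (mean notebooks per user); the result is not
    # currently stored or displayed.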
    by_user_count.mean().select(
        pl.col("repo_notebook_count").alias("mean notebooks per user")
    )

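    # Publish the per-user counts as a Hub dataset.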
    ds = Dataset(by_user_count.to_arrow())

    ds.push_to_hub("davanstrien/notebooks_by_user", token=HF_TOKEN)

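    # Total notebooks by repo type (model vs. space), stamped with today's date.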
    grouped = df.group_by("repo_type").agg(pl.col("repo_notebook_count").sum())
    final_df = grouped.with_columns(pl.lit(date.today()).alias("date")).collect()
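    # Load the previously published history so today's totals can be appended.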
    previous_df = pl.DataFrame(
        load_dataset("davanstrien/notebooks_by_repo_type", split="train").data.table
    )
    final_df = pl.concat([previous_df, final_df]).unique()
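    # Where a date appears more than once per repo type, keep only the latest row.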
    spaces = final_df.filter(pl.col("repo_type") == "space").unique(
        subset=["date"], keep="last"
    )
    models = final_df.filter(pl.col("repo_type") == "model").unique(
        subset=["date"], keep="last"
    )
    final_df = pl.concat([spaces, models]).unique()
    Dataset(final_df.to_arrow()).push_to_hub(
        "davanstrien/notebooks_by_repo_type", token=HF_TOKEN
    )

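    # Sort chronologically and convert to pandas for Gradio's plot and table components.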
    final_df = final_df.sort("date")
    pandas_df = final_df.to_pandas()
    return pandas_df, final_df


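# Dashboard layout: a trend chart plus two tables. Stats are recomputed once
# when the app starts.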
with gr.Blocks() as demo:
    gr.Markdown("# Notebooks on the Hub (updated daily)")
    pandas_df, final_df = update_stats()
    gr.Markdown("## Notebooks on the Hub over time")
    gr.Plot(px.line(pandas_df, x="date", y="repo_notebook_count", color="repo_type"))
    gr.Markdown("## Notebooks on the Hub (total by date)")
    gr.DataFrame(
        final_df.select(pl.col(["date", "repo_notebook_count"]))
        .group_by("date")
        .sum()
        .sort("date")
        .to_pandas()
    )
    gr.Markdown("## Notebooks on the Hub raw data")
    gr.DataFrame(pandas_df)


demo.launch(debug=True)