Spaces:
Runtime error
Runtime error
import gradio as gr | |
from huggingface_hub import list_models, list_spaces | |
from pathlib import Path | |
from toolz import concat | |
from datasets import Dataset | |
import polars as pl | |
from datetime import date | |
from datasets import load_dataset | |
import plotly.express as px | |
import os | |
# Hugging Face access token, read from the environment; it is passed as
# `token=` to the `push_to_hub` calls in `update_stats`.
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    # Fail fast with an explicit error: a bare `assert` is stripped under
    # `python -O` and carries no hint about what is missing.
    raise RuntimeError("The HF_TOKEN environment variable must be set")
def yield_models():
    """Yield a ``("model", info)`` pair for each item of ``list_models(full=True)``."""
    yield from (("model", info) for info in list_models(full=True))
def yield_spaces():
    """Yield a ``("space", info)`` pair for each item of ``list_spaces(full=True)``."""
    yield from (("space", info) for info in list_spaces(full=True))
def yield_notebooks():
    """Yield one summary row per model or Space repo that contains notebooks.

    Repos are taken from ``yield_models()`` followed by ``yield_spaces()``.
    A repo is reported only when at least one of its files has the
    ``.ipynb`` suffix.

    Yields:
        dict: A row with the keys ``"date"`` (date of the scan),
        ``"repo_type"`` (``"model"`` or ``"space"``), ``"repo_id"`` and
        ``"repo_notebook_count"`` (number of ``.ipynb`` files in the repo).
    """
    # Take the date once so every row of a single scan carries the same
    # value, even if iterating the listings runs past midnight.
    scan_date = date.today()
    for repo_type, repo in concat([yield_models(), yield_spaces()]):
        # `siblings` is optional on the Hub info objects; treat a missing
        # file listing as "no files" instead of failing on `None`.
        siblings = repo.siblings or ()
        notebook_count = sum(
            Path(sibling.rfilename).suffix == ".ipynb" for sibling in siblings
        )
        if notebook_count:
            yield {
                "date": scan_date,
                "repo_type": repo_type,
                "repo_id": repo.id,
                "repo_notebook_count": notebook_count,
            }
def update_stats():
    """Recompute notebook statistics, publish them to the Hub, return the history.

    Steps:
      1. Collect the per-repo rows from ``yield_notebooks()``.
      2. Sum notebook counts per user and push the result to the
         ``davanstrien/notebooks_by_user`` dataset.
      3. Sum notebook counts per repo type for today, merge them into the
         history stored in ``davanstrien/notebooks_by_repo_type`` and push
         the merged history back.

    Returns:
        tuple: ``(pandas_df, final_df)`` - the merged per-repo-type history
        as a pandas DataFrame and as a Polars DataFrame, both sorted by date.
    """
    # NOTE(review): `groupby` was renamed `group_by` in newer Polars
    # releases - confirm against the Polars version pinned for this app.
    df = pl.LazyFrame(yield_notebooks())
    # Split "user/repo" into two columns; `split_exact` names the resulting
    # struct fields `field_0` / `field_1`.
    df = (
        df.with_columns(pl.col("repo_id").str.split_exact("/", 1))
        .unnest("repo_id")
        .rename({"field_0": "user", "field_1": "repo_id"})
    )

    by_user_count = (
        df.groupby("user")
        .agg(pl.col("repo_notebook_count").sum())
        .sort("repo_notebook_count", descending=True)
        .collect()
    )
    Dataset(by_user_count.to_arrow()).push_to_hub(
        "davanstrien/notebooks_by_user", token=HF_TOKEN
    )

    todays_counts = (
        df.groupby("repo_type")
        .agg(pl.col("repo_notebook_count").sum())
        .with_columns(pl.lit(date.today()).alias("date"))
        .collect()
    )
    previous_df = pl.DataFrame(
        load_dataset("davanstrien/notebooks_by_repo_type", split="train").data.table
    )

    # Keep exactly one row per (repo_type, date). History comes first and
    # today's counts last, so with `keep="last"` a re-run on the same day
    # replaces the earlier value. `maintain_order=True` is required for
    # "last" to refer to the concatenation order.
    final_df = (
        pl.concat([previous_df, todays_counts])
        .unique(subset=["repo_type", "date"], keep="last", maintain_order=True)
        .sort(["date", "repo_type"])
    )
    Dataset(final_df.to_arrow()).push_to_hub(
        "davanstrien/notebooks_by_repo_type", token=HF_TOKEN
    )
    return final_df.to_pandas(), final_df
# Build the dashboard. The statistics are computed once, while the layout is
# being constructed, and the resulting frames feed the plot and both tables.
with gr.Blocks() as demo:
    gr.Markdown("# Notebooks on the Hub (updated daily)")
    pandas_df, final_df = update_stats()

    gr.Markdown("## Notebooks on the Hub over time")
    trend_figure = px.line(
        pandas_df, x="date", y="repo_notebook_count", color="repo_type"
    )
    gr.Plot(trend_figure)

    gr.Markdown("## Notebooks on the Hub (total by date)")
    # Sum the counts of all repo types for each date.
    counts_by_date = final_df.select(pl.col(["date", "repo_notebook_count"]))
    daily_totals = counts_by_date.groupby("date").sum().sort("date")
    gr.DataFrame(daily_totals.to_pandas())

    gr.Markdown("## Notebooks on the Hub raw data")
    gr.DataFrame(pandas_df)

demo.launch(debug=True)