Spaces:
Runtime error
Runtime error
File size: 3,103 Bytes
c4fe3e2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 |
import os
from datetime import date
from itertools import chain
from pathlib import Path

import gradio as gr
import plotly.express as px
import polars as pl
from datasets import Dataset, load_dataset
from huggingface_hub import list_models, list_spaces
from toolz import concat
# Hub write token used for push_to_hub below; fail fast if it is missing.
# NOTE: a plain `assert` is stripped under `python -O`, so raise explicitly.
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    raise RuntimeError("HF_TOKEN environment variable must be set")
def yield_models():
    """Yield ("model", info) pairs for every model repo on the Hub.

    Uses ``list_models(full=True)`` so each ``info`` includes the repo's
    file listing (``siblings``), which downstream code inspects.
    """
    # list_models already returns an iterable; the old iter() wrapper was redundant.
    for model in list_models(full=True):
        yield "model", model
def yield_spaces():
    """Yield ("space", info) pairs for every Space repo on the Hub.

    Uses ``list_spaces(full=True)`` so each ``info`` includes the repo's
    file listing (``siblings``), which downstream code inspects.
    """
    # list_spaces already returns an iterable; the old iter() wrapper was redundant.
    for space in list_spaces(full=True):
        yield "space", space
def yield_notebooks():
    """Yield one record per Hub repo (model or Space) that contains notebooks.

    Each record is a dict with today's date, the repo type, the repo id, and
    the number of ``.ipynb`` files found in the repo's file listing.
    """
    # stdlib itertools.chain replaces the previous third-party toolz.concat.
    for repo_type, repo in chain(yield_models(), yield_spaces()):
        filenames = (f.rfilename for f in repo.siblings)
        if notebooks := [f for f in filenames if Path(f).suffix == ".ipynb"]:
            yield {
                "date": date.today(),
                "repo_type": repo_type,
                "repo_id": repo.id,
                "repo_notebook_count": len(notebooks),
            }
def update_stats():
    """Refresh the notebook-count datasets on the Hub and return today's stats.

    Crawls the Hub for notebooks, pushes a per-user count dataset and a
    per-repo-type history dataset, and returns a ``(pandas_df, polars_df)``
    pair of the full history sorted by date for plotting/display.

    Side effects: pushes two datasets to the Hub using ``HF_TOKEN``.
    """
    df = pl.LazyFrame(yield_notebooks())
    # Split "user/repo" ids into separate `user` and `repo_id` columns.
    df = (
        df.with_columns(pl.col("repo_id").str.split_exact("/", 1))
        .unnest("repo_id")
        .rename({"field_0": "user", "field_1": "repo_id"})
    )
    # Per-user notebook totals, published as their own dataset.
    by_user_count = (
        df.groupby("user")
        .agg(pl.col("repo_notebook_count").sum())
        .sort("repo_notebook_count", descending=True)
        .collect()
    )
    # (Removed a dead `by_user_count.mean().select(...)` expression whose
    # result was never used.)
    ds = Dataset(by_user_count.to_arrow())
    ds.push_to_hub("davanstrien/notebooks_by_user", token=HF_TOKEN)
    # Today's totals by repo type, stamped with today's date.
    grouped = df.groupby("repo_type").agg(pl.col("repo_notebook_count").sum())
    final_df = grouped.with_columns(pl.lit(date.today()).alias("date")).collect()
    # Merge with the previously-published history and de-duplicate.
    previous_df = pl.DataFrame(
        load_dataset("davanstrien/notebooks_by_repo_type", split="train").data.table
    )
    final_df = pl.concat([previous_df, final_df]).unique()
    # Keep only the latest row per date for each repo type.
    spaces = final_df.filter(pl.col("repo_type") == "space").unique(
        subset=["date"], keep="last"
    )
    models = final_df.filter(pl.col("repo_type") == "model").unique(
        subset=["date"], keep="last"
    )
    final_df = pl.concat([spaces, models]).unique()
    Dataset(final_df.to_arrow()).push_to_hub(
        "davanstrien/notebooks_by_repo_type", token=HF_TOKEN
    )
    final_df = final_df.sort("date")
    pandas_df = final_df.to_pandas()
    return pandas_df, final_df
# Build the Gradio UI. Stats are computed (and pushed to the Hub) once at
# app start; the layout order below determines the on-page order.
with gr.Blocks() as demo:
    gr.Markdown("# Notebooks on the Hub (updated daily)")
    history_pd, history_pl = update_stats()
    gr.Markdown("## Notebooks on the Hub over time")
    # One line per repo type, notebook count over time.
    time_series = px.line(
        history_pd, x="date", y="repo_notebook_count", color="repo_type"
    )
    gr.Plot(time_series)
    gr.Markdown("## Notebooks on the Hub (total by date)")
    # Collapse repo types into a single total per date.
    totals_by_date = (
        history_pl.select(pl.col(["date", "repo_notebook_count"]))
        .groupby("date")
        .sum()
        .sort("date")
        .to_pandas()
    )
    gr.DataFrame(totals_by_date)
    gr.Markdown("## Notebooks on the Hub raw data")
    gr.DataFrame(history_pd)
demo.launch(debug=True)
|