import os
from datetime import date
from functools import lru_cache
from pathlib import Path
from typing import Optional, Set, Tuple

import gradio as gr
import plotly.express as px
import polars as pl
from datasets import Dataset, load_dataset
from huggingface_hub import list_models, list_spaces
from toolz import concat

# A Hub token with write access is required (e.g. set as a Space secret),
# since the app pushes refreshed datasets back to the Hub.
HF_TOKEN = os.getenv("HF_TOKEN")
assert HF_TOKEN, "HF_TOKEN environment variable must be set"


def yield_models(exclude_users: Optional[Set[str]] = None):
    """Yield models from the Hub, optionally excluding some users."""
    for model in list_models(full=True):
        if (
            exclude_users is not None
            and model.author is not None
            and model.author in exclude_users
        ):
            continue
        yield "model", model


def yield_spaces(exclude_users: Optional[Set[str]] = None):
    """Yield Spaces from the Hub, optionally excluding some users."""
    for space in list_spaces(full=True):
        if exclude_users is not None and space.author and space.author in exclude_users:
            continue
        yield "space", space
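
# Both generators yield ("<repo_type>", <repo_info>) pairs. A quick,
# illustrative check (output depends on the live Hub state at call time):
#
#   repo_type, repo = next(yield_models(exclude_users={"gradio"}))
#   print(repo_type, repo.id)  # e.g. "model someuser/some-repo"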


def yield_notebooks_counts(exclude_users: Optional[Set[str]] = None):
    """Yield per-repo Jupyter notebook counts across models and Spaces."""
    for repo_type, repo in concat(
        [
            yield_models(exclude_users=exclude_users),
            yield_spaces(exclude_users=exclude_users),
        ]
    ):
        files = (f.rfilename for f in repo.siblings)
        if jupyter_notebooks := [f for f in files if Path(f).suffix == ".ipynb"]:
            yield {
                "date": date.today(),
                "repo_type": repo_type,
                "repo_id": repo.id,
                "repo_notebook_count": len(jupyter_notebooks),
            }
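
# Each yielded record has this shape (values illustrative, not real data):
#
#   {"date": date(2024, 1, 1), "repo_type": "space",
#    "repo_id": "someuser/some-space", "repo_notebook_count": 2}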


def yield_notebooks(exclude_users: Optional[Set[str]] = None):
    """Yield notebook filenames and like counts for repos containing notebooks."""
    for repo_type, repo in concat(
        [
            yield_models(exclude_users=exclude_users),
            yield_spaces(exclude_users=exclude_users),
        ]
    ):
        files = (f.rfilename for f in repo.siblings)
        if jupyter_notebooks := [f for f in files if Path(f).suffix == ".ipynb"]:
            yield {
                "repo_type": repo_type,
                "repo_id": repo.id,
                "repo_notebook_count": len(jupyter_notebooks),
                "jupyter_notebooks": jupyter_notebooks,
                "likes": repo.likes,
            }


@lru_cache(maxsize=None)
def _get_top_liked_repos_with_notebooks(exclude_users: Optional[Tuple[str, ...]] = None):
    df = pl.LazyFrame(yield_notebooks(exclude_users=exclude_users))
    return df.sort("likes", descending=True).collect()


def get_top_liked_repos_with_notebooks(exclude_users: Optional[Set[str]] = None):
    # Sets are unhashable, so convert to a tuple before hitting the cached helper.
    exclude_users = tuple(exclude_users) if exclude_users else None
    return _get_top_liked_repos_with_notebooks(exclude_users)
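
# Illustrative usage: the first call crawls the Hub; repeat calls with the
# same exclusions are served from the lru_cache.
#
#   df = get_top_liked_repos_with_notebooks({"gradio"})  # slow: crawls the Hub
#   df = get_top_liked_repos_with_notebooks({"gradio"})  # fast: cached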


def get_top_k_notebooks_by_repo_type(repo_type: str = "space", k: int = 50):
    """Return the k most-liked repos of the given type that contain notebooks."""
    df = get_top_liked_repos_with_notebooks({"gradio"})
    return df.filter(pl.col("repo_type") == repo_type).head(k).to_pandas()
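
# Example (illustrative): the ten most-liked Spaces that ship a notebook.
#
#   top_spaces = get_top_k_notebooks_by_repo_type("space", 10)
#   print(top_spaces[["repo_id", "likes"]])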


def raw_current_notebook_dataframe():
    df = pl.DataFrame(
        yield_notebooks_counts(exclude_users={"gradio", "gradio-pr-deploys"})
    )
    return df.to_pandas()


def update_stats():
    """Crawl the Hub, push refreshed notebook-count datasets, and return frames for display."""
    df = pl.LazyFrame(
        yield_notebooks_counts(exclude_users={"gradio", "gradio-pr-deploys"})
    )
    # Split "user/repo" ids into separate user and repo_id columns.
    df = (
        df.with_columns(pl.col("repo_id").str.split_exact("/", 1))
        .unnest("repo_id")
        .rename({"field_0": "user", "field_1": "repo_id"})
    )
    # Append today's raw counts to the historical dataset and deduplicate.
    previous_raw_df = pl.DataFrame(
        load_dataset(
            "davanstrien/notebooks_on_the_hub_raw",
            split="train",
            verification_mode="no_checks",
        ).data.table
    )
    final_raw_df = pl.concat([previous_raw_df, df.collect()]).unique()
    Dataset(final_raw_df.to_arrow()).push_to_hub(
        "davanstrien/notebooks_on_the_hub_raw", token=HF_TOKEN
    )
    # Per-user notebook totals.
    by_user_count = (
        df.groupby("user")
        .agg(pl.col("repo_notebook_count").sum())
        .sort("repo_notebook_count", descending=True)
        .collect()
    )
    # Mean notebooks per user (computed but not currently shown in the UI).
    by_user_count.mean().select(
        pl.col("repo_notebook_count").alias("mean notebooks per user")
    )
    ds = Dataset(by_user_count.to_arrow())
    ds.push_to_hub("davanstrien/notebooks_by_user", token=HF_TOKEN)
    # Notebook totals by repo type, stamped with today's date.
    grouped = df.groupby("repo_type").agg(pl.col("repo_notebook_count").sum())
    final_df = grouped.with_columns(pl.lit(date.today()).alias("date")).collect()
    previous_df = pl.DataFrame(
        load_dataset(
            "davanstrien/notebooks_by_repo_type",
            split="train",
            verification_mode="no_checks",
        ).data.table
    )
    final_df = pl.concat([previous_df, final_df]).unique()
    # Keep only the latest row per date for each repo type.
    spaces = final_df.filter(pl.col("repo_type") == "space").unique(
        subset=["date"], keep="last"
    )
    models = final_df.filter(pl.col("repo_type") == "model").unique(
        subset=["date"], keep="last"
    )
    final_df = pl.concat([spaces, models]).unique()
    Dataset(final_df.to_arrow()).push_to_hub(
        "davanstrien/notebooks_by_repo_type", token=HF_TOKEN
    )
    final_df = final_df.sort("date")
    pandas_df = final_df.to_pandas()
    return pandas_df, final_df, final_raw_df


with gr.Blocks() as demo:
    with gr.Tab("Notebooks on the Hub stats"):
        gr.Markdown("# Notebooks on the Hub (updated daily)")
        pandas_df, final_df, final_raw_df = update_stats()
        gr.Markdown("## Notebooks on the Hub over time")
        gr.Plot(px.line(pandas_df, x="date", y="repo_notebook_count", color="repo_type"))
        gr.Markdown("## Notebooks on the Hub (total by date)")
        gr.DataFrame(
            final_df.select(pl.col(["date", "repo_notebook_count"]))
            .groupby("date")
            .sum()
            .sort("date")
            .to_pandas()
        )
        gr.Markdown("## Top repos with notebooks, by likes")
        gr.Markdown("#### Top 10 Spaces")
        gr.DataFrame(get_top_k_notebooks_by_repo_type("space", 10)[["repo_id", "likes"]])
        gr.Markdown("#### Top 10 Models")
        gr.DataFrame(get_top_k_notebooks_by_repo_type("model", 10)[["repo_id", "likes"]])
    with gr.Tab("raw data"):
        gr.Markdown("## Notebooks on the Hub raw data")
        gr.DataFrame(final_raw_df.to_pandas())

demo.launch(debug=True)