# Notebooks on the Hub — Hugging Face Space app (author: davanstrien)
import gradio as gr
from huggingface_hub import list_models, list_spaces
from pathlib import Path
from toolz import concat
from datasets import Dataset
import polars as pl
from datetime import date
from datasets import load_dataset
import plotly.express as px
import os
from typing import Optional, Set, Tuple
from functools import lru_cache
# Token used to push the daily stats datasets back to the Hub.
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    # Fail fast with an explicit error: `assert` is stripped under `python -O`,
    # so it must not be used to validate required configuration.
    raise RuntimeError("The HF_TOKEN environment variable must be set")
def yield_models(exclude_users: Optional[Set[str]] = None):
    """Yield ``("model", info)`` tuples for every model on the Hub.

    Args:
        exclude_users: optional set of author names whose repos are skipped.
    """
    # `list_models` already returns an iterable; wrapping it in `iter()` was redundant.
    for model in list_models(full=True):
        if (
            exclude_users is not None
            and model.author is not None
            and model.author in exclude_users
        ):
            continue
        yield "model", model
def yield_spaces(exclude_users: Optional[Set[str]] = None):
    """Yield ``("space", info)`` tuples for every Space on the Hub.

    Args:
        exclude_users: optional set of author names whose repos are skipped.
    """
    # `list_spaces` already returns an iterable; wrapping it in `iter()` was redundant.
    for space in list_spaces(full=True):
        if exclude_users is not None and space.author and space.author in exclude_users:
            continue
        yield "space", space
def yield_notebooks_counts(exclude_users: Optional[Set[str]] = None):
    """Yield one dated record per Hub repo that contains at least one notebook.

    Each record holds today's date, the repo type ("model"/"space"), the repo
    id, and how many ``.ipynb`` files the repo contains.
    """
    all_repos = concat(
        [
            yield_models(exclude_users=exclude_users),
            yield_spaces(exclude_users=exclude_users),
        ]
    )
    for repo_type, repo in all_repos:
        notebooks = [
            sibling.rfilename
            for sibling in repo.siblings
            if Path(sibling.rfilename).suffix == ".ipynb"
        ]
        if not notebooks:
            continue
        yield {
            "date": date.today(),
            "repo_type": repo_type,
            "repo_id": repo.id,
            "repo_notebook_count": len(notebooks),
        }
def yield_notebooks(exclude_users: Optional[Set[str]] = None):
    """Yield one record per notebook-bearing Hub repo, including like counts.

    Unlike :func:`yield_notebooks_counts`, the record also carries the list of
    notebook filenames and the repo's like count (no date column).
    """
    all_repos = concat(
        [
            yield_models(exclude_users=exclude_users),
            yield_spaces(exclude_users=exclude_users),
        ]
    )
    for repo_type, repo in all_repos:
        notebooks = [
            sibling.rfilename
            for sibling in repo.siblings
            if Path(sibling.rfilename).suffix == ".ipynb"
        ]
        if not notebooks:
            continue
        yield {
            "repo_type": repo_type,
            "repo_id": repo.id,
            "repo_notebook_count": len(notebooks),
            "jupyter_notebooks": notebooks,
            "likes": repo.likes,
        }
@lru_cache
def _get_top_liked_repos_with_notebooks(exclude_users: Optional[Tuple[str, ...]] = None):
    """Collect every notebook-bearing repo, sorted by likes (descending).

    `exclude_users` must be hashable so `lru_cache` can key on it — hence a
    tuple rather than a set.  The annotation is ``Tuple[str, ...]`` (variable
    length); the previous ``Tuple[str]`` incorrectly declared a 1-tuple.
    """
    frame = pl.LazyFrame(yield_notebooks(exclude_users=exclude_users))
    return frame.sort("likes", descending=True).collect()
def get_top_liked_repos_with_notebooks(exclude_users: Optional[Set[str]] = None):
    """Set-friendly wrapper around the cached implementation.

    The set is converted to a *sorted* tuple so equal sets always produce the
    same hashable cache key.  Plain ``tuple(exclude_users)`` depends on set
    iteration order, which is arbitrary, so equal sets could previously miss
    the ``lru_cache`` and recompute the whole Hub scan.
    """
    cache_key = tuple(sorted(exclude_users)) if exclude_users else None
    return _get_top_liked_repos_with_notebooks(cache_key)
def get_top_k_notebooks_by_repo_type(type: str = "space", k: int = 50):
    """Return the top-`k` most-liked notebook repos of the given repo type.

    NOTE(review): the parameter name `type` shadows the builtin, but renaming
    it would change the keyword interface for callers, so it is kept.
    """
    ranked = get_top_liked_repos_with_notebooks({"gradio"})
    of_type = ranked.filter(pl.col("repo_type") == type)
    return of_type.head(k).to_pandas()
def raw_current_notebook_dataframe():
    """Build today's raw per-repo notebook counts as a pandas DataFrame."""
    records = yield_notebooks_counts(exclude_users={"gradio", "gradio-pr-deploys"})
    return pl.DataFrame(records).to_pandas()
def update_stats():
    """Recompute notebook statistics, push them to the Hub, and return frames.

    Returns:
        A 3-tuple ``(pandas_df, final_df, final_raw_df)``:
            pandas_df    - per-repo-type daily totals as pandas, sorted by date
                           (input for the time-series plot)
            final_df     - the same totals as a polars DataFrame
            final_raw_df - deduplicated per-repo raw history

    Side effects:
        Pushes three datasets to the Hub (``notebooks_on_the_hub_raw``,
        ``notebooks_by_user``, ``notebooks_by_repo_type``) using ``HF_TOKEN``.
    """
    df = pl.LazyFrame(
        yield_notebooks_counts(exclude_users={"gradio", "gradio-pr-deploys"})
    )
    # Split "user/repo" ids into separate `user` and `repo_id` columns.
    df = (
        df.with_columns(pl.col("repo_id").str.split_exact("/", 1))
        .unnest("repo_id")
        .rename({"field_0": "user", "field_1": "repo_id"})
    )
    # Append today's raw rows to the historical raw dataset and dedupe.
    previous_raw_df = pl.DataFrame(
        load_dataset(
            "davanstrien/notebooks_on_the_hub_raw",
            split="train",
            verification_mode="no_checks",
        ).data.table
    )
    final_raw_df = pl.concat([previous_raw_df, df.collect()]).unique()
    Dataset(final_raw_df.to_arrow()).push_to_hub(
        "davanstrien/notebooks_on_the_hub_raw", token=HF_TOKEN
    )
    # Per-user notebook totals, most prolific users first.
    by_user_count = (
        df.groupby("user")
        .agg(pl.col("repo_notebook_count").sum())
        .sort("repo_notebook_count", descending=True)
        .collect()
    )
    # (A dead `by_user_count.mean().select(...)` expression whose result was
    # discarded has been removed here — it had no side effects.)
    ds = Dataset(by_user_count.to_arrow())
    ds.push_to_hub("davanstrien/notebooks_by_user", token=HF_TOKEN)
    # Per-repo-type totals, stamped with today's date.
    grouped = df.groupby("repo_type").agg(pl.col("repo_notebook_count").sum())
    final_df = grouped.with_columns(pl.lit(date.today()).alias("date")).collect()
    previous_df = pl.DataFrame(
        load_dataset(
            "davanstrien/notebooks_by_repo_type",
            split="train",
            verification_mode="no_checks",
        ).data.table
    )
    final_df = pl.concat([previous_df, final_df]).unique()
    # If the job ran more than once on the same date, keep only the last row
    # per (repo_type, date) pair.
    spaces = final_df.filter(pl.col("repo_type") == "space").unique(
        subset=["date"], keep="last"
    )
    models = final_df.filter(pl.col("repo_type") == "model").unique(
        subset=["date"], keep="last"
    )
    final_df = pl.concat([spaces, models]).unique()
    Dataset(final_df.to_arrow()).push_to_hub(
        "davanstrien/notebooks_by_repo_type", token=HF_TOKEN
    )
    final_df = final_df.sort("date")
    pandas_df = final_df.to_pandas()
    return pandas_df, final_df, final_raw_df
# --- Gradio UI ---------------------------------------------------------------
# Components render in statement order, so the layout below is order-sensitive.
# NOTE(review): update_stats() runs at import time, so building the app hits
# the Hub network and pushes datasets before the UI is served.
with gr.Blocks() as demo:
    with gr.Tab("Notebooks on the Hub stats"):
        gr.Markdown("# Notebooks on the Hub (updated daily)")
        # Recompute stats once on startup; also pushes updated datasets to the Hub.
        pandas_df, final_df, final_raw_df = update_stats()
        gr.Markdown("## Notebooks on the Hub over time")
        # Time series of notebook counts, one line per repo type.
        gr.Plot(px.line(pandas_df, x="date", y="repo_notebook_count", color="repo_type"))
        gr.Markdown("## Notebooks on the Hub (total by date)")
        # Combined (models + spaces) notebook totals per day.
        gr.DataFrame(
            final_df.select(pl.col(["date", "repo_notebook_count"]))
            .groupby("date")
            .sum()
            .sort("date")
            .to_pandas()
        )
        gr.Markdown("## Top Repos by likes with notebooks")
        gr.Markdown("#### Top 10 Spaces")
        gr.DataFrame(get_top_k_notebooks_by_repo_type("space", 10)[["repo_id", "likes"]])
        gr.Markdown("#### Top 10 Models")
        gr.DataFrame(get_top_k_notebooks_by_repo_type("model", 10)[["repo_id", "likes"]])
        # repo_type.update(get_top_k_notebooks_by_repo_type, [repo_type, k],[df])
    with gr.Tab("raw data"):
        gr.Markdown("## Notebooks on the Hub raw data")
        gr.DataFrame(final_raw_df.to_pandas())
demo.launch(debug=True)