Spaces:

davanstrien
/

notebooks_on_the_hub

Runtime error

App Files Files Community

notebooks_on_the_hub / app.py

davanstrien HF Staff

Upload 4 files

c4fe3e2 about 2 years ago

raw

history blame

3.1 kB

	import gradio as gr

	from huggingface_hub import list_models, list_spaces
	from pathlib import Path
	from toolz import concat
	from datasets import Dataset
	import polars as pl
	from datetime import date
	from datasets import load_dataset
	import plotly.express as px
	import os

	# Hub access token used when pushing the statistics datasets.
	HF_TOKEN = os.getenv("HF_TOKEN")
	if not HF_TOKEN:
	    # Raise explicitly instead of using `assert`: assertions are stripped
	    # under `python -O`, which would let the app start without a token.
	    raise RuntimeError("The HF_TOKEN environment variable must be set.")


	def yield_models():
	    """Yield a ``("model", info)`` pair for each entry returned by ``list_models``."""
	    # The listing is requested with full=True and streamed lazily.
	    yield from (("model", info) for info in list_models(full=True))


	def yield_spaces():
	    """Yield a ``("space", info)`` pair for each entry returned by ``list_spaces``."""
	    # The listing is requested with full=True and streamed lazily.
	    yield from (("space", info) for info in list_spaces(full=True))


	def yield_notebooks():
	    """Yield one record per model or Space repo that contains Jupyter notebooks.

	    Walks every repo produced by ``yield_models()`` followed by
	    ``yield_spaces()`` and counts the files whose suffix is ``.ipynb``.
	    Repos without any notebook are skipped.

	    Yields:
	        dict with keys ``"date"`` (today's date), ``"repo_type"``
	        (``"model"`` or ``"space"``), ``"repo_id"`` and
	        ``"repo_notebook_count"`` (number of ``.ipynb`` files in the repo).
	    """
	    for repo_type, repo in concat([yield_models(), yield_spaces()]):
	        # `siblings` is optional on the Hub info objects; treat a missing
	        # file listing as "no files" instead of failing on iterating None.
	        notebooks = [
	            sibling.rfilename
	            for sibling in (repo.siblings or [])
	            if Path(sibling.rfilename).suffix == ".ipynb"
	        ]
	        if notebooks:
	            yield {
	                "date": date.today(),
	                "repo_type": repo_type,
	                "repo_id": repo.id,
	                "repo_notebook_count": len(notebooks),
	            }


	def update_stats():
	    """Recount notebooks on the Hub, publish the results, and return the history.

	    Steps:
	      1. Build a lazy frame from ``yield_notebooks()`` and split ``repo_id``
	         on the first ``/`` into ``user`` and ``repo_id`` columns.
	      2. Sum notebook counts per user and push them to
	         ``davanstrien/notebooks_by_user``.
	      3. Sum notebook counts per repo type for today, merge them with the
	         history stored in ``davanstrien/notebooks_by_repo_type``, keep one
	         row per (repo type, date) and push the merged history back.

	    Returns:
	        tuple: ``(pandas_df, final_df)`` -- the merged per-repo-type history
	        sorted by date, as a pandas DataFrame and as a polars DataFrame.
	    """
	    df = pl.LazyFrame(yield_notebooks())

	    # split_exact("/", 1) produces a struct with field_0 / field_1.
	    # NOTE(review): an id without "/" would end up entirely in `user` with a
	    # null `repo_id` -- confirm whether such ids should be counted as users.
	    df = (
	        df.with_columns(pl.col("repo_id").str.split_exact("/", 1))
	        .unnest("repo_id")
	        .rename({"field_0": "user", "field_1": "repo_id"})
	    )

	    by_user_count = (
	        df.groupby("user")
	        .agg(pl.col("repo_notebook_count").sum())
	        .sort("repo_notebook_count", descending=True)
	        .collect()
	    )
	    Dataset(by_user_count.to_arrow()).push_to_hub(
	        "davanstrien/notebooks_by_user", token=HF_TOKEN
	    )

	    todays_counts = (
	        df.groupby("repo_type")
	        .agg(pl.col("repo_notebook_count").sum())
	        .with_columns(pl.lit(date.today()).alias("date"))
	        .collect()
	    )
	    previous_df = pl.DataFrame(
	        load_dataset("davanstrien/notebooks_by_repo_type", split="train").data.table
	    )

	    # Keep exactly one row per (repo_type, date). Today's counts are appended
	    # after the stored history and row order is preserved, so keep="last"
	    # deterministically prefers the freshest count when the app runs more
	    # than once on the same day. Only "space" and "model" rows are retained.
	    final_df = (
	        pl.concat([previous_df, todays_counts])
	        .filter(pl.col("repo_type").is_in(["space", "model"]))
	        .unique(subset=["repo_type", "date"], keep="last", maintain_order=True)
	    )
	    Dataset(final_df.to_arrow()).push_to_hub(
	        "davanstrien/notebooks_by_repo_type", token=HF_TOKEN
	    )

	    final_df = final_df.sort("date")
	    pandas_df = final_df.to_pandas()
	    return pandas_df, final_df


	# Build the dashboard. update_stats() is called while the layout is being
	# constructed, so the statistics are refreshed each time this script starts.
	with gr.Blocks() as demo:
	    gr.Markdown("# Notebooks on the Hub (updated daily)")
	    pandas_df, final_df = update_stats()

	    # Line chart: one line per repo type, notebook count over time.
	    gr.Markdown("## Notebooks on the Hub over time")
	    history_figure = px.line(
	        pandas_df, x="date", y="repo_notebook_count", color="repo_type"
	    )
	    gr.Plot(history_figure)

	    # Table: notebook counts summed across repo types for each date.
	    gr.Markdown("## Notebooks on the Hub (total by date)")
	    totals_by_date = (
	        final_df.select(pl.col(["date", "repo_notebook_count"]))
	        .groupby("date")
	        .sum()
	        .sort("date")
	        .to_pandas()
	    )
	    gr.DataFrame(totals_by_date)

	    # Table: the unaggregated per-repo-type history.
	    gr.Markdown("## Notebooks on the Hub raw data")
	    gr.DataFrame(pandas_df)


	demo.launch(debug=True)