from datetime import datetime, timedelta
from functools import lru_cache
import os
from typing import Any, List

import gradio as gr
import pandas as pd
import plotly.express as px
import polars as pl
from cachetools import TTLCache, cached
from datasets import Dataset, load_dataset
from dotenv import load_dotenv
from httpx import Client
from toolz import concat, frequencies
from tqdm.auto import tqdm

load_dotenv()

# Required environment variables (typically set via a .env file):
# HUGGINGFACE_TOKEN - Hub token used for API auth and dataset pushes
# USER_AGENT        - user-agent string sent with API requests
# USER_TO_TRACK     - Hub user whose discussion/PR activity is tracked
token = os.environ["HUGGINGFACE_TOKEN"]
user_agent = os.environ["USER_AGENT"]
user = os.environ["USER_TO_TRACK"]
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

assert token
assert user_agent
assert user

headers = {"user-agent": user_agent, "authorization": f"Bearer {token}"}
client = Client(headers=headers, http2=True)


def get_hub_community_activity(user: str) -> List[Any]:
    """Page through the Hub's recent-activity API until no events remain."""
    with tqdm() as pbar:
        all_data = []  # one entry per page of up to 100 events
        i = 1
        while True:
            r = client.get(
                f"https://huggingface.co/api/recent-activity?limit=100&type=discussion&skip={i}&user={user}",
            )
            activity = r.json()["recentActivity"]
            if not activity:
                break
            all_data.append(activity)
            if len(all_data) % 1000 == 0:
                pbar.write(f"Length of all_data: {len(all_data)}")
            i += 100
            pbar.update(100)
    return list(concat(all_data))


# Earlier fixed-range implementation, kept for reference:
# def get_hub_community_activity(user: str) -> List[Any]:
#     all_data = []
#     for i in range(1, 2000, 100):
#         r = httpx.get(
#             f"https://huggingface.co/api/recent-activity?limit=100&type=discussion&skip={i}&user={user}"
#         )
#         activity = r.json()["recentActivity"]
#         all_data.append(activity)
#     return list(concat(all_data))


def parse_date_time(date_time: str) -> datetime:
    return datetime.strptime(date_time, "%Y-%m-%dT%H:%M:%S.%fZ")


def parse_pr_data(data):
    """Flatten one recent-activity event into a row for a DataFrame."""
    data = data["discussionData"]
    createdAt = parse_date_time(data["createdAt"])
    pr_number = data["num"]
    status = data["status"]
    repo_id = data["repo"]["name"]
    repo_type = data["repo"]["type"]
    isPullRequest = data["isPullRequest"]
    return {
        "createdAt": createdAt,
        "pr_number": pr_number,
        "status": status,
        "repo_id": repo_id,
        "type": repo_type,
        "isPullRequest": isPullRequest,
    }


@cached(cache=TTLCache(maxsize=1000, ttl=timedelta(minutes=30), timer=datetime.now))
def update_data():
    """Merge fresh activity with the previously pushed stats dataset."""
    try:
        previous_df = pl.DataFrame(
            load_dataset(f"librarian-bot/{user}-stats", split="train").data.table
        )
    except FileNotFoundError:
        # First run: no stats dataset exists on the Hub yet.
        previous_df = None
    data = get_hub_community_activity(user)
    data = [parse_pr_data(d) for d in data]
    update_df = pl.DataFrame(data)
    # Concatenating with an empty, schema-less DataFrame fails in polars,
    # so only concat when a previous dataset was actually loaded.
    df = update_df if previous_df is None else pl.concat([previous_df, update_df])
    df = df.unique()
    if previous_df is None or len(df) != len(previous_df):
        Dataset(df.to_arrow()).push_to_hub(f"{user}-stats", token=token)
    return df


# Earlier polars-based implementation, kept for reference:
# def get_pr_status():
#     df = update_data()
#     df = df.filter(pl.col("isPullRequest") is True)
#     return df.select(pl.col("status").value_counts())
#     # return frequencies(x["status"] for x in pr_data)


@lru_cache(maxsize=512)
def get_pr_status(user: str):
    all_data = get_hub_community_activity(user)
    pr_data = (
        x["discussionData"] for x in all_data if x["discussionData"]["isPullRequest"]
    )
    return frequencies(x["status"] for x in pr_data)


def create_pie():
    # Local name renamed so it no longer shadows toolz.frequencies.
    status_counts = get_pr_status(user)
    df = pd.DataFrame(
        {"status": status_counts.keys(), "number": status_counts.values()}
    )
    return px.pie(df, values="number", names="status", template="seaborn")


def group_status_by_pr_number():
    all_data = get_hub_community_activity(user)
    all_data = [parse_pr_data(d) for d in all_data]
    return (
        pl.DataFrame(all_data).groupby("status").agg(pl.mean("pr_number")).to_pandas()
    )
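
# plot_over_time pivots the parsed events into one column per status
# ("open", "closed", "merged") with a per-day count, then plots the
# cumulative totals. Note that the pivot/groupby/pl.sum calls in this
# file use the pre-1.0 polars API and assume all three statuses occur
# at least once in the data.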
def plot_over_time():
    all_data = get_hub_community_activity(user)
    all_data = [parse_pr_data(d) for d in all_data]
    df = pl.DataFrame(all_data).with_columns(pl.col("createdAt").cast(pl.Date))
    df = df.pivot(
        values=["status"],
        index=["createdAt"],
        columns=["status"],
        aggregate_function="count",
    )
    df = df.fill_null(0)
    # pl.sum over a list of columns sums them horizontally into a "sum" column.
    df = df.with_columns(pl.sum(["open", "closed", "merged"])).sort("createdAt")
    df = df.to_pandas().set_index("createdAt").cumsum()
    return px.line(df, x=df.index, y=[c for c in df.columns if c != "sum"])


create_pie()  # warm the activity/status caches before building the UI

with gr.Blocks() as demo:
    # frequencies = get_pr_status("librarian-bot")
    gr.Markdown(f"# {user} PR Stats")
    gr.Markdown(f"Total PRs and issues opened by {user}: {len(update_data()):,}")
    # gr.Markdown(f"Total PRs opened: {sum(frequencies.values())}")
    with gr.Column():
        gr.Markdown("## Pull request status")
        gr.Markdown(
            "The pie chart below shows the percentage of pull requests made by"
            f" {user} that are open, closed, or merged."
        )
        gr.Plot(create_pie())
    with gr.Column():
        gr.Markdown("Pull requests opened, closed, and merged over time (cumulative)")
        gr.Plot(plot_over_time())
    with gr.Column():
        gr.Markdown("## Pull request status by PR number")
        gr.DataFrame(group_status_by_pr_number())

demo.launch(debug=True)