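"""Gradio dashboard tracking the pull request and issue activity of a
Hugging Face Hub user (read from the USER_TO_TRACK environment variable)
via the Hub's recent-activity API, with Plotly charts for PR status."""
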
from datetime import datetime, timedelta
from functools import lru_cache
import os
from typing import Any, List

import gradio as gr
import httpx
import pandas as pd
import plotly.express as px
import polars as pl
from cachetools import TTLCache, cached
from datasets import Dataset, load_dataset
from dotenv import load_dotenv
from toolz import concat, frequencies
from tqdm.auto import tqdm


load_dotenv()

# os.environ[...] raises KeyError if a variable is unset; the asserts also
# guard against empty values.
token = os.environ["HUGGINGFACE_TOKEN"]
user_agent = os.environ["USER_AGENT"]
user = os.environ["USER_TO_TRACK"]
assert token
assert user_agent
assert user

headers = {"user-agent": user_agent, "authorization": f"Bearer {token}"}


def get_hub_community_activity(user: str) -> List[Any]:
    """Page through the Hub's recent-activity API for ``user``.

    The endpoint returns at most 100 discussion events per request, so keep
    requesting with an increasing ``skip`` offset until a page comes back empty.
    """
    all_data = []
    skip = 0  # start at 0 so the first event isn't skipped
    with tqdm() as pbar:
        while True:
            r = httpx.get(
                f"https://huggingface.co/api/recent-activity?limit=100&type=discussion&skip={skip}&user={user}",
                headers=headers,
            )
            activity = r.json()["recentActivity"]
            if not activity:
                break
            all_data.append(activity)
            if len(all_data) % 10 == 0:  # roughly every 1,000 events (10 pages)
                pbar.write(f"Fetched {len(all_data)} pages so far")
            skip += 100
            pbar.update(100)
    # all_data is a list of pages; flatten into a single list of events
    return list(concat(all_data))
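
# Usage sketch (makes live network calls; assumes HUGGINGFACE_TOKEN and
# USER_AGENT are set, and uses the librarian-bot account named elsewhere
# in this file):
#   events = get_hub_community_activity("librarian-bot")
#   print(len(events), events[0]["discussionData"]["status"])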



def parse_date_time(date_time: str) -> datetime:
    return datetime.strptime(date_time, "%Y-%m-%dT%H:%M:%S.%fZ")
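
# Quick example (the format string accepts millisecond precision via %f):
#   parse_date_time("2023-01-15T12:30:45.123Z")
#   -> datetime(2023, 1, 15, 12, 30, 45, 123000)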


def parse_pr_data(data):
    """Flatten one recent-activity event into a row of PR/issue metadata."""
    data = data["discussionData"]
    createdAt = parse_date_time(data["createdAt"])
    pr_number = data["num"]
    status = data["status"]
    repo_id = data["repo"]["name"]
    repo_type = data["repo"]["type"]
    isPullRequest = data["isPullRequest"]
    return {
        "createdAt": createdAt,
        "pr_number": pr_number,
        "status": status,
        "repo_id": repo_id,
        "type": repo_type,
        "isPullRequest": isPullRequest,
    }
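
# Shape of the payload parse_pr_data expects, inferred from the fields it
# accesses above (the real API response may carry additional keys):
#
#   {
#       "discussionData": {
#           "createdAt": "2023-01-15T12:30:45.123Z",
#           "num": 42,
#           "status": "merged",
#           "repo": {"name": "some-user/some-repo", "type": "dataset"},
#           "isPullRequest": True,
#       }
#   }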


@cached(cache=TTLCache(maxsize=1000, ttl=timedelta(minutes=30), timer=datetime.now))
def update_data():
    """Merge freshly fetched activity into the Hub stats dataset.

    Results are memoized for 30 minutes: pairing ``timer=datetime.now`` with a
    ``timedelta`` ttl is the cachetools pattern for wall-clock expiry.
    """
    try:
        previous_df = pl.DataFrame(
            load_dataset(f"librarian-bot/{user}-stats", split="train").data.table
        )
    except FileNotFoundError:
        # No snapshot pushed yet (newer `datasets` versions may raise
        # DatasetNotFoundError instead)
        previous_df = None
    data = get_hub_community_activity(user)
    data = [parse_pr_data(d) for d in data]
    update_df = pl.DataFrame(data)
    # pl.concat fails on a schemaless empty frame, so only concat when a
    # previous snapshot was actually loaded
    df = update_df if previous_df is None else pl.concat([previous_df, update_df]).unique()
    if previous_df is None or len(df) != len(previous_df):
        # Push only when the merged data actually changed
        Dataset(df.to_arrow()).push_to_hub(f"{user}-stats", token=token)
    return df


@lru_cache(maxsize=512)
def get_pr_status(user: str):
    all_data = get_hub_community_activity(user)
    pr_data = (
        x["discussionData"] for x in all_data if x["discussionData"]["isPullRequest"]
    )
    return frequencies(x["status"] for x in pr_data)


def create_pie():
    status_counts = get_pr_status(user)  # renamed to avoid shadowing toolz.frequencies
    df = pd.DataFrame({"status": status_counts.keys(), "number": status_counts.values()})
    return px.pie(df, values="number", names="status", template="seaborn")


def group_status_by_pr_number():
    """Average PR/issue number per status (note: `groupby` is `group_by` in newer Polars)."""
    all_data = get_hub_community_activity(user)
    all_data = [parse_pr_data(d) for d in all_data]
    return (
        pl.DataFrame(all_data).groupby("status").agg(pl.mean("pr_number")).to_pandas()
    )


def plot_over_time():
    """Cumulative count of PRs/issues per status over time."""
    all_data = get_hub_community_activity(user)
    all_data = [parse_pr_data(d) for d in all_data]
    df = pl.DataFrame(all_data).with_columns(pl.col("createdAt").cast(pl.Date))
    # Pivot to one column per status, counting events per day
    df = df.pivot(
        values=["status"],
        index=["createdAt"],
        columns=["status"],
        aggregate_function="count",
    )
    df = df.fill_null(0)
    # Horizontal sum across the status columns, producing a "sum" column
    # (newer Polars spells this pl.sum_horizontal)
    df = df.with_columns(pl.sum(["open", "closed", "merged"])).sort("createdAt")
    df = df.to_pandas().set_index("createdAt").cumsum()
    return px.line(df, x=df.index, y=[c for c in df.columns if c != "sum"])
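
# With statuses open/closed/merged, the pivoted frame looks roughly like:
#
#   createdAt  | open | closed | merged | sum
#   2023-01-01 |    2 |      0 |      1 |   3
#
# cumsum() then turns the per-day counts into running totals for the line plot.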


# Warm get_pr_status's lru_cache before the UI builds (gr.Plot below calls
# create_pie again)
create_pie()

with gr.Blocks() as demo:
    gr.Markdown(f"# {user} PR Stats")
    gr.Markdown(f"Total PRs and issues opened by {user}: {len(update_data()):,}")
    with gr.Column():
        gr.Markdown("## Pull request status")
        gr.Markdown(
            "The pie chart below shows the share of pull requests made by"
            f" {user} that are open, closed, or merged."
        )
        gr.Plot(create_pie())
    with gr.Column():
        gr.Markdown(
            "## Pull requests opened, closed, and merged over time (cumulative)"
        )
        gr.Plot(plot_over_time())
    with gr.Column():
        gr.Markdown("## Pull request status by PR number")
        gr.DataFrame(group_status_by_pr_number())

demo.launch(debug=True)