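"""Gradio app tracking the pull requests and discussions a Hugging Face user
has opened on the Hub. Activity is fetched from the Hub API, cached as a Hub
dataset, and summarised with plotly charts."""
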
import os
from datetime import datetime, timedelta
from functools import lru_cache
from typing import Any, List
import gradio as gr
import httpx
import pandas as pd
import plotly.express as px
import polars as pl
from cachetools import TTLCache, cached
from datasets import Dataset, load_dataset
from dotenv import load_dotenv
from httpx import Client
from toolz import concat, frequencies
from tqdm.auto import tqdm
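
# Configuration comes from environment variables (.env is loaded for local runs).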
load_dotenv()
token = os.environ["HUGGINGFACE_TOKEN"]
user_agent = os.environ["USER_AGENT"]
user = os.environ["USER_TO_TRACK"]
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
assert token
assert user_agent
assert user
headers = {"user-agent": user_agent, "authorization": f"Bearer {token}"}
limits = httpx.Limits(max_keepalive_connections=10, max_connections=20)
client = Client(headers=headers, limits=limits, timeout=120.0)
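
# Page through the Hub's recent-activity API, 100 items at a time, until an
# empty page signals the end of the user's discussion/PR history.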
@lru_cache(maxsize=None)
def get_hub_community_activity(user: str) -> List[Any]:
    with tqdm() as pbar:
        all_data = []
        i = 0  # skip must start at 0 or the first activity is silently dropped
        while True:
            r = client.get(
                f"https://huggingface.co/api/recent-activity?limit=100&activityType=discussion&skip={i}&entity={user}&feedType=user",
            )
            activity = r.json()["recentActivity"]
            if not activity:
                break
            all_data.append(activity)
            if len(all_data) % 10 == 0:  # all_data holds pages of up to 100 items
                pbar.write(f"Fetched roughly {len(all_data) * 100} activities")
            i += 100
            pbar.update(100)
    return list(concat(all_data))

def parse_date_time(date_time: str) -> datetime:
    return datetime.strptime(date_time, "%Y-%m-%dT%H:%M:%S.%fZ")
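
# Flatten a raw activity record into the columns used by the dataframes below.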
def parse_pr_data(data):
    data = data["discussionData"]
    return {
        "createdAt": parse_date_time(data["createdAt"]),
        "pr_number": data["num"],
        "status": data["status"],
        "repo_id": data["repo"]["name"],
        "type": data["repo"]["type"],
        "isPullRequest": data["isPullRequest"],
    }
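
# Refresh the stats at most every 30 minutes; new rows are pushed back to a
# Hub dataset so the history survives Space restarts.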
@cached(cache=TTLCache(maxsize=1000, ttl=timedelta(minutes=30), timer=datetime.now))
def update_data():
    try:
        previous_df = pl.DataFrame(
            load_dataset(f"librarian-bot/{user}-stats", split="train").data.table
        )
    except FileNotFoundError:
        previous_df = pl.DataFrame()
    data = get_hub_community_activity(user)
    data = [d for d in data if d.get("discussionData") is not None]
    update_df = pl.DataFrame([parse_pr_data(d) for d in data])
    # An empty previous_df has no schema, so only concat when there is history.
    if previous_df.is_empty():
        df = update_df.unique()
    else:
        df = pl.concat([previous_df, update_df]).unique()
    if len(df) != len(previous_df):
        Dataset(df.to_arrow()).push_to_hub(f"{user}-stats", token=token)
    return df
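
# Count pull requests by status ("open", "closed", "merged").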
@lru_cache(maxsize=512)
def get_pr_status(user: str):
    all_data = get_hub_community_activity(user)
    all_data = [d for d in all_data if d.get("discussionData") is not None]
    pr_data = (
        x["discussionData"]
        for x in all_data
        if x["discussionData"].get("isPullRequest", False)
    )
    return frequencies(x["status"] for x in pr_data)
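
# Render the PR status counts as a pie chart.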
def create_pie():
    # Avoid shadowing the `frequencies` function imported from toolz.
    status_counts = get_pr_status(user)
    df = pd.DataFrame(
        {"status": status_counts.keys(), "number": status_counts.values()}
    )
    return px.pie(df, values="number", names="status", template="seaborn")
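
# Mean PR number per status; PR/discussion numbers increase over time within a
# repo, so this gives a rough sense of how status varies with recency.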
def group_status_by_pr_number():
    all_data = get_hub_community_activity(user)
    all_data = [d for d in all_data if d.get("discussionData") is not None]
    all_data = [parse_pr_data(d) for d in all_data]
    return (
        pl.DataFrame(all_data).groupby("status").agg(pl.mean("pr_number")).to_pandas()
    )
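
# Pivot daily status counts into one column per status, then cumulatively sum
# them so the line chart shows running totals over time.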
def plot_over_time():
    all_data = get_hub_community_activity(user)
    all_data = [d for d in all_data if d.get("discussionData") is not None]
    all_data = [parse_pr_data(d) for d in all_data]
    df = pl.DataFrame(all_data).with_columns(pl.col("createdAt").cast(pl.Date))
    df = df.pivot(
        values=["status"],
        index=["createdAt"],
        columns=["status"],
        aggregate_function="count",
    )
    df = df.fill_null(0)
    df = df.with_columns(pl.sum(["open", "closed", "merged"])).sort("createdAt")
    df = df.to_pandas().set_index("createdAt").cumsum()
    return px.line(df, x=df.index, y=[c for c in df.columns if c != "sum"])

create_pie()  # warm the activity caches before the UI is built

with gr.Blocks() as demo:
    gr.Markdown(f"# {user} PR Stats")
    gr.Markdown(f"Total PRs and issues opened by {user}: {len(update_data()):,}")
    with gr.Column():
        gr.Markdown("## Pull request status")
        gr.Markdown(
            "The pie chart below shows the share of pull requests made by"
            f" {user} that are open, closed, or merged."
        )
        gr.Plot(create_pie())
    with gr.Column():
        gr.Markdown(
            "## Pull requests opened, closed, and merged over time (cumulative)"
        )
        gr.Plot(plot_over_time())
    with gr.Column():
        gr.Markdown("## Pull request status by PR number")
        gr.DataFrame(group_status_by_pr_number())

demo.launch(debug=True)