davanstrien HF Staff committed on
Commit
caddeb0
·
1 Parent(s): c39978f
Files changed (1) hide show
  1. app.py +56 -26
app.py CHANGED
@@ -13,23 +13,57 @@ import os
13
  from functools import lru_cache
14
  import pandas as pd
15
  from toolz import frequencies
 
 
 
 
 
16
 
 
 
17
  token = os.environ["HUGGINGFACE_TOKEN"]
 
 
18
  assert token
19
- librarian_bot_avatar = "https://aeiljuispo.cloudimg.io/v7/https://s3.amazonaws.com/moonup/production/uploads/1674830754237-63d3e0e8ff1384ce6c5dd17d.jpeg?w=200&h=200&f=face"
 
 
 
20
 
21
 
22
  def get_hub_community_activity(user: str) -> List[Any]:
23
- all_data = []
24
- for i in range(1, 2000, 100):
25
- r = httpx.get(
26
- f"https://huggingface.co/api/recent-activity?limit=100&type=discussion&skip={i}&user={user}"
27
- )
28
- activity = r.json()["recentActivity"]
29
- all_data.append(activity)
 
 
 
 
 
 
 
 
 
 
 
30
  return list(concat(all_data))
31
 
32
 
 
 
 
 
 
 
 
 
 
 
 
33
  def parse_date_time(date_time: str) -> datetime:
34
  return datetime.strptime(date_time, "%Y-%m-%dT%H:%M:%S.%fZ")
35
 
@@ -54,15 +88,18 @@ def parse_pr_data(data):
54
 
55
  @cached(cache=TTLCache(maxsize=1000, ttl=timedelta(minutes=30), timer=datetime.now))
56
  def update_data():
57
- previous_df = pl.DataFrame(
58
- load_dataset("librarian-bot/stats", split="train").data.table
59
- )
60
- data = get_hub_community_activity("librarian-bot")
 
 
 
61
  data = [parse_pr_data(d) for d in data]
62
  update_df = pl.DataFrame(data)
63
  df = pl.concat([previous_df, update_df]).unique()
64
  if len(df) != len(previous_df):
65
- Dataset(df.to_arrow()).push_to_hub("librarian-bot/stats", token=token)
66
  return df
67
 
68
 
@@ -83,20 +120,13 @@ def get_pr_status(user: str):
83
 
84
 
85
  def create_pie():
86
- frequencies = get_pr_status("librarian-bot")
87
  df = pd.DataFrame({"status": frequencies.keys(), "number": frequencies.values()})
88
  return px.pie(df, values="number", names="status", template="seaborn")
89
 
90
 
91
- # def create_pie():
92
- # df = update_data()
93
- # df = df.filter(pl.col("isPullRequest") is True)
94
- # df = df["status"].value_counts().to_pandas()
95
- # return px.pie(df, values="counts", names="status", template="seaborn")
96
-
97
-
98
  def group_status_by_pr_number():
99
- all_data = get_hub_community_activity("librarian-bot")
100
  all_data = [parse_pr_data(d) for d in all_data]
101
  return (
102
  pl.DataFrame(all_data).groupby("status").agg(pl.mean("pr_number")).to_pandas()
@@ -104,7 +134,7 @@ def group_status_by_pr_number():
104
 
105
 
106
  def plot_over_time():
107
- all_data = get_hub_community_activity("librarian-bot")
108
  all_data = [parse_pr_data(d) for d in all_data]
109
  df = pl.DataFrame(all_data).with_columns(pl.col("createdAt").cast(pl.Date))
110
  df = df.pivot(
@@ -123,11 +153,11 @@ create_pie()
123
 
124
  with gr.Blocks() as demo:
125
  # frequencies = get_pr_status("librarian-bot")
126
- gr.HTML(Path("description.html").read_text())
127
- gr.Markdown(f"Total prs and issues opened by librarian-bot: {len(update_data())}")
128
  # gr.Markdown(f"Total PRs opened: {sum(frequencies.values())}")
129
  with gr.Column():
130
- gr.Markdown("## Pull requests Status")
131
  gr.Markdown(
132
  "The below pie chart shows the percentage of pull requests made by"
133
  " librarian bot that are open, closed or merged"
 
13
  from functools import lru_cache
14
  import pandas as pd
15
  from toolz import frequencies
16
+ from dotenv import load_dotenv
17
+ from typing import List, Any
18
+ from toolz import concat
19
+ import httpx
20
+ from tqdm.auto import tqdm
21
 
22
+
23
# Load settings from a .env file, when one is present, into the environment.
load_dotenv()


def _require_env(name: str) -> str:
    """Return environment variable *name*, rejecting empty values.

    Explicit checks are used instead of ``assert`` because assertions are
    removed when Python runs with ``-O``.

    Raises:
        KeyError: if the variable is not set.
        ValueError: if the variable is set but empty.
    """
    value = os.environ[name]
    if not value:
        raise ValueError(f"Environment variable {name} is set but empty")
    return value


token = _require_env("HUGGINGFACE_TOKEN")
user_agent = _require_env("USER_AGENT")
user = _require_env("USER_TO_TRACK")

# Sent with every request to the Hub API.
headers = {"user-agent": user_agent, "authorization": f"Bearer {token}"}
32
 
33
 
34
  def get_hub_community_activity(user: str) -> List[Any]:
35
+ with tqdm() as pbar:
36
+ all_data = []
37
+ i = 1
38
+ while True:
39
+ r = httpx.get(
40
+ f"https://huggingface.co/api/recent-activity?limit=100&type=discussion&skip={i}&user={user}",
41
+ headers=headers,
42
+ )
43
+ activity = r.json()["recentActivity"]
44
+ if not activity:
45
+ break
46
+ all_data.append(activity)
47
+ if len(all_data) % 1000 == 0:
48
+ # print(f"Length of all_data: {len(all_data)}")
49
+ pbar.write(f"Length of all_data: {len(all_data)}")
50
+ i += 100
51
+ pbar.update(100)
52
+
53
  return list(concat(all_data))
54
 
55
 
56
+ # def get_hub_community_activity(user: str) -> List[Any]:
57
+ # all_data = []
58
+ # for i in range(1, 2000, 100):
59
+ # r = httpx.get(
60
+ # f"https://huggingface.co/api/recent-activity?limit=100&type=discussion&skip={i}&user={user}"
61
+ # )
62
+ # activity = r.json()["recentActivity"]
63
+ # all_data.append(activity)
64
+ # return list(concat(all_data))
65
+
66
+
67
  def parse_date_time(date_time: str) -> datetime:
68
  return datetime.strptime(date_time, "%Y-%m-%dT%H:%M:%S.%fZ")
69
 
 
88
 
89
  @cached(cache=TTLCache(maxsize=1000, ttl=timedelta(minutes=30), timer=datetime.now))
90
  def update_data():
91
+ try:
92
+ previous_df = pl.DataFrame(
93
+ load_dataset(f"librarian-bot/{user}-stats", split="train").data.table
94
+ )
95
+ except FileNotFoundError:
96
+ previous_df = pl.DataFrame()
97
+ data = get_hub_community_activity(user)
98
  data = [parse_pr_data(d) for d in data]
99
  update_df = pl.DataFrame(data)
100
  df = pl.concat([previous_df, update_df]).unique()
101
  if len(df) != len(previous_df):
102
+ Dataset(df.to_arrow()).push_to_hub(f"{user}-stats", token=token)
103
  return df
104
 
105
 
 
120
 
121
 
122
  def create_pie():
123
+ frequencies = get_pr_status(user)
124
  df = pd.DataFrame({"status": frequencies.keys(), "number": frequencies.values()})
125
  return px.pie(df, values="number", names="status", template="seaborn")
126
 
127
 
 
 
 
 
 
 
 
128
  def group_status_by_pr_number():
129
+ all_data = get_hub_community_activity(user)
130
  all_data = [parse_pr_data(d) for d in all_data]
131
  return (
132
  pl.DataFrame(all_data).groupby("status").agg(pl.mean("pr_number")).to_pandas()
 
134
 
135
 
136
  def plot_over_time():
137
+ all_data = get_hub_community_activity(user)
138
  all_data = [parse_pr_data(d) for d in all_data]
139
  df = pl.DataFrame(all_data).with_columns(pl.col("createdAt").cast(pl.Date))
140
  df = df.pivot(
 
153
 
154
  with gr.Blocks() as demo:
155
  # frequencies = get_pr_status("librarian-bot")
156
+ gr.Markdown(f"# {user} PR Stats")
157
+ gr.Markdown(f"Total prs and issues opened by {user}: {len(update_data()):,}")
158
  # gr.Markdown(f"Total PRs opened: {sum(frequencies.values())}")
159
  with gr.Column():
160
+ gr.Markdown("## Pull requests status")
161
  gr.Markdown(
162
  "The below pie chart shows the percentage of pull requests made by"
163
  " librarian bot that are open, closed or merged"