davanstrien HF Staff commited on
Commit
1ab8a1a
·
1 Parent(s): d2cae6f
Files changed (3) hide show
  1. app.py +119 -0
  2. requirements.in +8 -0
  3. requirements.txt +253 -0
app.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub import get_collection, Collection, CollectionItem
2
+ from toolz import groupby, valmap
3
+ from typing import Dict, List
4
+ import pandas as pd
5
+ from huggingface_hub import model_info
6
+ import gradio as gr
7
+ from functools import lru_cache
8
+
9
+ test_slug = "HF-IA-archiving/models-to-archive-65006a7fdadb8c628f33aac9"
10
+
11
+
12
def group_collection_by_repo_type(
    collection_slug: str,
) -> Dict[str, List[CollectionItem]]:
    """Fetch a Hub collection and bucket its items by repo type.

    Returns a mapping from repo type (e.g. "model" — see load_data, which
    reads the "model" bucket) to the list of collection items of that type,
    preserving collection order within each bucket.
    """
    items_by_type: Dict[str, List[CollectionItem]] = {}
    for item in get_collection(collection_slug).items:
        items_by_type.setdefault(item.repoType, []).append(item)
    return items_by_type
17
+
18
+
19
def render_model_hub_link(hub_id):
    """Return an HTML anchor that links *hub_id* to its Hugging Face Hub page.

    The link opens in a new tab and is styled as a dotted-underline link.
    """
    url = f"https://huggingface.co/{hub_id}"
    style = (
        "color: var(--link-text-color);"
        " text-decoration: underline;text-decoration-style: dotted;"
    )
    return f'<a target="_blank" href="{url}" style="{style}">{hub_id}</a>'
25
+
26
+
27
def load_to_dataframe(data):
    """Build a pandas DataFrame summarising the given collection items.

    Keeps a fixed subset of each item's attributes, enriches every row with
    metadata fetched from the model card on the Hub, and renders the
    ``item_id`` column as an HTML hub link.
    """
    keep = [
        "item_id",
        "downloads",
        "author",
        "likes",
        "pipeline_tag",
        "lastModified",
    ]
    card_keys = ["language", "tags", "license", "datasets"]

    # Project each item onto the kept attributes (skipping any it lacks).
    rows = []
    for item in data:
        attrs = item.__dict__
        rows.append({name: attrs[name] for name in keep if name in attrs})

    for row in rows:
        try:
            # cardData may be None, in which case .get raises AttributeError.
            card_data = model_info(row["item_id"]).cardData
            for name in card_keys:
                row[name] = card_data.get(name)
        except AttributeError as err:
            print(err)
            # Reset every card key so no partial values survive.
            for name in card_keys:
                row[name] = None

    df = pd.DataFrame(rows)
    df["item_id"] = df["item_id"].apply(render_model_hub_link)
    return df
58
+
59
+
60
def summary_of_na_values(df):
    """Tabulate missing values per column, omitting columns with none.

    Returns a DataFrame with columns ``Metadata``, ``Missing Count`` and
    ``Missing Percent`` (percentage rounded to two decimal places).
    """
    missing = df.isna().sum()
    missing = missing[missing > 0]
    percent = (missing / len(df) * 100).round(2)
    summary = pd.DataFrame(
        {"Missing Count": missing, "Missing Percent": percent}
    )
    summary = summary.rename_axis(index="Metadata")
    return summary.reset_index()
69
+
70
def value_counts(df, column_name):
    """Return the frequency of each distinct value in *column_name* of *df*."""
    column = df[column_name]
    return column.value_counts()
72
+
73
+
74
+
75
@lru_cache(maxsize=10)
def load_data():
    """Fetch, group and tabulate the collection; cached after the first call.

    Returns a 3-tuple of (items grouped by repo type, list of DataFrame
    column names, the models DataFrame).
    """
    grouped = group_collection_by_repo_type(test_slug)
    df = load_to_dataframe(grouped["model"])
    return grouped, list(df.columns), df
82
+
83
+
84
+
85
def generate_markdown_summary_of_collection(
    grouped_collection: Dict[str, List[CollectionItem]]
):
    """Render a markdown bullet list of how many items of each repo type exist.

    Example output::

        This collection contains the following items:
        - 2 models
        - 1 datasets
    """
    lines = ["This collection contains the following items:\n"]
    for repo_type, items in grouped_collection.items():
        lines.append(f"- {len(items)} {repo_type}s\n")
    return "".join(lines)
93
+
94
+
95
# Eagerly fetch (and cache, via load_data's lru_cache) the collection data at
# import time; the Gradio UI below is built from these module-level values.
repos_grouped_by_type, column_names, df = load_data()
96
+
97
+
98
def filter_df(columns_to_show=None):
    """Return the cached models DataFrame, optionally restricted to columns.

    With ``columns_to_show=None`` the full DataFrame is returned; otherwise
    only the requested columns are kept.
    """
    _, _, frame = load_data()
    if columns_to_show is None:
        return frame
    return frame[columns_to_show]
101
+
102
+
103
+ # with gr.Blocks() as demo:
104
+ # gr.Markdown("## Info about models in this collection")
105
+ # gr.Markdown(generate_markdown_summary_of_collection(repos_grouped_by_type))
106
+ # gr.Markdown("### Summary of missing metadata values")
107
+ # gr.DataFrame(summary_of_na_values(df))
108
+ # gr.Markdown("# Models in this collection")
109
+ # with gr.Accordion("Models", open=False):
110
+ # columns_to_show = gr.Dropdown(
111
+ # label="Columns to show",
112
+ # value=column_names,
113
+ # choices=column_names,
114
+ # multiselect=True,
115
+ # )
116
+ # models_df = gr.DataFrame(filter_df, datatype="markdown")
117
+ # columns_to_show.change(filter_df, columns_to_show, models_df)
118
+
119
+ # demo.launch(debug=True)
requirements.in ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ gradio==3.41.2
2
+ httpx[http2]
3
+ git+https://github.com/huggingface/huggingface_hub
4
+ polars
5
+ python-dotenv
6
+ toolz
7
+ tqdm
8
+ rich[jupyter]
requirements.txt ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # This file is autogenerated by pip-compile with Python 3.11
3
+ # by the following command:
4
+ #
5
+ # pip-compile
6
+ #
7
+ aiofiles==23.2.1
8
+ # via gradio
9
+ altair==5.1.1
10
+ # via gradio
11
+ annotated-types==0.5.0
12
+ # via pydantic
13
+ anyio==3.7.1
14
+ # via
15
+ # fastapi
16
+ # httpcore
17
+ # starlette
18
+ appnope==0.1.3
19
+ # via ipython
20
+ asttokens==2.4.0
21
+ # via stack-data
22
+ attrs==23.1.0
23
+ # via
24
+ # jsonschema
25
+ # referencing
26
+ backcall==0.2.0
27
+ # via ipython
28
+ certifi==2023.7.22
29
+ # via
30
+ # httpcore
31
+ # httpx
32
+ # requests
33
+ charset-normalizer==3.2.0
34
+ # via requests
35
+ click==8.1.7
36
+ # via uvicorn
37
+ comm==0.1.4
38
+ # via ipywidgets
39
+ contourpy==1.1.0
40
+ # via matplotlib
41
+ cycler==0.11.0
42
+ # via matplotlib
43
+ decorator==5.1.1
44
+ # via ipython
45
+ executing==1.2.0
46
+ # via stack-data
47
+ fastapi==0.103.1
48
+ # via gradio
49
+ ffmpy==0.3.1
50
+ # via gradio
51
+ filelock==3.12.3
52
+ # via huggingface-hub
53
+ fonttools==4.42.1
54
+ # via matplotlib
55
+ fsspec==2023.9.0
56
+ # via
57
+ # gradio-client
58
+ # huggingface-hub
59
+ gradio==3.41.2
60
+ # via -r requirements.in
61
+ gradio-client==0.5.0
62
+ # via gradio
63
+ h11==0.14.0
64
+ # via
65
+ # httpcore
66
+ # uvicorn
67
+ h2==4.1.0
68
+ # via httpx
69
+ hpack==4.0.0
70
+ # via h2
71
+ httpcore==0.18.0
72
+ # via httpx
73
+ httpx[http2]==0.25.0
74
+ # via
75
+ # -r requirements.in
76
+ # gradio
77
+ # gradio-client
78
+ huggingface-hub @ git+https://github.com/huggingface/huggingface_hub
79
+ # via
80
+ # -r requirements.in
81
+ # gradio
82
+ # gradio-client
83
+ hyperframe==6.0.1
84
+ # via h2
85
+ idna==3.4
86
+ # via
87
+ # anyio
88
+ # httpx
89
+ # requests
90
+ importlib-resources==6.0.1
91
+ # via gradio
92
+ ipython==8.15.0
93
+ # via ipywidgets
94
+ ipywidgets==8.1.1
95
+ # via rich
96
+ jedi==0.19.0
97
+ # via ipython
98
+ jinja2==3.1.2
99
+ # via
100
+ # altair
101
+ # gradio
102
+ jsonschema==4.19.0
103
+ # via altair
104
+ jsonschema-specifications==2023.7.1
105
+ # via jsonschema
106
+ jupyterlab-widgets==3.0.9
107
+ # via ipywidgets
108
+ kiwisolver==1.4.5
109
+ # via matplotlib
110
+ markdown-it-py==3.0.0
111
+ # via rich
112
+ markupsafe==2.1.3
113
+ # via
114
+ # gradio
115
+ # jinja2
116
+ matplotlib==3.7.3
117
+ # via gradio
118
+ matplotlib-inline==0.1.6
119
+ # via ipython
120
+ mdurl==0.1.2
121
+ # via markdown-it-py
122
+ numpy==1.25.2
123
+ # via
124
+ # altair
125
+ # contourpy
126
+ # gradio
127
+ # matplotlib
128
+ # pandas
129
+ orjson==3.9.7
130
+ # via gradio
131
+ packaging==23.1
132
+ # via
133
+ # altair
134
+ # gradio
135
+ # gradio-client
136
+ # huggingface-hub
137
+ # matplotlib
138
+ pandas==2.1.0
139
+ # via
140
+ # altair
141
+ # gradio
142
+ parso==0.8.3
143
+ # via jedi
144
+ pexpect==4.8.0
145
+ # via ipython
146
+ pickleshare==0.7.5
147
+ # via ipython
148
+ pillow==10.0.0
149
+ # via
150
+ # gradio
151
+ # matplotlib
152
+ polars==0.19.2
153
+ # via -r requirements.in
154
+ prompt-toolkit==3.0.39
155
+ # via ipython
156
+ ptyprocess==0.7.0
157
+ # via pexpect
158
+ pure-eval==0.2.2
159
+ # via stack-data
160
+ pydantic==2.3.0
161
+ # via
162
+ # fastapi
163
+ # gradio
164
+ pydantic-core==2.6.3
165
+ # via pydantic
166
+ pydub==0.25.1
167
+ # via gradio
168
+ pygments==2.16.1
169
+ # via
170
+ # ipython
171
+ # rich
172
+ pyparsing==3.1.1
173
+ # via matplotlib
174
+ python-dateutil==2.8.2
175
+ # via
176
+ # matplotlib
177
+ # pandas
178
+ python-dotenv==1.0.0
179
+ # via -r requirements.in
180
+ python-multipart==0.0.6
181
+ # via gradio
182
+ pytz==2023.3.post1
183
+ # via pandas
184
+ pyyaml==6.0.1
185
+ # via
186
+ # gradio
187
+ # huggingface-hub
188
+ referencing==0.30.2
189
+ # via
190
+ # jsonschema
191
+ # jsonschema-specifications
192
+ requests==2.31.0
193
+ # via
194
+ # gradio
195
+ # gradio-client
196
+ # huggingface-hub
197
+ rich[jupyter]==13.5.2
198
+ # via -r requirements.in
199
+ rpds-py==0.10.2
200
+ # via
201
+ # jsonschema
202
+ # referencing
203
+ semantic-version==2.10.0
204
+ # via gradio
205
+ six==1.16.0
206
+ # via
207
+ # asttokens
208
+ # python-dateutil
209
+ sniffio==1.3.0
210
+ # via
211
+ # anyio
212
+ # httpcore
213
+ # httpx
214
+ stack-data==0.6.2
215
+ # via ipython
216
+ starlette==0.27.0
217
+ # via fastapi
218
+ toolz==0.12.0
219
+ # via
220
+ # -r requirements.in
221
+ # altair
222
+ tqdm==4.66.1
223
+ # via
224
+ # -r requirements.in
225
+ # huggingface-hub
226
+ traitlets==5.10.0
227
+ # via
228
+ # comm
229
+ # ipython
230
+ # ipywidgets
231
+ # matplotlib-inline
232
+ typing-extensions==4.7.1
233
+ # via
234
+ # fastapi
235
+ # gradio
236
+ # gradio-client
237
+ # huggingface-hub
238
+ # pydantic
239
+ # pydantic-core
240
+ tzdata==2023.3
241
+ # via pandas
242
+ urllib3==2.0.4
243
+ # via requests
244
+ uvicorn==0.23.2
245
+ # via gradio
246
+ wcwidth==0.2.6
247
+ # via prompt-toolkit
248
+ websockets==11.0.3
249
+ # via
250
+ # gradio
251
+ # gradio-client
252
+ widgetsnbextension==4.0.9
253
+ # via ipywidgets