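"""Gradio Space that summarises metadata for the models in a Hugging Face Hub collection."""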
from functools import lru_cache
from typing import Dict, List

import gradio as gr
import pandas as pd
from huggingface_hub import CollectionItem, get_collection, model_info
from toolz import groupby, valmap
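
# Slug of the Hub collection whose models we want to inspect.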
test_slug = "HF-IA-archiving/models-to-archive-65006a7fdadb8c628f33aac9"
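

# Fetch a collection and group its items by type ("model", "dataset", "space", "paper").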
def group_collection_by_repo_type(
collection_slug: str,
) -> Dict[str, List[CollectionItem]]:
collection = get_collection(collection_slug)
    return groupby(lambda x: x.item_type, collection.items)
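

# Render a Hub repo id as a clickable HTML link for a DataFrame cell.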
def render_model_hub_link(hub_id):
link = f"https://huggingface.co/{hub_id}"
return (
f'<a target="_blank" href="{link}" style="color: var(--link-text-color);'
f' text-decoration: underline;text-decoration-style: dotted;">{hub_id}</a>'
)
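

# Build a DataFrame of per-model metadata, enriched with card data fetched from the Hub.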
def load_to_dataframe(data):
# Columns to keep
columns = [
"item_id",
"downloads",
"author",
"likes",
"pipeline_tag",
"lastModified",
]
# convert to dicts
data = [item.__dict__ for item in data]
filtered_data = [
{key: item[key] for key in columns if key in item} for item in data
]
required_info_keys = ["language", "tags", "license", "datasets"]
for item in filtered_data:
try:
card = model_info(item["item_id"]).cardData
for key in required_info_keys:
item[key] = card.get(key)
except AttributeError as e:
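            # No usable model card (cardData is None): record the metadata as missing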
print(e)
for key in required_info_keys:
item[key] = None
# Load into a DataFrame
df = pd.DataFrame(filtered_data)
df["item_id"] = df["item_id"].apply(render_model_hub_link)
return df
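

# Count and percentage of missing values for each metadata column.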
def summary_of_na_values(df):
na_counts = df.isna().sum()
na_counts = na_counts[na_counts > 0]
na_percent = round(na_counts / len(df) * 100, 2)
return (
pd.DataFrame({"Missing Count": na_counts, "Missing Percent": na_percent})
.rename_axis(index="Metadata")
.reset_index()
)
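

# Frequency of each value in a single column (helper, not currently wired into the UI).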
def value_counts(df, column_name):
return df[column_name].value_counts()
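

# Cache the Hub calls so the collection is only fetched and processed once.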
@lru_cache(maxsize=10)
def load_data():
repos_grouped_by_type = group_collection_by_repo_type(test_slug)
models = repos_grouped_by_type["model"]
df = load_to_dataframe(models)
column_names = df.columns.to_list()
return repos_grouped_by_type, column_names, df
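

# Markdown bullet list with the number of items of each repo type in the collection.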
def generate_markdown_summary_of_collection(
grouped_collection: Dict[str, List[CollectionItem]]
):
counts = valmap(len, grouped_collection)
results = "This collection contains the following items:\n"
for k, v in counts.items():
results += f"- {v} {k}s\n"
return results
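

# Load everything once at startup so the UI components can be built from the results.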
repos_grouped_by_type, column_names, df = load_data()
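

# Return the cached models DataFrame, optionally restricted to the selected columns.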
def filter_df(columns_to_show=None):
*_, df = load_data()
return df if columns_to_show is None else df[columns_to_show]
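

# UI: collection summary, missing-metadata overview, and a filterable table of models.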
with gr.Blocks() as demo:
gr.Markdown("## Info about models in this collection")
gr.Markdown(generate_markdown_summary_of_collection(repos_grouped_by_type))
gr.Markdown("### Summary of missing metadata values")
gr.DataFrame(summary_of_na_values(df))
gr.Markdown("# Models in this collection")
with gr.Accordion("Models", open=True):
columns_to_show = gr.Dropdown(
label="Columns to show",
value=column_names,
choices=column_names,
multiselect=True,
)
models_df = gr.DataFrame(filter_df, datatype="markdown")
columns_to_show.change(filter_df, columns_to_show, models_df)
demo.launch(debug=True)