from functools import lru_cache
from typing import Dict, List

import gradio as gr
import pandas as pd
from huggingface_hub import CollectionItem, get_collection, model_info
from toolz import groupby, valmap

# Slug of the Hugging Face collection to explore
test_slug = "HF-IA-archiving/models-to-archive-65006a7fdadb8c628f33aac9"


def group_collection_by_repo_type(
    collection_slug: str,
) -> Dict[str, List[CollectionItem]]:
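    """Fetch a collection from the Hub and group its items by repository type."""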
    collection = get_collection(collection_slug)
    return groupby(lambda x: x.repoType, collection.items)


def render_model_hub_link(hub_id):
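    """Render a Hub repo ID as an HTML link to its page on huggingface.co."""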
    link = f"https://huggingface.co/{hub_id}"
    return (
        f'<a target="_blank" href="{link}" style="color: var(--link-text-color);'
        f' text-decoration: underline;text-decoration-style: dotted;">{hub_id}</a>'
    )


def load_to_dataframe(data):
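    """Convert collection items to a DataFrame, adding selected model card metadata."""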
    # Columns to keep
    columns = [
        "item_id",
        "downloads",
        "author",
        "likes",
        "pipeline_tag",
        "lastModified",
    ]
    # convert to dicts
    data = [item.__dict__ for item in data]

    filtered_data = [
        {key: item[key] for key in columns if key in item} for item in data
    ]
    required_info_keys = ["language", "tags", "license", "datasets"]

    for item in filtered_data:
        try:
            card = model_info(item["item_id"]).cardData
            for key in required_info_keys:
                item[key] = card.get(key)
        except AttributeError as e:
            print(e)
            for key in required_info_keys:
                item[key] = None
    # Load into a DataFrame
    df = pd.DataFrame(filtered_data)
    df["item_id"] = df["item_id"].apply(render_model_hub_link)
    return df


def summary_of_na_values(df):
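    """Summarise missing values per column as counts and percentages."""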
    na_counts = df.isna().sum()
    na_counts = na_counts[na_counts > 0]
    na_percent = round(na_counts / len(df) * 100, 2)
    return (
        pd.DataFrame({"Missing Count": na_counts, "Missing Percent": na_percent})
        .rename_axis(index="Metadata")
        .reset_index()
    )


def value_counts(df, column_name):
    return df[column_name].value_counts()


@lru_cache(maxsize=10)
def load_data():
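    """Load the collection once and cache the grouped items, column names, and models DataFrame."""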
    repos_grouped_by_type = group_collection_by_repo_type(test_slug)
    models = repos_grouped_by_type["model"]
    df = load_to_dataframe(models)
    column_names = df.columns.to_list()
    return repos_grouped_by_type, column_names, df


def generate_markdown_summary_of_collection(
    grouped_collection: Dict[str, List[CollectionItem]]
):
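    """Summarise how many items of each repo type the collection contains."""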
    counts = valmap(len, grouped_collection)
    results = "This collection contains the following items:\n"
    for k, v in counts.items():
        results += f"- {v} {k}s\n"
    return results


# Load everything once at app start-up; later calls hit the lru_cache.
repos_grouped_by_type, column_names, df = load_data()


def filter_df(columns_to_show=None):
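    """Return the cached models DataFrame, optionally restricted to selected columns."""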
    *_, df = load_data()
    return df if columns_to_show is None else df[columns_to_show]


with gr.Blocks() as demo:
    gr.Markdown("## Info about models in this collection")
    gr.Markdown(generate_markdown_summary_of_collection(repos_grouped_by_type))
    gr.Markdown("### Summary of missing metadata values")
    gr.DataFrame(summary_of_na_values(df))
    gr.Markdown("# Models in this collection")
    with gr.Accordion("Models", open=True):
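        # Let the user choose which metadata columns to display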
        columns_to_show = gr.Dropdown(
            label="Columns to show",
            value=column_names,
            choices=column_names,
            multiselect=True,
        )
        models_df = gr.DataFrame(filter_df, datatype="markdown")
        columns_to_show.change(filter_df, columns_to_show, models_df)

demo.launch(debug=True)