"""Gradio app summarising the models contained in a Hugging Face Hub collection."""

from functools import lru_cache
from typing import Dict, List

import gradio as gr
import pandas as pd
from huggingface_hub import Collection, CollectionItem, get_collection, model_info
from toolz import groupby, valmap

test_slug = "HF-IA-archiving/models-to-archive-65006a7fdadb8c628f33aac9"


def group_collection_by_repo_type(
    collection_slug: str,
) -> Dict[str, List[CollectionItem]]:
    """Fetch the collection for ``collection_slug`` and group its items.

    Returns:
        A dict mapping each item's ``repoType`` attribute to the list of
        collection items that carry it.
    """
    collection = get_collection(collection_slug)
    return groupby(lambda x: x.repoType, collection.items)


def render_model_hub_link(hub_id):
    """Return a Markdown link to ``hub_id``'s page on huggingface.co.

    The models table is displayed with ``datatype="markdown"``, so the
    returned string renders as a clickable link labelled with the repo ID.
    """
    link = f"https://huggingface.co/{hub_id}"
    return f"[{hub_id}]({link})"


def load_to_dataframe(data):
    """Build a DataFrame describing the given collection items.

    For every item, the attributes listed in ``columns`` (when present on the
    item) are kept, and the ``language``, ``tags``, ``license`` and
    ``datasets`` entries are read from the card data returned by
    ``model_info``. The ``item_id`` column is finally converted to a link via
    ``render_model_hub_link``.

    Args:
        data: Iterable of objects whose ``__dict__`` holds the item fields.

    Returns:
        A ``pandas.DataFrame`` with one row per item. For empty input an
        empty frame with all expected columns is returned.
    """
    # Columns to keep
    columns = [
        "item_id",
        "downloads",
        "author",
        "likes",
        "pipeline_tag",
        "lastModified",
    ]
    required_info_keys = ["language", "tags", "license", "datasets"]

    # Convert to dicts and drop every attribute we are not interested in.
    records = [item.__dict__ for item in data]
    filtered_data = [
        {key: record[key] for key in columns if key in record} for record in records
    ]

    # Without rows there is no "item_id" column to post-process below, so
    # return an empty frame that still exposes the expected column names.
    if not filtered_data:
        return pd.DataFrame(columns=columns + required_info_keys)

    for item in filtered_data:
        try:
            card = model_info(item["item_id"]).cardData
            card_values = {key: card.get(key) for key in required_info_keys}
        except AttributeError as e:
            # Raised e.g. when the card data is None; record the metadata
            # as missing instead of failing the whole load.
            print(e)
            card_values = dict.fromkeys(required_info_keys)
        item.update(card_values)

    # Load into a DataFrame
    df = pd.DataFrame(filtered_data)
    df["item_id"] = df["item_id"].apply(render_model_hub_link)
    return df


def summary_of_na_values(df):
    """Summarise missing values per column of ``df``.

    Returns:
        A DataFrame with the columns ``Metadata`` (the column name of ``df``),
        ``Missing Count`` and ``Missing Percent`` (rounded to two decimals).
        Only columns with at least one missing value are listed.
    """
    na_counts = df.isna().sum()
    na_counts = na_counts[na_counts > 0]
    na_percent = round(na_counts / len(df) * 100, 2)
    return (
        pd.DataFrame({"Missing Count": na_counts, "Missing Percent": na_percent})
        .rename_axis(index="Metadata")
        .reset_index()
    )


def value_counts(df, column_name):
    """Return the frequency of each distinct value in ``df[column_name]``."""
    return df[column_name].value_counts()


@lru_cache(maxsize=10)
def load_data():
    """Load the collection identified by ``test_slug`` (cached).

    Returns:
        A tuple ``(repos_grouped_by_type, column_names, df)`` where ``df``
        is the models DataFrame and ``column_names`` lists its columns.
    """
    repos_grouped_by_type = group_collection_by_repo_type(test_slug)
    # A collection without any model has no "model" group at all.
    models = repos_grouped_by_type.get("model", [])
    df = load_to_dataframe(models)
    column_names = df.columns.to_list()
    return repos_grouped_by_type, column_names, df


def generate_markdown_summary_of_collection(
    grouped_collection: Dict[str, List[CollectionItem]]
):
    """Return a Markdown bullet list with the number of items per group."""
    counts = valmap(len, grouped_collection)
    bullet_lines = [f"- {count} {kind}s\n" for kind, count in counts.items()]
    return "This collection contains the following items:\n" + "".join(bullet_lines)


repos_grouped_by_type, column_names, df = load_data()


def filter_df(columns_to_show=None):
    """Return the models DataFrame, optionally restricted to some columns.

    Args:
        columns_to_show: Column names to keep, or ``None`` for all columns.
    """
    *_, models = load_data()
    if columns_to_show is None:
        return models
    return models[columns_to_show]


with gr.Blocks() as demo:
    gr.Markdown("## Info about models in this collection")
    gr.Markdown(generate_markdown_summary_of_collection(repos_grouped_by_type))
    gr.Markdown("### Summary of missing metadata values")
    gr.DataFrame(summary_of_na_values(df))
    gr.Markdown("# Models in this collection")
    with gr.Accordion("Models", open=True):
        columns_to_show = gr.Dropdown(
            label="Columns to show",
            value=column_names,
            choices=column_names,
            multiselect=True,
        )
        models_df = gr.DataFrame(filter_df, datatype="markdown")
    # Re-render the table whenever the column selection changes.
    columns_to_show.change(filter_df, columns_to_show, models_df)

demo.launch(debug=True)