Update app.py
app.py
CHANGED
@@ -248,146 +248,18 @@ def load_models_csv():
 
     df['tags'] = df['tags'].apply(process_tags)
 
-    # Ensure all
-
-
-    df['downloadsAllTime'] = df.get('downloads', 0) * np.random.uniform(2, 5, size=len(df))
-
-    # Convert metrics to numeric values
-    for metric in ['downloads', 'likes', 'downloadsAllTime']:
+    # Ensure all required metrics are present and convert to numeric
+    required_metrics = ['downloads', 'likes', 'downloadsAllTime']
+    for metric in required_metrics:
         if metric in df.columns:
             df[metric] = pd.to_numeric(df[metric], errors='coerce').fillna(0)
-
-
-
+        else:
+            print(f"Warning: '{metric}' not found in CSV. Creating empty column.")
+            df[metric] = 0
 
     return df
 
-
-    """Add more sample data to make the visualization more interesting"""
-    # Top organizations to include
-    orgs = ['openai', 'meta', 'google', 'microsoft', 'anthropic', 'nvidia', 'huggingface',
-            'deepseek-ai', 'stability-ai', 'mistralai', 'cerebras', 'databricks', 'together',
-            'facebook', 'amazon', 'deepmind', 'cohere', 'bigscience', 'eleutherai']
-
-    # Common model name formats
-    model_name_patterns = [
-        "model-{size}-{version}",
-        "{prefix}-{size}b",
-        "{prefix}-{size}b-{variant}",
-        "llama-{size}b-{variant}",
-        "gpt-{variant}-{size}b",
-        "{prefix}-instruct-{size}b",
-        "{prefix}-chat-{size}b",
-        "{prefix}-coder-{size}b",
-        "stable-diffusion-{version}",
-        "whisper-{size}",
-        "bert-{size}-{variant}",
-        "roberta-{size}",
-        "t5-{size}",
-        "{prefix}-vision-{size}b"
-    ]
-
-    # Common name parts
-    prefixes = ["falcon", "llama", "mistral", "gpt", "phi", "gemma", "qwen", "yi", "mpt", "bloom"]
-    sizes = ["7", "13", "34", "70", "1", "3", "7b", "13b", "70b", "8b", "2b", "1b", "0.5b", "small", "base", "large", "huge"]
-    variants = ["chat", "instruct", "base", "v1.0", "v2", "beta", "turbo", "fast", "xl", "xxl"]
-
-    # Generate sample data
-    sample_data = []
-    for org_idx, org in enumerate(orgs):
-        # Create 5-10 models per organization
-        num_models = np.random.randint(5, 11)
-
-        for i in range(num_models):
-            # Create realistic model name
-            pattern = np.random.choice(model_name_patterns)
-            prefix = np.random.choice(prefixes)
-            size = np.random.choice(sizes)
-            version = f"v{np.random.randint(1, 4)}"
-            variant = np.random.choice(variants)
-
-            model_name = pattern.format(
-                prefix=prefix,
-                size=size,
-                version=version,
-                variant=variant
-            )
-
-            model_id = f"{org}/{model_name}"
-
-            # Select a realistic pipeline tag based on name
-            if "diffusion" in model_name or "image" in model_name:
-                pipeline_tag = np.random.choice(["text-to-image", "image-to-image", "image-segmentation"])
-            elif "whisper" in model_name or "speech" in model_name:
-                pipeline_tag = np.random.choice(["automatic-speech-recognition", "text-to-speech"])
-            elif "coder" in model_name or "code" in model_name:
-                pipeline_tag = "text-generation"
-            elif "bert" in model_name or "roberta" in model_name:
-                pipeline_tag = np.random.choice(["fill-mask", "text-classification", "token-classification"])
-            elif "vision" in model_name:
-                pipeline_tag = np.random.choice(["image-classification", "image-to-text", "visual-question-answering"])
-            else:
-                pipeline_tag = "text-generation"  # Most common
-
-            # Generate realistic tags
-            tags = [pipeline_tag]
-
-            if "text-generation" in pipeline_tag:
-                tags.extend(["language-model", "text", "gpt", "llm"])
-                if "instruct" in model_name:
-                    tags.append("instruction-following")
-                if "chat" in model_name:
-                    tags.append("chat")
-            elif "speech" in pipeline_tag:
-                tags.extend(["audio", "speech", "voice"])
-            elif "image" in pipeline_tag:
-                tags.extend(["vision", "image", "diffusion"])
-
-            # Add language tags
-            if np.random.random() < 0.8:  # 80% chance for English
-                tags.append("en")
-            if np.random.random() < 0.3:  # 30% chance for multilingual
-                tags.append("multilingual")
-
-            # Generate downloads and likes (weighted by org position for variety)
-            # Earlier orgs get more downloads to make the visualization interesting
-            popularity_factor = (len(orgs) - org_idx) / len(orgs)  # 1.0 to 0.0
-            base_downloads = 10000 * (10 ** (2 * popularity_factor))
-            downloads = int(base_downloads * np.random.uniform(0.3, 3.0))
-            likes = int(downloads * np.random.uniform(0.01, 0.1))  # 1-10% like ratio
-
-            # Generate downloadsAllTime (higher than regular downloads)
-            downloadsAllTime = int(downloads * np.random.uniform(3, 8))
-
-            # Generate model size (in bytes for params)
-            # Model size should correlate somewhat with the size in the name
-            size_indicator = 1
-            for s in ["70b", "13b", "7b", "3b", "2b", "1b", "large", "huge", "xl", "xxl"]:
-                if s in model_name.lower():
-                    size_indicator = float(s.replace("b", "")) if s[0].isdigit() else 3
-                    break
-
-            # Size in bytes
-            params = int(np.random.uniform(0.5, 2.0) * size_indicator * 1e9)
-
-            # Create model entry
-            model = {
-                "id": model_id,
-                "author": org,
-                "downloads": downloads,
-                "likes": likes,
-                "downloadsAllTime": downloadsAllTime,
-                "pipeline_tag": pipeline_tag,
-                "tags": tags,
-                "params": params
-            }
-
-            sample_data.append(model)
-
-    # Convert sample data to DataFrame and append to original
-    sample_df = pd.DataFrame(sample_data)
-    return pd.concat([df, sample_df], ignore_index=True)
+
 
 # Create Gradio interface
 with gr.Blocks() as demo:
@@ -535,7 +407,7 @@ with gr.Blocks() as demo:
         display_name = metric_display_names.get(count_by, count_by.capitalize())
 
         stats_md = f"""
-        ## Statistics
+        ## Statistics as of May 12, 2025
         - **Total models shown**: {total_models:,}
         - **Total {display_name}**: {int(total_value):,}
 
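For reference, a minimal sketch of the metric handling the first hunk now applies in load_models_csv, assuming df is a pandas DataFrame read from the models CSV; the coerce_metrics helper and the sample frame are illustrative names, not code from app.py.

```python
import pandas as pd

def coerce_metrics(df: pd.DataFrame) -> pd.DataFrame:
    # Coerce the metric columns the app relies on to numeric values,
    # filling unparseable entries with 0 and creating missing columns.
    required_metrics = ['downloads', 'likes', 'downloadsAllTime']
    for metric in required_metrics:
        if metric in df.columns:
            df[metric] = pd.to_numeric(df[metric], errors='coerce').fillna(0)
        else:
            print(f"Warning: '{metric}' not found in CSV. Creating empty column.")
            df[metric] = 0
    return df

# Example: 'downloadsAllTime' is absent and 'downloads' holds a non-numeric entry.
frame = pd.DataFrame({'downloads': ['100', 'n/a'], 'likes': [5, 7]})
print(coerce_metrics(frame).dtypes)
```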