Commit: v1.1

Changed files:
- .gitattributes copy  +0 -35 (deleted)
- .gitignore  +1 -1
- README.md  +1 -1
- app.py  +111 -49
- utils_on.py  +429 -0 (new file)
.gitattributes copy
DELETED
@@ -1,35 +0,0 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore
CHANGED
@@ -1 +1 @@
-
+__pycache__
README.md
CHANGED
@@ -1,5 +1,5 @@
 ---
-title:
+title: LLM Pricing Calculator
 emoji: 🐢
 colorFrom: indigo
 colorTo: gray
app.py
CHANGED
@@ -2,12 +2,14 @@ import streamlit as st
 import asyncio
 import tokonomics
 from utils import create_model_hierarchy
+from utils_on import analyze_hf_model  # New import for On Premise Estimator functionality
 
-st.set_page_config(page_title="LLM Pricing
+st.set_page_config(page_title="LLM Pricing Calculator", layout="wide")
 
 # --------------------------
 # Async Data Loading Function
 # --------------------------
+
 async def load_data():
     """Simulate loading data asynchronously."""
     AVAILABLE_MODELS = await tokonomics.get_available_models()
@@ -43,7 +45,7 @@ def provider_change(provider, selected_type, all_types=["text", "vision", "video
     return new_models if new_models else all_models
 
 # --------------------------
-# Estimate Cost Function
+# Estimate Cost Function
 # --------------------------
 def estimate_cost(num_alerts, input_size, output_size, model_id):
     pricing = st.session_state.get("pricing", {})
@@ -79,21 +81,68 @@ if "data_loaded" not in st.session_state:
 with st.sidebar:
     st.image("https://cdn.prod.website-files.com/630f558f2a15ca1e88a2f774/631f1436ad7a0605fecc5e15_Logo.svg",
              use_container_width=True)
-    st.markdown(
-        """ Visit: [https://www.priam.ai](https://www.priam.ai)
-        """
-    )
+    st.markdown("Visit: [https://www.priam.ai](https://www.priam.ai)")
     st.divider()
     st.sidebar.title("LLM Pricing Calculator")
 
 # --------------------------
-#
+# Pills Navigation (Using st.pills)
 # --------------------------
-
+# st.pills creates a pill-style selection widget.
+page = st.pills("Head",
+    options=["Model Selection", "On Premise Estimator", "About"],selection_mode="single",default="Model Selection",label_visibility="hidden",
+    #index=0 # Change index if you want a different default
+)
 
-
-
+# --------------------------
+# Helper: Format Analysis Report
+# --------------------------
+def format_analysis_report(analysis_result: dict) -> str:
+    """Convert the raw analysis_result dict into a human-readable report."""
+    if "error" in analysis_result:
+        return f"**Error:** {analysis_result['error']}"
+
+    lines = []
+    lines.append(f"### Model Analysis Report for `{analysis_result.get('model_id', 'Unknown Model')}`\n")
+    lines.append(f"**Parameter Size:** {analysis_result.get('parameter_size', 'N/A')} Billion parameters\n")
+    lines.append(f"**Precision:** {analysis_result.get('precision', 'N/A')}\n")
+
+    vram = analysis_result.get("vram_requirements", {})
+    lines.append("#### VRAM Requirements:")
+    lines.append(f"- Model Size: {vram.get('model_size_gb', 0):.2f} GB")
+    lines.append(f"- KV Cache: {vram.get('kv_cache_gb', 0):.2f} GB")
+    lines.append(f"- Activations: {vram.get('activations_gb', 0):.2f} GB")
+    lines.append(f"- Overhead: {vram.get('overhead_gb', 0):.2f} GB")
+    lines.append(f"- **Total VRAM:** {vram.get('total_vram_gb', 0):.2f} GB\n")
+
+    compatible_gpus = analysis_result.get("compatible_gpus", [])
+    lines.append("#### Compatible GPUs:")
+    if compatible_gpus:
+        for gpu in compatible_gpus:
+            lines.append(f"- {gpu}")
+    else:
+        lines.append("- None found")
+    lines.append(f"\n**Largest Compatible GPU:** {analysis_result.get('largest_compatible_gpu', 'N/A')}\n")
+
+    #gpu_perf = analysis_result.get("gpu_performance", {})
+    #if gpu_perf:
+    #    lines.append("#### GPU Performance:")
+    #    for gpu, perf in gpu_perf.items():
+    #        lines.append(f"**{gpu}:**")
+    #        lines.append(f"  - Tokens per Second: {perf.get('tokens_per_second', 0):.2f}")
+    #        lines.append(f"  - FLOPs per Token: {perf.get('flops_per_token', 0):.2f}")
+    #        lines.append(f"  - Effective TFLOPS: {perf.get('effective_tflops', 0):.2f}\n")
+    #else:
+    #    lines.append("#### GPU Performance: N/A\n")
+
+    return "\n".join(lines)
 
+# --------------------------
+# Render Content Based on Selected Pill
+# --------------------------
+if page == "Model Selection":
+    st.divider()
+    st.header("LLM Pricing App")
     # --- Row 1: Provider/Type and Model Selection ---
     col_left, col_right = st.columns(2)
     with col_left:
@@ -103,50 +152,27 @@ with tab1:
             index=st.session_state["providers"].index("azure") if "azure" in st.session_state["providers"] else 0
         )
         selected_type = st.radio("Select type", options=["text", "image"], index=0)
-
     with col_right:
-        # Filter models based on the selected provider and type
         filtered_models = provider_change(selected_provider, selected_type)
-
         if filtered_models:
-            # Force "gpt-4-turbo" as default if available; otherwise, default to the first model.
             default_model = "o1" if "o1" in filtered_models else filtered_models[0]
-            selected_model = st.selectbox(
-                "Select a model",
-                options=filtered_models,
-                index=filtered_models.index(default_model)
-            )
+            selected_model = st.selectbox("Select a model", options=filtered_models, index=filtered_models.index(default_model))
         else:
            selected_model = None
            st.write("No models available")
-
+
    # --- Row 2: Alert Stats ---
    col1, col2, col3 = st.columns(3)
    with col1:
-        num_alerts = st.number_input(
-            value=100,
-            min_value=1,
-            step=1,
-            help="Number of security alerts to analyze daily"
-        )
+        num_alerts = st.number_input("Security Alerts Per Day", value=100, min_value=1, step=1,
+                                     help="Number of security alerts to analyze daily")
    with col2:
-        input_size = st.number_input(
-            value=1000,
-            min_value=1,
-            step=1,
-            help="Include logs, metadata, and context per alert"
-        )
+        input_size = st.number_input("Alert Content Size (characters)", value=1000, min_value=1, step=1,
+                                     help="Include logs, metadata, and context per alert")
    with col3:
-        output_size = st.number_input(
-            min_value=1,
-            step=1,
-            help="Expected length of security analysis and recommendations"
-        )
-
+        output_size = st.number_input("Analysis Output Size (characters)", value=500, min_value=1, step=1,
+                                      help="Expected length of security analysis and recommendations")
+
    # --- Row 3: Buttons ---
    btn_col1, btn_col2 = st.columns(2)
    with btn_col1:
@@ -163,21 +189,34 @@ with tab1:
            st.session_state["pricing"] = pricing
            st.session_state["providers"] = providers
            st.success("Pricing data refreshed!")
-
+
    st.divider()
-    # --- Display Results ---
    st.markdown("### Results")
    if "result" in st.session_state:
        st.write(st.session_state["result"])
    else:
        st.write("Use the buttons above to estimate costs.")
-
-    # --- Clear Button Below Results ---
+
    if st.button("Clear"):
        st.session_state.pop("result", None)
-        st.rerun()
 
-
+elif page == "On Premise Estimator":
+    st.divider()
+    st.header("On Premise Estimator")
+    st.markdown("Enter a Hugging Face model ID to perform an on premise analysis using the provided estimator.")
+    hf_model_id = st.text_input("Hugging Face Model ID", value="meta-llama/Llama-4-Scout-17B-16E")
+
+    if st.button("Analyze Model"):
+        with st.spinner("Analyzing model..."):
+            analysis_result = analyze_hf_model(hf_model_id)
+            st.session_state["analysis_result"] = analysis_result
+
+    if "analysis_result" in st.session_state:
+        report = format_analysis_report(st.session_state["analysis_result"])
+        st.markdown(report)
+
+elif page == "About":
+    st.divider()
    st.markdown(
        """
        ## About This App
@@ -186,8 +225,31 @@ with tab2:
 
        - The app downloads the latest pricing from the LiteLLM repository.
        - Using simple maths to estimate the total tokens.
-
+        - Helps you estimate hardware requirements for running open-source large language models (LLMs) on-premise using only the model ID from Hugging Face.
+        - Latest Version 0.1
+
+        ---
+
+        ### 📌 Version History
+
+        | Version | Release Date | Key Feature Updates |
+        |---------|--------------|---------------------|
+        | `v1.1`  | 2025-04-06   | Added On Premise Estimator Feature |
+        | `v1.0`  | 2025-03-26   | Initial release with basic total tokens estimation |
+
+        ---
 
        Website: [https://www.priam.ai](https://www.priam.ai)
        """
    )
+    st.markdown(
+        """
+        ### Found a Bug?
+
+        If you encounter any issues or have feedback, please email to **[email protected]**
+
+        Your input helps us improve the app!
+        """
+    )
+
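A note on the "simple maths to estimate the total tokens" mentioned in the About tab: the body of `estimate_cost` is not part of this diff, so the snippet below is only a minimal sketch of that kind of arithmetic. The function name, the ~4 characters-per-token ratio, and the example prices are assumptions for illustration, not code taken from app.py.

```python
# Hypothetical sketch of the daily-cost arithmetic the About tab describes.
# Assumptions: ~4 characters per token, illustrative per-token prices.
def sketch_daily_cost(num_alerts: int, input_chars: int, output_chars: int,
                      input_price_per_token: float, output_price_per_token: float) -> float:
    chars_per_token = 4  # rough heuristic, not from app.py
    input_tokens = (input_chars / chars_per_token) * num_alerts
    output_tokens = (output_chars / chars_per_token) * num_alerts
    return input_tokens * input_price_per_token + output_tokens * output_price_per_token

# Example: 100 alerts/day, 1000-char inputs, 500-char outputs,
# at illustrative prices of $2.50 / $10.00 per million tokens.
print(sketch_daily_cost(100, 1000, 500, 2.50e-6, 10.00e-6))  # ≈ $0.19 per day
```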
utils_on.py
ADDED
@@ -0,0 +1,429 @@
+from typing import List, Dict, Tuple, Optional, Union
+import re
+import math
+import requests
+import numpy as np
+from huggingface_hub import HfApi, ModelInfo
+from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
+
+def parse_model_entries(model_entries: List[str]) -> List[Dict[str, str]]:
+    """
+    Parse a list of model entries into structured dictionaries with provider, model name, version, region, and type.
+
+    Args:
+        model_entries: List of model entry strings as found in models.txt
+
+    Returns:
+        List of dictionaries with parsed model information containing keys:
+        - provider: Name of the provider (e.g., 'azure', 'openai', 'anthropic', etc.)
+        - model_name: Base name of the model
+        - version: Version of the model (if available)
+        - region: Deployment region (if available)
+        - model_type: Type of the model (text, image, audio based on pattern analysis)
+    """
+    parsed_models = []
+
+    # Common provider prefixes to identify
+    known_providers = [
+        'azure', 'bedrock', 'anthropic', 'openai', 'cohere', 'google',
+        'mistral', 'meta', 'amazon', 'ai21', 'anyscale', 'stability',
+        'cloudflare', 'databricks', 'cerebras', 'assemblyai'
+    ]
+
+    # Image-related keywords to identify image models
+    image_indicators = ['dall-e', 'stable-diffusion', 'image', 'canvas', 'x-', 'steps']
+
+    # Audio-related keywords to identify audio models
+    audio_indicators = ['whisper', 'tts', 'audio', 'voice']
+
+    for entry in model_entries:
+        model_info = {
+            'provider': '',
+            'model_name': '',
+            'version': '',
+            'region': '',
+            'model_type': 'text'  # Default to text
+        }
+
+        # Check for image models
+        if any(indicator in entry.lower() for indicator in image_indicators):
+            model_info['model_type'] = 'image'
+
+        # Check for audio models
+        elif any(indicator in entry.lower() for indicator in audio_indicators):
+            model_info['model_type'] = 'audio'
+
+        # Parse the entry based on common patterns
+        parts = entry.split('/')
+
+        # Handle region and provider extraction
+        if len(parts) >= 2:
+            # Extract provider from the beginning (common pattern)
+            if parts[0].lower() in known_providers:
+                model_info['provider'] = parts[0].lower()
+
+                # For bedrock and azure, the region is often the next part
+                if parts[0].lower() in ['bedrock', 'azure'] and len(parts) >= 3:
+                    # Skip commitment parts if present
+                    if 'commitment' not in parts[1]:
+                        model_info['region'] = parts[1]
+
+            # The last part typically contains the model name and possibly version
+            model_with_version = parts[-1]
+        else:
+            # For single-part entries
+            model_with_version = entry
+
+        # Extract provider from model name if not already set
+        if not model_info['provider']:
+            # Look for known providers within the model name
+            for provider in known_providers:
+                if provider in model_with_version.lower() or f'{provider}.' in model_with_version.lower():
+                    model_info['provider'] = provider
+                    # Remove provider prefix if it exists at the beginning
+                    if model_with_version.lower().startswith(f'{provider}.'):
+                        model_with_version = model_with_version[len(provider) + 1:]
+                    break
+
+        # Extract version information
+        version_match = re.search(r'[:.-]v(\d+(?:\.\d+)*(?:-\d+)?|\d+)(?::\d+)?$', model_with_version)
+        if version_match:
+            model_info['version'] = version_match.group(1)
+            # Remove version from model name
+            model_name = model_with_version[:version_match.start()]
+        else:
+            # Look for date-based versions like 2024-08-06
+            date_match = re.search(r'-(\d{4}-\d{2}-\d{2})$', model_with_version)
+            if date_match:
+                model_info['version'] = date_match.group(1)
+                model_name = model_with_version[:date_match.start()]
+            else:
+                model_name = model_with_version
+
+        # Clean up model name by removing trailing/leading separators
+        model_info['model_name'] = model_name.strip('.-:')
+
+        parsed_models.append(model_info)
+
+    return parsed_models
+
+
+def create_model_hierarchy(model_entries: List[str]) -> Dict[str, Dict[str, Dict[str, Dict[str, str]]]]:
+    """
+    Organize model entries into a nested dictionary structure by provider, model, version, and region.
+
+    Args:
+        model_entries: List of model entry strings as found in models.txt
+
+    Returns:
+        Nested dictionary with the structure:
+        Provider -> Model -> Version -> Region = full model string
+        If region or version is None, they are replaced with "NA".
+    """
+    # Parse the model entries to get structured information
+    parsed_models = parse_model_entries(model_entries)
+
+    # Create the nested dictionary structure
+    hierarchy = {}
+
+    for i, model_info in enumerate(parsed_models):
+        provider = model_info['provider'] if model_info['provider'] else 'unknown'
+        model_name = model_info['model_name']
+        version = model_info['version'] if model_info['version'] else 'NA'
+        # For Azure models, always use 'NA' as region since they are globally available
+        region = 'NA' if provider == 'azure' else (model_info['region'] if model_info['region'] else 'NA')
+
+        # Initialize nested dictionaries if they don't exist
+        if provider not in hierarchy:
+            hierarchy[provider] = {}
+
+        if model_name not in hierarchy[provider]:
+            hierarchy[provider][model_name] = {}
+
+        if version not in hierarchy[provider][model_name]:
+            hierarchy[provider][model_name][version] = {}
+
+        # Store the full model string at the leaf node
+        hierarchy[provider][model_name][version][region] = model_entries[i]
+
+    return hierarchy
+
+
+# NVIDIA GPU specifications - Name: (VRAM in GB, FP16 TOPS)
+NVIDIA_GPUS = {
+    "RTX 3050": (8, 18),
+    "RTX 3060": (12, 25),
+    "RTX 3070": (8, 40),
+    "RTX 3080": (10, 58),
+    "RTX 3090": (24, 71),
+    "RTX 4060": (8, 41),
+    "RTX 4070": (12, 56),
+    "RTX 4080": (16, 113),
+    "RTX 4090": (24, 165),
+    "RTX A2000": (6, 20),
+    "RTX A4000": (16, 40),
+    "RTX A5000": (24, 64),
+    "RTX A6000": (48, 75),
+    "A100 40GB": (40, 312),
+    "A100 80GB": (80, 312),
+    "H100 80GB": (80, 989),
+}
+
+
+def get_hf_model_info(model_id: str) -> Optional[ModelInfo]:
+    """
+    Retrieve model information from the Hugging Face Hub.
+
+    Args:
+        model_id: Hugging Face model ID (e.g., "facebook/opt-1.3b")
+
+    Returns:
+        ModelInfo object or None if model not found
+    """
+    try:
+        api = HfApi()
+        model_info = api.model_info(model_id)
+        return model_info
+    except (RepositoryNotFoundError, RevisionNotFoundError) as e:
+        print(f"Error fetching model info: {e}")
+        return None
+
+
+def extract_model_size(model_info: ModelInfo) -> Optional[Tuple[float, str]]:
+    """
+    Extract the parameter size and precision from model information.
+
+    Args:
+        model_info: ModelInfo object from Hugging Face Hub
+
+    Returns:
+        Tuple of (parameter size in billions, precision) or None if not found
+    """
+    # Try to get parameter count from model card
+    if model_info.card_data is not None:
+        if "model-index" in model_info.card_data and isinstance(model_info.card_data["model-index"], list):
+            for item in model_info.card_data["model-index"]:
+                if "parameters" in item:
+                    return float(item["parameters"]) / 1e9, "fp16"  # Convert to billions and assume fp16
+
+    # Try to extract from model name
+    name = model_info.id.lower()
+    size_patterns = [
+        r"(\d+(\.\d+)?)b",   # matches patterns like "1.3b" or "7b"
+        r"-(\d+(\.\d+)?)b",  # matches patterns like "llama-7b"
+        r"(\d+(\.\d+)?)-b",  # matches other formatting variations
+    ]
+
+    for pattern in size_patterns:
+        match = re.search(pattern, name)
+        if match:
+            size_str = match.group(1)
+            return float(size_str), "fp16"  # Default to fp16
+
+    # Extract precision if available
+    precision = "fp16"  # Default
+    precision_patterns = {"fp16": r"fp16", "int8": r"int8", "int4": r"int4", "fp32": r"fp32"}
+    for prec, pattern in precision_patterns.items():
+        if re.search(pattern, name):
+            precision = prec
+            break
+
+    # If couldn't determine size, check sibling models or readme
+    if model_info.siblings:
+        for sibling in model_info.siblings:
+            if sibling.rfilename == "README.md" and sibling.size < 100000:  # reasonable size for readme
+                try:
+                    content = requests.get(sibling.lfs.url).text
+                    param_pattern = r"(\d+(\.\d+)?)\s*[Bb](illion)?\s*[Pp]arameters"
+                    match = re.search(param_pattern, content)
+                    if match:
+                        return float(match.group(1)), precision
+                except:
+                    pass
+
+    # As a last resort, try to analyze config.json if it exists
+    config_sibling = next((s for s in model_info.siblings if s.rfilename == "config.json"), None)
+    if config_sibling:
+        try:
+            config = requests.get(config_sibling.lfs.url).json()
+            if "n_params" in config:
+                return float(config["n_params"]) / 1e9, precision
+            # Calculate from architecture if available
+            if all(k in config for k in ["n_layer", "n_head", "n_embd"]):
+                n_layer = config["n_layer"]
+                n_embd = config["n_embd"]
+                n_head = config["n_head"]
+                # Transformer parameter estimation formula
+                params = 12 * n_layer * (n_embd**2) * (1 + 13 / (12 * n_embd))
+                return params / 1e9, precision
+        except:
+            pass
+
+    return None
+
+
+def calculate_vram_requirements(param_size: float, precision: str = "fp16") -> Dict[str, float]:
+    """
+    Calculate VRAM requirements for inference using the EleutherAI transformer math formula.
+
+    Args:
+        param_size: Model size in billions of parameters
+        precision: Model precision ("fp32", "fp16", "int8", "int4")
+
+    Returns:
+        Dictionary with various memory requirements in GB
+    """
+    # Convert parameters to actual count
+    param_count = param_size * 1e9
+
+    # Size per parameter based on precision
+    bytes_per_param = {
+        "fp32": 4,
+        "fp16": 2,
+        "int8": 1,
+        "int4": 0.5,  # 4 bits = 0.5 bytes
+    }[precision]
+
+    # Base model size (parameters * bytes per parameter)
+    model_size_gb = (param_count * bytes_per_param) / (1024**3)
+
+    # EleutherAI formula components for inference memory
+    # Layer activations - scales with sequence length
+    activation_factor = 1.2  # varies by architecture
+
+    # KV cache size (scales with batch size and sequence length)
+    # Estimate for single batch, 2048-token context
+    kv_cache_size_gb = (param_count * 0.0625 * bytes_per_param) / (1024**3)  # ~6.25% of params for KV cache
+
+    # Total VRAM needed for inference
+    total_inference_gb = model_size_gb + (model_size_gb * activation_factor) + kv_cache_size_gb
+
+    # Add overhead for CUDA, buffers, and fragmentation
+    overhead_gb = 0.8  # 800 MB overhead
+
+    # Dynamic computation graph allocation
+    compute_overhead_factor = 0.1  # varies based on attention computation method
+
+    # Final VRAM estimate
+    total_vram_required_gb = total_inference_gb + overhead_gb + (total_inference_gb * compute_overhead_factor)
+
+    return {
+        "model_size_gb": model_size_gb,
+        "kv_cache_gb": kv_cache_size_gb,
+        "activations_gb": model_size_gb * activation_factor,
+        "overhead_gb": overhead_gb + (total_inference_gb * compute_overhead_factor),
+        "total_vram_gb": total_vram_required_gb
+    }
+
+
+def find_compatible_gpus(vram_required: float) -> List[str]:
+    """
+    Find NVIDIA GPUs that can run a model requiring the specified VRAM.
+
+    Args:
+        vram_required: Required VRAM in GB
+
+    Returns:
+        List of compatible GPU names sorted by VRAM capacity (smallest first)
+    """
+    compatible_gpus = [(name, specs[0]) for name, specs in NVIDIA_GPUS.items() if specs[0] >= vram_required]
+    return [gpu[0] for gpu in sorted(compatible_gpus, key=lambda x: x[1])]
+
+
+def estimate_performance(param_size: float, precision: str, gpu_name: str) -> Dict[str, float]:
+    """
+    Estimate token/second performance for a model on a specific GPU.
+
+    Args:
+        param_size: Model size in billions of parameters
+        precision: Model precision
+        gpu_name: Name of the NVIDIA GPU
+
+    Returns:
+        Dictionary with performance metrics
+    """
+    if gpu_name not in NVIDIA_GPUS:
+        return {"tokens_per_second": 0, "tflops_utilization": 0}
+
+    gpu_vram, gpu_tops = NVIDIA_GPUS[gpu_name]
+
+    # Calculate FLOPs per token (based on model size)
+    # Formula: ~6 * num_parameters FLOPs per token (inference)
+    flops_per_token = 6 * param_size * 1e9
+
+    # Convert TOPS to TFLOPS based on precision
+    precision_factor = 1.0 if precision == "fp32" else 2.0 if precision == "fp16" else 4.0 if precision in ["int8", "int4"] else 1.0
+    gpu_tflops = gpu_tops * precision_factor
+
+    # Practical utilization (GPUs rarely achieve 100% of theoretical performance)
+    practical_utilization = 0.6  # 60% utilization
+
+    # Calculate tokens per second
+    effective_tflops = gpu_tflops * practical_utilization
+    tokens_per_second = (effective_tflops * 1e12) / flops_per_token
+
+    return {
+        "tokens_per_second": tokens_per_second,
+        "flops_per_token": flops_per_token,
+        "tflops_utilization": practical_utilization,
+        "effective_tflops": effective_tflops
+    }
+
+
+def analyze_hf_model(model_id: str) -> Dict[str, any]:
+    """
+    Comprehensive analysis of a Hugging Face model:
+    - Downloads model information
+    - Extracts parameter size and precision
+    - Estimates VRAM requirements
+    - Identifies compatible NVIDIA GPUs
+    - Estimates performance on these GPUs
+
+    Args:
+        model_id: Hugging Face model ID (e.g., "facebook/opt-1.3b")
+
+    Returns:
+        Dictionary with analysis results or error message
+    """
+    # Get model information
+    model_info = get_hf_model_info(model_id)
+    if not model_info:
+        return {"error": f"Model {model_id} not found on Hugging Face"}
+
+    # Extract model size and precision
+    size_info = extract_model_size(model_info)
+    if not size_info:
+        return {"error": f"Couldn't determine parameter count for {model_id}"}
+
+    param_size, precision = size_info
+
+    # Calculate VRAM requirements
+    vram_requirements = calculate_vram_requirements(param_size, precision)
+    total_vram_gb = vram_requirements["total_vram_gb"]
+
+    # Find compatible GPUs
+    compatible_gpus = find_compatible_gpus(total_vram_gb)
+
+    # Calculate performance for each compatible GPU
+    gpu_performance = {}
+    for gpu in compatible_gpus:
+        gpu_performance[gpu] = estimate_performance(param_size, precision, gpu)
+
+    # Determine the largest GPU that can run the model
+    largest_compatible_gpu = compatible_gpus[-1] if compatible_gpus else None
+
+    return {
+        "model_id": model_id,
+        "parameter_size": param_size,  # in billions
+        "precision": precision,
+        "vram_requirements": vram_requirements,
+        "compatible_gpus": compatible_gpus,
+        "largest_compatible_gpu": largest_compatible_gpu,
+        "gpu_performance": gpu_performance,
+        #"model_info": {
+        #    "description": model_info.description,
+        #    "tags": model_info.tags,
+        #    "downloads": model_info.downloads,
+        #    "library": getattr(model_info, "library", None)
+        #}
+    }
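For reference, a minimal usage sketch of the new utils_on.py helpers, run outside Streamlit. The 17B/fp16 input is a hypothetical value chosen to mirror the app's default model ID, not a measured figure; with the constants above it works out to roughly 80 GB of estimated VRAM, which only the 80 GB entries in NVIDIA_GPUS satisfy.

```python
# Sketch: exercise the estimator functions directly (no Hugging Face call needed).
# The 17 B / fp16 inputs are hypothetical, chosen to mirror the app's default model ID.
from utils_on import calculate_vram_requirements, find_compatible_gpus, estimate_performance

vram = calculate_vram_requirements(param_size=17, precision="fp16")
print(f"Total VRAM estimate: {vram['total_vram_gb']:.1f} GB")  # ≈ 79.6 GB with the factors above

gpus = find_compatible_gpus(vram["total_vram_gb"])
print("Compatible GPUs:", gpus)  # ['A100 80GB', 'H100 80GB']

if gpus:
    perf = estimate_performance(17, "fp16", gpus[-1])
    print(f"~{perf['tokens_per_second']:.0f} tokens/s on {gpus[-1]}")
```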