Commit 6913a64 · ash-98 committed
Parent: 9b8af98

On premise estimator v1
__pycache__/utils.cpython-313.pyc ADDED
Binary file (5.03 kB)
 
__pycache__/utils_on.cpython-313.pyc ADDED
Binary file (14.4 kB)
 
app.py CHANGED
@@ -2,6 +2,7 @@ import streamlit as st
import asyncio
import tokonomics
from utils import create_model_hierarchy
+ from utils_on import analyze_hf_model  # New import for On Premise Estimator functionality

st.set_page_config(page_title="LLM Pricing App", layout="wide")

@@ -86,11 +87,25 @@ with st.sidebar:
    st.divider()
    st.sidebar.title("LLM Pricing Calculator")

+ # Track active tab in session state
+ if "active_tab" not in st.session_state:
+     st.session_state.active_tab = "Model Selection"
+
+ def switch_tab(tab_name):
+     st.session_state.active_tab = tab_name
+     st.rerun()
+
+
# --------------------------
- # Main Content Layout (Model Selection Tab)
+ # Main Content Layout (Tabs)
# --------------------------
- tab1, tab2 = st.tabs(["Model Selection", "About"])
+ tab_labels = ["Model Selection", "On Premise Estimator", "About"]
+ tab_index = tab_labels.index(st.session_state.active_tab)
+ tabs = st.tabs(tab_labels)
+ tab1, tab2, tab3 = tabs

+
+ # ----- Tab 1: Model Selection -----
with tab1:
    st.header("LLM Pricing App")

@@ -177,7 +192,85 @@ with tab1:
        st.session_state.pop("result", None)
        st.rerun()

+
+ # ----- Tab 2: On Premise Estimator -----
+ def format_analysis_report(analysis_result: dict) -> str:
+     """Convert the raw analysis_result dict into a human-readable report."""
+     if "error" in analysis_result:
+         return f"**Error:** {analysis_result['error']}"
+
+     lines = []
+     lines.append(f"### Model Analysis Report for `{analysis_result.get('model_id', 'Unknown Model')}`\n")
+     lines.append(f"**Parameter Size:** {analysis_result.get('parameter_size', 'N/A')} Billion parameters\n")
+     lines.append(f"**Precision:** {analysis_result.get('precision', 'N/A')}\n")
+
+     vram = analysis_result.get("vram_requirements", {})
+     lines.append("#### VRAM Requirements:")
+     lines.append(f"- Model Size: {vram.get('model_size_gb', 0):.2f} GB")
+     lines.append(f"- KV Cache: {vram.get('kv_cache_gb', 0):.2f} GB")
+     lines.append(f"- Activations: {vram.get('activations_gb', 0):.2f} GB")
+     lines.append(f"- Overhead: {vram.get('overhead_gb', 0):.2f} GB")
+     lines.append(f"- **Total VRAM:** {vram.get('total_vram_gb', 0):.2f} GB\n")
+
+     compatible_gpus = analysis_result.get("compatible_gpus", [])
+     lines.append("#### Compatible GPUs:")
+     if compatible_gpus:
+         for gpu in compatible_gpus:
+             lines.append(f"- {gpu}")
+     else:
+         lines.append("- None found")
+     lines.append(f"\n**Largest Compatible GPU:** {analysis_result.get('largest_compatible_gpu', 'N/A')}\n")
+
+     gpu_perf = analysis_result.get("gpu_performance", {})
+     if gpu_perf:
+         lines.append("#### GPU Performance:")
+         for gpu, perf in gpu_perf.items():
+             lines.append(f"**{gpu}:**")
+             lines.append(f" - Tokens per Second: {perf.get('tokens_per_second', 0):.2f}")
+             lines.append(f" - FLOPs per Token: {perf.get('flops_per_token', 0):.2f}")
+             lines.append(f" - Effective TFLOPS: {perf.get('effective_tflops', 0):.2f}\n")
+     else:
+         lines.append("#### GPU Performance: N/A\n")
+
+     #model_info = analysis_result.get("model_info", {})
+     #lines.append("#### Model Information:")
+     #if model_info:
+     #    if model_info.get("description"):
+     #        lines.append(f"- Description: {model_info['description']}")
+     #    if model_info.get("tags"):
+     #        lines.append(f"- Tags: {', '.join(model_info['tags'])}")
+     #    if model_info.get("downloads") is not None:
+     #        lines.append(f"- Downloads: {model_info['downloads']}")
+     #    if model_info.get("library"):
+     #        lines.append(f"- Library: {model_info['library']}")
+     #else:
+     #    lines.append("No additional model info available.")
+
+     return "\n".join(lines)
+
+
+ # ----- Tab 2: On Premise Estimator -----
with tab2:
+     st.header("On Premise Estimator")
+     st.markdown("Enter a Hugging Face model ID to perform an on premise analysis using the provided estimator.")
+
+     # Input for model ID with a default value
+     hf_model_id = st.text_input("Hugging Face Model ID", value="facebook/opt-1.3b")
+
+     if st.button("Analyze Model"):
+         st.session_state.active_tab = "On Premise Estimator"
+         with st.spinner("Analyzing model..."):
+             analysis_result = analyze_hf_model(hf_model_id)
+         st.session_state.analysis_result = analysis_result
+         st.rerun()
+
+     # Render if analysis result exists
+     if "analysis_result" in st.session_state:
+         report = format_analysis_report(st.session_state.analysis_result)
+         st.markdown(report)
+
+ # ----- Tab 3: About -----
+ with tab3:
    st.markdown(
        """
        ## About This App
@@ -186,8 +279,29 @@ with tab2:

        - The app downloads the latest pricing from the LiteLLM repository.
        - Using simple maths to estimate the total tokens.
+         - Helps you estimate hardware requirements for running open-source large language models (LLMs) on-premise, using only the model ID from Hugging Face.
        - Version 0.1

+         ---
+
+         ### 📌 Version History
+
+         | Version | Release Date | Key Feature Updates |
+         |---------|--------------|---------------------|
+         | `v1.0`  | 2025-03-26   | Initial release with basic total token estimation |
+         | `v1.1`  | 2025-04-06   | Added On Premise Estimator tab |
+
+
+         ---
+
+
        Website: [https://www.priam.ai](https://www.priam.ai)
        """
    )
+     st.markdown(
+         """
+         ### Disclaimer
+
+         This app is for demonstration purposes only. Actual costs may vary based on usage patterns and other factors.
+         """
+     )
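
Editor's note: for reference, a minimal sketch (not part of this commit) of the dictionary shape that format_analysis_report consumes. The keys mirror what analyze_hf_model in the new utils_on.py returns; the GPU names and numbers below are illustrative placeholders, not output from a real run.

sample_result = {
    "model_id": "facebook/opt-1.3b",
    "parameter_size": 1.3,  # billions
    "precision": "fp16",
    "vram_requirements": {"model_size_gb": 2.42, "kv_cache_gb": 0.15, "activations_gb": 2.91,
                          "overhead_gb": 1.35, "total_vram_gb": 6.83},
    "compatible_gpus": ["RTX 3050", "RTX 4090"],
    "largest_compatible_gpu": "RTX 4090",
    "gpu_performance": {"RTX 4090": {"tokens_per_second": 25384.6, "flops_per_token": 7.8e9,
                                     "effective_tflops": 198.0}},
}
print(format_analysis_report(sample_result))  # prints the markdown report rendered in the tab
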
utils_on.py ADDED
@@ -0,0 +1,429 @@
+ from typing import List, Dict, Tuple, Optional, Union
+ import re
+ import math
+ import requests
+ import numpy as np
+ from huggingface_hub import HfApi, ModelInfo
+ from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
+
+ def parse_model_entries(model_entries: List[str]) -> List[Dict[str, str]]:
+     """
+     Parse a list of model entries into structured dictionaries with provider, model name, version, region, and type.
+
+     Args:
+         model_entries: List of model entry strings as found in models.txt
+
+     Returns:
+         List of dictionaries with parsed model information containing keys:
+         - provider: Name of the provider (e.g., 'azure', 'openai', 'anthropic', etc.)
+         - model_name: Base name of the model
+         - version: Version of the model (if available)
+         - region: Deployment region (if available)
+         - model_type: Type of the model (text, image, audio based on pattern analysis)
+     """
+     parsed_models = []
+
+     # Common provider prefixes to identify
+     known_providers = [
+         'azure', 'bedrock', 'anthropic', 'openai', 'cohere', 'google',
+         'mistral', 'meta', 'amazon', 'ai21', 'anyscale', 'stability',
+         'cloudflare', 'databricks', 'cerebras', 'assemblyai'
+     ]
+
+     # Image-related keywords to identify image models
+     image_indicators = ['dall-e', 'stable-diffusion', 'image', 'canvas', 'x-', 'steps']
+
+     # Audio-related keywords to identify audio models
+     audio_indicators = ['whisper', 'tts', 'audio', 'voice']
+
+     for entry in model_entries:
+         model_info = {
+             'provider': '',
+             'model_name': '',
+             'version': '',
+             'region': '',
+             'model_type': 'text'  # Default to text
+         }
+
+         # Check for image models
+         if any(indicator in entry.lower() for indicator in image_indicators):
+             model_info['model_type'] = 'image'
+
+         # Check for audio models
+         elif any(indicator in entry.lower() for indicator in audio_indicators):
+             model_info['model_type'] = 'audio'
+
+         # Parse the entry based on common patterns
+         parts = entry.split('/')
+
+         # Handle region and provider extraction
+         if len(parts) >= 2:
+             # Extract provider from the beginning (common pattern)
+             if parts[0].lower() in known_providers:
+                 model_info['provider'] = parts[0].lower()
+
+                 # For bedrock and azure, the region is often the next part
+                 if parts[0].lower() in ['bedrock', 'azure'] and len(parts) >= 3:
+                     # Skip commitment parts if present
+                     if 'commitment' not in parts[1]:
+                         model_info['region'] = parts[1]
+
+             # The last part typically contains the model name and possibly version
+             model_with_version = parts[-1]
+         else:
+             # For single-part entries
+             model_with_version = entry
+
+         # Extract provider from model name if not already set
+         if not model_info['provider']:
+             # Look for known providers within the model name
+             for provider in known_providers:
+                 if provider in model_with_version.lower() or f'{provider}.' in model_with_version.lower():
+                     model_info['provider'] = provider
+                     # Remove provider prefix if it exists at the beginning
+                     if model_with_version.lower().startswith(f'{provider}.'):
+                         model_with_version = model_with_version[len(provider) + 1:]
+                     break
+
+         # Extract version information
+         version_match = re.search(r'[:.-]v(\d+(?:\.\d+)*(?:-\d+)?|\d+)(?::\d+)?$', model_with_version)
+         if version_match:
+             model_info['version'] = version_match.group(1)
+             # Remove version from model name
+             model_name = model_with_version[:version_match.start()]
+         else:
+             # Look for date-based versions like 2024-08-06
+             date_match = re.search(r'-(\d{4}-\d{2}-\d{2})$', model_with_version)
+             if date_match:
+                 model_info['version'] = date_match.group(1)
+                 model_name = model_with_version[:date_match.start()]
+             else:
+                 model_name = model_with_version
+
+         # Clean up model name by removing trailing/leading separators
+         model_info['model_name'] = model_name.strip('.-:')
+
+         parsed_models.append(model_info)
+
+     return parsed_models
+
+
+ def create_model_hierarchy(model_entries: List[str]) -> Dict[str, Dict[str, Dict[str, Dict[str, str]]]]:
+     """
+     Organize model entries into a nested dictionary structure by provider, model, version, and region.
+
+     Args:
+         model_entries: List of model entry strings as found in models.txt
+
+     Returns:
+         Nested dictionary with the structure:
+         Provider -> Model -> Version -> Region = full model string
+         If region or version is None, they are replaced with "NA".
+     """
+     # Parse the model entries to get structured information
+     parsed_models = parse_model_entries(model_entries)
+
+     # Create the nested dictionary structure
+     hierarchy = {}
+
+     for i, model_info in enumerate(parsed_models):
+         provider = model_info['provider'] if model_info['provider'] else 'unknown'
+         model_name = model_info['model_name']
+         version = model_info['version'] if model_info['version'] else 'NA'
+         # For Azure models, always use 'NA' as region since they are globally available
+         region = 'NA' if provider == 'azure' else (model_info['region'] if model_info['region'] else 'NA')
+
+         # Initialize nested dictionaries if they don't exist
+         if provider not in hierarchy:
+             hierarchy[provider] = {}
+
+         if model_name not in hierarchy[provider]:
+             hierarchy[provider][model_name] = {}
+
+         if version not in hierarchy[provider][model_name]:
+             hierarchy[provider][model_name][version] = {}
+
+         # Store the full model string at the leaf node
+         hierarchy[provider][model_name][version][region] = model_entries[i]
+
+     return hierarchy
+
+
+ # NVIDIA GPU specifications - Name: (VRAM in GB, FP16 TOPS)
+ NVIDIA_GPUS = {
+     "RTX 3050": (8, 18),
+     "RTX 3060": (12, 25),
+     "RTX 3070": (8, 40),
+     "RTX 3080": (10, 58),
+     "RTX 3090": (24, 71),
+     "RTX 4060": (8, 41),
+     "RTX 4070": (12, 56),
+     "RTX 4080": (16, 113),
+     "RTX 4090": (24, 165),
+     "RTX A2000": (6, 20),
+     "RTX A4000": (16, 40),
+     "RTX A5000": (24, 64),
+     "RTX A6000": (48, 75),
+     "A100 40GB": (40, 312),
+     "A100 80GB": (80, 312),
+     "H100 80GB": (80, 989),
+ }
+
+
+ def get_hf_model_info(model_id: str) -> Optional[ModelInfo]:
+     """
+     Retrieve model information from the Hugging Face Hub.
+
+     Args:
+         model_id: Hugging Face model ID (e.g., "facebook/opt-1.3b")
+
+     Returns:
+         ModelInfo object or None if model not found
+     """
+     try:
+         api = HfApi()
+         model_info = api.model_info(model_id)
+         return model_info
+     except (RepositoryNotFoundError, RevisionNotFoundError) as e:
+         print(f"Error fetching model info: {e}")
+         return None
+
+
+ def extract_model_size(model_info: ModelInfo) -> Optional[Tuple[float, str]]:
+     """
+     Extract the parameter size and precision from model information.
+
+     Args:
+         model_info: ModelInfo object from Hugging Face Hub
+
+     Returns:
+         Tuple of (parameter size in billions, precision) or None if not found
+     """
+     # Try to get parameter count from model card
+     if model_info.card_data is not None:
+         if "model-index" in model_info.card_data and isinstance(model_info.card_data["model-index"], list):
+             for item in model_info.card_data["model-index"]:
+                 if "parameters" in item:
+                     return float(item["parameters"]) / 1e9, "fp16"  # Convert to billions and assume fp16
+
+     # Try to extract from model name
+     name = model_info.id.lower()
+     size_patterns = [
+         r"(\d+(\.\d+)?)b",   # matches patterns like "1.3b" or "7b"
+         r"-(\d+(\.\d+)?)b",  # matches patterns like "llama-7b"
+         r"(\d+(\.\d+)?)-b",  # matches other formatting variations
+     ]
+
+     for pattern in size_patterns:
+         match = re.search(pattern, name)
+         if match:
+             size_str = match.group(1)
+             return float(size_str), "fp16"  # Default to fp16
+
+     # Extract precision if available
+     precision = "fp16"  # Default
+     precision_patterns = {"fp16": r"fp16", "int8": r"int8", "int4": r"int4", "fp32": r"fp32"}
+     for prec, pattern in precision_patterns.items():
+         if re.search(pattern, name):
+             precision = prec
+             break
+
+     # If couldn't determine size, check sibling models or readme
+     if model_info.siblings:
+         for sibling in model_info.siblings:
+             if sibling.rfilename == "README.md" and sibling.size < 100000:  # reasonable size for readme
+                 try:
+                     content = requests.get(sibling.lfs.url).text
+                     param_pattern = r"(\d+(\.\d+)?)\s*[Bb](illion)?\s*[Pp]arameters"
+                     match = re.search(param_pattern, content)
+                     if match:
+                         return float(match.group(1)), precision
+                 except:
+                     pass
+
+     # As a last resort, try to analyze config.json if it exists
+     config_sibling = next((s for s in model_info.siblings if s.rfilename == "config.json"), None)
+     if config_sibling:
+         try:
+             config = requests.get(config_sibling.lfs.url).json()
+             if "n_params" in config:
+                 return float(config["n_params"]) / 1e9, precision
+             # Calculate from architecture if available
+             if all(k in config for k in ["n_layer", "n_head", "n_embd"]):
+                 n_layer = config["n_layer"]
+                 n_embd = config["n_embd"]
+                 n_head = config["n_head"]
+                 # Transformer parameter estimation formula
+                 params = 12 * n_layer * (n_embd**2) * (1 + 13 / (12 * n_embd))
+                 return params / 1e9, precision
+         except:
+             pass
+
+     return None
+
+
+ def calculate_vram_requirements(param_size: float, precision: str = "fp16") -> Dict[str, float]:
+     """
+     Calculate VRAM requirements for inference using the EleutherAI transformer math formula.
+
+     Args:
+         param_size: Model size in billions of parameters
+         precision: Model precision ("fp32", "fp16", "int8", "int4")
+
+     Returns:
+         Dictionary with various memory requirements in GB
+     """
+     # Convert parameters to actual count
+     param_count = param_size * 1e9
+
+     # Size per parameter based on precision
+     bytes_per_param = {
+         "fp32": 4,
+         "fp16": 2,
+         "int8": 1,
+         "int4": 0.5,  # 4 bits = 0.5 bytes
+     }[precision]
+
+     # Base model size (parameters * bytes per parameter)
+     model_size_gb = (param_count * bytes_per_param) / (1024**3)
+
+     # EleutherAI formula components for inference memory
+     # Layer activations - scales with sequence length
+     activation_factor = 1.2  # varies by architecture
+
+     # KV cache size (scales with batch size and sequence length)
+     # Estimate for single batch, 2048-token context
+     kv_cache_size_gb = (param_count * 0.0625 * bytes_per_param) / (1024**3)  # ~6.25% of params for KV cache
+
+     # Total VRAM needed for inference
+     total_inference_gb = model_size_gb + (model_size_gb * activation_factor) + kv_cache_size_gb
+
+     # Add overhead for CUDA, buffers, and fragmentation
+     overhead_gb = 0.8  # 800 MB overhead
+
+     # Dynamic computation graph allocation
+     compute_overhead_factor = 0.1  # varies based on attention computation method
+
+     # Final VRAM estimate
+     total_vram_required_gb = total_inference_gb + overhead_gb + (total_inference_gb * compute_overhead_factor)
+
+     return {
+         "model_size_gb": model_size_gb,
+         "kv_cache_gb": kv_cache_size_gb,
+         "activations_gb": model_size_gb * activation_factor,
+         "overhead_gb": overhead_gb + (total_inference_gb * compute_overhead_factor),
+         "total_vram_gb": total_vram_required_gb
+     }
+
+
+ def find_compatible_gpus(vram_required: float) -> List[str]:
+     """
+     Find NVIDIA GPUs that can run a model requiring the specified VRAM.
+
+     Args:
+         vram_required: Required VRAM in GB
+
+     Returns:
+         List of compatible GPU names sorted by VRAM capacity (smallest first)
+     """
+     compatible_gpus = [(name, specs[0]) for name, specs in NVIDIA_GPUS.items() if specs[0] >= vram_required]
+     return [gpu[0] for gpu in sorted(compatible_gpus, key=lambda x: x[1])]
+
+
+ def estimate_performance(param_size: float, precision: str, gpu_name: str) -> Dict[str, float]:
+     """
+     Estimate token/second performance for a model on a specific GPU.
+
+     Args:
+         param_size: Model size in billions of parameters
+         precision: Model precision
+         gpu_name: Name of the NVIDIA GPU
+
+     Returns:
+         Dictionary with performance metrics
+     """
+     if gpu_name not in NVIDIA_GPUS:
+         return {"tokens_per_second": 0, "tflops_utilization": 0}
+
+     gpu_vram, gpu_tops = NVIDIA_GPUS[gpu_name]
+
+     # Calculate FLOPs per token (based on model size)
+     # Formula: ~6 * num_parameters FLOPs per token (inference)
+     flops_per_token = 6 * param_size * 1e9
+
+     # Convert TOPS to TFLOPS based on precision
+     precision_factor = 1.0 if precision == "fp32" else 2.0 if precision == "fp16" else 4.0 if precision in ["int8", "int4"] else 1.0
+     gpu_tflops = gpu_tops * precision_factor
+
+     # Practical utilization (GPUs rarely achieve 100% of theoretical performance)
+     practical_utilization = 0.6  # 60% utilization
+
+     # Calculate tokens per second
+     effective_tflops = gpu_tflops * practical_utilization
+     tokens_per_second = (effective_tflops * 1e12) / flops_per_token
+
+     return {
+         "tokens_per_second": tokens_per_second,
+         "flops_per_token": flops_per_token,
+         "tflops_utilization": practical_utilization,
+         "effective_tflops": effective_tflops
+     }
+
+
+ def analyze_hf_model(model_id: str) -> Dict[str, any]:
+     """
+     Comprehensive analysis of a Hugging Face model:
+     - Downloads model information
+     - Extracts parameter size and precision
+     - Estimates VRAM requirements
+     - Identifies compatible NVIDIA GPUs
+     - Estimates performance on these GPUs
+
+     Args:
+         model_id: Hugging Face model ID (e.g., "facebook/opt-1.3b")
+
+     Returns:
+         Dictionary with analysis results or error message
+     """
+     # Get model information
+     model_info = get_hf_model_info(model_id)
+     if not model_info:
+         return {"error": f"Model {model_id} not found on Hugging Face"}
+
+     # Extract model size and precision
+     size_info = extract_model_size(model_info)
+     if not size_info:
+         return {"error": f"Couldn't determine parameter count for {model_id}"}
+
+     param_size, precision = size_info
+
+     # Calculate VRAM requirements
+     vram_requirements = calculate_vram_requirements(param_size, precision)
+     total_vram_gb = vram_requirements["total_vram_gb"]
+
+     # Find compatible GPUs
+     compatible_gpus = find_compatible_gpus(total_vram_gb)
+
+     # Calculate performance for each compatible GPU
+     gpu_performance = {}
+     for gpu in compatible_gpus:
+         gpu_performance[gpu] = estimate_performance(param_size, precision, gpu)
+
+     # Determine the largest GPU that can run the model
+     largest_compatible_gpu = compatible_gpus[-1] if compatible_gpus else None
+
+     return {
+         "model_id": model_id,
+         "parameter_size": param_size,  # in billions
+         "precision": precision,
+         "vram_requirements": vram_requirements,
+         "compatible_gpus": compatible_gpus,
+         "largest_compatible_gpu": largest_compatible_gpu,
+         "gpu_performance": gpu_performance,
+         #"model_info": {
+         #    "description": model_info.description,
+         #    "tags": model_info.tags,
+         #    "downloads": model_info.downloads,
+         #    "library": getattr(model_info, "library", None)
+         #}
+     }
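
Editor's note: a quick way to exercise the new estimator outside Streamlit, as a sketch assuming the dependencies above (huggingface_hub, requests, numpy) are installed. The figures in the comments are hand-computed from the formulas in calculate_vram_requirements and estimate_performance for the default 1.3B fp16 model, and are rough planning estimates rather than benchmarks.

from utils_on import calculate_vram_requirements, find_compatible_gpus, estimate_performance

vram = calculate_vram_requirements(param_size=1.3, precision="fp16")
print(round(vram["total_vram_gb"], 2))              # ~6.83 GB estimated total VRAM
print(find_compatible_gpus(vram["total_vram_gb"]))  # all listed GPUs with 8 GB or more, smallest first
perf = estimate_performance(1.3, "fp16", "RTX 4090")
print(round(perf["tokens_per_second"]))             # ~25,000 tokens/s, a theoretical compute-bound ceiling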