ThatsGroes committed
Commit 76ba794 · 1 Parent(s): d16240b

authentication

Files changed (1)
  1. app.py +39 -113
app.py CHANGED
@@ -1,16 +1,8 @@
- from typing import Dict, Union
- from huggingface_hub import get_safetensors_metadata, hf_hub_download
- import argparse
- import sys
- import json
  import gradio as gr
  from typing import Dict, Union
- from huggingface_hub import get_safetensors_metadata, hf_hub_download
+ from huggingface_hub import get_safetensors_metadata, hf_hub_download, login
  import json

- # Example:
- # python get_gpu_memory.py Qwen/Qwen2.5-7B-Instruct
-
  # Dictionary mapping dtype strings to their byte sizes
  bytes_per_dtype: Dict[str, float] = {
      "int4": 0.5,
@@ -20,57 +12,8 @@ bytes_per_dtype: Dict[str, float] = {
      "float32": 4,
  }

-
- def calculate_kv_cache_memory(context_size: int, model_id: str, dtype: str, filename: str="config.json"):
-     """
-     Implements the formula suggested in https://medium.com/@tejaswi_kashyap/memory-optimization-in-llms-leveraging-kv-cache-quantization-for-efficient-inference-94bc3df5faef
-     """
-     try:
-         file_path = hf_hub_download(repo_id=model_id, filename=filename)
-
-         with open(file_path, 'r') as f:
-             config = json.load(f)
-
-         keys_to_find = {"num_hidden_layers", "num_key_value_heads", "hidden_size", "num_attention_heads"}
-
-         config = extract_keys(config, keys_to_find)
-
-         num_layers = config["num_hidden_layers"]
-
-         if "num_key_value_heads" in config:
-             num_att_heads = config["num_key_value_heads"]
-         else:
-             num_att_heads = config["num_attention_heads"]
-
-         dim_att_head = config["hidden_size"] // config["num_attention_heads"]
-
-         dtype_bytes = bytes_per_dtype[dtype]
-
-         memory_per_token = num_layers * num_att_heads * dim_att_head * dtype_bytes * 2
-
-         context_size_memory_footprint_gb = (context_size * memory_per_token) / 1_000_000_000
-
-         return context_size_memory_footprint_gb
-
-     except Exception as e:
-         print(f"Error estimating context size: {str(e)}", file=sys.stderr)
-         return None
-
-
-
  def extract_keys(json_obj, keys_to_extract):
-     """
-     Recursively searches for specific keys in a nested JSON object.
-
-     Args:
-         json_obj (dict or list): The JSON data (parsed as a dictionary or list).
-         keys_to_extract (set): A set of keys to extract values for.
-
-     Returns:
-         dict: A dictionary with found key-value pairs.
-     """
      extracted_values = {}
-
      def recursive_search(obj):
          if isinstance(obj, dict):
              for key, value in obj.items():
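The KV-cache estimate removed above is re-added in the next hunk without its docstring, but the arithmetic is unchanged: per token it charges two caches (K and V) of num_layers * num_kv_heads * head_dim elements at bytes_per_dtype[dtype] bytes each, then multiplies by the context length; extract_keys is recursive because some models' config.json nests these fields (for example under a text_config block). A rough sanity check with illustrative values, not taken from any particular config:

# Illustrative values only, not read from a real config.json.
num_layers, num_kv_heads, head_dim = 32, 8, 128
dtype_bytes = 2  # float16
memory_per_token = num_layers * num_kv_heads * head_dim * dtype_bytes * 2  # K and V caches
print(memory_per_token)                            # 131072 bytes per token
print(128_000 * memory_per_token / 1_000_000_000)  # ~16.8 GB for a 128k-token context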
@@ -80,71 +23,53 @@ def extract_keys(json_obj, keys_to_extract):
          elif isinstance(obj, list):
              for item in obj:
                  recursive_search(item)
-
      recursive_search(json_obj)
      return extracted_values

+ def calculate_kv_cache_memory(context_size: int, model_id: str, dtype: str, token: str = None):
+     try:
+         file_path = hf_hub_download(repo_id=model_id, filename="config.json", token=token)
+         with open(file_path, 'r') as f:
+             config = json.load(f)
+
+         keys_to_find = {"num_hidden_layers", "num_key_value_heads", "hidden_size", "num_attention_heads"}
+         config = extract_keys(config, keys_to_find)

- def calculate_model_memory(parameters: float, bytes: float) -> float:
-     """Calculates the GPU memory required for serving a Large Language Model (LLM).
-     This function estimates the GPU memory needed using the formula:
-     M = (P * 4B) / (32 / Q) * 1.18
-     where:
-     - M is the GPU memory in Gigabytes
-     - P is the number of parameters in billions (e.g., 7 for a 7B model)
-     - 4B represents 4 bytes per parameter
-     - 32 represents bits in 4 bytes
-     - Q is the quantization bits (e.g., 16, 8, or 4 bits)
-     - 1.18 represents ~18% overhead for additional GPU memory requirements
-     Args:
-         parameters: Number of model parameters in billions
-         bytes: Number of bytes per parameter based on dtype
-     Returns:
-         Estimated GPU memory required in Gigabytes
-     Examples:
-         >>> calculate_gpu_memory(7, bytes_per_dtype["float16"])
-         13.72
-         >>> calculate_gpu_memory(13, bytes_per_dtype["int8"])
-         12.74
-     """
-     memory = round((parameters * 4) / (32 / (bytes * 8)) * 1.18, 2)
-     return memory
+         num_layers = config["num_hidden_layers"]
+         num_att_heads = config.get("num_key_value_heads", config["num_attention_heads"])
+         dim_att_head = config["hidden_size"] // config["num_attention_heads"]
+         dtype_bytes = bytes_per_dtype[dtype]
+
+         memory_per_token = num_layers * num_att_heads * dim_att_head * dtype_bytes * 2
+         context_size_memory_footprint_gb = (context_size * memory_per_token) / 1_000_000_000
+
+         return context_size_memory_footprint_gb
+     except Exception as e:
+         return f"Error: {str(e)}"

+ def calculate_model_memory(parameters: float, dtype: str) -> float:
+     bytes = bytes_per_dtype[dtype]
+     return round((parameters * 4) / (32 / (bytes * 8)) * 1.18, 2)

- def get_model_size(model_id: str, dtype: str = "float16") -> Union[float, None]:
-     """Get the estimated GPU memory requirement for a Hugging Face model.
-     Args:
-         model_id: Hugging Face model ID (e.g., "facebook/opt-350m")
-         dtype: Data type for model loading ("float16", "int8", etc.)
-     Returns:
-         Estimated GPU memory in GB, or None if estimation fails
-     Examples:
-         >>> get_model_size("facebook/opt-350m")
-         0.82
-         >>> get_model_size("meta-llama/Llama-2-7b-hf", dtype="int8")
-         6.86
-     """
+ def get_model_size(model_id: str, dtype: str, token: str = None) -> Union[float, str]:
      try:
-
-         metadata = get_safetensors_metadata(model_id)
+         metadata = get_safetensors_metadata(model_id, token=token)
          if not metadata or not metadata.parameter_count:
-             raise ValueError(f"Could not fetch metadata for model: {model_id}")
-
-         model_parameters = list(metadata.parameter_count.values())[0]
-         model_parameters = int(model_parameters) / 1_000_000_000 # Convert to billions
-         return calculate_model_memory(model_parameters, bytes_per_dtype[dtype])
-
+             return "Error: Could not fetch metadata."
+         model_parameters = int(list(metadata.parameter_count.values())[0]) / 1_000_000_000
+         return calculate_model_memory(model_parameters, dtype)
      except Exception as e:
-         print(f"Error estimating model size: {str(e)}", file=sys.stderr)
-         return None
-
+         return f"Error: {str(e)}"

- def estimate_vram(model_id, dtype, context_size):
+ def estimate_vram(model_id, dtype, context_size, hf_token):
+     if hf_token:
+         login(token=hf_token)
+
      if dtype not in bytes_per_dtype:
          return "Error: Unsupported dtype"

-     model_memory = get_model_size(model_id, dtype)
-     context_memory = calculate_kv_cache_memory(context_size, model_id, dtype)
+     model_memory = get_model_size(model_id, dtype, hf_token)
+     context_memory = calculate_kv_cache_memory(context_size, model_id, dtype, hf_token)

      if isinstance(model_memory, str) or isinstance(context_memory, str):
          return model_memory if isinstance(model_memory, str) else context_memory
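The weight-memory formula itself is unchanged by this commit; the removed docstring describes it as M = (P * 4B) / (32 / Q) * 1.18, which reduces to parameters-in-billions * bytes-per-parameter * 1.18 overhead. A quick arithmetic check with an illustrative parameter count (in practice the count comes from the safetensors metadata):

# Same arithmetic as calculate_model_memory, spelled out for one illustrative case.
parameters_b = 27   # a 27B-parameter model, roughly the gemma-3-27b class
dtype_bytes = 2     # float16
vram_gb = round((parameters_b * 4) / (32 / (dtype_bytes * 8)) * 1.18, 2)
print(vram_gb)      # 63.72, i.e. 27 * 2 bytes per parameter * 1.18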
@@ -157,11 +82,12 @@ iface = gr.Interface(
      inputs=[
          gr.Textbox(label="Hugging Face Model ID", value="google/gemma-3-27b-it"),
          gr.Dropdown(choices=list(bytes_per_dtype.keys()), label="Data Type", value="float16"),
-         gr.Number(label="Context Size", value=128000)
+         gr.Number(label="Context Size", value=128000),
+         gr.Textbox(label="Hugging Face Access Token", type="password", placeholder="Optional - Needed for gated models")
      ],
      outputs=gr.Textbox(label="Estimated VRAM Usage"),
      title="LLM GPU VRAM Calculator",
-     description="Estimate the VRAM requirements of a model and context size."
+     description="Estimate the VRAM requirements of a model and context size. Optionally provide a Hugging Face token for gated models."
  )

- iface.launch()
+ iface.launch()
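Because estimate_vram is a plain function, the new token-aware signature can also be exercised outside the Gradio UI. A minimal sketch, assuming network access to the Hub; when a token is supplied, estimate_vram calls login() and also forwards it to the hub helpers:

# Minimal usage sketch: call the estimator directly instead of through the Gradio UI.
# google/gemma-3-27b-it is gated, so without a token this returns an "Error: ..." string;
# pass hf_token="<your token>" (or use a non-gated model id) to get the estimate.
result = estimate_vram("google/gemma-3-27b-it", "float16", 128_000, hf_token=None)
print(result)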