ThatsGroes committed
Commit d16240b · 1 Parent(s): 34814ca

calculator

Files changed (1)
  1. app.py +167 -0
app.py ADDED
@@ -0,0 +1,167 @@
+ from typing import Dict, Union
+ from huggingface_hub import get_safetensors_metadata, hf_hub_download
+ import sys
+ import json
+ import gradio as gr
+
+ # Example:
+ # python get_gpu_memory.py Qwen/Qwen2.5-7B-Instruct
+
+ # Dictionary mapping dtype strings to their byte sizes
+ bytes_per_dtype: Dict[str, float] = {
+     "int4": 0.5,
+     "int8": 1,
+     "float8": 1,
+     "float16": 2,
+     "float32": 4,
+ }
+
+
+ def calculate_kv_cache_memory(context_size: int, model_id: str, dtype: str, filename: str = "config.json"):
+     """
+     Estimates the KV cache memory footprint in GB for a given context size.
+     Implements the formula suggested in https://medium.com/@tejaswi_kashyap/memory-optimization-in-llms-leveraging-kv-cache-quantization-for-efficient-inference-94bc3df5faef
+     """
+     try:
+         file_path = hf_hub_download(repo_id=model_id, filename=filename)
+
+         with open(file_path, 'r') as f:
+             config = json.load(f)
+
+         keys_to_find = {"num_hidden_layers", "num_key_value_heads", "hidden_size", "num_attention_heads"}
+
+         config = extract_keys(config, keys_to_find)
+
+         num_layers = config["num_hidden_layers"]
+
+         # Models using grouped-query attention cache fewer KV heads than attention heads
+         if "num_key_value_heads" in config:
+             num_att_heads = config["num_key_value_heads"]
+         else:
+             num_att_heads = config["num_attention_heads"]
+
+         dim_att_head = config["hidden_size"] // config["num_attention_heads"]
+
+         dtype_bytes = bytes_per_dtype[dtype]
+
+         # Keys and values are stored per layer, per KV head, per token (hence the factor of 2)
+         memory_per_token = num_layers * num_att_heads * dim_att_head * dtype_bytes * 2
+
+         context_size_memory_footprint_gb = (context_size * memory_per_token) / 1_000_000_000
+
+         return context_size_memory_footprint_gb
+
+     except Exception as e:
+         print(f"Error estimating context size: {str(e)}", file=sys.stderr)
+         return None
+
+
+ def extract_keys(json_obj, keys_to_extract):
+     """
+     Recursively searches for specific keys in a nested JSON object.
+
+     Args:
+         json_obj (dict or list): The JSON data (parsed as a dictionary or list).
+         keys_to_extract (set): A set of keys to extract values for.
+
+     Returns:
+         dict: A dictionary with found key-value pairs.
+     """
+     extracted_values = {}
+
+     def recursive_search(obj):
+         if isinstance(obj, dict):
+             for key, value in obj.items():
+                 if key in keys_to_extract:
+                     extracted_values[key] = value
+                 recursive_search(value)
+         elif isinstance(obj, list):
+             for item in obj:
+                 recursive_search(item)
+
+     recursive_search(json_obj)
+     return extracted_values
+
+
+ def calculate_model_memory(parameters: float, bytes_per_param: float) -> float:
+     """Calculates the GPU memory required for serving a Large Language Model (LLM).
+
+     This function estimates the GPU memory needed using the formula:
+         M = (P * 4B) / (32 / Q) * 1.18
+     where:
+     - M is the GPU memory in Gigabytes
+     - P is the number of parameters in billions (e.g., 7 for a 7B model)
+     - 4B represents 4 bytes per parameter
+     - 32 represents the number of bits in 4 bytes
+     - Q is the quantization bits (e.g., 16, 8, or 4 bits)
+     - 1.18 represents ~18% overhead for additional GPU memory requirements
+
+     Args:
+         parameters: Number of model parameters in billions
+         bytes_per_param: Number of bytes per parameter based on dtype
+
+     Returns:
+         Estimated GPU memory required in Gigabytes
+
+     Examples:
+         >>> calculate_model_memory(7, bytes_per_dtype["float16"])
+         16.52
+         >>> calculate_model_memory(13, bytes_per_dtype["int8"])
+         15.34
+     """
+     memory = round((parameters * 4) / (32 / (bytes_per_param * 8)) * 1.18, 2)
+     return memory
+
+
+ def get_model_size(model_id: str, dtype: str = "float16") -> Union[float, None]:
+     """Get the estimated GPU memory requirement for a Hugging Face model.
+
+     Args:
+         model_id: Hugging Face model ID (e.g., "facebook/opt-350m")
+         dtype: Data type for model loading ("float16", "int8", etc.)
+
+     Returns:
+         Estimated GPU memory in GB, or None if estimation fails
+
+     Examples:
+         >>> get_model_size("facebook/opt-350m")
+         0.82
+         >>> get_model_size("meta-llama/Llama-2-7b-hf", dtype="int8")
+         6.86
+     """
+     try:
+         metadata = get_safetensors_metadata(model_id)
+         if not metadata or not metadata.parameter_count:
+             raise ValueError(f"Could not fetch metadata for model: {model_id}")
+
+         # parameter_count maps dtype -> count; sum across entries to cover mixed-dtype checkpoints
+         model_parameters = sum(metadata.parameter_count.values())
+         model_parameters = int(model_parameters) / 1_000_000_000  # Convert to billions
+         return calculate_model_memory(model_parameters, bytes_per_dtype[dtype])
+
+     except Exception as e:
+         print(f"Error estimating model size: {str(e)}", file=sys.stderr)
+         return None
+
+
+ def estimate_vram(model_id, dtype, context_size):
+     if dtype not in bytes_per_dtype:
+         return "Error: Unsupported dtype"
+
+     model_memory = get_model_size(model_id, dtype)
+     context_memory = calculate_kv_cache_memory(context_size, model_id, dtype)
+
+     # Both helpers return None on failure, so surface an error message instead of crashing
+     if model_memory is None or context_memory is None:
+         return "Error: Could not estimate VRAM for this model. Check the model ID and try again."
+
+     total_memory = model_memory + context_memory
+     return f"Model VRAM: {model_memory:.2f} GB\nContext VRAM: {context_memory:.2f} GB\nTotal VRAM: {total_memory:.2f} GB"
+
+ iface = gr.Interface(
+     fn=estimate_vram,
+     inputs=[
+         gr.Textbox(label="Hugging Face Model ID", value="google/gemma-3-27b-it"),
+         gr.Dropdown(choices=list(bytes_per_dtype.keys()), label="Data Type", value="float16"),
+         gr.Number(label="Context Size", value=128000),
+     ],
+     outputs=gr.Textbox(label="Estimated VRAM Usage"),
+     title="LLM GPU VRAM Calculator",
+     description="Estimate the VRAM requirements of a model and context size.",
+ )
+
+ iface.launch()
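
As a quick sanity check of the two estimates the app reports (weight memory via M = (P * 4) / (32 / Q) * 1.18, and KV cache memory per token), here is a minimal standalone sketch with hand-picked numbers. The 7B parameter count and the architecture values below (32 layers, 32 attention heads, 8 KV heads, hidden size 4096) are illustrative assumptions, not values fetched from the Hub.

```python
# Minimal sketch: the same two estimates the app computes, worked out by hand
# for a hypothetical 7B model. All config values below are assumptions.

bytes_per_dtype = {"int4": 0.5, "int8": 1, "float8": 1, "float16": 2, "float32": 4}

params_billions = 7          # assumed model size
dtype = "float16"
context_size = 128_000

# Assumed architecture (grouped-query attention, so fewer KV heads than attention heads)
num_hidden_layers = 32
num_attention_heads = 32
num_key_value_heads = 8
hidden_size = 4096

dtype_bytes = bytes_per_dtype[dtype]

# Weights: M = (P * 4) / (32 / Q) * 1.18, with Q = bits per parameter
weights_gb = round((params_billions * 4) / (32 / (dtype_bytes * 8)) * 1.18, 2)

# KV cache: 2 tensors (K and V) per layer, per KV head, per token
head_dim = hidden_size // num_attention_heads
memory_per_token = num_hidden_layers * num_key_value_heads * head_dim * dtype_bytes * 2
kv_cache_gb = (context_size * memory_per_token) / 1_000_000_000

print(f"Model VRAM: {weights_gb:.2f} GB")      # 16.52 GB
print(f"Context VRAM: {kv_cache_gb:.2f} GB")   # ~16.78 GB
print(f"Total VRAM: {weights_gb + kv_cache_gb:.2f} GB")
```

Running this prints roughly 16.52 GB for the weights and about 16.78 GB for a 128k-token KV cache; the Gradio app performs the same arithmetic after pulling the real parameter count from the safetensors metadata and the architecture values from config.json.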