Spaces:
Runtime error
Runtime error
| import os | |
| import re | |
| import webbrowser | |
| import pandas as pd | |
| import gradio as gr | |
| from huggingface_hub import HfApi | |
| from huggingface_hub.utils import RepositoryNotFoundError, GatedRepoError | |
| from accelerate.commands.estimate import create_empty_model, check_has_model | |
| from accelerate.utils import convert_bytes, calculate_maximum_sizes | |
# Shared state for the button callbacks: gradio gives us no way to pass these
# values into the button handler directly, so they live at module level.
HAS_DISCUSSION = True
MODEL_NAME = LIBRARY = USER_TOKEN = None
# Hub token read from the environment; stays None when the variable is unset.
TOKEN = os.environ.get("HUGGINGFACE_API_LOGIN")
def check_for_discussion(model_name: str):
    """Check whether `model-sizer-bot` already opened its automated discussion.

    Args:
        model_name: Repo id of the model on the Hub.

    Returns:
        True when a discussion titled "[AUTOMATED] Model Memory Requirements"
        authored by `model-sizer-bot` exists on the repo, False otherwise.
    """
    api = HfApi(token=TOKEN)
    # Iterate lazily so `any` can stop at the first match instead of first
    # materialising every discussion of the repo.
    return any(
        discussion.title == "[AUTOMATED] Model Memory Requirements"
        and discussion.author == "model-sizer-bot"
        for discussion in api.get_repo_discussions(model_name)
    )
def report_results():
    """Post the memory estimate to the model's discussion page and open it.

    Re-runs `calculate_memory` for the model/library stored in the module
    globals, creates a discussion on the model repo containing the results
    table, and opens the new discussion in a browser tab.
    """
    global USER_TOKEN
    api = HfApi(token=TOKEN)
    # `calculate_memory` requires an optimizer; the post talks about Adam, so
    # use the same AdamW default as the UI dropdown.
    results, data = calculate_memory(
        MODEL_NAME,
        LIBRARY,
        ["fp32", "fp16", "int8", "int4"],
        optimizer="adamw_hf",
        access_token=USER_TOKEN,
        raw=True,
    )
    # First row is the first requested dtype (fp32). Rows are dicts keyed by
    # column name, so they must be read by key rather than by position.
    minimum = data[0]
    # Drop the user's token as soon as it is no longer needed.
    USER_TOKEN = None
    post = f"""# Model Memory Requirements\n
You will need about {minimum['inference_total']} VRAM to load this model for inference, and {minimum['training_total']} VRAM to train it using Adam.
These calculations were measured from the [Model Memory Utility Space](https://hf.co/spaces/hf-accelerate/model-memory-utility) on the Hub.
The minimum recommended vRAM needed for this model assumes using [Accelerate or `device_map="auto"`](https://huggingface.co/docs/accelerate/usage_guides/big_modeling) and is denoted by the size of the "largest layer".
When performing inference, expect to add up to an additional 20% to this, as found by [EleutherAI](https://blog.eleuther.ai/transformer-math/). More tests will be performed in the future to get a more accurate benchmark for each model.
When training with `Adam`, you can expect roughly 4x the reported results to be used. (1x for the model, 1x for the gradients, and 2x for the optimizer).
## Results:
{results}
"""
    discussion = api.create_discussion(MODEL_NAME, "[AUTOMATED] Model Memory Requirements", description=post)
    webbrowser.open_new_tab(discussion.url)
def convert_url_to_name(url: str) -> str:
    """Convert a model URL to its name (repo id) on the Hub.

    Everything after ``huggingface.co/`` up to an optional query string or
    ``#fragment`` is taken as the name; surrounding slashes are stripped.

    Args:
        url: A URL pointing at a model page on huggingface.co.

    Returns:
        The path portion of the URL, e.g. ``"org/model"``.

    Raises:
        ValueError: If the URL does not point at huggingface.co or carries no
            model path.
    """
    # The fragment is optional: plain model URLs have no `#...` part.
    match = re.search(r"huggingface\.co/([^?#]*)", url)
    name = match.group(1).strip("/") if match else ""
    if not name:
        raise ValueError(f"URL {url} is not a valid model URL to the Hugging Face Hub")
    return name
| # Based on the following doc: | |
| # | |
| # - https://huggingface.co/docs/transformers/v4.31.0/perf_train_gpu_one#anatomy-of-models-memory | |
| # - https://blog.eleuther.ai/transformer-math/ | |
| # - https://kipp.ly/transformer-inference-arithmetic/ | |
| # - https://github.com/ray-project/llm-numbers | |
| # | |
def calc_vram_f32(model, optimizer, sequence_len, micro_batch_size, device_count, gradient_checkpointing):
    """Estimate the training VRAM footprint of `model`, in bytes, assuming fp32 weights.

    Only the fp32 case is computed here; callers rescale the model weights for
    other dtypes.

    Args:
        model: Object exposing `parameters()`, `buffers()` and a `config` with
            `hidden_size`, `num_hidden_layers` and `num_attention_heads`.
        optimizer: Optimizer name (e.g. "adamw_hf", "sgd", "adamw_8bit").
            Matching is case-insensitive; `None` or an unrecognised name is
            treated as AdamW.
        sequence_len: Sequence length used for the activation estimate.
        micro_batch_size: Per-device batch size used for the activation estimate.
        device_count: Number of devices the activations are split over; values
            below 1 (or `None`) are treated as 1.
        gradient_checkpointing: When true, the attention-score term is left out
            of the activation estimate.

    Returns:
        dict with the byte counts `model`, `buffer`, `optimizer`, `gradient`
        and `activation`, the `param_element_size` used (4), and `total`,
        which is model + buffer + optimizer + activation (gradients are
        reported separately and are not part of `total`).
    """
    # Count the parameters once; weights, optimizer state and gradients all
    # scale with this single number.
    num_params = sum(param.nelement() for param in model.parameters())

    # Model weights: 4 bytes per parameter for fp32.
    param_element_size = 4
    vram_model = num_params * param_element_size

    # Buffers: tensors that are not registered as parameters (e.g. batch-norm
    # statistics); counted at their own element size.
    vram_buffer = sum(buf.nelement() * buf.element_size() for buf in model.buffers())

    # Optimizer state per parameter:
    #   8 bytes for AdamW (two states), 2 bytes for 8-bit optimizers,
    #   4 bytes for SGD with momentum (one state).
    optimizer_name = (optimizer or "").lower()
    if "sgd" in optimizer_name:
        optimizer_state_size_per_param = 4
    elif "8bit" in optimizer_name:
        optimizer_state_size_per_param = 2
    else:
        optimizer_state_size_per_param = 8
    vram_optimizer = num_params * optimizer_state_size_per_param

    # Gradients: 4 bytes per parameter; mixed precision is not considered here.
    gradient_element_size = 4
    vram_gradient = num_params * gradient_element_size

    # Forward activations, driven by sequence length, batch size, hidden size,
    # layer count, device count and attention heads.
    s = sequence_len
    b = micro_batch_size
    h = model.config.hidden_size
    L = model.config.num_hidden_layers
    # Guard the divisions below against a device count of 0 or None.
    t = max(1, device_count or 1)
    a = model.config.num_attention_heads
    print(f's={s} b={b} h={h} L={L} t={t} a={a}')
    sbHL = s * b * h * L
    print(f'sbHL = {sbHL / 1e9} GB')
    print(f'10 + {24 / t} + {5 * a * s / (h * t)}')
    if gradient_checkpointing:
        vram_activation = sbHL * (10 + 24 / t)
    else:
        vram_activation = sbHL * (10 + 24 / t + 5 * a * s / (h * t))

    return {
        'param_element_size': param_element_size,
        'total': vram_model + vram_buffer + vram_optimizer + vram_activation,
        'model': vram_model,
        'buffer': vram_buffer,
        'optimizer': vram_optimizer,
        'gradient': vram_gradient,
        'activation': vram_activation,
    }
def bytes_by_dtype(bytes, dtype):
    """Scale an fp32 byte count down to the given dtype.

    Half-precision names divide by 2, "int8" by 4 and "int4" by 8; any other
    dtype returns the input unchanged.
    """
    shrink_table = (
        (("fp16", "bf16", "float16/bfloat16"), 2),
        (("int8",), 4),
        (("int4",), 8),
    )
    for names, divisor in shrink_table:
        if dtype in names:
            return bytes / divisor
    # Unrecognised dtypes (e.g. fp32) keep the full-size value as-is.
    return bytes
def calculate_memory(model_name: str, library: str, dtypes: list, optimizer: str = "adamw_hf", access_token: str = None, raw=False):
    """Calculate the memory usage for a model.

    Args:
        model_name: Model repo id, or a URL to the model on the Hugging Face Hub.
        library: Library used to load the model; "auto" lets it be detected.
        dtypes: Precisions to report, one result row per entry.
        optimizer: Optimizer name used for the optimizer-state estimate.
        access_token: Optional Hub token, needed for gated models.
        raw: When true, return `(markdown_table, rows)` instead of gradio updates.

    Returns:
        `[title_markdown, dataframe_update]` for the UI, or the raw pair
        described above when `raw` is true.

    Raises:
        gr.Error: If the URL is invalid or the model cannot be found, accessed
            or loaded.

    Side effects:
        Updates the module globals `HAS_DISCUSSION`, `MODEL_NAME` and `LIBRARY`.
    """
    global HAS_DISCUSSION, MODEL_NAME, LIBRARY

    if library == "auto":
        library = None
    if "http" in model_name and "//" in model_name:
        try:
            model_name = convert_url_to_name(model_name)
        except ValueError:
            raise gr.Error(f"URL `{model_name}` is not a valid model URL to the Hugging Face Hub")
    try:
        model = create_empty_model(model_name, library_name=library, trust_remote_code=True, access_token=access_token)
    except GatedRepoError:
        raise gr.Error(f"Model `{model_name}` is a gated model, please ensure to pass in your access token and try again if you have access. You can find your access token here : https://huggingface.co/settings/tokens. ")
    except RepositoryNotFoundError:
        raise gr.Error(f"Model `{model_name}` was not found on the Hub, please try another model name.")
    except ValueError:
        raise gr.Error(f"Model `{model_name}` does not have any library metadata on the Hub, please manually select a library_name to use (such as `transformers`)")
    except (RuntimeError, OSError) as e:
        library = check_has_model(e)
        if library != "unknown":
            raise gr.Error(f"Tried to load `{model_name}` with `{library}` but a possible model to load was not found inside the repo.")
        # Without this, execution would continue with `model` unbound and
        # crash with an UnboundLocalError further down.
        raise gr.Error(f"Model `{model_name}` had an error, please open a discussion on the model's page with the error message and name: `{e}`")

    title = f"Memory Usage for '{model_name}'"
    vram_f32 = calc_vram_f32(model, optimizer=optimizer, sequence_len=2048, micro_batch_size=1, device_count=1, gradient_checkpointing=True)

    # Only the model weights are rescaled per dtype; buffer, optimizer and
    # activation figures are the same for every row.
    vram_buffer = vram_f32['buffer']
    vram_optimizer = vram_f32['optimizer']
    vram_activation = vram_f32['activation']

    data = []
    for dtype in dtypes:
        vram_model = bytes_by_dtype(vram_f32['model'], dtype)
        data.append({
            "dtype": dtype,
            'inference_total': convert_bytes(vram_model),
            'training_total': convert_bytes(vram_model + vram_buffer + vram_optimizer + vram_activation),
            'model': convert_bytes(vram_model),
            'buffer': convert_bytes(vram_buffer),
            'optimizer': convert_bytes(vram_optimizer),
            'activation': convert_bytes(vram_activation),
        })

    HAS_DISCUSSION = check_for_discussion(model_name)
    MODEL_NAME = model_name
    LIBRARY = library

    if raw:
        return pd.DataFrame(data).to_markdown(index=False), data

    return [
        f'## {title}',
        gr.update(visible=True, value=pd.DataFrame(data)),
    ]
# Static content for the UI, kept apart from the layout code below.
_INTRO_MARKDOWN = """<img src="https://huggingface.co/spaces/hf-accelerate/model-memory-usage/resolve/main/measure_model_size.png" style="float: left;" width="250" height="250"><h1>🤗 Model Memory Calculator</h1>
This tool is modified from https://huggingface.co/spaces/hf-accelerate/model-memory-usage with the following changes:
- Focus on transformers and gives more detailed estimation based on more configs
- Will auto-calculate the proper batch size given a VRAM constraint later
- LoRA/QLoRA etc. will be supported later
Note:
- inference_total = model
- training_total = model + buffer + optimizer + activation
"""
# Column order of the results table; matches the keys of the rows built by
# `calculate_memory`.
_RESULT_HEADERS = [
    "dtype",
    'inference_total',
    'training_total',
    'model',
    'buffer',
    'optimizer',
    'activation',
]
_PRECISION_CHOICES = ["float32", "float16/bfloat16", "int8", "int4"]
# Subset of optimizer names offered in the dropdown; custom names are allowed.
_OPTIMIZER_CHOICES = [
    "adamw_hf",
    "adamw_torch",
    "sgd",
    "lion_32bit",
    "adamw_8bit",
    "lion_8bit",
    "paged_adamw_8bit",
    "paged_lion_8bit",
]

with gr.Blocks() as demo:
    with gr.Column():
        gr.Markdown(_INTRO_MARKDOWN)
        out_text = gr.Markdown()
        # Hidden until the first calculation fills it in.
        out = gr.DataFrame(
            headers=_RESULT_HEADERS,
            interactive=False,
            visible=False,
        )
        with gr.Row():
            inp = gr.Textbox(label="Model Name or URL", value="bert-base-cased")
        with gr.Row():
            library = gr.Radio(["transformers"], label="Library", value="transformers")
            dtypes = gr.CheckboxGroup(
                _PRECISION_CHOICES,
                value=list(_PRECISION_CHOICES),
                label="Model Precision",
            )
            optimizer = gr.Dropdown(
                choices=_OPTIMIZER_CHOICES,
                value="adamw_hf",
                label="Optimizer",
                allow_custom_value=True,
            )
            access_token = gr.Textbox(label="API Token", placeholder="Optional (for gated models)")
        with gr.Row():
            btn = gr.Button("Calculate Memory Usage")
            # post_to_hub = gr.Button(value = "Report results in this model repo's discussions!\n(Will open in a new tab)", visible=False)

    # NOTE(review): this binds the Textbox component itself, not the token text
    # the user typed - confirm before re-enabling the "post to hub" button.
    USER_TOKEN = access_token

    btn.click(
        calculate_memory,
        inputs=[inp, library, dtypes, optimizer, access_token],
        outputs=[out_text, out],
    )
    # post_to_hub.click(report_results).then(lambda: gr.Button.update(visible=False), outputs=post_to_hub)

demo.launch()  # (share=True, inline=False, debug=True)