import io
import re
import struct
from enum import IntEnum
from math import floor

import gradio as gr
import requests


class GGUFValueType(IntEnum):
    UINT8 = 0
    INT8 = 1
    UINT16 = 2
    INT16 = 3
    UINT32 = 4
    INT32 = 5
    FLOAT32 = 6
    BOOL = 7
    STRING = 8
    ARRAY = 9
    UINT64 = 10
    INT64 = 11
    FLOAT64 = 12


# struct format strings (little-endian) for the fixed-size scalar types
_simple_value_packing = {
    GGUFValueType.UINT8: "<B",
    GGUFValueType.INT8: "<b",
    GGUFValueType.UINT16: "<H",
    GGUFValueType.INT16: "<h",
    GGUFValueType.UINT32: "<I",
    GGUFValueType.INT32: "<i",
    GGUFValueType.FLOAT32: "<f",
    GGUFValueType.UINT64: "<Q",
    GGUFValueType.INT64: "<q",
    GGUFValueType.FLOAT64: "<d",
    GGUFValueType.BOOL: "?",
}

# size in bytes of each fixed-size scalar type
value_type_info = {
    GGUFValueType.UINT8: 1,
    GGUFValueType.INT8: 1,
    GGUFValueType.UINT16: 2,
    GGUFValueType.INT16: 2,
    GGUFValueType.UINT32: 4,
    GGUFValueType.INT32: 4,
    GGUFValueType.FLOAT32: 4,
    GGUFValueType.UINT64: 8,
    GGUFValueType.INT64: 8,
    GGUFValueType.FLOAT64: 8,
    GGUFValueType.BOOL: 1,
}
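
# Illustrative sanity check of the two tables above (values assumed, not from a
# real file): a UINT32 occupies value_type_info[GGUFValueType.UINT32] == 4 bytes
# and is decoded with the "<I" format string, so:
#   struct.unpack("<I", b"\x2a\x00\x00\x00")[0]  # -> 42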


def get_single(value_type, file):
    """Read a single GGUF value of the given type from a binary file object."""
    if value_type == GGUFValueType.STRING:
        value_length = struct.unpack("<Q", file.read(8))[0]
        value = file.read(value_length)
        try:
            value = value.decode('utf-8')
        except UnicodeDecodeError:
            pass  # keep the raw bytes if the string is not valid UTF-8
    else:
        type_str = _simple_value_packing.get(value_type)
        bytes_length = value_type_info.get(value_type)
        value = struct.unpack(type_str, file.read(bytes_length))[0]

    return value
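
# Sketch of the wire format (hypothetical buffer, not from a real file): a GGUF
# string is a uint64 length followed by UTF-8 bytes, so this decodes to "hi":
#   get_single(GGUFValueType.STRING, io.BytesIO(struct.pack("<Q", 2) + b"hi"))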


def load_metadata_from_file(file_obj):
    """Load GGUF metadata from a binary file-like object."""
    metadata = {}

    GGUF_MAGIC = struct.unpack("<I", file_obj.read(4))[0]
    GGUF_VERSION = struct.unpack("<I", file_obj.read(4))[0]
    ti_data_count = struct.unpack("<Q", file_obj.read(8))[0]  # tensor count (unused here)
    kv_data_count = struct.unpack("<Q", file_obj.read(8))[0]

    if GGUF_MAGIC != 0x46554747:  # b"GGUF" read as a little-endian uint32
        raise Exception('Not a valid GGUF file (magic number mismatch).')

    if GGUF_VERSION == 1:
        raise Exception('You are using an outdated GGUF, please download a new one.')

    for _ in range(kv_data_count):
        key_length = struct.unpack("<Q", file_obj.read(8))[0]
        key = file_obj.read(key_length)

        value_type = GGUFValueType(struct.unpack("<I", file_obj.read(4))[0])
        if value_type == GGUFValueType.ARRAY:
            ltype = GGUFValueType(struct.unpack("<I", file_obj.read(4))[0])
            length = struct.unpack("<Q", file_obj.read(8))[0]
            arr = [get_single(ltype, file_obj) for _ in range(length)]
            metadata[key.decode()] = arr
        else:
            value = get_single(value_type, file_obj)
            metadata[key.decode()] = value

    # Extract specific fields needed for the VRAM calculation
    extracted_fields = {}
    for key, value in metadata.items():
        if key.endswith('.block_count'):
            extracted_fields['n_layers'] = value
        elif key.endswith('.attention.head_count_kv'):
            extracted_fields['n_kv_heads'] = max(value) if isinstance(value, list) else value
        elif key.endswith('.embedding_length'):
            extracted_fields['embedding_dim'] = value
        elif key.endswith('.context_length'):
            extracted_fields['context_length'] = value
        elif key.endswith('.feed_forward_length'):
            extracted_fields['feed_forward_dim'] = value

    # Add the extracted fields to the metadata for easy access
    metadata.update(extracted_fields)
    return metadata
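
# Usage sketch (hypothetical local file; the app itself feeds this function a
# partially downloaded BytesIO instead):
#   with open("model.gguf", "rb") as f:
#       meta = load_metadata_from_file(f)
#   print(meta.get("n_layers"), meta.get("context_length"))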


def download_gguf_partial(url, max_bytes=25 * 1024 * 1024):
    """Download the first max_bytes of a GGUF file from a URL."""
    try:
        # Request only the first max_bytes via an HTTP Range header
        headers = {'Range': f'bytes=0-{max_bytes - 1}'}
        response = requests.get(url, headers=headers, stream=True, timeout=30)
        response.raise_for_status()

        # Wrap the bytes in a BytesIO for a file-like interface
        return io.BytesIO(response.content)
    except Exception as e:
        raise Exception(f"Failed to download GGUF file: {str(e)}")


def load_metadata(model_url, current_metadata):
    """Load metadata from a model URL; returns (metadata, slider update, status)."""
    if not model_url or model_url.strip() == "":
        return {}, gr.update(), "Please enter a model URL"

    try:
        # Get the model size first (sums all parts for multi-part files)
        model_size_mb = get_model_size_mb_from_url(model_url)

        # Normalize the URL and download the first 25 MiB of the file
        normalized_url = normalize_huggingface_url(model_url)
        file_obj = download_gguf_partial(normalized_url)

        # Parse the metadata
        metadata = load_metadata_from_file(file_obj)

        # Extract the filename from the URL, dropping any query parameters
        gguf_filename = model_url.split('/')[-1].split('?')[0]

        # Extract "user/model" from Hugging Face URLs like https://huggingface.co/user/model
        model_name = model_url
        if "huggingface.co/" in model_url:
            try:
                parts = model_url.split("huggingface.co/")[1].split("/")
                if len(parts) >= 2:
                    model_name = f"{parts[0]}/{parts[1]}"
            except Exception:
                model_name = model_url

        # Add the URL, model name, and size to the metadata
        metadata['url'] = model_url
        metadata['model_name'] = model_name
        metadata['model_size_mb'] = model_size_mb
        metadata['loaded'] = True

        return metadata, gr.update(value=metadata["n_layers"], maximum=metadata["n_layers"]), f"Metadata loaded successfully for: {gguf_filename}"
    except Exception as e:
        error_msg = f"Error loading metadata: {str(e)}"
        return {}, gr.update(), error_msg


def normalize_huggingface_url(url: str) -> str:
    """Normalize a Hugging Face URL to the resolve format for direct access."""
    if 'huggingface.co' not in url:
        return url

    # Remove query parameters first
    base_url = url.split('?')[0]

    # Convert a blob URL to a resolve URL
    if '/blob/' in base_url:
        base_url = base_url.replace('/blob/', '/resolve/')

    return base_url
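
# Example of the rewrite this performs (hypothetical repo and filename):
#   normalize_huggingface_url("https://huggingface.co/user/model/blob/main/m.gguf")
#   -> "https://huggingface.co/user/model/resolve/main/m.gguf"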


def get_model_size_mb_from_url(model_url: str) -> float:
    """Get the model size in MB from a URL without downloading, handling multi-part files."""
    try:
        # Normalize the URL for direct access
        normalized_url = normalize_huggingface_url(model_url)

        # Get the size of the main file from its Content-Length header
        response = requests.head(normalized_url, allow_redirects=True, timeout=30)
        response.raise_for_status()
        main_file_size = int(response.headers.get('content-length', 0))

        # Extract the filename from the normalized URL
        filename = normalized_url.split('/')[-1]

        # Check for the multi-part pattern (e.g., model-00001-of-00002.gguf)
        match = re.match(r'(.+)-(\d+)-of-(\d+)\.gguf$', filename)
        if match:
            base_pattern = match.group(1)
            total_parts = int(match.group(3))
            total_size = 0
            base_url = '/'.join(normalized_url.split('/')[:-1]) + '/'

            # Sum the sizes of all parts
            for part_num in range(1, total_parts + 1):
                part_filename = f"{base_pattern}-{part_num:05d}-of-{total_parts:05d}.gguf"
                part_url = base_url + part_filename
                try:
                    part_response = requests.head(part_url, allow_redirects=True, timeout=30)
                    part_response.raise_for_status()
                    total_size += int(part_response.headers.get('content-length', 0))
                except requests.RequestException as e:
                    print(f"Warning: Could not get size of {part_filename} ({e}), estimating...")
                    if total_size > 0:
                        # Estimate the remaining parts from the average size so far
                        avg_size = total_size / (part_num - 1)
                        remaining_parts = total_parts - (part_num - 1)
                        total_size += avg_size * remaining_parts
                    else:
                        # Fall back to main file size * total parts
                        total_size = main_file_size * total_parts
                    break

            return total_size / (1024 ** 2)
        else:
            # Single-part file
            return main_file_size / (1024 ** 2)
    except Exception as e:
        print(f"Error getting model size: {e}")
        return 0.0
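
# Multi-part checkpoints follow the "<name>-00001-of-00002.gguf" convention, so
# for a two-part file like the placeholder URL in the UI below, this function
# issues one HEAD request per part and sums the Content-Length values:
#   get_model_size_mb_from_url(".../Qwen3-235B-A22B-UD-Q2_K_XL-00001-of-00002.gguf")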


def estimate_vram(metadata, gpu_layers, ctx_size, cache_type):
    """Calculate VRAM usage using the fitted formula."""
    try:
        # Extract the required values from the metadata
        n_layers = metadata.get('n_layers')
        n_kv_heads = metadata.get('n_kv_heads')
        embedding_dim = metadata.get('embedding_dim')
        context_length = metadata.get('context_length')
        feed_forward_dim = metadata.get('feed_forward_dim')
        size_in_mb = metadata.get('model_size_mb', 0)

        # Check that all required fields are present
        required_fields = [n_layers, n_kv_heads, embedding_dim, context_length, feed_forward_dim]
        if any(field is None for field in required_fields):
            missing = [name for name, field in zip(
                ['n_layers', 'n_kv_heads', 'embedding_dim', 'context_length', 'feed_forward_dim'],
                required_fields) if field is None]
            raise ValueError(f"Missing required metadata fields: {missing}")

        # Ensure gpu_layers doesn't exceed the total number of layers
        if gpu_layers > n_layers:
            gpu_layers = n_layers

        # Convert cache_type to the number of bits per cache element
        if cache_type == 'q4_0':
            cache_type = 4
        elif cache_type == 'q8_0':
            cache_type = 8
        else:
            cache_type = 16

        # Derived features
        size_per_layer = size_in_mb / max(n_layers, 1e-6)
        kv_cache_factor = n_kv_heads * cache_type * ctx_size
        embedding_per_context = embedding_dim / ctx_size

        # Calculate VRAM using the fitted model
        # Details: https://oobabooga.github.io/blog/posts/gguf-vram-formula/
        vram = (
            (size_per_layer - 17.99552795246051 + 3.148552680382576e-05 * kv_cache_factor)
            * (gpu_layers + max(0.9690636483914102, cache_type - (floor(50.77817218646521 * embedding_per_context) + 9.987899908205632)))
            + 1516.522943869404
        )

        return vram
    except Exception as e:
        print(f"Error in VRAM calculation: {e}")
        raise
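
# Worked example with assumed (not real) values: n_layers=32, n_kv_heads=8,
# embedding_dim=4096, model_size_mb=4096, ctx_size=8192, fp16 cache (16 bits),
# and all 32 layers offloaded:
#   size_per_layer        = 4096 / 32     = 128
#   kv_cache_factor       = 8 * 16 * 8192 = 1048576
#   embedding_per_context = 4096 / 8192   = 0.5
# which the fitted expression maps to roughly 6230 MiB of expected VRAM.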


def estimate_vram_wrapper(model_metadata, gpu_layers, ctx_size, cache_type):
    """Wrapper that formats the VRAM estimate as HTML for the UI."""
    if not model_metadata or 'model_name' not in model_metadata:
        return "<div id=\"vram-info\">Estimated VRAM to load the model:</div>"

    # Use cache_type directly (it's already a string from the radio button)
    try:
        result = estimate_vram(model_metadata, gpu_layers, ctx_size, cache_type)
        conservative = result + 577  # fixed safety margin for the 95% estimate
        return f"""<div id="vram-info">
<div>Expected VRAM usage: <span class="value">{result:.0f} MiB</span></div>
<div>Safe estimate: <span class="value">{conservative:.0f} MiB</span> - 95% chance the VRAM is at most this.</div>
</div>"""
    except Exception as e:
        return f"<div id=\"vram-info\">Estimated VRAM to load the model: <span class=\"value\">Error: {str(e)}</span></div>"


def create_ui():
    """Create the simplified UI"""
    # Custom CSS to limit the max width and center the content
    css = """
    body {
        max-width: 810px !important;
        margin: 0 auto !important;
    }
    #vram-info {
        padding: 10px;
        border-radius: 4px;
        background-color: var(--background-fill-secondary);
    }
    #vram-info .value {
        font-weight: bold;
        color: var(--primary-500);
    }
    """

    with gr.Blocks(css=css) as demo:
        # State to hold the model metadata
        model_metadata = gr.State(value={})

        gr.Markdown("# Accurate GGUF VRAM Calculator\n\nCalculate the VRAM needed to load a GGUF model from its GPU layers and context length using an accurate formula.\n\nFor an explanation of how this works, see this blog post: https://oobabooga.github.io/blog/posts/gguf-vram-formula/")

        with gr.Row():
            with gr.Column():
                # Model URL input
                model_url = gr.Textbox(
                    label="GGUF Model URL",
                    placeholder="https://huggingface.co/unsloth/Qwen3-235B-A22B-GGUF/blob/main/UD-Q2_K_XL/Qwen3-235B-A22B-UD-Q2_K_XL-00001-of-00002.gguf",
                    value=""
                )

                # Load metadata button
                load_metadata_btn = gr.Button("Load metadata", elem_classes='refresh-button')

                # GPU layers slider
                gpu_layers = gr.Slider(
                    label="GPU Layers",
                    minimum=0,
                    maximum=256,
                    value=256,
                    info='`--gpu-layers` in llama.cpp.'
                )

                # Context size slider
                ctx_size = gr.Slider(
                    label='Context Length',
                    minimum=512,
                    maximum=131072,
                    step=256,
                    value=8192,
                    info='`--ctx-size` in llama.cpp.'
                )

                # Cache type radio buttons
                cache_type = gr.Radio(
                    choices=['fp16', 'q8_0', 'q4_0'],
                    value='fp16',
                    label="Cache Type",
                    info='Cache quantization.'
                )

                # VRAM info display
                vram_info = gr.HTML(
                    value="<div id=\"vram-info\">Estimated VRAM to load the model:</div>"
                )

                # Status display
                status = gr.Textbox(
                    label="Status",
                    value="No model loaded",
                    interactive=False
                )

        # Event handlers
        load_metadata_btn.click(
            load_metadata,
            inputs=[model_url, model_metadata],
            outputs=[model_metadata, gpu_layers, status],
            show_progress=True
        ).then(
            estimate_vram_wrapper,
            inputs=[model_metadata, gpu_layers, ctx_size, cache_type],
            outputs=[vram_info],
            show_progress=False
        )

        # Update the VRAM estimate when any parameter changes
        for component in [gpu_layers, ctx_size, cache_type]:
            component.change(
                estimate_vram_wrapper,
                inputs=[model_metadata, gpu_layers, ctx_size, cache_type],
                outputs=[vram_info],
                show_progress=False
            )

        # Also update when the model_metadata state changes
        model_metadata.change(
            estimate_vram_wrapper,
            inputs=[model_metadata, gpu_layers, ctx_size, cache_type],
            outputs=[vram_info],
            show_progress=False
        )

    return demo


if __name__ == "__main__":
    # Create and launch the app
    demo = create_ui()
    demo.launch()