# Vis_Diff / app.py
import io
import gc
import os
import json
import struct
import torch
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import gradio as gr
import PIL.Image
from transformers import AutoModelForCausalLM, AutoConfig
from huggingface_hub import hf_hub_download, hf_hub_url, snapshot_download
from huggingface_hub.utils import build_hf_headers
from safetensors import safe_open
import requests
# Set style for matplotlib
sns.set_theme(style="whitegrid")
# Cache for metadata only
_metadata_cache = {}
def calculate_weight_diff(base_weight, chat_weight):
"""Calculates the mean absolute difference between two tensors."""
b_w = base_weight.float()
c_w = chat_weight.float()
result = torch.abs(b_w - c_w).mean().item()
del b_w, c_w
return result
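# Example of the metric (hypothetical tensors, not part of the app flow):
#   a = torch.zeros(4, 4)
#   b = torch.full((4, 4), 0.01)
#   calculate_weight_diff(a, b)  # -> ~0.01 (mean absolute element-wise diff, computed in fp32)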
def get_safetensor_index(repo_id, token=None):
"""Download and parse the safetensors index."""
cache_key = f"{repo_id}_index"
if cache_key in _metadata_cache:
return _metadata_cache[cache_key]
try:
index_path = hf_hub_download(repo_id, "model.safetensors.index.json", token=token)
with open(index_path, 'r') as f:
index_data = json.load(f)
weight_map = index_data.get("weight_map", {})
_metadata_cache[cache_key] = weight_map
return weight_map
except Exception:
_metadata_cache[cache_key] = None
return None
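# The weight map from the index file maps each tensor name to the shard that
# contains it, e.g. (illustrative):
#   {"model.layers.0.self_attn.q_proj.weight": "model-00001-of-00030.safetensors", ...}
# Single-file models have no index file; callers below fall back to "model.safetensors".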
# =============================================================================
# STREAMING MODE (Ultra Low Memory - No disk usage)
# =============================================================================
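# Safetensors file layout (per the format spec): the first 8 bytes are a
# little-endian uint64 giving the length of the JSON header; the JSON header
# follows, mapping tensor names to {dtype, shape, data_offsets}; data_offsets
# are byte ranges relative to the start of the data section, which begins
# immediately after the header. Two small range requests are therefore enough
# to locate and fetch any single tensor without downloading the whole file.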
def get_safetensor_header(repo_id, filename, token=None):
"""Fetch only the header of a safetensor file using HTTP range request."""
cache_key = f"{repo_id}_{filename}_header"
if cache_key in _metadata_cache:
return _metadata_cache[cache_key]
url = hf_hub_url(repo_id, filename)
headers = build_hf_headers(token=token)
# First, get the header size (first 8 bytes)
headers["Range"] = "bytes=0-7"
response = requests.get(url, headers=headers)
response.raise_for_status()
header_size = struct.unpack('<Q', response.content)[0]
# Now get the header JSON
headers["Range"] = f"bytes=8-{8 + header_size - 1}"
response = requests.get(url, headers=headers)
response.raise_for_status()
header = json.loads(response.content.decode('utf-8'))
result = {"header": header, "header_size": header_size, "data_offset": 8 + header_size}
_metadata_cache[cache_key] = result
return result
def stream_tensor_from_safetensor(repo_id, filename, tensor_name, token=None):
"""Stream a specific tensor using HTTP range requests - minimal memory usage."""
header_info = get_safetensor_header(repo_id, filename, token)
header = header_info["header"]
data_offset = header_info["data_offset"]
if tensor_name not in header:
raise KeyError(f"Tensor {tensor_name} not found in {filename}")
tensor_info = header[tensor_name]
dtype_str = tensor_info["dtype"]
shape = tensor_info["shape"]
offsets = tensor_info["data_offsets"]
start_offset = offsets[0] + data_offset
end_offset = offsets[1] + data_offset - 1
    dtype_map = {
        "F64": torch.float64,
        "F32": torch.float32,
        "F16": torch.float16,
        "BF16": torch.bfloat16,
        "I64": torch.int64,
        "I32": torch.int32,
        "I16": torch.int16,
        "I8": torch.int8,
        "U8": torch.uint8,
        "BOOL": torch.bool,
    }
    if dtype_str not in dtype_map:
        # Fail loudly rather than silently reinterpreting bytes as float32
        raise ValueError(f"Unsupported dtype {dtype_str} for tensor {tensor_name}")
    torch_dtype = dtype_map[dtype_str]
url = hf_hub_url(repo_id, filename)
headers = build_hf_headers(token=token)
headers["Range"] = f"bytes={start_offset}-{end_offset}"
response = requests.get(url, headers=headers)
response.raise_for_status()
tensor = torch.frombuffer(bytearray(response.content), dtype=torch_dtype).reshape(shape).clone()
del response
return tensor
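# Illustrative call (repo, shard, and tensor names are hypothetical):
#   t = stream_tensor_from_safetensor(
#       "org/model", "model-00001-of-00002.safetensors",
#       "model.layers.0.self_attn.q_proj.weight", token=None)
# Only the bytes for that single tensor travel over the network.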
def load_tensor_streaming(repo_id, tensor_name, token=None):
"""Load a specific tensor using streaming - minimal memory."""
weight_map = get_safetensor_index(repo_id, token)
if weight_map is not None:
if tensor_name not in weight_map:
raise KeyError(f"Tensor {tensor_name} not found in weight map for {repo_id}")
filename = weight_map[tensor_name]
else:
filename = "model.safetensors"
return stream_tensor_from_safetensor(repo_id, filename, tensor_name, token)
def calculate_layer_diffs_streaming(base_repo, chat_repo, token=None, progress=None):
"""Ultra memory-efficient: streams individual tensors via HTTP range requests."""
global _metadata_cache
_metadata_cache = {}
print("Fetching model configuration...")
base_config = AutoConfig.from_pretrained(base_repo, token=token, trust_remote_code=True)
num_layers = base_config.num_hidden_layers
components_to_track = [
('input_layernorm', 'model.layers.{}.input_layernorm.weight'),
('self_attn_q_proj', 'model.layers.{}.self_attn.q_proj.weight'),
('self_attn_k_proj', 'model.layers.{}.self_attn.k_proj.weight'),
('self_attn_v_proj', 'model.layers.{}.self_attn.v_proj.weight'),
('self_attn_o_proj', 'model.layers.{}.self_attn.o_proj.weight'),
('post_attention_layernorm', 'model.layers.{}.post_attention_layernorm.weight'),
('mlp_gate_proj', 'model.layers.{}.mlp.gate_proj.weight'),
('mlp_up_proj', 'model.layers.{}.mlp.up_proj.weight'),
('mlp_down_proj', 'model.layers.{}.mlp.down_proj.weight'),
]
layer_diffs = []
total_ops = num_layers * len(components_to_track)
current_op = 0
print(f"Processing {num_layers} layers in streaming mode...")
get_safetensor_index(base_repo, token)
get_safetensor_index(chat_repo, token)
for layer_idx in range(num_layers):
layer_data = {}
for name, pattern in components_to_track:
tensor_name = pattern.format(layer_idx)
try:
base_tensor = load_tensor_streaming(base_repo, tensor_name, token)
chat_tensor = load_tensor_streaming(chat_repo, tensor_name, token)
diff = calculate_weight_diff(base_tensor, chat_tensor)
layer_data[name] = diff
del base_tensor
del chat_tensor
            except Exception as e:
                print(f"Warning: Could not load {tensor_name}: {e}")
                layer_data[name] = 0.0
current_op += 1
if progress is not None:
progress(current_op / total_ops, desc=f"Layer {layer_idx + 1}/{num_layers}: {name}")
layer_diffs.append(layer_data)
print(f"Completed layer {layer_idx + 1}/{num_layers}")
gc.collect()
_metadata_cache = {}
gc.collect()
return layer_diffs
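# The returned structure is a list with one dict per layer, e.g. (illustrative values):
#   [{"input_layernorm": 0.0012, "self_attn_q_proj": 0.0087, ...}, ...]
# which the visualization functions below consume directly.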
# =============================================================================
# DISK CACHE MODE (Low Memory - Uses disk storage)
# =============================================================================
def download_model_safetensors(repo_id, token=None, progress_callback=None):
"""Download all safetensor files for a model to disk cache."""
print(f"Downloading safetensor files for {repo_id}...")
# Download the entire model's safetensors files
local_dir = snapshot_download(
repo_id,
token=token,
allow_patterns=["*.safetensors", "*.json"],
ignore_patterns=["*.bin", "*.pt", "*.ckpt"],
)
return local_dir
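# snapshot_download returns the local cache directory and reuses any files
# already present in the Hugging Face cache, so repeated runs against the same
# repo do not re-download the shards.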
def get_local_safetensor_files(local_dir):
"""Get list of safetensor files in local directory."""
safetensor_files = []
for f in os.listdir(local_dir):
if f.endswith('.safetensors'):
safetensor_files.append(os.path.join(local_dir, f))
return safetensor_files
def get_local_weight_map(local_dir):
"""Get weight map from local index file."""
index_path = os.path.join(local_dir, "model.safetensors.index.json")
if os.path.exists(index_path):
with open(index_path, 'r') as f:
index_data = json.load(f)
return index_data.get("weight_map", {})
return None
def load_tensor_from_disk(local_dir, tensor_name, weight_map=None):
"""Load a specific tensor from disk-cached safetensor files."""
if weight_map is not None:
if tensor_name not in weight_map:
raise KeyError(f"Tensor {tensor_name} not found in weight map")
filename = weight_map[tensor_name]
file_path = os.path.join(local_dir, filename)
else:
# Single file model
file_path = os.path.join(local_dir, "model.safetensors")
with safe_open(file_path, framework="pt", device="cpu") as f:
if tensor_name not in f.keys():
raise KeyError(f"Tensor {tensor_name} not found in {file_path}")
tensor = f.get_tensor(tensor_name).clone()
return tensor
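# safe_open reads the file lazily, so get_tensor only materializes the bytes of
# the requested tensor; the .clone() detaches the result from the underlying
# file mapping before the handle is closed.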
def calculate_layer_diffs_disk_cache(base_repo, chat_repo, token=None, progress=None):
"""Disk cache mode: downloads full model to disk, loads tensors one at a time."""
print("=" * 60)
print("DISK CACHE MODE")
print("Step 1: Downloading model files to disk cache...")
print("=" * 60)
if progress:
progress(0.05, desc="Downloading base model to disk...")
base_local_dir = download_model_safetensors(base_repo, token)
print(f"Base model cached at: {base_local_dir}")
if progress:
progress(0.15, desc="Downloading chat model to disk...")
chat_local_dir = download_model_safetensors(chat_repo, token)
print(f"Chat model cached at: {chat_local_dir}")
# Get weight maps
base_weight_map = get_local_weight_map(base_local_dir)
chat_weight_map = get_local_weight_map(chat_local_dir)
# Get config
print("\nFetching model configuration...")
base_config = AutoConfig.from_pretrained(base_repo, token=token, trust_remote_code=True)
num_layers = base_config.num_hidden_layers
components_to_track = [
('input_layernorm', 'model.layers.{}.input_layernorm.weight'),
('self_attn_q_proj', 'model.layers.{}.self_attn.q_proj.weight'),
('self_attn_k_proj', 'model.layers.{}.self_attn.k_proj.weight'),
('self_attn_v_proj', 'model.layers.{}.self_attn.v_proj.weight'),
('self_attn_o_proj', 'model.layers.{}.self_attn.o_proj.weight'),
('post_attention_layernorm', 'model.layers.{}.post_attention_layernorm.weight'),
('mlp_gate_proj', 'model.layers.{}.mlp.gate_proj.weight'),
('mlp_up_proj', 'model.layers.{}.mlp.up_proj.weight'),
('mlp_down_proj', 'model.layers.{}.mlp.down_proj.weight'),
]
layer_diffs = []
total_ops = num_layers * len(components_to_track)
current_op = 0
print("=" * 60)
print(f"Step 2: Processing {num_layers} layers from disk cache...")
print("=" * 60)
for layer_idx in range(num_layers):
layer_data = {}
for name, pattern in components_to_track:
tensor_name = pattern.format(layer_idx)
try:
# Load from disk cache - only this tensor goes into RAM
base_tensor = load_tensor_from_disk(base_local_dir, tensor_name, base_weight_map)
chat_tensor = load_tensor_from_disk(chat_local_dir, tensor_name, chat_weight_map)
diff = calculate_weight_diff(base_tensor, chat_tensor)
layer_data[name] = diff
# Free RAM immediately
del base_tensor
del chat_tensor
            except Exception as e:
                print(f"Warning: Could not load {tensor_name}: {e}")
                layer_data[name] = 0.0
current_op += 1
if progress is not None:
# Scale progress from 0.2 to 0.9 for the processing phase
scaled_progress = 0.2 + (current_op / total_ops) * 0.7
progress(scaled_progress, desc=f"Layer {layer_idx + 1}/{num_layers}: {name}")
layer_diffs.append(layer_data)
print(f"Completed layer {layer_idx + 1}/{num_layers}")
# Garbage collect every few layers
if layer_idx % 5 == 0:
gc.collect()
gc.collect()
print("\nProcessing complete!")
return layer_diffs
# =============================================================================
# STANDARD MODE (Full models in memory)
# =============================================================================
def calculate_layer_diffs_standard(base_model, chat_model, progress=None):
"""Standard mode: loads full models into memory."""
layer_diffs = []
layers = list(zip(base_model.model.layers, chat_model.model.layers))
total_layers = len(layers)
components_to_track = [
('input_layernorm', lambda l: l.input_layernorm.weight),
('self_attn_q_proj', lambda l: l.self_attn.q_proj.weight),
('self_attn_k_proj', lambda l: l.self_attn.k_proj.weight),
('self_attn_v_proj', lambda l: l.self_attn.v_proj.weight),
('self_attn_o_proj', lambda l: l.self_attn.o_proj.weight),
('post_attention_layernorm', lambda l: l.post_attention_layernorm.weight),
('mlp_gate_proj', lambda l: l.mlp.gate_proj.weight),
('mlp_up_proj', lambda l: l.mlp.up_proj.weight),
('mlp_down_proj', lambda l: l.mlp.down_proj.weight),
]
print("Calculating differences (standard mode)...")
for idx, (base_layer, chat_layer) in enumerate(layers):
layer_data = {}
for name, getter in components_to_track:
try:
val = calculate_weight_diff(getter(base_layer), getter(chat_layer))
layer_data[name] = val
except AttributeError:
layer_data[name] = 0.0
layer_diffs.append(layer_data)
if progress is not None:
progress((idx + 1) / total_layers, desc=f"Processing layer {idx + 1}/{total_layers}")
return layer_diffs
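# All three modes (streaming, disk cache, standard) return the same
# list-of-dicts structure, so the visualization code below is mode-agnostic.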
# =============================================================================
# VISUALIZATION
# =============================================================================
def visualize_2d_heatmap(layer_diffs, base_model_name, chat_model_name):
"""Generates the static 2D Heatmap image."""
if not layer_diffs:
return None
num_layers = len(layer_diffs)
components = list(layer_diffs[0].keys())
num_components = len(components)
height = max(8, num_layers / 6)
width = max(20, num_components * 2.5)
if num_components > 6:
nrows = 2
ncols = (num_components + 1) // 2
else:
nrows = 1
ncols = num_components
fig, axs = plt.subplots(nrows, ncols, figsize=(width, height * (1.2 if nrows > 1 else 1)))
axs = axs.flatten() if num_components > 1 else [axs]
fig.suptitle(f"Weight Differences: {base_model_name} vs {chat_model_name}", fontsize=16, y=0.98)
tick_font_size = max(6, min(10, 300 / num_layers))
for i, component in enumerate(components):
data = [[row[component]] for row in layer_diffs]
sns.heatmap(data,
annot=True,
fmt=".6f",
cmap="viridis",
ax=axs[i],
cbar=False,
annot_kws={'size': tick_font_size * 0.8})
axs[i].set_title(component, fontsize=12, fontweight='bold')
        # Center the layer-index tick labels on the heatmap cells
        axs[i].set_yticks([y + 0.5 for y in range(num_layers)])
        axs[i].set_yticklabels(range(num_layers), fontsize=tick_font_size)
axs[i].set_xticks([])
axs[i].invert_yaxis()
for j in range(i + 1, len(axs)):
fig.delaxes(axs[j])
plt.tight_layout(rect=[0, 0, 1, 0.96])
buf = io.BytesIO()
fig.savefig(buf, format='png', dpi=150, bbox_inches='tight')
buf.seek(0)
plt.close(fig)
return PIL.Image.open(buf)
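# The heatmap is rendered to an in-memory PNG and returned as a PIL image so
# Gradio's gr.Image component can display it directly.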
def generate_3d_plot(layer_diffs):
"""Generates an interactive 3D Surface plot as a Plotly Figure."""
if not layer_diffs:
return None
df = pd.DataFrame(layer_diffs)
x_labels = df.columns.tolist()
y_labels = df.index.tolist()
z_data = df.values
fig = go.Figure(data=[go.Surface(z=z_data, x=x_labels, y=y_labels, colorscale='Viridis')])
fig.update_layout(
title='3D Landscape of Weight Differences',
scene=dict(
xaxis_title='Model Components',
yaxis_title='Layer Index',
zaxis_title='Mean Weight Diff',
xaxis=dict(tickangle=45),
),
autosize=True,
height=700,
margin=dict(l=65, r=50, b=65, t=90)
)
return fig
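# If needed, the Plotly figure can also be exported outside the UI, e.g.
# fig.write_html("diff_surface.html") for a standalone interactive page
# (illustrative filename).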
# =============================================================================
# MAIN PROCESSING
# =============================================================================
def process_models(base_name, chat_name, hf_token, memory_mode, progress=gr.Progress()):
if not base_name or not chat_name:
raise gr.Error("Please provide both model names.")
token = hf_token if hf_token else None
try:
if memory_mode == "streaming":
# Streaming mode - ultra low memory, no disk
progress(0, desc="Starting streaming mode (ultra low memory)...")
diffs = calculate_layer_diffs_streaming(
base_name,
chat_name,
token=token,
progress=progress
)
elif memory_mode == "disk_cache":
# Disk cache mode - downloads to disk, loads tensors one at a time
progress(0, desc="Starting disk cache mode...")
diffs = calculate_layer_diffs_disk_cache(
base_name,
chat_name,
token=token,
progress=progress
)
else:
# Standard mode - full models in memory
progress(0, desc=f"Loading {base_name}...")
print(f"Loading {base_name}...")
base_model = AutoModelForCausalLM.from_pretrained(
base_name,
torch_dtype=torch.bfloat16,
token=token,
trust_remote_code=True,
low_cpu_mem_usage=True
)
progress(0.3, desc=f"Loading {chat_name}...")
print(f"Loading {chat_name}...")
chat_model = AutoModelForCausalLM.from_pretrained(
chat_name,
torch_dtype=torch.bfloat16,
token=token,
trust_remote_code=True,
low_cpu_mem_usage=True
)
progress(0.5, desc="Calculating differences...")
diffs = calculate_layer_diffs_standard(base_model, chat_model, progress=None)
del base_model
del chat_model
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
progress(0.9, desc="Generating visualizations...")
img_2d = visualize_2d_heatmap(diffs, base_name, chat_name)
plot_3d = generate_3d_plot(diffs)
progress(1.0, desc="Complete!")
return img_2d, plot_3d
except Exception as e:
import traceback
traceback.print_exc()
raise gr.Error(f"Error processing models: {str(e)}")
# =============================================================================
# GRADIO UI
# =============================================================================
with gr.Blocks(title="Model Diff Visualizer") as demo:
gr.Markdown("# 🧠 LLM Weight Difference Visualizer")
gr.Markdown("Compare the weights of a Base model vs. its Instruct/Chat tuned version layer by layer.")
with gr.Row():
with gr.Column(scale=1):
base_input = gr.Textbox(
label="Base Model Name",
placeholder="e.g., meta-llama/Llama-3.3-70B-Instruct"
)
chat_input = gr.Textbox(
label="Chat/Tuned Model Name",
placeholder="e.g., CrucibleLab/L3.3-70B-Loki-V2.0"
)
token_input = gr.Textbox(
label="Hugging Face Token (Optional)",
type="password",
placeholder="hf_..."
)
memory_mode = gr.Radio(
label="Memory Mode",
choices=[
("πŸš€ Standard (Fast, High RAM)", "standard"),
("πŸ’Ύ Disk Cache (Medium Speed, Low RAM, Uses Disk)", "disk_cache"),
("🐒 Streaming (Slow, Ultra Low RAM, No Disk)", "streaming"),
],
value="standard",
info="Choose based on your available RAM and disk space"
)
with gr.Accordion("Memory Mode Details", open=False):
gr.Markdown("""
### πŸš€ Standard Mode
- **RAM Usage:** ~2x model size (e.g., ~280GB for 70B models)
- **Disk Usage:** HuggingFace cache only
- **Speed:** Fastest
- **Best for:** Machines with lots of RAM
### πŸ’Ύ Disk Cache Mode
- **RAM Usage:** ~2-4GB (only one tensor at a time)
- **Disk Usage:** ~2x model size (downloads full safetensors)
- **Speed:** Medium (disk I/O bound)
- **Best for:** Machines with limited RAM but plenty of disk space
### 🐒 Streaming Mode
- **RAM Usage:** ~1-2GB (streams bytes directly)
- **Disk Usage:** Minimal (only metadata cached)
- **Speed:** Slowest (many HTTP requests)
- **Best for:** Very constrained environments, or when disk space is also limited
""")
submit_btn = gr.Button("πŸš€ Analyze Differences", variant="primary")
with gr.Row():
with gr.Column():
gr.Markdown("### 2D Layer-wise Heatmap")
output_2d = gr.Image(label="2D Visualization", type="pil")
with gr.Row():
with gr.Column():
gr.Markdown("### 3D Interactive Landscape")
output_3d = gr.Plot(label="3D Visualization")
submit_btn.click(
fn=process_models,
inputs=[base_input, chat_input, token_input, memory_mode],
outputs=[output_2d, output_3d]
)
if __name__ == "__main__":
demo.launch(share=False, server_port=7860)