import gradio as gr
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
import tempfile
import os
import hashlib
from gradio import Progress
# Function to get OID from a raw Hugging Face LFS file URL
def get_lfs_oid(raw_url: str) -> str | None:
"""
Fetches the content of a raw Hugging Face LFS file URL and extracts the SHA256 OID.
"""
try:
response = requests.get(raw_url, timeout=10)
response.raise_for_status() # Raise an HTTPError for bad responses (4xx or 5xx)
content = response.text
for line in content.splitlines():
if line.startswith("oid sha256:"):
return line.split("sha256:")[1].strip()
return None
except requests.exceptions.RequestException as e:
print(f"Error fetching OID from {raw_url}: {e}")
return None
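# A raw Hugging Face LFS pointer (served from the /raw/ endpoint) typically looks like:
#   version https://git-lfs.github.com/spec/v1
#   oid sha256:4bf6d2a6...   <- 64-hex-character digest (value here is illustrative)
#   size 4999802720
# get_lfs_oid() returns only the hex digest after "sha256:".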
# Function to get .safetensors file info (file list and OIDs) using only HTTP requests
def get_model_safetensors_info(model_id: str) -> tuple[dict, str]:
"""
Fetches safetensors file information for a Hugging Face model using HTTP requests.
Returns {filename: oid} and error_message.
"""
safetensors_oids = {}
error_message = ""
try:
# Use Hugging Face Hub REST API to get file list
api_url = f"https://huggingface.co/api/models/{model_id}"
resp = requests.get(api_url, timeout=10)
if resp.status_code != 200:
error_message += f"Could not fetch file list for {model_id}: HTTP {resp.status_code}\n"
return safetensors_oids, error_message
data = resp.json()
files = [f['rfilename'] for f in data.get('siblings', []) if f['rfilename'].endswith('.safetensors')]
if not files:
error_message += f"No .safetensors files found for {model_id}.\n"
return safetensors_oids, error_message
# Parallel OID fetch
def fetch_oid(f):
raw_url = f"https://huggingface.co/{model_id}/raw/main/{f}"
oid = get_lfs_oid(raw_url)
return f, oid
with ThreadPoolExecutor(max_workers=min(8, len(files))) as executor:
future_to_file = {executor.submit(fetch_oid, f): f for f in files}
for future in as_completed(future_to_file):
f, oid = future.result()
if oid:
safetensors_oids[f] = oid
else:
error_message += f"Could not get OID for {f} in {model_id}.\n"
except Exception as e:
error_message += f"Error fetching info for {model_id}: {e}\n"
return safetensors_oids, error_message
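# Illustrative return value (filenames and digests below are placeholders, not real data):
#   ({"model-00001-of-00002.safetensors": "4bf6d2a6...",
#     "model-00002-of-00002.safetensors": "91c0ff3b..."}, "")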
# Main comparison function (no config, only file structure and OIDs)
def compare_hf_models(model_id1: str, model_id2: str) -> str:
"""
Compares two Hugging Face models based on their safetensors OIDs.
"""
if not model_id1 or not model_id2:
return "Please provide both model IDs."
output = []
output.append(f"--- Fetching info for Model 1: {model_id1} ---")
oids1, err1 = get_model_safetensors_info(model_id1)
if err1: output.append(err1)
output.append(f"Found {len(oids1)} .safetensors files for {model_id1}.")
output.append(f"\n--- Fetching info for Model 2: {model_id2} ---")
oids2, err2 = get_model_safetensors_info(model_id2)
if err2: output.append(err2)
output.append(f"Found {len(oids2)} .safetensors files for {model_id2}.")
# 1. Compare Safetensors OIDs
output.append("\n--- Safetensors Weight File Comparison (via OID) ---")
if not oids1 and not oids2:
output.append("No .safetensors files found for either model. Cannot compare weights.")
weights_identical = False
elif not oids1:
output.append(f"No .safetensors files found for {model_id1}. Cannot compare weights.")
weights_identical = False
elif not oids2:
output.append(f"No .safetensors files found for {model_id2}. Cannot compare weights.")
weights_identical = False
else:
# Check if file lists are identical
files1_set = set(oids1.keys())
files2_set = set(oids2.keys())
if files1_set != files2_set:
output.append("The set of .safetensors files differs between models.")
output.append(f"Files in {model_id1} but not {model_id2}: {files1_set - files2_set}")
output.append(f"Files in {model_id2} but not {model_id1}: {files2_set - files1_set}")
weights_identical = False
else:
output.append("The models have the same set of .safetensors files.")
all_oids_match = True
diff_files = []
for filename in files1_set:
if oids1[filename] != oids2[filename]:
all_oids_match = False
diff_files.append(filename)
if all_oids_match:
output.append("All corresponding .safetensors OIDs are IDENTICAL.")
output.append(f"This strongly suggests '{model_id1}' and '{model_id2}' are 'copy-paste' models at the weight level.")
weights_identical = True
else:
output.append(f"Some .safetensors OIDs DIFFER. Differing files: {', '.join(diff_files)}")
output.append(f"This indicates different weights. If file structure is identical, '{model_id2}' could be a 'fine-tuned' version of '{model_id1}' (or vice-versa, or both fine-tuned from a common base).")
weights_identical = False
output.append("\n--- Summary ---")
if weights_identical:
output.append(f"Conclusion: Models '{model_id1}' and '{model_id2}' are IDENTICAL (copy-paste).")
else:
output.append(f"Conclusion: Models '{model_id1}' and '{model_id2}' have different weights or file structures. They are distinct or fine-tuned models.")
return "\n".join(output)
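# Example call (returns a plain-text report; the model IDs are the same placeholders used in the UI below):
#   print(compare_hf_models("deepseek-ai/DeepSeek-R1-0528", "Parveshiiii/DeepSeek-R1-0528-MathX"))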
def multi_compare_hf_models(model_ids: list) -> tuple:
if not model_ids or len(model_ids) < 2:
return "Please provide at least two model IDs.", None, None
details = []
safetensors_data = {}
errors = {}
# Fetch all model info in parallel
with ThreadPoolExecutor(max_workers=min(8, len(model_ids))) as executor:
future_to_model = {executor.submit(get_model_safetensors_info, mid): mid for mid in model_ids}
for future in as_completed(future_to_model):
mid = future_to_model[future]
oids, err = future.result()
safetensors_data[mid] = oids
errors[mid] = err
# Build summary
summary = []
all_files = set()
for mid, oids in safetensors_data.items():
all_files.update(oids.keys())
all_files = sorted(all_files)
# Table header
table = [["File"] + model_ids + ["Match"]]
for f in all_files:
row = [f]
oids_for_file = []
for mid in model_ids:
oid = safetensors_data.get(mid, {}).get(f, "-")
oids_for_file.append(oid if oid else "-")
row.append(oid if oid else "-")
# Determine if all OIDs for this file match (ignoring missing)
present_oids = [oid for oid in oids_for_file if oid != "-"]
if len(present_oids) > 1 and all(oid == present_oids[0] for oid in present_oids):
row.append("Match")
else:
row.append("Unmatch")
table.append(row)
# Per-model details
for mid in model_ids:
oids = safetensors_data.get(mid, {})
summary.append(f"{mid}: {len(oids)} .safetensors files.")
if errors[mid]:
summary.append(f"Errors for {mid}: {errors[mid]}")
# File presence summary
for f in all_files:
present = [mid for mid in model_ids if f in safetensors_data.get(mid, {})]
if len(present) != len(model_ids):
summary.append(f"File '{f}' missing in: {set(model_ids) - set(present)}")
return "\n".join(summary), table, safetensors_data
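# Not wired into the UI below; kept as a utility for comparing more than two models at once.
# Illustrative shapes of the three return values (IDs and digests are placeholders):
#   summary:          "org-a/model: 2 .safetensors files.\norg-b/model: 2 .safetensors files."
#   table:            [["File", "org-a/model", "org-b/model", "Match"],
#                      ["model-00001-of-00002.safetensors", "4bf6d2a6...", "4bf6d2a6...", "Match"]]
#   safetensors_data: {"org-a/model": {...}, "org-b/model": {...}}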
def download_file(url, dest):
    """Streams url to dest in 8 KB chunks; returns (success, error_message). Not used by the UI, which calls the progress-aware variant below."""
try:
r = requests.get(url, stream=True, timeout=30)
r.raise_for_status()
with open(dest, 'wb') as f:
for chunk in r.iter_content(chunk_size=8192):
f.write(chunk)
return True, ""
except Exception as e:
return False, str(e)
def download_file_with_progress(url, dest, progress: Progress = None):
    """Streams url to dest in 8 KB chunks, reporting progress via a gradio Progress callback; returns (success, error_message)."""
try:
r = requests.get(url, stream=True, timeout=30)
r.raise_for_status()
total = int(r.headers.get('content-length', 0))
downloaded = 0
with open(dest, 'wb') as f:
for chunk in r.iter_content(chunk_size=8192):
if chunk:
f.write(chunk)
downloaded += len(chunk)
if progress and total:
progress((downloaded / total), desc=f"Downloading {os.path.basename(dest)}: {downloaded//1024}KB/{total//1024}KB")
return True, ""
except Exception as e:
return False, str(e)
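# Note: gr.Progress objects are callable; progress(fraction, desc=...) with fraction in [0, 1]
# updates the progress bar shown on the output component while the event handler runs.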
def file_similarity(file1, file2, chunk_size=1024*1024):
"""
Compares two files byte-by-byte and returns percent similarity (by identical bytes).
"""
size1 = os.path.getsize(file1)
size2 = os.path.getsize(file2)
if size1 != size2:
return 0.0, f"File sizes differ: {size1} vs {size2} bytes."
total = size1
same = 0
with open(file1, 'rb') as f1, open(file2, 'rb') as f2:
while True:
b1 = f1.read(chunk_size)
b2 = f2.read(chunk_size)
if not b1:
break
for x, y in zip(b1, b2):
if x == y:
same += 1
percent = (same / total) * 100 if total else 0.0
return percent, None
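# Optional sketch (not used by the UI): when only an identical/not-identical verdict is needed,
# hashing each file once with the already-imported hashlib is far cheaper than the Python-level
# byte loop above, at the cost of losing the percent-similarity figure.
def file_sha256(path, chunk_size=1024 * 1024):
    """Returns the hex SHA256 digest of a file, read in chunks."""
    h = hashlib.sha256()
    with open(path, 'rb') as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            h.update(chunk)
    return h.hexdigest()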
# Gradio Interface
with gr.Blocks(theme="soft") as demo:
gr.Markdown(
"""
# 🤖 Hugging Face Model Cross-Checker
Easily check if two Hugging Face models are **identical (copy-paste)**, **fine-tuned**, or **completely different**—without downloading any weights!
- Enter two model IDs below (e.g. `deepseek-ai/DeepSeek-R1-0528` and `Parveshiiii/DeepSeek-R1-0528-MathX`).
- Click **Compare** to see a clear verdict and detailed breakdown.
"""
)
with gr.Row():
model1 = gr.Textbox(label="Model ID 1", placeholder="e.g. deepseek-ai/DeepSeek-R1-0528")
model2 = gr.Textbox(label="Model ID 2", placeholder="e.g. Parveshiiii/DeepSeek-R1-0528-MathX")
compare_btn = gr.Button("Compare")
verdict = gr.HighlightedText(label="Result Verdict", color_map={"Copy-Paste":"green","Fine-Tuned":"orange","Different":"red","Error":"gray"})
details = gr.Dataframe(headers=["File","Model 1 OID","Model 2 OID","Match"], label="File-by-File Comparison", interactive=False)
summary = gr.Textbox(label="Summary Details", lines=8, interactive=False)
def crosscheck_ui(m1, m2):
if not m1 or not m2:
return [("Error: Please provide both model IDs.", "Error")], [], ""
oids1, err1 = get_model_safetensors_info(m1)
oids2, err2 = get_model_safetensors_info(m2)
if err1 or err2:
return [(f"Error: {err1 or ''} {err2 or ''}", "Error")], [], ""
files = sorted(set(oids1.keys()) | set(oids2.keys()))
table = []
all_match = True
all_present = True
diff_count = 0
for f in files:
oid1 = oids1.get(f, "-")
oid2 = oids2.get(f, "-")
if oid1 == oid2 and oid1 != "-":
match = "Match"
else:
match = "Unmatch"
all_match = False
if oid1 != "-" and oid2 != "-":
diff_count += 1
if oid1 == "-" or oid2 == "-":
all_present = False
table.append([f, oid1, oid2, match])
# Verdict logic
if all_match and all_present and files:
verdict_text = [("Copy-Paste: Models are identical at the safetensors level!", "Copy-Paste")]
elif all_present and diff_count > 0:
verdict_text = [("Fine-Tuned: Same file structure, but weights differ.", "Fine-Tuned")]
else:
verdict_text = [("Different: File structure or weights are different.", "Different")]
# Summary
summary_lines = [
f"Model 1: {m1} ({len(oids1)} .safetensors files)",
f"Model 2: {m2} ({len(oids2)} .safetensors files)",
f"Files compared: {len(files)}",
f"Matching files: {sum(1 for row in table if row[3]=='Match')}",
f"Unmatched files: {sum(1 for row in table if row[3]=='Unmatch')}",
]
missing1 = [f for f in files if oids1.get(f) is None]
missing2 = [f for f in files if oids2.get(f) is None]
if missing1:
summary_lines.append(f"Files missing in Model 1: {', '.join(missing1)}")
if missing2:
summary_lines.append(f"Files missing in Model 2: {', '.join(missing2)}")
return verdict_text, table, "\n".join(summary_lines)
compare_btn.click(
fn=crosscheck_ui,
inputs=[model1, model2],
outputs=[verdict, details, summary]
)
with gr.Accordion("Advanced: Compare File Shards Bitwise", open=False):
gr.Markdown("""
## Compare a specific file (shard) from both models, byte-by-byte
- Enter the file name (e.g. `model-00001-of-00010.safetensors`).
- The tool will download this file from both models and compare their contents.
- Shows the percent of identical bytes (100% = exact copy).
""")
adv_model1 = gr.Textbox(label="Model ID 1", placeholder="e.g. deepseek-ai/DeepSeek-R1-0528")
adv_model2 = gr.Textbox(label="Model ID 2", placeholder="e.g. Parveshiiii/DeepSeek-R1-0528-MathX")
adv_filename = gr.Textbox(label="File Name", placeholder="e.g. model-00001-of-00010.safetensors")
adv_btn = gr.Button("Download & Compare File")
adv_result = gr.Textbox(label="Bitwise Comparison Result", lines=3, interactive=False)
def adv_compare(m1, m2, fname, progress=gr.Progress(track_tqdm=True)):
if not m1 or not m2 or not fname:
return "Please provide both model IDs and the file name."
url1 = f"https://huggingface.co/{m1}/resolve/main/{fname}?download=true"
url2 = f"https://huggingface.co/{m2}/resolve/main/{fname}?download=true"
with tempfile.TemporaryDirectory() as tmp:
f1 = os.path.join(tmp, "f1.safetensors")
f2 = os.path.join(tmp, "f2.safetensors")
ok1, err1 = download_file_with_progress(url1, f1, progress)
ok2, err2 = download_file_with_progress(url2, f2, progress)
if not ok1 or not ok2:
return f"Download error: {err1 or ''} {err2 or ''}"
percent, err = file_similarity(f1, f2)
if err:
return f"Comparison error: {err}"
return f"Similarity: {percent:.2f}% ({'identical' if percent==100 else 'different'})"
adv_btn.click(
fn=adv_compare,
inputs=[adv_model1, adv_model2, adv_filename],
outputs=[adv_result]
)
demo.launch()