Spaces:
Running
Running
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import requests
|
3 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
4 |
+
import tempfile
|
5 |
+
import os
|
6 |
+
import hashlib
|
7 |
+
|
8 |
+
# Function to get OID from a raw Hugging Face LFS file URL
|
9 |
+
def get_lfs_oid(raw_url: str) -> str | None:
    """Return the SHA256 OID found in the Git LFS pointer served at *raw_url*.

    The response body is scanned for a line starting with ``oid sha256:``
    and the text after that prefix is returned (stripped).

    Args:
        raw_url: URL expected to serve a Git LFS pointer file as text.

    Returns:
        The OID string, or ``None`` when the request fails or no
        ``oid sha256:`` line is present in the inspected bytes.
    """
    # The Git LFS spec keeps pointer files under 1024 bytes, so there is no
    # need to pull (and text-decode) a whole body when the URL turns out to
    # serve a real binary file instead of a pointer.
    max_pointer_bytes = 1024
    prefix = "oid sha256:"

    try:
        # stream=True + context manager: read only the head of the body and
        # always release the connection afterwards.
        with requests.get(raw_url, timeout=10, stream=True) as response:
            response.raise_for_status()  # 4xx / 5xx -> RequestException
            head = bytearray()
            for chunk in response.iter_content(chunk_size=max_pointer_bytes):
                head.extend(chunk)
                if len(head) >= max_pointer_bytes:
                    break
    except requests.exceptions.RequestException as e:
        print(f"Error fetching OID from {raw_url}: {e}")
        return None

    # errors="replace" keeps binary garbage from raising; it simply will not
    # contain the pointer prefix.
    for line in head.decode("utf-8", errors="replace").splitlines():
        if line.startswith(prefix):
            return line[len(prefix):].strip()
    return None
|
24 |
+
|
25 |
+
# Function to get .safetensors file info (file list and OIDs) using only HTTP requests
|
26 |
+
def get_model_safetensors_info(model_id: str) -> tuple[dict, str]:
    """Collect the LFS OIDs of every ``.safetensors`` file of a model.

    The file list is read from the ``siblings`` entry of the Hugging Face
    ``/api/models/{model_id}`` response; each file's OID is then resolved
    through ``get_lfs_oid`` on its ``raw/main`` URL, in parallel.

    Args:
        model_id: Repository ID such as ``"org/name"``. Surrounding
            whitespace is ignored.

    Returns:
        ``(oids, error_message)`` where ``oids`` maps file name to OID (in
        the order the API listed the files) and ``error_message`` is a
        newline-terminated concatenation of every problem encountered
        (empty string when there were none). This function does not raise;
        failures are reported through ``error_message``.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import list.
    from urllib.parse import quote

    safetensors_oids = {}
    problems = []

    try:
        model_id = model_id.strip()

        # Use the Hugging Face Hub REST API to get the file list.
        api_url = f"https://huggingface.co/api/models/{model_id}"
        resp = requests.get(api_url, timeout=10)
        if resp.status_code != 200:
            problems.append(
                f"Could not fetch file list for {model_id}: HTTP {resp.status_code}\n"
            )
            return safetensors_oids, "".join(problems)

        siblings = resp.json().get("siblings", [])
        files = [
            entry["rfilename"]
            for entry in siblings
            if entry.get("rfilename", "").endswith(".safetensors")
        ]
        if not files:
            problems.append(f"No .safetensors files found for {model_id}.\n")
            return safetensors_oids, "".join(problems)

        def fetch_oid(filename):
            # quote() keeps "/" but escapes characters such as "#", "?" and
            # "%" that would otherwise change the meaning of the URL.
            raw_url = f"https://huggingface.co/{model_id}/raw/main/{quote(filename)}"
            return get_lfs_oid(raw_url)

        # executor.map yields results in input order, so both the returned
        # dict and the error text are deterministic across runs.
        with ThreadPoolExecutor(max_workers=min(8, len(files))) as executor:
            for filename, oid in zip(files, executor.map(fetch_oid, files)):
                if oid:
                    safetensors_oids[filename] = oid
                else:
                    problems.append(f"Could not get OID for {filename} in {model_id}.\n")

    except Exception as e:
        # Deliberate catch-all: callers rely on errors being reported in the
        # returned message rather than raised.
        problems.append(f"Error fetching info for {model_id}: {e}\n")

    return safetensors_oids, "".join(problems)
|
66 |
+
|
67 |
+
# Main comparison function (no config, only file structure and OIDs)
|
68 |
+
def compare_hf_models(model_id1: str, model_id2: str) -> str:
    """Compare two Hugging Face models by their ``.safetensors`` OIDs.

    Both models are looked up with ``get_model_safetensors_info``; the
    resulting ``{filename: oid}`` maps are compared first by file set and
    then OID by OID.

    Args:
        model_id1: First model ID.
        model_id2: Second model ID.

    Returns:
        A multi-line, human-readable report ending in a ``--- Summary ---``
        section, or a short prompt when either ID is empty.
    """
    if not model_id1 or not model_id2:
        return "Please provide both model IDs."

    output = []

    output.append(f"--- Fetching info for Model 1: {model_id1} ---")
    oids1, err1 = get_model_safetensors_info(model_id1)
    if err1:
        output.append(err1)
    output.append(f"Found {len(oids1)} .safetensors files for {model_id1}.")

    output.append(f"\n--- Fetching info for Model 2: {model_id2} ---")
    oids2, err2 = get_model_safetensors_info(model_id2)
    if err2:
        output.append(err2)
    output.append(f"Found {len(oids2)} .safetensors files for {model_id2}.")

    # 1. Compare Safetensors OIDs
    output.append("\n--- Safetensors Weight File Comparison (via OID) ---")

    # True / False once a comparison was made; None when at least one side
    # has no OIDs, so that the summary does not claim a difference that was
    # never actually observed.
    weights_identical = None

    if not oids1 and not oids2:
        output.append("No .safetensors files found for either model. Cannot compare weights.")
    elif not oids1:
        output.append(f"No .safetensors files found for {model_id1}. Cannot compare weights.")
    elif not oids2:
        output.append(f"No .safetensors files found for {model_id2}. Cannot compare weights.")
    else:
        files1_set = set(oids1)
        files2_set = set(oids2)

        if files1_set != files2_set:
            # Sorted, comma-separated names instead of a raw set repr
            # (which would print "set()" for an empty difference).
            only_in_1 = ", ".join(sorted(files1_set - files2_set)) or "none"
            only_in_2 = ", ".join(sorted(files2_set - files1_set)) or "none"
            output.append("The set of .safetensors files differs between models.")
            output.append(f"Files in {model_id1} but not {model_id2}: {only_in_1}")
            output.append(f"Files in {model_id2} but not {model_id1}: {only_in_2}")
            weights_identical = False
        else:
            output.append("The models have the same set of .safetensors files.")
            # Sorted so the report is stable from run to run.
            diff_files = sorted(
                name for name in files1_set if oids1[name] != oids2[name]
            )

            if not diff_files:
                output.append("All corresponding .safetensors OIDs are IDENTICAL.")
                output.append(f"This strongly suggests '{model_id1}' and '{model_id2}' are 'copy-paste' models at the weight level.")
                weights_identical = True
            else:
                output.append(f"Some .safetensors OIDs DIFFER. Differing files: {', '.join(diff_files)}")
                output.append(f"This indicates different weights. If file structure is identical, '{model_id2}' could be a 'fine-tuned' version of '{model_id1}' (or vice-versa, or both fine-tuned from a common base).")
                weights_identical = False

    output.append("\n--- Summary ---")
    if weights_identical is None:
        output.append(f"Conclusion: Could not compare '{model_id1}' and '{model_id2}' because .safetensors information is missing for at least one model.")
    elif weights_identical:
        output.append(f"Conclusion: Models '{model_id1}' and '{model_id2}' are IDENTICAL (copy-paste).")
    else:
        output.append(f"Conclusion: Models '{model_id1}' and '{model_id2}' have different weights or file structures. They are distinct or fine-tuned models.")

    return "\n".join(output)
|
134 |
+
|
135 |
+
def multi_compare_hf_models(model_ids: list) -> tuple:
    """Compare the ``.safetensors`` OIDs of two or more models.

    Every model is looked up with ``get_model_safetensors_info`` in
    parallel, then a per-file table and a textual summary are built.

    Args:
        model_ids: Model IDs to compare; at least two are required.

    Returns:
        ``(summary, table, safetensors_data)`` where

        * ``summary`` is a newline-joined report (file counts, errors and
          files missing from some models),
        * ``table`` is a list of rows; the first row is the header
          ``["File", *model_ids, "Match"]`` and each following row holds
          the file name, one OID (or ``"-"``) per model, and ``"Match"``
          or ``"Unmatch"``,
        * ``safetensors_data`` maps each model ID to its
          ``{filename: oid}`` dict, in the order of ``model_ids``.

        When fewer than two IDs are given the result is
        ``("Please provide at least two model IDs.", None, None)``.
    """
    if not model_ids or len(model_ids) < 2:
        return "Please provide at least two model IDs.", None, None

    safetensors_data = {}
    errors = {}

    # Fetch all model info in parallel. executor.map returns results in
    # input order, so the returned dict order does not depend on which
    # request happened to finish first.
    with ThreadPoolExecutor(max_workers=min(8, len(model_ids))) as executor:
        results = executor.map(get_model_safetensors_info, model_ids)
        for mid, (oids, err) in zip(model_ids, results):
            safetensors_data[mid] = oids
            errors[mid] = err

    all_files = sorted({name for oids in safetensors_data.values() for name in oids})

    # Table: header row, then one row per file.
    table = [["File"] + model_ids + ["Match"]]
    for f in all_files:
        oids_for_file = [safetensors_data[mid].get(f) or "-" for mid in model_ids]
        # A file counts as matching when at least two models have it and
        # every OID that is present is the same (missing entries ignored).
        present_oids = [oid for oid in oids_for_file if oid != "-"]
        matched = len(present_oids) > 1 and len(set(present_oids)) == 1
        table.append([f] + oids_for_file + ["Match" if matched else "Unmatch"])

    # Per-model details.
    summary = []
    for mid in model_ids:
        summary.append(f"{mid}: {len(safetensors_data[mid])} .safetensors files.")
        if errors[mid]:
            summary.append(f"Errors for {mid}: {errors[mid]}")

    # File presence summary: names listed in input order (de-duplicated)
    # rather than as a raw set repr with arbitrary ordering.
    for f in all_files:
        missing = [mid for mid in model_ids if f not in safetensors_data[mid]]
        if missing:
            summary.append(f"File '{f}' missing in: {', '.join(dict.fromkeys(missing))}")

    return "\n".join(summary), table, safetensors_data
|
183 |
+
|
184 |
+
def download_file(url, dest):
    """Stream the resource at *url* into the local file *dest*.

    Args:
        url: HTTP(S) URL to download.
        dest: Path of the file to (over)write with the response body.

    Returns:
        ``(True, "")`` on success, or ``(False, message)`` where ``message``
        is the string form of whatever exception occurred. This function
        does not raise.
    """
    try:
        # The context manager guarantees the streamed connection is released
        # even when raise_for_status() or a write fails part-way through.
        with requests.get(url, stream=True, timeout=30) as r:
            r.raise_for_status()
            with open(dest, 'wb') as f:
                # 1 MiB chunks: far fewer Python-level iterations than 8 KiB
                # when the payload is a multi-gigabyte weight shard.
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
        return True, ""
    except Exception as e:
        # Deliberate catch-all: callers branch on the returned flag and show
        # the message instead of handling exceptions themselves.
        return False, str(e)
|
194 |
+
|
195 |
+
def file_similarity(file1, file2, chunk_size=1024*1024):
    """Compare two files position by position and report percent similarity.

    Args:
        file1: Path of the first file.
        file2: Path of the second file.
        chunk_size: Number of bytes read from each file per iteration.

    Returns:
        ``(percent, error)``. When the sizes differ the result is
        ``(0.0, "File sizes differ: ...")`` and no content is read.
        Otherwise ``percent`` is the share of byte positions holding the
        same value in both files (0-100) and ``error`` is ``None``. Two
        empty files are reported as 100% similar.
    """
    size1 = os.path.getsize(file1)
    size2 = os.path.getsize(file2)
    if size1 != size2:
        return 0.0, f"File sizes differ: {size1} vs {size2} bytes."

    total = size1
    if total == 0:
        # Two empty files have no differing byte, so they are identical.
        return 100.0, None

    same = 0
    with open(file1, 'rb') as f1, open(file2, 'rb') as f2:
        while b1 := f1.read(chunk_size):
            b2 = f2.read(chunk_size)
            if b1 == b2:
                # Fast path: whole-chunk comparison runs in C, which matters
                # for multi-gigabyte shards that are mostly or fully equal.
                same += len(b1)
            else:
                same += sum(x == y for x, y in zip(b1, b2))

    return (same / total) * 100, None
|
216 |
+
|
217 |
+
# Gradio Interface
|
218 |
+
# Gradio interface: a two-model OID cross-check plus an optional byte-level
# comparison of a single file downloaded from both models.
with gr.Blocks(theme="soft") as demo:
    gr.Markdown(
        """
        # 🤖 Hugging Face Model Cross-Checker
        Easily check if two Hugging Face models are **identical (copy-paste)**, **fine-tuned**, or **completely different**—without downloading any weights!

        - Enter two model IDs below (e.g. `deepseek-ai/DeepSeek-R1-0528` and `Parveshiiii/DeepSeek-R1-0528-MathX`).
        - Click **Compare** to see a clear verdict and detailed breakdown.
        """
    )
    with gr.Row():
        model1 = gr.Textbox(label="Model ID 1", placeholder="e.g. deepseek-ai/DeepSeek-R1-0528")
        model2 = gr.Textbox(label="Model ID 2", placeholder="e.g. Parveshiiii/DeepSeek-R1-0528-MathX")
    compare_btn = gr.Button("Compare")
    verdict = gr.HighlightedText(label="Result Verdict", color_map={"Copy-Paste":"green","Fine-Tuned":"orange","Different":"red","Error":"gray"})
    details = gr.Dataframe(headers=["File","Model 1 OID","Model 2 OID","Match"], label="File-by-File Comparison", interactive=False)
    summary = gr.Textbox(label="Summary Details", lines=8, interactive=False)

    def crosscheck_ui(m1, m2):
        """Build the verdict, per-file table and summary for two model IDs.

        Returns a 3-tuple matching the ``verdict``, ``details`` and
        ``summary`` components: a list of ``(text, label)`` pairs, a list
        of ``[file, oid1, oid2, match]`` rows, and a newline-joined summary.
        """
        # Pasted IDs often carry stray spaces or newlines; a whitespace-only
        # value is treated the same as an empty one.
        m1 = (m1 or "").strip()
        m2 = (m2 or "").strip()
        if not m1 or not m2:
            return [("Error: Please provide both model IDs.", "Error")], [], ""

        oids1, err1 = get_model_safetensors_info(m1)
        oids2, err2 = get_model_safetensors_info(m2)
        if err1 or err2:
            return [(f"Error: {err1 or ''} {err2 or ''}", "Error")], [], ""

        files = sorted(set(oids1.keys()) | set(oids2.keys()))
        table = []
        all_match = True
        all_present = True
        diff_count = 0  # files present in both models whose OIDs differ
        for f in files:
            oid1 = oids1.get(f, "-")
            oid2 = oids2.get(f, "-")
            if oid1 == oid2 and oid1 != "-":
                match = "Match"
            else:
                match = "Unmatch"
                all_match = False
                if oid1 != "-" and oid2 != "-":
                    diff_count += 1
            if oid1 == "-" or oid2 == "-":
                all_present = False
            table.append([f, oid1, oid2, match])

        # Verdict logic
        if all_match and all_present and files:
            verdict_text = [("Copy-Paste: Models are identical at the safetensors level!", "Copy-Paste")]
        elif all_present and diff_count > 0:
            verdict_text = [("Fine-Tuned: Same file structure, but weights differ.", "Fine-Tuned")]
        else:
            verdict_text = [("Different: File structure or weights are different.", "Different")]

        # Summary
        summary_lines = [
            f"Model 1: {m1} ({len(oids1)} .safetensors files)",
            f"Model 2: {m2} ({len(oids2)} .safetensors files)",
            f"Files compared: {len(files)}",
            f"Matching files: {sum(1 for row in table if row[3]=='Match')}",
            f"Unmatched files: {sum(1 for row in table if row[3]=='Unmatch')}",
        ]
        missing1 = [f for f in files if oids1.get(f) is None]
        missing2 = [f for f in files if oids2.get(f) is None]
        if missing1:
            summary_lines.append(f"Files missing in Model 1: {', '.join(missing1)}")
        if missing2:
            summary_lines.append(f"Files missing in Model 2: {', '.join(missing2)}")
        return verdict_text, table, "\n".join(summary_lines)

    compare_btn.click(
        fn=crosscheck_ui,
        inputs=[model1, model2],
        outputs=[verdict, details, summary]
    )

    with gr.Accordion("Advanced: Compare File Shards Bitwise", open=False):
        gr.Markdown("""
        ## Compare a specific file (shard) from both models, byte-by-byte
        - Enter the file name (e.g. `model-00001-of-00010.safetensors`).
        - The tool will download this file from both models and compare their contents.
        - Shows the percent of identical bytes (100% = exact copy).
        """)
        adv_model1 = gr.Textbox(label="Model ID 1", placeholder="e.g. deepseek-ai/DeepSeek-R1-0528")
        adv_model2 = gr.Textbox(label="Model ID 2", placeholder="e.g. Parveshiiii/DeepSeek-R1-0528-MathX")
        adv_filename = gr.Textbox(label="File Name", placeholder="e.g. model-00001-of-00010.safetensors")
        adv_btn = gr.Button("Download & Compare File")
        adv_result = gr.Textbox(label="Bitwise Comparison Result", lines=3, interactive=False)

        def adv_compare(m1, m2, fname):
            """Download *fname* from both models and report byte similarity.

            Returns a single status string for the ``adv_result`` textbox.
            """
            # Same normalisation as the main check: ignore stray whitespace
            # and reject values that are blank after stripping.
            m1 = (m1 or "").strip()
            m2 = (m2 or "").strip()
            fname = (fname or "").strip()
            if not m1 or not m2 or not fname:
                return "Please provide both model IDs and the file name."

            url1 = f"https://huggingface.co/{m1}/resolve/main/{fname}"
            url2 = f"https://huggingface.co/{m2}/resolve/main/{fname}"
            # Both downloads live in a temporary directory that is removed
            # as soon as the comparison finishes.
            with tempfile.TemporaryDirectory() as tmp:
                f1 = os.path.join(tmp, "f1.safetensors")
                f2 = os.path.join(tmp, "f2.safetensors")
                ok1, err1 = download_file(url1, f1)
                ok2, err2 = download_file(url2, f2)
                if not ok1 or not ok2:
                    return f"Download error: {err1 or ''} {err2 or ''}"
                percent, err = file_similarity(f1, f2)
                if err:
                    return f"Comparison error: {err}"
                return f"Similarity: {percent:.2f}% ({'identical' if percent==100 else 'different'})"

        adv_btn.click(
            fn=adv_compare,
            inputs=[adv_model1, adv_model2, adv_filename],
            outputs=[adv_result]
        )

demo.launch()
|