Abhaykoul committed on
Commit
5f23e61
·
verified ·
1 Parent(s): 652e4c7

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +323 -0
app.py ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import requests
3
+ from concurrent.futures import ThreadPoolExecutor, as_completed
4
+ import tempfile
5
+ import os
6
+ import hashlib
7
+
8
+ # Function to get OID from a raw Hugging Face LFS file URL
9
def get_lfs_oid(raw_url: str) -> str | None:
    """
    Retrieve the SHA256 OID recorded in the text served at a raw Hugging Face LFS file URL.

    The response body is scanned line by line; the first line that starts with
    ``oid sha256:`` supplies the digest. Returns ``None`` when no such line is
    present or when the HTTP request fails (the failure is printed).
    """
    marker = "oid sha256:"
    try:
        reply = requests.get(raw_url, timeout=10)
        # Turn 4xx/5xx statuses into an HTTPError, handled by the except below.
        reply.raise_for_status()
        oid_lines = (text for text in reply.text.splitlines() if text.startswith(marker))
        first_hit = next(oid_lines, None)
        if first_hit is None:
            return None
        return first_hit.split("sha256:")[1].strip()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching OID from {raw_url}: {e}")
        return None
24
+
25
+ # Function to get .safetensors file info (file list and OIDs) using only HTTP requests
26
def get_model_safetensors_info(model_id: str) -> tuple[dict, str]:
    """
    Collect the OIDs of every ``.safetensors`` file listed for a Hugging Face model.

    The file list is read from the Hub REST API; each file's OID is then
    obtained through ``get_lfs_oid`` on a small thread pool.

    Returns a ``({filename: oid}, error_message)`` pair. ``error_message`` is an
    empty string when nothing went wrong, otherwise one newline-terminated note
    per problem encountered.
    """
    oid_by_name = {}
    notes = []

    def resolve(name):
        # Raw URL of the file on the main branch, handed to get_lfs_oid.
        return name, get_lfs_oid(f"https://huggingface.co/{model_id}/raw/main/{name}")

    try:
        listing = requests.get(f"https://huggingface.co/api/models/{model_id}", timeout=10)
        if listing.status_code != 200:
            notes.append(f"Could not fetch file list for {model_id}: HTTP {listing.status_code}\n")
            return oid_by_name, "".join(notes)

        payload = listing.json()
        shard_names = [
            entry['rfilename']
            for entry in payload.get('siblings', [])
            if entry['rfilename'].endswith('.safetensors')
        ]
        if not shard_names:
            notes.append(f"No .safetensors files found for {model_id}.\n")
            return oid_by_name, "".join(notes)

        # Look the OIDs up concurrently; results are handled as they finish.
        with ThreadPoolExecutor(max_workers=min(8, len(shard_names))) as pool:
            pending = [pool.submit(resolve, name) for name in shard_names]
            for finished in as_completed(pending):
                name, oid = finished.result()
                if oid:
                    oid_by_name[name] = oid
                else:
                    notes.append(f"Could not get OID for {name} in {model_id}.\n")

    except Exception as e:
        notes.append(f"Error fetching info for {model_id}: {e}\n")

    return oid_by_name, "".join(notes)
66
+
67
+ # Main comparison function (no config, only file structure and OIDs)
68
def compare_hf_models(model_id1: str, model_id2: str) -> str:
    """
    Build a plain-text report comparing two models by their safetensors OIDs.

    Both models are looked up with ``get_model_safetensors_info``. The report
    lists any fetch errors, states whether the two sets of ``.safetensors``
    file names agree, names files whose OIDs differ, and ends with a one-line
    conclusion. If either ID is empty, a short prompt is returned instead.
    """
    if not (model_id1 and model_id2):
        return "Please provide both model IDs."

    report = []

    report.append(f"--- Fetching info for Model 1: {model_id1} ---")
    oids1, err1 = get_model_safetensors_info(model_id1)
    if err1:
        report.append(err1)
    report.append(f"Found {len(oids1)} .safetensors files for {model_id1}.")

    report.append(f"\n--- Fetching info for Model 2: {model_id2} ---")
    oids2, err2 = get_model_safetensors_info(model_id2)
    if err2:
        report.append(err2)
    report.append(f"Found {len(oids2)} .safetensors files for {model_id2}.")

    report.append("\n--- Safetensors Weight File Comparison (via OID) ---")

    # Only the "same files, same OIDs" path below flips this to True.
    weights_identical = False

    if not oids1 and not oids2:
        report.append("No .safetensors files found for either model. Cannot compare weights.")
    elif not oids1:
        report.append(f"No .safetensors files found for {model_id1}. Cannot compare weights.")
    elif not oids2:
        report.append(f"No .safetensors files found for {model_id2}. Cannot compare weights.")
    else:
        names1 = set(oids1.keys())
        names2 = set(oids2.keys())

        if names1 == names2:
            report.append("The models have the same set of .safetensors files.")
            mismatched = [name for name in names1 if oids1[name] != oids2[name]]
            if mismatched:
                report.append(f"Some .safetensors OIDs DIFFER. Differing files: {', '.join(mismatched)}")
                report.append(f"This indicates different weights. If file structure is identical, '{model_id2}' could be a 'fine-tuned' version of '{model_id1}' (or vice-versa, or both fine-tuned from a common base).")
            else:
                report.append("All corresponding .safetensors OIDs are IDENTICAL.")
                report.append(f"This strongly suggests '{model_id1}' and '{model_id2}' are 'copy-paste' models at the weight level.")
                weights_identical = True
        else:
            report.append("The set of .safetensors files differs between models.")
            report.append(f"Files in {model_id1} but not {model_id2}: {names1 - names2}")
            report.append(f"Files in {model_id2} but not {model_id1}: {names2 - names1}")

    report.append("\n--- Summary ---")
    if weights_identical:
        report.append(f"Conclusion: Models '{model_id1}' and '{model_id2}' are IDENTICAL (copy-paste).")
    else:
        report.append(f"Conclusion: Models '{model_id1}' and '{model_id2}' have different weights or file structures. They are distinct or fine-tuned models.")

    return "\n".join(report)
134
+
135
def multi_compare_hf_models(model_ids: list) -> tuple:
    """
    Compare the ``.safetensors`` OIDs of two or more models side by side.

    Returns ``(summary_text, table, safetensors_data)``:

    - ``summary_text``: per-model file counts and errors, followed by one line
      for every file that is absent from at least one model.
    - ``table``: a header row, then one row per file with each model's OID
      (``"-"`` when absent) and a final ``"Match"``/``"Unmatch"`` cell.
    - ``safetensors_data``: ``{model_id: {filename: oid}}``.

    When fewer than two IDs are supplied, returns a prompt message and two
    ``None`` values.
    """
    if not model_ids or len(model_ids) < 2:
        return "Please provide at least two model IDs.", None, None

    safetensors_data = {}
    errors = {}
    # Query every model concurrently and record results as they complete.
    with ThreadPoolExecutor(max_workers=min(8, len(model_ids))) as pool:
        owner_of = {pool.submit(get_model_safetensors_info, mid): mid for mid in model_ids}
        for finished in as_completed(owner_of):
            owner = owner_of[finished]
            safetensors_data[owner], errors[owner] = finished.result()

    every_file = sorted({name for oids in safetensors_data.values() for name in oids})

    table = [["File"] + model_ids + ["Match"]]
    for name in every_file:
        cells = [safetensors_data.get(mid, {}).get(name, "-") or "-" for mid in model_ids]
        # A file matches when at least two models have it and all present OIDs
        # agree; models lacking the file are ignored for this check.
        known = [cell for cell in cells if cell != "-"]
        agree = len(known) > 1 and all(cell == known[0] for cell in known)
        table.append([name, *cells, "Match" if agree else "Unmatch"])

    summary = []
    for mid in model_ids:
        oids = safetensors_data.get(mid, {})
        summary.append(f"{mid}: {len(oids)} .safetensors files.")
        if errors[mid]:
            summary.append(f"Errors for {mid}: {errors[mid]}")
    for name in every_file:
        holders = [mid for mid in model_ids if name in safetensors_data.get(mid, {})]
        if len(holders) != len(model_ids):
            summary.append(f"File '{name}' missing in: {set(model_ids) - set(holders)}")
    return "\n".join(summary), table, safetensors_data
183
+
184
def download_file(url, dest):
    """
    Stream the resource at ``url`` into the local file ``dest``.

    Returns ``(True, "")`` on success and ``(False, message)`` when the request
    or the write fails; no exception propagates to the caller.
    """
    try:
        # The with-block closes the streamed response even if the transfer is
        # interrupted part-way, so the underlying connection is always released.
        with requests.get(url, stream=True, timeout=30) as r:
            r.raise_for_status()
            with open(dest, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        return True, ""
    except Exception as e:
        return False, str(e)
194
+
195
def file_similarity(file1, file2, chunk_size=1024*1024):
    """
    Compare two files position by position and report the share of equal bytes.

    Args:
        file1: Path of the first file.
        file2: Path of the second file.
        chunk_size: Number of bytes read from each file per iteration.

    Returns:
        ``(percent, error)``. ``percent`` is the percentage (0.0-100.0) of byte
        positions holding the same value in both files, with ``error`` set to
        ``None``. When the file sizes differ, no content is read and the result
        is ``(0.0, message)``. Two empty files are identical and score 100.0.
    """
    import operator  # local import so the function does not rely on file-level imports

    size1 = os.path.getsize(file1)
    size2 = os.path.getsize(file2)
    if size1 != size2:
        return 0.0, f"File sizes differ: {size1} vs {size2} bytes."
    total = size1
    if total == 0:
        # Nothing to compare byte-wise, but two empty files are exact copies.
        return 100.0, None
    same = 0
    with open(file1, 'rb') as f1, open(file2, 'rb') as f2:
        while b1 := f1.read(chunk_size):
            b2 = f2.read(chunk_size)
            if b1 == b2:
                # Fast path: whole-chunk equality avoids a per-byte Python loop.
                same += len(b1)
            else:
                # Count equal positions; map() stops at the shorter chunk.
                same += sum(map(operator.eq, b1, b2))
    return (same / total) * 100, None
216
+
217
+ # Gradio Interface
218
# Gradio UI: a two-model OID comparison plus an optional byte-level file comparison.
with gr.Blocks(theme="soft") as demo:
    gr.Markdown(
        """
        # 🤖 Hugging Face Model Cross-Checker
        Easily check if two Hugging Face models are **identical (copy-paste)**, **fine-tuned**, or **completely different**—without downloading any weights!

        - Enter two model IDs below (e.g. `deepseek-ai/DeepSeek-R1-0528` and `Parveshiiii/DeepSeek-R1-0528-MathX`).
        - Click **Compare** to see a clear verdict and detailed breakdown.
        """
    )
    with gr.Row():
        model1 = gr.Textbox(label="Model ID 1", placeholder="e.g. deepseek-ai/DeepSeek-R1-0528")
        model2 = gr.Textbox(label="Model ID 2", placeholder="e.g. Parveshiiii/DeepSeek-R1-0528-MathX")
    compare_btn = gr.Button("Compare")
    verdict = gr.HighlightedText(label="Result Verdict", color_map={"Copy-Paste":"green","Fine-Tuned":"orange","Different":"red","Error":"gray"})
    details = gr.Dataframe(headers=["File","Model 1 OID","Model 2 OID","Match"], label="File-by-File Comparison", interactive=False)
    summary = gr.Textbox(label="Summary Details", lines=8, interactive=False)

    def crosscheck_ui(m1, m2):
        """
        Compare two models by safetensors OID for the main "Compare" button.

        Returns ``(verdict, table, summary)`` for the ``verdict``, ``details``
        and ``summary`` components: a one-item highlighted-text list, rows of
        ``[file, oid1, oid2, "Match"/"Unmatch"]``, and a multi-line summary.
        On missing input or any fetch error, an "Error" verdict is returned
        with an empty table and summary.
        """
        # Pasted IDs often carry stray spaces or newlines; they would otherwise
        # be embedded in the request URLs and make the lookup fail.
        m1 = (m1 or "").strip()
        m2 = (m2 or "").strip()
        if not m1 or not m2:
            return [("Error: Please provide both model IDs.", "Error")], [], ""
        oids1, err1 = get_model_safetensors_info(m1)
        oids2, err2 = get_model_safetensors_info(m2)
        if err1 or err2:
            return [(f"Error: {err1 or ''} {err2 or ''}", "Error")], [], ""
        files = sorted(set(oids1.keys()) | set(oids2.keys()))
        table = []
        all_match = True
        all_present = True
        diff_count = 0  # files present in both models whose OIDs differ
        for f in files:
            oid1 = oids1.get(f, "-")
            oid2 = oids2.get(f, "-")
            if oid1 == oid2 and oid1 != "-":
                match = "Match"
            else:
                match = "Unmatch"
                all_match = False
                if oid1 != "-" and oid2 != "-":
                    diff_count += 1
            if oid1 == "-" or oid2 == "-":
                all_present = False
            table.append([f, oid1, oid2, match])
        # Verdict logic
        if all_match and all_present and files:
            verdict_text = [("Copy-Paste: Models are identical at the safetensors level!", "Copy-Paste")]
        elif all_present and diff_count > 0:
            verdict_text = [("Fine-Tuned: Same file structure, but weights differ.", "Fine-Tuned")]
        else:
            verdict_text = [("Different: File structure or weights are different.", "Different")]
        # Summary
        summary_lines = [
            f"Model 1: {m1} ({len(oids1)} .safetensors files)",
            f"Model 2: {m2} ({len(oids2)} .safetensors files)",
            f"Files compared: {len(files)}",
            f"Matching files: {sum(1 for row in table if row[3]=='Match')}",
            f"Unmatched files: {sum(1 for row in table if row[3]=='Unmatch')}",
        ]
        missing1 = [f for f in files if oids1.get(f) is None]
        missing2 = [f for f in files if oids2.get(f) is None]
        if missing1:
            summary_lines.append(f"Files missing in Model 1: {', '.join(missing1)}")
        if missing2:
            summary_lines.append(f"Files missing in Model 2: {', '.join(missing2)}")
        return verdict_text, table, "\n".join(summary_lines)

    compare_btn.click(
        fn=crosscheck_ui,
        inputs=[model1, model2],
        outputs=[verdict, details, summary]
    )
    with gr.Accordion("Advanced: Compare File Shards Bitwise", open=False):
        gr.Markdown("""
        ## Compare a specific file (shard) from both models, byte-by-byte
        - Enter the file name (e.g. `model-00001-of-00010.safetensors`).
        - The tool will download this file from both models and compare their contents.
        - Shows the percent of identical bytes (100% = exact copy).
        """)
        adv_model1 = gr.Textbox(label="Model ID 1", placeholder="e.g. deepseek-ai/DeepSeek-R1-0528")
        adv_model2 = gr.Textbox(label="Model ID 2", placeholder="e.g. Parveshiiii/DeepSeek-R1-0528-MathX")
        adv_filename = gr.Textbox(label="File Name", placeholder="e.g. model-00001-of-00010.safetensors")
        adv_btn = gr.Button("Download & Compare File")
        adv_result = gr.Textbox(label="Bitwise Comparison Result", lines=3, interactive=False)

        def adv_compare(m1, m2, fname):
            """
            Download one named file from both models and compare the bytes.

            Returns a single status string: a prompt when input is missing, a
            download or comparison error, or the similarity percentage.
            """
            # Normalize user input before it is placed into the download URLs.
            m1 = (m1 or "").strip()
            m2 = (m2 or "").strip()
            fname = (fname or "").strip()
            if not m1 or not m2 or not fname:
                return "Please provide both model IDs and the file name."
            url1 = f"https://huggingface.co/{m1}/resolve/main/{fname}"
            url2 = f"https://huggingface.co/{m2}/resolve/main/{fname}"
            # Both downloads live in a temporary directory that is removed on exit.
            with tempfile.TemporaryDirectory() as tmp:
                f1 = os.path.join(tmp, "f1.safetensors")
                f2 = os.path.join(tmp, "f2.safetensors")
                ok1, err1 = download_file(url1, f1)
                ok2, err2 = download_file(url2, f2)
                if not ok1 or not ok2:
                    return f"Download error: {err1 or ''} {err2 or ''}"
                percent, err = file_similarity(f1, f2)
                if err:
                    return f"Comparison error: {err}"
                return f"Similarity: {percent:.2f}% ({'identical' if percent==100 else 'different'})"

        adv_btn.click(
            fn=adv_compare,
            inputs=[adv_model1, adv_model2, adv_filename],
            outputs=[adv_result]
        )
demo.launch()