Oleg Shulyakov committed on
Commit ad94fc8 · 1 Parent(s): c0d1d96
Files changed (1)
  1. app.py +734 -697
app.py CHANGED
@@ -1,753 +1,790 @@
1
  import os
2
  import subprocess
3
  import signal
4
- os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
5
- import gradio as gr
6
  import tempfile
7
 
 
8
  from huggingface_hub import HfApi, ModelCard, whoami
9
  from gradio_huggingfacehub_search import HuggingfaceHubSearch
10
- from pathlib import Path
11
- from textwrap import dedent
12
  from apscheduler.schedulers.background import BackgroundScheduler
13
 
14
- # Space parameters
15
- SPACE_ID = os.environ.get("SPACE_ID") if os.environ.get("SPACE_ID") else ""
16
- SPACE_URL = "https://" + SPACE_ID.replace("/", "-") + ".hf.space/" if SPACE_ID else "http://localhost:7860/"
17
- HF_TOKEN = os.environ.get("HF_TOKEN")
18
-
19
- RUN_LOCALLY = os.environ.get("RUN_LOCALLY")
20
-
21
- # Folder
22
- DOWNLOAD_FOLDER = "./downloads"
23
- OUTPUT_FOLDER = "./outputs"
24
-
25
- ERROR_LOGIN = "You must be logged in to use GGUF-my-repo."
26
-
27
- def create_folder(folder_name: str):
28
- if not os.path.exists(folder_name):
29
- print(f"Creating folder: {folder_name}")
30
- os.makedirs(folder_name)
31
-
32
- return folder_name
33
-
34
- def validate_token(oauth_token):
35
- if oauth_token is None or oauth_token.token is None:
36
- raise gr.Error(ERROR_LOGIN)
37
-
38
- try:
39
- whoami(oauth_token.token)
40
- except Exception as e:
41
- raise gr.Error(ERROR_LOGIN)
42
-
43
- # escape HTML for logging
44
- def escape(s: str) -> str:
45
- s = s.replace("&", "&") # Must be done first!
46
- s = s.replace("<", "&lt;")
47
- s = s.replace(">", "&gt;")
48
- s = s.replace('"', "&quot;")
49
- s = s.replace("\n", "<br/>")
50
- return s
51
-
52
- def get_model_creator(model_id: str):
53
- return model_id.split('/')[0]
54
-
55
- def get_model_name(model_id: str):
56
- return model_id.split('/')[-1]
57
-
58
- def upload_file(token, path_or_fileobj, path_in_repo, repo_id):
59
- if RUN_LOCALLY == "1":
60
- print("Skipping upload...")
61
- return
62
-
63
- api = HfApi(token=token)
64
- api.upload_file(
65
- path_or_fileobj=path_or_fileobj,
66
- path_in_repo=path_in_repo,
67
- repo_id=repo_id,
68
- )
69
-
70
- def generate_importance_matrix(model_path: str, train_data_path: str, output_path: str):
71
- if not os.path.isfile(model_path):
72
- raise Exception(f"Model file not found: {model_path}")
73
-
74
- print("Running imatrix command...")
75
- imatrix_command = [
76
- "llama-imatrix",
77
- "-m", model_path,
78
- "-f", train_data_path,
79
- "-ngl", "99",
80
- "--output-frequency", "10",
81
- "--output-format", "dat",
82
- "-o", output_path,
83
- ]
84
- process = subprocess.Popen(imatrix_command, shell=False)
85
-
86
- try:
87
- process.wait(timeout=60) # added wait
88
- except subprocess.TimeoutExpired:
89
- print("Imatrix computation timed out. Sending SIGINT to allow graceful termination...")
90
- process.send_signal(signal.SIGINT)
91
  try:
92
- process.wait(timeout=5) # grace period
93
  except subprocess.TimeoutExpired:
94
- print("Imatrix proc still didn't term. Forecfully terming process...")
95
- process.kill()
96
-
97
- print(f"Importance matrix generation completed: {os.path.abspath(output_path)}")
98
-
99
- def split_upload_model(model_path: str, outdir: str, repo_id: str, token: str, split_max_tensors=256, split_max_size=None):
100
- print(f"Model path: {model_path}")
101
- print(f"Output dir: {outdir}")
102
-
103
- split_cmd = [
104
- "llama-gguf-split",
105
- "--split",
106
- ]
107
- if split_max_size:
108
- split_cmd.append("--split-max-size")
109
- split_cmd.append(split_max_size)
110
- else:
111
- split_cmd.append("--split-max-tensors")
112
- split_cmd.append(str(split_max_tensors))
113
-
114
- # args for output
115
- model_path_prefix = '.'.join(model_path.split('.')[:-1]) # remove the file extension
116
- split_cmd.append(model_path)
117
- split_cmd.append(model_path_prefix)
118
-
119
- print(f"Split command: {split_cmd}")
120
-
121
- result = subprocess.run(split_cmd, shell=False, capture_output=True, text=True)
122
- print(f"Split command stdout: {result.stdout}")
123
- print(f"Split command stderr: {result.stderr}")
124
-
125
- if result.returncode != 0:
126
- stderr_str = result.stderr.decode("utf-8")
127
- raise Exception(f"Error splitting the model: {stderr_str}")
128
- print("Model split successfully!")
129
-
130
- # remove the original model file if needed
131
- if os.path.exists(model_path):
132
- os.remove(model_path)
133
-
134
- model_file_prefix = model_path_prefix.split('/')[-1]
135
- print(f"Model file name prefix: {model_file_prefix}")
136
- sharded_model_files = [f for f in os.listdir(outdir) if f.startswith(model_file_prefix) and f.endswith(".gguf")]
137
- if sharded_model_files:
138
  print(f"Sharded model files: {sharded_model_files}")
139
  for file in sharded_model_files:
140
  file_path = os.path.join(outdir, file)
141
  try:
142
  print(f"Uploading file: {file_path}")
143
- upload_file(
144
- token=token,
145
- path_or_fileobj=file_path,
146
- path_in_repo=file,
147
- repo_id=repo_id,
148
- )
149
  except Exception as e:
150
- raise Exception(f"Error uploading file {file_path}: {e}")
151
- else:
152
- raise Exception("No sharded files found.")
153
 
154
- print("Sharded model has been uploaded successfully!")
155
 
156
- def download_base_model(token: str, model_id: str, outdir: str):
157
- model_name = get_model_name(model_id)
158
- print(f"Downloading model {model_name}")
 
 
159
 
160
- fp16_model = f"{outdir}/{model_name}-fp16.gguf"
161
- if os.path.exists(fp16_model):
162
- print("Skipping fp16 convertion...")
163
- print(f"Converted model path: {os.path.abspath(fp16_model)}")
164
 
165
- with tempfile.TemporaryDirectory(dir=DOWNLOAD_FOLDER) as tmpdir:
166
- # Keep the model name as the dirname so the model name metadata is populated correctly
167
- local_dir = f"{Path(tmpdir)}/{model_name}"
168
- print(f"Local directory: {os.path.abspath(local_dir)}")
169
 
170
- # Download model
171
- api = HfApi(token=token)
172
- pattern = (
173
- "*.safetensors"
174
- if any(
175
- file.path.endswith(".safetensors")
176
- for file in api.list_repo_tree(
177
- repo_id=model_id,
178
- recursive=True,
 
179
  )
 
180
  )
181
- else "*.bin"
182
- )
183
 
184
- dl_pattern = ["*.md", "*.json", "*.model"]
185
- dl_pattern += [pattern]
186
-
187
- api.snapshot_download(repo_id=model_id, local_dir=local_dir, allow_patterns=dl_pattern)
188
- print("Model downloaded successfully!")
189
-
190
- print(f"Model directory contents: {os.listdir(local_dir)}")
191
- config_dir = local_dir/"config.json"
192
- adapter_config_dir = local_dir/"adapter_config.json"
193
- if os.path.exists(adapter_config_dir) and not os.path.exists(config_dir):
194
- raise Exception('adapter_config.json is present.<br/><br/>If you are converting a LoRA adapter to GGUF, please use <a href="https://huggingface.co/spaces/ggml-org/gguf-my-lora" target="_blank" style="text-decoration:underline">GGUF-my-lora</a>.')
195
-
196
- # Convert HF to GGUF
197
- print(f"Converting to GGUF FP16: {os.path.abspath(fp16_model)}")
198
- result = subprocess.run(
199
- [
200
- "python3", "/app/convert_hf_to_gguf.py", local_dir, "--outtype", "f16", "--outfile", fp16_model
201
- ],
202
- shell=False,
203
- capture_output=True
204
- )
205
- print(f"Model directory contents: {result}")
206
  if result.returncode != 0:
207
  stderr_str = result.stderr.decode("utf-8")
208
- raise Exception(f"Error converting to fp16: {stderr_str}")
209
-
210
- print("Model converted to fp16 successfully!")
211
- print(f"Converted model path: {os.path.abspath(fp16_model)}")
212
-
213
- return fp16_model
214
-
215
- def quantize_model(
216
- outdir: str,
217
- gguf_name: str,
218
- fp16: str,
219
- q_method: str,
220
- use_imatrix: bool,
221
- imatrix_q_method: str,
222
- imatrix_file: str,
223
- quant_embedding: bool,
224
- embedding_tensor_method: str,
225
- leave_output: bool,
226
- quant_output: bool,
227
- output_tensor_method: str,
228
- ):
229
- # Quantize the model
230
- quantize_cmd = ["llama-quantize"]
231
-
232
- if quant_embedding:
233
- quantize_cmd.append("--token-embedding-type")
234
- quantize_cmd.append(embedding_tensor_method)
235
- if leave_output:
236
- quantize_cmd.append("--leave-output-tensor")
237
- else:
238
- if quant_output:
239
- quantize_cmd.append("--output-tensor-type")
240
- quantize_cmd.append(output_tensor_method)
241
-
242
- if use_imatrix:
243
- train_data_path = "calibration_data_v5_rc.txt" #fallback calibration dataset
244
- # if train_data_file:
245
- # train_data_path = train_data_file.name
246
-
247
- print(f"Training data file path: {train_data_path}")
248
-
249
- if not os.path.isfile(train_data_path):
250
- raise Exception(f"Training data file not found: {train_data_path}")
251
-
252
- generate_importance_matrix(fp16, train_data_path, imatrix_file)
253
-
254
- quantize_cmd.append("--imatrix")
255
- quantize_cmd.append(imatrix_file)
256
- else:
257
- print("Not using imatrix quantization.")
258
-
259
- quantized_gguf = f"{outdir}/{gguf_name}"
260
- quantize_cmd.append(fp16)
261
- quantize_cmd.append(quantized_gguf)
262
-
263
- if use_imatrix:
264
- quantize_cmd.append(imatrix_q_method)
265
- else:
266
- quantize_cmd.append(q_method)
267
-
268
- print(f"Quantizing model with {quantize_cmd}")
269
- result = subprocess.run(quantize_cmd, shell=False, capture_output=True)
270
- if result.returncode != 0:
271
- stderr_str = result.stderr.decode("utf-8")
272
- raise Exception(f"Error quantizing: {stderr_str}")
273
-
274
- print(f"Quantized successfully with {imatrix_q_method if use_imatrix else q_method} option!")
275
- print(f"Quantized model path: {os.path.abspath(quantized_gguf)}")
276
- return quantized_gguf
277
-
278
- def generate_readme(outdir: str, token: str, model_id: str, new_repo_id: str, gguf_name: str):
279
- creator = get_model_creator(model_id)
280
- model_name = get_model_name(model_id)
281
- username = whoami(token)["name"]
282
-
283
- try:
284
- card = ModelCard.load(model_id, token=token)
285
- except:
286
- card = ModelCard("")
287
-
288
- if card.data.tags is None:
289
- card.data.tags = []
290
-
291
- card.data.tags.append("llama-cpp")
292
- card.data.tags.append("gguf-my-repo")
293
- card.data.base_model = model_id
294
- card.text = dedent(
295
- f"""
296
- # {model_name}
297
298
  **Model creator:** [{creator}](https://huggingface.co/{creator})<br/>
299
  **Original model**: [{model_id}](https://huggingface.co/{model_id})<br/>
300
  **GGUF quantization:** provided by [{username}](https://huggingface.co/{username}) using `llama.cpp`<br/>
301
-
302
  ## Special thanks
303
-
304
  🙏 Special thanks to [Georgi Gerganov](https://github.com/ggerganov) and the whole team working on [llama.cpp](https://github.com/ggerganov/llama.cpp/) for making all of this possible.
305
-
306
  ## Use with Ollama
307
-
308
  ```bash
309
  ollama run "hf.co/{new_repo_id}:<quantization>"
310
  ```
311
-
312
  ## Use with LM Studio
313
-
314
  ```bash
315
  lms load "{new_repo_id}"
316
  ```
317
-
318
  ## Use with llama.cpp CLI
319
-
320
  ```bash
321
  llama-cli --hf-repo "{new_repo_id}" --hf-file "{gguf_name}" -p "The meaning to life and the universe is"
322
  ```
323
-
324
  ## Use with llama.cpp Server:
325
-
326
  ```bash
327
  llama-server --hf-repo "{new_repo_id}" --hf-file "{gguf_name}" -c 4096
328
  ```
329
- """
330
- )
331
- readme_path = f"{outdir}/README.md"
332
- card.save(readme_path)
333
- return readme_path
334
-
335
- def process_model(
336
- model_id: str,
337
- q_method: str,
338
- use_imatrix: bool,
339
- imatrix_q_method: str,
340
- private_repo: bool,
341
- train_data_file,
342
- repo_name: str,
343
- gguf_name: str,
344
- quant_embedding: bool,
345
- embedding_tensor_method: str,
346
- leave_output: bool,
347
- quant_output: bool,
348
- output_tensor_method: str,
349
- split_model: bool,
350
- split_max_tensors,
351
- split_max_size: str | None,
352
- oauth_token: gr.OAuthToken | None,
353
- ):
354
- validate_token(oauth_token)
355
- token = oauth_token.token
356
-
357
- print(f"Current working directory: {os.path.abspath(os.getcwd())}")
358
- create_folder(DOWNLOAD_FOLDER)
359
- create_folder(OUTPUT_FOLDER)
360
-
361
- model_name = get_model_name(model_id)
362
-
363
- try:
364
- with tempfile.TemporaryDirectory(dir=OUTPUT_FOLDER) as outDirObj:
365
- outdir = create_folder(os.path.join(OUTPUT_FOLDER, model_name)) if RUN_LOCALLY == "1" else Path(outDirObj)
366
- fp16 = download_base_model(token, model_id, outdir)
367
- imatrix_file = f"{outdir}/{model_name}-imatrix.dat"
368
- quantized_gguf = quantize_model(outdir, gguf_name, fp16, q_method, use_imatrix, imatrix_q_method, imatrix_file, quant_embedding, embedding_tensor_method, leave_output, quant_output, output_tensor_method)
369
-
370
- # Create empty repo
371
- api = HfApi(token=token)
372
- new_repo_url = api.create_repo(repo_id=repo_name, exist_ok=True, private=private_repo)
373
- new_repo_id = new_repo_url.repo_id
374
- print("Repo created successfully!", new_repo_url)
375
-
376
- # Upload model
377
- if split_model:
378
- print(f"Splitting quantized model: {os.path.abspath(quantized_gguf)}")
379
- split_upload_model(quantized_gguf, outdir, new_repo_id, token, split_max_tensors, split_max_size)
380
- else:
381
- try:
382
- print(f"Uploading quantized model: {os.path.abspath(quantized_gguf)}")
383
- upload_file(
384
- token=token,
385
- path_or_fileobj=quantized_gguf,
386
- path_in_repo=gguf_name,
387
- repo_id=new_repo_id,
388
- )
389
- except Exception as e:
390
- raise Exception(f"Error uploading quantized model: {e}")
391
-
392
- if os.path.isfile(imatrix_file):
393
- try:
394
- print(f"Uploading imatrix.dat: {os.path.abspath(output_path)}")
395
- upload_file(
396
- token=token,
397
- path_or_fileobj=imatrix_file,
398
- path_in_repo="imatrix.dat",
399
- repo_id=new_repo_id,
400
- )
401
- except Exception as e:
402
- raise Exception(f"Error uploading imatrix.dat: {e}")
403
 
404
- # Upload README.md
405
- readme_path = generate_readme(outdir, token, model_id, new_repo_id, gguf_name)
406
 
407
- upload_file(
408
- token=token,
409
- path_or_fileobj=readme_path,
410
- path_in_repo="README.md",
411
- repo_id=new_repo_id,
412
  )
413
- print(f"Uploaded successfully with {imatrix_q_method if use_imatrix else q_method} option!")
414
 
415
- # end of the TemporaryDirectory(dir="outputs") block; temporary outputs are deleted here
416
 
417
- return (
418
- f'<h1>✅ DONE</h1><br/>Find your repo here: <a href="{new_repo_url}" target="_blank" style="text-decoration:underline">{new_repo_id}</a>',
419
- "llama.png",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
420
  )
421
- except Exception as e:
422
- print((f"Error processing model: {e}"))
423
- return (f'<h1>❌ ERROR</h1><br/><pre style="white-space:pre-wrap;">{escape(str(e))}</pre>', "error.png")
424
-
425
-
426
- css="""/* Custom CSS to allow scrolling */
427
- .gradio-container {overflow-y: auto;}
428
- """
429
- #####
430
- # Base model section
431
- #####
432
- model_id = HuggingfaceHubSearch(
433
- label="Hub Model ID",
434
- placeholder="Search for model id on Huggingface",
435
- search_type="model",
436
- )
437
-
438
- #####
439
- # Quantization section
440
- #####
441
- use_imatrix = gr.Checkbox(
442
- value=False,
443
- label="Use Imatrix Quantization",
444
- info="Use importance matrix for quantization."
445
- )
446
-
447
- q_method = gr.Dropdown(
448
- choices=["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M", "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0", "F16", "BF16"],
449
- label="Quantization Method",
450
- info="GGML quantization type",
451
- value="Q4_K_M",
452
- filterable=False,
453
- visible=True
454
- )
455
-
456
- imatrix_q_method = gr.Dropdown(
457
- choices=["IQ3_M", "IQ3_XXS", "Q4_K_M", "Q4_K_S", "IQ4_NL", "IQ4_XS", "Q5_K_M", "Q5_K_S"],
458
- label="Imatrix Quantization Method",
459
- info="GGML imatrix quants type",
460
- value="IQ4_NL",
461
- filterable=False,
462
- visible=False
463
- )
464
-
465
- train_data_file = gr.File(
466
- label="Training Data File",
467
- file_types=[".txt"],
468
- visible=False
469
- )
470
-
471
- #####
472
- # Advanced Options section
473
- #####
474
- split_model = gr.Checkbox(
475
- value=False,
476
- label="Split Model",
477
- info="Shard the model using gguf-split."
478
- )
479
-
480
- split_max_tensors = gr.Number(
481
- value=256,
482
- label="Max Tensors per File",
483
- info="Maximum number of tensors per file when splitting model.",
484
- visible=False
485
- )
486
-
487
- split_max_size = gr.Textbox(
488
- label="Max File Size",
489
- info="Maximum file size when splitting model (--split-max-size). May leave empty to use the default. Accepted suffixes: M, G. Example: 256M, 5G",
490
- visible=False
491
- )
492
-
493
- leave_output = gr.Checkbox(
494
- value=False,
495
- label="Leave output tensor",
496
- info="Leaves output.weight un(re)quantized"
497
- )
498
-
499
- quant_embedding = gr.Checkbox(
500
- value=False,
501
- label="Quant embeddings tensor",
502
- info="Quantize embeddings tensor separately"
503
- )
504
- embedding_tensor_method = gr.Dropdown(
505
- choices=["Q2_K", "Q3_K", "Q4_K", "Q5_K", "Q6_K", "Q8_0"],
506
- label="Output Quantization Method",
507
- info="use a specific quant type for the token embeddings tensor",
508
- value="Q8_0",
509
- filterable=False,
510
- visible=False
511
- )
512
-
513
- quant_output = gr.Checkbox(
514
- value=False,
515
- label="Quant output tensor",
516
- info="Quantize output tensor separately"
517
- )
518
- output_tensor_method = gr.Dropdown(
519
- choices=["Q2_K", "Q3_K", "Q4_K", "Q5_K", "Q6_K", "Q8_0"],
520
- label="Output Quantization Method",
521
- info="use a specific quant type for the output.weight tensor",
522
- value="Q8_0",
523
- filterable=False,
524
- visible=False
525
- )
526
-
527
- #####
528
- # Output Settings section
529
- #####
530
- private_repo = gr.Checkbox(
531
- value=False,
532
- label="Private Repo",
533
- info="Create a private repo under your username."
534
- )
535
-
536
- repo_name = gr.Textbox(
537
- label="Output Repository Name",
538
- info="Set your repository name",
539
- max_lines=1
540
- )
541
-
542
- gguf_name = gr.Textbox(
543
- label="Output File Name",
544
- info="Set output file name",
545
- max_lines=1
546
- )
547
-
548
- def update_output_repo(model_id, oauth_token: gr.OAuthToken | None):
549
- if oauth_token is None or not oauth_token.token:
550
- return ""
551
-
552
- if not model_id:
553
- return ""
554
-
555
- username = whoami(oauth_token.token)["name"]
556
- model_name = get_model_name(model_id)
557
- return f"{username}/{model_name}-GGUF"
558
-
559
- def update_output_filename(model_id, use_imatrix, q_method, imatrix_q_method):
560
- if not model_id:
561
- return ""
562
-
563
- model_name = get_model_name(model_id)
564
-
565
- if use_imatrix:
566
- return f"{model_name}-{imatrix_q_method.upper()}-imat.gguf"
567
-
568
- return f"{model_name}-{q_method.upper()}.gguf"
569
-
570
- #####
571
- # Buttons section
572
- #####
573
- clear_btn = gr.ClearButton(
574
- value="Clear",
575
- variant="secondary",
576
- components=[
577
- model_id,
578
- q_method,
579
- use_imatrix,
580
- imatrix_q_method,
581
- private_repo,
582
- train_data_file,
583
- leave_output,
584
- quant_embedding,
585
- embedding_tensor_method,
586
- quant_output,
587
- output_tensor_method,
588
- split_model,
589
- split_max_tensors,
590
- split_max_size,
591
- repo_name,
592
- gguf_name,
593
- ]
594
- )
595
- submit_btn = gr.Button(
596
- value="Submit",
597
- variant="primary"
598
- )
599
-
600
- #####
601
- # Outputs section
602
- #####
603
- output_label = gr.Markdown(label="output")
604
-
605
- output_image = gr.Image(
606
- show_label=False,
607
- show_download_button=False,
608
- interactive=False
609
- )
610
-
611
- # Create Gradio interface
612
- with gr.Blocks(css=css) as demo:
613
- #####
614
- # Layout
615
- #####
616
- gr.Markdown(ERROR_LOGIN)
617
- gr.LoginButton(min_width=250)
618
-
619
- gr.HTML("<h1 style=\"text-aling:center;\">Create your own GGUF Quants!</h1>")
620
- gr.Markdown(f"The space takes an HF repo as an input, quantizes it and creates a Public repo containing the selected quant under your HF user namespace.<br/>Use via {SPACE_URL}")
621
-
622
- with gr.Row():
623
- with gr.Column() as inputs:
624
- gr.Markdown("### Model Configuration")
625
- model_id.render()
626
-
627
- with gr.Column():
628
- use_imatrix.render()
629
- q_method.render()
630
- imatrix_q_method.render()
631
- train_data_file.render()
632
-
633
- gr.Markdown("### Advanced Options")
634
-
635
- quant_embedding.render()
636
- embedding_tensor_method.render()
637
- leave_output.render()
638
- quant_output.render()
639
- output_tensor_method.render()
640
-
641
- split_model.render()
642
- with gr.Row() as split_options: # Group split options
643
- split_max_tensors.render()
644
- split_max_size.render()
645
-
646
- gr.Markdown("### Output Settings")
647
- gr.Markdown("You can customize settings for your GGUF repo.")
648
- private_repo.render()
649
  with gr.Row():
650
- repo_name.render()
651
- gguf_name.render()
652
-
653
- # Buttons
654
- with gr.Row() as buttons:
655
- clear_btn.render()
656
- submit_btn.render()
657
-
658
- with gr.Column() as outputs:
659
- output_image.render()
660
- output_label.render()
661
-
662
- #####
663
- # Button Click handlers
664
- #####
665
- submit_btn.click(
666
- fn=process_model,
667
- inputs=[
668
- model_id,
669
- q_method,
670
- use_imatrix,
671
- imatrix_q_method,
672
- private_repo,
673
- train_data_file,
674
- repo_name,
675
- gguf_name,
676
- quant_embedding,
677
- embedding_tensor_method,
678
- leave_output,
679
- quant_output,
680
- output_tensor_method,
681
- split_model,
682
- split_max_tensors,
683
- split_max_size
684
- ],
685
- outputs=[
686
- output_label,
687
- output_image,
688
- ],
689
- )
690
-
691
- #####
692
- # OnChange handlers
693
- #####
694
- use_imatrix.change(
695
- fn=lambda use_imatrix: [gr.update(visible=not use_imatrix), gr.update(visible=use_imatrix), gr.update(visible=use_imatrix)],
696
- inputs=use_imatrix,
697
- outputs=[q_method, imatrix_q_method, train_data_file]
698
- )
699
-
700
- split_model.change(
701
- fn=lambda split_model: [gr.update(visible=split_model), gr.update(visible=split_model)],
702
- inputs=split_model,
703
- outputs=[split_max_tensors, split_max_size]
704
- )
705
-
706
- quant_embedding.change(
707
- fn=lambda quant_embedding: gr.update(visible=quant_embedding),
708
- inputs=quant_embedding,
709
- outputs=[embedding_tensor_method]
710
- )
711
-
712
- quant_output.change(
713
- fn=lambda quant_output: [gr.update(visible=quant_output), gr.update(visible=not quant_output)],
714
- inputs=quant_output,
715
- outputs=[output_tensor_method, leave_output]
716
- )
717
-
718
- model_id.change(
719
- fn=update_output_repo,
720
- inputs=model_id,
721
- outputs=[repo_name]
722
- )
723
-
724
- model_id.change(
725
- fn=update_output_filename,
726
- inputs=[model_id, use_imatrix, q_method, imatrix_q_method],
727
- outputs=[gguf_name]
728
- )
729
- use_imatrix.change(
730
- fn=update_output_filename,
731
- inputs=[model_id, use_imatrix, q_method, imatrix_q_method],
732
- outputs=[gguf_name]
733
- )
734
- q_method.change(
735
- fn=update_output_filename,
736
- inputs=[model_id, use_imatrix, q_method, imatrix_q_method],
737
- outputs=[gguf_name]
738
- )
739
- imatrix_q_method.change(
740
- fn=update_output_filename,
741
- inputs=[model_id, use_imatrix, q_method, imatrix_q_method],
742
- outputs=[gguf_name]
743
- )
744
-
745
- def restart_space():
746
- HfApi().restart_space(repo_id=SPACE_ID, token=HF_TOKEN, factory_reboot=True)
747
-
748
- scheduler = BackgroundScheduler()
749
- scheduler.add_job(restart_space, "interval", seconds=21600)
750
- scheduler.start()
751
-
752
- # Launch the interface
753
- demo.queue(default_concurrency_limit=1, max_size=5).launch(debug=True, show_api=False)
1
  import os
2
  import subprocess
3
  import signal
 
 
4
  import tempfile
5
+ from pathlib import Path
6
+ from textwrap import dedent
7
+ from typing import Optional, Tuple, List, Union
8
+ from dataclasses import dataclass
9
+
10
+ os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
11
 
12
+ import gradio as gr
13
  from huggingface_hub import HfApi, ModelCard, whoami
14
  from gradio_huggingfacehub_search import HuggingfaceHubSearch
 
 
15
  from apscheduler.schedulers.background import BackgroundScheduler
16
 
17
+
18
+ @dataclass
19
+ class QuantizationConfig:
20
+ """Configuration for model quantization."""
21
+ method: str
22
+ use_imatrix: bool = False
23
+ imatrix_method: str = "IQ4_NL"
24
+ quant_embedding: bool = False
25
+ embedding_tensor_method: str = "Q8_0"
26
+ leave_output: bool = False
27
+ quant_output: bool = False
28
+ output_tensor_method: str = "Q8_0"
29
+
30
+
31
+ @dataclass
32
+ class SplitConfig:
33
+ """Configuration for model splitting."""
34
+ enabled: bool = False
35
+ max_tensors: int = 256
36
+ max_size: Optional[str] = None
37
+
38
+
39
+ @dataclass
40
+ class OutputConfig:
41
+ """Configuration for output settings."""
42
+ private_repo: bool = False
43
+ repo_name: str = ""
44
+ filename: str = ""
45
+
46
+
47
+ class GGUFConverterError(Exception):
48
+ """Custom exception for GGUF conversion errors."""
49
+ pass
50
+
51
+
52
+ class HuggingFaceModelProcessor:
53
+ """Handles the processing of Hugging Face models to GGUF format."""
54
+
55
+ ERROR_LOGIN = "You must be logged in to use GGUF-my-repo."
56
+ DOWNLOAD_FOLDER = "./downloads"
57
+ OUTPUT_FOLDER = "./outputs"
58
+
59
+ def __init__(self):
60
+ self.SPACE_ID = os.environ.get("SPACE_ID", "")
61
+ self.SPACE_URL = f"https://{self.SPACE_ID.replace('/', '-')}.hf.space/" if self.SPACE_ID else "http://localhost:7860/"
62
+ self.HF_TOKEN = os.environ.get("HF_TOKEN")
63
+ self.RUN_LOCALLY = os.environ.get("RUN_LOCALLY")
64
+
65
+ # Create necessary folders
66
+ self._create_folder(self.DOWNLOAD_FOLDER)
67
+ self._create_folder(self.OUTPUT_FOLDER)
68
+
69
+ def _create_folder(self, folder_name: str) -> str:
70
+ """Create a folder if it doesn't exist."""
71
+ if not os.path.exists(folder_name):
72
+ print(f"Creating folder: {folder_name}")
73
+ os.makedirs(folder_name)
74
+ return folder_name
75
+
76
+ def _validate_token(self, oauth_token: Optional[gr.OAuthToken]) -> str:
77
+ """Validate the OAuth token and return the token string."""
78
+ if oauth_token is None or oauth_token.token is None:
79
+ raise GGUFConverterError(self.ERROR_LOGIN)
80
+
81
+ try:
82
+ whoami(oauth_token.token)
83
+ return oauth_token.token
84
+ except Exception as e:
85
+ raise GGUFConverterError(self.ERROR_LOGIN)
86
+
87
+ def _escape_html(self, s: str) -> str:
88
+ """Escape HTML characters for safe display."""
89
+ replacements = [
90
+ ("&", "&amp;"),
91
+ ("<", "&lt;"),
92
+ (">", "&gt;"),
93
+ ('"', "&quot;"),
94
+ ("\n", "<br/>")
95
+ ]
96
+ for old, new in replacements:
97
+ s = s.replace(old, new)
98
+ return s
99
+
100
+ def _get_model_creator(self, model_id: str) -> str:
101
+ """Extract model creator from model ID."""
102
+ return model_id.split('/')[0]
103
+
104
+ def _get_model_name(self, model_id: str) -> str:
105
+ """Extract model name from model ID."""
106
+ return model_id.split('/')[-1]
107
+
108
+ def _upload_file(self, token: str, path_or_fileobj: str, path_in_repo: str, repo_id: str) -> None:
109
+ """Upload a file to Hugging Face repository."""
110
+ if self.RUN_LOCALLY == "1":
111
+ print("Skipping upload...")
112
+ return
113
+
114
+ api = HfApi(token=token)
115
+ api.upload_file(
116
+ path_or_fileobj=path_or_fileobj,
117
+ path_in_repo=path_in_repo,
118
+ repo_id=repo_id,
119
+ )
120
+
121
+ def _generate_importance_matrix(self, model_path: str, train_data_path: str, output_path: str) -> None:
122
+ """Generate importance matrix for quantization."""
123
+ if not os.path.isfile(model_path):
124
+ raise GGUFConverterError(f"Model file not found: {model_path}")
125
+
126
+ print("Running imatrix command...")
127
+ imatrix_command = [
128
+ "llama-imatrix",
129
+ "-m", model_path,
130
+ "-f", train_data_path,
131
+ "-ngl", "99",
132
+ "--output-frequency", "10",
133
+ "--output-format", "dat",
134
+ "-o", output_path,
135
+ ]
136
+
137
+ process = subprocess.Popen(imatrix_command, shell=False)
138
  try:
139
+ process.wait(timeout=60)
140
  except subprocess.TimeoutExpired:
141
+ print("Imatrix computation timed out. Sending SIGINT to allow graceful termination...")
142
+ process.send_signal(signal.SIGINT)
143
+ try:
144
+ process.wait(timeout=5)
145
+ except subprocess.TimeoutExpired:
146
+ print("Imatrix proc still didn't term. Forecfully terming process...")
147
+ process.kill()
148
+
149
+ print(f"Importance matrix generation completed: {os.path.abspath(output_path)}")
150
+
151
+ def _split_and_upload_model(self, model_path: str, outdir: str, repo_id: str, token: str,
152
+ split_config: SplitConfig) -> None:
153
+ """Split large model files and upload shards."""
154
+ print(f"Model path: {model_path}")
155
+ print(f"Output dir: {outdir}")
156
+
157
+ split_cmd = ["llama-gguf-split", "--split"]
158
+
159
+ if split_config.max_size:
160
+ split_cmd.extend(["--split-max-size", split_config.max_size])
161
+ else:
162
+ split_cmd.extend(["--split-max-tensors", str(split_config.max_tensors)])
163
+
164
+ model_path_prefix = '.'.join(model_path.split('.')[:-1])
165
+ split_cmd.extend([model_path, model_path_prefix])
166
+
167
+ print(f"Split command: {split_cmd}")
168
+ result = subprocess.run(split_cmd, shell=False, capture_output=True, text=True)
169
+
170
+ print(f"Split command stdout: {result.stdout}")
171
+ print(f"Split command stderr: {result.stderr}")
172
+
173
+ if result.returncode != 0:
174
+ stderr_str = result.stderr.decode("utf-8")
175
+ raise GGUFConverterError(f"Error splitting the model: {stderr_str}")
176
+
177
+ print("Model split successfully!")
178
+
179
+ # Remove original model file
180
+ if os.path.exists(model_path):
181
+ os.remove(model_path)
182
+
183
+ model_file_prefix = model_path_prefix.split('/')[-1]
184
+ print(f"Model file name prefix: {model_file_prefix}")
185
+
186
+ sharded_model_files = [
187
+ f for f in os.listdir(outdir)
188
+ if f.startswith(model_file_prefix) and f.endswith(".gguf")
189
+ ]
190
+
191
+ if not sharded_model_files:
192
+ raise GGUFConverterError("No sharded files found.")
193
+
194
  print(f"Sharded model files: {sharded_model_files}")
195
  for file in sharded_model_files:
196
  file_path = os.path.join(outdir, file)
197
  try:
198
  print(f"Uploading file: {file_path}")
199
+ self._upload_file(token, file_path, file, repo_id)
200
  except Exception as e:
201
+ raise GGUFConverterError(f"Error uploading file {file_path}: {e}")
 
 
202
 
203
+ print("Sharded model has been uploaded successfully!")
204
 
205
+ def _download_base_model(self, token: str, model_id: str, outdir: str) -> str:
206
+ """Download and convert Hugging Face model to GGUF FP16 format."""
207
+ model_name = self._get_model_name(model_id)
208
+ print(f"Downloading model {model_name}")
209
+ fp16_model = f"{outdir}/{model_name}-fp16.gguf"
210
 
211
+ if os.path.exists(fp16_model):
212
+ print("Skipping fp16 conversion...")
213
+ print(f"Converted model path: {os.path.abspath(fp16_model)}")
214
+ return fp16_model
215
 
216
+ with tempfile.TemporaryDirectory(dir=self.DOWNLOAD_FOLDER) as tmpdir:
217
+ local_dir = f"{Path(tmpdir)}/{model_name}"
218
+ print(f"Local directory: {os.path.abspath(local_dir)}")
 
219
 
220
+ # Download model
221
+ api = HfApi(token=token)
222
+ pattern = (
223
+ "*.safetensors"
224
+ if any(
225
+ file.path.endswith(".safetensors")
226
+ for file in api.list_repo_tree(
227
+ repo_id=model_id,
228
+ recursive=True,
229
+ )
230
  )
231
+ else "*.bin"
232
  )
233
+ dl_pattern = ["*.md", "*.json", "*.model"]
234
+ dl_pattern += [pattern]
235
+ api.snapshot_download(repo_id=model_id, local_dir=local_dir, allow_patterns=dl_pattern)
236
+ print("Model downloaded successfully!")
237
+ print(f"Model directory contents: {os.listdir(local_dir)}")
238
+
239
+ config_dir = local_dir/"config.json"
240
+ adapter_config_dir = local_dir/"adapter_config.json"
241
+ if os.path.exists(adapter_config_dir) and not os.path.exists(config_dir):
242
+ raise GGUFConverterError(
243
+ 'adapter_config.json is present.<br/><br/>If you are converting a LoRA adapter to GGUF, '
244
+ 'please use <a href="https://huggingface.co/spaces/ggml-org/gguf-my-lora" target="_blank" '
245
+ 'style="text-decoration:underline">GGUF-my-lora</a>.'
246
+ )
247
+
248
+ # Convert HF to GGUF
249
+ print(f"Converting to GGUF FP16: {os.path.abspath(fp16_model)}")
250
+ result = subprocess.run(
251
+ [
252
+ "python3", "/app/convert_hf_to_gguf.py", local_dir,
253
+ "--outtype", "f16", "--outfile", fp16_model
254
+ ],
255
+ shell=False,
256
+ capture_output=True
257
+ )
258
+
259
+ print(f"Model directory contents: {result}")
260
+ if result.returncode != 0:
261
+ stderr_str = result.stderr.decode("utf-8")
262
+ raise GGUFConverterError(f"Error converting to fp16: {stderr_str}")
263
+
264
+ print("Model converted to fp16 successfully!")
265
+ print(f"Converted model path: {os.path.abspath(fp16_model)}")
266
+ return fp16_model
267
+
268
+ def _quantize_model(self, outdir: str, gguf_name: str, fp16: str,
269
+ quant_config: QuantizationConfig) -> str:
270
+ """Quantize the GGUF model."""
271
+ quantize_cmd = ["llama-quantize"]
272
+
273
+ if quant_config.quant_embedding:
274
+ quantize_cmd.extend(["--token-embedding-type", quant_config.embedding_tensor_method])
275
+
276
+ if quant_config.leave_output:
277
+ quantize_cmd.append("--leave-output-tensor")
278
+ else:
279
+ if quant_config.quant_output:
280
+ quantize_cmd.extend(["--output-tensor-type", quant_config.output_tensor_method])
281
+
282
+ imatrix_file = f"{outdir}/{self._get_model_name(gguf_name.split('-')[0])}-imatrix.dat"
283
+
284
+ if quant_config.use_imatrix:
285
+ train_data_path = "calibration_data_v5_rc.txt"
286
+ print(f"Training data file path: {train_data_path}")
287
+
288
+ if not os.path.isfile(train_data_path):
289
+ raise GGUFConverterError(f"Training data file not found: {train_data_path}")
290
+
291
+ self._generate_importance_matrix(fp16, train_data_path, imatrix_file)
292
+ quantize_cmd.extend(["--imatrix", imatrix_file])
293
+ else:
294
+ print("Not using imatrix quantization.")
295
+
296
+ quantized_gguf = f"{outdir}/{gguf_name}"
297
+ quantize_cmd.append(fp16)
298
+ quantize_cmd.append(quantized_gguf)
299
+
300
+ if quant_config.use_imatrix:
301
+ quantize_cmd.append(quant_config.imatrix_method)
302
+ else:
303
+ quantize_cmd.append(quant_config.method)
304
+
305
+ print(f"Quantizing model with {quantize_cmd}")
306
+ result = subprocess.run(quantize_cmd, shell=False, capture_output=True)
307
308
  if result.returncode != 0:
309
  stderr_str = result.stderr.decode("utf-8")
310
+ raise GGUFConverterError(f"Error quantizing: {stderr_str}")
311
+
312
+ print(f"Quantized successfully with {quant_config.imatrix_method if quant_config.use_imatrix else quant_config.method} option!")
313
+ print(f"Quantized model path: {os.path.abspath(quantized_gguf)}")
314
+ return quantized_gguf
315
 
316
+ def _generate_readme(self, outdir: str, token: str, model_id: str,
317
+ new_repo_id: str, gguf_name: str) -> str:
318
+ """Generate README.md for the quantized model."""
319
+ creator = self._get_model_creator(model_id)
320
+ model_name = self._get_model_name(model_id)
321
+ username = whoami(token)["name"]
322
+
323
+ try:
324
+ card = ModelCard.load(model_id, token=token)
325
+ except:
326
+ card = ModelCard("")
327
+
328
+ if card.data.tags is None:
329
+ card.data.tags = []
330
+ card.data.tags.extend(["llama-cpp", "gguf-my-repo"])
331
+ card.data.base_model = model_id
332
+
333
+ card.text = dedent(
334
+ f"""
335
+ # {model_name}
336
  **Model creator:** [{creator}](https://huggingface.co/{creator})<br/>
337
  **Original model**: [{model_id}](https://huggingface.co/{model_id})<br/>
338
  **GGUF quantization:** provided by [{username}](https://huggingface.co/{username}) using `llama.cpp`<br/>
 
339
  ## Special thanks
 
340
  🙏 Special thanks to [Georgi Gerganov](https://github.com/ggerganov) and the whole team working on [llama.cpp](https://github.com/ggerganov/llama.cpp/) for making all of this possible.
 
341
  ## Use with Ollama
 
342
  ```bash
343
  ollama run "hf.co/{new_repo_id}:<quantization>"
344
  ```
 
345
  ## Use with LM Studio
 
346
  ```bash
347
  lms load "{new_repo_id}"
348
  ```
 
349
  ## Use with llama.cpp CLI
 
350
  ```bash
351
  llama-cli --hf-repo "{new_repo_id}" --hf-file "{gguf_name}" -p "The meaning to life and the universe is"
352
  ```
 
353
  ## Use with llama.cpp Server:
 
354
  ```bash
355
  llama-server --hf-repo "{new_repo_id}" --hf-file "{gguf_name}" -c 4096
356
  ```
357
+ """
358
+ )
359
+
360
+ readme_path = f"{outdir}/README.md"
361
+ card.save(readme_path)
362
+ return readme_path
363
 
364
+ def process_model(self, model_id: str, quant_config: QuantizationConfig,
365
+ split_config: SplitConfig, output_config: OutputConfig,
366
+ oauth_token: Optional[gr.OAuthToken]) -> Tuple[str, str]:
367
+ """Main method to process a model through the entire pipeline."""
368
+ try:
369
+ token = self._validate_token(oauth_token)
370
+ print(f"Current working directory: {os.path.abspath(os.getcwd())}")
371
+
372
+ model_name = self._get_model_name(model_id)
373
+
374
+ with tempfile.TemporaryDirectory(dir=self.OUTPUT_FOLDER) as outDirObj:
375
+ outdir = (
376
+ self._create_folder(os.path.join(self.OUTPUT_FOLDER, model_name))
377
+ if self.RUN_LOCALLY == "1"
378
+ else Path(outDirObj)
379
+ )
380
 
381
+ fp16 = self._download_base_model(token, model_id, outdir)
382
+ quantized_gguf = self._quantize_model(outdir, output_config.filename, fp16, quant_config)
383
+
384
+ # Create empty repo
385
+ api = HfApi(token=token)
386
+ new_repo_url = api.create_repo(
387
+ repo_id=output_config.repo_name,
388
+ exist_ok=True,
389
+ private=output_config.private_repo
390
+ )
391
+ new_repo_id = new_repo_url.repo_id
392
+ print("Repo created successfully!", new_repo_url)
393
+
394
+ # Upload model
395
+ if split_config.enabled:
396
+ print(f"Splitting quantized model: {os.path.abspath(quantized_gguf)}")
397
+ self._split_and_upload_model(quantized_gguf, outdir, new_repo_id, token, split_config)
398
+ else:
399
+ try:
400
+ print(f"Uploading quantized model: {os.path.abspath(quantized_gguf)}")
401
+ self._upload_file(token, quantized_gguf, output_config.filename, new_repo_id)
402
+ except Exception as e:
403
+ raise GGUFConverterError(f"Error uploading quantized model: {e}")
404
+
405
+ # Upload imatrix if it exists
406
+ imatrix_file = f"{outdir}/{model_name}-imatrix.dat"
407
+ if os.path.isfile(imatrix_file):
408
+ try:
409
+ print(f"Uploading imatrix.dat: {os.path.abspath(imatrix_file)}")
410
+ self._upload_file(token, imatrix_file, "imatrix.dat", new_repo_id)
411
+ except Exception as e:
412
+ raise GGUFConverterError(f"Error uploading imatrix.dat: {e}")
413
+
414
+ # Upload README.md
415
+ readme_path = self._generate_readme(outdir, token, model_id, new_repo_id, output_config.filename)
416
+ self._upload_file(token, readme_path, "README.md", new_repo_id)
417
+
418
+ print(f"Uploaded successfully with {quant_config.imatrix_method if quant_config.use_imatrix else quant_config.method} option!")
419
+
420
+ return (
421
+ f'<h1>✅ DONE</h1><br/>Find your repo here: <a href="{new_repo_url}" target="_blank" style="text-decoration:underline">{new_repo_id}</a>',
422
+ "llama.png",
423
  )
 
424
 
425
+ except Exception as e:
426
+ print(f"Error processing model: {e}")
427
+ return (f'<h1>❌ ERROR</h1><br/><pre style="white-space:pre-wrap;">{self._escape_html(str(e))}</pre>', "error.png")
428
+
429
+
430
+ class GGUFConverterUI:
431
+ """Gradio UI for the GGUF Converter."""
432
+
433
+ def __init__(self):
434
+ self.processor = HuggingFaceModelProcessor()
435
+ self.css = """/* Custom CSS to allow scrolling */
436
+ .gradio-container {overflow-y: auto;}
437
+ """
438
+
439
+ # Initialize components
440
+ self._initialize_components()
441
+ self._setup_interface()
442
+
443
+ def _initialize_components(self):
444
+ """Initialize all UI components."""
445
+ #####
446
+ # Base model section
447
+ #####
448
+ self.model_id = HuggingfaceHubSearch(
449
+ label="Hub Model ID",
450
+ placeholder="Search for model id on Huggingface",
451
+ search_type="model",
452
+ )
453
+
454
+ #####
455
+ # Quantization section
456
+ #####
457
+ self.use_imatrix = gr.Checkbox(
458
+ value=False,
459
+ label="Use Imatrix Quantization",
460
+ info="Use importance matrix for quantization."
461
+ )
462
+ self.q_method = gr.Dropdown(
463
+ choices=["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M", "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0", "F16", "BF16"],
464
+ label="Quantization Method",
465
+ info="GGML quantization type",
466
+ value="Q4_K_M",
467
+ filterable=False,
468
+ visible=True
469
+ )
470
+ self.imatrix_q_method = gr.Dropdown(
471
+ choices=["IQ3_M", "IQ3_XXS", "Q4_K_M", "Q4_K_S", "IQ4_NL", "IQ4_XS", "Q5_K_M", "Q5_K_S"],
472
+ label="Imatrix Quantization Method",
473
+ info="GGML imatrix quants type",
474
+ value="IQ4_NL",
475
+ filterable=False,
476
+ visible=False
477
+ )
478
+ self.train_data_file = gr.File(
479
+ label="Training Data File",
480
+ file_types=[".txt"],
481
+ visible=False
482
+ )
483
+
484
+ #####
485
+ # Advanced Options section
486
+ #####
487
+ self.split_model = gr.Checkbox(
488
+ value=False,
489
+ label="Split Model",
490
+ info="Shard the model using gguf-split."
491
+ )
492
+ self.split_max_tensors = gr.Number(
493
+ value=256,
494
+ label="Max Tensors per File",
495
+ info="Maximum number of tensors per file when splitting model.",
496
+ visible=False
497
+ )
498
+ self.split_max_size = gr.Textbox(
499
+ label="Max File Size",
500
+ info="Maximum file size when splitting model (--split-max-size). May leave empty to use the default. Accepted suffixes: M, G. Example: 256M, 5G",
501
+ visible=False
502
+ )
503
+ self.leave_output = gr.Checkbox(
504
+ value=False,
505
+ label="Leave output tensor",
506
+ info="Leaves output.weight un(re)quantized"
507
+ )
508
+ self.quant_embedding = gr.Checkbox(
509
+ value=False,
510
+ label="Quant embeddings tensor",
511
+ info="Quantize embeddings tensor separately"
512
+ )
513
+ self.embedding_tensor_method = gr.Dropdown(
514
+ choices=["Q2_K", "Q3_K", "Q4_K", "Q5_K", "Q6_K", "Q8_0"],
515
+ label="Output Quantization Method",
516
+ info="use a specific quant type for the token embeddings tensor",
517
+ value="Q8_0",
518
+ filterable=False,
519
+ visible=False
520
+ )
521
+ self.quant_output = gr.Checkbox(
522
+ value=False,
523
+ label="Quant output tensor",
524
+ info="Quantize output tensor separately"
525
+ )
526
+ self.output_tensor_method = gr.Dropdown(
527
+ choices=["Q2_K", "Q3_K", "Q4_K", "Q5_K", "Q6_K", "Q8_0"],
528
+ label="Output Quantization Method",
529
+ info="use a specific quant type for the output.weight tensor",
530
+ value="Q8_0",
531
+ filterable=False,
532
+ visible=False
533
+ )
534
+
535
+ #####
536
+ # Output Settings section
537
+ #####
538
+ self.private_repo = gr.Checkbox(
539
+ value=False,
540
+ label="Private Repo",
541
+ info="Create a private repo under your username."
542
+ )
543
+ self.repo_name = gr.Textbox(
544
+ label="Output Repository Name",
545
+ info="Set your repository name",
546
+ max_lines=1
547
+ )
548
+ self.gguf_name = gr.Textbox(
549
+ label="Output File Name",
550
+ info="Set output file name",
551
+ max_lines=1
552
+ )
553
 
554
+ #####
555
+ # Buttons section
556
+ #####
557
+ self.clear_btn = gr.ClearButton(
558
+ value="Clear",
559
+ variant="secondary",
560
+ components=[
561
+ self.model_id,
562
+ self.q_method,
563
+ self.use_imatrix,
564
+ self.imatrix_q_method,
565
+ self.private_repo,
566
+ self.train_data_file,
567
+ self.leave_output,
568
+ self.quant_embedding,
569
+ self.embedding_tensor_method,
570
+ self.quant_output,
571
+ self.output_tensor_method,
572
+ self.split_model,
573
+ self.split_max_tensors,
574
+ self.split_max_size,
575
+ self.repo_name,
576
+ self.gguf_name,
577
+ ]
578
  )
579
+ self.submit_btn = gr.Button(
580
+ value="Submit",
581
+ variant="primary"
582
+ )
583
+
584
+ #####
585
+ # Outputs section
586
+ #####
587
+ self.output_label = gr.Markdown(label="output")
588
+ self.output_image = gr.Image(
589
+ show_label=False,
590
+ show_download_button=False,
591
+ interactive=False
592
+ )
593
+
594
+ @staticmethod
595
+ def _update_output_repo(model_id: str, oauth_token: gr.OAuthToken | None) -> str:
596
+ """Update output repository name based on model and user."""
597
+ if oauth_token is None or not oauth_token.token:
598
+ return ""
599
+ if not model_id:
600
+ return ""
601
+ try:
602
+ username = whoami(oauth_token.token)["name"]
603
+ model_name = model_id.split('/')[-1]
604
+ return f"{username}/{model_name}-GGUF"
605
+ except:
606
+ return ""
607
+
608
+ @staticmethod
609
+ def _update_output_filename(model_id: str, use_imatrix: bool, q_method: str, imatrix_q_method: str) -> str:
610
+ """Update output filename based on model and quantization settings."""
611
+ if not model_id:
612
+ return ""
613
+ model_name = model_id.split('/')[-1]
614
+ if use_imatrix:
615
+ return f"{model_name}-{imatrix_q_method.upper()}-imat.gguf"
616
+ return f"{model_name}-{q_method.upper()}.gguf"
617
+
618
+ def _setup_interface(self):
619
+ """Set up the Gradio interface."""
620
+ with gr.Blocks(css=self.css) as self.demo:
621
+ #####
622
+ # Layout
623
+ #####
624
+ gr.Markdown(HuggingFaceModelProcessor.ERROR_LOGIN)
625
+ gr.LoginButton(min_width=250)
626
+ gr.HTML("<h1 style=\"text-aling:center;\">Create your own GGUF Quants!</h1>")
627
+ gr.Markdown(f"The space takes an HF repo as an input, quantizes it and creates a Public repo containing the selected quant under your HF user namespace.<br/>Use via {self.processor.SPACE_URL}")
628
+
629
  with gr.Row():
630
+ with gr.Column() as inputs:
631
+ gr.Markdown("### Model Configuration")
632
+ self.model_id.render()
633
+ with gr.Column():
634
+ self.use_imatrix.render()
635
+ self.q_method.render()
636
+ self.imatrix_q_method.render()
637
+ self.train_data_file.render()
638
+ gr.Markdown("### Advanced Options")
639
+ self.quant_embedding.render()
640
+ self.embedding_tensor_method.render()
641
+ self.leave_output.render()
642
+ self.quant_output.render()
643
+ self.output_tensor_method.render()
644
+ self.split_model.render()
645
+ with gr.Row() as split_options:
646
+ self.split_max_tensors.render()
647
+ self.split_max_size.render()
648
+ gr.Markdown("### Output Settings")
649
+ gr.Markdown("You can customize settings for your GGUF repo.")
650
+ self.private_repo.render()
651
+ with gr.Row():
652
+ self.repo_name.render()
653
+ self.gguf_name.render()
654
+ # Buttons
655
+ with gr.Row() as buttons:
656
+ self.clear_btn.render()
657
+ self.submit_btn.render()
658
+ with gr.Column() as outputs:
659
+ self.output_image.render()
660
+ self.output_label.render()
661
+
662
+ #####
663
+ # Event handlers
664
+ #####
665
+ self.submit_btn.click(
666
+ fn=self._process_model_wrapper,
667
+ inputs=[
668
+ self.model_id,
669
+ self.q_method,
670
+ self.use_imatrix,
671
+ self.imatrix_q_method,
672
+ self.private_repo,
673
+ self.train_data_file,
674
+ self.repo_name,
675
+ self.gguf_name,
676
+ self.quant_embedding,
677
+ self.embedding_tensor_method,
678
+ self.leave_output,
679
+ self.quant_output,
680
+ self.output_tensor_method,
681
+ self.split_model,
682
+ self.split_max_tensors,
683
+ self.split_max_size
684
+ ],
685
+ outputs=[
686
+ self.output_label,
687
+ self.output_image,
688
+ ],
689
+ )
690
+
691
+ #####
692
+ # OnChange handlers
693
+ #####
694
+ self.use_imatrix.change(
695
+ fn=lambda use_imatrix: [gr.update(visible=not use_imatrix), gr.update(visible=use_imatrix), gr.update(visible=use_imatrix)],
696
+ inputs=self.use_imatrix,
697
+ outputs=[self.q_method, self.imatrix_q_method, self.train_data_file]
698
+ )
699
+ self.split_model.change(
700
+ fn=lambda split_model: [gr.update(visible=split_model), gr.update(visible=split_model)],
701
+ inputs=self.split_model,
702
+ outputs=[self.split_max_tensors, self.split_max_size]
703
+ )
704
+ self.quant_embedding.change(
705
+ fn=lambda quant_embedding: gr.update(visible=quant_embedding),
706
+ inputs=self.quant_embedding,
707
+ outputs=[self.embedding_tensor_method]
708
+ )
709
+ self.quant_output.change(
710
+ fn=lambda quant_output: [gr.update(visible=quant_output), gr.update(visible=not quant_output)],
711
+ inputs=self.quant_output,
712
+ outputs=[self.output_tensor_method, self.leave_output]
713
+ )
714
+ self.model_id.change(
715
+ fn=self._update_output_repo,
716
+ inputs=[self.model_id],
717
+ outputs=[self.repo_name]
718
+ )
719
+ self.model_id.change(
720
+ fn=self._update_output_filename,
721
+ inputs=[self.model_id, self.use_imatrix, self.q_method, self.imatrix_q_method],
722
+ outputs=[self.gguf_name]
723
+ )
724
+ self.use_imatrix.change(
725
+ fn=self._update_output_filename,
726
+ inputs=[self.model_id, self.use_imatrix, self.q_method, self.imatrix_q_method],
727
+ outputs=[self.gguf_name]
728
+ )
729
+ self.q_method.change(
730
+ fn=self._update_output_filename,
731
+ inputs=[self.model_id, self.use_imatrix, self.q_method, self.imatrix_q_method],
732
+ outputs=[self.gguf_name]
733
+ )
734
+ self.imatrix_q_method.change(
735
+ fn=self._update_output_filename,
736
+ inputs=[self.model_id, self.use_imatrix, self.q_method, self.imatrix_q_method],
737
+ outputs=[self.gguf_name]
738
+ )
739
+
740
+ def _process_model_wrapper(self, model_id: str, q_method: str, use_imatrix: bool,
741
+ imatrix_q_method: str, private_repo: bool, train_data_file,
742
+ repo_name: str, gguf_name: str, quant_embedding: bool,
743
+ embedding_tensor_method: str, leave_output: bool,
744
+ quant_output: bool, output_tensor_method: str,
745
+ split_model: bool, split_max_tensors, split_max_size: str, oauth_token: Optional[gr.OAuthToken] = None) -> Tuple[str, str]:
746
+ """Wrapper for the process_model method to handle the conversion."""
747
+ # Create configuration objects
748
+ quant_config = QuantizationConfig(
749
+ method=q_method,
750
+ use_imatrix=use_imatrix,
751
+ imatrix_method=imatrix_q_method,
752
+ quant_embedding=quant_embedding,
753
+ embedding_tensor_method=embedding_tensor_method,
754
+ leave_output=leave_output,
755
+ quant_output=quant_output,
756
+ output_tensor_method=output_tensor_method
757
+ )
758
+
759
+ split_config = SplitConfig(
760
+ enabled=split_model,
761
+ max_tensors=split_max_tensors,
762
+ max_size=split_max_size
763
+ )
764
+
765
+ output_config = OutputConfig(
766
+ private_repo=private_repo,
767
+ repo_name=repo_name,
768
+ filename=gguf_name
769
+ )
770
+
771
+ return self.processor.process_model(model_id, quant_config, split_config, output_config, oauth_token)
772
+
773
+ def launch(self):
774
+ """Launch the Gradio interface."""
775
+ # Set up space restart scheduler
776
+ def restart_space():
777
+ HfApi().restart_space(repo_id=self.processor.SPACE_ID, token=self.processor.HF_TOKEN, factory_reboot=True)
778
+
779
+ scheduler = BackgroundScheduler()
780
+ scheduler.add_job(restart_space, "interval", seconds=21600)
781
+ scheduler.start()
782
+
783
+ # Launch the interface
784
+ self.demo.queue(default_concurrency_limit=1, max_size=5).launch(debug=True, show_api=False)
785
+
786
+
787
+ # Main execution
788
+ if __name__ == "__main__":
789
+ ui = GGUFConverterUI()
790
+ ui.launch()