Oleg Shulyakov committed
Commit 17f9e2b · Parent: d9e2874

imatrix change

Files changed (1):
  app.py (+26, -25)
app.py CHANGED
@@ -62,6 +62,7 @@ def generate_importance_matrix(model_path: str, train_data_path: str, output_pat
         "-f", train_data_path,
         "-ngl", "99",
         "--output-frequency", "10",
+        "--output-format", "dat",
         "-o", output_path,
     ]
     process = subprocess.Popen(imatrix_command, shell=False)
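For context, `generate_importance_matrix` shells out to llama.cpp's `llama-imatrix` tool; the new `--output-format dat` flag pins the output to the legacy binary format that `llama-quantize` consumes. A minimal sketch of the invocation, assuming the `-m` model flag and a timeout value that sit outside this hunk:

```python
import subprocess

def run_imatrix(model_path: str, train_data_path: str, output_path: str) -> None:
    """Sketch of the llama-imatrix call this hunk configures."""
    imatrix_command = [
        "llama-imatrix",
        "-m", model_path,            # assumed: model flag, not visible in the hunk
        "-f", train_data_path,       # calibration text
        "-ngl", "99",                # offload all layers to GPU
        "--output-frequency", "10",  # save the matrix every 10 chunks
        "--output-format", "dat",    # new in this commit: keep the legacy .dat format
        "-o", output_path,
    ]
    process = subprocess.Popen(imatrix_command, shell=False)
    try:
        process.wait(timeout=3600)   # assumed timeout; the real value is outside this hunk
    except subprocess.TimeoutExpired:
        process.kill()
```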
@@ -77,7 +78,7 @@ def generate_importance_matrix(model_path: str, train_data_path: str, output_pat
         print("Imatrix proc still didn't term. Forecfully terming process...")
         process.kill()
 
-    print("Importance matrix generation completed.")
+    print(f"Importance matrix generation completed: {os.path.abspath(output_path)}")
 
 def split_upload_model(model_path: str, outdir: str, repo_id: str, token: str, split_max_tensors=256, split_max_size=None):
     print(f"Model path: {model_path}")
@@ -171,7 +172,7 @@ def download_base_model(token: str, model_id: str, outdir: tempfile.TemporaryDir
         raise Exception('adapter_config.json is present.<br/><br/>If you are converting a LoRA adapter to GGUF, please use <a href="https://huggingface.co/spaces/ggml-org/gguf-my-lora" target="_blank" style="text-decoration:underline">GGUF-my-lora</a>.')
 
     # Convert HF to GGUF
-    fp16_model = str(Path(outdir)/f"{model_name}_fp16.gguf")
+    fp16_model = str(Path(outdir)/f"{model_name}-fp16.gguf")
     print(f"Converting to GGUF FP16: {os.path.abspath(fp16_model)}")
     result = subprocess.run(
         [
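The conversion call is truncated in this hunk; it drives llama.cpp's `convert_hf_to_gguf.py` via `subprocess.run`. A hedged sketch under the assumption that the script is invoked with `--outtype f16` and `--outfile` (the argument list is not fully visible here):

```python
import subprocess
from pathlib import Path

outdir = "/tmp/out"                                    # placeholder working directory
model_dir = "/tmp/downloads/My-Model"                  # placeholder HF snapshot directory
fp16_model = str(Path(outdir) / "My-Model-fp16.gguf")  # naming per this commit

# Assumed invocation of llama.cpp's converter; script path and flags are
# assumptions, since the hunk truncates the argument list.
result = subprocess.run(
    ["python", "convert_hf_to_gguf.py", model_dir, "--outtype", "f16", "--outfile", fp16_model],
    shell=False,
    capture_output=True,
)
if result.returncode != 0:
    raise Exception(f"Error converting to fp16: {result.stderr}")
```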
@@ -197,27 +198,13 @@ def quantize_model(
     q_method: str,
     use_imatrix: bool,
     imatrix_q_method: str,
-    imatrix_path: str,
+    imatrix_file: str,
     quant_embedding: bool,
     embedding_tensor_method: str,
     leave_output: bool,
     quant_output: bool,
     output_tensor_method: str,
 ):
-    if use_imatrix:
-        train_data_path = "calibration_data_v5_rc.txt" #fallback calibration dataset
-        # if train_data_file:
-        #     train_data_path = train_data_file.name
-
-        print(f"Training data file path: {train_data_path}")
-
-        if not os.path.isfile(train_data_path):
-            raise Exception(f"Training data file not found: {train_data_path}")
-
-        generate_importance_matrix(fp16, train_data_path, imatrix_path)
-    else:
-        print("Not using imatrix quantization.")
-
     # Quantize the model
     quantize_cmd = ["llama-quantize"]
 
@@ -230,9 +217,23 @@ def quantize_model(
     if quant_output:
         quantize_cmd.append("--output-tensor-type")
         quantize_cmd.append(output_tensor_method)
+
     if use_imatrix:
+        train_data_path = "calibration_data_v5_rc.txt" #fallback calibration dataset
+        # if train_data_file:
+        #     train_data_path = train_data_file.name
+
+        print(f"Training data file path: {train_data_path}")
+
+        if not os.path.isfile(train_data_path):
+            raise Exception(f"Training data file not found: {train_data_path}")
+
+        generate_importance_matrix(fp16, train_data_path, imatrix_file)
+
         quantize_cmd.append("--imatrix")
-        quantize_cmd.append(imatrix_path)
+        quantize_cmd.append(imatrix_file)
+    else:
+        print("Not using imatrix quantization.")
 
     quantized_gguf = str(Path(outdir)/gguf_name)
     quantize_cmd.append(fp16)
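After this refactor, the importance matrix is generated on demand inside `quantize_model`, immediately before `--imatrix` is appended, instead of unconditionally at the top of the function. With placeholder paths and an assumed `IQ4_NL` quant type, the assembled command would look roughly like:

```python
# Illustrative command shape only; paths and the IQ4_NL quant type are
# placeholders. llama-quantize expects: [options] input.gguf output.gguf type.
quantize_cmd = [
    "llama-quantize",
    "--imatrix", "/tmp/out/My-Model-imatrix.dat",  # per-model filename from this commit
    "/tmp/out/My-Model-fp16.gguf",                 # fp16 input GGUF
    "/tmp/out/My-Model-IQ4_NL.gguf",               # quantized output GGUF
    "IQ4_NL",
]
```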
@@ -339,8 +340,8 @@ def process_model(
     try:
         with tempfile.TemporaryDirectory(dir=OUTPUT_FOLDER) as outdir:
             fp16 = download_base_model(token, model_id, outdir)
-            imatrix_path = Path(outdir)/"imatrix.dat"
-            quantized_gguf = quantize_model(outdir, gguf_name, fp16, q_method, use_imatrix, imatrix_q_method, imatrix_path, quant_embedding, embedding_tensor_method, leave_output, quant_output, output_tensor_method)
+            imatrix_file = Path(outdir)/f"{get_model_name(model_id)}-imatrix.dat"
+            quantized_gguf = quantize_model(outdir, gguf_name, fp16, q_method, use_imatrix, imatrix_q_method, imatrix_file, quant_embedding, embedding_tensor_method, leave_output, quant_output, output_tensor_method)
 
             # Create empty repo
             api = HfApi(token=token)
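The per-model filename relies on a `get_model_name` helper defined elsewhere in app.py; its body is not part of this diff. A plausible sketch, assuming it simply strips the Hub namespace from the model id:

```python
def get_model_name(model_id: str) -> str:
    # Assumed behavior: "mistralai/Mistral-7B-v0.1" -> "Mistral-7B-v0.1".
    # The real helper lives outside this diff.
    return model_id.split("/")[-1]
```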
@@ -350,11 +351,11 @@ def process_model(
 
             # Upload model
             if split_model:
-                print(f"Splitting quantized model: {quantized_gguf}")
+                print(f"Splitting quantized model: {os.path.abspath(quantized_gguf)}")
                 split_upload_model(str(quantized_gguf), outdir, new_repo_id, token, split_max_tensors, split_max_size)
             else:
                 try:
-                    print(f"Uploading quantized model: {quantized_gguf}")
+                    print(f"Uploading quantized model: {os.path.abspath(quantized_gguf)}")
                     api.upload_file(
                         path_or_fileobj=quantized_gguf,
                         path_in_repo=gguf_name,
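`split_upload_model` (unchanged here apart from the log line) wraps llama.cpp's `llama-gguf-split` tool. A hedged sketch of the split step, with the output-prefix convention as an assumption:

```python
import subprocess

model_path = "/tmp/out/My-Model-IQ4_NL.gguf"  # placeholder quantized GGUF
split_cmd = [
    "llama-gguf-split",
    "--split",
    "--split-max-tensors", "256",             # default from split_upload_model's signature
    model_path,
    model_path.removesuffix(".gguf"),         # output prefix; naming convention assumed
]
subprocess.run(split_cmd, shell=False, check=True)
```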
@@ -363,11 +364,11 @@ def process_model(
                 except Exception as e:
                     raise Exception(f"Error uploading quantized model: {e}")
 
-            if os.path.isfile(imatrix_path):
+            if os.path.isfile(imatrix_file):
                 try:
-                    print(f"Uploading imatrix.dat: {imatrix_path}")
+                    print(f"Uploading imatrix.dat: {os.path.abspath(imatrix_file)}")
                     api.upload_file(
-                        path_or_fileobj=imatrix_path,
+                        path_or_fileobj=imatrix_file,
                         path_in_repo="imatrix.dat",
                         repo_id=new_repo_id,
                     )
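For reference, the uploads go through `huggingface_hub.HfApi.upload_file`; a minimal usage sketch with placeholder token, path, and repo id:

```python
from huggingface_hub import HfApi

api = HfApi(token="hf_xxx")  # placeholder token
api.upload_file(
    path_or_fileobj="/tmp/out/My-Model-imatrix.dat",  # illustrative local path
    path_in_repo="imatrix.dat",                       # filename in the repo
    repo_id="username/My-Model-GGUF",                 # placeholder target repo
)
```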
 