Oleg Shulyakov committed
Commit 17f9e2b · 1 Parent(s): d9e2874
imatrix change
app.py
CHANGED
@@ -62,6 +62,7 @@ def generate_importance_matrix(model_path: str, train_data_path: str, output_pat
         "-f", train_data_path,
         "-ngl", "99",
         "--output-frequency", "10",
+        "--output-format", "dat",
         "-o", output_path,
     ]
     process = subprocess.Popen(imatrix_command, shell=False)
@@ -77,7 +78,7 @@ def generate_importance_matrix(model_path: str, train_data_path: str, output_pat
         print("Imatrix proc still didn't term. Forecfully terming process...")
         process.kill()
 
-    print("Importance matrix generation completed.")
+    print(f"Importance matrix generation completed: {os.path.abspath(output_path)}")
 
 def split_upload_model(model_path: str, outdir: str, repo_id: str, token: str, split_max_tensors=256, split_max_size=None):
     print(f"Model path: {model_path}")
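For orientation, here is a minimal sketch of how generate_importance_matrix plausibly drives llama.cpp's llama-imatrix tool after this change. Only the command flags, the Popen call, the kill, and the final print are visible in the hunks above; the "-m" flag, the timeout value, and the overall body layout are assumptions.

import os
import subprocess

def generate_importance_matrix(model_path: str, train_data_path: str, output_path: str):
    # Flags mirror the diff; "-m" for the input model is an assumption.
    imatrix_command = [
        "llama-imatrix",
        "-m", model_path,
        "-f", train_data_path,
        "-ngl", "99",
        "--output-frequency", "10",
        "--output-format", "dat",
        "-o", output_path,
    ]
    process = subprocess.Popen(imatrix_command, shell=False)
    try:
        # The diff only shows that a stubborn process gets killed;
        # the 60-second grace period here is an assumption.
        process.wait(timeout=60)
    except subprocess.TimeoutExpired:
        print("Imatrix proc still didn't term. Forecfully terming process...")
        process.kill()
    print(f"Importance matrix generation completed: {os.path.abspath(output_path)}")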
@@ -171,7 +172,7 @@ def download_base_model(token: str, model_id: str, outdir: tempfile.TemporaryDir
         raise Exception('adapter_config.json is present.<br/><br/>If you are converting a LoRA adapter to GGUF, please use <a href="https://huggingface.co/spaces/ggml-org/gguf-my-lora" target="_blank" style="text-decoration:underline">GGUF-my-lora</a>.')
 
     # Convert HF to GGUF
-    fp16_model = str(Path(outdir)/f"{model_name}
+    fp16_model = str(Path(outdir)/f"{model_name}-fp16.gguf")
     print(f"Converting to GGUF FP16: {os.path.abspath(fp16_model)}")
     result = subprocess.run(
         [
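The converted FP16 file now carries a "-fp16.gguf" suffix. A tiny sketch of the path construction, assuming model_name is the repository name without its namespace (values below are placeholders):

from pathlib import Path

outdir = "/tmp/outdir"          # placeholder temporary directory
model_name = "Llama-3.2-1B"     # placeholder model name
fp16_model = str(Path(outdir) / f"{model_name}-fp16.gguf")
# -> "/tmp/outdir/Llama-3.2-1B-fp16.gguf"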
@@ -197,27 +198,13 @@ def quantize_model(
     q_method: str,
     use_imatrix: bool,
     imatrix_q_method: str,
-
+    imatrix_file: str,
     quant_embedding: bool,
     embedding_tensor_method: str,
     leave_output: bool,
     quant_output: bool,
     output_tensor_method: str,
 ):
-    if use_imatrix:
-        train_data_path = "calibration_data_v5_rc.txt" #fallback calibration dataset
-        # if train_data_file:
-        #     train_data_path = train_data_file.name
-
-        print(f"Training data file path: {train_data_path}")
-
-        if not os.path.isfile(train_data_path):
-            raise Exception(f"Training data file not found: {train_data_path}")
-
-        generate_importance_matrix(fp16, train_data_path, imatrix_path)
-    else:
-        print("Not using imatrix quantization.")
-
     # Quantize the model
     quantize_cmd = ["llama-quantize"]
 
@@ -230,9 +217,23 @@
     if quant_output:
         quantize_cmd.append("--output-tensor-type")
         quantize_cmd.append(output_tensor_method)
+
     if use_imatrix:
+        train_data_path = "calibration_data_v5_rc.txt" #fallback calibration dataset
+        # if train_data_file:
+        #     train_data_path = train_data_file.name
+
+        print(f"Training data file path: {train_data_path}")
+
+        if not os.path.isfile(train_data_path):
+            raise Exception(f"Training data file not found: {train_data_path}")
+
+        generate_importance_matrix(fp16, train_data_path, imatrix_file)
+
         quantize_cmd.append("--imatrix")
-        quantize_cmd.append(
+        quantize_cmd.append(imatrix_file)
+    else:
+        print("Not using imatrix quantization.")
 
     quantized_gguf = str(Path(outdir)/gguf_name)
     quantize_cmd.append(fp16)
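Taken together, this hunk moves importance-matrix generation to just before it is needed and builds the llama-quantize invocation incrementally. A sketch of the resulting command when use_imatrix is enabled; the file names and quantization type are placeholders, and the trailing positional arguments follow llama-quantize's usual input/output/type order (only the append(fp16) line is visible above).

fp16 = "/tmp/out/Model-fp16.gguf"              # placeholder input
quantized_gguf = "/tmp/out/Model-Q4_K_M.gguf"  # placeholder output
imatrix_file = "/tmp/out/Model-imatrix.dat"    # placeholder imatrix
imatrix_q_method = "Q4_K_M"                    # placeholder quant type

quantize_cmd = ["llama-quantize"]
quantize_cmd += ["--imatrix", imatrix_file]
quantize_cmd += [fp16, quantized_gguf, imatrix_q_method]
# quantize_cmd ~ llama-quantize --imatrix <imatrix.dat> <fp16.gguf> <out.gguf> Q4_K_M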
@@ -339,8 +340,8 @@ def process_model(
     try:
         with tempfile.TemporaryDirectory(dir=OUTPUT_FOLDER) as outdir:
             fp16 = download_base_model(token, model_id, outdir)
-
-            quantized_gguf = quantize_model(outdir, gguf_name, fp16, q_method, use_imatrix, imatrix_q_method,
+            imatrix_file = Path(outdir)/f"{get_model_name(model_id)}-imatrix.dat"
+            quantized_gguf = quantize_model(outdir, gguf_name, fp16, q_method, use_imatrix, imatrix_q_method, imatrix_file, quant_embedding, embedding_tensor_method, leave_output, quant_output, output_tensor_method)
 
             # Create empty repo
             api = HfApi(token=token)
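process_model now owns the imatrix location: it derives a per-model .dat path inside the temporary output directory and passes it to quantize_model. A sketch of that path construction; get_model_name is only called, not defined, in this diff, so the helper below is an assumed implementation.

import tempfile
from pathlib import Path

def get_model_name(model_id: str) -> str:
    # Assumption: "org/Model-Name" -> "Model-Name".
    return model_id.split("/")[-1]

model_id = "org/Model-Name"  # placeholder repo id
with tempfile.TemporaryDirectory() as outdir:
    imatrix_file = Path(outdir) / f"{get_model_name(model_id)}-imatrix.dat"
    # e.g. /tmp/tmpabc123/Model-Name-imatrix.dat, handed to quantize_model()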
@@ -350,11 +351,11 @@
 
             # Upload model
             if split_model:
-                print(f"Splitting quantized model: {quantized_gguf}")
+                print(f"Splitting quantized model: {os.path.abspath(quantized_gguf)}")
                 split_upload_model(str(quantized_gguf), outdir, new_repo_id, token, split_max_tensors, split_max_size)
             else:
                 try:
-                    print(f"Uploading quantized model: {quantized_gguf}")
+                    print(f"Uploading quantized model: {os.path.abspath(quantized_gguf)}")
                     api.upload_file(
                         path_or_fileobj=quantized_gguf,
                         path_in_repo=gguf_name,
@@ -363,11 +364,11 @@
                 except Exception as e:
                     raise Exception(f"Error uploading quantized model: {e}")
 
-            if os.path.isfile(
+            if os.path.isfile(imatrix_file):
                 try:
-                    print(f"Uploading imatrix.dat: {
+                    print(f"Uploading imatrix.dat: {os.path.abspath(output_path)}")
                     api.upload_file(
-                        path_or_fileobj=
+                        path_or_fileobj=imatrix_file,
                         path_in_repo="imatrix.dat",
                         repo_id=new_repo_id,
                     )
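For completeness, a minimal sketch of the imatrix upload step using huggingface_hub's HfApi; the token, repository id, and local path are placeholders.

import os
from huggingface_hub import HfApi

api = HfApi(token="hf_xxx")                       # placeholder token
new_repo_id = "user/Model-Name-GGUF"              # placeholder target repo
imatrix_file = "/tmp/out/Model-Name-imatrix.dat"  # placeholder local path

if os.path.isfile(imatrix_file):
    print(f"Uploading imatrix.dat: {os.path.abspath(imatrix_file)}")
    api.upload_file(
        path_or_fileobj=imatrix_file,
        path_in_repo="imatrix.dat",
        repo_id=new_repo_id,
    )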