Oleg Shulyakov committed · Commit e395b9b · 1 Parent: 55ecc95
Move quantization to separate method
app.py CHANGED

@@ -189,7 +189,45 @@ def download_base_model(token: str, model_id: str, outdir: tempfile.TemporaryDir
 
     return fp16_model
 
-def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_repo, train_data_file, split_model, split_max_tensors, split_max_size, repo_name, gguf_name, oauth_token):
+def quantize_model(outdir: tempfile.TemporaryDirectory, gguf_name: str, fp16, q_method: str, use_imatrix: bool, imatrix_q_method: str, imatrix_path: str):
+    if use_imatrix:
+        if train_data_file:
+            train_data_path = train_data_file.name
+        else:
+            train_data_path = "train_data.txt" #fallback calibration dataset
+
+        print(f"Training data file path: {train_data_path}")
+
+        if not os.path.isfile(train_data_path):
+            raise Exception(f"Training data file not found: {train_data_path}")
+
+        generate_importance_matrix(fp16, train_data_path, imatrix_path)
+    else:
+        print("Not using imatrix quantization.")
+
+    # Quantize the model
+    quantized_gguf = str(Path(outdir)/gguf_name)
+    if use_imatrix:
+        quantize_cmd = [
+            "llama-quantize",
+            "--imatrix", imatrix_path, fp16, quantized_gguf, imatrix_q_method
+        ]
+    else:
+        quantize_cmd = [
+            "llama-quantize",
+            fp16, quantized_gguf, q_method
+        ]
+
+    result = subprocess.run(quantize_cmd, shell=False, capture_output=True)
+    if result.returncode != 0:
+        stderr_str = result.stderr.decode("utf-8")
+        raise Exception(f"Error quantizing: {stderr_str}")
+
+    print(f"Quantized successfully with {imatrix_q_method if use_imatrix else q_method} option!")
+    print(f"Quantized model path: {os.path.abspath(quantized_gguf)}")
+    return quantized_gguf
+
+def process_model(model_id: str, q_method: str, use_imatrix: bool, imatrix_q_method: str, private_repo: bool, train_data_file, split_model: bool, split_max_tensors, split_max_size: str | None, repo_name: str, gguf_name: str, oauth_token: gr.OAuthToken | None):
     # validate the oauth token
     if is_valid_token(oauth_token) is False:
         raise gr.Error("You must be logged in to use GGUF-my-repo")
@@ -201,42 +239,8 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
     try:
         with tempfile.TemporaryDirectory(dir=OUTPUT_FOLDER) as outdir:
             fp16 = download_base_model(oauth_token.token, model_id, outdir)
-
             imatrix_path = Path(outdir)/"imatrix.dat"
-
-            if use_imatrix:
-                if train_data_file:
-                    train_data_path = train_data_file.name
-                else:
-                    train_data_path = "train_data.txt" #fallback calibration dataset
-
-                print(f"Training data file path: {train_data_path}")
-
-                if not os.path.isfile(train_data_path):
-                    raise Exception(f"Training data file not found: {train_data_path}")
-
-                generate_importance_matrix(fp16, train_data_path, imatrix_path)
-            else:
-                print("Not using imatrix quantization.")
-
-            # Quantize the model
-            quantized_gguf_path = str(Path(outdir)/gguf_name)
-            if use_imatrix:
-                quantise_ggml = [
-                    "llama-quantize",
-                    "--imatrix", imatrix_path, fp16, quantized_gguf_path, imatrix_q_method
-                ]
-            else:
-                quantise_ggml = [
-                    "llama-quantize",
-                    fp16, quantized_gguf_path, q_method
-                ]
-            result = subprocess.run(quantise_ggml, shell=False, capture_output=True)
-            if result.returncode != 0:
-                stderr_str = result.stderr.decode("utf-8")
-                raise Exception(f"Error quantizing: {stderr_str}")
-            print(f"Quantized successfully with {imatrix_q_method if use_imatrix else q_method} option!")
-            print(f"Quantized model path: {os.path.abspath(quantized_gguf_path)}")
+            quantized_gguf = quantize_model(outdir, gguf_name, fp16, q_method, use_imatrix, imatrix_q_method, imatrix_path)
 
             # Create empty repo
             api = HfApi(token=oauth_token.token)
@@ -312,12 +316,12 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
         card.save(readme_path)
 
         if split_model:
-            split_upload_model(str(quantized_gguf_path), outdir, new_repo_id, oauth_token, split_max_tensors, split_max_size)
+            split_upload_model(str(quantized_gguf), outdir, new_repo_id, oauth_token, split_max_tensors, split_max_size)
         else:
             try:
-                print(f"Uploading quantized model: {quantized_gguf_path}")
+                print(f"Uploading quantized model: {quantized_gguf}")
                 api.upload_file(
-                    path_or_fileobj=quantized_gguf_path,
+                    path_or_fileobj=quantized_gguf,
                     path_in_repo=gguf_name,
                     repo_id=new_repo_id,
                 )
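
For context, the command the new quantize_model helper runs is plain llama-quantize from llama.cpp. The sketch below is a minimal, self-contained version of that step, mirroring the arguments used in the diff. The paths and names in it are hypothetical placeholders, and it assumes any importance matrix was already generated beforehand (in the diff, quantize_model still reads train_data_file from the enclosing scope for that part, so the calibration step is omitted here).

# Minimal sketch of the quantization step this commit factors out.
# It mirrors the llama-quantize invocation in quantize_model above;
# all paths and names below are hypothetical placeholders.
import os
import subprocess
from pathlib import Path

def quantize(outdir: str, gguf_name: str, fp16: str, q_method: str,
             use_imatrix: bool = False, imatrix_q_method: str = "",
             imatrix_path: str = "") -> str:
    quantized_gguf = str(Path(outdir) / gguf_name)
    if use_imatrix:
        # imatrix branch: reuse a previously generated importance matrix
        cmd = ["llama-quantize", "--imatrix", imatrix_path,
               fp16, quantized_gguf, imatrix_q_method]
    else:
        cmd = ["llama-quantize", fp16, quantized_gguf, q_method]
    # An argument list with shell=False keeps user-supplied names out of a shell
    result = subprocess.run(cmd, shell=False, capture_output=True)
    if result.returncode != 0:
        raise Exception(f"Error quantizing: {result.stderr.decode('utf-8')}")
    return os.path.abspath(quantized_gguf)

# Hypothetical usage:
# quantize("/tmp/out", "model-Q4_K_M.gguf", "/tmp/out/model-f16.gguf", "Q4_K_M")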