Spaces:

ggml-org
/

gguf-my-repo

Running on A10G

App Files Files Community

190

reach-vb HF Staff committed on Mar 27, 2024

Commit

804c57e

1 Parent(s): f749def

Update app.py (#10)

Browse files

- Update app.py (82b293ec8b0c6ef5e44c1bd6bc4e5c6ccbed7b97)

Files changed (1) hide show

app.py +34 -11

app.py CHANGED Viewed

@@ -11,25 +11,41 @@ from huggingface_hub import ModelCard
 from textwrap import dedent
-api = HfApi()
 def process_model(model_id, q_method, hf_token):
     MODEL_NAME = model_id.split('/')[-1]
     fp16 = f"{MODEL_NAME}/{MODEL_NAME.lower()}.fp16.bin"
     username = whoami(hf_token)["name"]
     snapshot_download(repo_id=model_id, local_dir = f"{MODEL_NAME}", local_dir_use_symlinks=False)
     print("Model downloaded successully!")
-    fp16_conversion = f"python llama.cpp/convert.py {MODEL_NAME} --outtype f16 --outfile {fp16}"
-    subprocess.run(fp16_conversion, shell=True)
     print("Model converted to fp16 successully!")
     qtype = f"{MODEL_NAME}/{MODEL_NAME.lower()}.{q_method.upper()}.gguf"
     quantise_ggml = f"./llama.cpp/quantize {fp16} {qtype} {q_method}"
-    subprocess.run(quantise_ggml, shell=True)
     print("Quantised successfully!")
     # Create empty repo
@@ -40,8 +56,7 @@ def process_model(model_id, q_method, hf_token):
         exist_ok=True,
         token=hf_token
     )
-    print("Empty repo created successfully!")
     card = ModelCard.load(model_id)
     card.data.tags = ["llama-cpp"] if card.data.tags is None else card.data.tags + ["llama-cpp"]
@@ -59,6 +74,10 @@ def process_model(model_id, q_method, hf_token):
         ```bash
         llama-cli --hf-repo {repo_id} --model {qtype.split("/")[-1]} -p "The meaning to life and the universe is "
         ```
         """
     )
     card.save(os.path.join(MODEL_NAME, "README-new.md"))
@@ -93,17 +112,21 @@ iface = gr.Interface(
         gr.Textbox(
             lines=1,
             label="Hub Model ID",
-            info="Model repo ID"
         ),
         gr.Dropdown(
             ["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M", "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0"],
             label="Quantization Method",
-            info="GGML quantisation type"
         ),
         gr.Textbox(
             lines=1,
             label="HF Write Token",
-            info="https://hf.co/settings/token"
         )
     ],
     outputs=[

 from textwrap import dedent
# Architectures for which script_to_use() selects "convert.py"; any other
# architecture is routed to "convert-hf-to-gguf.py".
LLAMA_LIKE_ARCHS = ["MistralForCausalLM", "LlamaForCausalLM"]


def script_to_use(model_id, api):
    """Return the file name of the conversion script to run for a Hub model.

    The choice is driven by the first entry of the ``architectures`` list in
    the config returned by ``api.model_info(model_id)``.

    Args:
        model_id: Hub repo ID passed straight to ``api.model_info``.
        api: Object exposing ``model_info(model_id)`` whose result has a
            ``config`` attribute (a mapping or ``None``).

    Returns:
        ``"convert.py"`` when the first architecture is listed in
        ``LLAMA_LIKE_ARCHS``, ``"convert-hf-to-gguf.py"`` for any other
        architecture, or ``None`` when the model has no config or declares
        no architecture.
    """
    info = api.model_info(model_id)
    config = info.config
    if config is None:
        return None

    architectures = config.get("architectures")
    # Treat an empty list like a missing one: indexing it would raise
    # IndexError instead of reporting "architecture unknown" via None.
    if not architectures:
        return None

    if architectures[0] in LLAMA_LIKE_ARCHS:
        return "convert.py"
    return "convert-hf-to-gguf.py"
 def process_model(model_id, q_method, hf_token):
     MODEL_NAME = model_id.split('/')[-1]
     fp16 = f"{MODEL_NAME}/{MODEL_NAME.lower()}.fp16.bin"
+    api = HfApi(token=hf_token)
     username = whoami(hf_token)["name"]
     snapshot_download(repo_id=model_id, local_dir = f"{MODEL_NAME}", local_dir_use_symlinks=False)
     print("Model downloaded successully!")
+    conversion_script = script_to_use(model_id, api)
+    fp16_conversion = f"python llama.cpp/{conversion_script} {MODEL_NAME} --outtype f16 --outfile {fp16}"
+    result = subprocess.run(fp16_conversion, shell=True, capture_output=True)
+    if result.returncode != 0:
+        return (f"Error converting to fp16: {result.stderr}", "error.png")
     print("Model converted to fp16 successully!")
     qtype = f"{MODEL_NAME}/{MODEL_NAME.lower()}.{q_method.upper()}.gguf"
     quantise_ggml = f"./llama.cpp/quantize {fp16} {qtype} {q_method}"
+    result = subprocess.run(quantise_ggml, shell=True, capture_output=True)
+    if result.returncode != 0:
+        return (f"Error quantizing: {result.stderr}", "error.png")
     print("Quantised successfully!")
     # Create empty repo
         exist_ok=True,
         token=hf_token
     )
+    print("Repo created successfully!")
     card = ModelCard.load(model_id)
     card.data.tags = ["llama-cpp"] if card.data.tags is None else card.data.tags + ["llama-cpp"]
         ```bash
         llama-cli --hf-repo {repo_id} --model {qtype.split("/")[-1]} -p "The meaning to life and the universe is "
         ```
+        ```bash
+        llama-server --hf-repo {repo_id} --model {qtype.split("/")[-1]} -c 2048
+        ```
         """
     )
     card.save(os.path.join(MODEL_NAME, "README-new.md"))
         gr.Textbox(
             lines=1,
             label="Hub Model ID",
+            info="Model repo ID",
+            placeholder="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+            value="TinyLlama/TinyLlama-1.1B-Chat-v1.0"
         ),
         gr.Dropdown(
             ["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M", "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0"],
             label="Quantization Method",
+            info="GGML quantisation type",
+            value="Q4_K_M",
         ),
         gr.Textbox(
             lines=1,
             label="HF Write Token",
+            info="https://hf.co/settings/token",
+            type="password",
         )
     ],
     outputs=[