#!/bin/bash
#
# Shortcut for quantizing HF models using named parameters and short options
#
# Usage with long options:
# ./ggufy.sh --model meta-llama/Llama-2-7b --quant-method Q4_K_M
# ./ggufy.sh --model meta-llama/Llama-2-7b --quant-method Q4_K_M --use-imatrix
# ./ggufy.sh --model meta-llama/Llama-2-7b --quant-method Q4_K_M --use-imatrix --output-filename Llama-2-7b-Q4_K_M.gguf
# ./ggufy.sh --model meta-llama/Llama-2-7b --quant-method Q4_K_M --use-imatrix --output-filename Llama-2-7b-Q4_K_M.gguf --split-model --split-max-tensors 256 --split-max-size 4G
#
# Usage with short options:
# ./ggufy.sh -m meta-llama/Llama-2-7b -q Q4_K_M
# ./ggufy.sh -m meta-llama/Llama-2-7b -q Q4_K_M -imatrix
# ./ggufy.sh -m meta-llama/Llama-2-7b -q Q4_K_M -imatrix -o Llama-2-7b-Q4_K_M.gguf
# ./ggufy.sh -m meta-llama/Llama-2-7b -q Q4_K_M -imatrix -o Llama-2-7b-Q4_K_M.gguf -split --split-max-tensors 256 --split-max-size 4G
#
# --- Configuration ---
# Path to convert_hf_to_gguf.py
CONVERT_SCRIPT_PATH="./llama.cpp/convert_hf_to_gguf.py"
# Path to the plain-text calibration data file used for imatrix generation
CALIBRATION_FILE_PATH="./calibration_data_v5_rc.txt"
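#
# NOTE (assumption): the steps below call convert_hf_to_gguf.py plus the
# llama-imatrix, llama-quantize, and llama-gguf-split binaries, all of which
# ship with llama.cpp and are assumed to be built and on PATH.
# One possible setup, adjust to your environment:
#
#   git clone https://github.com/ggml-org/llama.cpp
#   pip install -r llama.cpp/requirements.txt   # dependencies for convert_hf_to_gguf.py
#   cmake -S llama.cpp -B llama.cpp/build && cmake --build llama.cpp/build --config Release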
# --- Input Arguments ---
# Required: Hugging Face model ID (e.g., meta-llama/Llama-3.2-1B)
MODEL_ID=""
# Required: Quantization method (e.g., Q4_K_M, Q5_K_M, F16)
QUANT_METHOD=""
# Optional: "true" to use imatrix, anything else or empty for false
USE_IMATRIX="false"
# Optional: Final GGUF filename (default: <model_name>-<quant_method>.gguf)
OUTPUT_FILENAME=""
# Optional: "true" to split the model, anything else or empty for false
SPLIT_MODEL="false"
# Optional: Max tensors per shard if splitting (default: 256)
SPLIT_MAX_TENSORS="256"
# Optional: Max size per shard if splitting (e.g., 2G) - overrides SPLIT_MAX_TENSORS if set
SPLIT_MAX_SIZE=""
# Optional: Quantization type for the token embeddings tensor (passed to --token-embedding-type)
TOKEN_EMBEDDING_TYPE=""
# Optional: "true" to keep the output tensor unquantized (--leave-output-tensor)
LEAVE_OUTPUT_TENSOR="false"
# Optional: Quantization type for the output tensor (--output-tensor-type); ignored if LEAVE_OUTPUT_TENSOR is "true"
OUTPUT_TENSOR_TYPE=""
# --- Parse Named Arguments ---
while [[ $# -gt 0 ]]; do
  case $1 in
    -m|--model)
      MODEL_ID="$2"
      shift 2
      ;;
    -q|--quant-method)
      QUANT_METHOD="$2"
      shift 2
      ;;
    -imatrix|--use-imatrix)
      USE_IMATRIX="true"
      shift 1
      ;;
    -o|--output-filename)
      OUTPUT_FILENAME="$2"
      shift 2
      ;;
    -split|--split-model)
      SPLIT_MODEL="true"
      shift 1
      ;;
    --split-max-tensors)
      SPLIT_MAX_TENSORS="$2"
      shift 2
      ;;
    --split-max-size)
      SPLIT_MAX_SIZE="$2"
      shift 2
      ;;
    --token-embedding-type)
      TOKEN_EMBEDDING_TYPE="$2"
      shift 2
      ;;
    --leave-output-tensor)
      LEAVE_OUTPUT_TENSOR="true"
      shift 1
      ;;
    --output-tensor-type)
      OUTPUT_TENSOR_TYPE="$2"
      shift 2
      ;;
    -h|--help)
      echo "Usage:"
      echo "  Long options:"
      echo "    $0 --model <MODEL_ID> --quant-method <QUANT_METHOD> [--use-imatrix] [--output-filename <FILENAME>] [--split-model] [--split-max-tensors <NUM>] [--split-max-size <SIZE>] [--token-embedding-type <QUANT_METHOD>] [--leave-output-tensor] [--output-tensor-type <QUANT_METHOD>]"
      echo ""
      echo "  Short options:"
      echo "    $0 -m <MODEL_ID> -q <QUANT_METHOD> [-imatrix] [-o <FILENAME>] [-split] [--split-max-tensors <NUM>] [--split-max-size <SIZE>]"
      echo ""
      echo "Examples:"
      echo "  $0 --model meta-llama/Llama-2-7b --quant-method Q4_K_M"
      echo "  $0 -m meta-llama/Llama-2-7b -q Q4_K_M -imatrix"
      echo "  $0 --model meta-llama/Llama-2-7b --quant-method Q4_K_M --use-imatrix --output-filename Llama-2-7b-Q4_K_M.gguf"
      echo "  $0 -m meta-llama/Llama-2-7b -q Q4_K_M -imatrix -o Llama-2-7b-Q4_K_M.gguf -split --split-max-tensors 256 --split-max-size 4G"
      exit 0
      ;;
    *)
      echo "Unknown option: $1"
      echo "Use --help or -h for usage information."
      exit 1
      ;;
  esac
done
# --- Validation ---
if [ -z "$MODEL_ID" ] || [ -z "$QUANT_METHOD" ]; then
  echo "Error: Both --model (-m) and --quant-method (-q) are required."
  echo
  echo "Use --help or -h for usage information."
  exit 1
fi
# --- Derived Variables ---
# Extract model name from ID
MODEL_NAME=$(basename "$MODEL_ID")
# Directory to store intermediate and final files
OUTPUT_DIR="./outputs/${MODEL_NAME}"
mkdir -p "$OUTPUT_DIR"
if [ "$USE_IMATRIX" = "true" ]; then
  if [ ! -f "$CALIBRATION_FILE_PATH" ]; then
    echo "Error: Calibration file '$CALIBRATION_FILE_PATH' not found. Please provide it."
    exit 1
  fi
fi
if [ -z "$OUTPUT_FILENAME" ]; then
  OUTPUT_FILENAME="${MODEL_NAME}-${QUANT_METHOD}.gguf"
fi
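# Example: -m meta-llama/Llama-3.2-1B -q Q4_K_M with no -o yields "Llama-3.2-1B-Q4_K_M.gguf".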
FP16_MODEL_PATH="$OUTPUT_DIR/${MODEL_NAME}-fp16.gguf"
IMATRIX_FILE_PATH="$OUTPUT_DIR/${MODEL_NAME}-imatrix.gguf"
QUANTIZED_MODEL_PATH="$OUTPUT_DIR/$OUTPUT_FILENAME"
echo "=== Starting GGUF Conversion Pipeline ==="
echo "Model ID: $MODEL_ID"
echo "Model Name: $MODEL_NAME"
echo "Quantization Method: $QUANT_METHOD"
echo "Use Imatrix: $USE_IMATRIX"
if [ "$USE_IMATRIX" = "true" ]; then
  echo "Calibration File: $CALIBRATION_FILE_PATH"
fi
echo "Output Directory: $OUTPUT_DIR"
echo "Final Output File: $QUANTIZED_MODEL_PATH"
echo "Split Model: $SPLIT_MODEL"
if [ "$SPLIT_MODEL" = "true" ]; then
  if [ -n "$SPLIT_MAX_SIZE" ]; then
    echo "Split Max Size: $SPLIT_MAX_SIZE"
  else
    if [ -z "$SPLIT_MAX_TENSORS" ]; then
      SPLIT_MAX_TENSORS=256
    fi
    echo "Split Max Tensors: $SPLIT_MAX_TENSORS"
  fi
fi
echo "----------------------------------------"
if [ -f "$FP16_MODEL_PATH" ]; then | |
echo "FP16 model '$FP16_MODEL_PATH' already exists. Skipping conversion." | |
else | |
# --- Step 1: Check Hugging Face Login --- | |
echo "Checking Hugging Face login status..." | |
if ! hf auth whoami > /dev/null 2>&1; then | |
echo "Error: Not logged into Hugging Face. Please run 'hf auth login' first." | |
exit 1 | |
fi | |
echo "Logged in successfully." | |
# --- Step 2: Download Hugging Face Model --- | |
echo "Downloading model '$MODEL_ID'..." | |
MODEL_DOWNLOAD_DIR="./downloads/$MODEL_NAME" | |
mkdir -p "$MODEL_DOWNLOAD_DIR" | |
# Download necessary files | |
hf download "$MODEL_ID" \ | |
--revision main \ | |
--include "*.md" \ | |
--include "*.json" \ | |
--include "*.model" \ | |
--include "*.safetensors" \ | |
--include "*.bin" \ | |
--local-dir "$MODEL_DOWNLOAD_DIR" | |
if [ $? -ne 0 ]; then | |
echo "Error: Failed to download model '$MODEL_ID'." | |
rm -rf "$MODEL_DOWNLOAD_DIR" | |
exit 1 | |
fi | |
echo "Model downloaded to '$MODEL_DOWNLOAD_DIR'." | |
# Check for LoRA adapter (simplified check) | |
if [ -f "$MODEL_DOWNLOAD_DIR/adapter_config.json" ] && [ ! -f "$MODEL_DOWNLOAD_DIR/config.json" ]; then | |
echo "Error: adapter_config.json found but no config.json. This might be a LoRA adapter. Please use GGUF-my-lora." | |
exit 1 | |
fi | |
# --- Step 3: Convert HF Model to FP16 GGUF --- | |
echo "Converting Hugging Face model to FP16 GGUF..." | |
python3 "$CONVERT_SCRIPT_PATH" "$MODEL_DOWNLOAD_DIR" \ | |
--outtype f16 \ | |
--outfile "$FP16_MODEL_PATH" | |
if [ $? -ne 0 ]; then | |
echo "Error: Failed to convert model to FP16 GGUF." | |
rm -f "$FP16_MODEL_PATH" | |
exit 1 | |
fi | |
echo "FP16 GGUF model created at '$FP16_MODEL_PATH'." | |
fi | |
# --- Step 4: (Optional) Generate Imatrix --- | |
if [ "$USE_IMATRIX" = "true" ]; then | |
if [ -f "$IMATRIX_FILE_PATH" ]; then | |
echo "Imatrix file '$IMATRIX_FILE_PATH' already exists. Skipping generation." | |
else | |
echo "Generating importance matrix (imatrix)..." | |
    IMATRIX_CMD=(
      llama-imatrix
      -m "$FP16_MODEL_PATH"
      -f "$CALIBRATION_FILE_PATH"
      -ngl 99
      --output-frequency 10
      -o "$IMATRIX_FILE_PATH"
    )
    echo "Running command: ${IMATRIX_CMD[*]}"
    "${IMATRIX_CMD[@]}"
    if [ $? -ne 0 ]; then
      echo "Error: Failed to generate imatrix."
      rm -f "$IMATRIX_FILE_PATH"
      exit 1
    fi
    echo "Imatrix generated at '$IMATRIX_FILE_PATH'."
  fi
fi
# --- Step 5: Quantize the GGUF Model ---
echo "Quantizing GGUF model..."
QUANTIZE_CMD=(
  llama-quantize
)
if [ "$USE_IMATRIX" = "true" ] && [ -f "$IMATRIX_FILE_PATH" ]; then
  QUANTIZE_CMD+=(
    --imatrix "$IMATRIX_FILE_PATH"
  )
fi
if [ -n "$TOKEN_EMBEDDING_TYPE" ]; then
  QUANTIZE_CMD+=(
    --token-embedding-type "$TOKEN_EMBEDDING_TYPE"
  )
fi
if [ "$LEAVE_OUTPUT_TENSOR" = "true" ]; then
  QUANTIZE_CMD+=(
    --leave-output-tensor
  )
else
  if [ -n "$OUTPUT_TENSOR_TYPE" ]; then
    QUANTIZE_CMD+=(
      --output-tensor-type "$OUTPUT_TENSOR_TYPE"
    )
  fi
fi
QUANTIZE_CMD+=(
  "$FP16_MODEL_PATH"
  "$QUANTIZED_MODEL_PATH"
  "$QUANT_METHOD"
)
echo "Running command: ${QUANTIZE_CMD[*]}"
"${QUANTIZE_CMD[@]}"
if [ $? -ne 0 ]; then
  echo "Error: Failed to quantize model."
  rm -f "$QUANTIZED_MODEL_PATH"
  exit 1
fi
echo "Model quantized successfully to '$QUANTIZED_MODEL_PATH'."
# --- Step 6: (Optional) Split the Quantized Model ---
if [ "$SPLIT_MODEL" = "true" ]; then
  echo "Splitting quantized model..."
  SPLIT_CMD=(
    llama-gguf-split
    --split
  )
  if [ -n "$SPLIT_MAX_SIZE" ]; then
    SPLIT_CMD+=(--split-max-size "$SPLIT_MAX_SIZE")
  else
    SPLIT_CMD+=(--split-max-tensors "$SPLIT_MAX_TENSORS")
  fi
  # Output prefix (without .gguf extension)
  OUTPUT_PREFIX="${QUANTIZED_MODEL_PATH%.gguf}"
  SPLIT_CMD+=("$QUANTIZED_MODEL_PATH" "$OUTPUT_PREFIX")
  echo "Running command: ${SPLIT_CMD[*]}"
  "${SPLIT_CMD[@]}"
  if [ $? -ne 0 ]; then
    echo "Error: Failed to split model."
    exit 1
  fi
  # Remove the original unsplit file
  if [ -f "$QUANTIZED_MODEL_PATH" ]; then
    rm "$QUANTIZED_MODEL_PATH"
    echo "Removed original unsplit file '$QUANTIZED_MODEL_PATH'."
  fi
  echo "Model split successfully. Shards are in '$OUTPUT_DIR' with prefix '$OUTPUT_PREFIX'."
else
  echo "Model splitting skipped."
fi
echo "=== GGUF Conversion Pipeline Completed Successfully ==="
if [ "$SPLIT_MODEL" = "true" ]; then | |
echo "Check directory '$OUTPUT_DIR' for split GGUF files." | |
else | |
echo "Final GGUF file is located at: $QUANTIZED_MODEL_PATH" | |
fi |