Oleg Shulyakov committed · Commit 7deeb97
1 parent: ac04c37
Shell script
hf-quantize.sh +260 -0
hf-quantize.sh
ADDED
@@ -0,0 +1,260 @@
#!/bin/bash
#
# Shortcut for quantizing HF models
#
# Usage:
# ./hf-quantize.sh meta-llama/Llama-2-7b Q4_K_M
# ./hf-quantize.sh meta-llama/Llama-2-7b Q4_K_M true
# ./hf-quantize.sh meta-llama/Llama-2-7b Q4_K_M true Llama-2-7b-Q4_K_M.gguf
# ./hf-quantize.sh meta-llama/Llama-2-7b Q4_K_M true Llama-2-7b-Q4_K_M.gguf true 256 4G
#

# --- Configuration ---
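# Assumes the llama.cpp binaries (llama-imatrix, llama-quantize, llama-gguf-split)
# are available on PATH, e.g. from a local llama.cpp build.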

# Path to convert_hf_to_gguf.py
CONVERT_SCRIPT_PATH="./llama.cpp/convert_hf_to_gguf.py"

# Path to calibration data file for imatrix
CALIBRATION_FILE_PATH="./calibration_data_v5_rc.txt"

# --- Input Arguments ---
# Required: Hugging Face model ID (e.g., meta-llama/Llama-3.2-1B)
MODEL_ID="$1"

# Required: Quantization method (e.g., Q4_K_M, Q5_K_M, F16)
QUANT_METHOD="$2"

# Optional: "true" to use imatrix, anything else or empty for false
USE_IMATRIX="$3"

# Optional: Final GGUF filename (default: <model_name>-<quant_method>.gguf or <model_name>-<quant_method>-imat.gguf)
OUTPUT_FILENAME="$4"

# Optional: "true" to split the model, anything else or empty for false
SPLIT_MODEL="$5"

# Optional: Max tensors per shard if splitting (default: 256)
SPLIT_MAX_TENSORS="$6"

# Optional: Max size per shard if splitting (e.g., 2G) - overrides SPLIT_MAX_TENSORS if set
SPLIT_MAX_SIZE="$7"

# --- Derived Variables ---
if [ -z "$MODEL_ID" ] || [ -z "$QUANT_METHOD" ]; then
    echo "Usage: $0 <MODEL_ID> <QUANT_METHOD> [USE_IMATRIX] [OUTPUT_FILENAME] [SPLIT_MODEL] [SPLIT_MAX_TENSORS] [SPLIT_MAX_SIZE]"
    echo " $0 meta-llama/Llama-2-7b Q4_K_M"
    echo " $0 meta-llama/Llama-2-7b Q4_K_M true"
    echo " $0 meta-llama/Llama-2-7b Q4_K_M true Llama-2-7b-Q4_K_M.gguf"
    echo " $0 meta-llama/Llama-2-7b Q4_K_M true Llama-2-7b-Q4_K_M.gguf true 256 4G"
    exit 1
fi

# Extract model name from ID
MODEL_NAME=$(basename "$MODEL_ID")

# Directory to store intermediate and final files
OUTPUT_DIR="./outputs/${MODEL_NAME}"
mkdir -p "$OUTPUT_DIR"

if [ "$USE_IMATRIX" = "true" ]; then
    if [ ! -f "$CALIBRATION_FILE_PATH" ]; then
        echo "Error: Calibration file '$CALIBRATION_FILE_PATH' not found. Please provide it."
        exit 1
    fi
    if [ -z "$OUTPUT_FILENAME" ]; then
        OUTPUT_FILENAME="${MODEL_NAME}-${QUANT_METHOD}-imat.gguf"
    fi
else
    if [ -z "$OUTPUT_FILENAME" ]; then
        OUTPUT_FILENAME="${MODEL_NAME}-${QUANT_METHOD}.gguf"
    fi
fi

FP16_MODEL_PATH="$OUTPUT_DIR/${MODEL_NAME}-fp16.gguf"
IMATRIX_FILE_PATH="$OUTPUT_DIR/${MODEL_NAME}-imatrix.gguf"
QUANTIZED_MODEL_PATH="$OUTPUT_DIR/$OUTPUT_FILENAME"

echo "=== Starting GGUF Conversion Pipeline ==="
echo "Model ID: $MODEL_ID"
echo "Model Name: $MODEL_NAME"
echo "Quantization Method: $QUANT_METHOD"
echo "Use Imatrix: $USE_IMATRIX"
if [ "$USE_IMATRIX" = "true" ]; then
    echo "Calibration File: $CALIBRATION_FILE_PATH"
fi
echo "Output Directory: $OUTPUT_DIR"
echo "Final Output File: $QUANTIZED_MODEL_PATH"
echo "Split Model: $SPLIT_MODEL"
if [ "$SPLIT_MODEL" = "true" ]; then
    if [ -n "$SPLIT_MAX_SIZE" ]; then
        echo "Split Max Size: $SPLIT_MAX_SIZE"
    else
        if [ -z "$SPLIT_MAX_TENSORS" ]; then
            SPLIT_MAX_TENSORS=256
        fi
        echo "Split Max Tensors: $SPLIT_MAX_TENSORS"
    fi
fi
echo "----------------------------------------"

if [ -f "$FP16_MODEL_PATH" ]; then
    echo "FP16 model '$FP16_MODEL_PATH' already exists. Skipping conversion."
else
    # --- Step 1: Check Hugging Face Login ---
    echo "Checking Hugging Face login status..."
    if ! huggingface-cli whoami > /dev/null 2>&1; then
        echo "Error: Not logged into Hugging Face. Please run 'huggingface-cli login' first."
        exit 1
    fi
    echo "Logged in successfully."

    # --- Step 2: Download Hugging Face Model ---
    echo "Downloading model '$MODEL_ID'..."
    MODEL_DOWNLOAD_DIR="./downloads/$MODEL_NAME"
    mkdir -p "$MODEL_DOWNLOAD_DIR"

    # Determine if safetensors or bin files exist
    echo "Checking for safetensors files..."
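    # Prefer safetensors weights when the repo listing shows any; otherwise fall
    # back to the legacy .bin PyTorch checkpoints.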
    if huggingface-cli repo-files "$MODEL_ID" | grep -q '\.safetensors$'; then
        PATTERN="*.safetensors"
        echo "Found safetensors files. Downloading with pattern: $PATTERN"
    else
        PATTERN="*.bin"
        echo "No safetensors files found. Downloading with pattern: $PATTERN"
    fi

    # Download necessary files
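    # (*.md/*.json/*.model cover the README, configs and sentencepiece tokenizer;
    # $PATTERN pulls the actual weight files)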
    huggingface-cli download "$MODEL_ID" \
        --revision main \
        --include "*.md" \
        --include "*.json" \
        --include "*.model" \
        --include "$PATTERN" \
        --local-dir "$MODEL_DOWNLOAD_DIR" \
        --local-dir-use-symlinks False

    if [ $? -ne 0 ]; then
        echo "Error: Failed to download model '$MODEL_ID'."
        exit 1
    fi

    echo "Model downloaded to '$MODEL_DOWNLOAD_DIR'."

    # Check for LoRA adapter (simplified check)
    if [ -f "$MODEL_DOWNLOAD_DIR/adapter_config.json" ] && [ ! -f "$MODEL_DOWNLOAD_DIR/config.json" ]; then
        echo "Error: adapter_config.json found but no config.json. This might be a LoRA adapter. Please use GGUF-my-lora."
        exit 1
    fi

    # --- Step 3: Convert HF Model to FP16 GGUF ---
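    # f16 is used as the intermediate precision here; convert_hf_to_gguf.py also
    # accepts other --outtype values (e.g. bf16 or q8_0) if a different intermediate
    # is preferred.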
    echo "Converting Hugging Face model to FP16 GGUF..."
    python3 "$CONVERT_SCRIPT_PATH" "$MODEL_DOWNLOAD_DIR" \
        --outtype f16 \
        --outfile "$FP16_MODEL_PATH"

    if [ $? -ne 0 ]; then
        echo "Error: Failed to convert model to FP16 GGUF."
        exit 1
    fi
    echo "FP16 GGUF model created at '$FP16_MODEL_PATH'."
fi

# --- Step 4: (Optional) Generate Imatrix ---
if [ "$USE_IMATRIX" = "true" ]; then
    echo "Generating importance matrix (imatrix)..."
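    # -ngl 99 offloads as many layers as possible to the GPU when one is available;
    # --output-frequency 10 saves intermediate imatrix results every 10 chunks of
    # calibration data.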
    IMATRIX_CMD=(
        llama-imatrix
        -m "$FP16_MODEL_PATH"
        -f "$CALIBRATION_FILE_PATH"
        -ngl 99
        --output-frequency 10
        -o "$IMATRIX_FILE_PATH"
    )
    echo "Running command: ${IMATRIX_CMD[*]}"
    "${IMATRIX_CMD[@]}"

    if [ $? -ne 0 ]; then
        echo "Error: Failed to generate imatrix."
        exit 1
    fi
    echo "Imatrix generated at '$IMATRIX_FILE_PATH'."
fi

# --- Step 5: Quantize the GGUF Model ---
echo "Quantizing GGUF model..."
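# llama-quantize expects: [--imatrix <file>] <input.gguf> <output.gguf> <quant-type>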
QUANTIZE_CMD=(
    llama-quantize
)

# Add optional quantization flags
# Note: The original script has logic for --leave-output-tensor vs --output-tensor-type
# and --token-embedding-type. This script omits these for simplicity.
# You can add them back if needed, but they require more input arguments.

if [ "$USE_IMATRIX" = "true" ] && [ -f "$IMATRIX_FILE_PATH" ]; then
    QUANTIZE_CMD+=(
        --imatrix "$IMATRIX_FILE_PATH"
        "$FP16_MODEL_PATH"
        "$QUANTIZED_MODEL_PATH"
        "$QUANT_METHOD"
    )
else
    QUANTIZE_CMD+=(
        "$FP16_MODEL_PATH"
        "$QUANTIZED_MODEL_PATH"
        "$QUANT_METHOD"
    )
fi

echo "Running command: ${QUANTIZE_CMD[*]}"
"${QUANTIZE_CMD[@]}"

if [ $? -ne 0 ]; then
    echo "Error: Failed to quantize model."
    exit 1
fi
echo "Model quantized successfully to '$QUANTIZED_MODEL_PATH'."

# --- Step 6: (Optional) Split the Quantized Model ---
if [ "$SPLIT_MODEL" = "true" ]; then
    echo "Splitting quantized model..."
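    # llama-gguf-split writes shards named <prefix>-00001-of-000NN.gguf.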
    SPLIT_CMD=(
        llama-gguf-split
        --split
    )

    if [ -n "$SPLIT_MAX_SIZE" ]; then
        SPLIT_CMD+=(--split-max-size "$SPLIT_MAX_SIZE")
    else
        SPLIT_CMD+=(--split-max-tensors "$SPLIT_MAX_TENSORS")
    fi

    # Output prefix (without .gguf extension)
    OUTPUT_PREFIX="${QUANTIZED_MODEL_PATH%.gguf}"
    SPLIT_CMD+=("$QUANTIZED_MODEL_PATH" "$OUTPUT_PREFIX")

    echo "Running command: ${SPLIT_CMD[*]}"
    "${SPLIT_CMD[@]}"

    if [ $? -ne 0 ]; then
        echo "Error: Failed to split model."
        exit 1
    fi

    # Remove the original unsplit file
    if [ -f "$QUANTIZED_MODEL_PATH" ]; then
        rm "$QUANTIZED_MODEL_PATH"
        echo "Removed original unsplit file '$QUANTIZED_MODEL_PATH'."
    fi

    echo "Model split successfully. Shards are in '$OUTPUT_DIR' with prefix '$OUTPUT_PREFIX'."
else
    echo "Model splitting skipped."
fi

echo "=== GGUF Conversion Pipeline Completed Successfully ==="
if [ "$SPLIT_MODEL" = "true" ]; then
    echo "Check directory '$OUTPUT_DIR' for split GGUF files."
else
    echo "Final GGUF file is located at: $QUANTIZED_MODEL_PATH"
fi