Oleg Shulyakov committed on
Commit 7deeb97
1 Parent(s): ac04c37

Shell script

Files changed (1)
  1. hf-quantize.sh +260 -0
hf-quantize.sh ADDED
@@ -0,0 +1,260 @@
+ #!/bin/bash
+ #
+ # Shortcut for quantizing HF models
+ #
+ # Usage:
+ # ./hf-quantize.sh meta-llama/Llama-2-7b Q4_K_M
+ # ./hf-quantize.sh meta-llama/Llama-2-7b Q4_K_M true
+ # ./hf-quantize.sh meta-llama/Llama-2-7b Q4_K_M true Llama-2-7b-Q4_K_M.gguf
+ # ./hf-quantize.sh meta-llama/Llama-2-7b Q4_K_M true Llama-2-7b-Q4_K_M.gguf true 256 4G
+ #
+
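+ # Prerequisites (assumed environment; this script does not install them):
+ # - llama.cpp cloned next to this script and built, so that llama-imatrix,
+ #   llama-quantize and llama-gguf-split are available on PATH, e.g.:
+ #     git clone https://github.com/ggerganov/llama.cpp
+ #     cmake -S llama.cpp -B llama.cpp/build && cmake --build llama.cpp/build --config Release -j
+ # - python3 with llama.cpp's conversion requirements:
+ #     pip install -r llama.cpp/requirements.txt
+ # - huggingface-cli with a valid token (huggingface-cli login)
+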
+ # --- Configuration ---
+
+ # Path to convert_hf_to_gguf.py
+ CONVERT_SCRIPT_PATH="./llama.cpp/convert_hf_to_gguf.py"
+
+ # Path to calibration data file for imatrix
+ CALIBRATION_FILE_PATH="./calibration_data_v5_rc.txt"
+
+ # --- Input Arguments ---
+ # Required: Hugging Face model ID (e.g., meta-llama/Llama-3.2-1B)
+ MODEL_ID="$1"
+
+ # Required: Quantization method (e.g., Q4_K_M, Q5_K_M, F16)
+ QUANT_METHOD="$2"
+
+ # Optional: "true" to use imatrix, anything else or empty for false
+ USE_IMATRIX="$3"
+
+ # Optional: Final GGUF filename (default: <model_name>-<quant_method>.gguf or <model_name>-<quant_method>-imat.gguf)
+ OUTPUT_FILENAME="$4"
+
+ # Optional: "true" to split the model, anything else or empty for false
+ SPLIT_MODEL="$5"
+
+ # Optional: Max tensors per shard if splitting (default: 256)
+ SPLIT_MAX_TENSORS="$6"
+
+ # Optional: Max size per shard if splitting (e.g., 2G) - overrides SPLIT_MAX_TENSORS if set
+ SPLIT_MAX_SIZE="$7"
+
+ # --- Derived Variables ---
+ if [ -z "$MODEL_ID" ] || [ -z "$QUANT_METHOD" ]; then
+     echo "Usage: $0 <MODEL_ID> <QUANT_METHOD> [USE_IMATRIX] [OUTPUT_FILENAME] [SPLIT_MODEL] [SPLIT_MAX_TENSORS] [SPLIT_MAX_SIZE]"
+     echo " $0 meta-llama/Llama-2-7b Q4_K_M"
+     echo " $0 meta-llama/Llama-2-7b Q4_K_M true"
+     echo " $0 meta-llama/Llama-2-7b Q4_K_M true Llama-2-7b-Q4_K_M.gguf"
+     echo " $0 meta-llama/Llama-2-7b Q4_K_M true Llama-2-7b-Q4_K_M.gguf true 256 4G"
+     exit 1
+ fi
+
+ # Extract model name from ID
+ MODEL_NAME=$(basename "$MODEL_ID")
+
+ # Directory to store intermediate and final files
+ OUTPUT_DIR="./outputs/${MODEL_NAME}"
+ mkdir -p "$OUTPUT_DIR"
+
+ if [ "$USE_IMATRIX" = "true" ]; then
+     if [ ! -f "$CALIBRATION_FILE_PATH" ]; then
+         echo "Error: Calibration file '$CALIBRATION_FILE_PATH' not found. Please provide it."
+         exit 1
+     fi
+     if [ -z "$OUTPUT_FILENAME" ]; then
+         OUTPUT_FILENAME="${MODEL_NAME}-${QUANT_METHOD}-imat.gguf"
+     fi
+ else
+     if [ -z "$OUTPUT_FILENAME" ]; then
+         OUTPUT_FILENAME="${MODEL_NAME}-${QUANT_METHOD}.gguf"
+     fi
+ fi
+
+ FP16_MODEL_PATH="$OUTPUT_DIR/${MODEL_NAME}-fp16.gguf"
+ IMATRIX_FILE_PATH="$OUTPUT_DIR/${MODEL_NAME}-imatrix.gguf"
+ QUANTIZED_MODEL_PATH="$OUTPUT_DIR/$OUTPUT_FILENAME"
+
+ echo "=== Starting GGUF Conversion Pipeline ==="
+ echo "Model ID: $MODEL_ID"
+ echo "Model Name: $MODEL_NAME"
+ echo "Quantization Method: $QUANT_METHOD"
+ echo "Use Imatrix: $USE_IMATRIX"
+ if [ "$USE_IMATRIX" = "true" ]; then
+     echo "Calibration File: $CALIBRATION_FILE_PATH"
+ fi
+ echo "Output Directory: $OUTPUT_DIR"
+ echo "Final Output File: $QUANTIZED_MODEL_PATH"
+ echo "Split Model: $SPLIT_MODEL"
+ if [ "$SPLIT_MODEL" = "true" ]; then
+     if [ -n "$SPLIT_MAX_SIZE" ]; then
+         echo "Split Max Size: $SPLIT_MAX_SIZE"
+     else
+         if [ -z "$SPLIT_MAX_TENSORS" ]; then
+             SPLIT_MAX_TENSORS=256
+         fi
+         echo "Split Max Tensors: $SPLIT_MAX_TENSORS"
+     fi
+ fi
+ echo "----------------------------------------"
+
+ if [ -f "$FP16_MODEL_PATH" ]; then
+     echo "FP16 model '$FP16_MODEL_PATH' already exists. Skipping conversion."
+ else
+     # --- Step 1: Check Hugging Face Login ---
+     echo "Checking Hugging Face login status..."
+     if ! huggingface-cli whoami > /dev/null 2>&1; then
+         echo "Error: Not logged into Hugging Face. Please run 'huggingface-cli login' first."
+         exit 1
+     fi
+     echo "Logged in successfully."
+
+     # --- Step 2: Download Hugging Face Model ---
+     echo "Downloading model '$MODEL_ID'..."
+     MODEL_DOWNLOAD_DIR="./downloads/$MODEL_NAME"
+     mkdir -p "$MODEL_DOWNLOAD_DIR"
+
+     # Determine if safetensors or bin files exist
+     echo "Checking for safetensors files..."
+     if huggingface-cli repo-files "$MODEL_ID" | grep -q '\.safetensors$'; then
+         PATTERN="*.safetensors"
+         echo "Found safetensors files. Downloading with pattern: $PATTERN"
+     else
+         PATTERN="*.bin"
+         echo "No safetensors files found. Downloading with pattern: $PATTERN"
+     fi
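+     # Note: depending on the installed huggingface_hub version, `repo-files` may not
+     # support listing the repository contents; a rough alternative sketch using the
+     # Python API would be:
+     #   python3 -c "from huggingface_hub import list_repo_files; print('\n'.join(list_repo_files('$MODEL_ID')))"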
+
+     # Download necessary files
+     huggingface-cli download "$MODEL_ID" \
+         --revision main \
+         --include "*.md" \
+         --include "*.json" \
+         --include "*.model" \
+         --include "$PATTERN" \
+         --local-dir "$MODEL_DOWNLOAD_DIR" \
+         --local-dir-use-symlinks False
+
+     if [ $? -ne 0 ]; then
+         echo "Error: Failed to download model '$MODEL_ID'."
+         exit 1
+     fi
+
+     echo "Model downloaded to '$MODEL_DOWNLOAD_DIR'."
+
+     # Check for LoRA adapter (simplified check)
+     if [ -f "$MODEL_DOWNLOAD_DIR/adapter_config.json" ] && [ ! -f "$MODEL_DOWNLOAD_DIR/config.json" ]; then
+         echo "Error: adapter_config.json found but no config.json. This might be a LoRA adapter. Please use GGUF-my-lora."
+         exit 1
+     fi
+
+     # --- Step 3: Convert HF Model to FP16 GGUF ---
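+     # Note (assumption): convert_hf_to_gguf.py also accepts other --outtype values,
+     # e.g. a bf16 intermediate instead of f16:
+     #   python3 "$CONVERT_SCRIPT_PATH" "$MODEL_DOWNLOAD_DIR" --outtype bf16 --outfile "$OUTPUT_DIR/${MODEL_NAME}-bf16.gguf"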
+     echo "Converting Hugging Face model to FP16 GGUF..."
+     python3 "$CONVERT_SCRIPT_PATH" "$MODEL_DOWNLOAD_DIR" \
+         --outtype f16 \
+         --outfile "$FP16_MODEL_PATH"
+
+     if [ $? -ne 0 ]; then
+         echo "Error: Failed to convert model to FP16 GGUF."
+         exit 1
+     fi
+     echo "FP16 GGUF model created at '$FP16_MODEL_PATH'."
+ fi
+
+ # --- Step 4: (Optional) Generate Imatrix ---
+ if [ "$USE_IMATRIX" = "true" ]; then
+     echo "Generating importance matrix (imatrix)..."
+     IMATRIX_CMD=(
+         llama-imatrix
+         -m "$FP16_MODEL_PATH"
+         -f "$CALIBRATION_FILE_PATH"
+         -ngl 99
+         --output-frequency 10
+         -o "$IMATRIX_FILE_PATH"
+     )
+     echo "Running command: ${IMATRIX_CMD[*]}"
+     "${IMATRIX_CMD[@]}"
+
+     if [ $? -ne 0 ]; then
+         echo "Error: Failed to generate imatrix."
+         exit 1
+     fi
+     echo "Imatrix generated at '$IMATRIX_FILE_PATH'."
+ fi
+
+ # --- Step 5: Quantize the GGUF Model ---
+ echo "Quantizing GGUF model..."
+ QUANTIZE_CMD=(
+     llama-quantize
+ )
+
+ # Add optional quantization flags
+ # Note: The original script has logic for --leave-output-tensor vs --output-tensor-type
+ # and --token-embedding-type. This script omits these for simplicity.
+ # You can add them back if needed, but they require more input arguments.
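+ # A rough sketch of what that could look like (flag values are only examples):
+ #   QUANTIZE_CMD+=(--leave-output-tensor)
+ #   # or, alternatively, pin specific tensor types:
+ #   QUANTIZE_CMD+=(--output-tensor-type f16 --token-embedding-type f16)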
+
+ if [ "$USE_IMATRIX" = "true" ] && [ -f "$IMATRIX_FILE_PATH" ]; then
+     QUANTIZE_CMD+=(
+         --imatrix "$IMATRIX_FILE_PATH"
+         "$FP16_MODEL_PATH"
+         "$QUANTIZED_MODEL_PATH"
+         "$QUANT_METHOD"
+     )
+ else
+     QUANTIZE_CMD+=(
+         "$FP16_MODEL_PATH"
+         "$QUANTIZED_MODEL_PATH"
+         "$QUANT_METHOD"
+     )
+ fi
+
+ echo "Running command: ${QUANTIZE_CMD[*]}"
+ "${QUANTIZE_CMD[@]}"
+
+ if [ $? -ne 0 ]; then
+     echo "Error: Failed to quantize model."
+     exit 1
+ fi
+ echo "Model quantized successfully to '$QUANTIZED_MODEL_PATH'."
+
+ # --- Step 6: (Optional) Split the Quantized Model ---
+ if [ "$SPLIT_MODEL" = "true" ]; then
+     echo "Splitting quantized model..."
+     SPLIT_CMD=(
+         llama-gguf-split
+         --split
+     )
+
+     if [ -n "$SPLIT_MAX_SIZE" ]; then
+         SPLIT_CMD+=(--split-max-size "$SPLIT_MAX_SIZE")
+     else
+         SPLIT_CMD+=(--split-max-tensors "$SPLIT_MAX_TENSORS")
+     fi
+
+     # Output prefix (without .gguf extension)
+     OUTPUT_PREFIX="${QUANTIZED_MODEL_PATH%.gguf}"
+     SPLIT_CMD+=("$QUANTIZED_MODEL_PATH" "$OUTPUT_PREFIX")
+
+     echo "Running command: ${SPLIT_CMD[*]}"
+     "${SPLIT_CMD[@]}"
+
+     if [ $? -ne 0 ]; then
+         echo "Error: Failed to split model."
+         exit 1
+     fi
+
+     # Remove the original unsplit file
+     if [ -f "$QUANTIZED_MODEL_PATH" ]; then
+         rm "$QUANTIZED_MODEL_PATH"
+         echo "Removed original unsplit file '$QUANTIZED_MODEL_PATH'."
+     fi
+
+     echo "Model split successfully. Shards are in '$OUTPUT_DIR' with prefix '$OUTPUT_PREFIX'."
+ else
+     echo "Model splitting skipped."
+ fi
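+ # Note: llama-gguf-split names the shards "<prefix>-00001-of-000NN.gguf"; llama.cpp
+ # tools can load the model by pointing at the first shard (e.g. llama-cli -m <first shard>),
+ # and shards can be recombined with "llama-gguf-split --merge" if needed.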
+
+ echo "=== GGUF Conversion Pipeline Completed Successfully ==="
+ if [ "$SPLIT_MODEL" = "true" ]; then
+     echo "Check directory '$OUTPUT_DIR' for split GGUF files."
+ else
+     echo "Final GGUF file is located at: $QUANTIZED_MODEL_PATH"
+ fi
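+
+ # Optional follow-up (assumption, not part of this script): push the results to a
+ # GGUF repo on the Hub, e.g.:
+ #   huggingface-cli upload <your-username>/${MODEL_NAME}-GGUF "$OUTPUT_DIR"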