Oleg Shulyakov committed · Commit e395b9b · 1 Parent: 55ecc95
Move quantization to separate method
app.py CHANGED

@@ -189,7 +189,45 @@ def download_base_model(token: str, model_id: str, outdir: tempfile.TemporaryDir
 
     return fp16_model
 
-def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_repo, train_data_file, split_model, split_max_tensors, split_max_size, repo_name, gguf_name, oauth_token):
+def quantize_model(outdir: tempfile.TemporaryDirectory, gguf_name: str, fp16, q_method: str, use_imatrix: bool, imatrix_q_method: str, imatrix_path: str):
+    if use_imatrix:
+        if train_data_file:
+            train_data_path = train_data_file.name
+        else:
+            train_data_path = "train_data.txt" #fallback calibration dataset
+
+        print(f"Training data file path: {train_data_path}")
+
+        if not os.path.isfile(train_data_path):
+            raise Exception(f"Training data file not found: {train_data_path}")
+
+        generate_importance_matrix(fp16, train_data_path, imatrix_path)
+    else:
+        print("Not using imatrix quantization.")
+
+    # Quantize the model
+    quantized_gguf = str(Path(outdir)/gguf_name)
+    if use_imatrix:
+        quantize_cmd = [
+            "llama-quantize",
+            "--imatrix", imatrix_path, fp16, quantized_gguf, imatrix_q_method
+        ]
+    else:
+        quantize_cmd = [
+            "llama-quantize",
+            fp16, quantized_gguf, q_method
+        ]
+
+    result = subprocess.run(quantize_cmd, shell=False, capture_output=True)
+    if result.returncode != 0:
+        stderr_str = result.stderr.decode("utf-8")
+        raise Exception(f"Error quantizing: {stderr_str}")
+
+    print(f"Quantized successfully with {imatrix_q_method if use_imatrix else q_method} option!")
+    print(f"Quantized model path: {os.path.abspath(quantized_gguf)}")
+    return quantized_gguf
+
+def process_model(model_id: str, q_method: str, use_imatrix: bool, imatrix_q_method: str, private_repo: bool, train_data_file, split_model: bool, split_max_tensors, split_max_size: str | None, repo_name: str, gguf_name: str, oauth_token: gr.OAuthToken | None):
     # validate the oauth token
     if is_valid_token(oauth_token) is False:
         raise gr.Error("You must be logged in to use GGUF-my-repo")
@@ -201,42 +239,8 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
     try:
         with tempfile.TemporaryDirectory(dir=OUTPUT_FOLDER) as outdir:
             fp16 = download_base_model(oauth_token.token, model_id, outdir)
-
             imatrix_path = Path(outdir)/"imatrix.dat"
-
-            if use_imatrix:
-                if train_data_file:
-                    train_data_path = train_data_file.name
-                else:
-                    train_data_path = "train_data.txt" #fallback calibration dataset
-
-                print(f"Training data file path: {train_data_path}")
-
-                if not os.path.isfile(train_data_path):
-                    raise Exception(f"Training data file not found: {train_data_path}")
-
-                generate_importance_matrix(fp16, train_data_path, imatrix_path)
-            else:
-                print("Not using imatrix quantization.")
-
-            # Quantize the model
-            quantized_gguf_path = str(Path(outdir)/gguf_name)
-            if use_imatrix:
-                quantise_ggml = [
-                    "llama-quantize",
-                    "--imatrix", imatrix_path, fp16, quantized_gguf_path, imatrix_q_method
-                ]
-            else:
-                quantise_ggml = [
-                    "llama-quantize",
-                    fp16, quantized_gguf_path, q_method
-                ]
-            result = subprocess.run(quantise_ggml, shell=False, capture_output=True)
-            if result.returncode != 0:
-                stderr_str = result.stderr.decode("utf-8")
-                raise Exception(f"Error quantizing: {stderr_str}")
-            print(f"Quantized successfully with {imatrix_q_method if use_imatrix else q_method} option!")
-            print(f"Quantized model path: {os.path.abspath(quantized_gguf_path)}")
+            quantized_gguf = quantize_model(outdir, gguf_name, fp16, q_method, use_imatrix, imatrix_q_method, imatrix_path)
 
             # Create empty repo
             api = HfApi(token=oauth_token.token)
@@ -312,12 +316,12 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
         card.save(readme_path)
 
         if split_model:
-            split_upload_model(str(quantized_gguf_path), outdir, new_repo_id, oauth_token, split_max_tensors, split_max_size)
+            split_upload_model(str(quantized_gguf), outdir, new_repo_id, oauth_token, split_max_tensors, split_max_size)
         else:
             try:
-                print(f"Uploading quantized model: {quantized_gguf_path}")
+                print(f"Uploading quantized model: {quantized_gguf}")
                 api.upload_file(
-                    path_or_fileobj=quantized_gguf_path,
+                    path_or_fileobj=quantized_gguf,
                     path_in_repo=gguf_name,
                     repo_id=new_repo_id,
                 )
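
For context, the command the new quantize_model helper runs is plain llama-quantize from llama.cpp. The sketch below is a minimal, self-contained version of that step, mirroring the arguments used in the diff. The paths and names in it are hypothetical placeholders, and it assumes any importance matrix was already generated beforehand (in the diff, quantize_model still reads train_data_file from the enclosing scope for that part, so the calibration step is omitted here).

# Minimal sketch of the quantization step this commit factors out.
# It mirrors the llama-quantize invocation in quantize_model above;
# all paths and names below are hypothetical placeholders.
import os
import subprocess
from pathlib import Path

def quantize(outdir: str, gguf_name: str, fp16: str, q_method: str,
             use_imatrix: bool = False, imatrix_q_method: str = "",
             imatrix_path: str = "") -> str:
    quantized_gguf = str(Path(outdir) / gguf_name)
    if use_imatrix:
        # imatrix branch: reuse a previously generated importance matrix
        cmd = ["llama-quantize", "--imatrix", imatrix_path,
               fp16, quantized_gguf, imatrix_q_method]
    else:
        cmd = ["llama-quantize", fp16, quantized_gguf, q_method]
    # An argument list with shell=False keeps user-supplied names out of a shell
    result = subprocess.run(cmd, shell=False, capture_output=True)
    if result.returncode != 0:
        raise Exception(f"Error quantizing: {result.stderr.decode('utf-8')}")
    return os.path.abspath(quantized_gguf)

# Hypothetical usage:
# quantize("/tmp/out", "model-Q4_K_M.gguf", "/tmp/out/model-f16.gguf", "Q4_K_M")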