Oleg Shulyakov committed
Commit · 2d3ee83
Parent(s): ad94fc8
OOP draft
app.py
CHANGED
@@ -5,7 +5,7 @@ import tempfile
 from pathlib import Path
 from textwrap import dedent
 from typing import Optional, Tuple, List, Union
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 
 os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
 
@@ -26,7 +26,10 @@ class QuantizationConfig:
     leave_output: bool = False
     quant_output: bool = False
     output_tensor_method: str = "Q8_0"
-
+    # Generated values - These will be set during processing
+    fp16_model: str = field(default="", init=False)
+    quantized_gguf: str = field(default="", init=False)
+    imatrix_file: str = field(default="", init=False)
 
 @dataclass
 class SplitConfig:
@@ -43,6 +46,16 @@ class OutputConfig:
     repo_name: str = ""
     filename: str = ""
 
+@dataclass
+class ModelProcessingConfig:
+    """Configuration for the entire model processing pipeline."""
+    token: str
+    model_id: str
+    model_name: str
+    outdir: str
+    quant_config: QuantizationConfig
+    split_config: SplitConfig
+    output_config: OutputConfig
 
 class GGUFConverterError(Exception):
     """Custom exception for GGUF conversion errors."""
@@ -202,29 +215,27 @@ class HuggingFaceModelProcessor:
 
         print("Sharded model has been uploaded successfully!")
 
-    def _download_base_model(self,
+    def _download_base_model(self, processing_config: ModelProcessingConfig) -> str:
         """Download and convert Hugging Face model to GGUF FP16 format."""
-
-        print(f"Downloading model {model_name}")
-        fp16_model = f"{outdir}/{model_name}-fp16.gguf"
+        print(f"Downloading model {processing_config.model_name}")
 
-        if os.path.exists(fp16_model):
+        if os.path.exists(processing_config.quant_config.fp16_model):
             print("Skipping fp16 conversion...")
-            print(f"Converted model path: {os.path.abspath(fp16_model)}")
-            return fp16_model
+            print(f"Converted model path: {os.path.abspath(processing_config.quant_config.fp16_model)}")
+            return processing_config.quant_config.fp16_model
 
         with tempfile.TemporaryDirectory(dir=self.DOWNLOAD_FOLDER) as tmpdir:
-            local_dir = f"{Path(tmpdir)}/{model_name}"
+            local_dir = f"{Path(tmpdir)}/{processing_config.model_name}"
             print(f"Local directory: {os.path.abspath(local_dir)}")
 
             # Download model
-            api = HfApi(token=token)
+            api = HfApi(token=processing_config.token)
             pattern = (
                 "*.safetensors"
                 if any(
                     file.path.endswith(".safetensors")
                     for file in api.list_repo_tree(
-                        repo_id=model_id,
+                        repo_id=processing_config.model_id,
                         recursive=True,
                     )
                 )
@@ -232,12 +243,12 @@ class HuggingFaceModelProcessor:
             )
             dl_pattern = ["*.md", "*.json", "*.model"]
             dl_pattern += [pattern]
-            api.snapshot_download(repo_id=model_id, local_dir=local_dir, allow_patterns=dl_pattern)
+            api.snapshot_download(repo_id=processing_config.model_id, local_dir=local_dir, allow_patterns=dl_pattern)
             print("Model downloaded successfully!")
             print(f"Model directory contents: {os.listdir(local_dir)}")
 
-            config_dir = local_dir
-            adapter_config_dir = local_dir
+            config_dir = os.path.join(local_dir, "config.json")
+            adapter_config_dir = os.path.join(local_dir, "adapter_config.json")
             if os.path.exists(adapter_config_dir) and not os.path.exists(config_dir):
                 raise GGUFConverterError(
                     'adapter_config.json is present.<br/><br/>If you are converting a LoRA adapter to GGUF, '
@@ -246,11 +257,11 @@ class HuggingFaceModelProcessor:
                 )
 
             # Convert HF to GGUF
-            print(f"Converting to GGUF FP16: {os.path.abspath(fp16_model)}")
+            print(f"Converting to GGUF FP16: {os.path.abspath(processing_config.quant_config.fp16_model)}")
             result = subprocess.run(
                 [
                     "python3", "/app/convert_hf_to_gguf.py", local_dir,
-                    "--outtype", "f16", "--outfile", fp16_model
+                    "--outtype", "f16", "--outfile", processing_config.quant_config.fp16_model
                 ],
                 shell=False,
                 capture_output=True
@@ -262,11 +273,10 @@ class HuggingFaceModelProcessor:
                 raise GGUFConverterError(f"Error converting to fp16: {stderr_str}")
 
             print("Model converted to fp16 successfully!")
-            print(f"Converted model path: {os.path.abspath(fp16_model)}")
+            print(f"Converted model path: {os.path.abspath(processing_config.quant_config.fp16_model)}")
             return fp16_model
 
-    def _quantize_model(self,
-                        quant_config: QuantizationConfig) -> str:
+    def _quantize_model(self, quant_config: QuantizationConfig) -> str:
         """Quantize the GGUF model."""
         quantize_cmd = ["llama-quantize"]
 
@@ -279,8 +289,7 @@ class HuggingFaceModelProcessor:
         if quant_config.quant_output:
             quantize_cmd.extend(["--output-tensor-type", quant_config.output_tensor_method])
 
-
-
+        # Set imatrix file path if needed
         if quant_config.use_imatrix:
             train_data_path = "calibration_data_v5_rc.txt"
             print(f"Training data file path: {train_data_path}")
@@ -288,14 +297,13 @@ class HuggingFaceModelProcessor:
             if not os.path.isfile(train_data_path):
                 raise GGUFConverterError(f"Training data file not found: {train_data_path}")
 
-            self._generate_importance_matrix(
-            quantize_cmd.extend(["--imatrix", imatrix_file])
+            self._generate_importance_matrix(quant_config.fp16_model, train_data_path, quant_config.imatrix_file)
+            quantize_cmd.extend(["--imatrix", quant_config.imatrix_file])
         else:
             print("Not using imatrix quantization.")
 
-
-        quantize_cmd.append(
-        quantize_cmd.append(quantized_gguf)
+        quantize_cmd.append(quant_config.fp16_model)
+        quantize_cmd.append(quant_config.quantized_gguf)
 
         if quant_config.use_imatrix:
             quantize_cmd.append(quant_config.imatrix_method)
@@ -310,8 +318,8 @@ class HuggingFaceModelProcessor:
             raise GGUFConverterError(f"Error quantizing: {stderr_str}")
 
         print(f"Quantized successfully with {quant_config.imatrix_method if quant_config.use_imatrix else quant_config.method} option!")
-        print(f"Quantized model path: {os.path.abspath(quantized_gguf)}")
-        return quantized_gguf
+        print(f"Quantized model path: {os.path.abspath(quant_config.quantized_gguf)}")
+        return quant_config.quantized_gguf
 
     def _generate_readme(self, outdir: str, token: str, model_id: str,
                          new_repo_id: str, gguf_name: str) -> str:
@@ -361,61 +369,55 @@ llama-server --hf-repo "{new_repo_id}" --hf-file "{gguf_name}" -c 4096
         card.save(readme_path)
         return readme_path
 
-    def process_model(self,
-                      split_config: SplitConfig, output_config: OutputConfig,
-                      oauth_token: Optional[gr.OAuthToken]) -> Tuple[str, str]:
+    def process_model(self, processing_config: ModelProcessingConfig) -> Tuple[str, str]:
         """Main method to process a model through the entire pipeline."""
         try:
-            self._create_folder(os.path.join(self.OUTPUT_FOLDER, model_name))
-            if self.RUN_LOCALLY == "1"
-            else Path(outDirObj)
-            )
+            quant_config = processing_config.quant_config
+            split_config = processing_config.split_config
+            output_config = processing_config.output_config
+
+            print(f"Current working directory: {os.path.abspath(os.getcwd())}")
+
+            # Download and convert base model
+            self._download_base_model(processing_config)
+
+            # Quantize the model
+            self._quantize_model(quant_config)
+
+            # Create empty repo
+            api = HfApi(token=processing_config.token)
+            new_repo_url = api.create_repo(
+                repo_id=output_config.repo_name,
+                exist_ok=True,
+                private=output_config.private_repo
+            )
+            new_repo_id = new_repo_url.repo_id
+            print("Repo created successfully!", new_repo_url)
+
+            # Upload model
+            if split_config.enabled:
+                print(f"Splitting quantized model: {os.path.abspath(quant_config.quantized_gguf)}")
+                self._split_and_upload_model(quant_config.quantized_gguf, processing_config.outdir, new_repo_id, processing_config.token, split_config)
+            else:
+                try:
+                    print(f"Uploading quantized model: {os.path.abspath(quant_config.quantized_gguf)}")
+                    self._upload_file(processing_config.token, quant_config.quantized_gguf, output_config.filename, new_repo_id)
+                except Exception as e:
+                    raise GGUFConverterError(f"Error uploading quantized model: {e}")
+
+            # Upload imatrix if it exists
+            if quant_config.use_imatrix and os.path.isfile(quant_config.imatrix_file):
+                try:
+                    print(f"Uploading imatrix.dat: {os.path.abspath(quant_config.imatrix_file)}")
+                    self._upload_file(processing_config.token, quant_config.imatrix_file, "imatrix.dat", new_repo_id)
+                except Exception as e:
+                    raise GGUFConverterError(f"Error uploading imatrix.dat: {e}")
+
+            # Upload README.md
+            readme_path = self._generate_readme(processing_config.outdir, processing_config.token, processing_config.model_id, new_repo_id, output_config.filename)
+            self._upload_file(processing_config.token, readme_path, "README.md", new_repo_id)
+
-            print(f"Uploaded successfully with {quant_config.imatrix_method if quant_config.use_imatrix else quant_config.method} option!")
+            print(f"Uploaded successfully with {quant_config.imatrix_method if quant_config.use_imatrix else quant_config.method} option!")
 
             return (
                 f'<h1>✅ DONE</h1><br/>Find your repo here: <a href="{new_repo_url}" target="_blank" style="text-decoration:underline">{new_repo_id}</a>',
@@ -743,32 +745,68 @@ class GGUFConverterUI:
                                embedding_tensor_method: str, leave_output: bool,
                                quant_output: bool, output_tensor_method: str,
                                split_model: bool, split_max_tensors, split_max_size: str) -> Tuple[str, str]:
-        """Wrapper for the process_model method to handle the conversion."""
+        """Wrapper for the process_model method to handle the conversion using ModelProcessingConfig."""
+        try:
+            # Validate token and get token string
+            token = self.processor._validate_token(oauth_token)
+
+            # Create configuration objects
+            quant_config = QuantizationConfig(
+                method=q_method,
+                use_imatrix=use_imatrix,
+                imatrix_method=imatrix_q_method,
+                quant_embedding=quant_embedding,
+                embedding_tensor_method=embedding_tensor_method,
+                leave_output=leave_output,
+                quant_output=quant_output,
+                output_tensor_method=output_tensor_method
+            )
 
+            split_config = SplitConfig(
+                enabled=split_model,
+                max_tensors=split_max_tensors if isinstance(split_max_tensors, int) else 256,
+                max_size=split_max_size
+            )
 
+            output_config = OutputConfig(
+                private_repo=private_repo,
+                repo_name=repo_name,
+                filename=gguf_name
+            )
+
+            model_name = self.processor._get_model_name(model_id)
+
+            with tempfile.TemporaryDirectory(dir=self.OUTPUT_FOLDER) as outDirObj:
+                outdir = (
+                    self._create_folder(os.path.join(self.OUTPUT_FOLDER, model_name))
+                    if self.RUN_LOCALLY == "1"
+                    else Path(outDirObj)
+                )
+
+                quant_config.fp16_model = f"{outdir}/{model_name}-fp16.gguf"
+                quant_config.imatrix_file = f"{outdir}/{model_name}-imatrix.dat"
+                quant_config.quantized_gguf = f"{outdir}/{gguf_name}"
+
+                processing_config = ModelProcessingConfig(
+                    token=token,
+                    model_id=model_id,
+                    model_name=model_name,
+                    outdir=outdir,
+                    quant_config=quant_config,
+                    split_config=split_config,
+                    output_config=output_config
+                )
+
+                # Call the processor's main method with the config object
+                return self.processor.process_model(processing_config)
+
-        return self.processor.process_model(model_id, quant_config, split_config, output_config, gr.OAuthToken)
+        except GGUFConverterError as e:
+            print(f"Error in wrapper: {e}")
+            return (f'<h1>❌ ERROR</h1><br/><pre style="white-space:pre-wrap;">{self.processor._escape_html(str(e))}</pre>', "error.png")
+        except Exception as e:
+            print(f"Unexpected error in wrapper: {e}")
+            return (f'<h1>❌ ERROR</h1><br/><pre style="white-space:pre-wrap;">{self.processor._escape_html(str(e))}</pre>', "error.png")
 
 
     def launch(self):
         """Launch the Gradio interface."""
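Taken together, the commit moves per-run state into the new dataclasses and has the UI wrapper hand a single ModelProcessingConfig to the processor. Below is a minimal usage sketch of that call pattern, assuming the module is importable as app and that HuggingFaceModelProcessor needs no constructor arguments; every concrete value (model IDs, paths, quantization settings) is a placeholder, and only the class and field names come from this commit.

# Usage sketch only - values are placeholders, not part of the commit.
from app import (HuggingFaceModelProcessor, ModelProcessingConfig,
                 QuantizationConfig, SplitConfig, OutputConfig)  # assumed import path

processor = HuggingFaceModelProcessor()

quant_config = QuantizationConfig(
    method="Q4_K_M",                  # placeholder quantization type
    use_imatrix=False,
    imatrix_method="IQ4_NL",
    quant_embedding=False,
    embedding_tensor_method="Q8_0",
    leave_output=False,
    quant_output=False,
    output_tensor_method="Q8_0",
)
split_config = SplitConfig(enabled=False, max_tensors=256, max_size="")
output_config = OutputConfig(private_repo=False, repo_name="user/model-GGUF",
                             filename="model-Q4_K_M.gguf")

# The generated fields are declared with init=False, so they are assigned after
# construction, the same way the wrapper in this commit does it.
outdir = "/tmp/outputs/model"
quant_config.fp16_model = f"{outdir}/model-fp16.gguf"
quant_config.imatrix_file = f"{outdir}/model-imatrix.dat"
quant_config.quantized_gguf = f"{outdir}/{output_config.filename}"

processing_config = ModelProcessingConfig(
    token="hf_...",                   # a user access token string
    model_id="user/model",
    model_name="model",
    outdir=outdir,
    quant_config=quant_config,
    split_config=split_config,
    output_config=output_config,
)

# process_model returns the (HTML status, image path) tuple shown in the UI.
html_status, image = processor.process_model(processing_config)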