Spaces:
Running
Running
Oleg Shulyakov
committed on
Commit
·
239afdd
1
Parent(s):
dd433e4
subprocess.Run
Browse files
app.py
CHANGED
@@ -31,6 +31,7 @@ class QuantizationConfig:
|
|
31 |
quantized_gguf: str = field(default="", init=False)
|
32 |
imatrix_file: str = field(default="", init=False)
|
33 |
|
|
|
34 |
@dataclass
|
35 |
class SplitConfig:
|
36 |
"""Configuration for model splitting."""
|
@@ -46,6 +47,7 @@ class OutputConfig:
|
|
46 |
repo_name: str = ""
|
47 |
filename: str = ""
|
48 |
|
|
|
49 |
@dataclass
|
50 |
class ModelProcessingConfig:
|
51 |
"""Configuration for the entire model processing pipeline."""
|
@@ -60,6 +62,7 @@ class ModelProcessingConfig:
|
|
60 |
new_repo_url: str = field(default="", init=False)
|
61 |
new_repo_id: str = field(default="", init=False)
|
62 |
|
|
|
63 |
class GGUFConverterError(Exception):
|
64 |
"""Custom exception for GGUF conversion errors."""
|
65 |
pass
|
@@ -143,9 +146,10 @@ class HuggingFaceModelProcessor:
|
|
143 |
train_data_path = self.CALIBRATION_FILE
|
144 |
if not os.path.isfile(train_data_path):
|
145 |
raise GGUFConverterError(f"Training data file not found: {train_data_path}")
|
146 |
-
print(f"Training data file path: {train_data_path}")
|
147 |
|
|
|
148 |
print("Running imatrix command...")
|
|
|
149 |
imatrix_command = [
|
150 |
"llama-imatrix",
|
151 |
"-m", quant_config.fp16_model,
|
@@ -157,16 +161,19 @@ class HuggingFaceModelProcessor:
|
|
157 |
|
158 |
process = subprocess.Popen(imatrix_command, shell=False)
|
159 |
try:
|
160 |
-
process.wait(timeout=
|
161 |
except subprocess.TimeoutExpired:
|
162 |
print("Imatrix computation timed out. Sending SIGINT to allow graceful termination...")
|
163 |
process.send_signal(signal.SIGINT)
|
164 |
try:
|
165 |
process.wait(timeout=5)
|
166 |
except subprocess.TimeoutExpired:
|
167 |
-
print("Imatrix proc still didn't term.
|
168 |
process.kill()
|
169 |
|
|
|
|
|
|
|
170 |
print(f"Importance matrix generation completed: {os.path.abspath(quant_config.imatrix_file)}")
|
171 |
|
172 |
def _split_and_upload_model(self, processing_config: ModelProcessingConfig) -> None:
|
@@ -188,14 +195,16 @@ class HuggingFaceModelProcessor:
|
|
188 |
split_cmd.extend([quant_config.quantized_gguf, model_path_prefix])
|
189 |
|
190 |
print(f"Split command: {split_cmd}")
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
|
|
|
|
|
|
195 |
|
196 |
-
if
|
197 |
-
|
198 |
-
raise GGUFConverterError(f"Error splitting the model: {stderr_str}")
|
199 |
|
200 |
print("Model split successfully!")
|
201 |
|
@@ -215,6 +224,7 @@ class HuggingFaceModelProcessor:
|
|
215 |
raise GGUFConverterError("No sharded files found.")
|
216 |
|
217 |
print(f"Sharded model files: {sharded_model_files}")
|
|
|
218 |
for file in sharded_model_files:
|
219 |
file_path = os.path.join(processing_config.outdir, file)
|
220 |
try:
|
@@ -268,19 +278,20 @@ class HuggingFaceModelProcessor:
|
|
268 |
|
269 |
# Convert HF to GGUF
|
270 |
print(f"Converting to GGUF FP16: {os.path.abspath(processing_config.quant_config.fp16_model)}")
|
271 |
-
|
272 |
-
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
|
278 |
-
|
|
|
|
|
|
|
279 |
|
280 |
-
|
281 |
-
|
282 |
-
stderr_str = result.stderr.decode("utf-8")
|
283 |
-
raise GGUFConverterError(f"Error converting to fp16: {stderr_str}")
|
284 |
|
285 |
print("Model converted to fp16 successfully!")
|
286 |
print(f"Converted model path: {os.path.abspath(processing_config.quant_config.fp16_model)}")
|
@@ -315,11 +326,18 @@ class HuggingFaceModelProcessor:
|
|
315 |
quantize_cmd.append(quant_config.method)
|
316 |
|
317 |
print(f"Quantizing model with {quantize_cmd}")
|
318 |
-
result = subprocess.run(quantize_cmd, shell=False, capture_output=True)
|
319 |
|
320 |
-
|
321 |
-
|
322 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
323 |
|
324 |
print(f"Quantized successfully with {quant_config.imatrix_method if quant_config.use_imatrix else quant_config.method} option!")
|
325 |
print(f"Quantized model path: {os.path.abspath(quant_config.quantized_gguf)}")
|
@@ -338,7 +356,7 @@ class HuggingFaceModelProcessor:
|
|
338 |
|
339 |
return new_repo_url
|
340 |
|
341 |
-
def _generate_readme(self, processing_config
|
342 |
"""Generate README.md for the quantized model."""
|
343 |
creator = self._get_model_creator(processing_config.model_id)
|
344 |
username = whoami(processing_config.token)["name"]
|
|
|
31 |
quantized_gguf: str = field(default="", init=False)
|
32 |
imatrix_file: str = field(default="", init=False)
|
33 |
|
34 |
+
|
35 |
@dataclass
|
36 |
class SplitConfig:
|
37 |
"""Configuration for model splitting."""
|
|
|
47 |
repo_name: str = ""
|
48 |
filename: str = ""
|
49 |
|
50 |
+
|
51 |
@dataclass
|
52 |
class ModelProcessingConfig:
|
53 |
"""Configuration for the entire model processing pipeline."""
|
|
|
62 |
new_repo_url: str = field(default="", init=False)
|
63 |
new_repo_id: str = field(default="", init=False)
|
64 |
|
65 |
+
|
66 |
class GGUFConverterError(Exception):
|
67 |
"""Custom exception for GGUF conversion errors."""
|
68 |
pass
|
|
|
146 |
train_data_path = self.CALIBRATION_FILE
|
147 |
if not os.path.isfile(train_data_path):
|
148 |
raise GGUFConverterError(f"Training data file not found: {train_data_path}")
|
|
|
149 |
|
150 |
+
print(f"Training data file path: {train_data_path}")
|
151 |
print("Running imatrix command...")
|
152 |
+
|
153 |
imatrix_command = [
|
154 |
"llama-imatrix",
|
155 |
"-m", quant_config.fp16_model,
|
|
|
161 |
|
162 |
process = subprocess.Popen(imatrix_command, shell=False)
|
163 |
try:
|
164 |
+
process.wait(timeout=300)
|
165 |
except subprocess.TimeoutExpired:
|
166 |
print("Imatrix computation timed out. Sending SIGINT to allow graceful termination...")
|
167 |
process.send_signal(signal.SIGINT)
|
168 |
try:
|
169 |
process.wait(timeout=5)
|
170 |
except subprocess.TimeoutExpired:
|
171 |
+
print("Imatrix proc still didn't term. Forcefully terminating process...")
|
172 |
process.kill()
|
173 |
|
174 |
+
if process.returncode != 0:
|
175 |
+
raise GGUFConverterError(f"Error generating imatrix")
|
176 |
+
|
177 |
print(f"Importance matrix generation completed: {os.path.abspath(quant_config.imatrix_file)}")
|
178 |
|
179 |
def _split_and_upload_model(self, processing_config: ModelProcessingConfig) -> None:
|
|
|
195 |
split_cmd.extend([quant_config.quantized_gguf, model_path_prefix])
|
196 |
|
197 |
print(f"Split command: {split_cmd}")
|
198 |
+
process = subprocess.Popen(split_cmd, shell=False)
|
199 |
+
try:
|
200 |
+
process.wait(timeout=300)
|
201 |
+
except subprocess.TimeoutExpired:
|
202 |
+
print("Splitting timed out. Killing process...")
|
203 |
+
process.kill()
|
204 |
+
raise GGUFConverterError("Error splitting the model: Operation timed out.")
|
205 |
|
206 |
+
if process.returncode != 0:
|
207 |
+
raise GGUFConverterError(f"Error splitting the model")
|
|
|
208 |
|
209 |
print("Model split successfully!")
|
210 |
|
|
|
224 |
raise GGUFConverterError("No sharded files found.")
|
225 |
|
226 |
print(f"Sharded model files: {sharded_model_files}")
|
227 |
+
|
228 |
for file in sharded_model_files:
|
229 |
file_path = os.path.join(processing_config.outdir, file)
|
230 |
try:
|
|
|
278 |
|
279 |
# Convert HF to GGUF
|
280 |
print(f"Converting to GGUF FP16: {os.path.abspath(processing_config.quant_config.fp16_model)}")
|
281 |
+
convert_command = [
|
282 |
+
"python3", "/app/convert_hf_to_gguf.py", local_dir,
|
283 |
+
"--outtype", "f16", "--outfile", processing_config.quant_config.fp16_model
|
284 |
+
]
|
285 |
+
process = subprocess.Popen(convert_command, shell=False)
|
286 |
+
try:
|
287 |
+
process.wait(timeout=600)
|
288 |
+
except subprocess.TimeoutExpired:
|
289 |
+
print("Conversion timed out. Killing process...")
|
290 |
+
process.kill()
|
291 |
+
raise GGUFConverterError("Error converting to fp16: Operation timed out.")
|
292 |
|
293 |
+
if process.returncode != 0:
|
294 |
+
raise GGUFConverterError(f"Error converting to fp16")
|
|
|
|
|
295 |
|
296 |
print("Model converted to fp16 successfully!")
|
297 |
print(f"Converted model path: {os.path.abspath(processing_config.quant_config.fp16_model)}")
|
|
|
326 |
quantize_cmd.append(quant_config.method)
|
327 |
|
328 |
print(f"Quantizing model with {quantize_cmd}")
|
|
|
329 |
|
330 |
+
# Use Popen for quantization
|
331 |
+
process = subprocess.Popen(quantize_cmd, shell=False)
|
332 |
+
try:
|
333 |
+
process.wait(timeout=3600)
|
334 |
+
except subprocess.TimeoutExpired:
|
335 |
+
print("Quantization timed out. Killing process...")
|
336 |
+
process.kill()
|
337 |
+
raise GGUFConverterError("Error quantizing: Operation timed out.")
|
338 |
+
|
339 |
+
if process.returncode != 0:
|
340 |
+
raise GGUFConverterError(f"Error quantizing")
|
341 |
|
342 |
print(f"Quantized successfully with {quant_config.imatrix_method if quant_config.use_imatrix else quant_config.method} option!")
|
343 |
print(f"Quantized model path: {os.path.abspath(quant_config.quantized_gguf)}")
|
|
|
356 |
|
357 |
return new_repo_url
|
358 |
|
359 |
+
def _generate_readme(self, processing_config: ModelProcessingConfig) -> str:
|
360 |
"""Generate README.md for the quantized model."""
|
361 |
creator = self._get_model_creator(processing_config.model_id)
|
362 |
username = whoami(processing_config.token)["name"]
|