Commit 3a806f2
Parent(s): 0805af2
- app.py +31 -34
- convert.log +0 -0
app.py CHANGED

@@ -19,16 +19,13 @@ log.setLevel(logging.INFO)
 log.addHandler(logging.StreamHandler())
 log.addHandler(logging.FileHandler("convert.log"))
 
-def log(msg):
-    """Append and print a log message"""
-    log.info(msg)
 
 def timeit(func):
     def wrapper(*args, **kwargs):
         start_time = time.time()
         result = func(*args, **kwargs)
         end_time = time.time()
-        log(f"{func.__name__}: {end_time - start_time:.2f} s")
+        log.info(f"{func.__name__}: {end_time - start_time:.2f} s")
         return result
     return wrapper
 
@@ -41,35 +38,35 @@ def get_model_size_in_gb(model_name):
         # Use the safetensors size (does not assume a file extension)
         return model_info.safetensors.total / (1024 ** 3)
     except Exception as e:
-        log(f"Unable to estimate model size: {e}")
+        log.error(f"Unable to estimate model size: {e}")
         return 1  # default value
 
 @timeit
 def check_system_resources(model_name):
     """Check system resources and decide whether to use the CPU or GPU"""
-    log("Checking system resources...")
+    log.info("Checking system resources...")
     system_memory = psutil.virtual_memory()
     total_memory_gb = system_memory.total / (1024 ** 3)
-    log(f"Total system memory: {total_memory_gb:.1f}GB")
+    log.info(f"Total system memory: {total_memory_gb:.1f}GB")
 
     model_size_gb = get_model_size_in_gb(model_name)
     required_memory_gb = model_size_gb * 2.5  # reserve extra memory
-    log(f"Estimated required memory for model: {required_memory_gb:.1f}GB")
+    log.info(f"Estimated required memory for model: {required_memory_gb:.1f}GB")
 
     if torch.cuda.is_available():
         gpu_name = torch.cuda.get_device_name(0)
         gpu_memory_gb = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)
-        log(f"Detected GPU: {gpu_name} with {gpu_memory_gb:.1f}GB memory")
+        log.info(f"Detected GPU: {gpu_name} with {gpu_memory_gb:.1f}GB memory")
         if gpu_memory_gb >= required_memory_gb:
-            log("✅ Sufficient GPU memory available; using GPU.")
+            log.info("✅ Sufficient GPU memory available; using GPU.")
             return "cuda", gpu_memory_gb
         else:
-            log(f"⚠️ Insufficient GPU memory (requires {required_memory_gb:.1f}GB, found {gpu_memory_gb:.1f}GB).")
+            log.warning(f"⚠️ Insufficient GPU memory (requires {required_memory_gb:.1f}GB, found {gpu_memory_gb:.1f}GB).")
     else:
-        log("❌ No GPU detected.")
+        log.error("❌ No GPU detected.")
 
     if total_memory_gb >= required_memory_gb:
-        log("✅ Sufficient CPU memory available; using CPU.")
+        log.info("✅ Sufficient CPU memory available; using CPU.")
         return "cpu", total_memory_gb
     else:
         raise MemoryError(f"❌ Insufficient system memory (requires {required_memory_gb:.1f}GB, available {available_memory_gb:.1f}GB).")
@@ -80,7 +77,7 @@ def setup_environment(model_name):
     try:
         device, _ = check_system_resources(model_name)
     except Exception as e:
-        log(f"Resource check failed: {e}. Defaulting to CPU.")
+        log.error(f"Resource check failed: {e}. Defaulting to CPU.")
         device = "cpu"
     return device
 
@@ -95,14 +92,14 @@ def create_hf_repo(repo_name, private=True):
         repo_name_with_index = repo_name
         while api.repo_exists(repo_name_with_index):
            retry_index += 1
-            log(f"Repository {repo_name_with_index} exists; trying {repo_name}_{retry_index}")
+            log.info(f"Repository {repo_name_with_index} exists; trying {repo_name}_{retry_index}")
            repo_name_with_index = f"{repo_name}_{retry_index}"
         repo_name = repo_name_with_index
         repo_url = create_repo(repo_name, private=private)
-        log(f"Repository created successfully: {repo_url}")
+        log.info(f"Repository created successfully: {repo_url}")
         return repo_name
     except Exception as e:
-        log(f"Failed to create repository: {e}")
+        log.error(f"Failed to create repository: {e}")
         raise
 
 @timeit
@@ -115,18 +112,18 @@ def download_and_merge_model(base_model_name, lora_model_name, output_dir, devic
     5. Take the union of the base and adapter tokenizer vocabularies and extend the tokenizer
     6. Resize the merged model's embedding layer and save it
     """
-    log("Loading base model...")
+    log.info("Loading base model...")
     model = AutoModelForCausalLM.from_pretrained(base_model_name, low_cpu_mem_usage=True)
-    log("Loading adapter tokenizer...")
+    log.info("Loading adapter tokenizer...")
     adapter_tokenizer = AutoTokenizer.from_pretrained(lora_model_name)
-    log("Resizing token embeddings...")
+    log.info("Resizing token embeddings...")
     added_tokens_decoder = adapter_tokenizer.added_tokens_decoder
     model.resize_token_embeddings(adapter_tokenizer.vocab_size + len(added_tokens_decoder))
-    log("Loading LoRA adapter...")
+    log.info("Loading LoRA adapter...")
     peft_model = PeftModel.from_pretrained(model, lora_model_name, low_cpu_mem_usage=True)
-    log("Merging and unloading model...")
+    log.info("Merging and unloading model...")
     model = peft_model.merge_and_unload()
-    log("Saving model...")
+    log.info("Saving model...")
     model.save_pretrained(output_dir)
     adapter_tokenizer.save_pretrained(output_dir)
     return output_dir
@@ -138,10 +135,10 @@ def clone_llamacpp_and_download_build():
     llamacpp_dir = os.path.join(os.getcwd(), "llama.cpp")
 
     if not os.path.exists(llamacpp_dir):
-        log(f"Cloning llama.cpp from {llamacpp_repo}...")
+        log.info(f"Cloning llama.cpp from {llamacpp_repo}...")
         os.system(f"git clone {llamacpp_repo} {llamacpp_dir}")
 
-    log("Building llama.cpp...")
+    log.info("Building llama.cpp...")
     build_dir = os.path.join(llamacpp_dir, "build")
     os.makedirs(build_dir, exist_ok=True)
 
@@ -155,7 +152,7 @@ def clone_llamacpp_and_download_build():
     os.system("cmake -B build")
     os.system("cmake --build build --config Release")
 
-    log("llama.cpp build completed.")
+    log.info("llama.cpp build completed.")
     # Return to the original directory
     os.chdir(os.path.dirname(llamacpp_dir))
 
@@ -186,17 +183,17 @@ def quantize(model_path, repo_id, quant_method=None):
     guff_16 = os.path.join(model_output_dir, f"{repo_id}-f16.gguf")
 
     if not os.path.exists(guff_16):
-        log(f"Converting the model to GGML format")
+        log.info(f"Converting the model to GGML format")
         convert_script = os.path.join(llamacpp_dir, "convert_hf_to_gguf.py")
         convert_cmd = f"python {convert_script} {model_path} --outfile {guff_16}"
         print(f"syscall:[{convert_cmd}]")
         os.system(convert_cmd)
     else:
-        log(f"GGML intermediate file already exists; skipping conversion")
+        log.info(f"GGML intermediate file already exists; skipping conversion")
 
     # The final file is saved under the model_output directory
     final_path = os.path.join(model_output_dir, f"{repo_id}-{quant_method}.gguf")
-    log(f"Running {quant_method} quantization")
+    log.info(f"Running {quant_method} quantization")
     quantize_bin = os.path.join(llamacpp_dir, "build", "bin", "llama-quantize")
     quant_cmd = f"{quantize_bin} {guff_16} {final_path} {quant_method}"
     print(f"syscall:[{quant_cmd}]")
@@ -204,7 +201,7 @@ def quantize(model_path, repo_id, quant_method=None):
     if not os.path.exists(final_path):
         os.system(quant_cmd)
     else:
-        log(f"{quant_method} quantized file already exists; skipping quantization")
+        log.info(f"{quant_method} quantized file already exists; skipping quantization")
         return None
 
     return final_path
@@ -281,7 +278,7 @@ def process_model(base_model_name, lora_model_name, repo_name, quant_methods, hf
         repo_name = create_hf_repo(repo_name)
 
         output_dir = os.path.join(".", "output", repo_name)
-        log("Starting model merge process...")
+        log.info("Starting model merge process...")
         model_path = download_and_merge_model(base_model_name, lora_model_name, output_dir, device)
 
 
@@ -299,15 +296,15 @@ def process_model(base_model_name, lora_model_name, repo_name, quant_methods, hf
             num_workers=os.cpu_count() if os.cpu_count() > 4 else 4,
             print_report_every=10,
         )
-        log("Upload completed.")
+        log.info("Upload completed.")
 
         # rm -rf model_path
         shutil.rmtree(model_path)
-        log("Removed model from local")
+        log.info("Removed model from local")
 
     except Exception as e:
         error_message = f"Error during processing: {e}"
-        log(error_message)
+        log.error(error_message)
        raise e
 
 
convert.log ADDED
File without changes
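
Note on the change: the removed def log(msg) helper rebound the module-level name log, which the surrounding context lines show is the logging.Logger configured with a StreamHandler and a FileHandler("convert.log"); with the name shadowed, the helper's own log.info(msg) would resolve to the function object rather than the logger. The commit drops the wrapper and has every call site use the logger's level-specific methods (info, warning, error) directly. Below is a minimal, self-contained sketch of that pattern under the same assumptions; the logger name "convert", the do_work function, and the __main__ guard are placeholders for illustration, not part of app.py.

import logging
import time

# Module-level logger mirroring the setup shown in the diff context
# (stderr handler plus a "convert.log" file handler); the name "convert" is assumed.
log = logging.getLogger("convert")
log.setLevel(logging.INFO)
log.addHandler(logging.StreamHandler())
log.addHandler(logging.FileHandler("convert.log"))


def timeit(func):
    """Time the wrapped function and report the duration through the logger."""
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        # Level-specific logger call, as in the patched timeit decorator.
        log.info(f"{func.__name__}: {end_time - start_time:.2f} s")
        return result
    return wrapper


@timeit
def do_work():
    # Hypothetical call site: use log.info / log.warning / log.error directly
    # instead of a custom log(msg) wrapper that shadows the logger name.
    log.info("Doing work...")


if __name__ == "__main__":
    do_work()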