zetavg committed
update
Files changed:
- llama_lora/globals.py +63 -0
- llama_lora/models.py +9 -5
- llama_lora/ui/finetune_ui.py +27 -6
- llama_lora/ui/inference_ui.py +1 -1
- requirements.txt +2 -0
llama_lora/globals.py
CHANGED
@@ -3,6 +3,9 @@ import subprocess
 
 from typing import Any, Dict, List, Optional, Tuple, Union
 
+from numba import cuda
+import nvidia_smi
+
 from .lib.finetune import train
 
 
@@ -25,6 +28,12 @@ class Global:
     # Model related
     model_has_been_used = False
 
+    # GPU Info
+    gpu_cc = None # GPU compute capability
+    gpu_sms = None # GPU total number of SMs
+    gpu_total_cores = None # GPU total cores
+    gpu_total_memory = None
+
     # UI related
     ui_title: str = "LLaMA-LoRA"
     ui_emoji: str = "🦙🎛️"
@@ -60,3 +69,57 @@ commit_hash = get_git_commit_hash()
 
 if commit_hash:
     Global.version = commit_hash[:8]
+
+
+def load_gpu_info():
+    try:
+        cc_cores_per_SM_dict = {
+            (2, 0): 32,
+            (2, 1): 48,
+            (3, 0): 192,
+            (3, 5): 192,
+            (3, 7): 192,
+            (5, 0): 128,
+            (5, 2): 128,
+            (6, 0): 64,
+            (6, 1): 128,
+            (7, 0): 64,
+            (7, 5): 64,
+            (8, 0): 64,
+            (8, 6): 128,
+            (8, 9): 128,
+            (9, 0): 128
+        }
+        # the above dictionary should result in a value of "None" if a cc match
+        # is not found. The dictionary needs to be extended as new devices become
+        # available, and currently does not account for all Jetson devices
+        device = cuda.get_current_device()
+        device_sms = getattr(device, 'MULTIPROCESSOR_COUNT')
+        device_cc = device.compute_capability
+        cores_per_sm = cc_cores_per_SM_dict.get(device_cc)
+        total_cores = cores_per_sm*device_sms
+        print("GPU compute capability: ", device_cc)
+        print("GPU total number of SMs: ", device_sms)
+        print("GPU total cores: ", total_cores)
+        Global.gpu_cc = device_cc
+        Global.gpu_sms = device_sms
+        Global.gpu_total_cores = total_cores
+
+        nvidia_smi.nvmlInit()
+        handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
+        info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
+        total_memory = info.total
+
+        total_memory_mb = total_memory / (1024 ** 2)
+        total_memory_gb = total_memory / (1024 ** 3)
+
+        # Print the memory size
+        print(
+            f"GPU total memory: {total_memory} bytes ({total_memory_mb:.2f} MB) ({total_memory_gb:.2f} GB)")
+        Global.gpu_total_memory = total_memory
+
+    except Exception as e:
+        print(f"Notice: cannot get GPU info: {e}")
+
+
+load_gpu_info()
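Note: load_gpu_info() runs at module import time, so any module that imports Global can read the detected values directly; the total-core figure is simply the cores-per-SM entry for the device's compute capability multiplied by its SM count. A minimal read-only usage sketch (illustrative only, assuming the package is importable as llama_lora and a CUDA GPU is present; the Tesla T4 figures in the comments are an example, not output from this Space):

# Illustrative read of the values populated by load_gpu_info().
from llama_lora.globals import Global  # importing the module triggers load_gpu_info()

print(Global.gpu_cc)            # e.g. (7, 5) on a Tesla T4
print(Global.gpu_sms)           # e.g. 40 SMs on a T4
print(Global.gpu_total_cores)   # e.g. 64 cores/SM * 40 SMs = 2560 on a T4
print(Global.gpu_total_memory)  # total VRAM in bytes, as reported by nvidia_smi

The fine-tuning UI below keys its defaults off gpu_total_cores and gpu_total_memory.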
llama_lora/models.py
CHANGED
@@ -102,6 +102,14 @@ def load_base_model():
        )
 
 
+def clear_cache():
+    gc.collect()
+
+    # if not shared.args.cpu: # will not be running on CPUs anyway
+    with torch.no_grad():
+        torch.cuda.empty_cache()
+
+
 def unload_models():
     del Global.loaded_base_model
     Global.loaded_base_model = None
@@ -109,11 +117,7 @@ def unload_models():
     del Global.loaded_tokenizer
     Global.loaded_tokenizer = None
 
-    gc.collect()
-
-    # if not shared.args.cpu: # will not be running on CPUs anyway
-    with torch.no_grad():
-        torch.cuda.empty_cache()
+    clear_cache()
 
     Global.model_has_been_used = False
 
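clear_cache() factors the memory-freeing steps out of unload_models() so other call sites (the fine-tuning flow below) can reuse them: collecting Python garbage first drops lingering tensor references, so torch.cuda.empty_cache() can actually return the freed blocks to the driver. A minimal usage sketch, assuming the package is importable as llama_lora:

# Illustrative: release cached GPU memory between inference and training runs.
from llama_lora.models import clear_cache

clear_cache()  # gc.collect() followed by torch.cuda.empty_cache() under torch.no_grad()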
llama_lora/ui/finetune_ui.py
CHANGED
@@ -9,7 +9,9 @@ from random_word import RandomWords
 from transformers import TrainerCallback
 
 from ..globals import Global
-from ..models import get_base_model, get_tokenizer, unload_models_if_already_used
+from ..models import (
+    get_base_model, get_tokenizer,
+    clear_cache, unload_models_if_already_used)
 from ..utils.data import (
     get_available_template_names,
     get_available_dataset_names,
@@ -238,6 +240,12 @@ def parse_plain_text_input(
     return result
 
 
+should_training_progress_track_tqdm = True
+
+if Global.gpu_total_cores is not None and Global.gpu_total_cores > 2560:
+    should_training_progress_track_tqdm = False
+
+
 def do_train(
     # Dataset
     template,
@@ -258,9 +266,10 @@ def do_train(
     lora_alpha,
     lora_dropout,
     model_name,
-    progress=gr.Progress(track_tqdm=True),
+    progress=gr.Progress(track_tqdm=should_training_progress_track_tqdm),
 ):
     try:
+        clear_cache()
         # If model has been used in inference, we need to unload it first.
         # Otherwise, we'll get a 'Function MmBackward0 returned an invalid
         # gradient at index 1 - expected device meta but got cuda:0' error.
@@ -337,7 +346,8 @@
 
                 progress(
                     (i, 300),
-                    desc="(Simulate) " + get_progress_text(epoch, epochs, last_loss)
+                    desc="(Simulate) " +
+                    get_progress_text(epoch, epochs, last_loss)
                 )
 
                 time.sleep(0.1)
@@ -401,12 +411,13 @@ Train data (first 10):
 
         # Do this again right before training to make sure the model is not used in inference.
         unload_models_if_already_used()
+        clear_cache()
 
         base_model = get_base_model()
         tokenizer = get_tokenizer()
 
         # Do not let other tqdm iterations interfere the progress reporting after training starts.
-        progress.track_tqdm = False
+        # progress.track_tqdm = False # setting this dynamically is not working, determining if track_tqdm should be enabled based on GPU cores at start instead.
 
         results = Global.train_fn(
             base_model, # base_model
@@ -431,7 +442,8 @@ Train data (first 10):
             training_callbacks # callbacks
         )
 
-        logs_str = "\n".join([json.dumps(log) for log in log_history]) or "None"
+        logs_str = "\n".join([json.dumps(log)
+                              for log in log_history]) or "None"
 
         result_message = f"Training ended:\n{str(results)}\n\nLogs:\n{logs_str}"
        print(result_message)
@@ -590,9 +602,18 @@ def finetune_ui():
                 )
 
             with gr.Row():
+                micro_batch_size_default_value = 1
+
+                if Global.gpu_total_cores is not None and Global.gpu_total_memory is not None:
+                    memory_per_core = Global.gpu_total_memory / Global.gpu_total_cores
+                    if memory_per_core >= 6291456:
+                        micro_batch_size_default_value = 8
+                    elif memory_per_core >= 4000000: # ?
+                        micro_batch_size_default_value = 4
+
                 with gr.Column():
                     micro_batch_size = gr.Slider(
-                        minimum=1, maximum=100, step=1, value=
+                        minimum=1, maximum=100, step=1, value=micro_batch_size_default_value,
                         label="Micro Batch Size",
                         info="The number of examples in each mini-batch for gradient computation. A smaller micro_batch_size reduces memory usage but may increase training time."
                     )
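Both GPU-dependent defaults in this file come from the values stored in globals.py: tqdm tracking for the Gradio progress bar is turned off on GPUs with more than 2560 total cores, and the default micro batch size is chosen from bytes of VRAM per CUDA core (6291456 bytes is 6 MiB; 4000000 bytes is roughly 3.8 MiB). A minimal sketch of that second heuristic, with thresholds copied from the diff above (the helper name and the example GPU figures are hypothetical):

# Sketch of the micro-batch-size heuristic; the thresholds match the diff above,
# while the function name and example numbers are only for illustration.
def pick_micro_batch_size(total_memory_bytes: int, total_cores: int) -> int:
    memory_per_core = total_memory_bytes / total_cores
    if memory_per_core >= 6291456:    # >= 6 MiB of VRAM per CUDA core
        return 8
    elif memory_per_core >= 4000000:  # >= ~3.8 MiB per core
        return 4
    return 1

# e.g. a hypothetical card with 16 GiB of VRAM and 2560 CUDA cores:
print(pick_micro_batch_size(16 * 1024 ** 3, 2560))  # ~6.4 MiB/core -> prints 8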
llama_lora/ui/inference_ui.py
CHANGED
@@ -245,7 +245,7 @@ def inference_ui():
                 preview_prompt = gr.Textbox(
                     show_label=False, interactive=False, elem_id="inference_preview_prompt")
                 update_prompt_preview_btn = gr.Button(
-                    "↻", elem_id="inference_update_prompt_preview_btn"
+                    "↻", elem_id="inference_update_prompt_preview_btn")
                 update_prompt_preview_btn.style(size="sm")
 
                 # with gr.Column():
requirements.txt
CHANGED
@@ -7,6 +7,8 @@ datasets
 fire
 git+https://github.com/huggingface/peft.git
 git+https://github.com/huggingface/transformers.git
+numba
+nvidia-ml-py3
 gradio
 loralib
 sentencepiece
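The two new dependencies back the GPU detection added in llama_lora/globals.py: the numba package provides the numba.cuda device query, and nvidia-ml-py3 installs the nvidia_smi module used for the memory query. A quick, illustrative import check:

# Illustrative check that the new requirements expose the modules
# imported by llama_lora/globals.py.
from numba import cuda  # from the "numba" package
import nvidia_smi       # from the "nvidia-ml-py3" package

print(cuda.is_available())  # True only when a CUDA driver and GPU are visible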