phi committed · Commit 6ded56f · Parent(s): a572fd2

update
app.py CHANGED
@@ -32,17 +32,72 @@ from huggingface_hub import snapshot_download
 # @@ constants ================
 
 DEBUG = bool(int(os.environ.get("DEBUG", "1")))
-BLOCK_ZH = bool(int(os.environ.get("BLOCK_ZH", "
-
+BLOCK_ZH = bool(int(os.environ.get("BLOCK_ZH", "1")))
 TENSOR_PARALLEL = int(os.environ.get("TENSOR_PARALLEL", "1"))
 DTYPE = os.environ.get("DTYPE", "bfloat16")
-# DTYPE = 'float16'
 
-#
-
+# ! (no debug) whether to download HF_MODEL_NAME and save to MODEL_PATH
+DOWNLOAD_SNAPSHOT = bool(int(os.environ.get("DOWNLOAD_SNAPSHOT", "0")))
+# ! uploaded model path, will be downloaded to MODEL_PATH
+HF_MODEL_NAME = os.environ.get("HF_MODEL_NAME", "DAMO-NLP-SG/seal-13b-chat-a")
+MODEL_PATH = os.environ.get("MODEL_PATH", "./seal-13b-chat-a")
+
+
+
+# gradio config
 PORT = int(os.environ.get("PORT", "7860"))
 STREAM_YIELD_MULTIPLE = int(os.environ.get("STREAM_YIELD_MULTIPLE", "1"))
-MAX_TOKENS = 2048
+MAX_TOKENS = int(os.environ.get("MAX_TOKENS", "2048"))
+TEMPERATURE = float(os.environ.get("TEMPERATURE", "0.1"))
+FREQUENCE_PENALTY = float(os.environ.get("FREQUENCE_PENALTY", "0.4"))
+
+
+"""
+TODO:
+need to upload the model as hugginface/models/seal_13b_a
+# https://huggingface.co/docs/hub/spaces-overview#managing-secrets
+set
+MODEL_REPO_ID=hugginface/models/seal_13b_a
+
+# if persistent, then export the following
+HF_HOME=/data/.huggingface
+TRANSFORMERS_CACHE=/data/.huggingface
+MODEL_PATH=/data/.huggingface/seal-13b-chat-a
+HF_MODEL_NAME=DAMO-NLP-SG/seal-13b-chat-a
+# if not persistent
+MODEL_PATH=./seal-13b-chat-a
+HF_MODEL_NAME=DAMO-NLP-SG/seal-13b-chat-a
+
+
+
+# download will auto detect and get the most updated one
+if DOWNLOAD_SNAPSHOT:
+    print(f'Download from HF_MODEL_NAME={HF_MODEL_NAME} -> {MODEL_PATH}')
+    snapshot_download(HF_MODEL_NAME, local_dir=MODEL_PATH)
+elif not DEBUG:
+    assert os.path.exists(MODEL_PATH), f'{MODEL_PATH} not found and no snapshot download'
+
+"""
+
+
+
+
+# ==============================
+print(f'DEBUG mode: {DEBUG}')
+
+if DTYPE == "bfloat16" and not DEBUG:
+    try:
+        compute_capability = torch.cuda.get_device_capability()
+        if compute_capability[0] < 8:
+            gpu_name = torch.cuda.get_device_name()
+            print(
+                "Bfloat16 is only supported on GPUs with compute capability "
+                f"of at least 8.0. Your {gpu_name} GPU has compute capability "
+                f"{compute_capability[0]}.{compute_capability[1]}. --> Move to FLOAT16")
+            DTYPE = "float16"
+    except Exception as e:
+        print(f'Unable to obtain compute_capability: {e}')
+
 
 # @@ constants ================
 if not DEBUG:
@@ -115,7 +170,6 @@ def hf_model_weights_iterator(
         x for x in glob.glob(os.path.join(hf_folder, "*model*.safetensors"))
         if not x.endswith("training_args.bin")
     ]
-    # print(F'Load bin files: {hf_bin_files} // safetensors: {hf_safetensors_files}')
 
     if use_np_cache:
         # Convert the model weights from torch tensors to numpy arrays for
@@ -226,15 +280,8 @@ def llama_load_weights(
     state_dict = self.state_dict()
     need_to_load = len(state_dict)
    loaded = 0
-    # try:
-    #     iterator = hf_model_weights_iterator(model_name_or_path, cache_dir, use_np_cache)
-    # except Exception as e:
-    #     iterator = hf_model_weights_iterator(model_name_or_path, cache_dir, load_format, revision)
    iterator = hf_model_weights_iterator(model_name_or_path, cache_dir, use_np_cache)
 
-    # for name, loaded_weight in hf_model_weights_iterator(
-    #         model_name_or_path, cache_dir, load_format, revision):
-    #         model_name_or_path, cache_dir, use_np_cache):
    for name, loaded_weight in iterator:
        if "rotary_emb.inv_freq" in name:
            continue
@@ -253,12 +300,6 @@ def llama_load_weights(
            if num_extra_rows > 0:
                print(f'Add empty to {num_extra_rows} extra row for {name}')
            print(f'Load: {name} | {padded_vocab_size=} | {self.config.vocab_size=} | {num_extra_rows=} | {param.size()=} | {loaded_weight.size()=} | {load_size=}')
-
-        # if "embed_tokens" in name or "lm_head" in name:
-        #     param = state_dict[name]
-        #     load_padded_tensor_parallel_vocab(param, loaded_weight, tensor_model_parallel_rank)
-        #     loaded += 1
-        #     continue
 
        is_attention_weight = False
        for weight_name, shard_size, offset in attention_weight_specs:
@@ -385,8 +426,6 @@ if not DEBUG:
 
 set_documentation_group("component")
 
-DATA_ROOT = os.environ.get("dataroot", "/mnt/workspace/workgroup/phi")
-MODEL_CACHE_DIR = os.path.join(DATA_ROOT, "pret_models")
 
 
 DTYPES = {
@@ -397,7 +436,6 @@ DTYPES = {
 llm = None
 demo = None
 
-RELOAD_SIGNAL = '<<<reload:'
 
 BOS_TOKEN = '<s>'
 EOS_TOKEN = '</s>'
@@ -824,28 +862,64 @@ path_markdown = """
 {model_path}
 """
 
+def check_model_path(model_path) -> str:
+    assert os.path.exists(model_path), f'{model_path} not found'
+    ckpt_info = "None"
+    if os.path.isdir(model_path):
+        if os.path.exists(f'{model_path}/info.txt'):
+            with open(f'{model_path}/info.txt', 'r') as f:
+                ckpt_info = f.read()
+                print(f'Checkpoint info:\n{ckpt_info}\n-----')
+        else:
+            print(f'info.txt not found in {model_path}')
+            print(f'model path dir: {list(os.listdir(model_path))}')
+
+    return ckpt_info
+
 
 def launch():
     global demo, llm, DEBUG
     model_desc = MODEL_DESC
     model_path = MODEL_PATH
     model_title = MODEL_TITLE
+    hf_model_name = HF_MODEL_NAME
     tensor_parallel = TENSOR_PARALLEL
     assert tensor_parallel > 0 , f'{tensor_parallel} invalid'
     dtype = DTYPE
     sys_prompt = SYSTEM_PROMPT_1
     max_tokens = MAX_TOKENS
-
+    temperature = TEMPERATURE
+    frequence_penalty = FREQUENCE_PENALTY
+    ckpt_info = "None"
+
+    print(
+        f'Launch config: {model_path=} / {model_title=} / {tensor_parallel=} / {dtype=} / {max_tokens} | {BLOCK_ZH=} '
+        f'\n| STREAM_YIELD_MULTIPLE={STREAM_YIELD_MULTIPLE} '
+        f'\n| frequence_penalty={frequence_penalty} '
+        f'\n| temperature={temperature} '
+        f'\n| hf_model_name={hf_model_name} '
+        f'\n| DOWNLOAD_SNAPSHOT={DOWNLOAD_SNAPSHOT} '
+        f'\nsys={SYSTEM_PROMPT_1}'
+        f'\ndesc={model_desc}'
+    )
 
     if DEBUG:
-        model_desc += "\n<br>!!!!! This is in debug mode, responses will
+        model_desc += "\n<br>!!!!! This is in debug mode, responses will copy original"
         response_fn = debug_chat_response_echo
+        print(f'Creating in DEBUG MODE')
    else:
        # ! load the model
        import vllm
-        assert os.path.exists(model_path), f'{model_path} not found'
        print(F'VLLM: {vllm.__version__}')
-
+
+        if DOWNLOAD_SNAPSHOT:
+            print(f'Downloading from HF_MODEL_NAME={hf_model_name} -> {model_path}')
+            snapshot_download(hf_model_name, local_dir=model_path)
+
+        assert os.path.exists(model_path), f'{model_path} not found and no snapshot download'
+        ckpt_info = check_model_path(model_path)
+
+        print(f'Load path: {model_path} | {ckpt_info}')
        llm = LLM(model=model_path, dtype=dtype, tensor_parallel_size=tensor_parallel)
 
    print(f'Use system prompt:\n{sys_prompt}')
@@ -871,9 +945,9 @@ def launch():
        description=f"{model_desc}",
        # ! decide if can change the system prompt.
        additional_inputs=[
-            gr.Number(value=
+            gr.Number(value=temperature, label='Temperature (higher -> more random)'),
            gr.Number(value=max_tokens, label='Max generated tokens (increase if want more generation)'),
-            gr.Number(value=
+            gr.Number(value=frequence_penalty, label='Frequency penalty (> 0 encourage new tokens)'),
            # gr.Textbox(value=sys_prompt, label='System prompt', lines=8)
        ],
    )
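
For context, the TODO block added in this commit sketches how the Space is meant to be configured through environment variables. Below is a minimal, hypothetical launcher (not part of the repository) showing one way those variables could be set for a local, non-DEBUG run with persistent storage; the values are copied from that docstring and should be treated as assumptions rather than verified defaults.

# Hypothetical local-run sketch, not part of the commit: set the new
# environment variables before app.py reads them at import time.
import os
import subprocess

env = dict(os.environ)
env.update({
    "DEBUG": "0",                      # use the real vLLM path instead of the echo stub
    "DOWNLOAD_SNAPSHOT": "1",          # pull HF_MODEL_NAME into MODEL_PATH via snapshot_download
    "HF_MODEL_NAME": "DAMO-NLP-SG/seal-13b-chat-a",
    "MODEL_PATH": "/data/.huggingface/seal-13b-chat-a",   # persistent-storage layout from the TODO note
    "HF_HOME": "/data/.huggingface",
    "TRANSFORMERS_CACHE": "/data/.huggingface",
    "MAX_TOKENS": "2048",
    "TEMPERATURE": "0.1",
    "FREQUENCE_PENALTY": "0.4",
})
# Launch the Gradio app with the configured environment.
subprocess.run(["python", "app.py"], env=env, check=True)

On a hosted Space the same names would instead be set as Space variables/secrets (see the huggingface.co/docs/hub/spaces-overview#managing-secrets link in the docstring); the non-persistent variant simply points MODEL_PATH at ./seal-13b-chat-a.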