Spaces:
Paused
Paused
NGUYEN, Xuan Phi
committed on
Commit
·
f028d50
1
Parent(s):
203c3cd
update
Browse files
app.py
CHANGED
|
@@ -10,43 +10,45 @@ tensor_parallel must == 1
|
|
| 10 |
|
| 11 |
"""
|
| 12 |
|
| 13 |
-
|
| 14 |
import os
|
| 15 |
import numpy as np
|
| 16 |
import argparse
|
| 17 |
-
|
| 18 |
import gradio as gr
|
| 19 |
-
from gradio_client.documentation import document, set_documentation_group
|
| 20 |
-
|
| 21 |
-
from typing import List, Optional, Union, Dict, Tuple
|
| 22 |
-
|
| 23 |
-
from tqdm import tqdm
|
| 24 |
-
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
|
| 25 |
-
|
| 26 |
-
from vllm.engine.arg_utils import EngineArgs
|
| 27 |
-
from vllm.engine.llm_engine import LLMEngine
|
| 28 |
-
from vllm.outputs import RequestOutput
|
| 29 |
-
from vllm.sampling_params import SamplingParams
|
| 30 |
-
from vllm.utils import Counter
|
| 31 |
-
from vllm.sequence import (Sequence, SequenceData, SequenceGroup,
|
| 32 |
-
SequenceGroupMetadata, SequenceOutputs,
|
| 33 |
-
SequenceStatus)
|
| 34 |
-
|
| 35 |
-
# ! reconfigure vllm to faster llama
|
| 36 |
from typing import Any, Iterator
|
| 37 |
from typing import Iterator, List, Optional, Tuple
|
| 38 |
import filelock
|
| 39 |
import glob
|
| 40 |
import json
|
| 41 |
-
import os
|
| 42 |
-
from huggingface_hub import snapshot_download
|
| 43 |
|
|
|
|
|
|
|
|
|
|
| 44 |
from tqdm.auto import tqdm
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
-
|
| 47 |
-
from vllm
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
-
|
|
|
|
| 50 |
|
| 51 |
|
| 52 |
def hf_model_weights_iterator(
|
|
@@ -661,18 +663,35 @@ def debug_chat_response_echo(
|
|
| 661 |
yield message
|
| 662 |
|
| 663 |
|
|
|
|
| 664 |
MODEL_TITLE = "DAMO-SeaL-13B - An Assistant for South East Asian Languages"
|
| 665 |
MODEL_DESC = """
|
| 666 |
This is a 13B DAMO-SeaL-Chat assistant model built by DAMO Academy, Alibaba Group. It can produce helpful responses in English, Vietnamese, Indonesian and Thai.
|
| 667 |
""".strip()
|
| 668 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 669 |
TENSOR_PARALLEL = int(os.environ.get("TENSOR_PARALLEL", "1"))
|
| 670 |
DTYPE = 'bfloat16'
|
| 671 |
DTYPE = 'float16'
|
| 672 |
|
| 673 |
MODEL_PATH = os.environ.get("MODEL_PATH", "notfound, please set `export MODEL_PATH=`")
|
| 674 |
|
| 675 |
-
|
|
|
|
| 676 |
|
| 677 |
def launch():
|
| 678 |
global demo, llm, DEBUG
|
|
@@ -720,6 +739,8 @@ def launch():
|
|
| 720 |
gr.Number(value=0.4, label='Frequency penalty (> 0 encourage new tokens)'),
|
| 721 |
gr.Textbox(value=sys_prompt, label='System prompt', lines=8)],
|
| 722 |
)
|
|
|
|
|
|
|
| 723 |
demo.queue()
|
| 724 |
# demo.launch(server_port=args.port)
|
| 725 |
demo.launch()
|
|
|
|
| 10 |
|
| 11 |
"""
|
| 12 |
|
| 13 |
+
|
| 14 |
import os
|
| 15 |
import numpy as np
|
| 16 |
import argparse
|
| 17 |
+
import torch
|
| 18 |
import gradio as gr
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
from typing import Any, Iterator
|
| 20 |
from typing import Iterator, List, Optional, Tuple
|
| 21 |
import filelock
|
| 22 |
import glob
|
| 23 |
import json
|
|
|
|
|
|
|
| 24 |
|
| 25 |
+
from gradio_client.documentation import document, set_documentation_group
|
| 26 |
+
|
| 27 |
+
from typing import List, Optional, Union, Dict, Tuple
|
| 28 |
from tqdm.auto import tqdm
|
| 29 |
+
from huggingface_hub import snapshot_download
|
| 30 |
+
|
| 31 |
+
DEBUG = True
|
| 32 |
+
|
| 33 |
+
if not DEBUG:
|
| 34 |
|
| 35 |
+
# vllm import
|
| 36 |
+
from vllm import LLM, SamplingParams
|
| 37 |
+
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
|
| 38 |
+
from vllm.engine.arg_utils import EngineArgs
|
| 39 |
+
from vllm.engine.llm_engine import LLMEngine
|
| 40 |
+
from vllm.outputs import RequestOutput
|
| 41 |
+
from vllm.sampling_params import SamplingParams
|
| 42 |
+
from vllm.utils import Counter
|
| 43 |
+
from vllm.sequence import (Sequence, SequenceData, SequenceGroup,
|
| 44 |
+
SequenceGroupMetadata, SequenceOutputs,
|
| 45 |
+
SequenceStatus)
|
| 46 |
+
# ! reconfigure vllm to faster llama
|
| 47 |
+
from vllm.model_executor.model_loader import _MODEL_REGISTRY
|
| 48 |
+
from vllm.model_executor.models import LlamaForCausalLM
|
| 49 |
|
| 50 |
+
|
| 51 |
+
_MODEL_REGISTRY['FasterLlamaForCausalLM'] = LlamaForCausalLM
|
| 52 |
|
| 53 |
|
| 54 |
def hf_model_weights_iterator(
|
|
|
|
| 663 |
yield message
|
| 664 |
|
| 665 |
|
| 666 |
+
# ============ CONSTANT ============
|
| 667 |
MODEL_TITLE = "DAMO-SeaL-13B - An Assistant for South East Asian Languages"
|
| 668 |
MODEL_DESC = """
|
| 669 |
This is a 13B DAMO-SeaL-Chat assistant model built by DAMO Academy, Alibaba Group. It can produce helpful responses in English, Vietnamese, Indonesian and Thai.
|
| 670 |
""".strip()
|
| 671 |
|
| 672 |
+
|
| 673 |
+
cite_markdown = """
|
| 674 |
+
## Citation
|
| 675 |
+
If you find our project useful, hope you can star our repo and cite our paper as follows:
|
| 676 |
+
```
|
| 677 |
+
@article{damonlpsg2023seallm,
|
| 678 |
+
author = {???},
|
| 679 |
+
title = {SeaL: A language model for South East Asian Languages},
|
| 680 |
+
year = 2023,
|
| 681 |
+
}
|
| 682 |
+
"""
|
| 683 |
+
# journal = {arXiv preprint arXiv:2306.02858}
|
| 684 |
+
# url = {https://arxiv.org/abs/2306.02858}
|
| 685 |
+
|
| 686 |
+
|
| 687 |
TENSOR_PARALLEL = int(os.environ.get("TENSOR_PARALLEL", "1"))
|
| 688 |
DTYPE = 'bfloat16'
|
| 689 |
DTYPE = 'float16'
|
| 690 |
|
| 691 |
MODEL_PATH = os.environ.get("MODEL_PATH", "notfound, please set `export MODEL_PATH=`")
|
| 692 |
|
| 693 |
+
|
| 694 |
+
|
| 695 |
|
| 696 |
def launch():
|
| 697 |
global demo, llm, DEBUG
|
|
|
|
| 739 |
gr.Number(value=0.4, label='Frequency penalty (> 0 encourage new tokens)'),
|
| 740 |
gr.Textbox(value=sys_prompt, label='System prompt', lines=8)],
|
| 741 |
)
|
| 742 |
+
|
| 743 |
+
gr.Markdown(cite_markdown)
|
| 744 |
demo.queue()
|
| 745 |
# demo.launch(server_port=args.port)
|
| 746 |
demo.launch()
|