update docker file
runtime/triton_trtllm/Dockerfile.server
CHANGED
@@ -1,9 +1,10 @@
 FROM nvcr.io/nvidia/tritonserver:25.02-trtllm-python-py3
-RUN
+RUN apt-get update && apt-get install -y cmake
+RUN git clone https://github.com/pytorch/audio.git && cd audio && git checkout c670ad8 && PATH=/usr/local/cuda/bin:$PATH python3 setup.py develop
+RUN pip install einx==0.3.0 omegaconf==2.3.0 soundfile==0.12.1 soxr==0.5.0.post1 gradio tritonclient librosa
 WORKDIR /workspace

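For orientation, a minimal build-and-smoke-test sketch for this Dockerfile; the image tag spark-tts-triton is an illustrative name, not something defined by the commit:

# Build the server image from the repo root (tag name is hypothetical).
docker build -f runtime/triton_trtllm/Dockerfile.server -t spark-tts-triton .

# Check that the newly pinned packages and the source-built torchaudio import cleanly.
docker run --rm spark-tts-triton \
  python3 -c "import torchaudio, einx, soundfile, soxr; print(torchaudio.__version__)"
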
runtime/triton_trtllm/README.md
CHANGED
@@ -36,8 +36,10 @@ python3 client_grpc.py --num-tasks $num_task --huggingface-dataset yuekai/seed_t
 ```
 
 ### Benchmark Results
-Decoding on a single L20 GPU, using 26 different prompt_audio/target_text pairs.
+Decoding on a single L20 GPU, using 26 different prompt_audio/target_text pairs, total audio duration 169 secs.
 
 | Model | Note | Concurrency | Avg Latency | RTF |
 |-------|-----------|-----------------------|---------|--|
-| Spark-TTS-0.5B | [Code Commit]() |
+| Spark-TTS-0.5B | [Code Commit]() | 1 | 876.24 ms | 0.1362 |
+| Spark-TTS-0.5B | [Code Commit]() | 2 | 920.97 ms | 0.0737 |
+| Spark-TTS-0.5B | [Code Commit]() | 4 | 1611.51 ms | 0.0704 |

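As a rough plausibility check on the table (assuming RTF here means total wall-clock decoding time divided by the 169 s of generated audio, which the commit does not spell out), the concurrency-1 row roughly follows from the average latency:

# 26 near-sequential requests at ~876 ms each, over 169 s of audio:
awk 'BEGIN { printf "RTF ~= %.4f\n", (0.87624 * 26) / 169 }'   # ~0.1348, close to the reported 0.1362
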
runtime/triton_trtllm/build.sh
DELETED
@@ -1,76 +0,0 @@
-
-
-export PYTHONPATH=../../../Spark-TTS/
-export CUDA_VISIBLE_DEVICES=0
-stage=$1
-stop_stage=$2
-echo "Start stage: $stage, Stop stage: $stop_stage"
-
-huggingface_model_local_dir=../../pretrained_models/Spark-TTS-0.5B
-trt_dtype=bfloat16
-trt_weights_dir=./tllm_checkpoint_${trt_dtype}
-trt_engines_dir=./trt_engines_${trt_dtype}
-
-model_repo=./model_repo_test
-
-if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
-  echo "Downloading Spark-TTS-0.5B from HuggingFace"
-  hugginface-cli download SparkAudio/Spark-TTS-0.5B --local-dir $huggingface_model_local_dir || exit 1
-  # pip install -r /workspace_yuekai/spark-tts/Spark-TTS/requirements.txt
-fi
-
-
-if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
-  echo "Converting checkpoint to TensorRT weights"
-  python scripts/convert_checkpoint.py --model_dir $huggingface_model_local_dir/LLM \
-    --output_dir $trt_weights_dir \
-    --dtype $trt_dtype || exit 1
-
-  echo "Building TensorRT engines"
-  trtllm-build --checkpoint_dir $trt_weights_dir \
-    --output_dir $trt_engines_dir \
-    --max_batch_size 16 \
-    --max_num_tokens 32768 \
-    --gemm_plugin $trt_dtype || exit 1
-fi
-
-if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
-  echo "Creating model repository"
-  rm -rf $model_repo
-  cp -r ./model_repo $model_repo
-
-  ENGINE_PATH=$trt_engines_dir
-  MAX_QUEUE_DELAY_MICROSECONDS=0
-  MODEL_DIR=$huggingface_model_local_dir
-  LLM_TOKENIZER_DIR=$huggingface_model_local_dir/LLM
-  BLS_INSTANCE_NUM=4
-  TRITON_MAX_BATCH_SIZE=16
-
-  python3 scripts/fill_template.py -i ${model_repo}/vocoder/config.pbtxt model_dir:${MODEL_DIR},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS}
-  python3 scripts/fill_template.py -i ${model_repo}/audio_tokenizer/config.pbtxt model_dir:${MODEL_DIR},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS}
-  python3 scripts/fill_template.py -i ${model_repo}/spark_tts/config.pbtxt bls_instance_num:${BLS_INSTANCE_NUM},llm_tokenizer_dir:${LLM_TOKENIZER_DIR},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS}
-  python3 scripts/fill_template.py -i ${model_repo}/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},decoupled_mode:False,max_beam_width:1,engine_dir:${ENGINE_PATH},max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS},encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32
-
-fi
-
-if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
-  echo "Starting Triton server"
-  tritonserver --model-repository ${model_repo}
-fi
-
-
-if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
-  echo "Running client"
-  num_task=4
-  python3 client_grpc.py \
-    --server-addr localhost \
-    --model-name spark_tts \
-    --num-tasks $num_task \
-    --log-dir ./log_${num_task}
-fi

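The deleted script (like its surviving counterpart run.sh, diffed below) is driven by the common Kaldi-style stage-gating idiom: block N runs iff stage <= N <= stop_stage. A self-contained illustration of the pattern, with a hypothetical file name stage_demo.sh:

#!/bin/bash
# stage_demo.sh: run only the blocks whose index N satisfies stage <= N <= stop_stage,
# e.g. "bash stage_demo.sh 1 2" executes blocks 1 and 2 but skips block 0.
stage=$1
stop_stage=$2
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then echo "block 0"; fi
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then echo "block 1"; fi
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then echo "block 2"; fi
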
runtime/triton_trtllm/client_grpc.py
CHANGED
@@ -435,7 +435,7 @@ async def main():
                 log_interval=args.log_interval,
                 model_name=args.model_name,
                 audio_save_dir=args.log_dir,
-                padding_duration=
+                padding_duration=None,
             )
         )
         tasks.append(task)

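With the dangling keyword argument fixed, the client can be exercised directly. A hedged invocation, using only flags that appear elsewhere in this commit:

# Assumes a Triton server started by run.sh stage 3 is listening on localhost.
python3 client_grpc.py \
  --server-addr localhost \
  --model-name spark_tts \
  --num-tasks 2 \
  --log-dir ./log_concurrent_tasks_2
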
runtime/triton_trtllm/docker-compose.yml
CHANGED
@@ -17,4 +17,4 @@ services:
           device_ids: ['0']
           capabilities: [gpu]
     command: >
-      /bin/bash -c "rm -rf
+      /bin/bash -c "rm -rf Spark-TTS && git clone https://github.com/SparkAudio/Spark-TTS.git && cd Spark-TTS/runtime/triton_trtllm && bash run.sh 0 3"

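A usage sketch: with this command baked into the compose file, one invocation clones the repo inside the container and walks run.sh stages 0 through 3. The NVIDIA Container Toolkit is assumed on the host for the gpu capability requested above:

# From runtime/triton_trtllm on the host:
docker compose up
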
runtime/triton_trtllm/run.sh
CHANGED
@@ -15,14 +15,14 @@ model_repo=./model_repo_test
 
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
   echo "Downloading Spark-TTS-0.5B from HuggingFace"
-
+  huggingface-cli download SparkAudio/Spark-TTS-0.5B --local-dir $huggingface_model_local_dir || exit 1
   # pip install -r /workspace_yuekai/spark-tts/Spark-TTS/requirements.txt
 fi
 
 
 if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
   echo "Converting checkpoint to TensorRT weights"
-  python convert_checkpoint.py --model_dir $huggingface_model_local_dir/LLM \
+  python scripts/convert_checkpoint.py --model_dir $huggingface_model_local_dir/LLM \
     --output_dir $trt_weights_dir \
     --dtype $trt_dtype || exit 1
 
@@ -61,12 +61,12 @@ fi
 
 if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
   echo "Running client"
-  num_task=
+  num_task=2
   python3 client_grpc.py \
     --server-addr localhost \
     --model-name spark_tts \
     --num-tasks $num_task \
-    --log-dir ./
+    --log-dir ./log_concurrent_tasks_${num_task}
 fi
 
 
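Putting the pieces together, a hedged end-to-end driver for the updated run.sh, using the stage numbers visible in the hunks above:

# Download the model (0), convert the checkpoint and build TRT-LLM engines (1),
# fill the Triton model-repo templates (2), and start tritonserver (3):
bash run.sh 0 3

# Once the server is up, benchmark from a second shell with the gRPC client (4):
bash run.sh 4 4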