yuekaiz committed
Commit 4d769ff · 1 Parent(s): 82f7b02

update docker file
runtime/triton_trtllm/Dockerfile.server CHANGED
@@ -1,9 +1,10 @@
 FROM nvcr.io/nvidia/tritonserver:25.02-trtllm-python-py3
-RUN pip install tritonclient[grpc] librosa
+RUN apt-get update && apt-get install -y cmake
+RUN git clone https://github.com/pytorch/audio.git && cd audio && git checkout c670ad8 && PATH=/usr/local/cuda/bin:$PATH python3 setup.py develop
+RUN pip install einx==0.3.0 omegaconf==2.3.0 soundfile==0.12.1 soxr==0.5.0.post1 gradio tritonclient librosa
 WORKDIR /workspace
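A minimal sketch of building the updated server image from Dockerfile.server; the image tag below is hypothetical:

    cd runtime/triton_trtllm
    docker build -f Dockerfile.server -t spark-tts-trtllm-server .   # tag name is an assumption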
runtime/triton_trtllm/README.md CHANGED
@@ -36,8 +36,10 @@ python3 client_grpc.py --num-tasks $num_task --huggingface-dataset yuekai/seed_t
 ```
 
 ### Benchmark Results
-Decoding on a single L20 GPU, using 26 different prompt_audio/target_text pairs.
+Decoding on a single L20 GPU, using 26 different prompt_audio/target_text pairs, total audio duration 169 secs.
 
 | Model | Note | Concurrency | Avg Latency | RTF |
 |-------|-----------|-----------------------|---------|--|
-| Spark-TTS-0.5B | [Code Commit]() | 4 | 253 ms | 0.0394|
+| Spark-TTS-0.5B | [Code Commit]() | 1 | 876.24 ms | 0.1362|
+| Spark-TTS-0.5B | [Code Commit]() | 2 | 920.97 ms | 0.0737|
+| Spark-TTS-0.5B | [Code Commit]() | 4 | 1611.51 ms | 0.0704|
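If the RTF column is read as wall-clock decoding time divided by the total generated audio duration (the 169 secs noted above), an assumption since the README does not define it, the concurrency-4 row implies roughly 0.0704 × 169 ≈ 11.9 s of decoding for the whole test set:

    python3 -c "print(0.0704 * 169)"   # ≈ 11.9 s, under the assumed definition RTF = decode_time / total_audio_duration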
runtime/triton_trtllm/build.sh DELETED
@@ -1,76 +0,0 @@
-
-
-export PYTHONPATH=../../../Spark-TTS/
-export CUDA_VISIBLE_DEVICES=0
-stage=$1
-stop_stage=$2
-echo "Start stage: $stage, Stop stage: $stop_stage"
-
-huggingface_model_local_dir=../../pretrained_models/Spark-TTS-0.5B
-trt_dtype=bfloat16
-trt_weights_dir=./tllm_checkpoint_${trt_dtype}
-trt_engines_dir=./trt_engines_${trt_dtype}
-
-model_repo=./model_repo_test
-
-if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
-    echo "Downloading Spark-TTS-0.5B from HuggingFace"
-    hugginface-cli download SparkAudio/Spark-TTS-0.5B --local-dir $huggingface_model_local_dir || exit 1
-    # pip install -r /workspace_yuekai/spark-tts/Spark-TTS/requirements.txt
-fi
-
-
-if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
-    echo "Converting checkpoint to TensorRT weights"
-    python scripts/convert_checkpoint.py --model_dir $huggingface_model_local_dir/LLM \
-        --output_dir $trt_weights_dir \
-        --dtype $trt_dtype || exit 1
-
-    echo "Building TensorRT engines"
-    trtllm-build --checkpoint_dir $trt_weights_dir \
-        --output_dir $trt_engines_dir \
-        --max_batch_size 16 \
-        --max_num_tokens 32768 \
-        --gemm_plugin $trt_dtype || exit 1
-fi
-
-if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
-    echo "Creating model repository"
-    rm -rf $model_repo
-    cp -r ./model_repo $model_repo
-
-    ENGINE_PATH=$trt_engines_dir
-    MAX_QUEUE_DELAY_MICROSECONDS=0
-    MODEL_DIR=$huggingface_model_local_dir
-    LLM_TOKENIZER_DIR=$huggingface_model_local_dir/LLM
-    BLS_INSTANCE_NUM=4
-    TRITON_MAX_BATCH_SIZE=16
-
-    python3 scripts/fill_template.py -i ${model_repo}/vocoder/config.pbtxt model_dir:${MODEL_DIR},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS}
-    python3 scripts/fill_template.py -i ${model_repo}/audio_tokenizer/config.pbtxt model_dir:${MODEL_DIR},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS}
-    python3 scripts/fill_template.py -i ${model_repo}/spark_tts/config.pbtxt bls_instance_num:${BLS_INSTANCE_NUM},llm_tokenizer_dir:${LLM_TOKENIZER_DIR},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS}
-    python3 scripts/fill_template.py -i ${model_repo}/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},decoupled_mode:False,max_beam_width:1,engine_dir:${ENGINE_PATH},max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS},encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32
-
-fi
-
-if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
-    echo "Starting Triton server"
-    tritonserver --model-repository ${model_repo}
-fi
-
-
-if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
-    echo "Running client"
-    num_task=4
-    python3 client_grpc.py \
-        --server-addr localhost \
-        --model-name spark_tts \
-        --num-tasks $num_task \
-        --log-dir ./log_${num_task}
-fi
runtime/triton_trtllm/client_grpc.py CHANGED
@@ -435,7 +435,7 @@ async def main():
                 log_interval=args.log_interval,
                 model_name=args.model_name,
                 audio_save_dir=args.log_dir,
-                padding_duration=10,
+                padding_duration=None,
             )
         )
         tasks.append(task)
runtime/triton_trtllm/docker-compose.yml CHANGED
@@ -17,4 +17,4 @@ services:
               device_ids: ['0']
               capabilities: [gpu]
     command: >
-      /bin/bash -c "rm -rf sherpa && git clone https://github.com/yuekaizhang/sherpa.git -b f5 && cd sherpa/triton/f5_tts/ && bash build_server.sh $VOCODER"
+      /bin/bash -c "rm -rf Spark-TTS && git clone https://github.com/SparkAudio/Spark-TTS.git && cd Spark-TTS/runtime/triton_trtllm && bash run.sh 0 3"
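A minimal sketch of bringing the service up with this compose file, assuming it is run from runtime/triton_trtllm; the container then clones Spark-TTS and executes run.sh stages 0 through 3 as in the command above:

    cd runtime/triton_trtllm
    docker compose up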
runtime/triton_trtllm/run.sh CHANGED
@@ -15,14 +15,14 @@ model_repo=./model_repo_test
 
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
     echo "Downloading Spark-TTS-0.5B from HuggingFace"
-    hugginface-cli download SparkAudio/Spark-TTS-0.5B --local-dir $huggingface_model_local_dir || exit 1
+    huggingface-cli download SparkAudio/Spark-TTS-0.5B --local-dir $huggingface_model_local_dir || exit 1
     # pip install -r /workspace_yuekai/spark-tts/Spark-TTS/requirements.txt
 fi
 
 
 if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
     echo "Converting checkpoint to TensorRT weights"
-    python convert_checkpoint.py --model_dir $huggingface_model_local_dir/LLM \
+    python scripts/convert_checkpoint.py --model_dir $huggingface_model_local_dir/LLM \
         --output_dir $trt_weights_dir \
         --dtype $trt_dtype || exit 1
 
@@ -61,12 +61,12 @@ fi
 
 if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
     echo "Running client"
-    num_task=4
+    num_task=2
     python3 client_grpc.py \
         --server-addr localhost \
         --model-name spark_tts \
         --num-tasks $num_task \
-        --log-dir ./log_${num_task}
+        --log-dir ./log_concurrent_tasks_${num_task}
 fi
 
 
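A short usage sketch for run.sh after these changes, following the script's own stage/stop_stage convention (0: download the model, 1: convert the checkpoint and build TensorRT-LLM engines, 2: create and fill the Triton model repository, 3: start tritonserver, 4: run the gRPC benchmark client):

    bash run.sh 0 3   # download, build engines, prepare the model repo, and start the server
    bash run.sh 4 4   # from a second shell: benchmark with client_grpc.py at num_task=2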