# Scraped page banner (not part of the script) -- Spaces: Runtime error
# Builds TensorRT-LLM engines for the Dolphin model, following the Nougat
# multimodal recipe referenced below.
#
# All paths are relative: run this from the directory that contains ./convert/.
# Downloads and build outputs are written under ./tmp/.
#
# -e: abort on the first failing command
# -u: treat use of an unset variable as an error
# -x: trace every command as it runs
set -eux

############################################################################################
# Reference: https://github.com/NVIDIA/TensorRT-LLM/tree/v0.18.2/examples/multimodal#nougat
############################################################################################

# Put the TensorRT and cuDNN library directories (under site-packages) first on
# the loader search path. The previous value is appended only when it is
# non-empty, so no empty path entry (which the dynamic loader interprets as the
# current working directory) is ever introduced.
export LD_LIBRARY_PATH="/usr/local/lib/python3.10/site-packages/tensorrt_libs/:/usr/local/lib/python3.10/site-packages/nvidia/cudnn/lib/${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

# 1. Download Huggingface weights
export MODEL_NAME="Dolphin"

hf_model_dir="tmp/hf_models/${MODEL_NAME}"
checkpoint_dir="tmp/trt_models/${MODEL_NAME}/bfloat16"
engine_dir="tmp/trt_engines/${MODEL_NAME}/1-gpu/bfloat16"

# `git clone` refuses to write into an existing non-empty directory, which would
# abort every re-run under `set -e`; skip the download when a clone is present.
# NOTE(review): Hugging Face model repos usually store weights via Git LFS --
# confirm git-lfs is installed so real weights (not pointer files) are fetched.
if [ ! -d "${hf_model_dir}/.git" ]; then
  git clone "https://huggingface.co/Bytedance/${MODEL_NAME}" "${hf_model_dir}"
fi

# Build-time limits forwarded to trtllm-build / build_visual_engine.py below.
export MAX_BATCH_SIZE=16
export MAX_SEQ_LEN=4096
export MAX_INPUT_LEN=10
export MAX_ENCODER_INPUT_LEN=784

# 2. Convert Huggingface weights into TRT-LLM checkpoints and build TRT engines
#    using scripts in examples/enc_dec
python ./convert/convert_checkpoint.py --model_type bart \
  --model_dir "${hf_model_dir}" \
  --output_dir "${checkpoint_dir}" \
  --tp_size 1 \
  --pp_size 1 \
  --dtype bfloat16 \
  --nougat

# The decoder's encoder-input budget is
# MAX_BATCH_SIZE (max_batch_size) * MAX_ENCODER_INPUT_LEN (num_visual_features).
max_encoder_input_len=$(( MAX_BATCH_SIZE * MAX_ENCODER_INPUT_LEN ))

trtllm-build --checkpoint_dir "${checkpoint_dir}/decoder" \
  --output_dir "${engine_dir}/decoder" \
  --paged_kv_cache disable \
  --moe_plugin disable \
  --gemm_plugin bfloat16 \
  --bert_attention_plugin bfloat16 \
  --gpt_attention_plugin bfloat16 \
  --remove_input_padding enable \
  --max_beam_width 1 \
  --max_batch_size "${MAX_BATCH_SIZE}" \
  --max_seq_len "${MAX_SEQ_LEN}" \
  --max_input_len "${MAX_INPUT_LEN}" \
  --max_encoder_input_len "${max_encoder_input_len}"

# 3. Generate TensorRT engines for visual components and combine everything
#    into final pipeline.
python ./convert/build_visual_engine.py --model_type nougat \
  --model_path "${hf_model_dir}" \
  --max_batch_size "${MAX_BATCH_SIZE}"