File size: 8,422 Bytes

0cbcfbb

# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

import types
from pathlib import Path

import tensorrt as trt
import torch
from cache_diffusion.cachify import CACHED_PIPE, get_model
from cuda import cudart
from diffusers.models.transformers.transformer_sd3 import SD3Transformer2DModel
from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel
from pipe.config import ONNX_CONFIG
from pipe.models.sd3 import sd3_forward
from pipe.models.sdxl import (
    cachecrossattnupblock2d_forward,
    cacheunet_forward,
    cacheupblock2d_forward,
)
from polygraphy.backend.trt import (
    CreateConfig,
    Profile,
    engine_from_network,
    network_from_onnx_path,
    save_engine,
)
from torch.onnx import export as onnx_export

from .utils import Engine


def replace_new_forward(backbone):
    """Bind the cache-aware forward implementations onto ``backbone`` in place.

    The patch set is chosen by the backbone's exact class:

    * ``UNet2DConditionModel``: the model gets ``cacheunet_forward`` and every
      entry of ``backbone.up_blocks`` gets ``cachecrossattnupblock2d_forward``
      when it has a truthy ``has_cross_attention`` attribute, otherwise
      ``cacheupblock2d_forward``.
    * ``SD3Transformer2DModel``: the model gets ``sd3_forward``.

    Any other class is left untouched.

    Args:
        backbone: Model instance whose ``forward`` attributes are rebound.
    """
    model_cls = backbone.__class__

    if model_cls == UNet2DConditionModel:
        backbone.forward = types.MethodType(cacheunet_forward, backbone)
        for block in backbone.up_blocks:
            # A missing attribute is treated the same as a falsy one.
            uses_cross_attention = getattr(block, "has_cross_attention", False)
            patched_forward = (
                cachecrossattnupblock2d_forward
                if uses_cross_attention
                else cacheupblock2d_forward
            )
            block.forward = types.MethodType(patched_forward, block)
        return

    if model_cls == SD3Transformer2DModel:
        backbone.forward = types.MethodType(sd3_forward, backbone)


def get_input_info(dummy_dict, info: str = None, batch_size: int = 1):
    """Flatten a (possibly nested) shape config into the requested view.

    ``dummy_dict`` maps names either to a shape tuple or to another dict of
    the same form. Leaves are visited depth-first in insertion order and the
    first dimension of every leaf shape is multiplied by ``batch_size``.

    Args:
        dummy_dict: Nested mapping of input name to shape tuple.
        info: Selects the returned view:
            ``"profile_shapes"`` gives a list of ``(name, shape)`` tuples,
            ``"input_names"`` gives a list of names,
            ``"profile_shapes_dict"`` gives a dict of name to shape,
            ``"dummy_input"`` gives a dict of name to an all-ones float16
            CUDA tensor of that shape.
            Any other value (including the default ``None``) gives ``{}``.
        batch_size: Multiplier for the leading dimension of each shape.

    Returns:
        A list or dict as described for ``info``.
    """

    def iter_leaves(tree):
        # Yield (name, batch-scaled shape) for every non-dict value.
        for name, shape in tree.items():
            if isinstance(shape, dict):
                yield from iter_leaves(shape)
            else:
                yield name, (shape[0] * batch_size,) + shape[1:]

    leaves = iter_leaves(dummy_dict)

    if info == "profile_shapes":
        return list(leaves)
    if info == "input_names":
        return [name for name, _ in leaves]
    if info == "profile_shapes_dict":
        return dict(leaves)
    if info == "dummy_input":
        return {name: torch.ones(shape).half().cuda() for name, shape in leaves}

    # Unrecognised selector: the tree is still walked so that malformed
    # shapes fail here, but nothing is collected.
    for _ in leaves:
        pass
    return {}


def complie2trt(cls, onnx_path: Path, engine_path: Path, batch_size: int = 1):
    """Build a TensorRT engine plan for each exported ONNX block of ``cls``.

    Every sub-directory of ``onnx_path`` whose name is a key of
    ``ONNX_CONFIG[cls]`` is expected to contain a ``model.onnx``; it is
    compiled and saved as ``engine_path/<sub-directory name>.plan``. A plan
    file that already exists is left untouched and the block is skipped.

    Args:
        cls: Backbone class used as the key into ``ONNX_CONFIG``.
        onnx_path: Directory holding one sub-directory per exported block.
        engine_path: Directory that receives the ``.plan`` files.
        batch_size: Multiplier applied to the leading dimension of every
            configured input shape; the scaled shape is used as both the
            opt and the max shape of the optimization profile.
    """
    # Snapshot the directory listing before any engine is written.
    subdirs = [entry for entry in onnx_path.iterdir() if entry.is_dir()]
    for subdir in subdirs:
        block_configs = ONNX_CONFIG[cls]
        if subdir.name not in block_configs:
            continue

        model_path = subdir / "model.onnx"
        plan_path = engine_path / f"{subdir.name}.plan"
        if plan_path.exists():
            # Report the file that was actually found: the engine plan,
            # not the ONNX model it would have been built from.
            print(f"{str(plan_path)} already exists!")
            continue

        print(f"Building {str(model_path)}")
        build_profile = Profile()
        profile_shapes = get_input_info(
            block_configs[subdir.name]["dummy_input"], "profile_shapes", batch_size
        )
        for input_name, input_shape in profile_shapes:
            # The minimum shape pins the leading dimension to 2.
            # NOTE(review): presumably the un-multiplied batch of the
            # configured dummy inputs -- confirm against ONNX_CONFIG.
            min_input_shape = (2,) + input_shape[1:]
            build_profile.add(input_name, min_input_shape, input_shape, input_shape)

        block_network = network_from_onnx_path(
            str(model_path), flags=[trt.OnnxParserFlag.NATIVE_INSTANCENORM], strongly_typed=True
        )
        build_config = CreateConfig(
            builder_optimization_level=6,
            tf32=True,
            #bf16=True,
            profiles=[build_profile],
        )
        engine = engine_from_network(
            block_network,
            config=build_config,
        )
        save_engine(engine, path=plan_path)


def get_total_device_memory(backbone):
    """Return the largest ``device_memory_size`` among ``backbone.engines``.

    Despite the name, the result is a maximum rather than a sum. It is 0
    when ``backbone.engines`` is empty.

    Args:
        backbone: Object whose ``engines`` dict maps names to wrappers that
            expose the underlying engine as ``.engine``.

    Returns:
        The maximum ``engine.engine.device_memory_size``, floored at 0.
    """
    sizes = [wrapper.engine.device_memory_size for wrapper in backbone.engines.values()]
    # Seeding with 0 keeps the empty case well-defined.
    return max([0, *sizes])


def load_engines(backbone, engine_path: Path, batch_size: int = 1):
    """Load every engine file in ``engine_path`` and prepare it for inference.

    Each regular file in ``engine_path`` is loaded into an ``Engine`` and
    stored in ``backbone.engines`` under the file's stem. One device buffer,
    sized by ``get_total_device_memory``, is allocated and handed to every
    engine's ``activate``. A CUDA stream is created and stored as
    ``backbone.cuda_stream``, and each engine allocates its buffers for the
    configured input shapes scaled by ``batch_size``.

    Args:
        backbone: Model that receives the ``engines`` and ``cuda_stream``
            attributes; its class is the key into ``ONNX_CONFIG``.
        engine_path: Directory containing the serialized engine files.
        batch_size: Multiplier for the leading dimension of the input shapes.

    Raises:
        RuntimeError: If the CUDA allocation or stream creation fails.
    """
    backbone.engines = {}
    for plan_file in engine_path.iterdir():
        if plan_file.is_file():
            eng = Engine()
            eng.load(str(plan_file))
            backbone.engines[plan_file.stem] = eng

    # cudart calls return (error_code, value); check the code instead of
    # carrying on with an invalid pointer or stream handle.
    err, shared_device_memory = cudart.cudaMalloc(get_total_device_memory(backbone))
    if err != cudart.cudaError_t.cudaSuccess:
        raise RuntimeError(f"cudaMalloc for shared engine memory failed: {err}")
    # NOTE(review): the pointer is only passed to the engines and is not kept
    # on the backbone, so nothing visible here can cudaFree it later unless
    # Engine retains it -- confirm.
    for engine in backbone.engines.values():
        engine.activate(shared_device_memory)

    err, cuda_stream = cudart.cudaStreamCreate()
    if err != cudart.cudaError_t.cudaSuccess:
        raise RuntimeError(f"cudaStreamCreate failed: {err}")
    backbone.cuda_stream = cuda_stream

    for block_name, engine in backbone.engines.items():
        engine.allocate_buffers(
            shape_dict=get_input_info(
                ONNX_CONFIG[backbone.__class__][block_name]["dummy_input"],
                "profile_shapes_dict",
                batch_size,
            ),
            device=backbone.device,
            batch_size=batch_size,
        )
    # TODO: Free and clean up the origin pytorch cuda memory


def export_onnx(backbone, onnx_path: Path):
    """Export each cached sub-module of ``backbone`` to its own ONNX file.

    Every module from ``backbone.named_modules()`` that is an instance of
    ``CACHED_PIPE[backbone.__class__]`` is written to
    ``onnx_path/<module name>/model.onnx``, using the dummy inputs, input and
    output names and dynamic axes from ``ONNX_CONFIG``. A module whose file
    already exists is skipped with a message.

    Args:
        backbone: Model whose sub-modules are exported; its class is the key
            into ``CACHED_PIPE`` and ``ONNX_CONFIG``.
        onnx_path: Root directory for the per-module export directories.
    """
    for name, module in backbone.named_modules():
        if not isinstance(module, CACHED_PIPE[backbone.__class__]):
            continue

        block_dir = onnx_path / name
        model_file = block_dir / "model.onnx"
        if model_file.exists():
            print(f"{str(model_file)} alread exists!")
            continue

        block_dir.mkdir(parents=True, exist_ok=True)
        block_cfg = ONNX_CONFIG[backbone.__class__][name]
        # Dummy inputs use the default batch multiplier of 1.
        example_inputs = get_input_info(block_cfg["dummy_input"], "dummy_input")
        names_in = get_input_info(block_cfg["dummy_input"], "input_names")
        names_out = block_cfg["output_names"]
        onnx_export(
            module,
            args=example_inputs,
            f=model_file.as_posix(),
            input_names=names_in,
            output_names=names_out,
            dynamic_axes=block_cfg["dynamic_axes"],
            do_constant_folding=True,
            opset_version=17,
        )


def warm_up(backbone, batch_size: int = 1):
    """Run every loaded engine once on dummy inputs and discard the output.

    Args:
        backbone: Model carrying the ``engines`` dict and ``cuda_stream``;
            its class is the key into ``ONNX_CONFIG``.
        batch_size: Multiplier for the leading dimension of the dummy inputs.
    """
    print("Warming-up TensorRT engines...")
    for block_name in backbone.engines:
        block_cfg = ONNX_CONFIG[backbone.__class__][block_name]
        warmup_inputs = get_input_info(block_cfg["dummy_input"], "dummy_input", batch_size)
        # The result is irrelevant; only the execution itself matters here.
        backbone.engines[block_name](warmup_inputs, backbone.cuda_stream)


def teardown(pipe):
    """Release the engines and the CUDA stream attached to the pipe's backbone.

    Args:
        pipe: Pipeline from which ``get_model`` resolves the backbone that
            carries the ``engines`` dict and ``cuda_stream``.
    """
    backbone = get_model(pipe)
    # Emptying the dict is what drops the references to the engines. ``del``
    # on a loop variable only unbinds that local name and leaves every
    # engine alive inside ``backbone.engines``.
    backbone.engines.clear()

    cudart.cudaStreamDestroy(backbone.cuda_stream)
    del backbone.cuda_stream


def compile(pipe, onnx_path: Path, engine_path: Path, batch_size: int = 1):
    """Export, build, load and warm up TensorRT engines for a pipeline.

    Steps, in order: create both output directories, patch the backbone's
    forward methods, export the ONNX files, build the engine plans, load
    them, run one warm-up pass, then set ``use_trt_infer`` on the backbone.

    Args:
        pipe: Pipeline from which ``get_model`` resolves the backbone.
        onnx_path: Directory for the exported ONNX files (created if missing).
        engine_path: Directory for the engine plans (created if missing).
        batch_size: Batch multiplier forwarded to engine build, load and
            warm-up.
    """
    backbone = get_model(pipe)
    for out_dir in (onnx_path, engine_path):
        out_dir.mkdir(parents=True, exist_ok=True)

    replace_new_forward(backbone)
    export_onnx(backbone, onnx_path)

    backbone_cls = backbone.__class__
    complie2trt(backbone_cls, onnx_path, engine_path, batch_size)
    load_engines(backbone, engine_path, batch_size)
    warm_up(backbone, batch_size)

    backbone.use_trt_infer = True