Spaces:
Build error
Build error
/* | |
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | |
* SPDX-License-Identifier: Apache-2.0 | |
* | |
* Licensed under the Apache License, Version 2.0 (the "License"); | |
* you may not use this file except in compliance with the License. | |
* You may obtain a copy of the License at | |
* | |
* http://www.apache.org/licenses/LICENSE-2.0 | |
* | |
* Unless required by applicable law or agreed to in writing, software | |
* distributed under the License is distributed on an "AS IS" BASIS, | |
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
* See the License for the specific language governing permissions and | |
* limitations under the License. | |
*/ | |
/** @file dlss.cu | |
* @author Thomas Müller, NVIDIA | |
*/ | |
static_assert(false, "DLSS can only be compiled when both Vulkan and GUI support is enabled.") | |
// NGX's macro `NVSDK_NGX_FAILED` results in a change of sign, which does not affect correctness. | |
// Thus, suppress the corresponding warning. | |
namespace ngp { | |
extern std::atomic<size_t> g_total_n_bytes_allocated; | |
/// Checks the result of a vkXXXXXX call and throws an error on failure | |
std::string ngx_error_string(NVSDK_NGX_Result result) { | |
std::wstring wstr = GetNGXResultAsString(result); | |
std::wstring_convert<std::codecvt_utf8<wchar_t>, wchar_t> converter; | |
return converter.to_bytes(wstr); | |
}; | |
/// Checks the result of a NVSDK_NGX_XXXXXX call and throws an error on failure | |
static VKAPI_ATTR VkBool32 VKAPI_CALL vk_debug_callback( | |
VkDebugUtilsMessageSeverityFlagBitsEXT message_severity, | |
VkDebugUtilsMessageTypeFlagsEXT message_type, | |
const VkDebugUtilsMessengerCallbackDataEXT* callback_data, | |
void* user_data | |
) { | |
// Ignore json files that couldn't be found... third party tools sometimes install bogus layers | |
// that manifest as warnings like this. | |
if (std::string{callback_data->pMessage}.find("Failed to open JSON file") != std::string::npos) { | |
return VK_FALSE; | |
} | |
if (message_severity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT) { | |
tlog::warning() << "Vulkan error: " << callback_data->pMessage; | |
} else if (message_severity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT) { | |
tlog::warning() << "Vulkan: " << callback_data->pMessage; | |
} else { | |
tlog::info() << "Vulkan: " << callback_data->pMessage; | |
} | |
return VK_FALSE; | |
} | |
std::set<std::string> vk_supported_instance_layers() { | |
uint32_t count = 0; | |
VK_CHECK_THROW(vkEnumerateInstanceLayerProperties(&count, nullptr)); | |
std::vector<VkLayerProperties> layer_properties(count); | |
VK_CHECK_THROW(vkEnumerateInstanceLayerProperties(&count, layer_properties.data())); | |
std::set<std::string> layers; | |
for (auto& l : layer_properties) { | |
layers.insert(l.layerName); | |
} | |
return layers; | |
} | |
std::set<std::string> vk_supported_device_layers(VkPhysicalDevice device) { | |
uint32_t count = 0; | |
VK_CHECK_THROW(vkEnumerateDeviceLayerProperties(device, &count, nullptr)); | |
std::vector<VkLayerProperties> layer_properties(count); | |
VK_CHECK_THROW(vkEnumerateDeviceLayerProperties(device, &count, layer_properties.data())); | |
std::set<std::string> layers; | |
for (auto& l : layer_properties) { | |
layers.insert(l.layerName); | |
} | |
return layers; | |
} | |
std::set<std::string> vk_supported_instance_extensions(const char* layer_name) { | |
uint32_t count = 0; | |
VK_CHECK_THROW(vkEnumerateInstanceExtensionProperties(layer_name, &count, nullptr)); | |
std::vector<VkExtensionProperties> extension_properties(count); | |
VK_CHECK_THROW(vkEnumerateInstanceExtensionProperties(layer_name, &count, extension_properties.data())); | |
std::set<std::string> extensions; | |
for (auto& e : extension_properties) { | |
extensions.insert(e.extensionName); | |
} | |
return extensions; | |
} | |
std::set<std::string> vk_supported_device_extensions(VkPhysicalDevice device, const char* layer_name) { | |
uint32_t count = 0; | |
VK_CHECK_THROW(vkEnumerateDeviceExtensionProperties(device, layer_name, &count, nullptr)); | |
std::vector<VkExtensionProperties> extension_properties(count); | |
VK_CHECK_THROW(vkEnumerateDeviceExtensionProperties(device, layer_name, &count, extension_properties.data())); | |
std::set<std::string> extensions; | |
for (auto& e : extension_properties) { | |
extensions.insert(e.extensionName); | |
} | |
return extensions; | |
} | |
class VulkanAndNgx : public IDlssProvider, public std::enable_shared_from_this<VulkanAndNgx> { | |
public: | |
VulkanAndNgx() { | |
ScopeGuard cleanup_guard{[&]() { clear(); }}; | |
if (!glfwVulkanSupported()) { | |
throw std::runtime_error{"!glfwVulkanSupported()"}; | |
} | |
// ------------------------------- | |
// Vulkan Instance | |
// ------------------------------- | |
VkApplicationInfo app_info{}; | |
app_info.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO; | |
app_info.pApplicationName = "NGP"; | |
app_info.applicationVersion = VK_MAKE_VERSION(1, 0, 0); | |
app_info.pEngineName = "No engine"; | |
app_info.engineVersion = VK_MAKE_VERSION(1, 0, 0); | |
app_info.apiVersion = VK_API_VERSION_1_0; | |
VkInstanceCreateInfo instance_create_info = {}; | |
instance_create_info.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO; | |
instance_create_info.pApplicationInfo = &app_info; | |
std::vector<const char*> instance_extensions; | |
std::vector<const char*> device_extensions; | |
uint32_t n_ngx_instance_extensions = 0; | |
const char** ngx_instance_extensions; | |
uint32_t n_ngx_device_extensions = 0; | |
const char** ngx_device_extensions; | |
NVSDK_NGX_VULKAN_RequiredExtensions(&n_ngx_instance_extensions, &ngx_instance_extensions, &n_ngx_device_extensions, &ngx_device_extensions); | |
for (uint32_t i = 0; i < n_ngx_instance_extensions; ++i) { | |
instance_extensions.emplace_back(ngx_instance_extensions[i]); | |
} | |
instance_extensions.emplace_back(VK_KHR_DEVICE_GROUP_CREATION_EXTENSION_NAME); | |
instance_extensions.emplace_back(VK_KHR_EXTERNAL_FENCE_CAPABILITIES_EXTENSION_NAME); | |
instance_extensions.emplace_back(VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME); | |
instance_extensions.emplace_back(VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME); | |
auto supported_instance_layers = vk_supported_instance_layers(); | |
const char* validation_layer_name = "VK_LAYER_KHRONOS_validation"; | |
bool instance_validation_layer_enabled = supported_instance_layers.count(validation_layer_name) > 0; | |
if (!instance_validation_layer_enabled) { | |
tlog::warning() << "Vulkan instance validation layer is not available. Vulkan errors will be difficult to diagnose."; | |
} | |
std::vector<const char*> instance_layers; | |
if (instance_validation_layer_enabled) { | |
instance_layers.emplace_back(validation_layer_name); | |
} | |
instance_create_info.enabledLayerCount = static_cast<uint32_t>(instance_layers.size()); | |
instance_create_info.ppEnabledLayerNames = instance_layers.empty() ? nullptr : instance_layers.data(); | |
if (instance_validation_layer_enabled) { | |
instance_extensions.emplace_back(VK_EXT_DEBUG_UTILS_EXTENSION_NAME); | |
} | |
auto supported_instance_extensions = vk_supported_instance_extensions(nullptr); | |
for (const auto& e : instance_extensions) { | |
if (supported_instance_extensions.count(e) == 0) { | |
throw std::runtime_error{fmt::format("Required instance extension '{}' is not supported.", e)}; | |
} | |
} | |
instance_create_info.enabledExtensionCount = (uint32_t)instance_extensions.size(); | |
instance_create_info.ppEnabledExtensionNames = instance_extensions.data(); | |
VkDebugUtilsMessengerCreateInfoEXT debug_messenger_create_info = {}; | |
debug_messenger_create_info.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT; | |
debug_messenger_create_info.messageSeverity = VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT; | |
debug_messenger_create_info.messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT; | |
debug_messenger_create_info.pfnUserCallback = vk_debug_callback; | |
debug_messenger_create_info.pUserData = nullptr; | |
if (instance_validation_layer_enabled) { | |
instance_create_info.pNext = &debug_messenger_create_info; | |
} | |
VK_CHECK_THROW(vkCreateInstance(&instance_create_info, nullptr, &m_vk_instance)); | |
if (instance_validation_layer_enabled) { | |
auto CreateDebugUtilsMessengerEXT = [](VkInstance instance, const VkDebugUtilsMessengerCreateInfoEXT* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkDebugUtilsMessengerEXT* pDebugMessenger) { | |
auto func = (PFN_vkCreateDebugUtilsMessengerEXT)vkGetInstanceProcAddr(instance, "vkCreateDebugUtilsMessengerEXT"); | |
if (func != nullptr) { | |
return func(instance, pCreateInfo, pAllocator, pDebugMessenger); | |
} else { | |
return VK_ERROR_EXTENSION_NOT_PRESENT; | |
} | |
}; | |
if (CreateDebugUtilsMessengerEXT(m_vk_instance, &debug_messenger_create_info, nullptr, &m_vk_debug_messenger) != VK_SUCCESS) { | |
tlog::warning() << "Vulkan: could not initialize debug messenger."; | |
} | |
} | |
// ------------------------------- | |
// Vulkan Physical Device | |
// ------------------------------- | |
uint32_t n_devices = 0; | |
vkEnumeratePhysicalDevices(m_vk_instance, &n_devices, nullptr); | |
if (n_devices == 0) { | |
throw std::runtime_error{"Failed to find GPUs with Vulkan support."}; | |
} | |
std::vector<VkPhysicalDevice> devices(n_devices); | |
vkEnumeratePhysicalDevices(m_vk_instance, &n_devices, devices.data()); | |
struct QueueFamilyIndices { | |
int graphics_family = -1; | |
int compute_family = -1; | |
int transfer_family = -1; | |
int all_family = -1; | |
}; | |
auto find_queue_families = [](VkPhysicalDevice device) { | |
QueueFamilyIndices indices; | |
uint32_t queue_family_count = 0; | |
vkGetPhysicalDeviceQueueFamilyProperties(device, &queue_family_count, nullptr); | |
std::vector<VkQueueFamilyProperties> queue_families(queue_family_count); | |
vkGetPhysicalDeviceQueueFamilyProperties(device, &queue_family_count, queue_families.data()); | |
int i = 0; | |
for (const auto& queue_family : queue_families) { | |
if (queue_family.queueFlags & VK_QUEUE_GRAPHICS_BIT) { | |
indices.graphics_family = i; | |
} | |
if (queue_family.queueFlags & VK_QUEUE_COMPUTE_BIT) { | |
indices.compute_family = i; | |
} | |
if (queue_family.queueFlags & VK_QUEUE_TRANSFER_BIT) { | |
indices.transfer_family = i; | |
} | |
if ((queue_family.queueFlags & VK_QUEUE_GRAPHICS_BIT) && (queue_family.queueFlags & VK_QUEUE_COMPUTE_BIT) && (queue_family.queueFlags & VK_QUEUE_TRANSFER_BIT)) { | |
indices.all_family = i; | |
} | |
i++; | |
} | |
return indices; | |
}; | |
cudaDeviceProp cuda_device_prop; | |
CUDA_CHECK_THROW(cudaGetDeviceProperties(&cuda_device_prop, cuda_device())); | |
auto is_same_as_cuda_device = [&](VkPhysicalDevice device) { | |
VkPhysicalDeviceIDProperties physical_device_id_properties = {}; | |
physical_device_id_properties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES; | |
physical_device_id_properties.pNext = NULL; | |
VkPhysicalDeviceProperties2 physical_device_properties = {}; | |
physical_device_properties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2; | |
physical_device_properties.pNext = &physical_device_id_properties; | |
vkGetPhysicalDeviceProperties2(device, &physical_device_properties); | |
return !memcmp(&cuda_device_prop.uuid, physical_device_id_properties.deviceUUID, VK_UUID_SIZE) && find_queue_families(device).all_family >= 0; | |
}; | |
uint32_t device_id = 0; | |
for (uint32_t i = 0; i < n_devices; ++i) { | |
if (is_same_as_cuda_device(devices[i])) { | |
m_vk_physical_device = devices[i]; | |
device_id = i; | |
break; | |
} | |
} | |
if (m_vk_physical_device == VK_NULL_HANDLE) { | |
throw std::runtime_error{"Failed to find Vulkan device corresponding to CUDA device."}; | |
} | |
for (uint32_t i = 0; i < n_ngx_device_extensions; ++i) { | |
device_extensions.emplace_back(ngx_device_extensions[i]); | |
} | |
device_extensions.emplace_back(VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME); | |
device_extensions.emplace_back(VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME); | |
device_extensions.emplace_back(VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME); | |
device_extensions.emplace_back(VK_KHR_DEVICE_GROUP_EXTENSION_NAME); | |
auto supported_device_extensions = vk_supported_device_extensions(m_vk_physical_device, nullptr); | |
for (const auto& e : device_extensions) { | |
if (supported_device_extensions.count(e) == 0) { | |
throw std::runtime_error{fmt::format("Required device extension '{}' is not supported.", e)}; | |
} | |
} | |
// ------------------------------- | |
// Vulkan Logical Device | |
// ------------------------------- | |
VkPhysicalDeviceProperties physical_device_properties; | |
vkGetPhysicalDeviceProperties(m_vk_physical_device, &physical_device_properties); | |
QueueFamilyIndices indices = find_queue_families(m_vk_physical_device); | |
VkDeviceQueueCreateInfo queue_create_info{}; | |
queue_create_info.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; | |
queue_create_info.queueFamilyIndex = indices.all_family; | |
queue_create_info.queueCount = 1; | |
float queue_priority = 1.0f; | |
queue_create_info.pQueuePriorities = &queue_priority; | |
VkPhysicalDeviceFeatures device_features = {}; | |
device_features.shaderStorageImageWriteWithoutFormat = true; | |
VkDeviceCreateInfo device_create_info = {}; | |
device_create_info.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO; | |
device_create_info.pQueueCreateInfos = &queue_create_info; | |
device_create_info.queueCreateInfoCount = 1; | |
device_create_info.pEnabledFeatures = &device_features; | |
device_create_info.enabledExtensionCount = (uint32_t)device_extensions.size(); | |
device_create_info.ppEnabledExtensionNames = device_extensions.data(); | |
VkPhysicalDeviceBufferDeviceAddressFeaturesEXT buffer_device_address_feature = {}; | |
buffer_device_address_feature.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES_EXT; | |
buffer_device_address_feature.bufferDeviceAddress = VK_TRUE; | |
device_create_info.pNext = &buffer_device_address_feature; | |
throw std::runtime_error{"Buffer device address extension not available."}; | |
VK_CHECK_THROW(vkCreateDevice(m_vk_physical_device, &device_create_info, nullptr, &m_vk_device)); | |
// ----------------------------------------------- | |
// Vulkan queue / command pool / command buffer | |
// ----------------------------------------------- | |
vkGetDeviceQueue(m_vk_device, indices.all_family, 0, &m_vk_queue); | |
VkCommandPoolCreateInfo command_pool_info = {}; | |
command_pool_info.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; | |
command_pool_info.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT; | |
command_pool_info.queueFamilyIndex = indices.all_family; | |
VK_CHECK_THROW(vkCreateCommandPool(m_vk_device, &command_pool_info, nullptr, &m_vk_command_pool)); | |
VkCommandBufferAllocateInfo command_buffer_alloc_info = {}; | |
command_buffer_alloc_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; | |
command_buffer_alloc_info.commandPool = m_vk_command_pool; | |
command_buffer_alloc_info.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; | |
command_buffer_alloc_info.commandBufferCount = 1; | |
VK_CHECK_THROW(vkAllocateCommandBuffers(m_vk_device, &command_buffer_alloc_info, &m_vk_command_buffer)); | |
// ------------------------------- | |
// NGX init | |
// ------------------------------- | |
std::wstring path; | |
path = fs::path::getcwd().wstr(); | |
std::string tmp = fs::path::getcwd().str(); | |
std::wstring_convert<std::codecvt_utf8<wchar_t>, wchar_t> converter; | |
path = converter.from_bytes(tmp); | |
NGX_CHECK_THROW(NVSDK_NGX_VULKAN_Init_with_ProjectID("ea75345e-5a42-4037-a5c9-59bf94dee157", NVSDK_NGX_ENGINE_TYPE_CUSTOM, "1.0.0", path.c_str(), m_vk_instance, m_vk_physical_device, m_vk_device)); | |
m_ngx_initialized = true; | |
// ------------------------------- | |
// Ensure DLSS capability | |
// ------------------------------- | |
NGX_CHECK_THROW(NVSDK_NGX_VULKAN_GetCapabilityParameters(&m_ngx_parameters)); | |
int needs_updated_driver = 0; | |
unsigned int min_driver_version_major = 0; | |
unsigned int min_driver_version_minor = 0; | |
NVSDK_NGX_Result result_updated_driver = m_ngx_parameters->Get(NVSDK_NGX_Parameter_SuperSampling_NeedsUpdatedDriver, &needs_updated_driver); | |
NVSDK_NGX_Result result_min_driver_version_major = m_ngx_parameters->Get(NVSDK_NGX_Parameter_SuperSampling_MinDriverVersionMajor, &min_driver_version_major); | |
NVSDK_NGX_Result result_min_driver_version_minor = m_ngx_parameters->Get(NVSDK_NGX_Parameter_SuperSampling_MinDriverVersionMinor, &min_driver_version_minor); | |
if (result_updated_driver == NVSDK_NGX_Result_Success && result_min_driver_version_major == NVSDK_NGX_Result_Success && result_min_driver_version_minor == NVSDK_NGX_Result_Success) { | |
if (needs_updated_driver) { | |
throw std::runtime_error{fmt::format("Driver too old. Minimum version required is {}.{}", min_driver_version_major, min_driver_version_minor)}; | |
} | |
} | |
int dlss_available = 0; | |
NVSDK_NGX_Result ngx_result = m_ngx_parameters->Get(NVSDK_NGX_Parameter_SuperSampling_Available, &dlss_available); | |
if (ngx_result != NVSDK_NGX_Result_Success || !dlss_available) { | |
ngx_result = NVSDK_NGX_Result_Fail; | |
NVSDK_NGX_Parameter_GetI(m_ngx_parameters, NVSDK_NGX_Parameter_SuperSampling_FeatureInitResult, (int*)&ngx_result); | |
throw std::runtime_error{fmt::format("DLSS not available: {}", ngx_error_string(ngx_result))}; | |
} | |
cleanup_guard.disarm(); | |
tlog::success() << "Initialized Vulkan and NGX on GPU #" << device_id << ": " << physical_device_properties.deviceName; | |
} | |
virtual ~VulkanAndNgx() { | |
clear(); | |
} | |
void clear() { | |
if (m_ngx_parameters) { | |
NVSDK_NGX_VULKAN_DestroyParameters(m_ngx_parameters); | |
m_ngx_parameters = nullptr; | |
} | |
if (m_ngx_initialized) { | |
NVSDK_NGX_VULKAN_Shutdown(); | |
m_ngx_initialized = false; | |
} | |
if (m_vk_command_pool) { | |
vkDestroyCommandPool(m_vk_device, m_vk_command_pool, nullptr); | |
m_vk_command_pool = VK_NULL_HANDLE; | |
} | |
if (m_vk_device) { | |
vkDestroyDevice(m_vk_device, nullptr); | |
m_vk_device = VK_NULL_HANDLE; | |
} | |
if (m_vk_debug_messenger) { | |
auto DestroyDebugUtilsMessengerEXT = [](VkInstance instance, VkDebugUtilsMessengerEXT debugMessenger, const VkAllocationCallbacks* pAllocator) { | |
auto func = (PFN_vkDestroyDebugUtilsMessengerEXT)vkGetInstanceProcAddr(instance, "vkDestroyDebugUtilsMessengerEXT"); | |
if (func != nullptr) { | |
func(instance, debugMessenger, pAllocator); | |
} | |
}; | |
DestroyDebugUtilsMessengerEXT(m_vk_instance, m_vk_debug_messenger, nullptr); | |
m_vk_debug_messenger = VK_NULL_HANDLE; | |
} | |
if (m_vk_instance) { | |
vkDestroyInstance(m_vk_instance, nullptr); | |
m_vk_instance = VK_NULL_HANDLE; | |
} | |
} | |
uint32_t vk_find_memory_type(uint32_t type_filter, VkMemoryPropertyFlags properties) { | |
VkPhysicalDeviceMemoryProperties mem_properties; | |
vkGetPhysicalDeviceMemoryProperties(m_vk_physical_device, &mem_properties); | |
for (uint32_t i = 0; i < mem_properties.memoryTypeCount; i++) { | |
if (type_filter & (1 << i) && (mem_properties.memoryTypes[i].propertyFlags & properties) == properties) { | |
return i; | |
} | |
} | |
throw std::runtime_error{"Failed to find suitable memory type."}; | |
} | |
void vk_command_buffer_begin() { | |
VkCommandBufferBeginInfo begin_info = {}; | |
begin_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; | |
begin_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; | |
begin_info.pInheritanceInfo = nullptr; | |
VK_CHECK_THROW(vkBeginCommandBuffer(m_vk_command_buffer, &begin_info)); | |
} | |
void vk_command_buffer_end() { | |
VK_CHECK_THROW(vkEndCommandBuffer(m_vk_command_buffer)); | |
} | |
void vk_command_buffer_submit() { | |
VkSubmitInfo submit_info = { VK_STRUCTURE_TYPE_SUBMIT_INFO }; | |
submit_info.commandBufferCount = 1; | |
submit_info.pCommandBuffers = &m_vk_command_buffer; | |
VK_CHECK_THROW(vkQueueSubmit(m_vk_queue, 1, &submit_info, VK_NULL_HANDLE)); | |
} | |
void vk_synchronize() { | |
VK_CHECK_THROW(vkDeviceWaitIdle(m_vk_device)); | |
} | |
void vk_command_buffer_submit_sync() { | |
vk_command_buffer_submit(); | |
vk_synchronize(); | |
} | |
void vk_command_buffer_end_and_submit_sync() { | |
vk_command_buffer_end(); | |
vk_command_buffer_submit_sync(); | |
} | |
const VkCommandBuffer& vk_command_buffer() const { | |
return m_vk_command_buffer; | |
} | |
const VkDevice& vk_device() const { | |
return m_vk_device; | |
} | |
NVSDK_NGX_Parameter* ngx_parameters() const { | |
return m_ngx_parameters; | |
} | |
size_t allocated_bytes() const override { | |
unsigned long long allocated_bytes = 0; | |
if (!m_ngx_parameters) { | |
return 0; | |
} | |
try { | |
NGX_CHECK_THROW(NGX_DLSS_GET_STATS(m_ngx_parameters, &allocated_bytes)); | |
} catch (...) { | |
return 0; | |
} | |
return allocated_bytes; | |
} | |
std::unique_ptr<IDlss> init_dlss(const ivec2& out_resolution) override; | |
private: | |
VkInstance m_vk_instance = VK_NULL_HANDLE; | |
VkDebugUtilsMessengerEXT m_vk_debug_messenger = VK_NULL_HANDLE; | |
VkPhysicalDevice m_vk_physical_device = VK_NULL_HANDLE; | |
VkDevice m_vk_device = VK_NULL_HANDLE; | |
VkQueue m_vk_queue = VK_NULL_HANDLE; | |
VkCommandPool m_vk_command_pool = VK_NULL_HANDLE; | |
VkCommandBuffer m_vk_command_buffer = VK_NULL_HANDLE; | |
NVSDK_NGX_Parameter* m_ngx_parameters = nullptr; | |
bool m_ngx_initialized = false; | |
}; | |
std::shared_ptr<IDlssProvider> init_vulkan_and_ngx() { | |
return std::make_shared<VulkanAndNgx>(); | |
} | |
class VulkanTexture { | |
public: | |
VulkanTexture(std::shared_ptr<VulkanAndNgx> vk, const ivec2& size, uint32_t n_channels) : m_vk{vk}, m_size{size}, m_n_channels{n_channels} { | |
ScopeGuard cleanup_guard{[&]() { clear(); }}; | |
VkImageCreateInfo image_info{}; | |
image_info.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO; | |
image_info.imageType = VK_IMAGE_TYPE_2D; | |
image_info.extent.width = static_cast<uint32_t>(m_size.x); | |
image_info.extent.height = static_cast<uint32_t>(m_size.y); | |
image_info.extent.depth = 1; | |
image_info.mipLevels = 1; | |
image_info.arrayLayers = 1; | |
switch (n_channels) { | |
case 1: image_info.format = VK_FORMAT_R32_SFLOAT; break; | |
case 2: image_info.format = VK_FORMAT_R32G32_SFLOAT; break; | |
case 3: image_info.format = VK_FORMAT_R32G32B32_SFLOAT; break; | |
case 4: image_info.format = VK_FORMAT_R32G32B32A32_SFLOAT; break; | |
default: throw std::runtime_error{"VulkanTexture only supports 1, 2, 3, or 4 channels."}; | |
} | |
image_info.tiling = VK_IMAGE_TILING_OPTIMAL; | |
image_info.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; | |
image_info.usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT; | |
image_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; | |
image_info.samples = VK_SAMPLE_COUNT_1_BIT; | |
image_info.flags = 0; | |
VkExternalMemoryImageCreateInfoKHR ext_image_info = {}; | |
ext_image_info.sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO_KHR; | |
ext_image_info.handleTypes |= VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR; | |
ext_image_info.handleTypes |= VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR; | |
image_info.pNext = &ext_image_info; | |
VK_CHECK_THROW(vkCreateImage(m_vk->vk_device(), &image_info, nullptr, &m_vk_image)); | |
// Create device memory to back up the image | |
VkMemoryRequirements mem_requirements = {}; | |
vkGetImageMemoryRequirements(m_vk->vk_device(), m_vk_image, &mem_requirements); | |
VkMemoryAllocateInfo mem_alloc_info = {}; | |
mem_alloc_info.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; | |
mem_alloc_info.allocationSize = mem_requirements.size; | |
mem_alloc_info.memoryTypeIndex = m_vk->vk_find_memory_type(mem_requirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); | |
VkExportMemoryAllocateInfoKHR export_info = {}; | |
export_info.sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_KHR; | |
export_info.handleTypes = ext_image_info.handleTypes; | |
mem_alloc_info.pNext = &export_info; | |
VK_CHECK_THROW(vkAllocateMemory(m_vk->vk_device(), &mem_alloc_info, nullptr, &m_vk_device_memory)); | |
VK_CHECK_THROW(vkBindImageMemory(m_vk->vk_device(), m_vk_image, m_vk_device_memory, 0)); | |
m_vk->vk_command_buffer_begin(); | |
VkImageMemoryBarrier barrier = {}; | |
barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; | |
barrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; | |
barrier.newLayout = VK_IMAGE_LAYOUT_GENERAL; | |
barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; | |
barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; | |
barrier.image = m_vk_image; | |
barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; | |
barrier.subresourceRange.baseMipLevel = 0; | |
barrier.subresourceRange.levelCount = 1; | |
barrier.subresourceRange.baseArrayLayer = 0; | |
barrier.subresourceRange.layerCount = 1; | |
barrier.srcAccessMask = 0; | |
barrier.dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT | VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; | |
vkCmdPipelineBarrier( | |
m_vk->vk_command_buffer(), | |
VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, | |
0, | |
0, nullptr, | |
0, nullptr, | |
1, &barrier | |
); | |
m_vk->vk_command_buffer_end_and_submit_sync(); | |
// Image view | |
VkImageViewCreateInfo view_info = {}; | |
view_info.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; | |
view_info.image = m_vk_image; | |
view_info.viewType = VK_IMAGE_VIEW_TYPE_2D; | |
view_info.format = image_info.format; | |
view_info.subresourceRange = barrier.subresourceRange; | |
VK_CHECK_THROW(vkCreateImageView(m_vk->vk_device(), &view_info, nullptr, &m_vk_image_view)); | |
// Map to NGX | |
m_ngx_resource = NVSDK_NGX_Create_ImageView_Resource_VK(m_vk_image_view, m_vk_image, view_info.subresourceRange, image_info.format, m_size.x, m_size.y, true); | |
// Map to CUDA memory: VkDeviceMemory->FD/HANDLE->cudaExternalMemory->CUDA pointer | |
HANDLE handle = nullptr; | |
VkMemoryGetWin32HandleInfoKHR handle_info = {}; | |
handle_info.sType = VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR; | |
handle_info.memory = m_vk_device_memory; | |
handle_info.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT; | |
auto pfn_vkGetMemory = (PFN_vkGetMemoryWin32HandleKHR)vkGetDeviceProcAddr(m_vk->vk_device(), "vkGetMemoryWin32HandleKHR"); | |
int handle = -1; | |
VkMemoryGetFdInfoKHR handle_info = {}; | |
handle_info.sType = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR; | |
handle_info.memory = m_vk_device_memory; | |
handle_info.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR; | |
auto pfn_vkGetMemory = (PFN_vkGetMemoryFdKHR)vkGetDeviceProcAddr(m_vk->vk_device(), "vkGetMemoryFdKHR"); | |
if (!pfn_vkGetMemory) { | |
throw std::runtime_error{"Failed to locate pfn_vkGetMemory."}; | |
} | |
VK_CHECK_THROW(pfn_vkGetMemory(m_vk->vk_device(), &handle_info, &handle)); | |
// Map handle to CUDA memory | |
cudaExternalMemoryHandleDesc external_memory_handle_desc = {}; | |
memset(&external_memory_handle_desc, 0, sizeof(external_memory_handle_desc)); | |
external_memory_handle_desc.type = cudaExternalMemoryHandleTypeOpaqueWin32; | |
external_memory_handle_desc.handle.win32.handle = handle; | |
external_memory_handle_desc.type = cudaExternalMemoryHandleTypeOpaqueFd; | |
external_memory_handle_desc.handle.fd = handle; | |
external_memory_handle_desc.size = mem_requirements.size; | |
CUDA_CHECK_THROW(cudaImportExternalMemory(&m_cuda_external_memory, &external_memory_handle_desc)); | |
cudaExternalMemoryBufferDesc external_memory_buffer_desc = {}; | |
memset(&external_memory_buffer_desc, 0, sizeof(external_memory_buffer_desc)); | |
external_memory_buffer_desc.offset = 0; | |
external_memory_buffer_desc.size = mem_requirements.size; | |
void* ptr; | |
CUDA_CHECK_THROW(cudaExternalMemoryGetMappedBuffer(&ptr, m_cuda_external_memory, &external_memory_buffer_desc)); | |
m_cuda_data = (float*)ptr; | |
// ---------------- | |
// Also get a surface object array, as the above buffer might be too cumbersome to deal with | |
// ---------------- | |
cudaExternalMemoryMipmappedArrayDesc external_memory_mipmapped_array_desc = {}; | |
memset(&external_memory_mipmapped_array_desc, 0, sizeof(external_memory_mipmapped_array_desc)); | |
cudaChannelFormatDesc channel_format = {}; | |
channel_format.f = cudaChannelFormatKindFloat; | |
switch (n_channels) { | |
case 1: channel_format.x = 32; channel_format.y = 0; channel_format.z = 0; channel_format.w = 0; break; | |
case 2: channel_format.x = 32; channel_format.y = 32; channel_format.z = 0; channel_format.w = 0; break; | |
case 3: channel_format.x = 32; channel_format.y = 32; channel_format.z = 32; channel_format.w = 0; break; | |
case 4: channel_format.x = 32; channel_format.y = 32; channel_format.z = 32; channel_format.w = 32; break; | |
default: throw std::runtime_error{"VulkanTexture only supports 1, 2, 3, or 4 channels."}; | |
} | |
cudaExtent extent = {}; | |
extent.width = m_size.x; | |
extent.height = m_size.y; | |
extent.depth = 0; | |
external_memory_mipmapped_array_desc.offset = 0; | |
external_memory_mipmapped_array_desc.formatDesc = channel_format; | |
external_memory_mipmapped_array_desc.extent = extent; | |
external_memory_mipmapped_array_desc.flags = cudaArraySurfaceLoadStore; | |
external_memory_mipmapped_array_desc.numLevels = 1; | |
cudaExternalMemoryGetMappedMipmappedArray(&m_cuda_mipmapped_array, m_cuda_external_memory, &external_memory_mipmapped_array_desc); | |
cudaArray_t first_level_array; | |
CUDA_CHECK_THROW(cudaGetMipmappedArrayLevel(&first_level_array, m_cuda_mipmapped_array, 0)); | |
struct cudaResourceDesc resource_desc; | |
memset(&resource_desc, 0, sizeof(resource_desc)); | |
resource_desc.resType = cudaResourceTypeArray; | |
resource_desc.res.array.array = first_level_array; | |
CUDA_CHECK_THROW(cudaCreateSurfaceObject(&m_cuda_surface_object, &resource_desc)); | |
m_n_bytes = mem_requirements.size; | |
g_total_n_bytes_allocated += m_n_bytes; | |
cleanup_guard.disarm(); | |
} | |
virtual ~VulkanTexture() { | |
clear(); | |
} | |
void clear() { | |
g_total_n_bytes_allocated -= m_n_bytes; | |
if (m_cuda_data) { | |
cudaFree(m_cuda_data); | |
m_cuda_data = nullptr; | |
} | |
if (m_cuda_surface_object) { | |
cudaDestroySurfaceObject(m_cuda_surface_object); | |
m_cuda_surface_object = {}; | |
} | |
if (m_cuda_mipmapped_array) { | |
cudaFreeMipmappedArray(m_cuda_mipmapped_array); | |
m_cuda_mipmapped_array = {}; | |
} | |
if (m_cuda_external_memory) { | |
cudaDestroyExternalMemory(m_cuda_external_memory); | |
m_cuda_external_memory = {}; | |
} | |
if (m_vk_image_view) { | |
vkDestroyImageView(m_vk->vk_device(), m_vk_image_view, nullptr); | |
m_vk_image_view = {}; | |
} | |
if (m_vk_image) { | |
vkDestroyImage(m_vk->vk_device(), m_vk_image, nullptr); | |
m_vk_image = {}; | |
} | |
if (m_vk_device_memory) { | |
vkFreeMemory(m_vk->vk_device(), m_vk_device_memory, nullptr); | |
m_vk_device_memory = {}; | |
} | |
} | |
float* data() { | |
return m_cuda_data; | |
} | |
cudaSurfaceObject_t surface() { | |
return m_cuda_surface_object; | |
} | |
NVSDK_NGX_Resource_VK& ngx_resource() { | |
return m_ngx_resource; | |
} | |
size_t bytes() const { | |
return m_size.x * (size_t)m_size.y * sizeof(float) * m_n_channels; | |
} | |
ivec2 size() const { | |
return m_size; | |
} | |
private: | |
std::shared_ptr<VulkanAndNgx> m_vk; | |
ivec2 m_size; | |
uint32_t m_n_channels; | |
size_t m_n_bytes = 0; | |
VkImage m_vk_image = {}; | |
VkImageView m_vk_image_view = {}; | |
VkDeviceMemory m_vk_device_memory = {}; | |
cudaExternalMemory_t m_cuda_external_memory = {}; | |
cudaMipmappedArray_t m_cuda_mipmapped_array = {}; | |
cudaSurfaceObject_t m_cuda_surface_object = {}; | |
float* m_cuda_data = nullptr; | |
NVSDK_NGX_Resource_VK m_ngx_resource = {}; | |
}; | |
NVSDK_NGX_PerfQuality_Value ngx_dlss_quality(EDlssQuality quality) { | |
switch (quality) { | |
case EDlssQuality::UltraPerformance: return NVSDK_NGX_PerfQuality_Value_UltraPerformance; | |
case EDlssQuality::MaxPerformance: return NVSDK_NGX_PerfQuality_Value_MaxPerf; | |
case EDlssQuality::Balanced: return NVSDK_NGX_PerfQuality_Value_Balanced; | |
case EDlssQuality::MaxQuality: return NVSDK_NGX_PerfQuality_Value_MaxQuality; | |
case EDlssQuality::UltraQuality: return NVSDK_NGX_PerfQuality_Value_UltraQuality; | |
default: throw std::runtime_error{"Unknown DLSS quality setting."}; | |
} | |
} | |
struct DlssFeatureSpecs { | |
EDlssQuality quality; | |
ivec2 out_resolution; | |
ivec2 optimal_in_resolution; | |
ivec2 min_in_resolution; | |
ivec2 max_in_resolution; | |
float optimal_sharpness; | |
float distance(const ivec2& resolution) const { | |
return length(vec2(max(max(min_in_resolution - resolution, resolution - max_in_resolution), ivec2(0)))); | |
} | |
ivec2 clamp_resolution(const ivec2& resolution) const { | |
return clamp(resolution, min_in_resolution, max_in_resolution); | |
} | |
}; | |
DlssFeatureSpecs dlss_feature_specs(NVSDK_NGX_Parameter* ngx_parameters, const ivec2& out_resolution, EDlssQuality quality) { | |
DlssFeatureSpecs specs; | |
specs.quality = quality; | |
specs.out_resolution = out_resolution; | |
NGX_CHECK_THROW(NGX_DLSS_GET_OPTIMAL_SETTINGS( | |
ngx_parameters, | |
specs.out_resolution.x, specs.out_resolution.y, | |
ngx_dlss_quality(quality), | |
(uint32_t*)&specs.optimal_in_resolution.x, (uint32_t*)&specs.optimal_in_resolution.y, | |
(uint32_t*)&specs.max_in_resolution.x, (uint32_t*)&specs.max_in_resolution.y, | |
(uint32_t*)&specs.min_in_resolution.x, (uint32_t*)&specs.min_in_resolution.y, | |
&specs.optimal_sharpness | |
)); | |
// Don't permit input resolutions larger than the output. (Just in case DLSS allows it.) | |
specs.optimal_in_resolution = min(specs.optimal_in_resolution, out_resolution); | |
specs.max_in_resolution = min(specs.max_in_resolution, out_resolution); | |
specs.min_in_resolution = min(specs.min_in_resolution, out_resolution); | |
return specs; | |
} | |
class DlssFeature { | |
public: | |
DlssFeature(std::shared_ptr<VulkanAndNgx> vk_and_ngx, const DlssFeatureSpecs& specs, bool is_hdr, bool sharpen) : m_vk_and_ngx{vk_and_ngx}, m_specs{specs}, m_is_hdr{is_hdr}, m_sharpen{sharpen} { | |
// Initialize DLSS | |
unsigned int creation_node_mask = 1; | |
unsigned int visibility_node_mask = 1; | |
int dlss_create_feature_flags = NVSDK_NGX_DLSS_Feature_Flags_None; | |
dlss_create_feature_flags |= true ? NVSDK_NGX_DLSS_Feature_Flags_MVLowRes : 0; | |
dlss_create_feature_flags |= false ? NVSDK_NGX_DLSS_Feature_Flags_MVJittered : 0; | |
dlss_create_feature_flags |= is_hdr ? NVSDK_NGX_DLSS_Feature_Flags_IsHDR : 0; | |
dlss_create_feature_flags |= true ? NVSDK_NGX_DLSS_Feature_Flags_DepthInverted : 0; | |
dlss_create_feature_flags |= sharpen ? NVSDK_NGX_DLSS_Feature_Flags_DoSharpening : 0; | |
dlss_create_feature_flags |= false ? NVSDK_NGX_DLSS_Feature_Flags_AutoExposure : 0; | |
NVSDK_NGX_DLSS_Create_Params dlss_create_params; | |
memset(&dlss_create_params, 0, sizeof(dlss_create_params)); | |
dlss_create_params.Feature.InWidth = m_specs.optimal_in_resolution.x; | |
dlss_create_params.Feature.InHeight = m_specs.optimal_in_resolution.y; | |
dlss_create_params.Feature.InTargetWidth = m_specs.out_resolution.x; | |
dlss_create_params.Feature.InTargetHeight = m_specs.out_resolution.y; | |
dlss_create_params.Feature.InPerfQualityValue = ngx_dlss_quality(m_specs.quality); | |
dlss_create_params.InFeatureCreateFlags = dlss_create_feature_flags; | |
{ | |
m_vk_and_ngx->vk_command_buffer_begin(); | |
ScopeGuard command_buffer_guard{[&]() { m_vk_and_ngx->vk_command_buffer_end_and_submit_sync(); }}; | |
NGX_CHECK_THROW(NGX_VULKAN_CREATE_DLSS_EXT(m_vk_and_ngx->vk_command_buffer(), creation_node_mask, visibility_node_mask, &m_ngx_dlss, m_vk_and_ngx->ngx_parameters(), &dlss_create_params)); | |
} | |
} | |
DlssFeature(std::shared_ptr<VulkanAndNgx> vk_and_ngx, const ivec2& out_resolution, bool is_hdr, bool sharpen, EDlssQuality quality) | |
: DlssFeature{vk_and_ngx, dlss_feature_specs(vk_and_ngx->ngx_parameters(), out_resolution, quality), is_hdr, sharpen} {} | |
~DlssFeature() { | |
cudaDeviceSynchronize(); | |
if (m_ngx_dlss) { | |
NVSDK_NGX_VULKAN_ReleaseFeature(m_ngx_dlss); | |
} | |
m_vk_and_ngx->vk_synchronize(); | |
} | |
void run( | |
const ivec2& in_resolution, | |
const vec2& jitter_offset, | |
float sharpening, | |
bool shall_reset, | |
NVSDK_NGX_Resource_VK& frame, | |
NVSDK_NGX_Resource_VK& depth, | |
NVSDK_NGX_Resource_VK& mvec, | |
NVSDK_NGX_Resource_VK& exposure, | |
NVSDK_NGX_Resource_VK& output | |
) { | |
if (!m_sharpen && sharpening != 0.0f) { | |
throw std::runtime_error{"May only specify non-zero sharpening, when DlssFeature has been created with sharpen option."}; | |
} | |
m_vk_and_ngx->vk_command_buffer_begin(); | |
NVSDK_NGX_VK_DLSS_Eval_Params dlss_params; | |
memset(&dlss_params, 0, sizeof(dlss_params)); | |
dlss_params.Feature.pInColor = &frame; | |
dlss_params.Feature.pInOutput = &output; | |
dlss_params.pInDepth = &depth; | |
dlss_params.pInMotionVectors = &mvec; | |
dlss_params.pInExposureTexture = &exposure; | |
dlss_params.InJitterOffsetX = jitter_offset.x; | |
dlss_params.InJitterOffsetY = jitter_offset.y; | |
dlss_params.Feature.InSharpness = sharpening; | |
dlss_params.InReset = shall_reset; | |
dlss_params.InMVScaleX = 1.0f; | |
dlss_params.InMVScaleY = 1.0f; | |
dlss_params.InRenderSubrectDimensions = {(uint32_t)in_resolution.x, (uint32_t)in_resolution.y}; | |
NGX_CHECK_THROW(NGX_VULKAN_EVALUATE_DLSS_EXT(m_vk_and_ngx->vk_command_buffer(), m_ngx_dlss, m_vk_and_ngx->ngx_parameters(), &dlss_params)); | |
m_vk_and_ngx->vk_command_buffer_end_and_submit_sync(); | |
} | |
bool is_hdr() const { | |
return m_is_hdr; | |
} | |
bool sharpen() const { | |
return m_sharpen; | |
} | |
EDlssQuality quality() const { | |
return m_specs.quality; | |
} | |
ivec2 out_resolution() const { | |
return m_specs.out_resolution; | |
} | |
ivec2 clamp_resolution(const ivec2& resolution) const { | |
return m_specs.clamp_resolution(resolution); | |
} | |
ivec2 optimal_in_resolution() const { | |
return m_specs.optimal_in_resolution; | |
} | |
private: | |
std::shared_ptr<VulkanAndNgx> m_vk_and_ngx; | |
NVSDK_NGX_Handle* m_ngx_dlss = {}; | |
DlssFeatureSpecs m_specs; | |
bool m_is_hdr; | |
bool m_sharpen; | |
}; | |
class Dlss : public IDlss { | |
public: | |
Dlss(std::shared_ptr<VulkanAndNgx> vk_and_ngx, const ivec2& max_out_resolution) | |
: | |
m_vk_and_ngx{vk_and_ngx}, | |
m_max_out_resolution{max_out_resolution}, | |
// Allocate all buffers at output resolution and use dynamic sub-rects | |
// to use subsets of them. This avoids re-allocations when using DLSS | |
// with dynamically changing input resolution. | |
m_frame_buffer{m_vk_and_ngx, max_out_resolution, 4}, | |
m_depth_buffer{m_vk_and_ngx, max_out_resolution, 1}, | |
m_mvec_buffer{m_vk_and_ngx, max_out_resolution, 2}, | |
m_exposure_buffer{m_vk_and_ngx, {1, 1}, 1}, | |
m_output_buffer{m_vk_and_ngx, max_out_resolution, 4} | |
{ | |
// Various quality modes of DLSS | |
for (int i = 0; i < (int)EDlssQuality::NumDlssQualitySettings; ++i) { | |
try { | |
auto specs = dlss_feature_specs(m_vk_and_ngx->ngx_parameters(), max_out_resolution, (EDlssQuality)i); | |
// Only emplace the specs if the feature can be created in practice! | |
DlssFeature{m_vk_and_ngx, specs, true, true}; | |
DlssFeature{m_vk_and_ngx, specs, true, false}; | |
DlssFeature{m_vk_and_ngx, specs, false, true}; | |
DlssFeature{m_vk_and_ngx, specs, false, false}; | |
m_dlss_specs.emplace_back(specs); | |
} catch (...) {} | |
} | |
// For super insane performance requirements (more than 3x upscaling) try UltraPerformance | |
// with reduced output resolutions for 4.5x, 6x, 9x. | |
std::vector<ivec2> reduced_out_resolutions = { | |
max_out_resolution / 3 * 2, | |
max_out_resolution / 2, | |
max_out_resolution / 3, | |
// max_out_resolution / 4, | |
}; | |
for (const auto& out_resolution : reduced_out_resolutions) { | |
try { | |
auto specs = dlss_feature_specs(m_vk_and_ngx->ngx_parameters(), out_resolution, EDlssQuality::UltraPerformance); | |
// Only emplace the specs if the feature can be created in practice! | |
DlssFeature{m_vk_and_ngx, specs, true, true}; | |
DlssFeature{m_vk_and_ngx, specs, true, false}; | |
DlssFeature{m_vk_and_ngx, specs, false, true}; | |
DlssFeature{m_vk_and_ngx, specs, false, false}; | |
m_dlss_specs.emplace_back(specs); | |
} catch (...) {} | |
} | |
} | |
virtual ~Dlss() { | |
// Destroy DLSS feature prior to killing underlying buffers. | |
m_dlss_feature = nullptr; | |
} | |
void update_feature(const ivec2& in_resolution, bool is_hdr, bool sharpen) override { | |
CUDA_CHECK_THROW(cudaDeviceSynchronize()); | |
DlssFeatureSpecs specs; | |
bool found = false; | |
for (const auto& s : m_dlss_specs) { | |
if (s.distance(in_resolution) == 0.0f) { | |
specs = s; | |
found = true; | |
} | |
} | |
if (!found) { | |
throw std::runtime_error{"Dlss::run called with invalid input resolution."}; | |
} | |
if (!m_dlss_feature || m_dlss_feature->is_hdr() != is_hdr || m_dlss_feature->sharpen() != sharpen || m_dlss_feature->quality() != specs.quality || m_dlss_feature->out_resolution() != specs.out_resolution) { | |
m_dlss_feature.reset(new DlssFeature{m_vk_and_ngx, specs.out_resolution, is_hdr, sharpen, specs.quality}); | |
} | |
} | |
void run( | |
const ivec2& in_resolution, | |
bool is_hdr, | |
float sharpening, | |
const vec2& jitter_offset, | |
bool shall_reset | |
) override { | |
CUDA_CHECK_THROW(cudaDeviceSynchronize()); | |
update_feature(in_resolution, is_hdr, sharpening != 0.0f); | |
m_dlss_feature->run( | |
in_resolution, | |
jitter_offset, | |
sharpening, | |
shall_reset, | |
m_frame_buffer.ngx_resource(), | |
m_depth_buffer.ngx_resource(), | |
m_mvec_buffer.ngx_resource(), | |
m_exposure_buffer.ngx_resource(), | |
m_output_buffer.ngx_resource() | |
); | |
} | |
cudaSurfaceObject_t frame() override { | |
return m_frame_buffer.surface(); | |
} | |
cudaSurfaceObject_t depth() override { | |
return m_depth_buffer.surface(); | |
} | |
cudaSurfaceObject_t mvec() override { | |
return m_mvec_buffer.surface(); | |
} | |
cudaSurfaceObject_t exposure() override { | |
return m_exposure_buffer.surface(); | |
} | |
cudaSurfaceObject_t output() override { | |
return m_output_buffer.surface(); | |
} | |
ivec2 clamp_resolution(const ivec2& resolution) const { | |
float min_distance = std::numeric_limits<float>::infinity(); | |
DlssFeatureSpecs min_distance_specs = {}; | |
for (const auto& specs : m_dlss_specs) { | |
float distance = specs.distance(resolution); | |
if (distance <= min_distance) { | |
min_distance = distance; | |
min_distance_specs = specs; | |
} | |
} | |
return min_distance_specs.clamp_resolution(resolution); | |
} | |
ivec2 out_resolution() const override { | |
return m_dlss_feature ? m_dlss_feature->out_resolution() : m_max_out_resolution; | |
} | |
ivec2 max_out_resolution() const override { | |
return m_max_out_resolution; | |
} | |
bool is_hdr() const override { | |
return m_dlss_feature && m_dlss_feature->is_hdr(); | |
} | |
bool sharpen() const override { | |
return m_dlss_feature && m_dlss_feature->sharpen(); | |
} | |
EDlssQuality quality() const override { | |
return m_dlss_feature ? m_dlss_feature->quality() : EDlssQuality::None; | |
} | |
private: | |
std::shared_ptr<VulkanAndNgx> m_vk_and_ngx; | |
std::unique_ptr<DlssFeature> m_dlss_feature; | |
std::vector<DlssFeatureSpecs> m_dlss_specs; | |
VulkanTexture m_frame_buffer; | |
VulkanTexture m_depth_buffer; | |
VulkanTexture m_mvec_buffer; | |
VulkanTexture m_exposure_buffer; | |
VulkanTexture m_output_buffer; | |
ivec2 m_max_out_resolution; | |
}; | |
std::unique_ptr<IDlss> VulkanAndNgx::init_dlss(const ivec2& out_resolution) { | |
return std::make_unique<Dlss>(shared_from_this(), out_resolution); | |
} | |
} | |