/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/** @file   gpu_memory.h
 *  @author Thomas Müller and Nikolaus Binder, NVIDIA
 *  @brief  Managed memory on the GPU. Like a std::vector, memory is allocated either explicitly (resize/enlarge)
 *          or implicitly (resize_and_copy_from_host etc). Memory is always and automatically released in the destructor.
 *          Also contains a GPU memory arena for light-weight stream-ordered allocations of temporary memory. The
 *          memory arena makes use of virtual memory when available to avoid re-allocations during progressive growing.
 */

#pragma once

#include <tiny-cuda-nn/common_host.h>
#include <tiny-cuda-nn/cuda_graph.h>

#include <cuda.h>

#include <algorithm>
#include <atomic>
#include <cstdint>
#include <memory>
#include <stdexcept>
#include <string>
#include <tuple>
#include <unordered_map>
#include <vector>

namespace tcnn {

#define DEBUG_GUARD_SIZE 0

inline std::atomic<size_t>& total_n_bytes_allocated() {
	static std::atomic<size_t> s_total_n_bytes_allocated{0};
	return s_total_n_bytes_allocated;
}

/// Managed memory on the device
template <typename T>
class GPUMemory {
private:
	T* m_data = nullptr;
	size_t m_size = 0; // Number of elements
	bool m_managed = false;

public:
	using Type = T;
	using View = T*;
	using ConstView = const T*;

	GPUMemory() {}

	GPUMemory(size_t size, bool managed = false) : m_managed{managed} {
		resize(size);
	}

	GPUMemory& operator=(GPUMemory&& other) {
		std::swap(m_data, other.m_data);
		std::swap(m_size, other.m_size);
		std::swap(m_managed, other.m_managed);
		return *this;
	}

	GPUMemory(GPUMemory&& other) {
		*this = std::move(other);
	}

	// Don't permit copy assignment to prevent performance accidents.
	// Copy is permitted through an explicit copy constructor.
	GPUMemory& operator=(const GPUMemory& other) = delete;
	explicit GPUMemory(const GPUMemory& other) {
		m_managed = other.managed();
		copy_from_device(other);
	}

	void check_guards() const {
#if DEBUG_GUARD_SIZE > 0
		if (!m_data) {
			return;
		}

		uint8_t buf[DEBUG_GUARD_SIZE];
		const uint8_t* rawptr = (const uint8_t*)m_data;

		// Check the guard bytes preceding the allocation.
		cudaMemcpy(buf, rawptr - DEBUG_GUARD_SIZE, DEBUG_GUARD_SIZE, cudaMemcpyDeviceToHost);
		for (int i = 0; i < DEBUG_GUARD_SIZE; ++i) {
			if (buf[i] != 0xff) {
				printf("TRASH BEFORE BLOCK offset %d data %p, read 0x%02x expected 0xff!\n", i, m_data, buf[i]);
				break;
			}
		}

		// Check the guard bytes following the allocation.
		cudaMemcpy(buf, rawptr + m_size * sizeof(T), DEBUG_GUARD_SIZE, cudaMemcpyDeviceToHost);
		for (int i = 0; i < DEBUG_GUARD_SIZE; ++i) {
			if (buf[i] != 0xfe) {
				printf("TRASH AFTER BLOCK offset %d data %p, read 0x%02x expected 0xfe!\n", i, m_data, buf[i]);
				break;
			}
		}
#endif
	}

	void allocate_memory(size_t n_bytes) {
		if (n_bytes == 0) {
			return;
		}

		uint8_t* rawptr = nullptr;
		if (m_managed) {
			CUDA_CHECK_THROW(cudaMallocManaged(&rawptr, n_bytes + DEBUG_GUARD_SIZE * 2));
		} else {
			CUDA_CHECK_THROW(cudaMalloc(&rawptr, n_bytes + DEBUG_GUARD_SIZE * 2));
		}
#if DEBUG_GUARD_SIZE > 0
		CUDA_CHECK_THROW(cudaMemset(rawptr, 0xff, DEBUG_GUARD_SIZE));
		CUDA_CHECK_THROW(cudaMemset(rawptr + n_bytes + DEBUG_GUARD_SIZE, 0xfe, DEBUG_GUARD_SIZE));
#endif
		if (rawptr) rawptr += DEBUG_GUARD_SIZE;
		m_data = (T*)(rawptr);
		total_n_bytes_allocated() += n_bytes;
	}

	void free_memory() {
		if (!m_data) {
			return;
		}

		uint8_t* rawptr = (uint8_t*)m_data;
		if (rawptr) rawptr -= DEBUG_GUARD_SIZE;
		CUDA_CHECK_THROW(cudaFree(rawptr));

		total_n_bytes_allocated() -= get_bytes();

		m_data = nullptr;
		m_size = 0;
	}
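	// Editorial note (illustrative, not from the original documentation): the guard machinery above is
	// compiled out by default because DEBUG_GUARD_SIZE is defined as 0 at the top of this file. Changing
	// that macro to a small positive value, e.g.
	//
	//   #define DEBUG_GUARD_SIZE 16
	//
	// makes allocate_memory() pad every allocation with 0xff/0xfe guard bytes, and check_guards()
	// (invoked from data()) then prints a warning whenever those bytes were overwritten, which helps
	// track down out-of-bounds writes.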
	/// Frees memory again
	TCNN_HOST_DEVICE ~GPUMemory() {
#ifndef __CUDA_ARCH__
		try {
			if (m_data) {
				free_memory();
				m_size = 0;
			}
		} catch (const std::runtime_error& error) {
			// Don't need to report on memory-free problems when the driver is shutting down.
			if (std::string{error.what()}.find("driver shutting down") == std::string::npos) {
				log_warning("Could not free memory: {}", error.what());
			}
		}
#endif
	}

	/** @name Resizing/enlargement
	 *  @{
	 */
	/// Resizes the array to the exact new size, even if it is already larger
	void resize(const size_t size) {
		if (m_size != size) {
			if (m_size) {
				try {
					free_memory();
				} catch (const std::runtime_error& error) {
					throw std::runtime_error{fmt::format("Could not free memory: {}", error.what())};
				}
			}

			if (size > 0) {
				try {
					allocate_memory(size * sizeof(T));
				} catch (const std::runtime_error& error) {
					throw std::runtime_error{fmt::format("Could not allocate memory: {}", error.what())};
				}
			}

			m_size = size;
		}
	}

	/// Enlarges the array if its size is smaller
	void enlarge(const size_t size) {
		if (size > m_size) {
			resize(size);
		}
	}
	/** @} */

	/** @name Memset
	 *  @{
	 */
	/// Sets the memory of the first num_elements to value
	void memset(const int value, const size_t num_elements, const size_t offset = 0) {
		if (num_elements + offset > m_size) {
			throw std::runtime_error{fmt::format("Could not set memory: Number of elements {}+{} larger than allocated memory {}.", num_elements, offset, m_size)};
		}

		CUDA_CHECK_THROW(cudaMemset(m_data + offset, value, num_elements * sizeof(T)));
	}

	/// Sets the memory of all elements to value
	void memset(const int value) {
		memset(value, m_size);
	}
	/** @} */

	/** @name Copy operations
	 *  @{
	 */
	/// Copies num_elements from a raw pointer on the host
	void copy_from_host(const T* host_data, const size_t num_elements) {
		CUDA_CHECK_THROW(cudaMemcpy(data(), host_data, num_elements * sizeof(T), cudaMemcpyHostToDevice));
	}

	/// Copies num_elements from a host vector
	void copy_from_host(const std::vector<T>& data, const size_t num_elements) {
		if (data.size() < num_elements) {
			throw std::runtime_error{fmt::format("Trying to copy {} elements, but vector size is only {}.", num_elements, data.size())};
		}
		copy_from_host(data.data(), num_elements);
	}

	/// Copies data from the raw host pointer to fill the entire array
	void copy_from_host(const T* data) {
		copy_from_host(data, m_size);
	}

	/// Copies num_elements of data from the raw host pointer after enlarging the array so that everything fits in
	void enlarge_and_copy_from_host(const T* data, const size_t num_elements) {
		enlarge(num_elements);
		copy_from_host(data, num_elements);
	}

	/// Copies num_elements from the host vector after enlarging the array so that everything fits in
	void enlarge_and_copy_from_host(const std::vector<T>& data, const size_t num_elements) {
		enlarge_and_copy_from_host(data.data(), num_elements);
	}

	/// Copies the entire host vector after enlarging the array so that everything fits in
	void enlarge_and_copy_from_host(const std::vector<T>& data) {
		enlarge_and_copy_from_host(data.data(), data.size());
	}

	/// Copies num_elements of data from the raw host pointer after resizing the array
	void resize_and_copy_from_host(const T* data, const size_t num_elements) {
		resize(num_elements);
		copy_from_host(data, num_elements);
	}

	/// Copies num_elements from the host vector after resizing the array
	void resize_and_copy_from_host(const std::vector<T>& data, const size_t num_elements) {
		resize_and_copy_from_host(data.data(), num_elements);
	}

	/// Copies the entire host vector after resizing the array
	void resize_and_copy_from_host(const std::vector<T>& data) {
		resize_and_copy_from_host(data.data(), data.size());
	}
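	// Illustrative sketch (editorial addition): how the resize_* and enlarge_* helpers above differ,
	// given a hypothetical std::vector<float> `host` already filled with data.
	//
	//   GPUMemory<float> buf;
	//   buf.resize_and_copy_from_host(host);   // buffer now holds exactly host.size() elements
	//   buf.enlarge_and_copy_from_host(host);  // buffer is already large enough; only the copy happens
	//   buf.enlarge(2 * host.size());          // grows, which reallocates and discards previous contents
	//   buf.memset(0, host.size());            // zeroes only the first host.size() elements
	//
	// Note that resize()/enlarge() free and reallocate rather than preserving existing contents.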
	/// Copies the entire host vector to the device. Fails if there is not enough space available.
	void copy_from_host(const std::vector<T>& data) {
		if (data.size() < m_size) {
			throw std::runtime_error{fmt::format("Trying to copy {} elements, but vector size is only {}.", m_size, data.size())};
		}
		copy_from_host(data.data(), m_size);
	}

	/// Copies num_elements from the device to a raw pointer on the host. Fails if the array holds fewer elements.
	void copy_to_host(T* host_data, const size_t num_elements) const {
		if (num_elements > m_size) {
			throw std::runtime_error{fmt::format("Trying to copy {} elements, but memory size is only {}.", num_elements, m_size)};
		}

		CUDA_CHECK_THROW(cudaMemcpy(host_data, data(), num_elements * sizeof(T), cudaMemcpyDeviceToHost));
	}

	/// Copies num_elements from the device to a vector on the host
	void copy_to_host(std::vector<T>& data, const size_t num_elements) const {
		if (data.size() < num_elements) {
			throw std::runtime_error{fmt::format("Trying to copy {} elements, but vector size is only {}.", num_elements, data.size())};
		}
		copy_to_host(data.data(), num_elements);
	}

	/// Copies all elements from the device to a raw pointer on the host
	void copy_to_host(T* data) const {
		copy_to_host(data, m_size);
	}

	/// Copies all elements from the device to a vector on the host
	void copy_to_host(std::vector<T>& data) const {
		if (data.size() < m_size) {
			throw std::runtime_error{fmt::format("Trying to copy {} elements, but vector size is only {}.", m_size, data.size())};
		}
		copy_to_host(data.data(), m_size);
	}

	/// Copies size elements from another device array to this one, automatically resizing it
	void copy_from_device(const GPUMemory<T>& other, const size_t size) {
		if (size == 0) {
			return;
		}

		if (m_size < size) {
			resize(size);
		}

		CUDA_CHECK_THROW(cudaMemcpy(m_data, other.m_data, size * sizeof(T), cudaMemcpyDeviceToDevice));
	}

	/// Copies data from another device array to this one, automatically resizing it
	void copy_from_device(const GPUMemory<T>& other) {
		copy_from_device(other, other.m_size);
	}

	/// Creates an (owned) copy of the data
	GPUMemory<T> copy(size_t size) const {
		GPUMemory<T> result{size};
		result.copy_from_device(*this);
		return result;
	}

	GPUMemory<T> copy() const {
		return copy(m_size);
	}
	/** @} */

	T* data() const {
		check_guards();
		return m_data;
	}

	View view() const {
		return data();
	}

	ConstView const_view() const {
		return view();
	}

	bool managed() const {
		return m_managed;
	}

	T& at(size_t idx) const {
		if (!m_managed) {
			throw std::runtime_error{fmt::format("GPUMemory::at() not permitted if not managed.")};
		}

		if (idx >= m_size) {
			throw std::runtime_error{fmt::format("GPUMemory out of bounds: idx={} size={}", idx, m_size)};
		}

		return m_data[idx];
	}

	TCNN_HOST_DEVICE T& operator[](size_t idx) const {
#ifdef DEBUG_BUFFER_OVERRUN
		if (idx >= m_size) {
			printf("WARNING: buffer overrun of %p at idx %zu\n", (void*)m_data, idx);
		}
#endif
		return m_data[idx];
	}

	TCNN_HOST_DEVICE T& operator[](uint32_t idx) const {
#ifdef DEBUG_BUFFER_OVERRUN
		if (idx >= m_size) {
			printf("WARNING: buffer overrun of %p at idx %u\n", (void*)m_data, idx);
		}
#endif
		return m_data[idx];
	}

	size_t get_num_elements() const {
		return m_size;
	}

	size_t size() const {
		return get_num_elements();
	}

	size_t get_bytes() const {
		return m_size * sizeof(T);
	}

	size_t n_bytes() const {
		return get_bytes();
	}

	size_t bytes() const {
		return get_bytes();
	}
};
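// Illustrative usage sketch (editorial addition, not part of the original file): round-tripping data
// through GPUMemory. The names `host_data` and `readback` are hypothetical.
//
//   std::vector<float> host_data(1024, 1.0f);
//   tcnn::GPUMemory<float> device_data;
//   device_data.resize_and_copy_from_host(host_data); // allocates 1024 floats on the GPU and uploads
//
//   // ... launch kernels operating on device_data.data() ...
//
//   std::vector<float> readback(device_data.size());
//   device_data.copy_to_host(readback);               // downloads all elements back to the host
//
// The GPU allocation is released automatically when `device_data` goes out of scope.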
class GPUMemoryArena {
public:
	GPUMemoryArena() {
		m_device = cuda_device();

		// Align memory at least by a cache line (128 bytes).
		m_alignment = (size_t)128;
		m_max_size = previous_multiple(cuda_memory_info().total, cuda_memory_granularity());

		m_free_intervals = {{0, m_max_size}};

		// Reserve an address range that would be sufficient for housing the entire
		// available GPU RAM (if nothing else was using the GPU). This is unlikely
		// to exhaust all available addresses (even if multiple GPUMemoryArenas are
		// used simultaneously), while also ensuring that we never exhaust the
		// reserved address range without running out of physical memory beforehand.
		if (cuda_supports_virtual_memory() && cuMemAddressReserve(&m_base_address, m_max_size, 0, 0, 0) == CUDA_SUCCESS) {
			return;
		}

		// Use regular memory as fallback
		m_fallback_memory = std::make_shared<GPUMemory<uint8_t>>();

		static bool printed_warning = false;
		if (!printed_warning) {
			printed_warning = true;
			log_warning(
				"GPUMemoryArena: GPU {} does not support virtual memory. "
				"Falling back to regular allocations, which will be larger and can cause occasional stutter.",
				m_device
			);
		}
	}

	GPUMemoryArena(GPUMemoryArena&& other) = default;
	GPUMemoryArena(const GPUMemoryArena& other) = delete;
	GPUMemoryArena& operator=(GPUMemoryArena&& other) = delete;
	GPUMemoryArena& operator=(const GPUMemoryArena& other) = delete;

	~GPUMemoryArena() {
		if (in_use()) {
			log_warning("Attempting to free memory arena while it is still in use.");
		}

		try {
			// Make sure we're clearing the GPU memory arena on the correct device.
			int previous_device = cuda_device();
			set_cuda_device(m_device);
			ScopeGuard revert_device = {[&]() { set_cuda_device(previous_device); }};

			CUDA_CHECK_THROW(cudaDeviceSynchronize());

			if (m_base_address) {
				total_n_bytes_allocated() -= m_size;

				CU_CHECK_THROW(cuMemUnmap(m_base_address, m_size));

				for (const auto& handle : m_handles) {
					CU_CHECK_THROW(cuMemRelease(handle));
				}

				CU_CHECK_THROW(cuMemAddressFree(m_base_address, m_max_size));
			}
		} catch (const std::runtime_error& error) {
			// Don't need to report on memory-free problems when the driver is shutting down.
			if (std::string{error.what()}.find("driver shutting down") == std::string::npos) {
				log_warning("Could not free memory arena: {}", error.what());
			}
		}
	}

	uint8_t* data() {
		return m_fallback_memory ? m_fallback_memory->data() : (uint8_t*)m_base_address;
	}

	std::shared_ptr<GPUMemory<uint8_t>> backing_memory() {
		return m_fallback_memory;
	}
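	// Editorial note (illustrative, not from the original source): the arena tracks its address space
	// with two structures. `m_free_intervals` holds the free [start, end) byte ranges in ascending
	// order (the last interval always reaches up to m_max_size), while `m_allocated_intervals` maps
	// the start offset of each live allocation to its end offset. allocate() below performs a best-fit
	// search over the free intervals, free() re-inserts the interval and merges neighbors, and
	// enlarge() maps additional physical memory (cuMemCreate/cuMemMap) into the reserved virtual
	// address range only when the high-water mark grows.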
	// Finds the smallest interval of free memory in the GPUMemoryArena that's
	// large enough to hold the requested number of bytes. Then allocates
	// that memory.
	size_t allocate(size_t n_bytes) {
		// Permitting zero-sized allocations is error prone
		if (n_bytes == 0) {
			n_bytes = m_alignment;
		}

		// Align allocations with the nearest cache line (at least the granularity of the memory allocations)
		n_bytes = next_multiple(n_bytes, m_alignment);

		Interval* best_candidate = &m_free_intervals.back();
		for (auto& f : m_free_intervals) {
			if (f.size() >= n_bytes && f.size() < best_candidate->size()) {
				best_candidate = &f;
			}
		}

		size_t start = best_candidate->start;

		// Note: the += operator can turn `best_candidate` into an empty interval, which is fine because it will
		// be absorbed into adjacent free intervals in later calls to `merge_adjacent_intervals`.
		m_allocated_intervals[start] = best_candidate->start += n_bytes;

		enlarge(size());

		return start;
	}

	void free(size_t start) {
		if (m_allocated_intervals.count(start) == 0) {
			throw std::runtime_error{"Attempted to free arena memory that was not allocated."};
		}

		Interval interval = {start, m_allocated_intervals[start]};
		m_allocated_intervals.erase(start);

		m_free_intervals.insert(
			std::upper_bound(std::begin(m_free_intervals), std::end(m_free_intervals), interval),
			interval
		);

		merge_adjacent_intervals();
	}

	void enlarge(size_t n_bytes) {
		if (n_bytes <= m_size) {
			return;
		}

		if (cuda_device() != m_device) {
			throw std::runtime_error{fmt::format("Attempted to use a GPUMemoryArena of device {} from the wrong device {}.", m_device, cuda_device())};
		}

		log_debug("GPUMemoryArena: enlarging from {} to {}", bytes_to_string(m_size), bytes_to_string(n_bytes));

		if (m_fallback_memory) {
			static const double GROWTH_FACTOR = 1.5;

			CUDA_CHECK_THROW(cudaDeviceSynchronize());

			m_size = next_multiple((size_t)(n_bytes * GROWTH_FACTOR), cuda_memory_granularity());
			m_fallback_memory = std::make_shared<GPUMemory<uint8_t>>(m_fallback_memory->copy(m_size));

			CUDA_CHECK_THROW(cudaDeviceSynchronize());

			return;
		}

		size_t n_bytes_to_allocate = n_bytes - m_size;
		n_bytes_to_allocate = next_multiple(n_bytes_to_allocate, cuda_memory_granularity());

		CUmemAllocationProp prop = {};
		prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
		prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
		prop.location.id = m_device;

		m_handles.emplace_back();
		CU_CHECK_THROW(cuMemCreate(&m_handles.back(), n_bytes_to_allocate, &prop, 0));

		CUmemAccessDesc access_desc = {};
		access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
		access_desc.location.id = prop.location.id;
		access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;

		CU_CHECK_THROW(cuMemMap(m_base_address + m_size, n_bytes_to_allocate, 0, m_handles.back(), 0));
		CU_CHECK_THROW(cuMemSetAccess(m_base_address + m_size, n_bytes_to_allocate, &access_desc, 1));

		m_size += n_bytes_to_allocate;
		total_n_bytes_allocated() += n_bytes_to_allocate;

		// Need to synchronize the device to make sure memory is available to all streams.
		if (current_capture()) {
			current_capture()->schedule_synchronize();
		} else {
			CUDA_CHECK_THROW(cudaDeviceSynchronize());
		}
	}

	size_t size() const {
		return m_free_intervals.back().start;
	}

	bool in_use() const {
		return m_free_intervals.size() != 1 || m_free_intervals.front().size() != m_max_size;
	}

	class Allocation {
	public:
		Allocation() = default;
		Allocation(cudaStream_t stream, size_t offset, const std::shared_ptr<GPUMemoryArena>& workspace)
		: m_stream{stream}, m_data{workspace->data() + offset}, m_offset{offset}, m_workspace{workspace}, m_backing_memory{workspace->backing_memory()}
		{}

		~Allocation() {
			if (m_workspace) {
				m_workspace->free(m_offset);
			}
		}

		Allocation(const Allocation& other) = delete;
		Allocation& operator=(Allocation&& other) {
			std::swap(m_stream, other.m_stream);
			std::swap(m_data, other.m_data);
			std::swap(m_offset, other.m_offset);
			std::swap(m_workspace, other.m_workspace);
			std::swap(m_backing_memory, other.m_backing_memory);
			return *this;
		}

		Allocation(Allocation&& other) {
			*this = std::move(other);
		}

		uint8_t* data() {
			return m_data;
		}

		const uint8_t* data() const {
			return m_data;
		}

		cudaStream_t stream() const {
			return m_stream;
		}

	private:
		cudaStream_t m_stream = nullptr;
		uint8_t* m_data = nullptr;
		size_t m_offset = 0;
		std::shared_ptr<GPUMemoryArena> m_workspace = nullptr;

		// Backing GPUMemory (if backed by a GPUMemory). Ensures that
		// the backing memory is only freed once all allocations that
		// use it were destroyed.
		std::shared_ptr<GPUMemory<uint8_t>> m_backing_memory = nullptr;
	};

private:
	void merge_adjacent_intervals() {
		size_t j = 0;
		for (size_t i = 1; i < m_free_intervals.size(); ++i) {
			Interval& prev = m_free_intervals[j];
			Interval& cur = m_free_intervals[i];

			if (prev.end == cur.start) {
				prev.end = cur.end;
			} else {
				++j;
				m_free_intervals[j] = m_free_intervals[i];
			}
		}

		m_free_intervals.resize(j + 1);
	}

	std::vector<Interval> m_free_intervals;
	std::unordered_map<size_t, size_t> m_allocated_intervals;

	int m_device = 0;
	CUdeviceptr m_base_address = {};
	size_t m_size = 0;

	std::vector<CUmemGenericAllocationHandle> m_handles;

	// Used when virtual memory isn't supported.
	// Requires more storage + memcpy, but is more portable.
	std::shared_ptr<GPUMemory<uint8_t>> m_fallback_memory = nullptr;

	size_t m_alignment;
	size_t m_max_size;
};

inline std::unordered_map<cudaStream_t, std::shared_ptr<GPUMemoryArena>>& stream_gpu_memory_arenas() {
	static auto* stream_gpu_memory_arenas = new std::unordered_map<cudaStream_t, std::shared_ptr<GPUMemoryArena>>{};
	return *stream_gpu_memory_arenas;
}

inline std::unordered_map<int, std::shared_ptr<GPUMemoryArena>>& global_gpu_memory_arenas() {
	static auto* global_gpu_memory_arenas = new std::unordered_map<int, std::shared_ptr<GPUMemoryArena>>{};
	return *global_gpu_memory_arenas;
}

inline GPUMemoryArena::Allocation allocate_workspace(cudaStream_t stream, size_t n_bytes) {
	if (n_bytes == 0) {
		// Return a null allocation if no bytes were requested.
		return {};
	}

	auto& arena = stream ? stream_gpu_memory_arenas()[stream] : global_gpu_memory_arenas()[cuda_device()];
	if (!arena) {
		arena = std::make_shared<GPUMemoryArena>();
	}
	return GPUMemoryArena::Allocation{stream, arena->allocate(n_bytes), arena};
}

inline size_t align_to_cacheline(size_t bytes) {
	return next_multiple(bytes, (size_t)128);
}

template <typename First, typename FirstSize>
std::tuple<First*> allocate_workspace_and_distribute(cudaStream_t stream, GPUMemoryArena::Allocation* alloc, size_t offset, FirstSize first_size) {
	*alloc = allocate_workspace(stream, offset + align_to_cacheline(first_size * sizeof(First)));
	return std::make_tuple((First*)(alloc->data() + offset));
}

template <typename First, typename ...Types, typename FirstSize, typename ...Sizes, std::enable_if_t<sizeof...(Types) == sizeof...(Sizes), int> = 0>
std::tuple<First*, Types*...> allocate_workspace_and_distribute(cudaStream_t stream, GPUMemoryArena::Allocation* alloc, size_t offset, FirstSize first_size, Sizes... sizes) {
	auto nested = allocate_workspace_and_distribute<Types...>(stream, alloc, offset + align_to_cacheline(first_size * sizeof(First)), sizes...);
	return std::tuple_cat(std::make_tuple((First*)(alloc->data() + offset)), nested);
}

template <typename ...Types, typename ...Sizes, std::enable_if_t<sizeof...(Types) == sizeof...(Sizes), int> = 0>
std::tuple<Types*...> allocate_workspace_and_distribute(cudaStream_t stream, GPUMemoryArena::Allocation* alloc, Sizes... sizes) {
	return allocate_workspace_and_distribute<Types...>(stream, alloc, (size_t)0, sizes...);
}

inline void free_gpu_memory_arena(cudaStream_t stream) {
	if (stream) {
		stream_gpu_memory_arenas().erase(stream);
	} else {
		global_gpu_memory_arenas().erase(cuda_device());
	}
}

inline void free_all_gpu_memory_arenas() {
	stream_gpu_memory_arenas().clear();
	global_gpu_memory_arenas().clear();
}

}
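// Illustrative usage sketch (editorial addition, not part of the original file): carving several
// temporary device buffers out of a single stream-ordered arena allocation. The variable names and
// element counts below are hypothetical.
//
//   cudaStream_t stream = /* some existing CUDA stream */;
//   tcnn::GPUMemoryArena::Allocation alloc;
//   float* tmp_a;
//   uint32_t* tmp_b;
//   std::tie(tmp_a, tmp_b) = tcnn::allocate_workspace_and_distribute<float, uint32_t>(
//       stream, &alloc, 1024 /* floats */, 256 /* uint32_ts */
//   );
//
//   // ... enqueue kernels on `stream` that use tmp_a and tmp_b ...
//
// Both pointers alias cache-line-aligned sub-ranges of one arena allocation; when `alloc` goes out
// of scope, its range is returned to the per-stream arena rather than to the CUDA allocator.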