/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/** @file   gpu_matrix.h
 *  @author Thomas Müller, NVIDIA
 *  @brief  Matrix whose data resides in GPU (CUDA) memory
 */

#pragma once

#include <tiny-cuda-nn/common.h>
#include <tiny-cuda-nn/common_host.h>
#include <tiny-cuda-nn/gpu_memory.h>

#include <pcg32/pcg32.h>

#include <cmath>
#include <cstdint>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

namespace tcnn {

template <typename T> class GPUMatrixDynamic;
template <typename T, MatrixLayout _layout = CM> class GPUMatrix;

class GPUMatrixBase {
public:
	virtual ~GPUMatrixBase() {}

	virtual size_t n_bytes() const = 0;
	virtual void set_data_unsafe(void* data) = 0;

	static void allocate_shared_memory(GPUMemory<char>& memory, const std::vector<GPUMatrixBase*>& matrices) {
		size_t total_n_bytes = 0;
		for (auto* matrix : matrices) {
			total_n_bytes += matrix->n_bytes();
		}

		if (memory.bytes() < total_n_bytes) {
			log_debug("GPUMatrix: allocating {} shared among {} matrices.", bytes_to_string(total_n_bytes), matrices.size());
			memory.resize(total_n_bytes);
		}

		size_t offset = 0;
		for (auto* matrix : matrices) {
			matrix->set_data_unsafe(memory.data() + offset);
			offset += matrix->n_bytes();
		}
	}

	template <typename T>
	static void allocate_shared_memory(GPUMemory<char>& memory, std::vector<GPUMatrixDynamic<T>>& matrices);

	template <typename T, MatrixLayout _layout>
	static void allocate_shared_memory(GPUMemory<char>& memory, std::vector<GPUMatrix<T, _layout>>& matrices);

	static GPUMemoryArena::Allocation allocate_shared_memory(cudaStream_t stream, const std::vector<GPUMatrixBase*>& matrices) {
		size_t total_n_bytes = 0;
		for (auto* matrix : matrices) {
			total_n_bytes += matrix->n_bytes();
		}

		auto alloc = allocate_workspace(stream, total_n_bytes);

		size_t offset = 0;
		for (auto* matrix : matrices) {
			matrix->set_data_unsafe(alloc.data() + offset);
			offset += matrix->n_bytes();
		}

		return alloc;
	}

	template <typename T>
	static GPUMemoryArena::Allocation allocate_shared_memory(cudaStream_t stream, std::vector<GPUMatrixDynamic<T>>& matrices);

	template <typename T, MatrixLayout _layout>
	static GPUMemoryArena::Allocation allocate_shared_memory(cudaStream_t stream, std::vector<GPUMatrix<T, _layout>>& matrices);
};

template <typename T>
class GPUMatrixDynamic : public GPUMatrixBase {
public:
	using Type = T;
	using View = MatrixView<T>;
	using ConstView = MatrixView<const T>;

	// Owning its memory as a GPUMemory
	GPUMatrixDynamic(uint32_t m, uint32_t n, MatrixLayout layout = CM) : m_rows{m}, m_cols{n}, m_layout{layout} {
		m_malloc_allocation = std::make_shared<GPUMemory<uint8_t>>(m * n * sizeof(T));
		m_data = (T*)m_malloc_allocation->data();
		set_stride_contiguous();
	}

	// Owning its memory as an allocation from a stream's memory arena
	GPUMatrixDynamic(uint32_t m, uint32_t n, cudaStream_t stream, MatrixLayout layout = CM) : m_rows{m}, m_cols{n}, m_layout{layout} {
		m_arena_allocation = std::make_shared<GPUMemoryArena::Allocation>(allocate_workspace(stream, m * n * sizeof(T)));
		m_data = (T*)m_arena_allocation->data();
		set_stride_contiguous();
	}

	// Pointing to external memory
	explicit GPUMatrixDynamic(
		T* data,
		uint32_t m,
		uint32_t n,
		MatrixLayout layout = CM,
		uint32_t stride = 0,
		std::shared_ptr<GPUMemory<uint8_t>> malloc_allocation = nullptr,
		std::shared_ptr<GPUMemoryArena::Allocation> arena_allocation = nullptr
	) : m_data{data}, m_layout{layout}, m_malloc_allocation{malloc_allocation}, m_arena_allocation{arena_allocation} {
		set(data, m, n, stride);
	}

	GPUMatrixDynamic() : GPUMatrixDynamic{nullptr, 0, 0} {}
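	// Illustrative usage sketch (not part of the API): the three ways a
	// GPUMatrixDynamic can relate to its memory. The sizes, pointer `ptr`,
	// and `stream` below are hypothetical.
	//
	//   GPUMatrixDynamic<float> owned{128, 16384};            // owns a fresh GPUMemory allocation
	//   GPUMatrixDynamic<float> arena{128, 16384, stream};    // owns an allocation from `stream`'s memory arena
	//   GPUMatrixDynamic<float> wrapped{ptr, 128, 16384, RM}; // wraps externally managed memory; resize() is not permitted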
	GPUMatrixDynamic<T>& operator=(GPUMatrixDynamic<T>&& other) {
		std::swap(m_data, other.m_data);
		std::swap(m_rows, other.m_rows);
		std::swap(m_cols, other.m_cols);
		std::swap(m_stride, other.m_stride);
		std::swap(m_layout, other.m_layout);
		std::swap(m_malloc_allocation, other.m_malloc_allocation);
		std::swap(m_arena_allocation, other.m_arena_allocation);
		return *this;
	}

	GPUMatrixDynamic(GPUMatrixDynamic<T>&& other) {
		*this = std::move(other);
	}

	GPUMatrixDynamic(const GPUMatrixDynamic<T>& other) = delete;
	GPUMatrixDynamic<T>& operator=(const GPUMatrixDynamic<T>& other) = delete;

	virtual ~GPUMatrixDynamic() {}

	void set_data_unsafe(void* data) override {
		m_data = (T*)data;
	}

	void set_size_unsafe(uint32_t rows, uint32_t cols, uint32_t stride = 0) {
		m_rows = rows;
		m_cols = cols;
		if (stride == 0) {
			set_stride_contiguous();
		} else {
			m_stride = stride;
		}
	}

	void set(T* data, uint32_t rows, uint32_t cols, uint32_t stride = 0) {
		set_data_unsafe(data);
		set_size_unsafe(rows, cols, stride);
	}

	void resize(uint32_t rows, uint32_t cols) {
		if (m_arena_allocation) {
			cudaStream_t stream = m_arena_allocation->stream();
			m_arena_allocation.reset(); // reset is called explicitly to ensure memory is freed before being allocated
			m_arena_allocation = std::make_shared<GPUMemoryArena::Allocation>(allocate_workspace(stream, rows * cols * sizeof(T)));
			m_data = (T*)m_arena_allocation->data();
		} else if (m_malloc_allocation || !data()) {
			m_malloc_allocation.reset(); // reset is called explicitly to ensure memory is freed before being allocated
			m_malloc_allocation = std::make_shared<GPUMemory<uint8_t>>(rows * cols * sizeof(T));
			m_data = (T*)m_malloc_allocation->data();
		} else {
			throw std::runtime_error{"GPUMatrix::resize is not permitted when the underlying memory is not owned. Use GPUMatrix::set instead."};
		}

		set_size_unsafe(rows, cols);
	}

	uint32_t stride_contiguous() const {
		return m_layout == CM ? m() : n();
	}

	bool is_contiguous() const {
		return m_stride == stride_contiguous();
	}

	void set_stride_contiguous() {
		m_stride = stride_contiguous();
	}

	GPUMatrixDynamic<T> slice(uint32_t offset_rows, uint32_t new_rows, uint32_t offset_cols, uint32_t new_cols) const {
		return GPUMatrixDynamic<T>{
			data() + (layout() == CM ? (offset_rows + offset_cols * stride()) : (offset_cols + offset_rows * stride())),
			new_rows,
			new_cols,
			layout(),
			stride(),
			m_malloc_allocation,
			m_arena_allocation,
		};
	}

	GPUMatrixDynamic<T> slice_rows(uint32_t offset, uint32_t size) const {
		return slice(offset, size, 0, cols());
	}

	GPUMatrixDynamic<T> slice_cols(uint32_t offset, uint32_t size) const {
		return slice(0, rows(), offset, size);
	}

	GPUMatrixDynamic<T> alias() const {
		return slice(0, rows(), 0, cols());
	}

	View view() const {
		return {data(), layout() == CM ? 1u : stride(), layout() == CM ? stride() : 1u};
	}

	ConstView const_view() const {
		return view();
	}

	uint32_t rows() const { return m_rows; }
	uint32_t fan_out() const { return m_rows; }
	uint32_t m() const { return m_rows; }

	uint32_t cols() const { return m_cols; }
	uint32_t fan_in() const { return m_cols; }
	uint32_t n() const { return m_cols; }

	uint32_t stride() const { return m_stride; }

	PitchedPtr<T> pitched_ptr() { return {data(), stride()}; }
	PitchedPtr<const T> pitched_ptr() const { return {data(), stride()}; }

	uint32_t n_elements() const { return m_rows * m_cols; }
	size_t n_bytes() const override { return n_elements() * sizeof(T); }

	MatrixLayout layout() const { return m_layout; }
	MatrixLayout transposed_layout() const { return m_layout == RM ? CM : RM; }
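	// Illustrative usage sketch (not part of the API): slices and views alias
	// the matrix's memory rather than copying it. `matrix` is hypothetical.
	//
	//   auto block = matrix.slice_cols(0, 256); // first 256 columns, backed by the same allocation
	//   auto view = matrix.view();              // lightweight strided accessor, suitable for passing to kernels by value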
	T* data() const { return m_data; }

	void memset(int value) {
		CHECK_THROW(data());
		CHECK_THROW(is_contiguous());
		CUDA_CHECK_THROW(cudaMemset(data(), value, n_bytes()));
	}

	void memset_async(cudaStream_t stream, int value) {
		CHECK_THROW(data());
		CHECK_THROW(is_contiguous());
		CUDA_CHECK_THROW(cudaMemsetAsync(data(), value, n_bytes(), stream));
	}

	std::vector<T> to_cpu_vector() {
		CHECK_THROW(data());
		CHECK_THROW(is_contiguous());
		std::vector<T> v(n_elements());
		CUDA_CHECK_THROW(cudaMemcpy(v.data(), data(), n_bytes(), cudaMemcpyDeviceToHost));
		return v;
	}

	// Various initializations
	void initialize_uniform(pcg32& rnd, float low, float high) {
		CHECK_THROW(data());
		CHECK_THROW(is_contiguous());

		// Define probability distribution
		float scale = high - low;

		// Sample initialized values
		std::vector<T> new_data(n_elements());
		for (size_t i = 0; i < new_data.size(); ++i) {
			new_data[i] = (T)(low + rnd.next_float() * scale);
		}

		CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice));
	}

	void initialize_xavier_uniform(pcg32& rnd, float scale = 1) {
		CHECK_THROW(data());
		CHECK_THROW(is_contiguous());

		// Define probability distribution
		scale *= std::sqrt(6.0f / (float)(fan_in() + fan_out()));

		// Sample initialized values
		std::vector<T> new_data(n_elements());
		for (size_t i = 0; i < new_data.size(); ++i) {
			new_data[i] = (T)(rnd.next_float() * 2.0f * scale - scale);
		}

		CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice));
	}

	void initialize_fa_uniform_forward(pcg32& rnd, float scale = 1) {
		CHECK_THROW(data());
		CHECK_THROW(is_contiguous());

		// Define probability distribution
		scale *= std::sqrt(1.0f / (float)fan_in());

		// Sample initialized values
		std::vector<T> new_data(n_elements());
		for (size_t i = 0; i < new_data.size(); ++i) {
			new_data[i] = (T)(rnd.next_float() * 2.0f * scale - scale);
		}

		CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice));
	}

	void initialize_fa_uniform_backward(pcg32& rnd, float scale = 1) {
		CHECK_THROW(data());
		CHECK_THROW(is_contiguous());

		// Define probability distribution
		scale *= std::sqrt(1.0f / (float)fan_out());

		// Sample initialized values
		std::vector<T> new_data(n_elements());
		for (size_t i = 0; i < new_data.size(); ++i) {
			new_data[i] = (T)(rnd.next_float() * 2.0f * scale - scale);
		}

		CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice));
	}

	void initialize_siren_uniform(pcg32& rnd, float scale = 1) {
		CHECK_THROW(data());
		CHECK_THROW(is_contiguous());

		// Define probability distribution
		scale *= std::sqrt(6.0f / (float)fan_in());

		// Sample initialized values
		std::vector<T> new_data(n_elements());
		for (size_t i = 0; i < new_data.size(); ++i) {
			new_data[i] = (T)(rnd.next_float() * 2.0f * scale - scale);
		}

		CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice));
	}

	void initialize_siren_uniform_first(pcg32& rnd, float scale = 1) {
		CHECK_THROW(data());
		CHECK_THROW(is_contiguous());

		// Define probability distribution
		// The 30 in the first layer comes from https://vsitzmann.github.io/siren/
		scale *= 30.0f / (float)fan_in();

		// Sample initialized values
		std::vector<T> new_data(n_elements());
		for (size_t i = 0; i < new_data.size(); ++i) {
			new_data[i] = (T)(rnd.next_float() * 2.0f * scale - scale);
		}

		CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice));
	}

	void initialize_constant(float val) {
		CHECK_THROW(data());
		CHECK_THROW(is_contiguous());

		std::vector<T> new_data(n_elements(), (T)val);
		CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice));
	}
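	// Illustrative usage sketch (not part of the API): the initializers draw
	// samples on the host from the given pcg32 generator and upload them with
	// a single cudaMemcpy, so they are intended for setup rather than
	// per-training-step use. The seed and sizes are hypothetical.
	//
	//   pcg32 rng{1337};
	//   GPUMatrix<float, RM> weights{64, 64};
	//   weights.initialize_xavier_uniform(rng);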
	void initialize_diagonal(float val = 1) {
		CHECK_THROW(data());
		CHECK_THROW(is_contiguous());
		CHECK_THROW(n() == m()); // Must be square for diagonal init to make sense

		std::vector<T> new_data(n_elements(), (T)0);
		for (uint32_t i = 0; i < n(); ++i) {
			new_data[i + i * n()] = (T)val;
		}

		CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice));
	}

	GPUMatrixDynamic<T> transposed() const {
		return GPUMatrixDynamic<T>(data(), n(), m(), transposed_layout(), stride(), m_malloc_allocation, m_arena_allocation);
	}

	GPUMatrix<T, RM> rm() const {
		CHECK_THROW(m_layout == RM);
		return GPUMatrix<T, RM>(data(), m(), n(), stride(), m_malloc_allocation, m_arena_allocation);
	}

	GPUMatrix<T, CM> cm() const {
		CHECK_THROW(m_layout == CM);
		return GPUMatrix<T, CM>(data(), m(), n(), stride(), m_malloc_allocation, m_arena_allocation);
	}

private:
	T* m_data;
	uint32_t m_rows, m_cols, m_stride;
	MatrixLayout m_layout;

	// References to corresponding memory allocations. These ensure that
	// m_data does not accidentally become dangling.
	std::shared_ptr<GPUMemory<uint8_t>> m_malloc_allocation;
	std::shared_ptr<GPUMemoryArena::Allocation> m_arena_allocation;
};

template <typename T, MatrixLayout _layout>
class GPUMatrix : public GPUMatrixDynamic<T> {
public:
	static const MatrixLayout static_layout = _layout;
	static const MatrixLayout static_transposed_layout = _layout == RM ? CM : RM;

	// Owning its memory as a GPUMemory
	GPUMatrix(uint32_t m, uint32_t n) : GPUMatrixDynamic<T>{m, n, static_layout} {}

	// Owning its memory as an allocation from a stream's memory arena
	GPUMatrix(uint32_t m, uint32_t n, cudaStream_t stream) : GPUMatrixDynamic<T>{m, n, stream, static_layout} {}

	// Pointing to external memory
	explicit GPUMatrix(
		T* data,
		uint32_t m,
		uint32_t n,
		uint32_t stride = 0,
		std::shared_ptr<GPUMemory<uint8_t>> malloc_allocation = nullptr,
		std::shared_ptr<GPUMemoryArena::Allocation> arena_allocation = nullptr
	) : GPUMatrixDynamic<T>{data, m, n, static_layout, stride, malloc_allocation, arena_allocation} {}

	GPUMatrix() : GPUMatrix{nullptr, 0, 0} {}

	GPUMatrix<T, _layout>& operator=(GPUMatrixDynamic<T>&& other) {
		*((GPUMatrixDynamic<T>*)this) = std::move(other);
		if (static_layout != this->layout()) {
			throw std::runtime_error{"GPUMatrix must be constructed from a GPUMatrixDynamic with matching layout."};
		}
		return *this;
	}

	GPUMatrix(GPUMatrixDynamic<T>&& other) noexcept {
		*this = std::move(other);
	}

	GPUMatrix<T, _layout>& operator=(GPUMatrix<T, _layout>&& other) noexcept {
		*((GPUMatrixDynamic<T>*)this) = std::move(other);
		return *this;
	}

	GPUMatrix(GPUMatrix<T, _layout>&& other) noexcept {
		*this = std::move(other);
	}

	GPUMatrix(const GPUMatrixDynamic<T>& other) = delete;
	GPUMatrix<T, _layout>& operator=(const GPUMatrixDynamic<T>& other) = delete;

	virtual ~GPUMatrix() {}

	GPUMatrix<T, _layout> slice(uint32_t offset_rows, uint32_t new_rows, uint32_t offset_cols, uint32_t new_cols) const {
		return ((GPUMatrixDynamic<T>*)this)->slice(offset_rows, new_rows, offset_cols, new_cols);
	}

	GPUMatrix<T, _layout> slice_rows(uint32_t offset, uint32_t size) const {
		return ((GPUMatrixDynamic<T>*)this)->slice_rows(offset, size);
	}

	GPUMatrix<T, _layout> slice_cols(uint32_t offset, uint32_t size) const {
		return ((GPUMatrixDynamic<T>*)this)->slice_cols(offset, size);
	}

	GPUMatrix<T, _layout> alias() const {
		return ((GPUMatrixDynamic<T>*)this)->alias();
	}

	GPUMatrix<T, static_transposed_layout> transposed() const {
		return ((GPUMatrixDynamic<T>*)this)->transposed();
	}
};

template <typename T>
void GPUMatrixBase::allocate_shared_memory(GPUMemory<char>& memory, std::vector<GPUMatrixDynamic<T>>& matrices) {
	std::vector<GPUMatrixBase*> matrix_pointers;
	for (auto& matrix : matrices) {
		matrix_pointers.emplace_back(&matrix);
	}

	allocate_shared_memory(memory, matrix_pointers);
}
template <typename T, MatrixLayout _layout>
void GPUMatrixBase::allocate_shared_memory(GPUMemory<char>& memory, std::vector<GPUMatrix<T, _layout>>& matrices) {
	std::vector<GPUMatrixBase*> matrix_pointers;
	for (auto& matrix : matrices) {
		matrix_pointers.emplace_back(&matrix);
	}

	allocate_shared_memory(memory, matrix_pointers);
}

template <typename T>
GPUMemoryArena::Allocation GPUMatrixBase::allocate_shared_memory(cudaStream_t stream, std::vector<GPUMatrixDynamic<T>>& matrices) {
	std::vector<GPUMatrixBase*> matrix_pointers;
	for (auto& matrix : matrices) {
		matrix_pointers.emplace_back(&matrix);
	}

	return allocate_shared_memory(stream, matrix_pointers);
}

template <typename T, MatrixLayout _layout>
GPUMemoryArena::Allocation GPUMatrixBase::allocate_shared_memory(cudaStream_t stream, std::vector<GPUMatrix<T, _layout>>& matrices) {
	std::vector<GPUMatrixBase*> matrix_pointers;
	for (auto& matrix : matrices) {
		matrix_pointers.emplace_back(&matrix);
	}

	return allocate_shared_memory(stream, matrix_pointers);
}

}
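// Illustrative usage sketch (not part of the API above): packing several
// matrices into one stream-ordered arena allocation. The sizes and `stream`
// are hypothetical; the matrices stay valid only as long as `alloc` lives.
//
//   tcnn::GPUMatrix<float> a{nullptr, 128, 16384}; // sized, but not yet backed by memory
//   tcnn::GPUMatrix<float> b{nullptr, 64, 16384};
//   std::vector<tcnn::GPUMatrixBase*> mats = {&a, &b};
//   auto alloc = tcnn::GPUMatrixBase::allocate_shared_memory(stream, mats);
//   // a.data() and b.data() now point into one contiguous workspace owned by `alloc`.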