/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/** @file   gpu_matrix.h
 *  @author Thomas Müller, NVIDIA
 *  @brief  Matrix whose data resides in GPU (CUDA) memory
 */

#pragma once

// Assumed dependencies: the tiny-cuda-nn headers that provide GPUMemory, GPUMemoryArena,
// allocate_workspace, MatrixLayout/MatrixView/PitchedPtr, and the CHECK_THROW/log helpers,
// plus the vendored pcg32 RNG. Adjust to the include layout of your checkout if it differs.
#include <tiny-cuda-nn/common.h>
#include <tiny-cuda-nn/common_host.h>
#include <tiny-cuda-nn/gpu_memory.h>

#include <pcg32/pcg32.h>

#include <cuda_runtime.h>

#include <cmath>
#include <cstdint>
#include <memory>
#include <stdexcept>
#include <utility>
#include <vector>

namespace tcnn {
template <typename T>
class GPUMatrixDynamic;

template <typename T, MatrixLayout _layout>
class GPUMatrix;

class GPUMatrixBase {
public:
	virtual ~GPUMatrixBase() {}

	virtual size_t n_bytes() const = 0;
	virtual void set_data_unsafe(void* data) = 0;

	static void allocate_shared_memory(GPUMemory<char>& memory, const std::vector<GPUMatrixBase*>& matrices) {
		size_t total_n_bytes = 0;
		for (auto* matrix : matrices) {
			total_n_bytes += matrix->n_bytes();
		}

		if (memory.bytes() < total_n_bytes) {
			log_debug("GPUMatrix: allocating {} shared among {} matrices.", bytes_to_string(total_n_bytes), matrices.size());
			memory.resize(total_n_bytes);
		}

		size_t offset = 0;
		for (auto* matrix : matrices) {
			matrix->set_data_unsafe(memory.data() + offset);
			offset += matrix->n_bytes();
		}
	}

	template <typename T>
	static void allocate_shared_memory(GPUMemory<char>& memory, std::vector<GPUMatrixDynamic<T>>& matrices);

	template <typename T, MatrixLayout layout>
	static void allocate_shared_memory(GPUMemory<char>& memory, std::vector<GPUMatrix<T, layout>>& matrices);

	static GPUMemoryArena::Allocation allocate_shared_memory(cudaStream_t stream, const std::vector<GPUMatrixBase*>& matrices) {
		size_t total_n_bytes = 0;
		for (auto* matrix : matrices) {
			total_n_bytes += matrix->n_bytes();
		}

		auto alloc = allocate_workspace(stream, total_n_bytes);

		size_t offset = 0;
		for (auto* matrix : matrices) {
			matrix->set_data_unsafe(alloc.data() + offset);
			offset += matrix->n_bytes();
		}

		return alloc;
	}

	template <typename T>
	static GPUMemoryArena::Allocation allocate_shared_memory(cudaStream_t stream, std::vector<GPUMatrixDynamic<T>>& matrices);

	template <typename T, MatrixLayout layout>
	static GPUMemoryArena::Allocation allocate_shared_memory(cudaStream_t stream, std::vector<GPUMatrix<T, layout>>& matrices);
};
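// Usage sketch (illustrative only; `stream`, `batch_size`, and the dimensions below are
// placeholders, not part of this header): packing several matrices into one workspace
// allocation drawn from a stream's memory arena.
//
//     GPUMatrixDynamic<float> a{nullptr, 64, batch_size};
//     GPUMatrixDynamic<float> b{nullptr, 32, batch_size};
//     auto alloc = GPUMatrixBase::allocate_shared_memory(stream, {&a, &b});
//     // `a` and `b` now point into disjoint regions of `alloc`, which must outlive their use.
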
template <typename T>
class GPUMatrixDynamic : public GPUMatrixBase {
public:
	using Type = T;
	using View = MatrixView<T>;
	using ConstView = MatrixView<const T>;

	// Owning its memory as a GPUMemory<T>
	GPUMatrixDynamic(uint32_t m, uint32_t n, MatrixLayout layout = CM)
	: m_rows{m}, m_cols{n}, m_layout{layout} {
		m_malloc_allocation = std::make_shared<GPUMemory<uint8_t>>(m * n * sizeof(T));
		m_data = (T*)m_malloc_allocation->data();
		set_stride_contiguous();
	}

	// Owning its memory as an allocation from a stream's memory arena
	GPUMatrixDynamic(uint32_t m, uint32_t n, cudaStream_t stream, MatrixLayout layout = CM)
	: m_rows{m}, m_cols{n}, m_layout{layout} {
		m_arena_allocation = std::make_shared<GPUMemoryArena::Allocation>(allocate_workspace(stream, m * n * sizeof(T)));
		m_data = (T*)m_arena_allocation->data();
		set_stride_contiguous();
	}

	// Pointing to external memory
	explicit GPUMatrixDynamic(
		T* data,
		uint32_t m,
		uint32_t n,
		MatrixLayout layout = CM,
		uint32_t stride = 0,
		std::shared_ptr<GPUMemory<uint8_t>> malloc_allocation = nullptr,
		std::shared_ptr<GPUMemoryArena::Allocation> arena_allocation = nullptr
	) : m_data{data}, m_layout{layout}, m_malloc_allocation{malloc_allocation}, m_arena_allocation{arena_allocation} {
		set(data, m, n, stride);
	}

	GPUMatrixDynamic() : GPUMatrixDynamic{nullptr, 0, 0} {}

	GPUMatrixDynamic<T>& operator=(GPUMatrixDynamic<T>&& other) {
		std::swap(m_data, other.m_data);
		std::swap(m_rows, other.m_rows);
		std::swap(m_cols, other.m_cols);
		std::swap(m_stride, other.m_stride);
		std::swap(m_layout, other.m_layout);
		std::swap(m_malloc_allocation, other.m_malloc_allocation);
		std::swap(m_arena_allocation, other.m_arena_allocation);
		return *this;
	}

	GPUMatrixDynamic(GPUMatrixDynamic<T>&& other) {
		*this = std::move(other);
	}

	GPUMatrixDynamic(const GPUMatrixDynamic<T>& other) = delete;
	GPUMatrixDynamic<T>& operator=(const GPUMatrixDynamic<T>& other) = delete;

	virtual ~GPUMatrixDynamic() {}

	void set_data_unsafe(void* data) override { m_data = (T*)data; }

	void set_size_unsafe(uint32_t rows, uint32_t cols, uint32_t stride = 0) {
		m_rows = rows;
		m_cols = cols;
		if (stride == 0) {
			set_stride_contiguous();
		} else {
			m_stride = stride;
		}
	}

	void set(T* data, uint32_t rows, uint32_t cols, uint32_t stride = 0) {
		set_data_unsafe(data);
		set_size_unsafe(rows, cols, stride);
	}

	void resize(uint32_t rows, uint32_t cols) {
		if (m_arena_allocation) {
			cudaStream_t stream = m_arena_allocation->stream();
			m_arena_allocation.reset(); // reset is called explicitly to ensure memory is freed before being allocated
			m_arena_allocation = std::make_shared<GPUMemoryArena::Allocation>(allocate_workspace(stream, rows * cols * sizeof(T)));
			m_data = (T*)m_arena_allocation->data();
		} else if (m_malloc_allocation || !data()) {
			m_malloc_allocation.reset(); // reset is called explicitly to ensure memory is freed before being allocated
			m_malloc_allocation = std::make_shared<GPUMemory<uint8_t>>(rows * cols * sizeof(T));
			m_data = (T*)m_malloc_allocation->data();
		} else {
			throw std::runtime_error{"GPUMatrix::resize is not permitted when the underlying memory is not owned. Use GPUMatrix::set instead."};
		}

		set_size_unsafe(rows, cols);
	}

	uint32_t stride_contiguous() const {
		return m_layout == CM ? m() : n();
	}

	bool is_contiguous() const {
		return m_stride == stride_contiguous();
	}

	void set_stride_contiguous() {
		m_stride = stride_contiguous();
	}
	GPUMatrixDynamic<T> slice(uint32_t offset_rows, uint32_t new_rows, uint32_t offset_cols, uint32_t new_cols) const {
		return GPUMatrixDynamic<T>{
			data() + (layout() == CM ? (offset_rows + offset_cols * stride()) : (offset_cols + offset_rows * stride())),
			new_rows,
			new_cols,
			layout(),
			stride(),
			m_malloc_allocation,
			m_arena_allocation,
		};
	}

	GPUMatrixDynamic<T> slice_rows(uint32_t offset, uint32_t size) const {
		return slice(offset, size, 0, cols());
	}

	GPUMatrixDynamic<T> slice_cols(uint32_t offset, uint32_t size) const {
		return slice(0, rows(), offset, size);
	}

	GPUMatrixDynamic<T> alias() const {
		return slice(0, rows(), 0, cols());
	}

	View view() const {
		return {data(), layout() == CM ? 1u : stride(), layout() == CM ? stride() : 1u};
	}

	ConstView const_view() const {
		return view();
	}

	uint32_t rows() const { return m_rows; }
	uint32_t fan_out() const { return m_rows; }
	uint32_t m() const { return m_rows; }

	uint32_t cols() const { return m_cols; }
	uint32_t fan_in() const { return m_cols; }
	uint32_t n() const { return m_cols; }

	uint32_t stride() const { return m_stride; }

	PitchedPtr<T> pitched_ptr() { return {data(), stride()}; }
	PitchedPtr<const T> pitched_ptr() const { return {data(), stride()}; }

	uint32_t n_elements() const { return m_rows * m_cols; }
	size_t n_bytes() const override { return n_elements() * sizeof(T); }

	MatrixLayout layout() const { return m_layout; }
	MatrixLayout transposed_layout() const { return m_layout == RM ? CM : RM; }

	T* data() const { return m_data; }

	void memset(int value) {
		CHECK_THROW(data());
		CHECK_THROW(is_contiguous());
		CUDA_CHECK_THROW(cudaMemset(data(), value, n_bytes()));
	}

	void memset_async(cudaStream_t stream, int value) {
		CHECK_THROW(data());
		CHECK_THROW(is_contiguous());
		CUDA_CHECK_THROW(cudaMemsetAsync(data(), value, n_bytes(), stream));
	}

	std::vector<T> to_cpu_vector() {
		CHECK_THROW(data());
		CHECK_THROW(is_contiguous());
		std::vector<T> v(n_elements());
		CUDA_CHECK_THROW(cudaMemcpy(v.data(), data(), n_bytes(), cudaMemcpyDeviceToHost));
		return v;
	}
	// Various initializations
	void initialize_uniform(pcg32& rnd, float low, float high) {
		CHECK_THROW(data());
		CHECK_THROW(is_contiguous());

		// Define probability distribution
		float scale = high - low;

		// Sample initialized values
		std::vector<T> new_data(n_elements());
		for (size_t i = 0; i < new_data.size(); ++i) {
			new_data[i] = (T)(low + rnd.next_float() * scale);
		}

		CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice));
	}

	void initialize_xavier_uniform(pcg32& rnd, float scale = 1) {
		CHECK_THROW(data());
		CHECK_THROW(is_contiguous());

		// Define probability distribution
		scale *= std::sqrt(6.0f / (float)(fan_in() + fan_out()));

		// Sample initialized values
		std::vector<T> new_data(n_elements());
		for (size_t i = 0; i < new_data.size(); ++i) {
			new_data[i] = (T)(rnd.next_float() * 2.0f * scale - scale);
		}

		CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice));
	}

	void initialize_fa_uniform_forward(pcg32& rnd, float scale = 1) {
		CHECK_THROW(data());
		CHECK_THROW(is_contiguous());

		// Define probability distribution
		scale *= std::sqrt(1.0f / (float)fan_in());

		// Sample initialized values
		std::vector<T> new_data(n_elements());
		for (size_t i = 0; i < new_data.size(); ++i) {
			new_data[i] = (T)(rnd.next_float() * 2.0f * scale - scale);
		}

		CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice));
	}

	void initialize_fa_uniform_backward(pcg32& rnd, float scale = 1) {
		CHECK_THROW(data());
		CHECK_THROW(is_contiguous());

		// Define probability distribution
		scale *= std::sqrt(1.0f / (float)fan_out());

		// Sample initialized values
		std::vector<T> new_data(n_elements());
		for (size_t i = 0; i < new_data.size(); ++i) {
			new_data[i] = (T)(rnd.next_float() * 2.0f * scale - scale);
		}

		CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice));
	}

	void initialize_siren_uniform(pcg32& rnd, float scale = 1) {
		CHECK_THROW(data());
		CHECK_THROW(is_contiguous());

		// Define probability distribution
		scale *= std::sqrt(6.0f / (float)fan_in());

		// Sample initialized values
		std::vector<T> new_data(n_elements());
		for (size_t i = 0; i < new_data.size(); ++i) {
			new_data[i] = (T)(rnd.next_float() * 2.0f * scale - scale);
		}

		CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice));
	}

	void initialize_siren_uniform_first(pcg32& rnd, float scale = 1) {
		CHECK_THROW(data());
		CHECK_THROW(is_contiguous());

		// Define probability distribution
		// The 30 in the first layer comes from https://vsitzmann.github.io/siren/
		scale *= 30.0f / (float)fan_in();

		// Sample initialized values
		std::vector<T> new_data(n_elements());
		for (size_t i = 0; i < new_data.size(); ++i) {
			new_data[i] = (T)(rnd.next_float() * 2.0f * scale - scale);
		}

		CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice));
	}

	void initialize_constant(float val) {
		CHECK_THROW(data());
		CHECK_THROW(is_contiguous());

		std::vector<T> new_data(n_elements(), (T)val);
		CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice));
	}

	void initialize_diagonal(float val = 1) {
		CHECK_THROW(data());
		CHECK_THROW(is_contiguous());
		CHECK_THROW(n() == m()); // Must be square for diagonal init to make sense

		std::vector<T> new_data(n_elements(), (T)0);
		for (uint32_t i = 0; i < n(); ++i) {
			new_data[i + i*n()] = (T)val;
		}

		CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice));
	}

	GPUMatrixDynamic<T> transposed() const {
		return GPUMatrixDynamic<T>(data(), n(), m(), transposed_layout(), stride(), m_malloc_allocation, m_arena_allocation);
	}

	GPUMatrix<T, RM> rm() const {
		CHECK_THROW(m_layout == RM);
		return GPUMatrix<T, RM>(data(), m(), n(), stride(), m_malloc_allocation, m_arena_allocation);
	}

	GPUMatrix<T, CM> cm() const {
		CHECK_THROW(m_layout == CM);
		return GPUMatrix<T, CM>(data(), m(), n(), stride(), m_malloc_allocation, m_arena_allocation);
	}

private:
	T* m_data;
	uint32_t m_rows, m_cols, m_stride;
	MatrixLayout m_layout;

	// References to corresponding memory allocations. These ensure that
	// m_data does not accidentally become dangling.
	std::shared_ptr<GPUMemory<uint8_t>> m_malloc_allocation;
	std::shared_ptr<GPUMemoryArena::Allocation> m_arena_allocation;
};
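// Usage sketch (illustrative only; the dimensions and RNG seed are placeholders):
// constructing a layout-dynamic matrix that owns its memory, initializing it, and
// obtaining lightweight views that share the same allocation.
//
//     GPUMatrixDynamic<float> weights{256, 128, CM};   // owns a GPUMemory allocation
//     pcg32 rng{1337};
//     weights.initialize_xavier_uniform(rng);
//
//     auto top = weights.slice_rows(0, 64);            // shares the underlying allocation
//     auto view = weights.view();                      // MatrixView<float> for use in kernels
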
template <typename T, MatrixLayout _layout = MatrixLayout::ColumnMajor>
class GPUMatrix : public GPUMatrixDynamic<T> {
public:
	static const MatrixLayout static_layout = _layout;
	static const MatrixLayout static_transposed_layout = _layout == RM ? CM : RM;

	// Owning its memory as a GPUMemory<T>
	GPUMatrix(uint32_t m, uint32_t n)
	: GPUMatrixDynamic<T>{m, n, static_layout} {}

	// Owning its memory as an allocation from a stream's memory arena
	GPUMatrix(uint32_t m, uint32_t n, cudaStream_t stream)
	: GPUMatrixDynamic<T>{m, n, stream, static_layout} {}

	// Pointing to external memory
	explicit GPUMatrix(
		T* data,
		uint32_t m,
		uint32_t n,
		uint32_t stride = 0,
		std::shared_ptr<GPUMemory<uint8_t>> malloc_allocation = nullptr,
		std::shared_ptr<GPUMemoryArena::Allocation> arena_allocation = nullptr
	) : GPUMatrixDynamic<T>{data, m, n, static_layout, stride, malloc_allocation, arena_allocation} {}

	GPUMatrix() : GPUMatrix{nullptr, 0, 0} {}

	GPUMatrix<T, static_layout>& operator=(GPUMatrixDynamic<T>&& other) {
		*((GPUMatrixDynamic<T>*)this) = std::move(other);
		if (static_layout != this->layout()) {
			throw std::runtime_error{"GPUMatrix must be constructed from a GPUMatrixDynamic with matching layout."};
		}
		return *this;
	}

	GPUMatrix(GPUMatrixDynamic<T>&& other) noexcept {
		*this = std::move(other);
	}

	GPUMatrix<T, static_layout>& operator=(GPUMatrix<T, static_layout>&& other) noexcept {
		*((GPUMatrixDynamic<T>*)this) = std::move(other);
		return *this;
	}

	GPUMatrix(GPUMatrix<T, static_layout>&& other) noexcept {
		*this = std::move(other);
	}

	GPUMatrix(const GPUMatrixDynamic<T>& other) = delete;
	GPUMatrix<T>& operator=(const GPUMatrixDynamic<T>& other) = delete;

	virtual ~GPUMatrix() {}

	GPUMatrix<T, static_layout> slice(uint32_t offset_rows, uint32_t new_rows, uint32_t offset_cols, uint32_t new_cols) const {
		return ((GPUMatrixDynamic<T>*)this)->slice(offset_rows, new_rows, offset_cols, new_cols);
	}

	GPUMatrix<T, static_layout> slice_rows(uint32_t offset, uint32_t size) const {
		return ((GPUMatrixDynamic<T>*)this)->slice_rows(offset, size);
	}

	GPUMatrix<T, static_layout> slice_cols(uint32_t offset, uint32_t size) const {
		return ((GPUMatrixDynamic<T>*)this)->slice_cols(offset, size);
	}

	GPUMatrix<T, static_layout> alias() const {
		return ((GPUMatrixDynamic<T>*)this)->alias();
	}

	GPUMatrix<T, static_transposed_layout> transposed() const {
		return ((GPUMatrixDynamic<T>*)this)->transposed();
	}
};
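// Usage sketch (illustrative only): the layout-typed wrapper interoperates with the
// dynamic variant; converting with a mismatched layout fails at runtime.
//
//     GPUMatrix<float> w{64, 1024};                    // ColumnMajor by default
//     GPUMatrixDynamic<float> dyn = w.slice_cols(0, 512);
//     GPUMatrix<float, CM> typed = dyn.cm();           // CHECK_THROWs unless dyn is column-major
//     auto w_t = w.transposed();                       // GPUMatrix<float, RM> aliasing the same data
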
template <typename T>
void GPUMatrixBase::allocate_shared_memory(GPUMemory<char>& memory, std::vector<GPUMatrixDynamic<T>>& matrices) {
	std::vector<GPUMatrixBase*> matrix_pointers;
	for (auto& matrix : matrices) {
		matrix_pointers.emplace_back(&matrix);
	}

	allocate_shared_memory(memory, matrix_pointers);
}

template <typename T, MatrixLayout layout>
void GPUMatrixBase::allocate_shared_memory(GPUMemory<char>& memory, std::vector<GPUMatrix<T, layout>>& matrices) {
	std::vector<GPUMatrixBase*> matrix_pointers;
	for (auto& matrix : matrices) {
		matrix_pointers.emplace_back(&matrix);
	}

	allocate_shared_memory(memory, matrix_pointers);
}

template <typename T>
GPUMemoryArena::Allocation GPUMatrixBase::allocate_shared_memory(cudaStream_t stream, std::vector<GPUMatrixDynamic<T>>& matrices) {
	std::vector<GPUMatrixBase*> matrix_pointers;
	for (auto& matrix : matrices) {
		matrix_pointers.emplace_back(&matrix);
	}

	return allocate_shared_memory(stream, matrix_pointers);
}

template <typename T, MatrixLayout layout>
GPUMemoryArena::Allocation GPUMatrixBase::allocate_shared_memory(cudaStream_t stream, std::vector<GPUMatrix<T, layout>>& matrices) {
	std::vector<GPUMatrixBase*> matrix_pointers;
	for (auto& matrix : matrices) {
		matrix_pointers.emplace_back(&matrix);
	}

	return allocate_shared_memory(stream, matrix_pointers);
}

} // namespace tcnn