/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/** @file   vec.h
 *  @author Thomas Müller, NVIDIA
 *  @brief  Tiny vector / matrix / quaternion implementation.
 */

#pragma once

#include <tiny-cuda-nn/common.h> // TCNN_HOST_DEVICE, TCNN_DEVICE, TCNN_PRAGMA_UNROLL

#include <cmath>
#include <cstdint>
#include <limits>
#include <type_traits>

namespace tcnn {

template <typename... Ts> struct conjunction : std::true_type {};
template <typename B1> struct conjunction<B1> : B1 {};
template <typename B1, typename... Bn> struct conjunction<B1, Bn...> : std::conditional_t<bool(B1::value), conjunction<Bn...>, B1> {};

template <uint32_t N, typename T, typename... Ts>
using enable_if_size_and_type_match_t = std::enable_if_t<sizeof...(Ts) == N && conjunction<std::is_same<Ts, T>...>::value>;

#define TVEC_BODY \
	using underlying_type = T; \
	\
	tvec() = default; \
	\
	TCNN_HOST_DEVICE tvec(T scalar) { \
		TCNN_PRAGMA_UNROLL \
		for (uint32_t i = 0; i < N; ++i) { \
			(*this)[i] = scalar; \
		} \
	} \
	\
	TCNN_HOST_DEVICE static constexpr tvec ones() { return tvec((T)1); } \
	TCNN_HOST_DEVICE static constexpr tvec zero() { return tvec((T)0); } \
	\
	TCNN_HOST_DEVICE tvec(const T* coeffs) { \
		TCNN_PRAGMA_UNROLL \
		for (uint32_t i = 0; i < N; ++i) { \
			(*this)[i] = coeffs[i]; \
		} \
	} \
	\
	template <typename U, uint32_t M, size_t A> \
	TCNN_HOST_DEVICE tvec(const tvec<U, M, A>& other) { \
		TCNN_PRAGMA_UNROLL \
		for (uint32_t i = 0; i < N; ++i) { \
			(*this)[i] = i < M ? (T)other[i] : (T)0; \
		} \
	} \
	\
	TCNN_HOST_DEVICE void to_array(T* coeffs) const { \
		TCNN_PRAGMA_UNROLL \
		for (uint32_t i = 0; i < N; ++i) { \
			coeffs[i] = (*this)[i]; \
		} \
	} \
	\
	TCNN_HOST_DEVICE T* data() { return (T*)this; } \
	TCNN_HOST_DEVICE const T* data() const { return (const T*)this; } \
	\
	TCNN_HOST_DEVICE T& operator[](uint32_t idx) { return ((T*)this)[idx]; } \
	TCNN_HOST_DEVICE const T& operator[](uint32_t idx) const { return ((T*)this)[idx]; } \
	TCNN_HOST_DEVICE T& operator()(uint32_t idx) { return ((T*)this)[idx]; } \
	TCNN_HOST_DEVICE const T& operator()(uint32_t idx) const { return ((T*)this)[idx]; } \
	\
	template <uint32_t OFFSET, uint32_t M> \
	TCNN_HOST_DEVICE tvec<T, M>& slice() { \
		static_assert(OFFSET + M <= N, "Slice must be part of the vector."); \
		return *(tvec<T, M>*)(data() + OFFSET); \
	} \
	\
	template <uint32_t OFFSET, uint32_t M> \
	TCNN_HOST_DEVICE const tvec<T, M>& slice() const { \
		static_assert(OFFSET + M <= N, "Slice must be part of the vector."); \
		return *(tvec<T, M>*)(data() + OFFSET); \
	} \
	\
	TCNN_HOST_DEVICE tvec<T, 2>& xy() { return slice<0, 2>(); } \
	TCNN_HOST_DEVICE const tvec<T, 2>& xy() const { return slice<0, 2>(); } \
	TCNN_HOST_DEVICE tvec<T, 2>& yz() { return slice<1, 2>(); } \
	TCNN_HOST_DEVICE const tvec<T, 2>& yz() const { return slice<1, 2>(); } \
	TCNN_HOST_DEVICE tvec<T, 3>& xyz() { return slice<0, 3>(); } \
	TCNN_HOST_DEVICE const tvec<T, 3>& xyz() const { return slice<0, 3>(); } \
	TCNN_HOST_DEVICE tvec<T, 3>& rgb() { return slice<0, 3>(); } \
	TCNN_HOST_DEVICE const tvec<T, 3>& rgb() const { return slice<0, 3>(); } \
	TCNN_HOST_DEVICE tvec<T, 4>& xyzw() { return slice<0, 4>(); } \
	TCNN_HOST_DEVICE const tvec<T, 4>& xyzw() const { return slice<0, 4>(); } \
	TCNN_HOST_DEVICE tvec<T, 4>& rgba() { return slice<0, 4>(); } \
	TCNN_HOST_DEVICE const tvec<T, 4>& rgba() const { return slice<0, 4>(); } \
	\
	TCNN_HOST_DEVICE static constexpr uint32_t size() { return N; }

template <typename T, uint32_t N, size_t ALIGNMENT = sizeof(T)>
struct alignas(ALIGNMENT) tvec {
	TVEC_BODY

	T elems[N];

	template <typename... Ts, typename = enable_if_size_and_type_match_t<N, T, Ts...>>
	TCNN_HOST_DEVICE tvec(Ts... coeffs) : elems{coeffs...} {}
};
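
// Example (illustrative): the accessors defined by TVEC_BODY let a vector be
// reinterpreted in place as a smaller one, e.g.
//
//   tvec<float, 4> v{1.0f, 2.0f, 3.0f, 4.0f};
//   v.xyz() = tvec<float, 3>{0.0f};        // writes through the slice
//   tvec<float, 2> tail = v.slice<2, 2>(); // copies elements 2 and 3
//
// The partial specializations below for N = 1..4 add the familiar x/y/z/w and
// r/g/b/a member names on top of this shared body.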
template <typename T, size_t ALIGNMENT>
struct alignas(ALIGNMENT) tvec<T, 1, ALIGNMENT> {
	static constexpr uint32_t N = 1;
	TVEC_BODY

	union { T x, r; };
};

template <typename T, size_t ALIGNMENT>
struct alignas(ALIGNMENT) tvec<T, 2, ALIGNMENT> {
	static constexpr uint32_t N = 2;
	TVEC_BODY

	union { T x, r; };
	union { T y, g; };

	TCNN_HOST_DEVICE tvec(T a, T b) : x{a}, y{b} {}
};

template <typename T, size_t ALIGNMENT>
struct alignas(ALIGNMENT) tvec<T, 3, ALIGNMENT> {
	static constexpr uint32_t N = 3;
	TVEC_BODY

	union { T x, r; };
	union { T y, g; };
	union { T z, b; };

	TCNN_HOST_DEVICE tvec(T a, T b, T c) : x{a}, y{b}, z{c} {}
	template <size_t A> TCNN_HOST_DEVICE tvec(const tvec<T, 2, A>& a, T b) : x{a.x}, y{a.y}, z{b} {}
	template <size_t A> TCNN_HOST_DEVICE tvec(T a, const tvec<T, 2, A>& b) : x{a}, y{b.x}, z{b.y} {}
};

template <typename T, size_t ALIGNMENT>
struct alignas(ALIGNMENT) tvec<T, 4, ALIGNMENT> {
	static constexpr uint32_t N = 4;
	TVEC_BODY

	union { T x, r; };
	union { T y, g; };
	union { T z, b; };
	union { T w, a; };

	TCNN_HOST_DEVICE tvec(T a, T b, T c, T d) : x{a}, y{b}, z{c}, w{d} {}
	template <size_t A> TCNN_HOST_DEVICE tvec(const tvec<T, 3, A>& a, T b) : x{a.x}, y{a.y}, z{a.z}, w{b} {}
	template <size_t A, size_t B> TCNN_HOST_DEVICE tvec(const tvec<T, 2, A>& a, const tvec<T, 2, B>& b) : x{a.x}, y{a.y}, z{b.x}, w{b.y} {}
	template <size_t A> TCNN_HOST_DEVICE tvec(const tvec<T, 2, A>& a, T b, T c) : x{a.x}, y{a.y}, z{b}, w{c} {}
	template <size_t A> TCNN_HOST_DEVICE tvec(T a, const tvec<T, 2, A>& b, T c) : x{a}, y{b.x}, z{b.y}, w{c} {}
	template <size_t A> TCNN_HOST_DEVICE tvec(T a, T b, const tvec<T, 2, A>& c) : x{a}, y{b}, z{c.x}, w{c.y} {}
	template <size_t A> TCNN_HOST_DEVICE tvec(T a, const tvec<T, 3, A>& b) : x{a}, y{b.x}, z{b.y}, w{b.z} {}
};

#undef TVEC_BODY
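
// Example (illustrative): the composite constructors of the specializations
// allow assembling vectors from smaller pieces, e.g.
//
//   tvec<float, 2> uv{0.25f, 0.75f};
//   tvec<float, 4> rgba{uv, 0.0f, 1.0f}; // -> {0.25f, 0.75f, 0.0f, 1.0f}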
// Import external cwise functions into the tcnn namespace to avoid
// name resolution problems related to the vector-valued versions defined below.
template <typename T> TCNN_HOST_DEVICE T min(T a, T b) { return std::min(a, b); }
template <typename T> TCNN_HOST_DEVICE T max(T a, T b) { return std::max(a, b); }
template <typename T> TCNN_HOST_DEVICE T clamp(T a, T b, T c) { return a < b ? b : (c < a ? c : a); }
template <typename T> TCNN_HOST_DEVICE T copysign(T a, T b) { return std::copysign(a, b); }
template <typename T> TCNN_HOST_DEVICE T sign(T a) { return std::copysign((T)1, a); }
template <typename T> TCNN_HOST_DEVICE T mix(T a, T b, T c) { return a * ((T)1 - c) + b * c; }
template <typename T> TCNN_HOST_DEVICE T floor(T a) { return std::floor(a); }
template <typename T> TCNN_HOST_DEVICE T round(T a) { return std::round(a); }
template <typename T> TCNN_HOST_DEVICE T ceil(T a) { return std::ceil(a); }
template <typename T> TCNN_HOST_DEVICE T abs(T a) { return std::abs(a); }
template <typename T> TCNN_HOST_DEVICE T distance(T a, T b) { return std::abs(a - b); }
template <typename T> TCNN_HOST_DEVICE T sin(T a) { return std::sin(a); }
template <typename T> TCNN_HOST_DEVICE T asin(T a) { return std::asin(a); }
template <typename T> TCNN_HOST_DEVICE T cos(T a) { return std::cos(a); }
template <typename T> TCNN_HOST_DEVICE T acos(T a) { return std::acos(a); }
template <typename T> TCNN_HOST_DEVICE T tan(T a) { return std::tan(a); }
template <typename T> TCNN_HOST_DEVICE T atan(T a) { return std::atan(a); }
template <typename T> TCNN_HOST_DEVICE T sqrt(T a) { return std::sqrt(a); }
template <typename T> TCNN_HOST_DEVICE T exp(T a) { return std::exp(a); }
template <typename T> TCNN_HOST_DEVICE T log(T a) { return std::log(a); }
template <typename T> TCNN_HOST_DEVICE T exp2(T a) { return std::exp2(a); }
template <typename T> TCNN_HOST_DEVICE T log2(T a) { return std::log2(a); }
template <typename T> TCNN_HOST_DEVICE T pow(T a, T b) { return std::pow(a, b); }

template <typename T> TCNN_HOST_DEVICE T isfinite(T a) {
#if defined(__CUDA_ARCH__)
	return ::isfinite(a);
#else
	return std::isfinite(a);
#endif
}

inline TCNN_HOST_DEVICE float fma(float a, float b, float c) { return fmaf(a, b, c); }

#ifdef __CUDACC__
inline TCNN_DEVICE __half fma(__half a, __half b, __half c) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
	return __hfma(a, b, c);
#else
	return fmaf(a, b, c);
#endif
}
#endif

#define TVEC tvec<T, N, A>
#define BVEC tvec<bool, N>

#define CWISE_OP(operation, type_result, expr, ...)
\ template \ TCNN_HOST_DEVICE type_result operation(__VA_ARGS__) { \ type_result result; \ TCNN_PRAGMA_UNROLL \ for (uint32_t i = 0; i < N; ++i) { \ result[i] = expr; \ } \ return result; \ } CWISE_OP(operator+, TVEC, a[i] + b[i], const TVEC& a, const TVEC& b) CWISE_OP(operator+, TVEC, a + b[i], T a, const TVEC& b) CWISE_OP(operator+, TVEC, a[i] + b, const TVEC& a, T b) CWISE_OP(operator-, TVEC, a[i] - b[i], const TVEC& a, const TVEC& b) CWISE_OP(operator-, TVEC, a - b[i], T a, const TVEC& b) CWISE_OP(operator-, TVEC, a[i] - b, const TVEC& a, T b) CWISE_OP(operator*, TVEC, a[i] * b[i], const TVEC& a, const TVEC& b) CWISE_OP(operator*, TVEC, a * b[i], T a, const TVEC& b) CWISE_OP(operator*, TVEC, a[i] * b, const TVEC& a, T b) CWISE_OP(operator/, TVEC, a[i] / b[i], const TVEC& a, const TVEC& b) CWISE_OP(operator/, TVEC, a / b[i], T a, const TVEC& b) CWISE_OP(operator/, TVEC, a[i] / b, const TVEC& a, T b) CWISE_OP(fma, TVEC, fma(a[i], b[i], c[i]), const TVEC& a, const TVEC& b, const TVEC& c) CWISE_OP(fma, TVEC, fma(a[i], b[i], c), const TVEC& a, const TVEC& b, T c) CWISE_OP(fma, TVEC, fma(a[i], b, c[i]), const TVEC& a, T b, const TVEC& c) CWISE_OP(fma, TVEC, fma(a[i], b, c), const TVEC& a, T b, T c) CWISE_OP(fma, TVEC, fma(a, b[i], c[i]), T a, const TVEC& b, const TVEC& c) CWISE_OP(fma, TVEC, fma(a, b[i], c), T a, const TVEC& b, T c) CWISE_OP(fma, TVEC, fma(a, b, c[i]), T a, T b, const TVEC& c) CWISE_OP(min, TVEC, min(a[i], b[i]), const TVEC& a, const TVEC& b) CWISE_OP(min, TVEC, min(a[i], b), const TVEC& a, T b) CWISE_OP(min, TVEC, min(a, b[i]), T a, const TVEC& b) CWISE_OP(max, TVEC, max(a[i], b[i]), const TVEC& a, const TVEC& b) CWISE_OP(max, TVEC, max(a[i], b), const TVEC& a, T b) CWISE_OP(max, TVEC, max(a, b[i]), T a, const TVEC& b) CWISE_OP(clamp, TVEC, clamp(a[i], b[i], c[i]), const TVEC& a, const TVEC& b, const TVEC& c) CWISE_OP(clamp, TVEC, clamp(a[i], b[i], c), const TVEC& a, const TVEC& b, T c) CWISE_OP(clamp, TVEC, clamp(a[i], b, c[i]), const TVEC& a, T b, const TVEC& c) CWISE_OP(clamp, TVEC, clamp(a[i], b, c), const TVEC& a, T b, T c) CWISE_OP(copysign, TVEC, copysign(a[i], b[i]), const TVEC& a, const TVEC& b) CWISE_OP(copysign, TVEC, copysign(a[i], b), const TVEC& a, T b) CWISE_OP(copysign, TVEC, copysign(a, b[i]), T a, const TVEC& b) CWISE_OP(sign, TVEC, sign(a[i]), const TVEC& a) CWISE_OP(mix, TVEC, a[i] * ((T)1 - c[i]) + b[i] * c[i], const TVEC& a, const TVEC& b, const TVEC& c) CWISE_OP(mix, TVEC, a[i] * ((T)1 - c) + b[i] * c, const TVEC& a, const TVEC& b, T c) CWISE_OP(operator-, TVEC, -a[i], const TVEC& a) CWISE_OP(floor, TVEC, floor(a[i]), const TVEC& a) CWISE_OP(round, TVEC, round(a[i]), const TVEC& a) CWISE_OP(ceil, TVEC, ceil(a[i]), const TVEC& a) CWISE_OP(abs, TVEC, abs(a[i]), const TVEC& a) CWISE_OP(sin, TVEC, sin(a[i]), const TVEC& a) CWISE_OP(asin, TVEC, asin(a[i]), const TVEC& a) CWISE_OP(cos, TVEC, cos(a[i]), const TVEC& a) CWISE_OP(acos, TVEC, acos(a[i]), const TVEC& a) CWISE_OP(tan, TVEC, tan(a[i]), const TVEC& a) CWISE_OP(atan, TVEC, atan(a[i]), const TVEC& a) CWISE_OP(sqrt, TVEC, sqrt(a[i]), const TVEC& a) CWISE_OP(exp, TVEC, exp(a[i]), const TVEC& a) CWISE_OP(log, TVEC, log(a[i]), const TVEC& a) CWISE_OP(exp2, TVEC, exp2(a[i]), const TVEC& a) CWISE_OP(log2, TVEC, log2(a[i]), const TVEC& a) CWISE_OP(pow, TVEC, pow(a[i], b), const TVEC& a, T b) CWISE_OP(pow, TVEC, pow(a[i], b[i]), const TVEC& a, const TVEC& b) CWISE_OP(isfinite, BVEC, isfinite(a[i]), const TVEC& a) #if defined(__CUDACC__) inline TCNN_DEVICE void atomic_add_gmem_float(float* addr, float in) { 
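	// On compute capability 7.0+, issue a "reduction" (non-returning) atomic add
	// via inline PTX: red.relaxed uses relaxed memory ordering and does not return
	// the previous value, which is all that accumulation needs. Older
	// architectures fall back to the regular atomicAdd intrinsic below.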
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 int in_int = *((int*)&in); asm ("red.relaxed.gpu.global.add.f32 [%0], %1;" :: "l"(addr), "r"(in_int)); #else atomicAdd(addr, in); #endif } template TCNN_DEVICE void atomic_add(T* dst, const tvec& a) { TCNN_PRAGMA_UNROLL for (uint32_t i = 0; i < N; ++i) { atomicAdd(dst + i, a[i]); } } template TCNN_DEVICE void atomic_add_gmem(float* dst, const tvec& a) { TCNN_PRAGMA_UNROLL for (uint32_t i = 0; i < N; ++i) { atomic_add_gmem_float(dst + i, a[i]); } } #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 // atomicAdd(__half2) is only supported with compute capability 60 and above inline TCNN_DEVICE void atomic_add_gmem_h2(half2* addr, half2 in) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 int in_int = *((int*)&in); asm ("red.relaxed.gpu.global.add.noftz.f16x2 [%0], %1;" :: "l"(addr), "r"(in_int)); #else atomicAdd(addr, in); #endif } template > TCNN_DEVICE void atomic_add(__half* dst, const tvec<__half, N, A>& a) { TCNN_PRAGMA_UNROLL for (uint32_t i = 0; i < N; i += 2) { atomicAdd((__half2*)(dst + i), __half2(a[i], a[i+1])); } } template > TCNN_DEVICE void atomic_add_gmem(__half* dst, const tvec<__half, N, A>& a) { TCNN_PRAGMA_UNROLL for (uint32_t i = 0; i < N; i += 2) { atomic_add_gmem_h2((__half2*)(dst + i), __half2(a[i], a[i+1])); } } #endif #endif #undef CWISE_OP // __half2 specializations for aligned vectors with 2*N fp16 coefficients. #if defined(__CUDACC__) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 #define HVEC tvec<__half, N, A> #define HALF_CWISE_OP(operation, type_result, expr, ...) \ template > \ TCNN_DEVICE type_result operation(__VA_ARGS__) { \ type_result result; \ TCNN_PRAGMA_UNROLL \ for (uint32_t i = 0; i < N; i += 2) { \ *(__half2*)&result[i] = expr; \ } \ return result; \ } HALF_CWISE_OP(fma, HVEC, __hfma2(*(__half2*)&a[i], *(__half2*)&b[i], *(__half2*)&c[i]), const HVEC& a, const HVEC& b, const HVEC& c) HALF_CWISE_OP(fma, HVEC, __hfma2(*(__half2*)&a[i], *(__half2*)&b[i], __half2half2(c)), const HVEC& a, const HVEC& b, __half c) HALF_CWISE_OP(fma, HVEC, __hfma2(*(__half2*)&a[i], __half2half2(b), *(__half2*)&c[i]), const HVEC& a, __half b, const HVEC& c) HALF_CWISE_OP(fma, HVEC, __hfma2(*(__half2*)&a[i], __half2half2(b), __half2half2(c)), const HVEC& a, __half b, __half c) HALF_CWISE_OP(fma, HVEC, __hfma2(__half2half2(a), *(__half2*)&b[i], *(__half2*)&c[i]), __half a, const HVEC& b, const HVEC& c) HALF_CWISE_OP(fma, HVEC, __hfma2(__half2half2(a), *(__half2*)&b[i], __half2half2(c)), __half a, const HVEC& b, __half c) HALF_CWISE_OP(fma, HVEC, __hfma2(__half2half2(a), __half2half2(b), *(__half2*)&c[i]), __half a, __half b, const HVEC& c) HALF_CWISE_OP(operator+, HVEC, __hadd2(*(__half2*)&a[i], *(__half2*)&b[i]), const HVEC& a, const HVEC& b) HALF_CWISE_OP(operator+, HVEC, __hadd2(__half2half2(a), *(__half2*)&b[i]), __half a, const HVEC& b) HALF_CWISE_OP(operator+, HVEC, __hadd2(*(__half2*)&a[i], __half2half2(b)), const HVEC& a, __half b) HALF_CWISE_OP(operator-, HVEC, __hsub2(*(__half2*)&a[i], *(__half2*)&b[i]), const HVEC& a, const HVEC& b) HALF_CWISE_OP(operator-, HVEC, __hsub2(__half2half2(a), *(__half2*)&b[i]), __half a, const HVEC& b) HALF_CWISE_OP(operator-, HVEC, __hsub2(*(__half2*)&a[i], __half2half2(b)), const HVEC& a, __half b) HALF_CWISE_OP(operator*, HVEC, __hmul2(*(__half2*)&a[i], *(__half2*)&b[i]), const HVEC& a, const HVEC& b) HALF_CWISE_OP(operator*, HVEC, __hmul2(__half2half2(a), *(__half2*)&b[i]), __half a, const HVEC& b) HALF_CWISE_OP(operator*, HVEC, __hmul2(*(__half2*)&a[i], __half2half2(b)), 
const HVEC& a, __half b) HALF_CWISE_OP(operator/, HVEC, __h2div(*(__half2*)&a[i], *(__half2*)&b[i]), const HVEC& a, const HVEC& b) HALF_CWISE_OP(operator/, HVEC, __h2div(*(__half2*)&a[i], __half2half2(b)), const HVEC& a, __half b) #endif #define INPLACE_OP(operation, type_b, expr) \ template \ TCNN_HOST_DEVICE TVEC& operation(TVEC& a, type_b b) { \ TCNN_PRAGMA_UNROLL \ for (uint32_t i = 0; i < N; ++i) { \ expr; \ } \ return a; \ } INPLACE_OP(operator*=, const TVEC&, a[i] *= b[i]) INPLACE_OP(operator/=, const TVEC&, a[i] /= b[i]) INPLACE_OP(operator+=, const TVEC&, a[i] += b[i]) INPLACE_OP(operator-=, const TVEC&, a[i] -= b[i]) INPLACE_OP(operator*=, T, a[i] *= b) INPLACE_OP(operator/=, T, a[i] /= b) #undef INPLACE_OP #define REDUCTION_OP(operation, type_result, init, expr, ...) \ template \ TCNN_HOST_DEVICE type_result operation(__VA_ARGS__) { \ type_result result = init; \ TCNN_PRAGMA_UNROLL \ for (uint32_t i = 0; i < N; ++i) { \ expr; \ } \ return result; \ } REDUCTION_OP(dot, T, (T)0, result += a[i] * b[i], const TVEC& a, const TVEC& b) REDUCTION_OP(sum, T, (T)0, result += a[i], const TVEC& a) REDUCTION_OP(mean, T, (T)0, result += a[i] / (T)N, const TVEC& a) REDUCTION_OP(product, T, (T)1, result *= a[i], const TVEC& a) REDUCTION_OP(min, T, (T)std::numeric_limits::infinity(), result = min(result, a[i]), const TVEC& a) REDUCTION_OP(max, T, (T)-std::numeric_limits::infinity(), result = max(result, a[i]), const TVEC& a) REDUCTION_OP(length2, T, (T)0, result += a[i] * a[i], const TVEC& a) REDUCTION_OP(operator==, bool, true, result &= a[i] == b[i], const TVEC& a, const TVEC& b) REDUCTION_OP(operator!=, bool, false, result |= a[i] != b[i], const TVEC& a, const TVEC& b) #undef REDUCTION_OP #define BOOL_REDUCTION_OP(operation, type_result, init, expr, ...) 
\ template \ TCNN_HOST_DEVICE type_result operation(__VA_ARGS__) { \ type_result result = init; \ TCNN_PRAGMA_UNROLL \ for (uint32_t i = 0; i < N; ++i) { \ expr; \ } \ return result; \ } BOOL_REDUCTION_OP(all, bool, true, result &= a[i], const BVEC& a) BOOL_REDUCTION_OP(any, bool, false, result |= a[i], const BVEC& a) #undef BOOL_REDUCTION_OP template TCNN_HOST_DEVICE T length(const TVEC& a) { return std::sqrt(length2(a)); } template TCNN_HOST_DEVICE T distance(const TVEC& a, const TVEC& b) { return length(a - b); } template TCNN_HOST_DEVICE TVEC normalize(const TVEC& v) { T len = length(v); if (len <= (T)0) { TVEC result{(T)0}; result[0] = (T)1; return result; } return v / len; } template TCNN_HOST_DEVICE TVEC cross(const TVEC& a, const TVEC& b) { return { a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x, }; } template TCNN_HOST_DEVICE TVEC faceforward(const TVEC& n, const TVEC& i, const TVEC& nref) { return n * -copysign((T)1, dot(i, nref)); } #undef TVEC #undef BVEC #define DEF_NON_TEMPLATED_VECTOR_TYPES(name, T) \ template using name = tvec; \ template using a##name = tvec; \ using name##1 = name<1>; \ using name##2 = name<2>; \ using name##3 = name<3>; \ using name##4 = name<4>; DEF_NON_TEMPLATED_VECTOR_TYPES(bvec, bool) DEF_NON_TEMPLATED_VECTOR_TYPES(vec, float) DEF_NON_TEMPLATED_VECTOR_TYPES(dvec, double) DEF_NON_TEMPLATED_VECTOR_TYPES(ivec, int) DEF_NON_TEMPLATED_VECTOR_TYPES(uvec, unsigned int) DEF_NON_TEMPLATED_VECTOR_TYPES(i32vec, int32_t) DEF_NON_TEMPLATED_VECTOR_TYPES(u32vec, uint32_t) DEF_NON_TEMPLATED_VECTOR_TYPES(i16vec, int16_t) DEF_NON_TEMPLATED_VECTOR_TYPES(u16vec, uint16_t) DEF_NON_TEMPLATED_VECTOR_TYPES(i8vec, int8_t) DEF_NON_TEMPLATED_VECTOR_TYPES(u8vec, uint8_t) #if defined(__CUDACC__) DEF_NON_TEMPLATED_VECTOR_TYPES(hvec, __half) #endif #if defined(__CUDACC__) inline TCNN_HOST_DEVICE float4 to_float4(const vec4& x) { return {x.x, x.y, x.z, x.w}; } inline TCNN_HOST_DEVICE float3 to_float3(const vec3& x) { return {x.x, x.y, x.z}; } inline TCNN_HOST_DEVICE float2 to_float2(const vec2& x) { return {x.x, x.y}; } inline TCNN_HOST_DEVICE vec4 to_vec4(const float4& x) { return {x.x, x.y, x.z, x.w}; } inline TCNN_HOST_DEVICE vec3 to_vec3(const float3& x) { return {x.x, x.y, x.z}; } inline TCNN_HOST_DEVICE vec2 to_vec2(const float2& x) { return {x.x, x.y}; } #endif template struct tmat { tmat() = default; TCNN_HOST_DEVICE tmat(T scalar) { TCNN_PRAGMA_UNROLL for (uint32_t i = 0; i < N; ++i) { TCNN_PRAGMA_UNROLL for (uint32_t j = 0; j < M; ++j) { m[i][j] = i == j ? scalar : (T)0; } } } TCNN_HOST_DEVICE static constexpr tmat identity() { return tmat((T)1); } TCNN_HOST_DEVICE static constexpr tmat zero() { return tmat((T)0); } template > TCNN_HOST_DEVICE tmat(Ts... 
coeffs) : d{coeffs...} {} TCNN_HOST_DEVICE tmat(const T* coeffs) { TCNN_PRAGMA_UNROLL for (uint32_t i = 0; i < N; ++i) { TCNN_PRAGMA_UNROLL for (uint32_t j = 0; j < M; ++j) { m[i][j] = *(coeffs++); } } } template TCNN_HOST_DEVICE tmat(const tvec& a) { TCNN_PRAGMA_UNROLL for (uint32_t i = 0; i < N; ++i) { m[i] = a; } } template TCNN_HOST_DEVICE tmat(const tvec& a, const tvec& b) { static_assert(N == 2, "Matrix must have 2 columns."); m[0] = a; m[1] = b; } template TCNN_HOST_DEVICE tmat(const tvec& a, const tvec& b, const tvec& c) { static_assert(N == 3, "Matrix must have 3 columns."); m[0] = a; m[1] = b; m[2] = c; } template TCNN_HOST_DEVICE tmat(const tvec& a, const tvec& b, const tvec& c, const tvec& d) { static_assert(N == 4, "Matrix must have 4 columns."); m[0] = a; m[1] = b; m[2] = c; m[3] = d; } template TCNN_HOST_DEVICE tmat(const tmat& other) { TCNN_PRAGMA_UNROLL for (uint32_t i = 0; i < N; ++i) { TCNN_PRAGMA_UNROLL for (uint32_t j = 0; j < M; ++j) { m[i][j] = i < P && j < O ? other[i][j] : (i == j ? (T)1 : (T)0); } } } template TCNN_HOST_DEVICE tvec operator*(const tvec& v) const { tvec result((T)0); TCNN_PRAGMA_UNROLL for (uint32_t i = 0; i < N; ++i) { TCNN_PRAGMA_UNROLL for (uint32_t j = 0; j < M; ++j) { result[j] += m[i][j] * v[i]; } } return result; } template TCNN_HOST_DEVICE tmat operator*(const tmat& other) const { tmat result; TCNN_PRAGMA_UNROLL for (uint32_t i = 0; i < K; ++i) { result[i] = (*this) * other[i]; } return result; } TCNN_HOST_DEVICE tvec& at(uint32_t idx) { return m[idx]; } TCNN_HOST_DEVICE tvec at(uint32_t idx) const { return m[idx]; } TCNN_HOST_DEVICE tvec& operator[](uint32_t idx) { return m[idx]; } TCNN_HOST_DEVICE tvec operator[](uint32_t idx) const { return m[idx]; } TCNN_HOST_DEVICE T* data() { return d; } TCNN_HOST_DEVICE const T* data() const { return d; } union { tvec m[N]; T d[M*N]; }; }; template TCNN_HOST_DEVICE tmat& operator*=(tmat& m, const tmat& other) { m = m * other; return m; } template TCNN_HOST_DEVICE T frobenius_norm(const tmat& m) { T result = (T)0; TCNN_PRAGMA_UNROLL for (uint32_t i = 0; i < N; ++i) { result += length2(m[i]); } return sqrt(result); } template TCNN_HOST_DEVICE tmat transpose(const tmat& m) { tmat result; TCNN_PRAGMA_UNROLL for (uint32_t i = 0; i < N; ++i) { TCNN_PRAGMA_UNROLL for (uint32_t j = 0; j < M; ++j) { result[j][i] = m[i][j]; } } return result; } template TCNN_HOST_DEVICE tvec row(const tmat& m, int r) { tvec result; TCNN_PRAGMA_UNROLL for (uint32_t i = 0; i < N; ++i) { result[i] = m[i][r]; } return result; } template TCNN_HOST_DEVICE tmat row(const tmat& m, int r, const tvec& v) { tmat result = m; TCNN_PRAGMA_UNROLL for (uint32_t i = 0; i < N; ++i) { result[i][r] = v[i]; } return result; } #define TMAT tmat #define CWISE_OP(operation, type_a, type_b, expr) \ template \ TCNN_HOST_DEVICE TMAT operation(type_a a, type_b b) { \ TMAT result; \ TCNN_PRAGMA_UNROLL \ for (uint32_t i = 0; i < N; ++i) { \ TCNN_PRAGMA_UNROLL \ for (uint32_t j = 0; j < M; ++j) { \ result[i][j] = expr; \ } \ } \ return result; \ } CWISE_OP(operator+, const TMAT&, const TMAT&, a[i][j] + b[i][j]) CWISE_OP(operator-, const TMAT&, const TMAT&, a[i][j] - b[i][j]) CWISE_OP(operator*, T, const TMAT&, a * b[i][j]) CWISE_OP(operator*, const TMAT&, T, a[i][j] * b) CWISE_OP(operator/, const TMAT&, T, a[i][j] / b) #undef CWISE_OP #define INPLACE_OP(operation, type_b, expr) \ template \ TCNN_HOST_DEVICE TMAT& operation(TMAT& a, type_b b) { \ TCNN_PRAGMA_UNROLL \ for (uint32_t i = 0; i < N; ++i) { \ TCNN_PRAGMA_UNROLL \ for (uint32_t j = 0; j < M; ++j) { 
\ expr; \ } \ } \ return a; \ } INPLACE_OP(operator+=, const TMAT&, a[i][j] += b[i][j]) INPLACE_OP(operator-=, const TMAT&, a[i][j] -= b[i][j]) INPLACE_OP(operator*=, T, a[i][j] *= b) INPLACE_OP(operator/=, T, a[i][j] /= b) #undef INPLACE_OP #define REDUCTION_OP(operation, type_result, init, expr, ...) \ template \ TCNN_HOST_DEVICE type_result operation(__VA_ARGS__) { \ type_result result = init; \ TCNN_PRAGMA_UNROLL \ for (uint32_t i = 0; i < N; ++i) { \ TCNN_PRAGMA_UNROLL \ for (uint32_t j = 0; j < M; ++j) { \ expr; \ } \ } \ return result; \ } REDUCTION_OP(operator==, bool, true, result &= a[i][j] == b[i][j], const TMAT& a, const TMAT& b) REDUCTION_OP(operator!=, bool, false, result |= a[i][j] != b[i][j], const TMAT& a, const TMAT& b) REDUCTION_OP(isfinite, bool, true, result &= isfinite(a[i][j]), const TMAT& a) #undef REDUCTION_OP // The following implementations of determinants, adjoints, inverses, and quaternions // (and only those) were adapted from glm per the MIT license, which is included below in full. // ================================================================================ // The MIT License // -------------------------------------------------------------------------------- // Copyright (c) 2005 - G-Truc Creation // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
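
// Example (illustrative): the determinant/adjoint specializations below combine
// into the closed-form inverse, inverse(m) = adjoint(m) / determinant(m), e.g.
//
//   tmat<float, 3, 3> A = ...;
//   tmat<float, 3, 3> A_inv = inverse(A); // adjoint(A) scaled by 1/determinant(A)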
template TCNN_HOST_DEVICE T determinant(const tmat& m) { return m[0][0] * m[1][1] - m[0][1] * m[1][0]; } template TCNN_HOST_DEVICE T determinant(const tmat& m) { return m[0][0] * (m[1][1] * m[2][2] - m[2][1] * m[1][2]) + -m[1][0] * (m[0][1] * m[2][2] - m[2][1] * m[0][2]) + m[2][0] * (m[0][1] * m[1][2] - m[1][1] * m[0][2]) ; } template TCNN_HOST_DEVICE T determinant(const tmat& m) { T s0 = m[2][2] * m[3][3] - m[3][2] * m[2][3]; T s1 = m[2][1] * m[3][3] - m[3][1] * m[2][3]; T s2 = m[2][1] * m[3][2] - m[3][1] * m[2][2]; T s3 = m[2][0] * m[3][3] - m[3][0] * m[2][3]; T s4 = m[2][0] * m[3][2] - m[3][0] * m[2][2]; T s5 = m[2][0] * m[3][1] - m[3][0] * m[2][1]; tvec coeff{ (m[1][1] * s0 - m[1][2] * s1 + m[1][3] * s2), -(m[1][0] * s0 - m[1][2] * s3 + m[1][3] * s4), (m[1][0] * s1 - m[1][1] * s3 + m[1][3] * s5), -(m[1][0] * s2 - m[1][1] * s4 + m[1][2] * s5), }; return m[0][0] * coeff[0] + m[0][1] * coeff[1] + m[0][2] * coeff[2] + m[0][3] * coeff[3] ; } template TCNN_HOST_DEVICE tmat adjoint(const tmat& m) { return { m[1][1], -m[0][1], -m[1][0], m[0][0], }; } template TCNN_HOST_DEVICE tmat adjoint(const tmat& m) { const T m00 = determinant(tmat{m[1][1], m[2][1], m[1][2], m[2][2]}); const T m01 = determinant(tmat{m[0][1], m[2][1], m[0][2], m[2][2]}); const T m02 = determinant(tmat{m[0][1], m[1][1], m[0][2], m[1][2]}); const T m10 = determinant(tmat{m[1][0], m[2][0], m[1][2], m[2][2]}); const T m11 = determinant(tmat{m[0][0], m[2][0], m[0][2], m[2][2]}); const T m12 = determinant(tmat{m[0][0], m[1][0], m[0][2], m[1][2]}); const T m20 = determinant(tmat{m[1][0], m[2][0], m[1][1], m[2][1]}); const T m21 = determinant(tmat{m[0][0], m[2][0], m[0][1], m[2][1]}); const T m22 = determinant(tmat{m[0][0], m[1][0], m[0][1], m[1][1]}); return { m00, -m01, m02, -m10, m11, -m12, m20, -m21, m22, }; } template TCNN_HOST_DEVICE tmat adjoint(const tmat& m) { const T m00 = determinant(tmat{m[1][1], m[1][2], m[1][3], m[2][1], m[2][2], m[2][3], m[3][1], m[3][2], m[3][3]}); const T m01 = determinant(tmat{m[1][0], m[1][2], m[1][3], m[2][0], m[2][2], m[2][3], m[3][0], m[3][2], m[3][3]}); const T m02 = determinant(tmat{m[1][0], m[1][1], m[1][3], m[2][0], m[2][1], m[2][3], m[3][0], m[3][1], m[3][3]}); const T m03 = determinant(tmat{m[1][0], m[1][1], m[1][2], m[2][0], m[2][1], m[2][2], m[3][0], m[3][1], m[3][2]}); const T m10 = determinant(tmat{m[0][1], m[0][2], m[0][3], m[2][1], m[2][2], m[2][3], m[3][1], m[3][2], m[3][3]}); const T m11 = determinant(tmat{m[0][0], m[0][2], m[0][3], m[2][0], m[2][2], m[2][3], m[3][0], m[3][2], m[3][3]}); const T m12 = determinant(tmat{m[0][0], m[0][1], m[0][3], m[2][0], m[2][1], m[2][3], m[3][0], m[3][1], m[3][3]}); const T m13 = determinant(tmat{m[0][0], m[0][1], m[0][2], m[2][0], m[2][1], m[2][2], m[3][0], m[3][1], m[3][2]}); const T m20 = determinant(tmat{m[0][1], m[0][2], m[0][3], m[1][1], m[1][2], m[1][3], m[3][1], m[3][2], m[3][3]}); const T m21 = determinant(tmat{m[0][0], m[0][2], m[0][3], m[1][0], m[1][2], m[1][3], m[3][0], m[3][2], m[3][3]}); const T m22 = determinant(tmat{m[0][0], m[0][1], m[0][3], m[1][0], m[1][1], m[1][3], m[3][0], m[3][1], m[3][3]}); const T m23 = determinant(tmat{m[0][0], m[0][1], m[0][2], m[1][0], m[1][1], m[1][2], m[3][0], m[3][1], m[3][2]}); const T m30 = determinant(tmat{m[0][1], m[0][2], m[0][3], m[1][1], m[1][2], m[1][3], m[2][1], m[2][2], m[2][3]}); const T m31 = determinant(tmat{m[0][0], m[0][2], m[0][3], m[1][0], m[1][2], m[1][3], m[2][0], m[2][2], m[2][3]}); const T m32 = determinant(tmat{m[0][0], m[0][1], m[0][3], m[1][0], m[1][1], m[1][3], m[2][0], 
m[2][1], m[2][3]}); const T m33 = determinant(tmat{m[0][0], m[0][1], m[0][2], m[1][0], m[1][1], m[1][2], m[2][0], m[2][1], m[2][2]}); return { m00, -m10, m20, -m30, -m01, m11, -m21, m31, m02, -m12, m22, -m32, -m03, m13, -m23, m33, }; } template TCNN_HOST_DEVICE tmat inverse(const tmat& m) { return adjoint(m) / determinant(m); } template TCNN_HOST_DEVICE tmat rotmat(T angle, const tvec& axis) { T s, c; sincos(angle, &s, &c); T oc = (T)1 - c; return { oc * axis.x * axis.x + c, oc * axis.x * axis.y + axis.z * s, oc * axis.z * axis.x - axis.y * s, oc * axis.x * axis.y - axis.z * s, oc * axis.y * axis.y + c, oc * axis.y * axis.z + axis.x * s, oc * axis.z * axis.x + axis.y * s, oc * axis.y * axis.z - axis.x * s, oc * axis.z * axis.z + c, }; } template TCNN_HOST_DEVICE tmat rotmat(const tvec& v) { T angle = length(v); if (angle == (T)0) { return tmat::identity(); } return rotmat(angle, v / angle); } template TCNN_HOST_DEVICE tmat mat_sqrt(const tmat& m, T eps = (T)1e-10f) { tmat X = m, Y = tmat::identity(); for (uint32_t i = 0; i < 32; ++i) { if (frobenius_norm(X * X - m) < eps) { return X; } tmat iX = inverse(X); X = (T)0.5f * (X + inverse(Y)); Y = (T)0.5f * (Y + iX); } return X; } template TCNN_HOST_DEVICE tmat mat_log_hawkins(const tmat& m, T eps = (T)1e-10f) { tmat A = m - tmat::identity(), Z = A, X = A; for (uint32_t i = 2; i < 32; ++i) { if (frobenius_norm(Z) < eps) { return X; } Z = Z * A; X += ((T)1 / (T)i) * Z; } return X; } template TCNN_HOST_DEVICE tmat mat_exp_pade(const tmat& m) { // Pade approximation with scaling; same as Matlab. // Pseudocode translated from Hawkins and Grimm [2007] tmat mX = tmat::identity(), mD = tmat::identity(), mN = tmat::identity(); T c = (T)1; constexpr uint32_t q = 6; // Matlab's default when using this algorithm T s = -(T)1; for (uint32_t k = 1; k <= q; ++k) { c = c * (q - k + 1) / (k * (2 * q - k + 1)); mX = m * mX; auto cmX = c * mX; mN = mN + cmX; mD = mD + s * cmX; s = -s; } return inverse(mD) * mN; } template TCNN_HOST_DEVICE tmat mat_log(const tmat& m) { tmat result(m); uint32_t j = 0; for (; j < 32; ++j) { if (frobenius_norm(result - tmat::identity()) < (T)1e-5f) { break; } result = mat_sqrt(result); } result = mat_log_hawkins(result); return (T)scalbnf(1.0f, j) * result; } template TCNN_HOST_DEVICE tmat mat_exp(const tmat& m) { uint32_t N_SQUARING = max(0, 1 + (int)floor(log2(frobenius_norm(m)))); tmat result = (T)scalbnf(1.0f, -N_SQUARING) * m; result = mat_exp_pade(result); for (uint32_t i = 0; i < N_SQUARING; ++i) { result *= result; } return result; } template TCNN_HOST_DEVICE tmat orthogonalize(const tmat& m) { // Iteration to bring an almost orthogonal matrix nearer to its closest // orthogonal matrix. This can be run multiple times until convergence // is measured or, alternatively, once per frame on something like a // camera matrix to ensure it does not degenerate over time. 
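	// Single Newton-Schulz step: R <- 1.5*R - 0.5*(R*R^T)*R, which pulls an
	// almost-orthogonal R toward the nearest orthogonal matrix.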
return (T)1.5f * m - (T)0.5f * (m * transpose(m) * m); } template TCNN_HOST_DEVICE tmat orthogonalize(const tmat& m) { auto rot = orthogonalize(tmat{m}); return tmat{rot[0], rot[1], rot[2], m[3]}; } template TCNN_HOST_DEVICE tmat so3_log(const tmat& m) { T tr = clamp(m[0][0] + m[1][1] + m[2][2], -(T)1 + std::numeric_limits::epsilon(), (T)1); T radians = acosf((tr - (T)1) / (T)2); return radians / sqrt(((T)1 + tr) * ((T)3 - tr)) * (m - transpose(m)); } template TCNN_HOST_DEVICE tmat so3_exp(const tmat& m) { tvec axis = {-m[2][1], m[2][0], -m[1][0]}; T radians_sq = length2(axis); if (radians_sq == (T)0) { return tmat::identity(); } T radians = sqrt(radians_sq); return tmat::identity() + (sin(radians) / radians) * m + (((T)1 - cos(radians)) / radians_sq) * (m * m); } template TCNN_HOST_DEVICE tmat se3_log(const tmat& m) { auto omega = so3_log(tmat(m)); tvec axis = {-omega[2][1], omega[2][0], -omega[1][0]}; T radians_sq = length2(axis); auto inv_trans = tmat::identity(); if (radians_sq > (T)0) { T radians = sqrt(radians_sq); inv_trans += -(T)0.5 * omega + (((T)1 - (T)0.5 * radians * cos((T)0.5 * radians) / sin((T)0.5 * radians)) / radians_sq) * (omega * omega); } return {omega[0], omega[1], omega[2], inv_trans * m[3]}; } template TCNN_HOST_DEVICE tmat se3_exp(const tmat& m) { tmat omega = m; tvec axis = {-omega[2][1], omega[2][0], -omega[1][0]}; T radians_sq = length2(axis); auto trans = tmat::identity(); if (radians_sq > (T)0) { T radians = sqrt(radians_sq); trans += (((T)1 - cos(radians)) / radians_sq) * omega + ((radians - sin(radians)) / (radians * radians_sq)) * (omega * omega); } auto rot = so3_exp(omega); return {rot[0], rot[1], rot[2], trans * m[3]}; } template TCNN_HOST_DEVICE tmat se3_log(const tmat& m) { auto result = tmat(se3_log(tmat(m))); result[3][3] = (T)0; return result; } template TCNN_HOST_DEVICE tmat se3_exp(const tmat& m) { return tmat(se3_exp(tmat(m))); } #define DEF_NON_TEMPLATED_MATRIX_TYPES(name, T) \ template \ using name = tmat; \ using name##4x4 = name<4, 4>; \ using name##4x3 = name<4, 3>; \ using name##4x2 = name<4, 2>; \ using name##3x4 = name<3, 4>; \ using name##3x3 = name<3, 3>; \ using name##3x2 = name<3, 2>; \ using name##2x4 = name<2, 4>; \ using name##2x3 = name<2, 3>; \ using name##2x2 = name<2, 2>; \ using name##4 = name##4x4; \ using name##3 = name##3x3; \ using name##2 = name##2x2; DEF_NON_TEMPLATED_MATRIX_TYPES(mat, float) DEF_NON_TEMPLATED_MATRIX_TYPES(dmat, double) #if defined(__CUDACC__) DEF_NON_TEMPLATED_MATRIX_TYPES(hmat, __half) #endif template struct tquat { tquat() = default; TCNN_HOST_DEVICE tquat(T w, T x, T y, T z) : w{w}, x{x}, y{y}, z{z} {} TCNN_HOST_DEVICE tquat(const tmat& m) { // Code adapted from https://www.euclideanspace.com/maths/geometry/rotations/conversions/matrixToQuaternion/ T tr = m[0][0] + m[1][1] + m[2][2]; if (tr > (T)0) { T S = sqrt(tr + (T)1) * (T)2; // S=4*qw w = (T)0.25 * S; x = (m[1][2] - m[2][1]) / S; y = (m[2][0] - m[0][2]) / S; z = (m[0][1] - m[1][0]) / S; } else if (m[0][0] > m[1][1] && m[0][0] > m[2][2]) { T S = sqrt((T)1 + m[0][0] - m[1][1] - m[2][2]) * (T)2; // S=4*x w = (m[1][2] - m[2][1]) / S; x = (T)0.25 * S; y = (m[1][0] + m[0][1]) / S; z = (m[2][0] + m[0][2]) / S; } else if (m[1][1] > m[2][2]) { T S = sqrt((T)1 + m[1][1] - m[0][0] - m[2][2]) * (T)2; // S=4*y w = (m[2][0] - m[0][2]) / S; x = (m[1][0] + m[0][1]) / S; y = (T)0.25 * S; z = (m[2][1] + m[1][2]) / S; } else { T S = sqrt((T)1 + m[2][2] - m[0][0] - m[1][1]) * (T)2; // S=4*z w = (m[0][1] - m[1][0]) / S; x = (m[2][0] + m[0][2]) / S; y = (m[2][1] + 
m[1][2]) / S; z = (T)0.25 * S; } } T w, x, y, z; }; template TCNN_HOST_DEVICE tquat operator-(const tquat& a) { return {-a.w, -a.x, -a.y, -a.z}; } template TCNN_HOST_DEVICE tquat operator+(const tquat& a, const tquat& b) { return {a.w + b.w, a.x + b.x, a.y + b.y, a.z + b.z}; } template TCNN_HOST_DEVICE tquat operator-(const tquat& a, const tquat& b) { return {a.w - b.w, a.x - b.x, a.y - b.y, a.z - b.z}; } template TCNN_HOST_DEVICE tquat operator*(T a, const tquat& b) { return {a * b.w, a * b.x, a * b.y, a * b.z}; } template TCNN_HOST_DEVICE tquat operator*(const tquat& a, T b) { return {a.w * b, a.x * b, a.y * b, a.z * b}; } template TCNN_HOST_DEVICE tquat operator/(const tquat& a, T b) { return {a.w / b, a.x / b, a.y / b, a.z / b}; } template TCNN_HOST_DEVICE T dot(const tquat& a, const tquat& b) { return (a.w * b.w + a.x * b.x) + (a.y * b.y + a.z * b.z); } template TCNN_HOST_DEVICE T length2(const tquat& a) { return dot(a, a); } template TCNN_HOST_DEVICE T length(const tquat& a) { return sqrt(length2(a)); } template TCNN_HOST_DEVICE tquat mix(const tquat& a, const tquat& b, T t) { return a * ((T)1 - t) + b * t; } template TCNN_HOST_DEVICE tquat normalize(const tquat& a) { T len = length(a); if (len <= (T)0) { return {(T)1, (T)0, (T)0, (T)0}; } return a / len; } template TCNN_HOST_DEVICE tquat cross(const tquat& a, const tquat& b) { return { a.w * b.w - a.x * b.x - a.y * b.y - a.z * b.z, a.w * b.x + a.x * b.w + a.y * b.z - a.z * b.y, a.w * b.y + a.y * b.w + a.z * b.x - a.x * b.z, a.w * b.z + a.z * b.w + a.x * b.y - a.y * b.x }; } template TCNN_HOST_DEVICE tquat slerp(const tquat& x, const tquat& y, T t) { tquat z = y; T cos_theta = dot(x, y); // If cos_theta < 0, the interpolation will take the long way around the sphere. // To fix this, one quat must be negated. if (cos_theta < (T)0) { z = -y; cos_theta = -cos_theta; } // Perform a linear interpolation when cos_theta is close to 1 to avoid side effect of sin(angle) becoming a zero denominator if (cos_theta > (T)1 - std::numeric_limits::epsilon()) { return mix(x, z, t); } else { // Essential Mathematics, page 467 T angle = acos(cos_theta); return (sin(((T)1 - t) * angle) * x + sin(t * angle) * z) / sin(angle); } } template TCNN_HOST_DEVICE T angle(const tquat& x) { return acos(clamp(x.w, (T)-1, (T)1)) * (T)2; } template TCNN_HOST_DEVICE tvec axis(const tquat& x) { const T tmp1 = (T)1 - x.w * x.w; if (tmp1 <= (T)0) { return {(T)0, (T)0, (T)1}; } const T tmp2 = (T)1 / sqrt(tmp1); return {x.x * tmp2, x.y * tmp2, x.z * tmp2}; } template TCNN_HOST_DEVICE tmat to_mat3(const tquat& q) { T qxx = q.x * q.x, qyy = q.y * q.y, qzz = q.z * q.z; T qxz = q.x * q.z, qxy = q.x * q.y, qyz = q.y * q.z; T qwx = q.w * q.x, qwy = q.w * q.y, qwz = q.w * q.z; return { (T)1 - (T)2 * (qyy + qzz), (T)2 * (qxy + qwz), (T)2 * (qxz - qwy), (T)2 * (qxy - qwz), (T)1 - (T)2 * (qxx + qzz), (T)2 * (qyz + qwx), (T)2 * (qxz + qwy), (T)2 * (qyz - qwx), (T)1 - (T)2 * (qxx + qyy), }; } template TCNN_HOST_DEVICE tmat slerp(const tmat& a, const tmat& b, float t) { return to_mat3(normalize(slerp(normalize(tquat(a)), normalize(tquat(b)), t))); } template TCNN_HOST_DEVICE tvec rotvec(const tmat& mat) { tquat tmp = mat; return axis(tmp) * angle(tmp); } using quat = tquat; }
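
// Example (illustrative): tquat mainly serves as an intermediate for
// interpolating rotations, e.g.
//
//   tcnn::mat3 a = tcnn::rotmat(tcnn::vec3{0.1f, 0.2f, 0.3f});
//   tcnn::mat3 b = tcnn::rotmat(tcnn::vec3{0.0f, 1.0f, 0.0f});
//   tcnn::mat3 halfway = tcnn::slerp(a, b, 0.5f); // quaternion slerp under the hood
//
// rotvec() performs the inverse mapping of rotmat(): it recovers the axis-angle
// vector of a rotation matrix.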