// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include "common.h"
#include "texture.h"
//------------------------------------------------------------------------
// Memory access and math helpers.
static __device__ __forceinline__ void accum_from_mem(float* a, int s, float b, float c) { a[0] += b * c; }
static __device__ __forceinline__ void accum_from_mem(float* a, int s, float2 b, float c) { a[0] += b.x * c; a[s] += b.y * c; }
static __device__ __forceinline__ void accum_from_mem(float* a, int s, float4 b, float c) { a[0] += b.x * c; a[s] += b.y * c; a[2*s] += b.z * c; a[3*s] += b.w * c; }
static __device__ __forceinline__ void accum_to_mem(float& a, float* b, int s) { a += b[0]; }
static __device__ __forceinline__ void accum_to_mem(float2& a, float* b, int s) { float2 v = a; v.x += b[0]; v.y += b[s]; a = v; }
static __device__ __forceinline__ void accum_to_mem(float4& a, float* b, int s) { float4 v = a; v.x += b[0]; v.y += b[s]; v.z += b[2*s]; v.w += b[3*s]; a = v; }
static __device__ __forceinline__ bool isfinite_vec3(const float3& a) { return isfinite(a.x) && isfinite(a.y) && isfinite(a.z); }
static __device__ __forceinline__ bool isfinite_vec4(const float4& a) { return isfinite(a.x) && isfinite(a.y) && isfinite(a.z) && isfinite(a.w); }
template<class T> static __device__ __forceinline__ T lerp (const T& a, const T& b, float c) { return a + c * (b - a); }
template<class T> static __device__ __forceinline__ T bilerp(const T& a, const T& b, const T& c, const T& d, const float2& e) { return lerp(lerp(a, b, e.x), lerp(c, d, e.x), e.y); }
//------------------------------------------------------------------------
// Cube map wrapping for smooth filtering across edges and corners. At a corner,
// one of the four sampled texel addresses is marked invalid (negative). For correct
// interpolation, the missing texel must take the average color of the other three.
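// As decoded in wrapCubeMap() below, each 32-bit entry of c_cubeWrapMask1 packs, for one
// (face, border-case) pair, four 2-bit source selectors for the x coordinates (bits 0-7),
// four for the y coordinates (bits 8-15), and four 4-bit face indices stored as face+1
// (bits 16-31). Selector 0 picks border coordinate 0 (possibly flipped to w-1 below),
// 1 picks ix, 2 picks iy; a zero face nibble marks the missing corner texel (negative address).
// c_cubeWrapMask2 holds the per-case flip bits for the eight output coordinates.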
static __constant__ uint32_t c_cubeWrapMask1[48] =
{
0x1530a440, 0x1133a550, 0x6103a110, 0x1515aa44, 0x6161aa11, 0x40154a04, 0x44115a05, 0x04611a01,
0x2630a440, 0x2233a550, 0x5203a110, 0x2626aa44, 0x5252aa11, 0x40264a04, 0x44225a05, 0x04521a01,
0x32608064, 0x3366a055, 0x13062091, 0x32328866, 0x13132299, 0x50320846, 0x55330a55, 0x05130219,
0x42508064, 0x4455a055, 0x14052091, 0x42428866, 0x14142299, 0x60420846, 0x66440a55, 0x06140219,
0x5230a044, 0x5533a055, 0x1503a011, 0x5252aa44, 0x1515aa11, 0x40520a44, 0x44550a55, 0x04150a11,
0x6130a044, 0x6633a055, 0x2603a011, 0x6161aa44, 0x2626aa11, 0x40610a44, 0x44660a55, 0x04260a11,
};
static __constant__ uint8_t c_cubeWrapMask2[48] =
{
0x26, 0x33, 0x11, 0x05, 0x00, 0x09, 0x0c, 0x04, 0x04, 0x00, 0x00, 0x05, 0x00, 0x81, 0xc0, 0x40,
0x02, 0x03, 0x09, 0x00, 0x0a, 0x00, 0x00, 0x02, 0x64, 0x30, 0x90, 0x55, 0xa0, 0x99, 0xcc, 0x64,
0x24, 0x30, 0x10, 0x05, 0x00, 0x01, 0x00, 0x00, 0x06, 0x03, 0x01, 0x05, 0x00, 0x89, 0xcc, 0x44,
};
static __device__ __forceinline__ int4 wrapCubeMap(int face, int ix0, int ix1, int iy0, int iy1, int w)
{
// Calculate case number.
int cx = (ix0 < 0) ? 0 : (ix1 >= w) ? 2 : 1;
int cy = (iy0 < 0) ? 0 : (iy1 >= w) ? 6 : 3;
int c = cx + cy;
if (c >= 5)
c--;
c = (face << 3) + c;
// Compute coordinates and faces.
unsigned int m = c_cubeWrapMask1[c];
int x0 = (m >> 0) & 3; x0 = (x0 == 0) ? 0 : (x0 == 1) ? ix0 : iy0;
int x1 = (m >> 2) & 3; x1 = (x1 == 0) ? 0 : (x1 == 1) ? ix1 : iy0;
int x2 = (m >> 4) & 3; x2 = (x2 == 0) ? 0 : (x2 == 1) ? ix0 : iy1;
int x3 = (m >> 6) & 3; x3 = (x3 == 0) ? 0 : (x3 == 1) ? ix1 : iy1;
int y0 = (m >> 8) & 3; y0 = (y0 == 0) ? 0 : (y0 == 1) ? ix0 : iy0;
int y1 = (m >> 10) & 3; y1 = (y1 == 0) ? 0 : (y1 == 1) ? ix1 : iy0;
int y2 = (m >> 12) & 3; y2 = (y2 == 0) ? 0 : (y2 == 1) ? ix0 : iy1;
int y3 = (m >> 14) & 3; y3 = (y3 == 0) ? 0 : (y3 == 1) ? ix1 : iy1;
int f0 = ((m >> 16) & 15) - 1;
int f1 = ((m >> 20) & 15) - 1;
int f2 = ((m >> 24) & 15) - 1;
int f3 = ((m >> 28) ) - 1;
// Flips.
unsigned int f = c_cubeWrapMask2[c];
int w1 = w - 1;
if (f & 0x01) x0 = w1 - x0;
if (f & 0x02) x1 = w1 - x1;
if (f & 0x04) x2 = w1 - x2;
if (f & 0x08) x3 = w1 - x3;
if (f & 0x10) y0 = w1 - y0;
if (f & 0x20) y1 = w1 - y1;
if (f & 0x40) y2 = w1 - y2;
if (f & 0x80) y3 = w1 - y3;
// Done.
int4 tcOut;
tcOut.x = x0 + (y0 + f0 * w) * w;
tcOut.y = x1 + (y1 + f1 * w) * w;
tcOut.z = x2 + (y2 + f2 * w) * w;
tcOut.w = x3 + (y3 + f3 * w) * w;
return tcOut;
}
//------------------------------------------------------------------------
// Cube map indexing and gradient functions.
// Map a 3D lookup vector into (s,t) face coordinates (returned in the first
// two parameters) and a face index.
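// Face indices follow the usual cube map order (+x=0, -x=1, +y=2, -y=3, +z=4, -z=5),
// and the per-face (s,t) orientation appears to match the standard OpenGL cube map convention.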
static __device__ __forceinline__ int indexCubeMap(float& x, float& y, float z)
{
float ax = fabsf(x);
float ay = fabsf(y);
float az = fabsf(z);
int idx;
float c;
if (az > fmaxf(ax, ay)) { idx = 4; c = z; }
else if (ay > ax) { idx = 2; c = y; y = z; }
else { idx = 0; c = x; x = z; }
if (c < 0.f) idx += 1;
float m = __frcp_rz(fabsf(c)) * .5;
float m0 = __uint_as_float(__float_as_uint(m) ^ ((0x21u >> idx) << 31));
float m1 = (idx != 2) ? -m : m;
x = x * m0 + .5;
y = y * m1 + .5;
if (!isfinite(x) || !isfinite(y))
return -1; // Invalid uv.
x = fminf(fmaxf(x, 0.f), 1.f);
y = fminf(fmaxf(y, 0.f), 1.f);
return idx;
}
// Based on dA/d{s,t}, compute dA/d{x,y,z} at a given 3D lookup vector.
static __device__ __forceinline__ float3 indexCubeMapGrad(float3 uv, float gu, float gv)
{
float ax = fabsf(uv.x);
float ay = fabsf(uv.y);
float az = fabsf(uv.z);
int idx;
float c;
float c0 = gu;
float c1 = gv;
if (az > fmaxf(ax, ay)) { idx = 0x10; c = uv.z; c0 *= uv.x; c1 *= uv.y; }
else if (ay > ax) { idx = 0x04; c = uv.y; c0 *= uv.x; c1 *= uv.z; }
else { idx = 0x01; c = uv.x; c0 *= uv.z; c1 *= uv.y; }
if (c < 0.f) idx += idx;
float m = __frcp_rz(fabsf(c));
c0 = (idx & 0x34) ? -c0 : c0;
c1 = (idx & 0x2e) ? -c1 : c1;
float gl = (c0 + c1) * m;
float gx = (idx & 0x03) ? gl : (idx & 0x20) ? -gu : gu;
float gy = (idx & 0x0c) ? gl : -gv;
float gz = (idx & 0x30) ? gl : (idx & 0x03) ? gu : gv;
gz = (idx & 0x09) ? -gz : gz;
float3 res = make_float3(gx, gy, gz) * (m * .5f);
if (!isfinite_vec3(res))
return make_float3(0.f, 0.f, 0.f); // Invalid uv.
return res;
}
// Based on dL/d(d{s,t}/d{X,Y}), compute dL/d(d{x,y,z}/d{X,Y}). This is just two
// indexCubeMapGrad() functions rolled together.
static __device__ __forceinline__ void indexCubeMapGrad4(float3 uv, float4 dw, float3& g0, float3& g1)
{
float ax = fabsf(uv.x);
float ay = fabsf(uv.y);
float az = fabsf(uv.z);
int idx;
float c, c0, c1;
if (az > fmaxf(ax, ay)) { idx = 0x10; c = uv.z; c0 = uv.x; c1 = uv.y; }
else if (ay > ax) { idx = 0x04; c = uv.y; c0 = uv.x; c1 = uv.z; }
else { idx = 0x01; c = uv.x; c0 = uv.z; c1 = uv.y; }
if (c < 0.f) idx += idx;
float m = __frcp_rz(fabsf(c));
c0 = (idx & 0x34) ? -c0 : c0;
c1 = (idx & 0x2e) ? -c1 : c1;
float gl0 = (dw.x * c0 + dw.z * c1) * m;
float gl1 = (dw.y * c0 + dw.w * c1) * m;
float gx0 = (idx & 0x03) ? gl0 : (idx & 0x20) ? -dw.x : dw.x;
float gx1 = (idx & 0x03) ? gl1 : (idx & 0x20) ? -dw.y : dw.y;
float gy0 = (idx & 0x0c) ? gl0 : -dw.z;
float gy1 = (idx & 0x0c) ? gl1 : -dw.w;
float gz0 = (idx & 0x30) ? gl0 : (idx & 0x03) ? dw.x : dw.z;
float gz1 = (idx & 0x30) ? gl1 : (idx & 0x03) ? dw.y : dw.w;
if (idx & 0x09)
{
gz0 = -gz0;
gz1 = -gz1;
}
g0 = make_float3(gx0, gy0, gz0) * (m * .5f);
g1 = make_float3(gx1, gy1, gz1) * (m * .5f);
if (!isfinite_vec3(g0) || !isfinite_vec3(g1))
{
g0 = make_float3(0.f, 0.f, 0.f); // Invalid uv.
g1 = make_float3(0.f, 0.f, 0.f);
}
}
// Compute d{s,t}/d{X,Y} based on d{x,y,z}/d{X,Y} at a given 3D lookup vector.
// Result is (ds/dX, ds/dY, dt/dX, dt/dY).
static __device__ __forceinline__ float4 indexCubeMapGradST(float3 uv, float3 dvdX, float3 dvdY)
{
float ax = fabsf(uv.x);
float ay = fabsf(uv.y);
float az = fabsf(uv.z);
int idx;
float c, gu, gv;
if (az > fmaxf(ax, ay)) { idx = 0x10; c = uv.z; gu = uv.x; gv = uv.y; }
else if (ay > ax) { idx = 0x04; c = uv.y; gu = uv.x; gv = uv.z; }
else { idx = 0x01; c = uv.x; gu = uv.z; gv = uv.y; }
if (c < 0.f) idx += idx;
if (idx & 0x09)
{
dvdX.z = -dvdX.z;
dvdY.z = -dvdY.z;
}
float m = __frcp_rz(fabsf(c));
float dm = m * .5f;
float mm = m * dm;
gu *= (idx & 0x34) ? -mm : mm;
gv *= (idx & 0x2e) ? -mm : mm;
float4 res;
if (idx & 0x03)
{
res = make_float4(gu * dvdX.x + dm * dvdX.z,
gu * dvdY.x + dm * dvdY.z,
gv * dvdX.x - dm * dvdX.y,
gv * dvdY.x - dm * dvdY.y);
}
else if (idx & 0x0c)
{
res = make_float4(gu * dvdX.y + dm * dvdX.x,
gu * dvdY.y + dm * dvdY.x,
gv * dvdX.y + dm * dvdX.z,
gv * dvdY.y + dm * dvdY.z);
}
else // (idx & 0x30)
{
res = make_float4(gu * dvdX.z + copysignf(dm, c) * dvdX.x,
gu * dvdY.z + copysignf(dm, c) * dvdY.x,
gv * dvdX.z - dm * dvdX.y,
gv * dvdY.z - dm * dvdY.y);
}
if (!isfinite_vec4(res))
return make_float4(0.f, 0.f, 0.f, 0.f);
return res;
}
// Compute d(d{s,t}/d{X,Y})/d{x,y,z}, i.e., how the pixel derivatives of 2D face
// coordinates change w.r.t. 3D texture coordinate vector, returned as follows:
// | d(ds/dX)/dx d(ds/dY)/dx d(dt/dX)/dx d(dt/dY)/dx |
// | d(ds/dX)/dy d(ds/dY)/dy d(dt/dX)/dy d(dt/dY)/dy |
// | d(ds/dX)/dz d(ds/dY)/dz d(dt/dX)/dz d(dt/dY)/dz |
static __device__ __forceinline__ void indexCubeMapGrad2(float3 uv, float3 dvdX, float3 dvdY, float4& dx, float4& dy, float4& dz)
{
float ax = fabsf(uv.x);
float ay = fabsf(uv.y);
float az = fabsf(uv.z);
int idx;
float c, gu, gv;
if (az > fmaxf(ax, ay)) { idx = 0x10; c = uv.z; gu = uv.x; gv = uv.y; }
else if (ay > ax) { idx = 0x04; c = uv.y; gu = uv.x; gv = uv.z; }
else { idx = 0x01; c = uv.x; gu = uv.z; gv = uv.y; }
if (c < 0.f) idx += idx;
if (idx & 0x09)
{
dvdX.z = -dvdX.z;
dvdY.z = -dvdY.z;
}
float m = __frcp_rz(c);
float dm = -m * fabsf(m) * .5;
float mm = m * m * .5;
float mu = (idx & 0x34) ? -mm : mm;
float mv = (idx & 0x2e) ? -mm : mm;
gu *= -2.0 * m * mu;
gv *= -2.0 * m * mv;
if (idx & 0x03)
{
dx.x = gu * dvdX.x + dm * dvdX.z;
dx.y = gu * dvdY.x + dm * dvdY.z;
dx.z = gv * dvdX.x - dm * dvdX.y;
dx.w = gv * dvdY.x - dm * dvdY.y;
dy.x = 0.f;
dy.y = 0.f;
dy.z = mv * dvdX.x;
dy.w = mv * dvdY.x;
dz.x = mu * dvdX.x;
dz.y = mu * dvdY.x;
dz.z = 0.f;
dz.w = 0.f;
}
else if (idx & 0x0c)
{
dx.x = mu * dvdX.y;
dx.y = mu * dvdY.y;
dx.z = 0.f;
dx.w = 0.f;
dy.x = gu * dvdX.y + dm * dvdX.x;
dy.y = gu * dvdY.y + dm * dvdY.x;
dy.z = gv * dvdX.y + dm * dvdX.z;
dy.w = gv * dvdY.y + dm * dvdY.z;
dz.x = 0.f;
dz.y = 0.f;
dz.z = mv * dvdX.y;
dz.w = mv * dvdY.y;
}
else // (idx & 0x30)
{
dx.x = mu * dvdX.z;
dx.y = mu * dvdY.z;
dx.z = 0.f;
dx.w = 0.f;
dy.x = 0.f;
dy.y = 0.f;
dy.z = mv * dvdX.z;
dy.w = mv * dvdY.z;
dz.x = gu * dvdX.z - fabsf(dm) * dvdX.x;
dz.y = gu * dvdY.z - fabsf(dm) * dvdY.x;
dz.z = gv * dvdX.z - dm * dvdX.y;
dz.w = gv * dvdY.z - dm * dvdY.y;
}
}
//------------------------------------------------------------------------
// General texture indexing.
template <bool CUBE_MODE>
static __device__ __forceinline__ int indexTextureNearest(const TextureKernelParams& p, float3 uv, int tz)
{
int w = p.texWidth;
int h = p.texHeight;
float u = uv.x;
float v = uv.y;
// Cube map indexing.
if (CUBE_MODE)
{
// No wrap. Fold face index into tz right away.
int idx = indexCubeMap(u, v, uv.z); // Rewrites u, v.
if (idx < 0)
return -1; // Invalid uv.
tz = 6 * tz + idx;
}
else
{
// Handle boundary.
if (p.boundaryMode == TEX_BOUNDARY_MODE_WRAP)
{
u = u - (float)__float2int_rd(u);
v = v - (float)__float2int_rd(v);
}
}
u = u * (float)w;
v = v * (float)h;
int iu = __float2int_rd(u);
int iv = __float2int_rd(v);
// In zero boundary mode, return texture address -1.
if (!CUBE_MODE && p.boundaryMode == TEX_BOUNDARY_MODE_ZERO)
{
if (iu < 0 || iu >= w || iv < 0 || iv >= h)
return -1;
}
// Otherwise clamp and calculate the coordinate properly.
iu = min(max(iu, 0), w-1);
iv = min(max(iv, 0), h-1);
return iu + w * (iv + tz * h);
}
template <bool CUBE_MODE>
static __device__ __forceinline__ float2 indexTextureLinear(const TextureKernelParams& p, float3 uv, int tz, int4& tcOut, int level)
{
// Mip level size.
int2 sz = mipLevelSize(p, level);
int w = sz.x;
int h = sz.y;
// Compute texture-space u, v.
float u = uv.x;
float v = uv.y;
bool clampU = false;
bool clampV = false;
// Cube map indexing.
int face = 0;
if (CUBE_MODE)
{
        // Neither clamp nor wrap.
face = indexCubeMap(u, v, uv.z); // Rewrites u, v.
if (face < 0)
{
tcOut.x = tcOut.y = tcOut.z = tcOut.w = -1; // Invalid uv.
return make_float2(0.f, 0.f);
}
u = u * (float)w - 0.5f;
v = v * (float)h - 0.5f;
}
else
{
if (p.boundaryMode == TEX_BOUNDARY_MODE_WRAP)
{
// Wrap.
u = u - (float)__float2int_rd(u);
v = v - (float)__float2int_rd(v);
}
// Move to texel space.
u = u * (float)w - 0.5f;
v = v * (float)h - 0.5f;
if (p.boundaryMode == TEX_BOUNDARY_MODE_CLAMP)
{
// Clamp to center of edge texels.
u = fminf(fmaxf(u, 0.f), w - 1.f);
v = fminf(fmaxf(v, 0.f), h - 1.f);
clampU = (u == 0.f || u == w - 1.f);
clampV = (v == 0.f || v == h - 1.f);
}
}
// Compute texel coordinates and weights.
int iu0 = __float2int_rd(u);
int iv0 = __float2int_rd(v);
    int iu1 = iu0 + (clampU ? 0 : 1); // Ensure zero u/v gradients when clamped.
int iv1 = iv0 + (clampV ? 0 : 1);
u -= (float)iu0;
v -= (float)iv0;
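    // u, v are now the fractional offsets inside the 2x2 texel footprint, i.e. the
    // bilinear weights returned to the caller.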
// Cube map wrapping.
bool cubeWrap = CUBE_MODE && (iu0 < 0 || iv0 < 0 || iu1 >= w || iv1 >= h);
if (cubeWrap)
{
tcOut = wrapCubeMap(face, iu0, iu1, iv0, iv1, w);
tcOut += 6 * tz * w * h; // Bring in tz.
return make_float2(u, v); // Done.
}
// Fold cube map face into tz.
if (CUBE_MODE)
tz = 6 * tz + face;
// Wrap overflowing texel indices.
if (!CUBE_MODE && p.boundaryMode == TEX_BOUNDARY_MODE_WRAP)
{
if (iu0 < 0) iu0 += w;
if (iv0 < 0) iv0 += h;
if (iu1 >= w) iu1 -= w;
if (iv1 >= h) iv1 -= h;
}
// Coordinates with tz folded in.
int iu0z = iu0 + tz * w * h;
int iu1z = iu1 + tz * w * h;
tcOut.x = iu0z + w * iv0;
tcOut.y = iu1z + w * iv0;
tcOut.z = iu0z + w * iv1;
tcOut.w = iu1z + w * iv1;
// Invalidate texture addresses outside unit square if we are in zero mode.
if (!CUBE_MODE && p.boundaryMode == TEX_BOUNDARY_MODE_ZERO)
{
bool iu0_out = (iu0 < 0 || iu0 >= w);
bool iu1_out = (iu1 < 0 || iu1 >= w);
bool iv0_out = (iv0 < 0 || iv0 >= h);
bool iv1_out = (iv1 < 0 || iv1 >= h);
if (iu0_out || iv0_out) tcOut.x = -1;
if (iu1_out || iv0_out) tcOut.y = -1;
if (iu0_out || iv1_out) tcOut.z = -1;
if (iu1_out || iv1_out) tcOut.w = -1;
}
// All done.
return make_float2(u, v);
}
//------------------------------------------------------------------------
// Mip level calculation.
template <bool CUBE_MODE, bool BIAS_ONLY, int FILTER_MODE>
static __device__ __forceinline__ void calculateMipLevel(int& level0, int& level1, float& flevel, const TextureKernelParams& p, int pidx, float3 uv, float4* pdw, float3* pdfdv)
{
// Do nothing if mips not in use.
if (FILTER_MODE == TEX_MODE_NEAREST || FILTER_MODE == TEX_MODE_LINEAR)
return;
// Determine mip level based on UV pixel derivatives. If no derivatives are given (mip level bias only), leave as zero.
if (!BIAS_ONLY)
{
// Get pixel derivatives of texture coordinates.
float4 uvDA;
float3 dvdX, dvdY; // Gradients use these later.
if (CUBE_MODE)
{
// Fetch.
float2 d0 = ((const float2*)p.uvDA)[3 * pidx + 0];
float2 d1 = ((const float2*)p.uvDA)[3 * pidx + 1];
float2 d2 = ((const float2*)p.uvDA)[3 * pidx + 2];
// Map d{x,y,z}/d{X,Y} into d{s,t}/d{X,Y}.
dvdX = make_float3(d0.x, d1.x, d2.x); // d{x,y,z}/dX
dvdY = make_float3(d0.y, d1.y, d2.y); // d{x,y,z}/dY
uvDA = indexCubeMapGradST(uv, dvdX, dvdY); // d{s,t}/d{X,Y}
}
else
{
// Fetch.
uvDA = ((const float4*)p.uvDA)[pidx];
}
// Scaling factors.
float uscl = p.texWidth;
float vscl = p.texHeight;
// d[s,t]/d[X,Y].
float dsdx = uvDA.x * uscl;
float dsdy = uvDA.y * uscl;
float dtdx = uvDA.z * vscl;
float dtdy = uvDA.w * vscl;
// Calculate footprint axis lengths.
float A = dsdx*dsdx + dtdx*dtdx;
float B = dsdy*dsdy + dtdy*dtdy;
float C = dsdx*dsdy + dtdx*dtdy;
float l2b = 0.5 * (A + B);
float l2n = 0.25 * (A-B)*(A-B) + C*C;
float l2a = sqrt(l2n);
float lenMinorSqr = fmaxf(0.0, l2b - l2a);
float lenMajorSqr = l2b + l2a;
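        // A = |d(s,t)/dX|^2, B = |d(s,t)/dY|^2 and C = <d(s,t)/dX, d(s,t)/dY> in texel units.
        // lenMinorSqr and lenMajorSqr are the eigenvalues (A+B)/2 -/+ sqrt((A-B)^2/4 + C^2) of the
        // Gram matrix [[A, C], [C, B]], i.e. the squared minor/major axis lengths of the pixel footprint.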
// Footprint vs. mip level gradient.
if (pdw && FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR)
{
float dw = 0.72134752f / (l2n + l2a * l2b); // Constant is 0.5/ln(2).
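            // Note: l2n = l2a^2, so l2n + l2a*l2b = l2a*(l2a + l2b) = l2a*lenMajorSqr, and thus
            // dw*l2a = 0.5/(ln(2)*lenMajorSqr) = d(flevel)/d(lenMajorSqr).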
float AB = dw * .5f * (A - B);
float Cw = dw * C;
float l2aw = dw * l2a;
float d_f_ddsdX = uscl * (dsdx * (l2aw + AB) + dsdy * Cw);
float d_f_ddsdY = uscl * (dsdy * (l2aw - AB) + dsdx * Cw);
float d_f_ddtdX = vscl * (dtdx * (l2aw + AB) + dtdy * Cw);
float d_f_ddtdY = vscl * (dtdy * (l2aw - AB) + dtdx * Cw);
float4 d_f_dw = make_float4(d_f_ddsdX, d_f_ddsdY, d_f_ddtdX, d_f_ddtdY);
if (!CUBE_MODE)
*pdw = isfinite_vec4(d_f_dw) ? d_f_dw : make_float4(0.f, 0.f, 0.f, 0.f);
// In cube maps, there is also a texture coordinate vs. mip level gradient.
            // Only output nonzero vectors if both are free of inf/NaN garbage.
if (CUBE_MODE)
{
float4 dx, dy, dz;
indexCubeMapGrad2(uv, dvdX, dvdY, dx, dy, dz);
float3 d_dsdX_dv = make_float3(dx.x, dy.x, dz.x);
float3 d_dsdY_dv = make_float3(dx.y, dy.y, dz.y);
float3 d_dtdX_dv = make_float3(dx.z, dy.z, dz.z);
float3 d_dtdY_dv = make_float3(dx.w, dy.w, dz.w);
float3 d_f_dv = make_float3(0.f, 0.f, 0.f);
d_f_dv += d_dsdX_dv * d_f_ddsdX;
d_f_dv += d_dsdY_dv * d_f_ddsdY;
d_f_dv += d_dtdX_dv * d_f_ddtdX;
d_f_dv += d_dtdY_dv * d_f_ddtdY;
bool finite = isfinite_vec4(d_f_dw) && isfinite_vec3(d_f_dv);
*pdw = finite ? d_f_dw : make_float4(0.f, 0.f, 0.f, 0.f);
*pdfdv = finite ? d_f_dv : make_float3(0.f, 0.f, 0.f);
}
}
// Finally, calculate mip level.
flevel = .5f * __log2f(lenMajorSqr); // May be inf/NaN, but clamp fixes it.
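        // Equivalently log2(lenMajor): one mip level per doubling of the major footprint axis length in texels.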
}
// Bias the mip level and clamp.
if (p.mipLevelBias)
flevel += p.mipLevelBias[pidx];
flevel = fminf(fmaxf(flevel, 0.f), (float)p.mipLevelMax);
// Calculate levels depending on filter mode.
level0 = __float2int_rd(flevel);
// Leave everything else at zero if flevel == 0 (magnification) or when in linear-mipmap-nearest mode.
if (FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR && flevel > 0.f)
{
level1 = min(level0 + 1, p.mipLevelMax);
flevel -= level0; // Fractional part. Zero if clamped on last level.
}
}
//------------------------------------------------------------------------
// Texel fetch and accumulator helpers that understand cube map corners.
template<class T>
static __device__ __forceinline__ void fetchQuad(T& a00, T& a10, T& a01, T& a11, const float* pIn, int4 tc, bool corner)
{
// For invalid cube map uv, tc will be all negative, and all texel values will be zero.
if (corner)
{
T avg = zero_value<T>();
if (tc.x >= 0) avg += (a00 = *((const T*)&pIn[tc.x]));
if (tc.y >= 0) avg += (a10 = *((const T*)&pIn[tc.y]));
if (tc.z >= 0) avg += (a01 = *((const T*)&pIn[tc.z]));
if (tc.w >= 0) avg += (a11 = *((const T*)&pIn[tc.w]));
avg *= 0.33333333f;
if (tc.x < 0) a00 = avg;
if (tc.y < 0) a10 = avg;
if (tc.z < 0) a01 = avg;
if (tc.w < 0) a11 = avg;
}
else
{
a00 = (tc.x >= 0) ? *((const T*)&pIn[tc.x]) : zero_value<T>();
a10 = (tc.y >= 0) ? *((const T*)&pIn[tc.y]) : zero_value<T>();
a01 = (tc.z >= 0) ? *((const T*)&pIn[tc.z]) : zero_value<T>();
a11 = (tc.w >= 0) ? *((const T*)&pIn[tc.w]) : zero_value<T>();
}
}
static __device__ __forceinline__ void accumQuad(float4 c, float* pOut, int level, int4 tc, bool corner, CA_TEMP_PARAM)
{
// For invalid cube map uv, tc will be all negative, and no accumulation will take place.
if (corner)
{
float cb;
if (tc.x < 0) cb = c.x;
if (tc.y < 0) cb = c.y;
if (tc.z < 0) cb = c.z;
if (tc.w < 0) cb = c.w;
cb *= 0.33333333f;
if (tc.x >= 0) caAtomicAddTexture(pOut, level, tc.x, c.x + cb);
if (tc.y >= 0) caAtomicAddTexture(pOut, level, tc.y, c.y + cb);
if (tc.z >= 0) caAtomicAddTexture(pOut, level, tc.z, c.z + cb);
if (tc.w >= 0) caAtomicAddTexture(pOut, level, tc.w, c.w + cb);
}
else
{
if (tc.x >= 0) caAtomicAddTexture(pOut, level, tc.x, c.x);
if (tc.y >= 0) caAtomicAddTexture(pOut, level, tc.y, c.y);
if (tc.z >= 0) caAtomicAddTexture(pOut, level, tc.z, c.z);
if (tc.w >= 0) caAtomicAddTexture(pOut, level, tc.w, c.w);
}
}
//------------------------------------------------------------------------
// Mip builder kernel.
template<class T, int C>
static __forceinline__ __device__ void MipBuildKernelTemplate(const TextureKernelParams p)
{
// Sizes.
int2 sz_in = mipLevelSize(p, p.mipLevelOut - 1);
int2 sz_out = mipLevelSize(p, p.mipLevelOut);
// Calculate pixel position.
int px = blockIdx.x * blockDim.x + threadIdx.x;
int py = blockIdx.y * blockDim.y + threadIdx.y;
int pz = blockIdx.z;
if (px >= sz_out.x || py >= sz_out.y)
return;
// Pixel indices.
int pidx_in0 = p.channels * (((px + sz_in.x * py) << 1) + (pz * sz_in.x * sz_in.y));
int pidx_in1 = pidx_in0 + p.channels * sz_in.x; // Next pixel down.
int pidx_out = p.channels * (px + sz_out.x * (py + sz_out.y * pz));
// Input and output pointers.
const float* pin = p.tex[p.mipLevelOut - 1];
float* pout = (float*)p.tex[p.mipLevelOut];
// Special case: Input texture height or width is 1.
if (sz_in.x == 1 || sz_in.y == 1)
{
if (sz_in.y == 1)
pidx_in1 = pidx_in0 + p.channels; // Next pixel on the right.
for (int i=0; i < p.channels; i += C)
{
T v0 = *((const T*)&pin[pidx_in0 + i]);
T v1 = *((const T*)&pin[pidx_in1 + i]);
T avg = .5f * (v0 + v1);
#if TEX_DEBUG_MIP_RETAIN_VARIANCE
avg = (avg - .5f) * 1.41421356f + .5f;
#endif
*((T*)&pout[pidx_out + i]) = avg;
}
return;
}
for (int i=0; i < p.channels; i += C)
{
T v0 = *((const T*)&pin[pidx_in0 + i]);
T v1 = *((const T*)&pin[pidx_in0 + i + p.channels]);
T v2 = *((const T*)&pin[pidx_in1 + i]);
T v3 = *((const T*)&pin[pidx_in1 + i + p.channels]);
T avg = .25f * (v0 + v1 + v2 + v3);
#if TEX_DEBUG_MIP_RETAIN_VARIANCE
avg = (avg - .5f) * 2.f + .5f;
#endif
*((T*)&pout[pidx_out + i]) = avg;
}
}
// Template specializations.
__global__ void MipBuildKernel1(const TextureKernelParams p) { MipBuildKernelTemplate<float, 1>(p); }
__global__ void MipBuildKernel2(const TextureKernelParams p) { MipBuildKernelTemplate<float2, 2>(p); }
__global__ void MipBuildKernel4(const TextureKernelParams p) { MipBuildKernelTemplate<float4, 4>(p); }
//------------------------------------------------------------------------
// Forward kernel.
template <class T, int C, bool CUBE_MODE, bool BIAS_ONLY, int FILTER_MODE>
static __forceinline__ __device__ void TextureFwdKernelTemplate(const TextureKernelParams p)
{
// Calculate pixel position.
int px = blockIdx.x * blockDim.x + threadIdx.x;
int py = blockIdx.y * blockDim.y + threadIdx.y;
int pz = blockIdx.z;
int tz = (p.texDepth == 1) ? 0 : pz;
if (px >= p.imgWidth || py >= p.imgHeight || pz >= p.n)
return;
// Pixel index.
int pidx = px + p.imgWidth * (py + p.imgHeight * pz);
// Output ptr.
float* pOut = p.out + pidx * p.channels;
// Get UV.
float3 uv;
if (CUBE_MODE)
uv = ((const float3*)p.uv)[pidx];
else
uv = make_float3(((const float2*)p.uv)[pidx], 0.f);
// Nearest mode.
if (FILTER_MODE == TEX_MODE_NEAREST)
{
int tc = indexTextureNearest<CUBE_MODE>(p, uv, tz);
tc *= p.channels;
const float* pIn = p.tex[0];
// Copy if valid tc, otherwise output zero.
for (int i=0; i < p.channels; i += C)
*((T*)&pOut[i]) = (tc >= 0) ? *((const T*)&pIn[tc + i]) : zero_value<T>();
return; // Exit.
}
// Calculate mip level. In 'linear' mode these will all stay zero.
float flevel = 0.f; // Fractional level.
int level0 = 0; // Discrete level 0.
int level1 = 0; // Discrete level 1.
calculateMipLevel<CUBE_MODE, BIAS_ONLY, FILTER_MODE>(level0, level1, flevel, p, pidx, uv, 0, 0);
// Get texel indices and pointer for level 0.
int4 tc0 = make_int4(0, 0, 0, 0);
float2 uv0 = indexTextureLinear<CUBE_MODE>(p, uv, tz, tc0, level0);
const float* pIn0 = p.tex[level0];
bool corner0 = CUBE_MODE && ((tc0.x | tc0.y | tc0.z | tc0.w) < 0);
tc0 *= p.channels;
// Bilinear fetch.
if (FILTER_MODE == TEX_MODE_LINEAR || FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_NEAREST)
{
// Interpolate.
for (int i=0; i < p.channels; i += C, tc0 += C)
{
T a00, a10, a01, a11;
fetchQuad<T>(a00, a10, a01, a11, pIn0, tc0, corner0);
*((T*)&pOut[i]) = bilerp(a00, a10, a01, a11, uv0);
}
return; // Exit.
}
// Get texel indices and pointer for level 1.
int4 tc1 = make_int4(0, 0, 0, 0);
float2 uv1 = indexTextureLinear<CUBE_MODE>(p, uv, tz, tc1, level1);
const float* pIn1 = p.tex[level1];
bool corner1 = CUBE_MODE && ((tc1.x | tc1.y | tc1.z | tc1.w) < 0);
tc1 *= p.channels;
// Trilinear fetch.
for (int i=0; i < p.channels; i += C, tc0 += C, tc1 += C)
{
// First level.
T a00, a10, a01, a11;
fetchQuad<T>(a00, a10, a01, a11, pIn0, tc0, corner0);
T a = bilerp(a00, a10, a01, a11, uv0);
// Second level unless in magnification mode.
if (flevel > 0.f)
{
T b00, b10, b01, b11;
fetchQuad<T>(b00, b10, b01, b11, pIn1, tc1, corner1);
T b = bilerp(b00, b10, b01, b11, uv1);
a = lerp(a, b, flevel); // Interpolate between levels.
}
// Write.
*((T*)&pOut[i]) = a;
}
}
// Template specializations.
__global__ void TextureFwdKernelNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, false, TEX_MODE_NEAREST>(p); }
__global__ void TextureFwdKernelNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, false, TEX_MODE_NEAREST>(p); }
__global__ void TextureFwdKernelNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, false, TEX_MODE_NEAREST>(p); }
__global__ void TextureFwdKernelLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, false, TEX_MODE_LINEAR>(p); }
__global__ void TextureFwdKernelLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, false, TEX_MODE_LINEAR>(p); }
__global__ void TextureFwdKernelLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, false, TEX_MODE_LINEAR>(p); }
__global__ void TextureFwdKernelLinearMipmapNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
__global__ void TextureFwdKernelLinearMipmapNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
__global__ void TextureFwdKernelLinearMipmapNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
__global__ void TextureFwdKernelLinearMipmapLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
__global__ void TextureFwdKernelLinearMipmapLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
__global__ void TextureFwdKernelLinearMipmapLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
__global__ void TextureFwdKernelCubeNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, false, TEX_MODE_NEAREST>(p); }
__global__ void TextureFwdKernelCubeNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, false, TEX_MODE_NEAREST>(p); }
__global__ void TextureFwdKernelCubeNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, false, TEX_MODE_NEAREST>(p); }
__global__ void TextureFwdKernelCubeLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, false, TEX_MODE_LINEAR>(p); }
__global__ void TextureFwdKernelCubeLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, false, TEX_MODE_LINEAR>(p); }
__global__ void TextureFwdKernelCubeLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, false, TEX_MODE_LINEAR>(p); }
__global__ void TextureFwdKernelCubeLinearMipmapNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
__global__ void TextureFwdKernelCubeLinearMipmapNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
__global__ void TextureFwdKernelCubeLinearMipmapNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
__global__ void TextureFwdKernelCubeLinearMipmapLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
__global__ void TextureFwdKernelCubeLinearMipmapLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
__global__ void TextureFwdKernelCubeLinearMipmapLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
__global__ void TextureFwdKernelLinearMipmapNearestBO1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
__global__ void TextureFwdKernelLinearMipmapNearestBO2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
__global__ void TextureFwdKernelLinearMipmapNearestBO4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
__global__ void TextureFwdKernelLinearMipmapLinearBO1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
__global__ void TextureFwdKernelLinearMipmapLinearBO2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
__global__ void TextureFwdKernelLinearMipmapLinearBO4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
__global__ void TextureFwdKernelCubeLinearMipmapNearestBO1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
__global__ void TextureFwdKernelCubeLinearMipmapNearestBO2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
__global__ void TextureFwdKernelCubeLinearMipmapNearestBO4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
__global__ void TextureFwdKernelCubeLinearMipmapLinearBO1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
__global__ void TextureFwdKernelCubeLinearMipmapLinearBO2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
__global__ void TextureFwdKernelCubeLinearMipmapLinearBO4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
//------------------------------------------------------------------------
// Gradient mip puller kernel.
template<class T, int C>
static __forceinline__ __device__ void MipGradKernelTemplate(const TextureKernelParams p)
{
// Calculate pixel position.
int px = blockIdx.x * blockDim.x + threadIdx.x;
int py = blockIdx.y * blockDim.y + threadIdx.y;
int pz = blockIdx.z;
if (px >= p.texWidth || py >= p.texHeight)
return;
// Number of wide elements.
int c = p.channels;
if (C == 2) c >>= 1;
if (C == 4) c >>= 2;
// Dynamically allocated shared memory for holding a texel.
extern __shared__ float s_texelAccum[];
int sharedOfs = threadIdx.x + threadIdx.y * blockDim.x;
int sharedStride = blockDim.x * blockDim.y;
# define TEXEL_ACCUM(_i) (s_texelAccum + (sharedOfs + (_i) * sharedStride))
// Clear the texel.
for (int i=0; i < p.channels; i++)
*TEXEL_ACCUM(i) = 0.f;
// Track texel position and accumulation weight over the mip stack.
int x = px;
int y = py;
float w = 1.f;
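    // Each mip texel averages 2 or 4 texels of the previous level, so the gradient reaching this
    // base-level texel from level n is scaled by 0.5 per dimension that was halved at each step.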
// Pull gradients from all levels.
int2 sz = mipLevelSize(p, 0); // Previous level size.
for (int level=1; level <= p.mipLevelMax; level++)
{
// Weight decay depends on previous level size.
if (sz.x > 1) w *= .5f;
if (sz.y > 1) w *= .5f;
// Current level size and coordinates.
sz = mipLevelSize(p, level);
x >>= 1;
y >>= 1;
T* pIn = (T*)(p.gradTex[level] + (x + sz.x * (y + sz.y * pz)) * p.channels);
for (int i=0; i < c; i++)
accum_from_mem(TEXEL_ACCUM(i * C), sharedStride, pIn[i], w);
}
// Add to main texture gradients.
T* pOut = (T*)(p.gradTex[0] + (px + p.texWidth * (py + p.texHeight * pz)) * p.channels);
for (int i=0; i < c; i++)
accum_to_mem(pOut[i], TEXEL_ACCUM(i * C), sharedStride);
}
// Template specializations.
__global__ void MipGradKernel1(const TextureKernelParams p) { MipGradKernelTemplate<float, 1>(p); }
__global__ void MipGradKernel2(const TextureKernelParams p) { MipGradKernelTemplate<float2, 2>(p); }
__global__ void MipGradKernel4(const TextureKernelParams p) { MipGradKernelTemplate<float4, 4>(p); }
//------------------------------------------------------------------------
// Gradient kernel.
template <bool CUBE_MODE, bool BIAS_ONLY, int FILTER_MODE>
static __forceinline__ __device__ void TextureGradKernelTemplate(const TextureKernelParams p)
{
// Temporary space for coalesced atomics.
CA_DECLARE_TEMP(TEX_GRAD_MAX_KERNEL_BLOCK_WIDTH * TEX_GRAD_MAX_KERNEL_BLOCK_HEIGHT);
// Calculate pixel position.
int px = blockIdx.x * blockDim.x + threadIdx.x;
int py = blockIdx.y * blockDim.y + threadIdx.y;
int pz = blockIdx.z;
int tz = (p.texDepth == 1) ? 0 : pz;
if (px >= p.imgWidth || py >= p.imgHeight || pz >= p.n)
return;
// Pixel index.
int pidx = px + p.imgWidth * (py + p.imgHeight * pz);
// Early exit if output gradients are zero.
const float* pDy = p.dy + pidx * p.channels;
unsigned int dmax = 0u;
if ((p.channels & 3) == 0)
{
for (int i=0; i < p.channels; i += 4)
{
uint4 dy = *((const uint4*)&pDy[i]);
dmax |= (dy.x | dy.y | dy.z | dy.w);
}
}
else
{
for (int i=0; i < p.channels; i++)
dmax |= __float_as_uint(pDy[i]);
}
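    // dmax ORs together the raw bit patterns of the incoming gradients; it reinterprets as 0.f
    // only when every component is +/-0.0f (or a denormal flushed to zero in the comparison),
    // i.e. when there is essentially nothing to propagate.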
// Store zeros and exit.
if (__uint_as_float(dmax) == 0.f)
{
if (CUBE_MODE)
{
if (FILTER_MODE != TEX_MODE_NEAREST)
((float3*)p.gradUV)[pidx] = make_float3(0.f, 0.f, 0.f);
if (FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR)
{
if (p.gradUVDA)
{
((float2*)p.gradUVDA)[3 * pidx + 0] = make_float2(0.f, 0.f);
((float2*)p.gradUVDA)[3 * pidx + 1] = make_float2(0.f, 0.f);
((float2*)p.gradUVDA)[3 * pidx + 2] = make_float2(0.f, 0.f);
}
if (p.gradMipLevelBias)
p.gradMipLevelBias[pidx] = 0.f;
}
}
else
{
if (FILTER_MODE != TEX_MODE_NEAREST)
((float2*)p.gradUV)[pidx] = make_float2(0.f, 0.f);
if (FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR)
{
if (p.gradUVDA)
((float4*)p.gradUVDA)[pidx] = make_float4(0.f, 0.f, 0.f, 0.f);
if (p.gradMipLevelBias)
p.gradMipLevelBias[pidx] = 0.f;
}
}
return;
}
// Get UV.
float3 uv;
if (CUBE_MODE)
uv = ((const float3*)p.uv)[pidx];
else
uv = make_float3(((const float2*)p.uv)[pidx], 0.f);
// Nearest mode - texture gradients only.
if (FILTER_MODE == TEX_MODE_NEAREST)
{
int tc = indexTextureNearest<CUBE_MODE>(p, uv, tz);
if (tc < 0)
return; // Outside texture.
tc *= p.channels;
float* pOut = p.gradTex[0];
// Accumulate texture gradients.
for (int i=0; i < p.channels; i++)
caAtomicAddTexture(pOut, 0, tc + i, pDy[i]);
return; // Exit.
}
// Calculate mip level. In 'linear' mode these will all stay zero.
float4 dw = make_float4(0.f, 0.f, 0.f, 0.f);
float3 dfdv = make_float3(0.f, 0.f, 0.f);
float flevel = 0.f; // Fractional level.
int level0 = 0; // Discrete level 0.
int level1 = 0; // Discrete level 1.
calculateMipLevel<CUBE_MODE, BIAS_ONLY, FILTER_MODE>(level0, level1, flevel, p, pidx, uv, &dw, &dfdv);
// UV gradient accumulators.
float gu = 0.f;
float gv = 0.f;
// Get texel indices and pointers for level 0.
int4 tc0 = make_int4(0, 0, 0, 0);
float2 uv0 = indexTextureLinear<CUBE_MODE>(p, uv, tz, tc0, level0);
const float* pIn0 = p.tex[level0];
float* pOut0 = p.gradTex[level0];
bool corner0 = CUBE_MODE && ((tc0.x | tc0.y | tc0.z | tc0.w) < 0);
tc0 *= p.channels;
// Texel weights.
float uv011 = uv0.x * uv0.y;
float uv010 = uv0.x - uv011;
float uv001 = uv0.y - uv011;
float uv000 = 1.f - uv0.x - uv001;
float4 tw0 = make_float4(uv000, uv010, uv001, uv011);
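    // tw0 = ((1-u)(1-v), u(1-v), (1-u)v, u*v): the standard bilinear weights of the four
    // level-0 texels, expanded so that the u*v product is computed only once.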
// Attribute weights.
int2 sz0 = mipLevelSize(p, level0);
float sclu0 = (float)sz0.x;
float sclv0 = (float)sz0.y;
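    // Bilinear sample: f(u,v) = a00*(1-u)*(1-v) + a10*u*(1-v) + a01*(1-u)*v + a11*u*v, so
    // df/du = (a10 - a00) + v*(a11 + a00 - a10 - a01) and df/dv = (a01 - a00) + u*(a11 + a00 - a10 - a01).
    // sclu0/sclv0 convert these texel-space derivatives back to the normalized uv in which gradients are stored.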
// Bilinear mode - texture and uv gradients.
if (FILTER_MODE == TEX_MODE_LINEAR || FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_NEAREST)
{
for (int i=0; i < p.channels; i++, tc0 += 1)
{
float dy = pDy[i];
accumQuad(tw0 * dy, pOut0, level0, tc0, corner0, CA_TEMP);
float a00, a10, a01, a11;
fetchQuad<float>(a00, a10, a01, a11, pIn0, tc0, corner0);
float ad = (a11 + a00 - a10 - a01);
gu += dy * ((a10 - a00) + uv0.y * ad) * sclu0;
gv += dy * ((a01 - a00) + uv0.x * ad) * sclv0;
}
// Store UV gradients and exit.
if (CUBE_MODE)
((float3*)p.gradUV)[pidx] = indexCubeMapGrad(uv, gu, gv);
else
((float2*)p.gradUV)[pidx] = make_float2(gu, gv);
return;
}
// Accumulate fractional mip level gradient.
float df = 0; // dL/df.
// Get texel indices and pointers for level 1.
int4 tc1 = make_int4(0, 0, 0, 0);
float2 uv1 = indexTextureLinear<CUBE_MODE>(p, uv, tz, tc1, level1);
const float* pIn1 = p.tex[level1];
float* pOut1 = p.gradTex[level1];
bool corner1 = CUBE_MODE && ((tc1.x | tc1.y | tc1.z | tc1.w) < 0);
tc1 *= p.channels;
// Texel weights.
float uv111 = uv1.x * uv1.y;
float uv110 = uv1.x - uv111;
float uv101 = uv1.y - uv111;
float uv100 = 1.f - uv1.x - uv101;
float4 tw1 = make_float4(uv100, uv110, uv101, uv111);
// Attribute weights.
int2 sz1 = mipLevelSize(p, level1);
float sclu1 = (float)sz1.x;
float sclv1 = (float)sz1.y;
// Trilinear mode.
for (int i=0; i < p.channels; i++, tc0 += 1, tc1 += 1)
{
float dy = pDy[i];
float dy0 = (1.f - flevel) * dy;
accumQuad(tw0 * dy0, pOut0, level0, tc0, corner0, CA_TEMP);
// UV gradients for first level.
float a00, a10, a01, a11;
fetchQuad<float>(a00, a10, a01, a11, pIn0, tc0, corner0);
float ad = (a11 + a00 - a10 - a01);
gu += dy0 * ((a10 - a00) + uv0.y * ad) * sclu0;
gv += dy0 * ((a01 - a00) + uv0.x * ad) * sclv0;
// Second level unless in magnification mode.
if (flevel > 0.f)
{
// Texture gradients for second level.
float dy1 = flevel * dy;
accumQuad(tw1 * dy1, pOut1, level1, tc1, corner1, CA_TEMP);
// UV gradients for second level.
float b00, b10, b01, b11;
fetchQuad<float>(b00, b10, b01, b11, pIn1, tc1, corner1);
float bd = (b11 + b00 - b10 - b01);
gu += dy1 * ((b10 - b00) + uv1.y * bd) * sclu1;
gv += dy1 * ((b01 - b00) + uv1.x * bd) * sclv1;
// Mip level gradient.
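            // out = lerp(a, b, flevel) = a + flevel*(b - a), so dL/dflevel accumulates dy*(b - a).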
float a = bilerp(a00, a10, a01, a11, uv0);
float b = bilerp(b00, b10, b01, b11, uv1);
df += (b-a) * dy;
}
}
// Store UV gradients.
if (CUBE_MODE)
((float3*)p.gradUV)[pidx] = indexCubeMapGrad(uv, gu, gv) + (dfdv * df);
else
((float2*)p.gradUV)[pidx] = make_float2(gu, gv);
// Store mip level bias gradient.
if (p.gradMipLevelBias)
p.gradMipLevelBias[pidx] = df;
// Store UV pixel differential gradients.
if (!BIAS_ONLY)
{
// Final gradients.
        dw *= df; // dL/d(d{s,t}/d{X,Y}) = df/d(d{s,t}/d{X,Y}) * dL/df.
// Store them.
if (CUBE_MODE)
{
            // Remap from dL/d(d{s,t}/d{X,Y}) to dL/d(d{x,y,z}/d{X,Y}).
float3 g0, g1;
indexCubeMapGrad4(uv, dw, g0, g1);
((float2*)p.gradUVDA)[3 * pidx + 0] = make_float2(g0.x, g1.x);
((float2*)p.gradUVDA)[3 * pidx + 1] = make_float2(g0.y, g1.y);
((float2*)p.gradUVDA)[3 * pidx + 2] = make_float2(g0.z, g1.z);
}
else
((float4*)p.gradUVDA)[pidx] = dw;
}
}
// Template specializations.
__global__ void TextureGradKernelNearest (const TextureKernelParams p) { TextureGradKernelTemplate<false, false, TEX_MODE_NEAREST>(p); }
__global__ void TextureGradKernelLinear (const TextureKernelParams p) { TextureGradKernelTemplate<false, false, TEX_MODE_LINEAR>(p); }
__global__ void TextureGradKernelLinearMipmapNearest (const TextureKernelParams p) { TextureGradKernelTemplate<false, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
__global__ void TextureGradKernelLinearMipmapLinear (const TextureKernelParams p) { TextureGradKernelTemplate<false, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
__global__ void TextureGradKernelCubeNearest (const TextureKernelParams p) { TextureGradKernelTemplate<true, false, TEX_MODE_NEAREST>(p); }
__global__ void TextureGradKernelCubeLinear (const TextureKernelParams p) { TextureGradKernelTemplate<true, false, TEX_MODE_LINEAR>(p); }
__global__ void TextureGradKernelCubeLinearMipmapNearest (const TextureKernelParams p) { TextureGradKernelTemplate<true, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
__global__ void TextureGradKernelCubeLinearMipmapLinear (const TextureKernelParams p) { TextureGradKernelTemplate<true, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
__global__ void TextureGradKernelLinearMipmapNearestBO (const TextureKernelParams p) { TextureGradKernelTemplate<false, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
__global__ void TextureGradKernelLinearMipmapLinearBO (const TextureKernelParams p) { TextureGradKernelTemplate<false, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
__global__ void TextureGradKernelCubeLinearMipmapNearestBO (const TextureKernelParams p) { TextureGradKernelTemplate<true, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
__global__ void TextureGradKernelCubeLinearMipmapLinearBO (const TextureKernelParams p) { TextureGradKernelTemplate<true, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
//------------------------------------------------------------------------