Spaces:

aelitta
/

BioMistral_gradio

Runtime error

App Files Files Community

BioMistral_gradio / llama-cpp-python /vendor /llama.cpp /ggml-cuda /vecdotq.cuh

aelitta

Upload folder using huggingface_hub

4bdb245 verified about 1 year ago

raw

history blame

45.2 kB

	#include "common.cuh"

	static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
	const uint16_t * x16 = (const uint16_t ) (x8 + sizeof(int) i32); // assume at least 2 byte alignment

	int x32 = 0;
	x32 \|= x16[0] << 0;
	x32 \|= x16[1] << 16;

	return x32;
	}

	static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
	const uint16_t * x16 = (const uint16_t ) (x8 + sizeof(int) i32); // assume at least 2 byte alignment

	int x32 = 0;
	x32 \|= x16[0] << 0;
	x32 \|= x16[1] << 16;

	return x32;
	}

	static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
	return ((const int ) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
	}

	static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
	return ((const int ) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
	}


	// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
	// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q

	#define VDR_Q4_0_Q8_1_MMVQ 2
	#define VDR_Q4_0_Q8_1_MMQ 4

	template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl(
	const int * v, const int * u, const float & d4, const half2 & ds8) {

	#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
	int sumi = 0;

	#pragma unroll
	for (int i = 0; i < vdr; ++i) {
	const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
	const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;

	// SIMD dot product of quantized values
	sumi = __dp4a(vi0, u[2*i+0], sumi);
	sumi = __dp4a(vi1, u[2*i+1], sumi);
	}

	const float2 ds8f = __half22float2(ds8);

	// second part effectively subtracts 8 from each quant value
	return d4 * (sumi * ds8f.x - (8vdr/QI4_0) ds8f.y);
	#else
	NO_DEVICE_CODE;
	#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
	}

	#define VDR_Q4_1_Q8_1_MMVQ 2
	#define VDR_Q4_1_Q8_1_MMQ 4

	template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl(
	const int * v, const int * u, const half2 & dm4, const half2 & ds8) {

	#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
	int sumi = 0;

	#pragma unroll
	for (int i = 0; i < vdr; ++i) {
	const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
	const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;

	// SIMD dot product of quantized values
	sumi = __dp4a(vi0, u[2*i+0], sumi);
	sumi = __dp4a(vi1, u[2*i+1], sumi);
	}

	#ifdef GGML_CUDA_F16
	const float2 tmp = __half22float2(__hmul2(dm4, ds8));
	const float d4d8 = tmp.x;
	const float m4s8 = tmp.y;
	#else
	const float2 dm4f = __half22float2(dm4);
	const float2 ds8f = __half22float2(ds8);
	const float d4d8 = dm4f.x * ds8f.x;
	const float m4s8 = dm4f.y * ds8f.y;
	#endif // GGML_CUDA_F16

	// scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
	return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
	#else
	NO_DEVICE_CODE;
	#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
	}

	#define VDR_Q5_0_Q8_1_MMVQ 2
	#define VDR_Q5_0_Q8_1_MMQ 4

	template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl(
	const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) {

	#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
	int sumi = 0;

	#pragma unroll
	for (int i = 0; i < vdr; ++i) {
	int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
	vi0 \|= (vh[i] << 4) & 0x00000010; // 0 -> 4
	vi0 \|= (vh[i] << 11) & 0x00001000; // 1 -> 12
	vi0 \|= (vh[i] << 18) & 0x00100000; // 2 -> 20
	vi0 \|= (vh[i] << 25) & 0x10000000; // 3 -> 28
	sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values

	int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
	vi1 \|= (vh[i] >> 12) & 0x00000010; // 16 -> 4
	vi1 \|= (vh[i] >> 5) & 0x00001000; // 17 -> 12
	vi1 \|= (vh[i] << 2) & 0x00100000; // 18 -> 20
	vi1 \|= (vh[i] << 9) & 0x10000000; // 19 -> 28
	sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
	}

	const float2 ds8f = __half22float2(ds8);

	// second part effectively subtracts 16 from each quant value
	return d5 * (sumi * ds8f.x - (16vdr/QI5_0) ds8f.y);
	#else
	NO_DEVICE_CODE;
	#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
	}

	#define VDR_Q5_1_Q8_1_MMVQ 2
	#define VDR_Q5_1_Q8_1_MMQ 4

	template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl(
	const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) {

	#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
	int sumi = 0;

	#pragma unroll
	for (int i = 0; i < vdr; ++i) {
	int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
	vi0 \|= (vh[i] << 4) & 0x00000010; // 0 -> 4
	vi0 \|= (vh[i] << 11) & 0x00001000; // 1 -> 12
	vi0 \|= (vh[i] << 18) & 0x00100000; // 2 -> 20
	vi0 \|= (vh[i] << 25) & 0x10000000; // 3 -> 28
	sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values

	int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
	vi1 \|= (vh[i] >> 12) & 0x00000010; // 16 -> 4
	vi1 \|= (vh[i] >> 5) & 0x00001000; // 17 -> 12
	vi1 \|= (vh[i] << 2) & 0x00100000; // 18 -> 20
	vi1 \|= (vh[i] << 9) & 0x10000000; // 19 -> 28
	sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
	}

	#ifdef GGML_CUDA_F16
	const float2 tmp = __half22float2(__hmul2(dm5, ds8));
	const float d5d8 = tmp.x;
	const float m5s8 = tmp.y;
	#else
	const float2 dm5f = __half22float2(dm5);
	const float2 ds8f = __half22float2(ds8);
	const float d5d8 = dm5f.x * ds8f.x;
	const float m5s8 = dm5f.y * ds8f.y;
	#endif // GGML_CUDA_F16

	// scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
	return sumi*d5d8 + m5s8 / (QI5_1 / vdr);

	#else
	NO_DEVICE_CODE;
	#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
	}

	#define VDR_Q8_0_Q8_1_MMVQ 2
	#define VDR_Q8_0_Q8_1_MMQ 8

	template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl(
	const int * v, const int * u, const float & d8_0, const float & d8_1) {

	#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
	int sumi = 0;

	#pragma unroll
	for (int i = 0; i < vdr; ++i) {
	// SIMD dot product of quantized values
	sumi = __dp4a(v[i], u[i], sumi);
	}

	return d8_0d8_1 sumi;
	#else
	NO_DEVICE_CODE;
	#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
	}

	template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl(
	const int * v, const int * u, const half2 & dm8, const half2 & ds8) {

	#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
	int sumi = 0;

	#pragma unroll
	for (int i = 0; i < vdr; ++i) {
	// SIMD dot product of quantized values
	sumi = __dp4a(v[i], u[i], sumi);
	}

	#ifdef GGML_CUDA_F16
	const float2 tmp = __half22float2(__hmul2(dm8, ds8));
	const float d8d8 = tmp.x;
	const float m8s8 = tmp.y;
	#else
	const float2 dm8f = __half22float2(dm8);
	const float2 ds8f = __half22float2(ds8);
	const float d8d8 = dm8f.x * ds8f.x;
	const float m8s8 = dm8f.y * ds8f.y;
	#endif // GGML_CUDA_F16

	// scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
	return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
	#else
	NO_DEVICE_CODE;
	#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
	}

	#define VDR_Q2_K_Q8_1_MMVQ 1
	#define VDR_Q2_K_Q8_1_MMQ 2

	// contiguous v/x values
	static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
	const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
	const half2 & dm2, const float * __restrict__ d8) {

	#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
	float sumf_d = 0.0f;
	float sumf_m = 0.0f;

	#pragma unroll
	for (int i = 0; i < QR2_K; ++i) {
	const int sc = scales[2*i];

	const int vi = (v >> (2*i)) & 0x03030303;

	sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product

	// fill int with 4x m
	int m = sc >> 4;
	m \|= m << 8;
	m \|= m << 16;
	sumf_m += d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
	}

	const float2 dm2f = __half22float2(dm2);

	return dm2f.xsumf_d - dm2f.ysumf_m;
	#else
	NO_DEVICE_CODE;
	#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
	}

	// contiguous u/y values
	static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
	const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
	const half2 & dm2, const float & d8) {

	#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
	int sumi_d = 0;
	int sumi_m = 0;

	#pragma unroll
	for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) {
	int sumi_d_sc = 0;

	const int sc = scales[i0 / (QI8_1/2)];

	// fill int with 4x m
	int m = sc >> 4;
	m \|= m << 8;
	m \|= m << 16;

	#pragma unroll
	for (int i = i0; i < i0 + QI8_1/2; ++i) {
	sumi_d_sc = __dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product
	sumi_m = __dp4a(m, u[i], sumi_m); // multiply sum of q8_1 values with m
	}

	sumi_d += sumi_d_sc * (sc & 0xF);
	}

	const float2 dm2f = __half22float2(dm2);

	return d8 * (dm2f.xsumi_d - dm2f.ysumi_m);
	#else
	NO_DEVICE_CODE;
	#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
	}

	#define VDR_Q3_K_Q8_1_MMVQ 1
	#define VDR_Q3_K_Q8_1_MMQ 2

	// contiguous v/x values
	static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
	const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
	const int & scale_offset, const float & d3, const float * __restrict__ d8) {

	#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
	float sumf = 0.0f;

	#pragma unroll
	for (int i = 0; i < QR3_K; ++i) {
	const int isc = scale_offset + 2*i;

	const int isc_low = isc % (QK_K/32);
	const int sc_shift_low = 4 * (isc / (QK_K/32));
	const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF;

	const int isc_high = isc % (QK_K/64);
	const int sc_shift_high = 2 * (isc / (QK_K/64));
	const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;

	const int sc = (sc_low \| sc_high) - 32;

	const int vil = (vl >> (2*i)) & 0x03030303;

	const int vih = ((vh >> i) << 2) & 0x04040404;

	const int vi = __vsubss4(vil, vih);

	sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
	}

	return d3 * sumf;
	#else
	NO_DEVICE_CODE;
	#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
	}

	// contiguous u/y values
	static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
	const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales,
	const float & d3, const float & d8) {

	#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
	int sumi = 0;

	#pragma unroll
	for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) {
	int sumi_sc = 0;

	for (int i = i0; i < i0 + QI8_1/2; ++i) {
	sumi_sc = __dp4a(v[i], u[i], sumi_sc); // SIMD dot product
	}

	sumi += sumi_sc * scales[i0 / (QI8_1/2)];
	}

	return d3d8 sumi;
	#else
	NO_DEVICE_CODE;
	#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
	}

	#define VDR_Q4_K_Q8_1_MMVQ 2
	#define VDR_Q4_K_Q8_1_MMQ 8

	// contiguous v/x values
	static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
	const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
	const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {

	#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
	float sumf_d = 0.0f;
	float sumf_m = 0.0f;

	#pragma unroll
	for (int i = 0; i < QR4_K; ++i) {
	const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
	const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;

	const int dot1 = __dp4a(v1i, u[2i+1], __dp4a(v0i, u[2i+0], 0)); // SIMD dot product
	const int dot2 = __dp4a(0x01010101, u[2i+1], __dp4a(0x01010101, u[2i+0], 0)); // sum of u

	sumf_d += d8[i] * (dot1 * sc[i]);
	sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
	}

	const float2 dm4f = __half22float2(dm4);

	return dm4f.xsumf_d - dm4f.ysumf_m;

	#else
	NO_DEVICE_CODE;
	#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
	}

	// contiguous u/y values
	static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
	const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
	const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {

	#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
	float sumf_d = 0.0f;
	float sumf_m = 0.0f;

	#pragma unroll
	for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) {
	int sumi_d = 0;

	#pragma unroll
	for (int j = 0; j < QI8_1; ++j) {
	sumi_d = __dp4a((v[j] >> (4i)) & 0x0F0F0F0F, u[iQI8_1 + j], sumi_d); // SIMD dot product
	}

	const float2 ds8f = __half22float2(ds8[i]);

	sumf_d += ds8f.x * (sc[i] * sumi_d);
	sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val
	}

	const float2 dm4f = __half22float2(dm4);

	return dm4f.xsumf_d - dm4f.ysumf_m;

	#else
	NO_DEVICE_CODE;
	#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
	}

	#define VDR_Q5_K_Q8_1_MMVQ 2
	#define VDR_Q5_K_Q8_1_MMQ 8

	// contiguous v/x values
	static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
	const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
	const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {

	#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
	float sumf_d = 0.0f;
	float sumf_m = 0.0f;

	#pragma unroll
	for (int i = 0; i < QR5_K; ++i) {
	const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
	const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;

	const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
	const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;

	const int v0i = vl0i \| vh0i;
	const int v1i = vl1i \| vh1i;

	const int dot1 = __dp4a(v0i, u[2i+0], __dp4a(v1i, u[2i+1], 0)); // SIMD dot product
	const int dot2 = __dp4a(0x01010101, u[2i+0], __dp4a(0x01010101, u[2i+1], 0)); // sum of u

	sumf_d += d8[i] * (dot1 * sc[i]);
	sumf_m += d8[i] * (dot2 * m[i]);

	}

	const float2 dm5f = __half22float2(dm5);

	return dm5f.xsumf_d - dm5f.ysumf_m;

	#else
	NO_DEVICE_CODE;
	#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
	}

	// contiguous u/y values
	static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
	const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
	const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {

	#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
	float sumf_d = 0.0f;
	float sumf_m = 0.0f;

	#pragma unroll
	for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) {
	int sumi_d = 0;

	#pragma unroll
	for (int j = 0; j < QI8_1; ++j) {
	sumi_d = __dp4a(v[iQI8_1 + j], u[iQI8_1 + j], sumi_d); // SIMD dot product
	}

	const float2 ds8f = __half22float2(ds8[i]);

	sumf_d += ds8f.x * (sc[i] * sumi_d);
	sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val
	}

	const float2 dm4f = __half22float2(dm4);

	return dm4f.xsumf_d - dm4f.ysumf_m;

	#else
	NO_DEVICE_CODE;
	#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
	}

	#define VDR_Q6_K_Q8_1_MMVQ 1
	#define VDR_Q6_K_Q8_1_MMQ 8

	// contiguous v/x values
	static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
	const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
	const float & d, const float * __restrict__ d8) {

	#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
	float sumf = 0.0f;

	#pragma unroll
	for (int i = 0; i < QR6_K; ++i) {
	const int sc = scales[4*i];

	const int vil = (vl >> (4*i)) & 0x0F0F0F0F;

	const int vih = ((vh >> (4*i)) << 4) & 0x30303030;

	const int vi = __vsubss4((vil \| vih), 0x20202020); // vi = (vil \| vih) - 32

	sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
	}

	return d*sumf;
	#else
	NO_DEVICE_CODE;
	#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
	}

	// contiguous u/y values
	static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
	const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc,
	const float & d6, const float * __restrict__ d8) {

	#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
	float sumf_d = 0.0f;

	#pragma unroll
	for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) {
	int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale

	#pragma unroll
	for (int i = i0; i < i0 + 2; ++i) {
	sumi_d.x = __dp4a(v[2i+0], u[2i+0], sumi_d.x); // SIMD dot product
	sumi_d.x = __dp4a(v[2i+1], u[2i+1], sumi_d.x); // SIMD dot product

	sumi_d.y = __dp4a(v[2i+4], u[2i+4], sumi_d.y); // SIMD dot product
	sumi_d.y = __dp4a(v[2i+5], u[2i+5], sumi_d.y); // SIMD dot product
	}

	sumf_d += d8[i0/4] * (sc[i0/2+0]sumi_d.x + sc[i0/2+1]sumi_d.y);
	}

	return d6 * sumf_d;

	#else
	NO_DEVICE_CODE;
	#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
	}

	static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
	const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

	const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;

	int v[VDR_Q4_0_Q8_1_MMVQ];
	int u[2*VDR_Q4_0_Q8_1_MMVQ];

	#pragma unroll
	for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
	v[i] = get_int_from_uint8(bq4_0->qs, iqs + i);
	u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
	u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
	}

	return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
	}


	static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
	const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

	const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;

	int v[VDR_Q4_1_Q8_1_MMVQ];
	int u[2*VDR_Q4_1_Q8_1_MMVQ];

	#pragma unroll
	for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
	v[i] = get_int_from_uint8_aligned(bq4_1->qs, iqs + i);
	u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
	u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1);
	}

	return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
	}

	static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
	const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

	const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;

	int vl[VDR_Q5_0_Q8_1_MMVQ];
	int vh[VDR_Q5_0_Q8_1_MMVQ];
	int u[2*VDR_Q5_0_Q8_1_MMVQ];

	#pragma unroll
	for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) {
	vl[i] = get_int_from_uint8(bq5_0->qs, iqs + i);
	vh[i] = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i));
	u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
	u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0);
	}

	return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
	}

	static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
	const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

	const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;

	int vl[VDR_Q5_1_Q8_1_MMVQ];
	int vh[VDR_Q5_1_Q8_1_MMVQ];
	int u[2*VDR_Q5_1_Q8_1_MMVQ];

	#pragma unroll
	for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) {
	vl[i] = get_int_from_uint8_aligned(bq5_1->qs, iqs + i);
	vh[i] = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i));
	u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
	u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1);
	}

	return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
	}

	static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
	const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

	const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;

	int v[VDR_Q8_0_Q8_1_MMVQ];
	int u[VDR_Q8_0_Q8_1_MMVQ];

	#pragma unroll
	for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
	v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
	u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
	}

	return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, __low2half(bq8_1->ds));
	}

	static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
	const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

	const block_q2_K * bq2_K = (const block_q2_K *) vbq;

	const int bq8_offset = QR2_K * (iqs / QI8_1);
	const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);

	const uint8_t * scales = bq2_K->scales + scale_offset;

	const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
	int u[QR2_K];
	float d8[QR2_K];

	#pragma unroll
	for (int i = 0; i < QR2_K; ++ i) {
	u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
	d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
	}

	return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
	}

	static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
	const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

	const block_q3_K * bq3_K = (const block_q3_K *) vbq;

	const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
	const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);

	const float d = bq3_K->d;

	const int vl = get_int_from_uint8(bq3_K->qs, iqs);

	// invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
	const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;

	int u[QR3_K];
	float d8[QR3_K];

	#pragma unroll
	for (int i = 0; i < QR3_K; ++i) {
	u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
	d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
	}

	return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
	}

	static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
	const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

	#ifndef GGML_QKK_64
	const block_q4_K * bq4_K = (const block_q4_K *) vbq;

	int v[2];
	int u[2*QR4_K];
	float d8[QR4_K];

	// iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6
	const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2));

	// iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
	// iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
	// iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
	// iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108

	const int * q4 = (const int )(bq4_K->qs + 16 bq8_offset + 4 * ((iqs/2)%4));
	v[0] = q4[0];
	v[1] = q4[4];

	const uint16_t * scales = (const uint16_t *)bq4_K->scales;
	uint16_t aux[2];
	const int j = bq8_offset/2;
	if (j < 2) {
	aux[0] = scales[j+0] & 0x3f3f;
	aux[1] = scales[j+2] & 0x3f3f;
	} else {
	aux[0] = ((scales[j+2] >> 0) & 0x0f0f) \| ((scales[j-2] & 0xc0c0) >> 2);
	aux[1] = ((scales[j+2] >> 4) & 0x0f0f) \| ((scales[j-0] & 0xc0c0) >> 2);
	}
	const uint8_t * sc = (const uint8_t *)aux;
	const uint8_t * m = sc + 2;

	for (int i = 0; i < QR4_K; ++i) {
	const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
	d8[i] = __low2float(bq8i->ds);

	const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
	u[2*i+0] = q8[0];
	u[2*i+1] = q8[4];
	}

	return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);

	#else

	#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
	const block_q4_K * bq4_K = (const block_q4_K *) vbq;

	float sumf_d = 0.0f;
	float sumf_m = 0.0f;

	uint16_t aux16[2];
	const uint8_t * s = (const uint8_t *)aux16;

	const uint16_t * a = (const uint16_t *)bq4_K->scales;
	aux16[0] = a[0] & 0x0f0f;
	aux16[1] = (a[0] >> 4) & 0x0f0f;

	const float dall = bq4_K->dm[0];
	const float dmin = bq4_K->dm[1];

	const float d8_1 = __low2float(bq8_1[0].ds);
	const float d8_2 = __low2float(bq8_1[1].ds);

	const int ui1 = ((const int )bq8_1[0].qs + (iqs/2));
	const int ui2 = ((const int )bq8_1[0].qs + (iqs/2) + 4);
	const int ui3 = ((const int )bq8_1[1].qs + (iqs/2));
	const int ui4 = ((const int )bq8_1[1].qs + (iqs/2) + 4);

	const int * q4 = (const int *)bq4_K->qs + (iqs/2);
	const int v1 = q4[0];
	const int v2 = q4[4];

	const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
	const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
	const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
	const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));

	sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
	sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);

	return dall * sumf_d - dmin * sumf_m;

	#else
	NO_DEVICE_CODE;
	#endif // __CUDA_ARCH__ >= MIN_CC_DP4A

	#endif
	}

	static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
	const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

	#ifndef GGML_QKK_64
	const block_q5_K * bq5_K = (const block_q5_K *) vbq;

	int vl[2];
	int vh[2];
	int u[2*QR5_K];
	float d8[QR5_K];

	const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2));
	const int * ql = (const int )(bq5_K->qs + 16 bq8_offset + 4 * ((iqs/2)%4));
	const int * qh = (const int )(bq5_K->qh + 4 ((iqs/2)%4));

	vl[0] = ql[0];
	vl[1] = ql[4];

	vh[0] = qh[0] >> bq8_offset;
	vh[1] = qh[4] >> bq8_offset;

	const uint16_t * scales = (const uint16_t *)bq5_K->scales;
	uint16_t aux[2];
	const int j = bq8_offset/2;
	if (j < 2) {
	aux[0] = scales[j+0] & 0x3f3f;
	aux[1] = scales[j+2] & 0x3f3f;
	} else {
	aux[0] = ((scales[j+2] >> 0) & 0x0f0f) \| ((scales[j-2] & 0xc0c0) >> 2);
	aux[1] = ((scales[j+2] >> 4) & 0x0f0f) \| ((scales[j-0] & 0xc0c0) >> 2);
	}
	const uint8_t * sc = (const uint8_t *)aux;
	const uint8_t * m = sc + 2;

	#pragma unroll
	for (int i = 0; i < QR5_K; ++i) {
	const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
	d8[i] = __low2float(bq8i->ds);

	const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
	u[2*i+0] = q8[0];
	u[2*i+1] = q8[4];
	}

	return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);

	#else

	#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
	const block_q5_K * bq5_K = (const block_q5_K *) vbq;

	const int8_t * s = bq5_K->scales;

	const float d = bq5_K->d;

	const float d8_1 = __low2half(bq8_1[0].ds);
	const float d8_2 = __low2half(bq8_1[1].ds);

	const int ui1 = ((const int )bq8_1[0].qs + (iqs/2));
	const int ui2 = ((const int )bq8_1[0].qs + (iqs/2) + 4);
	const int ui3 = ((const int )bq8_1[1].qs + (iqs/2));
	const int ui4 = ((const int )bq8_1[1].qs + (iqs/2) + 4);

	const int * ql = (const int *)bq5_K->qs + (iqs/2);
	const int vl1 = ql[0];
	const int vl2 = ql[4];

	const int step = 4 * (iqs/2); // 0, 4, 8, 12
	const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
	const int in = step%8; // 0, 4, 0, 4
	const int vh = (((const int )(bq5_K->qh + in))) >> im;

	const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) \| ((vl1 >> 0) & 0x0f0f0f0f);
	const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) \| ((vl2 >> 0) & 0x0f0f0f0f);
	const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) \| ((vl1 >> 4) & 0x0f0f0f0f);
	const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) \| ((vl2 >> 4) & 0x0f0f0f0f);

	const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
	+ d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);

	return d * sumf_d;

	#else
	NO_DEVICE_CODE;
	#endif // __CUDA_ARCH__ >= MIN_CC_DP4A

	#endif
	}

	static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
	const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

	const block_q6_K * bq6_K = (const block_q6_K *) vbq;

	const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
	const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
	const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));

	const int vl = get_int_from_uint8(bq6_K->ql, iqs);
	const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift;

	const int8_t * scales = bq6_K->scales + scale_offset;

	int u[QR6_K];
	float d8[QR6_K];

	#pragma unroll
	for (int i = 0; i < QR6_K; ++i) {
	u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
	d8[i] = __low2float(bq8_1[bq8_offset + 2*i].ds);
	}

	return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
	}

	static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
	const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
	#if QK_K == 256
	const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;

	#if QR2_XXS == 8
	const int ib32 = iqs;
	const uint16_t * q2 = bq2->qs + 4*ib32;
	const uint8_t * aux8 = (const uint8_t *)q2;
	const int8_t * q8 = bq8_1[ib32].qs;
	uint32_t aux32 = q2[2] \| (q2[3] << 16);
	int sumi = 0;
	for (int l = 0; l < 4; ++l) {
	const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
	const uint8_t signs = ksigns_iq2xs[aux32 & 127];
	for (int j = 0; j < 8; ++j) {
	sumi += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
	}
	q8 += 8;
	aux32 >>= 7;
	}
	const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.25f;
	return d * sumi;
	#else
	// iqs is 0...15
	const int ib32 = iqs/2;
	const int il = iqs%2;
	const uint16_t * q2 = bq2->qs + 4*ib32;
	const uint8_t * aux8 = (const uint8_t *)q2;
	const uint8_t * grid1 = (const uint8_t )(iq2xxs_grid + aux8[2il+0]);
	const uint8_t * grid2 = (const uint8_t )(iq2xxs_grid + aux8[2il+1]);
	const uint32_t aux32 = q2[2] \| (q2[3] << 16);
	const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * __low2float(bq8_1[ib32].ds) * 0.25f;
	const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127];
	const uint8_t signs2 = ksigns_iq2xs[(aux32 >> (14*il + 7)) & 127];
	const int8_t * q8 = bq8_1[ib32].qs + 16*il;
	int sumi1 = 0, sumi2 = 0;
	for (int j = 0; j < 8; ++j) {
	sumi1 += q8[j+0] * grid1[j] * (signs1 & kmask_iq2xs[j] ? -1 : 1);
	sumi2 += q8[j+8] * grid2[j] * (signs2 & kmask_iq2xs[j] ? -1 : 1);
	}
	return d * (sumi1 + sumi2);
	#endif
	#else
	NO_DEVICE_CODE;
	#endif
	}

	static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
	const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
	#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
	#if QK_K == 256
	const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;

	const int ib32 = iqs;
	const uint16_t * q2 = bq2->qs + 4*ib32;
	const int8_t * q8 = bq8_1[ib32].qs;
	const uint8_t ls1 = bq2->scales[ib32] & 0xf;
	const uint8_t ls2 = bq2->scales[ib32] >> 4;
	int sumi1 = 0;
	for (int l = 0; l < 2; ++l) {
	const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511));
	const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9));
	const int grid_l = __vsub4(grid[0] ^ signs[0], signs[0]);
	const int grid_h = __vsub4(grid[1] ^ signs[1], signs[1]);
	sumi1 = __dp4a(grid_l, ((const int )q8 + 0), sumi1);
	sumi1 = __dp4a(grid_h, ((const int )q8 + 1), sumi1);
	q8 += 8;
	}
	int sumi2 = 0;
	for (int l = 2; l < 4; ++l) {
	const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511));
	const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9));
	const int grid_l = __vsub4(grid[0] ^ signs[0], signs[0]);
	const int grid_h = __vsub4(grid[1] ^ signs[1], signs[1]);
	sumi2 = __dp4a(grid_l, ((const int )q8 + 0), sumi2);
	sumi2 = __dp4a(grid_h, ((const int )q8 + 1), sumi2);
	q8 += 8;
	}
	const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
	return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
	#else
	GGML_UNUSED(ksigns64);
	NO_DEVICE_CODE;
	#endif
	#else
	GGML_UNUSED(ksigns64);
	NO_DEVICE_CODE;
	#endif
	}

	// TODO
	static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
	const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
	#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
	#if QK_K == 256
	const block_iq2_s * bq2 = (const block_iq2_s *) vbq;

	const int ib32 = iqs;
	const int8_t * q8 = bq8_1[ib32].qs;
	const uint8_t * signs = bq2->qs + QK_K/8 + 4*ib32;
	const uint8_t ls1 = bq2->scales[ib32] & 0xf;
	const uint8_t ls2 = bq2->scales[ib32] >> 4;
	int sumi1 = 0;
	for (int l = 0; l < 2; ++l) {
	const uint32_t * grid = (const uint32_t )(iq2s_grid + (bq2->qs[4ib32+l] \| ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
	const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
	const uint32_t signs1 = __vcmpeq4(((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201);
	const int grid_l = __vsub4(grid[0] ^ signs0, signs0);
	const int grid_h = __vsub4(grid[1] ^ signs1, signs1);
	sumi1 = __dp4a(grid_l, ((const int )q8 + 0), sumi1);
	sumi1 = __dp4a(grid_h, ((const int )q8 + 1), sumi1);
	q8 += 8;
	}
	int sumi2 = 0;
	for (int l = 2; l < 4; ++l) {
	const uint32_t * grid = (const uint32_t )(iq2s_grid + (bq2->qs[4ib32+l] \| ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
	const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
	const uint32_t signs1 = __vcmpeq4(((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201);
	const int grid_l = __vsub4(grid[0] ^ signs0, signs0);
	const int grid_h = __vsub4(grid[1] ^ signs1, signs1);
	sumi2 = __dp4a(grid_l, ((const int )q8 + 0), sumi2);
	sumi2 = __dp4a(grid_h, ((const int )q8 + 1), sumi2);
	q8 += 8;
	}
	const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
	return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
	#else
	GGML_UNUSED(ksigns64);
	NO_DEVICE_CODE;
	#endif
	#else
	GGML_UNUSED(ksigns64);
	NO_DEVICE_CODE;
	#endif
	}

	static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
	const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
	#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
	#if QK_K == 256
	const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;

	const int ib32 = iqs;
	const uint8_t * q3 = bq2->qs + 8*ib32;
	const uint16_t * gas = (const uint16_t )(bq2->qs + QK_K/4) + 2ib32;
	const int8_t * q8 = bq8_1[ib32].qs;
	uint32_t aux32 = gas[0] \| (gas[1] << 16);
	int sumi = 0;
	for (int l = 0; l < 4; ++l) {
	const uint32_t * grid1 = iq3xxs_grid + q3[2*l+0];
	const uint32_t * grid2 = iq3xxs_grid + q3[2*l+1];
	const uint32_t * signs = (const uint32_t *)(ksigns64 + (aux32 & 127));
	const int grid_l = __vsub4(grid1[0] ^ signs[0], signs[0]);
	const int grid_h = __vsub4(grid2[0] ^ signs[1], signs[1]);
	sumi = __dp4a(grid_l, ((int )q8+0), sumi);
	sumi = __dp4a(grid_h, ((int )q8+1), sumi);
	q8 += 8;
	aux32 >>= 7;
	}
	const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.5f;
	return d * sumi;
	#else
	NO_DEVICE_CODE;
	#endif
	#else
	NO_DEVICE_CODE;
	#endif
	}

	// TODO: don't use lookup table for signs
	static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
	const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
	#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
	#if QK_K == 256
	const block_iq3_s * bq2 = (const block_iq3_s *) vbq;

	const int ib32 = iqs;
	const uint8_t * qs = bq2->qs + 8*ib32;
	const int8_t * q8 = bq8_1[ib32].qs;
	int sumi = 0;
	for (int l = 0; l < 4; ++l) {
	const uint32_t * grid1 = iq3s_grid + (qs[2l+0] \| ((bq2->qh[ib32] << (8 - 2l)) & 256));
	const uint32_t * grid2 = iq3s_grid + (qs[2l+1] \| ((bq2->qh[ib32] << (7 - 2l)) & 256));
	uint32_t signs0 = __vcmpeq4(((bq2->signs[4ib32+l] & 0xf) 0x01010101) & 0x08040201, 0x08040201);
	uint32_t signs1 = __vcmpeq4(((bq2->signs[4ib32+l] >> 4) 0x01010101) & 0x08040201, 0x08040201);
	const int grid_l = __vsub4(grid1[0] ^ signs0, signs0);
	const int grid_h = __vsub4(grid2[0] ^ signs1, signs1);
	sumi = __dp4a(grid_l, ((int )q8+0), sumi);
	sumi = __dp4a(grid_h, ((int )q8+1), sumi);
	q8 += 8;
	}
	const float d = (float)bq2->d * (1 + 2((bq2->scales[ib32/2] >> 4(ib32%2)) & 0xf)) * __low2float(bq8_1[ib32].ds);
	return d * sumi;
	#else
	NO_DEVICE_CODE;
	#endif
	#else
	NO_DEVICE_CODE;
	#endif
	}

	static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
	const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
	#if QK_K == 256
	const block_iq1_s * bq1 = (const block_iq1_s *) vbq;

	const int ib32 = iqs;
	int sumi = 0;
	#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
	const int * q8 = (const int *)bq8_1[ib32].qs;
	for (int l = 0; l < 4; ++l) {
	const int * grid = (const int )(iq1s_grid_gpu + (bq1->qs[4ib32+l] \| (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
	int grid0 = grid[0] & 0x0f0f0f0f;
	int grid1 = (grid[0] >> 4) & 0x0f0f0f0f;
	sumi = __dp4a(q8[2l+1], grid1, __dp4a(q8[2l+0], grid0, sumi));
	}
	#else
	const int8_t * q8 = bq8_1[ib32].qs;
	for (int l = 0; l < 4; ++l) {
	const uint8_t * grid = (const uint8_t )(iq1s_grid_gpu + (bq1->qs[4ib32+l] \| (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
	for (int j = 0; j < 4; ++j) {
	sumi += q8[j] * (grid[j] & 0xf) + q8[j+4] * (grid[j] >> 4);
	}
	q8 += 8;
	}
	#endif
	const float delta = bq1->qh[ib32] & 0x8000 ? -1-IQ1S_DELTA : -1+IQ1S_DELTA;
	const float d1q = (float)bq1->d * (2*((bq1->qh[ib32] >> 12) & 7) + 1);
	const float d = d1q * __low2float (bq8_1[ib32].ds);
	const float m = d1q * __high2float(bq8_1[ib32].ds);
	return d * sumi + m * delta;
	#else
	NO_DEVICE_CODE;
	#endif
	}

	static __device__ __forceinline__ float vec_dot_iq1_m_q8_1(
	const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
	#if QK_K == 256
	const block_iq1_m * bq1 = (const block_iq1_m *) vbq;

	const int ib32 = iqs;
	int sumi[2] = {0, 0};
	float sumf[2] = {0.f, 0.f};
	#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
	const int * q8 = (const int *)bq8_1[ib32].qs;
	for (int l = 0; l < 4; ++l) {
	const int * grid = (const int )(iq1s_grid_gpu + (bq1->qs[4ib32+l] \| (((bq1->qh[2ib32+l/2] >> 4(l%2)) & 7) << 8)));
	int grid0 = grid[0] & 0x0f0f0f0f;
	int grid1 = (grid[0] >> 4) & 0x0f0f0f0f;
	sumi[l/2] = __dp4a(q8[2l+1], grid1, __dp4a(q8[2l+0], grid0, sumi[l/2]));
	const float delta = (bq1->qh[2ib32+l/2] >> 4(l%2)) & 0x08 ? -1-IQ1M_DELTA : -1+IQ1M_DELTA;
	const int sumy = __dp4a(q8[2l+1], 0x01010101, __dp4a(q8[2l+0], 0x01010101, 0));
	sumf[l/2] += delta*sumy;
	}
	#else
	const int8_t * q8 = bq8_1[ib32].qs;
	for (int l = 0; l < 4; ++l) {
	const uint8_t * grid = (const uint8_t )(iq1s_grid_gpu + (bq1->qs[4ib32+l] \| (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
	int sumy = 0;
	for (int j = 0; j < 4; ++j) {
	sumi[l/2] += q8[j] * (grid[j] & 0xf) + q8[j+4] * (grid[j] >> 4);
	sumy += q8[j] + q8[j+4];
	}
	const float delta = (bq1->qh[2ib32+l/2] >> 4(l%2)) & 0x08 ? -1-IQ1M_DELTA : -1+IQ1M_DELTA;
	sumf[l/2] += delta*sumy;
	q8 += 8;
	}
	#endif
	iq1m_scale_t scale;
	const uint16_t * sc = (const uint16_t *)bq1->scales;
	scale.u16 = (sc[0] >> 12) \| ((sc[1] >> 8) & 0x00f0) \| ((sc[2] >> 4) & 0x0f00) \| (sc[3] & 0xf000);
	const float d = (float)scale.f16 * __low2float (bq8_1[ib32].ds);
	return d * ((sumi[0] + sumf[0]) * (2((sc[ib32/2] >> 6(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2((sc[ib32/2] >> (6(ib32%2)+3)) & 0x7) + 1));
	#else
	NO_DEVICE_CODE;
	#endif
	}

	#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
	static __device__ __forceinline__ void get_int_from_table_16(const uint32_t & q4, const uint8_t * values,
	int & val1, int & val2) {

	uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32;
	aux32 = q4 & 0x0f0f0f0f;
	uint16_t v1 = values[q8[0]] \| (values[q8[1]] << 8);
	uint16_t v2 = values[q8[2]] \| (values[q8[3]] << 8);
	val1 = v1 \| (v2 << 16);
	aux32 = (q4 >> 4) & 0x0f0f0f0f;
	v1 = values[q8[0]] \| (values[q8[1]] << 8);
	v2 = values[q8[2]] \| (values[q8[3]] << 8);
	val2 = v1 \| (v2 << 16);
	}
	#endif

	static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
	const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

	const block_iq4_nl * bq = (const block_iq4_nl *) vbq;

	#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
	const uint16_t * q4 = (const uint16_t )bq->qs + 2iqs;
	const int32_t * q8 = (const int32_t *)bq8_1->qs + iqs;

	const uint8_t * values = (const uint8_t *)kvalues_iq4nl;

	int v1, v2;
	int sumi1 = 0, sumi2 = 0;
	for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) {
	const uint32_t aux = q4[2l] \| (q4[2l+1] << 16);
	get_int_from_table_16(aux, values, v1, v2);
	sumi1 = __dp4a(v1, q8[l+0], sumi1);
	sumi2 = __dp4a(v2, q8[l+4], sumi2);
	}

	#else
	const uint8_t * q4 = bq->qs + 4*iqs;
	const int8_t * q8 = bq8_1->qs + 4*iqs;

	int sumi1 = 0, sumi2 = 0;
	for (int l = 0; l < 4*VDR_Q4_0_Q8_1_MMVQ; ++l) {
	sumi1 += q8[l+ 0] * kvalues_iq4nl[q4[l] & 0xf];
	sumi2 += q8[l+16] * kvalues_iq4nl[q4[l] >> 4];
	}
	#endif
	const float d = (float)bq->d * __low2float(bq8_1->ds);
	return d * (sumi1 + sumi2);
	}

	static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
	const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

	#if QK_K == 256
	#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics

	const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
	const uint8_t * values = (const uint8_t *)kvalues_iq4nl;

	// iqs is 0...7
	const int ib32 = iqs;
	const int32_t * q8 = (const int *)bq8_1[ib32].qs;
	const uint32_t * q4 = (const uint32_t )bq4->qs + 4ib32;
	const int8_t ls = ((bq4->scales_l[ib32/2] >> 4(ib32%2)) & 0xf) \| (((bq4->scales_h >> 2ib32) & 3) << 4);
	const float d = (float)bq4->d * (ls - 32) * __low2float(bq8_1[ib32].ds);
	int v1, v2;
	int sumi1 = 0, sumi2 = 0;
	for (int j = 0; j < 4; ++j) {
	get_int_from_table_16(q4[j], values, v1, v2);
	sumi1 = __dp4a(v1, q8[j+0], sumi1);
	sumi2 = __dp4a(v2, q8[j+4], sumi2);
	}
	return d * (sumi1 + sumi2);

	#else
	NO_DEVICE_CODE;
	#endif
	#else
	return vec_dot_iq4_xs_q8_1(vbq, bq8_1, iqs);
	#endif
	}