ggml : drop support for QK_K=64 (llama/7473)
* ggml : drop support for QK_K=64
ggml-ci
* opencl : restore QK_K=256 define
- ggml-common.h +0 -54
- ggml-cuda/convert.cu +0 -138
- ggml-cuda/dmmv.cu +0 -151
- ggml-cuda/mmq.cu +0 -6
- ggml-cuda/vecdotq.cuh +0 -126
- ggml-metal.m +0 -17
- ggml-metal.metal +7 -393
- ggml-opencl.cpp +1 -1
- ggml-quants.c +0 -0
- ggml-sycl.cpp +2 -470
- ggml.c +0 -12
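
For context before the per-file diffs: with the GGML_QKK_64 build option removed, QK_K is fixed at 256 and K_SCALE_SIZE at 12, so every K-quant super-block layout asserted in ggml-common.h becomes a plain compile-time constant. The following standalone C sketch (not part of this commit; uint16_t stands in for ggml_half purely for the size arithmetic) reproduces the q3_K and q4_K size checks enforced by the static_asserts in the diff below:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define QK_K         256
#define K_SCALE_SIZE 12

int main(void) {
    // block_q3_K: hmask[QK_K/8] + qs[QK_K/4] + scales[12] + d
    const size_t q3 = QK_K/8 + QK_K/4 + 12 + sizeof(uint16_t);
    // block_q4_K: dm (two halves) + scales[K_SCALE_SIZE] + qs[QK_K/2]
    const size_t q4 = 2*sizeof(uint16_t) + K_SCALE_SIZE + QK_K/2;
    printf("q3_K block: %zu bytes -> %.4f bits per weight\n", q3, 8.0*q3/QK_K); // 3.4375
    printf("q4_K block: %zu bytes -> %.4f bits per weight\n", q4, 8.0*q4/QK_K); // 4.5
    assert(q3 == 110 && q4 == 144);
    return 0;
}

These are the "Effectively 3.4375 / 4.5 bits per weight" figures quoted in the ggml-common.h comments.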
ggml-common.h
CHANGED
@@ -65,13 +65,8 @@ typedef sycl::half2 ggml_half2;
 // QK = number of values after dequantization
 // QK_K = super-block size
 
-#ifdef GGML_QKK_64
-#define QK_K 64
-#define K_SCALE_SIZE 4
-#else
 #define QK_K 256
 #define K_SCALE_SIZE 12
-#endif // GGML_QKK_64
 
 #if defined(GGML_COMMON_DECL_CUDA) || defined(GGML_COMMON_DECL_HIP) || defined(GGML_COMMON_DECL_SYCL)
 // QR = QK / number of values before dequantization
@@ -131,13 +126,8 @@ typedef sycl::half2 ggml_half2;
 #define QI4_NL (QK4_NL / (4*QR4_NL))
 #define QR4_NL 2
 
-#if QK_K == 64
-#define QI4_XS QI4_NL
-#define QR4_XS QR4_NL
-#else
 #define QI4_XS (QK_K / (4*QR4_XS))
 #define QR4_XS 8
-#endif
 
 #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
 
@@ -228,15 +218,6 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wro
 // weight is represented as x = a * q
 // 16 blocks of 16 elements each
 // Effectively 3.4375 bits per weight
-#ifdef GGML_QKK_64
-typedef struct {
-    uint8_t hmask[QK_K/8]; // quants - high bit
-    uint8_t qs[QK_K/4];    // quants - low 2 bits
-    uint8_t scales[2];
-    ggml_half d;           // super-block scale
-} block_q3_K;
-static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
-#else
 typedef struct {
     uint8_t hmask[QK_K/8]; // quants - high bit
     uint8_t qs[QK_K/4];    // quants - low 2 bits
@@ -244,20 +225,11 @@ typedef struct {
     ggml_half d;           // super-block scale
 } block_q3_K;
 static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
-#endif
 
 // 4-bit quantization
 // 8 blocks of 32 elements each
 // weight is represented as x = a * q + b
 // Effectively 4.5 bits per weight
-#ifdef GGML_QKK_64
-typedef struct {
-    ggml_half d[2];     // super-block scales/mins
-    uint8_t scales[2];  // 4-bit block scales/mins
-    uint8_t qs[QK_K/2]; // 4--bit quants
-} block_q4_K;
-static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + QK_K/2 + 2, "wrong q4_K block size/padding");
-#else
 typedef struct {
     union {
         struct {
@@ -270,21 +242,11 @@ typedef struct {
     uint8_t qs[QK_K/2]; // 4--bit quants
 } block_q4_K;
 static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
-#endif
 
 // 5-bit quantization
 // 8 blocks of 32 elements each
 // weight is represented as x = a * q + b
 // Effectively 5.5 bits per weight
-#ifdef GGML_QKK_64
-typedef struct {
-    ggml_half d;             // super-block scale
-    int8_t  scales[QK_K/16]; // 8-bit block scales
-    uint8_t qh[QK_K/8];      // quants, high bit
-    uint8_t qs[QK_K/2];      // quants, low 4 bits
-} block_q5_K;
-static_assert(sizeof(block_q5_K) == sizeof(ggml_half) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
-#else
 typedef struct {
     union {
         struct {
@@ -298,7 +260,6 @@ typedef struct {
     uint8_t qs[QK_K/2]; // quants, low 4 bits
 } block_q5_K;
 static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
-#endif
 
 // 6-bit quantization
 // weight is represented as x = a * q
@@ -356,11 +317,7 @@ typedef struct {
 static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_half) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
 
 // 3.4375 bpw
-#if QK_K == 64
-#define IQ3S_N_SCALE 2
-#else
 #define IQ3S_N_SCALE QK_K/64
-#endif
 typedef struct {
     ggml_half d;
     uint8_t qs[QK_K/4];
@@ -381,16 +338,9 @@ static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wron
 typedef struct {
     uint8_t  qs[QK_K/8];      // grid index, low 8 bits
     uint8_t  qh[QK_K/16];     // grid index, high 3 bits + grid shift bit (for two groups of 8)
-#if QK_K == 64
-    ggml_half d;
-#endif
     uint8_t  scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
 } block_iq1_m;
-#if QK_K == 64
-static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding");
-#else
 static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
-#endif
 
 // Used by IQ1_M quants
 typedef union {
@@ -406,9 +356,6 @@ typedef struct {
 } block_iq4_nl;
 static_assert(sizeof(block_iq4_nl) == sizeof(ggml_half) + QK4_NL/2, "wrong iq4_nl block size/padding");
 
-#if QK_K == 64
-#define block_iq4_xs block_iq4_nl
-#else
 typedef struct {
     ggml_half d;
     uint16_t scales_h;
@@ -416,7 +363,6 @@ typedef struct {
     uint8_t  qs[QK_K/2];
 } block_iq4_xs;
 static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
-#endif
 
 #endif // GGML_COMMON_DECL
 #endif // GGML_COMMON_DECL
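One consequence of the struct change above: block_iq1_m no longer carries a per-block ggml_half d (that member only existed for QK_K == 64); the fp16 super-block scale is instead packed into the top 4 bits of the four 16-bit groups of scales, which is what the iq1m_scale_t union and the vec_dot_iq1_m_q8_1 kernel further down read back. A hedged C sketch of that extraction (uint16_t stands in for the fp16 bit pattern; the bit layout is taken from the scale.u16 expression visible in the vecdotq.cuh diff below):

#include <stdint.h>

// recover the packed fp16 super-block scale of an iq1_m block (QK_K = 256):
// scales[] is QK_K/32 = 8 bytes, read as four 16-bit groups whose top nibbles
// concatenate into the 16-bit scale
static uint16_t iq1m_scale_bits(const uint8_t scales[8]) {
    uint16_t sc[4];
    for (int i = 0; i < 4; ++i) {
        sc[i] = (uint16_t)(scales[2*i] | (scales[2*i+1] << 8)); // little-endian load
    }
    return (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
}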
ggml-cuda/convert.cu
CHANGED
@@ -131,7 +131,6 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t
     const block_q2_K * x = (const block_q2_K *) vx;
 
     const int64_t tid = threadIdx.x;
-#if QK_K == 256
     const int64_t n   = tid/32;
     const int64_t l   = tid - 32*n;
     const int64_t is  = 8*n + l/16;
@@ -145,17 +144,6 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t
     y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
     y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
     y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
-#else
-    const int64_t is = tid/16;  // 0 or 1
-    const int64_t il = tid%16;  // 0...15
-    const uint8_t q = x[i].qs[il] >> (2*is);
-    dst_t * y = yy + i*QK_K + 16*is + il;
-    float dall = __low2half(x[i].dm);
-    float dmin = __high2half(x[i].dm);
-    y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
-    y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
-#endif
-
 }
 
 template<typename dst_t>
@@ -164,7 +152,6 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t
     const int64_t i = blockIdx.x;
     const block_q3_K * x = (const block_q3_K *) vx;
 
-#if QK_K == 256
     const int64_t r = threadIdx.x/4;
     const int64_t tid = r/2;
     const int64_t is0 = r%2;
@@ -188,31 +175,8 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t
     const uint8_t * hm = x[i].hmask;
 
     for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
-#else
-    const int64_t tid = threadIdx.x;
-    const int64_t is  = tid/16;  // 0 or 1
-    const int64_t il  = tid%16;  // 0...15
-    const int64_t im  = il/8;    // 0...1
-    const int64_t in  = il%8;    // 0...7
-
-    dst_t * y = yy + i*QK_K + 16*is + il;
-
-    const uint8_t q = x[i].qs[il] >> (2*is);
-    const uint8_t h = x[i].hmask[in] >> (2*is + im);
-    const float   d = (float)x[i].d;
-
-    if (is == 0) {
-        y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
-        y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
-    } else {
-        y[ 0] = d * ((x[i].scales[0] >>  4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
-        y[32] = d * ((x[i].scales[1] >>  4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
-    }
-#endif
-
 }
 
-#if QK_K == 256
 static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
     if (j < 4) {
         d = q[j] & 63; m = q[j + 4] & 63;
@@ -221,7 +185,6 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
         m = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4);
     }
 }
-#endif
 
 template<typename dst_t>
 static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
@@ -229,7 +192,6 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t
 
     const int64_t i = blockIdx.x;
 
-#if QK_K == 256
     // assume 32 threads
     const int64_t tid = threadIdx.x;
     const int64_t il  = tid/8;
@@ -253,15 +215,6 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t
         y[l + 0] = d1 * (q[l] & 0xF) - m1;
         y[l +32] = d2 * (q[l] >>  4) - m2;
     }
-#else
-    const int64_t tid = threadIdx.x;
-    const uint8_t * q = x[i].qs;
-    dst_t * y = yy + i*QK_K;
-    const float d = (float)x[i].dm[0];
-    const float m = (float)x[i].dm[1];
-    y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
-    y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >>  4) - m * (x[i].scales[1] >> 4);
-#endif
 }
 
 template<typename dst_t>
@@ -270,7 +223,6 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t
 
     const int64_t i = blockIdx.x;
 
-#if QK_K == 256
     // assume 64 threads - this is very slightly better than the one below
     const int64_t tid = threadIdx.x;
     const int64_t il  = tid/16;   // il is in 0...3
@@ -297,18 +249,6 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t
     hm <<= 1;
     y[32] = d2 * ((ql[ 0] >>  4) + (qh[ 0] & hm ? 16 : 0)) - m2;
     y[33] = d2 * ((ql[ 1] >>  4) + (qh[ 1] & hm ? 16 : 0)) - m2;
-#else
-    const int64_t tid = threadIdx.x;
-    const uint8_t q = x[i].qs[tid];
-    const int64_t im = tid/8;  // 0...3
-    const int64_t in = tid%8;  // 0...7
-    const int64_t is = tid/16; // 0 or 1
-    const uint8_t h = x[i].qh[in] >> im;
-    const float   d = x[i].d;
-    dst_t * y = yy + i*QK_K + tid;
-    y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
-    y[32] = d * x[i].scales[is+2] * ((q >>  4) - ((h >> 4) & 1 ? 0 : 16));
-#endif
 }
 
 template<typename dst_t>
@@ -316,7 +256,6 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
     const block_q6_K * x = (const block_q6_K *) vx;
 
     const int64_t i = blockIdx.x;
-#if QK_K == 256
 
     // assume 64 threads - this is very slightly better than the one below
     const int64_t tid = threadIdx.x;
@@ -336,24 +275,6 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
     y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
     y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32);
     y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
-#else
-
-    // assume 32 threads
-    const int64_t tid = threadIdx.x;
-    const int64_t ip  = tid/16;      // 0 or 1
-    const int64_t il  = tid - 16*ip; // 0...15
-
-    dst_t * y = yy + i*QK_K + 16*ip + il;
-
-    const float d = x[i].d;
-
-    const uint8_t  ql = x[i].ql[16*ip + il];
-    const uint8_t  qh = x[i].qh[il] >> (2*ip);
-    const int8_t * sc = x[i].scales;
-
-    y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
-    y[32] = d * sc[ip+2] * ((int8_t)((ql  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
-#endif
 }
 
 template<typename dst_t>
@@ -363,7 +284,6 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds
     const block_iq2_xxs * x = (const block_iq2_xxs *) vx;
 
     const int64_t tid = threadIdx.x;
-#if QK_K == 256
     const int64_t il = tid/8; // 0...3
     const int64_t ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -374,10 +294,6 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds
     const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
     const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
     for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-#else
-    NO_DEVICE_CODE;
-#endif
-
 }
 
 template<typename dst_t>
@@ -387,7 +303,6 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
     const block_iq2_xs * x = (const block_iq2_xs *) vx;
 
     const int64_t tid = threadIdx.x;
-#if QK_K == 256
     const int64_t il = tid/8; // 0...3
     const int64_t ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -396,10 +311,6 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
     const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
     const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
     for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-#else
-    NO_DEVICE_CODE;
-#endif
-
 }
 
 template<typename dst_t>
@@ -409,7 +320,6 @@ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_
     const block_iq2_s * x = (const block_iq2_s *) vx;
 
     const int64_t tid = threadIdx.x;
-#if QK_K == 256
     const int64_t il = tid/8; // 0...3
     const int64_t ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -417,10 +327,6 @@ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_
     const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
     const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
     for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-#else
-    NO_DEVICE_CODE;
-#endif
-
 }
 
 template<typename dst_t>
@@ -430,7 +336,6 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
     const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
 
     const int64_t tid = threadIdx.x;
-#if QK_K == 256
     const int64_t il = tid/8; // 0...3
     const int64_t ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -445,10 +350,6 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
         y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
         y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
     }
-#else
-    NO_DEVICE_CODE;
-#endif
-
 }
 
 template<typename dst_t>
@@ -458,7 +359,6 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_
     const block_iq3_s * x = (const block_iq3_s *) vx;
 
     const int64_t tid = threadIdx.x;
-#if QK_K == 256
     const int64_t il = tid/8; // 0...3
     const int64_t ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -471,10 +371,6 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_
         y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
         y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
     }
-#else
-    NO_DEVICE_CODE;
-#endif
-
 }
 
 template<typename dst_t>
@@ -484,7 +380,6 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
     const block_iq1_s * x = (const block_iq1_s *) vx;
 
     const int64_t tid = threadIdx.x;
-#if QK_K == 256
     const int64_t il = tid/8; // 0...3
     const int64_t ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -497,10 +392,6 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
     for (int j = 0; j < 8; ++j) {
         y[j] = d * (q[j] + delta);
     }
-#else
-    NO_DEVICE_CODE;
-#endif
-
 }
 
 template<typename dst_t>
@@ -510,7 +401,6 @@ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_
     const block_iq1_m * x = (const block_iq1_m *) vx;
 
     const int64_t tid = threadIdx.x;
-#if QK_K == 256
     const int64_t il = tid/8; // 0...3
     const int64_t ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -527,13 +417,8 @@ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_
     for (int j = 0; j < 8; ++j) {
         y[j] = d * (q[j] + delta);
     }
-#else
-    NO_DEVICE_CODE;
-#endif
-
 }
 
-
 template<typename dst_t>
 static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {
 
@@ -550,10 +435,8 @@ static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst
         y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
         y[j+16] = d * kvalues_iq4nl[q4[j] >>  4];
     }
-
 }
 
-#if QK_K != 64
 template<typename dst_t>
 static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
     const int64_t i = blockIdx.x;
@@ -570,7 +453,6 @@ static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst
         y[j+16] = d * kvalues_iq4nl[q4[j] >>  4];
     }
 }
-#endif
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
 static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
@@ -592,21 +474,13 @@ static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half *
 template<typename dst_t>
 static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = k / QK_K;
-#if QK_K == 256
    dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
-#else
-    dequantize_block_q2_K<<<nb, 32, 0, stream>>>(vx, y);
-#endif
 }
 
 template<typename dst_t>
 static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = k / QK_K;
-#if QK_K == 256
    dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
-#else
-    dequantize_block_q3_K<<<nb, 32, 0, stream>>>(vx, y);
-#endif
 }
 
 template<typename dst_t>
@@ -632,21 +506,13 @@ static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int64_t k
 template<typename dst_t>
 static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = k / QK_K;
-#if QK_K == 256
    dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
-#else
-    dequantize_block_q5_K<<<nb, 32, 0, stream>>>(vx, y);
-#endif
 }
 
 template<typename dst_t>
 static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = k / QK_K;
-#if QK_K == 256
    dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
-#else
-    dequantize_block_q6_K<<<nb, 32, 0, stream>>>(vx, y);
-#endif
 }
 
 template<typename dst_t>
@@ -700,11 +566,7 @@ static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int64_t
 template<typename dst_t>
 static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = (k + QK_K - 1) / QK_K;
-#if QK_K == 64
-    dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
-#else
    dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y);
-#endif
 }
 
 template <typename src_t, typename dst_t>
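A note on the q4_K/q5_K path above: get_scale_min_k4, now compiled unconditionally, unpacks the 12-byte K_SCALE_SIZE array into 8 pairs of 6-bit (scale, min) values. A plain-C port for illustration — the first line of the else branch is not visible in this hunk and is reconstructed from the upstream source, so treat it as an assumption:

#include <stdint.h>

// plain-C port of the device helper get_scale_min_k4 shown above:
// blocks 0-3 store their 6-bit scale/min directly; blocks 4-7 recombine
// 4 low bits from bytes 8-11 with 2 high bits from bytes 0-7
static void get_scale_min_k4_host(int j, const uint8_t * q, uint8_t * d, uint8_t * m) {
    if (j < 4) {
        *d = q[j] & 63;
        *m = q[j + 4] & 63;
    } else {
        *d = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4); // reconstructed, not shown in this hunk
        *m = (q[j + 4] >>  4) | ((q[j - 0] >> 6) << 4);
    }
}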
ggml-cuda/dmmv.cu
CHANGED
@@ -22,7 +22,6 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
 
     float tmp = 0; // partial sum for thread in warp
 
-#if QK_K == 256
     const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
     const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
 
@@ -71,37 +70,6 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
         tmp += dall * sum1 - dmin * sum2;
 
     }
-#else
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
-    const int offset = tid * K_QUANTS_PER_ITERATION;
-
-    uint32_t uaux[2];
-    const uint8_t * d = (const uint8_t *)uaux;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y = yy + i * QK_K + offset;
-        const uint8_t * q = x[i].qs + offset;
-        const uint32_t * s = (const uint32_t *)x[i].scales;
-
-        uaux[0] = s[0] & 0x0f0f0f0f;
-        uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
-
-        const float2 dall = __half22float2(x[i].dm);
-
-        float sum1 = 0, sum2 = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            const uint8_t ql = q[l];
-            sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
-                  + y[l+16] * d[1] * ((ql >> 2) & 3)
-                  + y[l+32] * d[2] * ((ql >> 4) & 3)
-                  + y[l+48] * d[3] * ((ql >> 6) & 3);
-            sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
-        }
-        tmp += dall.x * sum1 - dall.y * sum2;
-    }
-#endif
 
     // sum up partial sums and write back result
     tmp = warp_reduce_sum(tmp);
@@ -123,8 +91,6 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
 
     float tmp = 0; // partial sum for thread in warp
 
-#if QK_K == 256
-
     const uint16_t kmask1 = 0x0303;
     const uint16_t kmask2 = 0x0f0f;
 
@@ -175,34 +141,6 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
         tmp += d * sum;
 
     }
-#else
-
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
-    const int offset = tid * K_QUANTS_PER_ITERATION;         // 0...15 or 0...14
-    const int in = offset/8;                                 // 0 or 1
-    const int im = offset%8;                                 // 0...7
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y = yy + i * QK_K + offset;
-        const uint8_t * q = x[i].qs + offset;
-        const uint8_t * s = x[i].scales;
-
-        const float dall = (float)x[i].d;
-
-        float sum = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            const uint8_t hl = x[i].hmask[im+l] >> in;
-            const uint8_t ql = q[l];
-            sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
-                 + y[l+16] * dall * ((s[0] >>  4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
-                 + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
-                 + y[l+48] * dall * ((s[1] >>  4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
-        }
-        tmp += sum;
-    }
-#endif
 
     // sum up partial sums and write back result
     tmp = warp_reduce_sum(tmp);
@@ -221,7 +159,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
 
     const block_q4_K * x = (const block_q4_K *)vx + ib0;
 
-#if QK_K == 256
     const uint16_t kmask1 = 0x3f3f;
     const uint16_t kmask2 = 0x0f0f;
     const uint16_t kmask3 = 0xc0c0;
@@ -306,36 +243,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
 #endif
 
     }
-#else
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
-
-    const int step = tid * K_QUANTS_PER_ITERATION;
-
-    uint16_t aux16[2];
-    const uint8_t * s = (const uint8_t *)aux16;
-
-    float tmp = 0;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-        const uint8_t * q = x[i].qs + step;
-        const float   * y = yy + i*QK_K + step;
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        aux16[0] = a[0] & 0x0f0f;
-        aux16[1] = (a[0] >> 4) & 0x0f0f;
-        const float d = (float)x[i].dm[0];
-        const float m = (float)x[i].dm[1];
-        float sum = 0.f;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
-                 + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
-                 + y[j+32] * (d * s[1] * (q[j+ 0] >>  4) - m * s[3])
-                 + y[j+48] * (d * s[1] * (q[j+16] >>  4) - m * s[3]);
-        }
-        tmp += sum;
-    }
-
-#endif
 
     // sum up partial sums and write back result
     tmp = warp_reduce_sum(tmp);
@@ -355,7 +262,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
 
     float tmp = 0; // partial sum for thread in warp
 
-#if QK_K == 256
     const uint16_t kmask1 = 0x3f3f;
     const uint16_t kmask2 = 0x0f0f;
     const uint16_t kmask3 = 0xc0c0;
@@ -426,30 +332,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
         tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
     }
 
-#else
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
-    const int step = tid * K_QUANTS_PER_ITERATION;
-    const int im = step/8;
-    const int in = step%8;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-        const uint8_t * q = x[i].qs + step;
-        const int8_t  * s = x[i].scales;
-        const float   * y = yy + i*QK_K + step;
-        const float     d = x[i].d;
-        float sum = 0.f;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            const uint8_t h = x[i].qh[in+j] >> im;
-            sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
-                 + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
-                 + y[j+32] * d * s[2] * ((q[j+ 0] >>  4) - ((h >> 4) & 1 ? 0 : 16))
-                 + y[j+48] * d * s[3] * ((q[j+16] >>  4) - ((h >> 6) & 1 ? 0 : 16));
-        }
-        tmp += sum;
-    }
-#endif
-
     // sum up partial sums and write back result
     tmp = warp_reduce_sum(tmp);
 
@@ -470,8 +352,6 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
 
     const block_q6_K * x = (const block_q6_K *)vx + ib0;
 
-#if QK_K == 256
-
     const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
     const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0, 1
 
@@ -526,37 +406,6 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
 
     }
 
-#else
-
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...7
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0...3
-
-    const int step = tid * K_QUANTS_PER_ITERATION;
-
-    float tmp = 0; // partial sum for thread in warp
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y  = yy + i * QK_K + step;
-        const uint8_t * ql = x[i].ql + step;
-        const uint8_t * qh = x[i].qh + step;
-        const int8_t  * s  = x[i].scales;
-
-        const float d = x[i+0].d;
-
-        float sum = 0;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
-                 + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
-                 + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >> 4) | ((qh[j] & 0x30) >> 0)) - 32)
-                 + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >> 4) | ((qh[j] & 0xc0) >> 2)) - 32);
-        }
-        tmp += sum;
-
-    }
-
-#endif
-
     // sum up partial sums and write back result
     tmp = warp_reduce_sum(tmp);
 
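Every dmmv kernel above funnels its per-thread partial sum through warp_reduce_sum before the result is written back. That helper is defined elsewhere (ggml-cuda/common.cuh upstream), so as an illustration only, a typical shuffle-based warp sum looks like this sketch:

// butterfly reduction across the 32 lanes of a warp; after the loop every
// lane holds the full sum, so lane 0 can write it back as the kernels above do
static __device__ __forceinline__ float warp_reduce_sum_sketch(float x) {
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
    }
    return x;
}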
ggml-cuda/mmq.cu
CHANGED
@@ -826,11 +826,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
         const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
 
-#if QK_K == 256
         x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
-#else
-        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
-#endif
     }
 
 #pragma unroll
@@ -933,9 +929,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
         const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
 
-#if QK_K == 256
         x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
-#endif
     }
 
 #pragma unroll
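With QK_K pinned at 256, the x_dm tile index above is fully constant-foldable. A small sketch of the arithmetic — WARP_SIZE, QR4_K and hence QI4_K are taken from upstream headers, not from this diff, so treat the concrete values as assumptions:

// index arithmetic for the x_dm stores above, assuming WARP_SIZE = 32,
// QR4_K = 2 and QI4_K = QK_K/(4*QR4_K) = 32 (upstream values): the
// WARP_SIZE/QI4_K factor collapses to 1 and i/QI4_K pads one slot per
// 32 rows to stagger shared-memory accesses
enum { WARP_SIZE = 32, QK_K = 256, QR4_K = 2, QI4_K = QK_K / (4*QR4_K) };

static inline int x_dm_index(int i, int kbxd) {
    return i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd;
}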
ggml-cuda/vecdotq.cuh
CHANGED
|
@@ -712,7 +712,6 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
|
|
| 712 |
static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
|
| 713 |
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
| 714 |
|
| 715 |
-
#ifndef GGML_QKK_64
|
| 716 |
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
|
| 717 |
|
| 718 |
int v[2];
|
|
@@ -754,58 +753,11 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
|
|
| 754 |
}
|
| 755 |
|
| 756 |
return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
|
| 757 |
-
|
| 758 |
-
#else
|
| 759 |
-
|
| 760 |
-
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
| 761 |
-
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
|
| 762 |
-
|
| 763 |
-
float sumf_d = 0.0f;
|
| 764 |
-
float sumf_m = 0.0f;
|
| 765 |
-
|
| 766 |
-
uint16_t aux16[2];
|
| 767 |
-
const uint8_t * s = (const uint8_t *)aux16;
|
| 768 |
-
|
| 769 |
-
const uint16_t * a = (const uint16_t *)bq4_K->scales;
|
| 770 |
-
aux16[0] = a[0] & 0x0f0f;
|
| 771 |
-
aux16[1] = (a[0] >> 4) & 0x0f0f;
|
| 772 |
-
|
| 773 |
-
const float dall = bq4_K->dm[0];
|
| 774 |
-
const float dmin = bq4_K->dm[1];
|
| 775 |
-
|
| 776 |
-
const float d8_1 = __low2float(bq8_1[0].ds);
|
| 777 |
-
const float d8_2 = __low2float(bq8_1[1].ds);
|
| 778 |
-
|
| 779 |
-
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
|
| 780 |
-
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
|
| 781 |
-
const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
|
| 782 |
-
const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
|
| 783 |
-
|
| 784 |
-
const int * q4 = (const int *)bq4_K->qs + (iqs/2);
|
| 785 |
-
const int v1 = q4[0];
|
| 786 |
-
const int v2 = q4[4];
|
| 787 |
-
|
| 788 |
-
const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
|
| 789 |
-
const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
|
| 790 |
-
const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
|
| 791 |
-
const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
|
| 792 |
-
|
| 793 |
-
sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
|
| 794 |
-
sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
|
| 795 |
-
|
| 796 |
-
return dall * sumf_d - dmin * sumf_m;
|
| 797 |
-
|
| 798 |
-
#else
|
| 799 |
-
NO_DEVICE_CODE;
|
| 800 |
-
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
| 801 |
-
|
| 802 |
-
#endif
|
| 803 |
}
|
| 804 |
|
| 805 |
static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
|
| 806 |
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
| 807 |
|
| 808 |
-
#ifndef GGML_QKK_64
|
| 809 |
const block_q5_K * bq5_K = (const block_q5_K *) vbq;
|
| 810 |
|
| 811 |
int vl[2];
|
|
@@ -847,48 +799,6 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
|
|
| 847 |
}
|
| 848 |
|
| 849 |
return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
|
| 850 |
-
|
| 851 |
-
#else
|
| 852 |
-
|
| 853 |
-
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
| 854 |
-
const block_q5_K * bq5_K = (const block_q5_K *) vbq;
|
| 855 |
-
|
| 856 |
-
const int8_t * s = bq5_K->scales;
|
| 857 |
-
|
| 858 |
-
const float d = bq5_K->d;
|
| 859 |
-
|
| 860 |
-
const float d8_1 = __low2half(bq8_1[0].ds);
|
| 861 |
-
const float d8_2 = __low2half(bq8_1[1].ds);
|
| 862 |
-
|
| 863 |
-
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
|
| 864 |
-
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
|
| 865 |
-
const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
|
| 866 |
-
const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
|
| 867 |
-
|
| 868 |
-
const int * ql = (const int *)bq5_K->qs + (iqs/2);
|
| 869 |
-
const int vl1 = ql[0];
|
| 870 |
-
const int vl2 = ql[4];
|
| 871 |
-
|
| 872 |
-
const int step = 4 * (iqs/2); // 0, 4, 8, 12
|
| 873 |
-
const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
|
| 874 |
-
const int in = step%8; // 0, 4, 0, 4
|
| 875 |
-
const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
|
| 876 |
-
|
| 877 |
-
const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
|
| 878 |
-
const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
|
| 879 |
-
const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
|
| 880 |
-
const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
|
| 881 |
-
|
| 882 |
-
const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
|
| 883 |
-
+ d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
|
| 884 |
-
|
| 885 |
-
return d * sumf_d;
|
| 886 |
-
|
| 887 |
-
#else
|
| 888 |
-
NO_DEVICE_CODE;
|
| 889 |
-
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
| 890 |
-
|
| 891 |
-
#endif
|
| 892 |
}
|
| 893 |
|
| 894 |
static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
|
|
@@ -919,7 +829,6 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(

 static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
     const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-#if QK_K == 256
     const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;

 #if QR2_XXS == 8
@@ -960,15 +869,11 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
     }
     return d * (sumi1 + sumi2);
 #endif
-#else
-    NO_DEVICE_CODE;
-#endif
 }

 static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
     const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-#if QK_K == 256
     const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;

     const int ib32 = iqs;
@@ -1002,17 +907,12 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
     GGML_UNUSED(ksigns64);
     NO_DEVICE_CODE;
 #endif
-#else
-    GGML_UNUSED(ksigns64);
-    NO_DEVICE_CODE;
-#endif
 }

 // TODO
 static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
     const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-#if QK_K == 256
     const block_iq2_s * bq2 = (const block_iq2_s *) vbq;

     const int ib32 = iqs;
@@ -1048,16 +948,11 @@ static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
     GGML_UNUSED(ksigns64);
     NO_DEVICE_CODE;
 #endif
-#else
-    GGML_UNUSED(ksigns64);
-    NO_DEVICE_CODE;
-#endif
 }

 static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
     const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-#if QK_K == 256
     const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;

     const int ib32 = iqs;
@@ -1082,16 +977,12 @@ static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
 #else
     NO_DEVICE_CODE;
 #endif
-#else
-    NO_DEVICE_CODE;
-#endif
 }

 // TODO: don't use lookup table for signs
 static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
     const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-#if QK_K == 256
     const block_iq3_s * bq2 = (const block_iq3_s *) vbq;

     const int ib32 = iqs;
@@ -1114,14 +1005,10 @@ static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
 #else
     NO_DEVICE_CODE;
 #endif
-#else
-    NO_DEVICE_CODE;
-#endif
 }

 static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
     const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-#if QK_K == 256
     const block_iq1_s * bq1 = (const block_iq1_s *) vbq;

     const int ib32 = iqs;
@@ -1149,14 +1036,10 @@ static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
     const float d = d1q * __low2float (bq8_1[ib32].ds);
     const float m = d1q * __high2float(bq8_1[ib32].ds);
     return d * sumi + m * delta;
-#else
-    NO_DEVICE_CODE;
-#endif
 }

 static __device__ __forceinline__ float vec_dot_iq1_m_q8_1(
     const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-#if QK_K == 256
     const block_iq1_m * bq1 = (const block_iq1_m *) vbq;

     const int ib32 = iqs;
@@ -1192,9 +1075,6 @@ static __device__ __forceinline__ float vec_dot_iq1_m_q8_1(
     scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
     const float d = (float)scale.f16 * __low2float (bq8_1[ib32].ds);
     return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1));
-#else
-    NO_DEVICE_CODE;
-#endif
 }

 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
@@ -1250,9 +1130,7 @@ static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
 static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
     const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

-#if QK_K == 256
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-
     const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
     const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
@@ -1270,10 +1148,6 @@ static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
         sumi2 = __dp4a(v2, q8[j+4], sumi2);
     }
     return d * (sumi1 + sumi2);
-
-#else
-    NO_DEVICE_CODE;
-#endif
 #else
     return vec_dot_iq4_xs_q8_1(vbq, bq8_1, iqs);
 #endif
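A reading aid for the CUDA hunks above: every vec_dot_*_q8_1 variant is built around the __dp4a integer intrinsic, which is why each one is fenced by MIN_CC_DP4A. A minimal scalar sketch of what __dp4a(a, b, c) computes — dp4a_ref and the example values are ours, not ggml symbols:

#include <cstdint>
#include <cstdio>

// Model of CUDA's __dp4a: treat each 32-bit operand as four signed 8-bit
// lanes, multiply lane-wise, and accumulate the four products into c.
static int32_t dp4a_ref(int32_t a, int32_t b, int32_t c) {
    for (int i = 0; i < 4; ++i) {
        const int8_t ai = (int8_t)(a >> (8*i));
        const int8_t bi = (int8_t)(b >> (8*i));
        c += (int32_t)ai * (int32_t)bi;
    }
    return c;
}

int main() {
    // lanes 0x04, 0x03, 0x02, 0x01 each multiplied by 1 -> 4+3+2+1 = 10
    printf("%d\n", dp4a_ref(0x01020304, 0x01010101, 0));
    return 0;
}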
ggml-metal.m
CHANGED
@@ -381,10 +381,6 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
         // dictionary of preprocessor macros
         NSMutableDictionary * prep = [NSMutableDictionary dictionary];

-#ifdef GGML_QKK_64
-        prep[@"GGML_QKK_64"] = @(1);
-#endif
-
         MTLCompileOptions* options = [MTLCompileOptions new];
         options.preprocessorMacros = prep;

@@ -1773,11 +1769,7 @@ static enum ggml_status ggml_metal_graph_compute(
                 [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
             }
             else if (src0t == GGML_TYPE_Q3_K) {
-#ifdef GGML_QKK_64
-                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-#else
                 [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-#endif
             }
             else if (src0t == GGML_TYPE_Q5_K) {
                 [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
@@ -2018,12 +2010,7 @@ static enum ggml_status ggml_metal_graph_compute(
                     {
                         nth0 = 4;
                         nth1 = 16;
-#if QK_K == 64
-                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32].pipeline;
-#else
                         pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32].pipeline;
-#endif
-
                     } break;
                 default:
                     {
@@ -2088,11 +2075,7 @@ static enum ggml_status ggml_metal_graph_compute(
                 [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
             }
             else if (src0t == GGML_TYPE_Q3_K) {
-#ifdef GGML_QKK_64
-                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-#else
                 [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-#endif
             }
             else if (src0t == GGML_TYPE_Q5_K) {
                 [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
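The dispatch expressions in these hunks are ceiling divisions: at QK_K = 256 a Q3_K threadgroup covers four rows ((ne01 + 3)/4), while the removed GGML_QKK_64 branch covered two ((ne01 + 1)/2). A sketch of the pattern, with hypothetical names:

#include <cstdio>

// ceil(nrows / rows_per_tg) without floating point
static long ngroups(long nrows, long rows_per_tg) {
    return (nrows + rows_per_tg - 1) / rows_per_tg;
}

int main() {
    printf("%ld\n", ngroups(4097, 4)); // 1025 threadgroups for 4097 rows
    return 0;
}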
ggml-metal.metal
CHANGED
@@ -3386,7 +3386,6 @@ void kernel_mul_mv_q2_K_f32_impl(

     const int step = sizeof(block_q2_K) * nb;

-#if QK_K == 256
     const int ix = tiisg/8;  // 0...3
     const int it = tiisg%8;  // 0...7
     const int iq = it/4;     // 0 or 1
@@ -3438,57 +3437,6 @@ void kernel_mul_mv_q2_K_f32_impl(

         y4 += 4 * QK_K;
     }
-#else
-    const int ix = tiisg/2;  // 0...15
-    const int it = tiisg%2;  // 0...1
-
-    device const float * y4 = y + ix * QK_K + 8 * it;
-
-    for (int ib = ix; ib < nb; ib += 16) {
-
-        float4 sumy = {0.f, 0.f, 0.f, 0.f};
-        for (int i = 0; i < 8; ++i) {
-            yl[i+ 0] = y4[i+ 0]; sumy[0] += yl[i+ 0];
-            yl[i+ 8] = y4[i+16]; sumy[1] += yl[i+ 8];
-            yl[i+16] = y4[i+32]; sumy[2] += yl[i+16];
-            yl[i+24] = y4[i+48]; sumy[3] += yl[i+24];
-        }
-
-        device const uint8_t  * sc = (device const uint8_t  *)x[ib].scales;
-        device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 4 * it;
-        device const half     * dh = &x[ib].d;
-
-        for (int row = 0; row < N_DST; row++) {
-
-            float4 acc1 = {0.f, 0.f, 0.f, 0.f};
-            float4 acc2 = {0.f, 0.f, 0.f, 0.f};
-            for (int i = 0; i < 8; i += 2) {
-                acc1[0] += yl[i+ 0] * (qs[i/2] & 0x0003);
-                acc2[0] += yl[i+ 1] * (qs[i/2] & 0x0300);
-                acc1[1] += yl[i+ 8] * (qs[i/2] & 0x000c);
-                acc2[1] += yl[i+ 9] * (qs[i/2] & 0x0c00);
-                acc1[2] += yl[i+16] * (qs[i/2] & 0x0030);
-                acc2[2] += yl[i+17] * (qs[i/2] & 0x3000);
-                acc1[3] += yl[i+24] * (qs[i/2] & 0x00c0);
-                acc2[3] += yl[i+25] * (qs[i/2] & 0xc000);
-            }
-
-            float dall = dh[0];
-            float dmin = dh[1];
-            sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc2[0]) * (sc[0] & 0xF) * 1.f/ 1.f +
-                                 (acc1[1] + 1.f/256.f * acc2[1]) * (sc[1] & 0xF) * 1.f/ 4.f +
-                                 (acc1[2] + 1.f/256.f * acc2[2]) * (sc[2] & 0xF) * 1.f/16.f +
-                                 (acc1[3] + 1.f/256.f * acc2[3]) * (sc[3] & 0xF) * 1.f/64.f) -
-                         dmin * (sumy[0] * (sc[0] >> 4) + sumy[1] * (sc[1] >> 4) + sumy[2] * (sc[2] >> 4) + sumy[3] * (sc[3] >> 4));
-
-            qs += step/2;
-            sc += step;
-            dh += step/2;
-        }
-
-        y4 += 16 * QK_K;
-    }
-#endif

     for (int row = 0; row < N_DST; ++row) {
         all_sum = simd_sum(sumf[row]);
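For orientation, the arithmetic the q2_K kernel performs per value (in both the kept and the removed path) is x = dall*scale*q - dmin*min, with a 4-bit scale and a 4-bit min packed into one byte of block_q2_K.scales. A scalar sketch (dequant_q2 is a hypothetical helper; the field handling mirrors the kernel lines above):

// Expand one 2-bit quant q2 (0...3) given the packed scale/min byte.
static float dequant_q2(float dall, float dmin, unsigned char packed, int q2) {
    const int scale = packed & 0xF;  // low nibble: scale
    const int min   = packed >> 4;   // high nibble: min
    return dall * scale * q2 - dmin * min;
}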
@@ -3526,7 +3474,6 @@ kernel void kernel_mul_mv_q2_K_f32(
     kernel_mul_mv_q2_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
 }

-#if QK_K == 256
 void kernel_mul_mv_q3_K_f32_impl(
         device const  void * src0,
         device const float * src1,
@@ -3685,84 +3632,6 @@ void kernel_mul_mv_q3_K_f32_impl(
         }
     }
 }
-#else
-void kernel_mul_mv_q3_K_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        threadgroup int8_t * shared_values [[threadgroup(0)]],
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    const int nb = ne00/QK_K;
-
-    const int64_t r0 = tgpig.x;
-    const int64_t r1 = tgpig.y;
-    const int64_t im = tgpig.z;
-
-    const int row = 2 * r0 + sgitg;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-
-    device const block_q3_K * x = (device const block_q3_K *) src0 + row*nb + offset0;
-    device const float     * yy = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
-
-    const int ix = tiisg/4;
-    const int il = 4 * (tiisg%4); // 0, 4, 8, 12
-    const int iq = il/8;          // 0, 0, 1, 1
-    const int in = il%8;          // 0, 4, 0, 4
-
-    float2 sum = {0.f, 0.f};
-
-    for (int i = ix; i < nb; i += 8) {
-
-        const float d_all = (float)(x[i].d);
-
-        device const uint16_t * q = (device const uint16_t *)(x[i].qs + il);
-        device const uint16_t * h = (device const uint16_t *)(x[i].hmask + in);
-        device const uint16_t * s = (device const uint16_t *)(x[i].scales);
-        device const float    * y = yy + i * QK_K + il;
-
-        const float d1 = d_all * ((int32_t)(s[0] & 0x000F) - 8);
-        const float d2 = d_all * ((int32_t)(s[0] & 0x00F0) - 128) * 1.f/64.f;
-        const float d3 = d_all * ((int32_t)(s[0] & 0x0F00) - 2048) * 1.f/4096.f;
-        const float d4 = d_all * ((int32_t)(s[0] & 0xF000) - 32768) * 1.f/262144.f;
-
-        for (int l = 0; l < 4; l += 2) {
-            const uint16_t hm = h[l/2] >> iq;
-            sum[0] += y[l+ 0] * d1 * ((int32_t)(q[l/2] & 0x0003) - ((hm & 0x0001) ? 0 :     4))
-                    + y[l+16] * d2 * ((int32_t)(q[l/2] & 0x000c) - ((hm & 0x0004) ? 0 :    16))
-                    + y[l+32] * d3 * ((int32_t)(q[l/2] & 0x0030) - ((hm & 0x0010) ? 0 :    64))
-                    + y[l+48] * d4 * ((int32_t)(q[l/2] & 0x00c0) - ((hm & 0x0040) ? 0 :   256));
-            sum[1] += y[l+ 1] * d1 * ((int32_t)(q[l/2] & 0x0300) - ((hm & 0x0100) ? 0 :  1024))
-                    + y[l+17] * d2 * ((int32_t)(q[l/2] & 0x0c00) - ((hm & 0x0400) ? 0 :  4096))
-                    + y[l+33] * d3 * ((int32_t)(q[l/2] & 0x3000) - ((hm & 0x1000) ? 0 : 16384))
-                    + y[l+49] * d4 * ((int32_t)(q[l/2] & 0xc000) - ((hm & 0x4000) ? 0 : 65536));
-        }
-
-    }
-    const float sumf = sum[0] + sum[1] * 1.f/256.f;
-
-    const float tot = simd_sum(sumf);
-    if (tiisg == 0) {
-        dst[r1*ne0 + im*ne0*ne1 + row] = tot;
-    }
-
-}
-#endif

 [[host_name("kernel_mul_mv_q3_K_f32")]]
 kernel void kernel_mul_mv_q3_K_f32(
@@ -3792,7 +3661,6 @@ kernel void kernel_mul_mv_q3_K_f32(
     kernel_mul_mv_q3_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
 }

-#if QK_K == 256
 void kernel_mul_mv_q4_K_f32_impl(
         device const  void * src0,
         device const float * src1,
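Common to the kept and removed q3_K paths is how a signed 3-bit quant is rebuilt: two low bits come from qs and the third from hmask, applied as a -4 offset when the high bit is clear. Scalar sketch (q3_value is a hypothetical name):

// q_low2 in 0...3; returns a value in -4...3
static int q3_value(int q_low2, bool high_bit) {
    return q_low2 - (high_bit ? 0 : 4);
}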
@@ -3906,103 +3774,6 @@ void kernel_mul_mv_q4_K_f32_impl(
         }
     }
 }
-#else
-void kernel_mul_mv_q4_K_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        threadgroup int8_t * shared_values [[threadgroup(0)]],
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    const int ix = tiisg/4;  // 0...7
-    const int it = tiisg%4;  // 0...3
-
-    const int nb = ne00/QK_K;
-    const int r0 = tgpig.x;
-    const int r1 = tgpig.y;
-    const int im = tgpig.z;
-    const int first_row = r0 * N_DST;
-    const int ib_row = first_row * nb;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-
-    device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0;
-    device const float      * y = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
-
-    float yl[8];
-    float yh[8];
-    float sumf[N_DST]={0.f}, all_sum;
-
-    const int step = sizeof(block_q4_K) * nb / 2;
-
-    device const float * y4 = y + ix * QK_K + 8 * it;
-
-    uint16_t sc16[4];
-
-    for (int ib = ix; ib < nb; ib += 8) {
-
-        float2 sumy = {0.f, 0.f};
-        for (int i = 0; i < 8; ++i) {
-            yl[i] = y4[i+ 0]; sumy[0] += yl[i];
-            yh[i] = y4[i+32]; sumy[1] += yh[i];
-        }
-
-        device const uint16_t * sc = (device const uint16_t *)x[ib].scales;
-        device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 4 * it;
-        device const half     * dh = x[ib].d;
-
-        for (int row = 0; row < N_DST; row++) {
-
-            sc16[0] = sc[0] & 0x000f;
-            sc16[1] = sc[0] & 0x0f00;
-            sc16[2] = sc[0] & 0x00f0;
-            sc16[3] = sc[0] & 0xf000;
-
-            float2 acc1 = {0.f, 0.f};
-            float2 acc2 = {0.f, 0.f};
-            for (int i = 0; i < 8; i += 2) {
-                acc1[0] += yl[i+0] * (qs[i/2] & 0x000F);
-                acc1[1] += yl[i+1] * (qs[i/2] & 0x0F00);
-                acc2[0] += yh[i+0] * (qs[i/2] & 0x00F0);
-                acc2[1] += yh[i+1] * (qs[i/2] & 0xF000);
-            }
-
-            float dall = dh[0];
-            float dmin = dh[1];
-            sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc16[0] +
-                                 (acc2[0] + 1.f/256.f * acc2[1]) * sc16[1] * 1.f/4096.f) -
-                         dmin * 1.f/16.f * (sumy[0] * sc16[2] + sumy[1] * sc16[3] * 1.f/256.f);
-
-            qs += step;
-            sc += step;
-            dh += step;
-        }
-
-        y4 += 8 * QK_K;
-    }
-
-    for (int row = 0; row < N_DST; ++row) {
-        all_sum = simd_sum(sumf[row]);
-        if (tiisg == 0) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
-        }
-    }
-}
-#endif

 [[host_name("kernel_mul_mv_q4_K_f32")]]
 kernel void kernel_mul_mv_q4_K_f32(
@@ -4070,8 +3841,6 @@ void kernel_mul_mv_q5_K_f32_impl(

     const int step = sizeof(block_q5_K) * nb;

-#if QK_K == 256
-#
     float yl[16], yh[16];

     const uint16_t kmask1 = 0x3f3f;
@@ -4154,54 +3923,6 @@ void kernel_mul_mv_q5_K_f32_impl(
         y1 += 4 * QK_K;

     }
-#else
-    float yl[8], yh[8];
-
-    const int il = 4 * (tiisg/8);  // 0, 4, 8, 12
-    const int ix = tiisg%8;
-    const int iq = il/8;           // 0, 0, 1, 1
-    const int in = il%8;           // 0, 4, 0, 4
-
-    device const float * y = yy + ix*QK_K + il;
-
-    for (int i = ix; i < nb; i += 8) {
-
-        for (int l = 0; l < 4; ++l) {
-            yl[l+0] = y[l+ 0];
-            yl[l+4] = y[l+16];
-            yh[l+0] = y[l+32];
-            yh[l+4] = y[l+48];
-        }
-
-        device const half    * dh = &x[i].d;
-        device const uint8_t * q  = x[i].qs + il;
-        device const uint8_t * h  = x[i].qh + in;
-        device const int8_t  * s  = x[i].scales;
-
-        for (int row = 0; row < 2; ++row) {
-
-            const float d = dh[0];
-
-            float2 acc = {0.f, 0.f};
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t hl = h[l] >> iq;
-                acc[0] += yl[l+0] * s[0] * ((int16_t)(q[l+ 0] & 0x0F) - (hl & 0x01 ? 0 :  16))
-                        + yl[l+4] * s[1] * ((int16_t)(q[l+16] & 0x0F) - (hl & 0x04 ? 0 :  16));
-                acc[1] += yh[l+0] * s[2] * ((int16_t)(q[l+ 0] & 0xF0) - (hl & 0x10 ? 0 : 256))
-                        + yh[l+4] * s[3] * ((int16_t)(q[l+16] & 0xF0) - (hl & 0x40 ? 0 : 256));
-            }
-            sumf[row] += d * (acc[0] + 1.f/16.f * acc[1]);
-
-            q += step;
-            h += step;
-            s += step;
-            dh += step/2;
-
-        }
-
-        y += 8 * QK_K;
-    }
-#endif

     for (int row = 0; row < 2; ++row) {
         const float tot = simd_sum(sumf[row]);
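The q5_K paths follow the same idea one bit wider: a 4-bit low nibble from qs plus one bit from qh, applied as a -16 offset when the high bit is clear — compare `(q[l+ 0] & 0x0F) - (hl & 0x01 ? 0 : 16)` in the removed branch. Sketch (q5_value is a hypothetical name):

// q_low4 in 0...15; returns a value in -16...15
static int q5_value(int q_low4, bool high_bit) {
    return q_low4 - (high_bit ? 0 : 16);
}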
@@ -4280,7 +4001,6 @@ void kernel_mul_mv_q6_K_f32_impl(

     float sumf = 0;

-#if QK_K == 256
     const int tid = tiisg/2;
     const int ix  = tiisg%2;
     const int ip  = tid/8;  // 0 or 1
@@ -4316,30 +4036,6 @@ void kernel_mul_mv_q6_K_f32_impl(

     }

-#else
-    const int ix = tiisg/4;
-    const int il = 4*(tiisg%4);
-
-    for (int i = ix; i < nb; i += 8) {
-        device const float   * y  = yy + i * QK_K + il;
-        device const uint8_t * ql = x[i].ql + il;
-        device const uint8_t * qh = x[i].qh + il;
-        device const int8_t  * s  = x[i].scales;
-
-        const float d = x[i].d;
-
-        float4 sums = {0.f, 0.f, 0.f, 0.f};
-        for (int l = 0; l < 4; ++l) {
-            sums[0] += y[l+ 0] * ((int8_t)((ql[l+ 0] & 0xF) | ((qh[l] & kmask1) << 4)) - 32);
-            sums[1] += y[l+16] * ((int8_t)((ql[l+16] & 0xF) | ((qh[l] & kmask2) << 2)) - 32);
-            sums[2] += y[l+32] * ((int8_t)((ql[l+ 0] >>  4) | ((qh[l] & kmask3) >> 0)) - 32);
-            sums[3] += y[l+48] * ((int8_t)((ql[l+16] >>  4) | ((qh[l] & kmask4) >> 2)) - 32);
-        }
-        sumf += d * (sums[0] * s[0] + sums[1] * s[1] + sums[2] * s[2] + sums[3] * s[3]);
-    }
-
-#endif
-
     const float tot = simd_sum(sumf);
     if (tiisg == 0) {
         dst[r1*ne0 + im*ne0*ne1 + row] = tot;
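q6_K keeps 4 low bits in ql and 2 high bits in qh, then recenters by 32 — exactly what `((ql[l+ 0] & 0xF) | ((qh[l] & kmask1) << 4)) - 32` does in the removed loop. Sketch (q6_value is a hypothetical name):

// ql_low4 in 0...15, qh_high2 in 0...3; returns a value in -32...31
static int q6_value(int ql_low4, int qh_high2) {
    return ((qh_high2 << 4) | ql_low4) - 32;
}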
@@ -5173,9 +4869,7 @@ void kernel_mul_mv_iq1_m_f32_impl(

     device const float * y4 = y + 32 * ix;

-#if QK_K != 64
     iq1m_scale_t scale;
-#endif

     for (int ib32 = ix; ib32 < nb32; ib32 += 32) {

@@ -5196,10 +4890,7 @@ void kernel_mul_mv_iq1_m_f32_impl(
         device const uint16_t * sc = (device const uint16_t *)xr->scales;

         for (int row = 0; row < N_DST; row++) {
-
-#if QK_K != 64
             scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
-#endif

             constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700)));
             constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((qh[0] << 4) & 0x700)));
@@ -5215,14 +4906,9 @@ void kernel_mul_mv_iq1_m_f32_impl(
             }
             const float delta1 = sumy[0] * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA) + sumy[1] * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
             const float delta2 = sumy[2] * (qh[1] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA) + sumy[3] * (qh[1] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
-#if QK_K == 64
-            const float d = (float) *((device const half *)(sc - 1));
-            sumf[row] += d * ((sum[0] + delta1) * (2*((sc[0] >> (8*(ib%2)+0)) & 0xf) + 1) +
-                              (sum[1] + delta2) * (2*((sc[0] >> (8*(ib%2)+4)) & 0xf) + 1));
-#else
+
             sumf[row] += (float)scale.f16 * ((sum[0] + delta1) * (2*((sc[ib/2] >> (6*(ib%2)+0)) & 7) + 1) +
                                              (sum[1] + delta2) * (2*((sc[ib/2] >> (6*(ib%2)+3)) & 7) + 1));
-#endif

             sc += nb*sizeof(block_iq1_m)/2;
             qs += nb*sizeof(block_iq1_m);
@@ -5334,7 +5020,6 @@ void kernel_mul_mv_iq4_nl_f32_impl(
     }
 }

-#if QK_K != 64
 void kernel_mul_mv_iq4_xs_f32_impl(
         device const  void * src0,
         device const float * src1,
@@ -5429,7 +5114,6 @@ void kernel_mul_mv_iq4_xs_f32_impl(
         }
     }
 }
-#endif

 [[host_name("kernel_mul_mv_iq1_s_f32")]]
 kernel void kernel_mul_mv_iq1_s_f32(
@@ -5542,11 +5226,7 @@ kernel void kernel_mul_mv_iq4_xs_f32(
         uint  tiisg[[thread_index_in_simdgroup]],
         uint  sgitg[[simdgroup_index_in_threadgroup]]) {

-#if QK_K == 64
-    kernel_mul_mv_iq4_nl_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
-#else
     kernel_mul_mv_iq4_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
-#endif
 }

 //============================= templates and their specializations =============================
@@ -5672,10 +5352,9 @@ void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg
     float dl, ml;
     uint8_t sc = xb->scales[il];

-#if QK_K == 256
     q = q + 32*(il/8) + 16*(il&1);
     il = (il/2)%4;
-
+
     half  coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
     uchar mask = il>1 ? (il>2 ? 192    : 48)     : (il>0 ? 12    : 3);
     dl = d * (sc & 0xF) * coef, ml = min * (sc >> 4);
@@ -5691,7 +5370,6 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg
     device const uint8_t * h = (device const uint8_t *)xb->hmask;
     device const int8_t * scales = (device const int8_t *)xb->scales;

-#if QK_K == 256
     q = q + 32 * (il/8) + 16 * (il&1);
     h = h + 16 * (il&1);
     uint8_t m = 1 << (il/2);
@@ -5712,17 +5390,6 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg
     for (int i = 0; i < 16; ++i) {
         reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml);
     }
-#else
-    float    kcoef = il&1 ? 1.f/16.f : 1.f;
-    uint16_t kmask = il&1 ? 0xF0     : 0x0F;
-    float dl = d_all * ((scales[il/2] & kmask) * kcoef - 8);
-    float   coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
-    uint8_t mask = il>1 ? (il>2 ? 192    : 48)     : (il>0 ? 12    : 3);
-    uint8_t m = 1<<(il*2);
-    for (int i = 0; i < 16; ++i) {
-        reg[i/4][i%4] = coef * dl * ((q[i] & mask) - ((h[i%8] & (m * (1 + i/8))) ? 0 : 4.f/coef));
-    }
-#endif
 }

 static inline uchar2 get_scale_min_k4_just2(int j, int k, device const uchar * q) {
@@ -5734,7 +5401,6 @@ template <typename type4x4>
 void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg) {
     device const uchar * q = xb->qs;

-#if QK_K == 256
     short is = (il/4) * 2;
     q = q + (il/4) * 32 + 16 * (il&1);
     il = il & 3;
@@ -5743,16 +5409,7 @@ void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg
     const float min = xb->dmin;
     const float dl = d * sc[0];
     const float ml = min * sc[1];
-#else
-    (void) get_scale_min_k4_just2;
-
-    q = q + 16 * (il&1);
-    device const uint8_t * s = xb->scales;
-    device const half2 * dh = (device const half2 *)xb->d;
-    const float2 d = (float2)dh[0];
-    const float dl = il<2 ? d[0] * (s[0]&0xF) : d[0] * (s[1]&0xF)/16.h;
-    const float ml = il<2 ? d[1] * (s[0]>>4)  : d[1] * (s[1]>>4);
-#endif
+
     const ushort mask = il<2 ? 0x0F : 0xF0;
     for (int i = 0; i < 16; ++i) {
         reg[i/4][i%4] = dl * (q[i] & mask) - ml;
@@ -5764,7 +5421,6 @@ void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg
     device const uint8_t * q  = xb->qs;
     device const uint8_t * qh = xb->qh;

-#if QK_K == 256
     short is = (il/4) * 2;
     q  = q + 32 * (il/4) + 16 * (il&1);
     qh = qh + 16 * (il&1);
@@ -5781,17 +5437,6 @@ void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg
     for (int i = 0; i < 16; ++i) {
         reg[i/4][i%4] = dl * ((q[i] & mask) + (qh[i] & ul ? qh_val : 0)) - ml;
     }
-#else
-    q = q + 16 * (il&1);
-    device const int8_t * s = xb->scales;
-    const float dl = xb->d * s[il];
-    uint8_t m = 1<<(il*2);
-    const float  coef = il<2 ? 1.f  : 1.f/16.f;
-    const ushort mask = il<2 ? 0x0F : 0xF0;
-    for (int i = 0; i < 16; ++i) {
-        reg[i/4][i%4] = coef * dl * ((q[i] & mask) - (qh[i%8] & (m*(1+i/8)) ? 0.f : 16.f/coef));
-    }
-#endif
 }

 template <typename type4x4>
@@ -5801,15 +5446,11 @@ void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg
     device const uint8_t * qh = (device const uint8_t *)xb->qh;
     device const int8_t  * scales = (device const int8_t *)xb->scales;

-#if QK_K == 256
     ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
     qh = qh + 32*(il/8) + 16*(il&1);
     float sc = scales[(il%2) + 2 * ((il/2))];
     il = (il/2) & 3;
-#else
-    ql = ql + 16 * (il&1);
-    float sc = scales[il];
-#endif
+
     const uint16_t kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
     const uint16_t kmask2 = il>1 ? 0xF0              : 0x0F;
     const float    coef   = il>1 ? 1.f/16.f          : 1.f;
@@ -5966,20 +5607,15 @@ void dequantize_iq1_m(device const block_iq1_m * xb, short il, thread type4x4 &
     const int ib32 = il/2;
     il = il%2;
     device const uint16_t * sc = (device const uint16_t *)xb->scales;
-#if QK_K == 64
-    const float d = xb->d;
-#else
+
     iq1m_scale_t scale;
     scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
     const float d = scale.f16;
-#endif
+
     device const uint8_t * qs = xb->qs + 4*ib32 + 2*il;
     device const uint8_t * qh = xb->qh + 2*ib32 + il;
-#if QK_K == 64
-    const float dl = d * (2*((sc[ib32/2] >> (8*(ib32%2)+4*il)) & 0xf) + 1);
-#else
+
     const float dl = d * (2*((sc[ib32/2] >> (6*(ib32%2)+3*il)) & 7) + 1);
-#endif
     const float ml1 = dl * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
     const float ml2 = dl * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
     constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700)));
@@ -6009,9 +5645,6 @@ void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4

 template <typename type4x4>
 void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4 & reg) {
-#if QK_K == 64
-    dequantize_iq4_nl(xb, il, reg);
-#else
     // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
     const int ib32 = il/2;
     il = il%2;
@@ -6028,7 +5661,6 @@ void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4
         reg[i][2] = d * kvalues_iq4nl_f[q8[2]];
         reg[i][3] = d * kvalues_iq4nl_f[q8[3]];
     }
-#endif
 }

 template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &)>
@@ -6533,11 +6165,7 @@ kernel void kernel_mul_mm_id(
         sgitg);
 }

-#if QK_K == 256
 #define QK_NL 16
-#else
-#define QK_NL 4
-#endif

 //
 // get rows
@@ -6577,11 +6205,7 @@ template [[host_name("kernel_get_rows_iq2_s")]]   kernel get_rows_t kernel_get_r
 template [[host_name("kernel_get_rows_iq1_s")]]   kernel get_rows_t kernel_get_rows<block_iq1_s,   QK_NL, dequantize_iq1_s>;
 template [[host_name("kernel_get_rows_iq1_m")]]   kernel get_rows_t kernel_get_rows<block_iq1_m,   QK_NL, dequantize_iq1_m>;
 template [[host_name("kernel_get_rows_iq4_nl")]]  kernel get_rows_t kernel_get_rows<block_iq4_nl,  2,     dequantize_iq4_nl>;
-#if QK_K == 64
-template [[host_name("kernel_get_rows_iq4_xs")]]  kernel get_rows_t kernel_get_rows<block_iq4_xs,  2,     dequantize_iq4_xs>;
-#else
 template [[host_name("kernel_get_rows_iq4_xs")]]  kernel get_rows_t kernel_get_rows<block_iq4_xs,  QK_NL, dequantize_iq4_xs>;
-#endif

 //
 // matrix-matrix multiplication
@@ -6609,11 +6233,7 @@ template [[host_name("kernel_mul_mm_iq2_s_f32")]]   kernel mat_mm_t kernel_mul_m
 template [[host_name("kernel_mul_mm_iq1_s_f32")]]   kernel mat_mm_t kernel_mul_mm<block_iq1_s,   QK_NL, dequantize_iq1_s>;
 template [[host_name("kernel_mul_mm_iq1_m_f32")]]   kernel mat_mm_t kernel_mul_mm<block_iq1_m,   QK_NL, dequantize_iq1_m>;
 template [[host_name("kernel_mul_mm_iq4_nl_f32")]]  kernel mat_mm_t kernel_mul_mm<block_iq4_nl,  2,     dequantize_iq4_nl>;
-#if QK_K == 64
-template [[host_name("kernel_mul_mm_iq4_xs_f32")]]  kernel mat_mm_t kernel_mul_mm<block_iq4_nl,  2,     dequantize_iq4_xs>;
-#else
 template [[host_name("kernel_mul_mm_iq4_xs_f32")]]  kernel mat_mm_t kernel_mul_mm<block_iq4_xs,  QK_NL, dequantize_iq4_xs>;
-#endif

 //
 // indirect matrix-matrix multiplication
@@ -6641,11 +6261,7 @@ template [[host_name("kernel_mul_mm_id_iq2_s_f32")]]   kernel mat_mm_id_t kernel
 template [[host_name("kernel_mul_mm_id_iq1_s_f32")]]   kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_s,   QK_NL, dequantize_iq1_s>;
 template [[host_name("kernel_mul_mm_id_iq1_m_f32")]]   kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_m,   QK_NL, dequantize_iq1_m>;
 template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]]  kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_nl,  2,     dequantize_iq4_nl>;
-#if QK_K == 64
-template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]]  kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_xs,  2,     dequantize_iq4_xs>;
-#else
 template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]]  kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_xs,  QK_NL, dequantize_iq4_xs>;
-#endif

 //
 // matrix-vector multiplication
@@ -6854,7 +6470,5 @@ template [[host_name("kernel_mul_mv_id_iq3_xxs_f32")]] kernel kernel_mul_mv_id_t
 template [[host_name("kernel_mul_mv_id_iq3_s_f32")]]   kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq3_s_f32_impl>>;
 template [[host_name("kernel_mul_mv_id_iq2_s_f32")]]   kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq2_s_f32_impl>>;
 template [[host_name("kernel_mul_mv_id_iq4_nl_f32")]]  kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_nl_f32_impl>>;
-#if QK_K != 64
 template [[host_name("kernel_mul_mv_id_iq4_xs_f32")]]  kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_xs_f32_impl>>;
-#endif
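After this change the template tables always instantiate iq4_xs with QK_NL = 16. Reading the kernels, nl appears to be the number of 4x4 tiles one dequantize call covers per super-block — an assumption on our part, not stated in the diff — which is consistent with 16 tiles of 16 values covering one QK_K = 256 super-block. As a compile-time sanity check under that assumption:

// Assumed relationship between the constants used by the Metal templates.
constexpr int QK_K_FIXED  = 256; // the only supported super-block size now
constexpr int QK_NL_TILES = 16;  // template tile count (assumed meaning)
static_assert(QK_NL_TILES * 16 == QK_K_FIXED, "16-value tiles must cover one super-block");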
ggml-opencl.cpp
CHANGED
@@ -1,4 +1,4 @@
-
+#include "ggml.h"
 #include "ggml-opencl.h"
 #include "ggml-backend-impl.h"

ggml-quants.c
CHANGED
The diff for this file is too large to render. See the raw diff.
ggml-sycl.cpp
CHANGED
@@ -4197,7 +4197,6 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri
     const block_q2_K * x = (const block_q2_K *) vx;

     const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
     const int n   = tid/32;
     const int l   = tid - 32*n;
     const int is  = 8*n + l/16;
@@ -4211,18 +4210,6 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri
     y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
     y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
     y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
-#else
-    const int is = tid/16;  // 0 or 1
-    const int il = tid%16;  // 0...15
-    const uint8_t q = x[i].qs[il] >> (2*is);
-    dst_t * y = yy + i*QK_K + 16*is + il;
-
-    float dall = x[i].dm[0];
-    float dmin = x[i].dm[1];
-    y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
-    y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
-#endif
-
 }

 template<typename dst_t>
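The kept SYCL q2_K path uses 64 work-items per block, each expanding four values; the tid decomposition above determines which half of the 256-value super-block and which scale byte a thread touches. A host-side sketch of that index math (standalone program, mirrors the kernel lines verbatim):

#include <cstdio>

int main() {
    for (int tid = 0; tid < 64; tid += 16) {
        const int n  = tid/32;       // first or second 128-value half
        const int l  = tid - 32*n;   // 0...31 within the half
        const int is = 8*n + l/16;   // index into x[i].scales
        printf("tid=%2d -> n=%d l=%2d is=%d\n", tid, n, l, is);
    }
    return 0;
}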
@@ -4232,7 +4219,6 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri
     const int i = item_ct1.get_group(2);
     const block_q3_K * x = (const block_q3_K *) vx;

-#if QK_K == 256
     const int r = item_ct1.get_local_id(2) / 4;
     const int tid = r/2;
     const int is0 = r%2;
@@ -4256,31 +4242,8 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri
     const uint8_t * hm = x[i].hmask;

     for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
-#else
-    const int tid = item_ct1.get_local_id(2);
-    const int is  = tid/16;  // 0 or 1
-    const int il  = tid%16;  // 0...15
-    const int im  = il/8;    // 0...1
-    const int in  = il%8;    // 0...7
-
-    dst_t * y = yy + i*QK_K + 16*is + il;
-
-    const uint8_t q = x[i].qs[il] >> (2*is);
-    const uint8_t h = x[i].hmask[in] >> (2*is + im);
-    const float   d = (float)x[i].d;
-
-    if (is == 0) {
-        y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
-        y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
-    } else {
-        y[ 0] = d * ((x[i].scales[0] >>  4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
-        y[32] = d * ((x[i].scales[1] >>  4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
-    }
-#endif
-
 }

-#if QK_K == 256
 static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
     if (j < 4) {
         d = q[j] & 63; m = q[j + 4] & 63;
@@ -4289,7 +4252,6 @@ static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8
         m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
     }
 }
-#endif

 template<typename dst_t>
 static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
@@ -4298,7 +4260,6 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri

     const int i = item_ct1.get_group(2);

-#if QK_K == 256
     // assume 32 threads
     const int tid = item_ct1.get_local_id(2);
     const int il  = tid/8;
@@ -4322,15 +4283,6 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
         y[l + 0] = d1 * (q[l] & 0xF) - m1;
         y[l +32] = d2 * (q[l] >>  4) - m2;
     }
-#else
-    const int tid = item_ct1.get_local_id(2);
-    const uint8_t * q = x[i].qs;
-    dst_t * y = yy + i*QK_K;
-    const float d = (float)x[i].dm[0];
-    const float m = (float)x[i].dm[1];
-    y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
-    y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >>  4) - m * (x[i].scales[1] >> 4);
-#endif
 }

 template<typename dst_t>
@@ -4340,7 +4292,6 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri

     const int i = item_ct1.get_group(2);

-#if QK_K == 256
     // assume 64 threads - this is very slightly better than the one below
     const int tid = item_ct1.get_local_id(2);
     const int il  = tid/16;  // il is in 0...3
@@ -4367,18 +4318,6 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri
     hm <<= 1;
     y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
|
| 4369 |
y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
|
| 4370 |
-
#else
|
| 4371 |
-
const int tid = item_ct1.get_local_id(2);
|
| 4372 |
-
const uint8_t q = x[i].qs[tid];
|
| 4373 |
-
const int im = tid/8; // 0...3
|
| 4374 |
-
const int in = tid%8; // 0...7
|
| 4375 |
-
const int is = tid/16; // 0 or 1
|
| 4376 |
-
const uint8_t h = x[i].qh[in] >> im;
|
| 4377 |
-
const float d = x[i].d;
|
| 4378 |
-
dst_t * y = yy + i*QK_K + tid;
|
| 4379 |
-
y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
|
| 4380 |
-
y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16));
|
| 4381 |
-
#endif
|
| 4382 |
}
|
| 4383 |
|
| 4384 |
template<typename dst_t>
|
|
@@ -4387,7 +4326,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
|
|
| 4387 |
const block_q6_K * x = (const block_q6_K *) vx;
|
| 4388 |
|
| 4389 |
const int i = item_ct1.get_group(2);
|
| 4390 |
-
#if QK_K == 256
|
| 4391 |
|
| 4392 |
// assume 64 threads - this is very slightly better than the one below
|
| 4393 |
const int tid = item_ct1.get_local_id(2);
|
|
@@ -4407,24 +4345,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
|
|
| 4407 |
y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
|
| 4408 |
y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32);
|
| 4409 |
y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
|
| 4410 |
-
#else
|
| 4411 |
-
|
| 4412 |
-
// assume 32 threads
|
| 4413 |
-
const int tid = item_ct1.get_local_id(2);
|
| 4414 |
-
const int ip = tid/16; // 0 or 1
|
| 4415 |
-
const int il = tid - 16*ip; // 0...15
|
| 4416 |
-
|
| 4417 |
-
dst_t * y = yy + i*QK_K + 16*ip + il;
|
| 4418 |
-
|
| 4419 |
-
const float d = x[i].d;
|
| 4420 |
-
|
| 4421 |
-
const uint8_t ql = x[i].ql[16*ip + il];
|
| 4422 |
-
const uint8_t qh = x[i].qh[il] >> (2*ip);
|
| 4423 |
-
const int8_t * sc = x[i].scales;
|
| 4424 |
-
|
| 4425 |
-
y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
|
| 4426 |
-
y[32] = d * sc[ip+2] * ((int8_t)((ql >> 4) | (((qh >> 4) & 3) << 4)) - 32);
|
| 4427 |
-
#endif
|
| 4428 |
}
|
| 4429 |
|
| 4430 |
template<typename dst_t>
|
|
@@ -4438,7 +4358,6 @@ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __res
|
|
| 4438 |
const block_iq2_xxs * x = (const block_iq2_xxs *) vx;
|
| 4439 |
|
| 4440 |
const int tid = item_ct1.get_local_id(2);
|
| 4441 |
-
#if QK_K == 256
|
| 4442 |
const int il = tid/8; // 0...3
|
| 4443 |
const int ib = tid%8; // 0...7
|
| 4444 |
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
|
@@ -4449,10 +4368,6 @@ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __res
|
|
| 4449 |
const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
|
| 4450 |
const uint8_t signs = ksigns_iq2xs_ptr[(aux32 >> 7*il) & 127];
|
| 4451 |
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs_ptr[j] ? -1.f : 1.f);
|
| 4452 |
-
#else
|
| 4453 |
-
assert(false);
|
| 4454 |
-
#endif
|
| 4455 |
-
|
| 4456 |
}
|
| 4457 |
|
| 4458 |
template<typename dst_t>
|
|
@@ -4466,7 +4381,6 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest
|
|
| 4466 |
const block_iq2_xs * x = (const block_iq2_xs *) vx;
|
| 4467 |
|
| 4468 |
const int tid = item_ct1.get_local_id(2);
|
| 4469 |
-
#if QK_K == 256
|
| 4470 |
const int il = tid/8; // 0...3
|
| 4471 |
const int ib = tid%8; // 0...7
|
| 4472 |
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
|
@@ -4475,10 +4389,6 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest
|
|
| 4475 |
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
| 4476 |
const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
|
| 4477 |
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
| 4478 |
-
#else
|
| 4479 |
-
assert(false);
|
| 4480 |
-
#endif
|
| 4481 |
-
|
| 4482 |
}
|
| 4483 |
|
| 4484 |
template <typename dst_t>
|
|
@@ -4490,7 +4400,6 @@ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
| 4490 |
const block_iq2_s * x = (const block_iq2_s *) vx;
|
| 4491 |
|
| 4492 |
const int tid = item_ct1.get_local_id(2);
|
| 4493 |
-
#if QK_K == 256
|
| 4494 |
const int il = tid/8; // 0...3
|
| 4495 |
const int ib = tid%8; // 0...7
|
| 4496 |
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
|
@@ -4498,13 +4407,9 @@ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
| 4498 |
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
| 4499 |
const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
|
| 4500 |
#pragma unroll
|
| 4501 |
-
for (int j = 0; j < 8; ++j)
|
| 4502 |
y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
| 4503 |
-
#else
|
| 4504 |
-
assert(false);
|
| 4505 |
-
|
| 4506 |
-
#endif
|
| 4507 |
-
|
| 4508 |
}
|
| 4509 |
|
| 4510 |
template<typename dst_t>
|
|
@@ -4518,7 +4423,6 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
|
|
| 4518 |
const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
|
| 4519 |
|
| 4520 |
const int tid = item_ct1.get_local_id(2);
|
| 4521 |
-
#if QK_K == 256
|
| 4522 |
const int il = tid/8; // 0...3
|
| 4523 |
const int ib = tid%8; // 0...7
|
| 4524 |
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
|
@@ -4533,10 +4437,6 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
|
|
| 4533 |
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
| 4534 |
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
| 4535 |
}
|
| 4536 |
-
#else
|
| 4537 |
-
assert(false);
|
| 4538 |
-
#endif
|
| 4539 |
-
|
| 4540 |
}
|
| 4541 |
|
| 4542 |
template <typename dst_t>
|
|
@@ -4549,7 +4449,6 @@ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
| 4549 |
const block_iq3_s * x = (const block_iq3_s *) vx;
|
| 4550 |
|
| 4551 |
const int tid = item_ct1.get_local_id(2);
|
| 4552 |
-
#if QK_K == 256
|
| 4553 |
const int il = tid/8; // 0...3
|
| 4554 |
const int ib = tid%8; // 0...7
|
| 4555 |
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
|
@@ -4563,10 +4462,6 @@ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
| 4563 |
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
| 4564 |
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
| 4565 |
}
|
| 4566 |
-
#else
|
| 4567 |
-
assert(false);
|
| 4568 |
-
#endif
|
| 4569 |
-
|
| 4570 |
}
|
| 4571 |
|
| 4572 |
template <typename dst_t>
|
|
@@ -4579,7 +4474,6 @@ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
| 4579 |
const block_iq1_s * x = (const block_iq1_s *) vx;
|
| 4580 |
|
| 4581 |
const int tid = item_ct1.get_local_id(2);
|
| 4582 |
-
#if QK_K == 256
|
| 4583 |
const int il = tid/8; // 0...3
|
| 4584 |
const int ib = tid%8; // 0...7
|
| 4585 |
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
|
@@ -4593,10 +4487,6 @@ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
| 4593 |
for (int j = 0; j < 8; ++j) {
|
| 4594 |
y[j] = d * (q[j] + delta);
|
| 4595 |
}
|
| 4596 |
-
#else
|
| 4597 |
-
assert(false);
|
| 4598 |
-
#endif
|
| 4599 |
-
|
| 4600 |
}
|
| 4601 |
|
| 4602 |
template <typename dst_t>
|
|
@@ -4609,7 +4499,6 @@ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
| 4609 |
const block_iq1_m * x = (const block_iq1_m *) vx;
|
| 4610 |
|
| 4611 |
const int tid = item_ct1.get_local_id(2);
|
| 4612 |
-
#if QK_K == 256
|
| 4613 |
const int il = tid/8; // 0...3
|
| 4614 |
const int ib = tid%8; // 0...7
|
| 4615 |
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
|
@@ -4627,10 +4516,6 @@ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
| 4627 |
for (int j = 0; j < 8; ++j) {
|
| 4628 |
y[j] = d * (q[j] + delta);
|
| 4629 |
}
|
| 4630 |
-
#else
|
| 4631 |
-
assert(false);
|
| 4632 |
-
#endif
|
| 4633 |
-
|
| 4634 |
}
|
| 4635 |
|
| 4636 |
template <typename dst_t>
|
|
@@ -4704,7 +4589,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
|
|
| 4704 |
|
| 4705 |
float tmp = 0; // partial sum for thread in warp
|
| 4706 |
|
| 4707 |
-
#if QK_K == 256
|
| 4708 |
const int tid =
|
| 4709 |
item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...15
|
| 4710 |
const int ix =
|
|
@@ -4755,42 +4639,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
|
|
| 4755 |
tmp += dall * sum1 - dmin * sum2;
|
| 4756 |
|
| 4757 |
}
|
| 4758 |
-
#else
|
| 4759 |
-
const int tid = item_ct1.get_local_id(2) /
|
| 4760 |
-
(2 * K_QUANTS_PER_ITERATION); // 0...15 or 0...7
|
| 4761 |
-
const int ix = item_ct1.get_local_id(2) %
|
| 4762 |
-
(2 * K_QUANTS_PER_ITERATION); // 0....1 or 0...3
|
| 4763 |
-
const int offset = tid * K_QUANTS_PER_ITERATION;
|
| 4764 |
-
|
| 4765 |
-
uint32_t uaux[2];
|
| 4766 |
-
const uint8_t * d = (const uint8_t *)uaux;
|
| 4767 |
-
|
| 4768 |
-
|
| 4769 |
-
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
| 4770 |
-
|
| 4771 |
-
const float * y = yy + i * QK_K + offset;
|
| 4772 |
-
const uint8_t * q = x[i].qs + offset;
|
| 4773 |
-
const uint32_t * s = (const uint32_t *)x[i].scales;
|
| 4774 |
-
|
| 4775 |
-
uaux[0] = s[0] & 0x0f0f0f0f;
|
| 4776 |
-
uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
|
| 4777 |
-
|
| 4778 |
-
const sycl::float2 dall =
|
| 4779 |
-
x[i].dm.convert<float, sycl::rounding_mode::automatic>();
|
| 4780 |
-
|
| 4781 |
-
float sum1 = 0, sum2 = 0;
|
| 4782 |
-
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
| 4783 |
-
const uint8_t ql = q[l];
|
| 4784 |
-
sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
|
| 4785 |
-
+ y[l+16] * d[1] * ((ql >> 2) & 3)
|
| 4786 |
-
+ y[l+32] * d[2] * ((ql >> 4) & 3)
|
| 4787 |
-
+ y[l+48] * d[3] * ((ql >> 6) & 3);
|
| 4788 |
-
sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
|
| 4789 |
-
}
|
| 4790 |
-
tmp += dall.x() * sum1 - dall.y() * sum2;
|
| 4791 |
-
}
|
| 4792 |
-
|
| 4793 |
-
#endif
|
| 4794 |
|
| 4795 |
// sum up partial sums and write back result
|
| 4796 |
#pragma unroll
|
|
@@ -4828,8 +4676,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
|
|
| 4828 |
|
| 4829 |
float tmp = 0; // partial sum for thread in warp
|
| 4830 |
|
| 4831 |
-
#if QK_K == 256
|
| 4832 |
-
|
| 4833 |
const uint16_t kmask1 = 0x0303;
|
| 4834 |
const uint16_t kmask2 = 0x0f0f;
|
| 4835 |
|
|
@@ -4882,34 +4728,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
|
|
| 4882 |
tmp += d * sum;
|
| 4883 |
|
| 4884 |
}
|
| 4885 |
-
#else
|
| 4886 |
-
|
| 4887 |
-
const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7
|
| 4888 |
-
const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3
|
| 4889 |
-
const int offset = tid * K_QUANTS_PER_ITERATION; // 0...15 or 0...14
|
| 4890 |
-
const int in = offset/8; // 0 or 1
|
| 4891 |
-
const int im = offset%8; // 0...7
|
| 4892 |
-
|
| 4893 |
-
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
| 4894 |
-
|
| 4895 |
-
const float * y = yy + i * QK_K + offset;
|
| 4896 |
-
const uint8_t * q = x[i].qs + offset;
|
| 4897 |
-
const uint8_t * s = x[i].scales;
|
| 4898 |
-
|
| 4899 |
-
const float dall = (float)x[i].d;
|
| 4900 |
-
|
| 4901 |
-
float sum = 0;
|
| 4902 |
-
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
| 4903 |
-
const uint8_t hl = x[i].hmask[im+l] >> in;
|
| 4904 |
-
const uint8_t ql = q[l];
|
| 4905 |
-
sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
|
| 4906 |
-
+ y[l+16] * dall * ((s[0] >> 4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
|
| 4907 |
-
+ y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
|
| 4908 |
-
+ y[l+48] * dall * ((s[1] >> 4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
|
| 4909 |
-
}
|
| 4910 |
-
tmp += sum;
|
| 4911 |
-
}
|
| 4912 |
-
#endif
|
| 4913 |
|
| 4914 |
// sum up partial sums and write back result
|
| 4915 |
#pragma unroll
|
|
@@ -4944,7 +4762,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
|
|
| 4944 |
|
| 4945 |
const block_q4_K * x = (const block_q4_K *)vx + ib0;
|
| 4946 |
|
| 4947 |
-
#if QK_K == 256
|
| 4948 |
const uint16_t kmask1 = 0x3f3f;
|
| 4949 |
const uint16_t kmask2 = 0x0f0f;
|
| 4950 |
const uint16_t kmask3 = 0xc0c0;
|
|
@@ -5033,36 +4850,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
|
|
| 5033 |
#endif
|
| 5034 |
|
| 5035 |
}
|
| 5036 |
-
#else
|
| 5037 |
-
const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15
|
| 5038 |
-
const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
|
| 5039 |
-
|
| 5040 |
-
const int step = tid * K_QUANTS_PER_ITERATION;
|
| 5041 |
-
|
| 5042 |
-
uint16_t aux16[2];
|
| 5043 |
-
const uint8_t * s = (const uint8_t *)aux16;
|
| 5044 |
-
|
| 5045 |
-
float tmp = 0;
|
| 5046 |
-
|
| 5047 |
-
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
| 5048 |
-
const uint8_t * q = x[i].qs + step;
|
| 5049 |
-
const float * y = yy + i*QK_K + step;
|
| 5050 |
-
const uint16_t * a = (const uint16_t *)x[i].scales;
|
| 5051 |
-
aux16[0] = a[0] & 0x0f0f;
|
| 5052 |
-
aux16[1] = (a[0] >> 4) & 0x0f0f;
|
| 5053 |
-
const float d = (float)x[i].dm[0];
|
| 5054 |
-
const float m = (float)x[i].dm[1];
|
| 5055 |
-
float sum = 0.f;
|
| 5056 |
-
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
| 5057 |
-
sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
|
| 5058 |
-
+ y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
|
| 5059 |
-
+ y[j+32] * (d * s[1] * (q[j+ 0] >> 4) - m * s[3])
|
| 5060 |
-
+ y[j+48] * (d * s[1] * (q[j+16] >> 4) - m * s[3]);
|
| 5061 |
-
}
|
| 5062 |
-
tmp += sum;
|
| 5063 |
-
}
|
| 5064 |
-
|
| 5065 |
-
#endif
|
| 5066 |
|
| 5067 |
// sum up partial sums and write back result
|
| 5068 |
#pragma unroll
|
|
@@ -5097,7 +4884,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
|
|
| 5097 |
|
| 5098 |
float tmp = 0; // partial sum for thread in warp
|
| 5099 |
|
| 5100 |
-
#if QK_K == 256
|
| 5101 |
const uint16_t kmask1 = 0x3f3f;
|
| 5102 |
const uint16_t kmask2 = 0x0f0f;
|
| 5103 |
const uint16_t kmask3 = 0xc0c0;
|
|
@@ -5174,30 +4960,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
|
|
| 5174 |
dmin * smin;
|
| 5175 |
}
|
| 5176 |
|
| 5177 |
-
#else
|
| 5178 |
-
const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15
|
| 5179 |
-
const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
|
| 5180 |
-
const int step = tid * K_QUANTS_PER_ITERATION;
|
| 5181 |
-
const int im = step/8;
|
| 5182 |
-
const int in = step%8;
|
| 5183 |
-
|
| 5184 |
-
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
| 5185 |
-
const uint8_t * q = x[i].qs + step;
|
| 5186 |
-
const int8_t * s = x[i].scales;
|
| 5187 |
-
const float * y = yy + i*QK_K + step;
|
| 5188 |
-
const float d = x[i].d;
|
| 5189 |
-
float sum = 0.f;
|
| 5190 |
-
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
| 5191 |
-
const uint8_t h = x[i].qh[in+j] >> im;
|
| 5192 |
-
sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
|
| 5193 |
-
+ y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
|
| 5194 |
-
+ y[j+32] * d * s[2] * ((q[j+ 0] >> 4) - ((h >> 4) & 1 ? 0 : 16))
|
| 5195 |
-
+ y[j+48] * d * s[3] * ((q[j+16] >> 4) - ((h >> 6) & 1 ? 0 : 16));
|
| 5196 |
-
}
|
| 5197 |
-
tmp += sum;
|
| 5198 |
-
}
|
| 5199 |
-
#endif
|
| 5200 |
-
|
| 5201 |
// sum up partial sums and write back result
|
| 5202 |
#pragma unroll
|
| 5203 |
for (int mask = 16; mask > 0; mask >>= 1) {
|
|
@@ -5224,8 +4986,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa
|
|
| 5224 |
|
| 5225 |
const block_q6_K * x = (const block_q6_K *)vx + ib0;
|
| 5226 |
|
| 5227 |
-
#if QK_K == 256
|
| 5228 |
-
|
| 5229 |
const int tid =
|
| 5230 |
item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
| 5231 |
const int ix =
|
|
@@ -5282,37 +5042,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa
|
|
| 5282 |
|
| 5283 |
}
|
| 5284 |
|
| 5285 |
-
#else
|
| 5286 |
-
|
| 5287 |
-
const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...7
|
| 5288 |
-
const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0...3
|
| 5289 |
-
|
| 5290 |
-
const int step = tid * K_QUANTS_PER_ITERATION;
|
| 5291 |
-
|
| 5292 |
-
float tmp = 0; // partial sum for thread in warp
|
| 5293 |
-
|
| 5294 |
-
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
| 5295 |
-
|
| 5296 |
-
const float * y = yy + i * QK_K + step;
|
| 5297 |
-
const uint8_t * ql = x[i].ql + step;
|
| 5298 |
-
const uint8_t * qh = x[i].qh + step;
|
| 5299 |
-
const int8_t * s = x[i].scales;
|
| 5300 |
-
|
| 5301 |
-
const float d = x[i+0].d;
|
| 5302 |
-
|
| 5303 |
-
float sum = 0;
|
| 5304 |
-
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
| 5305 |
-
sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
|
| 5306 |
-
+ y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
|
| 5307 |
-
+ y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >> 4) | ((qh[j] & 0x30) >> 0)) - 32)
|
| 5308 |
-
+ y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >> 4) | ((qh[j] & 0xc0) >> 2)) - 32);
|
| 5309 |
-
}
|
| 5310 |
-
tmp += sum;
|
| 5311 |
-
|
| 5312 |
-
}
|
| 5313 |
-
|
| 5314 |
-
#endif
|
| 5315 |
-
|
| 5316 |
// sum up partial sums and write back result
|
| 5317 |
#pragma unroll
|
| 5318 |
for (int mask = 16; mask > 0; mask >>= 1) {
|
|
@@ -6857,7 +6586,6 @@ static __dpct_inline__ float
|
|
| 6857 |
vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
|
| 6858 |
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
| 6859 |
|
| 6860 |
-
#ifndef GGML_QKK_64
|
| 6861 |
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
|
| 6862 |
|
| 6863 |
int v[2];
|
|
@@ -6899,52 +6627,6 @@ vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
|
|
| 6899 |
}
|
| 6900 |
|
| 6901 |
return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
|
| 6902 |
-
|
| 6903 |
-
#else
|
| 6904 |
-
|
| 6905 |
-
#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
|
| 6906 |
-
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
|
| 6907 |
-
|
| 6908 |
-
float sumf_d = 0.0f;
|
| 6909 |
-
float sumf_m = 0.0f;
|
| 6910 |
-
|
| 6911 |
-
uint16_t aux16[2];
|
| 6912 |
-
const uint8_t * s = (const uint8_t *)aux16;
|
| 6913 |
-
|
| 6914 |
-
const uint16_t * a = (const uint16_t *)bq4_K->scales;
|
| 6915 |
-
aux16[0] = a[0] & 0x0f0f;
|
| 6916 |
-
aux16[1] = (a[0] >> 4) & 0x0f0f;
|
| 6917 |
-
|
| 6918 |
-
const float dall = bq4_K->dm[0];
|
| 6919 |
-
const float dmin = bq4_K->dm[1];
|
| 6920 |
-
|
| 6921 |
-
const float d8_1 = bq8_1[0].ds[0];
|
| 6922 |
-
const float d8_2 = bq8_1[1].ds[1];
|
| 6923 |
-
|
| 6924 |
-
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
|
| 6925 |
-
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
|
| 6926 |
-
const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
|
| 6927 |
-
const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
|
| 6928 |
-
|
| 6929 |
-
const int * q4 = (const int *)bq4_K->qs + (iqs/2);
|
| 6930 |
-
const int v1 = q4[0];
|
| 6931 |
-
const int v2 = q4[4];
|
| 6932 |
-
|
| 6933 |
-
const int dot1 = dpct::dp4a(ui2, v2 & 0x0f0f0f0f, dpct::dp4a(ui1, v1 & 0x0f0f0f0f, 0));
|
| 6934 |
-
const int dot2 = dpct::dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, dpct::dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
|
| 6935 |
-
const int dot3 = dpct::dp4a(0x01010101, ui2, dpct::dp4a(0x01010101, ui1, 0));
|
| 6936 |
-
const int dot4 = dpct::dp4a(0x01010101, ui4, dpct::dp4a(0x01010101, ui3, 0));
|
| 6937 |
-
|
| 6938 |
-
sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
|
| 6939 |
-
sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
|
| 6940 |
-
|
| 6941 |
-
return dall * sumf_d - dmin * sumf_m;
|
| 6942 |
-
|
| 6943 |
-
#else
|
| 6944 |
-
bad_arch();
|
| 6945 |
-
#endif // __SYCL_ARCH__ >= VER_4VEC
|
| 6946 |
-
|
| 6947 |
-
#endif
|
| 6948 |
}
|
| 6949 |
|
| 6950 |
template <int mmq_y>
|
|
@@ -7003,11 +6685,7 @@ load_tiles_q4_K(const void *__restrict__ vx, int *__restrict__ x_ql,
|
|
| 7003 |
|
| 7004 |
const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
| 7005 |
|
| 7006 |
-
#if QK_K == 256
|
| 7007 |
x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
|
| 7008 |
-
#else
|
| 7009 |
-
x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
|
| 7010 |
-
#endif
|
| 7011 |
}
|
| 7012 |
|
| 7013 |
#pragma unroll
|
|
@@ -7050,7 +6728,6 @@ static __dpct_inline__ float
|
|
| 7050 |
vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
|
| 7051 |
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
| 7052 |
|
| 7053 |
-
#ifndef GGML_QKK_64
|
| 7054 |
const block_q5_K * bq5_K = (const block_q5_K *) vbq;
|
| 7055 |
|
| 7056 |
int vl[2];
|
|
@@ -7092,48 +6769,6 @@ vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
|
|
| 7092 |
}
|
| 7093 |
|
| 7094 |
return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
|
| 7095 |
-
|
| 7096 |
-
#else
|
| 7097 |
-
|
| 7098 |
-
#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
|
| 7099 |
-
const block_q5_K * bq5_K = (const block_q5_K *) vbq;
|
| 7100 |
-
|
| 7101 |
-
const int8_t * s = bq5_K->scales;
|
| 7102 |
-
|
| 7103 |
-
const float d = bq5_K->d;
|
| 7104 |
-
|
| 7105 |
-
const float d8_1 = bq8_1[0].ds[0];
|
| 7106 |
-
const float d8_2 = bq8_1[1].ds[1];
|
| 7107 |
-
|
| 7108 |
-
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
|
| 7109 |
-
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
|
| 7110 |
-
const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
|
| 7111 |
-
const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
|
| 7112 |
-
|
| 7113 |
-
const int * ql = (const int *)bq5_K->qs + (iqs/2);
|
| 7114 |
-
const int vl1 = ql[0];
|
| 7115 |
-
const int vl2 = ql[4];
|
| 7116 |
-
|
| 7117 |
-
const int step = 4 * (iqs/2); // 0, 4, 8, 12
|
| 7118 |
-
const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
|
| 7119 |
-
const int in = step%8; // 0, 4, 0, 4
|
| 7120 |
-
const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
|
| 7121 |
-
|
| 7122 |
-
const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
|
| 7123 |
-
const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
|
| 7124 |
-
const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
|
| 7125 |
-
const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
|
| 7126 |
-
|
| 7127 |
-
const float sumf_d = d8_1 * (dpct::dp4a(ui1, v1, 0) * s[0] + dpct::dp4a(ui2, v2, 0) * s[1])
|
| 7128 |
-
+ d8_2 * (dpct::dp4a(ui3, v3, 0) * s[2] + dpct::dp4a(ui4, v4, 0) * s[3]);
|
| 7129 |
-
|
| 7130 |
-
return d * sumf_d;
|
| 7131 |
-
|
| 7132 |
-
#else
|
| 7133 |
-
bad_arch();
|
| 7134 |
-
#endif // __SYCL_ARCH__ >= VER_4VEC
|
| 7135 |
-
|
| 7136 |
-
#endif
|
| 7137 |
}
|
| 7138 |
|
| 7139 |
template <int mmq_y>
|
|
@@ -7205,9 +6840,7 @@ load_tiles_q5_K(const void *__restrict__ vx, int *__restrict__ x_ql,
|
|
| 7205 |
|
| 7206 |
const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
| 7207 |
|
| 7208 |
-
#if QK_K == 256
|
| 7209 |
x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
|
| 7210 |
-
#endif
|
| 7211 |
}
|
| 7212 |
|
| 7213 |
#pragma unroll
|
|
@@ -7387,7 +7020,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
|
|
| 7387 |
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
|
| 7388 |
const uint64_t *iq2xxs_grid, const uint8_t *ksigns_iq2xs,
|
| 7389 |
const uint8_t *kmask_iq2xs) {
|
| 7390 |
-
#if QK_K == 256
|
| 7391 |
const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;
|
| 7392 |
|
| 7393 |
#if QR2_XXS == 8
|
|
@@ -7428,10 +7060,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
|
|
| 7428 |
}
|
| 7429 |
return d * (sumi1 + sumi2);
|
| 7430 |
#endif
|
| 7431 |
-
#else
|
| 7432 |
-
assert(false);
|
| 7433 |
-
return 0.f;
|
| 7434 |
-
#endif
|
| 7435 |
}
|
| 7436 |
|
| 7437 |
static __dpct_inline__ float
|
|
@@ -7440,7 +7068,6 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
|
|
| 7440 |
const uint64_t *iq2xs_grid, const uint64_t *ksigns64) {
|
| 7441 |
#if DPCT_COMPATIBILITY_TEMP >= \
|
| 7442 |
MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
| 7443 |
-
#if QK_K == 256
|
| 7444 |
const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;
|
| 7445 |
|
| 7446 |
const int ib32 = iqs;
|
|
@@ -7478,16 +7105,11 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
|
|
| 7478 |
assert(false);
|
| 7479 |
return 0.f;
|
| 7480 |
#endif
|
| 7481 |
-
#else
|
| 7482 |
-
assert(false);
|
| 7483 |
-
return 0.f;
|
| 7484 |
-
#endif
|
| 7485 |
}
|
| 7486 |
|
| 7487 |
static __dpct_inline__ float
|
| 7488 |
vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
|
| 7489 |
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
| 7490 |
-
#if QK_K == 256
|
| 7491 |
const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
|
| 7492 |
|
| 7493 |
const int ib32 = iqs;
|
|
@@ -7531,9 +7153,6 @@ vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
|
|
| 7531 |
}
|
| 7532 |
const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f;
|
| 7533 |
return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
|
| 7534 |
-
#else
|
| 7535 |
-
assert(false);
|
| 7536 |
-
#endif
|
| 7537 |
}
|
| 7538 |
|
| 7539 |
static __dpct_inline__ float
|
|
@@ -7542,7 +7161,6 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
|
|
| 7542 |
const uint32_t *iq3xxs_grid, const uint64_t *ksigns64) {
|
| 7543 |
#if DPCT_COMPATIBILITY_TEMP >= \
|
| 7544 |
MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
| 7545 |
-
#if QK_K == 256
|
| 7546 |
const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;
|
| 7547 |
|
| 7548 |
const int ib32 = iqs;
|
|
@@ -7570,17 +7188,12 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
|
|
| 7570 |
assert(false);
|
| 7571 |
return 0.f;
|
| 7572 |
#endif
|
| 7573 |
-
#else
|
| 7574 |
-
assert(false);
|
| 7575 |
-
return 0.f;
|
| 7576 |
-
#endif
|
| 7577 |
}
|
| 7578 |
|
| 7579 |
static __dpct_inline__ float
|
| 7580 |
vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
|
| 7581 |
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
|
| 7582 |
const uint32_t *iq3s_grid) {
|
| 7583 |
-
#if QK_K == 256
|
| 7584 |
const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
|
| 7585 |
|
| 7586 |
const int ib32 = iqs;
|
|
@@ -7609,16 +7222,12 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
|
|
| 7609 |
(1 + 2 * ((bq2->scales[ib32 / 2] >> 4 * (ib32 % 2)) & 0xf)) *
|
| 7610 |
bq8_1[ib32].ds[0];
|
| 7611 |
return d * sumi;
|
| 7612 |
-
#else
|
| 7613 |
-
assert(false);
|
| 7614 |
-
#endif
|
| 7615 |
}
|
| 7616 |
|
| 7617 |
static __dpct_inline__ float
|
| 7618 |
vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
|
| 7619 |
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
|
| 7620 |
const uint32_t *iq1s_grid_gpu) {
|
| 7621 |
-
#if QK_K == 256
|
| 7622 |
const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
|
| 7623 |
|
| 7624 |
const int ib32 = iqs;
|
|
@@ -7637,15 +7246,11 @@ vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
|
|
| 7637 |
const float d = d1q * bq8_1[ib32].ds[0];
|
| 7638 |
const float m = d1q * bq8_1[ib32].ds[1];
|
| 7639 |
return d * sumi + m * delta;
|
| 7640 |
-
#else
|
| 7641 |
-
assert(false);
|
| 7642 |
-
#endif
|
| 7643 |
}
|
| 7644 |
|
| 7645 |
static __dpct_inline__ float
|
| 7646 |
vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
|
| 7647 |
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
| 7648 |
-
#if QK_K == 256
|
| 7649 |
const block_iq1_m * bq1 = (const block_iq1_m *) vbq;
|
| 7650 |
|
| 7651 |
const int ib32 = iqs;
|
|
@@ -7670,9 +7275,6 @@ vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
|
|
| 7670 |
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
| 7671 |
const float d = (float)scale.f16 * bq8_1[ib32].ds[0];
|
| 7672 |
return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1));
|
| 7673 |
-
#else
|
| 7674 |
-
assert(false);
|
| 7675 |
-
#endif
|
| 7676 |
}
|
| 7677 |
|
| 7678 |
static __dpct_inline__ void get_int_from_table_16(const uint32_t &q4,
|
|
@@ -7720,7 +7322,6 @@ static __dpct_inline__ float
|
|
| 7720 |
vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
|
| 7721 |
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
| 7722 |
|
| 7723 |
-
#if QK_K == 256
|
| 7724 |
const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
|
| 7725 |
const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
|
| 7726 |
|
|
@@ -7738,9 +7339,6 @@ vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
|
|
| 7738 |
sumi2 = dpct::dp4a(v2, q8[j + 4], sumi2);
|
| 7739 |
}
|
| 7740 |
return d * (sumi1 + sumi2);
|
| 7741 |
-
#else
|
| 7742 |
-
assert(false);
|
| 7743 |
-
#endif
|
| 7744 |
}
|
| 7745 |
|
| 7746 |
template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x,
|
|
@@ -10203,7 +9801,6 @@ template <typename dst_t>
|
|
| 10203 |
static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
|
| 10204 |
dpct::queue_ptr stream) {
|
| 10205 |
const int nb = k / QK_K;
|
| 10206 |
-
#if QK_K == 256
|
| 10207 |
{
|
| 10208 |
dpct::has_capability_or_fail(stream->get_device(),
|
| 10209 |
{sycl::aspect::fp16});
|
|
@@ -10215,27 +9812,12 @@ static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
|
|
| 10215 |
dequantize_block_q2_K(vx, y, item_ct1);
|
| 10216 |
});
|
| 10217 |
}
|
| 10218 |
-
#else
|
| 10219 |
-
{
|
| 10220 |
-
dpct::has_capability_or_fail(stream->get_device(),
|
| 10221 |
-
{sycl::aspect::fp16});
|
| 10222 |
-
|
| 10223 |
-
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
| 10224 |
-
sycl::range<3>(1, 1, 32),
|
| 10225 |
-
sycl::range<3>(1, 1, 32)),
|
| 10226 |
-
[=](sycl::nd_item<3> item_ct1) {
|
| 10227 |
-
dequantize_block_q2_K(vx, y, item_ct1);
|
| 10228 |
-
});
|
| 10229 |
-
}
|
| 10230 |
-
|
| 10231 |
-
#endif
|
| 10232 |
}
|
| 10233 |
|
| 10234 |
template <typename dst_t>
|
| 10235 |
static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
|
| 10236 |
dpct::queue_ptr stream) {
|
| 10237 |
const int nb = k / QK_K;
|
| 10238 |
-
#if QK_K == 256
|
| 10239 |
{
|
| 10240 |
dpct::has_capability_or_fail(stream->get_device(),
|
| 10241 |
{sycl::aspect::fp16});
|
|
@@ -10247,19 +9829,6 @@ static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
|
|
| 10247 |
dequantize_block_q3_K(vx, y, item_ct1);
|
| 10248 |
});
|
| 10249 |
}
|
| 10250 |
-
#else
|
| 10251 |
-
{
|
| 10252 |
-
dpct::has_capability_or_fail(stream->get_device(),
|
| 10253 |
-
{sycl::aspect::fp16});
|
| 10254 |
-
|
| 10255 |
-
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
| 10256 |
-
sycl::range<3>(1, 1, 32),
|
| 10257 |
-
sycl::range<3>(1, 1, 32)),
|
| 10258 |
-
[=](sycl::nd_item<3> item_ct1) {
|
| 10259 |
-
dequantize_block_q3_K(vx, y, item_ct1);
|
| 10260 |
-
});
|
| 10261 |
-
}
|
| 10262 |
-
#endif
|
| 10263 |
}
|
| 10264 |
|
| 10265 |
template <typename dst_t>
|
|
@@ -10320,7 +9889,6 @@ template <typename dst_t>
|
|
| 10320 |
static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
|
| 10321 |
dpct::queue_ptr stream) {
|
| 10322 |
const int nb = k / QK_K;
|
| 10323 |
-
#if QK_K == 256
|
| 10324 |
{
|
| 10325 |
dpct::has_capability_or_fail(stream->get_device(),
|
| 10326 |
{sycl::aspect::fp16});
|
|
@@ -10332,27 +9900,12 @@ static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
|
|
| 10332 |
dequantize_block_q5_K(vx, y, item_ct1);
|
| 10333 |
});
|
| 10334 |
}
|
| 10335 |
-
#else
|
| 10336 |
-
{
|
| 10337 |
-
dpct::has_capability_or_fail(stream->get_device(),
|
| 10338 |
-
{sycl::aspect::fp16});
|
| 10339 |
-
|
| 10340 |
-
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
| 10341 |
-
sycl::range<3>(1, 1, 32),
|
| 10342 |
-
sycl::range<3>(1, 1, 32)),
|
| 10343 |
-
[=](sycl::nd_item<3> item_ct1) {
|
| 10344 |
-
dequantize_block_q5_K(vx, y, item_ct1);
|
| 10345 |
-
});
|
| 10346 |
-
}
|
| 10347 |
-
|
| 10348 |
-
#endif
|
| 10349 |
}
|
| 10350 |
|
| 10351 |
template <typename dst_t>
|
| 10352 |
static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
|
| 10353 |
dpct::queue_ptr stream) {
|
| 10354 |
const int nb = k / QK_K;
|
| 10355 |
-
#if QK_K == 256
|
| 10356 |
{
|
| 10357 |
dpct::has_capability_or_fail(stream->get_device(),
|
| 10358 |
{sycl::aspect::fp16});
|
|
@@ -10364,20 +9917,6 @@ static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
|
|
| 10364 |
dequantize_block_q6_K(vx, y, item_ct1);
|
| 10365 |
});
|
| 10366 |
}
|
| 10367 |
-
#else
|
| 10368 |
-
{
|
| 10369 |
-
dpct::has_capability_or_fail(stream->get_device(),
|
| 10370 |
-
{sycl::aspect::fp16});
|
| 10371 |
-
|
| 10372 |
-
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
| 10373 |
-
sycl::range<3>(1, 1, 32),
|
| 10374 |
-
sycl::range<3>(1, 1, 32)),
|
| 10375 |
-
[=](sycl::nd_item<3> item_ct1) {
|
| 10376 |
-
dequantize_block_q6_K(vx, y, item_ct1);
|
| 10377 |
-
});
|
| 10378 |
-
}
|
| 10379 |
-
|
| 10380 |
-
#endif
|
| 10381 |
}
|
| 10382 |
|
| 10383 |
template <typename dst_t>
|
|
@@ -10529,9 +10068,6 @@ template <typename dst_t>
|
|
| 10529 |
static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
|
| 10530 |
dpct::queue_ptr stream) {
|
| 10531 |
const int nb = (k + QK_K - 1) / QK_K;
|
| 10532 |
-
#if QK_K == 64
|
| 10533 |
-
dequantize_row_iq4_nl_sycl(vx, y, k, stream);
|
| 10534 |
-
#else
|
| 10535 |
{
|
| 10536 |
dpct::has_capability_or_fail(stream->get_device(),
|
| 10537 |
{sycl::aspect::fp16});
|
|
@@ -10546,7 +10082,6 @@ static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
|
|
| 10546 |
});
|
| 10547 |
});
|
| 10548 |
}
|
| 10549 |
-
#endif
|
| 10550 |
}
|
| 10551 |
|
| 10552 |
|
|
@@ -12051,8 +11586,6 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
|
|
| 12051 |
const int nrows_y, const int nrows_dst,
|
| 12052 |
dpct::queue_ptr stream) try {
|
| 12053 |
|
| 12054 |
-
#if QK_K == 256
|
| 12055 |
-
|
| 12056 |
int id;
|
| 12057 |
SYCL_CHECK(
|
| 12058 |
CHECK_TRY_ERROR(id = get_current_device_id()));
|
|
@@ -12167,7 +11700,6 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
|
|
| 12167 |
});
|
| 12168 |
}
|
| 12169 |
}
|
| 12170 |
-
#endif
|
| 12171 |
}
|
| 12172 |
catch (sycl::exception const &exc) {
|
| 12173 |
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
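Everything the SYCL hunks above keep is the 256-wide path, so the q2_K arithmetic that survives is worth reading once in isolation: a super-block holds 256 values as 16 sub-blocks of 16, each sub-block owning one byte of `scales[]` (scale in the low nibble, min in the high nibble), on top of the two super-block floats `dall` and `dmin`. A minimal host-side C++ sketch of the kept `dequantize_block_q2_K` path (the loop replays the kernel's 64 work-items; the standalone function and its parameter list are our own illustration, not code from the diff):

```cpp
#include <cstdint>

// Scalar dequantization of one q2_K super-block (QK_K == 256).
// scales[16]: 4-bit scale (low nibble) + 4-bit min (high nibble) per sub-block.
// qs[64]:     four 2-bit quants packed per byte.
// dall/dmin:  super-block scale and min, already converted to float.
static void dequantize_q2_K_superblock(const uint8_t scales[16], const uint8_t qs[64],
                                       float dall, float dmin, float y[256]) {
    for (int tid = 0; tid < 64; ++tid) {   // mirrors the kernel's 64 work-items
        const int n  = tid / 32;           // which 128-value half of the block
        const int l  = tid - 32 * n;       // 0...31 within that half
        const int is = 8 * n + l / 16;     // sub-block index feeding y[l + 0]
        const uint8_t q = qs[32 * n + l];  // one byte = four 2-bit quants
        float * yy = y + 128 * n;
        yy[l +  0] = dall * (scales[is + 0] & 0xF) * ((q >> 0) & 3) - dmin * (scales[is + 0] >> 4);
        yy[l + 32] = dall * (scales[is + 2] & 0xF) * ((q >> 2) & 3) - dmin * (scales[is + 2] >> 4);
        yy[l + 64] = dall * (scales[is + 4] & 0xF) * ((q >> 4) & 3) - dmin * (scales[is + 4] >> 4);
        yy[l + 96] = dall * (scales[is + 6] & 0xF) * ((q >> 6) & 3) - dmin * (scales[is + 6] >> 4);
    }
}
```

The split of labor is the same one visible in the first hunk of this file: `dall` scales each 2-bit quant through the per-sub-block scale nibble, and `dmin` times the min nibble is subtracted to restore the offset.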
|
|
|
| 4197 |
const block_q2_K * x = (const block_q2_K *) vx;
|
| 4198 |
|
| 4199 |
const int tid = item_ct1.get_local_id(2);
|
|
|
|
| 4200 |
const int n = tid/32;
|
| 4201 |
const int l = tid - 32*n;
|
| 4202 |
const int is = 8*n + l/16;
|
|
|
|
| 4210 |
y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
|
| 4211 |
y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
|
| 4212 |
y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4213 |
}
|
| 4214 |
|
| 4215 |
template<typename dst_t>
|
|
|
|
| 4219 |
const int i = item_ct1.get_group(2);
|
| 4220 |
const block_q3_K * x = (const block_q3_K *) vx;
|
| 4221 |
|
|
|
|
| 4222 |
const int r = item_ct1.get_local_id(2) / 4;
|
| 4223 |
const int tid = r/2;
|
| 4224 |
const int is0 = r%2;
|
|
|
|
| 4242 |
const uint8_t * hm = x[i].hmask;
|
| 4243 |
|
| 4244 |
for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4245 |
}
|
| 4246 |
|
|
|
|
| 4247 |
static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
|
| 4248 |
if (j < 4) {
|
| 4249 |
d = q[j] & 63; m = q[j + 4] & 63;
|
|
|
|
| 4252 |
m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
|
| 4253 |
}
|
| 4254 |
}
|
|
|
|
| 4255 |
|
| 4256 |
template<typename dst_t>
|
| 4257 |
static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
|
|
|
|
| 4260 |
|
| 4261 |
const int i = item_ct1.get_group(2);
|
| 4262 |
|
|
|
|
| 4263 |
// assume 32 threads
|
| 4264 |
const int tid = item_ct1.get_local_id(2);
|
| 4265 |
const int il = tid/8;
|
|
|
|
| 4283 |
y[l + 0] = d1 * (q[l] & 0xF) - m1;
|
| 4284 |
y[l +32] = d2 * (q[l] >> 4) - m2;
|
| 4285 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4286 |
}
|
| 4287 |
|
| 4288 |
template<typename dst_t>
|
|
|
|
| 4292 |
|
| 4293 |
const int i = item_ct1.get_group(2);
|
| 4294 |
|
|
|
|
| 4295 |
// assume 64 threads - this is very slightly better than the one below
|
| 4296 |
const int tid = item_ct1.get_local_id(2);
|
| 4297 |
const int il = tid/16; // il is in 0...3
|
|
|
|
| 4318 |
hm <<= 1;
|
| 4319 |
y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
|
| 4320 |
y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4321 |
}
|
| 4322 |
|
| 4323 |
template<typename dst_t>
|
|
|
|
| 4326 |
const block_q6_K * x = (const block_q6_K *) vx;
|
| 4327 |
|
| 4328 |
const int i = item_ct1.get_group(2);
|
|
|
|
| 4329 |
|
| 4330 |
// assume 64 threads - this is very slightly better than the one below
|
| 4331 |
const int tid = item_ct1.get_local_id(2);
|
|
|
|
| 4345 |
y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
|
| 4346 |
y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32);
|
| 4347 |
y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4348 |
}
|
| 4349 |
|
| 4350 |
template<typename dst_t>
|
|
|
|
| 4358 |
const block_iq2_xxs * x = (const block_iq2_xxs *) vx;
|
| 4359 |
|
| 4360 |
const int tid = item_ct1.get_local_id(2);
|
|
|
|
| 4361 |
const int il = tid/8; // 0...3
|
| 4362 |
const int ib = tid%8; // 0...7
|
| 4363 |
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
|
|
|
| 4368 |
const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
|
| 4369 |
const uint8_t signs = ksigns_iq2xs_ptr[(aux32 >> 7*il) & 127];
|
| 4370 |
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs_ptr[j] ? -1.f : 1.f);
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4371 |
}
|
| 4372 |
|
| 4373 |
template<typename dst_t>
|
|
|
|
| 4381 |
const block_iq2_xs * x = (const block_iq2_xs *) vx;
|
| 4382 |
|
| 4383 |
const int tid = item_ct1.get_local_id(2);
|
|
|
|
| 4384 |
const int il = tid/8; // 0...3
|
| 4385 |
const int ib = tid%8; // 0...7
|
| 4386 |
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
|
|
|
| 4389 |
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
| 4390 |
const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
|
| 4391 |
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4392 |
}
|
| 4393 |
|
| 4394 |
template <typename dst_t>
|
|
|
|
| 4400 |
const block_iq2_s * x = (const block_iq2_s *) vx;
|
| 4401 |
|
| 4402 |
const int tid = item_ct1.get_local_id(2);
|
|
|
|
| 4403 |
const int il = tid/8; // 0...3
|
| 4404 |
const int ib = tid%8; // 0...7
|
| 4405 |
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
|
|
|
| 4407 |
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
| 4408 |
const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
|
| 4409 |
#pragma unroll
|
| 4410 |
+
for (int j = 0; j < 8; ++j) {
|
| 4411 |
y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
| 4412 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4413 |
}
|
| 4414 |
|
| 4415 |
template<typename dst_t>
|
|
|
|
| 4423 |
const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
|
| 4424 |
|
| 4425 |
const int tid = item_ct1.get_local_id(2);
|
|
|
|
| 4426 |
const int il = tid/8; // 0...3
|
| 4427 |
const int ib = tid%8; // 0...7
|
| 4428 |
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
|
|
|
| 4437 |
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
| 4438 |
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
| 4439 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4440 |
}
|
| 4441 |
|
| 4442 |
template <typename dst_t>
|
|
|
|
| 4449 |
const block_iq3_s * x = (const block_iq3_s *) vx;
|
| 4450 |
|
| 4451 |
const int tid = item_ct1.get_local_id(2);
|
|
|
|
| 4452 |
const int il = tid/8; // 0...3
|
| 4453 |
const int ib = tid%8; // 0...7
|
| 4454 |
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
|
|
|
| 4462 |
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
| 4463 |
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
| 4464 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4465 |
}
|
| 4466 |
|
| 4467 |
template <typename dst_t>
|
|
|
|
| 4474 |
const block_iq1_s * x = (const block_iq1_s *) vx;
|
| 4475 |
|
| 4476 |
const int tid = item_ct1.get_local_id(2);
|
|
|
|
| 4477 |
const int il = tid/8; // 0...3
|
| 4478 |
const int ib = tid%8; // 0...7
|
| 4479 |
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
|
|
|
| 4487 |
for (int j = 0; j < 8; ++j) {
|
| 4488 |
y[j] = d * (q[j] + delta);
|
| 4489 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4490 |
}
|
| 4491 |
|
| 4492 |
template <typename dst_t>
|
|
|
|
| 4499 |
const block_iq1_m * x = (const block_iq1_m *) vx;
|
| 4500 |
|
| 4501 |
const int tid = item_ct1.get_local_id(2);
|
|
|
|
| 4502 |
const int il = tid/8; // 0...3
|
| 4503 |
const int ib = tid%8; // 0...7
|
| 4504 |
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
|
|
|
| 4516 |
for (int j = 0; j < 8; ++j) {
|
| 4517 |
y[j] = d * (q[j] + delta);
|
| 4518 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4519 |
}
|
| 4520 |
|
| 4521 |
template <typename dst_t>
|
|
|
|
| 4589 |
|
| 4590 |
float tmp = 0; // partial sum for thread in warp
|
| 4591 |
|
|
|
|
| 4592 |
const int tid =
|
| 4593 |
item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...15
|
| 4594 |
const int ix =
|
|
|
|
| 4639 |
tmp += dall * sum1 - dmin * sum2;
|
| 4640 |
|
| 4641 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4642 |
|
| 4643 |
// sum up partial sums and write back result
|
| 4644 |
#pragma unroll
|
|
|
|
| 4676 |
|
| 4677 |
float tmp = 0; // partial sum for thread in warp
|
| 4678 |
|
|
|
|
|
|
|
| 4679 |
const uint16_t kmask1 = 0x0303;
|
| 4680 |
const uint16_t kmask2 = 0x0f0f;
|
| 4681 |
|
|
|
|
| 4728 |
tmp += d * sum;
|
| 4729 |
|
| 4730 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4731 |
|
| 4732 |
// sum up partial sums and write back result
|
| 4733 |
#pragma unroll
|
|
|
|
| 4762 |
|
| 4763 |
const block_q4_K * x = (const block_q4_K *)vx + ib0;
|
| 4764 |
|
|
|
|
| 4765 |
const uint16_t kmask1 = 0x3f3f;
|
| 4766 |
const uint16_t kmask2 = 0x0f0f;
|
| 4767 |
const uint16_t kmask3 = 0xc0c0;
|
|
|
|
| 4850 |
#endif
|
| 4851 |
|
| 4852 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4853 |
|
| 4854 |
// sum up partial sums and write back result
|
| 4855 |
#pragma unroll
|
|
|
|
| 4884 |
|
| 4885 |
float tmp = 0; // partial sum for thread in warp
|
| 4886 |
|
|
|
|
| 4887 |
const uint16_t kmask1 = 0x3f3f;
|
| 4888 |
const uint16_t kmask2 = 0x0f0f;
|
| 4889 |
const uint16_t kmask3 = 0xc0c0;
|
|
|
|
| 4960 |
dmin * smin;
|
| 4961 |
}
|
| 4962 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4963 |
// sum up partial sums and write back result
|
| 4964 |
#pragma unroll
|
| 4965 |
for (int mask = 16; mask > 0; mask >>= 1) {
|
|
|
|
| 4986 |
|
| 4987 |
const block_q6_K * x = (const block_q6_K *)vx + ib0;
|
| 4988 |
|
|
|
|
|
|
|
| 4989 |
const int tid =
|
| 4990 |
item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
| 4991 |
const int ix =
|
|
|
|
| 5042 |
|
| 5043 |
}
|
| 5044 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5045 |
// sum up partial sums and write back result
|
| 5046 |
#pragma unroll
|
| 5047 |
for (int mask = 16; mask > 0; mask >>= 1) {
|
|
|
|
| 6586 |
vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
|
| 6587 |
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
| 6588 |
|
|
|
|
| 6589 |
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
|
| 6590 |
|
| 6591 |
int v[2];
|
|
|
|
| 6627 |
}
|
| 6628 |
|
| 6629 |
return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6630 |
}
|
| 6631 |
|
| 6632 |
template <int mmq_y>
|
|
|
|
| 6685 |
|
| 6686 |
const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
| 6687 |
|
|
|
|
| 6688 |
x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
|
|
|
|
|
|
|
|
|
|
| 6689 |
}
|
| 6690 |
|
| 6691 |
#pragma unroll
|
|
|
|
| 6728 |
vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
|
| 6729 |
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
| 6730 |
|
|
|
|
| 6731 |
const block_q5_K * bq5_K = (const block_q5_K *) vbq;
|
| 6732 |
|
| 6733 |
int vl[2];
|
|
|
|
| 6769 |
}
|
| 6770 |
|
| 6771 |
return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6772 |
}
|
| 6773 |
|
| 6774 |
template <int mmq_y>
|
|
|
|
| 6840 |
|
| 6841 |
const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
| 6842 |
|
|
|
|
| 6843 |
x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
|
|
|
|
| 6844 |
}
|
| 6845 |
|
| 6846 |
#pragma unroll
|
|
|
|
| 7020 |
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
|
| 7021 |
const uint64_t *iq2xxs_grid, const uint8_t *ksigns_iq2xs,
|
| 7022 |
const uint8_t *kmask_iq2xs) {
|
|
|
|
| 7023 |
const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;
|
| 7024 |
|
| 7025 |
#if QR2_XXS == 8
|
|
|
|
| 7060 |
}
|
| 7061 |
return d * (sumi1 + sumi2);
|
| 7062 |
#endif
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7063 |
}
|
| 7064 |
|
| 7065 |
static __dpct_inline__ float
|
|
|
|
| 7068 |
const uint64_t *iq2xs_grid, const uint64_t *ksigns64) {
|
| 7069 |
#if DPCT_COMPATIBILITY_TEMP >= \
|
| 7070 |
MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
|
|
|
| 7071 |
const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;
|
| 7072 |
|
| 7073 |
const int ib32 = iqs;
|
|
|
|
| 7105 |
assert(false);
|
| 7106 |
return 0.f;
|
| 7107 |
#endif
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7108 |
}
|
| 7109 |
|
| 7110 |
static __dpct_inline__ float
|
| 7111 |
vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
|
| 7112 |
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
|
|
|
| 7113 |
const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
|
| 7114 |
|
| 7115 |
const int ib32 = iqs;
|
|
|
|
| 7153 |
}
|
| 7154 |
const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f;
|
| 7155 |
return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
|
|
|
|
|
|
|
|
|
|
| 7156 |
}
|
| 7157 |
|
| 7158 |
static __dpct_inline__ float
|
|
|
|
| 7161 |
const uint32_t *iq3xxs_grid, const uint64_t *ksigns64) {
|
| 7162 |
#if DPCT_COMPATIBILITY_TEMP >= \
|
| 7163 |
MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
|
|
|
| 7164 |
const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;
|
| 7165 |
|
| 7166 |
const int ib32 = iqs;
|
|
|
|
| 7188 |
assert(false);
|
| 7189 |
return 0.f;
|
| 7190 |
#endif
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7191 |
}
|
| 7192 |
|
| 7193 |
static __dpct_inline__ float
|
| 7194 |
vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
|
| 7195 |
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
|
| 7196 |
const uint32_t *iq3s_grid) {
|
|
|
|
| 7197 |
const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
|
| 7198 |
|
| 7199 |
const int ib32 = iqs;
|
|
|
|
| 7222 |
(1 + 2 * ((bq2->scales[ib32 / 2] >> 4 * (ib32 % 2)) & 0xf)) *
|
| 7223 |
bq8_1[ib32].ds[0];
|
| 7224 |
return d * sumi;
|
|
|
|
|
|
|
|
|
|
| 7225 |
}
|
| 7226 |
|
| 7227 |
static __dpct_inline__ float
|
| 7228 |
vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
|
| 7229 |
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
|
| 7230 |
const uint32_t *iq1s_grid_gpu) {
|
|
|
|
| 7231 |
const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
|
| 7232 |
|
| 7233 |
const int ib32 = iqs;
|
|
|
|
| 7246 |
const float d = d1q * bq8_1[ib32].ds[0];
|
| 7247 |
const float m = d1q * bq8_1[ib32].ds[1];
|
| 7248 |
return d * sumi + m * delta;
|
|
|
|
|
|
|
|
|
|
| 7249 |
}
|
| 7250 |
|
| 7251 |
static __dpct_inline__ float
|
| 7252 |
vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
|
| 7253 |
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
|
|
|
| 7254 |
const block_iq1_m * bq1 = (const block_iq1_m *) vbq;
|
| 7255 |
|
| 7256 |
const int ib32 = iqs;
|
|
|
|
| 7275 |
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
| 7276 |
const float d = (float)scale.f16 * bq8_1[ib32].ds[0];
|
| 7277 |
return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1));
|
|
|
|
|
|
|
|
|
|
| 7278 |
}
|
| 7279 |
|
| 7280 |
static __dpct_inline__ void get_int_from_table_16(const uint32_t &q4,
|
|
|
|
| 7322 |
vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
|
| 7323 |
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
| 7324 |
|
|
|
|
| 7325 |
const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
|
| 7326 |
const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
|
| 7327 |
|
|
|
|
| 7339 |
sumi2 = dpct::dp4a(v2, q8[j + 4], sumi2);
|
| 7340 |
}
|
| 7341 |
return d * (sumi1 + sumi2);
|
|
|
|
|
|
|
|
|
|
| 7342 |
}
|
| 7343 |
|
| 7344 |
template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x,
|
|
|
|
| 9801 |
static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
|
| 9802 |
dpct::queue_ptr stream) {
|
| 9803 |
const int nb = k / QK_K;
|
|
|
|
| 9804 |
{
|
| 9805 |
dpct::has_capability_or_fail(stream->get_device(),
|
| 9806 |
{sycl::aspect::fp16});
|
|
|
|
| 9812 |
dequantize_block_q2_K(vx, y, item_ct1);
|
| 9813 |
});
|
| 9814 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9815 |
}
|
| 9816 |
|
| 9817 |
template <typename dst_t>
|
| 9818 |
static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
|
| 9819 |
dpct::queue_ptr stream) {
|
| 9820 |
const int nb = k / QK_K;
|
|
|
|
| 9821 |
{
|
| 9822 |
dpct::has_capability_or_fail(stream->get_device(),
|
| 9823 |
{sycl::aspect::fp16});
|
|
|
|
| 9829 |
dequantize_block_q3_K(vx, y, item_ct1);
|
| 9830 |
});
|
| 9831 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9832 |
}
|
| 9833 |
|
| 9834 |
template <typename dst_t>
|
|
|
|
| 9889 |
static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
|
| 9890 |
dpct::queue_ptr stream) {
|
| 9891 |
const int nb = k / QK_K;
|
|
|
|
| 9892 |
{
|
| 9893 |
dpct::has_capability_or_fail(stream->get_device(),
|
| 9894 |
{sycl::aspect::fp16});
|
|
|
|
| 9900 |
dequantize_block_q5_K(vx, y, item_ct1);
|
| 9901 |
});
|
| 9902 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9903 |
}
|
| 9904 |
|
| 9905 |
template <typename dst_t>
|
| 9906 |
static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
|
| 9907 |
dpct::queue_ptr stream) {
|
| 9908 |
const int nb = k / QK_K;
|
|
|
|
| 9909 |
{
|
| 9910 |
dpct::has_capability_or_fail(stream->get_device(),
|
| 9911 |
{sycl::aspect::fp16});
|
|
|
|
| 9917 |
dequantize_block_q6_K(vx, y, item_ct1);
|
| 9918 |
});
|
| 9919 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9920 |
}
|
| 9921 |
|
| 9922  | template <typename dst_t>
| ...   |
| 10068 | static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
| 10069 |                                        dpct::queue_ptr stream) {
| 10070 |     const int nb = (k + QK_K - 1) / QK_K;
| 10071 |     {
| 10072 |         dpct::has_capability_or_fail(stream->get_device(),
| 10073 |                                      {sycl::aspect::fp16});
| ...   |
| 10082 |             });
| 10083 |         });
| 10084 |     }
| 10085 | }
| 10086 |
| 10087 |
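One small difference from the q*_K launchers above: dequantize_row_iq4_xs_sycl rounds its block count up, so a trailing partial super-block is still covered. The idiom in isolation:

// Ceil division: super-blocks needed to cover k values when k may not be an
// exact multiple of QK_K; the q*_K launchers use the truncating k / QK_K.
static inline int num_superblocks(int k) { return (k + QK_K - 1) / QK_K; }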
| ...   |
| 11586 |                           const int nrows_y, const int nrows_dst,
| 11587 |                           dpct::queue_ptr stream) try {
| 11588 |
| 11589 |     int id;
| 11590 |     SYCL_CHECK(
| 11591 |         CHECK_TRY_ERROR(id = get_current_device_id()));
| ...   |
| 11700 |             });
| 11701 |         }
| 11702 |     }
| 11703 | }
| 11704 | catch (sycl::exception const &exc) {
| 11705 |     std::cerr << exc.what() << "Exception caught at file:" << __FILE__
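The mul-mat launcher fragment above shows the dpct error-handling idiom used throughout this file: the whole host wrapper is a function-level try block, so a sycl::exception escaping any submission lands in one catch that reports file and line. A self-contained sketch of the idiom (the kernel and the exit policy are illustrative; SYCL_CHECK and CHECK_TRY_ERROR are the file's own macros and are not reproduced here):

#include <sycl/sycl.hpp>
#include <cstdlib>
#include <iostream>

// Function-level try block: it covers the entire wrapper body, including the
// queue submission, without adding a brace level inside the function.
static void launch_noop(sycl::queue &q) try {
    q.submit([&](sycl::handler &cgh) {
        cgh.single_task([]() { /* kernel body would go here */ });
    }).wait();
}
catch (sycl::exception const &exc) {
    std::cerr << exc.what() << " Exception caught at file:" << __FILE__
              << ", line:" << __LINE__ << std::endl;
    std::exit(1);
}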
ggml.c
CHANGED
@@ -871,22 +871,14 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|  871 |     },
|  872 |     [GGML_TYPE_IQ4_XS] = {
|  873 |         .type_name = "iq4_xs",
|  874 | -#if QK_K == 64
|  875 | -        .blck_size = QK4_NL,
|  876 | -#else
|  877 |         .blck_size = QK_K,
|  878 | -#endif
|  879 |         .type_size = sizeof(block_iq4_xs),
|  880 |         .is_quantized = true,
|  881 |         .to_float = (ggml_to_float_t) dequantize_row_iq4_xs,
|  882 |         .from_float = quantize_row_iq4_xs,
|  883 |         .from_float_reference = (ggml_from_float_t)quantize_row_iq4_xs_reference,
|  884 |         .vec_dot = ggml_vec_dot_iq4_xs_q8_K,
|  885 | -#if QK_K == 64
|  886 | -        .vec_dot_type = GGML_TYPE_Q8_0,
|  887 | -#else
|  888 |         .vec_dot_type = GGML_TYPE_Q8_K,
|  889 | -#endif
|  890 |         .nrows = 1,
|  891 |     },
|  892 |     [GGML_TYPE_Q8_K] = {
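With the QK_K == 64 branch gone, the IQ4_XS entry is unconditional: blocks of QK_K = 256 weights stored in sizeof(block_iq4_xs) bytes each, dotted against Q8_K. A hedged sketch of what the two remaining size fields imply (the helper is hypothetical, not a ggml API):

#include <cstddef>
#include <cstdint>

// Hypothetical helper: row storage implied by the (.type_size, .blck_size)
// pair above; ne is assumed to be a multiple of blck_size.
static size_t row_size_bytes(size_t type_size, int64_t blck_size, int64_t ne) {
    return type_size * (size_t)(ne / blck_size);
}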
@@ -22117,11 +22109,7 @@ size_t ggml_quantize_chunk(
| 22117 |         case GGML_TYPE_IQ1_S:  result = quantize_iq1_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
| 22118 |         case GGML_TYPE_IQ1_M:  result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
| 22119 |         case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
| 22120 | -#if QK_K == 64
| 22121 | -        case GGML_TYPE_IQ4_XS: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
| 22122 | -#else
| 22123 |         case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
| 22124 | -#endif
| 22125 |         case GGML_TYPE_F16:
| 22126 |             {
| 22127 |                 size_t elemsize = sizeof(ggml_fp16_t);
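Each surviving case writes its output at (char *) dst + start_row * row_size, i.e. quantized rows are packed contiguously and a chunk is addressed by its starting row. The offset arithmetic in isolation (names are local to this sketch, not the ggml API):

#include <cstddef>

// Address of row start_row in a contiguous buffer of fixed-size rows, as in
// the (char *) dst + start_row * row_size expressions above.
static void *chunk_dst(void *dst, long long start_row, size_t row_size) {
    return (char *)dst + (size_t)start_row * row_size;
}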