ggerganov committed
Commit 8737d46 · 1 Parent(s): be0ec58

ggml : drop support for QK_K=64 (llama/7473)


* ggml : drop support for QK_K=64

ggml-ci

* opencl : restore QK_K=256 define
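
With this commit the K-quant super-block size is unconditionally QK_K = 256; builds that previously defined GGML_QKK_64 now get the same layout as everyone else. As a rough sanity check of the surviving constants (a standalone C sketch, not part of the commit; sizeof(ggml_half) == 2 is assumed):

    #include <assert.h>
    #include <stddef.h>

    #define QK_K         256 // super-block size, now the only supported value
    #define K_SCALE_SIZE 12  // packed 6-bit sub-block scales/mins (q4_K/q5_K)

    int main(void) {
        // q4_K: two 2-byte super-block scales + packed sub-block scales
        // + 256 4-bit quants => 144 bytes per 256 weights.
        const size_t q4_K_bytes = 2*2 + K_SCALE_SIZE + QK_K/2;
        assert(q4_K_bytes == 144);        // 144*8/256 = 4.5 bits per weight
        return 0;
    }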

ggml-common.h CHANGED
@@ -65,13 +65,8 @@ typedef sycl::half2 ggml_half2;
  // QK = number of values after dequantization
  // QK_K = super-block size
 
- #ifdef GGML_QKK_64
- #define QK_K 64
- #define K_SCALE_SIZE 4
- #else
  #define QK_K 256
  #define K_SCALE_SIZE 12
- #endif // GGML_QKK_64
 
  #if defined(GGML_COMMON_DECL_CUDA) || defined(GGML_COMMON_DECL_HIP) || defined(GGML_COMMON_DECL_SYCL)
  // QR = QK / number of values before dequantization
@@ -131,13 +126,8 @@ typedef sycl::half2 ggml_half2;
  #define QI4_NL (QK4_NL / (4*QR4_NL))
  #define QR4_NL 2
 
- #if QK_K == 64
- #define QI4_XS QI4_NL
- #define QR4_XS QR4_NL
- #else
  #define QI4_XS (QK_K / (4*QR4_XS))
  #define QR4_XS 8
- #endif
 
  #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
 
@@ -228,15 +218,6 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wro
  // weight is represented as x = a * q
  // 16 blocks of 16 elements each
  // Effectively 3.4375 bits per weight
- #ifdef GGML_QKK_64
- typedef struct {
- uint8_t hmask[QK_K/8]; // quants - high bit
- uint8_t qs[QK_K/4]; // quants - low 2 bits
- uint8_t scales[2];
- ggml_half d; // super-block scale
- } block_q3_K;
- static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
- #else
  typedef struct {
  uint8_t hmask[QK_K/8]; // quants - high bit
  uint8_t qs[QK_K/4]; // quants - low 2 bits
@@ -244,20 +225,11 @@ typedef struct {
  ggml_half d; // super-block scale
  } block_q3_K;
  static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
- #endif
 
  // 4-bit quantization
  // 8 blocks of 32 elements each
  // weight is represented as x = a * q + b
  // Effectively 4.5 bits per weight
- #ifdef GGML_QKK_64
- typedef struct {
- ggml_half d[2]; // super-block scales/mins
- uint8_t scales[2]; // 4-bit block scales/mins
- uint8_t qs[QK_K/2]; // 4--bit quants
- } block_q4_K;
- static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + QK_K/2 + 2, "wrong q4_K block size/padding");
- #else
  typedef struct {
  union {
  struct {
@@ -270,21 +242,11 @@ typedef struct {
  uint8_t qs[QK_K/2]; // 4--bit quants
  } block_q4_K;
  static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
- #endif
 
  // 5-bit quantization
  // 8 blocks of 32 elements each
  // weight is represented as x = a * q + b
  // Effectively 5.5 bits per weight
- #ifdef GGML_QKK_64
- typedef struct {
- ggml_half d; // super-block scale
- int8_t scales[QK_K/16]; // 8-bit block scales
- uint8_t qh[QK_K/8]; // quants, high bit
- uint8_t qs[QK_K/2]; // quants, low 4 bits
- } block_q5_K;
- static_assert(sizeof(block_q5_K) == sizeof(ggml_half) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
- #else
  typedef struct {
  union {
  struct {
@@ -298,7 +260,6 @@ typedef struct {
  uint8_t qs[QK_K/2]; // quants, low 4 bits
  } block_q5_K;
  static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
- #endif
 
  // 6-bit quantization
  // weight is represented as x = a * q
@@ -356,11 +317,7 @@ typedef struct {
  static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_half) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
 
  // 3.4375 bpw
- #if QK_K == 64
- #define IQ3S_N_SCALE 2
- #else
  #define IQ3S_N_SCALE QK_K/64
- #endif
  typedef struct {
  ggml_half d;
  uint8_t qs[QK_K/4];
@@ -381,16 +338,9 @@ static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wron
  typedef struct {
  uint8_t qs[QK_K/8]; // grid index, low 8 bits
  uint8_t qh[QK_K/16]; // grid index, high 3 bits + grid shift bit (for two groups of 8)
- #if QK_K == 64
- ggml_half d;
- #endif
  uint8_t scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
  } block_iq1_m;
- #if QK_K == 64
- static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding");
- #else
  static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
- #endif
 
  // Used by IQ1_M quants
  typedef union {
@@ -406,9 +356,6 @@ typedef struct {
  } block_iq4_nl;
  static_assert(sizeof(block_iq4_nl) == sizeof(ggml_half) + QK4_NL/2, "wrong iq4_nl block size/padding");
 
- #if QK_K == 64
- #define block_iq4_xs block_iq4_nl
- #else
  typedef struct {
  ggml_half d;
  uint16_t scales_h;
@@ -416,7 +363,6 @@ typedef struct {
  uint8_t qs[QK_K/2];
  } block_iq4_xs;
  static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
- #endif
 
  #endif // GGML_COMMON_DECL
  #endif // GGML_COMMON_DECL
 
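The "Effectively 3.4375 bits per weight" comment can be checked against the surviving QK_K = 256 layout of block_q3_K (a hedged sketch; sizeof(ggml_half) == 2 is assumed):

    // hmask (QK_K/8 = 32 B) + qs (QK_K/4 = 64 B) + scales (12 B) + d (2 B)
    // = 110 bytes per 256 weights => 110*8/256 = 3.4375 bits per weight.
    static_assert(256/8 + 256/4 + 12 + 2 == 110, "q3_K super-block is 110 bytes");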
ggml-cuda/convert.cu CHANGED
@@ -131,7 +131,6 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t
  const block_q2_K * x = (const block_q2_K *) vx;
 
  const int64_t tid = threadIdx.x;
- #if QK_K == 256
  const int64_t n = tid/32;
  const int64_t l = tid - 32*n;
  const int64_t is = 8*n + l/16;
@@ -145,17 +144,6 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t
  y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
  y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
  y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
- #else
- const int64_t is = tid/16; // 0 or 1
- const int64_t il = tid%16; // 0...15
- const uint8_t q = x[i].qs[il] >> (2*is);
- dst_t * y = yy + i*QK_K + 16*is + il;
- float dall = __low2half(x[i].dm);
- float dmin = __high2half(x[i].dm);
- y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
- y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
- #endif
-
  }
 
  template<typename dst_t>
@@ -164,7 +152,6 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t
  const int64_t i = blockIdx.x;
  const block_q3_K * x = (const block_q3_K *) vx;
 
- #if QK_K == 256
  const int64_t r = threadIdx.x/4;
  const int64_t tid = r/2;
  const int64_t is0 = r%2;
@@ -188,31 +175,8 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t
  const uint8_t * hm = x[i].hmask;
 
  for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
- #else
- const int64_t tid = threadIdx.x;
- const int64_t is = tid/16; // 0 or 1
- const int64_t il = tid%16; // 0...15
- const int64_t im = il/8; // 0...1
- const int64_t in = il%8; // 0...7
-
- dst_t * y = yy + i*QK_K + 16*is + il;
-
- const uint8_t q = x[i].qs[il] >> (2*is);
- const uint8_t h = x[i].hmask[in] >> (2*is + im);
- const float d = (float)x[i].d;
-
- if (is == 0) {
- y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
- y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
- } else {
- y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
- y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
- }
- #endif
-
  }
 
- #if QK_K == 256
  static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
  if (j < 4) {
  d = q[j] & 63; m = q[j + 4] & 63;
@@ -221,7 +185,6 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
  m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
  }
  }
- #endif
 
  template<typename dst_t>
  static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
@@ -229,7 +192,6 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t
 
  const int64_t i = blockIdx.x;
 
- #if QK_K == 256
  // assume 32 threads
  const int64_t tid = threadIdx.x;
  const int64_t il = tid/8;
@@ -253,15 +215,6 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t
  y[l + 0] = d1 * (q[l] & 0xF) - m1;
  y[l +32] = d2 * (q[l] >> 4) - m2;
  }
- #else
- const int64_t tid = threadIdx.x;
- const uint8_t * q = x[i].qs;
- dst_t * y = yy + i*QK_K;
- const float d = (float)x[i].dm[0];
- const float m = (float)x[i].dm[1];
- y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
- y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
- #endif
  }
 
  template<typename dst_t>
@@ -270,7 +223,6 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t
 
  const int64_t i = blockIdx.x;
 
- #if QK_K == 256
  // assume 64 threads - this is very slightly better than the one below
  const int64_t tid = threadIdx.x;
  const int64_t il = tid/16; // il is in 0...3
@@ -297,18 +249,6 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t
  hm <<= 1;
  y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
  y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
- #else
- const int64_t tid = threadIdx.x;
- const uint8_t q = x[i].qs[tid];
- const int64_t im = tid/8; // 0...3
- const int64_t in = tid%8; // 0...7
- const int64_t is = tid/16; // 0 or 1
- const uint8_t h = x[i].qh[in] >> im;
- const float d = x[i].d;
- dst_t * y = yy + i*QK_K + tid;
- y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
- y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16));
- #endif
  }
 
  template<typename dst_t>
@@ -316,7 +256,6 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
  const block_q6_K * x = (const block_q6_K *) vx;
 
  const int64_t i = blockIdx.x;
- #if QK_K == 256
 
  // assume 64 threads - this is very slightly better than the one below
  const int64_t tid = threadIdx.x;
@@ -336,24 +275,6 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
  y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
  y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32);
  y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
- #else
-
- // assume 32 threads
- const int64_t tid = threadIdx.x;
- const int64_t ip = tid/16; // 0 or 1
- const int64_t il = tid - 16*ip; // 0...15
-
- dst_t * y = yy + i*QK_K + 16*ip + il;
-
- const float d = x[i].d;
-
- const uint8_t ql = x[i].ql[16*ip + il];
- const uint8_t qh = x[i].qh[il] >> (2*ip);
- const int8_t * sc = x[i].scales;
-
- y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
- y[32] = d * sc[ip+2] * ((int8_t)((ql >> 4) | (((qh >> 4) & 3) << 4)) - 32);
- #endif
  }
 
  template<typename dst_t>
@@ -363,7 +284,6 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds
  const block_iq2_xxs * x = (const block_iq2_xxs *) vx;
 
  const int64_t tid = threadIdx.x;
- #if QK_K == 256
  const int64_t il = tid/8; // 0...3
  const int64_t ib = tid%8; // 0...7
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -374,10 +294,6 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds
  const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
  const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
  for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
- #else
- NO_DEVICE_CODE;
- #endif
-
  }
 
  template<typename dst_t>
@@ -387,7 +303,6 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
  const block_iq2_xs * x = (const block_iq2_xs *) vx;
 
  const int64_t tid = threadIdx.x;
- #if QK_K == 256
  const int64_t il = tid/8; // 0...3
  const int64_t ib = tid%8; // 0...7
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -396,10 +311,6 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
  const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
  const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
  for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
- #else
- NO_DEVICE_CODE;
- #endif
-
  }
 
  template<typename dst_t>
@@ -409,7 +320,6 @@ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_
  const block_iq2_s * x = (const block_iq2_s *) vx;
 
  const int64_t tid = threadIdx.x;
- #if QK_K == 256
  const int64_t il = tid/8; // 0...3
  const int64_t ib = tid%8; // 0...7
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -417,10 +327,6 @@ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_
  const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
  const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
  for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
- #else
- NO_DEVICE_CODE;
- #endif
-
  }
 
  template<typename dst_t>
@@ -430,7 +336,6 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
  const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
 
  const int64_t tid = threadIdx.x;
- #if QK_K == 256
  const int64_t il = tid/8; // 0...3
  const int64_t ib = tid%8; // 0...7
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -445,10 +350,6 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
  y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
  y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
  }
- #else
- NO_DEVICE_CODE;
- #endif
-
  }
 
  template<typename dst_t>
@@ -458,7 +359,6 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_
  const block_iq3_s * x = (const block_iq3_s *) vx;
 
  const int64_t tid = threadIdx.x;
- #if QK_K == 256
  const int64_t il = tid/8; // 0...3
  const int64_t ib = tid%8; // 0...7
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -471,10 +371,6 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_
  y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
  y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
  }
- #else
- NO_DEVICE_CODE;
- #endif
-
  }
 
  template<typename dst_t>
@@ -484,7 +380,6 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
  const block_iq1_s * x = (const block_iq1_s *) vx;
 
  const int64_t tid = threadIdx.x;
- #if QK_K == 256
  const int64_t il = tid/8; // 0...3
  const int64_t ib = tid%8; // 0...7
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -497,10 +392,6 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
  for (int j = 0; j < 8; ++j) {
  y[j] = d * (q[j] + delta);
  }
- #else
- NO_DEVICE_CODE;
- #endif
-
  }
 
  template<typename dst_t>
@@ -510,7 +401,6 @@ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_
  const block_iq1_m * x = (const block_iq1_m *) vx;
 
  const int64_t tid = threadIdx.x;
- #if QK_K == 256
  const int64_t il = tid/8; // 0...3
  const int64_t ib = tid%8; // 0...7
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -527,13 +417,8 @@ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_
  for (int j = 0; j < 8; ++j) {
  y[j] = d * (q[j] + delta);
  }
- #else
- NO_DEVICE_CODE;
- #endif
-
  }
 
-
  template<typename dst_t>
  static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {
 
@@ -550,10 +435,8 @@ static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst
  y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
  y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
  }
-
  }
 
- #if QK_K != 64
  template<typename dst_t>
  static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
  const int64_t i = blockIdx.x;
@@ -570,7 +453,6 @@ static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst
  y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
  }
  }
- #endif
 
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
  static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
@@ -592,21 +474,13 @@ static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half *
  template<typename dst_t>
  static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
  const int nb = k / QK_K;
- #if QK_K == 256
  dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
- #else
- dequantize_block_q2_K<<<nb, 32, 0, stream>>>(vx, y);
- #endif
  }
 
  template<typename dst_t>
  static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
  const int nb = k / QK_K;
- #if QK_K == 256
  dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
- #else
- dequantize_block_q3_K<<<nb, 32, 0, stream>>>(vx, y);
- #endif
  }
 
  template<typename dst_t>
@@ -632,21 +506,13 @@ static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int64_t k
  template<typename dst_t>
  static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
  const int nb = k / QK_K;
- #if QK_K == 256
  dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
- #else
- dequantize_block_q5_K<<<nb, 32, 0, stream>>>(vx, y);
- #endif
  }
 
  template<typename dst_t>
  static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
  const int nb = k / QK_K;
- #if QK_K == 256
  dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
- #else
- dequantize_block_q6_K<<<nb, 32, 0, stream>>>(vx, y);
- #endif
  }
 
  template<typename dst_t>
@@ -700,11 +566,7 @@ static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int64_t
  template<typename dst_t>
  static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
  const int nb = (k + QK_K - 1) / QK_K;
- #if QK_K == 64
- dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
- #else
  dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y);
- #endif
  }
 
  template <typename src_t, typename dst_t>
 
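All the QK_K=64 fallbacks above launched 32-thread blocks; what survives is the 256-wide layout with one thread block per super-block and 64 threads each. A host-side sketch of why the kept q2_K indexing covers all 256 outputs exactly once (the 128*n base offset is assumed from the full kernel, which the hunk truncates):

    #include <assert.h>
    #include <string.h>

    int main(void) {
        char hit[256]; memset(hit, 0, sizeof(hit));
        for (int tid = 0; tid < 64; ++tid) {          // 64 threads per block
            const int n = tid/32, l = tid - 32*n;     // as in the kept kernel
            for (int s = 0; s < 4; ++s) {
                hit[128*n + l + 32*s]++;              // y[l], y[l+32], y[l+64], y[l+96]
            }
        }
        for (int j = 0; j < 256; ++j) assert(hit[j] == 1); // each output once
        return 0;
    }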
ggml-cuda/dmmv.cu CHANGED
@@ -22,7 +22,6 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
 
  float tmp = 0; // partial sum for thread in warp
 
- #if QK_K == 256
  const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
  const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
 
@@ -71,37 +70,6 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
  tmp += dall * sum1 - dmin * sum2;
 
  }
- #else
- const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7
- const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3
- const int offset = tid * K_QUANTS_PER_ITERATION;
-
- uint32_t uaux[2];
- const uint8_t * d = (const uint8_t *)uaux;
-
- for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
- const float * y = yy + i * QK_K + offset;
- const uint8_t * q = x[i].qs + offset;
- const uint32_t * s = (const uint32_t *)x[i].scales;
-
- uaux[0] = s[0] & 0x0f0f0f0f;
- uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
-
- const float2 dall = __half22float2(x[i].dm);
-
- float sum1 = 0, sum2 = 0;
- for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
- const uint8_t ql = q[l];
- sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
- + y[l+16] * d[1] * ((ql >> 2) & 3)
- + y[l+32] * d[2] * ((ql >> 4) & 3)
- + y[l+48] * d[3] * ((ql >> 6) & 3);
- sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
- }
- tmp += dall.x * sum1 - dall.y * sum2;
- }
- #endif
 
  // sum up partial sums and write back result
  tmp = warp_reduce_sum(tmp);
@@ -123,8 +91,6 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
 
  float tmp = 0; // partial sum for thread in warp
 
- #if QK_K == 256
-
  const uint16_t kmask1 = 0x0303;
  const uint16_t kmask2 = 0x0f0f;
 
@@ -175,34 +141,6 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
  tmp += d * sum;
 
  }
- #else
-
- const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7
- const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3
- const int offset = tid * K_QUANTS_PER_ITERATION; // 0...15 or 0...14
- const int in = offset/8; // 0 or 1
- const int im = offset%8; // 0...7
-
- for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
- const float * y = yy + i * QK_K + offset;
- const uint8_t * q = x[i].qs + offset;
- const uint8_t * s = x[i].scales;
-
- const float dall = (float)x[i].d;
-
- float sum = 0;
- for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
- const uint8_t hl = x[i].hmask[im+l] >> in;
- const uint8_t ql = q[l];
- sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
- + y[l+16] * dall * ((s[0] >> 4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
- + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
- + y[l+48] * dall * ((s[1] >> 4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
- }
- tmp += sum;
- }
- #endif
 
  // sum up partial sums and write back result
  tmp = warp_reduce_sum(tmp);
@@ -221,7 +159,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
 
  const block_q4_K * x = (const block_q4_K *)vx + ib0;
 
- #if QK_K == 256
  const uint16_t kmask1 = 0x3f3f;
  const uint16_t kmask2 = 0x0f0f;
  const uint16_t kmask3 = 0xc0c0;
@@ -306,36 +243,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
  #endif
 
  }
- #else
- const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15
- const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
-
- const int step = tid * K_QUANTS_PER_ITERATION;
-
- uint16_t aux16[2];
- const uint8_t * s = (const uint8_t *)aux16;
-
- float tmp = 0;
-
- for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
- const uint8_t * q = x[i].qs + step;
- const float * y = yy + i*QK_K + step;
- const uint16_t * a = (const uint16_t *)x[i].scales;
- aux16[0] = a[0] & 0x0f0f;
- aux16[1] = (a[0] >> 4) & 0x0f0f;
- const float d = (float)x[i].dm[0];
- const float m = (float)x[i].dm[1];
- float sum = 0.f;
- for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
- sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
- + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
- + y[j+32] * (d * s[1] * (q[j+ 0] >> 4) - m * s[3])
- + y[j+48] * (d * s[1] * (q[j+16] >> 4) - m * s[3]);
- }
- tmp += sum;
- }
-
- #endif
 
  // sum up partial sums and write back result
  tmp = warp_reduce_sum(tmp);
@@ -355,7 +262,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
 
  float tmp = 0; // partial sum for thread in warp
 
- #if QK_K == 256
  const uint16_t kmask1 = 0x3f3f;
  const uint16_t kmask2 = 0x0f0f;
  const uint16_t kmask3 = 0xc0c0;
@@ -426,30 +332,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
  tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
  }
 
- #else
- const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15
- const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
- const int step = tid * K_QUANTS_PER_ITERATION;
- const int im = step/8;
- const int in = step%8;
-
- for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
- const uint8_t * q = x[i].qs + step;
- const int8_t * s = x[i].scales;
- const float * y = yy + i*QK_K + step;
- const float d = x[i].d;
- float sum = 0.f;
- for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
- const uint8_t h = x[i].qh[in+j] >> im;
- sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
- + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
- + y[j+32] * d * s[2] * ((q[j+ 0] >> 4) - ((h >> 4) & 1 ? 0 : 16))
- + y[j+48] * d * s[3] * ((q[j+16] >> 4) - ((h >> 6) & 1 ? 0 : 16));
- }
- tmp += sum;
- }
- #endif
-
  // sum up partial sums and write back result
  tmp = warp_reduce_sum(tmp);
 
@@ -470,8 +352,6 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
 
  const block_q6_K * x = (const block_q6_K *)vx + ib0;
 
- #if QK_K == 256
-
  const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
  const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1
 
@@ -526,37 +406,6 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
 
  }
 
- #else
-
- const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...7
- const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0...3
-
- const int step = tid * K_QUANTS_PER_ITERATION;
-
- float tmp = 0; // partial sum for thread in warp
-
- for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
- const float * y = yy + i * QK_K + step;
- const uint8_t * ql = x[i].ql + step;
- const uint8_t * qh = x[i].qh + step;
- const int8_t * s = x[i].scales;
-
- const float d = x[i+0].d;
-
- float sum = 0;
- for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
- sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
- + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
- + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >> 4) | ((qh[j] & 0x30) >> 0)) - 32)
- + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >> 4) | ((qh[j] & 0xc0) >> 2)) - 32);
- }
- tmp += sum;
-
- }
-
- #endif
-
  // sum up partial sums and write back result
  tmp = warp_reduce_sum(tmp);
 
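Every one of these mat-vec kernels funnels its per-thread partial sum through the same warp reduction before the write-back. For reference, a generic sketch of that idiom (the real warp_reduce_sum lives in the shared CUDA header; this version is illustrative):

    // Butterfly reduction over a 32-lane warp: after log2(32) = 5 shuffle
    // steps, every lane holds the sum of all 32 partial sums.
    static __device__ __forceinline__ float warp_reduce_sum(float x) {
    #pragma unroll
        for (int offset = 16; offset > 0; offset >>= 1) {
            x += __shfl_xor_sync(0xffffffff, x, offset, 32);
        }
        return x;
    }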
ggml-cuda/mmq.cu CHANGED
@@ -826,11 +826,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
  const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
 
- #if QK_K == 256
  x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
- #else
- x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
- #endif
  }
 
  #pragma unroll
@@ -933,9 +929,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
  const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
 
- #if QK_K == 256
  x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
- #endif
  }
 
  #pragma unroll
 
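The deleted #else branch had to assemble a half2 from the two separate fp16 values of the QK_K=64 q4_K layout; with only QK_K=256 left, dm is already a half2 and is copied in one store. A toy illustration of the difference (hypothetical helper, not from the commit):

    #include <cuda_fp16.h>

    __device__ void store_dm_256(half2 * dst, const half2 dm) {
        *dst = dm;  // QK_K == 256: d and dmin travel together in one 4-byte store
        // the removed QK_K == 64 path had to build the pair first, e.g.:
        // *dst = __halves2half2(d[0], d[1]);
    }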
ggml-cuda/vecdotq.cuh CHANGED
@@ -712,7 +712,6 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
  static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 
- #ifndef GGML_QKK_64
  const block_q4_K * bq4_K = (const block_q4_K *) vbq;
 
  int v[2];
@@ -754,58 +753,11 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
  }
 
  return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
-
- #else
-
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
- const block_q4_K * bq4_K = (const block_q4_K *) vbq;
-
- float sumf_d = 0.0f;
- float sumf_m = 0.0f;
-
- uint16_t aux16[2];
- const uint8_t * s = (const uint8_t *)aux16;
-
- const uint16_t * a = (const uint16_t *)bq4_K->scales;
- aux16[0] = a[0] & 0x0f0f;
- aux16[1] = (a[0] >> 4) & 0x0f0f;
-
- const float dall = bq4_K->dm[0];
- const float dmin = bq4_K->dm[1];
-
- const float d8_1 = __low2float(bq8_1[0].ds);
- const float d8_2 = __low2float(bq8_1[1].ds);
-
- const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
- const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
- const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
- const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
-
- const int * q4 = (const int *)bq4_K->qs + (iqs/2);
- const int v1 = q4[0];
- const int v2 = q4[4];
-
- const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
- const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
- const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
- const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
-
- sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
- sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
-
- return dall * sumf_d - dmin * sumf_m;
-
- #else
- NO_DEVICE_CODE;
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-
- #endif
  }
 
  static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 
- #ifndef GGML_QKK_64
  const block_q5_K * bq5_K = (const block_q5_K *) vbq;
 
  int vl[2];
@@ -847,48 +799,6 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
  }
 
  return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
-
- #else
-
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
- const block_q5_K * bq5_K = (const block_q5_K *) vbq;
-
- const int8_t * s = bq5_K->scales;
-
- const float d = bq5_K->d;
-
- const float d8_1 = __low2half(bq8_1[0].ds);
- const float d8_2 = __low2half(bq8_1[1].ds);
-
- const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
- const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
- const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
- const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
-
- const int * ql = (const int *)bq5_K->qs + (iqs/2);
- const int vl1 = ql[0];
- const int vl2 = ql[4];
-
- const int step = 4 * (iqs/2); // 0, 4, 8, 12
- const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
- const int in = step%8; // 0, 4, 0, 4
- const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
-
- const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
- const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
- const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
- const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
-
- const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
- + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
-
- return d * sumf_d;
-
- #else
- NO_DEVICE_CODE;
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-
- #endif
  }
 
  static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
@@ -919,7 +829,6 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
 
  static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
- #if QK_K == 256
  const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;
 
  #if QR2_XXS == 8
@@ -960,15 +869,11 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
  }
  return d * (sumi1 + sumi2);
  #endif
- #else
- NO_DEVICE_CODE;
- #endif
  }
 
  static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
- #if QK_K == 256
  const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;
 
  const int ib32 = iqs;
@@ -1002,17 +907,12 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
  GGML_UNUSED(ksigns64);
  NO_DEVICE_CODE;
  #endif
- #else
- GGML_UNUSED(ksigns64);
- NO_DEVICE_CODE;
- #endif
  }
 
  // TODO
  static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
- #if QK_K == 256
  const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
 
  const int ib32 = iqs;
@@ -1048,16 +948,11 @@ static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
  GGML_UNUSED(ksigns64);
  NO_DEVICE_CODE;
  #endif
- #else
- GGML_UNUSED(ksigns64);
- NO_DEVICE_CODE;
- #endif
  }
 
  static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
- #if QK_K == 256
  const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;
 
  const int ib32 = iqs;
@@ -1082,16 +977,12 @@ static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
  #else
  NO_DEVICE_CODE;
  #endif
- #else
- NO_DEVICE_CODE;
- #endif
  }
 
  // TODO: don't use lookup table for signs
  static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
- #if QK_K == 256
  const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
 
  const int ib32 = iqs;
@@ -1114,14 +1005,10 @@ static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
  #else
  NO_DEVICE_CODE;
  #endif
- #else
- NO_DEVICE_CODE;
- #endif
  }
 
  static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
- #if QK_K == 256
  const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
 
  const int ib32 = iqs;
@@ -1149,14 +1036,10 @@ static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
  const float d = d1q * __low2float (bq8_1[ib32].ds);
  const float m = d1q * __high2float(bq8_1[ib32].ds);
  return d * sumi + m * delta;
- #else
- NO_DEVICE_CODE;
- #endif
  }
 
  static __device__ __forceinline__ float vec_dot_iq1_m_q8_1(
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
- #if QK_K == 256
  const block_iq1_m * bq1 = (const block_iq1_m *) vbq;
 
  const int ib32 = iqs;
@@ -1192,9 +1075,6 @@ static __device__ __forceinline__ float vec_dot_iq1_m_q8_1(
  scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
  const float d = (float)scale.f16 * __low2float (bq8_1[ib32].ds);
  return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1));
- #else
- NO_DEVICE_CODE;
- #endif
  }
 
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
@@ -1250,9 +1130,7 @@ static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
  static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 
- #if QK_K == 256
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-
  const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
  const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
 
@@ -1270,10 +1148,6 @@ static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
  sumi2 = __dp4a(v2, q8[j+4], sumi2);
  }
  return d * (sumi1 + sumi2);
-
- #else
- NO_DEVICE_CODE;
- #endif
  #else
  return vec_dot_iq4_xs_q8_1(vbq, bq8_1, iqs);
  #endif
712
  static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
713
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
714
 
 
715
  const block_q4_K * bq4_K = (const block_q4_K *) vbq;
716
 
717
  int v[2];
 
753
  }
754
 
755
  return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
756
  }
757
 
758
  static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
759
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
760
 
 
761
  const block_q5_K * bq5_K = (const block_q5_K *) vbq;
762
 
763
  int vl[2];
 
799
  }
800
 
801
  return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
802
  }
803
 
804
  static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
 
829
 
830
  static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
831
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 
832
  const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;
833
 
834
  #if QR2_XXS == 8
 
869
  }
870
  return d * (sumi1 + sumi2);
871
  #endif
 
 
 
872
  }
873
 
874
  static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
875
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
876
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 
877
  const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;
878
 
879
  const int ib32 = iqs;
 
907
  GGML_UNUSED(ksigns64);
908
  NO_DEVICE_CODE;
909
  #endif
 
 
 
 
910
  }
911
 
912
  // TODO
913
  static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
914
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
915
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 
916
  const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
917
 
918
  const int ib32 = iqs;
 
948
  GGML_UNUSED(ksigns64);
949
  NO_DEVICE_CODE;
950
  #endif
 
 
 
 
951
  }
952
 
953
  static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
954
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
955
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 
956
  const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;
957
 
958
  const int ib32 = iqs;
 
977
  #else
978
  NO_DEVICE_CODE;
979
  #endif
 
 
 
980
  }
981
 
982
  // TODO: don't use lookup table for signs
983
  static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
984
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
985
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 
986
  const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
987
 
988
  const int ib32 = iqs;
 
1005
  #else
1006
  NO_DEVICE_CODE;
1007
  #endif
 
 
 
1008
  }
1009
 
1010
  static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
1011
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 
1012
  const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
1013
 
1014
  const int ib32 = iqs;
 
1036
  const float d = d1q * __low2float (bq8_1[ib32].ds);
1037
  const float m = d1q * __high2float(bq8_1[ib32].ds);
1038
  return d * sumi + m * delta;
 
 
 
1039
  }
1040
 
1041
  static __device__ __forceinline__ float vec_dot_iq1_m_q8_1(
1042
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 
1043
  const block_iq1_m * bq1 = (const block_iq1_m *) vbq;
1044
 
1045
  const int ib32 = iqs;
 
1075
  scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
1076
  const float d = (float)scale.f16 * __low2float (bq8_1[ib32].ds);
1077
  return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1));
 
 
 
1078
  }
1079
 
1080
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 
1130
  static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
1131
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
1132
 
 
1133
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 
1134
  const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
1135
  const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
1136
 
 
1148
  sumi2 = __dp4a(v2, q8[j+4], sumi2);
1149
  }
1150
  return d * (sumi1 + sumi2);
 
 
 
 
1151
  #else
1152
  return vec_dot_iq4_xs_q8_1(vbq, bq8_1, iqs);
1153
  #endif
ggml-metal.m CHANGED
@@ -381,10 +381,6 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
  // dictionary of preprocessor macros
  NSMutableDictionary * prep = [NSMutableDictionary dictionary];

- #ifdef GGML_QKK_64
- prep[@"GGML_QKK_64"] = @(1);
- #endif
-
  MTLCompileOptions* options = [MTLCompileOptions new];
  options.preprocessorMacros = prep;

@@ -1773,11 +1769,7 @@ static enum ggml_status ggml_metal_graph_compute(
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
  }
  else if (src0t == GGML_TYPE_Q3_K) {
- #ifdef GGML_QKK_64
- [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
- #else
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
- #endif
  }
  else if (src0t == GGML_TYPE_Q5_K) {
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
@@ -2018,12 +2010,7 @@ static enum ggml_status ggml_metal_graph_compute(
  {
  nth0 = 4;
  nth1 = 16;
- #if QK_K == 64
- pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32].pipeline;
- #else
  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32].pipeline;
- #endif
-
  } break;
  default:
  {
@@ -2088,11 +2075,7 @@ static enum ggml_status ggml_metal_graph_compute(
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
  }
  else if (src0t == GGML_TYPE_Q3_K) {
- #ifdef GGML_QKK_64
- [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
- #else
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
- #endif
  }
  else if (src0t == GGML_TYPE_Q5_K) {
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
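With the GGML_QKK_64 branches gone, the Q3_K matrix-vector path on Metal dispatches the same grid as the other K-quants. For reference, the (ne01 + 3)/4 expression in the calls above is an integer ceiling division; a minimal plain-C sketch (the ceil_div helper is illustrative only, not part of ggml):

#include <stdio.h>

// Each threadgroup covers 4 rows, so the grid needs ceil(ne01/4) groups along x.
static int ceil_div(int n, int d) {
    return (n + d - 1) / d; // (ne01 + 3)/4 is this with d == 4
}

int main(void) {
    for (int ne01 = 1; ne01 <= 8; ++ne01) {
        printf("ne01 = %d -> %d threadgroups\n", ne01, ceil_div(ne01, 4));
    }
    return 0;
}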
ggml-metal.metal CHANGED
@@ -3386,7 +3386,6 @@ void kernel_mul_mv_q2_K_f32_impl(

  const int step = sizeof(block_q2_K) * nb;

- #if QK_K == 256
  const int ix = tiisg/8;  // 0...3
  const int it = tiisg%8;  // 0...7
  const int iq = it/4;     // 0 or 1
@@ -3438,57 +3437,6 @@ void kernel_mul_mv_q2_K_f32_impl(

  y4 += 4 * QK_K;
  }
- #else
- const int ix = tiisg/2;  // 0...15
- const int it = tiisg%2;  // 0...1
-
- device const float * y4 = y + ix * QK_K + 8 * it;
-
- for (int ib = ix; ib < nb; ib += 16) {
-
- float4 sumy = {0.f, 0.f, 0.f, 0.f};
- for (int i = 0; i < 8; ++i) {
- yl[i+ 0] = y4[i+ 0]; sumy[0] += yl[i+ 0];
- yl[i+ 8] = y4[i+16]; sumy[1] += yl[i+ 8];
- yl[i+16] = y4[i+32]; sumy[2] += yl[i+16];
- yl[i+24] = y4[i+48]; sumy[3] += yl[i+24];
- }
-
- device const uint8_t  * sc = (device const uint8_t  *)x[ib].scales;
- device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 4 * it;
- device const half     * dh = &x[ib].d;
-
- for (int row = 0; row < N_DST; row++) {
-
- float4 acc1 = {0.f, 0.f, 0.f, 0.f};
- float4 acc2 = {0.f, 0.f, 0.f, 0.f};
- for (int i = 0; i < 8; i += 2) {
- acc1[0] += yl[i+ 0] * (qs[i/2] & 0x0003);
- acc2[0] += yl[i+ 1] * (qs[i/2] & 0x0300);
- acc1[1] += yl[i+ 8] * (qs[i/2] & 0x000c);
- acc2[1] += yl[i+ 9] * (qs[i/2] & 0x0c00);
- acc1[2] += yl[i+16] * (qs[i/2] & 0x0030);
- acc2[2] += yl[i+17] * (qs[i/2] & 0x3000);
- acc1[3] += yl[i+24] * (qs[i/2] & 0x00c0);
- acc2[3] += yl[i+25] * (qs[i/2] & 0xc000);
- }
-
- float dall = dh[0];
- float dmin = dh[1];
- sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc2[0]) * (sc[0] & 0xF) * 1.f/ 1.f +
- (acc1[1] + 1.f/256.f * acc2[1]) * (sc[1] & 0xF) * 1.f/ 4.f +
- (acc1[2] + 1.f/256.f * acc2[2]) * (sc[2] & 0xF) * 1.f/16.f +
- (acc1[3] + 1.f/256.f * acc2[3]) * (sc[3] & 0xF) * 1.f/64.f) -
- dmin * (sumy[0] * (sc[0] >> 4) + sumy[1] * (sc[1] >> 4) + sumy[2] * (sc[2] >> 4) + sumy[3] * (sc[3] >> 4));
-
- qs += step/2;
- sc += step;
- dh += step/2;
- }
-
- y4 += 16 * QK_K;
- }
- #endif

  for (int row = 0; row < N_DST; ++row) {
  all_sum = simd_sum(sumf[row]);
@@ -3526,7 +3474,6 @@ kernel void kernel_mul_mv_q2_K_f32(
  kernel_mul_mv_q2_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
  }

- #if QK_K == 256
  void kernel_mul_mv_q3_K_f32_impl(
  device const void * src0,
  device const float * src1,
@@ -3685,84 +3632,6 @@ void kernel_mul_mv_q3_K_f32_impl(
  }
  }
  }
- #else
- void kernel_mul_mv_q3_K_f32_impl(
- device const void * src0,
- device const float * src1,
- device float * dst,
- constant int64_t & ne00,
- constant int64_t & ne01,
- constant int64_t & ne02,
- constant int64_t & ne10,
- constant int64_t & ne12,
- constant int64_t & ne0,
- constant int64_t & ne1,
- constant uint & r2,
- constant uint & r3,
- threadgroup int8_t * shared_values [[threadgroup(0)]],
- uint3 tgpig[[threadgroup_position_in_grid]],
- uint tiisg[[thread_index_in_simdgroup]],
- uint sgitg[[simdgroup_index_in_threadgroup]]) {
-
- const int nb = ne00/QK_K;
-
- const int64_t r0 = tgpig.x;
- const int64_t r1 = tgpig.y;
- const int64_t im = tgpig.z;
-
- const int row = 2 * r0 + sgitg;
-
- const uint i12 = im%ne12;
- const uint i13 = im/ne12;
-
- const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-
- device const block_q3_K * x = (device const block_q3_K *) src0 + row*nb + offset0;
- device const float     * yy = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
-
- const int ix = tiisg/4;
- const int il = 4 * (tiisg%4);// 0, 4, 8, 12
- const int iq = il/8;         // 0, 0, 1, 1
- const int in = il%8;         // 0, 4, 0, 4
-
- float2 sum = {0.f, 0.f};
-
- for (int i = ix; i < nb; i += 8) {
-
- const float d_all = (float)(x[i].d);
-
- device const uint16_t * q = (device const uint16_t *)(x[i].qs + il);
- device const uint16_t * h = (device const uint16_t *)(x[i].hmask + in);
- device const uint16_t * s = (device const uint16_t *)(x[i].scales);
- device const float    * y = yy + i * QK_K + il;
-
- const float d1 = d_all * ((int32_t)(s[0] & 0x000F) - 8);
- const float d2 = d_all * ((int32_t)(s[0] & 0x00F0) - 128) * 1.f/64.f;
- const float d3 = d_all * ((int32_t)(s[0] & 0x0F00) - 2048) * 1.f/4096.f;
- const float d4 = d_all * ((int32_t)(s[0] & 0xF000) - 32768) * 1.f/262144.f;
-
- for (int l = 0; l < 4; l += 2) {
- const uint16_t hm = h[l/2] >> iq;
- sum[0] += y[l+ 0] * d1 * ((int32_t)(q[l/2] & 0x0003) - ((hm & 0x0001) ? 0 : 4))
- + y[l+16] * d2 * ((int32_t)(q[l/2] & 0x000c) - ((hm & 0x0004) ? 0 : 16))
- + y[l+32] * d3 * ((int32_t)(q[l/2] & 0x0030) - ((hm & 0x0010) ? 0 : 64))
- + y[l+48] * d4 * ((int32_t)(q[l/2] & 0x00c0) - ((hm & 0x0040) ? 0 : 256));
- sum[1] += y[l+ 1] * d1 * ((int32_t)(q[l/2] & 0x0300) - ((hm & 0x0100) ? 0 : 1024))
- + y[l+17] * d2 * ((int32_t)(q[l/2] & 0x0c00) - ((hm & 0x0400) ? 0 : 4096))
- + y[l+33] * d3 * ((int32_t)(q[l/2] & 0x3000) - ((hm & 0x1000) ? 0 : 16384))
- + y[l+49] * d4 * ((int32_t)(q[l/2] & 0xc000) - ((hm & 0x4000) ? 0 : 65536));
- }
-
- }
- const float sumf = sum[0] + sum[1] * 1.f/256.f;
-
- const float tot = simd_sum(sumf);
- if (tiisg == 0) {
- dst[r1*ne0 + im*ne0*ne1 + row] = tot;
- }
-
- }
- #endif

  [[host_name("kernel_mul_mv_q3_K_f32")]]
  kernel void kernel_mul_mv_q3_K_f32(
@@ -3792,7 +3661,6 @@ kernel void kernel_mul_mv_q3_K_f32(
  kernel_mul_mv_q3_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
  }

- #if QK_K == 256
  void kernel_mul_mv_q4_K_f32_impl(
  device const void * src0,
  device const float * src1,
@@ -3906,103 +3774,6 @@ void kernel_mul_mv_q4_K_f32_impl(
  }
  }
  }
- #else
- void kernel_mul_mv_q4_K_f32_impl(
- device const void * src0,
- device const float * src1,
- device float * dst,
- constant int64_t & ne00,
- constant int64_t & ne01,
- constant int64_t & ne02,
- constant int64_t & ne10,
- constant int64_t & ne12,
- constant int64_t & ne0,
- constant int64_t & ne1,
- constant uint & r2,
- constant uint & r3,
- threadgroup int8_t * shared_values [[threadgroup(0)]],
- uint3 tgpig[[threadgroup_position_in_grid]],
- uint tiisg[[thread_index_in_simdgroup]],
- uint sgitg[[simdgroup_index_in_threadgroup]]) {
-
- const int ix = tiisg/4;  // 0...7
- const int it = tiisg%4;  // 0...3
-
- const int nb = ne00/QK_K;
- const int r0 = tgpig.x;
- const int r1 = tgpig.y;
- const int im = tgpig.z;
- const int first_row = r0 * N_DST;
- const int ib_row = first_row * nb;
-
- const uint i12 = im%ne12;
- const uint i13 = im/ne12;
-
- const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-
- device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0;
- device const float      * y = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
-
- float yl[8];
- float yh[8];
- float sumf[N_DST]={0.f}, all_sum;
-
- const int step = sizeof(block_q4_K) * nb / 2;
-
- device const float * y4 = y + ix * QK_K + 8 * it;
-
- uint16_t sc16[4];
-
- for (int ib = ix; ib < nb; ib += 8) {
-
- float2 sumy = {0.f, 0.f};
- for (int i = 0; i < 8; ++i) {
- yl[i] = y4[i+ 0]; sumy[0] += yl[i];
- yh[i] = y4[i+32]; sumy[1] += yh[i];
- }
-
- device const uint16_t * sc = (device const uint16_t *)x[ib].scales;
- device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 4 * it;
- device const half     * dh = x[ib].d;
-
- for (int row = 0; row < N_DST; row++) {
-
- sc16[0] = sc[0] & 0x000f;
- sc16[1] = sc[0] & 0x0f00;
- sc16[2] = sc[0] & 0x00f0;
- sc16[3] = sc[0] & 0xf000;
-
- float2 acc1 = {0.f, 0.f};
- float2 acc2 = {0.f, 0.f};
- for (int i = 0; i < 8; i += 2) {
- acc1[0] += yl[i+0] * (qs[i/2] & 0x000F);
- acc1[1] += yl[i+1] * (qs[i/2] & 0x0F00);
- acc2[0] += yh[i+0] * (qs[i/2] & 0x00F0);
- acc2[1] += yh[i+1] * (qs[i/2] & 0xF000);
- }
-
- float dall = dh[0];
- float dmin = dh[1];
- sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc16[0] +
- (acc2[0] + 1.f/256.f * acc2[1]) * sc16[1] * 1.f/4096.f) -
- dmin * 1.f/16.f * (sumy[0] * sc16[2] + sumy[1] * sc16[3] * 1.f/256.f);
-
- qs += step;
- sc += step;
- dh += step;
- }
-
- y4 += 8 * QK_K;
- }
-
- for (int row = 0; row < N_DST; ++row) {
- all_sum = simd_sum(sumf[row]);
- if (tiisg == 0) {
- dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
- }
- }
- }
- #endif

  [[host_name("kernel_mul_mv_q4_K_f32")]]
  kernel void kernel_mul_mv_q4_K_f32(
@@ -4070,8 +3841,6 @@ void kernel_mul_mv_q5_K_f32_impl(

  const int step = sizeof(block_q5_K) * nb;

- #if QK_K == 256
- #
  float yl[16], yh[16];

  const uint16_t kmask1 = 0x3f3f;
@@ -4154,54 +3923,6 @@ void kernel_mul_mv_q5_K_f32_impl(
  y1 += 4 * QK_K;

  }
- #else
- float yl[8], yh[8];
-
- const int il = 4 * (tiisg/8);  // 0, 4, 8, 12
- const int ix = tiisg%8;
- const int iq = il/8;           // 0, 0, 1, 1
- const int in = il%8;           // 0, 4, 0, 4
-
- device const float * y = yy + ix*QK_K + il;
-
- for (int i = ix; i < nb; i += 8) {
-
- for (int l = 0; l < 4; ++l) {
- yl[l+0] = y[l+ 0];
- yl[l+4] = y[l+16];
- yh[l+0] = y[l+32];
- yh[l+4] = y[l+48];
- }
-
- device const half    * dh = &x[i].d;
- device const uint8_t * q  = x[i].qs + il;
- device const uint8_t * h  = x[i].qh + in;
- device const int8_t  * s  = x[i].scales;
-
- for (int row = 0; row < 2; ++row) {
-
- const float d = dh[0];
-
- float2 acc = {0.f, 0.f};
- for (int l = 0; l < 4; ++l) {
- const uint8_t hl = h[l] >> iq;
- acc[0] += yl[l+0] * s[0] * ((int16_t)(q[l+ 0] & 0x0F) - (hl & 0x01 ? 0 : 16))
- + yl[l+4] * s[1] * ((int16_t)(q[l+16] & 0x0F) - (hl & 0x04 ? 0 : 16));
- acc[1] += yh[l+0] * s[2] * ((int16_t)(q[l+ 0] & 0xF0) - (hl & 0x10 ? 0 : 256))
- + yh[l+4] * s[3] * ((int16_t)(q[l+16] & 0xF0) - (hl & 0x40 ? 0 : 256));
- }
- sumf[row] += d * (acc[0] + 1.f/16.f * acc[1]);
-
- q += step;
- h += step;
- s += step;
- dh += step/2;
-
- }
-
- y += 8 * QK_K;
- }
- #endif

  for (int row = 0; row < 2; ++row) {
  const float tot = simd_sum(sumf[row]);
@@ -4280,7 +4001,6 @@ void kernel_mul_mv_q6_K_f32_impl(

  float sumf = 0;

- #if QK_K == 256
  const int tid  = tiisg/2;
  const int ix   = tiisg%2;
  const int ip   = tid/8;  // 0 or 1
@@ -4316,30 +4036,6 @@ void kernel_mul_mv_q6_K_f32_impl(

  }

- #else
- const int ix = tiisg/4;
- const int il = 4*(tiisg%4);
-
- for (int i = ix; i < nb; i += 8) {
- device const float   * y  = yy + i * QK_K + il;
- device const uint8_t * ql = x[i].ql + il;
- device const uint8_t * qh = x[i].qh + il;
- device const int8_t  * s  = x[i].scales;
-
- const float d = x[i].d;
-
- float4 sums = {0.f, 0.f, 0.f, 0.f};
- for (int l = 0; l < 4; ++l) {
- sums[0] += y[l+ 0] * ((int8_t)((ql[l+ 0] & 0xF) | ((qh[l] & kmask1) << 4)) - 32);
- sums[1] += y[l+16] * ((int8_t)((ql[l+16] & 0xF) | ((qh[l] & kmask2) << 2)) - 32);
- sums[2] += y[l+32] * ((int8_t)((ql[l+ 0] >> 4) | ((qh[l] & kmask3) >> 0)) - 32);
- sums[3] += y[l+48] * ((int8_t)((ql[l+16] >> 4) | ((qh[l] & kmask4) >> 2)) - 32);
- }
- sumf += d * (sums[0] * s[0] + sums[1] * s[1] + sums[2] * s[2] + sums[3] * s[3]);
- }
-
- #endif
-
  const float tot = simd_sum(sumf);
  if (tiisg == 0) {
  dst[r1*ne0 + im*ne0*ne1 + row] = tot;
@@ -5173,9 +4869,7 @@ void kernel_mul_mv_iq1_m_f32_impl(

  device const float * y4 = y + 32 * ix;

- #if QK_K != 64
  iq1m_scale_t scale;
- #endif

  for (int ib32 = ix; ib32 < nb32; ib32 += 32) {

@@ -5196,10 +4890,7 @@ void kernel_mul_mv_iq1_m_f32_impl(
  device const uint16_t * sc = (device const uint16_t *)xr->scales;

  for (int row = 0; row < N_DST; row++) {
-
- #if QK_K != 64
  scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
- #endif

  constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700)));
  constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((qh[0] << 4) & 0x700)));
@@ -5215,14 +4906,9 @@ void kernel_mul_mv_iq1_m_f32_impl(
  }
  const float delta1 = sumy[0] * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA) + sumy[1] * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
  const float delta2 = sumy[2] * (qh[1] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA) + sumy[3] * (qh[1] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
- #if QK_K == 64
- const float d = (float) *((device const half *)(sc - 1));
- sumf[row] += d * ((sum[0] + delta1) * (2*((sc[0] >> (8*(ib%2)+0)) & 0xf) + 1) +
- (sum[1] + delta2) * (2*((sc[0] >> (8*(ib%2)+4)) & 0xf) + 1));
- #else
+
  sumf[row] += (float)scale.f16 * ((sum[0] + delta1) * (2*((sc[ib/2] >> (6*(ib%2)+0)) & 7) + 1) +
  (sum[1] + delta2) * (2*((sc[ib/2] >> (6*(ib%2)+3)) & 7) + 1));
- #endif

  sc += nb*sizeof(block_iq1_m)/2;
  qs += nb*sizeof(block_iq1_m);
@@ -5334,7 +5020,6 @@ void kernel_mul_mv_iq4_nl_f32_impl(
  }
  }

- #if QK_K != 64
  void kernel_mul_mv_iq4_xs_f32_impl(
  device const void * src0,
  device const float * src1,
@@ -5429,7 +5114,6 @@ void kernel_mul_mv_iq4_xs_f32_impl(
  }
  }
  }
- #endif

  [[host_name("kernel_mul_mv_iq1_s_f32")]]
  kernel void kernel_mul_mv_iq1_s_f32(
@@ -5542,11 +5226,7 @@ kernel void kernel_mul_mv_iq4_xs_f32(
  uint tiisg[[thread_index_in_simdgroup]],
  uint sgitg[[simdgroup_index_in_threadgroup]]) {

- #if QK_K == 64
- kernel_mul_mv_iq4_nl_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
- #else
  kernel_mul_mv_iq4_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
- #endif
  }

  //============================= templates and their specializations =============================
@@ -5672,10 +5352,9 @@ void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg
  float dl, ml;
  uint8_t sc = xb->scales[il];

- #if QK_K == 256
  q = q + 32*(il/8) + 16*(il&1);
  il = (il/2)%4;
- #endif
+
  half  coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
  uchar mask = il>1 ? (il>2 ? 192    : 48)     : (il>0 ? 12    : 3);
  dl = d * (sc & 0xF) * coef, ml = min * (sc >> 4);
@@ -5691,7 +5370,6 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg
  device const uint8_t * h = (device const uint8_t *)xb->hmask;
  device const int8_t * scales = (device const int8_t *)xb->scales;

- #if QK_K == 256
  q = q + 32 * (il/8) + 16 * (il&1);
  h = h + 16 * (il&1);
  uint8_t m = 1 << (il/2);
@@ -5712,17 +5390,6 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg
  for (int i = 0; i < 16; ++i) {
  reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml);
  }
- #else
- float    kcoef = il&1 ? 1.f/16.f : 1.f;
- uint16_t kmask = il&1 ? 0xF0     : 0x0F;
- float    dl = d_all * ((scales[il/2] & kmask) * kcoef - 8);
- float    coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
- uint8_t  mask = il>1 ? (il>2 ? 192    : 48)     : (il>0 ? 12    : 3);
- uint8_t  m = 1<<(il*2);
- for (int i = 0; i < 16; ++i) {
- reg[i/4][i%4] = coef * dl * ((q[i] & mask) - ((h[i%8] & (m * (1 + i/8))) ? 0 : 4.f/coef));
- }
- #endif
  }

  static inline uchar2 get_scale_min_k4_just2(int j, int k, device const uchar * q) {
@@ -5734,7 +5401,6 @@ template <typename type4x4>
  void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg) {
  device const uchar * q = xb->qs;

- #if QK_K == 256
  short is = (il/4) * 2;
  q = q + (il/4) * 32 + 16 * (il&1);
  il = il & 3;
@@ -5743,16 +5409,7 @@ void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg
  const float min = xb->dmin;
  const float dl = d * sc[0];
  const float ml = min * sc[1];
- #else
- (void) get_scale_min_k4_just2;
-
- q = q + 16 * (il&1);
- device const uint8_t * s = xb->scales;
- device const half2 * dh = (device const half2 *)xb->d;
- const float2 d = (float2)dh[0];
- const float dl = il<2 ? d[0] * (s[0]&0xF) : d[0] * (s[1]&0xF)/16.h;
- const float ml = il<2 ? d[1] * (s[0]>>4)  : d[1] * (s[1]>>4);
- #endif
+
  const ushort mask = il<2 ? 0x0F : 0xF0;
  for (int i = 0; i < 16; ++i) {
  reg[i/4][i%4] = dl * (q[i] & mask) - ml;
@@ -5764,7 +5421,6 @@ void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg
  device const uint8_t * q  = xb->qs;
  device const uint8_t * qh = xb->qh;

- #if QK_K == 256
  short is = (il/4) * 2;
  q  = q + 32 * (il/4) + 16 * (il&1);
  qh = qh + 16 * (il&1);
@@ -5781,17 +5437,6 @@ void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg
  for (int i = 0; i < 16; ++i) {
  reg[i/4][i%4] = dl * ((q[i] & mask) + (qh[i] & ul ? qh_val : 0)) - ml;
  }
- #else
- q = q + 16 * (il&1);
- device const int8_t * s = xb->scales;
- const float dl = xb->d * s[il];
- uint8_t m = 1<<(il*2);
- const float  coef = il<2 ? 1.f  : 1.f/16.f;
- const ushort mask = il<2 ? 0x0F : 0xF0;
- for (int i = 0; i < 16; ++i) {
- reg[i/4][i%4] = coef * dl * ((q[i] & mask) - (qh[i%8] & (m*(1+i/8)) ? 0.f : 16.f/coef));
- }
- #endif
  }

  template <typename type4x4>
@@ -5801,15 +5446,11 @@ void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg
  device const uint8_t * qh = (device const uint8_t *)xb->qh;
  device const int8_t * scales = (device const int8_t *)xb->scales;

- #if QK_K == 256
  ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
  qh = qh + 32*(il/8) + 16*(il&1);
  float sc = scales[(il%2) + 2 * ((il/2))];
  il = (il/2) & 3;
- #else
- ql = ql + 16 * (il&1);
- float sc = scales[il];
- #endif
+
  const uint16_t  kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
  const uint16_t  kmask2 = il>1 ? 0xF0              : 0x0F;
  const float     coef   = il>1 ? 1.f/16.f          : 1.f;
@@ -5966,20 +5607,15 @@ void dequantize_iq1_m(device const block_iq1_m * xb, short il, thread type4x4 &
  const int ib32 = il/2;
  il = il%2;
  device const uint16_t * sc = (device const uint16_t *)xb->scales;
- #if QK_K == 64
- const float d = xb->d;
- #else
+
  iq1m_scale_t scale;
  scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
  const float d = scale.f16;
- #endif
+
  device const uint8_t * qs = xb->qs + 4*ib32 + 2*il;
  device const uint8_t * qh = xb->qh + 2*ib32 + il;
- #if QK_K == 64
- const float dl = d * (2*((sc[ib32/2] >> (8*(ib32%2)+4*il)) & 0xf) + 1);
- #else
+
  const float dl = d * (2*((sc[ib32/2] >> (6*(ib32%2)+3*il)) & 7) + 1);
- #endif
  const float ml1 = dl * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
  const float ml2 = dl * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
  constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700)));
@@ -6009,9 +5645,6 @@ void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4

  template <typename type4x4>
  void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4 & reg) {
- #if QK_K == 64
- dequantize_iq4_nl(xb, il, reg);
- #else
  // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
  const int ib32 = il/2;
  il = il%2;
@@ -6028,7 +5661,6 @@ void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4
  reg[i][2] = d * kvalues_iq4nl_f[q8[2]];
  reg[i][3] = d * kvalues_iq4nl_f[q8[3]];
  }
- #endif
  }

  template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &)>
@@ -6533,11 +6165,7 @@ kernel void kernel_mul_mm_id(
  sgitg);
  }

- #if QK_K == 256
  #define QK_NL 16
- #else
- #define QK_NL 4
- #endif

  //
  // get rows
@@ -6577,11 +6205,7 @@ template [[host_name("kernel_get_rows_iq2_s")]]   kernel get_rows_t kernel_get_r
  template [[host_name("kernel_get_rows_iq1_s")]]   kernel get_rows_t kernel_get_rows<block_iq1_s,   QK_NL, dequantize_iq1_s>;
  template [[host_name("kernel_get_rows_iq1_m")]]   kernel get_rows_t kernel_get_rows<block_iq1_m,   QK_NL, dequantize_iq1_m>;
  template [[host_name("kernel_get_rows_iq4_nl")]]  kernel get_rows_t kernel_get_rows<block_iq4_nl,  2,     dequantize_iq4_nl>;
- #if QK_K == 64
- template [[host_name("kernel_get_rows_iq4_xs")]]  kernel get_rows_t kernel_get_rows<block_iq4_xs,  2,     dequantize_iq4_xs>;
- #else
  template [[host_name("kernel_get_rows_iq4_xs")]]  kernel get_rows_t kernel_get_rows<block_iq4_xs,  QK_NL, dequantize_iq4_xs>;
- #endif

  //
  // matrix-matrix multiplication
@@ -6609,11 +6233,7 @@ template [[host_name("kernel_mul_mm_iq2_s_f32")]]   kernel mat_mm_t kernel_mul_m
  template [[host_name("kernel_mul_mm_iq1_s_f32")]]   kernel mat_mm_t kernel_mul_mm<block_iq1_s,   QK_NL, dequantize_iq1_s>;
  template [[host_name("kernel_mul_mm_iq1_m_f32")]]   kernel mat_mm_t kernel_mul_mm<block_iq1_m,   QK_NL, dequantize_iq1_m>;
  template [[host_name("kernel_mul_mm_iq4_nl_f32")]]  kernel mat_mm_t kernel_mul_mm<block_iq4_nl,  2,     dequantize_iq4_nl>;
- #if QK_K == 64
- template [[host_name("kernel_mul_mm_iq4_xs_f32")]]  kernel mat_mm_t kernel_mul_mm<block_iq4_nl,  2,     dequantize_iq4_xs>;
- #else
  template [[host_name("kernel_mul_mm_iq4_xs_f32")]]  kernel mat_mm_t kernel_mul_mm<block_iq4_xs,  QK_NL, dequantize_iq4_xs>;
- #endif

  //
  // indirect matrix-matrix multiplication
@@ -6641,11 +6261,7 @@ template [[host_name("kernel_mul_mm_id_iq2_s_f32")]]   kernel mat_mm_id_t kernel
  template [[host_name("kernel_mul_mm_id_iq1_s_f32")]]   kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_s,   QK_NL, dequantize_iq1_s>;
  template [[host_name("kernel_mul_mm_id_iq1_m_f32")]]   kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_m,   QK_NL, dequantize_iq1_m>;
  template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]]  kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_nl,  2,     dequantize_iq4_nl>;
- #if QK_K == 64
- template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]]  kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_xs,  2,     dequantize_iq4_xs>;
- #else
  template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]]  kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_xs,  QK_NL, dequantize_iq4_xs>;
- #endif

  //
  // matrix-vector multiplication
@@ -6854,7 +6470,5 @@ template [[host_name("kernel_mul_mv_id_iq3_xxs_f32")]] kernel kernel_mul_mv_id_t
  template [[host_name("kernel_mul_mv_id_iq3_s_f32")]]   kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq3_s_f32_impl>>;
  template [[host_name("kernel_mul_mv_id_iq2_s_f32")]]   kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq2_s_f32_impl>>;
  template [[host_name("kernel_mul_mv_id_iq4_nl_f32")]]  kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_nl_f32_impl>>;
- #if QK_K != 64
  template [[host_name("kernel_mul_mv_id_iq4_xs_f32")]]  kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_xs_f32_impl>>;
- #endif

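The iq1_m kernels above now unconditionally rebuild the FP16 super-block scale from the top nibble of each of the four packed 16-bit scale words. A standalone C sketch of that bit manipulation (iq1m_unpack_scale is a hypothetical helper; the kernels store the result in an iq1m_scale_t union and read it back as half):

#include <stdint.h>
#include <stdio.h>

// Gather the top nibble of each scale word into one 16-bit value, exactly as in:
// (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000)
static uint16_t iq1m_unpack_scale(const uint16_t sc[4]) {
    return (uint16_t)((sc[0] >> 12)             // bits  0..3
                   | ((sc[1] >>  8) & 0x00f0)   // bits  4..7
                   | ((sc[2] >>  4) & 0x0f00)   // bits  8..11
                   |  (sc[3]        & 0xf000)); // bits 12..15
}

int main(void) {
    const uint16_t sc[4] = {0x1abc, 0x2def, 0x3123, 0x4456};
    printf("0x%04x\n", iq1m_unpack_scale(sc)); // prints 0x4321
    return 0;
}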
ggml-opencl.cpp CHANGED
@@ -1,4 +1,4 @@
- #include "ggml.h"
+ #include "ggml.h"
  #include "ggml-opencl.h"
  #include "ggml-backend-impl.h"

ggml-quants.c CHANGED
The diff for this file is too large to render. See raw diff
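The CPU-side changes are too large to render, but the arithmetic mirrors the GPU kernels above: with QK_K = 256, each q2_K super-block stores one scale byte per 16 quants, with a 4-bit scale in the low nibble and a 4-bit min in the high nibble, so a value decodes as y = dall*sc*q - dmin*min. A minimal C sketch of that decode (inferred from the kernels above, not copied from ggml-quants.c):

#include <stdint.h>
#include <stdio.h>

// Decode one 2-bit quant q2 (0..3) given the super-block scales dall/dmin
// and the packed per-block scale byte (low nibble: scale, high nibble: min).
static float dequant_q2_K_one(float dall, float dmin, uint8_t scale_byte, uint8_t q2) {
    const float sc = (float)(scale_byte & 0xF);
    const float mn = (float)(scale_byte >> 4);
    return dall * sc * (float)(q2 & 3) - dmin * mn;
}

int main(void) {
    // scale byte 0x7a -> sc = 10, min = 7
    printf("%f\n", dequant_q2_K_one(0.5f, 0.25f, 0x7a, 2)); // 8.25
    return 0;
}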
 
ggml-sycl.cpp CHANGED
@@ -4197,7 +4197,6 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri
  const block_q2_K * x = (const block_q2_K *) vx;

  const int tid = item_ct1.get_local_id(2);
- #if QK_K == 256
  const int n   = tid/32;
  const int l   = tid - 32*n;
  const int is  = 8*n + l/16;
@@ -4211,18 +4210,6 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri
  y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
  y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
  y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
- #else
- const int is = tid/16;  // 0 or 1
- const int il = tid%16;  // 0...15
- const uint8_t q = x[i].qs[il] >> (2*is);
- dst_t * y = yy + i*QK_K + 16*is + il;
-
- float dall = x[i].dm[0];
- float dmin = x[i].dm[1];
- y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
- y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
- #endif
-
  }

  template<typename dst_t>
@@ -4232,7 +4219,6 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri
  const int i = item_ct1.get_group(2);
  const block_q3_K * x = (const block_q3_K *) vx;

- #if QK_K == 256
  const int r = item_ct1.get_local_id(2) / 4;
  const int tid = r/2;
  const int is0 = r%2;
@@ -4256,31 +4242,8 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri
  const uint8_t * hm = x[i].hmask;

  for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
- #else
- const int tid = item_ct1.get_local_id(2);
- const int is  = tid/16;  // 0 or 1
- const int il  = tid%16;  // 0...15
- const int im  = il/8;    // 0...1
- const int in  = il%8;    // 0...7
-
- dst_t * y = yy + i*QK_K + 16*is + il;
-
- const uint8_t q = x[i].qs[il] >> (2*is);
- const uint8_t h = x[i].hmask[in] >> (2*is + im);
- const float   d = (float)x[i].d;
-
- if (is == 0) {
- y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
- y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
- } else {
- y[ 0] = d * ((x[i].scales[0] >>  4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
- y[32] = d * ((x[i].scales[1] >>  4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
- }
- #endif
-
  }

- #if QK_K == 256
  static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
  if (j < 4) {
  d = q[j] & 63; m = q[j + 4] & 63;
@@ -4289,7 +4252,6 @@ static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8
  m = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4);
  }
  }
- #endif

  template<typename dst_t>
  static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
@@ -4298,7 +4260,6 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri

  const int i = item_ct1.get_group(2);

- #if QK_K == 256
  // assume 32 threads
  const int tid = item_ct1.get_local_id(2);
  const int il  = tid/8;
@@ -4322,15 +4283,6 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
  y[l + 0] = d1 * (q[l] & 0xF) - m1;
  y[l +32] = d2 * (q[l] >>  4) - m2;
  }
- #else
- const int tid = item_ct1.get_local_id(2);
- const uint8_t * q = x[i].qs;
- dst_t * y = yy + i*QK_K;
- const float d = (float)x[i].dm[0];
- const float m = (float)x[i].dm[1];
- y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
- y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >>  4) - m * (x[i].scales[1] >> 4);
- #endif
  }

  template<typename dst_t>
@@ -4340,7 +4292,6 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri

  const int i = item_ct1.get_group(2);

- #if QK_K == 256
  // assume 64 threads - this is very slightly better than the one below
  const int tid = item_ct1.get_local_id(2);
  const int il  = tid/16;   // il is in 0...3
@@ -4367,18 +4318,6 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri
  hm <<= 1;
  y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
  y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
- #else
- const int tid = item_ct1.get_local_id(2);
- const uint8_t q = x[i].qs[tid];
- const int im = tid/8;   // 0...3
- const int in = tid%8;   // 0...7
- const int is = tid/16;  // 0 or 1
- const uint8_t h = x[i].qh[in] >> im;
- const float d = x[i].d;
- dst_t * y = yy + i*QK_K + tid;
- y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
- y[32] = d * x[i].scales[is+2] * ((q >>  4) - ((h >> 4) & 1 ? 0 : 16));
- #endif
  }

  template<typename dst_t>
@@ -4387,7 +4326,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
  const block_q6_K * x = (const block_q6_K *) vx;

  const int i = item_ct1.get_group(2);
- #if QK_K == 256

  // assume 64 threads - this is very slightly better than the one below
  const int tid = item_ct1.get_local_id(2);
@@ -4407,24 +4345,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
  y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
  y[64] = d * sc[4] * ((int8_t)((ql[ 0] >>  4) | (((qh >> 4) & 3) << 4)) - 32);
  y[96] = d * sc[6] * ((int8_t)((ql[32] >>  4) | (((qh >> 6) & 3) << 4)) - 32);
- #else
-
- // assume 32 threads
- const int tid = item_ct1.get_local_id(2);
- const int ip  = tid/16;          // 0 or 1
- const int il  = tid - 16*ip;     // 0...15
-
- dst_t * y = yy + i*QK_K + 16*ip + il;
-
- const float d = x[i].d;
-
- const uint8_t   ql = x[i].ql[16*ip + il];
- const uint8_t   qh = x[i].qh[il] >> (2*ip);
- const int8_t  * sc = x[i].scales;
-
- y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
- y[32] = d * sc[ip+2] * ((int8_t)((ql >>  4) | (((qh >> 4) & 3) << 4)) - 32);
- #endif
  }

  template<typename dst_t>
@@ -4438,7 +4358,6 @@ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __res
  const block_iq2_xxs * x = (const block_iq2_xxs  *) vx;

  const int tid = item_ct1.get_local_id(2);
- #if QK_K == 256
  const int il = tid/8; // 0...3
  const int ib = tid%8; // 0...7
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4449,10 +4368,6 @@ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __res
  const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
  const uint8_t signs = ksigns_iq2xs_ptr[(aux32 >> 7*il) & 127];
  for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs_ptr[j] ? -1.f : 1.f);
- #else
- assert(false);
- #endif
-
  }

  template<typename dst_t>
@@ -4466,7 +4381,6 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest
  const block_iq2_xs * x = (const block_iq2_xs *) vx;

  const int tid = item_ct1.get_local_id(2);
- #if QK_K == 256
  const int il = tid/8; // 0...3
  const int ib = tid%8; // 0...7
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4475,10 +4389,6 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest
  const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
  const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
  for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
- #else
- assert(false);
- #endif
-
  }

  template <typename dst_t>
@@ -4490,7 +4400,6 @@ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
  const block_iq2_s * x = (const block_iq2_s *) vx;

  const int tid = item_ct1.get_local_id(2);
- #if QK_K == 256
  const int il = tid/8; // 0...3
  const int ib = tid%8; // 0...7
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4498,13 +4407,9 @@ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
  const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
  const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
  #pragma unroll
- for (int j = 0; j < 8; ++j)
+ for (int j = 0; j < 8; ++j) {
  y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
- #else
- assert(false);
-
- #endif
-
+ }
  }

  template<typename dst_t>
@@ -4518,7 +4423,6 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
  const block_iq3_xxs * x = (const block_iq3_xxs  *) vx;

  const int tid = item_ct1.get_local_id(2);
- #if QK_K == 256
  const int il = tid/8; // 0...3
  const int ib = tid%8; // 0...7
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4533,10 +4437,6 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
  y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
  y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
  }
- #else
- assert(false);
- #endif
-
  }

  template <typename dst_t>
@@ -4549,7 +4449,6 @@ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
  const block_iq3_s * x = (const block_iq3_s *) vx;

  const int tid = item_ct1.get_local_id(2);
- #if QK_K == 256
  const int il = tid/8; // 0...3
  const int ib = tid%8; // 0...7
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4563,10 +4462,6 @@ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
  y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
  y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
  }
- #else
- assert(false);
- #endif
-
  }

  template <typename dst_t>
@@ -4579,7 +4474,6 @@ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
  const block_iq1_s * x = (const block_iq1_s  *) vx;

  const int tid = item_ct1.get_local_id(2);
- #if QK_K == 256
  const int il = tid/8; // 0...3
  const int ib = tid%8; // 0...7
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4593,10 +4487,6 @@ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
  for (int j = 0; j < 8; ++j) {
  y[j] = d * (q[j] + delta);
  }
- #else
- assert(false);
- #endif
-
  }

  template <typename dst_t>
@@ -4609,7 +4499,6 @@ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
  const block_iq1_m * x = (const block_iq1_m  *) vx;

  const int tid = item_ct1.get_local_id(2);
- #if QK_K == 256
  const int il = tid/8; // 0...3
  const int ib = tid%8; // 0...7
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4627,10 +4516,6 @@ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
  for (int j = 0; j < 8; ++j) {
  y[j] = d * (q[j] + delta);
  }
- #else
- assert(false);
- #endif
-
  }

  template <typename dst_t>
@@ -4704,7 +4589,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,

  float tmp = 0; // partial sum for thread in warp

- #if QK_K == 256
  const int tid =
  item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
  const int ix =
@@ -4755,42 +4639,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
  tmp += dall * sum1 - dmin * sum2;

  }
- #else
- const int tid = item_ct1.get_local_id(2) /
- (2 * K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
- const int ix = item_ct1.get_local_id(2) %
- (2 * K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
- const int offset = tid * K_QUANTS_PER_ITERATION;
-
- uint32_t uaux[2];
- const uint8_t * d = (const uint8_t *)uaux;
-
-
- for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
- const float   * y = yy + i * QK_K + offset;
- const uint8_t * q = x[i].qs + offset;
- const uint32_t * s = (const uint32_t *)x[i].scales;
-
- uaux[0] = s[0] & 0x0f0f0f0f;
- uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
-
- const sycl::float2 dall =
- x[i].dm.convert<float, sycl::rounding_mode::automatic>();
-
- float sum1 = 0, sum2 = 0;
- for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
- const uint8_t ql = q[l];
- sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
- + y[l+16] * d[1] * ((ql >> 2) & 3)
- + y[l+32] * d[2] * ((ql >> 4) & 3)
- + y[l+48] * d[3] * ((ql >> 6) & 3);
- sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
- }
- tmp += dall.x() * sum1 - dall.y() * sum2;
- }
-
- #endif

  // sum up partial sums and write back result
  #pragma unroll
@@ -4828,8 +4676,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,

  float tmp = 0; // partial sum for thread in warp

- #if QK_K == 256
-
  const uint16_t kmask1 = 0x0303;
  const uint16_t kmask2 = 0x0f0f;

@@ -4882,34 +4728,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
  tmp += d * sum;

  }
- #else
-
- const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
- const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
- const int offset = tid * K_QUANTS_PER_ITERATION;         // 0...15 or 0...14
- const int in = offset/8;                                 // 0 or 1
- const int im = offset%8;                                 // 0...7
-
- for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
- const float   * y = yy + i * QK_K + offset;
- const uint8_t * q = x[i].qs + offset;
- const uint8_t * s = x[i].scales;
-
- const float dall = (float)x[i].d;
-
- float sum = 0;
- for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
- const uint8_t hl = x[i].hmask[im+l] >> in;
- const uint8_t ql = q[l];
- sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
- + y[l+16] * dall * ((s[0] >>  4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
- + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
- + y[l+48] * dall * ((s[1] >>  4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
- }
- tmp += sum;
- }
- #endif

  // sum up partial sums and write back result
  #pragma unroll
@@ -4944,7 +4762,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,

  const block_q4_K * x = (const block_q4_K *)vx + ib0;

- #if QK_K == 256
  const uint16_t kmask1 = 0x3f3f;
  const uint16_t kmask2 = 0x0f0f;
  const uint16_t kmask3 = 0xc0c0;
@@ -5033,36 +4850,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
  #endif

  }
- #else
- const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION);  // 0...15
- const int ix  = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
-
- const int step = tid * K_QUANTS_PER_ITERATION;
-
- uint16_t aux16[2];
- const uint8_t * s = (const uint8_t *)aux16;
-
- float tmp = 0;
-
- for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
- const uint8_t * q = x[i].qs + step;
- const float   * y = yy + i*QK_K + step;
- const uint16_t * a = (const uint16_t *)x[i].scales;
- aux16[0] = a[0] & 0x0f0f;
- aux16[1] = (a[0] >> 4) & 0x0f0f;
- const float d = (float)x[i].dm[0];
- const float m = (float)x[i].dm[1];
- float sum = 0.f;
- for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
- sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
- + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
- + y[j+32] * (d * s[1] * (q[j+ 0] >>  4) - m * s[3])
- + y[j+48] * (d * s[1] * (q[j+16] >>  4) - m * s[3]);
- }
- tmp += sum;
- }
-
- #endif

  // sum up partial sums and write back result
  #pragma unroll
@@ -5097,7 +4884,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,

  float tmp = 0; // partial sum for thread in warp

- #if QK_K == 256
  const uint16_t kmask1 = 0x3f3f;
  const uint16_t kmask2 = 0x0f0f;
  const uint16_t kmask3 = 0xc0c0;
@@ -5174,30 +4960,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
  dmin * smin;
  }

- #else
- const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION);  // 0...15
- const int ix  = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
- const int step = tid * K_QUANTS_PER_ITERATION;
- const int im = step/8;
- const int in = step%8;
-
- for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
- const uint8_t * q = x[i].qs + step;
- const int8_t  * s = x[i].scales;
- const float   * y = yy + i*QK_K + step;
- const float     d = x[i].d;
- float sum = 0.f;
- for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
- const uint8_t h = x[i].qh[in+j] >> im;
- sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
- + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
- + y[j+32] * d * s[2] * ((q[j+ 0] >>  4) - ((h >> 4) & 1 ? 0 : 16))
- + y[j+48] * d * s[3] * ((q[j+16] >>  4) - ((h >> 6) & 1 ? 0 : 16));
- }
- tmp += sum;
- }
- #endif
-
  // sum up partial sums and write back result
  #pragma unroll
  for (int mask = 16; mask > 0; mask >>= 1) {
@@ -5224,8 +4986,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa

  const block_q6_K * x = (const block_q6_K *)vx + ib0;

- #if QK_K == 256
-
  const int tid =
  item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
  const int ix =
@@ -5282,37 +5042,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa

  }

- #else
-
- const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION);  // 0...7
- const int ix  = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);  // 0...3
-
- const int step = tid * K_QUANTS_PER_ITERATION;
-
- float tmp = 0; // partial sum for thread in warp
-
- for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
- const float   * y  = yy + i * QK_K + step;
- const uint8_t * ql = x[i].ql + step;
- const uint8_t * qh = x[i].qh + step;
- const int8_t  * s  = x[i].scales;
-
- const float d = x[i+0].d;
-
- float sum = 0;
- for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
- sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
- + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
- + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >>  4) | ((qh[j] & 0x30) >> 0)) - 32)
- + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >>  4) | ((qh[j] & 0xc0) >> 2)) - 32);
- }
- tmp += sum;
-
- }
-
- #endif
-
  // sum up partial sums and write back result
  #pragma unroll
  for (int mask = 16; mask > 0; mask >>= 1) {
@@ -6857,7 +6586,5 @@ static __dpct_inline__ float
  vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {

- #ifndef GGML_QKK_64
  const block_q4_K * bq4_K = (const block_q4_K *) vbq;

  int    v[2];
@@ -6899,52 +6627,6 @@ vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
  }

  return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
-
- #else
-
- #if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
6906
- const block_q4_K * bq4_K = (const block_q4_K *) vbq;
6907
-
6908
- float sumf_d = 0.0f;
6909
- float sumf_m = 0.0f;
6910
-
6911
- uint16_t aux16[2];
6912
- const uint8_t * s = (const uint8_t *)aux16;
6913
-
6914
- const uint16_t * a = (const uint16_t *)bq4_K->scales;
6915
- aux16[0] = a[0] & 0x0f0f;
6916
- aux16[1] = (a[0] >> 4) & 0x0f0f;
6917
-
6918
- const float dall = bq4_K->dm[0];
6919
- const float dmin = bq4_K->dm[1];
6920
-
6921
- const float d8_1 = bq8_1[0].ds[0];
6922
- const float d8_2 = bq8_1[1].ds[1];
6923
-
6924
- const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
6925
- const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
6926
- const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
6927
- const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
6928
-
6929
- const int * q4 = (const int *)bq4_K->qs + (iqs/2);
6930
- const int v1 = q4[0];
6931
- const int v2 = q4[4];
6932
-
6933
- const int dot1 = dpct::dp4a(ui2, v2 & 0x0f0f0f0f, dpct::dp4a(ui1, v1 & 0x0f0f0f0f, 0));
6934
- const int dot2 = dpct::dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, dpct::dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
6935
- const int dot3 = dpct::dp4a(0x01010101, ui2, dpct::dp4a(0x01010101, ui1, 0));
6936
- const int dot4 = dpct::dp4a(0x01010101, ui4, dpct::dp4a(0x01010101, ui3, 0));
6937
-
6938
- sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
6939
- sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
6940
-
6941
- return dall * sumf_d - dmin * sumf_m;
6942
-
6943
- #else
6944
- bad_arch();
6945
- #endif // __SYCL_ARCH__ >= VER_4VEC
6946
-
6947
- #endif
6948
  }
6949
 
6950
  template <int mmq_y>
@@ -7003,11 +6685,7 @@ load_tiles_q4_K(const void *__restrict__ vx, int *__restrict__ x_ql,
7003
 
7004
  const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
7005
 
7006
- #if QK_K == 256
7007
  x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
7008
- #else
7009
- x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
7010
- #endif
7011
  }
7012
 
7013
  #pragma unroll
@@ -7050,7 +6728,6 @@ static __dpct_inline__ float
7050
  vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
7051
  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
7052
 
7053
- #ifndef GGML_QKK_64
7054
  const block_q5_K * bq5_K = (const block_q5_K *) vbq;
7055
 
7056
  int vl[2];
@@ -7092,48 +6769,6 @@ vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
7092
  }
7093
 
7094
  return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
7095
-
7096
- #else
7097
-
7098
- #if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
7099
- const block_q5_K * bq5_K = (const block_q5_K *) vbq;
7100
-
7101
- const int8_t * s = bq5_K->scales;
7102
-
7103
- const float d = bq5_K->d;
7104
-
7105
- const float d8_1 = bq8_1[0].ds[0];
7106
- const float d8_2 = bq8_1[1].ds[1];
7107
-
7108
- const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
7109
- const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
7110
- const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
7111
- const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
7112
-
7113
- const int * ql = (const int *)bq5_K->qs + (iqs/2);
7114
- const int vl1 = ql[0];
7115
- const int vl2 = ql[4];
7116
-
7117
- const int step = 4 * (iqs/2); // 0, 4, 8, 12
7118
- const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
7119
- const int in = step%8; // 0, 4, 0, 4
7120
- const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
7121
-
7122
- const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
7123
- const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
7124
- const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
7125
- const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
7126
-
7127
- const float sumf_d = d8_1 * (dpct::dp4a(ui1, v1, 0) * s[0] + dpct::dp4a(ui2, v2, 0) * s[1])
7128
- + d8_2 * (dpct::dp4a(ui3, v3, 0) * s[2] + dpct::dp4a(ui4, v4, 0) * s[3]);
7129
-
7130
- return d * sumf_d;
7131
-
7132
- #else
7133
- bad_arch();
7134
- #endif // __SYCL_ARCH__ >= VER_4VEC
7135
-
7136
- #endif
7137
  }
7138
 
7139
  template <int mmq_y>
@@ -7205,9 +6840,7 @@ load_tiles_q5_K(const void *__restrict__ vx, int *__restrict__ x_ql,
7205
 
7206
  const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
7207
 
7208
- #if QK_K == 256
7209
  x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
7210
- #endif
7211
  }
7212
 
7213
  #pragma unroll
@@ -7387,7 +7020,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
7387
  const block_q8_1 *__restrict__ bq8_1, const int &iqs,
7388
  const uint64_t *iq2xxs_grid, const uint8_t *ksigns_iq2xs,
7389
  const uint8_t *kmask_iq2xs) {
7390
- #if QK_K == 256
7391
  const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;
7392
 
7393
  #if QR2_XXS == 8
@@ -7428,10 +7060,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
7428
  }
7429
  return d * (sumi1 + sumi2);
7430
  #endif
7431
- #else
7432
- assert(false);
7433
- return 0.f;
7434
- #endif
7435
  }
7436
 
7437
  static __dpct_inline__ float
@@ -7440,7 +7068,6 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
7440
  const uint64_t *iq2xs_grid, const uint64_t *ksigns64) {
7441
  #if DPCT_COMPATIBILITY_TEMP >= \
7442
  MIN_CC_DP4A // lowest compute capability for integer intrinsics
7443
- #if QK_K == 256
7444
  const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;
7445
 
7446
  const int ib32 = iqs;
@@ -7478,16 +7105,11 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
7478
  assert(false);
7479
  return 0.f;
7480
  #endif
7481
- #else
7482
- assert(false);
7483
- return 0.f;
7484
- #endif
7485
  }
7486
 
7487
  static __dpct_inline__ float
7488
  vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
7489
  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
7490
- #if QK_K == 256
7491
  const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
7492
 
7493
  const int ib32 = iqs;
@@ -7531,9 +7153,6 @@ vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
7531
  }
7532
  const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f;
7533
  return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
7534
- #else
7535
- assert(false);
7536
- #endif
7537
  }
7538
 
7539
  static __dpct_inline__ float
@@ -7542,7 +7161,6 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
7542
  const uint32_t *iq3xxs_grid, const uint64_t *ksigns64) {
7543
  #if DPCT_COMPATIBILITY_TEMP >= \
7544
  MIN_CC_DP4A // lowest compute capability for integer intrinsics
7545
- #if QK_K == 256
7546
  const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;
7547
 
7548
  const int ib32 = iqs;
@@ -7570,17 +7188,12 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
7570
  assert(false);
7571
  return 0.f;
7572
  #endif
7573
- #else
7574
- assert(false);
7575
- return 0.f;
7576
- #endif
7577
  }
7578
 
7579
  static __dpct_inline__ float
7580
  vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
7581
  const block_q8_1 *__restrict__ bq8_1, const int &iqs,
7582
  const uint32_t *iq3s_grid) {
7583
- #if QK_K == 256
7584
  const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
7585
 
7586
  const int ib32 = iqs;
@@ -7609,16 +7222,12 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
7609
  (1 + 2 * ((bq2->scales[ib32 / 2] >> 4 * (ib32 % 2)) & 0xf)) *
7610
  bq8_1[ib32].ds[0];
7611
  return d * sumi;
7612
- #else
7613
- assert(false);
7614
- #endif
7615
  }
7616
 
7617
  static __dpct_inline__ float
7618
  vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
7619
  const block_q8_1 *__restrict__ bq8_1, const int &iqs,
7620
  const uint32_t *iq1s_grid_gpu) {
7621
- #if QK_K == 256
7622
  const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
7623
 
7624
  const int ib32 = iqs;
@@ -7637,15 +7246,11 @@ vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
7637
  const float d = d1q * bq8_1[ib32].ds[0];
7638
  const float m = d1q * bq8_1[ib32].ds[1];
7639
  return d * sumi + m * delta;
7640
- #else
7641
- assert(false);
7642
- #endif
7643
  }
7644
 
7645
  static __dpct_inline__ float
7646
  vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
7647
  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
7648
- #if QK_K == 256
7649
  const block_iq1_m * bq1 = (const block_iq1_m *) vbq;
7650
 
7651
  const int ib32 = iqs;
@@ -7670,9 +7275,6 @@ vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
7670
  scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
7671
  const float d = (float)scale.f16 * bq8_1[ib32].ds[0];
7672
  return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1));
7673
- #else
7674
- assert(false);
7675
- #endif
7676
  }
7677
 
7678
  static __dpct_inline__ void get_int_from_table_16(const uint32_t &q4,
@@ -7720,7 +7322,6 @@ static __dpct_inline__ float
7720
  vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
7721
  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
7722
 
7723
- #if QK_K == 256
7724
  const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
7725
  const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
7726
 
@@ -7738,9 +7339,6 @@ vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
7738
  sumi2 = dpct::dp4a(v2, q8[j + 4], sumi2);
7739
  }
7740
  return d * (sumi1 + sumi2);
7741
- #else
7742
- assert(false);
7743
- #endif
7744
  }
7745
 
7746
  template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x,
@@ -10203,7 +9801,6 @@ template <typename dst_t>
10203
  static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
10204
  dpct::queue_ptr stream) {
10205
  const int nb = k / QK_K;
10206
- #if QK_K == 256
10207
  {
10208
  dpct::has_capability_or_fail(stream->get_device(),
10209
  {sycl::aspect::fp16});
@@ -10215,27 +9812,12 @@ static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
10215
  dequantize_block_q2_K(vx, y, item_ct1);
10216
  });
10217
  }
10218
- #else
10219
- {
10220
- dpct::has_capability_or_fail(stream->get_device(),
10221
- {sycl::aspect::fp16});
10222
-
10223
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
10224
- sycl::range<3>(1, 1, 32),
10225
- sycl::range<3>(1, 1, 32)),
10226
- [=](sycl::nd_item<3> item_ct1) {
10227
- dequantize_block_q2_K(vx, y, item_ct1);
10228
- });
10229
- }
10230
-
10231
- #endif
10232
  }
10233
 
10234
  template <typename dst_t>
10235
  static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
10236
  dpct::queue_ptr stream) {
10237
  const int nb = k / QK_K;
10238
- #if QK_K == 256
10239
  {
10240
  dpct::has_capability_or_fail(stream->get_device(),
10241
  {sycl::aspect::fp16});
@@ -10247,19 +9829,6 @@ static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
10247
  dequantize_block_q3_K(vx, y, item_ct1);
10248
  });
10249
  }
10250
- #else
10251
- {
10252
- dpct::has_capability_or_fail(stream->get_device(),
10253
- {sycl::aspect::fp16});
10254
-
10255
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
10256
- sycl::range<3>(1, 1, 32),
10257
- sycl::range<3>(1, 1, 32)),
10258
- [=](sycl::nd_item<3> item_ct1) {
10259
- dequantize_block_q3_K(vx, y, item_ct1);
10260
- });
10261
- }
10262
- #endif
10263
  }
10264
 
10265
  template <typename dst_t>
@@ -10320,7 +9889,6 @@ template <typename dst_t>
10320
  static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
10321
  dpct::queue_ptr stream) {
10322
  const int nb = k / QK_K;
10323
- #if QK_K == 256
10324
  {
10325
  dpct::has_capability_or_fail(stream->get_device(),
10326
  {sycl::aspect::fp16});
@@ -10332,27 +9900,12 @@ static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
10332
  dequantize_block_q5_K(vx, y, item_ct1);
10333
  });
10334
  }
10335
- #else
10336
- {
10337
- dpct::has_capability_or_fail(stream->get_device(),
10338
- {sycl::aspect::fp16});
10339
-
10340
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
10341
- sycl::range<3>(1, 1, 32),
10342
- sycl::range<3>(1, 1, 32)),
10343
- [=](sycl::nd_item<3> item_ct1) {
10344
- dequantize_block_q5_K(vx, y, item_ct1);
10345
- });
10346
- }
10347
-
10348
- #endif
10349
  }
10350
 
10351
  template <typename dst_t>
10352
  static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
10353
  dpct::queue_ptr stream) {
10354
  const int nb = k / QK_K;
10355
- #if QK_K == 256
10356
  {
10357
  dpct::has_capability_or_fail(stream->get_device(),
10358
  {sycl::aspect::fp16});
@@ -10364,20 +9917,6 @@ static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
10364
  dequantize_block_q6_K(vx, y, item_ct1);
10365
  });
10366
  }
10367
- #else
10368
- {
10369
- dpct::has_capability_or_fail(stream->get_device(),
10370
- {sycl::aspect::fp16});
10371
-
10372
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
10373
- sycl::range<3>(1, 1, 32),
10374
- sycl::range<3>(1, 1, 32)),
10375
- [=](sycl::nd_item<3> item_ct1) {
10376
- dequantize_block_q6_K(vx, y, item_ct1);
10377
- });
10378
- }
10379
-
10380
- #endif
10381
  }
10382
 
10383
  template <typename dst_t>
@@ -10529,9 +10068,6 @@ template <typename dst_t>
10529
  static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
10530
  dpct::queue_ptr stream) {
10531
  const int nb = (k + QK_K - 1) / QK_K;
10532
- #if QK_K == 64
10533
- dequantize_row_iq4_nl_sycl(vx, y, k, stream);
10534
- #else
10535
  {
10536
  dpct::has_capability_or_fail(stream->get_device(),
10537
  {sycl::aspect::fp16});
@@ -10546,7 +10082,6 @@ static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
10546
  });
10547
  });
10548
  }
10549
- #endif
10550
  }
10551
 
10552
 
@@ -12051,8 +11586,6 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
12051
  const int nrows_y, const int nrows_dst,
12052
  dpct::queue_ptr stream) try {
12053
 
12054
- #if QK_K == 256
12055
-
12056
  int id;
12057
  SYCL_CHECK(
12058
  CHECK_TRY_ERROR(id = get_current_device_id()));
@@ -12167,7 +11700,6 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
12167
  });
12168
  }
12169
  }
12170
- #endif
12171
  }
12172
  catch (sycl::exception const &exc) {
12173
  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
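Every branch removed in the hunks above is a QK_K == 64 fallback; after this commit only the QK_K == 256 path is compiled. The bit layout the surviving kernels decode is unchanged. As a reference, here is a minimal host-side C sketch (illustrative only, not part of the commit) of the q6_K reconstruction that appears in both the kept and the deleted paths: each quant is rebuilt from a low nibble in ql and two high bits from qh, recentered by 32, then scaled by the signed 8-bit block scale and the super-block scale.

#include <stdint.h>

// Illustrative sketch, not from the commit: reconstruct one q6_K value as
// d * sc * ((int8_t)((ql & 0xF) | (qh_bits << 4)) - 32), mirroring the
// kernels above. ql_nibble holds the low 4 bits, qh_bits the high 2 bits,
// sc the signed 8-bit block scale, d the super-block scale.
static inline float dequant_one_q6_K(uint8_t ql_nibble, uint8_t qh_bits,
                                     int8_t sc, float d) {
    const int8_t q = (int8_t)((ql_nibble & 0xF) | ((qh_bits & 3) << 4)) - 32; // -32..31
    return d * (float)sc * (float)q;
}

The deleted #else branches computed the same values with 64-element indexing; with QK_K fixed at 256 every super-block spans 256 values, so a single code path per kernel suffices.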
 
ggml.c CHANGED
@@ -871,22 +871,14 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  },
  [GGML_TYPE_IQ4_XS] = {
  .type_name = "iq4_xs",
- #if QK_K == 64
- .blck_size = QK4_NL,
- #else
  .blck_size = QK_K,
- #endif
  .type_size = sizeof(block_iq4_xs),
  .is_quantized = true,
  .to_float = (ggml_to_float_t) dequantize_row_iq4_xs,
  .from_float = quantize_row_iq4_xs,
  .from_float_reference = (ggml_from_float_t)quantize_row_iq4_xs_reference,
  .vec_dot = ggml_vec_dot_iq4_xs_q8_K,
- #if QK_K == 64
- .vec_dot_type = GGML_TYPE_Q8_0,
- #else
  .vec_dot_type = GGML_TYPE_Q8_K,
- #endif
  .nrows = 1,
  },
  [GGML_TYPE_Q8_K] = {
@@ -22117,11 +22109,7 @@ size_t ggml_quantize_chunk(
  case GGML_TYPE_IQ1_S: result = quantize_iq1_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
  case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
  case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
- #if QK_K == 64
- case GGML_TYPE_IQ4_XS: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
- #else
  case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
- #endif
  case GGML_TYPE_F16:
  {
  size_t elemsize = sizeof(ggml_fp16_t);
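With QK_K pinned to 256, the IQ4_XS type traits no longer fork on a build flag: blck_size is always QK_K, the dot type is always GGML_TYPE_Q8_K, and ggml_quantize_chunk unconditionally dispatches to quantize_iq4_xs. A hypothetical caller-side sketch, assuming only the ggml_quantize_chunk signature visible in the hunk above (not part of the commit):

#include <stdint.h>
#include "ggml.h"

// Hypothetical usage sketch: quantize nrows rows of n_per_row floats to
// IQ4_XS. After this commit the IQ4_XS case always routes to
// quantize_iq4_xs; there is no QK_K == 64 special case left.
static size_t quantize_rows_iq4_xs(const float * src, void * dst,
                                   int64_t nrows, int64_t n_per_row) {
    // start = 0 quantizes from the first row; no importance matrix is used.
    return ggml_quantize_chunk(GGML_TYPE_IQ4_XS, src, dst,
                               /*start =*/ 0, nrows, n_per_row,
                               /*imatrix =*/ NULL);
}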
 