dranger003 commited on
Commit
531387f
·
1 Parent(s): 22e446d

Fix more int overflow during quant (PPL/CUDA). (llama/6563)

Browse files

* Fix more int overflow during quant.

* Fix some more int overflow in softmax.

* Revert back to int64_t.

Files changed (2) hide show
  1. ggml-cuda/convert.cu +84 -84
  2. ggml-cuda/softmax.cu +4 -4
ggml-cuda/convert.cu CHANGED
@@ -5,16 +5,16 @@
5
 
6
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
7
  static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
8
- const int64_t i = 2*(blockDim.x*blockIdx.x + threadIdx.x);
9
 
10
  if (i >= k) {
11
  return;
12
  }
13
 
14
  const int64_t ib = i/qk; // block index
15
- const int iqs = (i%qk)/qr; // quant index
16
- const int iybs = i - i%qk; // y block start index
17
- const int y_offset = qr == 1 ? 1 : qk/2;
18
 
19
  // dequantize
20
  dfloat2 v;
@@ -29,7 +29,7 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h
29
  #if __CUDA_ARCH__ >= CC_PASCAL
30
  constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;
31
 
32
- const int i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x;
33
  const int * x0 = ((int *) vx) + blockIdx.x * nint;
34
  half2 * y2 = (half2 *) (y + i0);
35
 
@@ -73,9 +73,9 @@ static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t
73
  const int64_t i = blockIdx.x;
74
 
75
  // assume 32 threads
76
- const int tid = threadIdx.x;
77
- const int il = tid/8;
78
- const int ir = tid%8;
79
  const int64_t ib = 8*i + ir;
80
  if (ib >= nb32) {
81
  return;
@@ -101,9 +101,9 @@ static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t
101
  const int64_t i = blockIdx.x;
102
 
103
  // assume 32 threads
104
- const int tid = threadIdx.x;
105
- const int il = tid/8;
106
- const int ir = tid%8;
107
  const int64_t ib = 8*i + ir;
108
  if (ib >= nb32) {
109
  return;
@@ -127,14 +127,14 @@ static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t
127
  template<typename dst_t>
128
  static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
129
 
130
- const int i = blockIdx.x;
131
  const block_q2_K * x = (const block_q2_K *) vx;
132
 
133
- const int tid = threadIdx.x;
134
  #if QK_K == 256
135
- const int n = tid/32;
136
- const int l = tid - 32*n;
137
- const int is = 8*n + l/16;
138
 
139
  const uint8_t q = x[i].qs[32*n + l];
140
  dst_t * y = yy + i*QK_K + 128*n;
@@ -146,8 +146,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t
146
  y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
147
  y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
148
  #else
149
- const int is = tid/16; // 0 or 1
150
- const int il = tid%16; // 0...15
151
  const uint8_t q = x[i].qs[il] >> (2*is);
152
  dst_t * y = yy + i*QK_K + 16*is + il;
153
  float dall = __low2half(x[i].dm);
@@ -161,19 +161,19 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t
161
  template<typename dst_t>
162
  static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
163
 
164
- const int i = blockIdx.x;
165
  const block_q3_K * x = (const block_q3_K *) vx;
166
 
167
  #if QK_K == 256
168
- const int r = threadIdx.x/4;
169
- const int tid = r/2;
170
- const int is0 = r%2;
171
- const int l0 = 16*is0 + 4*(threadIdx.x%4);
172
- const int n = tid / 4;
173
- const int j = tid - 4*n;
174
 
175
  uint8_t m = 1 << (4*n + j);
176
- int is = 8*n + 2*j + is0;
177
  int shift = 2*j;
178
 
179
  int8_t us = is < 4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
@@ -189,11 +189,11 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t
189
 
190
  for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
191
  #else
192
- const int tid = threadIdx.x;
193
- const int is = tid/16; // 0 or 1
194
- const int il = tid%16; // 0...15
195
- const int im = il/8; // 0...1
196
- const int in = il%8; // 0...7
197
 
198
  dst_t * y = yy + i*QK_K + 16*is + il;
199
 
@@ -227,15 +227,15 @@ template<typename dst_t>
227
  static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
228
  const block_q4_K * x = (const block_q4_K *) vx;
229
 
230
- const int i = blockIdx.x;
231
 
232
  #if QK_K == 256
233
  // assume 32 threads
234
- const int tid = threadIdx.x;
235
- const int il = tid/8;
236
- const int ir = tid%8;
237
- const int is = 2*il;
238
- const int n = 4;
239
 
240
  dst_t * y = yy + i*QK_K + 64*il + n*ir;
241
 
@@ -254,7 +254,7 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t
254
  y[l +32] = d2 * (q[l] >> 4) - m2;
255
  }
256
  #else
257
- const int tid = threadIdx.x;
258
  const uint8_t * q = x[i].qs;
259
  dst_t * y = yy + i*QK_K;
260
  const float d = (float)x[i].dm[0];
@@ -268,14 +268,14 @@ template<typename dst_t>
268
  static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
269
  const block_q5_K * x = (const block_q5_K *) vx;
270
 
271
- const int i = blockIdx.x;
272
 
273
  #if QK_K == 256
274
  // assume 64 threads - this is very slightly better than the one below
275
- const int tid = threadIdx.x;
276
- const int il = tid/16; // il is in 0...3
277
- const int ir = tid%16; // ir is in 0...15
278
- const int is = 2*il; // is is in 0...6
279
 
280
  dst_t * y = yy + i*QK_K + 64*il + 2*ir;
281
 
@@ -298,11 +298,11 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t
298
  y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
299
  y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
300
  #else
301
- const int tid = threadIdx.x;
302
  const uint8_t q = x[i].qs[tid];
303
- const int im = tid/8; // 0...3
304
- const int in = tid%8; // 0...7
305
- const int is = tid/16; // 0 or 1
306
  const uint8_t h = x[i].qh[in] >> im;
307
  const float d = x[i].d;
308
  dst_t * y = yy + i*QK_K + tid;
@@ -359,13 +359,13 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
359
  template<typename dst_t>
360
  static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
361
 
362
- const int i = blockIdx.x;
363
  const block_iq2_xxs * x = (const block_iq2_xxs *) vx;
364
 
365
- const int tid = threadIdx.x;
366
  #if QK_K == 256
367
- const int il = tid/8; // 0...3
368
- const int ib = tid%8; // 0...7
369
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
370
  const uint16_t * q2 = x[i].qs + 4*ib;
371
  const uint8_t * aux8 = (const uint8_t *)q2;
@@ -383,13 +383,13 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds
383
  template<typename dst_t>
384
  static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
385
 
386
- const int i = blockIdx.x;
387
  const block_iq2_xs * x = (const block_iq2_xs *) vx;
388
 
389
- const int tid = threadIdx.x;
390
  #if QK_K == 256
391
- const int il = tid/8; // 0...3
392
- const int ib = tid%8; // 0...7
393
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
394
  const uint16_t * q2 = x[i].qs + 4*ib;
395
  const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511));
@@ -405,13 +405,13 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
405
  template<typename dst_t>
406
  static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
407
 
408
- const int i = blockIdx.x;
409
  const block_iq2_s * x = (const block_iq2_s *) vx;
410
 
411
- const int tid = threadIdx.x;
412
  #if QK_K == 256
413
- const int il = tid/8; // 0...3
414
- const int ib = tid%8; // 0...7
415
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
416
  const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
417
  const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
@@ -426,13 +426,13 @@ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_
426
  template<typename dst_t>
427
  static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
428
 
429
- const int i = blockIdx.x;
430
  const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
431
 
432
- const int tid = threadIdx.x;
433
  #if QK_K == 256
434
- const int il = tid/8; // 0...3
435
- const int ib = tid%8; // 0...7
436
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
437
  const uint8_t * q3 = x[i].qs + 8*ib;
438
  const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib;
@@ -454,13 +454,13 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
454
  template<typename dst_t>
455
  static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
456
 
457
- const int i = blockIdx.x;
458
  const block_iq3_s * x = (const block_iq3_s *) vx;
459
 
460
- const int tid = threadIdx.x;
461
  #if QK_K == 256
462
- const int il = tid/8; // 0...3
463
- const int ib = tid%8; // 0...7
464
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
465
  const uint8_t * qs = x[i].qs + 8*ib;
466
  const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)));
@@ -480,13 +480,13 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_
480
  template<typename dst_t>
481
  static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
482
 
483
- const int i = blockIdx.x;
484
  const block_iq1_s * x = (const block_iq1_s *) vx;
485
 
486
- const int tid = threadIdx.x;
487
  #if QK_K == 256
488
- const int il = tid/8; // 0...3
489
- const int ib = tid%8; // 0...7
490
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
491
  const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA;
492
  const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1);
@@ -506,18 +506,18 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
506
  template<typename dst_t>
507
  static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_t * __restrict__ yy) {
508
 
509
- const int i = blockIdx.x;
510
  const block_iq1_m * x = (const block_iq1_m *) vx;
511
 
512
- const int tid = threadIdx.x;
513
  #if QK_K == 256
514
- const int il = tid/8; // 0...3
515
- const int ib = tid%8; // 0...7
516
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
517
  const uint16_t * sc = (const uint16_t *)x[i].scales;
518
  iq1m_scale_t scale;
519
  scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
520
- const int ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4);
521
  const float d = (float)scale.f16 * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1);
522
  const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA;
523
  uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
@@ -537,12 +537,12 @@ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_
537
  template<typename dst_t>
538
  static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {
539
 
540
- const int i = blockIdx.x;
541
  const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);
542
 
543
- const int tid = threadIdx.x;
544
- const int il = tid/8; // 0...3
545
- const int ib = tid%8; // 0...7
546
  dst_t * y = yy + i*QK_K + 32*ib + 4*il;
547
  const uint8_t * q4 = x[ib].qs + 4*il;
548
  const float d = (float)x[ib].d;
@@ -556,12 +556,12 @@ static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst
556
  #if QK_K != 64
557
  template<typename dst_t>
558
  static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
559
- const int i = blockIdx.x;
560
  const block_iq4_xs * x = (const block_iq4_xs *)vx;
561
 
562
- const int tid = threadIdx.x;
563
- const int il = tid/8; // 0...3
564
- const int ib = tid%8; // 0...7
565
  dst_t * y = yy + i*QK_K + 32*ib + 4*il;
566
  const uint8_t * q4 = x[i].qs + 16*ib + 4*il;
567
  const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);
 
5
 
6
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
7
  static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
8
+ const int64_t i = (int64_t)2*(blockDim.x*blockIdx.x + threadIdx.x);
9
 
10
  if (i >= k) {
11
  return;
12
  }
13
 
14
  const int64_t ib = i/qk; // block index
15
+ const int64_t iqs = (i%qk)/qr; // quant index
16
+ const int64_t iybs = i - i%qk; // y block start index
17
+ const int64_t y_offset = qr == 1 ? 1 : qk/2;
18
 
19
  // dequantize
20
  dfloat2 v;
 
29
  #if __CUDA_ARCH__ >= CC_PASCAL
30
  constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;
31
 
32
+ const int64_t i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x;
33
  const int * x0 = ((int *) vx) + blockIdx.x * nint;
34
  half2 * y2 = (half2 *) (y + i0);
35
 
 
73
  const int64_t i = blockIdx.x;
74
 
75
  // assume 32 threads
76
+ const int64_t tid = threadIdx.x;
77
+ const int64_t il = tid/8;
78
+ const int64_t ir = tid%8;
79
  const int64_t ib = 8*i + ir;
80
  if (ib >= nb32) {
81
  return;
 
101
  const int64_t i = blockIdx.x;
102
 
103
  // assume 32 threads
104
+ const int64_t tid = threadIdx.x;
105
+ const int64_t il = tid/8;
106
+ const int64_t ir = tid%8;
107
  const int64_t ib = 8*i + ir;
108
  if (ib >= nb32) {
109
  return;
 
127
  template<typename dst_t>
128
  static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
129
 
130
+ const int64_t i = blockIdx.x;
131
  const block_q2_K * x = (const block_q2_K *) vx;
132
 
133
+ const int64_t tid = threadIdx.x;
134
  #if QK_K == 256
135
+ const int64_t n = tid/32;
136
+ const int64_t l = tid - 32*n;
137
+ const int64_t is = 8*n + l/16;
138
 
139
  const uint8_t q = x[i].qs[32*n + l];
140
  dst_t * y = yy + i*QK_K + 128*n;
 
146
  y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
147
  y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
148
  #else
149
+ const int64_t is = tid/16; // 0 or 1
150
+ const int64_t il = tid%16; // 0...15
151
  const uint8_t q = x[i].qs[il] >> (2*is);
152
  dst_t * y = yy + i*QK_K + 16*is + il;
153
  float dall = __low2half(x[i].dm);
 
161
  template<typename dst_t>
162
  static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
163
 
164
+ const int64_t i = blockIdx.x;
165
  const block_q3_K * x = (const block_q3_K *) vx;
166
 
167
  #if QK_K == 256
168
+ const int64_t r = threadIdx.x/4;
169
+ const int64_t tid = r/2;
170
+ const int64_t is0 = r%2;
171
+ const int64_t l0 = 16*is0 + 4*(threadIdx.x%4);
172
+ const int64_t n = tid / 4;
173
+ const int64_t j = tid - 4*n;
174
 
175
  uint8_t m = 1 << (4*n + j);
176
+ int64_t is = 8*n + 2*j + is0;
177
  int shift = 2*j;
178
 
179
  int8_t us = is < 4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
 
189
 
190
  for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
191
  #else
192
+ const int64_t tid = threadIdx.x;
193
+ const int64_t is = tid/16; // 0 or 1
194
+ const int64_t il = tid%16; // 0...15
195
+ const int64_t im = il/8; // 0...1
196
+ const int64_t in = il%8; // 0...7
197
 
198
  dst_t * y = yy + i*QK_K + 16*is + il;
199
 
 
227
  static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
228
  const block_q4_K * x = (const block_q4_K *) vx;
229
 
230
+ const int64_t i = blockIdx.x;
231
 
232
  #if QK_K == 256
233
  // assume 32 threads
234
+ const int64_t tid = threadIdx.x;
235
+ const int64_t il = tid/8;
236
+ const int64_t ir = tid%8;
237
+ const int64_t is = 2*il;
238
+ const int64_t n = 4;
239
 
240
  dst_t * y = yy + i*QK_K + 64*il + n*ir;
241
 
 
254
  y[l +32] = d2 * (q[l] >> 4) - m2;
255
  }
256
  #else
257
+ const int64_t tid = threadIdx.x;
258
  const uint8_t * q = x[i].qs;
259
  dst_t * y = yy + i*QK_K;
260
  const float d = (float)x[i].dm[0];
 
268
  static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
269
  const block_q5_K * x = (const block_q5_K *) vx;
270
 
271
+ const int64_t i = blockIdx.x;
272
 
273
  #if QK_K == 256
274
  // assume 64 threads - this is very slightly better than the one below
275
+ const int64_t tid = threadIdx.x;
276
+ const int64_t il = tid/16; // il is in 0...3
277
+ const int64_t ir = tid%16; // ir is in 0...15
278
+ const int64_t is = 2*il; // is is in 0...6
279
 
280
  dst_t * y = yy + i*QK_K + 64*il + 2*ir;
281
 
 
298
  y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
299
  y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
300
  #else
301
+ const int64_t tid = threadIdx.x;
302
  const uint8_t q = x[i].qs[tid];
303
+ const int64_t im = tid/8; // 0...3
304
+ const int64_t in = tid%8; // 0...7
305
+ const int64_t is = tid/16; // 0 or 1
306
  const uint8_t h = x[i].qh[in] >> im;
307
  const float d = x[i].d;
308
  dst_t * y = yy + i*QK_K + tid;
 
359
  template<typename dst_t>
360
  static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
361
 
362
+ const int64_t i = blockIdx.x;
363
  const block_iq2_xxs * x = (const block_iq2_xxs *) vx;
364
 
365
+ const int64_t tid = threadIdx.x;
366
  #if QK_K == 256
367
+ const int64_t il = tid/8; // 0...3
368
+ const int64_t ib = tid%8; // 0...7
369
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
370
  const uint16_t * q2 = x[i].qs + 4*ib;
371
  const uint8_t * aux8 = (const uint8_t *)q2;
 
383
  template<typename dst_t>
384
  static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
385
 
386
+ const int64_t i = blockIdx.x;
387
  const block_iq2_xs * x = (const block_iq2_xs *) vx;
388
 
389
+ const int64_t tid = threadIdx.x;
390
  #if QK_K == 256
391
+ const int64_t il = tid/8; // 0...3
392
+ const int64_t ib = tid%8; // 0...7
393
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
394
  const uint16_t * q2 = x[i].qs + 4*ib;
395
  const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511));
 
405
  template<typename dst_t>
406
  static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
407
 
408
+ const int64_t i = blockIdx.x;
409
  const block_iq2_s * x = (const block_iq2_s *) vx;
410
 
411
+ const int64_t tid = threadIdx.x;
412
  #if QK_K == 256
413
+ const int64_t il = tid/8; // 0...3
414
+ const int64_t ib = tid%8; // 0...7
415
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
416
  const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
417
  const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
 
426
  template<typename dst_t>
427
  static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
428
 
429
+ const int64_t i = blockIdx.x;
430
  const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
431
 
432
+ const int64_t tid = threadIdx.x;
433
  #if QK_K == 256
434
+ const int64_t il = tid/8; // 0...3
435
+ const int64_t ib = tid%8; // 0...7
436
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
437
  const uint8_t * q3 = x[i].qs + 8*ib;
438
  const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib;
 
454
  template<typename dst_t>
455
  static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
456
 
457
+ const int64_t i = blockIdx.x;
458
  const block_iq3_s * x = (const block_iq3_s *) vx;
459
 
460
+ const int64_t tid = threadIdx.x;
461
  #if QK_K == 256
462
+ const int64_t il = tid/8; // 0...3
463
+ const int64_t ib = tid%8; // 0...7
464
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
465
  const uint8_t * qs = x[i].qs + 8*ib;
466
  const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)));
 
480
  template<typename dst_t>
481
  static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
482
 
483
+ const int64_t i = blockIdx.x;
484
  const block_iq1_s * x = (const block_iq1_s *) vx;
485
 
486
+ const int64_t tid = threadIdx.x;
487
  #if QK_K == 256
488
+ const int64_t il = tid/8; // 0...3
489
+ const int64_t ib = tid%8; // 0...7
490
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
491
  const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA;
492
  const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1);
 
506
  template<typename dst_t>
507
  static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_t * __restrict__ yy) {
508
 
509
+ const int64_t i = blockIdx.x;
510
  const block_iq1_m * x = (const block_iq1_m *) vx;
511
 
512
+ const int64_t tid = threadIdx.x;
513
  #if QK_K == 256
514
+ const int64_t il = tid/8; // 0...3
515
+ const int64_t ib = tid%8; // 0...7
516
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
517
  const uint16_t * sc = (const uint16_t *)x[i].scales;
518
  iq1m_scale_t scale;
519
  scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
520
+ const int64_t ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4);
521
  const float d = (float)scale.f16 * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1);
522
  const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA;
523
  uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
 
537
  template<typename dst_t>
538
  static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {
539
 
540
+ const int64_t i = blockIdx.x;
541
  const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);
542
 
543
+ const int64_t tid = threadIdx.x;
544
+ const int64_t il = tid/8; // 0...3
545
+ const int64_t ib = tid%8; // 0...7
546
  dst_t * y = yy + i*QK_K + 32*ib + 4*il;
547
  const uint8_t * q4 = x[ib].qs + 4*il;
548
  const float d = (float)x[ib].d;
 
556
  #if QK_K != 64
557
  template<typename dst_t>
558
  static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
559
+ const int64_t i = blockIdx.x;
560
  const block_iq4_xs * x = (const block_iq4_xs *)vx;
561
 
562
+ const int64_t tid = threadIdx.x;
563
+ const int64_t il = tid/8; // 0...3
564
+ const int64_t ib = tid%8; // 0...7
565
  dst_t * y = yy + i*QK_K + 32*ib + 4*il;
566
  const uint8_t * q4 = x[i].qs + 16*ib + 4*il;
567
  const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);
ggml-cuda/softmax.cu CHANGED
@@ -28,7 +28,7 @@ static __global__ void soft_max_f32(const float * x, const float * mask, const f
28
  extern __shared__ float data_soft_max_f32[];
29
  float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication
30
  // shared memory buffer to cache values between iterations:
31
- float * vals = vals_smem ? buf_iw + WARP_SIZE : dst + rowx*ncols;
32
 
33
  float max_val = -INFINITY;
34
 
@@ -40,8 +40,8 @@ static __global__ void soft_max_f32(const float * x, const float * mask, const f
40
  break;
41
  }
42
 
43
- const int ix = rowx*ncols + col;
44
- const int iy = rowy*ncols + col;
45
 
46
  const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);
47
 
@@ -109,7 +109,7 @@ static __global__ void soft_max_f32(const float * x, const float * mask, const f
109
  return;
110
  }
111
 
112
- const int idst = rowx*ncols + col;
113
  dst[idst] = vals[col] * inv_sum;
114
  }
115
  }
 
28
  extern __shared__ float data_soft_max_f32[];
29
  float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication
30
  // shared memory buffer to cache values between iterations:
31
+ float * vals = vals_smem ? buf_iw + WARP_SIZE : dst + (int64_t)rowx*ncols;
32
 
33
  float max_val = -INFINITY;
34
 
 
40
  break;
41
  }
42
 
43
+ const int64_t ix = (int64_t)rowx*ncols + col;
44
+ const int64_t iy = (int64_t)rowy*ncols + col;
45
 
46
  const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);
47
 
 
109
  return;
110
  }
111
 
112
+ const int64_t idst = (int64_t)rowx*ncols + col;
113
  dst[idst] = vals[col] * inv_sum;
114
  }
115
  }