Spaces:
Running
Running
Commit
·
531387f
1
Parent(s):
22e446d
Fix more int overflow during quant (PPL/CUDA). (llama/6563)
Browse files* Fix more int overflow during quant.
* Fix some more int overflow in softmax.
* Revert back to int64_t.
- ggml-cuda/convert.cu +84 -84
- ggml-cuda/softmax.cu +4 -4
ggml-cuda/convert.cu
CHANGED
|
@@ -5,16 +5,16 @@
|
|
| 5 |
|
| 6 |
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
| 7 |
static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
|
| 8 |
-
const int64_t i = 2*(blockDim.x*blockIdx.x + threadIdx.x);
|
| 9 |
|
| 10 |
if (i >= k) {
|
| 11 |
return;
|
| 12 |
}
|
| 13 |
|
| 14 |
const int64_t ib = i/qk; // block index
|
| 15 |
-
const
|
| 16 |
-
const
|
| 17 |
-
const
|
| 18 |
|
| 19 |
// dequantize
|
| 20 |
dfloat2 v;
|
|
@@ -29,7 +29,7 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h
|
|
| 29 |
#if __CUDA_ARCH__ >= CC_PASCAL
|
| 30 |
constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;
|
| 31 |
|
| 32 |
-
const
|
| 33 |
const int * x0 = ((int *) vx) + blockIdx.x * nint;
|
| 34 |
half2 * y2 = (half2 *) (y + i0);
|
| 35 |
|
|
@@ -73,9 +73,9 @@ static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t
|
|
| 73 |
const int64_t i = blockIdx.x;
|
| 74 |
|
| 75 |
// assume 32 threads
|
| 76 |
-
const
|
| 77 |
-
const
|
| 78 |
-
const
|
| 79 |
const int64_t ib = 8*i + ir;
|
| 80 |
if (ib >= nb32) {
|
| 81 |
return;
|
|
@@ -101,9 +101,9 @@ static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t
|
|
| 101 |
const int64_t i = blockIdx.x;
|
| 102 |
|
| 103 |
// assume 32 threads
|
| 104 |
-
const
|
| 105 |
-
const
|
| 106 |
-
const
|
| 107 |
const int64_t ib = 8*i + ir;
|
| 108 |
if (ib >= nb32) {
|
| 109 |
return;
|
|
@@ -127,14 +127,14 @@ static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t
|
|
| 127 |
template<typename dst_t>
|
| 128 |
static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 129 |
|
| 130 |
-
const
|
| 131 |
const block_q2_K * x = (const block_q2_K *) vx;
|
| 132 |
|
| 133 |
-
const
|
| 134 |
#if QK_K == 256
|
| 135 |
-
const
|
| 136 |
-
const
|
| 137 |
-
const
|
| 138 |
|
| 139 |
const uint8_t q = x[i].qs[32*n + l];
|
| 140 |
dst_t * y = yy + i*QK_K + 128*n;
|
|
@@ -146,8 +146,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t
|
|
| 146 |
y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
|
| 147 |
y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
|
| 148 |
#else
|
| 149 |
-
const
|
| 150 |
-
const
|
| 151 |
const uint8_t q = x[i].qs[il] >> (2*is);
|
| 152 |
dst_t * y = yy + i*QK_K + 16*is + il;
|
| 153 |
float dall = __low2half(x[i].dm);
|
|
@@ -161,19 +161,19 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t
|
|
| 161 |
template<typename dst_t>
|
| 162 |
static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 163 |
|
| 164 |
-
const
|
| 165 |
const block_q3_K * x = (const block_q3_K *) vx;
|
| 166 |
|
| 167 |
#if QK_K == 256
|
| 168 |
-
const
|
| 169 |
-
const
|
| 170 |
-
const
|
| 171 |
-
const
|
| 172 |
-
const
|
| 173 |
-
const
|
| 174 |
|
| 175 |
uint8_t m = 1 << (4*n + j);
|
| 176 |
-
|
| 177 |
int shift = 2*j;
|
| 178 |
|
| 179 |
int8_t us = is < 4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
|
|
@@ -189,11 +189,11 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t
|
|
| 189 |
|
| 190 |
for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
|
| 191 |
#else
|
| 192 |
-
const
|
| 193 |
-
const
|
| 194 |
-
const
|
| 195 |
-
const
|
| 196 |
-
const
|
| 197 |
|
| 198 |
dst_t * y = yy + i*QK_K + 16*is + il;
|
| 199 |
|
|
@@ -227,15 +227,15 @@ template<typename dst_t>
|
|
| 227 |
static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 228 |
const block_q4_K * x = (const block_q4_K *) vx;
|
| 229 |
|
| 230 |
-
const
|
| 231 |
|
| 232 |
#if QK_K == 256
|
| 233 |
// assume 32 threads
|
| 234 |
-
const
|
| 235 |
-
const
|
| 236 |
-
const
|
| 237 |
-
const
|
| 238 |
-
const
|
| 239 |
|
| 240 |
dst_t * y = yy + i*QK_K + 64*il + n*ir;
|
| 241 |
|
|
@@ -254,7 +254,7 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t
|
|
| 254 |
y[l +32] = d2 * (q[l] >> 4) - m2;
|
| 255 |
}
|
| 256 |
#else
|
| 257 |
-
const
|
| 258 |
const uint8_t * q = x[i].qs;
|
| 259 |
dst_t * y = yy + i*QK_K;
|
| 260 |
const float d = (float)x[i].dm[0];
|
|
@@ -268,14 +268,14 @@ template<typename dst_t>
|
|
| 268 |
static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 269 |
const block_q5_K * x = (const block_q5_K *) vx;
|
| 270 |
|
| 271 |
-
const
|
| 272 |
|
| 273 |
#if QK_K == 256
|
| 274 |
// assume 64 threads - this is very slightly better than the one below
|
| 275 |
-
const
|
| 276 |
-
const
|
| 277 |
-
const
|
| 278 |
-
const
|
| 279 |
|
| 280 |
dst_t * y = yy + i*QK_K + 64*il + 2*ir;
|
| 281 |
|
|
@@ -298,11 +298,11 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t
|
|
| 298 |
y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
|
| 299 |
y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
|
| 300 |
#else
|
| 301 |
-
const
|
| 302 |
const uint8_t q = x[i].qs[tid];
|
| 303 |
-
const
|
| 304 |
-
const
|
| 305 |
-
const
|
| 306 |
const uint8_t h = x[i].qh[in] >> im;
|
| 307 |
const float d = x[i].d;
|
| 308 |
dst_t * y = yy + i*QK_K + tid;
|
|
@@ -359,13 +359,13 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
|
|
| 359 |
template<typename dst_t>
|
| 360 |
static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 361 |
|
| 362 |
-
const
|
| 363 |
const block_iq2_xxs * x = (const block_iq2_xxs *) vx;
|
| 364 |
|
| 365 |
-
const
|
| 366 |
#if QK_K == 256
|
| 367 |
-
const
|
| 368 |
-
const
|
| 369 |
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
| 370 |
const uint16_t * q2 = x[i].qs + 4*ib;
|
| 371 |
const uint8_t * aux8 = (const uint8_t *)q2;
|
|
@@ -383,13 +383,13 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds
|
|
| 383 |
template<typename dst_t>
|
| 384 |
static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 385 |
|
| 386 |
-
const
|
| 387 |
const block_iq2_xs * x = (const block_iq2_xs *) vx;
|
| 388 |
|
| 389 |
-
const
|
| 390 |
#if QK_K == 256
|
| 391 |
-
const
|
| 392 |
-
const
|
| 393 |
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
| 394 |
const uint16_t * q2 = x[i].qs + 4*ib;
|
| 395 |
const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511));
|
|
@@ -405,13 +405,13 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
|
|
| 405 |
template<typename dst_t>
|
| 406 |
static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 407 |
|
| 408 |
-
const
|
| 409 |
const block_iq2_s * x = (const block_iq2_s *) vx;
|
| 410 |
|
| 411 |
-
const
|
| 412 |
#if QK_K == 256
|
| 413 |
-
const
|
| 414 |
-
const
|
| 415 |
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
| 416 |
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
|
| 417 |
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
|
@@ -426,13 +426,13 @@ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_
|
|
| 426 |
template<typename dst_t>
|
| 427 |
static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 428 |
|
| 429 |
-
const
|
| 430 |
const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
|
| 431 |
|
| 432 |
-
const
|
| 433 |
#if QK_K == 256
|
| 434 |
-
const
|
| 435 |
-
const
|
| 436 |
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
| 437 |
const uint8_t * q3 = x[i].qs + 8*ib;
|
| 438 |
const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib;
|
|
@@ -454,13 +454,13 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
|
|
| 454 |
template<typename dst_t>
|
| 455 |
static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 456 |
|
| 457 |
-
const
|
| 458 |
const block_iq3_s * x = (const block_iq3_s *) vx;
|
| 459 |
|
| 460 |
-
const
|
| 461 |
#if QK_K == 256
|
| 462 |
-
const
|
| 463 |
-
const
|
| 464 |
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
| 465 |
const uint8_t * qs = x[i].qs + 8*ib;
|
| 466 |
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)));
|
|
@@ -480,13 +480,13 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_
|
|
| 480 |
template<typename dst_t>
|
| 481 |
static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 482 |
|
| 483 |
-
const
|
| 484 |
const block_iq1_s * x = (const block_iq1_s *) vx;
|
| 485 |
|
| 486 |
-
const
|
| 487 |
#if QK_K == 256
|
| 488 |
-
const
|
| 489 |
-
const
|
| 490 |
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
| 491 |
const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA;
|
| 492 |
const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1);
|
|
@@ -506,18 +506,18 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
|
|
| 506 |
template<typename dst_t>
|
| 507 |
static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 508 |
|
| 509 |
-
const
|
| 510 |
const block_iq1_m * x = (const block_iq1_m *) vx;
|
| 511 |
|
| 512 |
-
const
|
| 513 |
#if QK_K == 256
|
| 514 |
-
const
|
| 515 |
-
const
|
| 516 |
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
| 517 |
const uint16_t * sc = (const uint16_t *)x[i].scales;
|
| 518 |
iq1m_scale_t scale;
|
| 519 |
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
| 520 |
-
const
|
| 521 |
const float d = (float)scale.f16 * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1);
|
| 522 |
const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA;
|
| 523 |
uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
|
|
@@ -537,12 +537,12 @@ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_
|
|
| 537 |
template<typename dst_t>
|
| 538 |
static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 539 |
|
| 540 |
-
const
|
| 541 |
const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);
|
| 542 |
|
| 543 |
-
const
|
| 544 |
-
const
|
| 545 |
-
const
|
| 546 |
dst_t * y = yy + i*QK_K + 32*ib + 4*il;
|
| 547 |
const uint8_t * q4 = x[ib].qs + 4*il;
|
| 548 |
const float d = (float)x[ib].d;
|
|
@@ -556,12 +556,12 @@ static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst
|
|
| 556 |
#if QK_K != 64
|
| 557 |
template<typename dst_t>
|
| 558 |
static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 559 |
-
const
|
| 560 |
const block_iq4_xs * x = (const block_iq4_xs *)vx;
|
| 561 |
|
| 562 |
-
const
|
| 563 |
-
const
|
| 564 |
-
const
|
| 565 |
dst_t * y = yy + i*QK_K + 32*ib + 4*il;
|
| 566 |
const uint8_t * q4 = x[i].qs + 16*ib + 4*il;
|
| 567 |
const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);
|
|
|
|
| 5 |
|
| 6 |
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
| 7 |
static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
|
| 8 |
+
const int64_t i = (int64_t)2*(blockDim.x*blockIdx.x + threadIdx.x);
|
| 9 |
|
| 10 |
if (i >= k) {
|
| 11 |
return;
|
| 12 |
}
|
| 13 |
|
| 14 |
const int64_t ib = i/qk; // block index
|
| 15 |
+
const int64_t iqs = (i%qk)/qr; // quant index
|
| 16 |
+
const int64_t iybs = i - i%qk; // y block start index
|
| 17 |
+
const int64_t y_offset = qr == 1 ? 1 : qk/2;
|
| 18 |
|
| 19 |
// dequantize
|
| 20 |
dfloat2 v;
|
|
|
|
| 29 |
#if __CUDA_ARCH__ >= CC_PASCAL
|
| 30 |
constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;
|
| 31 |
|
| 32 |
+
const int64_t i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x;
|
| 33 |
const int * x0 = ((int *) vx) + blockIdx.x * nint;
|
| 34 |
half2 * y2 = (half2 *) (y + i0);
|
| 35 |
|
|
|
|
| 73 |
const int64_t i = blockIdx.x;
|
| 74 |
|
| 75 |
// assume 32 threads
|
| 76 |
+
const int64_t tid = threadIdx.x;
|
| 77 |
+
const int64_t il = tid/8;
|
| 78 |
+
const int64_t ir = tid%8;
|
| 79 |
const int64_t ib = 8*i + ir;
|
| 80 |
if (ib >= nb32) {
|
| 81 |
return;
|
|
|
|
| 101 |
const int64_t i = blockIdx.x;
|
| 102 |
|
| 103 |
// assume 32 threads
|
| 104 |
+
const int64_t tid = threadIdx.x;
|
| 105 |
+
const int64_t il = tid/8;
|
| 106 |
+
const int64_t ir = tid%8;
|
| 107 |
const int64_t ib = 8*i + ir;
|
| 108 |
if (ib >= nb32) {
|
| 109 |
return;
|
|
|
|
| 127 |
template<typename dst_t>
|
| 128 |
static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 129 |
|
| 130 |
+
const int64_t i = blockIdx.x;
|
| 131 |
const block_q2_K * x = (const block_q2_K *) vx;
|
| 132 |
|
| 133 |
+
const int64_t tid = threadIdx.x;
|
| 134 |
#if QK_K == 256
|
| 135 |
+
const int64_t n = tid/32;
|
| 136 |
+
const int64_t l = tid - 32*n;
|
| 137 |
+
const int64_t is = 8*n + l/16;
|
| 138 |
|
| 139 |
const uint8_t q = x[i].qs[32*n + l];
|
| 140 |
dst_t * y = yy + i*QK_K + 128*n;
|
|
|
|
| 146 |
y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
|
| 147 |
y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
|
| 148 |
#else
|
| 149 |
+
const int64_t is = tid/16; // 0 or 1
|
| 150 |
+
const int64_t il = tid%16; // 0...15
|
| 151 |
const uint8_t q = x[i].qs[il] >> (2*is);
|
| 152 |
dst_t * y = yy + i*QK_K + 16*is + il;
|
| 153 |
float dall = __low2half(x[i].dm);
|
|
|
|
| 161 |
template<typename dst_t>
|
| 162 |
static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 163 |
|
| 164 |
+
const int64_t i = blockIdx.x;
|
| 165 |
const block_q3_K * x = (const block_q3_K *) vx;
|
| 166 |
|
| 167 |
#if QK_K == 256
|
| 168 |
+
const int64_t r = threadIdx.x/4;
|
| 169 |
+
const int64_t tid = r/2;
|
| 170 |
+
const int64_t is0 = r%2;
|
| 171 |
+
const int64_t l0 = 16*is0 + 4*(threadIdx.x%4);
|
| 172 |
+
const int64_t n = tid / 4;
|
| 173 |
+
const int64_t j = tid - 4*n;
|
| 174 |
|
| 175 |
uint8_t m = 1 << (4*n + j);
|
| 176 |
+
int64_t is = 8*n + 2*j + is0;
|
| 177 |
int shift = 2*j;
|
| 178 |
|
| 179 |
int8_t us = is < 4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
|
|
|
|
| 189 |
|
| 190 |
for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
|
| 191 |
#else
|
| 192 |
+
const int64_t tid = threadIdx.x;
|
| 193 |
+
const int64_t is = tid/16; // 0 or 1
|
| 194 |
+
const int64_t il = tid%16; // 0...15
|
| 195 |
+
const int64_t im = il/8; // 0...1
|
| 196 |
+
const int64_t in = il%8; // 0...7
|
| 197 |
|
| 198 |
dst_t * y = yy + i*QK_K + 16*is + il;
|
| 199 |
|
|
|
|
| 227 |
static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 228 |
const block_q4_K * x = (const block_q4_K *) vx;
|
| 229 |
|
| 230 |
+
const int64_t i = blockIdx.x;
|
| 231 |
|
| 232 |
#if QK_K == 256
|
| 233 |
// assume 32 threads
|
| 234 |
+
const int64_t tid = threadIdx.x;
|
| 235 |
+
const int64_t il = tid/8;
|
| 236 |
+
const int64_t ir = tid%8;
|
| 237 |
+
const int64_t is = 2*il;
|
| 238 |
+
const int64_t n = 4;
|
| 239 |
|
| 240 |
dst_t * y = yy + i*QK_K + 64*il + n*ir;
|
| 241 |
|
|
|
|
| 254 |
y[l +32] = d2 * (q[l] >> 4) - m2;
|
| 255 |
}
|
| 256 |
#else
|
| 257 |
+
const int64_t tid = threadIdx.x;
|
| 258 |
const uint8_t * q = x[i].qs;
|
| 259 |
dst_t * y = yy + i*QK_K;
|
| 260 |
const float d = (float)x[i].dm[0];
|
|
|
|
| 268 |
static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 269 |
const block_q5_K * x = (const block_q5_K *) vx;
|
| 270 |
|
| 271 |
+
const int64_t i = blockIdx.x;
|
| 272 |
|
| 273 |
#if QK_K == 256
|
| 274 |
// assume 64 threads - this is very slightly better than the one below
|
| 275 |
+
const int64_t tid = threadIdx.x;
|
| 276 |
+
const int64_t il = tid/16; // il is in 0...3
|
| 277 |
+
const int64_t ir = tid%16; // ir is in 0...15
|
| 278 |
+
const int64_t is = 2*il; // is is in 0...6
|
| 279 |
|
| 280 |
dst_t * y = yy + i*QK_K + 64*il + 2*ir;
|
| 281 |
|
|
|
|
| 298 |
y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
|
| 299 |
y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
|
| 300 |
#else
|
| 301 |
+
const int64_t tid = threadIdx.x;
|
| 302 |
const uint8_t q = x[i].qs[tid];
|
| 303 |
+
const int64_t im = tid/8; // 0...3
|
| 304 |
+
const int64_t in = tid%8; // 0...7
|
| 305 |
+
const int64_t is = tid/16; // 0 or 1
|
| 306 |
const uint8_t h = x[i].qh[in] >> im;
|
| 307 |
const float d = x[i].d;
|
| 308 |
dst_t * y = yy + i*QK_K + tid;
|
|
|
|
| 359 |
template<typename dst_t>
|
| 360 |
static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 361 |
|
| 362 |
+
const int64_t i = blockIdx.x;
|
| 363 |
const block_iq2_xxs * x = (const block_iq2_xxs *) vx;
|
| 364 |
|
| 365 |
+
const int64_t tid = threadIdx.x;
|
| 366 |
#if QK_K == 256
|
| 367 |
+
const int64_t il = tid/8; // 0...3
|
| 368 |
+
const int64_t ib = tid%8; // 0...7
|
| 369 |
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
| 370 |
const uint16_t * q2 = x[i].qs + 4*ib;
|
| 371 |
const uint8_t * aux8 = (const uint8_t *)q2;
|
|
|
|
| 383 |
template<typename dst_t>
|
| 384 |
static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 385 |
|
| 386 |
+
const int64_t i = blockIdx.x;
|
| 387 |
const block_iq2_xs * x = (const block_iq2_xs *) vx;
|
| 388 |
|
| 389 |
+
const int64_t tid = threadIdx.x;
|
| 390 |
#if QK_K == 256
|
| 391 |
+
const int64_t il = tid/8; // 0...3
|
| 392 |
+
const int64_t ib = tid%8; // 0...7
|
| 393 |
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
| 394 |
const uint16_t * q2 = x[i].qs + 4*ib;
|
| 395 |
const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511));
|
|
|
|
| 405 |
template<typename dst_t>
|
| 406 |
static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 407 |
|
| 408 |
+
const int64_t i = blockIdx.x;
|
| 409 |
const block_iq2_s * x = (const block_iq2_s *) vx;
|
| 410 |
|
| 411 |
+
const int64_t tid = threadIdx.x;
|
| 412 |
#if QK_K == 256
|
| 413 |
+
const int64_t il = tid/8; // 0...3
|
| 414 |
+
const int64_t ib = tid%8; // 0...7
|
| 415 |
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
| 416 |
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
|
| 417 |
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
|
|
|
| 426 |
template<typename dst_t>
|
| 427 |
static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 428 |
|
| 429 |
+
const int64_t i = blockIdx.x;
|
| 430 |
const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
|
| 431 |
|
| 432 |
+
const int64_t tid = threadIdx.x;
|
| 433 |
#if QK_K == 256
|
| 434 |
+
const int64_t il = tid/8; // 0...3
|
| 435 |
+
const int64_t ib = tid%8; // 0...7
|
| 436 |
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
| 437 |
const uint8_t * q3 = x[i].qs + 8*ib;
|
| 438 |
const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib;
|
|
|
|
| 454 |
template<typename dst_t>
|
| 455 |
static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 456 |
|
| 457 |
+
const int64_t i = blockIdx.x;
|
| 458 |
const block_iq3_s * x = (const block_iq3_s *) vx;
|
| 459 |
|
| 460 |
+
const int64_t tid = threadIdx.x;
|
| 461 |
#if QK_K == 256
|
| 462 |
+
const int64_t il = tid/8; // 0...3
|
| 463 |
+
const int64_t ib = tid%8; // 0...7
|
| 464 |
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
| 465 |
const uint8_t * qs = x[i].qs + 8*ib;
|
| 466 |
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)));
|
|
|
|
| 480 |
template<typename dst_t>
|
| 481 |
static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 482 |
|
| 483 |
+
const int64_t i = blockIdx.x;
|
| 484 |
const block_iq1_s * x = (const block_iq1_s *) vx;
|
| 485 |
|
| 486 |
+
const int64_t tid = threadIdx.x;
|
| 487 |
#if QK_K == 256
|
| 488 |
+
const int64_t il = tid/8; // 0...3
|
| 489 |
+
const int64_t ib = tid%8; // 0...7
|
| 490 |
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
| 491 |
const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA;
|
| 492 |
const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1);
|
|
|
|
| 506 |
template<typename dst_t>
|
| 507 |
static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 508 |
|
| 509 |
+
const int64_t i = blockIdx.x;
|
| 510 |
const block_iq1_m * x = (const block_iq1_m *) vx;
|
| 511 |
|
| 512 |
+
const int64_t tid = threadIdx.x;
|
| 513 |
#if QK_K == 256
|
| 514 |
+
const int64_t il = tid/8; // 0...3
|
| 515 |
+
const int64_t ib = tid%8; // 0...7
|
| 516 |
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
| 517 |
const uint16_t * sc = (const uint16_t *)x[i].scales;
|
| 518 |
iq1m_scale_t scale;
|
| 519 |
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
| 520 |
+
const int64_t ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4);
|
| 521 |
const float d = (float)scale.f16 * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1);
|
| 522 |
const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA;
|
| 523 |
uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
|
|
|
|
| 537 |
template<typename dst_t>
|
| 538 |
static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 539 |
|
| 540 |
+
const int64_t i = blockIdx.x;
|
| 541 |
const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);
|
| 542 |
|
| 543 |
+
const int64_t tid = threadIdx.x;
|
| 544 |
+
const int64_t il = tid/8; // 0...3
|
| 545 |
+
const int64_t ib = tid%8; // 0...7
|
| 546 |
dst_t * y = yy + i*QK_K + 32*ib + 4*il;
|
| 547 |
const uint8_t * q4 = x[ib].qs + 4*il;
|
| 548 |
const float d = (float)x[ib].d;
|
|
|
|
| 556 |
#if QK_K != 64
|
| 557 |
template<typename dst_t>
|
| 558 |
static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
| 559 |
+
const int64_t i = blockIdx.x;
|
| 560 |
const block_iq4_xs * x = (const block_iq4_xs *)vx;
|
| 561 |
|
| 562 |
+
const int64_t tid = threadIdx.x;
|
| 563 |
+
const int64_t il = tid/8; // 0...3
|
| 564 |
+
const int64_t ib = tid%8; // 0...7
|
| 565 |
dst_t * y = yy + i*QK_K + 32*ib + 4*il;
|
| 566 |
const uint8_t * q4 = x[i].qs + 16*ib + 4*il;
|
| 567 |
const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);
|
ggml-cuda/softmax.cu
CHANGED
|
@@ -28,7 +28,7 @@ static __global__ void soft_max_f32(const float * x, const float * mask, const f
|
|
| 28 |
extern __shared__ float data_soft_max_f32[];
|
| 29 |
float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication
|
| 30 |
// shared memory buffer to cache values between iterations:
|
| 31 |
-
float * vals = vals_smem ? buf_iw + WARP_SIZE : dst + rowx*ncols;
|
| 32 |
|
| 33 |
float max_val = -INFINITY;
|
| 34 |
|
|
@@ -40,8 +40,8 @@ static __global__ void soft_max_f32(const float * x, const float * mask, const f
|
|
| 40 |
break;
|
| 41 |
}
|
| 42 |
|
| 43 |
-
const
|
| 44 |
-
const
|
| 45 |
|
| 46 |
const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);
|
| 47 |
|
|
@@ -109,7 +109,7 @@ static __global__ void soft_max_f32(const float * x, const float * mask, const f
|
|
| 109 |
return;
|
| 110 |
}
|
| 111 |
|
| 112 |
-
const
|
| 113 |
dst[idst] = vals[col] * inv_sum;
|
| 114 |
}
|
| 115 |
}
|
|
|
|
| 28 |
extern __shared__ float data_soft_max_f32[];
|
| 29 |
float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication
|
| 30 |
// shared memory buffer to cache values between iterations:
|
| 31 |
+
float * vals = vals_smem ? buf_iw + WARP_SIZE : dst + (int64_t)rowx*ncols;
|
| 32 |
|
| 33 |
float max_val = -INFINITY;
|
| 34 |
|
|
|
|
| 40 |
break;
|
| 41 |
}
|
| 42 |
|
| 43 |
+
const int64_t ix = (int64_t)rowx*ncols + col;
|
| 44 |
+
const int64_t iy = (int64_t)rowy*ncols + col;
|
| 45 |
|
| 46 |
const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);
|
| 47 |
|
|
|
|
| 109 |
return;
|
| 110 |
}
|
| 111 |
|
| 112 |
+
const int64_t idst = (int64_t)rowx*ncols + col;
|
| 113 |
dst[idst] = vals[col] * inv_sum;
|
| 114 |
}
|
| 115 |
}
|