Commit 855a9fe by JohannesGaessler
Parent: 3cae2d9

CPU/CUDA: fix (GQA) mul mat back, add CUDA support (llama/11380)
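In GQA, several query heads share a single K/V head, so the forward ggml_mul_mat broadcasts the smaller tensor across head groups; the backward pass then has to sum the per-group gradient contributions back into the shared tensor's shape. As a rough sketch of the rule (our notation, not code from this commit), with c_g = mul_mat(a, b_g) and the same a reused by every group g:

    \frac{\partial L}{\partial a} \;=\; \sum_{g} \mathrm{out\_prod}\!\left(b_g,\ \frac{\partial L}{\partial c_g}\right)

The changes below make the CPU and CUDA out_prod/repeat_back paths perform exactly this summation, and lift the previous ne[3] == 1 restriction of the CUDA repeat_back path.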
ggml/src/ggml-cpu/ggml-cpu.c CHANGED
@@ -7883,7 +7883,7 @@ static void ggml_compute_forward_out_prod_f32(
 
                 float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03));
                 float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
-                float * d  = (float *) ((char *)  dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
+                float * d  = (float *) ((char *)  dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
 
                 ggml_vec_mad_f32_unroll(ne0, nb01, nb11, d, s0, s1);
             }
@@ -7892,7 +7892,7 @@ static void ggml_compute_forward_out_prod_f32(
 
                 float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03));
                 float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
-                float * d  = (float *) ((char *)  dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
+                float * d  = (float *) ((char *)  dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
 
                 ggml_vec_mad_f32(ne0, d, s0, *s1);
             }
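Only the accumulation target d changes in these two hunks; the inner calls are axpy-style accumulates into one row of the out_prod result. For context, a simplified scalar reference of what ggml_vec_mad_f32 computes (hypothetical name, no SIMD):

    // y[i] += x[i]*v for i in [0, n): each call accumulates one column of src0,
    // scaled by a single element of src1, into the dst row that d points at.
    static void vec_mad_f32_ref(const int n, float * y, const float * x, const float v) {
        for (int i = 0; i < n; ++i) {
            y[i] += x[i]*v;
        }
    }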
ggml/src/ggml-cpu/ggml-cpu.cpp CHANGED
@@ -416,7 +416,8 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
         case GGML_OP_IM2COL_BACK:
             return src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32;
         case GGML_OP_OUT_PROD:
-            return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)) && src1->type == GGML_TYPE_F32;
+            return (src0->type == GGML_TYPE_F32 || (ggml_is_quantized(src0->type) && src0->ne[2] == src1->ne[2] && src0->ne[3] == src1->ne[3])) &&
+                src1->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
         default:
             return true;
     }
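The tightened check keeps F32 src0 as before, but only claims CPU support for a quantized src0 when no broadcasting over dims 2 and 3 is requested and the output is F32. A restatement of the condition as a standalone predicate, with the three requirements spelled out (illustrative helper, not a ggml API):

    #include "ggml.h"

    // Mirrors the new CPU support rule for GGML_OP_OUT_PROD.
    static bool supports_out_prod(const struct ggml_tensor * op) {
        const struct ggml_tensor * src0 = op->src[0];
        const struct ggml_tensor * src1 = op->src[1];
        const bool src0_ok = src0->type == GGML_TYPE_F32 ||
            (ggml_is_quantized(src0->type) &&
             src0->ne[2] == src1->ne[2] &&   // no broadcast over dim 2
             src0->ne[3] == src1->ne[3]);    // no broadcast over dim 3
        return src0_ok                       // F32 or non-broadcast quantized src0
            && src1->type == GGML_TYPE_F32   // F32 second operand
            && op->type   == GGML_TYPE_F32;  // F32 output
    }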
ggml/src/ggml-cuda/binbcast.cu CHANGED
@@ -93,26 +93,31 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * s
 
 template <typename T>
 static __global__ void k_repeat_back(
-    const T * __restrict__ src, T * __restrict__ dst, const int64_t ne00, const int64_t ne01, const int64_t ne02,
-    const int64_t ne0, const int64_t ne1, const int64_t ne2) {
+    const T * __restrict__ src, T * __restrict__ dst, const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
+    const size_t s00, const size_t s01, const size_t s02, const size_t s03,
+    const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3) {
 
-    const int64_t tid0 = (int64_t) blockIdx.x*blockDim.x + threadIdx.x;
-    const int64_t tid1 = (int64_t) blockIdx.y*blockDim.y + threadIdx.y;
-    const int64_t tid2 = (int64_t) blockIdx.z*blockDim.z + threadIdx.z;
+    const int64_t tid0  = int64_t(blockIdx.x)*blockDim.x + threadIdx.x;
+    const int64_t tid1  = int64_t(blockIdx.y)*blockDim.y + threadIdx.y;
+    const int64_t tid23 = int64_t(blockIdx.z)*blockDim.z + threadIdx.z;
+    const int64_t tid2  = tid23 % ne2;
+    const int64_t tid3  = tid23 / ne2;
 
     if (tid0 >= ne0) {
         return;
     }
 
     T sum = 0;
-    for (int64_t i2 = tid2; i2 < ne02; i2 += ne2) {
-        for (int64_t i1 = tid1; i1 < ne01; i1 += ne1) {
-            for (int64_t i0 = tid0; i0 < ne00; i0 += ne0) {
-                sum += src[i2*ne01*ne00 + i1*ne00 + i0];
+    for (int64_t i3 = tid3; i3 < ne03; i3 += ne3) {
+        for (int64_t i2 = tid2; i2 < ne02; i2 += ne2) {
+            for (int64_t i1 = tid1; i1 < ne01; i1 += ne1) {
+                for (int64_t i0 = tid0; i0 < ne00; i0 += ne0) {
+                    sum += src[i3*s03 + i2*s02 + i1*s01 + i0*s00];
+                }
            }
        }
    }
-    dst[tid2*ne1*ne0 + tid1*ne0 + tid0] = sum;
+    dst[tid3*ne2*ne1*ne0 + tid2*ne1*ne0 + tid1*ne0 + tid0] = sum;
 }
 
 template<float (*bin_op)(const float, const float)>
@@ -274,12 +279,14 @@ struct bin_bcast_cuda {
 
 template <typename T>
 static void repeat_back_cuda(
-    const T * src, T * dst, const int64_t ne00, const int64_t ne01, const int64_t ne02,
-    const int64_t ne0, const int64_t ne1, const int64_t ne2, cudaStream_t stream) {
+    const T * src, T * dst, const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
+    const size_t s00, const size_t s01, const size_t s02, const size_t s03,
+    const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) {
 
     const dim3 block_dims(WARP_SIZE, 1, 1);
-    const dim3 block_nums((ne0 + WARP_SIZE - 1) / WARP_SIZE, ne1, ne2);
-    k_repeat_back<T><<<block_nums, block_dims, 0, stream>>>(src, dst, ne00, ne01, ne02, ne0, ne1, ne2);
+    const dim3 block_nums((ne0 + WARP_SIZE - 1) / WARP_SIZE, ne1, ne2*ne3);
+    k_repeat_back<T><<<block_nums, block_dims, 0, stream>>>
+        (src, dst, ne00, ne01, ne02, ne03, s00, s01, s02, s03, ne0, ne1, ne2, ne3);
 }
 
 template<class op>
@@ -326,27 +333,26 @@ void ggml_cuda_op_repeat_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst
     const ggml_tensor * src0 = dst->src[0];
 
     GGML_ASSERT(src0->type == dst->type);
-    GGML_ASSERT(ggml_is_contiguous(src0));
     GGML_ASSERT(ggml_is_contiguous(dst));
     GGML_ASSERT(ggml_can_repeat(dst, src0));
 
     cudaStream_t stream = ctx.stream();
 
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    GGML_ASSERT(src0->ne[3] == 1);
+    GGML_TENSOR_UNARY_OP_LOCALS;
+
+    GGML_ASSERT(ne2*ne3 <= (1 << 15));
 
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-    const int64_t ne2 = dst->ne[2];
-    GGML_ASSERT(dst->ne[3] == 1);
+    const size_t ts = ggml_type_size(src0->type);
+    const size_t s00 = nb00 / ts;
+    const size_t s01 = nb01 / ts;
+    const size_t s02 = nb02 / ts;
+    const size_t s03 = nb03 / ts;
 
     switch (dst->type) {
         case GGML_TYPE_F32: {
             const float * src0_d = (const float *) src0->data;
             float * dst_d = (float *) dst->data;
-            repeat_back_cuda<float>(src0_d, dst_d, ne00, ne01, ne02, ne0, ne1, ne2, stream);
+            repeat_back_cuda(src0_d, dst_d, ne00, ne01, ne02, ne03, s00, s01, s02, s03, ne0, ne1, ne2, ne3, stream);
         } break;
         default: {
            GGML_ASSERT(false);
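The generalized kernel reduces each destination element by summing every source element that maps onto it under repetition in all four dims, and it now indexes the source through element strides s00..s03, so non-contiguous views work without the old ggml_is_contiguous(src0) requirement. A single-threaded reference of the same reduction (sketch only; the kernel assigns one thread per dst element instead of the four outer loops):

    #include <cstddef>
    #include <cstdint>

    // dst[ne0,ne1,ne2,ne3] sums the broadcast repetitions of src[ne00,ne01,ne02,ne03];
    // s00..s03 are element strides of src, dst is assumed contiguous.
    static void repeat_back_ref(const float * src, float * dst,
                                int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03,
                                size_t s00, size_t s01, size_t s02, size_t s03,
                                int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
        for (int64_t t3 = 0; t3 < ne3; ++t3) {
            for (int64_t t2 = 0; t2 < ne2; ++t2) {
                for (int64_t t1 = 0; t1 < ne1; ++t1) {
                    for (int64_t t0 = 0; t0 < ne0; ++t0) {
                        float sum = 0.0f;
                        for (int64_t i3 = t3; i3 < ne03; i3 += ne3) {
                            for (int64_t i2 = t2; i2 < ne02; i2 += ne2) {
                                for (int64_t i1 = t1; i1 < ne01; i1 += ne1) {
                                    for (int64_t i0 = t0; i0 < ne00; i0 += ne0) {
                                        sum += src[i3*s03 + i2*s02 + i1*s01 + i0*s00];
                                    }
                                }
                            }
                        }
                        dst[t3*ne2*ne1*ne0 + t2*ne1*ne0 + t1*ne0 + t0] = sum;
                    }
                }
            }
        }
    }

On the launch side, dims 2 and 3 are packed into gridDim.z, which is why both the host op (via GGML_ASSERT) and ggml_backend_cuda_device_supports_op now require ne2*ne3 <= 1 << 15 (32768, comfortably below CUDA's 65535 limit on gridDim.z).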
ggml/src/ggml-cuda/ggml-cuda.cu CHANGED
@@ -3002,7 +3002,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
             return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
         } break;
         case GGML_OP_REPEAT_BACK:
-            return op->type == GGML_TYPE_F32 && op->src[0]->ne[3] == 1;
+            return op->type == GGML_TYPE_F32 && (op->src[0]->ne[2]*op->src[0]->ne[3]) <= (1 << 15);
         case GGML_OP_CONCAT:
         {
             ggml_type src0_type = op->src[0]->type;
ggml/src/ggml-cuda/out-prod.cu CHANGED
@@ -34,6 +34,9 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 
     CUBLAS_CHECK(cublasSetStream(handle, stream));
 
+    const int64_t lda = nb01 / sizeof(float);
+    const int64_t ldc = nb1  / sizeof(float);
+
     const bool src1_T = ggml_is_transposed(src1);
     const cublasOperation_t src1_cublas_op = src1_T ? CUBLAS_OP_N : CUBLAS_OP_T;
     const int64_t ldb = (src1_T ? nb10 : nb11) / sizeof(float);
@@ -57,9 +60,9 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
             CUBLAS_CHECK(
                 cublasSgemm(handle, CUBLAS_OP_N, src1_cublas_op,
                         ne0, ne1, ne01,
-                        &alpha, src0_d + (i3/dps3)*s03 + (i2/dps2)*s02, ne00,
+                        &alpha, src0_d + (i3/dps3)*s03 + (i2/dps2)*s02, lda,
                                 src1_d + i3 *s13 + i2 *s12, ldb,
-                        &beta,  dst_d  + i3 *s3  + i2 *s2,  ne0));
+                        &beta,  dst_d  + i3 *s3  + i2 *s2,  ldc));
             }
         }
     }
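cuBLAS treats matrices as column-major with an explicit leading dimension, i.e. the number of elements between the starts of consecutive columns; that only equals the logical column height when the data is tightly packed. Deriving lda and ldc from the byte strides nb01 and nb1 keeps the GEMM correct when src0 or dst is a non-contiguous view, where ne00/ne0 would mis-index the data. A minimal standalone illustration of the leading-dimension idea (not ggml code):

    #include <cstdio>

    // A 2x2 column-major view of the top-left corner of a 4x4 buffer:
    // the leading dimension is 4 (the column height of the buffer),
    // not 2 (the logical height of the view).
    int main() {
        float buf[16];
        for (int i = 0; i < 16; ++i) {
            buf[i] = (float) i;
        }
        const int ld = 4; // elements between consecutive columns
        for (int r = 0; r < 2; ++r) {
            for (int c = 0; c < 2; ++c) {
                printf("%6.1f ", buf[c*ld + r]); // element (row r, col c)
            }
            printf("\n");
        }
        return 0;
    }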
ggml/src/ggml.c CHANGED
@@ -5343,7 +5343,7 @@ static void ggml_compute_backward(
         } break;
         case GGML_OP_MUL: {
             if (src0_needs_grads) {
-                ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, src1, grad));
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, grad, src1));
             }
             if (src1_needs_grads) {
                 struct ggml_tensor * tmp = ggml_mul(ctx, src0, grad);
@@ -5435,21 +5435,25 @@ static void ggml_compute_backward(
             // src1.shape [n,p,qq,rr]
 
             if (src0_needs_grads) {
-                struct ggml_tensor * s1_tg =
+                GGML_ASSERT(grad->ne[2] == src1->ne[2]);
+                GGML_ASSERT(grad->ne[3] == src1->ne[3]);
+                struct ggml_tensor * tmp =
                     ggml_out_prod(ctx, // [n,m,qq,rr]
                         src1,          // [n,p,qq,rr]
                         grad);         // [m,p,qq,rr]
-                const int64_t qq = s1_tg->ne[2];
-                const int64_t rr = s1_tg->ne[3];
-                const int64_t q1 = src0->ne[2];
-                const int64_t r1 = src0->ne[3];
-                const bool ne2_broadcasted = qq > q1;
-                const bool ne3_broadcasted = rr > r1;
-                if (ne2_broadcasted || ne3_broadcasted) {
-                    // sum broadcast repetitions of s1_tg into shape of src0
-                    s1_tg = ggml_repeat_back(ctx, s1_tg, src0);
+                if (!ggml_are_same_shape(tmp, src0)) {
+                    GGML_ASSERT(tmp->ne[0] == src0->ne[0]);
+                    GGML_ASSERT(tmp->ne[1] == src0->ne[1]);
+                    GGML_ASSERT(tmp->ne[3] == 1);
+
+                    const int64_t nr2 = tmp->ne[2] / src0->ne[2];
+                    const size_t nb2 = tmp->nb[2] * nr2;
+                    const size_t nb3 = tmp->nb[2];
+
+                    tmp = ggml_view_4d(ctx, tmp, src0->ne[0], src0->ne[1], src0->ne[2], nr2, tmp->nb[1], nb2, nb3, 0);
+                    tmp = ggml_repeat_back(ctx, tmp, src0);
                 }
-                ggml_add_or_set(ctx, cgraph, isrc0, s1_tg /*= [n,m,q1,r1]*/);
+                ggml_add_or_set(ctx, cgraph, isrc0, tmp);
             }
             if (src1_needs_grads) {
                 ggml_add_or_set(ctx, cgraph, isrc1,
@@ -5518,7 +5522,9 @@ static void ggml_compute_backward(
             if (src0_needs_grads) {
                 GGML_ASSERT(!cgraph->grads[isrc0] || ggml_is_contiguous(cgraph->grads[isrc0]));
                 GGML_ASSERT(ggml_is_contiguous(grad));
-                ggml_add_or_set(ctx, cgraph, isrc0, grad);
+                GGML_ASSERT(ggml_nelements(tensor) == ggml_nelements(src0));
+                ggml_add_or_set(ctx, cgraph, isrc0,
+                    ggml_are_same_shape(tensor, src0) ? grad : ggml_reshape(ctx, grad, src0));
             }
         } break;
         case GGML_OP_RESHAPE: {
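Two of the backward-pass fixes are about broadcast direction. ggml_mul broadcasts its second argument onto the first, so the gradient (which has the full src0 shape) has to come first: ggml_mul(ctx, grad, src1). For MUL_MAT with GQA, the out_prod result tmp has q1*nr2 slices along dim 2, and mul_mat's broadcast assigned slice i2 to kv head i2/nr2; the ggml_view_4d above re-labels slice i2 = h*nr2 + r as (dim2 = h, dim3 = r), so ggml_repeat_back can sum over dim 3 and fold the nr2 repetitions of each kv head into src0's gradient. A small hypothetical demo of that re-labelling (not ggml code):

    #include <cstdio>

    // With q1 kv heads each shared by nr2 query heads, slice i2 of the
    // out_prod result belongs to kv head i2/nr2. The view maps
    // (h, r) -> i2 = h*nr2 + r, so summing over r accumulates exactly
    // the slices that shared kv head h.
    int main() {
        const int q1 = 2, nr2 = 3; // 2 kv heads, 3 query heads per kv head
        for (int h = 0; h < q1; ++h) {
            printf("kv head %d sums dim-2 slices:", h);
            for (int r = 0; r < nr2; ++r) {
                printf(" %d", h*nr2 + r);
            }
            printf("\n");
        }
        return 0;
    }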