Commit 855a9fe
Parent(s): 3cae2d9

CPU/CUDA: fix (GQA) mul mat back, add CUDA support (llama/11380)

Changed files:
- ggml/src/ggml-cpu/ggml-cpu.c    +2 -2
- ggml/src/ggml-cpu/ggml-cpu.cpp  +2 -1
- ggml/src/ggml-cuda/binbcast.cu  +30 -24
- ggml/src/ggml-cuda/ggml-cuda.cu +1 -1
- ggml/src/ggml-cuda/out-prod.cu  +5 -2
- ggml/src/ggml.c                 +19 -13
ggml/src/ggml-cpu/ggml-cpu.c
CHANGED

@@ -7883,7 +7883,7 @@ static void ggml_compute_forward_out_prod_f32(
 
             float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03));
             float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
-            float * d  = (float *) ((char *)  dst->data + ( i1*nb1
+            float * d  = (float *) ((char *)  dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
 
             ggml_vec_mad_f32_unroll(ne0, nb01, nb11, d, s0, s1);
         }
@@ -7892,7 +7892,7 @@ static void ggml_compute_forward_out_prod_f32(
 
             float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03));
             float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
-            float * d  = (float *) ((char *)  dst->data + ( i1*nb1
+            float * d  = (float *) ((char *)  dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
 
             ggml_vec_mad_f32(ne0, d, s0, *s1);
         }
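Note: the two CPU hunks above only touch how the destination row pointer `d` is derived from dst's byte strides. As a rough, self-contained illustration of that addressing scheme (plain C++, not the ggml code; the `Tensor4D` struct and `row_ptr` helper are invented for the example), the sketch below indexes a 4-D buffer through per-dimension byte strides the way `nb1`/`nb2`/`nb3` are used here:

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// Hypothetical stand-in for a ggml-style tensor: ne[] are element counts,
// nb[] are strides in *bytes* (nb[0] == sizeof(element) for contiguous f32).
struct Tensor4D {
    int64_t ne[4];
    size_t  nb[4];
    void *  data;
};

// Address an f32 row the same way the patched line builds `d`:
// base pointer + i1*nb1 + i2*nb2 + i3*nb3 selects a row, then i0 indexes into it.
static float * row_ptr(const Tensor4D & t, int64_t i1, int64_t i2, int64_t i3) {
    return (float *) ((char *) t.data + i1*t.nb[1] + i2*t.nb[2] + i3*t.nb[3]);
}

int main() {
    const int64_t ne[4] = {4, 3, 2, 1};
    std::vector<float> buf(ne[0]*ne[1]*ne[2]*ne[3], 0.0f);

    Tensor4D t;
    for (int i = 0; i < 4; ++i) t.ne[i] = ne[i];
    t.nb[0] = sizeof(float);
    for (int i = 1; i < 4; ++i) t.nb[i] = t.nb[i-1] * ne[i-1]; // contiguous layout
    t.data = buf.data();

    // Write through the stride-based pointer and read the same element back linearly.
    float * d = row_ptr(t, /*i1=*/2, /*i2=*/1, /*i3=*/0);
    d[3] = 42.0f;
    printf("linear index %lld holds %.1f\n",
           (long long) (3 + 2*ne[0] + 1*ne[0]*ne[1]), buf[3 + 2*ne[0] + 1*ne[0]*ne[1]]);
    return 0;
}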
ggml/src/ggml-cpu/ggml-cpu.cpp
CHANGED

@@ -416,7 +416,8 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
         case GGML_OP_IM2COL_BACK:
             return src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32;
         case GGML_OP_OUT_PROD:
-            return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)
+            return (src0->type == GGML_TYPE_F32 || (ggml_is_quantized(src0->type) && src0->ne[2] == src1->ne[2] && src0->ne[3] == src1->ne[3])) &&
+                src1->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
         default:
             return true;
     }
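Note: the new OUT_PROD condition accepts a quantized src0 only when it is not broadcast across dims 2 and 3. A minimal sketch of that predicate over plain shape/type descriptors (illustrative C++; the `TensorDesc` struct and `Type` enum are invented, not the ggml API):

#include <array>
#include <cstdint>
#include <cstdio>

enum class Type { F32, Q4_0 };                 // illustrative subset of tensor types
static bool is_quantized(Type t) { return t == Type::Q4_0; }

struct TensorDesc {
    Type type;
    std::array<int64_t, 4> ne;                 // element counts per dimension
};

// Mirrors the OUT_PROD case added above: f32 src0 is always fine; quantized src0
// only if its outer dims match src1 (no broadcast); src1 and dst must be f32.
static bool supports_out_prod(const TensorDesc & dst, const TensorDesc & src0, const TensorDesc & src1) {
    const bool src0_ok = src0.type == Type::F32 ||
        (is_quantized(src0.type) && src0.ne[2] == src1.ne[2] && src0.ne[3] == src1.ne[3]);
    return src0_ok && src1.type == Type::F32 && dst.type == Type::F32;
}

int main() {
    TensorDesc dst  {Type::F32,  {64, 64, 32, 1}};
    TensorDesc src1 {Type::F32,  {16, 64, 32, 1}};
    TensorDesc q_eq {Type::Q4_0, {16, 64, 32, 1}};  // same outer dims: accepted
    TensorDesc q_bc {Type::Q4_0, {16, 64,  8, 1}};  // would need broadcast: rejected

    printf("quantized, matching dims:  %d\n", supports_out_prod(dst, q_eq, src1));
    printf("quantized, broadcast dims: %d\n", supports_out_prod(dst, q_bc, src1));
    return 0;
}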
ggml/src/ggml-cuda/binbcast.cu
CHANGED

@@ -93,26 +93,31 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * s
 
 template <typename T>
 static __global__ void k_repeat_back(
-    const T * __restrict__ src, T * __restrict__ dst, const int64_t ne00, const int64_t ne01, const int64_t ne02,
-    const
+    const T * __restrict__ src, T * __restrict__ dst, const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
+    const size_t s00, const size_t s01, const size_t s02, const size_t s03,
+    const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3) {
 
-    const int64_t tid0
-    const int64_t tid1
-    const int64_t
+    const int64_t tid0  = int64_t(blockIdx.x)*blockDim.x + threadIdx.x;
+    const int64_t tid1  = int64_t(blockIdx.y)*blockDim.y + threadIdx.y;
+    const int64_t tid23 = int64_t(blockIdx.z)*blockDim.z + threadIdx.z;
+    const int64_t tid2  = tid23 % ne2;
+    const int64_t tid3  = tid23 / ne2;
 
     if (tid0 >= ne0) {
        return;
     }
 
     T sum = 0;
-    for (int64_t
-    for (int64_t
-    for (int64_t
-
+    for (int64_t i3 = tid3; i3 < ne03; i3 += ne3) {
+        for (int64_t i2 = tid2; i2 < ne02; i2 += ne2) {
+            for (int64_t i1 = tid1; i1 < ne01; i1 += ne1) {
+                for (int64_t i0 = tid0; i0 < ne00; i0 += ne0) {
+                    sum += src[i3*s03 + i2*s02 + i1*s01 + i0*s00];
+                }
            }
        }
     }
-    dst[tid2*ne1*ne0 + tid1*ne0 + tid0] = sum;
+    dst[tid3*ne2*ne1*ne0 + tid2*ne1*ne0 + tid1*ne0 + tid0] = sum;
 }
 
 template<float (*bin_op)(const float, const float)>

@@ -274,12 +279,14 @@ struct bin_bcast_cuda {
 
 template <typename T>
 static void repeat_back_cuda(
-    const T * src, T * dst, const int64_t ne00, const int64_t ne01, const int64_t ne02,
-    const
+    const T * src, T * dst, const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
+    const size_t s00, const size_t s01, const size_t s02, const size_t s03,
+    const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) {
 
     const dim3 block_dims(WARP_SIZE, 1, 1);
-    const dim3 block_nums((ne0 + WARP_SIZE - 1) / WARP_SIZE, ne1, ne2);
-    k_repeat_back<T><<<block_nums, block_dims, 0, stream>>>
+    const dim3 block_nums((ne0 + WARP_SIZE - 1) / WARP_SIZE, ne1, ne2*ne3);
+    k_repeat_back<T><<<block_nums, block_dims, 0, stream>>>
+        (src, dst, ne00, ne01, ne02, ne03, s00, s01, s02, s03, ne0, ne1, ne2, ne3);
 }
 
 template<class op>

@@ -326,27 +333,26 @@ void ggml_cuda_op_repeat_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst
     const ggml_tensor * src0 = dst->src[0];
 
     GGML_ASSERT(src0->type == dst->type);
-    GGML_ASSERT(ggml_is_contiguous(src0));
     GGML_ASSERT(ggml_is_contiguous(dst));
     GGML_ASSERT(ggml_can_repeat(dst, src0));
 
     cudaStream_t stream = ctx.stream();
 
-
-
-
-    GGML_ASSERT(src0->ne[3] == 1);
+    GGML_TENSOR_UNARY_OP_LOCALS;
+
+    GGML_ASSERT(ne2*ne3 <= (1 << 15));
 
-    const
-    const
-    const
-
+    const size_t ts = ggml_type_size(src0->type);
+    const size_t s00 = nb00 / ts;
+    const size_t s01 = nb01 / ts;
+    const size_t s02 = nb02 / ts;
+    const size_t s03 = nb03 / ts;
 
     switch (dst->type) {
        case GGML_TYPE_F32: {
            const float * src0_d = (const float *) src0->data;
            float * dst_d = (float *) dst->data;
-            repeat_back_cuda
+            repeat_back_cuda(src0_d, dst_d, ne00, ne01, ne02, ne03, s00, s01, s02, s03, ne0, ne1, ne2, ne3, stream);
        } break;
        default: {
            GGML_ASSERT(false);
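Note: the reworked `k_repeat_back` folds dims 2 and 3 into the grid's z extent and has each thread stride over every source repeat of its destination coordinate. Below is a host-side reference of the same reduction (plain C++, not the CUDA kernel; the function and test values are invented, with names mirroring the kernel's parameters): each destination element accumulates every source element that maps onto it under repetition, addressed through element strides `s00..s03`.

#include <cstdint>
#include <cstdio>
#include <vector>

// Reference repeat-back: dst[ne0..ne3] accumulates src[ne00..ne03], where each
// source dimension is an integer multiple of the corresponding dst dimension.
// s00..s03 are source strides in elements (contiguous here, kept explicit to
// mirror the kernel's signature).
static void repeat_back_ref(
        const float * src, float * dst,
        const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
        const size_t s00, const size_t s01, const size_t s02, const size_t s03,
        const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3) {
    for (int64_t t3 = 0; t3 < ne3; ++t3)
    for (int64_t t2 = 0; t2 < ne2; ++t2)
    for (int64_t t1 = 0; t1 < ne1; ++t1)
    for (int64_t t0 = 0; t0 < ne0; ++t0) {
        float sum = 0.0f;
        // Same loop nest as the kernel: start at the dst coordinate and step by
        // the dst extent, visiting every repeat of that coordinate in src.
        for (int64_t i3 = t3; i3 < ne03; i3 += ne3)
        for (int64_t i2 = t2; i2 < ne02; i2 += ne2)
        for (int64_t i1 = t1; i1 < ne01; i1 += ne1)
        for (int64_t i0 = t0; i0 < ne00; i0 += ne0) {
            sum += src[i3*s03 + i2*s02 + i1*s01 + i0*s00];
        }
        dst[t3*ne2*ne1*ne0 + t2*ne1*ne0 + t1*ne0 + t0] = sum;
    }
}

int main() {
    // src is a 2x2x4x1 tensor of ones, dst is 2x2x2x1: each dst element sums two repeats.
    const int64_t ne00 = 2, ne01 = 2, ne02 = 4, ne03 = 1;
    const int64_t ne0  = 2, ne1  = 2, ne2  = 2, ne3  = 1;
    std::vector<float> src(ne00*ne01*ne02*ne03, 1.0f);
    std::vector<float> dst(ne0*ne1*ne2*ne3, 0.0f);

    repeat_back_ref(src.data(), dst.data(), ne00, ne01, ne02, ne03,
                    1, ne00, ne00*ne01, ne00*ne01*ne02, ne0, ne1, ne2, ne3);

    printf("dst[0] = %.1f (expected 2.0)\n", dst[0]);
    return 0;
}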
ggml/src/ggml-cuda/ggml-cuda.cu
CHANGED

@@ -3002,7 +3002,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                 return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
             } break;
         case GGML_OP_REPEAT_BACK:
-            return op->type == GGML_TYPE_F32 && op->src[0]->ne[3]
+            return op->type == GGML_TYPE_F32 && (op->src[0]->ne[2]*op->src[0]->ne[3]) <= (1 << 15);
         case GGML_OP_CONCAT:
             {
                 ggml_type src0_type = op->src[0]->type;
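Note: the tightened REPEAT_BACK check pairs with the launch geometry in binbcast.cu, where dims 2 and 3 now share the grid's z extent, so the backend only reports support when `ne[2]*ne[3]` stays under `1 << 15`. A small sketch of that pairing (illustrative C++; `LaunchDims` and the helper names are invented, and the grid math is copied from the launch code above):

#include <cstdint>
#include <cstdio>

// Same constant the CUDA backend uses for the x-dimension block size.
constexpr int WARP_SIZE = 32;

struct LaunchDims { int64_t x, y, z; };

// Mirrors the supports_op condition: dims 2 and 3 must fit the z budget.
static bool repeat_back_supported(int64_t ne2, int64_t ne3) {
    return ne2*ne3 <= (1 << 15);
}

// Mirrors the block_nums computation in repeat_back_cuda: z carries ne2*ne3.
static LaunchDims repeat_back_grid(int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
    return { (ne0 + WARP_SIZE - 1) / WARP_SIZE, ne1, ne2*ne3 };
}

int main() {
    const int64_t ne0 = 128, ne1 = 64, ne2 = 32, ne3 = 8;
    const LaunchDims g = repeat_back_grid(ne0, ne1, ne2, ne3);
    printf("grid = (%lld, %lld, %lld), supported = %d\n",
           (long long) g.x, (long long) g.y, (long long) g.z,
           repeat_back_supported(ne2, ne3));
    return 0;
}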
ggml/src/ggml-cuda/out-prod.cu
CHANGED

@@ -34,6 +34,9 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 
     CUBLAS_CHECK(cublasSetStream(handle, stream));
 
+    const int64_t lda = nb01 / sizeof(float);
+    const int64_t ldc = nb1  / sizeof(float);
+
     const bool src1_T = ggml_is_transposed(src1);
     const cublasOperation_t src1_cublas_op = src1_T ? CUBLAS_OP_N : CUBLAS_OP_T;
     const int64_t ldb = (src1_T ? nb10 : nb11) / sizeof(float);

@@ -57,9 +60,9 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
             CUBLAS_CHECK(
                 cublasSgemm(handle, CUBLAS_OP_N, src1_cublas_op,
                         ne0, ne1, ne01,
-                        &alpha, src0_d + (i3/dps3)*s03 + (i2/dps2)*s02,
+                        &alpha, src0_d + (i3/dps3)*s03 + (i2/dps2)*s02, lda,
                                 src1_d + i3 *s13 + i2 *s12, ldb,
-                        &beta,  dst_d  + i3 *s3  + i2 *s2,
+                        &beta,  dst_d  + i3 *s3  + i2 *s2,  ldc));
         }
     }
 }
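Note: these hunks pass explicit leading dimensions to `cublasSgemm`, derived from the tensors' byte strides (`lda = nb01 / sizeof(float)`, `ldc = nb1 / sizeof(float)`). A small stride-to-leading-dimension sketch in plain C++ (no cuBLAS call, just the column-major indexing an `ld` parameter implies; names and sizes are invented for the example):

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// In column-major storage (what cuBLAS expects), element (row i, col j) of a
// matrix with leading dimension ld lives at index i + j*ld. ggml stores the
// stride between rows/columns in bytes (nb01, nb1), so the leading dimension
// in elements is that byte stride divided by sizeof(float).
static float col_major_at(const float * a, int64_t i, int64_t j, int64_t ld) {
    return a[i + j*ld];
}

int main() {
    // A 3x2 logical matrix embedded in a buffer whose columns are padded to 5
    // floats, i.e. a byte stride of 5*sizeof(float) between columns.
    const int64_t rows = 3, cols = 2, padded_rows = 5;
    const size_t  nb_col = padded_rows * sizeof(float);     // byte stride, ggml-style
    const int64_t ld     = nb_col / sizeof(float);          // leading dimension for BLAS

    std::vector<float> buf(padded_rows * cols, 0.0f);
    for (int64_t j = 0; j < cols; ++j)
        for (int64_t i = 0; i < rows; ++i)
            buf[i + j*ld] = float(10*j + i);

    // Reading through the derived ld recovers the logical matrix despite padding.
    printf("a(2,1) = %.1f (expected 12.0), ld = %lld\n",
           col_major_at(buf.data(), 2, 1, ld), (long long) ld);
    return 0;
}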
ggml/src/ggml.c
CHANGED

@@ -5343,7 +5343,7 @@ static void ggml_compute_backward(
         } break;
         case GGML_OP_MUL: {
             if (src0_needs_grads) {
-                ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx,
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, grad, src1));
             }
             if (src1_needs_grads) {
                 struct ggml_tensor * tmp = ggml_mul(ctx, src0, grad);

@@ -5435,21 +5435,25 @@ static void ggml_compute_backward(
             // src1.shape   [n,p,qq,rr]
 
             if (src0_needs_grads) {
-
+                GGML_ASSERT(grad->ne[2] == src1->ne[2]);
+                GGML_ASSERT(grad->ne[3] == src1->ne[3]);
+                struct ggml_tensor * tmp =
                     ggml_out_prod(ctx, // [n,m,qq,rr]
                         src1,          // [n,p,qq,rr]
                         grad);         // [m,p,qq,rr]
-
-
-
-
-
-
-
-
-
+                if (!ggml_are_same_shape(tmp, src0)) {
+                    GGML_ASSERT(tmp->ne[0] == src0->ne[0]);
+                    GGML_ASSERT(tmp->ne[1] == src0->ne[1]);
+                    GGML_ASSERT(tmp->ne[3] == 1);
+
+                    const int64_t nr2 = tmp->ne[2] / src0->ne[2];
+                    const size_t nb2 = tmp->nb[2] * nr2;
+                    const size_t nb3 = tmp->nb[2];
+
+                    tmp = ggml_view_4d(ctx, tmp, src0->ne[0], src0->ne[1], src0->ne[2], nr2, tmp->nb[1], nb2, nb3, 0);
+                    tmp = ggml_repeat_back(ctx, tmp, src0);
                 }
-                ggml_add_or_set(ctx, cgraph, isrc0,
+                ggml_add_or_set(ctx, cgraph, isrc0, tmp);
             }
             if (src1_needs_grads) {
                 ggml_add_or_set(ctx, cgraph, isrc1,

@@ -5518,7 +5522,9 @@ static void ggml_compute_backward(
             if (src0_needs_grads) {
                 GGML_ASSERT(!cgraph->grads[isrc0] || ggml_is_contiguous(cgraph->grads[isrc0]));
                 GGML_ASSERT(ggml_is_contiguous(grad));
-
+                GGML_ASSERT(ggml_nelements(tensor) == ggml_nelements(src0));
+                ggml_add_or_set(ctx, cgraph, isrc0,
+                    ggml_are_same_shape(tensor, src0) ? grad : ggml_reshape(ctx, grad, src0));
             }
         } break;
         case GGML_OP_RESHAPE: {