Spaces:
Running
Running
lhez
commited on
Commit
·
94449e3
1
Parent(s):
dc68418
opencl: Noncontiguous `norm`, `rms_norm`, disable `fp16` for some ops (llama/12217)
Browse files* opencl: support noncontiguous `norm`
* opencl: support noncontiguous `rms_norm`
* opencl: disable fp16 for `ADD`, `MUL`, `SCALE`, `RELU`, `GELU`, `SILU`, `CLAMP`
ggml/src/ggml-opencl/ggml-opencl.cpp
CHANGED
|
@@ -1007,17 +1007,18 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
|
| 1007 |
case GGML_OP_ADD:
|
| 1008 |
case GGML_OP_SCALE:
|
| 1009 |
case GGML_OP_MUL:
|
| 1010 |
-
return
|
| 1011 |
case GGML_OP_UNARY:
|
| 1012 |
switch (ggml_get_unary_op(op)) {
|
| 1013 |
case GGML_UNARY_OP_GELU:
|
| 1014 |
case GGML_UNARY_OP_SILU:
|
| 1015 |
case GGML_UNARY_OP_RELU:
|
| 1016 |
-
return ggml_is_contiguous(op->src[0]);
|
| 1017 |
default:
|
| 1018 |
return false;
|
| 1019 |
}
|
| 1020 |
case GGML_OP_CLAMP:
|
|
|
|
| 1021 |
case GGML_OP_SOFT_MAX:
|
| 1022 |
case GGML_OP_NORM:
|
| 1023 |
case GGML_OP_RMS_NORM:
|
|
@@ -2573,26 +2574,33 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
| 2573 |
memcpy(&eps, dst->op_params, sizeof(float));
|
| 2574 |
|
| 2575 |
const int ne00 = src0 ? src0->ne[0] : 0;
|
| 2576 |
-
const
|
|
|
|
|
|
|
| 2577 |
|
| 2578 |
-
|
|
|
|
|
|
|
| 2579 |
|
| 2580 |
const int nth = MIN(64, ne00);
|
| 2581 |
|
| 2582 |
cl_kernel kernel = backend_ctx->kernel_norm;
|
| 2583 |
|
| 2584 |
-
CL_CHECK(clSetKernelArg(kernel,
|
| 2585 |
-
CL_CHECK(clSetKernelArg(kernel,
|
| 2586 |
-
CL_CHECK(clSetKernelArg(kernel,
|
| 2587 |
-
CL_CHECK(clSetKernelArg(kernel,
|
| 2588 |
-
CL_CHECK(clSetKernelArg(kernel,
|
| 2589 |
-
CL_CHECK(clSetKernelArg(kernel,
|
| 2590 |
-
CL_CHECK(clSetKernelArg(kernel,
|
| 2591 |
-
CL_CHECK(clSetKernelArg(kernel,
|
| 2592 |
-
|
| 2593 |
-
|
|
|
|
|
|
|
|
|
|
| 2594 |
|
| 2595 |
-
size_t global_work_size[] = {(size_t)
|
| 2596 |
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
| 2597 |
|
| 2598 |
#ifdef GGML_OPENCL_PROFILING
|
|
@@ -2630,16 +2638,19 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
| 2630 |
memcpy(&eps, dst->op_params, sizeof(float));
|
| 2631 |
|
| 2632 |
const int ne00 = src0 ? src0->ne[0] : 0;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2633 |
const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
|
|
|
|
|
|
|
| 2634 |
|
| 2635 |
GGML_ASSERT(ne00 % 4 == 0);
|
| 2636 |
-
GGML_ASSERT(ggml_is_contiguous_1(src0));
|
| 2637 |
|
| 2638 |
const int nth = MIN(64, ne00);
|
| 2639 |
|
| 2640 |
-
|
| 2641 |
-
|
| 2642 |
-
size_t global_work_size[] = {(size_t)nrows*nth, 1, 1};
|
| 2643 |
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
| 2644 |
|
| 2645 |
cl_kernel kernel = backend_ctx->kernel_rms_norm;
|
|
@@ -2654,15 +2665,20 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
| 2654 |
sizeof(local_work_size), local_work_size,
|
| 2655 |
sizeof(size_t), &sgs, NULL));
|
| 2656 |
|
| 2657 |
-
CL_CHECK(clSetKernelArg(kernel,
|
| 2658 |
-
CL_CHECK(clSetKernelArg(kernel,
|
| 2659 |
-
CL_CHECK(clSetKernelArg(kernel,
|
| 2660 |
-
CL_CHECK(clSetKernelArg(kernel,
|
| 2661 |
-
CL_CHECK(clSetKernelArg(kernel,
|
| 2662 |
-
CL_CHECK(clSetKernelArg(kernel,
|
| 2663 |
-
CL_CHECK(clSetKernelArg(kernel,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2664 |
// This is local memory - the size depends on subgroup size.
|
| 2665 |
-
CL_CHECK(clSetKernelArg(kernel,
|
| 2666 |
|
| 2667 |
#ifdef GGML_OPENCL_PROFILING
|
| 2668 |
cl_event evt;
|
|
|
|
| 1007 |
case GGML_OP_ADD:
|
| 1008 |
case GGML_OP_SCALE:
|
| 1009 |
case GGML_OP_MUL:
|
| 1010 |
+
return op->src[0]->type == GGML_TYPE_F32;
|
| 1011 |
case GGML_OP_UNARY:
|
| 1012 |
switch (ggml_get_unary_op(op)) {
|
| 1013 |
case GGML_UNARY_OP_GELU:
|
| 1014 |
case GGML_UNARY_OP_SILU:
|
| 1015 |
case GGML_UNARY_OP_RELU:
|
| 1016 |
+
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
|
| 1017 |
default:
|
| 1018 |
return false;
|
| 1019 |
}
|
| 1020 |
case GGML_OP_CLAMP:
|
| 1021 |
+
return op->src[0]->type == GGML_TYPE_F32;
|
| 1022 |
case GGML_OP_SOFT_MAX:
|
| 1023 |
case GGML_OP_NORM:
|
| 1024 |
case GGML_OP_RMS_NORM:
|
|
|
|
| 2574 |
memcpy(&eps, dst->op_params, sizeof(float));
|
| 2575 |
|
| 2576 |
const int ne00 = src0 ? src0->ne[0] : 0;
|
| 2577 |
+
const int ne01 = src0 ? src0->ne[1] : 0;
|
| 2578 |
+
const int ne02 = src0 ? src0->ne[2] : 0;
|
| 2579 |
+
const int ne03 = src0 ? src0->ne[3] : 0;
|
| 2580 |
|
| 2581 |
+
const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
|
| 2582 |
+
const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
|
| 2583 |
+
const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
|
| 2584 |
|
| 2585 |
const int nth = MIN(64, ne00);
|
| 2586 |
|
| 2587 |
cl_kernel kernel = backend_ctx->kernel_norm;
|
| 2588 |
|
| 2589 |
+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
| 2590 |
+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
| 2591 |
+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
|
| 2592 |
+
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
|
| 2593 |
+
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
|
| 2594 |
+
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
|
| 2595 |
+
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
|
| 2596 |
+
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
|
| 2597 |
+
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
|
| 2598 |
+
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
|
| 2599 |
+
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
|
| 2600 |
+
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float), &eps));
|
| 2601 |
+
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth, NULL));
|
| 2602 |
|
| 2603 |
+
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
|
| 2604 |
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
| 2605 |
|
| 2606 |
#ifdef GGML_OPENCL_PROFILING
|
|
|
|
| 2638 |
memcpy(&eps, dst->op_params, sizeof(float));
|
| 2639 |
|
| 2640 |
const int ne00 = src0 ? src0->ne[0] : 0;
|
| 2641 |
+
const int ne01 = src0 ? src0->ne[1] : 0;
|
| 2642 |
+
const int ne02 = src0 ? src0->ne[2] : 0;
|
| 2643 |
+
const int ne03 = src0 ? src0->ne[3] : 0;
|
| 2644 |
+
|
| 2645 |
const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
|
| 2646 |
+
const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
|
| 2647 |
+
const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
|
| 2648 |
|
| 2649 |
GGML_ASSERT(ne00 % 4 == 0);
|
|
|
|
| 2650 |
|
| 2651 |
const int nth = MIN(64, ne00);
|
| 2652 |
|
| 2653 |
+
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
|
|
|
|
|
|
|
| 2654 |
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
| 2655 |
|
| 2656 |
cl_kernel kernel = backend_ctx->kernel_rms_norm;
|
|
|
|
| 2665 |
sizeof(local_work_size), local_work_size,
|
| 2666 |
sizeof(size_t), &sgs, NULL));
|
| 2667 |
|
| 2668 |
+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
| 2669 |
+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
| 2670 |
+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
|
| 2671 |
+
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
|
| 2672 |
+
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
|
| 2673 |
+
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
|
| 2674 |
+
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
|
| 2675 |
+
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
|
| 2676 |
+
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
|
| 2677 |
+
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
|
| 2678 |
+
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
|
| 2679 |
+
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float), &eps));
|
| 2680 |
// This is local memory - the size depends on subgroup size.
|
| 2681 |
+
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth/sgs, NULL));
|
| 2682 |
|
| 2683 |
#ifdef GGML_OPENCL_PROFILING
|
| 2684 |
cl_event evt;
|
ggml/src/ggml-opencl/kernels/ggml-opencl.cl
CHANGED
|
@@ -506,14 +506,23 @@ kernel void kernel_norm(
|
|
| 506 |
global float * dst,
|
| 507 |
ulong offsetd,
|
| 508 |
int ne00,
|
|
|
|
|
|
|
|
|
|
| 509 |
ulong nb01,
|
|
|
|
|
|
|
| 510 |
float eps,
|
| 511 |
local float * sum
|
| 512 |
) {
|
| 513 |
src0 = (global void*)((global char*)src0 + offset0);
|
| 514 |
dst = (global void*)((global char*)dst + offsetd);
|
| 515 |
|
| 516 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 517 |
|
| 518 |
// MEAN
|
| 519 |
// parallel sum
|
|
@@ -533,7 +542,7 @@ kernel void kernel_norm(
|
|
| 533 |
|
| 534 |
// recenter and VARIANCE
|
| 535 |
barrier(CLK_LOCAL_MEM_FENCE);
|
| 536 |
-
global float * y = dst +
|
| 537 |
sum[get_local_id(0)] = 0.0f;
|
| 538 |
for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
|
| 539 |
y[i00] = x[i00] - mean;
|
|
@@ -566,14 +575,23 @@ kernel void kernel_rms_norm(
|
|
| 566 |
global float * dst,
|
| 567 |
ulong offsetd,
|
| 568 |
int ne00,
|
|
|
|
|
|
|
|
|
|
| 569 |
ulong nb01,
|
|
|
|
|
|
|
| 570 |
float eps,
|
| 571 |
local float * sum // Note, the size depends on number of subgroups
|
| 572 |
) {
|
| 573 |
src0 = (global void*)((global char*)src0 + offset0);
|
| 574 |
dst = (global float*)((global char*)dst + offsetd);
|
| 575 |
|
| 576 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 577 |
global float * x_scalar = (global float *) x;
|
| 578 |
float4 sumf = 0;
|
| 579 |
float all_sum = 0;
|
|
@@ -607,7 +625,7 @@ kernel void kernel_rms_norm(
|
|
| 607 |
const float mean = sum[0];
|
| 608 |
const float scale = 1.0f/sqrt(mean + eps);
|
| 609 |
|
| 610 |
-
global float4 * y = (global float4 *) (dst +
|
| 611 |
global float * y_scalar = (global float *) y;
|
| 612 |
for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
|
| 613 |
y[i00] = x[i00] * scale;
|
|
|
|
| 506 |
global float * dst,
|
| 507 |
ulong offsetd,
|
| 508 |
int ne00,
|
| 509 |
+
int ne01,
|
| 510 |
+
int ne02,
|
| 511 |
+
int ne03,
|
| 512 |
ulong nb01,
|
| 513 |
+
ulong nb02,
|
| 514 |
+
ulong nb03,
|
| 515 |
float eps,
|
| 516 |
local float * sum
|
| 517 |
) {
|
| 518 |
src0 = (global void*)((global char*)src0 + offset0);
|
| 519 |
dst = (global void*)((global char*)dst + offsetd);
|
| 520 |
|
| 521 |
+
int i03 = get_group_id(2);
|
| 522 |
+
int i02 = get_group_id(1);
|
| 523 |
+
int i01 = get_group_id(0);
|
| 524 |
+
|
| 525 |
+
global float * x = (global float *) ((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01);
|
| 526 |
|
| 527 |
// MEAN
|
| 528 |
// parallel sum
|
|
|
|
| 542 |
|
| 543 |
// recenter and VARIANCE
|
| 544 |
barrier(CLK_LOCAL_MEM_FENCE);
|
| 545 |
+
global float * y = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
|
| 546 |
sum[get_local_id(0)] = 0.0f;
|
| 547 |
for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
|
| 548 |
y[i00] = x[i00] - mean;
|
|
|
|
| 575 |
global float * dst,
|
| 576 |
ulong offsetd,
|
| 577 |
int ne00,
|
| 578 |
+
int ne01,
|
| 579 |
+
int ne02,
|
| 580 |
+
int ne03,
|
| 581 |
ulong nb01,
|
| 582 |
+
ulong nb02,
|
| 583 |
+
ulong nb03,
|
| 584 |
float eps,
|
| 585 |
local float * sum // Note, the size depends on number of subgroups
|
| 586 |
) {
|
| 587 |
src0 = (global void*)((global char*)src0 + offset0);
|
| 588 |
dst = (global float*)((global char*)dst + offsetd);
|
| 589 |
|
| 590 |
+
int i03 = get_group_id(2);
|
| 591 |
+
int i02 = get_group_id(1);
|
| 592 |
+
int i01 = get_group_id(0);
|
| 593 |
+
|
| 594 |
+
global float4 * x = (global float4 *) ((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01);
|
| 595 |
global float * x_scalar = (global float *) x;
|
| 596 |
float4 sumf = 0;
|
| 597 |
float all_sum = 0;
|
|
|
|
| 625 |
const float mean = sum[0];
|
| 626 |
const float scale = 1.0f/sqrt(mean + eps);
|
| 627 |
|
| 628 |
+
global float4 * y = (global float4 *) (dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
|
| 629 |
global float * y_scalar = (global float *) y;
|
| 630 |
for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
|
| 631 |
y[i00] = x[i00] * scale;
|