lhez, shawngu-quic committed
Commit 1a0281c · 1 Parent(s): 78a1865

opencl: add initial mxfp4 support via mv (llama/15270)

* opencl: add reference `mul_mv_mxfp4_f32`

* opencl: add reference `mul_mv_id` for mxfp4

* Q4_0 transpose fix for Adreno

---------

Co-authored-by: shawngu-quic <[email protected]>
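
For context, MXFP4 packs 32 weights per block: one shared E8M0 scale byte plus 16 bytes carrying two 4-bit codes each, which index a fixed 16-entry value table. The sketch below is a minimal host-side dequantizer that mirrors the tables the new kernels declare (`block_mxfp4`, `kvalues_mxfp4_f`, `e8m0_to_fp32`); the `dequant_block_mxfp4` helper itself is illustrative and not part of this commit.

```cpp
#include <cstdint>
#include <cstring>

#define QK_MXFP4 32

// Block layout used by the new kernels: one shared E8M0 scale plus 32 packed
// 4-bit codes (two per byte) indexing a fixed 16-entry value table.
struct block_mxfp4 {
    uint8_t e;                 // E8M0 shared scale (8 exponent bits, no mantissa)
    uint8_t qs[QK_MXFP4 / 2];  // low nibble -> element i, high nibble -> element i + 16
};

static const float kvalues_mxfp4[16] = {
    0.f, .5f, 1.f, 1.5f, 2.f, 3.f, 4.f, 6.f,
    -0.f, -.5f, -1.f, -1.5f, -2.f, -3.f, -4.f, -6.f,
};

// E8M0 -> float: place the byte into the float exponent field; e == 0 maps to
// the same constant the kernels use (0x00400000).
static float e8m0_to_fp32(uint8_t x) {
    uint32_t bits = (x == 0) ? 0x00400000u : (uint32_t) x << 23;
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

// Illustrative reference dequantization of one block into 32 floats.
static void dequant_block_mxfp4(const block_mxfp4 * b, float * out) {
    const float d = e8m0_to_fp32(b->e);
    for (int i = 0; i < QK_MXFP4 / 2; ++i) {
        out[i]                = d * kvalues_mxfp4[b->qs[i] & 0x0F]; // low nibble
        out[i + QK_MXFP4 / 2] = d * kvalues_mxfp4[b->qs[i] >> 4];   // high nibble
    }
}
```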

ggml/src/ggml-opencl/CMakeLists.txt CHANGED
@@ -82,7 +82,9 @@ set(GGML_OPENCL_KERNELS
     mul_mv_q4_0_f32_1d_8x_flat
     mul_mv_q4_0_f32_1d_16x_flat
     mul_mv_q6_k
+    mul_mv_mxfp4_f32
     mul_mv_id_q4_0_f32_8x_flat
+    mul_mv_id_mxfp4_f32
     mul_mm_f32_f32_l4_lm
     mul_mm_f16_f32_l4_lm
     mul
ggml/src/ggml-opencl/ggml-opencl.cpp CHANGED
@@ -365,6 +365,7 @@ struct ggml_backend_opencl_context {
     cl_program program_mul_mv_q4_0_f32_1d_8x_flat;
     cl_program program_mul_mv_q4_0_f32_1d_16x_flat;
     cl_program program_mul_mv_q6_K;
+    cl_program program_mul_mv_mxfp4_f32;
     cl_program program_mul_mv_f16_f16;
     cl_program program_mul_mv_f16_f32_1row;
     cl_program program_mul_mv_f16_f32_l4;
@@ -398,6 +399,7 @@ struct ggml_backend_opencl_context {
     cl_program program_conv_2d_f16_f32;
     cl_program program_tsembd;
     cl_program program_mul_mv_id_q4_0_f32_8x_flat;
+    cl_program program_mul_mv_id_mxfp4_f32;
     cl_program program_mul_mm_f32_f32_l4_lm;
     cl_program program_mul_mm_f16_f32_l4_lm;
 
@@ -439,6 +441,7 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_convert_block_q4_0_noshuffle;
     cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
     cl_kernel kernel_mul_mv_q6_K_f32;
+    cl_kernel kernel_mul_mv_mxfp4_f32;
     cl_kernel kernel_im2col_f32, kernel_im2col_f16;
     cl_kernel kernel_argsort_f32_i32;
     cl_kernel kernel_sum_rows_f32;
@@ -455,6 +458,7 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_conv_2d_f16_f32;
     cl_kernel kernel_timestep_embedding;
     cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
+    cl_kernel kernel_mul_mv_id_mxfp4_f32;
     cl_kernel kernel_mul_mm_f32_f32_l4_lm;
     cl_kernel kernel_mul_mm_f16_f32_l4_lm;
 
@@ -577,6 +581,7 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_transpose_32;
     cl_kernel kernel_transpose_32_16;
    cl_kernel kernel_transpose_16;
+    cl_kernel kernel_transpose_16_4x1;
 
    cl_mem A_s_d_max; // max scale buffer size for transpose
    cl_mem A_q_d_max; // max weight buffer size for transpose
@@ -971,6 +976,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }
 
+    // mul_mv_mxfp4_f32
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "mul_mv_mxfp4_f32.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("mul_mv_mxfp4_f32.cl");
+#endif
+        backend_ctx->program_mul_mv_mxfp4_f32 =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_mul_mv_mxfp4_f32 = clCreateKernel(backend_ctx->program_mul_mv_mxfp4_f32, "kernel_mul_mv_mxfp4_f32", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
     // mul_mv_f16_f16
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
@@ -1611,6 +1632,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }
 
+    // mul_mv_id_mxfp4_f32
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "mul_mv_id_mxfp4_f32.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("mul_mv_id_mxfp4_f32.cl");
+#endif
+        backend_ctx->program_mul_mv_id_mxfp4_f32 =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_mul_mv_id_mxfp4_f32 = clCreateKernel(backend_ctx->program_mul_mv_id_mxfp4_f32, "kernel_mul_mv_id_mxfp4_f32", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
     // Adreno kernels
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
     // transpose
@@ -1628,6 +1665,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         CL_CHECK((backend_ctx->kernel_transpose_32_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32_16", &err), err));
         CL_CHECK((backend_ctx->kernel_transpose_32 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32", &err), err));
         CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16", &err), err));
+        CL_CHECK((backend_ctx->kernel_transpose_16_4x1 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16_4x1", &err), err));
         GGML_LOG_CONT(".");
     }
 
@@ -2552,13 +2590,14 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
                 return true;
             } else if (op->src[0]->type == GGML_TYPE_F32) {
                 return op->src[1]->type == GGML_TYPE_F32;
-            } else if (op->src[0]->type == GGML_TYPE_Q4_0 ||
+            } else if (op->src[0]->type == GGML_TYPE_Q4_0 || op->src[0]->type == GGML_TYPE_MXFP4 ||
                        op->src[0]->type == GGML_TYPE_Q6_K) {
                 return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
             }
             return false;
         case GGML_OP_MUL_MAT_ID:
-            if (op->src[0]->type == GGML_TYPE_Q4_0) {
+            if (op->src[0]->type == GGML_TYPE_Q4_0 ||
+                op->src[0]->type == GGML_TYPE_MXFP4) {
                 if (op->src[1]->type == GGML_TYPE_F32) {
                     return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
                 }
@@ -2944,7 +2983,10 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
         // cl_mem qT_d = clCreateBuffer(context, CL_MEM_READ_WRITE, q_size_bytes, NULL, &err);
         CL_CHECK(err);
 
-        // size_t d_size_bytes = M * (K / 32) / 2 * sizeof(float);
+        bool K_tile_trans = true;
+        if ((K / 32) % 4 != 0) {
+            K_tile_trans = false;
+        }
         size_t d_size_bytes = M * (K / 32) * 2;
         region.origin = 0;
         region.size = d_size_bytes;
@@ -2985,10 +3027,15 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
         qT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
         CL_CHECK(err);
 
-        img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
         memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+        if (K_tile_trans) {
+            img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
+            img_desc_1d.image_width = M * K / 32 / 4;
+        } else {
+            img_fmt_1d = { CL_R, CL_HALF_FLOAT };
+            img_desc_1d.image_width = M * K / 32;
+        }
         img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
-        img_desc_1d.image_width = M * K / 32 / 4;
         img_desc_1d.buffer = extra->d;
         d_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
         CL_CHECK(err);
@@ -3024,6 +3071,10 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
         int width_s = K / 32 / 4;
 
         kernel = backend_ctx->kernel_transpose_16;
+        if (!K_tile_trans) {
+            kernel = backend_ctx->kernel_transpose_16_4x1;
+            width_s = K / 32;
+        }
         CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_d_image1D));
         CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &dT_d_image1D));
         CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_s));
@@ -6254,11 +6305,47 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2));
            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3));
            break;
+        case GGML_TYPE_MXFP4: {
+            kernel = backend_ctx->kernel_mul_mv_mxfp4_f32;
+
+            if (backend_ctx->gpu_family == INTEL) {
+                nth0 = 16;
+                nth1 = 2;
+                ndst = nth1*2;
+            } else if (backend_ctx->gpu_family == ADRENO) {
+                nth0 = 64;
+                nth1 = 2;
+                ndst = nth1*2;
+            } else {
+                GGML_ASSERT(false && "TODO: Unknown GPU");
+            }
+
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+            CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
+            CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
+            CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
+            CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
+            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
+            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb11));
+            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb12));
+            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb13));
+            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne0));
+            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne1));
+            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &r2));
+            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r3));
+            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(float)*nth0, nullptr));
+            break;
+        }
         default:
             GGML_ASSERT(false && "not implemented");
     }
 
-    if (src0t == GGML_TYPE_Q4_0 ||
+    if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_MXFP4 ||
         src0t == GGML_TYPE_Q4_1 ||
         src0t == GGML_TYPE_Q8_0 ||
         src0t == GGML_TYPE_Q2_K) {
@@ -6307,10 +6394,12 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
     ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
 
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
     cl_ulong offset1 = extra1->offset + src1->view_offs;
     cl_ulong offset2 = extra2->offset + src2->view_offs;
     cl_ulong offsetd = extrad->offset + dst->view_offs;
@@ -6325,7 +6414,9 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
     const int ne03 = src0->ne[3];
 
     const cl_ulong nb00 = src0->nb[0];
+    const cl_ulong nb01 = src0->nb[1];
     const cl_ulong nb02 = src0->nb[2];
+    const cl_ulong nb03 = src0->nb[3];
 
     const int ne10 = src1->ne[0];
     const int ne11 = src1->ne[1];
@@ -6334,6 +6425,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
 
     const cl_ulong nb11 = src1->nb[1];
     const cl_ulong nb12 = src1->nb[2];
+    const cl_ulong nb13 = src1->nb[3];
 
     const int ne20 = src2->ne[0];
     const int ne21 = src2->ne[1];
@@ -6401,6 +6493,49 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
 
             break;
         }
+        case GGML_TYPE_MXFP4: {
+            kernel = backend_ctx->kernel_mul_mv_id_mxfp4_f32;
+
+            if (backend_ctx->gpu_family == INTEL) {
+                sgs = 16;
+                nsg = 2;
+                ndst = 2;
+            } else if (backend_ctx->gpu_family == ADRENO) {
+                sgs = 64;
+                nsg = 2;
+                ndst = 2;
+            } else {
+                GGML_ASSERT(false && "TODO: Unknown GPU");
+            }
+
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
+            CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
+            CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
+            CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
+            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
+            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
+            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne11));
+            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne12));
+            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb11));
+            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb12));
+            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb13));
+            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne20));
+            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne21));
+            CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb21));
+            CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &ne0));
+            CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &ne1));
+            CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &r2));
+            CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &r3));
+            CL_CHECK(clSetKernelArg(kernel, 24, sizeof(float)*sgs, nullptr));
+
+            break;
+        }
         default:
             GGML_ASSERT(false && "not implemented");;
     }
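
The Q4_0 transpose fix works as follows: on the Adreno path the scale plane of a Q4_0 tensor (M rows of K/32 half values) is transposed through an image view that packs four scales per `CL_RGBA`/half4 texel, which requires K/32 to be a multiple of 4. The new `K_tile_trans` flag detects the irregular case and falls back to a single-channel `CL_R` view plus the new `kernel_transpose_16_4x1`. A minimal sketch of the selection arithmetic, with hypothetical M and K:

```cpp
#include <cstdio>

int main() {
    const int M = 4096;              // hypothetical number of rows
    const int Ks[] = {4096, 2880};   // hypothetical widths; 2880/32 = 90 is not a multiple of 4
    for (int K : Ks) {
        const bool K_tile_trans = ((K / 32) % 4 == 0);       // same test the new code adds
        const int  width_s      = K_tile_trans ? K / 32 / 4  // half4 texels per row of scales
                                               : K / 32;     // one half per texel (CL_R fallback)
        std::printf("K=%d: %s, image width = %d texels\n",
                    K,
                    K_tile_trans ? "kernel_transpose_16 (CL_RGBA half4)"
                                 : "kernel_transpose_16_4x1 (CL_R half)",
                    M * width_s);
    }
    return 0;
}
```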
ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl ADDED
@@ -0,0 +1,189 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+#ifdef cl_intel_subgroups
+#pragma OPENCL EXTENSION cl_intel_subgroups : enable
+#else
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#endif
+
+#ifdef cl_intel_required_subgroup_size
+#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
+#define INTEL_GPU 1
+#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
+#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
+#elif defined(cl_qcom_reqd_sub_group_size)
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
+#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
+#endif
+
+#define QK_MXFP4 32
+typedef struct {
+    uchar e; // E8M0
+    uchar qs[QK_MXFP4/2];
+} block_mxfp4;
+
+constant static float kvalues_mxfp4_f[16] = {
+    0, .5f, 1.f, 1.5f, 2.f, 3.f, 4.f, 6.f, -0, -.5f, -1.f, -1.5f, -2.f, -3.f, -4.f, -6.f
+};
+
+static inline float e8m0_to_fp32(uchar x) {
+    int bits;
+
+    if (x == 0) {
+        bits = 0x00400000;
+    } else {
+        bits = (uint) x << 23;
+    }
+
+    return as_float(bits);
+}
+
+#ifdef INTEL_GPU
+#define N_R0_MXFP4 2 // number of rows each subgroup works on
+#define N_SG_MXFP4 2 // number of subgroups in a work group
+#define N_SIMDWIDTH 16 // subgroup size
+#elif defined (ADRENO_GPU)
+#define N_R0_MXFP4 2
+#define N_SG_MXFP4 2
+#define N_SIMDWIDTH 64
+#endif
+
+inline void mul_mv_mxfp4_f32(
+        global char * src0,
+        global char * src1,
+        global char * dst,
+        int ne00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne12,
+        ulong nb11,
+        ulong nb12,
+        ulong nb13,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3,
+        local char * shmem
+) {
+    local float * shmem_f32 = (local float *) shmem;
+    int nb = ne00/QK_MXFP4;
+
+    int r0 = get_group_id(0);
+    int r1 = get_group_id(1);
+    int im = 0;
+
+    int first_row = (r0 * N_SG_MXFP4 + get_sub_group_id()) * N_R0_MXFP4;
+
+    uint i12 = im%ne12;
+    uint i13 = im/ne12;
+
+    ulong offset_src0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+    ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
+
+    global block_mxfp4 * x = (global block_mxfp4 *) (src0 + offset_src0);
+    global float * y = (global float *) (src1 + offset_src1);
+
+    const short ix = get_sub_group_local_id()/2; // 0...15
+    const short it = get_sub_group_local_id()%2; // 0 or 1
+
+    shmem_f32[get_sub_group_local_id()] = kvalues_mxfp4_f[get_sub_group_local_id()%16];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    float4 yl[4];
+    float sumf[N_R0_MXFP4] = {0.f};
+
+    global float * yb = y + ix * QK_MXFP4 + it * 8;
+
+    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
+        global float4 * y4 = (global float4 *)yb;
+        yl[0] = y4[0];
+        yl[1] = y4[4];
+        yl[2] = y4[1];
+        yl[3] = y4[5];
+
+        for (short row = 0; row < N_R0_MXFP4; row++) {
+            global block_mxfp4 * xb = x + row*nb + ib;
+            global uchar * q2 = (global uchar *)(xb->qs + 8*it);
+
+            float4 acc1 = yl[0]*(float4)(shmem_f32[q2[0] & 0x0F], shmem_f32[q2[1] & 0x0F], shmem_f32[q2[2] & 0x0F], shmem_f32[q2[3] & 0x0F]);
+            float4 acc2 = yl[1]*(float4)(shmem_f32[q2[0] >> 4 ], shmem_f32[q2[1] >> 4 ], shmem_f32[q2[2] >> 4 ], shmem_f32[q2[3] >> 4 ]);
+            float4 acc3 = yl[2]*(float4)(shmem_f32[q2[4] & 0x0F], shmem_f32[q2[5] & 0x0F], shmem_f32[q2[6] & 0x0F], shmem_f32[q2[7] & 0x0F]);
+            float4 acc4 = yl[3]*(float4)(shmem_f32[q2[4] >> 4 ], shmem_f32[q2[5] >> 4 ], shmem_f32[q2[6] >> 4 ], shmem_f32[q2[7] >> 4 ]);
+
+            acc1 = (acc1 + acc3) + (acc2 + acc4);
+
+            sumf[row] += e8m0_to_fp32(xb->e) * ((acc1.s0 + acc1.s1) + (acc1.s2 + acc1.s3));
+        }
+
+        yb += (N_SIMDWIDTH/2) * QK_MXFP4;
+    }
+
+    global float * dst_f32 = (global float *) dst + (ulong)im*ne0*ne1 + (ulong)r1*ne0;
+
+    for (int row = 0; row < N_R0_MXFP4 && first_row + row < ne0; ++row) {
+        float sum_all = sub_group_reduce_add(sumf[row]);
+        if (get_sub_group_local_id() == 0) {
+            dst_f32[first_row + row] = sum_all;
+        }
+    }
+}
+
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_16
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mv_id_mxfp4_f32(
+        global char * src0,
+        ulong offset0,
+        global char * src1,
+        ulong offset1,
+        global char * src2,
+        ulong offset2,
+        global char * dst,
+        ulong offsetd,
+        int ne00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne11,
+        int ne12,
+        ulong nb11,
+        ulong nb12,
+        ulong nb13,
+        int ne20,
+        int ne21,
+        ulong nb21,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3,
+        local char * shmem
+) {
+    src0 = (global char *)((global char *)src0 + offset0);
+    src1 = (global char *)((global char *)src1 + offset1);
+    src2 = (global char *)((global char *)src2 + offset2);
+    dst = (global char *)((global char *)dst + offsetd);
+
+    const int iid1 = get_group_id(2)/ne20;
+    const int idx = get_group_id(2)%ne20;
+
+    int i02 = ((global int *) (src2 + iid1*nb21))[idx];
+
+    int i11 = idx % ne11;
+    int i12 = iid1;
+
+    int i1 = idx;
+    int i2 = i12;
+
+    global char * src0_cur = src0 + i02*nb02;
+    global char * src1_cur = src1 + i11*nb11 + i12*nb12;
+
+    global char * dst_cur = dst + (i1*ne0 + i2*ne1*ne0)*sizeof(float);
+
+    mul_mv_mxfp4_f32(src0_cur, src1_cur, dst_cur,
+        ne00, nb01, nb02, nb03, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, shmem);
+}
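
In the new `mul_mv_id` path, dimension 2 of the NDRange enumerates (token, expert-slot) pairs and `src2` supplies the expert index for each slot. The sketch below replays that routing on the host, assuming one work-group per pair along dimension 2 (the launch code is not part of this diff); the shapes and expert ids are hypothetical.

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    // Hypothetical MoE routing: 3 tokens, 4 expert slots per token (ne20 = 4).
    const int ne20 = 4;
    const int n_tokens = 3;
    const std::vector<int32_t> expert_ids = {  // src2, row-major: n_tokens x ne20
        2, 5, 1, 7,
        0, 3, 3, 6,
        4, 4, 2, 1,
    };

    // Assumption: one work-group per (token, slot) pair on dimension 2.
    const int n_groups_dim2 = n_tokens * ne20;
    for (int g = 0; g < n_groups_dim2; ++g) {
        const int iid1 = g / ne20;                       // token index (iid1 in the kernel)
        const int idx  = g % ne20;                       // slot within the token (idx)
        const int i02  = expert_ids[iid1 * ne20 + idx];  // expert whose weights are read
        // In the kernel: src0_cur = src0 + i02*nb02;
        //                src1_cur = src1 + (idx % ne11)*nb11 + iid1*nb12;
        //                dst_cur  = dst  + (idx*ne0 + iid1*ne1*ne0)*sizeof(float);
        std::printf("group %2d -> token %d, slot %d, expert %d\n", g, iid1, idx, i02);
    }
    return 0;
}
```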
ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl ADDED
@@ -0,0 +1,144 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+#ifdef cl_intel_subgroups
+#pragma OPENCL EXTENSION cl_intel_subgroups : enable
+#else
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#endif
+
+#ifdef cl_intel_required_subgroup_size
+#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
+#define INTEL_GPU 1
+#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
+#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
+#elif defined(cl_qcom_reqd_sub_group_size)
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
+#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
+#endif
+
+#define QK_MXFP4 32
+typedef struct {
+    uchar e; // E8M0
+    uchar qs[QK_MXFP4/2];
+} block_mxfp4;
+
+constant static float kvalues_mxfp4_f[16] = {
+    0, .5f, 1.f, 1.5f, 2.f, 3.f, 4.f, 6.f, -0, -.5f, -1.f, -1.5f, -2.f, -3.f, -4.f, -6.f
+};
+
+static inline float e8m0_to_fp32(uchar x) {
+    int bits;
+
+    if (x == 0) {
+        bits = 0x00400000;
+    } else {
+        bits = (uint) x << 23;
+    }
+
+    return as_float(bits);
+}
+
+#ifdef INTEL_GPU
+#define N_R0_MXFP4 2 // number of rows each subgroup works on
+#define N_SG_MXFP4 2 // number of subgroups in a work group
+#define N_SIMDWIDTH 16 // subgroup size
+#elif defined (ADRENO_GPU)
+#define N_R0_MXFP4 2
+#define N_SG_MXFP4 2
+#define N_SIMDWIDTH 64
+#endif
+
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_16
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mv_mxfp4_f32(
+        global char * src0,
+        ulong offset0,
+        global char * src1,
+        ulong offset1,
+        global char * dst,
+        ulong offsetd,
+        int ne00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne12,
+        ulong nb11,
+        ulong nb12,
+        ulong nb13,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3,
+        local char * shmem
+) {
+    src0 = (global char*)((global char*)src0 + offset0);
+    src1 = (global char*)((global char*)src1 + offset1);
+    dst = (global char*)((global char*)dst + offsetd);
+
+    local float * shmem_f32 = (local float *) shmem;
+    int nb = ne00/QK_MXFP4;
+
+    int r0 = get_group_id(0);
+    int r1 = get_group_id(1);
+    int im = get_group_id(2);
+
+    int first_row = (r0 * N_SG_MXFP4 + get_sub_group_id()) * N_R0_MXFP4;
+
+    uint i12 = im%ne12;
+    uint i13 = im/ne12;
+
+    ulong offset_src0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+    ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
+
+    global block_mxfp4 * x = (global block_mxfp4 *) (src0 + offset_src0);
+    global float * y = (global float *) (src1 + offset_src1);
+
+    const short ix = get_sub_group_local_id()/2; // 0...15
+    const short it = get_sub_group_local_id()%2; // 0 or 1
+
+    shmem_f32[get_sub_group_local_id()] = kvalues_mxfp4_f[get_sub_group_local_id()%16];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    float4 yl[4];
+    float sumf[N_R0_MXFP4] = {0.f};
+
+    global float * yb = y + ix * QK_MXFP4 + it * 8;
+
+    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
+        global float4 * y4 = (global float4 *)yb;
+        yl[0] = y4[0];
+        yl[1] = y4[4];
+        yl[2] = y4[1];
+        yl[3] = y4[5];
+
+        for (short row = 0; row < N_R0_MXFP4; row++) {
+            global block_mxfp4 * xb = x + row*nb + ib;
+            global uchar * q2 = (global uchar *)(xb->qs + 8*it);
+
+            float4 acc1 = yl[0]*(float4)(shmem_f32[q2[0] & 0x0F], shmem_f32[q2[1] & 0x0F], shmem_f32[q2[2] & 0x0F], shmem_f32[q2[3] & 0x0F]);
+            float4 acc2 = yl[1]*(float4)(shmem_f32[q2[0] >> 4 ], shmem_f32[q2[1] >> 4 ], shmem_f32[q2[2] >> 4 ], shmem_f32[q2[3] >> 4 ]);
+            float4 acc3 = yl[2]*(float4)(shmem_f32[q2[4] & 0x0F], shmem_f32[q2[5] & 0x0F], shmem_f32[q2[6] & 0x0F], shmem_f32[q2[7] & 0x0F]);
+            float4 acc4 = yl[3]*(float4)(shmem_f32[q2[4] >> 4 ], shmem_f32[q2[5] >> 4 ], shmem_f32[q2[6] >> 4 ], shmem_f32[q2[7] >> 4 ]);
+
+            acc1 = (acc1 + acc3) + (acc2 + acc4);
+
+            sumf[row] += e8m0_to_fp32(xb->e) * ((acc1.s0 + acc1.s1) + (acc1.s2 + acc1.s3));
+        }
+
+        yb += (N_SIMDWIDTH/2) * QK_MXFP4;
+    }
+
+    global float * dst_f32 = (global float *) dst + (ulong)im*ne0*ne1 + (ulong)r1*ne0;
+
+    for (int row = 0; row < N_R0_MXFP4 && first_row + row < ne0; ++row) {
+        float sum_all = sub_group_reduce_add(sumf[row]);
+        if (get_sub_group_local_id() == 0) {
+            dst_f32[first_row + row] = sum_all;
+        }
+    }
+}
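
Inside `kernel_mul_mv_mxfp4_f32`, each subgroup accumulates `N_R0_MXFP4` output rows and its lanes split a row's blocks two ways: `ix = lane/2` selects the block (advancing by `N_SIMDWIDTH/2` per iteration) and `it = lane%2` selects which 8-byte half of `qs`, and the matching activations, that lane decodes. The loop below simply enumerates that split for the 16-wide Intel configuration; `ne00` is a hypothetical row length.

```cpp
#include <cstdio>

int main() {
    // Intel configuration from the kernel: N_SIMDWIDTH = 16 lanes per subgroup.
    const int N_SIMDWIDTH = 16;
    const int ne00 = 4096;             // hypothetical row length
    const int nb   = ne00 / 32;        // blocks per row (QK_MXFP4 = 32)

    for (int lane = 0; lane < N_SIMDWIDTH; ++lane) {
        const int ix = lane / 2;       // first block this lane touches
        const int it = lane % 2;       // which 8-byte half of qs it decodes
        int n_blocks = 0;
        for (int ib = ix; ib < nb; ib += N_SIMDWIDTH / 2) {
            n_blocks++;
        }
        std::printf("lane %2d: half %d of blocks %d, %d, ... (%d blocks)\n",
                    lane, it, ix, ix + N_SIMDWIDTH / 2, n_blocks);
    }
    return 0;
}
```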
ggml/src/ggml-opencl/kernels/transpose.cl CHANGED
@@ -24,6 +24,26 @@ kernel void kernel_transpose_16(
     write_imageh(output, (i_2+3)*rows+j, (half4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3));
 }
 
+// Padded kernel for irregular shape
+kernel void kernel_transpose_16_4x1(
+    __read_only image1d_buffer_t input,
+    __write_only image1d_buffer_t output,
+    const uint rows,
+    const uint cols
+) {
+
+    const int i = get_global_id(0);
+    const int j = get_global_id(1);
+    const int j_2 = j << 2;
+
+    half temp0 = read_imageh(input, (j_2 + 0) * cols + i).x;
+    half temp1 = read_imageh(input, (j_2 + 1) * cols + i).x;
+    half temp2 = read_imageh(input, (j_2 + 2) * cols + i).x;
+    half temp3 = read_imageh(input, (j_2 + 3) * cols + i).x;
+
+    write_imageh(output, i * rows + j, (half4)(temp0, temp1, temp2, temp3));
+}
+
 // 32-bit transpose, loading/storing a 4x4 tile of elements
 kernel void kernel_transpose_32(
     __read_only image1d_buffer_t input,
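
The new `kernel_transpose_16_4x1` reads one column of four consecutive input rows from a single-channel half image and writes them as one `half4` texel of the transposed output. Below is a scalar CPU sketch of the same index mapping, assuming the kernel is launched with `rows = M/4` and `cols = K/32` (consistent with the kernel's indexing and the `width_s` fallback in ggml-opencl.cpp); the dimensions are hypothetical, with `C = K/32` deliberately not a multiple of 4.

```cpp
#include <cstdio>
#include <vector>

int main() {
    // Hypothetical scale plane: M rows of C = K/32 halfs, with C not a multiple of 4.
    const int M = 8, C = 5;
    const int rows = M / 4;            // assumed height argument for the kernel
    const int cols = C;                // width_s on the fallback path (K/32)

    std::vector<float> in(M * C), out(C * M);  // floats stand in for the image halfs
    for (int r = 0; r < M; ++r) {
        for (int c = 0; c < C; ++c) {
            in[r * C + c] = (float)(r * 100 + c);
        }
    }

    // Per work-item (i, j): read column i of input rows 4j..4j+3, write one half4 texel.
    for (int i = 0; i < cols; ++i) {
        for (int j = 0; j < rows; ++j) {
            for (int k = 0; k < 4; ++k) {
                out[(i * rows + j) * 4 + k] = in[(4 * j + k) * cols + i];
            }
        }
    }

    // out now holds the C x M transpose, packed 4 M-values per texel.
    std::printf("texel 0 of output: %.0f %.0f %.0f %.0f\n", out[0], out[1], out[2], out[3]);
    return 0;
}
```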