Spaces:
Running
Running
CUDA: fix softmax compile for old CUDA versions (llama/4862)
Browse files- ggml-cuda.cu +18 -16
ggml-cuda.cu
CHANGED
|
@@ -116,6 +116,8 @@
|
|
| 116 |
#include "ggml.h"
|
| 117 |
#include "ggml-backend-impl.h"
|
| 118 |
|
|
|
|
|
|
|
| 119 |
#define CC_PASCAL 600
|
| 120 |
#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
|
| 121 |
#define CC_VOLTA 700
|
|
@@ -605,16 +607,16 @@ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
|
|
| 605 |
}
|
| 606 |
|
| 607 |
static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
|
| 608 |
-
#if
|
| 609 |
-
(void) a;
|
| 610 |
-
bad_arch();
|
| 611 |
-
#else
|
| 612 |
#pragma unroll
|
| 613 |
for (int mask = 16; mask > 0; mask >>= 1) {
|
| 614 |
a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
|
| 615 |
}
|
| 616 |
return a;
|
| 617 |
-
#
|
|
|
|
|
|
|
|
|
|
| 618 |
}
|
| 619 |
|
| 620 |
static __device__ __forceinline__ float warp_reduce_max(float x) {
|
|
@@ -626,16 +628,16 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
|
|
| 626 |
}
|
| 627 |
|
| 628 |
static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
|
| 629 |
-
#if
|
| 630 |
-
(void) x;
|
| 631 |
-
bad_arch();
|
| 632 |
-
#else
|
| 633 |
#pragma unroll
|
| 634 |
for (int mask = 16; mask > 0; mask >>= 1) {
|
| 635 |
x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
|
| 636 |
}
|
| 637 |
return x;
|
| 638 |
-
#
|
|
|
|
|
|
|
|
|
|
| 639 |
}
|
| 640 |
|
| 641 |
static __device__ __forceinline__ float op_repeat(const float a, const float b) {
|
|
@@ -5613,7 +5615,7 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
|
|
| 5613 |
|
| 5614 |
template <bool vals_smem, int ncols_template, int block_size_template, bool need_check>
|
| 5615 |
static __global__ void soft_max_f16(const float * x, const float * y, float * dst, const int ncols_par, const int nrows_y, const float scale) {
|
| 5616 |
-
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
|
| 5617 |
const int ncols_data = ncols_template == 0 ? ncols_par : ncols_template;
|
| 5618 |
const int ncols_smem = GGML_PAD(ncols_data, 2*WARP_SIZE)/2;
|
| 5619 |
|
|
@@ -5738,7 +5740,7 @@ static __global__ void soft_max_f16(const float * x, const float * y, float * ds
|
|
| 5738 |
#else
|
| 5739 |
(void) x; (void) y; (void) dst; (void) ncols_par; (void) nrows_y; (void) scale;
|
| 5740 |
bad_arch();
|
| 5741 |
-
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
|
| 5742 |
}
|
| 5743 |
|
| 5744 |
template <bool vals_smem, int ncols_template, int block_size_template>
|
|
@@ -8574,15 +8576,15 @@ static void ggml_cuda_op_soft_max(
|
|
| 8574 |
float scale = 1.0f;
|
| 8575 |
memcpy(&scale, dst->op_params, sizeof(float));
|
| 8576 |
|
| 8577 |
-
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
| 8578 |
-
const bool use_f16_soft_max = false;
|
| 8579 |
-
#else
|
| 8580 |
#ifdef GGML_CUDA_F16
|
| 8581 |
const bool use_f16_soft_max = true;
|
| 8582 |
#else
|
| 8583 |
const bool use_f16_soft_max = false;
|
| 8584 |
#endif // GGML_CUDA_F16
|
| 8585 |
-
#
|
|
|
|
|
|
|
| 8586 |
|
| 8587 |
if (use_f16_soft_max) {
|
| 8588 |
soft_max_f16_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
|
|
|
|
| 116 |
#include "ggml.h"
|
| 117 |
#include "ggml-backend-impl.h"
|
| 118 |
|
| 119 |
+
#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
|
| 120 |
+
|
| 121 |
#define CC_PASCAL 600
|
| 122 |
#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
|
| 123 |
#define CC_VOLTA 700
|
|
|
|
| 607 |
}
|
| 608 |
|
| 609 |
static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
|
| 610 |
+
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
|
|
|
|
|
|
|
|
|
|
| 611 |
#pragma unroll
|
| 612 |
for (int mask = 16; mask > 0; mask >>= 1) {
|
| 613 |
a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
|
| 614 |
}
|
| 615 |
return a;
|
| 616 |
+
#else
|
| 617 |
+
(void) a;
|
| 618 |
+
bad_arch();
|
| 619 |
+
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
|
| 620 |
}
|
| 621 |
|
| 622 |
static __device__ __forceinline__ float warp_reduce_max(float x) {
|
|
|
|
| 628 |
}
|
| 629 |
|
| 630 |
static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
|
| 631 |
+
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
|
|
|
|
|
|
|
|
|
|
| 632 |
#pragma unroll
|
| 633 |
for (int mask = 16; mask > 0; mask >>= 1) {
|
| 634 |
x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
|
| 635 |
}
|
| 636 |
return x;
|
| 637 |
+
#else
|
| 638 |
+
(void) x;
|
| 639 |
+
bad_arch();
|
| 640 |
+
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
|
| 641 |
}
|
| 642 |
|
| 643 |
static __device__ __forceinline__ float op_repeat(const float a, const float b) {
|
|
|
|
| 5615 |
|
| 5616 |
template <bool vals_smem, int ncols_template, int block_size_template, bool need_check>
|
| 5617 |
static __global__ void soft_max_f16(const float * x, const float * y, float * dst, const int ncols_par, const int nrows_y, const float scale) {
|
| 5618 |
+
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
|
| 5619 |
const int ncols_data = ncols_template == 0 ? ncols_par : ncols_template;
|
| 5620 |
const int ncols_smem = GGML_PAD(ncols_data, 2*WARP_SIZE)/2;
|
| 5621 |
|
|
|
|
| 5740 |
#else
|
| 5741 |
(void) x; (void) y; (void) dst; (void) ncols_par; (void) nrows_y; (void) scale;
|
| 5742 |
bad_arch();
|
| 5743 |
+
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
|
| 5744 |
}
|
| 5745 |
|
| 5746 |
template <bool vals_smem, int ncols_template, int block_size_template>
|
|
|
|
| 8576 |
float scale = 1.0f;
|
| 8577 |
memcpy(&scale, dst->op_params, sizeof(float));
|
| 8578 |
|
| 8579 |
+
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION >= CUDART_HMAX
|
|
|
|
|
|
|
| 8580 |
#ifdef GGML_CUDA_F16
|
| 8581 |
const bool use_f16_soft_max = true;
|
| 8582 |
#else
|
| 8583 |
const bool use_f16_soft_max = false;
|
| 8584 |
#endif // GGML_CUDA_F16
|
| 8585 |
+
#else
|
| 8586 |
+
const bool use_f16_soft_max = false;
|
| 8587 |
+
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && CUDART_VERSION >= CUDART_HMAX
|
| 8588 |
|
| 8589 |
if (use_f16_soft_max) {
|
| 8590 |
soft_max_f16_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
|