JohannesGaessler commited on
Commit
5eda533
·
unverified ·
1 Parent(s): c0b17f1

CUDA: fix softmax compile for old CUDA versions (llama/4862)

Browse files
Files changed (1) hide show
  1. ggml-cuda.cu +18 -16
ggml-cuda.cu CHANGED
@@ -116,6 +116,8 @@
116
  #include "ggml.h"
117
  #include "ggml-backend-impl.h"
118
 
 
 
119
  #define CC_PASCAL 600
120
  #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
121
  #define CC_VOLTA 700
@@ -605,16 +607,16 @@ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
605
  }
606
 
607
  static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
608
- #if __CUDA_ARCH__ < CC_PASCAL || (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
609
- (void) a;
610
- bad_arch();
611
- #else
612
  #pragma unroll
613
  for (int mask = 16; mask > 0; mask >>= 1) {
614
  a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
615
  }
616
  return a;
617
- #endif // __CUDA_ARCH__ < CC_PASCAL || (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 
 
 
618
  }
619
 
620
  static __device__ __forceinline__ float warp_reduce_max(float x) {
@@ -626,16 +628,16 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
626
  }
627
 
628
  static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
629
- #if __CUDA_ARCH__ < CC_PASCAL || (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
630
- (void) x;
631
- bad_arch();
632
- #else
633
  #pragma unroll
634
  for (int mask = 16; mask > 0; mask >>= 1) {
635
  x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
636
  }
637
  return x;
638
- #endif // __CUDA_ARCH__ < CC_PASCAL || (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 
 
 
639
  }
640
 
641
  static __device__ __forceinline__ float op_repeat(const float a, const float b) {
@@ -5613,7 +5615,7 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
5613
 
5614
  template <bool vals_smem, int ncols_template, int block_size_template, bool need_check>
5615
  static __global__ void soft_max_f16(const float * x, const float * y, float * dst, const int ncols_par, const int nrows_y, const float scale) {
5616
- #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
5617
  const int ncols_data = ncols_template == 0 ? ncols_par : ncols_template;
5618
  const int ncols_smem = GGML_PAD(ncols_data, 2*WARP_SIZE)/2;
5619
 
@@ -5738,7 +5740,7 @@ static __global__ void soft_max_f16(const float * x, const float * y, float * ds
5738
  #else
5739
  (void) x; (void) y; (void) dst; (void) ncols_par; (void) nrows_y; (void) scale;
5740
  bad_arch();
5741
- #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
5742
  }
5743
 
5744
  template <bool vals_smem, int ncols_template, int block_size_template>
@@ -8574,15 +8576,15 @@ static void ggml_cuda_op_soft_max(
8574
  float scale = 1.0f;
8575
  memcpy(&scale, dst->op_params, sizeof(float));
8576
 
8577
- #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
8578
- const bool use_f16_soft_max = false;
8579
- #else
8580
  #ifdef GGML_CUDA_F16
8581
  const bool use_f16_soft_max = true;
8582
  #else
8583
  const bool use_f16_soft_max = false;
8584
  #endif // GGML_CUDA_F16
8585
- #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 
 
8586
 
8587
  if (use_f16_soft_max) {
8588
  soft_max_f16_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
 
116
  #include "ggml.h"
117
  #include "ggml-backend-impl.h"
118
 
119
+ #define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
120
+
121
  #define CC_PASCAL 600
122
  #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
123
  #define CC_VOLTA 700
 
607
  }
608
 
609
  static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
610
+ #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
 
 
 
611
  #pragma unroll
612
  for (int mask = 16; mask > 0; mask >>= 1) {
613
  a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
614
  }
615
  return a;
616
+ #else
617
+ (void) a;
618
+ bad_arch();
619
+ #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
620
  }
621
 
622
  static __device__ __forceinline__ float warp_reduce_max(float x) {
 
628
  }
629
 
630
  static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
631
+ #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
 
 
 
632
  #pragma unroll
633
  for (int mask = 16; mask > 0; mask >>= 1) {
634
  x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
635
  }
636
  return x;
637
+ #else
638
+ (void) x;
639
+ bad_arch();
640
+ #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
641
  }
642
 
643
  static __device__ __forceinline__ float op_repeat(const float a, const float b) {
 
5615
 
5616
  template <bool vals_smem, int ncols_template, int block_size_template, bool need_check>
5617
  static __global__ void soft_max_f16(const float * x, const float * y, float * dst, const int ncols_par, const int nrows_y, const float scale) {
5618
+ #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
5619
  const int ncols_data = ncols_template == 0 ? ncols_par : ncols_template;
5620
  const int ncols_smem = GGML_PAD(ncols_data, 2*WARP_SIZE)/2;
5621
 
 
5740
  #else
5741
  (void) x; (void) y; (void) dst; (void) ncols_par; (void) nrows_y; (void) scale;
5742
  bad_arch();
5743
+ #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
5744
  }
5745
 
5746
  template <bool vals_smem, int ncols_template, int block_size_template>
 
8576
  float scale = 1.0f;
8577
  memcpy(&scale, dst->op_params, sizeof(float));
8578
 
8579
+ #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION >= CUDART_HMAX
 
 
8580
  #ifdef GGML_CUDA_F16
8581
  const bool use_f16_soft_max = true;
8582
  #else
8583
  const bool use_f16_soft_max = false;
8584
  #endif // GGML_CUDA_F16
8585
+ #else
8586
+ const bool use_f16_soft_max = false;
8587
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && CUDART_VERSION >= CUDART_HMAX
8588
 
8589
  if (use_f16_soft_max) {
8590
  soft_max_f16_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);