ggml : sync latest llama.cpp (view_src + alloc improvements) (#1247)
* ggml : sync latest llama.cpp (view_src + alloc improvements)
* ggml : fix build
- ggml-cuda.cu +83 -24
- ggml-metal.m +82 -51
- ggml-metal.metal +160 -74
- ggml-opencl.cpp +7 -7
- ggml.c +502 -350
- ggml.h +21 -13
ggml-cuda.cu
CHANGED
@@ -81,12 +81,29 @@
 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300
 
+#ifndef __has_builtin
+    #define __has_builtin(x) 0
+#endif
+
 typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
 static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
     const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
     const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+#if __has_builtin(__builtin_elementwise_sub_sat)
     const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
     return reinterpret_cast<const int&>(c);
+#else
+    int8x4_t c;
+    int16_t tmp;
+#pragma unroll
+    for (int i = 0; i < 4; i++) {
+        tmp = va[i] - vb[i];
+        if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
+        if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
+        c[i] = tmp;
+    }
+    return reinterpret_cast<int&>(c);
+#endif // __has_builtin(__builtin_elementwise_sub_sat)
 }
 
 static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {

@@ -447,58 +464,91 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] / (1.0f + expf(-x[i]));
 }
 
+static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
+        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
+    }
+    return a;
+}
+
+template <int block_size>
 static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;
 
     const float eps = 1e-5f;
 
+    float2 mean_var = make_float2(0.f, 0.f);
 
+    for (int col = tid; col < ncols; col += block_size) {
         const float xi = x[row*ncols + col];
+        mean_var.x += xi;
+        mean_var.y += xi * xi;
     }
 
     // sum up partial sums
+    mean_var = warp_reduce_sum(mean_var);
+    if (block_size > WARP_SIZE) {
+        __shared__ float2 s_sum[32];
+        int warp_id = threadIdx.x / WARP_SIZE;
+        int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = mean_var;
+        }
+        __syncthreads();
+        mean_var = s_sum[lane_id];
+        mean_var = warp_reduce_sum(mean_var);
     }
 
+    const float mean = mean_var.x / ncols;
+    const float var = mean_var.y / ncols - mean * mean;
+    const float inv_std = rsqrtf(var + eps);
 
+    for (int col = tid; col < ncols; col += block_size) {
+        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
     }
 }
 
+static __device__ __forceinline__ float warp_reduce_sum(float x) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
+    }
+    return x;
+}
+
+template <int block_size>
 static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;
 
     float tmp = 0.0f; // partial sum for thread in warp
 
+    for (int col = tid; col < ncols; col += block_size) {
         const float xi = x[row*ncols + col];
         tmp += xi * xi;
     }
 
     // sum up partial sums
+    tmp = warp_reduce_sum(tmp);
+    if (block_size > WARP_SIZE) {
+        __shared__ float s_sum[32];
+        int warp_id = threadIdx.x / WARP_SIZE;
+        int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        __syncthreads();
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp);
    }
 
     const float mean = tmp / ncols;
     const float scale = rsqrtf(mean + eps);
 
+    for (int col = tid; col < ncols; col += block_size) {
         dst[row*ncols + col] = scale * x[row*ncols + col];
     }
 }

@@ -4186,14 +4236,24 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
 
 static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
+    if (ncols < 1024) {
+        const dim3 block_dims(WARP_SIZE, 1, 1);
+        norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+    } else {
+        const dim3 block_dims(1024, 1, 1);
+        norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+    }
 }
 
 static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
+    if (ncols < 1024) {
+        const dim3 block_dims(WARP_SIZE, 1, 1);
+        rms_norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+    } else {
+        const dim3 block_dims(1024, 1, 1);
+        rms_norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+    }
 }
 
 static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {

@@ -5721,7 +5781,6 @@ inline void ggml_cuda_op_alibi(
     (void) src1;
     (void) src0_ddq_i;
     (void) src1_ddf_i;
-    (void) i02;
     (void) i1;
 }
 
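The rewritten norm_f32 kernel accumulates the row sum and the sum of squares in one pass, then derives the variance as E[x^2] - (E[x])^2 before normalizing. Here is a minimal scalar sketch of the same math (plain C++, illustrative only; the function name row_norm is not part of the patch):

    #include <cmath>
    #include <cstddef>

    // Single-pass layer norm of one row, mirroring the math of the new norm_f32
    // CUDA kernel: var = E[x^2] - (E[x])^2, then (x - mean) * rsqrt(var + eps).
    static void row_norm(const float * x, float * dst, size_t ncols, float eps = 1e-5f) {
        float sum = 0.0f, sum_sq = 0.0f;
        for (size_t i = 0; i < ncols; ++i) {
            sum    += x[i];
            sum_sq += x[i] * x[i];
        }
        const float mean    = sum / ncols;
        const float var     = sum_sq / ncols - mean * mean;
        const float inv_std = 1.0f / std::sqrt(var + eps);
        for (size_t i = 0; i < ncols; ++i) {
            dst[i] = (x[i] - mean) * inv_std;
        }
    }

On the GPU the two accumulators travel together as a float2, which is why the new warp_reduce_sum(float2) helper reduces both components in the same shuffle loop.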
ggml-metal.m
CHANGED
@@ -11,6 +11,7 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
+// TODO: temporary - reuse llama.cpp logging
 #ifdef GGML_METAL_NDEBUG
 #define metal_printf(...)
 #else

@@ -75,6 +76,7 @@ struct ggml_metal_context {
     GGML_METAL_DECL_KERNEL(rms_norm);
     GGML_METAL_DECL_KERNEL(norm);
     GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
+    GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
     GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32);

@@ -113,12 +115,26 @@ static NSString * const msl_library_source = @"see metal.metal";
 @end
 
 struct ggml_metal_context * ggml_metal_init(int n_cb) {
+    metal_printf("%s: allocating\n", __func__);
+
+    // Show all the Metal device instances in the system
+    NSArray * devices = MTLCopyAllDevices();
+    id <MTLDevice> device;
+    NSString * s;
+    for (device in devices) {
+        s = [device name];
+        metal_printf("%s: found device: %s\n", __func__, [s UTF8String]);
+    }
 
+    // Pick and show default Metal device
+    device = MTLCreateSystemDefaultDevice();
+    s = [device name];
+    metal_printf("%s: picking default device: %s\n", __func__, [s UTF8String]);
 
+    // Configure context
+    struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
+    ctx->device = device;
     ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
-    ctx->device = MTLCreateSystemDefaultDevice();
     ctx->queue = [ctx->device newCommandQueue];
     ctx->n_buffers = 0;
     ctx->concur_list_len = 0;

@@ -132,7 +148,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
 
         ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error];
         if (error) {
+            metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
             return NULL;
         }
     }

@@ -146,11 +162,11 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"];
         NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
         NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
+        metal_printf("%s: loading '%s'\n", __func__, [path UTF8String]);
 
         NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
         if (error) {
+            metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
             return NULL;
         }
 

@@ -162,7 +178,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error];
 #endif
         if (error) {
+            metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
             return NULL;
         }
     }

@@ -174,11 +190,11 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
 #define GGML_METAL_ADD_KERNEL(name) \
         ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
         ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
+        metal_printf("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
                (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \
                (int) ctx->pipeline_##name.threadExecutionWidth); \
        if (error) { \
+            metal_printf("%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
            return NULL; \
        }

@@ -204,6 +220,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(rms_norm);
         GGML_METAL_ADD_KERNEL(norm);
         GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
+        GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
         GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
         GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
         GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32);

@@ -230,19 +247,19 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
 #undef GGML_METAL_ADD_KERNEL
     }
 
+    metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+    metal_printf("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
     if (ctx->device.maxTransferRate != 0) {
+        metal_printf("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
     } else {
+        metal_printf("%s: maxTransferRate = built-in GPU\n", __func__);
     }
 
     return ctx;
 }
 
 void ggml_metal_free(struct ggml_metal_context * ctx) {
+    metal_printf("%s: deallocating\n", __func__);
 #define GGML_METAL_DEL_KERNEL(name) \
     [ctx->function_##name release]; \
     [ctx->pipeline_##name release];

@@ -269,6 +286,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
     GGML_METAL_DEL_KERNEL(rms_norm);
     GGML_METAL_DEL_KERNEL(norm);
     GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
+    GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
     GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32);
     GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32);
     GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32);

@@ -311,7 +329,7 @@ void * ggml_metal_host_malloc(size_t n) {
     void * data = NULL;
     const int result = posix_memalign((void **) &data, getpagesize(), n);
     if (result != 0) {
+        metal_printf("%s: error: posix_memalign failed\n", __func__);
         return NULL;
     }
 

@@ -339,7 +357,7 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) {
 // Metal buffer based on the host memory pointer
 //
 static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
+    //metal_printf("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
 
     const int64_t tsize = ggml_nbytes(t);
 

@@ -350,13 +368,13 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
         if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
             *offs = (size_t) ioffs;
 
+            //metal_printf("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
 
             return ctx->buffers[i].metal;
         }
     }
 
+    metal_printf("%s: error: buffer is nil\n", __func__);
 
     return nil;
 }

@@ -368,7 +386,7 @@ bool ggml_metal_add_buffer(
         size_t size,
         size_t max_size) {
     if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) {
+        metal_printf("%s: too many buffers\n", __func__);
         return false;
     }
 

@@ -378,7 +396,7 @@ bool ggml_metal_add_buffer(
         const int64_t ioffs = (int64_t) data - (int64_t) ctx->buffers[i].data;
 
         if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) {
+            metal_printf("%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name);
             return false;
         }
     }

@@ -399,11 +417,11 @@ bool ggml_metal_add_buffer(
         ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
 
         if (ctx->buffers[ctx->n_buffers].metal == nil) {
+            metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
             return false;
         }
 
+        metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
 
         ++ctx->n_buffers;
     } else {

@@ -423,27 +441,27 @@ bool ggml_metal_add_buffer(
             ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
 
             if (ctx->buffers[ctx->n_buffers].metal == nil) {
+                metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
                 return false;
             }
 
+            metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
             if (i + size_step < size) {
+                metal_printf("\n");
             }
 
             ++ctx->n_buffers;
         }
     }
 
+    metal_printf(", (%8.2f / %8.2f)",
         ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
         ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
 
     if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
+        metal_printf(", warning: current allocated size is greater than the recommended max working set size\n");
     } else {
+        metal_printf("\n");
     }
 }
 

@@ -453,8 +471,6 @@ bool ggml_metal_add_buffer(
 void ggml_metal_set_tensor(
         struct ggml_metal_context * ctx,
         struct ggml_tensor * t) {
-    metal_printf("%s: set input for tensor '%s'\n", __func__, t->name);
-
     size_t offs;
     id<MTLBuffer> id_dst = ggml_metal_get_buffer(ctx, t, &offs);
 

@@ -464,8 +480,6 @@ void ggml_metal_set_tensor(
 void ggml_metal_get_tensor(
         struct ggml_metal_context * ctx,
         struct ggml_tensor * t) {
-    metal_printf("%s: extract results for tensor '%s'\n", __func__, t->name);
-
     size_t offs;
     id<MTLBuffer> id_src = ggml_metal_get_buffer(ctx, t, &offs);
 

@@ -560,15 +574,13 @@ void ggml_metal_graph_find_concurrency(
     }
 
     if (ctx->concur_list_len > GGML_MAX_CONCUR) {
+        metal_printf("%s: too many elements for metal ctx->concur_list!\n", __func__);
     }
 }
 
 void ggml_metal_graph_compute(
         struct ggml_metal_context * ctx,
         struct ggml_cgraph * gf) {
-    metal_printf("%s: evaluating graph\n", __func__);
-
     @autoreleasepool {
 
     // if there is ctx->concur_list, dispatch concurrently

@@ -616,7 +628,7 @@ void ggml_metal_graph_compute(
                 continue;
             }
 
-            metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
+            //metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
 
             struct ggml_tensor * src0 = gf->nodes[i]->src[0];
             struct ggml_tensor * src1 = gf->nodes[i]->src[1];

@@ -685,6 +697,12 @@ void ggml_metal_graph_compute(
                 } break;
             case GGML_OP_ADD:
                 {
+                    GGML_ASSERT(ggml_is_contiguous(src0));
+
+                    // utilize float4
+                    GGML_ASSERT(ne00 % 4 == 0);
+                    const int64_t nb = ne00/4;
+
                     if (ggml_nelements(src1) == ne10) {
                         // src1 is a row
                         [encoder setComputePipelineState:ctx->pipeline_add_row];

@@ -694,14 +712,20 @@ void ggml_metal_graph_compute(
                     [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                     [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
                     [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
+                    [encoder setBytes:&nb length:sizeof(nb) atIndex:3];
 
-                    const int64_t n = ggml_nelements(dst);
+                    const int64_t n = ggml_nelements(dst)/4;
 
                     [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                 } break;
             case GGML_OP_MUL:
                 {
+                    GGML_ASSERT(ggml_is_contiguous(src0));
+
+                    // utilize float4
+                    GGML_ASSERT(ne00 % 4 == 0);
+                    const int64_t nb = ne00/4;
+
                     if (ggml_nelements(src1) == ne10) {
                         // src1 is a row
                         [encoder setComputePipelineState:ctx->pipeline_mul_row];

@@ -711,9 +735,9 @@ void ggml_metal_graph_compute(
                     [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                     [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
                     [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
+                    [encoder setBytes:&nb length:sizeof(nb) atIndex:3];
 
-                    const int64_t n = ggml_nelements(dst);
+                    const int64_t n = ggml_nelements(dst)/4;
 
                     [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                 } break;

@@ -764,7 +788,7 @@ void ggml_metal_graph_compute(
                 } break;
             default:
                 {
+                    metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
                     GGML_ASSERT(false);
                 }
             } break;

@@ -845,9 +869,13 @@ void ggml_metal_graph_compute(
                 switch (src0t) {
                     case GGML_TYPE_F16:
                         {
+                            nth0 = 32;
                             nth1 = 1;
+                            if (ne11 * ne12 < 4) {
+                                [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row];
+                            } else {
+                                [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
+                            }
                         } break;
                     case GGML_TYPE_Q4_0:
                         {

@@ -899,8 +927,8 @@ void ggml_metal_graph_compute(
                             GGML_ASSERT(ne02 == 1);
                             GGML_ASSERT(ne12 == 1);
 
+                            nth0 = 4; //1;
+                            nth1 = 8; //32;
                             [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
                         } break;
                     case GGML_TYPE_Q5_K:

@@ -923,7 +951,7 @@ void ggml_metal_graph_compute(
                         } break;
                     default:
                         {
+                            metal_printf("Asserting on type %d\n",(int)src0t);
                             GGML_ASSERT(false && "not implemented");
                         }
                 };

@@ -948,9 +976,12 @@ void ggml_metal_graph_compute(
                 [encoder setBytes:&gqa length:sizeof(gqa) atIndex:17];
 
                 if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 ||
-                    src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
+                    src0t == GGML_TYPE_Q2_K) {// || src0t == GGML_TYPE_Q4_K) {
                     [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                 }
+                else if (src0t == GGML_TYPE_Q4_K) {
+                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                }
                 else if (src0t == GGML_TYPE_Q3_K) {
 #ifdef GGML_QKK_64
                     [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];

@@ -964,8 +995,8 @@ void ggml_metal_graph_compute(
                 else if (src0t == GGML_TYPE_Q6_K) {
                     [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                 } else {
+                    int64_t ny = (ne11 + 3)/4;
+                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                 }
             }
         } break;

@@ -1161,7 +1192,7 @@ void ggml_metal_graph_compute(
                 } break;
             default:
                 {
+                    metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
                     GGML_ASSERT(false);
                 }
         }

@@ -1186,7 +1217,7 @@ void ggml_metal_graph_compute(
 
         MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status];
         if (status != MTLCommandBufferStatusCompleted) {
+            metal_printf("%s: command buffer %d failed with status %lu\n", __func__, i, status);
             GGML_ASSERT(false);
         }
     }
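The host-side F16 x F32 path now picks between two kernels based on how many destination rows the batch actually has (ne11 * ne12 < 4 selects the 1-row kernel), and the batched kernel is dispatched with (ne11 + 3)/4 threadgroups along y so each threadgroup covers four src1 rows. A small sketch of that ceiling-division grid sizing (plain C++, illustrative only; ceil_div, GridSize and f16_f32_grid are not names from the patch):

    #include <cstdint>

    // Mirrors the ceil-division used when sizing the Metal grid,
    // e.g. (ne11 + 3)/4 threadgroups along y when one threadgroup handles 4 rows.
    static int64_t ceil_div(int64_t n, int64_t block) {
        return (n + block - 1) / block;
    }

    struct GridSize { int64_t x, y, z; };

    // Assumed meaning of the dims: ne01 = rows of src0, ne11 = rows of src1, ne12 = batch.
    static GridSize f16_f32_grid(int64_t ne01, int64_t ne11, int64_t ne12) {
        const bool    one_row     = ne11 * ne12 < 4; // tiny batch -> 1-row kernel
        const int64_t rows_per_tg = one_row ? 1 : 4;
        return { ne01, ceil_div(ne11, rows_per_tg), ne12 };
    }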
ggml-metal.metal
CHANGED
@@ -25,9 +25,9 @@ typedef struct {
 } block_q8_0;
 
 kernel void kernel_add(
+        device const float4 * src0,
+        device const float4 * src1,
+        device       float4 * dst,
         uint tpig[[thread_position_in_grid]]) {
     dst[tpig] = src0[tpig] + src1[tpig];
 }

@@ -35,18 +35,18 @@ kernel void kernel_add(
 // assumption: src1 is a row
 // broadcast src1 into src0
 kernel void kernel_add_row(
+        device const float4 * src0,
+        device const float4 * src1,
+        device       float4 * dst,
+        constant    int64_t & nb,
         uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = src0[tpig] + src1[tpig % nb];
 }
 
 kernel void kernel_mul(
+        device const float4 * src0,
+        device const float4 * src1,
+        device       float4 * dst,
         uint tpig[[thread_position_in_grid]]) {
     dst[tpig] = src0[tpig] * src1[tpig];
 }

@@ -54,12 +54,12 @@ kernel void kernel_mul(
 // assumption: src1 is a row
 // broadcast src1 into src0
 kernel void kernel_mul_row(
+        device const float4 * src0,
+        device const float4 * src1,
+        device       float4 * dst,
+        constant    int64_t & nb,
         uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = src0[tpig] * src1[tpig % nb];
 }
 
 kernel void kernel_scale(

@@ -133,19 +133,24 @@ kernel void kernel_soft_max(
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
 
+    //// broadcast - not needed. There is a threadgroup barrier above in the last iteration of
+    // the loop, and when that is done, buf[0] has the correct (synchronized) value
+    //if (tpitg[0] == 0) {
+    //    buf[0] = buf[0];
+    //}
 
-    threadgroup_barrier(mem_flags::mem_threadgroup);
+    //threadgroup_barrier(mem_flags::mem_threadgroup);
 
     const float max = buf[0];
 
     // parallel sum
     buf[tpitg[0]] = 0.0f;
     for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
+        const float exp_psrc0 = exp(psrc0[i00] - max);
+        buf[tpitg[0]] += exp_psrc0;
+        // Remember the result of exp here. exp is expensive, so we really do not
+        // whish to compute it twice.
+        pdst[i00] = exp_psrc0;
     }
 
     // reduce

@@ -157,17 +162,18 @@ kernel void kernel_soft_max(
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
 
-    // broadcast
+    // broadcast - not needed, see above
+    //// broadcast
+    //if (tpitg[0] == 0) {
+    //    buf[0] = buf[0];
+    //}
 
-    threadgroup_barrier(mem_flags::mem_threadgroup);
+    //threadgroup_barrier(mem_flags::mem_threadgroup);
 
     const float sum = buf[0];
 
     for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
+        pdst[i00] /= sum;
     }
 }
 

@@ -214,25 +220,27 @@ kernel void kernel_norm(
     }
     threadgroup_barrier(mem_flags::mem_threadgroup);
     }
-
-    if (tpitg == 0) {
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
+    //// broadcast
+    //if (tpitg == 0) {
+    //    sum[0] /= ne00;
+    //}
+    //threadgroup_barrier(mem_flags::mem_threadgroup);
     const float mean  = sum[0];
 
-    // recenter
+    // recenter and VARIANCE
     device float * y = dst + tgpig*ne00;
-    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        y[i00] = x[i00] - mean;
-    }
-
-    // VARIANCE
-    // parallel sum
     sum[tpitg] = 0.0f;
     for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
+        y[i00] = x[i00] - mean;
         sum[tpitg] += y[i00] * y[i00];
     }
+
+    //// VARIANCE
+    //// parallel sum
+    //sum[tpitg] = 0.0f;
+    //for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
+    //    sum[tpitg] += y[i00] * y[i00];
+    //}
     // reduce
     threadgroup_barrier(mem_flags::mem_threadgroup);
     for (uint i = ntg/2; i > 0; i /= 2) {

@@ -241,11 +249,11 @@ kernel void kernel_norm(
     }
     threadgroup_barrier(mem_flags::mem_threadgroup);
     }
-
-    if (tpitg == 0) {
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
+    //// broadcast
+    //if (tpitg == 0) {
+    //    sum[0] /= ne00;
+    //}
+    //threadgroup_barrier(mem_flags::mem_threadgroup);
     const float variance = sum[0];
 
     const float scale = 1.0f/sqrt(variance + eps);

@@ -435,6 +443,8 @@ kernel void kernel_mul_mat_q4_1_f32(
     mul_vec_q_n_f32<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
 }
 
+#define NB_Q8_0 8
+
 kernel void kernel_mul_mat_q8_0_f32(
         device const void * src0,
         device const float * src1,

@@ -463,30 +473,30 @@ kernel void kernel_mul_mat_q8_0_f32(
     device const block_q8_0 * x = (device const block_q8_0 *) src0 + offset0;
     device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
 
+    float yl[NB_Q8_0];
     float sumf[nr]={0.f};
 
+    const int ix = tiisg/4;
+    const int il = tiisg%4;
 
+    device const float * yb = y + ix * QK8_0 + NB_Q8_0*il;
 
+    // each thread in a SIMD group deals with NB_Q8_0 quants at a time
+    for (int ib = ix; ib < nb; ib += nw/4) {
+        for (int i = 0; i < NB_Q8_0; ++i) {
            yl[i] = yb[i];
        }
 
        for (int row = 0; row < nr; row++) {
+            device const int8_t * qs = x[ib+row*nb].qs + NB_Q8_0*il;
            float sumq = 0.f;
+            for (int iq = 0; iq < NB_Q8_0; ++iq) {
                sumq += qs[iq] * yl[iq];
            }
            sumf[row] += sumq*x[ib+row*nb].d;
        }
 
+        yb += NB_Q8_0 * nw;
    }
 
    for (int row = 0; row < nr; ++row) {

@@ -497,7 +507,7 @@ kernel void kernel_mul_mat_q8_0_f32(
     }
 }
 
+kernel void kernel_mul_mat_f16_f32_1row(
         device const  char * src0,
         device const  char * src1,
         device       float * dst,

@@ -515,11 +525,8 @@ kernel void kernel_mul_mat_f16_f32(
         constant  uint64_t & nb12,
         constant   int64_t & ne0,
         constant   int64_t & ne1,
-        threadgroup float  * sum [[threadgroup(0)]],
         uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3  tptg[[threads_per_threadgroup]]) {
+        uint  tiisg[[thread_index_in_simdgroup]]) {
 
     const int64_t r0 = tgpig.x;
     const int64_t r1 = tgpig.y;

@@ -528,24 +535,102 @@ kernel void kernel_mul_mat_f16_f32(
     device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
     device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
 
+    float sumf = 0;
+    if (ne00 < 128) {
+        for (int i = tiisg; i < ne00; i += 32) {
+            sumf += (float) x[i] * (float) y[i];
+        }
+        float all_sum = simd_sum(sumf);
+        if (tiisg == 0) {
+            dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+        }
+    } else {
+        device const half4  * x4 = (device const half4 *) x;
+        device const float4 * y4 = (device const float4 *) y;
+        for (int i = tiisg; i < ne00/4; i += 32) {
+            for (int k = 0; k < 4; ++k) sumf += (float)x4[i][k] * y4[i][k];
+        }
+        float all_sum = simd_sum(sumf);
+        if (tiisg == 0) {
+            for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
+            dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+        }
     }
 
+}
+
+#define N_F16_F32 4
+
+kernel void kernel_mul_mat_f16_f32(
+        device const  char * src0,
+        device const  char * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant   int64_t & ne10,
+        constant   int64_t & ne11,
+        constant   int64_t & ne12,
+        constant  uint64_t & nb10,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb12,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint  tiisg[[thread_index_in_simdgroup]]) {
+
+    const int64_t r0 = tgpig.x;
+    const int64_t rb = tgpig.y*N_F16_F32;
+    const int64_t im = tgpig.z;
+
+    device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
+
+    if (ne00 < 128) {
+        for (int row = 0; row < N_F16_F32; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) {
+                break;
+            }
+
+            device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
+
+            float sumf = 0;
+            for (int i = tiisg; i < ne00; i += 32) {
+                sumf += (float) x[i] * (float) y[i];
+            }
+
+            float all_sum = simd_sum(sumf);
+            if (tiisg == 0) {
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
         }
+    } else {
+        device const half4 * x4 = (device const half4 *)x;
+        for (int row = 0; row < N_F16_F32; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) {
+                break;
+            }
+
+            device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
+            device const float4 * y4 = (device const float4 *) y;
 
+            float sumf = 0;
+            for (int i = tiisg; i < ne00/4; i += 32) {
+                for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
+            }
+
+            float all_sum = simd_sum(sumf);
+            if (tiisg == 0) {
+                for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
     }
+
 }
 
 kernel void kernel_alibi_f32(

@@ -1244,7 +1329,8 @@ kernel void kernel_mul_mat_q4_K_f32(
     const int r0 = tgpig.x;
     const int r1 = tgpig.y;
     const int r2 = tgpig.z;
-    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+    //const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+    const int first_row = r0 * N_DST;
     const int ib_row = first_row * nb;
     const uint offset0 = r2/gqa*(nb*ne0);
     device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0;
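The kernel_soft_max change caches exp(x - max) in the destination buffer while the partial sums are being accumulated, so the expensive exp is evaluated once per element and the final pass only divides by the total. A scalar sketch of the same idea (plain C++, illustrative only; softmax_row is not a name from the patch):

    #include <cmath>
    #include <cstddef>

    // Softmax that caches exp(x - max) in dst, as the updated Metal kernel does.
    static void softmax_row(const float * x, float * dst, size_t n) {
        float max = x[0];
        for (size_t i = 1; i < n; ++i) max = x[i] > max ? x[i] : max;

        float sum = 0.0f;
        for (size_t i = 0; i < n; ++i) {
            const float e = std::exp(x[i] - max); // computed once ...
            dst[i] = e;                           // ... and cached in dst
            sum += e;
        }
        for (size_t i = 0; i < n; ++i) dst[i] /= sum;
    }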
ggml-opencl.cpp
CHANGED

@@ -1334,7 +1334,7 @@ void ggml_cl_free_data(const struct ggml_tensor* tensor) {
        return;
    }

-   cl_mem mem = (cl_mem)tensor->data;
+   cl_mem mem = (cl_mem)tensor->extra;
    clReleaseMemObject(mem);
}

@@ -1393,7 +1393,7 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
    size_t d_size;

    cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
-   cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted.
+   cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
    cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst

@@ -1491,9 +1491,9 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
    size_t d_size;
    cl_mem d_X;
    if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
-       d_X = (cl_mem) src0->data;
+       d_X = (cl_mem) src0->extra;
    } else {
-       d_X = ggml_cl_pool_malloc(sizeof(
+       d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
    }
    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);

@@ -1567,7 +1567,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
    size_t d_size;
    cl_mem d_X;
    if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
-       d_X = (cl_mem) src0->data;
+       d_X = (cl_mem) src0->extra;
    } else {
        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
    }

@@ -1697,7 +1697,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
            events.emplace_back();
            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
        } else if (src0->backend == GGML_BACKEND_GPU) {
-           d_Q = (cl_mem) src0->data;
+           d_Q = (cl_mem) src0->extra;
        } else {
            GGML_ASSERT(false);
        }

@@ -1860,6 +1860,6 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {

    CL_CHECK(clFinish(queue));

-   tensor->data = dst;
+   tensor->extra = dst;
    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
}
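In short, the OpenCL backend now keeps its device buffer handle in the tensor's extra field instead of overloading data, which presumably leaves data free for the new allocator. A minimal hedged sketch of reading the handle back; the helper name ggml_cl_tensor_mem is made up for illustration and is not part of the diff:

static cl_mem ggml_cl_tensor_mem(const struct ggml_tensor * t) {
    // only meaningful for tensors that live on the GPU: extra holds the cl_mem
    GGML_ASSERT(t->backend == GGML_BACKEND_GPU);
    return (cl_mem) t->extra;
}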
ggml.c
CHANGED

@@ -47,6 +47,10 @@
    // disable "possible loss of data" to avoid hundreds of casts
    // we should just be careful :)
    #pragma warning(disable: 4244 4267)
    #endif

    #if defined(_WIN32)

@@ -123,6 +127,8 @@ typedef void * thread_ret_t;
    #define GGML_GELU_FP16
    #define GGML_GELU_QUICK_FP16
    #define GGML_SILU_FP16

    #define GGML_SOFT_MAX_UNROLL 4
    #define GGML_VEC_DOT_UNROLL 2

@@ -186,8 +192,8 @@ typedef void * thread_ret_t;
    //

    #if defined(_MSC_VER) || defined(__MINGW32__)
-   #define GGML_ALIGNED_MALLOC(size)
-   #define GGML_ALIGNED_FREE(ptr)
    #else
    inline static void * ggml_aligned_malloc(size_t size) {
        void * aligned_memory = NULL;

@@ -212,8 +218,8 @@ inline static void * ggml_aligned_malloc(size_t size) {
        }
        return aligned_memory;
    }
-   #define GGML_ALIGNED_MALLOC(size)
-   #define GGML_ALIGNED_FREE(ptr)
    #endif

    #define UNUSED GGML_UNUSED

@@ -301,6 +307,10 @@ typedef double ggml_float;
    #endif
    #endif

    #ifdef __F16C__

    #ifdef _MSC_VER

@@ -665,7 +675,7 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) {
    }

    static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
-   #
        const __m256i zero = _mm256_setzero_si256();
        const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
        return _mm256_cvtepi32_ps(summed_pairs);

@@ -678,7 +688,7 @@ static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy)

    // multiply int8_t, add results pairwise twice and return as float vector
    static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
-   #
        const __m256i zero = _mm256_setzero_si256();
        const __m256i summed_pairs = _mm256_dpbssd_epi32(zero, x, y);
        return _mm256_cvtepi32_ps(summed_pairs);

@@ -694,7 +704,7 @@ static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
    static inline __m128i packNibbles( __m256i bytes )
    {
        // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
-   #
        const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4); // 0000_0000_abcd_0000
        bytes = _mm256_or_si256(bytes, bytes_srli_4); // 0000_abcd_abcd_efgh
        return _mm256_cvtepi16_epi8(bytes); // abcd_efgh
@@ -813,46 +823,6 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128

    #if !defined(__aarch64__)

-   inline static uint16_t vaddvq_u8(uint8x16_t v) {
-       return
-           (uint16_t)vgetq_lane_u8(v, 0)  + (uint16_t)vgetq_lane_u8(v, 1)  +
-           (uint16_t)vgetq_lane_u8(v, 2)  + (uint16_t)vgetq_lane_u8(v, 3)  +
-           (uint16_t)vgetq_lane_u8(v, 4)  + (uint16_t)vgetq_lane_u8(v, 5)  +
-           (uint16_t)vgetq_lane_u8(v, 6)  + (uint16_t)vgetq_lane_u8(v, 7)  +
-           (uint16_t)vgetq_lane_u8(v, 8)  + (uint16_t)vgetq_lane_u8(v, 9)  +
-           (uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) +
-           (uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) +
-           (uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15);
-   }
-
-   inline static int16_t vaddvq_s8(int8x16_t v) {
-       return
-           (int16_t)vgetq_lane_s8(v, 0)  + (int16_t)vgetq_lane_s8(v, 1)  +
-           (int16_t)vgetq_lane_s8(v, 2)  + (int16_t)vgetq_lane_s8(v, 3)  +
-           (int16_t)vgetq_lane_s8(v, 4)  + (int16_t)vgetq_lane_s8(v, 5)  +
-           (int16_t)vgetq_lane_s8(v, 6)  + (int16_t)vgetq_lane_s8(v, 7)  +
-           (int16_t)vgetq_lane_s8(v, 8)  + (int16_t)vgetq_lane_s8(v, 9)  +
-           (int16_t)vgetq_lane_s8(v, 10) + (int16_t)vgetq_lane_s8(v, 11) +
-           (int16_t)vgetq_lane_s8(v, 12) + (int16_t)vgetq_lane_s8(v, 13) +
-           (int16_t)vgetq_lane_s8(v, 14) + (int16_t)vgetq_lane_s8(v, 15);
-   }
-
-   inline static int32_t vaddvq_s16(int16x8_t v) {
-       return
-           (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
-           (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
-           (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
-           (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
-   }
-
-   inline static uint32_t vaddvq_u16(uint16x8_t v) {
-       return
-           (uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) +
-           (uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) +
-           (uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) +
-           (uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7);
-   }
-
    inline static int32_t vaddvq_s32(int32x4_t v) {
        return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
    }

@@ -861,12 +831,6 @@ inline static float vaddvq_f32(float32x4_t v) {
        return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
    }

-   inline static float vminvq_f32(float32x4_t v) {
-       return
-           MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
-               MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
-   }
-
    inline static float vmaxvq_f32(float32x4_t v) {
        return
            MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
@@ -1294,7 +1258,6 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
    #endif
    }
    #else
-   (void)nb;
    // scalar
    quantize_row_q8_0_reference(x, y, k);
    #endif

@@ -1513,7 +1476,6 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int
    #endif
    }
    #else
-   (void)nb;
    // scalar
    quantize_row_q8_1_reference(x, y, k);
    #endif
@@ -2679,6 +2641,41 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
    }

    *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
    #else
    // scalar
    float sumf = 0.0;

@@ -2805,6 +2802,38 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
    }

    *s = hsum_float_8(acc) + summs;
    #else
    // scalar
    float sumf = 0.0;

@@ -3039,6 +3068,76 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
    }

    *s = hsum_float_8(acc);
    #else
    // scalar
    float sumf = 0.0;

@@ -3295,6 +3394,72 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
    }

    *s = hsum_float_8(acc) + summs;
    #else
    // scalar
    float sumf = 0.0;

@@ -3406,6 +3571,26 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
    }

    *s = hsum_float_8(acc);
    #else
    // scalar
    float sumf = 0.0;
@@ -4106,16 +4291,11 @@ int64_t ggml_nrows(const struct ggml_tensor * tensor) {
    }

    size_t ggml_nbytes(const struct ggml_tensor * tensor) {
-       // return tensor->ne[3]*tensor->nb[3]
-       //
-       // is enough, but just in case, adding the second part
-
-       return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type));
    }

    size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
@@ -4569,36 +4749,51 @@ static struct ggml_tensor * ggml_new_tensor_impl(
        enum ggml_type type,
        int n_dims,
        const int64_t * ne,
-
    assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);

-           data_size *= ne[i];
-       }
    }

-           return NULL;
-       }

    }

-   struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE +

    // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here

@@ -4618,7 +4813,9 @@ static struct ggml_tensor * ggml_new_tensor_impl(
        /*.perf_runs    =*/ 0,
        /*.perf_cycles  =*/ 0,
        /*.perf_time_us =*/ 0,
-       /*.
        /*.name         =*/ { 0 },
        /*.extra        =*/ NULL,
        /*.padding      =*/ { 0 },

@@ -4642,28 +4839,12 @@ static struct ggml_tensor * ggml_new_tensor_impl(
        return result;
    }

-   static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
-       GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
-       assert(params_size <= GGML_MAX_OP_PARAMS);
-       memcpy(tensor->op_params, params, params_size);
-   }
-
-   static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
-       assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
-       return ((const int32_t *)(tensor->op_params))[i];
-   }
-
-   static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
-       assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
-       ((int32_t *)(tensor->op_params))[i] = value;
-   }
-
    struct ggml_tensor * ggml_new_tensor(
        struct ggml_context * ctx,
        enum ggml_type type,
        int n_dims,
        const int64_t * ne) {
-       return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL);
    }

    struct ggml_tensor * ggml_new_tensor_1d(
@@ -4728,7 +4909,23 @@ struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
    }

    struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
-       return
    }

    struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {

@@ -5004,13 +5201,6 @@ struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * nam
        return tensor;
    }

-   #ifdef __GNUC__
-   #ifdef __MINGW32__
-   __attribute__((gnu_format(printf, 2, 3)))
-   #else
-   __attribute__((format(printf, 2, 3)))
-   #endif
-   #endif
    struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...) {
        va_list args;
        va_start(args, fmt);

@@ -5021,14 +5211,13 @@ struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char *

    struct ggml_tensor * ggml_view_tensor(
        struct ggml_context * ctx,
-
-       struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src
        ggml_format_name(result, "%s (view)", src->name);

-
-
-
-       result->nb[3] = src->nb[3];

        return result;
    }
@@ -5601,7 +5790,7 @@ struct ggml_tensor * ggml_repeat_back(

    // ggml_concat

-   struct ggml_tensor* ggml_concat(
        struct ggml_context* ctx,
        struct ggml_tensor* a,
        struct ggml_tensor* b) {

@@ -5868,7 +6057,8 @@ struct ggml_tensor * ggml_rms_norm_inplace(
    struct ggml_tensor * ggml_rms_norm_back(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
-       struct ggml_tensor  * b
    bool is_node = false;

    if (a->grad) {

@@ -5878,6 +6068,8 @@ struct ggml_tensor * ggml_rms_norm_back(

    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);

    result->op   = GGML_OP_RMS_NORM_BACK;
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
    result->src[0] = a;

@@ -6207,7 +6399,7 @@ struct ggml_tensor * ggml_reshape(
        //GGML_ASSERT(false);
    }

-   struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a
    ggml_format_name(result, "%s (reshaped)", a->name);

    result->op = GGML_OP_RESHAPE;

@@ -6231,7 +6423,7 @@ struct ggml_tensor * ggml_reshape_1d(
    }

    const int64_t ne[1] = { ne0 };
-   struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a
    ggml_format_name(result, "%s (reshaped)", a->name);

    result->op = GGML_OP_RESHAPE;

@@ -6256,7 +6448,7 @@ struct ggml_tensor * ggml_reshape_2d(
    }

    const int64_t ne[2] = { ne0, ne1 };
-   struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a
    ggml_format_name(result, "%s (reshaped)", a->name);

    result->op = GGML_OP_RESHAPE;

@@ -6282,7 +6474,7 @@ struct ggml_tensor * ggml_reshape_3d(
    }

    const int64_t ne[3] = { ne0, ne1, ne2 };
-   struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a
    ggml_format_name(result, "%s (reshaped)", a->name);

    result->op = GGML_OP_RESHAPE;

@@ -6292,7 +6484,6 @@ struct ggml_tensor * ggml_reshape_3d(
        return result;
    }

-
    struct ggml_tensor * ggml_reshape_4d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,

@@ -6310,7 +6501,7 @@ struct ggml_tensor * ggml_reshape_4d(
    }

    const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
-   struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a
    ggml_format_name(result, "%s (reshaped)", a->name);

    result->op = GGML_OP_RESHAPE;
@@ -6320,46 +6511,40 @@ struct ggml_tensor * ggml_reshape_4d(
        return result;
    }

-
-
-   static struct ggml_tensor * ggml_view_tensor_offset(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int                   n_dims,
        const int64_t       * ne,
        size_t                offset) {
-       // don't calculate an offset from an unallocated tensor
-       void * data = NULL;
-       if (a->data != NULL) {
-           data = (char *) a->data + offset;
-       }

-
        ggml_format_name(result, "%s (view)", a->name);

        ggml_set_op_params(result, &offset, sizeof(offset));

        return result;
    }

    struct ggml_tensor * ggml_view_1d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int64_t               ne0,
        size_t                offset) {

-
-       if (a->grad) {
-           is_node = true;
-       }
-
-       struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 1, &ne0, offset);
-
-       result->op   = GGML_OP_VIEW;
-       result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-       result->src[0] = a;

        return result;
    }

@@ -6374,24 +6559,14 @@ struct ggml_tensor * ggml_view_2d(
        size_t                nb1,
        size_t                offset) {

-
-       if (a->grad) {
-           is_node = true;
-       }
-
-       const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };

-       struct ggml_tensor * result =

        result->nb[1] = nb1;
        result->nb[2] = result->nb[1]*ne1;
        result->nb[3] = result->nb[2];

-       result->op   = GGML_OP_VIEW;
-       result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-       result->src[0] = a;
-
        return result;
    }

@@ -6407,24 +6582,14 @@ struct ggml_tensor * ggml_view_3d(
        size_t                nb2,
        size_t                offset) {

-
-       if (a->grad) {
-           is_node = true;
-       }
-
-       const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };

-       struct ggml_tensor * result =

        result->nb[1] = nb1;
        result->nb[2] = nb2;
        result->nb[3] = result->nb[2]*ne2;

-       result->op   = GGML_OP_VIEW;
-       result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-       result->src[0] = a;
-
        return result;
    }

@@ -6442,24 +6607,14 @@ struct ggml_tensor * ggml_view_4d(
        size_t                nb3,
        size_t                offset) {

-
-       if (a->grad) {
-           is_node = true;
-       }
-
-       const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };

-       struct ggml_tensor * result =

        result->nb[1] = nb1;
        result->nb[2] = nb2;
        result->nb[3] = nb3;

-       result->op   = GGML_OP_VIEW;
-       result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-       result->src[0] = a;
-
        return result;
    }
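The view constructors above drop their per-function is_node/op/grad boilerplate; the shared view helper now handles it. The public API itself is unchanged, so a hedged usage sketch (the tensor names here are made up for illustration) still looks like this:

// take row 1 of a 2-D f32 tensor "a" as a 1-D view;
// the offset is in bytes and a->nb[1] is the stride between rows
struct ggml_tensor * row1 = ggml_view_1d(ctx, a, a->ne[0], 1*a->nb[1]);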
@@ -6646,7 +6801,7 @@ static struct ggml_tensor * ggml_diag_mask_inf_impl(

    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

-   int32_t params[] = { n_past
    ggml_set_op_params(result, params, sizeof(params));

    result->op = GGML_OP_DIAG_MASK_INF;

@@ -6663,7 +6818,6 @@ struct ggml_tensor * ggml_diag_mask_inf(
        return ggml_diag_mask_inf_impl(ctx, a, n_past, false);
    }

-
    struct ggml_tensor * ggml_diag_mask_inf_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,

@@ -6686,7 +6840,7 @@ static struct ggml_tensor * ggml_diag_mask_zero_impl(

    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

-   int32_t params[] = { n_past
    ggml_set_op_params(result, params, sizeof(params));

    result->op = GGML_OP_DIAG_MASK_ZERO;

@@ -7475,6 +7629,8 @@ static struct ggml_tensor * ggml_add_rel_pos_impl(
    }

    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

    result->op   = GGML_OP_ADD_REL_POS;
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
    result->src[0] = a;

@@ -9452,6 +9608,8 @@ static void ggml_compute_forward_div_f32(

    #ifdef GGML_USE_ACCELERATE
            vDSP_vdiv(
                    (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
                    (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,

@@ -10758,7 +10916,8 @@ static void ggml_compute_forward_rms_norm_back_f32(

    GGML_TENSOR_BINARY_OP_LOCALS;

-
    // TODO: optimize
    for (int64_t i03 = 0; i03 < ne03; i03++) {

@@ -11936,8 +12095,8 @@ static void ggml_compute_forward_diag_mask_f32(
    const int ith = params->ith;
    const int nth = params->nth;

-   const int n_past =
-   const bool inplace =

    GGML_ASSERT(n_past >= 0);
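The diag_mask hunks above show the pattern this sync moves towards: scalar operation arguments are stored in the tensor's own op_params and read back in the compute function. The exact replacement lines are not visible in this view, so here is only a hedged sketch of the mechanism, written with the ggml_set_op_params_i32 / ggml_get_op_params_i32 helpers that appear earlier in this diff:

// when building the op: keep n_past as the first int32 op parameter
ggml_set_op_params_i32(result, 0, n_past);

// inside the compute kernel: recover it from the destination tensor
const int n_past = ggml_get_op_params_i32(dst, 0);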
@@ -12148,6 +12307,7 @@ static void ggml_compute_forward_soft_max_back_f32(
    // dx = J * dy
    // dxk = sum_i(Jki * dyi)
    // dxk = sum_i(-yk*yi * dyi) - (-yk*yk)*dyk + (yk - yk*yk)*dyk
    // dxk = sum_i(-yk*yi * dyi) + yk*dyk
    // dxk = -yk * sum_i(yi * dyi) + yk*dyk
    // dxk = -yk * dot(y, dy) + yk*dyk

@@ -13938,7 +14098,7 @@ static void ggml_compute_forward_flash_attn_f32(
        vvexpf(S, S, &Mup);
        ggml_vec_sum_f32(Mup, &sum, S);
    #else
-   uint16_t scvt[GGML_SOFT_MAX_UNROLL];
    ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };

    for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {

@@ -13948,9 +14108,13 @@ static void ggml_compute_forward_flash_attn_f32(
        if (SS[j] == -INFINITY) {
            SS[j] = 0.0f;
        } else {
            ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
            memcpy(&scvt[j], &s, sizeof(uint16_t));
            const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
            sump[j] += (ggml_float)val;
            SS[j] = val;
        }

@@ -14528,7 +14692,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
        vvexpf(SM, SM, &Mup);
        ggml_vec_sum_f32(Mup, &sum, SM);
    #else
-   uint16_t scvt[GGML_SOFT_MAX_UNROLL];
    ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };

    for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {

@@ -14539,9 +14703,13 @@ static void ggml_compute_forward_flash_attn_back_f32(
        if (SR[j] == -INFINITY) {
            SW[j] = 0.0f;
        } else {
            ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max);
            memcpy(&scvt[j], &s, sizeof(uint16_t));
            const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
            sump[j] += (ggml_float)val;
            SW[j] = val;
        }
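The flash-attention softmax keeps ggml's fp16 lookup-table exponential, now presumably gated behind the new GGML_FLASH_ATTN_EXP_FP16 switch shown further down. A hedged sketch of the trick the loops above use, factored into a standalone helper (the helper name is made up; table_exp_f16 is ggml's precomputed table of expf() over every fp16 bit pattern, and the fp16 conversions come from ggml.h):

#include <stdint.h>
#include <string.h>

static inline float expf_via_f16_table(float x, const ggml_fp16_t * table_exp_f16) {
    ggml_fp16_t s = GGML_FP32_TO_FP16(x); // round x to fp16
    uint16_t idx;
    memcpy(&idx, &s, sizeof(idx));        // reinterpret the fp16 bits as a table index
    return GGML_FP16_TO_FP32(table_exp_f16[idx]);
}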
@@ -14987,11 +15155,8 @@ static void ggml_compute_forward_add_rel_pos_f32(
        const struct ggml_tensor * src1,
        const struct ggml_tensor * src2,
        struct ggml_tensor * dst) {
-   GGML_ASSERT(ggml_are_same_shape(src0, dst));
-   GGML_ASSERT(src0->nb[0] == dst->nb[0] && src0->nb[1] == dst->nb[1]
-       && src0->nb[2] == dst->nb[2] && src0->nb[3] == dst->nb[3]);

-   const bool inplace =
    if (!inplace && params->type == GGML_TASK_INIT) {
        memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
        return;

@@ -15282,6 +15447,8 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
    const int nc = src0->ne[0];
    const int nr = ggml_nrows(src0);

    if (params->type == GGML_TASK_INIT) {
        if (ith == 0) {
            memset(sums, 0, sizeof(float) * (nth + nth * nc));

@@ -15293,7 +15460,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
        if (ith == 0) {
            float * dp = (float *) dst->data;
            ggml_vec_sum_f32(nth, dp, sums);
-           dp[0] *= -1.0f;
        }
        return;
    }

@@ -15310,7 +15477,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
    for (int i1 = ir0; i1 < ir1; i1++) {
        float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]);
        float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]);
-       float * st = (float *) params->wdata + nth + ith*nc;

    #ifndef NDEBUG
        for (int i = 0; i < nc; ++i) {

@@ -15325,15 +15492,19 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
        float max = -INFINITY;
        ggml_vec_max_f32(nc, &max, s0);

-       uint16_t scvt;
        for (int i = 0; i < nc; i++) {
            if (s0[i] == -INFINITY) {
                st[i] = 0.0f;
            } else {
                ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
                memcpy(&scvt, &s, sizeof(scvt));
                const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
                sum += (ggml_float)val;
                st[i] = val;
            }

@@ -15349,7 +15520,9 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
        ggml_vec_log_f32(nc, st, st);
        ggml_vec_mul_f32(nc, st, st, s1);

    #ifndef NDEBUG
        for (int i = 0; i < nc; ++i) {

@@ -15399,7 +15572,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
        return;
    }

-   const
    // TODO: handle transposed/permuted matrices
    const int64_t nc = src0->ne[0];

@@ -15418,7 +15591,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
        float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]);
        float * s0  = (float *)((char *) src0->data + i1*src0->nb[1]);
        float * s1  = (float *)((char *) src1->data + i1*src1->nb[1]);
-       float * sm  = (float *) params->wdata + ith*nc;

    #ifndef NDEBUG
        for (int i = 0; i < nc; ++i) {

@@ -15427,54 +15599,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
        assert(!isnan(s1[i]));
    }
    #endif
-   // step by step explanation:
-   {
-       //float * sums = (float *) params->wdata;
-
-       // forward pass with annotated gradients from backward pass
-       // (built by going in reverse operation order, adding to gradients of current operation args)
-       // st0 = exp(s0-max(s0)) grad[st0] = grad[st1]*(1.0 - eps)/sum
-       // from softmax_back: grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1]))
-       // ggml_vec_scale_f32(nc, st, sum); // st1 = st0*/sum = softmax(s0) grad[st1] = grad[st2]*(1.0 - eps)
-       // ggml_vec_scale_f32(nc, st, (1.0f - eps)); // st2 = st1*(1.0 - eps) grad[st2] = grad[st3]
-       // ggml_vec_add1_f32(nc, st, st, eps); // st3 = st2 + eps grad[st3] = grad[st4]/st3
-       // ggml_vec_log_f32(nc, st, st); // st4 = log(st3) grad[st4] = grad[st5] * s1
-       // ggml_vec_mul_f32(nc, st, st, s1); // st5 = st4 * s1 grad[st5] = grad[sums[ith]]
-       // ggml_vec_sum_f32(nc, sums + ith, st); // sums[ith] = st5 grad[sums[ith]] = grad[cross_entropy_loss] = -grad[cel]
-
-       // substitute into grad[st1], because we can reuse softmax_back from this point on
-       // grad[st1] = -grad[cel]*s1*(1.0 - eps)/(eps + softmax(s0)*(1.0 - eps))
-       // postorder:
-       // grad[st1] := softmax(s0)
-       // grad[st1] := grad[st1]*(1.0 - eps)
-       // grad[st1] := grad[st1] + eps
-       // grad[st1] := s1 / grad[st1]
-       // grad[st1] := grad[st1]*(1.0-eps)*-grad[cel]
-
-       // src0 gradients by going through softmax_back
-       // grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1]))
-       // from softmax_back:
-       // dxk = yk * (dyk - dot(y, dy))
-       // dot_y_dy := dot(y, dy)
-       // dx := dy
-       // dx := dx - dot_y_dy
-       // dx := dx * y
-       // postorder:
-       // dot_st1_dst1 := dot(st1, grad[st1])
-       // grad[s0] := grad[st1]
-       // grad[s0] := grad[s0] - dot_st1_dst1
-       // grad[s0] := grad[s0] * st1
-
-       // prepend postorder from grad[st1] directly using grad[s0] as memory location, as we will grad[s0] := grad[st1]
-       // sm := softmax(s0)
-       // grad[s0] := sm*(1.0 - eps)
-       // grad[s0] := grad[s0] + eps
-       // grad[s0] := s1 / grad[s0]
-       // grad[s0] := grad[s0]*(1.0-eps)*-grad[cel]
-       // dot_st1_dst1 := dot(sm, grad[s0])
-       // grad[s0] := grad[s0] - dot_st1_dst1
-       // grad[s0] := grad[s0] * sm
-   }

    // soft_max
    ggml_float sum = 0.0;

@@ -15482,39 +15606,37 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
        float max = -INFINITY;
        ggml_vec_max_f32(nc, &max, s0);

-       uint16_t scvt;
        for (int i = 0; i < nc; i++) {
            if (s0[i] == -INFINITY) {
            } else {
                ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
                memcpy(&scvt, &s, sizeof(scvt));
                const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
                sum += (ggml_float)val;
            }
        }

        assert(sum > 0.0);
-       sum = 1.0/sum;
    }

-       ggml_vec_scale_f32(nc,
-       ggml_vec_scale_f32(nc, ds0, -(1.0f - eps)*d[0]);
-       ggml_vec_dot_f32  (nc, &dot_st1_dst1, sm, ds0);
-       ggml_vec_acc1_f32 (nc, ds0, -dot_st1_dst1);
-       ggml_vec_mul_f32  (nc, ds0, ds0, sm);

    #ifndef NDEBUG
    for (int i = 0; i < nc; ++i) {
-       assert(!isnan(sm[i]));
-       assert(!isinf(sm[i]));
        assert(!isnan(ds0[i]));
        assert(!isinf(ds0[i]));
    }
@@ -16069,9 +16191,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
            {
                // necessary for llama
                if (src0->grad) {
                    src0->grad = ggml_add_impl(ctx,
                            src0->grad,
-                           ggml_rms_norm_back(ctx, src0, tensor->grad),
                            inplace);
                }
            } break;

@@ -16839,9 +16964,7 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
    return result;
    }

-
-   struct ggml_cgraph result = *gf;
-
    GGML_ASSERT(gf->n_nodes > 0);

    // if we are keeping the gradient graph, we have to detach the gradient nodes from the original graph

@@ -16865,15 +16988,19 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg
        }
    }

-   for (int i =
        struct ggml_tensor * node = gf->nodes[i];

        if (node->is_param) {
            GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
-           ggml_build_forward_expand(
        }
    }

    return result;
    }

@@ -17549,10 +17676,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
                    {
                        n_tasks = n_threads;
-
-                       size_t cur = ggml_type_size(node->type)*node->src[0]->ne[0]*n_tasks;
-
-                       work_size = MAX(work_size, cur);
                    } break;
                case GGML_OP_NONE:
                    {
@@ -18430,14 +18553,16 @@ static enum ggml_opt_result ggml_opt_adam(
        struct ggml_opt_params params,
        struct ggml_tensor * f,
        struct ggml_cgraph * gf,
-       struct ggml_cgraph * gb
    GGML_ASSERT(ggml_is_scalar(f));

    // these will store the parameters we want to optimize
    struct ggml_tensor * ps[GGML_MAX_PARAMS];

    int np = 0;
-
    for (int i = 0; i < gf->n_nodes; ++i) {
        if (gf->nodes[i]->is_param) {
            GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);

@@ -18456,31 +18581,32 @@ static enum ggml_opt_result ggml_opt_adam(
    }

    // constants
-
-   const float
-   const float
    const float beta1 = params.adam.beta1;
    const float beta2 = params.adam.beta2;
    const float eps   = params.adam.eps;

-   float * x  = opt->adam.x->data;  // view of the parameters
-   float * g1 = opt->adam.g1->data; // gradient
-   float * g2 = opt->adam.g2->data; // gradient squared
    float * m  = opt->adam.m->data;  // first moment
    float * v  = opt->adam.v->data;  // second moment
-   float * mh = opt->adam.mh->data; // first moment hat
-   float * vh = opt->adam.vh->data; // second moment hat

    float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values

-
-
    // compute the function value
    ggml_graph_reset  (gf);
    ggml_set_f32      (f->grad, 1.0f);

-
    opt->adam.fx_prev = ggml_get_f32_1d(f, 0);
    opt->adam.fx_best = opt->adam.fx_prev;

@@ -18488,6 +18614,9 @@ static enum ggml_opt_result ggml_opt_adam(
        pf[opt->iter % params.past] = opt->adam.fx_prev;
    }

    // initialize
    if (opt->just_initialized) {
        opt->adam.n_no_improvement = 0;

@@ -18520,50 +18649,55 @@ static enum ggml_opt_result ggml_opt_adam(
        UNUSED(t_start_cpu);

        {
-
-
        }

        ggml_graph_reset  (gf);
        ggml_set_f32      (f->grad, 1.0f);

-
        const float fx = ggml_get_f32_1d(f, 0);

        // check convergence
        if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) {
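The Adam code above drops the x/g1/g2/mh/vh scratch tensors; only the m and v moments remain in ggml_opt_context. The replacement update loop is not visible in this view, so as a hedged reminder of the standard Adam step these buffers implement (sched is ggml's schedule multiplier; this is the textbook formula, not the literal new code):

// for each parameter x[i] with gradient g[i] at iteration t:
//   m[i] = beta1*m[i] + (1 - beta1)*g[i];
//   v[i] = beta2*v[i] + (1 - beta2)*g[i]*g[i];
//   mh   = m[i]/(1 - powf(beta1, t));
//   vh   = v[i]/(1 - powf(beta2, t));
//   x[i] = x[i] - sched*alpha*mh/(sqrtf(vh) + eps);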
@@ -18632,7 +18766,6 @@ struct ggml_lbfgs_iteration_data {
    };

    static enum ggml_opt_result linesearch_backtracking(
-       struct ggml_context * ctx,
        const struct ggml_opt_params * params,
        int nx,
        float * x,

@@ -18644,8 +18777,11 @@ static enum ggml_opt_result linesearch_backtracking(
        struct ggml_tensor * f,
        struct ggml_cgraph * gf,
        struct ggml_cgraph * gb,
        const int np,
-       struct ggml_tensor * ps[]
    int count = 0;

    float width = 0.0f;

@@ -18674,6 +18810,12 @@ static enum ggml_opt_result linesearch_backtracking(
    dgtest = params->lbfgs.ftol*dginit;

    while (true) {
        ggml_vec_cpy_f32(nx, x, xp);
        ggml_vec_mad_f32(nx, x, d, *step);

@@ -18684,7 +18826,7 @@ static enum ggml_opt_result linesearch_backtracking(
        ggml_graph_reset  (gf);
        ggml_set_f32      (f->grad, 1.0f);

-
        ggml_opt_get_grad(np, ps, g);

@@ -18718,7 +18860,6 @@ static enum ggml_opt_result linesearch_backtracking(
            // strong Wolfe condition (GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE)
            return count;
        }
-       return count;
        }
    }

@@ -18744,7 +18885,9 @@ static enum ggml_opt_result ggml_opt_lbfgs(
        struct ggml_opt_params params,
        struct ggml_tensor * f,
        struct ggml_cgraph * gf,
-       struct ggml_cgraph * gb
    if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE ||
        params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) {
        if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) {

@@ -18776,6 +18919,10 @@ static enum ggml_opt_result ggml_opt_lbfgs(
        opt->iter = iter;
    }

    float * x  = opt->lbfgs.x->data;  // current parameters
    float * xp = opt->lbfgs.xp->data; // previous parameters
    float * g  = opt->lbfgs.g->data;  // current gradient

@@ -18797,6 +18944,12 @@ static enum ggml_opt_result ggml_opt_lbfgs(
    float * lm_s  = opt->lbfgs.lms->data;
    float * lm_y  = opt->lbfgs.lmy->data;

    // evaluate the function value and its gradient
    {
        ggml_opt_set_params(np, ps, x);

@@ -18804,11 +18957,14 @@ static enum ggml_opt_result ggml_opt_lbfgs(
        ggml_graph_reset  (gf);
        ggml_set_f32      (f->grad, 1.0f);

-
        ggml_opt_get_grad(np, ps, g);

        fx = ggml_get_f32_1d(f, 0);
    }

    // search direction = -gradient

@@ -18863,7 +19019,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
        ggml_vec_cpy_f32(nx, xp, x);
        ggml_vec_cpy_f32(nx, gp, g);

-       ls = linesearch_backtracking(
        if (ls < 0) {
            // linesearch failed - go back to the previous point and return

@@ -18873,6 +19029,8 @@ static enum ggml_opt_result ggml_opt_lbfgs(
            return ls;
        }

        ggml_vec_norm_f32(nx, &xnorm, x);
        ggml_vec_norm_f32(nx, &gnorm, g);

@@ -18930,7 +19088,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
        // ys = y^t \cdot s -> 1 / \rho.
        // yy = y^t \cdot y.
        //
-       ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]
        ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]);

        lm_ys[end[0]] = ys;

@@ -18993,13 +19151,15 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
            .adam = {
                .n_iter = 10000,
                .sched  = 1.000f,
-               .decay  = 0.
                .alpha  = 0.001f,
                .beta1  = 0.9f,
                .beta2  = 0.999f,
                .eps    = 1e-8f,
                .eps_f  = 1e-5f,
                .eps_g  = 1e-3f,
            },
        };
    } break;

@@ -19049,23 +19209,13 @@ GGML_API void ggml_opt_init(
    switch (opt->params.type) {
        case GGML_OPT_ADAM:
            {
-               opt->adam.x  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
-               opt->adam.g1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
-               opt->adam.g2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
                opt->adam.m  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
                opt->adam.v  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
-               opt->adam.mh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
-               opt->adam.vh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
                opt->adam.pf = params.past > 0
                    ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past)
                    : NULL;
-               ggml_set_zero(opt->adam.x);
-               ggml_set_zero(opt->adam.g1);
-               ggml_set_zero(opt->adam.g2);
                ggml_set_zero(opt->adam.m);
                ggml_set_zero(opt->adam.v);
-               ggml_set_zero(opt->adam.mh);
-               ggml_set_zero(opt->adam.vh);
                if (opt->adam.pf) {
                    ggml_set_zero(opt->adam.pf);
                }

@@ -19149,7 +19299,7 @@ enum ggml_opt_result ggml_opt_resume(
    *gf = ggml_build_forward (f);
    *gb = ggml_build_backward(ctx, gf, true);

-   return ggml_opt_resume_g(ctx, opt, f, gf, gb);
    }

    enum ggml_opt_result ggml_opt_resume_g(

@@ -19157,7 +19307,9 @@ enum ggml_opt_result ggml_opt_resume_g(
        struct ggml_opt_context * opt,
        struct ggml_tensor * f,
        struct ggml_cgraph * gf,
-       struct ggml_cgraph * gb
    // build forward + backward compute graphs
    enum ggml_opt_result result = GGML_OPT_OK;

@@ -19165,11 +19317,11 @@ enum ggml_opt_result ggml_opt_resume_g(
    switch (opt->params.type) {
        case GGML_OPT_ADAM:
            {
-               result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb);
            } break;
        case GGML_OPT_LBFGS:
            {
-               result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb);
            } break;
    }
@@ -19624,7 +19776,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p

    // read the kv pairs
    {
-       ctx->kv =

        for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
            struct gguf_kv * kv = &ctx->kv[i];

@@ -19707,7 +19859,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p

    // read the tensor infos
    {
-       ctx->infos =

        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
            struct gguf_tensor_info * info = &ctx->infos[i];

@@ -19908,7 +20060,7 @@ void gguf_free(struct gguf_context * ctx) {
            }
        }

-
    }

    if (ctx->infos) {

@@ -19920,7 +20072,7 @@ void gguf_free(struct gguf_context * ctx) {
            }
        }

-
    }

    GGML_ALIGNED_FREE(ctx);
ggml.c (updated excerpts, added lines marked with +):

    // disable "possible loss of data" to avoid hundreds of casts
    // we should just be careful :)
    #pragma warning(disable: 4244 4267)
+
+   // disable POSIX deprecation warnigns
+   // these functions are never going away, anyway
+   #pragma warning(disable: 4996)
    #endif

    #if defined(_WIN32)

    #define GGML_GELU_FP16
    #define GGML_GELU_QUICK_FP16
    #define GGML_SILU_FP16
+   // #define GGML_CROSS_ENTROPY_EXP_FP16
+   // #define GGML_FLASH_ATTN_EXP_FP16

    #define GGML_SOFT_MAX_UNROLL 4
    #define GGML_VEC_DOT_UNROLL 2

    //

    #if defined(_MSC_VER) || defined(__MINGW32__)
+   #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
+   #define GGML_ALIGNED_FREE(ptr)    _aligned_free(ptr)
    #else
    inline static void * ggml_aligned_malloc(size_t size) {
        void * aligned_memory = NULL;

    }
    return aligned_memory;
    }
+   #define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
+   #define GGML_ALIGNED_FREE(ptr)    free(ptr)
    #endif

    #define UNUSED GGML_UNUSED
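Both branches of the aligned-allocation macros now expand to real calls: _aligned_malloc/_aligned_free on MSVC/MinGW, and the ggml_aligned_malloc helper plus plain free() elsewhere. A hedged usage sketch (the buffer size is arbitrary, chosen only for illustration):

void * buf = GGML_ALIGNED_MALLOC(1024); // aligned to GGML_MEM_ALIGN
// ... use buf ...
GGML_ALIGNED_FREE(buf);                 // must pair with the matching macro, not plain free on Windows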
    #endif
    #endif

+   #ifdef __riscv_v_intrinsic
+   #include <riscv_vector.h>
+   #endif
+
    #ifdef __F16C__

    #ifdef _MSC_VER

    }

    static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
+   #if __AVXVNNI__
        const __m256i zero = _mm256_setzero_si256();
        const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
        return _mm256_cvtepi32_ps(summed_pairs);

    // multiply int8_t, add results pairwise twice and return as float vector
    static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
+   #if __AVXVNNIINT8__
        const __m256i zero = _mm256_setzero_si256();
        const __m256i summed_pairs = _mm256_dpbssd_epi32(zero, x, y);
        return _mm256_cvtepi32_ps(summed_pairs);

    static inline __m128i packNibbles( __m256i bytes )
    {
        // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
+   #if __AVX512F__
        const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4); // 0000_0000_abcd_0000
        bytes = _mm256_or_si256(bytes, bytes_srli_4); // 0000_abcd_abcd_efgh
        return _mm256_cvtepi16_epi8(bytes); // abcd_efgh

    #if !defined(__aarch64__)

    inline static int32_t vaddvq_s32(int32x4_t v) {
        return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
    }

        return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
    }

    inline static float vmaxvq_f32(float32x4_t v) {
        return
            MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),

    #endif
    }
    #else
    // scalar
    quantize_row_q8_0_reference(x, y, k);
    #endif

    #endif
    }
    #else
    // scalar
    quantize_row_q8_1_reference(x, y, k);
    #endif
    }

    *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
+   #elif defined(__riscv_v_intrinsic)
+   float sumf = 0.0;
+
+   size_t vl = __riscv_vsetvl_e8m1(qk/2);
+
+   for (int i = 0; i < nb; i++) {
+       vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
+
+       vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
+       vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
+
+       vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
+       vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
+
+       vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
+       vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
+
+       vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 8, vl);
+       vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 8, vl);
+
+       vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
+       vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
+
+       vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
+
+       vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
+       vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
+
+       int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
+       sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
+
+       sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
+   }
+
+   *s = sumf;
    #else
    // scalar
    float sumf = 0.0;
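For orientation, a hedged scalar sketch of what the new RVV path above computes per q4_0 block (assuming the usual q4_0 layout: 32 weights per block, two 4-bit quants packed per byte, low nibbles holding elements 0..15, values biased by 8, and an fp16 scale d on both operands):

// int sumi = 0;
// for (int j = 0; j < qk/2; ++j) {
//     const int v0 = (x[i].qs[j] & 0x0F) - 8;   // low-nibble element j
//     const int v1 = (x[i].qs[j] >>   4) - 8;   // high-nibble element j + qk/2
//     sumi += v0*y[i].qs[j] + v1*y[i].qs[j + qk/2];
// }
// sumf += sumi * GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d);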
    }

    *s = hsum_float_8(acc) + summs;
+   #elif defined(__riscv_v_intrinsic)
+   float sumf = 0.0;
+
+   size_t vl = __riscv_vsetvl_e8m1(qk/2);
+
+   for (int i = 0; i < nb; i++) {
+       vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
+
+       vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
+       vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
+
+       vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
+       vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
+
+       vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
+       vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
+
+       vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
+       vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
+
+       vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
+
+       vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
+       vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
+
+       int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
+       sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
+
+       sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
+   }
+
+   *s = sumf;
    #else
    // scalar
    float sumf = 0.0;

    }

    *s = hsum_float_8(acc);
+   #elif defined(__riscv_v_intrinsic)
+   float sumf = 0.0;
+
+   uint32_t qh;
+
+   // These temp values are for masking and shift operations
+   uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+   uint32_t temp_2[16] = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+                          0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000};
+
+   size_t vl = __riscv_vsetvl_e8m1(qk/2);
+
+   for (int i = 0; i < nb; i++) {
+       memcpy(&qh, x[i].qh, sizeof(uint32_t));
+
+       // temporary registers
+       vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_2, vl);
+       vuint32m4_t vt_2 = __riscv_vle32_v_u32m4(temp_1, vl);
+       vuint32m4_t vt_3 = __riscv_vsll_vx_u32m4(vt_1, 16, vl);
+       vuint32m4_t vt_4 = __riscv_vadd_vx_u32m4(vt_2, 12, vl);
+
+       // ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
+       vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(vt_1, qh, vl);
+       vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(xha_0, vt_2, vl);
+       vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl);
+
+       // ((qh & (1u << (j + 16))) >> (j + 12));
+       vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(vt_3, qh, vl);
+       vuint32m4_t xhl_1 = __riscv_vsrl_vv_u32m4(xha_1, vt_4, vl);
+
+       // narrowing
+       vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xhl_0, vl);
+       vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl);
+
+       vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xhl_1, vl);
+       vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl);
+
+       // load
+       vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
+
+       vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
+       vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
+
+       vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
+       vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
+
+       vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl);
+       vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl);
+
+       vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
+       vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
+
+       vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 16, vl);
+       vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 16, vl);
+
+       vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
+       vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
+
+       vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
+
+       vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
+       vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
+
+       int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
+       sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
+
+       sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi;
+   }
+
+   *s = sumf;
    #else
    // scalar
    float sumf = 0.0;

    }

    *s = hsum_float_8(acc) + summs;
+   #elif defined(__riscv_v_intrinsic)
+   float sumf = 0.0;
+
+   uint32_t qh;
+
+   // These temp values are for shift operations
+   uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+
+   size_t vl = __riscv_vsetvl_e8m1(qk/2);
+
+   for (int i = 0; i < nb; i++) {
+       memcpy(&qh, x[i].qh, sizeof(uint32_t));
+
+       // temporary registers
+       vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_1, vl);
+       vuint32m4_t vt_2 = __riscv_vadd_vx_u32m4(vt_1, 12, vl);
+
+       // load qh
+       vuint32m4_t vqh = __riscv_vmv_v_x_u32m4(qh, vl);
+
+       // ((qh >> (j + 0)) << 4) & 0x10;
+       vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(vqh, vt_1, vl);
+       vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl);
+       vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(xhl_0, 0x10, vl);
+
+       // ((qh >> (j + 12)) ) & 0x10;
+       vuint32m4_t xhr_1 = __riscv_vsrl_vv_u32m4(vqh, vt_2, vl);
+       vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(xhr_1, 0x10, vl);
+
+       // narrowing
+       vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xha_0, vl);
+       vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl);
+
+       vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xha_1, vl);
+       vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl);
+
+       // load
+       vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
+
+       vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
+       vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
+
+       vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
+       vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
+
+       vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl);
+       vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl);
+
+       vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
+       vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
+
+       vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
+       vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
+
+       vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
+
+       vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
+       vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
+
+       int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
+       sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
+
+       sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
+   }
+
+   *s = sumf;
    #else
    // scalar
    float sumf = 0.0;
@@ +3571 @@
    }

    *s = hsum_float_8(acc);
+#elif defined(__riscv_v_intrinsic)
+    float sumf = 0.0;
+    size_t vl = __riscv_vsetvl_e8m1(qk);
+
+    for (int i = 0; i < nb; i++) {
+        // load elements
+        vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl);
+        vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl);
+
+        vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl);
+
+        vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl);
+        vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl);
+
+        int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum);
+
+        sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d));
+    }
+
+    *s = sumf;
#else
    // scalar
    float sumf = 0.0;

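The RVV block above is the vector form of a simple per-block computation: multiply the int8 quants of x and y, widen-reduce them to a 32-bit sum (vwmul + vwredsum), and scale the result by the two block scales. Below is a minimal scalar sketch of that reference computation, assuming the usual q8_0 layout of 32 int8 quants plus one scale per block; the struct and function names are illustrative (not the ones in ggml.c) and a plain float stands in for the ggml_fp16_t scale.

// Illustrative only: a scalar model of the q8_0 dot product that the RVV path vectorizes.
#include <stdint.h>

#define QK8_0 32

typedef struct {
    float  d;           // block scale (ggml stores this as fp16; float here for simplicity)
    int8_t qs[QK8_0];   // quantized values
} block_q8_0_f;

static float dot_q8_0_ref(const block_q8_0_f * x, const block_q8_0_f * y, int nb) {
    float sumf = 0.0f;
    for (int i = 0; i < nb; i++) {
        int sumi = 0;
        for (int j = 0; j < QK8_0; j++) {
            sumi += x[i].qs[j] * y[i].qs[j]; // widen to int, like vwmul + vwredsum
        }
        sumf += x[i].d * y[i].d * (float) sumi; // scale the integer dot product
    }
    return sumf;
}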
@@ +4291 @@
}

size_t ggml_nbytes(const struct ggml_tensor * tensor) {
+    size_t nbytes = tensor->ne[0]*tensor->nb[0]/ggml_blck_size(tensor->type);
+    for (int i = 1; i < GGML_MAX_DIMS; ++i) {
+        nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
+    }
+    return nbytes;
}

size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {

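The new ggml_nbytes() walks the strides instead of multiplying the dimensions, so it also gives a sensible answer for padded or non-contiguous tensors: the result is the byte span from element 0 through the last addressable element. A small standalone sketch of the same rule follows; the helper name and the example strides are made up for illustration.

#include <stdint.h>
#include <stdio.h>

// Standalone model of the new ggml_nbytes() rule: size is computed from the
// strides (nb), not from the product of the dimensions (ne).
#define MAX_DIMS 4

static size_t nbytes_from_strides(const int64_t ne[MAX_DIMS], const size_t nb[MAX_DIMS], int64_t blck_size) {
    size_t nbytes = (size_t)(ne[0]*nb[0]/blck_size);
    for (int i = 1; i < MAX_DIMS; ++i) {
        nbytes += (size_t)((ne[i] - 1)*nb[i]);
    }
    return nbytes;
}

int main(void) {
    // hypothetical 2x3 f32 view with a padded 16-byte row stride
    const int64_t ne[MAX_DIMS] = { 2, 3, 1, 1 };
    const size_t  nb[MAX_DIMS] = { 4, 16, 48, 48 };
    printf("%zu\n", nbytes_from_strides(ne, nb, 1)); // prints 40, not 2*3*4 = 24
    return 0;
}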
@@ +4749 @@
        enum ggml_type type,
        int n_dims,
        const int64_t * ne,
+       struct ggml_tensor * view_src,
+       size_t view_offs) {

    assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);

+    // find the base tensor and absolute offset
+    if (view_src != NULL && view_src->view_src != NULL) {
+        view_offs += view_src->view_offs;
+        view_src = view_src->view_src;
+    }

+    size_t data_size = ggml_type_size(type)*(ne[0]/ggml_blck_size(type));
+    for (int i = 1; i < n_dims; i++) {
+        data_size *= ne[i];
    }

+    GGML_ASSERT(view_src == NULL || data_size + view_offs <= ggml_nbytes(view_src));
+
+    void * data = view_src != NULL ? view_src->data : NULL;
+    if (data != NULL) {
+        data = (char *) data + view_offs;
+    }

+    size_t obj_alloc_size = 0;

+    if (view_src == NULL && ctx->no_alloc == false) {
+        if (ctx->scratch.data != NULL) {
+            // allocate tensor data in the scratch buffer
+            if (ctx->scratch.offs + data_size > ctx->scratch.size) {
+                GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
+                        __func__, ctx->scratch.offs + data_size, ctx->scratch.size);
+                assert(false);
+                return NULL;
+            }

+            data = (char * const) ctx->scratch.data + ctx->scratch.offs;
+
+            ctx->scratch.offs += data_size;
+        } else {
+            // allocate tensor data in the context's memory pool
+            obj_alloc_size = data_size;
+        }
    }

+    struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size);

    // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here

@@ +4813 @@
        /*.perf_runs   =*/ 0,
        /*.perf_cycles =*/ 0,
        /*.perf_time_us =*/ 0,
+       /*.view_src    =*/ view_src,
+       /*.view_offs   =*/ view_offs,
+       /*.data        =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data,
        /*.name        =*/ { 0 },
        /*.extra       =*/ NULL,
        /*.padding     =*/ { 0 },
@@ +4839 @@
    return result;
}

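The view_src/view_offs bookkeeping above collapses a view-of-a-view at creation time: the new tensor records the base tensor plus a single absolute byte offset, which is presumably why a single if (rather than a loop) is enough to reach the base, since every existing tensor already satisfies that invariant. A toy model of that resolution step, using an illustrative struct that keeps only the fields involved:

#include <stddef.h>

// Toy model of the view resolution performed by ggml_new_tensor_impl above.
struct toy_tensor {
    struct toy_tensor * view_src;  // NULL for a base (non-view) tensor
    size_t              view_offs; // absolute byte offset into the base tensor
    void              * data;
};

static void resolve_view(struct toy_tensor ** src, size_t * offs) {
    // if the requested parent is itself a view, chain to its base tensor and
    // accumulate the offsets so the new view points directly at the base
    if (*src != NULL && (*src)->view_src != NULL) {
        *offs += (*src)->view_offs;
        *src   = (*src)->view_src;
    }
}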
@@ +4842 @@
struct ggml_tensor * ggml_new_tensor(
        struct ggml_context * ctx,
        enum ggml_type type,
        int n_dims,
        const int64_t * ne) {
+    return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL, 0);
}

struct ggml_tensor * ggml_new_tensor_1d(
@@ +4909 @@
}

struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
+    return ggml_new_tensor(ctx, src->type, src->n_dims, src->ne);
+}
+
+static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
+    GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
+    assert(params_size <= GGML_MAX_OP_PARAMS);
+    memcpy(tensor->op_params, params, params_size);
+}
+
+static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
+    assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
+    return ((const int32_t *)(tensor->op_params))[i];
+}
+
+static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
+    assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
+    ((int32_t *)(tensor->op_params))[i] = value;
}

struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
@@ +5201 @@
    return tensor;
}

struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...) {
    va_list args;
    va_start(args, fmt);
@@ +5211 @@

struct ggml_tensor * ggml_view_tensor(
        struct ggml_context * ctx,
+       struct ggml_tensor * src) {
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src, 0);
    ggml_format_name(result, "%s (view)", src->name);

+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        result->nb[i] = src->nb[i];
+    }

    return result;
}
@@ +5790 @@

// ggml_concat

+struct ggml_tensor * ggml_concat(
    struct ggml_context* ctx,
    struct ggml_tensor* a,
    struct ggml_tensor* b) {
@@ +6057 @@
struct ggml_tensor * ggml_rms_norm_back(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
+       struct ggml_tensor * b,
+       float eps) {
    bool is_node = false;

    if (a->grad) {
@@ +6068 @@

    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);

+    ggml_set_op_params(result, &eps, sizeof(eps));
+
    result->op = GGML_OP_RMS_NORM_BACK;
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
    result->src[0] = a;
@@ +6399 @@
        //GGML_ASSERT(false);
    }

+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a, 0);
    ggml_format_name(result, "%s (reshaped)", a->name);

    result->op = GGML_OP_RESHAPE;
@@ +6423 @@
    }

    const int64_t ne[1] = { ne0 };
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a, 0);
    ggml_format_name(result, "%s (reshaped)", a->name);

    result->op = GGML_OP_RESHAPE;
@@ +6448 @@
    }

    const int64_t ne[2] = { ne0, ne1 };
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a, 0);
    ggml_format_name(result, "%s (reshaped)", a->name);

    result->op = GGML_OP_RESHAPE;
@@ +6474 @@
    }

    const int64_t ne[3] = { ne0, ne1, ne2 };
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a, 0);
    ggml_format_name(result, "%s (reshaped)", a->name);

    result->op = GGML_OP_RESHAPE;
@@ +6484 @@
    return result;
}

struct ggml_tensor * ggml_reshape_4d(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
@@ +6501 @@
    }

    const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a, 0);
    ggml_format_name(result, "%s (reshaped)", a->name);

    result->op = GGML_OP_RESHAPE;
@@ +6511 @@
    return result;
}

+static struct ggml_tensor * ggml_view_impl(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        int n_dims,
        const int64_t * ne,
        size_t offset) {

+    bool is_node = false;

+    if (a->grad) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, a, offset);
    ggml_format_name(result, "%s (view)", a->name);

    ggml_set_op_params(result, &offset, sizeof(offset));

+    result->op = GGML_OP_VIEW;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+
    return result;
}

+// ggml_view_1d
+
struct ggml_tensor * ggml_view_1d(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        int64_t ne0,
        size_t offset) {

+    struct ggml_tensor * result = ggml_view_impl(ctx, a, 1, &ne0, offset);

    return result;
}

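All of the view constructors now funnel through ggml_view_impl(), which passes the parent tensor and the byte offset straight to ggml_new_tensor_impl(). A hedged usage sketch (not part of the diff) of what that means in practice for ggml_view_1d(): the view shares the parent's buffer and the offset argument is expressed in bytes.

#include "ggml.h"

// Illustrative only: exercise a 1-D view over part of a tensor.
void view_demo(void) {
    struct ggml_init_params ip = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);
    ggml_set_f32(a, 2.0f);

    // elements 4..11 of a; v->data points into a->data, while v->view_src and
    // v->view_offs record the base tensor and the absolute byte offset
    struct ggml_tensor * v = ggml_view_1d(ctx, a, 8, 4*ggml_element_size(a));

    ((float *) v->data)[0] = 5.0f;   // also visible through a
    (void) ggml_get_f32_1d(a, 4);    // == 5.0f

    ggml_free(ctx);
}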
@@ +6559 @@
        size_t nb1,
        size_t offset) {

+    const int64_t ne[2] = { ne0, ne1 };

+    struct ggml_tensor * result = ggml_view_impl(ctx, a, 2, ne, offset);

    result->nb[1] = nb1;
    result->nb[2] = result->nb[1]*ne1;
    result->nb[3] = result->nb[2];

    return result;
}

@@ +6582 @@
        size_t nb2,
        size_t offset) {

+    const int64_t ne[3] = { ne0, ne1, ne2 };

+    struct ggml_tensor * result = ggml_view_impl(ctx, a, 3, ne, offset);

    result->nb[1] = nb1;
    result->nb[2] = nb2;
    result->nb[3] = result->nb[2]*ne2;

    return result;
}

@@ +6607 @@
        size_t nb3,
        size_t offset) {

+    const int64_t ne[4] = { ne0, ne1, ne2, ne3 };

+    struct ggml_tensor * result = ggml_view_impl(ctx, a, 4, ne, offset);

    result->nb[1] = nb1;
    result->nb[2] = nb2;
    result->nb[3] = nb3;

    return result;
}

@@ +6801 @@

    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

+    int32_t params[] = { n_past };
    ggml_set_op_params(result, params, sizeof(params));

    result->op = GGML_OP_DIAG_MASK_INF;
@@ +6818 @@
    return ggml_diag_mask_inf_impl(ctx, a, n_past, false);
}

struct ggml_tensor * ggml_diag_mask_inf_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
@@ +6840 @@

    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

+    int32_t params[] = { n_past };
    ggml_set_op_params(result, params, sizeof(params));

    result->op = GGML_OP_DIAG_MASK_ZERO;
@@ +7629 @@
    }

    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+    ggml_set_op_params_i32(result, 0, inplace ? 1 : 0);
+
    result->op = GGML_OP_ADD_REL_POS;
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
    result->src[0] = a;
@@ +9608 @@


#ifdef GGML_USE_ACCELERATE
+            UNUSED(ggml_vec_div_f32);
+
            vDSP_vdiv(
                    (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
                    (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
@@ +10916 @@

    GGML_TENSOR_BINARY_OP_LOCALS;

+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));

    // TODO: optimize
    for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ +12095 @@
    const int ith = params->ith;
    const int nth = params->nth;

+    const int n_past = ((int32_t *) dst->op_params)[0];
+    const bool inplace = src0->data == dst->data;

    GGML_ASSERT(n_past >= 0);

@@ +12307 @@
        // dx = J * dy
        // dxk = sum_i(Jki * dyi)
        // dxk = sum_i(-yk*yi * dyi) - (-yk*yk)*dyk + (yk - yk*yk)*dyk
+       // dxk = sum_i(-yk*yi * dyi) + yk*yk*dyk + yk*dyk - yk*yk*dyk
        // dxk = sum_i(-yk*yi * dyi) + yk*dyk
        // dxk = -yk * sum_i(yi * dyi) + yk*dyk
        // dxk = -yk * dot(y, dy) + yk*dyk
@@ +14098 @@
        vvexpf(S, S, &Mup);
        ggml_vec_sum_f32(Mup, &sum, S);
#else
+       uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
        ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };

        for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
@@ +14108 @@
                if (SS[j] == -INFINITY) {
                    SS[j] = 0.0f;
                } else {
+#ifndef GGML_FLASH_ATTN_EXP_FP16
+                   const float val = expf(SS[j] - max);
+#else
                    ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
                    memcpy(&scvt[j], &s, sizeof(uint16_t));
                    const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
+#endif
                    sump[j] += (ggml_float)val;
                    SS[j] = val;
                }
@@ +14692 @@
        vvexpf(SM, SM, &Mup);
        ggml_vec_sum_f32(Mup, &sum, SM);
#else
+       uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
        ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };

        for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
@@ +14703 @@
                if (SR[j] == -INFINITY) {
                    SW[j] = 0.0f;
                } else {
+#ifndef GGML_FLASH_ATTN_EXP_FP16
+                   const float val = expf(SR[j] - max);
+#else
                    ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max);
                    memcpy(&scvt[j], &s, sizeof(uint16_t));
                    const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
+#endif
                    sump[j] += (ggml_float)val;
                    SW[j] = val;
                }

@@ +15155 @@
        const struct ggml_tensor * src1,
        const struct ggml_tensor * src2,
        struct ggml_tensor * dst) {

+    const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
    if (!inplace && params->type == GGML_TASK_INIT) {
        memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
        return;
@@ +15447 @@
    const int nc = src0->ne[0];
    const int nr = ggml_nrows(src0);

+    GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc));
+
    if (params->type == GGML_TASK_INIT) {
        if (ith == 0) {
            memset(sums, 0, sizeof(float) * (nth + nth * nc));
@@ +15460 @@
        if (ith == 0) {
            float * dp = (float *) dst->data;
            ggml_vec_sum_f32(nth, dp, sums);
+           dp[0] *= -1.0f / (float) nr;
        }
        return;
    }
@@ +15477 @@
    for (int i1 = ir0; i1 < ir1; i1++) {
        float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]);
        float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]);
+       float * st = ((float *) params->wdata) + nth + ith*nc;

#ifndef NDEBUG
        for (int i = 0; i < nc; ++i) {
@@ +15492 @@
            float max = -INFINITY;
            ggml_vec_max_f32(nc, &max, s0);

+           uint16_t scvt; UNUSED(scvt);
            for (int i = 0; i < nc; i++) {
                if (s0[i] == -INFINITY) {
                    st[i] = 0.0f;
                } else {
+#ifndef GGML_CROSS_ENTROPY_EXP_FP16
+                   const float s = s0[i] - max;
+                   const float val = expf(s);
+#else
                    ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
                    memcpy(&scvt, &s, sizeof(scvt));
                    const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
+#endif
                    sum += (ggml_float)val;
                    st[i] = val;
                }
@@ +15520 @@
        ggml_vec_log_f32(nc, st, st);
        ggml_vec_mul_f32(nc, st, st, s1);

+       float st_sum = 0;
+       ggml_vec_sum_f32(nc, &st_sum, st);
+       sums[ith] += st_sum;

#ifndef NDEBUG
        for (int i = 0; i < nc; ++i) {
@@ +15572 @@
        return;
    }

+    const double eps = 1e-9;

    // TODO: handle transposed/permuted matrices
    const int64_t nc = src0->ne[0];
@@ +15591 @@
        float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]);
        float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]);
        float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]);

#ifndef NDEBUG
        for (int i = 0; i < nc; ++i) {
@@ +15599 @@
            assert(!isnan(s1[i]));
        }
#endif

@@ +15602 @@

        // soft_max
        ggml_float sum = 0.0;
@@ +15606 @@
            float max = -INFINITY;
            ggml_vec_max_f32(nc, &max, s0);

+           uint16_t scvt; UNUSED(scvt);
            for (int i = 0; i < nc; i++) {
                if (s0[i] == -INFINITY) {
+                   ds0[i] = 0.0f;
                } else {
+#ifndef GGML_CROSS_ENTROPY_EXP_FP16
+                   const float s = s0[i] - max;
+                   const float val = expf(s);
+#else
                    ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
                    memcpy(&scvt, &s, sizeof(scvt));
                    const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
+#endif
                    sum += (ggml_float)val;
+                   ds0[i] = val;
                }
            }

            assert(sum > 0.0);
+           sum = (1.0 - eps)/sum;
        }

+       // grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr
+       ggml_vec_scale_f32(nc, ds0, sum);
+       ggml_vec_add1_f32(nc, ds0, ds0, eps);
+       ggml_vec_sub_f32(nc, ds0, ds0, s1);
+       ggml_vec_scale_f32(nc, ds0, d[0] / (float) nr);
+

#ifndef NDEBUG
        for (int i = 0; i < nc; ++i) {
            assert(!isnan(ds0[i]));
            assert(!isinf(ds0[i]));
        }
@@ +16191 @@
            {
                // necessary for llama
                if (src0->grad) {
+                   float eps;
+                   memcpy(&eps, tensor->op_params, sizeof(float));
+
                    src0->grad = ggml_add_impl(ctx,
                            src0->grad,
+                           ggml_rms_norm_back(ctx, src0, tensor->grad, eps),
                            inplace);
                }
            } break;
@@ +16964 @@
    return result;
}

+void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep) {
    GGML_ASSERT(gf->n_nodes > 0);

    // if we are keeping the gradient graph, we have to detach the gradient nodes from the original graph
@@ +16988 @@
        }
    }

+    for (int i = 0; i < gf->n_nodes; i++) {
        struct ggml_tensor * node = gf->nodes[i];

        if (node->is_param) {
            GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
+           ggml_build_forward_expand(gb, node->grad);
        }
    }
+}

+struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) {
+    struct ggml_cgraph result = *gf;
+    ggml_build_backward_expand(ctx, gf, &result, keep);
    return result;
}

@@ +17676 @@
        case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
            {
                n_tasks = n_threads;
            } break;
        case GGML_OP_NONE:
            {

@@ +18553 @@
        struct ggml_opt_params params,
        struct ggml_tensor * f,
        struct ggml_cgraph * gf,
+       struct ggml_cgraph * gb,
+       ggml_opt_callback callback,
+       void * callback_data) {
    GGML_ASSERT(ggml_is_scalar(f));

    // these will store the parameters we want to optimize
    struct ggml_tensor * ps[GGML_MAX_PARAMS];

    int np = 0;
+    int64_t nx = 0;
    for (int i = 0; i < gf->n_nodes; ++i) {
        if (gf->nodes[i]->is_param) {
            GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
@@ +18581 @@
    }

    // constants
+    float sched = params.adam.sched;
+    const float alpha = params.adam.alpha;
+    const float decay = params.adam.decay * alpha;
    const float beta1 = params.adam.beta1;
    const float beta2 = params.adam.beta2;
    const float eps = params.adam.eps;
+    const float gclip = params.adam.gclip;
+    const int decay_min_ndim = params.adam.decay_min_ndim;

@@ +18593 @@
    float * m = opt->adam.m->data; // first moment
    float * v = opt->adam.v->data; // second moment

    float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values

+    if (callback) {
+        callback(callback_data, &sched);
+    }

    // compute the function value
    ggml_graph_reset (gf);
    ggml_set_f32     (f->grad, 1.0f);

+    struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
+    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
+    cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
+    ggml_graph_compute(gb, &cplan);

    opt->adam.fx_prev = ggml_get_f32_1d(f, 0);
    opt->adam.fx_best = opt->adam.fx_prev;
@@ +18614 @@
        pf[opt->iter % params.past] = opt->adam.fx_prev;
    }

+    opt->loss_before = opt->adam.fx_prev;
+    opt->loss_after  = opt->adam.fx_prev;
+
    // initialize
    if (opt->just_initialized) {
        opt->adam.n_no_improvement = 0;
@@ +18649 @@
        UNUSED(t_start_cpu);

        {
+           float gnorm = 1.0f;
+           if (gclip > 0.0f) {
+               // gradient clipping
+               ggml_float sum = 0.0;
+               for (int p = 0; p < np; ++p) {
+                   const int64_t ne = ggml_nelements(ps[p]);
+                   for (int64_t j = 0; j < ne; ++j) {
+                       float g = ggml_get_f32_1d(ps[p]->grad, j);
+                       sum += (ggml_float)(g*g);
+                   }
+               }
+               ggml_float norm = sqrt(sum);
+               if (norm > (ggml_float) gclip) {
+                   gnorm = (float) ((ggml_float) gclip / norm);
+               }
+           }
+           const float beta1h = alpha*sched/(1.0f - powf(beta1, opt->iter));
+           const float beta2h =        1.0f/(1.0f - powf(beta2, opt->iter));
+           int64_t i = 0;
+           for (int p = 0; p < np; ++p) {
+               const int64_t ne = ggml_nelements(ps[p]);
+               const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? decay : 0.0f) * sched;
+               for (int64_t j = 0; j < ne; ++j) {
+                   float x = ggml_get_f32_1d(ps[p], j);
+                   float g = ggml_get_f32_1d(ps[p]->grad, j)*gnorm;
+                   m[i] = m[i]*beta1 +    g*(1.0f - beta1);
+                   v[i] = v[i]*beta2 + g*g*(1.0f - beta2);
+                   float mh = m[i]*beta1h;
+                   float vh = v[i]*beta2h;
+                   vh = sqrtf(vh) + eps;
+                   x  = x*(1.0f - p_decay) - mh/vh;
+                   ggml_set_f32_1d(ps[p], j, x);
+                   ++i;
+               }
+           }
+       }

+       if (callback) {
+           callback(callback_data, &sched);
        }

        ggml_graph_reset  (gf);
        ggml_set_f32      (f->grad, 1.0f);

+       ggml_graph_compute(gb, &cplan);

        const float fx = ggml_get_f32_1d(f, 0);
+       opt->loss_after = fx;
+

        // check convergence
        if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) {

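The block above is an AdamW step with optional global gradient-norm clipping: the squared gradient norm is accumulated over all parameters, the gradients are rescaled by gclip/norm when the norm exceeds gclip, and the decoupled weight decay is applied to the weight itself rather than folded into the gradient. A standalone sketch of the per-parameter update, mirroring the names used in the diff; plain arrays stand in for the ggml tensors.

#include <math.h>

// Illustrative model of the update loop above: AdamW with the bias correction
// folded into beta1h/beta2h and decoupled weight decay p_decay.
void adamw_step(float * x, const float * grad, float * m, float * v, long n,
                int iter, float alpha, float sched, float beta1, float beta2,
                float eps, float p_decay, float gnorm) {
    const float beta1h = alpha*sched/(1.0f - powf(beta1, (float) iter)); // lr * bias correction
    const float beta2h =        1.0f/(1.0f - powf(beta2, (float) iter));
    for (long i = 0; i < n; ++i) {
        const float g = grad[i]*gnorm;            // gnorm <= 1 when clipping is active
        m[i] = m[i]*beta1 +   g*(1.0f - beta1);   // first moment
        v[i] = v[i]*beta2 + g*g*(1.0f - beta2);   // second moment
        const float mh = m[i]*beta1h;
        const float vh = sqrtf(v[i]*beta2h) + eps;
        x[i] = x[i]*(1.0f - p_decay) - mh/vh;     // decay applied to the weight, not the gradient
    }
}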
@@ +18766 @@
};

static enum ggml_opt_result linesearch_backtracking(
        const struct ggml_opt_params * params,
        int nx,
        float * x,
@@ +18777 @@
        struct ggml_tensor * f,
        struct ggml_cgraph * gf,
        struct ggml_cgraph * gb,
+       struct ggml_cplan * cplan,
        const int np,
+       struct ggml_tensor * ps[],
+       ggml_opt_callback callback,
+       void * callback_data) {
    int count = 0;

    float width = 0.0f;
@@ +18810 @@
    dgtest = params->lbfgs.ftol*dginit;

    while (true) {
+       if (callback) {
+           // LBFG-S does not support learning rate -> ignore learning schedule
+           float sched = 0;
+           callback(callback_data, &sched);
+       }
+
        ggml_vec_cpy_f32(nx, x, xp);
        ggml_vec_mad_f32(nx, x, d, *step);

@@ +18826 @@
        ggml_graph_reset  (gf);
        ggml_set_f32      (f->grad, 1.0f);

+       ggml_graph_compute(gb, cplan);

        ggml_opt_get_grad(np, ps, g);

@@ +18860 @@
                // strong Wolfe condition (GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE)
                return count;
            }
        }
    }

@@ +18885 @@
        struct ggml_opt_params params,
        struct ggml_tensor * f,
        struct ggml_cgraph * gf,
+       struct ggml_cgraph * gb,
+       ggml_opt_callback callback,
+       void * callback_data) {
    if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE ||
        params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) {
        if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) {
@@ +18919 @@
        opt->iter = iter;
    }

+    struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
+    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
+    cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
+
    float * x  = opt->lbfgs.x->data;  // current parameters
    float * xp = opt->lbfgs.xp->data; // previous parameters
    float * g  = opt->lbfgs.g->data;  // current gradient
@@ +18944 @@
    float * lm_s = opt->lbfgs.lms->data;
    float * lm_y = opt->lbfgs.lmy->data;

+    if (callback) {
+        // LBFG-S does not support learning rate -> ignore learning schedule
+        float sched = 0;
+        callback(callback_data, &sched);
+    }
+
    // evaluate the function value and its gradient
    {
        ggml_opt_set_params(np, ps, x);
@@ +18957 @@
        ggml_graph_reset  (gf);
        ggml_set_f32      (f->grad, 1.0f);

+       ggml_graph_compute(gb, &cplan);

        ggml_opt_get_grad(np, ps, g);

        fx = ggml_get_f32_1d(f, 0);
+
+       opt->loss_before = fx;
+       opt->loss_after  = fx;
    }

    // search direction = -gradient
@@ +19019 @@
        ggml_vec_cpy_f32(nx, xp, x);
        ggml_vec_cpy_f32(nx, gp, g);

+       ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gf, gb, &cplan, np, ps, callback, callback_data);

        if (ls < 0) {
            // linesearch failed - go back to the previous point and return
@@ +19029 @@
            return ls;
        }

+       opt->loss_after = fx;
+
        ggml_vec_norm_f32(nx, &xnorm, x);
        ggml_vec_norm_f32(nx, &gnorm, g);

@@ +19088 @@
        // ys = y^t \cdot s -> 1 / \rho.
        // yy = y^t \cdot y.
        //
+       ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]*nx]);
        ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]);

        lm_ys[end[0]] = ys;
@@ +19151 @@
                .adam = {
                    .n_iter = 10000,
                    .sched  = 1.000f,
+                   .decay  = 0.0f,
+                   .decay_min_ndim = 2,
                    .alpha  = 0.001f,
                    .beta1  = 0.9f,
                    .beta2  = 0.999f,
                    .eps    = 1e-8f,
                    .eps_f  = 1e-5f,
                    .eps_g  = 1e-3f,
+                   .gclip  = 0.0f,
                },
            };
        } break;
@@ +19209 @@
    switch (opt->params.type) {
        case GGML_OPT_ADAM:
            {
                opt->adam.m = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
                opt->adam.v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
                opt->adam.pf = params.past > 0
                    ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past)
                    : NULL;
                ggml_set_zero(opt->adam.m);
                ggml_set_zero(opt->adam.v);
                if (opt->adam.pf) {
                    ggml_set_zero(opt->adam.pf);
                }
@@ +19299 @@
    *gf = ggml_build_forward (f);
    *gb = ggml_build_backward(ctx, gf, true);

+    return ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL);
}

enum ggml_opt_result ggml_opt_resume_g(
@@ +19307 @@
        struct ggml_opt_context * opt,
        struct ggml_tensor * f,
        struct ggml_cgraph * gf,
+       struct ggml_cgraph * gb,
+       ggml_opt_callback callback,
+       void * callback_data) {

    // build forward + backward compute graphs
    enum ggml_opt_result result = GGML_OPT_OK;
@@ +19317 @@
    switch (opt->params.type) {
        case GGML_OPT_ADAM:
            {
+               result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
            } break;
        case GGML_OPT_LBFGS:
            {
+               result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
            } break;
    }

@@ +19776 @@

    // read the kv pairs
    {
+       ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));

        for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
            struct gguf_kv * kv = &ctx->kv[i];
@@ +19859 @@

    // read the tensor infos
    {
+       ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));

        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
            struct gguf_tensor_info * info = &ctx->infos[i];
@@ +20060 @@
            }
        }

+       free(ctx->kv);
    }

    if (ctx->infos) {
@@ +20072 @@
            }
        }

+       free(ctx->infos);
    }

    GGML_ALIGNED_FREE(ctx);
ggml.h
CHANGED
|
@@ -479,6 +479,9 @@ extern "C" {
        int64_t perf_cycles;
        int64_t perf_time_us;

+       struct ggml_tensor * view_src;
+       size_t               view_offs;
+
        void * data;

        char name[GGML_MAX_NAME];
@@ -661,7 +664,7 @@ extern "C" {
    GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);

    GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
-   GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx,
+   GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);

    GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);

@@ -952,11 +955,11 @@ extern "C" {

    // a - x
    // b - dy
-   // TODO: update with configurable eps
    GGML_API struct ggml_tensor * ggml_rms_norm_back(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
-           struct ggml_tensor  * b
+           struct ggml_tensor  * b,
+           float                 eps);

    // A: n columns, m rows
    // B: n columns, p rows (i.e. we transpose it internally)
@@ -1612,7 +1615,8 @@ extern "C" {
            struct ggml_tensor * tensor);


-   GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
+   GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
+   GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);

    GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
    GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
@@ -1677,6 +1681,8 @@ extern "C" {
        GGML_LINESEARCH_INVALID_PARAMETERS,
    };

+   typedef void (*ggml_opt_callback)(void * data, float * sched);
+
    // optimization parameters
    //
    //   see ggml.c (ggml_opt_default_params) for default values
@@ -1712,12 +1718,14 @@ extern "C" {

            float sched; // schedule multiplier (fixed, decay or warmup)
            float decay; // weight decay for AdamW, use 0.0f to disable
+           int   decay_min_ndim; // minimum number of tensor dimension to apply weight decay
            float alpha; // learning rate
            float beta1;
            float beta2;
            float eps;   // epsilon for numerical stability
            float eps_f; // epsilon for convergence test
            float eps_g; // epsilon for convergence test
+           float gclip; // gradient clipping
        } adam;

        // LBFGS parameters
@@ -1745,14 +1753,12 @@ extern "C" {

        bool just_initialized;

+       float loss_before;
+       float loss_after;
+
        struct {
-           struct ggml_tensor * x;  // view of the parameters
-           struct ggml_tensor * g1; // gradient
-           struct ggml_tensor * g2; // gradient squared
            struct ggml_tensor * m;  // first moment
            struct ggml_tensor * v;  // second moment
-           struct ggml_tensor * mh; // first moment hat
-           struct ggml_tensor * vh; // second moment hat
            struct ggml_tensor * pf; // past function values
            float fx_best;
            float fx_prev;
@@ -1789,10 +1795,10 @@ extern "C" {

    // initialize optimizer context
    GGML_API void ggml_opt_init(
-           struct ggml_context
+           struct ggml_context * ctx,
            struct ggml_opt_context * opt,
-           struct ggml_opt_params
-           int64_t
+           struct ggml_opt_params params,
+           int64_t nx);

    // continue optimizing the function defined by the tensor f
    GGML_API enum ggml_opt_result ggml_opt_resume(
@@ -1806,7 +1812,9 @@ extern "C" {
            struct ggml_opt_context * opt,
            struct ggml_tensor * f,
            struct ggml_cgraph * gf,
-           struct ggml_cgraph * gb
+           struct ggml_cgraph * gb,
+           ggml_opt_callback callback,
+           void * callback_data);

    //
    // quantization
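The new ggml_opt_callback gives the caller a hook: the Adam path invokes it with a pointer to the schedule multiplier before the first evaluation and then on every iteration, so learning-rate warmup or decay can be implemented outside the library (the L-BFGS path calls it with sched fixed at 0). A hedged sketch of such a callback; the state struct and the warmup rule are illustrative, only the callback signature comes from the header above.

#include "ggml.h"

struct sched_state {
    int iter;
    int warmup;
};

// linear warmup to 1.0, then constant (only honored by the Adam optimizer)
static void my_sched_cb(void * data, float * sched) {
    struct sched_state * st = (struct sched_state *) data;
    st->iter++;
    *sched = st->iter < st->warmup ? (float) st->iter / (float) st->warmup : 1.0f;
}

// passed as the last two arguments of ggml_opt_resume_g:
//   struct sched_state st = { 0, 100 };
//   ggml_opt_resume_g(ctx, opt, f, gf, gb, my_sched_cb, &st);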