ggerganov committed
Commit 8bb66c1 · unverified · 1 parent: 16b5542

ggml : sync latest llama.cpp (view_src + alloc improvements) (#1247)


* ggml : sync latest llama.cpp (view_src + alloc improvements)

* ggml : fix build

Files changed (6)
  1. ggml-cuda.cu +83 -24
  2. ggml-metal.m +82 -51
  3. ggml-metal.metal +160 -74
  4. ggml-opencl.cpp +7 -7
  5. ggml.c +502 -350
  6. ggml.h +21 -13
ggml-cuda.cu CHANGED
@@ -81,12 +81,29 @@
81
  #if defined(GGML_USE_HIPBLAS)
82
  #define __CUDA_ARCH__ 1300
83
 
 
 
 
 
84
  typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
85
  static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
86
  const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
87
  const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
 
88
  const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
89
  return reinterpret_cast<const int&>(c);
 
 
 
 
 
 
 
 
 
 
 
 
90
  }
91
 
92
  static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
@@ -447,58 +464,91 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
447
  dst[i] = x[i] / (1.0f + expf(-x[i]));
448
  }
449
 
 
 
 
 
 
 
 
 
 
 
450
  static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
451
  const int row = blockIdx.x*blockDim.y + threadIdx.y;
452
  const int tid = threadIdx.x;
453
 
454
  const float eps = 1e-5f;
455
 
456
- float mean = 0.0f;
457
- float var = 0.0f;
458
 
459
- for (int col = tid; col < ncols; col += WARP_SIZE) {
460
  const float xi = x[row*ncols + col];
461
- mean += xi;
462
- var += xi * xi;
463
  }
464
 
465
  // sum up partial sums
466
- #pragma unroll
467
- for (int mask = 16; mask > 0; mask >>= 1) {
468
- mean += __shfl_xor_sync(0xffffffff, mean, mask, 32);
469
- var += __shfl_xor_sync(0xffffffff, var, mask, 32);
 
 
 
 
 
 
 
470
  }
471
 
472
- mean /= ncols;
473
- var = var / ncols - mean * mean;
474
- const float inv_var = rsqrtf(var + eps);
475
 
476
- for (int col = tid; col < ncols; col += WARP_SIZE) {
477
- dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_var;
478
  }
479
  }
480
 
 
 
 
 
 
 
 
 
 
481
  static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
482
  const int row = blockIdx.x*blockDim.y + threadIdx.y;
483
  const int tid = threadIdx.x;
484
 
485
  float tmp = 0.0f; // partial sum for thread in warp
486
 
487
- for (int col = tid; col < ncols; col += WARP_SIZE) {
488
  const float xi = x[row*ncols + col];
489
  tmp += xi * xi;
490
  }
491
 
492
  // sum up partial sums
493
- #pragma unroll
494
- for (int mask = 16; mask > 0; mask >>= 1) {
495
- tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
 
 
 
 
 
 
 
 
496
  }
497
 
498
  const float mean = tmp / ncols;
499
  const float scale = rsqrtf(mean + eps);
500
 
501
- for (int col = tid; col < ncols; col += WARP_SIZE) {
502
  dst[row*ncols + col] = scale * x[row*ncols + col];
503
  }
504
  }
@@ -4186,14 +4236,24 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
4186
 
4187
  static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4188
  GGML_ASSERT(ncols % WARP_SIZE == 0);
4189
- const dim3 block_dims(WARP_SIZE, 1, 1);
4190
- norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
 
 
 
 
 
4191
  }
4192
 
4193
  static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
4194
  GGML_ASSERT(ncols % WARP_SIZE == 0);
4195
- const dim3 block_dims(WARP_SIZE, 1, 1);
4196
- rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
 
 
 
 
 
4197
  }
4198
 
4199
  static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
@@ -5721,7 +5781,6 @@ inline void ggml_cuda_op_alibi(
5721
  (void) src1;
5722
  (void) src0_ddq_i;
5723
  (void) src1_ddf_i;
5724
- (void) i02;
5725
  (void) i1;
5726
  }
5727
 
 
81
  #if defined(GGML_USE_HIPBLAS)
82
  #define __CUDA_ARCH__ 1300
83
 
84
+ #ifndef __has_builtin
85
+ #define __has_builtin(x) 0
86
+ #endif
87
+
88
  typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
89
  static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
90
  const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
91
  const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
92
+ #if __has_builtin(__builtin_elementwise_sub_sat)
93
  const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
94
  return reinterpret_cast<const int&>(c);
95
+ #else
96
+ int8x4_t c;
97
+ int16_t tmp;
98
+ #pragma unroll
99
+ for (int i = 0; i < 4; i++) {
100
+ tmp = va[i] - vb[i];
101
+ if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
102
+ if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
103
+ c[i] = tmp;
104
+ }
105
+ return reinterpret_cast<int&>(c);
106
+ #endif // __has_builtin(__builtin_elementwise_sub_sat)
107
  }
108
 
109
  static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
 
464
  dst[i] = x[i] / (1.0f + expf(-x[i]));
465
  }
466
 
467
+ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
468
+ #pragma unroll
469
+ for (int mask = 16; mask > 0; mask >>= 1) {
470
+ a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
471
+ a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
472
+ }
473
+ return a;
474
+ }
475
+
476
+ template <int block_size>
477
  static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
478
  const int row = blockIdx.x*blockDim.y + threadIdx.y;
479
  const int tid = threadIdx.x;
480
 
481
  const float eps = 1e-5f;
482
 
483
+ float2 mean_var = make_float2(0.f, 0.f);
 
484
 
485
+ for (int col = tid; col < ncols; col += block_size) {
486
  const float xi = x[row*ncols + col];
487
+ mean_var.x += xi;
488
+ mean_var.y += xi * xi;
489
  }
490
 
491
  // sum up partial sums
492
+ mean_var = warp_reduce_sum(mean_var);
493
+ if (block_size > WARP_SIZE) {
494
+ __shared__ float2 s_sum[32];
495
+ int warp_id = threadIdx.x / WARP_SIZE;
496
+ int lane_id = threadIdx.x % WARP_SIZE;
497
+ if (lane_id == 0) {
498
+ s_sum[warp_id] = mean_var;
499
+ }
500
+ __syncthreads();
501
+ mean_var = s_sum[lane_id];
502
+ mean_var = warp_reduce_sum(mean_var);
503
  }
504
 
505
+ const float mean = mean_var.x / ncols;
506
+ const float var = mean_var.y / ncols - mean * mean;
507
+ const float inv_std = rsqrtf(var + eps);
508
 
509
+ for (int col = tid; col < ncols; col += block_size) {
510
+ dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
511
  }
512
  }
513
 
514
+ static __device__ __forceinline__ float warp_reduce_sum(float x) {
515
+ #pragma unroll
516
+ for (int mask = 16; mask > 0; mask >>= 1) {
517
+ x += __shfl_xor_sync(0xffffffff, x, mask, 32);
518
+ }
519
+ return x;
520
+ }
521
+
522
+ template <int block_size>
523
  static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
524
  const int row = blockIdx.x*blockDim.y + threadIdx.y;
525
  const int tid = threadIdx.x;
526
 
527
  float tmp = 0.0f; // partial sum for thread in warp
528
 
529
+ for (int col = tid; col < ncols; col += block_size) {
530
  const float xi = x[row*ncols + col];
531
  tmp += xi * xi;
532
  }
533
 
534
  // sum up partial sums
535
+ tmp = warp_reduce_sum(tmp);
536
+ if (block_size > WARP_SIZE) {
537
+ __shared__ float s_sum[32];
538
+ int warp_id = threadIdx.x / WARP_SIZE;
539
+ int lane_id = threadIdx.x % WARP_SIZE;
540
+ if (lane_id == 0) {
541
+ s_sum[warp_id] = tmp;
542
+ }
543
+ __syncthreads();
544
+ tmp = s_sum[lane_id];
545
+ tmp = warp_reduce_sum(tmp);
546
  }
547
 
548
  const float mean = tmp / ncols;
549
  const float scale = rsqrtf(mean + eps);
550
 
551
+ for (int col = tid; col < ncols; col += block_size) {
552
  dst[row*ncols + col] = scale * x[row*ncols + col];
553
  }
554
  }
 
4236
 
4237
  static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4238
  GGML_ASSERT(ncols % WARP_SIZE == 0);
4239
+ if (ncols < 1024) {
4240
+ const dim3 block_dims(WARP_SIZE, 1, 1);
4241
+ norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
4242
+ } else {
4243
+ const dim3 block_dims(1024, 1, 1);
4244
+ norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
4245
+ }
4246
  }
4247
 
4248
  static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
4249
  GGML_ASSERT(ncols % WARP_SIZE == 0);
4250
+ if (ncols < 1024) {
4251
+ const dim3 block_dims(WARP_SIZE, 1, 1);
4252
+ rms_norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
4253
+ } else {
4254
+ const dim3 block_dims(1024, 1, 1);
4255
+ rms_norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
4256
+ }
4257
  }
4258
 
4259
  static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
 
5781
  (void) src1;
5782
  (void) src0_ddq_i;
5783
  (void) src1_ddf_i;
 
5784
  (void) i1;
5785
  }
5786
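
Note on the CUDA changes above: the per-kernel shuffle loops in norm_f32 and rms_norm_f32 are factored into warp_reduce_sum helpers, the kernels are templated on block_size, and the launchers switch to 1024-thread blocks when ncols >= 1024, so wide rows are reduced in two levels (within each warp, then across warps via shared memory). A minimal sketch of that pattern, with an illustrative kernel name (row_sum_f32 is not part of ggml) and assuming block_size is either WARP_SIZE or 1024:

#include <cuda_runtime.h>

#define WARP_SIZE 32

static __device__ __forceinline__ float warp_reduce_sum(float x) {
#pragma unroll
    for (int mask = WARP_SIZE/2; mask > 0; mask >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, mask, WARP_SIZE);
    }
    return x;
}

template <int block_size>
static __global__ void row_sum_f32(const float * x, float * dst, const int ncols) {
    const int row = blockIdx.x;

    float tmp = 0.0f;
    for (int col = threadIdx.x; col < ncols; col += block_size) {
        tmp += x[row*ncols + col];
    }

    tmp = warp_reduce_sum(tmp); // level 1: reduce within each warp

    if (block_size > WARP_SIZE) { // level 2: combine the per-warp partial sums
        __shared__ float s_sum[32]; // assumes block_size == 1024, i.e. exactly 32 warps
        const int warp_id = threadIdx.x / WARP_SIZE;
        const int lane_id = threadIdx.x % WARP_SIZE;
        if (lane_id == 0) {
            s_sum[warp_id] = tmp;
        }
        __syncthreads();
        tmp = warp_reduce_sum(s_sum[lane_id]);
    }

    if (threadIdx.x == 0) {
        dst[row] = tmp;
    }
}

// launch, mirroring the new norm_f32_cuda / rms_norm_f32_cuda dispatch logic:
//   ncols <  1024: row_sum_f32<WARP_SIZE><<<nrows, WARP_SIZE, 0, stream>>>(x, dst, ncols);
//   ncols >= 1024: row_sum_f32<1024><<<nrows, 1024, 0, stream>>>(x, dst, ncols);
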
 
ggml-metal.m CHANGED
@@ -11,6 +11,7 @@
11
  #define MIN(a, b) ((a) < (b) ? (a) : (b))
12
  #define MAX(a, b) ((a) > (b) ? (a) : (b))
13
 
 
14
  #ifdef GGML_METAL_NDEBUG
15
  #define metal_printf(...)
16
  #else
@@ -75,6 +76,7 @@ struct ggml_metal_context {
75
  GGML_METAL_DECL_KERNEL(rms_norm);
76
  GGML_METAL_DECL_KERNEL(norm);
77
  GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
 
78
  GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
79
  GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
80
  GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32);
@@ -113,12 +115,26 @@ static NSString * const msl_library_source = @"see metal.metal";
113
  @end
114
 
115
  struct ggml_metal_context * ggml_metal_init(int n_cb) {
116
- fprintf(stderr, "%s: allocating\n", __func__);
 
 
 
 
 
 
 
 
 
117
 
118
- struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
 
 
 
119
 
 
 
 
120
  ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
121
- ctx->device = MTLCreateSystemDefaultDevice();
122
  ctx->queue = [ctx->device newCommandQueue];
123
  ctx->n_buffers = 0;
124
  ctx->concur_list_len = 0;
@@ -132,7 +148,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
132
 
133
  ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error];
134
  if (error) {
135
- fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
136
  return NULL;
137
  }
138
  }
@@ -146,11 +162,11 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
146
  //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"];
147
  NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
148
  NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
149
- fprintf(stderr, "%s: loading '%s'\n", __func__, [path UTF8String]);
150
 
151
  NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
152
  if (error) {
153
- fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
154
  return NULL;
155
  }
156
 
@@ -162,7 +178,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
162
  ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error];
163
  #endif
164
  if (error) {
165
- fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
166
  return NULL;
167
  }
168
  }
@@ -174,11 +190,11 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
174
  #define GGML_METAL_ADD_KERNEL(name) \
175
  ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
176
  ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
177
- fprintf(stderr, "%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
178
  (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \
179
  (int) ctx->pipeline_##name.threadExecutionWidth); \
180
  if (error) { \
181
- fprintf(stderr, "%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
182
  return NULL; \
183
  }
184
 
@@ -204,6 +220,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
204
  GGML_METAL_ADD_KERNEL(rms_norm);
205
  GGML_METAL_ADD_KERNEL(norm);
206
  GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
 
207
  GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
208
  GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
209
  GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32);
@@ -230,19 +247,19 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
230
  #undef GGML_METAL_ADD_KERNEL
231
  }
232
 
233
- fprintf(stderr, "%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
234
- fprintf(stderr, "%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
235
  if (ctx->device.maxTransferRate != 0) {
236
- fprintf(stderr, "%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
237
  } else {
238
- fprintf(stderr, "%s: maxTransferRate = built-in GPU\n", __func__);
239
  }
240
 
241
  return ctx;
242
  }
243
 
244
  void ggml_metal_free(struct ggml_metal_context * ctx) {
245
- fprintf(stderr, "%s: deallocating\n", __func__);
246
  #define GGML_METAL_DEL_KERNEL(name) \
247
  [ctx->function_##name release]; \
248
  [ctx->pipeline_##name release];
@@ -269,6 +286,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
269
  GGML_METAL_DEL_KERNEL(rms_norm);
270
  GGML_METAL_DEL_KERNEL(norm);
271
  GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
 
272
  GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32);
273
  GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32);
274
  GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32);
@@ -311,7 +329,7 @@ void * ggml_metal_host_malloc(size_t n) {
311
  void * data = NULL;
312
  const int result = posix_memalign((void **) &data, getpagesize(), n);
313
  if (result != 0) {
314
- fprintf(stderr, "%s: error: posix_memalign failed\n", __func__);
315
  return NULL;
316
  }
317
 
@@ -339,7 +357,7 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) {
339
  // Metal buffer based on the host memory pointer
340
  //
341
  static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
342
- //fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
343
 
344
  const int64_t tsize = ggml_nbytes(t);
345
 
@@ -350,13 +368,13 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
350
  if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
351
  *offs = (size_t) ioffs;
352
 
353
- //fprintf(stderr, "%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
354
 
355
  return ctx->buffers[i].metal;
356
  }
357
  }
358
 
359
- fprintf(stderr, "%s: error: buffer is nil\n", __func__);
360
 
361
  return nil;
362
  }
@@ -368,7 +386,7 @@ bool ggml_metal_add_buffer(
368
  size_t size,
369
  size_t max_size) {
370
  if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) {
371
- fprintf(stderr, "%s: too many buffers\n", __func__);
372
  return false;
373
  }
374
 
@@ -378,7 +396,7 @@ bool ggml_metal_add_buffer(
378
  const int64_t ioffs = (int64_t) data - (int64_t) ctx->buffers[i].data;
379
 
380
  if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) {
381
- fprintf(stderr, "%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name);
382
  return false;
383
  }
384
  }
@@ -399,11 +417,11 @@ bool ggml_metal_add_buffer(
399
  ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
400
 
401
  if (ctx->buffers[ctx->n_buffers].metal == nil) {
402
- fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
403
  return false;
404
  }
405
 
406
- fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
407
 
408
  ++ctx->n_buffers;
409
  } else {
@@ -423,27 +441,27 @@ bool ggml_metal_add_buffer(
423
  ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
424
 
425
  if (ctx->buffers[ctx->n_buffers].metal == nil) {
426
- fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
427
  return false;
428
  }
429
 
430
- fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
431
  if (i + size_step < size) {
432
- fprintf(stderr, "\n");
433
  }
434
 
435
  ++ctx->n_buffers;
436
  }
437
  }
438
 
439
- fprintf(stderr, ", (%8.2f / %8.2f)",
440
  ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
441
  ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
442
 
443
  if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
444
- fprintf(stderr, ", warning: current allocated size is greater than the recommended max working set size\n");
445
  } else {
446
- fprintf(stderr, "\n");
447
  }
448
  }
449
 
@@ -453,8 +471,6 @@ bool ggml_metal_add_buffer(
453
  void ggml_metal_set_tensor(
454
  struct ggml_metal_context * ctx,
455
  struct ggml_tensor * t) {
456
- metal_printf("%s: set input for tensor '%s'\n", __func__, t->name);
457
-
458
  size_t offs;
459
  id<MTLBuffer> id_dst = ggml_metal_get_buffer(ctx, t, &offs);
460
 
@@ -464,8 +480,6 @@ void ggml_metal_set_tensor(
464
  void ggml_metal_get_tensor(
465
  struct ggml_metal_context * ctx,
466
  struct ggml_tensor * t) {
467
- metal_printf("%s: extract results for tensor '%s'\n", __func__, t->name);
468
-
469
  size_t offs;
470
  id<MTLBuffer> id_src = ggml_metal_get_buffer(ctx, t, &offs);
471
 
@@ -560,15 +574,13 @@ void ggml_metal_graph_find_concurrency(
560
  }
561
 
562
  if (ctx->concur_list_len > GGML_MAX_CONCUR) {
563
- fprintf(stderr, "%s: too many elements for metal ctx->concur_list!\n", __func__);
564
  }
565
  }
566
 
567
  void ggml_metal_graph_compute(
568
  struct ggml_metal_context * ctx,
569
  struct ggml_cgraph * gf) {
570
- metal_printf("%s: evaluating graph\n", __func__);
571
-
572
  @autoreleasepool {
573
 
574
  // if there is ctx->concur_list, dispatch concurrently
@@ -616,7 +628,7 @@ void ggml_metal_graph_compute(
616
  continue;
617
  }
618
 
619
- metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
620
 
621
  struct ggml_tensor * src0 = gf->nodes[i]->src[0];
622
  struct ggml_tensor * src1 = gf->nodes[i]->src[1];
@@ -685,6 +697,12 @@ void ggml_metal_graph_compute(
685
  } break;
686
  case GGML_OP_ADD:
687
  {
 
 
 
 
 
 
688
  if (ggml_nelements(src1) == ne10) {
689
  // src1 is a row
690
  [encoder setComputePipelineState:ctx->pipeline_add_row];
@@ -694,14 +712,20 @@ void ggml_metal_graph_compute(
694
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
695
  [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
696
  [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
697
- [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
698
 
699
- const int64_t n = ggml_nelements(dst);
700
 
701
  [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
702
  } break;
703
  case GGML_OP_MUL:
704
  {
 
 
 
 
 
 
705
  if (ggml_nelements(src1) == ne10) {
706
  // src1 is a row
707
  [encoder setComputePipelineState:ctx->pipeline_mul_row];
@@ -711,9 +735,9 @@ void ggml_metal_graph_compute(
711
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
712
  [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
713
  [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
714
- [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
715
 
716
- const int64_t n = ggml_nelements(dst);
717
 
718
  [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
719
  } break;
@@ -764,7 +788,7 @@ void ggml_metal_graph_compute(
764
  } break;
765
  default:
766
  {
767
- fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
768
  GGML_ASSERT(false);
769
  }
770
  } break;
@@ -845,9 +869,13 @@ void ggml_metal_graph_compute(
845
  switch (src0t) {
846
  case GGML_TYPE_F16:
847
  {
848
- nth0 = 64;
849
  nth1 = 1;
850
- [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
 
 
 
 
851
  } break;
852
  case GGML_TYPE_Q4_0:
853
  {
@@ -899,8 +927,8 @@ void ggml_metal_graph_compute(
899
  GGML_ASSERT(ne02 == 1);
900
  GGML_ASSERT(ne12 == 1);
901
 
902
- nth0 = 2;
903
- nth1 = 32;
904
  [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
905
  } break;
906
  case GGML_TYPE_Q5_K:
@@ -923,7 +951,7 @@ void ggml_metal_graph_compute(
923
  } break;
924
  default:
925
  {
926
- fprintf(stderr, "Asserting on type %d\n",(int)src0t);
927
  GGML_ASSERT(false && "not implemented");
928
  }
929
  };
@@ -948,9 +976,12 @@ void ggml_metal_graph_compute(
948
  [encoder setBytes:&gqa length:sizeof(gqa) atIndex:17];
949
 
950
  if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 ||
951
- src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
952
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
953
  }
 
 
 
954
  else if (src0t == GGML_TYPE_Q3_K) {
955
  #ifdef GGML_QKK_64
956
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
@@ -964,8 +995,8 @@ void ggml_metal_graph_compute(
964
  else if (src0t == GGML_TYPE_Q6_K) {
965
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
966
  } else {
967
- [encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0];
968
- [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
969
  }
970
  }
971
  } break;
@@ -1161,7 +1192,7 @@ void ggml_metal_graph_compute(
1161
  } break;
1162
  default:
1163
  {
1164
- fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
1165
  GGML_ASSERT(false);
1166
  }
1167
  }
@@ -1186,7 +1217,7 @@ void ggml_metal_graph_compute(
1186
 
1187
  MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status];
1188
  if (status != MTLCommandBufferStatusCompleted) {
1189
- fprintf(stderr, "%s: command buffer %d failed with status %lu\n", __func__, i, status);
1190
  GGML_ASSERT(false);
1191
  }
1192
  }
 
11
  #define MIN(a, b) ((a) < (b) ? (a) : (b))
12
  #define MAX(a, b) ((a) > (b) ? (a) : (b))
13
 
14
+ // TODO: temporary - reuse llama.cpp logging
15
  #ifdef GGML_METAL_NDEBUG
16
  #define metal_printf(...)
17
  #else
 
76
  GGML_METAL_DECL_KERNEL(rms_norm);
77
  GGML_METAL_DECL_KERNEL(norm);
78
  GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
79
+ GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
80
  GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
81
  GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
82
  GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32);
 
115
  @end
116
 
117
  struct ggml_metal_context * ggml_metal_init(int n_cb) {
118
+ metal_printf("%s: allocating\n", __func__);
119
+
120
+ // Show all the Metal device instances in the system
121
+ NSArray * devices = MTLCopyAllDevices();
122
+ id <MTLDevice> device;
123
+ NSString * s;
124
+ for (device in devices) {
125
+ s = [device name];
126
+ metal_printf("%s: found device: %s\n", __func__, [s UTF8String]);
127
+ }
128
 
129
+ // Pick and show default Metal device
130
+ device = MTLCreateSystemDefaultDevice();
131
+ s = [device name];
132
+ metal_printf("%s: picking default device: %s\n", __func__, [s UTF8String]);
133
 
134
+ // Configure context
135
+ struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
136
+ ctx->device = device;
137
  ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
 
138
  ctx->queue = [ctx->device newCommandQueue];
139
  ctx->n_buffers = 0;
140
  ctx->concur_list_len = 0;
 
148
 
149
  ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error];
150
  if (error) {
151
+ metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
152
  return NULL;
153
  }
154
  }
 
162
  //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"];
163
  NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
164
  NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
165
+ metal_printf("%s: loading '%s'\n", __func__, [path UTF8String]);
166
 
167
  NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
168
  if (error) {
169
+ metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
170
  return NULL;
171
  }
172
 
 
178
  ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error];
179
  #endif
180
  if (error) {
181
+ metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
182
  return NULL;
183
  }
184
  }
 
190
  #define GGML_METAL_ADD_KERNEL(name) \
191
  ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
192
  ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
193
+ metal_printf("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
194
  (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \
195
  (int) ctx->pipeline_##name.threadExecutionWidth); \
196
  if (error) { \
197
+ metal_printf("%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
198
  return NULL; \
199
  }
200
 
 
220
  GGML_METAL_ADD_KERNEL(rms_norm);
221
  GGML_METAL_ADD_KERNEL(norm);
222
  GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
223
+ GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
224
  GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
225
  GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
226
  GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32);
 
247
  #undef GGML_METAL_ADD_KERNEL
248
  }
249
 
250
+ metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
251
+ metal_printf("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
252
  if (ctx->device.maxTransferRate != 0) {
253
+ metal_printf("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
254
  } else {
255
+ metal_printf("%s: maxTransferRate = built-in GPU\n", __func__);
256
  }
257
 
258
  return ctx;
259
  }
260
 
261
  void ggml_metal_free(struct ggml_metal_context * ctx) {
262
+ metal_printf("%s: deallocating\n", __func__);
263
  #define GGML_METAL_DEL_KERNEL(name) \
264
  [ctx->function_##name release]; \
265
  [ctx->pipeline_##name release];
 
286
  GGML_METAL_DEL_KERNEL(rms_norm);
287
  GGML_METAL_DEL_KERNEL(norm);
288
  GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
289
+ GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
290
  GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32);
291
  GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32);
292
  GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32);
 
329
  void * data = NULL;
330
  const int result = posix_memalign((void **) &data, getpagesize(), n);
331
  if (result != 0) {
332
+ metal_printf("%s: error: posix_memalign failed\n", __func__);
333
  return NULL;
334
  }
335
 
 
357
  // Metal buffer based on the host memory pointer
358
  //
359
  static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
360
+ //metal_printf("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
361
 
362
  const int64_t tsize = ggml_nbytes(t);
363
 
 
368
  if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
369
  *offs = (size_t) ioffs;
370
 
371
+ //metal_printf("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
372
 
373
  return ctx->buffers[i].metal;
374
  }
375
  }
376
 
377
+ metal_printf("%s: error: buffer is nil\n", __func__);
378
 
379
  return nil;
380
  }
 
386
  size_t size,
387
  size_t max_size) {
388
  if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) {
389
+ metal_printf("%s: too many buffers\n", __func__);
390
  return false;
391
  }
392
 
 
396
  const int64_t ioffs = (int64_t) data - (int64_t) ctx->buffers[i].data;
397
 
398
  if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) {
399
+ metal_printf("%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name);
400
  return false;
401
  }
402
  }
 
417
  ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
418
 
419
  if (ctx->buffers[ctx->n_buffers].metal == nil) {
420
+ metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
421
  return false;
422
  }
423
 
424
+ metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
425
 
426
  ++ctx->n_buffers;
427
  } else {
 
441
  ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
442
 
443
  if (ctx->buffers[ctx->n_buffers].metal == nil) {
444
+ metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
445
  return false;
446
  }
447
 
448
+ metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
449
  if (i + size_step < size) {
450
+ metal_printf("\n");
451
  }
452
 
453
  ++ctx->n_buffers;
454
  }
455
  }
456
 
457
+ metal_printf(", (%8.2f / %8.2f)",
458
  ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
459
  ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
460
 
461
  if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
462
+ metal_printf(", warning: current allocated size is greater than the recommended max working set size\n");
463
  } else {
464
+ metal_printf("\n");
465
  }
466
  }
467
 
 
471
  void ggml_metal_set_tensor(
472
  struct ggml_metal_context * ctx,
473
  struct ggml_tensor * t) {
 
 
474
  size_t offs;
475
  id<MTLBuffer> id_dst = ggml_metal_get_buffer(ctx, t, &offs);
476
 
 
480
  void ggml_metal_get_tensor(
481
  struct ggml_metal_context * ctx,
482
  struct ggml_tensor * t) {
 
 
483
  size_t offs;
484
  id<MTLBuffer> id_src = ggml_metal_get_buffer(ctx, t, &offs);
485
 
 
574
  }
575
 
576
  if (ctx->concur_list_len > GGML_MAX_CONCUR) {
577
+ metal_printf("%s: too many elements for metal ctx->concur_list!\n", __func__);
578
  }
579
  }
580
 
581
  void ggml_metal_graph_compute(
582
  struct ggml_metal_context * ctx,
583
  struct ggml_cgraph * gf) {
 
 
584
  @autoreleasepool {
585
 
586
  // if there is ctx->concur_list, dispatch concurrently
 
628
  continue;
629
  }
630
 
631
+ //metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
632
 
633
  struct ggml_tensor * src0 = gf->nodes[i]->src[0];
634
  struct ggml_tensor * src1 = gf->nodes[i]->src[1];
 
697
  } break;
698
  case GGML_OP_ADD:
699
  {
700
+ GGML_ASSERT(ggml_is_contiguous(src0));
701
+
702
+ // utilize float4
703
+ GGML_ASSERT(ne00 % 4 == 0);
704
+ const int64_t nb = ne00/4;
705
+
706
  if (ggml_nelements(src1) == ne10) {
707
  // src1 is a row
708
  [encoder setComputePipelineState:ctx->pipeline_add_row];
 
712
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
713
  [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
714
  [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
715
+ [encoder setBytes:&nb length:sizeof(nb) atIndex:3];
716
 
717
+ const int64_t n = ggml_nelements(dst)/4;
718
 
719
  [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
720
  } break;
721
  case GGML_OP_MUL:
722
  {
723
+ GGML_ASSERT(ggml_is_contiguous(src0));
724
+
725
+ // utilize float4
726
+ GGML_ASSERT(ne00 % 4 == 0);
727
+ const int64_t nb = ne00/4;
728
+
729
  if (ggml_nelements(src1) == ne10) {
730
  // src1 is a row
731
  [encoder setComputePipelineState:ctx->pipeline_mul_row];
 
735
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
736
  [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
737
  [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
738
+ [encoder setBytes:&nb length:sizeof(nb) atIndex:3];
739
 
740
+ const int64_t n = ggml_nelements(dst)/4;
741
 
742
  [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
743
  } break;
 
788
  } break;
789
  default:
790
  {
791
+ metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
792
  GGML_ASSERT(false);
793
  }
794
  } break;
 
869
  switch (src0t) {
870
  case GGML_TYPE_F16:
871
  {
872
+ nth0 = 32;
873
  nth1 = 1;
874
+ if (ne11 * ne12 < 4) {
875
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row];
876
+ } else {
877
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
878
+ }
879
  } break;
880
  case GGML_TYPE_Q4_0:
881
  {
 
927
  GGML_ASSERT(ne02 == 1);
928
  GGML_ASSERT(ne12 == 1);
929
 
930
+ nth0 = 4; //1;
931
+ nth1 = 8; //32;
932
  [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
933
  } break;
934
  case GGML_TYPE_Q5_K:
 
951
  } break;
952
  default:
953
  {
954
+ metal_printf("Asserting on type %d\n",(int)src0t);
955
  GGML_ASSERT(false && "not implemented");
956
  }
957
  };
 
976
  [encoder setBytes:&gqa length:sizeof(gqa) atIndex:17];
977
 
978
  if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 ||
979
+ src0t == GGML_TYPE_Q2_K) {// || src0t == GGML_TYPE_Q4_K) {
980
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
981
  }
982
+ else if (src0t == GGML_TYPE_Q4_K) {
983
+ [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
984
+ }
985
  else if (src0t == GGML_TYPE_Q3_K) {
986
  #ifdef GGML_QKK_64
987
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
 
995
  else if (src0t == GGML_TYPE_Q6_K) {
996
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
997
  } else {
998
+ int64_t ny = (ne11 + 3)/4;
999
+ [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
1000
  }
1001
  }
1002
  } break;
 
1192
  } break;
1193
  default:
1194
  {
1195
+ metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
1196
  GGML_ASSERT(false);
1197
  }
1198
  }
 
1217
 
1218
  MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status];
1219
  if (status != MTLCommandBufferStatusCompleted) {
1220
+ metal_printf("%s: command buffer %d failed with status %lu\n", __func__, i, status);
1221
  GGML_ASSERT(false);
1222
  }
1223
  }
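
Note on the ggml-metal.m changes above: all fprintf(stderr, ...) logging is routed through metal_printf (silenceable via GGML_METAL_NDEBUG), ggml_metal_init now enumerates the available Metal devices before picking the default one, GGML_OP_ADD and GGML_OP_MUL assert contiguity and pass nb = ne00/4 so the kernels can operate on float4 elements, a new mul_mat_f16_f32_1row pipeline is selected when ne11*ne12 < 4, and Q4_K gets its own (ne01 + 3)/4 dispatch. The float4 trick amounts to treating the tensors as arrays of 4-wide vectors and launching a quarter as many threads; a CUDA-flavoured sketch of the broadcast add (illustrative, not the Metal kernel itself):

#include <cuda_runtime.h>

// broadcast-add one row (nb float4 values) onto every row of a contiguous tensor;
// the caller guarantees ne00 % 4 == 0 and launches n = nelements/4 threads
static __global__ void add_row_f32x4(const float4 * src0, const float4 * src1,
                                     float4 * dst, const int64_t nb, const int64_t n) {
    const int64_t i = blockIdx.x*(int64_t)blockDim.x + threadIdx.x;
    if (i >= n) {
        return;
    }
    const float4 a = src0[i];
    const float4 b = src1[i % nb]; // src1 is a single row, broadcast across rows
    dst[i] = make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
}
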
ggml-metal.metal CHANGED
@@ -25,9 +25,9 @@ typedef struct {
25
  } block_q8_0;
26
 
27
  kernel void kernel_add(
28
- device const float * src0,
29
- device const float * src1,
30
- device float * dst,
31
  uint tpig[[thread_position_in_grid]]) {
32
  dst[tpig] = src0[tpig] + src1[tpig];
33
  }
@@ -35,18 +35,18 @@ kernel void kernel_add(
35
  // assumption: src1 is a row
36
  // broadcast src1 into src0
37
  kernel void kernel_add_row(
38
- device const float * src0,
39
- device const float * src1,
40
- device float * dst,
41
- constant int64_t & ne00,
42
  uint tpig[[thread_position_in_grid]]) {
43
- dst[tpig] = src0[tpig] + src1[tpig % ne00];
44
  }
45
 
46
  kernel void kernel_mul(
47
- device const float * src0,
48
- device const float * src1,
49
- device float * dst,
50
  uint tpig[[thread_position_in_grid]]) {
51
  dst[tpig] = src0[tpig] * src1[tpig];
52
  }
@@ -54,12 +54,12 @@ kernel void kernel_mul(
54
  // assumption: src1 is a row
55
  // broadcast src1 into src0
56
  kernel void kernel_mul_row(
57
- device const float * src0,
58
- device const float * src1,
59
- device float * dst,
60
- constant int64_t & ne00,
61
  uint tpig[[thread_position_in_grid]]) {
62
- dst[tpig] = src0[tpig] * src1[tpig % ne00];
63
  }
64
 
65
  kernel void kernel_scale(
@@ -133,19 +133,24 @@ kernel void kernel_soft_max(
133
  threadgroup_barrier(mem_flags::mem_threadgroup);
134
  }
135
 
136
- // broadcast
137
- if (tpitg[0] == 0) {
138
- buf[0] = buf[0];
139
- }
 
140
 
141
- threadgroup_barrier(mem_flags::mem_threadgroup);
142
 
143
  const float max = buf[0];
144
 
145
  // parallel sum
146
  buf[tpitg[0]] = 0.0f;
147
  for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
148
- buf[tpitg[0]] += exp(psrc0[i00] - max);
 
 
 
 
149
  }
150
 
151
  // reduce
@@ -157,17 +162,18 @@ kernel void kernel_soft_max(
157
  threadgroup_barrier(mem_flags::mem_threadgroup);
158
  }
159
 
160
- // broadcast
161
- if (tpitg[0] == 0) {
162
- buf[0] = buf[0];
163
- }
 
164
 
165
- threadgroup_barrier(mem_flags::mem_threadgroup);
166
 
167
  const float sum = buf[0];
168
 
169
  for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
170
- pdst[i00] = exp(psrc0[i00] - max) / sum;
171
  }
172
  }
173
 
@@ -214,25 +220,27 @@ kernel void kernel_norm(
214
  }
215
  threadgroup_barrier(mem_flags::mem_threadgroup);
216
  }
217
- // broadcast
218
- if (tpitg == 0) {
219
- sum[0] /= ne00;
220
- }
221
- threadgroup_barrier(mem_flags::mem_threadgroup);
222
  const float mean = sum[0];
223
 
224
- // recenter
225
  device float * y = dst + tgpig*ne00;
226
- for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
227
- y[i00] = x[i00] - mean;
228
- }
229
-
230
- // VARIANCE
231
- // parallel sum
232
  sum[tpitg] = 0.0f;
233
  for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
 
234
  sum[tpitg] += y[i00] * y[i00];
235
  }
 
 
 
 
 
 
 
236
  // reduce
237
  threadgroup_barrier(mem_flags::mem_threadgroup);
238
  for (uint i = ntg/2; i > 0; i /= 2) {
@@ -241,11 +249,11 @@ kernel void kernel_norm(
241
  }
242
  threadgroup_barrier(mem_flags::mem_threadgroup);
243
  }
244
- // broadcast
245
- if (tpitg == 0) {
246
- sum[0] /= ne00;
247
- }
248
- threadgroup_barrier(mem_flags::mem_threadgroup);
249
  const float variance = sum[0];
250
 
251
  const float scale = 1.0f/sqrt(variance + eps);
@@ -435,6 +443,8 @@ kernel void kernel_mul_mat_q4_1_f32(
435
  mul_vec_q_n_f32<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
436
  }
437
 
 
 
438
  kernel void kernel_mul_mat_q8_0_f32(
439
  device const void * src0,
440
  device const float * src1,
@@ -463,30 +473,30 @@ kernel void kernel_mul_mat_q8_0_f32(
463
  device const block_q8_0 * x = (device const block_q8_0 *) src0 + offset0;
464
  device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
465
 
466
- float yl[16];
467
  float sumf[nr]={0.f};
468
 
469
- const int ix = tiisg/2;
470
- const int il = tiisg%2;
471
 
472
- device const float * yb = y + ix * QK8_0 + 16*il;
473
 
474
- // each thread in a SIMD group deals with half a block.
475
- for (int ib = ix; ib < nb; ib += nw/2) {
476
- for (int i = 0; i < 16; ++i) {
477
  yl[i] = yb[i];
478
  }
479
 
480
  for (int row = 0; row < nr; row++) {
481
- device const int8_t * qs = x[ib+row*nb].qs + 16*il;
482
  float sumq = 0.f;
483
- for (int iq = 0; iq < 16; ++iq) {
484
  sumq += qs[iq] * yl[iq];
485
  }
486
  sumf[row] += sumq*x[ib+row*nb].d;
487
  }
488
 
489
- yb += QK8_0 * 16;
490
  }
491
 
492
  for (int row = 0; row < nr; ++row) {
@@ -497,7 +507,7 @@ kernel void kernel_mul_mat_q8_0_f32(
497
  }
498
  }
499
 
500
- kernel void kernel_mul_mat_f16_f32(
501
  device const char * src0,
502
  device const char * src1,
503
  device float * dst,
@@ -515,11 +525,8 @@ kernel void kernel_mul_mat_f16_f32(
515
  constant uint64_t & nb12,
516
  constant int64_t & ne0,
517
  constant int64_t & ne1,
518
- threadgroup float * sum [[threadgroup(0)]],
519
  uint3 tgpig[[threadgroup_position_in_grid]],
520
- uint3 tpig[[thread_position_in_grid]],
521
- uint3 tpitg[[thread_position_in_threadgroup]],
522
- uint3 tptg[[threads_per_threadgroup]]) {
523
 
524
  const int64_t r0 = tgpig.x;
525
  const int64_t r1 = tgpig.y;
@@ -528,24 +535,102 @@ kernel void kernel_mul_mat_f16_f32(
528
  device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
529
  device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
530
 
531
- sum[tpitg.x] = 0.0f;
532
-
533
- for (int i = tpitg.x; i < ne00; i += tptg.x) {
534
- sum[tpitg.x] += (float) x[i] * (float) y[i];
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
535
  }
536
 
537
- // accumulate the sum from all threads in the threadgroup
538
- threadgroup_barrier(mem_flags::mem_threadgroup);
539
- for (uint i = tptg.x/2; i > 0; i /= 2) {
540
- if (tpitg.x < i) {
541
- sum[tpitg.x] += sum[tpitg.x + i];
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
542
  }
543
- threadgroup_barrier(mem_flags::mem_threadgroup);
544
- }
 
 
 
 
 
 
 
 
545
 
546
- if (tpitg.x == 0) {
547
- dst[im*ne1*ne0 + r1*ne0 + r0] = sum[0];
 
 
 
 
 
 
 
 
 
548
  }
 
549
  }
550
 
551
  kernel void kernel_alibi_f32(
@@ -1244,7 +1329,8 @@ kernel void kernel_mul_mat_q4_K_f32(
1244
  const int r0 = tgpig.x;
1245
  const int r1 = tgpig.y;
1246
  const int r2 = tgpig.z;
1247
- const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
 
1248
  const int ib_row = first_row * nb;
1249
  const uint offset0 = r2/gqa*(nb*ne0);
1250
  device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0;
 
25
  } block_q8_0;
26
 
27
  kernel void kernel_add(
28
+ device const float4 * src0,
29
+ device const float4 * src1,
30
+ device float4 * dst,
31
  uint tpig[[thread_position_in_grid]]) {
32
  dst[tpig] = src0[tpig] + src1[tpig];
33
  }
 
35
  // assumption: src1 is a row
36
  // broadcast src1 into src0
37
  kernel void kernel_add_row(
38
+ device const float4 * src0,
39
+ device const float4 * src1,
40
+ device float4 * dst,
41
+ constant int64_t & nb,
42
  uint tpig[[thread_position_in_grid]]) {
43
+ dst[tpig] = src0[tpig] + src1[tpig % nb];
44
  }
45
 
46
  kernel void kernel_mul(
47
+ device const float4 * src0,
48
+ device const float4 * src1,
49
+ device float4 * dst,
50
  uint tpig[[thread_position_in_grid]]) {
51
  dst[tpig] = src0[tpig] * src1[tpig];
52
  }
 
54
  // assumption: src1 is a row
55
  // broadcast src1 into src0
56
  kernel void kernel_mul_row(
57
+ device const float4 * src0,
58
+ device const float4 * src1,
59
+ device float4 * dst,
60
+ constant int64_t & nb,
61
  uint tpig[[thread_position_in_grid]]) {
62
+ dst[tpig] = src0[tpig] * src1[tpig % nb];
63
  }
64
 
65
  kernel void kernel_scale(
 
133
  threadgroup_barrier(mem_flags::mem_threadgroup);
134
  }
135
 
136
+ //// broadcast - not needed. There is a threadgroup barrier above in the last iteration of
137
+ // the loop, and when that is done, buf[0] has the correct (synchronized) value
138
+ //if (tpitg[0] == 0) {
139
+ // buf[0] = buf[0];
140
+ //}
141
 
142
+ //threadgroup_barrier(mem_flags::mem_threadgroup);
143
 
144
  const float max = buf[0];
145
 
146
  // parallel sum
147
  buf[tpitg[0]] = 0.0f;
148
  for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
149
+ const float exp_psrc0 = exp(psrc0[i00] - max);
150
+ buf[tpitg[0]] += exp_psrc0;
151
+ // Remember the result of exp here. exp is expensive, so we really do not
152
+ // whish to compute it twice.
153
+ pdst[i00] = exp_psrc0;
154
  }
155
 
156
  // reduce
 
162
  threadgroup_barrier(mem_flags::mem_threadgroup);
163
  }
164
 
165
+ // broadcast - not needed, see above
166
+ //// broadcast
167
+ //if (tpitg[0] == 0) {
168
+ // buf[0] = buf[0];
169
+ //}
170
 
171
+ //threadgroup_barrier(mem_flags::mem_threadgroup);
172
 
173
  const float sum = buf[0];
174
 
175
  for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
176
+ pdst[i00] /= sum;
177
  }
178
  }
179
 
 
220
  }
221
  threadgroup_barrier(mem_flags::mem_threadgroup);
222
  }
223
+ //// broadcast
224
+ //if (tpitg == 0) {
225
+ // sum[0] /= ne00;
226
+ //}
227
+ //threadgroup_barrier(mem_flags::mem_threadgroup);
228
  const float mean = sum[0];
229
 
230
+ // recenter and VARIANCE
231
  device float * y = dst + tgpig*ne00;
 
 
 
 
 
 
232
  sum[tpitg] = 0.0f;
233
  for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
234
+ y[i00] = x[i00] - mean;
235
  sum[tpitg] += y[i00] * y[i00];
236
  }
237
+
238
+ //// VARIANCE
239
+ //// parallel sum
240
+ //sum[tpitg] = 0.0f;
241
+ //for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
242
+ // sum[tpitg] += y[i00] * y[i00];
243
+ //}
244
  // reduce
245
  threadgroup_barrier(mem_flags::mem_threadgroup);
246
  for (uint i = ntg/2; i > 0; i /= 2) {
 
249
  }
250
  threadgroup_barrier(mem_flags::mem_threadgroup);
251
  }
252
+ //// broadcast
253
+ //if (tpitg == 0) {
254
+ // sum[0] /= ne00;
255
+ //}
256
+ //threadgroup_barrier(mem_flags::mem_threadgroup);
257
  const float variance = sum[0];
258
 
259
  const float scale = 1.0f/sqrt(variance + eps);
 
443
  mul_vec_q_n_f32<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
444
  }
445
 
446
+ #define NB_Q8_0 8
447
+
448
  kernel void kernel_mul_mat_q8_0_f32(
449
  device const void * src0,
450
  device const float * src1,
 
473
  device const block_q8_0 * x = (device const block_q8_0 *) src0 + offset0;
474
  device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
475
 
476
+ float yl[NB_Q8_0];
477
  float sumf[nr]={0.f};
478
 
479
+ const int ix = tiisg/4;
480
+ const int il = tiisg%4;
481
 
482
+ device const float * yb = y + ix * QK8_0 + NB_Q8_0*il;
483
 
484
+ // each thread in a SIMD group deals with NB_Q8_0 quants at a time
485
+ for (int ib = ix; ib < nb; ib += nw/4) {
486
+ for (int i = 0; i < NB_Q8_0; ++i) {
487
  yl[i] = yb[i];
488
  }
489
 
490
  for (int row = 0; row < nr; row++) {
491
+ device const int8_t * qs = x[ib+row*nb].qs + NB_Q8_0*il;
492
  float sumq = 0.f;
493
+ for (int iq = 0; iq < NB_Q8_0; ++iq) {
494
  sumq += qs[iq] * yl[iq];
495
  }
496
  sumf[row] += sumq*x[ib+row*nb].d;
497
  }
498
 
499
+ yb += NB_Q8_0 * nw;
500
  }
501
 
502
  for (int row = 0; row < nr; ++row) {
 
507
  }
508
  }
509
 
510
+ kernel void kernel_mul_mat_f16_f32_1row(
511
  device const char * src0,
512
  device const char * src1,
513
  device float * dst,
 
525
  constant uint64_t & nb12,
526
  constant int64_t & ne0,
527
  constant int64_t & ne1,
 
528
  uint3 tgpig[[threadgroup_position_in_grid]],
529
+ uint tiisg[[thread_index_in_simdgroup]]) {
 
 
530
 
531
  const int64_t r0 = tgpig.x;
532
  const int64_t r1 = tgpig.y;
 
535
  device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
536
  device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
537
 
538
+ float sumf = 0;
539
+ if (ne00 < 128) {
540
+ for (int i = tiisg; i < ne00; i += 32) {
541
+ sumf += (float) x[i] * (float) y[i];
542
+ }
543
+ float all_sum = simd_sum(sumf);
544
+ if (tiisg == 0) {
545
+ dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
546
+ }
547
+ } else {
548
+ device const half4 * x4 = (device const half4 *) x;
549
+ device const float4 * y4 = (device const float4 *) y;
550
+ for (int i = tiisg; i < ne00/4; i += 32) {
551
+ for (int k = 0; k < 4; ++k) sumf += (float)x4[i][k] * y4[i][k];
552
+ }
553
+ float all_sum = simd_sum(sumf);
554
+ if (tiisg == 0) {
555
+ for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
556
+ dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
557
+ }
558
  }
559
 
560
+ }
561
+
562
+ #define N_F16_F32 4
563
+
564
+ kernel void kernel_mul_mat_f16_f32(
565
+ device const char * src0,
566
+ device const char * src1,
567
+ device float * dst,
568
+ constant int64_t & ne00,
569
+ constant int64_t & ne01,
570
+ constant int64_t & ne02,
571
+ constant uint64_t & nb00,
572
+ constant uint64_t & nb01,
573
+ constant uint64_t & nb02,
574
+ constant int64_t & ne10,
575
+ constant int64_t & ne11,
576
+ constant int64_t & ne12,
577
+ constant uint64_t & nb10,
578
+ constant uint64_t & nb11,
579
+ constant uint64_t & nb12,
580
+ constant int64_t & ne0,
581
+ constant int64_t & ne1,
582
+ uint3 tgpig[[threadgroup_position_in_grid]],
583
+ uint tiisg[[thread_index_in_simdgroup]]) {
584
+
585
+ const int64_t r0 = tgpig.x;
586
+ const int64_t rb = tgpig.y*N_F16_F32;
587
+ const int64_t im = tgpig.z;
588
+
589
+ device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
590
+
591
+ if (ne00 < 128) {
592
+ for (int row = 0; row < N_F16_F32; ++row) {
593
+ int r1 = rb + row;
594
+ if (r1 >= ne11) {
595
+ break;
596
+ }
597
+
598
+ device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
599
+
600
+ float sumf = 0;
601
+ for (int i = tiisg; i < ne00; i += 32) {
602
+ sumf += (float) x[i] * (float) y[i];
603
+ }
604
+
605
+ float all_sum = simd_sum(sumf);
606
+ if (tiisg == 0) {
607
+ dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
608
+ }
609
  }
610
+ } else {
611
+ device const half4 * x4 = (device const half4 *)x;
612
+ for (int row = 0; row < N_F16_F32; ++row) {
613
+ int r1 = rb + row;
614
+ if (r1 >= ne11) {
615
+ break;
616
+ }
617
+
618
+ device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
619
+ device const float4 * y4 = (device const float4 *) y;
620
 
621
+ float sumf = 0;
622
+ for (int i = tiisg; i < ne00/4; i += 32) {
623
+ for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
624
+ }
625
+
626
+ float all_sum = simd_sum(sumf);
627
+ if (tiisg == 0) {
628
+ for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
629
+ dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
630
+ }
631
+ }
632
  }
633
+
634
  }
635
 
636
  kernel void kernel_alibi_f32(
 
1329
  const int r0 = tgpig.x;
1330
  const int r1 = tgpig.y;
1331
  const int r2 = tgpig.z;
1332
+ //const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
1333
+ const int first_row = r0 * N_DST;
1334
  const int ib_row = first_row * nb;
1335
  const uint offset0 = r2/gqa*(nb*ne0);
1336
  device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0;
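
Note on the ggml-metal.metal changes above: kernel_add/mul(_row) move to float4 operands, kernel_soft_max now stores exp(psrc0[i] - max) into pdst while accumulating the sum and then normalizes in place (avoiding a second exp pass), kernel_norm fuses the recenter and variance loops, kernel_mul_mat_q8_0_f32 handles NB_Q8_0 = 8 quants per thread, and kernel_mul_mat_f16_f32 is split into a 1-row variant plus an N_F16_F32 = 4-rows-per-threadgroup variant built on simd_sum. The exp-caching idea in isolation, as a plain scalar sketch (one row at a time, not the parallel Metal kernel):

#include <math.h>

static void softmax_row_f32(const float * x, float * dst, const int ncols) {
    float max = -INFINITY;
    for (int i = 0; i < ncols; ++i) {
        max = fmaxf(max, x[i]);
    }

    // write exp(x - max) once and accumulate the sum in the same pass
    float sum = 0.0f;
    for (int i = 0; i < ncols; ++i) {
        const float e = expf(x[i] - max);
        dst[i] = e;
        sum += e;
    }

    // normalize in place - no second round of expf calls
    for (int i = 0; i < ncols; ++i) {
        dst[i] /= sum;
    }
}
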
ggml-opencl.cpp CHANGED
@@ -1334,7 +1334,7 @@ void ggml_cl_free_data(const struct ggml_tensor* tensor) {
1334
  return;
1335
  }
1336
 
1337
- cl_mem mem = (cl_mem)tensor->data;
1338
  clReleaseMemObject(mem);
1339
  }
1340
 
@@ -1393,7 +1393,7 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
1393
  size_t d_size;
1394
 
1395
  cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
1396
- cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted.
1397
  cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst
1398
 
1399
 
@@ -1491,9 +1491,9 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
1491
  size_t d_size;
1492
  cl_mem d_X;
1493
  if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
1494
- d_X = (cl_mem) src0->data;
1495
  } else {
1496
- d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
1497
  }
1498
  cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
1499
  cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
@@ -1567,7 +1567,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
1567
  size_t d_size;
1568
  cl_mem d_X;
1569
  if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
1570
- d_X = (cl_mem) src0->data;
1571
  } else {
1572
  d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
1573
  }
@@ -1697,7 +1697,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
1697
  events.emplace_back();
1698
  CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
1699
  } else if (src0->backend == GGML_BACKEND_GPU) {
1700
- d_Q = (cl_mem) src0->data;
1701
  } else {
1702
  GGML_ASSERT(false);
1703
  }
@@ -1860,6 +1860,6 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
1860
 
1861
  CL_CHECK(clFinish(queue));
1862
 
1863
- tensor->data = dst;
1864
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
1865
  }
 
1334
  return;
1335
  }
1336
 
1337
+ cl_mem mem = (cl_mem)tensor->extra;
1338
  clReleaseMemObject(mem);
1339
  }
1340
 
 
1393
  size_t d_size;
1394
 
1395
  cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
1396
+ cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
1397
  cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst
1398
 
1399
 
 
1491
  size_t d_size;
1492
  cl_mem d_X;
1493
  if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
1494
+ d_X = (cl_mem) src0->extra;
1495
  } else {
1496
+ d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
1497
  }
1498
  cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
1499
  cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
 
1567
  size_t d_size;
1568
  cl_mem d_X;
1569
  if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
1570
+ d_X = (cl_mem) src0->extra;
1571
  } else {
1572
  d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
1573
  }
 
1697
  events.emplace_back();
1698
  CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
1699
  } else if (src0->backend == GGML_BACKEND_GPU) {
1700
+ d_Q = (cl_mem) src0->extra;
1701
  } else {
1702
  GGML_ASSERT(false);
1703
  }
 
1860
 
1861
  CL_CHECK(clFinish(queue));
1862
 
1863
+ tensor->extra = dst;
1864
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
1865
  }
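
Note on the ggml-opencl.cpp changes above: the backend no longer stores its cl_mem handle in tensor->data; the handle now lives in tensor->extra, leaving ->data free for the view_src / allocator machinery this sync introduces in ggml.c. A sketch of the resulting convention (the helper name is illustrative, not ggml API):

#include <CL/cl.h>
#include "ggml.h"

// fetch the device-side buffer of a GPU-backed tensor
static cl_mem ggml_cl_tensor_mem(const struct ggml_tensor * t) {
    GGML_ASSERT(t->backend == GGML_BACKEND_GPU);
    return (cl_mem) t->extra; // previously this handle was stored in t->data
}
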
ggml.c CHANGED
@@ -47,6 +47,10 @@
47
  // disable "possible loss of data" to avoid hundreds of casts
48
  // we should just be careful :)
49
  #pragma warning(disable: 4244 4267)
 
 
 
 
50
  #endif
51
 
52
  #if defined(_WIN32)
@@ -123,6 +127,8 @@ typedef void * thread_ret_t;
123
  #define GGML_GELU_FP16
124
  #define GGML_GELU_QUICK_FP16
125
  #define GGML_SILU_FP16
 
 
126
 
127
  #define GGML_SOFT_MAX_UNROLL 4
128
  #define GGML_VEC_DOT_UNROLL 2
@@ -186,8 +192,8 @@ typedef void * thread_ret_t;
186
  //
187
 
188
  #if defined(_MSC_VER) || defined(__MINGW32__)
189
- #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
190
- #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
191
  #else
192
  inline static void * ggml_aligned_malloc(size_t size) {
193
  void * aligned_memory = NULL;
@@ -212,8 +218,8 @@ inline static void * ggml_aligned_malloc(size_t size) {
212
  }
213
  return aligned_memory;
214
  }
215
- #define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
216
- #define GGML_ALIGNED_FREE(ptr) free(ptr)
217
  #endif
218
 
219
  #define UNUSED GGML_UNUSED
@@ -301,6 +307,10 @@ typedef double ggml_float;
301
  #endif
302
  #endif
303
 
 
 
 
 
304
  #ifdef __F16C__
305
 
306
  #ifdef _MSC_VER
@@ -665,7 +675,7 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) {
665
  }
666
 
667
  static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
668
- #ifdef __AVXVNNI__
669
  const __m256i zero = _mm256_setzero_si256();
670
  const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
671
  return _mm256_cvtepi32_ps(summed_pairs);
@@ -678,7 +688,7 @@ static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy)
678
 
679
  // multiply int8_t, add results pairwise twice and return as float vector
680
  static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
681
- #ifdef __AVXVNNIINT8__
682
  const __m256i zero = _mm256_setzero_si256();
683
  const __m256i summed_pairs = _mm256_dpbssd_epi32(zero, x, y);
684
  return _mm256_cvtepi32_ps(summed_pairs);
@@ -694,7 +704,7 @@ static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
694
  static inline __m128i packNibbles( __m256i bytes )
695
  {
696
  // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
697
- #ifdef __AVX512F__
698
  const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4); // 0000_0000_abcd_0000
699
  bytes = _mm256_or_si256(bytes, bytes_srli_4); // 0000_abcd_abcd_efgh
700
  return _mm256_cvtepi16_epi8(bytes); // abcd_efgh
@@ -813,46 +823,6 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
813
 
814
  #if !defined(__aarch64__)
815
 
816
- inline static uint16_t vaddvq_u8(uint8x16_t v) {
817
- return
818
- (uint16_t)vgetq_lane_u8(v, 0) + (uint16_t)vgetq_lane_u8(v, 1) +
819
- (uint16_t)vgetq_lane_u8(v, 2) + (uint16_t)vgetq_lane_u8(v, 3) +
820
- (uint16_t)vgetq_lane_u8(v, 4) + (uint16_t)vgetq_lane_u8(v, 5) +
821
- (uint16_t)vgetq_lane_u8(v, 6) + (uint16_t)vgetq_lane_u8(v, 7) +
822
- (uint16_t)vgetq_lane_u8(v, 8) + (uint16_t)vgetq_lane_u8(v, 9) +
823
- (uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) +
824
- (uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) +
825
- (uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15);
826
- }
827
-
828
- inline static int16_t vaddvq_s8(int8x16_t v) {
829
- return
830
- (int16_t)vgetq_lane_s8(v, 0) + (int16_t)vgetq_lane_s8(v, 1) +
831
- (int16_t)vgetq_lane_s8(v, 2) + (int16_t)vgetq_lane_s8(v, 3) +
832
- (int16_t)vgetq_lane_s8(v, 4) + (int16_t)vgetq_lane_s8(v, 5) +
833
- (int16_t)vgetq_lane_s8(v, 6) + (int16_t)vgetq_lane_s8(v, 7) +
834
- (int16_t)vgetq_lane_s8(v, 8) + (int16_t)vgetq_lane_s8(v, 9) +
835
- (int16_t)vgetq_lane_s8(v, 10) + (int16_t)vgetq_lane_s8(v, 11) +
836
- (int16_t)vgetq_lane_s8(v, 12) + (int16_t)vgetq_lane_s8(v, 13) +
837
- (int16_t)vgetq_lane_s8(v, 14) + (int16_t)vgetq_lane_s8(v, 15);
838
- }
839
-
840
- inline static int32_t vaddvq_s16(int16x8_t v) {
841
- return
842
- (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
843
- (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
844
- (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
845
- (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
846
- }
847
-
848
- inline static uint32_t vaddvq_u16(uint16x8_t v) {
849
- return
850
- (uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) +
851
- (uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) +
852
- (uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) +
853
- (uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7);
854
- }
855
-
856
  inline static int32_t vaddvq_s32(int32x4_t v) {
857
  return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
858
  }
@@ -861,12 +831,6 @@ inline static float vaddvq_f32(float32x4_t v) {
861
  return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
862
  }
863
 
864
- inline static float vminvq_f32(float32x4_t v) {
865
- return
866
- MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
867
- MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
868
- }
869
-
870
  inline static float vmaxvq_f32(float32x4_t v) {
871
  return
872
  MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
@@ -1294,7 +1258,6 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
1294
  #endif
1295
  }
1296
  #else
1297
- (void)nb;
1298
  // scalar
1299
  quantize_row_q8_0_reference(x, y, k);
1300
  #endif
@@ -1513,7 +1476,6 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int
1513
  #endif
1514
  }
1515
  #else
1516
- (void)nb;
1517
  // scalar
1518
  quantize_row_q8_1_reference(x, y, k);
1519
  #endif
@@ -2679,6 +2641,41 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2679
  }
2680
 
2681
  *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
 
 
2682
  #else
2683
  // scalar
2684
  float sumf = 0.0;
@@ -2805,6 +2802,38 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
2805
  }
2806
 
2807
  *s = hsum_float_8(acc) + summs;
 
 
2808
  #else
2809
  // scalar
2810
  float sumf = 0.0;
@@ -3039,6 +3068,76 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
3039
  }
3040
 
3041
  *s = hsum_float_8(acc);
 
 
3042
  #else
3043
  // scalar
3044
  float sumf = 0.0;
@@ -3295,6 +3394,72 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
3295
  }
3296
 
3297
  *s = hsum_float_8(acc) + summs;
 
 
3298
  #else
3299
  // scalar
3300
  float sumf = 0.0;
@@ -3406,6 +3571,26 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
3406
  }
3407
 
3408
  *s = hsum_float_8(acc);
 
 
3409
  #else
3410
  // scalar
3411
  float sumf = 0.0;
@@ -4106,16 +4291,11 @@ int64_t ggml_nrows(const struct ggml_tensor * tensor) {
4106
  }
4107
 
4108
  size_t ggml_nbytes(const struct ggml_tensor * tensor) {
4109
- static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
4110
-
4111
- // this should handle cases where the tensor is not contiguous in memory
4112
- // probaby just:
4113
- //
4114
- // return tensor->ne[3]*tensor->nb[3]
4115
- //
4116
- // is enough, but just in case, adding the second part
4117
-
4118
- return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type));
4119
  }
4120
 
4121
  size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
@@ -4569,36 +4749,51 @@ static struct ggml_tensor * ggml_new_tensor_impl(
4569
  enum ggml_type type,
4570
  int n_dims,
4571
  const int64_t * ne,
4572
- void * data) {
 
4573
 
4574
  assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
4575
 
4576
- size_t data_size = 0;
 
 
 
 
4577
 
4578
- if (data == NULL && !ctx->no_alloc) {
4579
- data_size += ggml_type_size(type)*(ne[0]/ggml_blck_size(type));
4580
- for (int i = 1; i < n_dims; i++) {
4581
- data_size *= ne[i];
4582
- }
4583
  }
4584
 
4585
- if (ctx->scratch.data != NULL && data == NULL) {
4586
- // allocate tensor data in the scratch buffer
4587
- if (ctx->scratch.offs + data_size > ctx->scratch.size) {
4588
- GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
4589
- __func__, ctx->scratch.offs + data_size, ctx->scratch.size);
4590
- assert(false);
4591
- return NULL;
4592
- }
4593
 
4594
- data = (char * const) ctx->scratch.data + ctx->scratch.offs;
4595
 
4596
- ctx->scratch.offs += data_size;
 
 
4597
 
4598
- data_size = 0;
 
 
 
 
 
 
4599
  }
4600
 
4601
- struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + data_size);
4602
 
4603
  // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here
4604
 
@@ -4618,7 +4813,9 @@ static struct ggml_tensor * ggml_new_tensor_impl(
4618
  /*.perf_runs =*/ 0,
4619
  /*.perf_cycles =*/ 0,
4620
  /*.perf_time_us =*/ 0,
4621
- /*.data =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
 
 
4622
  /*.name =*/ { 0 },
4623
  /*.extra =*/ NULL,
4624
  /*.padding =*/ { 0 },
@@ -4642,28 +4839,12 @@ static struct ggml_tensor * ggml_new_tensor_impl(
4642
  return result;
4643
  }
4644
 
4645
- static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
4646
- GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
4647
- assert(params_size <= GGML_MAX_OP_PARAMS);
4648
- memcpy(tensor->op_params, params, params_size);
4649
- }
4650
-
4651
- static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
4652
- assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
4653
- return ((const int32_t *)(tensor->op_params))[i];
4654
- }
4655
-
4656
- static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
4657
- assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
4658
- ((int32_t *)(tensor->op_params))[i] = value;
4659
- }
4660
-
4661
  struct ggml_tensor * ggml_new_tensor(
4662
  struct ggml_context * ctx,
4663
  enum ggml_type type,
4664
  int n_dims,
4665
  const int64_t * ne) {
4666
- return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL);
4667
  }
4668
 
4669
  struct ggml_tensor * ggml_new_tensor_1d(
@@ -4728,7 +4909,23 @@ struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
4728
  }
4729
 
4730
  struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
4731
- return ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, NULL);
 
 
4732
  }
4733
 
4734
  struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
@@ -5004,13 +5201,6 @@ struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * nam
5004
  return tensor;
5005
  }
5006
 
5007
- #ifdef __GNUC__
5008
- #ifdef __MINGW32__
5009
- __attribute__((gnu_format(printf, 2, 3)))
5010
- #else
5011
- __attribute__((format(printf, 2, 3)))
5012
- #endif
5013
- #endif
5014
  struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...) {
5015
  va_list args;
5016
  va_start(args, fmt);
@@ -5021,14 +5211,13 @@ struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char *
5021
 
5022
  struct ggml_tensor * ggml_view_tensor(
5023
  struct ggml_context * ctx,
5024
- const struct ggml_tensor * src) {
5025
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data);
5026
  ggml_format_name(result, "%s (view)", src->name);
5027
 
5028
- result->nb[0] = src->nb[0];
5029
- result->nb[1] = src->nb[1];
5030
- result->nb[2] = src->nb[2];
5031
- result->nb[3] = src->nb[3];
5032
 
5033
  return result;
5034
  }
@@ -5601,7 +5790,7 @@ struct ggml_tensor * ggml_repeat_back(
5601
 
5602
  // ggml_concat
5603
 
5604
- struct ggml_tensor* ggml_concat(
5605
  struct ggml_context* ctx,
5606
  struct ggml_tensor* a,
5607
  struct ggml_tensor* b) {
@@ -5868,7 +6057,8 @@ struct ggml_tensor * ggml_rms_norm_inplace(
5868
  struct ggml_tensor * ggml_rms_norm_back(
5869
  struct ggml_context * ctx,
5870
  struct ggml_tensor * a,
5871
- struct ggml_tensor * b) {
 
5872
  bool is_node = false;
5873
 
5874
  if (a->grad) {
@@ -5878,6 +6068,8 @@ struct ggml_tensor * ggml_rms_norm_back(
5878
 
5879
  struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
5880
 
 
 
5881
  result->op = GGML_OP_RMS_NORM_BACK;
5882
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5883
  result->src[0] = a;
@@ -6207,7 +6399,7 @@ struct ggml_tensor * ggml_reshape(
6207
  //GGML_ASSERT(false);
6208
  }
6209
 
6210
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data);
6211
  ggml_format_name(result, "%s (reshaped)", a->name);
6212
 
6213
  result->op = GGML_OP_RESHAPE;
@@ -6231,7 +6423,7 @@ struct ggml_tensor * ggml_reshape_1d(
6231
  }
6232
 
6233
  const int64_t ne[1] = { ne0 };
6234
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a->data);
6235
  ggml_format_name(result, "%s (reshaped)", a->name);
6236
 
6237
  result->op = GGML_OP_RESHAPE;
@@ -6256,7 +6448,7 @@ struct ggml_tensor * ggml_reshape_2d(
6256
  }
6257
 
6258
  const int64_t ne[2] = { ne0, ne1 };
6259
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data);
6260
  ggml_format_name(result, "%s (reshaped)", a->name);
6261
 
6262
  result->op = GGML_OP_RESHAPE;
@@ -6282,7 +6474,7 @@ struct ggml_tensor * ggml_reshape_3d(
6282
  }
6283
 
6284
  const int64_t ne[3] = { ne0, ne1, ne2 };
6285
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data);
6286
  ggml_format_name(result, "%s (reshaped)", a->name);
6287
 
6288
  result->op = GGML_OP_RESHAPE;
@@ -6292,7 +6484,6 @@ struct ggml_tensor * ggml_reshape_3d(
6292
  return result;
6293
  }
6294
 
6295
-
6296
  struct ggml_tensor * ggml_reshape_4d(
6297
  struct ggml_context * ctx,
6298
  struct ggml_tensor * a,
@@ -6310,7 +6501,7 @@ struct ggml_tensor * ggml_reshape_4d(
6310
  }
6311
 
6312
  const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
6313
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a->data);
6314
  ggml_format_name(result, "%s (reshaped)", a->name);
6315
 
6316
  result->op = GGML_OP_RESHAPE;
@@ -6320,46 +6511,40 @@ struct ggml_tensor * ggml_reshape_4d(
6320
  return result;
6321
  }
6322
 
6323
- // ggml_view_1d
6324
-
6325
- static struct ggml_tensor * ggml_view_tensor_offset(
6326
  struct ggml_context * ctx,
6327
  struct ggml_tensor * a,
6328
  int n_dims,
6329
  const int64_t * ne,
6330
  size_t offset) {
6331
- // don't calculate an offset from an unallocated tensor
6332
- void * data = NULL;
6333
- if (a->data != NULL) {
6334
- data = (char *) a->data + offset;
6335
- }
6336
 
6337
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, data);
6338
 
 
 
 
 
 
6339
  ggml_format_name(result, "%s (view)", a->name);
6340
 
6341
  ggml_set_op_params(result, &offset, sizeof(offset));
6342
 
 
 
 
 
6343
  return result;
6344
  }
6345
 
 
 
6346
  struct ggml_tensor * ggml_view_1d(
6347
  struct ggml_context * ctx,
6348
  struct ggml_tensor * a,
6349
  int64_t ne0,
6350
  size_t offset) {
6351
 
6352
- bool is_node = false;
6353
-
6354
- if (a->grad) {
6355
- is_node = true;
6356
- }
6357
-
6358
- struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 1, &ne0, offset);
6359
-
6360
- result->op = GGML_OP_VIEW;
6361
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6362
- result->src[0] = a;
6363
 
6364
  return result;
6365
  }
@@ -6374,24 +6559,14 @@ struct ggml_tensor * ggml_view_2d(
6374
  size_t nb1,
6375
  size_t offset) {
6376
 
6377
- bool is_node = false;
6378
-
6379
- if (a->grad) {
6380
- is_node = true;
6381
- }
6382
-
6383
- const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
6384
 
6385
- struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 2, ne, offset);
6386
 
6387
  result->nb[1] = nb1;
6388
  result->nb[2] = result->nb[1]*ne1;
6389
  result->nb[3] = result->nb[2];
6390
 
6391
- result->op = GGML_OP_VIEW;
6392
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6393
- result->src[0] = a;
6394
-
6395
  return result;
6396
  }
6397
 
@@ -6407,24 +6582,14 @@ struct ggml_tensor * ggml_view_3d(
6407
  size_t nb2,
6408
  size_t offset) {
6409
 
6410
- bool is_node = false;
6411
-
6412
- if (a->grad) {
6413
- is_node = true;
6414
- }
6415
-
6416
- const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
6417
 
6418
- struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 3, ne, offset);
6419
 
6420
  result->nb[1] = nb1;
6421
  result->nb[2] = nb2;
6422
  result->nb[3] = result->nb[2]*ne2;
6423
 
6424
- result->op = GGML_OP_VIEW;
6425
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6426
- result->src[0] = a;
6427
-
6428
  return result;
6429
  }
6430
 
@@ -6442,24 +6607,14 @@ struct ggml_tensor * ggml_view_4d(
6442
  size_t nb3,
6443
  size_t offset) {
6444
 
6445
- bool is_node = false;
6446
-
6447
- if (a->grad) {
6448
- is_node = true;
6449
- }
6450
-
6451
- const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
6452
 
6453
- struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 4, ne, offset);
6454
 
6455
  result->nb[1] = nb1;
6456
  result->nb[2] = nb2;
6457
  result->nb[3] = nb3;
6458
 
6459
- result->op = GGML_OP_VIEW;
6460
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6461
- result->src[0] = a;
6462
-
6463
  return result;
6464
  }
6465
 
@@ -6646,7 +6801,7 @@ static struct ggml_tensor * ggml_diag_mask_inf_impl(
6646
 
6647
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6648
 
6649
- int32_t params[] = { n_past, inplace ? 1 : 0 };
6650
  ggml_set_op_params(result, params, sizeof(params));
6651
 
6652
  result->op = GGML_OP_DIAG_MASK_INF;
@@ -6663,7 +6818,6 @@ struct ggml_tensor * ggml_diag_mask_inf(
6663
  return ggml_diag_mask_inf_impl(ctx, a, n_past, false);
6664
  }
6665
 
6666
-
6667
  struct ggml_tensor * ggml_diag_mask_inf_inplace(
6668
  struct ggml_context * ctx,
6669
  struct ggml_tensor * a,
@@ -6686,7 +6840,7 @@ static struct ggml_tensor * ggml_diag_mask_zero_impl(
6686
 
6687
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6688
 
6689
- int32_t params[] = { n_past, inplace ? 1 : 0 };
6690
  ggml_set_op_params(result, params, sizeof(params));
6691
 
6692
  result->op = GGML_OP_DIAG_MASK_ZERO;
@@ -7475,6 +7629,8 @@ static struct ggml_tensor * ggml_add_rel_pos_impl(
7475
  }
7476
 
7477
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
 
7478
  result->op = GGML_OP_ADD_REL_POS;
7479
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7480
  result->src[0] = a;
@@ -9452,6 +9608,8 @@ static void ggml_compute_forward_div_f32(
9452
 
9453
 
9454
  #ifdef GGML_USE_ACCELERATE
 
 
9455
  vDSP_vdiv(
9456
  (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
9457
  (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
@@ -10758,7 +10916,8 @@ static void ggml_compute_forward_rms_norm_back_f32(
10758
 
10759
  GGML_TENSOR_BINARY_OP_LOCALS;
10760
 
10761
- const float eps = 1e-6f; // TODO: make this a parameter
 
10762
 
10763
  // TODO: optimize
10764
  for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -11936,8 +12095,8 @@ static void ggml_compute_forward_diag_mask_f32(
11936
  const int ith = params->ith;
11937
  const int nth = params->nth;
11938
 
11939
- const int n_past = ((int32_t *) dst->op_params)[0];
11940
- const bool inplace = (bool)((int32_t *) dst->op_params)[1];
11941
 
11942
  GGML_ASSERT(n_past >= 0);
11943
 
@@ -12148,6 +12307,7 @@ static void ggml_compute_forward_soft_max_back_f32(
12148
  // dx = J * dy
12149
  // dxk = sum_i(Jki * dyi)
12150
  // dxk = sum_i(-yk*yi * dyi) - (-yk*yk)*dyk + (yk - yk*yk)*dyk
 
12151
  // dxk = sum_i(-yk*yi * dyi) + yk*dyk
12152
  // dxk = -yk * sum_i(yi * dyi) + yk*dyk
12153
  // dxk = -yk * dot(y, dy) + yk*dyk
@@ -13938,7 +14098,7 @@ static void ggml_compute_forward_flash_attn_f32(
13938
  vvexpf(S, S, &Mup);
13939
  ggml_vec_sum_f32(Mup, &sum, S);
13940
  #else
13941
- uint16_t scvt[GGML_SOFT_MAX_UNROLL];
13942
  ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
13943
 
13944
  for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
@@ -13948,9 +14108,13 @@ static void ggml_compute_forward_flash_attn_f32(
13948
  if (SS[j] == -INFINITY) {
13949
  SS[j] = 0.0f;
13950
  } else {
 
 
 
13951
  ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
13952
  memcpy(&scvt[j], &s, sizeof(uint16_t));
13953
  const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
 
13954
  sump[j] += (ggml_float)val;
13955
  SS[j] = val;
13956
  }
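Note: the scvt/table_exp_f16 path shown above is ggml's fast exponential: the argument is first rounded to FP16 and its 16-bit pattern indexes a precomputed table of exp values, which is what the new GGML_FLASH_ATTN_EXP_FP16 toggle (added near the top of the file, currently commented out) appears to gate. The lookup itself, as a small sketch (helper name is illustrative; requires <string.h> plus ggml's FP16 macros and table):

    static inline float expf_via_fp16_table(float x) {
        ggml_fp16_t h = GGML_FP32_TO_FP16(x);          // quantize the argument to fp16
        uint16_t idx;
        memcpy(&idx, &h, sizeof(idx));                 // reinterpret the 16 bits as a table index
        return GGML_FP16_TO_FP32(table_exp_f16[idx]);  // 64K-entry precomputed exp table
    }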
@@ -14528,7 +14692,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
14528
  vvexpf(SM, SM, &Mup);
14529
  ggml_vec_sum_f32(Mup, &sum, SM);
14530
  #else
14531
- uint16_t scvt[GGML_SOFT_MAX_UNROLL];
14532
  ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
14533
 
14534
  for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
@@ -14539,9 +14703,13 @@ static void ggml_compute_forward_flash_attn_back_f32(
14539
  if (SR[j] == -INFINITY) {
14540
  SW[j] = 0.0f;
14541
  } else {
 
 
 
14542
  ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max);
14543
  memcpy(&scvt[j], &s, sizeof(uint16_t));
14544
  const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
 
14545
  sump[j] += (ggml_float)val;
14546
  SW[j] = val;
14547
  }
@@ -14987,11 +15155,8 @@ static void ggml_compute_forward_add_rel_pos_f32(
14987
  const struct ggml_tensor * src1,
14988
  const struct ggml_tensor * src2,
14989
  struct ggml_tensor * dst) {
14990
- GGML_ASSERT(ggml_are_same_shape(src0, dst));
14991
- GGML_ASSERT(src0->nb[0] == dst->nb[0] && src0->nb[1] == dst->nb[1]
14992
- && src0->nb[2] == dst->nb[2] && src0->nb[3] == dst->nb[3]);
14993
 
14994
- const bool inplace = dst->data == src0->data;
14995
  if (!inplace && params->type == GGML_TASK_INIT) {
14996
  memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
14997
  return;
@@ -15282,6 +15447,8 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
15282
  const int nc = src0->ne[0];
15283
  const int nr = ggml_nrows(src0);
15284
 
 
 
15285
  if (params->type == GGML_TASK_INIT) {
15286
  if (ith == 0) {
15287
  memset(sums, 0, sizeof(float) * (nth + nth * nc));
@@ -15293,7 +15460,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
15293
  if (ith == 0) {
15294
  float * dp = (float *) dst->data;
15295
  ggml_vec_sum_f32(nth, dp, sums);
15296
- dp[0] *= -1.0f;
15297
  }
15298
  return;
15299
  }
@@ -15310,7 +15477,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
15310
  for (int i1 = ir0; i1 < ir1; i1++) {
15311
  float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]);
15312
  float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]);
15313
- float * st = (float *) params->wdata + nth + ith*nc;
15314
 
15315
  #ifndef NDEBUG
15316
  for (int i = 0; i < nc; ++i) {
@@ -15325,15 +15492,19 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
15325
  float max = -INFINITY;
15326
  ggml_vec_max_f32(nc, &max, s0);
15327
 
15328
- uint16_t scvt;
15329
  for (int i = 0; i < nc; i++) {
15330
  if (s0[i] == -INFINITY) {
15331
  st[i] = 0.0f;
15332
  } else {
15333
- // const float val = (s0[i] == -INFINITY) ? 0.0 : exp(s0[i] - max);
 
 
 
15334
  ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
15335
  memcpy(&scvt, &s, sizeof(scvt));
15336
  const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
 
15337
  sum += (ggml_float)val;
15338
  st[i] = val;
15339
  }
@@ -15349,7 +15520,9 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
15349
  ggml_vec_log_f32(nc, st, st);
15350
  ggml_vec_mul_f32(nc, st, st, s1);
15351
 
15352
- ggml_vec_sum_f32(nc, sums + ith, st);
 
 
15353
 
15354
  #ifndef NDEBUG
15355
  for (int i = 0; i < nc; ++i) {
@@ -15399,7 +15572,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
15399
  return;
15400
  }
15401
 
15402
- const float eps = 1e-9f;
15403
 
15404
  // TODO: handle transposed/permuted matrices
15405
  const int64_t nc = src0->ne[0];
@@ -15418,7 +15591,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
15418
  float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]);
15419
  float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]);
15420
  float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]);
15421
- float * sm = (float *) params->wdata + ith*nc;
15422
 
15423
  #ifndef NDEBUG
15424
  for (int i = 0; i < nc; ++i) {
@@ -15427,54 +15599,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
15427
  assert(!isnan(s1[i]));
15428
  }
15429
  #endif
15430
- // step by step explanation:
15431
- {
15432
- //float * sums = (float *) params->wdata;
15433
-
15434
- // forward pass with annotated gradients from backward pass
15435
- // (built by going in reverse operation order, adding to gradients of current operation args)
15436
- // st0 = exp(s0-max(s0)) grad[st0] = grad[st1]*(1.0 - eps)/sum
15437
- // from softmax_back: grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1]))
15438
- // ggml_vec_scale_f32(nc, st, sum); // st1 = st0*/sum = softmax(s0) grad[st1] = grad[st2]*(1.0 - eps)
15439
- // ggml_vec_scale_f32(nc, st, (1.0f - eps)); // st2 = st1*(1.0 - eps) grad[st2] = grad[st3]
15440
- // ggml_vec_add1_f32(nc, st, st, eps); // st3 = st2 + eps grad[st3] = grad[st4]/st3
15441
- // ggml_vec_log_f32(nc, st, st); // st4 = log(st3) grad[st4] = grad[st5] * s1
15442
- // ggml_vec_mul_f32(nc, st, st, s1); // st5 = st4 * s1 grad[st5] = grad[sums[ith]]
15443
- // ggml_vec_sum_f32(nc, sums + ith, st); // sums[ith] = st5 grad[sums[ith]] = grad[cross_entropy_loss] = -grad[cel]
15444
-
15445
- // substitute into grad[st1], because we can reuse softmax_back from this point on
15446
- // grad[st1] = -grad[cel]*s1*(1.0 - eps)/(eps + softmax(s0)*(1.0 - eps))
15447
- // postorder:
15448
- // grad[st1] := softmax(s0)
15449
- // grad[st1] := grad[st1]*(1.0 - eps)
15450
- // grad[st1] := grad[st1] + eps
15451
- // grad[st1] := s1 / grad[st1]
15452
- // grad[st1] := grad[st1]*(1.0-eps)*-grad[cel]
15453
-
15454
- // src0 gradients by going through softmax_back
15455
- // grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1]))
15456
- // from softmax_back:
15457
- // dxk = yk * (dyk - dot(y, dy))
15458
- // dot_y_dy := dot(y, dy)
15459
- // dx := dy
15460
- // dx := dx - dot_y_dy
15461
- // dx := dx * y
15462
- // postorder:
15463
- // dot_st1_dst1 := dot(st1, grad[st1])
15464
- // grad[s0] := grad[st1]
15465
- // grad[s0] := grad[s0] - dot_st1_dst1
15466
- // grad[s0] := grad[s0] * st1
15467
-
15468
- // prepend postorder from grad[st1] directly using grad[s0] as memory location, as we will grad[s0] := grad[st1]
15469
- // sm := softmax(s0)
15470
- // grad[s0] := sm*(1.0 - eps)
15471
- // grad[s0] := grad[s0] + eps
15472
- // grad[s0] := s1 / grad[s0]
15473
- // grad[s0] := grad[s0]*(1.0-eps)*-grad[cel]
15474
- // dot_st1_dst1 := dot(sm, grad[s0])
15475
- // grad[s0] := grad[s0] - dot_st1_dst1
15476
- // grad[s0] := grad[s0] * sm
15477
- }
15478
 
15479
  // soft_max
15480
  ggml_float sum = 0.0;
@@ -15482,39 +15606,37 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
15482
  float max = -INFINITY;
15483
  ggml_vec_max_f32(nc, &max, s0);
15484
 
15485
- uint16_t scvt;
15486
  for (int i = 0; i < nc; i++) {
15487
  if (s0[i] == -INFINITY) {
15488
- sm[i] = 0.0f;
15489
  } else {
15490
- // const float val = (s0[i] == -INFINITY) ? 0.0 : exp(s0[i] - max);
 
 
 
15491
  ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
15492
  memcpy(&scvt, &s, sizeof(scvt));
15493
  const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
 
15494
  sum += (ggml_float)val;
15495
- sm[i] = val;
15496
  }
15497
  }
15498
 
15499
  assert(sum > 0.0);
15500
- sum = 1.0/sum;
15501
  }
15502
 
15503
- float dot_st1_dst1 = 0;
15504
- ggml_vec_scale_f32(nc, sm, sum);
15505
- ggml_vec_cpy_f32 (nc, ds0, sm);
15506
- ggml_vec_scale_f32(nc, ds0, (1.0f - eps));
15507
- ggml_vec_add1_f32 (nc, ds0, ds0, eps);
15508
- ggml_vec_div_f32 (nc, ds0, s1, ds0);
15509
- ggml_vec_scale_f32(nc, ds0, -(1.0f - eps)*d[0]);
15510
- ggml_vec_dot_f32 (nc, &dot_st1_dst1, sm, ds0);
15511
- ggml_vec_acc1_f32 (nc, ds0, -dot_st1_dst1);
15512
- ggml_vec_mul_f32 (nc, ds0, ds0, sm);
15513
 
15514
  #ifndef NDEBUG
15515
  for (int i = 0; i < nc; ++i) {
15516
- assert(!isnan(sm[i]));
15517
- assert(!isinf(sm[i]));
15518
  assert(!isnan(ds0[i]));
15519
  assert(!isinf(ds0[i]));
15520
  }
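Note: the long annotated derivation deleted above reduces to the textbook softmax cross-entropy gradient. With logits s0, target distribution s1 and incoming scalar gradient d, and ignoring the eps smoothing (standard identity, not a transcription of the new code):

    \nabla_{s_0} \Big(-\sum_i s_{1,i} \log \operatorname{softmax}(s_0)_i\Big)
        = \Big(\sum_i s_{1,i}\Big)\operatorname{softmax}(s_0) - s_1,
    \qquad
    ds_0 = d \cdot \big(\operatorname{softmax}(s_0) - s_1\big) \ \text{ when } \ \textstyle\sum_i s_{1,i} = 1.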
@@ -16069,9 +16191,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
16069
  {
16070
  // necessary for llama
16071
  if (src0->grad) {
 
 
 
16072
  src0->grad = ggml_add_impl(ctx,
16073
  src0->grad,
16074
- ggml_rms_norm_back(ctx, src0, tensor->grad),
16075
  inplace);
16076
  }
16077
  } break;
@@ -16839,9 +16964,7 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
16839
  return result;
16840
  }
16841
 
16842
- struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) {
16843
- struct ggml_cgraph result = *gf;
16844
-
16845
  GGML_ASSERT(gf->n_nodes > 0);
16846
 
16847
  // if we are keeping the gradient graph, we have to detach the gradient nodes from the original graph
@@ -16865,15 +16988,19 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg
16865
  }
16866
  }
16867
 
16868
- for (int i = gf->n_nodes - 1; i >= 0; i--) {
16869
  struct ggml_tensor * node = gf->nodes[i];
16870
 
16871
  if (node->is_param) {
16872
  GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
16873
- ggml_build_forward_expand(&result, node->grad);
16874
  }
16875
  }
 
16876
 
 
 
 
16877
  return result;
16878
  }
16879
 
@@ -17549,10 +17676,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
17549
  case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
17550
  {
17551
  n_tasks = n_threads;
17552
-
17553
- size_t cur = ggml_type_size(node->type)*node->src[0]->ne[0]*n_tasks;
17554
-
17555
- work_size = MAX(work_size, cur);
17556
  } break;
17557
  case GGML_OP_NONE:
17558
  {
@@ -18430,14 +18553,16 @@ static enum ggml_opt_result ggml_opt_adam(
18430
  struct ggml_opt_params params,
18431
  struct ggml_tensor * f,
18432
  struct ggml_cgraph * gf,
18433
- struct ggml_cgraph * gb) {
 
 
18434
  GGML_ASSERT(ggml_is_scalar(f));
18435
 
18436
  // these will store the parameters we want to optimize
18437
  struct ggml_tensor * ps[GGML_MAX_PARAMS];
18438
 
18439
  int np = 0;
18440
- int nx = 0;
18441
  for (int i = 0; i < gf->n_nodes; ++i) {
18442
  if (gf->nodes[i]->is_param) {
18443
  GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
@@ -18456,31 +18581,32 @@ static enum ggml_opt_result ggml_opt_adam(
18456
  }
18457
 
18458
  // constants
18459
- const float sched = params.adam.sched;
18460
- const float decay = params.adam.decay * sched;
18461
- const float alpha = params.adam.alpha * sched;
18462
  const float beta1 = params.adam.beta1;
18463
  const float beta2 = params.adam.beta2;
18464
  const float eps = params.adam.eps;
 
 
18465
 
18466
- float * x = opt->adam.x->data; // view of the parameters
18467
- float * g1 = opt->adam.g1->data; // gradient
18468
- float * g2 = opt->adam.g2->data; // gradient squared
18469
  float * m = opt->adam.m->data; // first moment
18470
  float * v = opt->adam.v->data; // second moment
18471
- float * mh = opt->adam.mh->data; // first moment hat
18472
- float * vh = opt->adam.vh->data; // second moment hat
18473
 
18474
  float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values
18475
 
18476
- // update view
18477
- ggml_opt_get_params(np, ps, x);
 
18478
 
18479
  // compute the function value
18480
  ggml_graph_reset (gf);
18481
  ggml_set_f32 (f->grad, 1.0f);
18482
 
18483
- ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);
 
 
 
18484
 
18485
  opt->adam.fx_prev = ggml_get_f32_1d(f, 0);
18486
  opt->adam.fx_best = opt->adam.fx_prev;
@@ -18488,6 +18614,9 @@ static enum ggml_opt_result ggml_opt_adam(
18488
  pf[opt->iter % params.past] = opt->adam.fx_prev;
18489
  }
18490
 
 
 
 
18491
  // initialize
18492
  if (opt->just_initialized) {
18493
  opt->adam.n_no_improvement = 0;
@@ -18520,50 +18649,55 @@ static enum ggml_opt_result ggml_opt_adam(
18520
  UNUSED(t_start_cpu);
18521
 
18522
  {
18523
- // update the gradient
18524
- ggml_opt_get_grad(np, ps, g1);
18525
-
18526
- // m_t = beta1*m_t-1 + (1 - beta1)*g_t
18527
- ggml_vec_scale_f32(nx, m, beta1);
18528
- ggml_vec_mad_f32 (nx, m, g1, 1.0f - beta1);
18529
-
18530
- // g2 = g1^2
18531
- ggml_vec_sqr_f32 (nx, g2, g1);
18532
-
18533
- // v_t = beta2*v_t-1 + (1 - beta2)*g_t^2
18534
- ggml_vec_scale_f32(nx, v, beta2);
18535
- ggml_vec_mad_f32 (nx, v, g2, 1.0f - beta2);
18536
-
18537
- // m^hat = m_t / (1 - beta1^t)
18538
- // v^hat = v_t / (1 - beta2^t)
18539
- // x_t = x_t-1 - sched*(alpha*m^hat/(sqrt(v^hat) + eps) + decay*x_t-1)
18540
- // x_t = x_t-1 - sched*alpha*m^hat/(sqrt(v^hat) + eps) - sched*decay*x_t-1
18541
- // x_t = x_t-1*(1-sched*decay) - sched*alpha*m^hat/(sqrt(v^hat) + eps)
18542
- // x_t = x_t-1*(1-sched*decay) + sched*decay*(-alpha/decay)*m^hat/(sqrt(v^hat) + eps)
18543
- // x_t = mix(x_t-1, (-alpha/decay)*m^hat/(sqrt(v^hat) + eps), sched*decay)
18544
- ggml_vec_cpy_f32 (nx, mh, m);
18545
- ggml_vec_cpy_f32 (nx, vh, v);
18546
-
18547
- ggml_vec_scale_f32(nx, mh, alpha/(1.0f - powf(beta1, opt->iter)));
18548
- ggml_vec_scale_f32(nx, vh, 1.0f/(1.0f - powf(beta2, opt->iter)));
18549
-
18550
- ggml_vec_sqrt_f32 (nx, vh, vh);
18551
- ggml_vec_acc1_f32 (nx, vh, eps);
18552
-
18553
- ggml_vec_div_f32 (nx, mh, mh, vh);
18554
- ggml_vec_scale_f32(nx, x, 1.0f - decay);
18555
- ggml_vec_sub_f32 (nx, x, x, mh);
 
 
 
18556
 
18557
- // update the parameters
18558
- ggml_opt_set_params(np, ps, x);
18559
  }
18560
 
18561
  ggml_graph_reset (gf);
18562
  ggml_set_f32 (f->grad, 1.0f);
18563
 
18564
- ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);
18565
 
18566
  const float fx = ggml_get_f32_1d(f, 0);
 
 
18567
 
18568
  // check convergence
18569
  if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) {
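Note: the removed ggml_vec_* sequence spelled out one Adam step with scheduled weight decay; the same update, written compactly from the comments that were deleted:

    m_t = \beta_1 m_{t-1} + (1-\beta_1)\, g_t, \qquad
    v_t = \beta_2 v_{t-1} + (1-\beta_2)\, g_t^2,
    \qquad
    \hat m_t = \frac{m_t}{1-\beta_1^{\,t}}, \qquad
    \hat v_t = \frac{v_t}{1-\beta_2^{\,t}},
    \qquad
    x_t = (1 - \mathrm{sched}\cdot\mathrm{decay})\, x_{t-1}
          \;-\; \mathrm{sched}\cdot\alpha\,\frac{\hat m_t}{\sqrt{\hat v_t} + \epsilon}.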
@@ -18632,7 +18766,6 @@ struct ggml_lbfgs_iteration_data {
18632
  };
18633
 
18634
  static enum ggml_opt_result linesearch_backtracking(
18635
- struct ggml_context * ctx,
18636
  const struct ggml_opt_params * params,
18637
  int nx,
18638
  float * x,
@@ -18644,8 +18777,11 @@ static enum ggml_opt_result linesearch_backtracking(
18644
  struct ggml_tensor * f,
18645
  struct ggml_cgraph * gf,
18646
  struct ggml_cgraph * gb,
 
18647
  const int np,
18648
- struct ggml_tensor * ps[]) {
 
 
18649
  int count = 0;
18650
 
18651
  float width = 0.0f;
@@ -18674,6 +18810,12 @@ static enum ggml_opt_result linesearch_backtracking(
18674
  dgtest = params->lbfgs.ftol*dginit;
18675
 
18676
  while (true) {
 
 
 
 
 
 
18677
  ggml_vec_cpy_f32(nx, x, xp);
18678
  ggml_vec_mad_f32(nx, x, d, *step);
18679
 
@@ -18684,7 +18826,7 @@ static enum ggml_opt_result linesearch_backtracking(
18684
  ggml_graph_reset (gf);
18685
  ggml_set_f32 (f->grad, 1.0f);
18686
 
18687
- ggml_graph_compute_with_ctx(ctx, gb, params->n_threads);
18688
 
18689
  ggml_opt_get_grad(np, ps, g);
18690
 
@@ -18718,7 +18860,6 @@ static enum ggml_opt_result linesearch_backtracking(
18718
  // strong Wolfe condition (GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE)
18719
  return count;
18720
  }
18721
- return count;
18722
  }
18723
  }
18724
 
@@ -18744,7 +18885,9 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18744
  struct ggml_opt_params params,
18745
  struct ggml_tensor * f,
18746
  struct ggml_cgraph * gf,
18747
- struct ggml_cgraph * gb) {
 
 
18748
  if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE ||
18749
  params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) {
18750
  if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) {
@@ -18776,6 +18919,10 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18776
  opt->iter = iter;
18777
  }
18778
 
 
 
 
 
18779
  float * x = opt->lbfgs.x->data; // current parameters
18780
  float * xp = opt->lbfgs.xp->data; // previous parameters
18781
  float * g = opt->lbfgs.g->data; // current gradient
@@ -18797,6 +18944,12 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18797
  float * lm_s = opt->lbfgs.lms->data;
18798
  float * lm_y = opt->lbfgs.lmy->data;
18799
 
 
 
 
 
 
 
18800
  // evaluate the function value and its gradient
18801
  {
18802
  ggml_opt_set_params(np, ps, x);
@@ -18804,11 +18957,14 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18804
  ggml_graph_reset (gf);
18805
  ggml_set_f32 (f->grad, 1.0f);
18806
 
18807
- ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);
18808
 
18809
  ggml_opt_get_grad(np, ps, g);
18810
 
18811
  fx = ggml_get_f32_1d(f, 0);
 
 
 
18812
  }
18813
 
18814
  // search direction = -gradient
@@ -18863,7 +19019,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18863
  ggml_vec_cpy_f32(nx, xp, x);
18864
  ggml_vec_cpy_f32(nx, gp, g);
18865
 
18866
- ls = linesearch_backtracking(ctx, &params, nx, x, &fx, g, d, step, xp, f, gf, gb, np, ps);
18867
 
18868
  if (ls < 0) {
18869
  // linesearch failed - go back to the previous point and return
@@ -18873,6 +19029,8 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18873
  return ls;
18874
  }
18875
 
 
 
18876
  ggml_vec_norm_f32(nx, &xnorm, x);
18877
  ggml_vec_norm_f32(nx, &gnorm, g);
18878
 
@@ -18930,7 +19088,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18930
  // ys = y^t \cdot s -> 1 / \rho.
18931
  // yy = y^t \cdot y.
18932
  //
18933
- ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0] *nx]);
18934
  ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]);
18935
 
18936
  lm_ys[end[0]] = ys;
@@ -18993,13 +19151,15 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
18993
  .adam = {
18994
  .n_iter = 10000,
18995
  .sched = 1.000f,
18996
- .decay = 0.001f,
 
18997
  .alpha = 0.001f,
18998
  .beta1 = 0.9f,
18999
  .beta2 = 0.999f,
19000
  .eps = 1e-8f,
19001
  .eps_f = 1e-5f,
19002
  .eps_g = 1e-3f,
 
19003
  },
19004
  };
19005
  } break;
@@ -19049,23 +19209,13 @@ GGML_API void ggml_opt_init(
19049
  switch (opt->params.type) {
19050
  case GGML_OPT_ADAM:
19051
  {
19052
- opt->adam.x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
19053
- opt->adam.g1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
19054
- opt->adam.g2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
19055
  opt->adam.m = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
19056
  opt->adam.v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
19057
- opt->adam.mh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
19058
- opt->adam.vh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
19059
  opt->adam.pf = params.past > 0
19060
  ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past)
19061
  : NULL;
19062
- ggml_set_zero(opt->adam.x);
19063
- ggml_set_zero(opt->adam.g1);
19064
- ggml_set_zero(opt->adam.g2);
19065
  ggml_set_zero(opt->adam.m);
19066
  ggml_set_zero(opt->adam.v);
19067
- ggml_set_zero(opt->adam.mh);
19068
- ggml_set_zero(opt->adam.vh);
19069
  if (opt->adam.pf) {
19070
  ggml_set_zero(opt->adam.pf);
19071
  }
@@ -19149,7 +19299,7 @@ enum ggml_opt_result ggml_opt_resume(
19149
  *gf = ggml_build_forward (f);
19150
  *gb = ggml_build_backward(ctx, gf, true);
19151
 
19152
- return ggml_opt_resume_g(ctx, opt, f, gf, gb);
19153
  }
19154
 
19155
  enum ggml_opt_result ggml_opt_resume_g(
@@ -19157,7 +19307,9 @@ enum ggml_opt_result ggml_opt_resume_g(
19157
  struct ggml_opt_context * opt,
19158
  struct ggml_tensor * f,
19159
  struct ggml_cgraph * gf,
19160
- struct ggml_cgraph * gb) {
 
 
19161
 
19162
  // build forward + backward compute graphs
19163
  enum ggml_opt_result result = GGML_OPT_OK;
@@ -19165,11 +19317,11 @@ enum ggml_opt_result ggml_opt_resume_g(
19165
  switch (opt->params.type) {
19166
  case GGML_OPT_ADAM:
19167
  {
19168
- result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb);
19169
  } break;
19170
  case GGML_OPT_LBFGS:
19171
  {
19172
- result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb);
19173
  } break;
19174
  }
19175
 
@@ -19624,7 +19776,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
19624
 
19625
  // read the kv pairs
19626
  {
19627
- ctx->kv = GGML_ALIGNED_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));
19628
 
19629
  for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
19630
  struct gguf_kv * kv = &ctx->kv[i];
@@ -19707,7 +19859,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
19707
 
19708
  // read the tensor infos
19709
  {
19710
- ctx->infos = GGML_ALIGNED_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
19711
 
19712
  for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
19713
  struct gguf_tensor_info * info = &ctx->infos[i];
@@ -19908,7 +20060,7 @@ void gguf_free(struct gguf_context * ctx) {
19908
  }
19909
  }
19910
 
19911
- GGML_ALIGNED_FREE(ctx->kv);
19912
  }
19913
 
19914
  if (ctx->infos) {
@@ -19920,7 +20072,7 @@ void gguf_free(struct gguf_context * ctx) {
19920
  }
19921
  }
19922
 
19923
- GGML_ALIGNED_FREE(ctx->infos);
19924
  }
19925
 
19926
  GGML_ALIGNED_FREE(ctx);
 
47
  // disable "possible loss of data" to avoid hundreds of casts
48
  // we should just be careful :)
49
  #pragma warning(disable: 4244 4267)
50
+
51
+ // disable POSIX deprecation warnings
52
+ // these functions are never going away, anyway
53
+ #pragma warning(disable: 4996)
54
  #endif
55
 
56
  #if defined(_WIN32)
 
127
  #define GGML_GELU_FP16
128
  #define GGML_GELU_QUICK_FP16
129
  #define GGML_SILU_FP16
130
+ // #define GGML_CROSS_ENTROPY_EXP_FP16
131
+ // #define GGML_FLASH_ATTN_EXP_FP16
132
 
133
  #define GGML_SOFT_MAX_UNROLL 4
134
  #define GGML_VEC_DOT_UNROLL 2
 
192
  //
193
 
194
  #if defined(_MSC_VER) || defined(__MINGW32__)
195
+ #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
196
+ #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
197
  #else
198
  inline static void * ggml_aligned_malloc(size_t size) {
199
  void * aligned_memory = NULL;
 
218
  }
219
  return aligned_memory;
220
  }
221
+ #define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
222
+ #define GGML_ALIGNED_FREE(ptr) free(ptr)
223
  #endif
224
 
225
  #define UNUSED GGML_UNUSED
 
307
  #endif
308
  #endif
309
 
310
+ #ifdef __riscv_v_intrinsic
311
+ #include <riscv_vector.h>
312
+ #endif
313
+
314
  #ifdef __F16C__
315
 
316
  #ifdef _MSC_VER
 
675
  }
676
 
677
  static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
678
+ #if __AVXVNNI__
679
  const __m256i zero = _mm256_setzero_si256();
680
  const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
681
  return _mm256_cvtepi32_ps(summed_pairs);
 
688
 
689
  // multiply int8_t, add results pairwise twice and return as float vector
690
  static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
691
+ #if __AVXVNNIINT8__
692
  const __m256i zero = _mm256_setzero_si256();
693
  const __m256i summed_pairs = _mm256_dpbssd_epi32(zero, x, y);
694
  return _mm256_cvtepi32_ps(summed_pairs);
 
704
  static inline __m128i packNibbles( __m256i bytes )
705
  {
706
  // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
707
+ #if __AVX512F__
708
  const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4); // 0000_0000_abcd_0000
709
  bytes = _mm256_or_si256(bytes, bytes_srli_4); // 0000_abcd_abcd_efgh
710
  return _mm256_cvtepi16_epi8(bytes); // abcd_efgh
 
823
 
824
  #if !defined(__aarch64__)
825
 
 
 
 
 
 
 
 
 
 
 
 
826
  inline static int32_t vaddvq_s32(int32x4_t v) {
827
  return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
828
  }
 
831
  return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
832
  }
833
 
 
 
 
 
 
 
834
  inline static float vmaxvq_f32(float32x4_t v) {
835
  return
836
  MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
 
1258
  #endif
1259
  }
1260
  #else
 
1261
  // scalar
1262
  quantize_row_q8_0_reference(x, y, k);
1263
  #endif
 
1476
  #endif
1477
  }
1478
  #else
 
1479
  // scalar
1480
  quantize_row_q8_1_reference(x, y, k);
1481
  #endif
 
2641
  }
2642
 
2643
  *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
2644
+ #elif defined(__riscv_v_intrinsic)
2645
+ float sumf = 0.0;
2646
+
2647
+ size_t vl = __riscv_vsetvl_e8m1(qk/2);
2648
+
2649
+ for (int i = 0; i < nb; i++) {
2650
+ vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
2651
+
2652
+ vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
2653
+ vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
2654
+
2655
+ vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
2656
+ vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
2657
+
2658
+ vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
2659
+ vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
2660
+
2661
+ vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 8, vl);
2662
+ vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 8, vl);
2663
+
2664
+ vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
2665
+ vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
2666
+
2667
+ vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
2668
+
2669
+ vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
2670
+ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
2671
+
2672
+ int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
2673
+ sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
2674
+
2675
+ sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
2676
+ }
2677
+
2678
+ *s = sumf;
2679
  #else
2680
  // scalar
2681
  float sumf = 0.0;
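Note: the RISC-V vector loop above mirrors the scalar Q4_0×Q8_0 kernel: each block stores 32 weights as packed 4-bit nibbles plus one FP16 scale, the nibbles are re-centered by subtracting 8, multiplied against the int8 activations with widening, and the integer sum is scaled by both block scales. A self-contained scalar sketch for one block (function name is illustrative; layout follows ggml's block_q4_0/block_q8_0 with the scales already converted to float):

    #include <stdint.h>

    static float dot_q4_0_q8_0_block(float x_d, const uint8_t x_qs[16],
                                     float y_d, const int8_t  y_qs[32]) {
        int sumi = 0;
        for (int j = 0; j < 16; ++j) {
            const int v0 = (x_qs[j] & 0x0F) - 8;   // low nibble  -> [-8, 7]
            const int v1 = (x_qs[j] >>   4) - 8;   // high nibble -> [-8, 7]
            sumi += v0*y_qs[j] + v1*y_qs[j + 16];  // low half pairs with y[0..15], high with y[16..31]
        }
        return sumi * x_d * y_d;                   // apply both per-block scales
    }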
 
2802
  }
2803
 
2804
  *s = hsum_float_8(acc) + summs;
2805
+ #elif defined(__riscv_v_intrinsic)
2806
+ float sumf = 0.0;
2807
+
2808
+ size_t vl = __riscv_vsetvl_e8m1(qk/2);
2809
+
2810
+ for (int i = 0; i < nb; i++) {
2811
+ vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
2812
+
2813
+ vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
2814
+ vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
2815
+
2816
+ vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
2817
+ vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
2818
+
2819
+ vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
2820
+ vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
2821
+
2822
+ vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
2823
+ vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
2824
+
2825
+ vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
2826
+
2827
+ vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
2828
+ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
2829
+
2830
+ int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
2831
+ sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
2832
+
2833
+ sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
2834
+ }
2835
+
2836
+ *s = sumf;
2837
  #else
2838
  // scalar
2839
  float sumf = 0.0;
 
3068
  }
3069
 
3070
  *s = hsum_float_8(acc);
3071
+ #elif defined(__riscv_v_intrinsic)
3072
+ float sumf = 0.0;
3073
+
3074
+ uint32_t qh;
3075
+
3076
+ // These temp values are for masking and shift operations
3077
+ uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
3078
+ uint32_t temp_2[16] = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
3079
+ 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000};
3080
+
3081
+ size_t vl = __riscv_vsetvl_e8m1(qk/2);
3082
+
3083
+ for (int i = 0; i < nb; i++) {
3084
+ memcpy(&qh, x[i].qh, sizeof(uint32_t));
3085
+
3086
+ // temporary registers
3087
+ vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_2, vl);
3088
+ vuint32m4_t vt_2 = __riscv_vle32_v_u32m4(temp_1, vl);
3089
+ vuint32m4_t vt_3 = __riscv_vsll_vx_u32m4(vt_1, 16, vl);
3090
+ vuint32m4_t vt_4 = __riscv_vadd_vx_u32m4(vt_2, 12, vl);
3091
+
3092
+ // ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
3093
+ vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(vt_1, qh, vl);
3094
+ vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(xha_0, vt_2, vl);
3095
+ vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl);
3096
+
3097
+ // ((qh & (1u << (j + 16))) >> (j + 12));
3098
+ vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(vt_3, qh, vl);
3099
+ vuint32m4_t xhl_1 = __riscv_vsrl_vv_u32m4(xha_1, vt_4, vl);
3100
+
3101
+ // narrowing
3102
+ vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xhl_0, vl);
3103
+ vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl);
3104
+
3105
+ vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xhl_1, vl);
3106
+ vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl);
3107
+
3108
+ // load
3109
+ vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
3110
+
3111
+ vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
3112
+ vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
3113
+
3114
+ vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
3115
+ vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
3116
+
3117
+ vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl);
3118
+ vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl);
3119
+
3120
+ vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
3121
+ vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
3122
+
3123
+ vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 16, vl);
3124
+ vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 16, vl);
3125
+
3126
+ vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
3127
+ vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
3128
+
3129
+ vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
3130
+
3131
+ vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
3132
+ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
3133
+
3134
+ int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
3135
+ sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
3136
+
3137
+ sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi;
3138
+ }
3139
+
3140
+ *s = sumf;
3141
  #else
3142
  // scalar
3143
  float sumf = 0.0;
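Note: Q5_0 keeps the fifth bit of each of the 32 weights in a separate 32-bit mask qh; the vector code reconstructs it by shifting each bit into position 4, OR-ing it onto the corresponding nibble, and subtracting 16. The per-element form of the two expressions quoted in the comments above (helper name is illustrative):

    #include <stdint.h>

    static inline void q5_0_unpack(uint32_t qh, uint8_t q, int j, int * v0, int * v1) {
        const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; // bit j      -> bit 4
        const uint8_t xh_1 =  (qh & (1u << (j + 16))) >> (j + 12);       // bit j + 16 -> bit 4
        *v0 = (int)((q & 0x0F) | xh_0) - 16;   // 5-bit value re-centered to [-16, 15]
        *v1 = (int)((q >>   4) | xh_1) - 16;
    }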
 
3394
  }
3395
 
3396
  *s = hsum_float_8(acc) + summs;
3397
+ #elif defined(__riscv_v_intrinsic)
3398
+ float sumf = 0.0;
3399
+
3400
+ uint32_t qh;
3401
+
3402
+ // These temp values are for shift operations
3403
+ uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
3404
+
3405
+ size_t vl = __riscv_vsetvl_e8m1(qk/2);
3406
+
3407
+ for (int i = 0; i < nb; i++) {
3408
+ memcpy(&qh, x[i].qh, sizeof(uint32_t));
3409
+
3410
+ // temporary registers
3411
+ vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_1, vl);
3412
+ vuint32m4_t vt_2 = __riscv_vadd_vx_u32m4(vt_1, 12, vl);
3413
+
3414
+ // load qh
3415
+ vuint32m4_t vqh = __riscv_vmv_v_x_u32m4(qh, vl);
3416
+
3417
+ // ((qh >> (j + 0)) << 4) & 0x10;
3418
+ vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(vqh, vt_1, vl);
3419
+ vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl);
3420
+ vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(xhl_0, 0x10, vl);
3421
+
3422
+ // ((qh >> (j + 12)) ) & 0x10;
3423
+ vuint32m4_t xhr_1 = __riscv_vsrl_vv_u32m4(vqh, vt_2, vl);
3424
+ vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(xhr_1, 0x10, vl);
3425
+
3426
+ // narrowing
3427
+ vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xha_0, vl);
3428
+ vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl);
3429
+
3430
+ vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xha_1, vl);
3431
+ vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl);
3432
+
3433
+ // load
3434
+ vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
3435
+
3436
+ vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
3437
+ vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
3438
+
3439
+ vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
3440
+ vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
3441
+
3442
+ vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl);
3443
+ vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl);
3444
+
3445
+ vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
3446
+ vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
3447
+
3448
+ vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
3449
+ vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
3450
+
3451
+ vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
3452
+
3453
+ vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
3454
+ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
3455
+
3456
+ int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
3457
+ sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
3458
+
3459
+ sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
3460
+ }
3461
+
3462
+ *s = sumf;
3463
  #else
3464
  // scalar
3465
  float sumf = 0.0;
 
3571
  }
3572
 
3573
  *s = hsum_float_8(acc);
3574
+ #elif defined(__riscv_v_intrinsic)
3575
+ float sumf = 0.0;
3576
+ size_t vl = __riscv_vsetvl_e8m1(qk);
3577
+
3578
+ for (int i = 0; i < nb; i++) {
3579
+ // load elements
3580
+ vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl);
3581
+ vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl);
3582
+
3583
+ vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl);
3584
+
3585
+ vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl);
3586
+ vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl);
3587
+
3588
+ int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum);
3589
+
3590
+ sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d));
3591
+ }
3592
+
3593
+ *s = sumf;
3594
  #else
3595
  // scalar
3596
  float sumf = 0.0;
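Note: the Q8_0×Q8_0 case needs no unpacking: it is a plain 32-element int8 dot product, widened before the reduction and then scaled by the two FP16 block scales, which is exactly what the vwmul/vwredsum pair above computes. Scalar equivalent for one block (illustrative helper name):

    #include <stdint.h>

    static float dot_q8_0_q8_0_block(float x_d, const int8_t x_qs[32],
                                     float y_d, const int8_t y_qs[32]) {
        int sumi = 0;
        for (int j = 0; j < 32; ++j) {
            sumi += (int)x_qs[j] * (int)y_qs[j];   // widen to int before accumulating
        }
        return sumi * x_d * y_d;
    }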
 
4291
  }
4292
 
4293
  size_t ggml_nbytes(const struct ggml_tensor * tensor) {
4294
+ size_t nbytes = tensor->ne[0]*tensor->nb[0]/ggml_blck_size(tensor->type);
4295
+ for (int i = 1; i < GGML_MAX_DIMS; ++i) {
4296
+ nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
4297
+ }
4298
+ return nbytes;
 
 
 
 
 
4299
  }
4300
 
4301
  size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
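Note: the rewritten ggml_nbytes above derives the size purely from the strides: the innermost dimension contributes ne[0]*nb[0]/blck_size bytes and every outer dimension adds (ne[i]-1)*nb[i], which also gives a sensible answer for non-contiguous views. A standalone worked example for a contiguous 4x3 f32 tensor:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        const int64_t ne[4] = { 4, 3, 1, 1 };
        const size_t  nb[4] = { 4, 16, 48, 48 }; // nb[i] = nb[i-1]*ne[i-1] when contiguous
        size_t nbytes = ne[0]*nb[0]/1;           // blck_size is 1 for f32
        for (int i = 1; i < 4; ++i) {
            nbytes += (ne[i] - 1)*nb[i];         // offset of the last element along dimension i
        }
        printf("%zu\n", nbytes);                 // prints 48 = 4*3*sizeof(float)
        return 0;
    }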
 
4749
  enum ggml_type type,
4750
  int n_dims,
4751
  const int64_t * ne,
4752
+ struct ggml_tensor * view_src,
4753
+ size_t view_offs) {
4754
 
4755
  assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
4756
 
4757
+ // find the base tensor and absolute offset
4758
+ if (view_src != NULL && view_src->view_src != NULL) {
4759
+ view_offs += view_src->view_offs;
4760
+ view_src = view_src->view_src;
4761
+ }
4762
 
4763
+ size_t data_size = ggml_type_size(type)*(ne[0]/ggml_blck_size(type));
4764
+ for (int i = 1; i < n_dims; i++) {
4765
+ data_size *= ne[i];
 
 
4766
  }
4767
 
4768
+ GGML_ASSERT(view_src == NULL || data_size + view_offs <= ggml_nbytes(view_src));
4769
+
4770
+ void * data = view_src != NULL ? view_src->data : NULL;
4771
+ if (data != NULL) {
4772
+ data = (char *) data + view_offs;
4773
+ }
 
 
4774
 
4775
+ size_t obj_alloc_size = 0;
4776
 
4777
+ if (view_src == NULL && ctx->no_alloc == false) {
4778
+ if (ctx->scratch.data != NULL) {
4779
+ // allocate tensor data in the scratch buffer
4780
+ if (ctx->scratch.offs + data_size > ctx->scratch.size) {
4781
+ GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
4782
+ __func__, ctx->scratch.offs + data_size, ctx->scratch.size);
4783
+ assert(false);
4784
+ return NULL;
4785
+ }
4786
 
4787
+ data = (char * const) ctx->scratch.data + ctx->scratch.offs;
4788
+
4789
+ ctx->scratch.offs += data_size;
4790
+ } else {
4791
+ // allocate tensor data in the context's memory pool
4792
+ obj_alloc_size = data_size;
4793
+ }
4794
  }
4795
 
4796
+ struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size);
4797
 
4798
  // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here
4799
 
 
4813
  /*.perf_runs =*/ 0,
4814
  /*.perf_cycles =*/ 0,
4815
  /*.perf_time_us =*/ 0,
4816
+ /*.view_src =*/ view_src,
4817
+ /*.view_offs =*/ view_offs,
4818
+ /*.data =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data,
4819
  /*.name =*/ { 0 },
4820
  /*.extra =*/ NULL,
4821
  /*.padding =*/ { 0 },
 
4839
  return result;
4840
  }
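Note: with the new view_src/view_offs fields the base tensor is resolved once, at creation time: a view of a view collapses to the original allocation plus an accumulated byte offset, and data may stay NULL until the allocator assigns the base. A standalone sketch of that normalization step (toy struct and function names are illustrative):

    #include <stddef.h>

    struct toy_tensor {
        struct toy_tensor * view_src;  // base tensor this one aliases, or NULL
        size_t              view_offs; // byte offset into the base tensor's data
        void              * data;
    };

    // collapse a (possibly nested) view to its base tensor and absolute offset
    static struct toy_tensor * resolve_view(struct toy_tensor * t, size_t * offs) {
        while (t != NULL && t->view_src != NULL) {
            *offs += t->view_offs;
            t      = t->view_src;
        }
        return t;
    }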
4841
 
 
 
 
 
 
4842
  struct ggml_tensor * ggml_new_tensor(
4843
  struct ggml_context * ctx,
4844
  enum ggml_type type,
4845
  int n_dims,
4846
  const int64_t * ne) {
4847
+ return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL, 0);
4848
  }
4849
 
4850
  struct ggml_tensor * ggml_new_tensor_1d(
 
4909
  }
4910
 
4911
  struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
4912
+ return ggml_new_tensor(ctx, src->type, src->n_dims, src->ne);
4913
+ }
4914
+
4915
+ static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
4916
+ GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
4917
+ assert(params_size <= GGML_MAX_OP_PARAMS);
4918
+ memcpy(tensor->op_params, params, params_size);
4919
+ }
4920
+
4921
+ static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
4922
+ assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
4923
+ return ((const int32_t *)(tensor->op_params))[i];
4924
+ }
4925
+
4926
+ static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
4927
+ assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
4928
+ ((int32_t *)(tensor->op_params))[i] = value;
4929
  }
4930
 
4931
  struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
 
5201
  return tensor;
5202
  }
5203
 
 
 
 
 
 
 
 
5204
  struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...) {
5205
  va_list args;
5206
  va_start(args, fmt);
 
5211
 
5212
  struct ggml_tensor * ggml_view_tensor(
5213
  struct ggml_context * ctx,
5214
+ struct ggml_tensor * src) {
5215
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src, 0);
5216
  ggml_format_name(result, "%s (view)", src->name);
5217
 
5218
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
5219
+ result->nb[i] = src->nb[i];
5220
+ }
 
5221
 
5222
  return result;
5223
  }
 
5790
 
5791
  // ggml_concat
5792
 
5793
+ struct ggml_tensor * ggml_concat(
5794
  struct ggml_context* ctx,
5795
  struct ggml_tensor* a,
5796
  struct ggml_tensor* b) {
 
6057
  struct ggml_tensor * ggml_rms_norm_back(
6058
  struct ggml_context * ctx,
6059
  struct ggml_tensor * a,
6060
+ struct ggml_tensor * b,
6061
+ float eps) {
6062
  bool is_node = false;
6063
 
6064
  if (a->grad) {
 
6068
 
6069
  struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
6070
 
6071
+ ggml_set_op_params(result, &eps, sizeof(eps));
6072
+
6073
  result->op = GGML_OP_RMS_NORM_BACK;
6074
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6075
  result->src[0] = a;
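With the new signature the epsilon is supplied by the caller (and stored in op_params above) instead of being hard-coded. A usage sketch; n_embd, n_tokens and the gradient tensor are placeholders, and eps should match the one used in the forward ggml_rms_norm:

    const int64_t n_embd = 4096, n_tokens = 8; // placeholder sizes
    const float eps = 1e-6f;

    struct ggml_tensor * x  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens); // activations
    struct ggml_tensor * dy = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens); // upstream gradient
    struct ggml_tensor * dx = ggml_rms_norm_back(ctx, x, dy, eps);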
 
6399
  //GGML_ASSERT(false);
6400
  }
6401
 
6402
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a, 0);
6403
  ggml_format_name(result, "%s (reshaped)", a->name);
6404
 
6405
  result->op = GGML_OP_RESHAPE;
 
6423
  }
6424
 
6425
  const int64_t ne[1] = { ne0 };
6426
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a, 0);
6427
  ggml_format_name(result, "%s (reshaped)", a->name);
6428
 
6429
  result->op = GGML_OP_RESHAPE;
 
6448
  }
6449
 
6450
  const int64_t ne[2] = { ne0, ne1 };
6451
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a, 0);
6452
  ggml_format_name(result, "%s (reshaped)", a->name);
6453
 
6454
  result->op = GGML_OP_RESHAPE;
 
6474
  }
6475
 
6476
  const int64_t ne[3] = { ne0, ne1, ne2 };
6477
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a, 0);
6478
  ggml_format_name(result, "%s (reshaped)", a->name);
6479
 
6480
  result->op = GGML_OP_RESHAPE;
 
6484
  return result;
6485
  }
6486
 
 
6487
  struct ggml_tensor * ggml_reshape_4d(
6488
  struct ggml_context * ctx,
6489
  struct ggml_tensor * a,
 
6501
  }
6502
 
6503
  const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
6504
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a, 0);
6505
  ggml_format_name(result, "%s (reshaped)", a->name);
6506
 
6507
  result->op = GGML_OP_RESHAPE;
 
6511
  return result;
6512
  }
6513
 
6514
+ static struct ggml_tensor * ggml_view_impl(
 
 
6515
  struct ggml_context * ctx,
6516
  struct ggml_tensor * a,
6517
  int n_dims,
6518
  const int64_t * ne,
6519
  size_t offset) {
6520
 
6521
+ bool is_node = false;
6522
 
6523
+ if (a->grad) {
6524
+ is_node = true;
6525
+ }
6526
+
6527
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, a, offset);
6528
  ggml_format_name(result, "%s (view)", a->name);
6529
 
6530
  ggml_set_op_params(result, &offset, sizeof(offset));
6531
 
6532
+ result->op = GGML_OP_VIEW;
6533
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6534
+ result->src[0] = a;
6535
+
6536
  return result;
6537
  }
6538
 
6539
+ // ggml_view_1d
6540
+
6541
  struct ggml_tensor * ggml_view_1d(
6542
  struct ggml_context * ctx,
6543
  struct ggml_tensor * a,
6544
  int64_t ne0,
6545
  size_t offset) {
6546
 
6547
+ struct ggml_tensor * result = ggml_view_impl(ctx, a, 1, &ne0, offset);
6548
 
6549
  return result;
6550
  }
 
6559
  size_t nb1,
6560
  size_t offset) {
6561
 
6562
+ const int64_t ne[2] = { ne0, ne1 };
6563
 
6564
+ struct ggml_tensor * result = ggml_view_impl(ctx, a, 2, ne, offset);
6565
 
6566
  result->nb[1] = nb1;
6567
  result->nb[2] = result->nb[1]*ne1;
6568
  result->nb[3] = result->nb[2];
6569
 
 
 
 
 
6570
  return result;
6571
  }
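ggml_view_2d (and the 3d/4d variants below) now go through ggml_view_impl and only override the strides afterwards. A sketch of the common row-slice pattern, with r0/nr as placeholder row offset and count:

    const int64_t r0 = 0, nr = a->ne[1]/2; // placeholder row range

    // view rows [r0, r0 + nr) of the 2D tensor a, reusing its row stride
    struct ggml_tensor * rows = ggml_view_2d(ctx, a,
            a->ne[0], nr,      // ne0, ne1
            a->nb[1],          // nb1: keep the source row stride
            r0*a->nb[1]);      // offset of the first viewed row, in bytes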
6572
 
 
6582
  size_t nb2,
6583
  size_t offset) {
6584
 
6585
+ const int64_t ne[3] = { ne0, ne1, ne2 };
6586
 
6587
+ struct ggml_tensor * result = ggml_view_impl(ctx, a, 3, ne, offset);
6588
 
6589
  result->nb[1] = nb1;
6590
  result->nb[2] = nb2;
6591
  result->nb[3] = result->nb[2]*ne2;
6592
 
 
 
 
 
6593
  return result;
6594
  }
6595
 
 
6607
  size_t nb3,
6608
  size_t offset) {
6609
 
6610
+ const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
6611
 
6612
+ struct ggml_tensor * result = ggml_view_impl(ctx, a, 4, ne, offset);
6613
 
6614
  result->nb[1] = nb1;
6615
  result->nb[2] = nb2;
6616
  result->nb[3] = nb3;
6617
 
 
 
 
 
6618
  return result;
6619
  }
6620
 
 
6801
 
6802
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6803
 
6804
+ int32_t params[] = { n_past };
6805
  ggml_set_op_params(result, params, sizeof(params));
6806
 
6807
  result->op = GGML_OP_DIAG_MASK_INF;
 
6818
  return ggml_diag_mask_inf_impl(ctx, a, n_past, false);
6819
  }
6820
 
 
6821
  struct ggml_tensor * ggml_diag_mask_inf_inplace(
6822
  struct ggml_context * ctx,
6823
  struct ggml_tensor * a,
 
6840
 
6841
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6842
 
6843
+ int32_t params[] = { n_past };
6844
  ggml_set_op_params(result, params, sizeof(params));
6845
 
6846
  result->op = GGML_OP_DIAG_MASK_ZERO;
 
7629
  }
7630
 
7631
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7632
+ ggml_set_op_params_i32(result, 0, inplace ? 1 : 0);
7633
+
7634
  result->op = GGML_OP_ADD_REL_POS;
7635
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7636
  result->src[0] = a;
 
9608
 
9609
 
9610
  #ifdef GGML_USE_ACCELERATE
9611
+ UNUSED(ggml_vec_div_f32);
9612
+
9613
  vDSP_vdiv(
9614
  (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
9615
  (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
 
10916
 
10917
  GGML_TENSOR_BINARY_OP_LOCALS;
10918
 
10919
+ float eps;
10920
+ memcpy(&eps, dst->op_params, sizeof(float));
10921
 
10922
  // TODO: optimize
10923
  for (int64_t i03 = 0; i03 < ne03; i03++) {
 
12095
  const int ith = params->ith;
12096
  const int nth = params->nth;
12097
 
12098
+ const int n_past = ((int32_t *) dst->op_params)[0];
12099
+ const bool inplace = src0->data == dst->data;
12100
 
12101
  GGML_ASSERT(n_past >= 0);
12102
 
 
12307
  // dx = J * dy
12308
  // dxk = sum_i(Jki * dyi)
12309
  // dxk = sum_i(-yk*yi * dyi) - (-yk*yk)*dyk + (yk - yk*yk)*dyk
12310
+ // dxk = sum_i(-yk*yi * dyi) + yk*yk*dyk + yk*dyk - yk*yk*dyk
12311
  // dxk = sum_i(-yk*yi * dyi) + yk*dyk
12312
  // dxk = -yk * sum_i(yi * dyi) + yk*dyk
12313
  // dxk = -yk * dot(y, dy) + yk*dyk
 
14098
  vvexpf(S, S, &Mup);
14099
  ggml_vec_sum_f32(Mup, &sum, S);
14100
  #else
14101
+ uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
14102
  ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
14103
 
14104
  for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
 
14108
  if (SS[j] == -INFINITY) {
14109
  SS[j] = 0.0f;
14110
  } else {
14111
+ #ifndef GGML_FLASH_ATTN_EXP_FP16
14112
+ const float val = expf(SS[j] - max);
14113
+ #else
14114
  ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
14115
  memcpy(&scvt[j], &s, sizeof(uint16_t));
14116
  const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
14117
+ #endif
14118
  sump[j] += (ggml_float)val;
14119
  SS[j] = val;
14120
  }
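The GGML_FLASH_ATTN_EXP_FP16 guard (and GGML_CROSS_ENTROPY_EXP_FP16 further down) makes the plain fp32 expf the default and keeps the fp16 lookup-table path as an opt-in define. Shown in isolation, the new default path is just:

    #include <math.h>

    // used when GGML_FLASH_ATTN_EXP_FP16 is not defined
    static inline float soft_max_exp_f32(float x, float max) {
        return expf(x - max); // previously: GGML_FP16_TO_FP32(table_exp_f16[scvt])
    }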
 
14692
  vvexpf(SM, SM, &Mup);
14693
  ggml_vec_sum_f32(Mup, &sum, SM);
14694
  #else
14695
+ uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
14696
  ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
14697
 
14698
  for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
 
14703
  if (SR[j] == -INFINITY) {
14704
  SW[j] = 0.0f;
14705
  } else {
14706
+ #ifndef GGML_FLASH_ATTN_EXP_FP16
14707
+ const float val = expf(SR[j] - max);
14708
+ #else
14709
  ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max);
14710
  memcpy(&scvt[j], &s, sizeof(uint16_t));
14711
  const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
14712
+ #endif
14713
  sump[j] += (ggml_float)val;
14714
  SW[j] = val;
14715
  }
 
15155
  const struct ggml_tensor * src1,
15156
  const struct ggml_tensor * src2,
15157
  struct ggml_tensor * dst) {
 
 
 
15158
 
15159
+ const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
15160
  if (!inplace && params->type == GGML_TASK_INIT) {
15161
  memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
15162
  return;
 
15447
  const int nc = src0->ne[0];
15448
  const int nr = ggml_nrows(src0);
15449
 
15450
+ GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc));
15451
+
15452
  if (params->type == GGML_TASK_INIT) {
15453
  if (ith == 0) {
15454
  memset(sums, 0, sizeof(float) * (nth + nth * nc));
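The new assert documents the work-buffer layout this function assumes: nth partial sums followed by one nc-wide scratch row per thread. Sketched as offsets into params->wdata (nth, ith and nc as in the surrounding code):

    float * wdata = (float *) params->wdata;
    float * sums  = wdata;                 // nth floats: one partial loss per thread
    float * st    = wdata + nth + ith*nc;  // nc floats: this thread's softmax scratch row
    // total: sizeof(float) * (nth + nth*nc), matching the assert above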
 
15460
  if (ith == 0) {
15461
  float * dp = (float *) dst->data;
15462
  ggml_vec_sum_f32(nth, dp, sums);
15463
+ dp[0] *= -1.0f / (float) nr;
15464
  }
15465
  return;
15466
  }
 
15477
  for (int i1 = ir0; i1 < ir1; i1++) {
15478
  float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]);
15479
  float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]);
15480
+ float * st = ((float *) params->wdata) + nth + ith*nc;
15481
 
15482
  #ifndef NDEBUG
15483
  for (int i = 0; i < nc; ++i) {
 
15492
  float max = -INFINITY;
15493
  ggml_vec_max_f32(nc, &max, s0);
15494
 
15495
+ uint16_t scvt; UNUSED(scvt);
15496
  for (int i = 0; i < nc; i++) {
15497
  if (s0[i] == -INFINITY) {
15498
  st[i] = 0.0f;
15499
  } else {
15500
+ #ifndef GGML_CROSS_ENTROPY_EXP_FP16
15501
+ const float s = s0[i] - max;
15502
+ const float val = expf(s);
15503
+ #else
15504
  ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
15505
  memcpy(&scvt, &s, sizeof(scvt));
15506
  const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
15507
+ #endif
15508
  sum += (ggml_float)val;
15509
  st[i] = val;
15510
  }
 
15520
  ggml_vec_log_f32(nc, st, st);
15521
  ggml_vec_mul_f32(nc, st, st, s1);
15522
 
15523
+ float st_sum = 0;
15524
+ ggml_vec_sum_f32(nc, &st_sum, st);
15525
+ sums[ith] += st_sum;
15526
 
15527
  #ifndef NDEBUG
15528
  for (int i = 0; i < nc; ++i) {
 
15572
  return;
15573
  }
15574
 
15575
+ const double eps = 1e-9;
15576
 
15577
  // TODO: handle transposed/permuted matrices
15578
  const int64_t nc = src0->ne[0];
 
15591
  float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]);
15592
  float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]);
15593
  float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]);
 
15594
 
15595
  #ifndef NDEBUG
15596
  for (int i = 0; i < nc; ++i) {
 
15599
  assert(!isnan(s1[i]));
15600
  }
15601
  #endif
 
15602
 
15603
  // soft_max
15604
  ggml_float sum = 0.0;
 
15606
  float max = -INFINITY;
15607
  ggml_vec_max_f32(nc, &max, s0);
15608
 
15609
+ uint16_t scvt; UNUSED(scvt);
15610
  for (int i = 0; i < nc; i++) {
15611
  if (s0[i] == -INFINITY) {
15612
+ ds0[i] = 0.0f;
15613
  } else {
15614
+ #ifndef GGML_CROSS_ENTROPY_EXP_FP16
15615
+ const float s = s0[i] - max;
15616
+ const float val = expf(s);
15617
+ #else
15618
  ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
15619
  memcpy(&scvt, &s, sizeof(scvt));
15620
  const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
15621
+ #endif
15622
  sum += (ggml_float)val;
15623
+ ds0[i] = val;
15624
  }
15625
  }
15626
 
15627
  assert(sum > 0.0);
15628
+ sum = (1.0 - eps)/sum;
15629
  }
15630
 
15631
+ // grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr
15632
+ ggml_vec_scale_f32(nc, ds0, sum);
15633
+ ggml_vec_add1_f32(nc, ds0, ds0, eps);
15634
+ ggml_vec_sub_f32(nc, ds0, ds0, s1);
15635
+ ggml_vec_scale_f32(nc, ds0, d[0] / (float) nr);
15636
+
 
 
 
 
15637
 
15638
  #ifndef NDEBUG
15639
  for (int i = 0; i < nc; ++i) {
 
 
15640
  assert(!isnan(ds0[i]));
15641
  assert(!isinf(ds0[i]));
15642
  }
 
16191
  {
16192
  // necessary for llama
16193
  if (src0->grad) {
16194
+ float eps;
16195
+ memcpy(&eps, tensor->op_params, sizeof(float));
16196
+
16197
  src0->grad = ggml_add_impl(ctx,
16198
  src0->grad,
16199
+ ggml_rms_norm_back(ctx, src0, tensor->grad, eps),
16200
  inplace);
16201
  }
16202
  } break;
 
16964
  return result;
16965
  }
16966
 
16967
+ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep) {
 
 
16968
  GGML_ASSERT(gf->n_nodes > 0);
16969
 
16970
  // if we are keeping the gradient graph, we have to detach the gradient nodes from the original graph
 
16988
  }
16989
  }
16990
 
16991
+ for (int i = 0; i < gf->n_nodes; i++) {
16992
  struct ggml_tensor * node = gf->nodes[i];
16993
 
16994
  if (node->is_param) {
16995
  GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
16996
+ ggml_build_forward_expand(gb, node->grad);
16997
  }
16998
  }
16999
+ }
17000
 
17001
+ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) {
17002
+ struct ggml_cgraph result = *gf;
17003
+ ggml_build_backward_expand(ctx, gf, &result, keep);
17004
  return result;
17005
  }
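ggml_build_backward is now a thin wrapper over ggml_build_backward_expand, which lets callers expand the backward pass into a graph they already own. A usage sketch, assuming loss is a scalar tensor whose parameters were marked with ggml_set_param:

    struct ggml_cgraph gf = ggml_build_forward(loss);
    struct ggml_cgraph gb = gf; // start the backward graph from a copy of the forward one
    ggml_build_backward_expand(ctx, &gf, &gb, /*keep =*/ true);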
17006
 
 
17676
  case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
17677
  {
17678
  n_tasks = n_threads;
 
 
 
 
17679
  } break;
17680
  case GGML_OP_NONE:
17681
  {
 
18553
  struct ggml_opt_params params,
18554
  struct ggml_tensor * f,
18555
  struct ggml_cgraph * gf,
18556
+ struct ggml_cgraph * gb,
18557
+ ggml_opt_callback callback,
18558
+ void * callback_data) {
18559
  GGML_ASSERT(ggml_is_scalar(f));
18560
 
18561
  // these will store the parameters we want to optimize
18562
  struct ggml_tensor * ps[GGML_MAX_PARAMS];
18563
 
18564
  int np = 0;
18565
+ int64_t nx = 0;
18566
  for (int i = 0; i < gf->n_nodes; ++i) {
18567
  if (gf->nodes[i]->is_param) {
18568
  GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
 
18581
  }
18582
 
18583
  // constants
18584
+ float sched = params.adam.sched;
18585
+ const float alpha = params.adam.alpha;
18586
+ const float decay = params.adam.decay * alpha;
18587
  const float beta1 = params.adam.beta1;
18588
  const float beta2 = params.adam.beta2;
18589
  const float eps = params.adam.eps;
18590
+ const float gclip = params.adam.gclip;
18591
+ const int decay_min_ndim = params.adam.decay_min_ndim;
18592
 
 
 
 
18593
  float * m = opt->adam.m->data; // first moment
18594
  float * v = opt->adam.v->data; // second moment
 
 
18595
 
18596
  float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values
18597
 
18598
+ if (callback) {
18599
+ callback(callback_data, &sched);
18600
+ }
18601
 
18602
  // compute the function value
18603
  ggml_graph_reset (gf);
18604
  ggml_set_f32 (f->grad, 1.0f);
18605
 
18606
+ struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
18607
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
18608
+ cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
18609
+ ggml_graph_compute(gb, &cplan);
18610
 
18611
  opt->adam.fx_prev = ggml_get_f32_1d(f, 0);
18612
  opt->adam.fx_best = opt->adam.fx_prev;
 
18614
  pf[opt->iter % params.past] = opt->adam.fx_prev;
18615
  }
18616
 
18617
+ opt->loss_before = opt->adam.fx_prev;
18618
+ opt->loss_after = opt->adam.fx_prev;
18619
+
18620
  // initialize
18621
  if (opt->just_initialized) {
18622
  opt->adam.n_no_improvement = 0;
 
18649
  UNUSED(t_start_cpu);
18650
 
18651
  {
18652
+ float gnorm = 1.0f;
18653
+ if (gclip > 0.0f) {
18654
+ // gradient clipping
18655
+ ggml_float sum = 0.0;
18656
+ for (int p = 0; p < np; ++p) {
18657
+ const int64_t ne = ggml_nelements(ps[p]);
18658
+ for (int64_t j = 0; j < ne; ++j) {
18659
+ float g = ggml_get_f32_1d(ps[p]->grad, j);
18660
+ sum += (ggml_float)(g*g);
18661
+ }
18662
+ }
18663
+ ggml_float norm = sqrt(sum);
18664
+ if (norm > (ggml_float) gclip) {
18665
+ gnorm = (float) ((ggml_float) gclip / norm);
18666
+ }
18667
+ }
18668
+ const float beta1h = alpha*sched/(1.0f - powf(beta1, opt->iter));
18669
+ const float beta2h = 1.0f/(1.0f - powf(beta2, opt->iter));
18670
+ int64_t i = 0;
18671
+ for (int p = 0; p < np; ++p) {
18672
+ const int64_t ne = ggml_nelements(ps[p]);
18673
+ const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? decay : 0.0f) * sched;
18674
+ for (int64_t j = 0; j < ne; ++j) {
18675
+ float x = ggml_get_f32_1d(ps[p], j);
18676
+ float g = ggml_get_f32_1d(ps[p]->grad, j)*gnorm;
18677
+ m[i] = m[i]*beta1 + g*(1.0f - beta1);
18678
+ v[i] = v[i]*beta2 + g*g*(1.0f - beta2);
18679
+ float mh = m[i]*beta1h;
18680
+ float vh = v[i]*beta2h;
18681
+ vh = sqrtf(vh) + eps;
18682
+ x = x*(1.0f - p_decay) - mh/vh;
18683
+ ggml_set_f32_1d(ps[p], j, x);
18684
+ ++i;
18685
+ }
18686
+ }
18687
+ }
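The loop above is AdamW with an optional global-norm gradient clip, bias correction folded into beta1h/beta2h, and weight decay applied only to tensors with at least decay_min_ndim dimensions. The per-element step reduces to this standalone sketch (names are local to the example):

    #include <math.h>

    static float adamw_step(float x, float g, float * m, float * v,
                            float beta1, float beta2, float eps,
                            float beta1h, float beta2h, float p_decay) {
        *m = *m*beta1 + g*(1.0f - beta1);        // first moment
        *v = *v*beta2 + g*g*(1.0f - beta2);      // second moment
        const float mh = *m*beta1h;              // bias-corrected step, already scaled by alpha*sched
        const float vh = sqrtf(*v*beta2h) + eps; // bias-corrected denominator
        return x*(1.0f - p_decay) - mh/vh;       // decay the weight, then apply the Adam step
    }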
18688
 
18689
+ if (callback) {
18690
+ callback(callback_data, &sched);
18691
  }
18692
 
18693
  ggml_graph_reset (gf);
18694
  ggml_set_f32 (f->grad, 1.0f);
18695
 
18696
+ ggml_graph_compute(gb, &cplan);
18697
 
18698
  const float fx = ggml_get_f32_1d(f, 0);
18699
+ opt->loss_after = fx;
18700
+
18701
 
18702
  // check convergence
18703
  if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) {
 
18766
  };
18767
 
18768
  static enum ggml_opt_result linesearch_backtracking(
 
18769
  const struct ggml_opt_params * params,
18770
  int nx,
18771
  float * x,
 
18777
  struct ggml_tensor * f,
18778
  struct ggml_cgraph * gf,
18779
  struct ggml_cgraph * gb,
18780
+ struct ggml_cplan * cplan,
18781
  const int np,
18782
+ struct ggml_tensor * ps[],
18783
+ ggml_opt_callback callback,
18784
+ void * callback_data) {
18785
  int count = 0;
18786
 
18787
  float width = 0.0f;
 
18810
  dgtest = params->lbfgs.ftol*dginit;
18811
 
18812
  while (true) {
18813
+ if (callback) {
18814
+ // LBFGS does not use a learning rate -> ignore the learning schedule
18815
+ float sched = 0;
18816
+ callback(callback_data, &sched);
18817
+ }
18818
+
18819
  ggml_vec_cpy_f32(nx, x, xp);
18820
  ggml_vec_mad_f32(nx, x, d, *step);
18821
 
 
18826
  ggml_graph_reset (gf);
18827
  ggml_set_f32 (f->grad, 1.0f);
18828
 
18829
+ ggml_graph_compute(gb, cplan);
18830
 
18831
  ggml_opt_get_grad(np, ps, g);
18832
 
 
18860
  // strong Wolfe condition (GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE)
18861
  return count;
18862
  }
 
18863
  }
18864
  }
18865
 
 
18885
  struct ggml_opt_params params,
18886
  struct ggml_tensor * f,
18887
  struct ggml_cgraph * gf,
18888
+ struct ggml_cgraph * gb,
18889
+ ggml_opt_callback callback,
18890
+ void * callback_data) {
18891
  if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE ||
18892
  params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) {
18893
  if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) {
 
18919
  opt->iter = iter;
18920
  }
18921
 
18922
+ struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
18923
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
18924
+ cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
18925
+
18926
  float * x = opt->lbfgs.x->data; // current parameters
18927
  float * xp = opt->lbfgs.xp->data; // previous parameters
18928
  float * g = opt->lbfgs.g->data; // current gradient
 
18944
  float * lm_s = opt->lbfgs.lms->data;
18945
  float * lm_y = opt->lbfgs.lmy->data;
18946
 
18947
+ if (callback) {
18948
+ // LBFGS does not use a learning rate -> ignore the learning schedule
18949
+ float sched = 0;
18950
+ callback(callback_data, &sched);
18951
+ }
18952
+
18953
  // evaluate the function value and its gradient
18954
  {
18955
  ggml_opt_set_params(np, ps, x);
 
18957
  ggml_graph_reset (gf);
18958
  ggml_set_f32 (f->grad, 1.0f);
18959
 
18960
+ ggml_graph_compute(gb, &cplan);
18961
 
18962
  ggml_opt_get_grad(np, ps, g);
18963
 
18964
  fx = ggml_get_f32_1d(f, 0);
18965
+
18966
+ opt->loss_before = fx;
18967
+ opt->loss_after = fx;
18968
  }
18969
 
18970
  // search direction = -gradient
 
19019
  ggml_vec_cpy_f32(nx, xp, x);
19020
  ggml_vec_cpy_f32(nx, gp, g);
19021
 
19022
+ ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gf, gb, &cplan, np, ps, callback, callback_data);
19023
 
19024
  if (ls < 0) {
19025
  // linesearch failed - go back to the previous point and return
 
19029
  return ls;
19030
  }
19031
 
19032
+ opt->loss_after = fx;
19033
+
19034
  ggml_vec_norm_f32(nx, &xnorm, x);
19035
  ggml_vec_norm_f32(nx, &gnorm, g);
19036
 
 
19088
  // ys = y^t \cdot s -> 1 / \rho.
19089
  // yy = y^t \cdot y.
19090
  //
19091
+ ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]*nx]);
19092
  ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]);
19093
 
19094
  lm_ys[end[0]] = ys;
 
19151
  .adam = {
19152
  .n_iter = 10000,
19153
  .sched = 1.000f,
19154
+ .decay = 0.0f,
19155
+ .decay_min_ndim = 2,
19156
  .alpha = 0.001f,
19157
  .beta1 = 0.9f,
19158
  .beta2 = 0.999f,
19159
  .eps = 1e-8f,
19160
  .eps_f = 1e-5f,
19161
  .eps_g = 1e-3f,
19162
+ .gclip = 0.0f,
19163
  },
19164
  };
19165
  } break;
 
19209
  switch (opt->params.type) {
19210
  case GGML_OPT_ADAM:
19211
  {
 
 
 
19212
  opt->adam.m = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
19213
  opt->adam.v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
 
 
19214
  opt->adam.pf = params.past > 0
19215
  ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past)
19216
  : NULL;
 
 
 
19217
  ggml_set_zero(opt->adam.m);
19218
  ggml_set_zero(opt->adam.v);
 
 
19219
  if (opt->adam.pf) {
19220
  ggml_set_zero(opt->adam.pf);
19221
  }
 
19299
  *gf = ggml_build_forward (f);
19300
  *gb = ggml_build_backward(ctx, gf, true);
19301
 
19302
+ return ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL);
19303
  }
19304
 
19305
  enum ggml_opt_result ggml_opt_resume_g(
 
19307
  struct ggml_opt_context * opt,
19308
  struct ggml_tensor * f,
19309
  struct ggml_cgraph * gf,
19310
+ struct ggml_cgraph * gb,
19311
+ ggml_opt_callback callback,
19312
+ void * callback_data) {
19313
 
19314
  // build forward + backward compute graphs
19315
  enum ggml_opt_result result = GGML_OPT_OK;
 
19317
  switch (opt->params.type) {
19318
  case GGML_OPT_ADAM:
19319
  {
19320
+ result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
19321
  } break;
19322
  case GGML_OPT_LBFGS:
19323
  {
19324
+ result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
19325
  } break;
19326
  }
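The callback threaded through ggml_opt_adam/ggml_opt_lbfgs matches the ggml_opt_callback typedef added to ggml.h; it is invoked before each evaluation and may adjust the Adam schedule multiplier in place. A usage sketch with an illustrative 1/t decay (ctx, opt, f, gf, gb assumed to be set up as usual):

    static void sched_decay_cb(void * data, float * sched) {
        int * step = (int *) data;
        *sched = 1.0f/(1.0f + 0.001f*(float)(*step)); // simple 1/t decay of the schedule
        (*step)++;
    }

    int step = 0;
    enum ggml_opt_result res = ggml_opt_resume_g(ctx, &opt, f, &gf, &gb, sched_decay_cb, &step);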
19327
 
 
19776
 
19777
  // read the kv pairs
19778
  {
19779
+ ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
19780
 
19781
  for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
19782
  struct gguf_kv * kv = &ctx->kv[i];
 
19859
 
19860
  // read the tensor infos
19861
  {
19862
+ ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
19863
 
19864
  for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
19865
  struct gguf_tensor_info * info = &ctx->infos[i];
 
20060
  }
20061
  }
20062
 
20063
+ free(ctx->kv);
20064
  }
20065
 
20066
  if (ctx->infos) {
 
20072
  }
20073
  }
20074
 
20075
+ free(ctx->infos);
20076
  }
20077
 
20078
  GGML_ALIGNED_FREE(ctx);
ggml.h CHANGED
@@ -479,6 +479,9 @@ extern "C" {
479
  int64_t perf_cycles;
480
  int64_t perf_time_us;
481
 
 
 
 
482
  void * data;
483
 
484
  char name[GGML_MAX_NAME];
@@ -661,7 +664,7 @@ extern "C" {
661
  GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
662
 
663
  GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
664
- GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
665
 
666
  GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
667
 
@@ -952,11 +955,11 @@ extern "C" {
952
 
953
  // a - x
954
  // b - dy
955
- // TODO: update with configurable eps
956
  GGML_API struct ggml_tensor * ggml_rms_norm_back(
957
  struct ggml_context * ctx,
958
  struct ggml_tensor * a,
959
- struct ggml_tensor * b);
 
960
 
961
  // A: n columns, m rows
962
  // B: n columns, p rows (i.e. we transpose it internally)
@@ -1612,7 +1615,8 @@ extern "C" {
1612
  struct ggml_tensor * tensor);
1613
 
1614
 
1615
- GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
 
1616
 
1617
  GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
1618
  GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
@@ -1677,6 +1681,8 @@ extern "C" {
1677
  GGML_LINESEARCH_INVALID_PARAMETERS,
1678
  };
1679
 
 
 
1680
  // optimization parameters
1681
  //
1682
  // see ggml.c (ggml_opt_default_params) for default values
@@ -1712,12 +1718,14 @@ extern "C" {
1712
 
1713
  float sched; // schedule multiplier (fixed, decay or warmup)
1714
  float decay; // weight decay for AdamW, use 0.0f to disable
 
1715
  float alpha; // learning rate
1716
  float beta1;
1717
  float beta2;
1718
  float eps; // epsilon for numerical stability
1719
  float eps_f; // epsilon for convergence test
1720
  float eps_g; // epsilon for convergence test
 
1721
  } adam;
1722
 
1723
  // LBFGS parameters
@@ -1745,14 +1753,12 @@ extern "C" {
1745
 
1746
  bool just_initialized;
1747
 
 
 
 
1748
  struct {
1749
- struct ggml_tensor * x; // view of the parameters
1750
- struct ggml_tensor * g1; // gradient
1751
- struct ggml_tensor * g2; // gradient squared
1752
  struct ggml_tensor * m; // first moment
1753
  struct ggml_tensor * v; // second moment
1754
- struct ggml_tensor * mh; // first moment hat
1755
- struct ggml_tensor * vh; // second moment hat
1756
  struct ggml_tensor * pf; // past function values
1757
  float fx_best;
1758
  float fx_prev;
@@ -1789,10 +1795,10 @@ extern "C" {
1789
 
1790
  // initialize optimizer context
1791
  GGML_API void ggml_opt_init(
1792
- struct ggml_context * ctx,
1793
  struct ggml_opt_context * opt,
1794
- struct ggml_opt_params params,
1795
- int64_t nx);
1796
 
1797
  // continue optimizing the function defined by the tensor f
1798
  GGML_API enum ggml_opt_result ggml_opt_resume(
@@ -1806,7 +1812,9 @@ extern "C" {
1806
  struct ggml_opt_context * opt,
1807
  struct ggml_tensor * f,
1808
  struct ggml_cgraph * gf,
1809
- struct ggml_cgraph * gb);
 
 
1810
 
1811
  //
1812
  // quantization
 
479
  int64_t perf_cycles;
480
  int64_t perf_time_us;
481
 
482
+ struct ggml_tensor * view_src;
483
+ size_t view_offs;
484
+
485
  void * data;
486
 
487
  char name[GGML_MAX_NAME];
 
664
  GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
665
 
666
  GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
667
+ GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
668
 
669
  GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
670
 
 
955
 
956
  // a - x
957
  // b - dy
 
958
  GGML_API struct ggml_tensor * ggml_rms_norm_back(
959
  struct ggml_context * ctx,
960
  struct ggml_tensor * a,
961
+ struct ggml_tensor * b,
962
+ float eps);
963
 
964
  // A: n columns, m rows
965
  // B: n columns, p rows (i.e. we transpose it internally)
 
1615
  struct ggml_tensor * tensor);
1616
 
1617
 
1618
+ GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
1619
+ GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
1620
 
1621
  GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
1622
  GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
 
1681
  GGML_LINESEARCH_INVALID_PARAMETERS,
1682
  };
1683
 
1684
+ typedef void (*ggml_opt_callback)(void * data, float * sched);
1685
+
1686
  // optimization parameters
1687
  //
1688
  // see ggml.c (ggml_opt_default_params) for default values
 
1718
 
1719
  float sched; // schedule multiplier (fixed, decay or warmup)
1720
  float decay; // weight decay for AdamW, use 0.0f to disable
1721
+ int decay_min_ndim; // minimum number of tensor dimensions to apply weight decay
1722
  float alpha; // learning rate
1723
  float beta1;
1724
  float beta2;
1725
  float eps; // epsilon for numerical stability
1726
  float eps_f; // epsilon for convergence test
1727
  float eps_g; // epsilon for convergence test
1728
+ float gclip; // gradient clipping by global norm (0.0f = disabled)
1729
  } adam;
1730
 
1731
  // LBFGS parameters
 
1753
 
1754
  bool just_initialized;
1755
 
1756
+ float loss_before;
1757
+ float loss_after;
1758
+
1759
  struct {
 
 
 
1760
  struct ggml_tensor * m; // first moment
1761
  struct ggml_tensor * v; // second moment
 
 
1762
  struct ggml_tensor * pf; // past function values
1763
  float fx_best;
1764
  float fx_prev;
 
1795
 
1796
  // initialize optimizer context
1797
  GGML_API void ggml_opt_init(
1798
+ struct ggml_context * ctx,
1799
  struct ggml_opt_context * opt,
1800
+ struct ggml_opt_params params,
1801
+ int64_t nx);
1802
 
1803
  // continue optimizing the function defined by the tensor f
1804
  GGML_API enum ggml_opt_result ggml_opt_resume(
 
1812
  struct ggml_opt_context * opt,
1813
  struct ggml_tensor * f,
1814
  struct ggml_cgraph * gf,
1815
+ struct ggml_cgraph * gb,
1816
+ ggml_opt_callback callback,
1817
+ void * callback_data);
1818
 
1819
  //
1820
  // quantization