ggerganov committed
Commit bf6b4f8 (unverified)
Parent: 6ab2cc0

ggml : backport llama.cpp updates (close #709)

- About 2x overall performance improvement on Apple Silicon
- Results should now be the same for different numbers of threads (not tested)

Files changed (3):
  1. ggml.c       +0 -0
  2. ggml.h       +82 -31
  3. whisper.cpp  +78 -55
ggml.c CHANGED
The diff for this file is too large to render. See raw diff
 
ggml.h CHANGED

@@ -236,6 +236,7 @@ enum ggml_op {
 
     GGML_OP_SCALE,
     GGML_OP_CPY,
+    GGML_OP_CONT,
     GGML_OP_RESHAPE,
     GGML_OP_VIEW,
     GGML_OP_PERMUTE,
@@ -253,16 +254,29 @@ enum ggml_op {
     GGML_OP_COUNT,
 };
 
+
+// ggml object
+struct ggml_object {
+    size_t offs;
+    size_t size;
+
+    struct ggml_object * next;
+
+    char padding[8];
+};
+
+static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
+
 // n-dimensional tensor
 struct ggml_tensor {
     enum ggml_type type;
 
     int     n_dims;
-    int    ne[GGML_MAX_DIMS]; // number of elements
-    size_t nb[GGML_MAX_DIMS]; // stride in bytes:
-                              // nb[0] = sizeof(type)
-                              // nb[1] = nb[0]   * ne[0] + padding
-                              // nb[i] = nb[i-1] * ne[i-1]
+    int64_t ne[GGML_MAX_DIMS]; // number of elements
+    size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
+                               // nb[0] = sizeof(type)
+                               // nb[1] = nb[0]   * ne[0] + padding
+                               // nb[i] = nb[i-1] * ne[i-1]
 
     // compute data
     enum ggml_op op;
@@ -316,6 +330,7 @@ struct ggml_init_params {
     // memory pool
     size_t mem_size;   // bytes
     void * mem_buffer; // if NULL, memory will be allocated internally
+    bool   no_alloc;   // don't allocate memory for the tensor data
 };
 
 void    ggml_time_init(void); // call this once at the beginning of the program
@@ -327,8 +342,8 @@ int64_t ggml_cycles_per_ms(void);
 void    ggml_print_object (const struct ggml_object * obj);
 void    ggml_print_objects(const struct ggml_context * ctx);
 
-int    ggml_nelements(const struct ggml_tensor * tensor);
-size_t ggml_nbytes   (const struct ggml_tensor * tensor);
+int64_t ggml_nelements(const struct ggml_tensor * tensor);
+size_t  ggml_nbytes   (const struct ggml_tensor * tensor);
 
 int    ggml_blck_size (enum ggml_type type);
 size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
@@ -343,40 +358,37 @@ size_t ggml_used_mem(const struct ggml_context * ctx);
 
 size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
 
-bool ggml_mlock_supported(void);
-bool ggml_mlock(struct ggml_context * ctx, char ** err_p);
-
 struct ggml_tensor * ggml_new_tensor(
         struct ggml_context * ctx,
         enum   ggml_type type,
         int    n_dims,
-        const int *ne);
+        const int64_t *ne);
 
 struct ggml_tensor * ggml_new_tensor_1d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0);
+        int64_t ne0);
 
 struct ggml_tensor * ggml_new_tensor_2d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0,
-        int    ne1);
+        int64_t ne0,
+        int64_t ne1);
 
 struct ggml_tensor * ggml_new_tensor_3d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0,
-        int    ne1,
-        int    ne2);
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2);
 
 struct ggml_tensor * ggml_new_tensor_4d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0,
-        int    ne1,
-        int    ne2,
-        int    ne3);
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2,
+        int64_t ne3);
 
 struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
 struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
@@ -514,6 +526,11 @@ struct ggml_tensor * ggml_cpy(
         struct ggml_tensor  * a,
         struct ggml_tensor  * b);
 
+// make contiguous
+struct ggml_tensor * ggml_cont(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a);
+
 // return view(a), b specifies the new shape
 // TODO: when we start computing gradient, make a copy instead of view
 struct ggml_tensor * ggml_reshape(
@@ -526,33 +543,43 @@ struct ggml_tensor * ggml_reshape(
 struct ggml_tensor * ggml_reshape_2d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int    ne0,
-        int    ne1);
+        int64_t ne0,
+        int64_t ne1);
 
 // return view(a)
 // TODO: when we start computing gradient, make a copy instead of view
 struct ggml_tensor * ggml_reshape_3d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int    ne0,
-        int    ne1,
-        int    ne2);
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2);
 
 // offset in bytes
 struct ggml_tensor * ggml_view_1d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int    ne0,
+        int64_t ne0,
         size_t  offset);
 
 struct ggml_tensor * ggml_view_2d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int    ne0,
-        int    ne1,
+        int64_t ne0,
+        int64_t ne1,
         size_t  nb1, // row stride in bytes
         size_t  offset);
 
+struct ggml_tensor * ggml_view_3d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2,
+        size_t  nb1, // row   stride in bytes
+        size_t  nb2, // slice stride in bytes
+        size_t  offset);
+
 struct ggml_tensor * ggml_permute(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
@@ -748,8 +775,8 @@ enum ggml_opt_result ggml_opt(
 // quantization
 //
 
-size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int qk, int64_t * hist);
-size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int qk, int64_t * hist);
+size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
+size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
 
 //
 // system info
@@ -768,6 +795,30 @@ int ggml_cpu_has_blas(void);
 int ggml_cpu_has_sse3(void);
 int ggml_cpu_has_vsx(void);
 
+
+//
+// Internal types and functions exposed for tests and benchmarks
+//
+
+#ifdef  __cplusplus
+// restrict not standard in C++
+#define GGML_RESTRICT
+#else
+#define GGML_RESTRICT restrict
+#endif
+typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+typedef void (*quantize_row_q_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+typedef void (*vec_dot_q_t)(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
+
+typedef struct {
+    dequantize_row_q_t dequantize_row_q;
+    quantize_row_q_t   quantize_row_q;
+    quantize_row_q_t   quantize_row_q_reference;
+    vec_dot_q_t        vec_dot_q;
+} quantize_fns_t;
+
+quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
+
 #ifdef  __cplusplus
 }
 #endif
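
For orientation, here is a minimal sketch (not part of the commit, sizes made up for illustration) of how the updated ggml.h API fits together after this change: ggml_init_params now carries a no_alloc flag, tensor dimensions are int64_t, and the new ggml_view_3d/ggml_cont pair lets you slice a tensor with explicit byte strides and then materialize the slice contiguously.

/* sketch.c - illustrative only, not from this commit */
#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,   // let ggml allocate the pool internally
        /*.no_alloc   =*/ false,  // new field introduced in this commit
    };

    struct ggml_context * ctx = ggml_init(params);

    // tensor dimensions are now int64_t
    struct ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 8, 4, 2);

    // new in this commit: 3D views with explicit row/slice strides (in bytes) ...
    struct ggml_tensor * va = ggml_view_3d(ctx, a, 4, 4, 2, a->nb[1], a->nb[2], 0);

    // ... and ggml_cont() (GGML_OP_CONT) to turn a non-contiguous view into a contiguous tensor
    struct ggml_tensor * ca = ggml_cont(ctx, va);

    printf("view holds %lld elements\n", (long long) ggml_nelements(ca));

    ggml_free(ctx);
    return 0;
}
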
whisper.cpp CHANGED

@@ -654,9 +654,11 @@ static bool kv_cache_init(
         int   n_ctx) {
     cache.buf.resize(mem_bytes);
 
-    struct ggml_init_params params;
-    params.mem_size   = cache.buf.size();
-    params.mem_buffer = cache.buf.data();
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ cache.buf.size(),
+        /*.mem_buffer =*/ cache.buf.data(),
+        /*.no_alloc   =*/ false,
+    };
 
     cache.ctx = ggml_init(params);
 
@@ -688,9 +690,11 @@ static bool kv_cache_reinit(struct whisper_kv_cache & cache) {
 
     WHISPER_ASSERT(cache.buf.size() >= 2*n_elements*ggml_type_size(wtype));
 
-    struct ggml_init_params params;
-    params.mem_size   = cache.buf.size();
-    params.mem_buffer = cache.buf.data();
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ cache.buf.size(),
+        /*.mem_buffer =*/ cache.buf.data(),
+        /*.no_alloc   =*/ false,
+    };
 
     cache.ctx = ggml_init(params);
 
@@ -1028,9 +1032,11 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
 
     // create the ggml context
     {
-        struct ggml_init_params params;
-        params.mem_size   = wctx.model.buf->size();
-        params.mem_buffer = wctx.model.buf->data();
+        struct ggml_init_params params = {
+            /*.mem_size   =*/ wctx.model.buf->size(),
+            /*.mem_buffer =*/ wctx.model.buf->data(),
+            /*.no_alloc   =*/ false,
+        };
 
         model.ctx = ggml_init(params);
         if (!model.ctx) {
@@ -1254,10 +1260,12 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
                 break;
             }
 
-            int32_t nelements = 1;
-            int32_t ne[3] = { 1, 1, 1 };
+            int64_t nelements = 1;
+            int64_t ne[3] = { 1, 1, 1 };
             for (int i = 0; i < n_dims; ++i) {
-                read_safe(loader, ne[i]);
+                int32_t ne_cur;
+                read_safe(loader, ne_cur);
+                ne[i] = ne_cur;
                 nelements *= ne[i];
             }
 
@@ -1278,7 +1286,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
             }
 
             if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1] || tensor->ne[2] != ne[2]) {
-                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d, %d], expected [%d, %d, %d]\n",
+                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%lld, %lld, %lld], expected [%lld, %lld, %lld]\n",
                         __func__, name.data(), tensor->ne[0], tensor->ne[1], tensor->ne[2], ne[0], ne[1], ne[2]);
                 return false;
             }
@@ -1286,7 +1294,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
             const size_t bpe = (ftype == 0) ? sizeof(float) : sizeof(ggml_fp16_t);
 
             if (nelements*bpe != ggml_nbytes(tensor)) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
+                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %llu\n",
                         __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
                 return false;
             }
@@ -1344,9 +1352,11 @@ static bool whisper_encode_internal(
     const int n_mels = hparams.n_mels;
     assert(mel_inp.n_mel == n_mels);
 
-    struct ggml_init_params params;
-    params.mem_size   = wstate.buf_compute.size();
-    params.mem_buffer = wstate.buf_compute.data();
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ wstate.buf_compute.size(),
+        /*.mem_buffer =*/ wstate.buf_compute.data(),
+        /*.no_alloc   =*/ false,
+    };
 
     struct ggml_context * ctx0 = ggml_init(params);
 
@@ -1501,8 +1511,7 @@ static bool whisper_encode_internal(
                             Vcur,
                             n_state/n_head, n_head, n_ctx),
                         1, 2, 0, 3),
-                    ggml_new_tensor_3d(ctx0, wctx.wtype, n_ctx, n_state/n_head, n_head)
-                );
+                    ggml_new_tensor_3d(ctx0, wctx.wtype, n_ctx, n_state/n_head, n_head));
 
             struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, false);
 #else
@@ -1726,10 +1735,12 @@ static bool whisper_encode_internal(
 
         wstate.use_buf(ctx0, -1);
 
-        //struct ggml_tensor * k = ggml_view_1d(ctx0, wstate.kv_cross.k, n_state*n_ctx, (ggml_element_size(wstate.kv_cross.k)*n_state)*(il*hparams.n_audio_ctx + iter*n_ctx));
-        //struct ggml_tensor * v = ggml_view_1d(ctx0, wstate.kv_cross.v, n_state*n_ctx, (ggml_element_size(wstate.kv_cross.v)*n_state)*(il*hparams.n_audio_ctx + iter*n_ctx));
-        struct ggml_tensor* k = ggml_view_1d(ctx0, wstate.kv_cross.k, n_state*n_ctx, (ggml_element_size(wstate.kv_cross.k)*n_state)*(il*n_ctx));
-        struct ggml_tensor* v = ggml_view_1d(ctx0, wstate.kv_cross.v, n_state*n_ctx, (ggml_element_size(wstate.kv_cross.v)*n_state)*(il*n_ctx));
+        Vcross = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcross, n_state, n_ctx));
+
+        struct ggml_tensor * k = ggml_view_1d(ctx0, wstate.kv_cross.k, n_state*n_ctx, (ggml_element_size(wstate.kv_cross.k)*n_state)*(il*n_ctx));
+        struct ggml_tensor * v = ggml_view_2d(ctx0, wstate.kv_cross.v, n_ctx, n_state,
+                (   n_ctx)*ggml_element_size(wstate.kv_cross.v),
+                (il*n_ctx)*ggml_element_size(wstate.kv_cross.v)*n_state);
 
         ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcross, k));
         ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcross, v));
@@ -1797,9 +1808,11 @@ static bool whisper_decode_internal(
 
     //WHISPER_PRINT_DEBUG("%s: n_past = %d, N = %d, M = %d, n_ctx = %d\n", __func__, n_past, N, M, n_ctx);
 
-    struct ggml_init_params params;
-    params.mem_size   = wstate.buf_compute.size();
-    params.mem_buffer = wstate.buf_compute.data();
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ wstate.buf_compute.size(),
+        /*.mem_buffer =*/ wstate.buf_compute.data(),
+        /*.no_alloc   =*/ false,
+    };
 
     struct ggml_context * ctx0 = ggml_init(params);
 
@@ -1862,20 +1875,24 @@ static bool whisper_decode_internal(
 
             Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
 
-            struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
-                    layer.attn_v_w,
-                    cur);
-
-            Vcur = ggml_add(ctx0,
-                    ggml_repeat(ctx0,
-                        layer.attn_v_b,
-                        Vcur),
-                    Vcur);
-
             // store key and value to memory
             {
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
+                        layer.attn_v_w,
+                        cur);
+
+                Vcur = ggml_add(ctx0,
+                        ggml_repeat(ctx0,
+                            layer.attn_v_b,
+                            Vcur),
+                        Vcur);
+
+                Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_state, N));
+
                 struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_state, (ggml_element_size(kv_self.k)*n_state)*(il*n_ctx + n_past));
-                struct ggml_tensor * v = ggml_view_1d(ctx0, kv_self.v, N*n_state, (ggml_element_size(kv_self.v)*n_state)*(il*n_ctx + n_past));
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_state,
+                        (   n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_state + n_past*ggml_element_size(kv_self.v));
 
                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
@@ -1914,16 +1931,14 @@ static bool whisper_decode_internal(
 
             struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
 
-            struct ggml_tensor * V_trans =
-                ggml_cpy(ctx0,
-                    ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_state, il*n_ctx*ggml_element_size(kv_self.v)*n_state),
-                            n_state/n_head, n_head, n_past + N),
-                        1, 2, 0, 3),
-                    ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_state/n_head, n_head));
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_past + N, n_state/n_head, n_head,
+                        n_ctx*ggml_element_size(kv_self.v),
+                        n_ctx*ggml_element_size(kv_self.v)*n_state/n_head,
+                        il*n_ctx*ggml_element_size(kv_self.v)*n_state);
 
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
 
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
 
@@ -1986,15 +2001,22 @@ static bool whisper_decode_internal(
                         ggml_view_1d(ctx0, wstate.kv_cross.k, M*n_state, il*M*ggml_element_size(wstate.kv_cross.k)*n_state),
                         n_state/n_head, n_head, M);
 
-            struct ggml_tensor * Vcross =
-                ggml_reshape_3d(ctx0,
-                        ggml_view_1d(ctx0, wstate.kv_cross.v, M*n_state, il*M*ggml_element_size(wstate.kv_cross.v)*n_state),
-                        n_state/n_head, n_head, M);
+            //struct ggml_tensor * Vcross =
+            //    ggml_reshape_3d(ctx0,
+            //            ggml_view_1d(ctx0, wstate.kv_cross.v, M*n_state, il*M*ggml_element_size(wstate.kv_cross.v)*n_state),
+            //            n_state/n_head, n_head, M);
 
-            struct ggml_tensor * V_trans =
-                ggml_cpy(ctx0,
-                        ggml_permute(ctx0, Vcross, 1, 2, 0, 3),
-                        ggml_new_tensor_3d(ctx0, Vcross->type, M, n_state/n_head, n_head));
+            //struct ggml_tensor * V_trans =
+            //    ggml_cpy(ctx0,
+            //            ggml_permute(ctx0, Vcross, 1, 2, 0, 3),
+            //            ggml_new_tensor_3d(ctx0, Vcross->type, M, n_state/n_head, n_head));
+
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, wstate.kv_cross.v,
+                        M, n_state/n_head, n_head,
+                        M*ggml_element_size(wstate.kv_cross.v),
+                        M*ggml_element_size(wstate.kv_cross.v)*n_state/n_head,
+                        il*M*ggml_element_size(wstate.kv_cross.v)*n_state);
 
             // ------
 
@@ -2021,7 +2043,7 @@ static bool whisper_decode_internal(
 
             struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ);
 
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
 
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
 
@@ -4726,6 +4748,7 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
     struct ggml_init_params gparams = {
         /*.mem_size   =*/ buf.size(),
        /*.mem_buffer =*/ buf.data(),
+        /*.no_alloc   =*/ false,
     };
 
     struct ggml_context * ctx0 = ggml_init(gparams);
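
The decoder change above is the heart of the speedup: instead of copying and permuting V on every decoded token, V is written into the KV cache already transposed (one contiguous run of context positions per state dimension), so attention can read it back with a strided ggml_view_3d and pass it straight to ggml_mul_mat. A stripped-down sketch of that pattern with toy sizes, outside the whisper.cpp code paths and illustrative only (it assumes the ggml graph API of this era, i.e. ggml_cgraph.n_threads, ggml_graph_compute and ggml_set_f32):

/* toy_v_cache.c - illustrative sketch, not from this commit */
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    const int n_state = 8, n_head = 2, n_ctx = 4, N = 4; // toy dimensions

    // the V cache: n_state rows of n_ctx positions each (transposed layout)
    struct ggml_tensor * v_cache = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_state*n_ctx);

    // V for the N current tokens, shape [n_state, N]
    struct ggml_tensor * Vcur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_state, N);
    ggml_set_f32(Vcur, 1.0f);

    // store V transposed: each state dimension becomes contiguous along the context axis
    struct ggml_tensor * Vt = ggml_transpose(ctx, Vcur);
    struct ggml_tensor * v  = ggml_view_2d(ctx, v_cache, N, n_state,
            n_ctx*ggml_element_size(v_cache), 0);

    struct ggml_cgraph gf = {0};
    gf.n_threads = 1;
    ggml_build_forward_expand(&gf, ggml_cpy(ctx, Vt, v));
    ggml_graph_compute(ctx, &gf);

    // at attention time the cache is read back as [n_ctx, n_state/n_head, n_head]
    // via byte strides, ready for ggml_mul_mat(ctx, V, KQ_soft_max) - no copy needed
    struct ggml_tensor * V = ggml_view_3d(ctx, v_cache,
            n_ctx, n_state/n_head, n_head,
            n_ctx*ggml_element_size(v_cache),
            n_ctx*ggml_element_size(v_cache)*n_state/n_head,
            0);
    (void) V;

    ggml_free(ctx);
    return 0;
}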