ggerganov commited on
Commit
521bf92
·
1 Parent(s): 8a0ab81

Use Accelerate framework on Apple silicon

Browse files

Huge performance improvement in the Encode (almost x2 on MacBook M1 Pro)

Also various extra optimizations:

- Multi-threaded NORM operator
- Faster GELU via F16 cast

Files changed (5) hide show
  1. Makefile +7 -2
  2. README.md +13 -7
  3. ggml.c +189 -104
  4. main.cpp +1 -1
  5. whisper.cpp +6 -6
Makefile CHANGED
@@ -8,6 +8,7 @@ UNAME_M := $(shell uname -m)
8
 
9
  CFLAGS = -O3 -std=c11
10
  CXXFLAGS = -O3 -std=c++11
 
11
 
12
  CFLAGS += -Wall -Wextra -Wno-unused-parameter -Wno-unused-function
13
  CXXFLAGS += -Wall -Wextra -Wno-unused-parameter -Wno-unused-function
@@ -37,7 +38,11 @@ ifeq ($(UNAME_M),amd64)
37
  CFLAGS += -mavx -mavx2 -mfma -mf16c
38
  endif
39
  ifneq ($(filter arm%,$(UNAME_M)),)
40
- # Mac M1
 
 
 
 
41
  endif
42
  ifneq ($(filter aarch64%,$(UNAME_M)),)
43
  endif
@@ -59,7 +64,7 @@ endif
59
  #
60
 
61
  main: main.cpp ggml.o whisper.o
62
- $(CXX) $(CXXFLAGS) main.cpp whisper.o ggml.o -o main
63
  ./main -h
64
 
65
  ggml.o: ggml.c ggml.h
 
8
 
9
  CFLAGS = -O3 -std=c11
10
  CXXFLAGS = -O3 -std=c++11
11
+ LDFLAGS =
12
 
13
  CFLAGS += -Wall -Wextra -Wno-unused-parameter -Wno-unused-function
14
  CXXFLAGS += -Wall -Wextra -Wno-unused-parameter -Wno-unused-function
 
38
  CFLAGS += -mavx -mavx2 -mfma -mf16c
39
  endif
40
  ifneq ($(filter arm%,$(UNAME_M)),)
41
+ # Mac M1 - include Accelerate framework
42
+ ifeq ($(UNAME_S),Darwin)
43
+ CFLAGS += -DGGML_USE_ACCELERATE
44
+ LDFLAGS += -framework Accelerate
45
+ endif
46
  endif
47
  ifneq ($(filter aarch64%,$(UNAME_M)),)
48
  endif
 
64
  #
65
 
66
  main: main.cpp ggml.o whisper.o
67
+ $(CXX) $(CXXFLAGS) main.cpp whisper.o ggml.o -o main $(LDFLAGS)
68
  ./main -h
69
 
70
  ggml.o: ggml.c ggml.h
README.md CHANGED
@@ -6,7 +6,8 @@
6
  High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:
7
 
8
  - Plain C/C++ implementation without dependencies
9
- - ARM_NEON and AVX intrinsics support
 
10
  - Mixed F16 / F32 precision
11
  - Low memory usage (Flash Attention + Flash Forward)
12
  - Zero memory allocations at runtime
@@ -224,7 +225,7 @@ https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a
224
  The `stream` tool depends on SDL2 library to capture audio from the microphone. You can build it like this:
225
 
226
  ```bash
227
- # Install SDL2 on Linux
228
  sudo apt-get install libsdl2-dev
229
 
230
  # Install SDL2 on Mac OS
@@ -240,6 +241,10 @@ make stream
240
  - Simple usage is demonstrated in [main.cpp](main.cpp)
241
  - Sample real-time audio transcription from the microphone is demonstrated in [stream.cpp](stream.cpp)
242
 
 
 
 
 
243
  ## Limitations
244
 
245
  - Very basic greedy sampling scheme - always pick up the top token. You can implement your own strategy
@@ -250,11 +255,12 @@ make stream
250
 
251
  | Model | Disk | Mem |
252
  | --- | --- | --- |
253
- | tiny | 75 MB | ~240 MB |
254
- | base | 142 MB | ~380 MB |
255
- | small | 466 MB | ~970 MB |
256
- | medium | 1.5 GB | ~2.5 GB |
257
- | large | 2.9 GB | ~4.6 GB |
 
258
 
259
  ## ggml format
260
 
 
6
  High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:
7
 
8
  - Plain C/C++ implementation without dependencies
9
+ - Apple silicon first-class citizen - optimized via Arm Neon and Accelerate framework
10
+ - AVX intrinsics support for x86 architectures
11
  - Mixed F16 / F32 precision
12
  - Low memory usage (Flash Attention + Flash Forward)
13
  - Zero memory allocations at runtime
 
225
  The `stream` tool depends on SDL2 library to capture audio from the microphone. You can build it like this:
226
 
227
  ```bash
228
+ # Install SDL2 on Linux
229
  sudo apt-get install libsdl2-dev
230
 
231
  # Install SDL2 on Mac OS
 
241
  - Simple usage is demonstrated in [main.cpp](main.cpp)
242
  - Sample real-time audio transcription from the microphone is demonstrated in [stream.cpp](stream.cpp)
243
 
244
+ The tensor operators are optimized heavily for Apple silicon CPUs. Depending on the computation size, Arm Neon SIMD
245
+ intrinsics or CBLAS Accelerate framework routines are used. The latter are especially effective for bigger sizes since
246
+ the framework utilizes the special-purpose AMX coprocessor available in modern Apple products.
247
+
248
  ## Limitations
249
 
250
  - Very basic greedy sampling scheme - always pick up the top token. You can implement your own strategy
 
255
 
256
  | Model | Disk | Mem |
257
  | --- | --- | --- |
258
+ | tiny | 75 MB | ~280 MB |
259
+ | base | 142 MB | ~430 MB |
260
+ | small | 466 MB | ~1.0 GB |
261
+ | medium | 1.5 GB | ~2.6 GB |
262
+ | large | 2.9 GB | ~4.7 GB |
263
+
264
 
265
  ## ggml format
266
 
ggml.c CHANGED
@@ -716,19 +716,28 @@ inline static float ggml_gelu_f32(float x) {
716
  return 0.5*x*(1.0 + tanh(SQRT_2_OVER_PI*x*(1.0 + GELU_COEF_A*x*x)));
717
  }
718
 
719
- inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
 
720
  for (int i = 0; i < n; ++i) {
721
- y[i] = ggml_gelu_f32(x[i]);
722
  }
723
  }
724
 
725
- inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
726
- const uint16_t * i16 = (const uint16_t *) x;
727
  for (int i = 0; i < n; ++i) {
728
- y[i] = table_gelu_f16[i16[i]];
 
 
729
  }
730
  }
731
 
 
 
 
 
 
 
732
  inline static void ggml_vec_sum_f32 (const int n, float * s, const float * x) { ggml_float sum = 0.0; for (int i = 0; i < n; ++i) sum += x[i]; *s += sum; }
733
  inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x) { ggml_vec_norm_f32(n, s, x); *s = 1./(*s); }
734
 
@@ -2867,13 +2876,15 @@ void ggml_compute_forward_add_f32(
2867
  const struct ggml_tensor * src0,
2868
  const struct ggml_tensor * src1,
2869
  struct ggml_tensor * dst) {
2870
- GGML_ASSERT(params->ith == 0);
2871
  GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
2872
 
2873
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
2874
  return;
2875
  }
2876
 
 
 
 
2877
  const int n = ggml_nrows(src0);
2878
  const int nc = src0->ne[0];
2879
 
@@ -2890,7 +2901,7 @@ void ggml_compute_forward_add_f32(
2890
  GGML_ASSERT(nb00 == sizeof(float));
2891
 
2892
  if (nb10 == sizeof(float)) {
2893
- for (int j = 0; j < n; j++) {
2894
  ggml_vec_add_f32(nc,
2895
  (float *) ((char *) dst->data + j*nb1),
2896
  (float *) ((char *) src0->data + j*nb01),
@@ -2898,7 +2909,7 @@ void ggml_compute_forward_add_f32(
2898
  }
2899
  } else {
2900
  // src1 is not contiguous
2901
- for (int j = 0; j < n; j++) {
2902
  float * dst_ptr = (float *) ((char *) dst->data + j*nb1);
2903
  float * src0_ptr = (float *) ((char *) src0->data + j*nb01);
2904
  for (int i = 0; i < nc; i++) {
@@ -3669,14 +3680,16 @@ void ggml_compute_forward_norm_f32(
3669
  const struct ggml_compute_params * params,
3670
  const struct ggml_tensor * src0,
3671
  struct ggml_tensor * dst) {
3672
- assert(params->ith == 0);
3673
- assert(ggml_are_same_shape(src0, dst));
3674
 
3675
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
3676
  return;
3677
  }
3678
 
3679
- assert(src0->nb[0] == sizeof(float));
 
 
 
3680
 
3681
  const int ne00 = src0->ne[0];
3682
  const int ne01 = src0->ne[1];
@@ -3696,7 +3709,7 @@ void ggml_compute_forward_norm_f32(
3696
  // TODO: optimize
3697
  for (int i03 = 0; i03 < ne03; i03++) {
3698
  for (int i02 = 0; i02 < ne02; i02++) {
3699
- for (int i01 = 0; i01 < ne01; i01++) {
3700
  const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
3701
 
3702
  ggml_float mean = 0.0;
@@ -3745,6 +3758,28 @@ void ggml_compute_forward_norm(
3745
 
3746
  // ggml_compute_forward_mul_mat
3747
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3748
  void ggml_compute_forward_mul_mat_f32(
3749
  const struct ggml_compute_params * params,
3750
  const struct ggml_tensor * src0,
@@ -3812,6 +3847,47 @@ void ggml_compute_forward_mul_mat_f32(
3812
  // nb00 < nb01 - src0 is transposed
3813
  // compute by src0 columns
3814
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3815
  if (params->type == GGML_TASK_INIT) {
3816
  if (nb01 >= nb00) {
3817
  return;
@@ -3848,78 +3924,6 @@ void ggml_compute_forward_mul_mat_f32(
3848
  return;
3849
  }
3850
 
3851
- //#ifdef GGML_USE_ACCELERATE
3852
- // // try to use BLAS
3853
- //
3854
- // if (nb01 >= nb00 && ne0 > 1024 && ne1 > 1024) {
3855
- // if (params->ith != 0) return;
3856
- // printf("XXXXXXXX\n");
3857
- //
3858
- // GGML_ASSERT(ggml_is_contiguous(src0));
3859
- // GGML_ASSERT(ggml_is_contiguous(src1));
3860
- //
3861
- // printf("ne00 = %d, ne01 = %d, ne02 = %d, ne03 = %d\n", ne00, ne01, ne02, ne03);
3862
- // printf("ne10 = %d, ne11 = %d, ne12 = %d, ne13 = %d\n", ne10, ne11, ne12, ne13);
3863
- // printf("ne0 = %d, ne1 = %d, ne2 = %d, ne3 = %d\n", ne0, ne1, ne2, ne3);
3864
- //
3865
- // printf("nb00 = %d, nb01 = %d, nb02 = %d, nb03 = %d\n", nb00, nb01, nb02, nb03);
3866
- // printf("nb10 = %d, nb11 = %d, nb12 = %d, nb13 = %d\n", nb10, nb11, nb12, nb13);
3867
- // printf("nb0 = %d, nb1 = %d, nb2 = %d, nb3 = %d\n", nb0, nb1, nb2, nb3);
3868
- //
3869
- // float * const wdata = params->wdata;
3870
- //
3871
- // int64_t tsum = 0.0;
3872
- // for (int i03 = 0; i03 < ne03; i03++) {
3873
- // for (int i02 = 0; i02 < ne02; i02++) {
3874
- // const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03);
3875
- // const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
3876
- // float * z = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
3877
- //
3878
- // // transpose src1
3879
- // for (int j = 0; j < ne11; ++j) {
3880
- // for (int i = 0; i < ne10; ++i) {
3881
- // wdata[i*ne11 + j] = y[j*ne10 + i];
3882
- // }
3883
- // }
3884
- //
3885
- // {
3886
- // const int64_t tt0 = ggml_time_us();
3887
- // cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
3888
- // 1500, 1500, 64,
3889
- // 1.0, x, 64,
3890
- // wdata, 1500,
3891
- // 0.0, z, 1500);
3892
- // const int64_t tt1 = ggml_time_us();
3893
- // tsum += tt1 - tt0;
3894
- // }
3895
- //
3896
- // // transpose z
3897
- // for (int j = 0; j < ne1; ++j) {
3898
- // for (int i = 0; i < ne0; ++i) {
3899
- // wdata[i*ne1 + j] = z[j*ne0 + i];
3900
- // }
3901
- // }
3902
- //
3903
- // memcpy(z, wdata, ne0*ne1*sizeof(float));
3904
- //
3905
- // //cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
3906
- // // ne0, ne1, 64,
3907
- // // 1.0f,
3908
- // // x, ne00,
3909
- // // y, ne11,
3910
- // // 0.0f,
3911
- // // z, 1500);
3912
- // }
3913
- // }
3914
- // printf("time = %f ms\n", tsum/1000.0);
3915
- // return;
3916
- // } else {
3917
- // //cblas_sgemv(CblasRowMajor, CblasTrans, ne00, ne01, 1.0, src0->data, ne01, src1->data, 1, 0.0, dst->data, 1);
3918
- // }
3919
- //
3920
- //#endif
3921
-
3922
-
3923
  if (nb01 >= nb00) {
3924
  // TODO: do not support transposed src1
3925
  assert(nb10 == sizeof(float));
@@ -4064,24 +4068,24 @@ void ggml_compute_forward_mul_mat_f16_f32(
4064
  const int ith = params->ith;
4065
  const int nth = params->nth;
4066
 
4067
- assert(ne02 == ne12);
4068
- assert(ne03 == ne13);
4069
- assert(ne2 == ne12);
4070
- assert(ne3 == ne13);
4071
 
4072
  // TODO: we don't support permuted src0
4073
- assert(nb00 == sizeof(ggml_fp16_t) || nb01 == sizeof(ggml_fp16_t));
4074
 
4075
  // dst cannot be transposed or permuted
4076
- assert(nb0 == sizeof(float));
4077
- assert(nb0 <= nb1);
4078
- assert(nb1 <= nb2);
4079
- assert(nb2 <= nb3);
4080
 
4081
- assert(ne0 == ne01);
4082
- assert(ne1 == ne11);
4083
- assert(ne2 == ne02);
4084
- assert(ne3 == ne03);
4085
 
4086
  // nb01 >= nb00 - src0 is not transposed
4087
  // compute by src0 rows
@@ -4089,6 +4093,73 @@ void ggml_compute_forward_mul_mat_f16_f32(
4089
  // nb00 < nb01 - src0 is transposed
4090
  // compute by src0 columns
4091
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4092
  if (params->type == GGML_TASK_INIT) {
4093
  if (nb01 >= nb00) {
4094
  ggml_fp16_t * const wdata = params->wdata;
@@ -6534,7 +6605,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
6534
 
6535
  switch (node->op) {
6536
  case GGML_OP_DUP:
 
 
 
6537
  case GGML_OP_ADD:
 
 
 
6538
  case GGML_OP_SUB:
6539
  case GGML_OP_MUL:
6540
  case GGML_OP_DIV:
@@ -6553,11 +6630,11 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
6553
  } break;
6554
  case GGML_OP_GELU:
6555
  {
6556
- node->n_tasks = MIN(n_threads, ggml_nrows(node->src0));
6557
  } break;
6558
  case GGML_OP_NORM:
6559
  {
6560
- node->n_tasks = 1;
6561
  } break;
6562
  case GGML_OP_MUL_MAT:
6563
  {
@@ -6572,7 +6649,15 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
6572
  } else {
6573
  if (node->src0->type == GGML_TYPE_F16 &&
6574
  node->src1->type == GGML_TYPE_F32) {
 
 
 
 
 
 
 
6575
  cur = sizeof(ggml_fp16_t)*ggml_nelements(node->src1);
 
6576
  } else if (node->src0->type == GGML_TYPE_F32 &&
6577
  node->src1->type == GGML_TYPE_F32) {
6578
  cur = 0;
@@ -6585,7 +6670,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
6585
  } break;
6586
  case GGML_OP_SCALE:
6587
  {
6588
- node->n_tasks = MIN(n_threads, ggml_nrows(node->src0));
6589
  } break;
6590
  case GGML_OP_CPY:
6591
  case GGML_OP_RESHAPE:
@@ -6599,7 +6684,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
6599
  } break;
6600
  case GGML_OP_SOFT_MAX:
6601
  {
6602
- node->n_tasks = MIN(n_threads, ggml_nrows(node->src0));
6603
  } break;
6604
  case GGML_OP_ROPE:
6605
  {
@@ -6714,7 +6799,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
6714
  struct ggml_compute_params params = {
6715
  /*.type =*/ GGML_TASK_INIT,
6716
  /*.ith =*/ 0,
6717
- /*.nth =*/ n_threads,
6718
  /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
6719
  /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
6720
  };
@@ -6898,9 +6983,9 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
6898
 
6899
  perf_total_per_op_us[node->op] += node->perf_time_us;
6900
 
6901
- GGML_PRINT(" - %3d: [ %6d, %6d] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
6902
  i,
6903
- node->ne[0], node->ne[1],
6904
  GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
6905
  (double) node->perf_cycles / (double) ggml_cycles_per_ms(),
6906
  (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
 
716
  return 0.5*x*(1.0 + tanh(SQRT_2_OVER_PI*x*(1.0 + GELU_COEF_A*x*x)));
717
  }
718
 
719
+ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
720
+ const uint16_t * i16 = (const uint16_t *) x;
721
  for (int i = 0; i < n; ++i) {
722
+ y[i] = table_gelu_f16[i16[i]];
723
  }
724
  }
725
 
726
+ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
727
+ uint16_t t;
728
  for (int i = 0; i < n; ++i) {
729
+ ggml_fp16_t fp16 = ggml_fp32_to_fp16(x[i]);
730
+ memcpy(&t, &fp16, sizeof(uint16_t));
731
+ y[i] = table_gelu_f16[t];
732
  }
733
  }
734
 
735
+ //inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
736
+ // for (int i = 0; i < n; ++i) {
737
+ // y[i] = ggml_gelu_f32(x[i]);
738
+ // }
739
+ //}
740
+
741
  inline static void ggml_vec_sum_f32 (const int n, float * s, const float * x) { ggml_float sum = 0.0; for (int i = 0; i < n; ++i) sum += x[i]; *s += sum; }
742
  inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x) { ggml_vec_norm_f32(n, s, x); *s = 1./(*s); }
743
 
 
2876
  const struct ggml_tensor * src0,
2877
  const struct ggml_tensor * src1,
2878
  struct ggml_tensor * dst) {
 
2879
  GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
2880
 
2881
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
2882
  return;
2883
  }
2884
 
2885
+ const int ith = params->ith;
2886
+ const int nth = params->nth;
2887
+
2888
  const int n = ggml_nrows(src0);
2889
  const int nc = src0->ne[0];
2890
 
 
2901
  GGML_ASSERT(nb00 == sizeof(float));
2902
 
2903
  if (nb10 == sizeof(float)) {
2904
+ for (int j = ith; j < n; j += nth) {
2905
  ggml_vec_add_f32(nc,
2906
  (float *) ((char *) dst->data + j*nb1),
2907
  (float *) ((char *) src0->data + j*nb01),
 
2909
  }
2910
  } else {
2911
  // src1 is not contiguous
2912
+ for (int j = ith; j < n; j += nth) {
2913
  float * dst_ptr = (float *) ((char *) dst->data + j*nb1);
2914
  float * src0_ptr = (float *) ((char *) src0->data + j*nb01);
2915
  for (int i = 0; i < nc; i++) {
 
3680
  const struct ggml_compute_params * params,
3681
  const struct ggml_tensor * src0,
3682
  struct ggml_tensor * dst) {
3683
+ GGML_ASSERT(ggml_are_same_shape(src0, dst));
 
3684
 
3685
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
3686
  return;
3687
  }
3688
 
3689
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
3690
+
3691
+ const int ith = params->ith;
3692
+ const int nth = params->nth;
3693
 
3694
  const int ne00 = src0->ne[0];
3695
  const int ne01 = src0->ne[1];
 
3709
  // TODO: optimize
3710
  for (int i03 = 0; i03 < ne03; i03++) {
3711
  for (int i02 = 0; i02 < ne02; i02++) {
3712
+ for (int i01 = ith; i01 < ne01; i01 += nth) {
3713
  const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
3714
 
3715
  ggml_float mean = 0.0;
 
3758
 
3759
  // ggml_compute_forward_mul_mat
3760
 
3761
+ // helper function to determine if it is better to use BLAS or not
3762
+ // for large matrices, BLAS is faster
3763
+ bool ggml_compute_forward_mul_mat_use_blas(
3764
+ const struct ggml_tensor * src0,
3765
+ const struct ggml_tensor * src1,
3766
+ struct ggml_tensor * dst) {
3767
+ UNUSED(src0);
3768
+
3769
+ const int ne10 = src1->ne[0];
3770
+
3771
+ const int ne0 = dst->ne[0];
3772
+ const int ne1 = dst->ne[1];
3773
+
3774
+ // TODO: find the optimal values for these
3775
+ if (ggml_is_contiguous(src1) && ne0 >= 32 && ne1 >= 32 && ne10 >= 32) {
3776
+ //printf("BLAS: %d %d %d\n", ne0, ne1, ne10);
3777
+ return true;
3778
+ }
3779
+
3780
+ return false;
3781
+ }
3782
+
3783
  void ggml_compute_forward_mul_mat_f32(
3784
  const struct ggml_compute_params * params,
3785
  const struct ggml_tensor * src0,
 
3847
  // nb00 < nb01 - src0 is transposed
3848
  // compute by src0 columns
3849
 
3850
+ //#ifdef GGML_USE_ACCELERATE
3851
+ // if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
3852
+ // GGML_ASSERT(ggml_is_contiguous(src0));
3853
+ // GGML_ASSERT(nb10 == sizeof(float));
3854
+ //
3855
+ // if (params->ith != 0) return;
3856
+ //
3857
+ // if (params->type == GGML_TASK_INIT) {
3858
+ // return;
3859
+ // }
3860
+ //
3861
+ // if (params->type == GGML_TASK_FINALIZE) {
3862
+ // return;
3863
+ // }
3864
+ //
3865
+ // float * const wdata = params->wdata;
3866
+ //
3867
+ // for (int i03 = 0; i03 < ne03; i03++) {
3868
+ // for (int i02 = 0; i02 < ne02; i02++) {
3869
+ // const float * x = (float *) (src0->data);
3870
+ // const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
3871
+ //
3872
+ // float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
3873
+ //
3874
+ // // zT = y * xT
3875
+ // {
3876
+ // cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
3877
+ // ne11, ne01, ne10,
3878
+ // 1.0f, y, ne10,
3879
+ // x, ne10,
3880
+ // 0.0f, d, ne01);
3881
+ // }
3882
+ // }
3883
+ // }
3884
+ //
3885
+ // //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
3886
+ //
3887
+ // return;
3888
+ // }
3889
+ //#endif
3890
+
3891
  if (params->type == GGML_TASK_INIT) {
3892
  if (nb01 >= nb00) {
3893
  return;
 
3924
  return;
3925
  }
3926
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3927
  if (nb01 >= nb00) {
3928
  // TODO: do not support transposed src1
3929
  assert(nb10 == sizeof(float));
 
4068
  const int ith = params->ith;
4069
  const int nth = params->nth;
4070
 
4071
+ GGML_ASSERT(ne02 == ne12);
4072
+ GGML_ASSERT(ne03 == ne13);
4073
+ GGML_ASSERT(ne2 == ne12);
4074
+ GGML_ASSERT(ne3 == ne13);
4075
 
4076
  // TODO: we don't support permuted src0
4077
+ GGML_ASSERT(nb00 == sizeof(ggml_fp16_t) || nb01 == sizeof(ggml_fp16_t));
4078
 
4079
  // dst cannot be transposed or permuted
4080
+ GGML_ASSERT(nb0 == sizeof(float));
4081
+ GGML_ASSERT(nb0 <= nb1);
4082
+ GGML_ASSERT(nb1 <= nb2);
4083
+ GGML_ASSERT(nb2 <= nb3);
4084
 
4085
+ GGML_ASSERT(ne0 == ne01);
4086
+ GGML_ASSERT(ne1 == ne11);
4087
+ GGML_ASSERT(ne2 == ne02);
4088
+ GGML_ASSERT(ne3 == ne03);
4089
 
4090
  // nb01 >= nb00 - src0 is not transposed
4091
  // compute by src0 rows
 
4093
  // nb00 < nb01 - src0 is transposed
4094
  // compute by src0 columns
4095
 
4096
+ #ifdef GGML_USE_ACCELERATE
4097
+ if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
4098
+ GGML_ASSERT(nb10 == sizeof(float));
4099
+
4100
+ if (params->ith != 0) return;
4101
+
4102
+ if (params->type == GGML_TASK_INIT) {
4103
+ return;
4104
+ }
4105
+
4106
+ if (params->type == GGML_TASK_FINALIZE) {
4107
+ return;
4108
+ }
4109
+
4110
+ float * const wdata = params->wdata;
4111
+
4112
+ for (int i03 = 0; i03 < ne03; i03++) {
4113
+ for (int i02 = 0; i02 < ne02; i02++) {
4114
+ {
4115
+ int id = 0;
4116
+ for (int i01 = 0; i01 < ne01; ++i01) {
4117
+ for (int i00 = 0; i00 < ne00; ++i00) {
4118
+ wdata[id++] = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
4119
+ }
4120
+ }
4121
+ }
4122
+
4123
+ const float * x = wdata;
4124
+ const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
4125
+
4126
+ // float * z = wdata + ne00*ne01;
4127
+
4128
+ // z = x * yT
4129
+ //{
4130
+ // cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
4131
+ // ne01, ne11, ne00,
4132
+ // 1.0f, x, ne00,
4133
+ // y, ne00,
4134
+ // 0.0f, z, ne11);
4135
+ //}
4136
+
4137
+ float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
4138
+
4139
+ // transpose z
4140
+ //for (int j = 0; j < ne11; ++j) {
4141
+ // for (int i = 0; i < ne01; ++i) {
4142
+ // d[j*ne01 + i] = z[i*ne11 + j];
4143
+ // }
4144
+ //}
4145
+
4146
+ // zT = y * xT
4147
+ {
4148
+ cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
4149
+ ne11, ne01, ne10,
4150
+ 1.0f, y, ne10,
4151
+ x, ne10,
4152
+ 0.0f, d, ne01);
4153
+ }
4154
+ }
4155
+ }
4156
+
4157
+ //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
4158
+
4159
+ return;
4160
+ }
4161
+ #endif
4162
+
4163
  if (params->type == GGML_TASK_INIT) {
4164
  if (nb01 >= nb00) {
4165
  ggml_fp16_t * const wdata = params->wdata;
 
6605
 
6606
  switch (node->op) {
6607
  case GGML_OP_DUP:
6608
+ {
6609
+ node->n_tasks = 1;
6610
+ } break;
6611
  case GGML_OP_ADD:
6612
+ {
6613
+ node->n_tasks = 1;
6614
+ } break;
6615
  case GGML_OP_SUB:
6616
  case GGML_OP_MUL:
6617
  case GGML_OP_DIV:
 
6630
  } break;
6631
  case GGML_OP_GELU:
6632
  {
6633
+ node->n_tasks = n_threads;
6634
  } break;
6635
  case GGML_OP_NORM:
6636
  {
6637
+ node->n_tasks = n_threads;
6638
  } break;
6639
  case GGML_OP_MUL_MAT:
6640
  {
 
6649
  } else {
6650
  if (node->src0->type == GGML_TYPE_F16 &&
6651
  node->src1->type == GGML_TYPE_F32) {
6652
+ #ifdef GGML_USE_ACCELERATE
6653
+ if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
6654
+ cur = sizeof(float)*(node->src0->ne[0]*node->src0->ne[1]);
6655
+ } else {
6656
+ cur = sizeof(ggml_fp16_t)*ggml_nelements(node->src1);
6657
+ }
6658
+ #else
6659
  cur = sizeof(ggml_fp16_t)*ggml_nelements(node->src1);
6660
+ #endif
6661
  } else if (node->src0->type == GGML_TYPE_F32 &&
6662
  node->src1->type == GGML_TYPE_F32) {
6663
  cur = 0;
 
6670
  } break;
6671
  case GGML_OP_SCALE:
6672
  {
6673
+ node->n_tasks = n_threads;
6674
  } break;
6675
  case GGML_OP_CPY:
6676
  case GGML_OP_RESHAPE:
 
6684
  } break;
6685
  case GGML_OP_SOFT_MAX:
6686
  {
6687
+ node->n_tasks = n_threads;
6688
  } break;
6689
  case GGML_OP_ROPE:
6690
  {
 
6799
  struct ggml_compute_params params = {
6800
  /*.type =*/ GGML_TASK_INIT,
6801
  /*.ith =*/ 0,
6802
+ /*.nth =*/ node->n_tasks,
6803
  /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
6804
  /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
6805
  };
 
6983
 
6984
  perf_total_per_op_us[node->op] += node->perf_time_us;
6985
 
6986
+ GGML_PRINT(" - %3d: [ %6d, %6d, %6d] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
6987
  i,
6988
+ node->ne[0], node->ne[1], node->ne[2],
6989
  GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
6990
  (double) node->perf_cycles / (double) ggml_cycles_per_ms(),
6991
  (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
main.cpp CHANGED
@@ -21,7 +21,7 @@ std::string to_timestamp(int64_t t) {
21
  msec = msec - min * (1000 * 60);
22
  int64_t sec = msec / 1000;
23
  msec = msec - sec * 1000;
24
-
25
  char buf[32];
26
  snprintf(buf, sizeof(buf), "%02d:%02d:%02d.%03d", (int) hr, (int) min, (int) sec, (int) msec);
27
 
 
21
  msec = msec - min * (1000 * 60);
22
  int64_t sec = msec / 1000;
23
  msec = msec - sec * 1000;
24
+
25
  char buf[32];
26
  snprintf(buf, sizeof(buf), "%02d:%02d:%02d.%03d", (int) hr, (int) min, (int) sec, (int) msec);
27
 
whisper.cpp CHANGED
@@ -15,7 +15,7 @@
15
  #include <vector>
16
 
17
  #define USE_FLASH_ATTN
18
- #define USE_FLASH_FF
19
 
20
  // available whisper models
21
  enum e_model {
@@ -148,11 +148,11 @@ static const std::map<e_model, size_t> MEM_REQ_ENCODE = {
148
  };
149
 
150
  static const std::map<e_model, size_t> MEM_REQ_ENCODE_LAYER = {
151
- { MODEL_TINY, 64ull*MB },
152
- { MODEL_BASE, 84ull*MB },
153
- { MODEL_SMALL, 128ull*MB },
154
- { MODEL_MEDIUM, 172ull*MB },
155
- { MODEL_LARGE, 216ull*MB },
156
  };
157
 
158
  static const std::map<e_model, size_t> MEM_REQ_DECODE = {
 
15
  #include <vector>
16
 
17
  #define USE_FLASH_ATTN
18
+ //#define USE_FLASH_FF
19
 
20
  // available whisper models
21
  enum e_model {
 
148
  };
149
 
150
  static const std::map<e_model, size_t> MEM_REQ_ENCODE_LAYER = {
151
+ { MODEL_TINY, 104ull*MB },
152
+ { MODEL_BASE, 138ull*MB },
153
+ { MODEL_SMALL, 208ull*MB },
154
+ { MODEL_MEDIUM, 280ull*MB },
155
+ { MODEL_LARGE, 354ull*MB },
156
  };
157
 
158
  static const std::map<e_model, size_t> MEM_REQ_DECODE = {