ggerganov commited on
Commit
521bf92
·
1 Parent(s): 8a0ab81

Use Accelerate framework on Apple silicon

Browse files

Huge performance improvement in the Encode (almost x2 on MacBook M1 Pro)

Also various extra optimizations:

- Multi-threaded NORM operator
- Faster GELU via F16 cast

Files changed (5) hide show
  1. Makefile +7 -2
  2. README.md +13 -7
  3. ggml.c +189 -104
  4. main.cpp +1 -1
  5. whisper.cpp +6 -6
Makefile CHANGED
@@ -8,6 +8,7 @@ UNAME_M := $(shell uname -m)
8
 
9
  CFLAGS = -O3 -std=c11
10
  CXXFLAGS = -O3 -std=c++11
 
11
 
12
  CFLAGS += -Wall -Wextra -Wno-unused-parameter -Wno-unused-function
13
  CXXFLAGS += -Wall -Wextra -Wno-unused-parameter -Wno-unused-function
@@ -37,7 +38,11 @@ ifeq ($(UNAME_M),amd64)
37
  CFLAGS += -mavx -mavx2 -mfma -mf16c
38
  endif
39
  ifneq ($(filter arm%,$(UNAME_M)),)
40
- # Mac M1
 
 
 
 
41
  endif
42
  ifneq ($(filter aarch64%,$(UNAME_M)),)
43
  endif
@@ -59,7 +64,7 @@ endif
59
  #
60
 
61
  main: main.cpp ggml.o whisper.o
62
- $(CXX) $(CXXFLAGS) main.cpp whisper.o ggml.o -o main
63
  ./main -h
64
 
65
  ggml.o: ggml.c ggml.h
 
8
 
9
  CFLAGS = -O3 -std=c11
10
  CXXFLAGS = -O3 -std=c++11
11
+ LDFLAGS =
12
 
13
  CFLAGS += -Wall -Wextra -Wno-unused-parameter -Wno-unused-function
14
  CXXFLAGS += -Wall -Wextra -Wno-unused-parameter -Wno-unused-function
 
38
  CFLAGS += -mavx -mavx2 -mfma -mf16c
39
  endif
40
  ifneq ($(filter arm%,$(UNAME_M)),)
41
+ # Mac M1 - include Accelerate framework
42
+ ifeq ($(UNAME_S),Darwin)
43
+ CFLAGS += -DGGML_USE_ACCELERATE
44
+ LDFLAGS += -framework Accelerate
45
+ endif
46
  endif
47
  ifneq ($(filter aarch64%,$(UNAME_M)),)
48
  endif
 
64
  #
65
 
66
  main: main.cpp ggml.o whisper.o
67
+ $(CXX) $(CXXFLAGS) main.cpp whisper.o ggml.o -o main $(LDFLAGS)
68
  ./main -h
69
 
70
  ggml.o: ggml.c ggml.h
README.md CHANGED
@@ -6,7 +6,8 @@
6
  High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:
7
 
8
  - Plain C/C++ implementation without dependencies
9
- - ARM_NEON and AVX intrinsics support
 
10
  - Mixed F16 / F32 precision
11
  - Low memory usage (Flash Attention + Flash Forward)
12
  - Zero memory allocations at runtime
@@ -224,7 +225,7 @@ https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a
224
  The `stream` tool depends on SDL2 library to capture audio from the microphone. You can build it like this:
225
 
226
  ```bash
227
- # Install SDL2 on Linux
228
  sudo apt-get install libsdl2-dev
229
 
230
  # Install SDL2 on Mac OS
@@ -240,6 +241,10 @@ make stream
240
  - Simple usage is demonstrated in [main.cpp](main.cpp)
241
  - Sample real-time audio transcription from the microphone is demonstrated in [stream.cpp](stream.cpp)
242
 
 
 
 
 
243
  ## Limitations
244
 
245
  - Very basic greedy sampling scheme - always pick up the top token. You can implement your own strategy
@@ -250,11 +255,12 @@ make stream
250
 
251
  | Model | Disk | Mem |
252
  | --- | --- | --- |
253
- | tiny | 75 MB | ~240 MB |
254
- | base | 142 MB | ~380 MB |
255
- | small | 466 MB | ~970 MB |
256
- | medium | 1.5 GB | ~2.5 GB |
257
- | large | 2.9 GB | ~4.6 GB |
 
258
 
259
  ## ggml format
260
 
 
6
  High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:
7
 
8
  - Plain C/C++ implementation without dependencies
9
+ - Apple silicon first-class citizen - optimized via Arm Neon and Accelerate framework
10
+ - AVX intrinsics support for x86 architectures
11
  - Mixed F16 / F32 precision
12
  - Low memory usage (Flash Attention + Flash Forward)
13
  - Zero memory allocations at runtime
 
225
  The `stream` tool depends on SDL2 library to capture audio from the microphone. You can build it like this:
226
 
227
  ```bash
228
+ # Install SDL2 on Linux
229
  sudo apt-get install libsdl2-dev
230
 
231
  # Install SDL2 on Mac OS
 
241
  - Simple usage is demonstrated in [main.cpp](main.cpp)
242
  - Sample real-time audio transcription from the microphone is demonstrated in [stream.cpp](stream.cpp)
243
 
244
+ The tensor operators are optimized heavily for Apple silicon CPUs. Depending on the computation size, Arm Neon SIMD
245
+ intrinsics or CBLAS Accelerate framework routines are used. The latter are especially effective for bigger sizes since
246
+ the framework utilizes the special-purpose AMX coprocessor available in modern Apple products.
247
+
248
  ## Limitations
249
 
250
  - Very basic greedy sampling scheme - always pick up the top token. You can implement your own strategy
 
255
 
256
  | Model | Disk | Mem |
257
  | --- | --- | --- |
258
+ | tiny | 75 MB | ~280 MB |
259
+ | base | 142 MB | ~430 MB |
260
+ | small | 466 MB | ~1.0 GB |
261
+ | medium | 1.5 GB | ~2.6 GB |
262
+ | large | 2.9 GB | ~4.7 GB |
263
+
264
 
265
  ## ggml format
266
 
ggml.c CHANGED
@@ -716,19 +716,28 @@ inline static float ggml_gelu_f32(float x) {
716
  return 0.5*x*(1.0 + tanh(SQRT_2_OVER_PI*x*(1.0 + GELU_COEF_A*x*x)));
717
  }
718
 
719
- inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
 
720
  for (int i = 0; i < n; ++i) {
721
- y[i] = ggml_gelu_f32(x[i]);
722
  }
723
  }
724
 
725
- inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
726
- const uint16_t * i16 = (const uint16_t *) x;
727
  for (int i = 0; i < n; ++i) {
728
- y[i] = table_gelu_f16[i16[i]];
 
 
729
  }
730
  }
731
 
 
 
 
 
 
 
732
  inline static void ggml_vec_sum_f32 (const int n, float * s, const float * x) { ggml_float sum = 0.0; for (int i = 0; i < n; ++i) sum += x[i]; *s += sum; }
733
  inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x) { ggml_vec_norm_f32(n, s, x); *s = 1./(*s); }
734
 
@@ -2867,13 +2876,15 @@ void ggml_compute_forward_add_f32(
2867
  const struct ggml_tensor * src0,
2868
  const struct ggml_tensor * src1,
2869
  struct ggml_tensor * dst) {
2870
- GGML_ASSERT(params->ith == 0);
2871
  GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
2872
 
2873
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
2874
  return;
2875
  }
2876
 
 
 
 
2877
  const int n = ggml_nrows(src0);
2878
  const int nc = src0->ne[0];
2879
 
@@ -2890,7 +2901,7 @@ void ggml_compute_forward_add_f32(
2890
  GGML_ASSERT(nb00 == sizeof(float));
2891
 
2892
  if (nb10 == sizeof(float)) {
2893
- for (int j = 0; j < n; j++) {
2894
  ggml_vec_add_f32(nc,
2895
  (float *) ((char *) dst->data + j*nb1),
2896
  (float *) ((char *) src0->data + j*nb01),
@@ -2898,7 +2909,7 @@ void ggml_compute_forward_add_f32(
2898
  }
2899
  } else {
2900
  // src1 is not contiguous
2901
- for (int j = 0; j < n; j++) {
2902
  float * dst_ptr = (float *) ((char *) dst->data + j*nb1);
2903
  float * src0_ptr = (float *) ((char *) src0->data + j*nb01);
2904
  for (int i = 0; i < nc; i++) {
@@ -3669,14 +3680,16 @@ void ggml_compute_forward_norm_f32(
3669
  const struct ggml_compute_params * params,
3670
  const struct ggml_tensor * src0,
3671
  struct ggml_tensor * dst) {
3672
- assert(params->ith == 0);
3673
- assert(ggml_are_same_shape(src0, dst));
3674
 
3675
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
3676
  return;
3677
  }
3678
 
3679
- assert(src0->nb[0] == sizeof(float));
 
 
 
3680
 
3681
  const int ne00 = src0->ne[0];
3682
  const int ne01 = src0->ne[1];
@@ -3696,7 +3709,7 @@ void ggml_compute_forward_norm_f32(
3696
  // TODO: optimize
3697
  for (int i03 = 0; i03 < ne03; i03++) {
3698
  for (int i02 = 0; i02 < ne02; i02++) {
3699
- for (int i01 = 0; i01 < ne01; i01++) {
3700
  const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
3701
 
3702
  ggml_float mean = 0.0;
@@ -3745,6 +3758,28 @@ void ggml_compute_forward_norm(
3745
 
3746
  // ggml_compute_forward_mul_mat
3747
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3748
  void ggml_compute_forward_mul_mat_f32(
3749
  const struct ggml_compute_params * params,
3750
  const struct ggml_tensor * src0,
@@ -3812,6 +3847,47 @@ void ggml_compute_forward_mul_mat_f32(
3812
  // nb00 < nb01 - src0 is transposed
3813
  // compute by src0 columns
3814
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3815
  if (params->type == GGML_TASK_INIT) {
3816
  if (nb01 >= nb00) {
3817
  return;
@@ -3848,78 +3924,6 @@ void ggml_compute_forward_mul_mat_f32(
3848
  return;
3849
  }
3850
 
3851
- //#ifdef GGML_USE_ACCELERATE
3852
- // // try to use BLAS
3853
- //
3854
- // if (nb01 >= nb00 && ne0 > 1024 && ne1 > 1024) {
3855
- // if (params->ith != 0) return;
3856
- // printf("XXXXXXXX\n");
3857
- //
3858
- // GGML_ASSERT(ggml_is_contiguous(src0));
3859
- // GGML_ASSERT(ggml_is_contiguous(src1));
3860
- //
3861
- // printf("ne00 = %d, ne01 = %d, ne02 = %d, ne03 = %d\n", ne00, ne01, ne02, ne03);
3862
- // printf("ne10 = %d, ne11 = %d, ne12 = %d, ne13 = %d\n", ne10, ne11, ne12, ne13);
3863
- // printf("ne0 = %d, ne1 = %d, ne2 = %d, ne3 = %d\n", ne0, ne1, ne2, ne3);
3864
- //
3865
- // printf("nb00 = %d, nb01 = %d, nb02 = %d, nb03 = %d\n", nb00, nb01, nb02, nb03);
3866
- // printf("nb10 = %d, nb11 = %d, nb12 = %d, nb13 = %d\n", nb10, nb11, nb12, nb13);
3867
- // printf("nb0 = %d, nb1 = %d, nb2 = %d, nb3 = %d\n", nb0, nb1, nb2, nb3);
3868
- //
3869
- // float * const wdata = params->wdata;
3870
- //
3871
- // int64_t tsum = 0.0;
3872
- // for (int i03 = 0; i03 < ne03; i03++) {
3873
- // for (int i02 = 0; i02 < ne02; i02++) {
3874
- // const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03);
3875
- // const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
3876
- // float * z = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
3877
- //
3878
- // // transpose src1
3879
- // for (int j = 0; j < ne11; ++j) {
3880
- // for (int i = 0; i < ne10; ++i) {
3881
- // wdata[i*ne11 + j] = y[j*ne10 + i];
3882
- // }
3883
- // }
3884
- //
3885
- // {
3886
- // const int64_t tt0 = ggml_time_us();
3887
- // cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
3888
- // 1500, 1500, 64,
3889
- // 1.0, x, 64,
3890
- // wdata, 1500,
3891
- // 0.0, z, 1500);
3892
- // const int64_t tt1 = ggml_time_us();
3893
- // tsum += tt1 - tt0;
3894
- // }
3895
- //
3896
- // // transpose z
3897
- // for (int j = 0; j < ne1; ++j) {
3898
- // for (int i = 0; i < ne0; ++i) {
3899
- // wdata[i*ne1 + j] = z[j*ne0 + i];
3900
- // }
3901
- // }
3902
- //
3903
- // memcpy(z, wdata, ne0*ne1*sizeof(float));
3904
- //
3905
- // //cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
3906
- // // ne0, ne1, 64,
3907
- // // 1.0f,
3908
- // // x, ne00,
3909
- // // y, ne11,
3910
- // // 0.0f,
3911
- // // z, 1500);
3912
- // }
3913
- // }
3914
- // printf("time = %f ms\n", tsum/1000.0);
3915
- // return;
3916
- // } else {
3917
- // //cblas_sgemv(CblasRowMajor, CblasTrans, ne00, ne01, 1.0, src0->data, ne01, src1->data, 1, 0.0, dst->data, 1);
3918
- // }
3919
- //
3920
- //#endif
3921
-
3922
-
3923
  if (nb01 >= nb00) {
3924
  // TODO: do not support transposed src1
3925
  assert(nb10 == sizeof(float));
@@ -4064,24 +4068,24 @@ void ggml_compute_forward_mul_mat_f16_f32(
4064
  const int ith = params->ith;
4065
  const int nth = params->nth;
4066
 
4067
- assert(ne02 == ne12);
4068
- assert(ne03 == ne13);
4069
- assert(ne2 == ne12);
4070
- assert(ne3 == ne13);
4071
 
4072
  // TODO: we don't support permuted src0
4073
- assert(nb00 == sizeof(ggml_fp16_t) || nb01 == sizeof(ggml_fp16_t));
4074
 
4075
  // dst cannot be transposed or permuted
4076
- assert(nb0 == sizeof(float));
4077
- assert(nb0 <= nb1);
4078
- assert(nb1 <= nb2);
4079
- assert(nb2 <= nb3);
4080
 
4081
- assert(ne0 == ne01);
4082
- assert(ne1 == ne11);
4083
- assert(ne2 == ne02);
4084
- assert(ne3 == ne03);
4085
 
4086
  // nb01 >= nb00 - src0 is not transposed
4087
  // compute by src0 rows
@@ -4089,6 +4093,73 @@ void ggml_compute_forward_mul_mat_f16_f32(
4089
  // nb00 < nb01 - src0 is transposed
4090
  // compute by src0 columns
4091
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4092
  if (params->type == GGML_TASK_INIT) {
4093
  if (nb01 >= nb00) {
4094
  ggml_fp16_t * const wdata = params->wdata;
@@ -6534,7 +6605,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
6534
 
6535
  switch (node->op) {
6536
  case GGML_OP_DUP:
 
 
 
6537
  case GGML_OP_ADD:
 
 
 
6538
  case GGML_OP_SUB:
6539
  case GGML_OP_MUL:
6540
  case GGML_OP_DIV:
@@ -6553,11 +6630,11 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
6553
  } break;
6554
  case GGML_OP_GELU:
6555
  {
6556
- node->n_tasks = MIN(n_threads, ggml_nrows(node->src0));
6557
  } break;
6558
  case GGML_OP_NORM:
6559
  {
6560
- node->n_tasks = 1;
6561
  } break;
6562
  case GGML_OP_MUL_MAT:
6563
  {
@@ -6572,7 +6649,15 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
6572
  } else {
6573
  if (node->src0->type == GGML_TYPE_F16 &&
6574
  node->src1->type == GGML_TYPE_F32) {
 
 
 
 
 
 
 
6575
  cur = sizeof(ggml_fp16_t)*ggml_nelements(node->src1);
 
6576
  } else if (node->src0->type == GGML_TYPE_F32 &&
6577
  node->src1->type == GGML_TYPE_F32) {
6578
  cur = 0;
@@ -6585,7 +6670,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
6585
  } break;
6586
  case GGML_OP_SCALE:
6587
  {
6588
- node->n_tasks = MIN(n_threads, ggml_nrows(node->src0));
6589
  } break;
6590
  case GGML_OP_CPY:
6591
  case GGML_OP_RESHAPE:
@@ -6599,7 +6684,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
6599
  } break;
6600
  case GGML_OP_SOFT_MAX:
6601
  {
6602
- node->n_tasks = MIN(n_threads, ggml_nrows(node->src0));
6603
  } break;
6604
  case GGML_OP_ROPE:
6605
  {
@@ -6714,7 +6799,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
6714
  struct ggml_compute_params params = {
6715
  /*.type =*/ GGML_TASK_INIT,
6716
  /*.ith =*/ 0,
6717
- /*.nth =*/ n_threads,
6718
  /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
6719
  /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
6720
  };
@@ -6898,9 +6983,9 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
6898
 
6899
  perf_total_per_op_us[node->op] += node->perf_time_us;
6900
 
6901
- GGML_PRINT(" - %3d: [ %6d, %6d] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
6902
  i,
6903
- node->ne[0], node->ne[1],
6904
  GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
6905
  (double) node->perf_cycles / (double) ggml_cycles_per_ms(),
6906
  (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
 
716
  return 0.5*x*(1.0 + tanh(SQRT_2_OVER_PI*x*(1.0 + GELU_COEF_A*x*x)));
717
  }
718
 
719
+ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
720
+ const uint16_t * i16 = (const uint16_t *) x;
721
  for (int i = 0; i < n; ++i) {
722
+ y[i] = table_gelu_f16[i16[i]];
723
  }
724
  }
725
 
726
+ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
727
+ uint16_t t;
728
  for (int i = 0; i < n; ++i) {
729
+ ggml_fp16_t fp16 = ggml_fp32_to_fp16(x[i]);
730
+ memcpy(&t, &fp16, sizeof(uint16_t));
731
+ y[i] = table_gelu_f16[t];
732
  }
733
  }
734
 
735
+ //inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
736
+ // for (int i = 0; i < n; ++i) {
737
+ // y[i] = ggml_gelu_f32(x[i]);
738
+ // }
739
+ //}
740
+
741
  inline static void ggml_vec_sum_f32 (const int n, float * s, const float * x) { ggml_float sum = 0.0; for (int i = 0; i < n; ++i) sum += x[i]; *s += sum; }
742
  inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x) { ggml_vec_norm_f32(n, s, x); *s = 1./(*s); }
743
 
 
2876
  const struct ggml_tensor * src0,
2877
  const struct ggml_tensor * src1,
2878
  struct ggml_tensor * dst) {
 
2879
  GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
2880
 
2881
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
2882
  return;
2883
  }
2884
 
2885
+ const int ith = params->ith;
2886
+ const int nth = params->nth;
2887
+
2888
  const int n = ggml_nrows(src0);
2889
  const int nc = src0->ne[0];
2890
 
 
2901
  GGML_ASSERT(nb00 == sizeof(float));
2902
 
2903
  if (nb10 == sizeof(float)) {
2904
+ for (int j = ith; j < n; j += nth) {
2905
  ggml_vec_add_f32(nc,
2906
  (float *) ((char *) dst->data + j*nb1),
2907
  (float *) ((char *) src0->data + j*nb01),
 
2909
  }
2910
  } else {
2911
  // src1 is not contiguous
2912
+ for (int j = ith; j < n; j += nth) {
2913
  float * dst_ptr = (float *) ((char *) dst->data + j*nb1);
2914
  float * src0_ptr = (float *) ((char *) src0->data + j*nb01);
2915
  for (int i = 0; i < nc; i++) {
 
3680
  const struct ggml_compute_params * params,
3681
  const struct ggml_tensor * src0,
3682
  struct ggml_tensor * dst) {
3683
+ GGML_ASSERT(ggml_are_same_shape(src0, dst));
 
3684
 
3685
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
3686
  return;
3687
  }
3688
 
3689
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
3690
+
3691
+ const int ith = params->ith;
3692
+ const int nth = params->nth;
3693
 
3694
  const int ne00 = src0->ne[0];
3695
  const int ne01 = src0->ne[1];
 
3709
  // TODO: optimize
3710
  for (int i03 = 0; i03 < ne03; i03++) {
3711
  for (int i02 = 0; i02 < ne02; i02++) {
3712
+ for (int i01 = ith; i01 < ne01; i01 += nth) {
3713
  const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
3714
 
3715
  ggml_float mean = 0.0;
 
3758
 
3759
  // ggml_compute_forward_mul_mat
3760
 
3761
+ // helper function to determine if it is better to use BLAS or not
3762
+ // for large matrices, BLAS is faster
3763
+ bool ggml_compute_forward_mul_mat_use_blas(
3764
+ const struct ggml_tensor * src0,
3765
+ const struct ggml_tensor * src1,
3766
+ struct ggml_tensor * dst) {
3767
+ UNUSED(src0);
3768
+
3769
+ const int ne10 = src1->ne[0];
3770
+
3771
+ const int ne0 = dst->ne[0];
3772
+ const int ne1 = dst->ne[1];
3773
+
3774
+ // TODO: find the optimal values for these
3775
+ if (ggml_is_contiguous(src1) && ne0 >= 32 && ne1 >= 32 && ne10 >= 32) {
3776
+ //printf("BLAS: %d %d %d\n", ne0, ne1, ne10);
3777
+ return true;
3778
+ }
3779
+
3780
+ return false;
3781
+ }
3782
+
3783
  void ggml_compute_forward_mul_mat_f32(
3784
  const struct ggml_compute_params * params,
3785
  const struct ggml_tensor * src0,
 
3847
  // nb00 < nb01 - src0 is transposed
3848
  // compute by src0 columns
3849
 
3850
+ //#ifdef GGML_USE_ACCELERATE
3851
+ // if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
3852
+ // GGML_ASSERT(ggml_is_contiguous(src0));
3853
+ // GGML_ASSERT(nb10 == sizeof(float));
3854
+ //
3855
+ // if (params->ith != 0) return;
3856
+ //
3857
+ // if (params->type == GGML_TASK_INIT) {
3858
+ // return;
3859
+ // }
3860
+ //
3861
+ // if (params->type == GGML_TASK_FINALIZE) {
3862
+ // return;
3863
+ // }
3864
+ //
3865
+ // float * const wdata = params->wdata;
3866
+ //
3867
+ // for (int i03 = 0; i03 < ne03; i03++) {
3868
+ // for (int i02 = 0; i02 < ne02; i02++) {
3869
+ // const float * x = (float *) (src0->data);
3870
+ // const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
3871
+ //
3872
+ // float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
3873
+ //
3874
+ // // zT = y * xT
3875
+ // {
3876
+ // cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
3877
+ // ne11, ne01, ne10,
3878
+ // 1.0f, y, ne10,
3879
+ // x, ne10,
3880
+ // 0.0f, d, ne01);
3881
+ // }
3882
+ // }
3883
+ // }
3884
+ //
3885
+ // //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
3886
+ //
3887
+ // return;
3888
+ // }
3889
+ //#endif
3890
+
3891
  if (params->type == GGML_TASK_INIT) {
3892
  if (nb01 >= nb00) {
3893
  return;
 
3924
  return;
3925
  }
3926
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3927
  if (nb01 >= nb00) {
3928
  // TODO: do not support transposed src1
3929
  assert(nb10 == sizeof(float));
 
4068
  const int ith = params->ith;
4069
  const int nth = params->nth;
4070
 
4071
+ GGML_ASSERT(ne02 == ne12);
4072
+ GGML_ASSERT(ne03 == ne13);
4073
+ GGML_ASSERT(ne2 == ne12);
4074
+ GGML_ASSERT(ne3 == ne13);
4075
 
4076
  // TODO: we don't support permuted src0
4077
+ GGML_ASSERT(nb00 == sizeof(ggml_fp16_t) || nb01 == sizeof(ggml_fp16_t));
4078
 
4079
  // dst cannot be transposed or permuted
4080
+ GGML_ASSERT(nb0 == sizeof(float));
4081
+ GGML_ASSERT(nb0 <= nb1);
4082
+ GGML_ASSERT(nb1 <= nb2);
4083
+ GGML_ASSERT(nb2 <= nb3);
4084
 
4085
+ GGML_ASSERT(ne0 == ne01);
4086
+ GGML_ASSERT(ne1 == ne11);
4087
+ GGML_ASSERT(ne2 == ne02);
4088
+ GGML_ASSERT(ne3 == ne03);
4089
 
4090
  // nb01 >= nb00 - src0 is not transposed
4091
  // compute by src0 rows
 
4093
  // nb00 < nb01 - src0 is transposed
4094
  // compute by src0 columns
4095
 
4096
+ #ifdef GGML_USE_ACCELERATE
4097
+ if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
4098
+ GGML_ASSERT(nb10 == sizeof(float));
4099
+
4100
+ if (params->ith != 0) return;
4101
+
4102
+ if (params->type == GGML_TASK_INIT) {
4103
+ return;
4104
+ }
4105
+
4106
+ if (params->type == GGML_TASK_FINALIZE) {
4107
+ return;
4108
+ }
4109
+
4110
+ float * const wdata = params->wdata;
4111
+
4112
+ for (int i03 = 0; i03 < ne03; i03++) {
4113
+ for (int i02 = 0; i02 < ne02; i02++) {
4114
+ {
4115
+ int id = 0;
4116
+ for (int i01 = 0; i01 < ne01; ++i01) {
4117
+ for (int i00 = 0; i00 < ne00; ++i00) {
4118
+ wdata[id++] = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
4119
+ }
4120
+ }
4121
+ }
4122
+
4123
+ const float * x = wdata;
4124
+ const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
4125
+
4126
+ // float * z = wdata + ne00*ne01;
4127
+
4128
+ // z = x * yT
4129
+ //{
4130
+ // cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
4131
+ // ne01, ne11, ne00,
4132
+ // 1.0f, x, ne00,
4133
+ // y, ne00,
4134
+ // 0.0f, z, ne11);
4135
+ //}
4136
+
4137
+ float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
4138
+
4139
+ // transpose z
4140
+ //for (int j = 0; j < ne11; ++j) {
4141
+ // for (int i = 0; i < ne01; ++i) {
4142
+ // d[j*ne01 + i] = z[i*ne11 + j];
4143
+ // }
4144
+ //}
4145
+
4146
+ // zT = y * xT
4147
+ {
4148
+ cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
4149
+ ne11, ne01, ne10,
4150
+ 1.0f, y, ne10,
4151
+ x, ne10,
4152
+ 0.0f, d, ne01);
4153
+ }
4154
+ }
4155
+ }
4156
+
4157
+ //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
4158
+
4159
+ return;
4160
+ }
4161
+ #endif
4162
+
4163
  if (params->type == GGML_TASK_INIT) {
4164
  if (nb01 >= nb00) {
4165
  ggml_fp16_t * const wdata = params->wdata;
 
6605
 
6606
  switch (node->op) {
6607
  case GGML_OP_DUP:
6608
+ {
6609
+ node->n_tasks = 1;
6610
+ } break;
6611
  case GGML_OP_ADD:
6612
+ {
6613
+ node->n_tasks = 1;
6614
+ } break;
6615
  case GGML_OP_SUB:
6616
  case GGML_OP_MUL:
6617
  case GGML_OP_DIV:
 
6630
  } break;
6631
  case GGML_OP_GELU:
6632
  {
6633
+ node->n_tasks = n_threads;
6634
  } break;
6635
  case GGML_OP_NORM:
6636
  {
6637
+ node->n_tasks = n_threads;
6638
  } break;
6639
  case GGML_OP_MUL_MAT:
6640
  {
 
6649
  } else {
6650
  if (node->src0->type == GGML_TYPE_F16 &&
6651
  node->src1->type == GGML_TYPE_F32) {
6652
+ #ifdef GGML_USE_ACCELERATE
6653
+ if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
6654
+ cur = sizeof(float)*(node->src0->ne[0]*node->src0->ne[1]);
6655
+ } else {
6656
+ cur = sizeof(ggml_fp16_t)*ggml_nelements(node->src1);
6657
+ }
6658
+ #else
6659
  cur = sizeof(ggml_fp16_t)*ggml_nelements(node->src1);
6660
+ #endif
6661
  } else if (node->src0->type == GGML_TYPE_F32 &&
6662
  node->src1->type == GGML_TYPE_F32) {
6663
  cur = 0;
 
6670
  } break;
6671
  case GGML_OP_SCALE:
6672
  {
6673
+ node->n_tasks = n_threads;
6674
  } break;
6675
  case GGML_OP_CPY:
6676
  case GGML_OP_RESHAPE:
 
6684
  } break;
6685
  case GGML_OP_SOFT_MAX:
6686
  {
6687
+ node->n_tasks = n_threads;
6688
  } break;
6689
  case GGML_OP_ROPE:
6690
  {
 
6799
  struct ggml_compute_params params = {
6800
  /*.type =*/ GGML_TASK_INIT,
6801
  /*.ith =*/ 0,
6802
+ /*.nth =*/ node->n_tasks,
6803
  /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
6804
  /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
6805
  };
 
6983
 
6984
  perf_total_per_op_us[node->op] += node->perf_time_us;
6985
 
6986
+ GGML_PRINT(" - %3d: [ %6d, %6d, %6d] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
6987
  i,
6988
+ node->ne[0], node->ne[1], node->ne[2],
6989
  GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
6990
  (double) node->perf_cycles / (double) ggml_cycles_per_ms(),
6991
  (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
main.cpp CHANGED
@@ -21,7 +21,7 @@ std::string to_timestamp(int64_t t) {
21
  msec = msec - min * (1000 * 60);
22
  int64_t sec = msec / 1000;
23
  msec = msec - sec * 1000;
24
-
25
  char buf[32];
26
  snprintf(buf, sizeof(buf), "%02d:%02d:%02d.%03d", (int) hr, (int) min, (int) sec, (int) msec);
27
 
 
21
  msec = msec - min * (1000 * 60);
22
  int64_t sec = msec / 1000;
23
  msec = msec - sec * 1000;
24
+
25
  char buf[32];
26
  snprintf(buf, sizeof(buf), "%02d:%02d:%02d.%03d", (int) hr, (int) min, (int) sec, (int) msec);
27
 
whisper.cpp CHANGED
@@ -15,7 +15,7 @@
15
  #include <vector>
16
 
17
  #define USE_FLASH_ATTN
18
- #define USE_FLASH_FF
19
 
20
  // available whisper models
21
  enum e_model {
@@ -148,11 +148,11 @@ static const std::map<e_model, size_t> MEM_REQ_ENCODE = {
148
  };
149
 
150
  static const std::map<e_model, size_t> MEM_REQ_ENCODE_LAYER = {
151
- { MODEL_TINY, 64ull*MB },
152
- { MODEL_BASE, 84ull*MB },
153
- { MODEL_SMALL, 128ull*MB },
154
- { MODEL_MEDIUM, 172ull*MB },
155
- { MODEL_LARGE, 216ull*MB },
156
  };
157
 
158
  static const std::map<e_model, size_t> MEM_REQ_DECODE = {
 
15
  #include <vector>
16
 
17
  #define USE_FLASH_ATTN
18
+ //#define USE_FLASH_FF
19
 
20
  // available whisper models
21
  enum e_model {
 
148
  };
149
 
150
  static const std::map<e_model, size_t> MEM_REQ_ENCODE_LAYER = {
151
+ { MODEL_TINY, 104ull*MB },
152
+ { MODEL_BASE, 138ull*MB },
153
+ { MODEL_SMALL, 208ull*MB },
154
+ { MODEL_MEDIUM, 280ull*MB },
155
+ { MODEL_LARGE, 354ull*MB },
156
  };
157
 
158
  static const std::map<e_model, size_t> MEM_REQ_DECODE = {