Use Accelerate framework on Apple silicon
Huge performance improvement in the Encode step (almost 2x faster on a MacBook M1 Pro)
Also various extra optimizations:
- Multi-threaded NORM operator
- Faster GELU via F16 cast (a sketch of the idea follows the file summary below)
- Makefile +7 -2
- README.md +13 -7
- ggml.c +189 -104
- main.cpp +1 -1
- whisper.cpp +6 -6
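A rough illustration of the "Faster GELU via F16 cast" idea: GELU is precomputed once for every possible 16-bit float value, and each activation then becomes a table lookup instead of a tanh call. The sketch below is not ggml's code; the table and function names and the __fp16 type (clang/gcc on AArch64) are assumptions made for the example. The actual ggml.c hunk further down indexes its own table_gelu_f16 with the raw f16 bits in the same spirit.

```c
// Sketch of a GELU lookup table indexed by f16 bit patterns.
// Assumes __fp16 support (e.g. clang on Apple silicon); not ggml's code.
#include <math.h>
#include <stdint.h>
#include <string.h>

static float gelu_table[1 << 16];   // one entry per possible f16 bit pattern (256 KB)

// Call once at startup: fill the table by decoding every f16 value and
// evaluating the tanh-based GELU approximation for it.
static void gelu_table_init(void) {
    for (uint32_t t = 0; t < (1u << 16); ++t) {
        const uint16_t bits = (uint16_t) t;
        __fp16 h;
        memcpy(&h, &bits, sizeof(h));
        const float x = (float) h;
        gelu_table[t] = 0.5f*x*(1.0f + tanhf(0.79788456f*x*(1.0f + 0.044715f*x*x)));
    }
}

// GELU over an f32 vector: round each input down to f16 and look the result up,
// trading a little precision for a table read instead of a tanhf() per element.
static void vec_gelu_f32(const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        const __fp16 h = (__fp16) x[i];
        uint16_t t;
        memcpy(&t, &h, sizeof(t));
        y[i] = gelu_table[t];
    }
}
```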
Makefile
CHANGED
@@ -8,6 +8,7 @@ UNAME_M := $(shell uname -m)
 
 CFLAGS = -O3 -std=c11
 CXXFLAGS = -O3 -std=c++11
+LDFLAGS =
 
 CFLAGS += -Wall -Wextra -Wno-unused-parameter -Wno-unused-function
 CXXFLAGS += -Wall -Wextra -Wno-unused-parameter -Wno-unused-function
@@ -37,7 +38,11 @@ ifeq ($(UNAME_M),amd64)
 	CFLAGS += -mavx -mavx2 -mfma -mf16c
 endif
 ifneq ($(filter arm%,$(UNAME_M)),)
-	# Mac M1
+	# Mac M1 - include Accelerate framework
+	ifeq ($(UNAME_S),Darwin)
+		CFLAGS += -DGGML_USE_ACCELERATE
+		LDFLAGS += -framework Accelerate
+	endif
 endif
 ifneq ($(filter aarch64%,$(UNAME_M)),)
 endif
@@ -59,7 +64,7 @@ endif
 #
 
 main: main.cpp ggml.o whisper.o
-	$(CXX) $(CXXFLAGS) main.cpp whisper.o ggml.o -o main
+	$(CXX) $(CXXFLAGS) main.cpp whisper.o ggml.o -o main $(LDFLAGS)
 	./main -h
 
 ggml.o: ggml.c ggml.h
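The GGML_USE_ACCELERATE define introduced above is what the C sources can key on to reach the Accelerate/CBLAS path, roughly like the guard below (a sketch only; the actual include site inside ggml.c is not part of this diff):

```c
// Sketch: a typical guard for the new define. The exact placement in ggml.c
// is not shown in this commit.
#ifdef GGML_USE_ACCELERATE
#include <Accelerate/Accelerate.h>  // provides cblas_sgemm and friends on macOS
#endif
```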
README.md
CHANGED
@@ -6,7 +6,8 @@
 High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:
 
 - Plain C/C++ implementation without dependencies
-- …
+- Apple silicon first-class citizen - optimized via Arm Neon and Accelerate framework
+- AVX intrinsics support for x86 architectures
 - Mixed F16 / F32 precision
 - Low memory usage (Flash Attention + Flash Forward)
 - Zero memory allocations at runtime
@@ -224,7 +225,7 @@ https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a
 The `stream` tool depends on SDL2 library to capture audio from the microphone. You can build it like this:
 
 ```bash
-# Install SDL2 on Linux
+# Install SDL2 on Linux
 sudo apt-get install libsdl2-dev
 
 # Install SDL2 on Mac OS
@@ -240,6 +241,10 @@ make stream
 - Simple usage is demonstrated in [main.cpp](main.cpp)
 - Sample real-time audio transcription from the microphone is demonstrated in [stream.cpp](stream.cpp)
 
+The tensor operators are optimized heavily for Apple silicon CPUs. Depending on the computation size, Arm Neon SIMD
+intrinsics or CBLAS Accelerate framework routines are used. The latter are especially effective for bigger sizes since
+the framework utilizes the special-purpose AMX coprocessor available in modern Apple products.
+
 ## Limitations
 
 - Very basic greedy sampling scheme - always pick up the top token. You can implement your own strategy
@@ -250,11 +255,12 @@ make stream
 
 | Model | Disk | Mem |
 | --- | --- | --- |
-| tiny | 75 MB | ~…
-| base | 142 MB | ~…
-| small | 466 MB | ~…
-| medium | 1.5 GB | ~2.…
-| large | 2.9 GB | ~4.…
+| tiny | 75 MB | ~280 MB |
+| base | 142 MB | ~430 MB |
+| small | 466 MB | ~1.0 GB |
+| medium | 1.5 GB | ~2.6 GB |
+| large | 2.9 GB | ~4.7 GB |
+
 
 ## ggml format
 
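The paragraph added to the README above refers to the CBLAS routines from Apple's Accelerate framework. Below is a minimal, self-contained sketch of such a call, mirroring the zT = y * xT cblas_sgemm pattern used in the ggml.c hunks that follow; the file name and matrix sizes are invented for the example (build on macOS with: cc demo.c -framework Accelerate):

```c
// Standalone Accelerate/CBLAS example: C = A * B^T in row-major storage,
// the same call shape (CblasNoTrans, CblasTrans) the ggml.c BLAS path uses.
#include <Accelerate/Accelerate.h>
#include <stdio.h>

int main(void) {
    const float A[2*3] = { 1, 2, 3,
                           4, 5, 6 };   // 2 x 3
    const float B[2*3] = { 1, 0, 1,
                           0, 1, 0 };   // 2 x 3, used as its transpose (3 x 2)
    float C[2*2] = { 0 };               // 2 x 2 result

    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                2, 2, 3,       // M, N, K
                1.0f, A, 3,    // alpha, A, lda
                      B, 3,    //        B, ldb
                0.0f, C, 2);   // beta,  C, ldc

    printf("%g %g\n%g %g\n", C[0], C[1], C[2], C[3]);
    return 0;
}
```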
ggml.c
CHANGED
@@ -716,19 +716,28 @@ inline static float ggml_gelu_f32(float x) {
     return 0.5*x*(1.0 + tanh(SQRT_2_OVER_PI*x*(1.0 + GELU_COEF_A*x*x)));
 }
 
-inline static void …
+inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    const uint16_t * i16 = (const uint16_t *) x;
     for (int i = 0; i < n; ++i) {
-        y[i] = …
+        y[i] = table_gelu_f16[i16[i]];
     }
 }
 
-inline static void …
-…
+inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
+    uint16_t t;
     for (int i = 0; i < n; ++i) {
-        …
+        ggml_fp16_t fp16 = ggml_fp32_to_fp16(x[i]);
+        memcpy(&t, &fp16, sizeof(uint16_t));
+        y[i] = table_gelu_f16[t];
     }
 }
 
+//inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
+//    for (int i = 0; i < n; ++i) {
+//        y[i] = ggml_gelu_f32(x[i]);
+//    }
+//}
+
 inline static void ggml_vec_sum_f32 (const int n, float * s, const float * x) { ggml_float sum = 0.0; for (int i = 0; i < n; ++i) sum += x[i]; *s += sum; }
 inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x) { ggml_vec_norm_f32(n, s, x); *s = 1./(*s); }
 
@@ -2867,13 +2876,15 @@ void ggml_compute_forward_add_f32(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
-    GGML_ASSERT(params->ith == 0);
     GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
 
+    const int ith = params->ith;
+    const int nth = params->nth;
+
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
@@ -2890,7 +2901,7 @@ void ggml_compute_forward_add_f32(
     GGML_ASSERT(nb00 == sizeof(float));
 
     if (nb10 == sizeof(float)) {
-        for (int j = …
+        for (int j = ith; j < n; j += nth) {
             ggml_vec_add_f32(nc,
                     (float *) ((char *) dst->data  + j*nb1),
                     (float *) ((char *) src0->data + j*nb01),
@@ -2898,7 +2909,7 @@ void ggml_compute_forward_add_f32(
         }
     } else {
         // src1 is not contiguous
-        for (int j = …
+        for (int j = ith; j < n; j += nth) {
            float * dst_ptr  = (float *) ((char *) dst->data  + j*nb1);
            float * src0_ptr = (float *) ((char *) src0->data + j*nb01);
            for (int i = 0; i < nc; i++) {
@@ -3669,14 +3680,16 @@ void ggml_compute_forward_norm_f32(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        struct ggml_tensor * dst) {
-    …
-    assert(ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
 
-    …
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
 
     const int ne00 = src0->ne[0];
     const int ne01 = src0->ne[1];
@@ -3696,7 +3709,7 @@ void ggml_compute_forward_norm_f32(
     // TODO: optimize
     for (int i03 = 0; i03 < ne03; i03++) {
         for (int i02 = 0; i02 < ne02; i02++) {
-            for (int i01 = …
+            for (int i01 = ith; i01 < ne01; i01 += nth) {
                 const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
 
                 ggml_float mean = 0.0;
@@ -3745,6 +3758,28 @@ void ggml_compute_forward_norm(
 
 // ggml_compute_forward_mul_mat
 
+// helper function to determine if it is better to use BLAS or not
+// for large matrices, BLAS is faster
+bool ggml_compute_forward_mul_mat_use_blas(
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
+    UNUSED(src0);
+
+    const int ne10 = src1->ne[0];
+
+    const int ne0 = dst->ne[0];
+    const int ne1 = dst->ne[1];
+
+    // TODO: find the optimal values for these
+    if (ggml_is_contiguous(src1) && ne0 >= 32 && ne1 >= 32 && ne10 >= 32) {
+        //printf("BLAS: %d %d %d\n", ne0, ne1, ne10);
+        return true;
+    }
+
+    return false;
+}
+
 void ggml_compute_forward_mul_mat_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
@@ -3812,6 +3847,47 @@ void ggml_compute_forward_mul_mat_f32(
     // nb00 < nb01 - src0 is transposed
     // compute by src0 columns
 
+    //#ifdef GGML_USE_ACCELERATE
+    //    if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
+    //        GGML_ASSERT(ggml_is_contiguous(src0));
+    //        GGML_ASSERT(nb10 == sizeof(float));
+    //
+    //        if (params->ith != 0) return;
+    //
+    //        if (params->type == GGML_TASK_INIT) {
+    //            return;
+    //        }
+    //
+    //        if (params->type == GGML_TASK_FINALIZE) {
+    //            return;
+    //        }
+    //
+    //        float * const wdata = params->wdata;
+    //
+    //        for (int i03 = 0; i03 < ne03; i03++) {
+    //            for (int i02 = 0; i02 < ne02; i02++) {
+    //                const float * x = (float *) (src0->data);
+    //                const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
+    //
+    //                float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+    //
+    //                // zT = y * xT
+    //                {
+    //                    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
+    //                            ne11, ne01, ne10,
+    //                            1.0f, y, ne10,
+    //                            x, ne10,
+    //                            0.0f, d, ne01);
+    //                }
+    //            }
+    //        }
+    //
+    //        //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
+    //
+    //        return;
+    //    }
+    //#endif
+
     if (params->type == GGML_TASK_INIT) {
         if (nb01 >= nb00) {
             return;
@@ -3848,78 +3924,6 @@ void ggml_compute_forward_mul_mat_f32(
         return;
     }
 
-    //#ifdef GGML_USE_ACCELERATE
-    //    // try to use BLAS
-    //
-    //    if (nb01 >= nb00 && ne0 > 1024 && ne1 > 1024) {
-    //        if (params->ith != 0) return;
-    //        printf("XXXXXXXX\n");
-    //
-    //        GGML_ASSERT(ggml_is_contiguous(src0));
-    //        GGML_ASSERT(ggml_is_contiguous(src1));
-    //
-    //        printf("ne00 = %d, ne01 = %d, ne02 = %d, ne03 = %d\n", ne00, ne01, ne02, ne03);
-    //        printf("ne10 = %d, ne11 = %d, ne12 = %d, ne13 = %d\n", ne10, ne11, ne12, ne13);
-    //        printf("ne0 = %d, ne1 = %d, ne2 = %d, ne3 = %d\n", ne0, ne1, ne2, ne3);
-    //
-    //        printf("nb00 = %d, nb01 = %d, nb02 = %d, nb03 = %d\n", nb00, nb01, nb02, nb03);
-    //        printf("nb10 = %d, nb11 = %d, nb12 = %d, nb13 = %d\n", nb10, nb11, nb12, nb13);
-    //        printf("nb0 = %d, nb1 = %d, nb2 = %d, nb3 = %d\n", nb0, nb1, nb2, nb3);
-    //
-    //        float * const wdata = params->wdata;
-    //
-    //        int64_t tsum = 0.0;
-    //        for (int i03 = 0; i03 < ne03; i03++) {
-    //            for (int i02 = 0; i02 < ne02; i02++) {
-    //                const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03);
-    //                const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
-    //                float * z = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-    //
-    //                // transpose src1
-    //                for (int j = 0; j < ne11; ++j) {
-    //                    for (int i = 0; i < ne10; ++i) {
-    //                        wdata[i*ne11 + j] = y[j*ne10 + i];
-    //                    }
-    //                }
-    //
-    //                {
-    //                    const int64_t tt0 = ggml_time_us();
-    //                    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
-    //                            1500, 1500, 64,
-    //                            1.0, x, 64,
-    //                            wdata, 1500,
-    //                            0.0, z, 1500);
-    //                    const int64_t tt1 = ggml_time_us();
-    //                    tsum += tt1 - tt0;
-    //                }
-    //
-    //                // transpose z
-    //                for (int j = 0; j < ne1; ++j) {
-    //                    for (int i = 0; i < ne0; ++i) {
-    //                        wdata[i*ne1 + j] = z[j*ne0 + i];
-    //                    }
-    //                }
-    //
-    //                memcpy(z, wdata, ne0*ne1*sizeof(float));
-    //
-    //                //cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
-    //                //        ne0, ne1, 64,
-    //                //        1.0f,
-    //                //        x, ne00,
-    //                //        y, ne11,
-    //                //        0.0f,
-    //                //        z, 1500);
-    //            }
-    //        }
-    //        printf("time = %f ms\n", tsum/1000.0);
-    //        return;
-    //    } else {
-    //        //cblas_sgemv(CblasRowMajor, CblasTrans, ne00, ne01, 1.0, src0->data, ne01, src1->data, 1, 0.0, dst->data, 1);
-    //    }
-    //
-    //#endif
-
-
     if (nb01 >= nb00) {
         // TODO: do not support transposed src1
         assert(nb10 == sizeof(float));
@@ -4064,24 +4068,24 @@ void ggml_compute_forward_mul_mat_f16_f32(
     const int ith = params->ith;
     const int nth = params->nth;
 
-    …
-    …
-    …
-    …
+    GGML_ASSERT(ne02 == ne12);
+    GGML_ASSERT(ne03 == ne13);
+    GGML_ASSERT(ne2 == ne12);
+    GGML_ASSERT(ne3 == ne13);
 
     // TODO: we don't support permuted src0
-    …
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t) || nb01 == sizeof(ggml_fp16_t));
 
     // dst cannot be transposed or permuted
-    …
-    …
-    …
-    …
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
 
-    …
-    …
-    …
-    …
+    GGML_ASSERT(ne0 == ne01);
+    GGML_ASSERT(ne1 == ne11);
+    GGML_ASSERT(ne2 == ne02);
+    GGML_ASSERT(ne3 == ne03);
 
     // nb01 >= nb00 - src0 is not transposed
     // compute by src0 rows
@@ -4089,6 +4093,73 @@ void ggml_compute_forward_mul_mat_f16_f32(
     // nb00 < nb01 - src0 is transposed
     // compute by src0 columns
 
+#ifdef GGML_USE_ACCELERATE
+    if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
+        GGML_ASSERT(nb10 == sizeof(float));
+
+        if (params->ith != 0) return;
+
+        if (params->type == GGML_TASK_INIT) {
+            return;
+        }
+
+        if (params->type == GGML_TASK_FINALIZE) {
+            return;
+        }
+
+        float * const wdata = params->wdata;
+
+        for (int i03 = 0; i03 < ne03; i03++) {
+            for (int i02 = 0; i02 < ne02; i02++) {
+                {
+                    int id = 0;
+                    for (int i01 = 0; i01 < ne01; ++i01) {
+                        for (int i00 = 0; i00 < ne00; ++i00) {
+                            wdata[id++] = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
+                        }
+                    }
+                }
+
+                const float * x = wdata;
+                const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
+
+                // float * z = wdata + ne00*ne01;
+
+                // z = x * yT
+                //{
+                //    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
+                //            ne01, ne11, ne00,
+                //            1.0f, x, ne00,
+                //            y, ne00,
+                //            0.0f, z, ne11);
+                //}
+
+                float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+
+                // transpose z
+                //for (int j = 0; j < ne11; ++j) {
+                //    for (int i = 0; i < ne01; ++i) {
+                //        d[j*ne01 + i] = z[i*ne11 + j];
+                //    }
+                //}
+
+                // zT = y * xT
+                {
+                    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
+                            ne11, ne01, ne10,
+                            1.0f, y, ne10,
+                            x, ne10,
+                            0.0f, d, ne01);
+                }
+            }
+        }
+
+        //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
+
+        return;
+    }
+#endif
+
     if (params->type == GGML_TASK_INIT) {
         if (nb01 >= nb00) {
             ggml_fp16_t * const wdata = params->wdata;
@@ -6534,7 +6605,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
         switch (node->op) {
             case GGML_OP_DUP:
+                {
+                    node->n_tasks = 1;
+                } break;
             case GGML_OP_ADD:
+                {
+                    node->n_tasks = 1;
+                } break;
             case GGML_OP_SUB:
             case GGML_OP_MUL:
             case GGML_OP_DIV:
@@ -6553,11 +6630,11 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                 } break;
             case GGML_OP_GELU:
                 {
-                    node->n_tasks = …
+                    node->n_tasks = n_threads;
                 } break;
             case GGML_OP_NORM:
                 {
-                    node->n_tasks = …
+                    node->n_tasks = n_threads;
                 } break;
             case GGML_OP_MUL_MAT:
                 {
@@ -6572,7 +6649,15 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                     } else {
                         if (node->src0->type == GGML_TYPE_F16 &&
                             node->src1->type == GGML_TYPE_F32) {
+#ifdef GGML_USE_ACCELERATE
+                            if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
+                                cur = sizeof(float)*(node->src0->ne[0]*node->src0->ne[1]);
+                            } else {
+                                cur = sizeof(ggml_fp16_t)*ggml_nelements(node->src1);
+                            }
+#else
                             cur = sizeof(ggml_fp16_t)*ggml_nelements(node->src1);
+#endif
                         } else if (node->src0->type == GGML_TYPE_F32 &&
                                    node->src1->type == GGML_TYPE_F32) {
                             cur = 0;
@@ -6585,7 +6670,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                 } break;
             case GGML_OP_SCALE:
                 {
-                    node->n_tasks = …
+                    node->n_tasks = n_threads;
                 } break;
             case GGML_OP_CPY:
             case GGML_OP_RESHAPE:
@@ -6599,7 +6684,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                 } break;
             case GGML_OP_SOFT_MAX:
                 {
-                    node->n_tasks = …
+                    node->n_tasks = n_threads;
                 } break;
             case GGML_OP_ROPE:
                 {
@@ -6714,7 +6799,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
             struct ggml_compute_params params = {
                 /*.type =*/ GGML_TASK_INIT,
                 /*.ith =*/ 0,
-                /*.nth =*/ …
+                /*.nth =*/ node->n_tasks,
                 /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
                 /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
             };
@@ -6898,9 +6983,9 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
 
         perf_total_per_op_us[node->op] += node->perf_time_us;
 
-        GGML_PRINT(" - %3d: [ %6d, %6d] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
+        GGML_PRINT(" - %3d: [ %6d, %6d, %6d] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
                 i,
-                node->ne[0], node->ne[1],
+                node->ne[0], node->ne[1], node->ne[2],
                 GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
                 (double) node->perf_cycles / (double) ggml_cycles_per_ms(),
                 (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
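The multi-threaded ADD and NORM paths above split work by striding over rows: thread ith of nth handles rows ith, ith+nth, ith+2*nth, and so on, so no two threads touch the same row. A standalone sketch of that pattern (illustrative only; in ggml the ith/nth values arrive through the compute params):

```c
// Illustrative row-striding work split, matching the
// `for (int j = ith; j < n; j += nth)` loops in the diff above.
static void add_rows_partial(const int ith, const int nth,
                             const int nrows, const int ncols,
                             float * dst, const float * a, const float * b) {
    for (int j = ith; j < nrows; j += nth) {   // this thread's rows
        for (int i = 0; i < ncols; ++i) {
            dst[j*ncols + i] = a[j*ncols + i] + b[j*ncols + i];
        }
    }
}
```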
main.cpp
CHANGED
@@ -21,7 +21,7 @@ std::string to_timestamp(int64_t t) {
     msec = msec - min * (1000 * 60);
     int64_t sec = msec / 1000;
     msec = msec - sec * 1000;
-    …
+
     char buf[32];
     snprintf(buf, sizeof(buf), "%02d:%02d:%02d.%03d", (int) hr, (int) min, (int) sec, (int) msec);
 
whisper.cpp
CHANGED
@@ -15,7 +15,7 @@
 #include <vector>
 
 #define USE_FLASH_ATTN
-…
+//#define USE_FLASH_FF
 
 // available whisper models
 enum e_model {
@@ -148,11 +148,11 @@ static const std::map<e_model, size_t> MEM_REQ_ENCODE = {
 };
 
 static const std::map<e_model, size_t> MEM_REQ_ENCODE_LAYER = {
-    { MODEL_TINY,   …
-    { MODEL_BASE,   …
-    { MODEL_SMALL,  …
-    { MODEL_MEDIUM, …
-    { MODEL_LARGE,  …
+    { MODEL_TINY,   104ull*MB },
+    { MODEL_BASE,   138ull*MB },
+    { MODEL_SMALL,  208ull*MB },
+    { MODEL_MEDIUM, 280ull*MB },
+    { MODEL_LARGE,  354ull*MB },
 };
 
 static const std::map<e_model, size_t> MEM_REQ_DECODE = {