Commit 7708c26 by ggerganov
1 Parent(s): ad9dd7b

whisper : revert mel-related changes (#0)


too much extra logic and complexity for small benefit

.gitignore CHANGED
@@ -9,6 +9,7 @@
  .DS_Store
  .vimspector.json
  /CMakeSettings.json
+ /talk-llama.dSYM/

  build/
  build-*/
Makefile CHANGED
@@ -512,9 +512,6 @@ ifdef GGML_CUDA
  OBJ_GGML += ggml/src/ggml-cuda.o
  OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
  OBJ_GGML += $(OBJ_CUDA_TMPL)
-
- #OBJ_WHISPER += src/whisper-mel-cuda.o
-
  ifdef WHISPER_FATAL_WARNINGS
  MK_NVCCFLAGS += -Werror all-warnings
  endif # WHISPER_FATAL_WARNINGS
@@ -623,10 +620,6 @@ ggml/src/ggml-cuda.o: \
  ggml/src/ggml-common.h \
  $(wildcard ggml/src/ggml-cuda/*.cuh)
  	$(NVCC_COMPILE)
-
- #src/whisper-mel-cuda.o: src/whisper-mel-cuda.cu src/whisper-mel-cuda.hpp
- #	$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
-
  endif # GGML_CUDA

  ifdef GGML_VULKAN
@@ -955,7 +948,6 @@ $(LIB_GGML_S): \

  src/whisper.o: \
  src/whisper.cpp \
- src/whisper-mel.hpp \
  include/whisper.h \
  ggml/include/ggml.h \
  ggml/include/ggml-alloc.h \
bindings/ruby/ext/extconf.rb CHANGED
@@ -1,7 +1,6 @@
  require 'mkmf'
  system("cp #{File.join(File.dirname(__FILE__),'..','..','..','whisper.cpp')} .")
  system("cp #{File.join(File.dirname(__FILE__),'..','..','..','whisper.h')} .")
- system("cp #{File.join(File.dirname(__FILE__),'..','..','..','whisper-mel.hpp')} .")
  system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml.h')} .")
  system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml.c')} .")
  system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-impl.h')} .")
src/CMakeLists.txt CHANGED
@@ -78,43 +78,13 @@ if (WHISPER_OPENVINO)
  set_target_properties(${TARGET} PROPERTIES FOLDER "libs")
  endif()

- #if (GGML_CUDA)
- #    cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES
- #
- #    find_package(CUDAToolkit)
- #    if (CUDAToolkit_FOUND)
- #        message(STATUS "CUDA found")
- #
- #        if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
- #            # 52 == lowest CUDA 12 standard
- #            # 60 == f16 CUDA intrinsics
- #            # 61 == integer CUDA intrinsics
- #            # 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
- #            set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
- #        endif()
- #        message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
- #
- #        enable_language(CUDA)
- #    else()
- #        message(WARNING "CUDA not found")
- #    endif()
- #endif()
-
  # whisper

  add_library(whisper
      ../include/whisper.h
      whisper.cpp
-     whisper-mel.hpp
      )

- # TODO: disabled because it relies on ggml internals that are no longer accessible (ggml-backend-impl.h, ggml-cuda/common.cuh, ..)
- #if (GGML_CUDA)
- #    target_sources(whisper PRIVATE whisper-mel-cuda.cu)
- #
- #    target_link_libraries(whisper PRIVATE CUDA::cufft)
- #endif()
-
  # Set the version numbers
  set_target_properties(whisper PROPERTIES
      VERSION ${PROJECT_VERSION}
src/whisper-mel-cuda.cu DELETED
@@ -1,363 +0,0 @@
- #define CUB_IGNORE_DEPRECATED_CPP_DIALECT
- #include "whisper-mel-cuda.hpp"
- #include "whisper.h"
-
- #include <ggml-backend.h>
-
- #include <cuda.h>
- #include <cuda_runtime.h>
- #include <cufft.h>
- #include <cublas_v2.h>
- #include <cuComplex.h>
- #include <cub/device/device_reduce.cuh>
- #include <device_launch_parameters.h>
-
- #include <algorithm>
-
- #if defined(_MSC_VER)
- #pragma warning(disable: 4324) // added padding
- #endif
-
- namespace {
-
- static const char* cufftGetErrorString(cufftResult_t res) {
-     switch (res) {
-         case CUFFT_SUCCESS: return "The cuFFT operation was successful";
-         case CUFFT_INVALID_PLAN: return "cuFFT was passed an invalid plan handle";
-         case CUFFT_ALLOC_FAILED: return "cuFFT failed to allocate GPU or CPU memory";
-         case CUFFT_INVALID_TYPE: return "No longer used";
-         case CUFFT_INVALID_VALUE: return "User specified an invalid pointer or parameter";
-         case CUFFT_INTERNAL_ERROR: return "Driver or internal cuFFT library error";
-         case CUFFT_EXEC_FAILED: return "Failed to execute an FFT on the GPU";
-         case CUFFT_SETUP_FAILED: return "The cuFFT library failed to initialize";
-         case CUFFT_INVALID_SIZE: return "User specified an invalid transform size";
-         case CUFFT_UNALIGNED_DATA: return "No longer used";
-         case CUFFT_INCOMPLETE_PARAMETER_LIST: return "Missing parameters in call";
-         case CUFFT_INVALID_DEVICE: return "Execution of a plan was on different GPU than plan creation";
-         case CUFFT_PARSE_ERROR: return "Internal plan database error";
-         case CUFFT_NO_WORKSPACE: return "No workspace has been provided prior to plan execution";
-         case CUFFT_NOT_IMPLEMENTED: return "Function does not implement functionality for parameters given.";
-         case CUFFT_LICENSE_ERROR: return "Used in previous versions.";
-         case CUFFT_NOT_SUPPORTED: return "Operation is not supported for parameters given.";
-         default: return "Unknown error";
-     }
- }
-
- #define CUFFT_CHECK(err) CUDA_CHECK_GEN(err, CUFFT_SUCCESS, cufftGetErrorString)
-
- __global__ void k_fill_stft_input(
-     const float * padded_samples,
-     const int n_frames,
-     const float * hann_window,
-     float * stft_in
- ) {
-     auto y = blockIdx.y * blockDim.y + threadIdx.y;
-     // if (y >= n_frames) return;
-     auto x = blockIdx.x * blockDim.x + threadIdx.x;
-     // if (x >= WHISPER_N_FFT) return;
-
-     auto line = padded_samples + y * WHISPER_HOP_LENGTH;
-     auto outLine = stft_in + y * WHISPER_N_FFT;
-
-     outLine[x] = line[x] * hann_window[x];
- }
-
- __global__ void k_calc_magnitudes(
-     const cuComplex * stft_out,
-     const int n_frames,
-     float * magnitudes
- ) {
-     auto y = blockIdx.y * blockDim.y + threadIdx.y;
-     // if (y >= n_frames) return;
-     auto x = blockIdx.x * blockDim.x + threadIdx.x;
-     // if (x >= WHISPER_N_FFT_HALF) return;
-
-     auto idx = y * WHISPER_N_FFT_HALF + x;
-
-     auto r = stft_out[idx].x;
-     auto i = stft_out[idx].y;
-     magnitudes[idx] = r * r + i * i;
- }
-
- __global__ void k_calc_log_mel(
-     const float * mel_data,
-     const int n_mel,
-     const float * max_val,
-     float * log_mel
- ) {
-     auto x = blockIdx.x * blockDim.x + threadIdx.x;
-     if (x >= n_mel) return;
-
-     float val = mel_data[x];
-
-     constexpr float e = 1e-10f;
-     if (val < e) val = e;
-
-     val = log10(val);
-
-     const float max = log10(*max_val) - 8.f;
-     if (val < max) val = max;
-
-     log_mel[x] = (val + 4) / 4;
- }
-
- static void fill_stft_input(
-     const float * padded_samples,
-     int n_frames,
-     const float * hann_window,
-     float * stft_in,
-     cudaStream_t stream
- ) {
-     dim3 block(WHISPER_N_FFT, 1);
-     dim3 grid(1, n_frames);
-
-     k_fill_stft_input<<<grid, block, 0, stream>>>(padded_samples, n_frames, hann_window, stft_in);
- }
-
- static void calc_magnitudes(
-     const cuComplex * stft_out,
-     int n_frames,
-     float * magnitudes,
-     cudaStream_t stream
- ) {
-     dim3 block(WHISPER_N_FFT_HALF, 1);
-     dim3 grid(1, n_frames);
-     k_calc_magnitudes<<<grid, block, 0, stream>>>(stft_out, n_frames, magnitudes);
- }
-
- constexpr auto LOG_MEL_PREFIX_SIZE = 256;
-
- static void calc_log_mel(
-     const float * mel_data,
-     int n_mel,
-     void * tempStorage,
-     int tempStorageSize,
-     float * log_mel,
-     cudaStream_t stream
- ) {
-     float * max_val = reinterpret_cast<float *>(tempStorage);
-     void * maxTemp = reinterpret_cast<char*>(tempStorage) + LOG_MEL_PREFIX_SIZE;
-
-     size_t nbytes = size_t(tempStorageSize - LOG_MEL_PREFIX_SIZE);
-     cub::DeviceReduce::Max(maxTemp, nbytes, mel_data, max_val, n_mel, stream);
-
-     int block = 256;
-     int grid = (n_mel + block - 1) / block;
-
-     k_calc_log_mel<<<grid, block, 0, stream>>>(mel_data, n_mel, max_val, log_mel);
- }
-
- class mel_calc_cuda : public whisper_mel_calc {
-     const int m_n_mel;
-
-     ggml_backend_t m_backend = nullptr;
-     int m_device = -1;
-
-     cudaStream_t m_stream = nullptr;
-     cublasHandle_t m_cublas_handle = nullptr;
-
-     float * m_hann_window = nullptr;
-
-     float * m_filters = nullptr;
-
-     // max samples for which we have allocated memory for the temp working areas below (cufft, log_mel)
-     int m_n_max_samples = 0;
-
-     size_t m_cufft_workspace_size = 0;
-     void * m_cufft_workspace = nullptr;
-
-     size_t m_log_mel_temp_storage_size = 0;
-     void * m_log_mel_temp_storage = nullptr;
- public:
-     mel_calc_cuda(ggml_backend_t backend, const whisper_filters & filters)
-         : m_n_mel(filters.n_mel)
-         , m_backend(backend)
-     {
-         ggml_backend_cuda_context* cuda_ctx = (ggml_backend_cuda_context*)m_backend->context;
-         m_device = cuda_ctx->device;
-
-         if (ggml_cuda_info().devices[m_device].cc < 600) {
-             // we've only tesed on 6.0 and higher and we've had reports of crashes on 5.0:
-             // https://github.com/ggerganov/whisper.cpp/issues/2230
-             // to be safe forbid anything below 6.0
-             throw std::runtime_error("CUDA compute capability 6.0 or higher is required");
-         }
-
-         ggml_cuda_set_device(m_device);
-
-         if (filters.n_fft != WHISPER_N_FFT_HALF) {
-             throw std::invalid_argument("MelFilters n_frames must be WHISPER_N_FFT_HALF");
-         }
-         assert(filters.data.size() == filters.n_mel * WHISPER_N_FFT_HALF);
-
-         CUDA_CHECK(cudaStreamCreate(&m_stream));
-         CUBLAS_CHECK(cublasCreate(&m_cublas_handle));
-         CUBLAS_CHECK(cublasSetMathMode(m_cublas_handle, CUBLAS_TF32_TENSOR_OP_MATH));
-         CUBLAS_CHECK(cublasSetStream(m_cublas_handle, m_stream));
-
-         // create Hann window
-         {
-             auto hw = whisper_mel_calc::hann_window();
-             CUDA_CHECK(cudaMallocAsync(&m_hann_window, hw.len * sizeof(float), m_stream));
-             CUDA_CHECK(cudaMemcpyAsync(m_hann_window, hw.data, hw.len * sizeof(float), cudaMemcpyHostToDevice, m_stream));
-         }
-
-         // fill filters
-         {
-             auto& f = filters.data;
-             CUDA_CHECK(cudaMallocAsync(&m_filters, f.size() * sizeof(float), m_stream));
-             CUDA_CHECK(cudaMemcpyAsync(m_filters, f.data(), f.size() * sizeof(float), cudaMemcpyHostToDevice, m_stream));
-         }
-
-         // preallocate working areas enough for the most common cases (<= 30s)
-         ensure_working_areas(WHISPER_N_SAMPLES);
-     }
-
-     ~mel_calc_cuda() {
-         ggml_cuda_set_device(m_device);
-         CUDA_CHECK(cudaStreamSynchronize(m_stream));
-         CUDA_CHECK(cudaStreamDestroy(m_stream));
-         CUDA_CHECK(cudaFree(m_hann_window));
-         CUDA_CHECK(cudaFree(m_cufft_workspace));
-         CUDA_CHECK(cudaFree(m_filters));
-         CUDA_CHECK(cudaFree(m_log_mel_temp_storage));
-     }
-
-     void ensure_working_areas(int n_samples) {
-         if (n_samples <= m_n_max_samples) {
-             return;
-         }
-
-         const auto max_padded_samples = n_samples + WHISPER_N_SAMPLES + WHISPER_N_FFT;
-         const auto max_frames = 1 + (max_padded_samples - WHISPER_N_FFT) / WHISPER_HOP_LENGTH;
-
-         // cufft workspace
-         {
-             if (m_cufft_workspace) {
-                 CUDA_CHECK(cudaFree(m_cufft_workspace));
-                 m_cufft_workspace_size = 0;
-                 m_cufft_workspace = nullptr;
-             }
-             CUFFT_CHECK(cufftEstimate1d(WHISPER_N_FFT, CUFFT_R2C, max_frames, &m_cufft_workspace_size));
-             CUDA_CHECK(cudaMallocAsync(&m_cufft_workspace, m_cufft_workspace_size, m_stream));
-         }
-
-         // device reduce working area
-         {
-             if (m_log_mel_temp_storage) {
-                 CUDA_CHECK(cudaFree(m_log_mel_temp_storage));
-                 m_log_mel_temp_storage_size = 0;
-                 m_log_mel_temp_storage = nullptr;
-             }
-
-             const auto max_mels = 160;
-
-             size_t nbytes = 0;
-             float* temp = nullptr;
-             cub::DeviceReduce::Max(nullptr, nbytes, temp, temp, max_frames * max_mels);
-             m_log_mel_temp_storage_size = nbytes + LOG_MEL_PREFIX_SIZE;
-
-             CUDA_CHECK(cudaMallocAsync(&m_log_mel_temp_storage, m_log_mel_temp_storage_size, m_stream));
-         }
-
-         m_n_max_samples = n_samples;
-     }
-
-     virtual whisper_mel calculate(whisper_span<const float> samples, int /*n_threads*/) override {
-         ggml_cuda_set_device(m_device);
-         ensure_working_areas(samples.len);
-
-         const size_t mirror_pad = WHISPER_N_FFT / 2;
-         const size_t padded_size = samples.len + WHISPER_N_SAMPLES + WHISPER_N_FFT;
-
-         // pad
-         std::vector<float> padded_samples(padded_size);
-         std::reverse_copy(samples.data + 1, samples.data + 1 + mirror_pad, padded_samples.begin()); // reflect
-         std::copy(samples.data, samples.data + samples.len, padded_samples.begin() + mirror_pad); // copy
-
-         // fill the rest of the data
-         // it should canonically be mirrored at the end as well,
-         // but we just assume the last MEL_FRAME_SIZE/2 samples are zeros
-         std::fill(padded_samples.begin() + mirror_pad + samples.len, padded_samples.end(), 0.f);
-
-         const auto n_frames = 1 + (padded_samples.size() - WHISPER_N_FFT) / WHISPER_HOP_LENGTH;
-
-         float * cu_padded_samples = nullptr;
-         CUDA_CHECK(cudaMallocAsync(&cu_padded_samples, padded_samples.size() * sizeof(float), m_stream));
-         CUDA_CHECK(cudaMemcpyAsync(cu_padded_samples, padded_samples.data(), padded_samples.size() * sizeof(float), cudaMemcpyHostToDevice, m_stream));
-
-         float * stft_in = nullptr; // contiguous buffer for stft input
-         CUDA_CHECK(cudaMallocAsync(&stft_in, n_frames * WHISPER_N_FFT * sizeof(float), m_stream));
-
-         fill_stft_input(cu_padded_samples, int(n_frames), m_hann_window, stft_in, m_stream);
-
-         cufftComplex* stft_out;
-         CUDA_CHECK(cudaMallocAsync(&stft_out, n_frames * WHISPER_N_FFT_HALF * sizeof(cufftComplex), m_stream));
-
-         cufftHandle plan;
-         CUFFT_CHECK(cufftCreate(&plan));
-         CUFFT_CHECK(cufftSetAutoAllocation(plan, 0));
-         {
-             size_t waSize;
-             CUFFT_CHECK(cufftMakePlan1d(plan, WHISPER_N_FFT, CUFFT_R2C, int(n_frames), &waSize));
-             assert(waSize <= m_cufft_workspace_size);
-             CUFFT_CHECK(cufftSetWorkArea(plan, m_cufft_workspace));
-             CUFFT_CHECK(cufftSetStream(plan, m_stream));
-         }
-         CUFFT_CHECK(cufftExecR2C(plan, stft_in, stft_out));
-
-         const auto n_mag_frames = n_frames - 1; // drop last frame
-         float * magnitudes;
-         CUDA_CHECK(cudaMallocAsync(&magnitudes, n_mag_frames * WHISPER_N_FFT_HALF * sizeof(float), m_stream));
-         calc_magnitudes(stft_out, int(n_mag_frames), magnitudes, m_stream);
-
-         float * mel_data = nullptr;
-         CUDA_CHECK(cudaMallocAsync(&mel_data, m_n_mel * n_mag_frames * sizeof(float), m_stream));
-
-         const float fone = 1.0f, fzero = 0.0f;
-         CUBLAS_CHECK(cublasSgemm(m_cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N,
-             int(n_mag_frames), m_n_mel, WHISPER_N_FFT_HALF,
-             &fone,
-             magnitudes, WHISPER_N_FFT_HALF,
-             m_filters, WHISPER_N_FFT_HALF,
-             &fzero,
-             mel_data, int(n_mag_frames)));
-
-         whisper_mel ret;
-         // Calculate semi-padded sample length to ensure compatibility
-         int n_len_org = 1 + int(samples.len + mirror_pad - WHISPER_N_FFT) / WHISPER_HOP_LENGTH;
-         whisper_mel_init(ret, m_backend, int(n_mag_frames), n_len_org, m_n_mel);
-         assert(ggml_nbytes(ret.tensor) == m_n_mel * n_mag_frames * sizeof(float));
-
-         float* log_mels = reinterpret_cast<float*>(ret.tensor->data);
-
-         calc_log_mel(
-             mel_data, int(m_n_mel * n_mag_frames),
-             m_log_mel_temp_storage , int(m_log_mel_temp_storage_size),
-             log_mels, m_stream);
-
-         CUDA_CHECK(cudaStreamSynchronize(m_stream));
-
-         // cleanup
-         CUFFT_CHECK(cufftDestroy(plan));
-         CUDA_CHECK(cudaFreeAsync(mel_data, m_stream));
-         CUDA_CHECK(cudaFreeAsync(magnitudes, m_stream));
-         CUDA_CHECK(cudaFreeAsync(stft_out, m_stream));
-         CUDA_CHECK(cudaFreeAsync(stft_in, m_stream));
-         CUDA_CHECK(cudaFreeAsync(cu_padded_samples, m_stream));
-
-         return ret;
-     }
- };
-
- }
-
- whisper_mel_calc * whisper_mel_calc_create_cuda(ggml_backend_t backend, const whisper_filters & filters) {
-     try {
-         return new mel_calc_cuda(backend, filters);
-     }
-     catch (...) {
-         // TODO: log error (but for this we would have to expose the log state to be accessible here)
-         return nullptr;
-     }
- }
src/whisper-mel-cuda.hpp DELETED
@@ -1,3 +0,0 @@
- #include "whisper-mel.hpp"
-
- whisper_mel_calc * whisper_mel_calc_create_cuda(ggml_backend_t backend, const whisper_filters & filters);
src/whisper-mel.hpp DELETED
@@ -1,34 +0,0 @@
- #pragma once
- #include "ggml-backend.h"
- #include <vector>
-
- struct whisper_mel {
-     int n_len_org = 0;
-
-     ggml_context * ctx = nullptr;
-     ggml_tensor * tensor = nullptr;
-     ggml_backend_buffer_t buffer = nullptr;
- };
-
- void whisper_mel_init(whisper_mel & mel, ggml_backend_t backend, int n_len, int n_len_org, int n_mel);
-
- void whisper_mel_free(whisper_mel & mel);
-
- struct whisper_filters {
-     int32_t n_mel;
-     int32_t n_fft;
-
-     std::vector<float> data;
- };
-
- template <typename T>
- struct whisper_span {
-     T * data;
-     int len;
- };
-
- struct whisper_mel_calc {
-     virtual ~whisper_mel_calc();
-     virtual whisper_mel calculate(whisper_span<const float> samples, int n_threads) = 0;
-     static whisper_span<const float> hann_window();
- };
src/whisper.cpp CHANGED
@@ -10,7 +10,6 @@

  #ifdef GGML_USE_CUDA
  #include "ggml-cuda.h"
- #include "whisper-mel-cuda.hpp"
  #endif

  #ifdef GGML_USE_SYCL
@@ -37,8 +36,6 @@
  #include "ggml-alloc.h"
  #include "ggml-backend.h"

- #include "whisper-mel.hpp"
-
  #include <atomic>
  #include <algorithm>
  #include <cassert>
@@ -401,6 +398,21 @@ static const std::map<whisper_alignment_heads_preset, whisper_aheads> g_aheads {

  static std::vector<uint32_t> get_alignment_heads_by_layer(const whisper_context_params & cparams, int il, int32_t n_text_layer, int32_t n_head);

+ struct whisper_mel {
+     int n_len;
+     int n_len_org;
+     int n_mel;
+
+     std::vector<float> data;
+ };
+
+ struct whisper_filters {
+     int32_t n_mel;
+     int32_t n_fft;
+
+     std::vector<float> data;
+ };
+
  struct whisper_vocab {
      using id = int32_t;
      using token = std::string;
@@ -830,8 +842,6 @@ struct whisper_state {
      whisper_kv_cache kv_pad;

      whisper_mel mel;
-     whisper_mel_calc * mel_calc = nullptr;
-     whisper_mel_calc * mel_calc_fallback = nullptr;

      whisper_batch batch;

@@ -850,6 +860,7 @@ struct whisper_state {
      struct ggml_tensor * embd_enc = nullptr;

      // helpers for GPU offloading
+     std::vector<float> inp_mel;
      std::vector<float> inp_mask;

      // decode output (2-dimensional array: [n_tokens][n_vocab])
@@ -1912,8 +1923,7 @@ static bool whisper_encode_external(const whisper_state & wstate) {

  static struct ggml_cgraph * whisper_build_graph_conv(
          whisper_context & wctx,
-         whisper_state & wstate,
-         const int mel_offset) {
+         whisper_state & wstate) {
      const auto & model = wctx.model;
      const auto & hparams = model.hparams;

@@ -1932,35 +1942,9 @@ static struct ggml_cgraph * whisper_build_graph_conv(

      ggml_cgraph * gf = ggml_new_graph(ctx0);

-     GGML_ASSERT(wstate.mel.tensor);
-
-     ggml_tensor * mel_inp = wstate.mel.tensor;
-     ggml_set_input(mel_inp);
-
-     ggml_tensor * mel;
-     if (ggml_nelements(mel_inp) > 0) {
-         const int n_len = int(mel_inp->ne[0]);
-         const int out_s = 2 * n_ctx;
-         const int i0 = std::min(mel_offset, n_len);
-         const int i1 = std::min(mel_offset + out_s, n_len);
-         const int mel_s = i1 - i0;
-
-         assert(mel_inp->type == GGML_TYPE_F32);
-         assert(mel_inp->ne[1] == n_mels);
-
-         ggml_tensor * cur = ggml_view_2d(ctx0, mel_inp, out_s, n_mels, mel_inp->nb[1], ggml_row_size(mel_inp->type, i0));
-
-         if (mel_s < out_s) {
-             mel = ggml_pad(ctx0, cur, out_s - mel_s, 0, 0, 0);
-         } else {
-             mel = ggml_cont(ctx0, cur);
-         }
-     } else {
-         // empty mel - just create a dummy tensor with the correct size
-         mel = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 2*n_ctx, n_mels);
-     }
-
+     struct ggml_tensor * mel = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 2*n_ctx, n_mels);
      ggml_set_name(mel, "mel");
+     ggml_set_input(mel);

      struct ggml_tensor * cur = nullptr;

@@ -2332,21 +2316,45 @@ static bool whisper_encode_internal(
      {
          auto & sched = wstate.sched_conv.sched;

-         ggml_cgraph * gf = whisper_build_graph_conv(wctx, wstate, mel_offset);
+         ggml_cgraph * gf = whisper_build_graph_conv(wctx, wstate);

          if (!ggml_backend_sched_alloc_graph(sched, gf)) {
              // should never happen as we pre-allocate the memory
              return false;
          }

-         if (!ggml_graph_compute_helper(sched, gf, n_threads)) {
-             return false;
+         struct ggml_tensor * mel = ggml_graph_get_tensor(gf, "mel");
+
+         // set the input
+         {
+             const auto & mel_inp = wstate.mel;
+             const int n_ctx = wstate.exp_n_audio_ctx > 0 ? wstate.exp_n_audio_ctx : wctx.model.hparams.n_audio_ctx;
+
+             assert(mel->type == GGML_TYPE_F32);
+             assert(mel_inp.n_mel == wctx.model.hparams.n_mels);
+
+             wstate.inp_mel.resize(ggml_nelements(mel));
+
+             float * dst = wstate.inp_mel.data();
+             memset(dst, 0, ggml_nbytes(mel));
+
+             const int i0 = std::min(mel_offset, mel_inp.n_len);
+             const int i1 = std::min(mel_offset + 2*n_ctx, mel_inp.n_len);
+
+             for (int j = 0; j < mel_inp.n_mel; ++j) {
+                 for (int i = i0; i < i1; ++i) {
+                     dst[j*2*n_ctx + (i - i0)] = mel_inp.data[j*mel_inp.n_len + i];
+                 }
+             }
+
+             ggml_backend_tensor_set(mel, wstate.inp_mel.data(), 0, ggml_nelements(mel)*sizeof(float));
          }

-         if (whisper_encode_external(wstate)) {
-             ggml_tensor * mel = ggml_graph_get_tensor(gf, "mel");
-             assert(mel->ne[1] == wctx.model.hparams.n_mels);
-             GGML_UNUSED(mel);
+         if (!whisper_encode_external(wstate)) {
+             if (!ggml_graph_compute_helper(sched, gf, n_threads)) {
+                 return false;
+             }
+         } else {
  #if defined(WHISPER_USE_COREML)
              whisper_coreml_encode(wstate.ctx_coreml, mel->ne[0], mel->ne[1], (float *) mel->data, (float *) wstate.embd_enc->data);
  #elif defined(WHISPER_USE_OPENVINO)
@@ -2970,35 +2978,6 @@ struct whisper_global_cache {
  } global_cache;
  }

- // Mel spectrogram
-
- void whisper_mel_init(whisper_mel & mel, ggml_backend_t backend, int n_len, int n_len_org, int n_mel) {
-     //WHISPER_LOG_INFO("%s: n_len = %d, n_len_org = %d, n_mel = %d\n", __func__, n_len, n_len_org, n_mel);
-     mel.n_len_org = n_len_org;
-     assert(!mel.ctx);
-     mel.ctx = ggml_init({ggml_tensor_overhead(), nullptr, true});
-     mel.tensor = ggml_new_tensor_2d(mel.ctx, GGML_TYPE_F32, n_len, n_mel);
-     mel.buffer = ggml_backend_alloc_buffer(backend, ggml_nbytes(mel.tensor) + ggml_backend_get_alignment(backend));
-     auto alloc = ggml_tallocr_new(mel.buffer);
-     ggml_tallocr_alloc(&alloc, mel.tensor);
- }
-
- void whisper_mel_free(whisper_mel & mel) {
-     ggml_free(mel.ctx);
-     ggml_backend_buffer_free(mel.buffer);
-
-     mel.n_len_org = 0;
-     mel.ctx = nullptr;
-     mel.tensor = nullptr;
-     mel.buffer = nullptr;
- }
-
- whisper_mel_calc::~whisper_mel_calc() = default; // export vtable
-
- whisper_span<const float> whisper_mel_calc::hann_window() {
-     return {global_cache.hann_window, WHISPER_N_FFT};
- }
-
  // naive Discrete Fourier Transform
  // input is real-valued
  // output is complex-valued
@@ -3068,22 +3047,12 @@ static void fft(float* in, int N, float* out) {
      }
  }

- namespace {
-
- struct whisper_mel_data {
-     int n_len;
-     int n_len_org;
-     int n_mel;
-     float * data;
- };
-
- void log_mel_spectrogram_worker_thread(int ith, const float * hann, const std::vector<float> & samples,
-                                        int n_samples, int n_threads,
-                                        const whisper_filters & filters, whisper_mel_data & mel) {
-     const auto frame_size = WHISPER_N_FFT;
-     const auto frame_step = WHISPER_HOP_LENGTH;
+ static void log_mel_spectrogram_worker_thread(int ith, const float * hann, const std::vector<float> & samples,
+                                               int n_samples, int frame_size, int frame_step, int n_threads,
+                                               const whisper_filters & filters, whisper_mel & mel) {
      std::vector<float> fft_in(frame_size * 2, 0.0);
      std::vector<float> fft_out(frame_size * 2 * 2 * 2);
+
      int n_fft = filters.n_fft;
      int i = ith;

@@ -3098,6 +3067,7 @@ void log_mel_spectrogram_worker_thread(int ith, const float * hann, const std::v
          for (int j = 0; j < std::min(frame_size, n_samples - offset); j++) {
              fft_in[j] = hann[j] * samples[offset + j];
          }
+
          // fill the rest with zeros
          if (n_samples - offset < frame_size) {
              std::fill(fft_in.begin() + (n_samples - offset), fft_in.end(), 0.0);
@@ -3115,7 +3085,6 @@ void log_mel_spectrogram_worker_thread(int ith, const float * hann, const std::v
          // mel spectrogram
          for (int j = 0; j < mel.n_mel; j++) {
              double sum = 0.0;
-
              // unroll loop (suggested by GH user @lunixbochs)
              int k = 0;
              for (k = 0; k < n_fft - 3; k += 4) {
@@ -3125,14 +3094,11 @@ void log_mel_spectrogram_worker_thread(int ith, const float * hann, const std::v
                      fft_out[k + 2] * filters.data[j * n_fft + k + 2] +
                      fft_out[k + 3] * filters.data[j * n_fft + k + 3];
              }
-
              // handle n_fft remainder
              for (; k < n_fft; k++) {
                  sum += fft_out[k] * filters.data[j * n_fft + k];
              }
-
              sum = log10(std::max(sum, 1e-10));
-
              mel.data[j * mel.n_len + i] = sum;
          }
      }
@@ -3146,116 +3112,97 @@ void log_mel_spectrogram_worker_thread(int ith, const float * hann, const std::v
      }
  }

- struct mel_calc_cpu : public whisper_mel_calc {
-     ggml_backend_t m_backend;
-     const whisper_filters & m_filters;
-     mel_calc_cpu(ggml_backend_t backend, const whisper_filters & filters) : m_backend(backend), m_filters(filters) {}
-
-     // ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L110-L157
-     whisper_mel calculate(whisper_span<const float> ssamples, int n_threads) override {
-         // Hann window
-         const float * hann = global_cache.hann_window;
-
-         // Calculate the length of padding
-         int64_t stage_1_pad = WHISPER_SAMPLE_RATE * 30;
-         int64_t stage_2_pad = WHISPER_N_FFT / 2;
-
-         const int n_samples = int(ssamples.len);
-         const float * samples = ssamples.data;
-
-         // Initialize a vector and copy data from C array to it.
-         std::vector<float> samples_padded;
-         samples_padded.resize(n_samples + stage_1_pad + stage_2_pad * 2);
-         std::copy(samples, samples + n_samples, samples_padded.begin() + stage_2_pad);
-
-         // pad 30 seconds of zeros at the end of audio (480,000 samples) + reflective pad 200 samples at the end of audio
-         std::fill(samples_padded.begin() + n_samples + stage_2_pad, samples_padded.begin() + n_samples + stage_1_pad + 2 * stage_2_pad, 0);
-
-         // reflective pad 200 samples at the beginning of audio
-         std::reverse_copy(samples + 1, samples + 1 + stage_2_pad, samples_padded.begin());
-
-         whisper_mel_data mel;
-         mel.n_mel = m_filters.n_mel;
-         // https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/SpectralOps.cpp#L936
-         // Calculate number of frames + remove the last frame
-         mel.n_len = (samples_padded.size() - WHISPER_N_FFT) / WHISPER_HOP_LENGTH;
-         // Calculate semi-padded sample length to ensure compatibility
-         mel.n_len_org = 1 + (n_samples + stage_2_pad - WHISPER_N_FFT) / WHISPER_HOP_LENGTH;
-
-         std::vector<float> host_mel_data;
-
-         whisper_mel ret;
-         whisper_mel_init(ret, m_backend, mel.n_len, mel.n_len_org, mel.n_mel);
-         if (ggml_backend_buffer_is_host(ret.buffer)) {
-             mel.data = reinterpret_cast<float*>(ret.tensor->data);
-         } else {
-             host_mel_data.resize(mel.n_len * mel.n_mel);
-             mel.data = host_mel_data.data();
-         }
-
-         {
-             std::vector<std::thread> workers(n_threads - 1);
-             for (int iw = 0; iw < n_threads - 1; ++iw) {
-                 workers[iw] = std::thread(
-                         log_mel_spectrogram_worker_thread, iw + 1, hann, samples_padded,
-                         n_samples + stage_2_pad, n_threads,
-                         std::cref(m_filters), std::ref(mel));
-             }
-
-             // main thread
-             log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples + stage_2_pad, n_threads, m_filters, mel);
-
-             for (int iw = 0; iw < n_threads - 1; ++iw) {
-                 workers[iw].join();
-             }
-         }
-
-         // clamping and normalization
-         double mmax = -1e20;
-         for (int i = 0; i < mel.n_mel*mel.n_len; i++) {
-             if (mel.data[i] > mmax) {
-                 mmax = mel.data[i];
-             }
-         }
-
-         mmax -= 8.0;
-
-         for (int i = 0; i < mel.n_mel*mel.n_len; i++) {
-             if (mel.data[i] < mmax) {
-                 mel.data[i] = mmax;
-             }
-
-             mel.data[i] = (mel.data[i] + 4.0)/4.0;
-         }
-
-         if (!host_mel_data.empty()) {
-             // the ret buffer is not host-accessible so we used this temporary buffer and now we need to upload it
-             ggml_backend_tensor_set(ret.tensor, host_mel_data.data(), 0, ggml_nbytes(ret.tensor));
-         }
-
-         return ret;
-     }
- };
- }
-
- static whisper_mel_calc * whisper_mel_calc_create(ggml_backend_t backend, const whisper_filters & filters) {
-     // TODO: disabled because it relies on ggml internals that are no longer accessible (ggml-backend-impl.h, ggml-cuda/common.cuh, ..)
-     //#if defined(GGML_USE_CUDA) && !defined(GGML_USE_HIPBLAS)
- #if 0
-     if (ggml_backend_is_cuda(backend)) {
-         auto ret = whisper_mel_calc_create_cuda(backend, filters);
-         if (ret) {
-             // run a warmup to avoid the first kernel launch overhead (thus we get the best perf even on the first run)
-             const float warmup[256] = { 0 };
-             ret->calculate({ warmup, 256 }, 1);
-             return ret;
-         }
-     }
- #endif
-
-     // a specialized mel_calc could not be created
-     // fall back to CPU
-     return new mel_calc_cpu(backend, filters);
- }
+ // ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L110-L157
+ static bool log_mel_spectrogram(
+               whisper_state & wstate,
+               const float * samples,
+               const int n_samples,
+               const int /*sample_rate*/,
+               const int frame_size,
+               const int frame_step,
+               const int n_mel,
+               const int n_threads,
+               const whisper_filters & filters,
+               const bool debug,
+               whisper_mel & mel) {
+     const int64_t t_start_us = ggml_time_us();
+
+     // Hann window
+     WHISPER_ASSERT(frame_size == WHISPER_N_FFT && "Unsupported frame_size");
+     const float * hann = global_cache.hann_window;
+
+     // Calculate the length of padding
+     int64_t stage_1_pad = WHISPER_SAMPLE_RATE * 30;
+     int64_t stage_2_pad = frame_size / 2;
+
+     // Initialize a vector and copy data from C array to it.
+     std::vector<float> samples_padded;
+     samples_padded.resize(n_samples + stage_1_pad + stage_2_pad * 2);
+     std::copy(samples, samples + n_samples, samples_padded.begin() + stage_2_pad);
+
+     // pad 30 seconds of zeros at the end of audio (480,000 samples) + reflective pad 200 samples at the end of audio
+     std::fill(samples_padded.begin() + n_samples + stage_2_pad, samples_padded.begin() + n_samples + stage_1_pad + 2 * stage_2_pad, 0);
+
+     // reflective pad 200 samples at the beginning of audio
+     std::reverse_copy(samples + 1, samples + 1 + stage_2_pad, samples_padded.begin());
+
+     mel.n_mel = n_mel;
+     // https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/SpectralOps.cpp#L936
+     // Calculate number of frames + remove the last frame
+     mel.n_len = (samples_padded.size() - frame_size) / frame_step;
+     // Calculate semi-padded sample length to ensure compatibility
+     mel.n_len_org = 1 + (n_samples + stage_2_pad - frame_size) / frame_step;
+     mel.data.resize(mel.n_mel * mel.n_len);
+
+     {
+         std::vector<std::thread> workers(n_threads - 1);
+         for (int iw = 0; iw < n_threads - 1; ++iw) {
+             workers[iw] = std::thread(
+                     log_mel_spectrogram_worker_thread, iw + 1, hann, samples_padded,
+                     n_samples + stage_2_pad, frame_size, frame_step, n_threads,
+                     std::cref(filters), std::ref(mel));
+         }
+
+         // main thread
+         log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples + stage_2_pad, frame_size, frame_step, n_threads, filters, mel);
+
+         for (int iw = 0; iw < n_threads - 1; ++iw) {
+             workers[iw].join();
+         }
+     }
+
+     // clamping and normalization
+     double mmax = -1e20;
+     for (int i = 0; i < mel.n_mel*mel.n_len; i++) {
+         if (mel.data[i] > mmax) {
+             mmax = mel.data[i];
+         }
+     }
+
+     mmax -= 8.0;
+
+     for (int i = 0; i < mel.n_mel*mel.n_len; i++) {
+         if (mel.data[i] < mmax) {
+             mel.data[i] = mmax;
+         }
+
+         mel.data[i] = (mel.data[i] + 4.0)/4.0;
+     }
+
+     wstate.t_mel_us += ggml_time_us() - t_start_us;
+
+     // Dump log_mel_spectrogram
+     if (debug) {
+         std::ofstream outFile("log_mel_spectrogram.json");
+         outFile << "[";
+         for (uint64_t i = 0; i < mel.data.size() - 1; i++) {
+             outFile << mel.data[i] << ", ";
+         }
+         outFile << mel.data[mel.data.size() - 1] << "]";
+         outFile.close();
+     }
+
+     return true;
+ }

  // split text into tokens
@@ -3380,17 +3327,6 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
          return nullptr;
      }

-     state->mel_calc = whisper_mel_calc_create(state->backends[0], ctx->model.filters);
-
-     // init 60s of random mel data
-     {
-         const int n_len = 2*100*WHISPER_CHUNK_SIZE;
-         const int n_mel = ctx->model.filters.n_mel;
-
-         whisper_mel_free(state->mel);
-         whisper_mel_init(state->mel, state->backends[0], n_len, n_len, n_mel);
-     }
-
      // at this point, we don't know yet how many decoders will be used
      // later during decoding, if more decoders are used, we will recreate the KV cache respectively
      state->kv_self_n_dec = 1;
@@ -3483,7 +3419,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
      {
          bool ok = whisper_sched_graph_init(state->sched_conv, state->backends,
                  [&]() {
-                     return whisper_build_graph_conv(*ctx, *state, 0);
+                     return whisper_build_graph_conv(*ctx, *state);
                  });

          if (!ok) {
@@ -3805,13 +3741,6 @@ void whisper_free_state(struct whisper_state * state) {
      whisper_kv_cache_free(state->kv_cross);
      whisper_kv_cache_free(state->kv_pad);

-     whisper_mel_free(state->mel);
-
-     delete state->mel_calc;
-     state->mel_calc = nullptr;
-     delete state->mel_calc_fallback;
-     state->mel_calc_fallback = nullptr;
-
  #ifdef WHISPER_USE_COREML
      if (state->ctx_coreml != nullptr) {
          whisper_coreml_free(state->ctx_coreml);
@@ -3869,37 +3798,11 @@ void whisper_free_params(struct whisper_full_params * params) {
  }

  int whisper_pcm_to_mel_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
-     const int64_t t_start_us = ggml_time_us();
-
-     whisper_mel_free(state->mel);
-     if (n_samples <= 5 * 60 * WHISPER_SAMPLE_RATE) {
-         // calculate mel spectrogram for lengths up to 5 minutes on the most optimal mel calculator
-         state->mel = state->mel_calc->calculate({samples, n_samples}, n_threads);
-     } else {
-         // calcuate mel spectrogram for longer audios on the CPU
-         // 1. gpu calculations may use hundreds of megabytes of memory for longer audios so we're being conservative
-         //    with our gpu demands
-         // 2. the time to transcribe audios this long will be dominated by the decoding time, so the mel calculation
-         //    taking longer is not a major concern
-         if (!state->mel_calc_fallback) {
-             state->mel_calc_fallback = new mel_calc_cpu(state->backends[0], ctx->model.filters);
-         }
-         state->mel = state->mel_calc_fallback->calculate({samples, n_samples}, n_threads);
+     if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, ctx->model.filters.n_mel, n_threads, ctx->model.filters, false, state->mel)) {
+         WHISPER_LOG_ERROR("%s: failed to compute mel spectrogram\n", __func__);
+         return -1;
      }

-     state->t_mel_us += ggml_time_us() - t_start_us;
-
-     // Dump log_mel_spectrogram
-     //{
-     //  auto& mel = state->mel;
-     //  std::ofstream outFile("log_mel_spectrogram.json");
-     //  outFile << "[";
-     //  for (uint64_t i = 0; i < mel.data.size() - 1; i++) {
-     //      outFile << mel.data[i] << ", ";
-     //  }
-     //  outFile << mel.data[mel.data.size() - 1] << "]";
-     //  outFile.close();
-     //}
      return 0;
  }

@@ -3918,10 +3821,12 @@ int whisper_set_mel_with_state(
          return -1;
      }

-     whisper_mel_free(state->mel);
-     whisper_mel_init(state->mel, state->backends[0], n_len, n_len, n_mel);
+     state->mel.n_len = n_len;
+     state->mel.n_len_org = n_len;
+     state->mel.n_mel = n_mel;

-     ggml_backend_tensor_set(state->mel.tensor, data, 0, ggml_nbytes(state->mel.tensor));
+     state->mel.data.resize(n_len*n_mel);
+     memcpy(state->mel.data.data(), data, n_len*n_mel*sizeof(float));

      return 0;
  }