ggml : backport llama.cpp updates (close #709)

- About 2x overall performance improvement on Apple Silicon
- Results should now be the same for different numbers of threads (not tested)

Files changed:
- ggml.c +0 -0
- ggml.h +82 -31
- whisper.cpp +78 -55
ggml.c
CHANGED

The diff for this file is too large to render. See raw diff.
ggml.h
CHANGED

@@ -236,6 +236,7 @@ enum ggml_op {
 
     GGML_OP_SCALE,
     GGML_OP_CPY,
+    GGML_OP_CONT,
     GGML_OP_RESHAPE,
     GGML_OP_VIEW,
     GGML_OP_PERMUTE,
@@ -253,16 +254,29 @@ enum ggml_op {
     GGML_OP_COUNT,
 };
 
+
+// ggml object
+struct ggml_object {
+    size_t offs;
+    size_t size;
+
+    struct ggml_object * next;
+
+    char padding[8];
+};
+
+static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
+
 // n-dimensional tensor
 struct ggml_tensor {
     enum ggml_type type;
 
     int n_dims;
-    int    ne[GGML_MAX_DIMS]; // number of elements
-    size_t nb[GGML_MAX_DIMS]; // stride in bytes:
-                              // nb[0] = sizeof(type)
-                              // nb[1] = nb[0]   * ne[0] + padding
-                              // nb[i] = nb[i-1] * ne[i-1]
+    int64_t ne[GGML_MAX_DIMS]; // number of elements
+    size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
+                               // nb[0] = sizeof(type)
+                               // nb[1] = nb[0]   * ne[0] + padding
+                               // nb[i] = nb[i-1] * ne[i-1]
 
     // compute data
     enum ggml_op op;
@@ -316,6 +330,7 @@ struct ggml_init_params {
     // memory pool
     size_t mem_size;   // bytes
     void * mem_buffer; // if NULL, memory will be allocated internally
+    bool   no_alloc;   // don't allocate memory for the tensor data
 };
 
 void ggml_time_init(void); // call this once at the beginning of the program
@@ -327,8 +342,8 @@ int64_t ggml_cycles_per_ms(void);
 void ggml_print_object (const struct ggml_object * obj);
 void ggml_print_objects(const struct ggml_context * ctx);
 
-int    ggml_nelements(const struct ggml_tensor * tensor);
-size_t ggml_nbytes   (const struct ggml_tensor * tensor);
+int64_t ggml_nelements(const struct ggml_tensor * tensor);
+size_t  ggml_nbytes   (const struct ggml_tensor * tensor);
 
 int    ggml_blck_size (enum ggml_type type);
 size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
@@ -343,40 +358,37 @@ size_t ggml_used_mem(const struct ggml_context * ctx);
 
 size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
 
-bool ggml_mlock_supported(void);
-bool ggml_mlock(struct ggml_context * ctx, char ** err_p);
-
 struct ggml_tensor * ggml_new_tensor(
         struct ggml_context * ctx,
         enum   ggml_type type,
         int    n_dims,
-        const int *ne);
+        const int64_t *ne);
 
 struct ggml_tensor * ggml_new_tensor_1d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0);
+        int64_t ne0);
 
 struct ggml_tensor * ggml_new_tensor_2d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0,
-        int    ne1);
+        int64_t ne0,
+        int64_t ne1);
 
 struct ggml_tensor * ggml_new_tensor_3d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0,
-        int    ne1,
-        int    ne2);
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2);
 
 struct ggml_tensor * ggml_new_tensor_4d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0,
-        int    ne1,
-        int    ne2,
-        int    ne3);
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2,
+        int64_t ne3);
 
 struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
 struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
@@ -514,6 +526,11 @@ struct ggml_tensor * ggml_cpy(
         struct ggml_tensor  * a,
         struct ggml_tensor  * b);
 
+// make contiguous
+struct ggml_tensor * ggml_cont(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a);
+
 // return view(a), b specifies the new shape
 // TODO: when we start computing gradient, make a copy instead of view
 struct ggml_tensor * ggml_reshape(
@@ -526,33 +543,43 @@ struct ggml_tensor * ggml_reshape(
 struct ggml_tensor * ggml_reshape_2d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int                   ne0,
-        int                   ne1);
+        int64_t               ne0,
+        int64_t               ne1);
 
 // return view(a)
 // TODO: when we start computing gradient, make a copy instead of view
 struct ggml_tensor * ggml_reshape_3d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int                   ne0,
-        int                   ne1,
-        int                   ne2);
+        int64_t               ne0,
+        int64_t               ne1,
+        int64_t               ne2);
 
 // offset in bytes
 struct ggml_tensor * ggml_view_1d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int                   ne0,
+        int64_t               ne0,
         size_t                offset);
 
 struct ggml_tensor * ggml_view_2d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int                   ne0,
-        int                   ne1,
+        int64_t               ne0,
+        int64_t               ne1,
         size_t                nb1, // row stride in bytes
         size_t                offset);
 
+struct ggml_tensor * ggml_view_3d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int64_t               ne0,
+        int64_t               ne1,
+        int64_t               ne2,
+        size_t                nb1, // row stride in bytes
+        size_t                nb2, // slice stride in bytes
+        size_t                offset);
+
 struct ggml_tensor * ggml_permute(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
@@ -748,8 +775,8 @@ enum ggml_opt_result ggml_opt(
 // quantization
 //
 
-size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k,
-size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k,
+size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
+size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
 
 //
 // system info
@@ -768,6 +795,30 @@ int ggml_cpu_has_blas(void);
 int ggml_cpu_has_sse3(void);
 int ggml_cpu_has_vsx(void);
 
+//
+// Internal types and functions exposed for tests and benchmarks
+//
+
+#ifdef  __cplusplus
+// restrict not standard in C++
+#define GGML_RESTRICT
+#else
+#define GGML_RESTRICT restrict
+#endif
+typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+typedef void (*quantize_row_q_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+typedef void (*vec_dot_q_t)(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
+
+typedef struct {
+    dequantize_row_q_t dequantize_row_q;
+    quantize_row_q_t   quantize_row_q;
+    quantize_row_q_t   quantize_row_q_reference;
+    vec_dot_q_t        vec_dot_q;
+} quantize_fns_t;
+
+quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
+
 #ifdef __cplusplus
 }
 #endif
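For reference, here is a minimal standalone sketch (not part of the commit) of how the updated header fits together: tensor shapes are now int64_t, ggml_view_3d takes explicit row/slice strides in bytes, and a non-contiguous result can be made contiguous with the new ggml_cont. The buffer size, tensor shapes, and the main() wrapper are arbitrary choices for the sketch; all API names come from the header above.

#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024, // arbitrary pool size for this sketch
        /*.mem_buffer =*/ NULL,         // let ggml allocate the pool itself
        /*.no_alloc   =*/ false,        // new field: do allocate tensor data
    };
    struct ggml_context * ctx = ggml_init(params);

    // dimensions are now 64-bit
    struct ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 8, 4, 2);

    // new ggml_view_3d: element counts plus row/slice strides and offset, all in bytes
    struct ggml_tensor * v = ggml_view_3d(ctx, a,
            4, 4, 2,
            a->nb[1],   // row stride of the parent tensor
            a->nb[2],   // slice stride of the parent tensor
            0);         // byte offset into the parent tensor

    // a permuted view is not contiguous; the new GGML_OP_CONT / ggml_cont makes a packed copy
    struct ggml_tensor * c = ggml_cont(ctx, ggml_permute(ctx, v, 1, 0, 2, 3));

    printf("%lld elements in the contiguous copy\n", (long long) ggml_nelements(c));

    ggml_free(ctx);
    return 0;
}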
whisper.cpp
CHANGED

@@ -654,9 +654,11 @@ static bool kv_cache_init(
         int   n_ctx) {
     cache.buf.resize(mem_bytes);
 
-    struct ggml_init_params params;
-    params.mem_size   = cache.buf.size();
-    params.mem_buffer = cache.buf.data();
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ cache.buf.size(),
+        /*.mem_buffer =*/ cache.buf.data(),
+        /*.no_alloc   =*/ false,
+    };
 
     cache.ctx = ggml_init(params);
 
@@ -688,9 +690,11 @@ static bool kv_cache_reinit(struct whisper_kv_cache & cache) {
 
     WHISPER_ASSERT(cache.buf.size() >= 2*n_elements*ggml_type_size(wtype));
 
-    struct ggml_init_params params;
-    params.mem_size   = cache.buf.size();
-    params.mem_buffer = cache.buf.data();
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ cache.buf.size(),
+        /*.mem_buffer =*/ cache.buf.data(),
+        /*.no_alloc   =*/ false,
+    };
 
     cache.ctx = ggml_init(params);
 
@@ -1028,9 +1032,11 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
 
     // create the ggml context
     {
-        struct ggml_init_params params;
-        params.mem_size   = wctx.model.buf->size();
-        params.mem_buffer = wctx.model.buf->data();
+        struct ggml_init_params params = {
+            /*.mem_size   =*/ wctx.model.buf->size(),
+            /*.mem_buffer =*/ wctx.model.buf->data(),
+            /*.no_alloc   =*/ false,
+        };
 
         model.ctx = ggml_init(params);
         if (!model.ctx) {
@@ -1254,10 +1260,12 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
                 break;
             }
 
-            int32_t nelements = 1;
-            int32_t ne[3] = { 1, 1, 1 };
+            int64_t nelements = 1;
+            int64_t ne[3] = { 1, 1, 1 };
             for (int i = 0; i < n_dims; ++i) {
-                read_safe(loader, ne[i]);
+                int32_t ne_cur;
+                read_safe(loader, ne_cur);
+                ne[i] = ne_cur;
                 nelements *= ne[i];
             }
 
@@ -1278,7 +1286,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
             }
 
             if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1] || tensor->ne[2] != ne[2]) {
-                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d, %d], expected [%d, %d, %d]\n",
+                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%lld, %lld, %lld], expected [%lld, %lld, %lld]\n",
                         __func__, name.data(), tensor->ne[0], tensor->ne[1], tensor->ne[2], ne[0], ne[1], ne[2]);
                 return false;
             }
@@ -1286,7 +1294,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
             const size_t bpe = (ftype == 0) ? sizeof(float) : sizeof(ggml_fp16_t);
 
             if (nelements*bpe != ggml_nbytes(tensor)) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
+                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %llu\n",
                         __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
                 return false;
             }
@@ -1344,9 +1352,11 @@ static bool whisper_encode_internal(
     const int n_mels = hparams.n_mels;
     assert(mel_inp.n_mel == n_mels);
 
-    struct ggml_init_params params;
-    params.mem_size   = wstate.buf_compute.size();
-    params.mem_buffer = wstate.buf_compute.data();
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ wstate.buf_compute.size(),
+        /*.mem_buffer =*/ wstate.buf_compute.data(),
+        /*.no_alloc   =*/ false,
+    };
 
    struct ggml_context * ctx0 = ggml_init(params);
 
@@ -1501,8 +1511,7 @@ static bool whisper_encode_internal(
                             Vcur,
                             n_state/n_head, n_head, n_ctx),
                         1, 2, 0, 3),
-                ggml_new_tensor_3d(ctx0, wctx.wtype, n_ctx, n_state/n_head, n_head)
-                );
+                ggml_new_tensor_3d(ctx0, wctx.wtype, n_ctx, n_state/n_head, n_head));
 
             struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, false);
 #else
@@ -1726,10 +1735,12 @@ static bool whisper_encode_internal(
 
             wstate.use_buf(ctx0, -1);
 
-
-
-            struct ggml_tensor* k = ggml_view_1d(ctx0, wstate.kv_cross.k, n_state*n_ctx, (ggml_element_size(wstate.kv_cross.k)*n_state)*(il*n_ctx));
-            struct ggml_tensor* v = ggml_view_1d(ctx0, wstate.kv_cross.v, n_state*n_ctx, (ggml_element_size(wstate.kv_cross.v)*n_state)*(il*n_ctx));
+            Vcross = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcross, n_state, n_ctx));
+
+            struct ggml_tensor * k = ggml_view_1d(ctx0, wstate.kv_cross.k, n_state*n_ctx, (ggml_element_size(wstate.kv_cross.k)*n_state)*(il*n_ctx));
+            struct ggml_tensor * v = ggml_view_2d(ctx0, wstate.kv_cross.v, n_ctx, n_state,
+                    (   n_ctx)*ggml_element_size(wstate.kv_cross.v),
+                    (il*n_ctx)*ggml_element_size(wstate.kv_cross.v)*n_state);
 
             ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcross, k));
             ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcross, v));
@@ -1797,9 +1808,11 @@ static bool whisper_decode_internal(
 
    //WHISPER_PRINT_DEBUG("%s: n_past = %d, N = %d, M = %d, n_ctx = %d\n", __func__, n_past, N, M, n_ctx);
 
-    struct ggml_init_params params;
-    params.mem_size   = wstate.buf_compute.size();
-    params.mem_buffer = wstate.buf_compute.data();
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ wstate.buf_compute.size(),
+        /*.mem_buffer =*/ wstate.buf_compute.data(),
+        /*.no_alloc   =*/ false,
+    };
 
    struct ggml_context * ctx0 = ggml_init(params);
 
@@ -1862,20 +1875,24 @@ static bool whisper_decode_internal(
 
             Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
 
-            struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
-                    layer.attn_v_w,
-                    cur);
-
-            Vcur = ggml_add(ctx0,
-                    ggml_repeat(ctx0,
-                        layer.attn_v_b,
-                        Vcur),
-                    Vcur);
-
             // store key and value to memory
             {
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
+                        layer.attn_v_w,
+                        cur);
+
+                Vcur = ggml_add(ctx0,
+                        ggml_repeat(ctx0,
+                            layer.attn_v_b,
+                            Vcur),
+                        Vcur);
+
+                Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_state, N));
+
                 struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_state, (ggml_element_size(kv_self.k)*n_state)*(il*n_ctx + n_past));
-                struct ggml_tensor * v = ggml_view_1d(ctx0, kv_self.v, N*n_state, (ggml_element_size(kv_self.v)*n_state)*(il*n_ctx + n_past));
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_state,
+                        (   n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_state + n_past*ggml_element_size(kv_self.v));
 
                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
@@ -1914,16 +1931,14 @@ static bool whisper_decode_internal(
 
             struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
 
-            struct ggml_tensor * V_trans =
-                ggml_cpy(ctx0,
-                    ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_state, il*n_ctx*ggml_element_size(kv_self.v)*n_state),
-                            n_state/n_head, n_head, n_past + N),
-                        1, 2, 0, 3),
-                    ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_state/n_head, n_head));
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_past + N, n_state/n_head, n_head,
+                        n_ctx*ggml_element_size(kv_self.v),
+                        n_ctx*ggml_element_size(kv_self.v)*n_state/n_head,
+                        il*n_ctx*ggml_element_size(kv_self.v)*n_state);
 
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
 
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
 
@@ -1986,15 +2001,22 @@ static bool whisper_decode_internal(
                     ggml_view_1d(ctx0, wstate.kv_cross.k, M*n_state, il*M*ggml_element_size(wstate.kv_cross.k)*n_state),
                     n_state/n_head, n_head, M);
 
-            struct ggml_tensor * Vcross =
-                ggml_reshape_3d(ctx0,
-                        ggml_view_1d(ctx0, wstate.kv_cross.v, M*n_state, il*M*ggml_element_size(wstate.kv_cross.v)*n_state),
-                        n_state/n_head, n_head, M);
+            //struct ggml_tensor * Vcross =
+            //    ggml_reshape_3d(ctx0,
+            //            ggml_view_1d(ctx0, wstate.kv_cross.v, M*n_state, il*M*ggml_element_size(wstate.kv_cross.v)*n_state),
+            //            n_state/n_head, n_head, M);
 
-            struct ggml_tensor * V_trans =
-                ggml_cpy(ctx0,
-                    ggml_permute(ctx0, Vcross, 1, 2, 0, 3),
-                    ggml_new_tensor_3d(ctx0, Vcross->type, M, n_state/n_head, n_head));
+            //struct ggml_tensor * V_trans =
+            //    ggml_cpy(ctx0,
+            //        ggml_permute(ctx0, Vcross, 1, 2, 0, 3),
+            //        ggml_new_tensor_3d(ctx0, Vcross->type, M, n_state/n_head, n_head));
+
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, wstate.kv_cross.v,
+                        M, n_state/n_head, n_head,
+                        M*ggml_element_size(wstate.kv_cross.v),
+                        M*ggml_element_size(wstate.kv_cross.v)*n_state/n_head,
+                        il*M*ggml_element_size(wstate.kv_cross.v)*n_state);
 
             // ------
 
@@ -2021,7 +2043,7 @@ static bool whisper_decode_internal(
 
             struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ);
 
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
 
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
 
@@ -4726,6 +4748,7 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
     struct ggml_init_params gparams = {
         /*.mem_size   =*/ buf.size(),
         /*.mem_buffer =*/ buf.data(),
+        /*.no_alloc   =*/ false,
     };
 
    struct ggml_context * ctx0 = ggml_init(gparams);
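The decoder-side change is the interesting one for performance: V is now written into the KV cache already transposed (ggml_reshape_2d plus ggml_transpose into a ggml_view_2d slot), so the attention step can read it back directly as a ggml_view_3d and feed it to ggml_mul_mat, instead of permuting and copying the cached values into a freshly allocated tensor on every decode step. Below is a commented restatement of the key lines; the code comes straight from the diff (names n_state, n_head, n_ctx, N, n_past, il as in whisper.cpp), while the comments are an interpretation of the layout, not part of the commit.

// store (per layer il, current batch of N tokens):
// Vcur is [n_state, N]; transposing it makes each of the n_state channels a row,
// so later tokens can be appended along that row inside the cache
Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_state, N));

// destination: an N x n_state window of kv_self.v with a row stride of a full
// n_ctx positions, starting at this layer's slot and skipping the n_past
// positions that are already stored
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_state,
        (   n_ctx)*ggml_element_size(kv_self.v),
        (il*n_ctx)*ggml_element_size(kv_self.v)*n_state + n_past*ggml_element_size(kv_self.v));

ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));

// load (attention): the cached values already have the layout attention needs,
// so a plain 3-D view exposes them as [n_past + N, n_state/n_head, n_head] and
// can be multiplied directly, without the ggml_permute + ggml_cpy the old code
// performed on every decode step
struct ggml_tensor * V =
    ggml_view_3d(ctx0, kv_self.v,
            n_past + N, n_state/n_head, n_head,
            n_ctx*ggml_element_size(kv_self.v),
            n_ctx*ggml_element_size(kv_self.v)*n_state/n_head,
            il*n_ctx*ggml_element_size(kv_self.v)*n_state);

struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);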