slaren OccamRazor committed
Commit b9b60de · 1 Parent(s): c3a7159

vulkan : reuse parent extra for views (llama/7806)


* vulkan : reuse parent extra for views

* Fix validation error when multiple compute contexts are used in a graph

---------

Co-authored-by: 0cc4m <[email protected]>
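
In plain terms: a view tensor no longer gets its own ggml_tensor_extra_gpu with the view offset folded into extra->offset; it now points at its parent's extra, and every use site adds tensor->view_offs on top of extra->offset when computing the buffer offset. The stand-alone sketch below illustrates just that pattern with simplified stand-in types (Extra, Tensor, init_tensor, buffer_offset are illustrations, not the real ggml structs or API):

#include <cassert>
#include <cstdint>
#include <cstdio>

// Illustration only: simplified stand-ins, not the real ggml_tensor / ggml_tensor_extra_gpu.
struct Extra {
    uint64_t offset; // where the parent allocation starts inside the device buffer
};

struct Tensor {
    Tensor * view_src;   // parent tensor if this is a view, nullptr otherwise
    uint64_t view_offs;  // byte offset of the view inside its parent
    Extra *  extra;      // backend-specific data attached to the tensor
};

// Mirrors the new init_tensor behaviour: a view aliases its parent's extra.
void init_tensor(Tensor & t, Extra * fresh_extra, uint64_t base_offset) {
    if (t.view_src != nullptr) {
        assert(t.view_src->extra != nullptr);
        t.extra = t.view_src->extra;       // reuse the parent extra, allocate nothing
    } else {
        fresh_extra->offset = base_offset; // non-views get their own extra
        t.extra = fresh_extra;
    }
}

// Mirrors the use sites in the diff: the effective offset is extra->offset + view_offs.
uint64_t buffer_offset(const Tensor & t) {
    return t.extra->offset + t.view_offs;
}

int main() {
    Extra parent_extra{};
    Tensor parent{nullptr, 0, nullptr};
    init_tensor(parent, &parent_extra, 4096);

    Tensor view{&parent, 256, nullptr};
    init_tensor(view, nullptr, 0);         // the view needs no extra of its own

    std::printf("parent at %llu, view at %llu\n",
                (unsigned long long) buffer_offset(parent),
                (unsigned long long) buffer_offset(view)); // prints 4096 and 4352
    return 0;
}

With views sharing the parent's extra, per-tensor state stored in the extra (such as the old ready flag) no longer has a single owner, which is consistent with its removal in the diff below.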

Files changed (1)
  1. ggml-vulkan.cpp +56 -72
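
The second part of the fix shows up in ggml_vk_build_graph below: ops that do no GPU work (reshape, view, permute, transpose, none) now return before a compute context is created or the exit tensor is set, which the commit message credits for curing the validation error when a graph ends up using more than one compute context. A minimal stand-alone sketch of that guard pattern follows; Backend, ComputeCtx, build_node and friends are hypothetical names for illustration, not the ggml-vulkan API:

#include <memory>
#include <vector>

// Illustration only: hypothetical stand-ins, not the real ggml-vulkan types.
enum class Op { None, Reshape, View, Permute, Transpose, MulMat, Add };

struct ComputeCtx {
    // command-buffer state would live here
};

struct Backend {
    std::vector<std::unique_ptr<ComputeCtx>> contexts;
    ComputeCtx * current = nullptr;

    // Lazily creates a compute context the first time real work is recorded.
    ComputeCtx * ensure_compute_ctx() {
        if (current == nullptr) {
            contexts.push_back(std::make_unique<ComputeCtx>());
            current = contexts.back().get();
        }
        return current;
    }
};

// Ops that only reinterpret existing data record no GPU work.
static bool is_noop(Op op) {
    return op == Op::None || op == Op::Reshape || op == Op::View ||
           op == Op::Permute || op == Op::Transpose;
}

void build_node(Backend & backend, Op op) {
    if (is_noop(op)) {
        return; // bail out before any compute context is created
    }
    ComputeCtx * ctx = backend.ensure_compute_ctx();
    (void) ctx; // real code would record dispatches into ctx here
}

int main() {
    Backend backend;
    build_node(backend, Op::Reshape); // creates nothing
    build_node(backend, Op::MulMat);  // first real op allocates the one context
    return backend.contexts.size() == 1 ? 0 : 1;
}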
ggml-vulkan.cpp CHANGED
@@ -345,15 +345,12 @@ struct vk_context {
 };
 
 struct ggml_tensor_extra_gpu {
-    bool ready;
-
     size_t ctx_idx;
 
     vk_buffer_ref buffer_gpu;
     uint64_t offset;
 
     void reset() {
-        ready = false;
         ctx_idx = 0;
         buffer_gpu.reset();
         offset = 0;
@@ -2949,7 +2946,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     GGML_ASSERT(d_D->size >= d_buf_offset + d_sz * ne02 * ne03);
     vk_buffer d_X;
@@ -2958,12 +2955,12 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
     uint64_t y_buf_offset = 0;
     if (!src0_uma) {
         d_Qx = extra_src0->buffer_gpu.lock();
-        qx_buf_offset = extra_src0->offset;
+        qx_buf_offset = extra_src0->offset + src0->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
     if (!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
+        qy_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Qy != nullptr);
     }
     if (qx_needs_dequant) {
@@ -3114,7 +3111,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_X;
     uint64_t x_buf_offset = 0;
@@ -3122,12 +3119,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
     uint64_t y_buf_offset = 0;
     if(!src0_uma) {
         d_Qx = extra_src0->buffer_gpu.lock();
-        qx_buf_offset = extra_src0->offset;
+        qx_buf_offset = extra_src0->offset + src0->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
     if(!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
+        qy_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Qy != nullptr);
     }
     if (qx_needs_dequant) {
@@ -3246,14 +3243,14 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
-    const uint64_t qx_buf_offset = extra_src0->offset;
+    const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs;
     GGML_ASSERT(d_Qx != nullptr);
     if (!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
+        qy_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
 
@@ -3323,14 +3320,14 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
-    const uint64_t qx_buf_offset = extra_src0->offset;
+    const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs;
     GGML_ASSERT(d_Qx != nullptr);
     if (!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
+        qy_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
 
@@ -3459,7 +3456,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_X;
     uint64_t x_buf_offset = 0;
@@ -3467,17 +3464,17 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
     uint64_t y_buf_offset = 0;
     if (!src0_uma) {
         d_Qx = extra_src0->buffer_gpu.lock();
-        qx_buf_offset = extra_src0->offset;
+        qx_buf_offset = extra_src0->offset + src0->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
     if (!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
+        qy_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Qy != nullptr);
     }
     if (!ids_uma) {
         d_ids = extra_ids->buffer_gpu.lock();
-        ids_buf_offset = extra_ids->offset;
+        ids_buf_offset = extra_ids->offset + ids->view_offs;
         GGML_ASSERT(d_ids != nullptr);
     }
     if (qx_needs_dequant) {
@@ -3636,7 +3633,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_X;
     uint64_t x_buf_offset = 0;
@@ -3644,17 +3641,17 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
     uint64_t y_buf_offset = 0;
     if(!src0_uma) {
         d_Qx = extra_src0->buffer_gpu.lock();
-        qx_buf_offset = extra_src0->offset;
+        qx_buf_offset = extra_src0->offset + src0->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
     if(!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
+        qy_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Qy != nullptr);
     }
     if(!ids_uma) {
         d_ids = extra_ids->buffer_gpu.lock();
-        ids_buf_offset = extra_ids->offset;
+        ids_buf_offset = extra_ids->offset + ids->view_offs;
         GGML_ASSERT(d_ids != nullptr);
     }
     if (qx_needs_dequant) {
@@ -3769,9 +3766,9 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
     ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
 
     const vk_buffer src_buf = extra_src0->buffer_gpu.lock();
-    const uint64_t src_offset = extra_src0->offset;
+    const uint64_t src_offset = extra_src0->offset + src0->view_offs;
     vk_buffer dst_buf = extra->buffer_gpu.lock();
-    const uint64_t dst_offset = extra->offset;
+    const uint64_t dst_offset = extra->offset + dst->view_offs;
 
     std::vector<vk::BufferCopy> copies;
 
@@ -4062,21 +4059,21 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     }
 
     GGML_ASSERT(d_D != nullptr);
-    uint64_t d_buf_offset = (extra->offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
+    uint64_t d_buf_offset = ((extra->offset + dst->view_offs) / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
     GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY); // NOLINT
     if(!src0_uma) {
         d_X = extra_src0->buffer_gpu.lock();
-        x_buf_offset = extra_src0->offset;
+        x_buf_offset = extra_src0->offset + src0->view_offs;
         GGML_ASSERT(d_X != nullptr);
     }
     if (use_src1 && !src1_uma) {
         d_Y = extra_src1->buffer_gpu.lock();
-        y_buf_offset = extra_src1->offset;
+        y_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Y != nullptr);
     }
     if (use_src2 && !src2_uma) {
         d_Z = extra_src2->buffer_gpu.lock();
-        z_buf_offset = extra_src2->offset;
+        z_buf_offset = extra_src2->offset + src2->view_offs;
         GGML_ASSERT(d_Z != nullptr);
     }
 
@@ -4336,7 +4333,7 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, cons
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
     const uint32_t src0_type_size = ggml_type_size(src0->type);
     const uint32_t dst_type_size = ggml_type_size(dst->type);
-    const uint32_t d_offset = (extra->offset % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
+    const uint32_t d_offset = ((extra->offset + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
 
     ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
         (uint32_t)ggml_nelements(src0),
@@ -5569,6 +5566,13 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     const ggml_tensor * src2 = node->src[2];
 
     switch (node->op) {
+    // Return on empty ops to avoid generating a compute_ctx and setting exit_tensor
+    case GGML_OP_RESHAPE:
+    case GGML_OP_VIEW:
+    case GGML_OP_PERMUTE:
+    case GGML_OP_TRANSPOSE:
+    case GGML_OP_NONE:
+        return;
     case GGML_OP_UNARY:
         switch (ggml_get_unary_op(node)) {
         case GGML_UNARY_OP_SILU:
@@ -5590,10 +5594,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_CPY:
     case GGML_OP_CONT:
     case GGML_OP_DUP:
-    case GGML_OP_RESHAPE:
-    case GGML_OP_VIEW:
-    case GGML_OP_PERMUTE:
-    case GGML_OP_TRANSPOSE:
     case GGML_OP_NORM:
     case GGML_OP_RMS_NORM:
     case GGML_OP_DIAG_MASK_INF:
@@ -5601,7 +5601,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_ROPE:
     case GGML_OP_MUL_MAT:
     case GGML_OP_MUL_MAT_ID:
-    case GGML_OP_NONE:
     case GGML_OP_ARGSORT:
     case GGML_OP_SUM_ROWS:
         break;
@@ -5654,12 +5653,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_DUP:
         ggml_vk_cpy(ctx, ctx->compute_ctx, src0, node);
 
-        break;
-    case GGML_OP_RESHAPE:
-    case GGML_OP_VIEW:
-    case GGML_OP_PERMUTE:
-    case GGML_OP_TRANSPOSE:
-    case GGML_OP_NONE:
         break;
     case GGML_OP_NORM:
         ggml_vk_norm(ctx, ctx->compute_ctx, src0, node);
@@ -5712,7 +5705,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
         return;
     }
 
-    extra->ready = true;
     extra->ctx_idx = ctx->compute_ctx->idx;
 
 #ifdef GGML_VULKAN_CHECK_RESULTS
@@ -5796,8 +5788,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
     ggml_vk_check_results_0(ctx, params, tensor);
 #endif
 
-    GGML_ASSERT(extra->ready);
-
     vk_context& subctx = ctx->gc.contexts[extra->ctx_idx];
 
     // Only run if ctx hasn't been submitted yet
@@ -5822,8 +5812,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
         subctx.out_memcpys.clear();
     }
 
-    extra->ready = false;
-
     return true;
 }
 
@@ -5943,7 +5931,9 @@ struct ggml_backend_vk_buffer_context {
 
     ~ggml_backend_vk_buffer_context() {
         ggml_vk_destroy_buffer(dev_buffer);
-        delete[] temp_tensor_extras;
+        if (temp_tensor_extras != nullptr) {
+            delete[] temp_tensor_extras;
+        }
     }
 
    ggml_tensor_extra_gpu * ggml_vk_alloc_temp_tensor_extra() {
@@ -5990,18 +5980,16 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b
 #endif
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
 
-    ggml_tensor_extra_gpu * extra = ctx->ggml_vk_alloc_temp_tensor_extra();
-    if (tensor->view_src != nullptr && tensor->view_src->extra != nullptr) {
+    if (tensor->view_src != nullptr) {
        GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
-        ggml_tensor_extra_gpu * extra_view = (ggml_tensor_extra_gpu *) tensor->view_src->extra;
-        extra->buffer_gpu = extra_view->buffer_gpu;
-        extra->offset = extra_view->offset + tensor->view_offs;
+        GGML_ASSERT(tensor->view_src->extra != nullptr);
+        tensor->extra = tensor->view_src->extra;
     } else {
+        ggml_tensor_extra_gpu * extra = ctx->ggml_vk_alloc_temp_tensor_extra();
        extra->buffer_gpu = ctx->dev_buffer;
        extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
+        tensor->extra = extra;
     }
-
-    tensor->extra = extra;
 }
 
 GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
@@ -6014,7 +6002,7 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu
 
     vk_buffer buf = extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_write(ctx->ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_write(ctx->ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }
 
 GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@@ -6027,7 +6015,7 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu
 
     vk_buffer buf = extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_read(ctx->ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_read(ctx->ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }
 
 GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
@@ -6038,7 +6026,7 @@ GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t bu
     vk_buffer src_buf = src_extra->buffer_gpu.lock();
     vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_copy(dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src));
+    ggml_vk_buffer_copy(dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src));
 
     return true;
 }
@@ -6264,7 +6252,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
 
     vk_buffer buf = extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_write_async(ctx, ctx->transfer_ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_write_async(ctx, ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }
 
 GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@@ -6284,7 +6272,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
 
     vk_buffer buf = extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_read_async(ctx, ctx->transfer_ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_read_async(ctx, ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }
 
 GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
@@ -6305,7 +6293,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
     vk_buffer src_buf = src_extra->buffer_gpu.lock();
     vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src));
+    ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src));
     return true;
 }
 
@@ -6478,11 +6466,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
         // return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
         // } break;
         case GGML_OP_ROPE:
-            {
-                const int mode = ((const int32_t *) op->op_params)[2];
-
-                return true;
-            } break;
+            return true;
         case GGML_OP_NONE:
         case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
@@ -6725,7 +6709,7 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
         ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
         vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset, tensor_data, tensor_size);
+        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
     }
 
     std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
@@ -6809,7 +6793,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         } else if (ggml_backend_buffer_is_vk(src0->buffer)) {
             ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
             vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-            uint64_t offset = extra->offset;
+            uint64_t offset = extra->offset + src0->view_offs;
             if (!ggml_is_contiguous(src0) && ggml_vk_dim01_contiguous(src0)) {
                 for (int i3 = 0; i3 < src0->ne[3]; i3++) {
                     for (int i2 = 0; i2 < src0->ne[2]; i2++) {
@@ -6851,7 +6835,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         } else if (ggml_backend_buffer_is_vk(src1->buffer)) {
             ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
             vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-            uint64_t offset = extra->offset;
+            uint64_t offset = extra->offset + src1->view_offs;
             if (!ggml_is_contiguous(src1) && ggml_vk_dim01_contiguous(src1)) {
                 for (int i3 = 0; i3 < src1->ne[3]; i3++) {
                     for (int i2 = 0; i2 < src1->ne[2]; i2++) {
@@ -6909,7 +6893,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         } else if (ggml_backend_buffer_is_vk(src2->buffer)) {
             ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src2->extra;
             vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-            uint64_t offset = extra->offset;
+            uint64_t offset = extra->offset + src2->view_offs;
             if (!ggml_is_contiguous(src2) && ggml_vk_dim01_contiguous(src2)) {
                 for (int i3 = 0; i3 < src2->ne[3]; i3++) {
                     for (int i2 = 0; i2 < src2->ne[2]; i2++) {
@@ -7092,11 +7076,11 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
         ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
         vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-        if (extra->offset + tensor_size >= buffer_gpu->size) {
-            tensor_size = buffer_gpu->size - (extra->offset);
+        if (extra->offset + tensor->view_offs + tensor_size >= buffer_gpu->size) {
+            tensor_size = buffer_gpu->size - (extra->offset + tensor->view_offs);
         }
 
-        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset, tensor_data, tensor_size);
+        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
     }
 
     float first_error_result = -1.0f;