ggerganov committed on
Commit
58507b9
·
unverified ·
1 Parent(s): 91ed6bd

sync : ggml (ggml-alloc + linker + gguf fixes) (#1501)

Browse files
Files changed (4) hide show
  1. ggml-alloc.c +12 -11
  2. ggml-quants.c +5 -0
  3. ggml.c +83 -258
  4. ggml.h +5 -0
ggml-alloc.c CHANGED
@@ -446,12 +446,14 @@ static ggml_tallocr_t node_tallocr(ggml_gallocr_t galloc, struct ggml_tensor * n
446
  return galloc->hash_allocs[ggml_hash_find_or_insert(galloc->hash_set, node)];
447
  }
448
 
449
- static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view) {
450
  ggml_tallocr_t alloc = node_tallocr(galloc, view);
451
 
452
  //printf("init_view: %s from src %s\n", view->name, view->view_src->name);
453
  GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL);
454
- view->backend = view->view_src->backend;
 
 
455
  view->buffer = view->view_src->buffer;
456
  view->data = (char *)view->view_src->data + view->view_offs;
457
 
@@ -469,7 +471,7 @@ static void allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
469
 
470
  if (node->data == NULL) {
471
  if (ggml_is_view(node)) {
472
- init_view(galloc, node);
473
  } else {
474
  // see if we can reuse a parent's buffer (inplace)
475
  if (ggml_op_can_inplace(node->op)) {
@@ -499,15 +501,14 @@ static void allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
499
  AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
500
  node->view_src = view_src;
501
  view_src_hn->n_views += 1;
502
- init_view(galloc, node);
503
  return;
504
  }
505
- }
506
- else {
507
  AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
508
  node->view_src = parent;
509
  p_hn->n_views += 1;
510
- init_view(galloc, node);
511
  return;
512
  }
513
  }
@@ -537,7 +538,7 @@ static void ggml_tallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
537
  hash_get(galloc, view_src)->n_views += 1;
538
  if (node->buffer == NULL && node->data != NULL) {
539
  // view of a pre-allocated tensor, didn't call init_view() yet
540
- init_view(galloc, node);
541
  }
542
  }
543
 
@@ -548,7 +549,7 @@ static void ggml_tallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
548
  }
549
  hash_get(galloc, parent)->n_children += 1;
550
  if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
551
- init_view(galloc, parent);
552
  }
553
  }
554
  }
@@ -663,7 +664,7 @@ size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, st
663
  return max_size;
664
  }
665
 
666
- void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, struct ggml_hash_set hash_set, ggml_tallocr_t * hash_node_alloct) {
667
  const size_t hash_size = hash_set.size;
668
 
669
  GGML_ASSERT(hash_size >= (size_t)(graph->n_nodes + graph->n_leafs));
@@ -686,7 +687,7 @@ void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * grap
686
  // reset hash values
687
  memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size);
688
 
689
- galloc->hash_allocs = hash_node_alloct;
690
 
691
  ggml_tallocr_alloc_graph_impl(galloc, graph);
692
 
 
446
  return galloc->hash_allocs[ggml_hash_find_or_insert(galloc->hash_set, node)];
447
  }
448
 
449
+ static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool update_backend) {
450
  ggml_tallocr_t alloc = node_tallocr(galloc, view);
451
 
452
  //printf("init_view: %s from src %s\n", view->name, view->view_src->name);
453
  GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL);
454
+ if (update_backend) {
455
+ view->backend = view->view_src->backend;
456
+ }
457
  view->buffer = view->view_src->buffer;
458
  view->data = (char *)view->view_src->data + view->view_offs;
459
 
 
471
 
472
  if (node->data == NULL) {
473
  if (ggml_is_view(node)) {
474
+ init_view(galloc, node, true);
475
  } else {
476
  // see if we can reuse a parent's buffer (inplace)
477
  if (ggml_op_can_inplace(node->op)) {
 
501
  AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
502
  node->view_src = view_src;
503
  view_src_hn->n_views += 1;
504
+ init_view(galloc, node, false);
505
  return;
506
  }
507
+ } else {
 
508
  AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
509
  node->view_src = parent;
510
  p_hn->n_views += 1;
511
+ init_view(galloc, node, false);
512
  return;
513
  }
514
  }
 
538
  hash_get(galloc, view_src)->n_views += 1;
539
  if (node->buffer == NULL && node->data != NULL) {
540
  // view of a pre-allocated tensor, didn't call init_view() yet
541
+ init_view(galloc, node, true);
542
  }
543
  }
544
 
 
549
  }
550
  hash_get(galloc, parent)->n_children += 1;
551
  if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
552
+ init_view(galloc, parent, true);
553
  }
554
  }
555
  }
 
664
  return max_size;
665
  }
666
 
667
+ void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, struct ggml_hash_set hash_set, ggml_tallocr_t * hash_node_talloc) {
668
  const size_t hash_size = hash_set.size;
669
 
670
  GGML_ASSERT(hash_size >= (size_t)(graph->n_nodes + graph->n_leafs));
 
687
  // reset hash values
688
  memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size);
689
 
690
+ galloc->hash_allocs = hash_node_talloc;
691
 
692
  ggml_tallocr_alloc_graph_impl(galloc, graph);
693
 
ggml-quants.c CHANGED
@@ -1368,7 +1368,12 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f
1368
  float max = x[0];
1369
  float sum_w = weights[0];
1370
  float sum_x = sum_w * x[0];
 
 
 
 
1371
  for (int i = 1; i < n; ++i) {
 
1372
  if (x[i] < min) min = x[i];
1373
  if (x[i] > max) max = x[i];
1374
  float w = weights[i];
 
1368
  float max = x[0];
1369
  float sum_w = weights[0];
1370
  float sum_x = sum_w * x[0];
1371
+ #ifdef HAVE_BUGGY_APPLE_LINKER
1372
+ // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
1373
+ for (volatile int i = 1; i < n; ++i) {
1374
+ #else
1375
  for (int i = 1; i < n; ++i) {
1376
+ #endif
1377
  if (x[i] < min) min = x[i];
1378
  if (x[i] > max) max = x[i];
1379
  float w = weights[i];
ggml.c CHANGED
@@ -5024,8 +5024,13 @@ struct ggml_tensor * ggml_rope_back(
5024
  int n_dims,
5025
  int mode,
5026
  int n_ctx,
 
5027
  float freq_base,
5028
  float freq_scale,
 
 
 
 
5029
  float xpos_base,
5030
  bool xpos_down) {
5031
  GGML_ASSERT(ggml_is_vector(b));
@@ -5042,11 +5047,15 @@ struct ggml_tensor * ggml_rope_back(
5042
 
5043
  struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
5044
 
5045
- int32_t params[8] = { /*n_past*/ 0, n_dims, mode, n_ctx };
5046
- memcpy(params + 4, &freq_base, sizeof(float));
5047
- memcpy(params + 5, &freq_scale, sizeof(float));
5048
- memcpy(params + 6, &xpos_base, sizeof(float));
5049
- memcpy(params + 7, &xpos_down, sizeof(bool));
 
 
 
 
5050
  ggml_set_op_params(result, params, sizeof(params));
5051
 
5052
  result->op = GGML_OP_ROPE_BACK;
@@ -9376,7 +9385,6 @@ static bool ggml_compute_forward_mul_mat_use_blas(
9376
  }
9377
  #endif
9378
 
9379
-
9380
  static void ggml_compute_forward_mul_mat(
9381
  const struct ggml_compute_params * params,
9382
  const struct ggml_tensor * src0,
@@ -10946,7 +10954,8 @@ static void ggml_compute_forward_rope_f32(
10946
  const struct ggml_compute_params * params,
10947
  const struct ggml_tensor * src0,
10948
  const struct ggml_tensor * src1,
10949
- struct ggml_tensor * dst) {
 
10950
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
10951
  return;
10952
  }
@@ -11005,6 +11014,11 @@ static void ggml_compute_forward_rope_f32(
11005
  const bool is_neox = mode & 2;
11006
  const bool is_glm = mode & 4;
11007
 
 
 
 
 
 
11008
  const int32_t * pos = (const int32_t *) src1->data;
11009
 
11010
  for (int64_t i3 = 0; i3 < ne3; i3++) {
@@ -11021,9 +11035,9 @@ static void ggml_compute_forward_rope_f32(
11021
  float block_theta = MAX(p - (n_ctx - 2), 0);
11022
  for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
11023
  const float cos_theta = cosf(theta_base);
11024
- const float sin_theta = sinf(theta_base);
11025
  const float cos_block_theta = cosf(block_theta);
11026
- const float sin_block_theta = sinf(block_theta);
11027
 
11028
  theta_base *= theta_scale;
11029
  block_theta *= theta_scale;
@@ -11047,6 +11061,7 @@ static void ggml_compute_forward_rope_f32(
11047
  rope_yarn(
11048
  theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
11049
  );
 
11050
 
11051
  // zeta scaling for xPos only:
11052
  float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
@@ -11077,6 +11092,7 @@ static void ggml_compute_forward_rope_f32(
11077
  theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
11078
  &cos_theta, &sin_theta
11079
  );
 
11080
 
11081
  theta_base *= theta_scale;
11082
 
@@ -11102,7 +11118,8 @@ static void ggml_compute_forward_rope_f16(
11102
  const struct ggml_compute_params * params,
11103
  const struct ggml_tensor * src0,
11104
  const struct ggml_tensor * src1,
11105
- struct ggml_tensor * dst) {
 
11106
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11107
  return;
11108
  }
@@ -11154,6 +11171,11 @@ static void ggml_compute_forward_rope_f16(
11154
  const bool is_neox = mode & 2;
11155
  const bool is_glm = mode & 4;
11156
 
 
 
 
 
 
11157
  const int32_t * pos = (const int32_t *) src1->data;
11158
 
11159
  for (int64_t i3 = 0; i3 < ne3; i3++) {
@@ -11170,9 +11192,9 @@ static void ggml_compute_forward_rope_f16(
11170
  float block_theta = MAX(p - (n_ctx - 2), 0);
11171
  for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
11172
  const float cos_theta = cosf(theta_base);
11173
- const float sin_theta = sinf(theta_base);
11174
  const float cos_block_theta = cosf(block_theta);
11175
- const float sin_block_theta = sinf(block_theta);
11176
 
11177
  theta_base *= theta_scale;
11178
  block_theta *= theta_scale;
@@ -11196,6 +11218,7 @@ static void ggml_compute_forward_rope_f16(
11196
  rope_yarn(
11197
  theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
11198
  );
 
11199
 
11200
  theta_base *= theta_scale;
11201
 
@@ -11222,6 +11245,7 @@ static void ggml_compute_forward_rope_f16(
11222
  theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
11223
  &cos_theta, &sin_theta
11224
  );
 
11225
 
11226
  theta_base *= theta_scale;
11227
 
@@ -11251,11 +11275,11 @@ static void ggml_compute_forward_rope(
11251
  switch (src0->type) {
11252
  case GGML_TYPE_F16:
11253
  {
11254
- ggml_compute_forward_rope_f16(params, src0, src1, dst);
11255
  } break;
11256
  case GGML_TYPE_F32:
11257
  {
11258
- ggml_compute_forward_rope_f32(params, src0, src1, dst);
11259
  } break;
11260
  default:
11261
  {
@@ -11266,216 +11290,6 @@ static void ggml_compute_forward_rope(
11266
 
11267
  // ggml_compute_forward_rope_back
11268
 
11269
- static void ggml_compute_forward_rope_back_f32(
11270
- const struct ggml_compute_params * params,
11271
- const struct ggml_tensor * src0,
11272
- const struct ggml_tensor * src1,
11273
- struct ggml_tensor * dst) {
11274
-
11275
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11276
- return;
11277
- }
11278
-
11279
- // y = rope(x, src1)
11280
- // dx = rope_back(dy, src1)
11281
- // src0 is dy, src1 contains options
11282
-
11283
- float freq_base;
11284
- float freq_scale;
11285
-
11286
- // these two only relevant for xPos RoPE:
11287
- float xpos_base;
11288
- bool xpos_down;
11289
-
11290
- //const int n_past = ((int32_t *) dst->op_params)[0];
11291
- const int n_dims = ((int32_t *) dst->op_params)[1];
11292
- const int mode = ((int32_t *) dst->op_params)[2];
11293
- const int n_ctx = ((int32_t *) dst->op_params)[3]; UNUSED(n_ctx);
11294
- memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
11295
- memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
11296
- memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float));
11297
- memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool));
11298
-
11299
- GGML_TENSOR_UNARY_OP_LOCALS
11300
-
11301
- //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
11302
- //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
11303
-
11304
- assert(nb0 == sizeof(float));
11305
-
11306
- const int ith = params->ith;
11307
- const int nth = params->nth;
11308
-
11309
- const int nr = ggml_nrows(dst);
11310
-
11311
- // rows per thread
11312
- const int dr = (nr + nth - 1)/nth;
11313
-
11314
- // row range for this thread
11315
- const int ir0 = dr*ith;
11316
- const int ir1 = MIN(ir0 + dr, nr);
11317
-
11318
- // row index used to determine which thread to use
11319
- int ir = 0;
11320
-
11321
- const float theta_scale = powf(freq_base, -2.0f/n_dims);
11322
-
11323
- const bool is_neox = mode & 2;
11324
-
11325
- const int32_t * pos = (const int32_t *) src1->data;
11326
-
11327
- for (int64_t i3 = 0; i3 < ne3; i3++) {
11328
- for (int64_t i2 = 0; i2 < ne2; i2++) {
11329
- const int64_t p = pos[i2];
11330
- for (int64_t i1 = 0; i1 < ne1; i1++) {
11331
- if (ir++ < ir0) continue;
11332
- if (ir > ir1) break;
11333
-
11334
- float theta_base = freq_scale * (float)p;
11335
-
11336
- if (!is_neox) {
11337
- for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
11338
- const float cos_theta = cosf(theta_base);
11339
- const float sin_theta = sinf(theta_base);
11340
-
11341
- // zeta scaling for xPos only:
11342
- float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
11343
- if (xpos_down) zeta = 1.0f / zeta;
11344
-
11345
- theta_base *= theta_scale;
11346
-
11347
- const float * const dy = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
11348
- float * dx = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
11349
-
11350
- const float dy0 = dy[0];
11351
- const float dy1 = dy[1];
11352
-
11353
- dx[0] = dy0*cos_theta*zeta + dy1*sin_theta*zeta;
11354
- dx[1] = - dy0*sin_theta*zeta + dy1*cos_theta*zeta;
11355
- }
11356
- } else {
11357
- for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
11358
- for (int64_t ic = 0; ic < n_dims; ic += 2) {
11359
- const float cos_theta = cosf(theta_base);
11360
- const float sin_theta = sinf(theta_base);
11361
-
11362
- theta_base *= theta_scale;
11363
-
11364
- const int64_t i0 = ib*n_dims + ic/2;
11365
-
11366
- const float * const dy = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
11367
- float * dx = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
11368
-
11369
- const float dy0 = dy[0];
11370
- const float dy1 = dy[n_dims/2];
11371
-
11372
- dx[0] = dy0*cos_theta + dy1*sin_theta;
11373
- dx[n_dims/2] = - dy0*sin_theta + dy1*cos_theta;
11374
- }
11375
- }
11376
- }
11377
- }
11378
- }
11379
- }
11380
- }
11381
-
11382
- static void ggml_compute_forward_rope_back_f16(
11383
- const struct ggml_compute_params * params,
11384
- const struct ggml_tensor * src0,
11385
- const struct ggml_tensor * src1,
11386
- struct ggml_tensor * dst) {
11387
-
11388
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11389
- return;
11390
- }
11391
-
11392
- // y = rope(x, src1)
11393
- // dx = rope_back(dy, src1)
11394
- // src0 is dy, src1 contains options
11395
-
11396
- //const int n_past = ((int32_t *) dst->op_params)[0];
11397
- const int n_dims = ((int32_t *) dst->op_params)[1];
11398
- const int mode = ((int32_t *) dst->op_params)[2];
11399
-
11400
- GGML_TENSOR_UNARY_OP_LOCALS
11401
-
11402
- //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
11403
- //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
11404
-
11405
- assert(nb0 == sizeof(ggml_fp16_t));
11406
-
11407
- const int ith = params->ith;
11408
- const int nth = params->nth;
11409
-
11410
- const int nr = ggml_nrows(dst);
11411
-
11412
- // rows per thread
11413
- const int dr = (nr + nth - 1)/nth;
11414
-
11415
- // row range for this thread
11416
- const int ir0 = dr*ith;
11417
- const int ir1 = MIN(ir0 + dr, nr);
11418
-
11419
- // row index used to determine which thread to use
11420
- int ir = 0;
11421
-
11422
- const float theta_scale = powf(10000.0, -2.0f/n_dims);
11423
-
11424
- const bool is_neox = mode & 2;
11425
-
11426
- const int32_t * pos = (const int32_t *) src1->data;
11427
-
11428
- for (int64_t i3 = 0; i3 < ne3; i3++) {
11429
- for (int64_t i2 = 0; i2 < ne2; i2++) {
11430
- const int64_t p = pos[i2];
11431
- for (int64_t i1 = 0; i1 < ne1; i1++) {
11432
- if (ir++ < ir0) continue;
11433
- if (ir > ir1) break;
11434
-
11435
- float theta_base = (float)p;
11436
-
11437
- if (!is_neox) {
11438
- for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
11439
- const float cos_theta = cosf(theta_base);
11440
- const float sin_theta = sinf(theta_base);
11441
-
11442
- theta_base *= theta_scale;
11443
-
11444
- const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
11445
- ggml_fp16_t * dx = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
11446
-
11447
- const float dy0 = GGML_FP16_TO_FP32(dy[0]);
11448
- const float dy1 = GGML_FP16_TO_FP32(dy[1]);
11449
-
11450
- dx[0] = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta);
11451
- dx[1] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta);
11452
- }
11453
- } else {
11454
- for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
11455
- for (int64_t ic = 0; ic < n_dims; ic += 2) {
11456
- const float cos_theta = cosf(theta_base);
11457
- const float sin_theta = sinf(theta_base);
11458
-
11459
- theta_base *= theta_scale;
11460
-
11461
- const int64_t i0 = ib*n_dims + ic/2;
11462
-
11463
- const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
11464
- ggml_fp16_t * dx = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
11465
-
11466
- const float dy0 = GGML_FP16_TO_FP32(dy[0]);
11467
- const float dy1 = GGML_FP16_TO_FP32(dy[n_dims/2]);
11468
-
11469
- dx[0] = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta);
11470
- dx[n_dims/2] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta);
11471
- }
11472
- }
11473
- }
11474
- }
11475
- }
11476
- }
11477
- }
11478
-
11479
  static void ggml_compute_forward_rope_back(
11480
  const struct ggml_compute_params * params,
11481
  const struct ggml_tensor * src0,
@@ -11484,11 +11298,11 @@ static void ggml_compute_forward_rope_back(
11484
  switch (src0->type) {
11485
  case GGML_TYPE_F16:
11486
  {
11487
- ggml_compute_forward_rope_back_f16(params, src0, src1, dst);
11488
  } break;
11489
  case GGML_TYPE_F32:
11490
  {
11491
- ggml_compute_forward_rope_back_f32(params, src0, src1, dst);
11492
  } break;
11493
  default:
11494
  {
@@ -14923,17 +14737,20 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
14923
  // necessary for llama
14924
  if (src0->grad) {
14925
  //const int n_past = ((int32_t *) tensor->op_params)[0];
14926
- const int n_dims = ((int32_t *) tensor->op_params)[1];
14927
- const int mode = ((int32_t *) tensor->op_params)[2];
14928
- const int n_ctx = ((int32_t *) tensor->op_params)[3];
14929
- float freq_base;
14930
- float freq_scale;
14931
- float xpos_base;
14932
- bool xpos_down;
14933
- memcpy(&freq_base, (int32_t *) tensor->op_params + 4, sizeof(float));
14934
- memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
14935
- memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float));
14936
- memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool));
 
 
 
14937
 
14938
  src0->grad = ggml_add_or_set(ctx,
14939
  src0->grad,
@@ -14943,8 +14760,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
14943
  n_dims,
14944
  mode,
14945
  n_ctx,
 
14946
  freq_base,
14947
  freq_scale,
 
 
 
 
14948
  xpos_base,
14949
  xpos_down),
14950
  zero_table);
@@ -14954,17 +14776,20 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
14954
  {
14955
  if (src0->grad) {
14956
  //const int n_past = ((int32_t *) tensor->op_params)[0];
14957
- const int n_dims = ((int32_t *) tensor->op_params)[1];
14958
- const int mode = ((int32_t *) tensor->op_params)[2];
14959
- const int n_ctx = ((int32_t *) tensor->op_params)[3];
14960
- float freq_base;
14961
- float freq_scale;
14962
- float xpos_base;
14963
- bool xpos_down;
14964
- memcpy(&freq_base, (int32_t *) tensor->op_params + 4, sizeof(float));
14965
- memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
14966
- memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float));
14967
- memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool));
 
 
 
14968
 
14969
  src0->grad = ggml_add_or_set(ctx,
14970
  src0->grad,
@@ -14973,14 +14798,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
14973
  src1,
14974
  n_dims,
14975
  mode,
14976
- 0,
14977
  n_ctx,
 
14978
  freq_base,
14979
  freq_scale,
14980
- 0.0f,
14981
- 1.0f,
14982
- 0.0f,
14983
- 0.0f,
14984
  xpos_base,
14985
  xpos_down,
14986
  false),
@@ -18248,7 +18073,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
18248
  {
18249
  ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
18250
 
18251
- for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
18252
  struct gguf_kv * kv = &ctx->kv[i];
18253
 
18254
  //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
@@ -18295,7 +18120,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
18295
  case GGUF_TYPE_STRING:
18296
  {
18297
  kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
18298
- for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
18299
  ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
18300
  }
18301
  } break;
@@ -18323,7 +18148,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
18323
  {
18324
  ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
18325
 
18326
- for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
18327
  struct gguf_tensor_info * info = &ctx->infos[i];
18328
 
18329
  for (int j = 0; j < GGML_MAX_DIMS; ++j) {
@@ -18370,7 +18195,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
18370
  // compute the total size of the data section, taking into account the alignment
18371
  {
18372
  ctx->size = 0;
18373
- for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
18374
  struct gguf_tensor_info * info = &ctx->infos[i];
18375
 
18376
  const int64_t ne =
@@ -18439,7 +18264,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
18439
  ggml_set_no_alloc(ctx_data, true);
18440
 
18441
  // create the tensors
18442
- for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
18443
  const int64_t ne[GGML_MAX_DIMS] = {
18444
  ctx->infos[i].ne[0],
18445
  ctx->infos[i].ne[1],
 
5024
  int n_dims,
5025
  int mode,
5026
  int n_ctx,
5027
+ int n_orig_ctx,
5028
  float freq_base,
5029
  float freq_scale,
5030
+ float ext_factor,
5031
+ float attn_factor,
5032
+ float beta_fast,
5033
+ float beta_slow,
5034
  float xpos_base,
5035
  bool xpos_down) {
5036
  GGML_ASSERT(ggml_is_vector(b));
 
5047
 
5048
  struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
5049
 
5050
+ int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx };
5051
+ memcpy(params + 5, &freq_base, sizeof(float));
5052
+ memcpy(params + 6, &freq_scale, sizeof(float));
5053
+ memcpy(params + 7, &ext_factor, sizeof(float));
5054
+ memcpy(params + 8, &attn_factor, sizeof(float));
5055
+ memcpy(params + 9, &beta_fast, sizeof(float));
5056
+ memcpy(params + 10, &beta_slow, sizeof(float));
5057
+ memcpy(params + 11, &xpos_base, sizeof(float));
5058
+ memcpy(params + 12, &xpos_down, sizeof(bool));
5059
  ggml_set_op_params(result, params, sizeof(params));
5060
 
5061
  result->op = GGML_OP_ROPE_BACK;
 
9385
  }
9386
  #endif
9387
 
 
9388
  static void ggml_compute_forward_mul_mat(
9389
  const struct ggml_compute_params * params,
9390
  const struct ggml_tensor * src0,
 
10954
  const struct ggml_compute_params * params,
10955
  const struct ggml_tensor * src0,
10956
  const struct ggml_tensor * src1,
10957
+ struct ggml_tensor * dst,
10958
+ const bool forward) {
10959
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
10960
  return;
10961
  }
 
11014
  const bool is_neox = mode & 2;
11015
  const bool is_glm = mode & 4;
11016
 
11017
+ // backward process uses inverse rotation by cos and sin.
11018
+ // cos and sin build a rotation matrix, where the inverse is the transpose.
11019
+ // this essentially just switches the sign of sin.
11020
+ const float sin_sign = forward ? 1.0f : -1.0f;
11021
+
11022
  const int32_t * pos = (const int32_t *) src1->data;
11023
 
11024
  for (int64_t i3 = 0; i3 < ne3; i3++) {
 
11035
  float block_theta = MAX(p - (n_ctx - 2), 0);
11036
  for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
11037
  const float cos_theta = cosf(theta_base);
11038
+ const float sin_theta = sinf(theta_base) * sin_sign;
11039
  const float cos_block_theta = cosf(block_theta);
11040
+ const float sin_block_theta = sinf(block_theta) * sin_sign;
11041
 
11042
  theta_base *= theta_scale;
11043
  block_theta *= theta_scale;
 
11061
  rope_yarn(
11062
  theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
11063
  );
11064
+ sin_theta *= sin_sign;
11065
 
11066
  // zeta scaling for xPos only:
11067
  float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
 
11092
  theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
11093
  &cos_theta, &sin_theta
11094
  );
11095
+ sin_theta *= sin_sign;
11096
 
11097
  theta_base *= theta_scale;
11098
 
 
11118
  const struct ggml_compute_params * params,
11119
  const struct ggml_tensor * src0,
11120
  const struct ggml_tensor * src1,
11121
+ struct ggml_tensor * dst,
11122
+ const bool forward) {
11123
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11124
  return;
11125
  }
 
11171
  const bool is_neox = mode & 2;
11172
  const bool is_glm = mode & 4;
11173
 
11174
+ // backward process uses inverse rotation by cos and sin.
11175
+ // cos and sin build a rotation matrix, where the inverse is the transpose.
11176
+ // this essentially just switches the sign of sin.
11177
+ const float sin_sign = forward ? 1.0f : -1.0f;
11178
+
11179
  const int32_t * pos = (const int32_t *) src1->data;
11180
 
11181
  for (int64_t i3 = 0; i3 < ne3; i3++) {
 
11192
  float block_theta = MAX(p - (n_ctx - 2), 0);
11193
  for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
11194
  const float cos_theta = cosf(theta_base);
11195
+ const float sin_theta = sinf(theta_base) * sin_sign;
11196
  const float cos_block_theta = cosf(block_theta);
11197
+ const float sin_block_theta = sinf(block_theta) * sin_sign;
11198
 
11199
  theta_base *= theta_scale;
11200
  block_theta *= theta_scale;
 
11218
  rope_yarn(
11219
  theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
11220
  );
11221
+ sin_theta *= sin_sign;
11222
 
11223
  theta_base *= theta_scale;
11224
 
 
11245
  theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
11246
  &cos_theta, &sin_theta
11247
  );
11248
+ sin_theta *= sin_sign;
11249
 
11250
  theta_base *= theta_scale;
11251
 
 
11275
  switch (src0->type) {
11276
  case GGML_TYPE_F16:
11277
  {
11278
+ ggml_compute_forward_rope_f16(params, src0, src1, dst, true);
11279
  } break;
11280
  case GGML_TYPE_F32:
11281
  {
11282
+ ggml_compute_forward_rope_f32(params, src0, src1, dst, true);
11283
  } break;
11284
  default:
11285
  {
 
11290
 
11291
  // ggml_compute_forward_rope_back
11292
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11293
  static void ggml_compute_forward_rope_back(
11294
  const struct ggml_compute_params * params,
11295
  const struct ggml_tensor * src0,
 
11298
  switch (src0->type) {
11299
  case GGML_TYPE_F16:
11300
  {
11301
+ ggml_compute_forward_rope_f16(params, src0, src1, dst, false);
11302
  } break;
11303
  case GGML_TYPE_F32:
11304
  {
11305
+ ggml_compute_forward_rope_f32(params, src0, src1, dst, false);
11306
  } break;
11307
  default:
11308
  {
 
14737
  // necessary for llama
14738
  if (src0->grad) {
14739
  //const int n_past = ((int32_t *) tensor->op_params)[0];
14740
+ const int n_dims = ((int32_t *) tensor->op_params)[1];
14741
+ const int mode = ((int32_t *) tensor->op_params)[2];
14742
+ const int n_ctx = ((int32_t *) tensor->op_params)[3];
14743
+ const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
14744
+ float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
14745
+
14746
+ memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float));
14747
+ memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float));
14748
+ memcpy(&ext_factor, (int32_t *) tensor->op_params + 7, sizeof(float));
14749
+ memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float));
14750
+ memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float));
14751
+ memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float));
14752
+ memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float));
14753
+ memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool));
14754
 
14755
  src0->grad = ggml_add_or_set(ctx,
14756
  src0->grad,
 
14760
  n_dims,
14761
  mode,
14762
  n_ctx,
14763
+ n_orig_ctx,
14764
  freq_base,
14765
  freq_scale,
14766
+ ext_factor,
14767
+ attn_factor,
14768
+ beta_fast,
14769
+ beta_slow,
14770
  xpos_base,
14771
  xpos_down),
14772
  zero_table);
 
14776
  {
14777
  if (src0->grad) {
14778
  //const int n_past = ((int32_t *) tensor->op_params)[0];
14779
+ const int n_dims = ((int32_t *) tensor->op_params)[1];
14780
+ const int mode = ((int32_t *) tensor->op_params)[2];
14781
+ const int n_ctx = ((int32_t *) tensor->op_params)[3];
14782
+ const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
14783
+ float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
14784
+
14785
+ memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float));
14786
+ memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float));
14787
+ memcpy(&ext_factor, (int32_t *) tensor->op_params + 7, sizeof(float));
14788
+ memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float));
14789
+ memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float));
14790
+ memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float));
14791
+ memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float));
14792
+ memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool));
14793
 
14794
  src0->grad = ggml_add_or_set(ctx,
14795
  src0->grad,
 
14798
  src1,
14799
  n_dims,
14800
  mode,
 
14801
  n_ctx,
14802
+ n_orig_ctx,
14803
  freq_base,
14804
  freq_scale,
14805
+ ext_factor,
14806
+ attn_factor,
14807
+ beta_fast,
14808
+ beta_slow,
14809
  xpos_base,
14810
  xpos_down,
14811
  false),
 
18073
  {
18074
  ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
18075
 
18076
+ for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
18077
  struct gguf_kv * kv = &ctx->kv[i];
18078
 
18079
  //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
 
18120
  case GGUF_TYPE_STRING:
18121
  {
18122
  kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
18123
+ for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
18124
  ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
18125
  }
18126
  } break;
 
18148
  {
18149
  ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
18150
 
18151
+ for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
18152
  struct gguf_tensor_info * info = &ctx->infos[i];
18153
 
18154
  for (int j = 0; j < GGML_MAX_DIMS; ++j) {
 
18195
  // compute the total size of the data section, taking into account the alignment
18196
  {
18197
  ctx->size = 0;
18198
+ for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
18199
  struct gguf_tensor_info * info = &ctx->infos[i];
18200
 
18201
  const int64_t ne =
 
18264
  ggml_set_no_alloc(ctx_data, true);
18265
 
18266
  // create the tensors
18267
+ for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
18268
  const int64_t ne[GGML_MAX_DIMS] = {
18269
  ctx->infos[i].ne[0],
18270
  ctx->infos[i].ne[1],
ggml.h CHANGED
@@ -1371,8 +1371,13 @@ extern "C" {
1371
  int n_dims,
1372
  int mode,
1373
  int n_ctx,
 
1374
  float freq_base,
1375
  float freq_scale,
 
 
 
 
1376
  float xpos_base,
1377
  bool xpos_down);
1378
 
 
1371
  int n_dims,
1372
  int mode,
1373
  int n_ctx,
1374
+ int n_orig_ctx,
1375
  float freq_base,
1376
  float freq_scale,
1377
+ float ext_factor,
1378
+ float attn_factor,
1379
+ float beta_fast,
1380
+ float beta_slow,
1381
  float xpos_base,
1382
  bool xpos_down);
1383