ggerganov committed on
Commit
8d0f0ac
·
1 Parent(s): e7722cb

tts : add OuteTTS support (llama/10784)

Browse files

* server : add "tokens" output

ggml-ci

* server : output embeddings for all tokens when pooling = none

ggml-ci

* server : be explicit about the pooling type in the tests

ggml-ci

* server : do not normalize embeddings when there is no pooling

ggml-ci

* llama : add OuteTTS support (wip)

* wip

* extract features

* first conv

* group norm

* resnet conv

* resnet

* attn

* pos net

* layer norm

* convnext

* head

* hann window

* fix n_embd + remove llama.cpp hacks

* compute hann window

* fft

* spectrum processing

* clean-up

* tts : receive input text and generate codes

* clip : fix new conv name

* tts : minor fix

* tts : add header + minor fixes

ggml-ci

* tts : add mathematical constant

ggml-ci

* tts : fix sampling + cut initial noise

* tts : fixes

* tts : update default samplers

ggml-ci

* tts : text pre-processing

* tts : outetts-voc -> wavtokenizer-dec

* tts : remove hardcoded constants

ggml-ci

* tts : fix tensor shapes

* llama : refactor wavtokenizer tensors

ggml-ci

* cont

ggml-ci

* cont [no ci]

* llama : update WavTokenizer to non-causal attn

* llama : handle no-vocab detokenization

* tts : add Python example for OuteTTS (wip)

* tts : extend python example to generate spectrogram

ggml-ci

* server : fix rebase artifacts

* tts : enable "return_tokens" in Python example

ggml-ci

* tts : minor fixes

* common : support HF download for vocoder

Files changed (2) hide show
  1. ggml/include/ggml.h +29 -12
  2. ggml/src/ggml.c +127 -93
ggml/include/ggml.h CHANGED
@@ -1564,17 +1564,6 @@ extern "C" {
1564
  int d1, // dilation dimension 1
1565
  bool is_2D);
1566
 
1567
- GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
1568
- struct ggml_context * ctx,
1569
- struct ggml_tensor * a, // convolution kernel
1570
- struct ggml_tensor * b, // data
1571
- int s0, // stride dimension 0
1572
- int s1, // stride dimension 1
1573
- int p0, // padding dimension 0
1574
- int p1, // padding dimension 1
1575
- int d0, // dilation dimension 0
1576
- int d1); // dilation dimension 1
1577
-
1578
  GGML_API struct ggml_tensor * ggml_conv_1d(
1579
  struct ggml_context * ctx,
1580
  struct ggml_tensor * a, // convolution kernel
@@ -1592,6 +1581,23 @@ extern "C" {
1592
  int s, // stride
1593
  int d); // dilation
1594
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1595
  GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
1596
  struct ggml_context * ctx,
1597
  struct ggml_tensor * a, // convolution kernel
@@ -1611,7 +1617,6 @@ extern "C" {
1611
  int d0, // dilation dimension 0
1612
  int d1); // dilation dimension 1
1613
 
1614
-
1615
  // kernel size is a->ne[0] x a->ne[1]
1616
  // stride is equal to kernel size
1617
  // padding is zero
@@ -1638,6 +1643,18 @@ extern "C" {
1638
  struct ggml_tensor * a,
1639
  struct ggml_tensor * b);
1640
 
 
 
 
 
 
 
 
 
 
 
 
 
1641
  GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
1642
  struct ggml_context * ctx,
1643
  struct ggml_tensor * a,
 
1564
  int d1, // dilation dimension 1
1565
  bool is_2D);
1566
 
 
 
 
 
 
 
 
 
 
 
 
1567
  GGML_API struct ggml_tensor * ggml_conv_1d(
1568
  struct ggml_context * ctx,
1569
  struct ggml_tensor * a, // convolution kernel
 
1581
  int s, // stride
1582
  int d); // dilation
1583
 
1584
+ // depthwise
1585
+ // TODO: this is very likely wrong for some cases! - needs more testing
1586
+ GGML_API struct ggml_tensor * ggml_conv_1d_dw(
1587
+ struct ggml_context * ctx,
1588
+ struct ggml_tensor * a, // convolution kernel
1589
+ struct ggml_tensor * b, // data
1590
+ int s0, // stride
1591
+ int p0, // padding
1592
+ int d0); // dilation
1593
+
1594
+ GGML_API struct ggml_tensor * ggml_conv_1d_dw_ph(
1595
+ struct ggml_context * ctx,
1596
+ struct ggml_tensor * a, // convolution kernel
1597
+ struct ggml_tensor * b, // data
1598
+ int s0, // stride
1599
+ int d0); // dilation
1600
+
1601
  GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
1602
  struct ggml_context * ctx,
1603
  struct ggml_tensor * a, // convolution kernel
 
1617
  int d0, // dilation dimension 0
1618
  int d1); // dilation dimension 1
1619
 
 
1620
  // kernel size is a->ne[0] x a->ne[1]
1621
  // stride is equal to kernel size
1622
  // padding is zero
 
1643
  struct ggml_tensor * a,
1644
  struct ggml_tensor * b);
1645
 
1646
+ // depthwise
1647
+ GGML_API struct ggml_tensor * ggml_conv_2d_dw(
1648
+ struct ggml_context * ctx,
1649
+ struct ggml_tensor * a, // convolution kernel
1650
+ struct ggml_tensor * b, // data
1651
+ int s0, // stride dimension 0
1652
+ int s1, // stride dimension 1
1653
+ int p0, // padding dimension 0
1654
+ int p1, // padding dimension 1
1655
+ int d0, // dilation dimension 0
1656
+ int d1); // dilation dimension 1
1657
+
1658
  GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
1659
  struct ggml_context * ctx,
1660
  struct ggml_tensor * a,
ggml/src/ggml.c CHANGED
@@ -3760,13 +3760,84 @@ struct ggml_tensor * ggml_clamp(
3760
  return result;
3761
  }
3762
 
3763
- // ggml_conv_1d
3764
-
3765
  static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
3766
  return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
3767
  }
3768
 
3769
- GGML_API struct ggml_tensor * ggml_conv_1d(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3770
  struct ggml_context * ctx,
3771
  struct ggml_tensor * a,
3772
  struct ggml_tensor * b,
@@ -3796,137 +3867,75 @@ struct ggml_tensor* ggml_conv_1d_ph(
3796
  return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
3797
  }
3798
 
3799
- // ggml_conv_transpose_1d
3800
-
3801
- static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
3802
- return (ins - 1) * s - 2 * p + d * (ks - 1) + 1;
3803
- }
3804
 
3805
- GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
3806
  struct ggml_context * ctx,
3807
  struct ggml_tensor * a,
3808
  struct ggml_tensor * b,
3809
  int s0,
3810
  int p0,
3811
  int d0) {
3812
- GGML_ASSERT(ggml_is_matrix(b));
3813
- GGML_ASSERT(a->ne[2] == b->ne[1]);
3814
- GGML_ASSERT(a->ne[3] == 1);
3815
 
3816
- GGML_ASSERT(p0 == 0);
3817
- GGML_ASSERT(d0 == 1);
3818
 
3819
- const int64_t ne[4] = {
3820
- ggml_calc_conv_transpose_1d_output_size(b->ne[0], a->ne[0], s0, 0 /*p0*/, 1 /*d0*/),
3821
- a->ne[1], b->ne[2], 1,
3822
- };
3823
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
3824
 
3825
- int32_t params[] = { s0, p0, d0 };
3826
- ggml_set_op_params(result, params, sizeof(params));
3827
-
3828
- result->op = GGML_OP_CONV_TRANSPOSE_1D;
3829
- result->src[0] = a;
3830
- result->src[1] = b;
3831
 
3832
  return result;
3833
  }
3834
 
3835
- // ggml_conv_depthwise
3836
 
3837
- struct ggml_tensor * ggml_conv_depthwise_2d(
3838
  struct ggml_context * ctx,
3839
  struct ggml_tensor * a,
3840
  struct ggml_tensor * b,
3841
  int s0,
3842
- int s1,
3843
- int p0,
3844
- int p1,
3845
- int d0,
3846
- int d1) {
3847
- struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
3848
- struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
3849
- ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
3850
- s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW]
3851
- struct ggml_tensor * new_b = ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
3852
 
3853
- new_a = ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1); // [OC,1, KH, KW] => [1, OC, 1, KH * KW]
3854
- struct ggml_tensor * result = ggml_mul_mat(ctx, new_a, new_b);
3855
- result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
3856
 
3857
- return result;
 
3858
  }
3859
- // ggml_conv_2d
3860
 
3861
- // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
3862
- // a: [OC,IC, KH, KW]
3863
- // b: [N, IC, IH, IW]
3864
- // result: [N, OH, OW, IC*KH*KW]
3865
- struct ggml_tensor * ggml_im2col(
3866
  struct ggml_context * ctx,
3867
  struct ggml_tensor * a,
3868
  struct ggml_tensor * b,
3869
  int s0,
3870
- int s1,
3871
  int p0,
3872
- int p1,
3873
- int d0,
3874
- int d1,
3875
- bool is_2D,
3876
- enum ggml_type dst_type) {
3877
- if(is_2D) {
3878
- GGML_ASSERT(a->ne[2] == b->ne[2]);
3879
- } else {
3880
- GGML_ASSERT(a->ne[1] == b->ne[1]);
3881
- GGML_ASSERT(b->ne[3] == 1);
3882
- }
3883
-
3884
- const int64_t OH = is_2D ? ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0;
3885
- const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
3886
 
3887
- GGML_ASSERT((!is_2D || OH > 0) && "b too small compared to a");
3888
- GGML_ASSERT((OW > 0) && "b too small compared to a");
3889
 
3890
  const int64_t ne[4] = {
3891
- is_2D ? (a->ne[2] * a->ne[1] * a->ne[0]) : a->ne[1] * a->ne[0],
3892
- OW,
3893
- is_2D ? OH : b->ne[2],
3894
- is_2D ? b->ne[3] : 1,
3895
  };
 
3896
 
3897
- struct ggml_tensor * result = ggml_new_tensor(ctx, dst_type, 4, ne);
3898
- int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
3899
  ggml_set_op_params(result, params, sizeof(params));
3900
 
3901
- result->op = GGML_OP_IM2COL;
3902
  result->src[0] = a;
3903
  result->src[1] = b;
3904
 
3905
  return result;
3906
  }
3907
 
3908
- struct ggml_tensor * ggml_im2col_back(
3909
- struct ggml_context * ctx,
3910
- struct ggml_tensor * a,
3911
- struct ggml_tensor * b,
3912
- int64_t * ne,
3913
- int s0,
3914
- int s1,
3915
- int p0,
3916
- int p1,
3917
- int d0,
3918
- int d1,
3919
- bool is_2D) {
3920
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
3921
- int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
3922
- ggml_set_op_params(result, params, sizeof(params));
3923
-
3924
- result->op = GGML_OP_IM2COL_BACK;
3925
- result->src[0] = a;
3926
- result->src[1] = b;
3927
-
3928
- return result;
3929
- }
3930
 
3931
  // a: [OC,IC, KH, KW]
3932
  // b: [N, IC, IH, IW]
@@ -3973,6 +3982,31 @@ struct ggml_tensor * ggml_conv_2d_s1_ph(
3973
  return ggml_conv_2d(ctx, a, b, 1, 1, a->ne[0] / 2, a->ne[1] / 2, 1, 1);
3974
  }
3975
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3976
  // ggml_conv_transpose_2d_p0
3977
 
3978
  static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {
 
3760
  return result;
3761
  }
3762
 
 
 
3763
  static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
3764
  return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
3765
  }
3766
 
3767
+ // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
3768
+ // a: [OC,IC, KH, KW]
3769
+ // b: [N, IC, IH, IW]
3770
+ // result: [N, OH, OW, IC*KH*KW]
3771
+ struct ggml_tensor * ggml_im2col(
3772
+ struct ggml_context * ctx,
3773
+ struct ggml_tensor * a,
3774
+ struct ggml_tensor * b,
3775
+ int s0,
3776
+ int s1,
3777
+ int p0,
3778
+ int p1,
3779
+ int d0,
3780
+ int d1,
3781
+ bool is_2D,
3782
+ enum ggml_type dst_type) {
3783
+ if (is_2D) {
3784
+ GGML_ASSERT(a->ne[2] == b->ne[2]);
3785
+ } else {
3786
+ //GGML_ASSERT(b->ne[1] % a->ne[1] == 0);
3787
+ GGML_ASSERT(b->ne[1] == a->ne[1]);
3788
+ GGML_ASSERT(b->ne[3] == 1);
3789
+ }
3790
+
3791
+ const int64_t OH = is_2D ? ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0;
3792
+ const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
3793
+
3794
+ GGML_ASSERT((!is_2D || OH > 0) && "b too small compared to a");
3795
+ GGML_ASSERT((OW > 0) && "b too small compared to a");
3796
+
3797
+ const int64_t ne[4] = {
3798
+ is_2D ? (a->ne[2] * a->ne[1] * a->ne[0]) : a->ne[1] * a->ne[0],
3799
+ OW,
3800
+ is_2D ? OH : b->ne[2],
3801
+ is_2D ? b->ne[3] : 1,
3802
+ };
3803
+
3804
+ struct ggml_tensor * result = ggml_new_tensor(ctx, dst_type, 4, ne);
3805
+ int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
3806
+ ggml_set_op_params(result, params, sizeof(params));
3807
+
3808
+ result->op = GGML_OP_IM2COL;
3809
+ result->src[0] = a;
3810
+ result->src[1] = b;
3811
+
3812
+ return result;
3813
+ }
3814
+
3815
+ struct ggml_tensor * ggml_im2col_back(
3816
+ struct ggml_context * ctx,
3817
+ struct ggml_tensor * a,
3818
+ struct ggml_tensor * b,
3819
+ int64_t * ne,
3820
+ int s0,
3821
+ int s1,
3822
+ int p0,
3823
+ int p1,
3824
+ int d0,
3825
+ int d1,
3826
+ bool is_2D) {
3827
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
3828
+ int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
3829
+ ggml_set_op_params(result, params, sizeof(params));
3830
+
3831
+ result->op = GGML_OP_IM2COL_BACK;
3832
+ result->src[0] = a;
3833
+ result->src[1] = b;
3834
+
3835
+ return result;
3836
+ }
3837
+
3838
+ // ggml_conv_1d
3839
+
3840
+ struct ggml_tensor * ggml_conv_1d(
3841
  struct ggml_context * ctx,
3842
  struct ggml_tensor * a,
3843
  struct ggml_tensor * b,
 
3867
  return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
3868
  }
3869
 
3870
+ // ggml_conv_1d_dw
 
 
 
 
3871
 
3872
+ struct ggml_tensor * ggml_conv_1d_dw(
3873
  struct ggml_context * ctx,
3874
  struct ggml_tensor * a,
3875
  struct ggml_tensor * b,
3876
  int s0,
3877
  int p0,
3878
  int d0) {
3879
+ struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], 1, a->ne[1], a->ne[2]);
3880
+ struct ggml_tensor * new_b = ggml_reshape_4d(ctx, b, b->ne[0], 1, b->ne[1], b->ne[2]);
 
3881
 
3882
+ struct ggml_tensor * im2col = ggml_im2col(ctx, new_a, new_b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16);
 
3883
 
3884
+ struct ggml_tensor * result = ggml_mul_mat(ctx, im2col, a);
 
 
 
 
3885
 
3886
+ result = ggml_reshape_3d(ctx, result, b->ne[0], b->ne[1], 1);
 
 
 
 
 
3887
 
3888
  return result;
3889
  }
3890
 
3891
+ // ggml_conv_1d_dw_ph
3892
 
3893
+ struct ggml_tensor * ggml_conv_1d_dw_ph(
3894
  struct ggml_context * ctx,
3895
  struct ggml_tensor * a,
3896
  struct ggml_tensor * b,
3897
  int s0,
3898
+ int d0) {
3899
+ return ggml_conv_1d_dw(ctx, a, b, s0, a->ne[0] / 2, d0);
3900
+ }
 
 
 
 
 
 
 
3901
 
3902
+ // ggml_conv_transpose_1d
 
 
3903
 
3904
+ static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
3905
+ return (ins - 1) * s - 2 * p + d * (ks - 1) + 1;
3906
  }
 
3907
 
3908
+ GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
 
 
 
 
3909
  struct ggml_context * ctx,
3910
  struct ggml_tensor * a,
3911
  struct ggml_tensor * b,
3912
  int s0,
 
3913
  int p0,
3914
+ int d0) {
3915
+ GGML_ASSERT(ggml_is_matrix(b));
3916
+ GGML_ASSERT(a->ne[2] == b->ne[1]);
3917
+ GGML_ASSERT(a->ne[3] == 1);
 
 
 
 
 
 
 
 
 
 
3918
 
3919
+ GGML_ASSERT(p0 == 0);
3920
+ GGML_ASSERT(d0 == 1);
3921
 
3922
  const int64_t ne[4] = {
3923
+ ggml_calc_conv_transpose_1d_output_size(b->ne[0], a->ne[0], s0, 0 /*p0*/, 1 /*d0*/),
3924
+ a->ne[1], b->ne[2], 1,
 
 
3925
  };
3926
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
3927
 
3928
+ int32_t params[] = { s0, p0, d0 };
 
3929
  ggml_set_op_params(result, params, sizeof(params));
3930
 
3931
+ result->op = GGML_OP_CONV_TRANSPOSE_1D;
3932
  result->src[0] = a;
3933
  result->src[1] = b;
3934
 
3935
  return result;
3936
  }
3937
 
3938
+ // ggml_conv_2d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3939
 
3940
  // a: [OC,IC, KH, KW]
3941
  // b: [N, IC, IH, IW]
 
3982
  return ggml_conv_2d(ctx, a, b, 1, 1, a->ne[0] / 2, a->ne[1] / 2, 1, 1);
3983
  }
3984
 
3985
+ // ggml_conv_2d_dw
3986
+
3987
+ struct ggml_tensor * ggml_conv_2d_dw(
3988
+ struct ggml_context * ctx,
3989
+ struct ggml_tensor * a,
3990
+ struct ggml_tensor * b,
3991
+ int s0,
3992
+ int s1,
3993
+ int p0,
3994
+ int p1,
3995
+ int d0,
3996
+ int d1) {
3997
+ struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
3998
+ struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
3999
+ ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
4000
+ s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW]
4001
+ struct ggml_tensor * new_b = ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
4002
+
4003
+ new_a = ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1); // [OC,1, KH, KW] => [1, OC, 1, KH * KW]
4004
+ struct ggml_tensor * result = ggml_mul_mat(ctx, new_a, new_b);
4005
+ result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
4006
+
4007
+ return result;
4008
+ }
4009
+
4010
  // ggml_conv_transpose_2d_p0
4011
 
4012
  static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {