Commit: 8d3b3c1 · Parent: 0612f1f
llama/ggml: add LLM training support (llama/10544)
* llama/ggml: add LLM training support
  more compact progress bar
  llama_save_model_to_file
  llama_opt_param_filter
  ggml_graph_dup force_grads
  refactor ggml_opt, fix test-opt
* remove logits_all
* refactor CUDA implementation for ACC
* reset graph at beginning of opt period
- ggml/include/ggml-opt.h +47 -28
- ggml/include/ggml.h +6 -7
- ggml/src/ggml-backend.cpp +1 -1
- ggml/src/ggml-cuda/acc.cu +40 -26
- ggml/src/ggml-cuda/sum.cu +1 -1
- ggml/src/ggml-opt.cpp +368 -190
- ggml/src/ggml.c +23 -18
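
The headline API change is that the old one-shot forward/forward-backward calls are replaced by an explicit allocate/evaluate pair. A minimal sketch of the resulting per-batch loop with statically allocated graphs, mirroring what the updated ggml_opt_epoch in ggml-opt.cpp does below; the setup of opt_ctx, dataset, result_train, inputs and labels is assumed to happen elsewhere:

#include "ggml-opt.h"

// Sketch only: per-batch training loop after this change, following ggml_opt_epoch.
void train_batches_sketch(ggml_opt_context_t opt_ctx, ggml_opt_dataset_t dataset,
                          ggml_opt_result_t result_train, int64_t nbatches,
                          struct ggml_tensor * inputs, struct ggml_tensor * labels) {
    for (int64_t ibatch = 0; ibatch < nbatches; ++ibatch) {
        ggml_opt_alloc(opt_ctx, /*backward =*/ true);                // allocate the forward + backward graph
        ggml_opt_dataset_get_batch(dataset, inputs, labels, ibatch); // copy the next batch into inputs/labels
        ggml_opt_eval(opt_ctx, result_train);                        // forward pass, optional result accumulation, backward/optimizer step
    }
}

Passing backward = false to ggml_opt_alloc gives the evaluation-only (forward) variant of the same loop.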
ggml/include/ggml-opt.h
CHANGED
@@ -37,13 +37,16 @@ extern "C" {
     // ====== Dataset ======

     GGML_API ggml_opt_dataset_t ggml_opt_dataset_init(
-            int64_t ne_datapoint,
-            int64_t ne_label,
-            int64_t ndata,
-            int64_t ndata_shard);
+            enum ggml_type type_data,    // the type for the internal data tensor
+            enum ggml_type type_label,   // the type for the internal labels tensor
+            int64_t        ne_datapoint, // number of elements per datapoint
+            int64_t        ne_label,     // number of elements per label
+            int64_t        ndata,        // total number of datapoints/labels
+            int64_t        ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied)
     GGML_API void ggml_opt_dataset_free(ggml_opt_dataset_t dataset);

     // get underlying tensors that store the data
+    GGML_API int64_t              ggml_opt_dataset_ndata (ggml_opt_dataset_t dataset);
     GGML_API struct ggml_tensor * ggml_opt_dataset_data  (ggml_opt_dataset_t dataset); // shape = [ne_datapoint, ndata]
     GGML_API struct ggml_tensor * ggml_opt_dataset_labels(ggml_opt_dataset_t dataset); // shape = [nd_label, ndata]

@@ -56,13 +59,19 @@ extern "C" {
             struct ggml_tensor * data_batch,   // shape = [ne_datapoint, ndata_batch]
             struct ggml_tensor * labels_batch, // shape = [ne_label, ndata_batch]
             int64_t              ibatch);
+    GGML_API void ggml_opt_dataset_get_batch_host(
+            ggml_opt_dataset_t   dataset,
+            void               * data_batch,
+            size_t               nb_data_batch,
+            void               * labels_batch,
+            int64_t              ibatch);

     // ====== Model / Context ======

     enum ggml_opt_build_type {
-        GGML_OPT_BUILD_TYPE_FORWARD,
-        GGML_OPT_BUILD_TYPE_GRAD,
-        GGML_OPT_BUILD_TYPE_OPT,
+        GGML_OPT_BUILD_TYPE_FORWARD = 10,
+        GGML_OPT_BUILD_TYPE_GRAD    = 20,
+        GGML_OPT_BUILD_TYPE_OPT     = 30,
     };

     // parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
@@ -81,20 +90,22 @@ extern "C" {
     // userdata can be used to pass arbitrary data
     typedef struct ggml_opt_optimizer_params (*ggml_opt_get_optimizer_params)(void * userdata);

-    // returns the default optimizer params (constant)
+    // returns the default optimizer params (constant, hard-coded values)
     // userdata is not used
     GGML_API struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata);

+    // casts userdata to ggml_opt_optimizer_params and returns it
+    GGML_API struct ggml_opt_optimizer_params ggml_opt_get_constant_optimizer_params(void * userdata);
+
     // parameters for initializing a new optimization context
     struct ggml_opt_params {
         ggml_backend_sched_t backend_sched; // defines which backends are used to construct the compute graphs

-        struct ggml_tensor * inputs;
-        struct ggml_tensor * outputs;
+        // by default the forward graph needs to be reconstructed for each eval
+        // if ctx_compute, inputs, and outputs are set the graphs are instead allocated statically
+        struct ggml_context * ctx_compute;
+        struct ggml_tensor  * inputs;
+        struct ggml_tensor  * outputs;

         enum ggml_opt_loss_type  loss_type;
         enum ggml_opt_build_type build_type;
@@ -107,12 +118,9 @@ extern "C" {

     // get parameters for an optimization context with defaults set where possible
     // parameters for which no sensible defaults exist are supplied as arguments to this function
-    GGML_API ggml_opt_params ggml_opt_default_params(
-            ggml_backend_sched_t      backend_sched,
-            struct ggml_context     * ctx_compute,
-            struct ggml_tensor      * inputs,
-            struct ggml_tensor      * outputs,
-            enum ggml_opt_loss_type   loss_type);
+    GGML_API struct ggml_opt_params ggml_opt_default_params(
+            ggml_backend_sched_t    backend_sched,
+            enum ggml_opt_loss_type loss_type);

     GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params);
     GGML_API void               ggml_opt_free(ggml_opt_context_t opt_ctx);
@@ -121,6 +129,7 @@ extern "C" {
     GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer);

     // get underlying tensors that store data
+    // if not using static graphs these pointers become invalid with the next call to ggml_opt_alloc
     GGML_API struct ggml_tensor * ggml_opt_inputs(  ggml_opt_context_t opt_ctx); // forward graph input tensor
     GGML_API struct ggml_tensor * ggml_opt_outputs( ggml_opt_context_t opt_ctx); // forward graph output tensor
     GGML_API struct ggml_tensor * ggml_opt_labels(  ggml_opt_context_t opt_ctx); // labels to compare outputs against
@@ -128,11 +137,12 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_opt_pred(    ggml_opt_context_t opt_ctx); // predictions made by outputs
     GGML_API struct ggml_tensor * ggml_opt_ncorrect(ggml_opt_context_t opt_ctx); // number of matching predictions between outputs and labels

+    // get the gradient accumulator for a node from the forward graph
     GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node);

     // ====== Optimization Result ======

-    GGML_API ggml_opt_result_t ggml_opt_result_init();
+    GGML_API ggml_opt_result_t ggml_opt_result_init(void);
     GGML_API void              ggml_opt_result_free(ggml_opt_result_t result);
     GGML_API void              ggml_opt_result_reset(ggml_opt_result_t result);
@@ -144,11 +154,20 @@ extern "C" {

     // ====== Computation ======

-    GGML_API void ggml_opt_forward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
+    // if not using static graphs, this function must be called prior to ggml_opt_alloc
+    GGML_API void ggml_opt_prepare_alloc(
+            ggml_opt_context_t    opt_ctx,
+            struct ggml_context * ctx_compute,
+            struct ggml_cgraph  * gf,
+            struct ggml_tensor  * inputs,
+            struct ggml_tensor  * outputs);
+
+    // allocate the next graph for evaluation, either forward or forward + backward
+    // must be called exactly once prior to calling ggml_opt_eval
+    GGML_API void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward);

-    // do forward pass, increment result if not NULL, do backward pass
-    GGML_API void ggml_opt_forward_backward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
+    // do forward pass, increment result if not NULL, do backward pass if allocated
+    GGML_API void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);

     // ############################################################################
     // ## The high-level functions start here. They do not depend on any private ##
@@ -200,9 +219,9 @@ extern "C" {
     // fit model defined by inputs and outputs to dataset
     GGML_API void ggml_opt_fit(
             ggml_backend_sched_t          backend_sched, // backend scheduler for constructing the compute graphs
-            ggml_context                * ctx_compute,
-            ggml_tensor                 * inputs,
-            ggml_tensor                 * outputs,
+            struct ggml_context         * ctx_compute,   // context with temporarily allocated tensors to calculate the outputs
+            struct ggml_tensor          * inputs,        // input tensor with shape [ne_datapoint, ndata_batch]
+            struct ggml_tensor          * outputs,       // output tensor, must have shape [ne_label, ndata_batch] if labels are used
            ggml_opt_dataset_t            dataset,       // dataset with data and optionally also labels
            enum ggml_opt_loss_type       loss_type,     // loss to minimize
            ggml_opt_get_optimizer_params get_opt_pars,  // callback to get optimizer params, userdata is pointer to epoch (of type int64_t)
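
A hedged sketch of how the extended dataset API above might be used: the tensor types are now explicit, and ggml_opt_dataset_get_batch_host copies a batch into plain host memory instead of a ggml tensor. The sizes below (784 features, 10 classes, 60000 datapoints) are illustrative only, not taken from the commit:

#include <vector>
#include "ggml.h"
#include "ggml-opt.h"

void dataset_sketch() {
    const int64_t ne_datapoint = 784, ne_label = 10, ndata = 60000, ndata_shard = 1;

    ggml_opt_dataset_t dataset = ggml_opt_dataset_init(
        GGML_TYPE_F32, GGML_TYPE_F32, ne_datapoint, ne_label, ndata, ndata_shard);

    // Filling the dataset via ggml_opt_dataset_data()/ggml_opt_dataset_labels() is omitted here.

    // Copy one logical batch of 128 datapoints into host buffers (nb_data_batch is in bytes).
    const int64_t ndata_batch = 128;
    std::vector<float> data_batch(ne_datapoint*ndata_batch);
    std::vector<float> labels_batch(ne_label*ndata_batch);
    ggml_opt_dataset_get_batch_host(dataset,
        data_batch.data(), data_batch.size()*sizeof(float), labels_batch.data(), /*ibatch =*/ 0);

    ggml_opt_dataset_free(dataset);
}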
ggml/include/ggml.h
CHANGED
@@ -768,7 +768,7 @@ extern "C" {
     // Tensor flags
     GGML_API void ggml_set_input(struct ggml_tensor * tensor);
     GGML_API void ggml_set_output(struct ggml_tensor * tensor);
-    GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor);
+    GGML_API void ggml_set_param(struct ggml_tensor * tensor);
     GGML_API void ggml_set_loss(struct ggml_tensor * tensor);

     //
@@ -938,7 +938,7 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_repeat_back(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
+            struct ggml_tensor  * b); // sum up values that are adjacent in dims > 0 instead of repeated with same stride

     // concat a and b along dim
     // used in stable-diffusion
@@ -2049,15 +2049,14 @@ extern "C" {

     GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
     GGML_API void ggml_build_backward_expand(
-            struct ggml_context *  ctx_static,
-            struct ggml_context *  ctx_compute,
-            struct ggml_cgraph  *  cgraph,
-            bool                   accumulate); // whether or not gradients should be accumulated, requires static allocation of tensors in ctx_static
+            struct ggml_context *  ctx,        // context for gradient computation
+            struct ggml_cgraph  *  cgraph,
+            struct ggml_tensor  ** grad_accs);

     // graph allocation in a context
     GGML_API struct ggml_cgraph * ggml_new_graph        (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
     GGML_API struct ggml_cgraph * ggml_new_graph_custom (struct ggml_context * ctx, size_t size, bool grads);
-    GGML_API struct ggml_cgraph * ggml_graph_dup        (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
+    GGML_API struct ggml_cgraph * ggml_graph_dup        (struct ggml_context * ctx, struct ggml_cgraph * cgraph, bool force_grads);
     GGML_API void                 ggml_graph_cpy        (struct ggml_cgraph * src, struct ggml_cgraph * dst);
     GGML_API void                 ggml_graph_reset      (struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1
     GGML_API void                 ggml_graph_clear      (struct ggml_cgraph * cgraph);
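
A hedged sketch of the new graph-building flow implied by these declarations, mirroring how ggml-opt.cpp uses them further down: the backward graph is built by duplicating the forward graph with gradient bookkeeping forced on and passing an explicit array of (possibly NULL) gradient-accumulation tensors. The forward graph gf is assumed to already end in a node flagged as loss, with parameters flagged via ggml_set_param:

#include <vector>
#include "ggml.h"

// Sketch under the assumption that ctx and gf (a forward graph ending in a loss node) already exist.
struct ggml_cgraph * build_backward_sketch(struct ggml_context * ctx, struct ggml_cgraph * gf) {
    // One entry per forward node; NULL means no gradient accumulator for that node.
    std::vector<struct ggml_tensor *> grad_accs(ggml_graph_n_nodes(gf), nullptr);

    // Duplicate the forward graph with grads forced on, then expand it backwards in place.
    struct ggml_cgraph * gb = ggml_graph_dup(ctx, gf, /*force_grads =*/ true);
    ggml_build_backward_expand(ctx, gb, grad_accs.data());
    return gb;
}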
ggml/src/ggml-backend.cpp
CHANGED
@@ -1111,7 +1111,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg

         const int node_backend_id = tensor_backend_id(node);

-        assert(node_backend_id != -1); // all nodes should be assigned by now
+        assert(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback

         // check if we should start a new split based on the sources of the current node
         bool need_new_split = false;
ggml/src/ggml-cuda/acc.cu
CHANGED
@@ -1,47 +1,61 @@
 #include "acc.cuh"

+static __global__ void acc_f32(const float * x, const float * y, float * dst, const int64_t ne,
+    const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
+    const int64_t s11, const int64_t s12, const int64_t s13, const int64_t offset) {
+    const int64_t i = blockDim.x * blockIdx.x + threadIdx.x;
+
     if (i >= ne) {
         return;
     }
+
+    int64_t src1_idx = i - offset;
+
+    int64_t tmp = src1_idx;
+    const int64_t i13 = tmp / s13;
+    tmp -= i13 * s13;
+    const int64_t i12 = tmp / s12;
+    tmp -= i12 * s12;
+    const int64_t i11 = tmp / s11;
+    tmp -= i11 * s11;
+    const int64_t i10 = tmp;
+
+    float val = x[i];
+    if (src1_idx >= 0 && i10 < ne10 && i11 < ne11 && i12 < ne12 && i13 < ne13) {
+        val += y[((i13*ne12 + i12) * ne11 + i11) * ne10 + i10];
     }
+    dst[i] = val;
 }

+static void acc_f32_cuda(const float * x, const float * y, float * dst, const int64_t n_elements,
+    const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
+    const int64_t s1, const int64_t s2, const int64_t s3, const int64_t offset, cudaStream_t stream) {
+    const int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
+    acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, ne13, s1, s2, s3, offset);
 }

 void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
     const ggml_tensor * src1 = dst->src[1];
+
+    const float * src0_d = (const float *) src0->data;
+    const float * src1_d = (const float *) src1->data;
+    float       * dst_d  = (float       *) dst->data;
+
     cudaStream_t stream = ctx.stream();

     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported

+    GGML_ASSERT(ggml_is_contiguous(src1));
+    GGML_ASSERT(dst->nb[0] == ggml_element_size(dst));
+    GGML_ASSERT(ggml_is_contiguously_allocated(dst));
+
+    const int64_t s1     = dst->op_params[0] / sizeof(float);
+    const int64_t s2     = dst->op_params[1] / sizeof(float);
+    const int64_t s3     = dst->op_params[2] / sizeof(float);
+    const int64_t offset = dst->op_params[3] / sizeof(float);

+    acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], s1, s2, s3, offset, stream);
 }
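
The rewritten kernel generalizes GGML_OP_ACC to 4 dimensions by decomposing the flat destination index with the strides stored in op_params (converted from bytes to float counts). A CPU reference of that index math, written only as a sketch to illustrate the decomposition used above, not as the backend's actual code:

#include <cstdint>

// Per-element logic of acc_f32: dst[i] = x[i], plus y at the matching coordinates
// if the flat index i falls inside the accumulated view.
// s11/s12/s13 and offset are strides/offset in units of floats, as derived from dst->op_params.
static void acc_f32_reference(const float * x, const float * y, float * dst, int64_t ne,
                              int64_t ne10, int64_t ne11, int64_t ne12, int64_t ne13,
                              int64_t s11, int64_t s12, int64_t s13, int64_t offset) {
    for (int64_t i = 0; i < ne; ++i) {
        const int64_t src1_idx = i - offset;
        int64_t tmp = src1_idx;
        const int64_t i13 = tmp / s13; tmp -= i13 * s13;
        const int64_t i12 = tmp / s12; tmp -= i12 * s12;
        const int64_t i11 = tmp / s11; tmp -= i11 * s11;
        const int64_t i10 = tmp;

        float val = x[i];
        if (src1_idx >= 0 && i10 < ne10 && i11 < ne11 && i12 < ne12 && i13 < ne13) {
            val += y[((i13*ne12 + i12)*ne11 + i11)*ne10 + i10];
        }
        dst[i] = val;
    }
}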
ggml/src/ggml-cuda/sum.cu
CHANGED
@@ -31,7 +31,7 @@ void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguously_allocated(src0));

     const float * src0_d = (const float *) src0->data;
     float * dst_d = (float *) dst->data;
ggml/src/ggml-opt.cpp
CHANGED
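
The bulk of the new behavior lands in this file: graphs no longer have to be allocated statically. When ggml_opt_params.ctx_compute is left unset, the caller hands over a freshly built forward graph every step via ggml_opt_prepare_alloc. A sketch of that mode under stated assumptions; build_forward_graph() is a hypothetical helper standing in for whatever constructs the model's forward pass for the current step:

#include "ggml.h"
#include "ggml-opt.h"

// Hypothetical: returns a compute context plus forward graph with inputs/outputs for one step.
struct forward_graph { struct ggml_context * ctx; struct ggml_cgraph * gf;
                       struct ggml_tensor * inputs; struct ggml_tensor * outputs; };
forward_graph build_forward_graph(); // assumed to exist elsewhere

void train_step_dynamic_sketch(ggml_opt_context_t opt_ctx, ggml_opt_result_t result) {
    forward_graph fg = build_forward_graph();                             // rebuild the forward graph for this step
    ggml_opt_prepare_alloc(opt_ctx, fg.ctx, fg.gf, fg.inputs, fg.outputs); // required when not using static graphs
    ggml_opt_alloc(opt_ctx, /*backward =*/ true);                         // builds gb_grad/gb_opt on top of gf and allocates them
    // ... fill fg.inputs with the current batch here ...
    ggml_opt_eval(opt_ctx, result);                                       // graph pointers become invalid afterwards
}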
|
@@ -28,16 +28,19 @@ struct ggml_opt_dataset {
|
|
| 28 |
};
|
| 29 |
|
| 30 |
struct ggml_opt_context {
|
| 31 |
-
ggml_backend_sched_t
|
| 32 |
-
ggml_cgraph
|
| 33 |
-
ggml_cgraph
|
| 34 |
-
struct ggml_context
|
| 35 |
-
struct ggml_context
|
| 36 |
-
struct ggml_context
|
| 37 |
-
struct ggml_context
|
| 38 |
-
ggml_backend_buffer_t
|
| 39 |
-
ggml_backend_buffer_t
|
| 40 |
-
std::mt19937
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
struct ggml_tensor * inputs = nullptr;
|
| 43 |
struct ggml_tensor * outputs = nullptr;
|
|
@@ -50,6 +53,11 @@ struct ggml_opt_context {
|
|
| 50 |
struct ggml_cgraph * gf = nullptr;
|
| 51 |
struct ggml_cgraph * gb_grad = nullptr;
|
| 52 |
struct ggml_cgraph * gb_opt = nullptr;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
int64_t iter = 1;
|
| 55 |
int32_t opt_period = 1;
|
|
@@ -73,7 +81,13 @@ struct ggml_opt_result {
|
|
| 73 |
|
| 74 |
// ====== Dataset ======
|
| 75 |
|
| 76 |
-
ggml_opt_dataset_t ggml_opt_dataset_init(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
GGML_ASSERT(ne_datapoint > 0);
|
| 78 |
GGML_ASSERT(ne_label >= 0);
|
| 79 |
GGML_ASSERT(ndata > 0);
|
|
@@ -92,11 +106,11 @@ ggml_opt_dataset_t ggml_opt_dataset_init(int64_t ne_datapoint, int64_t ne_label,
|
|
| 92 |
result->ctx = ggml_init(params);
|
| 93 |
}
|
| 94 |
|
| 95 |
-
result->data = ggml_new_tensor_2d(result->ctx,
|
| 96 |
result->nbs_data = ggml_nbytes(result->data) * ndata_shard/ndata;
|
| 97 |
|
| 98 |
if (ne_label > 0) {
|
| 99 |
-
result->labels = ggml_new_tensor_2d(result->ctx,
|
| 100 |
result->nbs_labels = ggml_nbytes(result->labels) * ndata_shard/ndata;
|
| 101 |
} else {
|
| 102 |
result->labels = nullptr;
|
|
@@ -119,6 +133,10 @@ void ggml_opt_dataset_free(ggml_opt_dataset_t dataset) {
|
|
| 119 |
delete dataset;
|
| 120 |
}
|
| 121 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
struct ggml_tensor * ggml_opt_dataset_data(ggml_opt_dataset_t dataset) {
|
| 123 |
return dataset->data;
|
| 124 |
}
|
|
@@ -144,6 +162,8 @@ void ggml_opt_dataset_get_batch(ggml_opt_dataset_t dataset, struct ggml_tensor *
|
|
| 144 |
GGML_ASSERT( data_batch && ggml_is_contiguous(data_batch));
|
| 145 |
GGML_ASSERT(!labels_batch || ggml_is_contiguous(labels_batch));
|
| 146 |
GGML_ASSERT((labels_batch == nullptr) == (dataset->labels == nullptr));
|
|
|
|
|
|
|
| 147 |
|
| 148 |
const size_t nb_data_batch = ggml_nbytes(data_batch);
|
| 149 |
GGML_ASSERT(nb_data_batch % dataset->nbs_data == 0);
|
|
@@ -171,6 +191,31 @@ void ggml_opt_dataset_get_batch(ggml_opt_dataset_t dataset, struct ggml_tensor *
|
|
| 171 |
}
|
| 172 |
}
|
| 173 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
// ====== Model / Context ======
|
| 175 |
|
| 176 |
struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata) {
|
|
@@ -187,17 +232,18 @@ struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * us
|
|
| 187 |
return result;
|
| 188 |
}
|
| 189 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
struct ggml_opt_params ggml_opt_default_params(
|
| 191 |
ggml_backend_sched_t backend_sched,
|
| 192 |
-
struct ggml_context * ctx_compute,
|
| 193 |
-
struct ggml_tensor * inputs,
|
| 194 |
-
struct ggml_tensor * outputs,
|
| 195 |
enum ggml_opt_loss_type loss_type) {
|
| 196 |
return {
|
| 197 |
/*backend_sched =*/ backend_sched,
|
| 198 |
-
/*ctx_compute =*/
|
| 199 |
-
/*inputs =*/
|
| 200 |
-
/*logits =*/
|
| 201 |
/*loss_type =*/ loss_type,
|
| 202 |
/*build_type =*/ GGML_OPT_BUILD_TYPE_OPT,
|
| 203 |
/*opt_period =*/ 1,
|
|
@@ -266,195 +312,246 @@ static ggml_cgraph * dup_graph(ggml_context * ctx, ggml_cgraph * src) {
|
|
| 266 |
return dst;
|
| 267 |
}
|
| 268 |
|
| 269 |
-
static void
|
| 270 |
-
GGML_ASSERT(
|
| 271 |
-
|
| 272 |
-
return;
|
| 273 |
-
}
|
| 274 |
-
|
| 275 |
-
ggml_backend_sched_reset(opt_ctx->backend_sched); // clear allocation of previous graph
|
| 276 |
-
|
| 277 |
-
{
|
| 278 |
-
ggml_init_params params = {
|
| 279 |
-
/*.mem_size =*/ ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE,
|
| 280 |
-
/*.mem_buffer =*/ nullptr,
|
| 281 |
-
/*.no_alloc =*/ true,
|
| 282 |
-
};
|
| 283 |
-
ggml_free(opt_ctx->ctx_copy);
|
| 284 |
-
opt_ctx->ctx_copy = ggml_init(params);
|
| 285 |
-
}
|
| 286 |
-
|
| 287 |
-
opt_ctx->allocated_graph_copy = dup_graph(opt_ctx->ctx_copy, graph);
|
| 288 |
-
|
| 289 |
-
ggml_backend_sched_alloc_graph(opt_ctx->backend_sched, opt_ctx->allocated_graph_copy);
|
| 290 |
-
opt_ctx->allocated_graph = graph;
|
| 291 |
-
}
|
| 292 |
-
|
| 293 |
-
ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
|
| 294 |
-
ggml_opt_context_t result = new struct ggml_opt_context;
|
| 295 |
-
result->backend_sched = params.backend_sched;
|
| 296 |
-
result->ctx_compute = params.ctx_compute;
|
| 297 |
-
result->inputs = params.inputs;
|
| 298 |
-
result->outputs = params.outputs;
|
| 299 |
-
result->opt_period = params.opt_period;
|
| 300 |
-
result->get_opt_pars = params.get_opt_pars;
|
| 301 |
-
result->get_opt_pars_ud = params.get_opt_pars_ud;
|
| 302 |
-
|
| 303 |
-
GGML_ASSERT(result->inputs->data && "the inputs must be allocated statically");
|
| 304 |
-
GGML_ASSERT(result->opt_period >= 1);
|
| 305 |
-
|
| 306 |
-
const bool accumulate = params.build_type == GGML_OPT_BUILD_TYPE_GRAD ||
|
| 307 |
-
(params.build_type == GGML_OPT_BUILD_TYPE_OPT && result->opt_period > 1);
|
| 308 |
|
| 309 |
-
|
| 310 |
-
|
| 311 |
|
| 312 |
-
|
| 313 |
-
|
| 314 |
|
| 315 |
int n_param = 0;
|
| 316 |
-
for (int i = 0; i <
|
| 317 |
-
|
|
|
|
| 318 |
n_param++;
|
| 319 |
}
|
|
|
|
| 320 |
}
|
| 321 |
|
| 322 |
-
{
|
| 323 |
// The static context is used for:
|
| 324 |
-
// - gradients (1 tensor per param if using gradient accumulation)
|
| 325 |
// - optimizer momenta (2 tensors per param)
|
| 326 |
-
// - labels
|
| 327 |
-
// - loss
|
| 328 |
-
// - pred
|
| 329 |
-
// - ncorrect (2 tensors).
|
| 330 |
-
|
| 331 |
-
const size_t
|
|
|
|
|
|
|
|
|
|
| 332 |
struct ggml_init_params params = {
|
| 333 |
/*.mem_size =*/ size_meta,
|
| 334 |
/*.mem_buffer =*/ nullptr,
|
| 335 |
/*.no_alloc =*/ true,
|
| 336 |
};
|
| 337 |
-
|
| 338 |
}
|
|
|
|
|
|
|
| 339 |
{
|
| 340 |
-
// The
|
| 341 |
-
//
|
|
|
|
| 342 |
const size_t size_meta = 1 * ggml_tensor_overhead();
|
| 343 |
struct ggml_init_params params = {
|
| 344 |
/*.mem_size =*/ size_meta,
|
| 345 |
/*.mem_buffer =*/ nullptr,
|
| 346 |
/*.no_alloc =*/ true,
|
| 347 |
};
|
| 348 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 349 |
}
|
| 350 |
|
|
|
|
| 351 |
|
| 352 |
-
switch (
|
| 353 |
case GGML_OPT_LOSS_TYPE_MEAN: {
|
| 354 |
-
|
| 355 |
-
ggml_set_name(
|
| 356 |
-
const float scale = 1.0f / (
|
| 357 |
-
|
| 358 |
-
ggml_set_name(
|
| 359 |
-
|
| 360 |
break;
|
| 361 |
}
|
| 362 |
case GGML_OPT_LOSS_TYPE_SUM: {
|
| 363 |
-
|
| 364 |
-
ggml_set_name(
|
| 365 |
-
|
| 366 |
break;
|
| 367 |
}
|
| 368 |
case GGML_OPT_LOSS_TYPE_CROSS_ENTROPY: {
|
| 369 |
-
|
| 370 |
-
ggml_set_input(
|
| 371 |
-
ggml_set_name(
|
| 372 |
-
|
| 373 |
-
ggml_set_name(
|
| 374 |
-
if (
|
| 375 |
-
|
| 376 |
-
ggml_set_name(
|
| 377 |
}
|
| 378 |
-
|
| 379 |
break;
|
| 380 |
}
|
| 381 |
case GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR: {
|
| 382 |
-
|
| 383 |
-
ggml_set_input(
|
| 384 |
-
ggml_set_name(
|
| 385 |
-
|
| 386 |
-
ggml_set_name(
|
| 387 |
-
|
| 388 |
-
ggml_set_name(
|
| 389 |
-
|
| 390 |
-
ggml_set_name(
|
| 391 |
-
const float scale = 1.0f / (
|
| 392 |
-
|
| 393 |
-
ggml_set_name(
|
| 394 |
-
|
| 395 |
break;
|
| 396 |
}
|
| 397 |
}
|
| 398 |
-
ggml_set_output(
|
| 399 |
-
ggml_set_loss(
|
| 400 |
-
ggml_build_forward_expand(
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 406 |
|
| 407 |
-
if (
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
|
|
|
| 414 |
}
|
| 415 |
|
| 416 |
-
if (
|
| 417 |
-
|
| 418 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 419 |
}
|
| 420 |
|
| 421 |
// gb_grad == graph backward gradients, forward pass, then backward pass to calculate gradients.
|
| 422 |
-
|
| 423 |
-
ggml_build_backward_expand(
|
| 424 |
|
| 425 |
-
if (
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
|
|
|
|
|
|
|
|
|
| 429 |
}
|
| 430 |
|
| 431 |
-
GGML_ASSERT(
|
| 432 |
|
| 433 |
// gb_opt == graph backward optimize, forward pass, then backward pass to calculate gradients, then optimizer step.
|
| 434 |
-
|
| 435 |
|
| 436 |
-
|
| 437 |
-
ggml_set_input(
|
| 438 |
-
ggml_set_name(
|
| 439 |
|
| 440 |
-
for (int i =
|
| 441 |
-
struct ggml_tensor * node =
|
| 442 |
-
struct ggml_tensor * grad = ggml_graph_get_grad(
|
| 443 |
|
| 444 |
-
if (node->flags & GGML_TENSOR_FLAG_PARAM) {
|
| 445 |
-
struct ggml_tensor * m =
|
| 446 |
-
struct ggml_tensor * v =
|
| 447 |
-
struct ggml_tensor * opt_step = ggml_opt_step_adamw(
|
| 448 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 449 |
}
|
| 450 |
}
|
| 451 |
|
| 452 |
-
|
| 453 |
-
|
|
|
|
|
|
|
|
|
|
| 454 |
|
| 455 |
-
|
|
|
|
| 456 |
|
| 457 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 458 |
|
| 459 |
return result;
|
| 460 |
}
|
|
@@ -464,9 +561,9 @@ void ggml_opt_free(ggml_opt_context_t opt_ctx) {
|
|
| 464 |
return;
|
| 465 |
}
|
| 466 |
ggml_backend_buffer_free(opt_ctx->buf_static);
|
| 467 |
-
ggml_backend_buffer_free(opt_ctx->
|
| 468 |
ggml_free(opt_ctx->ctx_static);
|
| 469 |
-
ggml_free(opt_ctx->
|
| 470 |
delete opt_ctx;
|
| 471 |
}
|
| 472 |
|
|
@@ -582,8 +679,79 @@ void ggml_opt_result_accuracy(ggml_opt_result_t result, double * accuracy, doubl
|
|
| 582 |
|
| 583 |
// ====== Computation ======
|
| 584 |
|
| 585 |
-
|
| 586 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 587 |
struct ggml_opt_optimizer_params opt_pars = opt_ctx->get_opt_pars(opt_ctx->get_opt_pars_ud);
|
| 588 |
|
| 589 |
GGML_ASSERT(opt_pars.adamw.alpha > 0.0f);
|
|
@@ -609,9 +777,19 @@ static void ggml_opt_eval_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph,
|
|
| 609 |
adamw_par_data[6] = beta2h;
|
| 610 |
}
|
| 611 |
|
| 612 |
-
ggml_opt_alloc_graph(opt_ctx, graph);
|
| 613 |
ggml_backend_sched_graph_compute(opt_ctx->backend_sched, opt_ctx->allocated_graph_copy);
|
| 614 |
opt_ctx->iter += opt_ctx->allocated_graph == opt_ctx->gb_opt;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 615 |
|
| 616 |
if (!result) {
|
| 617 |
return;
|
|
@@ -635,12 +813,14 @@ static void ggml_opt_eval_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph,
|
|
| 635 |
ggml_backend_tensor_get(opt_ctx->loss, &loss, 0, ggml_nbytes(opt_ctx->loss));
|
| 636 |
result->loss.push_back(loss);
|
| 637 |
|
| 638 |
-
|
| 639 |
-
|
| 640 |
-
|
| 641 |
-
|
|
|
|
|
|
|
| 642 |
|
| 643 |
-
if (!opt_ctx->
|
| 644 |
result->ncorrect = -1;
|
| 645 |
return;
|
| 646 |
}
|
|
@@ -652,26 +832,6 @@ static void ggml_opt_eval_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph,
|
|
| 652 |
result->ncorrect += ncorrect;
|
| 653 |
}
|
| 654 |
|
| 655 |
-
void ggml_opt_forward(ggml_opt_context_t opt_ctx, ggml_opt_result * result) {
|
| 656 |
-
ggml_opt_eval_graph(opt_ctx, opt_ctx->gf, result);
|
| 657 |
-
}
|
| 658 |
-
|
| 659 |
-
void ggml_opt_forward_backward(ggml_opt_context_t opt_ctx, ggml_opt_result * result) {
|
| 660 |
-
if (opt_ctx->opt_period == 1) {
|
| 661 |
-
ggml_opt_eval_graph(opt_ctx, opt_ctx->gb_opt, result);
|
| 662 |
-
return;
|
| 663 |
-
}
|
| 664 |
-
|
| 665 |
-
const int32_t opt_i_next = (opt_ctx->opt_i + 1) % opt_ctx->opt_period;
|
| 666 |
-
if (opt_i_next == 0) {
|
| 667 |
-
ggml_opt_eval_graph(opt_ctx, opt_ctx->gb_opt, result);
|
| 668 |
-
ggml_opt_reset(opt_ctx, /*optimizer =*/ false);
|
| 669 |
-
} else {
|
| 670 |
-
ggml_opt_eval_graph(opt_ctx, opt_ctx->gb_grad, result);
|
| 671 |
-
}
|
| 672 |
-
opt_ctx->opt_i = opt_i_next;
|
| 673 |
-
}
|
| 674 |
-
|
| 675 |
// ====== High-Level Functions ======
|
| 676 |
|
| 677 |
void ggml_opt_epoch(
|
|
@@ -700,16 +860,18 @@ void ggml_opt_epoch(
|
|
| 700 |
int64_t ibatch = 0;
|
| 701 |
int64_t t_loop_start = ggml_time_us();
|
| 702 |
for (; ibatch < ibatch_split; ++ibatch) {
|
|
|
|
| 703 |
ggml_opt_dataset_get_batch(dataset, inputs, labels, ibatch);
|
| 704 |
-
|
| 705 |
if (callback_train) {
|
| 706 |
callback_train(true, opt_ctx, dataset, result_train, ibatch+1, ibatch_split, t_loop_start);
|
| 707 |
}
|
| 708 |
}
|
| 709 |
t_loop_start = ggml_time_us();
|
| 710 |
for (; ibatch < nbatches; ++ibatch) {
|
|
|
|
| 711 |
ggml_opt_dataset_get_batch(dataset, inputs, labels, ibatch);
|
| 712 |
-
|
| 713 |
if (callback_eval) {
|
| 714 |
callback_eval(false, opt_ctx, dataset, result_eval, ibatch+1-ibatch_split, nbatches-ibatch_split, t_loop_start);
|
| 715 |
}
|
|
@@ -726,13 +888,26 @@ void ggml_opt_epoch_callback_progress_bar(
|
|
| 726 |
int64_t t_start_us) {
|
| 727 |
fprintf(stderr, "%s[", train ? "train: " : "val: ");
|
| 728 |
|
| 729 |
-
|
|
|
|
|
|
|
| 730 |
for (int64_t j = 0; j < bar_length; ++j) {
|
| 731 |
-
|
| 732 |
-
|
| 733 |
-
|
| 734 |
-
|
| 735 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 736 |
} else {
|
| 737 |
fprintf(stderr, " ");
|
| 738 |
}
|
|
@@ -764,8 +939,8 @@ void ggml_opt_epoch_callback_progress_bar(
|
|
| 764 |
const int64_t t_eta_m = t_eta_s / 60;
|
| 765 |
t_eta_s -= t_eta_m * 60;
|
| 766 |
|
| 767 |
-
fprintf(stderr, "
|
| 768 |
-
"t=%02" PRId64 ":%02" PRId64 ":%02" PRId64 "
|
| 769 |
idata, idata_max, loss, loss_unc, 100.0*accuracy, 100.0*accuracy_unc,
|
| 770 |
t_ibatch_h, t_ibatch_m, t_ibatch_s, t_eta_h, t_eta_m, t_eta_s);
|
| 771 |
if (ibatch == ibatch_max) {
|
|
@@ -806,7 +981,10 @@ void ggml_opt_fit(
|
|
| 806 |
|
| 807 |
int64_t epoch = 1;
|
| 808 |
|
| 809 |
-
ggml_opt_params params = ggml_opt_default_params(backend_sched,
|
|
|
|
|
|
|
|
|
|
| 810 |
params.opt_period = opt_period;
|
| 811 |
params.get_opt_pars = get_opt_pars;
|
| 812 |
params.get_opt_pars_ud = &epoch;
|
|
|
|
| 28 |
};
|
| 29 |
|
| 30 |
struct ggml_opt_context {
|
| 31 |
+
ggml_backend_sched_t backend_sched = nullptr;
|
| 32 |
+
ggml_cgraph * allocated_graph = nullptr;
|
| 33 |
+
ggml_cgraph * allocated_graph_copy = nullptr;
|
| 34 |
+
struct ggml_context * ctx_static = nullptr;
|
| 35 |
+
struct ggml_context * ctx_cpu = nullptr;
|
| 36 |
+
struct ggml_context * ctx_compute = nullptr;
|
| 37 |
+
struct ggml_context * ctx_copy = nullptr;
|
| 38 |
+
ggml_backend_buffer_t buf_static = nullptr;
|
| 39 |
+
ggml_backend_buffer_t buf_cpu = nullptr;
|
| 40 |
+
std::mt19937 rng;
|
| 41 |
+
enum ggml_opt_loss_type loss_type;
|
| 42 |
+
enum ggml_opt_build_type build_type;
|
| 43 |
+
enum ggml_opt_build_type build_type_alloc;
|
| 44 |
|
| 45 |
struct ggml_tensor * inputs = nullptr;
|
| 46 |
struct ggml_tensor * outputs = nullptr;
|
|
|
|
| 53 |
struct ggml_cgraph * gf = nullptr;
|
| 54 |
struct ggml_cgraph * gb_grad = nullptr;
|
| 55 |
struct ggml_cgraph * gb_opt = nullptr;
|
| 56 |
+
bool static_graphs = false;
|
| 57 |
+
bool eval_ready = false;
|
| 58 |
+
std::vector<struct ggml_tensor *> grad_accs;
|
| 59 |
+
std::vector<struct ggml_tensor *> grad_m;
|
| 60 |
+
std::vector<struct ggml_tensor *> grad_v;
|
| 61 |
|
| 62 |
int64_t iter = 1;
|
| 63 |
int32_t opt_period = 1;
|
|
|
|
| 81 |
|
| 82 |
// ====== Dataset ======
|
| 83 |
|
| 84 |
+
ggml_opt_dataset_t ggml_opt_dataset_init(
|
| 85 |
+
enum ggml_type type_data,
|
| 86 |
+
enum ggml_type type_label,
|
| 87 |
+
int64_t ne_datapoint,
|
| 88 |
+
int64_t ne_label,
|
| 89 |
+
int64_t ndata,
|
| 90 |
+
int64_t ndata_shard) {
|
| 91 |
GGML_ASSERT(ne_datapoint > 0);
|
| 92 |
GGML_ASSERT(ne_label >= 0);
|
| 93 |
GGML_ASSERT(ndata > 0);
|
|
|
|
| 106 |
result->ctx = ggml_init(params);
|
| 107 |
}
|
| 108 |
|
| 109 |
+
result->data = ggml_new_tensor_2d(result->ctx, type_data, ne_datapoint, ndata);
|
| 110 |
result->nbs_data = ggml_nbytes(result->data) * ndata_shard/ndata;
|
| 111 |
|
| 112 |
if (ne_label > 0) {
|
| 113 |
+
result->labels = ggml_new_tensor_2d(result->ctx, type_label, ne_label, ndata);
|
| 114 |
result->nbs_labels = ggml_nbytes(result->labels) * ndata_shard/ndata;
|
| 115 |
} else {
|
| 116 |
result->labels = nullptr;
|
|
|
|
| 133 |
delete dataset;
|
| 134 |
}
|
| 135 |
|
| 136 |
+
int64_t ggml_opt_dataset_ndata(ggml_opt_dataset_t dataset) {
|
| 137 |
+
return dataset->ndata;
|
| 138 |
+
}
|
| 139 |
+
|
| 140 |
struct ggml_tensor * ggml_opt_dataset_data(ggml_opt_dataset_t dataset) {
|
| 141 |
return dataset->data;
|
| 142 |
}
|
|
|
|
| 162 |
GGML_ASSERT( data_batch && ggml_is_contiguous(data_batch));
|
| 163 |
GGML_ASSERT(!labels_batch || ggml_is_contiguous(labels_batch));
|
| 164 |
GGML_ASSERT((labels_batch == nullptr) == (dataset->labels == nullptr));
|
| 165 |
+
GGML_ASSERT( data_batch->type == dataset->data->type);
|
| 166 |
+
GGML_ASSERT(!labels_batch || labels_batch->type == dataset->labels->type);
|
| 167 |
|
| 168 |
const size_t nb_data_batch = ggml_nbytes(data_batch);
|
| 169 |
GGML_ASSERT(nb_data_batch % dataset->nbs_data == 0);
|
|
|
|
| 191 |
}
|
| 192 |
}
|
| 193 |
|
| 194 |
+
void ggml_opt_dataset_get_batch_host(ggml_opt_dataset_t dataset, void * data_batch, size_t nb_data_batch, void * labels_batch, int64_t ibatch) {
|
| 195 |
+
GGML_ASSERT((labels_batch == nullptr) == (dataset->labels == nullptr));
|
| 196 |
+
GGML_ASSERT(nb_data_batch % dataset->nbs_data == 0);
|
| 197 |
+
|
| 198 |
+
const int64_t shards_per_batch = nb_data_batch / dataset->nbs_data;
|
| 199 |
+
|
| 200 |
+
GGML_ASSERT((ibatch + 1)*shards_per_batch <= int64_t(dataset->permutation.size()));
|
| 201 |
+
|
| 202 |
+
for (int64_t ishard_batch = 0; ishard_batch < shards_per_batch; ++ishard_batch) {
|
| 203 |
+
const int64_t ishard = dataset->permutation[ibatch*shards_per_batch + ishard_batch];
|
| 204 |
+
|
| 205 |
+
const char * ptr_data = (const char *) dataset->data->data + ishard *dataset->nbs_data;
|
| 206 |
+
char * ptr_data_batch = (char *) data_batch + ishard_batch*dataset->nbs_data;
|
| 207 |
+
memcpy(ptr_data_batch, ptr_data, dataset->nbs_data);
|
| 208 |
+
|
| 209 |
+
if (!labels_batch) {
|
| 210 |
+
continue;
|
| 211 |
+
}
|
| 212 |
+
|
| 213 |
+
const char * ptr_labels = (const char *) dataset->labels->data + ishard *dataset->nbs_labels;
|
| 214 |
+
char * ptr_labels_batch = (char *) labels_batch + ishard_batch*dataset->nbs_labels;
|
| 215 |
+
memcpy(ptr_labels_batch, ptr_labels, dataset->nbs_labels);
|
| 216 |
+
}
|
| 217 |
+
}
|
| 218 |
+
|
| 219 |
// ====== Model / Context ======
|
| 220 |
|
| 221 |
struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata) {
|
|
|
|
| 232 |
return result;
|
| 233 |
}
|
| 234 |
|
| 235 |
+
struct ggml_opt_optimizer_params ggml_opt_get_constant_optimizer_params(void * userdata) {
|
| 236 |
+
return *((struct ggml_opt_optimizer_params *) userdata);
|
| 237 |
+
}
|
| 238 |
+
|
| 239 |
struct ggml_opt_params ggml_opt_default_params(
|
| 240 |
ggml_backend_sched_t backend_sched,
|
|
|
|
|
|
|
|
|
|
| 241 |
enum ggml_opt_loss_type loss_type) {
|
| 242 |
return {
|
| 243 |
/*backend_sched =*/ backend_sched,
|
| 244 |
+
/*ctx_compute =*/ nullptr,
|
| 245 |
+
/*inputs =*/ nullptr,
|
| 246 |
+
/*logits =*/ nullptr,
|
| 247 |
/*loss_type =*/ loss_type,
|
| 248 |
/*build_type =*/ GGML_OPT_BUILD_TYPE_OPT,
|
| 249 |
/*opt_period =*/ 1,
|
|
|
|
| 312 |
return dst;
|
| 313 |
}
|
| 314 |
|
| 315 |
+
static void ggml_opt_build(ggml_opt_context_t opt_ctx) {
|
| 316 |
+
GGML_ASSERT(opt_ctx->ctx_compute && "no compute context set, either use static graphs or set one with ggml_opt_prepare_alloc");
|
| 317 |
+
GGML_ASSERT((!opt_ctx->static_graphs || opt_ctx->inputs->data) && "when using static graphs the inputs must be allocated statically");
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 318 |
|
| 319 |
+
const bool accumulate = opt_ctx->build_type_alloc >= GGML_OPT_BUILD_TYPE_GRAD &&
|
| 320 |
+
!(opt_ctx->static_graphs && opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_OPT && opt_ctx->opt_period == 1);
|
| 321 |
|
| 322 |
+
ggml_set_input(opt_ctx->inputs);
|
| 323 |
+
ggml_set_output(opt_ctx->outputs);
|
| 324 |
|
| 325 |
int n_param = 0;
|
| 326 |
+
for (int i = 0; i < opt_ctx->gf->n_nodes; ++i) {
|
| 327 |
+
const struct ggml_tensor * node = opt_ctx->gf->nodes[i];
|
| 328 |
+
if (node->flags & GGML_TENSOR_FLAG_PARAM) {
|
| 329 |
n_param++;
|
| 330 |
}
|
| 331 |
+
GGML_ASSERT(!(node->flags & GGML_TENSOR_FLAG_LOSS) && "support for extra loss terms not implemented");
|
| 332 |
}
|
| 333 |
|
| 334 |
+
if (!opt_ctx->ctx_static) {
|
| 335 |
// The static context is used for:
|
| 336 |
+
// - gradients (1 per loss, 1 tensor per param if using gradient accumulation)
|
| 337 |
// - optimizer momenta (2 tensors per param)
|
| 338 |
+
// - labels (if using static graphs)
|
| 339 |
+
// - loss (if using static graphs, up to 5 tensors)
|
| 340 |
+
// - pred (if using static graphs)
|
| 341 |
+
// - ncorrect (if using static graphs, 2 tensors).
|
| 342 |
+
constexpr size_t n_loss = 1;
|
| 343 |
+
const size_t tensors_per_param = (accumulate ? 1 : 0) +
|
| 344 |
+
(opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_OPT ? 2 : 0);
|
| 345 |
+
const size_t tensors_const = opt_ctx->static_graphs ? 9 : 0;
|
| 346 |
+
const size_t size_meta = (n_loss + tensors_per_param*n_param + tensors_const) * ggml_tensor_overhead();
|
| 347 |
struct ggml_init_params params = {
|
| 348 |
/*.mem_size =*/ size_meta,
|
| 349 |
/*.mem_buffer =*/ nullptr,
|
| 350 |
/*.no_alloc =*/ true,
|
| 351 |
};
|
| 352 |
+
opt_ctx->ctx_static = ggml_init(params);
|
| 353 |
}
|
| 354 |
+
GGML_ASSERT(opt_ctx->build_type <= opt_ctx->build_type_alloc);
|
| 355 |
+
|
| 356 |
{
|
| 357 |
+
// The cpu context is allocated statically if using static graphs, dynamically otherwise.
|
| 358 |
+
// It is used for:
|
| 359 |
+
// - optimizer parameters (1 shared for all optimizer invocations)
|
| 360 |
const size_t size_meta = 1 * ggml_tensor_overhead();
|
| 361 |
struct ggml_init_params params = {
|
| 362 |
/*.mem_size =*/ size_meta,
|
| 363 |
/*.mem_buffer =*/ nullptr,
|
| 364 |
/*.no_alloc =*/ true,
|
| 365 |
};
|
| 366 |
+
ggml_free(opt_ctx->ctx_cpu);
|
| 367 |
+
opt_ctx->ctx_cpu = ggml_init(params);
|
| 368 |
+
|
| 369 |
+
ggml_backend_buffer_free(opt_ctx->buf_cpu);
|
| 370 |
+
opt_ctx->buf_cpu = nullptr;
|
| 371 |
}
|
| 372 |
|
| 373 |
+
struct ggml_context * ctx_results = opt_ctx->static_graphs ? opt_ctx->ctx_static : opt_ctx->ctx_compute;
|
| 374 |
|
| 375 |
+
switch (opt_ctx->loss_type) {
|
| 376 |
case GGML_OPT_LOSS_TYPE_MEAN: {
|
| 377 |
+
opt_ctx->loss = ggml_sum(ctx_results, opt_ctx->outputs);
|
| 378 |
+
ggml_set_name(opt_ctx->loss, "loss_sum");
|
| 379 |
+
const float scale = 1.0f / (opt_ctx->opt_period * ggml_nelements(opt_ctx->outputs));
|
| 380 |
+
opt_ctx->loss = ggml_scale(ctx_results, opt_ctx->loss, scale);
|
| 381 |
+
ggml_set_name(opt_ctx->loss, "loss_mean");
|
| 382 |
+
opt_ctx->loss_per_datapoint = true;
|
| 383 |
break;
|
| 384 |
}
|
| 385 |
case GGML_OPT_LOSS_TYPE_SUM: {
|
| 386 |
+
opt_ctx->loss = ggml_sum(ctx_results, opt_ctx->outputs);
|
| 387 |
+
ggml_set_name(opt_ctx->loss, "loss_sum");
|
| 388 |
+
opt_ctx->loss_per_datapoint = false;
|
| 389 |
break;
|
| 390 |
}
|
| 391 |
case GGML_OPT_LOSS_TYPE_CROSS_ENTROPY: {
|
| 392 |
+
opt_ctx->labels = ggml_dup_tensor(ctx_results, opt_ctx->outputs);
|
| 393 |
+
ggml_set_input(opt_ctx->labels);
|
| 394 |
+
ggml_set_name(opt_ctx->labels, "labels");
|
| 395 |
+
opt_ctx->loss = ggml_cross_entropy_loss(ctx_results, opt_ctx->outputs, opt_ctx->labels);
|
| 396 |
+
ggml_set_name(opt_ctx->loss, "loss_cross_entropy");
|
| 397 |
+
if (opt_ctx->opt_period > 1) {
|
| 398 |
+
opt_ctx->loss = ggml_scale(ctx_results, opt_ctx->loss, 1.0f / opt_ctx->opt_period);
|
| 399 |
+
ggml_set_name(opt_ctx->loss, "loss_cross_entropy_scaled");
|
| 400 |
}
|
| 401 |
+
opt_ctx->loss_per_datapoint = true;
|
| 402 |
break;
|
| 403 |
}
|
| 404 |
case GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR: {
|
| 405 |
+
opt_ctx->labels = ggml_dup_tensor(ctx_results, opt_ctx->outputs);
|
| 406 |
+
ggml_set_input(opt_ctx->labels);
|
| 407 |
+
ggml_set_name(opt_ctx->labels, "labels");
|
| 408 |
+
opt_ctx->loss = ggml_sub(ctx_results, opt_ctx->outputs, opt_ctx->labels);
|
| 409 |
+
ggml_set_name(opt_ctx->loss, "loss_error");
|
| 410 |
+
opt_ctx->loss = ggml_sqr(ctx_results, opt_ctx->loss);
|
| 411 |
+
ggml_set_name(opt_ctx->loss, "loss_squared_error");
|
| 412 |
+
opt_ctx->loss = ggml_sum(ctx_results, opt_ctx->loss);
|
| 413 |
+
ggml_set_name(opt_ctx->loss, "loss_sum_squared_error");
|
| 414 |
+
const float scale = 1.0f / (opt_ctx->opt_period * ggml_nelements(opt_ctx->outputs));
|
| 415 |
+
opt_ctx->loss = ggml_scale(ctx_results, opt_ctx->loss, scale);
|
| 416 |
+
ggml_set_name(opt_ctx->loss, "loss_mean_squared_error");
|
| 417 |
+
opt_ctx->loss_per_datapoint = true;
|
| 418 |
break;
|
| 419 |
}
|
| 420 |
}
|
| 421 |
+
ggml_set_output(opt_ctx->loss);
|
| 422 |
+
ggml_set_loss(opt_ctx->loss);
|
| 423 |
+
ggml_build_forward_expand(opt_ctx->gf, opt_ctx->loss);
|
| 424 |
+
|
| 425 |
+
if (opt_ctx->loss_type == GGML_OPT_LOSS_TYPE_CROSS_ENTROPY) {
|
| 426 |
+
opt_ctx->pred = ggml_argmax(ctx_results, opt_ctx->outputs);
|
| 427 |
+
ggml_set_name(opt_ctx->pred, "pred");
|
| 428 |
+
ggml_set_output(opt_ctx->pred);
|
| 429 |
+
ggml_build_forward_expand(opt_ctx->gf, opt_ctx->pred);
|
| 430 |
+
|
| 431 |
+
opt_ctx->ncorrect = ggml_count_equal(ctx_results, opt_ctx->pred, ggml_argmax(ctx_results, opt_ctx->labels));
|
| 432 |
+
ggml_set_name(opt_ctx->ncorrect, "ncorrect");
|
| 433 |
+
ggml_set_output(opt_ctx->ncorrect);
|
| 434 |
+
ggml_build_forward_expand(opt_ctx->gf, opt_ctx->ncorrect);
|
| 435 |
+
}
|
| 436 |
|
| 437 |
+
if (opt_ctx->buf_static) {
|
| 438 |
+
if (opt_ctx->build_type == GGML_OPT_BUILD_TYPE_FORWARD) {
|
| 439 |
+
return;
|
| 440 |
+
}
|
| 441 |
+
} else if (opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_FORWARD) {
|
| 442 |
+
opt_ctx->buf_static = ggml_backend_alloc_ctx_tensors(
|
| 443 |
+
opt_ctx->ctx_static, ggml_backend_sched_get_backend(opt_ctx->backend_sched, 0));
|
| 444 |
+
return;
|
| 445 |
}
|
| 446 |
|
| 447 |
+
if (opt_ctx->grad_accs.empty()) {
|
| 448 |
+
GGML_ASSERT(opt_ctx->build_type_alloc >= GGML_OPT_BUILD_TYPE_GRAD);
|
| 449 |
+
|
| 450 |
+
const int n_nodes = opt_ctx->gf->n_nodes;
|
| 451 |
+
opt_ctx->grad_accs.resize(n_nodes);
|
| 452 |
+
for (int i = 0; i < n_nodes; ++i) {
|
| 453 |
+
ggml_tensor * node = opt_ctx->gf->nodes[i];
|
| 454 |
+
if ((accumulate && (node->flags & GGML_TENSOR_FLAG_PARAM)) || (node->flags & GGML_TENSOR_FLAG_LOSS)) {
|
| 455 |
+
opt_ctx->grad_accs[i] = ggml_new_tensor(opt_ctx->ctx_static, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne);
|
| 456 |
+
} else {
|
| 457 |
+
opt_ctx->grad_accs[i] = nullptr;
|
| 458 |
+
}
|
| 459 |
+
}
|
| 460 |
+
|
| 461 |
+
if (opt_ctx->build_type_alloc >= GGML_OPT_BUILD_TYPE_OPT) {
|
| 462 |
+
opt_ctx->grad_m.resize(n_nodes);
|
| 463 |
+
opt_ctx->grad_v.resize(n_nodes);
|
| 464 |
+
for (int i = 0; i < n_nodes; ++i) {
|
| 465 |
+
ggml_tensor * node = opt_ctx->gf->nodes[i];
|
| 466 |
+
if (node->flags & GGML_TENSOR_FLAG_PARAM) {
|
| 467 |
+
opt_ctx->grad_m[i] = ggml_new_tensor(opt_ctx->ctx_static, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne);
|
| 468 |
+
opt_ctx->grad_v[i] = ggml_new_tensor(opt_ctx->ctx_static, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne);
|
| 469 |
+
} else {
|
| 470 |
+
opt_ctx->grad_m[i] = nullptr;
|
| 471 |
+
opt_ctx->grad_v[i] = nullptr;
|
| 472 |
+
}
|
| 473 |
+
}
|
| 474 |
+
}
|
| 475 |
}
|
| 476 |
|
| 477 |
// gb_grad == graph backward gradients, forward pass, then backward pass to calculate gradients.
|
| 478 |
+
opt_ctx->gb_grad = ggml_graph_dup(opt_ctx->ctx_compute, opt_ctx->gf, /*force_grads =*/ true);
|
| 479 |
+
ggml_build_backward_expand(opt_ctx->ctx_compute, opt_ctx->gb_grad, opt_ctx->grad_accs.data());
|
| 480 |
|
| 481 |
+
if (opt_ctx->buf_static) {
|
| 482 |
+
if (opt_ctx->build_type == GGML_OPT_BUILD_TYPE_GRAD) {
|
| 483 |
+
return;
|
| 484 |
+
}
|
| 485 |
+
} else if (opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_GRAD) {
|
| 486 |
+
opt_ctx->buf_static = ggml_backend_alloc_ctx_tensors(opt_ctx->ctx_static, ggml_backend_sched_get_backend(opt_ctx->backend_sched, 0));
|
| 487 |
+
ggml_graph_reset(opt_ctx->gb_grad);
|
| 488 |
}
|
| 489 |
|
| 490 |
+
GGML_ASSERT(opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_OPT);
|
| 491 |
|
| 492 |
// gb_opt == graph backward optimize, forward pass, then backward pass to calculate gradients, then optimizer step.
|
| 493 |
+
opt_ctx->gb_opt = ggml_graph_dup(opt_ctx->ctx_compute, opt_ctx->gb_grad, /*force_grads =*/ true);
|
| 494 |
|
| 495 |
+
opt_ctx->adamw_params = ggml_new_tensor_1d(opt_ctx->ctx_cpu, GGML_TYPE_F32, 7);
|
| 496 |
+
ggml_set_input(opt_ctx->adamw_params);
|
| 497 |
+
ggml_set_name(opt_ctx->adamw_params, "adamw_params");
|
| 498 |
|
| 499 |
+
for (int i = opt_ctx->gf->n_nodes-1; i >= 0; --i) {
|
| 500 |
+
struct ggml_tensor * node = opt_ctx->gb_opt->nodes[i];
|
| 501 |
+
struct ggml_tensor * grad = ggml_graph_get_grad(opt_ctx->gb_opt, node);
|
| 502 |
|
| 503 |
+
if (grad && (node->flags & GGML_TENSOR_FLAG_PARAM)) {
|
| 504 |
+
struct ggml_tensor * m = opt_ctx->grad_m[i];
|
| 505 |
+
struct ggml_tensor * v = opt_ctx->grad_v[i];
|
| 506 |
+
struct ggml_tensor * opt_step = ggml_opt_step_adamw(opt_ctx->ctx_compute, node, grad, m, v, opt_ctx->adamw_params);
|
| 507 |
+
|
| 508 |
+
ggml_set_name(m, (std::string("AdamW m for ") + std::string(node->name)).c_str());
|
| 509 |
+
ggml_set_name(v, (std::string("AdamW v for ") + std::string(node->name)).c_str());
|
| 510 |
+
ggml_set_name(opt_step, (std::string("AdamW step for ") + std::string(node->name)).c_str());
|
| 511 |
+
|
| 512 |
+
ggml_build_forward_expand(opt_ctx->gb_opt, opt_step);
|
| 513 |
}
|
| 514 |
}
|
| 515 |
|
| 516 |
+
if (!opt_ctx->buf_static) {
|
| 517 |
+
opt_ctx->buf_static = ggml_backend_alloc_ctx_tensors(
|
| 518 |
+
opt_ctx->ctx_static, ggml_backend_sched_get_backend(opt_ctx->backend_sched, 0));
|
| 519 |
+
ggml_graph_reset(opt_ctx->gb_opt);
|
| 520 |
+
}
|
| 521 |
|
| 522 |
+
opt_ctx->buf_cpu = ggml_backend_alloc_ctx_tensors_from_buft(opt_ctx->ctx_cpu, ggml_backend_cpu_buffer_type());
|
| 523 |
+
}
|
| 524 |
|
| 525 |
+
ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
|
| 526 |
+
ggml_opt_context_t result = new struct ggml_opt_context;
|
| 527 |
+
result->backend_sched = params.backend_sched;
|
| 528 |
+
result->ctx_compute = params.ctx_compute;
|
| 529 |
+
result->loss_type = params.loss_type;
|
| 530 |
+
result->build_type = params.build_type;
|
| 531 |
+
result->build_type_alloc = params.build_type;
|
| 532 |
+
result->inputs = params.inputs;
|
| 533 |
+
result->outputs = params.outputs;
|
| 534 |
+
result->opt_period = params.opt_period;
|
| 535 |
+
result->get_opt_pars = params.get_opt_pars;
|
| 536 |
+
result->get_opt_pars_ud = params.get_opt_pars_ud;
|
| 537 |
+
|
| 538 |
+
GGML_ASSERT(result->opt_period >= 1);
|
| 539 |
+
|
| 540 |
+
result->static_graphs = result->ctx_compute;
|
| 541 |
+
|
| 542 |
+
if (!result->static_graphs) {
|
| 543 |
+
GGML_ASSERT(!result->inputs);
|
| 544 |
+
GGML_ASSERT(!result->outputs);
|
| 545 |
+
return result;
|
| 546 |
+
}
|
| 547 |
+
|
| 548 |
+
GGML_ASSERT(result->inputs);
|
| 549 |
+
GGML_ASSERT(result->outputs);
|
| 550 |
+
|
| 551 |
+
result->gf = ggml_new_graph_custom(result->ctx_compute, GGML_DEFAULT_GRAPH_SIZE, /*grads =*/ true); // Forward pass.
|
| 552 |
+
ggml_build_forward_expand(result->gf, result->outputs);
|
| 553 |
+
|
| 554 |
+
ggml_opt_build(result);
|
| 555 |
|
| 556 |
return result;
|
| 557 |
}
|
|
|
|
| 561 |
return;
|
| 562 |
}
|
| 563 |
ggml_backend_buffer_free(opt_ctx->buf_static);
|
| 564 |
+
ggml_backend_buffer_free(opt_ctx->buf_cpu);
|
| 565 |
ggml_free(opt_ctx->ctx_static);
|
| 566 |
+
ggml_free(opt_ctx->ctx_cpu);
|
| 567 |
delete opt_ctx;
|
| 568 |
}
|
| 569 |
|
|
|
|
| 679 |
|
| 680 |
// ====== Computation ======
|
| 681 |
|
| 682 |
+
void ggml_opt_prepare_alloc(
|
| 683 |
+
ggml_opt_context_t opt_ctx,
|
| 684 |
+
struct ggml_context * ctx_compute,
|
| 685 |
+
struct ggml_cgraph * gf,
|
| 686 |
+
struct ggml_tensor * inputs,
|
| 687 |
+
struct ggml_tensor * outputs) {
|
| 688 |
+
GGML_ASSERT(!opt_ctx->static_graphs);
|
| 689 |
+
opt_ctx->ctx_compute = ctx_compute;
|
| 690 |
+
opt_ctx->gf = gf;
|
| 691 |
+
opt_ctx->inputs = inputs;
|
| 692 |
+
opt_ctx->outputs = outputs;
|
| 693 |
+
}
|
| 694 |
+
|
| 695 |
+
void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward) {
|
| 696 |
+
GGML_ASSERT(!opt_ctx->eval_ready);
|
| 697 |
+
if (opt_ctx->build_type == GGML_OPT_BUILD_TYPE_OPT && opt_ctx->opt_period > 1 && opt_ctx->opt_i == 0) {
|
| 698 |
+
ggml_graph_reset(opt_ctx->gb_grad);
|
| 699 |
+
}
|
| 700 |
+
if (backward) {
|
| 701 |
+
const int32_t opt_i_next = (opt_ctx->opt_i + 1) % opt_ctx->opt_period;
|
| 702 |
+
opt_ctx->build_type = opt_i_next == 0 ? GGML_OPT_BUILD_TYPE_OPT : GGML_OPT_BUILD_TYPE_GRAD;
|
| 703 |
+
} else {
|
| 704 |
+
opt_ctx->build_type = GGML_OPT_BUILD_TYPE_FORWARD;
|
| 705 |
+
}
|
| 706 |
+
|
| 707 |
+
if (!opt_ctx->static_graphs) {
|
| 708 |
+
ggml_opt_build(opt_ctx);
|
| 709 |
+
}
|
| 710 |
+
|
| 711 |
+
struct ggml_cgraph * graph = nullptr;
|
| 712 |
+
switch (opt_ctx->build_type) {
|
| 713 |
+
case GGML_OPT_BUILD_TYPE_FORWARD: {
|
| 714 |
+
graph = opt_ctx->gf;
|
| 715 |
+
} break;
|
| 716 |
+
case GGML_OPT_BUILD_TYPE_GRAD: {
|
| 717 |
+
graph = opt_ctx->gb_grad;
|
| 718 |
+
} break;
|
| 719 |
+
case GGML_OPT_BUILD_TYPE_OPT: {
|
| 720 |
+
graph = opt_ctx->gb_opt;
|
| 721 |
+
} break;
|
| 722 |
+
}
|
| 723 |
+
GGML_ASSERT(graph);
|
| 724 |
+
|
| 725 |
+
if (opt_ctx->allocated_graph == graph) {
|
| 726 |
+
opt_ctx->eval_ready = true;
|
| 727 |
+
return;
|
| 728 |
+
}
|
| 729 |
+
|
| 730 |
+
ggml_backend_sched_reset(opt_ctx->backend_sched); // clear allocation of previous graph
|
| 731 |
+
|
| 732 |
+
if (opt_ctx->static_graphs) {
|
| 733 |
+
ggml_init_params params = {
|
| 734 |
+
/*.mem_size =*/ graph->size*ggml_tensor_overhead() + ggml_graph_overhead_custom(graph->size, graph->grads),
|
| 735 |
+
/*.mem_buffer =*/ nullptr,
|
| 736 |
+
/*.no_alloc =*/ true,
|
| 737 |
+
};
|
| 738 |
+
ggml_free(opt_ctx->ctx_copy);
|
| 739 |
+
opt_ctx->ctx_copy = ggml_init(params);
|
| 740 |
+
|
| 741 |
+
opt_ctx->allocated_graph_copy = dup_graph(opt_ctx->ctx_copy, graph);
|
| 742 |
+
} else {
|
| 743 |
+
opt_ctx->allocated_graph_copy = graph;
|
| 744 |
+
}
|
| 745 |
+
|
| 746 |
+
ggml_backend_sched_alloc_graph(opt_ctx->backend_sched, opt_ctx->allocated_graph_copy);
|
| 747 |
+
opt_ctx->allocated_graph = graph;
|
| 748 |
+
|
| 749 |
+
opt_ctx->eval_ready = true;
|
| 750 |
+
}
|
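The graph picked above follows directly from opt_period: for backward steps, all but the last step of each period run the gradient-accumulation graph (gb_grad) and the last one runs the full optimizer graph (gb_opt). A standalone illustration of that schedule for opt_period = 4:

```cpp
// Standalone illustration of the graph schedule implied by (opt_i + 1) % opt_period.
#include <cstdio>

int main() {
    const int opt_period = 4;
    for (int opt_i = 0; opt_i < 2*opt_period; ++opt_i) {
        const int  opt_i_next      = (opt_i + 1) % opt_period;
        const bool apply_optimizer = opt_i_next == 0;
        std::printf("backward step %d -> %s\n", opt_i,
                    apply_optimizer ? "gb_opt (accumulate + AdamW step)" : "gb_grad (accumulate only)");
    }
    return 0;
}
```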
| 751 |
+
|
| 752 |
+
void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result) {
|
| 753 |
+
GGML_ASSERT(opt_ctx->eval_ready);
|
| 754 |
+
if (opt_ctx->allocated_graph == opt_ctx->gb_opt) {
|
| 755 |
struct ggml_opt_optimizer_params opt_pars = opt_ctx->get_opt_pars(opt_ctx->get_opt_pars_ud);
|
| 756 |
|
| 757 |
GGML_ASSERT(opt_pars.adamw.alpha > 0.0f);
|
|
|
|
| 777 |
adamw_par_data[6] = beta2h;
|
| 778 |
}
|
| 779 |
|
|
|
|
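For context on the beta1h/beta2h values staged above: these look like the usual AdamW bias-correction factors folded into per-graph parameters. A hedged sketch of how they would be derived from the user-supplied hyperparameters under the standard formulation; the actual staging in ggml-opt.cpp may differ in detail.

```cpp
// Sketch of standard AdamW bias correction, assuming `iter` counts optimizer steps starting at 1.
#include <cmath>
#include <cstdint>

struct adamw_staged { float alpha, beta1, beta2, eps, wd, beta1h, beta2h; };

static adamw_staged stage_adamw(float alpha, float beta1, float beta2, float eps, float wd, int64_t iter) {
    adamw_staged s = {alpha, beta1, beta2, eps, wd, 0.0f, 0.0f};
    s.beta1h = alpha / (1.0f - std::pow(beta1, (float) iter)); // step size with first-moment bias correction
    s.beta2h = 1.0f  / (1.0f - std::pow(beta2, (float) iter)); // second-moment bias correction
    return s;
}
```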
| 780 |
ggml_backend_sched_graph_compute(opt_ctx->backend_sched, opt_ctx->allocated_graph_copy);
|
| 781 |
opt_ctx->iter += opt_ctx->allocated_graph == opt_ctx->gb_opt;
|
| 782 |
+
opt_ctx->opt_i = (opt_ctx->opt_i + 1) % opt_ctx->opt_period;
|
| 783 |
+
|
| 784 |
+
if (!opt_ctx->static_graphs) {
|
| 785 |
+
opt_ctx->gf = nullptr;
|
| 786 |
+
opt_ctx->gb_grad = nullptr;
|
| 787 |
+
opt_ctx->gb_opt = nullptr;
|
| 788 |
+
opt_ctx->allocated_graph = nullptr;
|
| 789 |
+
opt_ctx->allocated_graph_copy = nullptr;
|
| 790 |
+
}
|
| 791 |
+
|
| 792 |
+
opt_ctx->eval_ready = false;
|
| 793 |
|
| 794 |
if (!result) {
|
| 795 |
return;
|
|
|
|
| 813 |
ggml_backend_tensor_get(opt_ctx->loss, &loss, 0, ggml_nbytes(opt_ctx->loss));
|
| 814 |
result->loss.push_back(loss);
|
| 815 |
|
| 816 |
+
if (opt_ctx->pred) {
|
| 817 |
+
GGML_ASSERT(opt_ctx->pred->type == GGML_TYPE_I32);
|
| 818 |
+
std::vector<int32_t> pred(ndata);
|
| 819 |
+
ggml_backend_tensor_get(opt_ctx->pred, pred.data(), 0, ggml_nbytes(opt_ctx->pred));
|
| 820 |
+
result->pred.insert(result->pred.end(), pred.begin(), pred.end());
|
| 821 |
+
}
|
| 822 |
|
| 823 |
+
if (!opt_ctx->ncorrect || result->ncorrect < 0) {
|
| 824 |
result->ncorrect = -1;
|
| 825 |
return;
|
| 826 |
}
|
|
|
|
| 832 |
result->ncorrect += ncorrect;
|
| 833 |
}
|
| 834 |
|
|
|
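After a batch has been folded into a ggml_opt_result as above, the aggregated statistics can be read back through the result accessors declared in ggml-opt.h; a small sketch, with accessor names and out-parameter order to be double-checked against the header.

```cpp
// Sketch: reading aggregated statistics accumulated by ggml_opt_eval.
#include <cinttypes>
#include <cstdint>
#include <cstdio>
#include "ggml-opt.h"

static void report(ggml_opt_result_t result) {
    int64_t ndata = 0;
    double  loss = 0.0, loss_unc = 0.0, acc = 0.0, acc_unc = 0.0;
    ggml_opt_result_ndata   (result, &ndata);
    ggml_opt_result_loss    (result, &loss, &loss_unc);
    ggml_opt_result_accuracy(result, &acc,  &acc_unc); // only meaningful if labels/predictions were tracked
    fprintf(stderr, "ndata=%" PRId64 " loss=%lf±%lf acc=%.2lf%%\n", ndata, loss, loss_unc, 100.0*acc);
}
```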
| 835 |
// ====== High-Level Functions ======
|
| 836 |
|
| 837 |
void ggml_opt_epoch(
|
|
|
|
| 860 |
int64_t ibatch = 0;
|
| 861 |
int64_t t_loop_start = ggml_time_us();
|
| 862 |
for (; ibatch < ibatch_split; ++ibatch) {
|
| 863 |
+
ggml_opt_alloc(opt_ctx, /*backward =*/ true);
|
| 864 |
ggml_opt_dataset_get_batch(dataset, inputs, labels, ibatch);
|
| 865 |
+
ggml_opt_eval(opt_ctx, result_train);
|
| 866 |
if (callback_train) {
|
| 867 |
callback_train(true, opt_ctx, dataset, result_train, ibatch+1, ibatch_split, t_loop_start);
|
| 868 |
}
|
| 869 |
}
|
| 870 |
t_loop_start = ggml_time_us();
|
| 871 |
for (; ibatch < nbatches; ++ibatch) {
|
| 872 |
+
ggml_opt_alloc(opt_ctx, /*backward =*/ false);
|
| 873 |
ggml_opt_dataset_get_batch(dataset, inputs, labels, ibatch);
|
| 874 |
+
ggml_opt_eval(opt_ctx, result_eval);
|
| 875 |
if (callback_eval) {
|
| 876 |
callback_eval(false, opt_ctx, dataset, result_eval, ibatch+1-ibatch_split, nbatches-ibatch_split, t_loop_start);
|
| 877 |
}
|
|
|
|
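A sketch of driving the epoch helper above with a 90/10 train/validation split; the split fraction is illustrative, the dataset/result accessors (ggml_opt_dataset_ndata, ggml_opt_result_init/free) are assumed from the public header, and the callback is the progress-bar helper defined further down in this file.

```cpp
// Sketch: one epoch over a dataset with the first 90% used for training (fraction is illustrative).
static void run_epoch(ggml_opt_context_t opt_ctx, ggml_opt_dataset_t dataset) {
    ggml_opt_result_t result_train = ggml_opt_result_init();
    ggml_opt_result_t result_eval  = ggml_opt_result_init();

    const int64_t idata_split = ggml_opt_dataset_ndata(dataset) * 9 / 10;

    ggml_opt_epoch(opt_ctx, dataset, result_train, result_eval, idata_split,
                   ggml_opt_epoch_callback_progress_bar,  // printed during the training part
                   ggml_opt_epoch_callback_progress_bar); // printed during the validation part

    ggml_opt_result_free(result_train);
    ggml_opt_result_free(result_eval);
}
```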
| 888 |
int64_t t_start_us) {
|
| 889 |
fprintf(stderr, "%s[", train ? "train: " : "val: ");
|
| 890 |
|
| 891 |
+
// The progress bar consists of partially filled blocks; Unicode has 8 separate fill levels.
|
| 892 |
+
constexpr int64_t bar_length = 8;
|
| 893 |
+
const int64_t ibatch8 = 8 * ibatch;
|
| 894 |
for (int64_t j = 0; j < bar_length; ++j) {
|
| 895 |
+
if (ibatch_max * (8*j + 8) / bar_length < ibatch8) {
|
| 896 |
+
fprintf(stderr, "\u2588"); // full block
|
| 897 |
+
} else if (ibatch_max * (8*j + 7) / bar_length < ibatch8) {
|
| 898 |
+
fprintf(stderr, "\u2589"); // 7/8 filled
|
| 899 |
+
} else if (ibatch_max * (8*j + 6) / bar_length < ibatch8) {
|
| 900 |
+
fprintf(stderr, "\u258A"); // 6/8 filled
|
| 901 |
+
} else if (ibatch_max * (8*j + 5) / bar_length < ibatch8) {
|
| 902 |
+
fprintf(stderr, "\u258B"); // 5/8 filled
|
| 903 |
+
} else if (ibatch_max * (8*j + 4) / bar_length < ibatch8) {
|
| 904 |
+
fprintf(stderr, "\u258C"); // 4/8 filled
|
| 905 |
+
} else if (ibatch_max * (8*j + 3) / bar_length < ibatch8) {
|
| 906 |
+
fprintf(stderr, "\u258D"); // 3/8 filled
|
| 907 |
+
} else if (ibatch_max * (8*j + 2) / bar_length < ibatch8) {
|
| 908 |
+
fprintf(stderr, "\u258E"); // 2/8 filled
|
| 909 |
+
} else if (ibatch_max * (8*j + 1) / bar_length < ibatch8) {
|
| 910 |
+
fprintf(stderr, "\u258F"); // 1/8 filled
|
| 911 |
} else {
|
| 912 |
fprintf(stderr, " ");
|
| 913 |
}
|
|
|
|
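As a worked example of the fill rule above: cell j is drawn at fill level k/8 for the largest k with ibatch_max*(8*j + k)/bar_length < 8*ibatch, so at 30% progress (ibatch = 30, ibatch_max = 100) the first two cells come out full, the third 3/8 filled, and the rest blank. A standalone recomputation:

```cpp
// Standalone recomputation of the progress-bar fill levels at 30% progress.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t bar_length = 8;
    const int64_t ibatch_max = 100, ibatch = 30;
    const int64_t ibatch8    = 8 * ibatch;
    for (int64_t j = 0; j < bar_length; ++j) {
        int fill = 0; // eighths of cell j drawn as filled
        for (int64_t k = 8; k >= 1; --k) {
            if (ibatch_max * (8*j + k) / bar_length < ibatch8) { fill = (int) k; break; }
        }
        std::printf("cell %d: %d/8\n", (int) j, fill);
    }
    return 0; // prints 8/8, 8/8, 3/8, then 0/8 for cells 3..7
}
```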
| 939 |
const int64_t t_eta_m = t_eta_s / 60;
|
| 940 |
t_eta_s -= t_eta_m * 60;
|
| 941 |
|
| 942 |
+
fprintf(stderr, "] data=%07" PRId64 "/%07" PRId64 " loss=%.5lf±%.5lf acc=%.2lf±%.2lf%% "
|
| 943 |
+
"t=%02" PRId64 ":%02" PRId64 ":%02" PRId64 " ETA=%02" PRId64 ":%02" PRId64 ":%02" PRId64 " \r",
|
| 944 |
idata, idata_max, loss, loss_unc, 100.0*accuracy, 100.0*accuracy_unc,
|
| 945 |
t_ibatch_h, t_ibatch_m, t_ibatch_s, t_eta_h, t_eta_m, t_eta_s);
|
| 946 |
if (ibatch == ibatch_max) {
|
|
|
|
| 981 |
|
| 982 |
int64_t epoch = 1;
|
| 983 |
|
| 984 |
+
ggml_opt_params params = ggml_opt_default_params(backend_sched, loss_type);
|
| 985 |
+
params.ctx_compute = ctx_compute;
|
| 986 |
+
params.inputs = inputs;
|
| 987 |
+
params.outputs = outputs;
|
| 988 |
params.opt_period = opt_period;
|
| 989 |
params.get_opt_pars = get_opt_pars;
|
| 990 |
params.get_opt_pars_ud = &epoch;
|
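Because ggml_opt_fit passes &epoch as get_opt_pars_ud, a natural use of the callback is an epoch-dependent learning-rate schedule. A hedged sketch; the base rate and decay factor are made up, and only the default-params helper plus the adamw.alpha field referenced elsewhere in this file are assumed.

```cpp
// Sketch only: epoch-based learning-rate decay through the get_opt_pars callback.
#include <cmath>
#include <cstdint>
#include "ggml-opt.h"

static struct ggml_opt_optimizer_params lr_schedule(void * userdata) {
    const int64_t epoch = *(const int64_t *) userdata; // ggml_opt_fit passes &epoch here
    struct ggml_opt_optimizer_params pars = ggml_opt_get_default_optimizer_params(nullptr);
    pars.adamw.alpha = 1e-3f * std::pow(0.9f, (float) (epoch - 1)); // 10% decay per epoch (illustrative)
    return pars;
}
// A caller would set params.get_opt_pars = lr_schedule and point get_opt_pars_ud at its epoch counter.
```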
ggml/src/ggml.c
CHANGED
|
@@ -5499,7 +5499,7 @@ static void ggml_compute_backward(
|
|
| 5499 |
// tensor = src0 * 1 + src1 * 0
|
| 5500 |
if (src0_needs_grads) {
|
| 5501 |
// dsrc0 = dtensor * 1
|
| 5502 |
-
ggml_add_or_set(ctx, cgraph, isrc0, grad);
|
| 5503 |
}
|
| 5504 |
if (src1_needs_grads) {
|
| 5505 |
// dsrc1 = dtensor * 0 -> noop
|
|
@@ -5780,10 +5780,9 @@ void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor *
|
|
| 5780 |
}
|
| 5781 |
|
| 5782 |
void ggml_build_backward_expand(
|
| 5783 |
-
struct ggml_context * ctx_static,
|
| 5784 |
-
struct ggml_context * ctx_compute,
|
| 5785 |
-
struct ggml_cgraph * cgraph,
|
| 5786 |
-
bool accumulate) {
|
| 5787 |
GGML_ASSERT(cgraph->n_nodes > 0);
|
| 5788 |
GGML_ASSERT(cgraph->grads);
|
| 5789 |
GGML_ASSERT(cgraph->grad_accs);
|
|
@@ -5856,21 +5855,24 @@ void ggml_build_backward_expand(
|
|
| 5856 |
GGML_ASSERT(!node->view_src || node->op == GGML_OP_CPY || node->op == GGML_OP_VIEW ||
|
| 5857 |
node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE);
|
| 5858 |
|
| 5859 |
-
const size_t igrad = ggml_hash_find(&cgraph->visited_hash_set, node);
|
| 5860 |
-
GGML_ASSERT(igrad != GGML_HASHSET_FULL);
|
| 5861 |
-
GGML_ASSERT(ggml_bitset_get(cgraph->visited_hash_set.used, igrad));
|
| 5862 |
-
if ((accumulate && (node->flags & GGML_TENSOR_FLAG_PARAM)) || (node->flags & GGML_TENSOR_FLAG_LOSS)) {
|
| 5863 |
-
cgraph->grad_accs[igrad] = ggml_dup_tensor(ctx_static, node);
|
| 5864 |
-
cgraph->grads[igrad] = cgraph->grad_accs[igrad];
|
| 5865 |
-
ggml_format_name(cgraph->grad_accs[igrad], "grad acc for %s", node->name);
|
| 5866 |
}
|
| 5867 |
-
grads_needed[igrad] = true;
|
| 5868 |
}
|
| 5869 |
|
| 5870 |
for (int i = n_nodes_f - 1; i >= 0; --i) {
|
| 5871 |
// inplace operations to add gradients are not created by ggml_compute_backward except for gradient accumulation
|
| 5872 |
// use allocator to automatically make inplace operations
|
| 5873 |
-
ggml_compute_backward(ctx_compute, cgraph, i, grads_needed);
|
| 5874 |
}
|
| 5875 |
|
| 5876 |
free(grads_needed);
|
|
@@ -6016,8 +6018,8 @@ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
|
|
| 6016 |
}
|
| 6017 |
}
|
| 6018 |
|
| 6019 |
-
struct ggml_cgraph * ggml_graph_dup(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
|
| 6020 |
-
struct ggml_cgraph * result = ggml_new_graph_custom(ctx, cgraph->size, cgraph->grads != NULL);
|
| 6021 |
ggml_graph_cpy(cgraph, result);
|
| 6022 |
return result;
|
| 6023 |
}
|
|
@@ -6036,6 +6038,9 @@ struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
|
|
| 6036 |
}
|
| 6037 |
|
| 6038 |
void ggml_graph_reset(struct ggml_cgraph * cgraph) {
|
|
| 6039 |
GGML_ASSERT(cgraph->grads != NULL);
|
| 6040 |
|
| 6041 |
for (int i = 0; i < cgraph->n_nodes; i++) {
|
|
@@ -6345,8 +6350,8 @@ void ggml_set_output(struct ggml_tensor * tensor) {
|
|
| 6345 |
tensor->flags |= GGML_TENSOR_FLAG_OUTPUT;
|
| 6346 |
}
|
| 6347 |
|
| 6348 |
-
void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor) {
|
| 6349 |
-
|
| 6350 |
tensor->flags |= GGML_TENSOR_FLAG_PARAM;
|
| 6351 |
}
|
| 6352 |
|
|
|
|
| 5499 |
// tensor = src0 * 1 + src1 * 0
|
| 5500 |
if (src0_needs_grads) {
|
| 5501 |
// dsrc0 = dtensor * 1
|
| 5502 |
+
ggml_add_or_set(ctx, cgraph, isrc0, ggml_reshape(ctx, grad, src0));
|
| 5503 |
}
|
| 5504 |
if (src1_needs_grads) {
|
| 5505 |
// dsrc1 = dtensor * 0 -> noop
|
|
|
|
| 5780 |
}
|
| 5781 |
|
| 5782 |
void ggml_build_backward_expand(
|
| 5783 |
+
struct ggml_context * ctx,
|
| 5784 |
+
struct ggml_cgraph * cgraph,
|
| 5785 |
+
struct ggml_tensor ** grad_accs) {
|
|
|
|
| 5786 |
GGML_ASSERT(cgraph->n_nodes > 0);
|
| 5787 |
GGML_ASSERT(cgraph->grads);
|
| 5788 |
GGML_ASSERT(cgraph->grad_accs);
|
|
|
|
| 5855 |
GGML_ASSERT(!node->view_src || node->op == GGML_OP_CPY || node->op == GGML_OP_VIEW ||
|
| 5856 |
node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE);
|
| 5857 |
|
| 5858 |
+
const size_t ihash = ggml_hash_find(&cgraph->visited_hash_set, node);
|
| 5859 |
+
GGML_ASSERT(ihash != GGML_HASHSET_FULL);
|
| 5860 |
+
GGML_ASSERT(ggml_bitset_get(cgraph->visited_hash_set.used, ihash));
|
| 5861 |
+
if (grad_accs && grad_accs[i]) {
|
| 5862 |
+
cgraph->grad_accs[ihash] = grad_accs[i];
|
| 5863 |
+
cgraph->grads[ihash] = cgraph->grad_accs[ihash];
|
| 5864 |
+
} else if (node->flags & GGML_TENSOR_FLAG_LOSS) {
|
| 5865 |
+
// loss tensors always need a gradient accumulator
|
| 5866 |
+
cgraph->grad_accs[ihash] = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne);
|
| 5867 |
+
cgraph->grads[ihash] = cgraph->grad_accs[ihash];
|
| 5868 |
}
|
| 5869 |
+
grads_needed[ihash] = true;
|
| 5870 |
}
|
| 5871 |
|
| 5872 |
for (int i = n_nodes_f - 1; i >= 0; --i) {
|
| 5873 |
// inplace operations to add gradients are not created by ggml_compute_backward except for gradient accumulation
|
| 5874 |
// use allocator to automatically make inplace operations
|
| 5875 |
+
ggml_compute_backward(ctx, cgraph, i, grads_needed);
|
| 5876 |
}
|
| 5877 |
|
| 5878 |
free(grads_needed);
|
|
|
|
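To show the reworked API in context: parameters are now flagged with the context-free ggml_set_param before the forward expansion, and ggml_build_backward_expand takes an optional grad_accs array in place of the old accumulate flag (NULL means accumulators are created only for loss tensors, per the branch above). A minimal sketch with a toy one-matmul model; the sizes and ops are placeholders.

```cpp
// Sketch only: forward + backward graph construction with the updated API (toy model).
static struct ggml_cgraph * build_toy_graphs(struct ggml_context * ctx) {
    struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4);
    struct ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 1);
    ggml_set_param(w);  // new signature: no ggml_context argument
    ggml_set_input(x);

    struct ggml_tensor * loss = ggml_sum(ctx, ggml_mul_mat(ctx, w, x)); // toy scalar loss
    ggml_set_loss(loss);

    struct ggml_cgraph * gb = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, /*grads =*/ true);
    ggml_build_forward_expand(gb, loss);
    ggml_build_backward_expand(ctx, gb, /*grad_accs =*/ NULL);
    return gb;
}
```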
| 6018 |
}
|
| 6019 |
}
|
| 6020 |
|
| 6021 |
+
struct ggml_cgraph * ggml_graph_dup(struct ggml_context * ctx, struct ggml_cgraph * cgraph, bool force_grads) {
|
| 6022 |
+
struct ggml_cgraph * result = ggml_new_graph_custom(ctx, cgraph->size, cgraph->grads || force_grads);
|
| 6023 |
ggml_graph_cpy(cgraph, result);
|
| 6024 |
return result;
|
| 6025 |
}
|
|
|
|
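The new force_grads flag lets a caller duplicate a graph that was built without gradient bookkeeping into a copy whose grads/grad_accs arrays are allocated, and ggml_graph_reset now accepts NULL as a no-op, which is convenient when the gradient graph is created lazily. A short sketch; the contexts are assumed to be set up elsewhere.

```cpp
// Sketch: duplicating a forward-only graph with gradient metadata forced on.
static struct ggml_cgraph * copy_for_backward(struct ggml_context * ctx_copy, struct ggml_cgraph * gf) {
    // Even if gf was created with grads == NULL, the copy gets grads/grad_accs arrays allocated.
    struct ggml_cgraph * gb = ggml_graph_dup(ctx_copy, gf, /*force_grads =*/ true);
    ggml_graph_reset(NULL); // now a no-op instead of an assertion failure when no grad graph exists yet
    return gb;
}
```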
| 6038 |
}
|
| 6039 |
|
| 6040 |
void ggml_graph_reset(struct ggml_cgraph * cgraph) {
|
| 6041 |
+
if (!cgraph) {
|
| 6042 |
+
return;
|
| 6043 |
+
}
|
| 6044 |
GGML_ASSERT(cgraph->grads != NULL);
|
| 6045 |
|
| 6046 |
for (int i = 0; i < cgraph->n_nodes; i++) {
|
|
|
|
| 6350 |
tensor->flags |= GGML_TENSOR_FLAG_OUTPUT;
|
| 6351 |
}
|
| 6352 |
|
| 6353 |
+
void ggml_set_param(struct ggml_tensor * tensor) {
|
| 6354 |
+
GGML_ASSERT(tensor->op == GGML_OP_NONE);
|
| 6355 |
tensor->flags |= GGML_TENSOR_FLAG_PARAM;
|
| 6356 |
}
|
| 6357 |
|