ggml : sync with ggml repo (warning fixes + asserts)
ggml.c CHANGED
@@ -8245,8 +8245,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
     ggml_fp16_t * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
     ggml_fp16_t * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
     float * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
-#else
-    float * const wdata = params->wdata;
 #endif
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -8263,8 +8261,11 @@ static void ggml_compute_forward_mul_mat_f16_f32(
                     wdata[id++] = GGML_FP32_TO_FP16(*(float *) ((char *) src1->data + i03*nb13 + i02*nb12 + i01*nb11 + i00*nb10));
                 }
             }
+
+            assert(id*sizeof(ggml_fp16_t) <= params->wsize);
         }
 #else
+        float * const wdata = params->wdata;
         {
             size_t id = 0;
             for (int64_t i01 = 0; i01 < ne01; ++i01) {
@@ -8272,6 +8273,8 @@ static void ggml_compute_forward_mul_mat_f16_f32(
                     wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
                 }
             }
+
+            assert(id*sizeof(float) <= params->wsize);
         }
 #endif
 
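The three hunks above do two things in ggml_compute_forward_mul_mat_f16_f32: the float * const wdata declaration moves inside the #else branch, where it is actually used, and each conversion loop now ends with a debug-build check that the bytes written to the work buffer never exceed params->wsize. Below is a minimal standalone sketch of that guard pattern; it is illustrative only and uses plain floats rather than ggml's types and macros.

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

// Pack a 2D block row by row into a caller-provided scratch buffer and, in debug
// builds, verify that the number of elements written fits in the buffer size we
// were handed (the same invariant the new ggml asserts check against params->wsize).
static void pack_rows(const float * src, int nrows, int ncols,
                      float * wdata, size_t wsize) {
    size_t id = 0;
    for (int r = 0; r < nrows; ++r) {
        for (int c = 0; c < ncols; ++c) {
            wdata[id++] = src[r*ncols + c];
        }
    }
    assert(id*sizeof(float) <= wsize); // bytes written must not exceed the work buffer
}

int main(void) {
    float src[6] = {1, 2, 3, 4, 5, 6};
    float scratch[6];
    pack_rows(src, 2, 3, scratch, sizeof(scratch));
    printf("packed %d floats\n", 6);
    return 0;
}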
@@ -8537,7 +8540,10 @@ static void ggml_compute_forward_mul_mat_q_f32(
                 dequantize_row_q((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00);
                 id += ne00;
             }
+
+            assert(id*sizeof(float) <= params->wsize);
         }
+
         const float * x = wdata;
 #endif
 
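The hunk above adds the same kind of bound to the dequantization path in ggml_compute_forward_mul_mat_q_f32: id advances by ne00 floats per dequantized row, so after ne01 rows the work buffer must hold at least ne01*ne00 floats. A quick numeric check of that accounting, with hypothetical shapes (ne00, ne01, and wsize are made-up values, not taken from the commit):

#include <assert.h>
#include <stddef.h>

int main(void) {
    const int    ne00  = 4096;      // hypothetical row length (elements per row)
    const int    ne01  = 32;        // hypothetical number of rows
    const size_t wsize = 1024*1024; // hypothetical work-buffer size in bytes

    size_t id = 0;
    for (int i01 = 0; i01 < ne01; ++i01) {
        id += ne00;                 // one dequantized row of ne00 floats
    }

    // 32 * 4096 * sizeof(float) = 524288 bytes, which fits in the 1 MiB buffer
    assert(id*sizeof(float) <= wsize);
    return 0;
}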
@@ -9118,7 +9124,7 @@ static void ggml_compute_forward_alibi_f32(
     //const int nb3 = src0->nb[3];
 
     assert(nb0 == sizeof(float));
-    assert(ne1+n_past == ne0);
+    assert(ne1 + n_past == ne0); (void) n_past;
 
     // add alibi to src0 (KQ_scaled)
     const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
@@ -9179,7 +9185,7 @@ static void ggml_compute_forward_alibi_f16(
     //const int nb3 = src0->nb[3];
 
     assert(nb0 == sizeof(ggml_fp16_t));
-    assert(ne1+n_past == ne0);
+    assert(ne1 + n_past == ne0); (void) n_past;
 
     // add alibi to src0 (KQ_scaled)
     const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
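In both alibi hunks the assert is kept, but the variable it checks is also cast to void. This is consistent with the "warning fixes" in the commit title: when a release build defines NDEBUG, assert(...) expands to nothing, and a name referenced only inside an assert would otherwise trigger an unused warning. A minimal sketch of the pattern (the function and values here are hypothetical, not ggml code):

#include <assert.h>
#include <stdio.h>

static int kq_width(int ne0, int ne1, int n_past) {
    // n_past is only referenced inside the assert; the (void) cast keeps
    // -Wunused warnings quiet when the assert is compiled out by -DNDEBUG
    assert(ne1 + n_past == ne0); (void) n_past;
    return ne1;
}

int main(void) {
    printf("%d\n", kq_width(8, 5, 3));
    return 0;
}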
@@ -11571,12 +11577,12 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                     if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                         node->n_tasks = 1; // TODO: this actually is doing nothing
                                            //       the threads are still spinning
-#if defined(
+#if defined(GGML_USE_CUBLAS)
+                        // with cuBLAS, we need memory for the full 3D / 4D data of src1
+                        cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1);
+#else
                         // here we need memory just for single 2D matrix from src0
                         cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
-#else
-                        // with GPU, we need memory for the full 3D / 4D data
-                        cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*MAX(ggml_nelements(node->src1), ggml_nelements(node->src0));
 #endif
                     } else {
                         cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1);
@@ -11586,7 +11592,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 #endif
                 } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
                     cur = 0;
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
                     if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                         node->n_tasks = 1;
                     }
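The last two hunks change how ggml_graph_compute sizes the per-node work buffer for the F16 x F32 matrix-multiplication case, and let CLBlast builds take the single-threaded BLAS path as well. With cuBLAS the buffer has to stage all of src1 converted to F16 (GGML_TYPE_SIZE[GGML_TYPE_F16] * ggml_nelements(src1) bytes), while the CPU/BLAS path only needs one F32 2D matrix from src0. A back-of-the-envelope comparison with made-up shapes (not ggml code; uint16_t stands in for ggml_fp16_t):

#include <stdint.h>
#include <stdio.h>

int main(void) {
    // hypothetical shapes: src0 is 4096 x 4096, src1 is 4096 x 512 x 1 x 1
    const int64_t src0_rows = 4096, src0_cols = 4096;
    const int64_t src1_nelements = 4096LL * 512 * 1 * 1;

    const size_t cublas_bytes   = sizeof(uint16_t) * (size_t) src1_nelements;         // F16 copy of all of src1
    const size_t cpu_blas_bytes = sizeof(float)    * (size_t)(src0_rows * src0_cols); // one F32 matrix of src0

    printf("cuBLAS work buffer:   %zu bytes (%.1f MiB)\n", cublas_bytes,   cublas_bytes   / (1024.0*1024.0));
    printf("CPU BLAS work buffer: %zu bytes (%.1f MiB)\n", cpu_blas_bytes, cpu_blas_bytes / (1024.0*1024.0));
    return 0;
}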
ggml.h CHANGED
@@ -701,8 +701,8 @@ extern "C" {
             struct ggml_tensor * c1);
 
     // Mapping operations
-
-
+    typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
+    typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
 
     GGML_API struct ggml_tensor * ggml_map_unary_f32(
             struct ggml_context * ctx,
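The ggml.h hunk adds the callback typedefs used by the mapping operations. A hedged usage sketch follows: it assumes ggml_map_unary_f32 takes (ctx, a, fun) and that the callback arguments are (n, dst, src), which matches the typedef and the declaration that begins in the hunk but should be checked against the full header; my_square_f32 and the context size are made up for illustration.

#include "ggml.h"

// element-wise callback matching ggml_unary_op_f32_t: n elements, dst, src
static void my_square_f32(const int n, float * dst, const float * src) {
    for (int i = 0; i < n; ++i) {
        dst[i] = src[i] * src[i];
    }
}

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024, // made-up scratch size for the example
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    // the custom op is recorded in the graph; evaluating a graph that contains b
    // runs my_square_f32 over the rows of a
    struct ggml_tensor * b = ggml_map_unary_f32(ctx, a, my_square_f32);
    (void) b;

    ggml_free(ctx);
    return 0;
}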