Spaces:
Running
Running
Rémy O
committed on
Commit
·
ed46ad5
1
Parent(s):
1b7672d
vulkan: initial support for IQ4_XS quantization (llama/11501)
Browse files
- ggml/src/ggml-vulkan/ggml-vulkan.cpp +25 -0
- ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +1 -1
- ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +1 -1
- ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +37 -1
- ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +23 -0
- ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +34 -0
- ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +1 -1
- ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +1 -1
- ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +1 -1
- ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +20 -1
- ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +1 -1
- ggml/src/ggml-vulkan/vulkan-shaders/types.comp +23 -5
- ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +1 -0
ggml/src/ggml-vulkan/ggml-vulkan.cpp
CHANGED
|
@@ -1622,6 +1622,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1622 |
//CREATE_FA(GGML_TYPE_IQ2_S, iq2_s)
|
| 1623 |
//CREATE_FA(GGML_TYPE_IQ3_XXS, iq3_xxs)
|
| 1624 |
//CREATE_FA(GGML_TYPE_IQ3_S, iq3_s)
|
|
|
|
| 1625 |
CREATE_FA(GGML_TYPE_IQ4_NL, iq4_nl)
|
| 1626 |
#undef CREATE_FA
|
| 1627 |
|
|
@@ -1655,6 +1656,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1655 |
CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ2_S].f16acc, matmul_iq2_s_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
|
| 1656 |
CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ3_XXS].f16acc, matmul_iq3_xxs_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
|
| 1657 |
CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ3_S].f16acc, matmul_iq3_s_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
|
|
|
|
| 1658 |
CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
|
| 1659 |
|
| 1660 |
CREATE_MM2(pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_id_push_constants, 4)
|
|
@@ -1673,6 +1675,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1673 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f16acc, matmul_id_iq2_s_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
| 1674 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f16acc, matmul_id_iq3_xxs_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
| 1675 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f16acc, matmul_id_iq3_s_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
|
|
|
| 1676 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
| 1677 |
#undef CREATE_MM
|
| 1678 |
#undef CREATE_MM2
|
|
@@ -1726,6 +1729,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1726 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f16acc, matmul_iq2_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1727 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f16acc, matmul_iq3_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1728 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f16acc, matmul_iq3_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
|
|
|
| 1729 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1730 |
} else {
|
| 1731 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
|
@@ -1744,6 +1748,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1744 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f16acc, matmul_iq2_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1745 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f16acc, matmul_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1746 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f16acc, matmul_iq3_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
|
|
|
| 1747 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1748 |
}
|
| 1749 |
|
|
@@ -1770,6 +1775,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1770 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f16acc, matmul_id_iq2_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1771 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f16acc, matmul_id_iq3_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1772 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f16acc, matmul_id_iq3_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
|
|
|
| 1773 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1774 |
} else {
|
| 1775 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
|
@@ -1788,6 +1794,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1788 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f16acc, matmul_id_iq2_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1789 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f16acc, matmul_id_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1790 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f16acc, matmul_id_iq3_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
|
|
|
| 1791 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1792 |
}
|
| 1793 |
}
|
|
@@ -1837,6 +1844,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1837 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f16acc, matmul_iq2_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1838 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f16acc, matmul_iq3_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1839 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f16acc, matmul_iq3_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
|
|
|
| 1840 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1841 |
|
| 1842 |
// If there's not enough shared memory for row_ids and the result tile, don't create these pipelines.
|
|
@@ -1861,6 +1869,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1861 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f16acc, matmul_id_iq2_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1862 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f16acc, matmul_id_iq3_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1863 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f16acc, matmul_id_iq3_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
|
|
|
| 1864 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1865 |
}
|
| 1866 |
#undef CREATE_MM2
|
|
@@ -1902,6 +1911,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1902 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f32acc, matmul_iq2_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1903 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f32acc, matmul_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1904 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f32acc, matmul_iq3_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
|
|
|
| 1905 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f32acc, matmul_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1906 |
|
| 1907 |
// If there's not enough shared memory for row_ids and the result tile, don't create these pipelines.
|
|
@@ -1926,6 +1936,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1926 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f32acc, matmul_id_iq2_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1927 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f32acc, matmul_id_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1928 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f32acc, matmul_id_iq3_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
|
|
|
| 1929 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f32acc, matmul_id_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1930 |
}
|
| 1931 |
#undef CREATE_MM
|
|
@@ -1962,6 +1973,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1962 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ2_S][i], "mul_mat_vec_iq2_s_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq2_s_f32_f32_len, mul_mat_vec_iq2_s_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
| 1963 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ3_XXS][i], "mul_mat_vec_iq3_xxs_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq3_xxs_f32_f32_len, mul_mat_vec_iq3_xxs_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
| 1964 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ3_S][i], "mul_mat_vec_iq3_s_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq3_s_f32_f32_len, mul_mat_vec_iq3_s_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
|
|
|
| 1965 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq, i+1}, 1, true);
|
| 1966 |
|
| 1967 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f16_f32_"+std::to_string(i+1), mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2, i+1}, 1);
|
|
@@ -1981,6 +1993,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1981 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ2_S][i], "mul_mat_vec_iq2_s_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq2_s_f16_f32_len, mul_mat_vec_iq2_s_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
| 1982 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ3_XXS][i], "mul_mat_vec_iq3_xxs_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq3_xxs_f16_f32_len, mul_mat_vec_iq3_xxs_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
| 1983 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ3_S][i], "mul_mat_vec_iq3_s_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq3_s_f16_f32_len, mul_mat_vec_iq3_s_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
|
|
|
| 1984 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq, i+1}, 1, true);
|
| 1985 |
}
|
| 1986 |
|
|
@@ -2001,6 +2014,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 2001 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ2_S], "mul_mat_vec_id_iq2_s_f32", mul_mat_vec_id_iq2_s_f32_len, mul_mat_vec_id_iq2_s_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
| 2002 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ3_XXS], "mul_mat_vec_id_iq3_xxs_f32", mul_mat_vec_id_iq3_xxs_f32_len, mul_mat_vec_id_iq3_xxs_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
| 2003 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ3_S], "mul_mat_vec_id_iq3_s_f32", mul_mat_vec_id_iq3_s_f32_len, mul_mat_vec_id_iq3_s_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
|
|
|
| 2004 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq}, 1, true);
|
| 2005 |
|
| 2006 |
// dequant shaders
|
|
@@ -2020,6 +2034,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 2020 |
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ2_S], "dequant_iq2_s", dequant_iq2_s_len, dequant_iq2_s_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
|
| 2021 |
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ3_XXS], "dequant_iq3_xxs", dequant_iq3_xxs_len, dequant_iq3_xxs_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
|
| 2022 |
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ3_S], "dequant_iq3_s", dequant_iq3_s_len, dequant_iq3_s_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
|
|
|
|
| 2023 |
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ4_NL], "dequant_iq4_nl", dequant_iq4_nl_len, dequant_iq4_nl_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
|
| 2024 |
|
| 2025 |
// get_rows
|
|
@@ -2035,6 +2050,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 2035 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ2_S], "get_rows_iq2_s", get_rows_iq2_s_len, get_rows_iq2_s_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 2036 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ3_XXS], "get_rows_iq3_xxs", get_rows_iq3_xxs_len, get_rows_iq3_xxs_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 2037 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ3_S], "get_rows_iq3_s", get_rows_iq3_s_len, get_rows_iq3_s_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
|
|
|
| 2038 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl", get_rows_iq4_nl_len, get_rows_iq4_nl_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 2039 |
|
| 2040 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32", get_rows_f32_f32_len, get_rows_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
|
|
@@ -2049,6 +2065,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 2049 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ2_S], "get_rows_iq2_s_f32", get_rows_iq2_s_f32_len, get_rows_iq2_s_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 2050 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ3_XXS], "get_rows_iq3_xxs_f32", get_rows_iq3_xxs_f32_len, get_rows_iq3_xxs_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 2051 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ3_S], "get_rows_iq3_s_f32", get_rows_iq3_s_f32_len, get_rows_iq3_s_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
|
|
|
| 2052 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl_f32", get_rows_iq4_nl_f32_len, get_rows_iq4_nl_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 2053 |
|
| 2054 |
ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256 * 4, 1, 1}, {}, 1);
|
|
@@ -2995,6 +3012,7 @@ static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type
|
|
| 2995 |
case GGML_TYPE_IQ2_S:
|
| 2996 |
case GGML_TYPE_IQ3_XXS:
|
| 2997 |
case GGML_TYPE_IQ3_S:
|
|
|
|
| 2998 |
case GGML_TYPE_IQ4_NL:
|
| 2999 |
break;
|
| 3000 |
default:
|
|
@@ -3048,6 +3066,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
|
|
| 3048 |
case GGML_TYPE_IQ2_S:
|
| 3049 |
case GGML_TYPE_IQ3_XXS:
|
| 3050 |
case GGML_TYPE_IQ3_S:
|
|
|
|
| 3051 |
case GGML_TYPE_IQ4_NL:
|
| 3052 |
break;
|
| 3053 |
default:
|
|
@@ -3084,6 +3103,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
|
|
| 3084 |
case GGML_TYPE_IQ2_S:
|
| 3085 |
case GGML_TYPE_IQ3_XXS:
|
| 3086 |
case GGML_TYPE_IQ3_S:
|
|
|
|
| 3087 |
case GGML_TYPE_IQ4_NL:
|
| 3088 |
break;
|
| 3089 |
default:
|
|
@@ -3132,6 +3152,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_co
|
|
| 3132 |
case GGML_TYPE_IQ2_S:
|
| 3133 |
case GGML_TYPE_IQ3_XXS:
|
| 3134 |
case GGML_TYPE_IQ3_S:
|
|
|
|
| 3135 |
case GGML_TYPE_IQ4_NL:
|
| 3136 |
break;
|
| 3137 |
default:
|
|
@@ -3163,6 +3184,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context
|
|
| 3163 |
case GGML_TYPE_IQ2_S:
|
| 3164 |
case GGML_TYPE_IQ3_XXS:
|
| 3165 |
case GGML_TYPE_IQ3_S:
|
|
|
|
| 3166 |
case GGML_TYPE_IQ4_NL:
|
| 3167 |
break;
|
| 3168 |
default:
|
|
@@ -8037,6 +8059,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
|
| 8037 |
case GGML_TYPE_IQ2_S:
|
| 8038 |
case GGML_TYPE_IQ3_XXS:
|
| 8039 |
case GGML_TYPE_IQ3_S:
|
|
|
|
| 8040 |
case GGML_TYPE_IQ4_NL:
|
| 8041 |
break;
|
| 8042 |
default:
|
|
@@ -8110,6 +8133,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
|
| 8110 |
//case GGML_TYPE_IQ2_S:
|
| 8111 |
//case GGML_TYPE_IQ3_XXS:
|
| 8112 |
//case GGML_TYPE_IQ3_S:
|
|
|
|
| 8113 |
case GGML_TYPE_IQ4_NL:
|
| 8114 |
break;
|
| 8115 |
default:
|
|
@@ -8132,6 +8156,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
|
| 8132 |
case GGML_TYPE_IQ2_S:
|
| 8133 |
case GGML_TYPE_IQ3_XXS:
|
| 8134 |
case GGML_TYPE_IQ3_S:
|
|
|
|
| 8135 |
case GGML_TYPE_IQ4_NL:
|
| 8136 |
return true;
|
| 8137 |
default:
|
|
|
|
| 1622 |
//CREATE_FA(GGML_TYPE_IQ2_S, iq2_s)
|
| 1623 |
//CREATE_FA(GGML_TYPE_IQ3_XXS, iq3_xxs)
|
| 1624 |
//CREATE_FA(GGML_TYPE_IQ3_S, iq3_s)
|
| 1625 |
+
//CREATE_FA(GGML_TYPE_IQ4_XS, iq4_xs)
|
| 1626 |
CREATE_FA(GGML_TYPE_IQ4_NL, iq4_nl)
|
| 1627 |
#undef CREATE_FA
|
| 1628 |
|
|
|
|
| 1656 |
CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ2_S].f16acc, matmul_iq2_s_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
|
| 1657 |
CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ3_XXS].f16acc, matmul_iq3_xxs_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
|
| 1658 |
CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ3_S].f16acc, matmul_iq3_s_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
|
| 1659 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ4_XS].f16acc, matmul_iq4_xs_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
|
| 1660 |
CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
|
| 1661 |
|
| 1662 |
CREATE_MM2(pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_id_push_constants, 4)
|
|
|
|
| 1675 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f16acc, matmul_id_iq2_s_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
| 1676 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f16acc, matmul_id_iq3_xxs_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
| 1677 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f16acc, matmul_id_iq3_s_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
| 1678 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS].f16acc, matmul_id_iq4_xs_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
| 1679 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
| 1680 |
#undef CREATE_MM
|
| 1681 |
#undef CREATE_MM2
|
|
|
|
| 1729 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f16acc, matmul_iq2_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1730 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f16acc, matmul_iq3_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1731 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f16acc, matmul_iq3_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1732 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS].f16acc, matmul_iq4_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1733 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1734 |
} else {
|
| 1735 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
|
|
|
| 1748 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f16acc, matmul_iq2_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1749 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f16acc, matmul_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1750 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f16acc, matmul_iq3_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1751 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS].f16acc, matmul_iq4_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1752 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1753 |
}
|
| 1754 |
|
|
|
|
| 1775 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f16acc, matmul_id_iq2_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1776 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f16acc, matmul_id_iq3_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1777 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f16acc, matmul_id_iq3_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1778 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS].f16acc, matmul_id_iq4_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1779 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1780 |
} else {
|
| 1781 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
|
|
|
| 1794 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f16acc, matmul_id_iq2_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1795 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f16acc, matmul_id_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1796 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f16acc, matmul_id_iq3_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1797 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS].f16acc, matmul_id_iq4_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1798 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1799 |
}
|
| 1800 |
}
|
|
|
|
| 1844 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f16acc, matmul_iq2_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1845 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f16acc, matmul_iq3_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1846 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f16acc, matmul_iq3_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1847 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS].f16acc, matmul_iq4_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1848 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1849 |
|
| 1850 |
// If there's not enough shared memory for row_ids and the result tile, don't create these pipelines.
|
|
|
|
| 1869 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f16acc, matmul_id_iq2_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1870 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f16acc, matmul_id_iq3_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1871 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f16acc, matmul_id_iq3_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1872 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS].f16acc, matmul_id_iq4_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1873 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1874 |
}
|
| 1875 |
#undef CREATE_MM2
|
|
|
|
| 1911 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f32acc, matmul_iq2_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1912 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f32acc, matmul_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1913 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f32acc, matmul_iq3_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1914 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS].f32acc, matmul_iq4_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1915 |
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f32acc, matmul_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
| 1916 |
|
| 1917 |
// If there's not enough shared memory for row_ids and the result tile, don't create these pipelines.
|
|
|
|
| 1936 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f32acc, matmul_id_iq2_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1937 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f32acc, matmul_id_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1938 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f32acc, matmul_id_iq3_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1939 |
+
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS].f32acc, matmul_id_iq4_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1940 |
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f32acc, matmul_id_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
| 1941 |
}
|
| 1942 |
#undef CREATE_MM
|
|
|
|
| 1973 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ2_S][i], "mul_mat_vec_iq2_s_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq2_s_f32_f32_len, mul_mat_vec_iq2_s_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
| 1974 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ3_XXS][i], "mul_mat_vec_iq3_xxs_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq3_xxs_f32_f32_len, mul_mat_vec_iq3_xxs_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
| 1975 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ3_S][i], "mul_mat_vec_iq3_s_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq3_s_f32_f32_len, mul_mat_vec_iq3_s_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
| 1976 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_XS][i], "mul_mat_vec_iq4_xs_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq4_xs_f32_f32_len, mul_mat_vec_iq4_xs_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
| 1977 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq, i+1}, 1, true);
|
| 1978 |
|
| 1979 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f16_f32_"+std::to_string(i+1), mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2, i+1}, 1);
|
|
|
|
| 1993 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ2_S][i], "mul_mat_vec_iq2_s_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq2_s_f16_f32_len, mul_mat_vec_iq2_s_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
| 1994 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ3_XXS][i], "mul_mat_vec_iq3_xxs_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq3_xxs_f16_f32_len, mul_mat_vec_iq3_xxs_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
| 1995 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ3_S][i], "mul_mat_vec_iq3_s_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq3_s_f16_f32_len, mul_mat_vec_iq3_s_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
| 1996 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_XS][i], "mul_mat_vec_iq4_xs_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq4_xs_f16_f32_len, mul_mat_vec_iq4_xs_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
| 1997 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq, i+1}, 1, true);
|
| 1998 |
}
|
| 1999 |
|
|
|
|
| 2014 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ2_S], "mul_mat_vec_id_iq2_s_f32", mul_mat_vec_id_iq2_s_f32_len, mul_mat_vec_id_iq2_s_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
| 2015 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ3_XXS], "mul_mat_vec_id_iq3_xxs_f32", mul_mat_vec_id_iq3_xxs_f32_len, mul_mat_vec_id_iq3_xxs_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
| 2016 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ3_S], "mul_mat_vec_id_iq3_s_f32", mul_mat_vec_id_iq3_s_f32_len, mul_mat_vec_id_iq3_s_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
| 2017 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_XS], "mul_mat_vec_id_iq4_xs_f32", mul_mat_vec_id_iq4_xs_f32_len, mul_mat_vec_id_iq4_xs_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
| 2018 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq}, 1, true);
|
| 2019 |
|
| 2020 |
// dequant shaders
|
|
|
|
| 2034 |
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ2_S], "dequant_iq2_s", dequant_iq2_s_len, dequant_iq2_s_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
|
| 2035 |
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ3_XXS], "dequant_iq3_xxs", dequant_iq3_xxs_len, dequant_iq3_xxs_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
|
| 2036 |
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ3_S], "dequant_iq3_s", dequant_iq3_s_len, dequant_iq3_s_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
|
| 2037 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ4_XS], "dequant_iq4_xs", dequant_iq4_xs_len, dequant_iq4_xs_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
|
| 2038 |
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ4_NL], "dequant_iq4_nl", dequant_iq4_nl_len, dequant_iq4_nl_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
|
| 2039 |
|
| 2040 |
// get_rows
|
|
|
|
| 2050 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ2_S], "get_rows_iq2_s", get_rows_iq2_s_len, get_rows_iq2_s_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 2051 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ3_XXS], "get_rows_iq3_xxs", get_rows_iq3_xxs_len, get_rows_iq3_xxs_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 2052 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ3_S], "get_rows_iq3_s", get_rows_iq3_s_len, get_rows_iq3_s_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 2053 |
+
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_XS], "get_rows_iq4_xs", get_rows_iq4_xs_len, get_rows_iq4_xs_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 2054 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl", get_rows_iq4_nl_len, get_rows_iq4_nl_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 2055 |
|
| 2056 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32", get_rows_f32_f32_len, get_rows_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
|
|
|
|
| 2065 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ2_S], "get_rows_iq2_s_f32", get_rows_iq2_s_f32_len, get_rows_iq2_s_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 2066 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ3_XXS], "get_rows_iq3_xxs_f32", get_rows_iq3_xxs_f32_len, get_rows_iq3_xxs_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 2067 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ3_S], "get_rows_iq3_s_f32", get_rows_iq3_s_f32_len, get_rows_iq3_s_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 2068 |
+
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ4_XS], "get_rows_iq4_xs_f32", get_rows_iq4_xs_f32_len, get_rows_iq4_xs_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 2069 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl_f32", get_rows_iq4_nl_f32_len, get_rows_iq4_nl_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 2070 |
|
| 2071 |
ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256 * 4, 1, 1}, {}, 1);
|
|
|
|
| 3012 |
case GGML_TYPE_IQ2_S:
|
| 3013 |
case GGML_TYPE_IQ3_XXS:
|
| 3014 |
case GGML_TYPE_IQ3_S:
|
| 3015 |
+
case GGML_TYPE_IQ4_XS:
|
| 3016 |
case GGML_TYPE_IQ4_NL:
|
| 3017 |
break;
|
| 3018 |
default:
|
|
|
|
| 3066 |
case GGML_TYPE_IQ2_S:
|
| 3067 |
case GGML_TYPE_IQ3_XXS:
|
| 3068 |
case GGML_TYPE_IQ3_S:
|
| 3069 |
+
case GGML_TYPE_IQ4_XS:
|
| 3070 |
case GGML_TYPE_IQ4_NL:
|
| 3071 |
break;
|
| 3072 |
default:
|
|
|
|
| 3103 |
case GGML_TYPE_IQ2_S:
|
| 3104 |
case GGML_TYPE_IQ3_XXS:
|
| 3105 |
case GGML_TYPE_IQ3_S:
|
| 3106 |
+
case GGML_TYPE_IQ4_XS:
|
| 3107 |
case GGML_TYPE_IQ4_NL:
|
| 3108 |
break;
|
| 3109 |
default:
|
|
|
|
| 3152 |
case GGML_TYPE_IQ2_S:
|
| 3153 |
case GGML_TYPE_IQ3_XXS:
|
| 3154 |
case GGML_TYPE_IQ3_S:
|
| 3155 |
+
case GGML_TYPE_IQ4_XS:
|
| 3156 |
case GGML_TYPE_IQ4_NL:
|
| 3157 |
break;
|
| 3158 |
default:
|
|
|
|
| 3184 |
case GGML_TYPE_IQ2_S:
|
| 3185 |
case GGML_TYPE_IQ3_XXS:
|
| 3186 |
case GGML_TYPE_IQ3_S:
|
| 3187 |
+
case GGML_TYPE_IQ4_XS:
|
| 3188 |
case GGML_TYPE_IQ4_NL:
|
| 3189 |
break;
|
| 3190 |
default:
|
|
|
|
| 8059 |
case GGML_TYPE_IQ2_S:
|
| 8060 |
case GGML_TYPE_IQ3_XXS:
|
| 8061 |
case GGML_TYPE_IQ3_S:
|
| 8062 |
+
case GGML_TYPE_IQ4_XS:
|
| 8063 |
case GGML_TYPE_IQ4_NL:
|
| 8064 |
break;
|
| 8065 |
default:
|
|
|
|
| 8133 |
//case GGML_TYPE_IQ2_S:
|
| 8134 |
//case GGML_TYPE_IQ3_XXS:
|
| 8135 |
//case GGML_TYPE_IQ3_S:
|
| 8136 |
+
//case GGML_TYPE_IQ4_XS:
|
| 8137 |
case GGML_TYPE_IQ4_NL:
|
| 8138 |
break;
|
| 8139 |
default:
|
|
|
|
| 8156 |
case GGML_TYPE_IQ2_S:
|
| 8157 |
case GGML_TYPE_IQ3_XXS:
|
| 8158 |
case GGML_TYPE_IQ3_S:
|
| 8159 |
+
case GGML_TYPE_IQ4_XS:
|
| 8160 |
case GGML_TYPE_IQ4_NL:
|
| 8161 |
return true;
|
| 8162 |
default:
|
ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp
CHANGED
|
@@ -12,7 +12,7 @@ layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
|
|
| 12 |
#endif
|
| 13 |
|
| 14 |
void main() {
|
| 15 |
-
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
|
| 16 |
init_iq_shmem(gl_WorkGroupSize);
|
| 17 |
if (gl_LocalInvocationIndex.x != 0) {
|
| 18 |
return;
|
|
|
|
| 12 |
#endif
|
| 13 |
|
| 14 |
void main() {
|
| 15 |
+
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
|
| 16 |
init_iq_shmem(gl_WorkGroupSize);
|
| 17 |
if (gl_LocalInvocationIndex.x != 0) {
|
| 18 |
return;
|
ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp
CHANGED
|
@@ -217,7 +217,7 @@ void quantize(uint dst_idx, uint src_idx)
|
|
| 217 |
#endif
|
| 218 |
|
| 219 |
void main() {
|
| 220 |
-
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
|
| 221 |
init_iq_shmem(gl_WorkGroupSize);
|
| 222 |
if (gl_LocalInvocationIndex.x != 0) {
|
| 223 |
return;
|
|
|
|
| 217 |
#endif
|
| 218 |
|
| 219 |
void main() {
|
| 220 |
+
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
|
| 221 |
init_iq_shmem(gl_WorkGroupSize);
|
| 222 |
if (gl_LocalInvocationIndex.x != 0) {
|
| 223 |
return;
|
ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp
CHANGED
|
@@ -304,6 +304,42 @@ vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
|
|
| 304 |
}
|
| 305 |
#endif
|
| 306 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 307 |
#if defined(DATA_A_IQ4_NL)
|
| 308 |
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
|
| 309 |
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
|
|
@@ -321,7 +357,7 @@ vec2 get_dm(uint ib, uint a_offset) {
|
|
| 321 |
}
|
| 322 |
#endif
|
| 323 |
|
| 324 |
-
#if defined(DATA_A_Q4_0) || defined(DATA_A_Q5_0) || defined(DATA_A_Q8_0) || defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
|
| 325 |
vec2 get_dm(uint ib, uint a_offset) {
|
| 326 |
return vec2(float(data_a[a_offset + ib].d), 0);
|
| 327 |
}
|
|
|
|
| 304 |
}
|
| 305 |
#endif
|
| 306 |
|
| 307 |
+
#if defined(DATA_A_IQ4_XS)
|
| 308 |
+
// IQ4_XS: decode two consecutive weights of block `a_offset + ib`, starting at
// in-block element index `iqs`. The result already includes the 6-bit sub-block
// scale (re-centred by -32); this function does not read the block-wide scale `d`.
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    const uint blk   = a_offset + ib;
    const uint sub   = iqs / 32;               // 32-value sub-block containing iqs
    const uint byte0 = 16 * sub + (iqs % 16);  // first packed byte to read

    // 6-bit scale: 4 low bits from scales_l (two sub-blocks per byte),
    // 2 high bits from scales_h (two bits per sub-block).
    const uint scale_lo = (data_a[blk].scales_l[sub/2] >> (4 * (sub & 1))) & 0xF;
    const uint scale_hi = (data_a[blk].scales_h >> (2 * sub)) & 3;

    // Bit 4 of iqs selects the nibble: shift 0 for the low nibble, 4 for the high one.
    const uint nibble_shift = (iqs & 16) >> 2;
    const u8vec2 packed = u8vec2(data_a[blk].qs[byte0], data_a[blk].qs[byte0 + 1]);
    const u8vec2 code   = (packed >> nibble_shift) & uint8_t(0xF);

    const float sub_scale = float(int(scale_lo | (scale_hi << 4)) - 32);
    return sub_scale * vec2(kvalues_iq4nl[code.x], kvalues_iq4nl[code.y]);
}
|
| 321 |
+
// IQ4_XS: four-wide variant of dequantize(). Decodes four consecutive weights of
// block `a_offset + ib`, starting at in-block element index `iqs`. The result
// includes the 6-bit sub-block scale (re-centred by -32); the block-wide scale
// `d` is not read here.
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
    const uint blk   = a_offset + ib;
    const uint sub   = iqs / 32;               // 32-value sub-block containing iqs
    const uint byte0 = 16 * sub + (iqs % 16);  // first packed byte to read

    // 6-bit scale: 4 low bits from scales_l, 2 high bits from scales_h.
    const uint scale_lo = (data_a[blk].scales_l[sub/2] >> (4 * (sub & 1))) & 0xF;
    const uint scale_hi = (data_a[blk].scales_h >> (2 * sub)) & 3;

    // Bit 4 of iqs selects the nibble: shift 0 for the low nibble, 4 for the high one.
    const uint nibble_shift = (iqs & 16) >> 2;
    const u8vec4 packed = u8vec4(
        data_a[blk].qs[byte0 + 0],
        data_a[blk].qs[byte0 + 1],
        data_a[blk].qs[byte0 + 2],
        data_a[blk].qs[byte0 + 3]
    );
    const u8vec4 code = (packed >> nibble_shift) & uint8_t(0xF);

    const float sub_scale = float(int(scale_lo | (scale_hi << 4)) - 32);
    return sub_scale * vec4(
        kvalues_iq4nl[code.x], kvalues_iq4nl[code.y],
        kvalues_iq4nl[code.z], kvalues_iq4nl[code.w]);
}
|
| 341 |
+
#endif
|
| 342 |
+
|
| 343 |
#if defined(DATA_A_IQ4_NL)
|
| 344 |
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
|
| 345 |
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
|
|
|
|
| 357 |
}
|
| 358 |
#endif
|
| 359 |
|
| 360 |
+
#if defined(DATA_A_Q4_0) || defined(DATA_A_Q5_0) || defined(DATA_A_Q8_0) || defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
|
| 361 |
// Returns (d, 0) for block `a_offset + ib`: the block's `d` field widened to
// float in .x, and a constant zero in .y.
vec2 get_dm(uint ib, uint a_offset) {
|
| 362 |
return vec2(float(data_a[a_offset + ib].d), 0);
|
| 363 |
}
|
ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp
CHANGED
|
@@ -454,6 +454,27 @@ float16_t dequantFuncIQ3_S(const in decodeBufIQ3_S bl, const in uint blockCoords
|
|
| 454 |
}
|
| 455 |
#endif
|
| 456 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 457 |
|
| 458 |
#if defined(DATA_A_IQ4_NL)
|
| 459 |
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ4_NL {
|
|
@@ -504,6 +525,8 @@ float16_t dequantFuncIQ4_NL(const in decodeBufIQ4_NL bl, const in uint blockCoor
|
|
| 504 |
#define dequantFuncA dequantFuncIQ3_XXS
|
| 505 |
#elif defined(DATA_A_IQ3_S)
|
| 506 |
#define dequantFuncA dequantFuncIQ3_S
|
|
|
|
|
|
|
| 507 |
#elif defined(DATA_A_IQ4_NL)
|
| 508 |
#define dequantFuncA dequantFuncIQ4_NL
|
| 509 |
#endif
|
|
|
|
| 454 |
}
|
| 455 |
#endif
|
| 456 |
|
| 457 |
+
#if defined(DATA_A_IQ4_XS)
|
| 458 |
+
// Buffer-reference view over a single IQ4_XS block.
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ4_XS {
    block_iq4_xs block;
};

// Decodes one IQ4_XS element to float16.
//   bl           - reference to the block holding the element
//   blockCoords  - not used by this decoder
//   coordInBlock - only coordInBlock[1], the element index inside the block, is read
// Result: d * (6-bit sub-block scale - 32) * kvalues_iq4nl[4-bit code].
float16_t dequantFuncIQ4_XS(const in decodeBufIQ4_XS bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
    const uint elem = coordInBlock[1];
    const uint sub  = (elem >> 5) & 7; // 32-value sub-block, 0..7

    // 6-bit scale: 4 low bits from scales_l, 2 high bits from scales_h.
    const uint scale_lo = (bl.block.scales_l[sub/2] >> (4 * (sub & 1))) & 0xF;
    const uint scale_hi = ((bl.block.scales_h) >> (2 * sub)) & 3;

    // Bit 4 of the element index selects the low (shift 0) or high (shift 4) nibble.
    const uint nibble_shift = (elem & 16) >> 2;
    const uint code = (bl.block.qs[16 * sub + (elem % 16)] >> nibble_shift) & 0xF;

    const float16_t sub_scale = float16_t(int(scale_lo | (scale_hi << 4)) - 32);
    return bl.block.d * sub_scale * float16_t(kvalues_iq4nl[code]);
}
|
| 477 |
+
#endif
|
| 478 |
|
| 479 |
#if defined(DATA_A_IQ4_NL)
|
| 480 |
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ4_NL {
|
|
|
|
| 525 |
#define dequantFuncA dequantFuncIQ3_XXS
|
| 526 |
#elif defined(DATA_A_IQ3_S)
|
| 527 |
#define dequantFuncA dequantFuncIQ3_S
|
| 528 |
+
#elif defined(DATA_A_IQ4_XS)
|
| 529 |
+
#define dequantFuncA dequantFuncIQ4_XS
|
| 530 |
#elif defined(DATA_A_IQ4_NL)
|
| 531 |
#define dequantFuncA dequantFuncIQ4_NL
|
| 532 |
#endif
|
ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#version 450
|
| 2 |
+
|
| 3 |
+
#include "dequant_head.comp"
|
| 4 |
+
|
| 5 |
+
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
|
| 6 |
+
|
| 7 |
+
layout (binding = 0) readonly buffer A {block_iq4_xs data_a[];};
|
| 8 |
+
layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
|
| 9 |
+
|
| 10 |
+
void main() {
    // One invocation per 32-value sub-block (one scale, 32 quantized values):
    // 8 invocations cover a 256-value block, so a 256-wide workgroup covers 32 blocks.
    const uint sub = gl_LocalInvocationID.x % 8;
    const uint blk = gl_WorkGroupID.x * 32 + gl_LocalInvocationID.x / 8;

    // Runs ahead of the bounds check so that every invocation reaches it.
    init_iq_shmem(gl_WorkGroupSize);

    if (blk >= p.nel / 256) {
        return;
    }

    // 6-bit scale: 4 low bits from scales_l, 2 high bits from scales_h.
    const uint scale6 = ((data_a[blk].scales_l[sub/2] >> (4 * (sub & 1))) & 0xF)
                      | (((data_a[blk].scales_h >> (2 * sub)) & 3) << 4);
    const float sub_scale = float(data_a[blk].d) * (int(scale6) - 32);

    const uint dst = 256 * blk + 32 * sub; // first output element of this sub-block
    const uint src = 16 * sub;             // first packed byte of this sub-block
    [[unroll]] for (uint i = 0; i < 16; ++i) {
        // Each byte packs two codes: low nibble -> element i, high nibble -> element i + 16.
        const uint packed = uint(data_a[blk].qs[src + i]);
        data_b[dst + i]      = D_TYPE(sub_scale * kvalues_iq4nl[packed & 0xF]);
        data_b[dst + i + 16] = D_TYPE(sub_scale * kvalues_iq4nl[packed >> 4]);
    }
}
|
ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp
CHANGED
|
@@ -104,7 +104,7 @@ ACC_TYPE Max(const in uint32_t row, const in uint32_t col, const in ACC_TYPE ele
|
|
| 104 |
#endif
|
| 105 |
|
| 106 |
void main() {
|
| 107 |
-
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
|
| 108 |
init_iq_shmem(gl_WorkGroupSize);
|
| 109 |
#endif
|
| 110 |
|
|
|
|
| 104 |
#endif
|
| 105 |
|
| 106 |
void main() {
|
| 107 |
+
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
|
| 108 |
init_iq_shmem(gl_WorkGroupSize);
|
| 109 |
#endif
|
| 110 |
|
ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp
CHANGED
|
@@ -12,7 +12,7 @@ void main() {
|
|
| 12 |
const uint i11 = (gl_GlobalInvocationID.z)/p.ne12;
|
| 13 |
const uint i12 = (gl_GlobalInvocationID.z)%p.ne12;
|
| 14 |
|
| 15 |
-
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
|
| 16 |
init_iq_shmem(gl_WorkGroupSize);
|
| 17 |
#endif
|
| 18 |
|
|
|
|
| 12 |
const uint i11 = (gl_GlobalInvocationID.z)/p.ne12;
|
| 13 |
const uint i12 = (gl_GlobalInvocationID.z)%p.ne12;
|
| 14 |
|
| 15 |
+
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
|
| 16 |
init_iq_shmem(gl_WorkGroupSize);
|
| 17 |
#endif
|
| 18 |
|
ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp
CHANGED
|
@@ -133,7 +133,7 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
|
| 133 |
void main() {
|
| 134 |
const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
|
| 135 |
|
| 136 |
-
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
|
| 137 |
init_iq_shmem(gl_WorkGroupSize);
|
| 138 |
#endif
|
| 139 |
|
|
|
|
| 133 |
void main() {
|
| 134 |
const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
|
| 135 |
|
| 136 |
+
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
|
| 137 |
init_iq_shmem(gl_WorkGroupSize);
|
| 138 |
#endif
|
| 139 |
|
ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp
CHANGED
|
@@ -95,7 +95,7 @@ shared ACC_TYPE coopmat_stage[TM * TN * NUM_WARPS];
|
|
| 95 |
#endif
|
| 96 |
|
| 97 |
void main() {
|
| 98 |
-
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
|
| 99 |
init_iq_shmem(gl_WorkGroupSize);
|
| 100 |
#endif
|
| 101 |
|
|
@@ -547,6 +547,25 @@ void main() {
|
|
| 547 |
const uint32_t grid = iq3s_grid[qs | ((qh << (8 - (iqs % 8))) & 256)] >> (16 * (idx % 2));
|
| 548 |
const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy);
|
| 549 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 550 |
buf_a[buf_idx ] = FLOAT_TYPE(v.x);
|
| 551 |
buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
|
| 552 |
#elif defined(DATA_A_IQ4_NL)
|
|
|
|
| 95 |
#endif
|
| 96 |
|
| 97 |
void main() {
|
| 98 |
+
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
|
| 99 |
init_iq_shmem(gl_WorkGroupSize);
|
| 100 |
#endif
|
| 101 |
|
|
|
|
| 547 |
const uint32_t grid = iq3s_grid[qs | ((qh << (8 - (iqs % 8))) & 256)] >> (16 * (idx % 2));
|
| 548 |
const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy);
|
| 549 |
|
| 550 |
+
buf_a[buf_idx ] = FLOAT_TYPE(v.x);
|
| 551 |
+
buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
|
| 552 |
+
#elif defined(DATA_A_IQ4_XS)
|
| 553 |
+
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
| 554 |
+
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
|
| 555 |
+
|
| 556 |
+
const uint ib = idx / 128; // 2 values per idx
|
| 557 |
+
const uint ib32 = (idx % 128) / 16; // 0..7
|
| 558 |
+
const uint iq = 16 * ib32 + 2 * (idx % 8);
|
| 559 |
+
|
| 560 |
+
const uint sl = (data_a[ib].scales_l[ib32/2] >> (4 * (ib32 & 1))) & 0xF;
|
| 561 |
+
const uint sh = ((data_a[ib].scales_h) >> (2 * ib32)) & 3;
|
| 562 |
+
const uint qshift = (idx & 8) >> 1;
|
| 563 |
+
u8vec2 qs = u8vec2(data_a[ib].qs[iq], data_a[ib].qs[iq + 1]);
|
| 564 |
+
qs = (qs >> qshift) & uint8_t(0xF);
|
| 565 |
+
|
| 566 |
+
const float d = float(data_a[ib].d);
|
| 567 |
+
const vec2 v = d * float(int(sl | (sh << 4)) - 32) * vec2(kvalues_iq4nl[qs.x], kvalues_iq4nl[qs.y]);
|
| 568 |
+
|
| 569 |
buf_a[buf_idx ] = FLOAT_TYPE(v.x);
|
| 570 |
buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
|
| 571 |
#elif defined(DATA_A_IQ4_NL)
|
ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp
CHANGED
|
@@ -106,7 +106,7 @@ D_TYPE perElemOpD(const in uint32_t r, const in uint32_t c, const in D_TYPE elem
|
|
| 106 |
#endif
|
| 107 |
|
| 108 |
void main() {
|
| 109 |
-
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
|
| 110 |
init_iq_shmem(gl_WorkGroupSize);
|
| 111 |
#endif
|
| 112 |
|
|
|
|
| 106 |
#endif
|
| 107 |
|
| 108 |
void main() {
|
| 109 |
+
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
|
| 110 |
init_iq_shmem(gl_WorkGroupSize);
|
| 111 |
#endif
|
| 112 |
|
ggml/src/ggml-vulkan/vulkan-shaders/types.comp
CHANGED
|
@@ -1026,6 +1026,23 @@ void init_iq_shmem(uvec3 wgsize)
|
|
| 1026 |
#define A_TYPE_PACKED16 block_iq3_s_packed16
|
| 1027 |
#endif
|
| 1028 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1029 |
#define QUANT_K_IQ4_NL 32
|
| 1030 |
#define QUANT_R_IQ4_NL 2
|
| 1031 |
|
|
@@ -1042,7 +1059,13 @@ struct block_iq4_nl_packed16
|
|
| 1042 |
};
|
| 1043 |
|
| 1044 |
#if defined(DATA_A_IQ4_NL)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1045 |
|
|
|
|
| 1046 |
const int8_t kvalues_iq4nl_const[16] = {
|
| 1047 |
int8_t(-127), int8_t(-104), int8_t(-83), int8_t(-65), int8_t(-49), int8_t(-35), int8_t(-22), int8_t(-10),
|
| 1048 |
int8_t(1), int8_t(13), int8_t(25), int8_t(38), int8_t(53), int8_t(69), int8_t(89), int8_t(113)
|
|
@@ -1058,11 +1081,6 @@ void init_iq_shmem(uvec3 wgsize)
|
|
| 1058 |
}
|
| 1059 |
barrier();
|
| 1060 |
}
|
| 1061 |
-
|
| 1062 |
-
#define QUANT_K QUANT_K_IQ4_NL
|
| 1063 |
-
#define QUANT_R QUANT_R_IQ4_NL
|
| 1064 |
-
#define A_TYPE block_iq4_nl
|
| 1065 |
-
#define A_TYPE_PACKED16 block_iq4_nl_packed16
|
| 1066 |
#endif
|
| 1067 |
|
| 1068 |
#endif // !defined(GGML_TYPES_COMP)
|
|
|
|
| 1026 |
#define A_TYPE_PACKED16 block_iq3_s_packed16
|
| 1027 |
#endif
|
| 1028 |
|
| 1029 |
+
#define QUANT_K_IQ4_XS 256
|
| 1030 |
+
#define QUANT_R_IQ4_XS 1
|
| 1031 |
+
|
| 1032 |
+
// One IQ4_XS block: QUANT_K_IQ4_XS (256) weights, handled by the shaders as eight
// 32-value sub-blocks. Each sub-block has a 6-bit scale split across scales_l and
// scales_h; each weight is a 4-bit code looked up in kvalues_iq4nl.
struct block_iq4_xs
|
| 1033 |
+
{
|
| 1034 |
+
float16_t d; // block-wide scale, multiplied into every decoded weight
|
| 1035 |
+
uint16_t scales_h; // high 2 bits of each sub-block scale: bits [2*s, 2*s+1] for sub-block s
|
| 1036 |
+
uint8_t scales_l[QUANT_K_IQ4_XS/64]; // low 4 bits of each sub-block scale, two sub-blocks per byte
|
| 1037 |
+
uint8_t qs[QUANT_K_IQ4_XS/2]; // 4-bit codes, 16 bytes per sub-block: low nibbles = elements 0..15, high nibbles = 16..31
|
| 1038 |
+
};
|
| 1039 |
+
|
| 1040 |
+
#if defined(DATA_A_IQ4_XS)
|
| 1041 |
+
#define QUANT_K QUANT_K_IQ4_XS
|
| 1042 |
+
#define QUANT_R QUANT_R_IQ4_XS
|
| 1043 |
+
#define A_TYPE block_iq4_xs
|
| 1044 |
+
#endif
|
| 1045 |
+
|
| 1046 |
#define QUANT_K_IQ4_NL 32
|
| 1047 |
#define QUANT_R_IQ4_NL 2
|
| 1048 |
|
|
|
|
| 1059 |
};
|
| 1060 |
|
| 1061 |
#if defined(DATA_A_IQ4_NL)
|
| 1062 |
+
#define QUANT_K QUANT_K_IQ4_NL
|
| 1063 |
+
#define QUANT_R QUANT_R_IQ4_NL
|
| 1064 |
+
#define A_TYPE block_iq4_nl
|
| 1065 |
+
#define A_TYPE_PACKED16 block_iq4_nl_packed16
|
| 1066 |
+
#endif
|
| 1067 |
|
| 1068 |
+
#if defined(DATA_A_IQ4_NL) || defined(DATA_A_IQ4_XS)
|
| 1069 |
const int8_t kvalues_iq4nl_const[16] = {
|
| 1070 |
int8_t(-127), int8_t(-104), int8_t(-83), int8_t(-65), int8_t(-49), int8_t(-35), int8_t(-22), int8_t(-10),
|
| 1071 |
int8_t(1), int8_t(13), int8_t(25), int8_t(38), int8_t(53), int8_t(69), int8_t(89), int8_t(113)
|
|
|
|
| 1081 |
}
|
| 1082 |
barrier();
|
| 1083 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1084 |
#endif
|
| 1085 |
|
| 1086 |
#endif // !defined(GGML_TYPES_COMP)
|
ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
CHANGED
|
@@ -60,6 +60,7 @@ const std::vector<std::string> type_names = {
|
|
| 60 |
"iq2_s",
|
| 61 |
"iq3_xxs",
|
| 62 |
"iq3_s",
|
|
|
|
| 63 |
"iq4_nl"
|
| 64 |
};
|
| 65 |
|
|
|
|
| 60 |
"iq2_s",
|
| 61 |
"iq3_xxs",
|
| 62 |
"iq3_s",
|
| 63 |
+
"iq4_xs",
|
| 64 |
"iq4_nl"
|
| 65 |
};
|
| 66 |
|