Diego Devesa committed on
Commit
62cf694
·
1 Parent(s): 0454008

ggml-cpu : remove unnecessary arm feature detection (llama/14281)

Browse files

Support for Arm runtime feature detection has now been added to GGML_CPU_ALL_VARIANTS, so the old, only partially functional runtime checks (`ggml_cpu_has_neon()`, `ggml_cpu_has_dotprod()`, etc.) inside the compile-time-guarded kernels are removed.

ggml/src/ggml-cpu/arch/arm/repack.cpp CHANGED
@@ -256,45 +256,43 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
256
  UNUSED(blocklen);
257
 
258
  #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
259
- if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
260
- const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx;
261
-
262
- for (int c = 0; c < nc; c += ncols_interleaved) {
263
- const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
264
- float32x4_t acc = vdupq_n_f32(0);
265
- for (int b = 0; b < nb; b++) {
266
- int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs);
267
- int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16);
268
- int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32);
269
- int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48);
270
- float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
271
-
272
- int8x16_t a0 = vld1q_s8(a_ptr->qs);
273
- int8x16_t a1 = vld1q_s8(a_ptr->qs + qk/2);
274
- float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
275
-
276
- int32x4_t ret = vdupq_n_s32(0);
277
-
278
- ret = vdotq_laneq_s32(ret, b0 << 4, a0, 0);
279
- ret = vdotq_laneq_s32(ret, b1 << 4, a0, 1);
280
- ret = vdotq_laneq_s32(ret, b2 << 4, a0, 2);
281
- ret = vdotq_laneq_s32(ret, b3 << 4, a0, 3);
282
-
283
- ret = vdotq_laneq_s32(ret, b0 & 0xf0U, a1, 0);
284
- ret = vdotq_laneq_s32(ret, b1 & 0xf0U, a1, 1);
285
- ret = vdotq_laneq_s32(ret, b2 & 0xf0U, a1, 2);
286
- ret = vdotq_laneq_s32(ret, b3 & 0xf0U, a1, 3);
287
-
288
- acc = vfmaq_f32(acc, vcvtq_n_f32_s32(ret, 4),
289
- vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd)));
290
- a_ptr++;
291
- b_ptr++;
292
- }
293
- vst1q_f32(s, acc);
294
- s += ncols_interleaved;
295
  }
296
- return;
 
297
  }
 
298
  #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
299
  float sumf[4];
300
  int sumi;
@@ -341,50 +339,48 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
341
  UNUSED(blocklen);
342
 
343
  #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
344
- if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
345
- const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx;
346
-
347
- for (int c = 0; c < nc; c += ncols_interleaved) {
348
- const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
349
- float32x4_t acc = vdupq_n_f32(0);
350
- for (int b = 0; b < nb; b++) {
351
- int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs);
352
- int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16);
353
- int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32);
354
- int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48);
355
- float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
356
-
357
- int8x16_t a0 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs);
358
- int8x16_t a1 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 1);
359
- int8x16_t a2 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 2);
360
- int8x16_t a3 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 3);
361
- float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
362
-
363
- int32x4_t ret0 = vdupq_n_s32(0);
364
- int32x4_t ret1 = vdupq_n_s32(0);
365
-
366
- ret0 = vdotq_s32(ret0, b0 << 4, a0);
367
- ret1 = vdotq_s32(ret1, b1 << 4, a0);
368
- ret0 = vdotq_s32(ret0, b2 << 4, a1);
369
- ret1 = vdotq_s32(ret1, b3 << 4, a1);
370
-
371
- ret0 = vdotq_s32(ret0, b0 & 0xf0U, a2);
372
- ret1 = vdotq_s32(ret1, b1 & 0xf0U, a2);
373
- ret0 = vdotq_s32(ret0, b2 & 0xf0U, a3);
374
- ret1 = vdotq_s32(ret1, b3 & 0xf0U, a3);
375
-
376
- int32x4_t ret = vpaddq_s32(ret0, ret1);
377
-
378
- acc = vfmaq_f32(acc, vcvtq_n_f32_s32(ret, 4),
379
- vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd)));
380
- a_ptr++;
381
- b_ptr++;
382
- }
383
- vst1q_f32(s, acc);
384
- s += ncols_interleaved;
385
  }
386
- return;
 
387
  }
 
388
  #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
389
  float sumf[4];
390
  int sumi;
@@ -432,7 +428,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
432
 
433
  #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
434
  #if defined(__ARM_FEATURE_SVE)
435
- if (ggml_cpu_has_sve() && ggml_cpu_get_sve_cnt() == QK8_0) {
436
  const void * b_ptr = vx;
437
  const void * a_ptr = vy;
438
  float * res_ptr = s;
@@ -547,54 +543,52 @@ void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
547
  UNUSED(blocklen);
548
 
549
  #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
550
- if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
551
- const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl);
552
- const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
553
- float * res_ptr = s;
554
-
555
- for (int x = 0; x < nc / ncols_interleaved; x++) {
556
- const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
557
-
558
- float32x4_t sumf = vdupq_n_f32(0);
559
- for (int l = 0; l < nb; l++) {
560
- uint8x16_t b_0 = vld1q_u8(b_ptr[l].qs + 0);
561
- uint8x16_t b_1 = vld1q_u8(b_ptr[l].qs + 16);
562
- uint8x16_t b_2 = vld1q_u8(b_ptr[l].qs + 32);
563
- uint8x16_t b_3 = vld1q_u8(b_ptr[l].qs + 48);
564
-
565
- int8x16_t b_0_hi = vqtbl1q_s8(kvalues, b_0 >> 4);
566
- int8x16_t b_0_lo = vqtbl1q_s8(kvalues, b_0 & 0x0F);
567
- int8x16_t b_1_hi = vqtbl1q_s8(kvalues, b_1 >> 4);
568
- int8x16_t b_1_lo = vqtbl1q_s8(kvalues, b_1 & 0x0F);
569
- int8x16_t b_2_hi = vqtbl1q_s8(kvalues, b_2 >> 4);
570
- int8x16_t b_2_lo = vqtbl1q_s8(kvalues, b_2 & 0x0F);
571
- int8x16_t b_3_hi = vqtbl1q_s8(kvalues, b_3 >> 4);
572
- int8x16_t b_3_lo = vqtbl1q_s8(kvalues, b_3 & 0x0F);
573
-
574
- int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 0);
575
- int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16);
576
-
577
- int32x4_t sumi = vdupq_n_s32(0);
578
- sumi = vdotq_laneq_s32(sumi, b_0_lo, a_0, 0);
579
- sumi = vdotq_laneq_s32(sumi, b_0_hi, a_1, 0);
580
- sumi = vdotq_laneq_s32(sumi, b_1_lo, a_0, 1);
581
- sumi = vdotq_laneq_s32(sumi, b_1_hi, a_1, 1);
582
- sumi = vdotq_laneq_s32(sumi, b_2_lo, a_0, 2);
583
- sumi = vdotq_laneq_s32(sumi, b_2_hi, a_1, 2);
584
- sumi = vdotq_laneq_s32(sumi, b_3_lo, a_0, 3);
585
- sumi = vdotq_laneq_s32(sumi, b_3_hi, a_1, 3);
586
-
587
- float32x4_t a_d = vcvt_f32_f16(vld1_dup_f16((const float16_t *)&a_ptr[l].d));
588
- float32x4_t b_d = vcvt_f32_f16(vld1_f16((const float16_t *)b_ptr[l].d));
589
- float32x4_t d = a_d * b_d;
590
 
591
- sumf = vmlaq_f32(sumf, d, vcvtq_f32_s32(sumi));
592
- }
593
 
594
- vst1q_f32(res_ptr + x * 4, sumf);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
595
  }
596
- return;
 
597
  }
 
598
  #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
599
  {
600
  float sumf[4];
@@ -643,465 +637,463 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
643
  UNUSED(ncols_interleaved);
644
  UNUSED(blocklen);
645
 
646
- #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
647
- if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
648
- const void * b_ptr = vx;
649
- const void * a_ptr = vy;
650
- float * res_ptr = s;
651
- size_t res_stride = bs * sizeof(float);
652
-
653
- __asm__ __volatile__(
654
- "mov x10, %x[nr]\n"
655
- "mov x9, #0x88\n"
656
- "cmp x10, #0x10\n"
657
- "mul x9, %x[nb], x9\n"
658
- "blt 4f\n"
659
- "1:" // Row loop
660
- "add x28, %x[b_ptr], #0x8\n"
661
- "mov x27, %x[nc]\n"
662
- "add x26, %x[res_ptr], %x[res_stride], LSL #4\n"
663
- "2:" // Column loop
664
- "add x25, %x[a_ptr], #0x8\n"
665
- "movi v15.16b, #0x0\n"
666
- "movi v19.16b, #0x0\n"
667
- "mov x24, %x[nb]\n"
668
- "add x23, x25, x9\n"
669
- "movi v18.16b, #0x0\n"
670
- "movi v14.16b, #0x0\n"
671
- "add x22, x23, x9\n"
672
- "movi v11.16b, #0x0\n"
673
- "movi v13.16b, #0x0\n"
674
- "add x21, x22, x9\n"
675
- "movi v23.16b, #0x0\n"
676
- "movi v16.16b, #0x0\n"
677
- "movi v25.16b, #0x0\n"
678
- "movi v7.16b, #0x0\n"
679
- "movi v0.16b, #0x0\n"
680
- "movi v4.16b, #0x0\n"
681
- "movi v5.16b, #0x0\n"
682
- "movi v21.16b, #0x0\n"
683
- "movi v8.16b, #0x0\n"
684
- "movi v1.16b, #0x0\n"
685
- "3:" // Block loop
686
- "ldr q3, [x28, #0x0]\n"
687
- "ldr q31, [x25, #0x0]\n"
688
- "movi v28.16b, #0x4\n"
689
- "movi v10.4s, #0x0\n"
690
- "ldr q22, [x28, #0x10]\n"
691
- "ldr q6, [x25, #0x10]\n"
692
- "movi v29.4s, #0x0\n"
693
- "movi v9.4s, #0x0\n"
694
- "ldr q27, [x28, #0x20]\n"
695
- "ldr q30, [x28, #0x30]\n"
696
- "movi v20.4s, #0x0\n"
697
- "movi v24.16b, #0xf0\n"
698
- "ldr d2, [x25, #-0x8]\n"
699
- "ldr d26, [x23, #-0x8]\n"
700
- "sshl v12.16b, v3.16b, v28.16b\n"
701
- "sub x20, x28, #0x8\n"
702
- "ldr d17, [x20, #0x0]\n"
703
- "and v3.16b, v3.16b, v24.16b\n"
704
- "subs x24, x24, #0x1\n"
705
- "add x28, x28, #0x48\n"
706
- ".inst 0x4f9fe18a // sdot v10.4s, v12.16b, v31.4b[0]\n"
707
- ".inst 0x4fbfe19d // sdot v29.4s, v12.16b, v31.4b[1]\n"
708
- ".inst 0x4f9fe989 // sdot v9.4s, v12.16b, v31.4b[2]\n"
709
- ".inst 0x4fbfe994 // sdot v20.4s, v12.16b, v31.4b[3]\n"
710
- "sshl v31.16b, v22.16b, v28.16b\n"
711
- "and v22.16b, v22.16b, v24.16b\n"
712
- "fcvtl v17.4s, v17.4h\n"
713
- "fcvtl v2.4s, v2.4h\n"
714
- "fcvtl v26.4s, v26.4h\n"
715
- ".inst 0x4f86e3ea // sdot v10.4s, v31.16b, v6.4b[0]\n"
716
- ".inst 0x4fa6e3fd // sdot v29.4s, v31.16b, v6.4b[1]\n"
717
- ".inst 0x4f86ebe9 // sdot v9.4s, v31.16b, v6.4b[2]\n"
718
- ".inst 0x4fa6ebf4 // sdot v20.4s, v31.16b, v6.4b[3]\n"
719
- "sshl v6.16b, v27.16b, v28.16b\n"
720
- "sshl v28.16b, v30.16b, v28.16b\n"
721
- "and v27.16b, v27.16b, v24.16b\n"
722
- "and v30.16b, v30.16b, v24.16b\n"
723
- "ldr q24, [x25, #0x20]\n"
724
- ".inst 0x4f98e0ca // sdot v10.4s, v6.16b, v24.4b[0]\n"
725
- ".inst 0x4fb8e0dd // sdot v29.4s, v6.16b, v24.4b[1]\n"
726
- ".inst 0x4f98e8c9 // sdot v9.4s, v6.16b, v24.4b[2]\n"
727
- ".inst 0x4fb8e8d4 // sdot v20.4s, v6.16b, v24.4b[3]\n"
728
- "ldr q24, [x25, #0x30]\n"
729
- ".inst 0x4f98e38a // sdot v10.4s, v28.16b, v24.4b[0]\n"
730
- ".inst 0x4fb8e39d // sdot v29.4s, v28.16b, v24.4b[1]\n"
731
- ".inst 0x4f98eb89 // sdot v9.4s, v28.16b, v24.4b[2]\n"
732
- ".inst 0x4fb8eb94 // sdot v20.4s, v28.16b, v24.4b[3]\n"
733
- "ldr q24, [x25, #0x40]\n"
734
- ".inst 0x4f98e06a // sdot v10.4s, v3.16b, v24.4b[0]\n"
735
- ".inst 0x4fb8e07d // sdot v29.4s, v3.16b, v24.4b[1]\n"
736
- ".inst 0x4f98e869 // sdot v9.4s, v3.16b, v24.4b[2]\n"
737
- ".inst 0x4fb8e874 // sdot v20.4s, v3.16b, v24.4b[3]\n"
738
- "ldr q24, [x25, #0x50]\n"
739
- ".inst 0x4f98e2ca // sdot v10.4s, v22.16b, v24.4b[0]\n"
740
- ".inst 0x4fb8e2dd // sdot v29.4s, v22.16b, v24.4b[1]\n"
741
- ".inst 0x4f98eac9 // sdot v9.4s, v22.16b, v24.4b[2]\n"
742
- ".inst 0x4fb8ead4 // sdot v20.4s, v22.16b, v24.4b[3]\n"
743
- "ldr q24, [x25, #0x60]\n"
744
- ".inst 0x4f98e36a // sdot v10.4s, v27.16b, v24.4b[0]\n"
745
- ".inst 0x4fb8e37d // sdot v29.4s, v27.16b, v24.4b[1]\n"
746
- ".inst 0x4f98eb69 // sdot v9.4s, v27.16b, v24.4b[2]\n"
747
- ".inst 0x4fb8eb74 // sdot v20.4s, v27.16b, v24.4b[3]\n"
748
- "ldr q24, [x25, #0x70]\n"
749
- "add x25, x25, #0x88\n"
750
- ".inst 0x4f98e3ca // sdot v10.4s, v30.16b, v24.4b[0]\n"
751
- ".inst 0x4fb8e3dd // sdot v29.4s, v30.16b, v24.4b[1]\n"
752
- ".inst 0x4f98ebc9 // sdot v9.4s, v30.16b, v24.4b[2]\n"
753
- ".inst 0x4fb8ebd4 // sdot v20.4s, v30.16b, v24.4b[3]\n"
754
- "fmul v24.4s, v17.4s, v2.s[0]\n"
755
- "scvtf v10.4s, v10.4s, #0x4\n"
756
- "scvtf v29.4s, v29.4s, #0x4\n"
757
- "scvtf v9.4s, v9.4s, #0x4\n"
758
- "scvtf v20.4s, v20.4s, #0x4\n"
759
- "fmla v15.4s, v10.4s, v24.4s\n"
760
- "ldr q24, [x23, #0x0]\n"
761
- "fmul v10.4s, v17.4s, v2.s[1]\n"
762
- "fmla v19.4s, v29.4s, v10.4s\n"
763
- "ldr q10, [x23, #0x10]\n"
764
- "fmul v29.4s, v17.4s, v2.s[2]\n"
765
- "fmul v2.4s, v17.4s, v2.s[3]\n"
766
- "fmla v18.4s, v9.4s, v29.4s\n"
767
- "movi v9.4s, #0x0\n"
768
- "movi v29.4s, #0x0\n"
769
- ".inst 0x4f98e189 // sdot v9.4s, v12.16b, v24.4b[0]\n"
770
- ".inst 0x4fb8e19d // sdot v29.4s, v12.16b, v24.4b[1]\n"
771
- "fmla v14.4s, v20.4s, v2.4s\n"
772
- "movi v20.4s, #0x0\n"
773
- "movi v2.4s, #0x0\n"
774
- ".inst 0x4f98e994 // sdot v20.4s, v12.16b, v24.4b[2]\n"
775
- ".inst 0x4fb8e982 // sdot v2.4s, v12.16b, v24.4b[3]\n"
776
- "ldr q24, [x23, #0x20]\n"
777
- ".inst 0x4f8ae3e9 // sdot v9.4s, v31.16b, v10.4b[0]\n"
778
- ".inst 0x4faae3fd // sdot v29.4s, v31.16b, v10.4b[1]\n"
779
- ".inst 0x4f8aebf4 // sdot v20.4s, v31.16b, v10.4b[2]\n"
780
- ".inst 0x4faaebe2 // sdot v2.4s, v31.16b, v10.4b[3]\n"
781
- "ldr q10, [x23, #0x30]\n"
782
- ".inst 0x4f98e0c9 // sdot v9.4s, v6.16b, v24.4b[0]\n"
783
- ".inst 0x4fb8e0dd // sdot v29.4s, v6.16b, v24.4b[1]\n"
784
- ".inst 0x4f98e8d4 // sdot v20.4s, v6.16b, v24.4b[2]\n"
785
- ".inst 0x4fb8e8c2 // sdot v2.4s, v6.16b, v24.4b[3]\n"
786
- "ldr q24, [x23, #0x40]\n"
787
- ".inst 0x4f8ae389 // sdot v9.4s, v28.16b, v10.4b[0]\n"
788
- ".inst 0x4faae39d // sdot v29.4s, v28.16b, v10.4b[1]\n"
789
- ".inst 0x4f8aeb94 // sdot v20.4s, v28.16b, v10.4b[2]\n"
790
- ".inst 0x4faaeb82 // sdot v2.4s, v28.16b, v10.4b[3]\n"
791
- "ldr q10, [x23, #0x50]\n"
792
- ".inst 0x4f98e069 // sdot v9.4s, v3.16b, v24.4b[0]\n"
793
- ".inst 0x4fb8e07d // sdot v29.4s, v3.16b, v24.4b[1]\n"
794
- ".inst 0x4f98e874 // sdot v20.4s, v3.16b, v24.4b[2]\n"
795
- ".inst 0x4fb8e862 // sdot v2.4s, v3.16b, v24.4b[3]\n"
796
- "ldr q24, [x23, #0x60]\n"
797
- ".inst 0x4f8ae2c9 // sdot v9.4s, v22.16b, v10.4b[0]\n"
798
- ".inst 0x4faae2dd // sdot v29.4s, v22.16b, v10.4b[1]\n"
799
- ".inst 0x4f8aead4 // sdot v20.4s, v22.16b, v10.4b[2]\n"
800
- ".inst 0x4faaeac2 // sdot v2.4s, v22.16b, v10.4b[3]\n"
801
- "ldr q10, [x23, #0x70]\n"
802
- "add x23, x23, #0x88\n"
803
- ".inst 0x4f98e369 // sdot v9.4s, v27.16b, v24.4b[0]\n"
804
- ".inst 0x4fb8e37d // sdot v29.4s, v27.16b, v24.4b[1]\n"
805
- ".inst 0x4f98eb74 // sdot v20.4s, v27.16b, v24.4b[2]\n"
806
- ".inst 0x4fb8eb62 // sdot v2.4s, v27.16b, v24.4b[3]\n"
807
- "ldr q24, [x22, #0x0]\n"
808
- ".inst 0x4f8ae3c9 // sdot v9.4s, v30.16b, v10.4b[0]\n"
809
- ".inst 0x4faae3dd // sdot v29.4s, v30.16b, v10.4b[1]\n"
810
- ".inst 0x4f8aebd4 // sdot v20.4s, v30.16b, v10.4b[2]\n"
811
- ".inst 0x4faaebc2 // sdot v2.4s, v30.16b, v10.4b[3]\n"
812
- "fmul v10.4s, v17.4s, v26.s[0]\n"
813
- "scvtf v9.4s, v9.4s, #0x4\n"
814
- "scvtf v29.4s, v29.4s, #0x4\n"
815
- "scvtf v20.4s, v20.4s, #0x4\n"
816
- "scvtf v2.4s, v2.4s, #0x4\n"
817
- "fmla v11.4s, v9.4s, v10.4s\n"
818
- "ldr q9, [x22, #0x10]\n"
819
- "fmul v10.4s, v17.4s, v26.s[1]\n"
820
- "fmla v13.4s, v29.4s, v10.4s\n"
821
- "ldr d29, [x22, #-0x8]\n"
822
- "fmul v10.4s, v17.4s, v26.s[2]\n"
823
- "fmul v26.4s, v17.4s, v26.s[3]\n"
824
- "fcvtl v29.4s, v29.4h\n"
825
- "fmla v23.4s, v20.4s, v10.4s\n"
826
- "movi v20.4s, #0x0\n"
827
- "movi v10.4s, #0x0\n"
828
- "fmla v16.4s, v2.4s, v26.4s\n"
829
- "movi v26.4s, #0x0\n"
830
- "movi v2.4s, #0x0\n"
831
- ".inst 0x4f98e194 // sdot v20.4s, v12.16b, v24.4b[0]\n"
832
- ".inst 0x4fb8e18a // sdot v10.4s, v12.16b, v24.4b[1]\n"
833
- ".inst 0x4f98e99a // sdot v26.4s, v12.16b, v24.4b[2]\n"
834
- ".inst 0x4fb8e982 // sdot v2.4s, v12.16b, v24.4b[3]\n"
835
- "ldr q24, [x22, #0x20]\n"
836
- ".inst 0x4f89e3f4 // sdot v20.4s, v31.16b, v9.4b[0]\n"
837
- ".inst 0x4fa9e3ea // sdot v10.4s, v31.16b, v9.4b[1]\n"
838
- ".inst 0x4f89ebfa // sdot v26.4s, v31.16b, v9.4b[2]\n"
839
- ".inst 0x4fa9ebe2 // sdot v2.4s, v31.16b, v9.4b[3]\n"
840
- "ldr q9, [x22, #0x30]\n"
841
- ".inst 0x4f98e0d4 // sdot v20.4s, v6.16b, v24.4b[0]\n"
842
- ".inst 0x4fb8e0ca // sdot v10.4s, v6.16b, v24.4b[1]\n"
843
- ".inst 0x4f98e8da // sdot v26.4s, v6.16b, v24.4b[2]\n"
844
- ".inst 0x4fb8e8c2 // sdot v2.4s, v6.16b, v24.4b[3]\n"
845
- "ldr q24, [x22, #0x40]\n"
846
- ".inst 0x4f89e394 // sdot v20.4s, v28.16b, v9.4b[0]\n"
847
- ".inst 0x4fa9e38a // sdot v10.4s, v28.16b, v9.4b[1]\n"
848
- ".inst 0x4f89eb9a // sdot v26.4s, v28.16b, v9.4b[2]\n"
849
- ".inst 0x4fa9eb82 // sdot v2.4s, v28.16b, v9.4b[3]\n"
850
- "ldr q9, [x22, #0x50]\n"
851
- ".inst 0x4f98e074 // sdot v20.4s, v3.16b, v24.4b[0]\n"
852
- ".inst 0x4fb8e06a // sdot v10.4s, v3.16b, v24.4b[1]\n"
853
- ".inst 0x4f98e87a // sdot v26.4s, v3.16b, v24.4b[2]\n"
854
- ".inst 0x4fb8e862 // sdot v2.4s, v3.16b, v24.4b[3]\n"
855
- "ldr q24, [x22, #0x60]\n"
856
- ".inst 0x4f89e2d4 // sdot v20.4s, v22.16b, v9.4b[0]\n"
857
- ".inst 0x4fa9e2ca // sdot v10.4s, v22.16b, v9.4b[1]\n"
858
- ".inst 0x4f89eada // sdot v26.4s, v22.16b, v9.4b[2]\n"
859
- ".inst 0x4fa9eac2 // sdot v2.4s, v22.16b, v9.4b[3]\n"
860
- "ldr q9, [x22, #0x70]\n"
861
- "add x22, x22, #0x88\n"
862
- ".inst 0x4f98e374 // sdot v20.4s, v27.16b, v24.4b[0]\n"
863
- ".inst 0x4fb8e36a // sdot v10.4s, v27.16b, v24.4b[1]\n"
864
- ".inst 0x4f98eb7a // sdot v26.4s, v27.16b, v24.4b[2]\n"
865
- ".inst 0x4fb8eb62 // sdot v2.4s, v27.16b, v24.4b[3]\n"
866
- "ldr q24, [x21, #0x0]\n"
867
- ".inst 0x4f89e3d4 // sdot v20.4s, v30.16b, v9.4b[0]\n"
868
- ".inst 0x4fa9e3ca // sdot v10.4s, v30.16b, v9.4b[1]\n"
869
- ".inst 0x4f89ebda // sdot v26.4s, v30.16b, v9.4b[2]\n"
870
- ".inst 0x4fa9ebc2 // sdot v2.4s, v30.16b, v9.4b[3]\n"
871
- "fmul v9.4s, v17.4s, v29.s[0]\n"
872
- "scvtf v20.4s, v20.4s, #0x4\n"
873
- "scvtf v10.4s, v10.4s, #0x4\n"
874
- "scvtf v26.4s, v26.4s, #0x4\n"
875
- "scvtf v2.4s, v2.4s, #0x4\n"
876
- "fmla v25.4s, v20.4s, v9.4s\n"
877
- "ldr q9, [x21, #0x10]\n"
878
- "fmul v20.4s, v17.4s, v29.s[1]\n"
879
- "fmla v7.4s, v10.4s, v20.4s\n"
880
- "ldr d20, [x21, #-0x8]\n"
881
- "fmul v10.4s, v17.4s, v29.s[2]\n"
882
- "fmul v29.4s, v17.4s, v29.s[3]\n"
883
- "fcvtl v20.4s, v20.4h\n"
884
- "fmla v0.4s, v26.4s, v10.4s\n"
885
- "movi v26.4s, #0x0\n"
886
- "movi v10.4s, #0x0\n"
887
- "fmla v4.4s, v2.4s, v29.4s\n"
888
- "movi v2.4s, #0x0\n"
889
- "movi v29.4s, #0x0\n"
890
- ".inst 0x4f98e19a // sdot v26.4s, v12.16b, v24.4b[0]\n"
891
- ".inst 0x4fb8e18a // sdot v10.4s, v12.16b, v24.4b[1]\n"
892
- ".inst 0x4f98e982 // sdot v2.4s, v12.16b, v24.4b[2]\n"
893
- ".inst 0x4fb8e99d // sdot v29.4s, v12.16b, v24.4b[3]\n"
894
- "ldr q12, [x21, #0x20]\n"
895
- "fmul v24.4s, v17.4s, v20.s[0]\n"
896
- ".inst 0x4f89e3fa // sdot v26.4s, v31.16b, v9.4b[0]\n"
897
- ".inst 0x4fa9e3ea // sdot v10.4s, v31.16b, v9.4b[1]\n"
898
- ".inst 0x4f89ebe2 // sdot v2.4s, v31.16b, v9.4b[2]\n"
899
- ".inst 0x4fa9ebfd // sdot v29.4s, v31.16b, v9.4b[3]\n"
900
- "ldr q9, [x21, #0x30]\n"
901
- "fmul v31.4s, v17.4s, v20.s[1]\n"
902
- ".inst 0x4f8ce0da // sdot v26.4s, v6.16b, v12.4b[0]\n"
903
- ".inst 0x4face0ca // sdot v10.4s, v6.16b, v12.4b[1]\n"
904
- ".inst 0x4f8ce8c2 // sdot v2.4s, v6.16b, v12.4b[2]\n"
905
- ".inst 0x4face8dd // sdot v29.4s, v6.16b, v12.4b[3]\n"
906
- "ldr q12, [x21, #0x40]\n"
907
- "fmul v6.4s, v17.4s, v20.s[2]\n"
908
- "fmul v20.4s, v17.4s, v20.s[3]\n"
909
- ".inst 0x4f89e39a // sdot v26.4s, v28.16b, v9.4b[0]\n"
910
- ".inst 0x4fa9e38a // sdot v10.4s, v28.16b, v9.4b[1]\n"
911
- ".inst 0x4f89eb82 // sdot v2.4s, v28.16b, v9.4b[2]\n"
912
- ".inst 0x4fa9eb9d // sdot v29.4s, v28.16b, v9.4b[3]\n"
913
- "ldr q9, [x21, #0x50]\n"
914
- ".inst 0x4f8ce07a // sdot v26.4s, v3.16b, v12.4b[0]\n"
915
- ".inst 0x4face06a // sdot v10.4s, v3.16b, v12.4b[1]\n"
916
- ".inst 0x4f8ce862 // sdot v2.4s, v3.16b, v12.4b[2]\n"
917
- ".inst 0x4face87d // sdot v29.4s, v3.16b, v12.4b[3]\n"
918
- "ldr q12, [x21, #0x60]\n"
919
- ".inst 0x4f89e2da // sdot v26.4s, v22.16b, v9.4b[0]\n"
920
- ".inst 0x4fa9e2ca // sdot v10.4s, v22.16b, v9.4b[1]\n"
921
- ".inst 0x4f89eac2 // sdot v2.4s, v22.16b, v9.4b[2]\n"
922
- ".inst 0x4fa9eadd // sdot v29.4s, v22.16b, v9.4b[3]\n"
923
- "ldr q17, [x21, #0x70]\n"
924
- "add x21, x21, #0x88\n"
925
- ".inst 0x4f8ce37a // sdot v26.4s, v27.16b, v12.4b[0]\n"
926
- ".inst 0x4face36a // sdot v10.4s, v27.16b, v12.4b[1]\n"
927
- ".inst 0x4f8ceb62 // sdot v2.4s, v27.16b, v12.4b[2]\n"
928
- ".inst 0x4faceb7d // sdot v29.4s, v27.16b, v12.4b[3]\n"
929
- ".inst 0x4f91e3da // sdot v26.4s, v30.16b, v17.4b[0]\n"
930
- ".inst 0x4fb1e3ca // sdot v10.4s, v30.16b, v17.4b[1]\n"
931
- ".inst 0x4f91ebc2 // sdot v2.4s, v30.16b, v17.4b[2]\n"
932
- ".inst 0x4fb1ebdd // sdot v29.4s, v30.16b, v17.4b[3]\n"
933
- "scvtf v26.4s, v26.4s, #0x4\n"
934
- "scvtf v10.4s, v10.4s, #0x4\n"
935
- "fmla v5.4s, v26.4s, v24.4s\n"
936
- "scvtf v2.4s, v2.4s, #0x4\n"
937
- "scvtf v29.4s, v29.4s, #0x4\n"
938
- "fmla v21.4s, v10.4s, v31.4s\n"
939
- "fmla v8.4s, v2.4s, v6.4s\n"
940
- "fmla v1.4s, v29.4s, v20.4s\n"
941
- "bgt 3b\n"
942
- "mov x20, %x[res_ptr]\n"
943
- "subs x27, x27, #0x4\n"
944
- "add %x[res_ptr], %x[res_ptr], #0x10\n"
945
- "str q15, [x20, #0x0]\n"
946
- "add x20, x20, %x[res_stride]\n"
947
- "str q19, [x20, #0x0]\n"
948
- "add x20, x20, %x[res_stride]\n"
949
- "str q18, [x20, #0x0]\n"
950
- "add x20, x20, %x[res_stride]\n"
951
- "str q14, [x20, #0x0]\n"
952
- "add x20, x20, %x[res_stride]\n"
953
- "str q11, [x20, #0x0]\n"
954
- "add x20, x20, %x[res_stride]\n"
955
- "str q13, [x20, #0x0]\n"
956
- "add x20, x20, %x[res_stride]\n"
957
- "str q23, [x20, #0x0]\n"
958
- "add x20, x20, %x[res_stride]\n"
959
- "str q16, [x20, #0x0]\n"
960
- "add x20, x20, %x[res_stride]\n"
961
- "str q25, [x20, #0x0]\n"
962
- "add x20, x20, %x[res_stride]\n"
963
- "str q7, [x20, #0x0]\n"
964
- "add x20, x20, %x[res_stride]\n"
965
- "str q0, [x20, #0x0]\n"
966
- "add x20, x20, %x[res_stride]\n"
967
- "str q4, [x20, #0x0]\n"
968
- "add x20, x20, %x[res_stride]\n"
969
- "str q5, [x20, #0x0]\n"
970
- "add x20, x20, %x[res_stride]\n"
971
- "str q21, [x20, #0x0]\n"
972
- "add x20, x20, %x[res_stride]\n"
973
- "str q8, [x20, #0x0]\n"
974
- "add x20, x20, %x[res_stride]\n"
975
- "str q1, [x20, #0x0]\n"
976
- "bne 2b\n"
977
- "mov x20, #0x4\n"
978
- "sub x10, x10, #0x10\n"
979
- "cmp x10, #0x10\n"
980
- "mov %x[res_ptr], x26\n"
981
- "madd %x[a_ptr], x20, x9, %x[a_ptr]\n"
982
- "bge 1b\n"
983
- "4:" // Row loop skip
984
- "cbz x10, 9f\n"
985
- "5:" // Row tail: Row loop
986
- "add x24, %x[b_ptr], #0x8\n"
987
- "mov x23, %x[nc]\n"
988
- "add x22, %x[res_ptr], %x[res_stride], LSL #2\n"
989
- "6:" // Row tail: Column loop
990
- "movi v15.16b, #0x0\n"
991
- "movi v19.16b, #0x0\n"
992
- "add x25, %x[a_ptr], #0x8\n"
993
- "mov x21, %x[nb]\n"
994
- "movi v18.16b, #0x0\n"
995
- "movi v14.16b, #0x0\n"
996
- "7:" // Row tail: Block loop
997
- "ldr q7, [x24, #0x0]\n"
998
- "ldr q5, [x25, #0x0]\n"
999
- "movi v9.16b, #0x4\n"
1000
- "movi v4.4s, #0x0\n"
1001
- "ldr q3, [x24, #0x10]\n"
1002
- "ldr q2, [x25, #0x10]\n"
1003
- "movi v1.4s, #0x0\n"
1004
- "movi v0.4s, #0x0\n"
1005
- "ldr q13, [x24, #0x20]\n"
1006
- "ldr q31, [x25, #0x20]\n"
1007
- "movi v30.4s, #0x0\n"
1008
- "movi v29.16b, #0xf0\n"
1009
- "ldr q28, [x24, #0x30]\n"
1010
- "ldr q27, [x25, #0x30]\n"
1011
- "sshl v20.16b, v7.16b, v9.16b\n"
1012
- "sub x20, x24, #0x8\n"
1013
- "ldr q26, [x25, #0x40]\n"
1014
- "ldr q25, [x25, #0x50]\n"
1015
- "sshl v17.16b, v3.16b, v9.16b\n"
1016
- "and v7.16b, v7.16b, v29.16b\n"
1017
- "ldr q24, [x25, #0x60]\n"
1018
- "ldr q16, [x25, #0x70]\n"
1019
- "sshl v22.16b, v13.16b, v9.16b\n"
1020
- "and v3.16b, v3.16b, v29.16b\n"
1021
- "ldr d21, [x20, #0x0]\n"
1022
- "ldr d12, [x25, #-0x8]\n"
1023
- ".inst 0x4f85e284 // sdot v4.4s, v20.16b, v5.4b[0]\n"
1024
- ".inst 0x4fa5e281 // sdot v1.4s, v20.16b, v5.4b[1]\n"
1025
- ".inst 0x4f85ea80 // sdot v0.4s, v20.16b, v5.4b[2]\n"
1026
- ".inst 0x4fa5ea9e // sdot v30.4s, v20.16b, v5.4b[3]\n"
1027
- "sshl v9.16b, v28.16b, v9.16b\n"
1028
- "subs x21, x21, #0x1\n"
1029
- "and v13.16b, v13.16b, v29.16b\n"
1030
- "and v28.16b, v28.16b, v29.16b\n"
1031
- "add x25, x25, #0x88\n"
1032
- "add x24, x24, #0x48\n"
1033
- "fcvtl v21.4s, v21.4h\n"
1034
- "fcvtl v12.4s, v12.4h\n"
1035
- ".inst 0x4f82e224 // sdot v4.4s, v17.16b, v2.4b[0]\n"
1036
- ".inst 0x4fa2e221 // sdot v1.4s, v17.16b, v2.4b[1]\n"
1037
- ".inst 0x4f82ea20 // sdot v0.4s, v17.16b, v2.4b[2]\n"
1038
- ".inst 0x4fa2ea3e // sdot v30.4s, v17.16b, v2.4b[3]\n"
1039
- "fmul v11.4s, v21.4s, v12.s[0]\n"
1040
- "fmul v23.4s, v21.4s, v12.s[1]\n"
1041
- "fmul v17.4s, v21.4s, v12.s[2]\n"
1042
- ".inst 0x4f9fe2c4 // sdot v4.4s, v22.16b, v31.4b[0]\n"
1043
- "fmul v6.4s, v21.4s, v12.s[3]\n"
1044
- ".inst 0x4fbfe2c1 // sdot v1.4s, v22.16b, v31.4b[1]\n"
1045
- ".inst 0x4f9feac0 // sdot v0.4s, v22.16b, v31.4b[2]\n"
1046
- ".inst 0x4fbfeade // sdot v30.4s, v22.16b, v31.4b[3]\n"
1047
- ".inst 0x4f9be124 // sdot v4.4s, v9.16b, v27.4b[0]\n"
1048
- ".inst 0x4fbbe121 // sdot v1.4s, v9.16b, v27.4b[1]\n"
1049
- ".inst 0x4f9be920 // sdot v0.4s, v9.16b, v27.4b[2]\n"
1050
- ".inst 0x4fbbe93e // sdot v30.4s, v9.16b, v27.4b[3]\n"
1051
- ".inst 0x4f9ae0e4 // sdot v4.4s, v7.16b, v26.4b[0]\n"
1052
- ".inst 0x4fbae0e1 // sdot v1.4s, v7.16b, v26.4b[1]\n"
1053
- ".inst 0x4f9ae8e0 // sdot v0.4s, v7.16b, v26.4b[2]\n"
1054
- ".inst 0x4fbae8fe // sdot v30.4s, v7.16b, v26.4b[3]\n"
1055
- ".inst 0x4f99e064 // sdot v4.4s, v3.16b, v25.4b[0]\n"
1056
- ".inst 0x4fb9e061 // sdot v1.4s, v3.16b, v25.4b[1]\n"
1057
- ".inst 0x4f99e860 // sdot v0.4s, v3.16b, v25.4b[2]\n"
1058
- ".inst 0x4fb9e87e // sdot v30.4s, v3.16b, v25.4b[3]\n"
1059
- ".inst 0x4f98e1a4 // sdot v4.4s, v13.16b, v24.4b[0]\n"
1060
- ".inst 0x4fb8e1a1 // sdot v1.4s, v13.16b, v24.4b[1]\n"
1061
- ".inst 0x4f98e9a0 // sdot v0.4s, v13.16b, v24.4b[2]\n"
1062
- ".inst 0x4fb8e9be // sdot v30.4s, v13.16b, v24.4b[3]\n"
1063
- ".inst 0x4f90e384 // sdot v4.4s, v28.16b, v16.4b[0]\n"
1064
- ".inst 0x4fb0e381 // sdot v1.4s, v28.16b, v16.4b[1]\n"
1065
- ".inst 0x4f90eb80 // sdot v0.4s, v28.16b, v16.4b[2]\n"
1066
- ".inst 0x4fb0eb9e // sdot v30.4s, v28.16b, v16.4b[3]\n"
1067
- "scvtf v4.4s, v4.4s, #0x4\n"
1068
- "scvtf v1.4s, v1.4s, #0x4\n"
1069
- "scvtf v0.4s, v0.4s, #0x4\n"
1070
- "fmla v15.4s, v4.4s, v11.4s\n"
1071
- "scvtf v30.4s, v30.4s, #0x4\n"
1072
- "fmla v19.4s, v1.4s, v23.4s\n"
1073
- "fmla v18.4s, v0.4s, v17.4s\n"
1074
- "fmla v14.4s, v30.4s, v6.4s\n"
1075
- "bgt 7b\n"
1076
- "mov x20, %x[res_ptr]\n"
1077
- "cmp x10, #0x1\n"
1078
- "str q15, [x20, #0x0]\n"
1079
- "add x20, x20, %x[res_stride]\n"
1080
- "ble 8f\n"
1081
- "cmp x10, #0x2\n"
1082
- "str q19, [x20, #0x0]\n"
1083
- "add x20, x20, %x[res_stride]\n"
1084
- "ble 8f\n"
1085
- "cmp x10, #0x3\n"
1086
- "str q18, [x20, #0x0]\n"
1087
- "add x20, x20, %x[res_stride]\n"
1088
- "ble 8f\n"
1089
- "str q14, [x20, #0x0]\n"
1090
- "8:" // Row tail: Accumulator store skip
1091
- "subs x23, x23, #0x4\n"
1092
- "add %x[res_ptr], %x[res_ptr], #0x10\n"
1093
- "bne 6b\n"
1094
- "subs x10, x10, #0x4\n"
1095
- "add %x[a_ptr], %x[a_ptr], x9\n"
1096
- "mov %x[res_ptr], x22\n"
1097
- "bgt 5b\n"
1098
- "9:" // Row tail: Row loop skip
1099
- : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr)
1100
- : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc)
1101
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
1102
- );
1103
- return;
1104
- }
1105
  #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
1106
  {
1107
  float sumf[4][4];
@@ -1160,404 +1152,402 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
1160
  UNUSED(blocklen);
1161
 
1162
  #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
1163
- if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
1164
- const void * b_ptr = vx;
1165
- const void * a_ptr = vy;
1166
- float * res_ptr = s;
1167
- size_t res_stride = bs * sizeof(float);
1168
-
1169
- __asm__ __volatile__(
1170
- "mov x10, %x[nr]\n"
1171
- "mov x9, #0x88\n"
1172
- "cmp x10, #0x10\n"
1173
- "mul x9, %x[nb], x9\n"
1174
- "blt 4f\n"
1175
- "1:" // Row loop
1176
- "add x28, %x[b_ptr], #0x8\n"
1177
- "mov x27, %x[nc]\n"
1178
- "add x26, %x[res_ptr], %x[res_stride], LSL #4\n"
1179
- "2:" // Column loop
1180
- "add x25, %x[a_ptr], #0x8\n"
1181
- "movi v2.16b, #0x0\n"
1182
- "movi v10.16b, #0x0\n"
1183
- "mov x24, %x[nb]\n"
1184
- "add x23, x25, x9\n"
1185
- "movi v12.16b, #0x0\n"
1186
- "movi v28.16b, #0x0\n"
1187
- "add x22, x23, x9\n"
1188
- "movi v11.16b, #0x0\n"
1189
- "movi v13.16b, #0x0\n"
1190
- "add x21, x22, x9\n"
1191
- "movi v22.16b, #0x0\n"
1192
- "movi v23.16b, #0x0\n"
1193
- "movi v25.16b, #0x0\n"
1194
- "movi v5.16b, #0x0\n"
1195
- "movi v7.16b, #0x0\n"
1196
- "movi v4.16b, #0x0\n"
1197
- "movi v6.16b, #0x0\n"
1198
- "movi v30.16b, #0x0\n"
1199
- "movi v24.16b, #0x0\n"
1200
- "movi v14.16b, #0x0\n"
1201
- "3:" // Block loop
1202
- "ldr q21, [x28, #0x0]\n"
1203
- "ldr q16, [x28, #0x10]\n"
1204
- "movi v1.16b, #0x4\n"
1205
- "movi v19.4s, #0x0\n"
1206
- "ldr q27, [x25, #0x0]\n"
1207
- "ldr q15, [x25, #0x10]\n"
1208
- "movi v26.4s, #0x0\n"
1209
- "movi v18.4s, #0x0\n"
1210
- "ldr q29, [x28, #0x20]\n"
1211
- "ldr q3, [x28, #0x30]\n"
1212
- "movi v17.4s, #0x0\n"
1213
- "movi v0.16b, #0xf0\n"
1214
- "ldr d20, [x25, #-0x8]\n"
1215
- "ldr d9, [x23, #-0x8]\n"
1216
- "sshl v8.16b, v21.16b, v1.16b\n"
1217
- "sshl v31.16b, v16.16b, v1.16b\n"
1218
- "and v21.16b, v21.16b, v0.16b\n"
1219
- "and v16.16b, v16.16b, v0.16b\n"
1220
- "sub x20, x28, #0x8\n"
1221
- "subs x24, x24, #0x1\n"
1222
- "add x28, x28, #0x48\n"
1223
- ".inst 0x4e88a773 // smmla v19.4s, v27.16b, v8.16b\n"
1224
- ".inst 0x4e9fa77a // smmla v26.4s, v27.16b, v31.16b\n"
1225
- "ldr q27, [x25, #0x20]\n"
1226
- ".inst 0x4e88a5f2 // smmla v18.4s, v15.16b, v8.16b\n"
1227
- ".inst 0x4e9fa5f1 // smmla v17.4s, v15.16b, v31.16b\n"
1228
- "sshl v15.16b, v29.16b, v1.16b\n"
1229
- "sshl v1.16b, v3.16b, v1.16b\n"
1230
- "and v29.16b, v29.16b, v0.16b\n"
1231
- "and v3.16b, v3.16b, v0.16b\n"
1232
- "ldr q0, [x25, #0x30]\n"
1233
- "fcvtl v20.4s, v20.4h\n"
1234
- ".inst 0x4e8fa773 // smmla v19.4s, v27.16b, v15.16b\n"
1235
- "fcvtl v9.4s, v9.4h\n"
1236
- ".inst 0x4e81a77a // smmla v26.4s, v27.16b, v1.16b\n"
1237
- "ldr q27, [x25, #0x40]\n"
1238
- ".inst 0x4e8fa412 // smmla v18.4s, v0.16b, v15.16b\n"
1239
- ".inst 0x4e81a411 // smmla v17.4s, v0.16b, v1.16b\n"
1240
- "ldr q0, [x25, #0x50]\n"
1241
- ".inst 0x4e95a773 // smmla v19.4s, v27.16b, v21.16b\n"
1242
- ".inst 0x4e90a77a // smmla v26.4s, v27.16b, v16.16b\n"
1243
- "ldr q27, [x25, #0x60]\n"
1244
- ".inst 0x4e95a412 // smmla v18.4s, v0.16b, v21.16b\n"
1245
- ".inst 0x4e90a411 // smmla v17.4s, v0.16b, v16.16b\n"
1246
- "ldr q0, [x25, #0x70]\n"
1247
- "add x25, x25, #0x88\n"
1248
- ".inst 0x4e9da773 // smmla v19.4s, v27.16b, v29.16b\n"
1249
- ".inst 0x4e83a77a // smmla v26.4s, v27.16b, v3.16b\n"
1250
- "ldr d27, [x20, #0x0]\n"
1251
- ".inst 0x4e9da412 // smmla v18.4s, v0.16b, v29.16b\n"
1252
- ".inst 0x4e83a411 // smmla v17.4s, v0.16b, v3.16b\n"
1253
- "fcvtl v27.4s, v27.4h\n"
1254
- "uzp1 v0.2d, v19.2d, v26.2d\n"
1255
- "uzp2 v26.2d, v19.2d, v26.2d\n"
1256
- "fmul v19.4s, v27.4s, v20.s[0]\n"
1257
- "scvtf v0.4s, v0.4s, #0x4\n"
1258
- "scvtf v26.4s, v26.4s, #0x4\n"
1259
- "fmla v2.4s, v0.4s, v19.4s\n"
1260
- "ldr q19, [x23, #0x0]\n"
1261
- "uzp1 v0.2d, v18.2d, v17.2d\n"
1262
- "uzp2 v18.2d, v18.2d, v17.2d\n"
1263
- "fmul v17.4s, v27.4s, v20.s[1]\n"
1264
- "scvtf v0.4s, v0.4s, #0x4\n"
1265
- "scvtf v18.4s, v18.4s, #0x4\n"
1266
- "fmla v10.4s, v26.4s, v17.4s\n"
1267
- "ldr q17, [x23, #0x10]\n"
1268
- "fmul v26.4s, v27.4s, v20.s[2]\n"
1269
- "fmul v20.4s, v27.4s, v20.s[3]\n"
1270
- "fmla v12.4s, v0.4s, v26.4s\n"
1271
- "ldr d0, [x22, #-0x8]\n"
1272
- "ldr d26, [x21, #-0x8]\n"
1273
- "fcvtl v0.4s, v0.4h\n"
1274
- "fmla v28.4s, v18.4s, v20.4s\n"
1275
- "movi v20.4s, #0x0\n"
1276
- "movi v18.4s, #0x0\n"
1277
- ".inst 0x4e88a674 // smmla v20.4s, v19.16b, v8.16b\n"
1278
- ".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n"
1279
- "ldr q19, [x23, #0x20]\n"
1280
- "fcvtl v26.4s, v26.4h\n"
1281
- ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n"
1282
- ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n"
1283
- "ldr q19, [x23, #0x40]\n"
1284
- ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n"
1285
- ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n"
1286
- "ldr q19, [x23, #0x60]\n"
1287
- ".inst 0x4e9da674 // smmla v20.4s, v19.16b, v29.16b\n"
1288
- ".inst 0x4e83a672 // smmla v18.4s, v19.16b, v3.16b\n"
1289
- "uzp1 v19.2d, v20.2d, v18.2d\n"
1290
- "scvtf v19.4s, v19.4s, #0x4\n"
1291
- "uzp2 v20.2d, v20.2d, v18.2d\n"
1292
- "fmul v18.4s, v27.4s, v9.s[0]\n"
1293
- "scvtf v20.4s, v20.4s, #0x4\n"
1294
- "fmla v11.4s, v19.4s, v18.4s\n"
1295
- "ldr q18, [x22, #0x0]\n"
1296
- "fmul v19.4s, v27.4s, v9.s[1]\n"
1297
- "fmla v13.4s, v20.4s, v19.4s\n"
1298
- "movi v19.4s, #0x0\n"
1299
- "movi v20.4s, #0x0\n"
1300
- ".inst 0x4e88a633 // smmla v19.4s, v17.16b, v8.16b\n"
1301
- ".inst 0x4e9fa634 // smmla v20.4s, v17.16b, v31.16b\n"
1302
- "ldr q17, [x23, #0x30]\n"
1303
- ".inst 0x4e8fa633 // smmla v19.4s, v17.16b, v15.16b\n"
1304
- ".inst 0x4e81a634 // smmla v20.4s, v17.16b, v1.16b\n"
1305
- "ldr q17, [x23, #0x50]\n"
1306
- ".inst 0x4e95a633 // smmla v19.4s, v17.16b, v21.16b\n"
1307
- ".inst 0x4e90a634 // smmla v20.4s, v17.16b, v16.16b\n"
1308
- "ldr q17, [x23, #0x70]\n"
1309
- "add x23, x23, #0x88\n"
1310
- ".inst 0x4e9da633 // smmla v19.4s, v17.16b, v29.16b\n"
1311
- ".inst 0x4e83a634 // smmla v20.4s, v17.16b, v3.16b\n"
1312
- "uzp1 v17.2d, v19.2d, v20.2d\n"
1313
- "scvtf v17.4s, v17.4s, #0x4\n"
1314
- "uzp2 v20.2d, v19.2d, v20.2d\n"
1315
- "fmul v19.4s, v27.4s, v9.s[2]\n"
1316
- "fmul v9.4s, v27.4s, v9.s[3]\n"
1317
- "scvtf v20.4s, v20.4s, #0x4\n"
1318
- "fmla v22.4s, v17.4s, v19.4s\n"
1319
- "ldr q17, [x22, #0x10]\n"
1320
- "movi v19.4s, #0x0\n"
1321
- ".inst 0x4e88a653 // smmla v19.4s, v18.16b, v8.16b\n"
1322
- "fmla v23.4s, v20.4s, v9.4s\n"
1323
- "movi v20.4s, #0x0\n"
1324
- "movi v9.4s, #0x0\n"
1325
- ".inst 0x4e9fa654 // smmla v20.4s, v18.16b, v31.16b\n"
1326
- "ldr q18, [x22, #0x20]\n"
1327
- ".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n"
1328
- ".inst 0x4e8fa653 // smmla v19.4s, v18.16b, v15.16b\n"
1329
- ".inst 0x4e81a654 // smmla v20.4s, v18.16b, v1.16b\n"
1330
- "ldr q18, [x22, #0x40]\n"
1331
- ".inst 0x4e95a653 // smmla v19.4s, v18.16b, v21.16b\n"
1332
- ".inst 0x4e90a654 // smmla v20.4s, v18.16b, v16.16b\n"
1333
- "ldr q18, [x22, #0x60]\n"
1334
- ".inst 0x4e9da653 // smmla v19.4s, v18.16b, v29.16b\n"
1335
- ".inst 0x4e83a654 // smmla v20.4s, v18.16b, v3.16b\n"
1336
- "movi v18.4s, #0x0\n"
1337
- ".inst 0x4e9fa632 // smmla v18.4s, v17.16b, v31.16b\n"
1338
- "ldr q17, [x22, #0x30]\n"
1339
- ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n"
1340
- ".inst 0x4e81a632 // smmla v18.4s, v17.16b, v1.16b\n"
1341
- "ldr q17, [x22, #0x50]\n"
1342
- ".inst 0x4e95a629 // smmla v9.4s, v17.16b, v21.16b\n"
1343
- ".inst 0x4e90a632 // smmla v18.4s, v17.16b, v16.16b\n"
1344
- "ldr q17, [x22, #0x70]\n"
1345
- "add x22, x22, #0x88\n"
1346
- ".inst 0x4e9da629 // smmla v9.4s, v17.16b, v29.16b\n"
1347
- ".inst 0x4e83a632 // smmla v18.4s, v17.16b, v3.16b\n"
1348
- "uzp1 v17.2d, v19.2d, v20.2d\n"
1349
- "uzp2 v20.2d, v19.2d, v20.2d\n"
1350
- "fmul v19.4s, v27.4s, v0.s[0]\n"
1351
- "scvtf v17.4s, v17.4s, #0x4\n"
1352
- "scvtf v20.4s, v20.4s, #0x4\n"
1353
- "fmla v25.4s, v17.4s, v19.4s\n"
1354
- "ldr q19, [x21, #0x0]\n"
1355
- "fmul v17.4s, v27.4s, v0.s[1]\n"
1356
- "fmla v5.4s, v20.4s, v17.4s\n"
1357
- "ldr q17, [x21, #0x10]\n"
1358
- "uzp1 v20.2d, v9.2d, v18.2d\n"
1359
- "uzp2 v9.2d, v9.2d, v18.2d\n"
1360
- "fmul v18.4s, v27.4s, v0.s[2]\n"
1361
- "fmul v0.4s, v27.4s, v0.s[3]\n"
1362
- "scvtf v20.4s, v20.4s, #0x4\n"
1363
- "scvtf v9.4s, v9.4s, #0x4\n"
1364
- "fmla v7.4s, v20.4s, v18.4s\n"
1365
- "movi v20.4s, #0x0\n"
1366
- "movi v18.4s, #0x0\n"
1367
- ".inst 0x4e88a674 // smmla v20.4s, v19.16b, v8.16b\n"
1368
- ".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n"
1369
- "ldr q19, [x21, #0x20]\n"
1370
- "fmla v4.4s, v9.4s, v0.4s\n"
1371
- "movi v9.4s, #0x0\n"
1372
- "movi v0.4s, #0x0\n"
1373
- ".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n"
1374
- "fmul v8.4s, v27.4s, v26.s[0]\n"
1375
- ".inst 0x4e9fa620 // smmla v0.4s, v17.16b, v31.16b\n"
1376
- "ldr q17, [x21, #0x30]\n"
1377
- ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n"
1378
- "fmul v31.4s, v27.4s, v26.s[1]\n"
1379
- ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n"
1380
- "ldr q19, [x21, #0x40]\n"
1381
- ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n"
1382
- "fmul v15.4s, v27.4s, v26.s[2]\n"
1383
- "fmul v27.4s, v27.4s, v26.s[3]\n"
1384
- ".inst 0x4e81a620 // smmla v0.4s, v17.16b, v1.16b\n"
1385
- "ldr q1, [x21, #0x50]\n"
1386
- ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n"
1387
- ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n"
1388
- "ldr q26, [x21, #0x60]\n"
1389
- ".inst 0x4e95a429 // smmla v9.4s, v1.16b, v21.16b\n"
1390
- ".inst 0x4e90a420 // smmla v0.4s, v1.16b, v16.16b\n"
1391
- "ldr q21, [x21, #0x70]\n"
1392
- "add x21, x21, #0x88\n"
1393
- ".inst 0x4e9da754 // smmla v20.4s, v26.16b, v29.16b\n"
1394
- ".inst 0x4e83a752 // smmla v18.4s, v26.16b, v3.16b\n"
1395
- ".inst 0x4e9da6a9 // smmla v9.4s, v21.16b, v29.16b\n"
1396
- ".inst 0x4e83a6a0 // smmla v0.4s, v21.16b, v3.16b\n"
1397
- "uzp1 v29.2d, v20.2d, v18.2d\n"
1398
- "uzp2 v21.2d, v20.2d, v18.2d\n"
1399
- "scvtf v29.4s, v29.4s, #0x4\n"
1400
- "uzp1 v18.2d, v9.2d, v0.2d\n"
1401
- "uzp2 v16.2d, v9.2d, v0.2d\n"
1402
- "scvtf v21.4s, v21.4s, #0x4\n"
1403
- "fmla v6.4s, v29.4s, v8.4s\n"
1404
- "scvtf v18.4s, v18.4s, #0x4\n"
1405
- "scvtf v16.4s, v16.4s, #0x4\n"
1406
- "fmla v30.4s, v21.4s, v31.4s\n"
1407
- "fmla v24.4s, v18.4s, v15.4s\n"
1408
- "fmla v14.4s, v16.4s, v27.4s\n"
1409
- "bgt 3b\n"
1410
- "mov x20, %x[res_ptr]\n"
1411
- "subs x27, x27, #0x4\n"
1412
- "add %x[res_ptr], %x[res_ptr], #0x10\n"
1413
- "str q2, [x20, #0x0]\n"
1414
- "add x20, x20, %x[res_stride]\n"
1415
- "str q10, [x20, #0x0]\n"
1416
- "add x20, x20, %x[res_stride]\n"
1417
- "str q12, [x20, #0x0]\n"
1418
- "add x20, x20, %x[res_stride]\n"
1419
- "str q28, [x20, #0x0]\n"
1420
- "add x20, x20, %x[res_stride]\n"
1421
- "str q11, [x20, #0x0]\n"
1422
- "add x20, x20, %x[res_stride]\n"
1423
- "str q13, [x20, #0x0]\n"
1424
- "add x20, x20, %x[res_stride]\n"
1425
- "str q22, [x20, #0x0]\n"
1426
- "add x20, x20, %x[res_stride]\n"
1427
- "str q23, [x20, #0x0]\n"
1428
- "add x20, x20, %x[res_stride]\n"
1429
- "str q25, [x20, #0x0]\n"
1430
- "add x20, x20, %x[res_stride]\n"
1431
- "str q5, [x20, #0x0]\n"
1432
- "add x20, x20, %x[res_stride]\n"
1433
- "str q7, [x20, #0x0]\n"
1434
- "add x20, x20, %x[res_stride]\n"
1435
- "str q4, [x20, #0x0]\n"
1436
- "add x20, x20, %x[res_stride]\n"
1437
- "str q6, [x20, #0x0]\n"
1438
- "add x20, x20, %x[res_stride]\n"
1439
- "str q30, [x20, #0x0]\n"
1440
- "add x20, x20, %x[res_stride]\n"
1441
- "str q24, [x20, #0x0]\n"
1442
- "add x20, x20, %x[res_stride]\n"
1443
- "str q14, [x20, #0x0]\n"
1444
- "bne 2b\n"
1445
- "mov x20, #0x4\n"
1446
- "sub x10, x10, #0x10\n"
1447
- "cmp x10, #0x10\n"
1448
- "mov %x[res_ptr], x26\n"
1449
- "madd %x[a_ptr], x20, x9, %x[a_ptr]\n"
1450
- "bge 1b\n"
1451
- "4:" // Row loop skip
1452
- "cbz x10, 9f\n"
1453
- "5:" // Row tail: Row loop
1454
- "add x24, %x[b_ptr], #0x8\n"
1455
- "mov x23, %x[nc]\n"
1456
- "add x22, %x[res_ptr], %x[res_stride], LSL #2\n"
1457
- "6:" // Row tail: Column loop
1458
- "movi v2.16b, #0x0\n"
1459
- "movi v10.16b, #0x0\n"
1460
- "add x25, %x[a_ptr], #0x8\n"
1461
- "mov x21, %x[nb]\n"
1462
- "movi v12.16b, #0x0\n"
1463
- "movi v28.16b, #0x0\n"
1464
- "7:" // Row tail: Block loop
1465
- "ldr q6, [x24, #0x0]\n"
1466
- "ldr q5, [x24, #0x10]\n"
1467
- "movi v17.16b, #0x4\n"
1468
- "movi v8.4s, #0x0\n"
1469
- "ldr q4, [x25, #0x0]\n"
1470
- "ldr q13, [x25, #0x10]\n"
1471
- "movi v27.4s, #0x0\n"
1472
- "movi v0.4s, #0x0\n"
1473
- "ldr q31, [x24, #0x20]\n"
1474
- "ldr q14, [x24, #0x30]\n"
1475
- "movi v29.4s, #0x0\n"
1476
- "movi v22.16b, #0xf0\n"
1477
- "ldr q11, [x25, #0x20]\n"
1478
- "ldr q23, [x25, #0x30]\n"
1479
- "sshl v21.16b, v6.16b, v17.16b\n"
1480
- "sshl v16.16b, v5.16b, v17.16b\n"
1481
- "ldr q20, [x25, #0x40]\n"
1482
- "ldr q26, [x25, #0x50]\n"
1483
- "and v6.16b, v6.16b, v22.16b\n"
1484
- "and v5.16b, v5.16b, v22.16b\n"
1485
- "ldr q25, [x25, #0x60]\n"
1486
- "ldr q3, [x25, #0x70]\n"
1487
- "sshl v19.16b, v31.16b, v17.16b\n"
1488
- "sshl v18.16b, v14.16b, v17.16b\n"
1489
- "ldr d17, [x25, #-0x8]\n"
1490
- ".inst 0x4e95a488 // smmla v8.4s, v4.16b, v21.16b\n"
1491
- ".inst 0x4e90a49b // smmla v27.4s, v4.16b, v16.16b\n"
1492
- "and v31.16b, v31.16b, v22.16b\n"
1493
- ".inst 0x4e95a5a0 // smmla v0.4s, v13.16b, v21.16b\n"
1494
- ".inst 0x4e90a5bd // smmla v29.4s, v13.16b, v16.16b\n"
1495
- "and v14.16b, v14.16b, v22.16b\n"
1496
- "sub x20, x24, #0x8\n"
1497
- "ldr d16, [x20, #0x0]\n"
1498
- "subs x21, x21, #0x1\n"
1499
- "add x25, x25, #0x88\n"
1500
- "fcvtl v17.4s, v17.4h\n"
1501
- "add x24, x24, #0x48\n"
1502
- ".inst 0x4e93a568 // smmla v8.4s, v11.16b, v19.16b\n"
1503
- ".inst 0x4e92a57b // smmla v27.4s, v11.16b, v18.16b\n"
1504
- ".inst 0x4e93a6e0 // smmla v0.4s, v23.16b, v19.16b\n"
1505
- ".inst 0x4e92a6fd // smmla v29.4s, v23.16b, v18.16b\n"
1506
- "fcvtl v16.4s, v16.4h\n"
1507
- ".inst 0x4e86a688 // smmla v8.4s, v20.16b, v6.16b\n"
1508
- ".inst 0x4e85a69b // smmla v27.4s, v20.16b, v5.16b\n"
1509
- "fmul v23.4s, v16.4s, v17.s[0]\n"
1510
- "fmul v21.4s, v16.4s, v17.s[1]\n"
1511
- "fmul v1.4s, v16.4s, v17.s[2]\n"
1512
- "fmul v20.4s, v16.4s, v17.s[3]\n"
1513
- ".inst 0x4e86a740 // smmla v0.4s, v26.16b, v6.16b\n"
1514
- ".inst 0x4e85a75d // smmla v29.4s, v26.16b, v5.16b\n"
1515
- ".inst 0x4e9fa728 // smmla v8.4s, v25.16b, v31.16b\n"
1516
- ".inst 0x4e8ea73b // smmla v27.4s, v25.16b, v14.16b\n"
1517
- ".inst 0x4e9fa460 // smmla v0.4s, v3.16b, v31.16b\n"
1518
- ".inst 0x4e8ea47d // smmla v29.4s, v3.16b, v14.16b\n"
1519
- "uzp1 v19.2d, v8.2d, v27.2d\n"
1520
- "uzp2 v18.2d, v8.2d, v27.2d\n"
1521
- "scvtf v19.4s, v19.4s, #0x4\n"
1522
- "uzp1 v17.2d, v0.2d, v29.2d\n"
1523
- "uzp2 v16.2d, v0.2d, v29.2d\n"
1524
- "scvtf v18.4s, v18.4s, #0x4\n"
1525
- "fmla v2.4s, v19.4s, v23.4s\n"
1526
- "scvtf v17.4s, v17.4s, #0x4\n"
1527
- "scvtf v16.4s, v16.4s, #0x4\n"
1528
- "fmla v10.4s, v18.4s, v21.4s\n"
1529
- "fmla v12.4s, v17.4s, v1.4s\n"
1530
- "fmla v28.4s, v16.4s, v20.4s\n"
1531
- "bgt 7b\n"
1532
- "mov x20, %x[res_ptr]\n"
1533
- "cmp x10, #0x1\n"
1534
- "str q2, [x20, #0x0]\n"
1535
- "add x20, x20, %x[res_stride]\n"
1536
- "ble 8f\n"
1537
- "cmp x10, #0x2\n"
1538
- "str q10, [x20, #0x0]\n"
1539
- "add x20, x20, %x[res_stride]\n"
1540
- "ble 8f\n"
1541
- "cmp x10, #0x3\n"
1542
- "str q12, [x20, #0x0]\n"
1543
- "add x20, x20, %x[res_stride]\n"
1544
- "ble 8f\n"
1545
- "str q28, [x20, #0x0]\n"
1546
- "8:" // Row tail: Accumulator store skip
1547
- "subs x23, x23, #0x4\n"
1548
- "add %x[res_ptr], %x[res_ptr], #0x10\n"
1549
- "bne 6b\n"
1550
- "subs x10, x10, #0x4\n"
1551
- "add %x[a_ptr], %x[a_ptr], x9\n"
1552
- "mov %x[res_ptr], x22\n"
1553
- "bgt 5b\n"
1554
- "9:" // Row tail: Row loop skip
1555
- : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr)
1556
- : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc)
1557
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
1558
- );
1559
- return;
1560
- }
1561
  #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
1562
  float sumf[4][4];
1563
  int sumi;
@@ -1615,7 +1605,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
1615
 
1616
  #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
1617
  #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
1618
- if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0) {
1619
  const void * b_ptr = vx;
1620
  const void * a_ptr = vy;
1621
  float * res_ptr = s;
@@ -2083,59 +2073,57 @@ void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
2083
  UNUSED(blocklen);
2084
 
2085
  #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
2086
- if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
2087
- const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl);
2088
 
2089
- for (int y = 0; y < nr / 4; y++) {
2090
- const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
2091
- for (int x = 0; x < nc / ncols_interleaved; x++) {
2092
- const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
2093
 
2094
- float32x4_t sumf[4];
2095
- for (int m = 0; m < 4; m++) {
2096
- sumf[m] = vdupq_n_f32(0);
2097
- }
2098
 
2099
- for (int l = 0; l < nb; l++) {
2100
- float32x4_t a_d = vcvt_f32_f16(vld1_f16((const float16_t *)a_ptr[l].d));
2101
- float32x4_t b_d = vcvt_f32_f16(vld1_f16((const float16_t *)b_ptr[l].d));
2102
-
2103
- int32x4_t sumi_0 = vdupq_n_s32(0);
2104
- int32x4_t sumi_1 = vdupq_n_s32(0);
2105
- int32x4_t sumi_2 = vdupq_n_s32(0);
2106
- int32x4_t sumi_3 = vdupq_n_s32(0);
2107
-
2108
- for (int k = 0; k < 4; k++) {
2109
- int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 16 * k + 0);
2110
- int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16 * k + 64);
2111
-
2112
- uint8x16_t b = vld1q_u8(b_ptr[l].qs + 16 * k);
2113
- int8x16_t b_hi = vqtbl1q_s8(kvalues, b >> 4);
2114
- int8x16_t b_lo = vqtbl1q_s8(kvalues, b & 0xF);
2115
-
2116
- sumi_0 = vdotq_laneq_s32(sumi_0, b_lo, a_0, 0);
2117
- sumi_1 = vdotq_laneq_s32(sumi_1, b_lo, a_0, 1);
2118
- sumi_2 = vdotq_laneq_s32(sumi_2, b_lo, a_0, 2);
2119
- sumi_3 = vdotq_laneq_s32(sumi_3, b_lo, a_0, 3);
2120
- sumi_0 = vdotq_laneq_s32(sumi_0, b_hi, a_1, 0);
2121
- sumi_1 = vdotq_laneq_s32(sumi_1, b_hi, a_1, 1);
2122
- sumi_2 = vdotq_laneq_s32(sumi_2, b_hi, a_1, 2);
2123
- sumi_3 = vdotq_laneq_s32(sumi_3, b_hi, a_1, 3);
2124
- }
2125
 
2126
- sumf[0] = vmlaq_f32(sumf[0], vmulq_laneq_f32(b_d, a_d, 0), vcvtq_f32_s32(sumi_0));
2127
- sumf[1] = vmlaq_f32(sumf[1], vmulq_laneq_f32(b_d, a_d, 1), vcvtq_f32_s32(sumi_1));
2128
- sumf[2] = vmlaq_f32(sumf[2], vmulq_laneq_f32(b_d, a_d, 2), vcvtq_f32_s32(sumi_2));
2129
- sumf[3] = vmlaq_f32(sumf[3], vmulq_laneq_f32(b_d, a_d, 3), vcvtq_f32_s32(sumi_3));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2130
  }
2131
 
2132
- for (int m = 0; m < 4; m++) {
2133
- vst1q_f32(s + (y * 4 + m) * bs + x * 4, sumf[m]);
2134
- }
 
 
 
 
 
2135
  }
2136
  }
2137
- return;
2138
  }
 
2139
  #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
2140
  {
2141
  float sumf[4][4];
 
256
  UNUSED(blocklen);
257
 
258
  #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
259
+ const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx;
260
+
261
+ for (int c = 0; c < nc; c += ncols_interleaved) {
262
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
263
+ float32x4_t acc = vdupq_n_f32(0);
264
+ for (int b = 0; b < nb; b++) {
265
+ int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs);
266
+ int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16);
267
+ int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32);
268
+ int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48);
269
+ float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
270
+
271
+ int8x16_t a0 = vld1q_s8(a_ptr->qs);
272
+ int8x16_t a1 = vld1q_s8(a_ptr->qs + qk/2);
273
+ float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
274
+
275
+ int32x4_t ret = vdupq_n_s32(0);
276
+
277
+ ret = vdotq_laneq_s32(ret, b0 << 4, a0, 0);
278
+ ret = vdotq_laneq_s32(ret, b1 << 4, a0, 1);
279
+ ret = vdotq_laneq_s32(ret, b2 << 4, a0, 2);
280
+ ret = vdotq_laneq_s32(ret, b3 << 4, a0, 3);
281
+
282
+ ret = vdotq_laneq_s32(ret, b0 & 0xf0U, a1, 0);
283
+ ret = vdotq_laneq_s32(ret, b1 & 0xf0U, a1, 1);
284
+ ret = vdotq_laneq_s32(ret, b2 & 0xf0U, a1, 2);
285
+ ret = vdotq_laneq_s32(ret, b3 & 0xf0U, a1, 3);
286
+
287
+ acc = vfmaq_f32(acc, vcvtq_n_f32_s32(ret, 4),
288
+ vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd)));
289
+ a_ptr++;
290
+ b_ptr++;
 
 
 
 
291
  }
292
+ vst1q_f32(s, acc);
293
+ s += ncols_interleaved;
294
  }
295
+ return;
296
  #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
297
  float sumf[4];
298
  int sumi;
 
339
  UNUSED(blocklen);
340
 
341
  #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
342
+ const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx;
343
+
344
+ for (int c = 0; c < nc; c += ncols_interleaved) {
345
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
346
+ float32x4_t acc = vdupq_n_f32(0);
347
+ for (int b = 0; b < nb; b++) {
348
+ int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs);
349
+ int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16);
350
+ int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32);
351
+ int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48);
352
+ float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
353
+
354
+ int8x16_t a0 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs);
355
+ int8x16_t a1 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 1);
356
+ int8x16_t a2 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 2);
357
+ int8x16_t a3 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 3);
358
+ float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
359
+
360
+ int32x4_t ret0 = vdupq_n_s32(0);
361
+ int32x4_t ret1 = vdupq_n_s32(0);
362
+
363
+ ret0 = vdotq_s32(ret0, b0 << 4, a0);
364
+ ret1 = vdotq_s32(ret1, b1 << 4, a0);
365
+ ret0 = vdotq_s32(ret0, b2 << 4, a1);
366
+ ret1 = vdotq_s32(ret1, b3 << 4, a1);
367
+
368
+ ret0 = vdotq_s32(ret0, b0 & 0xf0U, a2);
369
+ ret1 = vdotq_s32(ret1, b1 & 0xf0U, a2);
370
+ ret0 = vdotq_s32(ret0, b2 & 0xf0U, a3);
371
+ ret1 = vdotq_s32(ret1, b3 & 0xf0U, a3);
372
+
373
+ int32x4_t ret = vpaddq_s32(ret0, ret1);
374
+
375
+ acc = vfmaq_f32(acc, vcvtq_n_f32_s32(ret, 4),
376
+ vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd)));
377
+ a_ptr++;
378
+ b_ptr++;
 
 
 
 
379
  }
380
+ vst1q_f32(s, acc);
381
+ s += ncols_interleaved;
382
  }
383
+ return;
384
  #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
385
  float sumf[4];
386
  int sumi;
 
428
 
429
  #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
430
  #if defined(__ARM_FEATURE_SVE)
431
+ if (ggml_cpu_get_sve_cnt() == QK8_0) {
432
  const void * b_ptr = vx;
433
  const void * a_ptr = vy;
434
  float * res_ptr = s;
 
543
  UNUSED(blocklen);
544
 
545
  #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
546
+ const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl);
547
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
548
+ float * res_ptr = s;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
549
 
550
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
551
+ const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
552
 
553
+ float32x4_t sumf = vdupq_n_f32(0);
554
+ for (int l = 0; l < nb; l++) {
555
+ uint8x16_t b_0 = vld1q_u8(b_ptr[l].qs + 0);
556
+ uint8x16_t b_1 = vld1q_u8(b_ptr[l].qs + 16);
557
+ uint8x16_t b_2 = vld1q_u8(b_ptr[l].qs + 32);
558
+ uint8x16_t b_3 = vld1q_u8(b_ptr[l].qs + 48);
559
+
560
+ int8x16_t b_0_hi = vqtbl1q_s8(kvalues, b_0 >> 4);
561
+ int8x16_t b_0_lo = vqtbl1q_s8(kvalues, b_0 & 0x0F);
562
+ int8x16_t b_1_hi = vqtbl1q_s8(kvalues, b_1 >> 4);
563
+ int8x16_t b_1_lo = vqtbl1q_s8(kvalues, b_1 & 0x0F);
564
+ int8x16_t b_2_hi = vqtbl1q_s8(kvalues, b_2 >> 4);
565
+ int8x16_t b_2_lo = vqtbl1q_s8(kvalues, b_2 & 0x0F);
566
+ int8x16_t b_3_hi = vqtbl1q_s8(kvalues, b_3 >> 4);
567
+ int8x16_t b_3_lo = vqtbl1q_s8(kvalues, b_3 & 0x0F);
568
+
569
+ int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 0);
570
+ int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16);
571
+
572
+ int32x4_t sumi = vdupq_n_s32(0);
573
+ sumi = vdotq_laneq_s32(sumi, b_0_lo, a_0, 0);
574
+ sumi = vdotq_laneq_s32(sumi, b_0_hi, a_1, 0);
575
+ sumi = vdotq_laneq_s32(sumi, b_1_lo, a_0, 1);
576
+ sumi = vdotq_laneq_s32(sumi, b_1_hi, a_1, 1);
577
+ sumi = vdotq_laneq_s32(sumi, b_2_lo, a_0, 2);
578
+ sumi = vdotq_laneq_s32(sumi, b_2_hi, a_1, 2);
579
+ sumi = vdotq_laneq_s32(sumi, b_3_lo, a_0, 3);
580
+ sumi = vdotq_laneq_s32(sumi, b_3_hi, a_1, 3);
581
+
582
+ float32x4_t a_d = vcvt_f32_f16(vld1_dup_f16((const float16_t *)&a_ptr[l].d));
583
+ float32x4_t b_d = vcvt_f32_f16(vld1_f16((const float16_t *)b_ptr[l].d));
584
+ float32x4_t d = a_d * b_d;
585
+
586
+ sumf = vmlaq_f32(sumf, d, vcvtq_f32_s32(sumi));
587
  }
588
+
589
+ vst1q_f32(res_ptr + x * 4, sumf);
590
  }
591
+ return;
592
  #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
593
  {
594
  float sumf[4];
 
637
  UNUSED(ncols_interleaved);
638
  UNUSED(blocklen);
639
 
640
+ #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
641
+ const void * b_ptr = vx;
642
+ const void * a_ptr = vy;
643
+ float * res_ptr = s;
644
+ size_t res_stride = bs * sizeof(float);
645
+
646
+ __asm__ __volatile__(
647
+ "mov x10, %x[nr]\n"
648
+ "mov x9, #0x88\n"
649
+ "cmp x10, #0x10\n"
650
+ "mul x9, %x[nb], x9\n"
651
+ "blt 4f\n"
652
+ "1:" // Row loop
653
+ "add x28, %x[b_ptr], #0x8\n"
654
+ "mov x27, %x[nc]\n"
655
+ "add x26, %x[res_ptr], %x[res_stride], LSL #4\n"
656
+ "2:" // Column loop
657
+ "add x25, %x[a_ptr], #0x8\n"
658
+ "movi v15.16b, #0x0\n"
659
+ "movi v19.16b, #0x0\n"
660
+ "mov x24, %x[nb]\n"
661
+ "add x23, x25, x9\n"
662
+ "movi v18.16b, #0x0\n"
663
+ "movi v14.16b, #0x0\n"
664
+ "add x22, x23, x9\n"
665
+ "movi v11.16b, #0x0\n"
666
+ "movi v13.16b, #0x0\n"
667
+ "add x21, x22, x9\n"
668
+ "movi v23.16b, #0x0\n"
669
+ "movi v16.16b, #0x0\n"
670
+ "movi v25.16b, #0x0\n"
671
+ "movi v7.16b, #0x0\n"
672
+ "movi v0.16b, #0x0\n"
673
+ "movi v4.16b, #0x0\n"
674
+ "movi v5.16b, #0x0\n"
675
+ "movi v21.16b, #0x0\n"
676
+ "movi v8.16b, #0x0\n"
677
+ "movi v1.16b, #0x0\n"
678
+ "3:" // Block loop
679
+ "ldr q3, [x28, #0x0]\n"
680
+ "ldr q31, [x25, #0x0]\n"
681
+ "movi v28.16b, #0x4\n"
682
+ "movi v10.4s, #0x0\n"
683
+ "ldr q22, [x28, #0x10]\n"
684
+ "ldr q6, [x25, #0x10]\n"
685
+ "movi v29.4s, #0x0\n"
686
+ "movi v9.4s, #0x0\n"
687
+ "ldr q27, [x28, #0x20]\n"
688
+ "ldr q30, [x28, #0x30]\n"
689
+ "movi v20.4s, #0x0\n"
690
+ "movi v24.16b, #0xf0\n"
691
+ "ldr d2, [x25, #-0x8]\n"
692
+ "ldr d26, [x23, #-0x8]\n"
693
+ "sshl v12.16b, v3.16b, v28.16b\n"
694
+ "sub x20, x28, #0x8\n"
695
+ "ldr d17, [x20, #0x0]\n"
696
+ "and v3.16b, v3.16b, v24.16b\n"
697
+ "subs x24, x24, #0x1\n"
698
+ "add x28, x28, #0x48\n"
699
+ ".inst 0x4f9fe18a // sdot v10.4s, v12.16b, v31.4b[0]\n"
700
+ ".inst 0x4fbfe19d // sdot v29.4s, v12.16b, v31.4b[1]\n"
701
+ ".inst 0x4f9fe989 // sdot v9.4s, v12.16b, v31.4b[2]\n"
702
+ ".inst 0x4fbfe994 // sdot v20.4s, v12.16b, v31.4b[3]\n"
703
+ "sshl v31.16b, v22.16b, v28.16b\n"
704
+ "and v22.16b, v22.16b, v24.16b\n"
705
+ "fcvtl v17.4s, v17.4h\n"
706
+ "fcvtl v2.4s, v2.4h\n"
707
+ "fcvtl v26.4s, v26.4h\n"
708
+ ".inst 0x4f86e3ea // sdot v10.4s, v31.16b, v6.4b[0]\n"
709
+ ".inst 0x4fa6e3fd // sdot v29.4s, v31.16b, v6.4b[1]\n"
710
+ ".inst 0x4f86ebe9 // sdot v9.4s, v31.16b, v6.4b[2]\n"
711
+ ".inst 0x4fa6ebf4 // sdot v20.4s, v31.16b, v6.4b[3]\n"
712
+ "sshl v6.16b, v27.16b, v28.16b\n"
713
+ "sshl v28.16b, v30.16b, v28.16b\n"
714
+ "and v27.16b, v27.16b, v24.16b\n"
715
+ "and v30.16b, v30.16b, v24.16b\n"
716
+ "ldr q24, [x25, #0x20]\n"
717
+ ".inst 0x4f98e0ca // sdot v10.4s, v6.16b, v24.4b[0]\n"
718
+ ".inst 0x4fb8e0dd // sdot v29.4s, v6.16b, v24.4b[1]\n"
719
+ ".inst 0x4f98e8c9 // sdot v9.4s, v6.16b, v24.4b[2]\n"
720
+ ".inst 0x4fb8e8d4 // sdot v20.4s, v6.16b, v24.4b[3]\n"
721
+ "ldr q24, [x25, #0x30]\n"
722
+ ".inst 0x4f98e38a // sdot v10.4s, v28.16b, v24.4b[0]\n"
723
+ ".inst 0x4fb8e39d // sdot v29.4s, v28.16b, v24.4b[1]\n"
724
+ ".inst 0x4f98eb89 // sdot v9.4s, v28.16b, v24.4b[2]\n"
725
+ ".inst 0x4fb8eb94 // sdot v20.4s, v28.16b, v24.4b[3]\n"
726
+ "ldr q24, [x25, #0x40]\n"
727
+ ".inst 0x4f98e06a // sdot v10.4s, v3.16b, v24.4b[0]\n"
728
+ ".inst 0x4fb8e07d // sdot v29.4s, v3.16b, v24.4b[1]\n"
729
+ ".inst 0x4f98e869 // sdot v9.4s, v3.16b, v24.4b[2]\n"
730
+ ".inst 0x4fb8e874 // sdot v20.4s, v3.16b, v24.4b[3]\n"
731
+ "ldr q24, [x25, #0x50]\n"
732
+ ".inst 0x4f98e2ca // sdot v10.4s, v22.16b, v24.4b[0]\n"
733
+ ".inst 0x4fb8e2dd // sdot v29.4s, v22.16b, v24.4b[1]\n"
734
+ ".inst 0x4f98eac9 // sdot v9.4s, v22.16b, v24.4b[2]\n"
735
+ ".inst 0x4fb8ead4 // sdot v20.4s, v22.16b, v24.4b[3]\n"
736
+ "ldr q24, [x25, #0x60]\n"
737
+ ".inst 0x4f98e36a // sdot v10.4s, v27.16b, v24.4b[0]\n"
738
+ ".inst 0x4fb8e37d // sdot v29.4s, v27.16b, v24.4b[1]\n"
739
+ ".inst 0x4f98eb69 // sdot v9.4s, v27.16b, v24.4b[2]\n"
740
+ ".inst 0x4fb8eb74 // sdot v20.4s, v27.16b, v24.4b[3]\n"
741
+ "ldr q24, [x25, #0x70]\n"
742
+ "add x25, x25, #0x88\n"
743
+ ".inst 0x4f98e3ca // sdot v10.4s, v30.16b, v24.4b[0]\n"
744
+ ".inst 0x4fb8e3dd // sdot v29.4s, v30.16b, v24.4b[1]\n"
745
+ ".inst 0x4f98ebc9 // sdot v9.4s, v30.16b, v24.4b[2]\n"
746
+ ".inst 0x4fb8ebd4 // sdot v20.4s, v30.16b, v24.4b[3]\n"
747
+ "fmul v24.4s, v17.4s, v2.s[0]\n"
748
+ "scvtf v10.4s, v10.4s, #0x4\n"
749
+ "scvtf v29.4s, v29.4s, #0x4\n"
750
+ "scvtf v9.4s, v9.4s, #0x4\n"
751
+ "scvtf v20.4s, v20.4s, #0x4\n"
752
+ "fmla v15.4s, v10.4s, v24.4s\n"
753
+ "ldr q24, [x23, #0x0]\n"
754
+ "fmul v10.4s, v17.4s, v2.s[1]\n"
755
+ "fmla v19.4s, v29.4s, v10.4s\n"
756
+ "ldr q10, [x23, #0x10]\n"
757
+ "fmul v29.4s, v17.4s, v2.s[2]\n"
758
+ "fmul v2.4s, v17.4s, v2.s[3]\n"
759
+ "fmla v18.4s, v9.4s, v29.4s\n"
760
+ "movi v9.4s, #0x0\n"
761
+ "movi v29.4s, #0x0\n"
762
+ ".inst 0x4f98e189 // sdot v9.4s, v12.16b, v24.4b[0]\n"
763
+ ".inst 0x4fb8e19d // sdot v29.4s, v12.16b, v24.4b[1]\n"
764
+ "fmla v14.4s, v20.4s, v2.4s\n"
765
+ "movi v20.4s, #0x0\n"
766
+ "movi v2.4s, #0x0\n"
767
+ ".inst 0x4f98e994 // sdot v20.4s, v12.16b, v24.4b[2]\n"
768
+ ".inst 0x4fb8e982 // sdot v2.4s, v12.16b, v24.4b[3]\n"
769
+ "ldr q24, [x23, #0x20]\n"
770
+ ".inst 0x4f8ae3e9 // sdot v9.4s, v31.16b, v10.4b[0]\n"
771
+ ".inst 0x4faae3fd // sdot v29.4s, v31.16b, v10.4b[1]\n"
772
+ ".inst 0x4f8aebf4 // sdot v20.4s, v31.16b, v10.4b[2]\n"
773
+ ".inst 0x4faaebe2 // sdot v2.4s, v31.16b, v10.4b[3]\n"
774
+ "ldr q10, [x23, #0x30]\n"
775
+ ".inst 0x4f98e0c9 // sdot v9.4s, v6.16b, v24.4b[0]\n"
776
+ ".inst 0x4fb8e0dd // sdot v29.4s, v6.16b, v24.4b[1]\n"
777
+ ".inst 0x4f98e8d4 // sdot v20.4s, v6.16b, v24.4b[2]\n"
778
+ ".inst 0x4fb8e8c2 // sdot v2.4s, v6.16b, v24.4b[3]\n"
779
+ "ldr q24, [x23, #0x40]\n"
780
+ ".inst 0x4f8ae389 // sdot v9.4s, v28.16b, v10.4b[0]\n"
781
+ ".inst 0x4faae39d // sdot v29.4s, v28.16b, v10.4b[1]\n"
782
+ ".inst 0x4f8aeb94 // sdot v20.4s, v28.16b, v10.4b[2]\n"
783
+ ".inst 0x4faaeb82 // sdot v2.4s, v28.16b, v10.4b[3]\n"
784
+ "ldr q10, [x23, #0x50]\n"
785
+ ".inst 0x4f98e069 // sdot v9.4s, v3.16b, v24.4b[0]\n"
786
+ ".inst 0x4fb8e07d // sdot v29.4s, v3.16b, v24.4b[1]\n"
787
+ ".inst 0x4f98e874 // sdot v20.4s, v3.16b, v24.4b[2]\n"
788
+ ".inst 0x4fb8e862 // sdot v2.4s, v3.16b, v24.4b[3]\n"
789
+ "ldr q24, [x23, #0x60]\n"
790
+ ".inst 0x4f8ae2c9 // sdot v9.4s, v22.16b, v10.4b[0]\n"
791
+ ".inst 0x4faae2dd // sdot v29.4s, v22.16b, v10.4b[1]\n"
792
+ ".inst 0x4f8aead4 // sdot v20.4s, v22.16b, v10.4b[2]\n"
793
+ ".inst 0x4faaeac2 // sdot v2.4s, v22.16b, v10.4b[3]\n"
794
+ "ldr q10, [x23, #0x70]\n"
795
+ "add x23, x23, #0x88\n"
796
+ ".inst 0x4f98e369 // sdot v9.4s, v27.16b, v24.4b[0]\n"
797
+ ".inst 0x4fb8e37d // sdot v29.4s, v27.16b, v24.4b[1]\n"
798
+ ".inst 0x4f98eb74 // sdot v20.4s, v27.16b, v24.4b[2]\n"
799
+ ".inst 0x4fb8eb62 // sdot v2.4s, v27.16b, v24.4b[3]\n"
800
+ "ldr q24, [x22, #0x0]\n"
801
+ ".inst 0x4f8ae3c9 // sdot v9.4s, v30.16b, v10.4b[0]\n"
802
+ ".inst 0x4faae3dd // sdot v29.4s, v30.16b, v10.4b[1]\n"
803
+ ".inst 0x4f8aebd4 // sdot v20.4s, v30.16b, v10.4b[2]\n"
804
+ ".inst 0x4faaebc2 // sdot v2.4s, v30.16b, v10.4b[3]\n"
805
+ "fmul v10.4s, v17.4s, v26.s[0]\n"
806
+ "scvtf v9.4s, v9.4s, #0x4\n"
807
+ "scvtf v29.4s, v29.4s, #0x4\n"
808
+ "scvtf v20.4s, v20.4s, #0x4\n"
809
+ "scvtf v2.4s, v2.4s, #0x4\n"
810
+ "fmla v11.4s, v9.4s, v10.4s\n"
811
+ "ldr q9, [x22, #0x10]\n"
812
+ "fmul v10.4s, v17.4s, v26.s[1]\n"
813
+ "fmla v13.4s, v29.4s, v10.4s\n"
814
+ "ldr d29, [x22, #-0x8]\n"
815
+ "fmul v10.4s, v17.4s, v26.s[2]\n"
816
+ "fmul v26.4s, v17.4s, v26.s[3]\n"
817
+ "fcvtl v29.4s, v29.4h\n"
818
+ "fmla v23.4s, v20.4s, v10.4s\n"
819
+ "movi v20.4s, #0x0\n"
820
+ "movi v10.4s, #0x0\n"
821
+ "fmla v16.4s, v2.4s, v26.4s\n"
822
+ "movi v26.4s, #0x0\n"
823
+ "movi v2.4s, #0x0\n"
824
+ ".inst 0x4f98e194 // sdot v20.4s, v12.16b, v24.4b[0]\n"
825
+ ".inst 0x4fb8e18a // sdot v10.4s, v12.16b, v24.4b[1]\n"
826
+ ".inst 0x4f98e99a // sdot v26.4s, v12.16b, v24.4b[2]\n"
827
+ ".inst 0x4fb8e982 // sdot v2.4s, v12.16b, v24.4b[3]\n"
828
+ "ldr q24, [x22, #0x20]\n"
829
+ ".inst 0x4f89e3f4 // sdot v20.4s, v31.16b, v9.4b[0]\n"
830
+ ".inst 0x4fa9e3ea // sdot v10.4s, v31.16b, v9.4b[1]\n"
831
+ ".inst 0x4f89ebfa // sdot v26.4s, v31.16b, v9.4b[2]\n"
832
+ ".inst 0x4fa9ebe2 // sdot v2.4s, v31.16b, v9.4b[3]\n"
833
+ "ldr q9, [x22, #0x30]\n"
834
+ ".inst 0x4f98e0d4 // sdot v20.4s, v6.16b, v24.4b[0]\n"
835
+ ".inst 0x4fb8e0ca // sdot v10.4s, v6.16b, v24.4b[1]\n"
836
+ ".inst 0x4f98e8da // sdot v26.4s, v6.16b, v24.4b[2]\n"
837
+ ".inst 0x4fb8e8c2 // sdot v2.4s, v6.16b, v24.4b[3]\n"
838
+ "ldr q24, [x22, #0x40]\n"
839
+ ".inst 0x4f89e394 // sdot v20.4s, v28.16b, v9.4b[0]\n"
840
+ ".inst 0x4fa9e38a // sdot v10.4s, v28.16b, v9.4b[1]\n"
841
+ ".inst 0x4f89eb9a // sdot v26.4s, v28.16b, v9.4b[2]\n"
842
+ ".inst 0x4fa9eb82 // sdot v2.4s, v28.16b, v9.4b[3]\n"
843
+ "ldr q9, [x22, #0x50]\n"
844
+ ".inst 0x4f98e074 // sdot v20.4s, v3.16b, v24.4b[0]\n"
845
+ ".inst 0x4fb8e06a // sdot v10.4s, v3.16b, v24.4b[1]\n"
846
+ ".inst 0x4f98e87a // sdot v26.4s, v3.16b, v24.4b[2]\n"
847
+ ".inst 0x4fb8e862 // sdot v2.4s, v3.16b, v24.4b[3]\n"
848
+ "ldr q24, [x22, #0x60]\n"
849
+ ".inst 0x4f89e2d4 // sdot v20.4s, v22.16b, v9.4b[0]\n"
850
+ ".inst 0x4fa9e2ca // sdot v10.4s, v22.16b, v9.4b[1]\n"
851
+ ".inst 0x4f89eada // sdot v26.4s, v22.16b, v9.4b[2]\n"
852
+ ".inst 0x4fa9eac2 // sdot v2.4s, v22.16b, v9.4b[3]\n"
853
+ "ldr q9, [x22, #0x70]\n"
854
+ "add x22, x22, #0x88\n"
855
+ ".inst 0x4f98e374 // sdot v20.4s, v27.16b, v24.4b[0]\n"
856
+ ".inst 0x4fb8e36a // sdot v10.4s, v27.16b, v24.4b[1]\n"
857
+ ".inst 0x4f98eb7a // sdot v26.4s, v27.16b, v24.4b[2]\n"
858
+ ".inst 0x4fb8eb62 // sdot v2.4s, v27.16b, v24.4b[3]\n"
859
+ "ldr q24, [x21, #0x0]\n"
860
+ ".inst 0x4f89e3d4 // sdot v20.4s, v30.16b, v9.4b[0]\n"
861
+ ".inst 0x4fa9e3ca // sdot v10.4s, v30.16b, v9.4b[1]\n"
862
+ ".inst 0x4f89ebda // sdot v26.4s, v30.16b, v9.4b[2]\n"
863
+ ".inst 0x4fa9ebc2 // sdot v2.4s, v30.16b, v9.4b[3]\n"
864
+ "fmul v9.4s, v17.4s, v29.s[0]\n"
865
+ "scvtf v20.4s, v20.4s, #0x4\n"
866
+ "scvtf v10.4s, v10.4s, #0x4\n"
867
+ "scvtf v26.4s, v26.4s, #0x4\n"
868
+ "scvtf v2.4s, v2.4s, #0x4\n"
869
+ "fmla v25.4s, v20.4s, v9.4s\n"
870
+ "ldr q9, [x21, #0x10]\n"
871
+ "fmul v20.4s, v17.4s, v29.s[1]\n"
872
+ "fmla v7.4s, v10.4s, v20.4s\n"
873
+ "ldr d20, [x21, #-0x8]\n"
874
+ "fmul v10.4s, v17.4s, v29.s[2]\n"
875
+ "fmul v29.4s, v17.4s, v29.s[3]\n"
876
+ "fcvtl v20.4s, v20.4h\n"
877
+ "fmla v0.4s, v26.4s, v10.4s\n"
878
+ "movi v26.4s, #0x0\n"
879
+ "movi v10.4s, #0x0\n"
880
+ "fmla v4.4s, v2.4s, v29.4s\n"
881
+ "movi v2.4s, #0x0\n"
882
+ "movi v29.4s, #0x0\n"
883
+ ".inst 0x4f98e19a // sdot v26.4s, v12.16b, v24.4b[0]\n"
884
+ ".inst 0x4fb8e18a // sdot v10.4s, v12.16b, v24.4b[1]\n"
885
+ ".inst 0x4f98e982 // sdot v2.4s, v12.16b, v24.4b[2]\n"
886
+ ".inst 0x4fb8e99d // sdot v29.4s, v12.16b, v24.4b[3]\n"
887
+ "ldr q12, [x21, #0x20]\n"
888
+ "fmul v24.4s, v17.4s, v20.s[0]\n"
889
+ ".inst 0x4f89e3fa // sdot v26.4s, v31.16b, v9.4b[0]\n"
890
+ ".inst 0x4fa9e3ea // sdot v10.4s, v31.16b, v9.4b[1]\n"
891
+ ".inst 0x4f89ebe2 // sdot v2.4s, v31.16b, v9.4b[2]\n"
892
+ ".inst 0x4fa9ebfd // sdot v29.4s, v31.16b, v9.4b[3]\n"
893
+ "ldr q9, [x21, #0x30]\n"
894
+ "fmul v31.4s, v17.4s, v20.s[1]\n"
895
+ ".inst 0x4f8ce0da // sdot v26.4s, v6.16b, v12.4b[0]\n"
896
+ ".inst 0x4face0ca // sdot v10.4s, v6.16b, v12.4b[1]\n"
897
+ ".inst 0x4f8ce8c2 // sdot v2.4s, v6.16b, v12.4b[2]\n"
898
+ ".inst 0x4face8dd // sdot v29.4s, v6.16b, v12.4b[3]\n"
899
+ "ldr q12, [x21, #0x40]\n"
900
+ "fmul v6.4s, v17.4s, v20.s[2]\n"
901
+ "fmul v20.4s, v17.4s, v20.s[3]\n"
902
+ ".inst 0x4f89e39a // sdot v26.4s, v28.16b, v9.4b[0]\n"
903
+ ".inst 0x4fa9e38a // sdot v10.4s, v28.16b, v9.4b[1]\n"
904
+ ".inst 0x4f89eb82 // sdot v2.4s, v28.16b, v9.4b[2]\n"
905
+ ".inst 0x4fa9eb9d // sdot v29.4s, v28.16b, v9.4b[3]\n"
906
+ "ldr q9, [x21, #0x50]\n"
907
+ ".inst 0x4f8ce07a // sdot v26.4s, v3.16b, v12.4b[0]\n"
908
+ ".inst 0x4face06a // sdot v10.4s, v3.16b, v12.4b[1]\n"
909
+ ".inst 0x4f8ce862 // sdot v2.4s, v3.16b, v12.4b[2]\n"
910
+ ".inst 0x4face87d // sdot v29.4s, v3.16b, v12.4b[3]\n"
911
+ "ldr q12, [x21, #0x60]\n"
912
+ ".inst 0x4f89e2da // sdot v26.4s, v22.16b, v9.4b[0]\n"
913
+ ".inst 0x4fa9e2ca // sdot v10.4s, v22.16b, v9.4b[1]\n"
914
+ ".inst 0x4f89eac2 // sdot v2.4s, v22.16b, v9.4b[2]\n"
915
+ ".inst 0x4fa9eadd // sdot v29.4s, v22.16b, v9.4b[3]\n"
916
+ "ldr q17, [x21, #0x70]\n"
917
+ "add x21, x21, #0x88\n"
918
+ ".inst 0x4f8ce37a // sdot v26.4s, v27.16b, v12.4b[0]\n"
919
+ ".inst 0x4face36a // sdot v10.4s, v27.16b, v12.4b[1]\n"
920
+ ".inst 0x4f8ceb62 // sdot v2.4s, v27.16b, v12.4b[2]\n"
921
+ ".inst 0x4faceb7d // sdot v29.4s, v27.16b, v12.4b[3]\n"
922
+ ".inst 0x4f91e3da // sdot v26.4s, v30.16b, v17.4b[0]\n"
923
+ ".inst 0x4fb1e3ca // sdot v10.4s, v30.16b, v17.4b[1]\n"
924
+ ".inst 0x4f91ebc2 // sdot v2.4s, v30.16b, v17.4b[2]\n"
925
+ ".inst 0x4fb1ebdd // sdot v29.4s, v30.16b, v17.4b[3]\n"
926
+ "scvtf v26.4s, v26.4s, #0x4\n"
927
+ "scvtf v10.4s, v10.4s, #0x4\n"
928
+ "fmla v5.4s, v26.4s, v24.4s\n"
929
+ "scvtf v2.4s, v2.4s, #0x4\n"
930
+ "scvtf v29.4s, v29.4s, #0x4\n"
931
+ "fmla v21.4s, v10.4s, v31.4s\n"
932
+ "fmla v8.4s, v2.4s, v6.4s\n"
933
+ "fmla v1.4s, v29.4s, v20.4s\n"
934
+ "bgt 3b\n"
935
+ "mov x20, %x[res_ptr]\n"
936
+ "subs x27, x27, #0x4\n"
937
+ "add %x[res_ptr], %x[res_ptr], #0x10\n"
938
+ "str q15, [x20, #0x0]\n"
939
+ "add x20, x20, %x[res_stride]\n"
940
+ "str q19, [x20, #0x0]\n"
941
+ "add x20, x20, %x[res_stride]\n"
942
+ "str q18, [x20, #0x0]\n"
943
+ "add x20, x20, %x[res_stride]\n"
944
+ "str q14, [x20, #0x0]\n"
945
+ "add x20, x20, %x[res_stride]\n"
946
+ "str q11, [x20, #0x0]\n"
947
+ "add x20, x20, %x[res_stride]\n"
948
+ "str q13, [x20, #0x0]\n"
949
+ "add x20, x20, %x[res_stride]\n"
950
+ "str q23, [x20, #0x0]\n"
951
+ "add x20, x20, %x[res_stride]\n"
952
+ "str q16, [x20, #0x0]\n"
953
+ "add x20, x20, %x[res_stride]\n"
954
+ "str q25, [x20, #0x0]\n"
955
+ "add x20, x20, %x[res_stride]\n"
956
+ "str q7, [x20, #0x0]\n"
957
+ "add x20, x20, %x[res_stride]\n"
958
+ "str q0, [x20, #0x0]\n"
959
+ "add x20, x20, %x[res_stride]\n"
960
+ "str q4, [x20, #0x0]\n"
961
+ "add x20, x20, %x[res_stride]\n"
962
+ "str q5, [x20, #0x0]\n"
963
+ "add x20, x20, %x[res_stride]\n"
964
+ "str q21, [x20, #0x0]\n"
965
+ "add x20, x20, %x[res_stride]\n"
966
+ "str q8, [x20, #0x0]\n"
967
+ "add x20, x20, %x[res_stride]\n"
968
+ "str q1, [x20, #0x0]\n"
969
+ "bne 2b\n"
970
+ "mov x20, #0x4\n"
971
+ "sub x10, x10, #0x10\n"
972
+ "cmp x10, #0x10\n"
973
+ "mov %x[res_ptr], x26\n"
974
+ "madd %x[a_ptr], x20, x9, %x[a_ptr]\n"
975
+ "bge 1b\n"
976
+ "4:" // Row loop skip
977
+ "cbz x10, 9f\n"
978
+ "5:" // Row tail: Row loop
979
+ "add x24, %x[b_ptr], #0x8\n"
980
+ "mov x23, %x[nc]\n"
981
+ "add x22, %x[res_ptr], %x[res_stride], LSL #2\n"
982
+ "6:" // Row tail: Column loop
983
+ "movi v15.16b, #0x0\n"
984
+ "movi v19.16b, #0x0\n"
985
+ "add x25, %x[a_ptr], #0x8\n"
986
+ "mov x21, %x[nb]\n"
987
+ "movi v18.16b, #0x0\n"
988
+ "movi v14.16b, #0x0\n"
989
+ "7:" // Row tail: Block loop
990
+ "ldr q7, [x24, #0x0]\n"
991
+ "ldr q5, [x25, #0x0]\n"
992
+ "movi v9.16b, #0x4\n"
993
+ "movi v4.4s, #0x0\n"
994
+ "ldr q3, [x24, #0x10]\n"
995
+ "ldr q2, [x25, #0x10]\n"
996
+ "movi v1.4s, #0x0\n"
997
+ "movi v0.4s, #0x0\n"
998
+ "ldr q13, [x24, #0x20]\n"
999
+ "ldr q31, [x25, #0x20]\n"
1000
+ "movi v30.4s, #0x0\n"
1001
+ "movi v29.16b, #0xf0\n"
1002
+ "ldr q28, [x24, #0x30]\n"
1003
+ "ldr q27, [x25, #0x30]\n"
1004
+ "sshl v20.16b, v7.16b, v9.16b\n"
1005
+ "sub x20, x24, #0x8\n"
1006
+ "ldr q26, [x25, #0x40]\n"
1007
+ "ldr q25, [x25, #0x50]\n"
1008
+ "sshl v17.16b, v3.16b, v9.16b\n"
1009
+ "and v7.16b, v7.16b, v29.16b\n"
1010
+ "ldr q24, [x25, #0x60]\n"
1011
+ "ldr q16, [x25, #0x70]\n"
1012
+ "sshl v22.16b, v13.16b, v9.16b\n"
1013
+ "and v3.16b, v3.16b, v29.16b\n"
1014
+ "ldr d21, [x20, #0x0]\n"
1015
+ "ldr d12, [x25, #-0x8]\n"
1016
+ ".inst 0x4f85e284 // sdot v4.4s, v20.16b, v5.4b[0]\n"
1017
+ ".inst 0x4fa5e281 // sdot v1.4s, v20.16b, v5.4b[1]\n"
1018
+ ".inst 0x4f85ea80 // sdot v0.4s, v20.16b, v5.4b[2]\n"
1019
+ ".inst 0x4fa5ea9e // sdot v30.4s, v20.16b, v5.4b[3]\n"
1020
+ "sshl v9.16b, v28.16b, v9.16b\n"
1021
+ "subs x21, x21, #0x1\n"
1022
+ "and v13.16b, v13.16b, v29.16b\n"
1023
+ "and v28.16b, v28.16b, v29.16b\n"
1024
+ "add x25, x25, #0x88\n"
1025
+ "add x24, x24, #0x48\n"
1026
+ "fcvtl v21.4s, v21.4h\n"
1027
+ "fcvtl v12.4s, v12.4h\n"
1028
+ ".inst 0x4f82e224 // sdot v4.4s, v17.16b, v2.4b[0]\n"
1029
+ ".inst 0x4fa2e221 // sdot v1.4s, v17.16b, v2.4b[1]\n"
1030
+ ".inst 0x4f82ea20 // sdot v0.4s, v17.16b, v2.4b[2]\n"
1031
+ ".inst 0x4fa2ea3e // sdot v30.4s, v17.16b, v2.4b[3]\n"
1032
+ "fmul v11.4s, v21.4s, v12.s[0]\n"
1033
+ "fmul v23.4s, v21.4s, v12.s[1]\n"
1034
+ "fmul v17.4s, v21.4s, v12.s[2]\n"
1035
+ ".inst 0x4f9fe2c4 // sdot v4.4s, v22.16b, v31.4b[0]\n"
1036
+ "fmul v6.4s, v21.4s, v12.s[3]\n"
1037
+ ".inst 0x4fbfe2c1 // sdot v1.4s, v22.16b, v31.4b[1]\n"
1038
+ ".inst 0x4f9feac0 // sdot v0.4s, v22.16b, v31.4b[2]\n"
1039
+ ".inst 0x4fbfeade // sdot v30.4s, v22.16b, v31.4b[3]\n"
1040
+ ".inst 0x4f9be124 // sdot v4.4s, v9.16b, v27.4b[0]\n"
1041
+ ".inst 0x4fbbe121 // sdot v1.4s, v9.16b, v27.4b[1]\n"
1042
+ ".inst 0x4f9be920 // sdot v0.4s, v9.16b, v27.4b[2]\n"
1043
+ ".inst 0x4fbbe93e // sdot v30.4s, v9.16b, v27.4b[3]\n"
1044
+ ".inst 0x4f9ae0e4 // sdot v4.4s, v7.16b, v26.4b[0]\n"
1045
+ ".inst 0x4fbae0e1 // sdot v1.4s, v7.16b, v26.4b[1]\n"
1046
+ ".inst 0x4f9ae8e0 // sdot v0.4s, v7.16b, v26.4b[2]\n"
1047
+ ".inst 0x4fbae8fe // sdot v30.4s, v7.16b, v26.4b[3]\n"
1048
+ ".inst 0x4f99e064 // sdot v4.4s, v3.16b, v25.4b[0]\n"
1049
+ ".inst 0x4fb9e061 // sdot v1.4s, v3.16b, v25.4b[1]\n"
1050
+ ".inst 0x4f99e860 // sdot v0.4s, v3.16b, v25.4b[2]\n"
1051
+ ".inst 0x4fb9e87e // sdot v30.4s, v3.16b, v25.4b[3]\n"
1052
+ ".inst 0x4f98e1a4 // sdot v4.4s, v13.16b, v24.4b[0]\n"
1053
+ ".inst 0x4fb8e1a1 // sdot v1.4s, v13.16b, v24.4b[1]\n"
1054
+ ".inst 0x4f98e9a0 // sdot v0.4s, v13.16b, v24.4b[2]\n"
1055
+ ".inst 0x4fb8e9be // sdot v30.4s, v13.16b, v24.4b[3]\n"
1056
+ ".inst 0x4f90e384 // sdot v4.4s, v28.16b, v16.4b[0]\n"
1057
+ ".inst 0x4fb0e381 // sdot v1.4s, v28.16b, v16.4b[1]\n"
1058
+ ".inst 0x4f90eb80 // sdot v0.4s, v28.16b, v16.4b[2]\n"
1059
+ ".inst 0x4fb0eb9e // sdot v30.4s, v28.16b, v16.4b[3]\n"
1060
+ "scvtf v4.4s, v4.4s, #0x4\n"
1061
+ "scvtf v1.4s, v1.4s, #0x4\n"
1062
+ "scvtf v0.4s, v0.4s, #0x4\n"
1063
+ "fmla v15.4s, v4.4s, v11.4s\n"
1064
+ "scvtf v30.4s, v30.4s, #0x4\n"
1065
+ "fmla v19.4s, v1.4s, v23.4s\n"
1066
+ "fmla v18.4s, v0.4s, v17.4s\n"
1067
+ "fmla v14.4s, v30.4s, v6.4s\n"
1068
+ "bgt 7b\n"
1069
+ "mov x20, %x[res_ptr]\n"
1070
+ "cmp x10, #0x1\n"
1071
+ "str q15, [x20, #0x0]\n"
1072
+ "add x20, x20, %x[res_stride]\n"
1073
+ "ble 8f\n"
1074
+ "cmp x10, #0x2\n"
1075
+ "str q19, [x20, #0x0]\n"
1076
+ "add x20, x20, %x[res_stride]\n"
1077
+ "ble 8f\n"
1078
+ "cmp x10, #0x3\n"
1079
+ "str q18, [x20, #0x0]\n"
1080
+ "add x20, x20, %x[res_stride]\n"
1081
+ "ble 8f\n"
1082
+ "str q14, [x20, #0x0]\n"
1083
+ "8:" // Row tail: Accumulator store skip
1084
+ "subs x23, x23, #0x4\n"
1085
+ "add %x[res_ptr], %x[res_ptr], #0x10\n"
1086
+ "bne 6b\n"
1087
+ "subs x10, x10, #0x4\n"
1088
+ "add %x[a_ptr], %x[a_ptr], x9\n"
1089
+ "mov %x[res_ptr], x22\n"
1090
+ "bgt 5b\n"
1091
+ "9:" // Row tail: Row loop skip
1092
+ : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr)
1093
+ : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc)
1094
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
1095
+ );
1096
+ return;
 
 
1097
  #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
1098
  {
1099
  float sumf[4][4];
 
1152
  UNUSED(blocklen);
1153
 
1154
  #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
1155
+ const void * b_ptr = vx;
1156
+ const void * a_ptr = vy;
1157
+ float * res_ptr = s;
1158
+ size_t res_stride = bs * sizeof(float);
1159
+
1160
+ __asm__ __volatile__(
1161
+ "mov x10, %x[nr]\n"
1162
+ "mov x9, #0x88\n"
1163
+ "cmp x10, #0x10\n"
1164
+ "mul x9, %x[nb], x9\n"
1165
+ "blt 4f\n"
1166
+ "1:" // Row loop
1167
+ "add x28, %x[b_ptr], #0x8\n"
1168
+ "mov x27, %x[nc]\n"
1169
+ "add x26, %x[res_ptr], %x[res_stride], LSL #4\n"
1170
+ "2:" // Column loop
1171
+ "add x25, %x[a_ptr], #0x8\n"
1172
+ "movi v2.16b, #0x0\n"
1173
+ "movi v10.16b, #0x0\n"
1174
+ "mov x24, %x[nb]\n"
1175
+ "add x23, x25, x9\n"
1176
+ "movi v12.16b, #0x0\n"
1177
+ "movi v28.16b, #0x0\n"
1178
+ "add x22, x23, x9\n"
1179
+ "movi v11.16b, #0x0\n"
1180
+ "movi v13.16b, #0x0\n"
1181
+ "add x21, x22, x9\n"
1182
+ "movi v22.16b, #0x0\n"
1183
+ "movi v23.16b, #0x0\n"
1184
+ "movi v25.16b, #0x0\n"
1185
+ "movi v5.16b, #0x0\n"
1186
+ "movi v7.16b, #0x0\n"
1187
+ "movi v4.16b, #0x0\n"
1188
+ "movi v6.16b, #0x0\n"
1189
+ "movi v30.16b, #0x0\n"
1190
+ "movi v24.16b, #0x0\n"
1191
+ "movi v14.16b, #0x0\n"
1192
+ "3:" // Block loop
1193
+ "ldr q21, [x28, #0x0]\n"
1194
+ "ldr q16, [x28, #0x10]\n"
1195
+ "movi v1.16b, #0x4\n"
1196
+ "movi v19.4s, #0x0\n"
1197
+ "ldr q27, [x25, #0x0]\n"
1198
+ "ldr q15, [x25, #0x10]\n"
1199
+ "movi v26.4s, #0x0\n"
1200
+ "movi v18.4s, #0x0\n"
1201
+ "ldr q29, [x28, #0x20]\n"
1202
+ "ldr q3, [x28, #0x30]\n"
1203
+ "movi v17.4s, #0x0\n"
1204
+ "movi v0.16b, #0xf0\n"
1205
+ "ldr d20, [x25, #-0x8]\n"
1206
+ "ldr d9, [x23, #-0x8]\n"
1207
+ "sshl v8.16b, v21.16b, v1.16b\n"
1208
+ "sshl v31.16b, v16.16b, v1.16b\n"
1209
+ "and v21.16b, v21.16b, v0.16b\n"
1210
+ "and v16.16b, v16.16b, v0.16b\n"
1211
+ "sub x20, x28, #0x8\n"
1212
+ "subs x24, x24, #0x1\n"
1213
+ "add x28, x28, #0x48\n"
1214
+ ".inst 0x4e88a773 // smmla v19.4s, v27.16b, v8.16b\n"
1215
+ ".inst 0x4e9fa77a // smmla v26.4s, v27.16b, v31.16b\n"
1216
+ "ldr q27, [x25, #0x20]\n"
1217
+ ".inst 0x4e88a5f2 // smmla v18.4s, v15.16b, v8.16b\n"
1218
+ ".inst 0x4e9fa5f1 // smmla v17.4s, v15.16b, v31.16b\n"
1219
+ "sshl v15.16b, v29.16b, v1.16b\n"
1220
+ "sshl v1.16b, v3.16b, v1.16b\n"
1221
+ "and v29.16b, v29.16b, v0.16b\n"
1222
+ "and v3.16b, v3.16b, v0.16b\n"
1223
+ "ldr q0, [x25, #0x30]\n"
1224
+ "fcvtl v20.4s, v20.4h\n"
1225
+ ".inst 0x4e8fa773 // smmla v19.4s, v27.16b, v15.16b\n"
1226
+ "fcvtl v9.4s, v9.4h\n"
1227
+ ".inst 0x4e81a77a // smmla v26.4s, v27.16b, v1.16b\n"
1228
+ "ldr q27, [x25, #0x40]\n"
1229
+ ".inst 0x4e8fa412 // smmla v18.4s, v0.16b, v15.16b\n"
1230
+ ".inst 0x4e81a411 // smmla v17.4s, v0.16b, v1.16b\n"
1231
+ "ldr q0, [x25, #0x50]\n"
1232
+ ".inst 0x4e95a773 // smmla v19.4s, v27.16b, v21.16b\n"
1233
+ ".inst 0x4e90a77a // smmla v26.4s, v27.16b, v16.16b\n"
1234
+ "ldr q27, [x25, #0x60]\n"
1235
+ ".inst 0x4e95a412 // smmla v18.4s, v0.16b, v21.16b\n"
1236
+ ".inst 0x4e90a411 // smmla v17.4s, v0.16b, v16.16b\n"
1237
+ "ldr q0, [x25, #0x70]\n"
1238
+ "add x25, x25, #0x88\n"
1239
+ ".inst 0x4e9da773 // smmla v19.4s, v27.16b, v29.16b\n"
1240
+ ".inst 0x4e83a77a // smmla v26.4s, v27.16b, v3.16b\n"
1241
+ "ldr d27, [x20, #0x0]\n"
1242
+ ".inst 0x4e9da412 // smmla v18.4s, v0.16b, v29.16b\n"
1243
+ ".inst 0x4e83a411 // smmla v17.4s, v0.16b, v3.16b\n"
1244
+ "fcvtl v27.4s, v27.4h\n"
1245
+ "uzp1 v0.2d, v19.2d, v26.2d\n"
1246
+ "uzp2 v26.2d, v19.2d, v26.2d\n"
1247
+ "fmul v19.4s, v27.4s, v20.s[0]\n"
1248
+ "scvtf v0.4s, v0.4s, #0x4\n"
1249
+ "scvtf v26.4s, v26.4s, #0x4\n"
1250
+ "fmla v2.4s, v0.4s, v19.4s\n"
1251
+ "ldr q19, [x23, #0x0]\n"
1252
+ "uzp1 v0.2d, v18.2d, v17.2d\n"
1253
+ "uzp2 v18.2d, v18.2d, v17.2d\n"
1254
+ "fmul v17.4s, v27.4s, v20.s[1]\n"
1255
+ "scvtf v0.4s, v0.4s, #0x4\n"
1256
+ "scvtf v18.4s, v18.4s, #0x4\n"
1257
+ "fmla v10.4s, v26.4s, v17.4s\n"
1258
+ "ldr q17, [x23, #0x10]\n"
1259
+ "fmul v26.4s, v27.4s, v20.s[2]\n"
1260
+ "fmul v20.4s, v27.4s, v20.s[3]\n"
1261
+ "fmla v12.4s, v0.4s, v26.4s\n"
1262
+ "ldr d0, [x22, #-0x8]\n"
1263
+ "ldr d26, [x21, #-0x8]\n"
1264
+ "fcvtl v0.4s, v0.4h\n"
1265
+ "fmla v28.4s, v18.4s, v20.4s\n"
1266
+ "movi v20.4s, #0x0\n"
1267
+ "movi v18.4s, #0x0\n"
1268
+ ".inst 0x4e88a674 // smmla v20.4s, v19.16b, v8.16b\n"
1269
+ ".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n"
1270
+ "ldr q19, [x23, #0x20]\n"
1271
+ "fcvtl v26.4s, v26.4h\n"
1272
+ ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n"
1273
+ ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n"
1274
+ "ldr q19, [x23, #0x40]\n"
1275
+ ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n"
1276
+ ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n"
1277
+ "ldr q19, [x23, #0x60]\n"
1278
+ ".inst 0x4e9da674 // smmla v20.4s, v19.16b, v29.16b\n"
1279
+ ".inst 0x4e83a672 // smmla v18.4s, v19.16b, v3.16b\n"
1280
+ "uzp1 v19.2d, v20.2d, v18.2d\n"
1281
+ "scvtf v19.4s, v19.4s, #0x4\n"
1282
+ "uzp2 v20.2d, v20.2d, v18.2d\n"
1283
+ "fmul v18.4s, v27.4s, v9.s[0]\n"
1284
+ "scvtf v20.4s, v20.4s, #0x4\n"
1285
+ "fmla v11.4s, v19.4s, v18.4s\n"
1286
+ "ldr q18, [x22, #0x0]\n"
1287
+ "fmul v19.4s, v27.4s, v9.s[1]\n"
1288
+ "fmla v13.4s, v20.4s, v19.4s\n"
1289
+ "movi v19.4s, #0x0\n"
1290
+ "movi v20.4s, #0x0\n"
1291
+ ".inst 0x4e88a633 // smmla v19.4s, v17.16b, v8.16b\n"
1292
+ ".inst 0x4e9fa634 // smmla v20.4s, v17.16b, v31.16b\n"
1293
+ "ldr q17, [x23, #0x30]\n"
1294
+ ".inst 0x4e8fa633 // smmla v19.4s, v17.16b, v15.16b\n"
1295
+ ".inst 0x4e81a634 // smmla v20.4s, v17.16b, v1.16b\n"
1296
+ "ldr q17, [x23, #0x50]\n"
1297
+ ".inst 0x4e95a633 // smmla v19.4s, v17.16b, v21.16b\n"
1298
+ ".inst 0x4e90a634 // smmla v20.4s, v17.16b, v16.16b\n"
1299
+ "ldr q17, [x23, #0x70]\n"
1300
+ "add x23, x23, #0x88\n"
1301
+ ".inst 0x4e9da633 // smmla v19.4s, v17.16b, v29.16b\n"
1302
+ ".inst 0x4e83a634 // smmla v20.4s, v17.16b, v3.16b\n"
1303
+ "uzp1 v17.2d, v19.2d, v20.2d\n"
1304
+ "scvtf v17.4s, v17.4s, #0x4\n"
1305
+ "uzp2 v20.2d, v19.2d, v20.2d\n"
1306
+ "fmul v19.4s, v27.4s, v9.s[2]\n"
1307
+ "fmul v9.4s, v27.4s, v9.s[3]\n"
1308
+ "scvtf v20.4s, v20.4s, #0x4\n"
1309
+ "fmla v22.4s, v17.4s, v19.4s\n"
1310
+ "ldr q17, [x22, #0x10]\n"
1311
+ "movi v19.4s, #0x0\n"
1312
+ ".inst 0x4e88a653 // smmla v19.4s, v18.16b, v8.16b\n"
1313
+ "fmla v23.4s, v20.4s, v9.4s\n"
1314
+ "movi v20.4s, #0x0\n"
1315
+ "movi v9.4s, #0x0\n"
1316
+ ".inst 0x4e9fa654 // smmla v20.4s, v18.16b, v31.16b\n"
1317
+ "ldr q18, [x22, #0x20]\n"
1318
+ ".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n"
1319
+ ".inst 0x4e8fa653 // smmla v19.4s, v18.16b, v15.16b\n"
1320
+ ".inst 0x4e81a654 // smmla v20.4s, v18.16b, v1.16b\n"
1321
+ "ldr q18, [x22, #0x40]\n"
1322
+ ".inst 0x4e95a653 // smmla v19.4s, v18.16b, v21.16b\n"
1323
+ ".inst 0x4e90a654 // smmla v20.4s, v18.16b, v16.16b\n"
1324
+ "ldr q18, [x22, #0x60]\n"
1325
+ ".inst 0x4e9da653 // smmla v19.4s, v18.16b, v29.16b\n"
1326
+ ".inst 0x4e83a654 // smmla v20.4s, v18.16b, v3.16b\n"
1327
+ "movi v18.4s, #0x0\n"
1328
+ ".inst 0x4e9fa632 // smmla v18.4s, v17.16b, v31.16b\n"
1329
+ "ldr q17, [x22, #0x30]\n"
1330
+ ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n"
1331
+ ".inst 0x4e81a632 // smmla v18.4s, v17.16b, v1.16b\n"
1332
+ "ldr q17, [x22, #0x50]\n"
1333
+ ".inst 0x4e95a629 // smmla v9.4s, v17.16b, v21.16b\n"
1334
+ ".inst 0x4e90a632 // smmla v18.4s, v17.16b, v16.16b\n"
1335
+ "ldr q17, [x22, #0x70]\n"
1336
+ "add x22, x22, #0x88\n"
1337
+ ".inst 0x4e9da629 // smmla v9.4s, v17.16b, v29.16b\n"
1338
+ ".inst 0x4e83a632 // smmla v18.4s, v17.16b, v3.16b\n"
1339
+ "uzp1 v17.2d, v19.2d, v20.2d\n"
1340
+ "uzp2 v20.2d, v19.2d, v20.2d\n"
1341
+ "fmul v19.4s, v27.4s, v0.s[0]\n"
1342
+ "scvtf v17.4s, v17.4s, #0x4\n"
1343
+ "scvtf v20.4s, v20.4s, #0x4\n"
1344
+ "fmla v25.4s, v17.4s, v19.4s\n"
1345
+ "ldr q19, [x21, #0x0]\n"
1346
+ "fmul v17.4s, v27.4s, v0.s[1]\n"
1347
+ "fmla v5.4s, v20.4s, v17.4s\n"
1348
+ "ldr q17, [x21, #0x10]\n"
1349
+ "uzp1 v20.2d, v9.2d, v18.2d\n"
1350
+ "uzp2 v9.2d, v9.2d, v18.2d\n"
1351
+ "fmul v18.4s, v27.4s, v0.s[2]\n"
1352
+ "fmul v0.4s, v27.4s, v0.s[3]\n"
1353
+ "scvtf v20.4s, v20.4s, #0x4\n"
1354
+ "scvtf v9.4s, v9.4s, #0x4\n"
1355
+ "fmla v7.4s, v20.4s, v18.4s\n"
1356
+ "movi v20.4s, #0x0\n"
1357
+ "movi v18.4s, #0x0\n"
1358
+ ".inst 0x4e88a674 // smmla v20.4s, v19.16b, v8.16b\n"
1359
+ ".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n"
1360
+ "ldr q19, [x21, #0x20]\n"
1361
+ "fmla v4.4s, v9.4s, v0.4s\n"
1362
+ "movi v9.4s, #0x0\n"
1363
+ "movi v0.4s, #0x0\n"
1364
+ ".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n"
1365
+ "fmul v8.4s, v27.4s, v26.s[0]\n"
1366
+ ".inst 0x4e9fa620 // smmla v0.4s, v17.16b, v31.16b\n"
1367
+ "ldr q17, [x21, #0x30]\n"
1368
+ ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n"
1369
+ "fmul v31.4s, v27.4s, v26.s[1]\n"
1370
+ ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n"
1371
+ "ldr q19, [x21, #0x40]\n"
1372
+ ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n"
1373
+ "fmul v15.4s, v27.4s, v26.s[2]\n"
1374
+ "fmul v27.4s, v27.4s, v26.s[3]\n"
1375
+ ".inst 0x4e81a620 // smmla v0.4s, v17.16b, v1.16b\n"
1376
+ "ldr q1, [x21, #0x50]\n"
1377
+ ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n"
1378
+ ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n"
1379
+ "ldr q26, [x21, #0x60]\n"
1380
+ ".inst 0x4e95a429 // smmla v9.4s, v1.16b, v21.16b\n"
1381
+ ".inst 0x4e90a420 // smmla v0.4s, v1.16b, v16.16b\n"
1382
+ "ldr q21, [x21, #0x70]\n"
1383
+ "add x21, x21, #0x88\n"
1384
+ ".inst 0x4e9da754 // smmla v20.4s, v26.16b, v29.16b\n"
1385
+ ".inst 0x4e83a752 // smmla v18.4s, v26.16b, v3.16b\n"
1386
+ ".inst 0x4e9da6a9 // smmla v9.4s, v21.16b, v29.16b\n"
1387
+ ".inst 0x4e83a6a0 // smmla v0.4s, v21.16b, v3.16b\n"
1388
+ "uzp1 v29.2d, v20.2d, v18.2d\n"
1389
+ "uzp2 v21.2d, v20.2d, v18.2d\n"
1390
+ "scvtf v29.4s, v29.4s, #0x4\n"
1391
+ "uzp1 v18.2d, v9.2d, v0.2d\n"
1392
+ "uzp2 v16.2d, v9.2d, v0.2d\n"
1393
+ "scvtf v21.4s, v21.4s, #0x4\n"
1394
+ "fmla v6.4s, v29.4s, v8.4s\n"
1395
+ "scvtf v18.4s, v18.4s, #0x4\n"
1396
+ "scvtf v16.4s, v16.4s, #0x4\n"
1397
+ "fmla v30.4s, v21.4s, v31.4s\n"
1398
+ "fmla v24.4s, v18.4s, v15.4s\n"
1399
+ "fmla v14.4s, v16.4s, v27.4s\n"
1400
+ "bgt 3b\n"
1401
+ "mov x20, %x[res_ptr]\n"
1402
+ "subs x27, x27, #0x4\n"
1403
+ "add %x[res_ptr], %x[res_ptr], #0x10\n"
1404
+ "str q2, [x20, #0x0]\n"
1405
+ "add x20, x20, %x[res_stride]\n"
1406
+ "str q10, [x20, #0x0]\n"
1407
+ "add x20, x20, %x[res_stride]\n"
1408
+ "str q12, [x20, #0x0]\n"
1409
+ "add x20, x20, %x[res_stride]\n"
1410
+ "str q28, [x20, #0x0]\n"
1411
+ "add x20, x20, %x[res_stride]\n"
1412
+ "str q11, [x20, #0x0]\n"
1413
+ "add x20, x20, %x[res_stride]\n"
1414
+ "str q13, [x20, #0x0]\n"
1415
+ "add x20, x20, %x[res_stride]\n"
1416
+ "str q22, [x20, #0x0]\n"
1417
+ "add x20, x20, %x[res_stride]\n"
1418
+ "str q23, [x20, #0x0]\n"
1419
+ "add x20, x20, %x[res_stride]\n"
1420
+ "str q25, [x20, #0x0]\n"
1421
+ "add x20, x20, %x[res_stride]\n"
1422
+ "str q5, [x20, #0x0]\n"
1423
+ "add x20, x20, %x[res_stride]\n"
1424
+ "str q7, [x20, #0x0]\n"
1425
+ "add x20, x20, %x[res_stride]\n"
1426
+ "str q4, [x20, #0x0]\n"
1427
+ "add x20, x20, %x[res_stride]\n"
1428
+ "str q6, [x20, #0x0]\n"
1429
+ "add x20, x20, %x[res_stride]\n"
1430
+ "str q30, [x20, #0x0]\n"
1431
+ "add x20, x20, %x[res_stride]\n"
1432
+ "str q24, [x20, #0x0]\n"
1433
+ "add x20, x20, %x[res_stride]\n"
1434
+ "str q14, [x20, #0x0]\n"
1435
+ "bne 2b\n"
1436
+ "mov x20, #0x4\n"
1437
+ "sub x10, x10, #0x10\n"
1438
+ "cmp x10, #0x10\n"
1439
+ "mov %x[res_ptr], x26\n"
1440
+ "madd %x[a_ptr], x20, x9, %x[a_ptr]\n"
1441
+ "bge 1b\n"
1442
+ "4:" // Row loop skip
1443
+ "cbz x10, 9f\n"
1444
+ "5:" // Row tail: Row loop
1445
+ "add x24, %x[b_ptr], #0x8\n"
1446
+ "mov x23, %x[nc]\n"
1447
+ "add x22, %x[res_ptr], %x[res_stride], LSL #2\n"
1448
+ "6:" // Row tail: Column loop
1449
+ "movi v2.16b, #0x0\n"
1450
+ "movi v10.16b, #0x0\n"
1451
+ "add x25, %x[a_ptr], #0x8\n"
1452
+ "mov x21, %x[nb]\n"
1453
+ "movi v12.16b, #0x0\n"
1454
+ "movi v28.16b, #0x0\n"
1455
+ "7:" // Row tail: Block loop
1456
+ "ldr q6, [x24, #0x0]\n"
1457
+ "ldr q5, [x24, #0x10]\n"
1458
+ "movi v17.16b, #0x4\n"
1459
+ "movi v8.4s, #0x0\n"
1460
+ "ldr q4, [x25, #0x0]\n"
1461
+ "ldr q13, [x25, #0x10]\n"
1462
+ "movi v27.4s, #0x0\n"
1463
+ "movi v0.4s, #0x0\n"
1464
+ "ldr q31, [x24, #0x20]\n"
1465
+ "ldr q14, [x24, #0x30]\n"
1466
+ "movi v29.4s, #0x0\n"
1467
+ "movi v22.16b, #0xf0\n"
1468
+ "ldr q11, [x25, #0x20]\n"
1469
+ "ldr q23, [x25, #0x30]\n"
1470
+ "sshl v21.16b, v6.16b, v17.16b\n"
1471
+ "sshl v16.16b, v5.16b, v17.16b\n"
1472
+ "ldr q20, [x25, #0x40]\n"
1473
+ "ldr q26, [x25, #0x50]\n"
1474
+ "and v6.16b, v6.16b, v22.16b\n"
1475
+ "and v5.16b, v5.16b, v22.16b\n"
1476
+ "ldr q25, [x25, #0x60]\n"
1477
+ "ldr q3, [x25, #0x70]\n"
1478
+ "sshl v19.16b, v31.16b, v17.16b\n"
1479
+ "sshl v18.16b, v14.16b, v17.16b\n"
1480
+ "ldr d17, [x25, #-0x8]\n"
1481
+ ".inst 0x4e95a488 // smmla v8.4s, v4.16b, v21.16b\n"
1482
+ ".inst 0x4e90a49b // smmla v27.4s, v4.16b, v16.16b\n"
1483
+ "and v31.16b, v31.16b, v22.16b\n"
1484
+ ".inst 0x4e95a5a0 // smmla v0.4s, v13.16b, v21.16b\n"
1485
+ ".inst 0x4e90a5bd // smmla v29.4s, v13.16b, v16.16b\n"
1486
+ "and v14.16b, v14.16b, v22.16b\n"
1487
+ "sub x20, x24, #0x8\n"
1488
+ "ldr d16, [x20, #0x0]\n"
1489
+ "subs x21, x21, #0x1\n"
1490
+ "add x25, x25, #0x88\n"
1491
+ "fcvtl v17.4s, v17.4h\n"
1492
+ "add x24, x24, #0x48\n"
1493
+ ".inst 0x4e93a568 // smmla v8.4s, v11.16b, v19.16b\n"
1494
+ ".inst 0x4e92a57b // smmla v27.4s, v11.16b, v18.16b\n"
1495
+ ".inst 0x4e93a6e0 // smmla v0.4s, v23.16b, v19.16b\n"
1496
+ ".inst 0x4e92a6fd // smmla v29.4s, v23.16b, v18.16b\n"
1497
+ "fcvtl v16.4s, v16.4h\n"
1498
+ ".inst 0x4e86a688 // smmla v8.4s, v20.16b, v6.16b\n"
1499
+ ".inst 0x4e85a69b // smmla v27.4s, v20.16b, v5.16b\n"
1500
+ "fmul v23.4s, v16.4s, v17.s[0]\n"
1501
+ "fmul v21.4s, v16.4s, v17.s[1]\n"
1502
+ "fmul v1.4s, v16.4s, v17.s[2]\n"
1503
+ "fmul v20.4s, v16.4s, v17.s[3]\n"
1504
+ ".inst 0x4e86a740 // smmla v0.4s, v26.16b, v6.16b\n"
1505
+ ".inst 0x4e85a75d // smmla v29.4s, v26.16b, v5.16b\n"
1506
+ ".inst 0x4e9fa728 // smmla v8.4s, v25.16b, v31.16b\n"
1507
+ ".inst 0x4e8ea73b // smmla v27.4s, v25.16b, v14.16b\n"
1508
+ ".inst 0x4e9fa460 // smmla v0.4s, v3.16b, v31.16b\n"
1509
+ ".inst 0x4e8ea47d // smmla v29.4s, v3.16b, v14.16b\n"
1510
+ "uzp1 v19.2d, v8.2d, v27.2d\n"
1511
+ "uzp2 v18.2d, v8.2d, v27.2d\n"
1512
+ "scvtf v19.4s, v19.4s, #0x4\n"
1513
+ "uzp1 v17.2d, v0.2d, v29.2d\n"
1514
+ "uzp2 v16.2d, v0.2d, v29.2d\n"
1515
+ "scvtf v18.4s, v18.4s, #0x4\n"
1516
+ "fmla v2.4s, v19.4s, v23.4s\n"
1517
+ "scvtf v17.4s, v17.4s, #0x4\n"
1518
+ "scvtf v16.4s, v16.4s, #0x4\n"
1519
+ "fmla v10.4s, v18.4s, v21.4s\n"
1520
+ "fmla v12.4s, v17.4s, v1.4s\n"
1521
+ "fmla v28.4s, v16.4s, v20.4s\n"
1522
+ "bgt 7b\n"
1523
+ "mov x20, %x[res_ptr]\n"
1524
+ "cmp x10, #0x1\n"
1525
+ "str q2, [x20, #0x0]\n"
1526
+ "add x20, x20, %x[res_stride]\n"
1527
+ "ble 8f\n"
1528
+ "cmp x10, #0x2\n"
1529
+ "str q10, [x20, #0x0]\n"
1530
+ "add x20, x20, %x[res_stride]\n"
1531
+ "ble 8f\n"
1532
+ "cmp x10, #0x3\n"
1533
+ "str q12, [x20, #0x0]\n"
1534
+ "add x20, x20, %x[res_stride]\n"
1535
+ "ble 8f\n"
1536
+ "str q28, [x20, #0x0]\n"
1537
+ "8:" // Row tail: Accumulator store skip
1538
+ "subs x23, x23, #0x4\n"
1539
+ "add %x[res_ptr], %x[res_ptr], #0x10\n"
1540
+ "bne 6b\n"
1541
+ "subs x10, x10, #0x4\n"
1542
+ "add %x[a_ptr], %x[a_ptr], x9\n"
1543
+ "mov %x[res_ptr], x22\n"
1544
+ "bgt 5b\n"
1545
+ "9:" // Row tail: Row loop skip
1546
+ : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr)
1547
+ : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc)
1548
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
1549
+ );
1550
+ return;
 
 
1551
  #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
1552
  float sumf[4][4];
1553
  int sumi;
 
1605
 
1606
  #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
1607
  #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
1608
+ if (ggml_cpu_get_sve_cnt() == QK8_0) {
1609
  const void * b_ptr = vx;
1610
  const void * a_ptr = vy;
1611
  float * res_ptr = s;
 
2073
  UNUSED(blocklen);
2074
 
2075
  #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
2076
+ const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl);
 
2077
 
2078
+ for (int y = 0; y < nr / 4; y++) {
2079
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
2080
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
2081
+ const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
2082
 
2083
+ float32x4_t sumf[4];
2084
+ for (int m = 0; m < 4; m++) {
2085
+ sumf[m] = vdupq_n_f32(0);
2086
+ }
2087
 
2088
+ for (int l = 0; l < nb; l++) {
2089
+ float32x4_t a_d = vcvt_f32_f16(vld1_f16((const float16_t *)a_ptr[l].d));
2090
+ float32x4_t b_d = vcvt_f32_f16(vld1_f16((const float16_t *)b_ptr[l].d));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2091
 
2092
+ int32x4_t sumi_0 = vdupq_n_s32(0);
2093
+ int32x4_t sumi_1 = vdupq_n_s32(0);
2094
+ int32x4_t sumi_2 = vdupq_n_s32(0);
2095
+ int32x4_t sumi_3 = vdupq_n_s32(0);
2096
+
2097
+ for (int k = 0; k < 4; k++) {
2098
+ int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 16 * k + 0);
2099
+ int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16 * k + 64);
2100
+
2101
+ uint8x16_t b = vld1q_u8(b_ptr[l].qs + 16 * k);
2102
+ int8x16_t b_hi = vqtbl1q_s8(kvalues, b >> 4);
2103
+ int8x16_t b_lo = vqtbl1q_s8(kvalues, b & 0xF);
2104
+
2105
+ sumi_0 = vdotq_laneq_s32(sumi_0, b_lo, a_0, 0);
2106
+ sumi_1 = vdotq_laneq_s32(sumi_1, b_lo, a_0, 1);
2107
+ sumi_2 = vdotq_laneq_s32(sumi_2, b_lo, a_0, 2);
2108
+ sumi_3 = vdotq_laneq_s32(sumi_3, b_lo, a_0, 3);
2109
+ sumi_0 = vdotq_laneq_s32(sumi_0, b_hi, a_1, 0);
2110
+ sumi_1 = vdotq_laneq_s32(sumi_1, b_hi, a_1, 1);
2111
+ sumi_2 = vdotq_laneq_s32(sumi_2, b_hi, a_1, 2);
2112
+ sumi_3 = vdotq_laneq_s32(sumi_3, b_hi, a_1, 3);
2113
  }
2114
 
2115
+ sumf[0] = vmlaq_f32(sumf[0], vmulq_laneq_f32(b_d, a_d, 0), vcvtq_f32_s32(sumi_0));
2116
+ sumf[1] = vmlaq_f32(sumf[1], vmulq_laneq_f32(b_d, a_d, 1), vcvtq_f32_s32(sumi_1));
2117
+ sumf[2] = vmlaq_f32(sumf[2], vmulq_laneq_f32(b_d, a_d, 2), vcvtq_f32_s32(sumi_2));
2118
+ sumf[3] = vmlaq_f32(sumf[3], vmulq_laneq_f32(b_d, a_d, 3), vcvtq_f32_s32(sumi_3));
2119
+ }
2120
+
2121
+ for (int m = 0; m < 4; m++) {
2122
+ vst1q_f32(s + (y * 4 + m) * bs + x * 4, sumf[m]);
2123
  }
2124
  }
 
2125
  }
2126
+ return;
2127
  #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
2128
  {
2129
  float sumf[4][4];
ggml/src/ggml-cpu/ggml-cpu.c CHANGED
@@ -74,13 +74,8 @@
74
 
75
  #if defined(__ARM_ARCH)
76
  struct ggml_arm_arch_features_type {
77
- int has_neon;
78
- int has_dotprod;
79
- int has_i8mm;
80
- int has_sve;
81
  int sve_cnt;
82
- int has_sme;
83
- } ggml_arm_arch_features = {-1, -1, -1, -1, 0, -1};
84
  #endif
85
 
86
 
@@ -678,87 +673,15 @@ bool ggml_is_numa(void) {
678
 
679
  #if defined(__linux__) && defined(__aarch64__)
680
  #include <sys/auxv.h>
681
- #elif defined(__APPLE__)
682
- #include <sys/sysctl.h>
683
- #endif
684
-
685
- #if !defined(HWCAP2_I8MM)
686
- #define HWCAP2_I8MM (1 << 13)
687
- #endif
688
-
689
- #if !defined(HWCAP2_SME)
690
- #define HWCAP2_SME (1 << 23)
691
  #endif
692
 
693
  static void ggml_init_arm_arch_features(void) {
694
- #if defined(__linux__) && defined(__aarch64__)
695
- uint32_t hwcap = getauxval(AT_HWCAP);
696
- uint32_t hwcap2 = getauxval(AT_HWCAP2);
697
-
698
- ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD);
699
- ggml_arm_arch_features.has_dotprod = !!(hwcap & HWCAP_ASIMDDP);
700
- ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
701
- ggml_arm_arch_features.has_sve = !!(hwcap & HWCAP_SVE);
702
- ggml_arm_arch_features.has_sme = !!(hwcap2 & HWCAP2_SME);
703
-
704
- #if defined(__ARM_FEATURE_SVE)
705
  ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
706
  #endif
707
- #elif defined(__APPLE__)
708
- int oldp = 0;
709
- size_t size = sizeof(oldp);
710
- if (sysctlbyname("hw.optional.AdvSIMD", &oldp, &size, NULL, 0) != 0) {
711
- oldp = 0;
712
- }
713
- ggml_arm_arch_features.has_neon = oldp;
714
-
715
- if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &oldp, &size, NULL, 0) != 0) {
716
- oldp = 0;
717
- }
718
- ggml_arm_arch_features.has_dotprod = oldp;
719
-
720
- if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) {
721
- oldp = 0;
722
- }
723
- ggml_arm_arch_features.has_i8mm = oldp;
724
-
725
- if (sysctlbyname("hw.optional.arm.FEAT_SME", &oldp, &size, NULL, 0) != 0) {
726
- oldp = 0;
727
- }
728
- ggml_arm_arch_features.has_sme = oldp;
729
-
730
- ggml_arm_arch_features.has_sve = 0;
731
- ggml_arm_arch_features.sve_cnt = 0;
732
- #else
733
- // Run-time CPU feature detection not implemented for this platform, fallback to compile time
734
- #if defined(__ARM_NEON)
735
- ggml_arm_arch_features.has_neon = 1;
736
- #else
737
- ggml_arm_arch_features.has_neon = 0;
738
- #endif
739
-
740
- #if defined(__ARM_FEATURE_MATMUL_INT8)
741
- ggml_arm_arch_features.has_i8mm = 1;
742
- #else
743
- ggml_arm_arch_features.has_i8mm = 0;
744
- #endif
745
-
746
- #if defined(__ARM_FEATURE_SVE)
747
- ggml_arm_arch_features.has_sve = 1;
748
- ggml_arm_arch_features.sve_cnt = 16;
749
- #else
750
- ggml_arm_arch_features.has_sve = 0;
751
- ggml_arm_arch_features.sve_cnt = 0;
752
- #endif
753
-
754
- #if defined(__ARM_FEATURE_SME) || defined(__ARM_FEATURE_SME2)
755
- ggml_arm_arch_features.has_sme = 1;
756
- #else
757
- ggml_arm_arch_features.has_sme = 0;
758
- #endif
759
- #endif
760
  }
761
- #endif
 
762
 
763
  struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
764
  GGML_ASSERT(!ggml_get_no_alloc(ctx));
@@ -3448,7 +3371,7 @@ int ggml_cpu_has_vxe(void) {
3448
 
3449
  int ggml_cpu_has_neon(void) {
3450
  #if defined(__ARM_ARCH) && defined(__ARM_NEON)
3451
- return ggml_arm_arch_features.has_neon;
3452
  #else
3453
  return 0;
3454
  #endif
@@ -3456,7 +3379,7 @@ int ggml_cpu_has_neon(void) {
3456
 
3457
  int ggml_cpu_has_dotprod(void) {
3458
  #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_DOTPROD)
3459
- return ggml_arm_arch_features.has_dotprod;
3460
  #else
3461
  return 0;
3462
  #endif
@@ -3464,7 +3387,7 @@ int ggml_cpu_has_dotprod(void) {
3464
 
3465
  int ggml_cpu_has_sve(void) {
3466
  #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE)
3467
- return ggml_arm_arch_features.has_sve;
3468
  #else
3469
  return 0;
3470
  #endif
@@ -3472,7 +3395,7 @@ int ggml_cpu_has_sve(void) {
3472
 
3473
  int ggml_cpu_has_matmul_int8(void) {
3474
  #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_MATMUL_INT8)
3475
- return ggml_arm_arch_features.has_i8mm;
3476
  #else
3477
  return 0;
3478
  #endif
@@ -3488,7 +3411,7 @@ int ggml_cpu_get_sve_cnt(void) {
3488
 
3489
  int ggml_cpu_has_sme(void) {
3490
  #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SME)
3491
- return ggml_arm_arch_features.has_sme;
3492
  #else
3493
  return 0;
3494
  #endif
 
74
 
75
#if defined(__ARM_ARCH)
// Run-time detected Arm CPU properties.
// NOTE(review): the per-feature flags (NEON/dotprod/i8mm/SVE/SME) were removed
// by this change; feature availability is now a compile-time property of each
// binary variant (GGML_CPU_ALL_VARIANTS). Only the SVE vector length remains,
// since it can only be determined at run time.
struct ggml_arm_arch_features_type {
    int sve_cnt;  // SVE vector length, set from prctl(PR_SVE_GET_VL); 0 until initialized
} ggml_arm_arch_features = { 0 };
#endif
80
 
81
 
 
673
 
674
  #if defined(__linux__) && defined(__aarch64__)
675
  #include <sys/auxv.h>
 
 
 
 
 
 
 
 
 
 
676
  #endif
677
 
678
// Initialize the run-time detected Arm CPU properties.
//
// After the removal of run-time feature probing, the only property that must
// be queried at run time is the SVE vector length: on Linux/aarch64 builds
// compiled with SVE support it is read via prctl(PR_SVE_GET_VL), with
// PR_SVE_VL_LEN_MASK extracting the vector length in bytes (see prctl(2)).
// On every other configuration this function is a no-op.
static void ggml_init_arm_arch_features(void) {
#if defined(__linux__) && defined(__aarch64__) && defined(__ARM_FEATURE_SVE)
    ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
#endif
}
683
+
684
+ #endif // __ARM_ARCH
685
 
686
  struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
687
  GGML_ASSERT(!ggml_get_no_alloc(ctx));
 
3371
 
3372
// Report whether this build supports Arm NEON (Advanced SIMD).
//
// Returns 1 when compiled for Arm with NEON enabled, 0 otherwise. This is a
// pure compile-time check: run-time probing was removed because the correct
// binary variant is already selected at load time (GGML_CPU_ALL_VARIANTS).
int ggml_cpu_has_neon(void) {
#if defined(__ARM_ARCH) && defined(__ARM_NEON)
    return 1;
#else
    return 0;
#endif
}
 
3379
 
3380
// Report whether this build supports the Arm dot-product extension
// (__ARM_FEATURE_DOTPROD, i.e. the sdot/udot instructions).
//
// Returns 1 when the feature was enabled at compile time, 0 otherwise;
// run-time detection is no longer performed here.
int ggml_cpu_has_dotprod(void) {
#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_DOTPROD)
    return 1;
#else
    return 0;
#endif
}
 
3387
 
3388
// Report whether this build supports the Arm Scalable Vector Extension (SVE).
//
// Returns 1 when compiled with __ARM_FEATURE_SVE, 0 otherwise. The vector
// length itself is still a run-time property (see ggml_cpu_get_sve_cnt).
int ggml_cpu_has_sve(void) {
#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE)
    return 1;
#else
    return 0;
#endif
}
 
3395
 
3396
// Report whether this build supports the Arm int8 matrix-multiply extension
// (__ARM_FEATURE_MATMUL_INT8 / FEAT_I8MM, e.g. the smmla instruction).
//
// Returns 1 when the feature was enabled at compile time, 0 otherwise.
int ggml_cpu_has_matmul_int8(void) {
#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_MATMUL_INT8)
    return 1;
#else
    return 0;
#endif
}
 
3411
 
3412
// Report whether this build supports the Arm Scalable Matrix Extension (SME).
//
// Returns 1 when compiled with __ARM_FEATURE_SME, 0 otherwise; like the other
// ggml_cpu_has_* helpers this is now a compile-time constant.
int ggml_cpu_has_sme(void) {
#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SME)
    return 1;
#else
    return 0;
#endif
}