/*****************************************************************************
 * Copyright (C) 2022-2023 MulticoreWare, Inc
 *
 * Authors: David Chen
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at license @ x265.com.
 *****************************************************************************/

#include "asm-sve.S"
#include "mc-a-common.S"

.arch armv8-a+sve2

#ifdef __APPLE__
.section __RODATA,__rodata
#else
.section .rodata
#endif

.align 4

.text

function PFX(pixel_avg_pp_12x16_sve2)
    sub             x1, x1, #4
    sub             x3, x3, #4
    sub             x5, x5, #4
    ptrue           p0.s, vl1
    ptrue           p1.b, vl8
    mov             x11, #4
.rept 16
    ld1w            {z0.s}, p0/z, [x2]
    ld1b            {z1.b}, p1/z, [x2, x11]
    ld1w            {z2.s}, p0/z, [x4]
    ld1b            {z3.b}, p1/z, [x4, x11]
    add             x2, x2, #4
    add             x2, x2, x3
    add             x4, x4, #4
    add             x4, x4, x5
    urhadd          z0.b, p1/m, z0.b, z2.b
    urhadd          z1.b, p1/m, z1.b, z3.b
    st1b            {z0.b}, p1, [x0]
    st1b            {z1.b}, p1, [x0, x11]
    add             x0, x0, #4
    add             x0, x0, x1
.endr
    ret
endfunc

function PFX(pixel_avg_pp_24x32_sve2)
    mov             w12, #4
    rdvl            x9, #1
    cmp             x9, #16
    bgt             .vl_gt_16_pixel_avg_pp_24x32
    sub             x1, x1, #16
    sub             x3, x3, #16
    sub             x5, x5, #16
.lpavg_24x32_sve2:
    sub             w12, w12, #1
.rept 8
    ld1             {v0.16b}, [x2], #16
    ld1             {v1.8b}, [x2], x3
    ld1             {v2.16b}, [x4], #16
    ld1             {v3.8b}, [x4], x5
    urhadd          v0.16b, v0.16b, v2.16b
    urhadd          v1.8b, v1.8b, v3.8b
    st1             {v0.16b}, [x0], #16
    st1             {v1.8b}, [x0], x1
.endr
    cbnz            w12, .lpavg_24x32_sve2
    ret
.vl_gt_16_pixel_avg_pp_24x32:
    mov             x10, #24
    mov             x11, #0
    whilelt         p0.b, x11, x10
.vl_gt_16_loop_pixel_avg_pp_24x32:
    sub             w12, w12, #1
.rept 8
    ld1b            {z0.b}, p0/z, [x2]
    ld1b            {z2.b}, p0/z, [x4]
    add             x2, x2, x3
    add             x4, x4, x5
    urhadd          z0.b, p0/m, z0.b, z2.b
    st1b            {z0.b}, p0, [x0]
    add             x0, x0, x1
.endr
    cbnz            w12, .vl_gt_16_loop_pixel_avg_pp_24x32
    ret
endfunc

.macro pixel_avg_pp_32xN_sve2 h
function PFX(pixel_avg_pp_32x\h\()_sve2)
    rdvl            x9, #1
    cmp             x9, #16
    bgt             .vl_gt_16_pixel_avg_pp_32_\h
.rept \h
    ld1             {v0.16b-v1.16b}, [x2], x3
    ld1             {v2.16b-v3.16b}, [x4], x5
    urhadd          v0.16b, v0.16b, v2.16b
    urhadd          v1.16b, v1.16b, v3.16b
    st1             {v0.16b-v1.16b}, [x0], x1
.endr
    ret
.vl_gt_16_pixel_avg_pp_32_\h:
    ptrue           p0.b, vl32
.rept \h
    ld1b            {z0.b}, p0/z, [x2]
    ld1b            {z2.b}, p0/z, [x4]
    add             x2, x2, x3
    add             x4, x4, x5
    urhadd          z0.b, p0/m, z0.b, z2.b
    st1b            {z0.b}, p0, [x0]
    add             x0, x0, x1
.endr
    ret
endfunc
.endm

pixel_avg_pp_32xN_sve2 8
pixel_avg_pp_32xN_sve2 16
pixel_avg_pp_32xN_sve2 24
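// All pixel_avg_pp kernels in this file compute the same rounded byte-wise
// average; URHADD performs the whole per-lane computation. A scalar C
// sketch of one WxH kernel (8-bit pixels, as on this build path; the loop
// names are illustrative only):
//
//   for (int y = 0; y < H; y++)
//       for (int x = 0; x < W; x++)
//           dst[y * dstride + x] =
//               (src0[y * sstride0 + x] + src1[y * sstride1 + x] + 1) >> 1;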
.macro pixel_avg_pp_32xN1_sve2 h
function PFX(pixel_avg_pp_32x\h\()_sve2)
    rdvl            x9, #1
    cmp             x9, #16
    bgt             .vl_gt_16_pixel_avg_pp_32xN1_\h
    mov             w12, #\h / 8
.lpavg_sve2_32x\h\():
    sub             w12, w12, #1
.rept 8
    ld1             {v0.16b-v1.16b}, [x2], x3
    ld1             {v2.16b-v3.16b}, [x4], x5
    urhadd          v0.16b, v0.16b, v2.16b
    urhadd          v1.16b, v1.16b, v3.16b
    st1             {v0.16b-v1.16b}, [x0], x1
.endr
    cbnz            w12, .lpavg_sve2_32x\h
    ret
.vl_gt_16_pixel_avg_pp_32xN1_\h:
    ptrue           p0.b, vl32
    mov             w12, #\h / 8
.eq_32_loop_pixel_avg_pp_32xN1_\h\():
    sub             w12, w12, #1
.rept 8
    ld1b            {z0.b}, p0/z, [x2]
    ld1b            {z2.b}, p0/z, [x4]
    add             x2, x2, x3
    add             x4, x4, x5
    urhadd          z0.b, p0/m, z0.b, z2.b
    st1b            {z0.b}, p0, [x0]
    add             x0, x0, x1
.endr
    cbnz            w12, .eq_32_loop_pixel_avg_pp_32xN1_\h
    ret
endfunc
.endm

pixel_avg_pp_32xN1_sve2 32
pixel_avg_pp_32xN1_sve2 64

function PFX(pixel_avg_pp_48x64_sve2)
    rdvl            x9, #1
    cmp             x9, #16
    bgt             .vl_gt_16_pixel_avg_pp_48x64
    mov             w12, #8
.lpavg_48x64_sve2:
    sub             w12, w12, #1
.rept 8
    ld1             {v0.16b-v2.16b}, [x2], x3
    ld1             {v3.16b-v5.16b}, [x4], x5
    urhadd          v0.16b, v0.16b, v3.16b
    urhadd          v1.16b, v1.16b, v4.16b
    urhadd          v2.16b, v2.16b, v5.16b
    st1             {v0.16b-v2.16b}, [x0], x1
.endr
    cbnz            w12, .lpavg_48x64_sve2
    ret
.vl_gt_16_pixel_avg_pp_48x64:
    cmp             x9, #32
    bgt             .vl_gt_32_pixel_avg_pp_48x64
    ptrue           p0.b, vl32
    ptrue           p1.b, vl16
    mov             w12, #8
.vl_eq_32_pixel_avg_pp_48x64:
    sub             w12, w12, #1
.rept 8
    ld1b            {z0.b}, p0/z, [x2]
    ld1b            {z1.b}, p1/z, [x2, #1, mul vl]
    ld1b            {z2.b}, p0/z, [x4]
    ld1b            {z3.b}, p1/z, [x4, #1, mul vl]
    add             x2, x2, x3
    add             x4, x4, x5
    urhadd          z0.b, p0/m, z0.b, z2.b
    urhadd          z1.b, p1/m, z1.b, z3.b
    st1b            {z0.b}, p0, [x0]
    st1b            {z1.b}, p1, [x0, #1, mul vl]
    add             x0, x0, x1
.endr
    cbnz            w12, .vl_eq_32_pixel_avg_pp_48x64
    ret
.vl_gt_32_pixel_avg_pp_48x64:
    mov             x10, #48
    mov             x11, #0
    whilelt         p0.b, x11, x10
    mov             w12, #8
.Loop_gt_32_pixel_avg_pp_48x64:
    sub             w12, w12, #1
.rept 8
    ld1b            {z0.b}, p0/z, [x2]
    ld1b            {z2.b}, p0/z, [x4]
    add             x2, x2, x3
    add             x4, x4, x5
    urhadd          z0.b, p0/m, z0.b, z2.b
    st1b            {z0.b}, p0, [x0]
    add             x0, x0, x1
.endr
    cbnz            w12, .Loop_gt_32_pixel_avg_pp_48x64
    ret
endfunc

.macro pixel_avg_pp_64xN_sve2 h
function PFX(pixel_avg_pp_64x\h\()_sve2)
    rdvl            x9, #1
    cmp             x9, #16
    bgt             .vl_gt_16_pixel_avg_pp_64x\h
    mov             w12, #\h / 4
.lpavg_sve2_64x\h\():
    sub             w12, w12, #1
.rept 4
    ld1             {v0.16b-v3.16b}, [x2], x3
    ld1             {v4.16b-v7.16b}, [x4], x5
    urhadd          v0.16b, v0.16b, v4.16b
    urhadd          v1.16b, v1.16b, v5.16b
    urhadd          v2.16b, v2.16b, v6.16b
    urhadd          v3.16b, v3.16b, v7.16b
    st1             {v0.16b-v3.16b}, [x0], x1
.endr
    cbnz            w12, .lpavg_sve2_64x\h
    ret
.vl_gt_16_pixel_avg_pp_64x\h\():
    cmp             x9, #48
    bgt             .vl_gt_48_pixel_avg_pp_64x\h
    ptrue           p0.b, vl32
    mov             w12, #\h / 4
.vl_eq_32_pixel_avg_pp_64x\h\():
    sub             w12, w12, #1
.rept 4
    ld1b            {z0.b}, p0/z, [x2]
    ld1b            {z1.b}, p0/z, [x2, #1, mul vl]
    ld1b            {z2.b}, p0/z, [x4]
    ld1b            {z3.b}, p0/z, [x4, #1, mul vl]
    add             x2, x2, x3
    add             x4, x4, x5
    urhadd          z0.b, p0/m, z0.b, z2.b
    urhadd          z1.b, p0/m, z1.b, z3.b
    st1b            {z0.b}, p0, [x0]
    st1b            {z1.b}, p0, [x0, #1, mul vl]
    add             x0, x0, x1
.endr
    cbnz            w12, .vl_eq_32_pixel_avg_pp_64x\h
    ret
.vl_gt_48_pixel_avg_pp_64x\h\():
    ptrue           p0.b, vl64
    mov             w12, #\h / 4
.vl_eq_64_pixel_avg_pp_64x\h\():
    sub             w12, w12, #1
.rept 4
    ld1b            {z0.b}, p0/z, [x2]
    ld1b            {z2.b}, p0/z, [x4]
    add             x2, x2, x3
    add             x4, x4, x5
    urhadd          z0.b, p0/m, z0.b, z2.b
    st1b            {z0.b}, p0, [x0]
    add             x0, x0, x1
.endr
    cbnz            w12, .vl_eq_64_pixel_avg_pp_64x\h
    ret
endfunc
.endm

pixel_avg_pp_64xN_sve2 16
pixel_avg_pp_64xN_sve2 32
pixel_avg_pp_64xN_sve2 48
pixel_avg_pp_64xN_sve2 64

// void addAvg(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
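// The addAvg kernels below average two rows of 16-bit intermediate values
// (interpolation-filter precision, biased around zero) and narrow the result
// back to 8-bit pixels. The SVE2 paths fold the round, shift, clip and
// re-bias into SQRSHRNB #7 followed by ADD #0x80. Per lane this amounts to
// (a sketch read off the instruction sequence, with sat_s8() clamping to
// [-128, 127]):
//
//   int16_t sum = src0[x] + src1[x];        // ADD z0.h, p0/m, z0.h, z1.h
//   int8_t  v   = sat_s8((sum + 64) >> 7);  // SQRSHRNB z0.b, z0.h, #7
//   dst[x] = (uint8_t)(v + 0x80);           // ADD z0.b, z0.b, #0x80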
.macro addAvg_2xN_sve2 h
function PFX(addAvg_2x\h\()_sve2)
    ptrue           p0.s, vl2
    ptrue           p1.h, vl4
    ptrue           p2.h, vl2
.rept \h / 2
    ld1rw           {z0.s}, p0/z, [x0]
    ld1rw           {z1.s}, p0/z, [x1]
    add             x0, x0, x3, lsl #1
    add             x1, x1, x4, lsl #1
    ld1rw           {z2.s}, p0/z, [x0]
    ld1rw           {z3.s}, p0/z, [x1]
    add             x0, x0, x3, lsl #1
    add             x1, x1, x4, lsl #1
    add             z0.h, p1/m, z0.h, z1.h
    add             z2.h, p1/m, z2.h, z3.h
    sqrshrnb        z0.b, z0.h, #7
    add             z0.b, z0.b, #0x80
    sqrshrnb        z2.b, z2.h, #7
    add             z2.b, z2.b, #0x80
    st1b            {z0.h}, p2, [x2]
    add             x2, x2, x5
    st1b            {z2.h}, p2, [x2]
    add             x2, x2, x5
.endr
    ret
endfunc
.endm

addAvg_2xN_sve2 4
addAvg_2xN_sve2 8
addAvg_2xN_sve2 16

.macro addAvg_6xN_sve2 h
function PFX(addAvg_6x\h\()_sve2)
    mov             w12, #\h / 2
    ptrue           p0.b, vl16
    ptrue           p2.h, vl6
.Loop_sve2_addavg_6x\h\():
    sub             w12, w12, #1
    ld1b            {z0.b}, p0/z, [x0]
    ld1b            {z1.b}, p0/z, [x1]
    add             x0, x0, x3, lsl #1
    add             x1, x1, x4, lsl #1
    ld1b            {z2.b}, p0/z, [x0]
    ld1b            {z3.b}, p0/z, [x1]
    add             x0, x0, x3, lsl #1
    add             x1, x1, x4, lsl #1
    add             z0.h, p0/m, z0.h, z1.h
    add             z2.h, p0/m, z2.h, z3.h
    sqrshrnb        z0.b, z0.h, #7
    sqrshrnb        z2.b, z2.h, #7
    add             z0.b, z0.b, #0x80
    add             z2.b, z2.b, #0x80
    st1b            {z0.h}, p2, [x2]
    add             x2, x2, x5
    st1b            {z2.h}, p2, [x2]
    add             x2, x2, x5
    cbnz            w12, .Loop_sve2_addavg_6x\h
    ret
endfunc
.endm

addAvg_6xN_sve2 8
addAvg_6xN_sve2 16

.macro addAvg_8xN_sve2 h
function PFX(addAvg_8x\h\()_sve2)
    ptrue           p0.b, vl16
.rept \h / 2
    ld1b            {z0.b}, p0/z, [x0]
    ld1b            {z1.b}, p0/z, [x1]
    add             x0, x0, x3, lsl #1
    add             x1, x1, x4, lsl #1
    ld1b            {z2.b}, p0/z, [x0]
    ld1b            {z3.b}, p0/z, [x1]
    add             x0, x0, x3, lsl #1
    add             x1, x1, x4, lsl #1
    add             z0.h, p0/m, z0.h, z1.h
    add             z2.h, p0/m, z2.h, z3.h
    sqrshrnb        z0.b, z0.h, #7
    add             z0.b, z0.b, #0x80
    sqrshrnb        z2.b, z2.h, #7
    add             z2.b, z2.b, #0x80
    st1b            {z0.h}, p0, [x2]
    add             x2, x2, x5
    st1b            {z2.h}, p0, [x2]
    add             x2, x2, x5
.endr
    ret
endfunc
.endm

.macro addAvg_8xN1_sve2 h
function PFX(addAvg_8x\h\()_sve2)
    mov             w12, #\h / 2
    ptrue           p0.b, vl16
.Loop_sve2_addavg_8x\h\():
    sub             w12, w12, #1
    ld1b            {z0.b}, p0/z, [x0]
    ld1b            {z1.b}, p0/z, [x1]
    add             x0, x0, x3, lsl #1
    add             x1, x1, x4, lsl #1
    ld1b            {z2.b}, p0/z, [x0]
    ld1b            {z3.b}, p0/z, [x1]
    add             x0, x0, x3, lsl #1
    add             x1, x1, x4, lsl #1
    add             z0.h, p0/m, z0.h, z1.h
    add             z2.h, p0/m, z2.h, z3.h
    sqrshrnb        z0.b, z0.h, #7
    add             z0.b, z0.b, #0x80
    sqrshrnb        z2.b, z2.h, #7
    add             z2.b, z2.b, #0x80
    st1b            {z0.h}, p0, [x2]
    add             x2, x2, x5
    st1b            {z2.h}, p0, [x2]
    add             x2, x2, x5
    cbnz            w12, .Loop_sve2_addavg_8x\h
    ret
endfunc
.endm

addAvg_8xN_sve2 2
addAvg_8xN_sve2 4
addAvg_8xN_sve2 6
addAvg_8xN_sve2 8
addAvg_8xN_sve2 12
addAvg_8xN_sve2 16
addAvg_8xN1_sve2 32
addAvg_8xN1_sve2 64

.macro addAvg_12xN_sve2 h
function PFX(addAvg_12x\h\()_sve2)
    mov             w12, #\h
    rdvl            x9, #1
    cmp             x9, #16
    bgt             .vl_gt_16_addAvg_12x\h
    ptrue           p0.b, vl16
    ptrue           p1.b, vl8
.Loop_sve2_addavg_12x\h\():
    sub             w12, w12, #1
    ld1b            {z0.b}, p0/z, [x0]
    ld1b            {z1.b}, p0/z, [x1]
    ld1b            {z2.b}, p1/z, [x0, #1, mul vl]
    ld1b            {z3.b}, p1/z, [x1, #1, mul vl]
    add             x0, x0, x3, lsl #1
    add             x1, x1, x4, lsl #1
    add             z0.h, p0/m, z0.h, z1.h
    add             z2.h, p1/m, z2.h, z3.h
    sqrshrnb        z0.b, z0.h, #7
    add             z0.b, z0.b, #0x80
    sqrshrnb        z2.b, z2.h, #7
    add             z2.b, z2.b, #0x80
    st1b            {z0.h}, p0, [x2]
    st1b            {z2.h}, p1, [x2, #1, mul vl]
    add             x2, x2, x5
    cbnz            w12, .Loop_sve2_addavg_12x\h
    ret
.vl_gt_16_addAvg_12x\h\():
    mov             x10, #24            // 12 int16_t = 24 bytes per row
    mov             x11, #0
    whilelt         p0.b, x11, x10
.Loop_sve2_gt_16_addavg_12x\h\():
    sub             w12, w12, #1
    ld1b            {z0.b}, p0/z, [x0]
    ld1b            {z1.b}, p0/z, [x1]
    add             x0, x0, x3, lsl #1
    add             x1, x1, x4, lsl #1
    add             z0.h, p0/m, z0.h, z1.h
    sqrshrnb        z0.b, z0.h, #7
    add             z0.b, z0.b, #0x80
    st1b            {z0.h}, p0, [x2]
    add             x2, x2, x5
    cbnz            w12, .Loop_sve2_gt_16_addavg_12x\h
    ret
endfunc
.endm

addAvg_12xN_sve2 16
addAvg_12xN_sve2 32
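// Rows that do not fill a whole vector build their governing predicate with
// WHILELT rather than a fixed PTRUE pattern, so vector loads and stores stop
// exactly at the end of a row (e.g. 24 bytes for the 12xN kernels above).
// Conceptually, for whilelt p0.b, x11, x10 with x11 = 0 (a sketch; svcntb()
// is the ACLE intrinsic returning the vector length in bytes):
//
//   for (int i = 0; i < svcntb(); i++)
//       p0[i] = (i < x10);   // byte lane active iff it lies inside the row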
.macro addAvg_16xN_sve2 h
function PFX(addAvg_16x\h\()_sve2)
    mov             w12, #\h
    rdvl            x9, #1
    cmp             x9, #16
    bgt             .vl_gt_16_addAvg_16x\h
    ptrue           p0.b, vl16
.Loop_eq_16_sve2_addavg_16x\h\():
    sub             w12, w12, #1
    ld1b            {z0.b}, p0/z, [x0]
    ld1b            {z1.b}, p0/z, [x1]
    ld1b            {z2.b}, p0/z, [x0, #1, mul vl]
    ld1b            {z3.b}, p0/z, [x1, #1, mul vl]
    add             x0, x0, x3, lsl #1
    add             x1, x1, x4, lsl #1
    add             z0.h, p0/m, z0.h, z1.h
    add             z2.h, p0/m, z2.h, z3.h
    sqrshrnb        z0.b, z0.h, #7
    add             z0.b, z0.b, #0x80
    sqrshrnb        z2.b, z2.h, #7
    add             z2.b, z2.b, #0x80
    st1b            {z0.h}, p0, [x2]
    st1b            {z2.h}, p0, [x2, #1, mul vl]
    add             x2, x2, x5
    cbnz            w12, .Loop_eq_16_sve2_addavg_16x\h
    ret
.vl_gt_16_addAvg_16x\h\():
    cmp             x9, #32
    bgt             .vl_gt_32_addAvg_16x\h
    ptrue           p0.b, vl32
.Loop_gt_16_sve2_addavg_16x\h\():
    sub             w12, w12, #1
    ld1b            {z0.b}, p0/z, [x0]
    ld1b            {z1.b}, p0/z, [x1]
    add             x0, x0, x3, lsl #1
    add             x1, x1, x4, lsl #1
    add             z0.h, p0/m, z0.h, z1.h
    sqrshrnb        z0.b, z0.h, #7
    add             z0.b, z0.b, #0x80
    st1b            {z0.h}, p0, [x2]
    add             x2, x2, x5
    cbnz            w12, .Loop_gt_16_sve2_addavg_16x\h
    ret
.vl_gt_32_addAvg_16x\h\():
    mov             x10, #32            // 16 int16_t = 32 bytes per row
    mov             x11, #0
    whilelt         p0.b, x11, x10
.Loop_gt_32_sve2_addavg_16x\h\():
    sub             w12, w12, #1
    ld1b            {z0.b}, p0/z, [x0]
    ld1b            {z1.b}, p0/z, [x1]
    add             x0, x0, x3, lsl #1
    add             x1, x1, x4, lsl #1
    add             z0.h, p0/m, z0.h, z1.h
    sqrshrnb        z0.b, z0.h, #7
    add             z0.b, z0.b, #0x80
    st1b            {z0.h}, p0, [x2]
    add             x2, x2, x5
    cbnz            w12, .Loop_gt_32_sve2_addavg_16x\h
    ret
endfunc
.endm

addAvg_16xN_sve2 4
addAvg_16xN_sve2 8
addAvg_16xN_sve2 12
addAvg_16xN_sve2 16
addAvg_16xN_sve2 24
addAvg_16xN_sve2 32
addAvg_16xN_sve2 64

.macro addAvg_24xN_sve2 h
function PFX(addAvg_24x\h\()_sve2)
    mov             w12, #\h
    rdvl            x9, #1
    cmp             x9, #16
    bgt             .vl_gt_16_addAvg_24x\h
    addAvg_start
.Loop_eq_16_sve2_addavg_24x\h\():
    sub             w12, w12, #1
    ld1             {v0.16b-v2.16b}, [x0], x3
    ld1             {v3.16b-v5.16b}, [x1], x4
    addavg_1        v0, v3
    addavg_1        v1, v4
    addavg_1        v2, v5
    sqxtun          v0.8b, v0.8h
    sqxtun          v1.8b, v1.8h
    sqxtun          v2.8b, v2.8h
    st1             {v0.8b-v2.8b}, [x2], x5
    cbnz            w12, .Loop_eq_16_sve2_addavg_24x\h
    ret
.vl_gt_16_addAvg_24x\h\():
    cmp             x9, #48
    bgt             .vl_gt_48_addAvg_24x\h
    ptrue           p0.b, vl32
    ptrue           p1.b, vl16
.Loop_gt_16_sve2_addavg_24x\h\():
    sub             w12, w12, #1
    ld1b            {z0.b}, p0/z, [x0]
    ld1b            {z1.b}, p1/z, [x0, #1, mul vl]
    ld1b            {z2.b}, p0/z, [x1]
    ld1b            {z3.b}, p1/z, [x1, #1, mul vl]
    add             x0, x0, x3, lsl #1
    add             x1, x1, x4, lsl #1
    add             z0.h, p0/m, z0.h, z2.h
    add             z1.h, p1/m, z1.h, z3.h
    sqrshrnb        z0.b, z0.h, #7
    add             z0.b, z0.b, #0x80
    sqrshrnb        z1.b, z1.h, #7
    add             z1.b, z1.b, #0x80
    st1b            {z0.h}, p0, [x2]
    st1b            {z1.h}, p1, [x2, #1, mul vl]
    add             x2, x2, x5
    cbnz            w12, .Loop_gt_16_sve2_addavg_24x\h
    ret
.vl_gt_48_addAvg_24x\h\():
    mov             x10, #48            // 24 int16_t = 48 bytes per row
    mov             x11, #0
    whilelt         p0.b, x11, x10
.Loop_gt_48_sve2_addavg_24x\h\():
    sub             w12, w12, #1
    ld1b            {z0.b}, p0/z, [x0]
    ld1b            {z2.b}, p0/z, [x1]
    add             x0, x0, x3, lsl #1
    add             x1, x1, x4, lsl #1
    add             z0.h, p0/m, z0.h, z2.h
    sqrshrnb        z0.b, z0.h, #7
    add             z0.b, z0.b, #0x80
    st1b            {z0.h}, p0, [x2]
    add             x2, x2, x5
    cbnz            w12, .Loop_gt_48_sve2_addavg_24x\h
    ret
endfunc
.endm

addAvg_24xN_sve2 32
addAvg_24xN_sve2 64
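// A note on the "#n, mul vl" addressing used throughout: the immediate is
// scaled by the size of one whole vector access, which depends on the
// element form. For LD1B {z.b} that is VL bytes; for ST1B {z.h} it is
// VL/2 bytes, because only one byte is stored per halfword element. With
// 256-bit vectors (VL = 32 bytes), for example:
//
//   // ld1b {z1.b}, p0/z, [x0, #1, mul vl]   loads  32 bytes at x0 + 32
//   // st1b {z1.h}, p0,   [x2, #1, mul vl]   stores 16 bytes at x2 + 16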
.macro addAvg_32xN_sve2 h
function PFX(addAvg_32x\h\()_sve2)
    mov             w12, #\h
    rdvl            x9, #1
    cmp             x9, #16
    bgt             .vl_gt_16_addAvg_32x\h
    ptrue           p0.b, vl16
.Loop_eq_16_sve2_addavg_32x\h\():
    sub             w12, w12, #1
    ld1b            {z0.b}, p0/z, [x0]
    ld1b            {z1.b}, p0/z, [x0, #1, mul vl]
    ld1b            {z2.b}, p0/z, [x0, #2, mul vl]
    ld1b            {z3.b}, p0/z, [x0, #3, mul vl]
    ld1b            {z4.b}, p0/z, [x1]
    ld1b            {z5.b}, p0/z, [x1, #1, mul vl]
    ld1b            {z6.b}, p0/z, [x1, #2, mul vl]
    ld1b            {z7.b}, p0/z, [x1, #3, mul vl]
    add             x0, x0, x3, lsl #1
    add             x1, x1, x4, lsl #1
    add             z0.h, p0/m, z0.h, z4.h
    add             z1.h, p0/m, z1.h, z5.h
    add             z2.h, p0/m, z2.h, z6.h
    add             z3.h, p0/m, z3.h, z7.h
    sqrshrnb        z0.b, z0.h, #7
    add             z0.b, z0.b, #0x80
    sqrshrnb        z1.b, z1.h, #7
    add             z1.b, z1.b, #0x80
    sqrshrnb        z2.b, z2.h, #7
    add             z2.b, z2.b, #0x80
    sqrshrnb        z3.b, z3.h, #7
    add             z3.b, z3.b, #0x80
    st1b            {z0.h}, p0, [x2]
    st1b            {z1.h}, p0, [x2, #1, mul vl]
    st1b            {z2.h}, p0, [x2, #2, mul vl]
    st1b            {z3.h}, p0, [x2, #3, mul vl]
    add             x2, x2, x5
    cbnz            w12, .Loop_eq_16_sve2_addavg_32x\h
    ret
.vl_gt_16_addAvg_32x\h\():
    cmp             x9, #48
    bgt             .vl_gt_48_addAvg_32x\h
    ptrue           p0.b, vl32
.Loop_gt_eq_32_sve2_addavg_32x\h\():
    sub             w12, w12, #1
    ld1b            {z0.b}, p0/z, [x0]
    ld1b            {z1.b}, p0/z, [x0, #1, mul vl]
    ld1b            {z2.b}, p0/z, [x1]
    ld1b            {z3.b}, p0/z, [x1, #1, mul vl]
    add             x0, x0, x3, lsl #1
    add             x1, x1, x4, lsl #1
    add             z0.h, p0/m, z0.h, z2.h
    add             z1.h, p0/m, z1.h, z3.h
    sqrshrnb        z0.b, z0.h, #7
    add             z0.b, z0.b, #0x80
    sqrshrnb        z1.b, z1.h, #7
    add             z1.b, z1.b, #0x80
    st1b            {z0.h}, p0, [x2]
    st1b            {z1.h}, p0, [x2, #1, mul vl]
    add             x2, x2, x5
    cbnz            w12, .Loop_gt_eq_32_sve2_addavg_32x\h
    ret
.vl_gt_48_addAvg_32x\h\():
    ptrue           p0.b, vl64
.Loop_eq_64_sve2_addavg_32x\h\():
    sub             w12, w12, #1
    ld1b            {z0.b}, p0/z, [x0]
    ld1b            {z1.b}, p0/z, [x1]
    add             x0, x0, x3, lsl #1
    add             x1, x1, x4, lsl #1
    add             z0.h, p0/m, z0.h, z1.h
    sqrshrnb        z0.b, z0.h, #7
    add             z0.b, z0.b, #0x80
    st1b            {z0.h}, p0, [x2]
    add             x2, x2, x5
    cbnz            w12, .Loop_eq_64_sve2_addavg_32x\h
    ret
endfunc
.endm

addAvg_32xN_sve2 8
addAvg_32xN_sve2 16
addAvg_32xN_sve2 24
addAvg_32xN_sve2 32
addAvg_32xN_sve2 48
addAvg_32xN_sve2 64

function PFX(addAvg_48x64_sve2)
    mov             w12, #64
    rdvl            x9, #1
    cmp             x9, #16
    bgt             .vl_gt_16_addAvg_48x64
    addAvg_start
    sub             x3, x3, #64
    sub             x4, x4, #64
.Loop_eq_16_sve2_addavg_48x64:
    sub             w12, w12, #1
    ld1             {v0.8h-v3.8h}, [x0], #64
    ld1             {v4.8h-v7.8h}, [x1], #64
    ld1             {v20.8h-v21.8h}, [x0], x3
    ld1             {v22.8h-v23.8h}, [x1], x4
    addavg_1        v0, v4
    addavg_1        v1, v5
    addavg_1        v2, v6
    addavg_1        v3, v7
    addavg_1        v20, v22
    addavg_1        v21, v23
    sqxtun          v0.8b, v0.8h
    sqxtun2         v0.16b, v1.8h
    sqxtun          v1.8b, v2.8h
    sqxtun2         v1.16b, v3.8h
    sqxtun          v2.8b, v20.8h
    sqxtun2         v2.16b, v21.8h
    st1             {v0.16b-v2.16b}, [x2], x5
    cbnz            w12, .Loop_eq_16_sve2_addavg_48x64
    ret
.vl_gt_16_addAvg_48x64:
    cmp             x9, #48
    bgt             .vl_gt_48_addAvg_48x64
    ptrue           p0.b, vl32
.Loop_gt_eq_32_sve2_addavg_48x64:
    sub             w12, w12, #1
    ld1b            {z0.b}, p0/z, [x0]
    ld1b            {z1.b}, p0/z, [x0, #1, mul vl]
    ld1b            {z2.b}, p0/z, [x0, #2, mul vl]
    ld1b            {z4.b}, p0/z, [x1]
    ld1b            {z5.b}, p0/z, [x1, #1, mul vl]
    ld1b            {z6.b}, p0/z, [x1, #2, mul vl]
    add             x0, x0, x3, lsl #1
    add             x1, x1, x4, lsl #1
    add             z0.h, p0/m, z0.h, z4.h
    add             z1.h, p0/m, z1.h, z5.h
    add             z2.h, p0/m, z2.h, z6.h
    sqrshrnb        z0.b, z0.h, #7
    add             z0.b, z0.b, #0x80
    sqrshrnb        z1.b, z1.h, #7
    add             z1.b, z1.b, #0x80
    sqrshrnb        z2.b, z2.h, #7
    add             z2.b, z2.b, #0x80
    st1b            {z0.h}, p0, [x2]
    st1b            {z1.h}, p0, [x2, #1, mul vl]
    st1b            {z2.h}, p0, [x2, #2, mul vl]
    add             x2, x2, x5
    cbnz            w12, .Loop_gt_eq_32_sve2_addavg_48x64
    ret
.vl_gt_48_addAvg_48x64:
    cmp             x9, #112
    bgt             .vl_gt_112_addAvg_48x64
    ptrue           p0.b, vl64
    ptrue           p1.b, vl32
.Loop_gt_48_sve2_addavg_48x64:
    sub             w12, w12, #1
    ld1b            {z0.b}, p0/z, [x0]
    ld1b            {z1.b}, p1/z, [x0, #1, mul vl]
    ld1b            {z4.b}, p0/z, [x1]
    ld1b            {z5.b}, p1/z, [x1, #1, mul vl]
    add             x0, x0, x3, lsl #1
    add             x1, x1, x4, lsl #1
    add             z0.h, p0/m, z0.h, z4.h
    add             z1.h, p1/m, z1.h, z5.h
    sqrshrnb        z0.b, z0.h, #7
    add             z0.b, z0.b, #0x80
    sqrshrnb        z1.b, z1.h, #7
    add             z1.b, z1.b, #0x80
    st1b            {z0.h}, p0, [x2]
    st1b            {z1.h}, p1, [x2, #1, mul vl]
    add             x2, x2, x5
    cbnz            w12, .Loop_gt_48_sve2_addavg_48x64
    ret
.vl_gt_112_addAvg_48x64:
    mov             x10, #96            // 48 int16_t = 96 bytes per row
    mov             x11, #0
    whilelt         p0.b, x11, x10
.Loop_gt_112_sve2_addavg_48x64:
    sub             w12, w12, #1
    ld1b            {z0.b}, p0/z, [x0]
    ld1b            {z4.b}, p0/z, [x1]
    add             x0, x0, x3, lsl #1
    add             x1, x1, x4, lsl #1
    add             z0.h, p0/m, z0.h, z4.h
    sqrshrnb        z0.b, z0.h, #7
    add             z0.b, z0.b, #0x80
    st1b            {z0.h}, p0, [x2]
    add             x2, x2, x5
    cbnz            w12, .Loop_gt_112_sve2_addavg_48x64
    ret
endfunc
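// Every function in this file opens with the same run-time dispatch: RDVL
// reads the SVE vector length in bytes, VL == 16 falls through to a plain
// 128-bit NEON-style loop, and longer vectors branch to predicated paths
// sized to the row. In ACLE terms the selection amounts to (a sketch;
// svcntb() is the intrinsic counterpart of RDVL):
//
//   if (svcntb() == 16) { /* plain 128-bit loop, full q-registers */ }
//   else                { /* SVE2 path predicated to the row width */ }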
.macro addAvg_64xN_sve2 h
function PFX(addAvg_64x\h\()_sve2)
    mov             w12, #\h
    rdvl            x9, #1
    cmp             x9, #16
    bgt             .vl_gt_16_addAvg_64x\h
    addAvg_start
    sub             x3, x3, #64
    sub             x4, x4, #64
.Loop_eq_16_sve2_addavg_64x\h\():
    sub             w12, w12, #1
    ld1             {v0.8h-v3.8h}, [x0], #64
    ld1             {v4.8h-v7.8h}, [x1], #64
    ld1             {v20.8h-v23.8h}, [x0], x3
    ld1             {v24.8h-v27.8h}, [x1], x4
    addavg_1        v0, v4
    addavg_1        v1, v5
    addavg_1        v2, v6
    addavg_1        v3, v7
    addavg_1        v20, v24
    addavg_1        v21, v25
    addavg_1        v22, v26
    addavg_1        v23, v27
    sqxtun          v0.8b, v0.8h
    sqxtun2         v0.16b, v1.8h
    sqxtun          v1.8b, v2.8h
    sqxtun2         v1.16b, v3.8h
    sqxtun          v2.8b, v20.8h
    sqxtun2         v2.16b, v21.8h
    sqxtun          v3.8b, v22.8h
    sqxtun2         v3.16b, v23.8h
    st1             {v0.16b-v3.16b}, [x2], x5
    cbnz            w12, .Loop_eq_16_sve2_addavg_64x\h
    ret
.vl_gt_16_addAvg_64x\h\():
    cmp             x9, #48
    bgt             .vl_gt_48_addAvg_64x\h
    ptrue           p0.b, vl32
.Loop_gt_eq_32_sve2_addavg_64x\h\():
    sub             w12, w12, #1
    ld1b            {z0.b}, p0/z, [x0]
    ld1b            {z1.b}, p0/z, [x0, #1, mul vl]
    ld1b            {z2.b}, p0/z, [x0, #2, mul vl]
    ld1b            {z3.b}, p0/z, [x0, #3, mul vl]
    ld1b            {z4.b}, p0/z, [x1]
    ld1b            {z5.b}, p0/z, [x1, #1, mul vl]
    ld1b            {z6.b}, p0/z, [x1, #2, mul vl]
    ld1b            {z7.b}, p0/z, [x1, #3, mul vl]
    add             x0, x0, x3, lsl #1
    add             x1, x1, x4, lsl #1
    add             z0.h, p0/m, z0.h, z4.h
    add             z1.h, p0/m, z1.h, z5.h
    add             z2.h, p0/m, z2.h, z6.h
    add             z3.h, p0/m, z3.h, z7.h
    sqrshrnb        z0.b, z0.h, #7
    add             z0.b, z0.b, #0x80
    sqrshrnb        z1.b, z1.h, #7
    add             z1.b, z1.b, #0x80
    sqrshrnb        z2.b, z2.h, #7
    add             z2.b, z2.b, #0x80
    sqrshrnb        z3.b, z3.h, #7
    add             z3.b, z3.b, #0x80
    st1b            {z0.h}, p0, [x2]
    st1b            {z1.h}, p0, [x2, #1, mul vl]
    st1b            {z2.h}, p0, [x2, #2, mul vl]
    st1b            {z3.h}, p0, [x2, #3, mul vl]
    add             x2, x2, x5
    cbnz            w12, .Loop_gt_eq_32_sve2_addavg_64x\h
    ret
.vl_gt_48_addAvg_64x\h\():
    cmp             x9, #112
    bgt             .vl_gt_112_addAvg_64x\h
    ptrue           p0.b, vl64
.Loop_gt_eq_48_sve2_addavg_64x\h\():
    sub             w12, w12, #1
    ld1b            {z0.b}, p0/z, [x0]
    ld1b            {z1.b}, p0/z, [x0, #1, mul vl]
    ld1b            {z4.b}, p0/z, [x1]
    ld1b            {z5.b}, p0/z, [x1, #1, mul vl]
    add             x0, x0, x3, lsl #1
    add             x1, x1, x4, lsl #1
    add             z0.h, p0/m, z0.h, z4.h
    add             z1.h, p0/m, z1.h, z5.h
    sqrshrnb        z0.b, z0.h, #7
    add             z0.b, z0.b, #0x80
    sqrshrnb        z1.b, z1.h, #7
    add             z1.b, z1.b, #0x80
    st1b            {z0.h}, p0, [x2]
    st1b            {z1.h}, p0, [x2, #1, mul vl]
    add             x2, x2, x5
    cbnz            w12, .Loop_gt_eq_48_sve2_addavg_64x\h
    ret
.vl_gt_112_addAvg_64x\h\():
    ptrue           p0.b, vl128
.Loop_gt_eq_128_sve2_addavg_64x\h\():
    sub             w12, w12, #1
    ld1b            {z0.b}, p0/z, [x0]
    ld1b            {z4.b}, p0/z, [x1]
    add             x0, x0, x3, lsl #1
    add             x1, x1, x4, lsl #1
    add             z0.h, p0/m, z0.h, z4.h
    sqrshrnb        z0.b, z0.h, #7
    add             z0.b, z0.b, #0x80
    st1b            {z0.h}, p0, [x2]
    add             x2, x2, x5
    cbnz            w12, .Loop_gt_eq_128_sve2_addavg_64x\h
    ret
endfunc
.endm

addAvg_64xN_sve2 16
addAvg_64xN_sve2 32
addAvg_64xN_sve2 48
addAvg_64xN_sve2 64