/*****************************************************************************
 * Copyright (C) 2024 MulticoreWare, Inc
 *
 * Authors: Hari Limaye
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at license @ x265.com.
 *****************************************************************************/

// Target: AArch64, GNU assembler syntax, preprocessed by the C preprocessor.
// The `function`, `endfunc` and `PFX` helpers used below are not defined in
// this file; presumably they come from asm.S (TODO confirm).
#include "asm.S"

// UDOT is used throughout, so the DotProd extension must be enabled.
.arch armv8.2-a+dotprod

// Select and align the read-only data section. No constants are emitted
// between here and the switch to .text in the visible source.
#ifdef __APPLE__
.section __RODATA,__rodata
#else
.section .rodata
#endif

.align 4

.text

// Macro SSE_PP_4xN h: emits pixel_sse_pp_4xH_neon_dotprod (H = argument h).
//
// The emitted routine sums the squared byte differences between two blocks
// that are 4 bytes wide and H rows tall.
//
// In:    x0 = address of the first block,  x1 = byte step between its rows
//        x2 = address of the second block, x3 = byte step between its rows
// Out:   w0 = 32-bit sum
// Clobb: x0, x2, v0, v1, v16, v17 (all caller-saved; no stack use)
//
// NOTE(review): the matching C prototype is not visible here; the register
// roles above are read off the loads and post-index updates below.
//
// Fully unrolled. Four rows are packed into one 128-bit vector per
// repetition, so h must be a multiple of 4.
.macro SSE_PP_4xN h
function PFX(pixel_sse_pp_4x\h\()_neon_dotprod)
    movi            v0.4s, #0                   // v0 = four 32-bit partial sums
.rept \h / 4
    // Row 0 goes into lane 0 with a scalar load, which also clears the other
    // lanes. The scalar ldr has no register post-index form, hence the
    // separate pointer updates.
    ldr             s16, [x0]
    ldr             s17, [x2]
    add             x0, x0, x1
    add             x2, x2, x3
    // Rows 1-3 fill lanes 1-3; each load advances its pointer by one row.
    ld1             {v16.s}[1], [x0], x1
    ld1             {v16.s}[2], [x0], x1
    ld1             {v16.s}[3], [x0], x1
    ld1             {v17.s}[1], [x2], x3
    ld1             {v17.s}[2], [x2], x3
    ld1             {v17.s}[3], [x2], x3

    uabd            v1.16b, v16.16b, v17.16b    // v1 = |a - b| per byte
    udot            v0.4s, v1.16b, v1.16b       // v0 += |a - b|^2, 4 bytes per lane
.endr
    addv            s0, v0.4s                   // fold the four lanes into one
    fmov            w0, s0                      // return value in w0
    ret
endfunc
.endm

SSE_PP_4xN 4
SSE_PP_4xN 8

// Macro SSE_PP_8xN h: emits pixel_sse_pp_8xH_neon_dotprod (H = argument h).
//
// The emitted routine sums the squared byte differences between two blocks
// that are 8 bytes wide and H rows tall.
//
// In:    x0 = address of the first block,  x1 = byte step between its rows
//        x2 = address of the second block, x3 = byte step between its rows
// Out:   w0 = 32-bit sum
// Clobb: x0, x2, v0, v1, v16, v17 (all caller-saved; no stack use)
//
// Fully unrolled, one row per repetition.
.macro SSE_PP_8xN h
function PFX(pixel_sse_pp_8x\h\()_neon_dotprod)
    movi            v0.4s, #0
.rept \h
    ld1             {v16.8b}, [x0], x1          // one 8-byte row, then next row
    ld1             {v17.8b}, [x2], x3

    uabd            v1.8b, v16.8b, v17.8b       // v1 = |a - b| per byte
    // The 64-bit form writes lanes 0-1 and clears the upper half of v0.
    udot            v0.2s, v1.8b, v1.8b
.endr
    // Lanes 2-3 are zero here, so the four-lane reduction yields the sum of
    // the two live lanes.
    addv            s0, v0.4s
    fmov            w0, s0
    ret
endfunc
.endm

SSE_PP_8xN 8
SSE_PP_8xN 16

// Fully unrolled.
// Macro SSE_PP_16xN h: emits pixel_sse_pp_16xH_neon_dotprod (H = argument h).
//
// The emitted routine sums the squared byte differences between two blocks
// that are 16 bytes wide and H rows tall.
//
// In:    x0 = address of the first block,  x1 = byte step between its rows
//        x2 = address of the second block, x3 = byte step between its rows
// Out:   w0 = 32-bit sum
// Clobb: x0, x2, v0-v3, v16-v19 (all caller-saved; no stack use)
//
// Two rows are handled per repetition, so h must be even. Each row of the
// pair feeds its own accumulator (v0 / v1), presumably to keep the two udot
// chains independent; the accumulators are merged once at the end.
.macro SSE_PP_16xN h
function PFX(pixel_sse_pp_16x\h\()_neon_dotprod)
    movi            v0.4s, #0                   // accumulator for even rows
    movi            v1.4s, #0                   // accumulator for odd rows
.rept \h / 2
    ld1             {v16.16b}, [x0], x1
    ld1             {v17.16b}, [x2], x3
    ld1             {v18.16b}, [x0], x1
    ld1             {v19.16b}, [x2], x3

    uabd            v2.16b, v16.16b, v17.16b    // |a - b| per byte, first row
    udot            v0.4s, v2.16b, v2.16b       // v0 += squares
    uabd            v3.16b, v18.16b, v19.16b    // |a - b| per byte, second row
    udot            v1.4s, v3.16b, v3.16b       // v1 += squares
.endr
    add             v0.4s, v0.4s, v1.4s         // merge the two accumulators
    addv            s0, v0.4s                   // fold the four lanes into one
    fmov            w0, s0                      // return value in w0
    ret
endfunc
.endm

SSE_PP_16xN 16
SSE_PP_16xN 32

// Shared body for the 32-byte-wide variants; reached by a branch from the
// SSE_PP_32xN entry points below.
//
// In:    x0 = address of the first block,  x1 = byte step between its rows
//        x2 = address of the second block, x3 = byte step between its rows
//        w4 = number of loop iterations (rows / 4); must be non-zero, since
//             it is decremented before being tested
// Out:   w0 = 32-bit sum of squared byte differences
// Clobb: x0, x2, w4, v0-v3, v16-v19 (all caller-saved; no stack use)
//
// NOTE(review): export=0 is interpreted by the `function` macro, which is
// not visible here; presumably it keeps the symbol file-local.
//
// Loop unrolled to process 4 rows per iteration.
function PFX(pixel_sse_pp_32xh_neon_dotprod), export=0
    movi            v0.4s, #0                   // accumulator, bytes 0-15 of each row
    movi            v1.4s, #0                   // accumulator, bytes 16-31 of each row
.Loop_sse_pp_32xh:
    sub             w4, w4, #1
.rept 4
    ld1             {v16.16b,v17.16b}, [x0], x1 // one 32-byte row, then next row
    ld1             {v18.16b,v19.16b}, [x2], x3

    uabd            v2.16b, v16.16b, v18.16b
    udot            v0.4s, v2.16b, v2.16b
    uabd            v3.16b, v17.16b, v19.16b
    udot            v1.4s, v3.16b, v3.16b
.endr
    cbnz            w4, .Loop_sse_pp_32xh

    add             v0.4s, v0.4s, v1.4s         // merge the two accumulators
    addv            s0, v0.4s
    fmov            w0, s0
    ret
endfunc

// Macro SSE_PP_32xN h: emits pixel_sse_pp_32xH_neon_dotprod (H = argument h).
//
// The entry point only sets the iteration count and branches to the shared
// 32-wide body above. x0-x3 are passed through untouched and the link
// register is not modified, so the `ret` in the shared body returns to the
// original caller. h must be a multiple of 4.
.macro SSE_PP_32xN h
function PFX(pixel_sse_pp_32x\h\()_neon_dotprod)
    mov             w4, \h / 4                  // 4 rows per loop iteration
    b               PFX(pixel_sse_pp_32xh_neon_dotprod)
endfunc
.endm

SSE_PP_32xN 32
SSE_PP_32xN 64

// pixel_sse_pp_64x64_neon_dotprod: sums the squared byte differences between
// two blocks that are 64 bytes wide and 64 rows tall.
//
// In:    x0 = address of the first block,  x1 = byte step between its rows
//        x2 = address of the second block, x3 = byte step between its rows
// Out:   w0 = 32-bit sum
// Clobb: x0, x2, w12, v0-v5, v16-v23 (all caller-saved; no stack use)
//
// Loop unrolled to process 4 rows per iteration.
function PFX(pixel_sse_pp_64x64_neon_dotprod)
    mov             w12, #16                    // 16 iterations x 4 rows = 64 rows
    movi            v0.4s, #0                   // accumulator, bytes 0-15 and 32-47
    movi            v1.4s, #0                   // accumulator, bytes 16-31 and 48-63
.Loop_sse_pp_64:
    sub             w12, w12, #1
.rept 4
    ld1             {v16.16b-v19.16b}, [x0], x1 // one 64-byte row, then next row
    ld1             {v20.16b-v23.16b}, [x2], x3

    uabd            v2.16b, v16.16b, v20.16b
    udot            v0.4s, v2.16b, v2.16b
    uabd            v3.16b, v17.16b, v21.16b
    udot            v1.4s, v3.16b, v3.16b
    uabd            v4.16b, v18.16b, v22.16b
    udot            v0.4s, v4.16b, v4.16b
    uabd            v5.16b, v19.16b, v23.16b
    udot            v1.4s, v5.16b, v5.16b
.endr
    cbnz            w12, .Loop_sse_pp_64

    add             v0.4s, v0.4s, v1.4s         // merge the two accumulators
    addv            s0, v0.4s
    fmov            w0, s0
    ret
endfunc