;*****************************************************************************
;* Copyright (C) 2013-2020 MulticoreWare, Inc
;*
;* Authors: Praveen Kumar Tiwari
;*          Murugan Vairavel
;*          Min Chen
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at license @ x265.com.
;*****************************************************************************/

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA 64

ALIGN 64
const shuf1_avx512,  dq 0, 2, 4, 6, 1, 3, 5, 7

cextern pb_4
cextern pb_1
cextern pb_16
cextern pb_64
cextern pw_4
cextern pb_8
cextern pb_32
cextern pb_128

SECTION .text

;-----------------------------------------------------------------------------
; void blockcopy_pp_2x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_2x4, 4, 7, 0
    mov    r4w, [r2]
    mov    r5w, [r2 + r3]
    mov    r6w, [r2 + 2 * r3]
    lea    r3, [r3 + 2 * r3]
    mov    r3w, [r2 + r3]

    mov    [r0], r4w
    mov    [r0 + r1], r5w
    mov    [r0 + 2 * r1], r6w
    lea    r1, [r1 + 2 * r1]
    mov    [r0 + r1], r3w
    RET

;-----------------------------------------------------------------------------
; void blockcopy_pp_2x8(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_2x8, 4, 7, 0
    lea    r5, [3 * r1]
    lea    r6, [3 * r3]

    mov    r4w, [r2]
    mov    [r0], r4w
    mov    r4w, [r2 + r3]
    mov    [r0 + r1], r4w
    mov    r4w, [r2 + 2 * r3]
    mov    [r0 + 2 * r1], r4w
    mov    r4w, [r2 + r6]
    mov    [r0 + r5], r4w

    lea    r2, [r2 + 4 * r3]
    mov    r4w, [r2]
    lea    r0, [r0 + 4 * r1]
    mov    [r0], r4w
    mov    r4w, [r2 + r3]
    mov    [r0 + r1], r4w
    mov    r4w, [r2 + 2 * r3]
    mov    [r0 + 2 * r1], r4w
    mov    r4w, [r2 + r6]
    mov    [r0 + r5], r4w
    RET

;-----------------------------------------------------------------------------
; void blockcopy_pp_2x16(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_2x16, 4, 7, 0
    lea    r5, [3 * r1]
    lea    r6, [3 * r3]

    mov    r4w, [r2]
    mov    [r0], r4w
    mov    r4w, [r2 + r3]
    mov    [r0 + r1], r4w
    mov    r4w, [r2 + 2 * r3]
    mov    [r0 + 2 * r1], r4w
    mov    r4w, [r2 + r6]
    mov    [r0 + r5], r4w
%rep 3
    lea    r2, [r2 + 4 * r3]
    mov    r4w, [r2]
    lea    r0, [r0 + 4 * r1]
    mov    [r0], r4w
    mov    r4w, [r2 + r3]
    mov    [r0 + r1], r4w
    mov    r4w, [r2 + 2 * r3]
    mov    [r0 + 2 * r1], r4w
    mov    r4w, [r2 + r6]
    mov    [r0 + r5], r4w
%endrep
    RET
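; A 2-pixel row is only two bytes in this 8-bit build, so the 2xN kernels
; above stay scalar: each row is one 16-bit GPR load and one 16-bit store.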
;-----------------------------------------------------------------------------
; void blockcopy_pp_4x2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_4x2, 4, 6, 0
    mov    r4d, [r2]
    mov    r5d, [r2 + r3]

    mov    [r0], r4d
    mov    [r0 + r1], r5d
    RET

;-----------------------------------------------------------------------------
; void blockcopy_pp_4x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_4x4, 4, 4, 4
    movd   m0, [r2]
    movd   m1, [r2 + r3]
    movd   m2, [r2 + 2 * r3]
    lea    r3, [r3 + r3 * 2]
    movd   m3, [r2 + r3]

    movd   [r0], m0
    movd   [r0 + r1], m1
    movd   [r0 + 2 * r1], m2
    lea    r1, [r1 + 2 * r1]
    movd   [r0 + r1], m3
    RET

;-----------------------------------------------------------------------------
; void blockcopy_pp_4x8(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_4x8, 4, 6, 4
    lea    r4, [3 * r1]
    lea    r5, [3 * r3]

    movd   m0, [r2]
    movd   m1, [r2 + r3]
    movd   m2, [r2 + 2 * r3]
    movd   m3, [r2 + r5]

    movd   [r0], m0
    movd   [r0 + r1], m1
    movd   [r0 + 2 * r1], m2
    movd   [r0 + r4], m3

    lea    r2, [r2 + 4 * r3]
    movd   m0, [r2]
    movd   m1, [r2 + r3]
    movd   m2, [r2 + 2 * r3]
    movd   m3, [r2 + r5]

    lea    r0, [r0 + 4 * r1]
    movd   [r0], m0
    movd   [r0 + r1], m1
    movd   [r0 + 2 * r1], m2
    movd   [r0 + r4], m3
    RET

;-----------------------------------------------------------------------------
; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W4_H8 2
INIT_XMM sse2
cglobal blockcopy_pp_%1x%2, 4, 7, 4
    mov    r4d, %2/8
    lea    r5, [3 * r1]
    lea    r6, [3 * r3]
.loop:
    movd   m0, [r2]
    movd   m1, [r2 + r3]
    movd   m2, [r2 + 2 * r3]
    movd   m3, [r2 + r6]

    movd   [r0], m0
    movd   [r0 + r1], m1
    movd   [r0 + 2 * r1], m2
    movd   [r0 + r5], m3

    lea    r2, [r2 + 4 * r3]
    movd   m0, [r2]
    movd   m1, [r2 + r3]
    movd   m2, [r2 + 2 * r3]
    movd   m3, [r2 + r6]

    lea    r0, [r0 + 4 * r1]
    movd   [r0], m0
    movd   [r0 + r1], m1
    movd   [r0 + 2 * r1], m2
    movd   [r0 + r5], m3

    lea    r0, [r0 + 4 * r1]
    lea    r2, [r2 + 4 * r3]
    dec    r4d
    jnz    .loop
    RET
%endmacro

BLOCKCOPY_PP_W4_H8 4, 16
BLOCKCOPY_PP_W4_H8 4, 32
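; BLOCKCOPY_PP_W4_H8 is parameterized on block width (%1) and height (%2); the
; unrolled body copies eight rows per iteration, hence the %2/8 loop counter.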
;-----------------------------------------------------------------------------
; void blockcopy_pp_6x8(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_6x8, 4, 7, 3
    movd   m0, [r2]
    mov    r4w, [r2 + 4]
    movd   m1, [r2 + r3]
    mov    r5w, [r2 + r3 + 4]
    movd   m2, [r2 + 2 * r3]
    mov    r6w, [r2 + 2 * r3 + 4]

    movd   [r0], m0
    mov    [r0 + 4], r4w
    movd   [r0 + r1], m1
    mov    [r0 + r1 + 4], r5w
    movd   [r0 + 2 * r1], m2
    mov    [r0 + 2 * r1 + 4], r6w

    lea    r2, [r2 + 2 * r3]
    movd   m0, [r2 + r3]
    mov    r4w, [r2 + r3 + 4]
    movd   m1, [r2 + 2 * r3]
    mov    r5w, [r2 + 2 * r3 + 4]
    lea    r2, [r2 + 2 * r3]
    movd   m2, [r2 + r3]
    mov    r6w, [r2 + r3 + 4]

    lea    r0, [r0 + 2 * r1]
    movd   [r0 + r1], m0
    mov    [r0 + r1 + 4], r4w
    movd   [r0 + 2 * r1], m1
    mov    [r0 + 2 * r1 + 4], r5w
    lea    r0, [r0 + 2 * r1]
    movd   [r0 + r1], m2
    mov    [r0 + r1 + 4], r6w

    lea    r2, [r2 + 2 * r3]
    movd   m0, [r2]
    mov    r4w, [r2 + 4]
    movd   m1, [r2 + r3]
    mov    r5w, [r2 + r3 + 4]

    lea    r0, [r0 + 2 * r1]
    movd   [r0], m0
    mov    [r0 + 4], r4w
    movd   [r0 + r1], m1
    mov    [r0 + r1 + 4], r5w
    RET

;-----------------------------------------------------------------------------
; void blockcopy_pp_6x16(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_6x16, 4, 7, 2
    mov    r6d, 16/2
.loop:
    movd   m0, [r2]
    mov    r4w, [r2 + 4]
    movd   m1, [r2 + r3]
    mov    r5w, [r2 + r3 + 4]
    lea    r2, [r2 + r3 * 2]
    movd   [r0], m0
    mov    [r0 + 4], r4w
    movd   [r0 + r1], m1
    mov    [r0 + r1 + 4], r5w
    lea    r0, [r0 + r1 * 2]
    dec    r6d
    jnz    .loop
    RET

;-----------------------------------------------------------------------------
; void blockcopy_pp_8x2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_8x2, 4, 4, 2
    movh   m0, [r2]
    movh   m1, [r2 + r3]

    movh   [r0], m0
    movh   [r0 + r1], m1
    RET

;-----------------------------------------------------------------------------
; void blockcopy_pp_8x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_8x4, 4, 4, 4
    movh   m0, [r2]
    movh   m1, [r2 + r3]
    movh   m2, [r2 + 2 * r3]
    lea    r3, [r3 + r3 * 2]
    movh   m3, [r2 + r3]

    movh   [r0], m0
    movh   [r0 + r1], m1
    movh   [r0 + 2 * r1], m2
    lea    r1, [r1 + 2 * r1]
    movh   [r0 + r1], m3
    RET

;-----------------------------------------------------------------------------
; void blockcopy_pp_8x6(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_8x6, 4, 4, 6
    movh   m0, [r2]
    movh   m1, [r2 + r3]
    lea    r2, [r2 + 2 * r3]
    movh   m2, [r2]
    movh   m3, [r2 + r3]
    lea    r2, [r2 + 2 * r3]
    movh   m4, [r2]
    movh   m5, [r2 + r3]

    movh   [r0], m0
    movh   [r0 + r1], m1
    lea    r0, [r0 + 2 * r1]
    movh   [r0], m2
    movh   [r0 + r1], m3
    lea    r0, [r0 + 2 * r1]
    movh   [r0], m4
    movh   [r0 + r1], m5
    RET

;-----------------------------------------------------------------------------
; void blockcopy_pp_8x12(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_8x12, 4, 6, 4
    lea    r4, [3 * r3]
    lea    r5, [3 * r1]
    movh   m0, [r2]
    movh   m1, [r2 + r3]
    movh   m2, [r2 + 2 * r3]
    movh   m3, [r2 + r4]

    movh   [r0], m0
    movh   [r0 + r1], m1
    movh   [r0 + 2 * r1], m2
    movh   [r0 + r5], m3
%rep 2
    lea    r2, [r2 + 4 * r3]
    movh   m0, [r2]
    movh   m1, [r2 + r3]
    movh   m2, [r2 + 2 * r3]
    movh   m3, [r2 + r4]

    lea    r0, [r0 + 4 * r1]
    movh   [r0], m0
    movh   [r0 + r1], m1
    movh   [r0 + 2 * r1], m2
    movh   [r0 + r5], m3
%endrep
    RET

;-----------------------------------------------------------------------------
; void blockcopy_pp_8x8(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_8x8, 4, 6, 4
    lea    r4, [3 * r3]
    lea    r5, [3 * r1]
    movh   m0, [r2]
    movh   m1, [r2 + r3]
    movh   m2, [r2 + 2 * r3]
    movh   m3, [r2 + r4]

    movh   [r0], m0
    movh   [r0 + r1], m1
    movh   [r0 + 2 * r1], m2
    movh   [r0 + r5], m3

    lea    r2, [r2 + 4 * r3]
    movh   m0, [r2]
    movh   m1, [r2 + r3]
    movh   m2, [r2 + 2 * r3]
    movh   m3, [r2 + r4]

    lea    r0, [r0 + 4 * r1]
    movh   [r0], m0
    movh   [r0 + r1], m1
    movh   [r0 + 2 * r1], m2
    movh   [r0 + r5], m3
    RET
;-----------------------------------------------------------------------------
; void blockcopy_pp_8x16(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_8x16, 4, 6, 4
    lea    r4, [3 * r3]
    lea    r5, [3 * r1]
    movh   m0, [r2]
    movh   m1, [r2 + r3]
    movh   m2, [r2 + 2 * r3]
    movh   m3, [r2 + r4]

    movh   [r0], m0
    movh   [r0 + r1], m1
    movh   [r0 + 2 * r1], m2
    movh   [r0 + r5], m3
%rep 3
    lea    r2, [r2 + 4 * r3]
    movh   m0, [r2]
    movh   m1, [r2 + r3]
    movh   m2, [r2 + 2 * r3]
    movh   m3, [r2 + r4]

    lea    r0, [r0 + 4 * r1]
    movh   [r0], m0
    movh   [r0 + r1], m1
    movh   [r0 + 2 * r1], m2
    movh   [r0 + r5], m3
%endrep
    RET

;-----------------------------------------------------------------------------
; void blockcopy_pp_8x32(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_8x32, 4, 6, 4
    lea    r4, [3 * r3]
    lea    r5, [3 * r1]
    movh   m0, [r2]
    movh   m1, [r2 + r3]
    movh   m2, [r2 + 2 * r3]
    movh   m3, [r2 + r4]

    movh   [r0], m0
    movh   [r0 + r1], m1
    movh   [r0 + 2 * r1], m2
    movh   [r0 + r5], m3
%rep 7
    lea    r2, [r2 + 4 * r3]
    movh   m0, [r2]
    movh   m1, [r2 + r3]
    movh   m2, [r2 + 2 * r3]
    movh   m3, [r2 + r4]

    lea    r0, [r0 + 4 * r1]
    movh   [r0], m0
    movh   [r0 + r1], m1
    movh   [r0 + 2 * r1], m2
    movh   [r0 + r5], m3
%endrep
    RET

;-----------------------------------------------------------------------------
; void blockcopy_pp_8x64(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_8x64, 4, 6, 4
    lea    r4, [3 * r3]
    lea    r5, [3 * r1]
    movh   m0, [r2]
    movh   m1, [r2 + r3]
    movh   m2, [r2 + 2 * r3]
    movh   m3, [r2 + r4]

    movh   [r0], m0
    movh   [r0 + r1], m1
    movh   [r0 + 2 * r1], m2
    movh   [r0 + r5], m3
%rep 15
    lea    r2, [r2 + 4 * r3]
    movh   m0, [r2]
    movh   m1, [r2 + r3]
    movh   m2, [r2 + 2 * r3]
    movh   m3, [r2 + r4]

    lea    r0, [r0 + 4 * r1]
    movh   [r0], m0
    movh   [r0 + r1], m1
    movh   [r0 + 2 * r1], m2
    movh   [r0 + r5], m3
%endrep
    RET

;-----------------------------------------------------------------------------
; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W12_H4 2
INIT_XMM sse2
cglobal blockcopy_pp_%1x%2, 4, 5, 4
    mov    r4d, %2/4
.loop:
    movh   m0, [r2]
    movd   m1, [r2 + 8]
    movh   m2, [r2 + r3]
    movd   m3, [r2 + r3 + 8]
    lea    r2, [r2 + 2 * r3]

    movh   [r0], m0
    movd   [r0 + 8], m1
    movh   [r0 + r1], m2
    movd   [r0 + r1 + 8], m3
    lea    r0, [r0 + 2 * r1]

    movh   m0, [r2]
    movd   m1, [r2 + 8]
    movh   m2, [r2 + r3]
    movd   m3, [r2 + r3 + 8]

    movh   [r0], m0
    movd   [r0 + 8], m1
    movh   [r0 + r1], m2
    movd   [r0 + r1 + 8], m3

    dec    r4d
    lea    r0, [r0 + 2 * r1]
    lea    r2, [r2 + 2 * r3]
    jnz    .loop
    RET
%endmacro

BLOCKCOPY_PP_W12_H4 12, 16
BLOCKCOPY_PP_W12_H4 12, 32
;-----------------------------------------------------------------------------
; void blockcopy_pp_16x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W16_H4 2
INIT_XMM sse2
cglobal blockcopy_pp_%1x%2, 4, 5, 4
    mov    r4d, %2/4
.loop:
    movu   m0, [r2]
    movu   m1, [r2 + r3]
    lea    r2, [r2 + 2 * r3]
    movu   m2, [r2]
    movu   m3, [r2 + r3]

    movu   [r0], m0
    movu   [r0 + r1], m1
    lea    r0, [r0 + 2 * r1]
    movu   [r0], m2
    movu   [r0 + r1], m3

    dec    r4d
    lea    r0, [r0 + 2 * r1]
    lea    r2, [r2 + 2 * r3]
    jnz    .loop
    RET
%endmacro

BLOCKCOPY_PP_W16_H4 16, 4
BLOCKCOPY_PP_W16_H4 16, 12

;-----------------------------------------------------------------------------
; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W16_H8 2
INIT_XMM sse2
cglobal blockcopy_pp_%1x%2, 4, 5, 6
    mov    r4d, %2/8
.loop:
    movu   m0, [r2]
    movu   m1, [r2 + r3]
    lea    r2, [r2 + 2 * r3]
    movu   m2, [r2]
    movu   m3, [r2 + r3]
    lea    r2, [r2 + 2 * r3]
    movu   m4, [r2]
    movu   m5, [r2 + r3]
    lea    r2, [r2 + 2 * r3]

    movu   [r0], m0
    movu   [r0 + r1], m1
    lea    r0, [r0 + 2 * r1]
    movu   [r0], m2
    movu   [r0 + r1], m3
    lea    r0, [r0 + 2 * r1]
    movu   [r0], m4
    movu   [r0 + r1], m5
    lea    r0, [r0 + 2 * r1]

    movu   m0, [r2]
    movu   m1, [r2 + r3]
    movu   [r0], m0
    movu   [r0 + r1], m1

    dec    r4d
    lea    r0, [r0 + 2 * r1]
    lea    r2, [r2 + 2 * r3]
    jnz    .loop
    RET
%endmacro

BLOCKCOPY_PP_W16_H8 16, 8
BLOCKCOPY_PP_W16_H8 16, 16
BLOCKCOPY_PP_W16_H8 16, 32
BLOCKCOPY_PP_W16_H8 16, 64
BLOCKCOPY_PP_W16_H8 16, 24

;-----------------------------------------------------------------------------
; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W24_H4 2
INIT_XMM sse2
cglobal blockcopy_pp_%1x%2, 4, 5, 6
    mov    r4d, %2/4
.loop:
    movu   m0, [r2]
    movh   m1, [r2 + 16]
    movu   m2, [r2 + r3]
    movh   m3, [r2 + r3 + 16]
    lea    r2, [r2 + 2 * r3]
    movu   m4, [r2]
    movh   m5, [r2 + 16]

    movu   [r0], m0
    movh   [r0 + 16], m1
    movu   [r0 + r1], m2
    movh   [r0 + r1 + 16], m3
    lea    r0, [r0 + 2 * r1]
    movu   [r0], m4
    movh   [r0 + 16], m5

    movu   m0, [r2 + r3]
    movh   m1, [r2 + r3 + 16]
    movu   [r0 + r1], m0
    movh   [r0 + r1 + 16], m1

    dec    r4d
    lea    r0, [r0 + 2 * r1]
    lea    r2, [r2 + 2 * r3]
    jnz    .loop
    RET
%endmacro

BLOCKCOPY_PP_W24_H4 24, 32
BLOCKCOPY_PP_W24_H4 24, 64

;-----------------------------------------------------------------------------
; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W32_H4 2
INIT_XMM sse2
cglobal blockcopy_pp_%1x%2, 4, 5, 4
    mov    r4d, %2/4
.loop:
    movu   m0, [r2]
    movu   m1, [r2 + 16]
    movu   m2, [r2 + r3]
    movu   m3, [r2 + r3 + 16]
    lea    r2, [r2 + 2 * r3]

    movu   [r0], m0
    movu   [r0 + 16], m1
    movu   [r0 + r1], m2
    movu   [r0 + r1 + 16], m3
    lea    r0, [r0 + 2 * r1]

    movu   m0, [r2]
    movu   m1, [r2 + 16]
    movu   m2, [r2 + r3]
    movu   m3, [r2 + r3 + 16]

    movu   [r0], m0
    movu   [r0 + 16], m1
    movu   [r0 + r1], m2
    movu   [r0 + r1 + 16], m3

    dec    r4d
    lea    r0, [r0 + 2 * r1]
    lea    r2, [r2 + 2 * r3]
    jnz    .loop
    RET
%endmacro

BLOCKCOPY_PP_W32_H4 32, 8
BLOCKCOPY_PP_W32_H4 32, 16
BLOCKCOPY_PP_W32_H4 32, 24
BLOCKCOPY_PP_W32_H4 32, 32
BLOCKCOPY_PP_W32_H4 32, 64
BLOCKCOPY_PP_W32_H4 32, 48

INIT_YMM avx
cglobal blockcopy_pp_32x8, 4, 6, 6
    lea    r4, [3 * r1]
    lea    r5, [3 * r3]

    movu   m0, [r2]
    movu   m1, [r2 + r3]
    movu   m2, [r2 + 2 * r3]
    movu   m3, [r2 + r5]
    lea    r2, [r2 + 4 * r3]
    movu   m4, [r2]
    movu   m5, [r2 + r3]

    movu   [r0], m0
    movu   [r0 + r1], m1
    movu   [r0 + 2 * r1], m2
    movu   [r0 + r4], m3
    lea    r0, [r0 + 4 * r1]
    movu   [r0], m4
    movu   [r0 + r1], m5

    movu   m0, [r2 + 2 * r3]
    movu   m1, [r2 + r5]
    movu   [r0 + 2 * r1], m0
    movu   [r0 + r4], m1
    RET

INIT_YMM avx
cglobal blockcopy_pp_32x16, 4, 6, 6
    lea    r4, [3 * r1]
    lea    r5, [3 * r3]

    movu   m0, [r2]
    movu   m1, [r2 + r3]
    movu   m2, [r2 + 2 * r3]
    movu   m3, [r2 + r5]
    lea    r2, [r2 + 4 * r3]
    movu   m4, [r2]
    movu   m5, [r2 + r3]

    movu   [r0], m0
    movu   [r0 + r1], m1
    movu   [r0 + 2 * r1], m2
    movu   [r0 + r4], m3
    lea    r0, [r0 + 4 * r1]
    movu   [r0], m4
    movu   [r0 + r1], m5

    movu   m0, [r2 + 2 * r3]
    movu   m1, [r2 + r5]
    lea    r2, [r2 + 4 * r3]
    movu   m2, [r2]
    movu   m3, [r2 + r3]
    movu   m4, [r2 + 2 * r3]
    movu   m5, [r2 + r5]

    movu   [r0 + 2 * r1], m0
    movu   [r0 + r4], m1
    lea    r0, [r0 + 4 * r1]
    movu   [r0], m2
    movu   [r0 + r1], m3
    movu   [r0 + 2 * r1], m4
    movu   [r0 + r4], m5

    lea    r2, [r2 + 4 * r3]
    movu   m0, [r2]
    movu   m1, [r2 + r3]
    movu   m2, [r2 + 2 * r3]
    movu   m3, [r2 + r5]

    lea    r0, [r0 + 4 * r1]
    movu   [r0], m0
    movu   [r0 + r1], m1
    movu   [r0 + 2 * r1], m2
    movu   [r0 + r4], m3
    RET
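; With AVX, a full 32-pixel row fits in one ymm register, so the 32xN kernels
; need a single load/store per row where the SSE2 versions needed two 16-byte
; accesses.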
;-----------------------------------------------------------------------------
; void blockcopy_pp_32x24(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_YMM avx
cglobal blockcopy_pp_32x24, 4, 7, 6
    lea    r4, [3 * r1]
    lea    r5, [3 * r3]
    mov    r6d, 24/8
.loop:
    movu   m0, [r2]
    movu   m1, [r2 + r3]
    movu   m2, [r2 + 2 * r3]
    movu   m3, [r2 + r5]
    lea    r2, [r2 + 4 * r3]
    movu   m4, [r2]
    movu   m5, [r2 + r3]

    movu   [r0], m0
    movu   [r0 + r1], m1
    movu   [r0 + 2 * r1], m2
    movu   [r0 + r4], m3
    lea    r0, [r0 + 4 * r1]
    movu   [r0], m4
    movu   [r0 + r1], m5

    movu   m0, [r2 + 2 * r3]
    movu   m1, [r2 + r5]
    movu   [r0 + 2 * r1], m0
    movu   [r0 + r4], m1

    lea    r2, [r2 + 4 * r3]
    lea    r0, [r0 + 4 * r1]
    dec    r6d
    jnz    .loop
    RET

;-----------------------------------------------------------------------------
; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W32_H16_avx 2
INIT_YMM avx
cglobal blockcopy_pp_%1x%2, 4, 7, 6
    lea    r4, [3 * r1]
    lea    r5, [3 * r3]
    mov    r6d, %2/16
.loop:
    movu   m0, [r2]
    movu   m1, [r2 + r3]
    movu   m2, [r2 + 2 * r3]
    movu   m3, [r2 + r5]
    lea    r2, [r2 + 4 * r3]
    movu   m4, [r2]
    movu   m5, [r2 + r3]

    movu   [r0], m0
    movu   [r0 + r1], m1
    movu   [r0 + 2 * r1], m2
    movu   [r0 + r4], m3
    lea    r0, [r0 + 4 * r1]
    movu   [r0], m4
    movu   [r0 + r1], m5

    movu   m0, [r2 + 2 * r3]
    movu   m1, [r2 + r5]
    lea    r2, [r2 + 4 * r3]
    movu   m2, [r2]
    movu   m3, [r2 + r3]
    movu   m4, [r2 + 2 * r3]
    movu   m5, [r2 + r5]

    movu   [r0 + 2 * r1], m0
    movu   [r0 + r4], m1
    lea    r0, [r0 + 4 * r1]
    movu   [r0], m2
    movu   [r0 + r1], m3
    movu   [r0 + 2 * r1], m4
    movu   [r0 + r4], m5

    lea    r2, [r2 + 4 * r3]
    movu   m0, [r2]
    movu   m1, [r2 + r3]
    movu   m2, [r2 + 2 * r3]
    movu   m3, [r2 + r5]

    lea    r0, [r0 + 4 * r1]
    movu   [r0], m0
    movu   [r0 + r1], m1
    movu   [r0 + 2 * r1], m2
    movu   [r0 + r4], m3

    lea    r2, [r2 + 4 * r3]
    lea    r0, [r0 + 4 * r1]
    dec    r6d
    jnz    .loop
    RET
%endmacro

BLOCKCOPY_PP_W32_H16_avx 32, 32
BLOCKCOPY_PP_W32_H16_avx 32, 48
BLOCKCOPY_PP_W32_H16_avx 32, 64

;-----------------------------------------------------------------------------
; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W48_H2 2
INIT_XMM sse2
cglobal blockcopy_pp_%1x%2, 4, 5, 6
    mov    r4d, %2/4
.loop:
    movu   m0, [r2]
    movu   m1, [r2 + 16]
    movu   m2, [r2 + 32]
    movu   m3, [r2 + r3]
    movu   m4, [r2 + r3 + 16]
    movu   m5, [r2 + r3 + 32]
    lea    r2, [r2 + 2 * r3]

    movu   [r0], m0
    movu   [r0 + 16], m1
    movu   [r0 + 32], m2
    movu   [r0 + r1], m3
    movu   [r0 + r1 + 16], m4
    movu   [r0 + r1 + 32], m5
    lea    r0, [r0 + 2 * r1]

    movu   m0, [r2]
    movu   m1, [r2 + 16]
    movu   m2, [r2 + 32]
    movu   m3, [r2 + r3]
    movu   m4, [r2 + r3 + 16]
    movu   m5, [r2 + r3 + 32]

    movu   [r0], m0
    movu   [r0 + 16], m1
    movu   [r0 + 32], m2
    movu   [r0 + r1], m3
    movu   [r0 + r1 + 16], m4
    movu   [r0 + r1 + 32], m5

    dec    r4d
    lea    r0, [r0 + 2 * r1]
    lea    r2, [r2 + 2 * r3]
    jnz    .loop
    RET
%endmacro

BLOCKCOPY_PP_W48_H2 48, 64
;-----------------------------------------------------------------------------
; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W48_H4_avx 2
INIT_YMM avx
cglobal blockcopy_pp_%1x%2, 4, 5, 4
    mov    r4d, %2/4
.loop:
    movu   m0, [r2]
    movu   xm1, [r2 + 32]
    movu   m2, [r2 + r3]
    movu   xm3, [r2 + r3 + 32]
    lea    r2, [r2 + 2 * r3]

    movu   [r0], m0
    movu   [r0 + 32], xm1
    movu   [r0 + r1], m2
    movu   [r0 + r1 + 32], xm3
    lea    r0, [r0 + 2 * r1]

    movu   m0, [r2]
    movu   xm1, [r2 + 32]
    movu   m2, [r2 + r3]
    movu   xm3, [r2 + r3 + 32]

    movu   [r0], m0
    movu   [r0 + 32], xm1
    movu   [r0 + r1], m2
    movu   [r0 + r1 + 32], xm3

    dec    r4d
    lea    r0, [r0 + 2 * r1]
    lea    r2, [r2 + 2 * r3]
    jnz    .loop
    RET
%endmacro

BLOCKCOPY_PP_W48_H4_avx 48, 64

;-----------------------------------------------------------------------------
; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W64_H4 2
INIT_XMM sse2
cglobal blockcopy_pp_%1x%2, 4, 5, 6
    mov    r4d, %2/4
.loop:
    movu   m0, [r2]
    movu   m1, [r2 + 16]
    movu   m2, [r2 + 32]
    movu   m3, [r2 + 48]
    movu   m4, [r2 + r3]
    movu   m5, [r2 + r3 + 16]

    movu   [r0], m0
    movu   [r0 + 16], m1
    movu   [r0 + 32], m2
    movu   [r0 + 48], m3
    movu   [r0 + r1], m4
    movu   [r0 + r1 + 16], m5

    movu   m0, [r2 + r3 + 32]
    movu   m1, [r2 + r3 + 48]
    lea    r2, [r2 + 2 * r3]
    movu   m2, [r2]
    movu   m3, [r2 + 16]
    movu   m4, [r2 + 32]
    movu   m5, [r2 + 48]

    movu   [r0 + r1 + 32], m0
    movu   [r0 + r1 + 48], m1
    lea    r0, [r0 + 2 * r1]
    movu   [r0], m2
    movu   [r0 + 16], m3
    movu   [r0 + 32], m4
    movu   [r0 + 48], m5

    movu   m0, [r2 + r3]
    movu   m1, [r2 + r3 + 16]
    movu   m2, [r2 + r3 + 32]
    movu   m3, [r2 + r3 + 48]

    movu   [r0 + r1], m0
    movu   [r0 + r1 + 16], m1
    movu   [r0 + r1 + 32], m2
    movu   [r0 + r1 + 48], m3

    dec    r4d
    lea    r0, [r0 + 2 * r1]
    lea    r2, [r2 + 2 * r3]
    jnz    .loop
    RET
%endmacro

BLOCKCOPY_PP_W64_H4 64, 16
BLOCKCOPY_PP_W64_H4 64, 32
BLOCKCOPY_PP_W64_H4 64, 48
BLOCKCOPY_PP_W64_H4 64, 64

;-----------------------------------------------------------------------------
; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W64_H4_avx 2
INIT_YMM avx
cglobal blockcopy_pp_%1x%2, 4, 7, 6
    lea    r4, [3 * r1]
    lea    r5, [3 * r3]
    mov    r6d, %2/4
.loop:
    movu   m0, [r2]
    movu   m1, [r2 + 32]
    movu   m2, [r2 + r3]
    movu   m3, [r2 + r3 + 32]
    movu   m4, [r2 + 2 * r3]
    movu   m5, [r2 + 2 * r3 + 32]

    movu   [r0], m0
    movu   [r0 + 32], m1
    movu   [r0 + r1], m2
    movu   [r0 + r1 + 32], m3
    movu   [r0 + 2 * r1], m4
    movu   [r0 + 2 * r1 + 32], m5

    movu   m0, [r2 + r5]
    movu   m1, [r2 + r5 + 32]
    movu   [r0 + r4], m0
    movu   [r0 + r4 + 32], m1

    lea    r2, [r2 + 4 * r3]
    lea    r0, [r0 + 4 * r1]
    dec    r6d
    jnz    .loop
    RET
%endmacro

BLOCKCOPY_PP_W64_H4_avx 64, 16
BLOCKCOPY_PP_W64_H4_avx 64, 32
BLOCKCOPY_PP_W64_H4_avx 64, 48
BLOCKCOPY_PP_W64_H4_avx 64, 64
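; The AVX-512 kernels below move a full 64-pixel row per zmm register; the
; 32-wide variants pair two rows in one zmm via vinserti32x8/vextracti32x8.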
;----------------------------------------------------------------------------------------------
; blockcopy_pp avx512 code start
;----------------------------------------------------------------------------------------------
%macro PROCESS_BLOCKCOPY_PP_64X4_avx512 0
    movu   m0, [r2]
    movu   m1, [r2 + r3]
    movu   m2, [r2 + 2 * r3]
    movu   m3, [r2 + r4]

    movu   [r0], m0
    movu   [r0 + r1], m1
    movu   [r0 + 2 * r1], m2
    movu   [r0 + r5], m3
%endmacro

%macro PROCESS_BLOCKCOPY_PP_32X4_avx512 0
    movu          ym0, [r2]
    vinserti32x8  m0, [r2 + r3], 1
    movu          ym1, [r2 + 2 * r3]
    vinserti32x8  m1, [r2 + r4], 1

    movu          [r0], ym0
    vextracti32x8 [r0 + r1], m0, 1
    movu          [r0 + 2 * r1], ym1
    vextracti32x8 [r0 + r5], m1, 1
%endmacro

;----------------------------------------------------------------------------------------------
; void blockcopy_pp_64x%1(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;----------------------------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W64_H4_avx512 1
INIT_ZMM avx512
cglobal blockcopy_pp_64x%1, 4, 6, 4
    lea    r4, [3 * r3]
    lea    r5, [3 * r1]
%rep %1/4 - 1
    PROCESS_BLOCKCOPY_PP_64X4_avx512
    lea    r2, [r2 + 4 * r3]
    lea    r0, [r0 + 4 * r1]
%endrep
    PROCESS_BLOCKCOPY_PP_64X4_avx512
    RET
%endmacro

BLOCKCOPY_PP_W64_H4_avx512 16
BLOCKCOPY_PP_W64_H4_avx512 32
BLOCKCOPY_PP_W64_H4_avx512 48
BLOCKCOPY_PP_W64_H4_avx512 64

%macro BLOCKCOPY_PP_W32_H4_avx512 1
INIT_ZMM avx512
cglobal blockcopy_pp_32x%1, 4, 6, 2
    lea    r4, [3 * r3]
    lea    r5, [3 * r1]
%rep %1/4 - 1
    PROCESS_BLOCKCOPY_PP_32X4_avx512
    lea    r2, [r2 + 4 * r3]
    lea    r0, [r0 + 4 * r1]
%endrep
    PROCESS_BLOCKCOPY_PP_32X4_avx512
    RET
%endmacro

BLOCKCOPY_PP_W32_H4_avx512 8
BLOCKCOPY_PP_W32_H4_avx512 16
BLOCKCOPY_PP_W32_H4_avx512 24
BLOCKCOPY_PP_W32_H4_avx512 32
BLOCKCOPY_PP_W32_H4_avx512 48
BLOCKCOPY_PP_W32_H4_avx512 64
;----------------------------------------------------------------------------------------------
; blockcopy_pp avx512 code end
;----------------------------------------------------------------------------------------------
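; blockcopy_sp_* narrows int16_t coefficients back to pixels with unsigned
; saturation (packuswb), i.e. dst[x] = min(max(src[x], 0), 255) in the 8-bit
; build; srcStride is given in int16_t units, hence the 'add r3, r3' below.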
;-----------------------------------------------------------------------------
; void blockcopy_sp_2x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_sp_2x4, 4, 5, 2
    add    r3, r3

    ;Row 0-1
    movd   m0, [r2]
    movd   m1, [r2 + r3]
    packuswb m0, m1
    movd   r4d, m0
    mov    [r0], r4w
    pextrw [r0 + r1], m0, 4

    ;Row 2-3
    movd   m0, [r2 + 2 * r3]
    lea    r2, [r2 + 2 * r3]
    movd   m1, [r2 + r3]
    packuswb m0, m1
    movd   r4d, m0
    mov    [r0 + 2 * r1], r4w
    lea    r0, [r0 + 2 * r1]
    pextrw [r0 + r1], m0, 4
    RET

;-----------------------------------------------------------------------------
; void blockcopy_sp_2x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_sp_2x8, 4, 5, 2
    add    r3, r3

    ;Row 0-1
    movd   m0, [r2]
    movd   m1, [r2 + r3]
    packuswb m0, m1
    movd   r4d, m0
    mov    [r0], r4w
    pextrw [r0 + r1], m0, 4

    ;Row 2-3
    movd   m0, [r2 + 2 * r3]
    lea    r2, [r2 + 2 * r3]
    movd   m1, [r2 + r3]
    packuswb m0, m1
    movd   r4d, m0
    mov    [r0 + 2 * r1], r4w
    lea    r0, [r0 + 2 * r1]
    pextrw [r0 + r1], m0, 4

    ;Row 4-5
    movd   m0, [r2 + 2 * r3]
    lea    r2, [r2 + 2 * r3]
    movd   m1, [r2 + r3]
    packuswb m0, m1
    movd   r4d, m0
    mov    [r0 + 2 * r1], r4w
    lea    r0, [r0 + 2 * r1]
    pextrw [r0 + r1], m0, 4

    ;Row 6-7
    movd   m0, [r2 + 2 * r3]
    lea    r2, [r2 + 2 * r3]
    movd   m1, [r2 + r3]
    packuswb m0, m1
    movd   r4d, m0
    mov    [r0 + 2 * r1], r4w
    lea    r0, [r0 + 2 * r1]
    pextrw [r0 + r1], m0, 4
    RET

;-----------------------------------------------------------------------------
; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W2_H2 2
INIT_XMM sse2
cglobal blockcopy_sp_%1x%2, 4, 7, 2, dst, dstStride, src, srcStride
    add    r3, r3
    mov    r6d, %2/2
.loop:
    movd   m0, [r2]
    movd   m1, [r2 + r3]
    dec    r6d
    lea    r2, [r2 + r3 * 2]
    packuswb m0, m0
    packuswb m1, m1
    movd   r4d, m0
    movd   r5d, m1
    mov    [r0], r4w
    mov    [r0 + r1], r5w
    lea    r0, [r0 + r1 * 2]
    jnz    .loop
    RET
%endmacro

BLOCKCOPY_SP_W2_H2 2, 4
BLOCKCOPY_SP_W2_H2 2, 8
BLOCKCOPY_SP_W2_H2 2, 16

;-----------------------------------------------------------------------------
; void blockcopy_sp_4x2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_sp_4x2, 4, 4, 2, dst, dstStride, src, srcStride
    add    r3, r3
    movh   m0, [r2]
    movh   m1, [r2 + r3]
    packuswb m0, m1
    movd   [r0], m0
    pshufd m0, m0, 2
    movd   [r0 + r1], m0
    RET

;-----------------------------------------------------------------------------
; void blockcopy_sp_4x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_sp_4x4, 4, 4, 4, dst, dstStride, src, srcStride
    add    r3, r3
    movh   m0, [r2]
    movh   m1, [r2 + r3]
    movh   m2, [r2 + 2 * r3]
    lea    r2, [r2 + 2 * r3]
    movh   m3, [r2 + r3]
    packuswb m0, m1
    packuswb m2, m3
    movd   [r0], m0
    pshufd m0, m0, 2
    movd   [r0 + r1], m0
    movd   [r0 + 2 * r1], m2
    lea    r0, [r0 + 2 * r1]
    pshufd m2, m2, 2
    movd   [r0 + r1], m2
    RET

;-----------------------------------------------------------------------------
; void blockcopy_sp_4x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_sp_4x8, 4, 4, 8, dst, dstStride, src, srcStride
    add    r3, r3
    movh   m0, [r2]
    movh   m1, [r2 + r3]
    movh   m2, [r2 + 2 * r3]
    lea    r2, [r2 + 2 * r3]
    movh   m3, [r2 + r3]
    movh   m4, [r2 + 2 * r3]
    lea    r2, [r2 + 2 * r3]
    movh   m5, [r2 + r3]
    movh   m6, [r2 + 2 * r3]
    lea    r2, [r2 + 2 * r3]
    movh   m7, [r2 + r3]
    packuswb m0, m1
    packuswb m2, m3
    packuswb m4, m5
    packuswb m6, m7
    movd   [r0], m0
    pshufd m0, m0, 2
    movd   [r0 + r1], m0
    movd   [r0 + 2 * r1], m2
    lea    r0, [r0 + 2 * r1]
    pshufd m2, m2, 2
    movd   [r0 + r1], m2
    movd   [r0 + 2 * r1], m4
    lea    r0, [r0 + 2 * r1]
    pshufd m4, m4, 2
    movd   [r0 + r1], m4
    movd   [r0 + 2 * r1], m6
    lea    r0, [r0 + 2 * r1]
    pshufd m6, m6, 2
    movd   [r0 + r1], m6
    RET

;-----------------------------------------------------------------------------
; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W4_H8 2
INIT_XMM sse2
cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
    mov    r4d, %2/8
    add    r3, r3
.loop:
    movh   m0, [r2]
    movh   m1, [r2 + r3]
    movh   m2, [r2 + 2 * r3]
    lea    r2, [r2 + 2 * r3]
    movh   m3, [r2 + r3]
    movh   m4, [r2 + 2 * r3]
    lea    r2, [r2 + 2 * r3]
    movh   m5, [r2 + r3]
    movh   m6, [r2 + 2 * r3]
    lea    r2, [r2 + 2 * r3]
    movh   m7, [r2 + r3]
    packuswb m0, m1
    packuswb m2, m3
    packuswb m4, m5
    packuswb m6, m7
    movd   [r0], m0
    pshufd m0, m0, 2
    movd   [r0 + r1], m0
    movd   [r0 + 2 * r1], m2
    lea    r0, [r0 + 2 * r1]
    pshufd m2, m2, 2
    movd   [r0 + r1], m2
    movd   [r0 + 2 * r1], m4
    lea    r0, [r0 + 2 * r1]
    pshufd m4, m4, 2
    movd   [r0 + r1], m4
    movd   [r0 + 2 * r1], m6
    lea    r0, [r0 + 2 * r1]
    pshufd m6, m6, 2
    movd   [r0 + r1], m6
    lea    r0, [r0 + 2 * r1]
    lea    r2, [r2 + 2 * r3]
    dec    r4d
    jnz    .loop
    RET
%endmacro

BLOCKCOPY_SP_W4_H8 4, 16
BLOCKCOPY_SP_W4_H8 4, 32
;-----------------------------------------------------------------------------
; void blockcopy_sp_6x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_sp_6x8, 4, 4, 2
    add    r3, r3

    movu   m0, [r2]
    movu   m1, [r2 + r3]
    packuswb m0, m1
    movd   [r0], m0
    pextrw [r0 + 4], m0, 2
    movhlps m0, m0
    movd   [r0 + r1], m0
    pextrw [r0 + r1 + 4], m0, 2

    lea    r0, [r0 + 2 * r1]
    lea    r2, [r2 + 2 * r3]

    movu   m0, [r2]
    movu   m1, [r2 + r3]
    packuswb m0, m1
    movd   [r0], m0
    pextrw [r0 + 4], m0, 2
    movhlps m0, m0
    movd   [r0 + r1], m0
    pextrw [r0 + r1 + 4], m0, 2

    lea    r0, [r0 + 2 * r1]
    lea    r2, [r2 + 2 * r3]

    movu   m0, [r2]
    movu   m1, [r2 + r3]
    packuswb m0, m1
    movd   [r0], m0
    pextrw [r0 + 4], m0, 2
    movhlps m0, m0
    movd   [r0 + r1], m0
    pextrw [r0 + r1 + 4], m0, 2

    lea    r0, [r0 + 2 * r1]
    lea    r2, [r2 + 2 * r3]

    movu   m0, [r2]
    movu   m1, [r2 + r3]
    packuswb m0, m1
    movd   [r0], m0
    pextrw [r0 + 4], m0, 2
    movhlps m0, m0
    movd   [r0 + r1], m0
    pextrw [r0 + r1 + 4], m0, 2
    RET

;-----------------------------------------------------------------------------
; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W6_H2 2
INIT_XMM sse2
cglobal blockcopy_sp_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride
    add    r3, r3
    mov    r6d, %2/2
.loop:
    movh   m0, [r2]
    movd   m2, [r2 + 8]
    movh   m1, [r2 + r3]
    movd   m3, [r2 + r3 + 8]
    dec    r6d
    lea    r2, [r2 + r3 * 2]
    packuswb m0, m0
    packuswb m2, m2
    packuswb m1, m1
    packuswb m3, m3
    movd   r4d, m2
    movd   r5d, m3
    movd   [r0], m0
    mov    [r0 + 4], r4w
    movd   [r0 + r1], m1
    mov    [r0 + r1 + 4], r5w
    lea    r0, [r0 + r1 * 2]
    jnz    .loop
    RET
%endmacro

BLOCKCOPY_SP_W6_H2 6, 8
BLOCKCOPY_SP_W6_H2 6, 16

;-----------------------------------------------------------------------------
; void blockcopy_sp_8x2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_sp_8x2, 4, 4, 2, dst, dstStride, src, srcStride
    add    r3, r3
    movu   m0, [r2]
    movu   m1, [r2 + r3]
    packuswb m0, m1
    movlps [r0], m0
    movhps [r0 + r1], m0
    RET

;-----------------------------------------------------------------------------
; void blockcopy_sp_8x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_sp_8x4, 4, 4, 4, dst, dstStride, src, srcStride
    add    r3, r3
    movu   m0, [r2]
    movu   m1, [r2 + r3]
    movu   m2, [r2 + 2 * r3]
    lea    r2, [r2 + 2 * r3]
    movu   m3, [r2 + r3]
    packuswb m0, m1
    packuswb m2, m3
    movlps [r0], m0
    movhps [r0 + r1], m0
    movlps [r0 + 2 * r1], m2
    lea    r0, [r0 + 2 * r1]
    movhps [r0 + r1], m2
    RET

;-----------------------------------------------------------------------------
; void blockcopy_sp_8x6(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_sp_8x6, 4, 4, 6, dst, dstStride, src, srcStride
    add    r3, r3
    movu   m0, [r2]
    movu   m1, [r2 + r3]
    movu   m2, [r2 + 2 * r3]
    lea    r2, [r2 + 2 * r3]
    movu   m3, [r2 + r3]
    movu   m4, [r2 + 2 * r3]
    lea    r2, [r2 + 2 * r3]
    movu   m5, [r2 + r3]
    packuswb m0, m1
    packuswb m2, m3
    packuswb m4, m5
    movlps [r0], m0
    movhps [r0 + r1], m0
    movlps [r0 + 2 * r1], m2
    lea    r0, [r0 + 2 * r1]
    movhps [r0 + r1], m2
    movlps [r0 + 2 * r1], m4
    lea    r0, [r0 + 2 * r1]
    movhps [r0 + r1], m4
    RET

;-----------------------------------------------------------------------------
; void blockcopy_sp_8x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_sp_8x8, 4, 4, 8, dst, dstStride, src, srcStride
    add    r3, r3
    movu   m0, [r2]
    movu   m1, [r2 + r3]
    movu   m2, [r2 + 2 * r3]
    lea    r2, [r2 + 2 * r3]
    movu   m3, [r2 + r3]
    movu   m4, [r2 + 2 * r3]
    lea    r2, [r2 + 2 * r3]
    movu   m5, [r2 + r3]
    movu   m6, [r2 + 2 * r3]
    lea    r2, [r2 + 2 * r3]
    movu   m7, [r2 + r3]
    packuswb m0, m1
    packuswb m2, m3
    packuswb m4, m5
    packuswb m6, m7
    movlps [r0], m0
    movhps [r0 + r1], m0
    movlps [r0 + 2 * r1], m2
    lea    r0, [r0 + 2 * r1]
    movhps [r0 + r1], m2
    movlps [r0 + 2 * r1], m4
    lea    r0, [r0 + 2 * r1]
    movhps [r0 + r1], m4
    movlps [r0 + 2 * r1], m6
    lea    r0, [r0 + 2 * r1]
    movhps [r0 + r1], m6
    RET
;-----------------------------------------------------------------------------
; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W8_H4 2
INIT_XMM sse2
cglobal blockcopy_sp_%1x%2, 4, 5, 4, dst, dstStride, src, srcStride
    add    r3, r3
    mov    r4d, %2/4
.loop:
    movu   m0, [r2]
    movu   m1, [r2 + r3]
    lea    r2, [r2 + r3 * 2]
    movu   m2, [r2]
    movu   m3, [r2 + r3]
    dec    r4d
    lea    r2, [r2 + r3 * 2]
    packuswb m0, m1
    packuswb m2, m3
    movlps [r0], m0
    movhps [r0 + r1], m0
    lea    r0, [r0 + r1 * 2]
    movlps [r0], m2
    movhps [r0 + r1], m2
    lea    r0, [r0 + r1 * 2]
    jnz    .loop
    RET
%endmacro

BLOCKCOPY_SP_W8_H4 8, 12

;-----------------------------------------------------------------------------
; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W8_H8 2
INIT_XMM sse2
cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
    mov    r4d, %2/8
    add    r3, r3
.loop:
    movu   m0, [r2]
    movu   m1, [r2 + r3]
    movu   m2, [r2 + 2 * r3]
    lea    r2, [r2 + 2 * r3]
    movu   m3, [r2 + r3]
    movu   m4, [r2 + 2 * r3]
    lea    r2, [r2 + 2 * r3]
    movu   m5, [r2 + r3]
    movu   m6, [r2 + 2 * r3]
    lea    r2, [r2 + 2 * r3]
    movu   m7, [r2 + r3]
    packuswb m0, m1
    packuswb m2, m3
    packuswb m4, m5
    packuswb m6, m7
    movlps [r0], m0
    movhps [r0 + r1], m0
    movlps [r0 + 2 * r1], m2
    lea    r0, [r0 + 2 * r1]
    movhps [r0 + r1], m2
    movlps [r0 + 2 * r1], m4
    lea    r0, [r0 + 2 * r1]
    movhps [r0 + r1], m4
    movlps [r0 + 2 * r1], m6
    lea    r0, [r0 + 2 * r1]
    movhps [r0 + r1], m6
    lea    r0, [r0 + 2 * r1]
    lea    r2, [r2 + 2 * r3]
    dec    r4d
    jnz    .loop
    RET
%endmacro

BLOCKCOPY_SP_W8_H8 8, 16
BLOCKCOPY_SP_W8_H8 8, 32
BLOCKCOPY_SP_W8_H8 8, 64

;-----------------------------------------------------------------------------
; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W12_H4 2
INIT_XMM sse2
cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
    mov    r4d, %2/4
    add    r3, r3
.loop:
    movu   m0, [r2]
    movu   m1, [r2 + 16]
    movu   m2, [r2 + r3]
    movu   m3, [r2 + r3 + 16]
    movu   m4, [r2 + 2 * r3]
    movu   m5, [r2 + 2 * r3 + 16]
    lea    r2, [r2 + 2 * r3]
    movu   m6, [r2 + r3]
    movu   m7, [r2 + r3 + 16]
    packuswb m0, m1
    packuswb m2, m3
    packuswb m4, m5
    packuswb m6, m7
    movh   [r0], m0
    pshufd m0, m0, 2
    movd   [r0 + 8], m0
    movh   [r0 + r1], m2
    pshufd m2, m2, 2
    movd   [r0 + r1 + 8], m2
    movh   [r0 + 2 * r1], m4
    pshufd m4, m4, 2
    movd   [r0 + 2 * r1 + 8], m4
    lea    r0, [r0 + 2 * r1]
    movh   [r0 + r1], m6
    pshufd m6, m6, 2
    movd   [r0 + r1 + 8], m6
    lea    r0, [r0 + 2 * r1]
    lea    r2, [r2 + 2 * r3]
    dec    r4d
    jnz    .loop
    RET
%endmacro

BLOCKCOPY_SP_W12_H4 12, 16
BLOCKCOPY_SP_W12_H4 12, 32
;-----------------------------------------------------------------------------
; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W16_H4 2
INIT_XMM sse2
cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
    mov    r4d, %2/4
    add    r3, r3
.loop:
    movu   m0, [r2]
    movu   m1, [r2 + 16]
    movu   m2, [r2 + r3]
    movu   m3, [r2 + r3 + 16]
    movu   m4, [r2 + 2 * r3]
    movu   m5, [r2 + 2 * r3 + 16]
    lea    r2, [r2 + 2 * r3]
    movu   m6, [r2 + r3]
    movu   m7, [r2 + r3 + 16]
    packuswb m0, m1
    packuswb m2, m3
    packuswb m4, m5
    packuswb m6, m7
    movu   [r0], m0
    movu   [r0 + r1], m2
    movu   [r0 + 2 * r1], m4
    lea    r0, [r0 + 2 * r1]
    movu   [r0 + r1], m6
    lea    r0, [r0 + 2 * r1]
    lea    r2, [r2 + 2 * r3]
    dec    r4d
    jnz    .loop
    RET
%endmacro

BLOCKCOPY_SP_W16_H4 16, 4
BLOCKCOPY_SP_W16_H4 16, 8
BLOCKCOPY_SP_W16_H4 16, 12
BLOCKCOPY_SP_W16_H4 16, 16
BLOCKCOPY_SP_W16_H4 16, 32
BLOCKCOPY_SP_W16_H4 16, 64
BLOCKCOPY_SP_W16_H4 16, 24

;-----------------------------------------------------------------------------
; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W16_H8_avx2 2
INIT_YMM avx2
cglobal blockcopy_sp_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride
    mov    r4d, %2/8
    add    r3, r3
    lea    r5, [3 * r3]
    lea    r6, [3 * r1]
.loop:
    movu   m0, [r2]
    movu   m1, [r2 + r3]
    movu   m2, [r2 + 2 * r3]
    movu   m3, [r2 + r5]

    packuswb m0, m1
    packuswb m2, m3

    vpermq m0, m0, 11011000b
    vpermq m2, m2, 11011000b

    vextracti128 xm1, m0, 1
    vextracti128 xm3, m2, 1

    movu   [r0], xm0
    movu   [r0 + r1], xm1
    movu   [r0 + 2 * r1], xm2
    movu   [r0 + r6], xm3

    lea    r2, [r2 + 4 * r3]
    movu   m0, [r2]
    movu   m1, [r2 + r3]
    movu   m2, [r2 + 2 * r3]
    movu   m3, [r2 + r5]

    packuswb m0, m1
    packuswb m2, m3

    vpermq m0, m0, 11011000b
    vpermq m2, m2, 11011000b

    vextracti128 xm1, m0, 1
    vextracti128 xm3, m2, 1

    lea    r0, [r0 + 4 * r1]
    movu   [r0], xm0
    movu   [r0 + r1], xm1
    movu   [r0 + 2 * r1], xm2
    movu   [r0 + r6], xm3

    lea    r0, [r0 + 4 * r1]
    lea    r2, [r2 + 4 * r3]
    dec    r4d
    jnz    .loop
    RET
%endmacro

BLOCKCOPY_SP_W16_H8_avx2 16, 16
BLOCKCOPY_SP_W16_H8_avx2 16, 32

;-----------------------------------------------------------------------------
; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W24_H2 2
INIT_XMM sse2
cglobal blockcopy_sp_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride
    mov    r4d, %2/2
    add    r3, r3
.loop:
    movu   m0, [r2]
    movu   m1, [r2 + 16]
    movu   m2, [r2 + 32]
    movu   m3, [r2 + r3]
    movu   m4, [r2 + r3 + 16]
    movu   m5, [r2 + r3 + 32]
    packuswb m0, m1
    packuswb m2, m3
    packuswb m4, m5
    movu   [r0], m0
    movlps [r0 + 16], m2
    movhps [r0 + r1], m2
    movu   [r0 + r1 + 8], m4
    lea    r0, [r0 + 2 * r1]
    lea    r2, [r2 + 2 * r3]
    dec    r4d
    jnz    .loop
    RET
%endmacro

BLOCKCOPY_SP_W24_H2 24, 32
BLOCKCOPY_SP_W24_H2 24, 64

;-----------------------------------------------------------------------------
; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W32_H2 2
INIT_XMM sse2
cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
    mov    r4d, %2/2
    add    r3, r3
.loop:
    movu   m0, [r2]
    movu   m1, [r2 + 16]
    movu   m2, [r2 + 32]
    movu   m3, [r2 + 48]
    movu   m4, [r2 + r3]
    movu   m5, [r2 + r3 + 16]
    movu   m6, [r2 + r3 + 32]
    movu   m7, [r2 + r3 + 48]
    packuswb m0, m1
    packuswb m2, m3
    packuswb m4, m5
    packuswb m6, m7
    movu   [r0], m0
    movu   [r0 + 16], m2
    movu   [r0 + r1], m4
    movu   [r0 + r1 + 16], m6
    lea    r0, [r0 + 2 * r1]
    lea    r2, [r2 + 2 * r3]
    dec    r4d
    jnz    .loop
    RET
%endmacro

BLOCKCOPY_SP_W32_H2 32, 8
BLOCKCOPY_SP_W32_H2 32, 16
BLOCKCOPY_SP_W32_H2 32, 24
BLOCKCOPY_SP_W32_H2 32, 32
BLOCKCOPY_SP_W32_H2 32, 64
BLOCKCOPY_SP_W32_H2 32, 48
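; In the avx2 sp kernels, packuswb packs within each 128-bit lane, leaving the
; two halves of a row interleaved; vpermq ..., 11011000b (q3120) restores the
; quadwords to linear order before the rows are stored.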
;-----------------------------------------------------------------------------
; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W32_H4_avx2 2
INIT_YMM avx2
cglobal blockcopy_sp_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride
    mov    r4d, %2/4
    add    r3, r3
    lea    r5, [3 * r3]
    lea    r6, [3 * r1]
.loop:
    movu   m0, [r2]
    movu   m1, [r2 + 32]
    movu   m2, [r2 + r3]
    movu   m3, [r2 + r3 + 32]

    packuswb m0, m1
    packuswb m2, m3

    vpermq m0, m0, 11011000b
    vpermq m2, m2, 11011000b

    movu   [r0], m0
    movu   [r0 + r1], m2

    movu   m0, [r2 + 2 * r3]
    movu   m1, [r2 + 2 * r3 + 32]
    movu   m2, [r2 + r5]
    movu   m3, [r2 + r5 + 32]

    packuswb m0, m1
    packuswb m2, m3

    vpermq m0, m0, 11011000b
    vpermq m2, m2, 11011000b

    movu   [r0 + 2 * r1], m0
    movu   [r0 + r6], m2

    lea    r0, [r0 + 4 * r1]
    lea    r2, [r2 + 4 * r3]
    dec    r4d
    jnz    .loop
    RET
%endmacro

BLOCKCOPY_SP_W32_H4_avx2 32, 32
BLOCKCOPY_SP_W32_H4_avx2 32, 64

;-----------------------------------------------------------------------------
; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W48_H2 2
INIT_XMM sse2
cglobal blockcopy_sp_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride
    mov    r4d, %2
    add    r3, r3
.loop:
    movu   m0, [r2]
    movu   m1, [r2 + 16]
    movu   m2, [r2 + 32]
    movu   m3, [r2 + 48]
    movu   m4, [r2 + 64]
    movu   m5, [r2 + 80]
    packuswb m0, m1
    packuswb m2, m3
    packuswb m4, m5
    movu   [r0], m0
    movu   [r0 + 16], m2
    movu   [r0 + 32], m4
    lea    r0, [r0 + r1]
    lea    r2, [r2 + r3]
    dec    r4d
    jnz    .loop
    RET
%endmacro

BLOCKCOPY_SP_W48_H2 48, 64

;-----------------------------------------------------------------------------
; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W64_H1 2
INIT_XMM sse2
cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
    mov    r4d, %2
    add    r3, r3
.loop:
    movu   m0, [r2]
    movu   m1, [r2 + 16]
    movu   m2, [r2 + 32]
    movu   m3, [r2 + 48]
    movu   m4, [r2 + 64]
    movu   m5, [r2 + 80]
    movu   m6, [r2 + 96]
    movu   m7, [r2 + 112]
    packuswb m0, m1
    packuswb m2, m3
    packuswb m4, m5
    packuswb m6, m7
    movu   [r0], m0
    movu   [r0 + 16], m2
    movu   [r0 + 32], m4
    movu   [r0 + 48], m6
    lea    r0, [r0 + r1]
    lea    r2, [r2 + r3]
    dec    r4d
    jnz    .loop
    RET
%endmacro

BLOCKCOPY_SP_W64_H1 64, 16
BLOCKCOPY_SP_W64_H1 64, 32
BLOCKCOPY_SP_W64_H1 64, 48
BLOCKCOPY_SP_W64_H1 64, 64
;-----------------------------------------------------------------------------
; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W64_H4_avx2 2
INIT_YMM avx2
cglobal blockcopy_sp_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride
    mov    r4d, %2/4
    add    r3, r3
    lea    r5, [3 * r3]
    lea    r6, [3 * r1]
.loop:
    movu   m0, [r2]
    movu   m1, [r2 + 32]
    movu   m2, [r2 + 64]
    movu   m3, [r2 + 96]

    packuswb m0, m1
    packuswb m2, m3

    vpermq m0, m0, 11011000b
    vpermq m2, m2, 11011000b

    movu   [r0], m0
    movu   [r0 + 32], m2

    movu   m0, [r2 + r3]
    movu   m1, [r2 + r3 + 32]
    movu   m2, [r2 + r3 + 64]
    movu   m3, [r2 + r3 + 96]

    packuswb m0, m1
    packuswb m2, m3

    vpermq m0, m0, 11011000b
    vpermq m2, m2, 11011000b

    movu   [r0 + r1], m0
    movu   [r0 + r1 + 32], m2

    movu   m0, [r2 + 2 * r3]
    movu   m1, [r2 + 2 * r3 + 32]
    movu   m2, [r2 + 2 * r3 + 64]
    movu   m3, [r2 + 2 * r3 + 96]

    packuswb m0, m1
    packuswb m2, m3

    vpermq m0, m0, 11011000b
    vpermq m2, m2, 11011000b

    movu   [r0 + 2 * r1], m0
    movu   [r0 + 2 * r1 + 32], m2

    movu   m0, [r2 + r5]
    movu   m1, [r2 + r5 + 32]
    movu   m2, [r2 + r5 + 64]
    movu   m3, [r2 + r5 + 96]

    packuswb m0, m1
    packuswb m2, m3

    vpermq m0, m0, 11011000b
    vpermq m2, m2, 11011000b

    movu   [r0 + r6], m0
    movu   [r0 + r6 + 32], m2

    lea    r0, [r0 + 4 * r1]
    lea    r2, [r2 + 4 * r3]
    dec    r4d
    jnz    .loop
    RET
%endmacro

BLOCKCOPY_SP_W64_H4_avx2 64, 64

%macro PROCESS_BLOCKCOPY_SP_64x4_AVX512 0
    movu   m0, [r2]
    movu   m1, [r2 + 64]
    movu   m2, [r2 + r3]
    movu   m3, [r2 + r3 + 64]

    packuswb m0, m1
    packuswb m2, m3
    vpermq m0, m4, m0
    vpermq m2, m4, m2
    movu   [r0], m0
    movu   [r0 + r1], m2

    movu   m0, [r2 + 2 * r3]
    movu   m1, [r2 + 2 * r3 + 64]
    movu   m2, [r2 + r4]
    movu   m3, [r2 + r4 + 64]

    packuswb m0, m1
    packuswb m2, m3
    vpermq m0, m4, m0
    vpermq m2, m4, m2
    movu   [r0 + 2 * r1], m0
    movu   [r0 + r5], m2
%endmacro

%macro PROCESS_BLOCKCOPY_SP_32x4_AVX512 0
    movu   m0, [r2]
    movu   m1, [r2 + r3]
    movu   m2, [r2 + 2 * r3]
    movu   m3, [r2 + r4]

    packuswb m0, m1
    packuswb m2, m3
    vpermq m0, m4, m0
    vpermq m2, m4, m2
    movu          [r0], ym0
    vextracti32x8 [r0 + r1], m0, 1
    movu          [r0 + 2 * r1], ym2
    vextracti32x8 [r0 + r5], m2, 1
%endmacro

;-----------------------------------------------------------------------------
; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_ZMM avx512
cglobal blockcopy_sp_64x64, 4, 6, 5
    mova   m4, [shuf1_avx512]
    add    r3, r3
    lea    r4, [3 * r3]
    lea    r5, [3 * r1]
%rep 15
    PROCESS_BLOCKCOPY_SP_64x4_AVX512
    lea    r0, [r0 + 4 * r1]
    lea    r2, [r2 + 4 * r3]
%endrep
    PROCESS_BLOCKCOPY_SP_64x4_AVX512
    RET

%macro BLOCKCOPY_SP_32xN_AVX512 1
INIT_ZMM avx512
cglobal blockcopy_sp_32x%1, 4, 6, 5
    mova   m4, [shuf1_avx512]
    add    r3, r3
    lea    r4, [3 * r3]
    lea    r5, [3 * r1]
%rep %1/4 - 1
    PROCESS_BLOCKCOPY_SP_32x4_AVX512
    lea    r0, [r0 + 4 * r1]
    lea    r2, [r2 + 4 * r3]
%endrep
    PROCESS_BLOCKCOPY_SP_32x4_AVX512
    RET
%endmacro

BLOCKCOPY_SP_32xN_AVX512 32
BLOCKCOPY_SP_32xN_AVX512 64
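; blockfill_s_* splats one 16-bit value across the block: pshuflw+pshufd in
; the SSE2 versions, vpbroadcastw in the AVX2/AVX-512 ones. The _aligned_
; variant further down uses mova, which requires a 64-byte-aligned destination.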
;-----------------------------------------------------------------------------
; void blockfill_s_4x4(int16_t* dst, intptr_t dstride, int16_t val)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockfill_s_4x4, 3, 3, 1, dst, dstStride, val
    add    r1, r1
    movd   m0, r2d
    pshuflw m0, m0, 0

    movh   [r0], m0
    movh   [r0 + r1], m0
    movh   [r0 + 2 * r1], m0
    lea    r0, [r0 + 2 * r1]
    movh   [r0 + r1], m0
    RET

;-----------------------------------------------------------------------------
; void blockfill_s_8x8(int16_t* dst, intptr_t dstride, int16_t val)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockfill_s_8x8, 3, 4, 1, dst, dstStride, val
    add    r1, r1
    lea    r3, [3 * r1]
    movd   m0, r2d
    pshuflw m0, m0, 0
    pshufd m0, m0, 0

    movu   [r0], m0
    movu   [r0 + r1], m0
    movu   [r0 + 2 * r1], m0
    movu   [r0 + r3], m0
    lea    r0, [r0 + 4 * r1]
    movu   [r0], m0
    movu   [r0 + r1], m0
    movu   [r0 + 2 * r1], m0
    movu   [r0 + r3], m0
    RET

;-----------------------------------------------------------------------------
; void blockfill_s_16x16(int16_t* dst, intptr_t dstride, int16_t val)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockfill_s_16x16, 3, 4, 1, dst, dstStride, val
    add    r1, r1
    lea    r3, [3 * r1]
    movd   m0, r2d
    pshuflw m0, m0, 0
    pshufd m0, m0, 0

    movu   [r0], m0
    movu   [r0 + 16], m0
    movu   [r0 + r1], m0
    movu   [r0 + r1 + 16], m0
    movu   [r0 + 2 * r1], m0
    movu   [r0 + 2 * r1 + 16], m0
    movu   [r0 + r3], m0
    movu   [r0 + r3 + 16], m0
    movu   [r0 + 4 * r1], m0
    movu   [r0 + 4 * r1 + 16], m0
    lea    r0, [r0 + 4 * r1]
    movu   [r0 + r1], m0
    movu   [r0 + r1 + 16], m0
    movu   [r0 + 2 * r1], m0
    movu   [r0 + 2 * r1 + 16], m0
    movu   [r0 + r3], m0
    movu   [r0 + r3 + 16], m0
    movu   [r0 + 4 * r1], m0
    movu   [r0 + 4 * r1 + 16], m0
    lea    r0, [r0 + 4 * r1]
    movu   [r0 + r1], m0
    movu   [r0 + r1 + 16], m0
    movu   [r0 + 2 * r1], m0
    movu   [r0 + 2 * r1 + 16], m0
    movu   [r0 + r3], m0
    movu   [r0 + r3 + 16], m0
    movu   [r0 + 4 * r1], m0
    movu   [r0 + 4 * r1 + 16], m0
    lea    r0, [r0 + 4 * r1]
    movu   [r0 + r1], m0
    movu   [r0 + r1 + 16], m0
    movu   [r0 + 2 * r1], m0
    movu   [r0 + 2 * r1 + 16], m0
    movu   [r0 + r3], m0
    movu   [r0 + r3 + 16], m0
    RET

INIT_YMM avx2
cglobal blockfill_s_16x16, 3, 4, 1
    add    r1, r1
    lea    r3, [3 * r1]
    movd   xm0, r2d
    vpbroadcastw m0, xm0

    movu   [r0], m0
    movu   [r0 + r1], m0
    movu   [r0 + 2 * r1], m0
    movu   [r0 + r3], m0
    lea    r0, [r0 + 4 * r1]
    movu   [r0], m0
    movu   [r0 + r1], m0
    movu   [r0 + 2 * r1], m0
    movu   [r0 + r3], m0
    lea    r0, [r0 + 4 * r1]
    movu   [r0], m0
    movu   [r0 + r1], m0
    movu   [r0 + 2 * r1], m0
    movu   [r0 + r3], m0
    lea    r0, [r0 + 4 * r1]
    movu   [r0], m0
    movu   [r0 + r1], m0
    movu   [r0 + 2 * r1], m0
    movu   [r0 + r3], m0
    RET

;-----------------------------------------------------------------------------
; void blockfill_s_%1x%2(int16_t* dst, intptr_t dstride, int16_t val)
;-----------------------------------------------------------------------------
%macro BLOCKFILL_S_W32_H8 2
INIT_XMM sse2
cglobal blockfill_s_%1x%2, 3, 5, 1, dst, dstStride, val
    mov    r3d, %2/8
    add    r1, r1
    lea    r4, [3 * r1]
    movd   m0, r2d
    pshuflw m0, m0, 0
    pshufd m0, m0, 0
.loop:
    movu   [r0], m0
    movu   [r0 + 16], m0
    movu   [r0 + 32], m0
    movu   [r0 + 48], m0
    movu   [r0 + r1], m0
    movu   [r0 + r1 + 16], m0
    movu   [r0 + r1 + 32], m0
    movu   [r0 + r1 + 48], m0
    movu   [r0 + 2 * r1], m0
    movu   [r0 + 2 * r1 + 16], m0
    movu   [r0 + 2 * r1 + 32], m0
    movu   [r0 + 2 * r1 + 48], m0
    movu   [r0 + r4], m0
    movu   [r0 + r4 + 16], m0
    movu   [r0 + r4 + 32], m0
    movu   [r0 + r4 + 48], m0
    movu   [r0 + 4 * r1], m0
    movu   [r0 + 4 * r1 + 16], m0
    movu   [r0 + 4 * r1 + 32], m0
    movu   [r0 + 4 * r1 + 48], m0
    lea    r0, [r0 + 4 * r1]
    movu   [r0 + r1], m0
    movu   [r0 + r1 + 16], m0
    movu   [r0 + r1 + 32], m0
    movu   [r0 + r1 + 48], m0
    movu   [r0 + 2 * r1], m0
    movu   [r0 + 2 * r1 + 16], m0
    movu   [r0 + 2 * r1 + 32], m0
    movu   [r0 + 2 * r1 + 48], m0
    movu   [r0 + r4], m0
    movu   [r0 + r4 + 16], m0
    movu   [r0 + r4 + 32], m0
    movu   [r0 + r4 + 48], m0
    lea    r0, [r0 + 4 * r1]
    dec    r3d
    jnz    .loop
    RET
%endmacro

BLOCKFILL_S_W32_H8 32, 32
INIT_YMM avx2
cglobal blockfill_s_32x32, 3, 4, 1
    add    r1, r1
    lea    r3, [3 * r1]
    movd   xm0, r2d
    vpbroadcastw m0, xm0
%rep 8
    movu   [r0], m0
    movu   [r0 + 32], m0
    movu   [r0 + r1], m0
    movu   [r0 + r1 + 32], m0
    movu   [r0 + 2 * r1], m0
    movu   [r0 + 2 * r1 + 32], m0
    movu   [r0 + r3], m0
    movu   [r0 + r3 + 32], m0
    lea    r0, [r0 + 4 * r1]
%endrep
    RET

;--------------------------------------------------------------------
; void blockfill_s_32x32(int16_t* dst, intptr_t dstride, int16_t val)
;--------------------------------------------------------------------
INIT_ZMM avx512
cglobal blockfill_s_32x32, 3, 4, 1
    add    r1, r1
    lea    r3, [3 * r1]
    movd   xm0, r2d
    vpbroadcastw m0, xm0
%rep 8
    movu   [r0], m0
    movu   [r0 + r1], m0
    movu   [r0 + 2 * r1], m0
    movu   [r0 + r3], m0
    lea    r0, [r0 + 4 * r1]
%endrep
    RET

;--------------------------------------------------------------------
; void blockfill_s_aligned_32x32(int16_t* dst, intptr_t dstride, int16_t val)
;--------------------------------------------------------------------
INIT_ZMM avx512
cglobal blockfill_s_aligned_32x32, 3, 4, 1
    add    r1, r1
    lea    r3, [3 * r1]
    movd   xm0, r2d
    vpbroadcastw m0, xm0
%rep 8
    mova   [r0], m0
    mova   [r0 + r1], m0
    mova   [r0 + 2 * r1], m0
    mova   [r0 + r3], m0
    lea    r0, [r0 + 4 * r1]
%endrep
    RET
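; blockcopy_ps_* widens pixels to int16_t: pmovzxbw zero-extends the low eight
; bytes of each load, and the 16-wide loops widen the high half with punpckhbw
; against a zeroed register; dstStride is in int16_t units, hence 'add r1, r1'.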
;-----------------------------------------------------------------------------
; void blockcopy_ps_2x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_ps_2x4, 4, 4, 1, dst, dstStride, src, srcStride
    add    r1, r1

    movd   m0, [r2]
    pmovzxbw m0, m0
    movd   [r0], m0

    movd   m0, [r2 + r3]
    pmovzxbw m0, m0
    movd   [r0 + r1], m0

    movd   m0, [r2 + 2 * r3]
    pmovzxbw m0, m0
    movd   [r0 + 2 * r1], m0

    lea    r2, [r2 + 2 * r3]
    lea    r0, [r0 + 2 * r1]

    movd   m0, [r2 + r3]
    pmovzxbw m0, m0
    movd   [r0 + r1], m0
    RET

;-----------------------------------------------------------------------------
; void blockcopy_ps_2x8(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_ps_2x8, 4, 4, 1, dst, dstStride, src, srcStride
    add    r1, r1

    movd   m0, [r2]
    pmovzxbw m0, m0
    movd   [r0], m0

    movd   m0, [r2 + r3]
    pmovzxbw m0, m0
    movd   [r0 + r1], m0

    movd   m0, [r2 + 2 * r3]
    pmovzxbw m0, m0
    movd   [r0 + 2 * r1], m0

    lea    r2, [r2 + 2 * r3]
    lea    r0, [r0 + 2 * r1]

    movd   m0, [r2 + r3]
    pmovzxbw m0, m0
    movd   [r0 + r1], m0

    movd   m0, [r2 + 2 * r3]
    pmovzxbw m0, m0
    movd   [r0 + 2 * r1], m0

    lea    r2, [r2 + 2 * r3]
    lea    r0, [r0 + 2 * r1]

    movd   m0, [r2 + r3]
    pmovzxbw m0, m0
    movd   [r0 + r1], m0

    movd   m0, [r2 + 2 * r3]
    pmovzxbw m0, m0
    movd   [r0 + 2 * r1], m0

    lea    r2, [r2 + 2 * r3]
    lea    r0, [r0 + 2 * r1]

    movd   m0, [r2 + r3]
    pmovzxbw m0, m0
    movd   [r0 + r1], m0
    RET

;-----------------------------------------------------------------------------
; void blockcopy_ps_2x16(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_ps_2x16, 4, 5, 2, dst, dstStride, src, srcStride
    add    r1, r1
    mov    r4d, 16/2
.loop:
    movd   m0, [r2]
    movd   m1, [r2 + r3]
    dec    r4d
    lea    r2, [r2 + r3 * 2]
    pmovzxbw m0, m0
    pmovzxbw m1, m1
    movd   [r0], m0
    movd   [r0 + r1], m1
    lea    r0, [r0 + r1 * 2]
    jnz    .loop
    RET

;-----------------------------------------------------------------------------
; void blockcopy_ps_4x2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_ps_4x2, 4, 4, 1, dst, dstStride, src, srcStride
    add    r1, r1

    movd   m0, [r2]
    pmovzxbw m0, m0
    movh   [r0], m0

    movd   m0, [r2 + r3]
    pmovzxbw m0, m0
    movh   [r0 + r1], m0
    RET

;-----------------------------------------------------------------------------
; void blockcopy_ps_4x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_ps_4x4, 4, 4, 1, dst, dstStride, src, srcStride
    add    r1, r1

    movd   m0, [r2]
    pmovzxbw m0, m0
    movh   [r0], m0

    movd   m0, [r2 + r3]
    pmovzxbw m0, m0
    movh   [r0 + r1], m0

    movd   m0, [r2 + 2 * r3]
    pmovzxbw m0, m0
    movh   [r0 + 2 * r1], m0

    lea    r2, [r2 + 2 * r3]
    lea    r0, [r0 + 2 * r1]

    movd   m0, [r2 + r3]
    pmovzxbw m0, m0
    movh   [r0 + r1], m0
    RET

;-----------------------------------------------------------------------------
; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W4_H4 2
INIT_XMM sse4
cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride
    add    r1, r1
    mov    r4d, %2/4
.loop:
    movd   m0, [r2]
    pmovzxbw m0, m0
    movh   [r0], m0

    movd   m0, [r2 + r3]
    pmovzxbw m0, m0
    movh   [r0 + r1], m0

    movd   m0, [r2 + 2 * r3]
    pmovzxbw m0, m0
    movh   [r0 + 2 * r1], m0

    lea    r2, [r2 + 2 * r3]
    lea    r0, [r0 + 2 * r1]

    movd   m0, [r2 + r3]
    pmovzxbw m0, m0
    movh   [r0 + r1], m0

    lea    r0, [r0 + 2 * r1]
    lea    r2, [r2 + 2 * r3]
    dec    r4d
    jnz    .loop
    RET
%endmacro

BLOCKCOPY_PS_W4_H4 4, 8
BLOCKCOPY_PS_W4_H4 4, 16
BLOCKCOPY_PS_W4_H4 4, 32

;-----------------------------------------------------------------------------
; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W6_H4 2
INIT_XMM sse4
cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride
    add    r1, r1
    mov    r4d, %2/4
.loop:
    movh   m0, [r2]
    pmovzxbw m0, m0
    movh   [r0], m0
    pextrd [r0 + 8], m0, 2

    movh   m0, [r2 + r3]
    pmovzxbw m0, m0
    movh   [r0 + r1], m0
    pextrd [r0 + r1 + 8], m0, 2

    movh   m0, [r2 + 2 * r3]
    pmovzxbw m0, m0
    movh   [r0 + 2 * r1], m0
    pextrd [r0 + 2 * r1 + 8], m0, 2

    lea    r2, [r2 + 2 * r3]
    lea    r0, [r0 + 2 * r1]

    movh   m0, [r2 + r3]
    pmovzxbw m0, m0
    movh   [r0 + r1], m0
    pextrd [r0 + r1 + 8], m0, 2

    lea    r0, [r0 + 2 * r1]
    lea    r2, [r2 + 2 * r3]
    dec    r4d
    jnz    .loop
    RET
%endmacro

BLOCKCOPY_PS_W6_H4 6, 8
BLOCKCOPY_PS_W6_H4 6, 16

;-----------------------------------------------------------------------------
; void blockcopy_ps_8x2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_ps_8x2, 4, 4, 1, dst, dstStride, src, srcStride
    add    r1, r1

    movh   m0, [r2]
    pmovzxbw m0, m0
    movu   [r0], m0

    movh   m0, [r2 + r3]
    pmovzxbw m0, m0
    movu   [r0 + r1], m0
    RET

;-----------------------------------------------------------------------------
; void blockcopy_ps_8x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_ps_8x4, 4, 4, 1, dst, dstStride, src, srcStride
    add    r1, r1

    movh   m0, [r2]
    pmovzxbw m0, m0
    movu   [r0], m0

    movh   m0, [r2 + r3]
    pmovzxbw m0, m0
    movu   [r0 + r1], m0

    movh   m0, [r2 + 2 * r3]
    pmovzxbw m0, m0
    movu   [r0 + 2 * r1], m0

    lea    r2, [r2 + 2 * r3]
    lea    r0, [r0 + 2 * r1]

    movh   m0, [r2 + r3]
    pmovzxbw m0, m0
    movu   [r0 + r1], m0
    RET
;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal blockcopy_ps_8x6, 4, 4, 1, dst, dstStride, src, srcStride add r1, r1 movh m0, [r2] pmovzxbw m0, m0 movu [r0], m0 movh m0, [r2 + r3] pmovzxbw m0, m0 movu [r0 + r1], m0 movh m0, [r2 + 2 * r3] pmovzxbw m0, m0 movu [r0 + 2 * r1], m0 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movh m0, [r2 + r3] pmovzxbw m0, m0 movu [r0 + r1], m0 movh m0, [r2 + 2 * r3] pmovzxbw m0, m0 movu [r0 + 2 * r1], m0 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movh m0, [r2 + r3] pmovzxbw m0, m0 movu [r0 + r1], m0 RET ;----------------------------------------------------------------------------- ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PS_W8_H4 2 INIT_XMM sse4 cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride add r1, r1 mov r4d, %2/4 .loop: movh m0, [r2] pmovzxbw m0, m0 movu [r0], m0 movh m0, [r2 + r3] pmovzxbw m0, m0 movu [r0 + r1], m0 movh m0, [r2 + 2 * r3] pmovzxbw m0, m0 movu [r0 + 2 * r1], m0 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movh m0, [r2 + r3] pmovzxbw m0, m0 movu [r0 + r1], m0 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_PS_W8_H4 8, 8 BLOCKCOPY_PS_W8_H4 8, 16 BLOCKCOPY_PS_W8_H4 8, 32 BLOCKCOPY_PS_W8_H4 8, 12 BLOCKCOPY_PS_W8_H4 8, 64 ;----------------------------------------------------------------------------- ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PS_W12_H2 2 INIT_XMM sse4 cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride add r1, r1 mov r4d, %2/2 pxor m0, m0 .loop: movu m1, [r2] pmovzxbw m2, m1 movu [r0], m2 punpckhbw m1, m0 movh [r0 + 16], m1 movu m1, [r2 + r3] pmovzxbw m2, m1 movu [r0 + r1], m2 punpckhbw m1, m0 movh [r0 + r1 + 16], m1 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_PS_W12_H2 12, 16 BLOCKCOPY_PS_W12_H2 12, 32 ;----------------------------------------------------------------------------- ; void blockcopy_ps_16x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal blockcopy_ps_16x4, 4, 4, 3, dst, dstStride, src, srcStride add r1, r1 pxor m0, m0 movu m1, [r2] pmovzxbw m2, m1 movu [r0], m2 punpckhbw m1, m0 movu [r0 + 16], m1 movu m1, [r2 + r3] pmovzxbw m2, m1 movu [r0 + r1], m2 punpckhbw m1, m0 movu [r0 + r1 + 16], m1 movu m1, [r2 + 2 * r3] pmovzxbw m2, m1 movu [r0 + 2 * r1], m2 punpckhbw m1, m0 movu [r0 + 2 * r1 + 16], m1 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] movu m1, [r2 + r3] pmovzxbw m2, m1 movu [r0 + r1], m2 punpckhbw m1, m0 movu [r0 + r1 + 16], m1 RET ;----------------------------------------------------------------------------- ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PS_W16_H4 2 INIT_XMM sse4 cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride add r1, r1 mov r4d, %2/4 pxor m0, m0 .loop: movu m1, [r2] pmovzxbw m2, m1 movu [r0], m2 punpckhbw m1, m0 movu [r0 + 16], m1 movu m1, [r2 + r3] pmovzxbw m2, m1 movu [r0 + r1], m2 punpckhbw m1, m0 movu [r0 + r1 + 16], m1 movu m1, [r2 + 2 
* r3] pmovzxbw m2, m1 movu [r0 + 2 * r1], m2 punpckhbw m1, m0 movu [r0 + 2 * r1 + 16], m1 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] movu m1, [r2 + r3] pmovzxbw m2, m1 movu [r0 + r1], m2 punpckhbw m1, m0 movu [r0 + r1 + 16], m1 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_PS_W16_H4 16, 8 BLOCKCOPY_PS_W16_H4 16, 12 BLOCKCOPY_PS_W16_H4 16, 16 BLOCKCOPY_PS_W16_H4 16, 32 BLOCKCOPY_PS_W16_H4 16, 64 BLOCKCOPY_PS_W16_H4 16, 24 ;----------------------------------------------------------------------------- ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PS_W16_H4_avx2 2 INIT_YMM avx2 cglobal blockcopy_ps_%1x%2, 4, 7, 3 add r1, r1 mov r4d, %2/4 lea r5, [3 * r3] lea r6, [3 * r1] pxor m0, m0 .loop: movu xm1, [r2] pmovzxbw m2, xm1 movu [r0], m2 movu xm1, [r2 + r3] pmovzxbw m2, xm1 movu [r0 + r1], m2 movu xm1, [r2 + 2 * r3] pmovzxbw m2, xm1 movu [r0 + 2 * r1], m2 movu xm1, [r2 + r5] pmovzxbw m2, xm1 movu [r0 + r6], m2 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_PS_W16_H4_avx2 16, 16 BLOCKCOPY_PS_W16_H4_avx2 16, 32 ;----------------------------------------------------------------------------- ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PS_W24_H2 2 INIT_XMM sse4 cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride add r1, r1 mov r4d, %2/2 pxor m0, m0 .loop: movu m1, [r2] pmovzxbw m2, m1 movu [r0], m2 punpckhbw m1, m0 movu [r0 + 16], m1 movh m1, [r2 + 16] pmovzxbw m1, m1 movu [r0 + 32], m1 movu m1, [r2 + r3] pmovzxbw m2, m1 movu [r0 + r1], m2 punpckhbw m1, m0 movu [r0 + r1 + 16], m1 movh m1, [r2 + r3 + 16] pmovzxbw m1, m1 movu [r0 + r1 + 32], m1 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_PS_W24_H2 24, 32 BLOCKCOPY_PS_W24_H2 24, 64 ;----------------------------------------------------------------------------- ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PS_W32_H2 2 INIT_XMM sse4 cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride add r1, r1 mov r4d, %2/2 pxor m0, m0 .loop: movu m1, [r2] pmovzxbw m2, m1 movu [r0], m2 punpckhbw m1, m0 movu [r0 + 16], m1 movu m1, [r2 + 16] pmovzxbw m2, m1 movu [r0 + 32], m2 punpckhbw m1, m0 movu [r0 + 48], m1 movu m1, [r2 + r3] pmovzxbw m2, m1 movu [r0 + r1], m2 punpckhbw m1, m0 movu [r0 + r1 + 16], m1 movu m1, [r2 + r3 + 16] pmovzxbw m2, m1 movu [r0 + r1 + 32], m2 punpckhbw m1, m0 movu [r0 + r1 + 48], m1 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_PS_W32_H2 32, 8 BLOCKCOPY_PS_W32_H2 32, 16 BLOCKCOPY_PS_W32_H2 32, 24 BLOCKCOPY_PS_W32_H2 32, 32 BLOCKCOPY_PS_W32_H2 32, 64 BLOCKCOPY_PS_W32_H2 32, 48 ;----------------------------------------------------------------------------- ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PS_W32_H4_avx2 2 INIT_YMM avx2 cglobal blockcopy_ps_%1x%2, 4, 7, 2 add r1, r1 mov r4d, %2/4 lea r5, [3 * r3] lea r6, [3 * r1] .loop: pmovzxbw m0, [r2 + 0] pmovzxbw m1, [r2 + 16] 
movu [r0 + 0], m0 movu [r0 + 32], m1 pmovzxbw m0, [r2 + r3 + 0] pmovzxbw m1, [r2 + r3 + 16] movu [r0 + r1 + 0], m0 movu [r0 + r1 + 32], m1 pmovzxbw m0, [r2 + r3 * 2 + 0] pmovzxbw m1, [r2 + r3 * 2 + 16] movu [r0 + r1 * 2 + 0], m0 movu [r0 + r1 * 2 + 32], m1 pmovzxbw m0, [r2 + r5 + 0] pmovzxbw m1, [r2 + r5 + 16] movu [r0 + r6 + 0], m0 movu [r0 + r6 + 32], m1 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_PS_W32_H4_avx2 32, 32 BLOCKCOPY_PS_W32_H4_avx2 32, 64 %macro PROCESS_BLOCKCOPY_PS_32x8_AVX512 0 pmovzxbw m0, [r2] pmovzxbw m1, [r2 + r3] pmovzxbw m2, [r2 + r3 * 2] pmovzxbw m3, [r2 + r4] movu [r0], m0 movu [r0 + r1], m1 movu [r0 + r1 * 2], m2 movu [r0 + r5], m3 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] pmovzxbw m0, [r2] pmovzxbw m1, [r2 + r3] pmovzxbw m2, [r2 + r3 * 2] pmovzxbw m3, [r2 + r4] movu [r0], m0 movu [r0 + r1], m1 movu [r0 + r1 * 2], m2 movu [r0 + r5], m3 %endmacro INIT_ZMM avx512 cglobal blockcopy_ps_32x32, 4, 6, 4 add r1, r1 lea r4, [3 * r3] lea r5, [3 * r1] PROCESS_BLOCKCOPY_PS_32x8_AVX512 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] PROCESS_BLOCKCOPY_PS_32x8_AVX512 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] PROCESS_BLOCKCOPY_PS_32x8_AVX512 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] PROCESS_BLOCKCOPY_PS_32x8_AVX512 RET INIT_ZMM avx512 cglobal blockcopy_ps_32x64, 4, 6, 4 add r1, r1 lea r4, [3 * r3] lea r5, [3 * r1] PROCESS_BLOCKCOPY_PS_32x8_AVX512 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] PROCESS_BLOCKCOPY_PS_32x8_AVX512 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] PROCESS_BLOCKCOPY_PS_32x8_AVX512 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] PROCESS_BLOCKCOPY_PS_32x8_AVX512 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] PROCESS_BLOCKCOPY_PS_32x8_AVX512 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] PROCESS_BLOCKCOPY_PS_32x8_AVX512 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] PROCESS_BLOCKCOPY_PS_32x8_AVX512 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] PROCESS_BLOCKCOPY_PS_32x8_AVX512 RET ;----------------------------------------------------------------------------- ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PS_W48_H2 2 INIT_XMM sse4 cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride add r1, r1 mov r4d, %2/2 pxor m0, m0 .loop: movu m1, [r2] pmovzxbw m2, m1 movu [r0], m2 punpckhbw m1, m0 movu [r0 + 16], m1 movu m1, [r2 + 16] pmovzxbw m2, m1 movu [r0 + 32], m2 punpckhbw m1, m0 movu [r0 + 48], m1 movu m1, [r2 + 32] pmovzxbw m2, m1 movu [r0 + 64], m2 punpckhbw m1, m0 movu [r0 + 80], m1 movu m1, [r2 + r3] pmovzxbw m2, m1 movu [r0 + r1], m2 punpckhbw m1, m0 movu [r0 + r1 + 16], m1 movu m1, [r2 + r3 + 16] pmovzxbw m2, m1 movu [r0 + r1 + 32], m2 punpckhbw m1, m0 movu [r0 + r1 + 48], m1 movu m1, [r2 + r3 + 32] pmovzxbw m2, m1 movu [r0 + r1 + 64], m2 punpckhbw m1, m0 movu [r0 + r1 + 80], m1 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_PS_W48_H2 48, 64 ;----------------------------------------------------------------------------- ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PS_W64_H2 2 INIT_XMM sse4 cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride add r1, r1 mov r4d, %2/2 pxor m0, m0 .loop: movu m1, [r2] pmovzxbw m2, m1 movu [r0], m2 punpckhbw m1, m0 movu [r0 + 16], m1 movu m1, [r2 + 
16] pmovzxbw m2, m1 movu [r0 + 32], m2 punpckhbw m1, m0 movu [r0 + 48], m1 movu m1, [r2 + 32] pmovzxbw m2, m1 movu [r0 + 64], m2 punpckhbw m1, m0 movu [r0 + 80], m1 movu m1, [r2 + 48] pmovzxbw m2, m1 movu [r0 + 96], m2 punpckhbw m1, m0 movu [r0 + 112], m1 movu m1, [r2 + r3] pmovzxbw m2, m1 movu [r0 + r1], m2 punpckhbw m1, m0 movu [r0 + r1 + 16], m1 movu m1, [r2 + r3 + 16] pmovzxbw m2, m1 movu [r0 + r1 + 32], m2 punpckhbw m1, m0 movu [r0 + r1 + 48], m1 movu m1, [r2 + r3 + 32] pmovzxbw m2, m1 movu [r0 + r1 + 64], m2 punpckhbw m1, m0 movu [r0 + r1 + 80], m1 movu m1, [r2 + r3 + 48] pmovzxbw m2, m1 movu [r0 + r1 + 96], m2 punpckhbw m1, m0 movu [r0 + r1 + 112], m1 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_PS_W64_H2 64, 16 BLOCKCOPY_PS_W64_H2 64, 32 BLOCKCOPY_PS_W64_H2 64, 48 BLOCKCOPY_PS_W64_H2 64, 64 ;----------------------------------------------------------------------------- ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- INIT_YMM avx2 cglobal blockcopy_ps_64x64, 4, 7, 4 add r1, r1 mov r4d, 64/8 lea r5, [3 * r3] lea r6, [3 * r1] .loop: %rep 2 pmovzxbw m0, [r2 + 0] pmovzxbw m1, [r2 + 16] pmovzxbw m2, [r2 + 32] pmovzxbw m3, [r2 + 48] movu [r0 + 0], m0 movu [r0 + 32], m1 movu [r0 + 64], m2 movu [r0 + 96], m3 pmovzxbw m0, [r2 + r3 + 0] pmovzxbw m1, [r2 + r3 + 16] pmovzxbw m2, [r2 + r3 + 32] pmovzxbw m3, [r2 + r3 + 48] movu [r0 + r1 + 0], m0 movu [r0 + r1 + 32], m1 movu [r0 + r1 + 64], m2 movu [r0 + r1 + 96], m3 pmovzxbw m0, [r2 + r3 * 2 + 0] pmovzxbw m1, [r2 + r3 * 2 + 16] pmovzxbw m2, [r2 + r3 * 2 + 32] pmovzxbw m3, [r2 + r3 * 2 + 48] movu [r0 + r1 * 2 + 0], m0 movu [r0 + r1 * 2 + 32], m1 movu [r0 + r1 * 2 + 64], m2 movu [r0 + r1 * 2 + 96], m3 pmovzxbw m0, [r2 + r5 + 0] pmovzxbw m1, [r2 + r5 + 16] pmovzxbw m2, [r2 + r5 + 32] pmovzxbw m3, [r2 + r5 + 48] movu [r0 + r6 + 0], m0 movu [r0 + r6 + 32], m1 movu [r0 + r6 + 64], m2 movu [r0 + r6 + 96], m3 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] %endrep dec r4d jnz .loop RET %macro PROCESS_BLOCKCOPY_PS_64x8_AVX512 0 pmovzxbw m0, [r2] pmovzxbw m1, [r2 + 32] pmovzxbw m2, [r2 + r3] pmovzxbw m3, [r2 + r3 + 32] movu [r0], m0 movu [r0 + 64], m1 movu [r0 + r1], m2 movu [r0 + r1 + 64], m3 pmovzxbw m0, [r2 + r3 * 2] pmovzxbw m1, [r2 + r3 * 2 + 32] pmovzxbw m2, [r2 + r4] pmovzxbw m3, [r2 + r4 + 32] movu [r0 + r1 * 2], m0 movu [r0 + r1 * 2 + 64], m1 movu [r0 + r5], m2 movu [r0 + r5 + 64], m3 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] pmovzxbw m0, [r2] pmovzxbw m1, [r2 + 32] pmovzxbw m2, [r2 + r3] pmovzxbw m3, [r2 + r3 + 32] movu [r0], m0 movu [r0 + 64], m1 movu [r0 + r1], m2 movu [r0 + r1 + 64], m3 pmovzxbw m0, [r2 + r3 * 2] pmovzxbw m1, [r2 + r3 * 2 + 32] pmovzxbw m2, [r2 + r4] pmovzxbw m3, [r2 + r4 + 32] movu [r0 + r1 * 2], m0 movu [r0 + r1 * 2 + 64], m1 movu [r0 + r5], m2 movu [r0 + r5 + 64], m3 %endmacro ;----------------------------------------------------------------------------- ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- INIT_ZMM avx512 cglobal blockcopy_ps_64x64, 4, 6, 4 add r1, r1 lea r4, [3 * r3] lea r5, [3 * r1] PROCESS_BLOCKCOPY_PS_64x8_AVX512 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] PROCESS_BLOCKCOPY_PS_64x8_AVX512 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] PROCESS_BLOCKCOPY_PS_64x8_AVX512 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] 
PROCESS_BLOCKCOPY_PS_64x8_AVX512 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] PROCESS_BLOCKCOPY_PS_64x8_AVX512 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] PROCESS_BLOCKCOPY_PS_64x8_AVX512 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] PROCESS_BLOCKCOPY_PS_64x8_AVX512 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] PROCESS_BLOCKCOPY_PS_64x8_AVX512 RET ;----------------------------------------------------------------------------- ; void blockcopy_ss_2x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_ss_2x4, 4, 6, 0 add r1, r1 add r3, r3 mov r4d, [r2] mov r5d, [r2 + r3] mov [r0], r4d mov [r0 + r1], r5d lea r2, [r2 + r3 * 2] lea r0, [r0 + 2 * r1] mov r4d, [r2] mov r5d, [r2 + r3] mov [r0], r4d mov [r0 + r1], r5d RET ;----------------------------------------------------------------------------- ; void blockcopy_ss_2x8(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_ss_2x8, 4, 6, 0 add r1, r1 add r3, r3 mov r4d, [r2] mov r5d, [r2 + r3] mov [r0], r4d mov [r0 + r1], r5d lea r2, [r2 + r3 * 2] lea r0, [r0 + 2 * r1] mov r4d, [r2] mov r5d, [r2 + r3] mov [r0], r4d mov [r0 + r1], r5d lea r2, [r2 + r3 * 2] lea r0, [r0 + 2 * r1] mov r4d, [r2] mov r5d, [r2 + r3] mov [r0], r4d mov [r0 + r1], r5d lea r2, [r2 + r3 * 2] lea r0, [r0 + 2 * r1] mov r4d, [r2] mov r5d, [r2 + r3] mov [r0], r4d mov [r0 + r1], r5d RET ;----------------------------------------------------------------------------- ; void blockcopy_ss_2x16(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_ss_2x16, 4, 7, 0 add r1, r1 add r3, r3 mov r6d, 16/2 .loop: mov r4d, [r2] mov r5d, [r2 + r3] dec r6d lea r2, [r2 + r3 * 2] mov [r0], r4d mov [r0 + r1], r5d lea r0, [r0 + r1 * 2] jnz .loop RET ;----------------------------------------------------------------------------- ; void blockcopy_ss_4x2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_ss_4x2, 4, 4, 2 add r1, r1 add r3, r3 movh m0, [r2] movh m1, [r2 + r3] movh [r0], m0 movh [r0 + r1], m1 RET ;----------------------------------------------------------------------------- ; void blockcopy_ss_4x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_ss_4x4, 4, 4, 4 add r1, r1 add r3, r3 movh m0, [r2] movh m1, [r2 + r3] lea r2, [r2 + r3 * 2] movh m2, [r2] movh m3, [r2 + r3] movh [r0], m0 movh [r0 + r1], m1 lea r0, [r0 + 2 * r1] movh [r0], m2 movh [r0 + r1], m3 RET ;----------------------------------------------------------------------------- ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SS_W4_H8 2 INIT_XMM sse2 cglobal blockcopy_ss_%1x%2, 4, 5, 4 mov r4d, %2/8 add r1, r1 add r3, r3 .loop: movh m0, [r2] movh m1, [r2 + r3] lea r2, [r2 + r3 * 2] movh m2, [r2] movh m3, [r2 + r3] movh [r0], m0 movh [r0 + r1], m1 lea r0, [r0 + 2 * r1] movh [r0], m2 movh [r0 + r1], m3 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * 
r3] movh m0, [r2] movh m1, [r2 + r3] lea r2, [r2 + r3 * 2] movh m2, [r2] movh m3, [r2 + r3] movh [r0], m0 movh [r0 + r1], m1 lea r0, [r0 + 2 * r1] movh [r0], m2 movh [r0 + r1], m3 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_SS_W4_H8 4, 8 BLOCKCOPY_SS_W4_H8 4, 16 BLOCKCOPY_SS_W4_H8 4, 32 ;----------------------------------------------------------------------------- ; void blockcopy_ss_6x8(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_ss_6x8, 4, 4, 4 add r1, r1 add r3, r3 movu m0, [r2] movu m1, [r2 + r3] pshufd m2, m0, 2 pshufd m3, m1, 2 movh [r0], m0 movd [r0 + 8], m2 movh [r0 + r1], m1 movd [r0 + r1 + 8], m3 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] movu m0, [r2] movu m1, [r2 + r3] pshufd m2, m0, 2 pshufd m3, m1, 2 movh [r0], m0 movd [r0 + 8], m2 movh [r0 + r1], m1 movd [r0 + r1 + 8], m3 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] movu m0, [r2] movu m1, [r2 + r3] pshufd m2, m0, 2 pshufd m3, m1, 2 movh [r0], m0 movd [r0 + 8], m2 movh [r0 + r1], m1 movd [r0 + r1 + 8], m3 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] movu m0, [r2] movu m1, [r2 + r3] pshufd m2, m0, 2 pshufd m3, m1, 2 movh [r0], m0 movd [r0 + 8], m2 movh [r0 + r1], m1 movd [r0 + r1 + 8], m3 RET ;----------------------------------------------------------------------------- ; void blockcopy_ss_6x16(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_ss_6x16, 4, 5, 4 add r1, r1 add r3, r3 mov r4d, 16/2 .loop: movh m0, [r2] movd m2, [r2 + 8] movh m1, [r2 + r3] movd m3, [r2 + r3 + 8] dec r4d lea r2, [r2 + r3 * 2] movh [r0], m0 movd [r0 + 8], m2 movh [r0 + r1], m1 movd [r0 + r1 + 8], m3 lea r0, [r0 + r1 * 2] jnz .loop RET ;----------------------------------------------------------------------------- ; void blockcopy_ss_8x2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_ss_8x2, 4, 4, 2 add r1, r1 add r3, r3 movu m0, [r2] movu m1, [r2 + r3] movu [r0], m0 movu [r0 + r1], m1 RET ;----------------------------------------------------------------------------- ; void blockcopy_ss_8x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_ss_8x4, 4, 4, 4 add r1, r1 add r3, r3 movu m0, [r2] movu m1, [r2 + r3] lea r2, [r2 + r3 * 2] movu m2, [r2] movu m3, [r2 + r3] movu [r0], m0 movu [r0 + r1], m1 lea r0, [r0 + 2 * r1] movu [r0], m2 movu [r0 + r1], m3 RET ;----------------------------------------------------------------------------- ; void blockcopy_ss_8x6(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_ss_8x6, 4, 4, 4 add r1, r1 add r3, r3 movu m0, [r2] movu m1, [r2 + r3] lea r2, [r2 + r3 * 2] movu m2, [r2] movu m3, [r2 + r3] movu [r0], m0 movu [r0 + r1], m1 lea r0, [r0 + 2 * r1] movu [r0], m2 movu [r0 + r1], m3 lea r2, [r2 + r3 * 2] lea r0, [r0 + 2 * r1] movu m0, [r2] movu m1, [r2 + r3] movu [r0], m0 movu [r0 + r1], m1 RET ;----------------------------------------------------------------------------- ; void blockcopy_ss_8x12(int16_t* dst, 
intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_ss_8x12, 4, 5, 2 add r1, r1 add r3, r3 mov r4d, 12/2 .loop: movu m0, [r2] movu m1, [r2 + r3] lea r2, [r2 + 2 * r3] dec r4d movu [r0], m0 movu [r0 + r1], m1 lea r0, [r0 + 2 * r1] jnz .loop RET ;----------------------------------------------------------------------------- ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SS_W8_H8 2 INIT_XMM sse2 cglobal blockcopy_ss_%1x%2, 4, 5, 4 mov r4d, %2/8 add r1, r1 add r3, r3 .loop: movu m0, [r2] movu m1, [r2 + r3] lea r2, [r2 + r3 * 2] movu m2, [r2] movu m3, [r2 + r3] movu [r0], m0 movu [r0 + r1], m1 lea r0, [r0 + 2 * r1] movu [r0], m2 movu [r0 + r1], m3 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movu m0, [r2] movu m1, [r2 + r3] lea r2, [r2 + r3 * 2] movu m2, [r2] movu m3, [r2 + r3] movu [r0], m0 movu [r0 + r1], m1 lea r0, [r0 + 2 * r1] movu [r0], m2 movu [r0 + r1], m3 dec r4d lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] jnz .loop RET %endmacro BLOCKCOPY_SS_W8_H8 8, 8 BLOCKCOPY_SS_W8_H8 8, 16 BLOCKCOPY_SS_W8_H8 8, 32 BLOCKCOPY_SS_W8_H8 8, 64 ;----------------------------------------------------------------------------- ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SS_W12_H4 2 INIT_XMM sse2 cglobal blockcopy_ss_%1x%2, 4, 5, 4 mov r4d, %2/4 add r1, r1 add r3, r3 .loop: movu m0, [r2] movh m1, [r2 + 16] movu m2, [r2 + r3] movh m3, [r2 + r3 + 16] lea r2, [r2 + 2 * r3] movu [r0], m0 movh [r0 + 16], m1 movu [r0 + r1], m2 movh [r0 + r1 + 16], m3 lea r0, [r0 + 2 * r1] movu m0, [r2] movh m1, [r2 + 16] movu m2, [r2 + r3] movh m3, [r2 + r3 + 16] movu [r0], m0 movh [r0 + 16], m1 movu [r0 + r1], m2 movh [r0 + r1 + 16], m3 dec r4d lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] jnz .loop RET %endmacro BLOCKCOPY_SS_W12_H4 12, 16 BLOCKCOPY_SS_W12_H4 12, 32 ;----------------------------------------------------------------------------- ; void blockcopy_ss_16x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SS_W16_H4 2 INIT_XMM sse2 cglobal blockcopy_ss_%1x%2, 4, 5, 4 mov r4d, %2/4 add r1, r1 add r3, r3 .loop: movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + r3] movu m3, [r2 + r3 + 16] movu [r0], m0 movu [r0 + 16], m1 movu [r0 + r1], m2 movu [r0 + r1 + 16], m3 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + r3] movu m3, [r2 + r3 + 16] movu [r0], m0 movu [r0 + 16], m1 movu [r0 + r1], m2 movu [r0 + r1 + 16], m3 dec r4d lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] jnz .loop RET %endmacro BLOCKCOPY_SS_W16_H4 16, 4 BLOCKCOPY_SS_W16_H4 16, 12 ;----------------------------------------------------------------------------- ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SS_W16_H4_avx 2 INIT_YMM avx cglobal blockcopy_ss_%1x%2, 4, 7, 4 mov r4d, %2/4 add r1, r1 add r3, r3 lea r5, [3 * r3] lea r6, [3 * r1] .loop: movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + 2 * r3] movu m3, [r2 + r5] movu [r0], m0 movu [r0 + r1], m1 
movu [r0 + 2 * r1], m2 movu [r0 + r6], m3 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_SS_W16_H4_avx 16, 4 BLOCKCOPY_SS_W16_H4_avx 16, 12 BLOCKCOPY_SS_W16_H4_avx 16, 8 BLOCKCOPY_SS_W16_H4_avx 16, 16 BLOCKCOPY_SS_W16_H4_avx 16, 24 BLOCKCOPY_SS_W16_H4_avx 16, 32 BLOCKCOPY_SS_W16_H4_avx 16, 64 ;----------------------------------------------------------------------------- ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SS_W16_H8 2 INIT_XMM sse2 cglobal blockcopy_ss_%1x%2, 4, 5, 4 mov r4d, %2/8 add r1, r1 add r3, r3 .loop: movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + r3] movu m3, [r2 + r3 + 16] movu [r0], m0 movu [r0 + 16], m1 movu [r0 + r1], m2 movu [r0 + r1 + 16], m3 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + r3] movu m3, [r2 + r3 + 16] movu [r0], m0 movu [r0 + 16], m1 movu [r0 + r1], m2 movu [r0 + r1 + 16], m3 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + r3] movu m3, [r2 + r3 + 16] movu [r0], m0 movu [r0 + 16], m1 movu [r0 + r1], m2 movu [r0 + r1 + 16], m3 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + r3] movu m3, [r2 + r3 + 16] movu [r0], m0 movu [r0 + 16], m1 movu [r0 + r1], m2 movu [r0 + r1 + 16], m3 dec r4d lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] jnz .loop RET %endmacro BLOCKCOPY_SS_W16_H8 16, 8 BLOCKCOPY_SS_W16_H8 16, 16 BLOCKCOPY_SS_W16_H8 16, 32 BLOCKCOPY_SS_W16_H8 16, 64 BLOCKCOPY_SS_W16_H8 16, 24 ;----------------------------------------------------------------------------- ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SS_W24_H4 2 INIT_XMM sse2 cglobal blockcopy_ss_%1x%2, 4, 5, 6 mov r4d, %2/4 add r1, r1 add r3, r3 .loop: movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + 32] movu m3, [r2 + r3] movu m4, [r2 + r3 + 16] movu m5, [r2 + r3 + 32] movu [r0], m0 movu [r0 + 16], m1 movu [r0 + 32], m2 movu [r0 + r1], m3 movu [r0 + r1 + 16], m4 movu [r0 + r1 + 32], m5 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + 32] movu m3, [r2 + r3] movu m4, [r2 + r3 + 16] movu m5, [r2 + r3 + 32] movu [r0], m0 movu [r0 + 16], m1 movu [r0 + 32], m2 movu [r0 + r1], m3 movu [r0 + r1 + 16], m4 movu [r0 + r1 + 32], m5 dec r4d lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] jnz .loop RET %endmacro BLOCKCOPY_SS_W24_H4 24, 32 BLOCKCOPY_SS_W24_H4 24, 64 ;----------------------------------------------------------------------------- ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SS_W24_H4_avx 2 INIT_YMM avx cglobal blockcopy_ss_%1x%2, 4, 7, 2 mov r4d, %2/4 add r1, r1 add r3, r3 lea r5, [3 * r3] lea r6, [3 * r1] .loop: movu m0, [r2] movu xm1, [r2 + 32] movu [r0], m0 movu [r0 + 32], xm1 movu m0, [r2 + r3] movu xm1, [r2 + r3 + 32] movu [r0 + r1], m0 movu [r0 + r1 + 32], xm1 movu m0, [r2 + 2 * r3] movu xm1, [r2 + 2 * r3 + 32] movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + 32], xm1 movu m0, [r2 + r5] movu xm1, [r2 + r5 + 32] movu [r0 + r6], m0 movu [r0 + r6 + 32], xm1 dec r4d lea r2, [r2 + 4 * r3] lea r0, [r0 + 4 * r1] jnz .loop RET %endmacro 
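;-----------------------------------------------------------------------------
; Maintainer note: the blockcopy_ss_WxH kernels in this section are plain
; int16_t block copies; both strides are passed in elements, so every kernel
; doubles r1 and r3 (`add r1, r1` / `add r3, r3`) before addressing. A minimal
; C sketch of the operation (not part of the build; `bw`/`bh` are illustrative
; width/height parameters, not names from this file):
;
;   static void blockcopy_ss_ref(int16_t* dst, intptr_t dstStride,
;                                const int16_t* src, intptr_t srcStride,
;                                int bw, int bh)
;   {
;       for (int y = 0; y < bh; y++)
;       {
;           for (int x = 0; x < bw; x++)
;               dst[x] = src[x];
;           dst += dstStride;
;           src += srcStride;
;       }
;   }
;-----------------------------------------------------------------------------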
BLOCKCOPY_SS_W24_H4_avx 24, 32 BLOCKCOPY_SS_W24_H4_avx 24, 64 ;----------------------------------------------------------------------------- ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SS_W32_H4 2 INIT_XMM sse2 cglobal blockcopy_ss_%1x%2, 4, 5, 4 mov r4d, %2/4 add r1, r1 add r3, r3 .loop: movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + 32] movu m3, [r2 + 48] movu [r0], m0 movu [r0 + 16], m1 movu [r0 + 32], m2 movu [r0 + 48], m3 movu m0, [r2 + r3] movu m1, [r2 + r3 + 16] movu m2, [r2 + r3 + 32] movu m3, [r2 + r3 + 48] movu [r0 + r1], m0 movu [r0 + r1 + 16], m1 movu [r0 + r1 + 32], m2 movu [r0 + r1 + 48], m3 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + 32] movu m3, [r2 + 48] movu [r0], m0 movu [r0 + 16], m1 movu [r0 + 32], m2 movu [r0 + 48], m3 movu m0, [r2 + r3] movu m1, [r2 + r3 + 16] movu m2, [r2 + r3 + 32] movu m3, [r2 + r3 + 48] movu [r0 + r1], m0 movu [r0 + r1 + 16], m1 movu [r0 + r1 + 32], m2 movu [r0 + r1 + 48], m3 dec r4d lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] jnz .loop RET %endmacro BLOCKCOPY_SS_W32_H4 32, 8 BLOCKCOPY_SS_W32_H4 32, 16 BLOCKCOPY_SS_W32_H4 32, 24 BLOCKCOPY_SS_W32_H4 32, 32 BLOCKCOPY_SS_W32_H4 32, 64 BLOCKCOPY_SS_W32_H4 32, 48 ;----------------------------------------------------------------------------- ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SS_W32_H4_avx 2 INIT_YMM avx cglobal blockcopy_ss_%1x%2, 4, 7, 4 mov r4d, %2/4 add r1, r1 add r3, r3 lea r5, [3 * r1] lea r6, [3 * r3] .loop: movu m0, [r2] movu m1, [r2 + 32] movu [r0], m0 movu [r0 + 32], m1 movu m0, [r2 + r3] movu m1, [r2 + r3 + 32] movu [r0 + r1], m0 movu [r0 + r1 + 32], m1 movu m0, [r2 + 2 * r3] movu m1, [r2 + 2 * r3 + 32] movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + 32], m1 movu m0, [r2 + r6] movu m1, [r2 + r6 + 32] movu [r0 + r5], m0 movu [r0 + r5 + 32], m1 dec r4d lea r2, [r2 + 4 * r3] lea r0, [r0 + 4 * r1] jnz .loop RET %endmacro BLOCKCOPY_SS_W32_H4_avx 32, 8 BLOCKCOPY_SS_W32_H4_avx 32, 16 BLOCKCOPY_SS_W32_H4_avx 32, 24 BLOCKCOPY_SS_W32_H4_avx 32, 32 BLOCKCOPY_SS_W32_H4_avx 32, 48 BLOCKCOPY_SS_W32_H4_avx 32, 64 %macro PROCESS_BLOCKCOPY_SS_W32_H8_avx512 0 movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + 2 * r3] movu m3, [r2 + r6] lea r2, [r2 + 4 * r3] movu [r0], m0 movu [r0 + r1], m1 movu [r0 + 2 * r1], m2 movu [r0 + r5], m3 lea r0, [r0 + 4 * r1] movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + 2 * r3] movu m3, [r2 + r6] lea r2, [r2 + 4 * r3] movu [r0], m0 movu [r0 + r1], m1 movu [r0 + 2 * r1], m2 movu [r0 + r5], m3 lea r0, [r0 + 4 * r1] %endmacro %macro PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512 0 movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + 2 * r3] movu m3, [r2 + r6] lea r2, [r2 + 4 * r3] movu [r0], m0 movu [r0 + r1], m1 movu [r0 + 2 * r1], m2 movu [r0 + r5], m3 lea r0, [r0 + 4 * r1] movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + 2 * r3] movu m3, [r2 + r6] movu [r0], m0 movu [r0 + r1], m1 movu [r0 + 2 * r1], m2 movu [r0 + r5], m3 %endmacro ;----------------------------------------------------------------------------- ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_ZMM avx512 cglobal blockcopy_ss_32x8, 4, 7, 4 add r1, r1 
add r3, r3 lea r5, [3 * r1] lea r6, [3 * r3] PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512 RET INIT_ZMM avx512 cglobal blockcopy_ss_32x16, 4, 7, 4 add r1, r1 add r3, r3 lea r5, [3 * r1] lea r6, [3 * r3] PROCESS_BLOCKCOPY_SS_W32_H8_avx512 PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512 RET INIT_ZMM avx512 cglobal blockcopy_ss_32x24, 4, 7, 4 add r1, r1 add r3, r3 lea r5, [3 * r1] lea r6, [3 * r3] PROCESS_BLOCKCOPY_SS_W32_H8_avx512 PROCESS_BLOCKCOPY_SS_W32_H8_avx512 PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512 RET INIT_ZMM avx512 cglobal blockcopy_ss_32x32, 4, 7, 4 add r1, r1 add r3, r3 lea r5, [3 * r1] lea r6, [3 * r3] PROCESS_BLOCKCOPY_SS_W32_H8_avx512 PROCESS_BLOCKCOPY_SS_W32_H8_avx512 PROCESS_BLOCKCOPY_SS_W32_H8_avx512 PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512 RET INIT_ZMM avx512 cglobal blockcopy_ss_32x48, 4, 7, 4 add r1, r1 add r3, r3 lea r5, [3 * r1] lea r6, [3 * r3] PROCESS_BLOCKCOPY_SS_W32_H8_avx512 PROCESS_BLOCKCOPY_SS_W32_H8_avx512 PROCESS_BLOCKCOPY_SS_W32_H8_avx512 PROCESS_BLOCKCOPY_SS_W32_H8_avx512 PROCESS_BLOCKCOPY_SS_W32_H8_avx512 PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512 RET INIT_ZMM avx512 cglobal blockcopy_ss_32x64, 4, 7, 4 add r1, r1 add r3, r3 lea r5, [3 * r1] lea r6, [3 * r3] PROCESS_BLOCKCOPY_SS_W32_H8_avx512 PROCESS_BLOCKCOPY_SS_W32_H8_avx512 PROCESS_BLOCKCOPY_SS_W32_H8_avx512 PROCESS_BLOCKCOPY_SS_W32_H8_avx512 PROCESS_BLOCKCOPY_SS_W32_H8_avx512 PROCESS_BLOCKCOPY_SS_W32_H8_avx512 PROCESS_BLOCKCOPY_SS_W32_H8_avx512 PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512 RET ;----------------------------------------------------------------------------- ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SS_W48_H2 2 INIT_XMM sse2 cglobal blockcopy_ss_%1x%2, 4, 5, 6 mov r4d, %2/4 add r1, r1 add r3, r3 .loop: movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + 32] movu m3, [r2 + 48] movu m4, [r2 + 64] movu m5, [r2 + 80] movu [r0], m0 movu [r0 + 16], m1 movu [r0 + 32], m2 movu [r0 + 48], m3 movu [r0 + 64], m4 movu [r0 + 80], m5 movu m0, [r2 + r3] movu m1, [r2 + r3 + 16] movu m2, [r2 + r3 + 32] movu m3, [r2 + r3 + 48] movu m4, [r2 + r3 + 64] movu m5, [r2 + r3 + 80] movu [r0 + r1], m0 movu [r0 + r1 + 16], m1 movu [r0 + r1 + 32], m2 movu [r0 + r1 + 48], m3 movu [r0 + r1 + 64], m4 movu [r0 + r1 + 80], m5 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + 32] movu m3, [r2 + 48] movu m4, [r2 + 64] movu m5, [r2 + 80] movu [r0], m0 movu [r0 + 16], m1 movu [r0 + 32], m2 movu [r0 + 48], m3 movu [r0 + 64], m4 movu [r0 + 80], m5 movu m0, [r2 + r3] movu m1, [r2 + r3 + 16] movu m2, [r2 + r3 + 32] movu m3, [r2 + r3 + 48] movu m4, [r2 + r3 + 64] movu m5, [r2 + r3 + 80] movu [r0 + r1], m0 movu [r0 + r1 + 16], m1 movu [r0 + r1 + 32], m2 movu [r0 + r1 + 48], m3 movu [r0 + r1 + 64], m4 movu [r0 + r1 + 80], m5 dec r4d lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] jnz .loop RET %endmacro BLOCKCOPY_SS_W48_H2 48, 64 ;----------------------------------------------------------------------------- ; void blockcopy_ss_48x64(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_YMM avx cglobal blockcopy_ss_48x64, 4, 7, 6 mov r4d, 64/4 add r1, r1 add r3, r3 lea r5, [3 * r3] lea r6, [3 * r1] .loop: movu m0, [r2] movu m1, [r2 + 32] movu m2, [r2 + 64] movu [r0], m0 movu [r0 + 32], m1 movu [r0 + 64], m2 movu m0, [r2 + r3] movu m1, [r2 + r3 + 32] movu m2, [r2 + r3 + 64] 
movu [r0 + r1], m0 movu [r0 + r1 + 32], m1 movu [r0 + r1 + 64], m2 movu m0, [r2 + 2 * r3] movu m1, [r2 + 2 * r3 + 32] movu m2, [r2 + 2 * r3 + 64] movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + 32], m1 movu [r0 + 2 * r1 + 64], m2 movu m0, [r2 + r5] movu m1, [r2 + r5 + 32] movu m2, [r2 + r5 + 64] movu [r0 + r6], m0 movu [r0 + r6 + 32], m1 movu [r0 + r6 + 64], m2 dec r4d lea r2, [r2 + 4 * r3] lea r0, [r0 + 4 * r1] jnz .loop RET ;----------------------------------------------------------------------------- ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SS_W64_H4 2 INIT_XMM sse2 cglobal blockcopy_ss_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride mov r4d, %2/4 add r1, r1 add r3, r3 .loop: movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + 32] movu m3, [r2 + 48] movu [r0], m0 movu [r0 + 16], m1 movu [r0 + 32], m2 movu [r0 + 48], m3 movu m0, [r2 + 64] movu m1, [r2 + 80] movu m2, [r2 + 96] movu m3, [r2 + 112] movu [r0 + 64], m0 movu [r0 + 80], m1 movu [r0 + 96], m2 movu [r0 + 112], m3 movu m0, [r2 + r3] movu m1, [r2 + r3 + 16] movu m2, [r2 + r3 + 32] movu m3, [r2 + r3 + 48] movu [r0 + r1], m0 movu [r0 + r1 + 16], m1 movu [r0 + r1 + 32], m2 movu [r0 + r1 + 48], m3 movu m0, [r2 + r3 + 64] movu m1, [r2 + r3 + 80] movu m2, [r2 + r3 + 96] movu m3, [r2 + r3 + 112] movu [r0 + r1 + 64], m0 movu [r0 + r1 + 80], m1 movu [r0 + r1 + 96], m2 movu [r0 + r1 + 112], m3 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + 32] movu m3, [r2 + 48] movu [r0], m0 movu [r0 + 16], m1 movu [r0 + 32], m2 movu [r0 + 48], m3 movu m0, [r2 + 64] movu m1, [r2 + 80] movu m2, [r2 + 96] movu m3, [r2 + 112] movu [r0 + 64], m0 movu [r0 + 80], m1 movu [r0 + 96], m2 movu [r0 + 112], m3 movu m0, [r2 + r3] movu m1, [r2 + r3 + 16] movu m2, [r2 + r3 + 32] movu m3, [r2 + r3 + 48] movu [r0 + r1], m0 movu [r0 + r1 + 16], m1 movu [r0 + r1 + 32], m2 movu [r0 + r1 + 48], m3 movu m0, [r2 + r3 + 64] movu m1, [r2 + r3 + 80] movu m2, [r2 + r3 + 96] movu m3, [r2 + r3 + 112] movu [r0 + r1 + 64], m0 movu [r0 + r1 + 80], m1 movu [r0 + r1 + 96], m2 movu [r0 + r1 + 112], m3 dec r4d lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] jnz .loop RET %endmacro BLOCKCOPY_SS_W64_H4 64, 16 BLOCKCOPY_SS_W64_H4 64, 32 BLOCKCOPY_SS_W64_H4 64, 48 BLOCKCOPY_SS_W64_H4 64, 64 ;----------------------------------------------------------------------------- ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SS_W64_H4_avx 2 INIT_YMM avx cglobal blockcopy_ss_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride mov r4d, %2/4 add r1, r1 add r3, r3 lea r5, [3 * r1] lea r6, [3 * r3] .loop: movu m0, [r2] movu m1, [r2 + 32] movu m2, [r2 + 64] movu m3, [r2 + 96] movu [r0], m0 movu [r0 + 32], m1 movu [r0 + 64], m2 movu [r0 + 96], m3 movu m0, [r2 + r3] movu m1, [r2 + r3 + 32] movu m2, [r2 + r3 + 64] movu m3, [r2 + r3 + 96] movu [r0 + r1], m0 movu [r0 + r1 + 32], m1 movu [r0 + r1 + 64], m2 movu [r0 + r1 + 96], m3 movu m0, [r2 + 2 * r3] movu m1, [r2 + 2 * r3 + 32] movu m2, [r2 + 2 * r3 + 64] movu m3, [r2 + 2 * r3 + 96] movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + 32], m1 movu [r0 + 2 * r1 + 64], m2 movu [r0 + 2 * r1 + 96], m3 movu m0, [r2 + r6] movu m1, [r2 + r6 + 32] movu m2, [r2 + r6 + 64] movu m3, [r2 + r6 + 96] lea r2, [r2 + 4 * r3] movu [r0 + r5], m0 movu [r0 + r5 + 32], m1 
movu [r0 + r5 + 64], m2 movu [r0 + r5 + 96], m3 lea r0, [r0 + 4 * r1] dec r4d jnz .loop RET %endmacro BLOCKCOPY_SS_W64_H4_avx 64, 16 BLOCKCOPY_SS_W64_H4_avx 64, 32 BLOCKCOPY_SS_W64_H4_avx 64, 48 BLOCKCOPY_SS_W64_H4_avx 64, 64 %macro PROCESS_BLOCKCOPY_SS_W64_H8_avx512 0 movu m0, [r2] movu m1, [r2 + mmsize] movu m2, [r2 + r3] movu m3, [r2 + r3 + mmsize] movu [r0], m0 movu [r0 + mmsize], m1 movu [r0 + r1], m2 movu [r0 + r1 + mmsize], m3 movu m0, [r2 + 2 * r3] movu m1, [r2 + 2 * r3 + mmsize] movu m2, [r2 + r6] movu m3, [r2 + r6 + mmsize] lea r2, [r2 + 4 * r3] movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + mmsize], m1 movu [r0 + r5], m2 movu [r0 + r5 + mmsize], m3 lea r0, [r0 + 4 * r1] movu m0, [r2] movu m1, [r2 + mmsize] movu m2, [r2 + r3] movu m3, [r2 + r3 + mmsize] movu [r0], m0 movu [r0 + mmsize], m1 movu [r0 + r1], m2 movu [r0 + r1 + mmsize], m3 movu m0, [r2 + 2 * r3] movu m1, [r2 + 2 * r3 + mmsize] movu m2, [r2 + r6] movu m3, [r2 + r6 + mmsize] lea r2, [r2 + 4 * r3] movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + mmsize], m1 movu [r0 + r5], m2 movu [r0 + r5 + mmsize], m3 lea r0, [r0 + 4 * r1] %endmacro %macro PROCESS_BLOCKCOPY_SS_W64_H8_LAST_avx512 0 movu m0, [r2] movu m1, [r2 + mmsize] movu m2, [r2 + r3] movu m3, [r2 + r3 + mmsize] movu [r0], m0 movu [r0 + mmsize], m1 movu [r0 + r1], m2 movu [r0 + r1 + mmsize], m3 movu m0, [r2 + 2 * r3] movu m1, [r2 + 2 * r3 + mmsize] movu m2, [r2 + r6] movu m3, [r2 + r6 + mmsize] lea r2, [r2 + 4 * r3] movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + mmsize], m1 movu [r0 + r5], m2 movu [r0 + r5 + mmsize], m3 lea r0, [r0 + 4 * r1] movu m0, [r2] movu m1, [r2 + mmsize] movu m2, [r2 + r3] movu m3, [r2 + r3 + mmsize] movu [r0], m0 movu [r0 + mmsize], m1 movu [r0 + r1], m2 movu [r0 + r1 + mmsize], m3 movu m0, [r2 + 2 * r3] movu m1, [r2 + 2 * r3 + mmsize] movu m2, [r2 + r6] movu m3, [r2 + r6 + mmsize] movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + mmsize], m1 movu [r0 + r5], m2 movu [r0 + r5 + mmsize], m3 %endmacro ;----------------------------------------------------------------------------- ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_ZMM avx512 cglobal blockcopy_ss_64x16, 4, 7, 4 add r1, r1 add r3, r3 lea r5, [3 * r1] lea r6, [3 * r3] PROCESS_BLOCKCOPY_SS_W64_H8_avx512 PROCESS_BLOCKCOPY_SS_W64_H8_LAST_avx512 RET INIT_ZMM avx512 cglobal blockcopy_ss_64x32, 4, 7, 4 add r1, r1 add r3, r3 lea r5, [3 * r1] lea r6, [3 * r3] PROCESS_BLOCKCOPY_SS_W64_H8_avx512 PROCESS_BLOCKCOPY_SS_W64_H8_avx512 PROCESS_BLOCKCOPY_SS_W64_H8_avx512 PROCESS_BLOCKCOPY_SS_W64_H8_LAST_avx512 RET INIT_ZMM avx512 cglobal blockcopy_ss_64x48, 4, 7, 4 add r1, r1 add r3, r3 lea r5, [3 * r1] lea r6, [3 * r3] PROCESS_BLOCKCOPY_SS_W64_H8_avx512 PROCESS_BLOCKCOPY_SS_W64_H8_avx512 PROCESS_BLOCKCOPY_SS_W64_H8_avx512 PROCESS_BLOCKCOPY_SS_W64_H8_avx512 PROCESS_BLOCKCOPY_SS_W64_H8_avx512 PROCESS_BLOCKCOPY_SS_W64_H8_LAST_avx512 RET INIT_ZMM avx512 cglobal blockcopy_ss_64x64, 4, 7, 4 add r1, r1 add r3, r3 lea r5, [3 * r1] lea r6, [3 * r3] PROCESS_BLOCKCOPY_SS_W64_H8_avx512 PROCESS_BLOCKCOPY_SS_W64_H8_avx512 PROCESS_BLOCKCOPY_SS_W64_H8_avx512 PROCESS_BLOCKCOPY_SS_W64_H8_avx512 PROCESS_BLOCKCOPY_SS_W64_H8_avx512 PROCESS_BLOCKCOPY_SS_W64_H8_avx512 PROCESS_BLOCKCOPY_SS_W64_H8_avx512 PROCESS_BLOCKCOPY_SS_W64_H8_LAST_avx512 RET ;-------------------------------------------------------------------------------------- ; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int 
shift); ;-------------------------------------------------------------------------------------- INIT_XMM sse2 cglobal cpy2Dto1D_shr_4, 3, 4, 4 add r2d, r2d movd m0, r3m pcmpeqw m1, m1 psllw m1, m0 psraw m1, 1 ; register alloc ; r0 - dst ; r1 - src ; r2 - srcStride ; m0 - shift ; m1 - word [-round] ; Row 0-3 movh m2, [r1] movhps m2, [r1 + r2] lea r1, [r1 + r2 * 2] movh m3, [r1] movhps m3, [r1 + r2] psubw m2, m1 psubw m3, m1 psraw m2, m0 psraw m3, m0 mova [r0 + 0 * mmsize], m2 mova [r0 + 1 * mmsize], m3 RET ;-------------------------------------------------------------------------------------- ; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); ;-------------------------------------------------------------------------------------- INIT_XMM sse2 cglobal cpy2Dto1D_shr_8, 3, 5, 4 add r2d, r2d movd m0, r3m pcmpeqw m1, m1 psllw m1, m0 psraw m1, 1 mov r3d, 8/4 lea r4, [r2 * 3] ; register alloc ; r0 - dst ; r1 - src ; r2 - srcStride ; r3 - loop counter ; r4 - stride * 3 ; m0 - shift ; m1 - word [-round] .loop: ; Row 0-1 mova m2, [r1] mova m3, [r1 + r2] psubw m2, m1 psubw m3, m1 psraw m2, m0 psraw m3, m0 mova [r0 + 0 * mmsize], m2 mova [r0 + 1 * mmsize], m3 ; Row 2-3 mova m2, [r1 + r2 * 2] mova m3, [r1 + r4] psubw m2, m1 psubw m3, m1 psraw m2, m0 psraw m3, m0 mova [r0 + 2 * mmsize], m2 mova [r0 + 3 * mmsize], m3 add r0, 4 * mmsize lea r1, [r1 + r2 * 4] dec r3d jnz .loop RET INIT_YMM avx2 cglobal cpy2Dto1D_shr_8, 3, 4, 4 add r2d, r2d movd xm0, r3m pcmpeqw m1, m1 psllw m1, xm0 psraw m1, 1 lea r3, [r2 * 3] ; Row 0-3 movu xm2, [r1] vinserti128 m2, m2, [r1 + r2], 1 movu xm3, [r1 + 2 * r2] vinserti128 m3, m3, [r1 + r3], 1 psubw m2, m1 psraw m2, xm0 psubw m3, m1 psraw m3, xm0 movu [r0], m2 movu [r0 + 32], m3 ; Row 4-7 lea r1, [r1 + 4 * r2] movu xm2, [r1] vinserti128 m2, m2, [r1 + r2], 1 movu xm3, [r1 + 2 * r2] vinserti128 m3, m3, [r1 + r3], 1 psubw m2, m1 psraw m2, xm0 psubw m3, m1 psraw m3, xm0 movu [r0 + 64], m2 movu [r0 + 96], m3 RET ;-------------------------------------------------------------------------------------- ; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); ;-------------------------------------------------------------------------------------- INIT_XMM sse2 cglobal cpy2Dto1D_shr_16, 3, 4, 4 add r2d, r2d movd m0, r3m pcmpeqw m1, m1 psllw m1, m0 psraw m1, 1 mov r3d, 16/2 ; register alloc ; r0 - dst ; r1 - src ; r2 - srcStride ; r3 - loop counter ; m0 - shift ; m1 - word [-round] .loop: ; Row 0 mova m2, [r1 + 0 * mmsize] mova m3, [r1 + 1 * mmsize] psubw m2, m1 psubw m3, m1 psraw m2, m0 psraw m3, m0 mova [r0 + 0 * mmsize], m2 mova [r0 + 1 * mmsize], m3 ; Row 1 mova m2, [r1 + r2 + 0 * mmsize] mova m3, [r1 + r2 + 1 * mmsize] psubw m2, m1 psubw m3, m1 psraw m2, m0 psraw m3, m0 mova [r0 + 2 * mmsize], m2 mova [r0 + 3 * mmsize], m3 add r0, 4 * mmsize lea r1, [r1 + r2 * 2] dec r3d jnz .loop RET INIT_YMM avx2 cglobal cpy2Dto1D_shr_16, 4, 5, 4 add r2d, r2d movd xm0, r3d pcmpeqw m1, m1 psllw m1, xm0 psraw m1, 1 lea r3, [r2 * 3] mov r4d, 16/8 .loop: ; Row 0-1 movu m2, [r1] movu m3, [r1 + r2] psubw m2, m1 psraw m2, xm0 psubw m3, m1 psraw m3, xm0 movu [r0 + 0 * mmsize], m2 movu [r0 + 1 * mmsize], m3 ; Row 2-3 movu m2, [r1 + 2 * r2] movu m3, [r1 + r3] psubw m2, m1 psraw m2, xm0 psubw m3, m1 psraw m3, xm0 movu [r0 + 2 * mmsize], m2 movu [r0 + 3 * mmsize], m3 ; Row 4-5 lea r1, [r1 + 4 * r2] movu m2, [r1] movu m3, [r1 + r2] psubw m2, m1 psraw m2, xm0 psubw m3, m1 psraw m3, xm0 movu [r0 + 4 * mmsize], m2 movu [r0 + 5 * mmsize], m3 ; Row 6-7 
movu m2, [r1 + 2 * r2] movu m3, [r1 + r3] psubw m2, m1 psraw m2, xm0 psubw m3, m1 psraw m3, xm0 movu [r0 + 6 * mmsize], m2 movu [r0 + 7 * mmsize], m3 add r0, 8 * mmsize lea r1, [r1 + 4 * r2] dec r4d jnz .loop RET INIT_ZMM avx512 cglobal cpy2Dto1D_shr_16, 4, 5, 4 shl r2d, 1 movd xm0, r3d pcmpeqw ymm1, ymm1 psllw ym1, ymm1, xm0 psraw ym1, 1 vinserti32x8 m1, ym1, 1 lea r3, [r2 * 3] mov r4d, 2 .loop: ; Row 0-1 movu ym2, [r1] vinserti32x8 m2, [r1 + r2], 1 psubw m2, m1 psraw m2, xm0 movu [r0], m2 ; Row 2-3 movu ym2, [r1 + 2 * r2] vinserti32x8 m2, [r1 + r3], 1 psubw m2, m1 psraw m2, xm0 movu [r0 + mmsize], m2 lea r1, [r1 + 4 * r2] ; Row 4-5 movu ym2, [r1] vinserti32x8 m2, [r1 + r2], 1 psubw m2, m1 psraw m2, xm0 movu [r0 + 2 * mmsize], m2 ; Row 6-7 movu ym2, [r1 + 2 * r2] vinserti32x8 m2, [r1 + r3], 1 psubw m2, m1 psraw m2, xm0 movu [r0 + 3 * mmsize], m2 add r0, 4 * mmsize lea r1, [r1 + 4 * r2] dec r4d jnz .loop RET ;-------------------------------------------------------------------------------------- ; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); ;-------------------------------------------------------------------------------------- INIT_XMM sse2 cglobal cpy2Dto1D_shr_32, 3, 4, 6 add r2d, r2d movd m0, r3m pcmpeqw m1, m1 psllw m1, m0 psraw m1, 1 mov r3d, 32/1 ; register alloc ; r0 - dst ; r1 - src ; r2 - srcStride ; r3 - loop counter ; m0 - shift ; m1 - word [-round] .loop: ; Row 0 mova m2, [r1 + 0 * mmsize] mova m3, [r1 + 1 * mmsize] mova m4, [r1 + 2 * mmsize] mova m5, [r1 + 3 * mmsize] psubw m2, m1 psubw m3, m1 psubw m4, m1 psubw m5, m1 psraw m2, m0 psraw m3, m0 psraw m4, m0 psraw m5, m0 mova [r0 + 0 * mmsize], m2 mova [r0 + 1 * mmsize], m3 mova [r0 + 2 * mmsize], m4 mova [r0 + 3 * mmsize], m5 add r0, 4 * mmsize add r1, r2 dec r3d jnz .loop RET INIT_YMM avx2 cglobal cpy2Dto1D_shr_32, 4, 5, 4 add r2d, r2d movd xm0, r3d pcmpeqw m1, m1 psllw m1, xm0 psraw m1, 1 lea r3, [r2 * 3] mov r4d, 32/4 .loop: ; Row 0 movu m2, [r1] movu m3, [r1 + 32] psubw m2, m1 psraw m2, xm0 psubw m3, m1 psraw m3, xm0 movu [r0 + 0 * mmsize], m2 movu [r0 + 1 * mmsize], m3 ; Row 1 movu m2, [r1 + r2] movu m3, [r1 + r2 + 32] psubw m2, m1 psraw m2, xm0 psubw m3, m1 psraw m3, xm0 movu [r0 + 2 * mmsize], m2 movu [r0 + 3 * mmsize], m3 ; Row 2 movu m2, [r1 + 2 * r2] movu m3, [r1 + 2 * r2 + 32] psubw m2, m1 psraw m2, xm0 psubw m3, m1 psraw m3, xm0 movu [r0 + 4 * mmsize], m2 movu [r0 + 5 * mmsize], m3 ; Row 3 movu m2, [r1 + r3] movu m3, [r1 + r3 + 32] psubw m2, m1 psraw m2, xm0 psubw m3, m1 psraw m3, xm0 movu [r0 + 6 * mmsize], m2 movu [r0 + 7 * mmsize], m3 add r0, 8 * mmsize lea r1, [r1 + 4 * r2] dec r4d jnz .loop RET INIT_ZMM avx512 cglobal cpy2Dto1D_shr_32, 4, 5, 4 shl r2d, 1 movd xm0, r3d pcmpeqw ymm1, ymm1 psllw ym1, ymm1, xm0 psraw ym1, 1 vinserti32x8 m1, ym1, 1 lea r3, [r2 * 3] mov r4d, 8 .loop: ; Row 0 movu m2, [r1] psubw m2, m1 psraw m2, xm0 movu [r0], m2 ; Row 1 movu m2, [r1 + r2] psubw m2, m1 psraw m2, xm0 movu [r0 + mmsize], m2 ; Row 2 movu m2, [r1 + 2 * r2] psubw m2, m1 psraw m2, xm0 movu [r0 + 2 * mmsize], m2 ; Row 3 movu m2, [r1 + r3] psubw m2, m1 psraw m2, xm0 movu [r0 + 3 * mmsize], m2 add r0, 4 * mmsize lea r1, [r1 + 4 * r2] dec r4d jnz .loop RET ;-------------------------------------------------------------------------------------- ; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift) ;-------------------------------------------------------------------------------------- INIT_XMM sse2 cglobal cpy1Dto2D_shl_4, 3, 3, 3 add r2d, r2d movd m0, r3m ; 
Row 0-3 mova m1, [r1 + 0 * mmsize] mova m2, [r1 + 1 * mmsize] psllw m1, m0 psllw m2, m0 movh [r0], m1 movhps [r0 + r2], m1 movh [r0 + r2 * 2], m2 lea r2, [r2 * 3] movhps [r0 + r2], m2 RET INIT_YMM avx2 cglobal cpy1Dto2D_shl_4, 3, 3, 2 add r2d, r2d movd xm0, r3m ; Row 0-3 movu m1, [r1] psllw m1, xm0 vextracti128 xm0, m1, 1 movq [r0], xm1 movhps [r0 + r2], xm1 lea r0, [r0 + r2 * 2] movq [r0], xm0 movhps [r0 + r2], xm0 RET ;-------------------------------------------------------------------------------------- ; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift) ;-------------------------------------------------------------------------------------- INIT_XMM sse2 cglobal cpy1Dto2D_shl_8, 3, 4, 5 add r2d, r2d movd m0, r3m lea r3, [r2 * 3] ; Row 0-3 mova m1, [r1 + 0 * mmsize] mova m2, [r1 + 1 * mmsize] mova m3, [r1 + 2 * mmsize] mova m4, [r1 + 3 * mmsize] psllw m1, m0 psllw m2, m0 psllw m3, m0 psllw m4, m0 mova [r0], m1 mova [r0 + r2], m2 mova [r0 + r2 * 2], m3 mova [r0 + r3], m4 lea r0, [r0 + r2 * 4] ; Row 4-7 mova m1, [r1 + 4 * mmsize] mova m2, [r1 + 5 * mmsize] mova m3, [r1 + 6 * mmsize] mova m4, [r1 + 7 * mmsize] psllw m1, m0 psllw m2, m0 psllw m3, m0 psllw m4, m0 mova [r0], m1 mova [r0 + r2], m2 mova [r0 + r2 * 2], m3 mova [r0 + r3], m4 RET INIT_YMM avx2 cglobal cpy1Dto2D_shl_8, 3, 4, 3 add r2d, r2d movd xm0, r3m lea r3, [r2 * 3] ; Row 0-3 movu m1, [r1 + 0 * mmsize] movu m2, [r1 + 1 * mmsize] psllw m1, xm0 psllw m2, xm0 movu [r0], xm1 vextracti128 [r0 + r2], m1, 1 movu [r0 + r2 * 2], xm2 vextracti128 [r0 + r3], m2, 1 ; Row 4-7 movu m1, [r1 + 2 * mmsize] movu m2, [r1 + 3 * mmsize] lea r0, [r0 + r2 * 4] psllw m1, xm0 psllw m2, xm0 movu [r0], xm1 vextracti128 [r0 + r2], m1, 1 movu [r0 + r2 * 2], xm2 vextracti128 [r0 + r3], m2, 1 RET ;-------------------------------------------------------------------------------------- ; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift) ;-------------------------------------------------------------------------------------- INIT_XMM sse2 cglobal cpy1Dto2D_shl_16, 3, 4, 5 add r2d, r2d movd m0, r3m mov r3d, 16/4 .loop: ; Row 0-1 mova m1, [r1 + 0 * mmsize] mova m2, [r1 + 1 * mmsize] mova m3, [r1 + 2 * mmsize] mova m4, [r1 + 3 * mmsize] psllw m1, m0 psllw m2, m0 psllw m3, m0 psllw m4, m0 mova [r0], m1 mova [r0 + 16], m2 mova [r0 + r2], m3 mova [r0 + r2 + 16], m4 ; Row 2-3 mova m1, [r1 + 4 * mmsize] mova m2, [r1 + 5 * mmsize] mova m3, [r1 + 6 * mmsize] mova m4, [r1 + 7 * mmsize] lea r0, [r0 + r2 * 2] psllw m1, m0 psllw m2, m0 psllw m3, m0 psllw m4, m0 mova [r0], m1 mova [r0 + 16], m2 mova [r0 + r2], m3 mova [r0 + r2 + 16], m4 add r1, 8 * mmsize lea r0, [r0 + r2 * 2] dec r3d jnz .loop RET INIT_YMM avx2 cglobal cpy1Dto2D_shl_16, 3, 5, 3 add r2d, r2d movd xm0, r3m mov r3d, 16/4 lea r4, [r2 * 3] .loop: ; Row 0-1 movu m1, [r1 + 0 * mmsize] movu m2, [r1 + 1 * mmsize] psllw m1, xm0 psllw m2, xm0 movu [r0], m1 movu [r0 + r2], m2 ; Row 2-3 movu m1, [r1 + 2 * mmsize] movu m2, [r1 + 3 * mmsize] psllw m1, xm0 psllw m2, xm0 movu [r0 + r2 * 2], m1 movu [r0 + r4], m2 add r1, 4 * mmsize lea r0, [r0 + r2 * 4] dec r3d jnz .loop RET ;-------------------------------------------------------------------------------------- ; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift) ;-------------------------------------------------------------------------------------- INIT_XMM sse2 cglobal cpy1Dto2D_shl_32, 3, 4, 5 add r2d, r2d movd m0, r3m mov r3d, 32/2 .loop: ; Row 0 mova m1, [r1 + 0 * mmsize] 
mova m2, [r1 + 1 * mmsize] mova m3, [r1 + 2 * mmsize] mova m4, [r1 + 3 * mmsize] psllw m1, m0 psllw m2, m0 psllw m3, m0 psllw m4, m0 mova [r0 + 0 * mmsize], m1 mova [r0 + 1 * mmsize], m2 mova [r0 + 2 * mmsize], m3 mova [r0 + 3 * mmsize], m4 ; Row 1 mova m1, [r1 + 4 * mmsize] mova m2, [r1 + 5 * mmsize] mova m3, [r1 + 6 * mmsize] mova m4, [r1 + 7 * mmsize] psllw m1, m0 psllw m2, m0 psllw m3, m0 psllw m4, m0 mova [r0 + r2 + 0 * mmsize], m1 mova [r0 + r2 + 1 * mmsize], m2 mova [r0 + r2 + 2 * mmsize], m3 mova [r0 + r2 + 3 * mmsize], m4 add r1, 8 * mmsize lea r0, [r0 + r2 * 2] dec r3d jnz .loop RET INIT_YMM avx2 cglobal cpy1Dto2D_shl_32, 3, 4, 5 add r2d, r2d movd xm0, r3m mov r3d, 32/2 .loop: ; Row 0-1 movu m1, [r1 + 0 * mmsize] movu m2, [r1 + 1 * mmsize] movu m3, [r1 + 2 * mmsize] movu m4, [r1 + 3 * mmsize] psllw m1, xm0 psllw m2, xm0 psllw m3, xm0 psllw m4, xm0 movu [r0], m1 movu [r0 + mmsize], m2 movu [r0 + r2], m3 movu [r0 + r2 + mmsize], m4 add r1, 4 * mmsize lea r0, [r0 + r2 * 2] dec r3d jnz .loop RET ;-------------------------------------------------------------------------------------- ; cpy_1Dto2D_shl avx512 code start ;-------------------------------------------------------------------------------------- %macro PROCESS_CPY1Dto2D_SHL_32x8_AVX512 0 movu m1, [r1 + 0 * mmsize] movu m2, [r1 + 1 * mmsize] movu m3, [r1 + 2 * mmsize] movu m4, [r1 + 3 * mmsize] psllw m1, xm0 psllw m2, xm0 psllw m3, xm0 psllw m4, xm0 movu [r0], m1 movu [r0 + r2], m2 movu [r0 + 2 * r2], m3 movu [r0 + r3], m4 add r1, 4 * mmsize lea r0, [r0 + r2 * 4] movu m1, [r1 + 0 * mmsize] movu m2, [r1 + 1 * mmsize] movu m3, [r1 + 2 * mmsize] movu m4, [r1 + 3 * mmsize] psllw m1, xm0 psllw m2, xm0 psllw m3, xm0 psllw m4, xm0 movu [r0], m1 movu [r0 + r2], m2 movu [r0 + 2 * r2], m3 movu [r0 + r3], m4 %endmacro ;-------------------------------------------------------------------------------------- ; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift) ;-------------------------------------------------------------------------------------- INIT_ZMM avx512 cglobal cpy1Dto2D_shl_32, 4, 4, 5 add r2d, r2d movd xm0, r3d lea r3, [3 * r2] %rep 3 PROCESS_CPY1Dto2D_SHL_32x8_AVX512 add r1, 4 * mmsize lea r0, [r0 + r2 * 4] %endrep PROCESS_CPY1Dto2D_SHL_32x8_AVX512 RET %macro PROCESS_CPY1Dto2D_SHL_ALIGNED_32x8_AVX512 0 mova m1, [r1 + 0 * mmsize] mova m2, [r1 + 1 * mmsize] mova m3, [r1 + 2 * mmsize] mova m4, [r1 + 3 * mmsize] psllw m1, xm0 psllw m2, xm0 psllw m3, xm0 psllw m4, xm0 mova [r0], m1 mova [r0 + r2], m2 mova [r0 + 2 * r2], m3 mova [r0 + r3], m4 add r1, 4 * mmsize lea r0, [r0 + r2 * 4] mova m1, [r1 + 0 * mmsize] mova m2, [r1 + 1 * mmsize] mova m3, [r1 + 2 * mmsize] mova m4, [r1 + 3 * mmsize] psllw m1, xm0 psllw m2, xm0 psllw m3, xm0 psllw m4, xm0 mova [r0], m1 mova [r0 + r2], m2 mova [r0 + 2 * r2], m3 mova [r0 + r3], m4 %endmacro ;-------------------------------------------------------------------------------------- ; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift) ;-------------------------------------------------------------------------------------- INIT_ZMM avx512 cglobal cpy1Dto2D_shl_aligned_32, 4, 4, 5 add r2d, r2d movd xm0, r3d lea r3, [3 * r2] %rep 3 PROCESS_CPY1Dto2D_SHL_ALIGNED_32x8_AVX512 add r1, 4 * mmsize lea r0, [r0 + r2 * 4] %endrep PROCESS_CPY1Dto2D_SHL_ALIGNED_32x8_AVX512 RET ;-------------------------------------------------------------------------------------- ; copy_cnt avx512 code end 
;-------------------------------------------------------------------------------------- ;-------------------------------------------------------------------------------------- ; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride); ;-------------------------------------------------------------------------------------- INIT_XMM sse4 cglobal copy_cnt_4, 3,3,3 add r2d, r2d pxor m2, m2 ; row 0 & 1 movh m0, [r1] movhps m0, [r1 + r2] mova [r0], m0 ; row 2 & 3 movh m1, [r1 + r2 * 2] lea r2, [r2 * 3] movhps m1, [r1 + r2] mova [r0 + 16], m1 packsswb m0, m1 pcmpeqb m0, m2 ; get count ; CHECK_ME: Intel documents said POPCNT is SSE4.2 instruction, but just implement after Nehalem %if 0 pmovmskb eax, m0 not ax popcnt ax, ax %else mova m1, [pb_1] paddb m0, m1 psadbw m0, m2 pshufd m1, m0, 2 paddw m0, m1 movd eax, m0 %endif RET ;-------------------------------------------------------------------------------------- ; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride); ;-------------------------------------------------------------------------------------- INIT_XMM sse4 cglobal copy_cnt_8, 3,3,6 add r2d, r2d pxor m4, m4 pxor m5, m5 ; row 0 & 1 movu m0, [r1] movu m1, [r1 + r2] movu [r0], m0 movu [r0 + 16], m1 packsswb m0, m1 pcmpeqb m0, m4 paddb m5, m0 ; row 2 & 3 lea r1, [r1 + 2 * r2] movu m0, [r1] movu m1, [r1 + r2] movu [r0 + 32], m0 movu [r0 + 48], m1 packsswb m0, m1 pcmpeqb m0, m4 paddb m5, m0 ; row 4 & 5 lea r1, [r1 + 2 * r2] movu m0, [r1] movu m1, [r1 + r2] movu [r0 + 64], m0 movu [r0 + 80], m1 packsswb m0, m1 pcmpeqb m0, m4 paddb m5, m0 ; row 6 & 7 lea r1, [r1 + 2 * r2] movu m0, [r1] movu m1, [r1 + r2] movu [r0 + 96], m0 movu [r0 + 112], m1 packsswb m0, m1 pcmpeqb m0, m4 paddb m5, m0 ; get count mova m0, [pb_4] paddb m5, m0 psadbw m5, m4 pshufd m0, m5, 2 paddw m5, m0 movd eax, m5 RET INIT_YMM avx2 cglobal copy_cnt_8, 3,4,5 add r2d, r2d lea r3, [r2 * 3] ; row 0 - 1 movu xm0, [r1] vinserti128 m0, m0, [r1 + r2], 1 movu [r0], m0 ; row 2 - 3 movu xm1, [r1 + r2 * 2] vinserti128 m1, m1, [r1 + r3], 1 movu [r0 + 32], m1 lea r1, [r1 + r2 * 4] ; row 4 - 5 movu xm2, [r1] vinserti128 m2, m2, [r1 + r2], 1 movu [r0 + 64], m2 ; row 6 - 7 movu xm3, [r1 + r2 * 2] vinserti128 m3, m3, [r1 + r3], 1 movu [r0 + 96], m3 ; get count xorpd m4, m4 vpacksswb m0, m1 vpacksswb m2, m3 pminub m0, [pb_1] pminub m2, [pb_1] paddb m0, m2 vextracti128 xm1, m0, 1 paddb xm0, xm1 psadbw xm0, xm4 movhlps xm1, xm0 paddd xm0, xm1 movd eax, xm0 RET ;-------------------------------------------------------------------------------------- ; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride); ;-------------------------------------------------------------------------------------- INIT_XMM sse4 cglobal copy_cnt_16, 3,4,6 add r2d, r2d mov r3d, 4 pxor m4, m4 pxor m5, m5 .loop: ; row 0 movu m0, [r1] movu m1, [r1 + 16] movu [r0], m0 movu [r0 + 16], m1 packsswb m0, m1 pcmpeqb m0, m4 paddb m5, m0 ; row 1 movu m0, [r1 + r2] movu m1, [r1 + r2 + 16] movu [r0 + 32], m0 movu [r0 + 48], m1 packsswb m0, m1 pcmpeqb m0, m4 paddb m5, m0 ; row 2 movu m0, [r1 + 2 * r2] movu m1, [r1 + 2 * r2 + 16] movu [r0 + 64], m0 movu [r0 + 80], m1 packsswb m0, m1 pcmpeqb m0, m4 paddb m5, m0 ; row 3 lea r1, [r1 + 2 * r2] movu m0, [r1 + r2] movu m1, [r1 + r2 + 16] movu [r0 + 96], m0 movu [r0 + 112], m1 packsswb m0, m1 pcmpeqb m0, m4 paddb m5, m0 add r0, 128 lea r1, [r1 + 2 * r2] dec r3d jnz .loop mova m0, [pb_16] paddb m5, m0 psadbw m5, m4 pshufd m0, m5, 2 paddw m5, m0 movd eax, m5 RET INIT_YMM avx2 cglobal copy_cnt_16, 3, 
INIT_YMM avx2
cglobal copy_cnt_16, 3, 5, 5
    add         r2d, r2d
    lea         r3, [r2 * 3]
    mov         r4d, 16/4
    mova        m3, [pb_1]
    xorpd       m4, m4

.loop:
    ; row 0 - 1
    movu        m0, [r1]
    movu        [r0], m0
    movu        m1, [r1 + r2]
    movu        [r0 + 32], m1
    packsswb    m0, m1
    pminub      m0, m3

    ; row 2 - 3
    movu        m1, [r1 + r2 * 2]
    movu        [r0 + 64], m1
    movu        m2, [r1 + r3]
    movu        [r0 + 96], m2
    packsswb    m1, m2
    pminub      m1, m3

    paddb       m0, m1
    paddb       m4, m0
    add         r0, 128
    lea         r1, [r1 + 4 * r2]
    dec         r4d
    jnz         .loop

    ; get count
    xorpd       m0, m0
    vextracti128 xm1, m4, 1
    paddb       xm4, xm1
    psadbw      xm4, xm0
    movhlps     xm1, xm4
    paddd       xm4, xm1
    movd        eax, xm4
    RET

;--------------------------------------------------------------------------------------
; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
;--------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal copy_cnt_32, 3, 4, 6
    add         r2d, r2d
    mov         r3d, 16
    pxor        m4, m4
    pxor        m5, m5

.loop:
    ; row 0
    movu        m0, [r1]
    movu        m1, [r1 + 16]
    movu        [r0], m0
    movu        [r0 + 16], m1
    packsswb    m0, m1
    pcmpeqb     m0, m4
    paddb       m5, m0

    movu        m0, [r1 + 32]
    movu        m1, [r1 + 48]
    movu        [r0 + 32], m0
    movu        [r0 + 48], m1
    packsswb    m0, m1
    pcmpeqb     m0, m4
    paddb       m5, m0

    ; row 1
    movu        m0, [r1 + r2]
    movu        m1, [r1 + r2 + 16]
    movu        [r0 + 64], m0
    movu        [r0 + 80], m1
    packsswb    m0, m1
    pcmpeqb     m0, m4
    paddb       m5, m0

    movu        m0, [r1 + r2 + 32]
    movu        m1, [r1 + r2 + 48]
    movu        [r0 + 96], m0
    movu        [r0 + 112], m1
    packsswb    m0, m1
    pcmpeqb     m0, m4
    paddb       m5, m0

    add         r0, 128
    lea         r1, [r1 + 2 * r2]
    dec         r3d
    jnz         .loop

    ; get count
    mova        m0, [pb_64]
    paddb       m5, m0
    psadbw      m5, m4
    pshufd      m0, m5, 2
    paddw       m5, m0
    movd        eax, m5
    RET

INIT_YMM avx2
cglobal copy_cnt_32, 3, 5, 5
    add         r2d, r2d
    mov         r3d, 32/2
    mova        m3, [pb_1]
    xorpd       m4, m4

.loop:
    ; row 0
    movu        m0, [r1]
    movu        [r0], m0
    movu        m1, [r1 + 32]
    movu        [r0 + 32], m1
    packsswb    m0, m1
    pminub      m0, m3

    ; row 1
    movu        m1, [r1 + r2]
    movu        [r0 + 64], m1
    movu        m2, [r1 + r2 + 32]
    movu        [r0 + 96], m2
    packsswb    m1, m2
    pminub      m1, m3

    paddb       m0, m1
    paddb       m4, m0
    add         r0, 128
    lea         r1, [r1 + 2 * r2]
    dec         r3d
    jnz         .loop

    ; get count
    xorpd       m0, m0
    vextracti128 xm1, m4, 1
    paddb       xm4, xm1
    psadbw      xm4, xm0
    movhlps     xm1, xm4
    paddd       xm4, xm1
    movd        eax, xm4
    RET

;--------------------------------------------------------------------------------------
; copy_cnt avx512 code start
;--------------------------------------------------------------------------------------
%macro PROCESS_COPY_CNT_32x4_AVX512 0
    movu        m0, [r1]
    movu        m1, [r1 + r2]
    movu        [r0], m0
    movu        [r0 + mmsize], m1
    packsswb    m0, m1
    pminub      m0, m3

    movu        m1, [r1 + 2 * r2]
    movu        m2, [r1 + r3]
    movu        [r0 + 2 * mmsize], m1
    movu        [r0 + 3 * mmsize], m2
    packsswb    m1, m2
    pminub      m1, m3

    paddb       m0, m1
    paddb       m4, m0
%endmacro

%macro PROCESS_COPY_CNT_16x4_AVX512 0
    movu        ym0, [r1]
    vinserti32x8 m0, [r1 + r2], 1
    movu        ym1, [r1 + 2 * r2]
    vinserti32x8 m1, [r1 + r3], 1
    movu        [r0], m0
    movu        [r0 + mmsize], m1
    packsswb    m0, m1
    pminub      m0, m3
    paddb       m4, m0
%endmacro

%macro PROCESS_COPY_CNT_END_AVX512 0
    pxor        m0, m0
    vextracti32x8 ym1, m4, 1
    paddb       ym4, ym1
    vextracti32x4 xm1, ym4, 1
    paddb       xm4, xm1
    psadbw      xm4, xm0
    movhlps     xm1, xm4
    paddd       xm4, xm1
    movd        eax, xm4
%endmacro

;--------------------------------------------------------------------------------------
; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
;--------------------------------------------------------------------------------------
INIT_ZMM avx512
cglobal copy_cnt_32, 3, 4, 5
    add         r2d, r2d
    lea         r3, [3 * r2]
    vbroadcasti32x8 m3, [pb_1]
    pxor        m4, m4

%rep 7
    PROCESS_COPY_CNT_32x4_AVX512
    add         r0, 4 * mmsize
    lea         r1, [r1 + 4 * r2]
%endrep
    PROCESS_COPY_CNT_32x4_AVX512
    PROCESS_COPY_CNT_END_AVX512
    RET
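; For the 16x16 block a row is only 32 bytes, so the AVX-512 kernel below packs two
; rows into each ZMM register with vinserti32x8; a single PROCESS_COPY_CNT_16x4_AVX512
; expansion therefore copies and counts four rows.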
INIT_ZMM avx512
cglobal copy_cnt_16, 3, 4, 5
    add         r2d, r2d
    lea         r3, [3 * r2]
    vbroadcasti32x8 m3, [pb_1]
    pxor        m4, m4

%rep 3
    PROCESS_COPY_CNT_16x4_AVX512
    add         r0, 2 * mmsize
    lea         r1, [r1 + 4 * r2]
%endrep
    PROCESS_COPY_CNT_16x4_AVX512
    PROCESS_COPY_CNT_END_AVX512
    RET
;--------------------------------------------------------------------------------------
; copy_cnt avx512 code end
;--------------------------------------------------------------------------------------

;--------------------------------------------------------------------------------------
; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy2Dto1D_shl_4, 4, 4, 4
    add         r2d, r2d
    movd        m0, r3d

    ; register alloc
    ; r0 - dst
    ; r1 - src
    ; r2 - srcStride
    ; m0 - shift

    ; Row 0-3
    movh        m2, [r1]
    movhps      m2, [r1 + r2]
    lea         r1, [r1 + r2 * 2]
    movh        m3, [r1]
    movhps      m3, [r1 + r2]
    psllw       m2, m0
    psllw       m3, m0
    mova        [r0 + 0 * mmsize], m2
    mova        [r0 + 1 * mmsize], m3
    RET

;--------------------------------------------------------------------------------------
; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy2Dto1D_shl_8, 4, 5, 4
    add         r2d, r2d
    movd        m0, r3d
    mov         r3d, 8/4
    lea         r4, [r2 * 3]

    ; register alloc
    ; r0 - dst
    ; r1 - src
    ; r2 - srcStride
    ; r3 - loop counter
    ; r4 - stride * 3
    ; m0 - shift

.loop:
    ; Row 0, 1
    mova        m2, [r1]
    mova        m3, [r1 + r2]
    psllw       m2, m0
    psllw       m3, m0
    mova        [r0 + 0 * mmsize], m2
    mova        [r0 + 1 * mmsize], m3

    ; Row 2, 3
    mova        m2, [r1 + r2 * 2]
    mova        m3, [r1 + r4]
    psllw       m2, m0
    psllw       m3, m0
    mova        [r0 + 2 * mmsize], m2
    mova        [r0 + 3 * mmsize], m3

    add         r0, 4 * mmsize
    lea         r1, [r1 + r2 * 4]
    dec         r3d
    jnz         .loop
    RET

;--------------------------------------------------------------------------------------
; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;--------------------------------------------------------------------------------------
INIT_YMM avx2
cglobal cpy2Dto1D_shl_8, 4, 5, 2
    add         r2d, r2d
    movd        xm0, r3d
    lea         r4, [3 * r2]

    ; Row 0, 1
    movu        xm1, [r1]
    vinserti128 m1, m1, [r1 + r2], 1
    psllw       m1, xm0
    movu        [r0], m1

    ; Row 2, 3
    movu        xm1, [r1 + 2 * r2]
    vinserti128 m1, m1, [r1 + r4], 1
    psllw       m1, xm0
    movu        [r0 + 32], m1

    lea         r1, [r1 + 4 * r2]

    ; Row 4, 5
    movu        xm1, [r1]
    vinserti128 m1, m1, [r1 + r2], 1
    psllw       m1, xm0
    movu        [r0 + 64], m1

    ; Row 6, 7
    movu        xm1, [r1 + 2 * r2]
    vinserti128 m1, m1, [r1 + r4], 1
    psllw       m1, xm0
    movu        [r0 + 96], m1
    RET

;--------------------------------------------------------------------------------------
; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy2Dto1D_shl_16, 4, 4, 4
    add         r2d, r2d
    movd        m0, r3d
    mov         r3d, 16/2

    ; register alloc
    ; r0 - dst
    ; r1 - src
    ; r2 - srcStride
    ; r3 - loop counter
    ; m0 - shift

.loop:
    ; Row 0
    mova        m2, [r1 + 0 * mmsize]
    mova        m3, [r1 + 1 * mmsize]
    psllw       m2, m0
    psllw       m3, m0
    mova        [r0 + 0 * mmsize], m2
    mova        [r0 + 1 * mmsize], m3

    ; Row 1
    mova        m2, [r1 + r2 + 0 * mmsize]
    mova        m3, [r1 + r2 + 1 * mmsize]
    psllw       m2, m0
    psllw       m3, m0
    mova        [r0 + 2 * mmsize], m2
    mova        [r0 + 3 * mmsize], m3

    add         r0, 4 * mmsize
    lea         r1, [r1 + r2 * 2]
    dec         r3d
    jnz         .loop
    RET
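; A minimal scalar sketch of the cpy2Dto1D_shl primitive implemented by the variants
; above and below (reference only; N is the block width encoded in the function name):
;
;   void cpy2Dto1D_shl_N(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
;   {
;       for (int y = 0; y < N; y++)
;       {
;           for (int x = 0; x < N; x++)
;               dst[x] = src[x] << shift;
;           src += srcStride;
;           dst += N;
;       }
;   }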
;--------------------------------------------------------------------------------------
; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;--------------------------------------------------------------------------------------
INIT_YMM avx2
cglobal cpy2Dto1D_shl_16, 3, 5, 3
    add         r2d, r2d
    movd        xm0, r3m
    mov         r3d, 16/4
    lea         r4, [r2 * 3]

.loop:
    ; Row 0-1
    movu        m1, [r1]
    movu        m2, [r1 + r2]
    psllw       m1, xm0
    psllw       m2, xm0
    movu        [r0 + 0 * mmsize], m1
    movu        [r0 + 1 * mmsize], m2

    ; Row 2-3
    movu        m1, [r1 + 2 * r2]
    movu        m2, [r1 + r4]
    psllw       m1, xm0
    psllw       m2, xm0
    movu        [r0 + 2 * mmsize], m1
    movu        [r0 + 3 * mmsize], m2

    add         r0, 4 * mmsize
    lea         r1, [r1 + r2 * 4]
    dec         r3d
    jnz         .loop
    RET

;--------------------------------------------------------------------------------------
; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy2Dto1D_shl_32, 4, 4, 6
    add         r2d, r2d
    movd        m0, r3d
    mov         r3d, 32/1

    ; register alloc
    ; r0 - dst
    ; r1 - src
    ; r2 - srcStride
    ; r3 - loop counter
    ; m0 - shift

.loop:
    ; Row 0
    mova        m2, [r1 + 0 * mmsize]
    mova        m3, [r1 + 1 * mmsize]
    mova        m4, [r1 + 2 * mmsize]
    mova        m5, [r1 + 3 * mmsize]
    psllw       m2, m0
    psllw       m3, m0
    psllw       m4, m0
    psllw       m5, m0
    mova        [r0 + 0 * mmsize], m2
    mova        [r0 + 1 * mmsize], m3
    mova        [r0 + 2 * mmsize], m4
    mova        [r0 + 3 * mmsize], m5

    add         r0, 4 * mmsize
    add         r1, r2
    dec         r3d
    jnz         .loop
    RET

;--------------------------------------------------------------------------------------
; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;--------------------------------------------------------------------------------------
INIT_YMM avx2
cglobal cpy2Dto1D_shl_32, 3, 5, 5
    add         r2d, r2d
    movd        xm0, r3m
    mov         r3d, 32/4
    lea         r4, [3 * r2]

.loop:
    ; Row 0-1
    movu        m1, [r1]
    movu        m2, [r1 + 32]
    movu        m3, [r1 + r2]
    movu        m4, [r1 + r2 + 32]
    psllw       m1, xm0
    psllw       m2, xm0
    psllw       m3, xm0
    psllw       m4, xm0
    movu        [r0], m1
    movu        [r0 + mmsize], m2
    movu        [r0 + 2 * mmsize], m3
    movu        [r0 + 3 * mmsize], m4

    ; Row 2-3
    movu        m1, [r1 + 2 * r2]
    movu        m2, [r1 + 2 * r2 + 32]
    movu        m3, [r1 + r4]
    movu        m4, [r1 + r4 + 32]
    psllw       m1, xm0
    psllw       m2, xm0
    psllw       m3, xm0
    psllw       m4, xm0
    movu        [r0 + 4 * mmsize], m1
    movu        [r0 + 5 * mmsize], m2
    movu        [r0 + 6 * mmsize], m3
    movu        [r0 + 7 * mmsize], m4

    add         r0, 8 * mmsize
    lea         r1, [r1 + r2 * 4]
    dec         r3d
    jnz         .loop
    RET

;--------------------------------------------------------------------------------------
; cpy2Dto1D_shl avx512 code start
;--------------------------------------------------------------------------------------
%macro PROCESS_CPY2Dto1D_SHL_16x8_AVX512 0
    movu        m1, [r1]
    vinserti32x8 m1, [r1 + r2], 1
    movu        m2, [r1 + 2 * r2]
    vinserti32x8 m2, [r1 + r3], 1
    psllw       m1, xm0
    psllw       m2, xm0
    movu        [r0], m1
    movu        [r0 + mmsize], m2

    add         r0, 2 * mmsize
    lea         r1, [r1 + r2 * 4]

    movu        m1, [r1]
    vinserti32x8 m1, [r1 + r2], 1
    movu        m2, [r1 + 2 * r2]
    vinserti32x8 m2, [r1 + r3], 1
    psllw       m1, xm0
    psllw       m2, xm0
    movu        [r0], m1
    movu        [r0 + mmsize], m2
%endmacro

%macro PROCESS_CPY2Dto1D_SHL_32x8_AVX512 0
    movu        m1, [r1]
    movu        m2, [r1 + r2]
    movu        m3, [r1 + 2 * r2]
    movu        m4, [r1 + r3]
    psllw       m1, xm0
    psllw       m2, xm0
    psllw       m3, xm0
    psllw       m4, xm0
    movu        [r0], m1
    movu        [r0 + mmsize], m2
    movu        [r0 + 2 * mmsize], m3
    movu        [r0 + 3 * mmsize], m4

    add         r0, 4 * mmsize
    lea         r1, [r1 + r2 * 4]

    movu        m1, [r1]
    movu        m2, [r1 + r2]
    movu        m3, [r1 + 2 * r2]
    movu        m4, [r1 + r3]
    psllw       m1, xm0
    psllw       m2, xm0
    psllw       m3, xm0
    psllw       m4, xm0
    movu        [r0], m1
    movu        [r0 + mmsize], m2
    movu        [r0 + 2 * mmsize], m3
    movu        [r0 + 3 * mmsize], m4
%endmacro
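; Each PROCESS_CPY2Dto1D_SHL_*x8_AVX512 expansion above covers eight rows (two groups
; of four around an internal pointer bump), so the AVX-512 entry points below only
; have to repeat the macro and advance the pointers between expansions.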
;--------------------------------------------------------------------------------------
; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;--------------------------------------------------------------------------------------
INIT_ZMM avx512
cglobal cpy2Dto1D_shl_32, 4, 4, 5
    add         r2d, r2d
    movd        xm0, r3d
    lea         r3, [3 * r2]

    PROCESS_CPY2Dto1D_SHL_32x8_AVX512
    add         r0, 4 * mmsize
    lea         r1, [r1 + r2 * 4]
    PROCESS_CPY2Dto1D_SHL_32x8_AVX512
    add         r0, 4 * mmsize
    lea         r1, [r1 + r2 * 4]
    PROCESS_CPY2Dto1D_SHL_32x8_AVX512
    add         r0, 4 * mmsize
    lea         r1, [r1 + r2 * 4]
    PROCESS_CPY2Dto1D_SHL_32x8_AVX512
    RET

INIT_ZMM avx512
cglobal cpy2Dto1D_shl_16, 4, 4, 3
    add         r2d, r2d
    movd        xm0, r3d
    lea         r3, [3 * r2]

    PROCESS_CPY2Dto1D_SHL_16x8_AVX512
    add         r0, 2 * mmsize
    lea         r1, [r1 + r2 * 4]
    PROCESS_CPY2Dto1D_SHL_16x8_AVX512
    RET
;--------------------------------------------------------------------------------------
; cpy2Dto1D_shl avx512 code end
;--------------------------------------------------------------------------------------

;--------------------------------------------------------------------------------------
; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy1Dto2D_shr_4, 3, 3, 4
    add         r2d, r2d
    movd        m0, r3m
    pcmpeqw     m1, m1
    psllw       m1, m0
    psraw       m1, 1               ; m1 = -(1 << (shift - 1)) in every word lane

    ; Row 0-3
    mova        m2, [r1 + 0 * mmsize]
    mova        m3, [r1 + 1 * mmsize]
    psubw       m2, m1              ; x + (1 << (shift - 1))
    psubw       m3, m1
    psraw       m2, m0
    psraw       m3, m0
    movh        [r0], m2
    movhps      [r0 + r2], m2
    movh        [r0 + r2 * 2], m3
    lea         r2, [r2 * 3]
    movhps      [r0 + r2], m3
    RET

INIT_YMM avx2
cglobal cpy1Dto2D_shr_4, 3, 3, 3
    add         r2d, r2d
    movd        xm0, r3m
    pcmpeqw     m1, m1
    psllw       m1, xm0
    psraw       m1, 1

    ; Row 0-3
    movu        m2, [r1]
    psubw       m2, m1
    psraw       m2, xm0
    vextracti128 xm1, m2, 1
    movq        [r0], xm2
    movhps      [r0 + r2], xm2
    lea         r0, [r0 + r2 * 2]
    movq        [r0], xm1
    movhps      [r0 + r2], xm1
    RET

;--------------------------------------------------------------------------------------
; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy1Dto2D_shr_8, 3, 4, 6
    add         r2d, r2d
    movd        m0, r3m
    pcmpeqw     m1, m1
    psllw       m1, m0
    psraw       m1, 1
    lea         r3, [r2 * 3]

    ; Row 0-3
    mova        m2, [r1 + 0 * mmsize]
    mova        m3, [r1 + 1 * mmsize]
    mova        m4, [r1 + 2 * mmsize]
    mova        m5, [r1 + 3 * mmsize]
    psubw       m2, m1
    psubw       m3, m1
    psubw       m4, m1
    psubw       m5, m1
    psraw       m2, m0
    psraw       m3, m0
    psraw       m4, m0
    psraw       m5, m0
    mova        [r0], m2
    mova        [r0 + r2], m3
    mova        [r0 + r2 * 2], m4
    mova        [r0 + r3], m5

    ; Row 4-7
    mova        m2, [r1 + 4 * mmsize]
    mova        m3, [r1 + 5 * mmsize]
    mova        m4, [r1 + 6 * mmsize]
    mova        m5, [r1 + 7 * mmsize]
    lea         r0, [r0 + r2 * 4]
    psubw       m2, m1
    psubw       m3, m1
    psubw       m4, m1
    psubw       m5, m1
    psraw       m2, m0
    psraw       m3, m0
    psraw       m4, m0
    psraw       m5, m0
    mova        [r0], m2
    mova        [r0 + r2], m3
    mova        [r0 + r2 * 2], m4
    mova        [r0 + r3], m5
    RET

INIT_YMM avx2
cglobal cpy1Dto2D_shr_8, 3, 4, 4
    add         r2d, r2d
    movd        xm0, r3m
    pcmpeqw     m1, m1
    psllw       m1, xm0
    psraw       m1, 1
    lea         r3, [r2 * 3]

    ; Row 0-3
    movu        m2, [r1 + 0 * mmsize]
    movu        m3, [r1 + 1 * mmsize]
    psubw       m2, m1
    psubw       m3, m1
    psraw       m2, xm0
    psraw       m3, xm0
    movu        [r0], xm2
    vextracti128 [r0 + r2], m2, 1
    movu        [r0 + r2 * 2], xm3
    vextracti128 [r0 + r3], m3, 1

    ; Row 4-7
    movu        m2, [r1 + 2 * mmsize]
    movu        m3, [r1 + 3 * mmsize]
    lea         r0, [r0 + r2 * 4]
    psubw       m2, m1
    psubw       m3, m1
    psraw       m2, xm0
    psraw       m3, xm0
    movu        [r0], xm2
    vextracti128 [r0 + r2], m2, 1
    movu        [r0 + r2 * 2], xm3
    vextracti128 [r0 + r3], m3, 1
    RET
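; A minimal scalar sketch of the rounding shift performed by every cpy1Dto2D_shr
; variant in this section (reference only; N is the block width in the function name):
;
;   void cpy1Dto2D_shr_N(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;   {
;       int16_t round = 1 << (shift - 1);
;       for (int y = 0; y < N; y++)
;       {
;           for (int x = 0; x < N; x++)
;               dst[x] = (src[x] + round) >> shift;
;           src += N;
;           dst += dstStride;
;       }
;   }
;
; The SIMD code materializes -round via pcmpeqw/psllw/psraw so that the bias can be
; applied with psubw before the arithmetic right shift; the construction requires
; shift >= 1.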
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy1Dto2D_shr_16, 3, 5, 6
    add         r2d, r2d
    movd        m0, r3m
    pcmpeqw     m1, m1
    psllw       m1, m0
    psraw       m1, 1
    mov         r3d, 16/4
    lea         r4, [r2 * 3]

.loop:
    ; Row 0-1
    mova        m2, [r1 + 0 * mmsize]
    mova        m3, [r1 + 1 * mmsize]
    mova        m4, [r1 + 2 * mmsize]
    mova        m5, [r1 + 3 * mmsize]
    psubw       m2, m1
    psubw       m3, m1
    psubw       m4, m1
    psubw       m5, m1
    psraw       m2, m0
    psraw       m3, m0
    psraw       m4, m0
    psraw       m5, m0
    mova        [r0], m2
    mova        [r0 + mmsize], m3
    mova        [r0 + r2], m4
    mova        [r0 + r2 + mmsize], m5

    ; Row 2-3
    mova        m2, [r1 + 4 * mmsize]
    mova        m3, [r1 + 5 * mmsize]
    mova        m4, [r1 + 6 * mmsize]
    mova        m5, [r1 + 7 * mmsize]
    psubw       m2, m1
    psubw       m3, m1
    psubw       m4, m1
    psubw       m5, m1
    psraw       m2, m0
    psraw       m3, m0
    psraw       m4, m0
    psraw       m5, m0
    mova        [r0 + r2 * 2], m2
    mova        [r0 + r2 * 2 + mmsize], m3
    mova        [r0 + r4], m4
    mova        [r0 + r4 + mmsize], m5

    add         r1, 8 * mmsize
    lea         r0, [r0 + r2 * 4]
    dec         r3d
    jnz         .loop
    RET

INIT_YMM avx2
cglobal cpy1Dto2D_shr_16, 3, 5, 4
    add         r2d, r2d
    movd        xm0, r3m
    pcmpeqw     m1, m1
    psllw       m1, xm0
    psraw       m1, 1
    mov         r3d, 16/4
    lea         r4, [r2 * 3]

.loop:
    ; Row 0-1
    movu        m2, [r1 + 0 * mmsize]
    movu        m3, [r1 + 1 * mmsize]
    psubw       m2, m1
    psubw       m3, m1
    psraw       m2, xm0
    psraw       m3, xm0
    movu        [r0], m2
    movu        [r0 + r2], m3

    ; Row 2-3
    movu        m2, [r1 + 2 * mmsize]
    movu        m3, [r1 + 3 * mmsize]
    psubw       m2, m1
    psubw       m3, m1
    psraw       m2, xm0
    psraw       m3, xm0
    movu        [r0 + r2 * 2], m2
    movu        [r0 + r4], m3

    add         r1, 4 * mmsize
    lea         r0, [r0 + r2 * 4]
    dec         r3d
    jnz         .loop
    RET

INIT_ZMM avx512
cglobal cpy1Dto2D_shr_16, 3, 5, 4
    shl         r2d, 1
    movd        xm0, r3m
    pcmpeqw     xmm1, xmm1
    psllw       xm1, xmm1, xm0
    psraw       xm1, 1
    vpbroadcastw m1, xm1
    mov         r3d, 4
    lea         r4, [r2 * 3]

.loop:
    ; Row 0-1
    movu        m2, [r1]
    psubw       m2, m1
    psraw       m2, xm0
    movu        [r0], ym2
    vextracti32x8 [r0 + r2], m2, 1

    ; Row 2-3
    movu        m2, [r1 + mmsize]
    psubw       m2, m1
    psraw       m2, xm0
    movu        [r0 + r2 * 2], ym2
    vextracti32x8 [r0 + r4], m2, 1

    add         r1, 2 * mmsize
    lea         r0, [r0 + r2 * 4]
    dec         r3d
    jnz         .loop
    RET

;--------------------------------------------------------------------------------------
; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy1Dto2D_shr_32, 3, 4, 6
    add         r2d, r2d
    movd        m0, r3m
    pcmpeqw     m1, m1
    psllw       m1, m0
    psraw       m1, 1
    mov         r3d, 32/2

.loop:
    ; Row 0
    mova        m2, [r1 + 0 * mmsize]
    mova        m3, [r1 + 1 * mmsize]
    mova        m4, [r1 + 2 * mmsize]
    mova        m5, [r1 + 3 * mmsize]
    psubw       m2, m1
    psubw       m3, m1
    psubw       m4, m1
    psubw       m5, m1
    psraw       m2, m0
    psraw       m3, m0
    psraw       m4, m0
    psraw       m5, m0
    mova        [r0 + 0 * mmsize], m2
    mova        [r0 + 1 * mmsize], m3
    mova        [r0 + 2 * mmsize], m4
    mova        [r0 + 3 * mmsize], m5

    ; Row 1
    mova        m2, [r1 + 4 * mmsize]
    mova        m3, [r1 + 5 * mmsize]
    mova        m4, [r1 + 6 * mmsize]
    mova        m5, [r1 + 7 * mmsize]
    psubw       m2, m1
    psubw       m3, m1
    psubw       m4, m1
    psubw       m5, m1
    psraw       m2, m0
    psraw       m3, m0
    psraw       m4, m0
    psraw       m5, m0
    mova        [r0 + r2 + 0 * mmsize], m2
    mova        [r0 + r2 + 1 * mmsize], m3
    mova        [r0 + r2 + 2 * mmsize], m4
    mova        [r0 + r2 + 3 * mmsize], m5

    add         r1, 8 * mmsize
    lea         r0, [r0 + r2 * 2]
    dec         r3d
    jnz         .loop
    RET

INIT_YMM avx2
cglobal cpy1Dto2D_shr_32, 3, 4, 6
    add         r2d, r2d
    movd        xm0, r3m
    pcmpeqw     m1, m1
    psllw       m1, xm0
    psraw       m1, 1
    mov         r3d, 32/2

.loop:
    ; Row 0-1
    movu        m2, [r1 + 0 * mmsize]
    movu        m3, [r1 + 1 * mmsize]
    movu        m4, [r1 + 2 * mmsize]
    movu        m5, [r1 + 3 * mmsize]
    psubw       m2, m1
    psubw       m3, m1
    psubw       m4, m1
    psubw       m5, m1
    psraw       m2, xm0
    psraw       m3, xm0
    psraw       m4, xm0
    psraw       m5, xm0
    movu        [r0], m2
    movu        [r0 + mmsize], m3
    movu        [r0 + r2], m4
    movu        [r0 + r2 + mmsize], m5

    add         r1, 4 * mmsize
    lea         r0, [r0 + r2 * 2]
    dec         r3d
    jnz         .loop
    RET
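; The AVX-512 version below builds the same -(1 << (shift - 1)) word in an XMM
; register, broadcasts it across the ZMM with vpbroadcastw, and then loads, rounds
; and stores two full 32-coefficient rows per loop iteration.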
INIT_ZMM avx512
cglobal cpy1Dto2D_shr_32, 3, 4, 6
    shl         r2d, 1
    movd        xm0, r3m
    pcmpeqw     xmm1, xmm1
    psllw       xm1, xmm1, xm0
    psraw       xm1, 1
    vpbroadcastw m1, xm1
    mov         r3d, 16

.loop:
    ; Row 0-1
    movu        m2, [r1]
    movu        m3, [r1 + mmsize]
    psubw       m2, m1
    psubw       m3, m1
    psraw       m2, xm0
    psraw       m3, xm0
    movu        [r0], m2
    movu        [r0 + r2], m3

    add         r1, 2 * mmsize
    lea         r0, [r0 + r2 * 2]
    dec         r3d
    jnz         .loop
    RET
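; Note: cpy1Dto2D_shr is the rounding counterpart of the cpy2Dto1D_shl family above:
; one direction shifts coefficients up while linearizing a strided block, the other
; shifts them back down (with rounding) while restoring the strided 2D layout.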