;******************************************************************************
;* VP9 motion compensation SIMD optimizations
;*
;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
;* Copyright (c) 2025 Two Orioles, LLC
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

cextern pw_256
cextern pw_64

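; The tap tables below are emitted in three layouts: F8_SSSE3_TAPS interleaves
; adjacent tap pairs as bytes (consumed pair-wise by pmaddubsw), F8_SSE2_TAPS
; broadcasts each tap across a register of words (consumed by pmullw), and
; F8_16BPP_TAPS interleaves tap pairs as words for the 16 bpp filters. All
; VP9 8-tap filters sum to 128, so outputs are computed as (sum + 64) >> 7.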
%macro F8_SSSE3_TAPS 8
times 16 db %1, %2
times 16 db %3, %4
times 16 db %5, %6
times 16 db %7, %8
%endmacro

%macro F8_SSE2_TAPS 8
times 8 dw %1
times 8 dw %2
times 8 dw %3
times 8 dw %4
times 8 dw %5
times 8 dw %6
times 8 dw %7
times 8 dw %8
%endmacro

%macro F8_16BPP_TAPS 8
times 8 dw %1, %2
times 8 dw %3, %4
times 8 dw %5, %6
times 8 dw %7, %8
%endmacro

%macro FILTER 0-1
%if %0 > 0
%1 %+ _smooth:
%endif
    ; smooth
    F8_TAPS -3, -1,  32,  64,  38,   1, -3,  0
    F8_TAPS -2, -2,  29,  63,  41,   2, -3,  0
    F8_TAPS -2, -2,  26,  63,  43,   4, -4,  0
    F8_TAPS -2, -3,  24,  62,  46,   5, -4,  0
    F8_TAPS -2, -3,  21,  60,  49,   7, -4,  0
    F8_TAPS -1, -4,  18,  59,  51,   9, -4,  0
    F8_TAPS -1, -4,  16,  57,  53,  12, -4, -1
    F8_TAPS -1, -4,  14,  55,  55,  14, -4, -1
    F8_TAPS -1, -4,  12,  53,  57,  16, -4, -1
    F8_TAPS  0, -4,   9,  51,  59,  18, -4, -1
    F8_TAPS  0, -4,   7,  49,  60,  21, -3, -2
    F8_TAPS  0, -4,   5,  46,  62,  24, -3, -2
    F8_TAPS  0, -4,   4,  43,  63,  26, -2, -2
    F8_TAPS  0, -3,   2,  41,  63,  29, -2, -2
    F8_TAPS  0, -3,   1,  38,  64,  32, -1, -3
%if %0 > 0
%1 %+ _regular:
%endif
    ; regular
    F8_TAPS  0,  1,  -5, 126,   8,  -3,  1,  0
    F8_TAPS -1,  3, -10, 122,  18,  -6,  2,  0
    F8_TAPS -1,  4, -13, 118,  27,  -9,  3, -1
    F8_TAPS -1,  4, -16, 112,  37, -11,  4, -1
    F8_TAPS -1,  5, -18, 105,  48, -14,  4, -1
    F8_TAPS -1,  5, -19,  97,  58, -16,  5, -1
    F8_TAPS -1,  6, -19,  88,  68, -18,  5, -1
    F8_TAPS -1,  6, -19,  78,  78, -19,  6, -1
    F8_TAPS -1,  5, -18,  68,  88, -19,  6, -1
    F8_TAPS -1,  5, -16,  58,  97, -19,  5, -1
    F8_TAPS -1,  4, -14,  48, 105, -18,  5, -1
    F8_TAPS -1,  4, -11,  37, 112, -16,  4, -1
    F8_TAPS -1,  3,  -9,  27, 118, -13,  4, -1
    F8_TAPS  0,  2,  -6,  18, 122, -10,  3, -1
    F8_TAPS  0,  1,  -3,   8, 126,  -5,  1,  0
%if %0 > 0
%1 %+ _sharp:
%endif
    ; sharp
    F8_TAPS -1,  3,  -7, 127,   8,  -3,  1,  0
    F8_TAPS -2,  5, -13, 125,  17,  -6,  3, -1
    F8_TAPS -3,  7, -17, 121,  27, -10,  5, -2
    F8_TAPS -4,  9, -20, 115,  37, -13,  6, -2
    F8_TAPS -4, 10, -23, 108,  48, -16,  8, -3
    F8_TAPS -4, 10, -24, 100,  59, -19,  9, -3
    F8_TAPS -4, 11, -24,  90,  70, -21, 10, -4
    F8_TAPS -4, 11, -23,  80,  80, -23, 11, -4
    F8_TAPS -4, 10, -21,  70,  90, -24, 11, -4
    F8_TAPS -3,  9, -19,  59, 100, -24, 10, -4
    F8_TAPS -3,  8, -16,  48, 108, -23, 10, -4
    F8_TAPS -2,  6, -13,  37, 115, -20,  9, -4
    F8_TAPS -2,  5, -10,  27, 121, -17,  7, -3
    F8_TAPS -1,  3,  -6,  17, 125, -13,  5, -2
    F8_TAPS  0,  1,  -3,   8, 127,  -7,  3, -1
%endmacro

%define F8_TAPS F8_SSSE3_TAPS
; int8_t ff_filters_ssse3[3][15][4][32]
const filters_ssse3
FILTER
%define F8_TAPS F8_SSE2_TAPS
; int16_t ff_filters_sse2[3][15][8][8]
const filters_sse2
FILTER
%define F8_TAPS F8_16BPP_TAPS
; int16_t ff_filters_16bpp[3][15][4][16]
const filters_16bpp
FILTER

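; Shuffle masks for the 4-wide SSSE3 horizontal filter: each mask gathers the
; overlapping source byte pairs for 4 output pixels, so a single pmaddubsw
; applies two taps to all 4 pixels at once. perm0 pairs bytes of [srcq-3] for
; taps 0-1 (low half) and 2-3 (high half); perm1 pairs bytes of [srcq+0] for
; taps 4-5 and 6-7.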
filter4_h_perm0: db 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6
filter4_h_perm1: db 1, 2, 2, 3, 3, 4, 4, 5, 3, 4, 4, 5, 5, 6, 6, 7

%if HAVE_AVX512ICL_EXTERNAL && ARCH_X86_64
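; vpermb/pshufb index tables for the AVX-512 code: spel_h_perm16 gathers the
; sliding 4-byte source windows for a pair of 16-pixel rows, spel_v_perm16/32
; interleave consecutive rows for pmaddubsw in the vertical filters, and the
; spel_hv_perm* tables reorder the packed horizontal intermediates for the
; vertical pass of the hv filters.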
ALIGN 64
spel_h_perm16:  db  0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
                db  8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
                db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38
                db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46
spel_v_perm16:  db 32,  0, 33,  1, 34,  2, 35,  3, 36,  4, 37,  5, 38,  6, 39,  7
                db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
                db 40, 16, 41, 17, 42, 18, 43, 19, 44, 20, 45, 21, 46, 22, 47, 23
                db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
spel_v_perm32:  db  0, 32,  1, 33,  2, 34,  3, 35,  4, 36,  5, 37,  6, 38,  7, 39
                db  8, 40,  9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47
                db 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55
                db 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63
spel_hv_perm4:  db 16, 32, 48,  8, 18, 34, 50, 10, 20, 36, 52, 12, 22, 38, 54, 14
                db 32, 48,  8, 24, 34, 50, 10, 26, 36, 52, 12, 28, 38, 54, 14, 30
                db 48,  8, 24, 40, 50, 10, 26, 42, 52, 12, 28, 44, 54, 14, 30, 46
                db  8, 24, 40, 56, 10, 26, 42, 58, 12, 28, 44, 60, 14, 30, 46, 62
spel_hv_perm8:  db 16, 32, 48,  8, 17, 33, 49,  9, 18, 34, 50, 10, 19, 35, 51, 11
                db 32, 48,  8, 24, 33, 49,  9, 25, 34, 50, 10, 26, 35, 51, 11, 27
                db 48,  8, 24, 40, 49,  9, 25, 41, 50, 10, 26, 42, 51, 11, 27, 43
                db  8, 24, 40, 56,  9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59
spel_hv_perm16: db 32,  8, 33,  9, 34, 10, 35, 11, 36, 12, 37, 13, 38, 14, 39, 15
                db  8, 40,  9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47
                db 48, 24, 49, 25, 50, 26, 51, 27, 52, 28, 53, 29, 54, 30, 55, 31
                db 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63
spel_h_shufB:   db  4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10

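; spel_h_shufA/B/C select the sliding 4-byte windows starting at bytes 0-3,
; 4-7 and 8-11 of each 16-byte lane; A and C are identical to the first two
; rows of spel_h_perm16, so they are aliased into it to save rodata.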
%define spel_h_shufA (spel_h_perm16+ 0)
%define spel_h_shufC (spel_h_perm16+16)

%macro F8_AVX512_TAPS 8
db %1, %2, %3, %4, %5, %6, %7, %8
%endmacro
%define F8_TAPS F8_AVX512_TAPS
FILTER vp9_spel_filter

pb_02461357:    db  0,  2,  4,  6,  1,  3,  5,  7
pd_64:          dd 64
pw_m33:         times 2 dw -33
pb_4:           times 4 db 4
%endif

SECTION .text

%macro filter_sse2_h_fn 1
%assign %%px mmsize/2
cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 15, dst, dstride, src, sstride, h, filtery
    pxor        m5, m5
    mova        m6, [pw_64]
    mova        m7, [filteryq+  0]
%if ARCH_X86_64
    mova        m8, [filteryq+ 16]
    mova        m9, [filteryq+ 32]
    mova       m10, [filteryq+ 48]
    mova       m11, [filteryq+ 64]
    mova       m12, [filteryq+ 80]
    mova       m13, [filteryq+ 96]
    mova       m14, [filteryq+112]
%endif
.loop:
    movh        m0, [srcq-3]
    movh        m1, [srcq-2]
    movh        m2, [srcq-1]
    movh        m3, [srcq+0]
    movh        m4, [srcq+1]
    punpcklbw   m0, m5
    punpcklbw   m1, m5
    punpcklbw   m2, m5
    punpcklbw   m3, m5
    punpcklbw   m4, m5
    pmullw      m0, m7
%if ARCH_X86_64
    pmullw      m1, m8
    pmullw      m2, m9
    pmullw      m3, m10
    pmullw      m4, m11
%else
    pmullw      m1, [filteryq+ 16]
    pmullw      m2, [filteryq+ 32]
    pmullw      m3, [filteryq+ 48]
    pmullw      m4, [filteryq+ 64]
%endif
    paddw       m0, m1
    paddw       m2, m3
    paddw       m0, m4
    movh        m1, [srcq+2]
    movh        m3, [srcq+3]
    movh        m4, [srcq+4]
    add       srcq, sstrideq
    punpcklbw   m1, m5
    punpcklbw   m3, m5
    punpcklbw   m4, m5
%if ARCH_X86_64
    pmullw      m1, m12
    pmullw      m3, m13
    pmullw      m4, m14
%else
    pmullw      m1, [filteryq+ 80]
    pmullw      m3, [filteryq+ 96]
    pmullw      m4, [filteryq+112]
%endif
    paddw       m0, m1
    paddw       m3, m4
    paddw       m0, m6
    paddw       m2, m3
    paddsw      m0, m2
    psraw       m0, 7
%ifidn %1, avg
    movh        m1, [dstq]
%endif
    packuswb    m0, m0
%ifidn %1, avg
    pavgb       m0, m1
%endif
    movh    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro

INIT_XMM sse2
filter_sse2_h_fn put
filter_sse2_h_fn avg

%macro filter4_h_fn 2
cglobal vp9_%1_8tap_1d_h_4_8, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    mova        m2, [filter4_h_perm0]
    mova        m3, [filter4_h_perm1]
    pcmpeqw     m4, m4
    movu        m5, [filteryq+24]
    movu        m6, [filteryq+88]
    psllw       m4, 6   ; pw_m64
.loop:
    movq        m0, [srcq-3]
    movq        m1, [srcq+0]
    pshufb      m0, m2
    pshufb      m1, m3
    pmaddubsw   m0, m5
    pmaddubsw   m1, m6
%ifidn %1, avg
    movd        m7, [dstq]
%endif
    add       srcq, sstrideq
    paddw       m0, m1
    movhlps     m1, m0
    psubw       m0, m4
    paddsw      m0, m1
    psraw       m0, 7
    packuswb    m0, m0
%ifidn %1, avg
    pavgb       m0, m7
%endif
    movd    [dstq], m0
    add       dstq, dstrideq
    sub         hd, 1
    jg .loop
    RET
%endmacro

INIT_XMM ssse3
filter4_h_fn put, 7
filter4_h_fn avg, 8

%macro filter_h_fn 1
%assign %%px mmsize/2
cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 11, dst, dstride, src, sstride, h, filtery
    mova        m6, [pw_256]
    mova        m7, [filteryq+ 0]
%ifdef m8
    mova        m8, [filteryq+32]
    mova        m9, [filteryq+64]
    mova       m10, [filteryq+96]
%endif
.loop:
    movh        m0, [srcq-3]
    movh        m1, [srcq-2]
    movh        m2, [srcq-1]
    movh        m3, [srcq+0]
    movh        m4, [srcq+1]
    movh        m5, [srcq+2]
    punpcklbw   m0, m1
    punpcklbw   m2, m3
    movh        m1, [srcq+3]
    movh        m3, [srcq+4]
    add       srcq, sstrideq
    punpcklbw   m4, m5
    punpcklbw   m1, m3
    pmaddubsw   m0, m7
%ifdef m8
    pmaddubsw   m2, m8
    pmaddubsw   m4, m9
    pmaddubsw   m1, m10
%else
    pmaddubsw   m2, [filteryq+32]
    pmaddubsw   m4, [filteryq+64]
    pmaddubsw   m1, [filteryq+96]
%endif
    paddw       m0, m4
    paddw       m2, m1
    paddsw      m0, m2
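    ; pmulhrsw with pw_256: (x * 256 + 0x4000) >> 15 == (x + 64) >> 7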
    pmulhrsw    m0, m6
%ifidn %1, avg
    movh        m1, [dstq]
%endif
    packuswb    m0, m0
%ifidn %1, avg
    pavgb       m0, m1
%endif
    movh    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro

INIT_XMM ssse3
filter_h_fn put
filter_h_fn avg

%if ARCH_X86_64
%macro filter_hx2_fn 1
%assign %%px mmsize
cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 14, dst, dstride, src, sstride, h, filtery
    mova       m13, [pw_256]
    mova        m8, [filteryq+ 0]
    mova        m9, [filteryq+32]
    mova       m10, [filteryq+64]
    mova       m11, [filteryq+96]
.loop:
    movu        m0, [srcq-3]
    movu        m1, [srcq-2]
    movu        m2, [srcq-1]
    movu        m3, [srcq+0]
    movu        m4, [srcq+1]
    movu        m5, [srcq+2]
    movu        m6, [srcq+3]
    movu        m7, [srcq+4]
    add       srcq, sstrideq
    SBUTTERFLY  bw, 0, 1, 12
    SBUTTERFLY  bw, 2, 3, 12
    SBUTTERFLY  bw, 4, 5, 12
    SBUTTERFLY  bw, 6, 7, 12
    pmaddubsw   m0, m8
    pmaddubsw   m1, m8
    pmaddubsw   m2, m9
    pmaddubsw   m3, m9
    pmaddubsw   m4, m10
    pmaddubsw   m5, m10
    pmaddubsw   m6, m11
    pmaddubsw   m7, m11
    paddw       m0, m4
    paddw       m1, m5
    paddw       m2, m6
    paddw       m3, m7
    paddsw      m0, m2
    paddsw      m1, m3
    pmulhrsw    m0, m13
    pmulhrsw    m1, m13
    packuswb    m0, m1
%ifidn %1, avg
    pavgb       m0, [dstq]
%endif
    mova    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro

INIT_XMM ssse3
filter_hx2_fn put
filter_hx2_fn avg

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
filter_hx2_fn put
filter_hx2_fn avg
%endif

%endif ; ARCH_X86_64

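; The vertical filters below (except the 4-wide one) keep a second row pointer
; src4q = srcq + 4 * sstride, so all 8 source rows are reachable using only
; the stride, 2*stride and 3*stride offsets from two base registers.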
%macro filter_sse2_v_fn 1
%assign %%px mmsize/2
%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 15, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 15, dst, dstride, src, sstride, filtery, src4, sstride3
    mov   filteryq, r5mp
%define hd r4mp
%endif
    pxor        m5, m5
    mova        m6, [pw_64]
    lea  sstride3q, [sstrideq*3]
    lea      src4q, [srcq+sstrideq]
    sub       srcq, sstride3q
    mova        m7, [filteryq+  0]
%ifdef m8
    mova        m8, [filteryq+ 16]
    mova        m9, [filteryq+ 32]
    mova       m10, [filteryq+ 48]
    mova       m11, [filteryq+ 64]
    mova       m12, [filteryq+ 80]
    mova       m13, [filteryq+ 96]
    mova       m14, [filteryq+112]
%endif
.loop:
    ; FIXME maybe reuse loads from previous rows, or just
    ; more generally unroll this to prevent multiple loads of
    ; the same data?
    movh        m0, [srcq]
    movh        m1, [srcq+sstrideq]
    movh        m2, [srcq+sstrideq*2]
    movh        m3, [srcq+sstride3q]
    add       srcq, sstrideq
    movh        m4, [src4q]
    punpcklbw   m0, m5
    punpcklbw   m1, m5
    punpcklbw   m2, m5
    punpcklbw   m3, m5
    punpcklbw   m4, m5
    pmullw      m0, m7
%ifdef m8
    pmullw      m1, m8
    pmullw      m2, m9
    pmullw      m3, m10
    pmullw      m4, m11
%else
    pmullw      m1, [filteryq+ 16]
    pmullw      m2, [filteryq+ 32]
    pmullw      m3, [filteryq+ 48]
    pmullw      m4, [filteryq+ 64]
%endif
    paddw       m0, m1
    paddw       m2, m3
    paddw       m0, m4
    movh        m1, [src4q+sstrideq]
    movh        m3, [src4q+sstrideq*2]
    movh        m4, [src4q+sstride3q]
    add      src4q, sstrideq
    punpcklbw   m1, m5
    punpcklbw   m3, m5
    punpcklbw   m4, m5
%ifdef m8
    pmullw      m1, m12
    pmullw      m3, m13
    pmullw      m4, m14
%else
    pmullw      m1, [filteryq+ 80]
    pmullw      m3, [filteryq+ 96]
    pmullw      m4, [filteryq+112]
%endif
    paddw       m0, m1
    paddw       m3, m4
    paddw       m0, m6
    paddw       m2, m3
    paddsw      m0, m2
    psraw       m0, 7
%ifidn %1, avg
    movh        m1, [dstq]
%endif
    packuswb    m0, m0
%ifidn %1, avg
    pavgb       m0, m1
%endif
    movh    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro

INIT_XMM sse2
filter_sse2_v_fn put
filter_sse2_v_fn avg

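; 4-wide vertical filter: two output rows per iteration. The interleaved
; source pairs for both rows live in the low/high qwords of one register, so
; each pmaddubsw applies a tap pair to both rows simultaneously.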
%macro filter4_v_fn 1
%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_4_8, 6, 7, 8, dst, dstride, src, sstride, h, filtery, sstride3
%else
cglobal vp9_%1_8tap_1d_v_4_8, 4, 5, 8, dst, dstride, src, sstride, filtery
%define hd r4mp
%define sstride3q filteryq
%endif
    lea  sstride3q, [sstrideq*3]
    sub       srcq, sstride3q
    movd        m0, [srcq]
    movd        m1, [srcq+sstrideq]
    movd        m2, [srcq+sstrideq*2]
    movd        m3, [srcq+sstride3q]
    lea       srcq, [srcq+sstrideq*4]
    movd        m4, [srcq]
    movd        m5, [srcq+sstrideq]
    punpcklbw   m0, m1
    punpcklbw   m1, m2
    punpcklbw   m2, m3
    punpcklbw   m3, m4
    punpcklqdq  m0, m1
    movd        m1, [srcq+sstrideq*2]
    add       srcq, sstride3q
%if ARCH_X86_32
    mov   filteryq, r5mp
%endif
    punpcklqdq  m2, m3
    punpcklbw   m4, m5
    punpcklbw   m5, m1
    punpcklqdq  m4, m5
.loop:
    pmaddubsw   m0, [filteryq]
    movd        m3, [srcq]
    movd        m5, [srcq+sstrideq]
    pmaddubsw   m7, m4, [filteryq+64]
    pmaddubsw   m6, m2, [filteryq+32]
    punpcklbw   m1, m3
    punpcklbw   m3, m5
    punpcklqdq  m1, m3
    pmaddubsw   m3, m1, [filteryq+96]
    paddw       m0, [pw_64]
    lea       srcq, [srcq+2*sstrideq]
    paddw       m7, m0
    mova        m0, m2
    mova        m2, m4
%ifidn %1, avg
    movd        m4, [dstq]
%endif
    paddw       m6, m3
%ifidn %1, avg
    movd        m3, [dstq+dstrideq]
%endif
    paddsw      m6, m7
    psraw       m6, 7
    packuswb    m6, m6
    pshuflw     m7, m6, 0xE
%ifidn %1, avg
    pavgb       m6, m4
%endif
    movd    [dstq], m6
    mova        m4, m1
%ifidn %1, avg
    pavgb       m7, m3
%endif
    movd [dstq+dstrideq], m7
    lea       dstq, [dstq+2*dstrideq]
    mova        m1, m5
    sub         hd, 2
    jg .loop
    RET
%endmacro

%macro filter_v_fn 1
%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_8_8, 6, 8, 11, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_8_8, 4, 7, 11, dst, dstride, src, sstride, filtery, src4, sstride3
    mov   filteryq, r5mp
%define hd r4mp
%endif
    mova        m6, [pw_256]
    lea  sstride3q, [sstrideq*3]
    lea      src4q, [srcq+sstrideq]
    sub       srcq, sstride3q
    mova        m7, [filteryq+ 0]
%if ARCH_X86_64
    mova        m8, [filteryq+32]
    mova        m9, [filteryq+64]
    mova       m10, [filteryq+96]
%endif
.loop:
    ; FIXME maybe reuse loads from previous rows, or just more generally
    ; unroll this to prevent multiple loads of the same data?
    movh        m0, [srcq]
    movh        m1, [srcq+sstrideq]
    movh        m2, [srcq+sstrideq*2]
    movh        m3, [srcq+sstride3q]
    movh        m4, [src4q]
    movh        m5, [src4q+sstrideq]
    punpcklbw   m0, m1
    punpcklbw   m2, m3
    movh        m1, [src4q+sstrideq*2]
    movh        m3, [src4q+sstride3q]
    add       srcq, sstrideq
    add      src4q, sstrideq
    punpcklbw   m4, m5
    punpcklbw   m1, m3
    pmaddubsw   m0, m7
%if ARCH_X86_64
    pmaddubsw   m2, m8
    pmaddubsw   m4, m9
    pmaddubsw   m1, m10
%else
    pmaddubsw   m2, [filteryq+32]
    pmaddubsw   m4, [filteryq+64]
    pmaddubsw   m1, [filteryq+96]
%endif
    paddw       m0, m4
    paddw       m2, m1
    paddsw      m0, m2
    pmulhrsw    m0, m6
%ifidn %1, avg
    movh        m1, [dstq]
%endif
    packuswb    m0, m0
%ifidn %1, avg
    pavgb       m0, m1
%endif
    movh    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro

INIT_XMM ssse3
filter4_v_fn put
filter4_v_fn avg

INIT_XMM ssse3
filter_v_fn put
filter_v_fn avg

%if ARCH_X86_64

%macro filter_vx2_fn 1
%assign %%px mmsize
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 14, dst, dstride, src, sstride, h, filtery, src4, sstride3
    mova       m13, [pw_256]
    lea  sstride3q, [sstrideq*3]
    lea      src4q, [srcq+sstrideq]
    sub       srcq, sstride3q
    mova        m8, [filteryq+ 0]
    mova        m9, [filteryq+32]
    mova       m10, [filteryq+64]
    mova       m11, [filteryq+96]
.loop:
    ; FIXME maybe reuse loads from previous rows, or just
    ; more generally unroll this to prevent multiple loads of
    ; the same data?
    movu        m0, [srcq]
    movu        m1, [srcq+sstrideq]
    movu        m2, [srcq+sstrideq*2]
    movu        m3, [srcq+sstride3q]
    movu        m4, [src4q]
    movu        m5, [src4q+sstrideq]
    movu        m6, [src4q+sstrideq*2]
    movu        m7, [src4q+sstride3q]
    add       srcq, sstrideq
    add      src4q, sstrideq
    SBUTTERFLY  bw, 0, 1, 12
    SBUTTERFLY  bw, 2, 3, 12
    SBUTTERFLY  bw, 4, 5, 12
    SBUTTERFLY  bw, 6, 7, 12
    pmaddubsw   m0, m8
    pmaddubsw   m1, m8
    pmaddubsw   m2, m9
    pmaddubsw   m3, m9
    pmaddubsw   m4, m10
    pmaddubsw   m5, m10
    pmaddubsw   m6, m11
    pmaddubsw   m7, m11
    paddw       m0, m4
    paddw       m1, m5
    paddw       m2, m6
    paddw       m3, m7
    paddsw      m0, m2
    paddsw      m1, m3
    pmulhrsw    m0, m13
    pmulhrsw    m1, m13
    packuswb    m0, m1
%ifidn %1, avg
    pavgb       m0, [dstq]
%endif
    mova    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro

INIT_XMM ssse3
filter_vx2_fn put
filter_vx2_fn avg

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
filter_vx2_fn put
filter_vx2_fn avg
%endif

%endif ; ARCH_X86_64

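; Full-pel copy/avg. %1 = put/avg, %2 = block width in pixels, %3-%5 = offsets
; of the 2nd-4th load/store (row strides for narrow blocks, mmsize multiples
; within a row for wide ones), %6 = rows per iteration, %7 = bpp for avg
; (8 -> pavgb/_8 suffix, 16 -> pavgw/_16 suffix; put is bpp-agnostic),
; %8 = number of vector registers to declare (8 for the 128-wide xmm variants).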
%macro fpel_fn 6-8 0, 4
%if %2 == 4
%define %%srcfn movh
%define %%dstfn movh
%else
%define %%srcfn movu
%define %%dstfn mova
%endif

%if %7 == 8
%define %%pavg pavgb
%define %%szsuf _8
%elif %7 == 16
%define %%pavg pavgw
%define %%szsuf _16
%else
%define %%szsuf
%endif

%if %2 <= mmsize
cglobal vp9_%1%2 %+ %%szsuf, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3
    lea  sstride3q, [sstrideq*3]
    lea  dstride3q, [dstrideq*3]
%else
cglobal vp9_%1%2 %+ %%szsuf, 5, 5, %8, dst, dstride, src, sstride, h
%endif
.loop:
    %%srcfn     m0, [srcq]
    %%srcfn     m1, [srcq+s%3]
    %%srcfn     m2, [srcq+s%4]
    %%srcfn     m3, [srcq+s%5]
%if %2/mmsize == 8
    %%srcfn     m4, [srcq+mmsize*4]
    %%srcfn     m5, [srcq+mmsize*5]
    %%srcfn     m6, [srcq+mmsize*6]
    %%srcfn     m7, [srcq+mmsize*7]
%endif
    lea       srcq, [srcq+sstrideq*%6]
%ifidn %1, avg
    %%pavg      m0, [dstq]
    %%pavg      m1, [dstq+d%3]
    %%pavg      m2, [dstq+d%4]
%if %2 == 4
    %%srcfn     m4, [dstq+d%5]
    %%pavg      m3, m4
%else
    %%pavg      m3, [dstq+d%5]
%endif
%if %2/mmsize == 8
    %%pavg      m4, [dstq+mmsize*4]
    %%pavg      m5, [dstq+mmsize*5]
    %%pavg      m6, [dstq+mmsize*6]
    %%pavg      m7, [dstq+mmsize*7]
%endif
%endif
    %%dstfn [dstq], m0
    %%dstfn [dstq+d%3], m1
    %%dstfn [dstq+d%4], m2
    %%dstfn [dstq+d%5], m3
%if %2/mmsize == 8
    %%dstfn [dstq+mmsize*4], m4
    %%dstfn [dstq+mmsize*5], m5
    %%dstfn [dstq+mmsize*6], m6
    %%dstfn [dstq+mmsize*7], m7
%endif
    lea       dstq, [dstq+dstrideq*%6]
    sub         hd, %6
    jnz .loop
    RET
%endmacro

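; Helper defines so that the s%3/d%3 token pastes inside fpel_fn resolve to
; plain numbers when mmsize (16 or 32) is passed as an offset argument.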
%define d16 16
%define s16 16
%define d32 32
%define s32 32
INIT_MMX mmx
fpel_fn put, 4,  strideq, strideq*2, stride3q, 4
fpel_fn put, 8,  strideq, strideq*2, stride3q, 4
INIT_MMX mmxext
fpel_fn avg, 4,  strideq, strideq*2, stride3q, 4, 8
fpel_fn avg, 8,  strideq, strideq*2, stride3q, 4, 8
INIT_XMM sse
fpel_fn put, 16, strideq, strideq*2, stride3q, 4
fpel_fn put, 32, mmsize,  strideq,   strideq+mmsize, 2
fpel_fn put, 64, mmsize,  mmsize*2,  mmsize*3, 1
fpel_fn put, 128, mmsize, mmsize*2,  mmsize*3, 1, 0, 8
INIT_XMM sse2
fpel_fn avg, 16, strideq, strideq*2, stride3q, 4, 8
fpel_fn avg, 32, mmsize,  strideq,   strideq+mmsize, 2, 8
fpel_fn avg, 64, mmsize,  mmsize*2,  mmsize*3, 1, 8
INIT_YMM avx
fpel_fn put, 32, strideq, strideq*2, stride3q, 4
fpel_fn put, 64, mmsize,  strideq,   strideq+mmsize, 2
fpel_fn put, 128, mmsize, mmsize*2,     mmsize*3, 1
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
fpel_fn avg, 32, strideq, strideq*2, stride3q, 4, 8
fpel_fn avg, 64, mmsize,  strideq,   strideq+mmsize, 2, 8
%endif
INIT_MMX mmxext
fpel_fn avg,  8,  strideq, strideq*2, stride3q, 4, 16
INIT_XMM sse2
fpel_fn avg,  16, strideq, strideq*2, stride3q, 4, 16
fpel_fn avg,  32, mmsize,  strideq,   strideq+mmsize, 2, 16
fpel_fn avg,  64, mmsize,  mmsize*2,  mmsize*3, 1, 16
fpel_fn avg, 128, mmsize,  mmsize*2,  mmsize*3, 1, 16, 8
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
fpel_fn avg,  32, strideq, strideq*2, stride3q, 4, 16
fpel_fn avg,  64, mmsize,  strideq,   strideq+mmsize, 2, 16
fpel_fn avg, 128, mmsize,  mmsize*2,  mmsize*3, 1, 16
%endif
%undef s16
%undef d16
%undef s32
%undef d32

%if HAVE_AVX512ICL_EXTERNAL && ARCH_X86_64
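; Horizontal 8-tap for one register of source pixels: vpdpbusd accumulates
; taps 0-3 (m9) and taps 4-7 (m10) into dword sums pre-biased with 64 (m5),
; then the two accumulators are packed down to words and shifted right by 7.
; With %5 set, cross-lane vpermb gathers replace the in-lane pshufb shuffles.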
%macro PUT_8TAP_H 4-5 0 ; dst/src, tmp[1-3], vpermb
%if %5
    vpermb              m%2, m6, m%1
    vpermb              m%3, m7, m%1
    vpermb              m%4, m8, m%1
%else
%if %2 < %4 ; reuse the previous block's shufC output as this block's shufA (the two source loads are 8 bytes apart)
    pshufb              m%2, m%1, m6
%endif
    pshufb              m%3, m%1, m7
    pshufb              m%4, m%1, m8
%endif
    mova                m%1, m5
    vpdpbusd            m%1, m%2, m9
    mova                m%2, m5
    vpdpbusd            m%2, m%3, m9
    vpdpbusd            m%1, m%3, m10
    vpdpbusd            m%2, m%4, m10
    packusdw            m%1, m%2
    psrlw               m%1, 7
%endmacro

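; The smooth/sharp entry points only differ in the filter table base; they load
; it and tail-jump into the regular function's .main. The tables hold 8-byte
; tap rows for subpel phases 1-15 (phase 0 is full-pel), hence the -8 bias on
; the base pointer before indexing with mx/my * 8.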
%macro SPEL_H_INIT 2 ; put/avg, w
cglobal vp9_%1_8tap_smooth_%2h_8, 4, 7, 0
    lea                  r6, [vp9_spel_filter_smooth-8]
    jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_%2h_8 %+ SUFFIX).main
cglobal vp9_%1_8tap_sharp_%2h_8, 4, 7, 0
    lea                  r6, [vp9_spel_filter_sharp-8]
    jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_%2h_8 %+ SUFFIX).main
cglobal vp9_%1_8tap_regular_%2h_8, 4, 7, 0, dst, ds, src, ss, h, mx
    lea                  r6, [vp9_spel_filter_regular-8]
.main:
    mov                 mxd, mxm
    movifnidn            hd, hm
    sub                srcq, 3
    vpbroadcastd         m5, [pd_64]
    vpbroadcastd         m9, [r6+mxq*8+0]
    vpbroadcastd        m10, [r6+mxq*8+4]
%endmacro

%macro SPEL_V_INIT 2 ; put/avg, w
cglobal vp9_%1_8tap_smooth_%2v_8, 4, 7, 0
    lea                  r5, [vp9_spel_filter_smooth-8]
    jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_%2v_8 %+ SUFFIX).main
cglobal vp9_%1_8tap_sharp_%2v_8, 4, 7, 0
    lea                  r5, [vp9_spel_filter_sharp-8]
    jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_%2v_8 %+ SUFFIX).main
cglobal vp9_%1_8tap_regular_%2v_8, 4, 7, 0, dst, ds, src, ss, h, mx, my
    lea                  r5, [vp9_spel_filter_regular-8]
.main:
    mov                 myd, mym
    movifnidn            hd, hm
    lea                 myq, [r5+myq*8]
    vpbroadcastd         m7, [pw_256]
    vpbroadcastw         m8, [myq+0]
    vpbroadcastw         m9, [myq+2]
    lea                  r5, [ssq*3]
    vpbroadcastw        m10, [myq+4]
    sub                srcq, r5
    vpbroadcastw        m11, [myq+6]
%endmacro

%macro SPEL_HV_INIT 2 ; put/avg, w
cglobal vp9_%1_8tap_smooth_%2hv_8, 4, 8, 0
    lea                  r6, [vp9_spel_filter_smooth-8]
    jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_%2hv_8 %+ SUFFIX).main
cglobal vp9_%1_8tap_sharp_%2hv_8, 4, 8, 0
    lea                  r6, [vp9_spel_filter_sharp-8]
    jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_%2hv_8 %+ SUFFIX).main
cglobal vp9_%1_8tap_regular_%2hv_8, 4, 8, 0, dst, ds, src, ss, h, mx, my
    lea                  r6, [vp9_spel_filter_regular-8]
.main:
%if %2 == 16
    xor                r7d, r7d
.main2:
%endif
    mov                 mxd, mxm
    movifnidn            hd, hm
    sub                srcq, 3
    vpbroadcastd         m9, [r6+mxq*8+0]
    vpbroadcastd        m10, [r6+mxq*8+4]
    mov                 mxd, mym
    vpbroadcastd         m5, [pd_64]
    lea                 myq, [r6+mxq*8]
    lea                  r5, [ssq*3]
    sub                srcq, r5
%endmacro

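; The entire AVX-512 code below is instantiated twice, for put and for avg;
; the avg variants only differ by a pavgb against dst right before each store.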
%macro MC_AVX512 1 ; put/avg
    SPEL_H_INIT          %1, 4
    vbroadcasti32x4      m6, [spel_h_shufA]
    lea                  r5, [ssq*3]
    vbroadcasti32x4      m7, [spel_h_shufB]
    lea                  r6, [dsq*3]
    vbroadcasti32x4      m8, [spel_h_shufC]
.h_w4_loop:
    movu                xm0, [srcq+ssq*0]
    vinserti32x4        ym0, [srcq+ssq*1], 1
    vinserti32x4         m0, [srcq+ssq*2], 2
    vinserti32x4         m0, [srcq+r5   ], 3
    lea                srcq, [srcq+ssq*4]
    pshufb               m1, m0, m6
    pshufb               m0, m7
    mova                 m2, m5
    vpdpbusd             m2, m1, m9
    vpdpbusd             m2, m0, m10
    vpmovsdw            ym0, m2
    psraw               ym0, 7
    packuswb            ym0, ym0
    vextracti32x4       xm1, ym0, 1
%ifidn %1, avg
    movd               xmm2, [dstq+dsq*0]
    pinsrd             xmm2, [dstq+dsq*1], 1
    movd               xmm3, [dstq+dsq*2]
    pinsrd             xmm3, [dstq+r6   ], 1
    pavgb               xm0, xmm2
    pavgb               xm1, xmm3
%endif
    movd       [dstq+dsq*0], xm0
    pextrd     [dstq+dsq*1], xm0, 1
    movd       [dstq+dsq*2], xm1
    pextrd     [dstq+r6   ], xm1, 1
    lea                dstq, [dstq+dsq*4]
    sub                  hd, 4
    jg .h_w4_loop
    RET

    SPEL_H_INIT          %1, 8
    vbroadcasti32x4      m6, [spel_h_shufA]
    lea                  r5, [ssq*3]
    vbroadcasti32x4      m7, [spel_h_shufB]
    lea                  r6, [dsq*3]
    vbroadcasti32x4      m8, [spel_h_shufC]
.h_w8_loop:
    movu                xm0, [srcq+ssq*0]
    vinserti32x4        ym0, [srcq+ssq*1], 1
    vinserti32x4         m0, [srcq+ssq*2], 2
    vinserti32x4         m0, [srcq+r5   ], 3
    lea                srcq, [srcq+ssq*4]
    PUT_8TAP_H            0, 1, 2, 3
    vpmovuswb           ym0, m0
    vextracti32x4       xm1, ym0, 1
%ifidn %1, avg
    movq               xmm2, [dstq+dsq*0]
    movhps             xmm2, [dstq+dsq*1]
    movq               xmm3, [dstq+dsq*2]
    movhps             xmm3, [dstq+r6   ]
    pavgb               xm0, xmm2
    pavgb               xm1, xmm3
%endif
    movq       [dstq+dsq*0], xm0
    movhps     [dstq+dsq*1], xm0
    movq       [dstq+dsq*2], xm1
    movhps     [dstq+r6   ], xm1
    lea                dstq, [dstq+dsq*4]
    sub                  hd, 4
    jg .h_w8_loop
    RET

    SPEL_H_INIT          %1, 16
    mova                 m6, [spel_h_perm16]
    vpbroadcastd         m8, [pb_4]
    paddb                m7, m8, m6
    paddb                m8, m7
.h_w16_loop:
    movu                ym0, [srcq+ssq*0]
    vinserti32x8         m0, [srcq+ssq*1], 1
    lea                srcq, [srcq+ssq*2]
    PUT_8TAP_H            0, 1, 2, 3, 1
    vpmovuswb           ym0, m0
%ifidn %1, avg
    movu                xm1, [dstq+dsq*0]
    vinserti32x4        ym1, [dstq+dsq*1], 1
    pavgb               ym0, ym1
%endif
    mova         [dstq+dsq*0], xm0
    vextracti128 [dstq+dsq*1], ym0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w16_loop
    RET

    SPEL_H_INIT          %1, 32
    vbroadcasti32x4      m6, [spel_h_shufA]
    vbroadcasti32x4      m7, [spel_h_shufB]
    vbroadcasti32x4      m8, [spel_h_shufC]
.h_w32_loop:
    movu                ym0, [srcq+ssq*0+8*0]
    vinserti32x8         m0, [srcq+ssq*1+8*0], 1
    movu                ym1, [srcq+ssq*0+8*1]
    vinserti32x8         m1, [srcq+ssq*1+8*1], 1
    lea                srcq, [srcq+ssq*2]
    PUT_8TAP_H            0, 2, 3, 4
    PUT_8TAP_H            1, 4, 3, 2
    packuswb             m0, m1
%ifidn %1, avg
    movu                ym1, [dstq+dsq*0]
    vinserti32x8         m1, [dstq+dsq*1], 1
    pavgb                m0, m1
%endif
    mova          [dstq+dsq*0], ym0
    vextracti32x8 [dstq+dsq*1], m0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w32_loop
    RET

    SPEL_H_INIT          %1, 64
    vbroadcasti32x4      m6, [spel_h_shufA]
    vbroadcasti32x4      m7, [spel_h_shufB]
    vbroadcasti32x4      m8, [spel_h_shufC]
.h_w64_loop:
    movu                 m0, [srcq+8*0]
    movu                 m1, [srcq+8*1]
    add                srcq, ssq
    PUT_8TAP_H            0, 2, 3, 4
    PUT_8TAP_H            1, 4, 3, 2
    packuswb             m0, m1
%ifidn %1, avg
    pavgb                m0, [dstq]
%endif
    mova             [dstq], m0
    add                dstq, dsq
    dec                  hd
    jg .h_w64_loop
    RET

    SPEL_V_INIT          %1, 4
    movd               xmm2, [srcq+ssq*0]
    pinsrd             xmm2, [srcq+ssq*1], 1
    pinsrd             xmm2, [srcq+ssq*2], 2
    add                srcq, r5
    pinsrd             xmm2, [srcq+ssq*0], 3  ; 0 1 2 3
    movd               xmm3, [srcq+ssq*1]
    vpbroadcastd       xmm1, [srcq+ssq*2]
    add                srcq, r5
    vpbroadcastd       xmm0, [srcq+ssq*0]
    vpblendd           xmm3, xmm3, xmm1, 0x02 ; 4 5
    vpblendd           xmm1, xmm1, xmm0, 0x02 ; 5 6
    palignr            xmm4, xmm3, xmm2, 4    ; 1 2 3 4
    punpcklbw          xmm3, xmm1             ; 45 56
    punpcklbw          xmm1, xmm2, xmm4       ; 01 12
    punpckhbw          xmm2, xmm4             ; 23 34
%if WIN64
    movaps          [rsp+8], xmm6
%endif
.v_w4_loop:
    vpbroadcastd       xmm4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmaddubsw          xmm5, xmm1, xm8        ; a0 b0
    mova               xmm1, xmm2
    pmaddubsw          xmm6, xmm2, xm9        ; a1 b1
    mova               xmm2, xmm3
    pmaddubsw          xmm3, xm10             ; a2 b2
    paddw              xmm5, xmm3
    vpblendd           xmm3, xmm0, xmm4, 0x02 ; 6 7
    vpbroadcastd       xmm0, [srcq+ssq*0]
    vpblendd           xmm4, xmm0, 0x02       ; 7 8
    punpcklbw          xmm3, xmm4             ; 67 78
    pmaddubsw          xmm4, xmm3, xm11       ; a3 b3
    paddw              xmm6, xmm4
    paddsw             xmm5, xmm6
    pmulhrsw           xmm5, xm7
    packuswb           xmm5, xmm5
%ifidn %1, avg
    movd               xmm4, [dstq+dsq*0]
    pinsrd             xmm4, [dstq+dsq*1], 1
    pavgb              xmm5, xmm4
%endif
    movd       [dstq+dsq*0], xmm5
    pextrd     [dstq+dsq*1], xmm5, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w4_loop
%if WIN64
    movaps             xmm6, [rsp+8]
%endif
    RET

    SPEL_V_INIT          %1, 8
    movq               xmm1, [srcq+ssq*0]
    vpbroadcastq       ymm0, [srcq+ssq*1]
    vpbroadcastq       ymm2, [srcq+ssq*2]
    add                srcq, r5
    vpbroadcastq       ymm5, [srcq+ssq*0]
    vpbroadcastq       ymm3, [srcq+ssq*1]
    vpbroadcastq       ymm4, [srcq+ssq*2]
    add                srcq, r5
    vpblendd           ymm1, ymm0, 0x30
    vpblendd           ymm0, ymm2, 0x30
    punpcklbw          ymm1, ymm0       ; 01 12
    vpbroadcastq       ymm0, [srcq+ssq*0]
    vpblendd           ymm2, ymm5, 0x30
    vpblendd           ymm5, ymm3, 0x30
    punpcklbw          ymm2, ymm5       ; 23 34
    vpblendd           ymm3, ymm4, 0x30
    vpblendd           ymm4, ymm0, 0x30
    punpcklbw          ymm3, ymm4       ; 45 56
%if WIN64
    movaps          [rsp+8], xmm6
%endif
.v_w8_loop:
    vpbroadcastq       ymm4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmaddubsw          ymm5, ymm1, ym8  ; a0 b0
    mova               ymm1, ymm2
    pmaddubsw          ymm6, ymm2, ym9  ; a1 b1
    mova               ymm2, ymm3
    pmaddubsw          ymm3, ym10       ; a2 b2
    paddw              ymm5, ymm3
    vpblendd           ymm3, ymm0, ymm4, 0x30
    vpbroadcastq       ymm0, [srcq+ssq*0]
    vpblendd           ymm4, ymm4, ymm0, 0x30
    punpcklbw          ymm3, ymm4       ; 67 78
    pmaddubsw          ymm4, ymm3, ym11 ; a3 b3
    paddw              ymm6, ymm4
    paddsw             ymm5, ymm6
    pmulhrsw           ymm5, ym7
    vextracti128       xmm4, ymm5, 1
    packuswb           xmm5, xmm4
%ifidn %1, avg
    movq               xmm4, [dstq+dsq*0]
    movhps             xmm4, [dstq+dsq*1]
    pavgb              xmm5, xmm4
%endif
    movq       [dstq+dsq*0], xmm5
    movhps     [dstq+dsq*1], xmm5
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w8_loop
%if WIN64
    movaps             xmm6, [rsp+8]
%endif
    vzeroupper
    RET

    SPEL_V_INIT          %1, 16
    mova                m12, [spel_v_perm16]
    vbroadcasti32x4      m1, [srcq+ssq*0]
    vbroadcasti32x4     ym4, [srcq+ssq*1]
    mov                 r6d, 0x0f
    vbroadcasti32x4      m2, [srcq+ssq*2]
    add                srcq, r5
    vbroadcasti32x4     ym5, [srcq+ssq*0]
    kmovb                k1, r6d
    vbroadcasti32x4      m3, [srcq+ssq*1]
    vbroadcasti32x4     ym6, [srcq+ssq*2]
    add                srcq, r5
    vbroadcasti32x4      m0, [srcq+ssq*0]
    vshufpd          m1{k1}, m4, m2, 0xcc
    vshufpd          m2{k1}, m5, m3, 0xcc
    vshufpd          m3{k1}, m6, m0, 0xcc
    vpermb               m1, m12, m1 ; 01 12
    vpermb               m2, m12, m2 ; 23 34
    vpermb               m3, m12, m3 ; 45 56
.v_w16_loop:
    pmaddubsw            m4, m1, m8  ; a0 b0
    mova                 m1, m2
    pmaddubsw            m5, m2, m9  ; a1 b1
    mova                 m2, m3
    pmaddubsw            m6, m3, m10 ; a2 b2
    mova                 m3, m0
    paddw                m4, m6
    vbroadcasti32x4     ym6, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vbroadcasti32x4      m0, [srcq+ssq*0]
    vshufpd          m3{k1}, m6, m0, 0xcc
    vpermb               m3, m12, m3 ; 67 78
    pmaddubsw            m6, m3, m11 ; a3 b3
    paddw                m5, m6
    paddsw               m4, m5
    pmulhrsw             m4, m7
    vextracti32x8       ym5, m4, 1
    packuswb            ym4, ym5
%ifidn %1, avg
    mova                xm5, [dstq+dsq*0]
    vinserti32x4        ym5, [dstq+dsq*1], 1
    pavgb               ym4, ym5
%endif
    mova          [dstq+dsq*0], xm4
    vextracti32x4 [dstq+dsq*1], ym4, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w16_loop
    RET

    SPEL_V_INIT          %1, 32
    mova                m12, [spel_v_perm32]
    pmovzxbq            m14, [pb_02461357]
    vpshrdw             m13, m12, m12, 8
    movu                ym0, [srcq+ssq*0]
    vinserti32x8         m0, [srcq+ssq*1], 1
    vpermb               m1, m12, m0 ; 01
    vinserti32x8         m0, [srcq+ssq*2], 0
    add                srcq, r5
    vpermb               m2, m13, m0 ; 12
    vinserti32x8         m0, [srcq+ssq*0], 1
    vpermb               m3, m12, m0 ; 23
    vinserti32x8         m0, [srcq+ssq*1], 0
    vpermb               m4, m13, m0 ; 34
    vinserti32x8         m0, [srcq+ssq*2], 1
    add                srcq, r5
    vpermb               m5, m12, m0 ; 45
    vinserti32x8         m0, [srcq+ssq*0], 0
    vpermb               m6, m13, m0 ; 56
.v_w32_loop:
    vinserti32x8         m0, [srcq+ssq*1], 1
    lea                srcq, [srcq+ssq*2]
    pmaddubsw           m15, m1, m8
    mova                 m1, m3
    pmaddubsw           m16, m2, m8
    mova                 m2, m4
    pmaddubsw           m17, m3, m9
    mova                 m3, m5
    pmaddubsw           m18, m4, m9
    mova                 m4, m6
    pmaddubsw           m19, m5, m10
    vpermb               m5, m12, m0 ; 67
    vinserti32x8         m0, [srcq+ssq*0], 0
    pmaddubsw           m20, m6, m10
    vpermb               m6, m13, m0 ; 78
    paddw               m15, m19
    pmaddubsw           m19, m5, m11
    paddw               m16, m20
    pmaddubsw           m20, m6, m11
    paddw               m17, m19
    paddw               m18, m20
    paddsw              m15, m17
    paddsw              m16, m18
    pmulhrsw            m15, m7
    pmulhrsw            m16, m7
    packuswb            m15, m16
    vpermq              m15, m14, m15
%ifidn %1, avg
    mova               ym16, [dstq+dsq*0]
    vinserti32x8        m16, [dstq+dsq*1], 1
    pavgb               m15, m16
%endif
    mova          [dstq+dsq*0], ym15
    vextracti32x8 [dstq+dsq*1], m15, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w32_loop
    vzeroupper
    RET

    SPEL_V_INIT          %1, 64
    movu                 m2, [srcq+ssq*0]
    movu                 m4, [srcq+ssq*1]
    movu                 m6, [srcq+ssq*2]
    add                srcq, r5
    movu                m13, [srcq+ssq*0]
    movu                m15, [srcq+ssq*1]
    movu                m17, [srcq+ssq*2]
    add                srcq, r5
    movu                 m0, [srcq+ssq*0]
    punpcklbw            m1, m2, m4   ; 01l
    punpckhbw            m2, m4       ; 01h
    punpcklbw            m3, m4, m6   ; 12l
    punpckhbw            m4, m6       ; 12h
    punpcklbw            m5, m6, m13  ; 23l
    punpckhbw            m6, m13      ; 23h
    punpcklbw           m12, m13, m15 ; 34l
    punpckhbw           m13, m15      ; 34h
    punpcklbw           m14, m15, m17 ; 45l
    punpckhbw           m15, m17      ; 45h
    punpcklbw           m16, m17, m0  ; 56l
    punpckhbw           m17, m0       ; 56h
%if WIN64
    movaps          [rsp+8], xmm6
%endif
.v_w64_loop:
    movu                m22, [srcq+ssq*1]
    pmaddubsw            m1, m8       ; a0l
    pmaddubsw           m18, m14, m10 ; a2l
    lea                srcq, [srcq+ssq*2]
    pmaddubsw            m2, m8       ; a0h
    pmaddubsw           m19, m15, m10 ; a2h
    paddw               m18, m1
    mova                 m1, m5
    paddw               m19, m2
    mova                 m2, m6
    pmaddubsw           m20, m5, m9   ; a1l
    mova                 m5, m14
    pmaddubsw           m21, m6, m9   ; a1h
    mova                 m6, m15
    punpcklbw           m14, m0, m22  ; 67l
    punpckhbw           m15, m0, m22  ; 67h
    pmaddubsw            m0, m14, m11 ; a3l
    paddw               m20, m0
    pmaddubsw            m0, m15, m11 ; a3h
    paddw               m21, m0
    movu                 m0, [srcq+ssq*0]
    paddsw              m18, m20
    paddsw              m19, m21
    pmaddubsw            m3, m8       ; b0l
    pmaddubsw           m20, m16, m10 ; b2l
    pmaddubsw            m4, m8       ; b0h
    pmaddubsw           m21, m17, m10 ; b2h
    pmulhrsw            m18, m7
    pmulhrsw            m19, m7
    paddw               m20, m3
    mova                 m3, m12
    paddw               m21, m4
    mova                 m4, m13
    packuswb            m18, m19
%ifidn %1, avg
    pavgb               m18, [dstq+dsq*0]
%endif
    mova       [dstq+dsq*0], m18
    pmaddubsw           m18, m12, m9  ; b1l
    mova                m12, m16
    punpcklbw           m16, m22, m0  ; 78l
    pmaddubsw           m19, m13, m9  ; b1h
    mova                m13, m17
    punpckhbw           m17, m22, m0  ; 78h
    pmaddubsw           m22, m16, m11 ; b3l
    paddw               m18, m22
    pmaddubsw           m22, m17, m11 ; b3h
    paddw               m19, m22
    paddsw              m18, m20
    paddsw              m19, m21
    pmulhrsw            m18, m7
    pmulhrsw            m19, m7
    packuswb            m18, m19
%ifidn %1, avg
    pavgb               m18, [dstq+dsq*1]
%endif
    mova       [dstq+dsq*1], m18
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w64_loop
%if WIN64
    movaps             xmm6, [rsp+8]
%endif
    vzeroupper
    RET

    SPEL_HV_INIT         %1, 4
    vbroadcasti32x4     ym2, [srcq+ssq*0]
    vinserti32x4         m2, [srcq+ssq*1], 2
    vbroadcasti32x4      m6, [spel_h_shufA]
    vinserti32x4         m2, [srcq+ssq*2], 3 ; _ 0 1 2
    add                srcq, r5
    movu                xm0, [srcq+ssq*0]
    vinserti32x4        ym0, [srcq+ssq*1], 1
    vbroadcasti32x4      m7, [spel_h_shufB]
    vinserti32x4         m0, [srcq+ssq*2], 2
    add                srcq, r5
    vpbroadcastd        m11, [myq+0]
    vinserti32x4         m0, [srcq+ssq*0], 3 ; 3 4 5 6
    vpbroadcastd        m12, [myq+4]
    lea                  r6, [dsq*3]
    mova                 m8, [spel_hv_perm4]
    pshufb               m4, m2, m6
    mova                 m1, m5
    vpdpbusd             m1, m4, m9
    pshufb               m4, m0, m6
    mova                 m3, m5
    vpdpbusd             m3, m4, m9
    pshufb               m2, m7
    pshufb               m0, m7
    vpdpbusd             m1, m2, m10
    vpdpbusd             m3, m0, m10
    psrad                m1, 7
    psrad                m0, m3, 7
    packuswb             m1, m0     ; _3   04   15   26
    vpermb               m1, m8, m1 ; 0123 1234 2345 3456
.hv_w4_loop:
    movu                xm4, [srcq+ssq*1]
    vinserti32x4        ym4, [srcq+ssq*2], 1
    vinserti32x4         m4, [srcq+r5   ], 2
    lea                srcq, [srcq+ssq*4]
    vinserti32x4         m4, [srcq+ssq*0], 3 ; 7 8 9 a
    mova                 m3, m5
    pshufb               m2, m4, m6
    vpdpbusd             m3, m2, m9
    mova                 m2, m5
    vpdpbusd             m2, m1, m11
    pshufb               m4, m7
    vpdpbusd             m3, m4, m10
    psrad                m3, 7
    packuswb             m1, m0, m3 ; 37   48   59   6a
    mova                 m0, m3
    vpermb               m1, m8, m1 ; 4567 5678 6789 789a
    vpdpbusd             m2, m1, m12
    psrad                m2, 7
    vpmovdw             ym2, m2
    packuswb            ym2, ym2
    vextracti32x4       xm3, ym2, 1
%ifidn %1, avg
    movd               xmm4, [dstq+dsq*0]
    pinsrd             xmm4, [dstq+dsq*1], 1
    pavgb               xm2, xmm4
    movd               xmm4, [dstq+dsq*2]
    pinsrd             xmm4, [dstq+r6   ], 1
    pavgb               xm3, xmm4
%endif
    movd       [dstq+dsq*0], xm2
    pextrd     [dstq+dsq*1], xm2, 1
    movd       [dstq+dsq*2], xm3
    pextrd     [dstq+r6   ], xm3, 1
    lea                dstq, [dstq+dsq*4]
    sub                  hd, 4
    jg .hv_w4_loop
    RET

    SPEL_HV_INIT         %1, 8
    vbroadcasti32x4     ym2, [srcq+ssq*0]
    vinserti32x4         m2, [srcq+ssq*1], 2
    vbroadcasti32x4      m6, [spel_h_shufA]
    vinserti32x4         m2, [srcq+ssq*2], 3 ; _ 0 1 2
    add                srcq, r5
    movu                xm0, [srcq+ssq*0]
    vinserti32x4        ym0, [srcq+ssq*1], 1
    vbroadcasti32x4      m7, [spel_h_shufB]
    vinserti32x4         m0, [srcq+ssq*2], 2
    add                srcq, r5
    vpbroadcastd        m11, [myq+0]
    vinserti32x4         m0, [srcq+ssq*0], 3 ; 3 4 5 6
    vpbroadcastd        m12, [myq+4]
    lea                  r6, [dsq*3]
    vbroadcasti32x4      m8, [spel_h_shufC]
    mova                m13, [spel_hv_perm8]
    vpaddd              m14, m13, [pb_4] {1to16}
    PUT_8TAP_H            2, 1, 3, 4
    PUT_8TAP_H            0, 1, 3, 4
    packuswb             m2, m0      ; _3   04   15   26
    vpermb               m1, m13, m2 ; 0123 1234 2345 3456 (abcd)
    vpermb               m2, m14, m2 ; 0123 1234 2345 3456 (efgh)
.hv_w8_loop:
    movu               xm18, [srcq+ssq*1]
    vinserti128        ym18, [srcq+ssq*2], 1
    vinserti32x4        m18, [srcq+r5   ], 2
    lea                srcq, [srcq+ssq*4]
    vinserti32x4        m18, [srcq+ssq*0], 3 ; 7 8 9 a
    PUT_8TAP_H           18, 4, 16, 17
    mova                m16, m5
    vpdpbusd            m16, m1, m11
    mova                m17, m5
    vpdpbusd            m17, m2, m11
    packuswb             m2, m0, m18 ; 37   48   59   6a
    mova                 m0, m18
    vpermb               m1, m13, m2 ; 4567 5678 6789 789a (abcd)
    vpermb               m2, m14, m2 ; 4567 5678 6789 789a (efgh)
    vpdpbusd            m16, m1, m12
    vpdpbusd            m17, m2, m12
    packusdw            m16, m17
    psrlw               m16, 7
    vpmovuswb          ym16, m16
    vextracti128       xm17, ym16, 1
%ifidn %1, avg
    movq               xm18, [dstq+dsq*0]
    movhps             xm18, [dstq+dsq*1]
    pavgb              xm16, xm18
    movq               xm18, [dstq+dsq*2]
    movhps             xm18, [dstq+r6   ]
    pavgb              xm17, xm18
%endif
    movq       [dstq+dsq*0], xm16
    movhps     [dstq+dsq*1], xm16
    movq       [dstq+dsq*2], xm17
    movhps     [dstq+r6   ], xm17
    lea                dstq, [dstq+dsq*4]
    sub                  hd, 4
    jg .hv_w8_loop
    vzeroupper
    RET

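; 32- and 64-wide hv is handled by the 16-wide hv code below, one 16-pixel
; column at a time: r7d carries (columns - 1) << 8, which gets packed together
; with the row count in r6d so the column loop can restore h from the low byte.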
cglobal vp9_%1_8tap_smooth_32hv_8, 4, 8, 0
    lea                  r6, [vp9_spel_filter_smooth-8]
    mov                 r7d, 256*1
    jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_16hv_8 %+ SUFFIX).main2
cglobal vp9_%1_8tap_sharp_32hv_8, 4, 8, 0
    lea                  r6, [vp9_spel_filter_sharp-8]
    mov                 r7d, 256*1
    jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_16hv_8 %+ SUFFIX).main2
cglobal vp9_%1_8tap_regular_32hv_8, 4, 8, 0, dst, ds, src, ss, h, mx, my
    lea                  r6, [vp9_spel_filter_regular-8]
    mov                 r7d, 256*1
    jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_16hv_8 %+ SUFFIX).main2
cglobal vp9_%1_8tap_smooth_64hv_8, 4, 8, 0
    lea                  r6, [vp9_spel_filter_smooth-8]
    mov                 r7d, 256*3
    jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_16hv_8 %+ SUFFIX).main2
cglobal vp9_%1_8tap_sharp_64hv_8, 4, 8, 0
    lea                  r6, [vp9_spel_filter_sharp-8]
    mov                 r7d, 256*3
    jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_16hv_8 %+ SUFFIX).main2
cglobal vp9_%1_8tap_regular_64hv_8, 4, 8, 0, dst, ds, src, ss, h, mx, my
    lea                  r6, [vp9_spel_filter_regular-8]
    mov                 r7d, 256*3
    jmp mangle(private_prefix %+ _vp9_%1_8tap_regular_16hv_8 %+ SUFFIX).main2

    SPEL_HV_INIT         %1, 16
    vpbroadcastw        m11, [myq+0]
    mova                 m6, [spel_h_perm16]
    vpbroadcastw        m12, [myq+2]
    vpbroadcastd         m8, [pb_4]
    vpbroadcastw        m13, [myq+4]
    vpbroadcastd        m15, [pw_256]
    vpbroadcastw        m14, [myq+6]
    mova                m19, [spel_hv_perm16]
    vpandd              m20, m19, [pw_m33] {1to16} ; even indices & ~32
    paddb                m7, m6, m8
    lea                 r6d, [hq+r7]
    paddb                m8, m7
%if WIN64
    push                 r8
%endif
.hv_w16_loop0:
    movu               ym16, [srcq+ssq*0]    ; 0
    movu               ym17, [srcq+ssq*1]
    lea                  r7, [srcq+r5]
    vinserti32x8        m17, [srcq+ssq*2], 1 ; 1 2
    movu               ym18, [r7+ssq*0]
    mov                  r8, dstq
    vinserti32x8        m18, [r7+ssq*1], 1   ; 3 4
    movu                ym0, [r7+ssq*2]
    add                  r7, r5
    vinserti32x8         m0, [r7+ssq*0], 1   ; 5 6
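    ; row 0 is a single 16-pixel row, so its horizontal pass only needs ymm width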
INIT_YMM avx512icl
    PUT_8TAP_H           16, 1, 2, 3, 1
INIT_ZMM avx512icl
    PUT_8TAP_H           17, 1, 2, 3, 1
    PUT_8TAP_H           18, 1, 2, 3, 1
    PUT_8TAP_H            0, 1, 2, 3, 1
    packuswb            m16, m17
    packuswb            m17, m18
    packuswb            m18, m0
    vpermb               m1, m20, m16 ; 01 12
    vpermb               m2, m19, m17 ; 23 34
    vpermb               m3, m19, m18 ; 45 56
.hv_w16_loop:
    movu               ym18, [r7+ssq*1]
    lea                  r7, [r7+ssq*2]
    vinserti32x8        m18, [r7+ssq*0], 1
    PUT_8TAP_H           18, 4, 16, 17, 1
    pmaddubsw           m16, m1, m11 ; a0 b0
    mova                 m1, m2
    pmaddubsw           m17, m2, m12 ; a1 b1
    mova                 m2, m3
    pmaddubsw            m3, m13     ; a2 b2
    packuswb             m4, m0, m18
    paddw               m16, m3
    vpermb               m3, m19, m4 ; 67 78
    mova                 m0, m18
    pmaddubsw            m4, m3, m14 ; a3 b3
    paddw               m17, m4
    paddsw              m16, m17
    pmulhrsw            m16, m15
    vextracti32x8      ym17, m16, 1
    packuswb           ym16, ym17
%ifidn %1, avg
    mova               xm17, [r8+dsq*0]
    vinserti128        ym17, [r8+dsq*1], 1
    pavgb              ym16, ym17
%endif
    mova         [r8+dsq*0], xm16
    vextracti128 [r8+dsq*1], ym16, 1
    lea                  r8, [r8+dsq*2]
    sub                  hd, 2
    jg .hv_w16_loop
    add                srcq, 16
    add                dstq, 16
    movzx                hd, r6b
    sub                 r6d, 1<<8
    jg .hv_w16_loop0
    vzeroupper
%if WIN64
    pop                  r8
%endif
    RET
%endmacro

INIT_ZMM avx512icl
MC_AVX512 put
MC_AVX512 avg

%endif
