; /*
; * Provide SIMD MC functions for VVC decoding
; *
; * Copyright © 2021, VideoLAN and dav1d authors
; * Copyright © 2021, Two Orioles, LLC
; * All rights reserved.
; *
; * Copyright (c) 2023-2024 Nuo Mi
; * Copyright (c) 2023-2024 Wu Jianhua
; *
; * This file is part of FFmpeg.
; *
; * FFmpeg is free software; you can redistribute it and/or
; * modify it under the terms of the GNU Lesser General Public
; * License as published by the Free Software Foundation; either
; * version 2.1 of the License, or (at your option) any later version.
; *
; * FFmpeg is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; * Lesser General Public License for more details.
; *
; * You should have received a copy of the GNU Lesser General Public
; * License along with FFmpeg; if not, write to the Free Software
; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
; */

%include "libavutil/x86/x86util.asm"

%define MAX_PB_SIZE 128

SECTION_RODATA 32

%if ARCH_X86_64

%if HAVE_AVX2_EXTERNAL

pw_0    times 2 dw   0
pw_1    times 2 dw   1
pw_4    times 2 dw   4
pw_12   times 2 dw  12
pw_256  times 2 dw 256

%macro AVG_JMP_TABLE 3-*
    %xdefine %1_%2_%3_table (%%table - 2*%4)
    %xdefine %%base %1_%2_%3_table
    %xdefine %%prefix mangle(private_prefix %+ _vvc_%1_%2bpc_%3)
    %%table:
    %rep %0 - 3
        dd %%prefix %+ .w%4 - %%base
        %rotate 1
    %endrep
%endmacro

AVG_JMP_TABLE   avg, 8,  avx2, 2, 4, 8, 16, 32, 64, 128
AVG_JMP_TABLE   avg, 16, avx2, 2, 4, 8, 16, 32, 64, 128
AVG_JMP_TABLE w_avg, 8,  avx2, 2, 4, 8, 16, 32, 64, 128
AVG_JMP_TABLE w_avg, 16, avx2, 2, 4, 8, 16, 32, 64, 128

SECTION .text

%macro AVG_W16_FN 3 ; bpc, op, count
    %assign %%i 0
    %rep %3
        %define off %%i
        AVG_LOAD_W16    0, off
        %2
        AVG_SAVE_W16    %1, 0, off

        AVG_LOAD_W16    1, off
        %2
        AVG_SAVE_W16    %1, 1, off

        %assign %%i %%i+1
    %endrep
%endmacro

%macro AVG_FN 2 ; bpc, op
    jmp             wq

.w2:
    movd            xm0, [src0q]
    pinsrd          xm0, [src0q + AVG_SRC_STRIDE], 1
    movd            xm1, [src1q]
    pinsrd          xm1, [src1q + AVG_SRC_STRIDE], 1
    %2
    AVG_SAVE_W2     %1
    AVG_LOOP_END    .w2

.w4:
    movq            xm0, [src0q]
    pinsrq          xm0, [src0q + AVG_SRC_STRIDE], 1
    movq            xm1, [src1q]
    pinsrq          xm1, [src1q + AVG_SRC_STRIDE], 1
    %2
    AVG_SAVE_W4     %1
    AVG_LOOP_END    .w4

.w8:
    vinserti128     m0, m0, [src0q], 0
    vinserti128     m0, m0, [src0q + AVG_SRC_STRIDE], 1
    vinserti128     m1, m1, [src1q], 0
    vinserti128     m1, m1, [src1q + AVG_SRC_STRIDE], 1
    %2
    AVG_SAVE_W8     %1
    AVG_LOOP_END    .w8

.w16:
    AVG_W16_FN      %1, %2, 1
    AVG_LOOP_END    .w16

.w32:
    AVG_W16_FN      %1, %2, 2
    AVG_LOOP_END    .w32

.w64:
    AVG_W16_FN      %1, %2, 4
    AVG_LOOP_END    .w64

.w128:
    AVG_W16_FN      %1, %2, 8
    AVG_LOOP_END    .w128

.ret:
    RET
%endmacro

%macro AVG 0
    paddsw          m0, m1
    pmulhrsw        m0, m2
    CLIPW           m0, m3, m4
%endmacro

%macro W_AVG 0
    punpckhwd       m5, m0, m1
    pmaddwd         m5, m3
    paddd           m5, m4
    psrad           m5, xm2

    punpcklwd       m0, m0, m1
    pmaddwd         m0, m3
    paddd           m0, m4
    psrad           m0, xm2

    packssdw        m0, m5
    CLIPW           m0, m6, m7
%endmacro

%macro AVG_LOAD_W16 2 ; line, offset
    movu            m0, [src0q + %1 * AVG_SRC_STRIDE + %2 * 32]
    movu            m1, [src1q + %1 * AVG_SRC_STRIDE + %2 * 32]
%endmacro

%macro AVG_SAVE_W2 1 ; bpc
    %if %1 == 16
        pextrd      [dstq], xm0, 0
        pextrd      [dstq + strideq], xm0, 1
    %else
        packuswb    m0, m0
        pextrw      [dstq], xm0, 0
        pextrw      [dstq + strideq], xm0, 1
    %endif
%endmacro

%macro AVG_SAVE_W4 1 ; bpc
    %if %1 == 16
        pextrq      [dstq], xm0, 0
        pextrq      [dstq + strideq], xm0, 1
    %else
        packuswb    m0, m0
        pextrd      [dstq], xm0, 0
        pextrd      [dstq + strideq], xm0, 1
    %endif
%endmacro

%macro AVG_SAVE_W8 1 ; bpc
    %if %1 == 16
        vextracti128 [dstq], m0, 0
        vextracti128 [dstq + strideq], m0, 1
    %else
        packuswb    m0, m0
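        ; With ymm registers, packuswb packs within each 128-bit lane, so after
        ; the pack row 0's bytes sit in qword 0 and row 1's bytes in qword 2.
        ; The vpermq below (imm8 1000b selects q0, q2) gathers both rows into
        ; the low 128 bits so they can be stored from xm0 with pextrq;
        ; AVG_SAVE_W16 further down uses the same trick before vextracti128.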
        vpermq      m0, m0, 1000b
        pextrq      [dstq], xm0, 0
        pextrq      [dstq + strideq], xm0, 1
    %endif
%endmacro

%macro AVG_SAVE_W16 3 ; bpc, line, offset
    %if %1 == 16
        movu        [dstq + %2 * strideq + %3 * 32], m0
    %else
        packuswb    m0, m0
        vpermq      m0, m0, 1000b
        vextracti128 [dstq + %2 * strideq + %3 * 16], m0, 0
    %endif
%endmacro

%macro AVG_LOOP_END 1
    sub             hd, 2
    je              .ret

    lea             src0q, [src0q + 2 * AVG_SRC_STRIDE]
    lea             src1q, [src1q + 2 * AVG_SRC_STRIDE]
    lea             dstq,  [dstq + 2 * strideq]
    jmp             %1
%endmacro

%define AVG_SRC_STRIDE MAX_PB_SIZE*2

;void ff_vvc_avg_%1bpc_avx2(uint8_t *dst, ptrdiff_t dst_stride,
;    const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, intptr_t pixel_max);
%macro VVC_AVG_AVX2 1
cglobal vvc_avg_%1bpc, 4, 7, 5, dst, stride, src0, src1, w, h, bd
    movifnidn       hd, hm

    pxor            m3, m3              ; pixel min
    vpbroadcastw    m4, bdm             ; pixel max

    movifnidn       bdd, bdm
    inc             bdd
    tzcnt           bdd, bdd            ; bit depth

    sub             bdd, 8
    movd            xm0, bdd
    vpbroadcastd    m1, [pw_4]
    pminuw          m0, m1
    vpbroadcastd    m2, [pw_256]
    psllw           m2, xm0             ; shift

    lea             r6, [avg_%1 %+ SUFFIX %+ _table]
    tzcnt           wd, wm
    movsxd          wq, dword [r6+wq*4]
    add             wq, r6
    AVG_FN          %1, AVG
%endmacro

;void ff_vvc_w_avg_%1bpc_avx2(uint8_t *dst, ptrdiff_t dst_stride,
;    const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height,
;    intptr_t denom, intptr_t w0, intptr_t w1, intptr_t o0, intptr_t o1, intptr_t pixel_max);
%macro VVC_W_AVG_AVX2 1
cglobal vvc_w_avg_%1bpc, 4, 8, 8, dst, stride, src0, src1, w, h, t0, t1
    movifnidn       hd, hm

    movifnidn       t0d, r8m            ; w1
    shl             t0d, 16
    mov             t0w, r7m            ; w0
    movd            xm3, t0d
    vpbroadcastd    m3, xm3             ; w0, w1

    pxor            m6, m6              ; pixel min
    vpbroadcastw    m7, r11m            ; pixel max

    mov             t1q, rcx            ; save rcx
    mov             ecx, r11m
    inc             ecx                 ; bd
    tzcnt           ecx, ecx
    sub             ecx, 8

    mov             t0d, r9m            ; o0
    add             t0d, r10m           ; o1
    shl             t0d, cl
    inc             t0d                 ; ((o0 + o1) << (BIT_DEPTH - 8)) + 1

    neg             ecx
    add             ecx, 4              ; 12 - bd
    cmovl           ecx, [pw_0]
    add             ecx, 3
    add             ecx, r6m
    movd            xm2, ecx            ; shift

    dec             ecx
    shl             t0d, cl
    movd            xm4, t0d
    vpbroadcastd    m4, xm4             ; offset
    mov             rcx, t1q            ; restore rcx

    lea             r6, [w_avg_%1 %+ SUFFIX %+ _table]
    tzcnt           wd, wm
    movsxd          wq, dword [r6+wq*4]
    add             wq, r6
    AVG_FN          %1, W_AVG
%endmacro

INIT_YMM avx2

VVC_AVG_AVX2 16
VVC_AVG_AVX2 8

VVC_W_AVG_AVX2 16
VVC_W_AVG_AVX2 8

%endif

%endif
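
; For reference, a scalar sketch of the computation implemented above,
; reconstructed from the register comments (names are illustrative, not
; copied from the C reference):
;
;   avg:    shift  = max(3, 15 - bd)
;           dst[x] = clip_pixel((src0[x] + src1[x] + (1 << (shift - 1))) >> shift)
;
;   w_avg:  shift  = denom + max(3, 15 - bd)
;           offset = (((o0 + o1) << (bd - 8)) + 1) << (shift - 1)
;           dst[x] = clip_pixel((src0[x] * w0 + src1[x] * w1 + offset) >> shift)
;
; In the avg path the rounding add and right shift are folded into pmulhrsw
; with the constant 256 << min(bd - 8, 4).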