/*
 * Copyright (c) 2020 Martin Storsjo
 * Copyright (c) 2024 Ramiro Polla
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

// Byte offset that ff_rgb24toyv12_neon adds to its rgb2yuv argument before
// loading the 16-bit conversion coefficients into v0, v1 and v2.
#define RGB2YUV_COEFFS 16*4+16*32
// By-element coefficient operands for rgbconv16: v0 holds the Y row, v1 the
// U row and v2 the V row; lanes 0, 1 and 2 multiply B, G and R respectively.
#define BY v0.h[0]
#define GY v0.h[1]
#define RY v0.h[2]
#define BU v1.h[0]
#define GU v1.h[1]
#define RU v1.h[2]
#define BV v2.h[0]
#define GV v2.h[1]
#define RV v2.h[2]
// Registers holding the per-lane constants (0x1000 for luma, 0x8000 for
// chroma) that ff_rgb24toyv12_neon adds before narrowing to 8 bits.
#define Y_OFFSET  v22
#define UV_OFFSET v23

// TBL index vectors used by the ff_shuffle_bytes_*_neon functions.
// shuf_tbl emits shuf_<perm>_tbl: the 4-byte permutation (i0, i1, i2, i3)
// repeated for each of the four 32-bit groups of a 128-bit register.
.macro shuf_tbl perm, i0, i1, i2, i3
const shuf_\perm\()_tbl, align=4
        .byte  \i0 + 0,  \i1 + 0,  \i2 + 0,  \i3 + 0
        .byte  \i0 + 4,  \i1 + 4,  \i2 + 4,  \i3 + 4
        .byte  \i0 + 8,  \i1 + 8,  \i2 + 8,  \i3 + 8
        .byte  \i0 + 12, \i1 + 12, \i2 + 12, \i3 + 12
endconst
.endm

shuf_tbl 0321, 0, 3, 2, 1
shuf_tbl 1230, 1, 2, 3, 0
shuf_tbl 2103, 2, 1, 0, 3
shuf_tbl 3012, 3, 0, 1, 2
shuf_tbl 3210, 3, 2, 1, 0
shuf_tbl 3102, 3, 1, 0, 2
shuf_tbl 2013, 2, 0, 1, 3
shuf_tbl 1203, 1, 2, 0, 3
shuf_tbl 2130, 2, 1, 3, 0

// TBL index vectors for the fast path of ff_rgb32tobgr24_neon.
// 16 BGRA pixels (64 bytes) sit in four consecutive registers; each 16-byte
// BGR output register is gathered from a window of two adjacent registers,
// where indices 0-15 select from the first register of the window and 16-31
// from the second. The alpha byte of every pixel is never selected.
const rgb32tobgr24_tbl, align=4
        // window {v0,v1}: B0 G0 R0 B1 G1 R1 B2 G2 R2 B3 G3 R3 B4 G4 R4 B5
        .byte  0,  1,  2,  4,  5,  6,  8,  9
        .byte 10, 12, 13, 14, 16, 17, 18, 20
        // window {v1,v2}: G5 R5 B6 G6 R6 B7 G7 R7 B8 G8 R8 B9 G9 R9 B10 G10
        .byte  5,  6,  8,  9, 10, 12, 13, 14
        .byte 16, 17, 18, 20, 21, 22, 24, 25
        // window {v2,v3}: R10 B11 G11 R11 B12 G12 R12 B13 G13 R13 B14 G14 R14 B15 G15 R15
        .byte 10, 12, 13, 14, 16, 17, 18, 20
        .byte 21, 22, 24, 25, 26, 28, 29, 30
endconst

// TBL index vectors for the fast path of ff_rgb24tobgr32_neon.
// 16 BGR pixels (48 bytes) sit in three consecutive registers and are spread
// into four 16-byte BGRA output registers, one table row per output pixel.
// TBL returns 0 for an out-of-range index, so TBL_ZERO_IDX leaves the alpha
// byte cleared; the function then ORs 0xFF into it.
#define TBL_ZERO_IDX 128
const rgb24tobgr32_tbl, align=4
        // from {v0}: pixels 0-3
        .byte  0,  1,  2, TBL_ZERO_IDX
        .byte  3,  4,  5, TBL_ZERO_IDX
        .byte  6,  7,  8, TBL_ZERO_IDX
        .byte  9, 10, 11, TBL_ZERO_IDX
        // from {v0,v1}: pixels 4-7
        .byte 12, 13, 14, TBL_ZERO_IDX
        .byte 15, 16, 17, TBL_ZERO_IDX
        .byte 18, 19, 20, TBL_ZERO_IDX
        .byte 21, 22, 23, TBL_ZERO_IDX
        // from {v1,v2}: pixels 8-11
        .byte  8,  9, 10, TBL_ZERO_IDX
        .byte 11, 12, 13, TBL_ZERO_IDX
        .byte 14, 15, 16, TBL_ZERO_IDX
        .byte 17, 18, 19, TBL_ZERO_IDX
        // from {v2}: pixels 12-15
        .byte  4,  5,  6, TBL_ZERO_IDX
        .byte  7,  8,  9, TBL_ZERO_IDX
        .byte 10, 11, 12, TBL_ZERO_IDX
        .byte 13, 14, 15, TBL_ZERO_IDX
endconst

// rgbconv16: dst = (b * bc + g * gc + r * rc) >> shr_bits, eight lanes at once.
//   dst, b, g, r   vector register names (used as .8h)
//   bc, gc, rc     by-element 16-bit coefficient operands
//   shr_bits       immediate right shift applied while narrowing to 16 bits
// The products are accumulated as signed 32-bit values, the low four lanes
// in v3 and the high four lanes in v4; both registers are clobbered.
.macro rgbconv16 dst, b, g, r, bc, gc, rc, shr_bits
        smull           v3.4s, \b\().4h, \bc
        smull2          v4.4s, \b\().8h, \bc
        smlal           v3.4s, \g\().4h, \gc
        smlal2          v4.4s, \g\().8h, \gc
        smlal           v3.4s, \r\().4h, \rc
        smlal2          v4.4s, \r\().8h, \rc        // v3:v4 = b * bc + g * gc + r * rc (32-bit)
        shrn            \dst\().4h, v3.4s, \shr_bits
        shrn2           \dst\().8h, v4.4s, \shr_bits       // dst = narrowed 16-bit result
.endm

// void ff_rgb24toyv12_neon(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
//                          uint8_t *vdst, int width, int height, int lumStride,
//                          int chromStride, int srcStride, int32_t *rgb2yuv);
//
// Converts packed 3-byte pixels (bytes in B, G, R order, as loaded below) to
// planar Y, U and V. Each outer iteration handles two source rows and each
// inner iteration 16 pixels of both rows; one U and one V sample is produced
// per 2x2 pixel block from the sum of its four pixels.
// NOTE(review): the counters step by 16 pixels and by 2 rows, so a width that
// is not a multiple of 16 or an odd height is effectively rounded up -
// presumably callers only dispatch here for suitable sizes; confirm.
// Clobbers v0-v7, v16-v29 and x10, x11, x14, x15.
function ff_rgb24toyv12_neon, export=1
// x0  const uint8_t *src
// x1  uint8_t *ydst
// x2  uint8_t *udst
// x3  uint8_t *vdst
// w4  int width
// w5  int height
// w6  int lumStride
// w7  int chromStride
        // the ninth and tenth arguments are passed on the stack
        ldrsw           x14, [sp]
        ldr             x15, [sp, #8]
// x14 int srcStride
// x15 int32_t *rgb2yuv

        // extend width and stride parameters
        uxtw            x4, w4
        sxtw            x6, w6
        sxtw            x7, w7

        // src1 = x0
        // src2 = x10
        add             x10, x0,  x14               // x10 = src + srcStride
        lsl             x14, x14, #1                // srcStride *= 2
        add             x11, x4,  x4, lsl #1        // x11 = 3 * width
        sub             x14, x14, x11               // srcPadding = (2 * srcStride) - (3 * width)

        // ydst1 = x1
        // ydst2 = x11
        add             x11, x1,  x6                // x11 = ydst + lumStride
        lsl             x6,  x6,  #1                // lumStride *= 2
        sub             x6,  x6,  x4                // lumPadding = (2 * lumStride) - width

        sub             x7,  x7,  x4, lsr #1        // chromPadding = chromStride - (width / 2)

        // load rgb2yuv coefficients into v0, v1, and v2
        add             x15, x15, #RGB2YUV_COEFFS
        ld1             {v0.8h-v2.8h}, [x15]        // load 24 values

        // load offset constants
        movi            Y_OFFSET.8h,  #0x10, lsl #8
        movi            UV_OFFSET.8h, #0x80, lsl #8

1:      // outer loop: one pair of rows
        mov             w15, w4                     // w15 = width

2:      // inner loop: 16 pixels of both rows
        // load first line
        ld3             {v26.16b, v27.16b, v28.16b}, [x0], #48

        // widen first line to 16-bit
        uxtl            v16.8h, v26.8b              // v16 = B11
        uxtl            v17.8h, v27.8b              // v17 = G11
        uxtl            v18.8h, v28.8b              // v18 = R11
        uxtl2           v19.8h, v26.16b             // v19 = B12
        uxtl2           v20.8h, v27.16b             // v20 = G12
        uxtl2           v21.8h, v28.16b             // v21 = R12

        // calculate Y values for first line
        rgbconv16       v24, v16, v17, v18, BY, GY, RY, #7 // v24 = Y11
        rgbconv16       v25, v19, v20, v21, BY, GY, RY, #7 // v25 = Y12

        // load second line
        ld3             {v26.16b, v27.16b, v28.16b}, [x10], #48

        // pairwise add and save rgb values to calculate average
        addp            v5.8h, v16.8h, v19.8h
        addp            v6.8h, v17.8h, v20.8h
        addp            v7.8h, v18.8h, v21.8h

        // widen second line to 16-bit
        uxtl            v16.8h, v26.8b              // v16 = B21
        uxtl            v17.8h, v27.8b              // v17 = G21
        uxtl            v18.8h, v28.8b              // v18 = R21
        uxtl2           v19.8h, v26.16b             // v19 = B22
        uxtl2           v20.8h, v27.16b             // v20 = G22
        uxtl2           v21.8h, v28.16b             // v21 = R22

        // calculate Y values for second line
        rgbconv16       v26, v16, v17, v18, BY, GY, RY, #7 // v26 = Y21
        rgbconv16       v27, v19, v20, v21, BY, GY, RY, #7 // v27 = Y22

        // pairwise add rgb values to calculate average
        addp            v16.8h, v16.8h, v19.8h
        addp            v17.8h, v17.8h, v20.8h
        addp            v18.8h, v18.8h, v21.8h

        // calculate sum of r, g, b components in 2x2 blocks
        add             v16.8h, v16.8h, v5.8h
        add             v17.8h, v17.8h, v6.8h
        add             v18.8h, v18.8h, v7.8h

        // calculate U and V values
        // (the inputs are sums of four pixels, hence two more shift bits
        // than for Y)
        rgbconv16       v28, v16, v17, v18, BU, GU, RU, #9 // v28 = U
        rgbconv16       v29, v16, v17, v18, BV, GV, RV, #9 // v29 = V

        // add offsets and narrow all values
        // addhn keeps the high byte of each 16-bit sum, i.e.
        // (value + offset) >> 8 with offset 0x1000 for Y and 0x8000 for U/V
        addhn           v24.8b, v24.8h, Y_OFFSET.8h
        addhn           v25.8b, v25.8h, Y_OFFSET.8h
        addhn           v26.8b, v26.8h, Y_OFFSET.8h
        addhn           v27.8b, v27.8h, Y_OFFSET.8h
        addhn           v28.8b, v28.8h, UV_OFFSET.8h
        addhn           v29.8b, v29.8h, UV_OFFSET.8h

        // the stores below do not touch the flags, so b.gt still tests
        // the result of this subtraction
        subs            w15, w15, #16

        // store output
        st1             {v24.8b, v25.8b}, [x1], #16 // store ydst1
        st1             {v26.8b, v27.8b}, [x11], #16 // store ydst2
        st1             {v28.8b}, [x2], #8          // store udst
        st1             {v29.8b}, [x3], #8          // store vdst

        b.gt            2b

        // the adds below do not touch the flags, so b.gt still tests
        // the remaining row count
        subs            w5,  w5,  #2

        // row += 2
        add             x0,  x0,  x14               // src1  += srcPadding
        add             x10, x10, x14               // src2  += srcPadding
        add             x1,  x1,  x6                // ydst1 += lumPadding
        add             x11, x11, x6                // ydst2 += lumPadding
        add             x2,  x2,  x7                // udst  += chromPadding
        add             x3,  x3,  x7                // vdst  += chromPadding
        b.gt            1b

        ret
endfunc

// void ff_rgb24tobgr24_neon(const uint8_t *src, uint8_t *dst, int src_size);
//
// Copies src to dst while swapping the first and third byte of every 3-byte
// pixel. src_size is a byte count.
// NOTE(review): a src_size that is not a multiple of 3 makes the scalar tail
// handle one more whole pixel than fits - presumably callers always pass
// whole pixels; confirm.
// Clobbers v0-v4, w3-w5.
function ff_rgb24tobgr24_neon, export=1
        // x0 = src, x1 = dst, w2 = src_size (bytes)

        // Fast path: 48 bytes (16 pixels) per iteration
        subs            w2, w2, #48
        b.lt            2f
1:
        ld3             {v0.16b, v1.16b, v2.16b}, [x0], #48 // v0/v1/v2 = byte 0/1/2 of each pixel
        subs            w2, w2, #48
        // st3 needs three consecutive registers; v2 already holds the new
        // first byte, so only two independent copies are required
        mov             v3.16b, v1.16b
        mov             v4.16b, v0.16b
        st3             {v2.16b, v3.16b, v4.16b}, [x1], #48
        b.ge            1b
2:
        add             w2, w2, #48                 // undo the loop bias: w2 = bytes left (0-47)
        // Medium path: 24 bytes (8 pixels)
        cmp             w2, #24
        b.lt            3f
        ld3             {v0.8b, v1.8b, v2.8b}, [x0], #24
        sub             w2, w2, #24
        mov             v3.8b, v1.8b
        mov             v4.8b, v0.8b
        st3             {v2.8b, v3.8b, v4.8b}, [x1], #24
3:
        // Scalar tail: 3 bytes (1 pixel) at a time
        cmp             w2, #3
        b.lt            4f
5:
        // all three bytes are loaded before any store
        ldrb            w4, [x0, #1]
        ldrb            w5, [x0, #2]
        ldrb            w3, [x0], #3
        subs            w2, w2, #3
        strb            w4, [x1, #1]
        strb            w3, [x1, #2]
        strb            w5, [x1], #3
        b.gt            5b
4:
        ret
endfunc

// void ff_rgb32tobgr24_neon(const uint8_t *src, uint8_t *dst, int src_size);
//
// Copies the first three bytes of every 4-byte source pixel to dst and drops
// the fourth. src_size is the source byte count.
// NOTE(review): a src_size that is not a multiple of 4 makes the scalar tail
// handle one more whole pixel than fits - presumably callers always pass
// whole pixels; confirm.
// Clobbers v0-v6, v16-v18, x3, w4.
function ff_rgb32tobgr24_neon, export=1
        // x0 = src (BGRA), x1 = dst (BGR), w2 = src_size (bytes)

        // Load 3 tbl permutation masks for 2-register sliding window
        movrel          x3, rgb32tobgr24_tbl
        ld1             {v16.16b, v17.16b, v18.16b}, [x3]

        // Fast path: 64 bytes input (16 pixels) -> 48 bytes output
        // Uses ldp+tbl(2-reg sliding window)+stp to avoid expensive ld4/st3.
        // Post-indexed addressing eliminates pointer-advance instructions.
        // subs placed between loads and tbl to fill load-latency gap on
        // in-order cores (A55).
        subs            w2, w2, #64
        b.lt            2f
1:
        ldp             q0, q1, [x0], #64
        // x0 already points past this block, so the second half of the
        // block is at a negative offset
        ldp             q2, q3, [x0, #-32]
        subs            w2, w2, #64
        tbl             v4.16b, {v0.16b, v1.16b}, v16.16b
        tbl             v5.16b, {v1.16b, v2.16b}, v17.16b
        tbl             v6.16b, {v2.16b, v3.16b}, v18.16b
        stp             q4, q5, [x1], #48
        // x1 was advanced by the full 48 output bytes; the last 16 of them
        // are written here
        str             q6, [x1, #-16]
        b.ge            1b
2:
        add             w2, w2, #64                 // undo the loop bias: w2 = bytes left (0-63)
        // Medium path: 32 bytes input (8 pixels) -> 24 bytes output
        cmp             w2, #32
        b.lt            3f
        ld4             {v0.8b, v1.8b, v2.8b, v3.8b}, [x0], #32
        sub             w2, w2, #32
        st3             {v0.8b, v1.8b, v2.8b}, [x1], #24 // v3 (fourth byte of each pixel) is not stored
3:
        // Scalar tail: 4 bytes -> 3 bytes at a time
        // Uses word load + halfword/byte stores to reduce instructions.
        // On LE: ldr gives A<<24|R<<16|G<<8|B; strh stores [B,G]; lsr+strb stores R.
        // subs and lsr fill load-use latency. lsr uses a fresh register so
        // strh and strb can issue independently; add advances x1 off critical path.
        cmp             w2, #4
        b.lt            4f
5:
        ldr             w3, [x0], #4
        subs            w2, w2, #4                  // flags survive until b.gt: nothing below sets them
        lsr             w4, w3, #16
        strh            w3, [x1]
        strb            w4, [x1, #2]
        add             x1, x1, #3
        b.gt            5b
4:
        ret
endfunc

// void ff_rgb24tobgr32_neon(const uint8_t *src, uint8_t *dst, int src_size);
//
// Expands every 3-byte source pixel to 4 bytes by appending a 0xFF byte.
// src_size is the source byte count.
// NOTE(review): a src_size that is not a multiple of 3 makes the scalar tail
// handle one more whole pixel than fits - presumably callers always pass
// whole pixels; confirm.
// Clobbers v0-v7, v16-v20, x3, w4, w5.
function ff_rgb24tobgr32_neon, export=1
        // x0 = src (BGR), x1 = dst (BGRA), w2 = src_size (bytes)

        // Load tbl permutation indices and alpha mask for the fast path
        movrel          x3, rgb24tobgr32_tbl
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x3]
        movi            v20.4s, #255, lsl #24             // Alpha mask: 00 00 00 FF per pixel

        // Fast path: 48 bytes input (16 pixels) -> 64 bytes output
        // Uses ldp+tbl+orr+stp to avoid expensive ld3/st4 structure load/stores.
        // tbl produces 0 for alpha positions; orr fills in 0xFF.
        // Post-indexed addressing eliminates pointer-advance instructions.
        // tbl/orr interleaved so each orr starts as soon as its tbl result
        // is ready, hiding latency on narrow in-order cores (A55).
        subs            w2, w2, #48
        b.lt            2f
1:
        ldp             q0, q1, [x0], #48
        // x0 already points past this block; its last 16 bytes are at a
        // negative offset
        ldr             q2, [x0, #-16]
        subs            w2, w2, #48
        tbl             v4.16b, {v0.16b}, v16.16b
        tbl             v5.16b, {v0.16b, v1.16b}, v17.16b
        orr             v4.16b, v4.16b, v20.16b
        tbl             v6.16b, {v1.16b, v2.16b}, v18.16b
        orr             v5.16b, v5.16b, v20.16b
        tbl             v7.16b, {v2.16b}, v19.16b
        orr             v6.16b, v6.16b, v20.16b
        stp             q4, q5, [x1], #64
        orr             v7.16b, v7.16b, v20.16b
        // x1 was advanced by the full 64 output bytes; the second half is
        // written here
        stp             q6, q7, [x1, #-32]
        b.ge            1b
2:
        add             w2, w2, #48                       // undo the loop bias: w2 = bytes left (0-47)
        // Medium path: 24 bytes input (8 pixels) -> 32 bytes output
        cmp             w2, #24
        b.lt            3f
        movi            v3.8b, #255                       // fourth byte of every output pixel
        ld3             {v0.8b, v1.8b, v2.8b}, [x0], #24
        sub             w2, w2, #24
        st4             {v0.8b, v1.8b, v2.8b, v3.8b}, [x1], #32
3:
        // Scalar tail: 3 bytes -> 4 bytes at a time
        // Uses halfword+byte loads, orr to combine with alpha, word store.
        // On LE: ldrh gives G<<8|B, ldrb gives R; orr assembles 0xFF<<24|R<<16|G<<8|B;
        // str stores [B,G,R,0xFF]. subs and add placed between loads and first
        // orr to fill load-use latency on A55.
        cmp             w2, #3
        b.lt            4f
5:
        ldrh            w4, [x0]
        ldrb            w5, [x0, #2]
        add             x0, x0, #3
        subs            w2, w2, #3                        // flags survive until b.gt: nothing below sets them
        orr             w4, w4, w5, lsl #16
        orr             w4, w4, #0xFF000000
        str             w4, [x1], #4
        b.gt            5b
4:
        ret
endfunc

// void ff_interleave_bytes_neon(const uint8_t *src1, const uint8_t *src2,
//                               uint8_t *dest, int width, int height,
//                               int src1Stride, int src2Stride, int dstStride);
//
// For every row writes dest[2 * i] = src1[i] and dest[2 * i + 1] = src2[i]
// for i in [0, width). Each row is split into 16-byte chunks followed by
// at most one 8-byte chunk, one 4-byte chunk and up to three single bytes.
function ff_interleave_bytes_neon, export=1
// x0 = src1, x1 = src2, x2 = dest, w3 = width, w4 = height
// w5 = src1Stride, w6 = src2Stride, w7 = dstStride
        // Turn the strides into the distance from the end of one row to the
        // start of the next; a row emits 2 * width bytes to dest.
        sub             w7,  w7,  w3, lsl #1
        sub             w6,  w6,  w3
        sub             w5,  w5,  w3
1:                                              // start of a row
        ands            w11, w3,  #0xfffffff0   // bytes handled in 16-byte chunks
        b.eq            3f
2:
        ld1             {v0.16b}, [x0], #16
        ld1             {v1.16b}, [x1], #16
        subs            w11, w11, #16
        st2             {v0.16b, v1.16b}, [x2], #32
        b.gt            2b

        tst             w3,  #15                // nothing left in this row?
        b.eq            7f
3:
        tbz             w3,  #3,  4f            // 8-byte chunk present?
        ld1             {v0.8b}, [x0], #8
        ld1             {v1.8b}, [x1], #8
        st2             {v0.8b, v1.8b}, [x2], #16
4:
        tbz             w3,  #2,  5f            // 4-byte chunk present?
        ld1             {v0.s}[0], [x0], #4
        ld1             {v1.s}[0], [x1], #4
        zip1            v0.8b,   v0.8b,   v1.8b
        st1             {v0.8b}, [x2], #8
5:
        ands            w11, w3,  #3            // 0-3 trailing bytes
        b.eq            7f
6:
        ldrb            w12, [x0], #1
        ldrb            w13, [x1], #1
        subs            w11, w11, #1
        orr             w12, w12, w13, lsl #8   // src1 byte low, src2 byte high
        strh            w12, [x2], #2
        b.gt            6b
7:                                              // end of a row
        subs            w4,  w4,  #1
        b.eq            8f
        add             x0,  x0,  w5, sxtw
        add             x1,  x1,  w6, sxtw
        add             x2,  x2,  w7, sxtw
        b               1b
8:
        ret
endfunc

// void ff_deinterleave_bytes_neon(const uint8_t *src, uint8_t *dst1, uint8_t *dst2,
//                                 int width, int height, int srcStride,
//                                 int dst1Stride, int dst2Stride);
//
// For every row writes dst1[i] = src[2 * i] and dst2[i] = src[2 * i + 1]
// for i in [0, width). Each row is split into chunks of 16 output bytes,
// then at most one chunk of 8, one chunk of 4 and up to three single bytes.
function ff_deinterleave_bytes_neon, export=1
// x0 = src, x1 = dst1, x2 = dst2, w3 = width, w4 = height
// w5 = srcStride, w6 = dst1Stride, w7 = dst2Stride
        // Turn the strides into the distance from the end of one row to the
        // start of the next; a row consumes 2 * width bytes of src.
        sub             w7,  w7,  w3
        sub             w6,  w6,  w3
        sub             w5,  w5,  w3, lsl #1
1:                                              // start of a row
        ands            w11, w3,  #0xfffffff0   // outputs handled in chunks of 16
        b.eq            3f
2:
        ld2             {v0.16b, v1.16b}, [x0], #32
        subs            w11, w11, #16
        st1             {v0.16b}, [x1], #16
        st1             {v1.16b}, [x2], #16
        b.gt            2b

        tst             w3,  #15                // nothing left in this row?
        b.eq            7f
3:
        tbz             w3,  #3,  4f            // chunk of 8 present?
        ld2             {v0.8b, v1.8b}, [x0], #16
        st1             {v0.8b}, [x1], #8
        st1             {v1.8b}, [x2], #8
4:
        tbz             w3,  #2,  5f            // chunk of 4 present?
        ld1             {v0.8b}, [x0], #8
        uzp1            v2.8b,   v0.8b,   v0.8b // even bytes -> dst1
        uzp2            v1.8b,   v0.8b,   v0.8b // odd bytes  -> dst2
        st1             {v2.s}[0], [x1], #4
        st1             {v1.s}[0], [x2], #4
5:
        ands            w11, w3,  #3            // 0-3 trailing outputs
        b.eq            7f
6:
        ldrh            w12, [x0], #2
        subs            w11, w11, #1
        lsr             w13, w12, #8            // second byte of the pair
        strb            w12, [x1], #1
        strb            w13, [x2], #1
        b.gt            6b
7:                                              // end of a row
        subs            w4,  w4,  #1
        b.eq            8f
        add             x0,  x0,  w5, sxtw
        add             x1,  x1,  w6, sxtw
        add             x2,  x2,  w7, sxtw
        b               1b
8:
        ret
endfunc

// neon_shuf emits ff_shuffle_bytes_<shuf>_neon, which permutes the bytes of
// every 4-byte group according to shuf_<shuf>_tbl.
// x0 = src, x1 = dst, w2 = size in bytes. Bits 0 and 1 of the size are
// ignored: the data is handled as 16-byte chunks, then at most one 8-byte
// and one 4-byte chunk.
.macro neon_shuf shuf
function ff_shuffle_bytes_\shuf\()_neon, export=1
        movrel          x9, shuf_\shuf\()_tbl
        ld1             {v1.16b}, [x9]              // v1 = permutation indices
        and             w5, w2, #~15                // bytes handled 16 at a time
        cbz             w5, 2f
1:
        ld1             {v0.16b}, [x0], #16
        subs            w5, w5, #16
        tbl             v0.16b, {v0.16b}, v1.16b
        st1             {v0.16b}, [x1], #16
        b.gt            1b
2:
        tbz             w2, #3, 3f                  // 8-byte chunk present?
        ld1             {v0.8b}, [x0], #8
        tbl             v0.8b, {v0.16b}, v1.8b
        st1             {v0.8b}, [x1], #8
3:
        tbz             w2, #2, 4f                  // 4-byte chunk present?
.if \shuf == 3102 || \shuf == 2013 || \shuf == 1203 || \shuf == 2130
        // these permutations go through the table for the last group too
        ld1             {v0.s}[0], [x0]
        tbl             v0.8b, {v0.16b}, v1.8b
        st1             {v0.s}[0], [x1]
.else
        // the remaining permutations are a byte reversal and/or a rotation
        // of the 32-bit word
        ldr             w5, [x0]
.if \shuf == 0321
        rev             w5, w5
        ror             w5, w5, #24
.endif
.if \shuf == 1230
        ror             w5, w5, #8
.endif
.if \shuf == 2103
        rev             w5, w5
        ror             w5, w5, #8
.endif
.if \shuf == 3012
        ror             w5, w5, #24
.endif
.if \shuf == 3210
        rev             w5, w5
.endif
        str             w5, [x1]
.endif
4:
        ret
endfunc
.endm

neon_shuf 0321
neon_shuf 1230
neon_shuf 2103
neon_shuf 3012
neon_shuf 3102
neon_shuf 2013
neon_shuf 1203
neon_shuf 2130
neon_shuf 3210

/*
Register usage of the interleaved (uyvy/yuyv) to planar conversion macros below:
v0-v7 - two consecutive lines
x0 - upper Y destination
x1 - U destination
x2 - V destination
x3 - upper src line
w4 - width in pixels
w5 - line iteration counter (initially the height) - count of line pairs for yuv420, of single lines for 422
x6 - lum padding
x7 - chrom padding
x8 - src padding
w9 - width remaining in the tail of the line, in pixels (two source bytes per pixel)
x10 - lower Y destination
w12 - tmp
x13 - lower src line
w14 - tmp, also the main loop counter of the fast path
w17 - set to 1 if last line has to be handled separately (odd height)
*/

// one fast path iteration processes 16 uyvy tuples
// (64 source bytes, i.e. 32 luma and 16 U/V samples per line)
// is_line_tail is set to 1 when final 16 tuples are being processed
// skip_storing_chroma is set to 1 when final line is processed and the height is odd
//
// After the ld4 the four bytes of every tuple sit in separate registers:
// for uyvy v0 = U, v1 = Y0, v2 = V, v3 = Y1; for yuyv v0 = Y0, v1 = U,
// v2 = Y1, v3 = V. For yuv420 the lower line is loaded the same way into
// v4-v7. When is_line_tail is 0 the macro leaves the flags of
// "subs w14, w14, #32" for the caller; nothing after it changes the flags.
.macro fastpath_iteration src_fmt, dst_fmt, is_line_tail, skip_storing_chroma
        ld4             {v0.16b - v3.16b}, [x3], #64
.if ! \is_line_tail
        subs            w14, w14, #32
.endif

.if ! \skip_storing_chroma
.ifc \dst_fmt, yuv420
        ld4             {v4.16b - v7.16b}, [x13], #64
.endif

.ifc \dst_fmt, yuv420                                    // store UV
.ifc \src_fmt, uyvy
        uhadd           v0.16b, v4.16b, v0.16b            // halving sum of U
        uhadd           v2.16b, v6.16b, v2.16b            // halving sum of V
.else
        uhadd           v1.16b, v5.16b, v1.16b            // halving sum of U
        uhadd           v3.16b, v7.16b, v3.16b            // halving sum of V
.endif
.endif

        // store V to x2 and U to x1 (for yuv420 these are the averages
        // computed above)
.ifc \src_fmt, uyvy
        st1             {v2.16b}, [x2], #16
        st1             {v0.16b}, [x1], #16
.else
        st1             {v3.16b}, [x2], #16
        st1             {v1.16b}, [x1], #16
.endif

        // st2 needs two consecutive registers, so the first luma register
        // is copied next to the second one before re-interleaving
.ifc \dst_fmt, yuv420                                    // store_y
.ifc \src_fmt, uyvy
        mov             v6.16b, v5.16b
        st2             {v6.16b,v7.16b}, [x10], #32
.else
        mov             v5.16b, v4.16b
        st2             {v5.16b,v6.16b}, [x10], #32
.endif
.endif

.endif // ! \skip_storing_chroma

        // luma of the upper line, same copy-then-st2 pattern
.ifc \src_fmt, uyvy
        mov             v2.16b, v1.16b
        st2             {v2.16b,v3.16b}, [x0], #32
.else
        mov             v1.16b, v0.16b
        st2             {v1.16b,v2.16b}, [x0], #32
.endif
.endm

// Rewind all pointers so that one more fast path iteration covers exactly
// the last 32 pixels of the line: each pointer is moved forward by the
// remaining width in w9 (scaled to its own element size) and back by the
// size of one iteration.
// With is_final_odd_line set only the upper source line and the upper luma
// row are adjusted; otherwise w14 is clobbered.
.macro fastpath_shift_back_pointers src_fmt, dst_fmt, is_final_odd_line
        sub             x3, x3, #64
        add             x3, x3, w9, sxtw #1               // 2 source bytes per pixel
        sub             x0, x0, #32
        add             x0, x0, w9, sxtw                  // 1 luma byte per pixel
.if ! \is_final_odd_line
.ifc \dst_fmt, yuv420
        sub             x13, x13, #64
        add             x13, x13, w9, sxtw #1
        sub             x10, x10, #32
        add             x10, x10, w9, sxtw
.endif
        asr             w14, w9, #1                       // 1 chroma byte per 2 pixels
        sub             x1, x1, #16
        add             x1, x1, w14, sxtw
        sub             x2, x2, #16
        add             x2, x2, w14, sxtw
.endif
.endm

// One slow path iteration converts a single 4-byte tuple (two pixels) with
// byte loads and stores, using w12 and w14 as temporaries.
// yuv422: the two luma bytes go to x0, U to x1 and V to x2.
// yuv420: the same tuple of the lower line is read through x13; its luma
// goes to x10 and each stored chroma byte is (upper + lower) >> 1.
// With skip_storing_chroma set (yuv420 only) just the two luma bytes of the
// upper line are copied, reading every second source byte starting at x3.
// No instruction here changes the flags.
.macro slowpath_iteration src_fmt, dst_fmt, skip_storing_chroma
.ifc \dst_fmt, yuv422
.ifc \src_fmt, uyvy
        // source byte order: U Y0 V Y1
        ldrb            w12, [x3], #1
        ldrb            w14, [x3], #1
        strb            w12, [x1], #1
        strb            w14, [x0], #1
        ldrb            w12, [x3], #1
        ldrb            w14, [x3], #1
        strb            w12, [x2], #1
        strb            w14, [x0], #1
.else
        // source byte order: Y0 U Y1 V
        ldrb            w12, [x3], #1
        ldrb            w14, [x3], #1
        strb            w12, [x0], #1
        strb            w14, [x1], #1
        ldrb            w12, [x3], #1
        ldrb            w14, [x3], #1
        strb            w12, [x0], #1
        strb            w14, [x2], #1
.endif
.endif
.ifc \dst_fmt, yuv420
.ifc \src_fmt, uyvy
.if \skip_storing_chroma
        // luma only: x3 is stepped by 2 to skip the chroma bytes
        ldrb            w12, [x3], #2
        ldrb            w14, [x3], #2
        strb            w12, [x0], #1
        strb            w14, [x0], #1
.else
        // U: average of both lines
        ldrb            w12, [x3], #1
        ldrb            w14, [x13], #1
        add             w12, w12, w14
        lsr             w12, w12, #1
        strb            w12, [x1], #1
        // Y0 of both lines
        ldrb            w14, [x3], #1
        ldrb            w12, [x13], #1
        strb            w14, [x0], #1
        strb            w12, [x10], #1
        // V: average of both lines
        ldrb            w14, [x13], #1
        ldrb            w12, [x3], #1
        add             w12, w12, w14
        lsr             w12, w12, #1
        strb            w12, [x2], #1
        // Y1 of both lines
        ldrb            w14, [x3], #1
        ldrb            w12, [x13], #1
        strb            w14, [x0], #1
        strb            w12, [x10], #1
.endif
.else
.if \skip_storing_chroma
        // luma only: x3 is stepped by 2 to skip the chroma bytes
        ldrb            w12, [x3], #2
        ldrb            w14, [x3], #2
        strb            w12, [x0], #1
        strb            w14, [x0], #1
.else
        // Y0 of both lines
        ldrb            w12, [x3], #1
        ldrb            w14, [x13], #1
        strb            w12, [x0], #1
        strb            w14, [x10], #1
        // U: average of both lines
        ldrb            w12, [x3], #1
        ldrb            w14, [x13], #1
        add             w12, w12, w14
        lsr             w12, w12, #1
        strb            w12, [x1], #1
        // Y1 of both lines
        ldrb            w14, [x3], #1
        ldrb            w12, [x13], #1
        strb            w14, [x0], #1
        strb            w12, [x10], #1
        // V: average of both lines
        ldrb            w14, [x13], #1
        ldrb            w12, [x3], #1
        add             w12, w12, w14
        lsr             w12, w12, #1
        strb            w12, [x2], #1
.endif
.endif
.endif
.endm

// Add the per-line padding to every source and destination pointer once a
// line (yuv422) or a pair of lines (yuv420) has been converted.
// Only non-flag-setting adds are used, so flags set before the macro are
// still valid after it. The is_final_odd_line parameter is not used.
.macro move_pointers_to_next_line src_fmt, dst_fmt, is_final_odd_line
.ifc \dst_fmt, yuv420
        add             x13, x13, x8                      // lower src line
        add             x10, x10, x6                      // lower Y destination
.endif
        add             x3, x3, x8                        // upper src line
        add             x0, x0, x6                        // upper Y destination
        add             x2, x2, x7                        // V destination
        add             x1, x1, x7                        // U destination
.endm

// interleaved_yuv_to_planar emits ff_<src_fmt>to<dst_fmt>_neon.
// Register roles on entry, as used below: x0 = Y destination,
// x1 = U destination, x2 = V destination, x3 = src, w4 = width,
// w5 = height, w6 = lumStride, w7 = chromStride, [sp] = srcStride.
//
// Widths of 32 and more take the fast path: whole 32-pixel blocks, then one
// more block rewound so that it ends exactly at the line end. Smaller
// widths take the slow path, one 2-pixel tuple at a time.
// For yuv420 the lines are converted in pairs with the chroma averaged over
// both lines; the last line of an odd height only produces luma.
// NOTE(review): the slow path counts the width down in steps of 2 until it
// reaches exactly 0 - presumably callers never pass an odd or zero width;
// confirm.
.macro interleaved_yuv_to_planar src_fmt, dst_fmt
function ff_\src_fmt\()to\dst_fmt\()_neon, export=1
        sxtw            x6, w6
        sxtw            x7, w7
        ldrsw           x8, [sp]
        ands            w11, w4, #~31                     // choose between fast and slow path

.ifc \dst_fmt, yuv420
        add             x10, x0, x6                       // lower Y destination
        add             x13, x3, x8                       // lower src line
        add             x8, x8, x8                        // two src lines per iteration
        add             x6, x6, x6                        // two luma lines per iteration
        and             w17, w5, #1                       // odd height: extra luma-only line
        asr             w5, w5, #1                        // w5 = number of line pairs
.endif
        asr             w9, w4, #1
        sub             x8, x8, w4, sxtw #1               // src offset
        sub             x6, x6, w4, sxtw                  // lum offset
        sub             x7, x7, x9                        // chr offset

        b.eq            6f                                // flags are still those of the ands above

.ifc \dst_fmt, yuv420
        // height < 2: there is no complete line pair. Without this check the
        // pair loop would run once on a non-existent pair and then count w5
        // down from 0 through the whole 32-bit range.
        and             w9, w4, #31                       // the odd-line code at 9: needs the tail width
        cbz             w5, 9f
.endif
1:                                                        // fast path - the width is at least 32
        and             w14, w4, #~31                     // w14 is the main loop counter
        and             w9, w4, #31                       // w9 holds the remaining width, 0 to 31
2:
        fastpath_iteration \src_fmt, \dst_fmt, 0, 0
        b.ne            2b
        fastpath_shift_back_pointers \src_fmt, \dst_fmt, 0
        // the subs on w14 inside this iteration is harmless: its flags are
        // replaced by the subs below and w14 is reloaded at 1:
        fastpath_iteration \src_fmt, \dst_fmt, 0, 0
        subs            w5, w5, #1
        move_pointers_to_next_line \src_fmt, \dst_fmt
        b.ne            1b

.ifc \dst_fmt, yuv420                                    // handle the last line in case the height is odd
9:
        cbz             w17, 3f
        and             w14, w4, #~31
4:
        fastpath_iteration \src_fmt, \dst_fmt, 0, 1
        b.ne            4b
        fastpath_shift_back_pointers \src_fmt, \dst_fmt, 1
        fastpath_iteration \src_fmt, \dst_fmt, 1, 1
3:
.endif
        ret

6:                                                        // slow path - width is at most 31
.ifc \dst_fmt, yuv420
        // height < 2: skip the line-pair loop (same reason as in the fast
        // path); never taken when reached through the b.ne below, since
        // w5 is non-zero there
        cbz             w5, 0f
.endif
        and             w9, w4, #31
7:
        subs            w9, w9, #2
        slowpath_iteration \src_fmt, \dst_fmt, 0
        b.ne            7b
        subs            w5, w5, #1
        move_pointers_to_next_line \src_fmt, \dst_fmt
        b.ne            6b

.ifc \dst_fmt, yuv420
0:
        cbz             w17, 8f
        and             w9, w4, #31
.ifc \src_fmt, uyvy
        add             x3, x3, #1                        // luma is at the odd byte positions
.endif
5:
        subs            w9, w9, #2
        slowpath_iteration \src_fmt, \dst_fmt, 1
        b.ne            5b
8:
.endif
        ret
endfunc
.endm

interleaved_yuv_to_planar uyvy, yuv422
interleaved_yuv_to_planar uyvy, yuv420
interleaved_yuv_to_planar yuyv, yuv422
interleaved_yuv_to_planar yuyv, yuv420