/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/loongarch/loongson_asm.S"
#include "src/loongarch/loongson_util.S"

.macro PUSH_REG
    addi.d        sp, sp, -64
    fst.d         f24, sp, 0
    fst.d         f25, sp, 8
    fst.d         f26, sp, 16
    fst.d         f27, sp, 24
    fst.d         f28, sp, 32
    fst.d         f29, sp, 40
    fst.d         f30, sp, 48
    fst.d         f31, sp, 56
.endm

.macro POP_REG
    fld.d         f24, sp, 0
    fld.d         f25, sp, 8
    fld.d         f26, sp, 16
    fld.d         f27, sp, 24
    fld.d         f28, sp, 32
    fld.d         f29, sp, 40
    fld.d         f30, sp, 48
    fld.d         f31, sp, 56
    addi.d        sp, sp, 64
.endm

.macro malloc_space number
    li.w          t0, \number
    sub.d         sp, sp, t0
    addi.d        sp, sp, -64
    PUSH_REG
.endm

.macro free_space number
    POP_REG
    li.w          t0, \number
    add.d         sp, sp, t0
    addi.d        sp, sp, 64
.endm

.macro iwht4
    vadd.h        vr0, vr0, vr1
    vsub.h        vr4, vr2, vr3
    vsub.h        vr5, vr0, vr4
    vsrai.h       vr5, vr5, 1
    vsub.h        vr2, vr5, vr1
    vsub.h        vr1, vr5, vr3
    vadd.h        vr3, vr4, vr2
    vsub.h        vr0, vr0, vr1
.endm

.macro DST_ADD_W4 in0, in1, in2, in3, in4, in5
    vilvl.w       \in0, \in1, \in0    // 0 1 2 3 4 5 6 7 x ...
    vilvl.w       \in2, \in3, \in2    // 8 9 10 11 12 13 14 15 x ...
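    // (DST_ADD_W4 continues below: widen the packed dst bytes to 16 bit,
    // add the residual rows from \in4/\in5, saturate back to u8 and store
    // four 4-pixel rows)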
vsllwil.hu.bu \in0, \in0, 0 vsllwil.hu.bu \in2, \in2, 0 vadd.h \in0, \in4, \in0 vadd.h \in2, \in5, \in2 vssrani.bu.h \in2, \in0, 0 vstelm.w \in2, a0, 0, 0 vstelmx.w \in2, a0, a1, 1 vstelmx.w \in2, a0, a1, 2 vstelmx.w \in2, a0, a1, 3 .endm .macro VLD_DST_ADD_W4 in0, in1 vld vr0, a0, 0 vldx vr1, a0, a1 vld vr2, t2, 0 vldx vr3, t2, a1 DST_ADD_W4 vr0, vr1, vr2, vr3, \in0, \in1 .endm function inv_txfm_add_wht_wht_4x4_8bpc_lsx vld vr0, a2, 0 vld vr2, a2, 16 vxor.v vr20, vr20, vr20 vsrai.h vr0, vr0, 2 vsrai.h vr2, vr2, 2 vst vr20, a2, 0 vpickod.d vr1, vr0, vr0 vpickod.d vr3, vr2, vr2 vst vr20, a2, 16 iwht4 LSX_TRANSPOSE4x4_H vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, vr4, vr5 iwht4 vilvl.d vr4, vr1, vr0 vilvl.d vr5, vr3, vr2 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W4 vr4, vr5 endfunc const idct_coeffs, align=4 .word 2896, 2896*8, 1567, 3784 .word 799, 4017, 3406, 2276 .word 401, 4076, 3166, 2598 .word 1931, 3612, 3920, 1189 .word 201, 4091, 3035, 2751 .word 1751, 3703, 3857, 1380 .word 995, 3973, 3513, 2106 .word 2440, 3290, 4052, 601 endconst .macro vsrari_h_x4 in0, in1, in2, in3, out0, out1, out2, out3, shift vsrari.h \out0, \in0, \shift vsrari.h \out1, \in1, \shift vsrari.h \out2, \in2, \shift vsrari.h \out3, \in3, \shift .endm .macro vsrari_h_x8 in0, in1, in2, in3, in4, in5, in6, in7, out0, \ out1, out2, out3, out4, out5, out6, out7, shift vsrari.h \out0, \in0, \shift vsrari.h \out1, \in1, \shift vsrari.h \out2, \in2, \shift vsrari.h \out3, \in3, \shift vsrari.h \out4, \in4, \shift vsrari.h \out5, \in5, \shift vsrari.h \out6, \in6, \shift vsrari.h \out7, \in7, \shift .endm .macro vmulev_vmaddod_lsx in0, in1, in2, in3, out0, out1, sz vmulwev.w.h \out0, \in0, \in2 vmulwod.w.h \out1, \in0, \in2 vmaddwev.w.h \out0, \in1, \in3 vmaddwod.w.h \out1, \in1, \in3 .ifc \sz, .4h vilvl.w \out0, \out1, \out0 .else vilvl.w vr22, \out1, \out0 vilvh.w \out1, \out1, \out0 vor.v \out0, vr22, vr22 .endif .endm const idct_coeffs_h, align=4 .short 2896, 2896*8, 1567, 3784 .short 799, 4017, 3406, 2276 .short 401, 4076, 3166, 2598 .short 1931, 3612, 3920, 1189 .short 201, 4091, 3035, 2751 .short 1751, 3703, 3857, 1380 .short 995, 3973, 3513, 2106 .short 2440, 3290, 4052, 601 endconst const iadst4_coeffs, align=4 .word 1321, 3803, 2482, 3344 endconst .macro inv_dct4_lsx in0, in1, in2, in3, out0, out1, out2, out3, sz la.local t0, idct_coeffs_h vldrepl.h vr20, t0, 0 // 2896 vmulev_vmaddod_lsx \in0, \in2, vr20, vr20, vr16, vr18, \sz vneg.h vr21, vr20 vmulev_vmaddod_lsx \in0, \in2, vr20, vr21, vr17, vr19, \sz vssrarni.h.w vr18, vr16, 12 // t0 vssrarni.h.w vr19, vr17, 12 // t1 vldrepl.h vr20, t0, 4 // 1567 vldrepl.h vr21, t0, 6 // 3784 vmulev_vmaddod_lsx \in1, \in3, vr21, vr20, \in0, vr16, \sz vneg.h vr21, vr21 vmulev_vmaddod_lsx \in1, \in3, vr20, vr21, \in2, vr17, \sz vssrarni.h.w vr16, \in0, 12 // t3 vssrarni.h.w vr17, \in2, 12 // t2 vsadd.h \out0, vr18, vr16 vsadd.h \out1, vr19, vr17 vssub.h \out2, vr19, vr17 vssub.h \out3, vr18, vr16 .endm functionl inv_dct_4h_x4_lsx inv_dct4_lsx vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, .4h endfuncl functionl inv_dct_8h_x4_lsx inv_dct4_lsx vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, .8h endfuncl .macro inv_adst4_core_lsx in0, in1, in2, in3, out0, out1, out2, out3 vsub.w vr16, \in0, \in2 // in0-in2 vmul.w vr17, \in0, vr20 // in0*1321 vmul.w vr19, \in0, vr22 // in0*2482 vmul.w vr18, \in1, vr23 // in1*3344 vmadd.w vr17, \in2, vr21 // in0*1321+in2*3803 vmsub.w vr19, \in2, vr20 // in2*1321 vadd.w vr16, vr16, \in3 // in0-in2+in3 vmadd.w vr17, \in3, vr22 // in0*1321+in2*3803+in3*2482 vmsub.w vr19, \in3, 
vr21 // in0*2482-in2*1321-in3*3803 vadd.w vr15, vr17, vr19 vmul.w \out2, vr16, vr23 // out[2] 8 9 10 11 vadd.w \out0, vr17, vr18 // out[0] 0 1 2 3 vadd.w \out1, vr19, vr18 // out[1] 4 5 6 7 vsub.w \out3, vr15, vr18 // out[3] 12 13 14 15 .endm .macro inv_adst4_lsx in0, in1, in2, in3, out0, out1, out2, out3 la.local t0, iadst4_coeffs vldrepl.w vr20, t0, 0 // 1321 vldrepl.w vr21, t0, 4 // 3803 vldrepl.w vr22, t0, 8 // 2482 vldrepl.w vr23, t0, 12 // 3344 vsllwil.w.h vr0, \in0, 0 vsllwil.w.h vr1, \in1, 0 vsllwil.w.h vr2, \in2, 0 vsllwil.w.h vr3, \in3, 0 inv_adst4_core_lsx vr0, vr1, vr2, vr3, \out0, \out1, \out2, \out3 vssrarni.h.w \out0, \out0, 12 vssrarni.h.w \out1, \out1, 12 vssrarni.h.w \out2, \out2, 12 vssrarni.h.w \out3, \out3, 12 .endm functionl inv_adst_4h_x4_lsx inv_adst4_lsx vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3 endfuncl functionl inv_flipadst_4h_x4_lsx inv_adst4_lsx vr0, vr1, vr2, vr3, vr3, vr2, vr1, vr0 endfuncl .macro inv_adst_8x4_lsx in0, in1, in2, in3, out0, out1, out2, out3 la.local t0, iadst4_coeffs vldrepl.w vr20, t0, 0 // 1321 vldrepl.w vr21, t0, 4 // 3803 vldrepl.w vr22, t0, 8 // 2482 vldrepl.w vr23, t0, 12 // 3344 vsllwil.w.h vr10, \in0, 0 // in0 vsllwil.w.h vr11, \in1, 0 // in1 vsllwil.w.h vr12, \in2, 0 // in2 vsllwil.w.h vr13, \in3, 0 // in3 inv_adst4_core_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13 vexth.w.h \in0, \in0 // in0 vexth.w.h \in1, \in1 // in1 vexth.w.h \in2, \in2 // in2 vexth.w.h \in3, \in3 // in3 inv_adst4_core_lsx \in0, \in1, \in2, \in3, \out0, \out1, \out2, \out3 vssrarni.h.w \out0, vr10, 12 vssrarni.h.w \out1, vr11, 12 vssrarni.h.w \out2, vr12, 12 vssrarni.h.w \out3, vr13, 12 .endm functionl inv_adst_8h_x4_lsx inv_adst_8x4_lsx vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3 endfuncl functionl inv_flipadst_8h_x4_lsx inv_adst_8x4_lsx vr0, vr1, vr2, vr3, vr3, vr2, vr1, vr0 endfuncl functionl inv_identity_4h_x4_lsx li.w t0, 1697 vreplgr2vr.h vr20, t0 vilvl.d vr0, vr1, vr0 vilvl.d vr2, vr3, vr2 vmulwev.w.h vr16, vr0, vr20 vmulwod.w.h vr17, vr0, vr20 vmulwev.w.h vr18, vr2, vr20 vmulwod.w.h vr19, vr2, vr20 vilvl.w vr1, vr17, vr16 vilvh.w vr3, vr17, vr16 vilvl.w vr22, vr19, vr18 vilvh.w vr23, vr19, vr18 vssrarni.h.w vr3, vr1, 12 vssrarni.h.w vr23, vr22, 12 vsadd.h vr0, vr3, vr0 // t0 vsadd.h vr2, vr23, vr2 // t2 vilvh.d vr1, vr0, vr0 // t1 vilvh.d vr3, vr2, vr2 // t3 endfuncl .macro inv_identity4_lsx1 in0, in1, in2, out0, out1 vsllwil.w.h vr16, \in0, 0 vexth.w.h vr17, \in1 vmul.w vr18, vr16, \in2 vmul.w vr19, vr17, \in2 vsrari.w vr18, vr18, 12 vsrari.w vr19, vr19, 12 vadd.w \out0, vr18, vr16 vadd.w \out1, vr19, vr17 vssrarni.h.w \out1, \out0, 1 .endm functionl inv_identity_8h_x4_lsx li.w t0, 1697 vreplgr2vr.h vr20, t0 vmulwev.w.h vr16, vr0, vr20 vmulwod.w.h vr17, vr0, vr20 vmulwev.w.h vr18, vr1, vr20 vmulwod.w.h vr19, vr1, vr20 vilvl.w vr21, vr17, vr16 vilvh.w vr22, vr17, vr16 vilvl.w vr23, vr19, vr18 vilvh.w vr16, vr19, vr18 vssrarni.h.w vr22, vr21, 12 vssrarni.h.w vr16, vr23, 12 vsadd.h vr0, vr22, vr0 // t0 vsadd.h vr1, vr16, vr1 // t1 vmulwev.w.h vr16, vr2, vr20 vmulwod.w.h vr17, vr2, vr20 vmulwev.w.h vr18, vr3, vr20 vmulwod.w.h vr19, vr3, vr20 vilvl.w vr21, vr17, vr16 vilvh.w vr22, vr17, vr16 vilvl.w vr23, vr19, vr18 vilvh.w vr16, vr19, vr18 vssrarni.h.w vr22, vr21, 12 vssrarni.h.w vr16, vr23, 12 vsadd.h vr2, vr22, vr2 // t2 vsadd.h vr3, vr16, vr3 // t3 endfuncl functionl inv_identity_8h_x4_lsx1 li.w t0, 1697 vreplgr2vr.w vr20, t0 .irp i, vr0, vr1, vr2, vr3 inv_identity4_lsx1 \i, \i vr20, vr21, \i .endr endfuncl functionl inv_txfm_add_4x4_lsx vxor.v vr23, vr23, 
vr23 vld vr0, a2, 0 vld vr2, a2, 16 vilvh.d vr1, vr0, vr0 vilvh.d vr3, vr2, vr2 vst vr23, a2, 0 vst vr23, a2, 16 move t6, ra jirl ra, t7, 0 move ra, t6 LSX_TRANSPOSE4x4_H vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, vr4, vr5 move t6, ra jirl ra, t8, 0 move ra, t6 vilvl.d vr4, vr1, vr0 vilvl.d vr5, vr3, vr2 vsrari.h vr4, vr4, 4 vsrari.h vr5, vr5, 4 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W4 vr4, vr5 endfuncl .macro idct_dc w, h, shift ld.h t2, a2, 0 // dc vldi vr0, 0x8b5 // 181 vreplgr2vr.w vr1, t2 vldi vr20, 0x880 // 128 vmul.w vr2, vr0, vr1 // dc * 181 st.h zero, a2, 0 vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8 vld vr10, a0, 0 // 0 1 2 3 4 5 6 7 .if (2*\w == \h) || (2*\h == \w) vmul.w vr2, vr0, vr2 vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8 .endif .if \shift>0 vsrari.w vr2, vr2, \shift // (dc + rnd) >> shift .endif vldx vr11, a0, a1 // 8 9 10 11 12 13 14 15 alsl.d t2, a1, a0, 1 vmadd.w vr20, vr2, vr0 vld vr12, t2, 0 // 16 17 18 19 20 21 22 23 vssrarni.h.w vr20, vr20, 12 vldx vr13, t2, a1 // 24 25 26 27 28 29 30 31 .endm .macro fun4x4 txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_lsx .ifc \txfm1\()_\txfm2, dct_dct bnez a3, 1f idct_dc 4, 4, 0 DST_ADD_W4 vr10, vr11, vr12, vr13, vr20, vr20 b .\txfm1\()_\txfm2\()_4X4_END 1: .endif la.local t7, inv_\txfm1\()_4h_x4_lsx la.local t8, inv_\txfm2\()_4h_x4_lsx b inv_txfm_add_4x4_lsx .\txfm1\()_\txfm2\()_4X4_END: endfunc .endm fun4x4 dct, dct fun4x4 identity, identity fun4x4 adst, dct fun4x4 dct, adst fun4x4 adst, adst fun4x4 dct, flipadst fun4x4 flipadst, adst fun4x4 adst, flipadst fun4x4 flipadst, dct fun4x4 flipadst, flipadst fun4x4 dct, identity fun4x4 identity, dct fun4x4 flipadst, identity fun4x4 identity, flipadst fun4x4 identity, adst fun4x4 adst, identity const iadst8_coeffs_h, align=4 .short 4076, 401, 3612, 1931 .short 2598, 3166, 1189, 3920 .short 2896, 0, 1567, 3784, 0, 0, 0, 0 endconst .macro inv_adst8_lsx out0, out1, out2, out3, out4, out5, out6, out7, sz la.local t0, iadst8_coeffs_h vldrepl.h vr20, t0, 0 // 4076 vldrepl.h vr21, t0, 2 // 401 vmulev_vmaddod_lsx vr7, vr0, vr20, vr21, vr16, vr17, \sz vneg.h vr20, vr20 vmulev_vmaddod_lsx vr7, vr0, vr21, vr20, vr18, vr19, \sz vssrarni.h.w vr17, vr16, 12 // t0a vssrarni.h.w vr19, vr18, 12 // t1a vldrepl.h vr20, t0, 4 // 3612 vldrepl.h vr21, t0, 6 // 1931 vmulev_vmaddod_lsx vr5, vr2, vr20, vr21, vr0, vr16, \sz vneg.h vr20, vr20 vmulev_vmaddod_lsx vr5, vr2, vr21, vr20, vr7, vr18, \sz vssrarni.h.w vr16, vr0, 12 // t2a vssrarni.h.w vr18, vr7, 12 // t3a vldrepl.h vr20, t0, 8 // 2598 vldrepl.h vr21, t0, 10 // 3166 vmulev_vmaddod_lsx vr3, vr4, vr20, vr21, vr2, vr0, \sz vneg.h vr20, vr20 vmulev_vmaddod_lsx vr3, vr4, vr21, vr20, vr5, vr7, \sz vssrarni.h.w vr0, vr2, 12 // t4a vssrarni.h.w vr7, vr5, 12 // t5a vldrepl.h vr20, t0, 12 // 1189 vldrepl.h vr21, t0, 14 // 3920 vmulev_vmaddod_lsx vr1, vr6, vr20, vr21, vr3, vr2, \sz vneg.h vr20, vr20 vmulev_vmaddod_lsx vr1, vr6, vr21, vr20, vr4, vr5, \sz vssrarni.h.w vr2, vr3, 12 // t6a vssrarni.h.w vr5, vr4, 12 // t7a vsadd.h vr3, vr17, vr0 // t0 vssub.h vr4, vr17, vr0 // t4 vsadd.h vr1, vr19, vr7 // t1 vssub.h vr6, vr19, vr7 // t5 vsadd.h vr17, vr16, vr2 // t2 vssub.h vr19, vr16, vr2 // t6 vsadd.h vr0, vr18, vr5 // t3 vssub.h vr7, vr18, vr5 // t7 la.local t0, idct_coeffs_h vldrepl.h vr20, t0, 4 // 1567 vldrepl.h vr21, t0, 6 // 3784 vmulev_vmaddod_lsx vr4, vr6, vr21, vr20, vr16, vr5, \sz vneg.h vr21, vr21 vmulev_vmaddod_lsx vr4, vr6, vr20, vr21, vr18, vr2, \sz vssrarni.h.w vr5, vr16, 12 // t4a vssrarni.h.w vr2, vr18, 12 // t5a vneg.h vr21, vr21 
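    // the same 1567/3784 rotation is now applied to t6/t7 (vr19/vr7),
    // producing t6a/t7a for the final butterflies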
vmulev_vmaddod_lsx vr7, vr19, vr20, vr21, vr4, vr16, \sz vneg.h vr20, vr20 vmulev_vmaddod_lsx vr7, vr19, vr21, vr20, vr6, vr18, \sz vssrarni.h.w vr16, vr4, 12 // t7a vssrarni.h.w vr18, vr6, 12 // t6a vsadd.h vr4, vr5, vr18 // out1 vssub.h vr19, vr5, vr18 // t6 vsadd.h vr20, vr1, vr0 // out7 vssub.h vr18, vr1, vr0 // t3 vsadd.h \out0, vr3, vr17 // out0 vssub.h vr5, vr3, vr17 // t2 vsadd.h \out6, vr2, vr16 // out6 vssub.h vr23, vr2, vr16 // t7 vsllwil.w.h vr3, vr20, 0 // out7 vexth.w.h \out7, vr20 // out7 vsllwil.w.h vr21, vr4, 0 // out1 vexth.w.h \out1, vr4 // out1 vneg.w vr3, vr3 vneg.w \out7, \out7 vneg.w vr21, vr21 vneg.w \out1, \out1 vssrarni.h.w \out7, vr3, 0 vssrarni.h.w \out1, vr21, 0 la.local t0, idct_coeffs_h vldrepl.h vr20, t0, 0 // 2896 vmulev_vmaddod_lsx vr5, vr18, vr20, vr20, vr16, \out3, \sz vneg.h vr21, vr20 vmulev_vmaddod_lsx vr5, vr18, vr20, vr21, vr17, \out4, \sz vsrari.w vr16, vr16, 12 vsrari.w \out3, \out3, 12 vneg.w vr16, vr16 vneg.w \out3, \out3 vssrarni.h.w \out3, vr16, 0 // out3 vssrarni.h.w \out4, vr17, 12 // out4 vmulev_vmaddod_lsx vr19, vr23, vr20, vr20, vr16, \out2, \sz vmulev_vmaddod_lsx vr19, vr23, vr20, vr21, vr17, \out5, \sz vssrarni.h.w \out2, vr16, 12 // out2 vsrari.w vr17, vr17, 12 vsrari.w \out5, \out5, 12 vneg.w vr17, vr17 vneg.w \out5, \out5 vssrarni.h.w \out5, vr17, 0 // out5 .endm functionl inv_adst_8h_x8_lsx inv_adst8_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, .8h endfuncl functionl inv_flipadst_8h_x8_lsx inv_adst8_lsx vr7, vr6, vr5, vr4, vr3, vr2, vr1, vr0, .8h endfuncl functionl inv_adst_4h_x8_lsx inv_adst8_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, .8h endfuncl functionl inv_flipadst_4h_x8_lsx inv_adst8_lsx vr7, vr6, vr5, vr4, vr3, vr2, vr1, vr0, .8h endfuncl .macro inv_dct8_lsx in0, in1, in2, in3, in4, in5, in6, in7, sz inv_dct4_lsx \in0, \in2, \in4, \in6, \in0, \in2, \in4, \in6, \sz la.local t0, idct_coeffs_h vldrepl.h vr20, t0, 8 // 799 vldrepl.h vr21, t0, 10 // 4017 vmulev_vmaddod_lsx \in1, \in7, vr21, vr20, vr16, vr17, \sz vneg.h vr21, vr21 vmulev_vmaddod_lsx \in1, \in7, vr20, vr21, vr18, vr19, \sz vssrarni.h.w vr17, vr16, 12 // t7a vssrarni.h.w vr19, vr18, 12 // t4a vldrepl.h vr20, t0, 12 // 3406 vldrepl.h vr21, t0, 14 // 2276 vmulev_vmaddod_lsx \in5, \in3, vr21, vr20, \in1, vr16, \sz vneg.h vr21, vr21 vmulev_vmaddod_lsx \in5, \in3, vr20, vr21, \in7, vr18, \sz vssrarni.h.w vr16, \in1, 12 // t6a vssrarni.h.w vr18, \in7, 12 // t5a vssub.h \in7, vr19, vr18 // t5a vsadd.h vr18, vr19, vr18 // t4 vssub.h \in5, vr17, vr16 // t6a vsadd.h vr16, vr17, vr16 // t7 vldrepl.h vr20, t0, 0 // 2896 vmulev_vmaddod_lsx \in5, \in7, vr20, vr20, \in1, vr17, \sz vneg.h vr21, vr20 vmulev_vmaddod_lsx \in5, \in7, vr20, vr21, vr23, vr19, \sz vssrarni.h.w vr17, \in1, 12 // t6 vssrarni.h.w vr19, vr23, 12 // t5 vssub.h \in7, \in0, vr16 //c[7] vsadd.h \in0, \in0, vr16 //c[0] vssub.h \in5, \in4, vr19 //c[5] vsadd.h vr23, \in4, vr19 //c[2] vssub.h \in4, \in6, vr18 //c[4] vsadd.h \in3, \in6, vr18 //c[3] vssub.h \in6, \in2, vr17 //c[6] vsadd.h \in1, \in2, vr17 //c[1] vor.v \in2, vr23, vr23 .endm functionl inv_dct_8h_x8_lsx inv_dct8_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, .8h endfuncl functionl inv_dct_4h_x8_lsx inv_dct8_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, .4h endfuncl .macro DST_ADD_W8 in0, in1, in2, in3, in4, in5, in6, in7 vsllwil.hu.bu vr0, \in0, 0 vsllwil.hu.bu vr1, \in1, 0 vsllwil.hu.bu vr2, \in2, 0 vsllwil.hu.bu vr3, \in3, 0 vadd.h vr0, \in4, vr0 vadd.h vr1, \in5, vr1 vadd.h vr2, \in6, vr2 vadd.h vr3, \in7, vr3 vssrani.bu.h vr1, vr0, 0 vssrani.bu.h vr3, 
vr2, 0 vstelm.d vr1, a0, 0, 0 vstelmx.d vr1, a0, a1, 1 vstelmx.d vr3, a0, a1, 0 vstelmx.d vr3, a0, a1, 1 .endm .macro VLD_DST_ADD_W8 in0, in1, in2, in3 vld vr0, a0, 0 vldx vr1, a0, a1 vld vr2, t2, 0 vldx vr3, t2, a1 DST_ADD_W8 vr0, vr1, vr2, vr3, \in0, \in1, \in2, \in3 .endm functionl inv_identity_8h_x8_lsx .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 vsadd.h \i, \i, \i .endr endfuncl functionl inv_identity_4h_x8_lsx .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 vsadd.h \i, \i, \i .endr endfuncl .macro def_fn_8x8_base variant functionl inv_txfm_\variant\()add_8x8_lsx vxor.v vr23, vr23, vr23 vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 .irp i, 0, 16, 32, 48, 64, 80, 96, 112 vst vr23, a2, \i .endr .ifc \variant, identity_ // The identity shl #1 and downshift srshr #1 cancel out b .itx_8x8_epilog .else move t6, ra jirl ra, t7, 0 move ra, t6 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 vsrari.h \i, \i, 1 .endr .itx_8x8_epilog: LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 move t6, ra jirl ra, t8, 0 move ra, t6 vsrari_h_x8 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, 4 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr16, vr17, vr18, vr19 add.d a0, a0, a1 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr20, vr21, vr22, vr23 .endif endfuncl .endm def_fn_8x8_base identity_ def_fn_8x8_base .macro fn8x8 txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_lsx .ifc \txfm1\()_\txfm2, dct_dct bnez a3, .NO_HAS_DCONLY_8x8 idct_dc 8, 8, 1 DST_ADD_W8 vr10, vr11, vr12, vr13, vr20, vr20, vr20, vr20 add.d a0, a1, a0 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr20, vr20, vr20, vr20 b .\txfm1\()_\txfm2\()_8X8_END .NO_HAS_DCONLY_8x8: .endif la.local t8, inv_\txfm2\()_8h_x8_lsx .ifc \txfm1, identity b inv_txfm_identity_add_8x8_lsx .else la.local t7, inv_\txfm1\()_8h_x8_lsx b inv_txfm_add_8x8_lsx .endif .\txfm1\()_\txfm2\()_8X8_END: endfunc .endm fn8x8 dct, dct fn8x8 identity, identity fn8x8 dct, adst fn8x8 dct, flipadst fn8x8 dct, identity fn8x8 adst, dct fn8x8 adst, adst fn8x8 adst, flipadst fn8x8 flipadst, dct fn8x8 flipadst, adst fn8x8 flipadst, flipadst fn8x8 identity, dct fn8x8 adst, identity fn8x8 flipadst, identity fn8x8 identity, adst fn8x8 identity, flipadst .macro rect2_lsx in0, in1, out0 vsllwil.w.h vr22, \in0, 0 // in1 vexth.w.h \in0, \in0 // in1 vmul.w vr22, vr22, \in1 vmul.w \out0, \in0, \in1 vssrarni.h.w \out0, vr22, 12 .endm .macro LSX_TRANSPOSE8x4_H in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ out2, out3, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5 vilvl.h \tmp0, \in1, \in0 vilvl.h \tmp1, \in3, \in2 vilvl.w \tmp2, \tmp1, \tmp0 vilvh.w \tmp3, \tmp1, \tmp0 vilvl.h \tmp0, \in5, \in4 vilvl.h \tmp1, \in7, \in6 vilvl.w \tmp4, \tmp1, \tmp0 vilvh.w \tmp5, \tmp1, \tmp0 vilvl.d \out0, \tmp4, \tmp2 vilvh.d \out1, \tmp4, \tmp2 vilvl.d \out2, \tmp5, \tmp3 vilvh.d \out3, \tmp5, \tmp3 .endm functionl inv_txfm_add_8x4_lsx vxor.v vr23, vr23, vr23 vld vr0, a2, 0 vld vr2, a2, 16 vld vr4, a2, 32 vld vr6, a2, 48 .irp i, 0, 16, 32, 48 vst vr23, a2, \i .endr li.w t0, 2896 vreplgr2vr.w vr23, t0 rect2_lsx vr0, vr23, vr0 rect2_lsx vr2, vr23, vr2 rect2_lsx vr4, vr23, vr4 rect2_lsx vr6, vr23, vr6 vilvh.d vr1, vr0, vr0 vilvh.d vr3, vr2, vr2 vilvh.d vr5, vr4, vr4 vilvh.d vr7, vr6, vr6 move t6, ra jirl ra, t7, 0 move ra, t6 LSX_TRANSPOSE8x4_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr0, vr1, \ vr2, vr3, vr16, vr17, vr18, vr19, vr20, vr21 move t6, ra jirl ra, t8, 0 move ra, t6 
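    // second transform (t8) has been applied; round by 4 and accumulate the
    // result into the 8x4 destination block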
vsrari_h_x4 vr0, vr1, vr2, vr3, vr16, vr17, vr18, vr19, 4 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr16, vr17, vr18, vr19 endfuncl .macro LSX_TRANSPOSE4x8_H in0, in1, in2, in3, out0, out1, out2, out3, out4, \ out5, out6, out7, tmp0, tmp1, tmp2, tmp3 vilvl.h \tmp0, \in1, \in0 vilvl.h \tmp1, \in3, \in2 vilvh.h \tmp2, \in1, \in0 vilvh.h \tmp3, \in3, \in2 vilvl.w \out0, \tmp1, \tmp0 vilvh.w \out2, \tmp1, \tmp0 vilvl.w \out4, \tmp3, \tmp2 vilvh.w \out6, \tmp3, \tmp2 vbsrl.v \out1, \out0, 8 vbsrl.v \out3, \out2, 8 vbsrl.v \out5, \out4, 8 vbsrl.v \out7, \out6, 8 vinsgr2vr.d \out0, zero, 1 vinsgr2vr.d \out2, zero, 1 vinsgr2vr.d \out4, zero, 1 vinsgr2vr.d \out6, zero, 1 .endm functionl inv_txfm_add_4x8_lsx vxor.v vr23, vr23, vr23 vld vr0, a2, 0 vld vr1, a2, 16 vld vr2, a2, 32 vld vr3, a2, 48 .irp i, 0, 16, 32, 48 vst vr23, a2, \i .endr li.w t0, 2896 vreplgr2vr.w vr23, t0 rect2_lsx vr0, vr23, vr0 rect2_lsx vr1, vr23, vr1 rect2_lsx vr2, vr23, vr2 rect2_lsx vr3, vr23, vr3 move t6, ra jirl ra, t7, 0 move ra, t6 LSX_TRANSPOSE4x8_H vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, vr4, vr5, \ vr6, vr7, vr16, vr17, vr18, vr19 move t6, ra jirl ra, t8, 0 move ra, t6 vilvl.d vr0, vr1, vr0 vilvl.d vr1, vr3, vr2 vilvl.d vr2, vr5, vr4 vilvl.d vr3, vr7, vr6 vsrari_h_x4 vr0, vr1, vr2, vr3, vr16, vr17, vr18, vr19, 4 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W4 vr16, vr17 add.d a0, a1, a0 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W4 vr18, vr19 endfuncl .macro fn8x4 txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_8x4_8bpc_lsx .ifc \txfm1()_\txfm2, dct_dct bnez a3, .NO_HAS_DCONLY_8x4 idct_dc 8, 4, 0 DST_ADD_W8 vr10, vr11, vr12, vr13, vr5, vr5, vr5, vr5 b .\txfm1\()_\txfm2\()_8X4_END .NO_HAS_DCONLY_8x4: .endif la.local t7, inv_\txfm1\()_4h_x8_lsx la.local t8, inv_\txfm2\()_8h_x4_lsx b inv_txfm_add_8x4_lsx .\txfm1\()_\txfm2\()_8X4_END: endfunc .endm fn8x4 dct, dct fn8x4 identity, identity fn8x4 dct, adst fn8x4 dct, flipadst fn8x4 dct, identity fn8x4 adst, dct fn8x4 adst, adst fn8x4 adst, flipadst fn8x4 flipadst, dct fn8x4 flipadst, adst fn8x4 flipadst, flipadst fn8x4 identity, dct fn8x4 adst, identity fn8x4 flipadst, identity fn8x4 identity, adst fn8x4 identity, flipadst .macro fn4x8 txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_4x8_8bpc_lsx .ifc \txfm1()_\txfm2, dct_dct bnez a3, .NO_HAS_DCONLY_4x8 idct_dc 4, 8, 0 DST_ADD_W4 vr10, vr11, vr12, vr13, vr20, vr20 add.d a0, a0, a1 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W4 vr5, vr5 b .\txfm1\()_\txfm2\()_4X8_END .NO_HAS_DCONLY_4x8: .endif la.local t7, inv_\txfm1\()_8h_x4_lsx la.local t8, inv_\txfm2\()_4h_x8_lsx b inv_txfm_add_4x8_lsx .\txfm1\()_\txfm2\()_4X8_END: endfunc .endm fn4x8 dct, dct fn4x8 identity, identity fn4x8 dct, adst fn4x8 dct, flipadst fn4x8 dct, identity fn4x8 adst, dct fn4x8 adst, adst fn4x8 adst, flipadst fn4x8 flipadst, dct fn4x8 flipadst, adst fn4x8 flipadst, flipadst fn4x8 identity, dct fn4x8 adst, identity fn4x8 flipadst, identity fn4x8 identity, adst fn4x8 identity, flipadst .macro inv_identity4_lsx_x2 in0, in1, in2, in3, in4, out0, out1 vsllwil.w.h vr4, \in0, 0 vexth.w.h vr5, \in0 vsllwil.w.h vr6, \in1, 0 vexth.w.h vr7, \in1 vmul.w vr4, vr4, \in2 vmul.w vr5, vr5, \in2 vmul.w vr6, vr6, \in2 vmul.w vr7, vr7, \in2 vssrarni.h.w vr5, vr4, 12 vssrarni.h.w vr7, vr6, 12 vsadd.h \out0, vr5, \in3 vsadd.h \out1, vr7, \in4 .endm .macro vmul_vmadd_w in0, in1, in2, in3, out0, out1 vsllwil.w.h vr22, \in0, 0 vexth.w.h vr23, \in0 vmul.w \out0, vr22, \in2 vmul.w \out1, vr23, \in2 vsllwil.w.h vr22, \in1, 0 vexth.w.h vr23, \in1 vmadd.w \out0, vr22, \in3 vmadd.w \out1, vr23, \in3 .endm .macro 
vmul_vmsub_w in0, in1, in2, in3, out0, out1 vsllwil.w.h vr22, \in0, 0 vexth.w.h vr23, \in0 vmul.w \out0, vr22, \in2 vmul.w \out1, vr23, \in2 vsllwil.w.h vr22, \in1, 0 vexth.w.h vr23, \in1 vmsub.w \out0, vr22, \in3 vmsub.w \out1, vr23, \in3 .endm .macro inv_dct16_lsx sz inv_dct8_lsx vr0, vr2, vr4, vr6, vr8, vr10, vr12, vr14, \sz la.local t0, idct_coeffs_h vldrepl.h vr20, t0, 16 // 401 vldrepl.h vr21, t0, 18 // 4076 vmulev_vmaddod_lsx vr1, vr15, vr21, vr20, vr16, vr17, \sz vneg.h vr21, vr21 vmulev_vmaddod_lsx vr1, vr15, vr20, vr21, vr18, vr19, \sz vssrarni.h.w vr17, vr16, 12 // t15a vssrarni.h.w vr19, vr18, 12 // t8a vldrepl.h vr20, t0, 20 // 3166 -> 1583 vldrepl.h vr21, t0, 22 // 2598 -> 1299 vmulev_vmaddod_lsx vr9, vr7, vr21, vr20, vr1, vr16, \sz vneg.h vr21, vr21 vmulev_vmaddod_lsx vr9, vr7, vr20, vr21, vr15, vr18, \sz vssrarni.h.w vr16, vr1, 12 // t14a vssrarni.h.w vr18, vr15, 12 // t9a vldrepl.h vr20, t0, 24 // 1931 vldrepl.h vr21, t0, 26 // 3612 vmulev_vmaddod_lsx vr5, vr11, vr21, vr20, vr7, vr1, \sz vneg.h vr21, vr21 vmulev_vmaddod_lsx vr5, vr11, vr20, vr21, vr9, vr15, \sz vssrarni.h.w vr1, vr7, 12 // t13a vssrarni.h.w vr15, vr9, 12 // t10a vldrepl.h vr20, t0, 28 // 3920 vldrepl.h vr21, t0, 30 // 1189 vmulev_vmaddod_lsx vr13, vr3, vr21, vr20, vr5, vr7, \sz vneg.h vr21, vr21 vmulev_vmaddod_lsx vr13, vr3, vr20, vr21, vr11, vr9, \sz vssrarni.h.w vr7, vr5, 12 // t12a vssrarni.h.w vr9, vr11, 12 // t11a vsadd.h vr5, vr19, vr18 // t8 vssub.h vr11, vr19, vr18 // t9 vssub.h vr3, vr9, vr15 // t10 vsadd.h vr13, vr9, vr15 // t11 vsadd.h vr18, vr7, vr1 // t12 vssub.h vr19, vr7, vr1 // t13 vssub.h vr9, vr17, vr16 // t14 vsadd.h vr15, vr17, vr16 // t15 vldrepl.h vr20, t0, 4 // 1567 vldrepl.h vr21, t0, 6 // 3784 vmulev_vmaddod_lsx vr9, vr11, vr21, vr20, vr1, vr16, \sz vneg.h vr21, vr21 vmulev_vmaddod_lsx vr9, vr11, vr20, vr21, vr7, vr17, \sz vssrarni.h.w vr16, vr1, 12 // t14a vssrarni.h.w vr17, vr7, 12 // t9a vneg.h vr21, vr21 vmulev_vmaddod_lsx vr19, vr3, vr21, vr20, vr9, vr1, \sz vneg.h vr21, vr21 vmulev_vmaddod_lsx vr19, vr3, vr20, vr21, vr11, vr7, \sz vneg.w vr1, vr1 vneg.w vr9, vr9 vssrarni.h.w vr7, vr11, 12 // t13a vssrarni.h.w vr1, vr9, 12 // t10a vsadd.h vr9, vr5, vr13 // t8a vssub.h vr11, vr5, vr13 // t11a vssub.h vr3, vr15, vr18 // t12a vsadd.h vr19, vr15, vr18 // t15a vsadd.h vr5, vr17, vr1 // t9 vssub.h vr13, vr17, vr1 // t10 vssub.h vr15, vr16, vr7 // t13 vsadd.h vr18, vr16, vr7 // t14 vldrepl.h vr20, t0, 0 // 2896 vmulev_vmaddod_lsx vr15, vr13, vr20, vr20, vr1, vr7, \sz vneg.h vr21, vr20 vmulev_vmaddod_lsx vr15, vr13, vr20, vr21, vr17, vr16, \sz vssrarni.h.w vr7, vr1, 12 // t13a vssrarni.h.w vr16, vr17, 12 // t10a vmulev_vmaddod_lsx vr3, vr11, vr20, vr20, vr13, vr23, \sz vmulev_vmaddod_lsx vr3, vr11, vr20, vr21, vr15, vr17, \sz vssrarni.h.w vr23, vr13, 12 // t12 vssrarni.h.w vr17, vr15, 12 // t11 vssub.h vr15, vr0, vr19 // c[15] vsadd.h vr0, vr0, vr19 // c[0] vsadd.h vr1, vr2, vr18 // c[1] vssub.h vr20, vr2, vr18 // c[14] vsadd.h vr2, vr4, vr7 // c[2] vssub.h vr13, vr4, vr7 // c[13] vsadd.h vr3, vr6, vr23 // c[3] vssub.h vr21, vr6, vr23 // c[12] vsadd.h vr4, vr8, vr17 // c[4] vssub.h vr11, vr8, vr17 // c[11] vsadd.h vr7, vr14, vr9 // c[7] vssub.h vr8, vr14, vr9 // c[8] vsadd.h vr6, vr12, vr5 // c[6] vssub.h vr9, vr12, vr5 // c[9] vsadd.h vr5, vr10, vr16 // c[5] vssub.h vr10, vr10, vr16 // c[10] vor.v vr14, vr20, vr20 vor.v vr12, vr21, vr21 .endm functionl inv_dct_8h_x16_lsx inv_dct16_lsx .8h endfuncl functionl inv_dct_4h_x16_lsx inv_dct16_lsx .4h endfuncl .macro VLD_DST_ADD_W4_x4 in0, 
in1, in2, in3, in4, in5, in6 ,in7 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W4 \in0, \in1 add.d a0, a1, a0 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W4 \in2, \in3 add.d a0, a1, a0 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W4 \in4, \in5 add.d a0, a1, a0 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W4 \in6, \in7 .endm .macro def_fn_4x16_base txfm functionl inv_txfm_\txfm\()add_4x16_lsx PUSH_REG blt a3, t5, 416f vld vr0, a2, 16 vld vr1, a2, 48 vld vr2, a2, 80 vld vr3, a2, 112 vxor.v vr23, vr23, vr23 .irp i, 16, 48, 80, 112 vst vr23, a2, \i .endr move t6, ra jirl ra, t7, 0 move ra, t6 .ifnc \txfm, identity_ vsrari.h vr0, vr0, 1 vsrari.h vr1, vr1, 1 vsrari.h vr2, vr2, 1 vsrari.h vr3, vr3, 1 .endif LSX_TRANSPOSE4x8_H vr0, vr1, vr2, vr3, vr8, vr9, vr24, vr25, vr26, \ vr27, vr14, vr28, vr10, vr11, vr12, vr13 416: ble t5, a3, 416416f .irp i, vr8, vr9, vr24, vr25, vr26, vr27, vr14, vr28 vxor.v \i, \i, \i .endr 416416: vld vr0, a2, 0 vld vr1, a2, 32 vld vr2, a2, 64 vld vr3, a2, 96 vxor.v vr23, vr23, vr23 .irp i, 0, 32, 64, 96 vst vr23, a2, \i .endr move t6, ra jirl ra, t7, 0 move ra, t6 .ifnc \txfm, identity_ vsrari.h vr0, vr0, 1 vsrari.h vr1, vr1, 1 vsrari.h vr2, vr2, 1 vsrari.h vr3, vr3, 1 .endif LSX_TRANSPOSE4x8_H vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, vr4, vr5, \ vr6, vr7, vr16, vr17, vr18, vr19 vor.v vr10, vr24, vr24 vor.v vr11, vr25, vr25 vor.v vr12, vr26, vr26 vor.v vr13, vr27, vr27 vor.v vr15, vr28, vr28 move t6, ra jirl ra, t8, 0 move ra, t6 vilvl.d vr16, vr1, vr0 vilvl.d vr17, vr3, vr2 vilvl.d vr18, vr5, vr4 vilvl.d vr19, vr7, vr6 vilvl.d vr20, vr9, vr8 vilvl.d vr21, vr11, vr10 vilvl.d vr22, vr13, vr12 vilvl.d vr23, vr15, vr14 .irp i, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 vsrari.h \i, \i, 4 .endr VLD_DST_ADD_W4_x4 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 POP_REG endfuncl .endm def_fn_4x16_base identity_ def_fn_4x16_base .macro fn4x16 txfm1, txfm2, eob_half function inv_txfm_add_\txfm1\()_\txfm2\()_4x16_8bpc_lsx .ifc \txfm1()_\txfm2, dct_dct bnez a3, .NO_HAS_DCONLY_4x16 idct_dc 4, 16, 1 DST_ADD_W4 vr10, vr11, vr12, vr13, vr5, vr5 .rept 3 add.d a0, a1, a0 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W4 vr5, vr5 .endr b .\txfm1\()_\txfm2\()_4X16_END .NO_HAS_DCONLY_4x16: .endif li.w t5, \eob_half la.local t7, inv_\txfm1\()_8h_x4_lsx .ifc \txfm1, identity la.local t7, inv_\txfm1\()_8h_x4_lsx1 .endif la.local t8, inv_\txfm2\()_4h_x16_lsx .ifc \txfm1, identity b inv_txfm_identity_add_4x16_lsx .else b inv_txfm_add_4x16_lsx .endif .\txfm1\()_\txfm2\()_4X16_END: endfunc .endm fn4x16 dct, dct, 29 fn4x16 identity, identity, 29 fn4x16 dct, adst, 29 fn4x16 dct, flipadst, 29 fn4x16 dct, identity, 8 fn4x16 adst, dct, 29 fn4x16 adst, adst, 29 fn4x16 adst, flipadst, 29 fn4x16 flipadst, dct, 29 fn4x16 flipadst, adst, 29 fn4x16 flipadst, flipadst, 29 fn4x16 identity, dct, 32 fn4x16 adst, identity, 8 fn4x16 flipadst, identity, 8 fn4x16 identity, adst, 32 fn4x16 identity, flipadst, 32 .macro inv_identity16_lsx in0, in1, in2, out0, sz .ifc \sz, .8h vsllwil.w.h vr16, \in0, 0 vexth.w.h vr17, \in0 vmul.w vr16, vr16, \in1 vmul.w vr17, vr17, \in1 vsadd.h \in2, \in2, \in2 vssrarni.h.w vr17, vr16, 11 vsadd.h \out0, vr17, \in2 .else vsllwil.w.h vr16, \in0, 0 vmul.w vr16, vr16, \in1 vsadd.h \in2, \in2, \in2 vssrarni.h.w vr16, vr16, 11 vsadd.h \out0, vr16, \in2 .endif .endm .macro inv_identity16_lsx1 in0, in1, in2, out0 vsllwil.w.h vr16, \in0, 0 vexth.w.h vr17, \in1 vmul.w vr18, vr16, \in2 vmul.w vr19, vr17, \in2 vsrari.w vr18, vr18, 11 vsrari.w vr19, vr19, 11 vslli.w vr16, vr16, 1 vslli.w vr17, vr17, 1 vadd.w vr16, vr18, vr16 vadd.w \out0, vr19, vr17 
vssrarni.h.w \out0, vr16, 1 .endm functionl inv_identity_8h_x16_lsx li.w t0, 1697 vreplgr2vr.w vr20, t0 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr8, \ vr9, vr10, vr11, vr12, vr13, vr14, vr15 inv_identity16_lsx \i, vr20, \i, \i, .8h .endr endfuncl functionl inv_identity_4h_x16_lsx li.w t0, 1697 vreplgr2vr.w vr20, t0 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr8, \ vr9, vr10, vr11, vr12, vr13, vr14, vr15 inv_identity16_lsx \i, vr20, \i, \i, .4h .endr endfuncl functionl inv_identity_8h_x16_lsx1 li.w t0, 1697 vreplgr2vr.w vr20, t0 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr8, \ vr9, vr10, vr11, vr12, vr13, vr14, vr15 inv_identity16_lsx1 \i, \i, vr20, \i .endr endfuncl const iadst16_coeffs_h, align=4 .short 4091, 201, 3973, 995 .short 3703, 1751, 3290, 2440 .short 2751, 3035, 2106, 3513 .short 1380, 3857, 601, 4052 endconst .macro inv_adst16_lsx txfm, sz la.local t0, iadst16_coeffs_h vldrepl.h vr20, t0, 0 // 4091 vldrepl.h vr21, t0, 2 // 201 vmulev_vmaddod_lsx vr15, vr0, vr20, vr21, vr16, vr18, \sz vneg.h vr20, vr20 vmulev_vmaddod_lsx vr15, vr0, vr21, vr20, vr17, vr19, \sz vssrarni.h.w vr18, vr16, 12 // t0 vssrarni.h.w vr19, vr17, 12 // t1 vldrepl.h vr20, t0, 4 // 3973 vldrepl.h vr21, t0, 6 // 995 vmulev_vmaddod_lsx vr13, vr2, vr20, vr21, vr16, vr0, \sz vneg.h vr20, vr20 vmulev_vmaddod_lsx vr13, vr2, vr21, vr20, vr17, vr15, \sz vssrarni.h.w vr0, vr16, 12 // t2 vssrarni.h.w vr15, vr17, 12 // t3 vldrepl.h vr20, t0, 8 // 3703 vldrepl.h vr21, t0, 10 // 1751 vmulev_vmaddod_lsx vr11, vr4, vr20, vr21, vr16, vr2, \sz vneg.h vr20, vr20 vmulev_vmaddod_lsx vr11, vr4, vr21, vr20, vr17, vr13, \sz vssrarni.h.w vr2, vr16, 12 // t4 vssrarni.h.w vr13, vr17, 12 // t5 vldrepl.h vr20, t0, 12 // 3290 -> 1645 vldrepl.h vr21, t0, 14 // 2440 -> 1220 vmulev_vmaddod_lsx vr9, vr6, vr20, vr21, vr16, vr4, \sz vneg.h vr20, vr20 vmulev_vmaddod_lsx vr9, vr6, vr21, vr20, vr17, vr11, \sz vssrarni.h.w vr4, vr16, 12 // t6 vssrarni.h.w vr11, vr17, 12 // t7 vldrepl.h vr20, t0, 16 // 2751 vldrepl.h vr21, t0, 18 // 3035 vmulev_vmaddod_lsx vr7, vr8, vr20, vr21, vr16, vr6, \sz vneg.h vr20, vr20 vmulev_vmaddod_lsx vr7, vr8, vr21, vr20, vr17, vr9, \sz vssrarni.h.w vr6, vr16, 12 // t8 vssrarni.h.w vr9, vr17, 12 // t9 vldrepl.h vr20, t0, 20 // 2106 vldrepl.h vr21, t0, 22 // 3513 vmulev_vmaddod_lsx vr5, vr10, vr20, vr21, vr16, vr7, \sz vneg.h vr20, vr20 vmulev_vmaddod_lsx vr5, vr10, vr21, vr20, vr17, vr8, \sz vssrarni.h.w vr7, vr16, 12 // t10 vssrarni.h.w vr8, vr17, 12 // t11 vldrepl.h vr20, t0, 24 // 1380 vldrepl.h vr21, t0, 26 // 3857 vmulev_vmaddod_lsx vr3, vr12, vr20, vr21, vr16, vr5, \sz vneg.h vr20, vr20 vmulev_vmaddod_lsx vr3, vr12, vr21, vr20, vr17, vr10, \sz vssrarni.h.w vr5, vr16, 12 // t12 vssrarni.h.w vr10, vr17, 12 // t13 vldrepl.h vr20, t0, 28 // 601 vldrepl.h vr21, t0, 30 // 4052 vmulev_vmaddod_lsx vr1, vr14, vr20, vr21, vr16, vr3, \sz vneg.h vr20, vr20 vmulev_vmaddod_lsx vr1, vr14, vr21, vr20, vr17, vr12, \sz vssrarni.h.w vr3, vr16, 12 // t14 vssrarni.h.w vr12, vr17, 12 // t15 vsadd.h vr1, vr18, vr6 // t0a vssub.h vr14, vr18, vr6 // t8a vsadd.h vr16, vr19, vr9 // t1a vssub.h vr17, vr19, vr9 // t9a vsadd.h vr6, vr0, vr7 // t2a vssub.h vr18, vr0, vr7 // t10a vsadd.h vr9, vr15, vr8 // t3a vssub.h vr19, vr15, vr8 // t11a vsadd.h vr0, vr2, vr5 // t4a vssub.h vr7, vr2, vr5 // t12a vsadd.h vr8, vr13, vr10 // t5a vssub.h vr15, vr13, vr10 // t13a vsadd.h vr2, vr4, vr3 // t6a vssub.h vr5, vr4, vr3 // t14a vsadd.h vr10, vr11, vr12 // t7a vssub.h vr13, vr11, vr12 // t15a la.local t0, idct_coeffs_h vldrepl.h vr20, 
t0, 8 // 799 vldrepl.h vr21, t0, 10 // 4017 vmulev_vmaddod_lsx vr14, vr17, vr21, vr20, vr3, vr11, \sz vneg.h vr21, vr21 vmulev_vmaddod_lsx vr14, vr17, vr20, vr21, vr4, vr12, \sz vssrarni.h.w vr11, vr3, 12 // t8 vssrarni.h.w vr12, vr4, 12 // t9 vneg.h vr21, vr21 vmulev_vmaddod_lsx vr15, vr7, vr20, vr21, vr3, vr14, \sz vneg.h vr20, vr20 vmulev_vmaddod_lsx vr15, vr7, vr21, vr20, vr4, vr17, \sz vssrarni.h.w vr14, vr3, 12 // t13 vssrarni.h.w vr17, vr4, 12 // t12 vldrepl.h vr20, t0, 12 // 3406 vldrepl.h vr21, t0, 14 // 2276 vmulev_vmaddod_lsx vr18, vr19, vr21, vr20, vr3, vr7, \sz vneg.h vr21, vr21 vmulev_vmaddod_lsx vr18, vr19, vr20, vr21, vr4, vr15, \sz vssrarni.h.w vr7, vr3, 12 // t10 vssrarni.h.w vr15, vr4, 12 // t11 vneg.h vr21, vr21 vmulev_vmaddod_lsx vr13, vr5, vr20, vr21, vr3, vr18, \sz vneg.h vr20, vr20 vmulev_vmaddod_lsx vr13, vr5, vr21, vr20, vr4, vr19, \sz vssrarni.h.w vr18, vr3, 12 // t15 vssrarni.h.w vr19, vr4, 12 // t14 vsadd.h vr5, vr1, vr0 // t0 vssub.h vr13, vr1, vr0 // t4 vsadd.h vr3, vr16, vr8 // t1 vssub.h vr4, vr16, vr8 // t5 vsadd.h vr0, vr6, vr2 // t2 vssub.h vr1, vr6, vr2 // t6 vsadd.h vr8, vr9, vr10 // t3 vssub.h vr16, vr9, vr10 // t7 vsadd.h vr2, vr11, vr17 // t8a vssub.h vr6, vr11, vr17 // t12a vsadd.h vr9, vr12, vr14 // t9a vssub.h vr10, vr12, vr14 // t13a vsadd.h vr11, vr7, vr19 // t10a vssub.h vr17, vr7, vr19 // t14a vsadd.h vr12, vr15, vr18 // t11a vssub.h vr14, vr15, vr18 // t15a vldrepl.h vr20, t0, 4 // 1567 vldrepl.h vr21, t0, 6 // 3784 vmulev_vmaddod_lsx vr13, vr4, vr21, vr20, vr7, vr18, \sz vneg.h vr21, vr21 vmulev_vmaddod_lsx vr13, vr4, vr20, vr21, vr15, vr19, \sz vssrarni.h.w vr18, vr7, 12 // t4a vssrarni.h.w vr19, vr15, 12 // t5a vneg.h vr21, vr21 vmulev_vmaddod_lsx vr16, vr1, vr20, vr21, vr7, vr4, \sz vneg.h vr20, vr20 vmulev_vmaddod_lsx vr16, vr1, vr21, vr20, vr15, vr13, \sz vssrarni.h.w vr4, vr7, 12 // t7a vssrarni.h.w vr13, vr15, 12 // t6a vneg.h vr20, vr20 vmulev_vmaddod_lsx vr6, vr10, vr21, vr20, vr7, vr1, \sz vneg.h vr21, vr21 vmulev_vmaddod_lsx vr6, vr10, vr20, vr21, vr15, vr16, \sz vssrarni.h.w vr1, vr7, 12 // t12 vssrarni.h.w vr16, vr15, 12 // t13 vneg.h vr21, vr21 vmulev_vmaddod_lsx vr14, vr17, vr20, vr21, vr7, vr6, \sz vneg.h vr20, vr20 vmulev_vmaddod_lsx vr14, vr17, vr21, vr20, vr15, vr10, \sz vssrarni.h.w vr6, vr7, 12 // t15 vssrarni.h.w vr10, vr15, 12 // t14 vssub.h vr17, vr5, vr0 // t2a vsadd.h vr14, vr5, vr0 // out[0] vssub.h vr7, vr3, vr8 // t3a vsadd.h vr15, vr3, vr8 // out[15] vsllwil.w.h vr22, vr15, 0 vexth.w.h vr15, vr15 vneg.w vr22, vr22 vneg.w vr15, vr15 vssrarni.h.w vr15, vr22, 0 // out[15] vsadd.h vr3, vr19, vr4 // out[12] vssub.h vr8, vr19, vr4 // t7 vssub.h vr0, vr18, vr13 // t6 vsadd.h vr5, vr18, vr13 // out[3] vsllwil.w.h vr22, vr5, 0 vexth.w.h vr5, vr5 vneg.w vr22, vr22 vneg.w vr5, vr5 vssrarni.h.w vr5, vr22, 0 // out[3] vsadd.h vr13, vr9, vr12 // out[14] vssub.h vr19, vr9, vr12 // t11 vssub.h vr4, vr2, vr11 // t10 vsadd.h vr18, vr2, vr11 // out[1] vsllwil.w.h vr22, vr18, 0 vexth.w.h vr18, vr18 vneg.w vr22, vr22 vneg.w vr18, vr18 vssrarni.h.w vr18, vr22, 0 // out[1] vsadd.h vr2, vr1, vr10 // out[2] vssub.h vr11, vr1, vr10 // t14a vssub.h vr12, vr16, vr6 // t15a vsadd.h vr9, vr16, vr6 // out[13] vsllwil.w.h vr22, vr9, 0 vexth.w.h vr9, vr9 vneg.w vr22, vr22 vneg.w vr9, vr9 vssrarni.h.w vr9, vr22, 0 // out[13] vldrepl.h vr20, t0, 0 // 2896 vmulev_vmaddod_lsx vr17, vr7, vr20, vr20, vr6, vr10, \sz vneg.h vr21, vr20 vmulev_vmaddod_lsx vr17, vr7, vr20, vr21, vr16, vr1, \sz vssrarni.h.w vr1, vr16, 12 // out[8] vsrari.w vr6, vr6, 12 
vsrari.w vr10, vr10, 12 vneg.w vr6, vr6 vneg.w vr10, vr10 vssrarni.h.w vr10, vr6, 0 // out[7] vmulev_vmaddod_lsx vr0, vr8, vr20, vr21, vr16, vr17, \sz vmulev_vmaddod_lsx vr0, vr8, vr20, vr20, vr6, vr7, \sz vssrarni.h.w vr7, vr6, 12 // out[4] vsrari.w vr16, vr16, 12 vsrari.w vr17, vr17, 12 vneg.w vr16, vr16 vneg.w vr17, vr17 vssrarni.h.w vr17, vr16, 0 // out[11] vmulev_vmaddod_lsx vr4, vr19, vr20, vr21, vr16, vr0, \sz vmulev_vmaddod_lsx vr4, vr19, vr20, vr20, vr6, vr8, \sz vssrarni.h.w vr8, vr6, 12 // out[6] vsrari.w vr16, vr16, 12 vsrari.w vr0, vr0, 12 vneg.w vr16, vr16 vneg.w vr0, vr0 vssrarni.h.w vr0, vr16, 0 // out[9] vmulev_vmaddod_lsx vr11, vr12, vr20, vr20, vr6, vr4, \sz vmulev_vmaddod_lsx vr11, vr12, vr20, vr21, vr16, vr19, \sz vssrarni.h.w vr19, vr16, 12 // out[10] vsrari.w vr6, vr6, 12 vsrari.w vr4, vr4, 12 vneg.w vr6, vr6 vneg.w vr4, vr4 vssrarni.h.w vr4, vr6, 0 // out[5] .ifc \txfm, adst vor.v vr12, vr3, vr3 vor.v vr3, vr5, vr5 vor.v vr5, vr4, vr4 vor.v vr4, vr7, vr7 vor.v vr7, vr10, vr10 vor.v vr10, vr19, vr19 vor.v vr6, vr8, vr8 vor.v vr8, vr1, vr1 vor.v vr11, vr17, vr17 vor.v vr20, vr13, vr13 vor.v vr13, vr9, vr9 vor.v vr9, vr0, vr0 vor.v vr0, vr14, vr14 vor.v vr14, vr20, vr20 vor.v vr1, vr18, vr18 .else vor.v vr6, vr0, vr0 vor.v vr0, vr15, vr15 vor.v vr15, vr14, vr14 vor.v vr14, vr18, vr18 vor.v vr11, vr7, vr7 vor.v vr7, vr1, vr1 vor.v vr1, vr13, vr13 vor.v vr13, vr2, vr2 vor.v vr2, vr9, vr9 vor.v vr9, vr8, vr8 vor.v vr8, vr10, vr10 vor.v vr10, vr4, vr4 vor.v vr4, vr17, vr17 vor.v vr12, vr5, vr5 vor.v vr5, vr19, vr19 .endif .endm // inv_adst16_lsx functionl inv_adst_8h_x16_lsx inv_adst16_lsx adst, 8h endfuncl functionl inv_flipadst_8h_x16_lsx inv_adst16_lsx flipadst, 8h endfuncl functionl inv_adst_4h_x16_lsx inv_adst16_lsx adst, 4h endfuncl functionl inv_flipadst_4h_x16_lsx inv_adst16_lsx flipadst, 4h endfuncl .macro VLD_DST_ADD_W8_x4 in0, in1, in2, in3, in4, in5, in6, in7, in8, \ in9, in10, in11, in12, in13, in14, in15 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 \in0, \in1, \in2, \in3 add.d a0, a1, a0 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 \in4, \in5, \in6, \in7 add.d a0, a1, a0 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 \in8, \in9, \in10, \in11 add.d a0, a1, a0 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 \in12, \in13, \in14, \in15 .endm .macro def_base_8x16 txfm1 functionl inv_txfm_\txfm1\()add_8x16_lsx blt a3, t5, 816f vld_x8 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 vxor.v vr23, vr23, vr23 .irp i, 16, 48, 80, 112, 144, 176, 208, 240 vst vr23, a2, \i .endr li.w t0, 2896 vreplgr2vr.w vr23, t0 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 rect2_lsx \i, vr23, \i .endr .ifc \txfm1, identity_ LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \ vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 .else move t6, ra jirl ra, t7, 0 move ra, t6 vsrari_h_x8 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, 1 LSX_TRANSPOSE8x8_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 .endif 816: ble t5, a3, 816816f .irp i, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 vxor.v \i, \i, \i .endr 816816: vld_x8 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 vxor.v vr23, vr23, vr23 .irp i, 0, 32, 64, 96, 128, 160, 192, 224 vst vr23, a2, \i .endr li.w t0, 2896 vreplgr2vr.w vr23, t0 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 rect2_lsx \i, vr23, \i .endr .ifc \txfm1, identity_ .else move t6, ra jirl ra, t7, 0 move ra, t6 .irp i, vr0, vr1, 
vr2, vr3, vr4, vr5, vr6, vr7 vsrari.h \i, \i, 1 .endr .endif LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 move t6, ra jirl ra, t8, 0 move ra, t6 vor.v vr0, vr0, vr0 vsrari_h_x8 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \ vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, 4 vsrari_h_x8 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, 4 VLD_DST_ADD_W8_x4 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \ vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 endfuncl .endm def_base_8x16 identity_ def_base_8x16 .macro DST_ADD_W16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11 vsllwil.hu.bu vr4, \in0, 0 vexth.hu.bu vr0, \in0 vsllwil.hu.bu vr5, \in1, 0 vexth.hu.bu vr1, \in1 vsllwil.hu.bu vr6, \in2, 0 vexth.hu.bu vr2, \in2 vsllwil.hu.bu vr7, \in3, 0 vexth.hu.bu vr3, \in3 vadd.h vr4, vr4, \in4 vadd.h vr0, vr0, \in5 vadd.h vr5, vr5, \in6 vadd.h vr1, vr1, \in7 vadd.h vr6, vr6, \in8 vadd.h vr2, vr2, \in9 vadd.h vr7, vr7, \in10 vadd.h vr3, vr3, \in11 vssrani.bu.h vr0, vr4, 0 vssrani.bu.h vr1, vr5, 0 vssrani.bu.h vr2, vr6, 0 vssrani.bu.h vr3, vr7, 0 vst vr0, a0, 0 vstx vr1, a0, a1 vst vr2, t2, 0 vstx vr3, t2, a1 .endm .macro VLD_DST_ADD_W16 in0, in1, in2, in3, in4, in5, in6, in7 vld vr0, a0, 0 vldx vr1, a0, a1 vld vr2, t2, 0 vldx vr3, t2, a1 DST_ADD_W16 vr0, vr1, vr2, vr3, \in0, \in1, \in2, \in3, \ \in4, \in5, \in6, \in7 .endm .macro def_fn_16x8 txfm1 functionl inv_txfm_\txfm1\()add_16x8_lsx PUSH_REG vld_x16 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 vxor.v vr23, vr23, vr23 .irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, \ 176, 192, 208, 224, 240 vst vr23, a2, \i .endr li.w t0, 2896 vreplgr2vr.w vr23, t0 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 rect2_lsx \i, vr23, \i .endr move t6, ra jirl ra, t7, 0 move ra, t6 .ifnc \txfm1, identity_ .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 vsrari.h \i, \i, 1 .endr .endif LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 move t6, ra jirl ra, t8, 0 move ra, t6 vsrari_h_x8 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, 4 LSX_TRANSPOSE8x8_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 move t6, ra jirl ra, t8, 0 move ra, t6 vsrari_h_x8 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, 4 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W16 vr24, vr8, vr25, vr9, vr26, vr10, vr27, vr11 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W16 vr28, vr12, vr29, vr13, vr30, vr14, vr31, vr15 POP_REG endfuncl .endm def_fn_16x8 identity_ def_fn_16x8 .macro fun16x8 txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_16x8_8bpc_lsx .ifc \txfm1\()_\txfm2, dct_dct bnez a3, .NO_HAS_DCONLY_16x8 idct_dc 16, 8, 1 DST_ADD_W16 vr10, vr11, vr12, vr13, vr20, vr20, vr20, \ vr20, vr20, vr20, vr20, vr20 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W16 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20, b .\txfm1\()_\txfm2\()_16x8_END .NO_HAS_DCONLY_16x8: .endif la.local t7, inv_\txfm1\()_8h_x16_lsx .ifc \txfm1, identity la.local t7, inv_identity_8h_x16_lsx1 .endif la.local t8, inv_\txfm2\()_8h_x8_lsx .ifc \txfm1, identity 
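    // an identity first transform takes the dedicated identity_ 16x8 entry point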
b inv_txfm_identity_add_16x8_lsx .else b inv_txfm_add_16x8_lsx .endif .\txfm1\()_\txfm2\()_16x8_END: endfunc .endm fun16x8 dct, dct fun16x8 identity, identity fun16x8 dct, adst fun16x8 dct, flipadst fun16x8 dct, identity fun16x8 adst, dct fun16x8 adst, adst fun16x8 adst, flipadst fun16x8 flipadst, dct fun16x8 flipadst, adst fun16x8 flipadst, flipadst fun16x8 identity, dct fun16x8 adst, identity fun16x8 flipadst, identity fun16x8 identity, adst fun16x8 identity, flipadst .macro fun8x16 txfm1, txfm2, eob_half function inv_txfm_add_\txfm1\()_\txfm2\()_8x16_8bpc_lsx .ifc \txfm1\()_\txfm2, dct_dct bnez a3, .NO_HAS_DCONLY_8x16 idct_dc 8, 16, 1 DST_ADD_W8 vr10, vr11, vr12, vr13, vr20, vr20, vr20, vr20 .rept 3 add.d a0, a1, a0 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr20, vr20, vr20, vr20 .endr b .\txfm1\()_\txfm2\()_8x16_END .NO_HAS_DCONLY_8x16: .endif li.w t5, \eob_half .ifnc \txfm1, identity la.local t7, inv_\txfm1\()_8h_x8_lsx .endif la.local t8, inv_\txfm2\()_8h_x16_lsx .ifc \txfm1, identity b inv_txfm_identity_add_8x16_lsx .else b inv_txfm_add_8x16_lsx .endif .\txfm1\()_\txfm2\()_8x16_END: endfunc .endm fun8x16 dct, dct, 43 fun8x16 identity, identity, 43 fun8x16 dct, adst, 43 fun8x16 dct, flipadst, 43 fun8x16 dct, identity, 8 fun8x16 adst, dct, 43 fun8x16 adst, adst, 43 fun8x16 adst, flipadst, 43 fun8x16 flipadst, dct, 43 fun8x16 flipadst, adst, 43 fun8x16 flipadst, flipadst, 43 fun8x16 identity, dct, 64 fun8x16 adst, identity, 8 fun8x16 flipadst, identity, 8 fun8x16 identity, adst, 64 fun8x16 identity, flipadst, 64 functionl inv_txfm_add_16x16_lsx malloc_space 512 addi.d t1, sp, 64 addi.d t2, a2, 0 .rept 2 vld_x16 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 vxor.v vr23, vr23, vr23 .irp i, 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, \ 384, 416, 448, 480 vst vr23, a2, \i .endr move t6, ra jirl ra, t7, 0 move ra, t6 LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 LSX_TRANSPOSE8x8_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \ vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 vsrari.h \i, \i, 2 .endr vst_x8 t1, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 vst_x8 t1, 16, 32, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 addi.d t1, t1, 256 addi.d a2, a2, 16 blt a3, t5, 1616f .endr 1616: ble t5, a3, 16161616f addi.d t1, sp, 320 vxor.v vr23, vr23, vr23 .irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \ 240 vst vr23, t1, \i .endr 16161616: addi.d t1, sp, 64 .rept 2 vld_x16 t1, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 move t6, ra jirl ra, t8, 0 move ra, t6 vst_x16 t1, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 addi.d t1, t1, 16 .endr alsl.d t2, a1, a0, 1 addi.d t1, sp, 64 .rept 4 vld_x8 t1, 0, 16, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 vsrari_h_x8 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, \ vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 4 VLD_DST_ADD_W16 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 addi.d t1, t1, 128 .endr free_space 512 endfuncl .macro fun16x16 txfm1, txfm2, eob_half function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_lsx .ifc \txfm1\()_\txfm2, dct_dct bnez a3, .NO_HAS_DCONLY_16x16 idct_dc 16, 16, 
2 DST_ADD_W16 vr10, vr11, vr12, vr13, vr20, vr20, vr20, \ vr20, vr20, vr20, vr20, vr20 .rept 3 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W16 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20 .endr b .\txfm1\()_\txfm2\()_16x16_END .NO_HAS_DCONLY_16x16: .endif li.w t5, \eob_half la.local t7, inv_\txfm1\()_8h_x16_lsx la.local t8, inv_\txfm2\()_8h_x16_lsx b inv_txfm_add_16x16_lsx .\txfm1\()_\txfm2\()_16x16_END: endfunc .endm fun16x16 dct, dct, 36 fun16x16 adst, adst, 36 fun16x16 adst, dct, 36 fun16x16 dct, adst, 36 fun16x16 flipadst, dct, 36 fun16x16 dct, flipadst, 36 fun16x16 adst, flipadst, 36 fun16x16 flipadst, adst, 36 .macro dct_8x32_core_lsx in1, in2, vld_st0, vld_st1, vld_stride, \ vst_st0, vst_st1, vst_st2, vst_st3, vst_stride, \ transpose8x8, shift la.local t0, idct_coeffs vldrepl.w vr20, t0, 64 // 201 vldrepl.w vr21, t0, 68 // 4091 vmul_vmadd_w vr0, vr30, vr21, vr20, vr8, vr9 vmul_vmsub_w vr0, vr30, vr20, vr21, vr11, vr10 vssrarni.h.w vr9, vr8, 12 // t31a vssrarni.h.w vr10, vr11, 12 // t16a vldrepl.w vr20, t0, 72 // 3035 vldrepl.w vr21, t0, 76 // 2751 vmul_vmadd_w vr19, vr7, vr21, vr20, vr8, vr0 vmul_vmsub_w vr19, vr7, vr20, vr21, vr11, vr30 vssrarni.h.w vr0, vr8, 12 // t30a vssrarni.h.w vr30, vr11, 12 // t17a vldrepl.w vr20, t0, 80 // 1751 vldrepl.w vr21, t0, 84 // 3703 vmul_vmadd_w vr4, vr26, vr21, vr20, vr8, vr7 vmul_vmsub_w vr4, vr26, vr20, vr21, vr11, vr19 vssrarni.h.w vr7, vr8, 12 // t29a vssrarni.h.w vr19, vr11, 12 // t18a vldrepl.w vr20, t0, 88 // 3857 vldrepl.w vr21, t0, 92 // 1380 vmul_vmadd_w vr27, vr3, vr21, vr20, vr8, vr4 vmul_vmsub_w vr27, vr3, vr20, vr21, vr11, vr26 vssrarni.h.w vr4, vr8, 12 // t28a vssrarni.h.w vr26, vr11, 12 // t19a vldrepl.w vr20, t0, 96 // 995 vldrepl.w vr21, t0, 100 // 3973 vmul_vmadd_w vr2, vr28, vr21, vr20, vr8, vr3 vmul_vmsub_w vr2, vr28, vr20, vr21, vr11, vr27 vssrarni.h.w vr3, vr8, 12 // t27a vssrarni.h.w vr27, vr11, 12 // t20a vldrepl.w vr20, t0, 104 // 3513 vldrepl.w vr21, t0, 108 // 2106 vmul_vmadd_w vr25, vr5, vr21, vr20, vr8, vr2 vmul_vmsub_w vr25, vr5, vr20, vr21, vr11, vr28 vssrarni.h.w vr2, vr8, 12 // t26a vssrarni.h.w vr28, vr11, 12 // t21a vldrepl.w vr20, t0, 112 // 2440 -> 1220 vldrepl.w vr21, t0, 116 // 3290 -> 1645 vmul_vmadd_w vr6, vr24, vr21, vr20, vr8, vr5 vmul_vmsub_w vr6, vr24, vr20, vr21, vr11, vr25 vssrarni.h.w vr5, vr8, 12 // t25a vssrarni.h.w vr25, vr11, 12 // t22a vldrepl.w vr20, t0, 120 // 4052 vldrepl.w vr21, t0, 124 // 601 vmul_vmadd_w vr29, vr1, vr21, vr20, vr8, vr6 vmul_vmsub_w vr29, vr1, vr20, vr21, vr11, vr24 vssrarni.h.w vr6, vr8, 12 // t24a vssrarni.h.w vr24, vr11, 12 // t23a vsadd.h vr1, vr10, vr30 // t16 vssub.h vr29, vr10, vr30 // t17 vssub.h vr8, vr26, vr19 // t18 vsadd.h vr31, vr26, vr19 // t19 vsadd.h vr10, vr27, vr28 // t20 vssub.h vr30, vr27, vr28 // t21 vssub.h vr19, vr24, vr25 // t22 vsadd.h vr26, vr24, vr25 // t23 vsadd.h vr27, vr6, vr5 // t24 vssub.h vr28, vr6, vr5 // t25 vssub.h vr24, vr3, vr2 // t26 vsadd.h vr25, vr3, vr2 // t27 vsadd.h vr5, vr4, vr7 // t28 vssub.h vr6, vr4, vr7 // t29 vssub.h vr2, vr9, vr0 // t30 vsadd.h vr3, vr9, vr0 // t31 vldrepl.w vr20, t0, 16 // 799 vldrepl.w vr21, t0, 20 // 4017 vmul_vmadd_w vr2, vr29, vr21, vr20, vr4, vr7 vmul_vmsub_w vr2, vr29, vr20, vr21, vr11, vr0 vssrarni.h.w vr7, vr4, 12 // t30a vssrarni.h.w vr0, vr11, 12 // t17a vmul_vmadd_w vr6, vr8, vr21, vr20, vr4, vr9 vneg.w vr4, vr4 vneg.w vr9, vr9 vmul_vmsub_w vr6, vr8, vr20, vr21, vr11, vr2 vssrarni.h.w vr9, vr4, 12 // t18a vssrarni.h.w vr2, vr11, 12 // t29a vldrepl.w vr20, t0, 24 // 3406 -> 1703 
vldrepl.w vr21, t0, 28 // 2276 -> 1138 vmul_vmadd_w vr24, vr30, vr21, vr20, vr4, vr29 vmul_vmsub_w vr24, vr30, vr20, vr21, vr11, vr6 vssrarni.h.w vr29, vr4, 12 // t26a vssrarni.h.w vr6, vr11, 12 // t21a vmul_vmadd_w vr28, vr19, vr21, vr20, vr4, vr8 vneg.w vr4, vr4 vneg.w vr8, vr8 vmul_vmsub_w vr28, vr19, vr20, vr21, vr11, vr24 vssrarni.h.w vr8, vr4, 12 // t22a vssrarni.h.w vr24, vr11, 12 // t25a vsadd.h vr4, vr1, vr31 // t16a vssub.h vr30, vr1, vr31 // t19a vsadd.h vr19, vr0, vr9 // t17 vssub.h vr28, vr0, vr9 // t18 vssub.h vr1, vr26, vr10 // t20a vsadd.h vr31, vr26, vr10 // t23a vssub.h vr0, vr8, vr6 // t21 vsadd.h vr9, vr8, vr6 // t22 vsadd.h vr10, vr27, vr25 // t24a vssub.h vr26, vr27, vr25 // t27a vsadd.h vr6, vr24, vr29 // t25 vssub.h vr8, vr24, vr29 // t26 vssub.h vr25, vr3, vr5 // t28a vsadd.h vr27, vr3, vr5 // t31a vssub.h vr24, vr7, vr2 // t29 vsadd.h vr29, vr7, vr2 // t30 vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vmul_vmadd_w vr24, vr28, vr21, vr20, vr3, vr5 vmul_vmsub_w vr24, vr28, vr20, vr21, vr11, vr2 vssrarni.h.w vr5, vr3, 12 // t29a vssrarni.h.w vr2, vr11, 12 // 18a vmul_vmadd_w vr25, vr30, vr21, vr20, vr3, vr7 vmul_vmsub_w vr25, vr30, vr20, vr21, vr11, vr24 vssrarni.h.w vr7, vr3, 12 // t28 vssrarni.h.w vr24, vr11, 12 // t19 vmul_vmadd_w vr26, vr1, vr21, vr20, vr3, vr28 vneg.w vr3, vr3 vneg.w vr28, vr28 vmul_vmsub_w vr26, vr1, vr20, vr21, vr11, vr25 vssrarni.h.w vr28, vr3, 12 // t20 vssrarni.h.w vr25, vr11, 12 // t27 vmul_vmadd_w vr8, vr0, vr21, vr20, vr3, vr30 vneg.w vr3, vr3 vneg.w vr30, vr30 vmul_vmsub_w vr8, vr0, vr20, vr21, vr11, vr1 vssrarni.h.w vr30, vr3, 12 // t21a vssrarni.h.w vr1, vr11, 12 // t26a vsadd.h vr3, vr4, vr31 // t16 vssub.h vr26, vr4, vr31 // t23 vsadd.h vr0, vr19, vr9 // t17a vssub.h vr8, vr19, vr9 // t22a vsadd.h vr4, vr2, vr30 // t18 vssub.h vr31, vr2, vr30 // t21 vsadd.h vr9, vr24, vr28 // t19a vssub.h vr19, vr24, vr28 // t20a vssub.h vr2, vr27, vr10 // t24 vsadd.h vr30, vr27, vr10 // t31 vssub.h vr24, vr29, vr6 // t25a vsadd.h vr28, vr29, vr6 // t30a vssub.h vr10, vr5, vr1 // t26 vsadd.h vr27, vr5, vr1 // t29 vssub.h vr6, vr7, vr25 // t27a vsadd.h vr29, vr7, vr25 // t28a vldrepl.w vr20, t0, 0 // 2896 vmul_vmsub_w vr6, vr19, vr20, vr20, vr1, vr5 vmul_vmadd_w vr6, vr19, vr20, vr20, vr11, vr7 vssrarni.h.w vr5, vr1, 12 // t20 vssrarni.h.w vr7, vr11, 12 // t27 vmul_vmsub_w vr10, vr31, vr20, vr20, vr1, vr25 vmul_vmadd_w vr10, vr31, vr20, vr20, vr11, vr6 vssrarni.h.w vr25, vr1, 12 // t21a vssrarni.h.w vr6, vr11, 12 // t26a vmul_vmsub_w vr24, vr8, vr20, vr20, vr1, vr19 vmul_vmadd_w vr24, vr8, vr20, vr20, vr11, vr10 vssrarni.h.w vr19, vr1, 12 // t22 vssrarni.h.w vr10, vr11, 12 // t25 vmul_vmsub_w vr2, vr26, vr20, vr20, vr1, vr31 vmul_vmadd_w vr2, vr26, vr20, vr20, vr11, vr8 vssrarni.h.w vr31, vr1, 12 // t23a vssrarni.h.w vr8, vr11, 12 // t24a // t31 t30a t29 t28a t27 t26a t25 t24a t23a t22 t21a t20 t19a t18 t17a t16 // vr30 vr28 vr27 vr29 vr7 vr6 vr10 vr8 vr31 vr19 vr25 vr5 vr9 vr4 vr0 vr3 vld_x8 \in2, \vld_st0, \vld_stride, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 vsadd.h vr1, vr11, vr30 // c[0] vssub.h vr2, vr11, vr30 // c[31] vsadd.h vr24, vr12, vr28 // c[1] vssub.h vr26, vr12, vr28 // c[30] vsadd.h vr11, vr13, vr27 // c[2] vssub.h vr30, vr13, vr27 // c[29] vsadd.h vr12, vr14, vr29 // c[3] vssub.h vr28, vr14, vr29 // c[28] vsadd.h vr13, vr15, vr7 // c[4] vssub.h vr27, vr15, vr7 // c[27] vsadd.h vr14, vr16, vr6 // c[5] vssub.h vr29, vr16, vr6 // c[26] vsadd.h vr7, vr17, vr10 // c[6] vssub.h vr15, vr17, vr10 // c[25] vsadd.h vr6, 
vr18, vr8 // c[7] vssub.h vr16, vr18, vr8 // c[24] .ifnb \transpose8x8 LSX_TRANSPOSE8x8_H vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \ vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \ vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23 .endif .ifnb \shift .irp i, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 vsrari.h \i, \i, \shift .endr .endif vst_x8 \in1, \vst_st0, \vst_stride, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 .ifnb \transpose8x8 LSX_TRANSPOSE8x8_H vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \ vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \ vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23 .endif .ifnb \shift .irp i, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 vsrari.h \i, \i, \shift .endr .endif vst_x8 \in1, \vst_st1, \vst_stride, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 vld_x8 \in2, \vld_st1, \vld_stride, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 vsadd.h vr1, vr11, vr31 // c[8] vssub.h vr2, vr11, vr31 // c[23] vsadd.h vr24, vr12, vr19 // c[9] vssub.h vr26, vr12, vr19 // c[22] vsadd.h vr11, vr13, vr25 // c[10] vssub.h vr30, vr13, vr25 // c[21] vsadd.h vr12, vr14, vr5 // c[11] vssub.h vr28, vr14, vr5 // c[20] vsadd.h vr13, vr15, vr9 // c[12] vssub.h vr27, vr15, vr9 // c[19] vsadd.h vr14, vr16, vr4 // c[13] vssub.h vr29, vr16, vr4 // c[18] vsadd.h vr7, vr17, vr0 // c[14] vssub.h vr15, vr17, vr0 // c[17] vsadd.h vr6, vr18, vr3 // c[15] vssub.h vr16, vr18, vr3 // c[16] .ifnb \transpose8x8 LSX_TRANSPOSE8x8_H vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \ vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \ vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23 .endif .ifnb \shift .irp i, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 vsrari.h \i, \i, \shift .endr .endif vst_x8 \in1, \vst_st2, \vst_stride, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 .ifnb \transpose8x8 LSX_TRANSPOSE8x8_H vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \ vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \ vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23 .endif .ifnb \shift .irp i, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 vsrari.h \i, \i, \shift .endr .endif vst_x8 \in1, \vst_st3, \vst_stride, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 .endm const eob_32x32 .short 36, 136, 300, 1024 endconst const eob_8x32 .short 43, 107, 171, 256 endconst const eob_16x32 .short 36, 151, 279, 512 endconst .macro DST_ADD_W32 in0, in1, in2, in3, in4, in5, in6, in7 vsllwil.hu.bu vr4, vr10, 0 vsllwil.hu.bu vr5, vr11, 0 vsllwil.hu.bu vr6, vr12, 0 vsllwil.hu.bu vr7, vr13, 0 vexth.hu.bu vr10, vr10 vexth.hu.bu vr11, vr11 vexth.hu.bu vr12, vr12 vexth.hu.bu vr13, vr13 vadd.h vr4, vr4, \in0 vadd.h vr10, vr10, \in1 vadd.h vr5, vr5, \in2 vadd.h vr11, vr11, \in3 vadd.h vr6, vr6, \in4 vadd.h vr12, vr12, \in5 vadd.h vr7, vr7, \in6 vadd.h vr13, vr13, \in7 vssrani.bu.h vr10, vr4, 0 vssrani.bu.h vr11, vr5, 0 vssrani.bu.h vr12, vr6, 0 vssrani.bu.h vr13, vr7, 0 vst vr10, a0, 0 vst vr11, a0, 16 vst vr12, t2, 0 vst vr13, t2, 16 .endm .macro idct_dc_w32 w, h, shift ld.h t2, a2, 0 // dc vldi vr0, 0x8b5 // 181 vreplgr2vr.w vr1, t2 vldi vr20, 0x880 // 128 vmul.w vr2, vr0, vr1 // dc * 181 st.h zero, a2, 0 add.d t2, a0, a1 vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8 vld vr13, t2, 16 .if (2*\w == \h) || (2*\h == \w) vmul.w vr2, vr2, vr0 vsrari.w vr2, vr2, 8 .endif .if \shift>0 vsrari.w vr2, vr2, \shift // (dc + rnd) >> shift .endif vld vr11, a0, 16 vmadd.w vr20, vr2, vr0 vld vr12, t2, 0 vssrarni.h.w vr20, vr20, 12 vld vr10, a0, 0 .endm function inv_txfm_add_dct_dct_32x8_8bpc_lsx bnez a3, .NO_HAS_DCONLY_32x8 idct_dc_w32 32, 8, 2 DST_ADD_W32 vr20, vr20, vr20, 
vr20, vr20, vr20, vr20, vr20 .rept 3 alsl.d a0, a1, a0, 1 add.d t2, a0, a1 vld vr10, a0, 0 vld vr11, a0, 16 vld vr12, t2, 0 vld vr13, t2, 16 DST_ADD_W32 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20 .endr b .DCT_DCT_32X8_END .NO_HAS_DCONLY_32x8: malloc_space 512+256 addi.d t1, sp, 64 addi.d t2, a2, 0 addi.d t3, sp, 64 addi.d t3, t3, 512 vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 vxor.v vr31, vr31, vr31 vst_x16 t2, 0, 32, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 inv_dct16_lsx .8h vst_x16 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 vld_x16 t2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 vxor.v vr31, vr31, vr31 vst_x16 t2, 16, 32, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 dct_8x32_core_lsx t1, t3, 0, 128, 16, 0, 48, 16, 32, 64, transpose8x8, 2 addi.d t2, sp, 64 .rept 4 vld_x8 t2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 inv_dct8_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, .8h .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 vsrari.h \i, \i, 4 .endr vst_x8 t2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 addi.d t2, t2, 16 .endr addi.d t0, sp, 64 .rept 4 add.d t2, a0, a1 vld vr10, a0, 0 vld vr11, a0, 16 vld vr12, t2, 0 vld vr13, t2, 16 vld_x8 t0, 0, 16, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 DST_ADD_W32 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 alsl.d a0, a1, a0, 1 addi.d t0, t0, 128 .endr free_space 512+256 .DCT_DCT_32X8_END: endfunc function inv_txfm_add_dct_dct_32x16_8bpc_lsx bnez a3, .NO_HAS_DCONLY_32x16 idct_dc_w32 32, 16, 1 DST_ADD_W32 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20 .rept 7 alsl.d a0, a1, a0, 1 add.d t2, a0, a1 vld vr10, a0, 0 vld vr11, a0, 16 vld vr12, t2, 0 vld vr13, t2, 16 DST_ADD_W32 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20 .endr b .DCT_DCT_32X16_END .NO_HAS_DCONLY_32x16: malloc_space 1024+256 // 32*32*2+512 addi.d t1, sp, 64 addi.d t2, a2, 0 addi.d t3, sp, 64 addi.d t3, t3, 1024 .rept 2 vld_x16 t2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 vxor.v vr31, vr31, vr31 vst_x16 t2, 0, 64, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 li.w t0, 2896 vreplgr2vr.w vr23, t0 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 rect2_lsx \i, vr23, \i .endr inv_dct16_lsx .8h vst_x16 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 vld_x16 t2, 32, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 la.local t0, idct_coeffs vldrepl.w vr23, t0, 0 // 2896 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 rect2_lsx \i, vr23, \i .endr vxor.v vr31, vr31, vr31 vst_x16 t2, 32, 64, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 dct_8x32_core_lsx t1, t3, 0, 128, 16, 0, 48, 16, 32, 64, transpose8x8, 1 addi.d t2, t2, 16 addi.d t1, t1, 512 .endr addi.d t2, sp, 64 .rept 4 vld_x16 t2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 inv_dct16_lsx .8h .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 vsrari.h \i, \i, 4 .endr vst_x16 t2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 
vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 addi.d t2, t2, 16 .endr addi.d t0, sp, 64 .rept 8 add.d t2, a0, a1 vld vr10, a0, 0 vld vr11, a0, 16 vld vr12, t2, 0 vld vr13, t2, 16 vld_x8 t0, 0, 16, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 DST_ADD_W32 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 alsl.d a0, a1, a0, 1 addi.d t0, t0, 128 .endr free_space 1024+256 .DCT_DCT_32X16_END: endfunc function inv_txfm_add_dct_dct_32x32_8bpc_lsx bnez a3, .NO_HAS_DCONLY_32x32 idct_dc_w32 32, 32, 2 DST_ADD_W32 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20 .rept 15 alsl.d a0, a1, a0, 1 add.d t2, a0, a1 vld vr10, a0, 0 vld vr11, a0, 16 vld vr12, t2, 0 vld vr13, t2, 16 DST_ADD_W32 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20 .endr b .DCT_DCT_32X32_END .NO_HAS_DCONLY_32x32: malloc_space 2560 // 32*32*2+512 addi.d t1, sp, 64 addi.d t2, a2, 0 addi.d t3, sp, 1024 addi.d t3, t3, 1024 addi.d t3, t3, 64 la.local t8, eob_32x32 .DCT_DCT_EOB_32x32: ld.h t7, t8, 0 addi.d t8, t8, 2 vld_x16 t2, 0, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 vxor.v vr31, vr31, vr31 vst_x16 t2, 0, 128, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 inv_dct16_lsx .8h vst_x16 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 vld_x16 t2, 64, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 vxor.v vr31, vr31, vr31 vst_x16 t2, 64, 128, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 dct_8x32_core_lsx t1, t3, 0, 128, 16, 0, 48, 16, 32, 64, transpose8x8, 2 addi.d t2, t2, 16 addi.d t1, t1, 512 bge a3, t7, .DCT_DCT_EOB_32x32 la.local t8, eob_32x32 vxor.v vr31, vr31, vr31 ld.h t7, t8, 4 bge a3, t7, .DCT_DCT_EOB_32x32_END // a3>=t7 vst_x16 sp, 64+1536, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 addi.d t1, sp, 256+64 vst_x16 t1, 1536, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 ld.h t7, t8, 2 bge a3, t7, .DCT_DCT_EOB_32x32_END vst_x16 sp, 64+1024, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 vst_x16 t1, 1024, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 ld.h t7, t8, 0 bge a3, t7, .DCT_DCT_EOB_32x32_END vst_x16 sp, 64+512, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 vst_x16 t1, 512, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 .DCT_DCT_EOB_32x32_END: addi.d t2, sp, 64 addi.d t1, sp, 64 .rept 4 vld_x16 t2, 0, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 inv_dct16_lsx .8h vst_x16 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 vld_x16 t2, 64, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 dct_8x32_core_lsx t1, t3, 0, 128, 16, 0, 1536, 512, 1024, 64, , 4 addi.d t2, t2, 16 addi.d t1, t1, 16 .endr addi.d t0, sp, 64 .rept 16 add.d t2, a0, a1 vld vr10, a0, 0 vld vr11, a0, 16 vld vr12, t2, 0 vld vr13, t2, 16 vld_x8 t0, 0, 16, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 DST_ADD_W32 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 alsl.d a0, a1, a0, 1 addi.d t0, t0, 128 .endr free_space 2560 // 32*32*2+512 .DCT_DCT_32X32_END: endfunc /* * 
temp: vr8, vr9, vr10, vr12, vr20, vr21, vr22, vr23 */ .macro dct_8x8_tx64_core_lsx in0, in1, in2, in3, in4, in5, in6, in7, out0, \ out1, out2, out3, out4, out5, out6, out7, rect2 la.local t0, idct_coeffs .ifc \rect2, rect2_lsx vldrepl.w vr23, t0, 0 // 2896 .irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7 rect2_lsx \i, vr23, \i .endr .endif la.local t0, idct_coeffs vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vsllwil.w.h vr22, \in2, 0 vexth.w.h vr23, \in2 vmul.w vr8, vr22, vr20 vmul.w vr10, vr23, vr20 vmul.w \in2, vr22, vr21 vmul.w vr9, vr23, vr21 vssrarni.h.w vr10, vr8, 12 // t2 vssrarni.h.w vr9, \in2, 12 // t3 vldrepl.w vr20, t0, 0 // 2896 vsllwil.w.h vr22, \in0, 0 vexth.w.h vr23, \in0 vmul.w vr8, vr22, vr20 vmul.w \in2, vr23, vr20 vssrarni.h.w \in2, vr8, 12 vsadd.h vr8, \in2, vr9 // c[0] vssub.h vr9, \in2, vr9 // c[3] vsadd.h \in0, \in2, vr10 // c[1] vssub.h vr10, \in2, vr10 // c[2] // inv_dct8_1d_internal_c tx64 // in1 in3 vldrepl.w vr20, t0, 16 // 799 vldrepl.w vr21, t0, 20 // 4017 vsllwil.w.h vr22, \in1, 0 vexth.w.h vr23, \in1 vmul.w \in2, vr22, vr21 vmul.w \in4, vr23, vr21 vmul.w \in1, vr22, vr20 vmul.w \in6, vr23, vr20 vssrarni.h.w \in4, \in2, 12 // t7a vssrarni.h.w \in6, \in1, 12 // t4a vldrepl.w vr20, t0, 24 // 3406 vldrepl.w vr21, t0, 28 // 2276 vsllwil.w.h vr22, \in3, 0 vexth.w.h vr23, \in3 vneg.w vr21, vr21 vmul.w \in2, vr22, vr20 vmul.w \in1, vr23, vr20 vmul.w \in3, vr22, vr21 vmul.w \in7, vr23, vr21 vssrarni.h.w \in1, \in2, 12 // t6a vssrarni.h.w \in7, \in3, 12 // t5a vsadd.h \in3, \in6, \in7 // t4 vssub.h \in6, \in6, \in7 // t5a vsadd.h \in5, \in4, \in1 // t7 vssub.h \in4, \in4, \in1 // t6a vldrepl.w vr20, t0, 0 // 2896 vmul_vmadd_w \in4, \in6, vr20, vr20, vr21, \in1 vmul_vmsub_w \in4, \in6, vr20, vr20, \in2, \in7 vssrarni.h.w \in1, vr21, 12 // t6 vssrarni.h.w \in7, \in2, 12 // t5 vsadd.h \out0, vr8, \in5 // c[0] vssub.h \out7, vr8, \in5 // c[7] vsadd.h \out1, \in0, \in1 // c[1] vssub.h \out6, \in0, \in1 // c[6] vsadd.h \out2, vr10, \in7 // c[2] vssub.h \out5, vr10, \in7 // c[5] vsadd.h \out3, vr9, \in3 // c[3] vssub.h \out4, vr9, \in3 // c[4] .endm /* * input: in0, in1, in2, in3, in4, in5, in6, in7 (fixed) * vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 * in8, in9, in10, in11, in12, in13, in14, in15 * vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 * output: out0, out1, out2, out3, out4, out5, out6, out7 (fixed) * vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16 * out8, out9, out10, out11, out12, out13, out14, out15 * vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 */ .macro dct_8x16_tx64_core_lsx rect2 dct_8x8_tx64_core_lsx vr0, vr2, vr4, vr6, vr19, vr25, vr27, vr29, vr11, \ vr12, vr13, vr14, vr15, vr16, vr17, vr18, \rect2 // in1 in3 in5 in7 in9 in11 in13 in15 // vr1 vr3 vr5 vr7 vr24 vr26 vr28 vr30 la.local t0, idct_coeffs .ifc \rect2, rect2_lsx vldrepl.w vr23, t0, 0 // 2896 .irp i, vr1, vr3, vr5, vr7, vr24, vr26, vr28, vr30 rect2_lsx \i, vr23, \i .endr .endif vldrepl.w vr20, t0, 32 // 401 vldrepl.w vr21, t0, 36 // 4076 vsllwil.w.h vr22, vr1, 0 vexth.w.h vr23, vr1 vmul.w vr0, vr22, vr21 vmul.w vr10, vr23, vr21 vmul.w vr1, vr22, vr20 vmul.w vr29, vr23, vr20 vssrarni.h.w vr10, vr0, 12 // t15a vssrarni.h.w vr29, vr1, 12 // t8a vldrepl.w vr20, t0, 40 // 3166 -> 1583 vldrepl.w vr21, t0, 44 // 2598 -> 1299 vsllwil.w.h vr22, vr7, 0 vexth.w.h vr23, vr7 vneg.w vr21, vr21 vmul.w vr0, vr22, vr20 vmul.w vr30, vr23, vr20 vmul.w vr7, vr22, vr21 vmul.w vr31, vr23, vr21 vssrarni.h.w vr30, vr0, 12 // t14a vssrarni.h.w vr31, vr7, 12 // t9a vldrepl.w vr20, t0, 48 // 1931 vldrepl.w 
vr21, t0, 52 // 3612 vsllwil.w.h vr22, vr5, 0 vexth.w.h vr23, vr5 vmul.w vr0, vr22, vr21 vmul.w vr24, vr23, vr21 vmul.w vr5, vr22, vr20 vmul.w vr25, vr23, vr20 vssrarni.h.w vr24, vr0, 12 // t13a vssrarni.h.w vr25, vr5, 12 // t10a vldrepl.w vr20, t0, 56 // 3920 vldrepl.w vr21, t0, 60 // 1189 vsllwil.w.h vr22, vr3, 0 vexth.w.h vr23, vr3 vneg.w vr21, vr21 vmul.w vr0, vr22, vr20 vmul.w vr26, vr23, vr20 vmul.w vr3, vr22, vr21 vmul.w vr27, vr23, vr21 vssrarni.h.w vr26, vr0, 12 // t12a vssrarni.h.w vr27, vr3, 12 // t11a // vr22 vr23 vr30 vr31 vr24 vr25 vr26 vr27 vsadd.h vr28, vr29, vr31 // t8 vssub.h vr19, vr29, vr31 // t9 vssub.h vr29, vr27, vr25 // t10 vsadd.h vr9, vr27, vr25 // t11 vsadd.h vr31, vr26, vr24 // t12 vssub.h vr25, vr26, vr24 // t13 vssub.h vr27, vr10, vr30 // t14 vsadd.h vr24, vr10, vr30 // t15 vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vmul_vmadd_w vr27, vr19, vr21, vr20, vr0, vr26 vmul_vmsub_w vr27, vr19, vr20, vr21, vr1, vr30 vssrarni.h.w vr26, vr0, 12 // t14a vssrarni.h.w vr30, vr1, 12 // t9a vmul_vmadd_w vr25, vr29, vr21, vr20, vr0, vr19 vneg.w vr0, vr0 vneg.w vr19, vr19 vmul_vmsub_w vr25, vr29, vr20, vr21, vr1, vr27 vssrarni.h.w vr19, vr0, 12 // t10a vssrarni.h.w vr27, vr1, 12 // t13a vsadd.h vr25, vr28, vr9 // t8a vssub.h vr29, vr28, vr9 // t11a vssub.h vr28, vr24, vr31 // t12a vsadd.h vr10, vr24, vr31 // t15a vsadd.h vr9, vr30, vr19 // t9 vssub.h vr31, vr30, vr19 // t10 vssub.h vr30, vr26, vr27 // t13 vsadd.h vr24, vr26, vr27 // t14 vldrepl.w vr20, t0, 0 // 2896 vmul_vmadd_w vr30, vr31, vr20, vr20, vr0, vr26 vmul_vmsub_w vr30, vr31, vr20, vr20, vr1, vr27 vssrarni.h.w vr26, vr0, 12 // t13a vssrarni.h.w vr27, vr1, 12 // t10a vmul_vmadd_w vr28, vr29, vr20, vr20, vr0, vr31 vmul_vmsub_w vr28, vr29, vr20, vr20, vr1, vr30 vssrarni.h.w vr31, vr0, 12 // t12 vssrarni.h.w vr30, vr1, 12 // t11 // vr11 vr12 ... 
vr18 vsadd.h vr28, vr14, vr31 // c[3] vssub.h vr29, vr14, vr31 // c[12] vsadd.h vr20, vr15, vr30 // c[4] vssub.h vr21, vr15, vr30 // c[11] vsadd.h vr14, vr16, vr27 // c[5] vssub.h vr23, vr16, vr27 // c[10] vsadd.h vr15, vr17, vr9 // c[6] vssub.h vr30, vr17, vr9 // c[9] vsadd.h vr16, vr18, vr25 // c[7] vssub.h vr27, vr18, vr25 // c[8] vsadd.h vr17, vr13, vr26 // c[2] vssub.h vr26, vr13, vr26 // c[13] vsadd.h vr18, vr12, vr24 // c[1] vssub.h vr25, vr12, vr24 // c[14] vsadd.h vr22, vr11, vr10 // c[0] vssub.h vr24, vr11, vr10 // c[15] .endm // dct_8x16_tx64_core_lsx .macro vmul_vssrarni_hw in0, in1, in2, tmp0, tmp1, out0, out1 vsllwil.w.h vr22, \in0, 0 vexth.w.h vr23, \in0 vmul.w \tmp0, vr22, \in1 vmul.w \out0, vr23, \in1 vmul.w \tmp1, vr22, \in2 vmul.w \out1, vr23, \in2 vssrarni.h.w \out0, \tmp0, 12 vssrarni.h.w \out1, \tmp1, 12 .endm const idct64_coeffs, align=4 .word 101, 4095, 2967, -2824 .word 1660, 3745, 3822, -1474 .word 4076, 401, 4017, 799 .word 4036, -700, 2359, 3349 .word 3461, -2191, 897, 3996 .word -3166, -2598, -799, -4017 .word 501, 4065, 3229, -2520 .word 2019, 3564, 3948, -1092 .word 3612, 1931, 2276, 3406 .word 4085, -301, 2675, 3102 .word 3659, -1842, 1285, 3889 .word -3920, -1189, -3406, -2276 endconst .macro dct64_step1_lsx vldrepl.w vr20, t0, 0 // 101 vldrepl.w vr21, t0, 4 // 4095 vmul_vssrarni_hw vr0, vr20, vr21, vr16, vr0, vr8, vr9 // vr8 t32a vr9 t63a vldrepl.w vr20, t0, 8 // 2967 vldrepl.w vr21, t0, 12 // -2824 vmul_vssrarni_hw vr1, vr20, vr21, vr16, vr1, vr10, vr11 // vr10 t62a vr11 t33a vldrepl.w vr20, t0, 16 // 1660 vldrepl.w vr21, t0, 20 // 3745 vmul_vssrarni_hw vr2, vr20, vr21, vr16, vr2, vr12, vr13 // vr12 t34a vr13 t61a vldrepl.w vr20, t0, 24 // 3822 vldrepl.w vr21, t0, 28 // -1474 vmul_vssrarni_hw vr3, vr20, vr21, vr16, vr3, vr14, vr15 // vr14 t60a vr15 t35a vsadd.h vr0, vr8, vr11 // t32 vssub.h vr1, vr8, vr11 // t33 vssub.h vr2, vr15, vr12 // t34 vsadd.h vr3, vr15, vr12 // t35 vsadd.h vr4, vr14, vr13 // t60 vssub.h vr5, vr14, vr13 // t61 vssub.h vr6, vr9, vr10 // t62 vsadd.h vr7, vr9, vr10 // t63 vldrepl.w vr20, t0, 32 // 4076 vldrepl.w vr21, t0, 36 // 401 vmul_vmadd_w vr6, vr1, vr20, vr21, vr9, vr10 vmul_vmsub_w vr6, vr1, vr21, vr20, vr13, vr11 vssrarni.h.w vr10, vr9, 12 // t62a vssrarni.h.w vr11, vr13, 12 // t33a vmul_vmadd_w vr5, vr2, vr20, vr21, vr9, vr1 vmul_vmsub_w vr5, vr2, vr21, vr20, vr13, vr6 vneg.w vr9, vr9 vneg.w vr1, vr1 vssrarni.h.w vr6, vr13, 12 // t61a vssrarni.h.w vr1, vr9, 12 // t34a vsadd.h vr2, vr0, vr3 // t32a vssub.h vr5, vr0, vr3 // t35a vsadd.h vr9, vr11, vr1 // t33 vssub.h vr13, vr11, vr1 // t34 vssub.h vr0, vr7, vr4 // t60a vsadd.h vr3, vr7, vr4 // t63a vssub.h vr1, vr10, vr6 // t61 vsadd.h vr11, vr10, vr6 // t62 vldrepl.w vr20, t0, 40 // 4017 vldrepl.w vr21, t0, 44 // 799 vmul_vmadd_w vr1, vr13, vr20, vr21, vr8, vr4 vmul_vmsub_w vr1, vr13, vr21, vr20, vr12, vr7 vssrarni.h.w vr4, vr8, 12 // t61a vssrarni.h.w vr7, vr12, 12 // t34a vmul_vmadd_w vr0, vr5, vr20, vr21, vr8, vr6 vmul_vmsub_w vr0, vr5, vr21, vr20, vr12, vr10 vssrarni.h.w vr6, vr8, 12 // t60 vssrarni.h.w vr10, vr12, 12 // t35 vst_x8 t6, 0, 16, vr2, vr9, vr7, vr10, vr6, vr4, vr11, vr3 .endm // dct64_step1 // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a .macro dct64_step2_lsx vld vr0, t5, 0 // t32a vld vr2, t4, 0 // t63a vld vr3, t5, 16*8 // t56a vld vr1, t4, 16*8 // t39a vld vr4, t5, 16*16 // t40a vld vr6, t4, 16*16 // t55a vld vr7, 
t5, 16*24 // t48a vld vr5, t4, 16*24 // t47a vsadd.h vr8, vr0, vr1 // t32 vssub.h vr9, vr0, vr1 // t39 vsadd.h vr10, vr2, vr3 // t63 vssub.h vr11, vr2, vr3 // t56 vssub.h vr12, vr5, vr4 // t40 vsadd.h vr13, vr5, vr4 // t47 vsadd.h vr14, vr7, vr6 // t48 vssub.h vr15, vr7, vr6 // t55 vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vmul_vmadd_w vr11, vr9, vr21, vr20, vr0, vr2 vmul_vmsub_w vr11, vr9, vr20, vr21, vr1, vr3 vssrarni.h.w vr2, vr0, 12 // t56a vssrarni.h.w vr3, vr1, 12 // t39a vmul_vmadd_w vr15, vr12, vr21, vr20, vr0, vr4 vmul_vmsub_w vr15, vr12, vr20, vr21, vr1, vr5 vneg.w vr0, vr0 vneg.w vr4, vr4 vssrarni.h.w vr5, vr1, 12 // t55a vssrarni.h.w vr4, vr0, 12 // t40a vsadd.h vr9, vr8, vr13 // t32a vssub.h vr11, vr8, vr13 // t47a vsadd.h vr6, vr3, vr4 // t39 vssub.h vr7, vr3, vr4 // t40 vssub.h vr12, vr10, vr14 // t48a vsadd.h vr15, vr10, vr14 // t63a vssub.h vr0, vr2, vr5 // t55 vsadd.h vr1, vr2, vr5 // t56 vldrepl.w vr20, t0, 0 // 2896 vmul_vmsub_w vr0, vr7, vr20, vr20, vr8, vr13 vmul_vmadd_w vr0, vr7, vr20, vr20, vr3, vr4 vssrarni.h.w vr13, vr8, 12 // t40a vssrarni.h.w vr4, vr3, 12 // t55a vmul_vmsub_w vr12, vr11, vr20, vr20, vr8, vr10 vmul_vmadd_w vr12, vr11, vr20, vr20, vr3, vr14 vssrarni.h.w vr10, vr8, 12 // t47 vssrarni.h.w vr14, vr3, 12 // t48 // t32a t39 t40a t47 t48 t55a t56 t63a // vr9 vr6 vr13 vr10 vr14 vr4 vr1 vr15 vst vr9, t5, 0 // t32a vst vr6, t4, 0 // t39 vst vr13, t5, 16*8 // t40a vst vr10, t4, 16*8 // t47 vst vr14, t5, 16*16 // t48 vst vr4, t4, 16*16 // t55a vst vr1, t5, 16*24 // t56 vst vr15, t4, 16*24 // t63a .endm // dct64_step2_lsx .macro dct64_step3_lsx // t0 t1 t2 t3 t4 t5 t6 t7 vld_x8 t3, 0, 16, vr2, vr3, vr7, vr8, vr11, vr12, vr16, vr17 vld vr9, t5, 16*24 // t56 vld vr6, t5, 16*24+16 // t57a vld vr13, t5, 16*24+32 // t58 vld vr10, t5, 16*24+48 // t59a vld vr14, t4, 16*24-48 // t60 vld vr4, t4, 16*24-32 // t61a vld vr1, t4, 16*24-16 // t62 vld vr15, t4, 16*24 // t63a vsadd.h vr20, vr2, vr15 // c[0] vssub.h vr21, vr2, vr15 // c[63] vsadd.h vr22, vr3, vr1 // c[1] vssub.h vr23, vr3, vr1 // c[62] vsadd.h vr24, vr7, vr4 // c[2] vssub.h vr25, vr7, vr4 // c[61] vsadd.h vr26, vr8, vr14 // c[3] vssub.h vr27, vr8, vr14 // c[60] vsadd.h vr28, vr11, vr10 // c[4] vssub.h vr29, vr11, vr10 // c[59] vsadd.h vr30, vr12, vr13 // c[5] vssub.h vr31, vr12, vr13 // c[58] vsadd.h vr2, vr16, vr6 // c[6] vssub.h vr15, vr16, vr6 // c[57] vsadd.h vr1, vr17, vr9 // c[7] vssub.h vr3, vr17, vr9 // c[56] .endm // dct64_step3_lsx .macro dct64_step4_lsx transpose8x8, shift, start0, stride0, start1, stride1 dct64_step3_lsx .ifnb \transpose8x8 LSX_TRANSPOSE8x8_H vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1, \ vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1, \ vr4, vr7, vr8, vr14, vr10, vr11, vr12, vr13 LSX_TRANSPOSE8x8_H vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21, \ vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21, \ vr4, vr7, vr8, vr14, vr10, vr11, vr12, vr13 .endif .ifnb \shift .irp i, vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1, \ vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21 vsrari.h \i, \i, \shift .endr .endif vst_x8 t7, \start0, \stride0, vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1 vst_x8 t7, \start1, \stride1, vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21 .endm // dct64_step4_lsx .macro dct64_step5_lsx in0, in1, in2, in3, in4, in5, in6, in7 fld.d f4, t0, 0 fldx.d f5, t0, a1 fld.d f6, t6, 0 fldx.d f7, t6, a1 alsl.d t0, a1, t0, 2 alsl.d t6, a1, t6, 2 fld.d f8, t0, 0 fldx.d f9, t0, a1 fld.d f10, t6, 0 fldx.d f11, t6, a1 .irp i, vr4, vr5, vr6, vr7, vr8, vr9, vr10, vr11 
vsllwil.hu.bu \i, \i, 0 .endr vsrari.h vr20, \in0, 4 vsrari.h vr22, \in1, 4 vsrari.h vr24, \in2, 4 vsrari.h vr26, \in3, 4 vsrari.h vr28, \in4, 4 vsrari.h vr30, \in5, 4 vsrari.h vr2, \in6, 4 vsrari.h vr1, \in7, 4 vadd.h vr4, vr4, vr20 vadd.h vr5, vr5, vr22 vadd.h vr6, vr6, vr24 vadd.h vr7, vr7, vr26 vadd.h vr8, vr8, vr28 vadd.h vr9, vr9, vr30 vadd.h vr10, vr10, vr2 vadd.h vr11, vr11, vr1 vssrani.bu.h vr5, vr4, 0 vssrani.bu.h vr7, vr6, 0 vssrani.bu.h vr9, vr8, 0 vssrani.bu.h vr11, vr10, 0 vstelm.d vr5, t1, 0, 0 vstelm.d vr5, t2, 0, 1 alsl.d t1, a1, t1, 1 alsl.d t2, a1, t2, 1 vstelm.d vr7, t1, 0, 0 vstelm.d vr7, t2, 0, 1 alsl.d t1, a1, t1, 1 alsl.d t2, a1, t2, 1 vstelm.d vr9, t1, 0, 0 vstelm.d vr9, t2, 0, 1 alsl.d t1, a1, t1, 1 alsl.d t2, a1, t2, 1 vstelm.d vr11, t1, 0, 0 vstelm.d vr11, t2, 0, 1 .endm // dct64_step5_lsx .macro dct_8x32_tx64_new_lsx vld_loc0, stride0, vld_loc1, stride1, rect2 vld_x8 t2, \vld_loc0, \stride0, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 dct_8x16_tx64_core_lsx \rect2 vst_x16 t3, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 vxor.v vr31, vr31, vr31 vst_x8 t2, \vld_loc0, \stride0, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 vld_x8 t2, \vld_loc1, \stride1, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 vst_x8 t2, \vld_loc1, \stride1, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 la.local t0, idct_coeffs .ifc \rect2, rect2_lsx vldrepl.w vr23, t0, 0 // 2896 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 rect2_lsx \i, vr23, \i .endr .endif vldrepl.w vr20, t0, 64 // 201 vldrepl.w vr21, t0, 68 // 4091 vsllwil.w.h vr22, vr0, 0 vexth.w.h vr23, vr0 vmul.w vr8, vr22, vr21 vmul.w vr9, vr23, vr21 vmul.w vr0, vr22, vr20 vmul.w vr10, vr23, vr20 vssrarni.h.w vr9, vr8, 12 // t31a vssrarni.h.w vr10, vr0, 12 // t16a vldrepl.w vr20, t0, 72 // 3035 vldrepl.w vr21, t0, 76 // 2751 vsllwil.w.h vr22, vr7, 0 vexth.w.h vr23, vr7 vneg.w vr21, vr21 vmul.w vr8, vr22, vr20 vmul.w vr0, vr23, vr20 vmul.w vr7, vr22, vr21 vmul.w vr30, vr23, vr21 vssrarni.h.w vr0, vr8, 12 // t30a vssrarni.h.w vr30, vr7, 12 // t17a vldrepl.w vr20, t0, 80 // 1751 vldrepl.w vr21, t0, 84 // 3703 vsllwil.w.h vr22, vr4, 0 vexth.w.h vr23, vr4 vmul.w vr8, vr22, vr21 vmul.w vr7, vr23, vr21 vmul.w vr4, vr22, vr20 vmul.w vr19, vr23, vr20 vssrarni.h.w vr7, vr8, 12 // t29a vssrarni.h.w vr19, vr4, 12 // t18a vldrepl.w vr20, t0, 88 // 3857 vldrepl.w vr21, t0, 92 // 1380 vsllwil.w.h vr22, vr3, 0 vexth.w.h vr23, vr3 vneg.w vr21, vr21 vmul.w vr8, vr22, vr20 vmul.w vr4, vr23, vr20 vmul.w vr3, vr22, vr21 vmul.w vr26, vr23, vr21 vssrarni.h.w vr4, vr8, 12 // t28a vssrarni.h.w vr26, vr3, 12 // t19a vldrepl.w vr20, t0, 96 // 995 vldrepl.w vr21, t0, 100 // 3973 vsllwil.w.h vr22, vr2, 0 vexth.w.h vr23, vr2 vmul.w vr8, vr22, vr21 vmul.w vr3, vr23, vr21 vmul.w vr2, vr22, vr20 vmul.w vr27, vr23, vr20 vssrarni.h.w vr3, vr8, 12 // t27a vssrarni.h.w vr27, vr2, 12 // t20a vldrepl.w vr20, t0, 104 // 3513 vldrepl.w vr21, t0, 108 // 2106 vsllwil.w.h vr22, vr5, 0 vexth.w.h vr23, vr5 vneg.w vr21, vr21 vmul.w vr8, vr22, vr20 vmul.w vr2, vr23, vr20 vmul.w vr5, vr22, vr21 vmul.w vr28, vr23, vr21 vssrarni.h.w vr2, vr8, 12 // t26a vssrarni.h.w vr28, vr5, 12 // t21a vldrepl.w vr20, t0, 112 // 2440 -> 1220 vldrepl.w vr21, t0, 116 // 3290 -> 1645 vsllwil.w.h vr22, vr6, 0 vexth.w.h vr23, vr6 vmul.w vr8, vr22, vr21 vmul.w vr5, vr23, vr21 vmul.w vr6, vr22, vr20 vmul.w vr25, vr23, vr20 vssrarni.h.w vr5, vr8, 12 // t25a vssrarni.h.w vr25, vr6, 12 // t22a vldrepl.w vr20, t0, 120 // 4052 vldrepl.w vr21, t0, 124 // 601 
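    // vr1 * {4052, -601} >> 12 -> t24a (vr6), t23a (vr24)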
vsllwil.w.h vr22, vr1, 0 vexth.w.h vr23, vr1 vneg.w vr21, vr21 vmul.w vr8, vr22, vr20 vmul.w vr6, vr23, vr20 vmul.w vr1, vr22, vr21 vmul.w vr24, vr23, vr21 vssrarni.h.w vr6, vr8, 12 // t24a vssrarni.h.w vr24, vr1, 12 // t23a vsadd.h vr1, vr10, vr30 // t16 vssub.h vr29, vr10, vr30 // t17 vssub.h vr8, vr26, vr19 // t18 vsadd.h vr31, vr26, vr19 // t19 vsadd.h vr10, vr27, vr28 // t20 vssub.h vr30, vr27, vr28 // t21 vssub.h vr19, vr24, vr25 // t22 vsadd.h vr26, vr24, vr25 // t23 vsadd.h vr27, vr6, vr5 // t24 vssub.h vr28, vr6, vr5 // t25 vssub.h vr24, vr3, vr2 // t26 vsadd.h vr25, vr3, vr2 // t27 vsadd.h vr5, vr4, vr7 // t28 vssub.h vr6, vr4, vr7 // t29 vssub.h vr2, vr9, vr0 // t30 vsadd.h vr3, vr9, vr0 // t31 vldrepl.w vr20, t0, 16 // 799 vldrepl.w vr21, t0, 20 // 4017 vmul_vmadd_w vr2, vr29, vr21, vr20, vr4, vr7 vmul_vmsub_w vr2, vr29, vr20, vr21, vr11, vr0 vssrarni.h.w vr7, vr4, 12 // t30a vssrarni.h.w vr0, vr11, 12 // t17a vmul_vmadd_w vr6, vr8, vr21, vr20, vr4, vr9 vneg.w vr4, vr4 vneg.w vr9, vr9 vmul_vmsub_w vr6, vr8, vr20, vr21, vr11, vr2 vssrarni.h.w vr9, vr4, 12 // t18a vssrarni.h.w vr2, vr11, 12 // t29a vldrepl.w vr20, t0, 24 // 3406 -> 1703 vldrepl.w vr21, t0, 28 // 2276 -> 1138 vmul_vmadd_w vr24, vr30, vr21, vr20, vr4, vr29 vmul_vmsub_w vr24, vr30, vr20, vr21, vr11, vr6 vssrarni.h.w vr29, vr4, 12 // t26a vssrarni.h.w vr6, vr11, 12 // t21a vmul_vmadd_w vr28, vr19, vr21, vr20, vr4, vr8 vneg.w vr4, vr4 vneg.w vr8, vr8 vmul_vmsub_w vr28, vr19, vr20, vr21, vr11, vr24 vssrarni.h.w vr8, vr4, 12 // t22a vssrarni.h.w vr24, vr11, 12 // t25a vsadd.h vr4, vr1, vr31 // t16a vssub.h vr30, vr1, vr31 // t19a vsadd.h vr19, vr0, vr9 // t17 vssub.h vr28, vr0, vr9 // t18 vssub.h vr1, vr26, vr10 // t20a vsadd.h vr31, vr26, vr10 // t23a vssub.h vr0, vr8, vr6 // t21 vsadd.h vr9, vr8, vr6 // t22 vsadd.h vr10, vr27, vr25 // t24a vssub.h vr26, vr27, vr25 // t27a vsadd.h vr6, vr24, vr29 // t25 vssub.h vr8, vr24, vr29 // t26 vssub.h vr25, vr3, vr5 // t28a vsadd.h vr27, vr3, vr5 // t31a vssub.h vr24, vr7, vr2 // t29 vsadd.h vr29, vr7, vr2 // t30 vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vmul_vmadd_w vr24, vr28, vr21, vr20, vr3, vr5 vmul_vmsub_w vr24, vr28, vr20, vr21, vr11, vr2 vssrarni.h.w vr5, vr3, 12 // t29a vssrarni.h.w vr2, vr11, 12 // 18a vmul_vmadd_w vr25, vr30, vr21, vr20, vr3, vr7 vmul_vmsub_w vr25, vr30, vr20, vr21, vr11, vr24 vssrarni.h.w vr7, vr3, 12 // t28 vssrarni.h.w vr24, vr11, 12 // t19 vmul_vmadd_w vr26, vr1, vr21, vr20, vr3, vr28 vneg.w vr3, vr3 vneg.w vr28, vr28 vmul_vmsub_w vr26, vr1, vr20, vr21, vr11, vr25 vssrarni.h.w vr28, vr3, 12 // t20 vssrarni.h.w vr25, vr11, 12 // t27 vmul_vmadd_w vr8, vr0, vr21, vr20, vr3, vr30 vneg.w vr3, vr3 vneg.w vr30, vr30 vmul_vmsub_w vr8, vr0, vr20, vr21, vr11, vr1 vssrarni.h.w vr30, vr3, 12 // t21a vssrarni.h.w vr1, vr11, 12 // t26a vsadd.h vr3, vr4, vr31 // t16 vssub.h vr26, vr4, vr31 // t23 vsadd.h vr0, vr19, vr9 // t17a vssub.h vr8, vr19, vr9 // t22a vsadd.h vr4, vr2, vr30 // t18 vssub.h vr31, vr2, vr30 // t21 vsadd.h vr9, vr24, vr28 // t19a vssub.h vr19, vr24, vr28 // t20a vssub.h vr2, vr27, vr10 // t24 vsadd.h vr30, vr27, vr10 // t31 vssub.h vr24, vr29, vr6 // t25a vsadd.h vr28, vr29, vr6 // t30a vssub.h vr10, vr5, vr1 // t26 vsadd.h vr27, vr5, vr1 // t29 vssub.h vr6, vr7, vr25 // t27a vsadd.h vr29, vr7, vr25 // t28a vldrepl.w vr20, t0, 0 // 2896 vmul_vmsub_w vr6, vr19, vr20, vr20, vr1, vr5 vmul_vmadd_w vr6, vr19, vr20, vr20, vr11, vr7 vssrarni.h.w vr5, vr1, 12 // t20 vssrarni.h.w vr7, vr11, 12 // t27 vmul_vmsub_w vr10, vr31, vr20, 
vr20, vr1, vr25 vmul_vmadd_w vr10, vr31, vr20, vr20, vr11, vr6 vssrarni.h.w vr25, vr1, 12 // t21a vssrarni.h.w vr6, vr11, 12 // t26a vmul_vmsub_w vr24, vr8, vr20, vr20, vr1, vr19 vmul_vmadd_w vr24, vr8, vr20, vr20, vr11, vr10 vssrarni.h.w vr19, vr1, 12 // t22 vssrarni.h.w vr10, vr11, 12 // t25 vmul_vmsub_w vr2, vr26, vr20, vr20, vr1, vr31 vmul_vmadd_w vr2, vr26, vr20, vr20, vr11, vr8 vssrarni.h.w vr31, vr1, 12 // t23a vssrarni.h.w vr8, vr11, 12 // t24a // t31 t30a t29 t28a t27 t26a t25 t24a t23a t22 t21a t20 t19a t18 t17a t16 // vr30 vr28 vr27 vr29 vr7 vr6 vr10 vr8 vr31 vr19 vr25 vr5 vr9 vr4 vr0 vr3 vld_x8 t3, 0, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 vsadd.h vr1, vr11, vr30 // c[0] vssub.h vr2, vr11, vr30 // c[31] vsadd.h vr24, vr12, vr28 // c[1] vssub.h vr26, vr12, vr28 // c[30] vsadd.h vr11, vr13, vr27 // c[2] vssub.h vr30, vr13, vr27 // c[29] vsadd.h vr12, vr14, vr29 // c[3] vssub.h vr28, vr14, vr29 // c[28] vsadd.h vr13, vr15, vr7 // c[4] vssub.h vr27, vr15, vr7 // c[27] vsadd.h vr14, vr16, vr6 // c[5] vssub.h vr29, vr16, vr6 // c[26] vsadd.h vr7, vr17, vr10 // c[6] vssub.h vr15, vr17, vr10 // c[25] vsadd.h vr6, vr18, vr8 // c[7] vssub.h vr16, vr18, vr8 // c[24] vst_x8 t3, 0, 16, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 vst_x8 t3, 384, 16, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 vld_x8 t3, 128, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 vsadd.h vr1, vr11, vr31 // c[8] vssub.h vr2, vr11, vr31 // c[23] vsadd.h vr24, vr12, vr19 // c[9] vssub.h vr26, vr12, vr19 // c[22] vsadd.h vr11, vr13, vr25 // c[10] vssub.h vr30, vr13, vr25 // c[21] vsadd.h vr12, vr14, vr5 // c[11] vssub.h vr28, vr14, vr5 // c[20] vsadd.h vr13, vr15, vr9 // c[12] vssub.h vr27, vr15, vr9 // c[19] vsadd.h vr14, vr16, vr4 // c[13] vssub.h vr29, vr16, vr4 // c[18] vsadd.h vr7, vr17, vr0 // c[14] vssub.h vr15, vr17, vr0 // c[17] vsadd.h vr6, vr18, vr3 // c[15] vssub.h vr16, vr18, vr3 // c[16] vst_x8 t3, 128, 16, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 vst_x8 t3, 256, 16, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 .endm // dct_8x32_tx64_new_lsx .macro DST_ADD_W64 in0, in1, in2, in3, in4, in5, in6, in7 vsllwil.hu.bu vr4, vr10, 0 vsllwil.hu.bu vr5, vr11, 0 vsllwil.hu.bu vr6, vr12, 0 vsllwil.hu.bu vr7, vr13, 0 vexth.hu.bu vr10, vr10 vexth.hu.bu vr11, vr11 vexth.hu.bu vr12, vr12 vexth.hu.bu vr13, vr13 vadd.h vr4, vr4, \in0 vadd.h vr10, vr10, \in1 vadd.h vr5, vr5, \in2 vadd.h vr11, vr11, \in3 vadd.h vr6, vr6, \in4 vadd.h vr12, vr12, \in5 vadd.h vr7, vr7, \in6 vadd.h vr13, vr13, \in7 vssrani.bu.h vr10, vr4, 0 vssrani.bu.h vr11, vr5, 0 vssrani.bu.h vr12, vr6, 0 vssrani.bu.h vr13, vr7, 0 vst vr10, a0, 0 vst vr11, a0, 16 vst vr12, a0, 32 vst vr13, a0, 48 .endm .macro idct_dc_w64 w, h, shift ld.h t2, a2, 0 vldi vr0, 0x8b5 vreplgr2vr.w vr1, t2 vldi vr20, 0x880 vmul.w vr2, vr0, vr1 st.h zero, a2, 0 vsrari.w vr2, vr2, 8 vld vr13, a0, 48 .if (2*\w == \h) || (2*\h == \w) vmul.w vr2, vr2, vr0 vsrari.w vr2, vr2, 8 .endif .if \shift>0 vsrari.w vr2, vr2, \shift .endif vld vr11, a0, 16 vmadd.w vr20, vr2, vr0 vld vr12, a0, 32 vssrarni.h.w vr20, vr20, 12 vld vr10, a0, 0 .endm function inv_txfm_add_dct_dct_64x64_8bpc_lsx bnez a3, .NO_HAS_DCONLY_64x64 idct_dc_w64 64, 64, 2 DST_ADD_W64 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20 li.w t3, 63 .loop63: add.d a0, a0, a1 vld vr10, a0, 0 vld vr11, a0, 16 vld vr12, a0, 32 vld vr13, a0, 48 DST_ADD_W64 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20 addi.d t3, t3, -1 blt zero, t3, .loop63 b .DCT_DCT_64X64_END .NO_HAS_DCONLY_64x64: malloc_space 64*32*2+512+512 .macro 
dct64x64_core1_lsx shift, rect2 //addi.d t2, a2, \in0 //addi.d t7, t7, \in1 li.w t4, 64*32*2+64 add.d t3, sp, t4 addi.d t6, t3, 512 add.d t5, t6, zero dct_8x32_tx64_new_lsx 0, 256, 128, 256, \rect2 la.local t0, idct64_coeffs vxor.v vr31, vr31, vr31 //addi.d a4, a2, \in2 // 32 ... // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a vld vr0, a4, 128*0 // in1 vld vr1, a4, 128*15 // in31 vld vr2, a4, 128*8 // in17 vld vr3, a4, 128*7 // in15 la.local a6, idct_coeffs .ifc \rect2, rect2_lsx vldrepl.w vr23, a6, 0 // 2896 .irp i, vr0, vr1, vr2, vr3 rect2_lsx \i, vr23, \i .endr .endif vst vr31, a4, 128*0 vst vr31, a4, 128*15 vst vr31, a4, 128*8 vst vr31, a4, 128*7 dct64_step1_lsx addi.d t0, t0, 48 addi.d t6, t6, 128 // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a vld vr0, a4, 128*3 // in7 vld vr1, a4, 128*12 // in25 vld vr2, a4, 128*11 // in23 vld vr3, a4, 128*4 // in9 la.local a6, idct_coeffs .ifc \rect2, rect2_lsx vldrepl.w vr23, a6, 0 // 2896 .irp i, vr0, vr1, vr2, vr3 rect2_lsx \i, vr23, \i .endr .endif vst vr31, a4, 128*3 vst vr31, a4, 128*12 vst vr31, a4, 128*11 vst vr31, a4, 128*4 dct64_step1_lsx addi.d t0, t0, 48 addi.d t6, t6, 128 // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a vld vr0, a4, 128*2 // in5 vld vr1, a4, 128*13 // in27 vld vr2, a4, 128*10 // in21 vld vr3, a4, 128*5 // in11 la.local a6, idct_coeffs .ifc \rect2, rect2_lsx vldrepl.w vr23, a6, 0 // 2896 .irp i, vr0, vr1, vr2, vr3 rect2_lsx \i, vr23, \i .endr .endif vst vr31, a4, 128*2 vst vr31, a4, 128*13 vst vr31, a4, 128*10 vst vr31, a4, 128*5 dct64_step1_lsx addi.d t0, t0, 48 addi.d t6, t6, 128 // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a vld vr0, a4, 128*1 // in3 vld vr1, a4, 128*14 // in29 vld vr2, a4, 128*9 // in19 vld vr3, a4, 128*6 // in13 la.local a6, idct_coeffs .ifc \rect2, rect2_lsx vldrepl.w vr23, a6, 0 // 2896 .irp i, vr0, vr1, vr2, vr3 rect2_lsx \i, vr23, \i .endr .endif vst vr31, a4, 128*1 vst vr31, a4, 128*14 vst vr31, a4, 128*9 vst vr31, a4, 128*6 dct64_step1_lsx la.local t0, idct_coeffs addi.d t4, t5, 16*7 // t32a/t39/t40a/t47/t48/t55a/t56/t63a dct64_step2_lsx addi.d t5, t5, 16 addi.d t4, t4, -16 // t33/t38a/t41/t46a/t49a/t54/t57a/t62 dct64_step2_lsx addi.d t5, t5, 16 addi.d t4, t4, -16 // t34a/t37/t42a/t45/t50/t53a/t58/t61a dct64_step2_lsx addi.d t5, t5, 16 addi.d t4, t4, -16 // t35/t36a/t43/t44a/t51a/t52/t59a/t60 dct64_step2_lsx li.w t4, 64*32*2+64+512 add.d t5, t4, sp addi.d t4, t5, 16*7 dct64_step4_lsx transpose8x8, \shift, 0, 128, 112, 128 addi.d t3, t3, 128 addi.d t4, t4, -16*8 addi.d t5, t5, -16*8 dct64_step4_lsx transpose8x8, \shift, 16, 128, 96, 128 addi.d t5, t5, -16*8 addi.d t4, t4, -16*8 addi.d t3, t3, 128 dct64_step4_lsx transpose8x8, \shift, 32, 128, 80, 128 addi.d t5, t5, -16*8 addi.d t4, t4, -16*8 addi.d t3, t3, 128 dct64_step4_lsx transpose8x8, \shift, 48, 128, 64, 128 .endm la.local t8, eob_32x32 addi.d t2, a2, 0 addi.d t7, sp, 64 addi.d t7, t7, 0 addi.d a4, a2, 64 .DCT_DCT_EOB_64x64: ld.h a5, t8, 0 addi.d t8, t8, 2 dct64x64_core1_lsx 2, no_rect2 addi.d t2, t2, 16 addi.d t7, t7, 128*8 addi.d a4, a4, 16 bge a3, a5, .DCT_DCT_EOB_64x64 la.local t8, eob_32x32 vxor.v vr31, vr31, vr31 ld.h t7, t8, 4 bge a3, t7, .DCT_DCT_EOB_64x64_END li.d t1, 1024*3+64 add.d t0, sp, t1 .rept 4 vst_x16 t0, 0, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 addi.d t0, t0, 256 .endr ld.h t7, t8, 2 bge a3, t7, .DCT_DCT_EOB_64x64_END li.d t1, 1024*2+64 add.d t0, sp, t1 .rept 4 vst_x16 t0, 0, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ vr31, vr31, vr31, vr31, 
vr31, vr31, vr31, vr31 addi.d t0, t0, 256 .endr ld.h t7, t8, 0 bge a3, t7, .DCT_DCT_EOB_64x64_END li.d t1, 1024*1+64 add.d t0, sp, t1 .rept 4 vst_x16 t0, 0, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 addi.d t0, t0, 256 .endr .DCT_DCT_EOB_64x64_END: .macro dct64x64_core2_lsx in0, in1, rect2 addi.d t2, sp, 64+\in0 addi.d t7, sp, 64+\in0 li.w t4, 64*32*2+64 add.d t3, sp, t4 addi.d t6, t3, 512 add.d t5, t6, zero addi.d t2, t2, 1024 addi.d t2, t2, 1024 dct_8x32_tx64_new_lsx -2048, 512, 256-2048, 512, \rect2 la.local t0, idct64_coeffs addi.d t2, sp, 64+64*2+\in0 addi.d t4, t2, 256*7 addi.d t4, t4, 256 vld vr0, t2, 256*0 // in1 vld vr1, t4, 256*7 // in31 vld vr2, t4, 256*0 // in17 vld vr3, t2, 256*7 // in15 dct64_step1_lsx addi.d t0, t0, 48 addi.d t6, t6, 128 vld vr0, t2, 256*3 // in7 vld vr1, t4, 256*4 // in25 vld vr2, t4, 256*3 // in23 vld vr3, t2, 256*4 // in9 dct64_step1_lsx addi.d t0, t0, 48 addi.d t6, t6, 128 vld vr0, t2, 256*2 // in5 vld vr1, t4, 256*5 // in27 vld vr2, t4, 256*2 // in21 vld vr3, t2, 256*5 // in11 dct64_step1_lsx addi.d t0, t0, 48 addi.d t6, t6, 128 vld vr0, t2, 256*1 // in3 vld vr1, t4, 256*6 // in29 vld vr2, t4, 256*1 // in19 vld vr3, t2, 256*6 // in13 dct64_step1_lsx la.local t0, idct_coeffs addi.d t4, t5, 16*7 // t32a/t39/t40a/t47/t48/t55a/t56/t63a dct64_step2_lsx addi.d t5, t5, 16 addi.d t4, t4, -16 // t33/t38a/t41/t46a/t49a/t54/t57a/t62 dct64_step2_lsx addi.d t5, t5, 16 addi.d t4, t4, -16 // t34a/t37/t42a/t45/t50/t53a/t58/t61a dct64_step2_lsx addi.d t5, t5, 16 addi.d t4, t4, -16 // t35/t36a/t43/t44a/t51a/t52/t59a/t60 dct64_step2_lsx li.w t4, 64*32*2+64+512 add.d t5, t4, sp addi.d t4, t5, 16*7 addi.d a0, a0, \in1 // 0 - 7, 56 -63 dct64_step3_lsx li.w t8, 0 mul.w t0, t8, a1 add.d t0, a0, t0 alsl.d t6, a1, t0, 1 addi.d t1, t0, 0 add.d t2, t0, a1 dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1 li.w t8, 56 mul.w t0, t8, a1 add.d t0, a0, t0 alsl.d t6, a1, t0, 1 addi.d t1, t0, 0 add.d t2, t0, a1 dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21 // 8 - 15, 48 - 55 addi.d t3, t3, 128 addi.d t4, t4, -16*8 addi.d t5, t5, -16*8 dct64_step3_lsx li.w t8, 8 mul.w t0, t8, a1 add.d t0, t0, a0 alsl.d t6, a1, t0, 1 addi.d t1, t0, 0 add.d t2, t0, a1 dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1 li.w t8, 48 mul.w t0, t8, a1 add.d t0, t0, a0 alsl.d t6, a1, t0, 1 addi.d t1, t0, 0 add.d t2, t0, a1 dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21 // 16 - 23, 40 - 47 addi.d t3, t3, 128 addi.d t4, t4, -16*8 addi.d t5, t5, -16*8 dct64_step3_lsx li.w t8, 16 mul.w t0, t8, a1 add.d t0, t0, a0 alsl.d t6, a1, t0, 1 addi.d t1, t0, 0 add.d t2, t0, a1 dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1 li.w t8, 40 mul.w t0, t8, a1 add.d t0, t0, a0 alsl.d t6, a1, t0, 1 addi.d t1, t0, 0 add.d t2, t0, a1 dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21 // 24 - 31, 32 - 39 addi.d t3, t3, 128 addi.d t4, t4, -16*8 addi.d t5, t5, -16*8 dct64_step3_lsx li.w t8, 24 mul.w t0, t8, a1 add.d t0, t0, a0 alsl.d t6, a1, t0, 1 addi.d t1, t0, 0 add.d t2, t0, a1 dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1 li.w t8, 32 mul.w t0, t8, a1 add.d t0, t0, a0 alsl.d t6, a1, t0, 1 addi.d t1, t0, 0 add.d t2, t0, a1 dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21 .endm dct64x64_core2_lsx 16*0, 0, no_rect2 dct64x64_core2_lsx 16*1, 8, no_rect2 dct64x64_core2_lsx 16*2, 8, no_rect2 dct64x64_core2_lsx 16*3, 8, no_rect2 dct64x64_core2_lsx 16*4, 8, no_rect2 dct64x64_core2_lsx 
16*5, 8, no_rect2 dct64x64_core2_lsx 16*6, 8, no_rect2 dct64x64_core2_lsx 16*7, 8, no_rect2 free_space 64*32*2+512+512 .DCT_DCT_64X64_END: endfunc function inv_txfm_add_dct_dct_64x32_8bpc_lsx bnez a3, .NO_HAS_DCONLY_64x32 idct_dc_w64 64, 32, 1 DST_ADD_W64 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20 li.w t3, 31 .loop31: add.d a0, a0, a1 vld vr10, a0, 0 vld vr11, a0, 16 vld vr12, a0, 32 vld vr13, a0, 48 DST_ADD_W64 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20 addi.d t3, t3, -1 blt zero, t3, .loop31 b .DCT_DCT_64X32_END .NO_HAS_DCONLY_64x32: malloc_space 64*32*2+512+512 la.local t8, eob_32x32 addi.d t2, a2, 0 addi.d t7, sp, 64 addi.d t7, t7, 0 addi.d a4, a2, 64 .DCT_DCT_EOB_64x32: ld.h a5, t8, 0 addi.d t8, t8, 2 dct64x64_core1_lsx 1, rect2_lsx addi.d t2, t2, 16 addi.d t7, t7, 128*8 addi.d a4, a4, 16 bge a3, a5, .DCT_DCT_EOB_64x32 la.local t8, eob_32x32 vxor.v vr31, vr31, vr31 ld.h t7, t8, 4 bge a3, t7, .DCT_DCT_EOB_64x32_END li.d t1, 1024*3+64 add.d t0, sp, t1 .rept 4 vst_x16 t0, 0, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 addi.d t0, t0, 256 .endr ld.h t7, t8, 2 bge a3, t7, .DCT_DCT_EOB_64x32_END li.d t1, 1024*2+64 add.d t0, sp, t1 .rept 4 vst_x16 t0, 0, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 addi.d t0, t0, 256 .endr ld.h t7, t8, 0 bge a3, t7, .DCT_DCT_EOB_64x32_END li.d t1, 1024*1+64 add.d t0, sp, t1 .rept 4 vst_x16 t0, 0, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 addi.d t0, t0, 256 .endr .DCT_DCT_EOB_64x32_END: addi.d t2, sp, 64 li.w t4, 64*32*2+64 add.d t3, sp, t4 addi.d t5, sp, 64 addi.d t5, t5, 1024 addi.d t5, t5, 1024 .rept 8 vld_x8 t2, 0, 256, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 addi.d t4, t2, 1024 addi.d t4, t4, 1024 vld_x8 t4, 0, 256, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 inv_dct16_lsx no_rect2 vst_x16 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 addi.d t4, t2, 128 vld_x8 t4, 0, 256, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 addi.d t4, t4, 1024 addi.d t4, t4, 1024 vld_x8 t4, 0, 256, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 dct_8x32_core_lsx t5, t3, 0, 128, 16, -2048, 1024, -1024, 0, 128, , 4 addi.d t2, t2, 16 addi.d t5, t5, 16 addi.d t1, t1, 16 .endr addi.d t2, sp, 64 li.w t3, 32 .loop32: vld vr10, a0, 0 vld vr11, a0, 16 vld vr12, a0, 32 vld vr13, a0, 48 vld_x8 t2, 0, 16, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 DST_ADD_W64 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 add.d a0, a0, a1 addi.d t2, t2, 128 addi.d t3, t3, -1 blt zero, t3, .loop32 free_space 64*32*2+512+512 .DCT_DCT_64X32_END: endfunc .macro VLD_DST_ADD_W8_H32 in0 vld vr4, t3, 0 vld vr5, t3, 16 vld vr6, t3, 32 vld vr7, t3, 48 VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 addi.d t3, t3, 64 add.d a0, a1, a0 alsl.d t2, a1, t2, 2 vld vr4, t3, 0 vld vr5, t3, 16 vld vr6, t3, 32 vld vr7, t3, 48 VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 addi.d t3, sp, \in0 add.d a0, a1, a0 alsl.d t2, a1, t2, 2 .endm function inv_txfm_add_dct_dct_8x32_8bpc_lsx bnez a3, .NO_HAS_DCONLY_8x32 idct_dc 8, 32, 2 DST_ADD_W8 vr10, vr11, vr12, vr13, vr20, vr20, vr20, vr20 .rept 7 add.d a0, a1, a0 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr20, vr20, vr20, vr20 .endr b .DCT_DCT_8X32_END .NO_HAS_DCONLY_8x32: malloc_space 512 la.local t8, eob_8x32 addi.d t3, sp, 64 addi.d t2, a2, 0 .DCT_DCT_EOB_8x32: ld.h t7, t8, 0 addi.d t8, t8, 2 vld_x8 a2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 inv_dct8_lsx vr0, vr1, vr2, vr3, vr4, vr5, 
vr6, vr7, .8h .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 vsrari.h \i, \i, 2 .endr vxor.v vr31, vr31, vr31 vst_x8 a2, 0, 64, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 vst_x8 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 addi.d a2, a2, 16 addi.d t3, t3, 128 bge a3, t7, .DCT_DCT_EOB_8x32 la.local t8, eob_8x32 vxor.v vr31, vr31, vr31 ld.h t7, t8, 4 bge a3, t7, .DCT_DCT_EOB_8x32_END vst_x8 sp, 64+384, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 ld.h t7, t8, 2 bge a3, t7, .DCT_DCT_EOB_8x32_END vst_x8 sp, 64+256, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 ld.h t7, t8, 0 bge a3, t7, .DCT_DCT_EOB_8x32_END vst_x8 sp, 64+128, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 .DCT_DCT_EOB_8x32_END: addi.d t2, sp, 64 addi.d t3, sp, 64 vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 inv_dct16_lsx .8h vst_x16 t3, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 vld_x16 t2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 dct_8x32_core_lsx t2, t3, 0, 256, 32, 0, 128, 256, 384, 16, , 4 alsl.d t2, a1, a0, 1 addi.d t3, sp, 64 VLD_DST_ADD_W8_H32 320 VLD_DST_ADD_W8_H32 448 VLD_DST_ADD_W8_H32 192 VLD_DST_ADD_W8_H32 0 free_space 512 .DCT_DCT_8X32_END: endfunc function inv_txfm_add_identity_identity_8x32_8bpc_lsx la.local t7, eob_8x32 alsl.d t2, a1, a0, 1 .IDENTITY_IDENTITY_EOB_8x32: ld.h t6, t7, 0 addi.d t7, t7, 2 vld_x8 a2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 vxor.v vr23, vr23, vr23 vst_x8 a2, 0, 64, vr23, vr23, vr23, vr23, vr23, vr23, vr23, vr23 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 vsrari.h \i, \i, 1 .endr LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 .irp i, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 vsrari.h \i, \i, 2 .endr VLD_DST_ADD_W8 vr16, vr17, vr18, vr19 add.d a0, a1, a0 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr20, vr21, vr22, vr23 add.d a0, a1, a0 alsl.d t2, a1, a0, 1 addi.d a2, a2, 16 bge a3, t6, .IDENTITY_IDENTITY_EOB_8x32 endfunc .macro def_fn_16x4_base txfm functionl inv_txfm_\txfm\()add_16x4_lsx vld_x8 a2, 0, 16, vr0, vr2, vr4, vr6, vr8, vr10, vr12, vr14 .ifc \txfm, identity_ li.w t0, 1697 vreplgr2vr.w vr20, t0 .irp i, vr0, vr2, vr4, vr6, vr8, vr10, vr12, vr14 inv_identity16_lsx \i, vr20, \i, \i, .8h .endr vilvh.d vr1, vr0, vr0 vilvh.d vr3, vr2, vr2 vilvh.d vr5, vr4, vr4 vilvh.d vr7, vr6, vr6 vilvh.d vr9, vr8, vr8 vilvh.d vr11, vr10, vr10 vilvh.d vr13, vr12, vr12 vilvh.d vr15, vr14, vr14 .else vilvh.d vr1, vr0, vr0 vilvh.d vr3, vr2, vr2 vilvh.d vr5, vr4, vr4 vilvh.d vr7, vr6, vr6 vilvh.d vr9, vr8, vr8 vilvh.d vr11, vr10, vr10 vilvh.d vr13, vr12, vr12 vilvh.d vr15, vr14, vr14 move t6, ra jirl ra, t7, 0 move ra, t6 .endif vxor.v vr23, vr23, vr23 vst_x8 a2, 0, 16, vr23, vr23, vr23, vr23, vr23, vr23, vr23, vr23 LSX_TRANSPOSE8x4_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr0, vr1, \ vr2, vr3, vr16, vr17, vr18, vr19, vr20, vr21 LSX_TRANSPOSE8x4_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, vr4, \ vr5, vr6, vr7, vr16, vr17, vr18, vr19, vr20, vr21 vsrari.h vr0, vr0, 1 vsrari.h vr1, vr1, 1 vsrari.h vr2, vr2, 1 vsrari.h vr3, vr3, 1 move t6, ra jirl ra, t8, 0 move ra, t6 vsrari.h vr8, vr0, 4 vsrari.h vr9, vr1, 4 vsrari.h vr10, vr2, 4 vsrari.h vr11, vr3, 
4
    vsrari.h vr0, vr4, 1
    vsrari.h vr1, vr5, 1
    vsrari.h vr2, vr6, 1
    vsrari.h vr3, vr7, 1
    move t6, ra
    jirl ra, t8, 0
    move ra, t6
    vsrari.h vr16, vr0, 4
    vsrari.h vr17, vr1, 4
    vsrari.h vr18, vr2, 4
    vsrari.h vr19, vr3, 4
    alsl.d t2, a1, a0, 1
    VLD_DST_ADD_W16 vr8, vr16, vr9, vr17, vr10, vr18, vr11, vr19
endfuncl
.endm

def_fn_16x4_base identity_
def_fn_16x4_base

.macro fn_16x4 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_16x4_8bpc_lsx
.ifc \txfm1\()_\txfm2, dct_dct
    bnez a3, .NO_HAS_DCONLY_16x4
    idct_dc 16, 4, 1
    DST_ADD_W16 vr10, vr11, vr12, vr13, vr20, vr20, vr20, \
                vr20, vr20, vr20, vr20, vr20
    b .\txfm1\()_\txfm2\()_16x4_END
.NO_HAS_DCONLY_16x4:
.endif
.ifnc \txfm1, identity
    la.local t7, inv_\txfm1\()_4h_x16_lsx
.endif
    la.local t8, inv_\txfm2\()_8h_x4_lsx
.ifc \txfm1, identity
    b inv_txfm_identity_add_16x4_lsx
.else
    b inv_txfm_add_16x4_lsx
.endif
.\txfm1\()_\txfm2\()_16x4_END:
endfunc
.endm

fn_16x4 dct, dct
fn_16x4 identity, identity
fn_16x4 adst, dct

.macro VLD_DST_ADD_W16_H32 in0
    vld vr14, t3, 0
    vld vr15, t3, 16
    vld vr16, t3, 32
    vld vr17, t3, 48
    vld vr18, t5, 0
    vld vr19, t5, 16
    vld vr20, t5, 32
    vld vr21, t5, 48
    vsrari_h_x8 vr14, vr18, vr15, vr19, vr16, vr20, vr17, vr21, \
                vr14, vr18, vr15, vr19, vr16, vr20, vr17, vr21, 4
    VLD_DST_ADD_W16 vr14, vr18, vr15, vr19, vr16, vr20, vr17, vr21
    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, t2, 2
    addi.d t3, t3, 64
    addi.d t5, t5, 64
    vld vr14, t3, 0
    vld vr15, t3, 16
    vld vr16, t3, 32
    vld vr17, t3, 48
    vld vr18, t5, 0
    vld vr19, t5, 16
    vld vr20, t5, 32
    vld vr21, t5, 48
    vsrari_h_x8 vr14, vr18, vr15, vr19, vr16, vr20, vr17, vr21, \
                vr14, vr18, vr15, vr19, vr16, vr20, vr17, vr21, 4
    VLD_DST_ADD_W16 vr14, vr18, vr15, vr19, vr16, vr20, vr17, vr21
    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, t2, 2
    addi.d t3, sp, \in0
    addi.d t5, sp, \in0+512
.endm

function inv_txfm_add_dct_dct_16x32_8bpc_lsx
    bnez a3, .NO_HAS_DCONLY_16x32
    idct_dc 16, 32, 1
    DST_ADD_W16 vr10, vr11, vr12, vr13, vr20, vr20, vr20, \
                vr20, vr20, vr20, vr20, vr20
.rept 7
    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1
    VLD_DST_ADD_W16 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
.endr
    b .DCT_DCT_16x32_END
.NO_HAS_DCONLY_16x32:
    malloc_space 512+512
    addi.d t3, sp, 64
    la.local t8, eob_16x32
.DCT_DCT_EOB_16x32:
    ld.h t7, t8, 0
    addi.d t8, t8, 2
    vld_x16 a2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    vxor.v vr31, vr31, vr31
.irp i, 0, 64, 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960
    vst vr31, a2, \i
.endr
    li.w t0, 2896
    vreplgr2vr.w vr23, t0
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
        vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    rect2_lsx \i, vr23, \i
.endr
    inv_dct16_lsx .8h
    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
    LSX_TRANSPOSE8x8_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
                       vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
        vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    vsrari.h \i, \i, 1
.endr
    vst_x8 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vst_x8 t3, 512, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    addi.d a2, a2, 16
    addi.d t3, t3, 128
    bge a3, t7, .DCT_DCT_EOB_16x32
    la.local t8, eob_16x32
    vxor.v vr31, vr31, vr31
    ld.h t7, t8, 4
    bge a3, t7, .DCT_DCT_EOB_16x32_END
    vst_x8 sp, 64+384, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
    vst_x8 sp, 64+896, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
    ld.h t7, t8, 2
    bge a3, t7,
.DCT_DCT_EOB_16x32_END vst_x8 sp, 64+256, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 vst_x8 sp, 64+768, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 ld.h t7, t8, 0 bge a3, t7, .DCT_DCT_EOB_16x32_END vst_x8 sp, 64+128, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 vst_x8 sp, 64+512+128, 16 vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 .DCT_DCT_EOB_16x32_END: addi.d t7, sp, 64 .rept 2 vld_x16 t7, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 inv_dct16_lsx .8h vst_x16 t7, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 vld_x16 t7, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 dct_8x32_core_lsx t7, t7, 0, 256, 32, 0, 128, 256, 384, 16, , addi.d t7, t7, 512 .endr alsl.d t2, a1, a0, 1 addi.d t3, sp, 64 addi.d t5, sp, 512+64 VLD_DST_ADD_W16_H32 320 VLD_DST_ADD_W16_H32 448 VLD_DST_ADD_W16_H32 192 VLD_DST_ADD_W16_H32 0 free_space 512+512 .DCT_DCT_16x32_END: endfunc .macro xvmulev_xvmaddod_lasx in0, in1, in2, in3, out0, out1 xvmulwev.w.h \out0, \in0, \in2 xvmulwod.w.h \out1, \in0, \in2 xvmaddwev.w.h \out0, \in1, \in3 xvmaddwod.w.h \out1, \in1, \in3 .endm .macro xvsrari_h_x16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \ in11, in12, in13, in14, in15, out0, out1, out2, out3, \ out4, out5, out6, out7, out8, out9, out10, out11, out12, \ out13, out14, out15, shift xvsrari.h \out0, \in0, \shift xvsrari.h \out1, \in1, \shift xvsrari.h \out2, \in2, \shift xvsrari.h \out3, \in3, \shift xvsrari.h \out4, \in4, \shift xvsrari.h \out5, \in5, \shift xvsrari.h \out6, \in6, \shift xvsrari.h \out7, \in7, \shift xvsrari.h \out8, \in8, \shift xvsrari.h \out9, \in9, \shift xvsrari.h \out10, \in10, \shift xvsrari.h \out11, \in11, \shift xvsrari.h \out12, \in12, \shift xvsrari.h \out13, \in13, \shift xvsrari.h \out14, \in14, \shift xvsrari.h \out15, \in15, \shift .endm .macro xvpermi_q_x2 in0, in1, in2, in3, out0, out1, out2, out3, tmp0, tmp1 xvor.v \tmp0, \in0, \in0 xvor.v \tmp1, \in1, \in1 xvpermi.q \out0, \in2, 0x02 xvpermi.q \out1, \in3, 0x02 xvpermi.q \out2, \tmp0, 0x31 xvpermi.q \out3, \tmp1, 0x31 .endm .macro DST_ADD_W16_LASX in0, in1, in2, in3, in4, in5, in6, in7 vext2xv.hu.bu xr0, \in0 vext2xv.hu.bu xr1, \in1 vext2xv.hu.bu xr2, \in2 vext2xv.hu.bu xr3, \in3 xvadd.h xr0, xr0, \in4 xvadd.h xr1, xr1, \in5 xvadd.h xr2, xr2, \in6 xvadd.h xr3, xr3, \in7 xvssrani.bu.h xr1, xr0, 0 xvssrani.bu.h xr3, xr2, 0 xvpermi.d xr0, xr1, 0b11011000 xvpermi.d xr2, xr3, 0b11011000 xvpermi.d xr1, xr0, 0b00001110 xvpermi.d xr3, xr2, 0b00001110 vst vr0, a0, 0 vstx vr1, a0, a1 vst vr2, t2, 0 vstx vr3, t2, a1 .endm .macro XVLD_DST_ADD_W16 in0, in1, in2, in3 vld vr0, a0, 0 vldx vr1, a0, a1 vld vr2, t2, 0 vldx vr3, t2, a1 DST_ADD_W16_LASX xr0, xr1, xr2, xr3, \in0, \in1, \in2, \in3 .endm .macro inv_adst16_lasx la.local t0, iadst16_coeffs_h xvldrepl.h xr20, t0, 0 // 4091 xvldrepl.h xr21, t0, 2 // 201 xvmulev_xvmaddod_lasx xr15, xr0, xr20, xr21, xr16, xr18 xvneg.h xr20, xr20 xvmulev_xvmaddod_lasx xr15, xr0, xr21, xr20, xr17, xr19 xvilvl.w xr15, xr18, xr16 xvilvl.w xr0, xr19, xr17 xvilvh.w xr18, xr18, xr16 xvilvh.w xr19, xr19, xr17 xvssrarni.h.w xr18, xr15, 12 // t0 xvssrarni.h.w xr19, xr0, 12 // t1 xvldrepl.h xr20, t0, 4 // 3973 xvldrepl.h xr21, t0, 6 // 995 xvmulev_xvmaddod_lasx xr13, xr2, xr20, xr21, xr16, xr0 xvneg.h xr20, xr20 xvmulev_xvmaddod_lasx xr13, xr2, xr21, xr20, xr17, xr15 xvilvl.w xr13, xr0, xr16 xvilvl.w xr2, xr15, xr17 xvilvh.w xr0, xr0, xr16 xvilvh.w 
xr15, xr15, xr17 xvssrarni.h.w xr0, xr13, 12 // t2 xvssrarni.h.w xr15, xr2, 12 // t3 xvldrepl.h xr20, t0, 8 // 3703 xvldrepl.h xr21, t0, 10 // 1751 xvmulev_xvmaddod_lasx xr11, xr4, xr20, xr21, xr16, xr2 xvneg.h xr20, xr20 xvmulev_xvmaddod_lasx xr11, xr4, xr21, xr20, xr17, xr13 xvilvl.w xr11, xr2, xr16 xvilvl.w xr4, xr13, xr17 xvilvh.w xr2, xr2, xr16 xvilvh.w xr13, xr13, xr17 xvssrarni.h.w xr2, xr11, 12 // t4 xvssrarni.h.w xr13, xr4, 12 // t5 xvldrepl.h xr20, t0, 12 // 3290 -> 1645 xvldrepl.h xr21, t0, 14 // 2440 -> 1220 xvmulev_xvmaddod_lasx xr9, xr6, xr20, xr21, xr16, xr4 xvneg.h xr20, xr20 xvmulev_xvmaddod_lasx xr9, xr6, xr21, xr20, xr17, xr11 xvilvl.w xr9, xr4, xr16 xvilvl.w xr6, xr11, xr17 xvilvh.w xr4, xr4, xr16 xvilvh.w xr11, xr11, xr17 xvssrarni.h.w xr4, xr9, 12 // t6 xvssrarni.h.w xr11, xr6, 12 // t7 xvldrepl.h xr20, t0, 16 // 2751 xvldrepl.h xr21, t0, 18 // 3035 xvmulev_xvmaddod_lasx xr7, xr8, xr20, xr21, xr16, xr6 xvneg.h xr20, xr20 xvmulev_xvmaddod_lasx xr7, xr8, xr21, xr20, xr17, xr9 xvilvl.w xr7, xr6, xr16 xvilvl.w xr8, xr9, xr17 xvilvh.w xr6, xr6, xr16 xvilvh.w xr9, xr9, xr17 xvssrarni.h.w xr6, xr7, 12 // t8 xvssrarni.h.w xr9, xr8, 12 // t9 xvldrepl.h xr20, t0, 20 // 2106 xvldrepl.h xr21, t0, 22 // 3513 xvmulev_xvmaddod_lasx xr5, xr10, xr20, xr21, xr16, xr7 xvneg.h xr20, xr20 xvmulev_xvmaddod_lasx xr5, xr10, xr21, xr20, xr17, xr8 xvilvl.w xr5, xr7, xr16 xvilvl.w xr10, xr8, xr17 xvilvh.w xr7, xr7, xr16 xvilvh.w xr8, xr8, xr17 xvssrarni.h.w xr7, xr5, 12 // t10 xvssrarni.h.w xr8, xr10, 12 // t11 xvldrepl.h xr20, t0, 24 // 1380 xvldrepl.h xr21, t0, 26 // 3857 xvmulev_xvmaddod_lasx xr3, xr12, xr20, xr21, xr16, xr5 xvneg.h xr20, xr20 xvmulev_xvmaddod_lasx xr3, xr12, xr21, xr20, xr17, xr10 xvilvl.w xr3, xr5, xr16 xvilvl.w xr12, xr10, xr17 xvilvh.w xr5, xr5, xr16 xvilvh.w xr10, xr10, xr17 xvssrarni.h.w xr5, xr3, 12 // t12 xvssrarni.h.w xr10, xr12, 12 // t13 xvldrepl.h xr20, t0, 28 // 601 xvldrepl.h xr21, t0, 30 // 4052 xvmulev_xvmaddod_lasx xr1, xr14, xr20, xr21, xr16, xr3 xvneg.h xr20, xr20 xvmulev_xvmaddod_lasx xr1, xr14, xr21, xr20, xr17, xr12 xvilvl.w xr1, xr3, xr16 xvilvl.w xr14, xr12, xr17 xvilvh.w xr3, xr3, xr16 xvilvh.w xr12, xr12, xr17 xvssrarni.h.w xr3, xr1, 12 // t14 xvssrarni.h.w xr12, xr14, 12 // t15 xvsadd.h xr1, xr18, xr6 // t0a xvssub.h xr14, xr18, xr6 // t8a xvsadd.h xr16, xr19, xr9 // t1a xvssub.h xr17, xr19, xr9 // t9a xvsadd.h xr6, xr0, xr7 // t2a xvssub.h xr18, xr0, xr7 // t10a xvsadd.h xr9, xr15, xr8 // t3a xvssub.h xr19, xr15, xr8 // t11a xvsadd.h xr0, xr2, xr5 // t4a xvssub.h xr7, xr2, xr5 // t12a xvsadd.h xr8, xr13, xr10 // t5a xvssub.h xr15, xr13, xr10 // t13a xvsadd.h xr2, xr4, xr3 // t6a xvssub.h xr5, xr4, xr3 // t14a xvsadd.h xr10, xr11, xr12 // t7a xvssub.h xr13, xr11, xr12 // t15a la.local t0, idct_coeffs_h xvldrepl.h xr20, t0, 8 // 799 xvldrepl.h xr21, t0, 10 // 4017 xvmulev_xvmaddod_lasx xr14, xr17, xr21, xr20, xr3, xr11 xvneg.h xr21, xr21 xvmulev_xvmaddod_lasx xr14, xr17, xr20, xr21, xr4, xr12 xvilvl.w xr14, xr11, xr3 xvilvl.w xr17, xr12, xr4 xvilvh.w xr11, xr11, xr3 xvilvh.w xr12, xr12, xr4 xvssrarni.h.w xr11, xr14, 12 // t8 xvssrarni.h.w xr12, xr17, 12 // t9 xvneg.h xr21, xr21 xvmulev_xvmaddod_lasx xr15, xr7, xr20, xr21, xr3, xr14 xvneg.h xr20, xr20 xvmulev_xvmaddod_lasx xr15, xr7, xr21, xr20, xr4, xr17 xvilvl.w xr15, xr14, xr3 xvilvl.w xr7, xr17, xr4 xvilvh.w xr14, xr14, xr3 xvilvh.w xr17, xr17, xr4 xvssrarni.h.w xr14, xr15, 12 // t13 xvssrarni.h.w xr17, xr7, 12 // t12 xvldrepl.h xr20, t0, 12 // 3406 xvldrepl.h xr21, t0, 14 // 2276 
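    // {3406, 2276} rotations: t10a/t11a (xr18/xr19) -> t10/t11 (xr7/xr15),
    // t14a/t15a (xr5/xr13) -> t14/t15 (xr19/xr18)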
xvmulev_xvmaddod_lasx xr18, xr19, xr21, xr20, xr3, xr7 xvneg.h xr21, xr21 xvmulev_xvmaddod_lasx xr18, xr19, xr20, xr21, xr4, xr15 xvilvl.w xr18, xr7, xr3 xvilvl.w xr19, xr15, xr4 xvilvh.w xr7, xr7, xr3 xvilvh.w xr15, xr15, xr4 xvssrarni.h.w xr7, xr18, 12 // t10 xvssrarni.h.w xr15, xr19, 12 // t11 xvneg.h xr21, xr21 xvmulev_xvmaddod_lasx xr13, xr5, xr20, xr21, xr3, xr18 xvneg.h xr20, xr20 xvmulev_xvmaddod_lasx xr13, xr5, xr21, xr20, xr4, xr19 xvilvl.w xr13, xr18, xr3 xvilvl.w xr5, xr19, xr4 xvilvh.w xr18, xr18, xr3 xvilvh.w xr19, xr19, xr4 xvssrarni.h.w xr18, xr13, 12 // t15 xvssrarni.h.w xr19, xr5, 12 // t14 xvsadd.h xr5, xr1, xr0 // t0 xvssub.h xr13, xr1, xr0 // t4 xvsadd.h xr3, xr16, xr8 // t1 xvssub.h xr4, xr16, xr8 // t5 xvsadd.h xr0, xr6, xr2 // t2 xvssub.h xr1, xr6, xr2 // t6 xvsadd.h xr8, xr9, xr10 // t3 xvssub.h xr16, xr9, xr10 // t7 xvsadd.h xr2, xr11, xr17 // t8a xvssub.h xr6, xr11, xr17 // t12a xvsadd.h xr9, xr12, xr14 // t9a xvssub.h xr10, xr12, xr14 // t13a xvsadd.h xr11, xr7, xr19 // t10a xvssub.h xr17, xr7, xr19 // t14a xvsadd.h xr12, xr15, xr18 // t11a xvssub.h xr14, xr15, xr18 // t15a la.local t0, idct_coeffs_h xvldrepl.h xr20, t0, 4 // 1567 xvldrepl.h xr21, t0, 6 // 3784 xvmulev_xvmaddod_lasx xr13, xr4, xr21, xr20, xr7, xr18 xvneg.h xr21, xr21 xvmulev_xvmaddod_lasx xr13, xr4, xr20, xr21, xr15, xr19 xvilvl.w xr13, xr18, xr7 xvilvl.w xr4, xr19, xr15 xvilvh.w xr18, xr18, xr7 xvilvh.w xr19, xr19, xr15 xvssrarni.h.w xr18, xr13, 12 // t4a xvssrarni.h.w xr19, xr4, 12 // t5a xvneg.h xr21, xr21 xvmulev_xvmaddod_lasx xr16, xr1, xr20, xr21, xr7, xr4 xvneg.h xr20, xr20 xvmulev_xvmaddod_lasx xr16, xr1, xr21, xr20, xr15, xr13 xvilvl.w xr16, xr4, xr7 xvilvl.w xr1, xr13, xr15 xvilvh.w xr4, xr4, xr7 xvilvh.w xr13, xr13, xr15 xvssrarni.h.w xr4, xr16, 12 // t7a xvssrarni.h.w xr13, xr1, 12 // t6a xvneg.h xr20, xr20 xvmulev_xvmaddod_lasx xr6, xr10, xr21, xr20, xr7, xr1 xvneg.h xr21, xr21 xvmulev_xvmaddod_lasx xr6, xr10, xr20, xr21, xr15, xr16 xvilvl.w xr6, xr1, xr7 xvilvl.w xr10, xr16, xr15 xvilvh.w xr1, xr1, xr7 xvilvh.w xr16, xr16, xr15 xvssrarni.h.w xr1, xr6, 12 // t12 xvssrarni.h.w xr16, xr10, 12 // t13 xvneg.h xr21, xr21 xvmulev_xvmaddod_lasx xr14, xr17, xr20, xr21, xr7, xr6 xvneg.h xr20, xr20 xvmulev_xvmaddod_lasx xr14, xr17, xr21, xr20, xr15, xr10 xvilvl.w xr14, xr6, xr7 xvilvl.w xr17, xr10, xr15 xvilvh.w xr6, xr6, xr7 xvilvh.w xr10, xr10, xr15 xvssrarni.h.w xr6, xr14, 12 // t15 xvssrarni.h.w xr10, xr17, 12 // t14 xvsadd.h xr14, xr5, xr0 // out[0] xvssub.h xr17, xr5, xr0 // t2a xvssub.h xr7, xr3, xr8 // t3a xvsadd.h xr15, xr3, xr8 // out[15] xvsllwil.w.h xr22, xr15, 0 xvexth.w.h xr15, xr15 xvneg.w xr22, xr22 xvneg.w xr15, xr15 xvssrarni.h.w xr15, xr22, 0 // out[15] xvssub.h xr7, xr3, xr8 // t3a xvsadd.h xr3, xr19, xr4 // out[12] xvssub.h xr8, xr19, xr4 // t7 xvssub.h xr0, xr18, xr13 // t6 xvsadd.h xr5, xr18, xr13 // out[3] xvsllwil.w.h xr22, xr5, 0 xvexth.w.h xr5, xr5 xvneg.w xr22, xr22 xvneg.w xr5, xr5 xvssrarni.h.w xr5, xr22, 0 // out[3] xvsadd.h xr13, xr9, xr12 // out[14] xvssub.h xr19, xr9, xr12 // t11 xvssub.h xr4, xr2, xr11 // t10 xvsadd.h xr18, xr2, xr11 // out[1] xvsllwil.w.h xr22, xr18, 0 xvexth.w.h xr18, xr18 xvneg.w xr22, xr22 xvneg.w xr18, xr18 xvssrarni.h.w xr18, xr22, 0 // out[1] xvsadd.h xr2, xr1, xr10 // out[2] xvssub.h xr11, xr1, xr10 // t14a xvssub.h xr12, xr16, xr6 // t15a xvsadd.h xr9, xr16, xr6 // out[13] xvsllwil.w.h xr22, xr9, 0 xvexth.w.h xr9, xr9 xvneg.w xr22, xr22 xvneg.w xr9, xr9 xvssrarni.h.w xr9, xr22, 0 // out[13] xvldrepl.h xr20, t0, 0 // 2896 
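    // final {2896, 2896} rotations -> out[4..11]; out[5], out[7], out[9] and
    // out[11] are negated below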
xvmulev_xvmaddod_lasx xr17, xr7, xr20, xr20, xr6, xr10 xvneg.h xr21, xr20 xvmulev_xvmaddod_lasx xr17, xr7, xr20, xr21, xr16, xr1 xvilvl.w xr17, xr10, xr6 xvilvl.w xr7, xr1, xr16 xvilvh.w xr10, xr10, xr6 xvilvh.w xr1, xr1, xr16 xvssrarni.h.w xr1, xr7, 12 // out[8] xvsrari.w xr17, xr17, 12 xvsrari.w xr10, xr10, 12 xvneg.w xr17, xr17 xvneg.w xr10, xr10 xvssrarni.h.w xr10, xr17, 0 // out[7] xvmulev_xvmaddod_lasx xr0, xr8, xr20, xr21, xr16, xr17 xvmulev_xvmaddod_lasx xr0, xr8, xr20, xr20, xr6, xr7 xvilvl.w xr0, xr17, xr16 xvilvl.w xr8, xr7, xr6 xvilvh.w xr17, xr17, xr16 xvilvh.w xr7, xr7, xr6 xvssrarni.h.w xr7, xr8, 12 // out[4] xvsrari.w xr0, xr0, 12 xvsrari.w xr17, xr17, 12 xvneg.w xr0, xr0 xvneg.w xr17, xr17 xvssrarni.h.w xr17, xr0, 0 // out[11] xvmulev_xvmaddod_lasx xr4, xr19, xr20, xr21, xr16, xr0 xvmulev_xvmaddod_lasx xr4, xr19, xr20, xr20, xr6, xr8 xvilvl.w xr4, xr0, xr16 xvilvl.w xr19, xr8, xr6 xvilvh.w xr0, xr0, xr16 xvilvh.w xr8, xr8, xr6 xvssrarni.h.w xr8, xr19, 12 // out[6] xvsrari.w xr4, xr4, 12 xvsrari.w xr0, xr0, 12 xvneg.w xr4, xr4 xvneg.w xr0, xr0 xvssrarni.h.w xr0, xr4, 0 // out[9] xvmulev_xvmaddod_lasx xr11, xr12, xr20, xr20, xr6, xr4 xvmulev_xvmaddod_lasx xr11, xr12, xr20, xr21, xr16, xr19 xvilvl.w xr11, xr4, xr6 xvilvl.w xr12, xr19, xr16 xvilvh.w xr4, xr4, xr6 xvilvh.w xr19, xr19, xr16 xvssrarni.h.w xr19, xr12, 12 // out[10] xvsrari.w xr11, xr11, 12 xvsrari.w xr4, xr4, 12 xvneg.w xr11, xr11 xvneg.w xr4, xr4 xvssrarni.h.w xr4, xr11, 0 // out[5] .endm function inv_txfm_add_adst_adst_16x16_8bpc_lasx PUSH_REG xvld_x16 a2, 0, 32, xr0, xr1, xr2, xr3, xr4, xr5, xr6, xr7, \ xr8, xr9, xr10, xr11, xr12, xr13, xr14, xr15 inv_adst16_lasx LASX_TRANSPOSE8x8_H xr14, xr18, xr2, xr5, xr7, xr4, xr8, xr10, \ xr14, xr18, xr2, xr5, xr7, xr28, xr6, xr10, \ xr20, xr21, xr22, xr23, xr24, xr25, xr26, xr27 LASX_TRANSPOSE8x8_H xr1, xr0, xr19, xr17, xr3, xr9, xr13, xr15, \ xr29, xr30, xr11, xr17, xr31, xr19, xr16, xr15, \ xr20, xr21, xr22, xr23, xr24, xr25, xr26, xr27 xvsrari_h_x16 xr14, xr18, xr2, xr5, xr7, xr28, xr6, xr10, \ xr29, xr30, xr11, xr17, xr31, xr19, xr16, xr15, \ xr0, xr1, xr2, xr3, xr4, xr5, xr6, xr7, \ xr8, xr9, xr10, xr11, xr12, xr13, xr14, xr15, 2 xvpermi_q_x2 xr0, xr1, xr8, xr9, xr0, xr1, xr8, xr9, xr20, xr21 xvpermi_q_x2 xr2, xr3, xr10, xr11, xr2, xr3, xr10, xr11, xr20, xr21 xvpermi_q_x2 xr4, xr5, xr12, xr13, xr4, xr5, xr12, xr13, xr20, xr21 xvpermi_q_x2 xr6, xr7, xr14, xr15, xr6, xr7, xr14, xr15, xr20, xr21 inv_adst16_lasx xvsrari_h_x16 xr14, xr18, xr2, xr5, xr7, xr4, xr8, xr10, \ xr1, xr0, xr19, xr17, xr3, xr9, xr13, xr15, \ xr14, xr18, xr11, xr5, xr7, xr4, xr8, xr10, \ xr12, xr16, xr19, xr17, xr20, xr9, xr13, xr15, 4 xvxor.v xr23, xr23, xr23 .irp i, 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480 xvst xr23, a2, \i .endr alsl.d t2, a1, a0, 1 XVLD_DST_ADD_W16 xr14, xr18, xr11, xr5 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 XVLD_DST_ADD_W16 xr7, xr4, xr8, xr10 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 XVLD_DST_ADD_W16 xr12, xr16, xr19, xr17 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 XVLD_DST_ADD_W16 xr20, xr9, xr13, xr15 POP_REG endfunc
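// Reference sketch (not part of the build, names are illustrative): the
// DC-only fast path taken by the functions above when the eob argument (a3)
// is zero, i.e. roughly what the idct_dc_w32/idct_dc_w64 macros (and the
// analogous idct_dc used for the narrower widths) compute together with the
// DST_ADD_W* macros, written out in plain C with 8bpc clipping.
//
//   #include <stdint.h>
//   #include <stddef.h>
//
//   static uint8_t clip8(int v) { return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; }
//
//   static void dc_only_add(uint8_t *dst, ptrdiff_t stride, int16_t *coeff,
//                           int w, int h, int shift)
//   {
//       int dc = coeff[0];
//       coeff[0] = 0;
//       dc = (dc * 181 + 128) >> 8;                  // vmul.w + vsrari.w 8
//       if (2 * w == h || 2 * h == w)                // rectangular blocks
//           dc = (dc * 181 + 128) >> 8;
//       if (shift > 0)
//           dc = (dc + (1 << (shift - 1))) >> shift; // vsrari.w shift
//       dc = (dc * 181 + 128 + 2048) >> 12;          // vmadd.w + vssrarni.h.w 12
//       for (int y = 0; y < h; y++, dst += stride)
//           for (int x = 0; x < w; x++)
//               dst[x] = clip8(dst[x] + dc);         // vadd.h + vssrani.bu.h
//   }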