/*
 * Copyright © 2024, VideoLAN and dav1d authors
 * Copyright © 2024, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/loongarch/loongson_asm.S"

// static int cdef_find_dir_lsx(const pixel *img, const ptrdiff_t stride,
//                              unsigned *const var HIGHBD_DECL_SUFFIX)
// param: img: a0, stride: a1, var: a2
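// A rough C model of what this routine computes (a sketch, not dav1d's
// actual code; assumes 8bpc so the >> bitdepth_min_8 step vanishes). The
// vector registers named in the comments below hold these partial sums:
//
//     static int cdef_find_dir_ref(const uint8_t *img, ptrdiff_t stride,
//                                  unsigned *var)
//     {
//         int hv[2][8] = {{0}}, diag[2][15] = {{0}}, alt[4][11] = {{0}};
//         for (int y = 0; y < 8; y++, img += stride)
//             for (int x = 0; x < 8; x++) {
//                 const int px = img[x] - 128;
//                 diag[0][y + x]            += px;
//                 alt [0][y + (x >> 1)]     += px;
//                 hv  [0][y]                += px;
//                 alt [1][3 + y - (x >> 1)] += px;
//                 diag[1][7 + y - x]        += px;
//                 alt [2][3 - (y >> 1) + x] += px;
//                 hv  [1][x]                += px;
//                 alt [3][(y >> 1) + x]     += px;
//             }
//         // ... cost accumulation and selection follow; see the sketches
//         // further down
//     }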
function cdef_find_dir_8bpc_lsx
    addi.d sp, sp, -64
    fst.d f24, sp, 0
    fst.d f25, sp, 8
    fst.d f26, sp, 16
    fst.d f27, sp, 24
    fst.d f28, sp, 32
    fst.d f29, sp, 40
    fst.d f30, sp, 48
    fst.d f31, sp, 56
    li.d a3, 128
    vreplgr2vr.w vr31, a3
    // hv: vr0-vr3  diag: vr4-vr11  alt: vr12-vr23
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr8, vr9, vr10, \
     vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, vr19, \
     vr20, vr21, vr22, vr23
    vxor.v \i, \i, \i
.endr
.CFDL01: // 8
    // 0
    fld.d f24, a0, 0 //img
    vpermi.w vr25, vr24, 0x01
    vsllwil.hu.bu vr24, vr24, 0
    vsllwil.wu.hu vr24, vr24, 0
    vsllwil.hu.bu vr25, vr25, 0
    vsllwil.wu.hu vr25, vr25, 0
    vsub.w vr24, vr24, vr31 //px
    vsub.w vr25, vr25, vr31
    vadd.w vr4, vr4, vr24 //diag[0][y+x]
    vadd.w vr5, vr5, vr25
    vpackev.w vr26, vr25, vr24
    vpackod.w vr27, vr25, vr24
    vpermi.w vr26, vr26, 0xd8 //px0246
    vpermi.w vr27, vr27, 0xd8 //px1357
    vadd.w vr12, vr12, vr26
    vadd.w vr12, vr12, vr27 //alt[0][y+(x>>1)]
    vhaddw.d.w vr28, vr24, vr24
    vhaddw.q.d vr28, vr28, vr28
    vpickve2gr.d a3, vr28, 0
    vhaddw.d.w vr28, vr25, vr25
    vhaddw.q.d vr28, vr28, vr28
    vpickve2gr.d a4, vr28, 0
    add.d a3, a3, a4
    vinsgr2vr.w vr0, a3, 0 //hv[0][y]
    vadd.w vr15, vr15, vr26
    vadd.w vr15, vr15, vr27 //alt[1][3+y-(x>>1)]
    vpermi.w vr15, vr15, 0x1b
    vadd.w vr9, vr9, vr24
    vadd.w vr8, vr8, vr25
    vpermi.w vr8, vr8, 0x1b
    vpermi.w vr9, vr9, 0x1b //diag[1][7+y-x]
    vxor.v vr28, vr28, vr28
    vxor.v vr29, vr29, vr29
    vadd.w vr28, vr28, vr24
    vadd.w vr29, vr29, vr25
    vextrins.w vr18, vr28, 0x30
    vshuf4i.w vr19, vr28, 0x39
    vextrins.w vr19, vr29, 0x30
    vshuf4i.w vr20, vr29, 0x39 //alt[2][3-(y>>1)+x]
    vinsgr2vr.w vr20, zero, 3
    vadd.w vr2, vr2, vr24
    vadd.w vr3, vr3, vr25 //hv[1][x]
    vadd.w vr21, vr21, vr24
    vadd.w vr22, vr22, vr25 //alt[3][(y>>1)+x]
    add.d a0, a0, a1
    // 1
    fld.d f24, a0, 0 //img
    vpermi.w vr25, vr24, 0x01
    vsllwil.hu.bu vr24, vr24, 0
    vsllwil.wu.hu vr24, vr24, 0
    vsllwil.hu.bu vr25, vr25, 0
    vsllwil.wu.hu vr25, vr25, 0
    vsub.w vr24, vr24, vr31 //px
    vsub.w vr25, vr25, vr31
    vbsrl.v vr28, vr4, 4 //1-4
    vbsrl.v vr29, vr5, 4 //5-8
    vextrins.w vr28, vr5, 0x30
    vadd.w vr28, vr28, vr24 //diag[0][y+x]
    vadd.w vr29, vr29, vr25
    vbsll.v vr5, vr29, 4
    vextrins.w vr5, vr28, 0x03
    vextrins.w vr6, vr29, 0x03
    vextrins.w vr28, vr4, 0x30
    vshuf4i.w vr4, vr28, 0x93
    vbsrl.v vr28, vr12, 4
    vextrins.w vr28, vr13, 0x30
    vpackev.w vr26, vr25, vr24
    vpackod.w vr27, vr25, vr24
    vpermi.w vr26, vr26, 0xd8 //px0246
    vpermi.w vr27, vr27, 0xd8 //px1357
    vadd.w vr28, vr28, vr26
    vadd.w vr28, vr28, vr27 //alt[0][y+(x>>1)]
    vextrins.w vr13, vr28, 0x03
    vextrins.w vr28, vr12, 0x30
    vshuf4i.w vr12, vr28, 0x93
    vhaddw.d.w vr28, vr24, vr24
    vhaddw.q.d vr28, vr28, vr28
    vpickve2gr.d a3, vr28, 0
    vhaddw.d.w vr28, vr25, vr25
    vhaddw.q.d vr28, vr28, vr28
    vpickve2gr.d a4, vr28, 0
    add.d a3, a3, a4
    vinsgr2vr.w vr0, a3, 1 //hv[0][y]
    vbsrl.v vr28, vr15, 4
    vextrins.w vr28, vr16, 0x30
    vpermi.w vr28, vr28, 0x1b
    vadd.w vr28, vr28, vr26
    vadd.w vr28, vr28, vr27 //alt[1][3+y-(x>>1)]
    vextrins.w vr16, vr28, 0x00
    vextrins.w vr28, vr15, 0x00
    vshuf4i.w vr15, vr28, 0x6c
    vbsrl.v vr28, vr8, 4 //4321
    vbsrl.v vr29, vr9, 4 //8765
    vextrins.w vr28, vr9, 0x30
    vpermi.w vr28, vr28, 0x1b
    vpermi.w vr29, vr29, 0x1b
    vadd.w vr29, vr29, vr24
    vadd.w vr28, vr28, vr25 //diag[1][7+y-x]
    vextrins.w vr10, vr29, 0x00
    vextrins.w vr29, vr28, 0x00
    vshuf4i.w vr9, vr29, 0x6c
    vextrins.w vr28, vr8, 0x00
    vshuf4i.w vr8, vr28, 0x6c
    vbsll.v vr28, vr19, 4
    vextrins.w vr28, vr18, 0x03
    vbsll.v vr29, vr20, 4
    vextrins.w vr29, vr19, 0x03
    vadd.w vr28, vr28, vr24
    vadd.w vr29, vr29, vr25 //alt[2][3-(y>>1)+x]
    vextrins.w vr18, vr28, 0x30
    vextrins.w vr28, vr29, 0x00
    vshuf4i.w vr19, vr28, 0x39
    vbsrl.v vr20, vr29, 4
    vadd.w vr2, vr2, vr24
    vadd.w vr3, vr3, vr25 //hv[1][x]
    vadd.w vr21, vr21, vr24
    vadd.w vr22, vr22, vr25 //alt[3][(y>>1)+x]
    add.d a0, a0, a1
    // 2
    fld.d f24, a0, 0 //img
    vpermi.w vr25, vr24, 0x01
    vsllwil.hu.bu vr24, vr24, 0
    vsllwil.wu.hu vr24, vr24, 0
    vsllwil.hu.bu vr25, vr25, 0
    vsllwil.wu.hu vr25, vr25, 0
    vsub.w vr24, vr24, vr31 //px
    vsub.w vr25, vr25, vr31
    vbsrl.v vr28, vr4, 8
    vbsrl.v vr29, vr5, 8
    vextrins.d vr28, vr5, 0x10 //2-5
    vextrins.d vr29, vr6, 0x10 //6-9
    vadd.w vr28, vr28, vr24 //diag[0][y+x]
    vadd.w vr29, vr29, vr25
    vextrins.d vr4, vr28, 0x10
    vextrins.d vr5, vr28, 0x01
    vextrins.d vr5, vr29, 0x10
    vextrins.d vr6, vr29, 0x01
    vbsrl.v vr28, vr12, 8
    vextrins.d vr28, vr13, 0x10
    vpackev.w vr26, vr25, vr24
    vpackod.w vr27, vr25, vr24
    vpermi.w vr26, vr26, 0xd8 //px0246
    vpermi.w vr27, vr27, 0xd8 //px1357
    vadd.w vr28, vr28, vr26
    vadd.w vr28, vr28, vr27 //alt[0][y+(x>>1)]
    vextrins.d vr12, vr28, 0x10
    vextrins.d vr13, vr28, 0x01
    vhaddw.d.w vr28, vr24, vr24
    vhaddw.q.d vr28, vr28, vr28
    vpickve2gr.d a3, vr28, 0
    vhaddw.d.w vr28, vr25, vr25
    vhaddw.q.d vr28, vr28, vr28
    vpickve2gr.d a4, vr28, 0
    add.d a3, a3, a4
    vinsgr2vr.w vr0, a3, 2 //hv[0][y]
    vbsrl.v vr28, vr15, 8
    vextrins.d vr28, vr16, 0x10
    vpermi.w vr28, vr28, 0x1b
    vadd.w vr28, vr28, vr26
    vadd.w vr28, vr28, vr27 //alt[1][3+y-(x>>1)]
    vpermi.w vr28, vr28, 0x1b
    vextrins.d vr15, vr28, 0x10
    vextrins.d vr16, vr28, 0x01
    vbsrl.v vr28, vr8, 8
    vextrins.d vr28, vr9, 0x10
    vbsrl.v vr29, vr9, 8
    vextrins.d vr29, vr10, 0x10
    vpermi.w vr28, vr28, 0x1b //5432
    vpermi.w vr29, vr29, 0x1b //9876
    vadd.w vr29, vr29, vr24
    vadd.w vr28, vr28, vr25
    vpermi.w vr28, vr28, 0x1b
    vpermi.w vr29, vr29, 0x1b
    vextrins.d vr8, vr28, 0x10
    vextrins.d vr9, vr28, 0x01
    vextrins.d vr9, vr29, 0x10
    vextrins.d vr10, vr29, 0x01 //diag[1][7+y-x]
    vbsrl.v vr28, vr18, 8
    vextrins.d vr28, vr19, 0x10 //2345
    vbsrl.v vr29, vr19, 8
    vextrins.d vr29, vr20, 0x10 //6789
    vadd.w vr28, vr28, vr24
    vadd.w vr29, vr29, vr25
    vextrins.d vr18, vr28, 0x10
    vextrins.d vr19, vr28, 0x01
    vextrins.d vr19, vr29, 0x10
    vextrins.d vr20, vr29, 0x01 //alt[2][3-(y>>1)+x]
    vadd.w vr2, vr2, vr24
    vadd.w vr3, vr3, vr25 //hv[1][x]
    vbsrl.v vr28, vr21, 4
    vextrins.w vr28, vr22, 0x30 //1234
    vbsrl.v vr29, vr22, 4 //5678
    vadd.w vr28, vr28, vr24
    vadd.w vr29, vr29, vr25 //alt[3][(y>>1)+x]
    vextrins.w vr23, vr29, 0x03
    vextrins.w vr29, vr28, 0x33
    vshuf4i.w vr22, vr29, 0x93
    vextrins.w vr28, vr21, 0x30
    vshuf4i.w vr21, vr28, 0x93
    add.d a0, a0, a1
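    // Rows 1-7 repeat the same eight accumulations. Because the diag/alt
    // array indices shift with y, each row first realigns the accumulator
    // vectors (vbsrl.v/vbsll.v byte shifts plus vextrins.w/vshuf4i.w lane
    // moves) so the lanes being updated correspond to px again, then adds
    // exactly as in row 0.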
    // 3
    fld.d f24, a0, 0 //img
    vpermi.w vr25, vr24, 0x01
    vsllwil.hu.bu vr24, vr24, 0
    vsllwil.wu.hu vr24, vr24, 0
    vsllwil.hu.bu vr25, vr25, 0
    vsllwil.wu.hu vr25, vr25, 0
    vsub.w vr24, vr24, vr31 //px
    vsub.w vr25, vr25, vr31
    vbsll.v vr28, vr5, 4
    vextrins.w vr28, vr4, 0x03 //3456
    vbsll.v vr29, vr6, 4
    vextrins.w vr29, vr5, 0x03 //78910
    vadd.w vr28, vr28, vr24 //diag[0][y+x]
    vadd.w vr29, vr29, vr25
    vextrins.w vr4, vr28, 0x30
    vextrins.w vr28, vr29, 0x00
    vshuf4i.w vr5, vr28, 0x39
    vbsrl.v vr6, vr29, 4
    vbsll.v vr28, vr13, 4
    vextrins.w vr28, vr12, 0x03
    vpackev.w vr26, vr25, vr24
    vpackod.w vr27, vr25, vr24
    vpermi.w vr26, vr26, 0xd8 //px0246
    vpermi.w vr27, vr27, 0xd8 //px1357
    vadd.w vr28, vr28, vr26
    vadd.w vr28, vr28, vr27 //alt[0][y+(x>>1)]
    vextrins.w vr12, vr28, 0x30
    vbsrl.v vr13, vr28, 4
    vhaddw.d.w vr28, vr24, vr24
    vhaddw.q.d vr28, vr28, vr28
    vpickve2gr.d a3, vr28, 0
    vhaddw.d.w vr28, vr25, vr25
    vhaddw.q.d vr28, vr28, vr28
    vpickve2gr.d a4, vr28, 0
    add.d a3, a3, a4
    vinsgr2vr.w vr0, a3, 3 //hv[0][y]
    vbsll.v vr28, vr16, 4
    vextrins.w vr28, vr15, 0x03
    vpermi.w vr28, vr28, 0x1b //6543
    vadd.w vr28, vr28, vr26
    vadd.w vr28, vr28, vr27 //alt[1][3+y-(x>>1)]
    vextrins.w vr15, vr28, 0x33
    vshuf4i.w vr16, vr28, 0xc6
    vinsgr2vr.w vr16, zero, 3
    vbsll.v vr28, vr9, 4
    vextrins.w vr28, vr8, 0x03 //3456
    vbsll.v vr29, vr10, 4
    vextrins.w vr29, vr9, 0x03 //78910
    vpermi.w vr28, vr28, 0x1b //6543
    vpermi.w vr29, vr29, 0x1b //10987
    vadd.w vr29, vr29, vr24
    vadd.w vr28, vr28, vr25 //diag[1][7+y-x]
    vextrins.w vr8, vr28, 0x33
    vextrins.w vr28, vr29, 0x33
    vshuf4i.w vr9, vr28, 0xc6
    vshuf4i.w vr10, vr29, 0xc6
    vinsgr2vr.w vr10, zero, 3
    vbsrl.v vr28, vr18, 8
    vextrins.d vr28, vr19, 0x10 //2345
    vbsrl.v vr29, vr19, 8
    vextrins.d vr29, vr20, 0x10 //6789
    vadd.w vr28, vr28, vr24
    vadd.w vr29, vr29, vr25
    vextrins.d vr18, vr28, 0x10
    vextrins.d vr19, vr28, 0x01
    vextrins.d vr19, vr29, 0x10
    vextrins.d vr20, vr29, 0x01 //alt[2][3-(y>>1)+x]
    vadd.w vr2, vr2, vr24
    vadd.w vr3, vr3, vr25 //hv[1][x]
    vbsrl.v vr28, vr21, 4
    vextrins.w vr28, vr22, 0x30 //1234
    vbsrl.v vr29, vr22, 4 //5678
    vextrins.w vr29, vr23, 0x30
    vadd.w vr28, vr28, vr24
    vadd.w vr29, vr29, vr25 //alt[3][(y>>1)+x]
    vextrins.w vr23, vr29, 0x03
    vextrins.w vr29, vr28, 0x33
    vshuf4i.w vr22, vr29, 0x93
    vextrins.w vr28, vr21, 0x30
    vshuf4i.w vr21, vr28, 0x93
    add.d a0, a0, a1
    // 4
    fld.d f24, a0, 0 //img
    vpermi.w vr25, vr24, 0x01
    vsllwil.hu.bu vr24, vr24, 0
    vsllwil.wu.hu vr24, vr24, 0
    vsllwil.hu.bu vr25, vr25, 0
    vsllwil.wu.hu vr25, vr25, 0
    vsub.w vr24, vr24, vr31 //px
    vsub.w vr25, vr25, vr31
    vadd.w vr5, vr5, vr24 //diag[0][y+x]
    vadd.w vr6, vr6, vr25
    vpackev.w vr26, vr25, vr24
    vpackod.w vr27, vr25, vr24
    vpermi.w vr26, vr26, 0xd8 //px0246
    vpermi.w vr27, vr27, 0xd8 //px1357
    vadd.w vr13, vr13, vr26
    vadd.w vr13, vr13, vr27 //alt[0][y+(x>>1)]
    vhaddw.d.w vr28, vr24, vr24
    vhaddw.q.d vr28, vr28, vr28
    vpickve2gr.d a3, vr28, 0
    vhaddw.d.w vr28, vr25, vr25
    vhaddw.q.d vr28, vr28, vr28
    vpickve2gr.d a4, vr28, 0
    add.d a3, a3, a4
    vinsgr2vr.w vr1, a3, 0 //hv[0][y]
    vpermi.w vr16, vr16, 0x1b
    vadd.w vr16, vr16, vr26
    vadd.w vr16, vr16, vr27 //alt[1][3+y-(x>>1)]
    vpermi.w vr16, vr16, 0x1b
    vpermi.w vr9, vr9, 0x1b
    vpermi.w vr10, vr10, 0x1b
    vadd.w vr10, vr10, vr24
    vadd.w vr9, vr9, vr25
    vpermi.w vr9, vr9, 0x1b
    vpermi.w vr10, vr10, 0x1b //diag[1][7+y-x]
    vbsrl.v vr28, vr18, 4
    vextrins.w vr28, vr19, 0x30 //1234
    vbsrl.v vr29, vr19, 4
    vextrins.w vr29, vr20, 0x30 //5678
    vadd.w vr28, vr28, vr24
    vadd.w vr29, vr29, vr25 //alt[2][3-(y>>1)+x]
    vextrins.w vr20, vr29, 0x03
    vextrins.w vr29, vr28, 0x33
    vshuf4i.w vr19, vr29, 0x93
    vbsll.v vr18, vr28, 4
    vadd.w vr2, vr2, vr24
    vadd.w vr3, vr3, vr25 //hv[1][x]
    vbsrl.v vr28, vr21, 8
    vextrins.d vr28, vr22, 0x10
    vbsrl.v vr29, vr22, 8
    vextrins.d vr29, vr23, 0x10
    vadd.w vr28, vr28, vr24
    vadd.w vr29, vr29, vr25
    vextrins.d vr21, vr28, 0x10
    vextrins.d vr22, vr28, 0x01
    vextrins.d vr22, vr29, 0x10
    vextrins.d vr23, vr29, 0x01 //alt[3][(y>>1)+x]
    add.d a0, a0, a1
    // 5
    fld.d f24, a0, 0 //img
    vpermi.w vr25, vr24, 0x01
    vsllwil.hu.bu vr24, vr24, 0
    vsllwil.wu.hu vr24, vr24, 0
    vsllwil.hu.bu vr25, vr25, 0
    vsllwil.wu.hu vr25, vr25, 0
    vsub.w vr24, vr24, vr31 //px
    vsub.w vr25, vr25, vr31
    vbsrl.v vr28, vr5, 4 //5-8
    vbsrl.v vr29, vr6, 4 //9-12
    vextrins.w vr28, vr6, 0x30
    vadd.w vr28, vr28, vr24 //diag[0][y+x]
    vadd.w vr29, vr29, vr25
    vextrins.w vr7, vr29, 0x03
    vextrins.w vr29, vr28, 0x33
    vshuf4i.w vr6, vr29, 0x93
    vextrins.w vr28, vr5, 0x30
    vshuf4i.w vr5, vr28, 0x93
    vbsrl.v vr28, vr13, 4
    vextrins.w vr28, vr14, 0x30
    vpackev.w vr26, vr25, vr24
    vpackod.w vr27, vr25, vr24
    vpermi.w vr26, vr26, 0xd8 //px0246
    vpermi.w vr27, vr27, 0xd8 //px1357
    vadd.w vr28, vr28, vr26
    vadd.w vr28, vr28, vr27 //alt[0][y+(x>>1)]
    vextrins.w vr14, vr28, 0x03
    vextrins.w vr28, vr13, 0x30
    vshuf4i.w vr13, vr28, 0x93
    vhaddw.d.w vr28, vr24, vr24
    vhaddw.q.d vr28, vr28, vr28
    vpickve2gr.d a3, vr28, 0
    vhaddw.d.w vr28, vr25, vr25
    vhaddw.q.d vr28, vr28, vr28
    vpickve2gr.d a4, vr28, 0
    add.d a3, a3, a4
    vinsgr2vr.w vr1, a3, 1 //hv[0][y]
    vbsrl.v vr28, vr16, 4
    vextrins.w vr28, vr17, 0x30
    vpermi.w vr28, vr28, 0x1b
    vadd.w vr28, vr28, vr26
    vadd.w vr28, vr28, vr27 //alt[1][3+y-(x>>1)]
    vextrins.w vr17, vr28, 0x00
    vextrins.w vr28, vr16, 0x00
    vshuf4i.w vr16, vr28, 0x6c
    vbsrl.v vr28, vr9, 4
    vbsrl.v vr29, vr10, 4
    vextrins.w vr28, vr10, 0x30
    vpermi.w vr28, vr28, 0x1b //8-5
    vpermi.w vr29, vr29, 0x1b //12-9
    vadd.w vr29, vr29, vr24
    vadd.w vr28, vr28, vr25 //diag[1][7+y-x]
    vextrins.w vr11, vr29, 0x00
    vextrins.w vr29, vr28, 0x00
    vshuf4i.w vr10, vr29, 0x6c
    vextrins.w vr28, vr9, 0x00
    vshuf4i.w vr9, vr28, 0x6c
    vbsrl.v vr28, vr18, 4
    vextrins.w vr28, vr19, 0x30 //1234
    vbsrl.v vr29, vr19, 4
    vextrins.w vr29, vr20, 0x30 //5678
    vadd.w vr28, vr28, vr24
    vadd.w vr29, vr29, vr25 //alt[2][3-(y>>1)+x]
    vextrins.w vr20, vr29, 0x03
    vextrins.w vr29, vr28, 0x33
    vshuf4i.w vr19, vr29, 0x93
    vbsll.v vr18, vr28, 4
    vadd.w vr2, vr2, vr24
    vadd.w vr3, vr3, vr25 //hv[1][x]
    vbsrl.v vr28, vr21, 8
    vextrins.d vr28, vr22, 0x10
    vbsrl.v vr29, vr22, 8
    vextrins.d vr29, vr23, 0x10
    vadd.w vr28, vr28, vr24
    vadd.w vr29, vr29, vr25
    vextrins.d vr21, vr28, 0x10
    vextrins.d vr22, vr28, 0x01
    vextrins.d vr22, vr29, 0x10
    vextrins.d vr23, vr29, 0x01 //alt[3][(y>>1)+x]
    add.d a0, a0, a1
    // 6
    fld.d f24, a0, 0 //img
    vpermi.w vr25, vr24, 0x01
    vsllwil.hu.bu vr24, vr24, 0
    vsllwil.wu.hu vr24, vr24, 0
    vsllwil.hu.bu vr25, vr25, 0
    vsllwil.wu.hu vr25, vr25, 0
    vsub.w vr24, vr24, vr31 //px
    vsub.w vr25, vr25, vr31
    vbsrl.v vr28, vr5, 8
    vbsrl.v vr29, vr6, 8
    vextrins.d vr28, vr6, 0x10 //6-9
    vextrins.d vr29, vr7, 0x10 //10-13
    vadd.w vr28, vr28, vr24 //diag[0][y+x]
    vadd.w vr29, vr29, vr25
    vextrins.d vr5, vr28, 0x10
    vextrins.d vr6, vr28, 0x01
    vextrins.d vr6, vr29, 0x10
    vextrins.d vr7, vr29, 0x01
    vbsrl.v vr28, vr13, 8
    vextrins.d vr28, vr14, 0x10
    vpackev.w vr26, vr25, vr24
    vpackod.w vr27, vr25, vr24
    vpermi.w vr26, vr26, 0xd8 //px0246
    vpermi.w vr27, vr27, 0xd8 //px1357
    vadd.w vr28, vr28, vr26
    vadd.w vr28, vr28, vr27 //alt[0][y+(x>>1)]
    vextrins.d vr13, vr28, 0x10
    vextrins.d vr14, vr28, 0x01
    vhaddw.d.w vr28, vr24, vr24
    vhaddw.q.d vr28, vr28, vr28
    vpickve2gr.d a3, vr28, 0
    vhaddw.d.w vr28, vr25, vr25
    vhaddw.q.d vr28, vr28, vr28
    vpickve2gr.d a4, vr28, 0
    add.d a3, a3, a4
    vinsgr2vr.w vr1, a3, 2 //hv[0][y]
    vbsrl.v vr28, vr16, 8
    vextrins.d vr28, vr17, 0x10
    vpermi.w vr28, vr28, 0x1b
    vadd.w vr28, vr28, vr26
    vadd.w vr28, vr28, vr27 //alt[1][3+y-(x>>1)]
    vpermi.w vr28, vr28, 0x1b
    vextrins.d vr16, vr28, 0x10
    vextrins.d vr17, vr28, 0x01
    vbsrl.v vr28, vr9, 8
    vextrins.d vr28, vr10, 0x10
    vbsrl.v vr29, vr10, 8
    vextrins.d vr29, vr11, 0x10
    vpermi.w vr28, vr28, 0x1b //9876
    vpermi.w vr29, vr29, 0x1b //13-10
    vadd.w vr29, vr29, vr24
    vadd.w vr28, vr28, vr25
    vpermi.w vr28, vr28, 0x1b
    vpermi.w vr29, vr29, 0x1b
    vextrins.d vr9, vr28, 0x10
    vextrins.d vr10, vr28, 0x01
    vextrins.d vr10, vr29, 0x10
    vextrins.d vr11, vr29, 0x01 //diag[1][7+y-x]
    vadd.w vr18, vr18, vr24 //0123
    vadd.w vr19, vr19, vr25 //4567 alt[2][3-(y>>1)+x]
    vadd.w vr2, vr2, vr24
    vadd.w vr3, vr3, vr25 //hv[1][x]
    vbsll.v vr28, vr22, 4
    vextrins.w vr28, vr21, 0x03 //3456
    vbsll.v vr29, vr23, 4
    vextrins.w vr29, vr22, 0x03 //78910
    vadd.w vr28, vr28, vr24
    vadd.w vr29, vr29, vr25 //alt[3][(y>>1)+x]
    vextrins.w vr21, vr28, 0x30
    vextrins.w vr28, vr29, 0x00
    vshuf4i.w vr22, vr28, 0x39
    vbsrl.v vr23, vr29, 4
    add.d a0, a0, a1
    // 7
    fld.d f24, a0, 0 //img
    vpermi.w vr25, vr24, 0x01
    vsllwil.hu.bu vr24, vr24, 0
    vsllwil.wu.hu vr24, vr24, 0
    vsllwil.hu.bu vr25, vr25, 0
    vsllwil.wu.hu vr25, vr25, 0
    vsub.w vr24, vr24, vr31 //px
    vsub.w vr25, vr25, vr31
    vbsll.v vr28, vr6, 4
    vextrins.w vr28, vr5, 0x03 //78910
    vbsll.v vr29, vr7, 4
    vextrins.w vr29, vr6, 0x03 //11-14
    vadd.w vr28, vr28, vr24 //diag[0][y+x]
    vadd.w vr29, vr29, vr25
    vextrins.w vr5, vr28, 0x30
    vextrins.w vr28, vr29, 0x00
    vshuf4i.w vr6, vr28, 0x39
    vbsrl.v vr7, vr29, 4
    vbsll.v vr28, vr14, 4
    vextrins.w vr28, vr13, 0x03
    vpackev.w vr26, vr25, vr24
    vpackod.w vr27, vr25, vr24
    vpermi.w vr26, vr26, 0xd8 //px0246
    vpermi.w vr27, vr27, 0xd8 //px1357
    vadd.w vr28, vr28, vr26
    vadd.w vr28, vr28, vr27 //alt[0][y+(x>>1)]
    vextrins.w vr13, vr28, 0x30
    vbsrl.v vr14, vr28, 4
    vhaddw.d.w vr28, vr24, vr24
    vhaddw.q.d vr28, vr28, vr28
    vpickve2gr.d a3, vr28, 0
    vhaddw.d.w vr28, vr25, vr25
    vhaddw.q.d vr28, vr28, vr28
    vpickve2gr.d a4, vr28, 0
    add.d a3, a3, a4
    vinsgr2vr.w vr1, a3, 3 //hv[0][y]
    vbsll.v vr28, vr17, 4
    vextrins.w vr28, vr16, 0x03
    vpermi.w vr28, vr28, 0x1b //10987
    vadd.w vr28, vr28, vr26
    vadd.w vr28, vr28, vr27 //alt[1][3+y-(x>>1)]
    vextrins.w vr16, vr28, 0x33
    vshuf4i.w vr17, vr28, 0xc6
    vinsgr2vr.w vr17, zero, 3
    vbsll.v vr28, vr10, 4
    vextrins.w vr28, vr9, 0x03 //7-10
    vbsll.v vr29, vr11, 4
    vextrins.w vr29, vr10, 0x03 //11-14
    vpermi.w vr28, vr28, 0x1b //10-7
    vpermi.w vr29, vr29, 0x1b //14-11
    vadd.w vr29, vr29, vr24
    vadd.w vr28, vr28, vr25 //diag[1][7+y-x]
    vextrins.w vr9, vr28, 0x33
    vextrins.w vr28, vr29, 0x33
    vshuf4i.w vr10, vr28, 0xc6
    vshuf4i.w vr11, vr29, 0xc6
    vinsgr2vr.w vr11, zero, 3
    vadd.w vr18, vr18, vr24 //0123
    vadd.w vr19, vr19, vr25 //4567 alt[2][3-(y>>1)+x]
    vadd.w vr2, vr2, vr24
    vadd.w vr3, vr3, vr25 //hv[1][x]
    vbsll.v vr28, vr22, 4
    vextrins.w vr28, vr21, 0x03 //3456
    vbsll.v vr29, vr23, 4
    vextrins.w vr29, vr22, 0x03 //78910
    vadd.w vr28, vr28, vr24
    vadd.w vr29, vr29, vr25 //alt[3][(y>>1)+x]
    vextrins.w vr21, vr28, 0x30
    vextrins.w vr28, vr29, 0x00
    vshuf4i.w vr22, vr28, 0x39
    vbsrl.v vr23, vr29, 4
    add.d a0, a0, a1
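    // Cost accumulation, sketched in C against the partial sums above (a
    // sketch of the reference algorithm, not dav1d's code; cost[0..3] end up
    // in vr24 and cost[4..7] in vr25 below):
    //
    //     unsigned cost[8] = { 0 };
    //     static const uint16_t div_table[7] =
    //         { 840, 420, 280, 210, 168, 140, 120 };
    //     for (int n = 0; n < 8; n++) {
    //         cost[2] += hv[0][n] * hv[0][n];
    //         cost[6] += hv[1][n] * hv[1][n];
    //     }
    //     cost[2] *= 105;
    //     cost[6] *= 105;
    //     for (int n = 0; n < 7; n++) {
    //         const int d = div_table[n];
    //         cost[0] += (diag[0][n] * diag[0][n] +
    //                     diag[0][14 - n] * diag[0][14 - n]) * d;
    //         cost[4] += (diag[1][n] * diag[1][n] +
    //                     diag[1][14 - n] * diag[1][14 - n]) * d;
    //     }
    //     cost[0] += diag[0][7] * diag[0][7] * 105;
    //     cost[4] += diag[1][7] * diag[1][7] * 105;
    //     for (int n = 0; n < 4; n++) {
    //         unsigned *const cost_ptr = &cost[n * 2 + 1];
    //         for (int m = 0; m < 5; m++)
    //             *cost_ptr += alt[n][3 + m] * alt[n][3 + m];
    //         *cost_ptr *= 105;
    //         for (int m = 0; m < 3; m++) {
    //             const int d = div_table[2 * m + 1];
    //             *cost_ptr += (alt[n][m] * alt[n][m] +
    //                           alt[n][10 - m] * alt[n][10 - m]) * d;
    //         }
    //     }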
    vxor.v vr24, vr24, vr24 //unsigned cost[8]
    vxor.v vr25, vr25, vr25
    vmul.w vr26, vr0, vr0
    vmul.w vr27, vr1, vr1
    vhaddw.d.w vr28, vr26, vr26
    vhaddw.q.d vr28, vr28, vr28
    vpickve2gr.d a3, vr28, 0
    vhaddw.d.w vr28, vr27, vr27
    vhaddw.q.d vr28, vr28, vr28
    vpickve2gr.d a4, vr28, 0
    add.d a3, a3, a4
    vmul.w vr26, vr2, vr2
    vmul.w vr27, vr3, vr3
    vhaddw.d.w vr28, vr26, vr26
    vhaddw.q.d vr28, vr28, vr28
    vpickve2gr.d a4, vr28, 0
    vhaddw.d.w vr28, vr27, vr27
    vhaddw.q.d vr28, vr28, vr28
    vpickve2gr.d a5, vr28, 0
    add.d a4, a4, a5
    li.d a6, 105
    mul.w a3, a3, a6
    mul.w a4, a4, a6
    vinsgr2vr.w vr24, a3, 2
    vinsgr2vr.w vr25, a4, 2
    vxor.v vr30, vr30, vr30 //div_table
    vxor.v vr31, vr31, vr31
    li.d t0, 840
    vinsgr2vr.w vr30, t0, 0
    li.d t0, 420
    vinsgr2vr.w vr30, t0, 1
    li.d t0, 280
    vinsgr2vr.w vr30, t0, 2
    li.d t0, 210
    vinsgr2vr.w vr30, t0, 3
    li.d t0, 168
    vinsgr2vr.w vr31, t0, 0
    li.d t0, 140
    vinsgr2vr.w vr31, t0, 1
    li.d t0, 120
    vinsgr2vr.w vr31, t0, 2
    vbsll.v vr27, vr7, 4
    vextrins.w vr27, vr6, 0x03
    vpermi.w vr27, vr27, 0x1b
    vmul.w vr26, vr4, vr4
    vmadd.w vr26, vr27, vr27
    vmul.w vr26, vr26, vr30
    vhaddw.d.w vr28, vr26, vr26
    vhaddw.q.d vr28, vr28, vr28
    vpickve2gr.d a3, vr28, 0
    vbsll.v vr27, vr6, 4
    vpermi.w vr27, vr27, 0x1b
    vmul.w vr26, vr5, vr5
    vmadd.w vr26, vr27, vr27
    vmul.w vr26, vr26, vr31
    vextrins.w vr26, vr31, 0x33
    vhaddw.d.w vr28, vr26, vr26
    vhaddw.q.d vr28, vr28, vr28
    vpickve2gr.d a4, vr28, 0
    add.d a3, a3, a4 //cost[0]
    vbsll.v vr27, vr11, 4
    vextrins.w vr27, vr10, 0x03
    vpermi.w vr27, vr27, 0x1b
    vmul.w vr26, vr8, vr8
    vmadd.w vr26, vr27, vr27
    vmul.w vr26, vr26, vr30
    vhaddw.d.w vr28, vr26, vr26
    vhaddw.q.d vr28, vr28, vr28
    vpickve2gr.d a4, vr28, 0
    vbsll.v vr27, vr10, 4
    vpermi.w vr27, vr27, 0x1b
    vmul.w vr26, vr9, vr9
    vmadd.w vr26, vr27, vr27
    vmul.w vr26, vr26, vr31
    vextrins.w vr26, vr31, 0x33
    vhaddw.d.w vr28, vr26, vr26
    vhaddw.q.d vr28, vr28, vr28
    vpickve2gr.d a5, vr28, 0
    add.d a4, a4, a5 //cost[4]
    vpickve2gr.w a5, vr5, 3
    mul.w a5, a5, a5
    mul.w a5, a5, a6
    add.w a3, a3, a5
    vinsgr2vr.w vr24, a3, 0
    vpickve2gr.w a5, vr9, 3
    mul.w a5, a5, a5
    mul.w a5, a5, a6
    add.w a4, a4, a5
    vinsgr2vr.w vr25, a4, 0 //n=0
    vpickve2gr.w a3, vr24, 1
    vmul.w vr26, vr13, vr13
    vhaddw.d.w vr28, vr26, vr26
    vhaddw.q.d vr28, vr28, vr28
    vpickve2gr.d a4, vr28, 0
    vpickve2gr.w a5, vr12, 3
    mul.w a5, a5, a5
    add.d a3, a3, a4
    add.d a3, a3, a5
    mul.w a3, a3, a6 //*cost_ptr
    vextrins.w vr29, vr30, 0x01
    vextrins.w vr29, vr30, 0x13
    vextrins.w vr29, vr31, 0x21
    vextrins.w vr29, vr31, 0x33
    vbsll.v vr27, vr14, 4
    vpermi.w vr27, vr27, 0x1b
    vmul.w vr28, vr12, vr12
    vextrins.w vr28, vr31, 0x33
    vmadd.w vr28, vr27, vr27
    vmul.w vr26, vr28, vr29
    vhaddw.d.w vr28, vr26, vr26
    vhaddw.q.d vr28, vr28, vr28
    vpickve2gr.d a4, vr28, 0
    add.d a3, a3, a4
    vinsgr2vr.w vr24, a3, 1 //n=1
    vpickve2gr.w a3, vr24, 3
    vmul.w vr26, vr16, vr16
    vhaddw.d.w vr28, vr26, vr26
    vhaddw.q.d vr28, vr28, vr28
    vpickve2gr.d a4, vr28, 0
    vpickve2gr.w a5, vr15, 3
    mul.w a5, a5, a5
    add.d a3, a3, a4
    add.d a3, a3, a5
    mul.w a3, a3, a6 //*cost_ptr
    vbsll.v vr27, vr17, 4
    vpermi.w vr27, vr27, 0x1b
    vmul.w vr28, vr15, vr15
    vextrins.w vr28, vr31, 0x33
    vmadd.w vr28, vr27, vr27
    vmul.w vr26, vr28, vr29
    vhaddw.d.w vr28, vr26, vr26
    vhaddw.q.d vr28, vr28, vr28
    vpickve2gr.d a4, vr28, 0
    add.d a3, a3, a4
    vinsgr2vr.w vr24, a3, 3 //n=2
    vpickve2gr.w a3, vr25, 1
    vmul.w vr26, vr19, vr19
    vhaddw.d.w vr28, vr26, vr26
    vhaddw.q.d vr28, vr28, vr28
    vpickve2gr.d a4, vr28, 0
    vpickve2gr.w a5, vr18, 3
    mul.w a5, a5, a5
    add.d a3, a3, a4
    add.d a3, a3, a5
    mul.w a3, a3, a6 //*cost_ptr
    vbsll.v vr27, vr20, 4
    vpermi.w vr27, vr27, 0x1b
    vmul.w vr28, vr18, vr18
    vextrins.w vr28, vr31, 0x33
    vmadd.w vr28, vr27, vr27
    vmul.w vr26, vr28, vr29
    vhaddw.d.w vr28, vr26, vr26
    vhaddw.q.d vr28, vr28, vr28
    vpickve2gr.d a4, vr28, 0
    add.d a3, a3, a4
    vinsgr2vr.w vr25, a3, 1 //n=3
    vpickve2gr.w a3, vr25, 3
    vmul.w vr26, vr22, vr22
    vhaddw.d.w vr28, vr26, vr26
    vhaddw.q.d vr28, vr28, vr28
    vpickve2gr.d a4, vr28, 0
    vpickve2gr.w a5, vr21, 3
    mul.w a5, a5, a5
    add.d a3, a3, a4
    add.d a3, a3, a5
    mul.w a3, a3, a6 //*cost_ptr
    vbsll.v vr27, vr23, 4
    vpermi.w vr27, vr27, 0x1b
    vmul.w vr28, vr21, vr21
    vextrins.w vr28, vr31, 0x33
    vmadd.w vr28, vr27, vr27
    vmul.w vr26, vr28, vr29
    vhaddw.d.w vr28, vr26, vr26
    vhaddw.q.d vr28, vr28, vr28
    vpickve2gr.d a4, vr28, 0
    add.d a3, a3, a4
    vinsgr2vr.w vr25, a3, 3
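    // Final selection, sketched in C (the direction with the largest cost
    // wins; *var reports the contrast against the orthogonal direction):
    //
    //     int best_dir = 0;
    //     unsigned best_cost = cost[0];
    //     for (int n = 1; n < 8; n++)
    //         if (cost[n] > best_cost) {
    //             best_cost = cost[n];
    //             best_dir = n;
    //         }
    //     *var = (best_cost - cost[best_dir ^ 4]) >> 10;
    //     return best_dir;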
    xor a3, a3, a3 //best_dir
    vpickve2gr.w a4, vr24, 0 //best_cost
.BSETDIR01:
    vpickve2gr.w a5, vr24, 1
    bge a4, a5, .BSETDIR02
    or a4, a5, a5
    ori a3, zero, 1
.BSETDIR02:
    vpickve2gr.w a5, vr24, 2
    bge a4, a5, .BSETDIR03
    or a4, a5, a5
    ori a3, zero, 2
.BSETDIR03:
    vpickve2gr.w a5, vr24, 3
    bge a4, a5, .BSETDIR04
    or a4, a5, a5
    ori a3, zero, 3
.BSETDIR04:
    vpickve2gr.w a5, vr25, 0
    bge a4, a5, .BSETDIR05
    or a4, a5, a5
    ori a3, zero, 4
.BSETDIR05:
    vpickve2gr.w a5, vr25, 1
    bge a4, a5, .BSETDIR06
    or a4, a5, a5
    ori a3, zero, 5
.BSETDIR06:
    vpickve2gr.w a5, vr25, 2
    bge a4, a5, .BSETDIR07
    or a4, a5, a5
    ori a3, zero, 6
.BSETDIR07:
    vpickve2gr.w a5, vr25, 3
    bge a4, a5, .BSETDIREND
    or a4, a5, a5
    ori a3, zero, 7
.BSETDIREND:
    xori a5, a3, 4
    li.d a1, 4
    bge a5, a1, .GETCOST01
    vreplve.w vr26, vr24, a5
    b .GETCOST02
.GETCOST01:
    vreplve.w vr26, vr25, a5
.GETCOST02:
    vpickve2gr.w a5, vr26, 0
    sub.w a5, a4, a5
    srai.d a5, a5, 10
    st.w a5, a2, 0
    or a0, a3, a3
    fld.d f24, sp, 0
    fld.d f25, sp, 8
    fld.d f26, sp, 16
    fld.d f27, sp, 24
    fld.d f28, sp, 32
    fld.d f29, sp, 40
    fld.d f30, sp, 48
    fld.d f31, sp, 56
    addi.d sp, sp, 64
endfunc

.macro cdef_fill tmp, stride, w, h
    beqz \h, 700f //h
    or t0, zero, zero //y
100:
    or t1, zero, zero //xx
    srai.d s6, \w, 3 //x
    beqz s6, 300f
200:
    vstx vr18, \tmp, t1
    addi.d t1, t1, 16
    addi.d s6, s6, -1
    bnez s6, 200b
300:
    andi s6, \w, 4
    beqz s6, 400f
    fstx.d f18, \tmp, t1
    addi.d t1, t1, 8
400:
    andi s6, \w, 2
    beqz s6, 500f
    fstx.s f18, \tmp, t1
    addi.d t1, t1, 4
500:
    andi s6, \w, 1
    beqz s6, 600f
    li.w s6, -16384
    stx.h s6, \tmp, t1
    addi.d t1, t1, 2
600:
    add.d \tmp, \tmp, \stride
    add.d \tmp, \tmp, \stride
    addi.d t0, t0, 1
    blt t0, \h, 100b
700:
.endm

const dav1d_cdef_directions
    .byte  1 * 12 + 0,  2 * 12 + 0
    .byte  1 * 12 + 0,  2 * 12 - 1
    .byte -1 * 12 + 1, -2 * 12 + 2
    .byte  0 * 12 + 1, -1 * 12 + 2
    .byte  0 * 12 + 1,  0 * 12 + 2
    .byte  0 * 12 + 1,  1 * 12 + 2
    .byte  1 * 12 + 1,  2 * 12 + 2
    .byte  1 * 12 + 0,  2 * 12 + 1
    .byte  1 * 12 + 0,  2 * 12 + 0
    .byte  1 * 12 + 0,  2 * 12 - 1
    .byte -1 * 12 + 1, -2 * 12 + 2
    .byte  0 * 12 + 1, -1 * 12 + 2
endconst

.macro constrain_vrh in0, in1, in2, tmp0, tmp1, out
    vabsd.h \tmp0, \in0, vr23 //adiff
    vsra.h \tmp1, \tmp0, \in2
    vsub.h \tmp1, \in1, \tmp1
    vmax.h \tmp1, vr23, \tmp1 //imax
    vmin.h \tmp0, \tmp0, \tmp1 //imin
    //apply_sign
    vslt.h \tmp1, \in0, vr23
    vandn.v \in0, \tmp1, \tmp0
    vsigncov.h \tmp0, \tmp1, \tmp0
    vor.v \out, \in0, \tmp0
.endm
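// constrain_vrh vectorizes dav1d's scalar constrain(); roughly (C sketch,
// vr23 is the all-zero register, imin/imax/apply_sign are dav1d's scalar
// helpers):
//
//     static inline int constrain(const int diff, const int threshold,
//                                 const int shift)
//     {
//         const int adiff = abs(diff);
//         return apply_sign(imin(adiff,
//                                imax(0, threshold - (adiff >> shift))),
//                           diff);
//     }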
.macro iclip_vrh in0, in1, in2, tmp0, tmp1, out
    vmin.h \tmp0, \in2, \in0
    vslt.h \in0, \in0, \in1
    vand.v \tmp1, \in0, \in1
    vandn.v \tmp0, \in0, \tmp0
    vor.v \out, \tmp1, \tmp0
.endm

.macro cdef_padding_data
    // y < 0
    beqz t7, 90f
4:
    or t4, t5, t5 //data index xx
    slli.d t0, t4, 1
    mul.w t2, t7, s5
    slli.d t2, t2, 1
    add.d t2, s4, t2
    sub.d t3, t6, t5 //loop param x
    srai.d t3, t3, 3
    add.d t3, t3, t5
    beq t5, t3, 6f
5:  // /8
    fldx.d f18, a3, t4
    vsllwil.hu.bu vr18, vr18, 0
    vstx vr18, t2, t0
    addi.d t0, t0, 16
    addi.d t4, t4, 8
    addi.d t3, t3, -1
    bne t5, t3, 5b
6:  // &4
    sub.d t1, t6, t5
    andi t1, t1, 4
    beqz t1, 7f
    fldx.s f18, a3, t4
    vsllwil.hu.bu vr18, vr18, 0
    fstx.d f18, t2, t0
    addi.d t0, t0, 8
    addi.d t4, t4, 4
7:  // &2
    sub.d t1, t6, t5
    andi t1, t1, 2
    beqz t1, 9f
    ldx.bu t1, a3, t4
    stx.h t1, t2, t0
    addi.d t0, t0, 2
    addi.d t4, t4, 1
    ldx.bu t1, a3, t4
    stx.h t1, t2, t0
    addi.d t0, t0, 2
    addi.d t4, t4, 1
9:
    add.d a3, a3, a1
    addi.d t7, t7, 1
    bnez t7, 4b
90: // y < h
    beqz s1, 12f
    beqz t5, 12f
    or t7, zero, zero //y
10:
    or t4, t5, t5 //data index x
11:
    slli.d t3, t7, 1
    addi.d t3, t3, 2
    add.d t3, t3, t4
    ldx.bu t1, a2, t3
    mul.w t3, t7, s5
    add.d t3, t3, t4
    slli.d t3, t3, 1
    stx.h t1, s4, t3
    addi.d t4, t4, 1
    bnez t4, 11b
    addi.d t7, t7, 1
    bne t7, s1, 10b
12: // y = 0 ; y < h
    or s0, s4, s4
    beqz s1, 20f
    or s6, a0, a0
    or t7, zero, zero //y
    srai.d t4, t6, 3 //loop max
13:
    or t0, zero, zero //loop param
    or t3, t0, t0 //data index src
    or t1, t0, t0 //data index tmp
    beqz t4, 16f
15: // /8
    fldx.d f18, s6, t3
    vsllwil.hu.bu vr18, vr18, 0
    vstx vr18, s0, t1
    addi.d t3, t3, 8
    addi.d t1, t1, 16
    addi.d t0, t0, 1
    blt t0, t4, 15b
16: // &4
    andi t0, t6, 4
    beqz t0, 17f
    fldx.s f18, s6, t3
    vsllwil.hu.bu vr18, vr18, 0
    fstx.d f18, s0, t1
    addi.d t3, t3, 4
    addi.d t1, t1, 8
17: // &2
    andi t0, t6, 2
    beqz t0, 19f
    ldx.bu t2, s6, t3
    stx.h t2, s0, t1
    addi.d t3, t3, 1
    addi.d t1, t1, 2
    ldx.bu t2, s6, t3
    stx.h t2, s0, t1
    addi.d t3, t3, 1
    addi.d t1, t1, 2
19: // src+ tmp+
    add.d s6, s6, a1
    add.d s0, s0, s5
    add.d s0, s0, s5
    addi.d t7, t7, 1
    blt t7, s1, 13b
    // y = h ; y < y_end
20:
    beq s1, t8, 27f
    or t7, s1, s1 //y
    sub.d t4, t6, t5
    srai.d t4, t4, 3
    add.d t4, t4, t5 //8 loop max
21:
    or t0, t5, t5 //xx
    or t3, t0, t0 //data index bottom
    slli.d t1, t0, 1 //data index tmp
    beq t5, t4, 23f
22: // /8
    fldx.d f18, a4, t3
    vsllwil.hu.bu vr18, vr18, 0
    vstx vr18, s0, t1
    addi.d t3, t3, 8
    addi.d t1, t1, 16
    addi.d t0, t0, 1
    blt t0, t4, 22b
23: // &4
    sub.d t0, t6, t5
    andi t0, t0, 4
    beqz t0, 24f
    fldx.s f18, a4, t3
    vsllwil.hu.bu vr18, vr18, 0
    fstx.d f18, s0, t1
    addi.d t3, t3, 4
    addi.d t1, t1, 8
24: // &2
    sub.d t0, t6, t5
    andi t0, t0, 2
    beqz t0, 26f
    ldx.bu t2, a4, t3
    stx.h t2, s0, t1
    addi.d t3, t3, 1
    addi.d t1, t1, 2
    ldx.bu t2, a4, t3
    stx.h t2, s0, t1
    addi.d t3, t3, 1
    addi.d t1, t1, 2
26: // bottom+ tmp+
    add.d a4, a4, a1
    add.d s0, s0, s5
    add.d s0, s0, s5
    addi.d t7, t7, 1
    blt t7, t8, 21b
27: // padding end
.endm
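// The macro above mirrors the reference padding step, roughly (a hedged C
// sketch; tmp is the int16_t staging buffer, src aliases dst since CDEF
// filters in place, and lanes outside the available edges were pre-filled
// with -16384 by cdef_fill before this runs):
//
//     for (int y = y_start; y < 0; y++, top += src_stride)
//         for (int x = x_start; x < x_end; x++)
//             tmp[x + y * tmp_stride] = top[x];
//     for (int y = 0; y < h; y++)
//         for (int x = x_start; x < 0; x++)
//             tmp[x + y * tmp_stride] = left[y][2 + x];
//     for (int y = 0; y < h; y++, src += src_stride)
//         for (int x = 0; x < x_end; x++)
//             tmp[x + y * tmp_stride] = src[x];
//     for (int y = h; y < y_end; y++, bottom += src_stride)
//         for (int x = x_start; x < x_end; x++)
//             tmp[x + y * tmp_stride] = bottom[x];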
.macro cdef_pri_sec_init
    clz.w t3, a6
    sub.w t3, t2, t3
    sub.w t3, s7, t3 //sec_shift
    vreplgr2vr.h vr4, t0 //pri_tap_k
    vreplgr2vr.h vr9, a5 //pri_strength
    vreplgr2vr.h vr10, t1 //pri_shift
    vreplgr2vr.h vr18, a6 //sec_strength
    vreplgr2vr.h vr19, t3 //sec_shift
    or t2, s1, s1 //dowhile loop param
    addi.d s1, a7, 2
    slli.d s1, s1, 1 //directions dir+2
    addi.d s2, a7, 4
    slli.d s2, s2, 1 //directions dir+4
    slli.d s3, a7, 1 //directions dir+0
    la.local t0, dav1d_cdef_directions
    add.d s1, t0, s1
    ld.b a2, s1, 0 //off01
    ld.b a3, s1, 1 //off11
    add.d s2, t0, s2
    ld.b s1, s2, 0 //off02
    ld.b s2, s2, 1 //off12
    add.d s3, t0, s3
    ld.b t0, s3, 0 //off03
    ld.b s3, s3, 1 //off13
    slli.d a2, a2, 1
    slli.d a3, a3, 1
    slli.d s1, s1, 1
    slli.d s2, s2, 1
    slli.d t0, t0, 1
    slli.d s3, s3, 1
.endm

.macro cdef_pri_init
    vreplgr2vr.h vr4, t0 //pri_tap_k
    vreplgr2vr.h vr9, a5 //pri_strength
    vreplgr2vr.h vr10, t1 //pri_shift
    or t2, s1, s1 //dowhile loop param
    addi.d s1, a7, 2
    slli.d s1, s1, 1 //directions dir+2
    la.local t0, dav1d_cdef_directions
    add.d s1, t0, s1
    ld.b a2, s1, 0 //off01
    ld.b a3, s1, 1 //off11
    slli.d a2, a2, 1
    slli.d a3, a3, 1
.endm

.macro cdef_sec_init
    clz.w t3, a6
    li.w t2, 31
    sub.w t3, t2, t3
    sub.w t3, s7, t3 //sec_shift
    vreplgr2vr.h vr18, a6 //sec_strength
    vreplgr2vr.h vr19, t3 //sec_shift
    or t2, s1, s1 //dowhile loop param
    addi.d s2, a7, 4
    slli.d s2, s2, 1 //directions dir+4
    slli.d s3, a7, 1 //directions dir+0
    la.local t0, dav1d_cdef_directions
    add.d s1, t0, s1
    add.d s2, t0, s2
    ld.b s1, s2, 0 //off02
    ld.b s2, s2, 1 //off12
    add.d s3, t0, s3
    ld.b t0, s3, 0 //off03
    ld.b s3, s3, 1 //off13
    slli.d s1, s1, 1
    slli.d s2, s2, 1
    slli.d t0, t0, 1
    slli.d s3, s3, 1
.endm

.macro cdef_process_data_w8 in0, in1
    vsub.h vr11, vr5, vr0
    vsub.h vr12, vr6, vr0
    vsub.h vr13, vr7, vr0
    vsub.h vr14, vr8, vr0
    constrain_vrh vr11, \in0, \in1, vr16, vr17, vr11
    constrain_vrh vr12, \in0, \in1, vr16, vr17, vr12
    constrain_vrh vr13, \in0, \in1, vr16, vr17, vr13
    constrain_vrh vr14, \in0, \in1, vr16, vr17, vr14
.endm

.macro cdef_process_data_w4 in0, in1
    vpermi.w vr6, vr5, 0x44
    vpermi.w vr8, vr7, 0x44
    vsub.h vr12, vr6, vr0
    vsub.h vr14, vr8, vr0
    constrain_vrh vr12, \in0, \in1, vr16, vr17, vr12
    constrain_vrh vr14, \in0, \in1, vr16, vr17, vr14
.endm

.macro cdef_calc_sum_tapchange_w8
    vmul.h vr1, vr15, vr11 //sum
    vmadd.h vr1, vr15, vr12 //sum
    vand.v vr15, vr15, vr21
    vor.v vr15, vr15, vr22
    vmadd.h vr1, vr15, vr13 //sum
    vmadd.h vr1, vr15, vr14 //sum
.endm

.macro cdef_calc_sum_tapchange_w4
    vmul.h vr1, vr15, vr12 //sum
    vand.v vr15, vr15, vr21
    vor.v vr15, vr15, vr22
    vmadd.h vr1, vr15, vr14 //sum
.endm

.macro cdef_calc_sum_no_tapchange_w4 in0
    vmadd.h vr1, \in0, vr12
    vmadd.h vr1, \in0, vr14
.endm

.macro cdef_calc_sum_no_tapchange_w8 in0
    vmadd.h vr1, \in0, vr11 //sum
    vmadd.h vr1, \in0, vr12
    vmadd.h vr1, \in0, vr13
    vmadd.h vr1, \in0, vr14
.endm

.macro cdef_calc_maxmin_w4
    vmin.hu vr3, vr6, vr3
    vmax.h vr2, vr6, vr2
    vmin.hu vr3, vr8, vr3 //min
    vmax.h vr2, vr8, vr2 //max
.endm

.macro cdef_calc_maxmin_w8
    vmin.hu vr3, vr5, vr3
    vmax.h vr2, vr5, vr2
    vmin.hu vr3, vr6, vr3
    vmax.h vr2, vr6, vr2
    vmin.hu vr3, vr7, vr3
    vmax.h vr2, vr7, vr2
    vmin.hu vr3, vr8, vr3 //min
    vmax.h vr2, vr8, vr2 //max
.endm

.macro cdef_calc_dst
    vslti.h vr5, vr1, 0
    vand.v vr5, vr5, vr20
    vsub.h vr5, vr1, vr5
    vaddi.hu vr5, vr5, 8
    vsrai.h vr5, vr5, 4
    vadd.h vr5, vr0, vr5
.endm
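// Per-pixel kernel shared by the three filter functions below, as a hedged C
// sketch of the reference (pri_tap starts at 4 - (pri_strength & 1); the
// (pri_tap_k & 3) | 2 update is the vand.v/vor.v "tapchange" in the macros
// above; min/max tracking uses unsigned compares so the -16384 fill lanes
// never win):
//
//     for (int y = 0; y < h; y++, dst += dst_stride, tmp += tmp_stride)
//         for (int x = 0; x < w; x++) {
//             const int px = dst[x];
//             int sum = 0, max = px, min = px;
//             int pri_tap_k = pri_tap;
//             for (int k = 0; k < 2; k++) {
//                 const int off1 = dav1d_cdef_directions[dir + 2][k];
//                 sum += pri_tap_k * constrain(tmp[x + off1] - px,
//                                              pri_strength, pri_shift);
//                 sum += pri_tap_k * constrain(tmp[x - off1] - px,
//                                              pri_strength, pri_shift);
//                 pri_tap_k = (pri_tap_k & 3) | 2;
//                 const int off2 = dav1d_cdef_directions[dir + 4][k];
//                 const int off3 = dav1d_cdef_directions[dir + 0][k];
//                 const int sec_tap = 2 - k;
//                 sum += sec_tap * constrain(tmp[x + off2] - px,
//                                            sec_strength, sec_shift);
//                 sum += sec_tap * constrain(tmp[x - off2] - px,
//                                            sec_strength, sec_shift);
//                 sum += sec_tap * constrain(tmp[x + off3] - px,
//                                            sec_strength, sec_shift);
//                 sum += sec_tap * constrain(tmp[x - off3] - px,
//                                            sec_strength, sec_shift);
//                 // min/max are also updated from all taps here
//             }
//             dst[x] = iclip(px + ((sum - (sum < 0) + 8) >> 4), min, max);
//         }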
// static NOINLINE void cdef_filter_block_lsx
//     (pixel *dst, const ptrdiff_t dst_stride,
//      const pixel (*left)[2], const pixel *const top,
//      const pixel *const bottom,
//      const int pri_strength, const int sec_strength,
//      const int dir, const int damping, const int w, int h,
//      const enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX)
// w=4 h=4
// param: dst:a0, dst_stride:a1, left:a2, top:a3, bottom:a4, pri_strength:a5,
//        sec_strength:a6, dir:a7, damping:s7, w:s0, h:s1, edges:s2
function cdef_filter_block_4x4_8bpc_lsx
    ld.w t0, sp, 0
    ld.w t1, sp, 8
    addi.d sp, sp, -(64+288)
    st.d s0, sp, 0
    st.d s1, sp, 8
    st.d s2, sp, 16
    st.d s3, sp, 24
    st.d s4, sp, 32
    st.d s5, sp, 40
    st.d s6, sp, 48
    st.d s7, sp, 56
    li.w s0, 4 //w
    li.w s1, 4 //h
    or s2, t1, t1 //edges
    or s7, t0, t0 //damping
    li.d s5, 12 //tmp_stride
    addi.d s4, sp, 64
    slli.d t0, s5, 1
    addi.d t0, t0, 2
    slli.d t0, t0, 1
    add.d s4, s4, t0 //ptr tmp
    vxor.v vr23, vr23, vr23
    li.w t2, 1
    vreplgr2vr.h vr20, t2
    vaddi.hu vr21, vr20, 2
    vaddi.hu vr22, vr20, 1
    li.w t0, -16384
    vreplgr2vr.h vr18, t0 //padding
    li.w t5, -2 //x_start
    addi.d t6, s0, 2 //x_end
    li.w t7, -2 //y_start
    addi.d t8, s1, 2 //y_end
    li.w t2, 2
    andi t4, s2, 4
    bnez t4, 1f //CDEF_HAVE_TOP
    slli.d t3, s5, 2
    addi.d t4, s4, -4
    sub.d t4, t4, t3
    addi.d t3, s0, 4
    cdef_fill t4, s5, t3, t2
    or t7, zero, zero
1: //CDEF_HAVE_BOTTOM
    andi t4, s2, 8
    bnez t4, 2f
    mul.w t3, s1, s5
    slli.d t3, t3, 1
    add.d t4, s4, t3
    addi.d t4, t4, -4
    li.d t3, 8
    cdef_fill t4, s5, t3, t2
    addi.d t8, t8, -2
2: //CDEF_HAVE_LEFT
    andi t4, s2, 1
    bnez t4, 3f
    mul.w t3, t7, s5
    slli.d t3, t3, 1
    add.d t4, s4, t3
    addi.d t4, t4, -4
    sub.d t3, t8, t7
    cdef_fill t4, s5, t2, t3
    or t5, zero, zero
3: //CDEF_HAVE_RIGHT
    andi t4, s2, 2
    bnez t4, 40f
    mul.w t3, t7, s5
    slli.d t3, t3, 1
    add.d t4, s4, t3
    addi.d t4, t4, 8
    sub.d t3, t8, t7
    cdef_fill t4, s5, t2, t3
    addi.d t6, t6, -2
40:
    cdef_padding_data
    beqz a5, 33f
28: //if (pri_strength)
    li.w t0, 4
    andi t1, a5, 1
    sub.d t0, t0, t1 //pri_tap
    clz.w t1, a5
    li.d t2, 31
    sub.w t1, t2, t1
    sub.w t1, s7, t1
    blt t1, zero, 281f
    or t1, t1, t1
    b 282f
281:
    or t1, zero, zero //t1: pri_shift
282:
    beqz a6, 31f
29: //if (sec_strength)
    cdef_pri_sec_init
30:
    fld.s f0, a0, 0 //px
    vsllwil.hu.bu vr0, vr0, 0
    vpermi.w vr0, vr0, 0x44
    vxor.v vr1, vr1, vr1 //sum
    vor.v vr2, vr0, vr0 //max
    vor.v vr3, vr0, vr0 //min
    vor.v vr15, vr4, vr4 //pri_tap_k
    sub.d t4, s4, a2
    sub.d t5, s4, a3
    fldx.d f5, s4, a2 //p0_00
    fld.d f6, t4, 0 //p0_01
    fldx.d f7, s4, a3 //p0_10
    fld.d f8, t5, 0 //p0_11
    cdef_process_data_w4 vr9, vr10
    cdef_calc_sum_tapchange_w4
    cdef_calc_maxmin_w4
    sub.d t4, s4, s1 //tmp[-off02]
    sub.d t5, s4, t0 //tmp[-off03]
    fldx.d f5, s4, s1 //s0_00
    fld.d f6, t4, 0 //s0_01
    fldx.d f7, s4, t0 //s0_02
    fld.d f8, t5, 0 //s0_03
    cdef_process_data_w4 vr18, vr19
    cdef_calc_sum_no_tapchange_w4 vr22
    cdef_calc_maxmin_w4
    sub.d t4, s4, s2 //tmp[-off12]
    sub.d t5, s4, s3 //tmp[-off13]
    fldx.d f5, s4, s2 //s0_10
    fld.d f6, t4, 0 //s0_11
    fldx.d f7, s4, s3 //s0_12
    fld.d f8, t5, 0 //s0_13
    cdef_process_data_w4 vr18, vr19
    cdef_calc_sum_no_tapchange_w4 vr20
    cdef_calc_maxmin_w4
    vshuf4i.w vr5, vr1, 0x0e
    vshuf4i.w vr6, vr3, 0x0e
    vshuf4i.w vr7, vr2, 0x0e
    vadd.h vr1, vr1, vr5
    vmin.hu vr3, vr6, vr3
    vmax.h vr2, vr7, vr2
    cdef_calc_dst
    iclip_vrh vr5, vr3, vr2, vr16, vr17, vr5
    vsrlni.b.h vr5, vr5, 0
    fst.s f5, a0, 0
    add.d a0, a0, a1
    add.d s4, s4, s5
    add.d s4, s4, s5
    addi.d t2, t2, -1
    blt zero, t2, 30b
    b 35f
31: // pri_strength only
    cdef_pri_init
32:
    fld.s f0, a0, 0 //px
    vsllwil.hu.bu vr0, vr0, 0
    vpermi.w vr0, vr0, 0x44
    vxor.v vr1, vr1, vr1 //sum
    vor.v vr15, vr4, vr4 //pri_tap_k
    sub.d t4, s4, a2
    sub.d t5, s4, a3
    fldx.d f5, s4, a2 //p0_00
    fld.d f6, t4, 0 //p0_01
    fldx.d f7, s4, a3 //p0_10
    fld.d f8, t5, 0 //p0_11
    cdef_process_data_w4 vr9, vr10
    cdef_calc_sum_tapchange_w4
    vshuf4i.w vr5, vr1, 0x0e
    vadd.h vr1, vr1, vr5
    cdef_calc_dst
    vsrlni.b.h vr5, vr5, 0
    fst.s f5, a0, 0
    add.d a0, a0, a1
    add.d s4, s4, s5
    add.d s4, s4, s5
    addi.d t2, t2, -1
    blt zero, t2, 32b
    b 35f
33: // sec_strength only
    cdef_sec_init
34:
    fld.s f0, a0, 0 //px
    vsllwil.hu.bu vr0, vr0, 0
    vpermi.w vr0, vr0, 0x44
    vxor.v vr1, vr1, vr1 //sum
    sub.d t4, s4, s1 //tmp[-off02]
    sub.d t5, s4, t0 //tmp[-off03]
    fldx.d f5, s4, s1 //s0_00
    fld.d f6, t4, 0 //s0_01
    fldx.d f7, s4, t0 //s0_02
    fld.d f8, t5, 0 //s0_03
    cdef_process_data_w4 vr18, vr19
    cdef_calc_sum_no_tapchange_w4 vr22
    sub.d t4, s4, s2 //tmp[-off12]
    sub.d t5, s4, s3 //tmp[-off13]
    fldx.d f5, s4, s2 //s0_10
    fld.d f6, t4, 0 //s0_11
    fldx.d f7, s4, s3 //s0_12
    fld.d f8, t5, 0 //s0_13
    cdef_process_data_w4 vr18, vr19
    cdef_calc_sum_no_tapchange_w4 vr20
    vshuf4i.w vr5, vr1, 0x0e
    vadd.h vr1, vr1, vr5
    cdef_calc_dst
    vsrlni.b.h vr5, vr5, 0
    fst.s f5, a0, 0
    add.d a0, a0, a1
    add.d s4, s4, s5
    add.d s4, s4, s5
    addi.d t2, t2, -1
    blt zero, t2, 34b
35:
    ld.d s0, sp, 0
    ld.d s1, sp, 8
    ld.d s2, sp, 16
    ld.d s3, sp, 24
    ld.d s4, sp, 32
    ld.d s5, sp, 40
    ld.d s6, sp, 48
    ld.d s7, sp, 56
    addi.d sp, sp, (64+288)
endfunc
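
// The 4x8 variant below is the 4x4 kernel with h = 8; only the height
// constant differs.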
function cdef_filter_block_4x8_8bpc_lsx
    ld.w t0, sp, 0
    ld.w t1, sp, 8
    addi.d sp, sp, -(64+288)
    st.d s0, sp, 0
    st.d s1, sp, 8
    st.d s2, sp, 16
    st.d s3, sp, 24
    st.d s4, sp, 32
    st.d s5, sp, 40
    st.d s6, sp, 48
    st.d s7, sp, 56
    li.w s0, 4 //w
    li.w s1, 8 //h
    or s2, t1, t1 //edges
    or s7, t0, t0 //damping
    li.d s5, 12 //tmp_stride
    addi.d s4, sp, 64
    slli.d t0, s5, 1
    addi.d t0, t0, 2
    slli.d t0, t0, 1
    add.d s4, s4, t0 //ptr tmp
    vxor.v vr23, vr23, vr23
    li.w t2, 1
    vreplgr2vr.h vr20, t2
    vaddi.hu vr21, vr20, 2
    vaddi.hu vr22, vr20, 1
    li.w t0, -16384
    vreplgr2vr.h vr18, t0 //padding
    li.w t5, -2 //x_start
    addi.d t6, s0, 2 //x_end
    li.w t7, -2 //y_start
    addi.d t8, s1, 2 //y_end
    li.w t2, 2
    andi t4, s2, 4
    bnez t4, 1f //CDEF_HAVE_TOP
    slli.d t3, s5, 2
    addi.d t4, s4, -4
    sub.d t4, t4, t3
    addi.d t3, s0, 4
    cdef_fill t4, s5, t3, t2
    or t7, zero, zero
1: //CDEF_HAVE_BOTTOM
    andi t4, s2, 8
    bnez t4, 2f
    mul.w t3, s1, s5
    slli.d t3, t3, 1
    add.d t4, s4, t3
    addi.d t4, t4, -4
    li.d t3, 8
    cdef_fill t4, s5, t3, t2
    addi.d t8, t8, -2
2: //CDEF_HAVE_LEFT
    andi t4, s2, 1
    bnez t4, 3f
    mul.w t3, t7, s5
    slli.d t3, t3, 1
    add.d t4, s4, t3
    addi.d t4, t4, -4
    sub.d t3, t8, t7
    cdef_fill t4, s5, t2, t3
    or t5, zero, zero
3: //CDEF_HAVE_RIGHT
    andi t4, s2, 2
    bnez t4, 40f
    mul.w t3, t7, s5
    slli.d t3, t3, 1
    add.d t4, s4, t3
    addi.d t4, t4, 8
    sub.d t3, t8, t7
    cdef_fill t4, s5, t2, t3
    addi.d t6, t6, -2
40:
    cdef_padding_data
    beqz a5, 33f
28: //if (pri_strength)
    li.w t0, 4
    andi t1, a5, 1
    sub.d t0, t0, t1 //pri_tap
    clz.w t1, a5
    li.d t2, 31
    sub.w t1, t2, t1
    sub.w t1, s7, t1
    blt t1, zero, 281f
    or t1, t1, t1
    b 282f
281:
    or t1, zero, zero //t1: pri_shift
282:
    beqz a6, 31f
29: //if (sec_strength)
    cdef_pri_sec_init
30:
    fld.s f0, a0, 0 //px
    vsllwil.hu.bu vr0, vr0, 0
    vpermi.w vr0, vr0, 0x44
    vxor.v vr1, vr1, vr1 //sum
    vor.v vr2, vr0, vr0 //max
    vor.v vr3, vr0, vr0 //min
    vor.v vr15, vr4, vr4 //pri_tap_k
    sub.d t4, s4, a2
    sub.d t5, s4, a3
    fldx.d f5, s4, a2 //p0_00
    fld.d f6, t4, 0 //p0_01
    fldx.d f7, s4, a3 //p0_10
    fld.d f8, t5, 0 //p0_11
    cdef_process_data_w4 vr9, vr10
    cdef_calc_sum_tapchange_w4
    cdef_calc_maxmin_w4
    sub.d t4, s4, s1 //tmp[-off02]
    sub.d t5, s4, t0 //tmp[-off03]
    fldx.d f5, s4, s1 //s0_00
    fld.d f6, t4, 0 //s0_01
    fldx.d f7, s4, t0 //s0_02
    fld.d f8, t5, 0 //s0_03
    cdef_process_data_w4 vr18, vr19
    cdef_calc_sum_no_tapchange_w4 vr22
    cdef_calc_maxmin_w4
    sub.d t4, s4, s2 //tmp[-off12]
    sub.d t5, s4, s3 //tmp[-off13]
    fldx.d f5, s4, s2 //s0_10
    fld.d f6, t4, 0 //s0_11
    fldx.d f7, s4, s3 //s0_12
    fld.d f8, t5, 0 //s0_13
    cdef_process_data_w4 vr18, vr19
    cdef_calc_sum_no_tapchange_w4 vr20
    cdef_calc_maxmin_w4
    vshuf4i.w vr5, vr1, 0x0e
    vshuf4i.w vr6, vr3, 0x0e
    vshuf4i.w vr7, vr2, 0x0e
    vadd.h vr1, vr1, vr5
    vmin.hu vr3, vr6, vr3
    vmax.h vr2, vr7, vr2
    cdef_calc_dst
    iclip_vrh vr5, vr3, vr2, vr16, vr17, vr5
    vsrlni.b.h vr5, vr5, 0
    fst.s f5, a0, 0
    add.d a0, a0, a1
    add.d s4, s4, s5
    add.d s4, s4, s5
    addi.d t2, t2, -1
    blt zero, t2, 30b
    b 35f
31: // pri_strength only
    cdef_pri_init
32:
    fld.s f0, a0, 0 //px
    vsllwil.hu.bu vr0, vr0, 0
    vpermi.w vr0, vr0, 0x44
    vxor.v vr1, vr1, vr1 //sum
    vor.v vr15, vr4, vr4 //pri_tap_k
    sub.d t4, s4, a2
    sub.d t5, s4, a3
    fldx.d f5, s4, a2 //p0_00
    fld.d f6, t4, 0 //p0_01
    fldx.d f7, s4, a3 //p0_10
    fld.d f8, t5, 0 //p0_11
    cdef_process_data_w4 vr9, vr10
    cdef_calc_sum_tapchange_w4
    vshuf4i.w vr5, vr1, 0x0e
    vadd.h vr1, vr1, vr5
    cdef_calc_dst
    vsrlni.b.h vr5, vr5, 0
    fst.s f5, a0, 0
    add.d a0, a0, a1
    add.d s4, s4, s5
    add.d s4, s4, s5
    addi.d t2, t2, -1
    blt zero, t2, 32b
    b 35f
33: // sec_strength only
    cdef_sec_init
34:
    fld.s f0, a0, 0 //px
    vsllwil.hu.bu vr0, vr0, 0
    vpermi.w vr0, vr0, 0x44
    vxor.v vr1, vr1, vr1 //sum
    sub.d t4, s4, s1 //tmp[-off02]
    sub.d t5, s4, t0 //tmp[-off03]
    fldx.d f5, s4, s1 //s0_00
    fld.d f6, t4, 0 //s0_01
    fldx.d f7, s4, t0 //s0_02
    fld.d f8, t5, 0 //s0_03
    cdef_process_data_w4 vr18, vr19
    cdef_calc_sum_no_tapchange_w4 vr22
    sub.d t4, s4, s2 //tmp[-off12]
    sub.d t5, s4, s3 //tmp[-off13]
    fldx.d f5, s4, s2 //s0_10
    fld.d f6, t4, 0 //s0_11
    fldx.d f7, s4, s3 //s0_12
    fld.d f8, t5, 0 //s0_13
    cdef_process_data_w4 vr18, vr19
    cdef_calc_sum_no_tapchange_w4 vr20
    vshuf4i.w vr5, vr1, 0x0e
    vadd.h vr1, vr1, vr5
    cdef_calc_dst
    vsrlni.b.h vr5, vr5, 0
    fst.s f5, a0, 0
    add.d a0, a0, a1
    add.d s4, s4, s5
    add.d s4, s4, s5
    addi.d t2, t2, -1
    blt zero, t2, 34b
35:
    ld.d s0, sp, 0
    ld.d s1, sp, 8
    ld.d s2, sp, 16
    ld.d s3, sp, 24
    ld.d s4, sp, 32
    ld.d s5, sp, 40
    ld.d s6, sp, 48
    ld.d s7, sp, 56
    addi.d sp, sp, (64+288)
endfunc
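
// The w == 4 kernels above pack the four +off and four -off taps into one
// vector and fold the two halves before the final add; the 8x8 variant below
// instead uses full 16-byte row loads (vld/vldx) per tap.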
function cdef_filter_block_8x8_8bpc_lsx
    ld.w t0, sp, 0
    ld.w t1, sp, 8
    addi.d sp, sp, -(64+288)
    st.d s0, sp, 0
    st.d s1, sp, 8
    st.d s2, sp, 16
    st.d s3, sp, 24
    st.d s4, sp, 32
    st.d s5, sp, 40
    st.d s6, sp, 48
    st.d s7, sp, 56
    li.w s0, 8 //w
    li.w s1, 8 //h
    or s2, t1, t1 //edges
    or s7, t0, t0 //damping
    // cdef_filter_block_kernel
    li.d s5, 12 //tmp_stride
    addi.d s4, sp, 64
    slli.d t0, s5, 1
    addi.d t0, t0, 2
    slli.d t0, t0, 1
    add.d s4, s4, t0 //ptr tmp
    vxor.v vr23, vr23, vr23
    li.w t2, 1
    vreplgr2vr.h vr20, t2
    vaddi.hu vr21, vr20, 2
    vaddi.hu vr22, vr20, 1
    li.w t0, -16384
    vreplgr2vr.h vr18, t0 //padding
    li.w t5, -2 //x_start
    addi.d t6, s0, 2 //x_end
    li.w t7, -2 //y_start
    addi.d t8, s1, 2 //y_end
    li.w t2, 2
    andi t4, s2, 4
    bnez t4, 1f //CDEF_HAVE_TOP
    slli.d t3, s5, 2
    addi.d t4, s4, -4
    sub.d t4, t4, t3
    addi.d t3, s0, 4
    cdef_fill t4, s5, t3, t2
    or t7, zero, zero
1: //CDEF_HAVE_BOTTOM
    andi t4, s2, 8
    bnez t4, 2f
    mul.w t3, s1, s5
    slli.d t3, t3, 1
    add.d t4, s4, t3
    addi.d t4, t4, -4
    li.d t3, 12
    cdef_fill t4, s5, t3, t2
    addi.d t8, t8, -2
2: //CDEF_HAVE_LEFT
    andi t4, s2, 1
    bnez t4, 3f
    mul.w t3, t7, s5
    slli.d t3, t3, 1
    add.d t4, s4, t3
    addi.d t4, t4, -4
    sub.d t3, t8, t7
    li.d t2, 2
    cdef_fill t4, s5, t2, t3
    or t5, zero, zero
3: //CDEF_HAVE_RIGHT
    andi t4, s2, 2
    bnez t4, 40f
    mul.w t3, t7, s5
    slli.d t3, t3, 1
    add.d t4, s4, t3
    addi.d t4, t4, 16
    sub.d t3, t8, t7
    li.d t2, 2
    cdef_fill t4, s5, t2, t3
    addi.d t6, t6, -2
40:
    cdef_padding_data
    beqz a5, 33f
28: //if (pri_strength)
    li.w t0, 4
    andi t1, a5, 1
    sub.d t0, t0, t1 //pri_tap
    clz.w t1, a5
    li.d t2, 31
    sub.w t3, t2, t1
    sub.w t3, s7, t3
    or t1, zero, zero //t1: pri_shift
    blt t3, zero, 281f
    or t1, t3, t3
281:
    beqz a6, 31f
29: //if (sec_strength)
    cdef_pri_sec_init
301:
    fld.d f0, a0, 0 //px
    vsllwil.hu.bu vr0, vr0, 0
    vxor.v vr1, vr1, vr1 //sum
    vor.v vr2, vr0, vr0 //max
    vor.v vr3, vr0, vr0 //min
    vor.v vr15, vr4, vr4 //pri_tap_k
    sub.d t4, s4, a2
    sub.d t5, s4, a3
    vldx vr5, s4, a2
    vld vr6, t4, 0
    vldx vr7, s4, a3
    vld vr8, t5, 0
    cdef_process_data_w8 vr9, vr10
    cdef_calc_sum_tapchange_w8
    cdef_calc_maxmin_w8
    //s 00-03
    sub.d t4, s4, s1 //tmp[-off02]
    sub.d t5, s4, t0 //tmp[-off03]
    vldx vr5, s4, s1
    vld vr6, t4, 0
    vldx vr7, s4, t0
    vld vr8, t5, 0
    cdef_process_data_w8 vr18, vr19
    cdef_calc_sum_no_tapchange_w8 vr22
    cdef_calc_maxmin_w8
    //s 10-13
    sub.d t4, s4, s2 //tmp[-off12]
    sub.d t5, s4, s3 //tmp[-off13]
    vldx vr5, s4, s2
    vld vr6, t4, 0
    vldx vr7, s4, s3
    vld vr8, t5, 0
    cdef_process_data_w8 vr18, vr19
    cdef_calc_sum_no_tapchange_w8 vr20
    cdef_calc_maxmin_w8
    cdef_calc_dst
    iclip_vrh vr5, vr3, vr2, vr16, vr17, vr5
    vsrlni.b.h vr5, vr5, 0
    fst.d f5, a0, 0
    add.d a0, a0, a1
    add.d s4, s4, s5
    add.d s4, s4, s5
    addi.d t2, t2, -1
    blt zero, t2, 301b
    b 35f
31: // pri_strength only
    cdef_pri_init
32:
    fld.d f0, a0, 0 //px
    vsllwil.hu.bu vr0, vr0, 0
    vxor.v vr1, vr1, vr1 //sum
    vor.v vr15, vr4, vr4 //pri_tap_k
    sub.d t4, s4, a2
    sub.d t5, s4, a3
    vldx vr5, s4, a2
    vld vr6, t4, 0
    vldx vr7, s4, a3
    vld vr8, t5, 0
    cdef_process_data_w8 vr9, vr10
    cdef_calc_sum_tapchange_w8
    cdef_calc_dst
    vsrlni.b.h vr5, vr5, 0
    fst.d f5, a0, 0
    add.d a0, a0, a1
    add.d s4, s4, s5
    add.d s4, s4, s5
    addi.d t2, t2, -1
    blt zero, t2, 32b
    b 35f
33: // sec_strength only
    cdef_sec_init
34:
    fld.d f0, a0, 0 //px
    vsllwil.hu.bu vr0, vr0, 0
    vxor.v vr1, vr1, vr1 //sum
    sub.d t4, s4, s1 //tmp[-off02]
    sub.d t5, s4, t0 //tmp[-off03]
    vldx vr5, s4, s1
    vld vr6, t4, 0
    vldx vr7, s4, t0
    vld vr8, t5, 0
    cdef_process_data_w8 vr18, vr19
    cdef_calc_sum_no_tapchange_w8 vr22
    sub.d t4, s4, s2 //tmp[-off12]
    sub.d t5, s4, s3 //tmp[-off13]
    vldx vr5, s4, s2
    vld vr6, t4, 0
    vldx vr7, s4, s3
    vld vr8, t5, 0
    cdef_process_data_w8 vr18, vr19
    cdef_calc_sum_no_tapchange_w8 vr20
    cdef_calc_dst
    vsrlni.b.h vr5, vr5, 0
    fst.d f5, a0, 0
    add.d a0, a0, a1
    add.d s4, s4, s5
    add.d s4, s4, s5
    addi.d t2, t2, -1
    blt zero, t2, 34b
35:
    ld.d s0, sp, 0
    ld.d s1, sp, 8
    ld.d s2, sp, 16
    ld.d s3, sp, 24
    ld.d s4, sp, 32
    ld.d s5, sp, 40
    ld.d s6, sp, 48
    ld.d s7, sp, 56
    addi.d sp, sp, (64+288)
endfunc