/* * Copyright © 2023, VideoLAN and dav1d authors * Copyright © 2023, Loongson Technology Corporation Limited * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "loongson_asm.S" const min_prob .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0 endconst const ph_0xff00 .rept 8 .short 0xff00 .endr endconst .macro decode_symbol_adapt w addi.d sp, sp, -48 vldrepl.h vr0, a0, 24 //rng fst.s f0, sp, 0 //val==0 vld vr1, a1, 0 //cdf .if \w == 16 vld vr11, a1, 16 .endif vldrepl.d vr2, a0, 16 //dif ld.w t1, a0, 32 //allow_update_cdf la.local t2, min_prob addi.d t2, t2, 30 slli.w t3, a2, 1 sub.d t2, t2, t3 vld vr3, t2, 0 //min_prob .if \w == 16 vld vr13, t2, 16 .endif vsrli.h vr4, vr0, 8 //r = s->rng >> 8 vslli.h vr4, vr4, 8 //r << 8 vsrli.h vr5, vr1, 6 vslli.h vr5, vr5, 7 .if \w == 16 vsrli.h vr15, vr11, 6 vslli.h vr15, vr15, 7 .endif vmuh.hu vr5, vr4, vr5 vadd.h vr5, vr5, vr3 //v .if \w == 16 vmuh.hu vr15, vr4, vr15 vadd.h vr15, vr15, vr13 .endif addi.d t8, sp, 2 vst vr5, t8, 0 //store v .if \w == 16 vst vr15, t8, 16 .endif vreplvei.h vr20, vr2, 3 //c vsle.hu vr6, vr5, vr20 .if \w == 16 vsle.hu vr16, vr15, vr20 vpickev.b vr21, vr16, vr6 .endif .if \w <= 8 vmskltz.h vr10, vr6 .else vmskltz.b vr10, vr21 .endif beqz t1, .renorm\()\w // update_cdf alsl.d t1, a2, a1, 1 ld.h t2, t1, 0 //count srli.w t3, t2, 4 //count >> 4 .if \w == 16 addi.w t3, t3, 5 //rate .else addi.w t3, t3, 4 li.w t5, 2 sltu t5, t5, a2 add.w t3, t3, t5 //rate .endif sltui t5, t2, 32 add.w t2, t2, t5 //count + (count < 32) vreplgr2vr.h vr9, t3 vseq.h vr7, vr7, vr7 vavgr.hu vr5, vr6, vr7 //i >= val ? -1 : 32768 vsub.h vr5, vr5, vr1 vsub.h vr8, vr1, vr6 .if \w == 16 vavgr.hu vr15, vr16, vr7 vsub.h vr15, vr15, vr11 vsub.h vr18, vr11, vr16 .endif vsra.h vr5, vr5, vr9 vadd.h vr8, vr8, vr5 .if \w == 4 fst.d f8, a1, 0 .else vst vr8, a1, 0 .endif .if \w == 16 vsra.h vr15, vr15, vr9 vadd.h vr18, vr18, vr15 vst vr18, a1, 16 .endif st.h t2, t1, 0 .renorm\()\w: vpickve2gr.h t3, vr10, 0 ctz.w a7, t3 // ret alsl.d t3, a7, t8, 1 ld.hu t4, t3, 0 // v ld.hu t5, t3, -2 // u sub.w t5, t5, t4 // rng slli.d t4, t4, 48 vpickve2gr.d t6, vr2, 0 sub.d t6, t6, t4 // dif clz.w t4, t5 // d xori t4, t4, 16 // d sll.d t6, t6, t4 ld.w t0, a0, 28 //cnt sll.w t5, t5, t4 sub.w t7, t0, t4 // cnt-d st.w t5, a0, 24 // store rng bgeu t0, t4, 9f // refill ld.d t0, a0, 0 // buf_pos ld.d t1, a0, 8 // buf_end addi.d t2, t0, 8 bltu t1, t2, 2f ld.d t3, t0, 0 // next_bits addi.w t1, t7, -48 // shift_bits = cnt + 16 (- 64) nor t3, t3, t3 sub.w t2, zero, t1 revb.d t3, t3 // next_bits = bswap(next_bits) srli.w t2, t2, 3 // num_bytes_read srl.d t3, t3, t1 // next_bits >>= (shift_bits & 63) b 3f 1: addi.w t3, t7, -48 srl.d t3, t3, t3 // pad with ones b 4f 2: bgeu t0, t1, 1b ld.d t3, t1, -8 // next_bits sub.w t2, t2, t1 sub.w t1, t1, t0 // num_bytes_left slli.w t2, t2, 3 srl.d t3, t3, t2 addi.w t2, t7, -48 nor t3, t3, t3 sub.w t4, zero, t2 revb.d t3, t3 srli.w t4, t4, 3 srl.d t3, t3, t2 sltu t2, t1, t4 maskeqz t1, t1, t2 masknez t2, t4, t2 or t2, t2, t1 // num_bytes_read 3: slli.w t1, t2, 3 add.d t0, t0, t2 add.w t7, t7, t1 // cnt += num_bits_read st.d t0, a0, 0 4: or t6, t6, t3 // dif |= next_bits 9: st.w t7, a0, 28 // store cnt st.d t6, a0, 16 // store dif move a0, a7 addi.d sp, sp, 48 .endm function msac_decode_symbol_adapt4_lsx decode_symbol_adapt 4 endfunc function msac_decode_symbol_adapt8_lsx decode_symbol_adapt 8 endfunc function msac_decode_symbol_adapt16_lsx decode_symbol_adapt 16 endfunc function msac_decode_bool_lsx ld.w t0, a0, 24 // rng srli.w a1, a1, 6 ld.d t1, a0, 16 // dif srli.w t2, t0, 8 // r >> 8 mul.w t2, t2, a1 ld.w a5, a0, 28 // cnt srli.w t2, t2, 1 addi.w t2, t2, 4 // v slli.d t3, t2, 48 // vw sltu t4, t1, t3 move t8, t4 // ret xori t4, t4, 1 maskeqz t6, t3, t4 // if (ret) vw sub.d t6, t1, t6 // dif slli.w t5, t2, 1 sub.w t5, t0, t5 // r - 2v maskeqz t7, t5, t4 // if (ret) r - 2v add.w t5, t2, t7 // v(rng) // renorm clz.w t4, t5 // d xori t4, t4, 16 // d sll.d t6, t6, t4 sll.w t5, t5, t4 sub.w t7, a5, t4 // cnt-d st.w t5, a0, 24 // store rng bgeu a5, t4, 9f // refill ld.d t0, a0, 0 // buf_pos ld.d t1, a0, 8 // buf_end addi.d t2, t0, 8 bltu t1, t2, 2f ld.d t3, t0, 0 // next_bits addi.w t1, t7, -48 // shift_bits = cnt + 16 (- 64) nor t3, t3, t3 sub.w t2, zero, t1 revb.d t3, t3 // next_bits = bswap(next_bits) srli.w t2, t2, 3 // num_bytes_read srl.d t3, t3, t1 // next_bits >>= (shift_bits & 63) b 3f 1: addi.w t3, t7, -48 srl.d t3, t3, t3 // pad with ones b 4f 2: bgeu t0, t1, 1b ld.d t3, t1, -8 // next_bits sub.w t2, t2, t1 sub.w t1, t1, t0 // num_bytes_left slli.w t2, t2, 3 srl.d t3, t3, t2 addi.w t2, t7, -48 nor t3, t3, t3 sub.w t4, zero, t2 revb.d t3, t3 srli.w t4, t4, 3 srl.d t3, t3, t2 sltu t2, t1, t4 maskeqz t1, t1, t2 masknez t2, t4, t2 or t2, t2, t1 // num_bytes_read 3: slli.w t1, t2, 3 add.d t0, t0, t2 add.w t7, t7, t1 // cnt += num_bits_read st.d t0, a0, 0 4: or t6, t6, t3 // dif |= next_bits 9: st.w t7, a0, 28 // store cnt st.d t6, a0, 16 // store dif move a0, t8 endfunc function msac_decode_bool_equi_lsx ld.w t0, a0, 24 // rng ld.d t1, a0, 16 // dif ld.w a5, a0, 28 // cnt srli.w t2, t0, 8 // r >> 8 slli.w t2, t2, 7 addi.w t2, t2, 4 // v slli.d t3, t2, 48 // vw sltu t4, t1, t3 move t8, t4 // ret xori t4, t4, 1 maskeqz t6, t3, t4 // if (ret) vw sub.d t6, t1, t6 // dif slli.w t5, t2, 1 sub.w t5, t0, t5 // r - 2v maskeqz t7, t5, t4 // if (ret) r - 2v add.w t5, t2, t7 // v(rng) // renorm clz.w t4, t5 // d xori t4, t4, 16 // d sll.d t6, t6, t4 sll.w t5, t5, t4 sub.w t7, a5, t4 // cnt-d st.w t5, a0, 24 // store rng bgeu a5, t4, 9f // refill ld.d t0, a0, 0 // buf_pos ld.d t1, a0, 8 // buf_end addi.d t2, t0, 8 bltu t1, t2, 2f ld.d t3, t0, 0 // next_bits addi.w t1, t7, -48 // shift_bits = cnt + 16 (- 64) nor t3, t3, t3 sub.w t2, zero, t1 revb.d t3, t3 // next_bits = bswap(next_bits) srli.w t2, t2, 3 // num_bytes_read srl.d t3, t3, t1 // next_bits >>= (shift_bits & 63) b 3f 1: addi.w t3, t7, -48 srl.d t3, t3, t3 // pad with ones b 4f 2: bgeu t0, t1, 1b ld.d t3, t1, -8 // next_bits sub.w t2, t2, t1 sub.w t1, t1, t0 // num_bytes_left slli.w t2, t2, 3 srl.d t3, t3, t2 addi.w t2, t7, -48 nor t3, t3, t3 sub.w t4, zero, t2 revb.d t3, t3 srli.w t4, t4, 3 srl.d t3, t3, t2 sltu t2, t1, t4 maskeqz t1, t1, t2 masknez t2, t4, t2 or t2, t2, t1 // num_bytes_read 3: slli.w t1, t2, 3 add.d t0, t0, t2 add.w t7, t7, t1 // cnt += num_bits_read st.d t0, a0, 0 4: or t6, t6, t3 // dif |= next_bits 9: st.w t7, a0, 28 // store cnt st.d t6, a0, 16 // store dif move a0, t8 endfunc function msac_decode_bool_adapt_lsx ld.hu a3, a1, 0 // cdf[0] /f ld.w t0, a0, 24 // rng ld.d t1, a0, 16 // dif srli.w t2, t0, 8 // r >> 8 srli.w a7, a3, 6 mul.w t2, t2, a7 ld.w a4, a0, 32 // allow_update_cdf ld.w a5, a0, 28 // cnt srli.w t2, t2, 1 addi.w t2, t2, 4 // v slli.d t3, t2, 48 // vw sltu t4, t1, t3 move t8, t4 // bit xori t4, t4, 1 maskeqz t6, t3, t4 // if (ret) vw sub.d t6, t1, t6 // dif slli.w t5, t2, 1 sub.w t5, t0, t5 // r - 2v maskeqz t7, t5, t4 // if (ret) r - 2v add.w t5, t2, t7 // v(rng) beqz a4, .renorm // update_cdf ld.hu t0, a1, 2 // cdf[1] srli.w t1, t0, 4 addi.w t1, t1, 4 // rate sltui t2, t0, 32 // count < 32 add.w t0, t0, t2 // count + (count < 32) sub.w a3, a3, t8 // cdf[0] -= bit slli.w t4, t8, 15 sub.w t7, a3, t4 // cdf[0] - bit - 32768 sra.w t7, t7, t1 // (cdf[0] - bit - 32768) >> rate sub.w t7, a3, t7 // cdf[0] st.h t7, a1, 0 st.h t0, a1, 2 .renorm: clz.w t4, t5 // d xori t4, t4, 16 // d sll.d t6, t6, t4 sll.w t5, t5, t4 sub.w t7, a5, t4 // cnt-d st.w t5, a0, 24 // store rng bgeu a5, t4, 9f // refill ld.d t0, a0, 0 // buf_pos ld.d t1, a0, 8 // buf_end addi.d t2, t0, 8 bltu t1, t2, 2f ld.d t3, t0, 0 // next_bits addi.w t1, t7, -48 // shift_bits = cnt + 16 (- 64) nor t3, t3, t3 sub.w t2, zero, t1 revb.d t3, t3 // next_bits = bswap(next_bits) srli.w t2, t2, 3 // num_bytes_read srl.d t3, t3, t1 // next_bits >>= (shift_bits & 63) b 3f 1: addi.w t3, t7, -48 srl.d t3, t3, t3 // pad with ones b 4f 2: bgeu t0, t1, 1b ld.d t3, t1, -8 // next_bits sub.w t2, t2, t1 sub.w t1, t1, t0 // num_bytes_left slli.w t2, t2, 3 srl.d t3, t3, t2 addi.w t2, t7, -48 nor t3, t3, t3 sub.w t4, zero, t2 revb.d t3, t3 srli.w t4, t4, 3 srl.d t3, t3, t2 sltu t2, t1, t4 maskeqz t1, t1, t2 masknez t2, t4, t2 or t2, t2, t1 // num_bytes_read 3: slli.w t1, t2, 3 add.d t0, t0, t2 add.w t7, t7, t1 // cnt += num_bits_read st.d t0, a0, 0 4: or t6, t6, t3 // dif |= next_bits 9: st.w t7, a0, 28 // store cnt st.d t6, a0, 16 // store dif move a0, t8 endfunc .macro HI_TOK allow_update_cdf .\allow_update_cdf\()_hi_tok_lsx_start: .if \allow_update_cdf == 1 ld.hu a4, a1, 0x06 // cdf[3] .endif vor.v vr1, vr0, vr0 vsrli.h vr1, vr1, 0x06 // cdf[val] >> EC_PROB_SHIFT vstelm.h vr2, sp, 0, 0 // -0x1a vand.v vr2, vr2, vr4 // (8 x rng) & 0xff00 vslli.h vr1, vr1, 0x07 vmuh.hu vr1, vr1, vr2 vadd.h vr1, vr1, vr5 // v += EC_MIN_PROB/* 4 */ * ((unsigned)n_symbols/* 3 */ - val); vst vr1, sp, 0x02 // -0x18 vssub.hu vr1, vr1, vr3 // v - c vseqi.h vr1, vr1, 0 .if \allow_update_cdf == 1 addi.d t4, a4, 0x50 srli.d t4, t4, 0x04 sltui t7, a4, 32 add.w a4, a4, t7 vreplgr2vr.h vr7, t4 vavgr.hu vr9, vr8, vr1 vsub.h vr9, vr9, vr0 vsub.h vr0, vr0, vr1 vsra.h vr9, vr9, vr7 vadd.h vr0, vr0, vr9 vstelm.d vr0, a1, 0, 0 st.h a4, a1, 0x06 .endif vmsknz.b vr7, vr1 movfr2gr.s t4, f7 ctz.w t4, t4 // loop_times * 2 addi.d t7, t4, 2 ldx.hu t6, sp, t4 // u ldx.hu t5, sp, t7 // v addi.w t3, t3, 0x05 addi.w t4, t4, -0x05 // if t4 == 3, continue sub.w t6, t6, t5 // u - v , rng for ctx_norm slli.d t5, t5, 0x30 // (ec_win)v << (EC_WIN_SIZE - 16) sub.d t1, t1, t5 // s->dif - ((ec_win)v << (EC_WIN_SIZE - 16)) // Init ctx_norm param clz.w t7, t6 xori t7, t7, 0x1f xori t7, t7, 0x0f // d = 15 ^ (31 ^ clz(rng)); sll.d t1, t1, t7 // dif << d sll.d t6, t6, t7 // rng << d // update vr2 8 x rng vreplgr2vr.h vr2, t6 vreplvei.h vr2, vr2, 0 st.w t6, a0, 0x18 // store rng move t0, t2 sub.w t2, t2, t7 // cnt - d bgeu t0, t7, .\allow_update_cdf\()_hi_tok_lsx_ctx_norm_end // if ((unsigned)cnt < (unsigned)d) goto ctx_norm_end // Step into ctx_fill ld.d t5, a0, 0x00 // buf_pos ld.d t6, a0, 0x08 // end_pos addi.d t7, t5, 0x08 // buf_pos + 8 sub.d t7, t7, t6 // (buf_pos + 8) - end_pos blt zero, t7, .\allow_update_cdf\()_hi_tok_lsx_ctx_refill_eob // (end_pos - buf_pos) >= 8 ld.d t6, t5, 0x00 // load buf_pos[0]~buf_pos[7] addi.w t7, t2, -0x30 // cnt - 0x30 nor t6, t6, t6 // not buf data revb.d t6, t6 // Byte reversal srl.d t6, t6, t7 // Replace left shift with right shift sub.w t7, zero, t7 // neg srli.w t7, t7, 0x03 // Loop times or t1, t1, t6 // dif |= (ec_win)(*buf_pos++ ^ 0xff) << c b .\allow_update_cdf\()_hi_tok_lsx_ctx_refill_end .\allow_update_cdf\()_hi_tok_lsx_ctx_refill_eob: bge t5, t6, .\allow_update_cdf\()_hi_tok_lsx_ctx_refill_one // end_pos - buf_pos < 8 && buf_pos < end_pos ld.d t0, t6, -0x08 slli.d t7, t7, 0x03 srl.d t6, t0, t7 // Retrieve the buf data and remove the excess data addi.w t7, t2, -0x30 // cnt - 0x30 nor t6, t6, t6 // not revb.d t6, t6 // Byte reversal srl.d t6, t6, t7 // Replace left shift with right shift sub.w t7, zero, t7 // neg or t1, t1, t6 // dif |= (ec_win)(*buf_pos++ ^ 0xff) << c ld.d t6, a0, 0x08 // end_pos srli.w t7, t7, 0x03 // Loop times sub.d t6, t6, t5 // end_pos - buf_pos slt t0, t6, t7 maskeqz a3, t6, t0 // min(loop_times, end_pos - buf_pos) masknez t0, t7, t0 or t7, a3, t0 b .\allow_update_cdf\()_hi_tok_lsx_ctx_refill_end .\allow_update_cdf\()_hi_tok_lsx_ctx_refill_one: // buf_pos >= end_pos addi.w t7, t2, -0x10 andi t7, t7, 0xf nor t0, zero, zero srl.d t0, t0, t7 or t1, t1, t0 // dif |= ~(~(ec_win)0xff << c); b .\allow_update_cdf\()_hi_tok_lsx_ctx_norm_end .\allow_update_cdf\()_hi_tok_lsx_ctx_refill_end: add.d t5, t5, t7 // buf_pos + Loop_times st.d t5, a0, 0x00 // Store buf_pos alsl.w t2, t7, t2, 0x03 // update cnt .\allow_update_cdf\()_hi_tok_lsx_ctx_norm_end: srli.d t7, t1, 0x30 vreplgr2vr.h vr3, t7 // broadcast the high 16 bits of dif add.w t3, t4, t3 // update control parameter beqz t3, .\allow_update_cdf\()_hi_tok_lsx_end // control loop for at most 4 times. blt zero, t4, .\allow_update_cdf\()_hi_tok_lsx_start // tok_br == 3 .\allow_update_cdf\()_hi_tok_lsx_end: addi.d t3, t3, 0x1e st.d t1, a0, 0x10 // store dif st.w t2, a0, 0x1c // store cnt srli.w a0, t3, 0x01 // tok addi.d sp, sp, 0x1a .endm /** * @param unsigned dav1d_msac_decode_hi_tok_c(MsacContext *const s, uint16_t *const cdf) * * Reg Alloction * * vr0: cdf; * * vr1: temp; * * vr2: rng; * * vr3: dif; * * vr4: const 0xff00ff00...ff00ff00; * * vr5: const 0x0004080c; * * vr6: const 0; * * t0: allow_update_cdf, tmp; * * t1: dif; * * t2: cnt; * * t3: 0xffffffe8, outermost control parameter; * * t4: loop time * * t5: v, buf_pos, temp; * * t6: u, rng, end_pos, buf, temp; * * t7: temp; */ function msac_decode_hi_tok_lsx fld.d f0, a1, 0 // Load cdf[0]~cdf[3] vldrepl.h vr2, a0, 0x18 // 8 x rng, assert(rng <= 65535U), only the lower 16 bits are valid vldrepl.h vr3, a0, 0x16 // broadcast the high 16 bits of dif, c = s->dif >> (EC_WIN_SIZE - 16) ld.w t0, a0, 0x20 // allow_update_cdf la.local t7, ph_0xff00 vld vr4, t7, 0x00 // 0xff00ff00...ff00ff00 la.local t7, min_prob vld vr5, t7, 12 * 2 // 0x0004080c vxor.v vr6, vr6, vr6 // const 0 ld.d t1, a0, 0x10 // dif ld.w t2, a0, 0x1c // cnt orn t3, t3, t3 srli.d t3, t3, 32 addi.d t3, t3, -0x17 // 0xffffffe8 vseq.h vr8, vr8, vr8 addi.d sp, sp, -0x1a // alloc stack beqz t0, .hi_tok_lsx_no_update_cdf HI_TOK 1 jirl zero, ra, 0x0 .hi_tok_lsx_no_update_cdf: HI_TOK 0 endfunc