/*****************************************************************************
 * Copyright (C) 2021 MulticoreWare, Inc
 *
 * Authors: Min Chen
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at license @ x265.com.
 *****************************************************************************/

// Functions in this file:
// ***** planar_pred (8x8 and 16x16) *****
//
// Syntax: GNU as, AArch64, run through the C preprocessor.
// Both routines are leaf functions that touch only x0-x4, v0-v7 and v16-v19,
// so nothing is saved or restored and the stack is not used.
// Pixels are read and written as single bytes.

#include "asm.S"

#ifdef __APPLE__
.section __RODATA,__rodata
#else
.section .rodata
#endif

.align 4

// NOTE(review): the read-only section selected above stays empty; the table
// below is emitted in .text. Presumably this keeps it within reach of the
// PC-relative adr used by both functions - confirm before moving it.
.text

.align 4
// Weight table, four rows of eight bytes:
//   row 0: x + 1  for x = 0..7
//   row 1: 7 - x  for x = 0..7
//   row 2: x + 1  for x = 8..15
//   row 3: 15 - x for x = 0..7
// The 8x8 routine reads rows 0 and 1 only. The 16x16 routine reads all four
// rows and rearranges them into (x + 1) and (15 - x) for x = 0..15.
tbl_const_1to8_7to0:
    .byte 1, 2, 3, 4, 5, 6, 7, 8
    .byte 7, 6, 5, 4, 3, 2, 1, 0
    .byte 9, 10, 11, 12, 13, 14, 15, 16
    .byte 15, 14, 13, 12, 11, 10, 9, 8

// ***** planar_pred *****
// void planar_pred(pixel* dst, intptr_t dstStride, const pixel* srcPix, int /*dirMode*/, int /*bFilter*/)
//
// 8x8 planar intra prediction. dirMode and bFilter are not read.
function PFX(intra_pred_planar8_neon)
// Register map
// x0 = dst, advanced by dstStride after every stored row
// x1 = dstStride
// x2 = srcPix
// x3 = topRight byte on entry to the setup, then left[7:0] packed one byte
//      per row with the current row in the low byte
// x4 = bottomLeft byte, then table address, then remaining-row counter
// v0 = above[7:0] as bytes, then (bottomLeft - above[x]) as 16-bit lanes
// v1 = left[y] replicated to every byte lane
// v2 = topRight = rep(above[blkSize]) as bytes
// v3 = bottomLeft = rep(left[blkSize]) as 16-bit lanes, reused in the loop
//      as the per-row result
// v4 = x + 1 per byte lane      (table row 0)
// v5 = 7 - x per byte lane      (table row 1)
// v6 = running sum of the terms that do not depend on left[y]

//{
//    const int blkSize = 1 << log2Size;
//    const pixel* above = srcPix + 1;
//    const pixel* left  = srcPix + (2 * blkSize + 1);
//    pixel topRight = above[blkSize];
//    pixel bottomLeft = left[blkSize];
//    for (int y = 0; y < blkSize; y++)
//        for (int x = 0; x < blkSize; x++)
//            dst[y * dstStride + x] = (pixel) (((blkSize - 1 - x) * left[y] + (blkSize - 1 -y) * above[x] + (x + 1) * topRight + (y + 1) * bottomLeft + blkSize) >> (log2Size + 1));
//}

    ldurb           w3, [x2, #(1+8)]                // topRight
    ldurb           w4, [x2, #(2*8+1+8)]            // bottomLeft
    dup             v2.8b, w3                       // v2 = topRight_b
    dup             v3.8h, w4                       // v3 = bottomLeft_h
    ldr             x3, [x2, #(2*8+1)]              // x3 = left[7:0]_b, row 0 in the low byte
    ldr             d0, [x2, #1]                    // v0 = above[x]_b

    adr             x4, tbl_const_1to8_7to0
    ldr             d4, [x4]                        // v4 = x + 1
    ldr             d5, [x4, #8]                    // v5 = 7 - x

    ushll           v6.8h, v0.8b, #3                // v6 = 8 * above[x]
    usubw           v0.8h, v3.8h, v0.8b             // v0 = bottomLeft - above[x] (wraps modulo 2^16)
    umlal           v6.8h, v4.8b, v2.8b             // v6 = 8 * above[x] + (x + 1) * topRight

    mov             w4, #8                          // rows remaining

1:
    dup             v1.8b, w3                       // v1 = left[y] in every lane
    lsr             x3, x3, #8                      // expose left[y + 1] for the next row
    add             v6.8h, v6.8h, v0.8h             // v6 = (7 - y) * above[x] + (x + 1) * topRight + (y + 1) * bottomLeft
    mov             v3.16b, v6.16b                  // copy, because umlal accumulates in place
    umlal           v3.8h, v5.8b, v1.8b             // v3 = v6 + (7 - x) * left[y]
    rshrn           v3.8b, v3.8h, #4                // (v3 + 8) >> 4, narrowed to bytes
    sub             w4, w4, #1
    st1             {v3.8b}, [x0], x1               // store one row, dst += dstStride
    cbnz            w4, 1b

    ret
endfunc

// void planar_pred(pixel* dst, intptr_t dstStride, const pixel* srcPix, int /*dirMode*/, int /*bFilter*/)
//
// 16x16 planar intra prediction. dirMode and bFilter are not read.
function PFX(intra_pred_planar16_neon)
// Register map
// x0 = dst, advanced by dstStride after every stored row
// x1 = dstStride
// x2 = srcPix
// x3 = topRight byte
// x4 = bottomLeft byte, then table address, then remaining-row counter
// v0 = left[15:0] as bytes, rotated by one byte per row so lane 0 is left[y]
// v1 = above[15:0] as bytes during setup, then left[y] replicated to every
//      byte lane inside the loop
// v2 = topRight = rep(above[blkSize]) as bytes
// v3 = bottomLeft = rep(left[blkSize]) as 16-bit lanes
// v4 = x + 1 per byte lane, x = 0..15
// v5 = 15 - x per byte lane, x = 0..15
// v6,v7   = bottomLeft - above[x] for x = 0..7 and x = 8..15
// v16,v17 = running sum of the terms that do not depend on left[y]
// v18,v19 = per-row sum; v18 also receives the narrowed 16-byte result

//{
//    const int blkSize = 1 << log2Size;
//    const pixel* above = srcPix + 1;
//    const pixel* left  = srcPix + (2 * blkSize + 1);
//    pixel topRight = above[blkSize];
//    pixel bottomLeft = left[blkSize];
//    for (int y = 0; y < blkSize; y++)
//        for (int x = 0; x < blkSize; x++)
//            dst[y * dstStride + x] = (pixel) (((blkSize - 1 - x) * left[y] + (blkSize - 1 -y) * above[x] + (x + 1) * topRight + (y + 1) * bottomLeft + blkSize) >> (log2Size + 1));
//}

    ldurb           w3, [x2, #(1+16)]               // topRight
    ldurb           w4, [x2, #(2*16+1+16)]          // bottomLeft
    ldr             q0, [x2, #(2*16+1)]             // v0 = left[x]_b
    ldr             q1, [x2, #1]                    // v1 = above[x]_b
    dup             v2.16b, w3                      // v2 = topRight_b
    dup             v3.8h, w4                       // v3 = bottomLeft_h

    adr             x4, tbl_const_1to8_7to0
    ld2             {v4.2d, v5.2d}, [x4]            // de-interleave 8-byte rows: v4 = rows 0,2 = x + 1; v5 = rows 1,3
    ext             v5.16b, v5.16b, v5.16b, #8      // swap the halves of v5 so that v5 = 15 - x

    ushll           v16.8h, v1.8b, #4               // v16,v17 = 16 * above[x]
    ushll2          v17.8h, v1.16b, #4
    usubw           v6.8h, v3.8h, v1.8b             // v6,v7 = bottomLeft - above[x] (wraps modulo 2^16)
    usubw2          v7.8h, v3.8h, v1.16b
    umlal           v16.8h, v4.8b, v2.8b            // v16,v17 = 16 * above[x] + (x + 1) * topRight
    umlal2          v17.8h, v4.16b, v2.16b

    mov             w4, #16                         // rows remaining

1:
    dup             v1.16b, v0.b[0]                 // v1 = left[y] in every lane
    ext             v0.16b, v0.16b, v0.16b, #1      // rotate so lane 0 becomes left[y + 1]
    add             v16.8h, v16.8h, v6.8h           // v16,v17 = (15 - y) * above[x] + (x + 1) * topRight + (y + 1) * bottomLeft
    add             v17.8h, v17.8h, v7.8h
    mov             v18.16b, v16.16b                // copies, because umlal accumulates in place
    mov             v19.16b, v17.16b
    umlal           v18.8h, v5.8b, v1.8b            // v18,v19 = v16,v17 + (15 - x) * left[y]
    umlal2          v19.8h, v5.16b, v1.16b
    rshrn           v18.8b, v18.8h, #5              // (sum + 16) >> 5, narrowed to bytes
    rshrn2          v18.16b, v19.8h, #5
    st1             {v18.16b}, [x0], x1             // store one row, dst += dstStride
    sub             w4, w4, #1
    cbnz            w4, 1b

    ret
endfunc