;***************************************************************************** ;* Copyright (C) 2013-2020 MulticoreWare, Inc ;* ;* Authors: Min Chen ;* Praveen Kumar Tiwari ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at license @ x265.com. 
;*****************************************************************************/

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA 32

;-----------------------------------------------------------------------------
; Byte index / permutation tables.
; `times N` emits the listed bytes N times.
; NOTE(review): the "shuf"/"shuff" names suggest byte-shuffle masks; the code
; that consumes these tables is not visible here -- confirm at the use sites.
;-----------------------------------------------------------------------------
const intra_pred_shuff_0_8,     times 2 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
                                db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9

intra_pred_shuff_15_0:          times 2 db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

intra_filter4_shuf0:            times 2 db 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13
intra_filter4_shuf1:            times 2 db 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13
intra_filter4_shuf2:            times 2 db 4, 5, 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15

pb_0_8:                         times 8 db 0, 8
pb_unpackbw1:                   times 2 db 1, 8, 2, 8, 3, 8, 4, 8
pb_swap8:                       times 2 db 7, 6, 5, 4, 3, 2, 1, 0
c_trans_4x4:                    db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
const tab_S1,                   db 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0
const tab_S2,                   db 0, 1, 3, 5, 7, 9, 11, 13, 0, 0, 0, 0, 0, 0, 0, 0
const tab_Si,                   db 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7
pb_fact0:                       db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0

; 16-byte c_mode32_* tables
c_mode32_12_0:                  db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 7, 0
c_mode32_13_0:                  db 3, 6, 10, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
c_mode32_13_shuf:               db 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0
c_mode32_14_shuf:               db 15, 14, 13, 0, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15
c_mode32_14_0:                  db 15, 12, 10, 7, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
c_mode32_15_0:                  db 15, 13, 11, 9, 8, 6, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0
c_mode32_16_0:                  db 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, 0, 0, 0, 0, 0
c_mode32_17_0:                  db 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0
c_mode32_18_0:                  db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

c_shuf8_0:                      db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
c_deinterval8:                  db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
pb_unpackbq:                    db 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1

; 16-byte c_mode16_* tables
c_mode16_12:                    db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 6
c_mode16_13:                    db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4
c_mode16_14:                    db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2
c_mode16_15:                    db 0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2
c_mode16_16:                    db 8, 6, 5, 3, 2, 0, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2
c_mode16_17:                    db 4, 2, 1, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1
c_mode16_18:                    db 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1

;-----------------------------------------------------------------------------
; c_ang8_* tables, 32 bytes each (two 16-byte halves).
;   c_ang8_src*  : overlapping byte-index pairs (i, i+1).
;   other tables : each half is one (a, b) byte pair repeated eight times;
;                  in every pair a + b == 32.
; Every table must stay exactly 32 bytes so that the tables following
; ALIGN 32 keep their 32-byte spacing.
;-----------------------------------------------------------------------------
ALIGN 32
c_ang8_src1_9_2_10:
        db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
        db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
c_ang8_26_20:
        times 8 db  6, 26
        times 8 db 12, 20
c_ang8_src3_11_4_12:
        db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
        db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
c_ang8_14_8:
        times 8 db 18, 14
        times 8 db 24,  8
c_ang8_src5_13_5_13:
        times 2 db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
c_ang8_2_28:
        times 8 db 30,  2
        times 8 db  4, 28
c_ang8_src6_14_7_15:
        db 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13
        db 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
c_ang8_22_16:
        times 8 db 10, 22
        times 8 db 16, 16
c_ang8_21_10:
        times 8 db 11, 21
        times 8 db 22, 10
c_ang8_src2_10_3_11:
        db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
        db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
c_ang8_31_20:
        times 8 db  1, 31
        times 8 db 12, 20
c_ang8_src4_12_4_12:
        times 2 db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
c_ang8_9_30:
        times 8 db 23,  9
        times 8 db  2, 30
c_ang8_src5_13_6_14:
        db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
        db 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13
c_ang8_19_8:
        times 8 db 13, 19
        times 8 db 24,  8
c_ang8_17_2:
        times 8 db 15, 17
        times 8 db 30,  2
c_ang8_19_4:
        times 8 db 13, 19
        times 8 db 28,  4
c_ang8_21_6:
        times 8 db 11, 21
        times 8 db 26,  6
c_ang8_23_8:
        times 8 db  9, 23
        times 8 db 24,  8
c_ang8_src4_12_5_13:
        db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
        db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
c_ang8_13_26:
        times 8 db 19, 13
        times 8 db  6, 26
c_ang8_7_20:
        times 8 db 25,  7
        times 8 db 12, 20
c_ang8_1_14:
        times 8 db 31,  1
        times 8 db 18, 14
c_ang8_27_8:
        times 8 db  5, 27
        times 8 db 24,  8
c_ang8_src2_10_2_10:
        times 2 db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
c_ang8_src3_11_3_11:
        times 2 db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
c_ang8_31_8:
        times 8 db  1, 31
        times 8 db 24,  8
c_ang8_13_22:
        times 8 db 19, 13
        times 8 db 10, 22
; The 32 data bytes for this label follow on the next line.
c_ang8_27_4:
; Payload of the c_ang8_27_4 label declared on the line just above.
        times 8 db  5, 27
        times 8 db 28,  4
c_ang8_9_18:
        times 8 db 23,  9
        times 8 db 14, 18
c_ang8_5_10:
        times 8 db 27,  5
        times 8 db 22, 10
c_ang8_15_20:
        times 8 db 17, 15
        times 8 db 12, 20
c_ang8_25_30:
        times 8 db  7, 25
        times 8 db  2, 30
c_ang8_3_8:
        times 8 db 29,  3
        times 8 db 24,  8

;-----------------------------------------------------------------------------
; c_ang8_mode_* / c_ang16_mode_* tables.
; Layout: a sequence of 16-byte groups; each group is one (a, b) byte pair
; repeated eight times, and a + b == 32 in every pair.  Two consecutive
; groups form one 32-byte row.
;-----------------------------------------------------------------------------
c_ang8_mode_27:
        times 8 db 30,  2
        times 8 db 28,  4
        times 8 db 26,  6
        times 8 db 24,  8
        times 8 db 22, 10
        times 8 db 20, 12
        times 8 db 18, 14
        times 8 db 16, 16

c_ang8_mode_25:
        times 8 db  2, 30
        times 8 db  4, 28
        times 8 db  6, 26
        times 8 db  8, 24
        times 8 db 10, 22
        times 8 db 12, 20
        times 8 db 14, 18
        times 8 db 16, 16

c_ang8_mode_24:
        times 8 db  5, 27
        times 8 db 10, 22
        times 8 db 15, 17
        times 8 db 20, 12
        times 8 db 25,  7
        times 8 db 30,  2
        times 8 db  3, 29
        times 8 db  8, 24

ALIGN 32
c_ang16_mode_25:
        times 8 db  2, 30
        times 8 db  4, 28
        times 8 db  6, 26
        times 8 db  8, 24
        times 8 db 10, 22
        times 8 db 12, 20
        times 8 db 14, 18
        times 8 db 16, 16
        times 8 db 18, 14
        times 8 db 20, 12
        times 8 db 22, 10
        times 8 db 24,  8
        times 8 db 26,  6
        times 8 db 28,  4
        times 8 db 30,  2
        times 8 db 32,  0

ALIGN 32
c_ang16_mode_11:
        times 8 db  2, 30
        times 8 db 18, 14
        times 8 db  4, 28
        times 8 db 20, 12
        times 8 db  6, 26
        times 8 db 22, 10
        times 8 db  8, 24
        times 8 db 24,  8
        times 8 db 10, 22
        times 8 db 26,  6
        times 8 db 12, 20
        times 8 db 28,  4
        times 8 db 14, 18
        times 8 db 30,  2
        times 8 db 16, 16
        times 8 db 32,  0

ALIGN 32
c_ang16_mode_12:
        times 8 db  5, 27
        times 8 db 13, 19
        times 8 db 10, 22
        times 8 db 18, 14
        times 8 db 15, 17
        times 8 db 23,  9
        times 8 db 20, 12
        times 8 db 28,  4
        times 8 db 25,  7
        times 8 db  1, 31
        times 8 db 30,  2
        times 8 db  6, 26
        times 8 db  3, 29
        times 8 db 11, 21
        times 8 db  8, 24
        times 8 db 16, 16

ALIGN 32
c_ang16_mode_13:
        times 8 db  9, 23
        times 8 db 17, 15
        times 8 db 18, 14
        times 8 db 26,  6
        times 8 db 27,  5
        times 8 db  3, 29
        times 8 db  4, 28
        times 8 db 12, 20
        times 8 db 13, 19
        times 8 db 21, 11
        times 8 db 22, 10
        times 8 db 30,  2
        times 8 db 31,  1
        times 8 db  7, 25
        times 8 db  8, 24
        times 8 db 16, 16

ALIGN 32
c_ang16_mode_28:
        times 8 db 27,  5
        times 8 db 22, 10
        times 8 db 17, 15
        times 8 db 12, 20
        times 8 db  7, 25
        times 8 db  2, 30
        times 8 db 29,  3
        times 8 db 24,  8
        times 8 db 19, 13
        times 8 db 14, 18
        times 8 db  9, 23
        times 8 db  4, 28
        times 8 db 31,  1
        times 8 db 26,  6
        times 8 db 21, 11
        times 8 db 16, 16

ALIGN 32
c_ang16_mode_9:
        times 8 db 30,  2
        times 8 db 14, 18
        times 8 db 28,  4
        times 8 db 12, 20
        times 8 db 26,  6
        times 8 db 10, 22
        times 8 db 24,  8
        times 8 db  8, 24
        times 8 db 22, 10
        times 8 db  6, 26
        times 8 db 20, 12
        times 8 db  4, 28
        times 8 db 18, 14
        times 8 db  2, 30
        times 8 db 16, 16
        times 8 db 32,  0

ALIGN 32
c_ang16_mode_27:
        times 8 db 30,  2
        times 8 db 28,  4
        times 8 db 26,  6
        times 8 db 24,  8
        times 8 db 22, 10
        times 8 db 20, 12
        times 8 db 18, 14
        times 8 db 16, 16
        times 8 db 14, 18
        times 8 db 12, 20
        times 8 db 10, 22
        times 8 db  8, 24
        times 8 db  6, 26
        times 8 db  4, 28
        times 8 db  2, 30
        times 8 db  2, 30
        times 8 db 32,  0
        times 8 db 32,  0

ALIGN 32
intra_pred_shuff_0_15:
        db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
        db 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 15

ALIGN 32
c_ang16_mode_29:
        times 8 db 23,  9
        times 8 db 14, 18
        times 8 db  5, 27
        times 8 db  5, 27
        times 8 db 28,  4
        times 8 db 19, 13
        times 8 db 10, 22
        times 8 db  1, 31
        times 8 db 24,  8
        times 8 db 15, 17
        times 8 db  6, 26
        times 8 db  6, 26
        times 8 db 29,  3
        times 8 db 20, 12
        times 8 db 11, 21
        times 8 db  2, 30
        times 8 db 25,  7
        times 8 db 16, 16

ALIGN 32
c_ang16_mode_30:
        times 8 db 19, 13
        times 8 db  6, 26
        times 8 db 25,  7
        times 8 db 12, 20
        times 8 db 31,  1
        times 8 db 18, 14
        times 8 db  5, 27
        times 8 db  5, 27
        times 8 db 24,  8
        times 8 db 11, 21
        times 8 db 30,  2
        times 8 db 17, 15
        times 8 db  4, 28
        times 8 db  4, 28
        times 8 db 23,  9
        times 8 db 10, 22
        times 8 db 29,  3
        times 8 db 16, 16

ALIGN 32
c_ang16_mode_31:
        times 8 db 15, 17
        times 8 db 15, 17
        times 8 db 30,  2
        times 8 db 13, 19
        times 8 db 28,  4
        times 8 db 11, 21
        times 8 db 26,  6
        times 8 db  9, 23
        times 8 db 24,  8
        times 8 db  7, 25
        times 8 db 22, 10
        times 8 db  5, 27
        times 8 db 20, 12
        times 8 db  3, 29
        times 8 db 18, 14
        times 8 db  1, 31
        times 8 db 16, 16
        times 8 db 16, 16

ALIGN 32
c_ang16_mode_24:
        times 8 db  5, 27
        times 8 db 10, 22
        times 8 db 15, 17
        times 8 db 20, 12
        times 8 db 25,  7
        times 8 db 30,  2
        times 8 db  3, 29
        times 8 db  8, 24
        times 8 db 13, 19
        times 8 db 18, 14
        times 8 db 23,  9
        times 8 db 28,  4
        times 8 db  1, 31
        times 8 db  6, 26
        times 8 db 11, 21
        times 8 db 16, 16

ALIGN 32
c_ang16_mode_23:
        times 8 db  9, 23
        times 8 db 18, 14
        times 8 db 27,  5
        times 8 db 27,  5
        times 8 db  4, 28
        times 8 db 13, 19
        times 8 db 22, 10
        times 8 db 31,  1
        times 8 db  8, 24
        times 8 db 17, 15
        times 8 db 26,  6
        times 8 db 26,  6
        times 8 db  3, 29
        times 8 db 12, 20
        times 8 db 21, 11
        times 8 db 30,  2
        times 8 db  7, 25
        times 8 db 16, 16

ALIGN 32
c_ang16_mode_22:
        times 8 db 13, 19
        times 8 db 26,  6
        times 8 db  7, 25
        times 8 db 20, 12
        times 8 db  1, 31
        times 8 db 14, 18
        times 8 db 27,  5
        times 8 db 27,  5
        times 8 db  8, 24
        times 8 db 21, 11
        times 8 db  2, 30
        times 8 db 15, 17
        times 8 db 28,  4
        times 8 db 28,  4
        times 8 db  9, 23
        times 8 db 22, 10
        times 8 db  3, 29
        times 8 db 16, 16

;-----------------------------------------------------------------------------
; intra_pred4_shuff* : 32-byte index tables, written as four 8-byte rows.
;-----------------------------------------------------------------------------
ALIGN 32
intra_pred_shuff_0_4:
        times 4 db 0, 1, 1, 2, 2, 3, 3, 4

intra_pred4_shuff1:
        db 0, 1, 1, 2, 2, 3, 3, 4
        db 0, 1, 1, 2, 2, 3, 3, 4
        db 0, 1, 1, 2, 2, 3, 3, 4
        db 1, 2, 2, 3, 3, 4, 4, 5
intra_pred4_shuff2:
        db 0, 1, 1, 2, 2, 3, 3, 4
        db 0, 1, 1, 2, 2, 3, 3, 4
        db 1, 2, 2, 3, 3, 4, 4, 5
        db 1, 2, 2, 3, 3, 4, 4, 5
intra_pred4_shuff31:
        db 0, 1, 1, 2, 2, 3, 3, 4
        db 1, 2, 2, 3, 3, 4, 4, 5
        db 1, 2, 2, 3, 3, 4, 4, 5
        db 2, 3, 3, 4, 4, 5, 5, 6
intra_pred4_shuff33:
        db 0, 1, 1, 2, 2, 3, 3, 4
        db 1, 2, 2, 3, 3, 4, 4, 5
        db 2, 3, 3, 4, 4, 5, 5, 6
        db 3, 4, 4, 5, 5, 6, 6, 7
intra_pred4_shuff3:
        db 8, 9, 9, 10, 10, 11, 11, 12
        db 9, 10, 10, 11, 11, 12, 12, 13
        db 10, 11, 11, 12, 12, 13, 13, 14
        db 11, 12, 12, 13, 13, 14, 14, 15
intra_pred4_shuff4:
        db 9, 10, 10, 11, 11, 12, 12, 13
        db 10, 11, 11, 12, 12, 13, 13, 14
        db 10, 11, 11, 12, 12, 13, 13, 14
        db 11, 12, 12, 13, 13, 14, 14, 15
intra_pred4_shuff5:
        db 9, 10, 10, 11, 11, 12, 12, 13
        db 10, 11, 11, 12, 12, 13, 13, 14
        db 10, 11, 11, 12, 12, 13, 13, 14
        db 11, 12, 12, 13, 13, 14, 14, 15
intra_pred4_shuff6:
        db 9, 10, 10, 11, 11, 12, 12, 13
        db 9, 10, 10, 11, 11, 12, 12, 13
        db 10, 11, 11, 12, 12, 13, 13, 14
        db 10, 11, 11, 12, 12, 13, 13, 14
intra_pred4_shuff7:
        db 9, 10, 10, 11, 11, 12, 12, 13
        db 9, 10, 10, 11, 11, 12, 12, 13
        db 9, 10, 10, 11, 11, 12, 12, 13
        db 10, 11, 11, 12, 12, 13, 13, 14
intra_pred4_shuff9:
        db 9, 10, 10, 11, 11, 12, 12, 13
        db 9, 10, 10, 11, 11, 12, 12, 13
        db 9, 10, 10, 11, 11, 12, 12, 13
        db 9, 10, 10, 11, 11, 12, 12, 13
intra_pred4_shuff12:
        db 0, 9, 9, 10, 10, 11, 11, 12
        db 0, 9, 9, 10, 10, 11, 11, 12
        db 0, 9, 9, 10, 10, 11, 11, 12
        db 0, 9, 9, 10, 10, 11, 11, 12
intra_pred4_shuff13:
        db 0, 9, 9, 10, 10, 11, 11, 12
        db 0, 9, 9, 10, 10, 11, 11, 12
        db 0, 9, 9, 10, 10, 11, 11, 12
        db 4, 0, 0, 9, 9, 10, 10, 11
intra_pred4_shuff14:
        db 0, 9, 9, 10, 10, 11, 11, 12
        db 0, 9, 9, 10, 10, 11, 11, 12
        db 2, 0, 0, 9, 9, 10, 10, 11
        db 2, 0, 0, 9, 9, 10, 10, 11
intra_pred4_shuff15:
        db 0, 9, 9, 10, 10, 11, 11, 12
        db 2, 0, 0, 9, 9, 10, 10, 11
        db 2, 0, 0, 9, 9, 10, 10, 11
        db 4, 2, 2, 0, 0, 9, 9, 10
intra_pred4_shuff16:
        db 0, 9, 9, 10, 10, 11, 11, 12
        db 2, 0, 0, 9, 9, 10, 10, 11
        db 2, 0, 0, 9, 9, 10, 10, 11
        db 3, 2, 2, 0, 0, 9, 9, 10
intra_pred4_shuff17:
        db 0, 9, 9, 10, 10, 11, 11, 12
        db 1, 0, 0, 9, 9, 10, 10, 11
        db 2, 1, 1, 0, 0, 9, 9, 10
        db 4, 2, 2, 1, 1, 0, 0, 9
intra_pred4_shuff19:
        db 0, 1, 1, 2, 2, 3, 3, 4
        db 9, 0, 0, 1, 1, 2, 2, 3
        db 10, 9, 9, 0, 0, 1, 1, 2
        db 12, 10, 10, 9, 9, 0, 0, 1
intra_pred4_shuff20:
        db 0, 1, 1, 2, 2, 3, 3, 4
        db 10, 0, 0, 1, 1, 2, 2, 3
        db 10, 0, 0, 1, 1, 2, 2, 3
        db 11, 10, 10, 0, 0, 1, 1, 2
intra_pred4_shuff21:
        db 0, 1, 1, 2, 2, 3, 3, 4
        db 10, 0, 0, 1, 1, 2, 2, 3
        db 10, 0, 0, 1, 1, 2, 2, 3
        db 12, 10, 10, 0, 0, 1, 1, 2
intra_pred4_shuff22:
        db 0, 1, 1, 2, 2, 3, 3, 4
        db 0, 1, 1, 2, 2, 3, 3, 4
        db 10, 0, 0, 1, 1, 2, 2, 3
        db 10, 0, 0, 1, 1, 2, 2, 3
intra_pred4_shuff23:
        db 0, 1, 1, 2, 2, 3, 3, 4
        db 0, 1, 1, 2, 2, 3, 3, 4
        db 0, 1, 1, 2, 2, 3, 3, 4
        db 12, 0, 0, 1, 1, 2, 2, 3

;-----------------------------------------------------------------------------
; c_ang4_mode_* : 32 bytes each = four 8-byte groups; each group is one
; (a, b) byte pair repeated four times, with a + b == 32.
;-----------------------------------------------------------------------------
c_ang4_mode_27:
        times 4 db 30,  2
        times 4 db 28,  4
        times 4 db 26,  6
        times 4 db 24,  8
c_ang4_mode_28:
        times 4 db 27,  5
        times 4 db 22, 10
        times 4 db 17, 15
        times 4 db 12, 20
c_ang4_mode_29:
        times 4 db 23,  9
        times 4 db 14, 18
        times 4 db  5, 27
        times 4 db 28,  4
c_ang4_mode_30:
        times 4 db 19, 13
        times 4 db  6, 26
        times 4 db 25,  7
        times 4 db 12, 20
c_ang4_mode_31:
        times 4 db 15, 17
        times 4 db 30,  2
        times 4 db 13, 19
        times 4 db 28,  4
c_ang4_mode_32:
        times 4 db 11, 21
        times 4 db 22, 10
        times 4 db  1, 31
        times 4 db 12, 20
c_ang4_mode_33:
        times 4 db  6, 26
        times 4 db 12, 20
        times 4 db 18, 14
        times 4 db 24,  8
c_ang4_mode_5:
        times 4 db 15, 17
        times 4 db 30,  2
        times 4 db 13, 19
        times 4 db 28,  4
;-----------------------------------------------------------------------------
; c_ang4_mode_* : 32 bytes each = four 8-byte groups; each group is one
; (a, b) byte pair repeated four times, with a + b == 32.
; Every table here must be exactly 32 bytes: the tables are laid out back to
; back with no ALIGN between them, so a single extra byte would shift every
; table after it off the 32-byte grid.
;-----------------------------------------------------------------------------
c_ang4_mode_6:
        times 4 db 19, 13
        times 4 db  6, 26
        times 4 db 25,  7
        times 4 db 12, 20
c_ang4_mode_7:
        times 4 db 23,  9
        times 4 db 14, 18
        times 4 db  5, 27
        times 4 db 28,  4
c_ang4_mode_8:
        times 4 db 27,  5
        times 4 db 22, 10
        times 4 db 17, 15
        times 4 db 12, 20
c_ang4_mode_9:
        times 4 db 30,  2
        times 4 db 28,  4
        times 4 db 26,  6
        times 4 db 24,  8
c_ang4_mode_11:
        times 4 db  2, 30
        times 4 db  4, 28
        times 4 db  6, 26
        times 4 db  8, 24
c_ang4_mode_12:
        times 4 db  5, 27
        times 4 db 10, 22
        times 4 db 15, 17
        times 4 db 20, 12
c_ang4_mode_13:
        times 4 db  9, 23
        times 4 db 18, 14
        times 4 db 27,  5
        times 4 db  4, 28
c_ang4_mode_14:
        times 4 db 13, 19
        times 4 db 26,  6
        times 4 db  7, 25
        times 4 db 20, 12
; Same 32 bytes as c_ang4_mode_21 below.
c_ang4_mode_15:
        times 4 db 17, 15
        times 4 db  2, 30
        times 4 db 19, 13
        times 4 db  4, 28
c_ang4_mode_16:
        times 4 db 21, 11
        times 4 db 10, 22
        times 4 db 31,  1
        times 4 db 20, 12
c_ang4_mode_17:
        times 4 db 26,  6
        times 4 db 20, 12
        times 4 db 14, 18
        times 4 db  8, 24
c_ang4_mode_19:
        times 4 db 26,  6
        times 4 db 20, 12
        times 4 db 14, 18
        times 4 db  8, 24
c_ang4_mode_20:
        times 4 db 21, 11
        times 4 db 10, 22
        times 4 db 31,  1
        times 4 db 20, 12
c_ang4_mode_21:
        times 4 db 17, 15
        times 4 db  2, 30
        times 4 db 19, 13
        times 4 db  4, 28
c_ang4_mode_22:
        times 4 db 13, 19
        times 4 db 26,  6
        times 4 db  7, 25
        times 4 db 20, 12
c_ang4_mode_23:
        times 4 db  9, 23
        times 4 db 18, 14
        times 4 db 27,  5
        times 4 db  4, 28
c_ang4_mode_24:
        times 4 db  5, 27
        times 4 db 10, 22
        times 4 db 15, 17
        times 4 db 20, 12
c_ang4_mode_25:
        times 4 db  2, 30
        times 4 db  4, 28
        times 4 db  6, 26
        times 4 db  8, 24

ALIGN 32
;; (blkSize - 1 - x)
pw_planar4_0:           dw 3, 2, 1, 0, 3, 2, 1, 0

;-----------------------------------------------------------------------------
; c_ang8_mode_13/14/15 : four 32-byte rows each; every 16-byte half is one
; (a, b) byte pair repeated eight times, with a + b == 32.
;-----------------------------------------------------------------------------
ALIGN 32
c_ang8_mode_13:
        times 8 db  9, 23
        times 8 db 18, 14
        times 8 db 27,  5
        times 8 db  4, 28
        times 8 db 13, 19
        times 8 db 22, 10
        times 8 db 31,  1
        times 8 db  8, 24

ALIGN 32
c_ang8_mode_14:
        times 8 db 13, 19
        times 8 db 26,  6
        times 8 db  7, 25
        times 8 db 20, 12
        times 8 db  1, 31
        times 8 db 14, 18
        times 8 db 27,  5
        times 8 db  8, 24

ALIGN 32
c_ang8_mode_15:
        times 8 db 17, 15
        times 8 db  2, 30
        times 8 db 19, 13
        times 8 db  4, 28
        times 8 db 21, 11
        times 8 db  6, 26
        times 8 db 23,  9
        times 8 db  8, 24

const c_ang8_mode_16,       db 8, 7, 6, 5, 4, 3, 2, 1, 0, 9, 10, 12, 13, 15, 0, 0

const intra_pred8_shuff16,  db 0, 1, 1, 2, 3, 3, 4, 5
                            db 1, 2, 2, 3, 4, 4, 5, 6
                            db 2, 3, 3, 4, 5, 5, 6, 7
                            db 3, 4, 4, 5, 6, 6, 7, 8
                            db 4, 5, 5, 6, 7, 7, 8, 9

; (32 - f), f byte pairs
const angHor8_tab_16,       db (32-11), 11, (32-22), 22, (32-1 ),  1, (32-12), 12, (32-23), 23, (32- 2),  2, (32-13), 13, (32-24), 24

const c_ang8_mode_20,       db 15, 13, 12, 10, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0

; NOTE: this large table improves speed by ~10%. If a broadcast instruction
; that works on the high 128 bits becomes available in the future, the table
; can be removed.
const angHor8_tab_20,       times 8 db (32-24), 24
                            times 8 db (32-13), 13
                            times 8 db (32- 2),  2
                            times 8 db (32-23), 23
                            times 8 db (32-12), 12
                            times 8 db (32- 1),  1
                            times 8 db (32-22), 22
                            times 8 db (32-11), 11

const ang16_shuf_mode9,     times 8 db 0, 1
                            times 8 db 1, 2

const angHor_tab_9,         db (32-2),   2, (32-4),   4, (32-6),   6, (32-8),   8, (32-10), 10, (32-12), 12, (32-14), 14, (32-16), 16
                            db (32-18), 18, (32-20), 20, (32-22), 22, (32-24), 24, (32-26), 26, (32-28), 28, (32-30), 30, (32-32), 32

const angHor_tab_11,        db (32-30), 30, (32-28), 28, (32-26), 26, (32-24), 24, (32-22), 22, (32-20), 20, (32-18), 18, (32-16), 16
                            db (32-14), 14, (32-12), 12, (32-10), 10, (32- 8),  8, (32- 6),  6, (32- 4),  4, (32- 2),  2, (32- 0),  0

const ang16_shuf_mode12,    db 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 2, 3, 2, 3
                            db 1, 2, 1, 2, 1, 2, 1, 2, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 1, 2, 1, 2

const angHor_tab_12,        db (32-27), 27, (32-22), 22, (32-17), 17, (32-12), 12, (32-7),   7, (32-2),   2, (32-29), 29, (32-24), 24
                            db (32-19), 19, (32-14), 14, (32-9),   9, (32-4),   4, (32-31), 31, (32-26), 26, (32-21), 21, (32-16), 16

; First two 32-byte rows; a further db row follows on the next line.
const ang16_shuf_mode13,    db 4, 5, 4, 5, 4, 5, 3, 4, 3, 4, 3, 4, 3, 4, 2, 3, 5, 6, 5, 6, 5, 6, 4, 5, 4, 5, 4, 5, 4, 5, 3, 4
                            db 2, 3, 2, 3, 1, 2, 1, 2, 1, 2, 1, 2, 0, 1, 0, 1, 3, 4, 3, 4, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2
db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4, 0, 0 ,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4, 0 const angHor_tab_13, db (32-23), 23, (32-14), 14, (32-5), 5, (32-28), 28, (32-19), 19, (32-10), 10, (32-1), 1, (32-24), 24 db (32-15), 15, (32-6), 6, (32-29), 29, (32-20), 20, (32-11), 11, (32-2), 2, (32-25), 25, (32-16), 16 const ang16_shuf_mode14, db 6, 7, 6, 7, 5, 6, 5, 6, 4, 5, 4, 5, 4, 5, 3, 4, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6, 5, 6, 5, 6, 4, 5 db 3, 4, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 0, 1, 0, 1, 4, 5, 3, 4, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2, 1, 2 db 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2, 0 const angHor_tab_14, db (32-19), 19, (32-6), 6, (32-25), 25, (32-12), 12, (32-31), 31, (32-18), 18, (32-5), 5, (32-24), 24 db (32-11), 11, (32-30), 30, (32-17), 17, (32-4), 4, (32-23), 23, (32-10), 10, (32-29), 29, (32-16), 16 const ang16_shuf_mode15, db 8, 9, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6, 5, 6, 4, 5, 9, 10, 8, 9, 8, 9, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6 db 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2, 1, 2, 0, 1, 5, 6, 4, 5, 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2 db 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2, 0 const angHor_tab_15, db (32-15), 15, (32-30), 30, (32-13), 13, (32-28), 28, (32-11), 11, (32-26), 26, (32-9), 9, (32-24), 24 db (32-7), 7, (32-22), 22, (32-5), 5, (32-20), 20, (32-3), 3, (32-18), 18, (32-1), 1, (32- 16), 16 const ang16_shuf_mode16, db 10, 11, 9, 10, 9, 10, 8, 9, 7, 8, 7, 8, 6, 7, 5, 6, 11, 12, 10, 11, 10, 11, 9, 10, 8, 9, 8, 9, 7, 8, 6, 7 db 5, 6, 4, 5, 3, 4, 3, 4, 2, 3, 1, 2, 1, 2, 0, 1, 6, 7, 5, 6, 4, 5, 4, 5, 3, 4, 2, 3, 2, 3, 1, 2 db 0 ,0, 0, 0, 0, 15, 14, 12 , 11, 9, 8, 6, 5, 3, 2, 0, 0, 0, 0, 0, 0, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0 const angHor_tab_16, db (32-11), 11, (32-22), 22, (32-1), 1, (32-12), 12, (32-23), 23, (32-2), 2, (32-13), 13, (32-24), 24 db (32-3), 3, (32-14), 14, (32-25), 25, (32-4), 4, (32-15), 15, (32-26), 26, (32-5), 5, (32-16), 16 const 
ang16_shuf_mode17, db 12, 13, 11, 12, 10, 11, 9, 10, 8, 9, 8, 9, 7, 8, 6, 7, 13, 14, 12, 13, 11, 12, 10, 11, 9, 10, 9, 10, 8, 9, 7, 8 db 5, 6, 4, 5, 4, 5, 3, 4, 2, 3, 1, 2, 0, 1, 0, 1, 6, 7, 5, 6, 5, 6, 4, 5, 3, 4, 2, 3, 1, 2, 1, 2 db 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0 const angHor_tab_17, db (32- 6), 6, (32-12), 12, (32-18), 18, (32-24), 24, (32-30), 30, (32- 4), 4, (32-10), 10, (32-16), 16 db (32-22), 22, (32-28), 28, (32- 2), 2, (32- 8), 8, (32-14), 14, (32-20), 20, (32-26), 26, (32- 0), 0 ; Intrapred_angle32x32, modes 1 to 33 constants const ang32_shuf_mode9, times 8 db 0, 1 times 8 db 1, 2 const ang32_shuf_mode11, times 8 db 1, 2 times 8 db 0, 1 const ang32_fact_mode12, db (32-27), 27, (32-22), 22, (32-17), 17, (32-12), 12, (32- 7), 7, (32- 2), 2, (32-29), 29, (32-24), 24 db (32-11), 11, (32- 6), 6, (32- 1), 1, (32-28), 28, (32-23), 23, (32-18), 18, (32-13), 13, (32- 8), 8 db (32-19), 19, (32-14), 14, (32- 9), 9, (32- 4), 4, (32-31), 31, (32-26), 26, (32-21), 21, (32-16), 16 db (32- 3), 3, (32-30), 30, (32-25), 25, (32-20), 20, (32-15), 15, (32-10), 10, (32- 5), 5, (32- 0), 0 const ang32_shuf_mode12, db 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 db 3, 4, 3, 4, 3, 4, 3, 4, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 const ang32_shuf_mode24, db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 13, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 10, 3, 3 dd 0, 0, 7, 3, 0, 0, 7, 3 const ang32_fact_mode13, db (32-23), 23, (32-14), 14, (32- 5), 5, (32-28), 28, (32-19), 19, (32-10), 10, (32- 1), 1, (32-24), 24 db (32- 7), 7, (32-30), 30, (32-21), 21, (32-12), 12, (32- 3), 3, (32-26), 26, (32-17), 17, (32- 8), 8 db (32-15), 15, (32- 6), 6, (32-29), 29, (32-20), 20, (32-11), 11, (32- 2), 2, (32-25), 25, (32-16), 16 db (32-31), 31, (32-22), 22, (32-13), 13, (32- 4), 4, (32-27), 27, (32-18), 18, (32- 9), 9, (32- 0), 0 const 
ang32_shuf_mode13, db 14, 15, 14, 15, 14, 15, 13, 14, 13, 14, 13, 14, 13, 14, 12, 13, 10, 11, 9, 10, 9, 10, 9, 10, 9, 10, 8, 9, 8, 9, 8, 9 db 12, 13, 12, 13, 11, 12, 11, 12, 11, 12, 11, 12, 10, 11, 10, 11, 7, 8, 7, 8, 7, 8, 7, 8, 6, 7, 6, 7, 6, 7, 6, 7 db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 9, 5, 2 const ang32_shuf_mode23, db 0, 0, 0, 0, 0, 0, 0, 0, 14, 14, 11, 11, 7, 7, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 9, 9, 5, 5, 2, 2 const ang32_fact_mode14, db (32-19), 19, (32- 6), 6, (32-25), 25, (32-12), 12, (32-31), 31, (32-18), 18, (32- 5), 5, (32-24), 24 db (32- 3), 3, (32-22), 22, (32- 9), 9, (32-28), 28, (32-15), 15, (32- 2), 2, (32-21), 21, (32- 8), 8 db (32-11), 11, (32-30), 30, (32-17), 17, (32- 4), 4, (32-23), 23, (32-10), 10, (32-29), 29, (32-16), 16 db (32-27), 27, (32-14), 14, (32- 1), 1, (32-20), 20, (32- 7), 7, (32-26), 26, (32-13), 13, (32- 0), 0 const ang32_shuf_mode14, db 14, 15, 14, 15, 13, 14, 13, 14, 12, 13, 12, 13, 12, 13, 11, 12, 8, 9, 7, 8, 7, 8, 6, 7, 6, 7, 6, 7, 5, 6, 5, 6 db 11, 12, 10, 11, 10, 11, 10, 11, 9, 10, 9, 10, 8, 9, 8, 9, 4, 5, 4, 5, 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 2, 3 db 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 9, 6, 4, 1 const ang32_shuf_mode22, db 0, 0, 15, 15, 13, 13, 10, 10, 8, 8, 5, 5, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 9, 9, 7, 7, 4, 4, 2 const ang32_fact_mode15, db (32-15), 15, (32-30), 30, (32-13), 13, (32-28), 28, (32-11), 11, (32-26), 26, (32- 9), 9, (32-24), 24 db (32-31), 31, (32-14), 14, (32-29), 29, (32-12), 12, (32-27), 27, (32-10), 10, (32-25), 25, (32- 8), 8 db (32- 7), 7, (32-22), 22, (32- 5), 5, (32-20), 20, (32- 3), 3, (32-18), 18, (32- 1), 1, (32-16), 16 db (32-23), 23, (32- 6), 6, (32-21), 21, (32- 4), 4, (32-19), 19, (32- 2), 2, (32-17), 17, (32- 0), 0 const ang32_shuf_mode15, db 14, 15, 13, 14, 13, 14, 12, 13, 12, 13, 11, 12, 11, 12, 10, 11, 5, 6, 5, 6, 4, 5, 4, 5, 3, 4, 3, 4, 2, 3, 2, 3 db 12, 13, 11, 12, 
11, 12, 10, 11, 10, 11, 9, 10, 9, 10, 8, 9, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2, 1, 2, 0, 1, 0, 1 db 0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 14, 12, 10, 8, 7, 5, 3, 1 const ang32_shuf_mode21, db 15, 15, 13, 13, 11, 11, 9, 9, 8, 8, 6, 6, 4, 4, 2, 2, 14, 14, 12, 12, 10, 10, 8, 8, 7, 7, 5, 5, 3, 3, 1, 1 const ang32_fact_mode16, db (32-11), 11, (32-22), 22, (32- 1), 1, (32-12), 12, (32-23), 23, (32- 2), 2, (32-13), 13, (32-24), 24 db (32- 3), 3, (32-14), 14, (32-25), 25, (32- 4), 4, (32-15), 15, (32-26), 26, (32- 5), 5, (32-16), 16 db (32-27), 27, (32- 6), 6, (32-17), 17, (32-28), 28, (32- 7), 7, (32-18), 18, (32-29), 29, (32- 8), 8 db (32-19), 19, (32-30), 30, (32- 9), 9, (32-20), 20, (32-31), 31, (32-10), 10, (32-21), 21, (32- 0), 0 const ang32_shuf_mode16, db 14, 15, 13, 14, 13, 14, 12, 13, 11, 12, 11, 12, 10, 11, 9, 10, 9, 10, 8, 9, 7, 8, 7, 8, 6, 7, 5, 6, 5, 6, 4, 5 db 14, 15, 14, 15, 13, 14, 12, 13, 12, 13, 11, 12, 10, 11, 10, 11, 9, 10, 8, 9, 8, 9, 7, 8, 6, 7, 6, 7, 5, 6, 5, 6 db 0, 0, 0, 0, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 14, 13, 11, 10, 8, 7, 5, 4, 2, 1 dd 7, 1, 2, 3, 7, 1, 2, 3 const ang32_shuf_mode20, db 12, 11, 9, 8, 6, 5, 3, 2, 0, 0, 0, 0, 0, 0, 14, 15, 8, 7, 5, 4, 2, 1, 0, 0, 14, 13, 13, 11, 11, 10, 10, 8 db 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 1, 1, 0, 0 const ang32_fact_mode17, db (32- 6), 6, (32-12), 12, (32-18), 18, (32-24), 24, (32-30), 30, (32- 4), 4, (32-10), 10, (32-16), 16 db (32-22), 22, (32-28), 28, (32- 2), 2, (32- 8), 8, (32-14), 14, (32-20), 20, (32-26), 26, (32- 0), 0 const ang32_shuf_mode17, db 14, 15, 13, 14, 12, 13, 11, 12, 10, 11, 10, 11, 9, 10, 8, 9, 7, 8, 6, 7, 6, 7, 5, 6, 4, 5, 3, 4, 2, 3, 2, 3 db 0, 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0 const ang32_shuf_mode19, db 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 
14, 15, 15 dd 0, 0, 2, 3, 0, 0, 7, 1 dd 0, 0, 5, 6, 0, 0, 0, 0 ; Intrapred_angle8x8, modes 1 to 33 constants const ang8_shuf_mode3, db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 4, 5, 5, 6, 6, 7, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 5, 6, 6, 7, 7, 8 const ang8_shuf_mode4, db 0, 1, 1, 2, 1, 2, 2, 3, 3, 4, 3, 4, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 4, 5, 4, 5, 5, 6, 6, 7 const ang8_shuf_mode5, db 0, 1, 1, 2, 1, 2, 2, 3, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 3, 4, 4, 5, 4, 5, 5, 6 const ang8_shuf_mode6, db 0, 1, 0, 1, 1, 2, 1, 2, 2, 3, 2, 3, 2, 3, 3, 4, 1, 2, 1, 2, 2, 3, 2, 3, 3, 4, 3, 4, 3, 4, 4, 5 const ang8_shuf_mode7, db 0, 1, 0, 1, 0, 1, 1, 2, 1, 2, 1, 2, 1, 2, 2, 3, 1, 2, 1, 2, 1, 2, 2, 3, 2, 3, 2, 3, 2, 3, 3, 4 const ang8_shuf_mode8, db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 2, 3, 2, 3 const ang8_shuf_mode9, db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 const ang8_shuf_mode12, db 7, 8, 7, 8, 7, 8, 7, 8, 7, 8, 7, 8, 6, 7, 6, 7, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 7, 8, 7, 8 const ang8_shuf_mode13, db 8, 9, 8, 9, 8, 9, 7, 8, 7, 8, 7, 8, 7, 8, 6, 7, 9, 10, 9, 10, 9, 10, 8, 9, 8, 9, 8, 9, 8, 9, 7, 8 const ang8_shuf_mode14, db 9, 10, 9, 10, 8, 9, 8, 9, 7, 8, 7, 8, 7, 8, 6, 7, 10, 11, 10, 11, 9, 10, 9, 10, 8, 9, 8, 9, 8, 9, 7, 8 const ang8_shuf_mode15, db 10, 11, 9, 10, 9, 10, 8, 9, 8, 9, 7, 8, 7, 8, 6, 7, 11, 12, 10, 11, 10, 11, 9, 10, 9, 10, 8, 9, 8, 9, 7, 8 db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 6, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 6, 4, 2, 0 const ang8_shuf_mode16, db 11, 12, 10, 11, 10, 11, 9, 10, 8, 9, 8, 9, 7, 8, 6, 7, 12, 13, 11, 12, 11, 12, 10, 11, 9, 10, 9, 10, 8, 9, 7, 8 db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 6, 5, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 6, 5, 3, 2, 0 const ang8_shuf_mode17, db 12, 13, 11, 12, 10, 11, 9, 10, 8, 9, 8, 9, 7, 8, 6, 7, 13, 14, 12, 13, 11, 12, 10, 11, 9, 10, 9, 10, 8, 9, 7, 8 db 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 2, 1, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 2, 1, 0 const ang8_fact_mode3, db (32-26), 26, (32-20), 20, (32-14), 14, (32- 8), 8, (32- 2), 2, (32-28), 28, (32-22), 22, (32-16), 16 const ang8_fact_mode4, db (32-21), 21, (32-10), 10, (32-31), 31, (32-20), 20, (32- 9), 9, (32-30), 30, (32-19), 19, (32- 8), 8 const ang8_fact_mode5, db (32-17), 17, (32- 2), 2, (32-19), 19, (32- 4), 4, (32-21), 21, (32- 6), 6, (32-23), 23, (32- 8), 8 const ang8_fact_mode6, db (32-13), 13, (32-26), 26, (32- 7), 7, (32-20), 20, (32- 1), 1, (32-14), 14, (32-27), 27, (32- 8), 8 const ang8_fact_mode7, db (32- 9), 9, (32-18), 18, (32-27), 27, (32- 4), 4, (32-13), 13, (32-22), 22, (32-31), 31, (32- 8), 8 const ang8_fact_mode8, db (32- 5), 5, (32-10), 10, (32-15), 15, (32-20), 20, (32-25), 25, (32-30), 30, (32- 3), 3, (32- 8), 8 const ang8_fact_mode9, db (32- 2), 2, (32- 4), 4, (32- 6), 6, (32- 8), 8, (32-10), 10, (32-12), 12, (32-14), 14, (32-16), 16 const ang8_fact_mode11, db (32-30), 30, (32-28), 28, (32-26), 26, (32-24), 24, (32-22), 22, (32-20), 20, (32-18), 18, (32-16), 16 const ang8_fact_mode12, db (32-27), 27, (32-22), 22, (32-17), 17, (32-12), 12, (32- 7), 7, (32- 2), 2, (32-29), 29, (32-24), 24 const ang8_fact_mode13, db (32-23), 23, (32-14), 14, (32- 5), 5, (32-28), 28, (32-19), 19, (32-10), 10, (32- 1), 1, (32-24), 24 const ang8_fact_mode14, db (32-19), 19, (32- 6), 6, (32-25), 25, (32-12), 12, (32-31), 31, (32-18), 18, (32- 5), 5, (32-24), 24 const ang8_fact_mode15, db (32-15), 15, (32-30), 30, (32-13), 13, (32-28), 28, (32-11), 11, (32-26), 26, (32- 9), 9, (32-24), 24 const ang8_fact_mode16, db (32-11), 11, (32-22), 22, (32- 1), 1, (32-12), 12, (32-23), 23, (32- 2), 2, (32-13), 13, (32-24), 24 const ang8_fact_mode17, db (32- 6), 6, (32-12), 12, (32-18), 18, (32-24), 24, (32-30), 30, (32- 4), 4, (32-10), 10, (32-16), 16 const ang_table %assign x 0 %rep 32 times 8 db (32-x), x %assign x x+1 %endrep const ang_table_avx2 %assign x 0 %rep 32 times 16 db (32-x), x %assign x x+1 %endrep const 
pw_ang_table
; (name operand of the `const` that ends the previous line)
; 32 rows of 16 bytes; row x holds the word pair (32 - x, x) repeated four times.
%assign x 0
%rep 32
    times 4 dw (32-x), x
%assign x x+1
%endrep

SECTION .text

; Constants defined in other object files.
cextern pb_1
cextern pb_2
cextern pw_2
cextern pw_3
cextern pw_4
cextern pw_7
cextern pw_8
cextern pw_16
cextern pw_15
cextern pw_31
cextern pw_32
cextern pw_257
cextern pw_512
cextern pw_1024
cextern pw_4096
cextern pw_00ff
cextern pb_unpackbd1
cextern multiL
cextern multiH
cextern multiH2
cextern multiH3
cextern multi_2Row
cextern trans8_shuf
cextern pw_planar16_mul
cextern pw_planar32_mul

;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;
; 4x4 DC prediction.
;   r0 = dst, r1 = dstStride, r2 = srcPix, r3 = dirMode (never read, reused as scratch),
;   r4 = bFilter
; dc_val = (srcPix[1..4] + srcPix[9..12] summed + 4) >> 3 is written to all 16 pixels.
; If bFilter is non-zero, row 0 and rows 1..3 of column 0 are then overwritten with values
; blended from dc_val and the adjacent reference samples.
;---------------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal intra_pred_dc4, 5,5,3
    inc         r2                      ; r2 = srcPix + 1
    pxor        m0, m0                  ; m0 = 0, used by psadbw and as unpack filler
    movu        m1, [r2]
    pshufd      m1, m1, 0xF8            ; low qword = dwords 0 and 2 (srcPix[1..4], srcPix[9..12])
    psadbw      m1, m0                  ; m1 = sum

    test        r4d, r4d                ; ZF = (bFilter == 0); no instruction up to the jz below writes EFLAGS

    paddw       m1, [pw_4]
    psraw       m1, 3
    movd        r4d, m1                 ; r4d = dc_val
    pmullw      m1, [pw_257]            ; presumably x257: dc_val copied into both bytes of the word
    pshuflw     m1, m1, 0x00

    ; store DC 4x4
    lea         r3, [r1 * 3]
    movd        [r0], m1
    movd        [r0 + r1], m1
    movd        [r0 + r1 * 2], m1
    movd        [r0 + r3], m1

    ; do DC filter
    jz          .end                    ; bFilter == 0 (flags still from the test above)
    lea         r3d, [r4d * 2 + 2]      ; r3d = DC * 2 + 2
    add         r4d, r3d                ; r4d = DC * 3 + 2
    movd        m1, r4d
    pshuflw     m1, m1, 0               ; m1 = pixDCx3

    ; filter top
    movd        m2, [r2]
    punpcklbw   m2, m0
    paddw       m2, m1
    psraw       m2, 2
    packuswb    m2, m2
    movd        [r0], m2                ; overwrite top-left pixel, we will update it later

    ; filter top-left: (srcPix[9] + srcPix[1] + DC * 2 + 2) >> 2
    movzx       r4d, byte [r2 + 8]
    add         r3d, r4d
    movzx       r4d, byte [r2]
    add         r3d, r4d
    shr         r3d, 2
    mov         [r0], r3b

    ; filter left: rows 1..3 of column 0, from srcPix[10..12]
    add         r0, r1
    movq        m2, [r2 + 9]
    punpcklbw   m2, m0
    paddw       m2, m1
    psraw       m2, 2
    packuswb    m2, m2
%if ARCH_X86_64
    movq        r4, m2
    mov         [r0], r4b
    shr         r4, 8
    mov         [r0 + r1], r4b
    shr         r4, 8
    mov         [r0 + r1 * 2], r4b
%else
    movd        r2d, m2                 ; srcPix pointer is no longer needed
    mov         [r0], r2b
    shr         r2, 8
    mov         [r0 + r1], r2b
    shr         r2, 8
    mov         [r0 + r1 * 2], r2b
%endif
.end:
    RET

;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;
; 8x8 DC prediction.
;   r0 = dst, r1 = dstStride, r2 = srcPix, r4 = bFilter; r3, r5, r6 are scratch
; dc_val = (srcPix[1..8] + srcPix[17..24] summed + 8) >> 4 fills the block. If bFilter is
; non-zero, row 0 and rows 1..7 of column 0 are then blended with the reference samples.
;---------------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal intra_pred_dc8, 5, 7, 3
    pxor        m0, m0
    movh        m1, [r2 + 1]
    movh        m2, [r2 + 17]
    punpcklqdq  m1, m2
    psadbw      m1, m0                  ; one partial sum per qword
    pshufd      m2, m1, 2
    paddw       m1, m2                  ; low word = total of the 16 samples
    paddw       m1, [pw_8]
    psraw       m1, 4
    pmullw      m1, [pw_257]
    pshuflw     m1, m1, 0x00            ; m1 = byte [dc_val ...]

    test        r4d, r4d                ; ZF = (bFilter == 0), consumed by the jz after the stores

    ; store DC 8x8
    lea         r6, [r1 + r1 * 2]       ; r6 = 3 * stride
    lea         r5, [r6 + r1 * 2]       ; r5 = 5 * stride
    movh        [r0], m1
    movh        [r0 + r1], m1
    movh        [r0 + r1 * 2], m1
    movh        [r0 + r6], m1
    movh        [r0 + r1 * 4], m1
    movh        [r0 + r5], m1
    movh        [r0 + r6 * 2], m1
    lea         r5, [r5 + r1 * 2]       ; r5 = 7 * stride
    movh        [r0 + r5], m1

    ; Do DC Filter
    jz          .end
    psrlw       m1, 8                   ; back to one dc_val per word
    movq        m2, [pw_2]
    pmullw      m2, m1
    paddw       m2, [pw_2]
    movd        r4d, m2                 ; DC * 2 + 2 in each 16-bit half; only bits 2..9 are stored below
    paddw       m1, m2                  ; m1 = DC * 3 + 2
    pshufd      m1, m1, 0

    ; filter top
    movq        m2, [r2 + 1]
    punpcklbw   m2, m0
    paddw       m2, m1
    psraw       m2, 2                   ; (srcPix[1 + x] + DC * 3 + 2) >> 2
    packuswb    m2, m2
    movh        [r0], m2

    ; filter top-left: (srcPix[17] + srcPix[1] + DC * 2 + 2) >> 2
    movzx       r3d, byte [r2 + 17]
    add         r4d, r3d
    movzx       r3d, byte [r2 + 1]
    add         r3d, r4d
    shr         r3d, 2
    mov         [r0], r3b

    ; filter left: rows 1..7 of column 0, from srcPix[18..24]
    movq        m2, [r2 + 18]
    punpcklbw   m2, m0
    paddw       m2, m1
    psraw       m2, 2
    packuswb    m2, m2
    movd        r2d, m2                 ; srcPix pointer is no longer needed
    lea         r0, [r0 + r1]
    lea         r5, [r6 + r1 * 2]       ; r5 = 5 * stride again
    mov         [r0], r2b
    shr         r2, 8
    mov         [r0 + r1], r2b
    shr         r2, 8
    mov         [r0 + r1 * 2], r2b
    shr         r2, 8
    mov         [r0 + r6], r2b
    pshufd      m2, m2, 0x01            ; bring the next four filtered bytes to the low dword
    movd        r2d, m2
    mov         [r0 + r1 * 4], r2b
    shr         r2, 8
    mov         [r0 + r5], r2b
    shr         r2, 8
    mov         [r0 + r6 * 2], r2b

.end:
    RET

;--------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;
; 16x16 DC prediction.
;   r0 = dst, r1 = dstStride, r2 = srcPix, r4 = bFilter
; dc_val = (srcPix[1..16] + srcPix[33..48] summed + 16) >> 5 fills the block. If bFilter is
; non-zero, row 0 and rows 1..15 of column 0 are then blended with the reference samples.
; The x86-64 build addresses rows through r6..r9; the 32-bit build walks r0 and keeps the
; original dst in r6.
;--------------------------------------------------------------------------------------------
INIT_XMM sse2
%if ARCH_X86_64
cglobal intra_pred_dc16, 5, 10, 4
%else
cglobal intra_pred_dc16, 5, 7, 4
%endif
    pxor        m0, m0
    movu        m1, [r2 + 1]
    movu        m2, [r2 + 33]
    psadbw      m1, m0
    psadbw      m2, m0
    paddw       m1, m2
    pshufd      m2, m1, 2
    paddw       m1, m2                  ; low word = total of the 32 samples
    paddw       m1, [pw_16]
    psraw       m1, 5
    pmullw      m1, [pw_257]
    pshuflw     m1, m1, 0x00            ; m1 = byte [dc_val ...]
    pshufd      m1, m1, 0x00

    test        r4d, r4d                ; ZF = (bFilter == 0), consumed by the jz after the stores

    ; store DC 16x16
%if ARCH_X86_64
    lea         r6, [r1 + r1 * 2]       ;index 3
    lea         r7, [r1 + r1 * 4]       ;index 5
    lea         r8, [r6 + r1 * 4]       ;index 7
    lea         r9, [r0 + r8]           ;base + 7
    movu        [r0], m1
    movu        [r0 + r1], m1
    movu        [r0 + r1 * 2], m1
    movu        [r0 + r6], m1
    movu        [r0 + r1 * 4], m1
    movu        [r0 + r7], m1
    movu        [r0 + r6 * 2], m1
    movu        [r0 + r8], m1
    movu        [r0 + r1 * 8], m1
    movu        [r9 + r1 * 2], m1
    movu        [r0 + r7 * 2], m1
    movu        [r9 + r1 * 4], m1
    movu        [r0 + r6 * 4], m1
    movu        [r9 + r6 * 2], m1
    movu        [r0 + r8 * 2], m1
    movu        [r9 + r1 * 8], m1
%else ;32 bit
    mov         r6, r0                  ; keep the original dst for the filter stage
    movu        [r0], m1
    movu        [r0 + r1], m1
    lea         r0, [r0 + r1 * 2]
    movu        [r0], m1
    movu        [r0 + r1], m1
    lea         r0, [r0 + r1 * 2]
    movu        [r0], m1
    movu        [r0 + r1], m1
    lea         r0, [r0 + r1 * 2]
    movu        [r0], m1
    movu        [r0 + r1], m1
    lea         r0, [r0 + r1 * 2]
    movu        [r0], m1
    movu        [r0 + r1], m1
    lea         r0, [r0 + r1 * 2]
    movu        [r0], m1
    movu        [r0 + r1], m1
    lea         r0, [r0 + r1 * 2]
    movu        [r0], m1
    movu        [r0 + r1], m1
    lea         r0, [r0 + r1 * 2]
    movu        [r0], m1
    movu        [r0 + r1], m1
%endif

    ; Do DC Filter
    jz          .end
    psrlw       m1, 8                   ; back to one dc_val per word
    mova        m2, [pw_2]
    pmullw      m2, m1
    paddw       m2, [pw_2]
    movd        r4d, m2                 ; DC * 2 + 2 in each 16-bit half
    paddw       m1, m2                  ; m1 = DC * 3 + 2

    ; filter top
    movh        m2, [r2 + 1]
    punpcklbw   m2, m0
    paddw       m2, m1
    psraw       m2, 2
    packuswb    m2, m2
    movh        m3, [r2 + 9]
    punpcklbw   m3, m0
    paddw       m3, m1
    psraw       m3, 2
    packuswb    m3, m3

    ; filter top-left: (srcPix[33] + srcPix[1] + DC * 2 + 2) >> 2
    movzx       r5d, byte [r2 + 33]
    add         r4d, r5d
    movzx       r3d, byte [r2 + 1]
    add         r3d, r4d
    shr         r3d, 2

%if ARCH_X86_64
    movh        [r0], m2
    movh        [r0 + 8], m3
    mov         [r0], r3b
%else ;32 bit
    movh        [r6], m2
    movh        [r6 + 8], m3
    mov         [r6], r3b
    add         r6, r1                  ; r6 = row 1
%endif

    ; filter left: rows 1..15 of column 0, from srcPix[34..48]
    movh        m2, [r2 + 34]
    punpcklbw   m2, m0
    paddw       m2, m1
    psraw       m2, 2
    packuswb    m2, m2

    movh        m3, [r2 + 42]
    punpcklbw   m3, m0
    paddw       m3, m1
    psraw       m3, 2
    packuswb    m3, m3
%if ARCH_X86_64
    movh        r3, m2                  ; eight filtered bytes, peeled off one per row
    mov         [r0 + r1], r3b
    shr         r3, 8
    mov         [r0 + r1 * 2], r3b
    shr         r3, 8
    mov         [r0 + r6], r3b
    shr         r3, 8
    mov         [r0 + r1 * 4], r3b
    shr         r3, 8
    mov         [r0 + r7], r3b
    shr         r3, 8
    mov         [r0 + r6 * 2], r3b
    shr         r3, 8
    mov         [r0 + r8], r3b
    shr         r3, 8
    mov         [r0 + r1 * 8], r3b
    movh        r3, m3
    mov         [r9 + r1 * 2], r3b
    shr         r3, 8
; intra_pred_dc16, continued: x86-64 branch of the left-column filter.
; r3 holds the filtered bytes taken from m3; r6/r7/r8 = 3/5/7 * stride and r9 = dst + 7 * stride
; were set up before the block was filled. These stores cover rows 10..15 of column 0.
    mov         [r0 + r7 * 2], r3b
    shr         r3, 8
    mov         [r9 + r1 * 4], r3b
    shr         r3, 8
    mov         [r0 + r6 * 4], r3b
    shr         r3, 8
    mov         [r9 + r6 * 2], r3b
    shr         r3, 8
    mov         [r0 + r8 * 2], r3b
    shr         r3, 8
    mov         [r9 + r1 * 8], r3b
%else ;32 bit
    ; r6 points at row 1 here; it is advanced two rows at a time while r2 (no longer
    ; needed as srcPix) carries four filtered bytes at a time.
    movd        r2d, m2
    pshufd      m2, m2, 0x01            ; queue the next four bytes of m2
    mov         [r6], r2b
    shr         r2, 8
    mov         [r6 + r1], r2b
    shr         r2, 8
    mov         [r6 + r1 * 2], r2b
    lea         r6, [r6 + r1 * 2]
    shr         r2, 8
    mov         [r6 + r1], r2b
    movd        r2d, m2
    mov         [r6 + r1 * 2], r2b
    lea         r6, [r6 + r1 * 2]
    shr         r2, 8
    mov         [r6 + r1], r2b
    shr         r2, 8
    mov         [r6 + r1 * 2], r2b
    lea         r6, [r6 + r1 * 2]
    shr         r2, 8
    mov         [r6 + r1], r2b
    movd        r2d, m3
    pshufd      m3, m3, 0x01            ; queue the next four bytes of m3
    mov         [r6 + r1 * 2], r2b
    lea         r6, [r6 + r1 * 2]
    shr         r2, 8
    mov         [r6 + r1], r2b
    shr         r2, 8
    mov         [r6 + r1 * 2], r2b
    lea         r6, [r6 + r1 * 2]
    shr         r2, 8
    mov         [r6 + r1], r2b
    movd        r2d, m3
    mov         [r6 + r1 * 2], r2b
    lea         r6, [r6 + r1 * 2]
    shr         r2, 8
    mov         [r6 + r1], r2b
    shr         r2, 8
    mov         [r6 + r1 * 2], r2b
%endif
.end:
    RET

;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;
; 32x32 DC prediction.
;   r0 = dst, r1 = dstStride, r2 = srcPix; dirMode and bFilter are not loaded (cglobal 3 args)
; dc_val = (srcPix[1..32] + srcPix[65..96] summed + 32) >> 6.
;---------------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal intra_pred_dc32, 3, 3, 5
    pxor        m0, m0
    movu        m1, [r2 + 1]
    movu        m2, [r2 + 17]
    movu        m3, [r2 + 65]
    movu        m4, [r2 + 81]
    psadbw      m1, m0
    psadbw      m2, m0
    psadbw      m3, m0
    psadbw      m4, m0
    paddw       m1, m2
    paddw       m3, m4
    paddw       m1, m3
    pshufd      m2, m1, 2
    paddw       m1, m2                  ; low word = total of the 64 samples
    paddw       m1, [pw_32]
    psraw       m1, 6
    pmullw      m1, [pw_257]
    pshuflw     m1, m1, 0x00            ; m1 = byte [dc_val ...]
; intra_pred_dc32, continued: broadcast the DC byte to all 16 lanes and fill the block.
    pshufd      m1, m1, 0x00

%assign x 0
%rep 16
    ; store two 32-byte rows of the 32x32 block
    movu        [r0], m1
    movu        [r0 + r1], m1
    movu        [r0 + 16], m1
    movu        [r0 + r1 + 16], m1
%if x < 15
    ; step to the next row pair; omitted on the final pass, where r0 is no longer read
    lea         r0, [r0 + 2 * r1]
%endif
%assign x x+1
%endrep
    RET

;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;
; 4x4 planar prediction.
;   r0 = dst, r1 = dstStride, r2 = srcPix
; m3 carries the per-column base term for the current row and m4 the per-row delta
; (bottomLeft - above[x]); each row adds a per-column multiple of its left sample, then >> 3.
; NOTE(review): pw_planar4_0 is not declared in the part of the file visible here;
; presumably it is defined in the read-only data section above.
;---------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal intra_pred_planar4, 3,3,5
    pxor        m0, m0
    movh        m1, [r2 + 1]
    punpcklbw   m1, m0                  ; m1 = srcPix[1..8] as words (above[])
    movh        m2, [r2 + 9]
    punpcklbw   m2, m0                  ; m2 = srcPix[9..16] as words (left[])

    pshufhw     m3, m1, 0               ; topRight
    pshufd      m3, m3, 0xAA
    pshufhw     m4, m2, 0               ; bottomLeft
    pshufd      m4, m4, 0xAA
    pmullw      m3, [multi_2Row]        ; (x + 1) * topRight
    pmullw      m0, m1, [pw_3]          ; (blkSize - 1 - y) * above[x]
    paddw       m3, [pw_4]
    paddw       m3, m4
    paddw       m3, m0
    psubw       m4, m1                  ; row-to-row delta: bottomLeft - above[x]

    ; row 0
    pshuflw     m1, m2, 0
    pmullw      m1, [pw_planar4_0]
    paddw       m1, m3
    paddw       m3, m4
    psraw       m1, 3
    packuswb    m1, m1
    movd        [r0], m1

    ; row 1
    pshuflw     m1, m2, 01010101b
    pmullw      m1, [pw_planar4_0]
    paddw       m1, m3
    paddw       m3, m4
    psraw       m1, 3
    packuswb    m1, m1
    movd        [r0 + r1], m1
    lea         r0, [r0 + 2 * r1]

    ; row 2
    pshuflw     m1, m2, 10101010b
    pmullw      m1, [pw_planar4_0]
    paddw       m1, m3
    paddw       m3, m4
    psraw       m1, 3
    packuswb    m1, m1
    movd        [r0], m1

    ; row 3
    pshuflw     m1, m2, 11111111b
    pmullw      m1, [pw_planar4_0]
    paddw       m1, m3
    psraw       m1, 3
    packuswb    m1, m1
    movd        [r0 + r1], m1
    RET

;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;
; 8x8 planar prediction.
;   r0 = dst, r1 = dstStride, r2 = srcPix
; m3 = per-column base term for the current row, m4 = per-row delta (bottomLeft - above[x]),
; m2 = srcPix[17..24] as words (left[]). INTRA_PRED_PLANAR_8 emits one output row.
;---------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal intra_pred_planar8, 3,3,6
    pxor        m0, m0
    movh        m1, [r2 + 1]
    punpcklbw   m1, m0
    movh        m2, [r2 + 17]
    punpcklbw   m2, m0

    movd        m3, [r2 + 9]            ; topRight   = above[8];
    movd        m4, [r2 + 25]           ; bottomLeft = left[8];

    pand        m3, [pw_00ff]
    pand        m4, [pw_00ff]
    pshuflw     m3, m3, 0x00
    pshuflw     m4, m4, 0x00
    pshufd      m3, m3, 0x44
    pshufd      m4, m4, 0x44
    pmullw      m3, [multiL]            ; (x + 1) * topRight
    pmullw      m0, m1, [pw_7]          ; (blkSize - 1 - y) * above[x]
    paddw       m3, [pw_8]
    paddw       m3, m4
    paddw       m3, m0
    psubw       m4, m1

; %1 = row index 0..7: broadcast word %1 of m2, weight it per column, add the base term,
; >> 4, store 8 pixels. Every row but the last then advances m3 and r0.
%macro INTRA_PRED_PLANAR_8 1
%if (%1 < 4)
    pshuflw     m5, m2, 0x55 * %1
    pshufd      m5, m5, 0
%else
    pshufhw     m5, m2, 0x55 * (%1 - 4)
    pshufd      m5, m5, 0xAA
%endif

    pmullw      m5, [pw_planar16_mul + mmsize]
    paddw       m5, m3
    psraw       m5, 4
    packuswb    m5, m5
    movh        [r0], m5
%if (%1 < 7)
    paddw       m3, m4
    lea         r0, [r0 + r1]
%endif
%endmacro

    INTRA_PRED_PLANAR_8 0
    INTRA_PRED_PLANAR_8 1
    INTRA_PRED_PLANAR_8 2
    INTRA_PRED_PLANAR_8 3
    INTRA_PRED_PLANAR_8 4
    INTRA_PRED_PLANAR_8 5
    INTRA_PRED_PLANAR_8 6
    INTRA_PRED_PLANAR_8 7
    RET

;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;
; 16x16 planar prediction.
;   r0 = dst, r1 = dstStride, r2 = srcPix
; m3/m4 = base terms for columns 0..7 / 8..15, m6/m1 = matching per-row deltas,
; m2/m7 = srcPix[33..40] / srcPix[41..48] as words (left[]) once setup is done.
; INTRA_PRED_PLANAR_16 emits one output row.
;---------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal intra_pred_planar16, 3,5,8
    pxor        m0, m0
    movh        m2, [r2 + 1]
    punpcklbw   m2, m0
    movh        m7, [r2 + 9]
    punpcklbw   m7, m0

    movd        m3, [r2 + 17]           ; topRight   = above[16]
    movd        m6, [r2 + 49]           ; bottomLeft = left[16]
    pand        m3, [pw_00ff]
    pand        m6, [pw_00ff]
    pshuflw     m3, m3, 0x00
    pshuflw     m6, m6, 0x00
    pshufd      m3, m3, 0x44            ; v_topRight
    pshufd      m6, m6, 0x44            ; v_bottomLeft
    pmullw      m4, m3, [multiH]        ; (x + 1) * topRight
    pmullw      m3, [multiL]            ; (x + 1) * topRight
    pmullw      m1, m2, [pw_15]         ; (blkSize - 1 - y) * above[x]
    pmullw      m5, m7, [pw_15]         ; (blkSize - 1 - y) * above[x]
    paddw       m4, [pw_16]
    paddw       m3, [pw_16]
    paddw       m4, m6
    paddw       m3, m6
    paddw       m4, m5
    paddw       m3, m1
    psubw       m1, m6, m7              ; delta for columns 8..15
    psubw       m6, m2                  ; delta for columns 0..7
    movh        m2, [r2 + 33]
    punpcklbw   m2, m0
    movh        m7, [r2 + 41]
    punpcklbw   m7, m0

; %1 = row index 0..15: broadcast word %1 of the left samples (m2 for rows 0..7, m7 for
; rows 8..15). Every row after the first first advances the base terms and r0.
; m0 is used as scratch inside the macro.
%macro INTRA_PRED_PLANAR_16 1
%if (%1 < 4)
    pshuflw     m5, m2, 0x55 * %1
    pshufd      m5, m5, 0
%else
%if (%1 < 8)
    pshufhw     m5, m2, 0x55 * (%1 - 4)
    pshufd      m5, m5, 0xAA
%else
%if (%1 < 12)
    pshuflw     m5, m7, 0x55 * (%1 - 8)
    pshufd      m5, m5, 0
%else
    pshufhw     m5, m7, 0x55 * (%1 - 12)
    pshufd      m5, m5, 0xAA
%endif
%endif
%endif
%if (%1 > 0)
    paddw       m3, m6
    paddw       m4, m1
    lea         r0, [r0 + r1]
%endif
    pmullw      m0, m5, [pw_planar16_mul + mmsize]
    pmullw      m5, [pw_planar16_mul]
    paddw       m0, m4
    paddw       m5, m3
    psraw       m5, 5
    psraw       m0, 5
    packuswb    m5, m0
    movu        [r0], m5
%endmacro

    INTRA_PRED_PLANAR_16 0
    INTRA_PRED_PLANAR_16 1
    INTRA_PRED_PLANAR_16 2
    INTRA_PRED_PLANAR_16 3
    INTRA_PRED_PLANAR_16 4
    INTRA_PRED_PLANAR_16 5
    INTRA_PRED_PLANAR_16 6
    INTRA_PRED_PLANAR_16 7
    INTRA_PRED_PLANAR_16 8
    INTRA_PRED_PLANAR_16 9
    INTRA_PRED_PLANAR_16 10
    INTRA_PRED_PLANAR_16 11
    INTRA_PRED_PLANAR_16 12
    INTRA_PRED_PLANAR_16 13
    INTRA_PRED_PLANAR_16 14
    INTRA_PRED_PLANAR_16 15
    RET

;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;
; 32x32 planar prediction.
;   r0 = dst, r1 = dstStride, r2 = srcPix
; m0..m3 hold the base terms for the four 8-column groups. The x86-64 build keeps the
; per-row deltas in m8..m11 and the column weights in m12..m15; the 32-bit build keeps
; the deltas in four stack slots and reads the weights from memory.
; PROCESS <reg> emits one 32-pixel row from a broadcast left sample (and destroys <reg>);
; INCREMENT advances the base terms and r0 by one row.
;---------------------------------------------------------------------------------------
INIT_XMM sse2
%if ARCH_X86_64 == 1
cglobal intra_pred_planar32, 3,3,16
    movd        m3, [r2 + 33]           ; topRight = above[32]

    pxor        m7, m7
    pand        m3, [pw_00ff]
    pshuflw     m3, m3, 0x00
    pshufd      m3, m3, 0x44

    pmullw      m0, m3, [multiL]        ; (x + 1) * topRight
    pmullw      m1, m3, [multiH]        ; (x + 1) * topRight
    pmullw      m2, m3, [multiH2]       ; (x + 1) * topRight
    pmullw      m3, [multiH3]           ; (x + 1) * topRight

    movd        m11, [r2 + 97]          ; bottomLeft = left[32]
    pand        m11, [pw_00ff]
    pshuflw     m11, m11, 0x00
    pshufd      m11, m11, 0x44
    mova        m5, m11
    paddw       m5, [pw_32]

    paddw       m0, m5
    paddw       m1, m5
    paddw       m2, m5
    paddw       m3, m5
    mova        m8, m11
    mova        m9, m11
    mova        m10, m11

    mova        m12, [pw_31]
    movh        m4, [r2 + 1]
    punpcklbw   m4, m7
    psubw       m8, m4                  ; delta, columns 0..7
    pmullw      m4, m12
    paddw       m0, m4

    movh        m4, [r2 + 9]
    punpcklbw   m4, m7
    psubw       m9, m4                  ; delta, columns 8..15
    pmullw      m4, m12
    paddw       m1, m4

    movh        m4, [r2 + 17]
    punpcklbw   m4, m7
    psubw       m10, m4                 ; delta, columns 16..23
    pmullw      m4, m12
    paddw       m2, m4

    movh        m4, [r2 + 25]
    punpcklbw   m4, m7
    psubw       m11, m4                 ; delta, columns 24..31
    pmullw      m4, m12
    paddw       m3, m4

    mova        m12, [pw_planar32_mul]
    mova        m13, [pw_planar32_mul + mmsize]
    mova        m14, [pw_planar16_mul]
    mova        m15, [pw_planar16_mul + mmsize]

%macro PROCESS 1
    pmullw      m5, %1, m12
    pmullw      m6, %1, m13
    paddw       m5, m0
    paddw       m6, m1
    psraw       m5, 6
    psraw       m6, 6
    packuswb    m5, m6
    movu        [r0], m5

    pmullw      m5, %1, m14
    pmullw      %1, m15
    paddw       m5, m2
    paddw       %1, m3
    psraw       m5, 6
    psraw       %1, 6
    packuswb    m5, %1
    movu        [r0 + 16], m5
%endmacro

%macro INCREMENT 0
    paddw       m2, m10
    paddw       m3, m11
    paddw       m0, m8
    paddw       m1, m9
    add         r0, r1
%endmacro

; x = group of eight left samples (srcPix[65 + 8 * x ...]), y = row within the group.
; x + y reaches 10 only on the final row (x = 3, y = 7), so INCREMENT is skipped there.
%assign x 0
%rep 4
    pxor        m7, m7
    movq        m4, [r2 + 65 + x * 8]
    punpcklbw   m4, m7
%assign y 0
%rep 8
%if y < 4
    pshuflw     m7, m4, 0x55 * y
    pshufd      m7, m7, 0x44
%else
    pshufhw     m7, m4, 0x55 * (y - 4)
    pshufd      m7, m7, 0xEE
%endif
    PROCESS m7
%if x + y < 10
    INCREMENT
%endif
%assign y y+1
%endrep
%assign x x+1
%endrep
    RET

%else ;end ARCH_X86_64, start ARCH_X86_32
cglobal intra_pred_planar32, 3,3,8,0-(4*mmsize)
    movd        m3, [r2 + 33]           ; topRight = above[32]

    pxor        m7, m7
    pand        m3, [pw_00ff]
    pshuflw     m3, m3, 0x00
    pshufd      m3, m3, 0x44

    pmullw      m0, m3, [multiL]        ; (x + 1) * topRight
    pmullw      m1, m3, [multiH]        ; (x + 1) * topRight
    pmullw      m2, m3, [multiH2]       ; (x + 1) * topRight
    pmullw      m3, [multiH3]           ; (x + 1) * topRight

    movd        m6, [r2 + 97]           ; bottomLeft = left[32]
    pand        m6, [pw_00ff]
    pshuflw     m6, m6, 0x00
    pshufd      m6, m6, 0x44
    mova        m5, m6
    paddw       m5, [pw_32]

    paddw       m0, m5
    paddw       m1, m5
    paddw       m2, m5
    paddw       m3, m5

    movh        m4, [r2 + 1]
    punpcklbw   m4, m7
    psubw       m5, m6, m4
    mova        [rsp + 0 * mmsize], m5  ; delta, columns 0..7
    pmullw      m4, [pw_31]
    paddw       m0, m4

    movh        m4, [r2 + 9]
    punpcklbw   m4, m7
    psubw       m5, m6, m4
    mova        [rsp + 1 * mmsize], m5  ; delta, columns 8..15
    pmullw      m4, [pw_31]
    paddw       m1, m4

    movh        m4, [r2 + 17]
    punpcklbw   m4, m7
    psubw       m5, m6, m4
    mova        [rsp + 2 * mmsize], m5  ; delta, columns 16..23
    pmullw      m4, [pw_31]
    paddw       m2, m4

    movh        m4, [r2 + 25]
    punpcklbw   m4, m7
    psubw       m5, m6, m4
    mova        [rsp + 3 * mmsize], m5  ; delta, columns 24..31
    pmullw      m4, [pw_31]
    paddw       m3, m4

%macro PROCESS 1
    pmullw      m5, %1, [pw_planar32_mul]
    pmullw      m6, %1, [pw_planar32_mul + mmsize]
    paddw       m5, m0
    paddw       m6, m1
    psraw       m5, 6
    psraw       m6, 6
    packuswb    m5, m6
    movu        [r0], m5

    pmullw      m5, %1, [pw_planar16_mul]
    pmullw      %1, [pw_planar16_mul + mmsize]
    paddw       m5, m2
    paddw       %1, m3
    psraw       m5, 6
    psraw       %1, 6
    packuswb    m5, %1
    movu        [r0 + 16], m5
%endmacro

%macro INCREMENT 0
    paddw       m0, [rsp + 0 * mmsize]
    paddw       m1, [rsp + 1 * mmsize]
    paddw       m2, [rsp + 2 * mmsize]
    paddw       m3, [rsp + 3 * mmsize]
    add         r0, r1
%endmacro
%assign y 0
; intra_pred_planar32, 32-bit build: row loop.
; y = group of eight left samples (srcPix[65 + 8 * y ...]), x = row within the group.
; x + y reaches 10 only on the final row (y = 3, x = 7), so INCREMENT is skipped there.
%rep 4
    pxor        m7, m7
    movq        m4, [r2 + 65 + y * 8]
    punpcklbw   m4, m7
%assign x 0
%rep 8
%if x < 4
    pshuflw     m7, m4, 0x55 * x
    pshufd      m7, m7, 0x44
%else
    pshufhw     m7, m4, 0x55 * (x - 4)
    pshufd      m7, m7, 0xEE
%endif

    PROCESS m7
%if x + y < 10
    INCREMENT
%endif
%assign x x+1
%endrep
%assign y y+1
%endrep
    RET

%endif ; end ARCH_X86_32

; Store the four dwords of m0 to rows 0..3 of dst (r0, stride r1).
; Destroys m0 and leaves r1 = 3 * stride.
%macro STORE_4x4 0
    movd        [r0], m0
    psrldq      m0, 4
    movd        [r0 + r1], m0
    psrldq      m0, 4
    movd        [r0 + r1 * 2], m0
    lea         r1, [r1 * 3]
    psrldq      m0, 4
    movd        [r0 + r1], m0
%endmacro

; In:  m0 = words a0..a3 b0..b3, m2 = words c0..c3 d0..d3
; Out: m0 = bytes a0 b0 c0 d0 | a1 b1 c1 d1 | a2 b2 c2 d2 | a3 b3 c3 d3 (unsigned saturation)
; Clobbers m1 and m2.
%macro TRANSPOSE_4x4 0
    pshufd      m0, m0, 0xD8
    pshufd      m1, m2, 0xD8
    pshuflw     m0, m0, 0xD8
    pshuflw     m1, m1, 0xD8
    pshufhw     m0, m0, 0xD8
    pshufhw     m1, m1, 0xD8
    mova        m2, m0
    punpckldq   m0, m1
    punpckhdq   m2, m1
    packuswb    m0, m2
%endmacro

;-----------------------------------------------------------------------------------------
; void intraPredAng4(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
;
; All intra_pred_ang4_* entry points: r0 = dst, r1 = dstStride, r2 = src.
; The interpolating modes compute ((32 - f) * p[i] + f * p[i + 1] + 16) >> 5, where the
; (32 - f, f) word pairs come from pw_ang_table + f * 16.
;-----------------------------------------------------------------------------------------

; Modes 2 and 34: pure copy. Row y is src[base + y .. base + y + 3], with base = 2 when
; dirMode (read from its argument slot r3m) equals 34 and base = 10 otherwise.
INIT_XMM sse2
cglobal intra_pred_ang4_2, 3,5,1
    lea         r4, [r2 + 2]
    add         r2, 10
    cmp         r3m, byte 34
    cmove       r2, r4

    movh        m0, [r2]
    movd        [r0], m0
    psrldq      m0, 1
    movd        [r0 + r1], m0
    psrldq      m0, 1
    movd        [r0 + r1 * 2], m0
    lea         r1, [r1 * 3]
    psrldq      m0, 1
    movd        [r0 + r1], m0
    RET

; Mode 3: fractions 26, 20, 14, 8 applied to sample pairs starting at src[9].
; Lane comments list bytes from most to least significant; digit n stands for src[8 + n].
INIT_XMM sse2
cglobal intra_pred_ang4_3, 3,3,5
    movh        m3, [r2 + 9]            ; [8 7 6 5 4 3 2 1]
    punpcklbw   m3, m3
    psrldq      m3, 1
    movh        m0, m3                  ;[x x x x x x x x 5 4 4 3 3 2 2 1]
    psrldq      m3, 2
    movh        m1, m3                  ;[x x x x x x x x 6 5 5 4 4 3 3 2]
    psrldq      m3, 2
    movh        m2, m3                  ;[x x x x x x x x 7 6 6 5 5 4 4 3]
    psrldq      m3, 2                   ;[x x x x x x x x 8 7 7 6 6 5 5 4]

    pxor        m4, m4
    punpcklbw   m1, m4
    pmaddwd     m1, [pw_ang_table + 20 * 16]
    punpcklbw   m0, m4
    pmaddwd     m0, [pw_ang_table + 26 * 16]
    packssdw    m0, m1
    paddw       m0, [pw_16]
    psraw       m0, 5
    punpcklbw   m3, m4
    pmaddwd     m3, [pw_ang_table + 8 * 16]
    punpcklbw   m2, m4
    pmaddwd     m2, [pw_ang_table + 14 * 16]
    packssdw    m2, m3
    paddw       m2, [pw_16]
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET

; Mode 4: fractions 21, 10, 31, 20 applied to sample pairs starting at src[9].
cglobal intra_pred_ang4_4, 3,3,5
    movh        m1, [r2 + 9]            ;[8 7 6 5 4 3 2 1]
    punpcklbw   m1, m1
; intra_pred_ang4_4, continued (m1 = src[9..16] with every byte doubled).
; In all lane comments below, bytes are listed from most to least significant and digit n
; stands for src[8 + n] unless the function loads from another offset.
    psrldq      m1, 1
    movh        m0, m1                  ;[x x x x x x x x 5 4 4 3 3 2 2 1]
    psrldq      m1, 2
    movh        m2, m1                  ;[x x x x x x x x 6 5 5 4 4 3 3 2]
    psrldq      m1, 2                   ;[x x x x x x x x 7 6 6 5 5 4 4 3]

    pxor        m4, m4
    punpcklbw   m2, m4
    mova        m3, m2
    pmaddwd     m3, [pw_ang_table + 10 * 16]
    punpcklbw   m0, m4
    pmaddwd     m0, [pw_ang_table + 21 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5
    punpcklbw   m1, m4
    pmaddwd     m1, [pw_ang_table + 20 * 16]
    pmaddwd     m2, [pw_ang_table + 31 * 16]
    packssdw    m2, m1
    paddw       m2, [pw_16]
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET

; Mode 5: fractions 17, 2, 19, 4 applied to sample pairs starting at src[9].
cglobal intra_pred_ang4_5, 3,3,5
    movh        m3, [r2 + 9]            ;[8 7 6 5 4 3 2 1]
    punpcklbw   m3, m3
    psrldq      m3, 1
    mova        m0, m3                  ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    psrldq      m3, 2
    mova        m2, m3                  ;[x x x x x x x x 6 5 5 4 4 3 3 2]
    psrldq      m3, 2                   ;[x x x x x x x x 7 6 6 5 5 4 4 3]

    pxor        m1, m1
    punpcklbw   m2, m1
    mova        m4, m2
    pmaddwd     m4, [pw_ang_table + 2 * 16]
    punpcklbw   m0, m1
    pmaddwd     m0, [pw_ang_table + 17 * 16]
    packssdw    m0, m4
    paddw       m0, [pw_16]
    psraw       m0, 5
    punpcklbw   m3, m1
    pmaddwd     m3, [pw_ang_table + 4 * 16]
    pmaddwd     m2, [pw_ang_table + 19 * 16]
    packssdw    m2, m3
    paddw       m2, [pw_16]
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET

; Mode 6: fractions 13, 26, 7, 20 applied to sample pairs starting at src[9].
cglobal intra_pred_ang4_6, 3,3,4
    movh        m2, [r2 + 9]            ;[8 7 6 5 4 3 2 1]
    punpcklbw   m2, m2
    psrldq      m2, 1
    movh        m0, m2                  ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    psrldq      m2, 2                   ;[x x x 8 8 7 7 6 6 5 5 4 4 3 3 2]

    pxor        m1, m1
    punpcklbw   m0, m1
    mova        m3, m0
    pmaddwd     m3, [pw_ang_table + 26 * 16]
    pmaddwd     m0, [pw_ang_table + 13 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5
    punpcklbw   m2, m1
    mova        m3, m2
    pmaddwd     m3, [pw_ang_table + 20 * 16]
    pmaddwd     m2, [pw_ang_table + 7 * 16]
    packssdw    m2, m3
    paddw       m2, [pw_16]
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET

; Mode 7: fractions 9, 18, 27, 4 applied to sample pairs starting at src[9].
cglobal intra_pred_ang4_7, 3,3,5
    movh        m3, [r2 + 9]            ;[8 7 6 5 4 3 2 1]
    punpcklbw   m3, m3
    psrldq      m3, 1
    movh        m0, m3                  ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    psrldq      m3, 2                   ;[x x x x x x x x 6 5 5 4 4 3 3 2]

    pxor        m1, m1
    punpcklbw   m0, m1
    mova        m4, m0
    mova        m2, m0
    pmaddwd     m4, [pw_ang_table + 18 * 16]
    pmaddwd     m0, [pw_ang_table + 9 * 16]
    packssdw    m0, m4
    paddw       m0, [pw_16]
    psraw       m0, 5
    punpcklbw   m3, m1
    pmaddwd     m3, [pw_ang_table + 4 * 16]
    pmaddwd     m2, [pw_ang_table + 27 * 16]
    packssdw    m2, m3
    paddw       m2, [pw_16]
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET

; Mode 8: fractions 5, 10, 15, 20, all applied to the same four sample pairs.
cglobal intra_pred_ang4_8, 3,3,5
    movh        m0, [r2 + 9]            ;[8 7 6 5 4 3 2 1]
    punpcklbw   m0, m0
    psrldq      m0, 1                   ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    pxor        m1, m1
    punpcklbw   m0, m1
    mova        m2, m0
    mova        m3, m0
    mova        m4, m2
    pmaddwd     m3, [pw_ang_table + 10 * 16]
    pmaddwd     m0, [pw_ang_table + 5 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5
    pmaddwd     m4, [pw_ang_table + 20 * 16]
    pmaddwd     m2, [pw_ang_table + 15 * 16]
    packssdw    m2, m4
    paddw       m2, [pw_16]
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET

; Mode 9: fractions 2, 4, 6, 8, all applied to the same four sample pairs.
cglobal intra_pred_ang4_9, 3,3,5
    movh        m0, [r2 + 9]            ;[8 7 6 5 4 3 2 1]
    punpcklbw   m0, m0
    psrldq      m0, 1                   ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    pxor        m1, m1
    punpcklbw   m0, m1
    mova        m2, m0
    mova        m3, m0
    mova        m4, m2
    pmaddwd     m3, [pw_ang_table + 4 * 16]
    pmaddwd     m0, [pw_ang_table + 2 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5
    pmaddwd     m4, [pw_ang_table + 8 * 16]
    pmaddwd     m2, [pw_ang_table + 6 * 16]
    packssdw    m2, m4
    paddw       m2, [pw_16]
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET

; Mode 10: row y is src[9 + y] repeated four times. Rows 1..3 are stored first; row 0 is
; stored last and, when bFilter (read from its argument slot r4m) is non-zero, becomes
; clip(src[9] + ((src[1 + x] - src[0]) >> 1)).
cglobal intra_pred_ang4_10, 3,5,4
    movd        m0, [r2 + 9]            ;[4 3 2 1] (movd loads four bytes)
    punpcklbw   m0, m0
    punpcklwd   m0, m0                  ; each source byte now fills one dword
    pshufd      m1, m0, 1
    movhlps     m2, m0
    pshufd      m3, m0, 3
    movd        [r0 + r1], m1
    movd        [r0 + r1 * 2], m2
    lea         r1, [r1 * 3]
    movd        [r0 + r1], m3
    cmp         r4m, byte 0
    jz          .quit

    ; filter
    pxor        m3, m3
    punpcklbw   m0, m3
    movh        m1, [r2]                ;[4 3 2 1 0]
    punpcklbw   m1, m3
    pshuflw     m2, m1, 0x00            ; broadcast src[0] over the low four words
    psrldq      m1, 2
    psubw       m1, m2
    psraw       m1, 1
    paddw       m0, m1
    packuswb    m0, m0
.quit:
    movd        [r0], m0
    RET

; Mode 11: fractions 30, 28, 26, 24 applied to the pairs built from src[0] (A) and src[9..12].
cglobal intra_pred_ang4_11, 3,3,5
    movd        m1, [r2 + 9]            ;[4 3 2 1]
    movh        m0, [r2 - 7]            ;[A x x x x x x x]
    punpcklbw   m1, m1                  ;[4 4 3 3 2 2 1 1]
    punpcklqdq  m0, m1                  ;[4 4 3 3 2 2 1 1 A x x x x x x x]
    psrldq      m0, 7                   ;[x x x x x x x x 4 3 3 2 2 1 1 A]

    pxor        m1, m1
    punpcklbw   m0, m1
    mova        m2, m0
    mova        m3, m0
    mova        m4, m2
    pmaddwd     m3, [pw_ang_table + 28 * 16]
    pmaddwd     m0, [pw_ang_table + 30 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5
    pmaddwd     m4, [pw_ang_table + 24 * 16]
    pmaddwd     m2, [pw_ang_table + 26 * 16]
    packssdw    m2, m4
    paddw       m2, [pw_16]
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET

; Mode 12: fractions 27, 22, 17, 12 applied to the pairs built from src[0] (A) and src[9..12].
cglobal intra_pred_ang4_12, 3,3,5
    movd        m1, [r2 + 9]            ;[4 3 2 1]
    movh        m0, [r2 - 7]            ;[A x x x x x x x]
    punpcklbw   m1, m1                  ;[4 4 3 3 2 2 1 1]
    punpcklqdq  m0, m1                  ;[4 4 3 3 2 2 1 1 A x x x x x x x]
    psrldq      m0, 7                   ;[x x x x x x x x 4 3 3 2 2 1 1 A]

    pxor        m1, m1
    punpcklbw   m0, m1
    mova        m2, m0
    mova        m3, m0
    mova        m4, m2
    pmaddwd     m3, [pw_ang_table + 22 * 16]
    pmaddwd     m0, [pw_ang_table + 27 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5
    pmaddwd     m4, [pw_ang_table + 12 * 16]
    pmaddwd     m2, [pw_ang_table + 17 * 16]
    packssdw    m2, m4
    paddw       m2, [pw_16]
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET

; Mode 24: same arithmetic as mode 12, but the samples come from src[1..4] (digit n in the
; lane comments stands for src[n] here) and the result is packed and stored without
; TRANSPOSE_4x4.
cglobal intra_pred_ang4_24, 3,3,5
    movd        m1, [r2 + 1]            ;[4 3 2 1]
    movh        m0, [r2 - 7]            ;[A x x x x x x x]
    punpcklbw   m1, m1                  ;[4 4 3 3 2 2 1 1]
    punpcklqdq  m0, m1                  ;[4 4 3 3 2 2 1 1 A x x x x x x x]
    psrldq      m0, 7                   ;[x x x x x x x x 4 3 3 2 2 1 1 A]

    pxor        m1, m1
    punpcklbw   m0, m1
    mova        m2, m0
    mova        m3, m0
    mova        m4, m2
    pmaddwd     m3, [pw_ang_table + 22 * 16]
    pmaddwd     m0, [pw_ang_table + 27 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5
    pmaddwd     m4, [pw_ang_table + 12 * 16]
    pmaddwd     m2, [pw_ang_table + 17 * 16]
    packssdw    m2, m4
    paddw       m2, [pw_16]
    psraw       m2, 5

    packuswb    m0, m2
    STORE_4x4
    RET

; Mode 13: fractions 23, 14, 5, 28. The sample row is src[4] (B), src[0] (A), src[9..12];
; the fraction-28 results use the pairs shifted one sample towards B.
cglobal intra_pred_ang4_13, 3,3,5
    movd        m1, [r2 - 1]            ;[x x A x]
    movd        m2, [r2 + 9]            ;[4 3 2 1]
    movd        m0, [r2 + 3]            ;[x x B x]
    punpcklbw   m0, m1                  ;[x x x x A B x x]
    punpckldq   m0, m2                  ;[4 3 2 1 A B x x]
    psrldq      m0, 2                   ;[x x 4 3 2 1 A B]
    punpcklbw   m0, m0
    psrldq      m0, 1
    movh        m3, m0                  ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
    psrldq      m0, 2                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    pxor        m1, m1
    punpcklbw   m0, m1
    mova        m4, m0
    mova        m2, m0
    pmaddwd     m4, [pw_ang_table + 14 * 16]
    pmaddwd     m0, [pw_ang_table + 23 * 16]
    packssdw    m0, m4
    paddw       m0, [pw_16]
    psraw       m0, 5
    punpcklbw   m3, m1
    pmaddwd     m3, [pw_ang_table + 28 * 16]
    pmaddwd     m2, [pw_ang_table + 5 * 16]
    packssdw    m2, m3
    paddw       m2, [pw_16]
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET

cglobal intra_pred_ang4_14, 3,3,4
; Body of intra_pred_ang4_14 (its cglobal line immediately precedes this range).
; Reference string is [B = byte r2 + 2, A = byte r2 + 0, four bytes at r2 + 9];
; pw_ang_table rows 19 and 6 apply to the pairs starting at A, rows 25 and 12
; to the pairs starting at B. Results are rounded with [pw_16] and >> 5.
; The movd at r2 - 1 reads one byte in front of srcPix.
    movd        m1, [r2 - 1]            ;[x x A x]
    movd        m0, [r2 + 1]            ;[x x B x]
    punpcklbw   m0, m1                  ;[A B x x]
    movd        m1, [r2 + 9]            ;[4 3 2 1]
    punpckldq   m0, m1                  ;[4 3 2 1 A B x x]
    psrldq      m0, 2                   ;[x x 4 3 2 1 A B]
    punpcklbw   m0, m0                  ;[x x x x 4 4 3 3 2 2 1 1 A A B B]
    psrldq      m0, 1
    movh        m2, m0                  ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
    psrldq      m0, 2                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    pxor        m1, m1
    punpcklbw   m0, m1
    mova        m3, m0
    pmaddwd     m3, [pw_ang_table + 6 * 16]
    pmaddwd     m0, [pw_ang_table + 19 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5
    punpcklbw   m2, m1
    mova        m3, m2
    pmaddwd     m3, [pw_ang_table + 12 * 16]
    pmaddwd     m2, [pw_ang_table + 25 * 16]
    packssdw    m2, m3
    paddw       m2, [pw_16]
    psraw       m2, 5

    TRANSPOSE_4x4
    STORE_4x4
    RET

; intra_pred_ang4_15 (r0 = dst, r1 = dstStride, r2 = srcPix)
; Reference string is [C = byte r2 + 4, B = byte r2 + 2, A = byte r2 + 0,
; four bytes at r2 + 9]. pw_ang_table rows: 15 (pairs from A), 30 and 13
; (pairs from B), 28 (pairs from C).
cglobal intra_pred_ang4_15, 3,3,5
    movd        m0, [r2]                ;[x x x A]
    movd        m1, [r2 + 2]            ;[x x x B]
    punpcklbw   m1, m0                  ;[x x A B]
    movd        m0, [r2 + 3]            ;[x x C x]
    punpcklwd   m0, m1                  ;[A B C x]
    movd        m1, [r2 + 9]            ;[4 3 2 1]
    punpckldq   m0, m1                  ;[4 3 2 1 A B C x]
    psrldq      m0, 1                   ;[x 4 3 2 1 A B C]
    punpcklbw   m0, m0                  ;[x x 4 4 3 3 2 2 1 1 A A B B C C]
    psrldq      m0, 1
    movh        m1, m0                  ;[x x x 4 4 3 3 2 2 1 1 A A B B C]
    psrldq      m0, 2
    movh        m2, m0                  ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
    psrldq      m0, 2                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    pxor        m4, m4
    punpcklbw   m2, m4
    mova        m3, m2
    pmaddwd     m3, [pw_ang_table + 30 * 16]
    punpcklbw   m0, m4
    pmaddwd     m0, [pw_ang_table + 15 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5
    punpcklbw   m1, m4
    pmaddwd     m1, [pw_ang_table + 28 * 16]
    pmaddwd     m2, [pw_ang_table + 13 * 16]
    packssdw    m2, m1
    paddw       m2, [pw_16]
    psraw       m2, 5

    TRANSPOSE_4x4
    STORE_4x4
    RET

; intra_pred_ang4_16 (r0 = dst, r1 = dstStride, r2 = srcPix)
; Reference string is [C = byte r2 + 3, B = byte r2 + 2, A = byte r2 + 0,
; four bytes at r2 + 9]. pw_ang_table rows: 11 (pairs from A), 22 and 1
; (pairs from B), 12 (pairs from C).
cglobal intra_pred_ang4_16, 3,3,5
    movd        m2, [r2]                ;[x x x A]
    movd        m1, [r2 + 2]            ;[x x x B]
    punpcklbw   m1, m2                  ;[x x A B]
    movd        m0, [r2 + 2]            ;[x x C x]  (C is the second byte of this load)
    punpcklwd   m0, m1                  ;[A B C x]
    movd        m1, [r2 + 9]            ;[4 3 2 1]
    punpckldq   m0, m1                  ;[4 3 2 1 A B C x]
    psrldq      m0, 1                   ;[x 4 3 2 1 A B C]
    punpcklbw   m0, m0                  ;[x x 4 4 3 3 2 2 1 1 A A B B C C]
    psrldq      m0, 1
    movh        m1, m0                  ;[x x x 4 4 3 3 2 2 1 1 A A B B C]
    psrldq      m0, 2
    movh        m2, m0                  ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
    psrldq      m0, 2                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    pxor        m4, m4
    punpcklbw   m2, m4
    mova        m3, m2
    pmaddwd     m3, [pw_ang_table + 22 * 16]
    punpcklbw   m0, m4
    pmaddwd     m0, [pw_ang_table + 11 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5
    punpcklbw   m1, m4
    pmaddwd     m1, [pw_ang_table + 12 * 16]
    pmaddwd     m2, [pw_ang_table + 1 * 16]
    packssdw    m2, m1
    paddw       m2, [pw_16]
    psraw       m2, 5

    TRANSPOSE_4x4
    STORE_4x4
    RET

; intra_pred_ang4_17 (r0 = dst, r1 = dstStride, r2 = srcPix)
; Reference string is [D = byte r2 + 4, C = r2 + 2, B = r2 + 1, A = r2 + 0,
; four bytes at r2 + 9]. Each result uses its own pair offset:
; rows 6 (from A), 12 (from B), 18 (from C), 24 (from D) of pw_ang_table.
cglobal intra_pred_ang4_17, 3,3,5
    movd        m2, [r2]                ;[x x x A]
    movd        m3, [r2 + 1]            ;[x x x B]
    movd        m4, [r2 + 2]            ;[x x x C]
    movd        m0, [r2 + 4]            ;[x x x D]
    punpcklbw   m3, m2                  ;[x x A B]
    punpcklbw   m0, m4                  ;[x x C D]
    punpcklwd   m0, m3                  ;[A B C D]
    movd        m1, [r2 + 9]            ;[4 3 2 1]
    punpckldq   m0, m1                  ;[4 3 2 1 A B C D]
    punpcklbw   m0, m0                  ;[4 4 3 3 2 2 1 1 A A B B C C D D]
    psrldq      m0, 1
    movh        m1, m0                  ;[x 4 4 3 3 2 2 1 1 A A B B C C D]
    psrldq      m0, 2
    movh        m2, m0                  ;[x x x 4 4 3 3 2 2 1 1 A A B B C]
    psrldq      m0, 2
    movh        m3, m0                  ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
    psrldq      m0, 2                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    pxor        m4, m4
    punpcklbw   m3, m4
    pmaddwd     m3, [pw_ang_table + 12 * 16]
    punpcklbw   m0, m4
    pmaddwd     m0, [pw_ang_table + 6 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5
    punpcklbw   m1, m4
    pmaddwd     m1, [pw_ang_table + 24 * 16]
    punpcklbw   m2, m4
    pmaddwd     m2, [pw_ang_table + 18 * 16]
    packssdw    m2, m1
    paddw       m2, [pw_16]
    psraw       m2, 5

    TRANSPOSE_4x4
    STORE_4x4
    RET

; intra_pred_ang4_18 (r0 = dst, r1 = dstStride, r2 = srcPix)
; Pure copy, no interpolation. Builds the byte string
;   [r2+11] [r2+10] [r2+9] [r2+0] [r2+1] [r2+2] [r2+3] [r2+4]
; and stores a 4-byte window of it per row, starting with the bottom row
; (offset 0) and sliding one byte further for each row above it.
cglobal intra_pred_ang4_18, 3,4,2
    mov         r3d, [r2 + 8]           ; bytes r2+8 .. r2+11
    mov         r3b, byte [r2]          ; replace the lowest byte with byte r2 + 0
    bswap       r3d                     ; reverse: [r2+11] [r2+10] [r2+9] [r2+0]
    movd        m0, r3d
    movd        m1, [r2 + 1]
    punpckldq   m0, m1
    lea         r3, [r1 * 3]
    movd        [r0 + r3], m0
    psrldq      m0, 1
    movd        [r0 + r1 * 2], m0
    psrldq      m0, 1
    movd        [r0 + r1], m0
    psrldq      m0, 1
    movd        [r0], m0
    RET

; intra_pred_ang4_19 (r0 = dst, r1 = dstStride, r2 = srcPix)
; Same construction as intra_pred_ang4_17 with the side references taken from
; r2 + 9 / r2 + 10 / r2 + 12 and the four main neighbours from r2 + 1.
cglobal intra_pred_ang4_19, 3,3,5
    movd        m2, [r2]                ;[x x x A]
    movd        m3, [r2 + 9]            ;[x x x B]
    movd        m4, [r2 + 10]           ;[x x x C]
    movd        m0, [r2 + 12]           ;[x x x D]
    punpcklbw   m3, m2                  ;[x x A B]
    punpcklbw   m0, m4                  ;[x x C D]
    punpcklwd   m0, m3                  ;[A B C D]
    movd        m1, [r2 + 1]            ;[4 3 2 1]
    punpckldq   m0, m1                  ;[4 3 2 1 A B C D]
    punpcklbw   m0, m0                  ;[4 4 3 3 2 2 1 1 A A B B C C D D]
    psrldq      m0, 1
    movh        m1, m0                  ;[x 4 4 3 3 2 2 1 1 A A B B C C D]
    psrldq      m0, 2
    movh        m2, m0                  ;[x x x 4 4 3 3 2 2 1 1 A A B B C]
; Continuation of intra_pred_ang4_19: m1 / m2 already hold the pair strings
; starting at D / C; the remaining two are derived here. pw_ang_table rows
; 6, 12, 18, 24 are applied, results rounded with [pw_16] and >> 5, then
; packed to bytes and stored without a transpose.
    psrldq      m0, 2
    movh        m3, m0                  ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
    psrldq      m0, 2                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    pxor        m4, m4
    punpcklbw   m3, m4
    pmaddwd     m3, [pw_ang_table + 12 * 16]
    punpcklbw   m0, m4
    pmaddwd     m0, [pw_ang_table + 6 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5
    punpcklbw   m1, m4
    pmaddwd     m1, [pw_ang_table + 24 * 16]
    punpcklbw   m2, m4
    pmaddwd     m2, [pw_ang_table + 18 * 16]
    packssdw    m2, m1
    paddw       m2, [pw_16]
    psraw       m2, 5

    packuswb    m0, m2
    STORE_4x4
    RET

; intra_pred_ang4_20 (r0 = dst, r1 = dstStride, r2 = srcPix)
; Reference string is [C = byte r2 + 11, B = byte r2 + 10, A = byte r2 + 0,
; four bytes at r2 + 1]. pw_ang_table rows: 11 (pairs from A), 22 and 1
; (pairs from B), 12 (pairs from C). Stored without a transpose.
cglobal intra_pred_ang4_20, 3,3,5
    movd        m2, [r2]                ;[x x x A]
    movd        m1, [r2 + 10]           ;[x x x B]
    punpcklbw   m1, m2                  ;[x x A B]
    movd        m0, [r2 + 10]           ;[x x C x]  (C is the second byte of this load)
    punpcklwd   m0, m1                  ;[A B C x]
    movd        m1, [r2 + 1]            ;[4 3 2 1]
    punpckldq   m0, m1                  ;[4 3 2 1 A B C x]
    psrldq      m0, 1                   ;[x 4 3 2 1 A B C]
    punpcklbw   m0, m0                  ;[x x 4 4 3 3 2 2 1 1 A A B B C C]
    psrldq      m0, 1
    movh        m1, m0                  ;[x x x 4 4 3 3 2 2 1 1 A A B B C]
    psrldq      m0, 2
    movh        m2, m0                  ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
    psrldq      m0, 2                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    pxor        m4, m4
    punpcklbw   m2, m4
    mova        m3, m2
    pmaddwd     m3, [pw_ang_table + 22 * 16]
    punpcklbw   m0, m4
    pmaddwd     m0, [pw_ang_table + 11 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5
    punpcklbw   m1, m4
    pmaddwd     m1, [pw_ang_table + 12 * 16]
    pmaddwd     m2, [pw_ang_table + 1 * 16]
    packssdw    m2, m1
    paddw       m2, [pw_16]
    psraw       m2, 5

    packuswb    m0, m2
    STORE_4x4
    RET

; intra_pred_ang4_21 (r0 = dst, r1 = dstStride, r2 = srcPix)
; Reference string is [C = byte r2 + 12, B = byte r2 + 10, A = byte r2 + 0,
; four bytes at r2 + 1]. pw_ang_table rows: 15 (pairs from A), 30 and 13
; (pairs from B), 28 (pairs from C). Stored without a transpose.
cglobal intra_pred_ang4_21, 3,3,5
    movd        m0, [r2]                ;[x x x A]
    movd        m1, [r2 + 10]           ;[x x x B]
    punpcklbw   m1, m0                  ;[x x A B]
    movd        m0, [r2 + 11]           ;[x x C x]
    punpcklwd   m0, m1                  ;[A B C x]
    movd        m1, [r2 + 1]            ;[4 3 2 1]
    punpckldq   m0, m1                  ;[4 3 2 1 A B C x]
    psrldq      m0, 1                   ;[x 4 3 2 1 A B C]
    punpcklbw   m0, m0                  ;[x x 4 4 3 3 2 2 1 1 A A B B C C]
    psrldq      m0, 1
    movh        m1, m0                  ;[x x x 4 4 3 3 2 2 1 1 A A B B C]
    psrldq      m0, 2
    movh        m2, m0                  ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
    psrldq      m0, 2                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    pxor        m4, m4
    punpcklbw   m2, m4
    mova        m3, m2
    pmaddwd     m3, [pw_ang_table + 30 * 16]
    punpcklbw   m0, m4
    pmaddwd     m0, [pw_ang_table + 15 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5
    punpcklbw   m1, m4
    pmaddwd     m1, [pw_ang_table + 28 * 16]
    pmaddwd     m2, [pw_ang_table + 13 * 16]
    packssdw    m2, m1
    paddw       m2, [pw_16]
    psraw       m2, 5

    packuswb    m0, m2
    STORE_4x4
    RET

; intra_pred_ang4_22 (r0 = dst, r1 = dstStride, r2 = srcPix)
; Reference string is [B = byte r2 + 10, A = byte r2 + 0, four bytes at
; r2 + 1]; pw_ang_table rows 19 and 6 on the pairs from A, 25 and 12 on the
; pairs from B. The movd at r2 - 1 reads one byte in front of srcPix.
cglobal intra_pred_ang4_22, 3,3,4
    movd        m1, [r2 - 1]            ;[x x A x]
    movd        m0, [r2 + 9]            ;[x x B x]
    punpcklbw   m0, m1                  ;[A B x x]
    movd        m1, [r2 + 1]            ;[4 3 2 1]
    punpckldq   m0, m1                  ;[4 3 2 1 A B x x]
    psrldq      m0, 2                   ;[x x 4 3 2 1 A B]
    punpcklbw   m0, m0                  ;[x x x x 4 4 3 3 2 2 1 1 A A B B]
    psrldq      m0, 1
    movh        m2, m0                  ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
    psrldq      m0, 2                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    pxor        m1, m1
    punpcklbw   m0, m1
    mova        m3, m0
    pmaddwd     m3, [pw_ang_table + 6 * 16]
    pmaddwd     m0, [pw_ang_table + 19 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5
    punpcklbw   m2, m1
    mova        m3, m2
    pmaddwd     m3, [pw_ang_table + 12 * 16]
    pmaddwd     m2, [pw_ang_table + 25 * 16]
    packssdw    m2, m3
    paddw       m2, [pw_16]
    psraw       m2, 5

    packuswb    m0, m2
    STORE_4x4
    RET

; intra_pred_ang4_23 (r0 = dst, r1 = dstStride, r2 = srcPix)
; Reference string is [B = byte r2 + 12, A = byte r2 + 0, four bytes at
; r2 + 1]; pw_ang_table rows 23, 14, 5 on the pairs from A and row 28 on the
; pairs from B. The movd at r2 - 1 reads one byte in front of srcPix.
cglobal intra_pred_ang4_23, 3,3,5
    movd        m1, [r2 - 1]            ;[x x A x]
    movd        m2, [r2 + 1]            ;[4 3 2 1]
    movd        m0, [r2 + 11]           ;[x x B x]
    punpcklbw   m0, m1                  ;[x x x x A B x x]
    punpckldq   m0, m2                  ;[4 3 2 1 A B x x]
    psrldq      m0, 2                   ;[x x 4 3 2 1 A B]
    punpcklbw   m0, m0
    psrldq      m0, 1
    mova        m3, m0                  ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
    psrldq      m0, 2                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    pxor        m1, m1
    punpcklbw   m0, m1
    mova        m4, m0
    mova        m2, m0
    pmaddwd     m4, [pw_ang_table + 14 * 16]
    pmaddwd     m0, [pw_ang_table + 23 * 16]
    packssdw    m0, m4
    paddw       m0, [pw_16]
    psraw       m0, 5
    punpcklbw   m3, m1
    pmaddwd     m3, [pw_ang_table + 28 * 16]
    pmaddwd     m2, [pw_ang_table + 5 * 16]
    packssdw    m2, m3
    paddw       m2, [pw_16]
    psraw       m2, 5

    packuswb    m0, m2
    STORE_4x4
    RET

; intra_pred_ang4_25 (r0 = dst, r1 = dstStride, r2 = srcPix)
; Reference string is [A = byte r2 + 0, four bytes at r2 + 1]; all results use
; the same pairs with pw_ang_table rows 30, 28, 26, 24. The 8-byte load at
; r2 - 7 reads seven bytes in front of srcPix (only its last byte is used).
cglobal intra_pred_ang4_25, 3,3,5
    movd        m1, [r2 + 1]            ;[4 3 2 1]
    movh        m0, [r2 - 7]            ;[A x x x x x x x]
    punpcklbw   m1, m1                  ;[4 4 3 3 2 2 1 1]
    punpcklqdq  m0, m1                  ;[4 4 3 3 2 2 1 1 A x x x x x x x]
    psrldq      m0, 7                   ;[x x x x x x x x 4 3 3 2 2 1 1 A]

    pxor        m1, m1
    punpcklbw   m0, m1
    mova        m2, m0
    mova        m3, m0
    mova        m4, m2
    pmaddwd     m3, [pw_ang_table + 28 * 16]
    pmaddwd     m0, [pw_ang_table + 30 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5
    pmaddwd     m4, [pw_ang_table + 24 * 16]
    pmaddwd     m2, [pw_ang_table + 26 * 16]
    packssdw    m2, m4
    paddw       m2, [pw_16]
    psraw       m2, 5

    packuswb    m0, m2
    STORE_4x4
    RET

; intra_pred_ang4_26 (r0 = dst, r1 = dstStride, r2 = srcPix, r4m = bFilter)
; Copies the four bytes at r2 + 1 into every row. When r4m is non-zero the
; first pixel of each row y is then replaced by
;   clip8(byte[r2 + 1] + ((byte[r2 + 9 + y] - byte[r2]) >> 1)).
; Note: r2 is reused as a scratch register in the filter path.
cglobal intra_pred_ang4_26, 3,4,4
    movd        m0, [r2 + 1]            ;[4 3 2 1] (movd loads four bytes)

    ; store
    movd        [r0], m0
    movd        [r0 + r1], m0
    movd        [r0 + r1 * 2], m0
    lea         r3, [r1 * 3]
    movd        [r0 + r3], m0

    ; filter
    cmp         r4m, byte 0
    jz          .quit

    pxor        m3, m3
    punpcklbw   m0, m3
    pshuflw     m0, m0, 0x00            ; broadcast byte r2 + 1 over the low four words
    movd        m2, [r2]
    punpcklbw   m2, m3
    pshuflw     m2, m2, 0x00            ; broadcast byte r2 + 0 over the low four words
    movd        m1, [r2 + 9]
    punpcklbw   m1, m3
    psubw       m1, m2
    psraw       m1, 1
    paddw       m0, m1
    packuswb    m0, m0

    movd        r2d, m0                 ; four filtered bytes, one per row
    mov         [r0], r2b
    shr         r2, 8
    mov         [r0 + r1], r2b
    shr         r2, 8
    mov         [r0 + r1 * 2], r2b
    shr         r2, 8
    mov         [r0 + r3], r2b

.quit:
    RET

; intra_pred_ang4_27 (r0 = dst, r1 = dstStride, r2 = srcPix)
; Neighbours at r2 + 1, one shared pair string, pw_ang_table rows 2, 4, 6, 8;
; stored without a transpose.
cglobal intra_pred_ang4_27, 3,3,5
    movh        m0, [r2 + 1]            ;[8 7 6 5 4 3 2 1]
    punpcklbw   m0, m0
    psrldq      m0, 1                   ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    pxor        m1, m1
    punpcklbw   m0, m1
    mova        m2, m0
    mova        m3, m0
    mova        m4, m2
    pmaddwd     m3, [pw_ang_table + 4 * 16]
    pmaddwd     m0, [pw_ang_table + 2 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5
    pmaddwd     m4, [pw_ang_table + 8 * 16]
    pmaddwd     m2, [pw_ang_table + 6 * 16]
    packssdw    m2, m4
    paddw       m2, [pw_16]
    psraw       m2, 5

    packuswb    m0, m2
    STORE_4x4
    RET

; intra_pred_ang4_28 (r0 = dst, r1 = dstStride, r2 = srcPix)
; Same structure as intra_pred_ang4_27 with pw_ang_table rows 5, 10, 15, 20.
cglobal intra_pred_ang4_28, 3,3,5
    movh        m0, [r2 + 1]            ;[8 7 6 5 4 3 2 1]
    punpcklbw   m0, m0
    psrldq      m0, 1                   ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    pxor        m1, m1
    punpcklbw   m0, m1
    mova        m2, m0
    mova        m3, m0
    mova        m4, m2
    pmaddwd     m3, [pw_ang_table + 10 * 16]
    pmaddwd     m0, [pw_ang_table + 5 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5
    pmaddwd     m4, [pw_ang_table + 20 * 16]
    pmaddwd     m2, [pw_ang_table + 15 * 16]
    packssdw    m2, m4
    paddw       m2, [pw_16]
    psraw       m2, 5

    packuswb    m0, m2
    STORE_4x4
    RET

; intra_pred_ang4_29 (r0 = dst, r1 = dstStride, r2 = srcPix)
; Neighbours at r2 + 1; pw_ang_table rows 9, 18, 27 on the pairs starting at
; neighbour 1 and row 4 on the pairs starting at neighbour 2.
cglobal intra_pred_ang4_29, 3,3,5
    movh        m3, [r2 + 1]            ;[8 7 6 5 4 3 2 1]
    punpcklbw   m3, m3
    psrldq      m3, 1
    movh        m0, m3                  ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    psrldq      m3, 2                   ;[x x x x x x x x 6 5 5 4 4 3 3 2]

    pxor        m1, m1
    punpcklbw   m0, m1
    mova        m4, m0
    mova        m2, m0
    pmaddwd     m4, [pw_ang_table + 18 * 16]
    pmaddwd     m0, [pw_ang_table + 9 * 16]
    packssdw    m0, m4
    paddw       m0, [pw_16]
    psraw       m0, 5
    punpcklbw   m3, m1
    pmaddwd     m3, [pw_ang_table + 4 * 16]
; Continuation of intra_pred_ang4_29: last weight row, rounding, pack, store.
    pmaddwd     m2, [pw_ang_table + 27 * 16]
    packssdw    m2, m3
    paddw       m2, [pw_16]
    psraw       m2, 5

    packuswb    m0, m2
    STORE_4x4
    RET

; intra_pred_ang4_30 (r0 = dst, r1 = dstStride, r2 = srcPix)
; 4x4 angular prediction from the 8 neighbours at r2 + 1. Adjacent-neighbour
; pairs are widened to words and multiplied (pmaddwd) by pw_ang_table rows
; 13, 26 (pairs from neighbour 1) and 7, 20 (pairs from neighbour 2); results
; are rounded with [pw_16], shifted right by 5, packed to bytes and stored via
; STORE_4x4 (a macro defined outside this chunk).
cglobal intra_pred_ang4_30, 3,3,4
    movh        m2, [r2 + 1]            ;[8 7 6 5 4 3 2 1]
    punpcklbw   m2, m2
    psrldq      m2, 1
    movh        m0, m2                  ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    psrldq      m2, 2                   ;[x x x 8 8 7 7 6 6 5 5 4 4 3 3 2]

    pxor        m1, m1
    punpcklbw   m0, m1
    mova        m3, m0
    pmaddwd     m3, [pw_ang_table + 26 * 16]
    pmaddwd     m0, [pw_ang_table + 13 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5
    punpcklbw   m2, m1
    mova        m3, m2
    pmaddwd     m3, [pw_ang_table + 20 * 16]
    pmaddwd     m2, [pw_ang_table + 7 * 16]
    packssdw    m2, m3
    paddw       m2, [pw_16]
    psraw       m2, 5

    packuswb    m0, m2
    STORE_4x4
    RET

; intra_pred_ang4_31 (r0 = dst, r1 = dstStride, r2 = srcPix)
; Neighbours at r2 + 1; pw_ang_table rows 17 (pairs from neighbour 1),
; 2 and 19 (pairs from neighbour 2), 4 (pairs from neighbour 3).
cglobal intra_pred_ang4_31, 3,3,5
    movh        m3, [r2 + 1]            ;[8 7 6 5 4 3 2 1]
    punpcklbw   m3, m3
    psrldq      m3, 1
    mova        m0, m3                  ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    psrldq      m3, 2
    mova        m2, m3                  ;[x x x x x x x x 6 5 5 4 4 3 3 2]
    psrldq      m3, 2                   ;[x x x x x x x x 7 6 6 5 5 4 4 3]

    pxor        m1, m1
    punpcklbw   m2, m1
    mova        m4, m2
    pmaddwd     m4, [pw_ang_table + 2 * 16]
    punpcklbw   m0, m1
    pmaddwd     m0, [pw_ang_table + 17 * 16]
    packssdw    m0, m4
    paddw       m0, [pw_16]
    psraw       m0, 5
    punpcklbw   m3, m1
    pmaddwd     m3, [pw_ang_table + 4 * 16]
    pmaddwd     m2, [pw_ang_table + 19 * 16]
    packssdw    m2, m3
    paddw       m2, [pw_16]
    psraw       m2, 5

    packuswb    m0, m2
    STORE_4x4
    RET

; intra_pred_ang4_32 (r0 = dst, r1 = dstStride, r2 = srcPix)
; Neighbours at r2 + 1; pw_ang_table rows 21 (pairs from neighbour 1),
; 10 and 31 (pairs from neighbour 2), 20 (pairs from neighbour 3).
cglobal intra_pred_ang4_32, 3,3,5
    movh        m1, [r2 + 1]            ;[8 7 6 5 4 3 2 1]
    punpcklbw   m1, m1
    psrldq      m1, 1
    movh        m0, m1                  ;[x x x x x x x x 5 4 4 3 3 2 2 1]
    psrldq      m1, 2
    movh        m2, m1                  ;[x x x x x x x x 6 5 5 4 4 3 3 2]
    psrldq      m1, 2                   ;[x x x x x x x x 7 6 6 5 5 4 4 3]

    pxor        m4, m4
    punpcklbw   m2, m4
    mova        m3, m2
    pmaddwd     m3, [pw_ang_table + 10 * 16]
    punpcklbw   m0, m4
    pmaddwd     m0, [pw_ang_table + 21 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5
    punpcklbw   m1, m4
    pmaddwd     m1, [pw_ang_table + 20 * 16]
    pmaddwd     m2, [pw_ang_table + 31 * 16]
    packssdw    m2, m1
    paddw       m2, [pw_16]
    psraw       m2, 5

    packuswb    m0, m2
    STORE_4x4
    RET

; intra_pred_ang4_33 (r0 = dst, r1 = dstStride, r2 = srcPix)
; Neighbours at r2 + 1; every result uses its own pair offset (1..4).
cglobal intra_pred_ang4_33, 3,3,5
    movh        m3, [r2 + 1]            ; [8 7 6 5 4 3 2 1]
    punpcklbw   m3, m3
    psrldq      m3, 1
    movh        m0, m3                  ;[x x x x x x x x 5 4 4 3 3 2 2 1]
; Continuation of intra_pred_ang4_33: m0 already holds the pairs starting at
; neighbour 1; the strings starting at neighbours 2, 3 and 4 are derived here.
; pw_ang_table rows 26, 20, 14, 8 are applied in that order, each result is
; rounded with [pw_16] and shifted right by 5, then packed and stored.
    psrldq      m3, 2
    movh        m1, m3                  ;[x x x x x x x x 6 5 5 4 4 3 3 2]
    psrldq      m3, 2
    movh        m2, m3                  ;[x x x x x x x x 7 6 6 5 5 4 4 3]
    psrldq      m3, 2                   ;[x x x x x x x x 8 7 7 6 6 5 5 4]

    pxor        m4, m4
    punpcklbw   m1, m4
    pmaddwd     m1, [pw_ang_table + 20 * 16]
    punpcklbw   m0, m4
    pmaddwd     m0, [pw_ang_table + 26 * 16]
    packssdw    m0, m1
    paddw       m0, [pw_16]
    psraw       m0, 5
    punpcklbw   m3, m4
    pmaddwd     m3, [pw_ang_table + 8 * 16]
    punpcklbw   m2, m4
    pmaddwd     m2, [pw_ang_table + 14 * 16]
    packssdw    m2, m3
    paddw       m2, [pw_16]
    psraw       m2, 5

    packuswb    m0, m2
    STORE_4x4
    RET

;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;
; 4x4 variant. r0 = dst, r1 = dstStride, r2 = srcPix, r4 = bFilter.
; dc_val = (sum of the 4 bytes at srcPix + 1 and the 4 bytes at srcPix + 9,
; plus 4) >> 3, replicated to every pixel; edge smoothing is applied
; afterwards when bFilter is non-zero.
;---------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_dc4, 5,5,3
    inc         r2                      ; r2 = srcPix + 1 for the rest of the routine
    pxor        m0, m0
    movd        m1, [r2]
    movd        m2, [r2 + 8]
    punpckldq   m1, m2
    psadbw      m1, m0              ; m1 = sum

    ; Flags are set from bFilter here, before r4d is reused for dc_val; the
    ; matching jz comes after the stores. The instructions in between are SIMD
    ; ops, movd and lea, which leave EFLAGS untouched.
    test        r4d, r4d

    pmulhrsw    m1, [pw_4096]       ; m1 = (sum + 4) / 8
    movd        r4d, m1             ; r4d = dc_val
    pshufb      m1, m0              ; m1 = byte [dc_val ...]
; Continuation of intra_pred_dc4. On entry to this part: r2 = srcPix + 1,
; m1 = dc_val in every byte, r4d = dc_val, and ZF still reflects
; "bFilter == 0" from the earlier test instruction.

    ; store DC 4x4
    lea         r3, [r1 * 3]
    movd        [r0], m1
    movd        [r0 + r1], m1
    movd        [r0 + r1 * 2], m1
    movd        [r0 + r3], m1

    ; do DC filter
    jz         .end                 ; bFilter == 0: plain DC block is final
    lea         r3d, [r4d * 2 + 2]  ; r3d = DC * 2 + 2
    add         r4d, r3d            ; r4d = DC * 3 + 2
    movd        m1, r4d
    pshuflw     m1, m1, 0           ; m1 = pixDCx3
    pshufd      m1, m1, 0

    ; filter top
    ; low 4 words: bytes at r2 + 0..3 (top row); next 4 words: bytes at
    ; r2 + 9..12 (left column, rows 1 and below). Each becomes (p + 3*DC + 2) >> 2.
    movd        m2, [r2]
    movd        m0, [r2 + 9]
    punpckldq   m2, m0
    pmovzxbw    m2, m2
    paddw       m2, m1
    psraw       m2, 2
    packuswb    m2, m2
    movd        [r0], m2            ; overwrite top-left pixel, we will update it later

    ; filter top-left: (byte[r2 + 8] + byte[r2] + 2*DC + 2) >> 2
    movzx       r4d, byte [r2 + 8]
    add         r3d, r4d
    movzx       r4d, byte [r2]
    add         r3d, r4d
    shr         r3d, 2
    mov         [r0], r3b

    ; filter left: first pixel of rows 1..3 from bytes 4..6 of m2
    add         r0, r1
    pextrb      [r0], m2, 4
    pextrb      [r0 + r1], m2, 5
    pextrb      [r0 + r1 * 2], m2, 6

.end:
    RET

;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;
; 8x8 variant. r0 = dst, r1 = dstStride, r2 = srcPix, r4 = bFilter.
; dc_val = (sum of the 8 bytes at srcPix + 1 and the 8 bytes at srcPix + 17,
; plus 8) >> 4.
;---------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_dc8, 5, 7, 3
    lea         r3, [r2 + 17]           ; r3 = second group of 8 neighbours
    inc         r2                      ; r2 = srcPix + 1
    pxor        m0, m0
    movh        m1, [r2]
    movh        m2, [r3]
    punpcklqdq  m1, m2
    psadbw      m1, m0                  ; one partial sum per 8-byte half
    pshufd      m2, m1, 2
    paddw       m1, m2                  ; add the two halves

    movd        r5d, m1
    add         r5d, 8
    shr         r5d, 4                  ; sum = sum / 16
    movd        m1, r5d
    pshufb      m1, m0                  ; m1 = byte [dc_val ...]
; Continuation of intra_pred_dc8. On entry to this part: r2 = srcPix + 1,
; r3 = srcPix + 17, r5d = dc_val, m1 = dc_val in every byte.

    ; ZF = (bFilter == 0); consumed by the jz after the stores. Only mov, movh
    ; and lea execute in between, none of which modify EFLAGS.
    test        r4d, r4d

    ; store DC 8x8
    mov         r6, r0                  ; keep the original dst for the filter pass
    movh        [r0], m1
    movh        [r0 + r1], m1
    lea         r0, [r0 + r1 * 2]
    movh        [r0], m1
    movh        [r0 + r1], m1
    lea         r0, [r0 + r1 * 2]
    movh        [r0], m1
    movh        [r0 + r1], m1
    lea         r0, [r0 + r1 * 2]
    movh        [r0], m1
    movh        [r0 + r1], m1

    ; Do DC Filter
    jz          .end
    lea         r4d, [r5d * 2 + 2]      ; r4d = DC * 2 + 2
    add         r5d, r4d                ; r5d = DC * 3 + 2
    movd        m1, r5d
    pshuflw     m1, m1, 0               ; m1 = pixDCx3
    pshufd      m1, m1, 0

    ; filter top: row 0 = (byte[r2 + x] + 3*DC + 2) >> 2
    pmovzxbw    m2, [r2]
    paddw       m2, m1
    psraw       m2, 2
    packuswb    m2, m2
    movh        [r6], m2

    ; filter top-left: (byte[r3] + byte[r2] + 2*DC + 2) >> 2
    movzx       r5d, byte [r3]
    add         r4d, r5d
    movzx       r3d, byte [r2]
    add         r3d, r4d
    shr         r3d, 2
    mov         [r6], r3b

    ; filter left: first pixel of rows 1..7 from the bytes at r2 + 17 onwards
    add         r6, r1                  ; r6 = row 1
    pmovzxbw    m2, [r2 + 17]
    paddw       m2, m1
    psraw       m2, 2
    packuswb    m2, m2
    pextrb      [r6], m2, 0             ; row 1
    pextrb      [r6 + r1], m2, 1        ; row 2
    pextrb      [r6 + 2 * r1], m2, 2    ; row 3
    lea         r6, [r6 + r1 * 2]       ; r6 = row 3
    pextrb      [r6 + r1], m2, 3        ; row 4
    pextrb      [r6 + r1 * 2], m2, 4    ; row 5
    pextrb      [r6 + r1 * 4], m2, 6    ; row 7
    lea         r1, [r1 * 3]            ; r1 is clobbered: now 3 * stride
    pextrb      [r6 + r1], m2, 5        ; row 6

.end:
    RET

;--------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;
; 16x16 variant. r0 = dst, r1 = dstStride, r2 = srcPix, r4 = bFilter.
; dc_val = (sum of the 16 bytes at srcPix + 1 and the 16 bytes at srcPix + 33,
; plus 16) >> 5.
;--------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_dc16, 5, 7, 4
    lea         r3, [r2 + 33]           ; r3 = second group of 16 neighbours
    inc         r2                      ; r2 = srcPix + 1
    pxor        m0, m0
    movu        m1, [r2]
    movu        m2, [r3]
    psadbw      m1, m0
    psadbw      m2, m0
    paddw       m1, m2
    pshufd      m2, m1, 2
    paddw       m1, m2                  ; fold the two 64-bit partial sums

    movd        r5d, m1
    add         r5d, 16
    shr         r5d, 5                  ; sum = sum / 32
    movd        m1, r5d
    pshufb      m1, m0                  ; m1 = byte [dc_val ...]
; Continuation of intra_pred_dc16. On entry to this part: r2 = srcPix + 1,
; r3 = srcPix + 33, r5d = dc_val, m1 = dc_val in every byte.

    ; ZF = (bFilter == 0); consumed by the jz after the stores. Only mov, movu
    ; and lea execute in between, none of which modify EFLAGS.
    test        r4d, r4d

    ; store DC 16x16
    mov         r6, r0                  ; keep the original dst for the filter pass
    movu        [r0], m1
    movu        [r0 + r1], m1
    lea         r0, [r0 + r1 * 2]
    movu        [r0], m1
    movu        [r0 + r1], m1
    lea         r0, [r0 + r1 * 2]
    movu        [r0], m1
    movu        [r0 + r1], m1
    lea         r0, [r0 + r1 * 2]
    movu        [r0], m1
    movu        [r0 + r1], m1
    lea         r0, [r0 + r1 * 2]
    movu        [r0], m1
    movu        [r0 + r1], m1
    lea         r0, [r0 + r1 * 2]
    movu        [r0], m1
    movu        [r0 + r1], m1
    lea         r0, [r0 + r1 * 2]
    movu        [r0], m1
    movu        [r0 + r1], m1
    lea         r0, [r0 + r1 * 2]
    movu        [r0], m1
    movu        [r0 + r1], m1

    ; Do DC Filter
    jz          .end
    lea         r4d, [r5d * 2 + 2]      ; r4d = DC * 2 + 2
    add         r5d, r4d                ; r5d = DC * 3 + 2
    movd        m1, r5d
    pshuflw     m1, m1, 0               ; m1 = pixDCx3
    pshufd      m1, m1, 0

    ; filter top: row 0 = (byte[r2 + x] + 3*DC + 2) >> 2, eight pixels at a time
    pmovzxbw    m2, [r2]
    paddw       m2, m1
    psraw       m2, 2
    packuswb    m2, m2
    movh        [r6], m2
    pmovzxbw    m3, [r2 + 8]
    paddw       m3, m1
    psraw       m3, 2
    packuswb    m3, m3
    movh        [r6 + 8], m3

    ; filter top-left: (byte[r3] + byte[r2] + 2*DC + 2) >> 2
    movzx       r5d, byte [r3]
    add         r4d, r5d
    movzx       r3d, byte [r2]
    add         r3d, r4d
    shr         r3d, 2
    mov         [r6], r3b

    ; filter left: first pixel of rows 1..15 from the bytes at r2 + 33 onwards
    add         r6, r1                  ; r6 = row 1
    pmovzxbw    m2, [r2 + 33]
    paddw       m2, m1
    psraw       m2, 2
    packuswb    m2, m2
    pextrb      [r6], m2, 0             ; row 1
    pextrb      [r6 + r1], m2, 1        ; row 2
    pextrb      [r6 + r1 * 2], m2, 2    ; row 3
    lea         r6, [r6 + r1 * 2]
    pextrb      [r6 + r1], m2, 3        ; row 4
    pextrb      [r6 + r1 * 2], m2, 4    ; row 5
    lea         r6, [r6 + r1 * 2]
    pextrb      [r6 + r1], m2, 5        ; row 6
    pextrb      [r6 + r1 * 2], m2, 6    ; row 7
    lea         r6, [r6 + r1 * 2]
    pextrb      [r6 + r1], m2, 7        ; row 8

    pmovzxbw    m3, [r2 + 41]
    paddw       m3, m1
    psraw       m3, 2
    packuswb    m3, m3
    pextrb      [r6 + r1 * 2], m3, 0    ; row 9
    lea         r6, [r6 + r1 * 2]
    pextrb      [r6 + r1], m3, 1        ; row 10
    pextrb      [r6 + r1 * 2], m3, 2    ; row 11
    lea         r6, [r6 + r1 * 2]
    pextrb      [r6 + r1], m3, 3        ; row 12
    pextrb      [r6 + r1 * 2], m3, 4    ; row 13
    lea         r6, [r6 + r1 * 2]
    pextrb      [r6 + r1], m3, 5        ; row 14
    pextrb      [r6 + r1 * 2], m3, 6    ; row 15

.end:
    RET

;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;
; 32x32 variant (SSE4). Only three arguments are loaded (r0 = dst,
; r1 = dstStride, r2 = srcPix); there is no filter path in this routine.
; dc_val = (sum of the 32 bytes at srcPix + 1 and the 32 bytes at srcPix + 65,
; plus 32) >> 6.
;---------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_dc32, 3, 5, 5
    lea         r3, [r2 + 65]           ; r3 = second group of 32 neighbours
    inc         r2                      ; r2 = srcPix + 1
    pxor        m0, m0
    movu        m1, [r2]
    movu        m2, [r2 + 16]
    movu        m3, [r3]
    movu        m4, [r3 + 16]
    psadbw      m1, m0
    psadbw      m2, m0
    psadbw      m3, m0
    psadbw      m4, m0
    paddw       m1, m2
    paddw       m3, m4
    paddw       m1, m3
    pshufd      m2, m1, 2
    paddw       m1, m2                  ; fold the two 64-bit partial sums

    movd        r4d, m1
    add         r4d, 32
    shr         r4d, 6                  ; sum = sum / 64
    movd        m1, r4d
    pshufb      m1, m0                  ; m1 = byte [dc_val ...]

%rep 2
    ; store DC: each repetition writes 16 rows of 32 bytes (two 16-byte stores per row)
    movu        [r0], m1
    movu        [r0 + r1], m1
    movu        [r0 + 16], m1
    movu        [r0 + r1 + 16],m1
    lea         r0, [r0 + 2 * r1]
    movu        [r0], m1
    movu        [r0 + r1], m1
    movu        [r0 + 16], m1
    movu        [r0 + r1 + 16],m1
    lea         r0, [r0 + 2 * r1]
    movu        [r0], m1
    movu        [r0 + r1], m1
    movu        [r0 + 16], m1
    movu        [r0 + r1 + 16],m1
    lea         r0, [r0 + 2 * r1]
    movu        [r0], m1
    movu        [r0 + r1], m1
    movu        [r0 + 16], m1
    movu        [r0 + r1 + 16],m1
    lea         r0, [r0 + 2 * r1]
    movu        [r0], m1
    movu        [r0 + r1], m1
    movu        [r0 + 16], m1
    movu        [r0 + r1 + 16],m1
    lea         r0, [r0 + 2 * r1]
    movu        [r0], m1
    movu        [r0 + r1], m1
    movu        [r0 + 16], m1
    movu        [r0 + r1 + 16],m1
    lea         r0, [r0 + 2 * r1]
    movu        [r0], m1
    movu        [r0 + r1], m1
    movu        [r0 + 16], m1
    movu        [r0 + r1 + 16],m1
    lea         r0, [r0 + 2 * r1]
    movu        [r0], m1
    movu        [r0 + r1], m1
    movu        [r0 + 16], m1
    movu        [r0 + r1 + 16],m1
    lea         r0, [r0 + 2 * r1]
%endrep

    RET

;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;
; 32x32 variant (AVX2, x86-64 builds only). r0 = dst, r1 = dstStride,
; r2 = srcPix, r3 = 3 * dstStride. No filter path.
;---------------------------------------------------------------------------------------------
%if ARCH_X86_64 == 1
INIT_YMM avx2
cglobal intra_pred_dc32, 3, 4, 3
    lea         r3, [r1 * 3]
    pxor        m0, m0
    movu        m1, [r2 + 1]            ; 32 neighbours at srcPix + 1
    movu        m2, [r2 + 65]           ; 32 neighbours at srcPix + 65
    psadbw      m1, m0
    psadbw      m2, m0
    paddw       m1, m2
    vextracti128 xm2, m1, 1             ; fold the upper 128-bit lane onto the lower one
    paddw       m1, m2
    pshufd      m2, m1, 2
    paddw       m1, m2

    pmulhrsw    m1, [pw_512]            ; sum = (sum + 32) / 64
    vpbroadcastb m1, xm1                ; m1 = byte [dc_val ...]
; Continuation of the AVX2 intra_pred_dc32: m1 holds dc_val in all 32 bytes,
; r3 = 3 * stride. Eight groups of four 32-byte row stores = 32 rows.
    movu        [r0 + r1 * 0], m1
    movu        [r0 + r1 * 1], m1
    movu        [r0 + r1 * 2], m1
    movu        [r0 + r3 * 1], m1
    lea         r0, [r0 + 4 * r1]
    movu        [r0 + r1 * 0], m1
    movu        [r0 + r1 * 1], m1
    movu        [r0 + r1 * 2], m1
    movu        [r0 + r3 * 1], m1
    lea         r0, [r0 + 4 * r1]
    movu        [r0 + r1 * 0], m1
    movu        [r0 + r1 * 1], m1
    movu        [r0 + r1 * 2], m1
    movu        [r0 + r3 * 1], m1
    lea         r0, [r0 + 4 * r1]
    movu        [r0 + r1 * 0], m1
    movu        [r0 + r1 * 1], m1
    movu        [r0 + r1 * 2], m1
    movu        [r0 + r3 * 1], m1
    lea         r0, [r0 + 4 * r1]
    movu        [r0 + r1 * 0], m1
    movu        [r0 + r1 * 1], m1
    movu        [r0 + r1 * 2], m1
    movu        [r0 + r3 * 1], m1
    lea         r0, [r0 + 4 * r1]
    movu        [r0 + r1 * 0], m1
    movu        [r0 + r1 * 1], m1
    movu        [r0 + r1 * 2], m1
    movu        [r0 + r3 * 1], m1
    lea         r0, [r0 + 4 * r1]
    movu        [r0 + r1 * 0], m1
    movu        [r0 + r1 * 1], m1
    movu        [r0 + r1 * 2], m1
    movu        [r0 + r3 * 1], m1
    lea         r0, [r0 + 4 * r1]
    movu        [r0 + r1 * 0], m1
    movu        [r0 + r1 * 1], m1
    movu        [r0 + r1 * 2], m1
    movu        [r0 + r3 * 1], m1
    RET
%endif ;; ARCH_X86_64 == 1

;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;
; 4x4 variant. r0 = dst, r1 = dstStride, r2 = srcPix; only three arguments
; are loaded. m3 is a running per-column accumulator: it starts as
; (x + 1) * topRight + 3 * above[x] + bottomLeft + 4 and has
; (bottomLeft - above[x]) added after every row. Each row adds
; left[y] * [pw_planar4_0] and shifts right by 3.
; NOTE(review): pw_planar4_0 presumably holds the (blkSize - 1 - x) weights -
; the table is defined outside this chunk.
;---------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_planar4, 3,3,7
    pmovzxbw    m1, [r2 + 1]            ; 8 bytes from srcPix + 1 as words (above[0..7])
    pmovzxbw    m2, [r2 + 9]            ; 8 bytes from srcPix + 9 as words (left[0..7])
    pshufhw     m3, m1, 0               ; topRight
    pshufd      m3, m3, 0xAA            ; broadcast word 4 (above[4]) to all words
    pshufhw     m4, m2, 0               ; bottomLeft
    pshufd      m4, m4, 0xAA            ; broadcast word 4 (left[4]) to all words
    pmullw      m3, [multi_2Row]        ; (x + 1) * topRight
    pmullw      m0, m1, [pw_3]          ; (blkSize - 1 - y) * above[x]
    mova        m6, [pw_planar4_0]
    paddw       m3, [pw_4]
    paddw       m3, m4
    paddw       m3, m0
    psubw       m4, m1                  ; per-row increment: bottomLeft - above[x]

    ; row 0
    pshuflw     m5, m2, 0
    pmullw      m5, m6
    paddw       m5, m3
    paddw       m3, m4
    psraw       m5, 3
    packuswb    m5, m5
    movd        [r0], m5

    ; row 1
    pshuflw     m5, m2, 01010101b
    pmullw      m5, m6
    paddw       m5, m3
    paddw       m3, m4
    psraw       m5, 3
    packuswb    m5, m5
    movd        [r0 + r1], m5
    lea         r0, [r0 + 2 * r1]

    ; row 2
    pshuflw     m5, m2, 10101010b
    pmullw      m5, m6
    paddw       m5, m3
    paddw       m3, m4
    psraw       m5, 3
    packuswb    m5, m5
    movd        [r0], m5

    ; row 3
    pshuflw     m5, m2, 11111111b
    pmullw      m5, m6
    paddw       m5, m3
    paddw       m3, m4
    psraw       m5, 3
    packuswb    m5, m5
    movd        [r0 + r1], m5
    RET

;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;
; 8x8 variant. Same running-accumulator scheme as the 4x4 routine: m3 starts
; as (x + 1) * topRight + 7 * above[x] + bottomLeft + 8 and grows by
; (bottomLeft - above[x]) per row; each row adds left[y] * m6 and shifts
; right by 4.
;---------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_planar8, 3,3,7
    pmovzxbw    m1, [r2 + 1]            ; above[0..7] as words
    pmovzxbw    m2, [r2 + 17]           ; left[0..7] as words
    movd        m3, [r2 + 9]            ; topRight   = above[8];
    movd        m4, [r2 + 25]           ; bottomLeft = left[8];

    pxor        m0, m0
    pshufb      m3, m0                  ; broadcast byte 0
    pshufb      m4, m0
    punpcklbw   m3, m0                  ; v_topRight
    punpcklbw   m4, m0                  ; v_bottomLeft
    pmullw      m3, [multiL]            ; (x + 1) * topRight
    pmullw      m0, m1, [pw_7]          ; (blkSize - 1 - y) * above[x]
    mova        m6, [pw_planar16_mul + mmsize]
    paddw       m3, [pw_8]
    paddw       m3, m4
    paddw       m3, m0
    psubw       m4, m1                  ; per-row increment: bottomLeft - above[x]

; INTRA_PRED_PLANAR8 y: emit row y. Broadcasts word y of m2 (left[y]) to all
; lanes, multiplies by m6, adds the accumulator m3, advances m3 by m4,
; shifts by 4, stores 8 bytes and steps r0 to the next row.
%macro INTRA_PRED_PLANAR8 1
%if (%1 < 4)
    pshuflw     m5, m2, 0x55 * %1
    pshufd      m5, m5, 0
%else
    pshufhw     m5, m2, 0x55 * (%1 - 4)
    pshufd      m5, m5, 0xAA
%endif
    pmullw      m5, m6
    paddw       m5, m3
    paddw       m3, m4
    psraw       m5, 4
    packuswb    m5, m5
    movh        [r0], m5
    lea         r0, [r0 + r1]
%endmacro

    INTRA_PRED_PLANAR8 0
    INTRA_PRED_PLANAR8 1
    INTRA_PRED_PLANAR8 2
    INTRA_PRED_PLANAR8 3
    INTRA_PRED_PLANAR8 4
    INTRA_PRED_PLANAR8 5
    INTRA_PRED_PLANAR8 6
    INTRA_PRED_PLANAR8 7
    RET

;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;
; 16x16 variant (SSE4). Columns 0..7 are accumulated in m3, columns 8..15 in
; m4; both start as (x + 1) * topRight + 15 * above[x] + bottomLeft + 16.
;---------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_planar16, 3,3,8
    pmovzxbw    m2, [r2 + 1]            ; above[0..7] as words
    pmovzxbw    m7, [r2 + 9]            ; above[8..15] as words

    movd        m3, [r2 + 17]           ; topRight   = above[16]
    movd        m6, [r2 + 49]           ; bottomLeft = left[16]

    pxor        m0, m0
    pshufb      m3, m0
    pshufb      m6, m0
    punpcklbw   m3, m0                  ; v_topRight
    punpcklbw   m6, m0                  ; v_bottomLeft

    pmullw      m4, m3, [multiH]        ; (x + 1) * topRight
    pmullw      m3, [multiL]            ; (x + 1) * topRight
    pmullw      m1, m2, [pw_15]         ; (blkSize - 1 - y) * above[x]
    pmullw      m5, m7, [pw_15]         ; (blkSize - 1 - y) * above[x]
    paddw       m4, [pw_16]
    paddw       m3, [pw_16]
    paddw       m4, m6
    paddw       m3, m6
    paddw       m4, m5
; Continuation of the SSE4 intra_pred_planar16. On entry to this part:
; m2 / m7 = above[0..7] / above[8..15] as words, m6 = bottomLeft in every
; word, m1 = 15 * above[0..7], and m3 / m4 are the low / high column
; accumulators (m4 is complete, m3 still lacks its 15 * above term).
    paddw       m3, m1
    psubw       m1, m6, m7              ; per-row increment for columns 8..15
    psubw       m6, m2                  ; per-row increment for columns 0..7

    pmovzxbw    m2, [r2 + 33]           ; left[0..7] as words
    pmovzxbw    m7, [r2 + 41]           ; left[8..15] as words

; INTRA_PRED_PLANAR16 y: emit row y. Broadcasts left[y] (word y of m2 or m7)
; to all lanes, multiplies by the two halves of pw_planar16_mul, adds the
; accumulators m3 / m4, advances them by m6 / m1, shifts by 5, stores 16
; bytes and steps r0 to the next row.
%macro INTRA_PRED_PLANAR16 1
%if (%1 < 4)
    pshuflw     m5, m2, 0x55 * %1
    pshufd      m5, m5, 0
%else
%if (%1 < 8)
    pshufhw     m5, m2, 0x55 * (%1 - 4)
    pshufd      m5, m5, 0xAA
%else
%if (%1 < 12)
    pshuflw     m5, m7, 0x55 * (%1 - 8)
    pshufd      m5, m5, 0
%else
    pshufhw     m5, m7, 0x55 * (%1 - 12)
    pshufd      m5, m5, 0xAA
%endif
%endif
%endif
    pmullw      m0, m5, [pw_planar16_mul + mmsize]
    pmullw      m5, [pw_planar16_mul]
    paddw       m0, m4
    paddw       m5, m3
    paddw       m3, m6
    paddw       m4, m1
    psraw       m5, 5
    psraw       m0, 5
    packuswb    m5, m0
    movu        [r0], m5
    lea         r0, [r0 + r1]
%endmacro

    INTRA_PRED_PLANAR16 0
    INTRA_PRED_PLANAR16 1
    INTRA_PRED_PLANAR16 2
    INTRA_PRED_PLANAR16 3
    INTRA_PRED_PLANAR16 4
    INTRA_PRED_PLANAR16 5
    INTRA_PRED_PLANAR16 6
    INTRA_PRED_PLANAR16 7
    INTRA_PRED_PLANAR16 8
    INTRA_PRED_PLANAR16 9
    INTRA_PRED_PLANAR16 10
    INTRA_PRED_PLANAR16 11
    INTRA_PRED_PLANAR16 12
    INTRA_PRED_PLANAR16 13
    INTRA_PRED_PLANAR16 14
    INTRA_PRED_PLANAR16 15
    RET

;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;
; 16x16 variant (AVX2). One 256-bit register holds all 16 columns as words;
; m3 is the running accumulator, m4 the per-row increment
; (bottomLeft - above[x]), m0 = [pw_planar16_mul], m5 = [pw_00ff].
;---------------------------------------------------------------------------------------
INIT_YMM avx2
cglobal intra_pred_planar16, 3,3,6
    vpbroadcastw m3, [r2 + 17]          ; word = bytes at srcPix + 17 / + 18
    mova        m5, [pw_00ff]
    vpbroadcastw m4, [r2 + 49]          ; word = bytes at srcPix + 49 / + 50
    mova        m0, [pw_planar16_mul]
    pmovzxbw    m2, [r2 + 1]            ; above[0..15] as words
    pand        m3, m5                  ; v_topRight  (keep the low byte only)
    pand        m4, m5                  ; v_bottomLeft
    pmullw      m3, [multiL]            ; (x + 1) * topRight
    pmullw      m1, m2, [pw_15]         ; (blkSize - 1 - y) * above[x]
    paddw       m3, [pw_16]
    paddw       m3, m4
    paddw       m3, m1
    psubw       m4, m2
    add         r2, 33                  ; r2 now points at left[0]

; INTRA_PRED_PLANAR16_AVX2 y: emit rows y and y + 1. One word load fetches
; left[y] (low byte, isolated with pand) and left[y + 1] (high byte, isolated
; with vpsrlw 8). packuswb interleaves the two rows per 128-bit lane, vpermq
; regroups them so the low lane is row y and the high lane is row y + 1.
%macro INTRA_PRED_PLANAR16_AVX2 1
    vpbroadcastw m1, [r2 + %1]
    vpsrlw      m2, m1, 8
    pand        m1, m5

    pmullw      m1, m0
    pmullw      m2, m0
    paddw       m1, m3
    paddw       m3, m4
    psraw       m1, 5
    paddw       m2, m3
    psraw       m2, 5
    paddw       m3, m4
    packuswb    m1, m2
    vpermq      m1, m1, 11011000b
    movu        [r0], xm1
    vextracti128 [r0 + r1], m1, 1
    lea         r0, [r0 + r1 * 2]
%endmacro
    INTRA_PRED_PLANAR16_AVX2 0
    INTRA_PRED_PLANAR16_AVX2 2
; Remaining row pairs (rows 4..15) of the AVX2 intra_pred_planar16; the macro
; argument is the index of the first row of each pair.
    INTRA_PRED_PLANAR16_AVX2 4
    INTRA_PRED_PLANAR16_AVX2 6
    INTRA_PRED_PLANAR16_AVX2 8
    INTRA_PRED_PLANAR16_AVX2 10
    INTRA_PRED_PLANAR16_AVX2 12
    INTRA_PRED_PLANAR16_AVX2 14
; NOTE(review): INTRA_PRED_PLANAR16_AVX2 is a multi-line (%macro) macro; NASM
; removes those with %unmacro, while %undef targets single-line macros.
; Confirm this line has the intended effect.
%undef INTRA_PRED_PLANAR16_AVX2
    RET

;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;
; 32x32 variant (SSE4). The 32 columns are split into four groups of eight
; words: accumulators m0..m3 and per-row increments m8..m11. On x86-64 the
; increments live in registers; otherwise m8..m11 are redefined as four
; mmsize-byte stack slots reserved through the cglobal stack argument.
;---------------------------------------------------------------------------------------
INIT_XMM sse4
%if ARCH_X86_64 == 1
cglobal intra_pred_planar32, 3,4,12
%else
cglobal intra_pred_planar32, 3,4,8,0-(4*mmsize)
  %define       m8  [rsp + 0 * mmsize]
  %define       m9  [rsp + 1 * mmsize]
  %define       m10 [rsp + 2 * mmsize]
  %define       m11 [rsp + 3 * mmsize]
%endif
    movd        m3, [r2 + 33]           ; topRight   = above[32]

    pxor        m7, m7                  ; m7 stays zero for the whole routine
    pshufb      m3, m7
    punpcklbw   m3, m7                  ; v_topRight

    pmullw      m0, m3, [multiL]        ; (x + 1) * topRight
    pmullw      m1, m3, [multiH]        ; (x + 1) * topRight
    pmullw      m2, m3, [multiH2]       ; (x + 1) * topRight
    pmullw      m3, [multiH3]           ; (x + 1) * topRight

    movd        m6, [r2 + 97]           ; bottomLeft = left[32]
    pshufb      m6, m7
    punpcklbw   m6, m7                  ; v_bottomLeft

    paddw       m0, m6
    paddw       m1, m6
    paddw       m2, m6
    paddw       m3, m6
    paddw       m0, [pw_32]
    paddw       m1, [pw_32]
    paddw       m2, [pw_32]
    paddw       m3, [pw_32]

    ; For each group of eight columns: add 31 * above[x] to the accumulator
    ; and save (bottomLeft - above[x]) as that group's per-row increment.
    pmovzxbw    m4, [r2 + 1]
    pmullw      m5, m4, [pw_31]
    paddw       m0, m5
    psubw       m5, m6, m4
    mova        m8, m5

    pmovzxbw    m4, [r2 + 9]
    pmullw      m5, m4, [pw_31]
    paddw       m1, m5
    psubw       m5, m6, m4
    mova        m9, m5

    pmovzxbw    m4, [r2 + 17]
    pmullw      m5, m4, [pw_31]
    paddw       m2, m5
    psubw       m5, m6, m4
    mova        m10, m5

    pmovzxbw    m4, [r2 + 25]
    pmullw      m5, m4, [pw_31]
    paddw       m3, m5
    psubw       m5, m6, m4
    mova        m11, m5
    add         r2, 65                  ; (2 * blkSize + 1)

; INTRA_PRED_PLANAR32: emit one 32-pixel row. Broadcasts the byte at [r2]
; (left[y]) as words, multiplies it by the four 8-word weight groups
; (pw_planar32_mul, then pw_planar16_mul), adds the accumulators, advances
; them, shifts by 6 and stores 2 x 16 bytes. Advances r0 by one row and r2
; by one byte.
%macro INTRA_PRED_PLANAR32 0
    movd        m4, [r2]
    pshufb      m4, m7
    punpcklbw   m4, m7

    pmullw      m5, m4, [pw_planar32_mul]
    pmullw      m6, m4, [pw_planar32_mul + mmsize]
    paddw       m5, m0
    paddw       m6, m1
    paddw       m0, m8
    paddw       m1, m9
    psraw       m5, 6
    psraw       m6, 6
    packuswb    m5, m6
    movu        [r0], m5

    pmullw      m5, m4, [pw_planar16_mul]
    pmullw      m4, [pw_planar16_mul + mmsize]
    paddw       m5, m2
    paddw       m4, m3
    paddw       m2, m10
    paddw       m3, m11
    psraw       m5, 6
    psraw       m4, 6
; Tail of the INTRA_PRED_PLANAR32 macro body (SSE4 intra_pred_planar32):
; store columns 16..31 of the row, then step to the next row / next left byte.
    packuswb    m5, m4
    movu        [r0 + 16], m5

    lea         r0, [r0 + r1]
    inc         r2
%endmacro

    ; 4 iterations x 8 macro expansions = 32 rows
    mov         r3, 4
.loop:
    INTRA_PRED_PLANAR32
    INTRA_PRED_PLANAR32
    INTRA_PRED_PLANAR32
    INTRA_PRED_PLANAR32
    INTRA_PRED_PLANAR32
    INTRA_PRED_PLANAR32
    INTRA_PRED_PLANAR32
    INTRA_PRED_PLANAR32
    dec         r3
    jnz         .loop
    RET

;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;
; 32x32 variant (AVX2, x86-64 builds only). Columns 0..15 are accumulated in
; m0 with per-row increment m1, columns 16..31 in m3 with increment m2.
; m6 = [pw_00ff], m9 = [pw_planar32_mul], m10 = [pw_planar16_mul].
;---------------------------------------------------------------------------------------
%if ARCH_X86_64 == 1
INIT_YMM avx2
cglobal intra_pred_planar32, 3,4,11
    mova        m6, [pw_00ff]
    vpbroadcastw m3, [r2 + 33]          ; topRight   = above[32]
    vpbroadcastw m2, [r2 + 97]          ; bottomLeft = left[32]
    pand        m3, m6                  ; keep the low byte of each broadcast word
    pand        m2, m6

    pmullw      m0, m3, [multiL]        ; (x + 1) * topRight
    pmullw      m3, [multiH2]           ; (x + 1) * topRight

    paddw       m0, m2
    paddw       m3, m2
    paddw       m0, [pw_32]
    paddw       m3, [pw_32]

    pmovzxbw    m4, [r2 + 1]            ; above[0..15] as words
    pmovzxbw    m1, [r2 + 17]           ; above[16..31] as words
    pmullw      m5, m4, [pw_31]
    paddw       m0, m5
    psubw       m5, m2, m4              ; bottomLeft - above[0..15]
    psubw       m2, m1                  ; bottomLeft - above[16..31]
    pmullw      m1, [pw_31]
    paddw       m3, m1
    mova        m1, m5

    add         r2, 65                  ; (2 * blkSize + 1)
    mova        m9, [pw_planar32_mul]
    mova        m10, [pw_planar16_mul]

; INTRA_PRED_PLANAR32_AVX2: emit two 32-pixel rows. One word load fetches
; left[y] (low byte) and left[y + 1] (high byte); each is multiplied by
; m9 / m10, added to the accumulators m0 / m3 (which advance by m1 / m2 after
; every row), shifted by 6 and packed. vpermq undoes the per-lane interleave
; of packuswb. Advances r2 by two bytes and r0 by two rows.
%macro INTRA_PRED_PLANAR32_AVX2 0
    vpbroadcastw m4, [r2]
    vpsrlw      m7, m4, 8
    pand        m4, m6

    pmullw      m5, m4, m9
    pmullw      m4, m4, m10
    paddw       m5, m0
    paddw       m4, m3
    paddw       m0, m1
    paddw       m3, m2
    psraw       m5, 6
    psraw       m4, 6
    packuswb    m5, m4
    pmullw      m8, m7, m9
    pmullw      m7, m7, m10
    vpermq      m5, m5, 11011000b
    paddw       m8, m0
    paddw       m7, m3
    paddw       m0, m1
    paddw       m3, m2
    psraw       m8, 6
    psraw       m7, 6
    packuswb    m8, m7
    add         r2, 2
    vpermq      m8, m8, 11011000b

    movu        [r0], m5
    movu        [r0 + r1], m8
    lea         r0, [r0 + r1 * 2]
%endmacro
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
; Last row pair of the AVX2 intra_pred_planar32.
    INTRA_PRED_PLANAR32_AVX2
; NOTE(review): INTRA_PRED_PLANAR32_AVX2 is a multi-line (%macro) macro; NASM
; removes those with %unmacro, while %undef targets single-line macros.
%undef INTRA_PRED_PLANAR32_AVX2
    RET
%endif ;; ARCH_X86_64 == 1

;-----------------------------------------------------------------------------------------
; void intraPredAng4(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
;-----------------------------------------------------------------------------------------

; intra_pred_ang4_2 (r0 = dst, r1 = dstStride, r2 = src, r3m = dirMode)
; Pure copy: row y is the 4 bytes starting at (base + y), where base is
; src + 2 when dirMode == 34 and src + 10 otherwise.
; Note: r1 is overwritten with 3 * stride.
INIT_XMM ssse3
cglobal intra_pred_ang4_2, 3,5,3
    lea         r4, [r2 + 2]
    add         r2, 10
    cmp         r3m, byte 34
    cmove       r2, r4

    movh        m0, [r2]
    movd        [r0], m0
    palignr     m1, m0, 1
    movd        [r0 + r1], m1
    palignr     m2, m0, 2
    movd        [r0 + r1 * 2], m2
    lea         r1, [r1 * 3]
    psrldq      m0, 3
    movd        [r0 + r1], m0
    RET

; intra_pred_ang4_3 (r0 = dst, r1 = dstStride, r2 = src, r3m = dirMode)
; Handles dirMode 33 (neighbours at src + 1, no transpose) and any other
; value - per the routine name, mode 3 - (neighbours at src + 9, transposed).
; The cmp result must stay live in ZF until the jz inside .do_filter4x4;
; every instruction in between (mov, cmov, lea, jmp, SIMD ops) leaves EFLAGS
; alone.
; .do_filter4x4 is a shared tail that the sibling routines below jump into.
INIT_XMM sse4
cglobal intra_pred_ang4_3, 3,5,5
    mov         r4, 1
    cmp         r3m, byte 33
    mov         r3, 9
    cmove       r3, r4                  ; r3 = (dirMode == 33) ? 1 : 9

    movh        m0, [r2 + r3]           ; [8 7 6 5 4 3 2 1]
    palignr     m1, m0, 1               ; [x 8 7 6 5 4 3 2]
    punpcklbw   m0, m1                  ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    palignr     m1, m0, 2               ; [x x x x x x x x 6 5 5 4 4 3 3 2]
    palignr     m2, m0, 4               ; [x x x x x x x x 7 6 6 5 5 4 4 3]
    palignr     m3, m0, 6               ; [x x x x x x x x 8 7 7 6 6 5 5 4]
    punpcklqdq  m0, m1
    punpcklqdq  m2, m3

    lea         r3, [ang_table + 20 * 16]
    movh        m3, [r3 + 6 * 16]       ; [26]
    movhps      m3, [r3]                ; [20]
    movh        m4, [r3 - 6 * 16]       ; [14]
    movhps      m4, [r3 - 12 * 16]      ; [ 8]
    jmp        .do_filter4x4

    ; NOTE: share path, input is m0=[1 0], m2=[3 2], m3,m4=coef, flag_z=no_transpose
ALIGN 16
.do_filter4x4:
    mova        m1, [pw_1024]           ; pmulhrsw by 1024 == (x + 16) >> 5

    pmaddubsw   m0, m3
    pmulhrsw    m0, m1
    pmaddubsw   m2, m4
    pmulhrsw    m2, m1
    packuswb    m0, m2

    ; NOTE: mode 33 doesn't reorder. UNSAFE: this relies on no instruction
    ; since the caller's cmp having modified the EFLAGS register.
    jz         .store

    ; transpose 4x4
    pshufb      m0, [c_trans_4x4]

.store:
    ; TODO: use pextrd here after intrinsic ssse3 removed
    ; NOTE(review): the stores below already use pextrd, so this TODO looks stale.
    movd        [r0], m0
    pextrd      [r0 + r1], m0, 1
    pextrd      [r0 + r1 * 2], m0, 2
    lea         r1, [r1 * 3]
    pextrd      [r0 + r1], m0, 3
    RET

; intra_pred_ang4_4: dirMode 32 reads from src + 1 without transpose, any
; other value (mode 4 per the name) from src + 9 with transpose. Coefficient
; rows 21, 10, 31, 20 of ang_table; tail-jumps into intra_pred_ang4_3's
; .do_filter4x4 with ZF from the cmp still live.
cglobal intra_pred_ang4_4, 3,5,5
    xor         r4, r4
    inc         r4                      ; r4 = 1 (set before the cmp so ZF is not disturbed)
    cmp         r3m, byte 32
    mov         r3, 9
    cmove       r3, r4

    movh        m0, [r2 + r3]           ; [8 7 6 5 4 3 2 1]
    palignr     m1, m0, 1               ; [x 8 7 6 5 4 3 2]
    punpcklbw   m0, m1                  ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    palignr     m1, m0, 2               ; [x x x x x x x x 6 5 5 4 4 3 3 2]
    palignr     m3, m0, 4               ; [x x x x x x x x 7 6 6 5 5 4 4 3]
    punpcklqdq  m0, m1
    punpcklqdq  m2, m1, m3

    lea         r3, [ang_table + 18 * 16]
    movh        m3, [r3 + 3 * 16]       ; [21]
    movhps      m3, [r3 - 8 * 16]       ; [10]
    movh        m4, [r3 + 13 * 16]      ; [31]
    movhps      m4, [r3 + 2 * 16]       ; [20]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

; intra_pred_ang4_5: dirMode 31 -> src + 1 / no transpose, otherwise src + 9 /
; transpose. Coefficient rows 17, 2, 19, 4; shares .do_filter4x4.
cglobal intra_pred_ang4_5, 3,5,5
    xor         r4, r4
    inc         r4
    cmp         r3m, byte 31
    mov         r3, 9
    cmove       r3, r4

    movh        m0, [r2 + r3]           ; [8 7 6 5 4 3 2 1]
    palignr     m1, m0, 1               ; [x 8 7 6 5 4 3 2]
    punpcklbw   m0, m1                  ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    palignr     m1, m0, 2               ; [x x x x x x x x 6 5 5 4 4 3 3 2]
    palignr     m3, m0, 4               ; [x x x x x x x x 7 6 6 5 5 4 4 3]
    punpcklqdq  m0, m1
    punpcklqdq  m2, m1, m3

    lea         r3, [ang_table + 10 * 16]
    movh        m3, [r3 + 7 * 16]       ; [17]
    movhps      m3, [r3 - 8 * 16]       ; [ 2]
    movh        m4, [r3 + 9 * 16]       ; [19]
    movhps      m4, [r3 - 6 * 16]       ; [ 4]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

; intra_pred_ang4_6: dirMode 30 -> src + 1 / no transpose, otherwise src + 9 /
; transpose. Coefficient rows 13, 26, 7, 20; shares .do_filter4x4.
cglobal intra_pred_ang4_6, 3,5,5
    xor         r4, r4
    inc         r4
    cmp         r3m, byte 30
    mov         r3, 9
    cmove       r3, r4

    movh        m0, [r2 + r3]           ; [8 7 6 5 4 3 2 1]
    palignr     m1, m0, 1               ; [x 8 7 6 5 4 3 2]
    punpcklbw   m0, m1                  ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    palignr     m2, m0, 2               ; [x x x x x x x x 6 5 5 4 4 3 3 2]
    punpcklqdq  m0, m0
    punpcklqdq  m2, m2

    lea         r3, [ang_table + 19 * 16]
    movh        m3, [r3 - 6 * 16]       ; [13]
    movhps      m3, [r3 + 7 * 16]       ; [26]
    movh        m4, [r3 - 12 * 16]      ; [ 7]
    movhps      m4, [r3 + 1 * 16]       ; [20]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

cglobal intra_pred_ang4_7, 3,5,5
    xor         r4, r4
    inc         r4
    cmp         r3m, byte 29
    mov         r3, 9
    cmove       r3, r4

    movh        m0, [r2 + r3]           ; [8 7 6 5 4 3 2 1]
    palignr     m1, m0, 1               ; [x 8 7 6 5 4 3 2]
    punpcklbw   m0, m1                  ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    palignr     m3, m0, 2               ; [x x x x x x x x 6 5 5 4 4 3 3 2]
    punpcklqdq  m2, m0, m3
    punpcklqdq  m0, m0

    lea         r3, [ang_table + 20 * 16]
    movh        m3, [r3 - 11 * 16]      ; [ 9]
    movhps      m3, [r3 - 2 * 16]       ; [18]
    movh        m4, [r3 + 7 * 16]       ; [27]
    movhps      m4, [r3 - 16 * 16]      ; [ 4]
    jmp         mangle(private_prefix
%+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) cglobal intra_pred_ang4_8, 3,5,5 xor r4, r4 inc r4 cmp r3m, byte 28 mov r3, 9 cmove r3, r4 movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1] palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2] punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] punpcklqdq m0, m0 mova m2, m0 lea r3, [ang_table + 13 * 16] movh m3, [r3 - 8 * 16] ; [ 5] movhps m3, [r3 - 3 * 16] ; [10] movh m4, [r3 + 2 * 16] ; [15] movhps m4, [r3 + 7 * 16] ; [20] jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) cglobal intra_pred_ang4_9, 3,5,5 xor r4, r4 inc r4 cmp r3m, byte 27 mov r3, 9 cmove r3, r4 movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1] palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2] punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] punpcklqdq m0, m0 mova m2, m0 lea r3, [ang_table + 4 * 16] movh m3, [r3 - 2 * 16] ; [ 2] movhps m3, [r3 - 0 * 16] ; [ 4] movh m4, [r3 + 2 * 16] ; [ 6] movhps m4, [r3 + 4 * 16] ; [ 8] jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) cglobal intra_pred_ang4_10, 3,3,4 movd m0, [r2 + 9] ; [8 7 6 5 4 3 2 1] pshufb m0, [pb_unpackbd1] pshufd m1, m0, 1 movhlps m2, m0 pshufd m3, m0, 3 movd [r0 + r1], m1 movd [r0 + r1 * 2], m2 lea r1, [r1 * 3] movd [r0 + r1], m3 cmp r4m, byte 0 jz .quit ; filter pmovzxbw m0, m0 ; [-1 -1 -1 -1] movh m1, [r2] ; [4 3 2 1 0] pshufb m2, m1, [pb_0_8] ; [0 0 0 0] pshufb m1, [pb_unpackbw1] ; [4 3 2 1] psubw m1, m2 psraw m1, 1 paddw m0, m1 packuswb m0, m0 .quit: movd [r0], m0 RET INIT_XMM sse4 cglobal intra_pred_ang4_26, 3,4,3 movd m0, [r2 + 1] ; [8 7 6 5 4 3 2 1] ; store movd [r0], m0 movd [r0 + r1], m0 movd [r0 + r1 * 2], m0 lea r3, [r1 * 3] movd [r0 + r3], m0 ; filter cmp r4m, byte 0 jz .quit pshufb m0, [pb_0_8] ; [ 1 1 1 1] movh m1, [r2 + 8] ; [-4 -3 -2 -1 0] pinsrb m1, [r2], 0 pshufb m2, m1, [pb_0_8] ; [0 0 0 0] pshufb m1, [pb_unpackbw1] ; [-4 -3 -2 -1] psubw m1, m2 psraw m1, 1 paddw m0, m1 packuswb m0, m0 pextrb [r0], m0, 0 pextrb [r0 + r1], m0, 1 pextrb [r0 + r1 * 2], 
m0, 2 pextrb [r0 + r3], m0, 3 .quit: RET cglobal intra_pred_ang4_11, 3,5,5 xor r4, r4 cmp r3m, byte 25 mov r3, 8 cmove r3, r4 movh m0, [r2 + r3] ; [x x x 4 3 2 1 0] pinsrb m0, [r2], 0 palignr m1, m0, 1 ; [x x x x 4 3 2 1] punpcklbw m0, m1 ; [x x x x x x x x 4 3 3 2 2 1 1 0] punpcklqdq m0, m0 mova m2, m0 lea r3, [ang_table + 24 * 16] movh m3, [r3 + 6 * 16] ; [30] movhps m3, [r3 + 4 * 16] ; [28] movh m4, [r3 + 2 * 16] ; [26] movhps m4, [r3 + 0 * 16] ; [24] jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) cglobal intra_pred_ang4_12, 3,5,5 xor r4, r4 cmp r3m, byte 24 mov r3, 8 cmove r3, r4 movh m0, [r2 + r3] ; [x x x 4 3 2 1 0] pinsrb m0, [r2], 0 palignr m1, m0, 1 ; [x x x x 4 3 2 1] punpcklbw m0, m1 ; [x x x x x x x x 4 3 3 2 2 1 1 0] punpcklqdq m0, m0 mova m2, m0 lea r3, [ang_table + 20 * 16] movh m3, [r3 + 7 * 16] ; [27] movhps m3, [r3 + 2 * 16] ; [22] movh m4, [r3 - 3 * 16] ; [17] movhps m4, [r3 - 8 * 16] ; [12] jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) cglobal intra_pred_ang4_13, 4,5,5 xor r4, r4 cmp r3m, byte 23 mov r3, 8 jz .next xchg r3, r4 .next: movh m1, [r2 + r4 - 1] ; [x x 4 3 2 1 0 x] pinsrb m1, [r2], 1 palignr m0, m1, 1 ; [x x x 4 3 2 1 0] palignr m2, m1, 2 ; [x x x x 4 3 2 1] pinsrb m1, [r2 + r3 + 4], 0 punpcklbw m1, m0 ; [3 2 2 1 1 0 0 x] punpcklbw m0, m2 ; [4 3 3 2 2 1 1 0] punpcklqdq m2, m0, m1 punpcklqdq m0, m0 lea r3, [ang_table + 21 * 16] movh m3, [r3 + 2 * 16] ; [23] movhps m3, [r3 - 7 * 16] ; [14] movh m4, [r3 - 16 * 16] ; [ 5] movhps m4, [r3 + 7 * 16] ; [28] jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) cglobal intra_pred_ang4_14, 4,5,5 xor r4, r4 cmp r3m, byte 22 mov r3, 8 jz .next xchg r3, r4 .next: movh m2, [r2 + r4 - 1] ; [x x 4 3 2 1 0 x] pinsrb m2, [r2], 1 palignr m0, m2, 1 ; [x x x 4 3 2 1 0] palignr m1, m2, 2 ; [x x x x 4 3 2 1] pinsrb m2, [r2 + r3 + 2], 0 punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x] punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0] 
punpcklqdq m0, m0 punpcklqdq m2, m2 lea r3, [ang_table + 19 * 16] movh m3, [r3 + 0 * 16] ; [19] movhps m3, [r3 - 13 * 16] ; [ 6] movh m4, [r3 + 6 * 16] ; [25] movhps m4, [r3 - 7 * 16] ; [12] jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) cglobal intra_pred_ang4_15, 4,5,5 xor r4, r4 cmp r3m, byte 21 mov r3, 8 jz .next xchg r3, r4 .next: movh m2, [r2 + r4 - 1] ; [x x 4 3 2 1 0 x] pinsrb m2, [r2], 1 palignr m0, m2, 1 ; [x x x 4 3 2 1 0] palignr m1, m2, 2 ; [x x x x 4 3 2 1] pinsrb m2, [r2 + r3 + 2], 0 pslldq m3, m2, 1 ; [x 4 3 2 1 0 x y] pinsrb m3, [r2 + r3 + 4], 0 punpcklbw m4, m3, m2 ; [2 1 1 0 0 x x y] punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x] punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0] punpcklqdq m0, m2 punpcklqdq m2, m4 lea r3, [ang_table + 23 * 16] movh m3, [r3 - 8 * 16] ; [15] movhps m3, [r3 + 7 * 16] ; [30] movh m4, [r3 - 10 * 16] ; [13] movhps m4, [r3 + 5 * 16] ; [28] jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) cglobal intra_pred_ang4_16, 3,5,5 xor r4, r4 cmp r3m, byte 20 mov r3, 8 jz .next xchg r3, r4 .next: movh m2, [r2 + r4 - 1] ; [x x 4 3 2 1 0 x] pinsrb m2, [r2], 1 palignr m0, m2, 1 ; [x x x 4 3 2 1 0] palignr m1, m2, 2 ; [x x x x 4 3 2 1] pinsrb m2, [r2 + r3 + 2], 0 pslldq m3, m2, 1 ; [x 4 3 2 1 0 x y] pinsrb m3, [r2 + r3 + 3], 0 punpcklbw m4, m3, m2 ; [2 1 1 0 0 x x y] punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x] punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0] punpcklqdq m0, m2 punpcklqdq m2, m4 lea r3, [ang_table + 19 * 16] movh m3, [r3 - 8 * 16] ; [11] movhps m3, [r3 + 3 * 16] ; [22] movh m4, [r3 - 18 * 16] ; [ 1] movhps m4, [r3 - 7 * 16] ; [12] jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) cglobal intra_pred_ang4_17, 3,5,5 xor r4, r4 cmp r3m, byte 19 mov r3, 8 jz .next xchg r3, r4 .next: movh m3, [r2 + r4 - 1] ; [- - 4 3 2 1 0 x] pinsrb m3, [r2], 1 palignr m0, m3, 1 ; [- - - 4 3 2 1 0] palignr m1, m3, 2 ; [- - - - 4 3 2 1] mova m4, m0 punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0] pinsrb 
m3, [r2 + r3 + 1], 0 punpcklbw m1, m3, m4 ; [3 2 2 1 1 0 0 x] punpcklqdq m0, m1 pslldq m2, m3, 1 ; [- 4 3 2 1 0 x y] pinsrb m2, [r2 + r3 + 2], 0 pslldq m1, m2, 1 ; [4 3 2 1 0 x y z] pinsrb m1, [r2 + r3 + 4], 0 punpcklbw m1, m2 ; [1 0 0 x x y y z] punpcklbw m2, m3 ; [2 1 1 0 0 x x y] punpcklqdq m2, m1 lea r3, [ang_table + 14 * 16] movh m3, [r3 - 8 * 16] ; [ 6] movhps m3, [r3 - 2 * 16] ; [12] movh m4, [r3 + 4 * 16] ; [18] movhps m4, [r3 + 10 * 16] ; [24] jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) cglobal intra_pred_ang4_18, 3,5,1 mov r4d, [r2 + 8] mov r3b, byte [r2] mov [r2 + 8], r3b mov r3d, [r2 + 8] bswap r3d movd m0, r3d pinsrd m0, [r2 + 1], 1 ; [- 3 2 1 0 -1 -2 -3] lea r3, [r1 * 3] movd [r0 + r3], m0 psrldq m0, 1 movd [r0 + r1 * 2], m0 psrldq m0, 1 movd [r0 + r1], m0 psrldq m0, 1 movd [r0], m0 mov [r2 + 8], r4w RET ;----------------------------------------------------------------------------------------- ; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter) ;----------------------------------------------------------------------------------------- INIT_XMM ssse3 cglobal intra_pred_ang8_2, 3,5,2 lea r4, [r2 + 2] add r2, 18 cmp r3m, byte 34 cmove r2, r4 movu m0, [r2] lea r4, [r1 * 3] movh [r0], m0 palignr m1, m0, 1 movh [r0 + r1], m1 palignr m1, m0, 2 movh [r0 + r1 * 2], m1 palignr m1, m0, 3 movh [r0 + r4], m1 palignr m1, m0, 4 lea r0, [r0 + r1 * 4] movh [r0], m1 palignr m1, m0, 5 movh [r0 + r1], m1 palignr m1, m0, 6 movh [r0 + r1 * 2], m1 palignr m1, m0, 7 movh [r0 + r4], m1 RET INIT_XMM sse4 cglobal intra_pred_ang8_3, 3,5,8 lea r4, [r2 + 1] add r2, 17 cmp r3m, byte 33 cmove r2, r4 lea r3, [ang_table + 22 * 16] lea r4, [ang_table + 8 * 16] mova m3, [pw_1024] movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 
1] palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] pmaddubsw m4, m0, [r3 + 4 * 16] ; [26] pmulhrsw m4, m3 pmaddubsw m1, [r3 - 2 * 16] ; [20] pmulhrsw m1, m3 packuswb m4, m1 palignr m5, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3] pmaddubsw m5, [r3 - 8 * 16] ; [14] pmulhrsw m5, m3 palignr m6, m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4] pmaddubsw m6, [r4] ; [ 8] pmulhrsw m6, m3 packuswb m5, m6 palignr m1, m2, m0, 8 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5] pmaddubsw m6, m1, [r4 - 6 * 16] ; [ 2] pmulhrsw m6, m3 pmaddubsw m1, [r3 + 6 * 16] ; [28] pmulhrsw m1, m3 packuswb m6, m1 palignr m1, m2, m0, 10 ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6] pmaddubsw m1, [r3] ; [22] pmulhrsw m1, m3 palignr m2, m0, 12 ; [15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7] pmaddubsw m2, [r3 - 6 * 16] ; [16] pmulhrsw m2, m3 packuswb m1, m2 jmp .transpose8x8 ALIGN 16 .transpose8x8: jz .store ; transpose 8x8 punpckhbw m0, m4, m5 punpcklbw m4, m5 punpckhbw m2, m4, m0 punpcklbw m4, m0 punpckhbw m0, m6, m1 punpcklbw m6, m1 punpckhbw m1, m6, m0 punpcklbw m6, m0 punpckhdq m5, m4, m6 punpckldq m4, m6 punpckldq m6, m2, m1 punpckhdq m2, m1 mova m1, m2 .store: lea r4, [r1 * 3] movh [r0], m4 movhps [r0 + r1], m4 movh [r0 + r1 * 2], m5 movhps [r0 + r4], m5 add r0, r4 movh [r0 + r1], m6 movhps [r0 + r1 * 2], m6 movh [r0 + r4], m1 movhps [r0 + r1 * 4], m1 RET cglobal intra_pred_ang8_4, 3,5,8 lea r4, [r2 + 1] add r2, 17 cmp r3m, byte 32 cmove r2, r4 lea r3, [ang_table + 24 * 16] lea r4, [ang_table + 10 * 16] mova m3, [pw_1024] movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] mova m5, m1 pmaddubsw m4, m0, [r3 - 3 * 16] ; [21] pmulhrsw m4, m3 pmaddubsw m1, [r4] ; [10] pmulhrsw m1, m3 packuswb m4, m1 pmaddubsw m5, [r3 + 7 * 16] ; [31] 
pmulhrsw m5, m3 palignr m6, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3] pmaddubsw m6, [r3 - 4 * 16] ; [ 20] pmulhrsw m6, m3 packuswb m5, m6 palignr m1, m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4] pmaddubsw m6, m1, [r4 - 1 * 16] ; [ 9] pmulhrsw m6, m3 pmaddubsw m1, [r3 + 6 * 16] ; [30] pmulhrsw m1, m3 packuswb m6, m1 palignr m1, m2, m0, 8 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5] pmaddubsw m1, [r3 - 5 * 16] ; [19] pmulhrsw m1, m3 palignr m2, m0, 10 ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6] pmaddubsw m2, [r4 - 2 * 16] ; [8] pmulhrsw m2, m3 packuswb m1, m2 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) cglobal intra_pred_ang8_5, 3,5,8 lea r4, [r2 + 1] add r2, 17 cmp r3m, byte 31 cmove r2, r4 lea r3, [ang_table + 17 * 16] lea r4, [ang_table + 2 * 16] mova m3, [pw_1024] movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] mova m5, m1 pmaddubsw m4, m0, [r3] ; [17] pmulhrsw m4, m3 pmaddubsw m1, [r4] ; [2] pmulhrsw m1, m3 packuswb m4, m1 pmaddubsw m5, [r3 + 2 * 16] ; [19] pmulhrsw m5, m3 palignr m6, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3] mova m1, m6 pmaddubsw m1, [r4 + 2 * 16] ; [4] pmulhrsw m1, m3 packuswb m5, m1 pmaddubsw m6, [r3 + 4 * 16] ; [21] pmulhrsw m6, m3 palignr m1, m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4] mova m7, m1 pmaddubsw m7, [r4 + 4 * 16] ; [6] pmulhrsw m7, m3 packuswb m6, m7 pmaddubsw m1, [r3 + 6 * 16] ; [23] pmulhrsw m1, m3 palignr m2, m0, 8 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5] pmaddubsw m2, [r4 + 6 * 16] ; [8] pmulhrsw m2, m3 packuswb m1, m2 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) cglobal intra_pred_ang8_6, 3,5,8 lea r4, [r2 + 1] add r2, 17 cmp r3m, byte 30 cmove r2, r4 lea r3, [ang_table + 20 * 
16] lea r4, [ang_table + 8 * 16] mova m7, [pw_1024] movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] mova m1, m0 pmaddubsw m4, m0, [r3 - 7 * 16] ; [13] pmulhrsw m4, m7 pmaddubsw m1, [r3 + 6 * 16] ; [26] pmulhrsw m1, m7 packuswb m4, m1 palignr m6, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] pmaddubsw m5, m6, [r4 - 1 * 16] ; [7] pmulhrsw m5, m7 pmaddubsw m6, [r3] ; [20] pmulhrsw m6, m7 packuswb m5, m6 palignr m1, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3] pmaddubsw m6, m1, [r4 - 7 * 16] ; [1] pmulhrsw m6, m7 mova m3, m1 pmaddubsw m3, [r3 - 6 * 16] ; [14] pmulhrsw m3, m7 packuswb m6, m3 pmaddubsw m1, [r3 + 7 * 16] ; [27] pmulhrsw m1, m7 palignr m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4] pmaddubsw m2, [r4] ; [8] pmulhrsw m2, m7 packuswb m1, m2 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) cglobal intra_pred_ang8_7, 3,5,8 lea r4, [r2 + 1] add r2, 17 cmp r3m, byte 29 cmove r2, r4 lea r3, [ang_table + 24 * 16] lea r4, [ang_table + 6 * 16] mova m7, [pw_1024] movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] pmaddubsw m4, m0, [r4 + 3 * 16] ; [9] pmulhrsw m4, m7 pmaddubsw m3, m0, [r3 - 6 * 16] ; [18] pmulhrsw m3, m7 packuswb m4, m3 pmaddubsw m5, m0, [r3 + 3 * 16] ; [27] pmulhrsw m5, m7 palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] pmaddubsw m6, m1, [r4 - 2 * 16] ; [4] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m1, [r4 + 7 * 16] ; [13] pmulhrsw m6, m7 mova m3, m1 pmaddubsw m3, [r3 - 2 * 16] ; [22] pmulhrsw m3, m7 packuswb m6, m3 pmaddubsw m1, [r3 + 7 * 16] ; [31] pmulhrsw m1, m7 palignr m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 
3] pmaddubsw m2, [r4 + 2 * 16] ; [8] pmulhrsw m2, m7 packuswb m1, m2 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) cglobal intra_pred_ang8_8, 3,5,8 lea r4, [r2 + 1] add r2, 17 cmp r3m, byte 28 cmove r2, r4 lea r3, [ang_table + 23 * 16] lea r4, [ang_table + 8 * 16] mova m7, [pw_1024] movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] palignr m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] pmaddubsw m4, m0, [r4 - 3 * 16] ; [5] pmulhrsw m4, m7 pmaddubsw m3, m0, [r4 + 2 * 16] ; [10] pmulhrsw m3, m7 packuswb m4, m3 pmaddubsw m5, m0, [r3 - 8 * 16] ; [15] pmulhrsw m5, m7 pmaddubsw m6, m0, [r3 - 3 * 16] ; [20] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m0, [r3 + 2 * 16] ; [25] pmulhrsw m6, m7 pmaddubsw m0, [r3 + 7 * 16] ; [30] pmulhrsw m0, m7 packuswb m6, m0 pmaddubsw m1, m2, [r4 - 5 * 16] ; [3] pmulhrsw m1, m7 pmaddubsw m2, [r4] ; [8] pmulhrsw m2, m7 packuswb m1, m2 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) cglobal intra_pred_ang8_9, 3,5,8 lea r4, [r2 + 1] add r2, 17 cmp r3m, byte 27 cmove r2, r4 lea r3, [ang_table + 10 * 16] mova m7, [pw_1024] movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] pmaddubsw m4, m0, [r3 - 8 * 16] ; [2] pmulhrsw m4, m7 pmaddubsw m3, m0, [r3 - 6 * 16] ; [4] pmulhrsw m3, m7 packuswb m4, m3 pmaddubsw m5, m0, [r3 - 4 * 16] ; [6] pmulhrsw m5, m7 pmaddubsw m6, m0, [r3 - 2 * 16] ; [8] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m0, [r3] ; [10] pmulhrsw m6, m7 pmaddubsw m2, m0, [r3 + 2 * 16] ; [12] pmulhrsw m2, m7 packuswb m6, m2 pmaddubsw m1, m0, [r3 + 4 * 16] ; [14] pmulhrsw m1, m7 pmaddubsw m0, [r3 + 6 * 16] ; [16] pmulhrsw m0, m7 packuswb m1, m0 jmp 
mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) cglobal intra_pred_ang8_10, 3,6,5 movh m0, [r2 + 17] mova m4, [pb_unpackbq] palignr m1, m0, 2 pshufb m1, m4 palignr m2, m0, 4 pshufb m2, m4 palignr m3, m0, 6 pshufb m3, m4 pshufb m0, m4 lea r5, [r1 * 3] movhps [r0 + r1], m0 movh [r0 + r1 * 2], m1 movhps [r0 + r5], m1 lea r3, [r0 + r1 * 4] movh [r3], m2 movhps [r3 + r1], m2 movh [r3 + r1 * 2], m3 movhps [r3 + r5], m3 ; filter cmp r4m, byte 0 jz .quit pmovzxbw m0, m0 movu m1, [r2] palignr m2, m1, 1 pshufb m1, m4 pmovzxbw m1, m1 pmovzxbw m2, m2 psubw m2, m1 psraw m2, 1 paddw m0, m2 packuswb m0, m0 .quit: movh [r0], m0 RET cglobal intra_pred_ang8_26, 3,6,3 movu m2, [r2] palignr m0, m2, 1 lea r5, [r1 * 3] movh [r0], m0 movh [r0 + r1], m0 movh [r0 + r1 * 2], m0 movh [r0 + r5], m0 lea r3, [r0 + r1 * 4] movh [r3], m0 movh [r3 + r1], m0 movh [r3 + r1 * 2], m0 movh [r3 + r5], m0 ; filter cmp r4m, byte 0 jz .quit pshufb m2, [pb_unpackbq] movhlps m1, m2 pmovzxbw m2, m2 movu m0, [r2 + 17] pmovzxbw m1, m1 pmovzxbw m0, m0 psubw m0, m2 psraw m0, 1 paddw m1, m0 packuswb m1, m1 pextrb [r0], m1, 0 pextrb [r0 + r1], m1, 1 pextrb [r0 + r1 * 2], m1, 2 pextrb [r0 + r5], m1, 3 pextrb [r3], m1, 4 pextrb [r3 + r1], m1, 5 pextrb [r3 + r1 * 2], m1, 6 pextrb [r3 + r5], m1, 7 .quit: RET cglobal intra_pred_ang8_11, 3,5,8 xor r4, r4 cmp r3m, byte 25 mov r3, 16 cmove r3, r4 movu m0, [r2 + r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] pinsrb m0, [r2], 0 palignr m1, m0, 1 ; [x 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] punpcklbw m0, m1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] lea r3, [ang_table + 23 * 16] mova m7, [pw_1024] pmaddubsw m4, m0, [r3 + 7 * 16] ; [30] pmulhrsw m4, m7 pmaddubsw m3, m0, [r3 + 5 * 16] ; [28] pmulhrsw m3, m7 packuswb m4, m3 pmaddubsw m5, m0, [r3 + 3 * 16] ; [26] pmulhrsw m5, m7 pmaddubsw m6, m0, [r3 + 1 * 16] ; [24] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m0, [r3 - 1 * 16] ; [22] pmulhrsw m6, m7 pmaddubsw m2, m0, [r3 - 3 * 16] ; [20] pmulhrsw m2, m7 
packuswb m6, m2 pmaddubsw m1, m0, [r3 - 5 * 16] ; [18] pmulhrsw m1, m7 pmaddubsw m0, [r3 - 7 * 16] ; [16] pmulhrsw m0, m7 packuswb m1, m0 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) cglobal intra_pred_ang8_12, 3,5,8 xor r4, r4 cmp r3m, byte 24 mov r3, 16 jz .next xchg r3, r4 .next: movu m1, [r2 + r4] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] pinsrb m1, [r2], 0 pslldq m0, m1, 1 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a] pinsrb m0, [r2 + r3 + 6], 0 lea r4, [ang_table + 22 * 16] mova m7, [pw_1024] punpckhbw m2, m0, m1 ; [15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7] punpcklbw m0, m1 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a] palignr m2, m0, 2 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] pmaddubsw m4, m2, [r4 + 5 * 16] ; [27] pmulhrsw m4, m7 pmaddubsw m3, m2, [r4] ; [22] pmulhrsw m3, m7 packuswb m4, m3 pmaddubsw m1, m0, [r4 + 7 * 16] ; [29] pmulhrsw m1, m7 pmaddubsw m0, [r4 + 2 * 16] ; [24] pmulhrsw m0, m7 packuswb m1, m0 pmaddubsw m5, m2, [r4 - 5 * 16] ; [17] pmulhrsw m5, m7 lea r4, [ang_table + 7 * 16] pmaddubsw m6, m2, [r4 + 5 * 16] ; [12] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m2, [r4] ; [7] pmulhrsw m6, m7 pmaddubsw m2, [r4 - 5 * 16] ; [2] pmulhrsw m2, m7 packuswb m6, m2 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) cglobal intra_pred_ang8_13, 4,5,8 xor r4, r4 cmp r3m, byte 23 mov r3, 16 jz .next xchg r3, r4 .next: movu m1, [r2 + r4] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] pinsrb m1, [r2], 0 pslldq m1, 1 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a] pinsrb m1, [r2 + r3 + 4], 0 pslldq m0, m1, 1 ; [13 12 11 10 9 8 7 6 5 4 3 2 1 0 a b] pinsrb m0, [r2 + r3 + 7], 0 punpckhbw m5, m0, m1 ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6] punpcklbw m0, m1 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b] palignr m1, m5, m0, 2 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a] palignr m5, m0, 4 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] lea r4, [ang_table + 24 * 16] mova m7, [pw_1024] pmaddubsw m4, m5, [r4 - 1 * 16] ; [23] pmulhrsw m4, m7 pmaddubsw m6, m1, [r4 
+ 4 * 16] ; [28] pmulhrsw m6, m7 pmaddubsw m0, [r4] ; [24] pmulhrsw m0, m7 lea r4, [ang_table + 13 * 16] pmaddubsw m3, m5, [r4 + 1 * 16] ; [14] pmulhrsw m3, m7 packuswb m4, m3 pmaddubsw m5, [r4 - 8 * 16] ; [5] pmulhrsw m5, m7 packuswb m5, m6 pmaddubsw m6, m1, [r4 + 6 * 16] ; [19] pmulhrsw m6, m7 pmaddubsw m2, m1, [r4 - 3 * 16] ; [10] pmulhrsw m2, m7 packuswb m6, m2 pmaddubsw m1, [r4 - 12 * 16] ; [1] pmulhrsw m1, m7 packuswb m1, m0 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) cglobal intra_pred_ang8_14, 4,5,8 xor r4, r4 cmp r3m, byte 22 mov r3, 16 jz .next xchg r3, r4 .next: movu m1, [r2 + r4 - 2] ; [13 12 11 10 9 8 7 6 5 4 3 2 1 0 a b] pinsrb m1, [r2], 2 pinsrb m1, [r2 + r3 + 2], 1 pinsrb m1, [r2 + r3 + 5], 0 pslldq m0, m1, 1 ; [12 11 10 9 8 7 6 5 4 3 2 1 0 a b c] pinsrb m0, [r2 + r3 + 7], 0 punpckhbw m2, m0, m1 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5] punpcklbw m0, m1 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c] palignr m1, m2, m0, 2 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b] palignr m6, m2, m0, 4 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a] palignr m2, m0, 6 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] lea r4, [ang_table + 24 * 16] mova m3, [pw_1024] pmaddubsw m4, m2, [r4 - 5 * 16] ; [19] pmulhrsw m4, m3 pmaddubsw m0, [r4] ; [24] pmulhrsw m0, m3 pmaddubsw m5, m6, [r4 + 1 * 16] ; [25] pmulhrsw m5, m3 lea r4, [ang_table + 12 * 16] pmaddubsw m6, [r4] ; [12] pmulhrsw m6, m3 packuswb m5, m6 pmaddubsw m6, m1, [r4 + 19 * 16] ; [31] pmulhrsw m6, m3 pmaddubsw m2, [r4 - 6 * 16] ; [6] pmulhrsw m2, m3 packuswb m4, m2 pmaddubsw m2, m1, [r4 + 6 * 16] ; [18] pmulhrsw m2, m3 packuswb m6, m2 pmaddubsw m1, [r4 - 7 * 16] ; [5] pmulhrsw m1, m3 packuswb m1, m0 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) cglobal intra_pred_ang8_15, 4,5,8 xor r4, r4 cmp r3m, byte 21 mov r3, 16 jz .next xchg r3, r4 .next: movu m1, [r2 + r4] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] pinsrb m1, [r2], 0 movu m2, [r2 + r3] pshufb m2, [c_mode16_15] palignr m1, m2, 13 ; 
[12 11 10 9 8 7 6 5 4 3 2 1 0 a b c] pslldq m0, m1, 1 ; [11 10 9 8 7 6 5 4 3 2 1 0 a b c d] pinsrb m0, [r2 + r3 + 8], 0 punpckhbw m4, m0, m1 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4] punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0 0 a a b b c c d] palignr m1, m4, m0, 2 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c] palignr m6, m4, m0, 4 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b] palignr m5, m4, m0, 6 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a] palignr m4, m0, 8 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] lea r4, [ang_table + 23 * 16] mova m3, [pw_1024] pmaddubsw m4, [r4 - 8 * 16] ; [15] pmulhrsw m4, m3 pmaddubsw m2, m5, [r4 + 7 * 16] ; [30] pmulhrsw m2, m3 packuswb m4, m2 pmaddubsw m5, [r4 - 10 * 16] ; [13] pmulhrsw m5, m3 pmaddubsw m2, m6, [r4 + 5 * 16] ; [28] pmulhrsw m2, m3 packuswb m5, m2 pmaddubsw m2, m1, [r4 + 3 * 16] ; [26] pmulhrsw m2, m3 pmaddubsw m0, [r4 + 1 * 16] ; [24] pmulhrsw m0, m3 lea r4, [ang_table + 11 * 16] pmaddubsw m6, [r4] ; [11] pmulhrsw m6, m3 packuswb m6, m2 pmaddubsw m1, [r4 - 2 * 16] ; [9] pmulhrsw m1, m3 packuswb m1, m0 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) cglobal intra_pred_ang8_16, 4,5,8 xor r4, r4 cmp r3m, byte 20 mov r3, 16 jz .next xchg r3, r4 .next: movu m1, [r2 + r4] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] pinsrb m1, [r2], 0 movu m2, [r2 + r3] pshufb m2, [c_mode16_16] palignr m1, m2, 12 ; [11 10 9 8 7 6 5 4 3 2 1 0 a b c d] pslldq m0, m1, 1 ; [10 9 8 7 6 5 4 3 2 1 0 a b c d e] pinsrb m0, [r2 + r3 + 8], 0 punpckhbw m4, m0, m1 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3] punpcklbw m0, m1 ; [3 2 2 1 1 0 0 a a b b c c d d e] palignr m1, m4, m0, 2 ; [4 3 3 2 2 1 1 0 0 a a b b c c d] palignr m6, m4, m0, 4 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c] palignr m2, m4, m0, 6 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b] palignr m5, m4, m0, 8 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a] palignr m4, m0, 10 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] lea r4, [ang_table + 22 * 16] mova m7, [pw_1024] pmaddubsw m3, m5, [r4] ; [22] pmulhrsw m3, m7 pmaddubsw m0, [r4 + 2 * 16] ; [24] pmulhrsw 
m0, m7 lea r4, [ang_table + 9 * 16] pmaddubsw m4, [r4 + 2 * 16] ; [11] pmulhrsw m4, m7 packuswb m4, m3 pmaddubsw m2, [r4 + 3 * 16] ; [12] pmulhrsw m2, m7 pmaddubsw m5, [r4 - 8 * 16] ; [1] pmulhrsw m5, m7 packuswb m5, m2 mova m2, m6 pmaddubsw m6, [r4 + 14 * 16] ; [23] pmulhrsw m6, m7 pmaddubsw m2, [r4 - 7 * 16] ; [2] pmulhrsw m2, m7 packuswb m6, m2 pmaddubsw m1, [r4 + 4 * 16] ; [13] pmulhrsw m1, m7 packuswb m1, m0 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) cglobal intra_pred_ang8_17, 4,5,8 xor r4, r4 cmp r3m, byte 19 mov r3, 16 jz .next xchg r3, r4 .next: movu m2, [r2 + r4] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] pinsrb m2, [r2], 0 movu m1, [r2 + r3] pshufb m1, [c_mode16_17] palignr m2, m1, 11 ; [10 9 8 7 6 5 4 3 2 1 0 a b c d e] pslldq m0, m2, 1 ; [9 8 7 6 5 4 3 2 1 0 a b c d e f] pinsrb m0, [r2 + r3 + 7], 0 punpckhbw m1, m0, m2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] punpcklbw m0, m2 ; [2 1 1 0 0 a a b b c c d d e e f] palignr m5, m1, m0, 8 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b] palignr m2, m1, m0, 10 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a] palignr m4, m1, m0, 12 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] lea r4, [ang_table + 17 * 16] mova m3, [pw_1024] pmaddubsw m2, [r4 - 5 * 16] ; [12] pmulhrsw m2, m3 pmaddubsw m4, [r4 - 11 * 16] ; [6] pmulhrsw m4, m3 packuswb m4, m2 pmaddubsw m5, [r4 + 1 * 16] ; [18] pmulhrsw m5, m3 palignr m2, m1, m0, 6 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c] pmaddubsw m2, [r4 + 7 * 16] ; [24] pmulhrsw m2, m3 packuswb m5, m2 palignr m6, m1, m0, 4 ; [4 3 3 2 2 1 1 0 0 a a b b c c d] mova m2, m6 pmaddubsw m6, [r4 + 13 * 16] ; [30] pmulhrsw m6, m3 pmaddubsw m2, [r4 - 13 * 16] ; [4] pmulhrsw m2, m3 packuswb m6, m2 palignr m1, m0, 2 ; [3 2 2 1 1 0 0 a a b b c c d d e] pmaddubsw m1, [r4 - 7 * 16] ; [10] pmulhrsw m1, m3 pmaddubsw m0, [r4 - 1 * 16] ; [16] pmulhrsw m0, m3 packuswb m1, m0 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) cglobal intra_pred_ang8_18, 4,4,1 movu m0, [r2 + 16] pinsrb m0, [r2], 
0 pshufb m0, [pb_swap8] movhps m0, [r2 + 1] lea r2, [r0 + r1 * 4] lea r3, [r1 * 3] movh [r2 + r3], m0 psrldq m0, 1 movh [r2 + r1 * 2], m0 psrldq m0, 1 movh [r2 + r1], m0 psrldq m0, 1 movh [r2], m0 psrldq m0, 1 movh [r0 + r3], m0 psrldq m0, 1 movh [r0 + r1 * 2], m0 psrldq m0, 1 movh [r0 + r1], m0 psrldq m0, 1 movh [r0], m0 RET

;-----------------------------------------------------------------------------
; TRANSPOSE_STORE_8x8 col, transpose, rA, rB, rC, rD
;
; Stores an 8x8 block of bytes held in four xmm registers (two 8-byte rows
; per register: low half = first row, high half = second row).
;
;   %1     index of the 8-byte column group; the transpose path stores at
;          byte offset %1 * 8 from each row address (unused by the direct path)
;   %2     1 = transpose the block, then store; anything else = store as-is
;   %3-%6  the four xmm registers holding rows 0-1, 2-3, 4-5 and 6-7
;
; Register use:
;   r1 is the row stride; r5 is used as a row offset (callers in this file
;   load it with 3 * stride).
;   Transpose path: rows 0-3 go through r0 and rows 4-7 through r6 (callers
;   in this file set r6 = r0 + 4 * stride); r0/r6 are not modified; m0 and
;   %3-%6 are clobbered.
;   Direct path: r0 is advanced by 8 rows (two steps of 4 * stride).
;-----------------------------------------------------------------------------
%macro TRANSPOSE_STORE_8x8 6
  %if %2 == 1
    ; transpose 8x8 and then store, used by angle BLOCK_16x16 and BLOCK_32x32
    punpckhbw   m0, %3, %4
    punpcklbw   %3, %4
    punpckhbw   %4, %3, m0
    punpcklbw   %3, m0

    ; use the macro argument here, not a hard-coded m1, so that the high and
    ; low interleaves always read the same source register
    punpckhbw   m0, %5, %6
    punpcklbw   %5, %6
    punpckhbw   %6, %5, m0
    punpcklbw   %5, m0

    punpckhdq   m0, %3, %5
    punpckldq   %3, %5
    punpckldq   %5, %4, %6
    punpckhdq   %4, %6

    movh        [r0 + %1 * 8], %3
    movhps      [r0 + r1 + %1 * 8], %3
    movh        [r0 + r1*2 + %1 * 8], m0
    movhps      [r0 + r5 + %1 * 8], m0
    movh        [r6 + %1 * 8], %5
    movhps      [r6 + r1 + %1 * 8], %5
    movh        [r6 + r1*2 + %1 * 8], %4
    movhps      [r6 + r5 + %1 * 8], %4
  %else
    ; store 8x8, used by angle BLOCK_16x16 and BLOCK_32x32
    movh        [r0], %3
    movhps      [r0 + r1], %3
    movh        [r0 + r1 * 2], %4
    movhps      [r0 + r5], %4
    lea         r0, [r0 + r1 * 4]
    movh        [r0], %5
    movhps      [r0 + r1], %5
    movh        [r0 + r1 * 2], %6
    movhps      [r0 + r5], %6
    lea         r0, [r0 + r1 * 4]
  %endif
%endmacro

;------------------------------------------------------------------------------------------ ; void intraPredAng16(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter) ;------------------------------------------------------------------------------------------ INIT_XMM ssse3 cglobal intra_pred_ang16_2, 3,5,3 lea r4, [r2 + 2] add r2, 34 cmp r3m, byte 34 cmove r2, r4 movu m0, [r2] movu m1, [r2 + 16] movu [r0], m0 palignr m2, m1, m0, 1 movu [r0 + r1], m2 lea r0, [r0 + r1 * 2] palignr m2, m1, m0, 2 movu [r0], m2 palignr m2, m1, m0, 3 movu [r0 + r1], m2 lea r0, [r0 + r1 * 2] palignr m2, m1, m0, 4 movu [r0], m2 palignr m2, m1, m0, 5 movu [r0 + r1], m2 lea r0, [r0 + r1 * 2] palignr m2, m1, m0, 6 movu [r0], m2 palignr m2, m1, m0, 7 movu [r0 + r1], m2 lea r0, [r0 + r1 * 2] palignr m2, m1, m0, 8 
movu [r0], m2 palignr m2, m1, m0, 9 movu [r0 + r1], m2 lea r0, [r0 + r1 * 2] palignr m2, m1, m0, 10 movu [r0], m2 palignr m2, m1, m0, 11 movu [r0 + r1], m2 lea r0, [r0 + r1 * 2] palignr m2, m1, m0, 12 movu [r0], m2 palignr m2, m1, m0, 13 movu [r0 + r1], m2 lea r0, [r0 + r1 * 2] palignr m2, m1, m0, 14 movu [r0], m2 palignr m2, m1, m0, 15 movu [r0 + r1], m2 RET INIT_XMM sse4 cglobal intra_pred_ang16_3, 3,7,8 add r2, 32 lea r3, [ang_table + 16 * 16] mov r4d, 2 lea r5, [r1 * 3] ; r5 -> 3 * stride lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride mova m7, [pw_1024] .loop: movu m0, [r2 + 1] palignr m1, m0, 1 punpckhbw m2, m0, m1 punpcklbw m0, m1 palignr m1, m2, m0, 2 pmaddubsw m4, m0, [r3 + 10 * 16] ; [26] pmulhrsw m4, m7 pmaddubsw m1, [r3 + 4 * 16] ; [20] pmulhrsw m1, m7 packuswb m4, m1 palignr m5, m2, m0, 4 pmaddubsw m5, [r3 - 2 * 16] ; [14] pmulhrsw m5, m7 palignr m6, m2, m0, 6 pmaddubsw m6, [r3 - 8 * 16] ; [ 8] pmulhrsw m6, m7 packuswb m5, m6 palignr m1, m2, m0, 8 pmaddubsw m6, m1, [r3 - 14 * 16] ; [ 2] pmulhrsw m6, m7 pmaddubsw m1, [r3 + 12 * 16] ; [28] pmulhrsw m1, m7 packuswb m6, m1 palignr m1, m2, m0, 10 pmaddubsw m1, [r3 + 6 * 16] ; [22] pmulhrsw m1, m7 palignr m2, m0, 12 pmaddubsw m2, [r3] ; [16] pmulhrsw m2, m7 packuswb m1, m2 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 movu m0, [r2 + 8] palignr m1, m0, 1 punpckhbw m2, m0, m1 punpcklbw m0, m1 palignr m5, m2, m0, 2 pmaddubsw m4, m0, [r3 - 6 * 16] ; [10] pmulhrsw m4, m7 pmaddubsw m1, m5, [r3 - 12 * 16] ; [04] pmulhrsw m1, m7 packuswb m4, m1 pmaddubsw m5, [r3 + 14 * 16] ; [30] pmulhrsw m5, m7 palignr m6, m2, m0, 4 pmaddubsw m6, [r3 + 8 * 16] ; [24] pmulhrsw m6, m7 packuswb m5, m6 palignr m1, m2, m0, 6 pmaddubsw m6, m1, [r3 + 2 * 16] ; [18] pmulhrsw m6, m7 palignr m1, m2, m0, 8 pmaddubsw m1, [r3 - 4 * 16] ; [12] pmulhrsw m1, m7 packuswb m6, m1 palignr m1, m2, m0, 10 pmaddubsw m1, [r3 - 10 * 16] ; [06] pmulhrsw m1, m7 packuswb m1, m1 movhps m1, [r2 + 14] ; [00] TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 lea r0, [r6 + r1 * 4] lea 
r6, [r6 + r1 * 8]
    ; NOTE(review): the line above is the operand tail of an instruction whose
    ; mnemonic lies before this chunk; the lines through RET close a function
    ; that begins outside the visible source.
    add         r2, 8
    dec         r4
    jnz         .loop
    RET

;-----------------------------------------------------------------------------
; intra_pred_ang16_33 (SSE4)
;   r0 = destination, r1 = stride, r5 = 3 * stride, r6 = saved r0
;   r2 = reference bytes
;   r3 = ang_table + 16 * 16, so [r3 + k * 16] is the 16-byte entry 16 + k;
;        the bracketed number after each pmaddubsw is that entry index
;   m7 = pw_1024 (defined outside this chunk; presumably words of 1024, which
;        would make pmulhrsw a rounded >> 5 -- TODO confirm)
; .loop runs twice (r4d = 2). Each pass builds eight packed results for
; TRANSPOSE_STORE_8x8 (macro defined outside this chunk), then eight more that
; are stored by hand, 8 bytes per row; afterwards r0 = r6 + 8 and r2 += 8.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_33, 3,7,8
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 2
    lea         r5, [r1 * 3]
    mov         r6, r0
    mova        m7, [pw_1024]

.loop:
    ; interleave each reference byte with its right neighbour so one
    ; pmaddubsw yields a two-tap weighted sum per output byte
    movu        m0, [r2 + 1]
    palignr     m1, m0, 1
    punpckhbw   m2, m0, m1
    punpcklbw   m0, m1
    palignr     m1, m2, m0, 2
    pmaddubsw   m4, m0, [r3 + 10 * 16]    ; [26]
    pmulhrsw    m4, m7
    pmaddubsw   m1, [r3 + 4 * 16]    ; [20]
    pmulhrsw    m1, m7
    packuswb    m4, m1
    palignr     m5, m2, m0, 4
    pmaddubsw   m5, [r3 - 2 * 16]    ; [14]
    pmulhrsw    m5, m7
    palignr     m6, m2, m0, 6
    pmaddubsw   m6, [r3 - 8 * 16]    ; [ 8]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    palignr     m1, m2, m0, 8
    pmaddubsw   m6, m1, [r3 - 14 * 16]    ; [ 2]
    pmulhrsw    m6, m7
    pmaddubsw   m1, [r3 + 12 * 16]    ; [28]
    pmulhrsw    m1, m7
    packuswb    m6, m1
    palignr     m1, m2, m0, 10
    pmaddubsw   m1, [r3 + 6 * 16]    ; [22]
    pmulhrsw    m1, m7
    palignr     m2, m0, 12
    pmaddubsw   m2, [r3]    ; [16]
    pmulhrsw    m2, m7
    packuswb    m1, m2
    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    ; second group of eight results: reload the reference 7 bytes further on
    movu        m0, [r2 + 8]
    palignr     m1, m0, 1
    punpckhbw   m2, m0, m1
    punpcklbw   m0, m1
    palignr     m5, m2, m0, 2
    pmaddubsw   m4, m0, [r3 - 6 * 16]    ; [10]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m5, [r3 - 12 * 16]    ; [04]
    pmulhrsw    m1, m7
    packuswb    m4, m1
    pmaddubsw   m5, [r3 + 14 * 16]    ; [30]
    pmulhrsw    m5, m7
    palignr     m6, m2, m0, 4
    pmaddubsw   m6, [r3 + 8 * 16]    ; [24]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    palignr     m1, m2, m0, 6
    pmaddubsw   m6, m1, [r3 + 2 * 16]    ; [18]
    pmulhrsw    m6, m7
    palignr     m1, m2, m0, 8
    pmaddubsw   m1, [r3 - 4 * 16]    ; [12]
    pmulhrsw    m1, m7
    packuswb    m6, m1
    palignr     m1, m2, m0, 10
    pmaddubsw   m1, [r3 - 10 * 16]    ; [06]
    pmulhrsw    m1, m7
    packuswb    m1, m1
    ; last result needs no weighting: the reference bytes are copied as-is
    movh        m2, [r2 + 14]    ; [00]
    movh        [r0         ], m4
    movhps      [r0 + r1    ], m4
    movh        [r0 + r1 * 2], m5
    movhps      [r0 + r5    ], m5
    lea         r0, [r0 + r1 * 4]
    movh        [r0         ], m6
    movhps      [r0 + r1    ], m6
    movh        [r0 + r1 * 2], m1
    movh        [r0 + r5    ], m2
    lea         r0, [r6 + 8]    ; next 8 bytes of each row
    add         r2, 8
    dec         r4
    jnz         .loop
    RET

;-----------------------------------------------------------------------------
; intra_pred_ang16_4 (SSE4)
;   r0 = destination, r1 = stride, r5 = 3 * stride, r6 = r0 + 4 * stride
;   r2 = reference bytes, advanced by 32 on entry
;   r3 / m7 as in the function above ([r3 + k * 16] = ang_table entry 16 + k)
; Two passes (r4d = 2). Each pass hands two groups of eight packed results to
; TRANSPOSE_STORE_8x8 (flag 1; presumably its transposing path -- confirm
; against the macro), then moves r0 and r6 down 8 rows and r2 forward 8 bytes.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_4, 3,7,8
    add         r2, 32
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 2
    lea         r5, [r1 * 3]    ; r5 -> 3 * stride
    lea         r6, [r0 + r1 * 4]    ; r6 -> 4 * stride
    mova        m7, [pw_1024]

.loop:
    movu        m0, [r2 + 1]
    palignr     m1, m0, 1
    punpckhbw   m2, m0, m1
    punpcklbw   m0, m1
    palignr     m1, m2, m0, 2
    mova        m5, m1
    pmaddubsw   m4, m0, [r3 + 5 * 16]    ; [21]
    pmulhrsw    m4, m7
    pmaddubsw   m1, [r3 - 6 * 16]    ; [10]
    pmulhrsw    m1, m7
    packuswb    m4, m1
    pmaddubsw   m5, [r3 + 15 * 16]    ; [31]
    pmulhrsw    m5, m7
    palignr     m6, m2, m0, 4
    pmaddubsw   m6, [r3 + 4 * 16]    ; [ 20]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    palignr     m1, m2, m0, 6
    pmaddubsw   m6, m1, [r3 - 7 * 16]    ; [ 9]
    pmulhrsw    m6, m7
    pmaddubsw   m1, [r3 + 14 * 16]    ; [30]
    pmulhrsw    m1, m7
    packuswb    m6, m1
    palignr     m1, m2, m0, 8
    pmaddubsw   m1, [r3 + 3 * 16]    ; [19]
    pmulhrsw    m1, m7
    palignr     m2, m0, 10
    pmaddubsw   m3, m2, [r3 - 8 * 16]    ; [8]
    pmulhrsw    m3, m7
    packuswb    m1, m3
    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    ; m2 is still needed after the macro call, so it must survive it
    pmaddubsw   m4, m2, [r3 + 13 * 16]    ; [29]
    pmulhrsw    m4, m7
    movu        m0, [r2 + 6]
    palignr     m1, m0, 1
    punpckhbw   m2, m0, m1
    punpcklbw   m0, m1
    palignr     m1, m2, m0, 2
    pmaddubsw   m1, [r3 + 2 * 16]    ; [18]
    pmulhrsw    m1, m7
    packuswb    m4, m1
    palignr     m5, m2, m0, 4
    mova        m6, m5
    pmaddubsw   m5, [r3 - 9 * 16]    ; [07]
    pmulhrsw    m5, m7
    pmaddubsw   m6, [r3 + 12 * 16]    ; [28]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    palignr     m6, m2, m0, 6
    pmaddubsw   m6, [r3 + 16]    ; [17]
    pmulhrsw    m6, m7
    palignr     m1, m2, m0, 8
    palignr     m2, m0, 10
    pmaddubsw   m3, m1, [r3 - 10 * 16]    ; [06]
    pmulhrsw    m3, m7
    packuswb    m6, m3
    pmaddubsw   m1, [r3 + 11 * 16]    ; [27]
    pmulhrsw    m1, m7
    pmaddubsw   m2, [r3]    ; [16]
    pmulhrsw    m2, m7
    packuswb    m1, m2
    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
    lea         r0, [r6 + r1 * 4]    ; r0 moves down 8 rows
    lea         r6, [r6 + r1 * 8]
    add         r2, 8
    dec         r4
    jnz         .loop
    RET

;-----------------------------------------------------------------------------
; intra_pred_ang16_32 (SSE4)
;   Register roles as for intra_pred_ang16_33 above (r6 = saved r0, no
;   adjustment of r2 on entry). The body continues past this point.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_32, 3,7,8
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 2
    lea         r5, [r1 * 3]    ; r5 -> 3 * stride
    mov         r6, r0
    mova        m7, [pw_1024]

.loop:
    movu        m0, [r2 + 1]
    palignr     m1, m0, 1
    punpckhbw   m2, m0, m1
    punpcklbw   m0, m1
    palignr     m1, m2, m0, 2
    mova        m5, m1
    pmaddubsw   m4, m0, [r3 + 5 * 16]    ; [21]
    pmulhrsw    m4, m7
    pmaddubsw   m1, [r3 - 6 * 16]    ; [10]
    pmulhrsw    m1, m7
    packuswb    m4, m1
    pmaddubsw   m5, [r3 + 15 * 16]    ; [31]
    pmulhrsw    m5, m7
    palignr     m6, m2, m0, 4
    pmaddubsw   m6, [r3 + 4 * 16]    ; [ 20]
; Continuation of the intra_pred_ang16_32 .loop body.
;   r3 = ang_table + 16 * 16, so [r3 + k * 16] is the 16-byte entry 16 + k
;   (the bracketed number in each comment); m7 = pw_1024; r6 = saved r0.
    pmulhrsw    m6, m7
    packuswb    m5, m6
    palignr     m1, m2, m0, 6
    pmaddubsw   m6, m1, [r3 - 7 * 16]    ; [ 9]
    pmulhrsw    m6, m7
    pmaddubsw   m1, [r3 + 14 * 16]    ; [30]
    pmulhrsw    m1, m7
    packuswb    m6, m1
    palignr     m1, m2, m0, 8
    pmaddubsw   m1, [r3 + 3 * 16]    ; [19]
    pmulhrsw    m1, m7
    palignr     m2, m0, 10
    pmaddubsw   m3, m2, [r3 - 8 * 16]    ; [8]
    pmulhrsw    m3, m7
    packuswb    m1, m3
    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    ; m2 is reused right after the macro call, so it must survive it
    pmaddubsw   m4, m2, [r3 + 13 * 16]    ; [29]
    pmulhrsw    m4, m7
    movu        m0, [r2 + 6]
    palignr     m1, m0, 1
    punpckhbw   m2, m0, m1
    punpcklbw   m0, m1
    palignr     m1, m2, m0, 2
    pmaddubsw   m1, [r3 + 2 * 16]    ; [18]
    pmulhrsw    m1, m7
    packuswb    m4, m1
    palignr     m5, m2, m0, 4
    mova        m6, m5
    pmaddubsw   m5, [r3 - 9 * 16]    ; [07]
    pmulhrsw    m5, m7
    pmaddubsw   m6, [r3 + 12 * 16]    ; [28]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    palignr     m6, m2, m0, 6
    pmaddubsw   m6, [r3 + 16]    ; [17]
    pmulhrsw    m6, m7
    palignr     m1, m2, m0, 8
    palignr     m2, m0, 10
    pmaddubsw   m3, m1, [r3 - 10 * 16]    ; [06]
    pmulhrsw    m3, m7
    packuswb    m6, m3
    pmaddubsw   m1, [r3 + 11 * 16]    ; [27]
    pmulhrsw    m1, m7
    pmaddubsw   m2, [r3]    ; [16]
    pmulhrsw    m2, m7
    packuswb    m1, m2
    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
    lea         r0, [r6 + 8]    ; next 8 bytes of each row
    add         r2, 8
    dec         r4
    jnz         .loop
    RET

;-----------------------------------------------------------------------------
; intra_pred_ang16_5 (SSE4)
;   r0 = destination, r1 = stride, r5 = 3 * stride, r6 = r0 + 4 * stride
;   r2 = reference bytes, advanced by 32 on entry
;   r3 = ang_table + 16 * 16: [r3 + k * 16] is entry 16 + k (number in [])
;   m7 = pw_1024 (defined outside this chunk; presumably words of 1024, which
;        would make pmulhrsw a rounded >> 5 -- TODO confirm)
; .loop runs twice (r4d = 2); the body continues past this point.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_5, 3,7,8
    add         r2, 32
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 2
    lea         r5, [r1 * 3]    ; r5 -> 3 * stride
    lea         r6, [r0 + r1 * 4]    ; r6 -> 4 * stride
    mova        m7, [pw_1024]

.loop:
    ; bracketed lists show the reference byte indices held, highest lane first
    movu        m3, [r2 + 1]    ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu        m1, [r2 + 2]    ;[17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m2, m3, m1    ;[17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m3, m1    ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    palignr     m5, m2, m3, 2
    pmaddubsw   m4, m3, [r3 + 16]    ; [17]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m5, [r3 - 14 * 16]    ; [2]
    pmulhrsw    m1, m7
    packuswb    m4, m1
    palignr     m6, m2, m3, 4
    pmaddubsw   m5, [r3 + 3 * 16]    ; [19]
    pmulhrsw    m5, m7
    pmaddubsw   m1, m6, [r3 - 12 * 16]    ; [4]
    pmulhrsw    m1, m7
    packuswb    m5, m1
    palignr     m1, m2, m3, 6
    pmaddubsw   m6, [r3 + 5 * 16]    ; [21]
    pmulhrsw    m6, m7
    pmaddubsw   m0, m1, [r3 - 10 * 16]    ; [6]
    pmulhrsw    m0, m7
; Continuation of the intra_pred_ang16_5 .loop body.
;   m3/m2 = low/high interleaved reference pairs, r3 = ang_table + 16 * 16
;   ([r3 + k * 16] is entry 16 + k, the number in []), m7 = pw_1024,
;   r6 = r0 + 4 * stride.
    packuswb    m6, m0
    palignr     m0, m2, m3, 8
    pmaddubsw   m1, [r3 + 7 * 16]    ; [23]
    pmulhrsw    m1, m7
    pmaddubsw   m0, [r3 - 8 * 16]    ; [8]
    pmulhrsw    m0, m7
    packuswb    m1, m0
    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    ; second group of eight results; m2/m3 are read again after the macro call
    palignr     m4, m2, m3, 8
    palignr     m5, m2, m3, 10
    pmaddubsw   m4, [r3 + 9 * 16]    ; [25]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m5, [r3 - 6 * 16]    ; [10]
    pmulhrsw    m1, m7
    packuswb    m4, m1
    palignr     m6, m2, m3, 12
    pmaddubsw   m5, [r3 + 11 * 16]    ; [27]
    pmulhrsw    m5, m7
    pmaddubsw   m1, m6, [r3 - 4 * 16]    ; [12]
    pmulhrsw    m1, m7
    packuswb    m5, m1
    palignr     m1, m2, m3, 14
    pmaddubsw   m6, [r3 + 13 * 16]    ; [29]
    pmulhrsw    m6, m7
    pmaddubsw   m0, m1, [r3 - 2 * 16]    ; [14]
    pmulhrsw    m0, m7
    packuswb    m6, m0
    pmaddubsw   m1, [r3 + 15 * 16]    ; [31]
    pmulhrsw    m1, m7
    pmaddubsw   m2, [r3]    ; [16]
    pmulhrsw    m2, m7
    packuswb    m1, m2
    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
    lea         r0, [r6 + r1 * 4]    ; r0 moves down 8 rows
    lea         r6, [r6 + r1 * 8]
    add         r2, 8
    dec         r4
    jnz         .loop
    RET

;-----------------------------------------------------------------------------
; intra_pred_ang16_31 (SSE4)
;   r0 = destination, r1 = stride, r5 = 3 * stride, r6 = saved r0
;   r2 = reference bytes (no adjustment on entry)
;   r3 = ang_table + 16 * 16: [r3 + k * 16] is entry 16 + k (number in [])
;   m7 = pw_1024 (defined outside this chunk; presumably words of 1024, which
;        would make pmulhrsw a rounded >> 5 -- TODO confirm)
; .loop runs twice (r4d = 2); the body continues past this point.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_31, 3,7,8
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 2
    lea         r5, [r1 * 3]    ; r5 -> 3 * stride
    mov         r6, r0
    mova        m7, [pw_1024]

.loop:
    movu        m3, [r2 + 1]    ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu        m1, [r2 + 2]    ;[17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m2, m3, m1    ;[17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m3, m1    ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    palignr     m5, m2, m3, 2
    pmaddubsw   m4, m3, [r3 + 16]    ; [17]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m5, [r3 - 14 * 16]    ; [2]
    pmulhrsw    m1, m7
    packuswb    m4, m1
    palignr     m6, m2, m3, 4
    pmaddubsw   m5, [r3 + 3 * 16]    ; [19]
    pmulhrsw    m5, m7
    pmaddubsw   m1, m6, [r3 - 12 * 16]    ; [4]
    pmulhrsw    m1, m7
    packuswb    m5, m1
    palignr     m1, m2, m3, 6
    pmaddubsw   m6, [r3 + 5 * 16]    ; [21]
    pmulhrsw    m6, m7
    pmaddubsw   m0, m1, [r3 - 10 * 16]    ; [6]
    pmulhrsw    m0, m7
    packuswb    m6, m0
    palignr     m0, m2, m3, 8
    pmaddubsw   m1, [r3 + 7 * 16]    ; [23]
    pmulhrsw    m1, m7
    pmaddubsw   m0, [r3 - 8 * 16]    ; [8]
    pmulhrsw    m0, m7
    packuswb    m1, m0
    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    palignr     m4, m2, m3, 8
    palignr     m5, m2, m3, 10
    pmaddubsw   m4, [r3 + 9 * 16]    ; [25]
    pmulhrsw    m4, m7
; Continuation of the intra_pred_ang16_31 .loop body (second group of eight
; results). r3 = ang_table + 16 * 16, so [r3 + k * 16] is entry 16 + k (the
; number in []); m7 = pw_1024; r6 = saved r0.
    pmaddubsw   m1, m5, [r3 - 6 * 16]    ; [10]
    pmulhrsw    m1, m7
    packuswb    m4, m1
    palignr     m6, m2, m3, 12
    pmaddubsw   m5, [r3 + 11 * 16]    ; [27]
    pmulhrsw    m5, m7
    pmaddubsw   m1, m6, [r3 - 4 * 16]    ; [12]
    pmulhrsw    m1, m7
    packuswb    m5, m1
    palignr     m1, m2, m3, 14
    pmaddubsw   m6, [r3 + 13 * 16]    ; [29]
    pmulhrsw    m6, m7
    pmaddubsw   m0, m1, [r3 - 2 * 16]    ; [14]
    pmulhrsw    m0, m7
    packuswb    m6, m0
    pmaddubsw   m1, [r3 + 15 * 16]    ; [31]
    pmulhrsw    m1, m7
    pmaddubsw   m2, [r3]    ; [16]
    pmulhrsw    m2, m7
    packuswb    m1, m2
    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
    lea         r0, [r6 + 8]    ; next 8 bytes of each row
    add         r2, 8
    dec         r4
    jnz         .loop
    RET

;-----------------------------------------------------------------------------
; intra_pred_ang16_6 (SSE4)
;   r0 = destination, r1 = stride, r5 = 3 * stride, r6 = r0 + 4 * stride
;   r2 = reference bytes, advanced by 32 on entry
;   r3 = ang_table + 16 * 16: [r3 + k * 16] is entry 16 + k (number in [])
;   m7 = pw_1024 (defined outside this chunk; presumably words of 1024, which
;        would make pmulhrsw a rounded >> 5 -- TODO confirm)
; Two passes (r4d = 2). Each pass hands two groups of eight packed results to
; TRANSPOSE_STORE_8x8 (macro outside this chunk; flag 1 presumably selects a
; transposing store -- confirm), then moves r0/r6 down 8 rows and r2 on by 8.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_6, 3,7,8
    add         r2, 32
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 2
    lea         r5, [r1 * 3]    ; r5 -> 3 * stride
    lea         r6, [r0 + r1 * 4]    ; r6 -> 4 * stride
    mova        m7, [pw_1024]

.loop:
    movu        m3, [r2 + 1]    ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1, m3, 1    ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m2, m3, m1    ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m3, m1    ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    pmaddubsw   m4, m3, [r3 - 3 * 16]    ; [13]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m3, [r3 + 10 * 16]    ; [26]
    pmulhrsw    m1, m7
    packuswb    m4, m1
    palignr     m6, m2, m3, 2
    pmaddubsw   m5, m6, [r3 - 9 * 16]    ; [7]
    pmulhrsw    m5, m7
    pmaddubsw   m6, [r3 + 4 * 16]    ; [20]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    palignr     m1, m2, m3, 4
    pmaddubsw   m6, m1, [r3 - 15 * 16]    ; [1]
    pmulhrsw    m6, m7
    pmaddubsw   m0, m1, [r3 - 2 * 16]    ; [14]
    pmulhrsw    m0, m7
    packuswb    m6, m0
    palignr     m0, m2, m3, 6
    pmaddubsw   m1, [r3 + 11 * 16]    ; [27]
    pmulhrsw    m1, m7
    pmaddubsw   m0, [r3 - 8 * 16]    ; [8]
    pmulhrsw    m0, m7
    packuswb    m1, m0
    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    palignr     m4, m2, m3, 6
    palignr     m6, m2, m3, 8
    pmaddubsw   m4, [r3 + 5 * 16]    ; [21]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m6, [r3 - 14 * 16]    ; [2]
    pmulhrsw    m1, m7
    packuswb    m4, m1
    pmaddubsw   m5, m6, [r3 - 16]    ; [15]
    pmulhrsw    m5, m7
    pmaddubsw   m6, [r3 + 12 * 16]    ; [28]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    palignr     m0, m2, m3, 10
    pmaddubsw   m6, m0, [r3 - 7 * 16]    ; [9]
    pmulhrsw    m6, m7
    pmaddubsw   m0, [r3 + 6 * 16]    ; [22]
    pmulhrsw    m0, m7
    packuswb    m6, m0
    palignr     m2, m3, 12
    pmaddubsw   m1, m2, [r3 - 13 * 16]    ; [3]
    pmulhrsw    m1, m7
    pmaddubsw   m2, [r3]    ; [16]
    pmulhrsw    m2, m7
    packuswb    m1, m2
    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
    lea         r0, [r6 + r1 * 4]    ; r0 moves down 8 rows
    lea         r6, [r6 + r1 * 8]
    add         r2, 8
    dec         r4
    jnz         .loop
    RET

;-----------------------------------------------------------------------------
; intra_pred_ang16_30 (SSE4)
;   Same register roles as above except r6 = saved r0 and r2 is used as
;   passed in. TRANSPOSE_STORE_8x8 is invoked with flag 0, and after each
;   pass r0 = r6 + 8 (next 8 bytes of each row) and r2 += 8.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_30, 3,7,8
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 2
    lea         r5, [r1 * 3]    ; r5 -> 3 * stride
    mov         r6, r0
    mova        m7, [pw_1024]

.loop:
    movu        m3, [r2 + 1]    ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1, m3, 1    ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m2, m3, m1    ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m3, m1    ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    pmaddubsw   m4, m3, [r3 - 3 * 16]    ; [13]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m3, [r3 + 10 * 16]    ; [26]
    pmulhrsw    m1, m7
    packuswb    m4, m1
    palignr     m6, m2, m3, 2
    pmaddubsw   m5, m6, [r3 - 9 * 16]    ; [7]
    pmulhrsw    m5, m7
    pmaddubsw   m6, [r3 + 4 * 16]    ; [20]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    palignr     m1, m2, m3, 4
    pmaddubsw   m6, m1, [r3 - 15 * 16]    ; [1]
    pmulhrsw    m6, m7
    pmaddubsw   m0, m1, [r3 - 2 * 16]    ; [14]
    pmulhrsw    m0, m7
    packuswb    m6, m0
    palignr     m0, m2, m3, 6
    pmaddubsw   m1, [r3 + 11 * 16]    ; [27]
    pmulhrsw    m1, m7
    pmaddubsw   m0, [r3 - 8 * 16]    ; [8]
    pmulhrsw    m0, m7
    packuswb    m1, m0
    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    palignr     m4, m2, m3, 6
    palignr     m6, m2, m3, 8
    pmaddubsw   m4, [r3 + 5 * 16]    ; [21]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m6, [r3 - 14 * 16]    ; [2]
    pmulhrsw    m1, m7
    packuswb    m4, m1
    pmaddubsw   m5, m6, [r3 - 16]    ; [15]
    pmulhrsw    m5, m7
    pmaddubsw   m6, [r3 + 12 * 16]    ; [28]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    palignr     m0, m2, m3, 10
    pmaddubsw   m6, m0, [r3 - 7 * 16]    ; [9]
    pmulhrsw    m6, m7
    pmaddubsw   m0, [r3 + 6 * 16]    ; [22]
    pmulhrsw    m0, m7
    packuswb    m6, m0
    palignr     m2, m3, 12
    pmaddubsw   m1, m2, [r3 - 13 * 16]    ; [3]
    pmulhrsw    m1, m7
    pmaddubsw   m2, [r3]    ; [16]
    pmulhrsw    m2, m7
    packuswb    m1, m2
    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
    lea         r0, [r6 + 8]
    add         r2, 8
    dec         r4
    jnz         .loop
    RET

; intra_pred_ang16_7 (SSE4): only the entry point and the first instruction
; are on these lines; the rest of the prologue and the loop follow.
INIT_XMM sse4
cglobal intra_pred_ang16_7, 3,7,8
    add         r2, 32
; Continuation of the intra_pred_ang16_7 prologue (r2 has already been
; advanced by 32 by the preceding instruction).
;   r0 = destination, r1 = stride, r5 = 3 * stride, r6 = r0 + 4 * stride
;   r3 = ang_table + 16 * 16: [r3 + k * 16] is entry 16 + k (number in [])
;   m7 = pw_1024 (defined outside this chunk; presumably words of 1024, which
;        would make pmulhrsw a rounded >> 5 -- TODO confirm)
; Two passes (r4d = 2); each hands two groups of eight packed results to
; TRANSPOSE_STORE_8x8 (macro outside this chunk), then moves r0/r6 down
; 8 rows and r2 forward 8 bytes.
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 2
    lea         r5, [r1 * 3]    ; r5 -> 3 * stride
    lea         r6, [r0 + r1 * 4]    ; r6 -> 4 * stride
    mova        m7, [pw_1024]

.loop:
    movu        m3, [r2 + 1]    ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1, m3, 1    ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m2, m3, m1    ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m3, m1    ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    pmaddubsw   m4, m3, [r3 - 7 * 16]    ; [9]
    pmulhrsw    m4, m7
    pmaddubsw   m0, m3, [r3 + 2 * 16]    ; [18]
    pmulhrsw    m0, m7
    packuswb    m4, m0
    palignr     m1, m2, m3, 2
    pmaddubsw   m5, m3, [r3 + 11 * 16]    ; [27]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m1, [r3 - 12 * 16]    ; [4]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    pmaddubsw   m6, m1, [r3 - 3 * 16]    ; [13]
    pmulhrsw    m6, m7
    pmaddubsw   m0, m1, [r3 + 6 * 16]    ; [22]
    pmulhrsw    m0, m7
    packuswb    m6, m0
    palignr     m0, m2, m3, 4
    pmaddubsw   m1, [r3 + 15 * 16]    ; [31]
    pmulhrsw    m1, m7
    pmaddubsw   m0, [r3 - 8 * 16]    ; [8]
    pmulhrsw    m0, m7
    packuswb    m1, m0
    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    ; second group of eight results; m2/m3 are read again after the macro call
    palignr     m1, m2, m3, 4
    pmaddubsw   m4, m1, [r3 + 16]    ; [17]
    pmulhrsw    m4, m7
    pmaddubsw   m1, [r3 + 10 * 16]    ; [26]
    pmulhrsw    m1, m7
    packuswb    m4, m1
    palignr     m0, m2, m3, 6
    pmaddubsw   m5, m0, [r3 - 13 * 16]    ; [03]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m0, [r3 - 4 * 16]    ; [12]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    pmaddubsw   m6, m0, [r3 + 5 * 16]    ; [21]
    pmulhrsw    m6, m7
    pmaddubsw   m0, [r3 + 14 * 16]    ; [30]
    pmulhrsw    m0, m7
    packuswb    m6, m0
    palignr     m2, m3, 8
    pmaddubsw   m1, m2, [r3 - 9 * 16]    ; [07]
    pmulhrsw    m1, m7
    pmaddubsw   m2, [r3]    ; [16]
    pmulhrsw    m2, m7
    packuswb    m1, m2
    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
    lea         r0, [r6 + r1 * 4]    ; r0 moves down 8 rows
    lea         r6, [r6 + r1 * 8]
    add         r2, 8
    dec         r4
    jnz         .loop
    RET

;-----------------------------------------------------------------------------
; intra_pred_ang16_29 (SSE4)
;   r0 = destination, r1 = stride, r5 = 3 * stride, r6 = saved r0
;   r2 = reference bytes (no adjustment on entry); r3 / m7 as above.
; .loop runs twice (r4d = 2); the body continues past this point.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_29, 3,7,8
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 2
    lea         r5, [r1 * 3]    ; r5 -> 3 * stride
    mov         r6, r0
    mova        m7, [pw_1024]

.loop:
    movu        m3, [r2 + 1]    ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1, m3, 1    ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m2, m3, m1    ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m3, m1
;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
; (lane layout of m3 after the punpcklbw that precedes this point)
; Continuation of the intra_pred_ang16_29 .loop body.
;   r3 = ang_table + 16 * 16: [r3 + k * 16] is entry 16 + k (number in [])
;   m7 = pw_1024; r6 = saved r0.
    pmaddubsw   m4, m3, [r3 - 7 * 16]    ; [9]
    pmulhrsw    m4, m7
    pmaddubsw   m0, m3, [r3 + 2 * 16]    ; [18]
    pmulhrsw    m0, m7
    packuswb    m4, m0
    palignr     m1, m2, m3, 2
    pmaddubsw   m5, m3, [r3 + 11 * 16]    ; [27]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m1, [r3 - 12 * 16]    ; [4]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    pmaddubsw   m6, m1, [r3 - 3 * 16]    ; [13]
    pmulhrsw    m6, m7
    pmaddubsw   m0, m1, [r3 + 6 * 16]    ; [22]
    pmulhrsw    m0, m7
    packuswb    m6, m0
    palignr     m0, m2, m3, 4
    pmaddubsw   m1, [r3 + 15 * 16]    ; [31]
    pmulhrsw    m1, m7
    pmaddubsw   m0, [r3 - 8 * 16]    ; [8]
    pmulhrsw    m0, m7
    packuswb    m1, m0
    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    palignr     m1, m2, m3, 4
    pmaddubsw   m4, m1, [r3 + 16]    ; [17]
    pmulhrsw    m4, m7
    pmaddubsw   m1, [r3 + 10 * 16]    ; [26]
    pmulhrsw    m1, m7
    packuswb    m4, m1
    palignr     m0, m2, m3, 6
    pmaddubsw   m5, m0, [r3 - 13 * 16]    ; [03]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m0, [r3 - 4 * 16]    ; [12]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    pmaddubsw   m6, m0, [r3 + 5 * 16]    ; [21]
    pmulhrsw    m6, m7
    pmaddubsw   m0, [r3 + 14 * 16]    ; [30]
    pmulhrsw    m0, m7
    packuswb    m6, m0
    palignr     m2, m3, 8
    pmaddubsw   m1, m2, [r3 - 9 * 16]    ; [07]
    pmulhrsw    m1, m7
    pmaddubsw   m2, [r3]    ; [16]
    pmulhrsw    m2, m7
    packuswb    m1, m2
    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
    lea         r0, [r6 + 8]    ; next 8 bytes of each row
    add         r2, 8
    dec         r4
    jnz         .loop
    RET

;-----------------------------------------------------------------------------
; intra_pred_ang16_8 (SSE4)
;   r0 = destination, r1 = stride, r5 = 3 * stride, r6 = r0 + 4 * stride
;   r2 = reference bytes, advanced by 32 on entry
;   r3 = ang_table + 16 * 16: [r3 + k * 16] is entry 16 + k (number in [])
;   m7 = pw_1024 (defined outside this chunk; presumably words of 1024, which
;        would make pmulhrsw a rounded >> 5 -- TODO confirm)
; Two passes (r4d = 2). Each pass hands two groups of eight packed results to
; TRANSPOSE_STORE_8x8 (macro outside this chunk; flag 1 presumably selects a
; transposing store -- confirm), then moves r0/r6 down 8 rows and r2 on by 8.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_8, 3,7,8
    add         r2, 32
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 2
    lea         r5, [r1 * 3]    ; r5 -> 3 * stride
    lea         r6, [r0 + r1 * 4]    ; r6 -> 4 * stride
    mova        m7, [pw_1024]

.loop:
    movu        m1, [r2 + 1]    ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m3, m1, 1    ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m0, m1, m3    ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m1, m3    ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    pmaddubsw   m4, m1, [r3 - 11 * 16]    ; [5]
    pmulhrsw    m4, m7
    pmaddubsw   m2, m1, [r3 - 6 * 16]    ; [10]
    pmulhrsw    m2, m7
    packuswb    m4, m2
    pmaddubsw   m5, m1, [r3 - 1 * 16]    ; [15]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m1, [r3 + 4 * 16]    ; [20]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    pmaddubsw   m6, m1, [r3 + 9 * 16]    ; [25]
    pmulhrsw    m6, m7
    pmaddubsw   m2, m1, [r3 + 14 * 16]    ; [30]
    pmulhrsw    m2, m7
    packuswb    m6, m2
    ; m2 / m3 = the pair stream shifted by one and two pairs respectively
    palignr     m2, m0, m1, 2
    palignr     m3, m0, m1, 4
    pmaddubsw   m1, m2, [r3 - 13 * 16]    ; [3]
    pmulhrsw    m1, m7
    pmaddubsw   m0, m2, [r3 - 8 * 16]    ; [8]
    pmulhrsw    m0, m7
    packuswb    m1, m0
    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4, m2, [r3 - 3 * 16]    ; [13]
    pmulhrsw    m4, m7
    pmaddubsw   m5, m2, [r3 + 2 * 16]    ; [18]
    pmulhrsw    m5, m7
    packuswb    m4, m5
    pmaddubsw   m5, m2, [r3 + 7 * 16]    ; [23]
    pmulhrsw    m5, m7
    pmaddubsw   m2, [r3 + 12 * 16]    ; [28]
    pmulhrsw    m2, m7
    packuswb    m5, m2
    pmaddubsw   m6, m3, [r3 - 15 * 16]    ; [01]
    pmulhrsw    m6, m7
    pmaddubsw   m1, m3, [r3 - 10 * 16]    ; [06]
    pmulhrsw    m1, m7
    packuswb    m6, m1
    pmaddubsw   m1, m3, [r3 - 5 * 16]    ; [11]
    pmulhrsw    m1, m7
    pmaddubsw   m3, [r3]    ; [16]
    pmulhrsw    m3, m7
    packuswb    m1, m3
    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
    lea         r0, [r6 + r1 * 4]    ; r0 moves down 8 rows
    lea         r6, [r6 + r1 * 8]
    add         r2, 8
    dec         r4
    jnz         .loop
    RET

;-----------------------------------------------------------------------------
; intra_pred_ang16_28 (SSE4)
;   Same register roles as above except r6 = saved r0 and r2 is used as
;   passed in. .loop runs twice; the body continues past this point.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_28, 3,7,8
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 2
    lea         r5, [r1 * 3]    ; r5 -> 3 * stride
    mov         r6, r0
    mova        m7, [pw_1024]

.loop:
    movu        m1, [r2 + 1]    ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m3, m1, 1    ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m0, m1, m3    ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m1, m3    ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    pmaddubsw   m4, m1, [r3 - 11 * 16]    ; [5]
    pmulhrsw    m4, m7
    pmaddubsw   m2, m1, [r3 - 6 * 16]    ; [10]
    pmulhrsw    m2, m7
    packuswb    m4, m2
    pmaddubsw   m5, m1, [r3 - 1 * 16]    ; [15]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m1, [r3 + 4 * 16]    ; [20]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    pmaddubsw   m6, m1, [r3 + 9 * 16]    ; [25]
    pmulhrsw    m6, m7
    pmaddubsw   m2, m1, [r3 + 14 * 16]    ; [30]
    pmulhrsw    m2, m7
    packuswb    m6, m2
    palignr     m2, m0, m1, 2
    palignr     m3, m0, m1, 4
    pmaddubsw   m1, m2, [r3 - 13 * 16]    ; [3]
    pmulhrsw    m1, m7
    pmaddubsw   m0, m2, [r3 - 8 * 16]    ; [8]
    pmulhrsw    m0, m7
    packuswb    m1, m0
    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pmaddubsw   m4, m2, [r3 - 3 * 16]    ; [13]
    pmulhrsw    m4, m7
    pmaddubsw   m5, m2, [r3 + 2 * 16]    ; [18]
    pmulhrsw    m5, m7
; Continuation of the intra_pred_ang16_28 .loop body (second group of eight
; results). r3 = ang_table + 16 * 16, so [r3 + k * 16] is entry 16 + k (the
; number in []); m7 = pw_1024; r6 = saved r0.
    packuswb    m4, m5
    pmaddubsw   m5, m2, [r3 + 7 * 16]    ; [23]
    pmulhrsw    m5, m7
    pmaddubsw   m2, [r3 + 12 * 16]    ; [28]
    pmulhrsw    m2, m7
    packuswb    m5, m2
    pmaddubsw   m6, m3, [r3 - 15 * 16]    ; [01]
    pmulhrsw    m6, m7
    pmaddubsw   m1, m3, [r3 - 10 * 16]    ; [06]
    pmulhrsw    m1, m7
    packuswb    m6, m1
    pmaddubsw   m1, m3, [r3 - 5 * 16]    ; [11]
    pmulhrsw    m1, m7
    pmaddubsw   m3, [r3]    ; [16]
    pmulhrsw    m3, m7
    packuswb    m1, m3
    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
    lea         r0, [r6 + 8]    ; next 8 bytes of each row
    add         r2, 8
    dec         r4
    jnz         .loop
    RET

;-----------------------------------------------------------------------------
; intra_pred_ang16_9 (SSE4)
;   r0 = destination, r1 = stride, r5 = 3 * stride, r6 = r0 + 4 * stride
;   r2 = reference bytes, advanced by 32 on entry
;   r3 = ang_table + 16 * 16: [r3 + k * 16] is entry 16 + k (number in [])
;   m7 = pw_1024 (defined outside this chunk; presumably words of 1024, which
;        would make pmulhrsw a rounded >> 5 -- TODO confirm)
; Every weighted result uses the same interleaved pairs (m2); only the table
; entry changes (2, 4, ... 30). The sixteenth result is the unweighted
; neighbour bytes kept in m3. Two passes (r4d = 2); each ends by moving r0/r6
; down 8 rows and r2 forward 8 bytes.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_9, 3,7,8
    add         r2, 32
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 2
    lea         r5, [r1 * 3]    ; r5 -> 3 * stride
    lea         r6, [r0 + r1 * 4]    ; r6 -> 4 * stride
    mova        m7, [pw_1024]

.loop:
    movu        m2, [r2 + 1]    ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m3, m2, 1    ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpcklbw   m2, m3    ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    pmaddubsw   m4, m2, [r3 - 14 * 16]    ; [2]
    pmulhrsw    m4, m7
    pmaddubsw   m0, m2, [r3 - 12 * 16]    ; [4]
    pmulhrsw    m0, m7
    packuswb    m4, m0
    pmaddubsw   m5, m2, [r3 - 10 * 16]    ; [6]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m2, [r3 - 8 * 16]    ; [8]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    pmaddubsw   m6, m2, [r3 - 6 * 16]    ; [10]
    pmulhrsw    m6, m7
    pmaddubsw   m0, m2, [r3 - 4 * 16]    ; [12]
    pmulhrsw    m0, m7
    packuswb    m6, m0
    pmaddubsw   m1, m2, [r3 - 2 * 16]    ; [14]
    pmulhrsw    m1, m7
    pmaddubsw   m0, m2, [r3]    ; [16]
    pmulhrsw    m0, m7
    packuswb    m1, m0
    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4, m2, [r3 + 2 * 16]    ; [18]
    pmulhrsw    m4, m7
    pmaddubsw   m5, m2, [r3 + 4 * 16]    ; [20]
    pmulhrsw    m5, m7
    packuswb    m4, m5
    pmaddubsw   m5, m2, [r3 + 6 * 16]    ; [22]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m2, [r3 + 8 * 16]    ; [24]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    pmaddubsw   m6, m2, [r3 + 10 * 16]    ; [26]
    pmulhrsw    m6, m7
    pmaddubsw   m1, m2, [r3 + 12 * 16]    ; [28]
    pmulhrsw    m1, m7
    packuswb    m6, m1
    pmaddubsw   m1, m2, [r3 + 14 * 16]    ; [30]
    pmulhrsw    m1, m7
    packuswb    m1, m1
    ; high half of m1 = raw bytes from m3 (no weighting needed)
    punpcklqdq  m1, m3    ; [00]
    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
    lea         r0, [r6 + r1 * 4]    ; r0 moves down 8 rows
    lea         r6, [r6 + r1 * 8]
    add         r2, 8
    dec         r4
    jnz         .loop
    RET

;-----------------------------------------------------------------------------
; intra_pred_ang16_27 (SSE4)
;   Same table entries as the function above, but r6 = saved r0, r2 is used
;   as passed in, the first eight results go through TRANSPOSE_STORE_8x8 with
;   flag 0, and the last eight are stored by hand, 8 bytes per row. After a
;   pass r0 = r6 + 8 and r2 += 8.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_27, 3,7,8
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 2
    lea         r5, [r1 * 3]    ; r5 -> 3 * stride
    mov         r6, r0
    mova        m7, [pw_1024]

.loop:
    movu        m3, [r2 + 1]    ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m2, m3, 1    ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpcklbw   m3, m2    ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    pmaddubsw   m4, m3, [r3 - 14 * 16]    ; [2]
    pmulhrsw    m4, m7
    pmaddubsw   m0, m3, [r3 - 12 * 16]    ; [4]
    pmulhrsw    m0, m7
    packuswb    m4, m0
    pmaddubsw   m5, m3, [r3 - 10 * 16]    ; [6]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m3, [r3 - 8 * 16]    ; [8]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    pmaddubsw   m6, m3, [r3 - 6 * 16]    ; [10]
    pmulhrsw    m6, m7
    pmaddubsw   m0, m3, [r3 - 4 * 16]    ; [12]
    pmulhrsw    m0, m7
    packuswb    m6, m0
    pmaddubsw   m1, m3, [r3 - 2 * 16]    ; [14]
    pmulhrsw    m1, m7
    pmaddubsw   m0, m3, [r3]    ; [16]
    pmulhrsw    m0, m7
    packuswb    m1, m0
    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pmaddubsw   m4, m3, [r3 + 2 * 16]    ; [18]
    pmulhrsw    m4, m7
    pmaddubsw   m5, m3, [r3 + 4 * 16]    ; [20]
    pmulhrsw    m5, m7
    packuswb    m4, m5
    pmaddubsw   m5, m3, [r3 + 6 * 16]    ; [22]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m3, [r3 + 8 * 16]    ; [24]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    pmaddubsw   m6, m3, [r3 + 10 * 16]    ; [26]
    pmulhrsw    m6, m7
    pmaddubsw   m1, m3, [r3 + 12 * 16]    ; [28]
    pmulhrsw    m1, m7
    packuswb    m6, m1
    pmaddubsw   m1, m3, [r3 + 14 * 16]    ; [30]
    pmulhrsw    m1, m7
    packuswb    m1, m1
    movh        [r0         ], m4
    movhps      [r0 + r1    ], m4
    movh        [r0 + r1 * 2], m5
    movhps      [r0 + r5    ], m5
    lea         r0, [r0 + r1 * 4]
    movh        [r0         ], m6
    movhps      [r0 + r1    ], m6
    movh        [r0 + r1 * 2], m1
    movh        [r0 + r5    ], m2    ; last row: raw neighbour bytes
    lea         r0, [r6 + 8]
    add         r2, 8
    dec         r4
    jnz         .loop
    RET

;-----------------------------------------------------------------------------
; intra_pred_ang16_10 (SSE4)
;   r0 = destination, r1 = stride, r5 = 3 * stride, r2 = reference bytes,
;   r4 = fifth argument, tested later to decide whether to filter the first
;   row.
; Each of rows 1..15 is filled with one byte of the 16 bytes at
; [r2 + 1 + 32]: palignr moves byte k to lane 0 and pshufb with the all-zero
; mask m7 copies lane 0 to every lane. The function continues past this point.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_10, 5,6,8
    lea         r5, [r1 * 3]
    pxor        m7, m7
    movu        m0, [r2 + 1 + 32]
    palignr     m1, m0, 1
    pshufb      m1, m7
    palignr     m2, m0, 2
    pshufb      m2, m7
    palignr     m3, m0, 3
    pshufb      m3, m7
    palignr     m4, m0, 4
    pshufb      m4, m7
    palignr     m5, m0, 5
    pshufb      m5, m7
    palignr     m6, m0, 6
    pshufb      m6, m7
    movu        [r0 + r1], m1
    movu        [r0 + r1 * 2], m2
    movu        [r0 + r5], m3
    lea         r3, [r0 + r1 * 4]
    movu        [r3], m4
    movu        [r3 + r1], m5
    movu        [r3 + r1 * 2], m6
    palignr     m1, m0, 7
    pshufb      m1, m7
; Continuation of intra_pred_ang16_10: m0 = 16 bytes from [r2 + 1 + 32],
; m7 = 0 (pshufb mask that copies lane 0 to every lane), r3 = r0 + 4 * stride,
; r5 = 3 * stride. Rows 1..6 are already stored; m1 holds row 7.
    movhlps     m2, m0    ; lane 0 = byte 8 of m0
    pshufb      m2, m7
    palignr     m3, m0, 9
    pshufb      m3, m7
    palignr     m4, m0, 10
    pshufb      m4, m7
    palignr     m5, m0, 11
    pshufb      m5, m7
    palignr     m6, m0, 12
    pshufb      m6, m7
    movu        [r3 + r5], m1
    lea         r3, [r3 + r1 * 4]
    movu        [r3], m2
    movu        [r3 + r1], m3
    movu        [r3 + r1 * 2], m4
    movu        [r3 + r5], m5
    lea         r3, [r3 + r1 * 4]
    movu        [r3], m6
    palignr     m1, m0, 13
    pshufb      m1, m7
    palignr     m2, m0, 14
    pshufb      m2, m7
    palignr     m3, m0, 15
    pshufb      m3, m7
    pshufb      m0, m7    ; row 0 value, stored at .quit
    movu        [r3 + r1], m1
    movu        [r3 + r1 * 2], m2
    movu        [r3 + r5], m3

    ; filter
    ; When the flag is set, row 0 becomes
    ;   m0 + (([r2 + 1 + i] - [r2]) >> 1), saturated to a byte.
    ; NOTE(review): only the low 16 bits of r4 are tested here; fine for a
    ; 0/1 flag -- confirm callers never pass other values.
    cmp         r4w, byte 0
    jz          .quit
    pmovzxbw    m0, m0
    mova        m1, m0
    movu        m2, [r2]
    movu        m3, [r2 + 1]
    pshufb      m2, m7
    pmovzxbw    m2, m2
    movhlps     m4, m3
    pmovzxbw    m3, m3
    pmovzxbw    m4, m4
    psubw       m3, m2
    psubw       m4, m2
    psraw       m3, 1
    psraw       m4, 1
    paddw       m0, m3
    paddw       m1, m4
    packuswb    m0, m1

.quit:
    movu        [r0], m0
    RET

;-----------------------------------------------------------------------------
; intra_pred_ang16_26 (SSE4)
;   r0 = destination, r1 = stride, r2 = reference bytes.
;   The fifth argument is kept as `bfilter`: r7w on x86-64, a stack dword on
;   x86-32, because r4 is reused below for 3 * stride.
; All 16 rows receive the same 16 bytes from [r2 + 1]. If bfilter is nonzero,
; the first byte of every row is then replaced by
;   [r2 + 1] + (([r2 + 1 + 32 + i] - [r2]) >> 1), saturated to a byte.
;-----------------------------------------------------------------------------
INIT_XMM sse4
%if ARCH_X86_64 == 1
cglobal intra_pred_ang16_26, 3,8,5
    mov         r7, r4mp
    %define bfilter r7w
%else
cglobal intra_pred_ang16_26, 5,7,5,0-4
    %define bfilter dword[rsp]
    mov         bfilter, r4
%endif
    movu        m0, [r2 + 1]
    lea         r4, [r1 * 3]
    lea         r3, [r0 + r1 * 4]
    lea         r5, [r3 + r1 * 4]
    lea         r6, [r5 + r1 * 4]
    movu        [r0], m0
    movu        [r0 + r1], m0
    movu        [r0 + r1 * 2], m0
    movu        [r0 + r4], m0
    movu        [r3], m0
    movu        [r3 + r1], m0
    movu        [r3 + r1 * 2], m0
    movu        [r3 + r4], m0
    movu        [r5], m0
    movu        [r5 + r1], m0
    movu        [r5 + r1 * 2], m0
    movu        [r5 + r4], m0
    movu        [r6], m0
    movu        [r6 + r1], m0
    movu        [r6 + r1 * 2], m0
    movu        [r6 + r4], m0

    ; filter
    cmp         bfilter, byte 0
    jz          .quit
    pxor        m4, m4
    pshufb      m0, m4
    pmovzxbw    m0, m0
    mova        m1, m0
    ; only lane 0 of m2 survives the pshufb below, so the inserted [r2] byte
    ; is the value that gets broadcast
    movu        m2, [r2 + 32]
    pinsrb      m2, [r2], 0
    movu        m3, [r2 + 1 + 32]
    pshufb      m2, m4
    pmovzxbw    m2, m2
    movhlps     m4, m3
    pmovzxbw    m3, m3
    pmovzxbw    m4, m4
    psubw       m3, m2
    psubw       m4, m2
    psraw       m3, 1
    psraw       m4, 1
    paddw       m0, m3
    paddw       m1, m4
    packuswb    m0, m1
    ; scatter one filtered byte to the start of each row
    pextrb      [r0], m0, 0
    pextrb      [r0 + r1], m0, 1
    pextrb      [r0 + r1 * 2], m0, 2
    pextrb      [r0 + r4], m0, 3
    pextrb      [r3], m0, 4
    pextrb      [r3 + r1], m0, 5
    pextrb      [r3 + r1 * 2], m0, 6
    pextrb      [r3 + r4], m0, 7
    pextrb      [r5], m0, 8
    pextrb      [r5 + r1], m0, 9
    pextrb      [r5 + r1 * 2], m0, 10
    pextrb      [r5 + r4], m0, 11
    pextrb      [r6], m0, 12
    pextrb      [r6 + r1], m0, 13
    pextrb      [r6 + r1 * 2], m0, 14
    pextrb      [r6 + r4], m0, 15
.quit:
    RET

;-----------------------------------------------------------------------------
; intra_pred_ang16_11 (SSE4)
;   r0 = destination, r1 = stride, r5 = 3 * stride, r6 = r0 + 4 * stride
;   r2 = reference bytes; the 16 bytes at [r2 + 32] are used with byte 0
;        replaced by [r2]
;   r3 = ang_table + 16 * 16: [r3 + k * 16] is entry 16 + k (number in [])
;   m7 = pw_1024 (defined outside this chunk; presumably words of 1024, which
;        would make pmulhrsw a rounded >> 5 -- TODO confirm)
; No loop: the same sequence is written out twice, the second time reading
; from [r2 + 40] after r0/r6 have moved down 8 rows. m2 keeps the raw bytes
; for the unweighted [00] result.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_11, 3,7,8
    lea         r3, [ang_table + 16 * 16]
    lea         r5, [r1 * 3]    ; r5 -> 3 * stride
    lea         r6, [r0 + r1 * 4]    ; r6 -> 4 * stride
    mova        m7, [pw_1024]

    movu        m3, [r2 + 32]    ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    pinsrb      m3, [r2], 0
    mova        m2, m3
    palignr     m1, m3, 1    ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    punpcklbw   m3, m1    ;[8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
    pmaddubsw   m4, m3, [r3 + 14 * 16]    ; [30]
    pmulhrsw    m4, m7
    pmaddubsw   m0, m3, [r3 + 12 * 16]    ; [28]
    pmulhrsw    m0, m7
    packuswb    m4, m0
    pmaddubsw   m5, m3, [r3 + 10 * 16]    ; [26]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m3, [r3 + 8 * 16]    ; [24]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    pmaddubsw   m6, m3, [r3 + 6 * 16]    ; [22]
    pmulhrsw    m6, m7
    pmaddubsw   m0, m3, [r3 + 4 * 16]    ; [20]
    pmulhrsw    m0, m7
    packuswb    m6, m0
    pmaddubsw   m1, m3, [r3 + 2 * 16]    ; [18]
    pmulhrsw    m1, m7
    pmaddubsw   m0, m3, [r3]    ; [16]
    pmulhrsw    m0, m7
    packuswb    m1, m0
    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4, m3, [r3 - 2 * 16]    ; [14]
    pmulhrsw    m4, m7
    pmaddubsw   m5, m3, [r3 - 4 * 16]    ; [12]
    pmulhrsw    m5, m7
    packuswb    m4, m5
    pmaddubsw   m5, m3, [r3 - 6 * 16]    ; [10]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m3, [r3 - 8 * 16]    ; [08]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    pmaddubsw   m6, m3, [r3 - 10 * 16]    ; [06]
    pmulhrsw    m6, m7
    pmaddubsw   m1, m3, [r3 - 12 * 16]    ; [04]
    pmulhrsw    m1, m7
    packuswb    m6, m1
    pmaddubsw   m1, m3, [r3 - 14 * 16]    ; [02]
    pmulhrsw    m1, m7
    packuswb    m1, m1
    punpcklqdq  m1, m2    ;[00]
    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    ; second copy of the sequence: 8 rows further down, 8 bytes further on
    lea         r0, [r6 + r1 * 4]
    lea         r6, [r6 + r1 * 8]
    movu        m3, [r2 + 40]    ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    mova        m2, m3
    palignr     m1, m3, 1    ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    punpcklbw   m3, m1    ;[8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
    pmaddubsw   m4, m3, [r3 + 14 * 16]    ; [30]
    pmulhrsw    m4, m7
    pmaddubsw   m0, m3, [r3 + 12 * 16]    ; [28]
    pmulhrsw    m0, m7
    packuswb    m4, m0
    pmaddubsw   m5, m3, [r3 + 10 * 16]    ; [26]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m3, [r3 + 8 * 16]    ; [24]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    pmaddubsw   m6, m3, [r3 + 6 * 16]    ; [22]
    pmulhrsw    m6, m7
    pmaddubsw   m0, m3, [r3 + 4 * 16]    ; [20]
    pmulhrsw    m0, m7
    packuswb    m6, m0
    pmaddubsw   m1, m3, [r3 + 2 * 16]    ; [18]
    pmulhrsw    m1, m7
    pmaddubsw   m0, m3, [r3]    ; [16]
    pmulhrsw    m0, m7
    packuswb    m1, m0
    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4, m3, [r3 - 2 * 16]    ; [14]
    pmulhrsw    m4, m7
    pmaddubsw   m5, m3, [r3 - 4 * 16]    ; [12]
    pmulhrsw    m5, m7
    packuswb    m4, m5
    pmaddubsw   m5, m3, [r3 - 6 * 16]    ; [10]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m3, [r3 - 8 * 16]    ; [08]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    pmaddubsw   m6, m3, [r3 - 10 * 16]    ; [06]
    pmulhrsw    m6, m7
    pmaddubsw   m1, m3, [r3 - 12 * 16]    ; [04]
    pmulhrsw    m1, m7
    packuswb    m6, m1
    pmaddubsw   m1, m3, [r3 - 14 * 16]    ; [02]
    pmulhrsw    m1, m7
    packuswb    m1, m1
    punpcklqdq  m1, m2    ;[00]
    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
    RET

;-----------------------------------------------------------------------------
; intra_pred_ang16_25 (SSE4)
;   Same table entries as the function above, looped instead of unrolled:
;   r6 = saved r0, reference read from [r2], two passes (r4d = 2).
;   The body continues past this point.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_25, 3,7,8
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 2
    lea         r5, [r1 * 3]    ; r5 -> 3 * stride
    mov         r6, r0
    mova        m7, [pw_1024]

.loop:
    movu        m3, [r2]    ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    mova        m2, m3
    palignr     m1, m3, 1    ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    punpcklbw   m3, m1    ;[8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
    pmaddubsw   m4, m3, [r3 + 14 * 16]    ; [30]
    pmulhrsw    m4, m7
    pmaddubsw   m0, m3, [r3 + 12 * 16]    ; [28]
    pmulhrsw    m0, m7
    packuswb    m4, m0
    pmaddubsw   m5, m3, [r3 + 10 * 16]    ; [26]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m3, [r3 + 8 * 16]    ; [24]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    pmaddubsw   m6, m3, [r3 + 6 * 16]    ; [22]
    pmulhrsw    m6, m7
    pmaddubsw   m0, m3, [r3 + 4 * 16]    ; [20]
    pmulhrsw    m0, m7
    packuswb    m6, m0
    pmaddubsw   m1, m3, [r3 + 2 * 16]    ; [18]
    pmulhrsw    m1, m7
    pmaddubsw   m0, m3, [r3]    ; [16]
    pmulhrsw    m0, m7
    packuswb    m1, m0
    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pmaddubsw   m4, m3, [r3 - 2 * 16]    ; [14]
    pmulhrsw    m4, m7
    pmaddubsw   m5, m3, [r3 - 4 * 16]    ; [12]
    pmulhrsw    m5, m7
    packuswb    m4, m5
    pmaddubsw   m5, m3, [r3 - 6 * 16]    ; [10]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m3, [r3 - 8 * 16]    ; [08]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    pmaddubsw   m6, m3, [r3 - 10 * 16]    ; [06]
    pmulhrsw    m6, m7
; Continuation of the intra_pred_ang16_25 .loop body: last weighted results
; and the hand-written store of the second eight rows (8 bytes per row).
;   r3 = ang_table + 16 * 16 ([r3 + k * 16] is entry 16 + k, number in []),
;   m7 = pw_1024, m2 = raw reference bytes, r5 = 3 * stride, r6 = saved r0.
    pmaddubsw   m1, m3, [r3 - 12 * 16]    ; [04]
    pmulhrsw    m1, m7
    packuswb    m6, m1
    pmaddubsw   m1, m3, [r3 - 14 * 16]    ; [02]
    pmulhrsw    m1, m7
    packuswb    m1, m1
    movh        [r0         ], m4
    movhps      [r0 + r1    ], m4
    movh        [r0 + r1 * 2], m5
    movhps      [r0 + r5    ], m5
    lea         r0, [r0 + r1 * 4]
    movh        [r0         ], m6
    movhps      [r0 + r1    ], m6
    movh        [r0 + r1 * 2], m1
    movh        [r0 + r5    ], m2    ; last row: raw reference bytes
    lea         r0, [r6 + 8]    ; next 8 bytes of each row
    add         r2, 8
    dec         r4
    jnz         .loop
    RET

;-----------------------------------------------------------------------------
; intra_pred_ang16_12 (SSE4)
;   r0 = destination, r1 = stride, r5 = 3 * stride, r6 = r0 + 4 * stride
;   r2 = reference bytes; the 16 bytes at [r2 + 32] are used with byte 0
;        replaced by [r2]
;   r4 = ang_table + 16 * 16: [r4 + k * 16] is entry 16 + k (number in [])
;   m7 = pw_1024 (defined outside this chunk; presumably words of 1024, which
;        would make pmulhrsw a rounded >> 5 -- TODO confirm)
;   m2 = bytes of [r2] picked by the c_mode16_12 shuffle; they are shifted
;        into the front of the pair stream (palignr m3, m2, 15 / 14) each
;        time the stream has to step back by one byte
; No loop: the sequence is written out twice; the second copy runs after
; r0/r6 have moved down 8 rows and starts from [r2 + 1 + 32].
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_12, 4,7,8
    lea         r4, [ang_table + 16 * 16]
    lea         r5, [r1 * 3]    ; r5 -> 3 * stride
    lea         r6, [r0 + r1 * 4]    ; r6 -> 4 * stride
    mova        m7, [pw_1024]

    movu        m3, [r2 + 32]    ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    pinsrb      m3, [r2], 0
    punpckhbw   m0, m3, m3    ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
    punpcklbw   m3, m3    ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
    movu        m2, [r2]
    pshufb      m2, [c_mode16_12]
    palignr     m0, m3, 1    ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
    pmaddubsw   m4, m0, [r4 + 11 * 16]    ; [27]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m0, [r4 + 6 * 16]    ; [22]
    pmulhrsw    m1, m7
    packuswb    m4, m1
    pmaddubsw   m5, m0, [r4 + 1 * 16]    ; [17]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m0, [r4 - 4 * 16]    ; [12]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    pmaddubsw   m6, m0, [r4 - 9 * 16]    ; [7]
    pmulhrsw    m6, m7
    pmaddubsw   m0, [r4 - 14 * 16]    ; [2]
    pmulhrsw    m0, m7
    packuswb    m6, m0
    palignr     m3, m2, 15
    pmaddubsw   m1, m3, [r4 + 13 * 16]    ; [29]
    pmulhrsw    m1, m7
    pmaddubsw   m0, m3, [r4 + 8 * 16]    ; [24]
    pmulhrsw    m0, m7
    packuswb    m1, m0
    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4, m3, [r4 + 3 * 16]    ; [19]
    pmulhrsw    m4, m7
    pmaddubsw   m5, m3, [r4 - 2 * 16]    ; [14]
    pmulhrsw    m5, m7
    packuswb    m4, m5
    pmaddubsw   m5, m3, [r4 - 7 * 16]    ; [09]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m3, [r4 - 12 * 16]    ; [04]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    palignr     m3, m2, 14
    pmaddubsw   m6, m3, [r4 + 15 * 16]    ; [31]
    pmulhrsw    m6, m7
    pmaddubsw   m1, m3, [r4 + 10 * 16]    ; [26]
    pmulhrsw    m1, m7
    packuswb    m6, m1
    pmaddubsw   m1, m3, [r4 + 5 * 16]    ; [21]
    pmulhrsw    m1, m7
    pmaddubsw   m3, [r4]    ; [16]
    pmulhrsw    m3, m7
    packuswb    m1, m3
    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    ; second copy of the sequence, 8 rows further down
    lea         r0, [r6 + r1 * 4]
    lea         r6, [r6 + r1 * 8]
    movu        m1, [r2 + 1 + 32]    ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    pslldq      m3, m1, 1    ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
    punpckhbw   m3, m1    ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
    movlhps     m2, m1    ; [8 7 6 5 4 3 2 1 x x x x x x x]
    pmaddubsw   m4, m3, [r4 + 11 * 16]    ; [27]
    pmulhrsw    m4, m7
    pmaddubsw   m5, m3, [r4 + 6 * 16]    ; [22]
    pmulhrsw    m5, m7
    packuswb    m4, m5
    pmaddubsw   m5, m3, [r4 + 1 * 16]    ; [17]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m3, [r4 - 4 * 16]    ; [12]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    pmaddubsw   m6, m3, [r4 - 9 * 16]    ; [7]
    pmulhrsw    m6, m7
    pmaddubsw   m0, m3, [r4 - 14 * 16]    ; [2]
    pmulhrsw    m0, m7
    packuswb    m6, m0
    palignr     m3, m2, 14
    pmaddubsw   m1, m3, [r4 + 13 * 16]    ; [29]
    pmulhrsw    m1, m7
    pmaddubsw   m0, m3, [r4 + 8 * 16]    ; [24]
    pmulhrsw    m0, m7
    packuswb    m1, m0
    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4, m3, [r4 + 3 * 16]    ; [19]
    pmulhrsw    m4, m7
    pmaddubsw   m5, m3, [r4 - 2 * 16]    ; [14]
    pmulhrsw    m5, m7
    packuswb    m4, m5
    pmaddubsw   m5, m3, [r4 - 7 * 16]    ; [09]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m3, [r4 - 12 * 16]    ; [04]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    pslldq      m2, 1
    palignr     m3, m2, 14
    pmaddubsw   m6, m3, [r4 + 15 * 16]    ; [31]
    pmulhrsw    m6, m7
    pmaddubsw   m1, m3, [r4 + 10 * 16]    ; [26]
    pmulhrsw    m1, m7
    packuswb    m6, m1
    pmaddubsw   m1, m3, [r4 + 5 * 16]    ; [21]
    pmulhrsw    m1, m7
    pmaddubsw   m3, [r4]    ; [16]
    pmulhrsw    m3, m7
    packuswb    m1, m3
    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
    RET

;-----------------------------------------------------------------------------
; intra_pred_ang16_24 (SSE4)
;   Same table entries and c_mode16_12 shuffle as the function above, with
;   the two source areas swapped: the main bytes come from [r2] and the
;   shuffled bytes from [r2 + 32]. r6 = saved r0, TRANSPOSE_STORE_8x8 is
;   invoked with flag 0, and the second copy of the sequence starts at
;   r0 = r6 + 8 reading [r2 + 1].
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_24, 4,7,8
    lea         r4, [ang_table + 16 * 16]
    lea         r5, [r1 * 3]    ; r5 -> 3 * stride
    mov         r6, r0
    mova        m7, [pw_1024]

    movu        m3, [r2]    ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    punpckhbw   m0, m3, m3    ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
    punpcklbw   m3, m3    ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
    movu        m2, [r2 + 32]
    pshufb      m2, [c_mode16_12]
    palignr     m0, m3, 1    ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
    pmaddubsw   m4, m0, [r4 + 11 * 16]    ; [27]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m0, [r4 + 6 * 16]    ; [22]
    pmulhrsw    m1, m7
    packuswb    m4, m1
    pmaddubsw   m5, m0, [r4 + 1 * 16]    ; [17]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m0, [r4 - 4 * 16]    ; [12]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    pmaddubsw   m6, m0, [r4 - 9 * 16]    ; [7]
    pmulhrsw    m6, m7
    pmaddubsw   m0, [r4 - 14 * 16]    ; [2]
    pmulhrsw    m0, m7
    packuswb    m6, m0
    palignr     m3, m2, 15
    pmaddubsw   m1, m3, [r4 + 13 * 16]    ; [29]
    pmulhrsw    m1, m7
    pmaddubsw   m0, m3, [r4 + 8 * 16]    ; [24]
    pmulhrsw    m0, m7
    packuswb    m1, m0
    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pmaddubsw   m4, m3, [r4 + 3 * 16]    ; [19]
    pmulhrsw    m4, m7
    pmaddubsw   m5, m3, [r4 - 2 * 16]    ; [14]
    pmulhrsw    m5, m7
    packuswb    m4, m5
    pmaddubsw   m5, m3, [r4 - 7 * 16]    ; [09]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m3, [r4 - 12 * 16]    ; [04]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    palignr     m3, m2, 14
    pmaddubsw   m6, m3, [r4 + 15 * 16]    ; [31]
    pmulhrsw    m6, m7
    pmaddubsw   m1, m3, [r4 + 10 * 16]    ; [26]
    pmulhrsw    m1, m7
    packuswb    m6, m1
    pmaddubsw   m1, m3, [r4 + 5 * 16]    ; [21]
    pmulhrsw    m1, m7
    pmaddubsw   m3, [r4]    ; [16]
    pmulhrsw    m3, m7
    packuswb    m1, m3
    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1

    ; second copy of the sequence, 8 bytes further along each row
    lea         r0, [r6 + 8]
    movu        m1, [r2 + 1]    ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    pslldq      m3, m1, 1    ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
    punpckhbw   m3, m1    ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
    movlhps     m2, m1    ; [8 7 6 5 4 3 2 1 x x x x x x x]
    pmaddubsw   m4, m3, [r4 + 11 * 16]    ; [27]
    pmulhrsw    m4, m7
    pmaddubsw   m5, m3, [r4 + 6 * 16]    ; [22]
    pmulhrsw    m5, m7
    packuswb    m4, m5
    pmaddubsw   m5, m3, [r4 + 1 * 16]    ; [17]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m3, [r4 - 4 * 16]    ; [12]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    pmaddubsw   m6, m3, [r4 - 9 * 16]    ; [7]
    pmulhrsw    m6, m7
    pmaddubsw   m0, m3, [r4 - 14 * 16]    ; [2]
    pmulhrsw    m0, m7
    packuswb    m6, m0
    palignr     m3, m2, 14
    pmaddubsw   m1, m3, [r4 + 13 * 16]    ; [29]
    pmulhrsw    m1, m7
    pmaddubsw   m0, m3, [r4 + 8 * 16]    ; [24]
    pmulhrsw    m0, m7
    packuswb    m1, m0
    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pmaddubsw   m4, m3, [r4 + 3 * 16]    ; [19]
    pmulhrsw    m4, m7
    pmaddubsw   m5, m3, [r4 - 2 * 16]    ; [14]
    pmulhrsw    m5, m7
    packuswb    m4, m5
    pmaddubsw   m5, m3, [r4 - 7 * 16]    ; [09]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m3, [r4 - 12 * 16]    ; [04]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    pslldq      m2, 1
    palignr     m3, m2, 14
    pmaddubsw   m6, m3, [r4 + 15 * 16]    ; [31]
    pmulhrsw    m6, m7
    pmaddubsw   m1, m3, [r4 + 10 * 16]    ; [26]
    pmulhrsw    m1, m7
    packuswb    m6, m1
    pmaddubsw   m1, m3, [r4 + 5 * 16]    ; [21]
    pmulhrsw    m1, m7
    pmaddubsw   m3, [r4]    ; [16]
    pmulhrsw    m3, m7
    packuswb    m1, m3
    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
    RET

;-----------------------------------------------------------------------------
; intra_pred_ang16_13 (SSE4)
;   Register roles as for intra_pred_ang16_12 above, with c_mode16_13 as the
;   shuffle for m2. The pair stream steps back more often here: one palignr
;   (preceded by pslldq m2, 1 once m2's top bytes are used up) before each
;   such step. Only the first of the two unrolled copies is on these lines;
;   the pointer update for the second copy ends this part.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_13, 4,7,8
    lea         r4, [ang_table + 16 * 16]
    lea         r5, [r1 * 3]    ; r5 -> 3 * stride
    lea         r6, [r0 + r1 * 4]    ; r6 -> 4 * stride
    mova        m7, [pw_1024]

    movu        m3, [r2 + 32]    ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    pinsrb      m3, [r2], 0
    punpckhbw   m5, m3, m3    ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
    punpcklbw   m3, m3    ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
    movu        m2, [r2]
    pshufb      m2, [c_mode16_13]
    palignr     m5, m3, 1    ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
    pmaddubsw   m4, m5, [r4 + 7 * 16]    ; [23]
    pmulhrsw    m4, m7
    pmaddubsw   m0, m5, [r4 - 2 * 16]    ; [14]
    pmulhrsw    m0, m7
    packuswb    m4, m0
    pmaddubsw   m5, [r4 - 11 * 16]    ; [05]
    pmulhrsw    m5, m7
    palignr     m3, m2, 15
    pmaddubsw   m6, m3, [r4 + 12 * 16]    ; [28]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    pmaddubsw   m6, m3, [r4 + 3 * 16]    ; [19]
    pmulhrsw    m6, m7
    pmaddubsw   m0, m3, [r4 - 6 * 16]    ; [10]
    pmulhrsw    m0, m7
    packuswb    m6, m0
    pmaddubsw   m1, m3, [r4 - 15 * 16]    ; [01]
    pmulhrsw    m1, m7
    palignr     m3, m2, 14
    pmaddubsw   m0, m3, [r4 + 8 * 16]    ; [24]
    pmulhrsw    m0, m7
    packuswb    m1, m0
    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4, m3, [r4 - 16]    ; [15]
    pmulhrsw    m4, m7
    pmaddubsw   m5, m3, [r4 - 10 * 16]    ; [06]
    pmulhrsw    m5, m7
    packuswb    m4, m5
    pslldq      m2, 1
    palignr     m3, m2, 14
    pmaddubsw   m5, m3, [r4 + 13 * 16]    ; [29]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m3, [r4 + 4 * 16]    ; [20]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    pmaddubsw   m6, m3, [r4 - 5 * 16]    ; [11]
    pmulhrsw    m6, m7
    pmaddubsw   m1, m3, [r4 - 14 * 16]    ; [02]
    pmulhrsw    m1, m7
    packuswb    m6, m1
    pslldq      m2, 1
    palignr     m3, m2, 14
    pmaddubsw   m1, m3, [r4 + 9 * 16]    ; [25]
    pmulhrsw    m1, m7
    pmaddubsw   m3, [r4]    ; [16]
    pmulhrsw    m3, m7
    packuswb    m1, m3
    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    lea         r0, [r6 + r1 * 4]    ; r0 moves down 8 rows
    lea         r6, [r6 + r1 * 8]
movu m1, [r2 + 1 + 32] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x] pmaddubsw m4, m3, [r4 + 7 * 16] ; [23] pmulhrsw m4, m7 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] pmulhrsw m5, m7 packuswb m4, m5 pmaddubsw m5, m3, [r4 - 11 * 16] ; [05] pmulhrsw m5, m7 palignr m3, m2, 14 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m3, [r4 + 3 * 16] ; [19] pmulhrsw m6, m7 pmaddubsw m0, m3, [r4 - 6 * 16] ; [10] pmulhrsw m0, m7 packuswb m6, m0 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] pmulhrsw m1, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] pmulhrsw m0, m7 packuswb m1, m0 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 pmaddubsw m4, m3, [r4 - 16] ; [15] pmulhrsw m4, m7 pmaddubsw m5, m3, [r4 - 10 * 16] ; [06] pmulhrsw m5, m7 packuswb m4, m5 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m5, m3, [r4 + 13 * 16] ; [29] pmulhrsw m5, m7 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] pmulhrsw m6, m7 pmaddubsw m1, m3, [r4 - 14 * 16] ; [02] pmulhrsw m1, m7 packuswb m6, m1 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m1, m3, [r4 + 9 * 16] ; [25] pmulhrsw m1, m7 pmaddubsw m3, [r4] ; [16] pmulhrsw m3, m7 packuswb m1, m3 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 RET INIT_XMM sse4 cglobal intra_pred_ang16_23, 4,7,8 lea r4, [ang_table + 16 * 16] lea r5, [r1 * 3] ; r5 -> 3 * stride mov r6, r0 mova m7, [pw_1024] movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] movu m2, [r2 + 32] pshufb m2, [c_mode16_13] palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] pmaddubsw m4, m5, [r4 + 7 * 16] ; [23] pmulhrsw m4, m7 pmaddubsw m0, m5, [r4 - 2 * 16] ; [14] pmulhrsw m0, m7 packuswb m4, m0 pmaddubsw m5, 
[r4 - 11 * 16] ; [05] pmulhrsw m5, m7 palignr m3, m2, 15 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m3, [r4 + 3 * 16] ; [19] pmulhrsw m6, m7 pmaddubsw m0, m3, [r4 - 6 * 16] ; [10] pmulhrsw m0, m7 packuswb m6, m0 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] pmulhrsw m1, m7 palignr m3, m2, 14 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] pmulhrsw m0, m7 packuswb m1, m0 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 pmaddubsw m4, m3, [r4 - 16] ; [15] pmulhrsw m4, m7 pmaddubsw m5, m3, [r4 - 10 * 16] ; [06] pmulhrsw m5, m7 packuswb m4, m5 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m5, m3, [r4 + 13 * 16] ; [29] pmulhrsw m5, m7 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] pmulhrsw m6, m7 pmaddubsw m1, m3, [r4 - 14 * 16] ; [02] pmulhrsw m1, m7 packuswb m6, m1 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m1, m3, [r4 + 9 * 16] ; [25] pmulhrsw m1, m7 pmaddubsw m3, [r4] ; [16] pmulhrsw m3, m7 packuswb m1, m3 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 lea r0, [r6 + 8] movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x] pmaddubsw m4, m3, [r4 + 7 * 16] ; [23] pmulhrsw m4, m7 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] pmulhrsw m5, m7 packuswb m4, m5 pmaddubsw m5, m3, [r4 - 11 * 16] ; [05] pmulhrsw m5, m7 palignr m3, m2, 14 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m3, [r4 + 3 * 16] ; [19] pmulhrsw m6, m7 pmaddubsw m0, m3, [r4 - 6 * 16] ; [10] pmulhrsw m0, m7 packuswb m6, m0 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] pmulhrsw m1, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] pmulhrsw m0, m7 packuswb m1, m0 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 pmaddubsw m4, m3, [r4 - 16] ; [15] pmulhrsw m4, m7 pmaddubsw m5, m3, [r4 - 10 * 16] ; [06] pmulhrsw m5, m7 
packuswb m4, m5 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m5, m3, [r4 + 13 * 16] ; [29] pmulhrsw m5, m7 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] pmulhrsw m6, m7 pmaddubsw m1, m3, [r4 - 14 * 16] ; [02] pmulhrsw m1, m7 packuswb m6, m1 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m1, m3, [r4 + 9 * 16] ; [25] pmulhrsw m1, m7 pmaddubsw m3, [r4] ; [16] pmulhrsw m3, m7 packuswb m1, m3 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 RET INIT_XMM sse4 cglobal intra_pred_ang16_14, 4,7,8 lea r4, [ang_table + 16 * 16] lea r5, [r1 * 3] ; r5 -> 3 * stride lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride mova m7, [pw_1024] movu m3, [r2 + 32] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] pinsrb m3, [r2], 0 punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] movu m2, [r2] pshufb m2, [c_mode16_14] palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] pmaddubsw m4, m5, [r4 + 3 * 16] ; [19] pmulhrsw m4, m7 pmaddubsw m5, [r4 - 10 * 16] ; [06] pmulhrsw m5, m7 packuswb m4, m5 palignr m3, m2, 15 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] pmulhrsw m5, m7 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] pmulhrsw m6, m7 packuswb m5, m6 palignr m3, m2, 14 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] pmulhrsw m6, m7 pmaddubsw m0, m3, [r4 + 2 * 16] ; [18] pmulhrsw m0, m7 packuswb m6, m0 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] pmulhrsw m1, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] pmulhrsw m0, m7 packuswb m1, m0 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 pmaddubsw m4, m3, [r4 - 5 * 16] ; [11] pmulhrsw m4, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] pmulhrsw m5, m7 packuswb m4, m5 pmaddubsw m5, m3, [r4 + 16] ; [17] pmulhrsw m5, m7 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] pmulhrsw m6, m7 packuswb m5, m6 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] pmulhrsw m6, m7 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] pmulhrsw m1, m7 
packuswb m6, m1 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] pmulhrsw m1, m7 pmaddubsw m3, [r4] ; [16] pmulhrsw m3, m7 packuswb m1, m3 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 lea r0, [r6 + r1 * 4] lea r6, [r6 + r1 * 8] movu m1, [r2 + 1 + 32] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x] pmaddubsw m4, m3, [r4 + 3 * 16] ; [19] pmulhrsw m4, m7 pmaddubsw m5, m3, [r4 - 10 * 16] ; [06] pmulhrsw m5, m7 packuswb m4, m5 palignr m3, m2, 14 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] pmulhrsw m5, m7 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] pmulhrsw m6, m7 packuswb m5, m6 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] pmulhrsw m6, m7 pmaddubsw m0, m3, [r4 + 2 * 16] ; [18] pmulhrsw m0, m7 packuswb m6, m0 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] pmulhrsw m1, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] pmulhrsw m0, m7 packuswb m1, m0 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 pmaddubsw m4, m3, [r4 - 5 * 16] ; [11] pmulhrsw m4, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] pmulhrsw m5, m7 packuswb m4, m5 pmaddubsw m5, m3, [r4 + 16] ; [17] pmulhrsw m5, m7 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] pmulhrsw m6, m7 packuswb m5, m6 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] pmulhrsw m6, m7 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] pmulhrsw m1, m7 packuswb m6, m1 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] pmulhrsw m1, m7 pmaddubsw m3, [r4] ; [16] pmulhrsw m3, m7 packuswb m1, m3 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 RET INIT_XMM sse4 cglobal intra_pred_ang16_22, 4,7,8 lea r4, [ang_table + 16 * 16] lea r5, [r1 * 3] ; r5 -> 3 * stride mov r6, r0 mova m7, [pw_1024] movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 
11 11 10 10 9 9 8 8] punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] movu m2, [r2 + 32] pshufb m2, [c_mode16_14] palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] pmaddubsw m4, m5, [r4 + 3 * 16] ; [19] pmulhrsw m4, m7 pmaddubsw m5, [r4 - 10 * 16] ; [06] pmulhrsw m5, m7 packuswb m4, m5 palignr m3, m2, 15 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] pmulhrsw m5, m7 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] pmulhrsw m6, m7 packuswb m5, m6 palignr m3, m2, 14 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] pmulhrsw m6, m7 pmaddubsw m0, m3, [r4 + 2 * 16] ; [18] pmulhrsw m0, m7 packuswb m6, m0 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] pmulhrsw m1, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] pmulhrsw m0, m7 packuswb m1, m0 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 pmaddubsw m4, m3, [r4 - 5 * 16] ; [11] pmulhrsw m4, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] pmulhrsw m5, m7 packuswb m4, m5 pmaddubsw m5, m3, [r4 + 16] ; [17] pmulhrsw m5, m7 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] pmulhrsw m6, m7 packuswb m5, m6 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] pmulhrsw m6, m7 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] pmulhrsw m1, m7 packuswb m6, m1 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] pmulhrsw m1, m7 pmaddubsw m3, [r4] ; [16] pmulhrsw m3, m7 packuswb m1, m3 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 lea r0, [r6 + 8] movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x] pmaddubsw m4, m3, [r4 + 3 * 16] ; [19] pmulhrsw m4, m7 pmaddubsw m5, m3, [r4 - 10 * 16] ; [06] pmulhrsw m5, m7 packuswb m4, m5 palignr m3, m2, 14 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] pmulhrsw m5, m7 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] pmulhrsw m6, m7 packuswb m5, m6 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m6, m3, [r4 + 15 * 
16] ; [31] pmulhrsw m6, m7 pmaddubsw m0, m3, [r4 + 2 * 16] ; [18] pmulhrsw m0, m7 packuswb m6, m0 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] pmulhrsw m1, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] pmulhrsw m0, m7 packuswb m1, m0 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 pmaddubsw m4, m3, [r4 - 5 * 16] ; [11] pmulhrsw m4, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] pmulhrsw m5, m7 packuswb m4, m5 pmaddubsw m5, m3, [r4 + 16] ; [17] pmulhrsw m5, m7 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] pmulhrsw m6, m7 packuswb m5, m6 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] pmulhrsw m6, m7 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] pmulhrsw m1, m7 packuswb m6, m1 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] pmulhrsw m1, m7 pmaddubsw m3, [r4] ; [16] pmulhrsw m3, m7 packuswb m1, m3 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 RET INIT_XMM sse4 cglobal intra_pred_ang16_15, 4,7,8 lea r4, [ang_table + 16 * 16] lea r5, [r1 * 3] ; r5 -> 3 * stride lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride mova m7, [pw_1024] movu m3, [r2 + 32] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] pinsrb m3, [r2], 0 punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] movu m2, [r2] pshufb m2, [c_mode16_15] palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] pmaddubsw m4, [r4 - 16] ; [15] pmulhrsw m4, m7 palignr m3, m2, 15 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] pmulhrsw m5, m7 packuswb m4, m5 pmaddubsw m5, m3, [r4 - 3 * 16] ; [13] pmulhrsw m5, m7 palignr m3, m2, 14 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] pmulhrsw m6, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m0, m3, [r4 + 10 * 16] ; [26] pmulhrsw m0, m7 packuswb m6, m0 pmaddubsw m1, m3, [r4 - 7 * 16] ; [09] pmulhrsw m1, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] pmulhrsw m0, m7 packuswb m1, m0 
TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 pmaddubsw m4, m3, [r4 - 9 * 16] ; [07] pmulhrsw m4, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] pmulhrsw m5, m7 packuswb m4, m5 pmaddubsw m5, m3, [r4 - 11 * 16] ; [05] pmulhrsw m5, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m3, [r4 - 13 * 16] ; [03] pmulhrsw m6, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m1, m3, [r4 + 2 * 16] ; [18] pmulhrsw m1, m7 packuswb m6, m1 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] pmulhrsw m1, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m3, [r4] ; [16] pmulhrsw m3, m7 packuswb m1, m3 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 lea r0, [r6 + r1 * 4] lea r6, [r6 + r1 * 8] movu m1, [r2 + 1 + 32] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0 0 0 0 0 0 0 15L] pmaddubsw m4, m3, [r4 - 16] ; [15] pmulhrsw m4, m7 palignr m3, m2, 14 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] pmulhrsw m5, m7 packuswb m4, m5 pmaddubsw m5, m3, [r4 - 3 * 16] ; [13] pmulhrsw m5, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] pmulhrsw m6, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m0, m3, [r4 + 10 * 16] ; [26] pmulhrsw m0, m7 packuswb m6, m0 pmaddubsw m1, m3, [r4 - 7 * 16] ; [09] pmulhrsw m1, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] pmulhrsw m0, m7 packuswb m1, m0 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 pmaddubsw m4, m3, [r4 - 9 * 16] ; [07] pmulhrsw m4, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] pmulhrsw m5, m7 packuswb m4, m5 pmaddubsw m5, m3, [r4 - 11 * 16] ; [05] pmulhrsw m5, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m3, [r4 - 13 * 
16] ; [03] pmulhrsw m6, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m1, m3, [r4 + 2 * 16] ; [18] pmulhrsw m1, m7 packuswb m6, m1 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] pmulhrsw m1, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m3, [r4] ; [16] pmulhrsw m3, m7 packuswb m1, m3 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 RET INIT_XMM sse4 cglobal intra_pred_ang16_21, 4,7,8 lea r4, [ang_table + 16 * 16] lea r5, [r1 * 3] ; r5 -> 3 * stride mov r6, r0 mova m7, [pw_1024] movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] movu m2, [r2 + 32] pinsrb m2, [r2], 0 pshufb m2, [c_mode16_15] palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] pmaddubsw m4, [r4 - 16] ; [15] pmulhrsw m4, m7 palignr m3, m2, 15 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] pmulhrsw m5, m7 packuswb m4, m5 pmaddubsw m5, m3, [r4 - 3 * 16] ; [13] pmulhrsw m5, m7 palignr m3, m2, 14 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] pmulhrsw m6, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m0, m3, [r4 + 10 * 16] ; [26] pmulhrsw m0, m7 packuswb m6, m0 pmaddubsw m1, m3, [r4 - 7 * 16] ; [09] pmulhrsw m1, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] pmulhrsw m0, m7 packuswb m1, m0 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 pmaddubsw m4, m3, [r4 - 9 * 16] ; [07] pmulhrsw m4, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] pmulhrsw m5, m7 packuswb m4, m5 pmaddubsw m5, m3, [r4 - 11 * 16] ; [05] pmulhrsw m5, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m3, [r4 - 13 * 16] ; [03] pmulhrsw m6, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m1, m3, [r4 + 2 * 16] ; [18] pmulhrsw m1, m7 packuswb m6, m1 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] pmulhrsw m1, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m3, [r4] ; [16] 
pmulhrsw m3, m7 packuswb m1, m3 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 lea r0, [r6 + 8] movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0 0 0 0 0 0 0 15L] pmaddubsw m4, m3, [r4 - 16] ; [15] pmulhrsw m4, m7 palignr m3, m2, 14 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] pmulhrsw m5, m7 packuswb m4, m5 pmaddubsw m5, m3, [r4 - 3 * 16] ; [13] pmulhrsw m5, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] pmulhrsw m6, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m0, m3, [r4 + 10 * 16] ; [26] pmulhrsw m0, m7 packuswb m6, m0 pmaddubsw m1, m3, [r4 - 7 * 16] ; [09] pmulhrsw m1, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] pmulhrsw m0, m7 packuswb m1, m0 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 pmaddubsw m4, m3, [r4 - 9 * 16] ; [07] pmulhrsw m4, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] pmulhrsw m5, m7 packuswb m4, m5 pmaddubsw m5, m3, [r4 - 11 * 16] ; [05] pmulhrsw m5, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m3, [r4 - 13 * 16] ; [03] pmulhrsw m6, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m1, m3, [r4 + 2 * 16] ; [18] pmulhrsw m1, m7 packuswb m6, m1 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] pmulhrsw m1, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m3, [r4] ; [16] pmulhrsw m3, m7 packuswb m1, m3 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 RET INIT_XMM sse4 cglobal intra_pred_ang16_16, 4,7,8 lea r4, [ang_table + 16 * 16] lea r5, [r1 * 3] ; r5 -> 3 * stride lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride mova m7, [pw_1024] movu m3, [r2 + 32] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] pinsrb m3, [r2], 0 punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] punpcklbw m3, m3 ; 
[7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] movu m2, [r2] pshufb m2, [c_mode16_16] ; [2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8] palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] pmaddubsw m4, [r4 - 5 * 16] ; [11] pmulhrsw m4, m7 palignr m3, m2, 15 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] pmulhrsw m5, m7 packuswb m4, m5 pmaddubsw m5, m3, [r4 - 15 * 16] ; [01] pmulhrsw m5, m7 palignr m3, m2, 14 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] pmulhrsw m6, m7 packuswb m5, m6 pslldq m2, 1 ; [3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x] palignr m3, m2, 14 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] pmulhrsw m6, m7 pmaddubsw m0, m3, [r4 - 14 * 16] ; [02] pmulhrsw m0, m7 packuswb m6, m0 pslldq m2, 1 ; [5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x] palignr m3, m2, 14 pmaddubsw m1, m3, [r4 - 3 * 16] ; [13] pmulhrsw m1, m7 pslldq m2, 1 ; [6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x] palignr m3, m2, 14 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] pmulhrsw m0, m7 packuswb m1, m0 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 pmaddubsw m4, m3, [r4 - 13 * 16] ; [03] pmulhrsw m4, m7 pslldq m2, 1 ; [8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x] palignr m3, m2, 14 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] pmulhrsw m5, m7 packuswb m4, m5 pslldq m2, 1 ; [9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x] palignr m3, m2, 14 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] pmulhrsw m5, m7 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] pmulhrsw m6, m7 packuswb m5, m6 pslldq m2, 1 ; [11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x] palignr m3, m2, 14 pmaddubsw m6, m3, [r4 - 16] ; [15] pmulhrsw m6, m7 pslldq m2, 1 ; [12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x] palignr m3, m2, 14 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] pmulhrsw m1, m7 packuswb m6, m1 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] pmulhrsw m1, m7 pslldq m2, 1 ; [14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x, x] palignr m3, m2, 14 pmaddubsw m3, [r4] ; [16] pmulhrsw m3, m7 packuswb m1, m3 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 lea r0, [r6 
+ r1 * 4] lea r6, [r6 + r1 * 8] movu m1, [r2 + 1 + 32] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 2, 3, 5, 6, 8, x, x] movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 2, 3, 5, 6, 8, x, x] pmaddubsw m4, m3, [r4 - 5 * 16] ; [11] pmulhrsw m4, m7 palignr m3, m2, 14 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] pmulhrsw m5, m7 packuswb m4, m5 pmaddubsw m5, m3, [r4 - 15 * 16] ; [01] pmulhrsw m5, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] pmulhrsw m6, m7 packuswb m5, m6 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] pmulhrsw m6, m7 pmaddubsw m0, m3, [r4 - 14 * 16] ; [02] pmulhrsw m0, m7 packuswb m6, m0 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m1, m3, [r4 - 3 * 16] ; [13] pmulhrsw m1, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] pmulhrsw m0, m7 packuswb m1, m0 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 pmaddubsw m4, m3, [r4 - 13 * 16] ; [03] pmulhrsw m4, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] pmulhrsw m5, m7 packuswb m4, m5 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] pmulhrsw m5, m7 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] pmulhrsw m6, m7 packuswb m5, m6 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m6, m3, [r4 - 16] ; [15] pmulhrsw m6, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] pmulhrsw m1, m7 packuswb m6, m1 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] pmulhrsw m1, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m3, [r4] ; [16] pmulhrsw m3, m7 packuswb m1, m3 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 RET INIT_XMM sse4 cglobal intra_pred_ang16_20, 4,7,8 lea r4, [ang_table + 16 * 16] lea r5, [r1 * 3] ; r5 -> 3 * stride mov r6, r0 mova m7, [pw_1024] movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 
10 9 9 8 8] punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] movu m2, [r2 + 32] pinsrb m2, [r2], 0 pshufb m2, [c_mode16_16] ; [2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8] palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] pmaddubsw m4, [r4 - 5 * 16] ; [11] pmulhrsw m4, m7 palignr m3, m2, 15 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] pmulhrsw m5, m7 packuswb m4, m5 pmaddubsw m5, m3, [r4 - 15 * 16] ; [01] pmulhrsw m5, m7 palignr m3, m2, 14 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] pmulhrsw m6, m7 packuswb m5, m6 pslldq m2, 1 ; [3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x] palignr m3, m2, 14 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] pmulhrsw m6, m7 pmaddubsw m0, m3, [r4 - 14 * 16] ; [02] pmulhrsw m0, m7 packuswb m6, m0 pslldq m2, 1 ; [5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x] palignr m3, m2, 14 pmaddubsw m1, m3, [r4 - 3 * 16] ; [13] pmulhrsw m1, m7 pslldq m2, 1 ; [6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x] palignr m3, m2, 14 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] pmulhrsw m0, m7 packuswb m1, m0 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 pmaddubsw m4, m3, [r4 - 13 * 16] ; [03] pmulhrsw m4, m7 pslldq m2, 1 ; [8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x] palignr m3, m2, 14 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] pmulhrsw m5, m7 packuswb m4, m5 pslldq m2, 1 ; [9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x] palignr m3, m2, 14 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] pmulhrsw m5, m7 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] pmulhrsw m6, m7 packuswb m5, m6 pslldq m2, 1 ; [11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x] palignr m3, m2, 14 pmaddubsw m6, m3, [r4 - 16] ; [15] pmulhrsw m6, m7 pslldq m2, 1 ; [12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x] palignr m3, m2, 14 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] pmulhrsw m1, m7 packuswb m6, m1 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] pmulhrsw m1, m7 pslldq m2, 1 ; [14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x, x] palignr m3, m2, 14 pmaddubsw m3, [r4] ; [16] pmulhrsw m3, m7 packuswb m1, 
m3 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 lea r0, [r6 + 8] movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 2, 3, 5, 6, 8, x, x] movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 2, 3, 5, 6, 8, x, x] pmaddubsw m4, m3, [r4 - 5 * 16] ; [11] pmulhrsw m4, m7 palignr m3, m2, 14 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] pmulhrsw m5, m7 packuswb m4, m5 pmaddubsw m5, m3, [r4 - 15 * 16] ; [01] pmulhrsw m5, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] pmulhrsw m6, m7 packuswb m5, m6 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] pmulhrsw m6, m7 pmaddubsw m0, m3, [r4 - 14 * 16] ; [02] pmulhrsw m0, m7 packuswb m6, m0 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m1, m3, [r4 - 3 * 16] ; [13] pmulhrsw m1, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] pmulhrsw m0, m7 packuswb m1, m0 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 pmaddubsw m4, m3, [r4 - 13 * 16] ; [03] pmulhrsw m4, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] pmulhrsw m5, m7 packuswb m4, m5 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] pmulhrsw m5, m7 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] pmulhrsw m6, m7 packuswb m5, m6 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m6, m3, [r4 - 16] ; [15] pmulhrsw m6, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] pmulhrsw m1, m7 packuswb m6, m1 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] pmulhrsw m1, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m3, [r4] ; [16] pmulhrsw m3, m7 packuswb m1, m3 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 RET INIT_XMM sse4 cglobal intra_pred_ang16_17, 4,7,8 lea r4, [ang_table + 16 * 16] lea r5, [r1 * 3] ; r5 -> 3 * stride lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride mova m7, [pw_1024] movu m3, [r2 + 32] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 
0] pinsrb m3, [r2], 0 punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] movu m2, [r2] pshufb m2, [c_mode16_17] ; [1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4] palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] pmaddubsw m4, [r4 - 10 * 16] ; [06] pmulhrsw m4, m7 palignr m3, m2, 15 pmaddubsw m5, m3, [r4 - 4 * 16] ; [12] pmulhrsw m5, m7 packuswb m4, m5 palignr m3, m2, 14 pmaddubsw m5, m3, [r4 + 2 * 16] ; [18] pmulhrsw m5, m7 pslldq m2, 1 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, x] pinsrb m2, [r2 + 5], 0 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5] palignr m3, m2, 14 pmaddubsw m6, m3, [r4 + 8 * 16] ; [24] pmulhrsw m6, m7 packuswb m5, m6 pslldq m2, 1 ; [4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x] palignr m3, m2, 14 pmaddubsw m6, m3, [r4 + 14 * 16] ; [30] pmulhrsw m6, m7 pmaddubsw m0, m3, [r4 - 12 * 16] ; [04] pmulhrsw m0, m7 packuswb m6, m0 pslldq m2, 1 ; [5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x] palignr m3, m2, 14 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] pmulhrsw m1, m7 pslldq m2, 1 ; [6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x] palignr m3, m2, 14 pmaddubsw m0, m3, [r4] ; [16] pmulhrsw m0, m7 packuswb m1, m0 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 pslldq m2, 1 ; [7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x] palignr m3, m2, 14 pmaddubsw m4, m3, [r4 + 6 * 16] ; [22] pmulhrsw m4, m7 pslldq m2, 1 ; [9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x] palignr m3, m2, 14 pmaddubsw m5, m3, [r4 + 12 * 16] ; [28] pmulhrsw m5, m7 packuswb m4, m5 pmaddubsw m5, m3, [r4 - 14 * 16] ; [02] pmulhrsw m5, m7 pslldq m2, 1 ; [10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x] palignr m3, m2, 14 pmaddubsw m6, m3, [r4 - 8 * 16] ; [08] pmulhrsw m6, m7 packuswb m5, m6 pslldq m2, 1 ; [11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x] palignr m3, m2, 14 pmaddubsw m6, m3, [r4 - 2 * 16] ; [14] pmulhrsw m6, m7 pslldq m2, 1 ; [12, 14, 15, 0, 1, 2, 4, 5, 
x, x, x, x, x, x, x, x] palignr m3, m2, 14 pmaddubsw m1, m3, [r4 + 4 * 16] ; [20] pmulhrsw m1, m7 packuswb m6, m1 pslldq m2, 1 ; [14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x, x] palignr m3, m2, 14 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] pmulhrsw m1, m7 pmaddubsw m3, [r4 - 16 * 16] ; [00] pmulhrsw m3, m7 packuswb m1, m3 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 lea r0, [r6 + r1 * 4] lea r6, [r6 + r1 * 8] movu m1, [r2 + 1 + 32] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 1, 2, 4, 5, x, x, x] movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 1, 2, 4, 5, x, x, x] pmaddubsw m4, m3, [r4 - 10 * 16] ; [06] pmulhrsw m4, m7 palignr m3, m2, 14 pmaddubsw m5, m3, [r4 - 4 * 16] ; [12] pmulhrsw m5, m7 packuswb m4, m5 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m5, m3, [r4 + 2 * 16] ; [18] pmulhrsw m5, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m6, m3, [r4 + 8 * 16] ; [24] pmulhrsw m6, m7 packuswb m5, m6 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m6, m3, [r4 + 14 * 16] ; [30] pmulhrsw m6, m7 pmaddubsw m0, m3, [r4 - 12 * 16] ; [04] pmulhrsw m0, m7 packuswb m6, m0 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] pmulhrsw m1, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m0, m3, [r4] ; [16] pmulhrsw m0, m7 packuswb m1, m0 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m4, m3, [r4 + 6 * 16] ; [22] pmulhrsw m4, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m5, m3, [r4 + 12 * 16] ; [28] pmulhrsw m5, m7 packuswb m4, m5 pmaddubsw m5, m3, [r4 - 14 * 16] ; [02] pmulhrsw m5, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m6, m3, [r4 - 8 * 16] ; [08] pmulhrsw m6, m7 packuswb m5, m6 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m6, m3, [r4 - 2 * 16] ; [14] pmulhrsw m6, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m1, m3, [r4 + 4 * 16] ; [20] pmulhrsw m1, m7 packuswb m6, m1 pslldq m2, 1 
palignr m3, m2, 14 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] pmulhrsw m1, m7 pmaddubsw m3, [r4 - 16 * 16] ; [00] pmulhrsw m3, m7 packuswb m1, m3 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 RET INIT_XMM sse4 cglobal intra_pred_ang16_19, 4,7,8 lea r4, [ang_table + 16 * 16] lea r5, [r1 * 3] ; r5 -> 3 * stride mov r6, r0 mova m7, [pw_1024] movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] movu m2, [r2 + 32] pinsrb m2, [r2], 0 pshufb m2, [c_mode16_17] ; [1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4] palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] pmaddubsw m4, [r4 - 10 * 16] ; [06] pmulhrsw m4, m7 palignr m3, m2, 15 pmaddubsw m5, m3, [r4 - 4 * 16] ; [12] pmulhrsw m5, m7 packuswb m4, m5 palignr m3, m2, 14 pmaddubsw m5, m3, [r4 + 2 * 16] ; [18] pmulhrsw m5, m7 pslldq m2, 1 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, x] pinsrb m2, [r2 + 5 + 32], 0 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5] palignr m3, m2, 14 pmaddubsw m6, m3, [r4 + 8 * 16] ; [24] pmulhrsw m6, m7 packuswb m5, m6 pslldq m2, 1 ; [4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x] palignr m3, m2, 14 pmaddubsw m6, m3, [r4 + 14 * 16] ; [30] pmulhrsw m6, m7 pmaddubsw m0, m3, [r4 - 12 * 16] ; [04] pmulhrsw m0, m7 packuswb m6, m0 pslldq m2, 1 ; [5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x] palignr m3, m2, 14 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] pmulhrsw m1, m7 pslldq m2, 1 ; [6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x] palignr m3, m2, 14 pmaddubsw m0, m3, [r4] ; [16] pmulhrsw m0, m7 packuswb m1, m0 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 pslldq m2, 1 ; [7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x] palignr m3, m2, 14 pmaddubsw m4, m3, [r4 + 6 * 16] ; [22] pmulhrsw m4, m7 pslldq m2, 1 ; [9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x] palignr m3, m2, 14 pmaddubsw m5, m3, [r4 + 12 * 16] ; [28] pmulhrsw m5, m7 packuswb m4, m5 
pmaddubsw m5, m3, [r4 - 14 * 16] ; [02] pmulhrsw m5, m7 pslldq m2, 1 ; [10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x] palignr m3, m2, 14 pmaddubsw m6, m3, [r4 - 8 * 16] ; [08] pmulhrsw m6, m7 packuswb m5, m6 pslldq m2, 1 ; [11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x] palignr m3, m2, 14 pmaddubsw m6, m3, [r4 - 2 * 16] ; [14] pmulhrsw m6, m7 pslldq m2, 1 ; [12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x] palignr m3, m2, 14 pmaddubsw m1, m3, [r4 + 4 * 16] ; [20] pmulhrsw m1, m7 packuswb m6, m1 pslldq m2, 1 ; [14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x, x] palignr m3, m2, 14 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] pmulhrsw m1, m7 pmaddubsw m3, [r4 - 16 * 16] ; [00] pmulhrsw m3, m7 packuswb m1, m3 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 lea r0, [r6 + 8] movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] palignr m2, m2, 6 ; [x, x, x, x, x, 14, 15, 0, 1, 2, 4, 5, x, x, x] movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 2, 3, 5, 6, 8, x, x] pmaddubsw m4, m3, [r4 - 10 * 16] ; [06] pmulhrsw m4, m7 palignr m3, m2, 14 pmaddubsw m5, m3, [r4 - 4 * 16] ; [12] pmulhrsw m5, m7 packuswb m4, m5 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m5, m3, [r4 + 2 * 16] ; [18] pmulhrsw m5, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m6, m3, [r4 + 8 * 16] ; [24] pmulhrsw m6, m7 packuswb m5, m6 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m6, m3, [r4 + 14 * 16] ; [30] pmulhrsw m6, m7 pmaddubsw m0, m3, [r4 - 12 * 16] ; [04] pmulhrsw m0, m7 packuswb m6, m0 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] pmulhrsw m1, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m0, m3, [r4] ; [16] pmulhrsw m0, m7 packuswb m1, m0 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m4, m3, [r4 + 6 * 16] ; [22] pmulhrsw m4, m7 pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m5, m3, [r4 + 12 * 16] ; [28] pmulhrsw m5, m7 packuswb m4, 
m5                                      ; operand completing the `packuswb m4,` that ends the previous line
    pmaddubsw   m5, m3, [r4 - 14 * 16]  ; [02]
    pmulhrsw    m5, m7
    pslldq      m2, 1
    palignr     m3, m2, 14
    pmaddubsw   m6, m3, [r4 - 8 * 16]   ; [08]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    pslldq      m2, 1
    palignr     m3, m2, 14
    pmaddubsw   m6, m3, [r4 - 2 * 16]   ; [14]
    pmulhrsw    m6, m7
    pslldq      m2, 1
    palignr     m3, m2, 14
    pmaddubsw   m1, m3, [r4 + 4 * 16]   ; [20]
    pmulhrsw    m1, m7
    packuswb    m6, m1
    pslldq      m2, 1
    palignr     m3, m2, 14
    pmaddubsw   m1, m3, [r4 + 10 * 16]  ; [26]
    pmulhrsw    m1, m7
    pmaddubsw   m3, [r4 - 16 * 16]      ; [00]
    pmulhrsw    m3, m7
    packuswb    m1, m3
    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
    RET

;------------------------------------------------------------------------------
; Helper for intra_pred_ang16_18.
;   %1 = destination address expression, %2 = byte shift (1..15)
; Stores the 16 bytes that start %2 bytes into the 32-byte value m0:m1
; (m1 is the low half).  Clobbers m2.
;------------------------------------------------------------------------------
%macro ANG16_18_ROW 2
    palignr     m2, m0, m1, %2
    movu        [%1], m2
%endmacro

;------------------------------------------------------------------------------
; intra_pred_ang16_18: r0 = dst, r1 = dst stride, r2 = reference pixels.
; m0 holds the 16 bytes at [r2]; m1 holds the 16 bytes at [r2 + 32] reordered
; through the c_mode16_18 shuffle.  Row 0 is m0 itself and each following row
; is the m0:m1 window moved one byte further towards m1, so 16 rows of 16
; bytes are written.  r2, r3 and r4 are reused as 2x, 3x and 4x stride.
;------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_18, 4,5,3
    movu        m0, [r2]
    movu        m1, [r2 + 32]
    mova        m2, [c_mode16_18]
    pshufb      m1, m2

    lea         r2, [r1 * 2]            ; r2 = 2 * stride (source pointer no longer needed)
    lea         r3, [r1 * 3]            ; r3 = 3 * stride
    lea         r4, [r1 * 4]            ; r4 = 4 * stride

    ; rows 0-3
    movu        [r0], m0
    ANG16_18_ROW r0 + r1, 15
    ANG16_18_ROW r0 + r2, 14
    ANG16_18_ROW r0 + r3, 13

    ; rows 4-7
    lea         r0, [r0 + r4]
    ANG16_18_ROW r0, 12
    ANG16_18_ROW r0 + r1, 11
    ANG16_18_ROW r0 + r2, 10
    ANG16_18_ROW r0 + r3, 9

    ; rows 8-11
    lea         r0, [r0 + r4]
    ANG16_18_ROW r0, 8
    ANG16_18_ROW r0 + r1, 7
    ANG16_18_ROW r0 + r2, 6
    ANG16_18_ROW r0 + r3, 5

    ; rows 12-15 (the last row shifts m0 in place, it is not needed afterwards)
    lea         r0, [r0 + r4]
    ANG16_18_ROW r0, 4
    ANG16_18_ROW r0 + r1, 3
    ANG16_18_ROW r0 + r2, 2
    palignr     m0, m1, 1
    movu        [r0 + r3], m0
    RET

;------------------------------------------------------------------------------
; Helper for PROC32_8x8: turns two input registers into one packed register.
;   %1 = even register, %2 = odd register
;   %3 = weight row for %1, %4 = weight row for %2 (0 = pass the pixels through)
;   %5 = rounding multiplier operand for pmulhrsw
; A non-zero weight row w means: shuffle with [r3], multiply-add with
; [r4 + w * 16], then pmulhrsw by %5.  The result has the eight bytes of %1 in
; the low half and the eight bytes of %2 in the high half of %1.
;------------------------------------------------------------------------------
%macro PROC32_8x8_PAIR 5
  %if %3 != 0
    pshufb      %1, [r3]
    pmaddubsw   %1, [r4 + %3 * 16]
    pmulhrsw    %1, %5
  %endif
  %if %4 == 0
    pmovzxbw    %2, %2
  %else
    pshufb      %2, [r3]
    pmaddubsw   %2, [r4 + %4 * 16]
    pmulhrsw    %2, %5
  %endif
  %if %3 == 0
    packuswb    %2, %2
    movlhps     %1, %2
  %else
    packuswb    %1, %2
  %endif
%endmacro

; Process Intra32x32, input 8x8 in [m0, m1, m2, m3, m4, m5, m6, m7], output 8x8
;   %1     = 8-byte column group used in the transposed store (offset %1 * 8)
;   %2     = 1: transpose and store through r0 and r6, otherwise store rows at r0
;   %3-%10 = weight row (in units of 16 bytes from r4) for m0..m7, 0 = no filtering
; Expects r1 = stride, r3 = shuffle table, r4 = weight table, r5 = 3 * stride
; and, when %2 == 1, r6 = r0 + 4 * stride.  m1 is reloaded with pw_1024 after
; the first pair.  Clobbers m0-m7; the non-transposed path also advances r0 by
; four rows.
%macro PROC32_8x8 10  ; col4, transpose[0/1] c0, c1, c2, c3, c4, c5, c6, c7
    PROC32_8x8_PAIR m0, m1, %3, %4, [pw_1024]
    mova        m1, [pw_1024]
    PROC32_8x8_PAIR m2, m3, %5, %6, m1
    PROC32_8x8_PAIR m4, m5, %7, %8, m1
    PROC32_8x8_PAIR m6, m7, %9, %10, m1

  %if %2 == 1
    ; transpose
    punpckhbw   m1, m0, m2
    punpcklbw   m0, m2
    punpckhbw   m3, m0, m1
    punpcklbw   m0, m1

    punpckhbw   m1, m4, m6
    punpcklbw   m4, m6
    punpckhbw   m6, m4, m1
    punpcklbw   m4, m1

    punpckhdq   m2, m0, m4
    punpckldq   m0, m4
    punpckldq   m4, m3, m6
    punpckhdq   m3, m6

    movh        [r0 + %1 * 8], m0
    movhps      [r0 + r1 + %1 * 8], m0
    movh        [r0 + r1*2 + %1 * 8], m2
    movhps      [r0 + r5 + %1 * 8], m2
    movh        [r6 + %1 * 8], m4
    movhps      [r6 + r1 + %1 * 8], m4
    movh        [r6 + r1*2 + %1 * 8], m3
    movhps      [r6 + r5 + %1 * 8], m3
  %else
    movh        [r0], m0
    movhps      [r0 + r1], m0
    movh        [r0 + r1 * 2], m2
    movhps      [r0 + r5], m2
    lea         r0, [r0 + r1 * 4]
    movh        [r0], m4
    movhps      [r0 + r1], m4
    movh        [r0 + r1 * 2], m6
    movhps      [r0 + r5], m6
  %endif
%endmacro

;------------------------------------------------------------------------------
; Helpers for MODE_3_33.  Each one produces eight packed results in m4, m5, m6
; and m1 and hands them to TRANSPOSE_STORE_8x8.  The bracketed number after a
; pmaddubsw is the weight row selected by its memory operand (offset + 16).
; m7 is the pmulhrsw multiplier (callers in this file load pw_1024).
; Clobbers m0-m2 and m4-m6.
;------------------------------------------------------------------------------

; %1 = byte offset from r2 of the first source pixel
; %2, %3 = first two arguments forwarded to TRANSPOSE_STORE_8x8
; Weight rows used: 26, 20, 14, 8, 2, 28, 22, 16.
%macro MODE_3_33_EVEN8 3
    movu        m0, [r2 + %1]           ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1, m0, 1               ; [ x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m2, m0, m1              ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m0, m1                  ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    palignr     m1, m2, m0, 2           ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
    pmaddubsw   m4, m0, [r3 + 10 * 16]  ; [26]
    pmulhrsw    m4, m7
    pmaddubsw   m1, [r3 + 4 * 16]       ; [20]
    pmulhrsw    m1, m7
    packuswb    m4, m1
    palignr     m5, m2, m0, 4
    pmaddubsw   m5, [r3 - 2 * 16]       ; [14]
    pmulhrsw    m5, m7
    palignr     m6, m2, m0, 6
    pmaddubsw   m6, [r3 - 8 * 16]       ; [ 8]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    palignr     m1, m2, m0, 8
    pmaddubsw   m6, m1, [r3 - 14 * 16]  ; [ 2]
    pmulhrsw    m6, m7
    pmaddubsw   m1, [r3 + 12 * 16]      ; [28]
    pmulhrsw    m1, m7
    packuswb    m6, m1
    palignr     m1, m2, m0, 10
    pmaddubsw   m1, [r3 + 6 * 16]       ; [22]
    pmulhrsw    m1, m7
    palignr     m2, m0, 12
    pmaddubsw   m2, [r3]                ; [16]
    pmulhrsw    m2, m7
    packuswb    m1, m2
    TRANSPOSE_STORE_8x8 %2, %3, m4, m5, m6, m1
%endmacro

; %1 = byte offset from r2 of the first source pixel
; %2 = byte offset from r2 of the eight pixels copied unfiltered (weight 0)
; %3, %4 = first two arguments forwarded to TRANSPOSE_STORE_8x8
; Weight rows used: 10, 4, 30, 24, 18, 12, 6, 0.
%macro MODE_3_33_ODD8 4
    movu        m0, [r2 + %1]
    palignr     m1, m0, 1
    punpckhbw   m2, m0, m1
    punpcklbw   m0, m1
    palignr     m5, m2, m0, 2
    pmaddubsw   m4, m0, [r3 - 6 * 16]   ; [10]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m5, [r3 - 12 * 16]  ; [04]
    pmulhrsw    m1, m7
    packuswb    m4, m1
    pmaddubsw   m5, [r3 + 14 * 16]      ; [30]
    pmulhrsw    m5, m7
    palignr     m6, m2, m0, 4
    pmaddubsw   m6, [r3 + 8 * 16]       ; [24]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    palignr     m1, m2, m0, 6
    pmaddubsw   m6, m1, [r3 + 2 * 16]   ; [18]
    pmulhrsw    m6, m7
    palignr     m1, m2, m0, 8
    pmaddubsw   m1, [r3 - 4 * 16]       ; [12]
    pmulhrsw    m1, m7
    packuswb    m6, m1
    palignr     m1, m2, m0, 10
    pmaddubsw   m1, [r3 - 10 * 16]      ; [06]
    pmulhrsw    m1, m7
    packuswb    m1, m1
    movhps      m1, [r2 + %2]           ; [00]
    TRANSPOSE_STORE_8x8 %3, %4, m4, m5, m6, m1
%endmacro

; MODE_3_33: 32 results from the pixels at r2, emitted as four groups of eight.
;   %1 = value forwarded as the second argument of TRANSPOSE_STORE_8x8
%macro MODE_3_33 1
    MODE_3_33_EVEN8  1, 0, %1
    MODE_3_33_ODD8   8, 14, 1, %1
    MODE_3_33_EVEN8 14, 2, %1
    MODE_3_33_ODD8  21, 27, 3, %1
%endmacro

; MODE_4_32: same structure as MODE_3_33 with a different weight sequence.
;   %1 = value forwarded as the second argument of TRANSPOSE_STORE_8x8
; Reads pixels at r2, weights relative to r3, multiplier in m7.
; Clobbers m0-m6.
%macro MODE_4_32 1
    ; ---- group 0 ----
    movu        m0, [r2 + 1]
    palignr     m1, m0, 1
    punpckhbw   m2, m0, m1
    punpcklbw   m0, m1
    palignr     m1, m2, m0, 2
    mova        m5, m1
    pmaddubsw   m4, m0, [r3 + 5 * 16]   ; [21]
    pmulhrsw    m4, m7
    pmaddubsw   m1, [r3 - 6 * 16]       ; [10]
    pmulhrsw    m1, m7
    packuswb    m4, m1
    pmaddubsw   m5, [r3 + 15 * 16]      ; [31]
    pmulhrsw    m5, m7
    palignr     m6, m2, m0, 4
    pmaddubsw   m6, [r3 + 4 * 16]       ; [ 20]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    palignr     m1, m2, m0, 6
    pmaddubsw   m6, m1, [r3 - 7 * 16]   ; [ 9]
    pmulhrsw    m6, m7
    pmaddubsw   m1, [r3 + 14 * 16]      ; [30]
    pmulhrsw    m1, m7
    packuswb    m6, m1
    palignr     m1, m2, m0, 8
    pmaddubsw   m1, [r3 + 3 * 16]       ; [19]
    pmulhrsw    m1, m7
    palignr     m2, m0, 10
    pmaddubsw   m3, m2, [r3 - 8 * 16]   ; [8]
    pmulhrsw    m3, m7
    packuswb    m1, m3
    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1

    ; ---- group 1 (its first result still uses m2 from group 0) ----
    pmaddubsw   m4, m2, [r3 + 13 * 16]  ; [29]
    pmulhrsw    m4, m7
    movu        m0, [r2 + 6]
    palignr     m1, m0, 1
    punpckhbw   m2, m0, m1
    punpcklbw   m0, m1
    palignr     m1, m2, m0, 2
    pmaddubsw   m1, [r3 + 2 * 16]       ; [18]
    pmulhrsw    m1, m7
    packuswb    m4, m1
    palignr     m5, m2, m0, 4
    mova        m6, m5
    pmaddubsw   m5, [r3 - 9 * 16]       ; [07]
    pmulhrsw    m5, m7
    pmaddubsw   m6, [r3 + 12 * 16]      ; [28]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    palignr     m6, m2, m0, 6
    pmaddubsw   m6, [r3 + 16]           ; [17]
    pmulhrsw    m6, m7
    palignr     m1, m2, m0, 8
    pmaddubsw   m3, m1, [r3 - 10 * 16]  ; [06]
    pmulhrsw    m3, m7
    packuswb    m6, m3
    pmaddubsw   m1, [r3 + 11 * 16]      ; [27]
    pmulhrsw    m1, m7
    palignr     m2, m0, 10
    pmaddubsw   m2, [r3]                ; [16]
    pmulhrsw    m2, m7
    packuswb    m1, m2
    TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1

    ; ---- group 2 ----
    movu        m0, [r2 + 12]
    palignr     m1, m0, 1
    punpckhbw   m2, m0, m1
    punpcklbw   m0, m1
    mova        m1, m0
    pmaddubsw   m4, m0, [r3 - 11 * 16]  ; [5]
    pmulhrsw    m4, m7
    pmaddubsw   m1, [r3 + 10 * 16]      ; [26]
    pmulhrsw    m1, m7
    packuswb    m4, m1
    palignr     m5, m2, m0, 2
    pmaddubsw   m5, [r3 - 16]           ; [15]
    pmulhrsw    m5, m7
    palignr     m6, m2, m0, 4
    mova        m1, m6
    pmaddubsw   m1, [r3 - 12 * 16]      ; [4]
    pmulhrsw    m1, m7
    packuswb    m5, m1
    pmaddubsw   m6, [r3 + 9 * 16]       ; [25]
    pmulhrsw    m6, m7
    palignr     m1, m2, m0, 6
    pmaddubsw   m1, [r3 - 2 * 16]       ; [14]
    pmulhrsw    m1, m7
    packuswb    m6, m1
    palignr     m1, m2, m0, 8
    mova        m2, m1
    pmaddubsw   m1, [r3 - 13 * 16]      ; [3]
    pmulhrsw    m1, m7
    pmaddubsw   m2, [r3 + 8 * 16]       ; [24]
    pmulhrsw    m2, m7
    packuswb    m1, m2
    TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1

    ; ---- group 3 (last result is weight 0: pixels copied from r2 + 22) ----
    movu        m0, [r2 + 17]
    palignr     m1, m0, 1
    punpckhbw   m2, m0, m1
    punpcklbw   m0, m1
    pmaddubsw   m4, m0, [r3 - 3 * 16]   ; [13]
    pmulhrsw    m4, m7
    palignr     m5, m2, m0, 2
    pmaddubsw   m1, m5, [r3 - 14 * 16]  ; [2]
    pmulhrsw    m1, m7
    packuswb    m4, m1
    pmaddubsw   m5, [r3 + 7 * 16]       ; [23]
    pmulhrsw    m5, m7
    palignr     m6, m2, m0, 4
    pmaddubsw   m6, [r3 - 4 * 16]       ; [12]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    palignr     m6, m2, m0, 6
    mova        m1, m6
    pmaddubsw   m6, [r3 - 15 * 16]      ; [1]
    pmulhrsw    m6, m7
    pmaddubsw   m1, [r3 + 6 * 16]       ; [22]
    pmulhrsw    m1, m7
    packuswb    m6, m1
    palignr     m1, m2, m0, 8
    pmaddubsw   m1, [r3 - 5 * 16]       ; [11]
    pmulhrsw    m1, m7
    packuswb    m1, m1
    movhps      m1, [r2 + 22]           ; [00]
    TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro

; MODE_5_31: same structure as MODE_4_32 with a different weight sequence.
;   %1 = value forwarded as the second argument of TRANSPOSE_STORE_8x8
; Reads pixels at r2, weights relative to r3, multiplier in m7.
%macro MODE_5_31 1
    ; ---- group 0 ----
    movu        m0, [r2 + 1]
    palignr     m1, m0, 1
    punpckhbw   m2, m0, m1
    punpcklbw   m0, m1
    palignr     m1, m2, m0, 2
    mova        m5, m1
    pmaddubsw   m4, m0, [r3 + 16]       ; [17]
    pmulhrsw    m4, m7
    pmaddubsw   m1, [r3 - 14 * 16]      ; [2]
    pmulhrsw    m1, m7
    packuswb    m4, m1
    pmaddubsw   m5, [r3 + 3 * 16]       ; [19]
    pmulhrsw    m5, m7
    palignr     m6, m2, m0, 4
    mova        m1, m6
    pmaddubsw   m6, [r3 - 12 * 16]      ; [4]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    pmaddubsw   m6, m1, [r3 + 5 * 16]   ; [21]
    pmulhrsw    m6, m7
    palignr     m1, m2, m0, 6
    mova        m3, m1
    pmaddubsw   m3, [r3 - 10 * 16]      ; [6]
    pmulhrsw    m3, m7
    packuswb    m6, m3
    pmaddubsw   m1, [r3 + 7 * 16]       ; [23]
    pmulhrsw    m1, m7
    palignr     m2, m0, 8
    pmaddubsw   m2, [r3 - 8 * 16]       ; [8]
    pmulhrsw    m2, m7
    packuswb    m1, m2
    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1

    ; ---- group 1 ----
    movu        m0, [r2 + 5]
    palignr     m1, m0, 1
punpckhbw m2, m0, m1 punpcklbw m0, m1 palignr m1, m2, m0, 2 mova m5, m1 pmaddubsw m4, m0, [r3 + 9 * 16] ; [25] pmulhrsw m4, m7 pmaddubsw m1, [r3 - 6 * 16] ; [10] pmulhrsw m1, m7 packuswb m4, m1 pmaddubsw m5, [r3 + 11 * 16] ; [27] pmulhrsw m5, m7 palignr m6, m2, m0, 4 mova m1, m6 pmaddubsw m6, [r3 - 4 * 16] ; [12] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m1, [r3 + 13 * 16] ; [29] pmulhrsw m6, m7 palignr m1, m2, m0, 6 mova m3, m1 pmaddubsw m3, [r3 - 2 * 16] ; [14] pmulhrsw m3, m7 packuswb m6, m3 pmaddubsw m1, [r3 + 15 * 16] ; [31] pmulhrsw m1, m7 palignr m2, m0, 8 pmaddubsw m2, [r3] ; [16] pmulhrsw m2, m7 packuswb m1, m2 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 movu m0, [r2 + 10] palignr m1, m0, 1 punpckhbw m2, m0, m1 punpcklbw m0, m1 mova m1, m0 pmaddubsw m4, m0, [r3 - 15 * 16] ; [1] pmulhrsw m4, m7 pmaddubsw m1, [r3 + 2 * 16] ; [18] pmulhrsw m1, m7 packuswb m4, m1 palignr m5, m2, m0, 2 mova m1, m5 pmaddubsw m5, [r3 - 13 * 16] ; [3] pmulhrsw m5, m7 pmaddubsw m1, [r3 + 4 * 16] ; [20] pmulhrsw m1, m7 packuswb m5, m1 palignr m1, m2, m0, 4 pmaddubsw m6, m1, [r3 - 11 * 16] ; [5] pmulhrsw m6, m7 pmaddubsw m1, [r3 + 6 * 16] ; [22] pmulhrsw m1, m7 packuswb m6, m1 palignr m2, m0, 6 pmaddubsw m1, m2, [r3 - 9 * 16] ; [7] pmulhrsw m1, m7 pmaddubsw m2, [r3 + 8 * 16] ; [24] pmulhrsw m2, m7 packuswb m1, m2 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 movu m0, [r2 + 14] palignr m1, m0, 1 punpckhbw m2, m0, m1 punpcklbw m0, m1 mova m1, m0 pmaddubsw m4, m0, [r3 - 7 * 16] ; [9] pmulhrsw m4, m7 pmaddubsw m1, [r3 + 10 * 16] ; [26] pmulhrsw m1, m7 packuswb m4, m1 palignr m5, m2, m0, 2 mova m1, m5 pmaddubsw m5, [r3 - 5 * 16] ; [11] pmulhrsw m5, m7 pmaddubsw m1, [r3 + 12 * 16] ; [28] pmulhrsw m1, m7 packuswb m5, m1 palignr m1, m2, m0, 4 pmaddubsw m6, m1, [r3 - 3 * 16] ; [13] pmulhrsw m6, m7 pmaddubsw m1, [r3 + 14 * 16] ; [30] pmulhrsw m1, m7 packuswb m6, m1 palignr m2, m0, 6 pmaddubsw m1, m2, [r3 - 16] ; [15] pmulhrsw m1, m7 packuswb m1, m1 movhps m1, [r2 + 18] ; [00] TRANSPOSE_STORE_8x8 
3, %1, m4, m5, m6, m1 %endmacro %macro MODE_6_30 1 movu m0, [r2 + 1] palignr m1, m0, 1 punpckhbw m2, m0, m1 punpcklbw m0, m1 mova m1, m0 pmaddubsw m4, m0, [r3 - 3 * 16] ; [13] pmulhrsw m4, m7 pmaddubsw m1, [r3 + 10 * 16] ; [26] pmulhrsw m1, m7 packuswb m4, m1 palignr m6, m2, m0, 2 pmaddubsw m5, m6, [r3 - 9 * 16] ; [7] pmulhrsw m5, m7 pmaddubsw m6, [r3 + 4 * 16] ; [20] pmulhrsw m6, m7 packuswb m5, m6 palignr m1, m2, m0, 4 pmaddubsw m6, m1, [r3 - 15 * 16] ; [1] pmulhrsw m6, m7 pmaddubsw m3, m1, [r3 - 2 * 16] ; [14] pmulhrsw m3, m7 packuswb m6, m3 pmaddubsw m1, [r3 + 11 * 16] ; [27] pmulhrsw m1, m7 palignr m2, m0, 6 pmaddubsw m3, m2, [r3 - 8 * 16] ; [8] pmulhrsw m3, m7 packuswb m1, m3 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 pmaddubsw m4, m2, [r3 + 5 * 16] ; [21] pmulhrsw m4, m7 movu m0, [r2 + 5] palignr m1, m0, 1 punpckhbw m2, m0, m1 punpcklbw m0, m1 mova m6, m0 pmaddubsw m1, m6, [r3 - 14 * 16] ; [2] pmulhrsw m1, m7 packuswb m4, m1 pmaddubsw m5, m6, [r3 - 16] ; [15] pmulhrsw m5, m7 pmaddubsw m6, [r3 + 12 * 16] ; [28] pmulhrsw m6, m7 packuswb m5, m6 palignr m3, m2, m0, 2 pmaddubsw m6, m3, [r3 - 7 * 16] ; [9] pmulhrsw m6, m7 pmaddubsw m3, [r3 + 6 * 16] ; [22] pmulhrsw m3, m7 packuswb m6, m3 palignr m2, m0, 4 pmaddubsw m1, m2, [r3 - 13 * 16] ; [3] pmulhrsw m1, m7 pmaddubsw m3, m2, [r3] ; [16] pmulhrsw m3, m7 packuswb m1, m3 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 pmaddubsw m4, m2, [r3 + 13 * 16] ; [29] pmulhrsw m4, m7 movu m0, [r2 + 7] palignr m1, m0, 1 punpckhbw m2, m0, m1 punpcklbw m0, m1 palignr m5, m2, m0, 2 pmaddubsw m1, m5, [r3 - 6 * 16] ; [10] pmulhrsw m1, m7 packuswb m4, m1 pmaddubsw m5, [r3 + 7 * 16] ; [23] pmulhrsw m5, m7 palignr m1, m2, m0, 4 pmaddubsw m6, m1, [r3 - 12 * 16] ; [4] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m1, [r3 + 16] ; [17] pmulhrsw m6, m7 pmaddubsw m1, [r3 + 14 * 16] ; [30] pmulhrsw m1, m7 packuswb m6, m1 palignr m2, m2, m0, 6 pmaddubsw m1, m2, [r3 - 5 * 16] ; [11] pmulhrsw m1, m7 pmaddubsw m2, m2, [r3 + 8 * 16] ; [24] pmulhrsw m2, 
m7 packuswb m1, m2 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 movu m0, [r2 + 11] palignr m1, m0, 1 punpckhbw m2, m0, m1 punpcklbw m0, m1 mova m5, m0 pmaddubsw m4, m0, [r3 - 11 * 16] ; [5] pmulhrsw m4, m7 pmaddubsw m3, m5, [r3 + 2 * 16] ; [18] pmulhrsw m3, m7 packuswb m4, m3 pmaddubsw m5, [r3 + 15 * 16] ; [31] pmulhrsw m5, m7 palignr m6, m2, m0, 2 pmaddubsw m1, m6, [r3 - 4 * 16] ; [12] pmulhrsw m1, m7 packuswb m5, m1 pmaddubsw m6, [r3 + 9 * 16] ; [25] pmulhrsw m6, m7 palignr m1, m2, m0, 4 pmaddubsw m2, m1, [r3 - 10 * 16] ; [6] pmulhrsw m2, m7 packuswb m6, m2 pmaddubsw m1, [r3 + 3 * 16] ; [19] pmulhrsw m1, m7 packuswb m1, m1 movhps m1, [r2 + 14] ; [00] TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 %endmacro %macro MODE_7_29 1 movu m0, [r2 + 1] palignr m1, m0, 1 punpckhbw m2, m0, m1 punpcklbw m0, m1 mova m5, m0 pmaddubsw m4, m0, [r3 - 7 * 16] ; [9] pmulhrsw m4, m7 pmaddubsw m3, m5, [r3 + 2 * 16] ; [18] pmulhrsw m3, m7 packuswb m4, m3 pmaddubsw m5, [r3 + 11 * 16] ; [27] pmulhrsw m5, m7 palignr m1, m2, m0, 2 palignr m2, m0, 4 pmaddubsw m6, m1, [r3 - 12 * 16] ; [4] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m1, [r3 - 3 * 16] ; [13] pmulhrsw m6, m7 pmaddubsw m0, m1, [r3 + 6 * 16] ; [22] pmulhrsw m0, m7 packuswb m6, m0 pmaddubsw m1, [r3 + 15 * 16] ; [31] pmulhrsw m1, m7 pmaddubsw m0, m2, [r3 - 8 * 16] ; [8] pmulhrsw m0, m7 packuswb m1, m0 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 pmaddubsw m4, m2, [r3 + 16] ; [17] pmulhrsw m4, m7 pmaddubsw m2, [r3 + 10 * 16] ; [26] pmulhrsw m2, m7 packuswb m4, m2 movu m0, [r2 + 4] palignr m1, m0, 1 punpckhbw m2, m0, m1 punpcklbw m0, m1 palignr m2, m0, 2 pmaddubsw m5, m0, [r3 - 13 * 16] ; [03] pmulhrsw m5, m7 pmaddubsw m6, m0, [r3 - 4 * 16] ; [12] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m0, [r3 + 5 * 16] ; [21] pmulhrsw m6, m7 pmaddubsw m0, [r3 + 14 * 16] ; [30] pmulhrsw m0, m7 packuswb m6, m0 pmaddubsw m1, m2, [r3 - 9 * 16] ; [07] pmulhrsw m1, m7 pmaddubsw m3, m2, [r3] ; [16] pmulhrsw m3, m7 packuswb m1, m3 TRANSPOSE_STORE_8x8 1, %1, 
m4, m5, m6, m1 pmaddubsw m4, m2, [r3 + 9 * 16] ; [25] pmulhrsw m4, m7 movu m0, [r2 + 6] palignr m1, m0, 1 punpckhbw m2, m0, m1 punpcklbw m0, m1 palignr m2, m0, 2 pmaddubsw m1, m0, [r3 - 14 * 16] ; [2] pmulhrsw m1, m7 packuswb m4, m1 pmaddubsw m5, m0, [r3 - 5 * 16] ; [11] pmulhrsw m5, m7 pmaddubsw m6, m0, [r3 + 4 * 16] ; [20] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m0, [r3 + 13 * 16] ; [29] pmulhrsw m6, m7 pmaddubsw m1, m2, [r3 - 10 * 16] ; [6] pmulhrsw m1, m7 packuswb m6, m1 pmaddubsw m1, m2, [r3 - 16] ; [15] pmulhrsw m1, m7 pmaddubsw m2, m2, [r3 + 8 * 16] ; [24] pmulhrsw m2, m7 packuswb m1, m2 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 movu m0, [r2 + 8] palignr m1, m0, 1 punpckhbw m2, m0, m1 punpcklbw m0, m1 pmaddubsw m4, m0, [r3 - 15 * 16] ; [1] pmulhrsw m4, m7 pmaddubsw m3, m0, [r3 - 6 * 16] ; [10] pmulhrsw m3, m7 packuswb m4, m3 pmaddubsw m5, m0, [r3 + 3 * 16] ; [19] pmulhrsw m5, m7 pmaddubsw m6, m0, [r3 + 12 * 16] ; [28] pmulhrsw m6, m7 packuswb m5, m6 palignr m2, m0, 2 pmaddubsw m6, m2, [r3 - 11 * 16] ; [5] pmulhrsw m6, m7 pmaddubsw m0, m2, [r3 - 2 * 16] ; [14] pmulhrsw m0, m7 packuswb m6, m0 pmaddubsw m1, m2, [r3 + 7 * 16] ; [23] pmulhrsw m1, m7 packuswb m1, m1 movhps m1, [r2 + 10] ; [0] TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 %endmacro %macro MODE_8_28 1 movu m0, [r2 + 1] palignr m1, m0, 1 punpckhbw m2, m0, m1 punpcklbw m0, m1 palignr m2, m0, 2 pmaddubsw m4, m0, [r3 - 11 * 16] ; [5] pmulhrsw m4, m7 pmaddubsw m3, m0, [r3 - 6 * 16] ; [10] pmulhrsw m3, m7 packuswb m4, m3 pmaddubsw m5, m0, [r3 - 1 * 16] ; [15] pmulhrsw m5, m7 pmaddubsw m6, m0, [r3 + 4 * 16] ; [20] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m0, [r3 + 9 * 16] ; [25] pmulhrsw m6, m7 pmaddubsw m0, [r3 + 14 * 16] ; [30] pmulhrsw m0, m7 packuswb m6, m0 pmaddubsw m1, m2, [r3 - 13 * 16] ; [3] pmulhrsw m1, m7 pmaddubsw m0, m2, [r3 - 8 * 16] ; [8] pmulhrsw m0, m7 packuswb m1, m0 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 pmaddubsw m4, m2, [r3 - 3 * 16] ; [13] pmulhrsw m4, m7 pmaddubsw m5, m2, 
[r3 + 2 * 16] ; [18] pmulhrsw m5, m7 packuswb m4, m5 pmaddubsw m5, m2, [r3 + 7 * 16] ; [23] pmulhrsw m5, m7 pmaddubsw m2, [r3 + 12 * 16] ; [28] pmulhrsw m2, m7 packuswb m5, m2 movu m0, [r2 + 3] palignr m1, m0, 1 punpckhbw m2, m0, m1 punpcklbw m0, m1 pmaddubsw m6, m0, [r3 - 15 * 16] ; [01] pmulhrsw m6, m7 pmaddubsw m1, m0, [r3 - 10 * 16] ; [06] pmulhrsw m1, m7 packuswb m6, m1 pmaddubsw m1, m0, [r3 - 5 * 16] ; [11] pmulhrsw m1, m7 mova m2, m0 pmaddubsw m0, [r3] ; [16] pmulhrsw m0, m7 packuswb m1, m0 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 pmaddubsw m4, m2, [r3 + 5 * 16] ; [21] pmulhrsw m4, m7 pmaddubsw m5, m2, [r3 + 10 * 16] ; [26] pmulhrsw m5, m7 packuswb m4, m5 pmaddubsw m5, m2, [r3 + 15 * 16] ; [31] pmulhrsw m5, m7 movu m0, [r2 + 4] palignr m1, m0, 1 punpckhbw m2, m0, m1 punpcklbw m0, m1 pmaddubsw m2, m0, [r3 - 12 * 16] ; [4] pmulhrsw m2, m7 packuswb m5, m2 pmaddubsw m6, m0, [r3 - 7 * 16] ; [9] pmulhrsw m6, m7 pmaddubsw m1, m0, [r3 - 2 * 16] ; [14] pmulhrsw m1, m7 packuswb m6, m1 pmaddubsw m1, m0, [r3 + 3 * 16] ; [19] pmulhrsw m1, m7 mova m2, m0 pmaddubsw m0, [r3 + 8 * 16] ; [24] pmulhrsw m0, m7 packuswb m1, m0 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 pmaddubsw m4, m2, [r3 + 13 * 16] ; [29] pmulhrsw m4, m7 movu m0, [r2 + 5] palignr m1, m0, 1 punpckhbw m2, m0, m1 punpcklbw m0, m1 pmaddubsw m1, m0, [r3 - 14 * 16] ; [2] pmulhrsw m1, m7 packuswb m4, m1 pmaddubsw m5, m0, [r3 - 9 * 16] ; [7] pmulhrsw m5, m7 pmaddubsw m6, m0, [r3 - 4 * 16] ; [12] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m0, [r3 + 16] ; [17] pmulhrsw m6, m7 pmaddubsw m1, m0, [r3 + 6 * 16] ; [22] pmulhrsw m1, m7 packuswb m6, m1 pmaddubsw m1, m0, [r3 + 11 * 16] ; [27] pmulhrsw m1, m7 packuswb m1, m1 movhps m1, [r2 + 6] ; [00] TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 %endmacro %macro MODE_9_27 1 movu m2, [r2 + 1] palignr m1, m2, 1 punpckhbw m0, m2, m1 punpcklbw m2, m1 pmaddubsw m4, m2, [r3 - 14 * 16] ; [2] pmulhrsw m4, m7 pmaddubsw m3, m2, [r3 - 12 * 16] ; [4] pmulhrsw m3, m7 packuswb m4, m3 
pmaddubsw m5, m2, [r3 - 10 * 16] ; [6] pmulhrsw m5, m7 pmaddubsw m6, m2, [r3 - 8 * 16] ; [8] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m2, [r3 - 6 * 16] ; [10] pmulhrsw m6, m7 pmaddubsw m3, m2, [r3 - 4 * 16] ; [12] pmulhrsw m3, m7 packuswb m6, m3 pmaddubsw m1, m2, [r3 - 2 * 16] ; [14] pmulhrsw m1, m7 pmaddubsw m0, m2, [r3] ; [16] pmulhrsw m0, m7 packuswb m1, m0 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 pmaddubsw m4, m2, [r3 + 2 * 16] ; [18] pmulhrsw m4, m7 pmaddubsw m5, m2, [r3 + 4 * 16] ; [20] pmulhrsw m5, m7 packuswb m4, m5 pmaddubsw m5, m2, [r3 + 6 * 16] ; [22] pmulhrsw m5, m7 pmaddubsw m6, m2, [r3 + 8 * 16] ; [24] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m2, [r3 + 10 * 16] ; [26] pmulhrsw m6, m7 pmaddubsw m1, m2, [r3 + 12 * 16] ; [28] pmulhrsw m1, m7 packuswb m6, m1 pmaddubsw m1, m2, [r3 + 14 * 16] ; [30] pmulhrsw m1, m7 packuswb m1, m1 movhps m1, [r2 + 2] ; [00] TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 movu m2, [r2 + 2] palignr m1, m2, 1 punpcklbw m2, m1 pmaddubsw m4, m2, [r3 - 14 * 16] ; [2] pmulhrsw m4, m7 pmaddubsw m3, m2, [r3 - 12 * 16] ; [4] pmulhrsw m3, m7 packuswb m4, m3 pmaddubsw m5, m2, [r3 - 10 * 16] ; [6] pmulhrsw m5, m7 pmaddubsw m6, m2, [r3 - 8 * 16] ; [8] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m2, [r3 - 6 * 16] ; [10] pmulhrsw m6, m7 pmaddubsw m0, m2, [r3 - 4 * 16] ; [12] pmulhrsw m0, m7 packuswb m6, m0 pmaddubsw m1, m2, [r3 - 2 * 16] ; [14] pmulhrsw m1, m7 pmaddubsw m0, m2, [r3] ; [16] pmulhrsw m0, m7 packuswb m1, m0 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 movu m2, [r2 + 2] palignr m1, m2, 1 punpcklbw m2, m1 pmaddubsw m4, m2, [r3 + 2 * 16] ; [18] pmulhrsw m4, m7 pmaddubsw m5, m2, [r3 + 4 * 16] ; [20] pmulhrsw m5, m7 packuswb m4, m5 pmaddubsw m5, m2, [r3 + 6 * 16] ; [22] pmulhrsw m5, m7 pmaddubsw m6, m2, [r3 + 8 * 16] ; [24] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m2, [r3 + 10 * 16] ; [26] pmulhrsw m6, m7 pmaddubsw m1, m2, [r3 + 12 * 16] ; [28] pmulhrsw m1, m7 packuswb m6, m1 pmaddubsw m1, m2, [r3 + 14 * 16] ; [30] 
pmulhrsw m1, m7 packuswb m1, m1 movhps m1, [r2 + 3] ; [00] TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 %endmacro %macro MODE_12_24 1 movu m2, [r2] palignr m1, m2, 1 punpckhbw m0, m2, m1 punpcklbw m2, m1 palignr m0, m2, 2 pmaddubsw m4, m0, [r4 + 11 * 16] ; [27] pmulhrsw m4, m7 pmaddubsw m3, m0, [r4 + 6 * 16] ; [22] pmulhrsw m3, m7 packuswb m4, m3 pmaddubsw m5, m0, [r4 + 16] ; [17] pmulhrsw m5, m7 pmaddubsw m6, m0, [r4 - 4 * 16] ; [12] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m0, [r4 - 9 * 16] ; [7] pmulhrsw m6, m7 pmaddubsw m3, m0, [r4 - 14 * 16] ; [2] pmulhrsw m3, m7 packuswb m6, m3 pmaddubsw m1, m2, [r4 + 13 * 16] ; [29] pmulhrsw m1, m7 pmaddubsw m3, m2, [r4 + 8 * 16] ; [24] pmulhrsw m3, m7 packuswb m1, m3 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 pmaddubsw m4, m2, [r4 + 3 * 16] ; [19] pmulhrsw m4, m7 pmaddubsw m5, m2, [r4 - 2 * 16] ; [14] pmulhrsw m5, m7 packuswb m4, m5 pmaddubsw m5, m2, [r4 - 7 * 16] ; [09] pmulhrsw m5, m7 pmaddubsw m6, m2, [r4 - 12 * 16] ; [04] pmulhrsw m6, m7 packuswb m5, m6 movu m0, [r2 - 2] palignr m1, m0, 1 punpckhbw m2, m0, m1 punpcklbw m0, m1 palignr m2, m0, 2 pmaddubsw m6, m2, [r4 + 15 * 16] ; [31] pmulhrsw m6, m7 pmaddubsw m1, m2, [r4 + 10 * 16] ; [26] pmulhrsw m1, m7 packuswb m6, m1 pmaddubsw m1, m2, [r4 + 5 * 16] ; [21] pmulhrsw m1, m7 pmaddubsw m3, m2, [r4] ; [16] pmulhrsw m3, m7 packuswb m1, m3 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 pmaddubsw m4, m2, [r4 - 5 * 16] ; [11] pmulhrsw m4, m7 pmaddubsw m3, m2, [r4 - 10 * 16] ; [06] pmulhrsw m3, m7 packuswb m4, m3 pmaddubsw m5, m2, [r4 - 15 * 16] ; [1] pmulhrsw m5, m7 movu m0, [r2 - 3] palignr m1, m0, 1 punpckhbw m2, m0, m1 punpcklbw m0, m1 palignr m2, m0, 2 pmaddubsw m6, m2, [r4 + 12 * 16] ; [28] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m2, [r4 + 7 * 16] ; [23] pmulhrsw m6, m7 pmaddubsw m3, m2, [r4 + 2 * 16] ; [18] pmulhrsw m3, m7 packuswb m6, m3 pmaddubsw m1, m2, [r4 - 3 * 16] ; [13] pmulhrsw m1, m7 pmaddubsw m3, m2, [r4 - 8 * 16] ; [8] pmulhrsw m3, m7 packuswb m1, m3 
TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 pmaddubsw m4, m2, [r4 - 13 * 16] ; [3] pmulhrsw m4, m7 movu m2, [r2 - 4] palignr m1, m2, 1 punpckhbw m0, m2, m1 punpcklbw m2, m1 palignr m0, m2, 2 pmaddubsw m5, m0, [r4 + 14 * 16] ; [30] pmulhrsw m5, m7 packuswb m4, m5 pmaddubsw m5, m0, [r4 + 9 * 16] ; [25] pmulhrsw m5, m7 pmaddubsw m6, m0, [r4 + 4 * 16] ; [20] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m0, [r4 - 16] ; [15] pmulhrsw m6, m7 pmaddubsw m1, m0, [r4 - 6 * 16] ; [10] pmulhrsw m1, m7 packuswb m6, m1 pmaddubsw m1, m0, [r4 - 11 * 16] ; [05] pmulhrsw m1, m7 movu m2, [pb_fact0] pshufb m0, m2 pmovzxbw m0, m0 packuswb m1, m0 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 %endmacro ;------------------------------------------------------------------------------------------ ; void intraPredAng32(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter) ;------------------------------------------------------------------------------------------ INIT_XMM ssse3 cglobal intra_pred_ang32_2, 3,5,4 lea r4, [r2] add r2, 64 cmp r3m, byte 34 cmove r2, r4 movu m0, [r2 + 2] movu m1, [r2 + 18] movu m3, [r2 + 34] lea r3, [r1 * 3] movu [r0], m0 movu [r0 + 16], m1 palignr m2, m1, m0, 1 movu [r0 + r1], m2 palignr m2, m3, m1, 1 movu [r0 + r1 + 16], m2 palignr m2, m1, m0, 2 movu [r0 + r1 * 2], m2 palignr m2, m3, m1, 2 movu [r0 + r1 * 2 + 16], m2 palignr m2, m1, m0, 3 movu [r0 + r3], m2 palignr m2, m3, m1, 3 movu [r0 + r3 + 16], m2 lea r0, [r0 + r1 * 4] palignr m2, m1, m0, 4 movu [r0], m2 palignr m2, m3, m1, 4 movu [r0 + 16], m2 palignr m2, m1, m0, 5 movu [r0 + r1], m2 palignr m2, m3, m1, 5 movu [r0 + r1 + 16], m2 palignr m2, m1, m0, 6 movu [r0 + r1 * 2], m2 palignr m2, m3, m1, 6 movu [r0 + r1 * 2 + 16], m2 palignr m2, m1, m0, 7 movu [r0 + r3], m2 palignr m2, m3, m1, 7 movu [r0 + r3 + 16], m2 lea r0, [r0 + r1 * 4] palignr m2, m1, m0, 8 movu [r0], m2 palignr m2, m3, m1, 8 movu [r0 + 16], m2 palignr m2, m1, m0, 9 movu [r0 + r1], m2 palignr m2, m3, m1, 9 movu [r0 + r1 + 16], m2 palignr 
m2, m1, m0, 10 movu [r0 + r1 * 2], m2 palignr m2, m3, m1, 10 movu [r0 + r1 * 2 + 16], m2 palignr m2, m1, m0, 11 movu [r0 + r3], m2 palignr m2, m3, m1, 11 movu [r0 + r3 + 16], m2 lea r0, [r0 + r1 * 4] palignr m2, m1, m0, 12 movu [r0], m2 palignr m2, m3, m1, 12 movu [r0 + 16], m2 palignr m2, m1, m0, 13 movu [r0 + r1], m2 palignr m2, m3, m1, 13 movu [r0 + r1 + 16], m2 palignr m2, m1, m0, 14 movu [r0 + r1 * 2], m2 palignr m2, m3, m1, 14 movu [r0 + r1 * 2 + 16], m2 palignr m2, m1, m0, 15 movu [r0 + r3], m2 palignr m2, m3, m1, 15 movu [r0 + r3 + 16], m2 lea r0, [r0 + r1 * 4] movu [r0], m1 movu m0, [r2 + 50] movu [r0 + 16], m3 palignr m2, m3, m1, 1 movu [r0 + r1], m2 palignr m2, m0, m3, 1 movu [r0 + r1 + 16], m2 palignr m2, m3, m1, 2 movu [r0 + r1 * 2], m2 palignr m2, m0, m3, 2 movu [r0 + r1 * 2 + 16], m2 palignr m2, m3, m1, 3 movu [r0 + r3], m2 palignr m2, m0, m3, 3 movu [r0 + r3 + 16], m2 lea r0, [r0 + r1 * 4] palignr m2, m3, m1, 4 movu [r0], m2 palignr m2, m0, m3, 4 movu [r0 + 16], m2 palignr m2, m3, m1, 5 movu [r0 + r1], m2 palignr m2, m0, m3, 5 movu [r0 + r1 + 16], m2 palignr m2, m3, m1, 6 movu [r0 + r1 * 2], m2 palignr m2, m0, m3, 6 movu [r0 + r1 * 2 + 16], m2 palignr m2, m3, m1, 7 movu [r0 + r3], m2 palignr m2, m0, m3, 7 movu [r0 + r3 + 16], m2 lea r0, [r0 + r1 * 4] palignr m2, m3, m1, 8 movu [r0], m2 palignr m2, m0, m3, 8 movu [r0 + 16], m2 palignr m2, m3, m1, 9 movu [r0 + r1], m2 palignr m2, m0, m3, 9 movu [r0 + r1 + 16], m2 palignr m2, m3, m1, 10 movu [r0 + r1 * 2], m2 palignr m2, m0, m3, 10 movu [r0 + r1 * 2 + 16], m2 palignr m2, m3, m1, 11 movu [r0 + r3], m2 palignr m2, m0, m3, 11 movu [r0 + r3 + 16], m2 lea r0, [r0 + r1 * 4] palignr m2, m3, m1, 12 movu [r0], m2 palignr m2, m0, m3, 12 movu [r0 + 16], m2 palignr m2, m3, m1, 13 movu [r0 + r1], m2 palignr m2, m0, m3, 13 movu [r0 + r1 + 16], m2 palignr m2, m3, m1, 14 movu [r0 + r1 * 2], m2 palignr m2, m0, m3, 14 movu [r0 + r1 * 2 + 16], m2 palignr m2, m3, m1, 15 movu [r0 + r3], m2 palignr m2, m0, m3, 15 movu [r0 + 
r3 + 16], m2 RET INIT_XMM sse4 cglobal intra_pred_ang32_3, 3,7,8 add r2, 64 lea r3, [ang_table + 16 * 16] mov r4d, 4 lea r5, [r1 * 3] ; r5 -> 3 * stride lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride mova m7, [pw_1024] .loop: MODE_3_33 1 lea r0, [r6 + r1 * 4] lea r6, [r6 + r1 * 8] add r2, 8 dec r4 jnz .loop RET INIT_XMM sse4 cglobal intra_pred_ang32_4, 3,7,8 add r2, 64 lea r3, [ang_table + 16 * 16] mov r4d, 4 lea r5, [r1 * 3] ; r5 -> 3 * stride lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride mova m7, [pw_1024] .loop: MODE_4_32 1 lea r0, [r6 + r1 * 4] lea r6, [r6 + r1 * 8] add r2, 8 dec r4 jnz .loop RET INIT_XMM sse4 cglobal intra_pred_ang32_5, 3,7,8 add r2, 64 lea r3, [ang_table + 16 * 16] mov r4d, 4 lea r5, [r1 * 3] ; r5 -> 3 * stride lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride mova m7, [pw_1024] .loop: MODE_5_31 1 lea r0, [r6 + r1 * 4] lea r6, [r6 + r1 * 8] add r2, 8 dec r4 jnz .loop RET INIT_XMM sse4 cglobal intra_pred_ang32_6, 3,7,8 add r2, 64 lea r3, [ang_table + 16 * 16] mov r4d, 4 lea r5, [r1 * 3] ; r5 -> 3 * stride lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride mova m7, [pw_1024] .loop: MODE_6_30 1 lea r0, [r6 + r1 * 4] lea r6, [r6 + r1 * 8] add r2, 8 dec r4 jnz .loop RET INIT_XMM sse4 cglobal intra_pred_ang32_7, 3,7,8 add r2, 64 lea r3, [ang_table + 16 * 16] mov r4d, 4 lea r5, [r1 * 3] ; r5 -> 3 * stride lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride mova m7, [pw_1024] .loop: MODE_7_29 1 lea r0, [r6 + r1 * 4] lea r6, [r6 + r1 * 8] add r2, 8 dec r4 jnz .loop RET INIT_XMM sse4 cglobal intra_pred_ang32_8, 3,7,8 add r2, 64 lea r3, [ang_table + 16 * 16] mov r4d, 4 lea r5, [r1 * 3] ; r5 -> 3 * stride lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride mova m7, [pw_1024] .loop: MODE_8_28 1 lea r0, [r6 + r1 * 4] lea r6, [r6 + r1 * 8] add r2, 8 dec r4 jnz .loop RET INIT_XMM sse4 cglobal intra_pred_ang32_9, 3,7,8 add r2, 64 lea r3, [ang_table + 16 * 16] mov r4d, 4 lea r5, [r1 * 3] ; r5 -> 3 * stride lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride mova m7, [pw_1024] .loop: MODE_9_27 1 lea r0, [r6 + r1 * 4] lea 
r6, [r6 + r1 * 8] add r2, 8 dec r4 jnz .loop RET INIT_XMM sse4 cglobal intra_pred_ang32_10, 5,7,8,0-(2*mmsize) %define m8 [rsp + 0 * mmsize] %define m9 [rsp + 1 * mmsize] pxor m7, m7 mov r6, 2 movu m0, [r2] movu m1, [r2 + 1] mova m8, m0 mova m9, m1 mov r3d, r4d lea r4, [r1 * 3] .loop: movu m0, [r2 + 1 + 64] palignr m1, m0, 1 pshufb m1, m7 palignr m2, m0, 2 pshufb m2, m7 palignr m3, m0, 3 pshufb m3, m7 palignr m4, m0, 4 pshufb m4, m7 palignr m5, m0, 5 pshufb m5, m7 palignr m6, m0, 6 pshufb m6, m7 movu [r0 + r1], m1 movu [r0 + r1 + 16], m1 movu [r0 + r1 * 2], m2 movu [r0 + r1 * 2 + 16], m2 movu [r0 + r4], m3 movu [r0 + r4 + 16], m3 lea r5, [r0 + r1 * 4] movu [r5], m4 movu [r5 + 16], m4 movu [r5 + r1], m5 movu [r5 + r1 + 16], m5 movu [r5 + r1 * 2], m6 movu [r5 + r1 * 2 + 16], m6 palignr m1, m0, 7 pshufb m1, m7 movhlps m2, m0 pshufb m2, m7 palignr m3, m0, 9 pshufb m3, m7 palignr m4, m0, 10 pshufb m4, m7 palignr m5, m0, 11 pshufb m5, m7 palignr m6, m0, 12 pshufb m6, m7 movu [r5 + r4], m1 movu [r5 + r4 + 16], m1 lea r5, [r5 + r1 * 4] movu [r5], m2 movu [r5 + 16], m2 movu [r5 + r1], m3 movu [r5 + r1 + 16], m3 movu [r5 + r1 * 2], m4 movu [r5 + r1 * 2 + 16], m4 movu [r5 + r4], m5 movu [r5 + r4 + 16], m5 lea r5, [r5 + r1 * 4] movu [r5], m6 movu [r5 + 16], m6 palignr m1, m0, 13 pshufb m1, m7 palignr m2, m0, 14 pshufb m2, m7 palignr m3, m0, 15 pshufb m3, m7 pshufb m0, m7 movu [r5 + r1], m1 movu [r5 + r1 + 16], m1 movu [r5 + r1 * 2], m2 movu [r5 + r1 * 2 + 16], m2 movu [r5 + r4], m3 movu [r5 + r4 + 16], m3 ; filter cmp r3d, byte 0 jz .quit movhlps m1, m0 pmovzxbw m0, m0 mova m1, m0 movu m2, m8 movu m3, m9 pshufb m2, m7 pmovzxbw m2, m2 movhlps m4, m3 pmovzxbw m3, m3 pmovzxbw m4, m4 psubw m3, m2 psubw m4, m2 psraw m3, 1 psraw m4, 1 paddw m0, m3 paddw m1, m4 packuswb m0, m1 .quit: movu [r0], m0 movu [r0 + 16], m0 dec r6 lea r0, [r5 + r1 * 4] lea r2, [r2 + 16] jnz .loop RET INIT_XMM sse4 cglobal intra_pred_ang32_11, 4,7,8 ; NOTE: alignment stack to 64 bytes, so all of local data in 
same cache line mov r6, rsp sub rsp, 64+gprsize and rsp, ~63 mov [rsp+64], r6 ; collect reference pixel movu m0, [r2 + 16] pxor m1, m1 pshufb m0, m1 ; [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] mova [rsp], m0 movu m0, [r2 + 64] pinsrb m0, [r2], 0 movu m1, [r2 + 16 + 64] movu m2, [r2 + 32 + 64] movu [rsp + 1], m0 movu [rsp + 1 + 16], m1 movu [rsp + 1 + 32], m2 mov [rsp + 63], byte 4 ; filter lea r2, [rsp + 1] ; r2 -> [0] lea r3, [c_shuf8_0] ; r3 -> shuffle8 lea r4, [ang_table] ; r4 -> ang_table lea r5, [r1 * 3] ; r5 -> 3 * stride lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride mova m5, [pw_1024] ; m5 -> 1024 mova m6, [c_deinterval8] ; m6 -> c_deinterval8 .loop: ; Row[0 - 7] movu m7, [r2] mova m0, m7 mova m1, m7 mova m2, m7 mova m3, m7 mova m4, m7 mova m5, m7 mova m6, m7 PROC32_8x8 0, 1, 30,28,26,24,22,20,18,16 ; Row[8 - 15] movu m7, [r2] mova m0, m7 mova m1, m7 mova m2, m7 mova m3, m7 mova m4, m7 mova m5, m7 mova m6, m7 PROC32_8x8 1, 1, 14,12,10,8,6,4,2,0 ; Row[16 - 23] movu m7, [r2 - 1] mova m0, m7 mova m1, m7 mova m2, m7 mova m3, m7 mova m4, m7 mova m5, m7 mova m6, m7 PROC32_8x8 2, 1, 30,28,26,24,22,20,18,16 ; Row[24 - 31] movu m7, [r2 - 1] mova m0, m7 mova m1, m7 mova m2, m7 mova m3, m7 mova m4, m7 mova m5, m7 mova m6, m7 PROC32_8x8 3, 1, 14,12,10,8,6,4,2,0 lea r0, [r6 + r1 * 4] lea r6, [r6 + r1 * 8] add r2, 8 dec byte [rsp + 63] jnz .loop mov rsp, [rsp+64] RET %macro MODE_12_24_ROW0 1 movu m0, [r3 + 6] pshufb m0, [c_mode32_12_0] pinsrb m0, [r3 + 26], 12 mova above, m0 movu m2, [r2] %if %1 == 1 pinsrb m2, [r3], 0 %endif palignr m1, m2, 1 punpcklbw m2, m1 pmaddubsw m4, m2, [r4 + 11 * 16] ; [27] pmulhrsw m4, m7 pmaddubsw m3, m2, [r4 + 6 * 16] ; [22] pmulhrsw m3, m7 packuswb m4, m3 pmaddubsw m5, m2, [r4 + 16] ; [17] pmulhrsw m5, m7 pmaddubsw m6, m2, [r4 - 4 * 16] ; [12] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m2, [r4 - 9 * 16] ; [7] pmulhrsw m6, m7 pmaddubsw m3, m2, [r4 - 14 * 16] ; [2] pmulhrsw m3, m7 packuswb m6, m3 movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] 
%if %1 == 1 pinsrb m1, [r3], 0 %endif palignr m2, m1, above, 15 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a] punpcklbw m2, m1 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a] pmaddubsw m1, m2, [r4 + 13 * 16] ; [29] pmulhrsw m1, m7 pmaddubsw m3, m2, [r4 + 8 * 16] ; [24] pmulhrsw m3, m7 packuswb m1, m3 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 pmaddubsw m4, m2, [r4 + 3 * 16] ; [19] pmulhrsw m4, m7 pmaddubsw m5, m2, [r4 - 2 * 16] ; [14] pmulhrsw m5, m7 packuswb m4, m5 pmaddubsw m5, m2, [r4 - 7 * 16] ; [09] pmulhrsw m5, m7 pmaddubsw m6, m2, [r4 - 12 * 16] ; [04] pmulhrsw m6, m7 packuswb m5, m6 palignr m2, above, 14 ;[6 5 5 4 4 3 3 2 2 1 1 0 0 a a b] pmaddubsw m6, m2, [r4 + 15 * 16] ; [31] pmulhrsw m6, m7 pmaddubsw m1, m2, [r4 + 10 * 16] ; [26] pmulhrsw m1, m7 packuswb m6, m1 pmaddubsw m1, m2, [r4 + 5 * 16] ; [21] pmulhrsw m1, m7 pmaddubsw m3, m2, [r4] ; [16] pmulhrsw m3, m7 packuswb m1, m3 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 pmaddubsw m4, m2, [r4 - 5 * 16] ; [11] pmulhrsw m4, m7 pmaddubsw m3, m2, [r4 - 10 * 16] ; [06] pmulhrsw m3, m7 packuswb m4, m3 pmaddubsw m5, m2, [r4 - 15 * 16] ; [1] pmulhrsw m5, m7 pslldq m1, above, 1 palignr m2, m1, 14 pmaddubsw m6, m2, [r4 + 12 * 16] ; [28] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m2, [r4 + 7 * 16] ; [23] pmulhrsw m6, m7 pmaddubsw m3, m2, [r4 + 2 * 16] ; [18] pmulhrsw m3, m7 packuswb m6, m3 pmaddubsw m1, m2, [r4 - 3 * 16] ; [13] pmulhrsw m1, m7 pmaddubsw m3, m2, [r4 - 8 * 16] ; [8] pmulhrsw m3, m7 packuswb m1, m3 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 pmaddubsw m4, m2, [r4 - 13 * 16] ; [3] pmulhrsw m4, m7 pslldq m1, above, 2 palignr m2, m1, 14 pmaddubsw m5, m2, [r4 + 14 * 16] ; [30] pmulhrsw m5, m7 packuswb m4, m5 pmaddubsw m5, m2, [r4 + 9 * 16] ; [25] pmulhrsw m5, m7 pmaddubsw m6, m2, [r4 + 4 * 16] ; [20] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m2, [r4 - 16] ; [15] pmulhrsw m6, m7 pmaddubsw m1, m2, [r4 - 6 * 16] ; [10] pmulhrsw m1, m7 packuswb m6, m1 pmaddubsw m1, m2, [r4 - 11 * 16] ; [05] pmulhrsw m1, m7 movu m0, [pb_fact0] 
pshufb m2, m0 pmovzxbw m2, m2 packuswb m1, m2 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 %endmacro INIT_XMM sse4 cglobal intra_pred_ang32_12, 3,7,8,0-(1*mmsize) %define above [rsp + 0 * mmsize] mov r3, r2 add r2, 64 lea r4, [ang_table + 16 * 16] lea r5, [r1 * 3] ; r5 -> 3 * stride lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride mova m7, [pw_1024] MODE_12_24_ROW0 1 lea r0, [r6 + r1 * 4] lea r6, [r6 + r1 * 8] add r2, 7 mov r3, 3 .loop: MODE_12_24 1 lea r0, [r6 + r1 * 4] lea r6, [r6 + r1 * 8] add r2, 8 dec r3 jnz .loop RET %macro MODE_13_23_ROW0 1 movu m0, [r3 + 1] movu m1, [r3 + 15] pshufb m0, [c_mode32_13_0] pshufb m1, [c_mode32_13_0] punpckldq m0, m1 pshufb m0, [c_mode32_13_shuf] mova above, m0 movu m2, [r2] %if (%1 == 1) pinsrb m2, [r3], 0 %endif palignr m1, m2, 1 punpcklbw m2, m1 pmaddubsw m4, m2, [r4 + 7 * 16] ; [23] pmulhrsw m4, m7 pmaddubsw m3, m2, [r4 - 2 * 16] ; [14] pmulhrsw m3, m7 packuswb m4, m3 pmaddubsw m5, m2, [r4 - 11 * 16] ; [5] pmulhrsw m5, m7 movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] %if (%1 == 1) pinsrb m1, [r3], 0 %endif palignr m2, m1, above, 15 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a] punpcklbw m2, m1 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] pmaddubsw m6, m2, [r4 + 12 * 16] ; [28] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m2, [r4 + 3 * 16] ; [19] pmulhrsw m6, m7 pmaddubsw m0, m2, [r4 - 6 * 16] ; [10] pmulhrsw m0, m7 packuswb m6, m0 pmaddubsw m1, m2, [r4 - 15 * 16] ; [1] pmulhrsw m1, m7 palignr m2, above, 14 pmaddubsw m3, m2, [r4 + 8 * 16] ; [24] pmulhrsw m3, m7 packuswb m1, m3 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 pmaddubsw m4, m2, [r4 - 16] ; [15] pmulhrsw m4, m7 pmaddubsw m5, m2, [r4 - 10 * 16] ; [6] pmulhrsw m5, m7 packuswb m4, m5 pslldq m0, above, 1 palignr m2, m0, 14 pmaddubsw m5, m2, [r4 + 13 * 16] ; [29] pmulhrsw m5, m7 pmaddubsw m6, m2, [r4 + 4 * 16] ; [20] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m2, [r4 - 5 * 16] ; [11] pmulhrsw m6, m7 pmaddubsw m1, m2, [r4 - 14 * 16] ; [2] pmulhrsw m1, m7 packuswb m6, m1 pslldq m0, 1 
palignr m2, m0, 14 pmaddubsw m1, m2, [r4 + 9 * 16] ; [25] pmulhrsw m1, m7 pmaddubsw m0, m2, [r4] ; [16] pmulhrsw m0, m7 packuswb m1, m0 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 pmaddubsw m4, m2, [r4 - 9 * 16] ; [7] pmulhrsw m4, m7 pslldq m0, above, 3 palignr m2, m0, 14 pmaddubsw m3, m2, [r4 + 14 * 16] ; [30] pmulhrsw m3, m7 packuswb m4, m3 pmaddubsw m5, m2, [r4 + 5 * 16] ; [21] pmulhrsw m5, m7 pmaddubsw m6, m2, [r4 - 4 * 16] ; [12] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m2, [r4 - 13 * 16] ; [3] pmulhrsw m6, m7 pslldq m0, 1 palignr m2, m0, 14 pmaddubsw m0, m2, [r4 + 10 * 16] ; [26] pmulhrsw m0, m7 packuswb m6, m0 pmaddubsw m1, m2, [r4 + 16] ; [17] pmulhrsw m1, m7 pmaddubsw m0, m2, [r4 - 8 * 16] ; [8] pmulhrsw m0, m7 packuswb m1, m0 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 pslldq m0, above, 5 palignr m2, m0, 14 pmaddubsw m4, m2, [r4 + 15 * 16] ; [31] pmulhrsw m4, m7 pmaddubsw m5, m2, [r4 + 6 * 16] ; [22] pmulhrsw m5, m7 packuswb m4, m5 pmaddubsw m5, m2, [r4 - 3 * 16] ; [13] pmulhrsw m5, m7 pmaddubsw m6, m2, [r4 - 12 * 16] ; [04] pmulhrsw m6, m7 packuswb m5, m6 pslldq m0, 1 palignr m2, m0, 14 pmaddubsw m6, m2, [r4 + 11 * 16] ; [27] pmulhrsw m6, m7 pmaddubsw m1, m2, [r4 + 2 * 16] ; [18] pmulhrsw m1, m7 packuswb m6, m1 pmaddubsw m1, m2, [r4 - 7 * 16] ; [09] pmulhrsw m1, m7 pmaddubsw m3, m2, [r4 - 16 * 16] ; [00] pmulhrsw m3, m7 packuswb m1, m3 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 %endmacro %macro MODE_13_23 2 movu m2, [r2] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1] punpckhbw m0, m2, m1 ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8] punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0] palignr m0, m2, 2 ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1] pmaddubsw m4, m0, [r4 + 7 * 16] ; [23] pmulhrsw m4, m7 pmaddubsw m3, m0, [r4 - 2 * 16] ; [14] pmulhrsw m3, m7 packuswb m4, m3 pmaddubsw m5, m0, [r4 - 11 * 16] ; [05] pmulhrsw m5, m7 
pmaddubsw m6, m2, [r4 + 12 * 16] ; [28] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m2, [r4 + 3 * 16] ; [19] pmulhrsw m6, m7 pmaddubsw m3, m2, [r4 - 6 * 16] ; [10] pmulhrsw m3, m7 packuswb m6, m3 pmaddubsw m1, m2, [r4 - 15 * 16] ; [1] pmulhrsw m1, m7 movu m2, [r2 - 2] ; [14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1] palignr m3, m2, 1 ; [x, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] punpckhbw m0, m2, m3 punpcklbw m2, m3 palignr m0, m2, 2 pmaddubsw m3, m0, [r4 + 8 * 16] ; [24] pmulhrsw m3, m7 packuswb m1, m3 mova m3, m0 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 pmaddubsw m4, m3, [r4 - 16] ; [15] pmulhrsw m4, m7 pmaddubsw m5, m3, [r4 - 10 * 16] ; [6] pmulhrsw m5, m7 packuswb m4, m5 pmaddubsw m5, m2, [r4 + 13 * 16] ; [29] pmulhrsw m5, m7 pmaddubsw m6, m2, [r4 + 4 * 16] ; [20] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m2, [r4 - 5 * 16] ; [11] pmulhrsw m6, m7 pmaddubsw m1, m2, [r4 - 14 * 16] ; [2] pmulhrsw m1, m7 packuswb m6, m1 movu m2, [r2 - 4] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1] punpckhbw m0, m2, m1 ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8] punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0] palignr m0, m2, 2 ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1] pmaddubsw m1, m0, [r4 + 9 * 16] ; [25] pmulhrsw m1, m7 pmaddubsw m3, m0, [r4] ; [16] pmulhrsw m3, m7 packuswb m1, m3 mova m3, m0 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 pmaddubsw m4, m3, [r4 - 9 * 16] ; [7] pmulhrsw m4, m7 pmaddubsw m3, m2, [r4 + 14 * 16] ; [30] pmulhrsw m3, m7 packuswb m4, m3 pmaddubsw m5, m2, [r4 + 5 * 16] ; [21] pmulhrsw m5, m7 pmaddubsw m6, m2, [r4 - 4 * 16] ; [12] pmulhrsw m6, m7 packuswb m5, m6 pmaddubsw m6, m2, [r4 - 13 * 16] ; [3] pmulhrsw m6, m7 movu m2, [r2 - 6] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1] punpckhbw m0, m2, m1 ; [x, 15, 15, 14, 14, 
13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8] punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0] palignr m0, m2, 2 ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1] pmaddubsw m3, m0, [r4 + 10 * 16] ; [26] pmulhrsw m3, m7 packuswb m6, m3 pmaddubsw m1, m0, [r4 + 16] ; [17] pmulhrsw m1, m7 pmaddubsw m3, m0, [r4 - 8 * 16] ; [8] pmulhrsw m3, m7 packuswb m1, m3 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 pmaddubsw m4, m2, [r4 + 15 * 16] ; [31] pmulhrsw m4, m7 pmaddubsw m5, m2, [r4 + 6 * 16] ; [22] pmulhrsw m5, m7 packuswb m4, m5 pmaddubsw m5, m2, [r4 - 3 * 16] ; [13] pmulhrsw m5, m7 pmaddubsw m6, m2, [r4 - 12 * 16] ; [04] pmulhrsw m6, m7 packuswb m5, m6 movu m2, [r2 - 7] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] %if ((%1 & %2) == 1) pinsrb m2, [r3], 0 %endif palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1] punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0] pmaddubsw m6, m2, [r4 + 11 * 16] ; [27] pmulhrsw m6, m7 pmaddubsw m1, m2, [r4 + 2 * 16] ; [18] pmulhrsw m1, m7 packuswb m6, m1 pmaddubsw m1, m2, [r4 - 7 * 16] ; [09] pmulhrsw m1, m7 movu m0, [pb_fact0] pshufb m2, m0 pmovzxbw m2, m2 packuswb m1, m2 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 %endmacro INIT_XMM sse4 cglobal intra_pred_ang32_13, 3,7,8,0-(1*mmsize) %define above [rsp + 0 * mmsize] mov r3, r2 add r2, 64 lea r4, [ang_table + 16 * 16] lea r5, [r1 * 3] ; r5 -> 3 * stride lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride mova m7, [pw_1024] MODE_13_23_ROW0 1 lea r0, [r6 + r1 * 4] lea r6, [r6 + r1 * 8] add r2, 7 MODE_13_23 1, 1 lea r0, [r6 + r1 * 4] lea r6, [r6 + r1 * 8] add r2, 8 mov r3, 2 .loop: MODE_13_23 1, 0 lea r0, [r6 + r1 * 4] lea r6, [r6 + r1 * 8] add r2, 8 dec r3 jnz .loop RET INIT_XMM sse4 cglobal intra_pred_ang32_14, 3,7,8 ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line mov r6, rsp sub rsp, 64+gprsize and rsp, ~63 mov [rsp+64], r6 ; collect reference pixel movu m0, [r2] movu m1, [r2 + 15] pshufb m0, [c_mode32_14_0] 
; [x x x x x x x x x 0 2 5 7 10 12 15] pshufb m1, [c_mode32_14_0] ; [x x x x x x x x x 15 17 20 22 25 27 30] pslldq m1, 10 ; [17 20 22 25 27 30 x x x x x x x x x x x] palignr m0, m1, 10 ; [x x x 0 2 5 7 10 12 15 17 20 22 25 27 30] mova [rsp], m0 movu m0, [r2 + 1 + 64] movu m1, [r2 + 1 + 16 + 64] movu [rsp + 13], m0 movu [rsp + 13 + 16], m1 mov [rsp + 63], byte 4 ; filter lea r2, [rsp + 13] ; r2 -> [0] lea r3, [c_shuf8_0] ; r3 -> shuffle8 lea r4, [ang_table] ; r4 -> ang_table lea r5, [r1 * 3] ; r5 -> 3 * stride lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride mova m5, [pw_1024] ; m5 -> 1024 mova m6, [c_deinterval8] ; m6 -> c_deinterval8 .loop: ; Row[0 - 7] movu m7, [r2 - 4] palignr m0, m7, 3 mova m1, m0 palignr m2, m7, 2 mova m3, m2 palignr m4, m7, 1 mova m5, m4 mova m6, m4 PROC32_8x8 0, 1, 19,6,25,12,31,18,5,24 ; Row[8 - 15] movu m7, [r2 - 7] palignr m0, m7, 3 palignr m1, m7, 2 mova m2, m1 mova m3, m1 palignr m4, m7, 1 mova m5, m4 mova m6, m7 PROC32_8x8 1, 1, 11,30,17,4,23,10,29,16 ; Row[16 - 23] movu m7, [r2 - 10] palignr m0, m7, 3 palignr m1, m7, 2 mova m2, m1 palignr m3, m7, 1 mova m4, m3 mova m5, m3 mova m6, m7 PROC32_8x8 2, 1, 3,22,9,28,15,2,21,8 ; Row[24 - 31] movu m7, [r2 - 13] palignr m0, m7, 2 mova m1, m0 mova m2, m0 palignr m3, m7, 1 mova m4, m3 mova m5, m7 mova m6, m7 PROC32_8x8 3, 1, 27,14,1,20,7,26,13,0 lea r0, [r6 + r1 * 4] lea r6, [r6 + r1 * 8] add r2, 8 dec byte [rsp + 63] jnz .loop mov rsp, [rsp+64] RET INIT_XMM sse4 cglobal intra_pred_ang32_15, 4,7,8 ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line mov r6, rsp sub rsp, 64+gprsize and rsp, ~63 mov [rsp+64], r6 ; collect reference pixel movu m0, [r2] movu m1, [r2 + 15] pshufb m0, [c_mode32_15_0] ; [x x x x x x x 0 2 4 6 8 9 11 13 15] pshufb m1, [c_mode32_15_0] ; [x x x x x x x 15 17 19 21 23 24 26 28 30] mova [rsp], m1 movu [rsp + 8], m0 movu m0, [r2 + 1 + 64] movu m1, [r2 + 1 + 16 + 64] movu [rsp + 17], m0 movu [rsp + 17 + 16], m1 mov [rsp + 63], byte 4 ; filter lea r2, [rsp + 
17] ; r2 -> [0] lea r3, [c_shuf8_0] ; r3 -> shuffle8 lea r4, [ang_table] ; r4 -> ang_table lea r5, [r1 * 3] ; r5 -> 3 * stride lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride mova m5, [pw_1024] ; m5 -> 1024 mova m6, [c_deinterval8] ; m6 -> c_deinterval8 .loop: ; Row[0 - 7] movu m7, [r2 - 5] palignr m0, m7, 4 palignr m1, m7, 3 mova m2, m1 palignr m3, m7, 2 mova m4, m3 palignr m5, m7, 1 mova m6, m5 PROC32_8x8 0, 1, 15,30,13,28,11,26,9,24 ; Row[8 - 15] movu m7, [r2 - 9] palignr m0, m7, 4 palignr m1, m7, 3 mova m2, m1 palignr m3, m7, 2 mova m4, m3 palignr m5, m7, 1 mova m6, m5 PROC32_8x8 1, 1, 7,22,5,20,3,18,1,16 ; Row[16 - 23] movu m7, [r2 - 13] palignr m0, m7, 3 mova m1, m0 palignr m2, m7, 2 mova m3, m2 palignr m4, m7, 1 mova m5, m4 mova m6, m7 PROC32_8x8 2, 1, 31,14,29,12,27,10,25,8 ; Row[24 - 31] movu m7, [r2 - 17] palignr m0, m7, 3 mova m1, m0 palignr m2, m7, 2 mova m3, m2 palignr m4, m7, 1 mova m5, m4 mova m6, m7 PROC32_8x8 3, 1, 23,6,21,4,19,2,17,0 lea r0, [r6 + r1 * 4] lea r6, [r6 + r1 * 8] add r2, 8 dec byte [rsp + 63] jnz .loop mov rsp, [rsp+64] RET INIT_XMM sse4 cglobal intra_pred_ang32_16, 4,7,8 ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line mov r6, rsp sub rsp, 64+gprsize and rsp, ~63 mov [rsp+64], r6 ; collect reference pixel movu m0, [r2] movu m1, [r2 + 15] pshufb m0, [c_mode32_16_0] ; [x x x x x 0 2 3 5 6 8 9 11 12 14 15] pshufb m1, [c_mode32_16_0] ; [x x x x x 15 17 18 20 21 23 24 26 27 29 30] mova [rsp], m1 movu [rsp + 10], m0 movu m0, [r2 + 1 + 64] movu m1, [r2 + 1 + 16 + 64] movu [rsp + 21], m0 movu [rsp + 21 + 16], m1 mov [rsp + 63], byte 4 ; filter lea r2, [rsp + 21] ; r2 -> [0] lea r3, [c_shuf8_0] ; r3 -> shuffle8 lea r4, [ang_table] ; r4 -> ang_table lea r5, [r1 * 3] ; r5 -> 3 * stride lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride mova m5, [pw_1024] ; m5 -> 1024 mova m6, [c_deinterval8] ; m6 -> c_deinterval8 .loop: ; Row[0 - 7] movu m7, [r2 - 6] palignr m0, m7, 5 palignr m1, m7, 4 mova m2, m1 palignr m3, m7, 3 palignr m4, m7, 2 
mova m5, m4 palignr m6, m7, 1 PROC32_8x8 0, 1, 11,22,1,12,23,2,13,24 ; Row[8 - 15] movu m7, [r2 - 11] palignr m0, m7, 5 palignr m1, m7, 4 palignr m2, m7, 3 mova m3, m2 palignr m4, m7, 2 palignr m5, m7, 1 mova m6, m5 PROC32_8x8 1, 1, 3,14,25,4,15,26,5,16 ; Row[16 - 23] movu m7, [r2 - 16] palignr m0, m7, 4 mova m1, m0 palignr m2, m7, 3 palignr m3, m7, 2 mova m4, m3 palignr m5, m7, 1 mova m6, m7 PROC32_8x8 2, 1, 27,6,17,28,7,18,29,8 ; Row[24 - 31] movu m7, [r2 - 21] palignr m0, m7, 4 palignr m1, m7, 3 mova m2, m1 palignr m3, m7, 2 palignr m4, m7, 1 mova m5, m4 mova m6, m7 PROC32_8x8 3, 1, 19,30,9,20,31,10,21,0 lea r0, [r6 + r1 * 4] lea r6, [r6 + r1 * 8] add r2, 8 dec byte [rsp + 63] jnz .loop mov rsp, [rsp+64] RET INIT_XMM sse4 cglobal intra_pred_ang32_17, 4,7,8 ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line mov r6, rsp sub rsp, 64+gprsize and rsp, ~63 mov [rsp+64], r6 ; collect reference pixel movu m0, [r2] movu m1, [r2 + 16] pshufb m0, [c_mode32_17_0] pshufb m1, [c_mode32_17_0] mova [rsp ], m1 movu [rsp + 13], m0 movu m0, [r2 + 1 + 64] movu m1, [r2 + 1 + 16 + 64] movu [rsp + 26], m0 movu [rsp + 26 + 16], m1 mov [rsp + 63], byte 4 ; filter lea r2, [rsp + 25] ; r2 -> [0] lea r3, [c_shuf8_0] ; r3 -> shuffle8 lea r4, [ang_table] ; r4 -> ang_table lea r5, [r1 * 3] ; r5 -> 3 * stride lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride mova m5, [pw_1024] ; m5 -> 1024 mova m6, [c_deinterval8] ; m6 -> c_deinterval8 .loop: ; Row[0 - 7] movu m7, [r2 - 6] palignr m0, m7, 6 palignr m1, m7, 5 palignr m2, m7, 4 palignr m3, m7, 3 palignr m4, m7, 2 mova m5, m4 palignr m6, m7, 1 PROC32_8x8 0, 1, 6,12,18,24,30,4,10,16 ; Row[7 - 15] movu m7, [r2 - 12] palignr m0, m7, 5 palignr m1, m7, 4 mova m2, m1 palignr m3, m7, 3 palignr m4, m7, 2 palignr m5, m7, 1 mova m6, m7 PROC32_8x8 1, 1, 22,28,2,8,14,20,26,0 ; Row[16 - 23] movu m7, [r2 - 19] palignr m0, m7, 6 palignr m1, m7, 5 palignr m2, m7, 4 palignr m3, m7, 3 palignr m4, m7, 2 mova m5, m4 palignr m6, m7, 1 PROC32_8x8 2, 
1, 6,12,18,24,30,4,10,16
    ; Row[24 - 31]
    movu        m7,        [r2 - 25]
    palignr     m0,        m7, 5
    palignr     m1,        m7, 4
    mova        m2,        m1
    palignr     m3,        m7, 3
    palignr     m4,        m7, 2
    palignr     m5,        m7, 1
    mova        m6,        m7
    PROC32_8x8  3, 1, 22,28,2,8,14,20,26,0

    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]
    add         r2,        8
    dec         byte [rsp + 63]
    jnz         .loop
    mov         rsp,       [rsp+64]
    RET

;-----------------------------------------------------------------------------
; ANG32_18_HALF_AVX2 row, fill
;   %1 = ymm register stored unchanged as the first of 16 rows
;   %2 = ymm register supplying the bytes shifted in (per 128-bit lane)
; Emits 16 row stores: row k (k = 1..15) is palignr(%1, %2, 16 - k).
; r0 is advanced by 4 * r1 after every fourth row, so on exit it points at
; the 13th row of the half. Uses r1 (stride), r3 (3 * stride); clobbers m2.
;-----------------------------------------------------------------------------
%macro ANG32_18_HALF_AVX2 2
    movu        [r0],      %1
%assign %%shift 15
%rep 15
  %assign %%row ((16 - %%shift) & 3)
  %if %%row == 0
    lea         r0,        [r0 + r1 * 4]
    %define %%dst r0
  %elif %%row == 1
    %define %%dst r0 + r1
  %elif %%row == 2
    %define %%dst r0 + r1 * 2
  %else
    %define %%dst r0 + r3
  %endif
    palignr     m2,        %1, %2, %%shift
    movu        [%%dst],   m2
  %assign %%shift %%shift - 1
%endrep
%endmacro

;-----------------------------------------------------------------------------
; intra_pred_ang32_18 (AVX2)
; r0 = destination, r1 = destination stride, r2 = reference pixels.
; Row 0 is the 32 bytes at [r2]; every following row is the previous one
; shifted by one byte, with bytes taken from [r2 + 65 ...] in reversed order
; entering at the low end.
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal intra_pred_ang32_18, 4, 4, 3
    movu        m0,        [r2]                     ; 32 bytes starting at [r2]
    movu        xm1,       [r2 + 1 + 64]
    pshufb      xm1,       [intra_pred_shuff_15_0]  ; byte-reverse [r2 + 65 .. r2 + 80]
    vinserti128 m1,        m1, xm0, 1               ; high lane = low 16 bytes of m0
    lea         r3,        [r1 * 3]

    ; rows 0 - 15
    ANG32_18_HALF_AVX2 m0, m1
    lea         r0,        [r0 + r1 * 4]

    ; rows 16 - 31
    movu        xm0,       [r2 + 64 + 17]
    pshufb      xm0,       [intra_pred_shuff_15_0]  ; byte-reverse [r2 + 81 .. r2 + 96]
    vinserti128 m0,        m0, xm1, 1               ; high lane = low lane of m1
    ANG32_18_HALF_AVX2 m1, m0
    RET

;-----------------------------------------------------------------------------
; ANG32_18_HALF_SSE4 right, left, fill
;   %1 = xmm register holding columns 16-31 of the first row
;   %2 = xmm register holding columns  0-15 of the first row
;   %3 = xmm register supplying the bytes shifted into the left half
; Emits 16 rows of 32 bytes: row k (k = 1..15) is
;   left  = palignr(%2, %3, 16 - k),  right = palignr(%1, %2, 16 - k).
; r0 is advanced by r4 after every fourth row. Uses r1/r2/r3/r4 as
; 1x/2x/3x/4x stride; clobbers m4.
;-----------------------------------------------------------------------------
%macro ANG32_18_HALF_SSE4 3
    movu        [r0],      %2
    movu        [r0 + 16], %1
%assign %%shift 15
%rep 15
  %assign %%row ((16 - %%shift) & 3)
  %if %%row == 0
    lea         r0,        [r0 + r4]
    %define %%dst r0
  %elif %%row == 1
    %define %%dst r0 + r1
  %elif %%row == 2
    %define %%dst r0 + r2
  %else
    %define %%dst r0 + r3
  %endif
    palignr     m4,        %2, %3, %%shift
    movu        [%%dst],   m4
    palignr     m4,        %1, %2, %%shift
    movu        [%%dst + 16], m4
  %assign %%shift %%shift - 1
%endrep
%endmacro

;-----------------------------------------------------------------------------
; intra_pred_ang32_18 (SSE4)
; Same output as the AVX2 variant, built from 16-byte halves.
; r0 = destination, r1 = destination stride, r2 = reference pixels
; (r2 is reused as 2 * stride once the references are loaded).
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_18, 4,5,5
    movu        m0,        [r2]                ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    movu        m1,        [r2 + 16]           ; [31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16]
    movu        m2,        [r2 + 1 + 64]       ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu        m3,        [r2 + 17 + 64]      ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]

    lea         r2,        [r1 * 2]            ; r2 -> 2 * stride
    lea         r3,        [r1 * 3]            ; r3 -> 3 * stride
    lea         r4,        [r1 * 4]            ; r4 -> 4 * stride

    pshufb      m2,        [c_mode32_18_0]     ; [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
    pshufb      m3,        [c_mode32_18_0]     ; [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]

    ; rows 0 - 15
    ANG32_18_HALF_SSE4 m1, m0, m2
    lea         r0,        [r0 + r4]

    ; rows 16 - 31
    ANG32_18_HALF_SSE4 m0, m2, m3
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_19, 4,7,8
    ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line
    mov         r6,        rsp
    sub         rsp,       64+gprsize
    and         rsp,       ~63
    mov         [rsp+64],  r6

    ; collect reference pixel
    movu        m0,        [r2 + 64]
    pinsrb      m0,        [r2], 0
    movu        m1,        [r2 + 16 + 64]
    pshufb      m0,        [c_mode32_17_0]
    pshufb      m1,        [c_mode32_17_0]
    mova        [rsp     ], m1
    movu        [rsp + 13], m0
    movu        m0,        [r2 + 1]
    movu        m1,        [r2 + 1 + 16]
    movu        [rsp + 26], m0
    movu        [rsp + 26 + 16], m1
    mov         [rsp + 63], byte 4

    ; filter
    lea         r2,        [rsp + 25]          ; r2 -> [0]
    lea         r3,        [c_shuf8_0]         ; r3 -> shuffle8
    lea         r4,        [ang_table]         ; r4 -> ang_table
    lea         r5,        [r1 * 3]            ; r5 -> 3 * stride
    lea         r6,        [r0]                ; r6 -> r0
    mova        m5,        [pw_1024]           ; m5 -> 1024
    mova        m6,
[c_deinterval8] ; m6 -> c_deinterval8 .loop: ; Row[0 - 7] movu m7, [r2 - 6] palignr m0, m7, 6 palignr m1, m7, 5 palignr m2, m7, 4 palignr m3, m7, 3 palignr m4, m7, 2 mova m5, m4 palignr m6, m7, 1 PROC32_8x8 0, 0, 6,12,18,24,30,4,10,16 ; Row[7 - 15] movu m7, [r2 - 12] palignr m0, m7, 5 palignr m1, m7, 4 mova m2, m1 palignr m3, m7, 3 palignr m4, m7, 2 palignr m5, m7, 1 mova m6, m7 lea r0, [r0 + r1 * 4] PROC32_8x8 1, 0, 22,28,2,8,14,20,26,0 ; Row[16 - 23] movu m7, [r2 - 19] palignr m0, m7, 6 palignr m1, m7, 5 palignr m2, m7, 4 palignr m3, m7, 3 palignr m4, m7, 2 mova m5, m4 palignr m6, m7, 1 lea r0, [r0 + r1 * 4] PROC32_8x8 2, 0, 6,12,18,24,30,4,10,16 ; Row[24 - 31] movu m7, [r2 - 25] palignr m0, m7, 5 palignr m1, m7, 4 mova m2, m1 palignr m3, m7, 3 palignr m4, m7, 2 palignr m5, m7, 1 mova m6, m7 lea r0, [r0 + r1 * 4] PROC32_8x8 3, 0, 22,28,2,8,14,20,26,0 add r6, 8 mov r0, r6 add r2, 8 dec byte [rsp + 63] jnz .loop mov rsp, [rsp+64] RET INIT_XMM sse4 cglobal intra_pred_ang32_20, 4,7,8 ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line mov r6, rsp sub rsp, 64+gprsize and rsp, ~63 mov [rsp+64], r6 ; collect reference pixel movu m0, [r2 + 64] pinsrb m0, [r2], 0 movu m1, [r2 + 15 + 64] pshufb m0, [c_mode32_16_0] ; [x x x x x 0 2 3 5 6 8 9 11 12 14 15] pshufb m1, [c_mode32_16_0] ; [x x x x x 15 17 18 20 21 23 24 26 27 29 30] mova [rsp], m1 movu [rsp + 10], m0 movu m0, [r2 + 1] movu m1, [r2 + 1 + 16] movu [rsp + 21], m0 movu [rsp + 21 + 16], m1 mov [rsp + 63], byte 4 ; filter lea r2, [rsp + 21] ; r2 -> [0] lea r3, [c_shuf8_0] ; r3 -> shuffle8 lea r4, [ang_table] ; r4 -> ang_table lea r5, [r1 * 3] ; r5 -> 3 * stride lea r6, [r0] ; r6 -> r0 mova m5, [pw_1024] ; m5 -> 1024 mova m6, [c_deinterval8] ; m6 -> c_deinterval8 .loop: ; Row[0 - 7] movu m7, [r2 - 6] palignr m0, m7, 5 palignr m1, m7, 4 mova m2, m1 palignr m3, m7, 3 palignr m4, m7, 2 mova m5, m4 palignr m6, m7, 1 PROC32_8x8 0, 0, 11,22,1,12,23,2,13,24 ; Row[8 - 15] movu m7, [r2 - 11] palignr m0, 
m7, 5 palignr m1, m7, 4 palignr m2, m7, 3 mova m3, m2 palignr m4, m7, 2 palignr m5, m7, 1 mova m6, m5 lea r0, [r0 + r1 * 4] PROC32_8x8 1, 0, 3,14,25,4,15,26,5,16 ; Row[16 - 23] movu m7, [r2 - 16] palignr m0, m7, 4 mova m1, m0 palignr m2, m7, 3 palignr m3, m7, 2 mova m4, m3 palignr m5, m7, 1 mova m6, m7 lea r0, [r0 + r1 * 4] PROC32_8x8 2, 0, 27,6,17,28,7,18,29,8 ; Row[24 - 31] movu m7, [r2 - 21] palignr m0, m7, 4 palignr m1, m7, 3 mova m2, m1 palignr m3, m7, 2 palignr m4, m7, 1 mova m5, m4 mova m6, m7 lea r0, [r0 + r1 * 4] PROC32_8x8 3, 0, 19,30,9,20,31,10,21,0 add r6, 8 mov r0, r6 add r2, 8 dec byte [rsp + 63] jnz .loop mov rsp, [rsp+64] RET INIT_XMM sse4 cglobal intra_pred_ang32_21, 4,7,8 ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line mov r6, rsp sub rsp, 64+gprsize and rsp, ~63 mov [rsp+64], r6 ; collect reference pixel movu m0, [r2 + 64] pinsrb m0, [r2], 0 movu m1, [r2 + 15 + 64] pshufb m0, [c_mode32_15_0] ; [x x x x x x x 0 2 4 6 8 9 11 13 15] pshufb m1, [c_mode32_15_0] ; [x x x x x x x 15 17 19 21 23 24 26 28 30] mova [rsp], m1 movu [rsp + 8], m0 movu m0, [r2 + 1] movu m1, [r2 + 1 + 16] movu [rsp + 17], m0 movu [rsp + 17 + 16], m1 mov [rsp + 63], byte 4 ; filter lea r2, [rsp + 17] ; r2 -> [0] lea r3, [c_shuf8_0] ; r3 -> shuffle8 lea r4, [ang_table] ; r4 -> ang_table lea r5, [r1 * 3] ; r5 -> 3 * stride lea r6, [r0] ; r6 -> r0 mova m5, [pw_1024] ; m5 -> 1024 mova m6, [c_deinterval8] ; m6 -> c_deinterval8 .loop: ; Row[0 - 7] movu m7, [r2 - 5] palignr m0, m7, 4 palignr m1, m7, 3 mova m2, m1 palignr m3, m7, 2 mova m4, m3 palignr m5, m7, 1 mova m6, m5 PROC32_8x8 0, 0, 15,30,13,28,11,26,9,24 ; Row[8 - 15] movu m7, [r2 - 9] palignr m0, m7, 4 palignr m1, m7, 3 mova m2, m1 palignr m3, m7, 2 mova m4, m3 palignr m5, m7, 1 mova m6, m5 lea r0, [r0 + r1 * 4] PROC32_8x8 1, 0, 7,22,5,20,3,18,1,16 ; Row[16 - 23] movu m7, [r2 - 13] palignr m0, m7, 3 mova m1, m0 palignr m2, m7, 2 mova m3, m2 palignr m4, m7, 1 mova m5, m4 mova m6, m7 lea r0, [r0 + r1 
* 4] PROC32_8x8 2, 0, 31,14,29,12,27,10,25,8 ; Row[24 - 31] movu m7, [r2 - 17] palignr m0, m7, 3 mova m1, m0 palignr m2, m7, 2 mova m3, m2 palignr m4, m7, 1 mova m5, m4 mova m6, m7 lea r0, [r0 + r1 * 4] PROC32_8x8 3, 0, 23,6,21,4,19,2,17,0 add r6, 8 mov r0, r6 add r2, 8 dec byte [rsp + 63] jnz .loop mov rsp, [rsp+64] RET INIT_XMM sse4 cglobal intra_pred_ang32_22, 4,7,8 ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line mov r6, rsp sub rsp, 64+gprsize and rsp, ~63 mov [rsp+64], r6 ; collect reference pixel movu m0, [r2 + 64] pinsrb m0, [r2], 0 movu m1, [r2 + 15 + 64] pshufb m0, [c_mode32_14_0] ; [x x x x x x x x x 0 2 5 7 10 12 15] pshufb m1, [c_mode32_14_0] ; [x x x x x x x x x 15 17 20 22 25 27 30] pslldq m1, 10 ; [17 20 22 25 27 30 x x x x x x x x x x x] palignr m0, m1, 10 ; [x x x 0 2 5 7 10 12 15 17 20 22 25 27 30] mova [rsp], m0 movu m0, [r2 + 1] movu m1, [r2 + 1 + 16] movu [rsp + 13], m0 movu [rsp + 13 + 16], m1 mov [rsp + 63], byte 4 ; filter lea r2, [rsp + 13] ; r2 -> [0] lea r3, [c_shuf8_0] ; r3 -> shuffle8 lea r4, [ang_table] ; r4 -> ang_table lea r5, [r1 * 3] ; r5 -> 3 * stride lea r6, [r0] ; r6 -> r0 mova m5, [pw_1024] ; m5 -> 1024 mova m6, [c_deinterval8] ; m6 -> c_deinterval8 .loop: ; Row[0 - 7] movu m7, [r2 - 4] palignr m0, m7, 3 mova m1, m0 palignr m2, m7, 2 mova m3, m2 palignr m4, m7, 1 mova m5, m4 mova m6, m4 PROC32_8x8 0, 0, 19,6,25,12,31,18,5,24 ; Row[8 - 15] movu m7, [r2 - 7] palignr m0, m7, 3 palignr m1, m7, 2 mova m2, m1 mova m3, m1 palignr m4, m7, 1 mova m5, m4 mova m6, m7 lea r0, [r0 + r1 * 4] PROC32_8x8 1, 0, 11,30,17,4,23,10,29,16 ; Row[16 - 23] movu m7, [r2 - 10] palignr m0, m7, 3 palignr m1, m7, 2 mova m2, m1 palignr m3, m7, 1 mova m4, m3 mova m5, m3 mova m6, m7 lea r0, [r0 + r1 * 4] PROC32_8x8 2, 0, 3,22,9,28,15,2,21,8 ; Row[24 - 31] movu m7, [r2 - 13] palignr m0, m7, 2 mova m1, m0 mova m2, m0 palignr m3, m7, 1 mova m4, m3 mova m5, m7 mova m6, m7 lea r0, [r0 + r1 * 4] PROC32_8x8 3, 0, 27,14,1,20,7,26,13,0 add 
r6, 8 mov r0, r6 add r2, 8 dec byte [rsp + 63] jnz .loop mov rsp, [rsp+64] RET INIT_XMM sse4 cglobal intra_pred_ang32_23, 4,7,8,0-(1*mmsize) %define above [rsp + 0 * mmsize] lea r3, [r2 + 64] lea r4, [ang_table + 16 * 16] lea r5, [r1 * 3] ; r5 -> 3 * stride mov r6, r0 mova m7, [pw_1024] MODE_13_23_ROW0 0 add r6, 8 mov r0, r6 add r2, 7 mov r3, 3 .loop: MODE_13_23 0, 0 add r6, 8 mov r0, r6 add r2, 8 dec r3 jnz .loop RET INIT_XMM sse4 cglobal intra_pred_ang32_24, 4,7,8,0-(1*mmsize) %define above [rsp + 0 * mmsize] lea r3, [r2 + 64] lea r4, [ang_table + 16 * 16] lea r5, [r1 * 3] ; r5 -> 3 * stride mov r6, r0 mova m7, [pw_1024] MODE_12_24_ROW0 0 add r6, 8 mov r0, r6 add r2, 7 mov r3, 3 .loop: MODE_12_24 0 add r6, 8 mov r0, r6 add r2, 8 dec r3 jnz .loop RET INIT_XMM sse4 cglobal intra_pred_ang32_25, 4,7,8 ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line mov r6, rsp sub rsp, 64+gprsize and rsp, ~63 mov [rsp+64], r6 ; collect reference pixel movu m0, [r2 + 16 + 64] pxor m1, m1 pshufb m0, m1 ; [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] mova [rsp], m0 movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + 32] movu [rsp + 1], m0 movu [rsp + 1 + 16], m1 movu [rsp + 1 + 32], m2 mov [rsp + 63], byte 4 ; filter lea r2, [rsp + 1] ; r2 -> [0] lea r3, [c_shuf8_0] ; r3 -> shuffle8 lea r4, [ang_table] ; r4 -> ang_table lea r5, [r1 * 3] ; r5 -> 3 * stride lea r6, [r0] ; r6 -> r0 mova m5, [pw_1024] ; m5 -> 1024 mova m6, [c_deinterval8] ; m6 -> c_deinterval8 .loop: ; Row[0 - 7] movu m7, [r2] mova m0, m7 mova m1, m7 mova m2, m7 mova m3, m7 mova m4, m7 mova m5, m7 mova m6, m7 PROC32_8x8 0, 0, 30,28,26,24,22,20,18,16 ; Row[8 - 15] movu m7, [r2] mova m0, m7 mova m1, m7 mova m2, m7 mova m3, m7 mova m4, m7 mova m5, m7 mova m6, m7 lea r0, [r0 + r1 * 4] PROC32_8x8 1, 0, 14,12,10,8,6,4,2,0 ; Row[16 - 23] movu m7, [r2 - 1] mova m0, m7 mova m1, m7 mova m2, m7 mova m3, m7 mova m4, m7 mova m5, m7 mova m6, m7 lea r0, [r0 + r1 * 4] PROC32_8x8 2, 0, 30,28,26,24,22,20,18,16 ; Row[24 - 31] 
; tail of intra_pred_ang32_25: Row[24 - 31] tile, loop control and epilogue
    movu        m7,        [r2 - 1]
    mova        m0,        m7
    mova        m1,        m7
    mova        m2,        m7
    mova        m3,        m7
    mova        m4,        m7
    mova        m5,        m7
    mova        m6,        m7
    lea         r0,        [r0 + r1 * 4]
    PROC32_8x8  3, 0, 14,12,10,8,6,4,2,0

    add         r6,        8
    mov         r0,        r6
    add         r2,        8
    dec         byte [rsp + 63]
    jnz         .loop
    mov         rsp,       [rsp+64]
    RET

;-----------------------------------------------------------------------------
; intra_pred_ang32_26 (SSE4)
; In:    r0  = destination
;        r1  = destination stride
;        r2  = reference pixels
;        r4d = filter flag (5th argument); the 4th argument is not read
; Each of the two loop passes loads 16 bytes from [r2 + 1] and replicates
; them down 32 rows, then moves 16 columns to the right.
; Stack: two mmsize slots (m8/m9) hold reference data captured before the
;        loop for the optional filter step.
; Clobb: r3-r6, m0-m4, flags
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_26, 5,7,7,0-(2*mmsize)
%define m8 [rsp + 0 * mmsize]
%define m9 [rsp + 1 * mmsize]
    mov         r6d,            2               ; two passes of 16 columns
    movu        m0,             [r2 + 64]
    pinsrb      m0,             [r2], 0         ; byte 0 <- [r2]
    movu        m1,             [r2 + 1 + 64]
    mova        m8,             m0              ; only byte 0 is used later (broadcast)
    mova        m9,             m1              ; 16 bytes from [r2 + 65]
    mov         r3d,            r4d             ; keep the filter flag, r4 is reused below
    lea         r4,             [r1 * 3]        ; r4 -> 3 * stride

.loop:
    movu        m0,             [r2 + 1]

    ; rows 0 - 3
    movu        [r0],           m0
    movu        [r0 + r1],      m0
    movu        [r0 + r1 * 2],  m0
    movu        [r0 + r4],      m0

    ; rows 4 - 7
    lea         r5,             [r0 + r1 * 4]
    movu        [r5],           m0
    movu        [r5 + r1],      m0
    movu        [r5 + r1 * 2],  m0
    movu        [r5 + r4],      m0

    ; rows 8 - 31, four rows per step. Every row is written exactly once;
    ; the earlier code restarted r5 at row 4 and stored rows 4 - 15 twice.
%rep 6
    lea         r5,             [r5 + r1 * 4]
    movu        [r5],           m0
    movu        [r5 + r1],      m0
    movu        [r5 + r1 * 2],  m0
    movu        [r5 + r4],      m0
%endrep

    ; filter
    ; NOTE(review): this runs on both passes. On the second pass r0 and r2
    ; have advanced by 16, so it rewrites column 16 of rows 0 - 15 from the
    ; byte at the new [r2 + 1], while m8/m9 still hold the data captured
    ; before the loop. If the intent is to adjust only the first column for
    ; all 32 rows, this path needs rework; confirm whether any caller passes
    ; a non-zero filter flag to the 32x32 variant.
    test        r3d,            r3d
    jz         .quit

    pxor        m4,             m4
    pshufb      m0,             m4              ; broadcast byte 0 of the row
    pmovzxbw    m0,             m0
    mova        m1,             m0
    movu        m2,             m8
    movu        m3,             m9

    pshufb      m2,             m4              ; broadcast the byte taken from [r2]
    pmovzxbw    m2,             m2
    movhlps     m4,             m3
    pmovzxbw    m3,             m3              ; low  8 bytes of m9 as words
    pmovzxbw    m4,             m4              ; high 8 bytes of m9 as words
    psubw       m3,             m2
    psubw       m4,             m2
    psraw       m3,             1
    psraw       m4,             1
    paddw       m0,             m3
    paddw       m1,             m4
    packuswb    m0,             m1              ; saturate back to bytes

    ; one byte per row, rows 0 - 15 of the current column
    pextrb      [r0],           m0, 0
    pextrb      [r0 + r1],      m0, 1
    pextrb      [r0 + r1 * 2],  m0, 2
    pextrb      [r0 + r4],      m0, 3
    lea         r5,             [r0 + r1 * 4]
    pextrb      [r5],           m0, 4
    pextrb      [r5 + r1],      m0, 5
    pextrb      [r5 + r1 * 2],  m0, 6
    pextrb      [r5 + r4],      m0, 7
    lea         r5,             [r5 + r1 * 4]
    pextrb      [r5],           m0, 8
    pextrb      [r5 + r1],      m0, 9
    pextrb      [r5 + r1 * 2],  m0, 10
    pextrb      [r5 + r4],      m0, 11
    lea         r5,             [r5 + r1 * 4]
    pextrb      [r5],           m0, 12
    pextrb      [r5 + r1],      m0, 13
    pextrb      [r5 + r1 * 2],  m0, 14
    pextrb      [r5 + r4],      m0, 15

.quit:
    lea         r2,             [r2 + 16]
    add         r0,             16
    dec         r6d
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_27, 3,7,8
    lea    r3,    [ang_table + 16 * 16]
    mov    r4d,   4
    lea    r5,    [r1 * 3]
    mov    r6,    r0
    mova   m7,    [pw_1024]
.loop:
    MODE_9_27 0
    add    r6,    8
    mov    r0,    r6
    add    r2,    8
    dec    r4
    jnz    .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_28, 3,7,8
    lea    r3,    [ang_table + 16 * 16]
    mov    r4d,   4
    lea    r5,    [r1 * 3]
    mov    r6,    r0
    mova   m7,    [pw_1024]
.loop:
    MODE_8_28 0
    add    r6,    8
    mov    r0,    r6
    add    r2,    8
    dec    r4
    jnz    .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_29, 3,7,8
    lea    r3,    [ang_table + 16 * 16]
    mov    r4d,   4
    lea    r5,    [r1 * 3]
    mov    r6,    r0
    mova   m7,    [pw_1024]
.loop:
    MODE_7_29 0
    add    r6,    8
    mov    r0,    r6
    add    r2,    8
    dec    r4
    jnz    .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_30, 3,7,8
    lea    r3,    [ang_table + 16 * 16]
    mov    r4d,   4
    lea    r5,    [r1 * 3]
    mov    r6,    r0
    mova   m7,    [pw_1024]
.loop:
    MODE_6_30 0
    add    r6,    8
    mov    r0,    r6
    add    r2,    8
    dec    r4
    jnz    .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_31, 3,7,8
    lea    r3,    [ang_table + 16 * 16]
    mov    r4d,   4
    lea    r5,    [r1 * 3]
    mov    r6,    r0
    mova   m7,    [pw_1024]
.loop:
    MODE_5_31 0
    add    r6,    8
    mov    r0,    r6
    add    r2,    8
    dec    r4
    jnz    .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_32, 3,7,8
    lea    r3,    [ang_table + 16 * 16]
    mov    r4d,   4
    lea    r5,    [r1 * 3]
    mov    r6,    r0
    mova   m7,    [pw_1024]
.loop:
    MODE_4_32 0
    add    r6,    8
    mov    r0,    r6
    add    r2,    8
    dec    r4
    jnz    .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_33, 3,7,8
    lea    r3,    [ang_table + 16 * 16]
    mov    r4d,   4
    lea    r5,    [r1 * 3]
    mov    r6,    r0
    mova   m7,    [pw_1024]
.loop:
    MODE_3_33 0
    add    r6,    8
    mov    r0,    r6
    add    r2,    8
    dec    r4
    jnz    .loop
    RET

;-----------------------------------------------------------------------------------------
; start of intra_pred_ang32 angular modes avx2 asm
;-----------------------------------------------------------------------------------------

%if ARCH_X86_64 == 1
INIT_YMM avx2

;-----------------------------------------------------------------------------
; TRANSPOSE_32x8_AVX2 - store eight 32-byte result registers, either directly
; as eight output rows or transposed as a 32-row x 8-byte column strip.
;
; The macro opens with "jnz .skip%10": it consumes ZF left by the
; "test r7d, r7d" at the top of the calling helper.  Nothing between that test
; and either expansion may modify flags.
;   r7d == 0 : transpose, write 32 rows of 8 bytes, then r0 = r4 + 8
;   r7d != 0 : write 8 rows of 32 bytes, r0 advances by 8 rows
; Expects r1 = stride, r5 = 3 * stride, r6 = 4 * stride, r4 = strip base
; (r4 is read on the transposed path only).  All of %1-%9 are clobbered on the
; transposed path.
;-----------------------------------------------------------------------------
; register mapping :
; %1-%8 - output registers
; %9 - temp register
; %10 - for label naming
%macro TRANSPOSE_32x8_AVX2 10
    jnz         .skip%10                    ; ZF comes from the caller's "test r7d, r7d"

    ; transpose 8x32 to 32x8 and then store
    ; stage 1: interleave bytes of neighbouring input rows
    punpcklbw   m%9, m%1, m%2
    punpckhbw   m%1, m%2
    punpcklbw   m%2, m%3, m%4
    punpckhbw   m%3, m%4
    punpcklbw   m%4, m%5, m%6
    punpckhbw   m%5, m%6
    punpcklbw   m%6, m%7, m%8
    punpckhbw   m%7, m%8

    ; stage 2: interleave words
    punpcklwd   m%8, m%9, m%2
    punpckhwd   m%9, m%2
    punpcklwd   m%2, m%4, m%6
    punpckhwd   m%4, m%6
    punpcklwd   m%6, m%1, m%3
    punpckhwd   m%1, m%3
    punpcklwd   m%3, m%5, m%7
    punpckhwd   m%5, m%7

    ; stage 3: interleave dwords; every qword is now one 8-byte output row
    punpckldq   m%7, m%8, m%2
    punpckhdq   m%8, m%2
    punpckldq   m%2, m%6, m%3
    punpckhdq   m%6, m%3
    punpckldq   m%3, m%9, m%4
    punpckhdq   m%9, m%4
    punpckldq   m%4, m%1, m%5
    punpckhdq   m%1, m%5

    ; first 16 output rows come from the low 128-bit lanes
    movq        [r0 + r1 * 0], xm%7
    movhps      [r0 + r1 * 1], xm%7
    movq        [r0 + r1 * 2], xm%8
    movhps      [r0 + r5 * 1], xm%8

    lea         r0, [r0 + r6]

    movq        [r0 + r1 * 0], xm%3
    movhps      [r0 + r1 * 1], xm%3
    movq        [r0 + r1 * 2], xm%9
    movhps      [r0 + r5 * 1], xm%9

    lea         r0, [r0 + r6]

    movq        [r0 + r1 * 0], xm%2
    movhps      [r0 + r1 * 1], xm%2
    movq        [r0 + r1 * 2], xm%6
    movhps      [r0 + r5 * 1], xm%6

    lea         r0, [r0 + r6]

    movq        [r0 + r1 * 0], xm%4
    movhps      [r0 + r1 * 1], xm%4
    movq        [r0 + r1 * 2], xm%1
    movhps      [r0 + r5 * 1], xm%1

    lea         r0, [r0 + r6]

    ; bring the high 128-bit lane down so the same xmm stores emit rows 16..31
    vpermq      m%8, m%8, 00001110b
    vpermq      m%7, m%7, 00001110b
    vpermq      m%6, m%6, 00001110b
    vpermq      m%3, m%3, 00001110b
    vpermq      m%9, m%9, 00001110b
    vpermq      m%2, m%2, 00001110b
    vpermq      m%4, m%4, 00001110b
    vpermq      m%1, m%1, 00001110b

    movq        [r0 + r1 * 0], xm%7
    movhps      [r0 + r1 * 1], xm%7
    movq        [r0 + r1 * 2], xm%8
    movhps      [r0 + r5 * 1], xm%8

    lea         r0, [r0 + r6]

    movq        [r0 + r1 * 0], xm%3
    movhps      [r0 + r1 * 1], xm%3
    movq        [r0 + r1 * 2], xm%9
    movhps      [r0 + r5 * 1], xm%9

    lea         r0, [r0 + r6]

    movq        [r0 + r1 * 0], xm%2
    movhps      [r0 + r1 * 1], xm%2
    movq        [r0 + r1 * 2], xm%6
    movhps      [r0 + r5 * 1], xm%6

    lea         r0, [r0 + r6]

    movq        [r0 + r1 * 0], xm%4
    movhps      [r0 + r1 * 1], xm%4
    movq        [r0 + r1 * 2], xm%1
    movhps      [r0 + r5 * 1], xm%1

    lea         r0, [r4 + 8]                ; next strip: 8 bytes right of the base kept in r4
    jmp         .end%10                     ; lea/mov/jmp leave ZF intact for the next expansion

.skip%10:
    ; direct path: %1..%8 are already eight complete 32-byte rows
    movu        [r0 + r1 * 0], m%1
    movu        [r0 + r1 * 1], m%2
    movu        [r0 + r1 * 2], m%3
    movu        [r0 + r5 * 1], m%4

    lea         r0, [r0 + r6]

    movu        [r0 + r1 * 0], m%5
    movu        [r0 + r1 * 1], m%6
    movu        [r0 + r1 * 2], m%7
    movu        [r0 + r5 * 1], m%8

    lea         r0, [r0 + r6]
.end%10:
%endmacro

;-----------------------------------------------------------------------------
; Shared conventions of the ang32_mode_*_row_* helpers below
;   r0 = output pointer, r1 = stride, r5 = 3 * stride, r6 = 4 * stride
;   r2 = reference samples; r3 = ang_table_avx2 + 32 * 16, so [r3 + k * 32]
;        is the table row the source annotates as [16 + k]
;   m7 = [pw_1024]; with 1024 in every word (as the name suggests) pmulhrsw
;        computes (x + 16) >> 5
;   r7d = 0 selects the transposed store, nonzero the direct row store
;   r4 = strip base, read only by the transposed store
; Every helper produces 16 result vectors of 32 bytes as two groups of eight,
; each group handed to TRANSPOSE_32x8_AVX2.  m0 / m2 / m3 hold byte-interleaved
; neighbouring reference samples; palignr by 2 * n steps them by n samples.
;-----------------------------------------------------------------------------

; ang32_mode_3_33_row_0_15: kernel shared by intra_pred_ang32_3 / _33.  Both
; wrappers call it twice, advancing r2 by 13 in between.
cglobal ang32_mode_3_33_row_0_15
    test        r7d, r7d                    ; ZF is consumed inside both TRANSPOSE expansions
    ; rows 0 to 7
    movu        m0, [r2 + 1]                ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu        m1, [r2 + 2]                ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    movu        m3, [r2 + 17]               ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
    movu        m4, [r2 + 18]               ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]

    punpckhbw   m2, m0, m1                  ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m0, m1                      ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    punpcklbw   m3, m4                      ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]

    pmaddubsw   m4, m0, [r3 + 10 * 32]      ; [26]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m2, [r3 + 10 * 32]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m5, m2, m0, 2
    palignr     m1, m3, m2, 2
    pmaddubsw   m5, [r3 + 4 * 32]           ; [20]
    pmulhrsw    m5, m7
    pmaddubsw   m1, [r3 + 4 * 32]
    pmulhrsw    m1, m7
    packuswb    m5, m1

    palignr     m6, m2, m0, 4
    palignr     m1, m3, m2, 4
    pmaddubsw   m6, [r3 - 2 * 32]           ; [14]
    pmulhrsw    m6, m7
    pmaddubsw   m1, [r3 - 2 * 32]
    pmulhrsw    m1, m7
    packuswb    m6, m1

    palignr     m8, m2, m0, 6
    palignr     m1, m3, m2, 6
    pmaddubsw   m8, [r3 - 8 * 32]           ; [8]
    pmulhrsw    m8, m7
    pmaddubsw   m1, [r3 - 8 * 32]
    pmulhrsw    m1, m7
    packuswb    m8, m1

    palignr     m10, m2, m0, 8
    palignr     m11, m3, m2, 8
    pmaddubsw   m9, m10, [r3 - 14 * 32]     ; [2]
    pmulhrsw    m9, m7
    pmaddubsw   m1, m11, [r3 - 14 * 32]
    pmulhrsw    m1, m7
    packuswb    m9, m1

    pmaddubsw   m10, [r3 + 12 * 32]         ; [28]
    pmulhrsw    m10, m7
    pmaddubsw   m11, [r3 + 12 * 32]
    pmulhrsw    m11, m7
    packuswb    m10, m11

    palignr     m11, m2, m0, 10
    palignr     m1, m3, m2, 10
    pmaddubsw   m11, [r3 + 6 * 32]          ; [22]
    pmulhrsw    m11, m7
    pmaddubsw   m1, [r3 + 6 * 32]
    pmulhrsw    m1, m7
    packuswb    m11, m1

    palignr     m12, m2, m0, 12
    palignr     m1, m3, m2, 12
    pmaddubsw   m12, [r3]                   ; [16]
    pmulhrsw    m12, m7
    pmaddubsw   m1, [r3]
    pmulhrsw    m1, m7
    packuswb    m12, m1

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0

    ; rows 8 to 15
    palignr     m4, m2, m0, 14
    palignr     m1, m3, m2, 14
    pmaddubsw   m4, [r3 - 6 * 32]           ; [10]
    pmulhrsw    m4, m7
    pmaddubsw   m1, [r3 - 6 * 32]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    pmaddubsw   m5, m2, [r3 - 12 * 32]      ; [4]
    pmulhrsw    m5, m7
    pmaddubsw   m1, m3, [r3 - 12 * 32]
    pmulhrsw    m1, m7
    packuswb    m5, m1

    pmaddubsw   m6, m2, [r3 + 14 * 32]      ; [30]
    pmulhrsw    m6, m7
    pmaddubsw   m1, m3, [r3 + 14 * 32]
    pmulhrsw    m1, m7
    packuswb    m6, m1

    ; m0 is no longer needed: reload it as the interleaved pairs that follow m3
    movu        m0, [r2 + 25]
    movu        m1, [r2 + 26]
    punpcklbw   m0, m1

    palignr     m8, m3, m2, 2
    palignr     m1, m0, m3, 2
    pmaddubsw   m8, [r3 + 8 * 32]           ; [24]
    pmulhrsw    m8, m7
    pmaddubsw   m1, [r3 + 8 * 32]
    pmulhrsw    m1, m7
    packuswb    m8, m1

    palignr     m9, m3, m2, 4
    palignr     m1, m0, m3, 4
    pmaddubsw   m9, [r3 + 2 * 32]           ; [18]
    pmulhrsw    m9, m7
    pmaddubsw   m1, [r3 + 2 * 32]
    pmulhrsw    m1, m7
    packuswb    m9, m1

    palignr     m10, m3, m2, 6
    palignr     m1, m0, m3, 6
    pmaddubsw   m10, [r3 - 4 * 32]          ; [12]
    pmulhrsw    m10, m7
    pmaddubsw   m1, [r3 - 4 * 32]
    pmulhrsw    m1, m7
    packuswb    m10, m1

    palignr     m11, m3, m2, 8
    palignr     m1, m0, m3, 8
    pmaddubsw   m11, [r3 - 10 * 32]         ; [6]
    pmulhrsw    m11, m7
    pmaddubsw   m1, [r3 - 10 * 32]
    pmulhrsw    m1, m7
    packuswb    m11, m1

    movu        m12, [r2 + 14]              ; last vector of the group is a plain copy, no weighting

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 8
    ret

; intra_pred_ang32_3: transposed-store variant (r7d = 0).  Adds 64 to r2
; before use and keeps the strip base in r4; the second call starts 16 bytes
; to the right with r2 advanced by 13.
INIT_YMM avx2
cglobal intra_pred_ang32_3, 3,8,13
    add         r2, 64
    lea         r3, [ang_table_avx2 + 32 * 16]
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    lea         r6, [r1 * 4]                ; r6 -> 4 * stride
    mova        m7, [pw_1024]
    mov         r4, r0
    xor         r7d, r7d

    call ang32_mode_3_33_row_0_15

    add         r4, 16
    mov         r0, r4
    add         r2, 13

    call ang32_mode_3_33_row_0_15
    RET

; intra_pred_ang32_33: direct row-store variant (r7d = 1); r0 simply keeps
; advancing inside the helper, so r4 is not set up.
INIT_YMM avx2
cglobal intra_pred_ang32_33, 3,8,13
    lea         r3, [ang_table_avx2 + 32 * 16]
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    lea         r6, [r1 * 4]                ; r6 -> 4 * stride
    mova        m7, [pw_1024]
    xor         r7d, r7d
    inc         r7d

    call ang32_mode_3_33_row_0_15

    add         r2, 13

    call ang32_mode_3_33_row_0_15
    RET

; ang32_mode_4_32_row_0_15: first 16 result vectors for intra_pred_ang32_4 / _32.
cglobal ang32_mode_4_32_row_0_15
    test        r7d, r7d                    ; ZF is consumed inside both TRANSPOSE expansions
    ; rows 0 to 7
    movu        m0, [r2 + 1]                ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu        m1, [r2 + 2]                ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    movu        m3, [r2 + 17]               ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
    movu        m4, [r2 + 18]               ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]

    punpckhbw   m2, m0, m1                  ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m0, m1                      ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    punpcklbw   m3, m4                      ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]

    pmaddubsw   m4, m0, [r3 + 5 * 32]       ; [21]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m2, [r3 + 5 * 32]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m6, m2, m0, 2
    palignr     m1, m3, m2, 2
    pmaddubsw   m5, m6, [r3 - 6 * 32]       ; [10]
    pmulhrsw    m5, m7
    pmaddubsw   m8, m1, [r3 - 6 * 32]
    pmulhrsw    m8, m7
    packuswb    m5, m8

    pmaddubsw   m6, [r3 + 15 * 32]          ; [31]
    pmulhrsw    m6, m7
    pmaddubsw   m1, [r3 + 15 * 32]
    pmulhrsw    m1, m7
    packuswb    m6, m1

    palignr     m8, m2, m0, 4
    palignr     m1, m3, m2, 4
    pmaddubsw   m8, [r3 + 4 * 32]           ; [20]
    pmulhrsw    m8, m7
    pmaddubsw   m1, [r3 + 4 * 32]
    pmulhrsw    m1, m7
    packuswb    m8, m1

    palignr     m10, m2, m0, 6
    palignr     m11, m3, m2, 6
    pmaddubsw   m9, m10, [r3 - 7 * 32]      ; [9]
    pmulhrsw    m9, m7
    pmaddubsw   m1, m11, [r3 - 7 * 32]
    pmulhrsw    m1, m7
    packuswb    m9, m1

    pmaddubsw   m10, [r3 + 14 * 32]         ; [30]
    pmulhrsw    m10, m7
    pmaddubsw   m11, [r3 + 14 * 32]
    pmulhrsw    m11, m7
    packuswb    m10, m11

    palignr     m11, m2, m0, 8
    palignr     m1, m3, m2, 8
    pmaddubsw   m11, [r3 + 3 * 32]          ; [19]
    pmulhrsw    m11, m7
    pmaddubsw   m1, [r3 + 3 * 32]
    pmulhrsw    m1, m7
    packuswb    m11, m1

    palignr     m12, m2, m0, 10
    palignr     m1, m3, m2, 10
    pmaddubsw   m12, [r3 - 8 * 32]          ; [8]
    pmulhrsw    m12, m7
    pmaddubsw   m1, [r3 - 8 * 32]
    pmulhrsw    m1, m7
    packuswb    m12, m1

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0

    ; rows 8 to 15
    palignr     m4, m2, m0, 10
    palignr     m1, m3, m2, 10
    pmaddubsw   m4, [r3 + 13 * 32]          ; [29]
    pmulhrsw    m4, m7
    pmaddubsw   m1, [r3 + 13 * 32]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m5, m2, m0, 12
    palignr     m1, m3, m2, 12
    pmaddubsw   m5, [r3 + 2 * 32]           ; [18]
    pmulhrsw    m5, m7
    pmaddubsw   m1, [r3 + 2 * 32]
    pmulhrsw    m1, m7
    packuswb    m5, m1

    palignr     m8, m2, m0, 14
    palignr     m1, m3, m2, 14
    pmaddubsw   m6, m8, [r3 - 9 * 32]       ; [7]
    pmulhrsw    m6, m7
    pmaddubsw   m9, m1, [r3 - 9 * 32]
    pmulhrsw    m9, m7
    packuswb    m6, m9

    pmaddubsw   m8, [r3 + 12 * 32]          ; [28]
    pmulhrsw    m8, m7
    pmaddubsw   m1, [r3 + 12 * 32]
    pmulhrsw    m1, m7
    packuswb    m8, m1

    pmaddubsw   m9, m2, [r3 + 1 * 32]       ; [17]
    pmulhrsw    m9, m7
    pmaddubsw   m1, m3, [r3 + 1 * 32]
    pmulhrsw    m1, m7
    packuswb    m9, m1

    ; m0 is no longer needed: reload it as the interleaved pairs that follow m3
    movu        m0, [r2 + 25]
    movu        m1, [r2 + 26]
    punpcklbw   m0, m1

    palignr     m11, m3, m2, 2
    palignr     m1, m0, m3, 2
    pmaddubsw   m10, m11, [r3 - 10 * 32]    ; [6]
    pmulhrsw    m10, m7
    pmaddubsw   m12, m1, [r3 - 10 * 32]
    pmulhrsw    m12, m7
    packuswb    m10, m12

    pmaddubsw   m11, [r3 + 11 * 32]         ; [27]
    pmulhrsw    m11, m7
    pmaddubsw   m1, [r3 + 11 * 32]
    pmulhrsw    m1, m7
    packuswb    m11, m1

    ; last vector: the sources are consumed in place (m0 first, it reads the old m3)
    palignr     m0, m3, 4
    palignr     m3, m2, 4
    pmaddubsw   m3, [r3]                    ; [16]
    pmulhrsw    m3, m7
    pmaddubsw   m0, [r3]
    pmulhrsw    m0, m7
    packuswb    m3, m0

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 3, 0, 8
    ret

; ang32_mode_4_32_row_16_31: second 16 result vectors for intra_pred_ang32_4 /
; _32; the wrappers advance r2 by 11 before calling it.
cglobal ang32_mode_4_32_row_16_31
    test        r7d, r7d                    ; ZF is consumed inside both TRANSPOSE expansions
    ; rows 0 to 7
    movu        m0, [r2 + 1]                ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu        m1, [r2 + 2]                ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    movu        m3, [r2 + 17]               ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
    movu        m4, [r2 + 18]               ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]

    punpckhbw   m2, m0, m1                  ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m0, m1                      ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    punpcklbw   m3, m4                      ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]

    pmaddubsw   m4, m0, [r3 - 11 * 32]      ; [5]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m2, [r3 - 11 * 32]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    pmaddubsw   m5, m0, [r3 + 10 * 32]      ; [26]
    pmulhrsw    m5, m7
    pmaddubsw   m1, m2, [r3 + 10 * 32]
    pmulhrsw    m1, m7
    packuswb    m5, m1

    palignr     m6, m2, m0, 2
    palignr     m1, m3, m2, 2
    pmaddubsw   m6, [r3 - 1 * 32]           ; [15]
    pmulhrsw    m6, m7
    pmaddubsw   m1, [r3 - 1 * 32]
    pmulhrsw    m1, m7
    packuswb    m6, m1

    palignr     m9, m2, m0, 4
    palignr     m10, m3, m2, 4
    pmaddubsw   m8, m9, [r3 - 12 * 32]      ; [4]
    pmulhrsw    m8, m7
    pmaddubsw   m1, m10, [r3 - 12 * 32]
    pmulhrsw    m1, m7
    packuswb    m8, m1

    pmaddubsw   m9, [r3 + 9 * 32]           ; [25]
    pmulhrsw    m9, m7
    pmaddubsw   m10, [r3 + 9 * 32]
    pmulhrsw    m10, m7
    packuswb    m9, m10

    palignr     m10, m2, m0, 6
    palignr     m11, m3, m2, 6
    pmaddubsw   m10, [r3 - 2 * 32]          ; [14]
    pmulhrsw    m10, m7
    pmaddubsw   m11, [r3 - 2 * 32]
    pmulhrsw    m11, m7
    packuswb    m10, m11

    palignr     m12, m2, m0, 8
    palignr     m1, m3, m2, 8
    pmaddubsw   m11, m12, [r3 - 13 * 32]    ; [3]
    pmulhrsw    m11, m7
    pmaddubsw   m1, [r3 - 13 * 32]
    pmulhrsw    m1, m7
    packuswb    m11, m1

    palignr     m1, m3, m2, 8               ; rebuild: m1 was consumed by the previous vector
    pmaddubsw   m12, [r3 + 8 * 32]          ; [24]
    pmulhrsw    m12, m7
    pmaddubsw   m1, [r3 + 8 * 32]
    pmulhrsw    m1, m7
    packuswb    m12, m1

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0

    ; rows 8 to 15
    palignr     m4, m2, m0, 10
    palignr     m1, m3, m2, 10
    pmaddubsw   m4, [r3 - 3 * 32]           ; [13]
    pmulhrsw    m4, m7
    pmaddubsw   m1, [r3 - 3 * 32]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m6, m2, m0, 12
    palignr     m8, m3, m2, 12
    pmaddubsw   m5, m6, [r3 - 14 * 32]      ; [2]
    pmulhrsw    m5, m7
    pmaddubsw   m1, m8, [r3 - 14 * 32]
    pmulhrsw    m1, m7
    packuswb    m5, m1

    pmaddubsw   m6, [r3 + 7 * 32]           ; [23]
    pmulhrsw    m6, m7
    pmaddubsw   m8, [r3 + 7 * 32]
    pmulhrsw    m8, m7
    packuswb    m6, m8

    palignr     m8, m2, m0, 14
    palignr     m1, m3, m2, 14
    pmaddubsw   m8, [r3 - 4 * 32]           ; [12]
    pmulhrsw    m8, m7
    pmaddubsw   m1, [r3 - 4 * 32]
    pmulhrsw    m1, m7
    packuswb    m8, m1

    pmaddubsw   m9, m2, [r3 - 15 * 32]      ; [1]
    pmulhrsw    m9, m7
    pmaddubsw   m1, m3, [r3 - 15 * 32]
    pmulhrsw    m1, m7
    packuswb    m9, m1

    pmaddubsw   m10, m2, [r3 + 6 * 32]      ; [22]
    pmulhrsw    m10, m7
    pmaddubsw   m1, m3, [r3 + 6 * 32]
    pmulhrsw    m1, m7
    packuswb    m10, m1

    ; m0 is no longer needed: reload it as the interleaved pairs that follow m3
    movu        m0, [r2 + 25]
    movu        m1, [r2 + 26]
    punpcklbw   m0, m1

    palignr     m11, m3, m2, 2
    palignr     m1, m0, m3, 2
    pmaddubsw   m11, [r3 - 5 * 32]          ; [11]
    pmulhrsw    m11, m7
    pmaddubsw   m1, [r3 - 5 * 32]
    pmulhrsw    m1, m7
    packuswb    m11, m1

    movu        m12, [r2 + 11]              ; last vector of the group is a plain copy, no weighting

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 8
    ret

; intra_pred_ang32_4: transposed-store variant (r7d = 0), r2 offset by 64,
; strip base kept in r4; r2 advances by 11 between the two helpers.
INIT_YMM avx2
cglobal intra_pred_ang32_4, 3,8,13
    add         r2, 64
    lea         r3, [ang_table_avx2 + 32 * 16]
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    lea         r6, [r1 * 4]                ; r6 -> 4 * stride
    mova        m7, [pw_1024]
    mov         r4, r0
    xor         r7d, r7d

    call ang32_mode_4_32_row_0_15

    add         r4, 16
    mov         r0, r4
    add         r2, 11

    call ang32_mode_4_32_row_16_31
    RET

; intra_pred_ang32_32: direct row-store variant (r7d = 1) of the same helpers.
INIT_YMM avx2
cglobal intra_pred_ang32_32, 3,8,13
    lea         r3, [ang_table_avx2 + 32 * 16]
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    lea         r6, [r1 * 4]                ; r6 -> 4 * stride
    mova        m7, [pw_1024]
    xor         r7d, r7d
    inc         r7d

    call ang32_mode_4_32_row_0_15

    add         r2, 11

    call ang32_mode_4_32_row_16_31
    RET

; ang32_mode_5_31_row_0_15: first 16 result vectors for intra_pred_ang32_5 / _31.
cglobal ang32_mode_5_31_row_0_15
    test        r7d, r7d                    ; ZF is consumed inside both TRANSPOSE expansions
    ; rows 0 to 7
    movu        m0, [r2 + 1]                ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu        m1, [r2 + 2]                ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    movu        m3, [r2 + 17]               ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
    movu        m4, [r2 + 18]               ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]

    punpckhbw   m2, m0, m1                  ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m0, m1                      ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    punpcklbw   m3, m4                      ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]

    pmaddubsw   m4, m0, [r3 + 1 * 32]       ; [17]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m2, [r3 + 1 * 32]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m6, m2, m0, 2
    palignr     m1, m3, m2, 2
    pmaddubsw   m5, m6, [r3 - 14 * 32]      ; [2]
    pmulhrsw    m5, m7
    pmaddubsw   m8, m1, [r3 - 14 * 32]
    pmulhrsw    m8, m7
    packuswb    m5, m8

    pmaddubsw   m6, [r3 + 3 * 32]           ; [19]
    pmulhrsw    m6, m7
    pmaddubsw   m1, [r3 + 3 * 32]
    pmulhrsw    m1, m7
    packuswb    m6, m1

    palignr     m9, m2, m0, 4
    palignr     m10, m3, m2, 4
    pmaddubsw   m8, m9, [r3 - 12 * 32]      ; [4]
    pmulhrsw    m8, m7
    pmaddubsw   m1, m10, [r3 - 12 * 32]
    pmulhrsw    m1, m7
    packuswb    m8, m1

    pmaddubsw   m9, [r3 + 5 * 32]           ; [21]
    pmulhrsw    m9, m7
    pmaddubsw   m10, [r3 + 5 * 32]
    pmulhrsw    m10, m7
    packuswb    m9, m10

    palignr     m11, m2, m0, 6
    palignr     m12, m3, m2, 6
    pmaddubsw   m10, m11, [r3 - 10 * 32]    ; [6]
    pmulhrsw    m10, m7
    pmaddubsw   m1, m12, [r3 - 10 * 32]
    pmulhrsw    m1, m7
    packuswb    m10, m1

    pmaddubsw   m11, [r3 + 7 * 32]          ; [23]
    pmulhrsw    m11, m7
    pmaddubsw   m12, [r3 + 7 * 32]
    pmulhrsw    m12, m7
    packuswb    m11, m12

    palignr     m12, m2, m0, 8
    palignr     m1, m3, m2, 8
    pmaddubsw   m12, [r3 - 8 * 32]          ; [8]
    pmulhrsw    m12, m7
    pmaddubsw   m1, [r3 - 8 * 32]
    pmulhrsw    m1, m7
    packuswb    m12, m1

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0

    ; rows 8 to 15
    palignr     m4, m2, m0, 8
    palignr     m1, m3, m2, 8
    pmaddubsw   m4, [r3 + 9 * 32]           ; [25]
    pmulhrsw    m4, m7
    pmaddubsw   m1, [r3 + 9 * 32]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m6, m2, m0, 10
    palignr     m1, m3, m2, 10
    pmaddubsw   m5, m6, [r3 - 6 * 32]       ; [10]
    pmulhrsw    m5, m7
    pmaddubsw   m8, m1, [r3 - 6 * 32]
    pmulhrsw    m8, m7
    packuswb    m5, m8

    pmaddubsw   m6, [r3 + 11 * 32]          ; [27]
    pmulhrsw    m6, m7
    pmaddubsw   m1, [r3 + 11 * 32]
    pmulhrsw    m1, m7
    packuswb    m6, m1

    palignr     m9, m2, m0, 12
    palignr     m1, m3, m2, 12
    pmaddubsw   m8, m9, [r3 - 4 * 32]       ; [12]
    pmulhrsw    m8, m7
    pmaddubsw   m10, m1, [r3 - 4 * 32]
    pmulhrsw    m10, m7
    packuswb    m8, m10

    pmaddubsw   m9, [r3 + 13 * 32]          ; [29]
    pmulhrsw    m9, m7
    pmaddubsw   m1, [r3 + 13 * 32]
    pmulhrsw    m1, m7
    packuswb    m9, m1

    palignr     m11, m2, m0, 14
    palignr     m1, m3, m2, 14
    pmaddubsw   m10, m11, [r3 - 2 * 32]     ; [14]
    pmulhrsw    m10, m7
    pmaddubsw   m12, m1, [r3 - 2 * 32]
    pmulhrsw    m12, m7
    packuswb    m10, m12

    pmaddubsw   m11, [r3 + 15 * 32]         ; [31]
    pmulhrsw    m11, m7
    pmaddubsw   m1, [r3 + 15 * 32]
    pmulhrsw    m1, m7
    packuswb    m11, m1

    ; last vector: m2 / m3 are consumed in place
    pmaddubsw   m2, [r3]                    ; [16]
    pmulhrsw    m2, m7
    pmaddubsw   m3, [r3]
    pmulhrsw    m3, m7
    packuswb    m2, m3

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 2, 0, 8
    ret

; ang32_mode_5_31_row_16_31: second 16 result vectors for intra_pred_ang32_5 /
; _31; the wrappers advance r2 by 9 before calling it.
cglobal ang32_mode_5_31_row_16_31
    test        r7d, r7d                    ; ZF is consumed inside both TRANSPOSE expansions
    ; rows 0 to 7
    movu        m0, [r2 + 1]                ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu        m1, [r2 + 2]                ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    movu        m3, [r2 + 17]               ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
    movu        m4, [r2 + 18]               ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]

    punpckhbw   m2, m0, m1                  ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m0, m1                      ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    punpcklbw   m3, m4                      ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]

    pmaddubsw   m4, m0, [r3 - 15 * 32]      ; [1]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m2, [r3 - 15 * 32]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    pmaddubsw   m5, m0, [r3 + 2 * 32]       ; [18]
    pmulhrsw    m5, m7
    pmaddubsw   m8, m2, [r3 + 2 * 32]
    pmulhrsw    m8, m7
    packuswb    m5, m8

    palignr     m8, m2, m0, 2
    palignr     m9, m3, m2, 2
    pmaddubsw   m6, m8, [r3 - 13 * 32]      ; [3]
    pmulhrsw    m6, m7
    pmaddubsw   m1, m9, [r3 - 13 * 32]
    pmulhrsw    m1, m7
    packuswb    m6, m1

    pmaddubsw   m8, [r3 + 4 * 32]           ; [20]
    pmulhrsw    m8, m7
    pmaddubsw   m9, [r3 + 4 * 32]
    pmulhrsw    m9, m7
    packuswb    m8, m9

    palignr     m10, m2, m0, 4
    palignr     m1, m3, m2, 4
    pmaddubsw   m9, m10, [r3 - 11 * 32]     ; [5]
    pmulhrsw    m9, m7
    pmaddubsw   m11, m1, [r3 - 11 * 32]
    pmulhrsw    m11, m7
    packuswb    m9, m11

    pmaddubsw   m10, [r3 + 6 * 32]          ; [22]
    pmulhrsw    m10, m7
    pmaddubsw   m1, [r3 + 6 * 32]
    pmulhrsw    m1, m7
    packuswb    m10, m1

    palignr     m12, m2, m0, 6
    palignr     m1, m3, m2, 6
    pmaddubsw   m11, m12, [r3 - 9 * 32]     ; [7]
    pmulhrsw    m11, m7
    pmaddubsw   m1, [r3 - 9 * 32]
    pmulhrsw    m1, m7
    packuswb    m11, m1

    palignr     m1, m3, m2, 6               ; rebuild: m1 was consumed by the previous vector
    pmaddubsw   m12, [r3 + 8 * 32]          ; [24]
    pmulhrsw    m12, m7
    pmaddubsw   m1, [r3 + 8 * 32]
    pmulhrsw    m1, m7
    packuswb    m12, m1

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0

    ; rows 8 to 15
    palignr     m5, m2, m0, 8
    palignr     m8, m3, m2, 8
    pmaddubsw   m4, m5, [r3 - 7 * 32]       ; [9]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m8, [r3 - 7 * 32]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    pmaddubsw   m5, [r3 + 10 * 32]          ; [26]
    pmulhrsw    m5, m7
    pmaddubsw   m8, [r3 + 10 * 32]
    pmulhrsw    m8, m7
    packuswb    m5, m8

    palignr     m8, m2, m0, 10
    palignr     m9, m3, m2, 10
    pmaddubsw   m6, m8, [r3 - 5 * 32]       ; [11]
    pmulhrsw    m6, m7
    pmaddubsw   m1, m9, [r3 - 5 * 32]
    pmulhrsw    m1, m7
    packuswb    m6, m1

    pmaddubsw   m8, [r3 + 12 * 32]          ; [28]
    pmulhrsw    m8, m7
    pmaddubsw   m9, [r3 + 12 * 32]
    pmulhrsw    m9, m7
    packuswb    m8, m9

    palignr     m10, m2, m0, 12
    palignr     m11, m3, m2, 12
    pmaddubsw   m9, m10, [r3 - 3 * 32]      ; [13]
    pmulhrsw    m9, m7
    pmaddubsw   m1, m11, [r3 - 3 * 32]
    pmulhrsw    m1, m7
    packuswb    m9, m1

    pmaddubsw   m10, [r3 + 14 * 32]         ; [30]
    pmulhrsw    m10, m7
    pmaddubsw   m11, [r3 + 14 * 32]
    pmulhrsw    m11, m7
    packuswb    m10, m11

    palignr     m11, m2, m0, 14
    palignr     m1, m3, m2, 14
    pmaddubsw   m11, [r3 - 1 * 32]          ; [15]
    pmulhrsw    m11, m7
    pmaddubsw   m1, [r3 - 1 * 32]
    pmulhrsw    m1, m7
    packuswb    m11, m1

    movu        m2, [r2 + 9]                ; last vector of the group is a plain copy, no weighting

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 2, 0, 8
    ret

; intra_pred_ang32_5: transposed-store variant (r7d = 0), r2 offset by 64,
; strip base kept in r4; r2 advances by 9 between the two helpers.
INIT_YMM avx2
cglobal intra_pred_ang32_5, 3,8,13
    add         r2, 64
    lea         r3, [ang_table_avx2 + 32 * 16]
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    lea         r6, [r1 * 4]                ; r6 -> 4 * stride
    mova        m7, [pw_1024]
    mov         r4, r0
    xor         r7d, r7d

    call ang32_mode_5_31_row_0_15

    add         r4, 16
    mov         r0, r4
    add         r2, 9

    call ang32_mode_5_31_row_16_31
    RET

; intra_pred_ang32_31: direct row-store variant (r7d = 1) of the same helpers.
INIT_YMM avx2
cglobal intra_pred_ang32_31, 3,8,13
    lea         r3, [ang_table_avx2 + 32 * 16]
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    lea         r6, [r1 * 4]                ; r6 -> 4 * stride
    mova        m7, [pw_1024]
    xor         r7d, r7d
    inc         r7d

    call ang32_mode_5_31_row_0_15

    add         r2, 9

    call ang32_mode_5_31_row_16_31
    RET

; ang32_mode_6_30_row_0_15: first 16 result vectors for intra_pred_ang32_6 / _30.
cglobal ang32_mode_6_30_row_0_15
    test        r7d, r7d                    ; ZF is consumed inside both TRANSPOSE expansions
    ; rows 0 to 7
    movu        m0, [r2 + 1]                ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu        m1, [r2 + 2]                ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    movu        m3, [r2 + 17]               ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
    movu        m4, [r2 + 18]               ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]

    punpckhbw   m2, m0, m1                  ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m0, m1                      ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    punpcklbw   m3, m4                      ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]

    pmaddubsw   m4, m0, [r3 - 3 * 32]       ; [13]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m2, [r3 - 3 * 32]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    pmaddubsw   m5, m0, [r3 + 10 * 32]      ; [26]
    pmulhrsw    m5, m7
    pmaddubsw   m8, m2, [r3 + 10 * 32]
    pmulhrsw    m8, m7
    packuswb    m5, m8

    palignr     m8, m2, m0, 2
    palignr     m1, m3, m2, 2
    pmaddubsw   m6, m8, [r3 - 9 * 32]       ; [7]
    pmulhrsw    m6, m7
    pmaddubsw   m9, m1, [r3 - 9 * 32]
    pmulhrsw    m9, m7
    packuswb    m6, m9

    pmaddubsw   m8, [r3 + 4 * 32]           ; [20]
    pmulhrsw    m8, m7
    pmaddubsw   m1, [r3 + 4 * 32]
    pmulhrsw    m1, m7
    packuswb    m8, m1

    palignr     m11, m2, m0, 4
    palignr     m1, m3, m2, 4
    pmaddubsw   m9, m11, [r3 - 15 * 32]     ; [1]
    pmulhrsw    m9, m7
    pmaddubsw   m12, m1, [r3 - 15 * 32]
    pmulhrsw    m12, m7
    packuswb    m9, m12

    pmaddubsw   m10, m11, [r3 - 2 * 32]     ; [14]
    pmulhrsw    m10, m7
    pmaddubsw   m12, m1, [r3 - 2 * 32]
    pmulhrsw    m12, m7
    packuswb    m10, m12

    pmaddubsw   m11, [r3 + 11 * 32]         ; [27]
    pmulhrsw    m11, m7
    pmaddubsw   m1, [r3 + 11 * 32]
    pmulhrsw    m1, m7
    packuswb    m11, m1

    palignr     m12, m2, m0, 6
    palignr     m1, m3, m2, 6
    pmaddubsw   m12, [r3 - 8 * 32]          ; [8]
    pmulhrsw    m12, m7
    pmaddubsw   m1, [r3 - 8 * 32]
    pmulhrsw    m1, m7
    packuswb    m12, m1

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0

    ; rows 8 to 15
    palignr     m4, m2, m0, 6
    palignr     m1, m3, m2, 6
    pmaddubsw   m4, [r3 + 5 * 32]           ; [21]
    pmulhrsw    m4, m7
    pmaddubsw   m1, [r3 + 5 * 32]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m8, m2, m0, 8
    palignr     m1, m3, m2, 8
    pmaddubsw   m5, m8, [r3 - 14 * 32]      ; [2]
    pmulhrsw    m5, m7
    pmaddubsw   m9, m1, [r3 - 14 * 32]
    pmulhrsw    m9, m7
    packuswb    m5, m9

    pmaddubsw   m6, m8, [r3 - 1 * 32]       ; [15]
    pmulhrsw    m6, m7
    pmaddubsw   m9, m1, [r3 - 1 * 32]
    pmulhrsw    m9, m7
    packuswb    m6, m9

    pmaddubsw   m8, [r3 + 12 * 32]          ; [28]
    pmulhrsw    m8, m7
    pmaddubsw   m1, [r3 + 12 * 32]
    pmulhrsw    m1, m7
    packuswb    m8, m1

    palignr     m10, m2, m0, 10
    palignr     m1, m3, m2, 10
    pmaddubsw   m9, m10, [r3 - 7 * 32]      ; [9]
    pmulhrsw    m9, m7
    pmaddubsw   m11, m1, [r3 - 7 * 32]
    pmulhrsw    m11, m7
    packuswb    m9, m11

    pmaddubsw   m10, [r3 + 6 * 32]          ; [22]
    pmulhrsw    m10, m7
    pmaddubsw   m1, m1, [r3 + 6 * 32]
    pmulhrsw    m1, m7
    packuswb    m10, m1

    ; from here m2 / m3 are shifted in place (m3 first, it reads the old m2)
    palignr     m3, m2, 12
    palignr     m2, m0, 12
    pmaddubsw   m11, m2, [r3 - 13 * 32]     ; [3]
    pmulhrsw    m11, m7
    pmaddubsw   m1, m3, [r3 - 13 * 32]
    pmulhrsw    m1, m7
    packuswb    m11, m1

    pmaddubsw   m2, [r3]                    ; [16]
    pmulhrsw    m2, m7
    pmaddubsw   m3, [r3]
    pmulhrsw    m3, m7
    packuswb    m2, m3

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 2, 0, 8
    ret

; ang32_mode_6_30_row_16_31: second 16 result vectors for intra_pred_ang32_6 /
; _30; the wrappers advance r2 by 6 before calling it.
cglobal ang32_mode_6_30_row_16_31
    test        r7d, r7d                    ; ZF is consumed inside both TRANSPOSE expansions
    ; rows 0 to 7
    movu        m0, [r2 + 1]                ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu        m1, [r2 + 2]                ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    movu        m3, [r2 + 17]               ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
    movu        m4, [r2 + 18]               ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]

    punpckhbw   m2, m0, m1                  ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m0, m1                      ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    punpcklbw   m3, m4                      ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]

    pmaddubsw   m4, m0, [r3 + 13 * 32]      ; [29]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m2, [r3 + 13 * 32]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m6, m2, m0, 2
    palignr     m1, m3, m2, 2
    pmaddubsw   m5, m6, [r3 - 6 * 32]       ; [10]
    pmulhrsw    m5, m7
    pmaddubsw   m8, m1, [r3 - 6 * 32]
    pmulhrsw    m8, m7
    packuswb    m5, m8

    pmaddubsw   m6, [r3 + 7 * 32]           ; [23]
    pmulhrsw    m6, m7
    pmaddubsw   m1, [r3 + 7 * 32]
    pmulhrsw    m1, m7
    packuswb    m6, m1

    palignr     m10, m2, m0, 4
    palignr     m1, m3, m2, 4
    pmaddubsw   m8, m10, [r3 - 12 * 32]     ; [4]
    pmulhrsw    m8, m7
    pmaddubsw   m11, m1, [r3 - 12 * 32]
    pmulhrsw    m11, m7
    packuswb    m8, m11

    pmaddubsw   m9, m10, [r3 + 1 * 32]      ; [17]
    pmulhrsw    m9, m7
    pmaddubsw   m11, m1, [r3 + 1 * 32]
    pmulhrsw    m11, m7
    packuswb    m9, m11

    pmaddubsw   m10, [r3 + 14 * 32]         ; [30]
    pmulhrsw    m10, m7
    pmaddubsw   m1, [r3 + 14 * 32]
    pmulhrsw    m1, m7
    packuswb    m10, m1

    palignr     m12, m2, m0, 6
    palignr     m1, m3, m2, 6
    pmaddubsw   m11, m12, [r3 - 5 * 32]     ; [11]
    pmulhrsw    m11, m7
    pmaddubsw   m1, [r3 - 5 * 32]
    pmulhrsw    m1, m7
    packuswb    m11, m1

    palignr     m1, m3, m2, 6               ; rebuild: m1 was consumed by the previous vector
    pmaddubsw   m12, [r3 + 8 * 32]          ; [24]
    pmulhrsw    m12, m7
    pmaddubsw   m1, [r3 + 8 * 32]
    pmulhrsw    m1, m7
    packuswb    m12, m1

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0

    ; rows 8 to 15
    palignr     m6, m2, m0, 8
    palignr     m1, m3, m2, 8
    pmaddubsw   m4, m6, [r3 - 11 * 32]      ; [5]
    pmulhrsw    m4, m7
    pmaddubsw   m8, m1, [r3 - 11 * 32]
    pmulhrsw    m8, m7
    packuswb    m4, m8

    pmaddubsw   m5, m6, [r3 + 2 * 32]       ; [18]
    pmulhrsw    m5, m7
    pmaddubsw   m9, m1, [r3 + 2 * 32]
    pmulhrsw    m9, m7
    packuswb    m5, m9

    pmaddubsw   m6, [r3 + 15 * 32]          ; [31]
    pmulhrsw    m6, m7
    pmaddubsw   m1, [r3 + 15 * 32]
    pmulhrsw    m1, m7
    packuswb    m6, m1

    palignr     m9, m2, m0, 10
    palignr     m1, m3, m2, 10
    pmaddubsw   m8, m9, [r3 - 4 * 32]       ; [12]
    pmulhrsw    m8, m7
    pmaddubsw   m10, m1, [r3 - 4 * 32]
    pmulhrsw    m10, m7
    packuswb    m8, m10

    pmaddubsw   m9, [r3 + 9 * 32]           ; [25]
    pmulhrsw    m9, m7
    pmaddubsw   m1, [r3 + 9 * 32]
    pmulhrsw    m1, m7
    packuswb    m9, m1

    ; from here m2 / m3 are shifted in place (m3 first, it reads the old m2)
    palignr     m3, m2, 12
    palignr     m2, m0, 12
    pmaddubsw   m10, m2, [r3 - 10 * 32]     ; [6]
    pmulhrsw    m10, m7
    pmaddubsw   m1, m3, [r3 - 10 * 32]
    pmulhrsw    m1, m7
    packuswb    m10, m1

    pmaddubsw   m2, [r3 + 3 * 32]           ; [19]
    pmulhrsw    m2, m7
    pmaddubsw   m3, [r3 + 3 * 32]
    pmulhrsw    m3, m7
    packuswb    m2, m3

    movu        m3, [r2 + 8]                ; [0]

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 2, 3, 0, 8
    ret

; intra_pred_ang32_6: transposed-store variant (r7d = 0), r2 offset by 64,
; strip base kept in r4; r2 advances by 6 between the two helpers.
INIT_YMM avx2
cglobal intra_pred_ang32_6, 3,8,13
    add         r2, 64
    lea         r3, [ang_table_avx2 + 32 * 16]
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    lea         r6, [r1 * 4]                ; r6 -> 4 * stride
    mova        m7, [pw_1024]
    mov         r4, r0
    xor         r7d, r7d

    call ang32_mode_6_30_row_0_15

    add         r4, 16
    mov         r0, r4
    add         r2, 6

    call ang32_mode_6_30_row_16_31
    RET

; intra_pred_ang32_30: direct row-store variant (r7d = 1) of the same helpers.
INIT_YMM avx2
cglobal intra_pred_ang32_30, 3,8,13
    lea         r3, [ang_table_avx2 + 32 * 16]
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    lea         r6, [r1 * 4]                ; r6 -> 4 * stride
    mova        m7, [pw_1024]
    xor         r7d, r7d
    inc         r7d

    call ang32_mode_6_30_row_0_15

    add         r2, 6

    call ang32_mode_6_30_row_16_31
    RET

; ang32_mode_7_29_row_0_15: first 16 result vectors for intra_pred_ang32_7 / _29.
cglobal ang32_mode_7_29_row_0_15
    test        r7d, r7d                    ; ZF is consumed inside both TRANSPOSE expansions
    ; rows 0 to 7
    movu        m0, [r2 + 1]                ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu        m1, [r2 + 2]                ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    movu        m3, [r2 + 17]               ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
    movu        m4, [r2 + 18]               ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]

    punpckhbw   m2, m0, m1                  ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m0, m1                      ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    punpcklbw   m3, m4                      ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]

    pmaddubsw   m4, m0, [r3 - 7 * 32]       ; [9]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m2, [r3 - 7 * 32]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    pmaddubsw   m5, m0, [r3 + 2 * 32]       ; [18]
    pmulhrsw    m5, m7
    pmaddubsw   m8, m2, [r3 + 2 * 32]
    pmulhrsw    m8, m7
    packuswb    m5, m8

    pmaddubsw   m6, m0, [r3 + 11 * 32]      ; [27]
    pmulhrsw    m6, m7
    pmaddubsw   m9, m2, [r3 + 11 * 32]
    pmulhrsw    m9, m7
    packuswb    m6, m9

    palignr     m11, m2, m0, 2
    palignr     m1, m3, m2, 2
    pmaddubsw   m8, m11, [r3 - 12 * 32]     ; [4]
    pmulhrsw    m8, m7
    pmaddubsw   m12, m1, [r3 - 12 * 32]
    pmulhrsw    m12, m7
    packuswb    m8, m12

    pmaddubsw   m9, m11, [r3 - 3 * 32]      ; [13]
    pmulhrsw    m9, m7
    pmaddubsw   m12, m1, [r3 - 3 * 32]
    pmulhrsw    m12, m7
    packuswb    m9, m12

    pmaddubsw   m10, m11, [r3 + 6 * 32]     ; [22]
    pmulhrsw    m10, m7
    pmaddubsw   m12, m1, [r3 + 6 * 32]
    pmulhrsw    m12, m7
    packuswb    m10, m12

    pmaddubsw   m11, [r3 + 15 * 32]         ; [31]
    pmulhrsw    m11, m7
    pmaddubsw   m1, [r3 + 15 * 32]
    pmulhrsw    m1, m7
    packuswb    m11, m1

    palignr     m12, m2, m0, 4
    palignr     m1, m3, m2, 4
    pmaddubsw   m12, [r3 - 8 * 32]          ; [8]
    pmulhrsw    m12, m7
    pmaddubsw   m1, [r3 - 8 * 32]
    pmulhrsw    m1, m7
    packuswb    m12, m1

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0

    ; rows 8 to 15
    palignr     m5, m2, m0, 4
    palignr     m1, m3, m2, 4
    pmaddubsw   m4, m5, [r3 + 1 * 32]       ; [17]
    pmulhrsw    m4, m7
    pmaddubsw   m8, m1, [r3 + 1 * 32]
    pmulhrsw    m8, m7
    packuswb    m4, m8

    pmaddubsw   m5, [r3 + 10 * 32]          ; [26]
    pmulhrsw    m5, m7
    pmaddubsw   m1, [r3 + 10 * 32]
    pmulhrsw    m1, m7
    packuswb    m5, m1

    palignr     m10, m2, m0, 6
    palignr     m1, m3, m2, 6
    pmaddubsw   m6, m10, [r3 - 13 * 32]     ; [3]
    pmulhrsw    m6, m7
    pmaddubsw   m9, m1, [r3 - 13 * 32]
    pmulhrsw    m9, m7
    packuswb    m6, m9

    pmaddubsw   m8, m10, [r3 - 4 * 32]      ; [12]
    pmulhrsw    m8, m7
    pmaddubsw   m11, m1, [r3 - 4 * 32]
    pmulhrsw    m11, m7
    packuswb    m8, m11

    pmaddubsw   m9, m10, [r3 + 5 * 32]      ; [21]
    pmulhrsw    m9, m7
    pmaddubsw   m11, m1, [r3 + 5 * 32]
    pmulhrsw    m11, m7
    packuswb    m9, m11

    pmaddubsw   m10, [r3 + 14 * 32]         ; [30]
    pmulhrsw    m10, m7
    pmaddubsw   m1, [r3 + 14 * 32]
    pmulhrsw    m1, m7
    packuswb    m10, m1

    ; from here m2 / m3 are shifted in place (m3 first, it reads the old m2)
    palignr     m3, m2, 8
    palignr     m2, m0, 8
    pmaddubsw   m11, m2, [r3 - 9 * 32]      ; [7]
    pmulhrsw    m11, m7
    pmaddubsw   m1, m3, [r3 - 9 * 32]
    pmulhrsw    m1, m7
    packuswb    m11, m1

    pmaddubsw   m2, [r3]                    ; [16]
    pmulhrsw    m2, m7
    pmaddubsw   m3, [r3]
    pmulhrsw    m3, m7
    packuswb    m2, m3

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 2, 0, 8
    ret

; ang32_mode_7_29_row_16_31: second 16 result vectors for intra_pred_ang32_7 /
; _29; the wrappers advance r2 by 4 before calling it.
cglobal ang32_mode_7_29_row_16_31
    test        r7d, r7d                    ; ZF is consumed inside both TRANSPOSE expansions
    ; rows 0 to 7
    movu        m0, [r2 + 1]                ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu        m1, [r2 + 2]                ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    movu        m3, [r2 + 17]               ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
    movu        m4, [r2 + 18]               ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]

    punpckhbw   m2, m0, m1                  ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m0, m1                      ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    punpcklbw   m3, m4                      ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]

    pmaddubsw   m4, m0, [r3 + 9 * 32]       ; [25]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m2, [r3 + 9 * 32]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m9, m2, m0, 2
    palignr     m1, m3, m2, 2
    pmaddubsw   m5, m9, [r3 - 14 * 32]      ; [2]
    pmulhrsw    m5, m7
    pmaddubsw   m8, m1, [r3 - 14 * 32]
    pmulhrsw    m8, m7
    packuswb    m5, m8

    pmaddubsw   m6, m9, [r3 - 5 * 32]       ; [11]
    pmulhrsw    m6, m7
    pmaddubsw   m10, m1, [r3 - 5 * 32]
    pmulhrsw    m10, m7
    packuswb    m6, m10

    pmaddubsw   m8, m9, [r3 + 4 * 32]       ; [20]
    pmulhrsw    m8, m7
    pmaddubsw   m10, m1, [r3 + 4 * 32]
    pmulhrsw    m10, m7
    packuswb    m8, m10

    pmaddubsw   m9, [r3 + 13 * 32]          ; [29]
    pmulhrsw    m9, m7
    pmaddubsw   m1, [r3 + 13 * 32]
    pmulhrsw    m1, m7
    packuswb    m9, m1

    palignr     m12, m2, m0, 4
    palignr     m1, m3, m2, 4
    pmaddubsw   m10, m12, [r3 - 10 * 32]    ; [6]
    pmulhrsw    m10, m7
    pmaddubsw   m11, m1, [r3 - 10 * 32]
    pmulhrsw    m11, m7
    packuswb    m10, m11

    pmaddubsw   m11, m12, [r3 - 1 * 32]     ; [15]
    pmulhrsw    m11, m7
    pmaddubsw   m1, [r3 - 1 * 32]
    pmulhrsw    m1, m7
    packuswb    m11, m1

    palignr     m1, m3, m2, 4               ; rebuild: m1 was consumed by the previous vector
    pmaddubsw   m12, [r3 + 8 * 32]          ; [24]
    pmulhrsw    m12, m7
    pmaddubsw   m1, [r3 + 8 * 32]
    pmulhrsw    m1, m7
    packuswb    m12, m1

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0

    ; rows 8 to 15
    palignr     m8, m2, m0, 6
    palignr     m1, m3, m2, 6
    pmaddubsw   m4, m8, [r3 - 15 * 32]      ; [1]
    pmulhrsw    m4, m7
    pmaddubsw   m9, m1, [r3 - 15 * 32]
    pmulhrsw    m9, m7
    packuswb    m4, m9

    pmaddubsw   m5, m8, [r3 - 6 * 32]       ; [10]
    pmulhrsw    m5, m7
    pmaddubsw   m9, m1, [r3 - 6 * 32]
    pmulhrsw    m9, m7
    packuswb    m5, m9

    pmaddubsw   m6, m8, [r3 + 3 * 32]       ; [19]
    pmulhrsw    m6, m7
    pmaddubsw   m9, m1, [r3 + 3 * 32]
    pmulhrsw    m9, m7
    packuswb    m6, m9

    pmaddubsw   m8, [r3 + 12 * 32]          ; [28]
    pmulhrsw    m8, m7
    pmaddubsw   m1, [r3 + 12 * 32]
    pmulhrsw    m1, m7
    packuswb    m8, m1

    ; from here m2 / m3 are shifted in place (m3 first, it reads the old m2)
    palignr     m3, m2, 8
    palignr     m2, m0, 8
    pmaddubsw   m9, m2, [r3 - 11 * 32]      ; [5]
    pmulhrsw    m9, m7
    pmaddubsw   m1, m3, [r3 - 11 * 32]
    pmulhrsw    m1, m7
    packuswb    m9, m1

    pmaddubsw   m10, m2, [r3 - 2 * 32]      ; [14]
    pmulhrsw    m10, m7
    pmaddubsw   m1, m3, [r3 - 2 * 32]
    pmulhrsw    m1, m7
    packuswb    m10, m1

    pmaddubsw   m2, [r3 + 7 * 32]           ; [23]
    pmulhrsw    m2, m7
    pmaddubsw   m3, [r3 + 7 * 32]
    pmulhrsw    m3, m7
    packuswb    m2, m3

    movu        m1, [r2 + 6]                ; [0]

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 2, 1, 0, 8
    ret

; intra_pred_ang32_7: transposed-store variant (r7d = 0), r2 offset by 64,
; strip base kept in r4; r2 advances by 4 between the two helpers.
INIT_YMM avx2
cglobal intra_pred_ang32_7, 3,8,13
    add         r2, 64
    lea         r3, [ang_table_avx2 + 32 * 16]
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    lea         r6, [r1 * 4]                ; r6 -> 4 * stride
    mova        m7, [pw_1024]
    mov         r4, r0
    xor         r7d, r7d

    call ang32_mode_7_29_row_0_15

    add         r4, 16
    mov         r0, r4
    add         r2, 4

    call ang32_mode_7_29_row_16_31
    RET

; intra_pred_ang32_29: direct row-store variant (r7d = 1) of the same helpers.
INIT_YMM avx2
cglobal intra_pred_ang32_29, 3,8,13
    lea         r3, [ang_table_avx2 + 32 * 16]
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    lea         r6, [r1 * 4]                ; r6 -> 4 * stride
    mova        m7, [pw_1024]
    xor         r7d, r7d
    inc         r7d

    call ang32_mode_7_29_row_0_15

    add         r2, 4

    call ang32_mode_7_29_row_16_31
    RET

; NOTE(review): ang32_mode_8_28_avx2 continues past the end of this chunk; its
; opening lines are reproduced unchanged, including the comment that is cut
; off by the chunk boundary.
cglobal ang32_mode_8_28_avx2
    test        r7d, r7d
    ; rows 0 to 7
    movu        m0, [r2 + 1]                ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu        m1, [r2 + 2]                ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    movu        m3, [r2 + 17]               ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
    movu        m4, [r2 + 18]               ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28
27 26 25 24 23 22 21 20 19 18] punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] punpcklbw m3, m4 ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17] pmaddubsw m4, m0, [r3 - 11 * 32] ; [5] pmulhrsw m4, m7 pmaddubsw m1, m2, [r3 - 11 * 32] pmulhrsw m1, m7 packuswb m4, m1 pmaddubsw m5, m0, [r3 - 6 * 32] ; [10] pmulhrsw m5, m7 pmaddubsw m8, m2, [r3 - 6 * 32] pmulhrsw m8, m7 packuswb m5, m8 pmaddubsw m6, m0, [r3 - 1 * 32] ; [15] pmulhrsw m6, m7 pmaddubsw m9, m2, [r3 - 1 * 32] pmulhrsw m9, m7 packuswb m6, m9 pmaddubsw m8, m0, [r3 + 4 * 32] ; [20] pmulhrsw m8, m7 pmaddubsw m12, m2, [r3 + 4 * 32] pmulhrsw m12, m7 packuswb m8, m12 pmaddubsw m9, m0, [r3 + 9 * 32] ; [25] pmulhrsw m9, m7 pmaddubsw m12, m2, [r3 + 9 * 32] pmulhrsw m12, m7 packuswb m9, m12 pmaddubsw m10, m0, [r3 + 14 * 32] ; [30] pmulhrsw m10, m7 pmaddubsw m12, m2, [r3 + 14 * 32] pmulhrsw m12, m7 packuswb m10, m12 palignr m12, m2, m0, 2 palignr m1, m3, m2, 2 pmaddubsw m11, m12, [r3 - 13 * 32] ; [3] pmulhrsw m11, m7 pmaddubsw m1, [r3 - 13 * 32] pmulhrsw m1, m7 packuswb m11, m1 palignr m1, m3, m2, 2 pmaddubsw m12, [r3 - 8 * 32] ; [8] pmulhrsw m12, m7 pmaddubsw m1, [r3 - 8 * 32] pmulhrsw m1, m7 packuswb m12, m1 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0 ; rows 8 to 15 palignr m8, m2, m0, 2 palignr m1, m3, m2, 2 pmaddubsw m4, m8, [r3 - 3 * 32] ; [13] pmulhrsw m4, m7 pmaddubsw m9, m1, [r3 - 3 * 32] pmulhrsw m9, m7 packuswb m4, m9 pmaddubsw m5, m8, [r3 + 2 * 32] ; [18] pmulhrsw m5, m7 pmaddubsw m9, m1, [r3 + 2 * 32] pmulhrsw m9, m7 packuswb m5, m9 pmaddubsw m6, m8, [r3 + 7 * 32] ; [23] pmulhrsw m6, m7 pmaddubsw m9, m1, [r3 + 7 * 32] pmulhrsw m9, m7 packuswb m6, m9 pmaddubsw m8, [r3 + 12 * 32] ; [28] pmulhrsw m8, m7 pmaddubsw m1, [r3 + 12 * 32] pmulhrsw m1, m7 packuswb m8, m1 palignr m12, m2, m0, 
4 palignr m1, m3, m2, 4 pmaddubsw m9, m12, [r3 - 15 * 32] ; [1] pmulhrsw m9, m7 pmaddubsw m11, m1, [r3 - 15 * 32] pmulhrsw m11, m7 packuswb m9, m11 pmaddubsw m10, m12, [r3 - 10 * 32] ; [6] pmulhrsw m10, m7 pmaddubsw m11, m1, [r3 - 10 * 32] pmulhrsw m11, m7 packuswb m10, m11 pmaddubsw m11, m12, [r3 - 5 * 32] ; [11] pmulhrsw m11, m7 pmaddubsw m1, [r3 - 5 * 32] pmulhrsw m1, m7 packuswb m11, m1 palignr m1, m3, m2, 4 pmaddubsw m12, [r3] ; [16] pmulhrsw m12, m7 pmaddubsw m1, [r3] pmulhrsw m1, m7 packuswb m12, m1 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 8 ; rows 16 to 23 jnz .doNotAdjustBufferPtr lea r4, [r4 + mmsize/2] mov r0, r4 .doNotAdjustBufferPtr: palignr m6, m2, m0, 4 palignr m1, m3, m2, 4 pmaddubsw m4, m6, [r3 + 5 * 32] ; [21] pmulhrsw m4, m7 pmaddubsw m8, m1, [r3 + 5 * 32] pmulhrsw m8, m7 packuswb m4, m8 pmaddubsw m5, m6, [r3 + 10 * 32] ; [26] pmulhrsw m5, m7 pmaddubsw m8, m1, [r3 + 10 * 32] pmulhrsw m8, m7 packuswb m5, m8 pmaddubsw m6, [r3 + 15 * 32] ; [31] pmulhrsw m6, m7 pmaddubsw m1, [r3 + 15 * 32] pmulhrsw m1, m7 packuswb m6, m1 palignr m12, m2, m0, 6 palignr m1, m3, m2, 6 pmaddubsw m8, m12, [r3 - 12 * 32] ; [4] pmulhrsw m8, m7 pmaddubsw m11, m1, [r3 - 12 * 32] pmulhrsw m11, m7 packuswb m8, m11 pmaddubsw m9, m12, [r3 - 7 * 32] ; [9] pmulhrsw m9, m7 pmaddubsw m11, m1, [r3 - 7 * 32] pmulhrsw m11, m7 packuswb m9, m11 pmaddubsw m10, m12, [r3 - 2 * 32] ; [14] pmulhrsw m10, m7 pmaddubsw m11, m1, [r3 - 2 * 32] pmulhrsw m11, m7 packuswb m10, m11 pmaddubsw m11, m12, [r3 + 3 * 32] ; [19] pmulhrsw m11, m7 pmaddubsw m1, [r3 + 3 * 32] pmulhrsw m1, m7 packuswb m11, m1 palignr m1, m3, m2, 6 pmaddubsw m12, [r3 + 8 * 32] ; [24] pmulhrsw m12, m7 pmaddubsw m1, [r3 + 8 * 32] pmulhrsw m1, m7 packuswb m12, m1 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 16 ; rows 24 to 31 palignr m4, m2, m0, 6 palignr m1, m3, m2, 6 pmaddubsw m4, [r3 + 13 * 32] ; [29] pmulhrsw m4, m7 pmaddubsw m1, [r3 + 13 * 32] pmulhrsw m1, m7 packuswb m4, m1 palignr m3, m2, 8 palignr m2, m0, 8 
pmaddubsw m5, m2, [r3 - 14 * 32] ; [2] pmulhrsw m5, m7 pmaddubsw m9, m3, [r3 - 14 * 32] pmulhrsw m9, m7 packuswb m5, m9 pmaddubsw m6, m2, [r3 - 9 * 32] ; [7] pmulhrsw m6, m7 pmaddubsw m9, m3, [r3 - 9 * 32] pmulhrsw m9, m7 packuswb m6, m9 pmaddubsw m8, m2, [r3 - 4 * 32] ; [12] pmulhrsw m8, m7 pmaddubsw m1, m3, [r3 - 4 * 32] pmulhrsw m1, m7 packuswb m8, m1 pmaddubsw m9, m2, [r3 + 1 * 32] ; [17] pmulhrsw m9, m7 pmaddubsw m11, m3, [r3 + 1 * 32] pmulhrsw m11, m7 packuswb m9, m11 pmaddubsw m10, m2, [r3 + 6 * 32] ; [22] pmulhrsw m10, m7 pmaddubsw m1, m3, [r3 + 6 * 32] pmulhrsw m1, m7 packuswb m10, m1 pmaddubsw m2, [r3 + 11 * 32] ; [27] pmulhrsw m2, m7 pmaddubsw m3, [r3 + 11 * 32] pmulhrsw m3, m7 packuswb m2, m3 movu m3, [r2 + 6] ; [0] TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 2, 3, 0, 24 ret INIT_YMM avx2 cglobal intra_pred_ang32_8, 3,8,13 add r2, 64 lea r3, [ang_table_avx2 + 32 * 16] lea r5, [r1 * 3] ; r5 -> 3 * stride lea r6, [r1 * 4] ; r6 -> 4 * stride mova m7, [pw_1024] mov r4, r0 xor r7d, r7d call ang32_mode_8_28_avx2 RET INIT_YMM avx2 cglobal intra_pred_ang32_28, 3,8,13 lea r3, [ang_table_avx2 + 32 * 16] lea r5, [r1 * 3] ; r5 -> 3 * stride lea r6, [r1 * 4] ; r6 -> 4 * stride mova m7, [pw_1024] xor r7d, r7d inc r7d call ang32_mode_8_28_avx2 RET INIT_YMM avx2 cglobal intra_pred_ang32_9, 3,5,8 vbroadcasti128 m0, [angHor_tab_9] vbroadcasti128 m1, [angHor_tab_9 + mmsize/2] mova m2, [pw_1024] mova m7, [ang32_shuf_mode9] lea r3, [r1 * 3] vbroadcasti128 m3, [r2 + mmsize*2 + 1] vbroadcasti128 m6, [r2 + mmsize*2 + 17] pshufb m5, m3, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0], m4 palignr m5, m6, m3, 1 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1], m4 palignr m5, m6, m3, 2 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1*2], m4 palignr m5, m6, m3, 3 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw 
m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r3], m4 lea r0, [r0 + r1 * 4] palignr m5, m6, m3, 4 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0], m4 palignr m5, m6, m3, 5 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1], m4 palignr m5, m6, m3, 6 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1*2], m4 palignr m5, m6, m3, 7 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r3], m4 lea r0, [r0 + r1 * 4] palignr m5, m6, m3, 8 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0], m4 palignr m5, m6, m3, 9 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1], m4 palignr m5, m6, m3, 10 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1*2], m4 palignr m5, m6, m3, 11 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r3], m4 lea r0, [r0 + r1 * 4] palignr m5, m6, m3, 12 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0], m4 palignr m5, m6, m3, 13 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1], m4 palignr m5, m6, m3, 14 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1*2], m4 palignr m5, m6, m3, 15 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r3], m4 lea r0, [r0 + r1 * 4] vbroadcasti128 m3, [r2 + mmsize*2 + 33] pshufb m5, m6, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu 
[r0], m4 palignr m5, m3, m6, 1 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1], m4 palignr m5, m3, m6, 2 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1*2], m4 palignr m5, m3, m6, 3 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r3], m4 lea r0, [r0 + r1 * 4] palignr m5, m3, m6, 4 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0], m4 palignr m5, m3, m6, 5 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1], m4 palignr m5, m3, m6, 6 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1*2], m4 palignr m5, m3, m6, 7 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r3], m4 lea r0, [r0 + r1 * 4] palignr m5, m3, m6, 8 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0], m4 palignr m5, m3, m6, 9 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1], m4 palignr m5, m3, m6, 10 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1*2], m4 palignr m5, m3, m6, 11 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r3], m4 lea r0, [r0 + r1 * 4] palignr m5, m3, m6, 12 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0], m4 palignr m5, m3, m6, 13 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1], m4 palignr m5, m3, m6, 14 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 
pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1*2], m4 palignr m5, m3, m6, 15 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r3], m4 RET cglobal intra_pred_ang32_27, 3,5,6 lea r3, [ang_table_avx2 + 32 * 16] lea r4, [r1 * 3] ; r4 -> 3 * stride mova m5, [pw_1024] ; rows 0 to 7 movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] movu m3, [r2 + 17] ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17] movu m4, [r2 + 18] ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18] punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] punpcklbw m3, m4 ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17] pmaddubsw m4, m0, [r3 - 14 * 32] ; [2] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 - 14 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0], m4 pmaddubsw m4, m0, [r3 - 12 * 32] ; [4] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 - 12 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1], m4 pmaddubsw m4, m0, [r3 - 10 * 32] ; [6] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 - 10 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1*2], m4 pmaddubsw m4, m0, [r3 - 8 * 32] ; [8] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 - 8 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r4], m4 lea r0, [r0 + r1 * 4] pmaddubsw m4, m0, [r3 - 6 * 32] ; [10] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 - 6 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0], m4 pmaddubsw m4, m0, [r3 - 4 * 32] ; [12] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 - 4 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1], m4 pmaddubsw m4, m0, [r3 
- 2 * 32] ; [14] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 - 2 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1*2], m4 pmaddubsw m4, m0, [r3] ; [16] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r4], m4 lea r0, [r0 + r1 * 4] ; rows 8 to 15 pmaddubsw m4, m0, [r3 + 2 * 32] ; [18] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 + 2 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0], m4 pmaddubsw m4, m0, [r3 + 4 * 32] ; [20] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 + 4 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1], m4 pmaddubsw m4, m0, [r3 + 6 * 32] ; [22] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 + 6 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1*2], m4 pmaddubsw m4, m0, [r3 + 8 * 32] ; [24] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 + 8 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r4], m4 lea r0, [r0 + r1 * 4] pmaddubsw m4, m0, [r3 + 10 * 32] ; [26] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 + 10 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0], m4 pmaddubsw m4, m0, [r3 + 12 * 32] ; [28] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 + 12 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1], m4 pmaddubsw m4, m0, [r3 + 14 * 32] ; [30] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 + 14 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1*2], m4 palignr m3, m2, 2 palignr m2, m0, 2 movu m1, [r2 + 2] ; [0] movu [r0 + r4], m1 lea r0, [r0 + r1 * 4] ; rows 16 to 23 pmaddubsw m4, m2, [r3 - 14 * 32] ; [2] pmulhrsw m4, m5 pmaddubsw m1, m3, [r3 - 14 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0], m4 pmaddubsw m4, m2, [r3 - 12 * 32] ; [4] pmulhrsw m4, m5 pmaddubsw m1, m3, [r3 - 12 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1], m4 pmaddubsw m4, m2, [r3 - 10 * 32] ; [6] pmulhrsw m4, m5 pmaddubsw m1, m3, [r3 - 10 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1*2], m4 pmaddubsw m4, m2, [r3 - 8 * 32] ; [8] pmulhrsw m4, m5 pmaddubsw m1, m3, [r3 - 8 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r4], m4 lea r0, [r0 + r1 * 4] pmaddubsw m4, m2, [r3 - 6 * 32] ; [10] 
pmulhrsw m4, m5 pmaddubsw m1, m3, [r3 - 6 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0], m4 pmaddubsw m4, m2, [r3 - 4 * 32] ; [12] pmulhrsw m4, m5 pmaddubsw m1, m3, [r3 - 4 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1], m4 pmaddubsw m4, m2, [r3 - 2 * 32] ; [14] pmulhrsw m4, m5 pmaddubsw m1, m3, [r3 - 2 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1*2], m4 pmaddubsw m4, m2, [r3] ; [16] pmulhrsw m4, m5 pmaddubsw m1, m3, [r3] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r4], m4 lea r0, [r0 + r1 * 4] ; rows 8 to 15 pmaddubsw m4, m2, [r3 + 2 * 32] ; [18] pmulhrsw m4, m5 pmaddubsw m1, m3, [r3 + 2 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0], m4 pmaddubsw m4, m2, [r3 + 4 * 32] ; [20] pmulhrsw m4, m5 pmaddubsw m1, m3, [r3 + 4 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1], m4 pmaddubsw m4, m2, [r3 + 6 * 32] ; [22] pmulhrsw m4, m5 pmaddubsw m1, m3, [r3 + 6 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1*2], m4 pmaddubsw m4, m2, [r3 + 8 * 32] ; [24] pmulhrsw m4, m5 pmaddubsw m1, m3, [r3 + 8 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r4], m4 lea r0, [r0 + r1 * 4] pmaddubsw m4, m2, [r3 + 10 * 32] ; [26] pmulhrsw m4, m5 pmaddubsw m1, m3, [r3 + 10 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0], m4 pmaddubsw m4, m2, [r3 + 12 * 32] ; [28] pmulhrsw m4, m5 pmaddubsw m1, m3, [r3 + 12 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1], m4 pmaddubsw m2, [r3 + 14 * 32] ; [30] pmulhrsw m2, m5 pmaddubsw m3, [r3 + 14 * 32] pmulhrsw m3, m5 packuswb m2, m3 movu [r0 + r1*2], m2 movu m1, [r2 + 3] ; [0] movu [r0 + r4], m1 RET cglobal intra_pred_ang32_10, 5,5,4 pxor m0, m0 mova m1, [pb_1] lea r4, [r1 * 3] vbroadcasti128 m2, [r2 + mmsize*2 + 1] pshufb m3, m2, m0 movu [r0], m3 paddb m0, m1 pshufb m3, m2, m0 movu [r0 + r1], m3 paddb m0, m1 pshufb m3, m2, m0 movu [r0 + r1 * 2], m3 paddb m0, m1 pshufb m3, m2, m0 movu [r0 + r4], m3 lea r0, [r0 + r1 * 4] paddb m0, m1 pshufb m3, m2, m0 movu [r0], m3 paddb m0, m1 pshufb m3, m2, m0 movu [r0 + r1], m3 paddb m0, m1 pshufb 
m3, m2, m0 movu [r0 + r1 * 2], m3 paddb m0, m1 pshufb m3, m2, m0 movu [r0 + r4], m3 lea r0, [r0 + r1 * 4] paddb m0, m1 pshufb m3, m2, m0 movu [r0], m3 paddb m0, m1 pshufb m3, m2, m0 movu [r0 + r1], m3 paddb m0, m1 pshufb m3, m2, m0 movu [r0 + r1 * 2], m3 paddb m0, m1 pshufb m3, m2, m0 movu [r0 + r4], m3 lea r0, [r0 + r1 * 4] paddb m0, m1 pshufb m3, m2, m0 movu [r0], m3 paddb m0, m1 pshufb m3, m2, m0 movu [r0 + r1], m3 paddb m0, m1 pshufb m3, m2, m0 movu [r0 + r1 * 2], m3 paddb m0, m1 pshufb m3, m2, m0 movu [r0 + r4], m3 lea r0, [r0 + r1 * 4] pxor m0, m0 vbroadcasti128 m2, [r2 + mmsize*2 + mmsize/2 + 1] pshufb m3, m2, m0 movu [r0], m3 paddb m0, m1 pshufb m3, m2, m0 movu [r0 + r1], m3 paddb m0, m1 pshufb m3, m2, m0 movu [r0 + r1 * 2], m3 paddb m0, m1 pshufb m3, m2, m0 movu [r0 + r4], m3 lea r0, [r0 + r1 * 4] paddb m0, m1 pshufb m3, m2, m0 movu [r0], m3 paddb m0, m1 pshufb m3, m2, m0 movu [r0 + r1], m3 paddb m0, m1 pshufb m3, m2, m0 movu [r0 + r1 * 2], m3 paddb m0, m1 pshufb m3, m2, m0 movu [r0 + r4], m3 lea r0, [r0 + r1 * 4] paddb m0, m1 pshufb m3, m2, m0 movu [r0], m3 paddb m0, m1 pshufb m3, m2, m0 movu [r0 + r1], m3 paddb m0, m1 pshufb m3, m2, m0 movu [r0 + r1 * 2], m3 paddb m0, m1 pshufb m3, m2, m0 movu [r0 + r4], m3 lea r0, [r0 + r1 * 4] paddb m0, m1 pshufb m3, m2, m0 movu [r0], m3 paddb m0, m1 pshufb m3, m2, m0 movu [r0 + r1], m3 paddb m0, m1 pshufb m3, m2, m0 movu [r0 + r1 * 2], m3 paddb m0, m1 pshufb m3, m2, m0 movu [r0 + r4], m3 RET cglobal intra_pred_ang32_11, 3,4,8 vbroadcasti128 m0, [angHor_tab_11] vbroadcasti128 m1, [angHor_tab_11 + mmsize/2] mova m2, [pw_1024] mova m7, [ang32_shuf_mode11] lea r3, [r1 * 3] ; prepare for [16 0 -1 -2 ...] 
movu xm3, [r2 + mmsize*2 - 1] vbroadcasti128 m6, [r2 + mmsize*2 + 15] pinsrb xm3, [r2 + 0], 1 pinsrb xm3, [r2 + 16], 0 vinserti128 m3, m3, xm3, 1 ; [16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14] pshufb m5, m3, m7 ; [ 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 16 0 16 0 16 0 16 0 16 0 16 0 16 0 16 0] pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0], m4 palignr m5, m6, m3, 1 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1], m4 palignr m5, m6, m3, 2 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1 * 2], m4 palignr m5, m6, m3, 3 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r3], m4 lea r0, [r0 + r1 * 4] palignr m5, m6, m3, 4 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0], m4 palignr m5, m6, m3, 5 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1], m4 palignr m5, m6, m3, 6 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1 * 2], m4 palignr m5, m6, m3, 7 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r3], m4 lea r0, [r0 + r1 * 4] palignr m5, m6, m3, 8 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0], m4 palignr m5, m6, m3, 9 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1], m4 palignr m5, m6, m3, 10 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1 * 2], m4 palignr m5, m6, m3, 11 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, 
m5 movu [r0 + r3], m4 lea r0, [r0 + r1 * 4] palignr m5, m6, m3, 12 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0], m4 palignr m5, m6, m3, 13 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1], m4 palignr m5, m6, m3, 14 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1 * 2], m4 palignr m5, m6, m3, 15 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r3], m4 lea r0, [r0 + r1 * 4] mova m3, m6 vbroadcasti128 m6, [r2 + mmsize*2 + 15 + 16] pshufb m5, m3, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0], m4 palignr m5, m6, m3, 1 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1], m4 palignr m5, m6, m3, 2 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1 * 2], m4 palignr m5, m6, m3, 3 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r3], m4 lea r0, [r0 + r1 * 4] palignr m5, m6, m3, 4 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0], m4 palignr m5, m6, m3, 5 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1], m4 palignr m5, m6, m3, 6 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1 * 2], m4 palignr m5, m6, m3, 7 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r3], m4 lea r0, [r0 + r1 * 4] palignr m5, m6, m3, 8 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0], m4 palignr m5, m6, m3, 9 
pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1], m4 palignr m5, m6, m3, 10 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1 * 2], m4 palignr m5, m6, m3, 11 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r3], m4 lea r0, [r0 + r1 * 4] palignr m5, m6, m3, 12 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0], m4 palignr m5, m6, m3, 13 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1], m4 palignr m5, m6, m3, 14 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1 * 2], m4 palignr m5, m6, m3, 15 pshufb m5, m7 pmaddubsw m4, m5, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r3], m4 RET cglobal intra_pred_ang32_25, 3,5,7 lea r3, [ang_table_avx2 + 32 * 16] lea r4, [r1 * 3] mova m5, [pw_1024] ; rows 0 to 7 movu m0, [r2 + 0] ; [31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] movu m1, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] pinsrb xm3, [r2], 15 pinsrb xm3, [r2 + mmsize*2 + 16], 14 punpckhbw m2, m0, m1 ; [32 31 31 30 30 29 29 28 28 27 27 26 26 25 25 24 16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] punpcklbw m0, m1 ; [24 23 23 22 22 21 21 20 20 19 19 18 18 17 17 16 8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] vinserti128 m3, m3, xm2, 1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 0 16 x x x x x x x x x x x x x x] pmaddubsw m4, m0, [r3 + 14 * 32] ; [30] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 + 14 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0], m4 pmaddubsw m4, m0, [r3 + 12 * 32] ; [28] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 + 12 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1], m4 
pmaddubsw m4, m0, [r3 + 10 * 32] ; [26] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 + 10 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1*2], m4 pmaddubsw m4, m0, [r3 + 8 * 32] ; [24] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 + 8 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r4], m4 lea r0, [r0 + r1 * 4] pmaddubsw m4, m0, [r3 + 6 * 32] ; [22] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 + 6 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0], m4 pmaddubsw m4, m0, [r3 + 4 * 32] ; [20] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 + 4 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1], m4 pmaddubsw m4, m0, [r3 + 2 * 32] ; [18] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 + 2 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1*2], m4 pmaddubsw m4, m0, [r3] ; [16] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r4], m4 lea r0, [r0 + r1 * 4] ; rows 8 to 15 pmaddubsw m4, m0, [r3 - 2 * 32] ; [14] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 - 2 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0], m4 pmaddubsw m4, m0, [r3 - 4 * 32] ; [12] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 - 4 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1], m4 pmaddubsw m4, m0, [r3 - 6 * 32] ; [10] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 - 6 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1*2], m4 pmaddubsw m4, m0, [r3 - 8 * 32] ; [8] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 - 8 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r4], m4 lea r0, [r0 + r1 * 4] pmaddubsw m4, m0, [r3 - 10 * 32] ; [6] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 - 10 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0], m4 pmaddubsw m4, m0, [r3 - 12 * 32] ; [4] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 - 12 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1], m4 pmaddubsw m4, m0, [r3 - 14 * 32] ; [2] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 - 14 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1 * 2], m4 movu m1, [r2] ; [0] movu [r0 + r4], m1 lea r0, [r0 + r1 * 4] palignr m2, m0, 14 palignr m0, m3, 14 ; rows 16 to 23 pmaddubsw m4, m0, [r3 + 14 * 
32] ; [30] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 + 14 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0], m4 pmaddubsw m4, m0, [r3 + 12 * 32] ; [28] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 + 12 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1], m4 pmaddubsw m4, m0, [r3 + 10 * 32] ; [26] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 + 10 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1*2], m4 pmaddubsw m4, m0, [r3 + 8 * 32] ; [24] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 + 8 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r4], m4 lea r0, [r0 + r1 * 4] pmaddubsw m4, m0, [r3 + 6 * 32] ; [22] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 + 6 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0], m4 pmaddubsw m4, m0, [r3 + 4 * 32] ; [20] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 + 4 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1], m4 pmaddubsw m4, m0, [r3 + 2 * 32] ; [18] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 + 2 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1*2], m4 pmaddubsw m4, m0, [r3] ; [16] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r4], m4 lea r0, [r0 + r1 * 4] ; rows 24 to 31 pmaddubsw m4, m0, [r3 - 2 * 32] ; [14] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 - 2 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0], m4 pmaddubsw m4, m0, [r3 - 4 * 32] ; [12] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 - 4 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1], m4 pmaddubsw m4, m0, [r3 - 6 * 32] ; [10] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 - 6 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1 * 2], m4 pmaddubsw m4, m0, [r3 - 8 * 32] ; [8] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 - 8 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r4], m4 lea r0, [r0 + r1 * 4] pmaddubsw m4, m0, [r3 - 10 * 32] ; [6] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 - 10 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0], m4 pmaddubsw m4, m0, [r3 - 12 * 32] ; [4] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 - 12 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1], m4 pmaddubsw m0, [r3 - 14 * 32] ; [2] 
pmulhrsw m0, m5 pmaddubsw m2, [r3 - 14 * 32] pmulhrsw m2, m5 packuswb m0, m2 movu [r0 + r1*2], m0 movu m1, [r2 + 1] ; [0] palignr m1, m3, 14 movu [r0 + r4], m1 RET cglobal intra_pred_ang32_12, 3,4,9 movu m0, [ang32_fact_mode12] movu m1, [ang32_fact_mode12 + mmsize] mova m2, [pw_1024] mova m7, [ang32_shuf_mode12] mova m8, [ang32_shuf_mode12 + mmsize] lea r3, [r1 * 3] ; prepare for [26, 19, 13, 6, 0, -1, -2....] movu xm4, [r2 + mmsize*2 - 4] vbroadcasti128 m6, [r2 + mmsize*2 + 12] pinsrb xm4, [r2 + 0], 4 pinsrb xm4, [r2 + 6], 3 pinsrb xm4, [r2 + 13], 2 pinsrb xm4, [r2 + 19], 1 pinsrb xm4, [r2 + 26], 0 vinserti128 m3, m4, xm4, 1 ; [26, 19, 13, 6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 26, 19, 13, 6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] pshufb m4, m3, m7 ; [ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 6, 0, 6, 0, 13, 6, 13, 6, 13, 6, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13] pshufb m5, m3, m8 ; [ 6, 0, 6, 0, 6, 0, 6, 0, 13, 6, 13, 6, 13, 6, 13, 6, 19, 13, 16, 19, 16, 19, 16, 19, 16, 19, 16, 19, 16, 19, 16, 19] pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0], m4 palignr m4, m6, m3, 1 pshufb m5, m4, m8 pshufb m4, m7 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1], m4 palignr m4, m6, m3, 2 pshufb m5, m4, m8 pshufb m4, m7 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1 * 2], m4 palignr m4, m6, m3, 3 pshufb m5, m4, m8 pshufb m4, m7 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r3], m4 lea r0, [r0 + r1 * 4] palignr m4, m6, m3, 4 pshufb m5, m4, m8 pshufb m4, m7 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0], m4 palignr m4, m6, m3, 5 pshufb m5, m4, m8 pshufb m4, m7 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1], m4 palignr m4, m6, m3, 6 pshufb m5, m4, m8 pshufb m4, m7 pmaddubsw m4, m0 pmaddubsw m5, m1 
pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1 * 2], m4 palignr m4, m6, m3, 7 pshufb m5, m4, m8 pshufb m4, m7 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r3], m4 lea r0, [r0 + r1 * 4] palignr m4, m6, m3, 8 pshufb m5, m4, m8 pshufb m4, m7 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0], m4 palignr m4, m6, m3, 9 pshufb m5, m4, m8 pshufb m4, m7 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1], m4 palignr m4, m6, m3, 10 pshufb m5, m4, m8 pshufb m4, m7 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1 * 2], m4 palignr m4, m6, m3, 11 pshufb m5, m4, m8 pshufb m4, m7 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r3], m4 lea r0, [r0 + r1 * 4] palignr m4, m6, m3, 12 pshufb m5, m4, m8 pshufb m4, m7 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0], m4 palignr m4, m6, m3, 13 pshufb m5, m4, m8 pshufb m4, m7 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1], m4 palignr m4, m6, m3, 14 pshufb m5, m4, m8 pshufb m4, m7 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1 * 2], m4 palignr m4, m6, m3, 15 pshufb m5, m4, m8 pshufb m4, m7 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r3], m4 lea r0, [r0 + r1 * 4] mova m3, m6 vbroadcasti128 m6, [r2 + mmsize*2 + 12 + 16] pshufb m4, m3, m7 pshufb m5, m3, m8 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0], m4 palignr m4, m6, m3, 1 pshufb m5, m4, m8 pshufb m4, m7 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1], m4 palignr m4, m6, m3, 2 pshufb m5, m4, m8 pshufb m4, m7 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 
packuswb m4, m5 movu [r0 + r1 * 2], m4 palignr m4, m6, m3, 3 pshufb m5, m4, m8 pshufb m4, m7 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r3], m4 lea r0, [r0 + r1 * 4] palignr m4, m6, m3, 4 pshufb m5, m4, m8 pshufb m4, m7 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0], m4 palignr m4, m6, m3, 5 pshufb m5, m4, m8 pshufb m4, m7 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1], m4 palignr m4, m6, m3, 6 pshufb m5, m4, m8 pshufb m4, m7 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1 * 2], m4 palignr m4, m6, m3, 7 pshufb m5, m4, m8 pshufb m4, m7 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r3], m4 lea r0, [r0 + r1 * 4] palignr m4, m6, m3, 8 pshufb m5, m4, m8 pshufb m4, m7 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0], m4 palignr m4, m6, m3, 9 pshufb m5, m4, m8 pshufb m4, m7 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1], m4 palignr m4, m6, m3, 10 pshufb m5, m4, m8 pshufb m4, m7 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1 * 2], m4 palignr m4, m6, m3, 11 pshufb m5, m4, m8 pshufb m4, m7 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r3], m4 lea r0, [r0 + r1 * 4] palignr m4, m6, m3, 12 pshufb m5, m4, m8 pshufb m4, m7 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0], m4 palignr m4, m6, m3, 13 pshufb m5, m4, m8 pshufb m4, m7 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1], m4 palignr m4, m6, m3, 14 pshufb m5, m4, m8 pshufb m4, m7 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r1 * 2], m4 palignr m4, m6, m3, 15 pshufb 
m5, m4, m8 pshufb m4, m7 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 movu [r0 + r3], m4 RET
; (line above: closing instructions of the preceding predictor, kept exactly as found;
;  that function begins before this part of the file)

;-----------------------------------------------------------------------------------------
; Row helpers for the 32x32 angular predictors that follow.
;
; Every helper emits one (or a fixed run of) 32-byte output row(s).  The rounding step
; is always pmulhrsw against pw_1024, which evaluates (x + 16) >> 5 on each word
; (assuming pw_1024 holds 1024 in every word, as its name says).
; The helpers only rely on registers that the calling function has already set up; the
; required setup is listed with each helper.
;-----------------------------------------------------------------------------------------

; ANG32_TAB_ROW dst, lo, hi, row
;   %1 = address expression of the output row (without brackets)
;   %2 = register with the interleaved sample pairs feeding the low half of each lane
;   %3 = register with the interleaved sample pairs feeding the high half of each lane
;   %4 = row index into ang_table_avx2 (the bracketed number in the original comments)
;   Needs: r3 = ang_table_avx2 + 16 * 32, m5 = [pw_1024].  Scratch: m1, m4.
%macro ANG32_TAB_ROW 4
    pmaddubsw       m4, %2, [r3 + (%4 - 16) * 32]
    pmulhrsw        m4, m5
    pmaddubsw       m1, %3, [r3 + (%4 - 16) * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [%1], m4
%endmacro

; ANG32_FACT_STORE dst
;   Weight the already shuffled sample pairs in m4 / m5 with the per-mode factor
;   vectors, round, pack to bytes and store one row at [%1].
;   Needs: m0, m1 = factor vectors, m2 = [pw_1024].  Result left in m4; m5 is consumed.
%macro ANG32_FACT_STORE 1
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [%1], m4
%endmacro

; ANG32_SHUF_ROW dst, shift
;   One row for modes 13 / 14: take the m3:m6 window shifted by %2 bytes, split it with
;   the two shuffle masks in m7 / m8, then blend and store.
%macro ANG32_SHUF_ROW 2
    palignr         m5, m3, m6, %2
    pshufb          m4, m5, m7
    pshufb          m5, m8
    ANG32_FACT_STORE %1
%endmacro

; ANG32_SHUF_16ROWS
;   Sixteen consecutive rows for modes 13 / 14: byte shifts 1..15 of the m3:m6 window
;   followed by one row taken straight from m3.  r0 is advanced by 4 * r1 after the
;   4th, 8th and 12th row only, so on exit r0 still points at the 13th row of the run.
;   Needs: r3 = r1 * 3, m3 / m6 = current reference window, plus the
;   ANG32_FACT_STORE registers.
%macro ANG32_SHUF_16ROWS 0
    ANG32_SHUF_ROW  r0,          1
    ANG32_SHUF_ROW  r0 + r1,     2
    ANG32_SHUF_ROW  r0 + r1 * 2, 3
    ANG32_SHUF_ROW  r0 + r3,     4
    lea             r0, [r0 + r1 * 4]

    ANG32_SHUF_ROW  r0,          5
    ANG32_SHUF_ROW  r0 + r1,     6
    ANG32_SHUF_ROW  r0 + r1 * 2, 7
    ANG32_SHUF_ROW  r0 + r3,     8
    lea             r0, [r0 + r1 * 4]

    ANG32_SHUF_ROW  r0,          9
    ANG32_SHUF_ROW  r0 + r1,     10
    ANG32_SHUF_ROW  r0 + r1 * 2, 11
    ANG32_SHUF_ROW  r0 + r3,     12
    lea             r0, [r0 + r1 * 4]

    ANG32_SHUF_ROW  r0,          13
    ANG32_SHUF_ROW  r0 + r1,     14
    ANG32_SHUF_ROW  r0 + r1 * 2, 15

    ; a shift of 16 bytes is simply m3 itself
    pshufb          m4, m3, m7
    pshufb          m5, m3, m8
    ANG32_FACT_STORE r0 + r3
%endmacro

; ANG32_PAIR_ROW dst, shift_a, shift_b
;   One row for mode 15: the two halves of the row come from two different byte shifts
;   of the m3:m6 window (%2 feeds the m7 shuffle, %3 feeds the m8 shuffle).
%macro ANG32_PAIR_ROW 3
    palignr         m4, m3, m6, %2
    pshufb          m4, m7
    palignr         m5, m3, m6, %3
    pshufb          m5, m8
    ANG32_FACT_STORE %1
%endmacro

; ANG32_PAIR_15ROWS
;   Fifteen consecutive rows for mode 15.  The first row reads its m8 half directly
;   from m6 (shift 0) and the last row reads its m7 half directly from m3 (shift 16);
;   the thirteen rows in between use ANG32_PAIR_ROW.  r0 is advanced after the 4th,
;   8th and 12th row; the caller emits the 16th row at [r0 + r3].
%macro ANG32_PAIR_15ROWS 0
    palignr         m4, m3, m6, 2
    pshufb          m4, m7
    pshufb          m5, m6, m8
    ANG32_FACT_STORE r0
    ANG32_PAIR_ROW  r0 + r1,     3,  1
    ANG32_PAIR_ROW  r0 + r1 * 2, 4,  2
    ANG32_PAIR_ROW  r0 + r3,     5,  3
    lea             r0, [r0 + r1 * 4]

    ANG32_PAIR_ROW  r0,          6,  4
    ANG32_PAIR_ROW  r0 + r1,     7,  5
    ANG32_PAIR_ROW  r0 + r1 * 2, 8,  6
    ANG32_PAIR_ROW  r0 + r3,     9,  7
    lea             r0, [r0 + r1 * 4]

    ANG32_PAIR_ROW  r0,          10, 8
    ANG32_PAIR_ROW  r0 + r1,     11, 9
    ANG32_PAIR_ROW  r0 + r1 * 2, 12, 10
    ANG32_PAIR_ROW  r0 + r3,     13, 11
    lea             r0, [r0 + r1 * 4]

    ANG32_PAIR_ROW  r0,          14, 12
    ANG32_PAIR_ROW  r0 + r1,     15, 13
    pshufb          m4, m3, m7
    palignr         m5, m3, m6, 14
    pshufb          m5, m8
    ANG32_FACT_STORE r0 + r1 * 2
%endmacro

;-----------------------------------------------------------------------------------------
; intra_pred_ang32_24 -- 32 rows of 32 bytes, table rows 27, 22, 17, ... (step -5 mod 32)
; In:   r0 = output base, r1 = distance between output rows, r2 = reference samples
;       (roles read off the loads/stores below; presumably dst / dstStride / srcPix of
;        the C prototype -- confirm against the dispatch table)
; Uses: r3 = centre of ang_table_avx2, r4 = 3 * r1, m0-m7
;-----------------------------------------------------------------------------------------
cglobal intra_pred_ang32_24, 3,5,8
    lea             r3, [ang_table_avx2 + 32 * 16]
    lea             r4, [r1 * 3]
    mova            m5, [pw_1024]

    ; m0 / m2 = byte pairs (ref[i], ref[i + 1]) for the low / high half of each lane
    movu            m0, [r2 + 0]
    movu            m1, [r2 + 1]
    punpckhbw       m2, m0, m1
    punpcklbw       m0, m1

    ; m3 = extra pairs gathered from the second reference array at r2 + mmsize*2,
    ;      placed so that palignr can slide them in ahead of m0
    movu            m4, [r2 + mmsize*2]
    pshufb          m4, [ang32_shuf_mode24]
    mova            m3, [ang32_shuf_mode24 + mmsize]
    vpermd          m4, m3, m4                      ; [6 6 13 13 19 19 26 26 x x x...]
    palignr         m3, m0, m4, 1
    vinserti128     m3, m3, xm2, 1

    ; rows 0 to 7
    ANG32_TAB_ROW   r0,          m0, m2, 27
    ANG32_TAB_ROW   r0 + r1,     m0, m2, 22
    ANG32_TAB_ROW   r0 + r1 * 2, m0, m2, 17
    ANG32_TAB_ROW   r0 + r4,     m0, m2, 12
    lea             r0, [r0 + r1 * 4]

    ANG32_TAB_ROW   r0,          m0, m2, 7
    ANG32_TAB_ROW   r0 + r1,     m0, m2, 2
    palignr         m6, m0, m3, 14
    palignr         m7, m2, m0, 14
    ANG32_TAB_ROW   r0 + r1 * 2, m6, m7, 29
    ANG32_TAB_ROW   r0 + r4,     m6, m7, 24
    lea             r0, [r0 + r1 * 4]

    ; rows 8 to 15
    ANG32_TAB_ROW   r0,          m6, m7, 19
    ANG32_TAB_ROW   r0 + r1,     m6, m7, 14
    ANG32_TAB_ROW   r0 + r1 * 2, m6, m7, 9
    ANG32_TAB_ROW   r0 + r4,     m6, m7, 4
    lea             r0, [r0 + r1 * 4]

    palignr         m6, m0, m3, 12
    palignr         m7, m2, m0, 12
    ANG32_TAB_ROW   r0,          m6, m7, 31
    ANG32_TAB_ROW   r0 + r1,     m6, m7, 26
    ANG32_TAB_ROW   r0 + r1 * 2, m6, m7, 21
    ANG32_TAB_ROW   r0 + r4,     m6, m7, 16
    lea             r0, [r0 + r1 * 4]

    ; rows 16 to 23
    ANG32_TAB_ROW   r0,          m6, m7, 11
    ANG32_TAB_ROW   r0 + r1,     m6, m7, 6
    ANG32_TAB_ROW   r0 + r1 * 2, m6, m7, 1
    palignr         m6, m0, m3, 10
    palignr         m7, m2, m0, 10
    ANG32_TAB_ROW   r0 + r4,     m6, m7, 28
    lea             r0, [r0 + r1 * 4]

    ANG32_TAB_ROW   r0,          m6, m7, 23
    ANG32_TAB_ROW   r0 + r1,     m6, m7, 18
    ANG32_TAB_ROW   r0 + r1 * 2, m6, m7, 13
    ANG32_TAB_ROW   r0 + r4,     m6, m7, 8
    lea             r0, [r0 + r1 * 4]

    ; rows 24 to 31
    ANG32_TAB_ROW   r0,          m6, m7, 3
    palignr         m6, m0, m3, 8
    palignr         m7, m2, m0, 8
    ANG32_TAB_ROW   r0 + r1,     m6, m7, 30
    ANG32_TAB_ROW   r0 + r1 * 2, m6, m7, 25
    ANG32_TAB_ROW   r0 + r4,     m6, m7, 20
    lea             r0, [r0 + r1 * 4]

    ANG32_TAB_ROW   r0,          m6, m7, 15
    ANG32_TAB_ROW   r0 + r1,     m6, m7, 10
    ANG32_TAB_ROW   r0 + r1 * 2, m6, m7, 5

    ; last row: no blending, keep only the first byte of every pair
    pand            m6, [pw_00ff]
    pand            m7, [pw_00ff]
    packuswb        m6, m7
    movu            [r0 + r4], m6
    RET

;-----------------------------------------------------------------------------------------
; intra_pred_ang32_13 -- 32 rows of 32 bytes using the mode-13 factor / shuffle tables
; In:   r0 = output base, r1 = distance between output rows, r2 = reference samples
;       (same caveat as above: roles inferred from usage)
; Uses: r3 = 3 * r1, m0-m8
;-----------------------------------------------------------------------------------------
cglobal intra_pred_ang32_13, 3,4,9
    movu            m0, [ang32_fact_mode13]
    movu            m1, [ang32_fact_mode13 + mmsize]
    mova            m2, [pw_1024]
    mova            m7, [ang32_shuf_mode13]
    mova            m8, [ang32_shuf_mode13 + mmsize]
    lea             r3, [r1 * 3]

    ; prepare for [28, 25, 21, 18, 14, 11, 7, 4, 0, -1, -2....]
    movu            m6, [r2]
    pshufb          m6, [ang32_shuf_mode13 + mmsize*2]
    mova            m3, [ang32_shuf_mode24 + mmsize*1]  ; dword permute shared with mode 24
    vpermd          m6, m3, m6
    palignr         m6, m6, 1
    vbroadcasti128  m3, [r2 + mmsize*2 + 1]

    ANG32_SHUF_16ROWS                               ; first 16 rows
    lea             r0, [r0 + r1 * 4]

    ; slide the window: old m3 becomes the low part, next 16 reference bytes the high part
    mova            m6, m3
    vbroadcasti128  m3, [r2 + mmsize*2 + 17]

    ANG32_SHUF_16ROWS                               ; last 16 rows
    RET

;-----------------------------------------------------------------------------------------
; intra_pred_ang32_23 -- 32 rows of 32 bytes, table rows 23, 14, 5, ... (step -9 mod 32)
; In:   r0 = output base, r1 = distance between output rows, r2 = reference samples
;       (roles inferred from usage)
; Uses: r3 = centre of ang_table_avx2, r4 = 3 * r1, m0-m7
;-----------------------------------------------------------------------------------------
cglobal intra_pred_ang32_23, 3,5,8
    lea             r3, [ang_table_avx2 + 32 * 16]
    lea             r4, [r1 * 3]
    mova            m5, [pw_1024]

    ; m0 / m2 = byte pairs (ref[i], ref[i + 1]) for the low / high half of each lane
    movu            m0, [r2 + 0]
    movu            m1, [r2 + 1]
    punpckhbw       m2, m0, m1
    punpcklbw       m0, m1

    ; m3 = extra pairs gathered from the second reference array at r2 + mmsize*2
    movu            m4, [r2 + mmsize*2]
    pshufb          m4, [ang32_shuf_mode23]
    vpermq          m4, m4, q1313
    palignr         m3, m0, m4, 1
    vinserti128     m3, m3, xm2, 1

    ; rows 0 to 7
    ANG32_TAB_ROW   r0,          m0, m2, 23
    ANG32_TAB_ROW   r0 + r1,     m0, m2, 14
    ANG32_TAB_ROW   r0 + r1 * 2, m0, m2, 5
    palignr         m6, m0, m3, 14
    palignr         m7, m2, m0, 14
    ANG32_TAB_ROW   r0 + r4,     m6, m7, 28
    lea             r0, [r0 + r1 * 4]

    ANG32_TAB_ROW   r0,          m6, m7, 19
    ANG32_TAB_ROW   r0 + r1,     m6, m7, 10
    ANG32_TAB_ROW   r0 + r1 * 2, m6, m7, 1
    palignr         m6, m0, m3, 12
    palignr         m7, m2, m0, 12
    ANG32_TAB_ROW   r0 + r4,     m6, m7, 24
    lea             r0, [r0 + r1 * 4]

    ; rows 8 to 15
    ANG32_TAB_ROW   r0,          m6, m7, 15
    ANG32_TAB_ROW   r0 + r1,     m6, m7, 6
    palignr         m6, m0, m3, 10
    palignr         m7, m2, m0, 10
    ANG32_TAB_ROW   r0 + r1 * 2, m6, m7, 29
    ANG32_TAB_ROW   r0 + r4,     m6, m7, 20
    lea             r0, [r0 + r1 * 4]

    ANG32_TAB_ROW   r0,          m6, m7, 11
    ANG32_TAB_ROW   r0 + r1,     m6, m7, 2
    palignr         m6, m0, m3, 8
    palignr         m7, m2, m0, 8
    ANG32_TAB_ROW   r0 + r1 * 2, m6, m7, 25
    ANG32_TAB_ROW   r0 + r4,     m6, m7, 16
    lea             r0, [r0 + r1 * 4]

    ; rows 16 to 23
    ANG32_TAB_ROW   r0,          m6, m7, 7
    palignr         m6, m0, m3, 6
    palignr         m7, m2, m0, 6
    ANG32_TAB_ROW   r0 + r1,     m6, m7, 30
    ANG32_TAB_ROW   r0 + r1 * 2, m6, m7, 21
    ANG32_TAB_ROW   r0 + r4,     m6, m7, 12
    lea             r0, [r0 + r1 * 4]

    ANG32_TAB_ROW   r0,          m6, m7, 3
    palignr         m6, m0, m3, 4
    palignr         m7, m2, m0, 4
    ANG32_TAB_ROW   r0 + r1,     m6, m7, 26
    ANG32_TAB_ROW   r0 + r1 * 2, m6, m7, 17
    ANG32_TAB_ROW   r0 + r4,     m6, m7, 8
    lea             r0, [r0 + r1 * 4]

    ; rows 24 to 31
    palignr         m6, m0, m3, 2
    palignr         m7, m2, m0, 2
    ANG32_TAB_ROW   r0,          m6, m7, 31
    ANG32_TAB_ROW   r0 + r1,     m6, m7, 22
    ANG32_TAB_ROW   r0 + r1 * 2, m6, m7, 13
    ANG32_TAB_ROW   r0 + r4,     m6, m7, 4
    lea             r0, [r0 + r1 * 4]

    ; window has moved a full 16 bytes: m3 / m0 now play the role of lo / hi
    ANG32_TAB_ROW   r0,          m3, m0, 27
    ANG32_TAB_ROW   r0 + r1,     m3, m0, 18
    ANG32_TAB_ROW   r0 + r1 * 2, m3, m0, 9

    ; last row: no blending, keep only the first byte of every pair
    pand            m3, [pw_00ff]
    pand            m0, [pw_00ff]
    packuswb        m3, m0
    movu            [r0 + r4], m3
    RET

;-----------------------------------------------------------------------------------------
; intra_pred_ang32_14 -- 32 rows of 32 bytes using the mode-14 factor / shuffle tables
; In:   r0 = output base, r1 = distance between output rows, r2 = reference samples
;       (roles inferred from usage)
; Uses: r3 = 3 * r1, m0-m8
;-----------------------------------------------------------------------------------------
cglobal intra_pred_ang32_14, 3,4,9
    movu            m0, [ang32_fact_mode14]
    movu            m1, [ang32_fact_mode14 + mmsize]
    mova            m2, [pw_1024]
    mova            m7, [ang32_shuf_mode14]
    mova            m8, [ang32_shuf_mode14 + mmsize]
    lea             r3, [r1 * 3]

    ; prepare for [30, 27, 25, 22, 20, 17, 15, 12, 10, 7, 5, 2, 0, -1, -2...]
    movu            m6, [r2]
    pshufb          m6, [ang32_shuf_mode14 + mmsize*2]
    vpermq          m6, m6, 01110111b
    pslldq          m6, m6, 1
    vbroadcasti128  m3, [r2 + mmsize*2 + 1]

    ANG32_SHUF_16ROWS                               ; first 16 rows
    lea             r0, [r0 + r1 * 4]

    ; slide the window: old m3 becomes the low part, next 16 reference bytes the high part
    mova            m6, m3
    vbroadcasti128  m3, [r2 + mmsize*2 + 17]

    ANG32_SHUF_16ROWS                               ; last 16 rows
    RET

;-----------------------------------------------------------------------------------------
; intra_pred_ang32_22 -- 32 rows of 32 bytes, table rows 19, 6, 25, ... (step -13 mod 32)
; In:   r0 = output base, r1 = distance between output rows, r2 = reference samples
;       (roles inferred from usage)
; Uses: r3 = centre of ang_table_avx2, r4 = 3 * r1, m0-m8
;-----------------------------------------------------------------------------------------
cglobal intra_pred_ang32_22, 3,5,9
    lea             r3, [ang_table_avx2 + 32 * 16]
    lea             r4, [r1 * 3]
    mova            m5, [pw_1024]

    ; m0 / m2 = byte pairs (ref[i], ref[i + 1]) for the low / high half of each lane
    movu            m0, [r2 + 0]
    movu            m1, [r2 + 1]
    punpckhbw       m2, m0, m1
    punpcklbw       m0, m1

    ; m3 and m8 = two further 16-byte steps of projected pairs that get slid in
    ; ahead of m0 (m3 first, then m8 once m3 itself has become the "hi" operand)
    movu            m4, [r2 + mmsize*2 + 2]
    pshufb          m4, [ang32_shuf_mode22]
    vextracti128    xm8, m4, 1
    palignr         m3, m0, m4, 2
    palignr         m3, m8, 15
    vinserti128     m3, m3, xm2, 1
    vinserti128     m8, m8, xm0, 1

    ; rows 0 to 7
    ANG32_TAB_ROW   r0,          m0, m2, 19
    ANG32_TAB_ROW   r0 + r1,     m0, m2, 6
    palignr         m6, m0, m3, 14
    palignr         m7, m2, m0, 14
    ANG32_TAB_ROW   r0 + r1 * 2, m6, m7, 25
    ANG32_TAB_ROW   r0 + r4,     m6, m7, 12
    lea             r0, [r0 + r1 * 4]

    palignr         m6, m0, m3, 12
    palignr         m7, m2, m0, 12
    ANG32_TAB_ROW   r0,          m6, m7, 31
    ANG32_TAB_ROW   r0 + r1,     m6, m7, 18
    ANG32_TAB_ROW   r0 + r1 * 2, m6, m7, 5
    palignr         m6, m0, m3, 10
    palignr         m7, m2, m0, 10
    ANG32_TAB_ROW   r0 + r4,     m6, m7, 24
    lea             r0, [r0 + r1 * 4]

    ; rows 8 to 15
    ANG32_TAB_ROW   r0,          m6, m7, 11
    palignr         m6, m0, m3, 8
    palignr         m7, m2, m0, 8
    ANG32_TAB_ROW   r0 + r1,     m6, m7, 30
    ANG32_TAB_ROW   r0 + r1 * 2, m6, m7, 17
    ANG32_TAB_ROW   r0 + r4,     m6, m7, 4
    lea             r0, [r0 + r1 * 4]

    palignr         m6, m0, m3, 6
    palignr         m7, m2, m0, 6
    ANG32_TAB_ROW   r0,          m6, m7, 23
    ANG32_TAB_ROW   r0 + r1,     m6, m7, 10
    palignr         m6, m0, m3, 4
    palignr         m7, m2, m0, 4
    ANG32_TAB_ROW   r0 + r1 * 2, m6, m7, 29
    ANG32_TAB_ROW   r0 + r4,     m6, m7, 16
    lea             r0, [r0 + r1 * 4]

    ; rows 16 to 23
    ANG32_TAB_ROW   r0,          m6, m7, 3
    palignr         m6, m0, m3, 2
    palignr         m7, m2, m0, 2
    ANG32_TAB_ROW   r0 + r1,     m6, m7, 22
    ANG32_TAB_ROW   r0 + r1 * 2, m6, m7, 9
    ; window has moved a full 16 bytes: m3 / m0 now play the role of lo / hi
    ANG32_TAB_ROW   r0 + r4,     m3, m0, 28
    lea             r0, [r0 + r1 * 4]

    ANG32_TAB_ROW   r0,          m3, m0, 15
    ANG32_TAB_ROW   r0 + r1,     m3, m0, 2
    palignr         m6, m3, m8, 14
    palignr         m7, m0, m3, 14
    ANG32_TAB_ROW   r0 + r1 * 2, m6, m7, 21
    ANG32_TAB_ROW   r0 + r4,     m6, m7, 8
    lea             r0, [r0 + r1 * 4]

    ; rows 24 to 31
    palignr         m6, m3, m8, 12
    palignr         m7, m0, m3, 12
    ANG32_TAB_ROW   r0,          m6, m7, 27
    ANG32_TAB_ROW   r0 + r1,     m6, m7, 14
    ANG32_TAB_ROW   r0 + r1 * 2, m6, m7, 1
    palignr         m6, m3, m8, 10
    palignr         m7, m0, m3, 10
    ANG32_TAB_ROW   r0 + r4,     m6, m7, 20
    lea             r0, [r0 + r1 * 4]

    ANG32_TAB_ROW   r0,          m6, m7, 7
    ; final shift is done in place; m0 must be updated first because it reads the old m3
    palignr         m0, m3, 8
    palignr         m3, m8, 8
    ANG32_TAB_ROW   r0 + r1,     m3, m0, 26
    ANG32_TAB_ROW   r0 + r1 * 2, m3, m0, 13

    ; last row: no blending, keep only the first byte of every pair
    pand            m3, [pw_00ff]
    pand            m0, [pw_00ff]
    packuswb        m3, m0
    movu            [r0 + r4], m3
    RET

;-----------------------------------------------------------------------------------------
; intra_pred_ang32_15 -- 32 rows of 32 bytes using the mode-15 factor / shuffle tables
; In:   r0 = output base, r1 = distance between output rows, r2 = reference samples
;       (roles inferred from usage)
; Uses: r3 = 3 * r1, m0-m8
;-----------------------------------------------------------------------------------------
cglobal intra_pred_ang32_15, 3,4,9
    movu            m0, [ang32_fact_mode15]
    movu            m1, [ang32_fact_mode15 + mmsize]
    mova            m2, [pw_1024]
    mova            m7, [ang32_shuf_mode15]
    mova            m8, [ang32_shuf_mode15 + mmsize]
    lea             r3, [r1 * 3]

    ; prepare for [30, 28, 26, 24, 23, 21, 19, 17, 15, 13, 11, 9, 8, 6, 4, 2, 0, -1, -2...]
    movu            m6, [r2]
    pshufb          m6, [ang32_shuf_mode15 + mmsize*2]
    vpermq          m6, m6, 01110111b

    ; m3 = first 16 bytes at r2 + mmsize*2 with byte 0 replaced by [r2], in both lanes
    movu            xm3, [r2 + mmsize*2]
    pinsrb          xm3, [r2], 0
    vpermq          m3, m3, 01000100b

    ANG32_PAIR_15ROWS                               ; first 15 rows

    ; 16th row: its m8 half still needs the old window, its m7 half the new one, so
    ; take the shift-15 view before m6 / m3 are replaced
    palignr         m5, m3, m6, 15
    mova            m6, m3
    vbroadcasti128  m3, [r2 + mmsize*2 + 16]
    palignr         m4, m3, m6, 1
    pshufb          m4, m7
    pshufb          m5, m8
    ANG32_FACT_STORE r0 + r3
    lea             r0, [r0 + r1 * 4]

    ANG32_PAIR_15ROWS                               ; next 15 rows

    ; final row: same idea, but only m4 needs bytes beyond m3, loaded into m6
    palignr         m5, m3, m6, 15
    vbroadcasti128  m6, [r2 + mmsize*2 + 32]
    palignr         m4, m6, m3, 1
    pshufb          m4, m7
    pshufb          m5, m8
    ANG32_FACT_STORE r0 + r3
    RET

; (line below: opening of intra_pred_ang32_21, kept exactly as found because the rest of
;  that function continues past this part of the file)
cglobal intra_pred_ang32_21, 3,5,9 lea r3, [ang_table_avx2 + 32 * 16] lea r4, [r1 * 3] mova m5, [pw_1024] ;
rows 0 to 7 movu m0, [r2 + 0] movu m1, [r2 + 1] punpckhbw m2, m0, m1 punpcklbw m0, m1 movu m4, [r2 + mmsize*2] pshufb m4, [ang32_shuf_mode21] vextracti128 xm6, m4, 1 palignr m3, m0, m4, 1 palignr m8, m3, m6, 1 vinserti128 m3, m3, xm2, 1 vinserti128 m8, m8, xm0, 1 pmaddubsw m4, m0, [r3 - 1 * 32] ; [15] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 - 1 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0], m4 palignr m6, m0, m3, 14 palignr m7, m2, m0, 14 pmaddubsw m4, m6, [r3 + 14 * 32] ; [30] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 + 14 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1], m4 pmaddubsw m4, m6, [r3 - 3 * 32] ; [13] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 - 3 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1*2], m4 palignr m6, m0, m3, 12 palignr m7, m2, m0, 12 pmaddubsw m4, m6, [r3 + 12 * 32] ; [28] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 + 12 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r4], m4 lea r0, [r0 + r1 * 4] pmaddubsw m4, m6, [r3 - 5 * 32] ; [11] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 - 5 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0], m4 palignr m6, m0, m3, 10 palignr m7, m2, m0, 10 pmaddubsw m4, m6, [r3 + 10 * 32] ; [26] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 + 10 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1], m4 pmaddubsw m4, m6, [r3 - 7 * 32] ; [9] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 - 7 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1*2], m4 palignr m6, m0, m3, 8 palignr m7, m2, m0, 8 pmaddubsw m4, m6, [r3 + 8 * 32] ; [24] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 + 8 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r4], m4 lea r0, [r0 + r1 * 4] ; rows 8 to 15 pmaddubsw m4, m6, [r3 - 9 * 32] ; [7] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 - 9 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0], m4 palignr m6, m0, m3, 6 palignr m7, m2, m0, 6 pmaddubsw m4, m6, [r3 + 6 * 32] ; [22] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 + 6 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1], m4 pmaddubsw m4, m6, [r3 - 11 * 32] ; [5] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 
- 11 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1*2], m4 palignr m6, m0, m3, 4 palignr m7, m2, m0, 4 pmaddubsw m4, m6, [r3 + 4 * 32] ; [20] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 + 4 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r4], m4 lea r0, [r0 + r1 * 4] pmaddubsw m4, m6, [r3 - 13 * 32] ; [3] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 - 13 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0], m4 palignr m6, m0, m3, 2 palignr m7, m2, m0, 2 pmaddubsw m4, m6, [r3 + 2 * 32] ; [18] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 + 2 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1], m4 pmaddubsw m4, m6, [r3 - 15 * 32] ; [1] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 - 15 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1 * 2], m4 pmaddubsw m4, m3, [r3] ; [16] pmulhrsw m4, m5 pmaddubsw m1, m0, [r3] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r4], m4 lea r0, [r0 + r1 * 4] ; rows 16 to 23 palignr m6, m3, m8, 14 palignr m7, m0, m3, 14 pmaddubsw m4, m6, [r3 + 15 * 32] ; [31] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 + 15 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0], m4 pmaddubsw m4, m6, [r3 - 2 * 32] ; [14] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 - 2 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1], m4 palignr m6, m3, m8, 12 palignr m7, m0, m3, 12 pmaddubsw m4, m6, [r3 + 13 * 32] ; [29] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 + 13 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1*2], m4 pmaddubsw m4, m6, [r3 - 4 * 32] ; [12] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 - 4 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r4], m4 lea r0, [r0 + r1 * 4] palignr m6, m3, m8, 10 palignr m7, m0, m3, 10 pmaddubsw m4, m6, [r3 + 11 * 32] ; [27] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 + 11 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0], m4 pmaddubsw m4, m6, [r3 - 6 * 32] ; [10] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 - 6 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1], m4 palignr m6, m3, m8, 8 palignr m7, m0, m3, 8 pmaddubsw m4, m6, [r3 + 9 * 32] ; [25] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 + 9 * 
32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1*2], m4 pmaddubsw m4, m6, [r3 - 8 * 32] ; [8] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 - 8 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r4], m4 lea r0, [r0 + r1 * 4] ; rows 24 to 31 palignr m6, m3, m8, 6 palignr m7, m0, m3, 6 pmaddubsw m4, m6, [r3 + 7 * 32] ; [23] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 + 7 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0], m4 pmaddubsw m4, m6, [r3 - 10 * 32] ; [6] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 - 10 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1], m4 palignr m6, m3, m8, 4 palignr m7, m0, m3, 4 pmaddubsw m4, m6, [r3 + 5 * 32] ; [21] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 + 5 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1 * 2], m4 pmaddubsw m4, m6, [r3 - 12 * 32] ; [4] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 - 12 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r4], m4 lea r0, [r0 + r1 * 4] palignr m6, m3, m8, 2 palignr m7, m0, m3, 2 pmaddubsw m4, m6, [r3 + 3 * 32] ; [19] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 + 3 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0], m4 pmaddubsw m4, m6, [r3 - 14 * 32] ; [2] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 - 14 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1], m4 pmaddubsw m4, m8, [r3 + 1 * 32] ; [17] pmulhrsw m4, m5 pmaddubsw m1, m3, [r3 + 1 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1*2], m4 pand m8, [pw_00ff] pand m3, [pw_00ff] packuswb m8, m3 movu [r0 + r4], m8 RET cglobal intra_pred_ang32_16, 3,4,10 movu m0, [ang32_fact_mode16] movu m1, [ang32_fact_mode16 + mmsize] mova m2, [pw_1024] mova m7, [ang32_shuf_mode16] mova m8, [ang32_shuf_mode16 + mmsize] lea r3, [r1 * 3] ; prepare for [30, 29, 27, 26, 24, 23, 21, 20, 18, 17, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, -1, -2...] 
movu m6, [r2] pshufb m6, [ang32_shuf_mode16 + mmsize*2] mova m9, m6 mova m3, [ang32_shuf_mode16 + mmsize*3] vpermd m6, m3, m6 vpermq m9, m9, q3232 pslldq m9, 4 palignr m6, m9, 15 pslldq m9, 1 vbroadcasti128 m3, [r2 + mmsize*2 + 1] palignr m4, m3, m6, 1 palignr m5, m6, m9, 6 pshufb m4, m7 pshufb m5, m8 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0], m4 palignr m4, m3, m6, 2 palignr m5, m6, m9, 7 pshufb m4, m7 pshufb m5, m8 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r1], m4 palignr m4, m3, m6, 3 palignr m5, m6, m9, 8 pshufb m4, m7 pshufb m5, m8 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r1 * 2], m4 palignr m4, m3, m6, 4 palignr m5, m6, m9, 9 pshufb m4, m7 pshufb m5, m8 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r3], m4 lea r0, [r0 + r1 * 4] palignr m4, m3, m6, 5 palignr m5, m6, m9, 10 pshufb m4, m7 pshufb m5, m8 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0], m4 palignr m4, m3, m6, 6 palignr m5, m6, m9, 11 pshufb m4, m7 pshufb m5, m8 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r1], m4 palignr m4, m3, m6, 7 palignr m5, m6, m9, 12 pshufb m4, m7 pshufb m5, m8 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r1 * 2], m4 palignr m4, m3, m6, 8 palignr m5, m6, m9, 13 pshufb m4, m7 pshufb m5, m8 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r3], m4 lea r0, [r0 + r1 * 4] palignr m4, m3, m6, 9 palignr m5, m6, m9, 14 pshufb m4, m7 pshufb m5, m8 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, 
q3120 movu [r0], m4 palignr m4, m3, m6, 10 palignr m5, m6, m9, 15 pshufb m4, m7 pshufb m5, m8 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r1], m4 palignr m4, m3, m6, 11 pshufb m4, m7 pshufb m5, m6, m8 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r1 * 2], m4 palignr m4, m3, m6, 12 palignr m5, m3, m6, 1 pshufb m4, m7 pshufb m5, m8 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r3], m4 lea r0, [r0 + r1 * 4] palignr m4, m3, m6, 13 palignr m5, m3, m6, 2 pshufb m4, m7 pshufb m5, m8 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0], m4 palignr m4, m3, m6, 14 palignr m5, m3, m6, 3 pshufb m4, m7 pshufb m5, m8 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r1], m4 palignr m4, m3, m6, 15 palignr m5, m3, m6, 4 pshufb m4, m7 pshufb m5, m8 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r1 * 2], m4 palignr m5, m3, m6, 5 pshufb m4, m3, m7 pshufb m5, m8 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r3], m4 lea r0, [r0 + r1 * 4] vbroadcasti128 m9, [r2 + mmsize*2 + 17] palignr m4, m9, m3, 1 palignr m5, m3, m6, 6 pshufb m4, m7 pshufb m5, m8 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0], m4 palignr m4, m9, m3, 2 palignr m5, m3, m6, 7 pshufb m4, m7 pshufb m5, m8 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r1], m4 palignr m4, m9, m3, 3 palignr m5, m3, m6, 8 pshufb m4, m7 pshufb m5, m8 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu 
[r0 + r1 * 2], m4 palignr m4, m9, m3, 4 palignr m5, m3, m6, 9 pshufb m4, m7 pshufb m5, m8 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r3], m4 lea r0, [r0 + r1 * 4] palignr m4, m9, m3, 5 palignr m5, m3, m6, 10 pshufb m4, m7 pshufb m5, m8 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0], m4 palignr m4, m9, m3, 6 palignr m5, m3, m6, 11 pshufb m4, m7 pshufb m5, m8 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r1], m4 palignr m4, m9, m3, 7 palignr m5, m3, m6, 12 pshufb m4, m7 pshufb m5, m8 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r1 * 2], m4 palignr m4, m9, m3, 8 palignr m5, m3, m6, 13 pshufb m4, m7 pshufb m5, m8 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r3], m4 lea r0, [r0 + r1 * 4] palignr m4, m9, m3, 9 palignr m5, m3, m6, 14 pshufb m4, m7 pshufb m5, m8 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0], m4 palignr m4, m9, m3, 10 palignr m5, m3, m6, 15 pshufb m4, m7 pshufb m5, m8 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r1], m4 palignr m4, m9, m3, 11 pshufb m4, m7 pshufb m5, m3, m8 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r1 * 2], m4 palignr m4, m9, m3, 12 palignr m5, m9, m3, 1 pshufb m4, m7 pshufb m5, m8 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r3], m4 lea r0, [r0 + r1 * 4] palignr m4, m9, m3, 13 palignr m5, m9, m3, 2 pshufb m4, m7 pshufb m5, m8 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu 
[r0], m4 palignr m4, m9, m3, 14 palignr m5, m9, m3, 3 pshufb m4, m7 pshufb m5, m8 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r1], m4 palignr m4, m9, m3, 15 palignr m5, m9, m3, 4 pshufb m4, m7 pshufb m5, m8 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r1 * 2], m4 palignr m5, m9, m3, 5 pshufb m4, m9, m7 pshufb m5, m8 pmaddubsw m4, m0 pmaddubsw m5, m1 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r3], m4 RET cglobal intra_pred_ang32_20, 3,5,10 lea r3, [ang_table_avx2 + 32 * 16] lea r4, [r1 * 3] mova m5, [pw_1024] ; rows 0 to 7 movu m0, [r2 + 0] movu m1, [r2 + 1] punpckhbw m2, m0, m1 punpcklbw m0, m1 movu m4, [r2 + mmsize*2] pshufb m4, [ang32_shuf_mode20] mova m9, m4 vpermq m9, m9, q3333 mova m7, m4 vpermq m7, m7, q1111 palignr m4, m7, 14 pshufb m4, [ang32_shuf_mode20 + mmsize*1] vextracti128 xm6, m4, 1 palignr m3, m0, m4, 1 palignr m8, m3, m6, 1 vinserti128 m3, m3, xm2, 1 vinserti128 m8, m8, xm0, 1 vinserti128 m9, m9, xm3, 1 pmaddubsw m4, m0, [r3 - 5 * 32] ; [11] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 - 5 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0], m4 palignr m6, m0, m3, 14 palignr m7, m2, m0, 14 pmaddubsw m4, m6, [r3 + 6 * 32] ; [22] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 + 6 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1], m4 pmaddubsw m4, m6, [r3 - 15 * 32] ; [1] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 - 15 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1*2], m4 palignr m6, m0, m3, 12 palignr m7, m2, m0, 12 pmaddubsw m4, m6, [r3 - 4 * 32] ; [12] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 - 4 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r4], m4 lea r0, [r0 + r1 * 4] palignr m6, m0, m3, 10 palignr m7, m2, m0, 10 pmaddubsw m4, m6, [r3 + 7 * 32] ; [23] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 + 7 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0], m4 pmaddubsw m4, m6, [r3 - 14 * 32] ; [2] 
pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 - 14 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1], m4 palignr m6, m0, m3, 8 palignr m7, m2, m0, 8 pmaddubsw m4, m6, [r3 - 3 * 32] ; [13] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 - 3 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1*2], m4 palignr m6, m0, m3, 6 palignr m7, m2, m0, 6 pmaddubsw m4, m6, [r3 + 8 * 32] ; [24] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 + 8 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r4], m4 lea r0, [r0 + r1 * 4] ; rows 8 to 15 pmaddubsw m4, m6, [r3 - 13 * 32] ; [3] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 - 13 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0], m4 palignr m6, m0, m3, 4 palignr m7, m2, m0, 4 pmaddubsw m4, m6, [r3 - 2 * 32] ; [14] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 - 2 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1], m4 palignr m6, m0, m3, 2 palignr m7, m2, m0, 2 pmaddubsw m4, m6, [r3 + 9 * 32] ; [25] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 + 9 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1*2], m4 pmaddubsw m4, m6, [r3 - 12 * 32] ; [4] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 - 12 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r4], m4 lea r0, [r0 + r1 * 4] pmaddubsw m4, m3, [r3 - 1 * 32] ; [15] pmulhrsw m4, m5 pmaddubsw m1, m0, [r3 - 1 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0], m4 palignr m6, m3, m8, 14 palignr m7, m0, m3, 14 pmaddubsw m4, m6, [r3 + 10 * 32] ; [26] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 + 10 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1], m4 pmaddubsw m4, m6, [r3 - 11 * 32] ; [5] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 - 11 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1 * 2], m4 palignr m6, m3, m8, 12 palignr m7, m0, m3, 12 pmaddubsw m4, m6, [r3] ; [16] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r4], m4 lea r0, [r0 + r1 * 4] ; rows 16 to 23 palignr m6, m3, m8, 10 palignr m7, m0, m3, 10 pmaddubsw m4, m6, [r3 + 11 * 32] ; [27] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 + 11 * 32] pmulhrsw m1, m5 packuswb m4, m1 
movu [r0], m4 pmaddubsw m4, m6, [r3 - 10 * 32] ; [6] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 - 10 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1], m4 palignr m6, m3, m8, 8 palignr m7, m0, m3, 8 pmaddubsw m4, m6, [r3 + 1 * 32] ; [17] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 + 1 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1*2], m4 palignr m6, m3, m8, 6 palignr m7, m0, m3, 6 pmaddubsw m4, m6, [r3 + 12 * 32] ; [28] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 + 12 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r4], m4 lea r0, [r0 + r1 * 4] pmaddubsw m4, m6, [r3 - 9 * 32] ; [7] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 - 9 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0], m4 palignr m6, m3, m8, 4 palignr m7, m0, m3, 4 pmaddubsw m4, m6, [r3 + 2 * 32] ; [18] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 + 2 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1], m4 palignr m6, m3, m8, 2 palignr m7, m0, m3, 2 pmaddubsw m4, m6, [r3 + 13 * 32] ; [29] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 + 13 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1*2], m4 pmaddubsw m4, m6, [r3 - 8 * 32] ; [8] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 - 8 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r4], m4 lea r0, [r0 + r1 * 4] ; rows 24 to 31 pmaddubsw m4, m8, [r3 + 3 * 32] ; [19] pmulhrsw m4, m5 pmaddubsw m1, m3, [r3 + 3 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0], m4 palignr m6, m8, m9, 14 palignr m7, m3, m8, 14 pmaddubsw m4, m6, [r3 + 14 * 32] ; [30] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 + 14 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1], m4 pmaddubsw m4, m6, [r3 - 7 * 32] ; [9] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 - 7 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1 * 2], m4 palignr m6, m8, m9, 12 palignr m7, m3, m8, 12 pmaddubsw m4, m6, [r3 + 4 * 32] ; [20] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 + 4 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r4], m4 lea r0, [r0 + r1 * 4] palignr m6, m8, m9, 10 palignr m7, m3, m8, 10 pmaddubsw m4, m6, [r3 + 15 * 32] ; [31] pmulhrsw m4, m5 pmaddubsw m1, 
m7, [r3 + 15 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0], m4 pmaddubsw m4, m6, [r3 - 6 * 32] ; [10] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 - 6 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1], m4 palignr m6, m8, m9, 8 palignr m7, m3, m8, 8 pmaddubsw m4, m6, [r3 + 5 * 32] ; [21] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 + 5 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1*2], m4 pand m6, [pw_00ff] pand m7, [pw_00ff] packuswb m6, m7 movu [r0 + r4], m6 RET cglobal intra_pred_ang32_17, 3,4,8 movu m0, [ang32_fact_mode17] mova m2, [pw_1024] mova m7, [ang32_shuf_mode17] lea r3, [r1 * 3] ; prepare for [31, 30, 28, 27, 26, 25, 23, 22, 21, 20, 18, 17, 16, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, -1, -2...] movu m6, [r2] pshufb m6, [ang32_shuf_mode17 + mmsize] mova m1, m6 mova m3, [ang32_shuf_mode16 + mmsize*3] vpermd m6, m3, m6 vpermq m1, m1, q3232 pslldq m1, 4 movu xm4, [r2 + mmsize*2] pinsrb xm4, [r2], 0 vinserti128 m3, m4, xm4, 1 palignr m4, m3, m6, 2 palignr m5, m6, m1, 5 pshufb m4, m7 pshufb m5, m7 pmaddubsw m4, m0 pmaddubsw m5, m0 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0], m4 palignr m4, m3, m6, 3 palignr m5, m6, m1, 6 pshufb m4, m7 pshufb m5, m7 pmaddubsw m4, m0 pmaddubsw m5, m0 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r1], m4 palignr m4, m3, m6, 4 palignr m5, m6, m1, 7 pshufb m4, m7 pshufb m5, m7 pmaddubsw m4, m0 pmaddubsw m5, m0 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r1 * 2], m4 palignr m4, m3, m6, 5 palignr m5, m6, m1, 8 pshufb m4, m7 pshufb m5, m7 pmaddubsw m4, m0 pmaddubsw m5, m0 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r3], m4 lea r0, [r0 + r1 * 4] palignr m4, m3, m6, 6 palignr m5, m6, m1, 9 pshufb m4, m7 pshufb m5, m7 pmaddubsw m4, m0 pmaddubsw m5, m0 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0], m4 palignr m4, m3, m6, 7 palignr m5, m6, m1, 10 pshufb m4, m7 pshufb 
m5, m7 pmaddubsw m4, m0 pmaddubsw m5, m0 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r1], m4 palignr m4, m3, m6, 8 palignr m5, m6, m1, 11 pshufb m4, m7 pshufb m5, m7 pmaddubsw m4, m0 pmaddubsw m5, m0 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r1 * 2], m4 palignr m4, m3, m6, 9 palignr m5, m6, m1, 12 pshufb m4, m7 pshufb m5, m7 pmaddubsw m4, m0 pmaddubsw m5, m0 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r3], m4 lea r0, [r0 + r1 * 4] palignr m4, m3, m6, 10 palignr m5, m6, m1, 13 pshufb m4, m7 pshufb m5, m7 pmaddubsw m4, m0 pmaddubsw m5, m0 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0], m4 palignr m4, m3, m6, 11 palignr m5, m6, m1, 14 pshufb m4, m7 pshufb m5, m7 pmaddubsw m4, m0 pmaddubsw m5, m0 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r1], m4 palignr m4, m3, m6, 12 palignr m5, m6, m1, 15 pshufb m4, m7 pshufb m5, m7 pmaddubsw m4, m0 pmaddubsw m5, m0 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r1 * 2], m4 palignr m4, m3, m6, 13 pshufb m4, m7 pshufb m5, m6, m7 pmaddubsw m4, m0 pmaddubsw m5, m0 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r3], m4 lea r0, [r0 + r1 * 4] palignr m4, m3, m6, 14 palignr m5, m3, m6, 1 pshufb m4, m7 pshufb m5, m7 pmaddubsw m4, m0 pmaddubsw m5, m0 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0], m4 palignr m4, m3, m6, 15 palignr m5, m3, m6, 2 pshufb m4, m7 pshufb m5, m7 pmaddubsw m4, m0 pmaddubsw m5, m0 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r1], m4 palignr m5, m3, m6, 3 pshufb m4, m3, m7 pshufb m5, m7 pmaddubsw m4, m0 pmaddubsw m5, m0 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r1 * 2], m4 vbroadcasti128 m1, [r2 + mmsize*2 + 16] palignr m4, m1, m3, 1 palignr m5, m3, m6, 4 pshufb m4, m7 
pshufb m5, m7 pmaddubsw m4, m0 pmaddubsw m5, m0 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r3], m4 lea r0, [r0 + r1 * 4] palignr m4, m1, m3, 2 palignr m5, m3, m6, 5 pshufb m4, m7 pshufb m5, m7 pmaddubsw m4, m0 pmaddubsw m5, m0 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0], m4 palignr m4, m1, m3, 3 palignr m5, m3, m6, 6 pshufb m4, m7 pshufb m5, m7 pmaddubsw m4, m0 pmaddubsw m5, m0 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r1], m4 palignr m4, m1, m3, 4 palignr m5, m3, m6, 7 pshufb m4, m7 pshufb m5, m7 pmaddubsw m4, m0 pmaddubsw m5, m0 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r1 * 2], m4 palignr m4, m1, m3, 5 palignr m5, m3, m6, 8 pshufb m4, m7 pshufb m5, m7 pmaddubsw m4, m0 pmaddubsw m5, m0 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r3], m4 lea r0, [r0 + r1 * 4] palignr m4, m1, m3, 6 palignr m5, m3, m6, 9 pshufb m4, m7 pshufb m5, m7 pmaddubsw m4, m0 pmaddubsw m5, m0 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0], m4 palignr m4, m1, m3, 7 palignr m5, m3, m6, 10 pshufb m4, m7 pshufb m5, m7 pmaddubsw m4, m0 pmaddubsw m5, m0 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r1], m4 palignr m4, m1, m3, 8 palignr m5, m3, m6, 11 pshufb m4, m7 pshufb m5, m7 pmaddubsw m4, m0 pmaddubsw m5, m0 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r1 * 2], m4 palignr m4, m1, m3, 9 palignr m5, m3, m6, 12 pshufb m4, m7 pshufb m5, m7 pmaddubsw m4, m0 pmaddubsw m5, m0 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r3], m4 lea r0, [r0 + r1 * 4] palignr m4, m1, m3, 10 palignr m5, m3, m6, 13 pshufb m4, m7 pshufb m5, m7 pmaddubsw m4, m0 pmaddubsw m5, m0 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0], m4 palignr m4, m1, m3, 11 palignr m5, m3, m6, 14 pshufb 
m4, m7 pshufb m5, m7 pmaddubsw m4, m0 pmaddubsw m5, m0 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r1], m4 palignr m4, m1, m3, 12 palignr m5, m3, m6, 15 pshufb m4, m7 pshufb m5, m7 pmaddubsw m4, m0 pmaddubsw m5, m0 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r1 * 2], m4 palignr m4, m1, m3, 13 pshufb m4, m7 pshufb m5, m3, m7 pmaddubsw m4, m0 pmaddubsw m5, m0 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r3], m4 lea r0, [r0 + r1 * 4] palignr m4, m1, m3, 14 palignr m5, m1, m3, 1 pshufb m4, m7 pshufb m5, m7 pmaddubsw m4, m0 pmaddubsw m5, m0 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0], m4 palignr m4, m1, m3, 15 palignr m5, m1, m3, 2 pshufb m4, m7 pshufb m5, m7 pmaddubsw m4, m0 pmaddubsw m5, m0 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r1], m4 vbroadcasti128 m6, [r2 + mmsize*2 + mmsize] palignr m5, m1, m3, 3 pshufb m4, m1, m7 pshufb m5, m7 pmaddubsw m4, m0 pmaddubsw m5, m0 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r1 * 2], m4 palignr m4, m6, m1, 1 palignr m5, m1, m3, 4 pshufb m4, m7 pshufb m5, m7 pmaddubsw m4, m0 pmaddubsw m5, m0 pmulhrsw m4, m2 pmulhrsw m5, m2 packuswb m4, m5 vpermq m4, m4, q3120 movu [r0 + r3], m4 RET cglobal intra_pred_ang32_19, 3,5,10 lea r3, [ang_table_avx2 + 32 * 16] lea r4, [r1 * 3] mova m5, [pw_1024] ; rows 0 to 7 movu m0, [r2 + 0] movu m1, [r2 + 1] punpckhbw m2, m0, m1 punpcklbw m0, m1 movu m4, [r2 + mmsize*2] pshufb m4, [ang32_shuf_mode17 + mmsize*1] mova m3, [ang32_shuf_mode19 + mmsize*1] mova m6, [ang32_shuf_mode19 + mmsize*2] mova m9, m4 vpermd m4, m3, m4 vpermd m9, m6, m9 pshufb m4, [ang32_shuf_mode19] pshufb m9, [ang32_shuf_mode19] vextracti128 xm6, m4, 1 palignr m3, m0, m4, 1 palignr m8, m3, m6, 1 palignr m7, m8, m9, 1 vinserti128 m3, m3, xm2, 1 vinserti128 m8, m8, xm0, 1 vinserti128 m9, m7, xm3, 1 pmaddubsw m4, m0, [r3 - 10 
* 32] ; [6] pmulhrsw m4, m5 pmaddubsw m1, m2, [r3 - 10 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0], m4 palignr m6, m0, m3, 14 palignr m7, m2, m0, 14 pmaddubsw m4, m6, [r3 - 4 * 32] ; [12] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 - 4 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1], m4 palignr m6, m0, m3, 12 palignr m7, m2, m0, 12 pmaddubsw m4, m6, [r3 + 2 * 32] ; [18] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 + 2 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1*2], m4 palignr m6, m0, m3, 10 palignr m7, m2, m0, 10 pmaddubsw m4, m6, [r3 + 8 * 32] ; [24] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 + 8 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r4], m4 lea r0, [r0 + r1 * 4] palignr m6, m0, m3, 8 palignr m7, m2, m0, 8 pmaddubsw m4, m6, [r3 + 14 * 32] ; [30] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 + 14 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0], m4 pmaddubsw m4, m6, [r3 - 12 * 32] ; [4] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 - 12 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1], m4 palignr m6, m0, m3, 6 palignr m7, m2, m0, 6 pmaddubsw m4, m6, [r3 - 6 * 32] ; [10] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 - 6 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1*2], m4 palignr m6, m0, m3, 4 palignr m7, m2, m0, 4 pmaddubsw m4, m6, [r3] ; [16] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r4], m4 lea r0, [r0 + r1 * 4] ; rows 8 to 15 palignr m6, m0, m3, 2 palignr m7, m2, m0, 2 pmaddubsw m4, m6, [r3 + 6 * 32] ; [22] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 + 6 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0], m4 pmaddubsw m4, m3, [r3 + 12 * 32] ; [28] pmulhrsw m4, m5 pmaddubsw m1, m0, [r3 + 12 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1], m4 pmaddubsw m4, m3, [r3 - 14 * 32] ; [2] pmulhrsw m4, m5 pmaddubsw m1, m0, [r3 - 14 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1*2], m4 palignr m6, m3, m8, 14 palignr m7, m0, m3, 14 pmaddubsw m4, m6, [r3 - 8 * 32] ; [8] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 - 8 * 32] pmulhrsw m1, m5 
packuswb m4, m1 movu [r0 + r4], m4 lea r0, [r0 + r1 * 4] palignr m6, m3, m8, 12 palignr m7, m0, m3, 12 pmaddubsw m4, m6, [r3 - 2 * 32] ; [14] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 - 2 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0], m4 palignr m6, m3, m8, 10 palignr m7, m0, m3, 10 pmaddubsw m4, m6, [r3 + 4 * 32] ; [20] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 + 4 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1], m4 palignr m6, m3, m8, 8 palignr m7, m0, m3, 8 pmaddubsw m4, m6, [r3 + 10 * 32] ; [26] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 + 10 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1 * 2], m4 pand m6, [pw_00ff] pand m7, [pw_00ff] packuswb m6, m7 movu [r0 + r4], m6 lea r0, [r0 + r1 * 4] ; rows 16 to 23 palignr m6, m3, m8, 6 palignr m7, m0, m3, 6 pmaddubsw m4, m6, [r3 - 10 * 32] ; [6] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 - 10 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0], m4 palignr m6, m3, m8, 4 palignr m7, m0, m3, 4 pmaddubsw m4, m6, [r3 - 4 * 32] ; [12] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 - 4 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1], m4 palignr m6, m3, m8, 2 palignr m7, m0, m3, 2 pmaddubsw m4, m6, [r3 + 2 * 32] ; [18] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 + 2 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1*2], m4 pmaddubsw m4, m8, [r3 + 8 * 32] ; [24] pmulhrsw m4, m5 pmaddubsw m1, m3, [r3 + 8 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r4], m4 lea r0, [r0 + r1 * 4] palignr m6, m8, m9, 14 palignr m7, m3, m8, 14 pmaddubsw m4, m6, [r3 + 14 * 32] ; [30] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 + 14 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0], m4 pmaddubsw m4, m6, [r3 - 12 * 32] ; [4] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 - 12 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1], m4 palignr m6, m8, m9, 12 palignr m7, m3, m8, 12 pmaddubsw m4, m6, [r3 - 6 * 32] ; [10] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 - 6 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1*2], m4 palignr m6, m8, m9, 10 palignr m7, m3, m8, 10 pmaddubsw m4, m6, [r3] ; 
[16] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r4], m4 lea r0, [r0 + r1 * 4] ; rows 24 to 31 palignr m6, m8, m9, 8 palignr m7, m3, m8, 8 pmaddubsw m4, m6, [r3 + 6 * 32] ; [22] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 + 6 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0], m4 palignr m6, m8, m9, 6 palignr m7, m3, m8, 6 pmaddubsw m4, m6, [r3 + 12 * 32] ; [28] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 + 12 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1], m4 pmaddubsw m4, m6, [r3 - 14 * 32] ; [2] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 - 14 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1*2], m4 palignr m6, m8, m9, 4 palignr m7, m3, m8, 4 pmaddubsw m4, m6, [r3 - 8 * 32] ; [8] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 - 8 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r4], m4 lea r0, [r0 + r1 * 4] vpbroadcastb m0, [r2 + mmsize*2 + 31] palignr m1, m9, m0, 1 vinserti128 m0, m1, xm8, 1 palignr m6, m8, m9, 2 palignr m7, m3, m8, 2 pmaddubsw m4, m6, [r3 - 2 * 32] ; [14] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 - 2 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0], m4 pmaddubsw m4, m9, [r3 + 4 * 32] ; [20] pmulhrsw m4, m5 pmaddubsw m1, m8, [r3 + 4 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1], m4 palignr m6, m9, m0, 14 palignr m7, m8, m9, 14 pmaddubsw m4, m6, [r3 + 10 * 32] ; [26] pmulhrsw m4, m5 pmaddubsw m1, m7, [r3 + 10 * 32] pmulhrsw m1, m5 packuswb m4, m1 movu [r0 + r1 * 2], m4 pand m6, [pw_00ff] pand m7, [pw_00ff] packuswb m6, m7 movu [r0 + r4], m6 RET %endif ; ARCH_X86_64 ;----------------------------------------------------------------------------------------- ; end of intra_pred_ang32 angular modes avx2 asm ;----------------------------------------------------------------------------------------- ;----------------------------------------------------------------------------------------- ; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter) 
;----------------------------------------------------------------------------------------- INIT_YMM avx2 %macro ang8_store8x8 0 lea r3, [3 * r1] vextracti128 xm2, m1, 1 vextracti128 xm5, m4, 1 movq [r0], xm1 movq [r0 + r1], xm2 movhps [r0 + 2 * r1], xm1 movhps [r0 + r3], xm2 lea r0, [r0 + 4 * r1] movq [r0], xm4 movq [r0 + r1], xm5 movhps [r0 + 2 * r1], xm4 movhps [r0 + r3], xm5 %endmacro cglobal intra_pred_ang8_3, 3,4,6 vbroadcasti128 m0, [r2 + 17] mova m5, [ang8_shuf_mode3] mova m3, [pb_2] pshufb m1, m0, m5 paddb m5, m3 pshufb m2, m0, m5 paddb m5, m3 pshufb m4, m0, m5 paddb m5, m3 pshufb m0, m5 vbroadcasti128 m5, [ang8_fact_mode3] mova m3, [pw_1024] pmaddubsw m1, m5 pmaddubsw m2, m5 pmaddubsw m4, m5 pmaddubsw m0, m5 pmulhrsw m1, m3 pmulhrsw m2, m3 pmulhrsw m4, m3 pmulhrsw m0, m3 packuswb m1, m2 packuswb m4, m0 ang8_store8x8 RET INIT_YMM avx2 cglobal intra_pred_ang8_33, 3,4,5 mova m3, [pw_1024] vbroadcasti128 m0, [r2 + 1] pshufb m1, m0, [c_ang8_src1_9_2_10] pshufb m2, m0, [c_ang8_src3_11_4_12] pshufb m4, m0, [c_ang8_src5_13_5_13] pshufb m0, [c_ang8_src6_14_7_15] pmaddubsw m1, [c_ang8_26_20] pmulhrsw m1, m3 pmaddubsw m2, [c_ang8_14_8] pmulhrsw m2, m3 pmaddubsw m4, [c_ang8_2_28] pmulhrsw m4, m3 pmaddubsw m0, [c_ang8_22_16] pmulhrsw m0, m3 packuswb m1, m2 packuswb m4, m0 lea r3, [3 * r1] movq [r0], xm1 vextracti128 xm2, m1, 1 movq [r0 + r1], xm2 movhps [r0 + 2 * r1], xm1 movhps [r0 + r3], xm2 lea r0, [r0 + 4 * r1] movq [r0], xm4 vextracti128 xm2, m4, 1 movq [r0 + r1], xm2 movhps [r0 + 2 * r1], xm4 movhps [r0 + r3], xm2 RET INIT_YMM avx2 cglobal intra_pred_ang8_4, 3,4,6 vbroadcasti128 m0, [r2 + 17] mova m5, [ang8_shuf_mode4] mova m3, [pb_2] pshufb m1, m0, m5 paddb m5, m3 pshufb m2, m0, m5 paddb m5, m3 pshufb m4, m0, m5 paddb m5, m3 pshufb m0, m5 vbroadcasti128 m5, [ang8_fact_mode4] mova m3, [pw_1024] pmaddubsw m1, m5 pmaddubsw m2, m5 pmaddubsw m4, m5 pmaddubsw m0, m5 pmulhrsw m1, m3 pmulhrsw m2, m3 pmulhrsw m4, m3 pmulhrsw m0, m3 packuswb m1, m2 packuswb m4, m0 
ang8_store8x8
    RET

INIT_YMM avx2
; intra_pred_ang8_32: r0 = dst, r1 = dstStride, r2 = src.
; One shuffle table and one multiplier table per pair of rows; 8x8 store is inline.
cglobal intra_pred_ang8_32, 3,4,5
    mova            m3, [pw_1024]               ; pmulhrsw by 1024 == (x + 16) >> 5, assuming pw_1024 is words of 1024
    vbroadcasti128  m0, [r2 + 1]                ; 16 bytes from src + 1 in both lanes
    pshufb          m1, m0, [c_ang8_src1_9_2_10]
    pshufb          m2, m0, [c_ang8_src2_10_3_11]
    pshufb          m4, m0, [c_ang8_src4_12_4_12]
    pshufb          m0, [c_ang8_src5_13_6_14]
    pmaddubsw       m1, [c_ang8_21_10]
    pmulhrsw        m1, m3
    pmaddubsw       m2, [c_ang8_31_20]
    pmulhrsw        m2, m3
    pmaddubsw       m4, [c_ang8_9_30]
    pmulhrsw        m4, m3
    pmaddubsw       m0, [c_ang8_19_8]
    pmulhrsw        m0, m3
    packuswb        m1, m2
    packuswb        m4, m0
    lea             r3, [3 * r1]
    movq            [r0], xm1
    vextracti128    xm2, m1, 1
    movq            [r0 + r1], xm2
    movhps          [r0 + 2 * r1], xm1
    movhps          [r0 + r3], xm2
    lea             r0, [r0 + 4 * r1]
    movq            [r0], xm4
    vextracti128    xm2, m4, 1
    movq            [r0 + r1], xm2
    movhps          [r0 + 2 * r1], xm4
    movhps          [r0 + r3], xm2
    RET

INIT_YMM avx2
; intra_pred_ang8_5: r0 = dst, r1 = dstStride, r2 = src.
; Shuffle control starts at ang8_shuf_mode5 and is stepped by pb_2 between the four
; shuffles; all four products use the single ang8_fact_mode5 multiplier vector.
cglobal intra_pred_ang8_5, 3, 4, 6
    vbroadcasti128  m0, [r2 + 17]               ; 16 bytes from src + 17 in both lanes
    mova            m5, [ang8_shuf_mode5]
    mova            m3, [pb_2]
    pshufb          m1, m0, m5
    paddb           m5, m3
    pshufb          m2, m0, m5
    paddb           m5, m3
    pshufb          m4, m0, m5
    paddb           m5, m3
    pshufb          m0, m5
    vbroadcasti128  m5, [ang8_fact_mode5]
    mova            m3, [pw_1024]
    pmaddubsw       m1, m5
    pmaddubsw       m2, m5
    pmaddubsw       m4, m5
    pmaddubsw       m0, m5
    pmulhrsw        m1, m3
    pmulhrsw        m2, m3
    pmulhrsw        m4, m3
    pmulhrsw        m0, m3
    packuswb        m1, m2
    packuswb        m4, m0
    ang8_store8x8                               ; clobbers r3, xm2, xm5
    RET

INIT_YMM avx2
; intra_pred_ang8_31: r0 = dst, r1 = dstStride, r2 = src.
; Same structure as intra_pred_ang8_32 with a different set of tables.
cglobal intra_pred_ang8_31, 3, 4, 5
    mova            m3, [pw_1024]
    vbroadcasti128  m0, [r2 + 1]
    pshufb          m1, m0, [c_ang8_src1_9_2_10]
    pshufb          m2, m0, [c_ang8_src2_10_3_11]
    pshufb          m4, m0, [c_ang8_src3_11_4_12]
    pshufb          m0, [c_ang8_src4_12_5_13]
    pmaddubsw       m1, [c_ang8_17_2]
    pmulhrsw        m1, m3
    pmaddubsw       m2, [c_ang8_19_4]
    pmulhrsw        m2, m3
    pmaddubsw       m4, [c_ang8_21_6]
    pmulhrsw        m4, m3
    pmaddubsw       m0, [c_ang8_23_8]
    pmulhrsw        m0, m3
    packuswb        m1, m2
    packuswb        m4, m0
    lea             r3, [3 * r1]
    movq            [r0], xm1
    vextracti128    xm2, m1, 1
    movq            [r0 + r1], xm2
    movhps          [r0 + 2 * r1], xm1
    movhps          [r0 + r3], xm2
    lea             r0, [r0 + 4 * r1]
    movq            [r0], xm4
    vextracti128    xm2, m4, 1
    movq            [r0 + r1], xm2
    movhps          [r0 + 2 * r1], xm4
    movhps          [r0 + r3], xm2
    RET

INIT_YMM avx2
; intra_pred_ang8_6: same instruction pattern as intra_pred_ang8_5 with the mode-6 tables.
cglobal intra_pred_ang8_6, 3, 4, 6
    vbroadcasti128  m0, [r2 + 17]
    mova            m5, [ang8_shuf_mode6]
mova            m3, [pb_2]
    pshufb          m1, m0, m5
    paddb           m5, m3
    pshufb          m2, m0, m5
    paddb           m5, m3
    pshufb          m4, m0, m5
    paddb           m5, m3
    pshufb          m0, m5
    vbroadcasti128  m5, [ang8_fact_mode6]
    mova            m3, [pw_1024]               ; pmulhrsw by 1024 == (x + 16) >> 5, assuming pw_1024 is words of 1024
    pmaddubsw       m1, m5
    pmaddubsw       m2, m5
    pmaddubsw       m4, m5
    pmaddubsw       m0, m5
    pmulhrsw        m1, m3
    pmulhrsw        m2, m3
    pmulhrsw        m4, m3
    pmulhrsw        m0, m3
    packuswb        m1, m2
    packuswb        m4, m0
    ang8_store8x8
    RET

INIT_YMM avx2
; intra_pred_ang8_30: r0 = dst, r1 = dstStride, r2 = src.
; One shuffle table and one multiplier table per pair of rows; 8x8 store is inline.
cglobal intra_pred_ang8_30, 3, 4, 5
    mova            m3, [pw_1024]
    vbroadcasti128  m0, [r2 + 1]                ; 16 bytes from src + 1 in both lanes
    pshufb          m1, m0, [intra_pred_shuff_0_8]
    pshufb          m2, m0, [c_ang8_src2_10_2_10]
    pshufb          m4, m0, [c_ang8_src3_11_3_11]
    pshufb          m0, [c_ang8_src3_11_4_12]
    pmaddubsw       m1, [c_ang8_13_26]
    pmulhrsw        m1, m3
    pmaddubsw       m2, [c_ang8_7_20]
    pmulhrsw        m2, m3
    pmaddubsw       m4, [c_ang8_1_14]
    pmulhrsw        m4, m3
    pmaddubsw       m0, [c_ang8_27_8]
    pmulhrsw        m0, m3
    packuswb        m1, m2
    packuswb        m4, m0
    lea             r3, [3 * r1]
    movq            [r0], xm1
    vextracti128    xm2, m1, 1
    movq            [r0 + r1], xm2
    movhps          [r0 + 2 * r1], xm1
    movhps          [r0 + r3], xm2
    lea             r0, [r0 + 4 * r1]
    movq            [r0], xm4
    vextracti128    xm2, m4, 1
    movq            [r0 + r1], xm2
    movhps          [r0 + 2 * r1], xm4
    movhps          [r0 + r3], xm2
    RET

INIT_YMM avx2
; intra_pred_ang8_9: r0 = dst, r1 = dstStride, r2 = src.
; Only r0-r3 are touched (r3 inside ang8_store8x8), so four GPRs are declared,
; matching the sibling mode 3..8 functions that share this instruction pattern.
cglobal intra_pred_ang8_9, 3, 4, 6
    vbroadcasti128  m0, [r2 + 17]               ; 16 bytes from src + 17 in both lanes
    mova            m5, [ang8_shuf_mode9]
    mova            m3, [pb_2]                  ; increment applied to the shuffle control
    pshufb          m1, m0, m5
    paddb           m5, m3
    pshufb          m2, m0, m5
    paddb           m5, m3
    pshufb          m4, m0, m5
    paddb           m5, m3
    pshufb          m0, m5
    vbroadcasti128  m5, [ang8_fact_mode9]
    mova            m3, [pw_1024]
    pmaddubsw       m1, m5
    pmaddubsw       m2, m5
    pmaddubsw       m4, m5
    pmaddubsw       m0, m5
    pmulhrsw        m1, m3
    pmulhrsw        m2, m3
    pmulhrsw        m4, m3
    pmulhrsw        m0, m3
    packuswb        m1, m2
    packuswb        m4, m0
    ang8_store8x8
    RET

INIT_YMM avx2
; intra_pred_ang8_27: r0 = dst, r1 = dstStride, r2 = src.
; A single shuffle of the source is multiplied by four consecutive 32-byte
; multiplier rows of c_ang8_mode_27 (addressed through r4).
cglobal intra_pred_ang8_27, 3, 5, 5
    mova            m3, [pw_1024]
    vbroadcasti128  m0, [r2 + 1]
    pshufb          m0, [intra_pred_shuff_0_8]
    lea             r4, [c_ang8_mode_27]
    pmaddubsw       m1, m0, [r4]
    pmulhrsw        m1, m3
    pmaddubsw       m2, m0, [r4 + mmsize]
    pmulhrsw        m2, m3
    pmaddubsw       m4, m0, [r4 + 2 * mmsize]
    pmulhrsw        m4, m3
    pmaddubsw       m0, [r4 + 3 * mmsize]
    pmulhrsw        m0, m3
    packuswb        m1, m2
    packuswb        m4, m0
    lea             r3, [3 * r1]
    movq            [r0], xm1
    vextracti128    xm2, m1, 1
    movq            [r0 + r1], xm2
    movhps          [r0 +
2 * r1], xm1
    movhps          [r0 + r3], xm2
    lea             r0, [r0 + 4 * r1]
    movq            [r0], xm4
    vextracti128    xm2, m4, 1
    movq            [r0 + r1], xm2
    movhps          [r0 + 2 * r1], xm4
    movhps          [r0 + r3], xm2
    RET

INIT_YMM avx2
; intra_pred_ang8_25: r0 = dst, r1 = dstStride, r2 = src.
; Reads 16 bytes starting at src + 0, shuffles them once, then multiplies by four
; consecutive 32-byte rows of c_ang8_mode_25 (addressed through r4).
cglobal intra_pred_ang8_25, 3, 5, 5
    mova            m3, [pw_1024]               ; pmulhrsw by 1024 == (x + 16) >> 5, assuming pw_1024 is words of 1024
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred_shuff_0_8]
    lea             r4, [c_ang8_mode_25]
    pmaddubsw       m1, m0, [r4]
    pmulhrsw        m1, m3
    pmaddubsw       m2, m0, [r4 + mmsize]
    pmulhrsw        m2, m3
    pmaddubsw       m4, m0, [r4 + 2 * mmsize]
    pmulhrsw        m4, m3
    pmaddubsw       m0, [r4 + 3 * mmsize]
    pmulhrsw        m0, m3
    packuswb        m1, m2
    packuswb        m4, m0
    lea             r3, [3 * r1]
    movq            [r0], xm1
    vextracti128    xm2, m1, 1
    movq            [r0 + r1], xm2
    movhps          [r0 + 2 * r1], xm1
    movhps          [r0 + r3], xm2
    lea             r0, [r0 + 4 * r1]
    movq            [r0], xm4
    vextracti128    xm2, m4, 1
    movq            [r0 + r1], xm2
    movhps          [r0 + 2 * r1], xm4
    movhps          [r0 + r3], xm2
    RET

INIT_YMM avx2
; intra_pred_ang8_7: r0 = dst, r1 = dstStride, r2 = src.
; Shuffle control starts at ang8_shuf_mode7 and is stepped by pb_2 between the four
; shuffles; all four products use the single ang8_fact_mode7 multiplier vector.
cglobal intra_pred_ang8_7, 3, 4, 6
    vbroadcasti128  m0, [r2 + 17]               ; 16 bytes from src + 17 in both lanes
    mova            m5, [ang8_shuf_mode7]
    mova            m3, [pb_2]
    pshufb          m1, m0, m5
    paddb           m5, m3
    pshufb          m2, m0, m5
    paddb           m5, m3
    pshufb          m4, m0, m5
    paddb           m5, m3
    pshufb          m0, m5
    vbroadcasti128  m5, [ang8_fact_mode7]
    mova            m3, [pw_1024]
    pmaddubsw       m1, m5
    pmaddubsw       m2, m5
    pmaddubsw       m4, m5
    pmaddubsw       m0, m5
    pmulhrsw        m1, m3
    pmulhrsw        m2, m3
    pmulhrsw        m4, m3
    pmulhrsw        m0, m3
    packuswb        m1, m2
    packuswb        m4, m0
    ang8_store8x8                               ; clobbers r3, xm2, xm5
    RET

INIT_YMM avx2
; intra_pred_ang8_29: r0 = dst, r1 = dstStride, r2 = src.
; One shuffle table and one multiplier table per pair of rows; 8x8 store is inline.
cglobal intra_pred_ang8_29, 3, 4, 5
    mova            m3, [pw_1024]
    vbroadcasti128  m0, [r2 + 1]                ; 16 bytes from src + 1 in both lanes
    pshufb          m1, m0, [intra_pred_shuff_0_8]
    pshufb          m2, m0, [c_ang8_src1_9_2_10]
    pshufb          m4, m0, [c_ang8_src2_10_2_10]
    pshufb          m0, [c_ang8_src2_10_3_11]
    pmaddubsw       m1, [c_ang8_9_18]
    pmulhrsw        m1, m3
    pmaddubsw       m2, [c_ang8_27_4]
    pmulhrsw        m2, m3
    pmaddubsw       m4, [c_ang8_13_22]
    pmulhrsw        m4, m3
    pmaddubsw       m0, [c_ang8_31_8]
    pmulhrsw        m0, m3
    packuswb        m1, m2
    packuswb        m4, m0
    lea             r3, [3 * r1]
    movq            [r0], xm1
    vextracti128    xm2, m1, 1
    movq            [r0 + r1], xm2
    movhps          [r0 + 2 * r1], xm1
    movhps          [r0 + r3], xm2
    lea             r0, [r0 + 4 * r1]
    movq            [r0], xm4
    vextracti128    xm2, m4, 1
    movq            [r0 + r1], xm2
    movhps          [r0 + 2 * r1], xm4
    movhps          [r0 + r3], xm2
    RET

INIT_YMM avx2
cglobal
intra_pred_ang8_8, 3, 4, 6
    ; intra_pred_ang8_8: r0 = dst, r1 = dstStride, r2 = src.
    ; Shuffle control is stepped by pb_2 between the four shuffles; all four
    ; products use the single ang8_fact_mode8 multiplier vector.
    vbroadcasti128  m0, [r2 + 17]               ; 16 bytes from src + 17 in both lanes
    mova            m5, [ang8_shuf_mode8]
    mova            m3, [pb_2]
    pshufb          m1, m0, m5
    paddb           m5, m3
    pshufb          m2, m0, m5
    paddb           m5, m3
    pshufb          m4, m0, m5
    paddb           m5, m3
    pshufb          m0, m5
    vbroadcasti128  m5, [ang8_fact_mode8]
    mova            m3, [pw_1024]               ; pmulhrsw by 1024 == (x + 16) >> 5, assuming pw_1024 is words of 1024
    pmaddubsw       m1, m5
    pmaddubsw       m2, m5
    pmaddubsw       m4, m5
    pmaddubsw       m0, m5
    pmulhrsw        m1, m3
    pmulhrsw        m2, m3
    pmulhrsw        m4, m3
    pmulhrsw        m0, m3
    packuswb        m1, m2
    packuswb        m4, m0
    ang8_store8x8
    RET

INIT_YMM avx2
; intra_pred_ang8_28: r0 = dst, r1 = dstStride, r2 = src.
; The first three row pairs share one shuffle (intra_pred_shuff_0_8); only the
; last pair uses a different source arrangement. 8x8 store is inline.
cglobal intra_pred_ang8_28, 3, 4, 6
    mova            m3, [pw_1024]
    vbroadcasti128  m0, [r2 + 1]                ; 16 bytes from src + 1 in both lanes
    mova            m5, [intra_pred_shuff_0_8]
    pshufb          m1, m0, m5
    pshufb          m2, m0, m5
    pshufb          m4, m0, m5
    pshufb          m0, [c_ang8_src2_10_2_10]
    pmaddubsw       m1, [c_ang8_5_10]
    pmulhrsw        m1, m3
    pmaddubsw       m2, [c_ang8_15_20]
    pmulhrsw        m2, m3
    pmaddubsw       m4, [c_ang8_25_30]
    pmulhrsw        m4, m3
    pmaddubsw       m0, [c_ang8_3_8]
    pmulhrsw        m0, m3
    packuswb        m1, m2
    packuswb        m4, m0
    lea             r3, [3 * r1]
    movq            [r0], xm1
    vextracti128    xm2, m1, 1
    movq            [r0 + r1], xm2
    movhps          [r0 + 2 * r1], xm1
    movhps          [r0 + r3], xm2
    lea             r0, [r0 + 4 * r1]
    movq            [r0], xm4
    vextracti128    xm2, m4, 1
    movq            [r0 + r1], xm2
    movhps          [r0 + 2 * r1], xm4
    movhps          [r0 + r3], xm2
    RET

INIT_YMM avx2
; intra_pred_ang8_11: r0 = dst, r1 = dstStride, r2 = src.
; Source vector is the 16 bytes at src + 16 with byte 0 replaced by src[0].
; It reuses the mode-9 shuffle table with the mode-11 multiplier table.
; Only r0-r3 are touched, so four GPRs are declared. The rounding constant is
; loaded once, right before the multiplies (m3 holds pb_2 until then).
cglobal intra_pred_ang8_11, 3, 4, 6
    movu            xm1, [r2 + 16]
    pinsrb          xm1, [r2], 0                ; byte 0 <- src[0]
    vinserti128     m0, m1, xm1, 1              ; duplicate into both lanes
    mova            m5, [ang8_shuf_mode9]
    mova            m3, [pb_2]                  ; increment applied to the shuffle control
    pshufb          m1, m0, m5
    paddb           m5, m3
    pshufb          m2, m0, m5
    paddb           m5, m3
    pshufb          m4, m0, m5
    paddb           m5, m3
    pshufb          m0, m5
    vbroadcasti128  m5, [ang8_fact_mode11]
    mova            m3, [pw_1024]
    pmaddubsw       m1, m5
    pmaddubsw       m2, m5
    pmaddubsw       m4, m5
    pmaddubsw       m0, m5
    pmulhrsw        m1, m3
    pmulhrsw        m2, m3
    pmulhrsw        m4, m3
    pmulhrsw        m0, m3
    packuswb        m1, m2
    packuswb        m4, m0
    ang8_store8x8
    RET

INIT_YMM avx2
; intra_pred_ang8_15: r0 = dst, r1 = dstStride, r2 = src.
; Builds the source vector from bytes at src + 17 and a shuffled copy of the bytes
; at src + 0 (joined with palignr), then walks the shuffle control downwards
; (psubb) between the four shuffles.
; Only r0-r3 are touched, so four GPRs are declared (as in ang8_16 / ang8_17).
cglobal intra_pred_ang8_15, 3, 4, 6
    vbroadcasti128  m1, [r2 + 17]
    vbroadcasti128  m2, [r2]
    mova            m3, [ang8_shuf_mode15 + mmsize]
    pshufb          m2, m3
    palignr         m1, m2, 11
    mova            m5, [ang8_shuf_mode15]
    mova            m3, [pb_2]
    pshufb          m0, m1, m5
    psubb           m5, m3
    pshufb          m4, m1, m5
    psubb           m5, m3
    pshufb          m2, m1, m5
    psubb           m5, m3
    pshufb          m1, m5
    vbroadcasti128  m5,
[ang8_fact_mode15]
    mova            m3, [pw_1024]               ; pmulhrsw by 1024 == (x + 16) >> 5, assuming pw_1024 is words of 1024
    pmaddubsw       m1, m5
    pmaddubsw       m2, m5
    pmaddubsw       m4, m5
    pmaddubsw       m0, m5
    pmulhrsw        m1, m3
    pmulhrsw        m2, m3
    pmulhrsw        m4, m3
    pmulhrsw        m0, m3
    packuswb        m1, m2
    packuswb        m4, m0
    ang8_store8x8
    RET

INIT_YMM avx2
; intra_pred_ang8_16: r0 = dst, r1 = dstStride, r2 = src.
; Source vector = bytes at src + 17 joined (palignr 10) with a shuffled copy of the
; bytes at src + 0; the shuffle control is then stepped down by pb_2 (psubb)
; between the four shuffles.
cglobal intra_pred_ang8_16, 3,4,6
    vbroadcasti128  m1, [r2 + 17]
    vbroadcasti128  m2, [r2]
    mova            m3, [ang8_shuf_mode16 + mmsize] ; second 32-byte row of the table
    pshufb          m2, m3
    palignr         m1, m2, 10
    mova            m5, [ang8_shuf_mode16]
    mova            m3, [pb_2]
    pshufb          m0, m1, m5
    psubb           m5, m3
    pshufb          m4, m1, m5
    psubb           m5, m3
    pshufb          m2, m1, m5
    psubb           m5, m3
    pshufb          m1, m5
    vbroadcasti128  m5, [ang8_fact_mode16]
    mova            m3, [pw_1024]
    pmaddubsw       m1, m5
    pmaddubsw       m2, m5
    pmaddubsw       m4, m5
    pmaddubsw       m0, m5
    pmulhrsw        m1, m3
    pmulhrsw        m2, m3
    pmulhrsw        m4, m3
    pmulhrsw        m0, m3
    packuswb        m1, m2
    packuswb        m4, m0
    ang8_store8x8
    RET

INIT_YMM avx2
; intra_pred_ang8_17: same instruction pattern as intra_pred_ang8_16 with the
; mode-17 tables and a palignr offset of 9.
cglobal intra_pred_ang8_17, 3,4,6
    vbroadcasti128  m1, [r2 + 17]
    vbroadcasti128  m2, [r2]
    mova            m3, [ang8_shuf_mode17 + mmsize]
    pshufb          m2, m3
    palignr         m1, m2, 9
    mova            m5, [ang8_shuf_mode17]
    mova            m3, [pb_2]
    pshufb          m0, m1, m5
    psubb           m5, m3
    pshufb          m4, m1, m5
    psubb           m5, m3
    pshufb          m2, m1, m5
    psubb           m5, m3
    pshufb          m1, m5
    vbroadcasti128  m5, [ang8_fact_mode17]
    mova            m3, [pw_1024]
    pmaddubsw       m1, m5
    pmaddubsw       m2, m5
    pmaddubsw       m4, m5
    pmaddubsw       m0, m5
    pmulhrsw        m1, m3
    pmulhrsw        m2, m3
    pmulhrsw        m4, m3
    pmulhrsw        m0, m3
    packuswb        m1, m2
    packuswb        m4, m0
    ang8_store8x8
    RET

; Two implementations of intra_pred_ang8_20 follow; "%if 1" selects this first one.
%if 1
INIT_YMM avx2
; intra_pred_ang8_20: r0 = dst, r1 = dstStride, r2 = src.
; Rows are written bottom-up: r0 is moved to row 7 and the stride is negated.
cglobal intra_pred_ang8_20, 3,5,6
    lea             r0, [r0 + r1 * 8]
    sub             r0, r1                      ; r0 = dst + 7 * stride
    neg             r1                          ; walk upwards from the last row
    lea             r3, [angHor8_tab_20]
    lea             r4, [r1 * 3]
    movu            m5, [intra_pred_shuff_0_8 + 16]

    ; prepare reference pixel
    movq            xm1, [r2 + 1]               ; xm1 = [ 1 2 3 4 5 6 7 8 x x x x x x x x]
    movhps          xm1, [r2 + 16 + 2]          ; xm1 = [ 1 2 3 4 5 6 7 8 -2 -3 x -5 -6 x -8 x]
    palignr         xm1, xm1, [r2 - 15], 15     ; xm1 = [ 0 1 2 3 4 5 6 7 8 -2 -3 x -5 -6 x -8]
    pshufb          xm1, [c_ang8_mode_20]
    vinserti128     m1, m1, xm1, 1

    ; process 4 rows
    pshufb          m3, m1, m5
    psrldq          m1, 2
    pmaddubsw       m3, [r3 + 0 * 16]
    pmulhrsw        m3, [pw_1024]
    pshufb          m4, m1, [intra_pred_shuff_0_8]
    psrldq          m1, 1
    pmaddubsw       m4, [r3 + 2 * 16]
    pmulhrsw        m4,
[pw_1024]
    packuswb        m3, m4
    vextracti128    xm4, m3, 1
    movq            [r0], xm3
    movq            [r0 + r1], xm4
    movhps          [r0 + r1 * 2], xm3
    movhps          [r0 + r4], xm4

    ; process 4 rows
    lea             r0, [r0 + r1 * 4]
    pshufb          m3, m1, m5
    psrldq          m1, 1
    pmaddubsw       m3, [r3 + 4 * 16]
    pmulhrsw        m3, [pw_1024]
    pshufb          m4, m1, m5
    pmaddubsw       m4, [r3 + 6 * 16]
    pmulhrsw        m4, [pw_1024]
    packuswb        m3, m4
    vextracti128    xm4, m3, 1
    movq            [r0], xm3
    movq            [r0 + r1], xm4
    movhps          [r0 + r1 * 2], xm3
    movhps          [r0 + r4], xm4
    RET

%else
; Alternative intra_pred_ang8_20, not assembled while the enclosing condition is "%if 1".
; It shifts one new byte into xm5 per step (pslldq + pinsrb from src + 16 + k) and
; multiplies by the 32-byte rows of c_ang8_mode_20.
INIT_YMM avx2
cglobal intra_pred_ang8_20, 3, 6, 6
    mova            m3, [pw_1024]
    movu            xm5, [r2]
    lea             r5, [intra_pred_shuff_0_8]
    mova            xm0, xm5
    pslldq          xm5, 1
    pinsrb          xm5, [r2 + 2 + 16], 0
    vinserti128     m0, m0, xm5, 1
    pshufb          m0, [r5]
    lea             r4, [c_ang8_mode_20]
    pmaddubsw       m1, m0, [r4]
    pmulhrsw        m1, m3
    mova            xm0, xm5
    pslldq          xm5, 1
    pinsrb          xm5, [r2 + 3 + 16], 0
    vinserti128     m0, m0, xm5, 1
    pshufb          m0, [r5]
    pmaddubsw       m2, m0, [r4 + mmsize]
    pmulhrsw        m2, m3
    pslldq          xm5, 1
    pinsrb          xm5, [r2 + 5 + 16], 0
    vinserti128     m0, m5, xm5, 1
    pshufb          m0, [r5]
    pmaddubsw       m4, m0, [r4 + 2 * mmsize]
    pmulhrsw        m4, m3
    pslldq          xm5, 1
    pinsrb          xm5, [r2 + 6 + 16], 0
    mova            xm0, xm5
    pslldq          xm5, 1
    pinsrb          xm5, [r2 + 8 + 16], 0
    vinserti128     m0, m0, xm5, 1
    pshufb          m0, [r5]
    pmaddubsw       m0, [r4 + 3 * mmsize]
    pmulhrsw        m0, m3
    packuswb        m1, m2
    packuswb        m4, m0
    lea             r3, [3 * r1]
    movq            [r0], xm1
    vextracti128    xm2, m1, 1
    movq            [r0 + r1], xm2
    movhps          [r0 + 2 * r1], xm1
    movhps          [r0 + r3], xm2
    lea             r0, [r0 + 4 * r1]
    movq            [r0], xm4
    vextracti128    xm2, m4, 1
    movq            [r0 + r1], xm2
    movhps          [r0 + 2 * r1], xm4
    movhps          [r0 + r3], xm2
    RET
%endif

INIT_YMM avx2
; intra_pred_ang8_21: r0 = dst, r1 = dstStride, r2 = src.
; xm5 starts as the 16 bytes at src + 0; before each later row a byte from
; src + 16 + k is shifted in at position 0 (pslldq + pinsrb). Each ymm holds two
; consecutive row sources (previous xm5 in the low lane, updated xm5 in the high lane).
; Multipliers come from c_ang8_mode_15 (addressed through r4).
cglobal intra_pred_ang8_21, 3, 6, 6
    mova            m3, [pw_1024]               ; pmulhrsw by 1024 == (x + 16) >> 5, assuming pw_1024 is words of 1024
    movu            xm5, [r2]
    lea             r5, [intra_pred_shuff_0_8]
    mova            xm0, xm5
    pslldq          xm5, 1
    pinsrb          xm5, [r2 + 2 + 16], 0
    vinserti128     m0, m0, xm5, 1
    pshufb          m0, [r5]
    lea             r4, [c_ang8_mode_15]
    pmaddubsw       m1, m0, [r4]
    pmulhrsw        m1, m3
    mova            xm0, xm5
    pslldq          xm5, 1
    pinsrb          xm5, [r2 + 4 + 16], 0
    vinserti128     m0, m0, xm5, 1
    pshufb          m0, [r5]
    pmaddubsw       m2, m0, [r4 + mmsize]
    pmulhrsw        m2, m3
    mova            xm0, xm5
    pslldq          xm5, 1
    pinsrb          xm5, [r2 + 6 + 16], 0
vinserti128     m0, m0, xm5, 1
    pshufb          m0, [r5]
    pmaddubsw       m4, m0, [r4 + 2 * mmsize]
    pmulhrsw        m4, m3
    mova            xm0, xm5
    pslldq          xm5, 1
    pinsrb          xm5, [r2 + 8 + 16], 0
    vinserti128     m0, m0, xm5, 1
    pshufb          m0, [r5]
    pmaddubsw       m0, [r4 + 3 * mmsize]
    pmulhrsw        m0, m3
    packuswb        m1, m2
    packuswb        m4, m0
    lea             r3, [3 * r1]
    movq            [r0], xm1
    vextracti128    xm2, m1, 1
    movq            [r0 + r1], xm2
    movhps          [r0 + 2 * r1], xm1
    movhps          [r0 + r3], xm2
    lea             r0, [r0 + 4 * r1]
    movq            [r0], xm4
    vextracti128    xm2, m4, 1
    movq            [r0 + r1], xm2
    movhps          [r0 + 2 * r1], xm4
    movhps          [r0 + r3], xm2
    RET

INIT_YMM avx2
; intra_pred_ang8_22: r0 = dst, r1 = dstStride, r2 = src.
; xm5 starts as the 16 bytes at src + 0; a byte from src + 16 + k is shifted in at
; position 0 (pslldq + pinsrb) before each later row pair. Multipliers come from
; the 32-byte rows of c_ang8_mode_14 (addressed through r4).
cglobal intra_pred_ang8_22, 3, 6, 6
    mova            m3, [pw_1024]               ; pmulhrsw by 1024 == (x + 16) >> 5, assuming pw_1024 is words of 1024
    movu            xm5, [r2]
    lea             r5, [intra_pred_shuff_0_8]
    vinserti128     m0, m5, xm5, 1
    pshufb          m0, [r5]
    lea             r4, [c_ang8_mode_14]
    pmaddubsw       m1, m0, [r4]
    pmulhrsw        m1, m3
    pslldq          xm5, 1
    pinsrb          xm5, [r2 + 2 + 16], 0
    vinserti128     m0, m5, xm5, 1
    pshufb          m0, [r5]
    pmaddubsw       m2, m0, [r4 + mmsize]
    pmulhrsw        m2, m3
    pslldq          xm5, 1
    pinsrb          xm5, [r2 + 5 + 16], 0
    vinserti128     m0, m5, xm5, 1
    pshufb          m0, [r5]
    pmaddubsw       m4, m0, [r4 + 2 * mmsize]
    pmulhrsw        m4, m3
    pslldq          xm5, 1
    pinsrb          xm5, [r2 + 7 + 16], 0
    pshufb          xm5, [r5]
    vinserti128     m0, m0, xm5, 1              ; low lane keeps the previous row source
    pmaddubsw       m0, [r4 + 3 * mmsize]
    pmulhrsw        m0, m3
    packuswb        m1, m2
    packuswb        m4, m0
    lea             r3, [3 * r1]
    movq            [r0], xm1
    vextracti128    xm2, m1, 1
    movq            [r0 + r1], xm2
    movhps          [r0 + 2 * r1], xm1
    movhps          [r0 + r3], xm2
    lea             r0, [r0 + 4 * r1]
    movq            [r0], xm4
    vextracti128    xm2, m4, 1
    movq            [r0 + r1], xm2
    movhps          [r0 + 2 * r1], xm4
    movhps          [r0 + r3], xm2
    RET

INIT_YMM avx2
; intra_pred_ang8_14: r0 = dst, r1 = dstStride, r2 = src.
; Source vector = 16 bytes at src + 13 with bytes 3..0 replaced by
; src[0], src[2], src[5], src[7]; it is duplicated into both lanes once, after
; all four inserts. Only r0-r3 are touched, so four GPRs are declared.
cglobal intra_pred_ang8_14, 3, 4, 6
    movu            xm1, [r2 + 13]
    pinsrb          xm1, [r2 + 0], 3
    pinsrb          xm1, [r2 + 2], 2
    pinsrb          xm1, [r2 + 5], 1
    pinsrb          xm1, [r2 + 7], 0
    vinserti128     m1, m1, xm1, 1              ; high lane <- patched low lane
    mova            m5, [ang8_shuf_mode14]
    mova            m3, [pb_2]                  ; decrement applied to the shuffle control
    pshufb          m0, m1, m5
    psubb           m5, m3
    pshufb          m4, m1, m5
    psubb           m5, m3
    pshufb          m2, m1, m5
    psubb           m5, m3
    pshufb          m1, m5
    vbroadcasti128  m5, [ang8_fact_mode14]
    mova            m3, [pw_1024]
    pmaddubsw       m1, m5
    pmaddubsw       m2, m5
    pmaddubsw       m4, m5
    pmaddubsw       m0, m5
    pmulhrsw        m1, m3
    pmulhrsw        m2, m3
    pmulhrsw        m4, m3
pmulhrsw        m0, m3
    packuswb        m1, m2
    packuswb        m4, m0
    ang8_store8x8
    RET

INIT_YMM avx2
; intra_pred_ang8_13: r0 = dst, r1 = dstStride, r2 = src.
; Source vector = 16 bytes at src + 14 with bytes 2..0 replaced by
; src[0], src[4], src[7], duplicated into both lanes.
; Only r0-r3 are touched (r3 inside ang8_store8x8), so four GPRs are declared.
cglobal intra_pred_ang8_13, 3, 4, 6
    movu            xm1, [r2 + 14]
    pinsrb          xm1, [r2 + 0], 2
    pinsrb          xm1, [r2 + 4], 1
    pinsrb          xm1, [r2 + 7], 0
    vinserti128     m1, m1, xm1, 1
    mova            m5, [ang8_shuf_mode13]
    mova            m3, [pb_2]                  ; decrement applied to the shuffle control
    pshufb          m0, m1, m5
    psubb           m5, m3
    pshufb          m4, m1, m5
    psubb           m5, m3
    pshufb          m2, m1, m5
    psubb           m5, m3
    pshufb          m1, m5
    vbroadcasti128  m5, [ang8_fact_mode13]
    mova            m3, [pw_1024]               ; pmulhrsw by 1024 == (x + 16) >> 5, assuming pw_1024 is words of 1024
    pmaddubsw       m1, m5
    pmaddubsw       m2, m5
    pmaddubsw       m4, m5
    pmaddubsw       m0, m5
    pmulhrsw        m1, m3
    pmulhrsw        m2, m3
    pmulhrsw        m4, m3
    pmulhrsw        m0, m3
    packuswb        m1, m2
    packuswb        m4, m0
    ang8_store8x8
    RET

INIT_YMM avx2
; intra_pred_ang8_23: r0 = dst, r1 = dstStride, r2 = src.
; xm5 starts as the 16 bytes at src + 0; a byte from src + 16 + k is shifted in at
; position 0 (pslldq + pinsrb) twice. Multipliers come from the 32-byte rows of
; c_ang8_mode_13 (addressed through r4). 8x8 store is inline.
cglobal intra_pred_ang8_23, 3, 6, 6
    mova            m3, [pw_1024]
    movu            xm5, [r2]
    lea             r5, [intra_pred_shuff_0_8]
    vinserti128     m0, m5, xm5, 1
    pshufb          m0, [r5]
    lea             r4, [c_ang8_mode_13]
    pmaddubsw       m1, m0, [r4]
    pmulhrsw        m1, m3
    pslldq          xm5, 1
    pinsrb          xm5, [r2 + 4 + 16], 0
    pshufb          xm4, xm5, [r5]
    vinserti128     m0, m0, xm4, 1              ; high lane <- updated source
    pmaddubsw       m2, m0, [r4 + mmsize]
    pmulhrsw        m2, m3
    vinserti128     m0, m0, xm4, 0              ; low lane <- updated source as well
    pmaddubsw       m4, m0, [r4 + 2 * mmsize]
    pmulhrsw        m4, m3
    pslldq          xm5, 1
    pinsrb          xm5, [r2 + 7 + 16], 0
    pshufb          xm5, [r5]
    vinserti128     m0, m0, xm5, 1
    pmaddubsw       m0, [r4 + 3 * mmsize]
    pmulhrsw        m0, m3
    packuswb        m1, m2
    packuswb        m4, m0
    lea             r3, [3 * r1]
    movq            [r0], xm1
    vextracti128    xm2, m1, 1
    movq            [r0 + r1], xm2
    movhps          [r0 + 2 * r1], xm1
    movhps          [r0 + r3], xm2
    lea             r0, [r0 + 4 * r1]
    movq            [r0], xm4
    vextracti128    xm2, m4, 1
    movq            [r0 + r1], xm2
    movhps          [r0 + 2 * r1], xm4
    movhps          [r0 + r3], xm2
    RET

INIT_YMM avx2
; intra_pred_ang8_12: r0 = dst, r1 = dstStride, r2 = src.
; Source vector = 16 bytes at src + 15 with bytes 1..0 replaced by
; src[0], src[6], duplicated into both lanes.
; Only r0-r3 are touched (r3 inside ang8_store8x8), so four GPRs are declared.
cglobal intra_pred_ang8_12, 3, 4, 6
    movu            xm1, [r2 + 15]
    pinsrb          xm1, [r2 + 0], 1
    pinsrb          xm1, [r2 + 6], 0
    vinserti128     m1, m1, xm1, 1
    mova            m5, [ang8_shuf_mode12]
    mova            m3, [pb_2]
    pshufb          m0, m1, m5
    psubb           m5, m3
    pshufb          m4, m1, m5
    psubb           m5, m3
    pshufb          m2, m1, m5
    psubb           m5, m3
    pshufb          m1, m5
    vbroadcasti128  m5, [ang8_fact_mode12]
    mova            m3, [pw_1024]
    pmaddubsw       m1, m5
    pmaddubsw       m2, m5
    pmaddubsw       m4, m5
    pmaddubsw       m0, m5
    pmulhrsw        m1, m3
    pmulhrsw        m2, m3
    pmulhrsw        m4, m3
    pmulhrsw        m0, m3
    packuswb        m1, m2
    packuswb
m4, m0
    ang8_store8x8
    RET

INIT_YMM avx2
; intra_pred_ang8_24: r0 = dst, r1 = dstStride, r2 = src.
; The first three row pairs use one shuffle of the 16 bytes at src + 0; for the last
; pair the already-shuffled low lane is shifted by two bytes and bytes 0/1 are
; replaced by src[16 + 6] and src[0]. Multipliers come from c_ang8_mode_24 via r4.
cglobal intra_pred_ang8_24, 3, 5, 5
    mova            m3, [pw_1024]               ; pmulhrsw by 1024 == (x + 16) >> 5, assuming pw_1024 is words of 1024
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred_shuff_0_8]
    lea             r4, [c_ang8_mode_24]
    pmaddubsw       m1, m0, [r4]
    pmulhrsw        m1, m3
    pmaddubsw       m2, m0, [r4 + mmsize]
    pmulhrsw        m2, m3
    pmaddubsw       m4, m0, [r4 + 2 * mmsize]
    pmulhrsw        m4, m3
    pslldq          xm0, 2
    pinsrb          xm0, [r2 + 16 + 6], 0
    pinsrb          xm0, [r2 + 0], 1
    vinserti128     m0, m0, xm0, 1
    pmaddubsw       m0, [r4 + 3 * mmsize]
    pmulhrsw        m0, m3
    packuswb        m1, m2
    packuswb        m4, m0
    lea             r3, [3 * r1]
    movq            [r0], xm1
    vextracti128    xm2, m1, 1
    movq            [r0 + r1], xm2
    movhps          [r0 + 2 * r1], xm1
    movhps          [r0 + r3], xm2
    lea             r0, [r0 + 4 * r1]
    movq            [r0], xm4
    vextracti128    xm2, m4, 1
    movq            [r0 + r1], xm2
    movhps          [r0 + 2 * r1], xm4
    movhps          [r0 + r3], xm2
    RET

; INTRA_PRED_ANG16_MC0 dstA, dstB, row
; Multiply m1 and m2 by multiplier row %3 of the table at r4, round with m0,
; pack to bytes and store the low lane to [%1] and the high lane to [%2].
; Clobb: m3, m4
%macro INTRA_PRED_ANG16_MC0 3
    pmaddubsw       m3, m1, [r4 + %3 * mmsize]
    pmulhrsw        m3, m0
    pmaddubsw       m4, m2, [r4 + %3 * mmsize]
    pmulhrsw        m4, m0
    packuswb        m3, m4
    movu            [%1], xm3
    vextracti128    xm4, m3, 1
    movu            [%2], xm4
%endmacro

; INTRA_PRED_ANG16_MC1 row
; Four stores at r0, r0 + r1, r0 + 2 * r1, r0 + r3 using multiplier rows %1 and %1 + 1.
%macro INTRA_PRED_ANG16_MC1 1
    INTRA_PRED_ANG16_MC0 r0, r0 + r1, %1
    INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, (%1 + 1)
%endmacro

; INTRA_PRED_ANG16_MC2 offset
; Load 16 bytes at src + %1 into m1 and at src + %1 + 8 into m2 (both lanes each),
; then shuffle both with the control in m5.
%macro INTRA_PRED_ANG16_MC2 1
    vbroadcasti128  m1, [r2 + %1]
    pshufb          m1, m5
    vbroadcasti128  m2, [r2 + (%1 + 8)]
    pshufb          m2, m5
%endmacro

; INTRA_PRED_ANG16_MC3 dst, row
; Combine the low lanes of m1 and m2, multiply by row %2, round, pack and store
; 16 bytes to [%1]. Clobb: m1, m3
%macro INTRA_PRED_ANG16_MC3 2
    vperm2i128      m1, m1, m2, 00100000b       ; m1 = [m1.low | m2.low]
    pmaddubsw       m3, m1, [r4 + (%2 * mmsize)]
    pmulhrsw        m3, m0
    packuswb        m3, m3
    vpermq          m3, m3, 11011000b           ; qword order 0,2,1,3: gather the packed halves in the low lane
    movu            [%1], xm3
%endmacro

; INTRA_PRED_ANG16_MC4 dstA, dstB, row
; Like MC3, but packs the incoming m3 together with the new result (m4) and stores
; two 16-byte outputs to [%1] and [%2]. Clobb: m1, m3, m4
%macro INTRA_PRED_ANG16_MC4 3
    vperm2i128      m1, m1, m2, 00100000b
    pmaddubsw       m4, m1, [r4 + (%3 * mmsize)]
    pmulhrsw        m4, m0
    packuswb        m3, m4
    vpermq          m3, m3, 11011000b
    movu            [%1], xm3
    vextracti128    xm3, m3, 1
    movu            [%2], xm3
%endmacro

%if ARCH_X86_64 == 1
; INTRA_PRED_TRANS_STORE_16x16
; Interleaves m0..m7 at byte, word and dword granularity (using m8 as scratch),
; then stores sixteen 16-byte rows starting at r0 with stride r1 (r3 is used as 3 * r1).
%macro INTRA_PRED_TRANS_STORE_16x16 0
    punpcklbw       m8, m0, m1
    punpckhbw       m0, m1
    punpcklbw       m1, m2, m3
    punpckhbw       m2, m3
    punpcklbw       m3, m4, m5
    punpckhbw       m4, m5
    punpcklbw       m5, m6, m7
    punpckhbw       m6, m7
    punpcklwd       m7, m8, m1
    punpckhwd       m8, m1
    punpcklwd       m1, m3, m5
    punpckhwd       m3, m5
    punpcklwd       m5, m0, m2
    punpckhwd       m0, m2
    punpcklwd       m2, m4, m6
    punpckhwd       m4, m6
punpckldq       m6, m7, m1
    punpckhdq       m7, m1
    punpckldq       m1, m8, m3
    punpckhdq       m8, m3
    punpckldq       m3, m5, m2
    punpckhdq       m5, m2
    punpckldq       m2, m0, m4
    punpckhdq       m0, m4
    vpermq          m6, m6, 0xD8                ; 0xD8 = qword order 0,2,1,3
    vpermq          m7, m7, 0xD8
    vpermq          m1, m1, 0xD8
    vpermq          m8, m8, 0xD8
    vpermq          m3, m3, 0xD8
    vpermq          m5, m5, 0xD8
    vpermq          m2, m2, 0xD8
    vpermq          m0, m0, 0xD8
    ; rows 0-3 (xm4 is scratch for the high lanes)
    movu            [r0], xm6
    vextracti128    xm4, m6, 1
    movu            [r0 + r1], xm4
    movu            [r0 + 2 * r1], xm7
    vextracti128    xm4, m7, 1
    movu            [r0 + r3], xm4
    ; rows 4-7
    lea             r0, [r0 + 4 * r1]
    movu            [r0], xm1
    vextracti128    xm4, m1, 1
    movu            [r0 + r1], xm4
    movu            [r0 + 2 * r1], xm8
    vextracti128    xm4, m8, 1
    movu            [r0 + r3], xm4
    ; rows 8-11
    lea             r0, [r0 + 4 * r1]
    movu            [r0], xm3
    vextracti128    xm4, m3, 1
    movu            [r0 + r1], xm4
    movu            [r0 + 2 * r1], xm5
    vextracti128    xm4, m5, 1
    movu            [r0 + r3], xm4
    ; rows 12-15
    lea             r0, [r0 + 4 * r1]
    movu            [r0], xm2
    vextracti128    xm4, m2, 1
    movu            [r0 + r1], xm4
    movu            [r0 + 2 * r1], xm0
    vextracti128    xm4, m0, 1
    movu            [r0 + r3], xm4
%endmacro

; INTRA_PRED_ANG16_CAL_ROW dst, tmp, row
; %1 = pack(round(m9 * table[row]), round(m10 * table[row])) with the table at r4
; and the rounding multiplier in m11. Clobb: %2
%macro INTRA_PRED_ANG16_CAL_ROW 3
    pmaddubsw       %1, m9, [r4 + (%3 * mmsize)]
    pmulhrsw        %1, m11
    pmaddubsw       %2, m10, [r4 + (%3 * mmsize)]
    pmulhrsw        %2, m11
    packuswb        %1, %2
%endmacro

INIT_YMM avx2
; intra_pred_ang16_12: r0 / r1 / r2 are presumably dst / dstStride / src, as in the
; intraPredAng8 prototype (the 16x16 prototype is not visible here; confirm).
; m3 holds the initial source vector, m6 the bytes that follow it; each step takes
; a 2-byte-further window (palignr), shuffles it with m7 / m8, multiplies by m0 / m1,
; rounds with m2 and stores two 16-byte rows (low lane, then high lane).
cglobal intra_pred_ang16_12, 3,4,9
    vbroadcasti128  m0, [angHor_tab_12]
    vbroadcasti128  m1, [angHor_tab_12 + mmsize/2]
    mova            m2, [pw_1024]               ; pmulhrsw by 1024 == (x + 16) >> 5, assuming pw_1024 is words of 1024
    mova            m7, [ang16_shuf_mode12]
    mova            m8, [ang16_shuf_mode12 + mmsize]
    lea             r3, [r1 * 3]

    movu            xm4, [r2 + mmsize - 2]
    pinsrb          xm4, [r2 + 0], 2            ; bytes 2..0 <- src[0], src[6], src[13]
    pinsrb          xm4, [r2 + 6], 1
    pinsrb          xm4, [r2 + 13], 0
    vbroadcasti128  m6, [r2 + mmsize + 14]
    vinserti128     m3, m4, xm4, 1

    pshufb          m4, m3, m7
    pshufb          m5, m3, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 2
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    lea             r0, [r0 + r1 * 4]

    palignr         m5, m6, m3, 4
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
vextracti128    [r0 + r1], m4, 1

    ; each group below: window at byte offset K of m6:m3 -> two 16-byte rows
    palignr         m5, m6, m3, 6
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    lea             r0, [r0 + r1 * 4]

    palignr         m5, m6, m3, 8
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 10
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    lea             r0, [r0 + r1 * 4]

    palignr         m5, m6, m3, 12
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 14
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    RET

INIT_YMM avx2
; intra_pred_ang16_13: r0 / r1 / r2 are presumably dst / dstStride / src (16x16
; prototype not visible here; confirm).
; m3 = bytes at src + mmsize + 1 joined (palignr 11) with a shuffled copy of the
; bytes at src + 0; m6 = the bytes that follow. Each step takes a 2-byte-further
; window, shuffles with m7 / m8, multiplies by m0 / m1, rounds with m2 and stores
; two 16-byte rows.
cglobal intra_pred_ang16_13, 3,4,9
    vbroadcasti128  m0, [angHor_tab_13]
    vbroadcasti128  m1, [angHor_tab_13 + mmsize/2]
    mova            m2, [pw_1024]               ; pmulhrsw by 1024 == (x + 16) >> 5, assuming pw_1024 is words of 1024
    mova            m7, [ang16_shuf_mode13]
    mova            m8, [ang16_shuf_mode13 + mmsize]
    lea             r3, [r1 * 3]

    vbroadcasti128  m3, [r2 + mmsize + 1]
    vbroadcasti128  m4, [r2]
    pshufb          m4, [ang16_shuf_mode13 + mmsize * 2]
    palignr         m3, m4, 11
    vbroadcasti128  m6, [r2 + mmsize + 12]

    pshufb          m4, m3, m7
    pshufb          m5, m3, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 2
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    lea             r0, [r0 + r1 * 4]

    palignr         m5, m6, m3, 4
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 6
    pshufb          m4,
m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    lea             r0, [r0 + r1 * 4]

    ; each group below: window at byte offset K of m6:m3 -> two 16-byte rows
    palignr         m5, m6, m3, 8
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 10
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    lea             r0, [r0 + r1 * 4]

    palignr         m5, m6, m3, 12
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 14
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    RET

INIT_YMM avx2
; intra_pred_ang16_14: r0 / r1 / r2 are presumably dst / dstStride / src (16x16
; prototype not visible here; confirm).
; m3 = bytes at src + mmsize + 1 joined (palignr 9) with a shuffled copy of the
; bytes at src + 0; m6 = the bytes that follow. Each step takes a 2-byte-further
; window, shuffles with m7 / m8, multiplies by m0 / m1, rounds with m2 and stores
; two 16-byte rows.
cglobal intra_pred_ang16_14, 3,4,9
    vbroadcasti128  m0, [angHor_tab_14]
    vbroadcasti128  m1, [angHor_tab_14 + mmsize/2]
    mova            m2, [pw_1024]               ; pmulhrsw by 1024 == (x + 16) >> 5, assuming pw_1024 is words of 1024
    mova            m7, [ang16_shuf_mode14]
    mova            m8, [ang16_shuf_mode14 + mmsize]
    lea             r3, [r1 * 3]

    vbroadcasti128  m3, [r2 + mmsize + 1]
    vbroadcasti128  m4, [r2]
    pshufb          m4, [ang16_shuf_mode14 + mmsize * 2]
    palignr         m3, m4, 9
    vbroadcasti128  m6, [r2 + mmsize + 10]

    pshufb          m4, m3, m7
    pshufb          m5, m3, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 2
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    lea             r0, [r0 + r1 * 4]

    palignr         m5, m6, m3, 4
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 6
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw
m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    lea             r0, [r0 + r1 * 4]

    ; each group below: window at byte offset K of m6:m3 -> two 16-byte rows
    palignr         m5, m6, m3, 8
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 10
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    lea             r0, [r0 + r1 * 4]

    palignr         m5, m6, m3, 12
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 14
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    RET

INIT_YMM avx2
; intra_pred_ang16_15: r0 / r1 / r2 are presumably dst / dstStride / src (16x16
; prototype not visible here; confirm).
; m3 = bytes at src + mmsize + 1 joined (palignr 7) with a shuffled copy of the
; bytes at src + 0; m6 = the bytes that follow. Each step takes a 2-byte-further
; window, shuffles with m7 / m8, multiplies by m0 / m1, rounds with m2 and stores
; two 16-byte rows.
cglobal intra_pred_ang16_15, 3,4,9
    vbroadcasti128  m0, [angHor_tab_15]
    vbroadcasti128  m1, [angHor_tab_15 + mmsize/2]
    mova            m2, [pw_1024]               ; pmulhrsw by 1024 == (x + 16) >> 5, assuming pw_1024 is words of 1024
    mova            m7, [ang16_shuf_mode15]
    mova            m8, [ang16_shuf_mode15 + mmsize]
    lea             r3, [r1 * 3]

    vbroadcasti128  m3, [r2 + mmsize + 1]
    vbroadcasti128  m4, [r2]
    pshufb          m4, [ang16_shuf_mode15 + mmsize * 2]
    palignr         m3, m3, m4, 7
    vbroadcasti128  m6, [r2 + mmsize + 8]

    pshufb          m4, m3, m7
    pshufb          m5, m3, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 2
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    lea             r0, [r0 + r1 * 4]

    palignr         m5, m6, m3, 4
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 6
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
vextracti128    [r0 + r3], m4, 1
    lea             r0, [r0 + r1 * 4]

    ; each group below: window at byte offset K of m6:m3 -> two 16-byte rows
    palignr         m5, m6, m3, 8
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 10
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    lea             r0, [r0 + r1 * 4]

    palignr         m5, m6, m3, 12
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 14
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    RET

INIT_YMM avx2
; intra_pred_ang16_16: r0 / r1 / r2 are presumably dst / dstStride / src (16x16
; prototype not visible here; confirm).
; m3 = bytes at src + mmsize + 1 joined (palignr 5) with a shuffled copy of the
; bytes at src + 0; m6 = the bytes that follow. Each step takes a 2-byte-further
; window, shuffles with m7 / m8, multiplies by m0 / m1, rounds with m2 and stores
; two 16-byte rows.
cglobal intra_pred_ang16_16, 3,4,9
    vbroadcasti128  m0, [angHor_tab_16]
    vbroadcasti128  m1, [angHor_tab_16 + mmsize/2]
    mova            m2, [pw_1024]               ; pmulhrsw by 1024 == (x + 16) >> 5, assuming pw_1024 is words of 1024
    mova            m7, [ang16_shuf_mode16]
    mova            m8, [ang16_shuf_mode16 + mmsize]
    lea             r3, [r1 * 3]

    vbroadcasti128  m3, [r2 + mmsize + 1]
    vbroadcasti128  m4, [r2]
    pshufb          m4, [ang16_shuf_mode16 + mmsize * 2]
    palignr         m3, m4, 5
    vbroadcasti128  m6, [r2 + mmsize + 6]

    pshufb          m4, m3, m7
    pshufb          m5, m3, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 2
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    lea             r0, [r0 + r1 * 4]

    palignr         m5, m6, m3, 4
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 6
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    lea             r0, [r0 + r1 * 4]

    palignr         m5, m6,
m3, 8
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    ; each group below: window at byte offset K of m6:m3 -> two 16-byte rows
    palignr         m5, m6, m3, 10
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    lea             r0, [r0 + r1 * 4]

    palignr         m5, m6, m3, 12
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 14
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    RET

INIT_YMM avx2
; intra_pred_ang16_17: r0 / r1 / r2 are presumably dst / dstStride / src (16x16
; prototype not visible here; confirm).
; m3 = bytes at src + mmsize + 1 joined (palignr 3) with a shuffled copy of the
; bytes at src + 0; m6 = the bytes that follow. Each step takes a 2-byte-further
; window, shuffles with m7 / m8, multiplies by m0 / m1, rounds with m2 and stores
; two 16-byte rows.
cglobal intra_pred_ang16_17, 3,4,9
    vbroadcasti128  m0, [angHor_tab_17]
    vbroadcasti128  m1, [angHor_tab_17 + mmsize/2]
    mova            m2, [pw_1024]               ; pmulhrsw by 1024 == (x + 16) >> 5, assuming pw_1024 is words of 1024
    mova            m7, [ang16_shuf_mode17]
    mova            m8, [ang16_shuf_mode17 + mmsize]
    lea             r3, [r1 * 3]

    vbroadcasti128  m3, [r2 + mmsize + 1]
    vbroadcasti128  m4, [r2]
    pshufb          m4, [ang16_shuf_mode17 + mmsize * 2]
    palignr         m3, m4, 3
    vbroadcasti128  m6, [r2 + mmsize + 4]

    pshufb          m4, m3, m7
    pshufb          m5, m3, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 2
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    lea             r0, [r0 + r1 * 4]

    palignr         m5, m6, m3, 4
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 6
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    lea             r0, [r0 + r1 * 4]

    palignr         m5, m6, m3, 8
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw
m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    ; each group below: window at byte offset K of m6:m3 -> two 16-byte rows
    palignr         m5, m6, m3, 10
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    lea             r0, [r0 + r1 * 4]

    palignr         m5, m6, m3, 12
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 14
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    RET

INIT_YMM avx2
; intra_pred_ang16_11: r0 / r1 / r2 are presumably dst / dstStride / src (16x16
; prototype not visible here; confirm).
; Source vector m3 = 16 bytes at src + mmsize with byte 0 replaced by src[0];
; m6 = the 16 bytes at src + mmsize + 16. A single shuffle (m7, from
; ang32_shuf_mode9) feeds both multiplies (m0 / m1); each step stores two
; 16-byte rows.
cglobal intra_pred_ang16_11, 3,4,8
    vbroadcasti128  m0, [angHor_tab_11]
    vbroadcasti128  m1, [angHor_tab_11 + mmsize/2]
    mova            m2, [pw_1024]               ; pmulhrsw by 1024 == (x + 16) >> 5, assuming pw_1024 is words of 1024
    mova            m7, [ang32_shuf_mode9]
    lea             r3, [r1 * 3]

    ; prepare for [0 -1 -2...]
    movu            xm3, [r2 + mmsize]
    pinsrb          xm3, [r2], 0
    vbroadcasti128  m6, [r2 + mmsize + 16]
    vinserti128     m3, m3, xm3, 1

    pshufb          m5, m3, m7                  ; [ 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2]
    pmaddubsw       m4, m5, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 2
    pshufb          m5, m7
    pmaddubsw       m4, m5, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    lea             r0, [r0 + r1 * 4]

    palignr         m5, m6, m3, 4
    pshufb          m5, m7
    pmaddubsw       m4, m5, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 6
    pshufb          m5, m7
    pmaddubsw       m4, m5, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    lea             r0, [r0 + r1 * 4]

    palignr         m5, m6, m3, 8
    pshufb          m5, m7
    pmaddubsw       m4, m5, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 10
    pshufb
m5, m7
    pmaddubsw       m4, m5, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    lea             r0, [r0 + r1 * 4]

    palignr         m5, m6, m3, 12
    pshufb          m5, m7
    pmaddubsw       m4, m5, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 14
    pshufb          m5, m7
    pmaddubsw       m4, m5, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    RET

; transpose 8x32 to 16x16, used for intra_ang16x16 avx2 asm
%if ARCH_X86_64 == 1
INIT_YMM avx2
; TRANSPOSE_STORE_8x32 in0..in7, tmp0..tmp3   (12 register numbers, without the "m")
; Carry flag selects the path at run time:
;   CF = 0: interleave %1..%8 (unpack byte/word/dword steps, using %9..%12 as
;           scratch) and store the interleaved result;
;   CF = 1: jump to .skip and store %1..%8 after a vpermq only.
; Both paths write sixteen 16-byte rows starting at r0, using r1 as the stride,
; r5 as the fourth-row offset and r6 as the advance between groups of four rows
; (presumably 3 * stride and 4 * stride; they are set by the caller, confirm).
; The local labels .skip / .end are defined inside the macro body.
%macro TRANSPOSE_STORE_8x32 12
    jc              .skip

    punpcklbw       m%9, m%1, m%2
    punpckhbw       m%1, m%2
    punpcklbw       m%10, m%3, m%4
    punpckhbw       m%3, m%4

    punpcklwd       m%11, m%9, m%10
    punpckhwd       m%9, m%10
    punpcklwd       m%10, m%1, m%3
    punpckhwd       m%1, m%3

    punpckldq       m%12, m%11, m%10
    punpckhdq       m%11, m%10
    punpckldq       m%10, m%9, m%1
    punpckhdq       m%9, m%1

    punpcklbw       m%1, m%5, m%6
    punpckhbw       m%5, m%6
    punpcklbw       m%2, m%7, m%8
    punpckhbw       m%7, m%8

    punpcklwd       m%3, m%1, m%2
    punpckhwd       m%1, m%2
    punpcklwd       m%4, m%5, m%7
    punpckhwd       m%5, m%7

    punpckldq       m%2, m%3, m%4
    punpckhdq       m%3, m%4
    punpckldq       m%4, m%1, m%5
    punpckhdq       m%1, m%5

    punpckldq       m%5, m%12, m%2
    punpckhdq       m%6, m%12, m%2
    punpckldq       m%7, m%10, m%4
    punpckhdq       m%8, m%10, m%4

    punpckldq       m%2, m%11, m%3
    punpckhdq       m%11, m%11, m%3
    punpckldq       m%4, m%9, m%1
    punpckhdq       m%9, m%9, m%1

    ; low lanes first (8 rows), then the high lanes (8 rows)
    movu            [r0 + r1 * 0], xm%5
    movu            [r0 + r1 * 1], xm%6
    movu            [r0 + r1 * 2], xm%2
    movu            [r0 + r5 * 1], xm%11

    add             r0, r6

    movu            [r0 + r1 * 0], xm%7
    movu            [r0 + r1 * 1], xm%8
    movu            [r0 + r1 * 2], xm%4
    movu            [r0 + r5 * 1], xm%9

    add             r0, r6

    vextracti128    [r0 + r1 * 0], m%5, 1
    vextracti128    [r0 + r1 * 1], m%6, 1
    vextracti128    [r0 + r1 * 2], m%2, 1
    vextracti128    [r0 + r5 * 1], m%11, 1

    add             r0, r6

    vextracti128    [r0 + r1 * 0], m%7, 1
    vextracti128    [r0 + r1 * 1], m%8, 1
    vextracti128    [r0 + r1 * 2], m%4, 1
    vextracti128    [r0 + r5 * 1], m%9, 1
    jmp             .end

.skip:
    vpermq          m%1, m%1, q3120
    vpermq          m%2, m%2, q3120
    vpermq          m%3, m%3, q3120
    vpermq          m%4, m%4,
q3120 vpermq m%5, m%5, q3120 vpermq m%6, m%6, q3120 vpermq m%7, m%7, q3120 vpermq m%8, m%8, q3120 movu [r0 + r1 * 0], xm%1 movu [r0 + r1 * 1], xm%2 movu [r0 + r1 * 2], xm%3 movu [r0 + r5 * 1], xm%4 add r0, r6 movu [r0 + r1 * 0], xm%5 movu [r0 + r1 * 1], xm%6 movu [r0 + r1 * 2], xm%7 movu [r0 + r5 * 1], xm%8 add r0, r6 vextracti128 [r0 + r1 * 0], m%1, 1 vextracti128 [r0 + r1 * 1], m%2, 1 vextracti128 [r0 + r1 * 2], m%3, 1 vextracti128 [r0 + r5 * 1], m%4, 1 add r0, r6 vextracti128 [r0 + r1 * 0], m%5, 1 vextracti128 [r0 + r1 * 1], m%6, 1 vextracti128 [r0 + r1 * 2], m%7, 1 vextracti128 [r0 + r5 * 1], m%8, 1 .end: %endmacro cglobal ang16_mode_3_33 ; rows 0 to 7 movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] vextracti128 xm1, m0, 1 vperm2i128 m0, m0, m2, 0x20 ; [17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] vperm2i128 m2, m2, m1, 0x20 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] pmaddubsw m4, m0, [r3 + 10 * 32] ; [26] pmulhrsw m4, m7 palignr m5, m2, m0, 2 pmaddubsw m5, [r3 + 4 * 32] ; [20] pmulhrsw m5, m7 palignr m6, m2, m0, 4 palignr m8, m2, m0, 6 pmaddubsw m6, [r3 - 2 * 32] ; [14] pmulhrsw m6, m7 pmaddubsw m8, [r3 - 8 * 32] ; [8] pmulhrsw m8, m7 palignr m10, m2, m0, 8 pmaddubsw m9, m10, [r3 - 14 * 32] ; [2] pmulhrsw m9, m7 pmaddubsw m10, [r3 + 12 * 32] ; [28] pmulhrsw m10, m7 palignr m11, m2, m0, 10 palignr m12, m2, m0, 12 pmaddubsw m11, [r3 + 6 * 32] ; [22] pmulhrsw m11, m7 pmaddubsw m12, [r3] ; [16] pmulhrsw m12, m7 ; rows 8 to 15 palignr m3, m2, m0, 14 palignr m1, m1, m2, 14 pmaddubsw m3, [r3 - 
6 * 32] ; [10] pmulhrsw m3, m7 packuswb m4, m3 pmaddubsw m3, m2, [r3 - 12 * 32] ; [4] pmulhrsw m3, m7 packuswb m5, m3 pmaddubsw m3, m2, [r3 + 14 * 32] ; [30] pmulhrsw m3, m7 packuswb m6, m3 movu xm0, [r2 + 25] movu xm1, [r2 + 26] punpcklbw m0, m1 mova m1, m2 vinserti128 m1, m1, xm0, 0 vpermq m1, m1, 01001110b palignr m3, m1, m2, 2 pmaddubsw m3, [r3 + 8 * 32] ; [24] pmulhrsw m3, m7 packuswb m8, m3 palignr m3, m1, m2, 4 pmaddubsw m3, [r3 + 2 * 32] ; [18] pmulhrsw m3, m7 packuswb m9, m3 palignr m3, m1, m2, 6 pmaddubsw m3, [r3 - 4 * 32] ; [12] pmulhrsw m3, m7 packuswb m10, m3 palignr m3, m1, m2, 8 pmaddubsw m3, [r3 - 10 * 32] ; [6] pmulhrsw m3, m7 packuswb m11, m3 pmovzxbw m1, [r2 + 14] packuswb m12, m1 TRANSPOSE_STORE_8x32 4, 5, 6, 8, 9, 10, 11, 12, 0, 1, 2, 3 ret INIT_YMM avx2 cglobal intra_pred_ang16_3, 3, 7, 13 add r2, 32 lea r3, [ang_table_avx2 + 16 * 32] lea r5, [r1 * 3] ; r5 -> 3 * stride lea r6, [r1 * 4] ; r6 -> 4 * stride mova m7, [pw_1024] clc call ang16_mode_3_33 RET INIT_YMM avx2 cglobal intra_pred_ang16_33, 3, 7, 13 lea r3, [ang_table_avx2 + 16 * 32] lea r5, [r1 * 3] ; r5 -> 3 * stride lea r6, [r1 * 4] ; r6 -> 4 * stride mova m7, [pw_1024] stc call ang16_mode_3_33 RET cglobal ang16_mode_4_32 ; rows 0 to 7 movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] vextracti128 xm1, m0, 1 vperm2i128 m0, m0, m2, 0x20 ; [17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] vperm2i128 m2, m2, m1, 0x20 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] pmaddubsw m4, m0, [r3 + 5 * 32] ; [21] pmulhrsw m4, m7 palignr 
m1, m2, m0, 2 pmaddubsw m5, m1, [r3 - 6 * 32] ; [10] pmulhrsw m5, m7 palignr m8, m2, m0, 4 pmaddubsw m6, m1, [r3 + 15 * 32] ; [31] pmulhrsw m6, m7 pmaddubsw m8, [r3 + 4 * 32] ; [20] pmulhrsw m8, m7 palignr m10, m2, m0, 6 pmaddubsw m9, m10, [r3 - 7 * 32] ; [9] pmulhrsw m9, m7 pmaddubsw m10, [r3 + 14 * 32] ; [30] pmulhrsw m10, m7 palignr m11, m2, m0, 8 palignr m1, m2, m0, 10 pmaddubsw m11, [r3 + 3 * 32] ; [19] pmulhrsw m11, m7 pmaddubsw m12, m1, [r3 - 8 * 32] ; [8] pmulhrsw m12, m7 ; rows 8 to 15 pmaddubsw m3, m1, [r3 + 13 * 32] ; [29] pmulhrsw m3, m7 packuswb m4, m3 palignr m3, m2, m0, 12 pmaddubsw m3, m3, [r3 + 2 * 32] ; [18] pmulhrsw m3, m7 packuswb m5, m3 palignr m1, m2, m0, 14 pmaddubsw m3, m1, [r3 - 9 * 32] ; [7] pmulhrsw m3, m7 packuswb m6, m3 pmaddubsw m3, m1, [r3 + 12 * 32] ; [28] pmulhrsw m3, m7 packuswb m8, m3 palignr m3, m2, m0, 16 pmaddubsw m3, [r3 + 1 * 32] ; [17] pmulhrsw m3, m7 packuswb m9, m3 movu xm0, [r2 + 25] movu xm1, [r2 + 26] punpcklbw m0, m1 mova m1, m2 vinserti128 m1, m1, xm0, 0 vpermq m1, m1, 01001110b palignr m0, m1, m2, 2 pmaddubsw m3, m0, [r3 - 10 * 32] ; [6] pmulhrsw m3, m7 packuswb m10, m3 pmaddubsw m3, m0, [r3 + 11 * 32] ; [27] pmulhrsw m3, m7 packuswb m11, m3 palignr m1, m1, m2, 4 pmaddubsw m1, [r3] ; [16] pmulhrsw m1, m7 packuswb m12, m1 TRANSPOSE_STORE_8x32 4, 5, 6, 8, 9, 10, 11, 12, 0, 1, 2, 3 ret INIT_YMM avx2 cglobal intra_pred_ang16_4, 3, 7, 13 add r2, 32 lea r3, [ang_table_avx2 + 16 * 32] lea r5, [r1 * 3] ; r5 -> 3 * stride lea r6, [r1 * 4] ; r6 -> 4 * stride mova m7, [pw_1024] clc call ang16_mode_4_32 RET INIT_YMM avx2 cglobal intra_pred_ang16_32, 3, 7, 13 lea r3, [ang_table_avx2 + 16 * 32] lea r5, [r1 * 3] ; r5 -> 3 * stride lea r6, [r1 * 4] ; r6 -> 4 * stride mova m7, [pw_1024] stc call ang16_mode_4_32 RET cglobal ang16_mode_5 ; rows 0 to 7 movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 
14 13 12 11 10 9 8 7 6 5 4 3 2] punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] vextracti128 xm1, m0, 1 vperm2i128 m0, m0, m2, 0x20 ; [17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] vperm2i128 m2, m2, m1, 0x20 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] pmaddubsw m4, m0, [r3 + 1 * 32] ; [17] pmulhrsw m4, m7 palignr m1, m2, m0, 2 pmaddubsw m5, m1, [r3 - 14 * 32] ; [2] pmulhrsw m5, m7 palignr m3, m2, m0, 4 pmaddubsw m6, m1, [r3 + 3 * 32] ; [19] pmulhrsw m6, m7 pmaddubsw m8, m3, [r3 - 12 * 32] ; [4] pmulhrsw m8, m7 pmaddubsw m9, m3, [r3 + 5 * 32] ; [21] pmulhrsw m9, m7 palignr m3, m2, m0, 6 pmaddubsw m10, m3, [r3 - 10 * 32] ; [6] pmulhrsw m10, m7 palignr m1, m2, m0, 8 pmaddubsw m11, m3, [r3 + 7 * 32] ; [23] pmulhrsw m11, m7 pmaddubsw m12, m1, [r3 - 8 * 32] ; [8] pmulhrsw m12, m7 ; rows 8 to 15 pmaddubsw m3, m1, [r3 + 9 * 32] ; [25] pmulhrsw m3, m7 packuswb m4, m3 palignr m1, m2, m0, 10 pmaddubsw m3, m1, [r3 - 6 * 32] ; [10] pmulhrsw m3, m7 packuswb m5, m3 pmaddubsw m3, m1, [r3 + 11 * 32] ; [27] pmulhrsw m3, m7 packuswb m6, m3 palignr m1, m2, m0, 12 pmaddubsw m3, m1, [r3 - 4 * 32] ; [12] pmulhrsw m3, m7 packuswb m8, m3 pmaddubsw m3, m1, [r3 + 13 * 32] ; [29] pmulhrsw m3, m7 packuswb m9, m3 palignr m1, m2, m0, 14 pmaddubsw m3, m1, [r3 - 2 * 32] ; [14] pmulhrsw m3, m7 packuswb m10, m3 pmaddubsw m3, m1, [r3 + 15 * 32] ; [31] pmulhrsw m3, m7 packuswb m11, m3 palignr m1, m2, m0, 16 pmaddubsw m1, [r3] ; [16] pmulhrsw m1, m7 packuswb m12, m1 TRANSPOSE_STORE_8x32 4, 5, 6, 8, 9, 10, 11, 12, 0, 1, 2, 3 ret INIT_YMM avx2 cglobal intra_pred_ang16_5, 3, 7, 13 add r2, 32 lea r3, [ang_table_avx2 + 16 * 32] lea r5, [r1 * 3] ; r5 -> 3 * stride lea r6, [r1 * 4] ; r6 -> 4 * stride mova m7, [pw_1024] clc call ang16_mode_5 RET 
;-----------------------------------------------------------------------------
; ang16_mode_6 / ang16_mode_7 / ang16_mode_8: bodies of intra_pred_ang16_6/7/8.
; Reached with `call`, return with plain `ret`.
; In:  r0 = dst, r1 = dst stride, r2 = src, r3 = weight-table base (rows are
;      addressed as [r3 + k * 32]; the bracketed number in each comment is
;      16 + k, matching the callers' r3 = ang_table_avx2 + 16 * 32),
;      r5 = 3 * r1, r6 = 4 * r1, m7 = pw_1024,
;      CF = path selector consumed by TRANSPOSE_STORE_8x32.
; Notation: s[k] is the byte at [r2 + k]; "pairs i = a..b" means the
; interleaved bytes (s[i], s[i+1]) for i = a..b, low lane first.
; Results for rows n and n + 8 share one register (packuswb).
;-----------------------------------------------------------------------------
cglobal ang16_mode_6
    ; rows 0 to 7
    movu            m0, [r2 + 1]                ; s[1..32]
    movu            m1, [r2 + 2]                ; s[2..33]
    punpckhbw       m2, m0, m1                  ; pairs i = 9..16 | 25..32
    punpcklbw       m0, m1                      ; pairs i = 1..8 | 17..24
    vextracti128    xm1, m0, 1                  ; pairs i = 17..24
    vperm2i128      m0, m0, m2, 0x20            ; pairs i = 1..16
    vperm2i128      m2, m2, m1, 0x20            ; pairs i = 9..24

    pmaddubsw       m4, m0, [r3 - 3 * 32]       ; [13]
    pmulhrsw        m4, m7
    pmaddubsw       m5, m0, [r3 + 10 * 32]      ; [26]
    pmulhrsw        m5, m7

    palignr         m3, m2, m0, 2
    pmaddubsw       m6, m3, [r3 - 9 * 32]       ; [7]
    pmulhrsw        m6, m7
    pmaddubsw       m8, m3, [r3 + 4 * 32]       ; [20]
    pmulhrsw        m8, m7

    palignr         m3, m2, m0, 4
    pmaddubsw       m9, m3, [r3 - 15 * 32]      ; [1]
    pmulhrsw        m9, m7
    pmaddubsw       m10, m3, [r3 - 2 * 32]      ; [14]
    pmulhrsw        m10, m7
    pmaddubsw       m11, m3, [r3 + 11 * 32]     ; [27]
    pmulhrsw        m11, m7

    palignr         m1, m2, m0, 6
    pmaddubsw       m12, m1, [r3 - 8 * 32]      ; [8]
    pmulhrsw        m12, m7

    ; rows 8 to 15
    pmaddubsw       m3, m1, [r3 + 5 * 32]       ; [21]
    pmulhrsw        m3, m7
    packuswb        m4, m3

    palignr         m1, m2, m0, 8
    pmaddubsw       m3, m1, [r3 - 14 * 32]      ; [2]
    pmulhrsw        m3, m7
    packuswb        m5, m3
    pmaddubsw       m3, m1, [r3 - 1 * 32]       ; [15]
    pmulhrsw        m3, m7
    packuswb        m6, m3
    pmaddubsw       m3, m1, [r3 + 12 * 32]      ; [28]
    pmulhrsw        m3, m7
    packuswb        m8, m3

    palignr         m1, m2, m0, 10
    pmaddubsw       m3, m1, [r3 - 7 * 32]       ; [9]
    pmulhrsw        m3, m7
    packuswb        m9, m3
    pmaddubsw       m3, m1, [r3 + 6 * 32]       ; [22]
    pmulhrsw        m3, m7
    packuswb        m10, m3

    palignr         m1, m2, m0, 12
    pmaddubsw       m3, m1, [r3 - 13 * 32]      ; [3]
    pmulhrsw        m3, m7
    packuswb        m11, m3
    pmaddubsw       m1, [r3]                    ; [16]
    pmulhrsw        m1, m7
    packuswb        m12, m1

    TRANSPOSE_STORE_8x32 4, 5, 6, 8, 9, 10, 11, 12, 0, 1, 2, 3
    ret

; intra_pred_ang16_6: r2 += 32, CF = 0 (transposing store path), then ang16_mode_6.
INIT_YMM avx2
cglobal intra_pred_ang16_6, 3, 7, 13
    add             r2, 32
    lea             r3, [ang_table_avx2 + 16 * 32]
    lea             r5, [r1 * 3]                ; r5 -> 3 * stride
    lea             r6, [r1 * 4]                ; r6 -> 4 * stride
    mova            m7, [pw_1024]
    clc                                         ; CF = 0 -> transpose in TRANSPOSE_STORE_8x32

    call            ang16_mode_6
    RET

cglobal ang16_mode_7
    ; rows 0 to 7
    movu            m0, [r2 + 1]                ; s[1..32]
    movu            m1, [r2 + 2]                ; s[2..33]
    punpckhbw       m2, m0, m1                  ; pairs i = 9..16 | 25..32
    punpcklbw       m0, m1                      ; pairs i = 1..8 | 17..24
    vextracti128    xm1, m0, 1                  ; pairs i = 17..24
    vperm2i128      m0, m0, m2, 0x20            ; pairs i = 1..16
    vperm2i128      m2, m2, m1, 0x20            ; pairs i = 9..24

    pmaddubsw       m4, m0, [r3 - 7 * 32]       ; [9]
    pmulhrsw        m4, m7
    pmaddubsw       m5, m0, [r3 + 2 * 32]       ; [18]
    pmulhrsw        m5, m7
    pmaddubsw       m6, m0, [r3 + 11 * 32]      ; [27]
    pmulhrsw        m6, m7

    palignr         m3, m2, m0, 2
    pmaddubsw       m8, m3, [r3 - 12 * 32]      ; [4]
    pmulhrsw        m8, m7
    pmaddubsw       m9, m3, [r3 - 3 * 32]       ; [13]
    pmulhrsw        m9, m7
    pmaddubsw       m10, m3, [r3 + 6 * 32]      ; [22]
    pmulhrsw        m10, m7
    pmaddubsw       m11, m3, [r3 + 15 * 32]     ; [31]
    pmulhrsw        m11, m7

    palignr         m1, m2, m0, 4
    pmaddubsw       m12, m1, [r3 - 8 * 32]      ; [8]
    pmulhrsw        m12, m7

    ; rows 8 to 15
    pmaddubsw       m3, m1, [r3 + 1 * 32]       ; [17]
    pmulhrsw        m3, m7
    packuswb        m4, m3
    pmaddubsw       m3, m1, [r3 + 10 * 32]      ; [26]
    pmulhrsw        m3, m7
    packuswb        m5, m3

    palignr         m1, m2, m0, 6
    pmaddubsw       m3, m1, [r3 - 13 * 32]      ; [3]
    pmulhrsw        m3, m7
    packuswb        m6, m3
    pmaddubsw       m3, m1, [r3 - 4 * 32]       ; [12]
    pmulhrsw        m3, m7
    packuswb        m8, m3
    pmaddubsw       m3, m1, [r3 + 5 * 32]       ; [21]
    pmulhrsw        m3, m7
    packuswb        m9, m3
    pmaddubsw       m3, m1, [r3 + 14 * 32]      ; [30]
    pmulhrsw        m3, m7
    packuswb        m10, m3

    palignr         m1, m2, m0, 8
    pmaddubsw       m3, m1, [r3 - 9 * 32]       ; [7]
    pmulhrsw        m3, m7
    packuswb        m11, m3
    pmaddubsw       m1, [r3]                    ; [16]
    pmulhrsw        m1, m7
    packuswb        m12, m1

    TRANSPOSE_STORE_8x32 4, 5, 6, 8, 9, 10, 11, 12, 0, 1, 2, 3
    ret

; intra_pred_ang16_7: r2 += 32, CF = 0 (transposing store path), then ang16_mode_7.
INIT_YMM avx2
cglobal intra_pred_ang16_7, 3, 7, 13
    add             r2, 32
    lea             r3, [ang_table_avx2 + 16 * 32]
    lea             r5, [r1 * 3]                ; r5 -> 3 * stride
    lea             r6, [r1 * 4]                ; r6 -> 4 * stride
    mova            m7, [pw_1024]
    clc                                         ; CF = 0 -> transpose in TRANSPOSE_STORE_8x32

    call            ang16_mode_7
    RET

cglobal ang16_mode_8
    ; rows 0 to 7
    movu            m0, [r2 + 1]                ; s[1..32]
    movu            m1, [r2 + 2]                ; s[2..33]
    punpckhbw       m2, m0, m1                  ; pairs i = 9..16 | 25..32
    punpcklbw       m0, m1                      ; pairs i = 1..8 | 17..24
    vextracti128    xm1, m0, 1                  ; pairs i = 17..24
    vperm2i128      m0, m0, m2, 0x20            ; pairs i = 1..16
    vperm2i128      m2, m2, m1, 0x20            ; pairs i = 9..24

    pmaddubsw       m4, m0, [r3 - 11 * 32]      ; [5]
    pmulhrsw        m4, m7
    pmaddubsw       m5, m0, [r3 - 6 * 32]       ; [10]
    pmulhrsw        m5, m7
    pmaddubsw       m6, m0, [r3 - 1 * 32]       ; [15]
    pmulhrsw        m6, m7
    pmaddubsw       m8, m0, [r3 + 4 * 32]       ; [20]
    pmulhrsw        m8, m7
    pmaddubsw       m9, m0, [r3 + 9 * 32]       ; [25]
    pmulhrsw        m9, m7
    pmaddubsw       m10, m0, [r3 + 14 * 32]     ; [30]
    pmulhrsw        m10, m7

    palignr         m1, m2, m0, 2
    pmaddubsw       m11, m1, [r3 - 13 * 32]     ; [3]
    pmulhrsw        m11, m7
    pmaddubsw       m12, m1, [r3 - 8 * 32]      ; [8]
    pmulhrsw        m12, m7

    ; rows 8 to 15
    pmaddubsw       m3, m1, [r3 - 3 * 32]       ; [13]
    pmulhrsw        m3, m7
    packuswb        m4, m3
    pmaddubsw       m3, m1, [r3 + 2 * 32]       ; [18]
    pmulhrsw        m3, m7
    packuswb        m5, m3
    pmaddubsw       m3, m1, [r3 + 7 * 32]       ; [23]
    pmulhrsw        m3, m7
    packuswb        m6, m3
    pmaddubsw       m3, m1, [r3 + 12 * 32]      ; [28]
    pmulhrsw        m3, m7
    packuswb        m8, m3

    palignr         m1, m2, m0, 4
    pmaddubsw       m3, m1, [r3 - 15 * 32]      ; [1]
    pmulhrsw        m3, m7
    packuswb        m9, m3
    pmaddubsw       m3, m1, [r3 - 10 * 32]      ; [6]
    pmulhrsw        m3, m7
    packuswb        m10, m3
    pmaddubsw       m3, m1, [r3 - 5 * 32]       ; [11]
    pmulhrsw        m3, m7
    packuswb        m11, m3
    pmaddubsw       m1, [r3]                    ; [16]
    pmulhrsw        m1, m7
    packuswb        m12, m1

    TRANSPOSE_STORE_8x32 4, 5, 6, 8, 9, 10, 11, 12, 0, 1, 2, 3
    ret

; intra_pred_ang16_8: r2 += 32, CF = 0 (transposing store path), then ang16_mode_8.
INIT_YMM avx2
cglobal intra_pred_ang16_8, 3, 7, 13
    add             r2, 32
    lea             r3, [ang_table_avx2 + 16 * 32]
    lea             r5, [r1 * 3]                ; r5 -> 3 * stride
    lea             r6, [r1 * 4]                ; r6 -> 4 * stride
    mova            m7, [pw_1024]
    clc                                         ; CF = 0 -> transpose in TRANSPOSE_STORE_8x32

    call            ang16_mode_8
    RET
%endif  ; ARCH_X86_64

;-----------------------------------------------------------------------------
; intra_pred_ang16_9 (AVX2): writes 16 rows of 16 bytes to r0, row stride r1.
; Source window: m3 = 16 bytes at [r2 + mmsize + 1], m6 = the following
; 16 bytes, both broadcast to the two lanes.
; Regs: r3 = 3 * r1; m0/m1 = the two 16-byte halves of angHor_tab_9;
; m2 = pw_1024; m7 = shuffle mask; m4/m5 = work registers.
; Each step produces two rows: low lane -> first row, high lane -> second.
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal intra_pred_ang16_9, 3,4,8
    vbroadcasti128  m0, [angHor_tab_9]
    vbroadcasti128  m1, [angHor_tab_9 + mmsize/2]
    mova            m2, [pw_1024]
    lea             r3, [r1 * 3]
    mova            m7, [ang16_shuf_mode9]

    vbroadcasti128  m6, [r2 + mmsize + 17]
    vbroadcasti128  m3, [r2 + mmsize + 1]

    ; rows 0-1
    pshufb          m5, m3, m7
    pmaddubsw       m4, m5, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    ; rows 2-3
    palignr         m5, m6, m3, 2
    pshufb          m5, m7
    pmaddubsw       m4, m5, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1

    lea             r0, [r0 + r1 * 4]

    ; rows 4-5
    palignr         m5, m6, m3, 4
    pshufb          m5, m7
    pmaddubsw       m4, m5, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    ; rows 6-7
    palignr         m5, m6, m3, 6
    pshufb          m5, m7
    pmaddubsw       m4, m5, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1

    lea             r0, [r0 + r1 * 4]

    ; rows 8-9
    palignr         m5, m6, m3, 8
    pshufb          m5, m7
    pmaddubsw       m4, m5, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    ; rows 10-11
    palignr         m5, m6, m3, 10
    pshufb          m5, m7
    pmaddubsw       m4, m5, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1

    lea             r0, [r0 + r1 * 4]

    ; rows 12-13
    palignr         m5, m6, m3, 12
    pshufb          m5, m7
    pmaddubsw       m4, m5, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    ; rows 14-15
    palignr         m5, m6, m3, 14
    pshufb          m5, m7
    pmaddubsw       m4, m5, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    RET
%endif                                          ; closes a conditional opened before this chunk

;-----------------------------------------------------------------------------
; intra_pred_ang16_25 .. intra_pred_ang16_31 (AVX2)
; Common setup visible here: r0 = dst, r1 = dst stride, r2 = src,
; r3 = 3 * r1, r4 = base of the mode's c_ang16_mode_NN table (advanced by
; 4 * mmsize between groups), m0 = pw_1024, and -- where loaded -- m5 =
; intra_pred_shuff_0_8.  The INTRA_PRED_ANG16_MC0/MC1/MC2/MC3 macros are
; defined outside this chunk; their register usage is not restated here.
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal intra_pred_ang16_25, 3, 5, 5
    mova            m0, [pw_1024]

    vbroadcasti128  m1, [r2]
    pshufb          m1, [intra_pred_shuff_0_8]
    vbroadcasti128  m2, [r2 + 8]
    pshufb          m2, [intra_pred_shuff_0_8]

    lea             r3, [3 * r1]
    lea             r4, [c_ang16_mode_25]

    INTRA_PRED_ANG16_MC1 0

    lea             r0, [r0 + 4 * r1]
    INTRA_PRED_ANG16_MC1 2

    add             r4, 4 * mmsize              ; next group of four table rows

    lea             r0, [r0 + 4 * r1]
    INTRA_PRED_ANG16_MC1 0

    lea             r0, [r0 + 4 * r1]
    INTRA_PRED_ANG16_MC1 2
    RET

INIT_YMM avx2
cglobal intra_pred_ang16_28, 3, 5, 6
    mova            m0, [pw_1024]
    mova            m5, [intra_pred_shuff_0_8]
    lea             r3, [3 * r1]
    lea             r4, [c_ang16_mode_28]

    INTRA_PRED_ANG16_MC2 1
    INTRA_PRED_ANG16_MC1 0

    lea             r0, [r0 + 4 * r1]
    INTRA_PRED_ANG16_MC0 r0, r0 + r1, 2

    INTRA_PRED_ANG16_MC2 2
    INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 3

    lea             r0, [r0 + 4 * r1]
    add             r4, 4 * mmsize              ; next group of four table rows

    INTRA_PRED_ANG16_MC1 0

    INTRA_PRED_ANG16_MC2 3
    lea             r0, [r0 + 4 * r1]
    INTRA_PRED_ANG16_MC1 2
    RET

INIT_YMM avx2
cglobal intra_pred_ang16_27, 3, 5, 5
    mova            m0, [pw_1024]
    lea             r3, [3 * r1]
    lea             r4, [c_ang16_mode_27]

    vbroadcasti128  m1, [r2 + 1]
    pshufb          m1, [intra_pred_shuff_0_8]
    vbroadcasti128  m2, [r2 + 9]
    pshufb          m2, [intra_pred_shuff_0_8]

    INTRA_PRED_ANG16_MC1 0

    lea             r0, [r0 + 4 * r1]
    INTRA_PRED_ANG16_MC1 2

    lea             r0, [r0 + 4 * r1]
    add             r4, 4 * mmsize              ; next group of four table rows

    INTRA_PRED_ANG16_MC1 0

    lea             r0, [r0 + 4 * r1]
    INTRA_PRED_ANG16_MC0 r0, r0 + r1, 2

    ; last two rows are computed inline: the second one uses a window that
    ; starts one byte further on ([r2 + 2]) and a different shuffle mask
    vperm2i128      m1, m1, m2, 00100000b
    pmaddubsw       m3, m1, [r4 + 3 * mmsize]
    pmulhrsw        m3, m0
    vbroadcasti128  m2, [r2 + 2]
    pshufb          m2, [intra_pred_shuff_0_15]
    pmaddubsw       m2, [r4 + 4 * mmsize]
    pmulhrsw        m2, m0
    packuswb        m3, m2
    vpermq          m3, m3, 11011000b           ; regroup qwords so each lane holds one complete row
    movu            [r0 + 2 * r1], xm3
    vextracti128    xm4, m3, 1
    movu            [r0 + r3], xm4
    RET

; The body below loads m5 explicitly, so registers m0..m5 are live: the
; cglobal vector-register count is 6 (it was declared as 5), matching the
; sibling functions 28/30/31 that use m5 the same way.
INIT_YMM avx2
cglobal intra_pred_ang16_29, 3, 5, 6
    mova            m0, [pw_1024]
    mova            m5, [intra_pred_shuff_0_8]
    lea             r3, [3 * r1]
    lea             r4, [c_ang16_mode_29]

    INTRA_PRED_ANG16_MC2 1
    INTRA_PRED_ANG16_MC0 r0, r0 + r1, 0
    INTRA_PRED_ANG16_MC3 r0 + 2 * r1, 1

    INTRA_PRED_ANG16_MC2 2
    INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 2

    lea             r0, [r0 + r1 * 4]
    INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 3

    INTRA_PRED_ANG16_MC2 3
    add             r4, 4 * mmsize              ; next group of four table rows
    INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 0
    lea             r0, [r0 + r1 * 4]
    INTRA_PRED_ANG16_MC3 r0 + r1, 1

    INTRA_PRED_ANG16_MC2 4
    INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 2
    lea             r0, [r0 + r1 * 4]
    INTRA_PRED_ANG16_MC0 r0, r0 + r1, 3

    add             r4, 4 * mmsize              ; next group of four table rows

    INTRA_PRED_ANG16_MC2 5
    INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 0
    RET

INIT_YMM avx2
cglobal intra_pred_ang16_30, 3, 5, 6
    mova            m0, [pw_1024]
    mova            m5, [intra_pred_shuff_0_8]
    lea             r3, [3 * r1]
    lea             r4, [c_ang16_mode_30]

    INTRA_PRED_ANG16_MC2 1
    INTRA_PRED_ANG16_MC0 r0, r0 + r1, 0

    INTRA_PRED_ANG16_MC2 2
    INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 1

    INTRA_PRED_ANG16_MC2 3
    lea             r0, [r0 + 4 * r1]
    INTRA_PRED_ANG16_MC0 r0, r0 + r1, 2
    INTRA_PRED_ANG16_MC3 r0 + 2 * r1, 3

    INTRA_PRED_ANG16_MC2 4
    add             r4, 4 * mmsize              ; next group of four table rows
    INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 0

    INTRA_PRED_ANG16_MC2 5
    lea             r0, [r0 + 4 * r1]
    INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 1
    INTRA_PRED_ANG16_MC3 r0 + r3 , 2

    INTRA_PRED_ANG16_MC2 6
    lea             r0, [r0 + 4 * r1]
    INTRA_PRED_ANG16_MC0 r0, r0 + r1, 3

    INTRA_PRED_ANG16_MC2 7
    INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 4
    RET

INIT_YMM avx2
cglobal intra_pred_ang16_31, 3, 5, 6
    mova            m0, [pw_1024]
    mova            m5, [intra_pred_shuff_0_8]
    lea             r3, [3 * r1]
    lea             r4, [c_ang16_mode_31]

    INTRA_PRED_ANG16_MC2 1
    INTRA_PRED_ANG16_MC3 r0, 0

    INTRA_PRED_ANG16_MC2 2
    INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 1

    INTRA_PRED_ANG16_MC2 3
    INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 2

    INTRA_PRED_ANG16_MC2 4
    lea             r0, [r0 + 4 * r1]
    INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 3

    INTRA_PRED_ANG16_MC2 5
    add             r4, 4 * mmsize              ; next group of four table rows
    INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 0

    INTRA_PRED_ANG16_MC2 6
    lea             r0, [r0 + 4 * r1]
    INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 1

    INTRA_PRED_ANG16_MC2 7
    INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 2

    INTRA_PRED_ANG16_MC2 8
    lea             r0, [r0 + 4 * r1]
    INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 3

    INTRA_PRED_ANG16_MC2 9
    INTRA_PRED_ANG16_MC3 r0 + r3, 4
    RET
;-----------------------------------------------------------------------------
; intra_pred_ang16_24 / _23 / _22 (AVX2)
; Common setup visible here: r0 = dst, r1 = dst stride, r2 = src,
; r3 = 3 * r1, r4 = base of the mode's c_ang16_mode_NN table (advanced by
; 4 * mmsize between groups), m0 = pw_1024, m5 = intra_pred_shuff_0_8.
; Between row groups the source window is rebuilt one byte earlier
; ([r2 - 1], [r2 - 2], ...) with its leading byte(s) replaced by single bytes
; fetched from [r2 + 34 .. r2 + 47].
; NOTE(review): those offsets presumably address the second (left-neighbour)
; half of the reference buffer -- confirm against the buffer layout.
; The INTRA_PRED_ANG16_MC0/MC1/MC2/MC3 macros are defined outside this chunk.
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal intra_pred_ang16_24, 3, 5, 6
    mova            m0, [pw_1024]
    mova            m5, [intra_pred_shuff_0_8]
    lea             r3, [3 * r1]
    lea             r4, [c_ang16_mode_24]

    INTRA_PRED_ANG16_MC2 0
    INTRA_PRED_ANG16_MC1 0

    lea             r0, [r0 + 4 * r1]
    INTRA_PRED_ANG16_MC0 r0, r0 + r1, 2

    ; window starting at [r2 - 1], byte 0 taken from [r2 + 38]
    movu            xm1, [r2 - 1]
    pinsrb          xm1, [r2 + 38], 0
    vinserti128     m1, m1, xm1, 1
    pshufb          m1, m5
    vbroadcasti128  m2, [r2 + 7]
    pshufb          m2, m5
    INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 3

    lea             r0, [r0 + 4 * r1]
    add             r4, 4 * mmsize              ; next group of four table rows

    INTRA_PRED_ANG16_MC1 0

    ; window starting at [r2 - 2], bytes 0/1 taken from [r2 + 45] / [r2 + 38]
    movu            xm1, [r2 - 2]
    pinsrb          xm1, [r2 + 45], 0
    pinsrb          xm1, [r2 + 38], 1
    vinserti128     m1, m1, xm1, 1
    pshufb          m1, m5
    vbroadcasti128  m2, [r2 + 6]
    pshufb          m2, m5

    lea             r0, [r0 + 4 * r1]
    INTRA_PRED_ANG16_MC1 2
    RET

; INTRA_PRED_ANG16_MC5 ins_off, src_off
; Slides the 16-byte window kept in xm6 up by one byte, puts the byte at
; [r2 + ins_off] into position 0, then rebuilds m1 (window in both lanes,
; shuffled by m5) and m2 (16 bytes at [r2 + src_off], shuffled by m5).
; Requires xm6 and m5 to be initialised by the caller; modifies xm6, m1, m2.
%macro INTRA_PRED_ANG16_MC5 2
    pslldq          xm6, xm6, 1
    pinsrb          xm6, [r2 + %1], 0
    vinserti128     m1, m6, xm6, 1
    pshufb          m1, m5
    vbroadcasti128  m2, [r2 + %2]
    pshufb          m2, m5
%endmacro

INIT_YMM avx2
cglobal intra_pred_ang16_23, 3, 5, 7
    mova            m0, [pw_1024]
    mova            m5, [intra_pred_shuff_0_8]
    lea             r3, [3 * r1]
    lea             r4, [c_ang16_mode_23]

    INTRA_PRED_ANG16_MC2 0
    INTRA_PRED_ANG16_MC0 r0, r0 + r1, 0
    INTRA_PRED_ANG16_MC3 r0 + 2 * r1, 1

    ; first shifted window: [r2 - 1] with byte 0 from [r2 + 36]; kept in xm6 for MC5
    movu            xm6, [r2 - 1]
    pinsrb          xm6, [r2 + 36], 0
    vinserti128     m1, m6, xm6, 1
    pshufb          m1, m5
    vbroadcasti128  m2, [r2 + 7]
    pshufb          m2, m5
    INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 2

    lea             r0, [r0 + 4 * r1]

    INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 3

    add             r4, 4 * mmsize              ; next group of four table rows

    INTRA_PRED_ANG16_MC5 39, 6
    INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 0

    lea             r0, [r0 + 4 * r1]

    INTRA_PRED_ANG16_MC3 r0 + r1, 1
    INTRA_PRED_ANG16_MC5 43, 5
    INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 2

    lea             r0, [r0 + 4 * r1]

    INTRA_PRED_ANG16_MC0 r0, r0 + r1, 3

    add             r4, 4 * mmsize              ; next group of four table rows

    INTRA_PRED_ANG16_MC5 46, 4
    INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 0
    RET

INIT_YMM avx2
cglobal intra_pred_ang16_22, 3, 5, 7
    mova            m0, [pw_1024]
    mova            m5, [intra_pred_shuff_0_8]
    lea             r3, [3 * r1]
    lea             r4, [c_ang16_mode_22]

    INTRA_PRED_ANG16_MC2 0
    INTRA_PRED_ANG16_MC0 r0, r0 + r1, 0

    ; first shifted window: [r2 - 1] with byte 0 from [r2 + 34]; kept in xm6 for MC5
    movu            xm6, [r2 - 1]
    pinsrb          xm6, [r2 + 34], 0
    vinserti128     m1, m6, xm6, 1
    pshufb          m1, m5
    vbroadcasti128  m2, [r2 + 7]
    pshufb          m2, m5
    INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 1

    lea             r0, [r0 + 4 * r1]

    INTRA_PRED_ANG16_MC5 37, 6
    INTRA_PRED_ANG16_MC0 r0, r0 + r1, 2
    INTRA_PRED_ANG16_MC3 r0 + 2 * r1, 3

    add             r4, 4 * mmsize              ; next group of four table rows

    INTRA_PRED_ANG16_MC5 39, 5
    INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 0

    lea             r0, [r0 + 4 * r1]

    INTRA_PRED_ANG16_MC5 42, 4
    INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 1
    INTRA_PRED_ANG16_MC3 r0 + r3, 2

    lea             r0, [r0 + 4 * r1]

    INTRA_PRED_ANG16_MC5 44, 3
    INTRA_PRED_ANG16_MC0 r0, r0 + r1, 3

    INTRA_PRED_ANG16_MC5 47, 2
    INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 4
    RET

; INTRA_PRED_ANG32_ALIGNR_STORE n
; Advances r0 by four rows, then stores four 32-byte rows built from the
; m1:m0 window shifted by n, n+1, n+2 and n+3 bytes (palignr works per
; 128-bit lane).  Uses m2 as scratch; r3 must hold 3 * r1.
%macro INTRA_PRED_ANG32_ALIGNR_STORE 1
    lea             r0, [r0 + 4 * r1]
    palignr         m2, m1, m0, %1
    movu            [r0], m2
    palignr         m2, m1, m0, (%1 + 1)
    movu            [r0 + r1], m2
    palignr         m2, m1, m0, (%1 + 2)
    movu            [r0 + 2 * r1], m2
    palignr         m2, m1, m0, (%1 + 3)
    movu            [r0 + r3], m2
%endmacro

;-----------------------------------------------------------------------------
; intra_pred_ang32_34 (AVX2): 32 rows of 32 bytes; row k is the 32 bytes
; starting at [r2 + 2 + k].  r0 = dst, r1 = dst stride, r3 = 3 * r1.
; m0/m1 are loaded 16 bytes apart so that per-lane palignr yields a
; contiguous shifted row; the window is reloaded after row 16.
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal intra_pred_ang32_34, 3, 4,3
    lea             r3, [3 * r1]

    movu            m0, [r2 + 2]
    movu            m1, [r2 + 18]

    ; rows 0-3
    movu            [r0], m0
    palignr         m2, m1, m0, 1
    movu            [r0 + r1], m2
    palignr         m2, m1, m0, 2
    movu            [r0 + 2 * r1], m2
    palignr         m2, m1, m0, 3
    movu            [r0 + r3], m2

    ; rows 4-15
    INTRA_PRED_ANG32_ALIGNR_STORE 4
    INTRA_PRED_ANG32_ALIGNR_STORE 8
    INTRA_PRED_ANG32_ALIGNR_STORE 12

    ; rows 16-19, switching to the window that starts at [r2 + 19]
    lea             r0, [r0 + 4 * r1]
    palignr         m2, m1, m0, 16
    movu            [r0], m2
    movu            m0, [r2 + 19]
    movu            [r0 + r1], m0
    movu            m1, [r2 + 35]
    palignr         m2, m1, m0, 1
    movu            [r0 + 2 * r1], m2
    palignr         m2, m1, m0, 2
    movu            [r0 + r3], m2

    ; rows 20-31
    INTRA_PRED_ANG32_ALIGNR_STORE 3
    INTRA_PRED_ANG32_ALIGNR_STORE 7
    INTRA_PRED_ANG32_ALIGNR_STORE 11
    RET

; intra_pred_ang32_2 (AVX2): identical sequence to intra_pred_ang32_34 with
; every source offset increased by 64, i.e. row k = 32 bytes at [r2 + 64 + 2 + k].
INIT_YMM avx2
cglobal intra_pred_ang32_2, 3, 4,3
    lea             r3, [3 * r1]

    movu            m0, [r2 + 64 + 2]
    movu            m1, [r2 + 64 + 18]

    ; rows 0-3
    movu            [r0], m0
    palignr         m2, m1, m0, 1
    movu            [r0 + r1], m2
    palignr         m2, m1, m0, 2
    movu            [r0 + 2 * r1], m2
    palignr         m2, m1, m0, 3
    movu            [r0 + r3], m2

    ; rows 4-15
    INTRA_PRED_ANG32_ALIGNR_STORE 4
    INTRA_PRED_ANG32_ALIGNR_STORE 8
    INTRA_PRED_ANG32_ALIGNR_STORE 12

    ; rows 16-19, switching to the window that starts at [r2 + 64 + 19]
    lea             r0, [r0 + 4 * r1]
    palignr         m2, m1, m0, 16
    movu            [r0], m2
    movu            m0, [r2 + 64 + 19]
    movu            [r0 + r1], m0
    movu            m1, [r2 + 64 + 35]
    palignr         m2, m1, m0, 1
    movu            [r0 + 2 * r1], m2
    palignr         m2, m1, m0, 2
    movu            [r0 + r3], m2

    ; rows 20-31
    INTRA_PRED_ANG32_ALIGNR_STORE 3
    INTRA_PRED_ANG32_ALIGNR_STORE 7
    INTRA_PRED_ANG32_ALIGNR_STORE 11
    RET

; INTRA_PRED_ANG32_STORE: advances r0 by four rows and stores m0 to each of
; the next four rows (r3 must hold 3 * r1).
%macro INTRA_PRED_ANG32_STORE 0
    lea             r0, [r0 + 4 * r1]
    movu            [r0], m0
    movu            [r0 + r1], m0
    movu            [r0 + r1 * 2], m0
    movu            [r0 + r3], m0
%endmacro

; intra_pred_ang32_26 (AVX2): copies the 32 bytes at [r2 + 1] into each of
; 32 destination rows (4 rows inline + 7 macro expansions of 4 rows).
INIT_YMM avx2
cglobal intra_pred_ang32_26, 3, 4, 1
    lea             r3, [3 * r1]
    movu            m0, [r2 + 1]
    movu            [r0], m0
    movu            [r0 + r1], m0
    movu            [r0 + r1 * 2], m0
    movu            [r0 + r3], m0

    INTRA_PRED_ANG32_STORE
    INTRA_PRED_ANG32_STORE
    INTRA_PRED_ANG32_STORE
    INTRA_PRED_ANG32_STORE
    INTRA_PRED_ANG32_STORE
    INTRA_PRED_ANG32_STORE
    INTRA_PRED_ANG32_STORE
    RET

; INTRA_PRED_STORE_4x4: writes four 4-byte rows from m0 -- dwords 0 and 1 of
; the low lane, then dwords 0 and 1 of the high lane.  Advances r0 by two
; rows and overwrites xm0.
%macro INTRA_PRED_STORE_4x4 0
    movd            [r0], xm0
    pextrd          [r0 + r1], xm0, 1
    vextracti128    xm0, m0, 1
    lea             r0, [r0 + 2 * r1]
    movd            [r0], xm0
    pextrd          [r0 + r1], xm0, 1
%endmacro

; INTRA_PRED_TRANS_STORE_4x4: gathers qwords 0 and 2 of m0 into the low lane,
; reorders the 16 bytes with c_trans_4x4 (4x4 byte transpose pattern) and
; writes the four dwords as four rows.  Advances r0 by two rows.
%macro INTRA_PRED_TRANS_STORE_4x4 0
    vpermq          m0, m0, 00001000b
    pshufb          m0, [c_trans_4x4]

    ;store
    movd            [r0], xm0
    pextrd          [r0 + r1], xm0, 1
    lea             r0, [r0 + 2 * r1]
    pextrd          [r0], xm0, 2
    pextrd          [r0 + r1], xm0, 3
%endmacro

;-----------------------------------------------------------------------------
; intra_pred_ang4_NN (AVX2), one function per mode.  All share one shape:
;   broadcast 16 source bytes ([r2] or [r2 + 1]) to both lanes
;   -> pshufb with a per-mode pair-selection mask
;   -> pmaddubsw with the per-mode weight table
;   -> pmulhrsw with pw_1024 (rounding scale) -> packuswb
;   -> INTRA_PRED_STORE_4x4 (direct) or INTRA_PRED_TRANS_STORE_4x4 (transposed)
; r0 = dst, r1 = dst stride, r2 = src.  Note that modes 3 and 4 reuse the
; weight tables of modes 33 and 32 respectively, with a transposed store.
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal intra_pred_ang4_27, 3, 3, 1
    vbroadcasti128  m0, [r2 + 1]
    pshufb          m0, [intra_pred_shuff_0_4]
    pmaddubsw       m0, [c_ang4_mode_27]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_28, 3, 3, 1
    vbroadcasti128  m0, [r2 + 1]
    pshufb          m0, [intra_pred_shuff_0_4]
    pmaddubsw       m0, [c_ang4_mode_28]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_29, 3, 3, 1
    vbroadcasti128  m0, [r2 + 1]
    pshufb          m0, [intra_pred4_shuff1]
    pmaddubsw       m0, [c_ang4_mode_29]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_30, 3, 3, 1
    vbroadcasti128  m0, [r2 + 1]
    pshufb          m0, [intra_pred4_shuff2]
    pmaddubsw       m0, [c_ang4_mode_30]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_31, 3, 3, 1
    vbroadcasti128  m0, [r2 + 1]
    pshufb          m0, [intra_pred4_shuff31]
    pmaddubsw       m0, [c_ang4_mode_31]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_32, 3, 3, 1
    vbroadcasti128  m0, [r2 + 1]
    pshufb          m0, [intra_pred4_shuff31]
    pmaddubsw       m0, [c_ang4_mode_32]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_33, 3, 3, 1
    vbroadcasti128  m0, [r2 + 1]
    pshufb          m0, [intra_pred4_shuff33]
    pmaddubsw       m0, [c_ang4_mode_33]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_3, 3, 3, 1
    vbroadcasti128  m0, [r2 + 1]
    pshufb          m0, [intra_pred4_shuff3]
    pmaddubsw       m0, [c_ang4_mode_33]        ; same weights as mode 33
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_4, 3, 3, 1
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred4_shuff5]
    pmaddubsw       m0, [c_ang4_mode_32]        ; same weights as mode 32
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_5, 3, 3, 1
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred4_shuff5]
    pmaddubsw       m0, [c_ang4_mode_5]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_6, 3, 3, 1
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred4_shuff6]
    pmaddubsw       m0, [c_ang4_mode_6]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_7, 3, 3, 1
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred4_shuff7]
    pmaddubsw       m0, [c_ang4_mode_7]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_8, 3, 3, 1
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred4_shuff9]
    pmaddubsw       m0, [c_ang4_mode_8]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_9, 3, 3, 1
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred4_shuff9]
    pmaddubsw       m0, [c_ang4_mode_9]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_11, 3, 3, 1
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred4_shuff12]
    pmaddubsw       m0, [c_ang4_mode_11]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_12, 3, 3, 1
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred4_shuff12]
    pmaddubsw       m0, [c_ang4_mode_12]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_13, 3, 3, 1
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred4_shuff13]
    pmaddubsw       m0, [c_ang4_mode_13]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_14, 3, 3, 1
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred4_shuff14]
    pmaddubsw       m0, [c_ang4_mode_14]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_15, 3, 3, 1
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred4_shuff15]
    pmaddubsw       m0, [c_ang4_mode_15]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_16, 3, 3, 1
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred4_shuff16]
    pmaddubsw       m0, [c_ang4_mode_16]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_17, 3, 3, 1
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred4_shuff17]
    pmaddubsw       m0, [c_ang4_mode_17]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_19, 3, 3, 1
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred4_shuff19]
    pmaddubsw       m0, [c_ang4_mode_19]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_20, 3, 3, 1
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred4_shuff20]
    pmaddubsw       m0, [c_ang4_mode_20]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_21, 3, 3, 1
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred4_shuff21]
    pmaddubsw       m0, [c_ang4_mode_21]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_22, 3, 3, 1
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred4_shuff22]
    pmaddubsw       m0, [c_ang4_mode_22]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_23, 3, 3, 1
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred4_shuff23]
    pmaddubsw       m0, [c_ang4_mode_23]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_24, 3, 3, 1
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred_shuff_0_4]
    pmaddubsw       m0, [c_ang4_mode_24]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_25, 3, 3, 1
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred_shuff_0_4]
    pmaddubsw       m0, [c_ang4_mode_25]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_STORE_4x4
    RET

;-----------------------------------------------------------------------------------
; void intra_filter_NxN(const pixel* references, pixel* filtered)
;
; r0 = references, r1 = filtered.  Every output sample is
;     (samples[i - 1] + 2 * samples[i] + samples[i + 1] + 2) >> 2
; computed on bytes widened to words (psllw 1, two paddw, paddw pw_2,
; psrlw 2).  Sample 0 is combined with sample 1 and with the byte at
; [r0 + 2N + 1]; the first sample of the second half uses sample 0 as its
; "previous" neighbour (pinsrb from [r0]).  The bytes at offsets 2N and 4N
; are saved in r2b / r3b on entry and written back unfiltered at the end
; (the "topLast" / "LeftLast" samples).
;-----------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_filter_4x4, 2,4,5
    mov             r2b, byte [r0 + 8]          ; topLast
    mov             r3b, byte [r0 + 16]         ; LeftLast

    ; filtering top
    pmovzxbw        m0, [r0 + 0]
    pmovzxbw        m1, [r0 + 8]
    pmovzxbw        m2, [r0 + 16]

    pshufb          m4, m0, [intra_filter4_shuf0]   ; [6 5 4 3 2 1 0 1] samples[i - 1]
    palignr         m3, m1, m0, 4
    pshufb          m3, [intra_filter4_shuf1]       ; [8 7 6 5 4 3 2 9] samples[i + 1]

    psllw           m0, 1
    paddw           m4, m3
    paddw           m0, m4
    paddw           m0, [pw_2]
    psrlw           m0, 2

    ; filtering left
    palignr         m4, m1, m1, 14              ; [14 13 12 11 10 9 8 15] samples[i - 1]
    pinsrb          m4, [r0], 2                 ; [14 13 12 11 10 9 0 15] samples[i + 1]
    palignr         m3, m2, m1, 4
    pshufb          m3, [intra_filter4_shuf1]

    psllw           m1, 1
    paddw           m4, m3
    paddw           m1, m4
    paddw           m1, [pw_2]
    psrlw           m1, 2
    packuswb        m0, m1

    movu            [r1], m0
    mov             [r1 + 8], r2b               ; topLast
    mov             [r1 + 16], r3b              ; LeftLast
    RET

INIT_XMM sse4
cglobal intra_filter_8x8, 2,4,6
    mov             r2b, byte [r0 + 16]         ; topLast
    mov             r3b, byte [r0 + 32]         ; LeftLast

    ; filtering top
    pmovzxbw        m0, [r0 + 0]
    pmovzxbw        m1, [r0 + 8]
    pmovzxbw        m2, [r0 + 16]

    pshufb          m4, m0, [intra_filter4_shuf0]   ; [6 5 4 3 2 1 0 1] samples[i - 1]
    palignr         m5, m1, m0, 2
    pinsrb          m5, [r0 + 17], 0                ; [8 7 6 5 4 3 2 9] samples[i + 1]

    palignr         m3, m1, m0, 14
    psllw           m0, 1
    paddw           m4, m5
    paddw           m0, m4
    paddw           m0, [pw_2]
    psrlw           m0, 2

    palignr         m4, m2, m1, 2
    psllw           m1, 1
    paddw           m4, m3
    paddw           m1, m4
    paddw           m1, [pw_2]
    psrlw           m1, 2

    packuswb        m0, m1
    movu            [r1], m0

    ; filtering left
    pmovzxbw        m1, [r0 + 24]
    pmovzxbw        m0, [r0 + 32]

    palignr         m4, m2, m2, 14
    pinsrb          m4, [r0], 2                 ; previous neighbour of the first left sample = sample 0
    palignr         m5, m1, m2, 2

    palignr         m3, m1, m2, 14
    palignr         m0, m1, 2

    psllw           m2, 1
    paddw           m4, m5
    paddw           m2, m4
    paddw           m2, [pw_2]
    psrlw           m2, 2

    psllw           m1, 1
    paddw           m0, m3
    paddw           m1, m0
    paddw           m1, [pw_2]
    psrlw           m1, 2

    packuswb        m2, m1
    movu            [r1 + 16], m2
    mov             [r1 + 16], r2b              ; topLast
    mov             [r1 + 32], r3b              ; LeftLast
    RET

INIT_XMM sse4
cglobal intra_filter_16x16, 2,4,6
    mov             r2b, byte [r0 + 32]         ; topLast
    mov             r3b, byte [r0 + 64]         ; LeftLast

    ; filtering top
    pmovzxbw        m0, [r0 + 0]
    pmovzxbw        m1, [r0 + 8]
    pmovzxbw        m2, [r0 + 16]

    pshufb          m4, m0, [intra_filter4_shuf0]   ; [6 5 4 3 2 1 0 1] samples[i - 1]
    palignr         m5, m1, m0, 2
    pinsrb          m5, [r0 + 33], 0                ; [8 7 6 5 4 3 2 9] samples[i + 1]

    palignr         m3, m1, m0, 14
    psllw           m0, 1
    paddw           m4, m5
    paddw           m0, m4
    paddw           m0, [pw_2]
    psrlw           m0, 2

    palignr         m4, m2, m1, 2
    psllw           m5, m1, 1
    paddw           m4, m3
    paddw           m5, m4
    paddw           m5, [pw_2]
    psrlw           m5, 2
    packuswb        m0, m5
    movu            [r1], m0

    pmovzxbw        m0, [r0 + 24]
    pmovzxbw        m5, [r0 + 32]

    palignr         m3, m2, m1, 14
    palignr         m4, m0, m2, 2

    psllw           m1, m2, 1
    paddw           m3, m4
    paddw           m1, m3
    paddw           m1, [pw_2]
    psrlw           m1, 2

    palignr         m3, m0, m2, 14
    palignr         m4, m5, m0, 2

    psllw           m0, 1
    paddw           m4, m3
    paddw           m0, m4
    paddw           m0, [pw_2]
    psrlw           m0, 2
    packuswb        m1, m0
    movu            [r1 + 16], m1

    ; filtering left
    pmovzxbw        m1, [r0 + 40]
    pmovzxbw        m2, [r0 + 48]

    palignr         m4, m5, m5, 14
    pinsrb          m4, [r0], 2                 ; previous neighbour of the first left sample = sample 0
    palignr         m0, m1, m5, 2

    psllw           m3, m5, 1
    paddw           m4, m0
    paddw           m3, m4
    paddw           m3, [pw_2]
    psrlw           m3, 2

    palignr         m0, m1, m5, 14
    palignr         m4, m2, m1, 2

    psllw           m5, m1, 1
    paddw           m4, m0
    paddw           m5, m4
    paddw           m5, [pw_2]
    psrlw           m5, 2
    packuswb        m3, m5
    movu            [r1 + 32], m3

    pmovzxbw        m5, [r0 + 56]
    pmovzxbw        m0, [r0 + 64]

    palignr         m3, m2, m1, 14
    palignr         m4, m5, m2, 2

    psllw           m1, m2, 1
    paddw           m3, m4
    paddw           m1, m3
    paddw           m1, [pw_2]
    psrlw           m1, 2

    palignr         m3, m5, m2, 14
    palignr         m4, m0, m5, 2

    psllw           m5, 1
    paddw           m4, m3
    paddw           m5, m4
    paddw           m5, [pw_2]
    psrlw           m5, 2
    packuswb        m1, m5
    movu            [r1 + 48], m1

    mov             [r1 + 32], r2b              ; topLast
    mov             [r1 + 64], r3b              ; LeftLast
    RET

INIT_XMM sse4
cglobal intra_filter_32x32, 2,4,6
    mov             r2b, byte [r0 + 64]         ; topLast
    mov             r3b, byte [r0 + 128]        ; LeftLast

    ; filtering top
    ; 0 to 15
    pmovzxbw        m0, [r0 + 0]
    pmovzxbw        m1, [r0 + 8]
    pmovzxbw        m2, [r0 + 16]

    pshufb          m4, m0, [intra_filter4_shuf0]   ; [6 5 4 3 2 1 0 1] samples[i - 1]
    palignr         m5, m1, m0, 2
    pinsrb          m5, [r0 + 65], 0                ; [8 7 6 5 4 3 2 9] samples[i + 1]

    palignr         m3, m1, m0, 14
    psllw           m0, 1
    paddw           m4, m5
    paddw           m0, m4
    paddw           m0, [pw_2]
    psrlw           m0, 2

    palignr         m4, m2, m1, 2
    psllw           m5, m1, 1
    paddw           m4, m3
    paddw           m5, m4
    paddw           m5, [pw_2]
    psrlw           m5, 2
    packuswb        m0, m5
    movu            [r1], m0

    ; 16 to 31
    pmovzxbw        m0, [r0 + 24]
    pmovzxbw        m5, [r0 + 32]

    palignr         m3, m2, m1, 14
    palignr         m4, m0, m2, 2

    psllw           m1, m2, 1
    paddw           m3, m4
    paddw           m1, m3
    paddw           m1, [pw_2]
    psrlw           m1, 2

    palignr         m3, m0, m2, 14
    palignr         m4, m5, m0, 2

    psllw           m2, m0, 1
    paddw           m4, m3
    paddw           m2, m4
    paddw           m2, [pw_2]
    psrlw           m2, 2
    packuswb        m1, m2
    movu            [r1 + 16], m1

    ; 32 to 47
    pmovzxbw        m1, [r0 + 40]
    pmovzxbw        m2, [r0 + 48]

    palignr         m3, m5, m0, 14
    palignr         m4, m1, m5, 2

    psllw           m0, m5, 1
    paddw           m3, m4
    paddw           m0, m3
    paddw           m0, [pw_2]
    psrlw           m0, 2

    palignr         m3, m1, m5, 14
    palignr         m4, m2, m1, 2

    psllw           m5, m1, 1
    paddw           m4, m3
    paddw           m5, m4
    paddw           m5, [pw_2]
    psrlw           m5, 2
    packuswb        m0, m5
    movu            [r1 + 32], m0

    ; 48 to 63
    pmovzxbw        m0, [r0 + 56]
    pmovzxbw        m5, [r0 + 64]

    palignr         m3, m2, m1, 14
    palignr         m4, m0, m2, 2

    psllw           m1, m2, 1
    paddw           m3, m4
    paddw           m1, m3
    paddw           m1, [pw_2]
    psrlw           m1, 2

    palignr         m3, m0, m2, 14
    palignr         m4, m5, m0, 2

    psllw           m0, 1
    paddw           m4, m3
    paddw           m0, m4
    paddw           m0, [pw_2]
    psrlw           m0, 2
    packuswb        m1, m0
    movu            [r1 + 48], m1

    ; filtering left
    ; 64 to 79
    pmovzxbw        m1, [r0 + 72]
    pmovzxbw        m2, [r0 + 80]

    palignr         m4, m5, m5, 14
    pinsrb          m4, [r0], 2                 ; previous neighbour of the first left sample = sample 0
    palignr         m0, m1, m5, 2

    psllw           m3, m5, 1
    paddw           m4, m0
    paddw           m3, m4
    paddw           m3, [pw_2]
    psrlw           m3, 2

    palignr         m0, m1, m5, 14
    palignr         m4, m2, m1, 2

    psllw           m5, m1, 1
    paddw           m4, m0
    paddw           m5, m4
    paddw           m5, [pw_2]
    psrlw           m5, 2
    packuswb        m3, m5
    movu            [r1 + 64], m3

    ; 80 to 95
    pmovzxbw        m5, [r0 + 88]
    pmovzxbw        m0, [r0 + 96]

    palignr         m3, m2, m1, 14
    palignr         m4, m5, m2, 2

    psllw           m1, m2, 1
    paddw           m3, m4
    paddw           m1, m3
    paddw           m1, [pw_2]
    psrlw           m1, 2

    palignr         m3, m5, m2, 14
    palignr         m4, m0, m5, 2

    psllw           m2, m5, 1
    paddw           m4, m3
    paddw           m2, m4
    paddw           m2, [pw_2]
    psrlw           m2, 2
    packuswb        m1, m2
    movu            [r1 + 80], m1

    ; 96 to 111
    pmovzxbw        m1, [r0 + 104]
    pmovzxbw        m2, [r0 + 112]

    palignr         m3, m0, m5, 14
    palignr         m4, m1, m0, 2

    psllw           m5, m0, 1
    paddw           m3, m4
    paddw           m5, m3
    paddw           m5, [pw_2]
    psrlw           m5, 2

    palignr         m3, m1, m0, 14
    palignr         m4, m2, m1, 2

    psllw           m0, m1, 1
    paddw           m4, m3
    paddw           m0, m4
    paddw           m0, [pw_2]
    psrlw           m0, 2
    packuswb        m5, m0
    movu            [r1 + 96], m5

    ; 112 to 127
    pmovzxbw        m5, [r0 + 120]
    pmovzxbw        m0, [r0 + 128]

    palignr         m3, m2, m1, 14
    palignr         m4, m5, m2, 2

    psllw           m1, m2, 1
    paddw           m3, m4
    paddw           m1, m3
    paddw           m1, [pw_2]
    psrlw           m1, 2

    palignr         m3, m5, m2, 14
    palignr         m4, m0, m5, 2

    psllw           m5, 1
    paddw           m4, m3
    paddw           m5, m4
    paddw           m5, [pw_2]
    psrlw           m5, 2
    packuswb        m1, m5
    movu            [r1 + 112], m1

    mov             [r1 + 64], r2b              ; topLast
    mov             [r1 + 128], r3b             ; LeftLast
    RET

; AVX2 variant of intra_filter_4x4: both halves (bytes 0-7 and 8-15) are
; filtered at once, one per 128-bit lane of the word-widened input.
INIT_YMM avx2
cglobal intra_filter_4x4, 2,4,4
    mov             r2b, byte [r0 + 8]          ; topLast
    mov             r3b, byte [r0 + 16]         ; LeftLast

    ; filtering top
    pmovzxbw        m0, [r0]
    vpbroadcastw    m2, xm0                     ; sample 0 in every word
    pmovzxbw        m1, [r0 + 8]

    palignr         m3, m0, m2, 14              ; [6 5 4 3 2 1 0 0] [14 13 12 11 10 9 8 0]
    pshufb          m3, [intra_filter4_shuf2]   ; [6 5 4 3 2 1 0 1] [14 13 12 11 10 9 0 9] samples[i - 1]
    palignr         m1, m0, 4                   ; [9 8 7 6 5 4 3 2]
    palignr         m1, m1, 14                  ; [9 8 7 6 5 4 3 2]

    psllw           m0, 1
    paddw           m3, m1
    paddw           m0, m3
    paddw           m0, [pw_2]
    psrlw           m0, 2

    packuswb        m0, m0
    vpermq          m0, m0, 10001000b           ; bring qwords 0 and 2 together in the low lane

    movu            [r1], xm0
    mov             [r1 + 8], r2b               ; topLast
    mov             [r1 + 16], r3b              ; LeftLast
    RET