/*
* Copyright(c) 2019 Intel Corporation
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/

#include <assert.h>
#include <immintrin.h>
#include "EbHighbdIntraPrediction_SSE2.h"
#include "EbDefinitions.h"
#include "aom_dsp_rtcd.h"
#include "EbIntraPrediction_AVX2.h"

// =============================================================================

// DC RELATED PRED

// Handle number of elements: up to 64.
static INLINE __m128i dc_sum_large(const __m256i src) {
    const __m128i s_lo = _mm256_extracti128_si256(src, 0);
    const __m128i s_hi = _mm256_extracti128_si256(src, 1);
    __m128i sum, sum_hi;
    sum = _mm_add_epi16(s_lo, s_hi);
    sum_hi = _mm_srli_si128(sum, 8);
    sum = _mm_add_epi16(sum, sum_hi);
    // Widen to 32 bits: a sum of more than 16 pixels of 12-bit input can
    // overflow a 16-bit lane.
    sum = _mm_unpacklo_epi16(sum, _mm_setzero_si128());

    return dc_sum_4x32bit(sum);
}

// Handle number of elements: 65 to 128.
static INLINE __m128i dc_sum_larger(const __m256i src) {
    const __m128i s_lo = _mm256_extracti128_si256(src, 0);
    const __m128i s_hi = _mm256_extracti128_si256(src, 1);
    __m128i sum, sum_hi;
    sum = _mm_add_epi16(s_lo, s_hi);
    // Widen to 32 bits: a sum of more than 16 pixels of 12-bit input can
    // overflow a 16-bit lane.
    sum_hi = _mm_unpackhi_epi16(sum, _mm_setzero_si128());
    sum = _mm_unpacklo_epi16(sum, _mm_setzero_si128());
    sum = _mm_add_epi32(sum, sum_hi);

    return dc_sum_4x32bit(sum);
}
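
// Scalar reference sketch of what the dc_sum_* reductions compute
// (illustrative only; assumed equivalent, not called by the SIMD paths):
// add up n 16-bit pixels and return the total in the low 32-bit lane.
static INLINE __m128i dc_sum_ref(const uint16_t *const src, const int32_t n) {
    uint32_t sum = 0;
    for (int32_t i = 0; i < n; i++)
        sum += src[i];  // 32-bit accumulator, so no 16-bit overflow concern.
    return _mm_cvtsi32_si128((int32_t)sum);
}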

static INLINE __m128i dc_sum_16(const uint16_t *const src) {
    const __m256i s = _mm256_loadu_si256((const __m256i *)src);
    const __m128i s_lo = _mm256_extracti128_si256(s, 0);
    const __m128i s_hi = _mm256_extracti128_si256(s, 1);
    const __m128i sum = _mm_add_epi16(s_lo, s_hi);
    return dc_sum_8x16bit(sum);
}

static INLINE __m128i dc_sum_32(const uint16_t *const src) {
    const __m256i s0 = _mm256_loadu_si256((const __m256i *)(src + 0x00));
    const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + 0x10));
    const __m256i sum = _mm256_add_epi16(s0, s1);
    return dc_sum_large(sum);
}

static INLINE __m128i dc_sum_64(const uint16_t *const src) {
    const __m256i s0 = _mm256_loadu_si256((const __m256i *)(src + 0x00));
    const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + 0x10));
    const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 0x20));
    const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 0x30));
    const __m256i s01 = _mm256_add_epi16(s0, s1);
    const __m256i s23 = _mm256_add_epi16(s2, s3);
    const __m256i sum = _mm256_add_epi16(s01, s23);
    return dc_sum_large(sum);
}

static INLINE __m128i dc_sum_4_16(const uint16_t *const src_4,
    const uint16_t *const src_16) {
    const __m128i s_4 = _mm_loadl_epi64((const __m128i *)src_4);
    const __m256i s_16 = _mm256_loadu_si256((const __m256i *)src_16);
    const __m128i s_lo = _mm256_extracti128_si256(s_16, 0);
    const __m128i s_hi = _mm256_extracti128_si256(s_16, 1);
    const __m128i s_16_sum0 = _mm_add_epi16(s_lo, s_hi);
    const __m128i s_16_sum_hi = _mm_srli_si128(s_16_sum0, 8);
    const __m128i s_16_sum = _mm_add_epi16(s_16_sum0, s_16_sum_hi);
    const __m128i sum = _mm_add_epi16(s_16_sum, s_4);
    return dc_sum_4x16bit_large(sum);
}

static INLINE __m128i dc_sum_8_16(const uint16_t *const src_8,
    const uint16_t *const src_16) {
    const __m128i s_8 = _mm_load_si128((const __m128i *)src_8);
    const __m256i s_16 = _mm256_loadu_si256((const __m256i *)src_16);
    const __m128i s_lo = _mm256_extracti128_si256(s_16, 0);
    const __m128i s_hi = _mm256_extracti128_si256(s_16, 1);
    const __m128i s_16_sum = _mm_add_epi16(s_lo, s_hi);
    const __m128i sum = _mm_add_epi16(s_16_sum, s_8);
    return dc_sum_8x16bit_large(sum);
}

static INLINE __m128i dc_sum_8_32(const uint16_t *const src_8,
    const uint16_t *const src_32) {
    const __m128i s_8 = _mm_loadu_si128((const __m128i *)src_8);
    const __m256i s_32_0 = _mm256_loadu_si256((const __m256i *)(src_32 + 0x00));
    const __m256i s_32_1 = _mm256_loadu_si256((const __m256i *)(src_32 + 0x10));
    const __m256i s_32 = _mm256_add_epi16(s_32_0, s_32_1);
    const __m128i s_lo = _mm256_extracti128_si256(s_32, 0);
    const __m128i s_hi = _mm256_extracti128_si256(s_32, 1);
    const __m128i s_16_sum = _mm_add_epi16(s_lo, s_hi);
    const __m128i sum = _mm_add_epi16(s_8, s_16_sum);
    return dc_sum_8x16bit_large(sum);
}

static INLINE __m128i dc_sum_16_16(const uint16_t *const src0,
    const uint16_t *const src1) {
    const __m256i s0 = _mm256_loadu_si256((const __m256i *)src0);
    const __m256i s1 = _mm256_loadu_si256((const __m256i *)src1);
    const __m256i sum = _mm256_add_epi16(s0, s1);
    return dc_sum_large(sum);
}

static INLINE __m128i dc_sum_16_32(const uint16_t *const src_16,
    const uint16_t *const src_32) {
    const __m256i s_16 = _mm256_loadu_si256((const __m256i *)src_16);
    const __m256i s_32_0 = _mm256_loadu_si256((const __m256i *)(src_32 + 0x00));
    const __m256i s_32_1 = _mm256_loadu_si256((const __m256i *)(src_32 + 0x10));
    const __m256i sum0 = _mm256_add_epi16(s_16, s_32_0);
    const __m256i sum = _mm256_add_epi16(sum0, s_32_1);
    return dc_sum_large(sum);
}

static INLINE __m128i dc_sum_32_32(const uint16_t *const src0,
    const uint16_t *const src1) {
    const __m256i s0_0 = _mm256_loadu_si256((const __m256i *)(src0 + 0x00));
    const __m256i s0_1 = _mm256_loadu_si256((const __m256i *)(src0 + 0x10));
    const __m256i s1_0 = _mm256_loadu_si256((const __m256i *)(src1 + 0x00));
    const __m256i s1_1 = _mm256_loadu_si256((const __m256i *)(src1 + 0x10));
    const __m256i sum0 = _mm256_add_epi16(s0_0, s1_0);
    const __m256i sum1 = _mm256_add_epi16(s0_1, s1_1);
    const __m256i sum = _mm256_add_epi16(sum0, sum1);
    return dc_sum_large(sum);
}

static INLINE __m128i dc_sum_32_64(const uint16_t *const src_32,
    const uint16_t *const src_64) {
    const __m256i s_32_0 = _mm256_loadu_si256((const __m256i *)(src_32 + 0x00));
    const __m256i s_32_1 = _mm256_loadu_si256((const __m256i *)(src_32 + 0x10));
    const __m256i s_64_0 = _mm256_loadu_si256((const __m256i *)(src_64 + 0x00));
    const __m256i s_64_1 = _mm256_loadu_si256((const __m256i *)(src_64 + 0x10));
    const __m256i s_64_2 = _mm256_loadu_si256((const __m256i *)(src_64 + 0x20));
    const __m256i s_64_3 = _mm256_loadu_si256((const __m256i *)(src_64 + 0x30));
    const __m256i sum0 = _mm256_add_epi16(s_32_0, s_64_0);
    const __m256i sum1 = _mm256_add_epi16(s_32_1, s_64_1);
    const __m256i sum2 = _mm256_add_epi16(s_64_2, s_64_3);
    const __m256i sum3 = _mm256_add_epi16(sum0, sum1);
    const __m256i sum = _mm256_add_epi16(sum2, sum3);
    return dc_sum_larger(sum);
}

static INLINE __m128i dc_sum_64_64(const uint16_t *const src0,
    const uint16_t *const src1) {
    const __m256i s0_0 = _mm256_loadu_si256((const __m256i *)(src0 + 0x00));
    const __m256i s0_1 = _mm256_loadu_si256((const __m256i *)(src0 + 0x10));
    const __m256i s0_2 = _mm256_loadu_si256((const __m256i *)(src0 + 0x20));
    const __m256i s0_3 = _mm256_loadu_si256((const __m256i *)(src0 + 0x30));
    const __m256i s1_0 = _mm256_loadu_si256((const __m256i *)(src1 + 0x00));
    const __m256i s1_1 = _mm256_loadu_si256((const __m256i *)(src1 + 0x10));
    const __m256i s1_2 = _mm256_loadu_si256((const __m256i *)(src1 + 0x20));
    const __m256i s1_3 = _mm256_loadu_si256((const __m256i *)(src1 + 0x30));
    const __m256i sum0 = _mm256_add_epi16(s0_0, s1_0);
    const __m256i sum1 = _mm256_add_epi16(s0_1, s1_1);
    const __m256i sum2 = _mm256_add_epi16(s0_2, s1_2);
    const __m256i sum3 = _mm256_add_epi16(s0_3, s1_3);
    const __m256i sum4 = _mm256_add_epi16(sum0, sum1);
    const __m256i sum5 = _mm256_add_epi16(sum2, sum3);
    const __m256i sum = _mm256_add_epi16(sum4, sum5);
    return dc_sum_larger(sum);
}

static INLINE __m128i dc_sum_16_64(const uint16_t *const src_16,
    const uint16_t *const src_64) {
    const __m256i s_16 = _mm256_loadu_si256((const __m256i *)src_16);
    const __m256i s_64_0 = _mm256_loadu_si256((const __m256i *)(src_64 + 0x00));
    const __m256i s_64_1 = _mm256_loadu_si256((const __m256i *)(src_64 + 0x10));
    const __m256i s_64_2 = _mm256_loadu_si256((const __m256i *)(src_64 + 0x20));
    const __m256i s_64_3 = _mm256_loadu_si256((const __m256i *)(src_64 + 0x30));
    const __m256i s0 = _mm256_add_epi16(s_16, s_64_0);
    const __m256i s1 = _mm256_add_epi16(s0, s_64_1);
    const __m256i s2 = _mm256_add_epi16(s_64_2, s_64_3);
    const __m256i sum = _mm256_add_epi16(s1, s2);
    return dc_sum_larger(sum);
}
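
// Scalar reference sketch for the two-input dc_sum_M_N helpers above
// (illustrative only; assumed equivalent): sum both neighbor arrays so the
// caller can average over w + h samples.
static INLINE uint32_t dc_sum_2_ref(const uint16_t *const a, const int32_t na,
    const uint16_t *const b, const int32_t nb) {
    uint32_t sum = 0;
    for (int32_t i = 0; i < na; i++) sum += a[i];
    for (int32_t i = 0; i < nb; i++) sum += b[i];
    return sum;
}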

static INLINE void dc_common_predictor_16xh_kernel(uint16_t *dst,
    const ptrdiff_t stride, const int32_t h, const __m256i dc)
{
    for (int32_t i = 0; i < h; i++) {
        _mm256_storeu_si256((__m256i *)dst, dc);
        dst += stride;
    }
}

static INLINE void dc_common_predictor_32xh_kernel(uint16_t *dst,
    const ptrdiff_t stride, const int32_t h, const __m256i dc)
{
    for (int32_t i = 0; i < h; i++) {
        _mm256_storeu_si256((__m256i *)(dst + 0x00), dc);
        _mm256_storeu_si256((__m256i *)(dst + 0x10), dc);
        dst += stride;
    }
}

static INLINE void dc_common_predictor_64xh_kernel(uint16_t *dst,
    const ptrdiff_t stride, const int32_t h, const __m256i dc)
{
    for (int32_t i = 0; i < h; i++) {
        _mm256_storeu_si256((__m256i *)(dst + 0x00), dc);
        _mm256_storeu_si256((__m256i *)(dst + 0x10), dc);
        _mm256_storeu_si256((__m256i *)(dst + 0x20), dc);
        _mm256_storeu_si256((__m256i *)(dst + 0x30), dc);
        dst += stride;
    }
}

static INLINE void dc_common_predictor_16xh(uint16_t *const dst,
    const ptrdiff_t stride, const int32_t h, const __m128i dc)
{
    const __m256i expected_dc = _mm256_broadcastw_epi16(dc);
    dc_common_predictor_16xh_kernel(dst, stride, h, expected_dc);
}

static INLINE void dc_common_predictor_32xh(uint16_t *const dst,
    const ptrdiff_t stride, const int32_t h, const __m128i dc)
{
    const __m256i expected_dc = _mm256_broadcastw_epi16(dc);
    dc_common_predictor_32xh_kernel(dst, stride, h, expected_dc);
}

static INLINE void dc_common_predictor_64xh(uint16_t *const dst,
    const ptrdiff_t stride, const int32_t h, const __m128i dc)
{
    const __m256i expected_dc = _mm256_broadcastw_epi16(dc);
    dc_common_predictor_64xh_kernel(dst, stride, h, expected_dc);
}
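
// Scalar sketch of the fill these kernels perform (illustrative only;
// assumed equivalent): write one DC value to every pixel of a w x h block.
static INLINE void dc_fill_ref(uint16_t *dst, const ptrdiff_t stride,
    const int32_t w, const int32_t h, const uint16_t dc) {
    for (int32_t r = 0; r < h; r++) {
        for (int32_t c = 0; c < w; c++)
            dst[c] = dc;
        dst += stride;
    }
}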

// =============================================================================

// DC_128_PRED

// 16xN

static INLINE void dc_128_predictor_16xh(uint16_t *const dst,
    const ptrdiff_t stride, const int32_t h, const int32_t bd)
{
    const __m256i dc = _mm256_set1_epi16(1 << (bd - 1));
    dc_common_predictor_16xh_kernel(dst, stride, h, dc);
}
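
// Note: 1 << (bd - 1) is the midpoint of the sample range, i.e. 128 for
// 8-bit, 512 for 10-bit and 2048 for 12-bit input, which is why DC_128
// reads no neighbor pixels.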

void eb_aom_highbd_dc_128_predictor_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd)
{
    (void)above;
    (void)left;
    dc_128_predictor_16xh(dst, stride, 4, bd);
}

void eb_aom_highbd_dc_128_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd)
{
    (void)above;
    (void)left;
    dc_128_predictor_16xh(dst, stride, 8, bd);
}

void eb_aom_highbd_dc_128_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd)
{
    (void)above;
    (void)left;
    dc_128_predictor_16xh(dst, stride, 16, bd);
}

void eb_aom_highbd_dc_128_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd)
{
    (void)above;
    (void)left;
    dc_128_predictor_16xh(dst, stride, 32, bd);
}

void eb_aom_highbd_dc_128_predictor_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd)
{
    (void)above;
    (void)left;
    dc_128_predictor_16xh(dst, stride, 64, bd);
}

// 32xN

static INLINE void dc_128_predictor_32xh(uint16_t *const dst,
    const ptrdiff_t stride, const int32_t h, const int32_t bd)
{
    const __m256i dc = _mm256_set1_epi16(1 << (bd - 1));
    dc_common_predictor_32xh_kernel(dst, stride, h, dc);
}

void eb_aom_highbd_dc_128_predictor_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd)
{
    (void)above;
    (void)left;
    dc_128_predictor_32xh(dst, stride, 8, bd);
}

void eb_aom_highbd_dc_128_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd)
{
    (void)above;
    (void)left;
    dc_128_predictor_32xh(dst, stride, 16, bd);
}

void eb_aom_highbd_dc_128_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd)
{
    (void)above;
    (void)left;
    dc_128_predictor_32xh(dst, stride, 32, bd);
}

void eb_aom_highbd_dc_128_predictor_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd)
{
    (void)above;
    (void)left;
    dc_128_predictor_32xh(dst, stride, 64, bd);
}

// 64xN

static INLINE void dc_128_predictor_64xh(uint16_t *const dst,
    const ptrdiff_t stride, const int32_t h, const int32_t bd)
{
    const __m256i dc = _mm256_set1_epi16(1 << (bd - 1));
    dc_common_predictor_64xh_kernel(dst, stride, h, dc);
}

void eb_aom_highbd_dc_128_predictor_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd)
{
    (void)above;
    (void)left;
    dc_128_predictor_64xh(dst, stride, 16, bd);
}

void eb_aom_highbd_dc_128_predictor_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd)
{
    (void)above;
    (void)left;
    dc_128_predictor_64xh(dst, stride, 32, bd);
}

void eb_aom_highbd_dc_128_predictor_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd)
{
    (void)above;
    (void)left;
    dc_128_predictor_64xh(dst, stride, 64, bd);
}

// =============================================================================

// DC_LEFT_PRED

// 16xN

void eb_aom_highbd_dc_left_predictor_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd)
{
    const __m128i round = _mm_cvtsi32_si128(2);
    __m128i sum;
    (void)above;
    (void)bd;

    sum = dc_sum_4(left);
    sum = _mm_add_epi16(sum, round);
    sum = _mm_srli_epi16(sum, 2);
    dc_common_predictor_16xh(dst, stride, 4, sum);
}
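
// The left DC is rounded as (sum + h / 2) >> log2(h). For h <= 16 the
// rounded sum still fits in a 16-bit lane, so 16-bit adds and shifts are
// used; the h >= 32 variants below switch to 32-bit lanes.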

void eb_aom_highbd_dc_left_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd)
{
    const __m128i round = _mm_cvtsi32_si128(4);
    __m128i sum;
    (void)above;
    (void)bd;

    sum = dc_sum_8(left);
    sum = _mm_add_epi16(sum, round);
    sum = _mm_srli_epi16(sum, 3);
    dc_common_predictor_16xh(dst, stride, 8, sum);
}

void eb_aom_highbd_dc_left_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd)
{
    const __m128i round = _mm_cvtsi32_si128(8);
    __m128i sum;
    (void)above;
    (void)bd;

    sum = dc_sum_16(left);
    sum = _mm_add_epi16(sum, round);
    sum = _mm_srli_epi16(sum, 4);
    dc_common_predictor_16xh(dst, stride, 16, sum);
}

void eb_aom_highbd_dc_left_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd)
{
    const __m128i round = _mm_cvtsi32_si128(16);
    __m128i sum;
    (void)above;
    (void)bd;

    sum = dc_sum_32(left);
    sum = _mm_add_epi32(sum, round);
    sum = _mm_srli_epi32(sum, 5);
    dc_common_predictor_16xh(dst, stride, 32, sum);
}

void eb_aom_highbd_dc_left_predictor_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd)
{
    const __m128i round = _mm_cvtsi32_si128(32);
    __m128i sum;
    (void)above;
    (void)bd;

    sum = dc_sum_64(left);
    sum = _mm_add_epi32(sum, round);
    sum = _mm_srli_epi32(sum, 6);
    dc_common_predictor_16xh(dst, stride, 64, sum);
}

// 32xN

void eb_aom_highbd_dc_left_predictor_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd)
{
    const __m128i round = _mm_cvtsi32_si128(4);
    __m128i sum;
    (void)above;
    (void)bd;

    sum = dc_sum_8(left);
    sum = _mm_add_epi16(sum, round);
    sum = _mm_srli_epi16(sum, 3);
    dc_common_predictor_32xh(dst, stride, 8, sum);
}

void eb_aom_highbd_dc_left_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd)
{
    const __m128i round = _mm_cvtsi32_si128(8);
    __m128i sum;
    (void)above;
    (void)bd;

    sum = dc_sum_16(left);
    sum = _mm_add_epi16(sum, round);
    sum = _mm_srli_epi16(sum, 4);
    dc_common_predictor_32xh(dst, stride, 16, sum);
}

void eb_aom_highbd_dc_left_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd)
{
    const __m128i round = _mm_cvtsi32_si128(16);
    __m128i sum;
    (void)above;
    (void)bd;

    sum = dc_sum_32(left);
    sum = _mm_add_epi32(sum, round);
    sum = _mm_srli_epi32(sum, 5);
    dc_common_predictor_32xh(dst, stride, 32, sum);
}

void eb_aom_highbd_dc_left_predictor_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd)
{
    const __m128i round = _mm_cvtsi32_si128(32);
    __m128i sum;
    (void)above;
    (void)bd;

    sum = dc_sum_64(left);
    sum = _mm_add_epi32(sum, round);
    sum = _mm_srli_epi32(sum, 6);
    dc_common_predictor_32xh(dst, stride, 64, sum);
}

// 64xN

void eb_aom_highbd_dc_left_predictor_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd)
{
    const __m128i round = _mm_cvtsi32_si128(8);
    __m128i sum;
    (void)above;
    (void)bd;

    sum = dc_sum_16(left);
    sum = _mm_add_epi16(sum, round);
    sum = _mm_srli_epi16(sum, 4);
    dc_common_predictor_64xh(dst, stride, 16, sum);
}

void eb_aom_highbd_dc_left_predictor_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd)
{
    const __m128i round = _mm_cvtsi32_si128(16);
    __m128i sum;
    (void)above;
    (void)bd;

    sum = dc_sum_32(left);
    sum = _mm_add_epi32(sum, round);
    sum = _mm_srli_epi32(sum, 5);
    dc_common_predictor_64xh(dst, stride, 32, sum);
}

void eb_aom_highbd_dc_left_predictor_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd)
{
    const __m128i round = _mm_cvtsi32_si128(32);
    __m128i sum;
    (void)above;
    (void)bd;

    sum = dc_sum_64(left);
    sum = _mm_add_epi32(sum, round);
    sum = _mm_srli_epi32(sum, 6);
    dc_common_predictor_64xh(dst, stride, 64, sum);
}

// =============================================================================

// DC_TOP_PRED

// 16xN

static INLINE void dc_top_predictor_16xh(uint16_t *const dst,
    const ptrdiff_t stride, const uint16_t *const above,
    const int32_t h, const int32_t bd)
{
    (void)bd;
    const __m128i round = _mm_cvtsi32_si128(8);
    __m128i sum;

    sum = dc_sum_16(above);
    sum = _mm_add_epi16(sum, round);
    sum = _mm_srli_epi16(sum, 4);
    dc_common_predictor_16xh(dst, stride, h, sum);
}

void eb_aom_highbd_dc_top_predictor_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd)
{
    (void)left;
    dc_top_predictor_16xh(dst, stride, above, 4, bd);
}

void eb_aom_highbd_dc_top_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd)
{
    (void)left;
    dc_top_predictor_16xh(dst, stride, above, 8, bd);
}

void eb_aom_highbd_dc_top_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd)
{
    (void)left;
    dc_top_predictor_16xh(dst, stride, above, 16, bd);
}

void eb_aom_highbd_dc_top_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd)
{
    (void)left;
    dc_top_predictor_16xh(dst, stride, above, 32, bd);
}

void eb_aom_highbd_dc_top_predictor_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd)
{
    (void)left;
    dc_top_predictor_16xh(dst, stride, above, 64, bd);
}

// 32xN

static INLINE void dc_top_predictor_32xh(uint16_t *const dst,
    const ptrdiff_t stride, const uint16_t *const above,
    const int32_t h, const int32_t bd)
{
    const __m128i round = _mm_cvtsi32_si128(16);
    __m128i sum;
    (void)bd;

    sum = dc_sum_32(above);
    sum = _mm_add_epi32(sum, round);
    sum = _mm_srli_epi32(sum, 5);
    dc_common_predictor_32xh(dst, stride, h, sum);
}

void eb_aom_highbd_dc_top_predictor_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd)
{
    (void)left;
    dc_top_predictor_32xh(dst, stride, above, 8, bd);
}

void eb_aom_highbd_dc_top_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd)
{
    (void)left;
    dc_top_predictor_32xh(dst, stride, above, 16, bd);
}

void eb_aom_highbd_dc_top_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd)
{
    (void)left;
    dc_top_predictor_32xh(dst, stride, above, 32, bd);
}

void eb_aom_highbd_dc_top_predictor_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd)
{
    (void)left;
    dc_top_predictor_32xh(dst, stride, above, 64, bd);
}

// 64xN

static INLINE void dc_top_predictor_64xh(uint16_t *const dst,
    const ptrdiff_t stride, const uint16_t *const above,
    const int32_t h, const int32_t bd)
{
    const __m128i round = _mm_cvtsi32_si128(32);
    __m128i sum;
    (void)bd;

    sum = dc_sum_64(above);
    sum = _mm_add_epi32(sum, round);
    sum = _mm_srli_epi32(sum, 6);
    dc_common_predictor_64xh(dst, stride, h, sum);
}

void eb_aom_highbd_dc_top_predictor_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd)
{
    (void)left;
    dc_top_predictor_64xh(dst, stride, above, 16, bd);
}

void eb_aom_highbd_dc_top_predictor_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd)
{
    (void)left;
    dc_top_predictor_64xh(dst, stride, above, 32, bd);
}

void eb_aom_highbd_dc_top_predictor_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd)
{
    (void)left;
    dc_top_predictor_64xh(dst, stride, above, 64, bd);
}

// =============================================================================

// DC_PRED

// 16xN

void eb_aom_highbd_dc_predictor_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd) {
    (void)bd;
    __m128i sum = dc_sum_4_16(left, above);
    uint32_t sum32 = _mm_cvtsi128_si32(sum);
    sum32 += 10;
    sum32 /= 20;
    const __m256i dc = _mm256_set1_epi16((int16_t)sum32);

    dc_common_predictor_16xh_kernel(dst, stride, 4, dc);
}
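
// Scalar sketch of the rectangular DC above (illustrative only; assumed
// equivalent): round-to-nearest average of all w + h neighbors, e.g.
// "sum32 += 10; sum32 /= 20;" is (sum + 10) / 20 for w + h = 20.
static INLINE uint16_t dc_rect_ref(const uint16_t *const above,
    const int32_t w, const uint16_t *const left, const int32_t h) {
    uint32_t sum = 0;
    for (int32_t i = 0; i < w; i++) sum += above[i];
    for (int32_t i = 0; i < h; i++) sum += left[i];
    return (uint16_t)((sum + (uint32_t)(w + h) / 2) / (uint32_t)(w + h));
}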

void eb_aom_highbd_dc_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd) {
    (void)bd;
    __m128i sum = dc_sum_8_16(left, above);
    uint32_t sum32 = _mm_cvtsi128_si32(sum);
    sum32 += 12;
    sum32 /= 24;
    const __m256i dc = _mm256_set1_epi16((int16_t)sum32);

    dc_common_predictor_16xh_kernel(dst, stride, 8, dc);
}

void eb_aom_highbd_dc_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd) {
    (void)bd;
    __m128i sum = dc_sum_16_16(above, left);
    sum = _mm_add_epi32(sum, _mm_set1_epi32(16));
    sum = _mm_srli_epi32(sum, 5);
    dc_common_predictor_16xh(dst, stride, 16, sum);
}
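
// For square blocks w + h is a power of two, so the divide reduces to an
// add-and-shift: (sum + 16) >> 5 above for 16 + 16 neighbors.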

void eb_aom_highbd_dc_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd) {
    (void)bd;
    __m128i sum = dc_sum_16_32(above, left);
    uint32_t sum32 = _mm_cvtsi128_si32(sum);
    sum32 += 24;
    sum32 /= 48;
    const __m256i dc = _mm256_set1_epi16((int16_t)sum32);

    dc_common_predictor_16xh_kernel(dst, stride, 32, dc);
}

void eb_aom_highbd_dc_predictor_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd) {
    (void)bd;
    __m128i sum = dc_sum_16_64(above, left);
    uint32_t sum32 = _mm_cvtsi128_si32(sum);
    sum32 += 40;
    sum32 /= 80;
    const __m256i dc = _mm256_set1_epi16((int16_t)sum32);

    dc_common_predictor_16xh_kernel(dst, stride, 64, dc);
}

// 32xN

void eb_aom_highbd_dc_predictor_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd) {
    (void)bd;
    __m128i sum = dc_sum_8_32(left, above);
    uint32_t sum32 = _mm_cvtsi128_si32(sum);
    sum32 += 20;
    sum32 /= 40;
    const __m256i dc = _mm256_set1_epi16((int16_t)sum32);

    dc_common_predictor_32xh_kernel(dst, stride, 8, dc);
}

void eb_aom_highbd_dc_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd) {
    (void)bd;
    __m128i sum = dc_sum_16_32(left, above);
    uint32_t sum32 = _mm_cvtsi128_si32(sum);
    sum32 += 24;
    sum32 /= 48;
    const __m256i dc = _mm256_set1_epi16((int16_t)sum32);

    dc_common_predictor_32xh_kernel(dst, stride, 16, dc);
}

void eb_aom_highbd_dc_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd) {
    (void)bd;
    __m128i sum = dc_sum_32_32(above, left);
    sum = _mm_add_epi32(sum, _mm_set1_epi32(32));
    sum = _mm_srli_epi32(sum, 6);
    dc_common_predictor_32xh(dst, stride, 32, sum);
}

void eb_aom_highbd_dc_predictor_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd) {
    (void)bd;
    __m128i sum = dc_sum_32_64(above, left);
    uint32_t sum32 = _mm_cvtsi128_si32(sum);
    sum32 += 48;
    sum32 /= 96;
    const __m256i dc = _mm256_set1_epi16((int16_t)sum32);

    dc_common_predictor_32xh_kernel(dst, stride, 64, dc);
}

// 64xN

void eb_aom_highbd_dc_predictor_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd) {
    (void)bd;
    __m128i sum = dc_sum_16_64(left, above);
    uint32_t sum32 = _mm_cvtsi128_si32(sum);
    sum32 += 40;
    sum32 /= 80;
    const __m256i dc = _mm256_set1_epi16((int16_t)sum32);

    dc_common_predictor_64xh_kernel(dst, stride, 16, dc);
}

void eb_aom_highbd_dc_predictor_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd) {
    (void)bd;
    __m128i sum = dc_sum_32_64(left, above);
    uint32_t sum32 = _mm_cvtsi128_si32(sum);
    sum32 += 48;
    sum32 /= 96;
    const __m256i dc = _mm256_set1_epi16((int16_t)sum32);

    dc_common_predictor_64xh_kernel(dst, stride, 32, dc);
}

void eb_aom_highbd_dc_predictor_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
    const uint16_t *above, const uint16_t *left, int32_t bd) {
    (void)bd;
    __m128i sum = dc_sum_64_64(above, left);
    sum = _mm_add_epi32(sum, _mm_set1_epi32(64));
    sum = _mm_srli_epi32(sum, 7);
    dc_common_predictor_64xh(dst, stride, 64, sum);
}

// =============================================================================

// H_PRED

// 16xN

static INLINE void h_pred_16(uint16_t **const dst, const ptrdiff_t stride,
    const __m128i left)
{
    // Broadcast the 16-bit left pixel to a 256-bit register.
    const __m256i row = _mm256_broadcastw_epi16(left);

    _mm256_storeu_si256((__m256i *)(*dst + 0x00), row);
    *dst += stride;
}

// Process 8 rows.
static INLINE void h_pred_16x8(uint16_t **dst, const ptrdiff_t stride,
    const uint16_t *const left)
{
    // dst and its stride must be 32-byte aligned.
    assert(!((intptr_t)*dst % 32));
    assert(!(stride % 32));

    const __m128i left_u16 = _mm_load_si128((const __m128i *)left);

    h_pred_16(dst, stride, _mm_srli_si128(left_u16, 0));
    h_pred_16(dst, stride, _mm_srli_si128(left_u16, 2));
    h_pred_16(dst, stride, _mm_srli_si128(left_u16, 4));
    h_pred_16(dst, stride, _mm_srli_si128(left_u16, 6));
    h_pred_16(dst, stride, _mm_srli_si128(left_u16, 8));
    h_pred_16(dst, stride, _mm_srli_si128(left_u16, 10));
    h_pred_16(dst, stride, _mm_srli_si128(left_u16, 12));
    h_pred_16(dst, stride, _mm_srli_si128(left_u16, 14));
}
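
// Each _mm_srli_si128(left_u16, 2 * i) moves the i-th left pixel into the
// low word, so the _mm256_broadcastw_epi16() inside h_pred_16() replicates
// that pixel across the whole row.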
     861             : 
     862             : // 16x4
     863             : 
     864           0 : void eb_aom_highbd_h_predictor_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
     865             :     const uint16_t *above, const uint16_t *left, int32_t bd)
     866             : {
     867             :     (void)above;
     868             :     (void)bd;
     869             : 
     870             :     // dst and it's stride must be 32-byte aligned.
     871             :     assert(!((intptr_t)dst % 32));
     872             :     assert(!(stride % 32));
     873             : 
     874           0 :     const __m128i left_u16 = _mm_loadl_epi64((const __m128i *)left);
     875             : 
     876           0 :     h_pred_16(&dst, stride, _mm_srli_si128(left_u16, 0));
     877           0 :     h_pred_16(&dst, stride, _mm_srli_si128(left_u16, 2));
     878           0 :     h_pred_16(&dst, stride, _mm_srli_si128(left_u16, 4));
     879           0 :     h_pred_16(&dst, stride, _mm_srli_si128(left_u16, 6));
     880           0 : }
     881             : 
     882             : // 16x64
     883             : 
     884           0 : void eb_aom_highbd_h_predictor_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
     885             :     const uint16_t *above, const uint16_t *left, int32_t bd)
     886             : {
     887             :     (void)above;
     888             :     (void)bd;
     889             : 
     890           0 :     for (int32_t i = 0; i < 8; i++, left += 8)
     891           0 :         h_pred_16x8(&dst, stride, left);
     892           0 : }
     893             : 
     894             : // -----------------------------------------------------------------------------
     895             : 
     896             : // 32xN
     897             : 
     898           0 : static INLINE void h_pred_32(uint16_t **const dst, const ptrdiff_t stride,
     899             :     const __m128i left)
     900             : {
     901             :     // Broadcast the 16-bit left pixel to 256-bit register.
     902           0 :     const __m256i row = _mm256_broadcastw_epi16(left);
     903             : 
     904           0 :     _mm256_storeu_si256((__m256i *)(*dst + 0x00), row);
     905           0 :     _mm256_storeu_si256((__m256i *)(*dst + 0x10), row);
     906           0 :     *dst += stride;
     907           0 : }
     908             : 
     909             : // Process 8 rows.
     910           0 : static INLINE void h_pred_32x8(uint16_t **dst, const ptrdiff_t stride,
     911             :     const uint16_t *const left)
     912             : {
     913             :     // dst and it's stride must be 32-byte aligned.
     914             :     assert(!((intptr_t)*dst % 32));
     915             :     assert(!(stride % 32));
     916             : 
     917           0 :     const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
     918             : 
     919           0 :     h_pred_32(dst, stride, _mm_srli_si128(left_u16, 0));
     920           0 :     h_pred_32(dst, stride, _mm_srli_si128(left_u16, 2));
     921           0 :     h_pred_32(dst, stride, _mm_srli_si128(left_u16, 4));
     922           0 :     h_pred_32(dst, stride, _mm_srli_si128(left_u16, 6));
     923           0 :     h_pred_32(dst, stride, _mm_srli_si128(left_u16, 8));
     924           0 :     h_pred_32(dst, stride, _mm_srli_si128(left_u16, 10));
     925           0 :     h_pred_32(dst, stride, _mm_srli_si128(left_u16, 12));
     926           0 :     h_pred_32(dst, stride, _mm_srli_si128(left_u16, 14));
     927           0 : }
     928             : 
     929             : // 32x8
     930             : 
     931           0 : void eb_aom_highbd_h_predictor_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
     932             :     const uint16_t *above, const uint16_t *left, int32_t bd)
     933             : {
     934             :     (void)above;
     935             :     (void)bd;
     936             : 
     937           0 :     h_pred_32x8(&dst, stride, left);
     938           0 : }
     939             : 
     940             : // 32x64
     941             : 
     942           0 : void eb_aom_highbd_h_predictor_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
     943             :     const uint16_t *above, const uint16_t *left, int32_t bd)
     944             : {
     945             :     (void)above;
     946             :     (void)bd;
     947             : 
     948           0 :     for (int32_t i = 0; i < 8; i++, left += 8)
     949           0 :         h_pred_32x8(&dst, stride, left);
     950           0 : }
     951             : 
     952             : // -----------------------------------------------------------------------------
     953             : 
     954             : // 64xN
     955             : 
     956           0 : static INLINE void h_pred_64(uint16_t **const dst, const ptrdiff_t stride,
     957             :     const __m128i left)
     958             : {
     959             :     // Broadcast the 16-bit left pixel to 256-bit register.
     960           0 :     const __m256i row = _mm256_broadcastw_epi16(left);
     961             : 
     962           0 :     _mm256_storeu_si256((__m256i *)(*dst + 0x00), row);
     963           0 :     _mm256_storeu_si256((__m256i *)(*dst + 0x10), row);
     964           0 :     _mm256_storeu_si256((__m256i *)(*dst + 0x20), row);
     965           0 :     _mm256_storeu_si256((__m256i *)(*dst + 0x30), row);
     966           0 :     *dst += stride;
     967           0 : }
     968             : 
     969             : // Process 8 rows.
     970           0 : static INLINE void h_pred_64x8(uint16_t **dst, const ptrdiff_t stride,
     971             :     const uint16_t *const left)
     972             : {
     973             :     // dst and it's stride must be 32-byte aligned.
     974             :     assert(!((intptr_t)*dst % 32));
     975             :     assert(!(stride % 32));
     976             : 
     977           0 :     const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
     978             : 
     979           0 :     h_pred_64(dst, stride, _mm_srli_si128(left_u16, 0));
     980           0 :     h_pred_64(dst, stride, _mm_srli_si128(left_u16, 2));
     981           0 :     h_pred_64(dst, stride, _mm_srli_si128(left_u16, 4));
     982           0 :     h_pred_64(dst, stride, _mm_srli_si128(left_u16, 6));
     983           0 :     h_pred_64(dst, stride, _mm_srli_si128(left_u16, 8));
     984           0 :     h_pred_64(dst, stride, _mm_srli_si128(left_u16, 10));
     985           0 :     h_pred_64(dst, stride, _mm_srli_si128(left_u16, 12));
     986           0 :     h_pred_64(dst, stride, _mm_srli_si128(left_u16, 14));
     987           0 : }
     988             : 
     989             : // 64x16
     990             : 
     991           0 : void eb_aom_highbd_h_predictor_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
     992             :     const uint16_t *above, const uint16_t *left, int32_t bd)
     993             : {
     994             :     (void)above;
     995             :     (void)bd;
     996             : 
     997           0 :     for (int32_t i = 0; i < 2; i++, left += 8)
     998           0 :         h_pred_64x8(&dst, stride, left);
     999           0 : }
    1000             : 
    1001             : // 64x32
    1002             : 
    1003           0 : void eb_aom_highbd_h_predictor_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
    1004             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    1005             : {
    1006             :     (void)above;
    1007             :     (void)bd;
    1008             : 
    1009           0 :     for (int32_t i = 0; i < 4; i++, left += 8)
    1010           0 :         h_pred_64x8(&dst, stride, left);
    1011           0 : }
    1012             : 
    1013             : // 64x64
    1014             : 
    1015           0 : void eb_aom_highbd_h_predictor_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
    1016             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    1017             : {
    1018             :     (void)above;
    1019             :     (void)bd;
    1020             : 
    1021           0 :     for (int32_t i = 0; i < 8; i++, left += 8)
    1022           0 :         h_pred_64x8(&dst, stride, left);
    1023           0 : }
    1024             : 
    1025             : // =============================================================================
    1026             : 
    1027             : // V_PRED
    1028             : 
    1029             : // 16xN
    1030             : 
    1031           0 : static INLINE void v_pred_16(uint16_t **const dst, const ptrdiff_t stride,
    1032             :     const __m256i above0)
    1033             : {
    1034           0 :     _mm256_storeu_si256((__m256i *)(*dst + 0x00), above0);
    1035           0 :     *dst += stride;
    1036           0 : }
    1037             : 
    1038             : // Process 8 rows.
    1039           0 : static INLINE void v_pred_16x8(uint16_t **const dst, const ptrdiff_t stride,
    1040             :     const __m256i above)
    1041             : {
    1042             :     // dst and its stride must be 32-byte aligned.
    1043             :     assert(!((intptr_t)*dst % 32));
    1044             :     assert(!(stride % 32));
    1045             : 
    1046           0 :     v_pred_16(dst, stride, above);
    1047           0 :     v_pred_16(dst, stride, above);
    1048           0 :     v_pred_16(dst, stride, above);
    1049           0 :     v_pred_16(dst, stride, above);
    1050           0 :     v_pred_16(dst, stride, above);
    1051           0 :     v_pred_16(dst, stride, above);
    1052           0 :     v_pred_16(dst, stride, above);
    1053           0 :     v_pred_16(dst, stride, above);
    1054           0 : }
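
// For reference only: a minimal scalar sketch (hypothetical helper, not part
// of the original file) of the V predictor: every row is a verbatim copy of
// the `above` row, so the AVX2 code loads `above` once and stores it h times.
static INLINE void v_pred_scalar_sketch(uint16_t *dst, const ptrdiff_t stride,
    const uint16_t *const above, const int32_t w, const int32_t h)
{
    for (int32_t r = 0; r < h; r++, dst += stride) {
        for (int32_t c = 0; c < w; c++)
            dst[c] = above[c]; // copy the row above the block
    }
}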
    1055             : 
    1056             : // 16x4
    1057             : 
    1058           0 : void eb_aom_highbd_v_predictor_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
    1059             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    1060             : {
    1061             :     // Load all 16 pixels in a row into a 256-bit register.
    1062           0 :     const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
    1063             : 
    1064             :     (void)left;
    1065             :     (void)bd;
    1066             : 
    1067             :     // dst and its stride must be 32-byte aligned.
    1068             :     assert(!((intptr_t)dst % 32));
    1069             :     assert(!(stride % 32));
    1070             : 
    1071           0 :     v_pred_16(&dst, stride, above0);
    1072           0 :     v_pred_16(&dst, stride, above0);
    1073           0 :     v_pred_16(&dst, stride, above0);
    1074           0 :     v_pred_16(&dst, stride, above0);
    1075           0 : }
    1076             : 
    1077             : // 16x8
    1078             : 
    1079           0 : void eb_aom_highbd_v_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
    1080             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    1081             : {
    1082             :     // Load all 16 pixels in a row into a 256-bit register.
    1083           0 :     const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
    1084             : 
    1085             :     (void)left;
    1086             :     (void)bd;
    1087             : 
    1088             :     // dst and its stride must be 32-byte aligned.
    1089             :     assert(!((intptr_t)dst % 32));
    1090             :     assert(!(stride % 32));
    1091             : 
    1092           0 :     v_pred_16x8(&dst, stride, above0);
    1093           0 : }
    1094             : 
    1095             : // 16x16
    1096             : 
    1097           0 : void eb_aom_highbd_v_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
    1098             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    1099             : {
    1100             :     // Load all 16 pixels in a row into a 256-bit register.
    1101           0 :     const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
    1102             : 
    1103             :     (void)left;
    1104             :     (void)bd;
    1105             : 
    1106           0 :     for (int32_t i = 0; i < 2; i++)
    1107           0 :         v_pred_16x8(&dst, stride, above0);
    1108           0 : }
    1109             : 
    1110             : // 16x32
    1111             : 
    1112           0 : void eb_aom_highbd_v_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
    1113             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    1114             : {
    1115             :     // Load all 16 pixels in a row into a 256-bit register.
    1116           0 :     const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
    1117             : 
    1118             :     (void)left;
    1119             :     (void)bd;
    1120             : 
    1121           0 :     for (int32_t i = 0; i < 4; i++)
    1122           0 :         v_pred_16x8(&dst, stride, above0);
    1123           0 : }
    1124             : 
    1125             : // 16x64
    1126             : 
    1127           0 : void eb_aom_highbd_v_predictor_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
    1128             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    1129             : {
    1130             :     // Load all 16 pixels in a row into a 256-bit register.
    1131           0 :     const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
    1132             : 
    1133             :     (void)left;
    1134             :     (void)bd;
    1135             : 
    1136           0 :     for (int32_t i = 0; i < 8; i++)
    1137           0 :         v_pred_16x8(&dst, stride, above0);
    1138           0 : }
    1139             : 
    1140             : // -----------------------------------------------------------------------------
    1141             : 
    1142             : // 32xN
    1143             : 
    1144           0 : static INLINE void v_pred_32(uint16_t **const dst, const ptrdiff_t stride,
    1145             :     const __m256i above0, const __m256i above1)
    1146             : {
    1147           0 :     _mm256_storeu_si256((__m256i *)(*dst + 0x00), above0);
    1148           0 :     _mm256_storeu_si256((__m256i *)(*dst + 0x10), above1);
    1149           0 :     *dst += stride;
    1150           0 : }
    1151             : 
    1152             : // Process 8 rows.
    1153           0 : static INLINE void v_pred_32x8(uint16_t **const dst, const ptrdiff_t stride,
    1154             :     const __m256i above0, const __m256i above1)
    1155             : {
    1156             :     // dst and its stride must be 32-byte aligned.
    1157             :     assert(!((intptr_t)*dst % 32));
    1158             :     assert(!(stride % 32));
    1159             : 
    1160           0 :     v_pred_32(dst, stride, above0, above1);
    1161           0 :     v_pred_32(dst, stride, above0, above1);
    1162           0 :     v_pred_32(dst, stride, above0, above1);
    1163           0 :     v_pred_32(dst, stride, above0, above1);
    1164           0 :     v_pred_32(dst, stride, above0, above1);
    1165           0 :     v_pred_32(dst, stride, above0, above1);
    1166           0 :     v_pred_32(dst, stride, above0, above1);
    1167           0 :     v_pred_32(dst, stride, above0, above1);
    1168           0 : }
    1169             : 
    1170             : // 32x8
    1171             : 
    1172           0 : void eb_aom_highbd_v_predictor_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
    1173             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    1174             : {
    1175             :     // Load all 32 pixels in a row into 256-bit registers.
    1176           0 :     const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
    1177           0 :     const __m256i above1 = _mm256_loadu_si256((const __m256i *)(above + 0x10));
    1178             : 
    1179             :     (void)left;
    1180             :     (void)bd;
    1181             : 
    1182           0 :     v_pred_32x8(&dst, stride, above0, above1);
    1183           0 : }
    1184             : 
    1185             : // 32x16
    1186             : 
    1187           0 : void eb_aom_highbd_v_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
    1188             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    1189             : {
    1190             :     // Load all 32 pixels in a row into 256-bit registers.
    1191           0 :     const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
    1192           0 :     const __m256i above1 = _mm256_loadu_si256((const __m256i *)(above + 0x10));
    1193             : 
    1194             :     (void)left;
    1195             :     (void)bd;
    1196             : 
    1197           0 :     for (int32_t i = 0; i < 2; i++)
    1198           0 :         v_pred_32x8(&dst, stride, above0, above1);
    1199           0 : }
    1200             : 
    1201             : // 32x32
    1202             : 
    1203           0 : void eb_aom_highbd_v_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
    1204             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    1205             : {
    1206             :     // Load all 32 pixels in a row into 256-bit registers.
    1207           0 :     const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
    1208           0 :     const __m256i above1 = _mm256_loadu_si256((const __m256i *)(above + 0x10));
    1209             : 
    1210             :     (void)left;
    1211             :     (void)bd;
    1212             : 
    1213           0 :     for (int32_t i = 0; i < 4; i++)
    1214           0 :         v_pred_32x8(&dst, stride, above0, above1);
    1215           0 : }
    1216             : 
    1217             : // 32x64
    1218             : 
    1219           0 : void eb_aom_highbd_v_predictor_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
    1220             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    1221             : {
    1222             :     // Load all 32 pixels in a row into 256-bit registers.
    1223           0 :     const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
    1224           0 :     const __m256i above1 = _mm256_loadu_si256((const __m256i *)(above + 0x10));
    1225             : 
    1226             :     (void)left;
    1227             :     (void)bd;
    1228             : 
    1229           0 :     for (int32_t i = 0; i < 8; i++)
    1230           0 :         v_pred_32x8(&dst, stride, above0, above1);
    1231           0 : }
    1232             : 
    1233             : // -----------------------------------------------------------------------------
    1234             : 
    1235             : // 64xN
    1236             : 
    1237           0 : static INLINE void v_pred_64(uint16_t **const dst, const ptrdiff_t stride,
    1238             :     const __m256i above0, const __m256i above1, const __m256i above2,
    1239             :     const __m256i above3)
    1240             : {
    1241           0 :     _mm256_storeu_si256((__m256i *)(*dst + 0x00), above0);
    1242           0 :     _mm256_storeu_si256((__m256i *)(*dst + 0x10), above1);
    1243           0 :     _mm256_storeu_si256((__m256i *)(*dst + 0x20), above2);
    1244           0 :     _mm256_storeu_si256((__m256i *)(*dst + 0x30), above3);
    1245           0 :     *dst += stride;
    1246           0 : }
    1247             : 
    1248             : // Process 8 rows.
    1249           0 : static INLINE void v_pred_64x8(uint16_t **const dst, const ptrdiff_t stride,
    1250             :     const __m256i above0, const __m256i above1, const __m256i above2,
    1251             :     const __m256i above3)
    1252             : {
    1253             :     // dst and its stride must be 32-byte aligned.
    1254             :     assert(!((intptr_t)*dst % 32));
    1255             :     assert(!(stride % 32));
    1256             : 
    1257           0 :     v_pred_64(dst, stride, above0, above1, above2, above3);
    1258           0 :     v_pred_64(dst, stride, above0, above1, above2, above3);
    1259           0 :     v_pred_64(dst, stride, above0, above1, above2, above3);
    1260           0 :     v_pred_64(dst, stride, above0, above1, above2, above3);
    1261           0 :     v_pred_64(dst, stride, above0, above1, above2, above3);
    1262           0 :     v_pred_64(dst, stride, above0, above1, above2, above3);
    1263           0 :     v_pred_64(dst, stride, above0, above1, above2, above3);
    1264           0 :     v_pred_64(dst, stride, above0, above1, above2, above3);
    1265           0 : }
    1266             : 
    1267             : // 64x16
    1268             : 
    1269           0 : void eb_aom_highbd_v_predictor_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
    1270             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    1271             : {
    1272             :     // Load all 64 pixels in a row into 256-bit registers.
    1273           0 :     const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
    1274           0 :     const __m256i above1 = _mm256_loadu_si256((const __m256i *)(above + 0x10));
    1275           0 :     const __m256i above2 = _mm256_loadu_si256((const __m256i *)(above + 0x20));
    1276           0 :     const __m256i above3 = _mm256_loadu_si256((const __m256i *)(above + 0x30));
    1277             : 
    1278             :     (void)left;
    1279             :     (void)bd;
    1280             : 
    1281           0 :     for (int32_t i = 0; i < 2; i++)
    1282           0 :         v_pred_64x8(&dst, stride, above0, above1, above2, above3);
    1283           0 : }
    1284             : 
    1285             : // 64x32
    1286             : 
    1287           0 : void eb_aom_highbd_v_predictor_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
    1288             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    1289             : {
    1290             :     // Load all 64 pixels in a row into 256-bit registers.
    1291           0 :     const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
    1292           0 :     const __m256i above1 = _mm256_loadu_si256((const __m256i *)(above + 0x10));
    1293           0 :     const __m256i above2 = _mm256_loadu_si256((const __m256i *)(above + 0x20));
    1294           0 :     const __m256i above3 = _mm256_loadu_si256((const __m256i *)(above + 0x30));
    1295             : 
    1296             :     (void)left;
    1297             :     (void)bd;
    1298             : 
    1299           0 :     for (int32_t i = 0; i < 4; i++)
    1300           0 :         v_pred_64x8(&dst, stride, above0, above1, above2, above3);
    1301           0 : }
    1302             : 
    1303             : // 64x64
    1304             : 
    1305           0 : void eb_aom_highbd_v_predictor_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
    1306             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    1307             : {
    1308             :     // Load all 64 pixels in a row into 256-bit registers.
    1309           0 :     const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
    1310           0 :     const __m256i above1 = _mm256_loadu_si256((const __m256i *)(above + 0x10));
    1311           0 :     const __m256i above2 = _mm256_loadu_si256((const __m256i *)(above + 0x20));
    1312           0 :     const __m256i above3 = _mm256_loadu_si256((const __m256i *)(above + 0x30));
    1313             : 
    1314             :     (void)left;
    1315             :     (void)bd;
    1316             : 
    1317           0 :     for (int32_t i = 0; i < 8; i++)
    1318           0 :         v_pred_64x8(&dst, stride, above0, above1, above2, above3);
    1319           0 : }
    1320             : 
    1321             : // =============================================================================
    1322             : 
    1323             : // Smooth weights stored as (w, 256 - w) pairs, with each group of four pairs repeated in both 128-bit lanes for AVX2 (a generator sketch follows the bs = 4 table).
    1324             : 
    1325             : // bs = 4
    1326             : EB_ALIGN(32) static const uint16_t sm_weights_d_4[16] = {
    1327             :     255, 1, 149, 107, 85, 171, 64, 192, //  0  1  2  3
    1328             :     255, 1, 149, 107, 85, 171, 64, 192  //  0  1  2  3
    1329             : };
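
// The values above are not arbitrary: each AV1 smooth weight w is stored as
// the pair (w, 256 - w), and every group of four pairs is duplicated in both
// 128-bit lanes so that _mm256_madd_epi16 can evaluate w * p0 + (256 - w) * p1
// in one instruction. A hypothetical generator (assuming base_weights holds
// the AV1 sm_weights for this block size, e.g. {255, 149, 85, 64} for bs = 4):
static INLINE void build_dup_weights_sketch(const uint8_t *const base_weights,
    const int32_t bs, uint16_t *out)
{
    for (int32_t i = 0; i < bs; i += 4) {
        for (int32_t lane = 0; lane < 2; lane++) { // duplicate per 128-bit lane
            for (int32_t j = 0; j < 4; j++) {
                *out++ = base_weights[i + j];                   // w
                *out++ = (uint16_t)(256 - base_weights[i + j]); // 256 - w
            }
        }
    }
}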
    1330             : 
    1331             : // bs = 8
    1332             : EB_ALIGN(32) static const uint16_t sm_weights_d_8[32] = {
    1333             :     255, 1, 197, 59, 146, 110, 105, 151, //  0  1  2  3
    1334             :     255, 1, 197, 59, 146, 110, 105, 151, //  0  1  2  3
    1335             :     73, 183, 50, 206, 37, 219, 32, 224,  //  4  5  6  7
    1336             :     73, 183, 50, 206, 37, 219, 32, 224   //  4  5  6  7
    1337             : };
    1338             : 
    1339             : // bs = 16
    1340             : EB_ALIGN(32) static const uint16_t sm_weights_d_16[64] = {
    1341             :     255, 1, 225, 31, 196, 60, 170, 86,     //  0  1  2  3
    1342             :     255, 1, 225, 31, 196, 60, 170, 86,     //  0  1  2  3
    1343             :     145, 111, 123, 133, 102, 154, 84, 172, //  4  5  6  7
    1344             :     145, 111, 123, 133, 102, 154, 84, 172, //  4  5  6  7
    1345             :     68, 188, 54, 202, 43, 213, 33, 223,    //  8  9 10 11
    1346             :     68, 188, 54, 202, 43, 213, 33, 223,    //  8  9 10 11
    1347             :     26, 230, 20, 236, 17, 239, 16, 240,    // 12 13 14 15
    1348             :     26, 230, 20, 236, 17, 239, 16, 240     // 12 13 14 15
    1349             : };
    1350             : 
    1351             : // bs = 32
    1352             : EB_ALIGN(32) static const uint16_t sm_weights_d_32[128] = {
    1353             :     255, 1, 240, 16, 225, 31, 210, 46,      //  0  1  2  3
    1354             :     255, 1, 240, 16, 225, 31, 210, 46,      //  0  1  2  3
    1355             :     196, 60, 182, 74, 169, 87, 157, 99,     //  4  5  6  7
    1356             :     196, 60, 182, 74, 169, 87, 157, 99,     //  4  5  6  7
    1357             :     145, 111, 133, 123, 122, 134, 111, 145, //  8  9 10 11
    1358             :     145, 111, 133, 123, 122, 134, 111, 145, //  8  9 10 11
    1359             :     101, 155, 92, 164, 83, 173, 74, 182,    // 12 13 14 15
    1360             :     101, 155, 92, 164, 83, 173, 74, 182,    // 12 13 14 15
    1361             :     66, 190, 59, 197, 52, 204, 45, 211,     // 16 17 18 19
    1362             :     66, 190, 59, 197, 52, 204, 45, 211,     // 16 17 18 19
    1363             :     39, 217, 34, 222, 29, 227, 25, 231,     // 20 21 22 23
    1364             :     39, 217, 34, 222, 29, 227, 25, 231,     // 20 21 22 23
    1365             :     21, 235, 17, 239, 14, 242, 12, 244,     // 24 25 26 27
    1366             :     21, 235, 17, 239, 14, 242, 12, 244,     // 24 25 26 27
    1367             :     10, 246, 9, 247, 8, 248, 8, 248,        // 28 29 30 31
    1368             :     10, 246, 9, 247, 8, 248, 8, 248         // 28 29 30 31
    1369             : };
    1370             : 
    1371             : // bs = 64
    1372             : EB_ALIGN(32) static const uint16_t sm_weights_d_64[256] = {
    1373             :     255, 1, 248, 8, 240, 16, 233, 23,       //  0  1  2  3
    1374             :     255, 1, 248, 8, 240, 16, 233, 23,       //  0  1  2  3
    1375             :     225, 31, 218, 38, 210, 46, 203, 53,     //  4  5  6  7
    1376             :     225, 31, 218, 38, 210, 46, 203, 53,     //  4  5  6  7
    1377             :     196, 60, 189, 67, 182, 74, 176, 80,     //  8  9 10 11
    1378             :     196, 60, 189, 67, 182, 74, 176, 80,     //  8  9 10 11
    1379             :     169, 87, 163, 93, 156, 100, 150, 106,   // 12 13 14 15
    1380             :     169, 87, 163, 93, 156, 100, 150, 106,   // 12 13 14 15
    1381             :     144, 112, 138, 118, 133, 123, 127, 129, // 16 17 18 19
    1382             :     144, 112, 138, 118, 133, 123, 127, 129, // 16 17 18 19
    1383             :     121, 135, 116, 140, 111, 145, 106, 150, // 20 21 22 23
    1384             :     121, 135, 116, 140, 111, 145, 106, 150, // 20 21 22 23
    1385             :     101, 155, 96, 160, 91, 165, 86, 170,    // 24 25 26 27
    1386             :     101, 155, 96, 160, 91, 165, 86, 170,    // 24 25 26 27
    1387             :     82, 174, 77, 179, 73, 183, 69, 187,     // 28 29 30 31
    1388             :     82, 174, 77, 179, 73, 183, 69, 187,     // 28 29 30 31
    1389             :     65, 191, 61, 195, 57, 199, 54, 202,     // 32 33 34 35
    1390             :     65, 191, 61, 195, 57, 199, 54, 202,     // 32 33 34 35
    1391             :     50, 206, 47, 209, 44, 212, 41, 215,     // 36 37 38 39
    1392             :     50, 206, 47, 209, 44, 212, 41, 215,     // 36 37 38 39
    1393             :     38, 218, 35, 221, 32, 224, 29, 227,     // 40 41 42 43
    1394             :     38, 218, 35, 221, 32, 224, 29, 227,     // 40 41 42 43
    1395             :     27, 229, 25, 231, 22, 234, 20, 236,     // 44 45 46 47
    1396             :     27, 229, 25, 231, 22, 234, 20, 236,     // 44 45 46 47
    1397             :     18, 238, 16, 240, 15, 241, 13, 243,     // 48 49 50 51
    1398             :     18, 238, 16, 240, 15, 241, 13, 243,     // 48 49 50 51
    1399             :     12, 244, 10, 246, 9, 247, 8, 248,       // 52 53 54 55
    1400             :     12, 244, 10, 246, 9, 247, 8, 248,       // 52 53 54 55
    1401             :     7, 249, 6, 250, 6, 250, 5, 251,         // 56 57 58 59
    1402             :     7, 249, 6, 250, 6, 250, 5, 251,         // 56 57 58 59
    1403             :     5, 251, 4, 252, 4, 252, 4, 252,         // 60 61 62 63
    1404             :     5, 251, 4, 252, 4, 252, 4, 252          // 60 61 62 63
    1405             : };
    1406             : 
    1407             : // -----------------------------------------------------------------------------
    1408             : 
    1409             : // Width weights shuffled into the cross-lane order of unpacklo/unpackhi for AVX2 (see the note after the bs = 64 table).
    1410             : 
    1411             : // bs = 16
    1412             : EB_ALIGN(32) static const uint16_t sm_weights_16[32] = {
    1413             :     255, 1, 225, 31, 196, 60, 170, 86,     //  0  1  2  3
    1414             :     68, 188, 54, 202, 43, 213, 33, 223,    //  8  9 10 11
    1415             :     145, 111, 123, 133, 102, 154, 84, 172, //  4  5  6  7
    1416             :     26, 230, 20, 236, 17, 239, 16, 240     // 12 13 14 15
    1417             : };
    1418             : 
    1419             : // bs = 32
    1420             : EB_ALIGN(32) static const uint16_t sm_weights_32[64] = {
    1421             :     255, 1, 240, 16, 225, 31, 210, 46,      //  0  1  2  3
    1422             :     145, 111, 133, 123, 122, 134, 111, 145, //  8  9 10 11
    1423             :     196, 60, 182, 74, 169, 87, 157, 99,     //  4  5  6  7
    1424             :     101, 155, 92, 164, 83, 173, 74, 182,    // 12 13 14 15
    1425             :     66, 190, 59, 197, 52, 204, 45, 211,     // 16 17 18 19
    1426             :     21, 235, 17, 239, 14, 242, 12, 244,     // 24 25 26 27
    1427             :     39, 217, 34, 222, 29, 227, 25, 231,     // 20 21 22 23
    1428             :     10, 246, 9, 247, 8, 248, 8, 248         // 28 29 30 31
    1429             : };
    1430             : 
    1431             : // bs = 64
    1432             : EB_ALIGN(32) static const uint16_t sm_weights_64[128] = {
    1433             :     255, 1, 248, 8, 240, 16, 233, 23,       //  0  1  2  3
    1434             :     196, 60, 189, 67, 182, 74, 176, 80,     //  8  9 10 11
    1435             :     225, 31, 218, 38, 210, 46, 203, 53,     //  4  5  6  7
    1436             :     169, 87, 163, 93, 156, 100, 150, 106,   // 12 13 14 15
    1437             :     144, 112, 138, 118, 133, 123, 127, 129, // 16 17 18 19
    1438             :     101, 155, 96, 160, 91, 165, 86, 170,    // 24 25 26 27
    1439             :     121, 135, 116, 140, 111, 145, 106, 150, // 20 21 22 23
    1440             :     82, 174, 77, 179, 73, 183, 69, 187,     // 28 29 30 31
    1441             :     65, 191, 61, 195, 57, 199, 54, 202,     // 32 33 34 35
    1442             :     38, 218, 35, 221, 32, 224, 29, 227,     // 40 41 42 43
    1443             :     50, 206, 47, 209, 44, 212, 41, 215,     // 36 37 38 39
    1444             :     27, 229, 25, 231, 22, 234, 20, 236,     // 44 45 46 47
    1445             :     18, 238, 16, 240, 15, 241, 13, 243,     // 48 49 50 51
    1446             :     7, 249, 6, 250, 6, 250, 5, 251,         // 56 57 58 59
    1447             :     12, 244, 10, 246, 9, 247, 8, 248,       // 52 53 54 55
    1448             :     5, 251, 4, 252, 4, 252, 4, 252          // 60 61 62 63
    1449             : };
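
// Note the group order in the tables above (0-3, 8-11, 4-7, 12-15, ...):
// prepare_ab() below builds its (above, bottom) pairs with unpacklo/unpackhi,
// which interleave within each 128-bit lane, and _mm256_packs_epi32 in the
// kernel interleaves the same way. Pre-shuffling the width weights into this
// cross-lane order lets each packed row come out in linear pixel order with
// no final permute.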
    1450             : 
    1451             : // SMOOTH_PRED
    1452             : 
    1453             : // 8xN
    1454             : 
    1455           0 : static INLINE void load_right_weights_8(const uint16_t *const above,
    1456             :     __m256i *const r, __m256i *const weights)
    1457             : {
    1458           0 :     *r = _mm256_set1_epi16((uint16_t)above[7]);
    1459             : 
    1460             :     // 0 1 2 3  0 1 2 3
    1461           0 :     weights[0] = _mm256_load_si256((const __m256i *)(sm_weights_d_8 + 0x00));
    1462             :     // 4 5 6 7  4 5 6 7
    1463           0 :     weights[1] = _mm256_load_si256((const __m256i *)(sm_weights_d_8 + 0x10));
    1464           0 : }
    1465             : 
    1466           0 : static INLINE __m256i load_left_4(const uint16_t *const left, const __m256i r) {
    1467           0 :     const __m128i l0 = _mm_loadl_epi64((const __m128i *)left);
    1468             :     // 0 1 2 3 x x x x  0 1 2 3 x x x x
    1469             :     const __m256i l =
    1470           0 :         _mm256_inserti128_si256(_mm256_castsi128_si256(l0), l0, 1);
    1471           0 :     return _mm256_unpacklo_epi16(l, r); // 0 1 2 3  0 1 2 3
    1472             : }
    1473             : 
    1474           0 : static INLINE void load_left_8(const uint16_t *const left, const __m256i r,
    1475             :     __m256i *const lr)
    1476             : {
    1477           0 :     const __m128i l0 = _mm_load_si128((const __m128i *)left);
    1478             :     // 0 1 2 3 4 5 6 7  0 1 2 3 4 5 6 7
    1479             :     const __m256i l =
    1480           0 :         _mm256_inserti128_si256(_mm256_castsi128_si256(l0), l0, 1);
    1481           0 :     lr[0] = _mm256_unpacklo_epi16(l, r); // 0 1 2 3  0 1 2 3
    1482           0 :     lr[1] = _mm256_unpackhi_epi16(l, r); // 4 5 6 7  4 5 6 7
    1483           0 : }
    1484             : 
    1485           0 : static INLINE void init_8(const uint16_t *const above,
    1486             :     const uint16_t *const left, const int32_t h, __m256i *const ab,
    1487             :     __m256i *const r, __m256i *const weights_w, __m256i *const rep)
    1488             : {
    1489           0 :     const __m128i a0 = _mm_loadl_epi64(((const __m128i *)(above + 0)));
    1490           0 :     const __m128i a1 = _mm_loadl_epi64(((const __m128i *)(above + 4)));
    1491           0 :     const __m256i b = _mm256_set1_epi16((uint16_t)left[h - 1]);
    1492             :     __m256i a[2];
    1493           0 :     a[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(a0), a0, 1);
    1494           0 :     a[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(a1), a1, 1);
    1495           0 :     ab[0] = _mm256_unpacklo_epi16(a[0], b);
    1496           0 :     ab[1] = _mm256_unpacklo_epi16(a[1], b);
    1497           0 :     load_right_weights_8(above, r, weights_w);
    1498             : 
    1499           0 :     const __m128i rep0 = _mm_set1_epi32(0x03020100);
    1500           0 :     const __m128i rep1 = _mm_set1_epi32(0x07060504);
    1501           0 :     const __m128i rep2 = _mm_set1_epi32(0x0B0A0908);
    1502           0 :     const __m128i rep3 = _mm_set1_epi32(0x0F0E0D0C);
    1503           0 :     rep[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(rep0), rep1, 1);
    1504           0 :     rep[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(rep2), rep3, 1);
    1505           0 : }
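
// The rep[] values built above (and the plain set1_epi32 variants used by the
// wider inits) are _mm256_shuffle_epi8 controls: each replicates one 4-byte
// (value, 256 - value) pair across a 128-bit lane, so a single shuffle expands
// the packed per-row data into a full row's worth of madd operands, two rows
// per 256-bit register at width 8.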
    1506             : 
    1507           0 : static INLINE __m256i smooth_pred_kernel(const __m256i *const weights_w,
    1508             :     const __m256i weights_h, const __m256i rep, const __m256i *const ab,
    1509             :     const __m256i lr)
    1510             : {
    1511           0 :     const __m256i round = _mm256_set1_epi32((1 << sm_weight_log2_scale));
    1512             :     __m256i s[2], sum[2];
    1513             :     // 0 0 0 0  1 1 1 1
    1514           0 :     const __m256i w = _mm256_shuffle_epi8(weights_h, rep);
    1515           0 :     const __m256i t = _mm256_shuffle_epi8(lr, rep);
    1516           0 :     s[0] = _mm256_madd_epi16(ab[0], w);
    1517           0 :     s[1] = _mm256_madd_epi16(ab[1], w);
    1518             :     // width 8: 00 01 02 03  10 11 12 13
    1519             :     // width 16: 0 1 2 3  8 9 A B
    1520           0 :     sum[0] = _mm256_madd_epi16(t, weights_w[0]);
    1521             :     // width 8: 04 05 06 07  14 15 16 17
    1522             :     // width 16: 4 5 6 7  C D E F
    1523           0 :     sum[1] = _mm256_madd_epi16(t, weights_w[1]);
    1524           0 :     sum[0] = _mm256_add_epi32(sum[0], s[0]);
    1525           0 :     sum[1] = _mm256_add_epi32(sum[1], s[1]);
    1526           0 :     sum[0] = _mm256_add_epi32(sum[0], round);
    1527           0 :     sum[1] = _mm256_add_epi32(sum[1], round);
    1528           0 :     sum[0] = _mm256_srai_epi32(sum[0], 1 + sm_weight_log2_scale);
    1529           0 :     sum[1] = _mm256_srai_epi32(sum[1], 1 + sm_weight_log2_scale);
    1530             :     // width 8: 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
    1531             :     // width 16: 0 1 2 3 4 5 6 7  8 9 A B C D E F
    1532           0 :     return _mm256_packs_epi32(sum[0], sum[1]);
    1533             : }
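
// Illustrative scalar form of one SMOOTH_PRED pixel (a sketch assuming the
// AV1 definition with sm_weight_log2_scale == 8; w_h and w_w are the per-row
// and per-column base weights, not the duplicated tables above):
static INLINE uint16_t smooth_pixel_sketch(const uint16_t *const above,
    const uint16_t *const left, const uint8_t *const w_h,
    const uint8_t *const w_w, const int32_t r, const int32_t c,
    const int32_t w, const int32_t h)
{
    const uint32_t bottom = left[h - 1], right = above[w - 1];
    const uint32_t sum = (uint32_t)w_h[r] * above[c] + (256 - w_h[r]) * bottom
                       + (uint32_t)w_w[c] * left[r] + (256 - w_w[c]) * right;
    return (uint16_t)((sum + 256) >> 9); // round (1 << 8), then shift by 1 + 8
}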
    1534             : 
    1535           0 : static INLINE void smooth_pred_8x2(const __m256i *const weights_w,
    1536             :     const __m256i weights_h, const __m256i rep, const __m256i *const ab,
    1537             :     const __m256i lr, uint16_t **const dst, const ptrdiff_t stride)
    1538             : {
    1539             :     // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
    1540           0 :     const __m256i d = smooth_pred_kernel(weights_w, weights_h, rep, ab, lr);
    1541           0 :     _mm_store_si128((__m128i *)*dst, _mm256_extracti128_si256(d, 0));
    1542           0 :     *dst += stride;
    1543           0 :     _mm_store_si128((__m128i *)*dst, _mm256_extracti128_si256(d, 1));
    1544           0 :     *dst += stride;
    1545           0 : }
    1546             : 
    1547           0 : static INLINE void smooth_pred_8x4(const __m256i *const weights_w,
    1548             :     const uint16_t *const sm_weights_h, const __m256i *const rep,
    1549             :     const __m256i *const ab, const __m256i lr, uint16_t **const dst,
    1550             :     const ptrdiff_t stride)
    1551             : {
    1552           0 :     const __m256i weights_h = _mm256_load_si256((const __m256i *)sm_weights_h);
    1553           0 :     smooth_pred_8x2(weights_w, weights_h, rep[0], ab, lr, dst, stride);
    1554           0 :     smooth_pred_8x2(weights_w, weights_h, rep[1], ab, lr, dst, stride);
    1555           0 : }
    1556             : 
    1557           0 : static INLINE void smooth_pred_8x8(const uint16_t *const left,
    1558             :     const __m256i *const weights_w, const uint16_t *const sm_weights_h,
    1559             :     const __m256i *const rep, const __m256i *const ab, const __m256i r,
    1560             :     uint16_t **const dst, const ptrdiff_t stride)
    1561             : {
    1562             :     __m256i lr[2];
    1563           0 :     load_left_8(left, r, lr);
    1564             : 
    1565           0 :     smooth_pred_8x4(weights_w, sm_weights_h + 0, rep, ab, lr[0], dst, stride);
    1566           0 :     smooth_pred_8x4(weights_w, sm_weights_h + 16, rep, ab, lr[1], dst, stride);
    1567           0 : }
    1568             : 
    1569             : // 8x4
    1570             : 
    1571           0 : void eb_aom_highbd_smooth_predictor_8x4_avx2(uint16_t *dst, ptrdiff_t stride,
    1572             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    1573             : {
    1574             :     __m256i ab[2], r, lr, weights_w[2], rep[2];
    1575             :     (void)bd;
    1576             : 
    1577           0 :     init_8(above, left, 4, ab, &r, weights_w, rep);
    1578           0 :     lr = load_left_4(left, r);
    1579           0 :     smooth_pred_8x4(weights_w, sm_weights_d_4, rep, ab, lr, &dst, stride);
    1580           0 : }
    1581             : 
    1582             : // 8x8
    1583             : 
    1584           0 : void eb_aom_highbd_smooth_predictor_8x8_avx2(uint16_t *dst, ptrdiff_t stride,
    1585             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    1586             : {
    1587             :     __m256i ab[2], r, weights_w[2], rep[2];
    1588             :     (void)bd;
    1589             : 
    1590           0 :     init_8(above, left, 8, ab, &r, weights_w, rep);
    1591             : 
    1592           0 :     smooth_pred_8x8(left, weights_w, sm_weights_d_8, rep, ab, r, &dst, stride);
    1593           0 : }
    1594             : 
    1595             : // 8x16
    1596             : 
    1597           0 : void eb_aom_highbd_smooth_predictor_8x16_avx2(uint16_t *dst, ptrdiff_t stride,
    1598             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    1599             : {
    1600             :     __m256i ab[2], r, weights_w[2], rep[2];
    1601             :     (void)bd;
    1602             : 
    1603           0 :     init_8(above, left, 16, ab, &r, weights_w, rep);
    1604             : 
    1605           0 :     for (int32_t i = 0; i < 2; i++) {
    1606           0 :         smooth_pred_8x8(left + 8 * i, weights_w, sm_weights_d_16 + 32 * i, rep,
    1607             :             ab, r, &dst, stride);
    1608             :     }
    1609           0 : }
    1610             : 
    1611             : // 8x32
    1612             : 
    1613           0 : void eb_aom_highbd_smooth_predictor_8x32_avx2(uint16_t *dst, ptrdiff_t stride,
    1614             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    1615             : {
    1616             :     __m256i ab[2], r, weights_w[2], rep[2];
    1617             :     (void)bd;
    1618             : 
    1619           0 :     init_8(above, left, 32, ab, &r, weights_w, rep);
    1620             : 
    1621           0 :     for (int32_t i = 0; i < 4; i++) {
    1622           0 :         smooth_pred_8x8(left + 8 * i, weights_w, sm_weights_d_32 + 32 * i, rep,
    1623             :             ab, r, &dst, stride);
    1624             :     }
    1625           0 : }
    1626             : 
    1627             : // -----------------------------------------------------------------------------
    1628             : // 16xN
    1629             : 
    1630           0 : static INLINE void load_right_weights_16(const uint16_t *const above,
    1631             :     __m256i *const r, __m256i *const weights)
    1632             : {
    1633           0 :     *r = _mm256_set1_epi16((uint16_t)above[15]);
    1634             : 
    1635             :     //  0  1  2  3   8  9 10 11
    1636           0 :     weights[0] = _mm256_load_si256((const __m256i *)(sm_weights_16 + 0x00));
    1637             :     //  4  5  6  7  12 13 14 15
    1638           0 :     weights[1] = _mm256_load_si256((const __m256i *)(sm_weights_16 + 0x10));
    1639           0 : }
    1640             : 
    1641           0 : static INLINE void prepare_ab(const uint16_t *const above, const __m256i b,
    1642             :     __m256i *const ab)
    1643             : {
    1644           0 :     const __m256i a = _mm256_loadu_si256((const __m256i *)above);
    1645           0 :     ab[0] = _mm256_unpacklo_epi16(a, b);
    1646           0 :     ab[1] = _mm256_unpackhi_epi16(a, b);
    1647           0 : }
    1648             : 
    1649           0 : static INLINE void init_16(const uint16_t *const above,
    1650             :     const uint16_t *const left, const int32_t h, __m256i *const ab,
    1651             :     __m256i *const r, __m256i *const weights_w, __m256i *const rep)
    1652             : {
    1653           0 :     const __m256i b = _mm256_set1_epi16((uint16_t)left[h - 1]);
    1654           0 :     prepare_ab(above, b, ab);
    1655           0 :     load_right_weights_16(above, r, weights_w);
    1656             : 
    1657           0 :     rep[0] = _mm256_set1_epi32(0x03020100);
    1658           0 :     rep[1] = _mm256_set1_epi32(0x07060504);
    1659           0 :     rep[2] = _mm256_set1_epi32(0x0B0A0908);
    1660           0 :     rep[3] = _mm256_set1_epi32(0x0F0E0D0C);
    1661           0 : }
    1662             : 
    1663           0 : static INLINE void smooth_pred_16(const __m256i *const weights_w,
    1664             :     const __m256i weights_h, const __m256i rep, const __m256i *const ab,
    1665             :     const __m256i lr, uint16_t **const dst, const ptrdiff_t stride)
    1666             : {
    1667           0 :     const __m256i d = smooth_pred_kernel(weights_w, weights_h, rep, ab, lr);
    1668           0 :     _mm256_storeu_si256((__m256i *)*dst, d);
    1669           0 :     *dst += stride;
    1670           0 : }
    1671             : 
    1672           0 : static INLINE void smooth_pred_16x4(const __m256i *const weights_w,
    1673             :     const uint16_t *const sm_weights_h, const __m256i *const rep,
    1674             :     const __m256i *const ab, const __m256i lr, uint16_t **const dst,
    1675             :     const ptrdiff_t stride)
    1676             : {
    1677           0 :     const __m256i weights_h = _mm256_load_si256((const __m256i *)sm_weights_h);
    1678           0 :     smooth_pred_16(weights_w, weights_h, rep[0], ab, lr, dst, stride);
    1679           0 :     smooth_pred_16(weights_w, weights_h, rep[1], ab, lr, dst, stride);
    1680           0 :     smooth_pred_16(weights_w, weights_h, rep[2], ab, lr, dst, stride);
    1681           0 :     smooth_pred_16(weights_w, weights_h, rep[3], ab, lr, dst, stride);
    1682           0 : }
    1683             : 
    1684           0 : static INLINE void smooth_pred_16x8(const uint16_t *const left,
    1685             :     const __m256i *const weights_w, const uint16_t *const sm_weights_h,
    1686             :     const __m256i *const rep, const __m256i *const ab, const __m256i r,
    1687             :     uint16_t **const dst, const ptrdiff_t stride)
    1688             : {
    1689             :     __m256i lr[2];
    1690           0 :     load_left_8(left, r, lr);
    1691             : 
    1692           0 :     smooth_pred_16x4(weights_w, sm_weights_h + 0, rep, ab, lr[0], dst, stride);
    1693           0 :     smooth_pred_16x4(weights_w, sm_weights_h + 16, rep, ab, lr[1], dst, stride);
    1694           0 : }
    1695             : 
    1696             : // 16x4
    1697             : 
    1698           0 : void eb_aom_highbd_smooth_predictor_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
    1699             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    1700             : {
    1701             :     __m256i ab[2], r, lr, weights_w[2], rep[4];
    1702             :     (void)bd;
    1703             : 
    1704           0 :     init_16(above, left, 4, ab, &r, weights_w, rep);
    1705           0 :     lr = load_left_4(left, r);
    1706           0 :     smooth_pred_16x4(weights_w, sm_weights_d_4, rep, ab, lr, &dst, stride);
    1707           0 : }
    1708             : 
    1709             : // 16x8
    1710             : 
    1711           0 : void eb_aom_highbd_smooth_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
    1712             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    1713             : {
    1714             :     __m256i ab[2], r, weights_w[2], rep[4];
    1715             :     (void)bd;
    1716             : 
    1717           0 :     init_16(above, left, 8, ab, &r, weights_w, rep);
    1718             : 
    1719           0 :     smooth_pred_16x8(left, weights_w, sm_weights_d_8, rep, ab, r, &dst, stride);
    1720           0 : }
    1721             : 
    1722             : // 16x16
    1723             : 
    1724           0 : void eb_aom_highbd_smooth_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
    1725             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    1726             : {
    1727             :     __m256i ab[2], r, weights_w[2], rep[4];
    1728             :     (void)bd;
    1729             : 
    1730           0 :     init_16(above, left, 16, ab, &r, weights_w, rep);
    1731             : 
    1732           0 :     for (int32_t i = 0; i < 2; i++) {
    1733           0 :         smooth_pred_16x8(left + 8 * i, weights_w, sm_weights_d_16 + 32 * i, rep,
    1734             :             ab, r, &dst, stride);
    1735             :     }
    1736           0 : }
    1737             : 
    1738             : // 16x32
    1739             : 
    1740           0 : void eb_aom_highbd_smooth_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
    1741             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    1742             : {
    1743             :     __m256i ab[2], r, weights_w[2], rep[4];
    1744             :     (void)bd;
    1745             : 
    1746           0 :     init_16(above, left, 32, ab, &r, weights_w, rep);
    1747             : 
    1748           0 :     for (int32_t i = 0; i < 4; i++) {
    1749           0 :         smooth_pred_16x8(left + 8 * i, weights_w, sm_weights_d_32 + 32 * i, rep,
    1750             :             ab, r, &dst, stride);
    1751             :     }
    1752           0 : }
    1753             : 
    1754             : // 16x64
    1755             : 
    1756           0 : void eb_aom_highbd_smooth_predictor_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
    1757             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    1758             : {
    1759             :     __m256i ab[2], r, weights_w[2], rep[4];
    1760             :     (void)bd;
    1761             : 
    1762           0 :     init_16(above, left, 64, ab, &r, weights_w, rep);
    1763             : 
    1764           0 :     for (int32_t i = 0; i < 8; i++) {
    1765           0 :         smooth_pred_16x8(left + 8 * i, weights_w, sm_weights_d_64 + 32 * i, rep,
    1766             :             ab, r, &dst, stride);
    1767             :     }
    1768           0 : }
    1769             : 
    1770             : // -----------------------------------------------------------------------------
    1771             : // 32xN
    1772             : 
    1773           0 : static INLINE void load_right_weights_32(const uint16_t *const above,
    1774             :     __m256i *const r, __m256i *const weights)
    1775             : {
    1776           0 :     *r = _mm256_set1_epi16((uint16_t)above[31]);
    1777             : 
    1778             :     //  0  1  2  3   8  9 10 11
    1779           0 :     weights[0] = _mm256_load_si256((const __m256i *)(sm_weights_32 + 0x00));
    1780             :     //  4  5  6  7  12 13 14 15
    1781           0 :     weights[1] = _mm256_load_si256((const __m256i *)(sm_weights_32 + 0x10));
    1782             :     // 16 17 18 19  24 25 26 27
    1783           0 :     weights[2] = _mm256_load_si256((const __m256i *)(sm_weights_32 + 0x20));
    1784             :     // 20 21 22 23  28 29 30 31
    1785           0 :     weights[3] = _mm256_load_si256((const __m256i *)(sm_weights_32 + 0x30));
    1786           0 : }
    1787             : 
    1788           0 : static INLINE void init_32(const uint16_t *const above,
    1789             :     const uint16_t *const left, const int32_t h, __m256i *const ab,
    1790             :     __m256i *const r, __m256i *const weights_w, __m256i *const rep)
    1791             : {
    1792           0 :     const __m256i b = _mm256_set1_epi16((uint16_t)left[h - 1]);
    1793           0 :     prepare_ab(above + 0x00, b, ab + 0);
    1794           0 :     prepare_ab(above + 0x10, b, ab + 2);
    1795           0 :     load_right_weights_32(above, r, weights_w);
    1796             : 
    1797           0 :     rep[0] = _mm256_set1_epi32(0x03020100);
    1798           0 :     rep[1] = _mm256_set1_epi32(0x07060504);
    1799           0 :     rep[2] = _mm256_set1_epi32(0x0B0A0908);
    1800           0 :     rep[3] = _mm256_set1_epi32(0x0F0E0D0C);
    1801           0 : }
    1802             : 
    1803           0 : static INLINE void smooth_pred_32(const __m256i *const weights_w,
    1804             :     const __m256i weights_h, const __m256i rep, const __m256i *const ab,
    1805             :     const __m256i lr, uint16_t **const dst, const ptrdiff_t stride)
    1806             : {
    1807             :     __m256i d;
    1808             : 
    1809             :     //  0  1  2  3  4  5  6  7   8  9 10 11 12 13 14 15
    1810           0 :     d = smooth_pred_kernel(weights_w + 0, weights_h, rep, ab + 0, lr);
    1811           0 :     _mm256_storeu_si256((__m256i *)(*dst + 0x00), d);
    1812             : 
    1813             :     // 16 17 18 19 20 21 22 23  24 25 26 27 28 29 30 31
    1814           0 :     d = smooth_pred_kernel(weights_w + 2, weights_h, rep, ab + 2, lr);
    1815           0 :     _mm256_storeu_si256((__m256i *)(*dst + 0x10), d);
    1816           0 :     *dst += stride;
    1817           0 : }
    1818             : 
    1819           0 : static INLINE void smooth_pred_32x4(const __m256i *const weights_w,
    1820             :     const uint16_t *const sm_weights_h, const __m256i *const rep,
    1821             :     const __m256i *const ab, const __m256i lr, uint16_t **const dst,
    1822             :     const ptrdiff_t stride)
    1823             : {
    1824           0 :     const __m256i weights_h = _mm256_load_si256((const __m256i *)sm_weights_h);
    1825           0 :     smooth_pred_32(weights_w, weights_h, rep[0], ab, lr, dst, stride);
    1826           0 :     smooth_pred_32(weights_w, weights_h, rep[1], ab, lr, dst, stride);
    1827           0 :     smooth_pred_32(weights_w, weights_h, rep[2], ab, lr, dst, stride);
    1828           0 :     smooth_pred_32(weights_w, weights_h, rep[3], ab, lr, dst, stride);
    1829           0 : }
    1830             : 
    1831           0 : static INLINE void smooth_pred_32x8(const uint16_t *const left,
    1832             :     const __m256i *const weights_w, const uint16_t *const sm_weights_h,
    1833             :     const __m256i *const rep, const __m256i *const ab, const __m256i r,
    1834             :     uint16_t **const dst, const ptrdiff_t stride)
    1835             : {
    1836             :     __m256i lr[2];
    1837           0 :     load_left_8(left, r, lr);
    1838             : 
    1839           0 :     smooth_pred_32x4(weights_w, sm_weights_h + 0, rep, ab, lr[0], dst, stride);
    1840           0 :     smooth_pred_32x4(weights_w, sm_weights_h + 16, rep, ab, lr[1], dst, stride);
    1841           0 : }
    1842             : 
    1843             : // 32x8
    1844             : 
    1845           0 : void eb_aom_highbd_smooth_predictor_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
    1846             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    1847             : {
    1848             :     __m256i ab[4], r, weights_w[4], rep[4];
    1849             :     (void)bd;
    1850             : 
    1851           0 :     init_32(above, left, 8, ab, &r, weights_w, rep);
    1852             : 
    1853           0 :     smooth_pred_32x8(left, weights_w, sm_weights_d_8, rep, ab, r, &dst, stride);
    1854           0 : }
    1855             : 
    1856             : // 32x16
    1857             : 
    1858           0 : void eb_aom_highbd_smooth_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
    1859             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    1860             : {
    1861             :     __m256i ab[4], r, weights_w[4], rep[4];
    1862             :     (void)bd;
    1863             : 
    1864           0 :     init_32(above, left, 16, ab, &r, weights_w, rep);
    1865             : 
    1866           0 :     for (int32_t i = 0; i < 2; i++) {
    1867           0 :         smooth_pred_32x8(left + 8 * i, weights_w, sm_weights_d_16 + 32 * i, rep,
    1868             :             ab, r, &dst, stride);
    1869             :     }
    1870           0 : }
    1871             : 
    1872             : // 32x32
    1873             : 
    1874           0 : void eb_aom_highbd_smooth_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
    1875             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    1876             : {
    1877             :     __m256i ab[4], r, weights_w[4], rep[4];
    1878             :     (void)bd;
    1879             : 
    1880           0 :     init_32(above, left, 32, ab, &r, weights_w, rep);
    1881             : 
    1882           0 :     for (int32_t i = 0; i < 4; i++) {
    1883           0 :         smooth_pred_32x8(left + 8 * i, weights_w, sm_weights_d_32 + 32 * i, rep,
    1884             :             ab, r, &dst, stride);
    1885             :     }
    1886           0 : }
    1887             : 
    1888             : // 32x64
    1889             : 
    1890           0 : void eb_aom_highbd_smooth_predictor_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
    1891             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    1892             : {
    1893             :     __m256i ab[4], r, weights_w[4], rep[4];
    1894             :     (void)bd;
    1895             : 
    1896           0 :     init_32(above, left, 64, ab, &r, weights_w, rep);
    1897             : 
    1898           0 :     for (int32_t i = 0; i < 8; i++) {
    1899           0 :         smooth_pred_32x8(left + 8 * i, weights_w, sm_weights_d_64 + 32 * i, rep,
    1900             :             ab, r, &dst, stride);
    1901             :     }
    1902           0 : }
    1903             : 
    1904             : // -----------------------------------------------------------------------------
    1905             : // 64xN
    1906             : 
    1907           0 : static INLINE void load_right_weights_64(const uint16_t *const above,
    1908             :     __m256i *const r, __m256i *const weights)
    1909             : {
    1910           0 :     *r = _mm256_set1_epi16((uint16_t)above[63]);
    1911             : 
    1912             :     //  0  1  2  3   8  9 10 11
    1913           0 :     weights[0] = _mm256_load_si256((const __m256i *)(sm_weights_64 + 0x00));
    1914             :     //  4  5  6  7  12 13 14 15
    1915           0 :     weights[1] = _mm256_load_si256((const __m256i *)(sm_weights_64 + 0x10));
    1916             :     // 16 17 18 19  24 25 26 27
    1917           0 :     weights[2] = _mm256_load_si256((const __m256i *)(sm_weights_64 + 0x20));
    1918             :     // 20 21 22 23  28 29 30 31
    1919           0 :     weights[3] = _mm256_load_si256((const __m256i *)(sm_weights_64 + 0x30));
    1920             :     // 32 33 34 35  40 41 42 43
    1921           0 :     weights[4] = _mm256_load_si256((const __m256i *)(sm_weights_64 + 0x40));
    1922             :     // 36 37 38 39  44 45 46 47
    1923           0 :     weights[5] = _mm256_load_si256((const __m256i *)(sm_weights_64 + 0x50));
    1924             :     // 48 49 50 51  56 57 58 59
    1925           0 :     weights[6] = _mm256_load_si256((const __m256i *)(sm_weights_64 + 0x60));
    1926             :     // 52 53 54 55  60 61 62 63
    1927           0 :     weights[7] = _mm256_load_si256((const __m256i *)(sm_weights_64 + 0x70));
    1928           0 : }
    1929             : 
    1930           0 : static INLINE void init_64(const uint16_t *const above,
    1931             :     const uint16_t *const left, const int32_t h, __m256i *const ab,
    1932             :     __m256i *const r, __m256i *const weights_w, __m256i *const rep)
    1933             : {
    1934           0 :     const __m256i b = _mm256_set1_epi16((uint16_t)left[h - 1]);
    1935           0 :     prepare_ab(above + 0x00, b, ab + 0);
    1936           0 :     prepare_ab(above + 0x10, b, ab + 2);
    1937           0 :     prepare_ab(above + 0x20, b, ab + 4);
    1938           0 :     prepare_ab(above + 0x30, b, ab + 6);
    1939           0 :     load_right_weights_64(above, r, weights_w);
    1940             : 
    1941           0 :     rep[0] = _mm256_set1_epi32(0x03020100);
    1942           0 :     rep[1] = _mm256_set1_epi32(0x07060504);
    1943           0 :     rep[2] = _mm256_set1_epi32(0x0B0A0908);
    1944           0 :     rep[3] = _mm256_set1_epi32(0x0F0E0D0C);
    1945           0 : }
    1946             : 
    1947           0 : static INLINE void smooth_pred_64(const __m256i *const weights_w,
    1948             :     const __m256i weights_h, const __m256i rep, const __m256i *const ab,
    1949             :     const __m256i lr, uint16_t **const dst, const ptrdiff_t stride)
    1950             : {
    1951             :     __m256i d;
    1952             : 
    1953             :     //  0  1  2  3  4  5  6  7   8  9 10 11 12 13 14 15
    1954           0 :     d = smooth_pred_kernel(weights_w + 0, weights_h, rep, ab + 0, lr);
    1955           0 :     _mm256_storeu_si256((__m256i *)(*dst + 0x00), d);
    1956             : 
    1957             :     // 16 17 18 19 20 21 22 23  24 25 26 27 28 29 30 31
    1958           0 :     d = smooth_pred_kernel(weights_w + 2, weights_h, rep, ab + 2, lr);
    1959           0 :     _mm256_storeu_si256((__m256i *)(*dst + 0x10), d);
    1960             : 
    1961             :     // 32 33 34 35 36 37 38 39  40 41 42 43 44 45 46 47
    1962           0 :     d = smooth_pred_kernel(weights_w + 4, weights_h, rep, ab + 4, lr);
    1963           0 :     _mm256_storeu_si256((__m256i *)(*dst + 0x20), d);
    1964             : 
    1965             :     // 48 49 50 51 52 53 54 55  56 57 58 59 60 61 62 63
    1966           0 :     d = smooth_pred_kernel(weights_w + 6, weights_h, rep, ab + 6, lr);
    1967           0 :     _mm256_storeu_si256((__m256i *)(*dst + 0x30), d);
    1968           0 :     *dst += stride;
    1969           0 : }
    1970             : 
    1971           0 : static INLINE void smooth_pred_64x4(const __m256i *const weights_w,
    1972             :     const uint16_t *const sm_weights_h, const __m256i *const rep,
    1973             :     const __m256i *const ab, const __m256i lr, uint16_t **const dst,
    1974             :     const ptrdiff_t stride)
    1975             : {
    1976           0 :     const __m256i weights_h = _mm256_load_si256((const __m256i *)sm_weights_h);
    1977           0 :     smooth_pred_64(weights_w, weights_h, rep[0], ab, lr, dst, stride);
    1978           0 :     smooth_pred_64(weights_w, weights_h, rep[1], ab, lr, dst, stride);
    1979           0 :     smooth_pred_64(weights_w, weights_h, rep[2], ab, lr, dst, stride);
    1980           0 :     smooth_pred_64(weights_w, weights_h, rep[3], ab, lr, dst, stride);
    1981           0 : }
    1982             : 
    1983           0 : static INLINE void smooth_pred_64x8(const uint16_t *const left,
    1984             :     const __m256i *const weights_w, const uint16_t *const sm_weights_h,
    1985             :     const __m256i *const rep, const __m256i *const ab, const __m256i r,
    1986             :     uint16_t **const dst, const ptrdiff_t stride)
    1987             : {
    1988             :     __m256i lr[2];
    1989           0 :     load_left_8(left, r, lr);
    1990             : 
    1991           0 :     smooth_pred_64x4(weights_w, sm_weights_h + 0, rep, ab, lr[0], dst, stride);
    1992           0 :     smooth_pred_64x4(weights_w, sm_weights_h + 16, rep, ab, lr[1], dst, stride);
    1993           0 : }
    1994             : 
    1995             : // 64x16
    1996             : 
    1997           0 : void eb_aom_highbd_smooth_predictor_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
    1998             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    1999             : {
    2000             :     __m256i ab[8], r, weights_w[8], rep[4];
    2001             :     (void)bd;
    2002             : 
    2003           0 :     init_64(above, left, 16, ab, &r, weights_w, rep);
    2004             : 
    2005           0 :     for (int32_t i = 0; i < 2; i++) {
    2006           0 :         smooth_pred_64x8(left + 8 * i, weights_w, sm_weights_d_16 + 32 * i, rep,
    2007             :             ab, r, &dst, stride);
    2008             :     }
    2009           0 : }
    2010             : 
    2011             : // 64x32
    2012             : 
    2013           0 : void eb_aom_highbd_smooth_predictor_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
    2014             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    2015             : {
    2016             :     __m256i ab[8], r, weights_w[8], rep[4];
    2017             :     (void)bd;
    2018             : 
    2019           0 :     init_64(above, left, 32, ab, &r, weights_w, rep);
    2020             : 
    2021           0 :     for (int32_t i = 0; i < 4; i++) {
    2022           0 :         smooth_pred_64x8(left + 8 * i, weights_w, sm_weights_d_32 + 32 * i, rep,
    2023             :             ab, r, &dst, stride);
    2024             :     }
    2025           0 : }
    2026             : 
    2027             : // 64x64
    2028             : 
    2029           0 : void eb_aom_highbd_smooth_predictor_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
    2030             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    2031             : {
    2032             :     __m256i ab[8], r, weights_w[8], rep[4];
    2033             :     (void)bd;
    2034             : 
    2035           0 :     init_64(above, left, 64, ab, &r, weights_w, rep);
    2036             : 
    2037           0 :     for (int32_t i = 0; i < 8; i++) {
    2038           0 :         smooth_pred_64x8(left + 8 * i, weights_w, sm_weights_d_64 + 32 * i, rep,
    2039             :             ab, r, &dst, stride);
    2040             :     }
    2041           0 : }
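
The 64xN functions above, like the smaller SMOOTH sizes, vectorize the
two-directional AV1 smooth blend. A scalar sketch of what each output
pixel works out to, assuming sm_weight_log2_scale = 8 and the standard
256-scale AV1 smooth weight arrays; the names sm_weights_w/sm_weights_h
and the helper itself are illustrative, not from this file (the AVX2
code consumes pre-interleaved sm_weights_d_* tables instead):

    #include <stdint.h>
    #include <stddef.h>

    /* Scalar sketch of the SMOOTH blend: mix toward the top-right and
     * bottom-left estimates in both directions, then round and halve. */
    static void smooth_scalar(uint16_t *dst, ptrdiff_t stride,
        const uint16_t *above, const uint16_t *left,
        const uint8_t *sm_weights_w, const uint8_t *sm_weights_h,
        int bw, int bh)
    {
        const uint32_t bl = left[bh - 1];   /* bottom-left estimate */
        const uint32_t tr = above[bw - 1];  /* top-right estimate */
        for (int r = 0; r < bh; r++) {
            for (int c = 0; c < bw; c++) {
                const uint32_t v = sm_weights_h[r] * above[c]
                                 + (256 - sm_weights_h[r]) * bl;  /* vertical */
                const uint32_t h = sm_weights_w[c] * left[r]
                                 + (256 - sm_weights_w[c]) * tr;  /* horizontal */
                dst[c] = (uint16_t)((v + h + 256) >> 9);  /* round, scale 512 */
            }
            dst += stride;
        }
    }
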
    2042             : 
    2043             : // =============================================================================
    2044             : 
    2045             : // SMOOTH_H_PRED
    2046             : 
    2047             : // 8xN
    2048             : 
    2049           0 : static INLINE __m256i smooth_h_pred_kernel(const __m256i *const weights,
    2050             :     const __m256i lr)
    2051             : {
    2052           0 :     const __m256i round = _mm256_set1_epi32((1 << (sm_weight_log2_scale - 1)));
    2053             :     __m256i sum[2];
    2054             :     // width 8: 00 01 02 03  10 11 12 13
    2055             :     // width 16: 0 1 2 3  8 9 A B
    2056           0 :     sum[0] = _mm256_madd_epi16(lr, weights[0]);
    2057             :     // width 8: 04 05 06 07  14 15 16 17
    2058             :     // width 16: 4 5 6 7  C D E F
    2059           0 :     sum[1] = _mm256_madd_epi16(lr, weights[1]);
    2060           0 :     sum[0] = _mm256_add_epi32(sum[0], round);
    2061           0 :     sum[1] = _mm256_add_epi32(sum[1], round);
    2062           0 :     sum[0] = _mm256_srai_epi32(sum[0], sm_weight_log2_scale);
    2063           0 :     sum[1] = _mm256_srai_epi32(sum[1], sm_weight_log2_scale);
    2064             :     // width 8: 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
    2065             :     // width 16: 0 1 2 3 4 5 6 7  8 9 A B C D E F
    2066           0 :     return _mm256_packs_epi32(sum[0], sum[1]);
    2067             : }
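
Per pixel, SMOOTH_H blends the row's left sample toward the top-right
sample above[bw - 1]. lr carries interleaved (left[r], top_right) pairs
and weights the interleaved (w, 256 - w) pairs, so each
_mm256_madd_epi16 yields w*left + (256 - w)*tr for four columns at once,
already as a 32-bit sum. The scalar equivalent of one output pixel, with
sm_weight_log2_scale = 8 (the helper name is illustrative):

    #include <stdint.h>

    /* One SMOOTH_H pixel: weight is the 256-scale smooth weight for this
     * column, tr is above[bw - 1]. Matches round = 128, shift = 8 above. */
    static uint16_t smooth_h_pixel(uint16_t left_r, uint16_t tr, uint8_t weight)
    {
        return (uint16_t)(((uint32_t)weight * left_r
                         + (256u - weight) * tr + 128) >> 8);
    }
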
    2068             : 
    2069           0 : static INLINE void smooth_h_pred_8x2(const __m256i *const weights,
    2070             :     __m256i *const lr, uint16_t **const dst, const ptrdiff_t stride)
    2071             : {
    2072           0 :     const __m256i rep = _mm256_set1_epi32(0x03020100);
    2073             :     // lr: 0 1 2 3  1 2 3 4
    2074           0 :     const __m256i t = _mm256_shuffle_epi8(*lr, rep); // 0 0 0 0  1 1 1 1
    2075             :     // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
    2076           0 :     const __m256i d = smooth_h_pred_kernel(weights, t);
    2077           0 :     _mm_store_si128((__m128i *)*dst, _mm256_extracti128_si256(d, 0));
    2078           0 :     *dst += stride;
    2079           0 :     _mm_store_si128((__m128i *)*dst, _mm256_extracti128_si256(d, 1));
    2080           0 :     *dst += stride;
    2081           0 :     *lr = _mm256_srli_si256(*lr, 8); // 2 3 x x  3 4 x x
    2082           0 : }
    2083             : 
    2084           0 : static INLINE void smooth_h_pred_8x4(const __m256i *const weights,
    2085             :     __m256i *const lr, uint16_t **const dst, const ptrdiff_t stride)
    2086             : {
    2087           0 :     smooth_h_pred_8x2(weights, lr, dst, stride);
    2088           0 :     smooth_h_pred_8x2(weights, lr, dst, stride);
    2089           0 : }
    2090             : 
    2091           0 : static INLINE void smooth_h_pred_8x8(const uint16_t *const left,
    2092             :     const __m256i r, const __m256i *const weights, uint16_t **const dst,
    2093             :     const ptrdiff_t stride)
    2094             : {
    2095           0 :     const __m128i l0 = _mm_load_si128((const __m128i *)left);
    2096           0 :     const __m128i l1 = _mm_srli_si128(l0, 2);
    2097             :     // 0 1 2 3 4 5 6 7  1 2 3 4 5 6 7 x
    2098             :     const __m256i l =
    2099           0 :         _mm256_inserti128_si256(_mm256_castsi128_si256(l0), l1, 1);
    2100             :     __m256i lr[2];
    2101           0 :     lr[0] = _mm256_unpacklo_epi16(l, r); // 0 1 2 3  1 2 3 4
    2102           0 :     lr[1] = _mm256_unpackhi_epi16(l, r); // 4 5 6 7  5 6 7 x
    2103           0 :     smooth_h_pred_8x4(weights, &lr[0], dst, stride);
    2104           0 :     smooth_h_pred_8x4(weights, &lr[1], dst, stride);
    2105           0 : }
    2106             : 
    2107             : // 8x4
    2108             : 
    2109           0 : void eb_aom_highbd_smooth_h_predictor_8x4_avx2(uint16_t *dst, ptrdiff_t stride,
    2110             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    2111             : {
    2112           0 :     const __m128i l0 = _mm_loadl_epi64((const __m128i *)left);
    2113           0 :     const __m128i l1 = _mm_srli_si128(l0, 2);
    2114             :     // 0 1 2 3 x x x x  1 2 3 x x x x x
    2115             :     const __m256i l =
    2116           0 :         _mm256_inserti128_si256(_mm256_castsi128_si256(l0), l1, 1);
    2117             :     __m256i r, weights[2];
    2118             :     (void)bd;
    2119             : 
    2120           0 :     load_right_weights_8(above, &r, weights);
    2121           0 :     __m256i lr = _mm256_unpacklo_epi16(l, r); // 0 1 2 3  1 2 3 4
    2122           0 :     smooth_h_pred_8x4(weights, &lr, &dst, stride);
    2123           0 : }
    2124             : 
    2125             : // 8x8
    2126             : 
    2127           0 : void eb_aom_highbd_smooth_h_predictor_8x8_avx2(uint16_t *dst, ptrdiff_t stride,
    2128             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    2129             : {
    2130             :     __m256i r, weights[2];
    2131             :     (void)bd;
    2132             : 
    2133           0 :     load_right_weights_8(above, &r, weights);
    2134           0 :     smooth_h_pred_8x8(left, r, weights, &dst, stride);
    2135           0 : }
    2136             : 
    2137             : // 8x16
    2138             : 
    2139           0 : void eb_aom_highbd_smooth_h_predictor_8x16_avx2(uint16_t *dst, ptrdiff_t stride,
    2140             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    2141             : {
    2142             :     __m256i r, weights[2];
    2143             :     (void)bd;
    2144             : 
    2145           0 :     load_right_weights_8(above, &r, weights);
    2146           0 :     smooth_h_pred_8x8(left + 0, r, weights, &dst, stride);
    2147           0 :     smooth_h_pred_8x8(left + 8, r, weights, &dst, stride);
    2148           0 : }
    2149             : 
    2150             : // 8x32
    2151             : 
    2152           0 : void eb_aom_highbd_smooth_h_predictor_8x32_avx2(uint16_t *dst, ptrdiff_t stride,
    2153             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    2154             : {
    2155             :     __m256i r, weights[2];
    2156             :     (void)bd;
    2157             : 
    2158           0 :     load_right_weights_8(above, &r, weights);
    2159             : 
    2160           0 :     for (int32_t i = 0; i < 2; i++) {
    2161           0 :         smooth_h_pred_8x8(left + 0, r, weights, &dst, stride);
    2162           0 :         smooth_h_pred_8x8(left + 8, r, weights, &dst, stride);
    2163           0 :         left += 16;
    2164             :     }
    2165           0 : }
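
The width-8 paths above write rows with _mm_store_si128, an aligned
store, so the destination must keep every row 16-byte aligned. A usage
sketch, with arbitrary sample values; the prototype matches the one in
this file:

    #include <stdint.h>
    #include <stddef.h>
    #include <stdlib.h>

    void eb_aom_highbd_smooth_h_predictor_8x32_avx2(uint16_t *dst,
        ptrdiff_t stride, const uint16_t *above, const uint16_t *left,
        int32_t bd);

    int main(void) {
        /* stride 8 * sizeof(uint16_t) = 16 bytes keeps each row aligned
         * for the _mm_store_si128 stores. */
        uint16_t *dst = (uint16_t *)aligned_alloc(16, 8 * 32 * sizeof(uint16_t));
        uint16_t above[8], left[32];
        for (int i = 0; i < 8; i++)  above[i] = 512;  /* arbitrary 10-bit */
        for (int i = 0; i < 32; i++) left[i]  = 700;
        eb_aom_highbd_smooth_h_predictor_8x32_avx2(dst, 8, above, left, 10);
        free(dst);
        return 0;
    }
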
    2166             : 
    2167             : // -----------------------------------------------------------------------------
    2168             : // 16xN
    2169             : 
    2170           0 : static INLINE void smooth_h_pred_16(const __m256i *const weights,
    2171             :     __m256i *const lr, uint16_t **const dst, const ptrdiff_t stride)
    2172             : {
    2173           0 :     const __m256i rep = _mm256_set1_epi32(0x03020100);
    2174             :     // lr: 0 1 2 3  0 1 2 3
    2175           0 :     const __m256i t = _mm256_shuffle_epi8(*lr, rep); // 0 0 0 0  0 0 0 0
    2176             :     //  0  1  2  3  4  5  6  7   8  9 10 11 12 13 14 15
    2177           0 :     const __m256i d = smooth_h_pred_kernel(weights, t);
    2178           0 :     _mm256_storeu_si256((__m256i *)*dst, d);
    2179           0 :     *dst += stride;
    2180           0 :     *lr = _mm256_srli_si256(*lr, 4); // 1 2 3 x  1 2 3 x
    2181           0 : }
    2182             : 
    2183           0 : static INLINE void smooth_h_pred_16x4(const __m256i *const weights,
    2184             :     __m256i *const lr, uint16_t **const dst, const ptrdiff_t stride)
    2185             : {
    2186           0 :     smooth_h_pred_16(weights, lr, dst, stride);
    2187           0 :     smooth_h_pred_16(weights, lr, dst, stride);
    2188           0 :     smooth_h_pred_16(weights, lr, dst, stride);
    2189           0 :     smooth_h_pred_16(weights, lr, dst, stride);
    2190           0 : }
    2191             : 
    2192           0 : static INLINE void smooth_h_pred_16x8(const uint16_t *const left,
    2193             :     const __m256i r, const __m256i *const weights, uint16_t **const dst,
    2194             :     const ptrdiff_t stride)
    2195             : {
    2196             :     __m256i lr[2];
    2197           0 :     load_left_8(left, r, lr);
    2198           0 :     smooth_h_pred_16x4(weights, &lr[0], dst, stride);
    2199           0 :     smooth_h_pred_16x4(weights, &lr[1], dst, stride);
    2200           0 : }
    2201             : 
    2202           0 : static INLINE void smooth_h_predictor_16x16(uint16_t *dst,
    2203             :     const ptrdiff_t stride, const uint16_t *const above, const uint16_t *left,
    2204             :     const int32_t n)
    2205             : {
    2206             :     __m256i r, weights[2];
    2207             : 
    2208           0 :     load_right_weights_16(above, &r, weights);
    2209             : 
    2210           0 :     for (int32_t i = 0; i < n; i++) {
    2211           0 :         smooth_h_pred_16x8(left + 0, r, weights, &dst, stride);
    2212           0 :         smooth_h_pred_16x8(left + 8, r, weights, &dst, stride);
    2213           0 :         left += 16;
    2214             :     }
    2215           0 : }
    2216             : 
    2217             : // 16x4
    2218             : 
    2219           0 : void eb_aom_highbd_smooth_h_predictor_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
    2220             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    2221             : {
    2222             :     __m256i r, lr, weights[2];
    2223             :     (void)bd;
    2224             : 
    2225           0 :     load_right_weights_16(above, &r, weights);
    2226           0 :     lr = load_left_4(left, r);
    2227           0 :     smooth_h_pred_16x4(weights, &lr, &dst, stride);
    2228           0 : }
    2229             : 
    2230             : // 16x8
    2231             : 
    2232           0 : void eb_aom_highbd_smooth_h_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
    2233             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    2234             : {
    2235             :     __m256i r, weights[2];
    2236             :     (void)bd;
    2237             : 
    2238           0 :     load_right_weights_16(above, &r, weights);
    2239           0 :     smooth_h_pred_16x8(left, r, weights, &dst, stride);
    2240           0 : }
    2241             : 
    2242             : // 16x16
    2243             : 
    2244           0 : void eb_aom_highbd_smooth_h_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
    2245             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    2246             : {
    2247             :     (void)bd;
    2248           0 :     smooth_h_predictor_16x16(dst, stride, above, left, 1);
    2249           0 : }
    2250             : 
    2251             : // 16x32
    2252             : 
    2253           0 : void eb_aom_highbd_smooth_h_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
    2254             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    2255             : {
    2256             :     (void)bd;
    2257           0 :     smooth_h_predictor_16x16(dst, stride, above, left, 2);
    2258           0 : }
    2259             : 
    2260             : // 16x64
    2261             : 
    2262           0 : void eb_aom_highbd_smooth_h_predictor_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
    2263             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    2264             : {
    2265             :     (void)bd;
    2266           0 :     smooth_h_predictor_16x16(dst, stride, above, left, 4);
    2267           0 : }
    2268             : 
    2269             : // -----------------------------------------------------------------------------
    2270             : // 32xN
    2271             : 
    2272           0 : static INLINE void smooth_h_pred_32(const __m256i *const weights,
    2273             :     __m256i *const lr, uint16_t **const dst, const ptrdiff_t stride)
    2274             : {
    2275           0 :     const __m256i rep = _mm256_set1_epi32(0x03020100);
    2276             :     // lr: 0 1 2 3  0 1 2 3
    2277           0 :     const __m256i t = _mm256_shuffle_epi8(*lr, rep); // 0 0 0 0  0 0 0 0
    2278             :     __m256i d;
    2279             : 
    2280             :     //  0  1  2  3  4  5  6  7   8  9 10 11 12 13 14 15
    2281           0 :     d = smooth_h_pred_kernel(weights + 0, t);
    2282           0 :     _mm256_storeu_si256((__m256i *)(*dst + 0x00), d);
    2283             : 
    2284             :     // 16 17 18 19 20 21 22 23  24 25 26 27 28 29 30 31
    2285           0 :     d = smooth_h_pred_kernel(weights + 2, t);
    2286           0 :     _mm256_storeu_si256((__m256i *)(*dst + 0x10), d);
    2287           0 :     *dst += stride;
    2288           0 :     *lr = _mm256_srli_si256(*lr, 4); // 1 2 3 x  1 2 3 x
    2289           0 : }
    2290             : 
    2291           0 : static INLINE void smooth_h_pred_32x4(const __m256i *const weights,
    2292             :     __m256i *const lr, uint16_t **const dst, const ptrdiff_t stride)
    2293             : {
    2294           0 :     smooth_h_pred_32(weights, lr, dst, stride);
    2295           0 :     smooth_h_pred_32(weights, lr, dst, stride);
    2296           0 :     smooth_h_pred_32(weights, lr, dst, stride);
    2297           0 :     smooth_h_pred_32(weights, lr, dst, stride);
    2298           0 : }
    2299             : 
    2300           0 : static INLINE void smooth_h_pred_32x8(uint16_t *dst,
    2301             :     const ptrdiff_t stride, const uint16_t *const above, const uint16_t *left,
    2302             :     const int32_t n)
    2303             : {
    2304             :     __m256i r, lr[2], weights[4];
    2305             : 
    2306           0 :     load_right_weights_32(above, &r, weights);
    2307             : 
    2308           0 :     for (int32_t i = 0; i < n; i++) {
    2309           0 :         load_left_8(left, r, lr);
    2310           0 :         smooth_h_pred_32x4(weights, &lr[0], &dst, stride);
    2311           0 :         smooth_h_pred_32x4(weights, &lr[1], &dst, stride);
    2312           0 :         left += 8;
    2313             :     }
    2314           0 : }
    2315             : 
    2316             : // 32x8
    2317             : 
    2318           0 : void eb_aom_highbd_smooth_h_predictor_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
    2319             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    2320             : {
    2321             :     (void)bd;
    2322           0 :     smooth_h_pred_32x8(dst, stride, above, left, 1);
    2323           0 : }
    2324             : 
    2325             : // 32x16
    2326             : 
    2327           0 : void eb_aom_highbd_smooth_h_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
    2328             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    2329             : {
    2330             :     (void)bd;
    2331           0 :     smooth_h_pred_32x8(dst, stride, above, left, 2);
    2332           0 : }
    2333             : 
    2334             : // 32x32
    2335             : 
    2336           0 : void eb_aom_highbd_smooth_h_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
    2337             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    2338             : {
    2339             :     (void)bd;
    2340           0 :     smooth_h_pred_32x8(dst, stride, above, left, 4);
    2341           0 : }
    2342             : 
    2343             : // 32x64
    2344             : 
    2345           0 : void eb_aom_highbd_smooth_h_predictor_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
    2346             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    2347             : {
    2348             :     (void)bd;
    2349           0 :     smooth_h_pred_32x8(dst, stride, above, left, 8);
    2350           0 : }
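
Widths 16 and above store with _mm256_storeu_si256, so unlike the
width-8 paths the destination needs no particular alignment and any
stride works. A sketch with illustrative values:

    #include <stdint.h>
    #include <stddef.h>

    void eb_aom_highbd_smooth_h_predictor_32x32_avx2(uint16_t *dst,
        ptrdiff_t stride, const uint16_t *above, const uint16_t *left,
        int32_t bd);

    int main(void) {
        uint16_t frame[40 * 33];  /* arbitrary picture buffer */
        uint16_t above[32], left[32];
        for (int i = 0; i < 32; i++) { above[i] = 300; left[i] = 900; }
        /* Deliberately unaligned offset and non-multiple-of-16 stride. */
        eb_aom_highbd_smooth_h_predictor_32x32_avx2(frame + 3, 40,
            above, left, 10);
        return 0;
    }
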
    2351             : 
    2352             : // -----------------------------------------------------------------------------
    2353             : // 64xN
    2354             : 
    2355           0 : static INLINE void smooth_h_pred_64(const __m256i *const weights,
    2356             :     __m256i *const lr, uint16_t **const dst, const ptrdiff_t stride)
    2357             : {
    2358           0 :     const __m256i rep = _mm256_set1_epi32(0x03020100);
    2359             :     // lr: 0 1 2 3  0 1 2 3
    2360           0 :     const __m256i t = _mm256_shuffle_epi8(*lr, rep); // 0 0 0 0  0 0 0 0
    2361             :     __m256i d;
    2362             : 
    2363             :     //  0  1  2  3  4  5  6  7   8  9 10 11 12 13 14 15
    2364           0 :     d = smooth_h_pred_kernel(weights + 0, t);
    2365           0 :     _mm256_storeu_si256((__m256i *)(*dst + 0x00), d);
    2366             : 
    2367             :     // 16 17 18 19 20 21 22 23  24 25 26 27 28 29 30 31
    2368           0 :     d = smooth_h_pred_kernel(weights + 2, t);
    2369           0 :     _mm256_storeu_si256((__m256i *)(*dst + 0x10), d);
    2370             : 
    2371             :     // 32 33 34 35 36 37 38 39  40 41 42 43 44 45 46 47
    2372           0 :     d = smooth_h_pred_kernel(weights + 4, t);
    2373           0 :     _mm256_storeu_si256((__m256i *)(*dst + 0x20), d);
    2374             : 
    2375             :     // 48 49 50 51 52 53 54 55  56 57 58 59 60 61 62 63
    2376           0 :     d = smooth_h_pred_kernel(weights + 6, t);
    2377           0 :     _mm256_storeu_si256((__m256i *)(*dst + 0x30), d);
    2378           0 :     *dst += stride;
    2379           0 :     *lr = _mm256_srli_si256(*lr, 4); // 1 2 3 x  1 2 3 x
    2380           0 : }
    2381             : 
    2382           0 : static INLINE void smooth_h_pred_64x4(const __m256i *const weights,
    2383             :     __m256i *const lr, uint16_t **const dst, const ptrdiff_t stride)
    2384             : {
    2385           0 :     smooth_h_pred_64(weights, lr, dst, stride);
    2386           0 :     smooth_h_pred_64(weights, lr, dst, stride);
    2387           0 :     smooth_h_pred_64(weights, lr, dst, stride);
    2388           0 :     smooth_h_pred_64(weights, lr, dst, stride);
    2389           0 : }
    2390             : 
    2391           0 : static INLINE void smooth_h_pred_64x8(uint16_t *dst,
    2392             :     const ptrdiff_t stride, const uint16_t *const above, const uint16_t *left,
    2393             :     const int32_t n)
    2394             : {
    2395             :     __m256i r, lr[2], weights[8];
    2396             : 
    2397           0 :     load_right_weights_64(above, &r, weights);
    2398             : 
    2399           0 :     for (int32_t i = 0; i < n; i++) {
    2400           0 :         load_left_8(left, r, lr);
    2401           0 :         smooth_h_pred_64x4(weights, &lr[0], &dst, stride);
    2402           0 :         smooth_h_pred_64x4(weights, &lr[1], &dst, stride);
    2403           0 :         left += 8;
    2404             :     }
    2405           0 : }
    2406             : 
    2407             : // 64x16
    2408             : 
    2409           0 : void eb_aom_highbd_smooth_h_predictor_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
    2410             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    2411             : {
    2412             :     (void)bd;
    2413           0 :     smooth_h_pred_64x8(dst, stride, above, left, 2);
    2414           0 : }
    2415             : 
    2416             : // 64x32
    2417             : 
    2418           0 : void eb_aom_highbd_smooth_h_predictor_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
    2419             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    2420             : {
    2421             :     (void)bd;
    2422           0 :     smooth_h_pred_64x8(dst, stride, above, left, 4);
    2423           0 : }
    2424             : 
    2425             : // 64x64
    2426             : 
    2427           0 : void eb_aom_highbd_smooth_h_predictor_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
    2428             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    2429             : {
    2430             :     (void)bd;
    2431           0 :     smooth_h_pred_64x8(dst, stride, above, left, 8);
    2432           0 : }
    2433             : 
    2434             : // =============================================================================
    2435             : 
    2436             : // SMOOTH_V_PRED
    2437             : 
    2438             : // 8xN
    2439             : 
    2440           0 : static INLINE void smooth_v_init_8(const uint16_t *const above,
    2441             :     const uint16_t *const left, const int32_t h, __m256i *const ab,
    2442             :     __m256i *const rep)
    2443             : {
    2444           0 :     const __m128i a0 = _mm_loadl_epi64((const __m128i *)(above + 0));
    2445           0 :     const __m128i a1 = _mm_loadl_epi64((const __m128i *)(above + 4));
    2446           0 :     const __m256i b = _mm256_set1_epi16((uint16_t)left[h - 1]);
    2447             :     __m256i a[2];
    2448           0 :     a[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(a0), a0, 1);
    2449           0 :     a[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(a1), a1, 1);
    2450           0 :     ab[0] = _mm256_unpacklo_epi16(a[0], b);
    2451           0 :     ab[1] = _mm256_unpacklo_epi16(a[1], b);
    2452             : 
    2453           0 :     const __m128i rep0 = _mm_set1_epi32(0x03020100);
    2454           0 :     const __m128i rep1 = _mm_set1_epi32(0x07060504);
    2455           0 :     const __m128i rep2 = _mm_set1_epi32(0x0B0A0908);
    2456           0 :     const __m128i rep3 = _mm_set1_epi32(0x0F0E0D0C);
    2457           0 :     rep[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(rep0), rep1, 1);
    2458           0 :     rep[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(rep2), rep3, 1);
    2459           0 : }
    2460             : 
    2461           0 : static INLINE __m256i smooth_v_pred_kernel(const __m256i weights,
    2462             :     const __m256i rep, const __m256i *const ab)
    2463             : {
    2464           0 :     const __m256i round = _mm256_set1_epi32((1 << (sm_weight_log2_scale - 1)));
    2465             :     __m256i sum[2];
    2466             :     // width 8: 0 0 0 0  1 1 1 1; width 16+: 0 0 0 0  0 0 0 0
    2467           0 :     const __m256i w = _mm256_shuffle_epi8(weights, rep);
    2468           0 :     sum[0] = _mm256_madd_epi16(ab[0], w);
    2469           0 :     sum[1] = _mm256_madd_epi16(ab[1], w);
    2470           0 :     sum[0] = _mm256_add_epi32(sum[0], round);
    2471           0 :     sum[1] = _mm256_add_epi32(sum[1], round);
    2472           0 :     sum[0] = _mm256_srai_epi32(sum[0], sm_weight_log2_scale);
    2473           0 :     sum[1] = _mm256_srai_epi32(sum[1], sm_weight_log2_scale);
    2474             :     // width 8: 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
    2475             :     // width 16: 0 1 2 3 4 5 6 7  8 9 A B C D E F
    2476           0 :     return _mm256_packs_epi32(sum[0], sum[1]);
    2477             : }
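
The kernel's central trick is _mm256_madd_epi16 over interleaved data:
ab holds (above[c], bottom_left) 16-bit pairs and w holds the matching
(weight, 256 - weight) pair, so a single madd yields the full 32-bit
blend weight*above[c] + (256 - weight)*bottom_left in each lane element.
A minimal standalone check of that identity (all values arbitrary):

    #include <immintrin.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        const uint16_t above_px = 813, bl = 204; /* arbitrary samples */
        const uint16_t wt = 149;                 /* arbitrary weight (scale 256) */
        /* Interleave (above, bl) pairs and (wt, 256 - wt) pairs, as the
         * smooth_v kernels do. */
        const __m256i ab = _mm256_set1_epi32((int)(above_px | ((uint32_t)bl << 16)));
        const __m256i w  = _mm256_set1_epi32((int)(wt | ((uint32_t)(256 - wt) << 16)));
        const __m256i sum = _mm256_madd_epi16(ab, w); /* wt*above + (256-wt)*bl */
        int32_t out[8];
        _mm256_storeu_si256((__m256i *)out, sum);
        printf("%d == %d\n", out[0], wt * above_px + (256 - wt) * bl);
        return 0;
    }
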
    2478             : 
    2479           0 : static INLINE void smooth_v_pred_8x2(const __m256i weights, const __m256i rep,
    2480             :     const __m256i *const ab, uint16_t **const dst, const ptrdiff_t stride)
    2481             : {
    2482             :     // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
    2483           0 :     const __m256i d = smooth_v_pred_kernel(weights, rep, ab);
    2484           0 :     _mm_store_si128((__m128i *)*dst, _mm256_extracti128_si256(d, 0));
    2485           0 :     *dst += stride;
    2486           0 :     _mm_store_si128((__m128i *)*dst, _mm256_extracti128_si256(d, 1));
    2487           0 :     *dst += stride;
    2488           0 : }
    2489             : 
    2490           0 : static INLINE void smooth_v_pred_8x4(const uint16_t *const sm_weights_h,
    2491             :     const __m256i *const rep, const __m256i *const ab, uint16_t **const dst,
    2492             :     const ptrdiff_t stride)
    2493             : {
    2494           0 :     const __m256i weights = _mm256_load_si256((const __m256i *)sm_weights_h);
    2495           0 :     smooth_v_pred_8x2(weights, rep[0], ab, dst, stride);
    2496           0 :     smooth_v_pred_8x2(weights, rep[1], ab, dst, stride);
    2497           0 : }
    2498             : 
    2499           0 : static INLINE void smooth_v_pred_8x8(const uint16_t *const sm_weights_h,
    2500             :     const __m256i *const rep, const __m256i *const ab, uint16_t **const dst,
    2501             :     const ptrdiff_t stride)
    2502             : {
    2503           0 :     smooth_v_pred_8x4(sm_weights_h + 0, rep, ab, dst, stride);
    2504           0 :     smooth_v_pred_8x4(sm_weights_h + 16, rep, ab, dst, stride);
    2505           0 : }
    2506             : 
    2507             : // 8x4
    2508             : 
    2509           0 : void eb_aom_highbd_smooth_v_predictor_8x4_avx2(uint16_t *dst, ptrdiff_t stride,
    2510             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    2511             : {
    2512             :     __m256i ab[2], rep[2];
    2513             :     (void)bd;
    2514             : 
    2515           0 :     smooth_v_init_8(above, left, 4, ab, rep);
    2516           0 :     smooth_v_pred_8x4(sm_weights_d_4, rep, ab, &dst, stride);
    2517           0 : }
    2518             : 
    2519             : // 8x8
    2520             : 
    2521           0 : void eb_aom_highbd_smooth_v_predictor_8x8_avx2(uint16_t *dst, ptrdiff_t stride,
    2522             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    2523             : {
    2524             :     __m256i ab[2], rep[2];
    2525             :     (void)bd;
    2526             : 
    2527           0 :     smooth_v_init_8(above, left, 8, ab, rep);
    2528             : 
    2529           0 :     smooth_v_pred_8x8(sm_weights_d_8, rep, ab, &dst, stride);
    2530           0 : }
    2531             : 
    2532             : // 8x16
    2533             : 
    2534           0 : void eb_aom_highbd_smooth_v_predictor_8x16_avx2(uint16_t *dst, ptrdiff_t stride,
    2535             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    2536             : {
    2537             :     __m256i ab[2], rep[2];
    2538             :     (void)bd;
    2539             : 
    2540           0 :     smooth_v_init_8(above, left, 16, ab, rep);
    2541             : 
    2542           0 :     for (int32_t i = 0; i < 2; i++)
    2543           0 :         smooth_v_pred_8x8(sm_weights_d_16 + 32 * i, rep, ab, &dst, stride);
    2544           0 : }
    2545             : 
    2546             : // 8x32
    2547             : 
    2548           0 : void eb_aom_highbd_smooth_v_predictor_8x32_avx2(uint16_t *dst, ptrdiff_t stride,
    2549             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    2550             : {
    2551             :     __m256i ab[2], rep[2];
    2552             :     (void)bd;
    2553             : 
    2554           0 :     smooth_v_init_8(above, left, 32, ab, rep);
    2555             : 
    2556           0 :     for (int32_t i = 0; i < 4; i++)
    2557           0 :         smooth_v_pred_8x8(sm_weights_d_32 + 32 * i, rep, ab, &dst, stride);
    2558           0 : }
    2559             : 
    2560             : // -----------------------------------------------------------------------------
    2561             : // 16xN
    2562             : 
    2563           0 : static INLINE void smooth_v_prepare_ab(const uint16_t *const above,
    2564             :     const __m256i b, __m256i *const ab)
    2565             : {
    2566           0 :     const __m256i a = _mm256_loadu_si256((const __m256i *)above);
    2567           0 :     ab[0] = _mm256_unpacklo_epi16(a, b);
    2568           0 :     ab[1] = _mm256_unpackhi_epi16(a, b);
    2569           0 : }
    2570             : 
    2571           0 : static INLINE void smooth_v_init_16(const uint16_t *const above,
    2572             :     const uint16_t *const left, const int32_t h, __m256i *const ab,
    2573             :     __m256i *const rep)
    2574             : {
    2575           0 :     const __m256i b = _mm256_set1_epi16((uint16_t)left[h - 1]);
    2576           0 :     smooth_v_prepare_ab(above, b, ab);
    2577             : 
    2578           0 :     rep[0] = _mm256_set1_epi32(0x03020100);
    2579           0 :     rep[1] = _mm256_set1_epi32(0x07060504);
    2580           0 :     rep[2] = _mm256_set1_epi32(0x0B0A0908);
    2581           0 :     rep[3] = _mm256_set1_epi32(0x0F0E0D0C);
    2582           0 : }
    2583             : 
    2584           0 : static INLINE void smooth_v_pred_16(const __m256i weights, const __m256i rep,
    2585             :     const __m256i *const ab, uint16_t **const dst, const ptrdiff_t stride)
    2586             : {
    2587           0 :     const __m256i d = smooth_v_pred_kernel(weights, rep, ab);
    2588           0 :     _mm256_storeu_si256((__m256i *)*dst, d);
    2589           0 :     *dst += stride;
    2590           0 : }
    2591             : 
    2592           0 : static INLINE void smooth_v_pred_16x4(const uint16_t *const sm_weights_h,
    2593             :     const __m256i *const rep, const __m256i *const ab, uint16_t **const dst,
    2594             :     const ptrdiff_t stride)
    2595             : {
    2596           0 :     const __m256i weights = _mm256_load_si256((const __m256i *)sm_weights_h);
    2597           0 :     smooth_v_pred_16(weights, rep[0], ab, dst, stride);
    2598           0 :     smooth_v_pred_16(weights, rep[1], ab, dst, stride);
    2599           0 :     smooth_v_pred_16(weights, rep[2], ab, dst, stride);
    2600           0 :     smooth_v_pred_16(weights, rep[3], ab, dst, stride);
    2601           0 : }
    2602             : 
    2603           0 : static INLINE void smooth_v_pred_16x8(const uint16_t *const sm_weights_h,
    2604             :     const __m256i *const rep, const __m256i *const ab, uint16_t **const dst,
    2605             :     const ptrdiff_t stride)
    2606             : {
    2607           0 :     smooth_v_pred_16x4(sm_weights_h + 0, rep, ab, dst, stride);
    2608           0 :     smooth_v_pred_16x4(sm_weights_h + 16, rep, ab, dst, stride);
    2609           0 : }
    2610             : 
    2611             : // 16x4
    2612             : 
    2613           0 : void eb_aom_highbd_smooth_v_predictor_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
    2614             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    2615             : {
    2616             :     __m256i ab[2], rep[4];
    2617             :     (void)bd;
    2618             : 
    2619           0 :     smooth_v_init_16(above, left, 4, ab, rep);
    2620           0 :     smooth_v_pred_16x4(sm_weights_d_4, rep, ab, &dst, stride);
    2621           0 : }
    2622             : 
    2623             : // 16x8
    2624             : 
    2625           0 : void eb_aom_highbd_smooth_v_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
    2626             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    2627             : {
    2628             :     __m256i ab[2], rep[4];
    2629             :     (void)bd;
    2630             : 
    2631           0 :     smooth_v_init_16(above, left, 8, ab, rep);
    2632             : 
    2633           0 :     smooth_v_pred_16x8(sm_weights_d_8, rep, ab, &dst, stride);
    2634           0 : }
    2635             : 
    2636             : // 16x16
    2637             : 
    2638           0 : void eb_aom_highbd_smooth_v_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
    2639             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    2640             : {
    2641             :     __m256i ab[2], rep[4];
    2642             :     (void)bd;
    2643             : 
    2644           0 :     smooth_v_init_16(above, left, 16, ab, rep);
    2645             : 
    2646           0 :     for (int32_t i = 0; i < 2; i++)
    2647           0 :         smooth_v_pred_16x8(sm_weights_d_16 + 32 * i, rep, ab, &dst, stride);
    2648           0 : }
    2649             : 
    2650             : // 16x32
    2651             : 
    2652           0 : void eb_aom_highbd_smooth_v_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
    2653             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    2654             : {
    2655             :     __m256i ab[2], rep[4];
    2656             :     (void)bd;
    2657             : 
    2658           0 :     smooth_v_init_16(above, left, 32, ab, rep);
    2659             : 
    2660           0 :     for (int32_t i = 0; i < 4; i++)
    2661           0 :         smooth_v_pred_16x8(sm_weights_d_32 + 32 * i, rep, ab, &dst, stride);
    2662           0 : }
    2663             : 
    2664             : // 16x64
    2665             : 
    2666           0 : void eb_aom_highbd_smooth_v_predictor_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
    2667             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    2668             : {
    2669             :     __m256i ab[2], rep[4];
    2670             :     (void)bd;
    2671             : 
    2672           0 :     smooth_v_init_16(above, left, 64, ab, rep);
    2673             : 
    2674           0 :     for (int32_t i = 0; i < 8; i++)
    2675           0 :         smooth_v_pred_16x8(sm_weights_d_64 + 32 * i, rep, ab, &dst, stride);
    2676           0 : }
    2677             : 
    2678             : // -----------------------------------------------------------------------------
    2679             : // 32xN
    2680             : 
    2681           0 : static INLINE void smooth_v_init_32(const uint16_t *const above,
    2682             :     const uint16_t *const left, const int32_t h, __m256i *const ab,
    2683             :     __m256i *const rep)
    2684             : {
    2685           0 :     const __m256i b = _mm256_set1_epi16((uint16_t)left[h - 1]);
    2686           0 :     smooth_v_prepare_ab(above + 0x00, b, ab + 0);
    2687           0 :     smooth_v_prepare_ab(above + 0x10, b, ab + 2);
    2688             : 
    2689           0 :     rep[0] = _mm256_set1_epi32(0x03020100);
    2690           0 :     rep[1] = _mm256_set1_epi32(0x07060504);
    2691           0 :     rep[2] = _mm256_set1_epi32(0x0B0A0908);
    2692           0 :     rep[3] = _mm256_set1_epi32(0x0F0E0D0C);
    2693           0 : }
    2694             : 
    2695           0 : static INLINE void smooth_v_pred_32(const __m256i weights, const __m256i rep,
    2696             :     const __m256i *const ab, uint16_t **const dst, const ptrdiff_t stride)
    2697             : {
    2698             :     __m256i d;
    2699             : 
    2700             :     //  0  1  2  3  4  5  6  7   8  9 10 11 12 13 14 15
    2701           0 :     d = smooth_v_pred_kernel(weights, rep, ab + 0);
    2702           0 :     _mm256_storeu_si256((__m256i *)(*dst + 0x00), d);
    2703             : 
    2704             :     // 16 17 18 19 20 21 22 23  24 25 26 27 28 29 30 31
    2705           0 :     d = smooth_v_pred_kernel(weights, rep, ab + 2);
    2706           0 :     _mm256_storeu_si256((__m256i *)(*dst + 0x10), d);
    2707           0 :     *dst += stride;
    2708           0 : }
    2709             : 
    2710           0 : static INLINE void smooth_v_pred_32x4(const uint16_t *const sm_weights_h,
    2711             :     const __m256i *const rep, const __m256i *const ab, uint16_t **const dst,
    2712             :     const ptrdiff_t stride)
    2713             : {
    2714           0 :     const __m256i weights = _mm256_load_si256((const __m256i *)sm_weights_h);
    2715           0 :     smooth_v_pred_32(weights, rep[0], ab, dst, stride);
    2716           0 :     smooth_v_pred_32(weights, rep[1], ab, dst, stride);
    2717           0 :     smooth_v_pred_32(weights, rep[2], ab, dst, stride);
    2718           0 :     smooth_v_pred_32(weights, rep[3], ab, dst, stride);
    2719           0 : }
    2720             : 
    2721           0 : static INLINE void smooth_v_pred_32x8(const uint16_t *const sm_weights_h,
    2722             :     const __m256i *const rep, const __m256i *const ab, uint16_t **const dst,
    2723             :     const ptrdiff_t stride)
    2724             : {
    2725           0 :     smooth_v_pred_32x4(sm_weights_h + 0, rep, ab, dst, stride);
    2726           0 :     smooth_v_pred_32x4(sm_weights_h + 16, rep, ab, dst, stride);
    2727           0 : }
    2728             : 
    2729             : // 32x8
    2730             : 
    2731           0 : void eb_aom_highbd_smooth_v_predictor_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
    2732             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    2733             : {
    2734             :     __m256i ab[4], rep[4];
    2735             :     (void)bd;
    2736             : 
    2737           0 :     smooth_v_init_32(above, left, 8, ab, rep);
    2738             : 
    2739           0 :     smooth_v_pred_32x8(sm_weights_d_8, rep, ab, &dst, stride);
    2740           0 : }
    2741             : 
    2742             : // 32x16
    2743             : 
    2744           0 : void eb_aom_highbd_smooth_v_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
    2745             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    2746             : {
    2747             :     __m256i ab[4], rep[4];
    2748             :     (void)bd;
    2749             : 
    2750           0 :     smooth_v_init_32(above, left, 16, ab, rep);
    2751             : 
    2752           0 :     for (int32_t i = 0; i < 2; i++)
    2753           0 :         smooth_v_pred_32x8(sm_weights_d_16 + 32 * i, rep, ab, &dst, stride);
    2754           0 : }
    2755             : 
    2756             : // 32x32
    2757             : 
    2758           0 : void eb_aom_highbd_smooth_v_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
    2759             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    2760             : {
    2761             :     __m256i ab[4], rep[4];
    2762             :     (void)bd;
    2763             : 
    2764           0 :     smooth_v_init_32(above, left, 32, ab, rep);
    2765             : 
    2766           0 :     for (int32_t i = 0; i < 4; i++)
    2767           0 :         smooth_v_pred_32x8(sm_weights_d_32 + 32 * i, rep, ab, &dst, stride);
    2768           0 : }
    2769             : 
    2770             : // 32x64
    2771             : 
    2772           0 : void eb_aom_highbd_smooth_v_predictor_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
    2773             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    2774             : {
    2775             :     __m256i ab[4], rep[4];
    2776             :     (void)bd;
    2777             : 
    2778           0 :     smooth_v_init_32(above, left, 64, ab, rep);
    2779             : 
    2780           0 :     for (int32_t i = 0; i < 8; i++)
    2781           0 :         smooth_v_pred_32x8(sm_weights_d_64 + 32 * i, rep, ab, &dst, stride);
    2782           0 : }
    2783             : 
    2784             : // -----------------------------------------------------------------------------
    2785             : // 64xN
    2786             : 
    2787           0 : static INLINE void smooth_v_init_64(const uint16_t *const above,
    2788             :     const uint16_t *const left, const int32_t h, __m256i *const ab,
    2789             :     __m256i *const rep)
    2790             : {
    2791           0 :     const __m256i b = _mm256_set1_epi16((uint16_t)left[h - 1]);
    2792           0 :     smooth_v_prepare_ab(above + 0x00, b, ab + 0);
    2793           0 :     smooth_v_prepare_ab(above + 0x10, b, ab + 2);
    2794           0 :     smooth_v_prepare_ab(above + 0x20, b, ab + 4);
    2795           0 :     smooth_v_prepare_ab(above + 0x30, b, ab + 6);
    2796             : 
    2797           0 :     rep[0] = _mm256_set1_epi32(0x03020100);
    2798           0 :     rep[1] = _mm256_set1_epi32(0x07060504);
    2799           0 :     rep[2] = _mm256_set1_epi32(0x0B0A0908);
    2800           0 :     rep[3] = _mm256_set1_epi32(0x0F0E0D0C);
    2801           0 : }
    2802             : 
    2803           0 : static INLINE void smooth_v_pred_64(const __m256i weights, const __m256i rep,
    2804             :     const __m256i *const ab, uint16_t **const dst, const ptrdiff_t stride)
    2805             : {
    2806             :     __m256i d;
    2807             : 
    2808             :     //  0  1  2  3  4  5  6  7   8  9 10 11 12 13 14 15
    2809           0 :     d = smooth_v_pred_kernel(weights, rep, ab + 0);
    2810           0 :     _mm256_storeu_si256((__m256i *)(*dst + 0x00), d);
    2811             : 
    2812             :     // 16 17 18 19 20 21 22 23  24 25 26 27 28 29 30 31
    2813           0 :     d = smooth_v_pred_kernel(weights, rep, ab + 2);
    2814           0 :     _mm256_storeu_si256((__m256i *)(*dst + 0x10), d);
    2815             : 
    2816             :     // 32 33 34 35 36 37 38 39  40 41 42 43 44 45 46 47
    2817           0 :     d = smooth_v_pred_kernel(weights, rep, ab + 4);
    2818           0 :     _mm256_storeu_si256((__m256i *)(*dst + 0x20), d);
    2819             : 
    2820             :     // 48 49 50 51 52 53 54 55  56 57 58 59 60 61 62 63
    2821           0 :     d = smooth_v_pred_kernel(weights, rep, ab + 6);
    2822           0 :     _mm256_storeu_si256((__m256i *)(*dst + 0x30), d);
    2823           0 :     *dst += stride;
    2824           0 : }
    2825             : 
    2826           0 : static INLINE void smooth_v_pred_64x4(const uint16_t *const sm_weights_h,
    2827             :     const __m256i *const rep, const __m256i *const ab, uint16_t **const dst,
    2828             :     const ptrdiff_t stride)
    2829             : {
    2830           0 :     const __m256i weights = _mm256_load_si256((const __m256i *)sm_weights_h);
    2831           0 :     smooth_v_pred_64(weights, rep[0], ab, dst, stride);
    2832           0 :     smooth_v_pred_64(weights, rep[1], ab, dst, stride);
    2833           0 :     smooth_v_pred_64(weights, rep[2], ab, dst, stride);
    2834           0 :     smooth_v_pred_64(weights, rep[3], ab, dst, stride);
    2835           0 : }
    2836             : 
    2837           0 : static INLINE void smooth_v_pred_64x8(const uint16_t *const sm_weights_h,
    2838             :     const __m256i *const rep, const __m256i *const ab,
    2839             :     uint16_t **const dst, const ptrdiff_t stride)
    2840             : {
    2841           0 :     smooth_v_pred_64x4(sm_weights_h + 0, rep, ab, dst, stride);
    2842           0 :     smooth_v_pred_64x4(sm_weights_h + 16, rep, ab, dst, stride);
    2843           0 : }
    2844             : 
    2845             : // 64x16
    2846             : 
    2847           0 : void eb_aom_highbd_smooth_v_predictor_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
    2848             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    2849             : {
    2850             :     __m256i ab[8], rep[4];
    2851             :     (void)bd;
    2852             : 
    2853           0 :     smooth_v_init_64(above, left, 16, ab, rep);
    2854             : 
    2855           0 :     for (int32_t i = 0; i < 2; i++)
    2856           0 :         smooth_v_pred_64x8(sm_weights_d_16 + 32 * i, rep, ab, &dst, stride);
    2857           0 : }
    2858             : 
    2859             : // 64x32
    2860             : 
    2861           0 : void eb_aom_highbd_smooth_v_predictor_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
    2862             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    2863             : {
    2864             :     __m256i ab[8], rep[4];
    2865             :     (void)bd;
    2866             : 
    2867           0 :     smooth_v_init_64(above, left, 32, ab, rep);
    2868             : 
    2869           0 :     for (int32_t i = 0; i < 4; i++)
    2870           0 :         smooth_v_pred_64x8(sm_weights_d_32 + 32 * i, rep, ab, &dst, stride);
    2871           0 : }
    2872             : 
    2873             : // 64x64
    2874             : 
    2875           0 : void eb_aom_highbd_smooth_v_predictor_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
    2876             :     const uint16_t *above, const uint16_t *left, int32_t bd)
    2877             : {
    2878             :     __m256i ab[8], rep[4];
    2879             :     (void)bd;
    2880             : 
    2881           0 :     smooth_v_init_64(above, left, 64, ab, rep);
    2882             : 
    2883           0 :     for (int32_t i = 0; i < 8; i++)
    2884           0 :         smooth_v_pred_64x8(sm_weights_d_64 + 32 * i, rep, ab, &dst, stride);
    2885           0 : }
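
The smooth predictors here take bd only to discard it: the weights are
nonnegative and each (w, 256 - w) pair sums to 256, so every output is a
convex combination of input samples and cannot leave the input's range,
whatever the bit depth. A small usage sketch with arbitrary 10-bit
extremes:

    #include <stdint.h>
    #include <stddef.h>

    void eb_aom_highbd_smooth_v_predictor_64x64_avx2(uint16_t *dst,
        ptrdiff_t stride, const uint16_t *above, const uint16_t *left,
        int32_t bd);

    int main(void) {
        uint16_t dst[64 * 64];
        uint16_t above[64], left[64];
        for (int i = 0; i < 64; i++) { above[i] = 1023; left[i] = 0; }
        eb_aom_highbd_smooth_v_predictor_64x64_avx2(dst, 64, above, left, 10);
        /* All outputs lie in [0, 1023]; no clamping against bd is needed. */
        return 0;
    }
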

Generated by: LCOV version 1.14