LCOV - code coverage report
Current view: top level - ASM_SSSE3 - intrapred_ssse3.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 649 689 94.2 %
Date: 2019-11-25 17:38:06 Functions: 78 80 97.5 %

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2017, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include "EbDefinitions.h"
      13             : #include "aom_dsp_rtcd.h"
      14             : #include <tmmintrin.h>
      15             : 
      16             :  // Weights are quadratic from '1' to '1 / BlockSize', scaled by
      17             :  // 2^sm_weight_log2_scale, i.e. a stored weight w stands for w / 256.
      18             : static const int32_t sm_weight_log2_scale = 8;
      19             : 
      20             : // max(block_size_wide[BLOCK_LARGEST], block_size_high[BLOCK_LARGEST])
      21             : #define MAX_BLOCK_DIM 64
      22             : 
      23             : /* clang-format off */
      24             : static const uint8_t sm_weight_arrays[2 * MAX_BLOCK_DIM] = { // layout: the bs weights for block size bs sit at indices [bs, 2 * bs)
      25             :     // Indices 0..1 are unused, because we always offset by bs, which is at least 2.
      26             :     0, 0,
      27             :     // bs = 2 (indices 2..3)
      28             :     255, 128,
      29             :     // bs = 4 (indices 4..7)
      30             :     255, 149, 85, 64,
      31             :     // bs = 8 (indices 8..15)
      32             :     255, 197, 146, 105, 73, 50, 37, 32,
      33             :     // bs = 16 (indices 16..31)
      34             :     255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
      35             :     // bs = 32 (indices 32..63)
      36             :     255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
      37             :     66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
      38             :     // bs = 64 (indices 64..127)
      39             :     255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
      40             :     150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73, 69,
      41             :     65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, 15,
      42             :     13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4,
      43             : };
      44             : 
      45             : // -----------------------------------------------------------------------------
      46             : // PAETH_PRED
      47             : 
      48             : // -----------------------------------------------------------------------------
      49             : // SMOOTH_PRED
      50             : 
      51             : // pixels[0]: above and below_pred interleave vector: 16-bit pairs (above[x], left[height - 1]) for x = 0..3
      52             : // pixels[1]: left vector: the first 4 bytes of left for height 4, 8 bytes for height 8, 16 bytes otherwise
      53             : // pixels[2]: right_pred vector: above[3] replicated into every 16-bit lane
      54      267394 : static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
      55             :     int32_t height, __m128i *pixels) {
      56      267394 :     __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]); // the 4 above pixels, fetched as one 32-bit word
      57      267394 :     if (height == 4)
      58      372746 :         pixels[1] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
      59       81021 :     else if (height == 8)
      60      109728 :         pixels[1] = _mm_loadl_epi64(((const __m128i *)left));
      61             :     else
      62       52314 :         pixels[1] = _mm_loadu_si128(((const __m128i *)left)); // unaligned 16-byte load; the only caller in view reaching here passes height 16
      63             : 
      64      267394 :     pixels[2] = _mm_set1_epi16((uint16_t)above[3]);
      65             : 
      66      534788 :     const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]); // last left pixel (the below_pred value) in every 16-bit lane
      67      267394 :     const __m128i zero = _mm_setzero_si128();
      68      267394 :     d = _mm_unpacklo_epi8(d, zero); // widen the above bytes to 16 bits
      69      267394 :     pixels[0] = _mm_unpacklo_epi16(d, bp);
      70      267394 : }
      71             : 
      72             : // weight_h[0]: weight_h vector (row weights widened to 16 bits; rows 0..7 when height = 16)
      73             : // weight_h[1]: scale - weight_h vector, with scale = 1 << sm_weight_log2_scale
      74             : // weight_h[2]: same as [0], second half for height = 16 only (not written for other heights)
      75             : // weight_h[3]: same as [1], second half for height = 16 only (not written for other heights)
      76             : // weight_w[0]: weights_w and scale - weights_w interleave vector, built from weight_array[4..7]
      77      267393 : static INLINE void load_weight_w4(const uint8_t *weight_array, int32_t height,
      78             :     __m128i *weight_h, __m128i *weight_w) {
      79      267393 :     const __m128i zero = _mm_setzero_si128();
      80      267393 :     const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
      81      534786 :     const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]); // bytes 4..7: the bs = 4 entries when weight_array is sm_weight_arrays
      82      267393 :     weight_h[0] = _mm_unpacklo_epi8(t, zero); // for height 4 these column weights double as the row weights
      83      267393 :     weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
      84      267393 :     weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
      85             : 
      86      267393 :     if (height == 8) { // replace the row weights with the 8 entries at weight_array[8..15]
      87      109728 :         const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]);
      88       54864 :         weight_h[0] = _mm_unpacklo_epi8(weight, zero);
      89      109728 :         weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
      90             :     }
      91      212529 :     else if (height == 16) { // 16 row weights from weight_array[16..31], split into two 8-lane halves
      92       52318 :         const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
      93       26159 :         weight_h[0] = _mm_unpacklo_epi8(weight, zero);
      94       26159 :         weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
      95       26159 :         weight_h[2] = _mm_unpackhi_epi8(weight, zero);
      96       52318 :         weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
      97             :     }
      98      267393 : }
      99             : // smooth_pred_4xh: emits h rows of a 4-pixel-wide SMOOTH prediction, storing 4 bytes per row and advancing dst by stride.
     100      293553 : static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh, // pixel/wh/ww: vectors as filled by load_pixel_w4 and load_weight_w4
     101             :     const __m128i *ww, int32_t h, uint8_t *dst,
     102             :     ptrdiff_t stride, int32_t second_half) { // second_half != 0: start reading left pixels at byte 8 of pixel[1] instead of byte 0
     103      587106 :     const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale)); // half of the 2^(1 + scale) divisor applied by the final shift
     104      293553 :     const __m128i one = _mm_set1_epi16(1);
     105      293553 :     const __m128i inc = _mm_set1_epi16(0x202); // +2 on both byte indices of each lane: steps d to the next 16-bit element
     106      293553 :     const __m128i gat = _mm_set1_epi32(0xc080400); // shuffle mask gathering bytes 0, 4, 8, 12 (low byte of each 32-bit lane)
     107      587106 :     __m128i rep = second_half ? _mm_set1_epi16((short)0x8008) : _mm_set1_epi16((short)0x8000); // per lane: low byte picks one byte of pixel[1], high byte 0x80 zeroes the upper half
     108      293553 :     __m128i d = _mm_set1_epi16(0x100); // shuffle mask broadcasting 16-bit element 0 (bytes 0 and 1) to every lane
     109             : 
     110     1896430 :     for (int32_t i = 0; i < h; ++i) {
     111     1602880 :         const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d); // this row's weight in every lane
     112     3205750 :         const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d); // this row's (scale - weight) in every lane
     113     1602880 :         const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
     114     1602880 :         __m128i s = _mm_madd_epi16(pixel[0], wh_sc); // vertical term per 32-bit lane: pixel[0] pair dotted with (weight, scale - weight)
     115             : 
     116     1602880 :         __m128i b = _mm_shuffle_epi8(pixel[1], rep); // this row's left byte, zero-extended, in every 16-bit lane
     117     1602880 :         b = _mm_unpacklo_epi16(b, pixel[2]);
     118     3205750 :         __m128i sum = _mm_madd_epi16(b, ww[0]); // horizontal term: (left, pixel[2]) pair dotted with the ww[0] pair
     119             : 
     120     1602880 :         sum = _mm_add_epi32(s, sum);
     121     1602880 :         sum = _mm_add_epi32(sum, round);
     122     3205750 :         sum = _mm_srai_epi32(sum, 1 + sm_weight_log2_scale); // extra 1 in the shift averages the two terms
     123             : 
     124     1602880 :         sum = _mm_shuffle_epi8(sum, gat); // pack the four results into the low 4 bytes
     125     1602880 :         *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
     126     1602880 :         dst += stride;
     127             : 
     128     1602880 :         rep = _mm_add_epi16(rep, one); // next left byte
     129     1602880 :         d = _mm_add_epi16(d, inc); // next row weight
     130             :     }
     131      293553 : }
     132             : // 4x4 SMOOTH predictor: loads the edge pixels and weights for a 4x4 block, then writes 4 rows of 4 bytes to dst.
     133      186373 : void eb_aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
     134             :     const uint8_t *above, const uint8_t *left) {
     135             :     __m128i pixels[3];
     136      186373 :     load_pixel_w4(above, left, 4, pixels);
     137             : 
     138             :     __m128i wh[4], ww[2];
     139      186373 :     load_weight_w4(sm_weight_arrays, 4, wh, ww); // for height 4 only wh[0..1] and ww[0] are written
     140             : 
     141      186373 :     smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0);
     142      186375 : }
     143             : // 4x8 SMOOTH predictor: loads the edge pixels and weights for a 4x8 block, then writes 8 rows of 4 bytes to dst.
     144       54864 : void eb_aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
     145             :     const uint8_t *above, const uint8_t *left) {
     146             :     __m128i pixels[3];
     147       54864 :     load_pixel_w4(above, left, 8, pixels);
     148             : 
     149             :     __m128i wh[4], ww[2];
     150       54864 :     load_weight_w4(sm_weight_arrays, 8, wh, ww); // for height 8 only wh[0..1] and ww[0] are written
     151             : 
     152       54864 :     smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
     153       54864 : }
     154             : // 4x16 SMOOTH predictor: the 16 rows are produced as two passes of 8 rows each.
     155       26159 : void eb_aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
     156             :     const uint8_t *above,
     157             :     const uint8_t *left) {
     158             :     __m128i pixels[3];
     159       26159 :     load_pixel_w4(above, left, 16, pixels);
     160             : 
     161             :     __m128i wh[4], ww[2];
     162       26159 :     load_weight_w4(sm_weight_arrays, 16, wh, ww);
     163             : 
     164       26159 :     smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0); // rows 0..7: wh[0..1], left bytes 0..7
     165       26159 :     dst += stride << 3; // skip the 8 rows just written
     166       26159 :     smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1); // rows 8..15: wh[2..3], left bytes 8..15
     167       26159 : }
     168             : 
     169             : // pixels[0]: above and below_pred interleave vector, first half (pairs for above[0..3])
     170             : // pixels[1]: above and below_pred interleave vector, second half (pairs for above[4..7])
     171             : // pixels[2]: left vector (4, 8 or 16 bytes of left depending on height)
     172             : // pixels[3]: right_pred vector (above[7] in every 16-bit lane)
     173             : // pixels[4]: above and below_pred interleave vector, first half (copy of pixels[0]; written only when height is not 4, 8 or 16)
     174             : // pixels[5]: above and below_pred interleave vector, second half (copy of pixels[1]; same condition)
     175             : // pixels[6]: left vector + 16 (16 bytes starting at left + 16; same condition)
     176             : // pixels[7]: right_pred vector (copy of pixels[3]; same condition)
     177      826684 : static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left,
     178             :     int32_t height, __m128i *pixels) {
     179      826684 :     const __m128i zero = _mm_setzero_si128();
     180     1653370 :     const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]); // last left pixel (the below_pred value) in every 16-bit lane
     181      826684 :     __m128i d = _mm_loadl_epi64((const __m128i *)above);
     182      826684 :     d = _mm_unpacklo_epi8(d, zero); // widen the 8 above bytes to 16 bits
     183      826684 :     pixels[0] = _mm_unpacklo_epi16(d, bp);
     184      826684 :     pixels[1] = _mm_unpackhi_epi16(d, bp);
     185             : 
     186      826684 :     pixels[3] = _mm_set1_epi16((uint16_t)above[7]);
     187             : 
     188      826684 :     if (height == 4)
     189      105128 :         pixels[2] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
     190      774120 :     else if (height == 8)
     191      793418 :         pixels[2] = _mm_loadl_epi64((const __m128i *)left);
     192      377411 :     else if (height == 16)
     193      687440 :         pixels[2] = _mm_load_si128((const __m128i *)left); // NOTE(review): aligned load; presumes left is 16-byte aligned, which is not visible here - confirm against callers (load_pixel_w4 uses _mm_loadu_si128)
     194             :     else { // the only caller in view reaching here passes height 32 and an 8-entry pixels array
     195       33691 :         pixels[2] = _mm_load_si128((const __m128i *)left); // NOTE(review): same alignment assumption as above
     196       33691 :         pixels[4] = pixels[0];
     197       33691 :         pixels[5] = pixels[1];
     198       33691 :         pixels[6] = _mm_load_si128((const __m128i *)(left + 16)); // NOTE(review): same alignment assumption as above
     199       33691 :         pixels[7] = pixels[3];
     200             :     }
     201      826684 : }
     202             : 
     203             : // weight_h[0]: weight_h vector (row weights widened to 16 bits, rows 0..7)
     204             : // weight_h[1]: scale - weight_h vector, with scale = 1 << sm_weight_log2_scale
     205             : // weight_h[2]: same as [0], offset 8 (written for height 16 and 32 only)
     206             : // weight_h[3]: same as [1], offset 8 (written for height 16 and 32 only)
     207             : // weight_h[4]: same as [0], offset 16 (written for height 32 only)
     208             : // weight_h[5]: same as [1], offset 16 (written for height 32 only)
     209             : // weight_h[6]: same as [0], offset 24 (written for height 32 only)
     210             : // weight_h[7]: same as [1], offset 24 (written for height 32 only)
     211             : // weight_w[0]: weights_w and scale - weights_w interleave vector, first half
     212             : // weight_w[1]: weights_w and scale - weights_w interleave vector, second half
     213      826687 : static INLINE void load_weight_w8(const uint8_t *weight_array, int32_t height,
     214             :     __m128i *weight_h, __m128i *weight_w) {
     215      826687 :     const __m128i zero = _mm_setzero_si128();
     216      826687 :     const int32_t we_offset = height < 8 ? 4 : 8; // start at the bs = 4 entries for short blocks, else at the bs = 8 entries
     217     1653370 :     __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[we_offset]); // 16 bytes, so it spans into the following table section
     218      826687 :     weight_h[0] = _mm_unpacklo_epi8(we, zero);
     219      826687 :     const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
     220      826687 :     weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
     221             : 
     222      826687 :     if (height == 4) { // row weights came from weight_array[4..]; the 8 column weights are weight_array[8..15]
     223       52564 :         we = _mm_srli_si128(we, 4); // drop the 4 bs = 4 bytes so the low 8 bytes are weight_array[8..15]
     224       52564 :         __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
     225       52564 :         __m128i tmp2 = _mm_sub_epi16(d, tmp1);
     226       52564 :         weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
     227      105128 :         weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
     228             :     }
     229             :     else { // weight_h[0] already holds weight_array[8..15], which are also the column weights
     230      774123 :         weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
     231     1548250 :         weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
     232             :     }
     233             : 
     234      826687 :     if (height == 16) { // overwrite the row weights with weight_array[16..31]
     235      687438 :         we = _mm_loadu_si128((const __m128i *)&weight_array[16]);
     236      343719 :         weight_h[0] = _mm_unpacklo_epi8(we, zero);
     237      343719 :         weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
     238      343719 :         weight_h[2] = _mm_unpackhi_epi8(we, zero);
     239      687438 :         weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
     240             :     }
     241      482968 :     else if (height == 32) { // overwrite the row weights with weight_array[32..63], 8 rows per vector pair
     242             :         const __m128i weight_lo =
     243       67404 :             _mm_loadu_si128((const __m128i *)&weight_array[32]);
     244       33702 :         weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
     245       33702 :         weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
     246       33702 :         weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
     247       33702 :         weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
     248             :         const __m128i weight_hi =
     249       33702 :             _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
     250       33702 :         weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
     251       33702 :         weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
     252       33702 :         weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
     253       67404 :         weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
     254             :     }
     255      826687 : }
     256             : // smooth_pred_8xh: emits h rows of an 8-pixel-wide SMOOTH prediction, storing 8 bytes per row and advancing dst by stride.
     257     1271490 : static INLINE void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh, // pixels/wh/ww: vectors as filled by load_pixel_w8 and load_weight_w8
     258             :     const __m128i *ww, int32_t h, uint8_t *dst,
     259             :     ptrdiff_t stride, int32_t second_half) { // second_half != 0: start reading left pixels at byte 8 of pixels[2] instead of byte 0
     260     2542990 :     const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale)); // half of the 2^(1 + scale) divisor applied by the final shift
     261     1271490 :     const __m128i one = _mm_set1_epi16(1);
     262     1271490 :     const __m128i inc = _mm_set1_epi16(0x202); // +2 on both byte indices of each lane: steps d to the next 16-bit element
     263     1271490 :     const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); // shuffle mask moving the even bytes 0, 2, ..., 14 into the low 8 bytes
     264             : 
     265     2542990 :     __m128i rep = second_half ? _mm_set1_epi16((short)0x8008) : _mm_set1_epi16((short)0x8000); // per lane: low byte picks one byte of pixels[2], high byte 0x80 zeroes the upper half
     266     1271490 :     __m128i d = _mm_set1_epi16(0x100); // shuffle mask broadcasting 16-bit element 0 (bytes 0 and 1) to every lane
     267             : 
     268             :     int32_t i;
     269    11231900 :     for (i = 0; i < h; ++i) {
     270     9960430 :         const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d); // this row's weight in every lane
     271    19920900 :         const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d); // this row's (scale - weight) in every lane
     272     9960430 :         const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
     273     9960430 :         __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc); // vertical term for columns 0..3
     274     9960430 :         __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc); // vertical term for columns 4..7
     275             : 
     276     9960430 :         __m128i b = _mm_shuffle_epi8(pixels[2], rep); // this row's left byte, zero-extended, in every 16-bit lane
     277     9960430 :         b = _mm_unpacklo_epi16(b, pixels[3]);
     278     9960430 :         __m128i sum0 = _mm_madd_epi16(b, ww[0]); // horizontal term for columns 0..3
     279    19920900 :         __m128i sum1 = _mm_madd_epi16(b, ww[1]); // horizontal term for columns 4..7
     280             : 
     281     9960430 :         s0 = _mm_add_epi32(s0, sum0);
     282     9960430 :         s0 = _mm_add_epi32(s0, round);
     283    19920900 :         s0 = _mm_srai_epi32(s0, 1 + sm_weight_log2_scale); // extra 1 in the shift averages the two terms
     284             : 
     285     9960430 :         s1 = _mm_add_epi32(s1, sum1);
     286     9960430 :         s1 = _mm_add_epi32(s1, round);
     287    19920900 :         s1 = _mm_srai_epi32(s1, 1 + sm_weight_log2_scale);
     288             : 
     289     9960430 :         sum0 = _mm_packus_epi16(s0, s1); // the 32-bit results are packed as 16-bit lanes, so each result lands in an even byte
     290     9960430 :         sum0 = _mm_shuffle_epi8(sum0, gat); // compact those even bytes into 8 contiguous bytes
     291     9960430 :         _mm_storel_epi64((__m128i *)dst, sum0);
     292     9960430 :         dst += stride;
     293             : 
     294     9960430 :         rep = _mm_add_epi16(rep, one); // next left byte
     295     9960430 :         d = _mm_add_epi16(d, inc); // next row weight
     296             :     }
     297     1271490 : }
     298             : // 8x4 SMOOTH predictor: loads the edge pixels and weights for an 8x4 block, then writes 4 rows of 8 bytes to dst.
     299       52564 : void eb_aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
     300             :     const uint8_t *above, const uint8_t *left) {
     301             :     __m128i pixels[4];
     302       52564 :     load_pixel_w8(above, left, 4, pixels);
     303             : 
     304             :     __m128i wh[4], ww[2];
     305       52564 :     load_weight_w8(sm_weight_arrays, 4, wh, ww); // for height 4 only wh[0..1] and ww[0..1] are written
     306             : 
     307       52564 :     smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0);
     308       52564 : }
     309             : // 8x8 SMOOTH predictor: loads the edge pixels and weights for an 8x8 block, then writes 8 rows of 8 bytes to dst.
     310      396711 : void eb_aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
     311             :     const uint8_t *above, const uint8_t *left) {
     312             :     __m128i pixels[4];
     313      396711 :     load_pixel_w8(above, left, 8, pixels);
     314             : 
     315             :     __m128i wh[4], ww[2];
     316      396709 :     load_weight_w8(sm_weight_arrays, 8, wh, ww); // for height 8 only wh[0..1] and ww[0..1] are written
     317             : 
     318      396713 :     smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
     319      396714 : }
     320             : // 8x16 SMOOTH predictor: the 16 rows are produced as two passes of 8 rows each.
     321      343719 : void eb_aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
     322             :     const uint8_t *above,
     323             :     const uint8_t *left) {
     324             :     __m128i pixels[4];
     325      343719 :     load_pixel_w8(above, left, 16, pixels);
     326             : 
     327             :     __m128i wh[4], ww[2];
     328      343720 :     load_weight_w8(sm_weight_arrays, 16, wh, ww);
     329             : 
     330      343720 :     smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0); // rows 0..7: wh[0..1], left bytes 0..7
     331      343721 :     dst += stride << 3; // skip the 8 rows just written
     332      343721 :     smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1); // rows 8..15: wh[2..3], left bytes 8..15
     333      343722 : }
     334             : // 8x32 SMOOTH predictor: the 32 rows are produced as four passes of 8 rows each.
     335       33702 : void eb_aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
     336             :     const uint8_t *above,
     337             :     const uint8_t *left) {
     338             :     __m128i pixels[8];
     339       33702 :     load_pixel_w8(above, left, 32, pixels); // height 32 fills all eight entries; pixels[4..7] mirror [0..3] but carry left + 16
     340             : 
     341             :     __m128i wh[8], ww[2];
     342       33702 :     load_weight_w8(sm_weight_arrays, 32, wh, ww);
     343             : 
     344       33702 :     smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0); // rows 0..7: left bytes 0..7
     345       33702 :     dst += stride << 3;
     346       33702 :     smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1); // rows 8..15: left bytes 8..15
     347       33702 :     dst += stride << 3;
     348       33702 :     smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0); // rows 16..23: left bytes 16..23 via pixels[6]
     349       33702 :     dst += stride << 3;
     350       33702 :     smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1); // rows 24..31: left bytes 24..31 via pixels[6]
     351       33702 : }
     352             : // smooth_predictor_wxh: generic SMOOTH predictor for a bw x bh block, computing 8 output pixels per inner-loop step.
     353      870553 : static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
     354             :     const uint8_t *above,
     355             :     const uint8_t *left, uint32_t bw,
     356             :     uint32_t bh) { // NOTE(review): the x loop steps by 8, so bw is presumably a multiple of 8; callers in view pass 16, 32 or 64
     357      870553 :     const uint8_t *const sm_weights_w = sm_weight_arrays + bw; // weights for a block size live at offset equal to that size
     358      870553 :     const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
     359      870553 :     const __m128i zero = _mm_setzero_si128();
     360             :     const __m128i scale_value =
     361      870553 :         _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
     362     1741110 :     const __m128i bottom_left = _mm_cvtsi32_si128((uint32_t)left[bh - 1]); // last left pixel, in lane 0 only
     363      870553 :     const __m128i dup16 = _mm_set1_epi32(0x01000100); // shuffle mask copying bytes 0 and 1 into every 16-bit lane
     364             :     const __m128i top_right =
     365     2611660 :         _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)above[bw - 1]), dup16); // last above pixel in every 16-bit lane
     366      870553 :     const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); // shuffle mask moving the even bytes 0, 2, ..., 14 into the low 8 bytes
     367      870553 :     const __m128i round = _mm_set1_epi32((uint16_t)(1 << sm_weight_log2_scale)); // half of the 2^(1 + scale) divisor applied by the final shift
     368             : 
     369    15931400 :     for (uint32_t y = 0; y < bh; ++y) {
     370    15060800 :         const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
     371    30121700 :         const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
     372    15060800 :         const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
     373    15060800 :         __m128i pred_scaled_bl = _mm_mullo_epi16(scale_m_weights_y, bottom_left); // (scale - w_y) * bottom_left; only lane 0 is non-zero
     374    15060800 :         const __m128i wl_y =
     375    15060800 :             _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0); // 16-bit pair (w_y, left[y]) repeated in every 32-bit lane
     376    15060800 :         pred_scaled_bl = _mm_add_epi32(pred_scaled_bl, round); // fold the rounding constant into the per-row term
     377    15060800 :         pred_scaled_bl = _mm_shuffle_epi32(pred_scaled_bl, 0); // broadcast lane 0 to all four 32-bit lanes
     378             : 
     379    57215500 :         for (uint32_t x = 0; x < bw; x += 8) {
     380    42154600 :             const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
     381             :             const __m128i weights_x =
     382    84309300 :                 _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
     383    42154600 :             const __m128i tw_x = _mm_unpacklo_epi8(top_x, weights_x); // byte pairs (above[x], w_x)
     384    42154600 :             const __m128i tw_x_lo = _mm_unpacklo_epi8(tw_x, zero); // widened pairs for columns x..x+3
     385    42154600 :             const __m128i tw_x_hi = _mm_unpackhi_epi8(tw_x, zero); // widened pairs for columns x+4..x+7
     386             : 
     387    42154600 :             __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y); // above * w_y + w_x * left[y]
     388    42154600 :             __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);
     389             : 
     390             :             const __m128i scale_m_weights_x =
     391    84309300 :                 _mm_sub_epi16(scale_value, _mm_unpacklo_epi8(weights_x, zero));
     392    42154600 :             const __m128i swxtr = _mm_mullo_epi16(scale_m_weights_x, top_right); // (scale - w_x) * top_right, kept as 16-bit products
     393    42154600 :             const __m128i swxtr_lo = _mm_unpacklo_epi16(swxtr, zero); // zero-extend the products to 32 bits
     394    42154600 :             const __m128i swxtr_hi = _mm_unpackhi_epi16(swxtr, zero);
     395             : 
     396    42154600 :             pred_lo = _mm_add_epi32(pred_lo, pred_scaled_bl);
     397    42154600 :             pred_hi = _mm_add_epi32(pred_hi, pred_scaled_bl);
     398             : 
     399    42154600 :             pred_lo = _mm_add_epi32(pred_lo, swxtr_lo);
     400    42154600 :             pred_hi = _mm_add_epi32(pred_hi, swxtr_hi);
     401             : 
     402    42154600 :             pred_lo = _mm_srai_epi32(pred_lo, (1 + sm_weight_log2_scale));
     403    84309300 :             pred_hi = _mm_srai_epi32(pred_hi, (1 + sm_weight_log2_scale));
     404             : 
     405    42154600 :             __m128i pred = _mm_packus_epi16(pred_lo, pred_hi); // the 32-bit results are packed as 16-bit lanes, so each result lands in an even byte
     406    42154600 :             pred = _mm_shuffle_epi8(pred, gat); // compact those even bytes into 8 contiguous bytes
     407    42154600 :             _mm_storel_epi64((__m128i *)(dst + x), pred);
     408             :         }
     409    15060800 :         dst += stride;
     410             :     }
     411      870553 : }
     412             : // 16x4 SMOOTH predictor: forwards to smooth_predictor_wxh with bw = 16, bh = 4.
     413       27620 : void eb_aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
     414             :     const uint8_t *above,
     415             :     const uint8_t *left) {
     416       27620 :     smooth_predictor_wxh(dst, stride, above, left, 16, 4);
     417       27620 : }
     418             : // 16x8 SMOOTH predictor: forwards to smooth_predictor_wxh with bw = 16, bh = 8.
     419      273507 : void eb_aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
     420             :     const uint8_t *above,
     421             :     const uint8_t *left) {
     422      273507 :     smooth_predictor_wxh(dst, stride, above, left, 16, 8);
     423      273507 : }
     424             : // 16x16 SMOOTH predictor: forwards to smooth_predictor_wxh with bw = 16, bh = 16.
     425      203771 : void eb_aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
     426             :     const uint8_t *above,
     427             :     const uint8_t *left) {
     428      203771 :     smooth_predictor_wxh(dst, stride, above, left, 16, 16);
     429      203772 : }
     430             : // 16x32 SMOOTH predictor: forwards to smooth_predictor_wxh with bw = 16, bh = 32.
     431      122161 : void eb_aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
     432             :     const uint8_t *above,
     433             :     const uint8_t *left) {
     434      122161 :     smooth_predictor_wxh(dst, stride, above, left, 16, 32);
     435      122161 : }
     436             : // 32x8 SMOOTH predictor: forwards to smooth_predictor_wxh with bw = 32, bh = 8.
     437       24708 : void eb_aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
     438             :     const uint8_t *above,
     439             :     const uint8_t *left) {
     440       24708 :     smooth_predictor_wxh(dst, stride, above, left, 32, 8);
     441       24708 : }
     442             : // 32x16 SMOOTH predictor: forwards to smooth_predictor_wxh with bw = 32, bh = 16.
     443      116297 : void eb_aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
     444             :     const uint8_t *above,
     445             :     const uint8_t *left) {
     446      116297 :     smooth_predictor_wxh(dst, stride, above, left, 32, 16);
     447      116297 : }
     448             : // 32x32 SMOOTH predictor: forwards to smooth_predictor_wxh with bw = 32, bh = 32.
     449       89259 : void eb_aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
     450             :     const uint8_t *above,
     451             :     const uint8_t *left) {
     452       89259 :     smooth_predictor_wxh(dst, stride, above, left, 32, 32);
     453       89259 : }
     454             : // 32x64 SMOOTH predictor: forwards to smooth_predictor_wxh with bw = 32, bh = 64.
     455        2079 : void eb_aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
     456             :     const uint8_t *above,
     457             :     const uint8_t *left) {
     458        2079 :     smooth_predictor_wxh(dst, stride, above, left, 32, 64);
     459        2079 : }
     460             : // 64x64 SMOOTH predictor: forwards to smooth_predictor_wxh with bw = 64, bh = 64.
     461        3720 : void eb_aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
     462             :     const uint8_t *above,
     463             :     const uint8_t *left) {
     464        3720 :     smooth_predictor_wxh(dst, stride, above, left, 64, 64);
     465        3720 : }
     466             : // 64x32 SMOOTH predictor: forwards to smooth_predictor_wxh with bw = 64, bh = 32.
     467        1569 : void eb_aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
     468             :     const uint8_t *above,
     469             :     const uint8_t *left) {
     470        1569 :     smooth_predictor_wxh(dst, stride, above, left, 64, 32);
     471        1569 : }
     472             : // 64x16 SMOOTH predictor: forwards to smooth_predictor_wxh with bw = 64, bh = 16.
     473        2438 : void eb_aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
     474             :     const uint8_t *above,
     475             :     const uint8_t *left) {
     476        2438 :     smooth_predictor_wxh(dst, stride, above, left, 64, 16);
     477        2438 : }
     478             : // 16x64 SMOOTH predictor: forwards to smooth_predictor_wxh with bw = 16, bh = 64.
     479        3436 : void eb_aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
     480             :     const uint8_t *above,
     481             :     const uint8_t *left) {
     482        3436 :     smooth_predictor_wxh(dst, stride, above, left, 16, 64);
     483        3436 : }
     484             : 
     485             : // -----------------------------------------------------------------------------
     486             : // SMOOTH_V_PRED
     487             : 
     488             : // pixels[0]: above and below_pred interleave vector: 16-bit pairs (above[x], left[height - 1]) for x = 0..3
     489      197579 : static INLINE void load_pixel_v_w4(const uint8_t *above, const uint8_t *left,
     490             :     int32_t height, __m128i *pixels) {
     491      197579 :     const __m128i zero = _mm_setzero_si128();
     492      197579 :     __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]); // the 4 above pixels, fetched as one 32-bit word
     493      395158 :     const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]); // last left pixel (the below_pred value) in every 16-bit lane
     494      197579 :     d = _mm_unpacklo_epi8(d, zero); // widen the above bytes to 16 bits
     495      197579 :     pixels[0] = _mm_unpacklo_epi16(d, bp);
     496      197579 : }
     497             : 
     498             : // weights[0]: weights_h vector (row weights widened to 16 bits); weights[2] holds rows 8..15 when height is neither 4 nor 8
     499             : // weights[1]: scale - weights_h vector, with scale = 1 << sm_weight_log2_scale; weights[3] is the matching vector for weights[2]
     500      197580 : static INLINE void load_weight_v_w4(const uint8_t *weight_array, int32_t height,
     501             :     __m128i *weights) {
     502      197580 :     const __m128i zero = _mm_setzero_si128();
     503      197580 :     const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
     504             : 
     505      197580 :     if (height == 4) {
     506             :         const __m128i weight =
     507      268940 :             _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]); // bytes 4..7 of weight_array
     508      134470 :         weights[0] = _mm_unpacklo_epi8(weight, zero);
     509      268940 :         weights[1] = _mm_sub_epi16(d, weights[0]);
     510             :     }
     511       63110 :     else if (height == 8) {
     512       85340 :         const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]); // bytes 8..15 of weight_array
     513       42670 :         weights[0] = _mm_unpacklo_epi8(weight, zero);
     514       85340 :         weights[1] = _mm_sub_epi16(d, weights[0]);
     515             :     }
     516             :     else { // any other height: 16 row weights from weight_array[16..31], so weights must have room for 4 vectors
     517       40880 :         const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
     518       20440 :         weights[0] = _mm_unpacklo_epi8(weight, zero);
     519       20440 :         weights[1] = _mm_sub_epi16(d, weights[0]);
     520       20440 :         weights[2] = _mm_unpackhi_epi8(weight, zero);
     521       40880 :         weights[3] = _mm_sub_epi16(d, weights[2]);
     522             :     }
     523      197580 : }
     524             : 
     525      218021 : static INLINE void smooth_v_pred_4xh(const __m128i *pixel,
     526             :     const __m128i *weight, int32_t h, uint8_t *dst,
     527             :     ptrdiff_t stride) {
     528      436042 :     const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
     529      218021 :     const __m128i inc = _mm_set1_epi16(0x202);
     530      218021 :     const __m128i gat = _mm_set1_epi32(0xc080400);
     531      218021 :     __m128i d = _mm_set1_epi16(0x100);
     532             : 
     533     1424300 :     for (int32_t i = 0; i < h; ++i) {
     534     1206270 :         const __m128i wg_wg = _mm_shuffle_epi8(weight[0], d);
     535     2412550 :         const __m128i sc_sc = _mm_shuffle_epi8(weight[1], d);
     536     1206270 :         const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
     537     2412550 :         __m128i sum = _mm_madd_epi16(pixel[0], wh_sc);
     538     1206270 :         sum = _mm_add_epi32(sum, pred_round);
     539     2412550 :         sum = _mm_srai_epi32(sum, sm_weight_log2_scale);
     540     1206270 :         sum = _mm_shuffle_epi8(sum, gat);
     541     1206270 :         *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
     542     1206270 :         dst += stride;
     543     1206270 :         d = _mm_add_epi16(d, inc);
     544             :     }
     545      218021 : }
     546             : 
     547      134469 : void eb_aom_smooth_v_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
     548             :     const uint8_t *above,
     549             :     const uint8_t *left) {
     550             :     __m128i pixels;
     551      134469 :     load_pixel_v_w4(above, left, 4, &pixels);
     552             : 
     553             :     __m128i weights[2];
     554      134470 :     load_weight_v_w4(sm_weight_arrays, 4, weights);
     555             : 
     556      134471 :     smooth_v_pred_4xh(&pixels, weights, 4, dst, stride);
     557      134471 : }
     558             : 
     559       42670 : void eb_aom_smooth_v_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
     560             :     const uint8_t *above,
     561             :     const uint8_t *left) {
     562             :     __m128i pixels;
     563       42670 :     load_pixel_v_w4(above, left, 8, &pixels);
     564             : 
     565             :     __m128i weights[2];
     566       42670 :     load_weight_v_w4(sm_weight_arrays, 8, weights);
     567             : 
     568       42670 :     smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
     569       42670 : }
     570             : 
     571       20440 : void eb_aom_smooth_v_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
     572             :     const uint8_t *above,
     573             :     const uint8_t *left) {
     574             :     __m128i pixels;
     575       20440 :     load_pixel_v_w4(above, left, 16, &pixels);
     576             : 
     577             :     __m128i weights[4];
     578       20440 :     load_weight_v_w4(sm_weight_arrays, 16, weights);
     579             : 
     580       20440 :     smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
     581       20440 :     dst += stride << 3;
     582       20440 :     smooth_v_pred_4xh(&pixels, &weights[2], 8, dst, stride);
     583       20440 : }
     584             : 
     585             : // pixels[0]: above and below_pred interleave vector, first half
     586             : // pixels[1]: above and below_pred interleave vector, second half
     587      133577 : static INLINE void load_pixel_v_w8(const uint8_t *above, const uint8_t *left,
     588             :     int32_t height, __m128i *pixels) {
     589      133577 :     const __m128i zero = _mm_setzero_si128();
     590      133577 :     __m128i d = _mm_loadl_epi64((const __m128i *)above);
     591      267154 :     const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
     592      133577 :     d = _mm_unpacklo_epi8(d, zero);
     593      133577 :     pixels[0] = _mm_unpacklo_epi16(d, bp);
     594      133577 :     pixels[1] = _mm_unpackhi_epi16(d, bp);
     595      133577 : }
     596             : 
     597             : // weight_h[0]: weight_h vector
     598             : // weight_h[1]: scale - weight_h vector
     599             : // weight_h[2]: same as [0], offset 8
     600             : // weight_h[3]: same as [1], offset 8
     601             : // weight_h[4]: same as [0], offset 16
     602             : // weight_h[5]: same as [1], offset 16
     603             : // weight_h[6]: same as [0], offset 24
     604             : // weight_h[7]: same as [1], offset 24
     605      133577 : static INLINE void load_weight_v_w8(const uint8_t *weight_array, int32_t height,
     606             :     __m128i *weight_h) {
     607      133577 :     const __m128i zero = _mm_setzero_si128();
     608      133577 :     const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
     609             : 
     610      133577 :     if (height < 16) {
     611      104335 :         const int32_t offset = height < 8 ? 4 : 8;
     612             :         const __m128i weight =
     613      208670 :             _mm_loadu_si128((const __m128i *)&weight_array[offset]);
     614      104335 :         weight_h[0] = _mm_unpacklo_epi8(weight, zero);
     615      208670 :         weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
     616             :     }
     617       29242 :     else if (height == 16) {
     618       31926 :         const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
     619       15963 :         weight_h[0] = _mm_unpacklo_epi8(weight, zero);
     620       15963 :         weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
     621       15963 :         weight_h[2] = _mm_unpackhi_epi8(weight, zero);
     622       31926 :         weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
     623             :     }
     624             :     else {
     625             :         const __m128i weight_lo =
     626       26558 :             _mm_loadu_si128((const __m128i *)&weight_array[32]);
     627       13279 :         weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
     628       13279 :         weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
     629       13279 :         weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
     630       13279 :         weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
     631             :         const __m128i weight_hi =
     632       13279 :             _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
     633       13279 :         weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
     634       13279 :         weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
     635       13279 :         weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
     636       26558 :         weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
     637             :     }
     638      133577 : }
     639             : 
     640      189379 : static INLINE void smooth_v_pred_8xh(const __m128i *pixels, const __m128i *wh,
     641             :     int32_t h, uint8_t *dst, ptrdiff_t stride) {
     642      378758 :     const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
     643      189379 :     const __m128i inc = _mm_set1_epi16(0x202);
     644      189379 :     const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
     645      189379 :     __m128i d = _mm_set1_epi16(0x100);
     646             : 
     647     1541810 :     for (int32_t i = 0; i < h; ++i) {
     648     1352430 :         const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
     649     2704860 :         const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
     650     1352430 :         const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
     651     1352430 :         __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
     652     2704860 :         __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);
     653             : 
     654     1352430 :         s0 = _mm_add_epi32(s0, pred_round);
     655     2704860 :         s0 = _mm_srai_epi32(s0, sm_weight_log2_scale);
     656             : 
     657     1352430 :         s1 = _mm_add_epi32(s1, pred_round);
     658     2704860 :         s1 = _mm_srai_epi32(s1, sm_weight_log2_scale);
     659             : 
     660     1352430 :         __m128i sum01 = _mm_packus_epi16(s0, s1);
     661     1352430 :         sum01 = _mm_shuffle_epi8(sum01, gat);
     662     1352430 :         _mm_storel_epi64((__m128i *)dst, sum01);
     663     1352430 :         dst += stride;
     664             : 
     665     1352430 :         d = _mm_add_epi16(d, inc);
     666             :     }
     667      189379 : }
     668             : 
     669       40646 : void eb_aom_smooth_v_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
     670             :     const uint8_t *above,
     671             :     const uint8_t *left) {
     672             :     __m128i pixels[2];
     673       40646 :     load_pixel_v_w8(above, left, 4, pixels);
     674             : 
     675             :     __m128i wh[2];
     676       40646 :     load_weight_v_w8(sm_weight_arrays, 4, wh);
     677             : 
     678       40646 :     smooth_v_pred_8xh(pixels, wh, 4, dst, stride);
     679       40646 : }
     680             : 
     681       63689 : void eb_aom_smooth_v_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
     682             :     const uint8_t *above,
     683             :     const uint8_t *left) {
     684             :     __m128i pixels[2];
     685       63689 :     load_pixel_v_w8(above, left, 8, pixels);
     686             : 
     687             :     __m128i wh[2];
     688       63689 :     load_weight_v_w8(sm_weight_arrays, 8, wh);
     689             : 
     690       63689 :     smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
     691       63689 : }
     692             : 
     693       15963 : void eb_aom_smooth_v_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
     694             :     const uint8_t *above,
     695             :     const uint8_t *left) {
     696             :     __m128i pixels[2];
     697       15963 :     load_pixel_v_w8(above, left, 16, pixels);
     698             : 
     699             :     __m128i wh[4];
     700       15963 :     load_weight_v_w8(sm_weight_arrays, 16, wh);
     701             : 
     702       15963 :     smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
     703       15963 :     dst += stride << 3;
     704       15963 :     smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
     705       15963 : }
     706             : 
     707       13280 : void eb_aom_smooth_v_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
     708             :     const uint8_t *above,
     709             :     const uint8_t *left) {
     710             :     __m128i pixels[2];
     711       13280 :     load_pixel_v_w8(above, left, 32, pixels);
     712             : 
     713             :     __m128i wh[8];
     714       13280 :     load_weight_v_w8(sm_weight_arrays, 32, wh);
     715             : 
     716       13280 :     smooth_v_pred_8xh(pixels, &wh[0], 8, dst, stride);
     717       13280 :     dst += stride << 3;
     718       13280 :     smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
     719       13280 :     dst += stride << 3;
     720       13280 :     smooth_v_pred_8xh(pixels, &wh[4], 8, dst, stride);
     721       13280 :     dst += stride << 3;
     722       13280 :     smooth_v_pred_8xh(pixels, &wh[6], 8, dst, stride);
     723       13280 : }
     724             : 
     725      136903 : static INLINE void smooth_v_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
     726             :     const uint8_t *above,
     727             :     const uint8_t *left, uint32_t bw,
     728             :     uint32_t bh) {
     729      136903 :     const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
     730      136903 :     const __m128i zero = _mm_setzero_si128();
     731             :     const __m128i scale_value =
     732      273806 :         _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
     733      136903 :     const __m128i dup16 = _mm_set1_epi32(0x01000100);
     734             :     const __m128i bottom_left =
     735      410709 :         _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)left[bh - 1]), dup16);
     736      136903 :     const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
     737             :     const __m128i round =
     738      136903 :         _mm_set1_epi32((uint16_t)(1 << (sm_weight_log2_scale - 1)));
     739             : 
     740     2718280 :     for (uint32_t y = 0; y < bh; ++y) {
     741     5162740 :         const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
     742             :         const __m128i scale_m_weights_y =
     743     5162740 :             _mm_shuffle_epi8(_mm_sub_epi16(scale_value, weights_y), dup16);
     744     2581370 :         const __m128i wl_y =
     745     2581370 :             _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, bottom_left), 0);
     746             : 
     747    11659000 :         for (uint32_t x = 0; x < bw; x += 8) {
     748    18155200 :             const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
     749             :             // 8 -> 16
     750     9077590 :             const __m128i tw_x = _mm_unpacklo_epi8(top_x, zero);
     751     9077590 :             const __m128i tw_x_lo = _mm_unpacklo_epi16(tw_x, scale_m_weights_y);
     752     9077590 :             const __m128i tw_x_hi = _mm_unpackhi_epi16(tw_x, scale_m_weights_y);
     753             :             // top_x * weights_y + scale_m_weights_y * bottom_left
     754     9077590 :             __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
     755     9077590 :             __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);
     756             : 
     757     9077590 :             pred_lo = _mm_add_epi32(pred_lo, round);
     758     9077590 :             pred_hi = _mm_add_epi32(pred_hi, round);
     759     9077590 :             pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
     760    18155200 :             pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);
     761             : 
     762     9077590 :             __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
     763     9077590 :             pred = _mm_shuffle_epi8(pred, gat);
     764     9077590 :             _mm_storel_epi64((__m128i *)(dst + x), pred);
     765             :         }
     766     2581370 :         dst += stride;
     767             :     }
     768      136903 : }
     769             : 
     // SMOOTH_V predictor for a 16-wide, 4-high block; forwards to the generic
     // width/height kernel.
     void eb_aom_smooth_v_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
         const uint8_t *above,
         const uint8_t *left) {
         const uint32_t block_w = 16;
         const uint32_t block_h = 4;
         smooth_v_predictor_wxh(dst, stride, above, left, block_w, block_h);
     }
     775             : 
     // SMOOTH_V predictor for a 16-wide, 8-high block; forwards to the generic
     // width/height kernel.
     void eb_aom_smooth_v_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
         const uint8_t *above,
         const uint8_t *left) {
         const uint32_t block_w = 16;
         const uint32_t block_h = 8;
         smooth_v_predictor_wxh(dst, stride, above, left, block_w, block_h);
     }
     781             : 
     // SMOOTH_V predictor for a 16-wide, 16-high block; forwards to the generic
     // width/height kernel.
     void eb_aom_smooth_v_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
         const uint8_t *above,
         const uint8_t *left) {
         const uint32_t block_w = 16;
         const uint32_t block_h = 16;
         smooth_v_predictor_wxh(dst, stride, above, left, block_w, block_h);
     }
     787             : 
     // SMOOTH_V predictor for a 16-wide, 32-high block; forwards to the generic
     // width/height kernel.
     void eb_aom_smooth_v_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
         const uint8_t *above,
         const uint8_t *left) {
         const uint32_t block_w = 16;
         const uint32_t block_h = 32;
         smooth_v_predictor_wxh(dst, stride, above, left, block_w, block_h);
     }
     793             : 
     // SMOOTH_V predictor for a 32-wide, 8-high block; forwards to the generic
     // width/height kernel.
     void eb_aom_smooth_v_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
         const uint8_t *above,
         const uint8_t *left) {
         const uint32_t block_w = 32;
         const uint32_t block_h = 8;
         smooth_v_predictor_wxh(dst, stride, above, left, block_w, block_h);
     }
     799             : 
     // SMOOTH_V predictor for a 32-wide, 16-high block; forwards to the generic
     // width/height kernel.
     void eb_aom_smooth_v_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
         const uint8_t *above,
         const uint8_t *left) {
         const uint32_t block_w = 32;
         const uint32_t block_h = 16;
         smooth_v_predictor_wxh(dst, stride, above, left, block_w, block_h);
     }
     805             : 
     // SMOOTH_V predictor for a 32-wide, 32-high block; forwards to the generic
     // width/height kernel.
     void eb_aom_smooth_v_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
         const uint8_t *above,
         const uint8_t *left) {
         const uint32_t block_w = 32;
         const uint32_t block_h = 32;
         smooth_v_predictor_wxh(dst, stride, above, left, block_w, block_h);
     }
     811             : 
     // SMOOTH_V predictor for a 32-wide, 64-high block; forwards to the generic
     // width/height kernel.
     void eb_aom_smooth_v_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
         const uint8_t *above,
         const uint8_t *left) {
         const uint32_t block_w = 32;
         const uint32_t block_h = 64;
         smooth_v_predictor_wxh(dst, stride, above, left, block_w, block_h);
     }
     817             : 
     // SMOOTH_V predictor for a 64-wide, 64-high block; forwards to the generic
     // width/height kernel.
     void eb_aom_smooth_v_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
         const uint8_t *above,
         const uint8_t *left) {
         const uint32_t block_w = 64;
         const uint32_t block_h = 64;
         smooth_v_predictor_wxh(dst, stride, above, left, block_w, block_h);
     }
     823             : 
     // SMOOTH_V predictor for a 64-wide, 32-high block; forwards to the generic
     // width/height kernel.
     void eb_aom_smooth_v_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
         const uint8_t *above,
         const uint8_t *left) {
         const uint32_t block_w = 64;
         const uint32_t block_h = 32;
         smooth_v_predictor_wxh(dst, stride, above, left, block_w, block_h);
     }
     829             : 
     // SMOOTH_V predictor for a 64-wide, 16-high block; forwards to the generic
     // width/height kernel.
     void eb_aom_smooth_v_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
         const uint8_t *above,
         const uint8_t *left) {
         const uint32_t block_w = 64;
         const uint32_t block_h = 16;
         smooth_v_predictor_wxh(dst, stride, above, left, block_w, block_h);
     }
     835             : 
     // SMOOTH_V predictor for a 16-wide, 64-high block; forwards to the generic
     // width/height kernel.
     void eb_aom_smooth_v_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
         const uint8_t *above,
         const uint8_t *left) {
         const uint32_t block_w = 16;
         const uint32_t block_h = 64;
         smooth_v_predictor_wxh(dst, stride, above, left, block_w, block_h);
     }
     841             : 
     842             : // -----------------------------------------------------------------------------
     843             : // SMOOTH_H_PRED
     844             : 
     845             : // pixels[0]: left vector
     846             : // pixels[1]: right_pred vector
     847      216365 : static INLINE void load_pixel_h_w4(const uint8_t *above, const uint8_t *left,
     848             :     int32_t height, __m128i *pixels) {
     849      216365 :     if (height == 4)
     850      299778 :         pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
     851       66476 :     else if (height == 8)
     852       44486 :         pixels[0] = _mm_loadl_epi64(((const __m128i *)left));
     853             :     else
     854       21990 :         pixels[0] = _mm_loadu_si128(((const __m128i *)left));
     855      216365 :     pixels[1] = _mm_set1_epi16((uint16_t)above[3]);
     856      216365 : }
     857             : 
     858             : // weights[0]: weights_w and scale - weights_w interleave vector
     859      216365 : static INLINE void load_weight_h_w4(const uint8_t *weight_array, int32_t height,
     860             :     __m128i *weights) {
     861             :     (void)height;
     862      432730 :     const __m128i t = _mm_loadu_si128((const __m128i *)&weight_array[4]);
     863      216365 :     const __m128i zero = _mm_setzero_si128();
     864             : 
     865      216365 :     const __m128i weights_0 = _mm_unpacklo_epi8(t, zero);
     866      432730 :     const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
     867      216365 :     const __m128i weights_1 = _mm_sub_epi16(d, weights_0);
     868      216365 :     weights[0] = _mm_unpacklo_epi16(weights_0, weights_1);
     869      216365 : }
     870             : 
     871      238356 : static INLINE void smooth_h_pred_4xh(const __m128i *pixel,
     872             :     const __m128i *weight, int32_t h, uint8_t *dst,
     873             :     ptrdiff_t stride) {
     874      476712 :     const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
     875      238356 :     const __m128i one = _mm_set1_epi16(1);
     876      238356 :     const __m128i gat = _mm_set1_epi32(0xc080400);
     877      238356 :     __m128i rep = _mm_set1_epi16((short)0x8000);
     878             : 
     879     1545630 :     for (int32_t i = 0; i < h; ++i) {
     880     1307280 :         __m128i b = _mm_shuffle_epi8(pixel[0], rep);
     881     1307280 :         b = _mm_unpacklo_epi16(b, pixel[1]);
     882     2614550 :         __m128i sum = _mm_madd_epi16(b, weight[0]);
     883             : 
     884     1307280 :         sum = _mm_add_epi32(sum, pred_round);
     885     2614550 :         sum = _mm_srai_epi32(sum, sm_weight_log2_scale);
     886             : 
     887     1307280 :         sum = _mm_shuffle_epi8(sum, gat);
     888     1307280 :         *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
     889     1307280 :         dst += stride;
     890             : 
     891     1307280 :         rep = _mm_add_epi16(rep, one);
     892             :     }
     893      238356 : }
     894             : 
     895      149889 : void eb_aom_smooth_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
     896             :     const uint8_t *above,
     897             :     const uint8_t *left) {
     898             :     __m128i pixels[2];
     899      149889 :     load_pixel_h_w4(above, left, 4, pixels);
     900             : 
     901             :     __m128i weights;
     902      149889 :     load_weight_h_w4(sm_weight_arrays, 4, &weights);
     903             : 
     904      149889 :     smooth_h_pred_4xh(pixels, &weights, 4, dst, stride);
     905      149889 : }
     906             : 
     907       44486 : void eb_aom_smooth_h_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
     908             :     const uint8_t *above,
     909             :     const uint8_t *left) {
     910             :     __m128i pixels[2];
     911       44486 :     load_pixel_h_w4(above, left, 8, pixels);
     912             : 
     913             :     __m128i weights;
     914       44486 :     load_weight_h_w4(sm_weight_arrays, 8, &weights);
     915             : 
     916       44486 :     smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
     917       44486 : }
     918             : 
     919       21991 : void eb_aom_smooth_h_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
     920             :     const uint8_t *above,
     921             :     const uint8_t *left) {
     922             :     __m128i pixels[2];
     923       21991 :     load_pixel_h_w4(above, left, 16, pixels);
     924             : 
     925             :     __m128i weights;
     926       21991 :     load_weight_h_w4(sm_weight_arrays, 8, &weights);
     927             : 
     928       21991 :     smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
     929       21991 :     dst += stride << 3;
     930             : 
     931       21991 :     pixels[0] = _mm_srli_si128(pixels[0], 8);
     932       21991 :     smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
     933       21991 : }
     934             : 
     935             : // pixels[0]: left vector
     936             : // pixels[1]: right_pred vector
     937             : // pixels[2]: left vector + 16
     938             : // pixels[3]: right_pred vector
     939      151179 : static INLINE void load_pixel_h_w8(const uint8_t *above, const uint8_t *left,
     940             :     int32_t height, __m128i *pixels) {
     941      151179 :     pixels[1] = _mm_set1_epi16((uint16_t)above[7]);
     942             : 
     943      151179 :     if (height == 4)
     944       83884 :         pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
     945      109237 :     else if (height == 8)
     946       70118 :         pixels[0] = _mm_loadl_epi64((const __m128i *)left);
     947       39119 :     else if (height == 16)
     948       16968 :         pixels[0] = _mm_load_si128((const __m128i *)left);
     949             :     else {
     950       22151 :         pixels[0] = _mm_load_si128((const __m128i *)left);
     951       22151 :         pixels[2] = _mm_load_si128((const __m128i *)(left + 16));
     952       22151 :         pixels[3] = pixels[1];
     953             :     }
     954      151179 : }
     955             : 
     956             : // weight_w[0]: weights_w and scale - weights_w interleave vector, first half
     957             : // weight_w[1]: weights_w and scale - weights_w interleave vector, second half
     958      151180 : static INLINE void load_weight_h_w8(const uint8_t *weight_array, int32_t height,
     959             :     __m128i *weight_w) {
     960             :     (void)height;
     961      151180 :     const __m128i zero = _mm_setzero_si128();
     962      151180 :     const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
     963      302360 :     const __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[8]);
     964      151180 :     const __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
     965      151180 :     const __m128i tmp2 = _mm_sub_epi16(d, tmp1);
     966      151180 :     weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
     967      151180 :     weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
     968      151180 : }
     969             : 
     970      234604 : static INLINE void smooth_h_pred_8xh(const __m128i *pixels, const __m128i *ww,
     971             :     int32_t h, uint8_t *dst, ptrdiff_t stride,
     972             :     int32_t second_half) {
     973      469208 :     const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
     974      234604 :     const __m128i one = _mm_set1_epi16(1);
     975      234604 :     const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
     976      469208 :     __m128i rep = second_half ? _mm_set1_epi16((short)0x8008) : _mm_set1_epi16((short)0x8000);
     977             : 
     978     1943610 :     for (int32_t i = 0; i < h; ++i) {
     979     1709000 :         __m128i b = _mm_shuffle_epi8(pixels[0], rep);
     980     1709000 :         b = _mm_unpacklo_epi16(b, pixels[1]);
     981     1709000 :         __m128i sum0 = _mm_madd_epi16(b, ww[0]);
     982     3418010 :         __m128i sum1 = _mm_madd_epi16(b, ww[1]);
     983             : 
     984     1709000 :         sum0 = _mm_add_epi32(sum0, pred_round);
     985     3418010 :         sum0 = _mm_srai_epi32(sum0, sm_weight_log2_scale);
     986             : 
     987     1709000 :         sum1 = _mm_add_epi32(sum1, pred_round);
     988     3418010 :         sum1 = _mm_srai_epi32(sum1, sm_weight_log2_scale);
     989             : 
     990     1709000 :         sum0 = _mm_packus_epi16(sum0, sum1);
     991     1709000 :         sum0 = _mm_shuffle_epi8(sum0, gat);
     992     1709000 :         _mm_storel_epi64((__m128i *)dst, sum0);
     993     1709000 :         dst += stride;
     994             : 
     995     1709000 :         rep = _mm_add_epi16(rep, one);
     996             :     }
     997      234604 : }
     998             : 
     999       41942 : void eb_aom_smooth_h_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
    1000             :     const uint8_t *above,
    1001             :     const uint8_t *left) {
    1002             :     __m128i pixels[2];
    1003       41942 :     load_pixel_h_w8(above, left, 4, pixels);
    1004             : 
    1005             :     __m128i ww[2];
    1006       41942 :     load_weight_h_w8(sm_weight_arrays, 4, ww);
    1007             : 
    1008       41942 :     smooth_h_pred_8xh(pixels, ww, 4, dst, stride, 0);
    1009       41942 : }
    1010             : 
    1011       70118 : void eb_aom_smooth_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
    1012             :     const uint8_t *above,
    1013             :     const uint8_t *left) {
    1014             :     __m128i pixels[2];
    1015       70118 :     load_pixel_h_w8(above, left, 8, pixels);
    1016             : 
    1017             :     __m128i ww[2];
    1018       70118 :     load_weight_h_w8(sm_weight_arrays, 8, ww);
    1019             : 
    1020       70118 :     smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
    1021       70118 : }
    1022             : 
    1023       16968 : void eb_aom_smooth_h_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
    1024             :     const uint8_t *above,
    1025             :     const uint8_t *left) {
    1026             :     __m128i pixels[2];
    1027       16968 :     load_pixel_h_w8(above, left, 16, pixels);
    1028             : 
    1029             :     __m128i ww[2];
    1030       16968 :     load_weight_h_w8(sm_weight_arrays, 16, ww);
    1031             : 
    1032       16968 :     smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
    1033       16968 :     dst += stride << 3;
    1034       16968 :     smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 1);
    1035       16968 : }
    1036             : 
    1037       22152 : void eb_aom_smooth_h_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
    1038             :     const uint8_t *above,
    1039             :     const uint8_t *left) {
    1040             :     __m128i pixels[4];
    1041       22152 :     load_pixel_h_w8(above, left, 32, pixels);
    1042             : 
    1043             :     __m128i ww[2];
    1044       22152 :     load_weight_h_w8(sm_weight_arrays, 32, ww);
    1045             : 
    1046       22152 :     smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 0);
    1047       22152 :     dst += stride << 3;
    1048       22152 :     smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 1);
    1049       22152 :     dst += stride << 3;
    1050       22152 :     smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 0);
    1051       22152 :     dst += stride << 3;
    1052       22152 :     smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 1);
    1053       22152 : }
    1054             : 
    1055      153675 : static INLINE void smooth_h_predictor_wxh(uint8_t *dst, ptrdiff_t stride,  // Generic horizontal-smooth predictor: writes bh rows of bw pixels to dst, 8 pixels per inner step
    1056             :     const uint8_t *above,  // only above[bw - 1] is read
    1057             :     const uint8_t *left, uint32_t bw,  // left[y] is read once per output row; bw is stepped in units of 8
    1058             :     uint32_t bh) {
    1059      153675 :     const uint8_t *const sm_weights_w = sm_weight_arrays + bw;  // weights for this width: the table is offset by block size
    1060      153675 :     const __m128i zero = _mm_setzero_si128();
    1061             :     const __m128i scale_value =  // 1 << sm_weight_log2_scale (256) in every 16-bit lane
    1062      153675 :         _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
    1063      307350 :     const __m128i top_right = _mm_cvtsi32_si128((uint32_t)above[bw - 1]);  // above[bw - 1] in the low lane, rest zero
    1064      153675 :     const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);  // byte-shuffle mask: low 8 bytes select source bytes 0,2,4,...,14
    1065      153675 :     const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));  // rounding term, half of the weight scale (128)
    1066             : 
    1067     3044320 :     for (uint32_t y = 0; y < bh; ++y) {
    1068     5781280 :         const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
    1069     2890640 :         const __m128i tr_ly =  // every 32-bit lane = (low 16 bits: top_right, high 16 bits: left[y])
    1070     2890640 :             _mm_shuffle_epi32(_mm_unpacklo_epi16(top_right, left_y), 0);
    1071             : 
    1072    12781400 :         for (uint32_t x = 0; x < bw; x += 8) {  // 8 output pixels per iteration
    1073             :             const __m128i weights_x =  // 8 weight bytes for columns x..x+7
    1074    19781400 :                 _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
    1075     9890720 :             const __m128i weights_xw = _mm_unpacklo_epi8(weights_x, zero);  // widen weights to 16 bits
    1076     9890720 :             const __m128i scale_m_weights_x = _mm_sub_epi16(scale_value, weights_xw);  // 256 - w
    1077     9890720 :             const __m128i wx_lo = _mm_unpacklo_epi16(scale_m_weights_x, weights_xw);  // (256 - w, w) pairs for columns x..x+3
    1078     9890720 :             const __m128i wx_hi = _mm_unpackhi_epi16(scale_m_weights_x, weights_xw);  // (256 - w, w) pairs for columns x+4..x+7
    1079     9890720 :             __m128i pred_lo = _mm_madd_epi16(wx_lo, tr_ly);  // (256 - w) * top_right + w * left[y], as 32-bit sums
    1080     9890720 :             __m128i pred_hi = _mm_madd_epi16(wx_hi, tr_ly);
    1081             : 
    1082     9890720 :             pred_lo = _mm_add_epi32(pred_lo, pred_round);  // round to nearest before the shift
    1083     9890720 :             pred_hi = _mm_add_epi32(pred_hi, pred_round);
    1084             : 
    1085     9890720 :             pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);  // divide by the weight scale (256)
    1086    19781400 :             pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);
    1087             : 
    1088     9890720 :             __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);  // 32-bit results are <= 255, so each pixel lands in an even byte with a zero byte after it
    1089     9890720 :             pred = _mm_shuffle_epi8(pred, gat);  // compact the even bytes into the low 8 bytes
    1090     9890720 :             _mm_storel_epi64((__m128i *)(dst + x), pred);  // store 8 pixels
    1091             :         }
    1092     2890640 :         dst += stride;  // next output row
    1093             :     }
    1094      153675 : }
    1095             : 
    1096       24652 : void eb_aom_smooth_h_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,  // Horizontal-smooth predictor entry point for a 16-wide, 4-tall block
    1097             :     const uint8_t *above,
    1098             :     const uint8_t *left) {
    1099       24652 :     smooth_h_predictor_wxh(dst, stride, above, left, 16, 4);  // bw = 16, bh = 4
    1100       24652 : }
    1101             : 
    1102       18280 : void eb_aom_smooth_h_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,  // Horizontal-smooth predictor entry point for a 16-wide, 8-tall block
    1103             :     const uint8_t *above,
    1104             :     const uint8_t *left) {
    1105       18280 :     smooth_h_predictor_wxh(dst, stride, above, left, 16, 8);  // bw = 16, bh = 8
    1106       18281 : }
    1107             : 
    1108       36175 : void eb_aom_smooth_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,  // Horizontal-smooth predictor entry point for a 16-wide, 16-tall block
    1109             :     const uint8_t *above,
    1110             :     const uint8_t *left) {
    1111       36175 :     smooth_h_predictor_wxh(dst, stride, above, left, 16, 16);  // bw = 16, bh = 16
    1112       36175 : }
    1113             : 
    1114       14245 : void eb_aom_smooth_h_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,  // Horizontal-smooth predictor entry point for a 16-wide, 32-tall block
    1115             :     const uint8_t *above,
    1116             :     const uint8_t *left) {
    1117       14245 :     smooth_h_predictor_wxh(dst, stride, above, left, 16, 32);  // bw = 16, bh = 32
    1118       14245 : }
    1119             : 
    1120        3259 : void eb_aom_smooth_h_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,  // Horizontal-smooth predictor entry point for a 16-wide, 64-tall block
    1121             :     const uint8_t *above,
    1122             :     const uint8_t *left) {
    1123        3259 :     smooth_h_predictor_wxh(dst, stride, above, left, 16, 64);  // bw = 16, bh = 64
    1124        3259 : }
    1125             : 
    1126       16423 : void eb_aom_smooth_h_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,  // Horizontal-smooth predictor entry point for a 32-wide, 8-tall block
    1127             :     const uint8_t *above,
    1128             :     const uint8_t *left) {
    1129       16423 :     smooth_h_predictor_wxh(dst, stride, above, left, 32, 8);  // bw = 32, bh = 8
    1130       16423 : }
    1131             : 
    1132       10819 : void eb_aom_smooth_h_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,  // Horizontal-smooth predictor entry point for a 32-wide, 16-tall block
    1133             :     const uint8_t *above,
    1134             :     const uint8_t *left) {
    1135       10819 :     smooth_h_predictor_wxh(dst, stride, above, left, 32, 16);  // bw = 32, bh = 16
    1136       10819 : }
    1137             : 
    1138       20095 : void eb_aom_smooth_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,  // Horizontal-smooth predictor entry point for a 32-wide, 32-tall block
    1139             :     const uint8_t *above,
    1140             :     const uint8_t *left) {
    1141       20095 :     smooth_h_predictor_wxh(dst, stride, above, left, 32, 32);  // bw = 32, bh = 32
    1142       20096 : }
    1143             : 
    1144        2011 : void eb_aom_smooth_h_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,  // Horizontal-smooth predictor entry point for a 32-wide, 64-tall block
    1145             :     const uint8_t *above,
    1146             :     const uint8_t *left) {
    1147        2011 :     smooth_h_predictor_wxh(dst, stride, above, left, 32, 64);  // bw = 32, bh = 64
    1148        2011 : }
    1149             : 
    1150        3706 : void eb_aom_smooth_h_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,  // Horizontal-smooth predictor entry point for a 64-wide, 64-tall block
    1151             :     const uint8_t *above,
    1152             :     const uint8_t *left) {
    1153        3706 :     smooth_h_predictor_wxh(dst, stride, above, left, 64, 64);  // bw = 64, bh = 64
    1154        3706 : }
    1155             : 
    1156        1580 : void eb_aom_smooth_h_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,  // Horizontal-smooth predictor entry point for a 64-wide, 32-tall block
    1157             :     const uint8_t *above,
    1158             :     const uint8_t *left) {
    1159        1580 :     smooth_h_predictor_wxh(dst, stride, above, left, 64, 32);  // bw = 64, bh = 32
    1160        1580 : }
    1161             : 
    1162        2429 : void eb_aom_smooth_h_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,  // Horizontal-smooth predictor entry point for a 64-wide, 16-tall block
    1163             :     const uint8_t *above,
    1164             :     const uint8_t *left) {
    1165        2429 :     smooth_h_predictor_wxh(dst, stride, above, left, 64, 16);  // bw = 64, bh = 16
    1166        2429 : }
    1167             : 
    1168           0 : void eb_smooth_v_predictor_all_ssse3(uint8_t *dst, ptrdiff_t stride, int32_t bw,  // Dispatches on bw to the square smooth_v kernel (4x4, 8x8, 16x16, 32x32, 64x64)
    1169             :     int32_t bh, const uint8_t *above,
    1170             :     const uint8_t *left) {
    1171             :     (void)bh;  // NOTE(review): bh is discarded, so a non-square request would get the bw x bw kernel - confirm callers pass square sizes only (this report shows 0 hits)
    1172             : 
    1173           0 :     switch (bw) {
    1174           0 :     case 4:
    1175           0 :         eb_aom_smooth_v_predictor_4x4_ssse3(
    1176             :             dst,
    1177             :             stride,
    1178             :             above,
    1179             :             left
    1180             :         );
    1181           0 :         break;
    1182           0 :     case 8:
    1183           0 :         eb_aom_smooth_v_predictor_8x8_ssse3(
    1184             :             dst,
    1185             :             stride,
    1186             :             above,
    1187             :             left
    1188             :         );
    1189           0 :         break;
    1190           0 :     case 16:
    1191           0 :         eb_aom_smooth_v_predictor_16x16_ssse3(
    1192             :             dst,
    1193             :             stride,
    1194             :             above,
    1195             :             left
    1196             :         );
    1197           0 :         break;
    1198           0 :     case 32:
    1199           0 :         eb_aom_smooth_v_predictor_32x32_ssse3(
    1200             :             dst,
    1201             :             stride,
    1202             :             above,
    1203             :             left
    1204             :         );
    1205           0 :         break;
    1206           0 :     case 64:
    1207           0 :         eb_aom_smooth_v_predictor_64x64_ssse3(
    1208             :             dst,
    1209             :             stride,
    1210             :             above,
    1211             :             left
    1212             :         );
    1213           0 :         break;
    1214           0 :     default:  // any other bw: no kernel is called and dst is left unwritten
    1215             : 
    1216           0 :         break;
    1217             :     }
    1218           0 : }
    1219           0 : void eb_smooth_h_predictor_all_ssse3(uint8_t *dst, ptrdiff_t stride, int32_t bw,  // Dispatches on bw to the square smooth_h kernel (4x4, 8x8, 16x16, 32x32, 64x64)
    1220             :     int32_t bh, const uint8_t *above,
    1221             :     const uint8_t *left) {
    1222             :     (void)bh;  // NOTE(review): bh is discarded, so a non-square request would get the bw x bw kernel - confirm callers pass square sizes only (this report shows 0 hits)
    1223             :     // Only the width selects the kernel below.
    1224           0 :     switch (bw) {
    1225           0 :     case 4:
    1226           0 :         eb_aom_smooth_h_predictor_4x4_ssse3(
    1227             :             dst,
    1228             :             stride,
    1229             :             above,
    1230             :             left
    1231             :         );
    1232           0 :         break;
    1233           0 :     case 8:
    1234           0 :         eb_aom_smooth_h_predictor_8x8_ssse3(
    1235             :             dst,
    1236             :             stride,
    1237             :             above,
    1238             :             left
    1239             :         );
    1240           0 :         break;
    1241           0 :     case 16:
    1242           0 :         eb_aom_smooth_h_predictor_16x16_ssse3(
    1243             :             dst,
    1244             :             stride,
    1245             :             above,
    1246             :             left
    1247             :         );
    1248           0 :         break;
    1249           0 :     case 32:
    1250           0 :         eb_aom_smooth_h_predictor_32x32_ssse3(
    1251             :             dst,
    1252             :             stride,
    1253             :             above,
    1254             :             left
    1255             :         );
    1256           0 :         break;
    1257           0 :     case 64:
    1258           0 :         eb_aom_smooth_h_predictor_64x64_ssse3(
    1259             :             dst,
    1260             :             stride,
    1261             :             above,
    1262             :             left
    1263             :         );
    1264           0 :         break;
    1265           0 :     default:  // any other bw: no kernel is called and dst is left unwritten
    1266             : 
    1267           0 :         break;
    1268             :     }
    1269           0 : }

Generated by: LCOV version 1.14