LCOV - code coverage report
Current view: top level - ASM_SSE2 - EbIntraPrediction_AV1_Intrinsic_SSE2.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 768 867 88.6 %
Date: 2019-11-25 17:38:06 Functions: 88 92 95.7 %

          Line data    Source code
       1             : /*
       2             : * Copyright(c) 2019 Intel Corporation
       3             : * SPDX - License - Identifier: BSD - 2 - Clause - Patent
       4             : */
       5             : 
       6             : #include "EbDefinitions.h"
       7             : #include "emmintrin.h"
       8             : #include "aom_dsp_rtcd.h"
       9             : 
      10     2268160 : static INLINE __m128i dc_sum_16(const uint8_t *ref) {
      11     2268160 :     __m128i x = _mm_loadu_si128((__m128i const *)ref);
      12     2268160 :     const __m128i zero = _mm_setzero_si128();
      13     2268160 :     x = _mm_sad_epu8(x, zero);
      14     2268160 :     const __m128i high = _mm_unpackhi_epi64(x, x);
      15     2268160 :     return _mm_add_epi16(x, high);
      16             : }
      17             : 
      18     2457810 : static INLINE void dc_store_4xh(uint32_t dc, int32_t height, uint8_t *dst,
      19             :     ptrdiff_t stride) {
      20    14398700 :     for (int32_t i = 0; i < height; i += 2) {
      21    11940900 :         *(uint32_t *)dst = dc;
      22    11940900 :         dst += stride;
      23    11940900 :         *(uint32_t *)dst = dc;
      24    11940900 :         dst += stride;
      25             :     }
      26     2457810 : }
      27             : 
      28      195204 : static INLINE __m128i dc_sum_32(const uint8_t *ref) {
      29      195204 :     __m128i x0 = _mm_load_si128((__m128i const *)ref);
      30      390408 :     __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
      31      195204 :     const __m128i zero = _mm_setzero_si128();
      32      195204 :     x0 = _mm_sad_epu8(x0, zero);
      33      195204 :     x1 = _mm_sad_epu8(x1, zero);
      34      195204 :     x0 = _mm_add_epi16(x0, x1);
      35      195204 :     const __m128i high = _mm_unpackhi_epi64(x0, x0);
      36      195204 :     return _mm_add_epi16(x0, high);
      37             : }
      38             : 
      39        3265 : static INLINE __m128i dc_sum_64(const uint8_t *ref) {
      40        3265 :     __m128i x0 = _mm_load_si128((__m128i const *)ref);
      41        3265 :     __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
      42        3265 :     __m128i x2 = _mm_load_si128((__m128i const *)(ref + 32));
      43        6530 :     __m128i x3 = _mm_load_si128((__m128i const *)(ref + 48));
      44        3265 :     const __m128i zero = _mm_setzero_si128();
      45        3265 :     x0 = _mm_sad_epu8(x0, zero);
      46        3265 :     x1 = _mm_sad_epu8(x1, zero);
      47        3265 :     x2 = _mm_sad_epu8(x2, zero);
      48        3265 :     x3 = _mm_sad_epu8(x3, zero);
      49        3265 :     x0 = _mm_add_epi16(x0, x1);
      50        3265 :     x2 = _mm_add_epi16(x2, x3);
      51        3265 :     x0 = _mm_add_epi16(x0, x2);
      52        3265 :     const __m128i high = _mm_unpackhi_epi64(x0, x0);
      53        3265 :     return _mm_add_epi16(x0, high);
      54             : }
      55             : 
      56       32207 : static INLINE void dc_store_32xh(const __m128i *row, int32_t height, uint8_t *dst,
      57             :     ptrdiff_t stride) {
      58             :     int32_t i;
      59      289863 :     for (i = 0; i < height; ++i) {
      60      257656 :         _mm_storeu_si128((__m128i *)dst, *row);
      61      257656 :         _mm_storeu_si128((__m128i *)(dst + 16), *row);
      62      257656 :         dst += stride;
      63             :     }
      64       32207 : }
      65             : 
      66     1974630 : static INLINE void dc_store_16xh(const __m128i *row, int32_t height, uint8_t *dst,
      67             :     ptrdiff_t stride) {
      68             :     int32_t i;
      69    26823900 :     for (i = 0; i < height; ++i) {
      70    24849200 :         _mm_storeu_si128((__m128i *)dst, *row);
      71    24849200 :         dst += stride;
      72             :     }
      73     1974630 : }
      74             : 
      75           0 : void intra_mode_dc_16x16_av1_sse2_intrin(
      76             :     EbBool                         is_left_availble,
      77             :     EbBool                         is_above_availble,
      78             :     const uint32_t   size,                       //input parameter, denotes the size of the current PU
      79             :     uint8_t         *ref_samples,                 //input parameter, pointer to the reference samples
      80             :     uint8_t         *dst,              //output parameter, pointer to the prediction
      81             :     const uint32_t   prediction_buffer_stride,     //input parameter, denotes the stride for the prediction ptr
      82             :     const EbBool  skip)                       //skip half rows
      83             : {
      84           0 :     uint32_t leftOffset = 0;
      85           0 :     uint32_t topOffset = (size << 1) + 1;
      86           0 :     uint32_t rowStride = skip ? 2 : 1;
      87             : 
      88           0 :     if (is_left_availble && !is_above_availble) {
      89           0 :         __m128i sum_left = dc_sum_16(&ref_samples[leftOffset]);
      90           0 :         const __m128i eight = _mm_set1_epi16((uint16_t)8);
      91           0 :         sum_left = _mm_add_epi16(sum_left, eight);
      92           0 :         sum_left = _mm_srai_epi16(sum_left, 4);
      93           0 :         sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
      94           0 :         sum_left = _mm_shufflelo_epi16(sum_left, 0);
      95           0 :         const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
      96           0 :         dc_store_16xh(&row, 16, dst, rowStride * prediction_buffer_stride);
      97             :     }
      98           0 :     else if (is_above_availble && !is_left_availble) {
      99           0 :         __m128i sum_above = dc_sum_16(&ref_samples[topOffset]);
     100           0 :         const __m128i eight = _mm_set1_epi16((uint16_t)8);
     101           0 :         sum_above = _mm_add_epi16(sum_above, eight);
     102           0 :         sum_above = _mm_srai_epi16(sum_above, 4);
     103           0 :         sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
     104           0 :         sum_above = _mm_shufflelo_epi16(sum_above, 0);
     105           0 :         const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
     106           0 :         dc_store_16xh(&row, 16, dst, rowStride * prediction_buffer_stride);
     107             :     }
     108             :     else
     109             :     {
     110           0 :         const __m128i sum_left = dc_sum_16(&ref_samples[leftOffset]);
     111           0 :         __m128i sum_above = dc_sum_16(&ref_samples[topOffset]);
     112           0 :         sum_above = _mm_add_epi16(sum_above, sum_left);
     113             : 
     114           0 :         uint32_t sum = _mm_cvtsi128_si32(sum_above);
     115           0 :         sum += 16;
     116           0 :         sum >>= 5;
     117           0 :         const __m128i row = _mm_set1_epi8((uint8_t)sum);
     118           0 :         dc_store_16xh(&row, 16, dst, rowStride * prediction_buffer_stride);
     119             :     }
     120           0 : }
     121     3747170 : static INLINE void dc_store_8xh(const __m128i *row, int32_t height, uint8_t *dst,
     122             :     ptrdiff_t stride) {
     123             :     int32_t i;
     124    41729600 :     for (i = 0; i < height; ++i) {
     125    37982400 :         _mm_storel_epi64((__m128i *)dst, *row);
     126    37982400 :         dst += stride;
     127             :     }
     128     3747170 : }
     129     4719570 : static INLINE __m128i dc_sum_8(const uint8_t *ref) {
     130     4719570 :     __m128i x = _mm_loadl_epi64((__m128i const *)ref);
     131     4719570 :     const __m128i zero = _mm_setzero_si128();
     132     4719570 :     return _mm_sad_epu8(x, zero);
     133             : }
     134             : 
     135           0 : void intra_mode_dc_8x8_av1_sse2_intrin(
     136             :     EbBool                         is_left_availble,
     137             :     EbBool                         is_above_availble,
     138             :     const uint32_t   size,                       //input parameter, denotes the size of the current PU
     139             :     uint8_t         *ref_samples,                 //input parameter, pointer to the reference samples
     140             :     uint8_t         *dst,              //output parameter, pointer to the prediction
     141             :     const uint32_t   prediction_buffer_stride,     //input parameter, denotes the stride for the prediction ptr
     142             :     const EbBool  skip)                       //skip half rows
     143             : {
     144           0 :     uint32_t leftOffset = 0;
     145           0 :     uint32_t topOffset = (size << 1) + 1;
     146           0 :     uint32_t rowStride = skip ? 2 : 1;
     147             : 
     148           0 :     if (is_left_availble && !is_above_availble) {
     149           0 :         __m128i sum_left = dc_sum_8(&ref_samples[leftOffset]);
     150           0 :         const __m128i four = _mm_set1_epi16((uint16_t)4);
     151           0 :         sum_left = _mm_add_epi16(sum_left, four);
     152           0 :         sum_left = _mm_srai_epi16(sum_left, 3);
     153           0 :         sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
     154           0 :         const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
     155           0 :         dc_store_8xh(&row, 8, dst, rowStride * prediction_buffer_stride);
     156             :     }
     157           0 :     else if (is_above_availble && !is_left_availble) {
     158           0 :         __m128i sum_above = dc_sum_8(&ref_samples[topOffset]);
     159           0 :         const __m128i four = _mm_set1_epi16((uint16_t)4);
     160           0 :         sum_above = _mm_add_epi16(sum_above, four);
     161           0 :         sum_above = _mm_srai_epi16(sum_above, 3);
     162           0 :         sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
     163           0 :         const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
     164           0 :         dc_store_8xh(&row, 8, dst, rowStride * prediction_buffer_stride);
     165             :     }
     166             :     else
     167             :     {
     168           0 :         const __m128i sum_left = dc_sum_8(&ref_samples[leftOffset]);
     169           0 :         __m128i sum_above = dc_sum_8(&ref_samples[topOffset]);
     170           0 :         sum_above = _mm_add_epi16(sum_above, sum_left);
     171             : 
     172           0 :         uint32_t sum = _mm_cvtsi128_si32(sum_above);
     173           0 :         sum += 8;
     174           0 :         sum >>= 4;
     175           0 :         const __m128i row = _mm_set1_epi8((uint8_t)sum);
     176           0 :         dc_store_8xh(&row, 8, dst, rowStride * prediction_buffer_stride);
     177             :     }
     178           0 : }
     179     4272130 : static INLINE __m128i dc_sum_4(const uint8_t *ref) {
     180     4272130 :     __m128i x = _mm_loadl_epi64((__m128i const *)ref);
     181     4272130 :     const __m128i zero = _mm_setzero_si128();
     182     4272130 :     x = _mm_unpacklo_epi8(x, zero);
     183     4272130 :     return _mm_sad_epu8(x, zero);
     184             : }
     185           0 : static INLINE void dc_store_4x4(uint32_t dc, uint8_t *dst, ptrdiff_t stride) {
     186             :     int32_t i;
     187           0 :     for (i = 0; i < 2; ++i) {
     188           0 :         *(uint32_t *)dst = dc;
     189           0 :         dst += stride;
     190           0 :         *(uint32_t *)dst = dc;
     191           0 :         dst += stride;
     192             :     }
     193           0 : }
     194           0 : void intra_mode_dc_4x4_av1_sse2_intrin(
     195             :     EbBool                         is_left_availble,
     196             :     EbBool                         is_above_availble,
     197             :     const uint32_t   size,                       //input parameter, denotes the size of the current PU
     198             :     uint8_t         *ref_samples,                 //input parameter, pointer to the reference samples
     199             :     uint8_t         *dst,              //output parameter, pointer to the prediction
     200             :     const uint32_t   prediction_buffer_stride,     //input parameter, denotes the stride for the prediction ptr
     201             :     const EbBool  skip)                       //skip half rows
     202             : {
     203           0 :     uint32_t leftOffset = 0;
     204           0 :     uint32_t topOffset = (size << 1) + 1;
     205           0 :     uint32_t rowStride = skip ? 2 : 1;
     206             : 
     207           0 :     if (is_left_availble && !is_above_availble) {
     208           0 :         __m128i sum_left = dc_sum_4(&ref_samples[leftOffset]);
     209           0 :         const __m128i two = _mm_set1_epi16((uint16_t)2);
     210           0 :         sum_left = _mm_add_epi16(sum_left, two);
     211           0 :         sum_left = _mm_srai_epi16(sum_left, 2);
     212           0 :         sum_left = _mm_shufflelo_epi16(sum_left, 0);
     213           0 :         sum_left = _mm_packus_epi16(sum_left, sum_left);
     214             : 
     215           0 :         const uint32_t pred = _mm_cvtsi128_si32(sum_left);
     216           0 :         dc_store_4x4(pred, dst, rowStride * prediction_buffer_stride);
     217             :     }
     218           0 :     else if (is_above_availble && !is_left_availble) {
     219           0 :         __m128i sum_above = dc_sum_4(&ref_samples[topOffset]);
     220           0 :         const __m128i two = _mm_set1_epi16((int16_t)2);
     221           0 :         sum_above = _mm_add_epi16(sum_above, two);
     222           0 :         sum_above = _mm_srai_epi16(sum_above, 2);
     223           0 :         sum_above = _mm_shufflelo_epi16(sum_above, 0);
     224           0 :         sum_above = _mm_packus_epi16(sum_above, sum_above);
     225             : 
     226           0 :         const uint32_t pred = _mm_cvtsi128_si32(sum_above);
     227           0 :         dc_store_4x4(pred, dst, rowStride * prediction_buffer_stride);
     228             :     }
     229             :     else
     230             :     {
     231           0 :         const __m128i sum_left = dc_sum_4(&ref_samples[leftOffset]);
     232           0 :         __m128i sum_above = dc_sum_4(&ref_samples[topOffset]);
     233           0 :         sum_above = _mm_add_epi16(sum_left, sum_above);
     234             : 
     235           0 :         uint32_t sum = _mm_cvtsi128_si32(sum_above);
     236           0 :         sum += 4;
     237           0 :         sum >>= 3;
     238             : 
     239           0 :         const __m128i row = _mm_set1_epi8((uint8_t)sum);
     240           0 :         const uint32_t pred = _mm_cvtsi128_si32(row);
     241           0 :         dc_store_4x4(pred, dst, rowStride * prediction_buffer_stride);
     242             :     }
     243           0 : }
     244             : 
     245             : #define DC_SHIFT2 16
     246             : #define DC_MULTIPLIER_1X2 0x5556
     247             : #define DC_MULTIPLIER_1X4 0x3334
     248             : 
     249     5273090 : static INLINE int32_t divide_using_multiply_shift(int32_t num, int32_t shift1,
     250             :     int32_t multiplier) {
     251     5273090 :     const int32_t interm = num >> shift1;
     252     5273090 :     return interm * multiplier >> DC_SHIFT2;
     253             : }
     254             : 
     255       22917 : static INLINE void v_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
     256             :     const uint8_t *above, int32_t height) {
     257       22917 :     const __m128i row0 = _mm_load_si128((__m128i const *)above);
     258       22917 :     const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
     259      206253 :     for (int32_t i = 0; i < height; ++i) {
     260             :         _mm_storeu_si128((__m128i *)dst, row0);
     261      183336 :         _mm_storeu_si128((__m128i *)(dst + 16), row1);
     262      183336 :         dst += stride;
     263             :     }
     264       22917 : }
     265             : 
     266     5508030 : static INLINE void h_pred_store_16xh(const __m128i *row, int32_t h, uint8_t *dst,
     267             :     ptrdiff_t stride) {
     268             :     int32_t i;
     269    27539700 :     for (i = 0; i < h; ++i) {
     270    22031700 :         _mm_storeu_si128((__m128i *)dst, row[i]);
     271    22031700 :         dst += stride;
     272             :     }
     273     5508030 : }
     274     3776090 : static INLINE void repeat_low_4pixels(const __m128i *x, __m128i *row) {
     275     3776090 :     const __m128i u0 = _mm_shufflelo_epi16(*x, 0);
     276     3776090 :     const __m128i u1 = _mm_shufflelo_epi16(*x, 0x55);
     277     3776090 :     const __m128i u2 = _mm_shufflelo_epi16(*x, 0xaa);
     278     3776090 :     const __m128i u3 = _mm_shufflelo_epi16(*x, 0xff);
     279             : 
     280     3776090 :     row[0] = _mm_unpacklo_epi64(u0, u0);
     281     3776090 :     row[1] = _mm_unpacklo_epi64(u1, u1);
     282     3776090 :     row[2] = _mm_unpacklo_epi64(u2, u2);
     283     3776090 :     row[3] = _mm_unpacklo_epi64(u3, u3);
     284     3776090 : }
     285     3757220 : static INLINE void repeat_high_4pixels(const __m128i *x, __m128i *row) {
     286     3757220 :     const __m128i u0 = _mm_shufflehi_epi16(*x, 0);
     287     3757220 :     const __m128i u1 = _mm_shufflehi_epi16(*x, 0x55);
     288     3757220 :     const __m128i u2 = _mm_shufflehi_epi16(*x, 0xaa);
     289     3757220 :     const __m128i u3 = _mm_shufflehi_epi16(*x, 0xff);
     290             : 
     291     3757220 :     row[0] = _mm_unpackhi_epi64(u0, u0);
     292     3757220 :     row[1] = _mm_unpackhi_epi64(u1, u1);
     293     3757220 :     row[2] = _mm_unpackhi_epi64(u2, u2);
     294     3757220 :     row[3] = _mm_unpackhi_epi64(u3, u3);
     295     3757220 : }
     296             : // Process 16x8, first 4 rows
     297             : // Use first 8 bytes of left register: xxxxxxxx33221100
     298     2763570 : static INLINE void h_prediction_16x8_1(const __m128i *left, uint8_t *dst,
     299             :     ptrdiff_t stride) {
     300             :     __m128i row[4];
     301     2763570 :     repeat_low_4pixels(left, row);
     302     2763620 :     h_pred_store_16xh(row, 4, dst, stride);
     303     2763620 : }
     304             : 
     305             : // Process 16x8, second 4 rows
     306             : // Use second 8 bytes of left register: 77665544xxxxxxxx
     307     2744640 : static INLINE void h_prediction_16x8_2(const __m128i *left, uint8_t *dst,
     308             :     ptrdiff_t stride) {
     309             :     __m128i row[4];
     310     2744640 :     repeat_high_4pixels(left, row);
     311     2744660 :     h_pred_store_16xh(row, 4, dst, stride);
     312     2744620 : }
     313             : 
     314      358560 : static INLINE void h_predictor_16xh(uint8_t *dst, ptrdiff_t stride,
     315             :     const uint8_t *left, int32_t count) {
     316      358560 :     int32_t i = 0;
     317             :     do {
     318      722269 :         const __m128i left_col = _mm_load_si128((const __m128i *)left);
     319      722269 :         const __m128i left_col_8p_lo = _mm_unpacklo_epi8(left_col, left_col);
     320      722269 :         h_prediction_16x8_1(&left_col_8p_lo, dst, stride);
     321      722270 :         dst += stride << 2;
     322      722270 :         h_prediction_16x8_2(&left_col_8p_lo, dst, stride);
     323      722270 :         dst += stride << 2;
     324             : 
     325      722270 :         const __m128i left_col_8p_hi = _mm_unpackhi_epi8(left_col, left_col);
     326      722270 :         h_prediction_16x8_1(&left_col_8p_hi, dst, stride);
     327      722269 :         dst += stride << 2;
     328      722269 :         h_prediction_16x8_2(&left_col_8p_hi, dst, stride);
     329      722274 :         dst += stride << 2;
     330             : 
     331      722274 :         left += 16;
     332      722274 :         i++;
     333      722274 :     } while (i < count);
     334      358565 : }
     335             : 
     336     2025250 : static INLINE void h_pred_store_32xh(const __m128i *row, int32_t h, uint8_t *dst,
     337             :     ptrdiff_t stride) {
     338             :     int32_t i;
     339    10126100 :     for (i = 0; i < h; ++i) {
     340     8100880 :         _mm_storeu_si128((__m128i *)dst, row[i]);
     341     8100880 :         _mm_storeu_si128((__m128i *)(dst + 16), row[i]);
     342     8100880 :         dst += stride;
     343             :     }
     344     2025250 : }
     345             : 
     346             : // Process 32x8, first 4 rows
     347             : // Use first 8 bytes of left register: xxxxxxxx33221100
     348     1012640 : static INLINE void h_prediction_32x8_1(const __m128i *left, uint8_t *dst,
     349             :     ptrdiff_t stride) {
     350             :     __m128i row[4];
     351     1012640 :     repeat_low_4pixels(left, row);
     352     1012650 :     h_pred_store_32xh(row, 4, dst, stride);
     353     1012650 : }
     354             : 
     355             : // Process 32x8, second 4 rows
     356             : // Use second 8 bytes of left register: 77665544xxxxxxxx
     357     1012650 : static INLINE void h_prediction_32x8_2(const __m128i *left, uint8_t *dst,
     358             :     ptrdiff_t stride) {
     359             :     __m128i row[4];
     360     1012650 :     repeat_high_4pixels(left, row);
     361     1012660 :     h_pred_store_32xh(row, 4, dst, stride);
     362     1012660 : }
     363             : 
     364        1490 : static INLINE void h_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
     365             :     const uint8_t *left, int32_t height) {
     366        1490 :     int32_t i = height >> 2;
     367             :     do {
     368       47680 :         __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]);
     369       23840 :         left4 = _mm_unpacklo_epi8(left4, left4);
     370       23840 :         left4 = _mm_unpacklo_epi8(left4, left4);
     371       23840 :         const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
     372       23840 :         const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
     373             :         _mm_storeu_si128((__m128i *)dst, r0);
     374       23840 :         _mm_storeu_si128((__m128i *)(dst + 16), r0);
     375       23840 :         _mm_storeu_si128((__m128i *)(dst + stride), r1);
     376       23840 :         _mm_storeu_si128((__m128i *)(dst + stride + 16), r1);
     377       23840 :         const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
     378       23840 :         const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
     379       23840 :         _mm_storeu_si128((__m128i *)(dst + stride * 2), r2);
     380       23840 :         _mm_storeu_si128((__m128i *)(dst + stride * 2 + 16), r2);
     381       23840 :         _mm_storeu_si128((__m128i *)(dst + stride * 3), r3);
     382       23840 :         _mm_storeu_si128((__m128i *)(dst + stride * 3 + 16), r3);
     383       23840 :         left += 4;
     384       23840 :         dst += stride * 4;
     385       23840 :     } while (--i);
     386        1490 : }
     387             : 
     388     1139080 : static INLINE void h_predictor_8x16xc(uint8_t *dst, ptrdiff_t stride,
     389             :     const uint8_t *above, const uint8_t *left,
     390             :     int32_t count) {
     391             :     (void)above;
     392     2310010 :     for (int32_t i = 0; i < count; ++i) {
     393     1170930 :         const __m128i left_col = _mm_load_si128((__m128i const *)left);
     394     1170930 :         __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col);
     395     1170930 :         __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col);
     396             : 
     397     1170930 :         __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0);
     398     1170930 :         __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
     399     1170930 :         __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
     400     1170930 :         __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
     401     1170930 :         _mm_storel_epi64((__m128i *)dst, row0);
     402     1170930 :         dst += stride;
     403     1170930 :         _mm_storel_epi64((__m128i *)dst, row1);
     404     1170930 :         dst += stride;
     405     1170930 :         _mm_storel_epi64((__m128i *)dst, row2);
     406     1170930 :         dst += stride;
     407     1170930 :         _mm_storel_epi64((__m128i *)dst, row3);
     408     1170930 :         dst += stride;
     409             : 
     410     1170930 :         left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);
     411     1170930 :         row0 = _mm_shufflelo_epi16(left_col_low, 0);
     412     1170930 :         row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
     413     1170930 :         row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
     414     1170930 :         row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
     415     1170930 :         _mm_storel_epi64((__m128i *)dst, row0);
     416     1170930 :         dst += stride;
     417     1170930 :         _mm_storel_epi64((__m128i *)dst, row1);
     418     1170930 :         dst += stride;
     419     1170930 :         _mm_storel_epi64((__m128i *)dst, row2);
     420     1170930 :         dst += stride;
     421     1170930 :         _mm_storel_epi64((__m128i *)dst, row3);
     422     1170930 :         dst += stride;
     423             : 
     424     1170930 :         row0 = _mm_shufflelo_epi16(left_col_high, 0);
     425     1170930 :         row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
     426     1170930 :         row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
     427     1170930 :         row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
     428     1170930 :         _mm_storel_epi64((__m128i *)dst, row0);
     429     1170930 :         dst += stride;
     430     1170930 :         _mm_storel_epi64((__m128i *)dst, row1);
     431     1170930 :         dst += stride;
     432     1170930 :         _mm_storel_epi64((__m128i *)dst, row2);
     433     1170930 :         dst += stride;
     434     1170930 :         _mm_storel_epi64((__m128i *)dst, row3);
     435     1170930 :         dst += stride;
     436             : 
     437     1170930 :         left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);
     438     1170930 :         row0 = _mm_shufflelo_epi16(left_col_high, 0);
     439     1170930 :         row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
     440     1170930 :         row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
     441     1170930 :         row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
     442     1170930 :         _mm_storel_epi64((__m128i *)dst, row0);
     443     1170930 :         dst += stride;
     444     1170930 :         _mm_storel_epi64((__m128i *)dst, row1);
     445     1170930 :         dst += stride;
     446     1170930 :         _mm_storel_epi64((__m128i *)dst, row2);
     447     1170930 :         dst += stride;
     448     1170930 :         _mm_storel_epi64((__m128i *)dst, row3);
     449     1170930 :         dst += stride;
     450     1170930 :         left += 16;
     451             :     }
     452     1139080 : }
     453             : 
     454        5719 : static INLINE void h_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
     455             :     const uint8_t *left, int32_t height) {
     456        5719 :     int32_t i = height >> 2;
     457             :     do {
     458      124344 :         __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]);
     459       62172 :         left4 = _mm_unpacklo_epi8(left4, left4);
     460       62172 :         left4 = _mm_unpacklo_epi8(left4, left4);
     461       62172 :         const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
     462       62172 :         const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
     463             :         _mm_storeu_si128((__m128i *)dst, r0);
     464       62172 :         _mm_storeu_si128((__m128i *)(dst + 16), r0);
     465       62172 :         _mm_storeu_si128((__m128i *)(dst + 32), r0);
     466       62172 :         _mm_storeu_si128((__m128i *)(dst + 48), r0);
     467       62172 :         _mm_storeu_si128((__m128i *)(dst + stride), r1);
     468       62172 :         _mm_storeu_si128((__m128i *)(dst + stride + 16), r1);
     469       62172 :         _mm_storeu_si128((__m128i *)(dst + stride + 32), r1);
     470       62172 :         _mm_storeu_si128((__m128i *)(dst + stride + 48), r1);
     471       62172 :         const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
     472       62172 :         const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
     473       62172 :         _mm_storeu_si128((__m128i *)(dst + stride * 2), r2);
     474       62172 :         _mm_storeu_si128((__m128i *)(dst + stride * 2 + 16), r2);
     475       62172 :         _mm_storeu_si128((__m128i *)(dst + stride * 2 + 32), r2);
     476       62172 :         _mm_storeu_si128((__m128i *)(dst + stride * 2 + 48), r2);
     477       62172 :         _mm_storeu_si128((__m128i *)(dst + stride * 3), r3);
     478       62172 :         _mm_storeu_si128((__m128i *)(dst + stride * 3 + 16), r3);
     479       62172 :         _mm_storeu_si128((__m128i *)(dst + stride * 3 + 32), r3);
     480       62172 :         _mm_storeu_si128((__m128i *)(dst + stride * 3 + 48), r3);
     481       62172 :         left += 4;
     482       62172 :         dst += stride * 4;
     483       62172 :     } while (--i);
     484        5719 : }
     485             : 
     486        2923 : void eb_aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
     487             :     const uint8_t *above, const uint8_t *left) {
     488             :     (void)above;
     489        2923 :     h_predictor_64xh(dst, stride, left, 64);
     490        2923 : }
     491             : 
     492        1055 : void eb_aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
     493             :     const uint8_t *above, const uint8_t *left) {
     494             :     (void)above;
     495        1055 :     h_predictor_64xh(dst, stride, left, 32);
     496        1055 : }
     497             : 
     498        1490 : void eb_aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
     499             :     const uint8_t *above, const uint8_t *left) {
     500             :     (void)above;
     501        1490 :     h_predictor_32xh(dst, stride, left, 64);
     502        1490 : }
     503             : 
     504        1741 : void eb_aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
     505             :     const uint8_t *above, const uint8_t *left) {
     506             :     (void)above;
     507        1741 :     h_predictor_64xh(dst, stride, left, 16);
     508        1741 : }
     509             : 
     510        2575 : void eb_aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
     511             :     const uint8_t *above, const uint8_t *left) {
     512             :     (void)above;
     513        2575 :     h_predictor_16xh(dst, stride, left, 4);
     514        2575 : }
     515             : 
     516      355985 : void eb_aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
     517             :     const uint8_t *above, const uint8_t *left) {
     518             :     (void)above;
     519      355985 :     h_predictor_16xh(dst, stride, left, 2);
     520      355989 : }
     521             : 
     522      496710 : void eb_aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
     523             :     const uint8_t *above, const uint8_t *left) {
     524             :     __m128i left_col, left_col_8p;
     525             :     (void)above;
     526             : 
     527      496710 :     left_col = _mm_load_si128((const __m128i *)left);
     528             : 
     529      496710 :     left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
     530      496710 :     h_prediction_32x8_1(&left_col_8p, dst, stride);
     531      496724 :     dst += stride << 2;
     532      496724 :     h_prediction_32x8_2(&left_col_8p, dst, stride);
     533      496723 :     dst += stride << 2;
     534             : 
     535      496723 :     left_col_8p = _mm_unpackhi_epi8(left_col, left_col);
     536      496723 :     h_prediction_32x8_1(&left_col_8p, dst, stride);
     537      496723 :     dst += stride << 2;
     538      496723 :     h_prediction_32x8_2(&left_col_8p, dst, stride);
     539      496721 : }
     540             : 
     541       18998 : void eb_aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
     542             :     const uint8_t *above, const uint8_t *left) {
     543             :     (void)above;
     544       18998 :     const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);
     545       18998 :     const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
     546       18998 :     h_prediction_16x8_1(&left_col_8p, dst, stride);
     547       18998 : }
     548             : 
     549     1300130 : void eb_aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
     550             :     const uint8_t *above, const uint8_t *left) {
     551             :     (void)above;
     552     1300130 :     const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);
     553     1300130 :     const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
     554     1300130 :     h_prediction_16x8_1(&left_col_8p, dst, stride);
     555     1300180 :     dst += stride << 2;
     556     1300180 :     h_prediction_16x8_2(&left_col_8p, dst, stride);
     557     1300180 : }
     558             : 
     559       19217 : void eb_aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
     560             :     const uint8_t *above, const uint8_t *left) {
     561             :     __m128i left_col, left_col_8p;
     562             :     (void)above;
     563             : 
     564       19217 :     left_col = _mm_load_si128((const __m128i *)left);
     565             : 
     566       19217 :     left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
     567       19217 :     h_prediction_32x8_1(&left_col_8p, dst, stride);
     568       19217 :     dst += stride << 2;
     569       19217 :     h_prediction_32x8_2(&left_col_8p, dst, stride);
     570       19217 : }
     571             : 
     572       19761 : void eb_aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
     573             :     const uint8_t *above, const uint8_t *left) {
     574             :     (void)above;
     575       19761 :     const __m128i left_col = _mm_load_si128((__m128i const *)left);
     576       19761 :     __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col);
     577       19761 :     __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col);
     578             : 
     579       19761 :     __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0);
     580       19761 :     __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
     581       19761 :     __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
     582       19761 :     __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
     583       19761 :     *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
     584       19761 :     dst += stride;
     585       19761 :     *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
     586       19761 :     dst += stride;
     587       19761 :     *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
     588       19761 :     dst += stride;
     589       19761 :     *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
     590       19761 :     dst += stride;
     591             : 
     592       19761 :     left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);
     593       19761 :     row0 = _mm_shufflelo_epi16(left_col_low, 0);
     594       19761 :     row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
     595       19761 :     row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
     596       19761 :     row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
     597       19761 :     *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
     598       19761 :     dst += stride;
     599       19761 :     *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
     600       19761 :     dst += stride;
     601       19761 :     *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
     602       19761 :     dst += stride;
     603       19761 :     *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
     604       19761 :     dst += stride;
     605             : 
     606       19761 :     row0 = _mm_shufflelo_epi16(left_col_high, 0);
     607       19761 :     row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
     608       19761 :     row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
     609       19761 :     row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
     610       19761 :     *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
     611       19761 :     dst += stride;
     612       19761 :     *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
     613       19761 :     dst += stride;
     614       19761 :     *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
     615       19761 :     dst += stride;
     616       19761 :     *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
     617       19761 :     dst += stride;
     618             : 
     619       19761 :     left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);
     620       19761 :     row0 = _mm_shufflelo_epi16(left_col_high, 0);
     621       19761 :     row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
     622       19761 :     row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
     623       19761 :     row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
     624       19761 :     *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
     625       19761 :     dst += stride;
     626       19761 :     *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
     627       19761 :     dst += stride;
     628       19761 :     *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
     629       19761 :     dst += stride;
     630       19761 :     *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
     631       19761 : }
     632             : 
     633       51564 : void eb_aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
     634             :     const uint8_t *above, const uint8_t *left) {
     635             :     (void)above;
     636       51564 :     __m128i left_col = _mm_loadl_epi64((__m128i const *)left);
     637       51564 :     left_col = _mm_unpacklo_epi8(left_col, left_col);
     638       51564 :     __m128i row0 = _mm_shufflelo_epi16(left_col, 0);
     639       51564 :     __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55);
     640       51564 :     __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa);
     641       51564 :     __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff);
     642       51564 :     *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
     643       51564 :     dst += stride;
     644       51564 :     *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
     645       51564 :     dst += stride;
     646       51564 :     *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
     647       51564 :     dst += stride;
     648       51564 :     *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
     649       51564 :     dst += stride;
     650       51564 :     left_col = _mm_unpackhi_epi64(left_col, left_col);
     651       51564 :     row0 = _mm_shufflelo_epi16(left_col, 0);
     652       51564 :     row1 = _mm_shufflelo_epi16(left_col, 0x55);
     653       51564 :     row2 = _mm_shufflelo_epi16(left_col, 0xaa);
     654       51564 :     row3 = _mm_shufflelo_epi16(left_col, 0xff);
     655       51564 :     *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
     656       51564 :     dst += stride;
     657       51564 :     *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
     658       51564 :     dst += stride;
     659       51564 :     *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
     660       51564 :     dst += stride;
     661       51564 :     *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
     662       51564 : }
     663             : 
     664     1107240 : void eb_aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
     665             :     const uint8_t *above, const uint8_t *left) {
     666     1107240 :     h_predictor_8x16xc(dst, stride, above, left, 1);
     667     1107280 : }
     668       31844 : void eb_aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
     669             :     const uint8_t *above, const uint8_t *left) {
     670       31844 :     h_predictor_8x16xc(dst, stride, above, left, 2);
     671       31844 : }
     672       49856 : void eb_aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
     673             :     const uint8_t *above, const uint8_t *left) {
     674             :     (void)above;
     675       49856 :     __m128i left_col = _mm_loadl_epi64((__m128i const *)left);
     676       49856 :     left_col = _mm_unpacklo_epi8(left_col, left_col);
     677       49856 :     __m128i row0 = _mm_shufflelo_epi16(left_col, 0);
     678       49856 :     __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55);
     679       49856 :     __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa);
     680       49856 :     __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff);
     681       49856 :     _mm_storel_epi64((__m128i *)dst, row0);
     682       49856 :     dst += stride;
     683       49856 :     _mm_storel_epi64((__m128i *)dst, row1);
     684       49856 :     dst += stride;
     685       49856 :     _mm_storel_epi64((__m128i *)dst, row2);
     686       49856 :     dst += stride;
     687       49856 :     _mm_storel_epi64((__m128i *)dst, row3);
     688       49856 : }
     689             : 
     690      316366 : void eb_aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
     691             :     const uint8_t *above, const uint8_t *left) {
     692      316366 :     const __m128i row = _mm_load_si128((__m128i const *)above);
     693             :     (void)left;
     694      316366 :     dc_store_16xh(&row, 32, dst, stride);
     695      316373 : }
     696             : 
     697       41766 : void eb_aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
     698             :     const uint8_t *above, const uint8_t *left) {
     699       20883 :     const __m128i row = _mm_load_si128((__m128i const *)above);
     700             :     (void)left;
     701       20883 :     dc_store_16xh(&row, 4, dst, stride);
     702       20883 : }
     703             : 
     704        2199 : void eb_aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
     705             :     const uint8_t *above, const uint8_t *left) {
     706        2199 :     const __m128i row = _mm_load_si128((__m128i const *)above);
     707             :     (void)left;
     708        2199 :     dc_store_16xh(&row, 64, dst, stride);
     709        2199 : }
     710      395111 : void eb_aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
     711             :     const uint8_t *above, const uint8_t *left) {
     712      395111 :     const __m128i row = _mm_load_si128((__m128i const *)above);
     713             :     (void)left;
     714      395111 :     dc_store_16xh(&row, 8, dst, stride);
     715      395110 : }
     716             : 
     717       22917 : void eb_aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
     718             :     const uint8_t *above, const uint8_t *left) {
     719             :     (void)left;
     720       22917 :     v_predictor_32xh(dst, stride, above, 8);
     721       22917 : }
     722       16720 : void eb_aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
     723             :     const uint8_t *above, const uint8_t *left) {
     724       16720 :     const uint32_t pred = *(uint32_t *)above;
     725             :     (void)left;
     726       16720 :     dc_store_4xh(pred, 16, dst, stride);
     727       16720 : }
     728       51014 : void eb_aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
     729             :     const uint8_t *above, const uint8_t *left) {
     730       51014 :     const uint32_t pred = *(uint32_t *)above;
     731             :     (void)left;
     732       51014 :     dc_store_4xh(pred, 8, dst, stride);
     733       51014 : }
     734     1126200 : void eb_aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
     735             :     const uint8_t *above, const uint8_t *left) {
     736     1126200 :     const __m128i row = _mm_loadl_epi64((__m128i const *)above);
     737             :     (void)left;
     738     1126200 :     dc_store_8xh(&row, 16, dst, stride);
     739     1126230 : }
     740       26671 : void eb_aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
     741             :     const uint8_t *above, const uint8_t *left) {
     742       26671 :     const __m128i row = _mm_loadl_epi64((__m128i const *)above);
     743             :     (void)left;
     744       26671 :     dc_store_8xh(&row, 32, dst, stride);
     745       26671 : }
     746       47030 : void eb_aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
     747             :     const uint8_t *above, const uint8_t *left) {
     748       47030 :     const __m128i row = _mm_loadl_epi64((__m128i const *)above);
     749             :     (void)left;
     750       47030 :     dc_store_8xh(&row, 4, dst, stride);
     751       47030 : }
     752             : // -----------------------------------------------------------------------------
     753             : // DC_128
     754             : 
     755        5487 : void eb_aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
     756             :     const uint8_t *above, const uint8_t *left) {
     757             :     (void)above;
     758             :     (void)left;
     759        5487 :     const uint32_t pred = 0x80808080;
     760        5487 :     dc_store_4xh(pred, 8, dst, stride);
     761        5487 : }
     762             : 
     763        2645 : void eb_aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
     764             :     const uint8_t *above, const uint8_t *left) {
     765             :     (void)above;
     766             :     (void)left;
     767        2645 :     const uint32_t pred = 0x80808080;
     768        2645 :     dc_store_4xh(pred, 16, dst, stride);
     769        2645 : }
     770             : 
     771        5973 : void eb_aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
     772             :     const uint8_t *above, const uint8_t *left) {
     773             :     (void)above;
     774             :     (void)left;
     775        5973 :     const __m128i row = _mm_set1_epi8((uint8_t)128);
     776        5973 :     dc_store_8xh(&row, 4, dst, stride);
     777        5973 : }
     778             : 
     779        3021 : void eb_aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
     780             :     const uint8_t *above, const uint8_t *left) {
     781             :     (void)above;
     782             :     (void)left;
     783        3021 :     const __m128i row = _mm_set1_epi8((uint8_t)128);
     784        3021 :     dc_store_8xh(&row, 16, dst, stride);
     785        3021 : }
     786             : 
     787         515 : void eb_aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
     788             :     const uint8_t *above, const uint8_t *left) {
     789             :     (void)above;
     790             :     (void)left;
     791         515 :     const __m128i row = _mm_set1_epi8((uint8_t)128);
     792         515 :     dc_store_8xh(&row, 32, dst, stride);
     793         515 : }
     794             : 
     795        2562 : void eb_aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
     796             :     const uint8_t *above, const uint8_t *left) {
     797             :     (void)above;
     798             :     (void)left;
     799        2562 :     const __m128i row = _mm_set1_epi8((uint8_t)128);
     800        2562 :     dc_store_16xh(&row, 4, dst, stride);
     801        2562 : }
     802             : 
     803        2959 : void eb_aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
     804             :     const uint8_t *above, const uint8_t *left) {
     805             :     (void)above;
     806             :     (void)left;
     807        2959 :     const __m128i row = _mm_set1_epi8((uint8_t)128);
     808        2959 :     dc_store_16xh(&row, 8, dst, stride);
     809        2959 : }
     810             : 
     811         374 : void eb_aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
     812             :     const uint8_t *above,
     813             :     const uint8_t *left) {
     814             :     (void)above;
     815             :     (void)left;
     816         374 :     const __m128i row = _mm_set1_epi8((uint8_t)128);
     817         374 :     dc_store_16xh(&row, 32, dst, stride);
     818         374 : }
     819             : 
     820          23 : void eb_aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
     821             :     const uint8_t *above,
     822             :     const uint8_t *left) {
     823             :     (void)above;
     824             :     (void)left;
     825          23 :     const __m128i row = _mm_set1_epi8((uint8_t)128);
     826          23 :     dc_store_16xh(&row, 64, dst, stride);
     827          23 : }
     828             : 
     829         448 : void eb_aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
     830             :     const uint8_t *above, const uint8_t *left) {
     831             :     (void)above;
     832             :     (void)left;
     833         448 :     const __m128i row = _mm_set1_epi8((uint8_t)128);
     834         448 :     dc_store_32xh(&row, 8, dst, stride);
     835         448 : }
     836             : 
     837             : // -----------------------------------------------------------------------------
     838             : // DC_TOP
     839             : 
     840      106255 : void eb_aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
     841             :     const uint8_t *above, const uint8_t *left) {
     842             :     (void)left;
     843      106255 :     __m128i sum_above = dc_sum_4(above);
     844      106255 :     const __m128i two = _mm_set1_epi16((int16_t)2);
     845      106255 :     sum_above = _mm_add_epi16(sum_above, two);
     846      106255 :     sum_above = _mm_srai_epi16(sum_above, 2);
     847      106255 :     sum_above = _mm_shufflelo_epi16(sum_above, 0);
     848      106255 :     sum_above = _mm_packus_epi16(sum_above, sum_above);
     849             : 
     850      106255 :     const uint32_t pred = _mm_cvtsi128_si32(sum_above);
     851      106255 :     dc_store_4xh(pred, 8, dst, stride);
     852      106255 : }
     853             : 
     854       19213 : void eb_aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
     855             :     const uint8_t *above, const uint8_t *left) {
     856             :     (void)left;
     857       19213 :     __m128i sum_above = dc_sum_4(above);
     858       19213 :     const __m128i two = _mm_set1_epi16((int16_t)2);
     859       19213 :     sum_above = _mm_add_epi16(sum_above, two);
     860       19213 :     sum_above = _mm_srai_epi16(sum_above, 2);
     861       19213 :     sum_above = _mm_shufflelo_epi16(sum_above, 0);
     862       19213 :     sum_above = _mm_packus_epi16(sum_above, sum_above);
     863             : 
     864       19213 :     const uint32_t pred = _mm_cvtsi128_si32(sum_above);
     865       19213 :     dc_store_4xh(pred, 16, dst, stride);
     866       19213 : }
     867             : 
     868      227262 : void eb_aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
     869             :     const uint8_t *above, const uint8_t *left) {
     870             :     (void)left;
     871      227262 :     __m128i sum_above = dc_sum_8(above);
     872      227261 :     const __m128i four = _mm_set1_epi16((uint16_t)4);
     873      227261 :     sum_above = _mm_add_epi16(sum_above, four);
     874      227261 :     sum_above = _mm_srai_epi16(sum_above, 3);
     875      227261 :     sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
     876      227261 :     const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
     877      227261 :     dc_store_8xh(&row, 4, dst, stride);
     878      227265 : }
     879             : 
     880       27594 : void eb_aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
     881             :     const uint8_t *above, const uint8_t *left) {
     882             :     (void)left;
     883       27594 :     __m128i sum_above = dc_sum_8(above);
     884       27594 :     const __m128i four = _mm_set1_epi16((uint16_t)4);
     885       27594 :     sum_above = _mm_add_epi16(sum_above, four);
     886       27594 :     sum_above = _mm_srai_epi16(sum_above, 3);
     887       27594 :     sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
     888       27594 :     const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
     889       27594 :     dc_store_8xh(&row, 16, dst, stride);
     890       27594 : }
     891             : 
     892        2039 : void eb_aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
     893             :     const uint8_t *above, const uint8_t *left) {
     894             :     (void)left;
     895        2039 :     __m128i sum_above = dc_sum_8(above);
     896        2039 :     const __m128i four = _mm_set1_epi16((uint16_t)4);
     897        2039 :     sum_above = _mm_add_epi16(sum_above, four);
     898        2039 :     sum_above = _mm_srai_epi16(sum_above, 3);
     899        2039 :     sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
     900        2039 :     const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
     901        2039 :     dc_store_8xh(&row, 32, dst, stride);
     902        2039 : }
     903             : 
     904       74793 : void eb_aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
     905             :     const uint8_t *above, const uint8_t *left) {
     906             :     (void)left;
     907       74793 :     __m128i sum_above = dc_sum_16(above);
     908       74793 :     const __m128i eight = _mm_set1_epi16((uint16_t)8);
     909       74793 :     sum_above = _mm_add_epi16(sum_above, eight);
     910       74793 :     sum_above = _mm_srai_epi16(sum_above, 4);
     911       74793 :     sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
     912       74793 :     sum_above = _mm_shufflelo_epi16(sum_above, 0);
     913       74793 :     const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
     914       74793 :     dc_store_16xh(&row, 4, dst, stride);
     915       74793 : }
     916             : 
     917       72995 : void eb_aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
     918             :     const uint8_t *above, const uint8_t *left) {
     919             :     (void)left;
     920       72995 :     __m128i sum_above = dc_sum_16(above);
     921       72995 :     const __m128i eight = _mm_set1_epi16((uint16_t)8);
     922       72995 :     sum_above = _mm_add_epi16(sum_above, eight);
     923       72995 :     sum_above = _mm_srai_epi16(sum_above, 4);
     924       72995 :     sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
     925       72995 :     sum_above = _mm_shufflelo_epi16(sum_above, 0);
     926       72995 :     const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
     927       72995 :     dc_store_16xh(&row, 8, dst, stride);
     928       72995 : }
     929             : 
     930        7800 : void eb_aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
     931             :     const uint8_t *above,
     932             :     const uint8_t *left) {
     933             :     (void)left;
     934        7800 :     __m128i sum_above = dc_sum_16(above);
     935        7800 :     const __m128i eight = _mm_set1_epi16((uint16_t)8);
     936        7800 :     sum_above = _mm_add_epi16(sum_above, eight);
     937        7800 :     sum_above = _mm_srai_epi16(sum_above, 4);
     938        7800 :     sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
     939        7800 :     sum_above = _mm_shufflelo_epi16(sum_above, 0);
     940        7800 :     const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
     941        7800 :     dc_store_16xh(&row, 32, dst, stride);
     942        7800 : }
     943             : 
     944         138 : void eb_aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
     945             :     const uint8_t *above,
     946             :     const uint8_t *left) {
     947             :     (void)left;
     948         138 :     __m128i sum_above = dc_sum_16(above);
     949         138 :     const __m128i eight = _mm_set1_epi16((uint16_t)8);
     950         138 :     sum_above = _mm_add_epi16(sum_above, eight);
     951         138 :     sum_above = _mm_srai_epi16(sum_above, 4);
     952         138 :     sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
     953         138 :     sum_above = _mm_shufflelo_epi16(sum_above, 0);
     954         138 :     const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
     955         138 :     dc_store_16xh(&row, 64, dst, stride);
     956         138 : }
     957             : 
     958        7751 : void eb_aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
     959             :     const uint8_t *above, const uint8_t *left) {
     960             :     (void)left;
     961        7751 :     __m128i sum_above = dc_sum_32(above);
     962        7751 :     const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
     963        7751 :     sum_above = _mm_add_epi16(sum_above, sixteen);
     964        7751 :     sum_above = _mm_srai_epi16(sum_above, 5);
     965        7751 :     sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
     966        7751 :     sum_above = _mm_shufflelo_epi16(sum_above, 0);
     967        7751 :     const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
     968        7751 :     dc_store_32xh(&row, 8, dst, stride);
     969        7751 : }
     970             : 
     971             : // -----------------------------------------------------------------------------
     972             : // DC_LEFT
     973             : 
     974      121184 : void eb_aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
     975             :     const uint8_t *above, const uint8_t *left) {
     976             :     (void)above;
     977      121184 :     __m128i sum_left = dc_sum_8(left);
     978      121184 :     const __m128i four = _mm_set1_epi16((uint16_t)4);
     979      121184 :     sum_left = _mm_add_epi16(sum_left, four);
     980      121184 :     sum_left = _mm_srai_epi16(sum_left, 3);
     981      121184 :     sum_left = _mm_shufflelo_epi16(sum_left, 0);
     982      121184 :     sum_left = _mm_packus_epi16(sum_left, sum_left);
     983             : 
     984      121184 :     const uint32_t pred = _mm_cvtsi128_si32(sum_left);
     985      121184 :     dc_store_4xh(pred, 8, dst, stride);
     986      121184 : }
     987             : 
     988       61725 : void eb_aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
     989             :     const uint8_t *above,
     990             :     const uint8_t *left) {
     991             :     (void)above;
     992       61725 :     __m128i sum_left = dc_sum_16(left);
     993       61725 :     const __m128i eight = _mm_set1_epi16((uint16_t)8);
     994       61725 :     sum_left = _mm_add_epi16(sum_left, eight);
     995       61725 :     sum_left = _mm_srai_epi16(sum_left, 4);
     996       61725 :     sum_left = _mm_shufflelo_epi16(sum_left, 0);
     997       61725 :     sum_left = _mm_packus_epi16(sum_left, sum_left);
     998             : 
     999       61725 :     const uint32_t pred = _mm_cvtsi128_si32(sum_left);
    1000       61725 :     dc_store_4xh(pred, 16, dst, stride);
    1001       61725 : }
    1002             : 
    1003       50950 : void eb_aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
    1004             :     const uint8_t *above, const uint8_t *left) {
    1005             :     (void)above;
    1006       50950 :     __m128i sum_left = dc_sum_4(left);
    1007       50950 :     const __m128i two = _mm_set1_epi16((uint16_t)2);
    1008       50950 :     sum_left = _mm_add_epi16(sum_left, two);
    1009       50950 :     sum_left = _mm_srai_epi16(sum_left, 2);
    1010       50950 :     sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
    1011       50950 :     const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
    1012       50950 :     dc_store_8xh(&row, 4, dst, stride);
    1013       50950 : }
    1014             : 
    1015       64721 : void eb_aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
    1016             :     const uint8_t *above,
    1017             :     const uint8_t *left) {
    1018             :     (void)above;
    1019       64721 :     __m128i sum_left = dc_sum_16(left);
    1020       64721 :     const __m128i eight = _mm_set1_epi16((uint16_t)8);
    1021       64721 :     sum_left = _mm_add_epi16(sum_left, eight);
    1022       64721 :     sum_left = _mm_srai_epi16(sum_left, 4);
    1023       64721 :     sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
    1024       64721 :     const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
    1025       64721 :     dc_store_8xh(&row, 16, dst, stride);
    1026       64721 : }
    1027             : 
    1028        7394 : void eb_aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
    1029             :     const uint8_t *above,
    1030             :     const uint8_t *left) {
    1031             :     (void)above;
    1032        7394 :     __m128i sum_left = dc_sum_32(left);
    1033        7394 :     const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
    1034        7394 :     sum_left = _mm_add_epi16(sum_left, sixteen);
    1035        7394 :     sum_left = _mm_srai_epi16(sum_left, 5);
    1036        7394 :     sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
    1037        7394 :     const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
    1038        7394 :     dc_store_8xh(&row, 32, dst, stride);
    1039        7394 : }
    1040             : 
    1041       15905 : void eb_aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
    1042             :     const uint8_t *above,
    1043             :     const uint8_t *left) {
    1044             :     (void)above;
    1045       15905 :     __m128i sum_left = dc_sum_4(left);
    1046       15905 :     const __m128i two = _mm_set1_epi16((uint16_t)2);
    1047       15905 :     sum_left = _mm_add_epi16(sum_left, two);
    1048       15905 :     sum_left = _mm_srai_epi16(sum_left, 2);
    1049       15905 :     sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
    1050       15905 :     sum_left = _mm_shufflelo_epi16(sum_left, 0);
    1051       15905 :     const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
    1052       15905 :     dc_store_16xh(&row, 4, dst, stride);
    1053       15905 : }
    1054             : 
    1055       23277 : void eb_aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
    1056             :     const uint8_t *above,
    1057             :     const uint8_t *left) {
    1058             :     (void)above;
    1059       23277 :     __m128i sum_left = dc_sum_8(left);
    1060       23277 :     const __m128i four = _mm_set1_epi16((uint16_t)4);
    1061       23277 :     sum_left = _mm_add_epi16(sum_left, four);
    1062       23277 :     sum_left = _mm_srai_epi16(sum_left, 3);
    1063       23277 :     sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
    1064       23277 :     sum_left = _mm_shufflelo_epi16(sum_left, 0);
    1065       23277 :     const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
    1066       23277 :     dc_store_16xh(&row, 8, dst, stride);
    1067       23277 : }
    1068             : 
    1069       17698 : void eb_aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
    1070             :     const uint8_t *above,
    1071             :     const uint8_t *left) {
    1072             :     (void)above;
    1073       17698 :     __m128i sum_left = dc_sum_32(left);
    1074       17698 :     const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
    1075       17698 :     sum_left = _mm_add_epi16(sum_left, sixteen);
    1076       17698 :     sum_left = _mm_srai_epi16(sum_left, 5);
    1077       17698 :     sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
    1078       17698 :     sum_left = _mm_shufflelo_epi16(sum_left, 0);
    1079       17698 :     const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
    1080       17698 :     dc_store_16xh(&row, 32, dst, stride);
    1081       17698 : }
    1082             : 
    1083         358 : void eb_aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
    1084             :     const uint8_t *above,
    1085             :     const uint8_t *left) {
    1086             :     (void)above;
    1087         358 :     __m128i sum_left = dc_sum_64(left);
    1088         358 :     const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
    1089         358 :     sum_left = _mm_add_epi16(sum_left, thirtytwo);
    1090         358 :     sum_left = _mm_srai_epi16(sum_left, 6);
    1091         358 :     sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
    1092         358 :     sum_left = _mm_shufflelo_epi16(sum_left, 0);
    1093         358 :     const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
    1094         358 :     dc_store_16xh(&row, 64, dst, stride);
    1095         358 : }
    1096             : 
    1097        2922 : void eb_aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
    1098             :     const uint8_t *above,
    1099             :     const uint8_t *left) {
    1100             :     (void)above;
    1101        2922 :     __m128i sum_left = dc_sum_8(left);
    1102        2922 :     const __m128i four = _mm_set1_epi16((uint16_t)4);
    1103        2922 :     sum_left = _mm_add_epi16(sum_left, four);
    1104        2922 :     sum_left = _mm_srai_epi16(sum_left, 3);
    1105        2922 :     sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
    1106        2922 :     sum_left = _mm_shufflelo_epi16(sum_left, 0);
    1107        2922 :     const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
    1108        2922 :     dc_store_32xh(&row, 8, dst, stride);
    1109        2922 : }
    1110             : 
    1111             : // -----------------------------------------------------------------------------
    1112             : // DC_PRED
    1113             : 
    1114     1646180 : void eb_aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
    1115             :     const uint8_t *above, const uint8_t *left) {
    1116     1646180 :     const __m128i sum_left = dc_sum_8(left);
    1117     1646190 :     __m128i sum_above = dc_sum_4(above);
    1118     1646180 :     sum_above = _mm_add_epi16(sum_left, sum_above);
    1119             : 
    1120     1646180 :     uint32_t sum = _mm_cvtsi128_si32(sum_above);
    1121     1646180 :     sum += 6;
    1122     1646180 :     sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
    1123             : 
    1124     3292360 :     const __m128i row = _mm_set1_epi8((uint8_t)sum);
    1125     1646180 :     const uint32_t pred = _mm_cvtsi128_si32(row);
    1126     1646180 :     dc_store_4xh(pred, 8, dst, stride);
    1127     1646220 : }
    1128             : 
    1129      427359 : void eb_aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
    1130             :     const uint8_t *above, const uint8_t *left) {
    1131      427359 :     const __m128i sum_left = dc_sum_16(left);
    1132      427358 :     __m128i sum_above = dc_sum_4(above);
    1133      427357 :     sum_above = _mm_add_epi16(sum_left, sum_above);
    1134             : 
    1135      427357 :     uint32_t sum = _mm_cvtsi128_si32(sum_above);
    1136      427357 :     sum += 10;
    1137      427357 :     sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
    1138             : 
    1139      854714 :     const __m128i row = _mm_set1_epi8((uint8_t)sum);
    1140      427357 :     const uint32_t pred = _mm_cvtsi128_si32(row);
    1141      427357 :     dc_store_4xh(pred, 16, dst, stride);
    1142      427354 : }
    1143             : 
    1144     1589560 : void eb_aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
    1145             :     const uint8_t *above, const uint8_t *left) {
    1146     1589560 :     const __m128i sum_left = dc_sum_4(left);
    1147     1589560 :     __m128i sum_above = dc_sum_8(above);
    1148     1589550 :     sum_above = _mm_add_epi16(sum_above, sum_left);
    1149             : 
    1150     1589550 :     uint32_t sum = _mm_cvtsi128_si32(sum_above);
    1151     1589550 :     sum += 6;
    1152     1589550 :     sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
    1153             : 
    1154     1589550 :     const __m128i row = _mm_set1_epi8((uint8_t)sum);
    1155     1589550 :     dc_store_8xh(&row, 4, dst, stride);
    1156     1589560 : }
    1157             : 
    1158      537524 : void eb_aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
    1159             :     const uint8_t *above, const uint8_t *left) {
    1160      537524 :     const __m128i sum_left = dc_sum_16(left);
    1161      537524 :     __m128i sum_above = dc_sum_8(above);
    1162      537525 :     sum_above = _mm_add_epi16(sum_above, sum_left);
    1163             : 
    1164      537525 :     uint32_t sum = _mm_cvtsi128_si32(sum_above);
    1165      537525 :     sum += 12;
    1166      537525 :     sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
    1167      537525 :     const __m128i row = _mm_set1_epi8((uint8_t)sum);
    1168      537525 :     dc_store_8xh(&row, 16, dst, stride);
    1169      537527 : }
    1170             : 
    1171       30836 : void eb_aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
    1172             :     const uint8_t *above, const uint8_t *left) {
    1173       30836 :     const __m128i sum_left = dc_sum_32(left);
    1174       30836 :     __m128i sum_above = dc_sum_8(above);
    1175       30836 :     sum_above = _mm_add_epi16(sum_above, sum_left);
    1176             : 
    1177       30836 :     uint32_t sum = _mm_cvtsi128_si32(sum_above);
    1178       30836 :     sum += 20;
    1179       30836 :     sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
    1180       30836 :     const __m128i row = _mm_set1_epi8((uint8_t)sum);
    1181       30836 :     dc_store_8xh(&row, 32, dst, stride);
    1182       30836 : }
    1183             : 
    1184      417239 : void eb_aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
    1185             :     const uint8_t *above, const uint8_t *left) {
    1186      417239 :     const __m128i sum_left = dc_sum_4(left);
    1187      417240 :     __m128i sum_above = dc_sum_16(above);
    1188      417238 :     sum_above = _mm_add_epi16(sum_above, sum_left);
    1189             : 
    1190      417238 :     uint32_t sum = _mm_cvtsi128_si32(sum_above);
    1191      417238 :     sum += 10;
    1192      417238 :     sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
    1193      417238 :     const __m128i row = _mm_set1_epi8((uint8_t)sum);
    1194      417238 :     dc_store_16xh(&row, 4, dst, stride);
    1195      417238 : }
    1196             : 
    1197      490644 : void eb_aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
    1198             :     const uint8_t *above, const uint8_t *left) {
    1199      490644 :     const __m128i sum_left = dc_sum_8(left);
    1200      490643 :     __m128i sum_above = dc_sum_16(above);
    1201      490642 :     sum_above = _mm_add_epi16(sum_above, sum_left);
    1202             : 
    1203      490642 :     uint32_t sum = _mm_cvtsi128_si32(sum_above);
    1204      490642 :     sum += 12;
    1205      490642 :     sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
    1206      490642 :     const __m128i row = _mm_set1_epi8((uint8_t)sum);
    1207      490642 :     dc_store_16xh(&row, 8, dst, stride);
    1208      490642 : }
    1209             : 
    1210      110441 : void eb_aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
    1211             :     const uint8_t *above, const uint8_t *left) {
    1212      110441 :     const __m128i sum_left = dc_sum_32(left);
    1213      110440 :     __m128i sum_above = dc_sum_16(above);
    1214      110440 :     sum_above = _mm_add_epi16(sum_left, sum_above);
    1215             : 
    1216      110440 :     uint32_t sum = _mm_cvtsi128_si32(sum_above);
    1217      110440 :     sum += 24;
    1218      110440 :     sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
    1219      110440 :     const __m128i row = _mm_set1_epi8((uint8_t)sum);
    1220      110440 :     dc_store_16xh(&row, 32, dst, stride);
    1221      110442 : }
    1222             : 
    1223        2907 : void eb_aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
    1224             :     const uint8_t *above, const uint8_t *left) {
    1225        2907 :     const __m128i sum_left = dc_sum_64(left);
    1226        2907 :     __m128i sum_above = dc_sum_16(above);
    1227        2907 :     sum_above = _mm_add_epi16(sum_left, sum_above);
    1228             : 
    1229        2907 :     uint32_t sum = _mm_cvtsi128_si32(sum_above);
    1230        2907 :     sum += 40;
    1231        2907 :     sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);
    1232        2907 :     const __m128i row = _mm_set1_epi8((uint8_t)sum);
    1233        2907 :     dc_store_16xh(&row, 64, dst, stride);
    1234        2907 : }
    1235             : 
    1236       21086 : void eb_aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
    1237             :     const uint8_t *above, const uint8_t *left) {
    1238       21086 :     __m128i sum_above = dc_sum_32(above);
    1239       21086 :     const __m128i sum_left = dc_sum_8(left);
    1240       21086 :     sum_above = _mm_add_epi16(sum_above, sum_left);
    1241             : 
    1242       21086 :     uint32_t sum = _mm_cvtsi128_si32(sum_above);
    1243       21086 :     sum += 20;
    1244       21086 :     sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
    1245       21086 :     const __m128i row = _mm_set1_epi8((uint8_t)sum);
    1246       21086 :     dc_store_32xh(&row, 8, dst, stride);
    1247       21086 : }

Generated by: LCOV version 1.14