LCOV - coverage.info - ASM_SSSE3/EbHighbdIntraPrediction_Intrinsic

LCOV - code coverage report

Current view:	top level - ASM_SSSE3 - EbHighbdIntraPrediction_Intrinsic_SSSE3.c (source / functions)		Hit	Total	Coverage
Test:	coverage.info	Lines:	159	553	28.8 %
Date:	2019-11-25 17:38:06	Functions:	11	43	25.6 %

          Line data    Source code

       1             : /*
       2             : * Copyright(c) 2019 Intel Corporation
       3             : * SPDX-License-Identifier: BSD-2-Clause-Patent
       4             : */
       5             : #include <tmmintrin.h>
       6             : #include "EbDefinitions.h"
       7             : #include "aom_dsp_rtcd.h"
       8             : 
       9             : static const int32_t sm_weight_log2_scale = 8; // every (w, 256 - w) weight pair below sums to 1 << sm_weight_log2_scale
      10             : 
      11             : // =============================================================================
      12             : 
      13             : // SMOOTH_PRED
      14             : 
      15             : // bs = 4: four interleaved (w, 256 - w) weight pairs
      16             : EB_ALIGN(16) static const uint16_t sm_weights_4[8] = { // 16-byte aligned because it is read with _mm_load_si128
      17             :     255, 1, 149, 107, 85, 171, 64, 192
      18             : };
      19             : 
      20             : // bs = 8: eight interleaved (w, 256 - w) weight pairs
      21             : EB_ALIGN(32) static const uint16_t sm_weights_8[16] = { // read in 8-entry (16-byte) halves: rows 0..3, then rows 4..7
      22             :     255, 1, 197, 59, 146, 110, 105, 151,
      23             :     73, 183, 50, 206, 37, 219, 32, 224
      24             : };
      25             : 
      26             : // bs = 16: sixteen interleaved (w, 256 - w) weight pairs
      27             : EB_ALIGN(32) static const uint16_t sm_weights_16[32] = { // read in four 8-entry (16-byte) groups, four rows per group
      28             :     255, 1, 225, 31, 196, 60, 170, 86,
      29             :     145, 111, 123, 133, 102, 154, 84, 172,
      30             :     68, 188, 54, 202, 43, 213, 33, 223,
      31             :     26, 230, 20, 236, 17, 239, 16, 240,
      32             : };
      33             : 
      34             : // 4xN
      35             : 
      36           0 : static INLINE void load_right_weights_4(const uint16_t *const above, // outputs the broadcast right-edge pixel (*r) and the 4-wide weight table (*weights)
      37             :     __m128i *const r, __m128i *const weights)
      38             : {
      39           0 :     *r = _mm_set1_epi16((uint16_t)above[3]); // above[3] replicated into all eight 16-bit lanes
      40           0 :     *weights = _mm_load_si128((const __m128i *)sm_weights_4); // aligned load of all 8 entries of sm_weights_4
      41           0 : }
      42             : 
      43           0 : static INLINE void init_4(const uint16_t *const above, // common setup for the 4-wide SMOOTH predictors of height h
      44             :     const uint16_t *const left, const int32_t h, __m128i *const ab,
      45             :     __m128i *const r, __m128i *const weights_w, __m128i *const rep)
      46             : {
      47           0 :     const __m128i a = _mm_loadl_epi64((const __m128i *)above); // above[0..3] in the low 64 bits
      48           0 :     const __m128i b = _mm_set1_epi16((uint16_t)left[h - 1]); // last left pixel, broadcast to every 16-bit lane
      49           0 :     *ab = _mm_unpacklo_epi16(a, b); // interleaved: a0 b a1 b a2 b a3 b
      50           0 :     load_right_weights_4(above, r, weights_w); // *r = broadcast above[3], *weights_w = sm_weights_4
      51             : 
      52           0 :     rep[0] = _mm_set1_epi32(0x03020100); // byte-shuffle mask: replicate 32-bit element 0 into all four 32-bit lanes
      53           0 :     rep[1] = _mm_set1_epi32(0x07060504); // same for 32-bit element 1
      54           0 :     rep[2] = _mm_set1_epi32(0x0B0A0908); // same for 32-bit element 2
      55           0 :     rep[3] = _mm_set1_epi32(0x0F0E0D0C); // same for 32-bit element 3
      56           0 : }
      57             : 
      58           0 : static INLINE void load_left_8(const uint16_t *const left, // loads left[0..7] and interleaves each pixel with the matching lane of r
      59             :     const __m128i r, __m128i *const lr)
      60             : {
      61           0 :     const __m128i l = _mm_load_si128((const __m128i *)left); // NOTE(review): aligned 128-bit load; presumably needs a 16-byte-aligned left - confirm callers guarantee it
      62           0 :     lr[0] = _mm_unpacklo_epi16(l, r); // entries 0 1 2 3: l0 r0 l1 r1 l2 r2 l3 r3
      63           0 :     lr[1] = _mm_unpackhi_epi16(l, r); // entries 4 5 6 7: l4 r4 l5 r5 l6 r6 l7 r7
      64           0 : }
      65             : 
      66           0 : static INLINE __m128i smooth_pred_4(const __m128i weights_w, // returns four 32-bit results for the row selected by rep
      67             :     const __m128i weights_h, const __m128i rep,
      68             :     const __m128i ab, const __m128i lr)
      69             : {
      70           0 :     const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale)); // half of the 1 << (1 + scale) divisor applied by the shift below
      71           0 :     const __m128i w = _mm_shuffle_epi8(weights_h, rep); // 32-bit element of weights_h selected by rep, copied into all four lanes
      72           0 :     const __m128i t = _mm_shuffle_epi8(lr, rep); // 32-bit element of lr selected by rep, copied into all four lanes
      73           0 :     const __m128i s0 = _mm_madd_epi16(ab, w); // per 32-bit lane: ab.lo * w.lo + ab.hi * w.hi
      74           0 :     const __m128i s1 = _mm_madd_epi16(t, weights_w); // per 32-bit lane: t.lo * weights_w.lo + t.hi * weights_w.hi
      75             :     __m128i sum;
      76             : 
      77           0 :     sum = _mm_add_epi32(s0, s1); // add the two weighted sums
      78           0 :     sum = _mm_add_epi32(sum, round);
      79           0 :     sum = _mm_srai_epi32(sum, 1 + sm_weight_log2_scale); // rounded divide by 2 * 256
      80           0 :     return sum;
      81             : }
      82             : 
      83           0 : static INLINE void smooth_pred_4x2(const __m128i weights_w, // writes two 4-pixel rows (rep[0], rep[1]) and advances *dst by two strides
      84             :     const __m128i weights_h, const __m128i *const rep, const __m128i ab,
      85             :     const __m128i lr, uint16_t **const dst, const ptrdiff_t stride)
      86             : {
      87           0 :     const __m128i sum0 = smooth_pred_4(weights_w, weights_h, rep[0], ab, lr); // first row
      88           0 :     const __m128i sum1 = smooth_pred_4(weights_w, weights_h, rep[1], ab, lr); // second row
      89           0 :     const __m128i sum = _mm_packs_epi32(sum0, sum1); // narrow both rows to 16-bit with signed saturation: row0 | row1
      90           0 :     _mm_storel_epi64((__m128i *)*dst, sum); // low 64 bits = first row
      91           0 :     *dst += stride; // stride is counted in uint16_t elements
      92           0 :     _mm_storeh_pd((double *)*dst, _mm_castsi128_pd(sum)); // high 64 bits = second row
      93           0 :     *dst += stride;
      94           0 : }
      95             : 
      96           0 : static INLINE void smooth_pred_4x4(const __m128i weights_w, // writes four 4-pixel rows; rep must hold the four masks built by init_4
      97             :     const __m128i weights_h, const __m128i *const rep, const __m128i ab,
      98             :     const __m128i lr, uint16_t **const dst, const ptrdiff_t stride)
      99             : {
     100           0 :     smooth_pred_4x2(weights_w, weights_h, rep + 0, ab, lr, dst, stride); // rows 0 and 1 (rep[0], rep[1])
     101           0 :     smooth_pred_4x2(weights_w, weights_h, rep + 2, ab, lr, dst, stride); // rows 2 and 3 (rep[2], rep[3])
     102           0 : }
     103             : 
     104             : // 4x4
     105             : 
     106           0 : void eb_aom_highbd_smooth_predictor_4x4_ssse3(uint16_t *dst, ptrdiff_t stride, // 4x4 SMOOTH predictor on 16-bit pixels; stride is in uint16_t elements
     107             :     const uint16_t *above, const uint16_t *left, int32_t bd)
     108             : {
     109           0 :     const __m128i l = _mm_loadl_epi64((const __m128i *)left); // left[0..3]
     110             :     __m128i ab, r, lr, weights_w, rep[4];
     111             :     (void)bd; // bd is not used by this routine
     112             : 
     113           0 :     init_4(above, left, 4, &ab, &r, &weights_w, rep); // ab = above interleaved with left[3], r = broadcast above[3]
     114           0 :     lr = _mm_unpacklo_epi16(l, r); // l0 r l1 r l2 r l3 r
     115           0 :     smooth_pred_4x4(weights_w, weights_w, rep, ab, lr, &dst, stride); // sm_weights_4 serves as both the horizontal and the vertical weights
     116           0 : }
     117             : 
     118             : // 4x8
     119             : 
     120           0 : void eb_aom_highbd_smooth_predictor_4x8_ssse3(uint16_t *dst, ptrdiff_t stride, // 4x8 SMOOTH predictor on 16-bit pixels; stride is in uint16_t elements
     121             :     const uint16_t *above, const uint16_t *left, int32_t bd)
     122             : {
     123             :     __m128i ab, r, lr[2], weights_w, weights_h, rep[4];
     124             :     (void)bd; // bd is not used by this routine
     125             : 
     126           0 :     init_4(above, left, 8, &ab, &r, &weights_w, rep); // ab = above interleaved with left[7], r = broadcast above[3]
     127           0 :     load_left_8(left, r, lr); // lr[0] = rows 0..3, lr[1] = rows 4..7 (uses an aligned load of left)
     128           0 :     weights_h = _mm_load_si128((const __m128i *)(sm_weights_8 + 0)); // vertical weight pairs for rows 0..3
     129           0 :     smooth_pred_4x4(weights_w, weights_h, rep, ab, lr[0], &dst, stride);
     130           0 :     weights_h = _mm_load_si128((const __m128i *)(sm_weights_8 + 8)); // vertical weight pairs for rows 4..7
     131           0 :     smooth_pred_4x4(weights_w, weights_h, rep, ab, lr[1], &dst, stride);
     132           0 : }
     133             : 
     134             : // 4x16
     135             : 
     136           0 : void eb_aom_highbd_smooth_predictor_4x16_ssse3(uint16_t *dst, ptrdiff_t stride, // 4x16 SMOOTH predictor on 16-bit pixels; stride is in uint16_t elements
     137             :     const uint16_t *above, const uint16_t *left, int32_t bd)
     138             : {
     139             :     __m128i ab, r, lr[2], weights_w, weights_h, rep[4];
     140             :     (void)bd; // bd is not used by this routine
     141             : 
     142           0 :     init_4(above, left, 16, &ab, &r, &weights_w, rep); // ab = above interleaved with left[15], r = broadcast above[3]
     143             : 
     144           0 :     load_left_8(left + 0, r, lr); // left pixels for rows 0..7
     145           0 :     weights_h = _mm_load_si128((const __m128i *)(sm_weights_16 + 0)); // vertical weight pairs for rows 0..3
     146           0 :     smooth_pred_4x4(weights_w, weights_h, rep, ab, lr[0], &dst, stride);
     147           0 :     weights_h = _mm_load_si128((const __m128i *)(sm_weights_16 + 8)); // rows 4..7
     148           0 :     smooth_pred_4x4(weights_w, weights_h, rep, ab, lr[1], &dst, stride);
     149             : 
     150           0 :     load_left_8(left + 8, r, lr); // left pixels for rows 8..15
     151           0 :     weights_h = _mm_load_si128((const __m128i *)(sm_weights_16 + 16)); // rows 8..11
     152           0 :     smooth_pred_4x4(weights_w, weights_h, rep, ab, lr[0], &dst, stride);
     153           0 :     weights_h = _mm_load_si128((const __m128i *)(sm_weights_16 + 24)); // rows 12..15
     154           0 :     smooth_pred_4x4(weights_w, weights_h, rep, ab, lr[1], &dst, stride);
     155           0 : }
     156             : 
     157             : // =============================================================================
     158             : 
     159             : // SMOOTH_H_PRED
     160             : 
     161             : // 4xN
     162             : 
     163           0 : static INLINE __m128i smooth_h_pred_4(const __m128i weights, // returns one row of four 32-bit results and advances *lr to the next row
     164             :     __m128i *const lr)
     165             : {
     166           0 :     const __m128i round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1))); // half of the 1 << scale divisor
     167           0 :     const __m128i rep = _mm_set1_epi32(0x03020100); // byte-shuffle mask: replicate the lowest 32-bit element
     168           0 :     const __m128i t = _mm_shuffle_epi8(*lr, rep); // lowest 32-bit element of *lr (a pixel pair) copied to all four lanes
     169           0 :     const __m128i sum0 = _mm_madd_epi16(t, weights); // per 32-bit lane: t.lo * weights.lo + t.hi * weights.hi
     170           0 :     const __m128i sum1 = _mm_add_epi32(sum0, round);
     171           0 :     const __m128i sum2 = _mm_srai_epi32(sum1, sm_weight_log2_scale); // rounded divide by 256
     172           0 :     *lr = _mm_srli_si128(*lr, 4); // side effect: drop the consumed pair so the next call sees the following one
     173           0 :     return sum2;
     174             : }
     175             : 
     176           0 : static INLINE void smooth_h_pred_4x2(const __m128i weights, __m128i *const lr, // writes two 4-pixel rows, consuming two pairs from *lr; advances *dst by two strides
     177             :     uint16_t **const dst, const ptrdiff_t stride)
     178             : {
     179           0 :     const __m128i sum0 = smooth_h_pred_4(weights, lr); // first row; shifts *lr
     180           0 :     const __m128i sum1 = smooth_h_pred_4(weights, lr); // second row; shifts *lr again
     181           0 :     const __m128i sum = _mm_packs_epi32(sum0, sum1); // narrow both rows to 16-bit with signed saturation
     182           0 :     _mm_storel_epi64((__m128i *)*dst, sum); // low 64 bits = first row
     183           0 :     *dst += stride; // stride is counted in uint16_t elements
     184           0 :     _mm_storeh_pd((double *)*dst, _mm_castsi128_pd(sum)); // high 64 bits = second row
     185           0 :     *dst += stride;
     186           0 : }
     187             : 
     188           0 : static INLINE void smooth_h_pred_4x4(const __m128i weights, __m128i *const lr, // writes four 4-pixel rows, consuming all four pairs held in *lr
     189             :     uint16_t **const dst, const ptrdiff_t stride)
     190             : {
     191           0 :     smooth_h_pred_4x2(weights, lr, dst, stride); // rows 0 and 1
     192           0 :     smooth_h_pred_4x2(weights, lr, dst, stride); // rows 2 and 3 (*lr was already shifted by the first call)
     193           0 : }
     194             : 
     195             : // 4x4
     196             : 
     197           0 : void eb_aom_highbd_smooth_h_predictor_4x4_ssse3(uint16_t *dst, ptrdiff_t stride, // 4x4 SMOOTH_H predictor on 16-bit pixels; stride is in uint16_t elements
     198             :     const uint16_t *above, const uint16_t *left, int32_t bd)
     199             : {
     200           0 :     const __m128i l = _mm_loadl_epi64((const __m128i *)left); // left[0..3]
     201             :     __m128i r, weights;
     202             :     (void)bd; // bd is not used by this routine
     203             : 
     204           0 :     load_right_weights_4(above, &r, &weights); // only above[3] is read from the above row
     205           0 :     __m128i lr = _mm_unpacklo_epi16(l, r); // l0 r l1 r l2 r l3 r
     206           0 :     smooth_h_pred_4x4(weights, &lr, &dst, stride);
     207           0 : }
     208             : 
     209             : // 4x8
     210             : 
     211           0 : void eb_aom_highbd_smooth_h_predictor_4x8_ssse3(uint16_t *dst, ptrdiff_t stride, // 4x8 SMOOTH_H predictor on 16-bit pixels; stride is in uint16_t elements
     212             :     const uint16_t *above, const uint16_t *left, int32_t bd)
     213             : {
     214             :     __m128i r, lr[2], weights;
     215             :     (void)bd; // bd is not used by this routine
     216             : 
     217           0 :     load_right_weights_4(above, &r, &weights); // only above[3] is read from the above row
     218           0 :     load_left_8(left, r, lr); // lr[0] = rows 0..3, lr[1] = rows 4..7 (uses an aligned load of left)
     219           0 :     smooth_h_pred_4x4(weights, &lr[0], &dst, stride); // rows 0..3
     220           0 :     smooth_h_pred_4x4(weights, &lr[1], &dst, stride); // rows 4..7
     221           0 : }
     222             : 
     223             : // 4x16
     224             : 
     225           0 : void eb_aom_highbd_smooth_h_predictor_4x16_ssse3(uint16_t *dst, ptrdiff_t stride, // 4x16 SMOOTH_H predictor on 16-bit pixels; stride is in uint16_t elements
     226             :     const uint16_t *above, const uint16_t *left, int32_t bd)
     227             : {
     228             :     __m128i r, lr[2], weights;
     229             :     (void)bd; // bd is not used by this routine
     230             : 
     231           0 :     load_right_weights_4(above, &r, &weights); // only above[3] is read from the above row
     232           0 :     load_left_8(left + 0, r, lr); // left pixels for rows 0..7
     233           0 :     smooth_h_pred_4x4(weights, &lr[0], &dst, stride); // rows 0..3
     234           0 :     smooth_h_pred_4x4(weights, &lr[1], &dst, stride); // rows 4..7
     235           0 :     load_left_8(left + 8, r, lr); // left pixels for rows 8..15
     236           0 :     smooth_h_pred_4x4(weights, &lr[0], &dst, stride); // rows 8..11
     237           0 :     smooth_h_pred_4x4(weights, &lr[1], &dst, stride); // rows 12..15
     238           0 : }
     239             : 
     240             : // =============================================================================
     241             : 
     242             : // SMOOTH_V_PRED
     243             : 
     244             : // 4xN
     245             : 
     246           0 : static INLINE void smooth_v_init_4(const uint16_t *const above, // setup for the 4-wide SMOOTH_V predictors of height h
     247             :     const uint16_t *const left, const int32_t h, __m128i *const ab,
     248             :     __m128i *const rep)
     249             : {
     250           0 :     const __m128i a = _mm_loadl_epi64((const __m128i *)above); // above[0..3] in the low 64 bits
     251           0 :     const __m128i b = _mm_set1_epi16((uint16_t)left[h - 1]); // last left pixel, broadcast to every 16-bit lane
     252           0 :     *ab = _mm_unpacklo_epi16(a, b); // interleaved: a0 b a1 b a2 b a3 b
     253             : 
     254           0 :     rep[0] = _mm_set1_epi32(0x03020100); // byte-shuffle mask: replicate 32-bit element 0 into all four 32-bit lanes
     255           0 :     rep[1] = _mm_set1_epi32(0x07060504); // same for 32-bit element 1
     256           0 :     rep[2] = _mm_set1_epi32(0x0B0A0908); // same for 32-bit element 2
     257           0 :     rep[3] = _mm_set1_epi32(0x0F0E0D0C); // same for 32-bit element 3
     258           0 : }
     259             : 
     260           0 : static INLINE __m128i smooth_v_pred_4(const __m128i weights, const __m128i rep, // returns four 32-bit results for the row whose weight pair rep selects
     261             :     const __m128i ab)
     262             : {
     263           0 :     const __m128i round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1))); // half of the 1 << scale divisor
     264           0 :     const __m128i w = _mm_shuffle_epi8(weights, rep); // 32-bit element of weights selected by rep, copied into all four lanes
     265           0 :     const __m128i sum0 = _mm_madd_epi16(ab, w); // per 32-bit lane: ab.lo * w.lo + ab.hi * w.hi
     266             :     __m128i sum;
     267             : 
     268           0 :     sum = _mm_add_epi32(sum0, round);
     269           0 :     sum = _mm_srai_epi32(sum, sm_weight_log2_scale); // rounded divide by 256
     270           0 :     return sum;
     271             : }
     272             : 
     273           0 : static INLINE void smooth_v_pred_4x2(const __m128i weights, // writes two 4-pixel rows (rep[0], rep[1]) and advances *dst by two strides
     274             :     const __m128i *const rep, const __m128i ab, uint16_t **const dst,
     275             :     const ptrdiff_t stride)
     276             : {
     277           0 :     const __m128i sum0 = smooth_v_pred_4(weights, rep[0], ab); // first row
     278           0 :     const __m128i sum1 = smooth_v_pred_4(weights, rep[1], ab); // second row
     279           0 :     const __m128i sum = _mm_packs_epi32(sum0, sum1); // narrow both rows to 16-bit with signed saturation
     280           0 :     _mm_storel_epi64((__m128i *)*dst, sum); // low 64 bits = first row
     281           0 :     *dst += stride; // stride is counted in uint16_t elements
     282           0 :     _mm_storeh_pd((double *)*dst, _mm_castsi128_pd(sum)); // high 64 bits = second row
     283           0 :     *dst += stride;
     284           0 : }
     285             : 
     286           0 : static INLINE void smooth_v_pred_4x4(const __m128i weights, // writes four 4-pixel rows; rep must hold the four masks built by smooth_v_init_4
     287             :     const __m128i *const rep, const __m128i ab, uint16_t **const dst,
     288             :     const ptrdiff_t stride)
     289             : {
     290           0 :     smooth_v_pred_4x2(weights, rep + 0, ab, dst, stride); // rows 0 and 1 (rep[0], rep[1])
     291           0 :     smooth_v_pred_4x2(weights, rep + 2, ab, dst, stride); // rows 2 and 3 (rep[2], rep[3])
     292           0 : }
     293             : 
     294             : // 4x4
     295             : 
     296           0 : void eb_aom_highbd_smooth_v_predictor_4x4_ssse3(uint16_t *dst, ptrdiff_t stride, // 4x4 SMOOTH_V predictor on 16-bit pixels; stride is in uint16_t elements
     297             :     const uint16_t *above, const uint16_t *left, int32_t bd)
     298             : {
     299             :     __m128i ab, rep[4];
     300             :     (void)bd; // bd is not used by this routine
     301             : 
     302           0 :     smooth_v_init_4(above, left, 4, &ab, rep); // ab = above[0..3] interleaved with left[3]
     303           0 :     const __m128i weights = _mm_load_si128((const __m128i *)sm_weights_4); // vertical weight pairs for rows 0..3
     304           0 :     smooth_v_pred_4x4(weights, rep, ab, &dst, stride);
     305           0 : }
     306             : 
     307             : // 4x8
     308             : 
     309           0 : void eb_aom_highbd_smooth_v_predictor_4x8_ssse3(uint16_t *dst, ptrdiff_t stride, // 4x8 SMOOTH_V predictor on 16-bit pixels; stride is in uint16_t elements
     310             :     const uint16_t *above, const uint16_t *left, int32_t bd)
     311             : {
     312             :     __m128i ab, weights, rep[4];
     313             :     (void)bd; // bd is not used by this routine
     314             : 
     315           0 :     smooth_v_init_4(above, left, 8, &ab, rep); // ab = above[0..3] interleaved with left[7]
     316           0 :     weights = _mm_load_si128((const __m128i *)(sm_weights_8 + 0)); // vertical weight pairs for rows 0..3
     317           0 :     smooth_v_pred_4x4(weights, rep, ab, &dst, stride);
     318           0 :     weights = _mm_load_si128((const __m128i *)(sm_weights_8 + 8)); // vertical weight pairs for rows 4..7
     319           0 :     smooth_v_pred_4x4(weights, rep, ab, &dst, stride);
     320           0 : }
     321             : 
     322             : // 4x16
     323             : 
     324           0 : void eb_aom_highbd_smooth_v_predictor_4x16_ssse3(uint16_t *dst, ptrdiff_t stride, // 4x16 SMOOTH_V predictor on 16-bit pixels; stride is in uint16_t elements
     325             :     const uint16_t *above, const uint16_t *left, int32_t bd)
     326             : {
     327             :     __m128i ab, weights, rep[4];
     328             :     (void)bd; // bd is not used by this routine
     329             : 
     330           0 :     smooth_v_init_4(above, left, 16, &ab, rep); // ab = above[0..3] interleaved with left[15]
     331             : 
     332           0 :     weights = _mm_load_si128((const __m128i *)(sm_weights_16 + 0)); // vertical weight pairs for rows 0..3
     333           0 :     smooth_v_pred_4x4(weights, rep, ab, &dst, stride);
     334           0 :     weights = _mm_load_si128((const __m128i *)(sm_weights_16 + 8)); // rows 4..7
     335           0 :     smooth_v_pred_4x4(weights, rep, ab, &dst, stride);
     336             : 
     337           0 :     weights = _mm_load_si128((const __m128i *)(sm_weights_16 + 16)); // rows 8..11
     338           0 :     smooth_v_pred_4x4(weights, rep, ab, &dst, stride);
     339           0 :     weights = _mm_load_si128((const __m128i *)(sm_weights_16 + 24)); // rows 12..15
     340           0 :     smooth_v_pred_4x4(weights, rep, ab, &dst, stride);
     341           0 : }
     342             : 
     343             : // Return 8 16-bit pixels in one row
     344     3240830 : static INLINE __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top, // per 16-bit lane, picks whichever of left/top/topleft is nearest to top + left - topleft
     345             :                                      const __m128i *topleft) {
     346     6481660 :   const __m128i base = _mm_sub_epi16(_mm_add_epi16(*top, *left), *topleft); // base = top + left - topleft
     347             : 
     348     6481660 :   __m128i pl = _mm_abs_epi16(_mm_sub_epi16(base, *left)); // |base - left|
     349     6481660 :   __m128i pt = _mm_abs_epi16(_mm_sub_epi16(base, *top)); // |base - top|
     350     9722480 :   __m128i ptl = _mm_abs_epi16(_mm_sub_epi16(base, *topleft)); // |base - topleft|
     351             : 
     352     3240830 :   __m128i mask1 = _mm_cmpgt_epi16(pl, pt);
     353     6481660 :   mask1 = _mm_or_si128(mask1, _mm_cmpgt_epi16(pl, ptl)); // mask1: left is farther than top or topleft, so left is rejected
     354     3240830 :   __m128i mask2 = _mm_cmpgt_epi16(pt, ptl); // mask2: top is farther than topleft
     355             : 
     356     3240830 :   pl = _mm_andnot_si128(mask1, *left); // left where it wins (ties favour left), zero elsewhere
     357             : 
     358     3240830 :   ptl = _mm_and_si128(mask2, *topleft); // topleft where it beats top
     359     6481660 :   pt = _mm_andnot_si128(mask2, *top); // top otherwise (ties favour top)
     360     3240830 :   pt = _mm_or_si128(pt, ptl); // per-lane choice between top and topleft
     361     3240830 :   pt = _mm_and_si128(mask1, pt); // keep that choice only where left was rejected
     362             : 
     363     3240830 :   return _mm_or_si128(pl, pt); // merge the two disjoint selections
     364             : }
     365             : 
     366      131225 : void eb_aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, // 4x4 Paeth predictor on 8-bit pixels; stride is in bytes; reads above[-1] as the top-left pixel
     367             :                                    const uint8_t *above, const uint8_t *left) {
     368      131225 :   __m128i l = _mm_loadl_epi64((const __m128i *)left); // loads 8 bytes although only left[0..3] are selected below - NOTE(review): confirm the extra bytes are readable
     369      131225 :   const __m128i t = _mm_loadl_epi64((const __m128i *)above); // loads 8 bytes of above; only the first 4 results are stored
     370      131225 :   const __m128i zero = _mm_setzero_si128();
     371      131225 :   const __m128i t16 = _mm_unpacklo_epi8(t, zero); // above pixels zero-extended to 16-bit lanes
     372      262450 :   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); // top-left pixel broadcast to all lanes
     373      131225 :   __m128i rep = _mm_set1_epi16(0x8000); // shuffle control per 16-bit lane: low byte = source index 0, high byte 0x80 = write zero
     374      131225 :   const __m128i one = _mm_set1_epi16(1);
     375             : 
     376             :   int i;
     377      656119 :   for (i = 0; i < 4; ++i) { // one output row per iteration
     378      524896 :     const __m128i l16 = _mm_shuffle_epi8(l, rep); // left[i] zero-extended and broadcast to all eight 16-bit lanes
     379      524896 :     const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
     380             : 
     381      524894 :     *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row)); // stores 4 packed pixels - NOTE(review): type-punned uint32_t store through uint8_t*; confirm aliasing/alignment is acceptable
     382      524894 :     dst += stride;
     383      524894 :     rep = _mm_add_epi16(rep, one); // advance the source byte index to left[i + 1]
     384             :   }
     385      131223 : }
     386             : 
     387       44528 : void eb_aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride, // 4x8 Paeth predictor on 8-bit pixels; stride is in bytes; reads above[-1] as the top-left pixel
     388             :                                    const uint8_t *above, const uint8_t *left) {
     389       44528 :   __m128i l = _mm_loadl_epi64((const __m128i *)left); // left[0..7]
     390       44528 :   const __m128i t = _mm_loadl_epi64((const __m128i *)above); // loads 8 bytes of above; only the first 4 results are stored
     391       44528 :   const __m128i zero = _mm_setzero_si128();
     392       44528 :   const __m128i t16 = _mm_unpacklo_epi8(t, zero); // above pixels zero-extended to 16-bit lanes
     393       89056 :   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); // top-left pixel broadcast to all lanes
     394       44528 :   __m128i rep = _mm_set1_epi16(0x8000); // shuffle control per 16-bit lane: low byte = source index 0, high byte 0x80 = write zero
     395       44528 :   const __m128i one = _mm_set1_epi16(1);
     396             : 
     397             :   int i;
     398      400747 :   for (i = 0; i < 8; ++i) { // one output row per iteration
     399      356220 :     const __m128i l16 = _mm_shuffle_epi8(l, rep); // left[i] zero-extended and broadcast to all eight 16-bit lanes
     400      356220 :     const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
     401             : 
     402      356219 :     *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row)); // stores 4 packed pixels - NOTE(review): type-punned uint32_t store through uint8_t*; confirm aliasing/alignment is acceptable
     403      356219 :     dst += stride;
     404      356219 :     rep = _mm_add_epi16(rep, one); // advance the source byte index to left[i + 1]
     405             :   }
     406       44527 : }
     407             : 
     408       20899 : void eb_aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride, // 4x16 Paeth predictor on 8-bit pixels; stride is in bytes; reads above[-1] as the top-left pixel
     409             :                                     const uint8_t *above, const uint8_t *left) {
     410       20899 :   __m128i l = _mm_load_si128((const __m128i *)left); // left[0..15] - NOTE(review): aligned load; presumably needs a 16-byte-aligned left, confirm with callers
     411       41798 :   const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)above)[0]); // above[0..3] read as one 32-bit word - NOTE(review): type-punned read through uint8_t*
     412       20899 :   const __m128i zero = _mm_setzero_si128();
     413       20899 :   const __m128i t16 = _mm_unpacklo_epi8(t, zero); // above pixels zero-extended to 16-bit lanes (upper four lanes are zero)
     414       41798 :   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); // top-left pixel broadcast to all lanes
     415       20899 :   __m128i rep = _mm_set1_epi16(0x8000); // shuffle control per 16-bit lane: low byte = source index 0, high byte 0x80 = write zero
     416       20899 :   const __m128i one = _mm_set1_epi16(1);
     417             : 
     418      355275 :   for (int i = 0; i < 16; ++i) { // one output row per iteration
     419      334374 :     const __m128i l16 = _mm_shuffle_epi8(l, rep); // left[i] zero-extended and broadcast to all eight 16-bit lanes
     420      334374 :     const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
     421             : 
     422      334376 :     *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row)); // stores 4 packed pixels - NOTE(review): type-punned uint32_t store through uint8_t*; confirm aliasing/alignment is acceptable
     423      334376 :     dst += stride;
     424      334376 :     rep = _mm_add_epi16(rep, one); // advance the source byte index to left[i + 1]
     425             :   }
     426       20901 : }
     427             : 
     428       42186 : void eb_aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride, // 8x4 Paeth predictor on 8-bit pixels; stride is in bytes; reads above[-1] as the top-left pixel
     429             :                                    const uint8_t *above, const uint8_t *left) {
     430       42186 :   __m128i l = _mm_loadl_epi64((const __m128i *)left); // loads 8 bytes although only left[0..3] are selected below - NOTE(review): confirm the extra bytes are readable
     431       42186 :   const __m128i t = _mm_loadl_epi64((const __m128i *)above); // above[0..7]
     432       42186 :   const __m128i zero = _mm_setzero_si128();
     433       42186 :   const __m128i t16 = _mm_unpacklo_epi8(t, zero); // above pixels zero-extended to 16-bit lanes
     434       84372 :   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); // top-left pixel broadcast to all lanes
     435       42186 :   __m128i rep = _mm_set1_epi16(0x8000); // shuffle control per 16-bit lane: low byte = source index 0, high byte 0x80 = write zero
     436       42186 :   const __m128i one = _mm_set1_epi16(1);
     437             : 
     438             :   int i;
     439      210930 :   for (i = 0; i < 4; ++i) { // one output row per iteration
     440      168744 :     const __m128i l16 = _mm_shuffle_epi8(l, rep); // left[i] zero-extended and broadcast to all eight 16-bit lanes
     441      168744 :     const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
     442             : 
     443      168744 :     _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row)); // pack to bytes with unsigned saturation and store 8 pixels
     444      168744 :     dst += stride;
     445      168744 :     rep = _mm_add_epi16(rep, one); // advance the source byte index to left[i + 1]
     446             :   }
     447       42186 : }
     448             : 
     449       57007 : void eb_aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, // 8x8 Paeth predictor on 8-bit pixels; stride is in bytes; reads above[-1] as the top-left pixel
     450             :                                    const uint8_t *above, const uint8_t *left) {
     451       57007 :   __m128i l = _mm_loadl_epi64((const __m128i *)left); // left[0..7]
     452       57007 :   const __m128i t = _mm_loadl_epi64((const __m128i *)above); // above[0..7]
     453       57007 :   const __m128i zero = _mm_setzero_si128();
     454       57007 :   const __m128i t16 = _mm_unpacklo_epi8(t, zero); // above pixels zero-extended to 16-bit lanes
     455      114014 :   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); // top-left pixel broadcast to all lanes
     456       57007 :   __m128i rep = _mm_set1_epi16(0x8000); // shuffle control per 16-bit lane: low byte = source index 0, high byte 0x80 = write zero
     457       57007 :   const __m128i one = _mm_set1_epi16(1);
     458             : 
     459             :   int i;
     460      513051 :   for (i = 0; i < 8; ++i) { // one output row per iteration
     461      456045 :     const __m128i l16 = _mm_shuffle_epi8(l, rep); // left[i] zero-extended and broadcast to all eight 16-bit lanes
     462      456045 :     const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
     463             : 
     464      456044 :     _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row)); // pack to bytes with unsigned saturation and store 8 pixels
     465      456044 :     dst += stride;
     466      456044 :     rep = _mm_add_epi16(rep, one); // advance the source byte index to left[i + 1]
     467             :   }
     468       57006 : }
     469             : 
     470       15579 : void eb_aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride, // 8x16 Paeth predictor on 8-bit pixels; stride is in bytes; reads above[-1] as the top-left pixel
     471             :                                     const uint8_t *above, const uint8_t *left) {
     472       15579 :   __m128i l = _mm_load_si128((const __m128i *)left); // left[0..15] - NOTE(review): aligned load; presumably needs a 16-byte-aligned left, confirm with callers
     473       15579 :   const __m128i t = _mm_loadl_epi64((const __m128i *)above); // above[0..7]
     474       15579 :   const __m128i zero = _mm_setzero_si128();
     475       15579 :   const __m128i t16 = _mm_unpacklo_epi8(t, zero); // above pixels zero-extended to 16-bit lanes
     476       31158 :   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); // top-left pixel broadcast to all lanes
     477       15579 :   __m128i rep = _mm_set1_epi16(0x8000); // shuffle control per 16-bit lane: low byte = source index 0, high byte 0x80 = write zero
     478       15579 :   const __m128i one = _mm_set1_epi16(1);
     479             : 
     480             :   int i;
     481      264836 :   for (i = 0; i < 16; ++i) { // one output row per iteration
     482      249256 :     const __m128i l16 = _mm_shuffle_epi8(l, rep); // left[i] zero-extended and broadcast to all eight 16-bit lanes
     483      249256 :     const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
     484             : 
     485      249257 :     _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row)); // pack to bytes with unsigned saturation and store 8 pixels
     486      249257 :     dst += stride;
     487      249257 :     rep = _mm_add_epi16(rep, one); // advance the source byte index to left[i + 1]
     488             :   }
     489       15580 : }
     490             : 
     491       14275 : void eb_aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride, // 8x32 Paeth predictor on 8-bit pixels; stride is in bytes; reads above[-1] as the top-left pixel
     492             :                                     const uint8_t *above, const uint8_t *left) {
     493       14275 :   const __m128i t = _mm_loadl_epi64((const __m128i *)above); // above[0..7]
     494       14275 :   const __m128i zero = _mm_setzero_si128();
     495       14275 :   const __m128i t16 = _mm_unpacklo_epi8(t, zero); // above pixels zero-extended to 16-bit lanes
     496       28550 :   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); // top-left pixel broadcast to all lanes
     497       14275 :   const __m128i one = _mm_set1_epi16(1);
     498             : 
     499       42825 :   for (int j = 0; j < 2; ++j) { // two batches of 16 rows each
     500       57100 :     const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); // left[16j .. 16j+15] - NOTE(review): aligned load; presumably needs a 16-byte-aligned left, confirm with callers
     501       28550 :     __m128i rep = _mm_set1_epi16(0x8000); // restart at source byte 0 of this batch; high byte 0x80 = write zero
     502      485350 :     for (int i = 0; i < 16; ++i) { // one output row per iteration
     503      456800 :       const __m128i l16 = _mm_shuffle_epi8(l, rep); // byte i of this batch, zero-extended and broadcast to all eight 16-bit lanes
     504      456800 :       const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
     505             : 
     506      456800 :       _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row)); // pack to bytes with unsigned saturation and store 8 pixels
     507      456800 :       dst += stride;
     508      456800 :       rep = _mm_add_epi16(rep, one); // advance the source byte index within the batch
     509             :     }
     510             :   }
     511       14275 : }
     512             : 
     513             : // Return 16 8-bit pixels in one row
     514      347360 : static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0, // runs the 8-lane Paeth selection on both halves of a 16-pixel row and packs the result to bytes
     515             :                                       const __m128i *top1,
     516             :                                       const __m128i *topleft) {
     517      347360 :   const __m128i p0 = paeth_8x1_pred(left, top0, topleft); // eight 16-bit results for top0
     518      347360 :   const __m128i p1 = paeth_8x1_pred(left, top1, topleft); // eight 16-bit results for top1
     519      347360 :   return _mm_packus_epi16(p0, p1); // narrow to 16 bytes with unsigned saturation: p0 in the low half, p1 in the high half
     520             : }
     521             : 
     522       23576 : void eb_aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride, // 16x4 Paeth predictor on 8-bit pixels; stride is in bytes; reads above[-1] as the top-left pixel
     523             :                                     const uint8_t *above, const uint8_t *left) {
     524       47152 :   __m128i l = _mm_cvtsi32_si128(((const uint32_t *)left)[0]); // left[0..3] read as one 32-bit word - NOTE(review): type-punned read through uint8_t*
     525       23576 :   const __m128i t = _mm_load_si128((const __m128i *)above); // above[0..15] - NOTE(review): aligned load; presumably needs a 16-byte-aligned above, confirm with callers
     526       23576 :   const __m128i zero = _mm_setzero_si128();
     527       23576 :   const __m128i top0 = _mm_unpacklo_epi8(t, zero); // above[0..7] zero-extended to 16-bit lanes
     528       23576 :   const __m128i top1 = _mm_unpackhi_epi8(t, zero); // above[8..15] zero-extended to 16-bit lanes
     529       47152 :   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); // top-left pixel broadcast to all lanes
     530       23576 :   __m128i rep = _mm_set1_epi16(0x8000); // shuffle control per 16-bit lane: low byte = source index 0, high byte 0x80 = write zero
     531       23576 :   const __m128i one = _mm_set1_epi16(1);
     532             : 
     533      117880 :   for (int i = 0; i < 4; ++i) { // one output row per iteration
     534       94304 :     const __m128i l16 = _mm_shuffle_epi8(l, rep); // left[i] zero-extended and broadcast to all eight 16-bit lanes
     535       94304 :     const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
     536             : 
     537             :     _mm_storeu_si128((__m128i *)dst, row); // unaligned 16-byte store of the row
     538       94304 :     dst += stride;
     539       94304 :     rep = _mm_add_epi16(rep, one); // advance the source byte index to left[i + 1]
     540             :   }
     541       23576 : }
     542             : 
     543           0 : void eb_aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride, // 16x8 Paeth predictor on 8-bit pixels; stride is in bytes; reads above[-1] as the top-left pixel
     544             :                                     const uint8_t *above, const uint8_t *left) {
     545           0 :   __m128i l = _mm_loadl_epi64((const __m128i *)left); // left[0..7]
     546           0 :   const __m128i t = _mm_load_si128((const __m128i *)above); // above[0..15] - NOTE(review): aligned load; presumably needs a 16-byte-aligned above, confirm with callers
     547           0 :   const __m128i zero = _mm_setzero_si128();
     548           0 :   const __m128i top0 = _mm_unpacklo_epi8(t, zero); // above[0..7] zero-extended to 16-bit lanes
     549           0 :   const __m128i top1 = _mm_unpackhi_epi8(t, zero); // above[8..15] zero-extended to 16-bit lanes
     550           0 :   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); // top-left pixel broadcast to all lanes
     551           0 :   __m128i rep = _mm_set1_epi16(0x8000); // shuffle control per 16-bit lane: low byte = source index 0, high byte 0x80 = write zero
     552           0 :   const __m128i one = _mm_set1_epi16(1);
     553             : 
     554             :   int i;
     555           0 :   for (i = 0; i < 8; ++i) { // one output row per iteration
     556           0 :     const __m128i l16 = _mm_shuffle_epi8(l, rep); // left[i] zero-extended and broadcast to all eight 16-bit lanes
     557           0 :     const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
     558             : 
     559             :     _mm_storeu_si128((__m128i *)dst, row); // unaligned 16-byte store of the row
     560           0 :     dst += stride;
     561           0 :     rep = _mm_add_epi16(rep, one); // advance the source byte index to left[i + 1]
     562             :   }
     563           0 : }
     564             : 
     565           0 : void eb_aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
     566             :                                      const uint8_t *above,
     567             :                                      const uint8_t *left) {
     568           0 :   __m128i l = _mm_load_si128((const __m128i *)left);
     569           0 :   const __m128i t = _mm_load_si128((const __m128i *)above);
     570           0 :   const __m128i zero = _mm_setzero_si128();
     571           0 :   const __m128i top0 = _mm_unpacklo_epi8(t, zero);
     572           0 :   const __m128i top1 = _mm_unpackhi_epi8(t, zero);
     573           0 :   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
     574           0 :   __m128i rep = _mm_set1_epi16(0x8000);
     575           0 :   const __m128i one = _mm_set1_epi16(1);
     576             : 
     577             :   int i;
     578           0 :   for (i = 0; i < 16; ++i) {
     579           0 :     const __m128i l16 = _mm_shuffle_epi8(l, rep);
     580           0 :     const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
     581             : 
     582             :     _mm_storeu_si128((__m128i *)dst, row);
     583           0 :     dst += stride;
     584           0 :     rep = _mm_add_epi16(rep, one);
     585             :   }
     586           0 : }
     587             : 
     588           0 : void eb_aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
     589             :                                      const uint8_t *above,
     590             :                                      const uint8_t *left) {
     591           0 :   __m128i l = _mm_load_si128((const __m128i *)left);
     592           0 :   const __m128i t = _mm_load_si128((const __m128i *)above);
     593           0 :   const __m128i zero = _mm_setzero_si128();
     594           0 :   const __m128i top0 = _mm_unpacklo_epi8(t, zero);
     595           0 :   const __m128i top1 = _mm_unpackhi_epi8(t, zero);
     596           0 :   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
     597           0 :   __m128i rep = _mm_set1_epi16(0x8000);
     598           0 :   const __m128i one = _mm_set1_epi16(1);
     599             :   __m128i l16;
     600             : 
     601             :   int i;
     602           0 :   for (i = 0; i < 16; ++i) {
     603           0 :     l16 = _mm_shuffle_epi8(l, rep);
     604           0 :     const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
     605             : 
     606             :     _mm_storeu_si128((__m128i *)dst, row);
     607           0 :     dst += stride;
     608           0 :     rep = _mm_add_epi16(rep, one);
     609             :   }
     610             : 
     611           0 :   l = _mm_load_si128((const __m128i *)(left + 16));
     612           0 :   rep = _mm_set1_epi16(0x8000);
     613           0 :   for (i = 0; i < 16; ++i) {
     614           0 :     l16 = _mm_shuffle_epi8(l, rep);
     615           0 :     const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
     616             : 
     617             :     _mm_storeu_si128((__m128i *)dst, row);
     618           0 :     dst += stride;
     619           0 :     rep = _mm_add_epi16(rep, one);
     620             :   }
     621           0 : }
     622             : 
     623           0 : void eb_aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
     624             :                                      const uint8_t *above,
     625             :                                      const uint8_t *left) {
     626           0 :   const __m128i t = _mm_load_si128((const __m128i *)above);
     627           0 :   const __m128i zero = _mm_setzero_si128();
     628           0 :   const __m128i top0 = _mm_unpacklo_epi8(t, zero);
     629           0 :   const __m128i top1 = _mm_unpackhi_epi8(t, zero);
     630           0 :   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
     631           0 :   const __m128i one = _mm_set1_epi16(1);
     632             : 
     633           0 :   for (int j = 0; j < 4; ++j) {
     634           0 :     const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
     635           0 :     __m128i rep = _mm_set1_epi16(0x8000);
     636           0 :     for (int i = 0; i < 16; ++i) {
     637           0 :       const __m128i l16 = _mm_shuffle_epi8(l, rep);
     638           0 :       const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
     639             :       _mm_storeu_si128((__m128i *)dst, row);
     640           0 :       dst += stride;
     641           0 :       rep = _mm_add_epi16(rep, one);
     642             :     }
     643             :   }
     644           0 : }
     645             : 
     646       15816 : void eb_aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
     647             :                                     const uint8_t *above, const uint8_t *left) {
     648       15816 :   const __m128i a = _mm_load_si128((const __m128i *)above);
     649       31632 :   const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
     650       15816 :   const __m128i zero = _mm_setzero_si128();
     651       15816 :   const __m128i al = _mm_unpacklo_epi8(a, zero);
     652       15816 :   const __m128i ah = _mm_unpackhi_epi8(a, zero);
     653       15816 :   const __m128i bl = _mm_unpacklo_epi8(b, zero);
     654       15816 :   const __m128i bh = _mm_unpackhi_epi8(b, zero);
     655             : 
     656       31632 :   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
     657       15816 :   __m128i rep = _mm_set1_epi16(0x8000);
     658       15816 :   const __m128i one = _mm_set1_epi16(1);
     659       15816 :   const __m128i l = _mm_loadl_epi64((const __m128i *)left);
     660             :   __m128i l16;
     661             : 
     662      142344 :   for (int i = 0; i < 8; ++i) {
     663      126528 :     l16 = _mm_shuffle_epi8(l, rep);
     664      126528 :     const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
     665      126528 :     const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
     666             : 
     667             :     _mm_storeu_si128((__m128i *)dst, r32l);
     668      126528 :     _mm_storeu_si128((__m128i *)(dst + 16), r32h);
     669      126528 :     dst += stride;
     670      126528 :     rep = _mm_add_epi16(rep, one);
     671             :   }
     672       15816 : }
     673             : 
     674           0 : void eb_aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
     675             :                                      const uint8_t *above,
     676             :                                      const uint8_t *left) {
     677           0 :   const __m128i a = _mm_load_si128((const __m128i *)above);
     678           0 :   const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
     679           0 :   const __m128i zero = _mm_setzero_si128();
     680           0 :   const __m128i al = _mm_unpacklo_epi8(a, zero);
     681           0 :   const __m128i ah = _mm_unpackhi_epi8(a, zero);
     682           0 :   const __m128i bl = _mm_unpacklo_epi8(b, zero);
     683           0 :   const __m128i bh = _mm_unpackhi_epi8(b, zero);
     684             : 
     685           0 :   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
     686           0 :   __m128i rep = _mm_set1_epi16(0x8000);
     687           0 :   const __m128i one = _mm_set1_epi16(1);
     688           0 :   __m128i l = _mm_load_si128((const __m128i *)left);
     689             :   __m128i l16;
     690             : 
     691             :   int i;
     692           0 :   for (i = 0; i < 16; ++i) {
     693           0 :     l16 = _mm_shuffle_epi8(l, rep);
     694           0 :     const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
     695           0 :     const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
     696             : 
     697             :     _mm_storeu_si128((__m128i *)dst, r32l);
     698           0 :     _mm_storeu_si128((__m128i *)(dst + 16), r32h);
     699           0 :     dst += stride;
     700           0 :     rep = _mm_add_epi16(rep, one);
     701             :   }
     702           0 : }
     703             : 
     704           0 : void eb_aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
     705             :                                      const uint8_t *above,
     706             :                                      const uint8_t *left) {
     707           0 :   const __m128i a = _mm_load_si128((const __m128i *)above);
     708           0 :   const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
     709           0 :   const __m128i zero = _mm_setzero_si128();
     710           0 :   const __m128i al = _mm_unpacklo_epi8(a, zero);
     711           0 :   const __m128i ah = _mm_unpackhi_epi8(a, zero);
     712           0 :   const __m128i bl = _mm_unpacklo_epi8(b, zero);
     713           0 :   const __m128i bh = _mm_unpackhi_epi8(b, zero);
     714             : 
     715           0 :   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
     716           0 :   __m128i rep = _mm_set1_epi16(0x8000);
     717           0 :   const __m128i one = _mm_set1_epi16(1);
     718           0 :   __m128i l = _mm_load_si128((const __m128i *)left);
     719             :   __m128i l16;
     720             : 
     721             :   int i;
     722           0 :   for (i = 0; i < 16; ++i) {
     723           0 :     l16 = _mm_shuffle_epi8(l, rep);
     724           0 :     const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
     725           0 :     const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
     726             : 
     727             :     _mm_storeu_si128((__m128i *)dst, r32l);
     728           0 :     _mm_storeu_si128((__m128i *)(dst + 16), r32h);
     729           0 :     dst += stride;
     730           0 :     rep = _mm_add_epi16(rep, one);
     731             :   }
     732             : 
     733           0 :   rep = _mm_set1_epi16(0x8000);
     734           0 :   l = _mm_load_si128((const __m128i *)(left + 16));
     735           0 :   for (i = 0; i < 16; ++i) {
     736           0 :     l16 = _mm_shuffle_epi8(l, rep);
     737           0 :     const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
     738           0 :     const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
     739             : 
     740             :     _mm_storeu_si128((__m128i *)dst, r32l);
     741           0 :     _mm_storeu_si128((__m128i *)(dst + 16), r32h);
     742           0 :     dst += stride;
     743           0 :     rep = _mm_add_epi16(rep, one);
     744             :   }
     745           0 : }
     746             : 
     747           0 : void eb_aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
     748             :                                      const uint8_t *above,
     749             :                                      const uint8_t *left) {
     750           0 :   const __m128i a = _mm_load_si128((const __m128i *)above);
     751           0 :   const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
     752           0 :   const __m128i zero = _mm_setzero_si128();
     753           0 :   const __m128i al = _mm_unpacklo_epi8(a, zero);
     754           0 :   const __m128i ah = _mm_unpackhi_epi8(a, zero);
     755           0 :   const __m128i bl = _mm_unpacklo_epi8(b, zero);
     756           0 :   const __m128i bh = _mm_unpackhi_epi8(b, zero);
     757             : 
     758           0 :   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
     759           0 :   const __m128i one = _mm_set1_epi16(1);
     760             :   __m128i l16;
     761             : 
     762             :   int i, j;
     763           0 :   for (j = 0; j < 4; ++j) {
     764           0 :     const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
     765           0 :     __m128i rep = _mm_set1_epi16(0x8000);
     766           0 :     for (i = 0; i < 16; ++i) {
     767           0 :       l16 = _mm_shuffle_epi8(l, rep);
     768           0 :       const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
     769           0 :       const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
     770             : 
     771             :       _mm_storeu_si128((__m128i *)dst, r32l);
     772           0 :       _mm_storeu_si128((__m128i *)(dst + 16), r32h);
     773           0 :       dst += stride;
     774           0 :       rep = _mm_add_epi16(rep, one);
     775             :     }
     776             :   }
     777           0 : }
     778             : 
     779           0 : void eb_aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
     780             :                                      const uint8_t *above,
     781             :                                      const uint8_t *left) {
     782           0 :   const __m128i a = _mm_load_si128((const __m128i *)above);
     783           0 :   const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
     784           0 :   const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
     785           0 :   const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
     786           0 :   const __m128i zero = _mm_setzero_si128();
     787           0 :   const __m128i al = _mm_unpacklo_epi8(a, zero);
     788           0 :   const __m128i ah = _mm_unpackhi_epi8(a, zero);
     789           0 :   const __m128i bl = _mm_unpacklo_epi8(b, zero);
     790           0 :   const __m128i bh = _mm_unpackhi_epi8(b, zero);
     791           0 :   const __m128i cl = _mm_unpacklo_epi8(c, zero);
     792           0 :   const __m128i ch = _mm_unpackhi_epi8(c, zero);
     793           0 :   const __m128i dl = _mm_unpacklo_epi8(d, zero);
     794           0 :   const __m128i dh = _mm_unpackhi_epi8(d, zero);
     795             : 
     796           0 :   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
     797           0 :   const __m128i one = _mm_set1_epi16(1);
     798             :   __m128i l16;
     799             : 
     800             :   int i, j;
     801           0 :   for (j = 0; j < 2; ++j) {
     802           0 :     const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
     803           0 :     __m128i rep = _mm_set1_epi16(0x8000);
     804           0 :     for (i = 0; i < 16; ++i) {
     805           0 :       l16 = _mm_shuffle_epi8(l, rep);
     806           0 :       const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
     807           0 :       const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
     808           0 :       const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
     809           0 :       const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
     810             : 
     811             :       _mm_storeu_si128((__m128i *)dst, r0);
     812           0 :       _mm_storeu_si128((__m128i *)(dst + 16), r1);
     813           0 :       _mm_storeu_si128((__m128i *)(dst + 32), r2);
     814           0 :       _mm_storeu_si128((__m128i *)(dst + 48), r3);
     815           0 :       dst += stride;
     816           0 :       rep = _mm_add_epi16(rep, one);
     817             :     }
     818             :   }
     819           0 : }
     820             : 
     821           0 : void eb_aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
     822             :                                      const uint8_t *above,
     823             :                                      const uint8_t *left) {
     824           0 :   const __m128i a = _mm_load_si128((const __m128i *)above);
     825           0 :   const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
     826           0 :   const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
     827           0 :   const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
     828           0 :   const __m128i zero = _mm_setzero_si128();
     829           0 :   const __m128i al = _mm_unpacklo_epi8(a, zero);
     830           0 :   const __m128i ah = _mm_unpackhi_epi8(a, zero);
     831           0 :   const __m128i bl = _mm_unpacklo_epi8(b, zero);
     832           0 :   const __m128i bh = _mm_unpackhi_epi8(b, zero);
     833           0 :   const __m128i cl = _mm_unpacklo_epi8(c, zero);
     834           0 :   const __m128i ch = _mm_unpackhi_epi8(c, zero);
     835           0 :   const __m128i dl = _mm_unpacklo_epi8(d, zero);
     836           0 :   const __m128i dh = _mm_unpackhi_epi8(d, zero);
     837             : 
     838           0 :   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
     839           0 :   const __m128i one = _mm_set1_epi16(1);
     840             :   __m128i l16;
     841             : 
     842             :   int i, j;
     843           0 :   for (j = 0; j < 4; ++j) {
     844           0 :     const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
     845           0 :     __m128i rep = _mm_set1_epi16(0x8000);
     846           0 :     for (i = 0; i < 16; ++i) {
     847           0 :       l16 = _mm_shuffle_epi8(l, rep);
     848           0 :       const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
     849           0 :       const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
     850           0 :       const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
     851           0 :       const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
     852             : 
     853             :       _mm_storeu_si128((__m128i *)dst, r0);
     854           0 :       _mm_storeu_si128((__m128i *)(dst + 16), r1);
     855           0 :       _mm_storeu_si128((__m128i *)(dst + 32), r2);
     856           0 :       _mm_storeu_si128((__m128i *)(dst + 48), r3);
     857           0 :       dst += stride;
     858           0 :       rep = _mm_add_epi16(rep, one);
     859             :     }
     860             :   }
     861           0 : }
     862             : 
     863           0 : void eb_aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
     864             :                                      const uint8_t *above,
     865             :                                      const uint8_t *left) {
     866           0 :   const __m128i a = _mm_load_si128((const __m128i *)above);
     867           0 :   const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
     868           0 :   const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
     869           0 :   const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
     870           0 :   const __m128i zero = _mm_setzero_si128();
     871           0 :   const __m128i al = _mm_unpacklo_epi8(a, zero);
     872           0 :   const __m128i ah = _mm_unpackhi_epi8(a, zero);
     873           0 :   const __m128i bl = _mm_unpacklo_epi8(b, zero);
     874           0 :   const __m128i bh = _mm_unpackhi_epi8(b, zero);
     875           0 :   const __m128i cl = _mm_unpacklo_epi8(c, zero);
     876           0 :   const __m128i ch = _mm_unpackhi_epi8(c, zero);
     877           0 :   const __m128i dl = _mm_unpacklo_epi8(d, zero);
     878           0 :   const __m128i dh = _mm_unpackhi_epi8(d, zero);
     879             : 
     880           0 :   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
     881           0 :   const __m128i one = _mm_set1_epi16(1);
     882             :   __m128i l16;
     883             : 
     884             :   int i;
     885           0 :   const __m128i l = _mm_load_si128((const __m128i *)left);
     886           0 :   __m128i rep = _mm_set1_epi16(0x8000);
     887           0 :   for (i = 0; i < 16; ++i) {
     888           0 :     l16 = _mm_shuffle_epi8(l, rep);
     889           0 :     const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
     890           0 :     const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
     891           0 :     const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
     892           0 :     const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
     893             : 
     894             :     _mm_storeu_si128((__m128i *)dst, r0);
     895           0 :     _mm_storeu_si128((__m128i *)(dst + 16), r1);
     896           0 :     _mm_storeu_si128((__m128i *)(dst + 32), r2);
     897           0 :     _mm_storeu_si128((__m128i *)(dst + 48), r3);
     898           0 :     dst += stride;
     899           0 :     rep = _mm_add_epi16(rep, one);
     900             :   }
     901           0 : }

Generated by: LCOV version 1.14