LCOV - code coverage report
Current view: top level - ASM_SSSE3 - EbMcp_Intrinsic_SSSE3.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 0 321 0.0 %
Date: 2019-11-25 17:38:06 Functions: 0 6 0.0 %

          Line data    Source code
       1             : /*
       2             : * Copyright(c) 2019 Intel Corporation
       3             : * SPDX-License-Identifier: BSD-2-Clause-Patent
       4             : */
       5             : 
       6             : #include "EbDefinitions.h"
       7             : 
       8             : #include "emmintrin.h"
       9             : 
      10             : #ifndef PREFETCH
      11             : #define PREFETCH 0 // build-time switch: non-zero makes PrefetchBlock() issue _mm_prefetch hints; 0 (the default here) compiles it to a no-op
      12             : #endif
      13             : 
      14             : #include "tmmintrin.h"
      15             : 
      16             : #ifdef __GNUC__
      17             : #ifndef __cplusplus
      18             : __attribute__((visibility("hidden")))
      19             : #endif
      20             : #endif
                       /*
                        * Four rows of eight signed 16-bit filter taps; every row sums to 64.
                        * Row 0 has a single tap of 64, row 2 is symmetric, and rows 1 and 3 are
                        * mirror images of each other.
                        * When __GNUC__ is defined and the file is not compiled as C++, the symbol
                        * is given hidden visibility by the attribute above.
                        * NOTE(review): presumably indexed by a fractional sample position 0..3;
                        * nothing in this listing reads the table, so confirm against its users.
                        */
      21             : const int16_t lumaFilterCoeff[4][8] =
      22             : {
      23             :   { 0, 0,  0, 64,  0,  0, 0,  0},
      24             :   {-1, 4,-10, 58, 17, -5, 1,  0},
      25             :   {-1, 4,-11, 40, 40,-11, 4, -1},
      26             :   { 0, 1, -5, 17, 58,-10, 4, -1}
      27             : };
      28             : 
      29             : #ifdef __GNUC__
      30             : #ifndef __cplusplus
      31             : __attribute__((visibility("hidden")))
      32             : #endif
      33             : #endif
                       /*
                        * Same values as lumaFilterCoeff except for row 3, whose taps are shifted
                        * one position toward index 0 so that the zero tap sits at index 7.
                        * Stored this way, rows 0, 1 and 3 all have tap 6 == -tap 0 and tap 7 == 0.
                        * The "...7_SSSE3" routines in this file rely on that: they multiply only
                        * taps 0..5 and fold input 6 into input 0 with a subtraction.
                        * The "...M_SSSE3" routines read row 2 and use only its taps 0..3, pairing
                        * the inputs symmetrically.
                        */
      34             : const int16_t lumaFilterCoeff7[4][8] =
      35             : {
      36             :   { 0, 0,  0, 64,  0,  0, 0,  0},
      37             :   {-1, 4,-10, 58, 17, -5, 1,  0},
      38             :   {-1, 4,-11, 40, 40,-11, 4, -1},
      39             :   { 1, -5, 17, 58,-10, 4, -1, 0}
      40             : };
      41             : 
      42             : #ifdef __GNUC__
      43             : #ifndef __cplusplus
      44             : __attribute__((visibility("hidden")))
      45             : #endif
      46             : #endif
                       /*
                        * Eight rows of four signed 16-bit filter taps; every row sums to 64.
                        * Row 0 has a single tap of 64, row 4 is symmetric, and row k is the
                        * mirror image of row 8 - k for k = 1..3.
                        * NOTE(review): presumably indexed by an eighth-sample fractional
                        * position 0..7; nothing in this listing reads the table, so confirm
                        * against its users.
                        */
      47             : const int16_t chromaFilterCoeff[8][4] =
      48             : {
      49             :   { 0, 64,  0,  0},
      50             :   {-2, 58, 10, -2},
      51             :   {-4, 54, 16, -2},
      52             :   {-6, 46, 28, -4},
      53             :   {-4, 36, 36, -4},
      54             :   {-4, 28, 46, -6},
      55             :   {-2, 16, 54, -4},
      56             :   {-2, 10, 58, -2},
      57             : };
      58             : 
      59           0 : static void PrefetchBlock(uint8_t *src, uint32_t src_stride, uint32_t blkWidth, uint32_t blkHeight)
      60             : {
                           /*
                            * When PREFETCH is non-zero: for each of blkHeight rows, issues a T0
                            * prefetch hint for the first byte (src) and the last byte
                            * (src + blkWidth - 1) of the row, stepping src by src_stride per row.
                            * When PREFETCH is zero: does nothing; the casts to void only silence
                            * unused-parameter warnings.
                            *
                            * NOTE(review): the do/while pre-decrements an unsigned counter, so
                            * blkHeight == 0 would wrap and iterate 2^32 times, and blkWidth == 0
                            * would form the address src - 1. Callers must pass non-zero sizes.
                            */
      61             : #if PREFETCH
      62             :     uint32_t row_count = blkHeight;
      63             : 
      64             :     do {
      65             :         uint8_t *addr0 = src;
      66             :         uint8_t *addr1 = addr0 + blkWidth - 1;
      67             :         src += src_stride;
      68             : 
      69             :         _mm_prefetch((char*)addr0, _MM_HINT_T0);
      70             :         _mm_prefetch((char*)addr1, _MM_HINT_T0);
      71             :     } while (--row_count != 0);
      72             : #else
      73             :     (void)src;
      74             :     (void)src_stride;
      75             :     (void)blkWidth;
      76             :     (void)blkHeight;
      77             : #endif
      78           0 : }
      79             : 
      80           0 : void LumaInterpolationFilterTwoDInRaw7_SSSE3(int16_t *first_pass_if_dst, EbByte dst, uint32_t dst_stride, uint32_t pu_width, uint32_t pu_height, uint32_t frac_pos_y)
      81             : {
                           /*
                            * Filters the 16-bit intermediate samples in first_pass_if_dst (by its
                            * name, the output of an earlier pass) across consecutive rows using the
                            * taps of lumaFilterCoeff7[frac_pos_y], and writes 8-bit results to dst.
                            *
                            * first_pass_if_dst  int16 input, read as back-to-back column strips: one
                            *                    4-sample-wide strip first when (pu_width & 4), then
                            *                    8-sample-wide strips. Rows of a strip are contiguous.
                            * dst, dst_stride    8-bit output and the byte distance between its rows.
                            * pu_width           block width; after the optional 4-wide strip the rest
                            *                    is consumed 8 columns at a time.
                            * pu_height          block height in rows. The 4-wide path emits two rows
                            *                    per iteration -- assumes an even height, TODO confirm.
                            * frac_pos_y         row index into lumaFilterCoeff7; not range-checked.
                            *
                            * Per sample: clip_u8((sum(tap[k] * row[k]) + (257 << 11)) >> 12), where
                            * 257 << 11 == (128 << 12) + (1 << 11), i.e. round-to-nearest plus 128.
                            * NOTE(review): the +128 presumably restores an offset removed by the
                            * producer of first_pass_if_dst -- confirm.
                            * NOTE(review): only taps 0..5 are multiplied; input 6 is folded into
                            * input 0 by subtraction. That equals the table row only when
                            * tap 6 == -tap 0 and tap 7 == 0 (rows 0, 1, 3). Row 2 does not satisfy
                            * this; presumably callers route frac_pos_y == 2 to the ...M_SSSE3
                            * variant -- confirm.
                            */
      82             :     int32_t row_count, col_count;
      83             :     __m128i c0, c1, c2;
      84             :     __m128i a0, a1, a2, a3, a4, a5, a6;
      85             :     __m128i sum0, sum1;
      86             :     __m128i b0l, b0h, b1l, b1h, b2l, b2h;
      87             : 
      88             :     EbByte qtr;
      89             : 
                           /* Load the 8 taps, then broadcast tap pairs (0,1), (2,3), (4,5) to every 32-bit lane. */
      90           0 :     c0 = _mm_loadu_si128((__m128i *)lumaFilterCoeff7[frac_pos_y]);
      91           0 :     c2 = _mm_shuffle_epi32(c0, 0xaa);
      92           0 :     c1 = _mm_shuffle_epi32(c0, 0x55);
      93           0 :     c0 = _mm_shuffle_epi32(c0, 0x00);
      94             : 
                           /* 4-wide strip: each 128-bit load spans two consecutive 4-sample rows. */
      95           0 :     if (pu_width & 4)
      96             :     {
      97           0 :         row_count = pu_height;
      98             : 
      99           0 :         qtr = dst;
     100             : 
     101             :         do {
                                   /* a_k holds rows k and k+1 relative to the current output row. */
     102           0 :             a0 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 0 * 4));
     103           0 :             a1 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 1 * 4));
     104           0 :             a2 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 2 * 4));
     105           0 :             a3 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 3 * 4));
     106           0 :             a4 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 4 * 4));
     107           0 :             a5 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 5 * 4));
     108           0 :             a6 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 6 * 4));
                                   /* Fold input 6 into input 0 (wrapping 16-bit subtract); see the tap-6 note above. */
     109           0 :             a0 = _mm_sub_epi16(a0, a6);
     110             : 
                                   /* Accumulators start at the rounding/offset constant. */
     111           0 :             sum0 = _mm_set1_epi32(257 << 11);
     112           0 :             sum1 = _mm_set1_epi32(257 << 11);
     113             : 
                                   /* Interleave input pairs so each madd lane computes tap_even * in_even + tap_odd * in_odd. */
     114           0 :             b0l = _mm_unpacklo_epi16(a0, a1);
     115           0 :             b0h = _mm_unpackhi_epi16(a0, a1);
     116           0 :             b1l = _mm_unpacklo_epi16(a2, a3);
     117           0 :             b1h = _mm_unpackhi_epi16(a2, a3);
     118           0 :             b2l = _mm_unpacklo_epi16(a4, a5);
     119           0 :             b2h = _mm_unpackhi_epi16(a4, a5);
     120             : 
     121           0 :             sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(b0l, c0));
     122           0 :             sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(b0h, c0));
     123           0 :             sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(b1l, c1));
     124           0 :             sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(b1h, c1));
     125           0 :             sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(b2l, c2));
     126           0 :             sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(b2h, c2));
     127             : 
                                   /* Scale down, then saturate 32 -> 16 (signed) -> 8 (unsigned) bits. */
     128           0 :             sum0 = _mm_srai_epi32(sum0, 12);
     129           0 :             sum1 = _mm_srai_epi32(sum1, 12);
     130           0 :             sum0 = _mm_packs_epi32(sum0, sum1);
     131           0 :             sum0 = _mm_packus_epi16(sum0, sum0);
     132             : 
                                   /*
                                    * Bytes 0..3 are the first output row, bytes 4..7 the second.
                                    * NOTE(review): these are 32-bit stores through a cast byte pointer
                                    * (type-punned, possibly unaligned).
                                    */
     133           0 :             *(uint32_t *)qtr = _mm_cvtsi128_si32(sum0); qtr += dst_stride;
     134           0 :             *(uint32_t *)qtr = _mm_cvtsi128_si32(_mm_srli_si128(sum0, 4)); qtr += dst_stride;
     135             : 
                                   /* Advance the input by two 4-sample rows. */
     136           0 :             first_pass_if_dst += 8;
     137           0 :             row_count -= 2;
     138           0 :         } while (row_count > 0);
     139             : 
     140           0 :         pu_width -= 4;
     141           0 :         if (pu_width == 0)
     142           0 :             return;
                               /* Step over the rest of the 4-wide strip: 24 samples = 6 rows (32 = 8 rows when frac_pos_y == 2). */
     143           0 :         first_pass_if_dst += (frac_pos_y == 2) ? 32 : 24;
     144           0 :         dst += 4;
     145             :     }
     146             : 
                           /* 8-wide strips: one 128-bit load per row, one output row per iteration. */
     147           0 :     col_count = pu_width;
     148             :     do {
                               /* This declaration shadows the function-scope qtr declared above. */
     149           0 :         EbByte qtr = dst;
     150             : 
     151           0 :         row_count = pu_height;
     152             :         do {
     153           0 :             a0 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 0 * 8));
     154           0 :             a1 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 1 * 8));
     155           0 :             a2 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 2 * 8));
     156           0 :             a3 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 3 * 8));
     157           0 :             a4 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 4 * 8));
     158           0 :             a5 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 5 * 8));
     159           0 :             a6 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 6 * 8));
     160           0 :             a0 = _mm_sub_epi16(a0, a6);
     161             : 
     162           0 :             sum0 = _mm_set1_epi32(257 << 11);
     163           0 :             sum1 = _mm_set1_epi32(257 << 11);
     164           0 :             sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(_mm_unpacklo_epi16(a0, a1), c0));
     165           0 :             sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(_mm_unpackhi_epi16(a0, a1), c0));
     166           0 :             sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(_mm_unpacklo_epi16(a2, a3), c1));
     167           0 :             sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(_mm_unpackhi_epi16(a2, a3), c1));
     168           0 :             sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(_mm_unpacklo_epi16(a4, a5), c2));
     169           0 :             sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(_mm_unpackhi_epi16(a4, a5), c2));
     170             : 
     171           0 :             sum0 = _mm_srai_epi32(sum0, 12);
     172           0 :             sum1 = _mm_srai_epi32(sum1, 12);
     173           0 :             sum0 = _mm_packs_epi32(sum0, sum1);
     174           0 :             sum0 = _mm_packus_epi16(sum0, sum0);
     175             : 
                                   /* Store the low 8 bytes: one 8-pixel output row. */
     176           0 :             _mm_storel_epi64((__m128i *)qtr, sum0); qtr += dst_stride;
     177             : 
     178           0 :             first_pass_if_dst += 8;
     179           0 :             row_count--;
     180           0 :         } while (row_count > 0);
     181             : 
                               /* Step over the rest of the 8-wide strip: 48 samples = 6 rows (56 = 7 rows when frac_pos_y == 2). */
     182           0 :         first_pass_if_dst += (frac_pos_y == 2) ? 56 : 48;
     183           0 :         dst += 8;
     184           0 :         col_count -= 8;
     185           0 :     } while (col_count > 0);
     186             : }
     187             : 
     188           0 : void LumaInterpolationFilterTwoDInRawOutRaw7_SSSE3(int16_t *first_pass_if_dst, int16_t *dst, uint32_t pu_width, uint32_t pu_height, uint32_t frac_pos_y)
     189             : {
                           /*
                            * 16-bit-output counterpart of LumaInterpolationFilterTwoDInRaw7_SSSE3:
                            * filters first_pass_if_dst across consecutive rows with taps 0..5 of
                            * lumaFilterCoeff7[frac_pos_y] (input 6 is folded into input 0 by
                            * subtraction), shifts the 32-bit sum right by 6 without a rounding
                            * term, saturates to int16 and stores it.
                            *
                            * first_pass_if_dst  int16 input, read as back-to-back column strips: one
                            *                    4-sample-wide strip first when (pu_width & 4), then
                            *                    8-sample-wide strips.
                            * dst                int16 output, written densely (8 values per store,
                            *                    no row stride) in the same strip order as the input.
                            * pu_width           block width; handled as an optional 4-wide strip
                            *                    followed by 8-wide strips.
                            * pu_height          block height; the 4-wide path emits two rows per
                            *                    iteration -- assumes an even height, TODO confirm.
                            * frac_pos_y         row index into lumaFilterCoeff7; not range-checked.
                            *
                            * NOTE(review): the subtraction fold matches the table only for rows
                            * 0, 1 and 3; presumably frac_pos_y == 2 is handled by the ...M_SSSE3
                            * variant -- confirm.
                            */
     190             :     int32_t row_count, col_count;
     191             : 
     192             :     __m128i a0, a1, a2, a3, a4, a5, a6;
     193             :     __m128i c0, c1, c2;
                           /* Broadcast tap pairs (0,1), (2,3), (4,5) to every 32-bit lane. */
     194           0 :     c0 = _mm_loadu_si128((__m128i *)lumaFilterCoeff7[frac_pos_y]);
     195           0 :     c2 = _mm_shuffle_epi32(c0, 0xaa);
     196           0 :     c1 = _mm_shuffle_epi32(c0, 0x55);
     197           0 :     c0 = _mm_shuffle_epi32(c0, 0x00);
     198             : 
                           /* 4-wide strip: each 128-bit load spans two consecutive 4-sample rows. */
     199           0 :     if (pu_width & 4)
     200             :     {
     201           0 :         row_count = pu_height;
     202             : 
     203             :         do {
     204             :             __m128i sum0, sum1;
     205           0 :             a0 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 0 * 4));
     206           0 :             a1 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 1 * 4));
     207           0 :             a2 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 2 * 4));
     208           0 :             a3 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 3 * 4));
     209           0 :             a4 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 4 * 4));
     210           0 :             a5 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 5 * 4));
     211           0 :             a6 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 6 * 4));
                                   /* Fold input 6 into input 0 (wrapping 16-bit subtract). */
     212           0 :             a0 = _mm_sub_epi16(a0, a6);
     213             : 
     214           0 :             sum0 = _mm_madd_epi16(_mm_unpacklo_epi16(a0, a1), c0);
     215           0 :             sum1 = _mm_madd_epi16(_mm_unpackhi_epi16(a0, a1), c0);
     216           0 :             sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(_mm_unpacklo_epi16(a2, a3), c1));
     217           0 :             sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(_mm_unpackhi_epi16(a2, a3), c1));
     218           0 :             sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(_mm_unpacklo_epi16(a4, a5), c2));
     219           0 :             sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(_mm_unpackhi_epi16(a4, a5), c2));
     220             : 
                                   /* Truncating shift, then saturate to int16. */
     221           0 :             sum0 = _mm_srai_epi32(sum0, 6);
     222           0 :             sum1 = _mm_srai_epi32(sum1, 6);
     223           0 :             sum0 = _mm_packs_epi32(sum0, sum1);
     224             : 
                                   /* One store holds two 4-sample output rows. */
     225             :             _mm_storeu_si128((__m128i *)dst, sum0);
     226           0 :             dst += 8;
     227             : 
     228           0 :             first_pass_if_dst += 8;
     229           0 :             row_count -= 2;
     230           0 :         } while (row_count > 0);
     231             : 
     232           0 :         pu_width -= 4;
     233           0 :         if (pu_width == 0)
     234           0 :             return;
                               /* Step over the rest of the 4-wide strip: 24 samples = 6 rows (32 = 8 rows when frac_pos_y == 2). */
     235           0 :         first_pass_if_dst += (frac_pos_y == 2) ? 32 : 24;
     236             :     }
     237             : 
                           /* 8-wide strips: one 128-bit load per row, one output row per iteration. */
     238           0 :     col_count = pu_width;
     239             :     do {
     240           0 :         row_count = pu_height;
     241             :         do {
     242             :             __m128i b0l, b0h, b1l, b1h, b2l, b2h;
     243             :             __m128i sum0, sum1;
     244             : 
     245           0 :             a0 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 0 * 8));
     246           0 :             a1 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 1 * 8));
     247           0 :             a2 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 2 * 8));
     248           0 :             a3 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 3 * 8));
     249           0 :             a4 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 4 * 8));
     250           0 :             a5 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 5 * 8));
     251           0 :             a6 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 6 * 8));
     252           0 :             a0 = _mm_sub_epi16(a0, a6);
     253             : 
                                   /* Interleave input pairs so each madd lane computes tap_even * in_even + tap_odd * in_odd. */
     254           0 :             b0l = _mm_unpacklo_epi16(a0, a1);
     255           0 :             b0h = _mm_unpackhi_epi16(a0, a1);
     256           0 :             b1l = _mm_unpacklo_epi16(a2, a3);
     257           0 :             b1h = _mm_unpackhi_epi16(a2, a3);
     258           0 :             b2l = _mm_unpacklo_epi16(a4, a5);
     259           0 :             b2h = _mm_unpackhi_epi16(a4, a5);
     260             : 
     261           0 :             sum0 = _mm_madd_epi16(b0l, c0);
     262           0 :             sum1 = _mm_madd_epi16(b0h, c0);
     263           0 :             sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(b1l, c1));
     264           0 :             sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(b1h, c1));
     265           0 :             sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(b2l, c2));
     266           0 :             sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(b2h, c2));
     267             : 
     268           0 :             sum0 = _mm_srai_epi32(sum0, 6);
     269           0 :             sum1 = _mm_srai_epi32(sum1, 6);
     270           0 :             sum0 = _mm_packs_epi32(sum0, sum1);
     271             : 
     272             :             _mm_storeu_si128((__m128i *)dst, sum0);
     273           0 :             dst += 8;
     274             : 
     275           0 :             first_pass_if_dst += 8;
     276           0 :             row_count--;
     277           0 :         } while (row_count > 0);
     278             : 
                               /* Step over the rest of the 8-wide strip: 48 samples = 6 rows (56 = 7 rows when frac_pos_y == 2). */
     279           0 :         first_pass_if_dst += (frac_pos_y == 2) ? 56 : 48;
     280           0 :         col_count -= 8;
     281           0 :     } while (col_count > 0);
     282             : }
     283             : 
     284           0 : void LumaInterpolationFilterTwoDInRawM_SSSE3(int16_t *first_pass_if_dst, EbByte dst, uint32_t dst_stride, uint32_t pu_width, uint32_t pu_height)
     285             : {
                           /*
                            * Filters the 16-bit intermediate samples in first_pass_if_dst across
                            * eight consecutive rows with the symmetric row lumaFilterCoeff7[2] and
                            * writes 8-bit results to dst.
                            *
                            * Because that row is symmetric, inputs are first added in mirrored
                            * pairs (0+7, 1+6, 2+5, 3+4) and only taps 0..3 are multiplied.
                            *
                            * first_pass_if_dst  int16 input, read as back-to-back column strips: one
                            *                    4-sample-wide strip first when (pu_width & 4), then
                            *                    8-sample-wide strips.
                            * dst, dst_stride    8-bit output and the byte distance between its rows.
                            * pu_width           block width; handled as an optional 4-wide strip
                            *                    followed by 8-wide strips.
                            * pu_height          block height; the 4-wide path emits two rows per
                            *                    iteration -- assumes an even height, TODO confirm.
                            *
                            * Per sample: clip_u8((sum + (257 << 11)) >> 12), where
                            * 257 << 11 == (128 << 12) + (1 << 11), i.e. round-to-nearest plus 128.
                            * NOTE(review): the +128 presumably restores an offset removed by the
                            * producer of first_pass_if_dst -- confirm.
                            * NOTE(review): _mm_add_epi16 wraps rather than saturates; this assumes
                            * the sum of two intermediate samples fits in int16 -- confirm.
                            */
     286             :     int32_t row_count, col_count;
     287             : 
     288             :     __m128i c0, c1;
     289             :     __m128i a0, a1, a2, a3, a4, a5, a6, a7;
     290             :     __m128i sum0, sum1;
     291             : 
     292             :     EbByte qtr;
     293             : 
                           /* Broadcast tap pairs (0,1) and (2,3) of row 2 to every 32-bit lane. */
     294           0 :     c0 = _mm_loadu_si128((__m128i *)lumaFilterCoeff7[2]);
     295           0 :     c1 = _mm_shuffle_epi32(c0, 0x55);
     296           0 :     c0 = _mm_shuffle_epi32(c0, 0x00);
     297             : 
                           /* 4-wide strip: each 128-bit load spans two consecutive 4-sample rows. */
     298           0 :     if (pu_width & 4){
     299           0 :         row_count = pu_height;
     300           0 :         qtr = dst;
     301             : 
     302             :         do {
     303           0 :             a0 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 0 * 4));
     304           0 :             a1 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 1 * 4));
     305           0 :             a2 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 2 * 4));
     306           0 :             a3 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 3 * 4));
     307           0 :             a4 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 4 * 4));
     308           0 :             a5 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 5 * 4));
     309           0 :             a6 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 6 * 4));
     310           0 :             a7 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 7 * 4));
     311             : 
                                   /* Accumulators start at the rounding/offset constant. */
     312           0 :             sum0 = _mm_set1_epi32(257 << 11);
     313           0 :             sum1 = _mm_set1_epi32(257 << 11);
     314             : 
                                   /* Add the mirrored input pairs that share a tap value. */
     315           0 :             a0 = _mm_add_epi16(a0, a7);
     316           0 :             a1 = _mm_add_epi16(a1, a6);
     317           0 :             a2 = _mm_add_epi16(a2, a5);
     318           0 :             a3 = _mm_add_epi16(a3, a4);
     319           0 :             sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(_mm_unpacklo_epi16(a0, a1), c0));
     320           0 :             sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(_mm_unpackhi_epi16(a0, a1), c0));
     321           0 :             sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(_mm_unpacklo_epi16(a2, a3), c1));
     322           0 :             sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(_mm_unpackhi_epi16(a2, a3), c1));
     323             : 
                                   /* Scale down, then saturate 32 -> 16 (signed) -> 8 (unsigned) bits. */
     324           0 :             sum0 = _mm_srai_epi32(sum0, 12);
     325           0 :             sum1 = _mm_srai_epi32(sum1, 12);
     326           0 :             sum0 = _mm_packs_epi32(sum0, sum1);
     327           0 :             sum0 = _mm_packus_epi16(sum0, sum0);
     328             : 
                                   /*
                                    * Bytes 0..3 are the first output row, bytes 4..7 the second.
                                    * NOTE(review): these are 32-bit stores through a cast byte pointer
                                    * (type-punned, possibly unaligned).
                                    */
     329           0 :             *(uint32_t *)qtr = _mm_cvtsi128_si32(sum0); qtr += dst_stride;
     330           0 :             *(uint32_t *)qtr = _mm_cvtsi128_si32(_mm_srli_si128(sum0, 4)); qtr += dst_stride;
     331           0 :             first_pass_if_dst += 8;
     332           0 :             row_count -= 2;
     333           0 :         } while (row_count > 0);
     334             : 
     335           0 :         pu_width -= 4;
     336           0 :         if (pu_width == 0)
     337           0 :             return;
                               /* Step over the rest of the 4-wide strip: 32 samples = 8 rows. */
     338           0 :         first_pass_if_dst += 32;
     339           0 :         dst += 4;
     340             :     }
     341             : 
                           /* 8-wide strips: one 128-bit load per row, one output row per iteration. */
     342           0 :     col_count = pu_width;
     343             :     do {
     344           0 :         qtr = dst;
     345             : 
     346           0 :         row_count = pu_height;
     347             :         do {
     348           0 :             a0 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 0 * 8));
     349           0 :             a1 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 1 * 8));
     350           0 :             a2 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 2 * 8));
     351           0 :             a3 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 3 * 8));
     352           0 :             a4 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 4 * 8));
     353           0 :             a5 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 5 * 8));
     354           0 :             a6 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 6 * 8));
     355           0 :             a7 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 7 * 8));
     356             : 
     357           0 :             sum0 = _mm_set1_epi32(257 << 11);
     358           0 :             sum1 = _mm_set1_epi32(257 << 11);
     359           0 :             a0 = _mm_add_epi16(a0, a7);
     360           0 :             a1 = _mm_add_epi16(a1, a6);
     361           0 :             a2 = _mm_add_epi16(a2, a5);
     362           0 :             a3 = _mm_add_epi16(a3, a4);
     363           0 :             sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(_mm_unpacklo_epi16(a0, a1), c0));
     364           0 :             sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(_mm_unpackhi_epi16(a0, a1), c0));
     365           0 :             sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(_mm_unpacklo_epi16(a2, a3), c1));
     366           0 :             sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(_mm_unpackhi_epi16(a2, a3), c1));
     367             : 
     368           0 :             sum0 = _mm_srai_epi32(sum0, 12);
     369           0 :             sum1 = _mm_srai_epi32(sum1, 12);
     370           0 :             sum0 = _mm_packs_epi32(sum0, sum1);
     371           0 :             sum0 = _mm_packus_epi16(sum0, sum0);
     372             : 
                                   /* Store the low 8 bytes: one 8-pixel output row. */
     373           0 :             _mm_storel_epi64((__m128i *)qtr, sum0); qtr += dst_stride;
     374           0 :             first_pass_if_dst += 8;
     375           0 :         } while (--row_count > 0);
     376             : 
                               /* Step over the rest of the 8-wide strip: 56 samples = 7 rows. */
     377           0 :         first_pass_if_dst += 56;
     378           0 :         dst += 8;
     379           0 :         col_count -= 8;
     380           0 :     } while (col_count > 0);
     381             : }
     382             : 
     383           0 : void LumaInterpolationFilterTwoDInRawOutRawM_SSSE3(int16_t *first_pass_if_dst, int16_t *dst, uint32_t pu_width, uint32_t pu_height){
                           /*
                            * 16-bit-output counterpart of LumaInterpolationFilterTwoDInRawM_SSSE3:
                            * filters first_pass_if_dst across eight consecutive rows with the
                            * symmetric row lumaFilterCoeff7[2] (inputs added in mirrored pairs
                            * 0+7, 1+6, 2+5, 3+4, then multiplied by taps 0..3), shifts the 32-bit
                            * sum right by 6 without a rounding term, saturates to int16 and stores.
                            *
                            * first_pass_if_dst  int16 input, read as back-to-back column strips: one
                            *                    4-sample-wide strip first when (pu_width & 4), then
                            *                    8-sample-wide strips.
                            * dst                int16 output, written densely (8 values per store,
                            *                    no row stride) in the same strip order as the input.
                            * pu_width           block width; handled as an optional 4-wide strip
                            *                    followed by 8-wide strips.
                            * pu_height          block height; the 4-wide path emits two rows per
                            *                    iteration -- assumes an even height, TODO confirm.
                            *
                            * NOTE(review): _mm_add_epi16 wraps rather than saturates; this assumes
                            * the sum of two intermediate samples fits in int16 -- confirm.
                            */
     384             :     int32_t row_count, col_count;
     385             : 
     386             :     __m128i a0, a1, a2, a3, a4, a5, a6, a7;
     387             :     __m128i c0, c1;
                           /* Broadcast tap pairs (0,1) and (2,3) of row 2 to every 32-bit lane. */
     388           0 :     c0 = _mm_loadu_si128((__m128i *)lumaFilterCoeff7[2]);
     389           0 :     c1 = _mm_shuffle_epi32(c0, 0x55);
     390           0 :     c0 = _mm_shuffle_epi32(c0, 0x00);
     391             : 
                           /* 4-wide strip: each 128-bit load spans two consecutive 4-sample rows. */
     392           0 :     if (pu_width & 4) {
     393           0 :         row_count = pu_height;
     394             : 
     395             :         do {
     396             :             __m128i sum0, sum1;
     397           0 :             a0 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 0 * 4));
     398           0 :             a1 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 1 * 4));
     399           0 :             a2 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 2 * 4));
     400           0 :             a3 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 3 * 4));
     401           0 :             a4 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 4 * 4));
     402           0 :             a5 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 5 * 4));
     403           0 :             a6 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 6 * 4));
     404           0 :             a7 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 7 * 4));
     405             : 
                                   /* Add the mirrored input pairs that share a tap value. */
     406           0 :             a0 = _mm_add_epi16(a0, a7);
     407           0 :             a1 = _mm_add_epi16(a1, a6);
     408           0 :             a2 = _mm_add_epi16(a2, a5);
     409           0 :             a3 = _mm_add_epi16(a3, a4);
     410           0 :             sum0 = _mm_madd_epi16(_mm_unpacklo_epi16(a0, a1), c0);
     411           0 :             sum1 = _mm_madd_epi16(_mm_unpackhi_epi16(a0, a1), c0);
     412           0 :             sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(_mm_unpacklo_epi16(a2, a3), c1));
     413           0 :             sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(_mm_unpackhi_epi16(a2, a3), c1));
     414             : 
                                   /* Truncating shift, then saturate to int16. */
     415           0 :             sum0 = _mm_srai_epi32(sum0, 6);
     416           0 :             sum1 = _mm_srai_epi32(sum1, 6);
     417           0 :             sum0 = _mm_packs_epi32(sum0, sum1);
     418             : 
                                   /* One store holds two 4-sample output rows. */
     419             :             _mm_storeu_si128((__m128i *)dst, sum0);
     420           0 :             dst += 8;
     421           0 :             first_pass_if_dst += 8;
     422           0 :             row_count -= 2;
     423           0 :         } while (row_count > 0);
     424             : 
     425           0 :         pu_width -= 4;
     426           0 :         if (pu_width == 0)
     427           0 :             return;
                               /* Step over the rest of the 4-wide strip: 32 samples = 8 rows. */
     428           0 :         first_pass_if_dst += 32;
     429             :     }
     430             : 
                           /* 8-wide strips: one 128-bit load per row, one output row per iteration. */
     431           0 :     col_count = pu_width;
     432             :     do {
     433           0 :         row_count = pu_height;
     434             :         do {
     435             :             __m128i sum0, sum1;
     436           0 :             a0 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 0 * 8));
     437           0 :             a1 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 1 * 8));
     438           0 :             a2 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 2 * 8));
     439           0 :             a3 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 3 * 8));
     440           0 :             a4 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 4 * 8));
     441           0 :             a5 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 5 * 8));
     442           0 :             a6 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 6 * 8));
     443           0 :             a7 = _mm_loadu_si128((__m128i *)(first_pass_if_dst + 7 * 8));
     444             : 
     445           0 :             a0 = _mm_add_epi16(a0, a7);
     446           0 :             a1 = _mm_add_epi16(a1, a6);
     447           0 :             a2 = _mm_add_epi16(a2, a5);
     448           0 :             a3 = _mm_add_epi16(a3, a4);
     449           0 :             sum0 = _mm_madd_epi16(_mm_unpacklo_epi16(a0, a1), c0);
     450           0 :             sum1 = _mm_madd_epi16(_mm_unpackhi_epi16(a0, a1), c0);
     451           0 :             sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(_mm_unpacklo_epi16(a2, a3), c1));
     452           0 :             sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(_mm_unpackhi_epi16(a2, a3), c1));
     453             : 
     454           0 :             sum0 = _mm_srai_epi32(sum0, 6);
     455           0 :             sum1 = _mm_srai_epi32(sum1, 6);
     456           0 :             sum0 = _mm_packs_epi32(sum0, sum1);
     457             : 
     458             :             _mm_storeu_si128((__m128i *)dst, sum0);
     459           0 :             dst += 8;
     460           0 :             first_pass_if_dst += 8;
     461           0 :         } while (--row_count > 0);
     462             : 
                               /* Step over the rest of the 8-wide strip: 56 samples = 7 rows. */
     463           0 :         first_pass_if_dst += 56;
     464           0 :         col_count -= 8;
     465           0 :     } while (col_count > 0);
     466             : }
     467             : 
     468           0 : void PictureCopyKernelOutRaw_SSSE3(
     469             :     EbByte                  ref_pic,
     470             :     uint32_t                   src_stride,
     471             :     int16_t                  *dst,
     472             :     uint32_t                   pu_width,
     473             :     uint32_t                   pu_height,
     474             :     int16_t                   offset)
     475             : {
                           /*
                            * Converts a block of 8-bit samples to 16-bit: every output value is
                            * (sample << 6) - offset, computed with wrapping 16-bit arithmetic.
                            *
                            * ref_pic, src_stride  8-bit input and the byte distance between its rows.
                            * dst                  int16 output, written densely (8 values per store,
                            *                      no row stride) one column strip after another.
                            * pu_width             block width. A 2-wide strip is emitted first when
                            *                      (pu_width & 2), then a 4-wide strip when
                            *                      (pu_width & 4), then 8-wide strips.
                            * pu_height            block height in rows.
                            * offset               value subtracted from every shifted sample.
                            *
                            * NOTE(review): the loop counters are unsigned and compared with != 0,
                            * so termination requires pu_height to be a non-zero multiple of 4 when
                            * the 2-wide path runs and of 2 when the 4-wide path runs, and the width
                            * left for the final loop to be a non-zero multiple of 8. Other values
                            * wrap the counter. Confirm the callers guarantee this.
                            * NOTE(review): the 2- and 4-wide paths read the source through
                            * uint16_t / uint32_t casts of a byte pointer (type-punned, possibly
                            * unaligned).
                            */
     476             :     uint32_t row_count, col_count;
     477             :     __m128i o;
     478             : 
                           /* No-op unless the file is built with PREFETCH non-zero. */
     479           0 :     PrefetchBlock(ref_pic, src_stride, pu_width, pu_height);
     480             : 
     481           0 :     /*__m128i*/ o = _mm_set1_epi16(offset);
     482             : 
                           /* 2-wide strip: gather 2 bytes from each of 4 rows per iteration. */
     483           0 :     if (pu_width & 2) {
     484             :         __m128i a0;
     485           0 :         EbByte ptr = ref_pic;
     486           0 :         row_count = pu_height;
     487           0 :         /*__m128i*/ a0 = _mm_setzero_si128();
     488             :         do {
                                   /* 16-bit lanes 0..3 are overwritten every iteration; the unpack below uses only those lanes. */
     489           0 :             a0 = _mm_insert_epi16(a0, *(uint16_t *)ptr, 0); ptr += src_stride;
     490           0 :             a0 = _mm_insert_epi16(a0, *(uint16_t *)ptr, 1); ptr += src_stride;
     491           0 :             a0 = _mm_insert_epi16(a0, *(uint16_t *)ptr, 2); ptr += src_stride;
     492           0 :             a0 = _mm_insert_epi16(a0, *(uint16_t *)ptr, 3); ptr += src_stride;
                                   /* Zero-extend the low 8 bytes to 16 bits, scale by 64, subtract the offset. */
     493           0 :             a0 = _mm_unpacklo_epi8(a0, _mm_setzero_si128());
     494           0 :             a0 = _mm_slli_epi16(a0, 6);
     495           0 :             a0 = _mm_sub_epi16(a0, o);
     496             :             _mm_storeu_si128((__m128i *)dst, a0);
     497             : 
     498           0 :             dst += 8;
     499           0 :             row_count -= 4;
     500           0 :         } while (row_count != 0);
     501             : 
     502           0 :         pu_width -= 2;
     503           0 :         if (pu_width == 0)
     504           0 :             return;
     505           0 :         ref_pic += 2;
     506             :     }
     507             : 
                           /* 4-wide strip: gather 4 bytes from each of 2 rows per iteration. */
     508           0 :     if (pu_width & 4) {
     509           0 :         EbByte ptr = ref_pic;
     510           0 :         row_count = pu_height;
     511             :         do {
     512             :             __m128i a0, a1;
     513           0 :             a0 = _mm_cvtsi32_si128(*(uint32_t *)ptr); ptr += src_stride;
     514           0 :             a1 = _mm_cvtsi32_si128(*(uint32_t *)ptr); ptr += src_stride;
                                   /* Place the second row's 4 bytes right after the first row's. */
     515           0 :             a0 = _mm_unpacklo_epi32(a0, a1);
     516           0 :             a0 = _mm_unpacklo_epi8(a0, _mm_setzero_si128());
     517           0 :             a0 = _mm_slli_epi16(a0, 6);
     518           0 :             a0 = _mm_sub_epi16(a0, o);
     519             :             _mm_storeu_si128((__m128i *)dst, a0);
     520             : 
     521           0 :             dst += 8;
     522           0 :             row_count -= 2;
     523           0 :         } while (row_count != 0);
     524             : 
     525           0 :         pu_width -= 4;
     526           0 :         if (pu_width == 0)
     527           0 :             return;
     528           0 :         ref_pic += 4;
     529             :     }
     530             : 
                           /* 8-wide strips: one 8-byte row per iteration, all rows of a strip before the next strip. */
     531           0 :     col_count = pu_width;
     532             :     do {
     533             :         __m128i a0;
     534           0 :         EbByte ptr = ref_pic;
     535           0 :         row_count = pu_height;
     536             :         do {
     537           0 :             /*__m128i*/ a0 = _mm_loadl_epi64((__m128i *)ptr); ptr += src_stride;
     538           0 :             a0 = _mm_unpacklo_epi8(a0, _mm_setzero_si128());
     539           0 :             a0 = _mm_slli_epi16(a0, 6);
     540           0 :             a0 = _mm_sub_epi16(a0, o);
     541             :             _mm_storeu_si128((__m128i *)dst, a0);
     542           0 :             dst += 8;
     543           0 :         } while (--row_count != 0);
     544             : 
     545           0 :         col_count -= 8;
     546           0 :         ref_pic += 8;
     547           0 :     } while (col_count != 0);
     548             : }

Generated by: LCOV version 1.14