LCOV - code coverage report
Current view: top level - ASM_SSE2 - EbPackUnPack_Intrinsic_SSE2.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 0 497 0.0 %
Date: 2019-11-25 17:38:06 Functions: 0 3 0.0 %

          Line data    Source code
       1             : /*
       2             : * Copyright(c) 2019 Intel Corporation
       3             : * SPDX - License - Identifier: BSD - 2 - Clause - Patent
       4             : */
       5             : 
       6             : #include "EbPackUnPack_SSE2.h"
       7             : 
       8             : #include <emmintrin.h>
       9             : #include <stdint.h>
      10             : 
      11             : /****************************************************************************************
      12             : eb_enc_msb_un_pack2d_sse2_intrin
      13             : ******************************************************************************************/
      14             : 
      15           0 : void eb_enc_msb_un_pack2d_sse2_intrin(
      16             :     uint16_t      *in16_bit_buffer,
      17             :     uint32_t       in_stride,
      18             :     uint8_t       *out8_bit_buffer,
      19             :     uint8_t       *outn_bit_buffer,
      20             :     uint32_t       out8_stride,
      21             :     uint32_t       outn_stride,
      22             :     uint32_t       width,
      23             :     uint32_t       height)
      24             : {
      25             :     uint32_t x, y;
      26             : 
      27             :     __m128i xmm_3, xmm_00FF, inPixel0, inPixel1, tempPixel0, tempPixel1, inPixel1_shftR_2_U8, inPixel0_shftR_2_U8, inPixel0_shftR_2, inPixel1_shftR_2;
      28             :     __m128i tempPixel0_U8, tempPixel1_U8;
      29             : 
      30           0 :     xmm_3 = _mm_set1_epi16(0x0003);
      31           0 :     xmm_00FF = _mm_set1_epi16(0x00FF);
      32             : 
      33           0 :     if (width == 4)
      34             :     {
      35           0 :         for (y = 0; y < height; y += 2)
      36             :         {
      37           0 :             inPixel0 = _mm_loadl_epi64((__m128i*)in16_bit_buffer);
      38           0 :             inPixel1 = _mm_loadl_epi64((__m128i*)(in16_bit_buffer + in_stride));
      39             : 
      40           0 :             tempPixel0 = _mm_slli_epi16(_mm_and_si128(inPixel0, xmm_3), 6);
      41           0 :             tempPixel1 = _mm_slli_epi16(_mm_and_si128(inPixel1, xmm_3), 6);
      42             : 
      43           0 :             tempPixel0_U8 = _mm_packus_epi16(tempPixel0, tempPixel0);
      44           0 :             tempPixel1_U8 = _mm_packus_epi16(tempPixel1, tempPixel1);
      45             : 
      46           0 :             inPixel0_shftR_2 = _mm_and_si128(_mm_srli_epi16(inPixel0, 2), xmm_00FF);
      47           0 :             inPixel1_shftR_2 = _mm_and_si128(_mm_srli_epi16(inPixel1, 2), xmm_00FF);
      48             : 
      49           0 :             inPixel0_shftR_2_U8 = _mm_packus_epi16(inPixel0_shftR_2, inPixel0_shftR_2);
      50           0 :             inPixel1_shftR_2_U8 = _mm_packus_epi16(inPixel1_shftR_2, inPixel1_shftR_2);
      51             : 
      52           0 :             *(uint32_t*)outn_bit_buffer = _mm_cvtsi128_si32(tempPixel0_U8);
      53           0 :             *(uint32_t*)(outn_bit_buffer + outn_stride) = _mm_cvtsi128_si32(tempPixel1_U8);
      54           0 :             *(uint32_t*)out8_bit_buffer = _mm_cvtsi128_si32(inPixel0_shftR_2_U8);
      55           0 :             *(uint32_t*)(out8_bit_buffer + out8_stride) = _mm_cvtsi128_si32(inPixel1_shftR_2_U8);
      56             : 
      57           0 :             outn_bit_buffer += 2 * outn_stride;
      58           0 :             out8_bit_buffer += 2 * out8_stride;
      59           0 :             in16_bit_buffer += 2 * in_stride;
      60             :         }
      61             :     }
      62           0 :     else if (width == 8)
      63             :     {
      64           0 :         for (y = 0; y < height; y += 2)
      65             :         {
      66           0 :             inPixel0 = _mm_loadu_si128((__m128i*) in16_bit_buffer);
      67           0 :             inPixel1 = _mm_loadu_si128((__m128i*) (in16_bit_buffer + in_stride));
      68             : 
      69           0 :             tempPixel0 = _mm_slli_epi16(_mm_and_si128(inPixel0, xmm_3), 6);
      70           0 :             tempPixel1 = _mm_slli_epi16(_mm_and_si128(inPixel1, xmm_3), 6);
      71             : 
      72           0 :             tempPixel0_U8 = _mm_packus_epi16(tempPixel0, tempPixel0);
      73           0 :             tempPixel1_U8 = _mm_packus_epi16(tempPixel1, tempPixel1);
      74             : 
      75           0 :             inPixel0_shftR_2 = _mm_and_si128(_mm_srli_epi16(inPixel0, 2), xmm_00FF);
      76           0 :             inPixel1_shftR_2 = _mm_and_si128(_mm_srli_epi16(inPixel1, 2), xmm_00FF);
      77             : 
      78           0 :             inPixel0_shftR_2_U8 = _mm_packus_epi16(inPixel0_shftR_2, inPixel0_shftR_2);
      79           0 :             inPixel1_shftR_2_U8 = _mm_packus_epi16(inPixel1_shftR_2, inPixel1_shftR_2);
      80             : 
      81           0 :             _mm_storel_epi64((__m128i*)outn_bit_buffer, tempPixel0_U8);
      82           0 :             _mm_storel_epi64((__m128i*)(outn_bit_buffer + outn_stride), tempPixel1_U8);
      83           0 :             _mm_storel_epi64((__m128i*)out8_bit_buffer, inPixel0_shftR_2_U8);
      84           0 :             _mm_storel_epi64((__m128i*)(out8_bit_buffer + out8_stride), inPixel1_shftR_2_U8);
      85             : 
      86           0 :             outn_bit_buffer += 2 * outn_stride;
      87           0 :             out8_bit_buffer += 2 * out8_stride;
      88           0 :             in16_bit_buffer += 2 * in_stride;
      89             :         }
      90             :     }
      91           0 :     else if (width == 16)
      92             :     {
      93             :         __m128i inPixel2, inPixel3;
      94             : 
      95           0 :         for (y = 0; y < height; y += 2)
      96             :         {
      97           0 :             inPixel0 = _mm_loadu_si128((__m128i*) in16_bit_buffer);
      98           0 :             inPixel1 = _mm_loadu_si128((__m128i*) (in16_bit_buffer + 8));
      99           0 :             inPixel2 = _mm_loadu_si128((__m128i*) (in16_bit_buffer + in_stride));
     100           0 :             inPixel3 = _mm_loadu_si128((__m128i*) (in16_bit_buffer + in_stride + 8));
     101             : 
     102           0 :             tempPixel0_U8 = _mm_packus_epi16(_mm_slli_epi16(_mm_and_si128(inPixel0, xmm_3), 6), _mm_slli_epi16(_mm_and_si128(inPixel1, xmm_3), 6));
     103           0 :             tempPixel1_U8 = _mm_packus_epi16(_mm_slli_epi16(_mm_and_si128(inPixel2, xmm_3), 6), _mm_slli_epi16(_mm_and_si128(inPixel3, xmm_3), 6));
     104             : 
     105           0 :             inPixel0_shftR_2_U8 = _mm_packus_epi16(_mm_and_si128(_mm_srli_epi16(inPixel0, 2), xmm_00FF), _mm_and_si128(_mm_srli_epi16(inPixel1, 2), xmm_00FF));
     106           0 :             inPixel1_shftR_2_U8 = _mm_packus_epi16(_mm_and_si128(_mm_srli_epi16(inPixel2, 2), xmm_00FF), _mm_and_si128(_mm_srli_epi16(inPixel3, 2), xmm_00FF));
     107             : 
     108             :             _mm_storeu_si128((__m128i*)outn_bit_buffer, tempPixel0_U8);
     109           0 :             _mm_storeu_si128((__m128i*)(outn_bit_buffer + outn_stride), tempPixel1_U8);
     110             :             _mm_storeu_si128((__m128i*)out8_bit_buffer, inPixel0_shftR_2_U8);
     111           0 :             _mm_storeu_si128((__m128i*)(out8_bit_buffer + out8_stride), inPixel1_shftR_2_U8);
     112             : 
     113           0 :             outn_bit_buffer += 2 * outn_stride;
     114           0 :             out8_bit_buffer += 2 * out8_stride;
     115           0 :             in16_bit_buffer += 2 * in_stride;
     116             :         }
     117             :     }
     118           0 :     else if (width == 32)
     119             :     {
     120             :         __m128i inPixel2, inPixel3, inPixel4, inPixel5, inPixel6, inPixel7;
     121             :         __m128i outn0_U8, outn1_U8, outn2_U8, outn3_U8, out8_0_U8, out8_1_U8, out8_2_U8, out8_3_U8;
     122             : 
     123           0 :         for (y = 0; y < height; y += 2)
     124             :         {
     125           0 :             inPixel0 = _mm_loadu_si128((__m128i*)in16_bit_buffer);
     126           0 :             inPixel1 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + 8));
     127           0 :             inPixel2 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + 16));
     128           0 :             inPixel3 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + 24));
     129           0 :             inPixel4 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + in_stride));
     130           0 :             inPixel5 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + in_stride + 8));
     131           0 :             inPixel6 = _mm_loadu_si128((__m128i*) (in16_bit_buffer + in_stride + 16));
     132           0 :             inPixel7 = _mm_loadu_si128((__m128i*) (in16_bit_buffer + in_stride + 24));
     133             : 
     134           0 :             outn0_U8 = _mm_packus_epi16(_mm_slli_epi16(_mm_and_si128(inPixel0, xmm_3), 6), _mm_slli_epi16(_mm_and_si128(inPixel1, xmm_3), 6));
     135           0 :             outn1_U8 = _mm_packus_epi16(_mm_slli_epi16(_mm_and_si128(inPixel2, xmm_3), 6), _mm_slli_epi16(_mm_and_si128(inPixel3, xmm_3), 6));
     136           0 :             outn2_U8 = _mm_packus_epi16(_mm_slli_epi16(_mm_and_si128(inPixel4, xmm_3), 6), _mm_slli_epi16(_mm_and_si128(inPixel5, xmm_3), 6));
     137           0 :             outn3_U8 = _mm_packus_epi16(_mm_slli_epi16(_mm_and_si128(inPixel6, xmm_3), 6), _mm_slli_epi16(_mm_and_si128(inPixel7, xmm_3), 6));
     138             : 
     139           0 :             out8_0_U8 = _mm_packus_epi16(_mm_and_si128(_mm_srli_epi16(inPixel0, 2), xmm_00FF), _mm_and_si128(_mm_srli_epi16(inPixel1, 2), xmm_00FF));
     140           0 :             out8_1_U8 = _mm_packus_epi16(_mm_and_si128(_mm_srli_epi16(inPixel2, 2), xmm_00FF), _mm_and_si128(_mm_srli_epi16(inPixel3, 2), xmm_00FF));
     141           0 :             out8_2_U8 = _mm_packus_epi16(_mm_and_si128(_mm_srli_epi16(inPixel4, 2), xmm_00FF), _mm_and_si128(_mm_srli_epi16(inPixel5, 2), xmm_00FF));
     142           0 :             out8_3_U8 = _mm_packus_epi16(_mm_and_si128(_mm_srli_epi16(inPixel6, 2), xmm_00FF), _mm_and_si128(_mm_srli_epi16(inPixel7, 2), xmm_00FF));
     143             : 
     144             :             _mm_storeu_si128((__m128i*)outn_bit_buffer, outn0_U8);
     145           0 :             _mm_storeu_si128((__m128i*)(outn_bit_buffer + 16), outn1_U8);
     146           0 :             _mm_storeu_si128((__m128i*)(outn_bit_buffer + outn_stride), outn2_U8);
     147           0 :             _mm_storeu_si128((__m128i*)(outn_bit_buffer + outn_stride + 16), outn3_U8);
     148             : 
     149             :             _mm_storeu_si128((__m128i*)out8_bit_buffer, out8_0_U8);
     150           0 :             _mm_storeu_si128((__m128i*)(out8_bit_buffer + 16), out8_1_U8);
     151           0 :             _mm_storeu_si128((__m128i*)(out8_bit_buffer + out8_stride), out8_2_U8);
     152           0 :             _mm_storeu_si128((__m128i*)(out8_bit_buffer + out8_stride + 16), out8_3_U8);
     153             : 
     154           0 :             outn_bit_buffer += 2 * outn_stride;
     155           0 :             out8_bit_buffer += 2 * out8_stride;
     156           0 :             in16_bit_buffer += 2 * in_stride;
     157             :         }
     158             :     }
     159           0 :     else if (width == 64)
     160             :     {
     161             :         __m128i inPixel2, inPixel3, inPixel4, inPixel5, inPixel6, inPixel7;
     162             :         __m128i outn0_U8, outn1_U8, outn2_U8, outn3_U8, out8_0_U8, out8_1_U8, out8_2_U8, out8_3_U8;
     163             : 
     164           0 :         for (y = 0; y < height; ++y)
     165             :         {
     166           0 :             inPixel0 = _mm_loadu_si128((__m128i*)in16_bit_buffer);
     167           0 :             inPixel1 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + 8));
     168           0 :             inPixel2 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + 16));
     169           0 :             inPixel3 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + 24));
     170           0 :             inPixel4 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + 32));
     171           0 :             inPixel5 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + 40));
     172           0 :             inPixel6 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + 48));
     173           0 :             inPixel7 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + 56));
     174             : 
     175           0 :             outn0_U8 = _mm_packus_epi16(_mm_slli_epi16(_mm_and_si128(inPixel0, xmm_3), 6), _mm_slli_epi16(_mm_and_si128(inPixel1, xmm_3), 6));
     176           0 :             outn1_U8 = _mm_packus_epi16(_mm_slli_epi16(_mm_and_si128(inPixel2, xmm_3), 6), _mm_slli_epi16(_mm_and_si128(inPixel3, xmm_3), 6));
     177           0 :             outn2_U8 = _mm_packus_epi16(_mm_slli_epi16(_mm_and_si128(inPixel4, xmm_3), 6), _mm_slli_epi16(_mm_and_si128(inPixel5, xmm_3), 6));
     178           0 :             outn3_U8 = _mm_packus_epi16(_mm_slli_epi16(_mm_and_si128(inPixel6, xmm_3), 6), _mm_slli_epi16(_mm_and_si128(inPixel7, xmm_3), 6));
     179             : 
     180           0 :             out8_0_U8 = _mm_packus_epi16(_mm_and_si128(_mm_srli_epi16(inPixel0, 2), xmm_00FF), _mm_and_si128(_mm_srli_epi16(inPixel1, 2), xmm_00FF));
     181           0 :             out8_1_U8 = _mm_packus_epi16(_mm_and_si128(_mm_srli_epi16(inPixel2, 2), xmm_00FF), _mm_and_si128(_mm_srli_epi16(inPixel3, 2), xmm_00FF));
     182           0 :             out8_2_U8 = _mm_packus_epi16(_mm_and_si128(_mm_srli_epi16(inPixel4, 2), xmm_00FF), _mm_and_si128(_mm_srli_epi16(inPixel5, 2), xmm_00FF));
     183           0 :             out8_3_U8 = _mm_packus_epi16(_mm_and_si128(_mm_srli_epi16(inPixel6, 2), xmm_00FF), _mm_and_si128(_mm_srli_epi16(inPixel7, 2), xmm_00FF));
     184             : 
     185             :             _mm_storeu_si128((__m128i*)outn_bit_buffer, outn0_U8);
     186           0 :             _mm_storeu_si128((__m128i*)(outn_bit_buffer + 16), outn1_U8);
     187           0 :             _mm_storeu_si128((__m128i*)(outn_bit_buffer + 32), outn2_U8);
     188           0 :             _mm_storeu_si128((__m128i*)(outn_bit_buffer + 48), outn3_U8);
     189             : 
     190             :             _mm_storeu_si128((__m128i*)out8_bit_buffer, out8_0_U8);
     191           0 :             _mm_storeu_si128((__m128i*)(out8_bit_buffer + 16), out8_1_U8);
     192           0 :             _mm_storeu_si128((__m128i*)(out8_bit_buffer + 32), out8_2_U8);
     193           0 :             _mm_storeu_si128((__m128i*)(out8_bit_buffer + 48), out8_3_U8);
     194             : 
     195           0 :             outn_bit_buffer += outn_stride;
     196           0 :             out8_bit_buffer += out8_stride;
     197           0 :             in16_bit_buffer += in_stride;
     198             :         }
     199             :     }
     200             :     else
     201             :     {
     202           0 :         uint32_t inStrideDiff = (2 * in_stride) - width;
     203           0 :         uint32_t out8StrideDiff = (2 * out8_stride) - width;
     204           0 :         uint32_t outnStrideDiff = (2 * outn_stride) - width;
     205             : 
     206           0 :         uint32_t inStrideDiff64 = in_stride - width;
     207           0 :         uint32_t out8StrideDiff64 = out8_stride - width;
     208           0 :         uint32_t outnStrideDiff64 = outn_stride - width;
     209             : 
     210           0 :         if (!(width & 63))
     211             :         {
     212             :             __m128i inPixel2, inPixel3, inPixel4, inPixel5, inPixel6, inPixel7;
     213             :             __m128i outn0_U8, outn1_U8, outn2_U8, outn3_U8, out8_0_U8, out8_1_U8, out8_2_U8, out8_3_U8;
     214             : 
     215           0 :             for (x = 0; x < height; x += 1) {
     216           0 :                 for (y = 0; y < width; y += 64) {
     217           0 :                     inPixel0 = _mm_loadu_si128((__m128i*)in16_bit_buffer);
     218           0 :                     inPixel1 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + 8));
     219           0 :                     inPixel2 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + 16));
     220           0 :                     inPixel3 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + 24));
     221           0 :                     inPixel4 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + 32));
     222           0 :                     inPixel5 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + 40));
     223           0 :                     inPixel6 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + 48));
     224           0 :                     inPixel7 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + 56));
     225             : 
     226           0 :                     outn0_U8 = _mm_packus_epi16(_mm_slli_epi16(_mm_and_si128(inPixel0, xmm_3), 6), _mm_slli_epi16(_mm_and_si128(inPixel1, xmm_3), 6));
     227           0 :                     outn1_U8 = _mm_packus_epi16(_mm_slli_epi16(_mm_and_si128(inPixel2, xmm_3), 6), _mm_slli_epi16(_mm_and_si128(inPixel3, xmm_3), 6));
     228           0 :                     outn2_U8 = _mm_packus_epi16(_mm_slli_epi16(_mm_and_si128(inPixel4, xmm_3), 6), _mm_slli_epi16(_mm_and_si128(inPixel5, xmm_3), 6));
     229           0 :                     outn3_U8 = _mm_packus_epi16(_mm_slli_epi16(_mm_and_si128(inPixel6, xmm_3), 6), _mm_slli_epi16(_mm_and_si128(inPixel7, xmm_3), 6));
     230             : 
     231           0 :                     out8_0_U8 = _mm_packus_epi16(_mm_and_si128(_mm_srli_epi16(inPixel0, 2), xmm_00FF), _mm_and_si128(_mm_srli_epi16(inPixel1, 2), xmm_00FF));
     232           0 :                     out8_1_U8 = _mm_packus_epi16(_mm_and_si128(_mm_srli_epi16(inPixel2, 2), xmm_00FF), _mm_and_si128(_mm_srli_epi16(inPixel3, 2), xmm_00FF));
     233           0 :                     out8_2_U8 = _mm_packus_epi16(_mm_and_si128(_mm_srli_epi16(inPixel4, 2), xmm_00FF), _mm_and_si128(_mm_srli_epi16(inPixel5, 2), xmm_00FF));
     234           0 :                     out8_3_U8 = _mm_packus_epi16(_mm_and_si128(_mm_srli_epi16(inPixel6, 2), xmm_00FF), _mm_and_si128(_mm_srli_epi16(inPixel7, 2), xmm_00FF));
     235             : 
     236             :                     _mm_storeu_si128((__m128i*)outn_bit_buffer, outn0_U8);
     237           0 :                     _mm_storeu_si128((__m128i*)(outn_bit_buffer + 16), outn1_U8);
     238           0 :                     _mm_storeu_si128((__m128i*)(outn_bit_buffer + 32), outn2_U8);
     239           0 :                     _mm_storeu_si128((__m128i*)(outn_bit_buffer + 48), outn3_U8);
     240             : 
     241             :                     _mm_storeu_si128((__m128i*)out8_bit_buffer, out8_0_U8);
     242           0 :                     _mm_storeu_si128((__m128i*)(out8_bit_buffer + 16), out8_1_U8);
     243           0 :                     _mm_storeu_si128((__m128i*)(out8_bit_buffer + 32), out8_2_U8);
     244           0 :                     _mm_storeu_si128((__m128i*)(out8_bit_buffer + 48), out8_3_U8);
     245             : 
     246           0 :                     outn_bit_buffer += 64;
     247           0 :                     out8_bit_buffer += 64;
     248           0 :                     in16_bit_buffer += 64;
     249             :                 }
     250           0 :                 in16_bit_buffer += inStrideDiff64;
     251           0 :                 outn_bit_buffer += outnStrideDiff64;
     252           0 :                 out8_bit_buffer += out8StrideDiff64;
     253             :             }
     254             :         }
     255           0 :         else if (!(width & 31))
     256             :         {
     257             :             __m128i inPixel2, inPixel3, inPixel4, inPixel5, inPixel6, inPixel7;
     258             :             __m128i outn0_U8, outn1_U8, outn2_U8, outn3_U8, out8_0_U8, out8_1_U8, out8_2_U8, out8_3_U8;
     259             : 
     260           0 :             for (x = 0; x < height; x += 2)
     261             :             {
     262           0 :                 for (y = 0; y < width; y += 32)
     263             :                 {
     264           0 :                     inPixel0 = _mm_loadu_si128((__m128i*)in16_bit_buffer);
     265           0 :                     inPixel1 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + 8));
     266           0 :                     inPixel2 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + 16));
     267           0 :                     inPixel3 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + 24));
     268           0 :                     inPixel4 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + in_stride));
     269           0 :                     inPixel5 = _mm_loadu_si128((__m128i*)(in16_bit_buffer + in_stride + 8));
     270           0 :                     inPixel6 = _mm_loadu_si128((__m128i*) (in16_bit_buffer + in_stride + 16));
     271           0 :                     inPixel7 = _mm_loadu_si128((__m128i*) (in16_bit_buffer + in_stride + 24));
     272             : 
     273           0 :                     outn0_U8 = _mm_packus_epi16(_mm_slli_epi16(_mm_and_si128(inPixel0, xmm_3), 6), _mm_slli_epi16(_mm_and_si128(inPixel1, xmm_3), 6));
     274           0 :                     outn1_U8 = _mm_packus_epi16(_mm_slli_epi16(_mm_and_si128(inPixel2, xmm_3), 6), _mm_slli_epi16(_mm_and_si128(inPixel3, xmm_3), 6));
     275           0 :                     outn2_U8 = _mm_packus_epi16(_mm_slli_epi16(_mm_and_si128(inPixel4, xmm_3), 6), _mm_slli_epi16(_mm_and_si128(inPixel5, xmm_3), 6));
     276           0 :                     outn3_U8 = _mm_packus_epi16(_mm_slli_epi16(_mm_and_si128(inPixel6, xmm_3), 6), _mm_slli_epi16(_mm_and_si128(inPixel7, xmm_3), 6));
     277             : 
     278           0 :                     out8_0_U8 = _mm_packus_epi16(_mm_and_si128(_mm_srli_epi16(inPixel0, 2), xmm_00FF), _mm_and_si128(_mm_srli_epi16(inPixel1, 2), xmm_00FF));
     279           0 :                     out8_1_U8 = _mm_packus_epi16(_mm_and_si128(_mm_srli_epi16(inPixel2, 2), xmm_00FF), _mm_and_si128(_mm_srli_epi16(inPixel3, 2), xmm_00FF));
     280           0 :                     out8_2_U8 = _mm_packus_epi16(_mm_and_si128(_mm_srli_epi16(inPixel4, 2), xmm_00FF), _mm_and_si128(_mm_srli_epi16(inPixel5, 2), xmm_00FF));
     281           0 :                     out8_3_U8 = _mm_packus_epi16(_mm_and_si128(_mm_srli_epi16(inPixel6, 2), xmm_00FF), _mm_and_si128(_mm_srli_epi16(inPixel7, 2), xmm_00FF));
     282             : 
     283             :                     _mm_storeu_si128((__m128i*)outn_bit_buffer, outn0_U8);
     284           0 :                     _mm_storeu_si128((__m128i*)(outn_bit_buffer + 16), outn1_U8);
     285           0 :                     _mm_storeu_si128((__m128i*)(outn_bit_buffer + outn_stride), outn2_U8);
     286           0 :                     _mm_storeu_si128((__m128i*)(outn_bit_buffer + outn_stride + 16), outn3_U8);
     287             : 
     288             :                     _mm_storeu_si128((__m128i*)out8_bit_buffer, out8_0_U8);
     289           0 :                     _mm_storeu_si128((__m128i*)(out8_bit_buffer + 16), out8_1_U8);
     290           0 :                     _mm_storeu_si128((__m128i*)(out8_bit_buffer + out8_stride), out8_2_U8);
     291           0 :                     _mm_storeu_si128((__m128i*)(out8_bit_buffer + out8_stride + 16), out8_3_U8);
     292             : 
     293           0 :                     outn_bit_buffer += 32;
     294           0 :                     out8_bit_buffer += 32;
     295           0 :                     in16_bit_buffer += 32;
     296             :                 }
     297           0 :                 in16_bit_buffer += inStrideDiff;
     298           0 :                 outn_bit_buffer += outnStrideDiff;
     299           0 :                 out8_bit_buffer += out8StrideDiff;
     300             :             }
     301             :         }
     302           0 :         else if (!(width & 15))
     303             :         {
     304             :             __m128i inPixel2, inPixel3;
     305             : 
     306           0 :             for (x = 0; x < height; x += 2)
     307             :             {
     308           0 :                 for (y = 0; y < width; y += 16)
     309             :                 {
     310           0 :                     inPixel0 = _mm_loadu_si128((__m128i*) in16_bit_buffer);
     311           0 :                     inPixel1 = _mm_loadu_si128((__m128i*) (in16_bit_buffer + 8));
     312           0 :                     inPixel2 = _mm_loadu_si128((__m128i*) (in16_bit_buffer + in_stride));
     313           0 :                     inPixel3 = _mm_loadu_si128((__m128i*) (in16_bit_buffer + in_stride + 8));
     314             : 
     315           0 :                     tempPixel0_U8 = _mm_packus_epi16(_mm_slli_epi16(_mm_and_si128(inPixel0, xmm_3), 6), _mm_slli_epi16(_mm_and_si128(inPixel1, xmm_3), 6));
     316           0 :                     tempPixel1_U8 = _mm_packus_epi16(_mm_slli_epi16(_mm_and_si128(inPixel2, xmm_3), 6), _mm_slli_epi16(_mm_and_si128(inPixel3, xmm_3), 6));
     317             : 
     318           0 :                     inPixel0_shftR_2_U8 = _mm_packus_epi16(_mm_and_si128(_mm_srli_epi16(inPixel0, 2), xmm_00FF), _mm_and_si128(_mm_srli_epi16(inPixel1, 2), xmm_00FF));
     319           0 :                     inPixel1_shftR_2_U8 = _mm_packus_epi16(_mm_and_si128(_mm_srli_epi16(inPixel2, 2), xmm_00FF), _mm_and_si128(_mm_srli_epi16(inPixel3, 2), xmm_00FF));
     320             : 
     321             :                     _mm_storeu_si128((__m128i*)outn_bit_buffer, tempPixel0_U8);
     322           0 :                     _mm_storeu_si128((__m128i*)(outn_bit_buffer + outn_stride), tempPixel1_U8);
     323             :                     _mm_storeu_si128((__m128i*)out8_bit_buffer, inPixel0_shftR_2_U8);
     324           0 :                     _mm_storeu_si128((__m128i*)(out8_bit_buffer + out8_stride), inPixel1_shftR_2_U8);
     325             : 
     326           0 :                     outn_bit_buffer += 16;
     327           0 :                     out8_bit_buffer += 16;
     328           0 :                     in16_bit_buffer += 16;
     329             :                 }
     330           0 :                 in16_bit_buffer += inStrideDiff;
     331           0 :                 outn_bit_buffer += outnStrideDiff;
     332           0 :                 out8_bit_buffer += out8StrideDiff;
     333             :             }
     334             :         }
     335           0 :         else if (!(width & 7))
     336             :         {
     337           0 :             for (x = 0; x < height; x += 2)
     338             :             {
     339           0 :                 for (y = 0; y < width; y += 8)
     340             :                 {
     341           0 :                     inPixel0 = _mm_loadu_si128((__m128i*) in16_bit_buffer);
     342           0 :                     inPixel1 = _mm_loadu_si128((__m128i*) (in16_bit_buffer + in_stride));
     343             : 
     344           0 :                     tempPixel0 = _mm_slli_epi16(_mm_and_si128(inPixel0, xmm_3), 6);
     345           0 :                     tempPixel1 = _mm_slli_epi16(_mm_and_si128(inPixel1, xmm_3), 6);
     346             : 
     347           0 :                     tempPixel0_U8 = _mm_packus_epi16(tempPixel0, tempPixel0);
     348           0 :                     tempPixel1_U8 = _mm_packus_epi16(tempPixel1, tempPixel1);
     349             : 
     350           0 :                     inPixel0_shftR_2 = _mm_and_si128(_mm_srli_epi16(inPixel0, 2), xmm_00FF);
     351           0 :                     inPixel1_shftR_2 = _mm_and_si128(_mm_srli_epi16(inPixel1, 2), xmm_00FF);
     352             : 
     353           0 :                     inPixel0_shftR_2_U8 = _mm_packus_epi16(inPixel0_shftR_2, inPixel0_shftR_2);
     354           0 :                     inPixel1_shftR_2_U8 = _mm_packus_epi16(inPixel1_shftR_2, inPixel1_shftR_2);
     355             : 
     356           0 :                     _mm_storel_epi64((__m128i*)outn_bit_buffer, tempPixel0_U8);
     357           0 :                     _mm_storel_epi64((__m128i*)(outn_bit_buffer + outn_stride), tempPixel1_U8);
     358           0 :                     _mm_storel_epi64((__m128i*)out8_bit_buffer, inPixel0_shftR_2_U8);
     359           0 :                     _mm_storel_epi64((__m128i*)(out8_bit_buffer + out8_stride), inPixel1_shftR_2_U8);
     360             : 
     361           0 :                     outn_bit_buffer += 8;
     362           0 :                     out8_bit_buffer += 8;
     363           0 :                     in16_bit_buffer += 8;
     364             :                 }
     365           0 :                 in16_bit_buffer += inStrideDiff;
     366           0 :                 outn_bit_buffer += outnStrideDiff;
     367           0 :                 out8_bit_buffer += out8StrideDiff;
     368             :             }
     369             :         }
     370             :         else
     371             :         {
     372           0 :             for (x = 0; x < height; x += 2)
     373             :             {
     374           0 :                 for (y = 0; y < width; y += 4)
     375             :                 {
     376           0 :                     inPixel0 = _mm_loadl_epi64((__m128i*)in16_bit_buffer);
     377           0 :                     inPixel1 = _mm_loadl_epi64((__m128i*)(in16_bit_buffer + in_stride));
     378             : 
     379           0 :                     tempPixel0 = _mm_slli_epi16(_mm_and_si128(inPixel0, xmm_3), 6);
     380           0 :                     tempPixel1 = _mm_slli_epi16(_mm_and_si128(inPixel1, xmm_3), 6);
     381             : 
     382           0 :                     tempPixel0_U8 = _mm_packus_epi16(tempPixel0, tempPixel0);
     383           0 :                     tempPixel1_U8 = _mm_packus_epi16(tempPixel1, tempPixel1);
     384             : 
     385           0 :                     inPixel0_shftR_2 = _mm_and_si128(_mm_srli_epi16(inPixel0, 2), xmm_00FF);
     386           0 :                     inPixel1_shftR_2 = _mm_and_si128(_mm_srli_epi16(inPixel1, 2), xmm_00FF);
     387             : 
     388           0 :                     inPixel0_shftR_2_U8 = _mm_packus_epi16(inPixel0_shftR_2, inPixel0_shftR_2);
     389           0 :                     inPixel1_shftR_2_U8 = _mm_packus_epi16(inPixel1_shftR_2, inPixel1_shftR_2);
     390             : 
     391           0 :                     *(uint32_t*)outn_bit_buffer = _mm_cvtsi128_si32(tempPixel0_U8);
     392           0 :                     *(uint32_t*)(outn_bit_buffer + outn_stride) = _mm_cvtsi128_si32(tempPixel1_U8);
     393           0 :                     *(uint32_t*)out8_bit_buffer = _mm_cvtsi128_si32(inPixel0_shftR_2_U8);
     394           0 :                     *(uint32_t*)(out8_bit_buffer + out8_stride) = _mm_cvtsi128_si32(inPixel1_shftR_2_U8);
     395             : 
     396           0 :                     outn_bit_buffer += 4;
     397           0 :                     out8_bit_buffer += 4;
     398           0 :                     in16_bit_buffer += 4;
     399             :                 }
     400           0 :                 in16_bit_buffer += inStrideDiff;
     401           0 :                 outn_bit_buffer += outnStrideDiff;
     402           0 :                 out8_bit_buffer += out8StrideDiff;
     403             :             }
     404             :         }
     405             :     }
     406           0 :     return;
     407             : }
     408             : 
     409           0 : void unpack_avg_sse2_intrin(
     410             :     uint16_t *ref16_l0,
     411             :     uint32_t  ref_l0_stride,
     412             :     uint16_t *ref16_l1,
     413             :     uint32_t  ref_l1_stride,
     414             :     uint8_t  *dst_ptr,
     415             :     uint32_t  dst_stride,
     416             :     uint32_t  width,
     417             :     uint32_t  height)
     418             : {
     419             :     uint32_t  y;
     420             :     __m128i inPixel0, inPixel1;
     421             : 
     422           0 :     if (width == 4)
     423             :     {
     424             :         __m128i out8_0_U8_L0, out8_0_U8_L1;
     425             :         __m128i avg8_0_U8;
     426             : 
     427           0 :         for (y = 0; y < height; y += 2)
     428             :         {
     429             :             //--------
     430             :             //Line One
     431             :             //--------
     432             : 
     433             :             //List0
     434           0 :             inPixel0 = _mm_loadl_epi64((__m128i*)ref16_l0);
     435           0 :             inPixel1 = _mm_srli_epi16(inPixel0, 2);
     436           0 :             out8_0_U8_L0 = _mm_packus_epi16(inPixel1, inPixel1);
     437             : 
     438             :             //List1
     439           0 :             inPixel0 = _mm_loadl_epi64((__m128i*)ref16_l1);
     440           0 :             inPixel1 = _mm_srli_epi16(inPixel0, 2);
     441           0 :             out8_0_U8_L1 = _mm_packus_epi16(inPixel1, inPixel1);
     442             : 
     443             :             //AVG
     444           0 :             avg8_0_U8 = _mm_avg_epu8(out8_0_U8_L0, out8_0_U8_L1);
     445             : 
     446           0 :             *(uint32_t*)dst_ptr = _mm_cvtsi128_si32(avg8_0_U8);
     447             : 
     448             :             //--------
     449             :             //Line Two
     450             :             //--------
     451             : 
     452             :             //List0
     453           0 :             inPixel0 = _mm_loadl_epi64((__m128i*)(ref16_l0 + ref_l0_stride));
     454           0 :             inPixel1 = _mm_srli_epi16(inPixel0, 2);
     455           0 :             out8_0_U8_L0 = _mm_packus_epi16(inPixel1, inPixel1);
     456             : 
     457             :             //List1
     458             : 
     459           0 :             inPixel0 = _mm_loadl_epi64((__m128i*)(ref16_l1 + ref_l1_stride));
     460           0 :             inPixel1 = _mm_srli_epi16(inPixel0, 2);
     461           0 :             out8_0_U8_L1 = _mm_packus_epi16(inPixel1, inPixel1);
     462             : 
     463             :             //AVG
     464           0 :             avg8_0_U8 = _mm_avg_epu8(out8_0_U8_L0, out8_0_U8_L1);
     465             : 
     466           0 :             *(uint32_t*)(dst_ptr + dst_stride) = _mm_cvtsi128_si32(avg8_0_U8);
     467             : 
     468           0 :             dst_ptr += 2 * dst_stride;
     469           0 :             ref16_l0 += 2 * ref_l0_stride;
     470           0 :             ref16_l1 += 2 * ref_l1_stride;
     471             :         }
     472             :     }
     473           0 :     else if (width == 8)
     474             :     {
     475             :         __m128i out8_0_U8_L0, out8_0_U8_L1, out8_2_U8_L0, out8_2_U8_L1;
     476             :         __m128i avg8_0_U8, avg8_2_U8;
     477             : 
     478           0 :         for (y = 0; y < height; y += 2)
     479             :         {
     480             :             //--------
     481             :             //Line One
     482             :             //--------
     483             : 
     484             :             //List0
     485             : 
     486           0 :             inPixel0 = _mm_loadu_si128((__m128i*) ref16_l0);
     487             : 
     488           0 :             inPixel1 = _mm_srli_epi16(inPixel0, 2);
     489           0 :             out8_0_U8_L0 = _mm_packus_epi16(inPixel1, inPixel1);
     490             : 
     491             :             //List1
     492             : 
     493           0 :             inPixel0 = _mm_loadu_si128((__m128i*) ref16_l1);
     494             : 
     495           0 :             inPixel1 = _mm_srli_epi16(inPixel0, 2);
     496           0 :             out8_0_U8_L1 = _mm_packus_epi16(inPixel1, inPixel1);
     497             : 
     498             :             //AVG
     499           0 :             avg8_0_U8 = _mm_avg_epu8(out8_0_U8_L0, out8_0_U8_L1);
     500             : 
     501           0 :             _mm_storel_epi64((__m128i*) dst_ptr, avg8_0_U8);
     502             : 
     503             :             //--------
     504             :             //Line Two
     505             :             //--------
     506             : 
     507             :             //List0
     508             : 
     509           0 :             inPixel0 = _mm_loadu_si128((__m128i*)(ref16_l0 + ref_l0_stride));
     510             : 
     511           0 :             inPixel1 = _mm_srli_epi16(inPixel0, 2);
     512           0 :             out8_2_U8_L0 = _mm_packus_epi16(inPixel1, inPixel1);
     513             : 
     514             :             //List1
     515             : 
     516           0 :             inPixel0 = _mm_loadu_si128((__m128i*)(ref16_l1 + ref_l1_stride));
     517             : 
     518           0 :             inPixel1 = _mm_srli_epi16(inPixel0, 2);
     519           0 :             out8_2_U8_L1 = _mm_packus_epi16(inPixel1, inPixel1);
     520             : 
     521             :             //AVG
     522           0 :             avg8_2_U8 = _mm_avg_epu8(out8_2_U8_L0, out8_2_U8_L1);
     523             : 
     524           0 :             _mm_storel_epi64((__m128i*)(dst_ptr + dst_stride), avg8_2_U8);
     525             : 
     526           0 :             dst_ptr += 2 * dst_stride;
     527           0 :             ref16_l0 += 2 * ref_l0_stride;
     528           0 :             ref16_l1 += 2 * ref_l1_stride;
     529             :         }
     530             :     }
     531           0 :     else if (width == 16)
     532             :     {
     533             :         __m128i inPixel4, inPixel5;
     534             :         __m128i out8_0_U8_L0, out8_0_U8_L1, out8_2_U8_L0, out8_2_U8_L1;
     535             :         __m128i avg8_0_U8, avg8_2_U8;
     536             : 
     537           0 :         for (y = 0; y < height; y += 2)
     538             :         {
     539             :             //--------
     540             :             //Line One
     541             :             //--------
     542             : 
     543             :             //List0
     544             : 
     545           0 :             inPixel0 = _mm_loadu_si128((__m128i*)  ref16_l0);
     546           0 :             inPixel1 = _mm_loadu_si128((__m128i*) (ref16_l0 + 8));
     547             : 
     548           0 :             out8_0_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel0, 2), _mm_srli_epi16(inPixel1, 2));
     549             : 
     550             :             //List1
     551             : 
     552           0 :             inPixel0 = _mm_loadu_si128((__m128i*) ref16_l1);
     553           0 :             inPixel1 = _mm_loadu_si128((__m128i*)(ref16_l1 + 8));
     554             : 
     555           0 :             out8_0_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel0, 2), _mm_srli_epi16(inPixel1, 2));
     556             : 
     557             :             //AVG
     558           0 :             avg8_0_U8 = _mm_avg_epu8(out8_0_U8_L0, out8_0_U8_L1);
     559             : 
     560             :             _mm_storeu_si128((__m128i*) dst_ptr, avg8_0_U8);
     561             : 
     562             :             //--------
     563             :             //Line Two
     564             :             //--------
     565             : 
     566             :             //List0
     567             : 
     568           0 :             inPixel4 = _mm_loadu_si128((__m128i*) (ref16_l0 + ref_l0_stride));
     569           0 :             inPixel5 = _mm_loadu_si128((__m128i*) (ref16_l0 + ref_l0_stride + 8));
     570             : 
     571           0 :             out8_2_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel4, 2), _mm_srli_epi16(inPixel5, 2));
     572             : 
     573             :             //List1
     574             : 
     575           0 :             inPixel4 = _mm_loadu_si128((__m128i*) (ref16_l1 + ref_l1_stride));
     576           0 :             inPixel5 = _mm_loadu_si128((__m128i*) (ref16_l1 + ref_l1_stride + 8));
     577             : 
     578           0 :             out8_2_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel4, 2), _mm_srli_epi16(inPixel5, 2));
     579             : 
     580             :             //AVG
     581           0 :             avg8_2_U8 = _mm_avg_epu8(out8_2_U8_L0, out8_2_U8_L1);
     582             : 
     583           0 :             _mm_storeu_si128((__m128i*)(dst_ptr + dst_stride), avg8_2_U8);
     584             : 
     585           0 :             dst_ptr += 2 * dst_stride;
     586           0 :             ref16_l0 += 2 * ref_l0_stride;
     587           0 :             ref16_l1 += 2 * ref_l1_stride;
     588             :         }
     589             :     }
     590           0 :     else if (width == 32)
     591             :     {
     592             :         __m128i inPixel2, inPixel3, inPixel4, inPixel5, inPixel6, inPixel7;
     593             :         __m128i out8_0_U8_L0, out8_1_U8_L0, out8_2_U8_L0, out8_3_U8_L0;
     594             :         __m128i out8_0_U8_L1, out8_1_U8_L1, out8_2_U8_L1, out8_3_U8_L1;
     595             :         __m128i avg8_0_U8, avg8_1_U8, avg8_2_U8, avg8_3_U8;
     596             : 
     597           0 :         for (y = 0; y < height; y += 2)
     598             :         {
     599             :             //--------
     600             :             //Line One
     601             :             //--------
     602             : 
     603             :             //List0
     604             : 
     605           0 :             inPixel0 = _mm_loadu_si128((__m128i*)  ref16_l0);
     606           0 :             inPixel1 = _mm_loadu_si128((__m128i*) (ref16_l0 + 8));
     607           0 :             inPixel2 = _mm_loadu_si128((__m128i*) (ref16_l0 + 16));
     608           0 :             inPixel3 = _mm_loadu_si128((__m128i*) (ref16_l0 + 24));
     609             : 
     610           0 :             out8_0_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel0, 2), _mm_srli_epi16(inPixel1, 2));
     611           0 :             out8_1_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel2, 2), _mm_srli_epi16(inPixel3, 2));
     612             : 
     613             :             //List1
     614             : 
     615           0 :             inPixel0 = _mm_loadu_si128((__m128i*) ref16_l1);
     616           0 :             inPixel1 = _mm_loadu_si128((__m128i*)(ref16_l1 + 8));
     617           0 :             inPixel2 = _mm_loadu_si128((__m128i*)(ref16_l1 + 16));
     618           0 :             inPixel3 = _mm_loadu_si128((__m128i*)(ref16_l1 + 24));
     619             : 
     620           0 :             out8_0_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel0, 2), _mm_srli_epi16(inPixel1, 2));
     621           0 :             out8_1_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel2, 2), _mm_srli_epi16(inPixel3, 2));
     622             : 
     623             :             //AVG
     624           0 :             avg8_0_U8 = _mm_avg_epu8(out8_0_U8_L0, out8_0_U8_L1);
     625           0 :             avg8_1_U8 = _mm_avg_epu8(out8_1_U8_L0, out8_1_U8_L1);
     626             : 
     627             :             _mm_storeu_si128((__m128i*) dst_ptr, avg8_0_U8);
     628           0 :             _mm_storeu_si128((__m128i*)(dst_ptr + 16), avg8_1_U8);
     629             : 
     630             :             //--------
     631             :             //Line Two
     632             :             //--------
     633             : 
     634             :             //List0
     635             : 
     636           0 :             inPixel4 = _mm_loadu_si128((__m128i*) (ref16_l0 + ref_l0_stride));
     637           0 :             inPixel5 = _mm_loadu_si128((__m128i*) (ref16_l0 + ref_l0_stride + 8));
     638           0 :             inPixel6 = _mm_loadu_si128((__m128i*) (ref16_l0 + ref_l0_stride + 16));
     639           0 :             inPixel7 = _mm_loadu_si128((__m128i*) (ref16_l0 + ref_l0_stride + 24));
     640             : 
     641           0 :             out8_2_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel4, 2), _mm_srli_epi16(inPixel5, 2));
     642           0 :             out8_3_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel6, 2), _mm_srli_epi16(inPixel7, 2));
     643             : 
     644             :             //List1
     645             : 
     646           0 :             inPixel4 = _mm_loadu_si128((__m128i*) (ref16_l1 + ref_l1_stride));
     647           0 :             inPixel5 = _mm_loadu_si128((__m128i*) (ref16_l1 + ref_l1_stride + 8));
     648           0 :             inPixel6 = _mm_loadu_si128((__m128i*) (ref16_l1 + ref_l1_stride + 16));
     649           0 :             inPixel7 = _mm_loadu_si128((__m128i*) (ref16_l1 + ref_l1_stride + 24));
     650             : 
     651           0 :             out8_2_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel4, 2), _mm_srli_epi16(inPixel5, 2));
     652           0 :             out8_3_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel6, 2), _mm_srli_epi16(inPixel7, 2));
     653             : 
     654             :             //AVG
     655           0 :             avg8_2_U8 = _mm_avg_epu8(out8_2_U8_L0, out8_2_U8_L1);
     656           0 :             avg8_3_U8 = _mm_avg_epu8(out8_3_U8_L0, out8_3_U8_L1);
     657             : 
     658           0 :             _mm_storeu_si128((__m128i*)(dst_ptr + dst_stride), avg8_2_U8);
     659           0 :             _mm_storeu_si128((__m128i*)(dst_ptr + dst_stride + 16), avg8_3_U8);
     660             : 
     661           0 :             dst_ptr += 2 * dst_stride;
     662           0 :             ref16_l0 += 2 * ref_l0_stride;
     663           0 :             ref16_l1 += 2 * ref_l1_stride;
     664             :         }
     665             :     }
     666           0 :     else if (width == 64)
     667             :     {
     668             :         __m128i inPixel2, inPixel3, inPixel4, inPixel5, inPixel6, inPixel7;
     669             :         __m128i out8_0_U8_L0, out8_1_U8_L0, out8_2_U8_L0, out8_3_U8_L0;
     670             :         __m128i out8_0_U8_L1, out8_1_U8_L1, out8_2_U8_L1, out8_3_U8_L1;
     671             :         __m128i avg8_0_U8, avg8_1_U8, avg8_2_U8, avg8_3_U8;
     672             : 
     673           0 :         for (y = 0; y < height; ++y)
     674             :         {
     675             :             //List0
     676             : 
     677           0 :             inPixel0 = _mm_loadu_si128((__m128i*) ref16_l0);
     678           0 :             inPixel1 = _mm_loadu_si128((__m128i*)(ref16_l0 + 8));
     679           0 :             inPixel2 = _mm_loadu_si128((__m128i*)(ref16_l0 + 16));
     680           0 :             inPixel3 = _mm_loadu_si128((__m128i*)(ref16_l0 + 24));
     681           0 :             inPixel4 = _mm_loadu_si128((__m128i*)(ref16_l0 + 32));
     682           0 :             inPixel5 = _mm_loadu_si128((__m128i*)(ref16_l0 + 40));
     683           0 :             inPixel6 = _mm_loadu_si128((__m128i*)(ref16_l0 + 48));
     684           0 :             inPixel7 = _mm_loadu_si128((__m128i*)(ref16_l0 + 56));
     685             : 
     686           0 :             out8_0_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel0, 2), _mm_srli_epi16(inPixel1, 2));
     687           0 :             out8_1_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel2, 2), _mm_srli_epi16(inPixel3, 2));
     688           0 :             out8_2_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel4, 2), _mm_srli_epi16(inPixel5, 2));
     689           0 :             out8_3_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel6, 2), _mm_srli_epi16(inPixel7, 2));
     690             : 
     691             :             //List1
     692             : 
     693           0 :             inPixel0 = _mm_loadu_si128((__m128i*) ref16_l1);
     694           0 :             inPixel1 = _mm_loadu_si128((__m128i*)(ref16_l1 + 8));
     695           0 :             inPixel2 = _mm_loadu_si128((__m128i*)(ref16_l1 + 16));
     696           0 :             inPixel3 = _mm_loadu_si128((__m128i*)(ref16_l1 + 24));
     697           0 :             inPixel4 = _mm_loadu_si128((__m128i*)(ref16_l1 + 32));
     698           0 :             inPixel5 = _mm_loadu_si128((__m128i*)(ref16_l1 + 40));
     699           0 :             inPixel6 = _mm_loadu_si128((__m128i*)(ref16_l1 + 48));
     700           0 :             inPixel7 = _mm_loadu_si128((__m128i*)(ref16_l1 + 56));
     701             : 
     702             :             //Note: old Version used to use _mm_and_si128 to mask the MSB bits of the pixels
     703           0 :             out8_0_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel0, 2), _mm_srli_epi16(inPixel1, 2));
     704           0 :             out8_1_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel2, 2), _mm_srli_epi16(inPixel3, 2));
     705           0 :             out8_2_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel4, 2), _mm_srli_epi16(inPixel5, 2));
     706           0 :             out8_3_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel6, 2), _mm_srli_epi16(inPixel7, 2));
     707             : 
     708             :             //AVG
     709           0 :             avg8_0_U8 = _mm_avg_epu8(out8_0_U8_L0, out8_0_U8_L1);
     710           0 :             avg8_1_U8 = _mm_avg_epu8(out8_1_U8_L0, out8_1_U8_L1);
     711           0 :             avg8_2_U8 = _mm_avg_epu8(out8_2_U8_L0, out8_2_U8_L1);
     712           0 :             avg8_3_U8 = _mm_avg_epu8(out8_3_U8_L0, out8_3_U8_L1);
     713             : 
     714             :             _mm_storeu_si128((__m128i*) dst_ptr, avg8_0_U8);
     715           0 :             _mm_storeu_si128((__m128i*)(dst_ptr + 16), avg8_1_U8);
     716           0 :             _mm_storeu_si128((__m128i*)(dst_ptr + 32), avg8_2_U8);
     717           0 :             _mm_storeu_si128((__m128i*)(dst_ptr + 48), avg8_3_U8);
     718             : 
     719           0 :             dst_ptr += dst_stride;
     720           0 :             ref16_l0 += ref_l0_stride;
     721           0 :             ref16_l1 += ref_l1_stride;
     722             :         }
     723             :     }
     724             : 
     725           0 :     return;
     726             : }
     727             : /********************************************************************************************************************
     728             : eb_enc_msb_pack2d_sse2_intrin
     729             : *********************************************************************************************************************/
     730           0 : void eb_enc_msb_pack2d_sse2_intrin(
     731             :     uint8_t     *in8_bit_buffer,
     732             :     uint32_t     in8_stride,
     733             :     uint8_t     *inn_bit_buffer,
     734             :     uint16_t    *out16_bit_buffer,
     735             :     uint32_t     inn_stride,
     736             :     uint32_t     out_stride,
     737             :     uint32_t     width,
     738             :     uint32_t     height)
     739             : {
     740             :     uint32_t count_width, count_height;
     741             : 
     742           0 :     if (width == 4) {
     743           0 :         for (count_height = 0; count_height < height; count_height += 2) {
     744           0 :             _mm_storel_epi64((__m128i*)(out16_bit_buffer), _mm_srli_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(uint32_t*)(inn_bit_buffer)),
     745           0 :                 _mm_cvtsi32_si128(*(uint32_t*)(in8_bit_buffer))), 6));
     746           0 :             _mm_storel_epi64((__m128i*)(out16_bit_buffer + out_stride), _mm_srli_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(uint32_t*)(inn_bit_buffer + inn_stride)),
     747           0 :                 _mm_cvtsi32_si128(*(uint32_t*)(in8_bit_buffer + in8_stride))), 6));
     748           0 :             out16_bit_buffer += (out_stride << 1);
     749           0 :             in8_bit_buffer += (in8_stride << 1);
     750           0 :             inn_bit_buffer += (inn_stride << 1);
     751             :         }
     752             :     }
     753           0 :     else if (width == 8) {
     754           0 :         for (count_height = 0; count_height < height; count_height += 2) {
     755           0 :             _mm_storeu_si128((__m128i*)(out16_bit_buffer), _mm_srli_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(inn_bit_buffer)),
     756             :                 _mm_loadl_epi64((__m128i*)(in8_bit_buffer))), 6));
     757           0 :             _mm_storeu_si128((__m128i*)(out16_bit_buffer + out_stride), _mm_srli_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(inn_bit_buffer + inn_stride)),
     758           0 :                 _mm_loadl_epi64((__m128i*)(in8_bit_buffer + in8_stride))), 6));
     759           0 :             out16_bit_buffer += (out_stride << 1);
     760           0 :             in8_bit_buffer += (in8_stride << 1);
     761           0 :             inn_bit_buffer += (inn_stride << 1);
     762             :         }
     763             :     }
     764           0 :     else if (width == 16) {
     765             :         __m128i outPixel1, outPixel2, outPixel3, outPixel4, innBitBuffer_lo, innBitBuffer_hi, in8BitBuffer_lo, in8BitBuffer_hi;
     766             : 
     767           0 :         for (count_height = 0; count_height < height; count_height += 2) {
     768           0 :             innBitBuffer_lo = _mm_loadu_si128((__m128i *)inn_bit_buffer);
     769           0 :             innBitBuffer_hi = _mm_loadu_si128((__m128i *)(inn_bit_buffer + inn_stride));
     770           0 :             in8BitBuffer_lo = _mm_loadu_si128((__m128i *)in8_bit_buffer);
     771           0 :             in8BitBuffer_hi = _mm_loadu_si128((__m128i *)(in8_bit_buffer + in8_stride));
     772             : 
     773           0 :             outPixel1 = _mm_srli_epi16(_mm_unpacklo_epi8(innBitBuffer_lo, in8BitBuffer_lo), 6);
     774           0 :             outPixel2 = _mm_srli_epi16(_mm_unpackhi_epi8(innBitBuffer_lo, in8BitBuffer_lo), 6);
     775           0 :             outPixel3 = _mm_srli_epi16(_mm_unpacklo_epi8(innBitBuffer_hi, in8BitBuffer_hi), 6);
     776           0 :             outPixel4 = _mm_srli_epi16(_mm_unpackhi_epi8(innBitBuffer_hi, in8BitBuffer_hi), 6);
     777             : 
     778             :             _mm_storeu_si128((__m128i*)out16_bit_buffer, outPixel1);
     779           0 :             _mm_storeu_si128((__m128i*)(out16_bit_buffer + 8), outPixel2);
     780           0 :             _mm_storeu_si128((__m128i*)(out16_bit_buffer + out_stride), outPixel3);
     781           0 :             _mm_storeu_si128((__m128i*)(out16_bit_buffer + out_stride + 8), outPixel4);
     782             : 
     783           0 :             in8_bit_buffer += (in8_stride << 1);
     784           0 :             inn_bit_buffer += (inn_stride << 1);
     785           0 :             out16_bit_buffer += (out_stride << 1);
     786             :         }
     787             :     }
     788           0 :     else if (width == 32) {
     789             :         __m128i innBitBuffer1, innBitBuffer2, innBitBuffer3, innBitBuffer4, in8BitBuffer1, in8BitBuffer2, in8BitBuffer3, in8BitBuffer4;
     790             :         __m128i outPixel1, outPixel2, outPixel3, outPixel4, outPixel5, outPixel6, outPixel7, outPixel8;
     791             : 
     792           0 :         for (count_height = 0; count_height < height; count_height += 2)
     793             :         {
     794           0 :             innBitBuffer1 = _mm_loadu_si128((__m128i *)inn_bit_buffer);
     795           0 :             innBitBuffer2 = _mm_loadu_si128((__m128i *)(inn_bit_buffer + 16));
     796           0 :             innBitBuffer3 = _mm_loadu_si128((__m128i *)(inn_bit_buffer + inn_stride));
     797           0 :             innBitBuffer4 = _mm_loadu_si128((__m128i *)(inn_bit_buffer + inn_stride + 16));
     798             : 
     799           0 :             in8BitBuffer1 = _mm_loadu_si128((__m128i *)in8_bit_buffer);
     800           0 :             in8BitBuffer2 = _mm_loadu_si128((__m128i *)(in8_bit_buffer + 16));
     801           0 :             in8BitBuffer3 = _mm_loadu_si128((__m128i *)(in8_bit_buffer + in8_stride));
     802           0 :             in8BitBuffer4 = _mm_loadu_si128((__m128i *)(in8_bit_buffer + in8_stride + 16));
     803             : 
     804           0 :             outPixel1 = _mm_srli_epi16(_mm_unpacklo_epi8(innBitBuffer1, in8BitBuffer1), 6);
     805           0 :             outPixel2 = _mm_srli_epi16(_mm_unpackhi_epi8(innBitBuffer1, in8BitBuffer1), 6);
     806           0 :             outPixel3 = _mm_srli_epi16(_mm_unpacklo_epi8(innBitBuffer2, in8BitBuffer2), 6);
     807           0 :             outPixel4 = _mm_srli_epi16(_mm_unpackhi_epi8(innBitBuffer2, in8BitBuffer2), 6);
     808           0 :             outPixel5 = _mm_srli_epi16(_mm_unpacklo_epi8(innBitBuffer3, in8BitBuffer3), 6);
     809           0 :             outPixel6 = _mm_srli_epi16(_mm_unpackhi_epi8(innBitBuffer3, in8BitBuffer3), 6);
     810           0 :             outPixel7 = _mm_srli_epi16(_mm_unpacklo_epi8(innBitBuffer4, in8BitBuffer4), 6);
     811           0 :             outPixel8 = _mm_srli_epi16(_mm_unpackhi_epi8(innBitBuffer4, in8BitBuffer4), 6);
     812             : 
     813             :             _mm_storeu_si128((__m128i*)out16_bit_buffer, outPixel1);
     814           0 :             _mm_storeu_si128((__m128i*)(out16_bit_buffer + 8), outPixel2);
     815           0 :             _mm_storeu_si128((__m128i*)(out16_bit_buffer + 16), outPixel3);
     816           0 :             _mm_storeu_si128((__m128i*)(out16_bit_buffer + 24), outPixel4);
     817           0 :             _mm_storeu_si128((__m128i*)(out16_bit_buffer + out_stride), outPixel5);
     818           0 :             _mm_storeu_si128((__m128i*)(out16_bit_buffer + out_stride + 8), outPixel6);
     819           0 :             _mm_storeu_si128((__m128i*)(out16_bit_buffer + out_stride + 16), outPixel7);
     820           0 :             _mm_storeu_si128((__m128i*)(out16_bit_buffer + out_stride + 24), outPixel8);
     821             : 
     822           0 :             in8_bit_buffer += (in8_stride << 1);
     823           0 :             inn_bit_buffer += (inn_stride << 1);
     824           0 :             out16_bit_buffer += (out_stride << 1);
     825             :         }
     826             :     }
     827           0 :     else if (width == 64) {
     828             :         __m128i innBitBuffer1, innBitBuffer2, innBitBuffer3, innBitBuffer4, in8BitBuffer1, in8BitBuffer2, in8BitBuffer3, in8BitBuffer4;
     829             :         __m128i outPixel1, outPixel2, outPixel3, outPixel4, outPixel5, outPixel6, outPixel7, outPixel8;
     830             : 
     831           0 :         for (count_height = 0; count_height < height; ++count_height)
     832             :         {
     833           0 :             innBitBuffer1 = _mm_loadu_si128((__m128i *)inn_bit_buffer);
     834           0 :             innBitBuffer2 = _mm_loadu_si128((__m128i *)(inn_bit_buffer + 16));
     835           0 :             innBitBuffer3 = _mm_loadu_si128((__m128i *)(inn_bit_buffer + 32));
     836           0 :             innBitBuffer4 = _mm_loadu_si128((__m128i *)(inn_bit_buffer + 48));
     837             : 
     838           0 :             in8BitBuffer1 = _mm_loadu_si128((__m128i *)in8_bit_buffer);
     839           0 :             in8BitBuffer2 = _mm_loadu_si128((__m128i *)(in8_bit_buffer + 16));
     840           0 :             in8BitBuffer3 = _mm_loadu_si128((__m128i *)(in8_bit_buffer + 32));
     841           0 :             in8BitBuffer4 = _mm_loadu_si128((__m128i *)(in8_bit_buffer + 48));
     842             : 
     843           0 :             outPixel1 = _mm_srli_epi16(_mm_unpacklo_epi8(innBitBuffer1, in8BitBuffer1), 6);
     844           0 :             outPixel2 = _mm_srli_epi16(_mm_unpackhi_epi8(innBitBuffer1, in8BitBuffer1), 6);
     845           0 :             outPixel3 = _mm_srli_epi16(_mm_unpacklo_epi8(innBitBuffer2, in8BitBuffer2), 6);
     846           0 :             outPixel4 = _mm_srli_epi16(_mm_unpackhi_epi8(innBitBuffer2, in8BitBuffer2), 6);
     847           0 :             outPixel5 = _mm_srli_epi16(_mm_unpacklo_epi8(innBitBuffer3, in8BitBuffer3), 6);
     848           0 :             outPixel6 = _mm_srli_epi16(_mm_unpackhi_epi8(innBitBuffer3, in8BitBuffer3), 6);
     849           0 :             outPixel7 = _mm_srli_epi16(_mm_unpacklo_epi8(innBitBuffer4, in8BitBuffer4), 6);
     850           0 :             outPixel8 = _mm_srli_epi16(_mm_unpackhi_epi8(innBitBuffer4, in8BitBuffer4), 6);
     851             : 
     852             :             _mm_storeu_si128((__m128i*)out16_bit_buffer, outPixel1);
     853           0 :             _mm_storeu_si128((__m128i*)(out16_bit_buffer + 8), outPixel2);
     854           0 :             _mm_storeu_si128((__m128i*)(out16_bit_buffer + 16), outPixel3);
     855           0 :             _mm_storeu_si128((__m128i*)(out16_bit_buffer + 24), outPixel4);
     856           0 :             _mm_storeu_si128((__m128i*)(out16_bit_buffer + 32), outPixel5);
     857           0 :             _mm_storeu_si128((__m128i*)(out16_bit_buffer + 40), outPixel6);
     858           0 :             _mm_storeu_si128((__m128i*)(out16_bit_buffer + 48), outPixel7);
     859           0 :             _mm_storeu_si128((__m128i*)(out16_bit_buffer + 56), outPixel8);
     860             : 
     861           0 :             in8_bit_buffer += in8_stride;
     862           0 :             inn_bit_buffer += inn_stride;
     863           0 :             out16_bit_buffer += out_stride;
     864             :         }
     865             :     }
     866             :     else {
     867           0 :         uint32_t innStrideDiff = (inn_stride << 1) - width;
     868           0 :         uint32_t in8StrideDiff = (in8_stride << 1) - width;
     869           0 :         uint32_t outStrideDiff = (out_stride << 1) - width;
     870             : 
     871           0 :         if (!(width & 7)) {
     872           0 :             for (count_height = 0; count_height < height; count_height += 2) {
     873           0 :                 for (count_width = 0; count_width < width; count_width += 8) {
     874           0 :                     _mm_storeu_si128((__m128i*)(out16_bit_buffer), _mm_srli_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(inn_bit_buffer)),
     875             :                         _mm_loadl_epi64((__m128i*)(in8_bit_buffer))), 6));
     876           0 :                     _mm_storeu_si128((__m128i*)(out16_bit_buffer + out_stride), _mm_srli_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(inn_bit_buffer + inn_stride)),
     877           0 :                         _mm_loadl_epi64((__m128i*)(in8_bit_buffer + in8_stride))), 6));
     878           0 :                     out16_bit_buffer += 8;
     879           0 :                     in8_bit_buffer += 8;
     880           0 :                     inn_bit_buffer += 8;
     881             :                 }
     882           0 :                 in8_bit_buffer += in8StrideDiff;
     883           0 :                 inn_bit_buffer += innStrideDiff;
     884           0 :                 out16_bit_buffer += outStrideDiff;
     885             :             }
     886             :         }
     887             :         else {
     888           0 :             for (count_height = 0; count_height < height; count_height += 2) {
     889           0 :                 for (count_width = 0; count_width < width; count_width += 4) {
     890           0 :                     _mm_storel_epi64((__m128i*)(out16_bit_buffer), _mm_srli_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(uint32_t*)(inn_bit_buffer)),
     891           0 :                         _mm_cvtsi32_si128(*(uint32_t*)(in8_bit_buffer))), 6));
     892           0 :                     _mm_storel_epi64((__m128i*)(out16_bit_buffer + out_stride), _mm_srli_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(uint32_t*)(inn_bit_buffer + inn_stride)),
     893           0 :                         _mm_cvtsi32_si128(*(uint32_t*)(in8_bit_buffer + in8_stride))), 6));
     894           0 :                     out16_bit_buffer += 4;
     895           0 :                     in8_bit_buffer += 4;
     896           0 :                     inn_bit_buffer += 4;
     897             :                 }
     898           0 :                 in8_bit_buffer += in8StrideDiff;
     899           0 :                 inn_bit_buffer += innStrideDiff;
     900           0 :                 out16_bit_buffer += outStrideDiff;
     901             :             }
     902             :         }
     903             :     }
     904           0 : }

Generated by: LCOV version 1.14