LCOV - code coverage report
Current view: top level - ASM_AVX2 - EbPictureOperators_Intrinsic_AVX2.c (source / functions)
Test: coverage.info          Lines:     164 of 1244 hit (13.2 %)
Date: 2019-11-25 17:38:06    Functions:   5 of   24 hit (20.8 %)

          Line data    Source code
       1             : /*
       2             : * Copyright(c) 2019 Intel Corporation
       3             : * SPDX - License - Identifier: BSD - 2 - Clause - Patent
       4             : */
       5             : 
       6             : #include "EbDefinitions.h"
       7             : #include <stdio.h>
       8             : #include <immintrin.h>
       9             : #include "EbPictureOperators_AVX2.h"
      10             : #include "EbPictureOperators_Inline_AVX2.h"
      11             : #include "EbPictureOperators_SSE2.h"
      12             : #include "EbMemory_AVX2.h"
      13             : #include "synonyms.h"
      14             : 
      15             : #define _mm256_set_m128i(/* __m128i */ hi, /* __m128i */ lo) \
      16             :     _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
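/*
 * Note (added for illustration): the macro above composes a 256-bit vector from
 * two 128-bit halves; the second argument (lo) lands in the low lane and the
 * first (hi) is inserted into the high lane. A minimal usage sketch with
 * hypothetical values:
 *
 *     __m128i lo = _mm_set1_epi8(1), hi = _mm_set1_epi8(2);
 *     __m256i v  = _mm256_set_m128i(hi, lo);  // bytes 0..15 == 1, bytes 16..31 == 2
 */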
      17             : 
      18           0 : void compressed_packmsb_avx2_intrin(
      19             :     uint8_t     *in8_bit_buffer,
      20             :     uint32_t     in8_stride,
      21             :     uint8_t     *inn_bit_buffer,
      22             :     uint16_t    *out16_bit_buffer,
      23             :     uint32_t     inn_stride,
      24             :     uint32_t     out_stride,
      25             :     uint32_t     width,
      26             :     uint32_t     height)
      27             : {
      28             :     uint32_t y;
      29             : 
      30           0 :     if (width == 32)
      31             :     {
      32             :         __m256i inNBit, in8Bit, inNBitStride, in8BitStride, concat0, concat1, concat2, concat3;
      33             :         __m256i out0_15, out16_31, out_s0_s15, out_s16_s31;
      34             : 
      35             :         __m128i in2Bit, ext0, ext1, ext2, ext3, ext01, ext23, ext01h, ext23h, ext0_15, ext16_31, ext32_47, ext48_63;
      36             :         __m128i msk0;
      37             : 
       38           0 :         msk0 = _mm_set1_epi8((int8_t)0xC0);//1100.0000 (keep the 2 MSBs of each byte)
      39             : 
      40             :         //processing 2 lines for chroma
      41           0 :         for (y = 0; y < height; y += 2)
      42             :         {
       43             :             //2 lines stored in 1D format - could be replaced by 2 _mm_loadl_epi64 loads
      44           0 :             in2Bit = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)inn_bit_buffer), _mm_loadl_epi64((__m128i*)(inn_bit_buffer + inn_stride)));
      45             : 
      46           0 :             ext0 = _mm_and_si128(in2Bit, msk0);
      47           0 :             ext1 = _mm_and_si128(_mm_slli_epi16(in2Bit, 2), msk0);
      48           0 :             ext2 = _mm_and_si128(_mm_slli_epi16(in2Bit, 4), msk0);
      49           0 :             ext3 = _mm_and_si128(_mm_slli_epi16(in2Bit, 6), msk0);
      50             : 
      51           0 :             ext01 = _mm_unpacklo_epi8(ext0, ext1);
      52           0 :             ext23 = _mm_unpacklo_epi8(ext2, ext3);
      53           0 :             ext0_15 = _mm_unpacklo_epi16(ext01, ext23);
      54           0 :             ext16_31 = _mm_unpackhi_epi16(ext01, ext23);
      55             : 
      56           0 :             ext01h = _mm_unpackhi_epi8(ext0, ext1);
      57           0 :             ext23h = _mm_unpackhi_epi8(ext2, ext3);
      58           0 :             ext32_47 = _mm_unpacklo_epi16(ext01h, ext23h);
      59           0 :             ext48_63 = _mm_unpackhi_epi16(ext01h, ext23h);
      60             : 
      61           0 :             inNBit = _mm256_set_m128i(ext16_31, ext0_15);
      62           0 :             inNBitStride = _mm256_set_m128i(ext48_63, ext32_47);
      63             : 
      64           0 :             in8Bit = _mm256_loadu_si256((__m256i*)in8_bit_buffer);
      65           0 :             in8BitStride = _mm256_loadu_si256((__m256i*)(in8_bit_buffer + in8_stride));
      66             : 
      67             :             //(outPixel | nBitPixel) concatenation is done with unpacklo_epi8 and unpackhi_epi8
      68           0 :             concat0 = _mm256_srli_epi16(_mm256_unpacklo_epi8(inNBit, in8Bit), 6);
      69           0 :             concat1 = _mm256_srli_epi16(_mm256_unpackhi_epi8(inNBit, in8Bit), 6);
      70           0 :             concat2 = _mm256_srli_epi16(_mm256_unpacklo_epi8(inNBitStride, in8BitStride), 6);
      71           0 :             concat3 = _mm256_srli_epi16(_mm256_unpackhi_epi8(inNBitStride, in8BitStride), 6);
      72             : 
      73             :             //Re-organize the packing for writing to the out buffer
      74           0 :             out0_15 = _mm256_inserti128_si256(concat0, _mm256_castsi256_si128(concat1), 1);
      75           0 :             out16_31 = _mm256_inserti128_si256(concat1, _mm256_extracti128_si256(concat0, 1), 0);
      76           0 :             out_s0_s15 = _mm256_inserti128_si256(concat2, _mm256_castsi256_si128(concat3), 1);
      77           0 :             out_s16_s31 = _mm256_inserti128_si256(concat3, _mm256_extracti128_si256(concat2, 1), 0);
      78             : 
      79             :             _mm256_store_si256((__m256i*) out16_bit_buffer, out0_15);
      80           0 :             _mm256_store_si256((__m256i*) (out16_bit_buffer + 16), out16_31);
      81           0 :             _mm256_store_si256((__m256i*) (out16_bit_buffer + out_stride), out_s0_s15);
      82           0 :             _mm256_store_si256((__m256i*) (out16_bit_buffer + out_stride + 16), out_s16_s31);
      83             : 
      84           0 :             in8_bit_buffer += in8_stride << 1;
      85           0 :             inn_bit_buffer += inn_stride << 1;
      86           0 :             out16_bit_buffer += out_stride << 1;
      87             :         }
      88             :     }
      89           0 :     else if (width == 64)
      90             :     {
      91             :         __m256i inNBit, in8Bit, inNBit32, in8Bit32;
      92             :         __m256i concat0, concat1, concat2, concat3;
      93             :         __m256i out_0_15, out16_31, out32_47, out_48_63;
      94             : 
      95             :         __m128i in2Bit, ext0, ext1, ext2, ext3, ext01, ext23, ext01h, ext23h, ext0_15, ext16_31, ext32_47, ext48_63;
      96             :         __m128i msk;
      97             : 
       98           0 :         msk = _mm_set1_epi8((int8_t)0xC0);//1100.0000 (keep the 2 MSBs of each byte)
      99             : 
     100             :         //One row per iter
     101           0 :         for (y = 0; y < height; y++)
     102             :         {
     103           0 :             in2Bit = _mm_loadu_si128((__m128i*)inn_bit_buffer);
     104             : 
     105           0 :             ext0 = _mm_and_si128(in2Bit, msk);
     106           0 :             ext1 = _mm_and_si128(_mm_slli_epi16(in2Bit, 2), msk);
     107           0 :             ext2 = _mm_and_si128(_mm_slli_epi16(in2Bit, 4), msk);
     108           0 :             ext3 = _mm_and_si128(_mm_slli_epi16(in2Bit, 6), msk);
     109             : 
     110           0 :             ext01 = _mm_unpacklo_epi8(ext0, ext1);
     111           0 :             ext23 = _mm_unpacklo_epi8(ext2, ext3);
     112           0 :             ext0_15 = _mm_unpacklo_epi16(ext01, ext23);
     113           0 :             ext16_31 = _mm_unpackhi_epi16(ext01, ext23);
     114             : 
     115           0 :             ext01h = _mm_unpackhi_epi8(ext0, ext1);
     116           0 :             ext23h = _mm_unpackhi_epi8(ext2, ext3);
     117           0 :             ext32_47 = _mm_unpacklo_epi16(ext01h, ext23h);
     118           0 :             ext48_63 = _mm_unpackhi_epi16(ext01h, ext23h);
     119             : 
     120           0 :             inNBit = _mm256_set_m128i(ext16_31, ext0_15);
     121           0 :             inNBit32 = _mm256_set_m128i(ext48_63, ext32_47);
     122             : 
     123           0 :             in8Bit = _mm256_loadu_si256((__m256i*)in8_bit_buffer);
     124           0 :             in8Bit32 = _mm256_loadu_si256((__m256i*)(in8_bit_buffer + 32));
     125             : 
     126             :             //(outPixel | nBitPixel) concatenation
     127           0 :             concat0 = _mm256_srli_epi16(_mm256_unpacklo_epi8(inNBit, in8Bit), 6);
     128           0 :             concat1 = _mm256_srli_epi16(_mm256_unpackhi_epi8(inNBit, in8Bit), 6);
     129           0 :             concat2 = _mm256_srli_epi16(_mm256_unpacklo_epi8(inNBit32, in8Bit32), 6);
     130           0 :             concat3 = _mm256_srli_epi16(_mm256_unpackhi_epi8(inNBit32, in8Bit32), 6);
     131             : 
     132             :             //Re-organize the packing for writing to the out buffer
     133           0 :             out_0_15 = _mm256_inserti128_si256(concat0, _mm256_castsi256_si128(concat1), 1);
     134           0 :             out16_31 = _mm256_inserti128_si256(concat1, _mm256_extracti128_si256(concat0, 1), 0);
     135           0 :             out32_47 = _mm256_inserti128_si256(concat2, _mm256_castsi256_si128(concat3), 1);
     136           0 :             out_48_63 = _mm256_inserti128_si256(concat3, _mm256_extracti128_si256(concat2, 1), 0);
     137             : 
     138             :             _mm256_store_si256((__m256i*) out16_bit_buffer, out_0_15);
     139           0 :             _mm256_store_si256((__m256i*) (out16_bit_buffer + 16), out16_31);
     140           0 :             _mm256_store_si256((__m256i*) (out16_bit_buffer + 32), out32_47);
     141           0 :             _mm256_store_si256((__m256i*) (out16_bit_buffer + 48), out_48_63);
     142             : 
     143           0 :             in8_bit_buffer += in8_stride;
     144           0 :             inn_bit_buffer += inn_stride;
     145           0 :             out16_bit_buffer += out_stride;
     146             :         }
     147             :     }
     148           0 : }
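
/*
 * Reference sketch (added for illustration; not part of SVT-AV1). The semantics
 * below are inferred from the intrinsics above: the 8-bit plane supplies the
 * eight MSBs and the compressed 2-bit plane (four samples per byte, most
 * significant pair first) supplies the two LSBs of each 10-bit output sample.
 */
static void compressed_packmsb_scalar_sketch(
    const uint8_t *in8, uint32_t in8_stride,
    const uint8_t *in2, uint32_t in2_stride,
    uint16_t *out, uint32_t out_stride,
    uint32_t width, uint32_t height)
{
    for (uint32_t y = 0; y < height; y++) {
        for (uint32_t x = 0; x < width; x++) {
            const uint8_t two = (in2[x >> 2] >> (6 - 2 * (x & 3))) & 3; /* 4 samples per byte */
            out[x] = (uint16_t)((in8[x] << 2) | two);                   /* 10-bit result */
        }
        in8 += in8_stride;
        in2 += in2_stride;
        out += out_stride;
    }
}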
     149             : 
     150           0 : void c_pack_avx2_intrin(
     151             :     const uint8_t     *inn_bit_buffer,
     152             :     uint32_t     inn_stride,
     153             :     uint8_t     *in_compn_bit_buffer,
     154             :     uint32_t     out_stride,
     155             :     uint8_t    *local_cache,
     156             :     uint32_t     width,
     157             :     uint32_t     height)
     158             : {
     159             :     uint32_t y;
     160             : 
     161           0 :     if (width == 32)
     162             :     {
     163             :         __m256i inNBit;
     164             : 
     165             :         __m256i ext0, ext1, ext2, ext3, ext0123, ext0123n, extp;
     166             :         __m256i msk0, msk1, msk2, msk3;
     167             : 
     168           0 :         msk0 = _mm256_set1_epi32(0x000000C0);//1100.0000
     169           0 :         msk1 = _mm256_set1_epi32(0x00000030);//0011.0000
     170           0 :         msk2 = _mm256_set1_epi32(0x0000000C);//0000.1100
     171           0 :         msk3 = _mm256_set1_epi32(0x00000003);//0000.0011
     172             : 
     173             :         //One row per iter
     174           0 :         for (y = 0; y < height; y++)
     175             :         {
     176           0 :             inNBit = _mm256_loadu_si256((__m256i*)inn_bit_buffer);
     177             : 
     178           0 :             ext0 = _mm256_and_si256(inNBit, msk0);
     179           0 :             ext1 = _mm256_and_si256(_mm256_srli_epi32(inNBit, 1 * 8 + 2), msk1);
     180           0 :             ext2 = _mm256_and_si256(_mm256_srli_epi32(inNBit, 2 * 8 + 4), msk2);
     181           0 :             ext3 = _mm256_and_si256(_mm256_srli_epi32(inNBit, 3 * 8 + 6), msk3);
     182             : 
     183           0 :             ext0123 = _mm256_or_si256(_mm256_or_si256(ext0, ext1), _mm256_or_si256(ext2, ext3));
     184             : 
     185           0 :             ext0123n = _mm256_castsi128_si256(_mm256_extracti128_si256(ext0123, 1));
     186             : 
     187           0 :             extp = _mm256_packus_epi32(ext0123, ext0123n);
     188           0 :             extp = _mm256_packus_epi16(extp, extp);
     189             : 
     190           0 :             _mm_storel_epi64((__m128i*) in_compn_bit_buffer, _mm256_castsi256_si128(extp));
     191           0 :             in_compn_bit_buffer += out_stride;
     192           0 :             inn_bit_buffer += inn_stride;
     193             :         }
     194             :     }
     195           0 :     else if (width == 64)
     196             :     {
     197             :         __m256i inNBit;
     198             :         __m256i ext0, ext1, ext2, ext3, ext0123, ext0123n, extp, extp1;
     199             :         __m256i msk0, msk1, msk2, msk3;
     200             : 
     201           0 :         msk0 = _mm256_set1_epi32(0x000000C0);//1100.0000
     202           0 :         msk1 = _mm256_set1_epi32(0x00000030);//0011.0000
     203           0 :         msk2 = _mm256_set1_epi32(0x0000000C);//0000.1100
     204           0 :         msk3 = _mm256_set1_epi32(0x00000003);//0000.0011
     205           0 :         if (height == 64)
     206             :         {
     207           0 :             uint8_t* localPtr = local_cache;
     208             : 
     209           0 :             for (y = 0; y < height; y++)
     210             :             {
     211           0 :                 inNBit = _mm256_loadu_si256((__m256i*)inn_bit_buffer);
     212             : 
     213           0 :                 ext0 = _mm256_and_si256(inNBit, msk0);
     214           0 :                 ext1 = _mm256_and_si256(_mm256_srli_epi32(inNBit, 1 * 8 + 2), msk1);
     215           0 :                 ext2 = _mm256_and_si256(_mm256_srli_epi32(inNBit, 2 * 8 + 4), msk2);
     216           0 :                 ext3 = _mm256_and_si256(_mm256_srli_epi32(inNBit, 3 * 8 + 6), msk3);
     217             : 
     218           0 :                 ext0123 = _mm256_or_si256(_mm256_or_si256(ext0, ext1), _mm256_or_si256(ext2, ext3));
     219             : 
     220           0 :                 ext0123n = _mm256_castsi128_si256(_mm256_extracti128_si256(ext0123, 1));
     221             : 
     222           0 :                 extp = _mm256_packus_epi32(ext0123, ext0123n);
     223           0 :                 extp = _mm256_packus_epi16(extp, extp);
     224             : 
     225           0 :                 inNBit = _mm256_loadu_si256((__m256i*)(inn_bit_buffer + 32));
     226             : 
     227           0 :                 ext0 = _mm256_and_si256(inNBit, msk0);
     228           0 :                 ext1 = _mm256_and_si256(_mm256_srli_epi32(inNBit, 1 * 8 + 2), msk1);
     229           0 :                 ext2 = _mm256_and_si256(_mm256_srli_epi32(inNBit, 2 * 8 + 4), msk2);
     230           0 :                 ext3 = _mm256_and_si256(_mm256_srli_epi32(inNBit, 3 * 8 + 6), msk3);
     231             : 
     232           0 :                 ext0123 = _mm256_or_si256(_mm256_or_si256(ext0, ext1), _mm256_or_si256(ext2, ext3));
     233             : 
     234           0 :                 ext0123n = _mm256_castsi128_si256(_mm256_extracti128_si256(ext0123, 1));
     235             : 
     236           0 :                 extp1 = _mm256_packus_epi32(ext0123, ext0123n);
     237           0 :                 extp1 = _mm256_packus_epi16(extp1, extp1);
     238             : 
     239           0 :                 extp = _mm256_unpacklo_epi64(extp, extp1);
     240             : 
     241           0 :                 _mm_storeu_si128((__m128i*)  (localPtr + 16 * (y & 3)), _mm256_castsi256_si128(extp));
     242             : 
     243           0 :                 if ((y & 3) == 3)
     244             :                 {
     245           0 :                     __m256i c0 = _mm256_loadu_si256((__m256i*)(localPtr));
     246           0 :                     __m256i c1 = _mm256_loadu_si256((__m256i*)(localPtr + 32));
     247           0 :                     _mm_storeu_si128((__m128i*)  (in_compn_bit_buffer), _mm256_extractf128_si256(c0, 0));
     248           0 :                     _mm_storeu_si128((__m128i*)  (in_compn_bit_buffer + out_stride), _mm256_extractf128_si256(c0, 1));
     249           0 :                     _mm_storeu_si128((__m128i*)  (in_compn_bit_buffer + 2 * out_stride), _mm256_extractf128_si256(c1, 0));
     250           0 :                     _mm_storeu_si128((__m128i*)  (in_compn_bit_buffer + 3 * out_stride), _mm256_extractf128_si256(c1, 1));
     251           0 :                     in_compn_bit_buffer += 4 * out_stride;
     252             :                 }
     253             : 
     254           0 :                 inn_bit_buffer += inn_stride;
     255             :             }
     256             :         }
     257             :         else {
     258             :             //One row per iter
     259           0 :             for (y = 0; y < height; y++)
     260             :             {
     261           0 :                 inNBit = _mm256_loadu_si256((__m256i*)inn_bit_buffer);
     262             : 
     263           0 :                 ext0 = _mm256_and_si256(inNBit, msk0);
     264           0 :                 ext1 = _mm256_and_si256(_mm256_srli_epi32(inNBit, 1 * 8 + 2), msk1);
     265           0 :                 ext2 = _mm256_and_si256(_mm256_srli_epi32(inNBit, 2 * 8 + 4), msk2);
     266           0 :                 ext3 = _mm256_and_si256(_mm256_srli_epi32(inNBit, 3 * 8 + 6), msk3);
     267             : 
     268           0 :                 ext0123 = _mm256_or_si256(_mm256_or_si256(ext0, ext1), _mm256_or_si256(ext2, ext3));
     269             : 
     270           0 :                 ext0123n = _mm256_castsi128_si256(_mm256_extracti128_si256(ext0123, 1));
     271             : 
     272           0 :                 extp = _mm256_packus_epi32(ext0123, ext0123n);
     273           0 :                 extp = _mm256_packus_epi16(extp, extp);
     274             : 
     275           0 :                 inNBit = _mm256_loadu_si256((__m256i*)(inn_bit_buffer + 32));
     276             : 
     277           0 :                 ext0 = _mm256_and_si256(inNBit, msk0);
     278           0 :                 ext1 = _mm256_and_si256(_mm256_srli_epi32(inNBit, 1 * 8 + 2), msk1);
     279           0 :                 ext2 = _mm256_and_si256(_mm256_srli_epi32(inNBit, 2 * 8 + 4), msk2);
     280           0 :                 ext3 = _mm256_and_si256(_mm256_srli_epi32(inNBit, 3 * 8 + 6), msk3);
     281             : 
     282           0 :                 ext0123 = _mm256_or_si256(_mm256_or_si256(ext0, ext1), _mm256_or_si256(ext2, ext3));
     283             : 
     284           0 :                 ext0123n = _mm256_castsi128_si256(_mm256_extracti128_si256(ext0123, 1));
     285             : 
     286           0 :                 extp1 = _mm256_packus_epi32(ext0123, ext0123n);
     287           0 :                 extp1 = _mm256_packus_epi16(extp1, extp1);
     288             : 
     289           0 :                 extp = _mm256_unpacklo_epi64(extp, extp1);
     290             : 
     291           0 :                 _mm_storeu_si128((__m128i*)  in_compn_bit_buffer, _mm256_castsi256_si128(extp));
     292             : 
     293           0 :                 in_compn_bit_buffer += out_stride;
     294             : 
     295           0 :                 inn_bit_buffer += inn_stride;
     296             :             }
     297             :         }
     298             :     }
     299           0 : }
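
/*
 * Reference sketch (added for illustration; not part of SVT-AV1). Inferred from
 * the shifts and masks above: every output byte packs the two MSBs of four
 * consecutive input bytes, most significant pair first. In the AVX2 version the
 * local_cache buffer only stages four rows at a time for the 64x64 case before
 * they are flushed to in_compn_bit_buffer.
 */
static void c_pack_scalar_sketch(
    const uint8_t *inn_bit, uint32_t in_stride,
    uint8_t *out_compn_bit, uint32_t out_stride,
    uint32_t width, uint32_t height)
{
    for (uint32_t y = 0; y < height; y++) {
        for (uint32_t x = 0; x < width; x += 4) {
            out_compn_bit[x >> 2] = (uint8_t)(( inn_bit[x    ] & 0xC0)       |
                                              ((inn_bit[x + 1] & 0xC0) >> 2) |
                                              ((inn_bit[x + 2] & 0xC0) >> 4) |
                                              ((inn_bit[x + 3] & 0xC0) >> 6));
        }
        inn_bit += in_stride;
        out_compn_bit += out_stride;
    }
}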
     300             : 
     301           0 : void eb_enc_msb_pack2d_avx2_intrin_al(
     302             :     uint8_t     *in8_bit_buffer,
     303             :     uint32_t     in8_stride,
     304             :     uint8_t     *inn_bit_buffer,
     305             :     uint16_t    *out16_bit_buffer,
     306             :     uint32_t     inn_stride,
     307             :     uint32_t     out_stride,
     308             :     uint32_t     width,
     309             :     uint32_t     height)
     310             : {
     311             :     //(outPixel | nBitPixel) concatenation is done with unpacklo_epi8 and unpackhi_epi8
     312             : 
     313             :     uint32_t y, x;
     314             : 
     315             :     __m128i out0, out1;
     316             : 
     317           0 :     if (width == 4)
     318             :     {
     319           0 :         for (y = 0; y < height; y += 2) {
     320           0 :             out0 = _mm_srli_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(uint32_t *)inn_bit_buffer), _mm_cvtsi32_si128(*(uint32_t *)in8_bit_buffer)), 6);
     321           0 :             out1 = _mm_srli_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(uint32_t *)(inn_bit_buffer + inn_stride)), _mm_cvtsi32_si128(*(uint32_t *)(in8_bit_buffer + in8_stride))), 6);
     322             : 
     323           0 :             _mm_storel_epi64((__m128i*) out16_bit_buffer, out0);
     324           0 :             _mm_storel_epi64((__m128i*) (out16_bit_buffer + out_stride), out1);
     325             : 
     326           0 :             in8_bit_buffer += in8_stride << 1;
     327           0 :             inn_bit_buffer += inn_stride << 1;
     328           0 :             out16_bit_buffer += out_stride << 1;
     329             :         }
     330             :     }
     331           0 :     else if (width == 8)
     332             :     {
     333           0 :         for (y = 0; y < height; y += 2) {
     334           0 :             out0 = _mm_srli_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)inn_bit_buffer), _mm_loadl_epi64((__m128i*)in8_bit_buffer)), 6);
     335           0 :             out1 = _mm_srli_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(inn_bit_buffer + inn_stride)), _mm_loadl_epi64((__m128i*)(in8_bit_buffer + in8_stride))), 6);
     336             : 
     337             :             _mm_storeu_si128((__m128i*) out16_bit_buffer, out0);
     338           0 :             _mm_storeu_si128((__m128i*) (out16_bit_buffer + out_stride), out1);
     339             : 
     340           0 :             in8_bit_buffer += in8_stride << 1;
     341           0 :             inn_bit_buffer += inn_stride << 1;
     342           0 :             out16_bit_buffer += out_stride << 1;
     343             :         }
     344             :     }
     345           0 :     else if (width == 16)
     346             :     {
     347             :         __m128i inNBit, in8Bit, inNBitStride, in8BitStride, out0, out1, out2, out3;
     348             : 
     349           0 :         for (y = 0; y < height; y += 2) {
     350           0 :             inNBit = _mm_loadu_si128((__m128i*)inn_bit_buffer);
     351           0 :             in8Bit = _mm_loadu_si128((__m128i*)in8_bit_buffer);
     352           0 :             inNBitStride = _mm_loadu_si128((__m128i*)(inn_bit_buffer + inn_stride));
     353           0 :             in8BitStride = _mm_loadu_si128((__m128i*)(in8_bit_buffer + in8_stride));
     354             : 
     355           0 :             out0 = _mm_srli_epi16(_mm_unpacklo_epi8(inNBit, in8Bit), 6);
     356           0 :             out1 = _mm_srli_epi16(_mm_unpackhi_epi8(inNBit, in8Bit), 6);
     357           0 :             out2 = _mm_srli_epi16(_mm_unpacklo_epi8(inNBitStride, in8BitStride), 6);
     358           0 :             out3 = _mm_srli_epi16(_mm_unpackhi_epi8(inNBitStride, in8BitStride), 6);
     359             : 
     360             :             _mm_storeu_si128((__m128i*) out16_bit_buffer, out0);
     361           0 :             _mm_storeu_si128((__m128i*) (out16_bit_buffer + 8), out1);
     362           0 :             _mm_storeu_si128((__m128i*) (out16_bit_buffer + out_stride), out2);
     363           0 :             _mm_storeu_si128((__m128i*) (out16_bit_buffer + out_stride + 8), out3);
     364             : 
     365           0 :             in8_bit_buffer += in8_stride << 1;
     366           0 :             inn_bit_buffer += inn_stride << 1;
     367           0 :             out16_bit_buffer += out_stride << 1;
     368             :         }
     369             :     }
     370           0 :     else if (width == 32)
     371             :     {
     372             :         __m256i inNBit, in8Bit, inNBitStride, in8BitStride, concat0, concat1, concat2, concat3;
     373             :         __m256i out0_15, out16_31, out_s0_s15, out_s16_s31;
     374             : 
     375           0 :         for (y = 0; y < height; y += 2) {
     376           0 :             inNBit = _mm256_loadu_si256((__m256i*)inn_bit_buffer);
     377           0 :             in8Bit = _mm256_loadu_si256((__m256i*)in8_bit_buffer);
     378           0 :             inNBitStride = _mm256_loadu_si256((__m256i*)(inn_bit_buffer + inn_stride));
     379           0 :             in8BitStride = _mm256_loadu_si256((__m256i*)(in8_bit_buffer + in8_stride));
     380             : 
     381             :             //(outPixel | nBitPixel) concatenation is done with unpacklo_epi8 and unpackhi_epi8
     382           0 :             concat0 = _mm256_srli_epi16(_mm256_unpacklo_epi8(inNBit, in8Bit), 6);
     383           0 :             concat1 = _mm256_srli_epi16(_mm256_unpackhi_epi8(inNBit, in8Bit), 6);
     384           0 :             concat2 = _mm256_srli_epi16(_mm256_unpacklo_epi8(inNBitStride, in8BitStride), 6);
     385           0 :             concat3 = _mm256_srli_epi16(_mm256_unpackhi_epi8(inNBitStride, in8BitStride), 6);
     386             : 
     387             :             //Re-organize the packing for writing to the out buffer
     388           0 :             out0_15 = _mm256_inserti128_si256(concat0, _mm256_castsi256_si128(concat1), 1);
     389           0 :             out16_31 = _mm256_inserti128_si256(concat1, _mm256_extracti128_si256(concat0, 1), 0);
     390           0 :             out_s0_s15 = _mm256_inserti128_si256(concat2, _mm256_castsi256_si128(concat3), 1);
     391           0 :             out_s16_s31 = _mm256_inserti128_si256(concat3, _mm256_extracti128_si256(concat2, 1), 0);
     392             : 
     393             :             _mm256_store_si256((__m256i*) out16_bit_buffer, out0_15);
     394           0 :             _mm256_store_si256((__m256i*) (out16_bit_buffer + 16), out16_31);
     395           0 :             _mm256_store_si256((__m256i*) (out16_bit_buffer + out_stride), out_s0_s15);
     396           0 :             _mm256_store_si256((__m256i*) (out16_bit_buffer + out_stride + 16), out_s16_s31);
     397             : 
     398           0 :             in8_bit_buffer += in8_stride << 1;
     399             :             //inn_bit_buffer += inn_stride << 1;
     400           0 :             inn_bit_buffer += inn_stride * 2;
     401           0 :             out16_bit_buffer += out_stride << 1;
     402             :         }
     403             :     }
     404           0 :     else if (width == 64)
     405             :     {
     406             :         __m256i inNBit, in8Bit, inNBitStride, in8BitStride, inNBit32, in8Bit32, inNBitStride32, in8BitStride32;
     407             :         __m256i concat0, concat1, concat2, concat3, concat4, concat5, concat6, concat7;
     408             :         __m256i out_0_15, out16_31, out32_47, out_48_63, out_s0_s15, out_s16_s31, out_s32_s47, out_s48_s63;
     409             : 
     410           0 :         for (y = 0; y < height; y += 2) {
     411           0 :             inNBit = _mm256_loadu_si256((__m256i*)inn_bit_buffer);
     412           0 :             in8Bit = _mm256_loadu_si256((__m256i*)in8_bit_buffer);
     413           0 :             inNBit32 = _mm256_loadu_si256((__m256i*)(inn_bit_buffer + 32));
     414           0 :             in8Bit32 = _mm256_loadu_si256((__m256i*)(in8_bit_buffer + 32));
     415           0 :             inNBitStride = _mm256_loadu_si256((__m256i*)(inn_bit_buffer + inn_stride));
     416           0 :             in8BitStride = _mm256_loadu_si256((__m256i*)(in8_bit_buffer + in8_stride));
     417           0 :             inNBitStride32 = _mm256_loadu_si256((__m256i*)(inn_bit_buffer + inn_stride + 32));
     418           0 :             in8BitStride32 = _mm256_loadu_si256((__m256i*)(in8_bit_buffer + in8_stride + 32));
     419             :             //(outPixel | nBitPixel) concatenation is done with unpacklo_epi8 and unpackhi_epi8
     420           0 :             concat0 = _mm256_srli_epi16(_mm256_unpacklo_epi8(inNBit, in8Bit), 6);
     421           0 :             concat1 = _mm256_srli_epi16(_mm256_unpackhi_epi8(inNBit, in8Bit), 6);
     422           0 :             concat2 = _mm256_srli_epi16(_mm256_unpacklo_epi8(inNBit32, in8Bit32), 6);
     423           0 :             concat3 = _mm256_srli_epi16(_mm256_unpackhi_epi8(inNBit32, in8Bit32), 6);
     424           0 :             concat4 = _mm256_srli_epi16(_mm256_unpacklo_epi8(inNBitStride, in8BitStride), 6);
     425           0 :             concat5 = _mm256_srli_epi16(_mm256_unpackhi_epi8(inNBitStride, in8BitStride), 6);
     426           0 :             concat6 = _mm256_srli_epi16(_mm256_unpacklo_epi8(inNBitStride32, in8BitStride32), 6);
     427           0 :             concat7 = _mm256_srli_epi16(_mm256_unpackhi_epi8(inNBitStride32, in8BitStride32), 6);
     428             : 
     429             :             //Re-organize the packing for writing to the out buffer
     430           0 :             out_0_15 = _mm256_inserti128_si256(concat0, _mm256_castsi256_si128(concat1), 1);
     431           0 :             out16_31 = _mm256_inserti128_si256(concat1, _mm256_extracti128_si256(concat0, 1), 0);
     432           0 :             out32_47 = _mm256_inserti128_si256(concat2, _mm256_castsi256_si128(concat3), 1);
     433           0 :             out_48_63 = _mm256_inserti128_si256(concat3, _mm256_extracti128_si256(concat2, 1), 0);
     434           0 :             out_s0_s15 = _mm256_inserti128_si256(concat4, _mm256_castsi256_si128(concat5), 1);
     435           0 :             out_s16_s31 = _mm256_inserti128_si256(concat5, _mm256_extracti128_si256(concat4, 1), 0);
     436           0 :             out_s32_s47 = _mm256_inserti128_si256(concat6, _mm256_castsi256_si128(concat7), 1);
     437           0 :             out_s48_s63 = _mm256_inserti128_si256(concat7, _mm256_extracti128_si256(concat6, 1), 0);
     438             : 
     439             :             _mm256_store_si256((__m256i*) out16_bit_buffer, out_0_15);
     440           0 :             _mm256_store_si256((__m256i*) (out16_bit_buffer + 16), out16_31);
     441           0 :             _mm256_store_si256((__m256i*) (out16_bit_buffer + 32), out32_47);
     442           0 :             _mm256_store_si256((__m256i*) (out16_bit_buffer + 48), out_48_63);
     443             : 
     444           0 :             _mm256_store_si256((__m256i*) (out16_bit_buffer + out_stride), out_s0_s15);
     445           0 :             _mm256_store_si256((__m256i*) (out16_bit_buffer + out_stride + 16), out_s16_s31);
     446           0 :             _mm256_store_si256((__m256i*) (out16_bit_buffer + out_stride + 32), out_s32_s47);
     447           0 :             _mm256_store_si256((__m256i*) (out16_bit_buffer + out_stride + 48), out_s48_s63);
     448             : 
     449           0 :             in8_bit_buffer += in8_stride << 1;
     450             :             //inn_bit_buffer += inn_stride << 1;
     451           0 :             inn_bit_buffer += inn_stride * 2;
     452           0 :             out16_bit_buffer += out_stride << 1;
     453             :         }
     454             :     }
     455             :     else
     456             :     {
     457           0 :         uint32_t innStrideDiff = 2 * inn_stride;
     458           0 :         uint32_t in8StrideDiff = 2 * in8_stride;
     459           0 :         uint32_t outStrideDiff = 2 * out_stride;
     460           0 :         innStrideDiff -= width;
     461           0 :         in8StrideDiff -= width;
     462           0 :         outStrideDiff -= width;
     463             : 
     464           0 :         if (!(width & 7)) {
     465           0 :             for (x = 0; x < height; x += 2) {
     466           0 :                 for (y = 0; y < width; y += 8) {
     467           0 :                     out0 = _mm_srli_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)inn_bit_buffer), _mm_loadl_epi64((__m128i*)in8_bit_buffer)), 6);
     468           0 :                     out1 = _mm_srli_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(inn_bit_buffer + inn_stride)), _mm_loadl_epi64((__m128i*)(in8_bit_buffer + in8_stride))), 6);
     469             : 
     470             :                     _mm_storeu_si128((__m128i*) out16_bit_buffer, out0);
     471           0 :                     _mm_storeu_si128((__m128i*) (out16_bit_buffer + out_stride), out1);
     472             : 
     473           0 :                     in8_bit_buffer += 8;
     474           0 :                     inn_bit_buffer += 8;
     475           0 :                     out16_bit_buffer += 8;
     476             :                 }
     477           0 :                 in8_bit_buffer += in8StrideDiff;
     478           0 :                 inn_bit_buffer += innStrideDiff;
     479           0 :                 out16_bit_buffer += outStrideDiff;
     480             :             }
     481             :         }
     482             :         else {
     483           0 :             for (x = 0; x < height; x += 2) {
     484           0 :                 for (y = 0; y < width; y += 4) {
     485           0 :                     out0 = _mm_srli_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(uint32_t *)inn_bit_buffer), _mm_cvtsi32_si128(*(uint32_t *)in8_bit_buffer)), 6);
     486           0 :                     out1 = _mm_srli_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(uint32_t *)(inn_bit_buffer + inn_stride)), _mm_cvtsi32_si128(*(uint32_t *)(in8_bit_buffer + in8_stride))), 6);
     487             : 
     488           0 :                     _mm_storel_epi64((__m128i*) out16_bit_buffer, out0);
     489           0 :                     _mm_storel_epi64((__m128i*) (out16_bit_buffer + out_stride), out1);
     490             : 
     491           0 :                     in8_bit_buffer += 4;
     492           0 :                     inn_bit_buffer += 4;
     493           0 :                     out16_bit_buffer += 4;
     494             :                 }
     495           0 :                 in8_bit_buffer += in8StrideDiff;
     496           0 :                 inn_bit_buffer += innStrideDiff;
     497           0 :                 out16_bit_buffer += outStrideDiff;
     498             :             }
     499             :         }
     500             :     }
     501           0 : }
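
/*
 * Reference sketch (added for illustration; not part of SVT-AV1). Inferred from
 * the unpack/shift pattern above: here the n-bit plane is uncompressed, one byte
 * per sample with the two extra bits stored in the byte's MSBs, so each 10-bit
 * output is simply (in8 << 2) | (inn >> 6).
 */
static void msb_pack2d_scalar_sketch(
    const uint8_t *in8, uint32_t in8_stride,
    const uint8_t *inn, uint32_t inn_stride,
    uint16_t *out, uint32_t out_stride,
    uint32_t width, uint32_t height)
{
    for (uint32_t y = 0; y < height; y++) {
        for (uint32_t x = 0; x < width; x++)
            out[x] = (uint16_t)((in8[x] << 2) | (inn[x] >> 6)); /* 10-bit sample */
        in8 += in8_stride;
        inn += inn_stride;
        out += out_stride;
    }
}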
     502             : 
     503             : #define ALSTORE  1
     504             : #define B256     1
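
/*
 * Note (added for illustration): ALSTORE toggles aligned (_mm_store_si128)
 * versus unaligned (_mm_storeu_si128) stores, and B256 selects the 256-bit AVX2
 * path over the 128-bit fallback in the function below. In the 256-bit path the
 * _mm256_permute4x64_epi64(..., 216) calls reorder the 64-bit lanes as
 * [0, 2, 1, 3] to undo the per-lane interleave left by _mm256_packus_epi16.
 * A rough scalar sketch of what unpack_avg_avx2_intrin computes, inferred from
 * the intrinsics (drop the two LSBs of each 10-bit reference sample, then
 * average the two lists with rounding, matching _mm_avg_epu8):
 */
static void unpack_avg_scalar_sketch(
    const uint16_t *ref_l0, uint32_t l0_stride,
    const uint16_t *ref_l1, uint32_t l1_stride,
    uint8_t *dst, uint32_t dst_stride,
    uint32_t width, uint32_t height)
{
    for (uint32_t y = 0; y < height; y++) {
        for (uint32_t x = 0; x < width; x++) {
            const unsigned a = ref_l0[x] >> 2;
            const unsigned b = ref_l1[x] >> 2;
            dst[x] = (uint8_t)((a + b + 1) >> 1); /* rounded average */
        }
        ref_l0 += l0_stride;
        ref_l1 += l1_stride;
        dst += dst_stride;
    }
}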
     505             : 
     506           0 : void unpack_avg_avx2_intrin(
     507             :     uint16_t *ref16_l0,
     508             :     uint32_t  ref_l0_stride,
     509             :     uint16_t *ref16_l1,
     510             :     uint32_t  ref_l1_stride,
     511             :     uint8_t  *dst_ptr,
     512             :     uint32_t  dst_stride,
     513             :     uint32_t  width,
     514             :     uint32_t  height)
     515             : {
     516             :     uint32_t   y;
     517             :     __m128i inPixel0, inPixel1;
     518             : 
     519           0 :     if (width == 4)
     520             :     {
     521             :         __m128i out8_0_U8_L0, out8_0_U8_L1;
     522             :         __m128i avg8_0_U8;
     523             : 
     524           0 :         for (y = 0; y < height; y += 2)
     525             :         {
     526             :             //--------
     527             :             //Line One
     528             :             //--------
     529             : 
     530             :             //List0
     531           0 :             inPixel0 = _mm_loadl_epi64((__m128i*)ref16_l0);
     532           0 :             inPixel1 = _mm_srli_epi16(inPixel0, 2);
     533           0 :             out8_0_U8_L0 = _mm_packus_epi16(inPixel1, inPixel1);
     534             : 
     535             :             //List1
     536           0 :             inPixel0 = _mm_loadl_epi64((__m128i*)ref16_l1);
     537           0 :             inPixel1 = _mm_srli_epi16(inPixel0, 2);
     538           0 :             out8_0_U8_L1 = _mm_packus_epi16(inPixel1, inPixel1);
     539             : 
     540             :             //AVG
     541           0 :             avg8_0_U8 = _mm_avg_epu8(out8_0_U8_L0, out8_0_U8_L1);
     542             : 
     543           0 :             *(uint32_t*)dst_ptr = _mm_cvtsi128_si32(avg8_0_U8);
     544             : 
     545             :             //--------
     546             :             //Line Two
     547             :             //--------
     548             : 
     549             :             //List0
     550           0 :             inPixel0 = _mm_loadl_epi64((__m128i*)(ref16_l0 + ref_l0_stride));
     551           0 :             inPixel1 = _mm_srli_epi16(inPixel0, 2);
     552           0 :             out8_0_U8_L0 = _mm_packus_epi16(inPixel1, inPixel1);
     553             : 
     554             :             //List1
     555             : 
     556           0 :             inPixel0 = _mm_loadl_epi64((__m128i*)(ref16_l1 + ref_l1_stride));
     557           0 :             inPixel1 = _mm_srli_epi16(inPixel0, 2);
     558           0 :             out8_0_U8_L1 = _mm_packus_epi16(inPixel1, inPixel1);
     559             : 
     560             :             //AVG
     561           0 :             avg8_0_U8 = _mm_avg_epu8(out8_0_U8_L0, out8_0_U8_L1);
     562             : 
     563           0 :             *(uint32_t*)(dst_ptr + dst_stride) = _mm_cvtsi128_si32(avg8_0_U8);
     564             : 
     565           0 :             dst_ptr += 2 * dst_stride;
     566           0 :             ref16_l0 += 2 * ref_l0_stride;
     567           0 :             ref16_l1 += 2 * ref_l1_stride;
     568             :         }
     569             :     }
     570           0 :     else if (width == 8)
     571             :     {
     572             :         __m128i out8_0_U8_L0, out8_0_U8_L1, out8_2_U8_L0, out8_2_U8_L1;
     573             :         __m128i avg8_0_U8, avg8_2_U8;
     574             : 
     575           0 :         for (y = 0; y < height; y += 2)
     576             :         {
     577             :             //--------
     578             :             //Line One
     579             :             //--------
     580             : 
     581             :             //List0
     582             : 
     583           0 :             inPixel0 = _mm_loadu_si128((__m128i*) ref16_l0);
     584             : 
     585           0 :             inPixel1 = _mm_srli_epi16(inPixel0, 2);
     586           0 :             out8_0_U8_L0 = _mm_packus_epi16(inPixel1, inPixel1);
     587             : 
     588             :             //List1
     589             : 
     590           0 :             inPixel0 = _mm_loadu_si128((__m128i*) ref16_l1);
     591             : 
     592           0 :             inPixel1 = _mm_srli_epi16(inPixel0, 2);
     593           0 :             out8_0_U8_L1 = _mm_packus_epi16(inPixel1, inPixel1);
     594             : 
     595             :             //AVG
     596           0 :             avg8_0_U8 = _mm_avg_epu8(out8_0_U8_L0, out8_0_U8_L1);
     597             : 
     598           0 :             _mm_storel_epi64((__m128i*) dst_ptr, avg8_0_U8);
     599             : 
     600             :             //--------
     601             :             //Line Two
     602             :             //--------
     603             : 
     604             :             //List0
     605             : 
     606           0 :             inPixel0 = _mm_loadu_si128((__m128i*)(ref16_l0 + ref_l0_stride));
     607             : 
     608           0 :             inPixel1 = _mm_srli_epi16(inPixel0, 2);
     609           0 :             out8_2_U8_L0 = _mm_packus_epi16(inPixel1, inPixel1);
     610             : 
     611             :             //List1
     612             : 
     613           0 :             inPixel0 = _mm_loadu_si128((__m128i*)(ref16_l1 + ref_l1_stride));
     614             : 
     615           0 :             inPixel1 = _mm_srli_epi16(inPixel0, 2);
     616           0 :             out8_2_U8_L1 = _mm_packus_epi16(inPixel1, inPixel1);
     617             : 
     618             :             //AVG
     619           0 :             avg8_2_U8 = _mm_avg_epu8(out8_2_U8_L0, out8_2_U8_L1);
     620             : 
     621           0 :             _mm_storel_epi64((__m128i*)(dst_ptr + dst_stride), avg8_2_U8);
     622             : 
     623           0 :             dst_ptr += 2 * dst_stride;
     624           0 :             ref16_l0 += 2 * ref_l0_stride;
     625           0 :             ref16_l1 += 2 * ref_l1_stride;
     626             :         }
     627             :     }
     628           0 :     else if (width == 16)
     629             :     {
     630             :         __m128i inPixel4, inPixel5;
     631             :         __m128i out8_0_U8_L0, out8_0_U8_L1, out8_2_U8_L0, out8_2_U8_L1;
     632             :         __m128i avg8_0_U8, avg8_2_U8;
     633             : 
     634           0 :         for (y = 0; y < height; y += 2)
     635             :         {
     636             :             //--------
     637             :             //Line One
     638             :             //--------
     639             : 
     640             :             //List0
     641             : 
     642           0 :             inPixel0 = _mm_loadu_si128((__m128i*)  ref16_l0);
     643           0 :             inPixel1 = _mm_loadu_si128((__m128i*) (ref16_l0 + 8));
     644             : 
     645           0 :             out8_0_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel0, 2), _mm_srli_epi16(inPixel1, 2));
     646             : 
     647             :             //List1
     648             : 
     649           0 :             inPixel0 = _mm_loadu_si128((__m128i*) ref16_l1);
     650           0 :             inPixel1 = _mm_loadu_si128((__m128i*)(ref16_l1 + 8));
     651             : 
     652           0 :             out8_0_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel0, 2), _mm_srli_epi16(inPixel1, 2));
     653             : 
     654             :             //AVG
     655           0 :             avg8_0_U8 = _mm_avg_epu8(out8_0_U8_L0, out8_0_U8_L1);
     656             : #if ALSTORE
     657             :             _mm_store_si128((__m128i*) dst_ptr, avg8_0_U8);
     658             : #else
     659             :             _mm_storeu_si128((__m128i*) dst_ptr, avg8_0_U8);
     660             : #endif
     661             : 
     662             :             //--------
     663             :             //Line Two
     664             :             //--------
     665             : 
     666             :             //List0
     667             : 
     668           0 :             inPixel4 = _mm_loadu_si128((__m128i*) (ref16_l0 + ref_l0_stride));
     669           0 :             inPixel5 = _mm_loadu_si128((__m128i*) (ref16_l0 + ref_l0_stride + 8));
     670             : 
     671           0 :             out8_2_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel4, 2), _mm_srli_epi16(inPixel5, 2));
     672             : 
     673             :             //List1
     674             : 
     675           0 :             inPixel4 = _mm_loadu_si128((__m128i*) (ref16_l1 + ref_l1_stride));
     676           0 :             inPixel5 = _mm_loadu_si128((__m128i*) (ref16_l1 + ref_l1_stride + 8));
     677             : 
     678           0 :             out8_2_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel4, 2), _mm_srli_epi16(inPixel5, 2));
     679             : 
     680             :             //AVG
     681           0 :             avg8_2_U8 = _mm_avg_epu8(out8_2_U8_L0, out8_2_U8_L1);
     682             : #if ALSTORE
     683           0 :             _mm_store_si128((__m128i*)(dst_ptr + dst_stride), avg8_2_U8);
     684             : #else
     685             :             _mm_storeu_si128((__m128i*)(dst_ptr + dst_stride), avg8_2_U8);
     686             : #endif
     687           0 :             dst_ptr += 2 * dst_stride;
     688           0 :             ref16_l0 += 2 * ref_l0_stride;
     689           0 :             ref16_l1 += 2 * ref_l1_stride;
     690             :         }
     691             :     }
     692           0 :     else if (width == 32)
     693             :     {
     694             : #if B256
     695             :         __m256i inVal16b_0, inVal16b_1;
     696             :         __m256i data8b_32_0_L0, data8b_32_0_L1;
     697             :         __m256i avg8b_32_0;
     698             : #else
     699             :         __m128i inPixel2, inPixel3, inPixel4, inPixel5, inPixel6, inPixel7;
     700             :         __m128i out8_0_U8_L0, out8_1_U8_L0, out8_2_U8_L0, out8_3_U8_L0;
     701             :         __m128i out8_0_U8_L1, out8_1_U8_L1, out8_2_U8_L1, out8_3_U8_L1;
     702             :         __m128i avg8_0_U8, avg8_1_U8, avg8_2_U8, avg8_3_U8;
     703             : #endif
     704             : 
     705           0 :         for (y = 0; y < height; y += 2)
     706             :         {
     707             : #if B256
     708             :             //--------
     709             :             //Line One
     710             :             //--------
     711             : 
     712             :             //List0
     713           0 :             inVal16b_0 = _mm256_loadu_si256((__m256i*) ref16_l0);
     714           0 :             inVal16b_1 = _mm256_loadu_si256((__m256i*)(ref16_l0 + 16));
     715           0 :             data8b_32_0_L0 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_0, 2), _mm256_srli_epi16(inVal16b_1, 2));
     716             :             //List1
     717           0 :             inVal16b_0 = _mm256_loadu_si256((__m256i*) ref16_l1);
     718           0 :             inVal16b_1 = _mm256_loadu_si256((__m256i*)(ref16_l1 + 16));
     719           0 :             data8b_32_0_L1 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_0, 2), _mm256_srli_epi16(inVal16b_1, 2));
     720             : 
     721             :             //Avg
     722           0 :             avg8b_32_0 = _mm256_avg_epu8(data8b_32_0_L0, data8b_32_0_L1);
     723             : 
     724           0 :             avg8b_32_0 = _mm256_permute4x64_epi64(avg8b_32_0, 216);
     725             : 
     726             :             _mm256_storeu_si256((__m256i *)(dst_ptr), avg8b_32_0);
     727             : 
     728             :             //--------
     729             :             //Line Two
     730             :             //--------
     731             :               //List0
     732           0 :             inVal16b_0 = _mm256_loadu_si256((__m256i*)(ref16_l0 + ref_l0_stride));
     733           0 :             inVal16b_1 = _mm256_loadu_si256((__m256i*)(ref16_l0 + ref_l0_stride + 16));
     734             : 
     735           0 :             data8b_32_0_L0 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_0, 2), _mm256_srli_epi16(inVal16b_1, 2));
     736             : 
     737             :             //List1
     738           0 :             inVal16b_0 = _mm256_loadu_si256((__m256i*)(ref16_l1 + ref_l1_stride));
     739           0 :             inVal16b_1 = _mm256_loadu_si256((__m256i*)(ref16_l1 + ref_l1_stride + 16));
     740             : 
     741           0 :             data8b_32_0_L1 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_0, 2), _mm256_srli_epi16(inVal16b_1, 2));
     742             : 
     743             :             //Avg
     744           0 :             avg8b_32_0 = _mm256_avg_epu8(data8b_32_0_L0, data8b_32_0_L1);
     745             : 
     746           0 :             avg8b_32_0 = _mm256_permute4x64_epi64(avg8b_32_0, 216);
     747             : 
     748           0 :             _mm256_storeu_si256((__m256i *)(dst_ptr + dst_stride), avg8b_32_0);
     749             : 
     750             : #else
     751             :             //--------
     752             :             //Line One
     753             :             //--------
     754             : 
     755             :             //List0
     756             : 
     757             :             inPixel0 = _mm_loadu_si128((__m128i*)  ref16_l0);
     758             :             inPixel1 = _mm_loadu_si128((__m128i*) (ref16_l0 + 8));
     759             :             inPixel2 = _mm_loadu_si128((__m128i*) (ref16_l0 + 16));
     760             :             inPixel3 = _mm_loadu_si128((__m128i*) (ref16_l0 + 24));
     761             : 
     762             :             out8_0_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel0, 2), _mm_srli_epi16(inPixel1, 2));
     763             :             out8_1_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel2, 2), _mm_srli_epi16(inPixel3, 2));
     764             : 
     765             :             //List1
     766             : 
     767             :             inPixel0 = _mm_loadu_si128((__m128i*) ref16_l1);
     768             :             inPixel1 = _mm_loadu_si128((__m128i*)(ref16_l1 + 8));
     769             :             inPixel2 = _mm_loadu_si128((__m128i*)(ref16_l1 + 16));
     770             :             inPixel3 = _mm_loadu_si128((__m128i*)(ref16_l1 + 24));
     771             : 
     772             :             out8_0_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel0, 2), _mm_srli_epi16(inPixel1, 2));
     773             :             out8_1_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel2, 2), _mm_srli_epi16(inPixel3, 2));
     774             : 
     775             :             //AVG
     776             :             avg8_0_U8 = _mm_avg_epu8(out8_0_U8_L0, out8_0_U8_L1);
     777             :             avg8_1_U8 = _mm_avg_epu8(out8_1_U8_L0, out8_1_U8_L1);
     778             : #if ALSTORE
     779             :             _mm_store_si128((__m128i*) dst_ptr, avg8_0_U8);
     780             :             _mm_store_si128((__m128i*)(dst_ptr + 16), avg8_1_U8);
     781             : #else
     782             :             _mm_storeu_si128((__m128i*) dst_ptr, avg8_0_U8);
     783             :             _mm_storeu_si128((__m128i*)(dst_ptr + 16), avg8_1_U8);
     784             : #endif
     785             : 
     786             :             //--------
     787             :             //Line Two
     788             :             //--------
     789             : 
     790             :             //List0
     791             : 
     792             :             inPixel4 = _mm_loadu_si128((__m128i*) (ref16_l0 + ref_l0_stride));
     793             :             inPixel5 = _mm_loadu_si128((__m128i*) (ref16_l0 + ref_l0_stride + 8));
     794             :             inPixel6 = _mm_loadu_si128((__m128i*) (ref16_l0 + ref_l0_stride + 16));
     795             :             inPixel7 = _mm_loadu_si128((__m128i*) (ref16_l0 + ref_l0_stride + 24));
     796             : 
     797             :             out8_2_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel4, 2), _mm_srli_epi16(inPixel5, 2));
     798             :             out8_3_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel6, 2), _mm_srli_epi16(inPixel7, 2));
     799             : 
     800             :             //List1
     801             : 
     802             :             inPixel4 = _mm_loadu_si128((__m128i*) (ref16_l1 + ref_l1_stride));
     803             :             inPixel5 = _mm_loadu_si128((__m128i*) (ref16_l1 + ref_l1_stride + 8));
     804             :             inPixel6 = _mm_loadu_si128((__m128i*) (ref16_l1 + ref_l1_stride + 16));
     805             :             inPixel7 = _mm_loadu_si128((__m128i*) (ref16_l1 + ref_l1_stride + 24));
     806             : 
     807             :             out8_2_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel4, 2), _mm_srli_epi16(inPixel5, 2));
     808             :             out8_3_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel6, 2), _mm_srli_epi16(inPixel7, 2));
     809             : 
     810             :             //AVG
     811             :             avg8_2_U8 = _mm_avg_epu8(out8_2_U8_L0, out8_2_U8_L1);
     812             :             avg8_3_U8 = _mm_avg_epu8(out8_3_U8_L0, out8_3_U8_L1);
     813             : #if ALSTORE
     814             :             _mm_store_si128((__m128i*)(dst_ptr + dst_stride), avg8_2_U8);
     815             :             _mm_store_si128((__m128i*)(dst_ptr + dst_stride + 16), avg8_3_U8);
     816             : #else
     817             :             _mm_storeu_si128((__m128i*)(dst_ptr + dst_stride), avg8_2_U8);
     818             :             _mm_storeu_si128((__m128i*)(dst_ptr + dst_stride + 16), avg8_3_U8);
     819             : #endif
     820             : 
     821             : #endif
     822           0 :             dst_ptr += 2 * dst_stride;
     823           0 :             ref16_l0 += 2 * ref_l0_stride;
     824           0 :             ref16_l1 += 2 * ref_l1_stride;
     825             :         }
     826             :     }
     827           0 :     else if (width == 64)
     828             :     {
     829             : #if B256
     830             :         __m256i inVal16b_0, inVal16b_1, inVal16b_2, inVal16b_3;
     831             :         __m256i data8b_32_0_L0, data8b_32_1_L0, data8b_32_0_L1, data8b_32_1_L1;
     832             :         __m256i avg8b_32_0, avg8b_32_1;
     833             : #else
     834             :         __m128i inPixel2, inPixel3, inPixel4, inPixel5, inPixel6, inPixel7;
     835             :         __m128i out8_0_U8_L0, out8_1_U8_L0, out8_2_U8_L0, out8_3_U8_L0;
     836             :         __m128i out8_0_U8_L1, out8_1_U8_L1, out8_2_U8_L1, out8_3_U8_L1;
     837             :         __m128i avg8_0_U8, avg8_1_U8, avg8_2_U8, avg8_3_U8;
     838             : 
     839             : #endif
     840             : 
     841           0 :         for (y = 0; y < height; ++y)
     842             :         {
     843             : #if B256       // _mm256_lddqu_si256
     844             : 
     845             :             //List0
     846           0 :             inVal16b_0 = _mm256_loadu_si256((__m256i*) ref16_l0);
     847           0 :             inVal16b_1 = _mm256_loadu_si256((__m256i*)(ref16_l0 + 16));
     848           0 :             inVal16b_2 = _mm256_loadu_si256((__m256i*)(ref16_l0 + 32));
     849           0 :             inVal16b_3 = _mm256_loadu_si256((__m256i*)(ref16_l0 + 48));
     850           0 :             data8b_32_0_L0 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_0, 2), _mm256_srli_epi16(inVal16b_1, 2));
     851           0 :             data8b_32_1_L0 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_2, 2), _mm256_srli_epi16(inVal16b_3, 2));
     852             :             //List1
     853           0 :             inVal16b_0 = _mm256_loadu_si256((__m256i*) ref16_l1);
     854           0 :             inVal16b_1 = _mm256_loadu_si256((__m256i*)(ref16_l1 + 16));
     855           0 :             inVal16b_2 = _mm256_loadu_si256((__m256i*)(ref16_l1 + 32));
     856           0 :             inVal16b_3 = _mm256_loadu_si256((__m256i*)(ref16_l1 + 48));
     857           0 :             data8b_32_0_L1 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_0, 2), _mm256_srli_epi16(inVal16b_1, 2));
     858           0 :             data8b_32_1_L1 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_2, 2), _mm256_srli_epi16(inVal16b_3, 2));
     859             : 
     860             :             //Avg
     861           0 :             avg8b_32_0 = _mm256_avg_epu8(data8b_32_0_L0, data8b_32_0_L1);
     862           0 :             avg8b_32_1 = _mm256_avg_epu8(data8b_32_1_L0, data8b_32_1_L1);
     863             : 
     864           0 :             avg8b_32_0 = _mm256_permute4x64_epi64(avg8b_32_0, 216);
     865           0 :             avg8b_32_1 = _mm256_permute4x64_epi64(avg8b_32_1, 216);
     866             : 
     867             :             _mm256_storeu_si256((__m256i *)(dst_ptr), avg8b_32_0);
     868           0 :             _mm256_storeu_si256((__m256i *)(dst_ptr + 32), avg8b_32_1);
     869             : #else
     870             :             //List0
     871             :             inPixel0 = _mm_loadu_si128((__m128i*) ref16_l0);
     872             :             inPixel1 = _mm_loadu_si128((__m128i*)(ref16_l0 + 8));
     873             :             inPixel2 = _mm_loadu_si128((__m128i*)(ref16_l0 + 16));
     874             :             inPixel3 = _mm_loadu_si128((__m128i*)(ref16_l0 + 24));
     875             :             inPixel4 = _mm_loadu_si128((__m128i*)(ref16_l0 + 32));
     876             :             inPixel5 = _mm_loadu_si128((__m128i*)(ref16_l0 + 40));
     877             :             inPixel6 = _mm_loadu_si128((__m128i*)(ref16_l0 + 48));
     878             :             inPixel7 = _mm_loadu_si128((__m128i*)(ref16_l0 + 56));
     879             : 
     880             :             out8_0_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel0, 2), _mm_srli_epi16(inPixel1, 2));
     881             :             out8_1_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel2, 2), _mm_srli_epi16(inPixel3, 2));
     882             :             out8_2_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel4, 2), _mm_srli_epi16(inPixel5, 2));
     883             :             out8_3_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel6, 2), _mm_srli_epi16(inPixel7, 2));
     884             : 
     885             :             //List1
     886             :             inPixel0 = _mm_loadu_si128((__m128i*) ref16_l1);
     887             :             inPixel1 = _mm_loadu_si128((__m128i*)(ref16_l1 + 8));
     888             :             inPixel2 = _mm_loadu_si128((__m128i*)(ref16_l1 + 16));
     889             :             inPixel3 = _mm_loadu_si128((__m128i*)(ref16_l1 + 24));
     890             :             inPixel4 = _mm_loadu_si128((__m128i*)(ref16_l1 + 32));
     891             :             inPixel5 = _mm_loadu_si128((__m128i*)(ref16_l1 + 40));
     892             :             inPixel6 = _mm_loadu_si128((__m128i*)(ref16_l1 + 48));
     893             :             inPixel7 = _mm_loadu_si128((__m128i*)(ref16_l1 + 56));
     894             : 
      895             :             //Note: the old version used _mm_and_si128 to mask off the MSBs of the pixels
     896             :             out8_0_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel0, 2), _mm_srli_epi16(inPixel1, 2));
     897             :             out8_1_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel2, 2), _mm_srli_epi16(inPixel3, 2));
     898             :             out8_2_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel4, 2), _mm_srli_epi16(inPixel5, 2));
     899             :             out8_3_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel6, 2), _mm_srli_epi16(inPixel7, 2));
     900             : 
     901             :             //AVG
     902             :             avg8_0_U8 = _mm_avg_epu8(out8_0_U8_L0, out8_0_U8_L1);
     903             :             avg8_1_U8 = _mm_avg_epu8(out8_1_U8_L0, out8_1_U8_L1);
     904             :             avg8_2_U8 = _mm_avg_epu8(out8_2_U8_L0, out8_2_U8_L1);
     905             :             avg8_3_U8 = _mm_avg_epu8(out8_3_U8_L0, out8_3_U8_L1);
     906             : #if ALSTORE
     907             :             _mm_store_si128((__m128i*) dst_ptr, avg8_0_U8);
     908             :             _mm_store_si128((__m128i*)(dst_ptr + 16), avg8_1_U8);
     909             :             _mm_store_si128((__m128i*)(dst_ptr + 32), avg8_2_U8);
     910             :             _mm_store_si128((__m128i*)(dst_ptr + 48), avg8_3_U8);
     911             : #else
     912             :             _mm_storeu_si128((__m128i*) dst_ptr, avg8_0_U8);
     913             :             _mm_storeu_si128((__m128i*)(dst_ptr + 16), avg8_1_U8);
     914             :             _mm_storeu_si128((__m128i*)(dst_ptr + 32), avg8_2_U8);
     915             :             _mm_storeu_si128((__m128i*)(dst_ptr + 48), avg8_3_U8);
     916             : #endif
     917             : 
     918             : #endif
     919           0 :             dst_ptr += dst_stride;
     920           0 :             ref16_l0 += ref_l0_stride;
     921           0 :             ref16_l1 += ref_l1_stride;
     922             :         }
     923             :     }
     924             : 
     925           0 :     return;
     926             : }
     927             : 
     928           0 : int32_t  sum_residual8bit_avx2_intrin(
     929             :     int16_t * in_ptr,
     930             :     uint32_t   size,
     931             :     uint32_t   stride_in)
     932             : {
     933             :     int32_t  sumBlock;
     934             : 
     935             :     __m128i in0, in1, in01, in2, in3, in23, sum, sumL, sumH;
     936             :     __m256i sum0, sum1, sum2, sum3, sum0L, sum0H, sumT, sum01, sumTPerm;
     937             :     uint32_t row_index;
     938             : 
      939             :     //Assumption: 9-bit or 11-bit residual data. For bigger block sizes or bigger bit depths, re-assess the dynamic range of the internal calculation.
     940             : 
      941           0 :     if (size == 4) { //SSSE3 + SSE4.1 (_mm_cvtepi16_epi32)
     942             : 
     943           0 :         __m128i zer = _mm_setzero_si128();
     944             : 
     945           0 :         in0 = _mm_loadl_epi64((__m128i*)in_ptr);
     946           0 :         in1 = _mm_loadl_epi64((__m128i*)(in_ptr + stride_in));
     947           0 :         in1 = _mm_shuffle_epi32(in1, 0x4A); //01.00.10.10
     948           0 :         in01 = _mm_or_si128(in1, in0);
     949             : 
     950           0 :         in2 = _mm_loadl_epi64((__m128i*)(in_ptr + 2 * stride_in));
     951           0 :         in3 = _mm_loadl_epi64((__m128i*)(in_ptr + 3 * stride_in));
     952           0 :         in3 = _mm_shuffle_epi32(in3, 0x4A); //01.00.10.10
     953           0 :         in23 = _mm_or_si128(in3, in2);
     954             : 
     955           0 :         sum = _mm_add_epi16(in01, in23);
     956           0 :         sum = _mm_hadd_epi16(sum, zer);
     957           0 :         sum = _mm_hadd_epi16(sum, zer);
     958           0 :         sum = _mm_hadd_epi16(sum, zer);
     959             : 
     960           0 :         sum = _mm_cvtepi16_epi32(sum);
     961           0 :         sumBlock = _mm_cvtsi128_si32(sum);
     962             : 
     963           0 :         return sumBlock;
     964             :     }
      965           0 :     else if (size == 8) {//SSSE3 + SSE4.1 (_mm_cvtepi16_epi32)
     966             : 
     967           0 :         __m128i zer = _mm_setzero_si128();
     968             : 
     969           0 :         sum = _mm_add_epi16(_mm_loadu_si128((__m128i*)(in_ptr + 0 * stride_in)), _mm_loadu_si128((__m128i*)(in_ptr + 1 * stride_in)));
     970           0 :         sum = _mm_add_epi16(sum, _mm_loadu_si128((__m128i*)(in_ptr + 2 * stride_in)));
     971           0 :         sum = _mm_add_epi16(sum, _mm_loadu_si128((__m128i*)(in_ptr + 3 * stride_in)));
     972           0 :         sum = _mm_add_epi16(sum, _mm_loadu_si128((__m128i*)(in_ptr + 4 * stride_in)));
     973           0 :         sum = _mm_add_epi16(sum, _mm_loadu_si128((__m128i*)(in_ptr + 5 * stride_in)));
     974           0 :         sum = _mm_add_epi16(sum, _mm_loadu_si128((__m128i*)(in_ptr + 6 * stride_in)));
     975           0 :         sum = _mm_add_epi16(sum, _mm_loadu_si128((__m128i*)(in_ptr + 7 * stride_in)));
     976             : 
     977           0 :         sum = _mm_hadd_epi16(sum, zer);
     978           0 :         sum = _mm_hadd_epi16(sum, zer);
      979             :         // The sum is 16-bit; for negative values we need to sign-extend to 32 bits
      980             :         // so that the final extraction to int32_t is correct.
     981           0 :         sum = _mm_cvtepi16_epi32(sum);
     982           0 :         sum = _mm_hadd_epi32(sum, zer);
     983             : 
     984           0 :         sumBlock = _mm_cvtsi128_si32(sum);
     985             : 
     986           0 :         return sumBlock;
     987             :     }
     988           0 :     else if (size == 16) {//AVX2
     989             : 
     990           0 :         sum0 = _mm256_add_epi16(_mm256_loadu_si256((__m256i *)(in_ptr + 0 * stride_in)), _mm256_loadu_si256((__m256i *)(in_ptr + 1 * stride_in)));
     991           0 :         sum0 = _mm256_add_epi16(sum0, _mm256_loadu_si256((__m256i *)(in_ptr + 2 * stride_in)));
     992           0 :         sum0 = _mm256_add_epi16(sum0, _mm256_loadu_si256((__m256i *)(in_ptr + 3 * stride_in)));
     993           0 :         sum0 = _mm256_add_epi16(sum0, _mm256_loadu_si256((__m256i *)(in_ptr + 4 * stride_in)));
     994           0 :         sum0 = _mm256_add_epi16(sum0, _mm256_loadu_si256((__m256i *)(in_ptr + 5 * stride_in)));
     995           0 :         sum0 = _mm256_add_epi16(sum0, _mm256_loadu_si256((__m256i *)(in_ptr + 6 * stride_in)));
     996           0 :         sum0 = _mm256_add_epi16(sum0, _mm256_loadu_si256((__m256i *)(in_ptr + 7 * stride_in)));
     997             : 
     998           0 :         in_ptr += 8 * stride_in;
     999           0 :         sum1 = _mm256_add_epi16(_mm256_loadu_si256((__m256i *)(in_ptr + 0 * stride_in)), _mm256_loadu_si256((__m256i *)(in_ptr + 1 * stride_in)));
    1000           0 :         sum1 = _mm256_add_epi16(sum1, _mm256_loadu_si256((__m256i *)(in_ptr + 2 * stride_in)));
    1001           0 :         sum1 = _mm256_add_epi16(sum1, _mm256_loadu_si256((__m256i *)(in_ptr + 3 * stride_in)));
    1002           0 :         sum1 = _mm256_add_epi16(sum1, _mm256_loadu_si256((__m256i *)(in_ptr + 4 * stride_in)));
    1003           0 :         sum1 = _mm256_add_epi16(sum1, _mm256_loadu_si256((__m256i *)(in_ptr + 5 * stride_in)));
    1004           0 :         sum1 = _mm256_add_epi16(sum1, _mm256_loadu_si256((__m256i *)(in_ptr + 6 * stride_in)));
    1005           0 :         sum1 = _mm256_add_epi16(sum1, _mm256_loadu_si256((__m256i *)(in_ptr + 7 * stride_in)));
    1006             : 
    1007           0 :         sum01 = _mm256_add_epi16(sum0, sum1);
    1008             : 
     1009             :         //widen from 16-bit to 32-bit (to avoid overflow for large sums)
    1010           0 :         sumL = _mm256_castsi256_si128(sum01);
    1011           0 :         sumH = _mm256_extracti128_si256(sum01, 1);
    1012           0 :         sum0L = _mm256_cvtepi16_epi32(sumL);
    1013           0 :         sum0H = _mm256_cvtepi16_epi32(sumH);
    1014             : 
    1015           0 :         sumT = _mm256_add_epi32(sum0L, sum0H);
    1016             : 
    1017           0 :         sumT = _mm256_hadd_epi32(sumT, sumT);
    1018           0 :         sumT = _mm256_hadd_epi32(sumT, sumT);
    1019           0 :         sumTPerm = _mm256_permute4x64_epi64(sumT, 2); //00.00.00.10
    1020           0 :         sumT = _mm256_add_epi32(sumT, sumTPerm);
    1021             : 
    1022           0 :         sum = _mm256_castsi256_si128(sumT);
    1023           0 :         sumBlock = _mm_cvtsi128_si32(sum);
    1024             : 
    1025           0 :         return sumBlock;
    1026             :     }
    1027           0 :     else if (size == 32) {//AVX2
    1028           0 :         int16_t *inPtrTemp = in_ptr;
    1029             : 
    1030           0 :         sum0 = sum1 = sum2 = sum3 = _mm256_setzero_si256();
    1031           0 :         for (row_index = 0; row_index < size; row_index += 2) { // Parse every two rows
    1032           0 :             sum0 = _mm256_add_epi16(sum0, _mm256_loadu_si256((__m256i *)(inPtrTemp)));
    1033           0 :             sum1 = _mm256_add_epi16(sum1, _mm256_loadu_si256((__m256i *)(inPtrTemp + 16)));
    1034           0 :             inPtrTemp += stride_in;
    1035           0 :             sum2 = _mm256_add_epi16(sum2, _mm256_loadu_si256((__m256i *)(inPtrTemp)));
    1036           0 :             sum3 = _mm256_add_epi16(sum3, _mm256_loadu_si256((__m256i *)(inPtrTemp + 16)));
    1037           0 :             inPtrTemp += stride_in;
    1038             :         }
     1039             :         //widen from 16-bit to 32-bit (to avoid overflow for large sums)
    1040           0 :         sumL = _mm256_castsi256_si128(sum0);
    1041           0 :         sumH = _mm256_extracti128_si256(sum0, 1);
    1042           0 :         sum0L = _mm256_cvtepi16_epi32(sumL);
    1043           0 :         sum0H = _mm256_cvtepi16_epi32(sumH);
    1044           0 :         sumT = _mm256_add_epi32(sum0L, sum0H);
    1045             : 
    1046           0 :         sumL = _mm256_castsi256_si128(sum1);
    1047           0 :         sumH = _mm256_extracti128_si256(sum1, 1);
    1048           0 :         sum0L = _mm256_cvtepi16_epi32(sumL);
    1049           0 :         sum0H = _mm256_cvtepi16_epi32(sumH);
    1050           0 :         sumT = _mm256_add_epi32(sumT, sum0L);
    1051           0 :         sumT = _mm256_add_epi32(sumT, sum0H);
    1052             : 
    1053           0 :         sumL = _mm256_castsi256_si128(sum2);
    1054           0 :         sumH = _mm256_extracti128_si256(sum2, 1);
    1055           0 :         sum0L = _mm256_cvtepi16_epi32(sumL);
    1056           0 :         sum0H = _mm256_cvtepi16_epi32(sumH);
    1057           0 :         sumT = _mm256_add_epi32(sumT, sum0L);
    1058           0 :         sumT = _mm256_add_epi32(sumT, sum0H);
    1059             : 
    1060           0 :         sumL = _mm256_castsi256_si128(sum3);
    1061           0 :         sumH = _mm256_extracti128_si256(sum3, 1);
    1062           0 :         sum0L = _mm256_cvtepi16_epi32(sumL);
    1063           0 :         sum0H = _mm256_cvtepi16_epi32(sumH);
    1064           0 :         sumT = _mm256_add_epi32(sumT, sum0L);
    1065           0 :         sumT = _mm256_add_epi32(sumT, sum0H);
    1066             : 
    1067           0 :         sumT = _mm256_hadd_epi32(sumT, sumT);
    1068           0 :         sumT = _mm256_hadd_epi32(sumT, sumT);
    1069           0 :         sumTPerm = _mm256_permute4x64_epi64(sumT, 2); //00.00.00.10
    1070           0 :         sumT = _mm256_add_epi32(sumT, sumTPerm);
    1071             : 
    1072           0 :         sum = _mm256_castsi256_si128(sumT);
    1073           0 :         sumBlock = _mm_cvtsi128_si32(sum);
    1074             : 
    1075           0 :         return sumBlock;
    1076             :     } 
    1077             : 
    1078           0 :     else if (size == 64) {//AVX2 
     1079             :         // No more than 11 bits of valid residual data is assumed; if the residual exceeds 11 bits, the 16-bit accumulation below can overflow and the following code needs to be rewritten.
    1080           0 :         int16_t* inPtrTemp = in_ptr;
    1081             :         __m256i sum4, sum5, sum6, sum7;
    1082             : 
    1083           0 :         const __m256i ones = _mm256_set1_epi16(1);
    1084           0 :         const __m256i zeros = _mm256_setzero_si256();
    1085             : 
    1086           0 :         sum0 = _mm256_add_epi16(_mm256_loadu_si256((__m256i*)(inPtrTemp)), _mm256_loadu_si256((__m256i*)(inPtrTemp + 16)));
    1087           0 :         sum1 = _mm256_add_epi16(_mm256_loadu_si256((__m256i*)(inPtrTemp + 32)), _mm256_loadu_si256((__m256i*)(inPtrTemp + 48)));
    1088           0 :         sum2 = _mm256_add_epi16(_mm256_loadu_si256((__m256i*)(inPtrTemp + stride_in)), _mm256_loadu_si256((__m256i*)(inPtrTemp + stride_in + 16)));
     1089           0 :         sum3 = _mm256_add_epi16(_mm256_loadu_si256((__m256i*)(inPtrTemp + stride_in + 32)), _mm256_loadu_si256((__m256i*)(inPtrTemp + stride_in + 48)));
     1090             : 
     1091           0 :         inPtrTemp += 2 * stride_in;
     1092             : 
     1093           0 :         sum4 = _mm256_add_epi16(_mm256_loadu_si256((__m256i*)(inPtrTemp)), _mm256_loadu_si256((__m256i*)(inPtrTemp + 16)));
     1094           0 :         sum5 = _mm256_add_epi16(_mm256_loadu_si256((__m256i*)(inPtrTemp + 32)), _mm256_loadu_si256((__m256i*)(inPtrTemp + 48)));
     1095           0 :         sum6 = _mm256_add_epi16(_mm256_loadu_si256((__m256i*)(inPtrTemp + stride_in)), _mm256_loadu_si256((__m256i*)(inPtrTemp + stride_in + 16)));
     1096           0 :         sum7 = _mm256_add_epi16(_mm256_loadu_si256((__m256i*)(inPtrTemp + stride_in + 32)), _mm256_loadu_si256((__m256i*)(inPtrTemp + stride_in + 48)));
    1097             : 
    1098           0 :         inPtrTemp += 2 * stride_in;
    1099             : 
    1100           0 :         for (row_index = 4; row_index < 64; row_index += 4) {  // Parse every four rows
    1101           0 :             sum0 = _mm256_add_epi16(sum0, _mm256_loadu_si256((__m256i*)(inPtrTemp)));
    1102           0 :             sum1 = _mm256_add_epi16(sum1, _mm256_loadu_si256((__m256i*)(inPtrTemp + 16)));
    1103           0 :             sum2 = _mm256_add_epi16(sum2, _mm256_loadu_si256((__m256i*)(inPtrTemp + 32)));
    1104           0 :             sum3 = _mm256_add_epi16(sum3, _mm256_loadu_si256((__m256i*)(inPtrTemp + 48)));
    1105             : 
    1106           0 :             sum0 = _mm256_add_epi16(sum0, _mm256_loadu_si256((__m256i*)(inPtrTemp + stride_in)));
    1107           0 :             sum1 = _mm256_add_epi16(sum1, _mm256_loadu_si256((__m256i*)(inPtrTemp + stride_in + 16)));
    1108           0 :             sum2 = _mm256_add_epi16(sum2, _mm256_loadu_si256((__m256i*)(inPtrTemp + stride_in + 32)));
    1109           0 :             sum3 = _mm256_add_epi16(sum3, _mm256_loadu_si256((__m256i*)(inPtrTemp + stride_in + 48)));
    1110             :             
    1111           0 :             inPtrTemp += 2 * stride_in;
    1112             :             
    1113           0 :             sum4 = _mm256_add_epi16(sum4, _mm256_loadu_si256((__m256i*)(inPtrTemp)));
    1114           0 :             sum5 = _mm256_add_epi16(sum5, _mm256_loadu_si256((__m256i*)(inPtrTemp + 16)));
    1115           0 :             sum6 = _mm256_add_epi16(sum6, _mm256_loadu_si256((__m256i*)(inPtrTemp + 32)));
    1116           0 :             sum7 = _mm256_add_epi16(sum7, _mm256_loadu_si256((__m256i*)(inPtrTemp + 48)));
    1117             : 
    1118           0 :             sum4 = _mm256_add_epi16(sum4, _mm256_loadu_si256((__m256i*)(inPtrTemp + stride_in)));
    1119           0 :             sum5 = _mm256_add_epi16(sum5, _mm256_loadu_si256((__m256i*)(inPtrTemp + stride_in + 16)));
    1120           0 :             sum6 = _mm256_add_epi16(sum6, _mm256_loadu_si256((__m256i*)(inPtrTemp + stride_in + 32)));
    1121           0 :             sum7 = _mm256_add_epi16(sum7, _mm256_loadu_si256((__m256i*)(inPtrTemp + stride_in + 48)));
    1122             : 
    1123           0 :             inPtrTemp += 2 * stride_in;
    1124             :         }
    1125             : 
    1126           0 :         sum0 = _mm256_madd_epi16(sum0, ones);
    1127           0 :         sum1 = _mm256_madd_epi16(sum1, ones);
    1128           0 :         sum2 = _mm256_madd_epi16(sum2, ones);
    1129           0 :         sum3 = _mm256_madd_epi16(sum3, ones);
    1130             :         
    1131           0 :         sum4 = _mm256_madd_epi16(sum4, ones);
    1132           0 :         sum5 = _mm256_madd_epi16(sum5, ones);
    1133           0 :         sum6 = _mm256_madd_epi16(sum6, ones);
    1134           0 :         sum7 = _mm256_madd_epi16(sum7, ones);
    1135             : 
    1136           0 :         sum0 = _mm256_add_epi32(sum0, sum1);
    1137           0 :         sum2 = _mm256_add_epi32(sum2, sum3);
    1138           0 :         sum4 = _mm256_add_epi32(sum4, sum5);
    1139           0 :         sum6 = _mm256_add_epi32(sum6, sum7);
    1140             :         
    1141           0 :         sum0 = _mm256_add_epi32(sum0, sum2);
    1142           0 :         sum4 = _mm256_add_epi32(sum4, sum6);
    1143           0 :         sum0 = _mm256_hadd_epi32(sum0, zeros);
    1144           0 :         sum4 = _mm256_hadd_epi32(sum4, zeros);
    1145             : 
    1146           0 :         sum0 = _mm256_add_epi32(sum0, sum4);
    1147           0 :         sum0 = _mm256_hadd_epi32(sum0, zeros);
    1148             : 
    1149           0 :         sumBlock = _mm_cvtsi128_si32(_mm_add_epi32(_mm256_castsi256_si128(sum0), _mm256_extracti128_si256(sum0, 1)));
    1150             : 
    1151           0 :         return sumBlock;
    1152             :     }
    1153             :     else {
    1154           0 :         printf("\n add the rest \n");
    1155           0 :         return 0;
    1156             :     }
    1157             : }
    1158             : 
    1159           0 : void memset16bit_block_avx2_intrin(
    1160             :     int16_t * in_ptr,
    1161             :     uint32_t   stride_in,
    1162             :     uint32_t   size,
    1163             :     int16_t   value
    1164             : )
    1165             : {
    1166           0 :     if (size == 4) {
    1167           0 :         __m128i line = _mm_set1_epi16(value);
    1168             : 
    1169           0 :         _mm_storel_epi64((__m128i *)(in_ptr + 0 * stride_in), line);
    1170           0 :         _mm_storel_epi64((__m128i *)(in_ptr + 1 * stride_in), line);
    1171           0 :         _mm_storel_epi64((__m128i *)(in_ptr + 2 * stride_in), line);
    1172           0 :         _mm_storel_epi64((__m128i *)(in_ptr + 3 * stride_in), line);
    1173             :     }
    1174           0 :     else if (size == 8) {
    1175           0 :         __m128i line = _mm_set1_epi16(value);
    1176             : 
    1177             :         _mm_storeu_si128((__m128i *)(in_ptr + 0 * stride_in), line);
    1178           0 :         _mm_storeu_si128((__m128i *)(in_ptr + 1 * stride_in), line);
    1179           0 :         _mm_storeu_si128((__m128i *)(in_ptr + 2 * stride_in), line);
    1180           0 :         _mm_storeu_si128((__m128i *)(in_ptr + 3 * stride_in), line);
    1181           0 :         _mm_storeu_si128((__m128i *)(in_ptr + 4 * stride_in), line);
    1182           0 :         _mm_storeu_si128((__m128i *)(in_ptr + 5 * stride_in), line);
    1183           0 :         _mm_storeu_si128((__m128i *)(in_ptr + 6 * stride_in), line);
    1184           0 :         _mm_storeu_si128((__m128i *)(in_ptr + 7 * stride_in), line);
    1185             :     }
    1186           0 :     else if (size == 16) {
    1187           0 :         __m256i line = _mm256_set1_epi16(value);
    1188             : 
    1189             :         _mm256_storeu_si256((__m256i *)(in_ptr + 0 * stride_in), line);
    1190           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 1 * stride_in), line);
    1191           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 2 * stride_in), line);
    1192           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 3 * stride_in), line);
    1193           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 4 * stride_in), line);
    1194           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 5 * stride_in), line);
    1195           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 6 * stride_in), line);
    1196           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 7 * stride_in), line);
    1197             : 
    1198           0 :         in_ptr += 8 * stride_in;
    1199             : 
    1200             :         _mm256_storeu_si256((__m256i *)(in_ptr + 0 * stride_in), line);
    1201           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 1 * stride_in), line);
    1202           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 2 * stride_in), line);
    1203           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 3 * stride_in), line);
    1204           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 4 * stride_in), line);
    1205           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 5 * stride_in), line);
    1206           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 6 * stride_in), line);
    1207           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 7 * stride_in), line);
    1208             :     }
    1209           0 :     else if (size == 32) {
    1210           0 :         __m256i line = _mm256_set1_epi16(value);
    1211             : 
    1212             :         _mm256_storeu_si256((__m256i *)(in_ptr + 0 * stride_in), line);
    1213           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 0 * stride_in + 16), line);
    1214           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 1 * stride_in), line);
    1215           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 1 * stride_in + 16), line);
    1216           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 2 * stride_in), line);
    1217           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 2 * stride_in + 16), line);
    1218           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 3 * stride_in), line);
    1219           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 3 * stride_in + 16), line);
    1220           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 4 * stride_in), line);
    1221           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 4 * stride_in + 16), line);
    1222           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 5 * stride_in), line);
    1223           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 5 * stride_in + 16), line);
    1224           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 6 * stride_in), line);
    1225           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 6 * stride_in + 16), line);
    1226           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 7 * stride_in), line);
    1227           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 7 * stride_in + 16), line);
    1228             : 
    1229           0 :         in_ptr += 8 * stride_in;
    1230             : 
    1231             :         _mm256_storeu_si256((__m256i *)(in_ptr + 0 * stride_in), line);
    1232           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 0 * stride_in + 16), line);
    1233           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 1 * stride_in), line);
    1234           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 1 * stride_in + 16), line);
    1235           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 2 * stride_in), line);
    1236           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 2 * stride_in + 16), line);
    1237           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 3 * stride_in), line);
    1238           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 3 * stride_in + 16), line);
    1239           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 4 * stride_in), line);
    1240           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 4 * stride_in + 16), line);
    1241           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 5 * stride_in), line);
    1242           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 5 * stride_in + 16), line);
    1243           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 6 * stride_in), line);
    1244           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 6 * stride_in + 16), line);
    1245           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 7 * stride_in), line);
    1246           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 7 * stride_in + 16), line);
    1247             : 
    1248           0 :         in_ptr += 8 * stride_in;
    1249             : 
    1250             :         _mm256_storeu_si256((__m256i *)(in_ptr + 0 * stride_in), line);
    1251           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 0 * stride_in + 16), line);
    1252           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 1 * stride_in), line);
    1253           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 1 * stride_in + 16), line);
    1254           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 2 * stride_in), line);
    1255           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 2 * stride_in + 16), line);
    1256           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 3 * stride_in), line);
    1257           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 3 * stride_in + 16), line);
    1258           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 4 * stride_in), line);
    1259           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 4 * stride_in + 16), line);
    1260           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 5 * stride_in), line);
    1261           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 5 * stride_in + 16), line);
    1262           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 6 * stride_in), line);
    1263           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 6 * stride_in + 16), line);
    1264           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 7 * stride_in), line);
    1265           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 7 * stride_in + 16), line);
    1266             : 
    1267           0 :         in_ptr += 8 * stride_in;
    1268             : 
    1269             :         _mm256_storeu_si256((__m256i *)(in_ptr + 0 * stride_in), line);
    1270           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 0 * stride_in + 16), line);
    1271           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 1 * stride_in), line);
    1272           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 1 * stride_in + 16), line);
    1273           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 2 * stride_in), line);
    1274           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 2 * stride_in + 16), line);
    1275           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 3 * stride_in), line);
    1276           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 3 * stride_in + 16), line);
    1277           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 4 * stride_in), line);
    1278           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 4 * stride_in + 16), line);
    1279           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 5 * stride_in), line);
    1280           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 5 * stride_in + 16), line);
    1281           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 6 * stride_in), line);
    1282           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 6 * stride_in + 16), line);
    1283           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 7 * stride_in), line);
    1284           0 :         _mm256_storeu_si256((__m256i *)(in_ptr + 7 * stride_in + 16), line);
    1285             :     }
    1286             : 
    1287             :     else
    1288           0 :         printf("\n add the rest \n");
    1289           0 : }
    1290             : 
    1291           0 : void unpack_avg_safe_sub_avx2_intrin(
    1292             :     uint16_t *ref16_l0,
    1293             :     uint32_t  ref_l0_stride,
    1294             :     uint16_t *ref16_l1,
    1295             :     uint32_t  ref_l1_stride,
    1296             :     uint8_t  *dst_ptr,
    1297             :     uint32_t  dst_stride,
    1298             :     EbBool  sub_pred,
    1299             :     uint32_t  width,
    1300             :     uint32_t  height)
    1301             : {
    1302             :     uint32_t   y;
    1303             :     __m128i inPixel0, inPixel1;
    1304             : 
    1305           0 :     if (width == 8)
    1306             :     {
    1307             :         __m128i out8_0_U8_L0, out8_0_U8_L1, out8_2_U8_L0, out8_2_U8_L1;
    1308             :         __m128i avg8_0_U8, avg8_2_U8;
    1309             : 
    1310           0 :         for (y = 0; y < height; y += 2)
    1311             :         {
    1312             :             //--------
    1313             :             //Line One
    1314             :             //--------
    1315             : 
    1316             :             //List0
    1317             : 
    1318           0 :             inPixel0 = _mm_loadu_si128((__m128i*) ref16_l0);
    1319             : 
    1320           0 :             inPixel1 = _mm_srli_epi16(inPixel0, 2);
    1321           0 :             out8_0_U8_L0 = _mm_packus_epi16(inPixel1, inPixel1);
    1322             : 
    1323             :             //List1
    1324             : 
    1325           0 :             inPixel0 = _mm_loadu_si128((__m128i*) ref16_l1);
    1326             : 
    1327           0 :             inPixel1 = _mm_srli_epi16(inPixel0, 2);
    1328           0 :             out8_0_U8_L1 = _mm_packus_epi16(inPixel1, inPixel1);
    1329             : 
    1330             :             //AVG
    1331           0 :             avg8_0_U8 = _mm_avg_epu8(out8_0_U8_L0, out8_0_U8_L1);
    1332             : 
    1333           0 :             _mm_storel_epi64((__m128i*) dst_ptr, avg8_0_U8);
    1334             : 
    1335             :             //--------
    1336             :             //Line Two
    1337             :             //--------
    1338             : 
    1339             :             //List0
    1340             : 
    1341           0 :             inPixel0 = _mm_loadu_si128((__m128i*)(ref16_l0 + ref_l0_stride));
    1342             : 
    1343           0 :             inPixel1 = _mm_srli_epi16(inPixel0, 2);
    1344           0 :             out8_2_U8_L0 = _mm_packus_epi16(inPixel1, inPixel1);
    1345             : 
    1346             :             //List1
    1347             : 
    1348           0 :             inPixel0 = _mm_loadu_si128((__m128i*)(ref16_l1 + ref_l1_stride));
    1349             : 
    1350           0 :             inPixel1 = _mm_srli_epi16(inPixel0, 2);
    1351           0 :             out8_2_U8_L1 = _mm_packus_epi16(inPixel1, inPixel1);
    1352             : 
    1353             :             //AVG
    1354           0 :             avg8_2_U8 = _mm_avg_epu8(out8_2_U8_L0, out8_2_U8_L1);
    1355             : 
    1356           0 :             _mm_storel_epi64((__m128i*)(dst_ptr + dst_stride), avg8_2_U8);
    1357             : 
    1358           0 :             dst_ptr += 2 * dst_stride;
    1359           0 :             ref16_l0 += 2 * ref_l0_stride;
    1360           0 :             ref16_l1 += 2 * ref_l1_stride;
    1361             :         }
    1362             : 
    1363           0 :         if (sub_pred) {
    1364           0 :             ref16_l0 -= (ref_l0_stride >> 1);
    1365           0 :             ref16_l1 -= (ref_l1_stride >> 1);
    1366           0 :             dst_ptr -= (dst_stride >> 1);
    1367             :             //List0
    1368           0 :             inPixel0 = _mm_loadu_si128((__m128i*) ref16_l0);
    1369           0 :             inPixel1 = _mm_srli_epi16(inPixel0, 2);
    1370           0 :             out8_0_U8_L0 = _mm_packus_epi16(inPixel1, inPixel1);
    1371             :             //List1
    1372           0 :             inPixel0 = _mm_loadu_si128((__m128i*) ref16_l1);
    1373           0 :             inPixel1 = _mm_srli_epi16(inPixel0, 2);
    1374           0 :             out8_0_U8_L1 = _mm_packus_epi16(inPixel1, inPixel1);
    1375             :             //AVG
    1376           0 :             avg8_0_U8 = _mm_avg_epu8(out8_0_U8_L0, out8_0_U8_L1);
    1377           0 :             _mm_storel_epi64((__m128i*) dst_ptr, avg8_0_U8);
    1378             :         }
    1379             :     }
    1380           0 :     else if (width == 16)
    1381             :     {
    1382             :         __m128i inPixel4, inPixel5;
    1383             :         __m128i out8_0_U8_L0, out8_0_U8_L1, out8_2_U8_L0, out8_2_U8_L1;
    1384             :         __m128i avg8_0_U8, avg8_2_U8;
    1385             : 
    1386           0 :         for (y = 0; y < height; y += 2)
    1387             :         {
    1388             :             //--------
    1389             :             //Line One
    1390             :             //--------
    1391             : 
    1392             :             //List0
    1393             : 
    1394           0 :             inPixel0 = _mm_loadu_si128((__m128i*)  ref16_l0);
    1395           0 :             inPixel1 = _mm_loadu_si128((__m128i*) (ref16_l0 + 8));
    1396             : 
    1397           0 :             out8_0_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel0, 2), _mm_srli_epi16(inPixel1, 2));
    1398             : 
    1399             :             //List1
    1400             : 
    1401           0 :             inPixel0 = _mm_loadu_si128((__m128i*) ref16_l1);
    1402           0 :             inPixel1 = _mm_loadu_si128((__m128i*)(ref16_l1 + 8));
    1403             : 
    1404           0 :             out8_0_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel0, 2), _mm_srli_epi16(inPixel1, 2));
    1405             : 
    1406             :             //AVG
    1407           0 :             avg8_0_U8 = _mm_avg_epu8(out8_0_U8_L0, out8_0_U8_L1);
    1408             : 
    1409             :             _mm_store_si128((__m128i*) dst_ptr, avg8_0_U8);
    1410             : 
    1411             :             //--------
    1412             :             //Line Two
    1413             :             //--------
    1414             : 
    1415             :             //List0
    1416             : 
    1417           0 :             inPixel4 = _mm_loadu_si128((__m128i*) (ref16_l0 + ref_l0_stride));
    1418           0 :             inPixel5 = _mm_loadu_si128((__m128i*) (ref16_l0 + ref_l0_stride + 8));
    1419             : 
    1420           0 :             out8_2_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel4, 2), _mm_srli_epi16(inPixel5, 2));
    1421             : 
    1422             :             //List1
    1423             : 
    1424           0 :             inPixel4 = _mm_loadu_si128((__m128i*) (ref16_l1 + ref_l1_stride));
    1425           0 :             inPixel5 = _mm_loadu_si128((__m128i*) (ref16_l1 + ref_l1_stride + 8));
    1426             : 
    1427           0 :             out8_2_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel4, 2), _mm_srli_epi16(inPixel5, 2));
    1428             : 
    1429             :             //AVG
    1430           0 :             avg8_2_U8 = _mm_avg_epu8(out8_2_U8_L0, out8_2_U8_L1);
    1431             : 
    1432           0 :             _mm_store_si128((__m128i*)(dst_ptr + dst_stride), avg8_2_U8);
    1433             : 
    1434           0 :             dst_ptr += 2 * dst_stride;
    1435           0 :             ref16_l0 += 2 * ref_l0_stride;
    1436           0 :             ref16_l1 += 2 * ref_l1_stride;
    1437             :         }
    1438             : 
    1439           0 :         if (sub_pred) {
    1440           0 :             ref16_l0 -= (ref_l0_stride >> 1);
    1441           0 :             ref16_l1 -= (ref_l1_stride >> 1);
    1442           0 :             dst_ptr -= (dst_stride >> 1);
    1443             :             //List0
    1444           0 :             inPixel0 = _mm_loadu_si128((__m128i*)  ref16_l0);
    1445           0 :             inPixel1 = _mm_loadu_si128((__m128i*) (ref16_l0 + 8));
    1446           0 :             out8_0_U8_L0 = _mm_packus_epi16(_mm_srli_epi16(inPixel0, 2), _mm_srli_epi16(inPixel1, 2));
    1447             :             //List1
    1448           0 :             inPixel0 = _mm_loadu_si128((__m128i*) ref16_l1);
    1449           0 :             inPixel1 = _mm_loadu_si128((__m128i*)(ref16_l1 + 8));
    1450           0 :             out8_0_U8_L1 = _mm_packus_epi16(_mm_srli_epi16(inPixel0, 2), _mm_srli_epi16(inPixel1, 2));
    1451             :             //AVG
    1452           0 :             avg8_0_U8 = _mm_avg_epu8(out8_0_U8_L0, out8_0_U8_L1);
    1453             :             _mm_store_si128((__m128i*) dst_ptr, avg8_0_U8);
    1454             :         }
    1455             :     }
    1456           0 :     else if (width == 32)
    1457             :     {
    1458             :         __m256i inVal16b_0, inVal16b_1;
    1459             :         __m256i data8b_32_0_L0, data8b_32_0_L1;
    1460             :         __m256i avg8b_32_0;
    1461             : 
    1462           0 :         for (y = 0; y < height; y += 2)
    1463             :         {
    1464             :             //--------
    1465             :             //Line One
    1466             :             //--------
    1467             : 
    1468             :             //List0
    1469           0 :             inVal16b_0 = _mm256_loadu_si256((__m256i*) ref16_l0);
    1470           0 :             inVal16b_1 = _mm256_loadu_si256((__m256i*)(ref16_l0 + 16));
    1471           0 :             data8b_32_0_L0 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_0, 2), _mm256_srli_epi16(inVal16b_1, 2));
    1472             :             //List1
    1473           0 :             inVal16b_0 = _mm256_loadu_si256((__m256i*) ref16_l1);
    1474           0 :             inVal16b_1 = _mm256_loadu_si256((__m256i*)(ref16_l1 + 16));
    1475           0 :             data8b_32_0_L1 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_0, 2), _mm256_srli_epi16(inVal16b_1, 2));
    1476             : 
    1477             :             //Avg
    1478           0 :             avg8b_32_0 = _mm256_avg_epu8(data8b_32_0_L0, data8b_32_0_L1);
    1479             : 
    1480           0 :             avg8b_32_0 = _mm256_permute4x64_epi64(avg8b_32_0, 216);
    1481             : 
    1482             :             _mm256_storeu_si256((__m256i *)(dst_ptr), avg8b_32_0);
    1483             : 
    1484             :             //--------
    1485             :             //Line Two
    1486             :             //--------
    1487             :               //List0
    1488           0 :             inVal16b_0 = _mm256_loadu_si256((__m256i*)(ref16_l0 + ref_l0_stride));
    1489           0 :             inVal16b_1 = _mm256_loadu_si256((__m256i*)(ref16_l0 + ref_l0_stride + 16));
    1490             : 
    1491           0 :             data8b_32_0_L0 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_0, 2), _mm256_srli_epi16(inVal16b_1, 2));
    1492             : 
    1493             :             //List1
    1494           0 :             inVal16b_0 = _mm256_loadu_si256((__m256i*)(ref16_l1 + ref_l1_stride));
    1495           0 :             inVal16b_1 = _mm256_loadu_si256((__m256i*)(ref16_l1 + ref_l1_stride + 16));
    1496             : 
    1497           0 :             data8b_32_0_L1 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_0, 2), _mm256_srli_epi16(inVal16b_1, 2));
    1498             : 
    1499             :             //Avg
    1500           0 :             avg8b_32_0 = _mm256_avg_epu8(data8b_32_0_L0, data8b_32_0_L1);
    1501             : 
    1502           0 :             avg8b_32_0 = _mm256_permute4x64_epi64(avg8b_32_0, 216);
    1503             : 
    1504           0 :             _mm256_storeu_si256((__m256i *)(dst_ptr + dst_stride), avg8b_32_0);
    1505             : 
    1506           0 :             dst_ptr += 2 * dst_stride;
    1507           0 :             ref16_l0 += 2 * ref_l0_stride;
    1508           0 :             ref16_l1 += 2 * ref_l1_stride;
    1509             :         }
    1510             : 
    1511           0 :         if (sub_pred) {
    1512           0 :             ref16_l0 -= (ref_l0_stride >> 1);
    1513           0 :             ref16_l1 -= (ref_l1_stride >> 1);
    1514           0 :             dst_ptr -= (dst_stride >> 1);
    1515             :             //List0
    1516           0 :             inVal16b_0 = _mm256_loadu_si256((__m256i*) ref16_l0);
    1517           0 :             inVal16b_1 = _mm256_loadu_si256((__m256i*)(ref16_l0 + 16));
    1518           0 :             data8b_32_0_L0 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_0, 2), _mm256_srli_epi16(inVal16b_1, 2));
    1519             :             //List1
    1520           0 :             inVal16b_0 = _mm256_loadu_si256((__m256i*) ref16_l1);
    1521           0 :             inVal16b_1 = _mm256_loadu_si256((__m256i*)(ref16_l1 + 16));
    1522           0 :             data8b_32_0_L1 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_0, 2), _mm256_srli_epi16(inVal16b_1, 2));
    1523             :             //Avg
    1524           0 :             avg8b_32_0 = _mm256_avg_epu8(data8b_32_0_L0, data8b_32_0_L1);
    1525           0 :             avg8b_32_0 = _mm256_permute4x64_epi64(avg8b_32_0, 216);
    1526             :             _mm256_storeu_si256((__m256i *)(dst_ptr), avg8b_32_0);
    1527             :         }
    1528             :     }
    1529           0 :     else if (width == 64)
    1530             :     {
    1531             :         __m256i inVal16b_0, inVal16b_1, inVal16b_2, inVal16b_3;
    1532             :         __m256i data8b_32_0_L0, data8b_32_1_L0, data8b_32_0_L1, data8b_32_1_L1;
    1533             :         __m256i avg8b_32_0, avg8b_32_1;
    1534             : 
    1535           0 :         for (y = 0; y < height; ++y)
    1536             :         {
    1537             :             //List0
    1538           0 :             inVal16b_0 = _mm256_loadu_si256((__m256i*) ref16_l0);
    1539           0 :             inVal16b_1 = _mm256_loadu_si256((__m256i*)(ref16_l0 + 16));
    1540           0 :             inVal16b_2 = _mm256_loadu_si256((__m256i*)(ref16_l0 + 32));
    1541           0 :             inVal16b_3 = _mm256_loadu_si256((__m256i*)(ref16_l0 + 48));
    1542           0 :             data8b_32_0_L0 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_0, 2), _mm256_srli_epi16(inVal16b_1, 2));
    1543           0 :             data8b_32_1_L0 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_2, 2), _mm256_srli_epi16(inVal16b_3, 2));
    1544             :             //List1
    1545           0 :             inVal16b_0 = _mm256_loadu_si256((__m256i*) ref16_l1);
    1546           0 :             inVal16b_1 = _mm256_loadu_si256((__m256i*)(ref16_l1 + 16));
    1547           0 :             inVal16b_2 = _mm256_loadu_si256((__m256i*)(ref16_l1 + 32));
    1548           0 :             inVal16b_3 = _mm256_loadu_si256((__m256i*)(ref16_l1 + 48));
    1549           0 :             data8b_32_0_L1 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_0, 2), _mm256_srli_epi16(inVal16b_1, 2));
    1550           0 :             data8b_32_1_L1 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_2, 2), _mm256_srli_epi16(inVal16b_3, 2));
    1551             : 
    1552             :             //Avg
    1553           0 :             avg8b_32_0 = _mm256_avg_epu8(data8b_32_0_L0, data8b_32_0_L1);
    1554           0 :             avg8b_32_1 = _mm256_avg_epu8(data8b_32_1_L0, data8b_32_1_L1);
    1555             : 
    1556           0 :             avg8b_32_0 = _mm256_permute4x64_epi64(avg8b_32_0, 216);
    1557           0 :             avg8b_32_1 = _mm256_permute4x64_epi64(avg8b_32_1, 216);
    1558             : 
    1559             :             _mm256_storeu_si256((__m256i *)(dst_ptr), avg8b_32_0);
    1560           0 :             _mm256_storeu_si256((__m256i *)(dst_ptr + 32), avg8b_32_1);
    1561             : 
    1562           0 :             dst_ptr += dst_stride;
    1563           0 :             ref16_l0 += ref_l0_stride;
    1564           0 :             ref16_l1 += ref_l1_stride;
    1565             :         }
    1566             : 
    1567           0 :         if (sub_pred) {
    1568           0 :             ref16_l0 -= (ref_l0_stride >> 1);
    1569           0 :             ref16_l1 -= (ref_l1_stride >> 1);
    1570           0 :             dst_ptr -= (dst_stride >> 1);
    1571             :             //List0
    1572           0 :             inVal16b_0 = _mm256_loadu_si256((__m256i*) ref16_l0);
    1573           0 :             inVal16b_1 = _mm256_loadu_si256((__m256i*)(ref16_l0 + 16));
    1574           0 :             inVal16b_2 = _mm256_loadu_si256((__m256i*)(ref16_l0 + 32));
    1575           0 :             inVal16b_3 = _mm256_loadu_si256((__m256i*)(ref16_l0 + 48));
    1576           0 :             data8b_32_0_L0 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_0, 2), _mm256_srli_epi16(inVal16b_1, 2));
    1577           0 :             data8b_32_1_L0 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_2, 2), _mm256_srli_epi16(inVal16b_3, 2));
    1578             :             //List1
    1579           0 :             inVal16b_0 = _mm256_loadu_si256((__m256i*) ref16_l1);
    1580           0 :             inVal16b_1 = _mm256_loadu_si256((__m256i*)(ref16_l1 + 16));
    1581           0 :             inVal16b_2 = _mm256_loadu_si256((__m256i*)(ref16_l1 + 32));
    1582           0 :             inVal16b_3 = _mm256_loadu_si256((__m256i*)(ref16_l1 + 48));
    1583           0 :             data8b_32_0_L1 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_0, 2), _mm256_srli_epi16(inVal16b_1, 2));
    1584           0 :             data8b_32_1_L1 = _mm256_packus_epi16(_mm256_srli_epi16(inVal16b_2, 2), _mm256_srli_epi16(inVal16b_3, 2));
    1585             : 
    1586             :             //Avg
    1587           0 :             avg8b_32_0 = _mm256_avg_epu8(data8b_32_0_L0, data8b_32_0_L1);
    1588           0 :             avg8b_32_1 = _mm256_avg_epu8(data8b_32_1_L0, data8b_32_1_L1);
    1589             : 
    1590           0 :             avg8b_32_0 = _mm256_permute4x64_epi64(avg8b_32_0, 216);
    1591           0 :             avg8b_32_1 = _mm256_permute4x64_epi64(avg8b_32_1, 216);
    1592             : 
    1593             :             _mm256_storeu_si256((__m256i *)(dst_ptr), avg8b_32_0);
    1594           0 :             _mm256_storeu_si256((__m256i *)(dst_ptr + 32), avg8b_32_1);
    1595             :         }
    1596             :     }
    1597             : 
    1598           0 :     return;
    1599             : }
    1600             : 
    1601           0 : void picture_addition_kernel4x4_av1_sse2_intrin(
    1602             :     uint8_t  *pred_ptr,
    1603             :     uint32_t  pred_stride,
    1604             :     int32_t *residual_ptr,
    1605             :     uint32_t  residual_stride,
    1606             :     uint8_t  *recon_ptr,
    1607             :     uint32_t  recon_stride,
    1608             :     uint32_t  width,
    1609             :     uint32_t  height,
    1610             :     int32_t     bd)
    1611             : {
    1612             :     __m128i predReg, xmm0, recon_0_7, resReg;
    1613             :     uint32_t y;
    1614           0 :     xmm0 = _mm_setzero_si128();
    1615             : 
    1616           0 :     for (y = 0; y < 4; ++y) {
    1617           0 :         predReg = _mm_cvtsi32_si128(*(uint32_t *)pred_ptr);
    1618           0 :         predReg = _mm_unpacklo_epi8(predReg, xmm0);
    1619           0 :         predReg = _mm_unpacklo_epi16(predReg, xmm0);
    1620           0 :         resReg = _mm_loadu_si128((__m128i *)residual_ptr);
    1621           0 :         resReg = _mm_add_epi32(resReg, predReg);
    1622           0 :         recon_0_7 = _mm_packus_epi32(resReg, xmm0);
    1623           0 :         recon_0_7 = _mm_packus_epi16(recon_0_7, xmm0);
    1624           0 :         *(uint32_t *)recon_ptr = _mm_cvtsi128_si32(recon_0_7);
    1625           0 :         pred_ptr += pred_stride;
    1626           0 :         residual_ptr += residual_stride;
    1627           0 :         recon_ptr += recon_stride;
    1628             :     }
    1629             :     (void)width;
    1630             :     (void)height;
    1631             :     (void)bd;
    1632             : 
    1633           0 :     return;
    1634             : }
    1635           0 : void picture_addition_kernel8x8_av1_sse2_intrin(
    1636             :     uint8_t  *pred_ptr,
    1637             :     uint32_t  pred_stride,
    1638             :     int32_t *residual_ptr,
    1639             :     uint32_t  residual_stride,
    1640             :     uint8_t  *recon_ptr,
    1641             :     uint32_t  recon_stride,
    1642             :     uint32_t  width,
    1643             :     uint32_t  height,
    1644             :     int32_t     bd)
    1645             : {
    1646             :     __m256i predReg, resReg, recon_0_7, xmm0;
    1647             :     __m128i predReg_128, predReg_128Lo, predReg_128Hi, xmm0_128, recon_0_7_128;
    1648             :     uint32_t y;
    1649           0 :     xmm0_128 = _mm_setzero_si128();
    1650           0 :     xmm0 = _mm256_setzero_si256();
    1651             : 
    1652           0 :     for (y = 0; y < 8; ++y) {
    1653           0 :         predReg_128 = _mm_cvtsi64_si128(*(uint64_t *)pred_ptr);
    1654           0 :         predReg_128 = _mm_unpacklo_epi8(predReg_128, xmm0_128);
    1655           0 :         predReg_128Lo = _mm_unpacklo_epi16(predReg_128, xmm0_128);
    1656           0 :         predReg_128Hi = _mm_unpackhi_epi16(predReg_128, xmm0_128);
    1657           0 :         predReg = _mm256_set_m128i(predReg_128Hi, predReg_128Lo);
    1658           0 :         resReg = _mm256_loadu_si256((__m256i*)residual_ptr);
    1659           0 :         resReg = _mm256_add_epi32(predReg, resReg);
    1660           0 :         recon_0_7 = _mm256_packus_epi32(resReg, xmm0);
    1661           0 :         recon_0_7_128 = _mm_slli_si128(_mm256_extracti128_si256(recon_0_7, 1), 8);
    1662           0 :         recon_0_7_128 = _mm_or_si128(_mm256_castsi256_si128(recon_0_7), recon_0_7_128);
    1663           0 :         recon_0_7_128 = _mm_packus_epi16(recon_0_7_128, xmm0_128);
    1664           0 :         *(uint64_t *)recon_ptr = _mm_cvtsi128_si64(recon_0_7_128);
    1665           0 :         pred_ptr += pred_stride;
    1666           0 :         residual_ptr += residual_stride;
    1667           0 :         recon_ptr += recon_stride;
    1668             :     }
    1669             :     (void)width;
    1670             :     (void)height;
    1671             :     (void)bd;
    1672             : 
    1673           0 :     return;
    1674             : }
    1675           0 : void picture_addition_kernel16x16_av1_sse2_intrin(
    1676             :     uint8_t  *pred_ptr,
    1677             :     uint32_t  pred_stride,
    1678             :     int32_t *residual_ptr,
    1679             :     uint32_t  residual_stride,
    1680             :     uint8_t  *recon_ptr,
    1681             :     uint32_t  recon_stride,
    1682             :     uint32_t  width,
    1683             :     uint32_t  height,
    1684             :     int32_t     bd)
    1685             : {
    1686             :     __m256i resReg, recon_0_7, xmm0, predRegLo, predRegHi, resRegLo, resRegHi;
    1687             :     __m128i predReg_128, predReg_128Lo, predReg_128Hi, xmm0_128, predReg_128Lo16Lo, predReg_128Lo16Hi, predReg_128Hi16Lo, predReg_128Hi16Hi;
    1688             :     uint32_t y;
    1689           0 :     xmm0_128 = _mm_setzero_si128();
    1690           0 :     xmm0 = _mm256_setzero_si256();
    1691             : 
    1692           0 :     for (y = 0; y < 16; ++y) {
    1693           0 :         predReg_128 = _mm_loadu_si128((__m128i *)pred_ptr);
    1694           0 :         predReg_128Lo = _mm_unpacklo_epi8(predReg_128, xmm0_128);
    1695           0 :         predReg_128Hi = _mm_unpackhi_epi8(predReg_128, xmm0_128);
    1696           0 :         predReg_128Lo16Lo = _mm_unpacklo_epi16(predReg_128Lo, xmm0_128);
    1697           0 :         predReg_128Lo16Hi = _mm_unpackhi_epi16(predReg_128Lo, xmm0_128);
    1698           0 :         predReg_128Hi16Lo = _mm_unpacklo_epi16(predReg_128Hi, xmm0_128);
    1699           0 :         predReg_128Hi16Hi = _mm_unpackhi_epi16(predReg_128Hi, xmm0_128);
    1700           0 :         predRegLo = _mm256_set_m128i(predReg_128Lo16Hi, predReg_128Lo16Lo);
    1701           0 :         predRegHi = _mm256_set_m128i(predReg_128Hi16Hi, predReg_128Hi16Lo);
    1702           0 :         resRegLo = _mm256_loadu_si256((__m256i*)residual_ptr);
    1703           0 :         resRegHi = _mm256_loadu_si256((__m256i*)(residual_ptr + 8));
    1704           0 :         predRegLo = _mm256_add_epi32(predRegLo, resRegLo);
    1705           0 :         predRegHi = _mm256_add_epi32(predRegHi, resRegHi);
    1706           0 :         resReg = _mm256_packus_epi32(predRegLo, predRegHi);
    1707           0 :         resReg = _mm256_packus_epi16(resReg, xmm0);
    1708           0 :         recon_0_7 = _mm256_shuffle_epi32(resReg, 0xD8);
    1709           0 :         predReg_128Hi = _mm256_extracti128_si256(recon_0_7, 1);
    1710           0 :         predReg_128Lo = _mm_slli_epi64(predReg_128Hi, 32);
    1711           0 :         predReg_128Lo = _mm_or_si128(_mm256_castsi256_si128(recon_0_7), predReg_128Lo);
    1712             :         _mm_storeu_si128((__m128i*) (recon_ptr), predReg_128Lo);
    1713           0 :         pred_ptr += pred_stride;
    1714           0 :         residual_ptr += residual_stride;
    1715           0 :         recon_ptr += recon_stride;
    1716             :     }
    1717             :     (void)width;
    1718             :     (void)height;
    1719             :     (void)bd;
    1720             : 
    1721           0 :     return;
    1722             : }
    1723             : 
    1724           0 : void picture_addition_kernel32x32_av1_sse2_intrin(
    1725             :     uint8_t  *pred_ptr,
    1726             :     uint32_t  pred_stride,
    1727             :     int32_t *residual_ptr,
    1728             :     uint32_t  residual_stride,
    1729             :     uint8_t  *recon_ptr,
    1730             :     uint32_t  recon_stride,
    1731             :     uint32_t  width,
    1732             :     uint32_t  height,
    1733             :     int32_t     bd)
    1734             : {
    1735             :     __m256i predReg, recon_0_7, xmm0, resReg, predReg_Lo, predReg_Hi,
    1736             :         predReg_Lo16Lo, predReg_Lo16Hi, predReg_Hi16Lo, predReg_Hi16Hi, resReg1, resReg2, resReg3, resReg4;
    1737             :     __m128i  predReg_128Lo, predReg_128Hi;
    1738             :     uint32_t y;
    1739           0 :     xmm0 = _mm256_setzero_si256();
    1740             : 
    1741           0 :     for (y = 0; y < 32; ++y) {
    1742           0 :         predReg = _mm256_loadu_si256((__m256i*)pred_ptr);
    1743           0 :         predReg_Lo = _mm256_unpacklo_epi8(predReg, xmm0);
    1744           0 :         predReg_Hi = _mm256_unpackhi_epi8(predReg, xmm0);
    1745           0 :         predReg_Lo16Lo = _mm256_unpacklo_epi16(predReg_Lo, xmm0);
    1746           0 :         predReg_Lo16Hi = _mm256_unpackhi_epi16(predReg_Lo, xmm0);
    1747           0 :         predReg_Hi16Lo = _mm256_unpacklo_epi16(predReg_Hi, xmm0);
    1748           0 :         predReg_Hi16Hi = _mm256_unpackhi_epi16(predReg_Hi, xmm0);
    1749             : 
    1750           0 :         predReg_Lo = _mm256_set_m128i(_mm256_castsi256_si128(predReg_Lo16Hi), _mm256_castsi256_si128(predReg_Lo16Lo));
    1751           0 :         predReg_Hi = _mm256_set_m128i(_mm256_castsi256_si128(predReg_Hi16Hi), _mm256_castsi256_si128(predReg_Hi16Lo));
    1752           0 :         predReg_Lo16Lo = _mm256_set_m128i(_mm256_extracti128_si256(predReg_Lo16Hi, 1), _mm256_extracti128_si256(predReg_Lo16Lo, 1));
    1753           0 :         predReg_Hi16Hi = _mm256_set_m128i(_mm256_extracti128_si256(predReg_Hi16Hi, 1), _mm256_extracti128_si256(predReg_Hi16Lo, 1));
    1754             : 
    1755           0 :         resReg1 = _mm256_loadu_si256((__m256i*)residual_ptr);
    1756           0 :         resReg2 = _mm256_loadu_si256((__m256i*)(residual_ptr + 8));
    1757           0 :         resReg3 = _mm256_loadu_si256((__m256i*)(residual_ptr + 16));
    1758           0 :         resReg4 = _mm256_loadu_si256((__m256i*)(residual_ptr + 24));
    1759             : 
    1760           0 :         resReg1 = _mm256_add_epi32(predReg_Lo, resReg1);
    1761           0 :         resReg2 = _mm256_add_epi32(predReg_Hi, resReg2);
    1762           0 :         resReg3 = _mm256_add_epi32(predReg_Lo16Lo, resReg3);
    1763           0 :         resReg4 = _mm256_add_epi32(predReg_Hi16Hi, resReg4);
    1764             : 
    1765           0 :         resReg = _mm256_packus_epi32(resReg1, resReg2);
    1766           0 :         resReg = _mm256_packus_epi16(resReg, xmm0);
    1767           0 :         recon_0_7 = _mm256_shuffle_epi32(resReg, 0xD8);
    1768           0 :         predReg_128Hi = _mm256_extracti128_si256(recon_0_7, 1);
    1769           0 :         predReg_128Lo = _mm_slli_epi64(predReg_128Hi, 32);
    1770           0 :         predReg_128Lo = _mm_or_si128(_mm256_castsi256_si128(recon_0_7), predReg_128Lo);
    1771             : 
    1772           0 :         resReg = _mm256_packus_epi32(resReg3, resReg4);
    1773           0 :         resReg = _mm256_packus_epi16(resReg, xmm0);
    1774           0 :         recon_0_7 = _mm256_shuffle_epi32(resReg, 0xD8);
    1775           0 :         predReg_128Hi = _mm256_extracti128_si256(recon_0_7, 1);
    1776           0 :         predReg_128Hi = _mm_slli_epi64(predReg_128Hi, 32);
    1777           0 :         predReg_128Hi = _mm_or_si128(_mm256_castsi256_si128(recon_0_7), predReg_128Hi);
    1778           0 :         recon_0_7 = _mm256_set_m128i(predReg_128Hi, predReg_128Lo);
    1779             :         _mm256_storeu_si256((__m256i*)recon_ptr, recon_0_7);
    1780             : 
    1781           0 :         pred_ptr += pred_stride;
    1782           0 :         residual_ptr += residual_stride;
    1783           0 :         recon_ptr += recon_stride;
    1784             :     }
    1785             :     (void)width;
    1786             :     (void)height;
    1787             :     (void)bd;
    1788             : 
    1789           0 :     return;
    1790             : }
    1791           0 : void picture_addition_kernel64x64_av1_sse2_intrin(
    1792             :     uint8_t  *pred_ptr,
    1793             :     uint32_t  pred_stride,
    1794             :     int32_t *residual_ptr,
    1795             :     uint32_t  residual_stride,
    1796             :     uint8_t  *recon_ptr,
    1797             :     uint32_t  recon_stride,
    1798             :     uint32_t  width,
    1799             :     uint32_t  height,
    1800             :     int32_t     bd)
    1801             : {
    1802             :     __m256i predReg, recon_0_7, xmm0, resReg, predReg_Lo, predReg_Hi,
    1803             :         predReg_Lo16Lo, predReg_Lo16Hi, predReg_Hi16Lo, predReg_Hi16Hi, resReg1, resReg2, resReg3, resReg4;
    1804             :     __m128i  predReg_128Lo, predReg_128Hi;
    1805             :     uint32_t y;
    1806           0 :     xmm0 = _mm256_setzero_si256();
    1807             : 
    1808           0 :     for (y = 0; y < 64; ++y) {
    1809           0 :         predReg = _mm256_loadu_si256((__m256i*)pred_ptr);
    1810           0 :         predReg_Lo = _mm256_unpacklo_epi8(predReg, xmm0);
    1811           0 :         predReg_Hi = _mm256_unpackhi_epi8(predReg, xmm0);
    1812           0 :         predReg_Lo16Lo = _mm256_unpacklo_epi16(predReg_Lo, xmm0);
    1813           0 :         predReg_Lo16Hi = _mm256_unpackhi_epi16(predReg_Lo, xmm0);
    1814           0 :         predReg_Hi16Lo = _mm256_unpacklo_epi16(predReg_Hi, xmm0);
    1815           0 :         predReg_Hi16Hi = _mm256_unpackhi_epi16(predReg_Hi, xmm0);
    1816             : 
    1817           0 :         predReg_Lo = _mm256_set_m128i(_mm256_castsi256_si128(predReg_Lo16Hi), _mm256_castsi256_si128(predReg_Lo16Lo));
    1818           0 :         predReg_Hi = _mm256_set_m128i(_mm256_castsi256_si128(predReg_Hi16Hi), _mm256_castsi256_si128(predReg_Hi16Lo));
    1819           0 :         predReg_Lo16Lo = _mm256_set_m128i(_mm256_extracti128_si256(predReg_Lo16Hi, 1), _mm256_extracti128_si256(predReg_Lo16Lo, 1));
    1820           0 :         predReg_Hi16Hi = _mm256_set_m128i(_mm256_extracti128_si256(predReg_Hi16Hi, 1), _mm256_extracti128_si256(predReg_Hi16Lo, 1));
    1821             : 
    1822           0 :         resReg1 = _mm256_loadu_si256((__m256i*)residual_ptr);
    1823           0 :         resReg2 = _mm256_loadu_si256((__m256i*)(residual_ptr + 8));
    1824           0 :         resReg3 = _mm256_loadu_si256((__m256i*)(residual_ptr + 16));
    1825           0 :         resReg4 = _mm256_loadu_si256((__m256i*)(residual_ptr + 24));
    1826             : 
    1827           0 :         resReg1 = _mm256_add_epi32(predReg_Lo, resReg1);
    1828           0 :         resReg2 = _mm256_add_epi32(predReg_Hi, resReg2);
    1829           0 :         resReg3 = _mm256_add_epi32(predReg_Lo16Lo, resReg3);
    1830           0 :         resReg4 = _mm256_add_epi32(predReg_Hi16Hi, resReg4);
    1831             : 
    1832           0 :         resReg = _mm256_packus_epi32(resReg1, resReg2);
    1833           0 :         resReg = _mm256_packus_epi16(resReg, xmm0);
    1834           0 :         recon_0_7 = _mm256_shuffle_epi32(resReg, 0xD8);
    1835           0 :         predReg_128Hi = _mm256_extracti128_si256(recon_0_7, 1);
    1836           0 :         predReg_128Lo = _mm_slli_epi64(predReg_128Hi, 32);
    1837           0 :         predReg_128Lo = _mm_or_si128(_mm256_castsi256_si128(recon_0_7), predReg_128Lo);
    1838             : 
    1839           0 :         resReg = _mm256_packus_epi32(resReg3, resReg4);
    1840           0 :         resReg = _mm256_packus_epi16(resReg, xmm0);
    1841           0 :         recon_0_7 = _mm256_shuffle_epi32(resReg, 0xD8);
    1842           0 :         predReg_128Hi = _mm256_extracti128_si256(recon_0_7, 1);
    1843           0 :         predReg_128Hi = _mm_slli_epi64(predReg_128Hi, 32);
    1844           0 :         predReg_128Hi = _mm_or_si128(_mm256_castsi256_si128(recon_0_7), predReg_128Hi);
    1845           0 :         recon_0_7 = _mm256_set_m128i(predReg_128Hi, predReg_128Lo);
    1846             :         _mm256_storeu_si256((__m256i*)recon_ptr, recon_0_7);
    1847             : 
    1848           0 :         predReg = _mm256_loadu_si256((__m256i*)(pred_ptr + 32));
    1849           0 :         predReg_Lo = _mm256_unpacklo_epi8(predReg, xmm0);
    1850           0 :         predReg_Hi = _mm256_unpackhi_epi8(predReg, xmm0);
    1851           0 :         predReg_Lo16Lo = _mm256_unpacklo_epi16(predReg_Lo, xmm0);
    1852           0 :         predReg_Lo16Hi = _mm256_unpackhi_epi16(predReg_Lo, xmm0);
    1853           0 :         predReg_Hi16Lo = _mm256_unpacklo_epi16(predReg_Hi, xmm0);
    1854           0 :         predReg_Hi16Hi = _mm256_unpackhi_epi16(predReg_Hi, xmm0);
    1855             : 
    1856           0 :         predReg_Lo = _mm256_set_m128i(_mm256_castsi256_si128(predReg_Lo16Hi), _mm256_castsi256_si128(predReg_Lo16Lo));
    1857           0 :         predReg_Hi = _mm256_set_m128i(_mm256_castsi256_si128(predReg_Hi16Hi), _mm256_castsi256_si128(predReg_Hi16Lo));
    1858           0 :         predReg_Lo16Lo = _mm256_set_m128i(_mm256_extracti128_si256(predReg_Lo16Hi, 1), _mm256_extracti128_si256(predReg_Lo16Lo, 1));
    1859           0 :         predReg_Hi16Hi = _mm256_set_m128i(_mm256_extracti128_si256(predReg_Hi16Hi, 1), _mm256_extracti128_si256(predReg_Hi16Lo, 1));
    1860             : 
    1861           0 :         resReg1 = _mm256_loadu_si256((__m256i*)(residual_ptr + 32));
    1862           0 :         resReg2 = _mm256_loadu_si256((__m256i*)(residual_ptr + 40));
    1863           0 :         resReg3 = _mm256_loadu_si256((__m256i*)(residual_ptr + 48));
    1864           0 :         resReg4 = _mm256_loadu_si256((__m256i*)(residual_ptr + 56));
    1865             : 
    1866           0 :         resReg1 = _mm256_add_epi32(predReg_Lo, resReg1);
    1867           0 :         resReg2 = _mm256_add_epi32(predReg_Hi, resReg2);
    1868           0 :         resReg3 = _mm256_add_epi32(predReg_Lo16Lo, resReg3);
    1869           0 :         resReg4 = _mm256_add_epi32(predReg_Hi16Hi, resReg4);
    1870             : 
    1871           0 :         resReg = _mm256_packus_epi32(resReg1, resReg2);
    1872           0 :         resReg = _mm256_packus_epi16(resReg, xmm0);
    1873           0 :         recon_0_7 = _mm256_shuffle_epi32(resReg, 0xD8);
    1874           0 :         predReg_128Hi = _mm256_extracti128_si256(recon_0_7, 1);
    1875           0 :         predReg_128Lo = _mm_slli_epi64(predReg_128Hi, 32);
    1876           0 :         predReg_128Lo = _mm_or_si128(_mm256_castsi256_si128(recon_0_7), predReg_128Lo);
    1877             : 
    1878           0 :         resReg = _mm256_packus_epi32(resReg3, resReg4);
    1879           0 :         resReg = _mm256_packus_epi16(resReg, xmm0);
    1880           0 :         recon_0_7 = _mm256_shuffle_epi32(resReg, 0xD8);
    1881           0 :         predReg_128Hi = _mm256_extracti128_si256(recon_0_7, 1);
    1882           0 :         predReg_128Hi = _mm_slli_epi64(predReg_128Hi, 32);
    1883           0 :         predReg_128Hi = _mm_or_si128(_mm256_castsi256_si128(recon_0_7), predReg_128Hi);
    1884           0 :         recon_0_7 = _mm256_set_m128i(predReg_128Hi, predReg_128Lo);
    1885           0 :         _mm256_storeu_si256((__m256i*)(recon_ptr + 32), recon_0_7);
    1886             : 
    1887           0 :         pred_ptr += pred_stride;
    1888           0 :         residual_ptr += residual_stride;
    1889           0 :         recon_ptr += recon_stride;
    1890             :     }
    1891             :     (void)width;
    1892             :     (void)height;
    1893             :     (void)bd;
    1894             : 
    1895           0 :     return;
    1896             : }
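
Note: despite the _sse2 suffix in their names, the picture-addition kernels above use 256-bit AVX2 registers. They all perform the same per-pixel operation, adding a 32-bit residual to an 8-bit prediction and saturating the result to the unsigned 8-bit range; they differ only in block width and in how the lanes are repacked before the store. A minimal scalar sketch of that operation (hypothetical helper, not part of this file):

    /* Scalar sketch of the picture-addition kernels (assumed helper name, reference only). */
    static void picture_addition_scalar_ref(
        const uint8_t *pred_ptr, uint32_t pred_stride,
        const int32_t *residual_ptr, uint32_t residual_stride,
        uint8_t *recon_ptr, uint32_t recon_stride,
        uint32_t width, uint32_t height)
    {
        for (uint32_t y = 0; y < height; y++) {
            for (uint32_t x = 0; x < width; x++) {
                const int32_t v = (int32_t)pred_ptr[x] + residual_ptr[x];
                recon_ptr[x] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v)); /* packus-style saturation */
            }
            pred_ptr     += pred_stride;
            residual_ptr += residual_stride;
            recon_ptr    += recon_stride;
        }
    }
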
    1897             : 
    1898     5292800 : void full_distortion_kernel32_bits_avx2(
    1899             :     int32_t  *coeff,
    1900             :     uint32_t   coeff_stride,
    1901             :     int32_t  *recon_coeff,
    1902             :     uint32_t   recon_coeff_stride,
    1903             :     uint64_t   distortion_result[DIST_CALC_TOTAL],
    1904             :     uint32_t   area_width,
    1905             :     uint32_t   area_height)
    1906             : {
    1907             :     uint32_t rowCount, col_count;
    1908     5292800 :     __m256i sum1 = _mm256_setzero_si256();
    1909     5292800 :     __m256i sum2 = _mm256_setzero_si256();
    1910             :     __m128i temp1, temp2, temp3;
    1911             : 
    1912     5292800 :     rowCount = area_height;
    1913             :     do {
    1914    38433300 :         int32_t *coeffTemp = coeff;
    1915    38433300 :         int32_t *reconCoeffTemp = recon_coeff;
    1916             : 
    1917    38433300 :         col_count = area_width / 4;
    1918             :         do {
    1919             :             __m128i x0, y0;
    1920             :             __m256i x, y, z;
    1921    80435000 :             x0 = _mm_loadu_si128((__m128i *)(coeffTemp));
    1922    80435000 :             y0 = _mm_loadu_si128((__m128i *)(reconCoeffTemp));
    1923    80435000 :             x = _mm256_cvtepi32_epi64(x0);
    1924    80435000 :             y = _mm256_cvtepi32_epi64(y0);
    1925    80435000 :             z = _mm256_mul_epi32(x, x);
    1926    80435000 :             sum2 = _mm256_add_epi64(sum2, z);
    1927    80435000 :             x = _mm256_sub_epi64(x, y);
    1928    80435000 :             x = _mm256_mul_epi32(x, x);
    1929    80435000 :             sum1 = _mm256_add_epi64(sum1, x); // accumulate the 64-bit squares; matches the _mm_add_epi64 reduction below
    1930    80435000 :             coeffTemp += 4;
    1931    80435000 :             reconCoeffTemp += 4;
    1932    80435000 :         } while (--col_count);
    1933             : 
    1934    38433300 :         coeff += coeff_stride;
    1935    38433300 :         recon_coeff += recon_coeff_stride;
    1936    38433300 :         rowCount -= 1;
    1937    38433300 :     } while (rowCount > 0);
    1938             : 
    1939     5292800 :     temp1 = _mm256_castsi256_si128(sum1);
    1940     5292800 :     temp2 = _mm256_extracti128_si256(sum1, 1);
    1941     5292800 :     temp1 = _mm_add_epi64(temp1, temp2);
    1942     5292800 :     temp2 = _mm_shuffle_epi32(temp1, 0x4e);
    1943     5292800 :     temp3 = _mm_add_epi64(temp1, temp2);
    1944     5292800 :     temp1 = _mm256_castsi256_si128(sum2);
    1945     5292800 :     temp2 = _mm256_extracti128_si256(sum2, 1);
    1946     5292800 :     temp1 = _mm_add_epi64(temp1, temp2);
    1947     5292800 :     temp2 = _mm_shuffle_epi32(temp1, 0x4e);
    1948     5292800 :     temp1 = _mm_add_epi64(temp1, temp2);
    1949     5292800 :     temp1 = _mm_unpacklo_epi64(temp3, temp1);
    1950             : 
    1951             :     _mm_storeu_si128((__m128i *)distortion_result, temp1);
    1952     5292800 : }
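
full_distortion_kernel32_bits_avx2 keeps two running sums per 4-coefficient chunk: sum1 accumulates the squared differences between coeff and recon_coeff, and sum2 accumulates the squared coefficients themselves; the two 64-bit totals are interleaved and stored into distortion_result. A scalar sketch of the same computation (helper name and index order assumed from the store layout above):

    /* Scalar sketch of full_distortion_kernel32_bits_avx2 (assumed helper name). */
    static void full_distortion_scalar_ref(
        const int32_t *coeff, uint32_t coeff_stride,
        const int32_t *recon_coeff, uint32_t recon_coeff_stride,
        uint64_t distortion_result[2],
        uint32_t area_width, uint32_t area_height)
    {
        uint64_t residual_sse = 0; /* sum of (coeff - recon_coeff)^2 */
        uint64_t coeff_energy = 0; /* sum of coeff^2                 */
        for (uint32_t y = 0; y < area_height; y++) {
            for (uint32_t x = 0; x < area_width; x++) {
                const int64_t d = (int64_t)coeff[x] - recon_coeff[x];
                residual_sse += (uint64_t)(d * d);
                coeff_energy += (uint64_t)((int64_t)coeff[x] * coeff[x]);
            }
            coeff       += coeff_stride;
            recon_coeff += recon_coeff_stride;
        }
        distortion_result[0] = residual_sse; /* low 64 bits of the stored pair  */
        distortion_result[1] = coeff_energy; /* high 64 bits of the stored pair */
    }
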
    1953             : 
    1954     8358880 : void full_distortion_kernel_cbf_zero32_bits_avx2(
    1955             :     int32_t  *coeff,
    1956             :     uint32_t   coeff_stride,
    1957             :     int32_t  *recon_coeff,
    1958             :     uint32_t   recon_coeff_stride,
    1959             :     uint64_t   distortion_result[DIST_CALC_TOTAL],
    1960             :     uint32_t   area_width,
    1961             :     uint32_t   area_height)
    1962             : {
    1963             :     uint32_t rowCount, col_count;
    1964     8358880 :     __m256i sum = _mm256_setzero_si256();
    1965             :     __m128i temp1, temp2;
    1966             : 
    1967     8358880 :     rowCount = area_height;
    1968             :     do {
    1969    58311200 :         int32_t *coeffTemp = coeff;
    1970             : 
    1971    58311200 :         col_count = area_width / 4;
    1972             :         do {
    1973             :             __m128i x0;
    1974             :             __m256i y0, z0;
    1975   126494000 :             x0 = _mm_loadu_si128((__m128i *)(coeffTemp));
    1976   126494000 :             coeffTemp += 4;
    1977   126494000 :             y0 = _mm256_cvtepi32_epi64(x0);
    1978   126494000 :             z0 = _mm256_mul_epi32(y0, y0);
    1979   126494000 :             sum = _mm256_add_epi64(sum, z0);
    1980   126494000 :         } while (--col_count);
    1981             : 
    1982    58311200 :         coeff += coeff_stride;
    1983    58311200 :         recon_coeff += coeff_stride;
    1984    58311200 :         rowCount -= 1;
    1985    58311200 :     } while (rowCount > 0);
    1986             : 
    1987     8358880 :     temp1 = _mm256_castsi256_si128(sum);
    1988     8358880 :     temp2 = _mm256_extracti128_si256(sum, 1);
    1989     8358880 :     temp1 = _mm_add_epi64(temp1, temp2);
    1990     8358880 :     temp2 = _mm_shuffle_epi32(temp1, 0x4e);
    1991     8358880 :     temp1 = _mm_add_epi64(temp1, temp2);
    1992             :     _mm_storeu_si128((__m128i *)distortion_result, temp1);
    1993             :     (void)recon_coeff_stride;
    1994     8358880 : }
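
The cbf-zero variant assumes an all-zero reconstruction, so it never reads recon_coeff; both entries of distortion_result then reduce to the coefficient energy, which matches the epilogue above storing the same 64-bit total into both halves of the 128-bit result. A scalar sketch under that assumption:

    /* Scalar sketch of full_distortion_kernel_cbf_zero32_bits_avx2 (assumed helper name). */
    static void full_distortion_cbf_zero_scalar_ref(
        const int32_t *coeff, uint32_t coeff_stride,
        uint64_t distortion_result[2],
        uint32_t area_width, uint32_t area_height)
    {
        uint64_t energy = 0;
        for (uint32_t y = 0; y < area_height; y++) {
            for (uint32_t x = 0; x < area_width; x++)
                energy += (uint64_t)((int64_t)coeff[x] * coeff[x]);
            coeff += coeff_stride;
        }
        distortion_result[0] = energy;
        distortion_result[1] = energy;
    }
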
    1995             : 
    1996   217626000 : static INLINE void residual32_avx2(const uint8_t *const input,
    1997             :     const uint8_t *const pred, int16_t *const residual)
    1998             : {
    1999   217626000 :     const __m256i zero = _mm256_setzero_si256();
    2000   217626000 :     const __m256i in0 = _mm256_loadu_si256((__m256i *)input);
    2001   217626000 :     const __m256i pr0 = _mm256_loadu_si256((__m256i *)pred);
    2002   217626000 :     const __m256i in1 = _mm256_permute4x64_epi64(in0, 0xD8);
    2003   217626000 :     const __m256i pr1 = _mm256_permute4x64_epi64(pr0, 0xD8);
    2004   217626000 :     const __m256i in_lo = _mm256_unpacklo_epi8(in1, zero);
    2005   217626000 :     const __m256i in_hi = _mm256_unpackhi_epi8(in1, zero);
    2006   217626000 :     const __m256i pr_lo = _mm256_unpacklo_epi8(pr1, zero);
    2007   217626000 :     const __m256i pr_hi = _mm256_unpackhi_epi8(pr1, zero);
    2008   217626000 :     const __m256i re_lo = _mm256_sub_epi16(in_lo, pr_lo);
    2009   217626000 :     const __m256i re_hi = _mm256_sub_epi16(in_hi, pr_hi);
    2010             :     _mm256_storeu_si256((__m256i*)(residual + 0 * 16), re_lo);
    2011   217626000 :     _mm256_storeu_si256((__m256i*)(residual + 1 * 16), re_hi);
    2012   217626000 : }
    2013             : 
    2014             : SIMD_INLINE void residual_kernel32_avx2(
    2015             :     const uint8_t *input, const uint32_t input_stride, const uint8_t *pred,
    2016             :     const uint32_t pred_stride, int16_t *residual,
    2017             :     const uint32_t residual_stride, const uint32_t area_height)
    2018             : {
    2019     6153280 :     uint32_t y = area_height;
    2020             : 
    2021             :     do {
    2022   127535000 :         residual32_avx2(input, pred, residual);
    2023   127536000 :         input += input_stride;
    2024   127536000 :         pred += pred_stride;
    2025   127536000 :         residual += residual_stride;
    2026   127536000 :     } while (--y);
    2027     6153500 : }
    2028             : 
    2029             : SIMD_INLINE void residual_kernel64_avx2(
    2030             :     const uint8_t *input, const uint32_t input_stride, const uint8_t *pred,
    2031             :     const uint32_t pred_stride, int16_t *residual,
    2032             :     const uint32_t residual_stride, const uint32_t area_height)
    2033             : {
    2034     1496950 :     uint32_t y = area_height;
    2035             : 
    2036             :     do {
    2037    45290100 :         residual32_avx2(input + 0 * 32, pred + 0 * 32, residual + 0 * 32);
    2038    45289600 :         residual32_avx2(input + 1 * 32, pred + 1 * 32, residual + 1 * 32);
    2039    45290100 :         input += input_stride;
    2040    45290100 :         pred += pred_stride;
    2041    45290100 :         residual += residual_stride;
    2042    45290100 :     } while (--y);
    2043     1496970 : }
    2044             : 
    2045             : SIMD_INLINE void residual_kernel128_avx2(
    2046             :     const uint8_t *input, const uint32_t input_stride, const uint8_t *pred,
    2047             :     const uint32_t pred_stride, int16_t *residual,
    2048             :     const uint32_t residual_stride, const uint32_t area_height)
    2049             : {
    2050           0 :     uint32_t y = area_height;
    2051             : 
    2052             :     do {
    2053           0 :         residual32_avx2(input + 0 * 32, pred + 0 * 32, residual + 0 * 32);
    2054           0 :         residual32_avx2(input + 1 * 32, pred + 1 * 32, residual + 1 * 32);
    2055           0 :         residual32_avx2(input + 2 * 32, pred + 2 * 32, residual + 2 * 32);
    2056           0 :         residual32_avx2(input + 3 * 32, pred + 3 * 32, residual + 3 * 32);
    2057           0 :         input += input_stride;
    2058           0 :         pred += pred_stride;
    2059           0 :         residual += residual_stride;
    2060           0 :     } while (--y);
    2061           0 : }
    2062             : 
    2063    67841300 : void residual_kernel8bit_avx2(
    2064             :     uint8_t   *input,
    2065             :     uint32_t   input_stride,
    2066             :     uint8_t   *pred,
    2067             :     uint32_t   pred_stride,
    2068             :     int16_t  *residual,
    2069             :     uint32_t   residual_stride,
    2070             :     uint32_t   area_width,
    2071             :     uint32_t   area_height)
    2072             : {
    2073    67841300 :     switch (area_width) {
    2074    20595000 :     case 4:
    2075             :         residual_kernel4_avx2(input, input_stride, pred, pred_stride, residual, residual_stride, area_height);
    2076    20589800 :         break;
    2077             : 
    2078    24146600 :     case 8:
    2079             :         residual_kernel8_avx2(input, input_stride, pred, pred_stride, residual, residual_stride, area_height);
    2080    24143700 :         break;
    2081             : 
    2082    15538700 :     case 16:
    2083             :         residual_kernel16_avx2(input, input_stride, pred, pred_stride, residual, residual_stride, area_height);
    2084    15540600 :         break;
    2085             : 
    2086     6153280 :     case 32:
    2087             :         residual_kernel32_avx2(input, input_stride, pred, pred_stride, residual, residual_stride, area_height);
    2088     6153500 :         break;
    2089             : 
    2090     1496950 :     case 64:
    2091             :         residual_kernel64_avx2(input, input_stride, pred, pred_stride, residual, residual_stride, area_height);
    2092     1496970 :         break;
    2093             : 
    2094           0 :     default: // 128
    2095             :         residual_kernel128_avx2(input, input_stride, pred, pred_stride, residual, residual_stride, area_height);
    2096           0 :         break;
    2097             :     }
    2098    67924600 : }
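
residual_kernel8bit_avx2 only dispatches on block width; the residual32/64/128 helpers above (and the 4/8/16 variants defined elsewhere) all compute the same thing, the 8-bit input minus the 8-bit prediction widened to int16. A scalar sketch (assumed helper name, reference only):

    /* Scalar sketch of the 8-bit residual kernels (assumed helper name). */
    static void residual_kernel_scalar_ref(
        const uint8_t *input, uint32_t input_stride,
        const uint8_t *pred, uint32_t pred_stride,
        int16_t *residual, uint32_t residual_stride,
        uint32_t area_width, uint32_t area_height)
    {
        for (uint32_t y = 0; y < area_height; y++) {
            for (uint32_t x = 0; x < area_width; x++)
                residual[x] = (int16_t)input[x] - (int16_t)pred[x];
            input    += input_stride;
            pred     += pred_stride;
            residual += residual_stride;
        }
    }
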
    2099             : 
    2100           0 : uint64_t spatial_full_distortion_kernel4x_n_avx2_intrin(
    2101             :     uint8_t   *input,
    2102             :     uint32_t   input_offset,
    2103             :     uint32_t   input_stride,
    2104             :     uint8_t   *recon,
    2105             :     uint32_t   recon_offset,
    2106             :     uint32_t   recon_stride,
    2107             :     uint32_t   area_width,
    2108             :     uint32_t   area_height)
    2109             : {
    2110           0 :     int32_t row_count = area_height;
    2111           0 :     __m256i sum = _mm256_setzero_si256();
    2112             :     __m128i sum_L, sum_H, s;
    2113           0 :     input += input_offset;
    2114           0 :     recon += recon_offset;
    2115             :     (void)area_width;
    2116             : 
    2117             :     do {
    2118           0 :         const __m128i in0 = _mm_cvtsi32_si128(*(uint32_t *)(input + 0 * input_stride));
    2119           0 :         const __m128i in1 = _mm_cvtsi32_si128(*(uint32_t *)(input + 1 * input_stride));
    2120           0 :         const __m128i re0 = _mm_cvtsi32_si128(*(uint32_t *)(recon + 0 * recon_stride));
    2121           0 :         const __m128i re1 = _mm_cvtsi32_si128(*(uint32_t *)(recon + 1 * recon_stride));
    2122           0 :         const __m256i in = _mm256_setr_m128i(in0, in1);
    2123           0 :         const __m256i re = _mm256_setr_m128i(re0, re1);
    2124           0 :         Distortion_AVX2_INTRIN(in, re, &sum);
    2125           0 :         input += 2 * input_stride;
    2126           0 :         recon += 2 * recon_stride;
    2127           0 :         row_count -= 2;
    2128           0 :     } while (row_count);
    2129             : 
    2130           0 :     sum_L = _mm256_castsi256_si128(sum);
    2131           0 :     sum_H = _mm256_extracti128_si256(sum, 1);
    2132           0 :     s = _mm_add_epi32(sum_L, sum_H);
    2133           0 :     s = _mm_add_epi32(s, _mm_srli_si128(s, 4));
    2134             : 
    2135           0 :     return _mm_cvtsi128_si32(s);
    2136             : }
    2137             : 
    2138           0 : uint64_t spatial_full_distortion_kernel8x_n_avx2_intrin(
    2139             :     uint8_t   *input,
    2140             :     uint32_t   input_offset,
    2141             :     uint32_t   input_stride,
    2142             :     uint8_t   *recon,
    2143             :     uint32_t   recon_offset,
    2144             :     uint32_t   recon_stride,
    2145             :     uint32_t   area_width,
    2146             :     uint32_t   area_height)
    2147             : {
    2148           0 :     int32_t row_count = area_height;
    2149           0 :     __m256i sum = _mm256_setzero_si256();
    2150           0 :     input += input_offset;
    2151           0 :     recon += recon_offset;
    2152             :     (void)area_width;
    2153             : 
    2154             :     do {
    2155           0 :         const __m128i in0 = _mm_loadl_epi64((__m128i *)(input + 0 * input_stride));
    2156           0 :         const __m128i in1 = _mm_loadl_epi64((__m128i *)(input + 1 * input_stride));
    2157           0 :         const __m128i re0 = _mm_loadl_epi64((__m128i *)(recon + 0 * recon_stride));
    2158           0 :         const __m128i re1 = _mm_loadl_epi64((__m128i *)(recon + 1 * recon_stride));
    2159           0 :         const __m256i in = _mm256_setr_m128i(in0, in1);
    2160           0 :         const __m256i re = _mm256_setr_m128i(re0, re1);
    2161           0 :         Distortion_AVX2_INTRIN(in, re, &sum);
    2162           0 :         input += 2 * input_stride;
    2163           0 :         recon += 2 * recon_stride;
    2164           0 :         row_count -= 2;
    2165           0 :     } while (row_count);
    2166             : 
    2167           0 :     return Hadd32_AVX2_INTRIN(sum);
    2168             : }
    2169             : 
    2170           0 : uint64_t spatial_full_distortion_kernel16x_n_avx2_intrin(
    2171             :     uint8_t   *input,
    2172             :     uint32_t   input_offset,
    2173             :     uint32_t   input_stride,
    2174             :     uint8_t   *recon,
    2175             :     uint32_t   recon_offset,
    2176             :     uint32_t   recon_stride,
    2177             :     uint32_t   area_width,
    2178             :     uint32_t   area_height)
    2179             : {
    2180           0 :     int32_t row_count = area_height;
    2181           0 :     __m256i sum = _mm256_setzero_si256();
    2182           0 :     input += input_offset;
    2183           0 :     recon += recon_offset;
    2184             :     (void)area_width;
    2185             : 
    2186             :     do {
    2187           0 :         SpatialFullDistortionKernel16_AVX2_INTRIN(input, recon, &sum);
    2188           0 :         input += input_stride;
    2189           0 :         recon += recon_stride;
    2190           0 :     } while (--row_count);
    2191             : 
    2192           0 :     return Hadd32_AVX2_INTRIN(sum);
    2193             : }
    2194             : 
    2195           0 : static INLINE void SpatialFullDistortionKernel64_AVX2_INTRIN(
    2196             :     const uint8_t *const input, const uint8_t *const recon, __m256i *const sum)
    2197             : {
    2198           0 :     SpatialFullDistortionKernel32_AVX2_INTRIN(input + 0 * 32, recon + 0 * 32, sum);
    2199           0 :     SpatialFullDistortionKernel32_AVX2_INTRIN(input + 1 * 32, recon + 1 * 32, sum);
    2200           0 : }
    2201             : 
    2202           0 : uint64_t spatial_full_distortion_kernel32x_n_avx2_intrin(
    2203             :     uint8_t   *input,
    2204             :     uint32_t   input_offset,
    2205             :     uint32_t   input_stride,
    2206             :     uint8_t   *recon,
    2207             :     uint32_t   recon_offset,
    2208             :     uint32_t   recon_stride,
    2209             :     uint32_t   area_width,
    2210             :     uint32_t   area_height)
    2211             : {
    2212           0 :     int32_t row_count = area_height;
    2213           0 :     __m256i sum = _mm256_setzero_si256();
    2214           0 :     input += input_offset;
    2215           0 :     recon += recon_offset;
    2216             :     (void)area_width;
    2217             : 
    2218             :     do {
    2219           0 :         SpatialFullDistortionKernel32_AVX2_INTRIN(input, recon, &sum);
    2220           0 :         input += input_stride;
    2221           0 :         recon += recon_stride;
    2222           0 :     } while (--row_count);
    2223             : 
    2224           0 :     return Hadd32_AVX2_INTRIN(sum);
    2225             : }
    2226             : 
    2227           0 : uint64_t spatial_full_distortion_kernel64x_n_avx2_intrin(
    2228             :     uint8_t   *input,
    2229             :     uint32_t   input_offset,
    2230             :     uint32_t   input_stride,
    2231             :     uint8_t   *recon,
    2232             :     uint32_t   recon_offset,
    2233             :     uint32_t   recon_stride,
    2234             :     uint32_t   area_width,
    2235             :     uint32_t   area_height)
    2236             : {
    2237           0 :     int32_t row_count = area_height;
    2238           0 :     __m256i sum = _mm256_setzero_si256();
    2239           0 :     input += input_offset;
    2240           0 :     recon += recon_offset;
    2241             :     (void)area_width;
    2242             : 
    2243             :     do {
    2244           0 :         SpatialFullDistortionKernel64_AVX2_INTRIN(input, recon, &sum);
    2245           0 :         input += input_stride;
    2246           0 :         recon += recon_stride;
    2247           0 :     } while (--row_count);
    2248             : 
    2249           0 :     return Hadd32_AVX2_INTRIN(sum);
    2250             : }
    2251             : 
    2252           0 : uint64_t spatial_full_distortion_kernel128x_n_avx2_intrin(
    2253             :     uint8_t   *input,
    2254             :     uint32_t   input_offset,
    2255             :     uint32_t   input_stride,
    2256             :     uint8_t   *recon,
    2257             :     uint32_t   recon_offset,
    2258             :     uint32_t   recon_stride,
    2259             :     uint32_t   area_width,
    2260             :     uint32_t   area_height)
    2261             : {
    2262           0 :     int32_t row_count = area_height;
    2263           0 :     __m256i sum = _mm256_setzero_si256();
    2264           0 :     input += input_offset;
    2265           0 :     recon += recon_offset;
    2266             :     (void)area_width;
    2267             : 
    2268             :     do {
    2269           0 :         SpatialFullDistortionKernel64_AVX2_INTRIN(input + 0 * 64, recon + 0 * 64, &sum);
    2270           0 :         SpatialFullDistortionKernel64_AVX2_INTRIN(input + 1 * 64, recon + 1 * 64, &sum);
    2271           0 :         input += input_stride;
    2272           0 :         recon += recon_stride;
    2273           0 :     } while (--row_count);
    2274             : 
    2275           0 :     return Hadd32_AVX2_INTRIN(sum);
    2276             : }
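
All of the spatial_full_distortion_kernel*x_n variants above compute the same quantity, the sum of squared differences between the 8-bit input and the 8-bit reconstruction over the block; they differ only in how many columns each row iteration loads. A scalar sketch (assumed helper name, reference only):

    /* Scalar sketch of the spatial full-distortion kernels (assumed helper name). */
    static uint64_t spatial_full_distortion_scalar_ref(
        const uint8_t *input, uint32_t input_offset, uint32_t input_stride,
        const uint8_t *recon, uint32_t recon_offset, uint32_t recon_stride,
        uint32_t area_width, uint32_t area_height)
    {
        uint64_t ssd = 0;
        input += input_offset;
        recon += recon_offset;
        for (uint32_t y = 0; y < area_height; y++) {
            for (uint32_t x = 0; x < area_width; x++) {
                const int32_t d = (int32_t)input[x] - (int32_t)recon[x];
                ssd += (uint64_t)(d * d);
            }
            input += input_stride;
            recon += recon_stride;
        }
        return ssd;
    }
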
    2277             : 
    2278             : #include "EbUtility.h"
    2279             : 
    2280  1270770000 : uint64_t spatial_full_distortion_kernel_avx2(
    2281             :     uint8_t   *input,
    2282             :     uint32_t   input_offset,
    2283             :     uint32_t   input_stride,
    2284             :     uint8_t   *recon,
    2285             :     uint32_t   recon_offset,
    2286             :     uint32_t   recon_stride,
    2287             :     uint32_t   area_width,
    2288             :     uint32_t   area_height)
    2289             : {
    2290  1270770000 :     const uint32_t leftover = area_width & 31;
    2291             :     int32_t h;
    2292  1270770000 :     __m256i sum = _mm256_setzero_si256();
    2293             :     __m128i sum_L, sum_H, s;
    2294  1270770000 :     uint64_t spatialDistortion = 0;
    2295  1270770000 :     input += input_offset;
    2296  1270770000 :     recon += recon_offset;
    2297             : 
    2298  1270770000 :     if (leftover) {
    2299  1066310000 :         const uint8_t *inp = input + area_width - leftover;
    2300  1066310000 :         const uint8_t *rec = recon + area_width - leftover;
    2301             : 
    2302  1066310000 :         if (leftover == 4) {
    2303    81796100 :             h = area_height;
    2304             :             do {
    2305   366005000 :                 const __m128i in0 = _mm_cvtsi32_si128(*(uint32_t *)inp);
    2306   366005000 :                 const __m128i in1 = _mm_cvtsi32_si128(*(uint32_t *)(inp + input_stride));
    2307   366005000 :                 const __m128i re0 = _mm_cvtsi32_si128(*(uint32_t *)rec);
    2308   366005000 :                 const __m128i re1 = _mm_cvtsi32_si128(*(uint32_t *)(rec + recon_stride));
    2309   366005000 :                 const __m256i in = _mm256_setr_m128i(in0, in1);
    2310   366005000 :                 const __m256i re = _mm256_setr_m128i(re0, re1);
    2311   366005000 :                 Distortion_AVX2_INTRIN(in, re, &sum);
    2312   366175000 :                 inp += 2 * input_stride;
    2313   366175000 :                 rec += 2 * recon_stride;
    2314   366175000 :                 h -= 2;
    2315   366175000 :             } while (h);
    2316             : 
    2317    81965700 :             if (area_width == 4) {
    2318    81800600 :                 sum_L = _mm256_castsi256_si128(sum);
    2319    81800600 :                 sum_H = _mm256_extracti128_si256(sum, 1);
    2320    81800600 :                 s = _mm_add_epi32(sum_L, sum_H);
    2321   163601000 :                 s = _mm_add_epi32(s, _mm_srli_si128(s, 4));
    2322    81800600 :                 spatialDistortion = _mm_cvtsi128_si32(s);
    2323    81800600 :                 return spatialDistortion;
    2324             :             }
    2325             :         }
    2326   984509000 :         else if (leftover == 8) {
    2327   633345000 :             h = area_height;
    2328             :             do {
    2329  4022560000 :                 const __m128i in0 = _mm_loadl_epi64((__m128i *)inp);
    2330  8045110000 :                 const __m128i in1 = _mm_loadl_epi64((__m128i *)(inp + input_stride));
    2331  4022560000 :                 const __m128i re0 = _mm_loadl_epi64((__m128i *)rec);
    2332  4022560000 :                 const __m128i re1 = _mm_loadl_epi64((__m128i *)(rec + recon_stride));
    2333  4022560000 :                 const __m256i in = _mm256_setr_m128i(in0, in1);
    2334  4022560000 :                 const __m256i re = _mm256_setr_m128i(re0, re1);
    2335  4022560000 :                 Distortion_AVX2_INTRIN(in, re, &sum);
    2336  4035410000 :                 inp += 2 * input_stride;
    2337  4035410000 :                 rec += 2 * recon_stride;
    2338  4035410000 :                 h -= 2;
    2339  4035410000 :             } while (h);
    2340             :         }
    2341   351164000 :         else if (leftover <= 16) {
    2342   371710000 :             h = area_height;
    2343             :             do {
    2344  5678600000 :                 SpatialFullDistortionKernel16_AVX2_INTRIN(inp, rec, &sum);
    2345  5678540000 :                 inp += input_stride;
    2346  5678540000 :                 rec += recon_stride;
    2347  5678540000 :             } while (--h);
    2348             : 
    2349   371646000 :             if (leftover == 12) {
    2350           0 :                 const __m256i mask = _mm256_setr_epi32(-1, -1, -1, -1, -1, -1, 0, 0);
    2351           0 :                 sum = _mm256_and_si256(sum, mask);
    2352             :             }
    2353             :         }
    2354             :         else {
    2355           0 :             __m256i sum1 = _mm256_setzero_si256();
    2356           0 :             h = area_height;
    2357             :             do {
    2358           0 :                 SpatialFullDistortionKernel32Leftover_AVX2_INTRIN(inp, rec, &sum, &sum1);
    2359           0 :                 inp += input_stride;
    2360           0 :                 rec += recon_stride;
    2361           0 :             } while (--h);
    2362             : 
    2363             :             __m256i mask[2];
    2364           0 :             if (leftover == 20) {
    2365           0 :                 mask[0] = _mm256_setr_epi32(-1, -1, -1, -1, -1, -1, 0, 0);
    2366           0 :                 mask[1] = _mm256_setr_epi32(-1, -1, -1, -1, 0, 0, 0, 0);
    2367             :             }
    2368           0 :             else if (leftover == 24) {
    2369           0 :                 mask[0] = _mm256_setr_epi32(-1, -1, -1, -1, -1, -1, -1, -1);
    2370           0 :                 mask[1] = _mm256_setr_epi32(-1, -1, -1, -1, 0, 0, 0, 0);
    2371             :             }
    2372             :             else { // leftover = 28
    2373           0 :                 mask[0] = _mm256_setr_epi32(-1, -1, -1, -1, -1, -1, -1, -1);
    2374           0 :                 mask[1] = _mm256_setr_epi32(-1, -1, -1, -1, -1, -1, 0, 0);
    2375             :             }
    2376             : 
    2377           0 :             sum = _mm256_and_si256(sum, mask[0]);
    2378           0 :             sum1 = _mm256_and_si256(sum1, mask[1]);
    2379           0 :             sum = _mm256_add_epi32(sum, sum1);
    2380             :         }
    2381             :     }
    2382             : 
    2383  1222480000 :     area_width -= leftover;
    2384             : 
    2385  1222480000 :     if (area_width) {
    2386   221196000 :         const uint8_t *inp = input;
    2387   221196000 :         const uint8_t *rec = recon;
    2388   221196000 :         h = area_height;
    2389             : 
    2390   221196000 :         if (area_width == 32) {
    2391             :             do {
    2392  3054760000 :                 SpatialFullDistortionKernel32_AVX2_INTRIN(inp, rec, &sum);
    2393  3034610000 :                 inp += input_stride;
    2394  3034610000 :                 rec += recon_stride;
    2395  3034610000 :             } while (--h);
    2396             :         }
    2397    43738500 :         else if (area_width == 64) {
    2398             :             do {
    2399  1178570000 :                 SpatialFullDistortionKernel32_AVX2_INTRIN(inp + 0 * 32, rec + 0 * 32, &sum);
    2400  1175810000 :                 SpatialFullDistortionKernel32_AVX2_INTRIN(inp + 1 * 32, rec + 1 * 32, &sum);
    2401  1175280000 :                 inp += input_stride;
    2402  1175280000 :                 rec += recon_stride;
    2403  1175280000 :             } while (--h);
    2404             :         }
    2405           0 :         else if (area_width == 96) {
    2406             :             do {
    2407           0 :                 SpatialFullDistortionKernel32_AVX2_INTRIN(inp + 0 * 32, rec + 0 * 32, &sum);
    2408           0 :                 SpatialFullDistortionKernel32_AVX2_INTRIN(inp + 1 * 32, rec + 1 * 32, &sum);
    2409           0 :                 SpatialFullDistortionKernel32_AVX2_INTRIN(inp + 2 * 32, rec + 2 * 32, &sum);
    2410           0 :                 inp += input_stride;
    2411           0 :                 rec += recon_stride;
    2412           0 :             } while (--h);
    2413             :         }
    2414             :         else { // 128
    2415             :             do {
    2416           0 :                 SpatialFullDistortionKernel32_AVX2_INTRIN(inp + 0 * 32, rec + 0 * 32, &sum);
    2417           0 :                 SpatialFullDistortionKernel32_AVX2_INTRIN(inp + 1 * 32, rec + 1 * 32, &sum);
    2418           0 :                 SpatialFullDistortionKernel32_AVX2_INTRIN(inp + 2 * 32, rec + 2 * 32, &sum);
    2419           0 :                 SpatialFullDistortionKernel32_AVX2_INTRIN(inp + 3 * 32, rec + 3 * 32, &sum);
    2420           0 :                 inp += input_stride;
    2421           0 :                 rec += recon_stride;
    2422           0 :             } while (--h);
    2423             :         }
    2424             :     }
    2425             : 
    2426  1199040000 :     return Hadd32_AVX2_INTRIN(sum);
    2427             : }
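
In spatial_full_distortion_kernel_avx2, widths that are not a multiple of 32 are split into a 32-aligned part and a leftover of 4 to 28 columns. Leftovers of 4 and 8 have dedicated paths; 12 and 16 use the 16-wide kernel, and 20 to 28 use the 32-wide leftover kernel, with _mm256_setr_epi32 masks clearing the partial sums that fall outside the block before the final horizontal add. The mask widths imply that each 32-bit lane of the accumulators covers two adjacent columns: for leftover == 20, for example, mask[0] keeps 6 lanes (12 columns) and mask[1] keeps 4 lanes (8 columns), and 12 + 8 = 20; the 24 and 28 cases work out the same way.
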

Generated by: LCOV version 1.14