LCOV - coverage.info - ASM_AVX2/aom_subpixel_8t_intrin

LCOV - code coverage report

Current view:	top level - ASM_AVX2 - aom_subpixel_8t_intrin_avx2.c (source / functions)		Hit	Total	Coverage
Test:	coverage.info	Lines:	265	576	46.0 %
Date:	2019-11-25 17:38:06	Functions:	10	18	55.6 %

          Line data    Source code

       1             : /*
       2             :  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include <immintrin.h>
      13             : 
      14             : #include "aom_dsp_rtcd.h"
      15             : 
      16             : #include "convolve.h"
      17             : #include "convolve_avx2.h"
      18             : // #include "aom_ports/mem.h"
      19             : 
      20             : #if defined(__clang__)
      21             : #if (__clang_major__ > 0 && __clang_major__ < 3) ||            \
      22             :     (__clang_major__ == 3 && __clang_minor__ <= 3) ||          \
      23             :     (defined(__APPLE__) && defined(__apple_build_version__) && \
      24             :      ((__clang_major__ == 4 && __clang_minor__ <= 2) ||        \
      25             :       (__clang_major__ == 5 && __clang_minor__ == 0)))
      26             : #define MM256_BROADCASTSI128_SI256(x) \
      27             :   _mm_broadcastsi128_si256((__m128i const *)&(x))
      28             : #else  // clang > 3.3, and not 5.0 on macosx.
      29             : #define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
      30             : #endif  // clang <= 3.3
      31             : #elif defined(__GNUC__)
      32             : #if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
      33             : #define MM256_BROADCASTSI128_SI256(x) \
      34             :   _mm_broadcastsi128_si256((__m128i const *)&(x))
      35             : #elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
      36             : #define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
      37             : #else  // gcc > 4.7
      38             : #define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
      39             : #endif  // gcc <= 4.6
      40             : #else   // !(gcc || clang)
      41             : #define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
      42             : #endif  // __clang__
      43             : 
      44             : typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
      45             :                                 uint8_t *output_ptr, ptrdiff_t out_pitch,
      46             :                                 uint32_t output_height, const int16_t *filter);
      47             : void  aom_filter_block1d4_v8_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch,
      48             :                                 uint8_t *output_ptr, ptrdiff_t out_pitch,
      49             :                                 uint32_t output_height, const int16_t *filter);
      50             :  void  aom_filter_block1d16_v2_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
      51             :                                 uint8_t *output_ptr, ptrdiff_t out_pitch,
      52             :                                 uint32_t output_height, const int16_t *filter);
      53             :  void  aom_filter_block1d16_h2_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
      54             :                                 uint8_t *output_ptr, ptrdiff_t out_pitch,
      55             :                                 uint32_t output_height, const int16_t *filter);
      56             :  void  aom_filter_block1d8_v2_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
      57             :                                 uint8_t *output_ptr, ptrdiff_t out_pitch,
      58             :                                 uint32_t output_height, const int16_t *filter);
      59             :  void  aom_filter_block1d8_h2_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
      60             :                                 uint8_t *output_ptr, ptrdiff_t out_pitch,
      61             :                                 uint32_t output_height, const int16_t *filter);
      62             :  void  aom_filter_block1d4_v2_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
      63             :                                 uint8_t *output_ptr, ptrdiff_t out_pitch,
      64             :                                 uint32_t output_height, const int16_t *filter);
      65             :  void  aom_filter_block1d4_h2_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
      66             :                                 uint8_t *output_ptr, ptrdiff_t out_pitch,
      67             :                                 uint32_t output_height, const int16_t *filter);
      68             : 
      69             : filter8_1dfunction aom_filter_block1d4_v8_ssse3;
      70             : filter8_1dfunction aom_filter_block1d16_v2_ssse3;
      71             : filter8_1dfunction aom_filter_block1d16_h2_ssse3;
      72             : filter8_1dfunction aom_filter_block1d8_v2_ssse3;
      73             : filter8_1dfunction aom_filter_block1d8_h2_ssse3;
      74             : filter8_1dfunction aom_filter_block1d4_v2_ssse3;
      75             : filter8_1dfunction aom_filter_block1d4_h2_ssse3;
      76             : #define aom_filter_block1d4_v8_avx2  aom_filter_block1d4_v8_sse2
      77             : #define aom_filter_block1d16_v2_avx2 aom_filter_block1d16_v2_ssse3
      78             : #define aom_filter_block1d16_h2_avx2 aom_filter_block1d16_h2_ssse3
      79             : #define aom_filter_block1d8_v2_avx2 aom_filter_block1d8_v2_ssse3
      80             : #define aom_filter_block1d8_h2_avx2 aom_filter_block1d8_h2_ssse3
      81             : #define aom_filter_block1d4_v2_avx2 aom_filter_block1d4_v2_ssse3
      82             : #define aom_filter_block1d4_h2_avx2 aom_filter_block1d4_h2_ssse3
      83             : 
      84             : 
      85             : #define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt)         \
      86             :   void aom_convolve8_##name##_##opt(                                         \
      87             :       const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                \
      88             :       ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,          \
      89             :       const int16_t *filter_y, int y_step_q4, int w, int h) {                \
      90             :     (void)filter_x;                                                          \
      91             :     (void)x_step_q4;                                                         \
      92             :     (void)filter_y;                                                          \
      93             :     (void)y_step_q4;                                                         \
      94             :     assert((-128 <= filter[3]) && (filter[3] <= 127));                       \
      95             :     assert(step_q4 == 16);                                                   \
      96             :     if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) &&            \
      97             :         (filter[2] | filter[5])) {                                           \
      98             :       while (w >= 16) {                                                      \
      99             :         aom_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \
     100             :                                                  dst_stride, h, filter);     \
     101             :         src += 16;                                                           \
     102             :         dst += 16;                                                           \
     103             :         w -= 16;                                                             \
     104             :       }                                                                      \
     105             :       while (w >= 8) {                                                       \
     106             :         aom_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst,  \
     107             :                                                 dst_stride, h, filter);      \
     108             :         src += 8;                                                            \
     109             :         dst += 8;                                                            \
     110             :         w -= 8;                                                              \
     111             :       }                                                                      \
     112             :       while (w >= 4) {                                                       \
     113             :         aom_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst,  \
     114             :                                                 dst_stride, h, filter);      \
     115             :         src += 4;                                                            \
     116             :         dst += 4;                                                            \
     117             :         w -= 4;                                                              \
     118             :       }                                                                      \
     119             :     } else if (filter[0] | filter[1] | filter[2]) {                          \
     120             :       while (w >= 16) {                                                      \
     121             :         aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
     122             :                                                  dst_stride, h, filter);     \
     123             :         src += 16;                                                           \
     124             :         dst += 16;                                                           \
     125             :         w -= 16;                                                             \
     126             :       }                                                                      \
     127             :       while (w >= 8) {                                                       \
     128             :         aom_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst,  \
     129             :                                                 dst_stride, h, filter);      \
     130             :         src += 8;                                                            \
     131             :         dst += 8;                                                            \
     132             :         w -= 8;                                                              \
     133             :       }                                                                      \
     134             :       while (w >= 4) {                                                       \
     135             :         aom_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst,  \
     136             :                                                 dst_stride, h, filter);      \
     137             :         src += 4;                                                            \
     138             :         dst += 4;                                                            \
     139             :         w -= 4;                                                              \
     140             :       }                                                                      \
     141             :     } else {                                                                 \
     142             :       while (w >= 16) {                                                      \
     143             :         aom_filter_block1d16_##dir##2_##avg##opt(src, src_stride, dst,       \
     144             :                                                  dst_stride, h, filter);     \
     145             :         src += 16;                                                           \
     146             :         dst += 16;                                                           \
     147             :         w -= 16;                                                             \
     148             :       }                                                                      \
     149             :       while (w >= 8) {                                                       \
     150             :         aom_filter_block1d8_##dir##2_##avg##opt(src, src_stride, dst,        \
     151             :                                                 dst_stride, h, filter);      \
     152             :         src += 8;                                                            \
     153             :         dst += 8;                                                            \
     154             :         w -= 8;                                                              \
     155             :       }                                                                      \
     156             :       while (w >= 4) {                                                       \
     157             :         aom_filter_block1d4_##dir##2_##avg##opt(src, src_stride, dst,        \
     158             :                                                 dst_stride, h, filter);      \
     159             :         src += 4;                                                            \
     160             :         dst += 4;                                                            \
     161             :         w -= 4;                                                              \
     162             :       }                                                                      \
     163             :     }                                                                        \
     164             :     if (w) {                                                                 \
     165             :       aom_convolve8_##name##_c(src, src_stride, dst, dst_stride, filter_x,   \
     166             :                                x_step_q4, filter_y, y_step_q4, w, h);        \
     167             :     }                                                                        \
     168             :   }
     169             : 
     170             :  // filters for 16
     171             : DECLARE_ALIGNED(32, static const uint8_t, filt_global_avx2[]) = {
     172             :   0,  1,  1,  2,  2, 3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  0,  1,  1,
     173             :   2,  2,  3,  3,  4, 4,  5,  5,  6,  6,  7,  7,  8,  2,  3,  3,  4,  4,  5,
     174             :   5,  6,  6,  7,  7, 8,  8,  9,  9,  10, 2,  3,  3,  4,  4,  5,  5,  6,  6,
     175             :   7,  7,  8,  8,  9, 9,  10, 4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9,  10,
     176             :   10, 11, 11, 12, 4, 5,  5,  6,  6,  7,  7,  8,  8,  9,  9,  10, 10, 11, 11,
     177             :   12, 6,  7,  7,  8, 8,  9,  9,  10, 10, 11, 11, 12, 12, 13, 13, 14, 6,  7,
     178             :   7,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14
     179             : };
     180             : 
     181             : DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[]) = {
     182             :   0, 1, 2, 3,  1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3,  1, 2,
     183             :   3, 4, 2, 3,  4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7,  8, 9,
     184             :   7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
     185             : };
     186             : 
     187             : DECLARE_ALIGNED(32, static const uint8_t, filt4_d4_global_avx2[]) = {
     188             :   2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
     189             :   2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
     190             : };
     191             : 
     192           0 : static INLINE void xx_storeu2_epi32(const uint8_t *output_ptr,
     193             :                                     const ptrdiff_t stride, const __m256i *a) {
     194           0 :   *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*a));
     195           0 :   *((uint32_t *)(output_ptr + stride)) =
     196           0 :       _mm_cvtsi128_si32(_mm256_extracti128_si256(*a, 1));
     197           0 : }
     198             : 
     199    33603300 : static INLINE __m256i xx_loadu2_epi64(const void *hi, const void *lo) {
     200    67206600 :   __m256i a = _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)(lo)));
     201    33603300 :   a = _mm256_inserti128_si256(a, _mm_loadl_epi64((const __m128i *)(hi)), 1);
     202    33603300 :   return a;
     203             : }
     204             : 
     205   187641000 : static INLINE void xx_storeu2_epi64(const uint8_t *output_ptr,
     206             :                                     const ptrdiff_t stride, const __m256i *a) {
     207   375282000 :   _mm_storel_epi64((__m128i *)output_ptr, _mm256_castsi256_si128(*a));
     208   375282000 :   _mm_storel_epi64((__m128i *)(output_ptr + stride),
     209   187641000 :                    _mm256_extractf128_si256(*a, 1));
     210   187641000 : }
     211             : 
     212   848760000 : static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) {
     213  1697520000 :   __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo)));
     214   848760000 :   a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1);
     215   848760000 :   return a;
     216             : }
     217             : 
     218   629185000 : static INLINE void xx_store2_mi128(const uint8_t *output_ptr,
     219             :                                    const ptrdiff_t stride, const __m256i *a) {
     220  1258370000 :   _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(*a));
     221   629185000 :   _mm_store_si128((__m128i *)(output_ptr + stride),
     222   629185000 :                   _mm256_extractf128_si256(*a, 1));
     223   629185000 : }
     224             : 
     225           0 : static void aom_filter_block1d4_h4_avx2(
     226             :     const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
     227             :     ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
     228             :   __m128i filtersReg;
     229             :   __m256i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1;
     230             :   unsigned int i;
     231             :   ptrdiff_t src_stride, dst_stride;
     232           0 :   src_ptr -= 3;
     233           0 :   addFilterReg32 = _mm256_set1_epi16(32);
     234           0 :   filtersReg = _mm_loadu_si128((const __m128i *)filter);
     235           0 :   filtersReg = _mm_srai_epi16(filtersReg, 1);
     236             :   // converting the 16 bit (short) to 8 bit (byte) and have the same data
     237             :   // in both lanes of 128 bit register.
     238           0 :   filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
     239             :   // have the same data in both lanes of a 256 bit register
     240           0 :   const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
     241             : 
     242             :   firstFilters =
     243           0 :       _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u));
     244           0 :   filt1Reg = _mm256_load_si256((__m256i const *)(filt4_d4_global_avx2));
     245             : 
     246             :   // multiple the size of the source and destination stride by two
     247           0 :   src_stride = src_pixels_per_line << 1;
     248           0 :   dst_stride = output_pitch << 1;
     249           0 :   for (i = output_height; i > 1; i -= 2) {
     250             :     // load the 2 strides of source
     251           0 :     srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
     252             : 
     253             :     // filter the source buffer
     254           0 :     srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
     255             : 
     256             :     // multiply 4 adjacent elements with the filter and add the result
     257           0 :     srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
     258             : 
     259           0 :     srcRegFilt32b1_1 =
     260           0 :         _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
     261             : 
     262             :     // shift by 6 bit each 16 bit
     263           0 :     srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
     264           0 :     srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
     265             : 
     266             :     // shrink to 8 bit each 16 bits, the first lane contain the first
     267             :     // convolve result and the second lane contain the second convolve result
     268           0 :     srcRegFilt32b1_1 =
     269           0 :         _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
     270             : 
     271           0 :     src_ptr += src_stride;
     272             : 
     273           0 :     xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1);
     274           0 :     output_ptr += dst_stride;
     275             :   }
     276             : 
     277             :   // if the number of strides is odd.
     278             :   // process only 4 bytes
     279           0 :   if (i > 0) {
     280             :     __m128i srcReg1, srcRegFilt1_1;
     281             : 
     282           0 :     srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
     283             : 
     284             :     // filter the source buffer
     285           0 :     srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
     286             : 
     287             :     // multiply 4 adjacent elements with the filter and add the result
     288             :     srcRegFilt1_1 =
     289           0 :         _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
     290             : 
     291           0 :     srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128());
     292             :     // shift by 6 bit each 16 bit
     293             :     srcRegFilt1_1 =
     294           0 :         _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
     295           0 :     srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
     296             : 
     297             :     // shrink to 8 bit each 16 bits, the first lane contain the first
     298             :     // convolve result and the second lane contain the second convolve result
     299           0 :     srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
     300             : 
     301             :     // save 4 bytes
     302           0 :     *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1);
     303             :   }
     304           0 : }
     305             : 
     306           0 : static void aom_filter_block1d4_h8_avx2(
     307             :     const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
     308             :     ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
     309             :   __m128i filtersReg;
     310             :   __m256i addFilterReg32, filt1Reg, filt2Reg;
     311             :   __m256i firstFilters, secondFilters;
     312             :   __m256i srcRegFilt32b1_1, srcRegFilt32b2;
     313             :   __m256i srcReg32b1;
     314             :   unsigned int i;
     315             :   ptrdiff_t src_stride, dst_stride;
     316           0 :   src_ptr -= 3;
     317           0 :   addFilterReg32 = _mm256_set1_epi16(32);
     318           0 :   filtersReg = _mm_loadu_si128((const __m128i *)filter);
     319           0 :   filtersReg = _mm_srai_epi16(filtersReg, 1);
     320             :   // converting the 16 bit (short) to 8 bit (byte) and have the same data
     321             :   // in both lanes of 128 bit register.
     322           0 :   filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
     323             :   // have the same data in both lanes of a 256 bit register
     324           0 :   const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
     325             : 
     326             :   // duplicate only the first 32 bits
     327           0 :   firstFilters = _mm256_shuffle_epi32(filtersReg32, 0);
     328             :   // duplicate only the second 32 bits
     329           0 :   secondFilters = _mm256_shuffle_epi32(filtersReg32, 0x55);
     330             : 
     331           0 :   filt1Reg = _mm256_load_si256((__m256i const *)filt_d4_global_avx2);
     332           0 :   filt2Reg = _mm256_load_si256((__m256i const *)(filt_d4_global_avx2 + 32));
     333             : 
     334             :   // multiple the size of the source and destination stride by two
     335           0 :   src_stride = src_pixels_per_line << 1;
     336           0 :   dst_stride = output_pitch << 1;
     337           0 :   for (i = output_height; i > 1; i -= 2) {
     338             :     // load the 2 strides of source
     339           0 :     srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
     340             : 
     341             :     // filter the source buffer
     342           0 :     srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
     343             : 
     344             :     // multiply 4 adjacent elements with the filter and add the result
     345           0 :     srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
     346             : 
     347             :     // filter the source buffer
     348           0 :     srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
     349             : 
     350             :     // multiply 4 adjacent elements with the filter and add the result
     351           0 :     srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);
     352             : 
     353           0 :     srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
     354             : 
     355           0 :     srcRegFilt32b1_1 =
     356           0 :         _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
     357             : 
     358             :     // shift by 6 bit each 16 bit
     359           0 :     srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
     360           0 :     srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
     361             : 
     362             :     // shrink to 8 bit each 16 bits, the first lane contain the first
     363             :     // convolve result and the second lane contain the second convolve result
     364           0 :     srcRegFilt32b1_1 =
     365           0 :         _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
     366             : 
     367           0 :     src_ptr += src_stride;
     368             : 
     369           0 :     xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1);
     370           0 :     output_ptr += dst_stride;
     371             :   }
     372             : 
     373             :   // if the number of strides is odd.
     374             :   // process only 4 bytes
     375           0 :   if (i > 0) {
     376             :     __m128i srcReg1, srcRegFilt1_1;
     377             :     __m128i srcRegFilt2;
     378             : 
     379           0 :     srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
     380             : 
     381             :     // filter the source buffer
     382           0 :     srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
     383             : 
     384             :     // multiply 4 adjacent elements with the filter and add the result
     385             :     srcRegFilt1_1 =
     386           0 :         _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
     387             : 
     388             :     // filter the source buffer
     389           0 :     srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
     390             : 
     391             :     // multiply 4 adjacent elements with the filter and add the result
     392             :     srcRegFilt2 =
     393           0 :         _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters));
     394             : 
     395           0 :     srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
     396           0 :     srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128());
     397             :     // shift by 6 bit each 16 bit
     398             :     srcRegFilt1_1 =
     399           0 :         _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
     400           0 :     srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
     401             : 
     402             :     // shrink to 8 bit each 16 bits, the first lane contain the first
     403             :     // convolve result and the second lane contain the second convolve result
     404           0 :     srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
     405             : 
     406             :     // save 4 bytes
     407           0 :     *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1);
     408             :   }
     409           0 : }
     410             : 
     411           0 : static void aom_filter_block1d8_h4_avx2(
     412             :     const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
     413             :     ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
     414             :   __m128i filtersReg;
     415             :   __m256i addFilterReg32, filt2Reg, filt3Reg;
     416             :   __m256i secondFilters, thirdFilters;
     417             :   __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
     418             :   __m256i srcReg32b1, filtersReg32;
     419             :   unsigned int i;
     420             :   ptrdiff_t src_stride, dst_stride;
     421           0 :   src_ptr -= 3;
     422           0 :   addFilterReg32 = _mm256_set1_epi16(32);
     423           0 :   filtersReg = _mm_loadu_si128((const __m128i *)filter);
     424           0 :   filtersReg = _mm_srai_epi16(filtersReg, 1);
     425             :   // converting the 16 bit (short) to 8 bit (byte) and have the same data
     426             :   // in both lanes of 128 bit register.
     427           0 :   filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
     428             :   // have the same data in both lanes of a 256 bit register
     429           0 :   filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
     430             : 
     431             :   // duplicate only the second 16 bits (third and forth byte)
     432             :   // across 256 bit register
     433           0 :   secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
     434             :   // duplicate only the third 16 bits (fifth and sixth byte)
     435             :   // across 256 bit register
     436           0 :   thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
     437             : 
     438           0 :   filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
     439           0 :   filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
     440             : 
     441             :   // multiply the size of the source and destination stride by two
     442           0 :   src_stride = src_pixels_per_line << 1;
     443           0 :   dst_stride = output_pitch << 1;
     444           0 :   for (i = output_height; i > 1; i -= 2) {
     445             :     // load the 2 strides of source
     446           0 :     srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
     447             : 
     448             :     // filter the source buffer
     449           0 :     srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
     450           0 :     srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
     451             : 
     452             :     // multiply 2 adjacent elements with the filter and add the result
     453           0 :     srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
     454           0 :     srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
     455             : 
     456           0 :     srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
     457             : 
     458             :     // shift by 6 bit each 16 bit
     459           0 :     srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
     460           0 :     srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
     461             : 
     462             :     // shrink to 8 bit each 16 bits
     463           0 :     srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b1_1);
     464             : 
     465           0 :     src_ptr += src_stride;
     466             : 
     467           0 :     xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1);
     468           0 :     output_ptr += dst_stride;
     469             :   }
     470             : 
     471             :   // if the number of strides is odd.
     472             :   // process only 8 bytes
     473           0 :   if (i > 0) {
     474             :     __m128i srcReg1, srcRegFilt1_1;
     475             :     __m128i srcRegFilt2, srcRegFilt3;
     476             : 
     477           0 :     srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
     478             : 
     479             :     // filter the source buffer
     480           0 :     srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
     481           0 :     srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
     482             : 
     483             :     // multiply 2 adjacent elements with the filter and add the result
     484             :     srcRegFilt2 =
     485           0 :         _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters));
     486             :     srcRegFilt3 =
     487           0 :         _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(thirdFilters));
     488             : 
     489             :     // add and saturate the results together
     490           0 :     srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt2, srcRegFilt3);
     491             : 
     492             :     // shift by 6 bit each 16 bit
     493             :     srcRegFilt1_1 =
     494           0 :         _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
     495           0 :     srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
     496             : 
     497             :     // shrink to 8 bit each 16 bits
     498           0 :     srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
     499             : 
     500             :     // save 8 bytes
     501           0 :     _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1);
     502             :   }
     503           0 : }
     504             : 
     505    11204400 : static void aom_filter_block1d8_h8_avx2(
     506             :     const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
     507             :     ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
     508             :   __m128i filtersReg;
     509             :   __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
     510             :   __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
     511             :   __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
     512             :   __m256i srcReg32b1;
     513             :   unsigned int i;
     514             :   ptrdiff_t src_stride, dst_stride;
     515    11204400 :   src_ptr -= 3;
     516    11204400 :   addFilterReg32 = _mm256_set1_epi16(32);
     517    11204400 :   filtersReg = _mm_loadu_si128((const __m128i *)filter);
     518    11204400 :   filtersReg = _mm_srai_epi16(filtersReg, 1);
     519             :   // converting the 16 bit (short) to 8 bit (byte) and have the same data
     520             :   // in both lanes of 128 bit register.
     521    11204400 :   filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
     522             :   // have the same data in both lanes of a 256 bit register
     523    11204400 :   const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
     524             : 
     525             :   // duplicate only the first 16 bits (first and second byte)
     526             :   // across 256 bit register
     527    22408900 :   firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
     528             :   // duplicate only the second 16 bits (third and forth byte)
     529             :   // across 256 bit register
     530    22408900 :   secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
     531             :   // duplicate only the third 16 bits (fifth and sixth byte)
     532             :   // across 256 bit register
     533    22408900 :   thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
     534             :   // duplicate only the forth 16 bits (seventh and eighth byte)
     535             :   // across 256 bit register
     536    22408900 :   forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
     537             : 
     538    11204400 :   filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2);
     539    11204400 :   filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
     540    11204400 :   filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
     541    11204400 :   filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
     542             : 
     543             :   // multiple the size of the source and destination stride by two
     544    11204400 :   src_stride = src_pixels_per_line << 1;
     545    11204400 :   dst_stride = output_pitch << 1;
     546   116960000 :   for (i = output_height; i > 1; i -= 2) {
     547             :     // load the 2 strides of source
     548   105756000 :     srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
     549             : 
     550             :     // filter the source buffer
     551   105726000 :     srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
     552   105726000 :     srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
     553             : 
     554             :     // multiply 2 adjacent elements with the filter and add the result
     555   211452000 :     srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
     556   105726000 :     srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
     557             : 
     558             :     // add and saturate the results together
     559   211452000 :     srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
     560             : 
     561             :     // filter the source buffer
     562   105726000 :     srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
     563   105726000 :     srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
     564             : 
     565             :     // multiply 2 adjacent elements with the filter and add the result
     566   105726000 :     srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
     567   105726000 :     srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
     568             : 
     569   105726000 :     __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
     570   105726000 :     srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23);
     571             : 
     572             :     // shift by 6 bit each 16 bit
     573   105726000 :     srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
     574   211452000 :     srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
     575             : 
     576             :     // shrink to 8 bit each 16 bits, the first lane contain the first
     577             :     // convolve result and the second lane contain the second convolve result
     578   105726000 :     srcRegFilt32b1_1 =
     579   105726000 :         _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
     580             : 
     581   105726000 :     src_ptr += src_stride;
     582             : 
     583   105726000 :     xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1);
     584   105756000 :     output_ptr += dst_stride;
     585             :   }
     586             : 
     587             :   // if the number of strides is odd.
     588             :   // process only 8 bytes
     589    11203900 :   if (i > 0) {
     590             :     __m128i srcReg1, srcRegFilt1_1;
     591             :     __m128i srcRegFilt2, srcRegFilt3;
     592             : 
     593     7898760 :     srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
     594             : 
     595             :     // filter the source buffer
     596    15797500 :     srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
     597    15797500 :     srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg));
     598             : 
     599             :     // multiply 2 adjacent elements with the filter and add the result
     600             :     srcRegFilt1_1 =
     601    15797500 :         _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
     602             :     srcRegFilt2 =
     603    15797500 :         _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
     604             : 
     605             :     // add and saturate the results together
     606     7898760 :     srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
     607             : 
     608             :     // filter the source buffer
     609    15797500 :     srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
     610    15797500 :     srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
     611             : 
     612             :     // multiply 2 adjacent elements with the filter and add the result
     613             :     srcRegFilt3 =
     614    15797500 :         _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
     615             :     srcRegFilt2 =
     616    15797500 :         _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
     617             : 
     618             :     // add and saturate the results together
     619             :     srcRegFilt1_1 =
     620    15797500 :         _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));
     621             : 
     622             :     // shift by 6 bit each 16 bit
     623             :     srcRegFilt1_1 =
     624    15797500 :         _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
     625     7898760 :     srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
     626             : 
     627             :     // shrink to 8 bit each 16 bits, the first lane contain the first
     628             :     // convolve result and the second lane contain the second convolve
     629             :     // result
     630    15797500 :     srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
     631             : 
     632             :     // save 8 bytes
     633     7898760 :     _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1);
     634             :   }
     635    11203900 : }
     636             : 
     637           0 : static void aom_filter_block1d16_h4_avx2(
     638             :     const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
     639             :     ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
     640             :   __m128i filtersReg;
     641             :   __m256i addFilterReg32, filt2Reg, filt3Reg;
     642             :   __m256i secondFilters, thirdFilters;
     643             :   __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
     644             :   __m256i srcReg32b1, srcReg32b2, filtersReg32;
     645             :   unsigned int i;
     646             :   ptrdiff_t src_stride, dst_stride;
     647           0 :   src_ptr -= 3;
     648           0 :   addFilterReg32 = _mm256_set1_epi16(32);
     649           0 :   filtersReg = _mm_loadu_si128((const __m128i *)filter);
     650           0 :   filtersReg = _mm_srai_epi16(filtersReg, 1);
     651             :   // converting the 16 bit (short) to 8 bit (byte) and have the same data
     652             :   // in both lanes of 128 bit register.
     653           0 :   filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
     654             :   // have the same data in both lanes of a 256 bit register
     655           0 :   filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
     656             : 
     657             :   // duplicate only the second 16 bits (third and forth byte)
     658             :   // across 256 bit register
     659           0 :   secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
     660             :   // duplicate only the third 16 bits (fifth and sixth byte)
     661             :   // across 256 bit register
     662           0 :   thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
     663             : 
     664           0 :   filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
     665           0 :   filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
     666             : 
     667             :   // multiply the size of the source and destination stride by two
     668           0 :   src_stride = src_pixels_per_line << 1;
     669           0 :   dst_stride = output_pitch << 1;
     670           0 :   for (i = output_height; i > 1; i -= 2) {
     671             :     // load the 2 strides of source
     672           0 :     srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
     673             : 
     674             :     // filter the source buffer
     675           0 :     srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
     676           0 :     srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
     677             : 
     678             :     // multiply 2 adjacent elements with the filter and add the result
     679           0 :     srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
     680           0 :     srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
     681             : 
     682           0 :     srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
     683             : 
     684             :     // reading 2 strides of the next 16 bytes
     685             :     // (part of it was being read by earlier read)
     686             :     srcReg32b2 =
     687           0 :         xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);
     688             : 
     689             :     // filter the source buffer
     690           0 :     srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
     691           0 :     srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
     692             : 
     693             :     // multiply 2 adjacent elements with the filter and add the result
     694           0 :     srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
     695           0 :     srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
     696             : 
     697             :     // add and saturate the results together
     698           0 :     srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
     699             : 
     700             :     // shift by 6 bit each 16 bit
     701           0 :     srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
     702           0 :     srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
     703           0 :     srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
     704           0 :     srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6);
     705             : 
     706             :     // shrink to 8 bit each 16 bits, the first lane contain the first
     707             :     // convolve result and the second lane contain the second convolve result
     708           0 :     srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
     709             : 
     710           0 :     src_ptr += src_stride;
     711             : 
     712           0 :     xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1);
     713           0 :     output_ptr += dst_stride;
     714             :   }
     715             : 
     716             :   // if the number of strides is odd.
     717             :   // process only 16 bytes
     718           0 :   if (i > 0) {
     719             :     __m256i srcReg1, srcReg12;
     720             :     __m256i srcRegFilt2, srcRegFilt3, srcRegFilt1_1;
     721             : 
     722           0 :     srcReg1 = _mm256_loadu_si256((const __m256i *)(src_ptr));
     723           0 :     srcReg12 = _mm256_permute4x64_epi64(srcReg1, 0x94);
     724             : 
     725             :     // filter the source buffer
     726           0 :     srcRegFilt2 = _mm256_shuffle_epi8(srcReg12, filt2Reg);
     727           0 :     srcRegFilt3 = _mm256_shuffle_epi8(srcReg12, filt3Reg);
     728             : 
     729             :     // multiply 2 adjacent elements with the filter and add the result
     730           0 :     srcRegFilt2 = _mm256_maddubs_epi16(srcRegFilt2, secondFilters);
     731           0 :     srcRegFilt3 = _mm256_maddubs_epi16(srcRegFilt3, thirdFilters);
     732             : 
     733             :     // add and saturate the results together
     734           0 :     srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt2, srcRegFilt3);
     735             : 
     736             :     // shift by 6 bit each 16 bit
     737           0 :     srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt1_1, addFilterReg32);
     738           0 :     srcRegFilt1_1 = _mm256_srai_epi16(srcRegFilt1_1, 6);
     739             : 
     740             :     // shrink to 8 bit each 16 bits, the first lane contain the first
     741             :     // convolve result and the second lane contain the second convolve
     742             :     // result
     743           0 :     srcRegFilt1_1 = _mm256_packus_epi16(srcRegFilt1_1, srcRegFilt1_1);
     744           0 :     srcRegFilt1_1 = _mm256_permute4x64_epi64(srcRegFilt1_1, 0x8);
     745             : 
     746             :     // save 16 bytes
     747           0 :     _mm_store_si128((__m128i *)output_ptr,
     748             :                     _mm256_castsi256_si128(srcRegFilt1_1));
     749             :   }
     750           0 : }
     751             : 
     752    24076900 : static void aom_filter_block1d16_h8_avx2(
     753             :     const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
     754             :     ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
     755             :   __m128i filtersReg;
     756             :   __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
     757             :   __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
     758             :   __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
     759             :   __m256i srcReg32b1, srcReg32b2, filtersReg32;
     760             :   unsigned int i;
     761             :   ptrdiff_t src_stride, dst_stride;
     762    24076900 :   src_ptr -= 3;
     763    24076900 :   addFilterReg32 = _mm256_set1_epi16(32);
     764    24076900 :   filtersReg = _mm_loadu_si128((const __m128i *)filter);
     765    24076900 :   filtersReg = _mm_srai_epi16(filtersReg, 1);
     766             :   // converting the 16 bit (short) to 8 bit (byte) and have the same data
     767             :   // in both lanes of 128 bit register.
     768    24076900 :   filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
     769             :   // have the same data in both lanes of a 256 bit register
     770    24076900 :   filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
     771             : 
     772             :   // duplicate only the first 16 bits (first and second byte)
     773             :   // across 256 bit register
     774    48153800 :   firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
     775             :   // duplicate only the second 16 bits (third and forth byte)
     776             :   // across 256 bit register
     777    48153800 :   secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
     778             :   // duplicate only the third 16 bits (fifth and sixth byte)
     779             :   // across 256 bit register
     780    48153800 :   thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
     781             :   // duplicate only the forth 16 bits (seventh and eighth byte)
     782             :   // across 256 bit register
     783    48153800 :   forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
     784             : 
     785    24076900 :   filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2);
     786    24076900 :   filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
     787    24076900 :   filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
     788    24076900 :   filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
     789             : 
     790             :   // multiple the size of the source and destination stride by two
     791    24076900 :   src_stride = src_pixels_per_line << 1;
     792    24076900 :   dst_stride = output_pitch << 1;
     793   363545000 :   for (i = output_height; i > 1; i -= 2) {
     794             :     // load the 2 strides of source
     795   339473000 :     srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
     796             : 
     797             :     // filter the source buffer
     798   339078000 :     srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
     799   339078000 :     srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
     800             : 
     801             :     // multiply 2 adjacent elements with the filter and add the result
     802   678155000 :     srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
     803   339078000 :     srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
     804             : 
     805             :     // add and saturate the results together
     806   678155000 :     srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
     807             : 
     808             :     // filter the source buffer
     809   339078000 :     srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
     810   339078000 :     srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
     811             : 
     812             :     // multiply 2 adjacent elements with the filter and add the result
     813   339078000 :     srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
     814   339078000 :     srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
     815             : 
     816   339078000 :     __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
     817   339078000 :     srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23);
     818             : 
     819             :     // reading 2 strides of the next 16 bytes
     820             :     // (part of it was being read by earlier read)
     821             :     srcReg32b2 =
     822   339078000 :         xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);
     823             : 
     824             :     // filter the source buffer
     825   339049000 :     srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
     826   339049000 :     srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);
     827             : 
     828             :     // multiply 2 adjacent elements with the filter and add the result
     829   339049000 :     srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
     830   339049000 :     srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
     831             : 
     832             :     // add and saturate the results together
     833   339049000 :     srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);
     834             : 
     835             :     // filter the source buffer
     836   339049000 :     srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
     837   339049000 :     srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
     838             : 
     839             :     // multiply 2 adjacent elements with the filter and add the result
     840   339049000 :     srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
     841   339049000 :     srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
     842             : 
     843             :     // add and saturate the results together
     844   339049000 :     srcRegFilt32b2_1 = _mm256_adds_epi16(
     845             :         srcRegFilt32b2_1, _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2));
     846             : 
     847             :     // shift by 6 bit each 16 bit
     848   678099000 :     srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
     849   339049000 :     srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
     850   678099000 :     srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
     851   339049000 :     srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6);
     852             : 
     853             :     // shrink to 8 bit each 16 bits, the first lane contain the first
     854             :     // convolve result and the second lane contain the second convolve result
     855   339049000 :     srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
     856             : 
     857   339049000 :     src_ptr += src_stride;
     858             : 
     859   339049000 :     xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1);
     860   339468000 :     output_ptr += dst_stride;
     861             :   }
     862             : 
     863             :   // if the number of strides is odd.
     864             :   // process only 16 bytes
     865    24071200 :   if (i > 0) {
     866             :     __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
     867             :     __m128i srcRegFilt2, srcRegFilt3;
     868             : 
     869    16961100 :     srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
     870             : 
     871             :     // filter the source buffer
     872    33922300 :     srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
     873    33922300 :     srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg));
     874             : 
     875             :     // multiply 2 adjacent elements with the filter and add the result
     876             :     srcRegFilt1_1 =
     877    33922300 :         _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
     878             :     srcRegFilt2 =
     879    33922300 :         _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
     880             : 
     881             :     // add and saturate the results together
     882    16961100 :     srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
     883             : 
     884             :     // filter the source buffer
     885    33922300 :     srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
     886    33922300 :     srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
     887             : 
     888             :     // multiply 2 adjacent elements with the filter and add the result
     889             :     srcRegFilt3 =
     890    33922300 :         _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
     891             :     srcRegFilt2 =
     892    33922300 :         _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
     893             : 
     894             :     // add and saturate the results together
     895             :     srcRegFilt1_1 =
     896    16961100 :         _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));
     897             : 
     898             :     // reading the next 16 bytes
     899             :     // (part of it was being read by earlier read)
     900    33922300 :     srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
     901             : 
     902             :     // filter the source buffer
     903    33922300 :     srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg));
     904    33922300 :     srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg));
     905             : 
     906             :     // multiply 2 adjacent elements with the filter and add the result
     907             :     srcRegFilt2_1 =
     908    33922300 :         _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters));
     909             :     srcRegFilt2 =
     910    33922300 :         _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
     911             : 
     912             :     // add and saturate the results together
     913    16961100 :     srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
     914             : 
     915             :     // filter the source buffer
     916    33922300 :     srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg));
     917    33922300 :     srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg));
     918             : 
     919             :     // multiply 2 adjacent elements with the filter and add the result
     920             :     srcRegFilt3 =
     921    33922300 :         _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
     922             :     srcRegFilt2 =
     923    33922300 :         _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
     924             : 
     925             :     // add and saturate the results together
     926             :     srcRegFilt2_1 =
     927    33922300 :         _mm_adds_epi16(srcRegFilt2_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));
     928             : 
     929             :     // shift by 6 bit each 16 bit
     930             :     srcRegFilt1_1 =
     931    33922300 :         _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
     932    16961100 :     srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
     933             : 
     934             :     srcRegFilt2_1 =
     935    33922300 :         _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg32));
     936    16961100 :     srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 6);
     937             : 
     938             :     // shrink to 8 bit each 16 bits, the first lane contain the first
     939             :     // convolve result and the second lane contain the second convolve
     940             :     // result
     941    16961100 :     srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
     942             : 
     943             :     // save 16 bytes
     944             :     _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1);
     945             :   }
     946    24071200 : }
     947             : 
     948           0 : static void aom_filter_block1d8_v4_avx2(
     949             :     const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
     950             :     ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
     951             :   __m128i filtersReg;
     952             :   __m256i filtersReg32, addFilterReg32;
     953             :   __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
     954             :   __m256i srcReg23_34_lo, srcReg45_56_lo;
     955             :   __m256i resReg23_34_lo, resReg45_56_lo;
     956             :   __m256i resReglo, resReg;
     957             :   __m256i secondFilters, thirdFilters;
     958             :   unsigned int i;
     959             :   ptrdiff_t src_stride, dst_stride;
     960             : 
     961           0 :   addFilterReg32 = _mm256_set1_epi16(32);
     962           0 :   filtersReg = _mm_loadu_si128((const __m128i *)filter);
     963             :   // converting the 16 bit (short) to  8 bit (byte) and have the
     964             :   // same data in both lanes of 128 bit register.
     965           0 :   filtersReg = _mm_srai_epi16(filtersReg, 1);
     966           0 :   filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
     967             :   // have the same data in both lanes of a 256 bit register
     968           0 :   filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
     969             : 
     970             :   // duplicate only the second 16 bits (third and forth byte)
     971             :   // across 256 bit register
     972           0 :   secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
     973             :   // duplicate only the third 16 bits (fifth and sixth byte)
     974             :   // across 256 bit register
     975           0 :   thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
     976             : 
     977             :   // multiple the size of the source and destination stride by two
     978           0 :   src_stride = src_pitch << 1;
     979           0 :   dst_stride = out_pitch << 1;
     980             : 
     981           0 :   srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
     982           0 :   srcReg4x = _mm256_castsi128_si256(
     983           0 :       _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));
     984             : 
     985             :   // have consecutive loads on the same 256 register
     986           0 :   srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
     987             : 
     988           0 :   srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
     989             : 
     990           0 :   for (i = output_height; i > 1; i -= 2) {
     991             :     // load the last 2 loads of 16 bytes and have every two
     992             :     // consecutive loads in the same 256 bit register
     993           0 :     srcReg5x = _mm256_castsi128_si256(
     994           0 :         _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
     995             :     srcReg45 =
     996           0 :         _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
     997             : 
     998           0 :     srcReg6x = _mm256_castsi128_si256(
     999           0 :         _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
    1000             :     srcReg56 =
    1001           0 :         _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
    1002             : 
    1003             :     // merge every two consecutive registers
    1004           0 :     srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
    1005             : 
    1006             :     // multiply 2 adjacent elements with the filter and add the result
    1007           0 :     resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters);
    1008           0 :     resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters);
    1009             : 
    1010             :     // add and saturate the results together
    1011           0 :     resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo);
    1012             : 
    1013             :     // shift by 6 bit each 16 bit
    1014           0 :     resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
    1015           0 :     resReglo = _mm256_srai_epi16(resReglo, 6);
    1016             : 
    1017             :     // shrink to 8 bit each 16 bits, the first lane contain the first
    1018             :     // convolve result and the second lane contain the second convolve
    1019             :     // result
    1020           0 :     resReg = _mm256_packus_epi16(resReglo, resReglo);
    1021             : 
    1022           0 :     src_ptr += src_stride;
    1023             : 
    1024           0 :     xx_storeu2_epi64(output_ptr, out_pitch, &resReg);
    1025             : 
    1026           0 :     output_ptr += dst_stride;
    1027             : 
    1028             :     // save part of the registers for next strides
    1029           0 :     srcReg23_34_lo = srcReg45_56_lo;
    1030           0 :     srcReg4x = srcReg6x;
    1031             :   }
    1032           0 : }
    1033             : 
    1034    11202800 : static void aom_filter_block1d8_v8_avx2(
    1035             :     const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    1036             :     ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
    1037             :   __m128i filtersReg;
    1038             :   __m256i addFilterReg32;
    1039             :   __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
    1040             :   __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
    1041             :   __m256i srcReg32b11, srcReg32b12, filtersReg32;
    1042             :   __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
    1043             :   unsigned int i;
    1044             :   ptrdiff_t src_stride, dst_stride;
    1045             : 
    1046    11202800 :   addFilterReg32 = _mm256_set1_epi16(32);
    1047    11202800 :   filtersReg = _mm_loadu_si128((const __m128i *)filter);
    1048             :   // converting the 16 bit (short) to  8 bit (byte) and have the
    1049             :   // same data in both lanes of 128 bit register.
    1050    11202800 :   filtersReg = _mm_srai_epi16(filtersReg, 1);
    1051    11202800 :   filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
    1052             :   // have the same data in both lanes of a 256 bit register
    1053    11202800 :   filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
    1054             : 
    1055             :   // duplicate only the first 16 bits (first and second byte)
    1056             :   // across 256 bit register
    1057    22405600 :   firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
    1058             :   // duplicate only the second 16 bits (third and forth byte)
    1059             :   // across 256 bit register
    1060    22405600 :   secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
    1061             :   // duplicate only the third 16 bits (fifth and sixth byte)
    1062             :   // across 256 bit register
    1063    22405600 :   thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
    1064             :   // duplicate only the forth 16 bits (seventh and eighth byte)
    1065             :   // across 256 bit register
    1066    11202800 :   forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
    1067             : 
    1068             :   // multiple the size of the source and destination stride by two
    1069    11202800 :   src_stride = src_pitch << 1;
    1070    11202800 :   dst_stride = out_pitch << 1;
    1071             : 
    1072             :   // load 16 bytes 7 times in stride of src_pitch
    1073    11202800 :   srcReg32b1 = xx_loadu2_epi64(src_ptr + src_pitch, src_ptr);
    1074             :   srcReg32b3 =
    1075    11203700 :       xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
    1076             :   srcReg32b5 =
    1077    11203200 :       xx_loadu2_epi64(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4);
    1078    11202300 :   srcReg32b7 = _mm256_castsi128_si256(
    1079    11202300 :       _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
    1080             : 
    1081             :   // have each consecutive loads on the same 256 register
    1082    11202300 :   srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21);
    1083    11202300 :   srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21);
    1084    11202300 :   srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21);
    1085             :   // merge every two consecutive registers except the last one
    1086    22404600 :   srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
    1087    11202300 :   srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
    1088    11202300 :   srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
    1089             : 
    1090    93391700 :   for (i = output_height; i > 1; i -= 2) {
    1091             :     // load the last 2 loads of 16 bytes and have every two
    1092             :     // consecutive loads in the same 256 bit register
    1093   164351000 :     srcReg32b8 = _mm256_castsi128_si256(
    1094    82175500 :         _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)));
    1095    82175500 :     srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
    1096             :                                          _mm256_castsi256_si128(srcReg32b8), 1);
    1097   164351000 :     srcReg32b9 = _mm256_castsi128_si256(
    1098    82175500 :         _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 8)));
    1099    82175500 :     srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
    1100             :                                          _mm256_castsi256_si128(srcReg32b9), 1);
    1101             : 
    1102             :     // merge every two consecutive registers
    1103             :     // save
    1104    82175500 :     srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
    1105             : 
    1106             :     // multiply 2 adjacent elements with the filter and add the result
    1107    82175500 :     srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
    1108    82175500 :     srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
    1109             : 
    1110             :     // add and saturate the results together
    1111    82175500 :     srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
    1112             : 
    1113             :     // multiply 2 adjacent elements with the filter and add the result
    1114    82175500 :     srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
    1115    82175500 :     srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
    1116             : 
    1117             :     // add and saturate the results together
    1118   164351000 :     srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
    1119             :                                     _mm256_adds_epi16(srcReg32b8, srcReg32b12));
    1120             : 
    1121             :     // shift by 6 bit each 16 bit
    1122    82175500 :     srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32);
    1123    82175500 :     srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6);
    1124             : 
    1125             :     // shrink to 8 bit each 16 bits, the first lane contain the first
    1126             :     // convolve result and the second lane contain the second convolve
    1127             :     // result
    1128    82175500 :     srcReg32b1 = _mm256_packus_epi16(srcReg32b10, _mm256_setzero_si256());
    1129             : 
    1130    82175500 :     src_ptr += src_stride;
    1131             : 
    1132    82175500 :     xx_storeu2_epi64(output_ptr, out_pitch, &srcReg32b1);
    1133             : 
    1134    82189400 :     output_ptr += dst_stride;
    1135             : 
    1136             :     // save part of the registers for next strides
    1137    82189400 :     srcReg32b10 = srcReg32b11;
    1138    82189400 :     srcReg32b11 = srcReg32b2;
    1139    82189400 :     srcReg32b2 = srcReg32b4;
    1140    82189400 :     srcReg32b7 = srcReg32b9;
    1141             :   }
    1142    11216200 :   if (i > 0) {
    1143             :     __m128i srcRegFilt1, srcRegFilt4, srcRegFilt6, srcRegFilt8;
    1144             :     // load the last 16 bytes
    1145           0 :     srcRegFilt8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
    1146             : 
    1147             :     // merge the last 2 results together
    1148             :     srcRegFilt4 =
    1149           0 :         _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
    1150             : 
    1151             :     // multiply 2 adjacent elements with the filter and add the result
    1152           0 :     srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
    1153             :                                     _mm256_castsi256_si128(firstFilters));
    1154             :     srcRegFilt4 =
    1155           0 :         _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters));
    1156             : 
    1157             :     // add and saturate the results together
    1158           0 :     srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
    1159             : 
    1160             :     // multiply 2 adjacent elements with the filter and add the result
    1161           0 :     srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
    1162             :                                     _mm256_castsi256_si128(secondFilters));
    1163             : 
    1164             :     // multiply 2 adjacent elements with the filter and add the result
    1165           0 :     srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
    1166             :                                     _mm256_castsi256_si128(thirdFilters));
    1167             : 
    1168             :     // add and saturate the results together
    1169             :     srcRegFilt1 =
    1170           0 :         _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6));
    1171             : 
    1172             :     // shift by 6 bit each 16 bit
    1173             :     srcRegFilt1 =
    1174           0 :         _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32));
    1175           0 :     srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6);
    1176             : 
    1177             :     // shrink to 8 bit each 16 bits, the first lane contain the first
    1178             :     // convolve result and the second lane contain the second convolve result
    1179           0 :     srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, _mm_setzero_si128());
    1180             : 
    1181             :     // save 8 bytes
    1182           0 :     _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1);
    1183             :   }
    1184    11216200 : }
    1185             : 
    1186           0 : static void aom_filter_block1d16_v4_avx2(
    1187             :     const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    1188             :     ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
    1189             :   __m128i filtersReg;
    1190             :   __m256i filtersReg32, addFilterReg32;
    1191             :   __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
    1192             :   __m256i srcReg23_34_lo, srcReg23_34_hi, srcReg45_56_lo, srcReg45_56_hi;
    1193             :   __m256i resReg23_34_lo, resReg23_34_hi, resReg45_56_lo, resReg45_56_hi;
    1194             :   __m256i resReglo, resReghi, resReg;
    1195             :   __m256i secondFilters, thirdFilters;
    1196             :   unsigned int i;
    1197             :   ptrdiff_t src_stride, dst_stride;
    1198             : 
    1199           0 :   addFilterReg32 = _mm256_set1_epi16(32);
    1200           0 :   filtersReg = _mm_loadu_si128((const __m128i *)filter);
    1201             :   // converting the 16 bit (short) to  8 bit (byte) and have the
    1202             :   // same data in both lanes of 128 bit register.
    1203           0 :   filtersReg = _mm_srai_epi16(filtersReg, 1);
    1204           0 :   filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
    1205             :   // have the same data in both lanes of a 256 bit register
    1206           0 :   filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
    1207             : 
    1208             :   // duplicate only the second 16 bits (third and forth byte)
    1209             :   // across 256 bit register
    1210           0 :   secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
    1211             :   // duplicate only the third 16 bits (fifth and sixth byte)
    1212             :   // across 256 bit register
    1213           0 :   thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
    1214             : 
    1215             :   // multiple the size of the source and destination stride by two
    1216           0 :   src_stride = src_pitch << 1;
    1217           0 :   dst_stride = out_pitch << 1;
    1218             : 
    1219           0 :   srcReg23 = xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
    1220           0 :   srcReg4x = _mm256_castsi128_si256(
    1221           0 :       _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
    1222             : 
    1223             :   // have consecutive loads on the same 256 register
    1224           0 :   srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
    1225             : 
    1226           0 :   srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
    1227           0 :   srcReg23_34_hi = _mm256_unpackhi_epi8(srcReg23, srcReg34);
    1228             : 
    1229           0 :   for (i = output_height; i > 1; i -= 2) {
    1230             :     // load the last 2 loads of 16 bytes and have every two
    1231             :     // consecutive loads in the same 256 bit register
    1232           0 :     srcReg5x = _mm256_castsi128_si256(
    1233           0 :         _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
    1234             :     srcReg45 =
    1235           0 :         _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
    1236             : 
    1237           0 :     srcReg6x = _mm256_castsi128_si256(
    1238           0 :         _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
    1239             :     srcReg56 =
    1240           0 :         _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
    1241             : 
    1242             :     // merge every two consecutive registers
    1243           0 :     srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
    1244           0 :     srcReg45_56_hi = _mm256_unpackhi_epi8(srcReg45, srcReg56);
    1245             : 
    1246             :     // multiply 2 adjacent elements with the filter and add the result
    1247           0 :     resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters);
    1248           0 :     resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters);
    1249             : 
    1250             :     // add and saturate the results together
    1251           0 :     resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo);
    1252             : 
    1253             :     // multiply 2 adjacent elements with the filter and add the result
    1254           0 :     resReg23_34_hi = _mm256_maddubs_epi16(srcReg23_34_hi, secondFilters);
    1255           0 :     resReg45_56_hi = _mm256_maddubs_epi16(srcReg45_56_hi, thirdFilters);
    1256             : 
    1257             :     // add and saturate the results together
    1258           0 :     resReghi = _mm256_adds_epi16(resReg23_34_hi, resReg45_56_hi);
    1259             : 
    1260             :     // shift by 6 bit each 16 bit
    1261           0 :     resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
    1262           0 :     resReghi = _mm256_adds_epi16(resReghi, addFilterReg32);
    1263           0 :     resReglo = _mm256_srai_epi16(resReglo, 6);
    1264           0 :     resReghi = _mm256_srai_epi16(resReghi, 6);
    1265             : 
    1266             :     // shrink to 8 bit each 16 bits, the first lane contain the first
    1267             :     // convolve result and the second lane contain the second convolve
    1268             :     // result
    1269           0 :     resReg = _mm256_packus_epi16(resReglo, resReghi);
    1270             : 
    1271           0 :     src_ptr += src_stride;
    1272             : 
    1273           0 :     xx_store2_mi128(output_ptr, out_pitch, &resReg);
    1274             : 
    1275           0 :     output_ptr += dst_stride;
    1276             : 
    1277             :     // save part of the registers for next strides
    1278           0 :     srcReg23_34_lo = srcReg45_56_lo;
    1279           0 :     srcReg23_34_hi = srcReg45_56_hi;
    1280           0 :     srcReg4x = srcReg6x;
    1281             :   }
    1282           0 : }
    1283             : 
    1284    24315400 : static void aom_filter_block1d16_v8_avx2(
    1285             :     const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    1286             :     ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
    1287             :   __m128i filtersReg;
    1288             :   __m256i addFilterReg32;
    1289             :   __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
    1290             :   __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
    1291             :   __m256i srcReg32b11, srcReg32b12, filtersReg32;
    1292             :   __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
    1293             :   unsigned int i;
    1294             :   ptrdiff_t src_stride, dst_stride;
    1295             : 
    1296    24315400 :   addFilterReg32 = _mm256_set1_epi16(32);
    1297    24315400 :   filtersReg = _mm_loadu_si128((const __m128i *)filter);
    1298             :   // converting the 16 bit (short) to  8 bit (byte) and have the
    1299             :   // same data in both lanes of 128 bit register.
    1300    24315400 :   filtersReg = _mm_srai_epi16(filtersReg, 1);
    1301    24315400 :   filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
    1302             :   // have the same data in both lanes of a 256 bit register
    1303    24315400 :   filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
    1304             : 
    1305             :   // duplicate only the first 16 bits (first and second byte)
    1306             :   // across 256 bit register
    1307    48630900 :   firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
    1308             :   // duplicate only the second 16 bits (third and forth byte)
    1309             :   // across 256 bit register
    1310    48630900 :   secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
    1311             :   // duplicate only the third 16 bits (fifth and sixth byte)
    1312             :   // across 256 bit register
    1313    48630900 :   thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
    1314             :   // duplicate only the forth 16 bits (seventh and eighth byte)
    1315             :   // across 256 bit register
    1316    24315400 :   forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
    1317             : 
    1318             :   // multiple the size of the source and destination stride by two
    1319    24315400 :   src_stride = src_pitch << 1;
    1320    24315400 :   dst_stride = out_pitch << 1;
    1321             : 
    1322             :   // load 16 bytes 7 times in stride of src_pitch
    1323    24315400 :   srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pitch, src_ptr);
    1324             :   srcReg32b3 =
    1325    24320200 :       xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
    1326             :   srcReg32b5 =
    1327    24317800 :       xx_loadu2_mi128(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4);
    1328    24311400 :   srcReg32b7 = _mm256_castsi128_si256(
    1329    24311400 :       _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
    1330             : 
    1331             :   // have each consecutive loads on the same 256 register
    1332    24311400 :   srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21);
    1333    24311400 :   srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21);
    1334    24311400 :   srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21);
    1335             :   // merge every two consecutive registers except the last one
    1336    24311400 :   srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
    1337    48622800 :   srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);
    1338             : 
    1339             :   // save
    1340    24311400 :   srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
    1341    24311400 :   srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);
    1342    24311400 :   srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
    1343    24311400 :   srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);
    1344             : 
    1345   317356000 :   for (i = output_height; i > 1; i -= 2) {
    1346             :     // load the last 2 loads of 16 bytes and have every two
    1347             :     // consecutive loads in the same 256 bit register
    1348   585390000 :     srcReg32b8 = _mm256_castsi128_si256(
    1349   292695000 :         _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
    1350   292695000 :     srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
    1351             :                                          _mm256_castsi256_si128(srcReg32b8), 1);
    1352   585390000 :     srcReg32b9 = _mm256_castsi128_si256(
    1353   292695000 :         _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
    1354   292695000 :     srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
    1355             :                                          _mm256_castsi256_si128(srcReg32b9), 1);
    1356             : 
    1357             :     // merge every two consecutive registers
    1358             :     // save
    1359   292695000 :     srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
    1360   292695000 :     srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);
    1361             : 
    1362             :     // multiply 2 adjacent elements with the filter and add the result
    1363   292695000 :     srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
    1364   292695000 :     srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
    1365             : 
    1366             :     // add and saturate the results together
    1367   292695000 :     srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
    1368             : 
    1369             :     // multiply 2 adjacent elements with the filter and add the result
    1370   292695000 :     srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
    1371   292695000 :     srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
    1372             : 
    1373             :     // add and saturate the results together
    1374   292695000 :     srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
    1375             :                                     _mm256_adds_epi16(srcReg32b8, srcReg32b12));
    1376             : 
    1377             :     // multiply 2 adjacent elements with the filter and add the result
    1378   585390000 :     srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
    1379   292695000 :     srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
    1380             : 
    1381   585390000 :     srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);
    1382             : 
    1383             :     // multiply 2 adjacent elements with the filter and add the result
    1384   292695000 :     srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
    1385   292695000 :     srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);
    1386             : 
    1387             :     // add and saturate the results together
    1388   585390000 :     srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
    1389             :                                    _mm256_adds_epi16(srcReg32b8, srcReg32b12));
    1390             : 
    1391             :     // shift by 6 bit each 16 bit
    1392   292695000 :     srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32);
    1393   585390000 :     srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg32);
    1394   292695000 :     srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6);
    1395   292695000 :     srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 6);
    1396             : 
    1397             :     // shrink to 8 bit each 16 bits, the first lane contain the first
    1398             :     // convolve result and the second lane contain the second convolve
    1399             :     // result
    1400   292695000 :     srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);
    1401             : 
    1402   292695000 :     src_ptr += src_stride;
    1403             : 
    1404   292695000 :     xx_store2_mi128(output_ptr, out_pitch, &srcReg32b1);
    1405             : 
    1406   293044000 :     output_ptr += dst_stride;
    1407             : 
    1408             :     // save part of the registers for next strides
    1409   293044000 :     srcReg32b10 = srcReg32b11;
    1410   293044000 :     srcReg32b1 = srcReg32b3;
    1411   293044000 :     srcReg32b11 = srcReg32b2;
    1412   293044000 :     srcReg32b3 = srcReg32b5;
    1413   293044000 :     srcReg32b2 = srcReg32b4;
    1414   293044000 :     srcReg32b5 = srcReg32b7;
    1415   293044000 :     srcReg32b7 = srcReg32b9;
    1416             :   }
    1417    24661000 :   if (i > 0) {
    1418             :     __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
    1419             :     __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
    1420             :     // load the last 16 bytes
    1421           0 :     srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
    1422             : 
    1423             :     // merge the last 2 results together
    1424             :     srcRegFilt4 =
    1425           0 :         _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
    1426             :     srcRegFilt7 =
    1427           0 :         _mm_unpackhi_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
    1428             : 
    1429             :     // multiply 2 adjacent elements with the filter and add the result
    1430           0 :     srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
    1431             :                                     _mm256_castsi256_si128(firstFilters));
    1432             :     srcRegFilt4 =
    1433           0 :         _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters));
    1434           0 :     srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
    1435             :                                     _mm256_castsi256_si128(firstFilters));
    1436             :     srcRegFilt7 =
    1437           0 :         _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters));
    1438             : 
    1439             :     // add and saturate the results together
    1440           0 :     srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
    1441           0 :     srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);
    1442             : 
    1443             :     // multiply 2 adjacent elements with the filter and add the result
    1444           0 :     srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
    1445             :                                     _mm256_castsi256_si128(secondFilters));
    1446           0 :     srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
    1447             :                                     _mm256_castsi256_si128(secondFilters));
    1448             : 
    1449             :     // multiply 2 adjacent elements with the filter and add the result
    1450           0 :     srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
    1451             :                                     _mm256_castsi256_si128(thirdFilters));
    1452           0 :     srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
    1453             :                                     _mm256_castsi256_si128(thirdFilters));
    1454             : 
    1455             :     // add and saturate the results together
    1456             :     srcRegFilt1 =
    1457           0 :         _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6));
    1458             :     srcRegFilt3 =
    1459           0 :         _mm_adds_epi16(srcRegFilt3, _mm_adds_epi16(srcRegFilt5, srcRegFilt7));
    1460             : 
    1461             :     // shift by 6 bit each 16 bit
    1462             :     srcRegFilt1 =
    1463           0 :         _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32));
    1464             :     srcRegFilt3 =
    1465           0 :         _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg32));
    1466           0 :     srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6);
    1467           0 :     srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 6);
    1468             : 
    1469             :     // shrink to 8 bit each 16 bits, the first lane contain the first
    1470             :     // convolve result and the second lane contain the second convolve
    1471             :     // result
    1472           0 :     srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);
    1473             : 
    1474             :     // save 16 bytes
    1475             :     _mm_store_si128((__m128i *)output_ptr, srcRegFilt1);
    1476             :   }
    1477    24661000 : }
    1478             : 
    1479           0 : static void aom_filter_block1d4_v4_avx2(
    1480             :     const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    1481             :     ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
    1482             :   __m128i filtersReg;
    1483             :   __m256i filtersReg32, addFilterReg32;
    1484             :   __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
    1485             :   __m256i srcReg23_34_lo, srcReg45_56_lo;
    1486             :   __m256i srcReg2345_3456_lo;
    1487             :   __m256i resReglo, resReg;
    1488             :   __m256i firstFilters;
    1489             :   unsigned int i;
    1490             :   ptrdiff_t src_stride, dst_stride;
    1491             : 
    1492           0 :   addFilterReg32 = _mm256_set1_epi16(32);
    1493           0 :   filtersReg = _mm_loadu_si128((const __m128i *)filter);
    1494             :   // converting the 16 bit (short) to  8 bit (byte) and have the
    1495             :   // same data in both lanes of 128 bit register.
    1496           0 :   filtersReg = _mm_srai_epi16(filtersReg, 1);
    1497           0 :   filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
    1498             :   // have the same data in both lanes of a 256 bit register
    1499           0 :   filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
    1500             : 
    1501             :   firstFilters =
    1502           0 :       _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u));
    1503             : 
    1504             :   // multiple the size of the source and destination stride by two
    1505           0 :   src_stride = src_pitch << 1;
    1506           0 :   dst_stride = out_pitch << 1;
    1507             : 
    1508           0 :   srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
    1509           0 :   srcReg4x = _mm256_castsi128_si256(
    1510           0 :       _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));
    1511             : 
    1512             :   // have consecutive loads on the same 256 register
    1513           0 :   srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
    1514             : 
    1515           0 :   srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
    1516             : 
    1517           0 :   for (i = output_height; i > 1; i -= 2) {
    1518             :     // load the last 2 loads of 16 bytes and have every two
    1519             :     // consecutive loads in the same 256 bit register
    1520           0 :     srcReg5x = _mm256_castsi128_si256(
    1521           0 :         _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
    1522             :     srcReg45 =
    1523           0 :         _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
    1524             : 
    1525           0 :     srcReg6x = _mm256_castsi128_si256(
    1526           0 :         _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
    1527             :     srcReg56 =
    1528           0 :         _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
    1529             : 
    1530             :     // merge every two consecutive registers
    1531           0 :     srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
    1532             : 
    1533           0 :     srcReg2345_3456_lo = _mm256_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo);
    1534             : 
    1535             :     // multiply 2 adjacent elements with the filter and add the result
    1536           0 :     resReglo = _mm256_maddubs_epi16(srcReg2345_3456_lo, firstFilters);
    1537             : 
    1538           0 :     resReglo = _mm256_hadds_epi16(resReglo, _mm256_setzero_si256());
    1539             : 
    1540             :     // shift by 6 bit each 16 bit
    1541           0 :     resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
    1542           0 :     resReglo = _mm256_srai_epi16(resReglo, 6);
    1543             : 
    1544             :     // shrink to 8 bit each 16 bits, the first lane contain the first
    1545             :     // convolve result and the second lane contain the second convolve
    1546             :     // result
    1547           0 :     resReg = _mm256_packus_epi16(resReglo, resReglo);
    1548             : 
    1549           0 :     src_ptr += src_stride;
    1550             : 
    1551           0 :     xx_storeu2_epi32(output_ptr, out_pitch, &resReg);
    1552             : 
    1553           0 :     output_ptr += dst_stride;
    1554             : 
    1555             :     // save part of the registers for next strides
    1556           0 :     srcReg23_34_lo = srcReg45_56_lo;
    1557           0 :     srcReg4x = srcReg6x;
    1558             :   }
    1559           0 : }
    1560             : 
    1561             : #if 1//HAVE_AVX2 && HAVE_SSSE3
    1562             : // void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
    1563             : //                                uint8_t *dst, ptrdiff_t dst_stride,
    1564             : //                                const int16_t *filter_x, int x_step_q4,
    1565             : //                                const int16_t *filter_y, int y_step_q4,
    1566             : //                                int w, int h);
    1567             : // void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
    1568             : //                               uint8_t *dst, ptrdiff_t dst_stride,
    1569             : //                               const int16_t *filter_x, int x_step_q4,
    1570             : //                               const int16_t *filter_y, int y_step_q4,
    1571             : //                               int w, int h);
    1572    86833700 : FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
    1573    87282400 : FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
    1574             : 
    1575             : #endif  // HAVE_AX2 && HAVE_SSSE3

Generated by: LCOV version 1.14