LCOV - code coverage report
Current view: top level - ASM_AVX2 - convolve_avx2.h (source / functions) Hit Total Coverage
Test: coverage.info Lines: 883 1052 83.9 %
Date: 2019-11-25 17:38:06 Functions: 124 148 83.8 %

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2018, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #ifndef AOM_DSP_X86_CONVOLVE_AVX2_H_
      13             : #define AOM_DSP_X86_CONVOLVE_AVX2_H_
      14             : 
      15             : #include "convolve.h"
      16             : #include "EbDefinitions.h"
      17             : #include "EbInterPrediction.h"
      18             : #include "EbMemory_AVX2.h"
      19             : #include "EbMemory_SSE4_1.h"
      20             : #include "synonyms.h"
      21             : #include "synonyms_avx2.h"
      22             : 
// Byte-pair shuffle masks for _mm(256)_shuffle_epi8 in the 16-wide horizontal
// filter path. filtN gathers, for each of 8 output positions, the adjacent
// source-pixel pair that multiplies filter taps (2N-2, 2N-1); the 16-byte
// pattern is duplicated so both 128-bit lanes of a 256-bit register shuffle
// identically.
DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = {
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = {
    2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
    2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 };

DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
    4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
    4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 };

DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
    6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
    6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 };
      39             : 
      40   732082200 : static INLINE EbBool is_convolve_2tap(const int16_t *const filter) {
      41   732082200 :     return (EbBool)((InterpKernel *)filter == bilinear_filters);
      42             : }
      43             : 
      44   461094000 : static INLINE EbBool is_convolve_4tap(const int16_t *const filter) {
      45   461094000 :     return (EbBool)(((InterpKernel *)filter == sub_pel_filters_4) ||
      46             :         ((InterpKernel *)filter == sub_pel_filters_4smooth));
      47             : }
      48             : 
      49   424751300 : static INLINE EbBool is_convolve_6tap(const int16_t *const filter) {
      50   424751300 :     return (EbBool)(((InterpKernel *)filter == sub_pel_filters_8) ||
      51             :         ((InterpKernel *)filter == sub_pel_filters_8smooth));
      52             : }
      53             : 
      54   668794200 : static INLINE int32_t get_convolve_tap(const int16_t *const filter) {
      55   668794200 :     if (is_convolve_2tap(filter))
      56   266024100 :         return 2;
      57   402985700 :     else if (is_convolve_4tap(filter))
      58    31042641 :         return 4;
      59   374174300 :     else if (is_convolve_6tap(filter))
      60   314499300 :         return 6;
      61             :     else
      62    60109160 :         return 8;
      63             : }
      64             : 
// Build the halved 2-tap coefficient register for the SSSE3 maddubs path.
// Reads the two 16-bit taps at positions 3 and 4 of the selected kernel,
// halves them, and broadcasts the pair across all 8 lanes.
static INLINE void prepare_half_coeffs_2tap_ssse3(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m128i *const coeffs /* [1] */) {
    const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
        *filter_params, subpel_q4 & SUBPEL_MASK);
    // Load taps 3 and 4 as one 32-bit word.
    const __m128i coeffs_8 = _mm_cvtsi32_si128(*(const int32_t *)(filter + 3));

    // right shift all filter coefficients by 1 to reduce the bits required.
    // This extra right shift will be taken care of at the end while rounding
    // the result.
    // Since all filter coefficients are even, this change will not affect the
    // end result
    assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
        _mm_set1_epi16((short)0xffff)));

    const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);

    // coeffs 3 4 3 4 3 4 3 4
    *coeffs = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
}
      85             : 
// Build the halved 4-tap coefficient registers for the SSSE3 maddubs path:
// halve all taps, then broadcast pairs (2,3) and (4,5) into coeffs[0..1].
static INLINE void prepare_half_coeffs_4tap_ssse3(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m128i *const coeffs /* [2] */) {
    const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
        *filter_params, subpel_q4 & SUBPEL_MASK);
    // Aligned load of all eight 16-bit taps of the kernel.
    const __m128i coeffs_8 = _mm_load_si128((__m128i *)filter);

    // right shift all filter coefficients by 1 to reduce the bits required.
    // This extra right shift will be taken care of at the end while rounding
    // the result.
    // Since all filter coefficients are even, this change will not affect the
    // end result
    assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
        _mm_set1_epi16((short)0xffff)));

    const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);

    // coeffs 2 3 2 3 2 3 2 3
    coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
    // coeffs 4 5 4 5 4 5 4 5
    coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
}
     108             : 
// Build the halved 6-tap coefficient registers for the SSSE3 maddubs path:
// halve all taps, then broadcast pairs (1,2), (3,4) and (5,6) into
// coeffs[0..2] (taps 0 and 7 are zero for these kernels).
static INLINE void prepare_half_coeffs_6tap_ssse3(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m128i *const coeffs /* [3] */) {
    const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
        *filter_params, subpel_q4 & SUBPEL_MASK);
    const __m128i coeffs_8 = _mm_load_si128((__m128i *)filter);

    // right shift all filter coefficients by 1 to reduce the bits required.
    // This extra right shift will be taken care of at the end while rounding
    // the result.
    // Since all filter coefficients are even, this change will not affect the
    // end result
    assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
        _mm_set1_epi16((short)0xffff)));

    const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);

    // coeffs 1 2 1 2 1 2 1 2
    coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0402u));
    // coeffs 3 4 3 4 3 4 3 4
    coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u));
    // coeffs 5 6 5 6 5 6 5 6
    coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0C0Au));
}
     133             : 
// Build the halved 8-tap coefficient registers for the SSSE3 maddubs path:
// halve all taps, then broadcast pairs (0,1), (2,3), (4,5) and (6,7) into
// coeffs[0..3].
static INLINE void prepare_half_coeffs_8tap_ssse3(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m128i *const coeffs /* [4] */) {
    const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
        *filter_params, subpel_q4 & SUBPEL_MASK);
    const __m128i coeffs_8 = _mm_load_si128((__m128i *)filter);

    // right shift all filter coefficients by 1 to reduce the bits required.
    // This extra right shift will be taken care of at the end while rounding
    // the result.
    // Since all filter coefficients are even, this change will not affect the
    // end result
    assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
        _mm_set1_epi16((short)0xffff)));

    const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);

    // coeffs 0 1 0 1 0 1 0 1
    coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
    // coeffs 2 3 2 3 2 3 2 3
    coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
    // coeffs 4 5 4 5 4 5 4 5
    coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
    // coeffs 6 7 6 7 6 7 6 7
    coeffs[3] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0e0cu));
}
     160             : 
// AVX2 variant of prepare_half_coeffs_2tap_ssse3: same halved tap pair
// (taps 3,4), broadcast across all 16 lanes of a 256-bit register.
static INLINE void prepare_half_coeffs_2tap_avx2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m256i *const coeffs /* [1] */) {
    const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
        *filter_params, subpel_q4 & SUBPEL_MASK);
    // Load taps 3 and 4 as one 32-bit word, then duplicate to both lanes.
    const __m128i coeffs_8 = _mm_cvtsi32_si128(*(const int32_t *)(filter + 3));
    const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);

    // right shift all filter coefficients by 1 to reduce the bits required.
    // This extra right shift will be taken care of at the end while rounding
    // the result.
    // Since all filter coefficients are even, this change will not affect the
    // end result
    assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
        _mm_set1_epi16((short)0xffff)));

    const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);

    // coeffs 3 4 3 4 3 4 3 4
    *coeffs = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
}
     182             : 
// AVX2 variant of prepare_half_coeffs_4tap_ssse3: halved tap pairs (2,3)
// and (4,5), each broadcast across both 128-bit lanes.
static INLINE void prepare_half_coeffs_4tap_avx2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m256i *const coeffs /* [2] */) {
    const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
        *filter_params, subpel_q4 & SUBPEL_MASK);
    const __m128i coeffs_8 = _mm_load_si128((__m128i *)filter);
    const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);

    // right shift all filter coefficients by 1 to reduce the bits required.
    // This extra right shift will be taken care of at the end while rounding
    // the result.
    // Since all filter coefficients are even, this change will not affect the
    // end result
    assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
        _mm_set1_epi16((short)0xffff)));

    const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);

    // coeffs 2 3 2 3 2 3 2 3
    coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u));
    // coeffs 4 5 4 5 4 5 4 5
    coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u));
}
     206             : 
// AVX2 variant of prepare_half_coeffs_6tap_ssse3: halved tap pairs (1,2),
// (3,4) and (5,6), each broadcast across both 128-bit lanes.
static INLINE void prepare_half_coeffs_6tap_avx2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m256i *const coeffs /* [3] */) {
    const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
        *filter_params, subpel_q4 & SUBPEL_MASK);
    const __m128i coeffs_8 = _mm_load_si128((__m128i *)filter);
    const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);

    // right shift all filter coefficients by 1 to reduce the bits required.
    // This extra right shift will be taken care of at the end while rounding
    // the result.
    // Since all filter coefficients are even, this change will not affect the
    // end result
    assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
        _mm_set1_epi16((short)0xffff)));

    const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);

    // coeffs 1 2 1 2 1 2 1 2
    coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0402u));
    // coeffs 3 4 3 4 3 4 3 4
    coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u));
    // coeffs 5 6 5 6 5 6 5 6
    coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0C0Au));
}
     232             : 
// AVX2 variant of prepare_half_coeffs_8tap_ssse3: halved tap pairs (0,1),
// (2,3), (4,5) and (6,7), each broadcast across both 128-bit lanes.
static INLINE void prepare_half_coeffs_8tap_avx2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m256i *const coeffs /* [4] */) {
    const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
        *filter_params, subpel_q4 & SUBPEL_MASK);
    const __m128i coeffs_8 = _mm_load_si128((__m128i *)filter);
    const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);

    // right shift all filter coefficients by 1 to reduce the bits required.
    // This extra right shift will be taken care of at the end while rounding
    // the result.
    // Since all filter coefficients are even, this change will not affect the
    // end result
    assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
        _mm_set1_epi16((short)0xffff)));

    const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);

    // coeffs 0 1 0 1 0 1 0 1
    coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
    // coeffs 2 3 2 3 2 3 2 3
    coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u));
    // coeffs 4 5 4 5 4 5 4 5
    coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u));
    // coeffs 6 7 6 7 6 7 6 7
    coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu));
}
     260             : 
// Full-precision 2-tap coefficient setup (no halving — presumably consumed
// by the 16-bit madd path; verify against callers). Broadcasts taps (3,4)
// as a 32-bit unit across all four dwords.
static INLINE void prepare_coeffs_2tap_sse2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m128i *const coeffs /* [1] */) {
    const int16_t *filter = av1_get_interp_filter_subpel_kernel(
        *filter_params, subpel_q4 & SUBPEL_MASK);

    const __m128i coeff = _mm_cvtsi32_si128(*(const int32_t *)(filter + 3));

    // coeffs 3 4 3 4 3 4 3 4
    coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
}
     272             : 
// Full-precision 4-tap coefficient setup (no halving). Broadcasts tap
// pairs (2,3) and (4,5) as 32-bit units into coeffs[0..1].
static INLINE void prepare_coeffs_4tap_sse2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m128i *const coeffs /* [2] */) {
    const int16_t *filter = av1_get_interp_filter_subpel_kernel(
        *filter_params, subpel_q4 & SUBPEL_MASK);

    const __m128i coeff = _mm_load_si128((__m128i *)filter);

    // coeffs 2 3 2 3 2 3 2 3
    coeffs[0] = _mm_shuffle_epi32(coeff, 0x55);
    // coeffs 4 5 4 5 4 5 4 5
    coeffs[1] = _mm_shuffle_epi32(coeff, 0xaa);
}
     286             : 
// Full-precision 6-tap coefficient setup (no halving). The odd byte offsets
// require pshufb (SSSE3) rather than pshufd; broadcasts tap pairs (1,2),
// (3,4) and (5,6) as 32-bit units into coeffs[0..2].
static INLINE void prepare_coeffs_6tap_ssse3(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m128i *const coeffs /* [3] */) {
    const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
        *filter_params, subpel_q4 & SUBPEL_MASK);
    const __m128i coeff = _mm_load_si128((__m128i *)filter);

    // coeffs 1 2 1 2 1 2 1 2
    coeffs[0] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x05040302u));
    // coeffs 3 4 3 4 3 4 3 4
    coeffs[1] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x09080706u));
    // coeffs 5 6 5 6 5 6 5 6
    coeffs[2] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x0D0C0B0Au));
}
     301             : 
// Full-precision 8-tap coefficient setup (no halving). Broadcasts tap
// pairs (0,1), (2,3), (4,5) and (6,7) as 32-bit units into coeffs[0..3].
static INLINE void prepare_coeffs_8tap_sse2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m128i *const coeffs /* [4] */) {
    const int16_t *filter = av1_get_interp_filter_subpel_kernel(
        *filter_params, subpel_q4 & SUBPEL_MASK);

    const __m128i coeff = _mm_load_si128((__m128i *)filter);

    // coeffs 0 1 0 1 0 1 0 1
    coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
    // coeffs 2 3 2 3 2 3 2 3
    coeffs[1] = _mm_shuffle_epi32(coeff, 0x55);
    // coeffs 4 5 4 5 4 5 4 5
    coeffs[2] = _mm_shuffle_epi32(coeff, 0xaa);
    // coeffs 6 7 6 7 6 7 6 7
    coeffs[3] = _mm_shuffle_epi32(coeff, 0xff);
}
     319             : 
// AVX2 variant of prepare_coeffs_2tap_sse2: full-precision taps (3,4)
// broadcast across all eight dwords of a 256-bit register.
static INLINE void prepare_coeffs_2tap_avx2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m256i *const coeffs /* [1] */) {
    const int16_t *filter = av1_get_interp_filter_subpel_kernel(
        *filter_params, subpel_q4 & SUBPEL_MASK);

    const __m128i coeff_8 = _mm_cvtsi32_si128(*(const int32_t *)(filter + 3));
    const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);

    // coeffs 3 4 3 4 3 4 3 4
    coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
}
     332             : 
// AVX2 variant of prepare_coeffs_4tap_sse2: full-precision tap pairs (2,3)
// and (4,5) broadcast as 32-bit units across both 128-bit lanes.
static INLINE void prepare_coeffs_4tap_avx2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m256i *const coeffs /* [2] */) {
    const int16_t *filter = av1_get_interp_filter_subpel_kernel(
        *filter_params, subpel_q4 & SUBPEL_MASK);

    const __m128i coeff_8 = _mm_load_si128((__m128i *)filter);
    const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);

    // coeffs 2 3 2 3 2 3 2 3
    coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55);
    // coeffs 4 5 4 5 4 5 4 5
    coeffs[1] = _mm256_shuffle_epi32(coeff, 0xaa);
}
     347             : 
// AVX2 variant of prepare_coeffs_6tap_ssse3: full-precision tap pairs
// (1,2), (3,4) and (5,6) broadcast as 32-bit units via byte shuffle.
static INLINE void prepare_coeffs_6tap_avx2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m256i *const coeffs /* [3] */) {
    const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
        *filter_params, subpel_q4 & SUBPEL_MASK);
    const __m128i coeffs_8 = _mm_load_si128((__m128i *)filter);
    const __m256i coeff = _mm256_broadcastsi128_si256(coeffs_8);

    // coeffs 1 2 1 2 1 2 1 2
    coeffs[0] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x05040302u));
    // coeffs 3 4 3 4 3 4 3 4
    coeffs[1] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x09080706u));
    // coeffs 5 6 5 6 5 6 5 6
    coeffs[2] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x0D0C0B0Au));
}
     363             : 
// AVX2 variant of prepare_coeffs_8tap_sse2: full-precision tap pairs
// (0,1), (2,3), (4,5) and (6,7) broadcast as 32-bit units.
static INLINE void prepare_coeffs_8tap_avx2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m256i *const coeffs /* [4] */) {
    const int16_t *filter = av1_get_interp_filter_subpel_kernel(
        *filter_params, subpel_q4 & SUBPEL_MASK);

    const __m128i coeff_8 = _mm_load_si128((__m128i *)filter);
    const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);

    // coeffs 0 1 0 1 0 1 0 1
    coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
    // coeffs 2 3 2 3 2 3 2 3
    coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
    // coeffs 4 5 4 5 4 5 4 5
    coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
    // coeffs 6 7 6 7 6 7 6 7
    coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
}
     382             : 
     383             : static INLINE void load_16bit_5rows_avx2(const int16_t *const src,
     384             :     const int32_t stride, __m256i dst[5]) {
     385             :     dst[0] = _mm256_load_si256((__m256i *)(src + 0 * stride));
     386             :     dst[1] = _mm256_load_si256((__m256i *)(src + 1 * stride));
     387             :     dst[2] = _mm256_load_si256((__m256i *)(src + 2 * stride));
     388             :     dst[3] = _mm256_load_si256((__m256i *)(src + 3 * stride));
     389             :     dst[4] = _mm256_load_si256((__m256i *)(src + 4 * stride));
     390             : }
     391             : 
     392    15250300 : static INLINE void load_16bit_7rows_avx2(const int16_t *const src,
     393             :     const int32_t stride, __m256i dst[7]) {
     394    15250300 :     dst[0] = _mm256_load_si256((__m256i *)(src + 0 * stride));
     395    15250300 :     dst[1] = _mm256_load_si256((__m256i *)(src + 1 * stride));
     396    15250300 :     dst[2] = _mm256_load_si256((__m256i *)(src + 2 * stride));
     397    15250300 :     dst[3] = _mm256_load_si256((__m256i *)(src + 3 * stride));
     398    15250300 :     dst[4] = _mm256_load_si256((__m256i *)(src + 4 * stride));
     399    15250300 :     dst[5] = _mm256_load_si256((__m256i *)(src + 5 * stride));
     400    15250300 :     dst[6] = _mm256_load_si256((__m256i *)(src + 6 * stride));
     401    15250300 : }
     402             : 
     403             : SIMD_INLINE void load_16bit_8rows_avx2(const int16_t *const src,
     404             :     const int32_t stride, __m256i dst[8]) {
     405           0 :     dst[0] = _mm256_load_si256((__m256i *)(src + 0 * stride));
     406           0 :     dst[1] = _mm256_load_si256((__m256i *)(src + 1 * stride));
     407           0 :     dst[2] = _mm256_load_si256((__m256i *)(src + 2 * stride));
     408           0 :     dst[3] = _mm256_load_si256((__m256i *)(src + 3 * stride));
     409           0 :     dst[4] = _mm256_load_si256((__m256i *)(src + 4 * stride));
     410           0 :     dst[5] = _mm256_load_si256((__m256i *)(src + 5 * stride));
     411           0 :     dst[6] = _mm256_load_si256((__m256i *)(src + 6 * stride));
     412           0 :     dst[7] = _mm256_load_si256((__m256i *)(src + 7 * stride));
     413           0 : }
     414             : 
// Load five unaligned 16-bit rows and pre-interleave adjacent row pairs for
// a vertical filter: ss_256 holds even-phase pairs (rows 0|1 and 2|3),
// tt_256 holds odd-phase pairs (rows 1|2 and 3|4). Index 2 of ss_256 and
// tt_256 is deliberately left unwritten — the caller fills it as new rows
// arrive during the row loop.
SIMD_INLINE void loadu_unpack_16bit_5rows_avx2(const int16_t *const src,
    const int32_t stride,
    __m256i s_256[5],
    __m256i ss_256[5],
    __m256i tt_256[5]) {
    s_256[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
    s_256[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
    s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
    s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
    s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));

    // Low halves of each row pair (even phase).
    ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
    ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
    ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
    ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);

    // Same interleave shifted down one row (odd phase).
    tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
    tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[4]);
    tt_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
    tt_256[4] = _mm256_unpackhi_epi16(s_256[3], s_256[4]);
}
     436             : 
// Load three unaligned 16-bit rows and pre-interleave them for a vertical
// filter: ss_256 gets rows 0|1, tt_256 gets rows 1|2 (low halves at index
// 0, high halves at index 2). Index 1 of both arrays is left for the
// caller to fill as rows advance.
SIMD_INLINE void loadu_unpack_16bit_3rows_avx2(const int16_t *const src,
    const int32_t stride,
    __m256i s_256[3],
    __m256i ss_256[3],
    __m256i tt_256[3])
{
    s_256[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
    s_256[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
    s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));

    ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
    ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);

    tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
    tt_256[2] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
}
     453             : 
     454             : 
// Interleave six rows into the pair layout the 8-tap vertical filter
// consumes: low halves in ss[0..2], high halves in ss[4..6]. ss[3] is left
// unwritten for the caller (it supplies the newest row pair each iteration).
// NOTE(review): "unapck" is a typo for "unpack"; kept to avoid breaking
// callers.
static INLINE void convolve_8tap_unapck_avx2(const __m256i s[6],
    __m256i ss[7]) {
    ss[0] = _mm256_unpacklo_epi16(s[0], s[1]);
    ss[1] = _mm256_unpacklo_epi16(s[2], s[3]);
    ss[2] = _mm256_unpacklo_epi16(s[4], s[5]);
    ss[4] = _mm256_unpackhi_epi16(s[0], s[1]);
    ss[5] = _mm256_unpackhi_epi16(s[2], s[3]);
    ss[6] = _mm256_unpackhi_epi16(s[4], s[5]);
}
     464             : 
// 2-tap filter: one maddubs (unsigned pixels x signed halved taps) gives
// eight 16-bit filtered sums.
static INLINE __m128i convolve_2tap_ssse3(const __m128i ss[1],
    const __m128i coeffs[1]) {
    return _mm_maddubs_epi16(ss[0], coeffs[0]);
}
     469             : 
     470    30331937 : static INLINE __m128i convolve_4tap_ssse3(const __m128i ss[2],
     471             :     const __m128i coeffs[2]) {
     472    30331937 :     const __m128i res_23 = _mm_maddubs_epi16(ss[0], coeffs[0]);
     473    60663870 :     const __m128i res_45 = _mm_maddubs_epi16(ss[1], coeffs[1]);
     474    30331937 :     return _mm_add_epi16(res_23, res_45);
     475             : }
     476             : 
// 6-tap filter: three maddubs partial sums (tap pairs 1/2, 3/4, 5/6);
// outer pairs are combined first, then the central pair is added.
static INLINE __m128i convolve_6tap_ssse3(const __m128i ss[3],
    const __m128i coeffs[3]) {
    const __m128i res_12 = _mm_maddubs_epi16(ss[0], coeffs[0]);
    const __m128i res_34 = _mm_maddubs_epi16(ss[1], coeffs[1]);
    const __m128i res_56 = _mm_maddubs_epi16(ss[2], coeffs[2]);
    const __m128i res_1256 = _mm_add_epi16(res_12, res_56);
    return _mm_add_epi16(res_1256, res_34);
}
     485             : 
// 8-tap filter: four maddubs partial sums (tap pairs 0/1, 2/3, 4/5, 6/7)
// combined pairwise into eight 16-bit results.
static INLINE __m128i convolve_8tap_ssse3(const __m128i ss[4],
    const __m128i coeffs[4]) {
    const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]);
    const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]);
    const __m128i res_45 = _mm_maddubs_epi16(ss[2], coeffs[2]);
    const __m128i res_67 = _mm_maddubs_epi16(ss[3], coeffs[3]);
    const __m128i res_0145 = _mm_add_epi16(res_01, res_45);
    const __m128i res_2367 = _mm_add_epi16(res_23, res_67);
    return _mm_add_epi16(res_0145, res_2367);
}
     496             : 
// 2-tap filter, 256-bit: one vpmaddubsw of interleaved u8 pixels with
// interleaved s8 coefficients.
static INLINE __m256i convolve_2tap_avx2(const __m256i ss[1],
    const __m256i coeffs[1]) {
    return _mm256_maddubs_epi16(ss[0], coeffs[0]);
}
     501             : 
     502   136633067 : static INLINE __m256i convolve_4tap_avx2(const __m256i ss[2],
     503             :     const __m256i coeffs[2]) {
     504   136633067 :     const __m256i res_23 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
     505   273266154 :     const __m256i res_45 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
     506   136633067 :     return _mm256_add_epi16(res_23, res_45);
     507             : }
     508             : 
     509  2353409000 : static INLINE __m256i convolve_6tap_avx2(const __m256i ss[3],
     510             :     const __m256i coeffs[3]) {
     511  2353409000 :     const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
     512  2353409000 :     const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
     513  4706826000 :     const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
     514  2353409000 :     const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
     515  2353409000 :     return _mm256_add_epi16(res_0145, res_23);
     516             : }
     517             : 
     518  3821450700 : static INLINE __m256i convolve_8tap_avx2(const __m256i ss[4],
     519             :     const __m256i coeffs[4]) {
     520  3821450700 :     const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
     521  3821450700 :     const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
     522  3821450700 :     const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
     523  7642915700 :     const __m256i res_67 = _mm256_maddubs_epi16(ss[3], coeffs[3]);
     524  3821450700 :     const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
     525  3821450700 :     const __m256i res_2367 = _mm256_add_epi16(res_23, res_67);
     526  3821450700 :     return _mm256_add_epi16(res_0145, res_2367);
     527             : }
     528             : 
// 2-tap filter on 16-bit samples: one pmaddwd yields 32-bit sums.
static INLINE __m128i convolve16_2tap_sse2(const __m128i ss[1],
    const __m128i coeffs[1]) {
    return _mm_madd_epi16(ss[0], coeffs[0]);
}
     533             : 
     534           0 : static INLINE __m128i convolve16_4tap_sse2(const __m128i ss[2],
     535             :     const __m128i coeffs[2]) {
     536           0 :     const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
     537           0 :     const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
     538           0 :     return _mm_add_epi32(res_01, res_23);
     539             : }
     540             : 
     541           0 : static INLINE __m128i convolve16_6tap_sse2(const __m128i ss[3],
     542             :     const __m128i coeffs[3]) {
     543           0 :     const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
     544           0 :     const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
     545           0 :     const __m128i res_45 = _mm_madd_epi16(ss[2], coeffs[2]);
     546           0 :     const __m128i res_0123 = _mm_add_epi32(res_01, res_23);
     547           0 :     return _mm_add_epi32(res_0123, res_45);
     548             : }
     549             : 
     550           0 : static INLINE __m128i convolve16_8tap_sse2(const __m128i ss[4],
     551             :     const __m128i coeffs[4]) {
     552           0 :     const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
     553           0 :     const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
     554           0 :     const __m128i res_45 = _mm_madd_epi16(ss[2], coeffs[2]);
     555           0 :     const __m128i res_67 = _mm_madd_epi16(ss[3], coeffs[3]);
     556           0 :     const __m128i res_0123 = _mm_add_epi32(res_01, res_23);
     557           0 :     const __m128i res_4567 = _mm_add_epi32(res_45, res_67);
     558           0 :     return _mm_add_epi32(res_0123, res_4567);
     559             : }
     560             : 
// 2-tap filter on 16-bit samples, 256-bit: one vpmaddwd yields 32-bit sums.
static INLINE __m256i convolve16_2tap_avx2(const __m256i ss[1],
    const __m256i coeffs[1]) {
    return _mm256_madd_epi16(ss[0], coeffs[0]);
}
     565             : 
     566     7013640 : static INLINE __m256i convolve16_4tap_avx2(const __m256i ss[2],
     567             :     const __m256i coeffs[2]) {
     568     7013640 :     const __m256i res_1 = _mm256_madd_epi16(ss[0], coeffs[0]);
     569    14027300 :     const __m256i res_2 = _mm256_madd_epi16(ss[1], coeffs[1]);
     570     7013640 :     return _mm256_add_epi32(res_1, res_2);
     571             : }
     572             : 
     573  1659360000 : static INLINE __m256i convolve16_6tap_avx2(const __m256i ss[3],
     574             :     const __m256i coeffs[3]) {
     575  1659360000 :     const __m256i res_01 = _mm256_madd_epi16(ss[0], coeffs[0]);
     576  1659360000 :     const __m256i res_23 = _mm256_madd_epi16(ss[1], coeffs[1]);
     577  3318730000 :     const __m256i res_45 = _mm256_madd_epi16(ss[2], coeffs[2]);
     578  1659360000 :     const __m256i res_0123 = _mm256_add_epi32(res_01, res_23);
     579  1659360000 :     return _mm256_add_epi32(res_0123, res_45);
     580             : }
     581             : 
     582   880097300 : static INLINE __m256i convolve16_8tap_avx2(const __m256i ss[4],
     583             :     const __m256i coeffs[4]) {
     584   880097300 :     const __m256i res_01 = _mm256_madd_epi16(ss[0], coeffs[0]);
     585   880097300 :     const __m256i res_23 = _mm256_madd_epi16(ss[1], coeffs[1]);
     586   880097300 :     const __m256i res_45 = _mm256_madd_epi16(ss[2], coeffs[2]);
     587  1760199000 :     const __m256i res_67 = _mm256_madd_epi16(ss[3], coeffs[3]);
     588   880097300 :     const __m256i res_0123 = _mm256_add_epi32(res_01, res_23);
     589   880097300 :     const __m256i res_4567 = _mm256_add_epi32(res_45, res_67);
     590   880097300 :     return _mm256_add_epi32(res_0123, res_4567);
     591             : }
     592             : 
     593  1708755000 : static INLINE __m256i x_convolve_6tap_avx2(const __m256i data,
     594             :     const __m256i coeffs[3],
     595             :     const __m256i *const filt) {
     596             :     __m256i ss[3];
     597             : 
     598  1708755000 :     ss[0] = _mm256_shuffle_epi8(data, filt[0]);
     599  1708755000 :     ss[1] = _mm256_shuffle_epi8(data, filt[1]);
     600  1708755000 :     ss[2] = _mm256_shuffle_epi8(data, filt[2]);
     601             : 
     602  1708755000 :     return convolve_6tap_avx2(ss, coeffs);
     603             : }
     604             : 
     605   706986300 : static INLINE __m256i x_convolve_8tap_avx2(const __m256i data,
     606             :     const __m256i coeffs[4],
     607             :     const __m256i *const filt) {
     608             :     __m256i ss[4];
     609             : 
     610   706986300 :     ss[0] = _mm256_shuffle_epi8(data, filt[0]);
     611   706986300 :     ss[1] = _mm256_shuffle_epi8(data, filt[1]);
     612   706986300 :     ss[2] = _mm256_shuffle_epi8(data, filt[2]);
     613   706986300 :     ss[3] = _mm256_shuffle_epi8(data, filt[3]);
     614             : 
     615   706986300 :     return convolve_8tap_avx2(ss, coeffs);
     616             : }
     617             : 
     618   556173000 : static INLINE __m256i sr_y_round_avx2(const __m256i src) {
     619   556173000 :     const __m256i round = _mm256_set1_epi16(32);
     620   556173000 :     const __m256i dst = _mm256_add_epi16(src, round);
     621   556173000 :     return _mm256_srai_epi16(dst, FILTER_BITS - 1);
     622             : }
     623             : 
     624   669909000 : static INLINE __m128i xy_x_round_sse2(const __m128i src) {
     625   669909000 :     const __m128i round = _mm_set1_epi16(2);
     626   669909000 :     const __m128i dst = _mm_add_epi16(src, round);
     627   669909000 :     return _mm_srai_epi16(dst, 2);
     628             : }
     629             : 
     630  3327560000 : static INLINE __m256i xy_x_round_avx2(const __m256i src) {
     631  3327560000 :     const __m256i round = _mm256_set1_epi16(2);
     632  3327560000 :     const __m256i dst = _mm256_add_epi16(src, round);
     633  3327560000 :     return _mm256_srai_epi16(dst, 2);
     634             : }
     635             : 
     636           0 : static INLINE void xy_x_round_store_2x2_sse2(const __m128i res,
     637             :     int16_t *const dst) {
     638           0 :     const __m128i d = xy_x_round_sse2(res);
     639           0 :     _mm_storel_epi64((__m128i *)dst, d);
     640           0 : }
     641             : 
     642    12750800 : static INLINE void xy_x_round_store_4x2_sse2(const __m128i res,
     643             :     int16_t *const dst) {
     644    12750800 :     const __m128i d = xy_x_round_sse2(res);
     645             :     _mm_store_si128((__m128i *)dst, d);
     646    12750100 : }
     647             : 
     648   330723000 : static INLINE void xy_x_round_store_8x2_sse2(const __m128i res[2],
     649             :     int16_t *const dst) {
     650             :     __m128i r[2];
     651             : 
     652   330723000 :     r[0] = xy_x_round_sse2(res[0]);
     653   330286000 :     r[1] = xy_x_round_sse2(res[1]);
     654   330583000 :     _mm_store_si128((__m128i *)dst, r[0]);
     655   330583000 :     _mm_store_si128((__m128i *)(dst + 8), r[1]);
     656   330583000 : }
     657             : 
     658   265329000 : static INLINE void xy_x_round_store_8x2_avx2(const __m256i res,
     659             :     int16_t *const dst) {
     660   265329000 :     const __m256i d = xy_x_round_avx2(res);
     661             :     _mm256_store_si256((__m256i *)dst, d);
     662   265926000 : }
     663             : 
     664   537305000 : static INLINE void xy_x_round_store_32_avx2(const __m256i res[2],
     665             :     int16_t *const dst) {
     666             :     __m256i r[2];
     667             : 
     668   537305000 :     r[0] = xy_x_round_avx2(res[0]);
     669   540783000 :     r[1] = xy_x_round_avx2(res[1]);
     670             :     const __m256i d0 =
     671   543167000 :         _mm256_inserti128_si256(r[0], _mm256_extracti128_si256(r[1], 0), 1);
     672             :     const __m256i d1 =
     673   543167000 :         _mm256_inserti128_si256(r[1], _mm256_extracti128_si256(r[0], 1), 0);
     674             :     _mm256_store_si256((__m256i *)dst, d0);
     675   543167000 :     _mm256_store_si256((__m256i *)(dst + 16), d1);
     676   543167000 : }
     677             : 
     678             : static INLINE __m128i xy_y_round_sse2(const __m128i src) {
     679             :     const __m128i round = _mm_set1_epi32(1024);
     680             :     const __m128i dst = _mm_add_epi32(src, round);
     681             :     return _mm_srai_epi32(dst, 11);
     682             : }
     683             : 
     684             : static INLINE __m128i xy_y_round_half_pel_sse2(const __m128i src) {
     685             :     const __m128i round = _mm_set1_epi16(16);
     686             :     const __m128i dst = _mm_add_epi16(src, round);
     687             :     return _mm_srai_epi16(dst, 5);
     688             : }
     689             : 
     690             : static INLINE __m256i xy_y_round_avx2(const __m256i src) {
     691             :     const __m256i round = _mm256_set1_epi32(1024);
     692             :     const __m256i dst = _mm256_add_epi32(src, round);
     693             :     return _mm256_srai_epi32(dst, 11);
     694             : }
     695             : 
     696             : static INLINE __m256i xy_y_round_16_avx2(const __m256i r[2]) {
     697             :     const __m256i r0 = xy_y_round_avx2(r[0]);
     698             :     const __m256i r1 = xy_y_round_avx2(r[1]);
     699             :     return _mm256_packs_epi32(r0, r1);
     700             : }
     701             : 
     702             : static INLINE __m256i xy_y_round_half_pel_avx2(const __m256i src) {
     703             :     const __m256i round = _mm256_set1_epi16(16);
     704             :     const __m256i dst = _mm256_add_epi16(src, round);
     705             :     return _mm256_srai_epi16(dst, 5);
     706             : }
     707             : 
     708      225212 : static INLINE __m128i jnt_y_round_sse2(const __m128i src) {
     709      225212 :     const __m128i round = _mm_set1_epi16(2);
     710      225212 :     const __m128i dst = _mm_add_epi16(src, round);
     711      225212 :     return _mm_srai_epi16(dst, 2);
     712             : }
     713             : 
     714   231354000 : static INLINE __m256i jnt_y_round_avx2(const __m256i src) {
     715   231354000 :     const __m256i round = _mm256_set1_epi16(2);
     716   231354000 :     const __m256i dst = _mm256_add_epi16(src, round);
     717   231354000 :     return _mm256_srai_epi16(dst, 2);
     718             : }
     719             : 
     720      351612 : static INLINE __m128i jnt_avg_round_sse2(const __m128i src,
     721             :     const __m128i offset) {
     722      351612 :     const __m128i dst = _mm_add_epi16(src, offset);
     723      351612 :     return _mm_srai_epi16(dst, 2);
     724             : }
     725             : 
     726   276725000 : static INLINE __m256i jnt_avg_round_avx2(const __m256i src,
     727             :     const __m256i offset) {
     728   276725000 :     const __m256i dst = _mm256_add_epi16(src, offset);
     729   276725000 :     return _mm256_srai_epi16(dst, 2);
     730             : }
     731             : 
     732      733242 : static INLINE __m128i jnt_no_avg_round_sse2(const __m128i src,
     733             :     const __m128i offset) {
     734      733242 :     const __m128i dst = _mm_add_epi16(src, offset);
     735      733242 :     return _mm_srli_epi16(dst, 2);
     736             : }
     737             : 
     738  1024650000 : static INLINE __m256i jnt_no_avg_round_avx2(const __m256i src,
     739             :     const __m256i offset) {
     740  1024650000 :     const __m256i dst = _mm256_add_epi16(src, offset);
     741  1024650000 :     return _mm256_srli_epi16(dst, 2);
     742             : }
     743             : 
// Saturate 16-bit results to u8 and store a 2x2 block: 2 bytes per row,
// written as one int16 each (row 0 from word 0, row 1 from word 1 of d).
static INLINE void pack_store_2x2_sse2(const __m128i res, uint8_t *const dst,
    const int32_t stride) {
    const __m128i d = _mm_packus_epi16(res, res);
    *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d);
    *(int16_t *)(dst + stride) = (int16_t)_mm_extract_epi16(d, 1);
}
     750             : 
     751    35268622 : static INLINE void pack_store_4x2_sse2(const __m128i res, uint8_t *const dst,
     752             :     const int32_t stride) {
     753    35268622 :     const __m128i d = _mm_packus_epi16(res, res);
     754             :     store_u8_4x2_sse2(d, dst, stride);
     755    35268722 : }
     756             : 
     757             : static INLINE void pack_store_4x2_avx2(const __m256i res, uint8_t *const dst,
     758             :     const int32_t stride) {
     759             :     const __m256i d = _mm256_packus_epi16(res, res);
     760             :     const __m128i d0 = _mm256_castsi256_si128(d);
     761             :     const __m128i d1 = _mm256_extracti128_si256(d, 1);
     762             :     xx_storel_32(dst, d0);
     763             :     xx_storel_32(dst + stride, d1);
     764             : }
     765             : 
     766   366878800 : static INLINE void pack_store_8x2_avx2(const __m256i res, uint8_t *const dst,
     767             :     const int32_t stride) {
     768   366878800 :     const __m256i d = _mm256_packus_epi16(res, res);
     769   366878800 :     const __m128i d0 = _mm256_castsi256_si128(d);
     770   366878800 :     const __m128i d1 = _mm256_extracti128_si256(d, 1);
     771   366878800 :     _mm_storel_epi64((__m128i *)dst, d0);
     772   366878800 :     _mm_storel_epi64((__m128i *)(dst + stride), d1);
     773   366878800 : }
     774             : 
     775   236666400 : static INLINE void pack_store_16x2_avx2(const __m256i res0, const __m256i res1,
     776             :     uint8_t *const dst,
     777             :     const int32_t stride) {
     778   236666400 :     const __m256i d = _mm256_packus_epi16(res0, res1);
     779   236666400 :     storeu_u8_16x2_avx2(d, dst, stride);
     780   236536900 : }
     781             : 
     782   148583000 : static INLINE void xy_y_pack_store_16x2_avx2(const __m256i res0,
     783             :     const __m256i res1,
     784             :     uint8_t *const dst,
     785             :     const int32_t stride) {
     786   148583000 :     const __m256i t = _mm256_packus_epi16(res0, res1);
     787   148583000 :     const __m256i d = _mm256_permute4x64_epi64(t, 0xD8);
     788   148583000 :     storeu_u8_16x2_avx2(d, dst, stride);
     789   148638000 : }
     790             : 
     791             : static INLINE void xy_y_round_store_2x2_sse2(const __m128i res,
     792             :     uint8_t *const dst,
     793             :     const int32_t stride) {
     794             :     const __m128i r = xy_y_round_sse2(res);
     795             :     const __m128i rr = _mm_packs_epi32(r, r);
     796             :     pack_store_2x2_sse2(rr, dst, stride);
     797             : }
     798             : 
     799             : static INLINE void xy_y_round_store_4x2_avx2(const __m256i res,
     800             :     uint8_t *const dst,
     801             :     const int32_t stride) {
     802             :     const __m256i r = xy_y_round_avx2(res);
     803             :     const __m256i rr = _mm256_packs_epi32(r, r);
     804             :     pack_store_4x2_avx2(rr, dst, stride);
     805             : }
     806             : 
     807             : static INLINE void xy_y_pack_store_32_avx2(const __m256i res0,
     808             :     const __m256i res1,
     809             :     uint8_t *const dst) {
     810             :     const __m256i d = _mm256_packus_epi16(res0, res1);
     811             :     // d = _mm256_permute4x64_epi64(d, 0xD8);
     812             :     _mm256_storeu_si256((__m256i *)dst, d);
     813             : }
     814             : 
     815             : static INLINE void xy_y_round_store_32_avx2(const __m256i r0[2],
     816             :     const __m256i r1[2],
     817             :     uint8_t *const dst) {
     818             :     const __m256i ra = xy_y_round_16_avx2(r0);
     819             :     const __m256i rb = xy_y_round_16_avx2(r1);
     820             :     xy_y_pack_store_32_avx2(ra, rb, dst);
     821             : }
     822             : 
     823   820942000 : static INLINE void convolve_store_32_avx2(const __m256i res0,
     824             :     const __m256i res1,
     825             :     uint8_t *const dst) {
     826   820942000 :     const __m256i d = _mm256_packus_epi16(res0, res1);
     827             :     _mm256_storeu_si256((__m256i *)dst, d);
     828   820942000 : }
     829             : 
     830  1122091000 : static INLINE void jnt_no_avg_store_16x2_avx2(const __m256i src0,
     831             :     const __m256i src1,
     832             :     ConvBufType *const dst,
     833             :     const int32_t stride) {
     834             :     const __m256i d0 =
     835  1122091000 :         _mm256_inserti128_si256(src0, _mm256_extracti128_si256(src1, 0), 1);
     836             :     const __m256i d1 =
     837  1122091000 :         _mm256_inserti128_si256(src1, _mm256_extracti128_si256(src0, 1), 0);
     838             :     _mm256_storeu_si256((__m256i *)dst, d0);
     839  1122091000 :     _mm256_storeu_si256((__m256i *)(dst + stride), d1);
     840  1122091000 : }
     841             : 
// Horizontal 2-tap filter of a 2x2 block: 4 bytes are loaded from each of
// two rows; the pshufb mask interleaves each pixel with its right neighbor
// (row 0 -> output bytes 0..3, row 1 -> bytes 4..7) before pmaddubsw.
static INLINE __m128i x_convolve_2tap_2x2_sse4_1(const uint8_t *const src,
    const int32_t stride,
    const __m128i coeffs[1]) {
    const __m128i sfl =
        _mm_setr_epi8(0, 1, 1, 2, 4, 5, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0);
    const __m128i s_128 = load_u8_4x2_sse4_1(src, stride);
    const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
    return convolve_2tap_ssse3(&ss, coeffs);
}
     851             : 
// Horizontal 2-tap filter of a 4x2 block: 8 bytes per row are loaded; the
// pshufb mask pairs each of the 4 pixels with its right neighbor (row 0 in
// the low half, row 1 in the high half) before pmaddubsw.
static INLINE __m128i x_convolve_2tap_4x2_ssse3(const uint8_t *const src,
    const int32_t stride,
    const __m128i coeffs[1]) {
    const __m128i sfl =
        _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
    const __m128i s_128 = load_u8_8x2_sse2(src, stride);
    const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
    return convolve_2tap_ssse3(&ss, coeffs);
}
     861             : 
     862   338881390 : static INLINE void x_convolve_2tap_8x2_ssse3(const uint8_t *const src,
     863             :     const int32_t stride,
     864             :     const __m128i coeffs[1],
     865             :     __m128i r[2]) {
     866             :     __m128i ss[2];
     867   338881390 :     const __m128i s00 = _mm_loadu_si128((__m128i *)src);
     868   338881390 :     const __m128i s10 = _mm_loadu_si128((__m128i *)(src + stride));
     869   338881390 :     const __m128i s01 = _mm_srli_si128(s00, 1);
     870   338881390 :     const __m128i s11 = _mm_srli_si128(s10, 1);
     871   338881390 :     ss[0] = _mm_unpacklo_epi8(s00, s01);
     872   338881390 :     ss[1] = _mm_unpacklo_epi8(s10, s11);
     873             : 
     874   338881390 :     r[0] = convolve_2tap_ssse3(&ss[0], coeffs);
     875   339113580 :     r[1] = convolve_2tap_ssse3(&ss[1], coeffs);
     876   338544480 : }
     877             : 
     878    61452000 : static INLINE __m256i x_convolve_2tap_8x2_avx2(const uint8_t *const src,
     879             :     const int32_t stride,
     880             :     const __m256i coeffs[1]) {
     881             :     __m128i s_128[2][2];
     882             :     __m256i s_256[2];
     883             : 
     884    61452000 :     s_128[0][0] = _mm_loadu_si128((__m128i *)src);
     885    61452000 :     s_128[1][0] = _mm_loadu_si128((__m128i *)(src + stride));
     886    61452000 :     s_128[0][1] = _mm_srli_si128(s_128[0][0], 1);
     887    61452000 :     s_128[1][1] = _mm_srli_si128(s_128[1][0], 1);
     888    61452000 :     s_256[0] = _mm256_setr_m128i(s_128[0][0], s_128[1][0]);
     889    61452000 :     s_256[1] = _mm256_setr_m128i(s_128[0][1], s_128[1][1]);
     890    61452000 :     const __m256i ss = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
     891    61452000 :     return convolve_2tap_avx2(&ss, coeffs);
     892             : }
     893             : 
     894   399526810 : static INLINE void x_convolve_2tap_16x2_avx2(const uint8_t *const src,
     895             :     const int32_t stride,
     896             :     const __m256i coeffs[1],
     897             :     __m256i r[2]) {
     898             :     __m128i s_128[2][2];
     899             :     __m256i s_256[2];
     900             : 
     901   399526810 :     s_128[0][0] = _mm_loadu_si128((__m128i *)src);
     902   399526810 :     s_128[0][1] = _mm_loadu_si128((__m128i *)(src + 1));
     903   399526810 :     s_128[1][0] = _mm_loadu_si128((__m128i *)(src + stride));
     904   399526810 :     s_128[1][1] = _mm_loadu_si128((__m128i *)(src + stride + 1));
     905   399526810 :     s_256[0] = _mm256_setr_m128i(s_128[0][0], s_128[1][0]);
     906   399526810 :     s_256[1] = _mm256_setr_m128i(s_128[0][1], s_128[1][1]);
     907   399526810 :     const __m256i s0 = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
     908   399526810 :     const __m256i s1 = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
     909   399526810 :     r[0] = convolve_2tap_avx2(&s0, coeffs);
     910   399250750 :     r[1] = convolve_2tap_avx2(&s1, coeffs);
     911   398219110 : }
     912             : 
     913   171934100 : static INLINE void x_convolve_2tap_32_avx2(const uint8_t *const src,
     914             :     const __m256i coeffs[1],
     915             :     __m256i r[2]) {
     916   171934100 :     const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
     917   343868100 :     const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
     918   171934100 :     const __m256i ss0 = _mm256_unpacklo_epi8(s0, s1);
     919   171934100 :     const __m256i ss1 = _mm256_unpackhi_epi8(s0, s1);
     920             : 
     921   171934100 :     r[0] = convolve_2tap_avx2(&ss0, coeffs);
     922   172021400 :     r[1] = convolve_2tap_avx2(&ss1, coeffs);
     923   171898200 : }
     924             : 
// Horizontal 4-tap filter of a 2x2 block: 8 bytes per row are loaded; sfl0
// pairs taps 0/1 and sfl1 pairs taps 2/3 for both pixels of both rows
// (row 0 bytes come from offsets 0.., row 1 from offsets 8..).
static INLINE __m128i x_convolve_4tap_2x2_ssse3(const uint8_t *const src,
    const int32_t stride,
    const __m128i coeffs[1]) {
    const __m128i sfl0 =
        _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
    const __m128i sfl1 =
        _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
    const __m128i s = load_u8_8x2_sse2(src, stride);
    __m128i ss[2];

    ss[0] = _mm_shuffle_epi8(s, sfl0);
    ss[1] = _mm_shuffle_epi8(s, sfl1);
    return convolve_4tap_ssse3(ss, coeffs);
}
     939             : 
// Horizontal 4-tap filter of a 4x2 block: 8 bytes per row are loaded; sfl0
// builds the tap-0/1 pixel pairs and sfl1 the tap-2/3 pairs for all four
// pixels of both rows (row 0 in the low half, row 1 in the high half).
static INLINE __m128i x_convolve_4tap_4x2_ssse3(const uint8_t *const src,
    const int32_t stride,
    const __m128i coeffs[1]) {
    const __m128i s = load_u8_8x2_sse2(src, stride);
    const __m128i sfl0 =
        _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
    const __m128i sfl1 =
        _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14);
    __m128i ss[2];

    ss[0] = _mm_shuffle_epi8(s, sfl0);
    ss[1] = _mm_shuffle_epi8(s, sfl1);
    return convolve_4tap_ssse3(ss, coeffs);
}
     954             : 
     955   275830700 : static INLINE __m256i x_convolve_6tap_8x2_avx2(const uint8_t *const src,
     956             :     const int32_t stride,
     957             :     const __m256i coeffs[3],
     958             :     const __m256i *const filt) {
     959   275830700 :     const __m128i s0 = _mm_loadu_si128((__m128i *)src);
     960   275830700 :     const __m128i s1 = _mm_loadu_si128((__m128i *)(src + stride));
     961   275830700 :     const __m256i s_256 = _mm256_setr_m128i(s0, s1);
     962   275830700 :     return x_convolve_6tap_avx2(s_256, coeffs, filt);
     963             : }
     964             : 
     965   727784700 : static INLINE void x_convolve_6tap_16x2_avx2(const uint8_t *const src,
     966             :     const int32_t src_stride,
     967             :     const __m256i coeffs[3],
     968             :     const __m256i *const filt,
     969             :     __m256i r[2]) {
     970   727784700 :     const __m128i s0_128 = _mm_loadu_si128((__m128i *)src);
     971   727784700 :     const __m128i s1_128 = _mm_loadu_si128((__m128i *)(src + src_stride));
     972   727784700 :     const __m128i s2_128 = _mm_loadu_si128((__m128i *)(src + 8));
     973   727784700 :     const __m128i s3_128 = _mm_loadu_si128((__m128i *)(src + src_stride + 8));
     974   727784700 :     const __m256i s0_256 = _mm256_setr_m128i(s0_128, s1_128);
     975   727784700 :     const __m256i s1_256 = _mm256_setr_m128i(s2_128, s3_128);
     976             : 
     977   727784700 :     r[0] = x_convolve_6tap_avx2(s0_256, coeffs, filt);
     978   727960000 :     r[1] = x_convolve_6tap_avx2(s1_256, coeffs, filt);
     979   726174700 : }
     980             : 
     981   100278630 : static INLINE __m256i x_convolve_8tap_8x2_avx2(const uint8_t *const src,
     982             :     const int32_t stride,
     983             :     const __m256i coeffs[3],
     984             :     const __m256i *const filt) {
     985   100278630 :     const __m128i s0 = _mm_loadu_si128((__m128i *)src);
     986   100278630 :     const __m128i s1 = _mm_loadu_si128((__m128i *)(src + stride));
     987   100278630 :     const __m256i s_256 = _mm256_setr_m128i(s0, s1);
     988   100278630 :     return x_convolve_8tap_avx2(s_256, coeffs, filt);
     989             : }
     990             : 
// Horizontal 8-tap filter of 16 pixels x 2 rows: left halves of both rows
// in one register (lane per row), right halves in another; each is run
// through the shared 8-tap shuffle/madd kernel.
SIMD_INLINE void x_convolve_8tap_16x2_avx2(const uint8_t *const src,
    const int32_t src_stride,
    const __m256i coeffs[4],
    const __m256i *const filt,
    __m256i r[2]) {
    const __m128i s0_128 = _mm_loadu_si128((__m128i *)src);
    const __m128i s1_128 = _mm_loadu_si128((__m128i *)(src + src_stride));
    const __m128i s2_128 = _mm_loadu_si128((__m128i *)(src + 8));
    const __m128i s3_128 = _mm_loadu_si128((__m128i *)(src + src_stride + 8));
    const __m256i s0_256 = _mm256_setr_m128i(s0_128, s1_128);
    const __m256i s1_256 = _mm256_setr_m128i(s2_128, s3_128);

    r[0] = x_convolve_8tap_avx2(s0_256, coeffs, filt);
    r[1] = x_convolve_8tap_avx2(s1_256, coeffs, filt);
}
    1006             : 
// Vertical 2-tap filter of a 2x2 block. s_16 is a rolling two-entry row
// history owned by the caller; statement order matters: s_16[0] (the
// previous row) is consumed before being overwritten with the preloaded
// row at src + 2 * stride for the next call.
static INLINE __m128i y_convolve_2tap_2x2_ssse3(const uint8_t *const src,
    const int32_t stride,
    const __m128i coeffs[1],
    __m128i s_16[2]) {
    __m128i s_128[2];

    // Load the next row and pair it with the previous one.
    s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src + stride));
    s_128[0] = _mm_unpacklo_epi16(s_16[0], s_16[1]);
    // Preload the row after that; it becomes "previous" on the next call.
    s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src + 2 * stride));
    s_128[1] = _mm_unpacklo_epi16(s_16[1], s_16[0]);
    const __m128i ss = _mm_unpacklo_epi8(s_128[0], s_128[1]);
    return convolve_2tap_ssse3(&ss, coeffs);
}
    1020             : 
// Vertical 2-tap filter of a 4x2 block. s_32 is a rolling two-entry row
// history owned by the caller; s_32[0] (the previous row) is consumed
// before being overwritten with the preloaded row at src + 2 * stride.
static INLINE __m128i y_convolve_2tap_4x2_ssse3(const uint8_t *const src,
    const int32_t stride,
    const __m128i coeffs[1],
    __m128i s_32[2]) {
    __m128i s_128[2];

    // Load the next row and pair it with the previous one.
    s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src + stride));
    s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
    // Preload the row after that; it becomes "previous" on the next call.
    s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src + 2 * stride));
    s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
    const __m128i ss = _mm_unpacklo_epi8(s_128[0], s_128[1]);
    return convolve_2tap_ssse3(&ss, coeffs);
}
    1034             : 
// Vertical 2-tap convolution of an 8x2 block of 8-bit pixels.
// s_64[] is a caller-maintained 2-row sliding window (8 pixels per row);
// each 256-bit vector holds two consecutive rows (one per 128-bit lane).
static INLINE __m256i y_convolve_2tap_8x2_avx2(const uint8_t *const src,
    const int32_t stride,
    const __m256i coeffs[1],
    __m128i s_64[2]) {
    __m256i s_256[2];

    s_64[1] = _mm_loadl_epi64((__m128i *)(src + stride));
    s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
    // Overwriting s_64[0] rotates the window for the next two-row step.
    s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * stride));
    s_256[1] = _mm256_setr_m128i(s_64[1], s_64[0]);
    const __m256i ss = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
    return convolve_2tap_avx2(&ss, coeffs);
}
    1048             : 
// Vertical 2-tap convolution of a 16x2 block of 8-bit pixels.
// s_128[] is a caller-maintained 2-row sliding window (16 pixels per row).
// Results: r[0] = low interleaved halves, r[1] = high interleaved halves.
static INLINE void y_convolve_2tap_16x2_avx2(const uint8_t *const src,
    const int32_t stride,
    const __m256i coeffs[1],
    __m128i s_128[2], __m256i r[2]) {
    __m256i s_256[2];

    s_128[1] = _mm_loadu_si128((__m128i *)(src + stride));
    s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
    // Overwriting s_128[0] rotates the window for the next two-row step.
    s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * stride));
    s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
    const __m256i ss0 = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
    const __m256i ss1 = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
    r[0] = convolve_2tap_avx2(&ss0, coeffs);
    r[1] = convolve_2tap_avx2(&ss1, coeffs);
}
    1064             : 
    1065   162559100 : static INLINE void y_convolve_2tap_32_avx2(const uint8_t *const src,
    1066             :     const __m256i coeffs[1],
    1067             :     const __m256i s0, __m256i *const s1,
    1068             :     __m256i r[2]) {
    1069   162559100 :     *s1 = _mm256_loadu_si256((__m256i *)src);
    1070   162559100 :     const __m256i ss0 = _mm256_unpacklo_epi8(s0, *s1);
    1071   162559100 :     const __m256i ss1 = _mm256_unpackhi_epi8(s0, *s1);
    1072   162559100 :     r[0] = convolve_2tap_avx2(&ss0, coeffs);
    1073   162621500 :     r[1] = convolve_2tap_avx2(&ss1, coeffs);
    1074   162516200 : }
    1075             : 
// Vertical 4-tap convolution of a 2x2 block of 8-bit pixels.
// s_16[4] holds the row sliding window, ss_128[2] the byte-interleaved row
// pairs; both are caller-maintained across calls. The caller advances
// ss_128[0] = ss_128[1] after use (see the callers' loops).
static INLINE __m128i y_convolve_4tap_2x2_ssse3(const uint8_t *const src,
    const int32_t stride,
    const __m128i coeffs[2],
    __m128i s_16[4],
    __m128i ss_128[2]) {
    s_16[3] = _mm_cvtsi32_si128(*(int16_t *)(src + stride));
    const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
    // Overwriting s_16[2] rotates the window for the next two-row step.
    s_16[2] = _mm_cvtsi32_si128(*(int16_t *)(src + 2 * stride));
    const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[2]);
    ss_128[1] = _mm_unpacklo_epi8(src23, src34);
    return convolve_4tap_ssse3(ss_128, coeffs);
}
    1088             : 
// Vertical 4-tap convolution of a 4x2 block of 8-bit pixels.
// s_32[4] is the row sliding window, ss_128[2] the interleaved row pairs;
// both caller-maintained. Same rotation pattern as the 2x2 variant.
static INLINE __m128i y_convolve_4tap_4x2_ssse3(const uint8_t *const src,
    const int32_t stride,
    const __m128i coeffs[2],
    __m128i s_32[4],
    __m128i ss_128[2]) {
    s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(src + stride));
    const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
    // Overwriting s_32[2] rotates the window for the next two-row step.
    s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(src + 2 * stride));
    const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[2]);
    ss_128[1] = _mm_unpacklo_epi8(src23, src34);
    return convolve_4tap_ssse3(ss_128, coeffs);
}
    1101             : 
// Vertical 4-tap convolution of an 8x2 block of 8-bit pixels.
// s_64[4] is the row sliding window, ss_256[2] the interleaved row pairs;
// both caller-maintained. Each 256-bit vector packs two rows, one per lane.
static INLINE __m256i y_convolve_4tap_8x2_avx2(const uint8_t *const src,
    const int32_t stride,
    const __m256i coeffs[2],
    __m128i s_64[4],
    __m256i ss_256[2]) {
    s_64[3] = _mm_loadl_epi64((__m128i *)(src + stride));
    const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
    // Overwriting s_64[2] rotates the window for the next two-row step.
    s_64[2] = _mm_loadl_epi64((__m128i *)(src + 2 * stride));
    const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[2]);
    ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
    return convolve_4tap_avx2(ss_256, coeffs);
}
    1114             : 
// Vertical 4-tap convolution of a 16x2 block of 8-bit pixels.
// ss_256 layout: [0..1] low-half interleaves, [2..3] high-half interleaves;
// this function fills indices 1 and 3, the caller keeps 0 and 2 from the
// previous step. r[0]/r[1] are the low/high filtered halves.
static INLINE void y_convolve_4tap_16x2_avx2(const uint8_t *const src,
    const int32_t stride,
    const __m256i coeffs[2],
    __m128i s_128[4],
    __m256i ss_256[2], __m256i r[2]) {
    s_128[3] = _mm_loadu_si128((__m128i *)(src + stride));
    const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
    // Overwriting s_128[2] rotates the window for the next two-row step.
    s_128[2] = _mm_loadu_si128((__m128i *)(src + 2 * stride));
    const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[2]);
    ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
    ss_256[3] = _mm256_unpackhi_epi8(src23, src34);
    r[0] = convolve_4tap_avx2(ss_256, coeffs);
    r[1] = convolve_4tap_avx2(ss_256 + 2, coeffs);
}
    1129             : 
// Vertical 6-tap convolution of a 2x2 block of 8-bit pixels.
// s_16[6] is the row sliding window, ss_128[3] the interleaved row pairs;
// this call supplies the newest pair (rows 4/5 relative to the window).
static INLINE __m128i y_convolve_6tap_2x2_ssse3(const uint8_t *const src,
    const int32_t stride,
    const __m128i coeffs[3],
    __m128i s_16[6],
    __m128i ss_128[3]) {
    s_16[5] = _mm_cvtsi32_si128(*(int16_t *)(src + 3 * stride));
    const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
    // Overwriting s_16[4] rotates the window for the next two-row step.
    s_16[4] = _mm_cvtsi32_si128(*(int16_t *)(src + 4 * stride));
    const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[4]);
    ss_128[2] = _mm_unpacklo_epi8(src45, src56);
    return convolve_6tap_ssse3(ss_128, coeffs);
}
    1142             : 
// Vertical 4-tap convolution of a 32x2 block of 8-bit pixels.
// (Placed here between the 6-tap helpers in the original source.)
// ss_256 holds interleaves for even output rows, tt_256 for odd ones; the
// caller shifts both windows down after consuming r[]. r[0..1] are the
// low/high halves of the first output row, r[2..3] of the second.
static INLINE void y_convolve_4tap_32x2_avx2(
    const uint8_t *const src, const int32_t stride, const __m256i coeffs[2],
    __m256i s_256[4], __m256i ss_256[4], __m256i tt_256[4], __m256i r[4]) {
    s_256[3] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
    ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
    ss_256[3] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
    // Overwriting s_256[2] rotates the row window for the next iteration.
    s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
    tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[2]);
    tt_256[3] = _mm256_unpackhi_epi8(s_256[3], s_256[2]);
    r[0] = convolve_4tap_avx2(ss_256 + 0, coeffs);
    r[1] = convolve_4tap_avx2(ss_256 + 2, coeffs);
    r[2] = convolve_4tap_avx2(tt_256 + 0, coeffs);
    r[3] = convolve_4tap_avx2(tt_256 + 2, coeffs);
}
    1157             : 
// Vertical 6-tap convolution of a 4x2 block of 8-bit pixels.
// s_32[6] is the row sliding window, ss_128[3] the interleaved row pairs;
// both caller-maintained across two-row steps.
static INLINE __m128i y_convolve_6tap_4x2_ssse3(const uint8_t *const src,
    const int32_t stride,
    const __m128i coeffs[3],
    __m128i s_32[6],
    __m128i ss_128[3]) {
    s_32[5] = _mm_cvtsi32_si128(*(int32_t *)(src + 3 * stride));
    const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
    // Overwriting s_32[4] rotates the window for the next two-row step.
    s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(src + 4 * stride));
    const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[4]);
    ss_128[2] = _mm_unpacklo_epi8(src45, src56);
    return convolve_6tap_ssse3(ss_128, coeffs);
}
    1170             : 
// Vertical 6-tap convolution of an 8x2 block of 8-bit pixels.
// s_64[6] is the row sliding window, ss_256[3] the interleaved row pairs;
// each 256-bit vector packs two consecutive rows, one per 128-bit lane.
static INLINE __m256i y_convolve_6tap_8x2_avx2(const uint8_t *const src,
    const int32_t stride,
    const __m256i coeffs[3],
    __m128i s_64[6],
    __m256i ss_256[3]) {
    s_64[5] = _mm_loadl_epi64((__m128i *)(src + 3 * stride));
    const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
    // Overwriting s_64[4] rotates the window for the next two-row step.
    s_64[4] = _mm_loadl_epi64((__m128i *)(src + 4 * stride));
    const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[4]);
    ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
    return convolve_6tap_avx2(ss_256, coeffs);
}
    1183             : 
// Vertical 6-tap convolution of a 16x2 block of 8-bit pixels.
// ss_256 layout: [0..2] low-half interleaves, [3..5] high-half interleaves;
// this call fills indices 2 and 5, the caller keeps the older entries.
// r[0]/r[1] are the low/high filtered halves.
static INLINE void y_convolve_6tap_16x2_avx2(const uint8_t *const src,
    const int32_t stride,
    const __m256i coeffs[3],
    __m128i s_128[6],
    __m256i ss_256[3], __m256i r[2]) {
    s_128[5] = _mm_loadu_si128((__m128i *)(src + 3 * stride));
    const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
    // Overwriting s_128[4] rotates the window for the next two-row step.
    s_128[4] = _mm_loadu_si128((__m128i *)(src + 4 * stride));
    const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[4]);
    ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
    ss_256[5] = _mm256_unpackhi_epi8(src45, src56);
    r[0] = convolve_6tap_avx2(ss_256, coeffs);
    r[1] = convolve_6tap_avx2(ss_256 + 3, coeffs);
}
    1198             : 
// Vertical 6-tap convolution of a 32x2 block of 8-bit pixels.
// ss_256 feeds the first output row, tt_256 (shifted by one source row)
// the second; layout in each: [0..2] low halves, [3..5] high halves.
// The caller shifts both interleave windows after consuming r[].
static INLINE void y_convolve_6tap_32x2_avx2(
    const uint8_t *const src, const int32_t stride, const __m256i coeffs[3],
    __m256i s_256[6], __m256i ss_256[6], __m256i tt_256[6], __m256i r[4]) {
    s_256[5] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
    ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
    ss_256[5] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
    // Overwriting s_256[4] rotates the row window for the next iteration.
    s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
    tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[4]);
    tt_256[5] = _mm256_unpackhi_epi8(s_256[5], s_256[4]);
    r[0] = convolve_6tap_avx2(ss_256 + 0, coeffs);
    r[1] = convolve_6tap_avx2(ss_256 + 3, coeffs);
    r[2] = convolve_6tap_avx2(tt_256 + 0, coeffs);
    r[3] = convolve_6tap_avx2(tt_256 + 3, coeffs);
}
    1213             : 
// Vertical 8-tap convolution of a 2x2 block of 8-bit pixels.
// s_16[8] is the row sliding window, ss_128[4] the interleaved row pairs;
// this call supplies the newest pair (rows 6/7 relative to the window).
static INLINE __m128i y_convolve_8tap_2x2_ssse3(const uint8_t *const src,
    const int32_t stride,
    const __m128i coeffs[4],
    __m128i s_16[8],
    __m128i ss_128[4]) {
    s_16[7] = _mm_cvtsi32_si128(*(int16_t *)(src + 7 * stride));
    const __m128i src67 = _mm_unpacklo_epi16(s_16[6], s_16[7]);
    // Overwriting s_16[6] rotates the window for the next two-row step.
    s_16[6] = _mm_cvtsi32_si128(*(int16_t *)(src + 8 * stride));
    const __m128i src78 = _mm_unpacklo_epi16(s_16[7], s_16[6]);
    ss_128[3] = _mm_unpacklo_epi8(src67, src78);
    return convolve_8tap_ssse3(ss_128, coeffs);
}
    1226             : 
// Vertical 8-tap convolution of a 4x2 block of 8-bit pixels.
// s_32[8] is the row sliding window, ss_128[4] the interleaved row pairs;
// both caller-maintained across two-row steps.
static INLINE __m128i y_convolve_8tap_4x2_ssse3(const uint8_t *const src,
    const int32_t stride,
    const __m128i coeffs[4],
    __m128i s_32[8],
    __m128i ss_128[4]) {
    s_32[7] = _mm_cvtsi32_si128(*(int32_t *)(src + 7 * stride));
    const __m128i src67 = _mm_unpacklo_epi32(s_32[6], s_32[7]);
    // Overwriting s_32[6] rotates the window for the next two-row step.
    s_32[6] = _mm_cvtsi32_si128(*(int32_t *)(src + 8 * stride));
    const __m128i src78 = _mm_unpacklo_epi32(s_32[7], s_32[6]);
    ss_128[3] = _mm_unpacklo_epi8(src67, src78);
    return convolve_8tap_ssse3(ss_128, coeffs);
}
    1239             : 
// Vertical 8-tap convolution of an 8x2 block of 8-bit pixels.
// s_64[8] is the row sliding window, ss_256[4] the interleaved row pairs;
// each 256-bit vector packs two consecutive rows, one per 128-bit lane.
static INLINE __m256i y_convolve_8tap_8x2_avx2(const uint8_t *const src,
    const int32_t stride,
    const __m256i coeffs[4],
    __m128i s_64[8],
    __m256i ss_256[4]) {
    s_64[7] = _mm_loadl_epi64((__m128i *)(src + 7 * stride));
    const __m256i src67 = _mm256_setr_m128i(s_64[6], s_64[7]);
    // Overwriting s_64[6] rotates the window for the next two-row step.
    s_64[6] = _mm_loadl_epi64((__m128i *)(src + 8 * stride));
    const __m256i src78 = _mm256_setr_m128i(s_64[7], s_64[6]);
    ss_256[3] = _mm256_unpacklo_epi8(src67, src78);
    return convolve_8tap_avx2(ss_256, coeffs);
}
    1252             : 
// Vertical 8-tap convolution of a 16x2 block of 8-bit pixels.
// ss_256 layout: [0..3] low-half interleaves, [4..7] high-half interleaves;
// this call fills indices 3 and 7, the caller keeps the older entries.
// r[0]/r[1] are the low/high filtered halves.
static INLINE void y_convolve_8tap_16x2_avx2(const uint8_t *const src,
    const int32_t stride,
    const __m256i coeffs[4],
    __m128i s_128[8],
    __m256i ss_256[4], __m256i r[2]) {
    s_128[7] = _mm_loadu_si128((__m128i *)(src + 7 * stride));
    const __m256i src67 = _mm256_setr_m128i(s_128[6], s_128[7]);
    // Overwriting s_128[6] rotates the window for the next two-row step.
    s_128[6] = _mm_loadu_si128((__m128i *)(src + 8 * stride));
    const __m256i src78 = _mm256_setr_m128i(s_128[7], s_128[6]);
    ss_256[3] = _mm256_unpacklo_epi8(src67, src78);
    ss_256[7] = _mm256_unpackhi_epi8(src67, src78);
    r[0] = convolve_8tap_avx2(ss_256, coeffs);
    r[1] = convolve_8tap_avx2(ss_256 + 4, coeffs);
}
    1267             : 
// Vertical 8-tap convolution of a 32x2 block of 8-bit pixels.
// ss_256 feeds the first output row, tt_256 (shifted by one source row)
// the second; layout in each: [0..3] low halves, [4..7] high halves.
// The caller shifts both interleave windows after consuming r[].
static INLINE void y_convolve_8tap_32x2_avx2(
    const uint8_t *const src, const int32_t stride, const __m256i coeffs[4],
    __m256i s_256[8], __m256i ss_256[8], __m256i tt_256[8], __m256i r[4]) {
    s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
    ss_256[3] = _mm256_unpacklo_epi8(s_256[6], s_256[7]);
    ss_256[7] = _mm256_unpackhi_epi8(s_256[6], s_256[7]);
    // Overwriting s_256[6] rotates the row window for the next iteration.
    s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
    tt_256[3] = _mm256_unpacklo_epi8(s_256[7], s_256[6]);
    tt_256[7] = _mm256_unpackhi_epi8(s_256[7], s_256[6]);
    r[0] = convolve_8tap_avx2(ss_256 + 0, coeffs);
    r[1] = convolve_8tap_avx2(ss_256 + 4, coeffs);
    r[2] = convolve_8tap_avx2(tt_256 + 0, coeffs);
    r[3] = convolve_8tap_avx2(tt_256 + 4, coeffs);
}
    1282             : 
    1283   687460000 : static INLINE void xy_x_convolve_2tap_32_avx2(const uint8_t *const src,
    1284             :     const __m256i coeffs[1],
    1285             :     __m256i r[2]) {
    1286   687460000 :     const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
    1287  1374920000 :     const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
    1288   687460000 :     const __m256i ss0 = _mm256_unpacklo_epi8(s0, s1);
    1289   687460000 :     const __m256i ss1 = _mm256_unpackhi_epi8(s0, s1);
    1290             : 
    1291   687460000 :     r[0] = convolve_2tap_avx2(&ss0, coeffs);
    1292   687282000 :     r[1] = convolve_2tap_avx2(&ss1, coeffs);
    1293   683459000 : }
    1294             : 
    1295   688062000 : static INLINE void xy_x_2tap_32_avx2(const uint8_t *const src,
    1296             :     const __m256i coeffs[1],
    1297             :     int16_t *const dst) {
    1298             :     __m256i r[2];
    1299             : 
    1300   688062000 :     xy_x_convolve_2tap_32_avx2(src, coeffs, r);
    1301   683510000 :     const __m256i d0 = xy_x_round_avx2(r[0]);
    1302   687328000 :     const __m256i d1 = xy_x_round_avx2(r[1]);
    1303             :     // d0 = _mm256_inserti128_si256(d0, _mm256_extracti128_si256(d1, 0), 1);
    1304             :     // d1 = _mm256_inserti128_si256(d1, _mm256_extracti128_si256(d0, 1), 0);
    1305             :     _mm256_store_si256((__m256i *)dst, d0);
    1306   688865000 :     _mm256_store_si256((__m256i *)(dst + 16), d1);
    1307   688865000 : }
    1308             : 
    1309   299944000 : static INLINE void xy_x_6tap_32_avx2(const uint8_t *const src,
    1310             :     const int32_t src_stride,
    1311             :     const __m256i coeffs[3],
    1312             :     const __m256i *const filt,
    1313             :     int16_t *const dst) {
    1314             :     __m256i r[2];
    1315             : 
    1316   299944000 :     x_convolve_6tap_16x2_avx2(src, src_stride, coeffs, filt, r);
    1317   299404000 :     const __m256i d0 = xy_x_round_avx2(r[0]);
    1318   299989000 :     const __m256i d1 = xy_x_round_avx2(r[1]);
    1319             :     _mm256_store_si256((__m256i *)dst, d0);
    1320   300218000 :     _mm256_store_si256((__m256i *)(dst + 16), d1);
    1321   300218000 : }
    1322             : 
    1323   160006000 : static INLINE void xy_x_8tap_32_avx2(const uint8_t *const src,
    1324             :     const int32_t src_stride,
    1325             :     const __m256i coeffs[4],
    1326             :     const __m256i *const filt,
    1327             :     int16_t *const dst) {
    1328             :     __m256i r[2];
    1329             : 
    1330             :     x_convolve_8tap_16x2_avx2(src, src_stride, coeffs, filt, r);
    1331   159728000 :     const __m256i d0 = xy_x_round_avx2(r[0]);
    1332   159874000 :     const __m256i d1 = xy_x_round_avx2(r[1]);
    1333             :     _mm256_store_si256((__m256i *)dst, d0);
    1334   160038000 :     _mm256_store_si256((__m256i *)(dst + 16), d1);
    1335   160038000 : }
    1336             : 
// Vertical 2-tap second pass on a 2x2 block of 16-bit intermediate data
// (src points into the horizontally filtered buffer, row stride = 2).
// s_32[2] is a caller-maintained 2-row sliding window.
static INLINE __m128i xy_y_convolve_2tap_2x2_sse2(const int16_t *const src,
    __m128i s_32[2],
    const __m128i coeffs[1]) {
    __m128i s_128[2];

    s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src + 2));
    s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
    // Overwriting s_32[0] rotates the window for the next two-row step.
    s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src + 2 * 2));
    s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
    const __m128i ss = _mm_unpacklo_epi16(s_128[0], s_128[1]);
    return convolve16_2tap_sse2(&ss, coeffs);
}
    1349             : 
// Half-pel special case of the 2x2 vertical 2-tap pass: both taps are equal,
// so filtering reduces to adding the two rows (no multiplies needed).
static INLINE __m128i xy_y_convolve_2tap_2x2_half_pel_sse2(
    const int16_t *const src, __m128i s_32[2]) {
    __m128i s_128[2];

    s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src + 2));
    s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
    // Overwriting s_32[0] rotates the window for the next two-row step.
    s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src + 2 * 2));
    s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
    return _mm_add_epi16(s_128[0], s_128[1]);
}
    1360             : 
// Vertical 2-tap second pass on a 4x2 block of 16-bit intermediate data
// (row stride = 4). s_64[2] is a caller-maintained 2-row sliding window;
// r[0]/r[1] receive the low/high interleaved halves.
static INLINE void xy_y_convolve_2tap_4x2_sse2(const int16_t *const src,
    __m128i s_64[2],
    const __m128i coeffs[1],
    __m128i r[2]) {
    __m128i s_128[2];

    s_64[1] = _mm_loadl_epi64((__m128i *)(src + 4));
    s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
    // Overwriting s_64[0] rotates the window for the next two-row step.
    s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * 4));
    s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
    const __m128i ss0 = _mm_unpacklo_epi16(s_128[0], s_128[1]);
    const __m128i ss1 = _mm_unpackhi_epi16(s_128[0], s_128[1]);
    r[0] = convolve16_2tap_sse2(&ss0, coeffs);
    r[1] = convolve16_2tap_sse2(&ss1, coeffs);
}
    1376             : 
// Half-pel special case of the 4x2 vertical 2-tap pass: equal taps reduce
// the filter to a row addition (no multiplies needed).
static INLINE __m128i xy_y_convolve_2tap_4x2_half_pel_sse2(
    const int16_t *const src, __m128i s_64[2]) {
    __m128i s_128[2];

    s_64[1] = _mm_loadl_epi64((__m128i *)(src + 4));
    s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
    // Overwriting s_64[0] rotates the window for the next two-row step.
    s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * 4));
    s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
    return _mm_add_epi16(s_128[0], s_128[1]);
}
    1387             : 
    1388  1704090000 : static INLINE void xy_y_convolve_2tap_16_avx2(const __m256i s0,
    1389             :     const __m256i s1,
    1390             :     const __m256i coeffs[1],
    1391             :     __m256i r[2]) {
    1392  1704090000 :     const __m256i ss0 = _mm256_unpacklo_epi16(s0, s1);
    1393  1704090000 :     const __m256i ss1 = _mm256_unpackhi_epi16(s0, s1);
    1394  1704090000 :     r[0] = convolve16_2tap_avx2(&ss0, coeffs);
    1395  1698140000 :     r[1] = convolve16_2tap_avx2(&ss1, coeffs);
    1396  1684730000 : }
    1397             : 
// Vertical 2-tap second pass on an 8x2 block of 16-bit intermediate data
// (row stride = 8, aligned loads). s_128[2] is the caller-maintained 2-row
// sliding window; each 256-bit vector packs two rows, one per lane.
static INLINE void xy_y_convolve_2tap_8x2_avx2(const int16_t *const src,
    __m128i s_128[2],
    const __m256i coeffs[1],
    __m256i r[2]) {
    __m256i s_256[2];
    s_128[1] = _mm_load_si128((__m128i *)(src + 8));
    s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
    // Overwriting s_128[0] rotates the window for the next two-row step.
    s_128[0] = _mm_load_si128((__m128i *)(src + 2 * 8));
    s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
    xy_y_convolve_2tap_16_avx2(s_256[0], s_256[1], coeffs, r);
}
    1409             : 
// Half-pel special case of the 8x2 vertical 2-tap pass: equal taps reduce
// the filter to a row addition. Returns both output rows in one vector.
static INLINE __m256i xy_y_convolve_2tap_8x2_half_pel_avx2(
    const int16_t *const src, __m128i s_128[2]) {
    __m256i s_256[2];
    s_128[1] = _mm_load_si128((__m128i *)(src + 8));
    s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
    // Overwriting s_128[0] rotates the window for the next two-row step.
    s_128[0] = _mm_load_si128((__m128i *)(src + 2 * 8));
    s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
    return _mm256_add_epi16(s_256[0], s_256[1]);
}
    1419             : 
// Half-pel special case of the 16x2 vertical 2-tap pass (row stride = 16):
// equal taps reduce the filter to adding adjacent rows. s_256[2] is the
// caller-maintained sliding window; r[0]/r[1] are the two output rows.
static INLINE void xy_y_convolve_2tap_16x2_half_pel_avx2(
    const int16_t *const src, __m256i s_256[2], __m256i r[2]) {
    s_256[1] = _mm256_load_si256((__m256i *)(src + 16));
    r[0] = _mm256_add_epi16(s_256[0], s_256[1]);
    // Overwriting s_256[0] rotates the window for the next two-row step.
    s_256[0] = _mm256_load_si256((__m256i *)(src + 2 * 16));
    r[1] = _mm256_add_epi16(s_256[1], s_256[0]);
}
    1427             : 
    1428             : static INLINE void xy_y_store_16x2_avx2(const __m256i r[2], uint8_t *const dst,
    1429             :     const int32_t stride) {
    1430             :     const __m256i t = _mm256_packus_epi16(r[0], r[1]);
    1431             :     const __m256i d = _mm256_permute4x64_epi64(t, 0xD8);
    1432             :     storeu_u8_16x2_avx2(d, dst, stride);
    1433             : }
    1434             : 
// Vertical 2-tap second pass on a 16x2 block of 16-bit intermediate data
// (row stride = 16). s[2] is the caller-maintained 2-row sliding window;
// r[0..1] are the halves of the first output row, r[2..3] of the second.
static INLINE void xy_y_convolve_2tap_16x2_avx2(const int16_t *const src,
    __m256i s[2],
    const __m256i coeffs[1],
    __m256i r[4]) {
    s[1] = _mm256_load_si256((__m256i *)(src + 16));
    xy_y_convolve_2tap_16_avx2(s[0], s[1], coeffs, r);
    // Overwriting s[0] rotates the window for the next two-row step.
    s[0] = _mm256_load_si256((__m256i *)(src + 2 * 16));
    xy_y_convolve_2tap_16_avx2(s[1], s[0], coeffs, r + 2);
}
    1444             : 
    1445   532882000 : static INLINE void xy_y_convolve_2tap_32_avx2(const int16_t *const src,
    1446             :     const __m256i s0[2],
    1447             :     __m256i s1[2],
    1448             :     const __m256i coeffs[1],
    1449             :     __m256i r[4]) {
    1450   532882000 :     s1[0] = _mm256_load_si256((__m256i *)src);
    1451   532882000 :     s1[1] = _mm256_load_si256((__m256i *)(src + 16));
    1452   532882000 :     xy_y_convolve_2tap_16_avx2(s0[0], s1[0], coeffs, r + 0);
    1453   528235000 :     xy_y_convolve_2tap_16_avx2(s0[1], s1[1], coeffs, r + 2);
    1454   525035000 : }
    1455             : 
    1456             : static INLINE void xy_y_convolve_2tap_32_all_avx2(const int16_t *const src,
    1457             :     const __m256i s0[2],
    1458             :     __m256i s1[2],
    1459             :     const __m256i coeffs[1],
    1460             :     uint8_t *const dst) {
    1461             :     __m256i r[4];
    1462             : 
    1463             :     xy_y_convolve_2tap_32_avx2(src, s0, s1, coeffs, r);
    1464             :     xy_y_round_store_32_avx2(r + 0, r + 2, dst);
    1465             : }
    1466             : 
    1467   131852000 : static INLINE void xy_y_convolve_2tap_half_pel_32_avx2(const int16_t *const src,
    1468             :     const __m256i s0[2],
    1469             :     __m256i s1[2],
    1470             :     __m256i r[2]) {
    1471   131852000 :     s1[0] = _mm256_load_si256((__m256i *)src);
    1472   131852000 :     s1[1] = _mm256_load_si256((__m256i *)(src + 16));
    1473   131852000 :     r[0] = _mm256_add_epi16(s0[0], s1[0]);
    1474   131852000 :     r[1] = _mm256_add_epi16(s0[1], s1[1]);
    1475   131852000 : }
    1476             : 
    1477             : static INLINE void xy_y_convolve_2tap_half_pel_32_all_avx2(
    1478             :     const int16_t *const src, const __m256i s0[2], __m256i s1[2],
    1479             :     uint8_t *const dst) {
    1480             :     __m256i r[2];
    1481             : 
    1482             :     xy_y_convolve_2tap_half_pel_32_avx2(src, s0, s1, r);
    1483             :     r[0] = xy_y_round_half_pel_avx2(r[0]);
    1484             :     r[1] = xy_y_round_half_pel_avx2(r[1]);
    1485             :     xy_y_pack_store_32_avx2(r[0], r[1], dst);
    1486             : }
    1487             : 
// Vertical 4-tap second pass on a 2x2 block of 16-bit intermediate data
// (row stride = 2). s_32[4] is the row sliding window, ss_128[2] the
// interleaved row pairs; unlike the y_* helpers, this one also shifts
// ss_128 itself before returning.
static INLINE __m128i xy_y_convolve_4tap_2x2_sse2(const int16_t *const src,
    __m128i s_32[4],
    __m128i ss_128[2],
    const __m128i coeffs[1]) {
    s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(src + 3 * 2));
    const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
    // Overwriting s_32[2] rotates the row window for the next two-row step.
    s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(src + 4 * 2));
    const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[2]);
    ss_128[1] = _mm_unpacklo_epi16(src23, src34);
    const __m128i r = convolve16_4tap_sse2(ss_128, coeffs);
    // Shift the interleave window down for the next call.
    ss_128[0] = ss_128[1];
    return r;
}
    1501             : 
// Vertical 4-tap second pass on a 4x2 block of 16-bit intermediate data
// (row stride = 4). s_64[4] is the row sliding window, ss_256[2] the
// interleaved row pairs; the interleave window is shifted before returning.
static INLINE __m256i xy_y_convolve_4tap_4x2_avx2(const int16_t *const src,
    __m128i s_64[4],
    __m256i ss_256[2],
    const __m256i coeffs[2]) {
    __m256i s_256[2];
    s_64[3] = _mm_loadl_epi64((__m128i *)(src + 3 * 4));
    s_256[0] = _mm256_setr_m128i(s_64[2], s_64[3]);
    // Overwriting s_64[2] rotates the row window for the next two-row step.
    s_64[2] = _mm_loadl_epi64((__m128i *)(src + 4 * 4));
    s_256[1] = _mm256_setr_m128i(s_64[3], s_64[2]);
    ss_256[1] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
    const __m256i r = convolve16_4tap_avx2(ss_256, coeffs);
    // Shift the interleave window down for the next call.
    ss_256[0] = ss_256[1];
    return r;
}
    1516             : 
    1517     2837830 : static INLINE void xy_y_convolve_4tap_16_avx2(const __m256i *const ss,
    1518             :     const __m256i coeffs[2],
    1519             :     __m256i r[2]) {
    1520     2837830 :     r[0] = convolve16_4tap_avx2(ss, coeffs);
    1521     2837800 :     r[1] = convolve16_4tap_avx2(ss + 2, coeffs);
    1522     2837770 : }
    1523             : 
// Vertical 4-tap second pass on an 8x2 block of 16-bit intermediate data
// (row stride = 8). One 32-byte load covers two 8-column rows, so the two
// loads fetch rows (2,3) and (3,4); ss_256[0..1]/[2..3] are the low/high
// interleave windows, shifted down before returning.
static INLINE void xy_y_convolve_4tap_8x2_avx2(const int16_t *const src,
    __m256i ss_256[4],
    const __m256i coeffs[2],
    __m256i r[2]) {
    __m256i s_256[2];
    s_256[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 8));
    s_256[1] = _mm256_loadu_si256((__m256i *)(src + 3 * 8));
    ss_256[1] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
    ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
    xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
    // Shift both interleave windows down for the next call.
    ss_256[0] = ss_256[1];
    ss_256[2] = ss_256[3];
}
    1537             : 
// Half-pel special case of the 8x2 vertical 4-tap pass: the symmetric taps
// let rows be pre-added pairwise (0+3, 1+2) and filtered with a 2-tap
// kernel. s_256[4] is the caller-maintained row window, shifted on exit.
static INLINE void xy_y_convolve_4tap_8x2_half_pel_avx2(
    const int16_t *const src, const __m256i coeffs[1], __m256i s_256[4],
    __m256i r[2]) {
    __m256i a_256[2];
    s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 8));
    s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 8));
    // Pair rows symmetric about the filter center before the 2-tap step.
    a_256[0] = _mm256_add_epi16(s_256[0], s_256[3]);
    a_256[1] = _mm256_add_epi16(s_256[1], s_256[2]);
    xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r);
    // Shift the row window down for the next call.
    s_256[0] = s_256[2];
    s_256[1] = s_256[3];
}
    1550             : 
    1551      869257 : static INLINE void xy_y_convolve_4tap_16x2_avx2(
    1552             :     const int16_t *const src, __m256i s_256[4], __m256i ss_256[4],
    1553             :     __m256i tt_256[4], const __m256i coeffs[2], __m256i r[4]) {
    1554      869257 :     s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
    1555      869257 :     ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
    1556      869257 :     ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
    1557      869257 :     s_256[2] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
    1558      869257 :     tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[2]);
    1559      869257 :     tt_256[3] = _mm256_unpackhi_epi16(s_256[3], s_256[2]);
    1560      869257 :     xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
    1561      869250 :     xy_y_convolve_4tap_16_avx2(tt_256, coeffs, r + 2);
    1562      869246 :     ss_256[0] = ss_256[1];
    1563      869246 :     ss_256[2] = ss_256[3];
    1564      869246 :     tt_256[0] = tt_256[1];
    1565      869246 :     tt_256[2] = tt_256[3];
    1566      869246 : }
    1567             : 
    1568             : static INLINE void xy_y_convolve_4tap_32x2_avx2(
    1569             :     const int16_t *const src, const int32_t stride,
    1570             :     __m256i s_256[4], __m256i ss_256[4],
    1571             :     __m256i tt_256[4], const __m256i coeffs[2], __m256i r[4])
    1572             : {
    1573             :     s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
    1574             :     ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
    1575             :     ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
    1576             :     s_256[2] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
    1577             :     tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[2]);
    1578             :     tt_256[3] = _mm256_unpackhi_epi16(s_256[3], s_256[2]);
    1579             :     xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
    1580             :     xy_y_convolve_4tap_16_avx2(tt_256, coeffs, r + 2);
    1581             :     ss_256[0] = ss_256[1];
    1582             :     ss_256[2] = ss_256[3];
    1583             :     tt_256[0] = tt_256[1];
    1584             :     tt_256[2] = tt_256[3];
    1585             : }
    1586             : 
// Half-pel 4-tap vertical pass over a 16-wide column, producing two output
// rows (r[0..1] and r[2..3]) per call.  The symmetric half-pel filter lets
// rows sharing a tap weight be pre-added so a cheaper 2-tap convolution
// can be used.
// NOTE(review): the name is missing an underscore ("half_pelavx2");
// renaming would touch all callers, so it is only flagged here.
static INLINE void xy_y_convolve_4tap_16x2_half_pelavx2(
    const int16_t *const src, __m256i s_256[5], const __m256i coeffs[1],
    __m256i r[4]) {
    __m256i a_256[2];

    s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
    s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));

    // First output row: tap pairs (row0,row3) and (row1,row2).
    a_256[0] = _mm256_add_epi16(s_256[0], s_256[3]);
    a_256[1] = _mm256_add_epi16(s_256[1], s_256[2]);
    xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r + 0);

    // Second output row: tap pairs (row1,row4) and (row2,row3).
    a_256[0] = _mm256_add_epi16(s_256[1], s_256[4]);
    a_256[1] = _mm256_add_epi16(s_256[2], s_256[3]);
    xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r + 2);

    // Slide the row history down by two rows.
    s_256[0] = s_256[2];
    s_256[1] = s_256[3];
    s_256[2] = s_256[4];
}
    1607             : 
    1608           0 : static INLINE __m128i xy_y_convolve_6tap_2x2_sse2(const int16_t *const src,
    1609             :     __m128i s_32[6],
    1610             :     __m128i ss_128[3],
    1611             :     const __m128i coeffs[3]) {
    1612           0 :     s_32[5] = _mm_cvtsi32_si128(*(int32_t *)(src + 5 * 2));
    1613           0 :     const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
    1614           0 :     s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(src + 6 * 2));
    1615           0 :     const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[4]);
    1616           0 :     ss_128[2] = _mm_unpacklo_epi16(src45, src56);
    1617           0 :     const __m128i r = convolve16_6tap_sse2(ss_128, coeffs);
    1618           0 :     ss_128[0] = ss_128[1];
    1619           0 :     ss_128[1] = ss_128[2];
    1620           0 :     return r;
    1621             : }
    1622             : 
    1623     5727240 : static INLINE __m256i xy_y_convolve_6tap_4x2_avx2(const int16_t *const src,
    1624             :     __m128i s_64[6],
    1625             :     __m256i ss_256[3],
    1626             :     const __m256i coeffs[3]) {
    1627             :     __m256i s_256[2];
    1628     5727240 :     s_64[5] = _mm_loadl_epi64((__m128i *)(src + 5 * 4));
    1629     5727240 :     s_256[0] = _mm256_setr_m128i(s_64[4], s_64[5]);
    1630     5727240 :     s_64[4] = _mm_loadl_epi64((__m128i *)(src + 6 * 4));
    1631     5727240 :     s_256[1] = _mm256_setr_m128i(s_64[5], s_64[4]);
    1632     5727240 :     ss_256[2] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
    1633     5727240 :     const __m256i r = convolve16_6tap_avx2(ss_256, coeffs);
    1634     5727260 :     ss_256[0] = ss_256[1];
    1635     5727260 :     ss_256[1] = ss_256[2];
    1636     5727260 :     return r;
    1637             : }
    1638             : 
    1639   837617000 : static INLINE void xy_y_convolve_6tap_16_avx2(const __m256i *const ss,
    1640             :     const __m256i coeffs[3],
    1641             :     __m256i r[2]) {
    1642   837617000 :     r[0] = convolve16_6tap_avx2(ss, coeffs);
    1643   834291000 :     r[1] = convolve16_6tap_avx2(ss + 3, coeffs);
    1644   833598000 : }
    1645             : 
    1646   129073000 : static INLINE void xy_y_convolve_6tap_8x2_avx2(const int16_t *const src,
    1647             :     __m256i ss_256[6],
    1648             :     const __m256i coeffs[3],
    1649             :     __m256i r[2]) {
    1650             :     __m256i s_256[2];
    1651   129073000 :     s_256[0] = _mm256_loadu_si256((__m256i *)(src + 4 * 8));
    1652   129073000 :     s_256[1] = _mm256_loadu_si256((__m256i *)(src + 5 * 8));
    1653   129073000 :     ss_256[2] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
    1654   129073000 :     ss_256[5] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
    1655   129073000 :     xy_y_convolve_6tap_16_avx2(ss_256, coeffs, r);
    1656   129033000 :     ss_256[0] = ss_256[1];
    1657   129033000 :     ss_256[1] = ss_256[2];
    1658   129033000 :     ss_256[3] = ss_256[4];
    1659   129033000 :     ss_256[4] = ss_256[5];
    1660   129033000 : }
    1661             : 
    1662             : static INLINE void xy_y_convolve_6tap_8x2_half_pel_avx2(
    1663             :     const int16_t *const src, const __m256i coeffs[2], __m256i s_256[4],
    1664             :     __m256i r[2]) {
    1665             :     __m256i a_256[2], ss_256[4];
    1666             :     s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 8));
    1667             :     s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * 8));
    1668             :     a_256[0] = _mm256_add_epi16(s_256[0], s_256[5]);
    1669             :     a_256[1] = _mm256_add_epi16(s_256[1], s_256[4]);
    1670             :     ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
    1671             :     ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
    1672             :     ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
    1673             :     ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
    1674             :     xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
    1675             :     s_256[0] = s_256[2];
    1676             :     s_256[1] = s_256[3];
    1677             :     s_256[2] = s_256[4];
    1678             :     s_256[3] = s_256[5];
    1679             : }
    1680             : 
    1681   359157000 : static INLINE void xy_y_convolve_6tap_16x2_avx2(
    1682             :     const int16_t *const src, const int32_t stride, __m256i s_256[6],
    1683             :     __m256i ss_256[6], __m256i tt_256[6], const __m256i coeffs[3],
    1684             :     __m256i r[4]) {
    1685   359157000 :     s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
    1686   359157000 :     ss_256[2] = _mm256_unpacklo_epi16(s_256[4], s_256[5]);
    1687   359157000 :     ss_256[5] = _mm256_unpackhi_epi16(s_256[4], s_256[5]);
    1688   359157000 :     s_256[4] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
    1689   359157000 :     tt_256[2] = _mm256_unpacklo_epi16(s_256[5], s_256[4]);
    1690   359157000 :     tt_256[5] = _mm256_unpackhi_epi16(s_256[5], s_256[4]);
    1691             : 
    1692   359157000 :     xy_y_convolve_6tap_16_avx2(ss_256, coeffs, r + 0);
    1693   358881000 :     xy_y_convolve_6tap_16_avx2(tt_256, coeffs, r + 2);
    1694             : 
    1695   357852000 :     ss_256[0] = ss_256[1];
    1696   357852000 :     ss_256[1] = ss_256[2];
    1697   357852000 :     ss_256[3] = ss_256[4];
    1698   357852000 :     ss_256[4] = ss_256[5];
    1699             : 
    1700   357852000 :     tt_256[0] = tt_256[1];
    1701   357852000 :     tt_256[1] = tt_256[2];
    1702   357852000 :     tt_256[3] = tt_256[4];
    1703   357852000 :     tt_256[4] = tt_256[5];
    1704   357852000 : }
    1705             : 
// Half-pel 6-tap vertical pass over a 16-wide column, two output rows per
// call.  Symmetric taps let rows with equal weights be pre-added so a
// 4-tap convolution suffices.  In the second half the history rotation is
// interleaved with the sums: each sum reads an s_256 slot just before the
// rotation overwrites it, so statement order here is load-bearing.
static INLINE void xy_y_convolve_6tap_16x2_half_pel_avx2(
    const int16_t *const src, const int32_t stride, __m256i s_256[6],
    __m256i ss_256[4], const __m256i coeffs[2], __m256i r[4]) {
    __m256i a_256[2];

    // First output row: pre-add tap pairs (row0,row5) and (row1,row4).
    s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
    a_256[0] = _mm256_add_epi16(s_256[0], s_256[5]);
    a_256[1] = _mm256_add_epi16(s_256[1], s_256[4]);
    ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
    ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
    ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
    ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
    xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);

    // Second output row: pairs (row2,row5)/(row1',row4') while the 6-row
    // history slides down by two rows in between.
    a_256[1] = _mm256_add_epi16(s_256[2], s_256[5]);
    s_256[0] = s_256[2];
    s_256[2] = s_256[4];
    s_256[4] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
    a_256[0] = _mm256_add_epi16(s_256[1], s_256[4]);
    s_256[1] = s_256[3];
    s_256[3] = s_256[5];
    ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
    ss_256[1] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
    ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
    ss_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
    xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2);
}
    1733             : 
    1734           0 : static INLINE __m128i xy_y_convolve_8tap_2x2_sse2(const int16_t *const src,
    1735             :     __m128i s_32[8],
    1736             :     __m128i ss_128[4],
    1737             :     const __m128i coeffs[4]) {
    1738           0 :     s_32[7] = _mm_cvtsi32_si128(*(int32_t *)(src + 7 * 2));
    1739           0 :     const __m128i src67 = _mm_unpacklo_epi32(s_32[6], s_32[7]);
    1740           0 :     s_32[6] = _mm_cvtsi32_si128(*(int32_t *)(src + 8 * 2));
    1741           0 :     const __m128i src78 = _mm_unpacklo_epi32(s_32[7], s_32[6]);
    1742           0 :     ss_128[3] = _mm_unpacklo_epi16(src67, src78);
    1743           0 :     const __m128i r = convolve16_8tap_sse2(ss_128, coeffs);
    1744           0 :     ss_128[0] = ss_128[1];
    1745           0 :     ss_128[1] = ss_128[2];
    1746           0 :     ss_128[2] = ss_128[3];
    1747           0 :     return r;
    1748             : }
    1749             : 
    1750      708165 : static INLINE __m256i xy_y_convolve_8tap_4x2_avx2(const int16_t *const src,
    1751             :     __m128i s_64[8],
    1752             :     __m256i ss_256[4],
    1753             :     const __m256i coeffs[4]) {
    1754             :     __m256i s_256[2];
    1755      708165 :     s_64[7] = _mm_loadl_epi64((__m128i *)(src + 7 * 4));
    1756      708165 :     s_256[0] = _mm256_setr_m128i(s_64[6], s_64[7]);
    1757      708165 :     s_64[6] = _mm_loadl_epi64((__m128i *)(src + 8 * 4));
    1758      708165 :     s_256[1] = _mm256_setr_m128i(s_64[7], s_64[6]);
    1759      708165 :     ss_256[3] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
    1760      708165 :     const __m256i r = convolve16_8tap_avx2(ss_256, coeffs);
    1761      708166 :     ss_256[0] = ss_256[1];
    1762      708166 :     ss_256[1] = ss_256[2];
    1763      708166 :     ss_256[2] = ss_256[3];
    1764      708166 :     return r;
    1765             : }
    1766             : 
    1767   402514000 : static INLINE void xy_y_convolve_8tap_16_avx2(const __m256i *const ss,
    1768             :     const __m256i coeffs[4],
    1769             :     __m256i r[2]) {
    1770   402514000 :     r[0] = convolve16_8tap_avx2(ss, coeffs);
    1771   401863000 :     r[1] = convolve16_8tap_avx2(ss + 4, coeffs);
    1772   400963000 : }
    1773             : 
    1774    55040100 : static INLINE void xy_y_convolve_8tap_8x2_avx2(const int16_t *const src,
    1775             :     __m256i ss_256[8],
    1776             :     const __m256i coeffs[4],
    1777             :     __m256i r[2]) {
    1778             :     __m256i s_256[2];
    1779    55040100 :     s_256[0] = _mm256_loadu_si256((__m256i *)(src + 6 * 8));
    1780    55040100 :     s_256[1] = _mm256_loadu_si256((__m256i *)(src + 7 * 8));
    1781    55040100 :     ss_256[3] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
    1782    55040100 :     ss_256[7] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
    1783    55040100 :     xy_y_convolve_8tap_16_avx2(ss_256, coeffs, r);
    1784    55041900 :     ss_256[0] = ss_256[1];
    1785    55041900 :     ss_256[1] = ss_256[2];
    1786    55041900 :     ss_256[2] = ss_256[3];
    1787    55041900 :     ss_256[4] = ss_256[5];
    1788    55041900 :     ss_256[5] = ss_256[6];
    1789    55041900 :     ss_256[6] = ss_256[7];
    1790    55041900 : }
    1791             : 
    1792             : static INLINE void xy_y_convolve_8tap_8x2_half_pel_avx2(
    1793             :     const int16_t *const src, const __m256i coeffs[2], __m256i s_256[8],
    1794             :     __m256i r[2]) {
    1795             :     __m256i a_256[4], ss_256[4];
    1796             : 
    1797             :     s_256[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 8));
    1798             :     s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 8));
    1799             :     a_256[0] = _mm256_add_epi16(s_256[0], s_256[7]);
    1800             :     a_256[1] = _mm256_add_epi16(s_256[1], s_256[6]);
    1801             :     a_256[2] = _mm256_add_epi16(s_256[2], s_256[5]);
    1802             :     a_256[3] = _mm256_add_epi16(s_256[3], s_256[4]);
    1803             :     ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
    1804             :     ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
    1805             :     ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
    1806             :     ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
    1807             :     xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
    1808             :     s_256[0] = s_256[2];
    1809             :     s_256[1] = s_256[3];
    1810             :     s_256[2] = s_256[4];
    1811             :     s_256[3] = s_256[5];
    1812             :     s_256[4] = s_256[6];
    1813             :     s_256[5] = s_256[7];
    1814             : }
    1815             : 
// 8-tap vertical pass over a 16-wide column, two output rows per call.
// ss_256 holds the interleaved row pairs for the even output row, tt_256
// for the odd one; both are rotated one slot at the end so the next call
// only loads two fresh rows.  Uses aligned loads (_mm256_load_si256), so
// the source rows must be 32-byte aligned.
SIMD_INLINE void xy_y_convolve_8tap_16x2_avx2(
    const int16_t *const src, const int32_t stride, const __m256i coeffs[4],
    __m256i s_256[8], __m256i ss_256[8], __m256i tt_256[8], __m256i r[4]) {
    s_256[7] = _mm256_load_si256((__m256i *)(src + 7 * stride));
    ss_256[3] = _mm256_unpacklo_epi16(s_256[6], s_256[7]);
    ss_256[7] = _mm256_unpackhi_epi16(s_256[6], s_256[7]);
    s_256[6] = _mm256_load_si256((__m256i *)(src + 8 * stride));
    tt_256[3] = _mm256_unpacklo_epi16(s_256[7], s_256[6]);
    tt_256[7] = _mm256_unpackhi_epi16(s_256[7], s_256[6]);
    xy_y_convolve_8tap_16_avx2(ss_256, coeffs, r + 0);
    xy_y_convolve_8tap_16_avx2(tt_256, coeffs, r + 2);
    // Rotate both windows: low halves in [0..3], high halves in [4..7].
    ss_256[0] = ss_256[1];
    ss_256[1] = ss_256[2];
    ss_256[2] = ss_256[3];
    ss_256[4] = ss_256[5];
    ss_256[5] = ss_256[6];
    ss_256[6] = ss_256[7];
    tt_256[0] = tt_256[1];
    tt_256[1] = tt_256[2];
    tt_256[2] = tt_256[3];
    tt_256[4] = tt_256[5];
    tt_256[5] = tt_256[6];
    tt_256[6] = tt_256[7];
}
    1840             : 
// Half-pel 8-tap vertical pass over a 16-wide column, two output rows per
// call.  Symmetric taps: rows (0,7), (1,6), (2,5), (3,4) are pre-added so
// a 4-tap convolution suffices.  In the second half the history rotation
// is interleaved with the sums — each a_256 sum reads an s_256 slot just
// before the rotation overwrites it — so statement order is load-bearing.
// Uses aligned loads, so the source rows must be 32-byte aligned.
// NOTE(review): coeffs is declared [4] but the 4-tap helper reads only two
// coefficient pairs — presumably callers pass the pre-added half-pel
// layout; confirm against the call sites.
static INLINE void xy_y_convolve_8tap_16x2_half_pel_avx2(
    const int16_t *const src, const int32_t stride, const __m256i coeffs[4],
    __m256i s_256[8], __m256i r[4]) {
    __m256i a_256[4], ss_256[4];
    s_256[7] = _mm256_load_si256((__m256i *)(src + 7 * stride));

    // First output row: pre-add tap pairs (0,7), (1,6), (2,5), (3,4).
    a_256[0] = _mm256_add_epi16(s_256[0], s_256[7]);
    a_256[1] = _mm256_add_epi16(s_256[1], s_256[6]);
    a_256[2] = _mm256_add_epi16(s_256[2], s_256[5]);
    a_256[3] = _mm256_add_epi16(s_256[3], s_256[4]);
    ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
    ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
    ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
    ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);

    xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);

    // Second output row; the 8-row history slides down by two rows while
    // the sums are formed.
    a_256[1] = _mm256_add_epi16(s_256[2], s_256[7]);
    a_256[2] = _mm256_add_epi16(s_256[3], s_256[6]);
    a_256[3] = _mm256_add_epi16(s_256[4], s_256[5]);
    s_256[0] = s_256[2];
    s_256[2] = s_256[4];
    s_256[4] = s_256[6];
    s_256[6] = _mm256_load_si256((__m256i *)(src + 8 * stride));

    a_256[0] = _mm256_add_epi16(s_256[1], s_256[6]);
    s_256[1] = s_256[3];
    s_256[3] = s_256[5];
    s_256[5] = s_256[7];
    ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
    ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
    ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
    ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);

    xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2);
}
    1877             : 
    1878           0 : static INLINE void jnt_comp_avg_round_store_2x2_sse2(
    1879             :     const __m128i res, const __m128i factor, const __m128i offset,
    1880             :     const ConvBufType *const dst, const int32_t dst_stride, uint8_t *const dst8,
    1881             :     const int32_t dst8_stride) {
    1882           0 :     const __m128i r = jnt_y_round_sse2(res);
    1883             :     __m128i d;
    1884             : 
    1885           0 :     d = load_u16_2x2_sse4_1(dst, dst_stride);
    1886           0 :     d = _mm_unpacklo_epi16(d, r);
    1887           0 :     d = _mm_madd_epi16(d, factor);
    1888           0 :     d = _mm_add_epi32(d, offset);
    1889           0 :     d = _mm_srai_epi32(d, 8);
    1890           0 :     d = _mm_packs_epi32(d, d);
    1891           0 :     pack_store_2x2_sse2(d, dst8, dst8_stride);
    1892           0 : }
    1893             : 
    1894      225212 : static INLINE void jnt_comp_avg_round_store_4x2_sse2(
    1895             :     const __m128i res, const __m128i factor, const __m128i offset,
    1896             :     const ConvBufType *const dst, const int32_t dst_stride, uint8_t *const dst8,
    1897             :     const int32_t dst8_stride) {
    1898      225212 :     const __m128i r = jnt_y_round_sse2(res);
    1899      225212 :     const __m128i dst_128 = load_u16_4x2_sse2(dst, dst_stride);
    1900             :     __m128i d[2];
    1901             : 
    1902      225212 :     d[0] = _mm_unpacklo_epi16(dst_128, r);
    1903      225212 :     d[1] = _mm_unpackhi_epi16(dst_128, r);
    1904      225212 :     d[0] = _mm_madd_epi16(d[0], factor);
    1905      225212 :     d[1] = _mm_madd_epi16(d[1], factor);
    1906      225212 :     d[0] = _mm_add_epi32(d[0], offset);
    1907      225212 :     d[1] = _mm_add_epi32(d[1], offset);
    1908      225212 :     d[0] = _mm_srai_epi32(d[0], 8);
    1909      225212 :     d[1] = _mm_srai_epi32(d[1], 8);
    1910      225212 :     d[0] = _mm_packs_epi32(d[0], d[1]);
    1911      225212 :     pack_store_4x2_sse2(d[0], dst8, dst8_stride);
    1912      225212 : }
    1913             : 
    1914   231372000 : static INLINE __m256i jnt_comp_avg_convolve_16_avx2(const __m256i res,
    1915             :     const __m256i dst,
    1916             :     const __m256i factor,
    1917             :     const __m256i offset) {
    1918             :     __m256i d[2];
    1919             : 
    1920   231372000 :     d[0] = _mm256_unpacklo_epi16(dst, res);
    1921   231372000 :     d[1] = _mm256_unpackhi_epi16(dst, res);
    1922   231372000 :     d[0] = _mm256_madd_epi16(d[0], factor);
    1923   231372000 :     d[1] = _mm256_madd_epi16(d[1], factor);
    1924   231372000 :     d[0] = _mm256_add_epi32(d[0], offset);
    1925   231372000 :     d[1] = _mm256_add_epi32(d[1], offset);
    1926   231372000 :     d[0] = _mm256_srai_epi32(d[0], 8);
    1927   231372000 :     d[1] = _mm256_srai_epi32(d[1], 8);
    1928   462744000 :     return _mm256_packs_epi32(d[0], d[1]);
    1929             : }
    1930             : 
    1931    29951000 : static INLINE void jnt_comp_avg_round_store_8x2_avx2(
    1932             :     const __m256i res, const __m256i factor, const __m256i offset,
    1933             :     const ConvBufType *const dst, const int32_t dst_stride, uint8_t *const dst8,
    1934             :     const int32_t dst8_stride) {
    1935    29951000 :     const __m256i r = jnt_y_round_avx2(res);
    1936             :     __m256i d;
    1937             : 
    1938    29952700 :     d = loadu_u16_8x2_avx2(dst, dst_stride);
    1939    29949800 :     d = jnt_comp_avg_convolve_16_avx2(r, d, factor, offset);
    1940    29953800 :     pack_store_8x2_avx2(d, dst8, dst8_stride);
    1941    29951000 : }
    1942             : 
    1943             : SIMD_INLINE void jnt_comp_avg_round_store_16x2_avx2(
    1944             :     const __m256i res[2], const __m256i factor, const __m256i offset,
    1945             :     const ConvBufType *const dst, const int32_t dst_stride, uint8_t *const dst8,
    1946             :     const int32_t dst8_stride) {
    1947             :     __m256i r[2], d[2];
    1948             : 
    1949    33230600 :     r[0] = jnt_y_round_avx2(res[0]);
    1950    33231800 :     r[1] = jnt_y_round_avx2(res[1]);
    1951    33232000 :     d[0] = loadu_u16_8x2_avx2(dst, dst_stride);
    1952    33230500 :     d[1] = loadu_u16_8x2_avx2(dst + 8, dst_stride);
    1953    33229700 :     d[0] = jnt_comp_avg_convolve_16_avx2(r[0], d[0], factor, offset);
    1954    33231500 :     d[1] = jnt_comp_avg_convolve_16_avx2(r[1], d[1], factor, offset);
    1955    33232700 :     pack_store_16x2_avx2(d[0], d[1], dst8, dst8_stride);
    1956    33232200 : }
    1957             : 
    1958             : SIMD_INLINE void jnt_comp_avg_round_store_32_avx2(const __m256i res[2],
    1959             :     const __m256i factor,
    1960             :     const __m256i offset,
    1961             :     const ConvBufType *const dst,
    1962             :     uint8_t *const dst8) {
    1963             :     __m256i r[2], d[2];
    1964             : 
    1965    67840500 :     r[0] = jnt_y_round_avx2(res[0]);
    1966    67842600 :     r[1] = jnt_y_round_avx2(res[1]);
    1967    67843100 :     d[0] = loadu_u16_8x2_avx2(dst, 16);
    1968    67840400 :     d[1] = loadu_u16_8x2_avx2(dst + 8, 16);
    1969    67839100 :     d[0] = jnt_comp_avg_convolve_16_avx2(r[0], d[0], factor, offset);
    1970    67842500 :     d[1] = jnt_comp_avg_convolve_16_avx2(r[1], d[1], factor, offset);
    1971    67843500 :     convolve_store_32_avx2(d[0], d[1], dst8);
    1972    67842400 : }
    1973             : 
    1974      351615 : static INLINE __m128i jnt_avg_4x2_sse2(const __m128i res, const __m128i dst) {
    1975      351615 :     const __m128i d = _mm_add_epi16(res, dst);
    1976      351615 :     return _mm_srai_epi16(d, 5);
    1977             : }
    1978             : 
    1979           0 : static INLINE void jnt_avg_round_store_2x2_sse2(
    1980             :     const __m128i res, const __m128i offset, const ConvBufType *const dst,
    1981             :     const int32_t dst_stride, uint8_t *const dst8, const int32_t dst8_stride) {
    1982           0 :     const __m128i r = jnt_avg_round_sse2(res, offset);
    1983             :     __m128i d;
    1984             : 
    1985           0 :     d = load_u16_2x2_sse4_1(dst, dst_stride);
    1986           0 :     d = jnt_avg_4x2_sse2(r, d);
    1987           0 :     pack_store_2x2_sse2(d, dst8, dst8_stride);
    1988           0 : }
    1989             : 
    1990      351613 : static INLINE void jnt_avg_round_store_4x2_sse2(
    1991             :     const __m128i res, const __m128i offset, const ConvBufType *const dst,
    1992             :     const int32_t dst_stride, uint8_t *const dst8, const int32_t dst8_stride) {
    1993      351613 :     const __m128i r = jnt_avg_round_sse2(res, offset);
    1994             :     __m128i d;
    1995             : 
    1996      351614 :     d = load_u16_4x2_sse2(dst, dst_stride);
    1997      351615 :     d = jnt_avg_4x2_sse2(r, d);
    1998      351615 :     pack_store_4x2_sse2(d, dst8, dst8_stride);
    1999      351614 : }
    2000             : 
    2001   911865000 : static INLINE __m256i jnt_avg_16_avx2(const __m256i res, const __m256i dst) {
    2002   911865000 :     const __m256i d = _mm256_add_epi16(res, dst);
    2003   911865000 :     return _mm256_srai_epi16(d, 5);
    2004             : }
    2005             : 
    2006    33379100 : static INLINE void jnt_avg_round_store_8x2_sse2(
    2007             :     const __m256i res, const __m256i offset, const ConvBufType *const dst,
    2008             :     const int32_t dst_stride, uint8_t *const dst8, const int32_t dst8_stride) {
    2009    33379100 :     const __m256i r = jnt_avg_round_avx2(res, offset);
    2010             :     __m256i d;
    2011             : 
    2012    33375600 :     d = loadu_u16_8x2_avx2(dst, dst_stride);
    2013    33371500 :     d = jnt_avg_16_avx2(r, d);
    2014    33375000 :     pack_store_8x2_avx2(d, dst8, dst8_stride);
    2015    33377200 : }
    2016             : 
    2017    41128900 : static INLINE void jnt_avg_round_store_16x2_avx2(
    2018             :     const __m256i res[2], const __m256i offset, const ConvBufType *const dst,
    2019             :     const int32_t dst_stride, uint8_t *const dst8, const int32_t dst8_stride) {
    2020             :     __m256i r[2], d[2];
    2021             : 
    2022    41128900 :     r[0] = jnt_avg_round_avx2(res[0], offset);
    2023    41127000 :     r[1] = jnt_avg_round_avx2(res[1], offset);
    2024    41116400 :     d[0] = loadu_u16_8x2_avx2(dst, dst_stride);
    2025    41113400 :     d[1] = loadu_u16_8x2_avx2(dst + 8, dst_stride);
    2026    41110900 :     d[0] = jnt_avg_16_avx2(r[0], d[0]);
    2027    41116000 :     d[1] = jnt_avg_16_avx2(r[1], d[1]);
    2028    41117700 :     pack_store_16x2_avx2(d[0], d[1], dst8, dst8_stride);
    2029    41127100 : }
    2030             : 
    2031    80887900 : static INLINE void jnt_avg_round_store_32_avx2(const __m256i res[2],
    2032             :     const __m256i offset,
    2033             :     const ConvBufType *const dst,
    2034             :     uint8_t *const dst8) {
    2035             :     __m256i r[2], d[2];
    2036             : 
    2037    80887900 :     r[0] = jnt_avg_round_avx2(res[0], offset);
    2038    80880600 :     r[1] = jnt_avg_round_avx2(res[1], offset);
    2039    80846300 :     d[0] = loadu_u16_8x2_avx2(dst, 16);
    2040    80843400 :     d[1] = loadu_u16_8x2_avx2(dst + 8, 16);
    2041    80840600 :     d[0] = jnt_avg_16_avx2(r[0], d[0]);
    2042    80848500 :     d[1] = jnt_avg_16_avx2(r[1], d[1]);
    2043    80830500 :     convolve_store_32_avx2(d[0], d[1], dst8);
    2044    80835700 : }
    2045             : 
    2046           0 : static INLINE void jnt_no_avg_round_store_2x2_sse2(const __m128i res,
    2047             :     const __m128i offset,
    2048             :     ConvBufType *const dst,
    2049             :     const int32_t dst_stride) {
    2050           0 :     const __m128i d = jnt_no_avg_round_sse2(res, offset);
    2051             :     store_u16_2x2_sse2(d, dst, dst_stride);
    2052           0 : }
    2053             : 
    2054      733248 : static INLINE void jnt_no_avg_round_store_4x2_sse2(const __m128i res,
    2055             :     const __m128i offset,
    2056             :     ConvBufType *const dst,
    2057             :     const int32_t dst_stride) {
    2058      733248 :     const __m128i d = jnt_no_avg_round_sse2(res, offset);
    2059             :     store_u16_4x2_sse2(d, dst, dst_stride);
    2060      733245 : }
    2061             : 
    2062   130712000 : static INLINE void jnt_no_avg_round_store_8x2_avx2(const __m256i res,
    2063             :     const __m256i offset,
    2064             :     ConvBufType *const dst,
    2065             :     const int32_t dst_stride) {
    2066   130712000 :     const __m256i d = jnt_no_avg_round_avx2(res, offset);
    2067   130680000 :     storeu_u16_8x2_avx2(d, dst, dst_stride);
    2068   130700000 : }
    2069             : 
    2070   154087000 : static INLINE void jnt_no_avg_round_store_16x2_avx2(const __m256i res[2],
    2071             :     const __m256i offset,
    2072             :     ConvBufType *const dst,
    2073             :     const int32_t dst_stride) {
    2074             :     __m256i d[2];
    2075             : 
    2076   154087000 :     d[0] = jnt_no_avg_round_avx2(res[0], offset);
    2077   154061000 :     d[1] = jnt_no_avg_round_avx2(res[1], offset);
    2078   153944000 :     jnt_no_avg_store_16x2_avx2(d[0], d[1], dst, dst_stride);
    2079   153973000 : }
    2080             : 
    2081   297976000 : static INLINE void jnt_no_avg_round_store_32_avx2(const __m256i res[2],
    2082             :     const __m256i offset,
    2083             :     ConvBufType *const dst) {
    2084             :     __m256i d[2];
    2085             : 
    2086   297976000 :     d[0] = jnt_no_avg_round_avx2(res[0], offset);
    2087   297856000 :     d[1] = jnt_no_avg_round_avx2(res[1], offset);
    2088   297499000 :     jnt_no_avg_store_16x2_avx2(d[0], d[1], dst, 16);
    2089   297655000 : }
    2090             : 
    2091   128357000 : static INLINE __m256i comp_avg(const __m256i *const data_ref_0,
    2092             :     const __m256i *const res_unsigned,
    2093             :     const __m256i *const wt,
    2094             :     const int32_t use_jnt_comp_avg) {
    2095             :     __m256i res;
    2096   128357000 :     if (use_jnt_comp_avg) {
    2097    56132800 :         const __m256i data_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned);
    2098    56132800 :         const __m256i data_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned);
    2099             : 
    2100    56132800 :         const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, *wt);
    2101   112266000 :         const __m256i wt_res_hi = _mm256_madd_epi16(data_hi, *wt);
    2102             : 
    2103    56132800 :         const __m256i res_lo = _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
    2104    56132800 :         const __m256i res_hi = _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
    2105             : 
    2106    56132800 :         res = _mm256_packs_epi32(res_lo, res_hi);
    2107             :     }
    2108             :     else {
    2109   144449000 :         const __m256i wt_res = _mm256_add_epi16(*data_ref_0, *res_unsigned);
    2110    72224700 :         res = _mm256_srai_epi16(wt_res, 1);
    2111             :     }
    2112   128357000 :     return res;
    2113             : }
    2114             : 
    2115   128329000 : static INLINE __m256i convolve_rounding(const __m256i *const res_unsigned,
    2116             :     const __m256i *const offset_const,
    2117             :     const __m256i *const round_const,
    2118             :     const int32_t round_shift) {
    2119   128329000 :     const __m256i res_signed = _mm256_sub_epi16(*res_unsigned, *offset_const);
    2120   256658000 :     const __m256i res_round = _mm256_srai_epi16(
    2121             :         _mm256_add_epi16(res_signed, *round_const), round_shift);
    2122   128329000 :     return res_round;
    2123             : }
    2124             : 
    2125           0 : static INLINE __m256i highbd_comp_avg(const __m256i *const data_ref_0,
    2126             :     const __m256i *const res_unsigned,
    2127             :     const __m256i *const wt0,
    2128             :     const __m256i *const wt1,
    2129             :     const int32_t use_jnt_comp_avg) {
    2130             :     __m256i res;
    2131           0 :     if (use_jnt_comp_avg) {
    2132           0 :         const __m256i wt0_res = _mm256_mullo_epi32(*data_ref_0, *wt0);
    2133           0 :         const __m256i wt1_res = _mm256_mullo_epi32(*res_unsigned, *wt1);
    2134           0 :         const __m256i wt_res = _mm256_add_epi32(wt0_res, wt1_res);
    2135           0 :         res = _mm256_srai_epi32(wt_res, DIST_PRECISION_BITS);
    2136             :     }
    2137             :     else {
    2138           0 :         const __m256i wt_res = _mm256_add_epi32(*data_ref_0, *res_unsigned);
    2139           0 :         res = _mm256_srai_epi32(wt_res, 1);
    2140             :     }
    2141           0 :     return res;
    2142             : }
    2143   167618000 : static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) {
    2144   167618000 :     __m128i mhi = _mm_loadu_si128((__m128i *)(hi));
    2145   167618000 :     __m128i mlo = _mm_loadu_si128((__m128i *)(lo));
    2146   167618000 :     return yy_set_m128i(mhi, mlo);
    2147             : }
    2148             : 
    2149             : 
    2150           0 : static INLINE __m256i highbd_convolve_rounding(
    2151             :     const __m256i *const res_unsigned, const __m256i *const offset_const,
    2152             :     const __m256i *const round_const, const int32_t round_shift) {
    2153           0 :     const __m256i res_signed = _mm256_sub_epi32(*res_unsigned, *offset_const);
    2154           0 :     const __m256i res_round = _mm256_srai_epi32(
    2155             :         _mm256_add_epi32(res_signed, *round_const), round_shift);
    2156             : 
    2157           0 :     return res_round;
    2158             : }
    2159             : #if OBMC_CONVOLVE
    2160    70205800 : static INLINE __m256i convolve_4tap(const __m256i *const s,
    2161             :     const __m256i *const coeffs) {
    2162    70205800 :     const __m256i res_1 = _mm256_madd_epi16(s[0], coeffs[0]);
    2163   140412000 :     const __m256i res_2 = _mm256_madd_epi16(s[1], coeffs[1]);
    2164    70205800 :     return _mm256_add_epi32(res_1, res_2);
    2165             : }
    2166             : 
    2167  4116930000 : static INLINE __m256i convolve_8tap(const __m256i *const s,
    2168             :     const __m256i *const coeffs) {
    2169  4116930000 :     const __m256i res_01 = _mm256_madd_epi16(s[0], coeffs[0]);
    2170  4116930000 :     const __m256i res_23 = _mm256_madd_epi16(s[1], coeffs[1]);
    2171  4116930000 :     const __m256i res_45 = _mm256_madd_epi16(s[2], coeffs[2]);
    2172  8233860000 :     const __m256i res_67 = _mm256_madd_epi16(s[3], coeffs[3]);
    2173  4116930000 :     const __m256i res_0123 = _mm256_add_epi32(res_01, res_23);
    2174  4116930000 :     const __m256i res_4567 = _mm256_add_epi32(res_45, res_67);
    2175  4116930000 :     return _mm256_add_epi32(res_0123, res_4567);
    2176             : }
    2177   737897000 : static INLINE __m256i convolve_2tap(const __m256i *const s,
    2178             :     const __m256i *const coeffs) {
    2179  1475790000 :     return _mm256_madd_epi16(s[0], coeffs[0]);
    2180             : }
    2181   525258000 : static INLINE __m256i convolve_x_2tap(const __m256i data,
    2182             :     const __m256i *const coeffs, const __m256i *const filt) {
    2183   525258000 :     const __m256i s = _mm256_shuffle_epi8(data, filt[0]);
    2184   525258000 :     return convolve_2tap_avx2(&s, coeffs);
    2185             : }
    2186   129142000 : static INLINE __m256i convolve_x_4tap(const __m256i data,
    2187             :     const __m256i *const coeffs, const __m256i *const filt) {
    2188             :     __m256i s[2];
    2189             : 
    2190   129142000 :     s[0] = _mm256_shuffle_epi8(data, filt[0]);
    2191   129142000 :     s[1] = _mm256_shuffle_epi8(data, filt[1]);
    2192             : 
    2193   129142000 :     return convolve_4tap_avx2(s, coeffs);
    2194             : }
    2195             : 
    2196  2943610000 : static INLINE __m256i convolve_x_8tap_avx2(const __m256i data,
    2197             :     const __m256i *const coeffs, const __m256i *const filt) {
    2198             :     __m256i s[4];
    2199             : 
    2200  2943610000 :     s[0] = _mm256_shuffle_epi8(data, filt[0]);
    2201  2943610000 :     s[1] = _mm256_shuffle_epi8(data, filt[1]);
    2202  2943610000 :     s[2] = _mm256_shuffle_epi8(data, filt[2]);
    2203  2943610000 :     s[3] = _mm256_shuffle_epi8(data, filt[3]);
    2204             : 
    2205  2943610000 :     return convolve_8tap_avx2(s, coeffs);
    2206             : }
    2207             : 
// Vertical 2-tap pass of the two-stage convolve. For each pair of output
// rows: interleave two intermediate rows of t_block, apply the 2-tap
// vertical filter, perform both rounding stages (sum_round_v/sum_shift_v,
// then round_const_v/round_shift_v), pack to 8-bit and store, with the
// tail branches handling 8-, 4- and 2-pixel-wide stores.
// NOTE(review): expands in a scope that must declare i, h, w, j, t_block,
// im_stride, coeffs_v, sum_round_v, sum_shift_v, round_const_v,
// round_shift_v, dst and dst_stride; h is presumably even — confirm at
// call sites. Only s[0]/s[1] of the s[6] scratch array are used here.
#define CONVOLVE_SR_VERTICAL_FILTER_2TAP                                         \
    __m256i s[6];                                                                \
                                                                                 \
    for (i = 0; i < h; i += 2) {                                                 \
        const int16_t *data = &t_block[i * im_stride];                           \
                                                                                 \
        const __m256i s4 =                                                       \
            _mm256_loadu_si256((__m256i *)(data + 0 * im_stride));               \
        const __m256i s5 =                                                       \
            _mm256_loadu_si256((__m256i *)(data + 1 * im_stride));               \
                                                                                 \
        s[0] = _mm256_unpacklo_epi16(s4, s5);                                    \
        s[1] = _mm256_unpackhi_epi16(s4, s5);                                    \
                                                                                 \
        __m256i res_a = convolve_2tap(s + 0, coeffs_v);                          \
        __m256i res_b = convolve_2tap(s + 1, coeffs_v);                          \
                                                                                 \
        res_a =                                                                  \
            _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); \
        res_b =                                                                  \
            _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); \
                                                                                 \
        const __m256i res_a_round = _mm256_sra_epi32(                            \
            _mm256_add_epi32(res_a, round_const_v), round_shift_v);              \
        const __m256i res_b_round = _mm256_sra_epi32(                            \
            _mm256_add_epi32(res_b, round_const_v), round_shift_v);              \
                                                                                 \
        const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);  \
        const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit);        \
                                                                                 \
        const __m128i res_0 = _mm256_castsi256_si128(res_8b);                    \
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);               \
                                                                                 \
        __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];                \
        __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride];   \
        if (w - j > 4) {                                                         \
            _mm_storel_epi64(p_0, res_0);                                        \
            _mm_storel_epi64(p_1, res_1);                                        \
        }                                                                        \
        else if (w == 4) {                                                       \
            xx_storel_32(p_0, res_0);                                            \
            xx_storel_32(p_1, res_1);                                            \
        }                                                                        \
        else {                                                                   \
            *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);                         \
            *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);                         \
        }                                                                        \
    }
// Vertical 4-tap pass of the two-stage convolve. Primes s[] with the first
// four intermediate rows, then for each output row pair loads two more
// rows, filters with coeffs_v + 1 (the 4-tap coefficients sit in the
// middle of the 8-tap layout), applies both rounding stages, packs to
// 8-bit and stores (8/4/2-wide tails), and slides the s[] window down.
// NOTE(review): expands in a scope that must declare i, h, w, j, t_block,
// im_stride, coeffs_v, sum_round_v, sum_shift_v, round_const_v,
// round_shift_v, dst and dst_stride; h is presumably even — confirm at
// call sites. s[2]/s[5] are filled inside the loop, so s[6] is fully used.
#define CONVOLVE_SR_VERTICAL_FILTER_4TAP                                         \
    __m256i s[6];                                                                \
    __m256i src_0 = _mm256_loadu_si256((__m256i *)(t_block + 0 * im_stride));    \
    __m256i src_1 = _mm256_loadu_si256((__m256i *)(t_block + 1 * im_stride));    \
    __m256i src_2 = _mm256_loadu_si256((__m256i *)(t_block + 2 * im_stride));    \
    __m256i src_3 = _mm256_loadu_si256((__m256i *)(t_block + 3 * im_stride));    \
                                                                                 \
    s[0] = _mm256_unpacklo_epi16(src_0, src_1);                                  \
    s[1] = _mm256_unpacklo_epi16(src_2, src_3);                                  \
    s[3] = _mm256_unpackhi_epi16(src_0, src_1);                                  \
    s[4] = _mm256_unpackhi_epi16(src_2, src_3);                                  \
                                                                                 \
    for (i = 0; i < h; i += 2) {                                                 \
        const int16_t *data = &t_block[i * im_stride];                           \
                                                                                 \
        const __m256i s4 =                                                       \
            _mm256_loadu_si256((__m256i *)(data + 4 * im_stride));               \
        const __m256i s5 =                                                       \
            _mm256_loadu_si256((__m256i *)(data + 5 * im_stride));               \
                                                                                 \
        s[2] = _mm256_unpacklo_epi16(s4, s5);                                    \
        s[5] = _mm256_unpackhi_epi16(s4, s5);                                    \
                                                                                 \
        __m256i res_a = convolve_4tap(s, coeffs_v + 1);                          \
        __m256i res_b = convolve_4tap(s + 3, coeffs_v + 1);                      \
                                                                                 \
        res_a =                                                                  \
            _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); \
        res_b =                                                                  \
            _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); \
                                                                                 \
        const __m256i res_a_round = _mm256_sra_epi32(                            \
            _mm256_add_epi32(res_a, round_const_v), round_shift_v);              \
        const __m256i res_b_round = _mm256_sra_epi32(                            \
            _mm256_add_epi32(res_b, round_const_v), round_shift_v);              \
                                                                                 \
        const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);  \
        const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit);        \
                                                                                 \
        const __m128i res_0 = _mm256_castsi256_si128(res_8b);                    \
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);               \
                                                                                 \
        __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];                \
        __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride];   \
        if (w - j > 4) {                                                         \
            _mm_storel_epi64(p_0, res_0);                                        \
            _mm_storel_epi64(p_1, res_1);                                        \
        }                                                                        \
        else if (w == 4) {                                                       \
            xx_storel_32(p_0, res_0);                                            \
            xx_storel_32(p_1, res_1);                                            \
        }                                                                        \
        else {                                                                   \
            *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);                         \
            *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);                         \
        }                                                                        \
                                                                                 \
        s[0] = s[1];                                                             \
        s[1] = s[2];                                                             \
        s[3] = s[4];                                                             \
        s[4] = s[5];                                                             \
    }
    2318             : 
// Vertical 8-tap pass of the two-stage convolve. Primes s[] with the first
// six intermediate rows, then for each output row pair loads two more
// rows, runs the 8-tap filter on the lo (s[0..3]) and hi (s[4..7]) halves,
// applies both rounding stages, packs to 8-bit and stores (8/4/2-wide
// tails), and slides the s[] window down by one row pair.
// NOTE(review): expands in a scope that must declare i, h, w, j, t_block,
// im_stride, coeffs_v, sum_round_v, sum_shift_v, round_const_v,
// round_shift_v, dst and dst_stride; h is presumably even — confirm at
// call sites.
#define CONVOLVE_SR_VERTICAL_FILTER_8TAP                                      \
  __m256i src_0 = _mm256_loadu_si256((__m256i *)(t_block + 0 * im_stride));   \
  __m256i src_1 = _mm256_loadu_si256((__m256i *)(t_block + 1 * im_stride));   \
  __m256i src_2 = _mm256_loadu_si256((__m256i *)(t_block + 2 * im_stride));   \
  __m256i src_3 = _mm256_loadu_si256((__m256i *)(t_block + 3 * im_stride));   \
  __m256i src_4 = _mm256_loadu_si256((__m256i *)(t_block + 4 * im_stride));   \
  __m256i src_5 = _mm256_loadu_si256((__m256i *)(t_block + 5 * im_stride));   \
                                                                              \
  __m256i s[8];                                                               \
  s[0] = _mm256_unpacklo_epi16(src_0, src_1);                                 \
  s[1] = _mm256_unpacklo_epi16(src_2, src_3);                                 \
  s[2] = _mm256_unpacklo_epi16(src_4, src_5);                                 \
                                                                              \
  s[4] = _mm256_unpackhi_epi16(src_0, src_1);                                 \
  s[5] = _mm256_unpackhi_epi16(src_2, src_3);                                 \
  s[6] = _mm256_unpackhi_epi16(src_4, src_5);                                 \
                                                                              \
  for (i = 0; i < h; i += 2) {                                                \
    const int16_t *data = &t_block[i * im_stride];                            \
                                                                              \
    const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); \
    const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); \
                                                                              \
    s[3] = _mm256_unpacklo_epi16(s6, s7);                                     \
    s[7] = _mm256_unpackhi_epi16(s6, s7);                                     \
                                                                              \
    __m256i res_a = convolve_8tap(s, coeffs_v);                               \
    __m256i res_b = convolve_8tap(s + 4, coeffs_v);                           \
                                                                              \
    res_a =                                                                   \
        _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v);  \
    res_b =                                                                   \
        _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v);  \
                                                                              \
    const __m256i res_a_round = _mm256_sra_epi32(                             \
        _mm256_add_epi32(res_a, round_const_v), round_shift_v);               \
    const __m256i res_b_round = _mm256_sra_epi32(                             \
        _mm256_add_epi32(res_b, round_const_v), round_shift_v);               \
                                                                              \
    const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);   \
    const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit);         \
                                                                              \
    const __m128i res_0 = _mm256_castsi256_si128(res_8b);                     \
    const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);                \
                                                                              \
    __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];                 \
    __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride];    \
    if (w - j > 4) {                                                          \
      _mm_storel_epi64(p_0, res_0);                                           \
      _mm_storel_epi64(p_1, res_1);                                           \
    } else if (w == 4) {                                                      \
      xx_storel_32(p_0, res_0);                                               \
      xx_storel_32(p_1, res_1);                                               \
    } else {                                                                  \
      *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);                            \
      *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);                            \
    }                                                                         \
                                                                              \
    s[0] = s[1];                                                              \
    s[1] = s[2];                                                              \
    s[2] = s[3];                                                              \
                                                                              \
    s[4] = s[5];                                                              \
    s[5] = s[6];                                                              \
    s[6] = s[7];                                                              \
  }
// Horizontal 2-tap pass of the two-stage convolve. Processes two source
// rows per iteration (one per 128-bit lane), filters with convolve_x_2tap,
// applies the horizontal rounding, and stores the 16-bit intermediates to
// the aligned im_block. The epilogue after the loop handles the single
// remaining row (only the low lane carries data; the high-lane garbage is
// stored but presumably never read by the vertical pass — confirm).
// NOTE(review): expands in a scope that must declare i, j, im_h, src_ptr,
// src_stride, coeffs_h, filt, round_const_h, round_shift_h, im_block and
// im_stride.
#define CONVOLVE_SR_HORIZONTAL_FILTER_2TAP                                     \
    for (i = 0; i < (im_h - 2); i += 2) {                                      \
        __m256i data = _mm256_castsi128_si256(                                 \
            _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));       \
                                                                               \
        data = _mm256_inserti128_si256(                                        \
            data,                                                              \
            _mm_loadu_si128(                                                   \
            (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),           \
            1);                                                                \
        __m256i res = convolve_x_2tap(data, coeffs_h, filt);                   \
                                                                               \
        res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h),           \
            round_shift_h);                                                    \
        _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);          \
    }                                                                          \
                                                                               \
    __m256i data_1 = _mm256_castsi128_si256(                                   \
        _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));           \
                                                                               \
    __m256i res = convolve_x_2tap(data_1, coeffs_h, filt);                     \
    res =                                                                      \
        _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \
    _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
// Horizontal 4-tap pass of the two-stage convolve. Same structure as the
// 2-tap variant (two rows per iteration, one per 128-bit lane) but filters
// with convolve_x_4tap using coeffs_h + 1, since the 4-tap coefficients
// occupy the middle of the 8-tap layout. The epilogue handles the single
// remaining row.
// NOTE(review): expands in a scope that must declare i, j, im_h, src_ptr,
// src_stride, coeffs_h, filt, round_const_h, round_shift_h, im_block and
// im_stride.
#define CONVOLVE_SR_HORIZONTAL_FILTER_4TAP                                     \
    for (i = 0; i < (im_h - 2); i += 2) {                                      \
        __m256i data = _mm256_castsi128_si256(                                 \
            _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));       \
                                                                               \
        data = _mm256_inserti128_si256(                                        \
            data,                                                              \
            _mm_loadu_si128(                                                   \
            (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),           \
            1);                                                                \
        __m256i res = convolve_x_4tap(data, coeffs_h + 1, filt);               \
                                                                               \
        res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h),           \
            round_shift_h);                                                    \
        _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);          \
    }                                                                          \
                                                                               \
    __m256i data_1 = _mm256_castsi128_si256(                                   \
        _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));           \
                                                                               \
    __m256i res = convolve_x_4tap(data_1, coeffs_h + 1, filt);                 \
    res =                                                                      \
        _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \
    _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
// Horizontal 8-tap pass of the two-stage convolve. Same structure as the
// 2-/4-tap variants (two rows per iteration, one per 128-bit lane) but
// filters with the full 8-tap convolve_x_8tap_avx2. The epilogue handles
// the single remaining row.
// NOTE(review): expands in a scope that must declare i, j, im_h, src_ptr,
// src_stride, coeffs_h, filt, round_const_h, round_shift_h, im_block and
// im_stride.
#define CONVOLVE_SR_HORIZONTAL_FILTER_8TAP                                     \
  for (i = 0; i < (im_h - 2); i += 2) {                                        \
    __m256i data = _mm256_castsi128_si256(                                     \
        _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));           \
    data = _mm256_inserti128_si256(                                            \
        data,                                                                  \
        _mm_loadu_si128(                                                       \
            (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),           \
        1);                                                                    \
                                                                               \
    __m256i res = convolve_x_8tap_avx2(data, coeffs_h, filt);                  \
    res =                                                                      \
        _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \
    _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);              \
  }                                                                            \
                                                                               \
  __m256i data_1 = _mm256_castsi128_si256(                                     \
      _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));             \
                                                                               \
  __m256i res = convolve_x_8tap_avx2(data_1, coeffs_h, filt);                  \
                                                                               \
  res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \
                                                                               \
  _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
    2457             : 
    2458             : #endif
    2459             : #endif

Generated by: LCOV version 1.14