LCOV - code coverage report
Current view: top level - ASM_AVX2 - convolve_avx2.c (source / functions)
Test: coverage.info
Date: 2019-11-25 17:38:06
                 Hit     Total   Coverage
Lines:          1321      1692     78.1 %
Functions:        40        43     93.0 %

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2017, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include <immintrin.h>
      13             : #include "EbDefinitions.h"
      14             : #include "aom_dsp_rtcd.h"
      15             : #include "convolve.h"
      16             : #include "convolve_avx2.h"
      17             : #include "EbInterPrediction.h"
      18             : #include "EbMemory_AVX2.h"
      19             : #include "synonyms.h"
      20             : 
      21    32156200 : static INLINE __m128i sr_x_round_sse2(const __m128i src) {
      22    32156200 :     const __m128i round = _mm_set1_epi16(34);
      23    32156200 :     const __m128i dst = _mm_add_epi16(src, round);
      24    32156200 :     return _mm_srai_epi16(dst, 6);
      25             : }
      26             : 
      27   543599000 : static INLINE __m256i sr_x_round_avx2(const __m256i src) {
      28   543599000 :     const __m256i round = _mm256_set1_epi16(34);
      29   543599000 :     const __m256i dst = _mm256_add_epi16(src, round);
      30   543599000 :     return _mm256_srai_epi16(dst, 6);
      31             : }
      32             : 
      33    33554200 : static INLINE __m128i sr_y_round_sse2(const __m128i src) {
      34    33554200 :     const __m128i round = _mm_set1_epi16(32);
      35    33554200 :     const __m128i dst = _mm_add_epi16(src, round);
      36    33554200 :     return _mm_srai_epi16(dst, FILTER_BITS - 1);
      37             : }
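
For reference, a plain-C sketch (not part of this file) of what the two round
helpers above compute. It assumes FILTER_BITS == 7, the usual 8-bit
conv_params->round_0 of 3, and filter taps pre-halved by the
prepare_half_coeffs_*() helpers; under those assumptions the horizontal
path's two rounding stages collapse into a single +34 >> 6, and the vertical
path's single ROUND_POWER_OF_TWO(sum, FILTER_BITS) becomes +32 >> 6.

    #include <stdint.h>

    /* Scalar sketch of the rounds, operating on a halved-tap filter sum. */
    static inline int16_t sr_x_round_c(int32_t halved_sum) {
        return (int16_t)((halved_sum + 34) >> 6);
    }
    static inline int16_t sr_y_round_c(int32_t halved_sum) {
        return (int16_t)((halved_sum + 32) >> 6);
    }
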
      38             : 
      39    73473400 : static INLINE void sr_x_round_store_8x2_avx2(const __m256i res,
      40             :     uint8_t *const dst,
      41             :     const int32_t dst_stride) {
      42    73473400 :     const __m256i r = sr_x_round_avx2(res);
      43    73502700 :     pack_store_8x2_avx2(r, dst, dst_stride);
      44    73475800 : }
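
The pack_store_*() and convolve_store_32_avx2() helpers used by the
sr_*_round_store_*() functions live in convolve_avx2.h and are not part of
this listing. Purely as an illustration (an assumed shape, not the actual
implementation), an 8x2 pack-and-store can be pictured like this: the low
128-bit lane of res holds row 0 and the high lane holds row 1 as 16-bit
values, which are saturated to 8 bits and written to two destination rows.

    #include <immintrin.h>
    #include <stdint.h>

    /* Hypothetical 8x2 pack-and-store sketch (illustrative only). */
    static inline void pack_store_8x2_sketch(const __m256i res, uint8_t *dst,
                                             const int32_t dst_stride) {
        const __m256i p  = _mm256_packus_epi16(res, res);  /* 16-bit -> 8-bit, saturating */
        const __m128i d0 = _mm256_castsi256_si128(p);      /* row 0 from the low lane  */
        const __m128i d1 = _mm256_extracti128_si256(p, 1); /* row 1 from the high lane */
        _mm_storel_epi64((__m128i *)dst, d0);
        _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);
    }
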
      45             : 
      46    80388100 : static INLINE void sr_x_round_store_16x2_avx2(const __m256i res[2],
      47             :     uint8_t *const dst,
      48             :     const int32_t dst_stride) {
      49             :     __m256i r[2];
      50             : 
      51    80388100 :     r[0] = sr_x_round_avx2(res[0]);
      52    80407500 :     r[1] = sr_x_round_avx2(res[1]);
      53    80409300 :     pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
      54    80384600 : }
      55             : 
      56   156292000 : static INLINE void sr_x_round_store_32_avx2(const __m256i res[2],
      57             :     uint8_t *const dst) {
      58             :     __m256i r[2];
      59             : 
      60   156292000 :     r[0] = sr_x_round_avx2(res[0]);
      61   156393000 :     r[1] = sr_x_round_avx2(res[1]);
      62   156410000 :     convolve_store_32_avx2(r[0], r[1], dst);
      63   156326000 : }
      64             : 
      65    73636100 : static INLINE void sr_y_round_store_8x2_avx2(const __m256i res,
      66             :     uint8_t *const dst,
      67             :     const int32_t dst_stride) {
      68    73636100 :     const __m256i r = sr_y_round_avx2(res);
      69    73657700 :     pack_store_8x2_avx2(r, dst, dst_stride);
      70    73637800 : }
      71             : 
      72    82167700 : static INLINE void sr_y_round_store_16x2_avx2(const __m256i res[2],
      73             :     uint8_t *const dst,
      74             :     const int32_t dst_stride) {
      75             :     __m256i r[2];
      76             : 
      77    82167700 :     r[0] = sr_y_round_avx2(res[0]);
      78    82198700 :     r[1] = sr_y_round_avx2(res[1]);
      79    82212000 :     pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
      80    82184000 : }
      81             : 
      82   161284000 : static INLINE void sr_y_round_store_32_avx2(const __m256i res[2],
      83             :     uint8_t *const dst) {
      84             :     __m256i r[2];
      85             : 
      86   161284000 :     r[0] = sr_y_round_avx2(res[0]);
      87   161406000 :     r[1] = sr_y_round_avx2(res[1]);
      88   161372000 :     convolve_store_32_avx2(r[0], r[1], dst);
      89   161312000 : }
      90             : 
      91    72922600 : static INLINE void sr_y_round_store_32x2_avx2(const __m256i res[4],
      92             :     uint8_t *const dst,
      93             :     const int32_t dst_stride) {
      94    72922600 :     sr_y_round_store_32_avx2(res, dst);
      95    72960100 :     sr_y_round_store_32_avx2(res + 2, dst + dst_stride);
      96    72953500 : }
      97             : 
      98    15723700 : static INLINE void sr_y_2tap_32_avx2(const uint8_t *const src,
      99             :     const __m256i coeffs[1], const __m256i s0,
     100             :     __m256i *const s1, uint8_t *const dst) {
     101             :     __m256i r[2];
     102    15723700 :     y_convolve_2tap_32_avx2(src, coeffs, s0, s1, r);
     103    15722900 :     sr_y_round_store_32_avx2(r, dst);
     104    15724100 : }
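
sr_y_2tap_32_avx2() takes the previously loaded source row as s0 and hands
the newly loaded row back through *s1, so the 2-tap loops further down can
alternate s_256[0]/s_256[1] and load each row from memory exactly once even
though every row contributes to two output rows. Per pixel the result is
equivalent to the scalar model below, assuming the taps were already halved
so they sum to 64 (as the prepare_half_coeffs_*() helpers do); this is a
sketch, not code from this file.

    #include <stdint.h>

    /* Scalar model of one 2-tap vertical output pixel. */
    static inline uint8_t vert_2tap_pixel_c(uint8_t above, uint8_t below,
                                            int16_t tap0, int16_t tap1) {
        const int32_t sum = tap0 * above + tap1 * below;    /* <= 64 * 255, fits in 16 bits */
        const int32_t r   = (sum + 32) >> 6;                /* same round as sr_y_round_*() */
        return (uint8_t)(r < 0 ? 0 : (r > 255 ? 255 : r));  /* packus-style saturation */
    }
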
     105             : 
     106     4154600 : static INLINE void sr_y_2tap_32_avg_avx2(const uint8_t *const src,
     107             :     const __m256i s0, __m256i *const s1,
     108             :     uint8_t *const dst) {
     109     4154600 :     *s1 = _mm256_loadu_si256((__m256i *)src);
     110     8309190 :     const __m256i d = _mm256_avg_epu8(s0, *s1);
     111             :     _mm256_storeu_si256((__m256i *)dst, d);
     112     4154600 : }
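
The *_avg_* variant is what the subpel_y_q4 == 8 branch below relies on: at
the half-pel position the two 2-tap coefficients are equal, so the filtered
and rounded result reduces to the rounded average (a + b + 1) >> 1, which
_mm256_avg_epu8() computes directly with no multiply or shift. A minimal
check of that identity, assuming FILTER_BITS == 7 and full taps of {64, 64}:

    #include <assert.h>

    /* (64*a + 64*b + 64) >> 7 == (a + b + 1) >> 1 for all 8-bit a, b. */
    static inline void half_pel_identity_check(void) {
        for (int a = 0; a < 256; ++a)
            for (int b = 0; b < 256; ++b)
                assert(((64 * a + 64 * b + 64) >> 7) == ((a + b + 1) >> 1));
    }
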
     113             : 
     114    31790500 : void eb_av1_convolve_y_sr_avx2(const uint8_t *src, int32_t src_stride,
     115             :     uint8_t *dst, int32_t dst_stride, int32_t w,
     116             :     int32_t h, InterpFilterParams *filter_params_x,
     117             :     InterpFilterParams *filter_params_y,
     118             :     const int32_t subpel_x_q4,
     119             :     const int32_t subpel_y_q4,
     120             :     ConvolveParams *conv_params) {
     121             :     int32_t x, y;
     122             :     __m128i coeffs_128[4];
     123             :     __m256i coeffs_256[4];
     124             : 
     125             :     (void)filter_params_x;
     126             :     (void)subpel_x_q4;
     127             :     (void)conv_params;
     128             : 
     129    31790500 :     if (is_convolve_2tap(filter_params_y->filter_ptr)) {
     130             :         // vert_filt as 2 tap
     131     3719800 :         const uint8_t *src_ptr = src;
     132             : 
     133     3719800 :         y = h;
     134             : 
     135     3719800 :         if (subpel_y_q4 != 8) {
     136     2985080 :             if (w <= 8) {
     137     1493620 :                 prepare_half_coeffs_2tap_ssse3(
     138             :                     filter_params_y, subpel_y_q4, coeffs_128);
     139             : 
     140     1493600 :                 if (w == 2) {
     141             :                     __m128i s_16[2];
     142             : 
     143           0 :                     s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
     144             : 
     145             :                     do {
     146           0 :                         const __m128i res = y_convolve_2tap_2x2_ssse3(
     147             :                             src_ptr, src_stride, coeffs_128, s_16);
     148           0 :                         const __m128i r = sr_y_round_sse2(res);
     149           0 :                         pack_store_2x2_sse2(r, dst, dst_stride);
     150           0 :                         src_ptr += 2 * src_stride;
     151           0 :                         dst += 2 * dst_stride;
     152           0 :                         y -= 2;
     153           0 :                     } while (y);
     154             :                 }
     155     1493600 :                 else if (w == 4) {
     156             :                     __m128i s_32[2];
     157             : 
     158      585890 :                     s_32[0] = _mm_cvtsi32_si128(*(int32_t *)src_ptr);
     159             : 
     160             :                     do {
     161     1559290 :                         const __m128i res = y_convolve_2tap_4x2_ssse3(
     162             :                             src_ptr, src_stride, coeffs_128, s_32);
     163     1559290 :                         const __m128i r = sr_y_round_sse2(res);
     164     1559290 :                         pack_store_4x2_sse2(r, dst, dst_stride);
     165     1559290 :                         src_ptr += 2 * src_stride;
     166     1559290 :                         dst += 2 * dst_stride;
     167     1559290 :                         y -= 2;
     168     1559290 :                     } while (y);
     169             :                 }
     170             :                 else {
     171             :                     __m128i s_64[2], s_128[2];
     172             : 
     173     1200650 :                     assert(w == 8);
     174             : 
     175     1200650 :                     s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
     176             : 
     177             :                     do {
     178             :                         // Note: Faster than binding to AVX2 registers.
     179     7841140 :                         s_64[1] =
     180     7841140 :                             _mm_loadl_epi64((__m128i *)(src_ptr + src_stride));
     181     7841140 :                         s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
     182     7841140 :                         s_64[0] = _mm_loadl_epi64(
     183     7841140 :                             (__m128i *)(src_ptr + 2 * src_stride));
     184     7841140 :                         s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
     185     7841140 :                         const __m128i ss0 =
     186     7841140 :                             _mm_unpacklo_epi8(s_128[0], s_128[1]);
     187     7841140 :                         const __m128i ss1 =
     188     7841140 :                             _mm_unpackhi_epi8(s_128[0], s_128[1]);
     189             :                         const __m128i res0 =
     190     7841140 :                             convolve_2tap_ssse3(&ss0, coeffs_128);
     191             :                         const __m128i res1 =
     192     7841370 :                             convolve_2tap_ssse3(&ss1, coeffs_128);
     193     7841250 :                         const __m128i r0 = sr_y_round_sse2(res0);
     194     7841200 :                         const __m128i r1 = sr_y_round_sse2(res1);
     195     7841200 :                         const __m128i d = _mm_packus_epi16(r0, r1);
     196     7841200 :                         _mm_storel_epi64((__m128i *)dst, d);
     197     7841200 :                         _mm_storeh_epi64((__m128i *)(dst + dst_stride), d);
     198     7841230 :                         src_ptr += 2 * src_stride;
     199     7841230 :                         dst += 2 * dst_stride;
     200     7841230 :                         y -= 2;
     201     7841230 :                     } while (y);
     202             :                 }
     203             :             }
     204             :             else {
     205     1491470 :                 prepare_half_coeffs_2tap_avx2(
     206             :                     filter_params_y, subpel_y_q4, coeffs_256);
     207             : 
     208     1491760 :                 if (w == 16) {
     209             :                     __m128i s_128[2];
     210             : 
     211      973079 :                     s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
     212             : 
     213             :                     do {
     214             :                         __m256i r[2];
     215             : 
     216     8033530 :                         y_convolve_2tap_16x2_avx2(
     217             :                             src_ptr, src_stride, coeffs_256, s_128, r);
     218     8033290 :                         sr_y_round_store_16x2_avx2(r, dst, dst_stride);
     219     8033540 :                         src_ptr += 2 * src_stride;
     220     8033540 :                         dst += 2 * dst_stride;
     221     8033540 :                         y -= 2;
     222     8033540 :                     } while (y);
     223             :                 }
     224      518685 :                 else if (w == 32) {
     225             :                     __m256i s_256[2];
     226             : 
     227      419410 :                     s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
     228             : 
     229             :                     do {
     230     4699100 :                         sr_y_2tap_32_avx2(src_ptr + src_stride,
     231             :                             coeffs_256,
     232             :                             s_256[0],
     233             :                             &s_256[1],
     234             :                             dst);
     235     4699070 :                         sr_y_2tap_32_avx2(src_ptr + 2 * src_stride,
     236             :                             coeffs_256,
     237             :                             s_256[1],
     238             :                             &s_256[0],
     239             :                             dst + dst_stride);
     240     4699100 :                         src_ptr += 2 * src_stride;
     241     4699100 :                         dst += 2 * dst_stride;
     242     4699100 :                         y -= 2;
     243     4699100 :                     } while (y);
     244             :                 }
     245       99275 :                 else if (w == 64) {
     246             :                     __m256i s_256[2][2];
     247             : 
     248       99320 :                     s_256[0][0] =
     249       99320 :                         _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
     250       99320 :                     s_256[0][1] =
     251      198640 :                         _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
     252             : 
     253             :                     do {
     254     1582420 :                         sr_y_2tap_32_avx2(src_ptr + src_stride,
     255             :                             coeffs_256,
     256             :                             s_256[0][0],
     257             :                             &s_256[1][0],
     258             :                             dst);
     259     1582410 :                         sr_y_2tap_32_avx2(src_ptr + src_stride + 32,
     260             :                             coeffs_256,
     261             :                             s_256[0][1],
     262             :                             &s_256[1][1],
     263             :                             dst + 32);
     264     1582420 :                         sr_y_2tap_32_avx2(src_ptr + 2 * src_stride,
     265             :                             coeffs_256,
     266             :                             s_256[1][0],
     267             :                             &s_256[0][0],
     268             :                             dst + dst_stride);
     269     1582420 :                         sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 32,
     270             :                             coeffs_256,
     271             :                             s_256[1][1],
     272             :                             &s_256[0][1],
     273     1582420 :                             dst + dst_stride + 32);
     274             : 
     275     1582420 :                         src_ptr += 2 * src_stride;
     276     1582420 :                         dst += 2 * dst_stride;
     277     1582420 :                         y -= 2;
     278     1582420 :                     } while (y);
     279             :                 }
     280             :                 else {
     281             :                     __m256i s_256[2][4];
     282             : 
     283           0 :                     assert(w == 128);
     284             : 
     285           0 :                     s_256[0][0] =
     286           0 :                         _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
     287           0 :                     s_256[0][1] =
     288           0 :                         _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
     289           0 :                     s_256[0][2] =
     290           0 :                         _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
     291           0 :                     s_256[0][3] =
     292           0 :                         _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
     293             : 
     294             :                     do {
     295           0 :                         sr_y_2tap_32_avx2(src_ptr + src_stride,
     296             :                             coeffs_256,
     297             :                             s_256[0][0],
     298             :                             &s_256[1][0],
     299             :                             dst);
     300           0 :                         sr_y_2tap_32_avx2(src_ptr + src_stride + 1 * 32,
     301             :                             coeffs_256,
     302             :                             s_256[0][1],
     303             :                             &s_256[1][1],
     304             :                             dst + 1 * 32);
     305           0 :                         sr_y_2tap_32_avx2(src_ptr + src_stride + 2 * 32,
     306             :                             coeffs_256,
     307             :                             s_256[0][2],
     308             :                             &s_256[1][2],
     309             :                             dst + 2 * 32);
     310           0 :                         sr_y_2tap_32_avx2(src_ptr + src_stride + 3 * 32,
     311             :                             coeffs_256,
     312             :                             s_256[0][3],
     313             :                             &s_256[1][3],
     314             :                             dst + 3 * 32);
     315             : 
     316           0 :                         sr_y_2tap_32_avx2(src_ptr + 2 * src_stride,
     317             :                             coeffs_256,
     318             :                             s_256[1][0],
     319             :                             &s_256[0][0],
     320             :                             dst + dst_stride);
     321           0 :                         sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 1 * 32,
     322             :                             coeffs_256,
     323             :                             s_256[1][1],
     324             :                             &s_256[0][1],
     325           0 :                             dst + dst_stride + 1 * 32);
     326           0 :                         sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 2 * 32,
     327             :                             coeffs_256,
     328             :                             s_256[1][2],
     329             :                             &s_256[0][2],
     330           0 :                             dst + dst_stride + 2 * 32);
     331           0 :                         sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 3 * 32,
     332             :                             coeffs_256,
     333             :                             s_256[1][3],
     334             :                             &s_256[0][3],
     335           0 :                             dst + dst_stride + 3 * 32);
     336             : 
     337           0 :                         src_ptr += 2 * src_stride;
     338           0 :                         dst += 2 * dst_stride;
     339           0 :                         y -= 2;
     340           0 :                     } while (y);
     341             :                 }
     342             :             }
     343             :         }
     344             :         else {
     345             :             // average to get half pel
     346      734720 :             if (w <= 8) {
     347      371146 :                 if (w == 2) {
     348             :                     __m128i s_16[2];
     349             : 
     350           0 :                     s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
     351             : 
     352             :                     do {
     353           0 :                         s_16[1] = _mm_cvtsi32_si128(
     354           0 :                             *(int16_t *)(src_ptr + src_stride));
     355           0 :                         const __m128i d0 = _mm_avg_epu8(s_16[0], s_16[1]);
     356           0 :                         *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d0);
     357           0 :                         s_16[0] = _mm_cvtsi32_si128(
     358           0 :                             *(int16_t *)(src_ptr + 2 * src_stride));
     359           0 :                         const __m128i d1 = _mm_avg_epu8(s_16[1], s_16[0]);
     360           0 :                         *(int16_t *)(dst + dst_stride) =
     361           0 :                             (int16_t)_mm_cvtsi128_si32(d1);
     362           0 :                         src_ptr += 2 * src_stride;
     363           0 :                         dst += 2 * dst_stride;
     364           0 :                         y -= 2;
     365           0 :                     } while (y);
     366             :                 }
     367      371146 :                 else if (w == 4) {
     368             :                     __m128i s_32[2];
     369             : 
     370      199702 :                     s_32[0] = _mm_cvtsi32_si128(*(int32_t *)src_ptr);
     371             : 
     372             :                     do {
     373      941886 :                         s_32[1] = _mm_cvtsi32_si128(
     374      470943 :                             *(int32_t *)(src_ptr + src_stride));
     375      470943 :                         const __m128i d0 = _mm_avg_epu8(s_32[0], s_32[1]);
     376      470943 :                         xx_storel_32(dst, d0);
     377      941888 :                         s_32[0] = _mm_cvtsi32_si128(
     378      470944 :                             *(int32_t *)(src_ptr + 2 * src_stride));
     379      470944 :                         const __m128i d1 = _mm_avg_epu8(s_32[1], s_32[0]);
     380      470944 :                         xx_storel_32(dst + dst_stride, d1);
     381      470944 :                         src_ptr += 2 * src_stride;
     382      470944 :                         dst += 2 * dst_stride;
     383      470944 :                         y -= 2;
     384      470944 :                     } while (y);
     385             :                 }
     386             :                 else {
     387             :                     __m128i s_64[2];
     388             : 
     389      271295 :                     assert(w == 8);
     390             : 
     391      271295 :                     s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
     392             : 
     393             :                     do {
     394             :                         // Note: Faster than binding to AVX2 registers.
     395     1706080 :                         s_64[1] =
     396     1706080 :                             _mm_loadl_epi64((__m128i *)(src_ptr + src_stride));
     397     3412150 :                         const __m128i d0 = _mm_avg_epu8(s_64[0], s_64[1]);
     398     1706080 :                         _mm_storel_epi64((__m128i *)dst, d0);
     399     1706080 :                         s_64[0] = _mm_loadl_epi64(
     400     1706080 :                             (__m128i *)(src_ptr + 2 * src_stride));
     401     1706080 :                         const __m128i d1 = _mm_avg_epu8(s_64[1], s_64[0]);
     402     1706080 :                         _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);
     403     1706080 :                         src_ptr += 2 * src_stride;
     404     1706080 :                         dst += 2 * dst_stride;
     405     1706080 :                         y -= 2;
     406     1706080 :                     } while (y);
     407             :                 }
     408             :             }
     409      363574 :             else if (w == 16) {
     410             :                 __m128i s_128[2];
     411             : 
     412      233605 :                 s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
     413             : 
     414             :                 do {
     415     1965600 :                     s_128[1] =
     416     1965600 :                         _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
     417     3931200 :                     const __m128i d0 = _mm_avg_epu8(s_128[0], s_128[1]);
     418             :                     _mm_storeu_si128((__m128i *)dst, d0);
     419     1965600 :                     s_128[0] =
     420     1965600 :                         _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
     421     1965600 :                     const __m128i d1 = _mm_avg_epu8(s_128[1], s_128[0]);
     422     1965600 :                     _mm_storeu_si128((__m128i *)(dst + dst_stride), d1);
     423     1965600 :                     src_ptr += 2 * src_stride;
     424     1965600 :                     dst += 2 * dst_stride;
     425     1965600 :                     y -= 2;
     426     1965600 :                 } while (y);
     427             :             }
     428      129969 :             else if (w == 32) {
     429             :                 __m256i s_256[2];
     430             : 
     431      102178 :                 s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
     432             : 
     433             :                 do {
     434     1180800 :                     sr_y_2tap_32_avg_avx2(
     435             :                         src_ptr + src_stride, s_256[0], &s_256[1], dst);
     436     1180800 :                     sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride,
     437             :                         s_256[1],
     438             :                         &s_256[0],
     439             :                         dst + dst_stride);
     440     1180800 :                     src_ptr += 2 * src_stride;
     441     1180800 :                     dst += 2 * dst_stride;
     442     1180800 :                     y -= 2;
     443     1180800 :                 } while (y);
     444             :             }
     445       27791 :             else if (w == 64) {
     446             :                 __m256i s_256[2][2];
     447             : 
     448       27806 :                 s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
     449       55612 :                 s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
     450             : 
     451             :                 do {
     452      448264 :                     sr_y_2tap_32_avg_avx2(
     453             :                         src_ptr + src_stride, s_256[0][0], &s_256[1][0], dst);
     454      448264 :                     sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 32,
     455             :                         s_256[0][1],
     456             :                         &s_256[1][1],
     457             :                         dst + 32);
     458             : 
     459      448264 :                     sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride,
     460             :                         s_256[1][0],
     461             :                         &s_256[0][0],
     462             :                         dst + dst_stride);
     463      448264 :                     sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 32,
     464             :                         s_256[1][1],
     465             :                         &s_256[0][1],
     466      448264 :                         dst + dst_stride + 32);
     467             : 
     468      448264 :                     src_ptr += 2 * src_stride;
     469      448264 :                     dst += 2 * dst_stride;
     470      448264 :                     y -= 2;
     471      448264 :                 } while (y);
     472             :             }
     473             :             else {
     474             :                 __m256i s_256[2][4];
     475             : 
     476           0 :                 assert(w == 128);
     477             : 
     478           0 :                 s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
     479           0 :                 s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
     480           0 :                 s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
     481           0 :                 s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
     482             : 
     483             :                 do {
     484           0 :                     sr_y_2tap_32_avg_avx2(
     485             :                         src_ptr + src_stride, s_256[0][0], &s_256[1][0], dst);
     486           0 :                     sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 1 * 32,
     487             :                         s_256[0][1],
     488             :                         &s_256[1][1],
     489             :                         dst + 1 * 32);
     490           0 :                     sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 2 * 32,
     491             :                         s_256[0][2],
     492             :                         &s_256[1][2],
     493             :                         dst + 2 * 32);
     494           0 :                     sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 3 * 32,
     495             :                         s_256[0][3],
     496             :                         &s_256[1][3],
     497             :                         dst + 3 * 32);
     498             : 
     499           0 :                     sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride,
     500             :                         s_256[1][0],
     501             :                         &s_256[0][0],
     502             :                         dst + dst_stride);
     503           0 :                     sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 1 * 32,
     504             :                         s_256[1][1],
     505             :                         &s_256[0][1],
     506           0 :                         dst + dst_stride + 1 * 32);
     507           0 :                     sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 2 * 32,
     508             :                         s_256[1][2],
     509             :                         &s_256[0][2],
     510           0 :                         dst + dst_stride + 2 * 32);
     511           0 :                     sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 3 * 32,
     512             :                         s_256[1][3],
     513             :                         &s_256[0][3],
     514           0 :                         dst + dst_stride + 3 * 32);
     515             : 
     516           0 :                     src_ptr += 2 * src_stride;
     517           0 :                     dst += 2 * dst_stride;
     518           0 :                     y -= 2;
     519           0 :                 } while (y);
     520             :             }
     521             :         }
     522             :     }
     523    28068100 :     else if (is_convolve_4tap(filter_params_y->filter_ptr)) {
     524             :         // vert_filt as 4 tap
     525     2892740 :         const uint8_t *src_ptr = src - src_stride;
     526             : 
     527     2892740 :         y = h;
     528             : 
     529     2892740 :         if (w <= 4) {
     530      622441 :             prepare_half_coeffs_4tap_ssse3(
     531             :                 filter_params_y, subpel_y_q4, coeffs_128);
     532             : 
     533      622444 :             if (w == 2) {
     534             :                 __m128i s_16[4], ss_128[2];
     535             : 
     536       14306 :                 s_16[0] =
     537       14306 :                     _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 0 * src_stride));
     538       14306 :                 s_16[1] =
     539       14306 :                     _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 1 * src_stride));
     540       14306 :                 s_16[2] =
     541       14306 :                     _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
     542             : 
     543       14306 :                 const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
     544       28612 :                 const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
     545             : 
     546       14306 :                 ss_128[0] = _mm_unpacklo_epi8(src01, src12);
     547             : 
     548             :                 do {
     549       26776 :                     src_ptr += 2 * src_stride;
     550       26776 :                     const __m128i res = y_convolve_4tap_2x2_ssse3(
     551             :                         src_ptr, src_stride, coeffs_128, s_16, ss_128);
     552       26776 :                     const __m128i r = sr_y_round_sse2(res);
     553       26776 :                     pack_store_2x2_sse2(r, dst, dst_stride);
     554             : 
     555       26776 :                     ss_128[0] = ss_128[1];
     556       26776 :                     dst += 2 * dst_stride;
     557       26776 :                     y -= 2;
     558       26776 :                 } while (y);
     559             :             }
     560             :             else {
     561             :                 __m128i s_32[4], ss_128[2];
     562             : 
     563      608138 :                 assert(w == 4);
     564             : 
     565      608138 :                 s_32[0] =
     566      608138 :                     _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 0 * src_stride));
     567      608138 :                 s_32[1] =
     568      608138 :                     _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 1 * src_stride));
     569      608138 :                 s_32[2] =
     570      608138 :                     _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 2 * src_stride));
     571             : 
     572      608138 :                 const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
     573     1216280 :                 const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
     574             : 
     575      608138 :                 ss_128[0] = _mm_unpacklo_epi8(src01, src12);
     576             : 
     577             :                 do {
     578     1203740 :                     src_ptr += 2 * src_stride;
     579     1203740 :                     const __m128i res = y_convolve_4tap_4x2_ssse3(
     580             :                         src_ptr, src_stride, coeffs_128, s_32, ss_128);
     581     1203730 :                     const __m128i r = sr_y_round_sse2(res);
     582     1203730 :                     pack_store_4x2_sse2(r, dst, dst_stride);
     583             : 
     584     1203720 :                     ss_128[0] = ss_128[1];
     585     1203720 :                     dst += 2 * dst_stride;
     586     1203720 :                     y -= 2;
     587     1203720 :                 } while (y);
     588             :             }
     589             :         }
     590             :         else {
     591     2270300 :             prepare_half_coeffs_4tap_avx2(
     592             :                 filter_params_y, subpel_y_q4, coeffs_256);
     593             : 
     594     2270510 :             if (w == 8) {
     595             :                 __m128i s_64[4];
     596             :                 __m256i ss_256[2];
     597             : 
     598      925045 :                 s_64[0] =
     599      925045 :                     _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
     600      925045 :                 s_64[1] =
     601      925045 :                     _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
     602      925045 :                 s_64[2] =
     603      925045 :                     _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
     604             : 
     605             :                 // Load lines a and b. Line a to lower 128, line b to upper 128
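                      :                 // (With this pairing, the low 128-bit lane of each unpacked
                      :                 //  register feeds output row 0 and the high lane feeds output
                      :                 //  row 1, so each convolve call below yields two rows at once.)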
     606      925045 :                 const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
     607     1850090 :                 const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
     608             : 
     609      925045 :                 ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
     610             : 
     611             :                 do {
     612     1833150 :                     src_ptr += 2 * src_stride;
     613     1833150 :                     const __m256i res = y_convolve_4tap_8x2_avx2(
     614             :                         src_ptr, src_stride, coeffs_256, s_64, ss_256);
     615     1833130 :                     sr_y_round_store_8x2_avx2(res, dst, dst_stride);
     616             : 
     617     1833120 :                     ss_256[0] = ss_256[1];
     618     1833120 :                     dst += 2 * dst_stride;
     619     1833120 :                     y -= 2;
     620     1833120 :                 } while (y);
     621             :             }
     622     1345460 :             else if (w == 16) {
     623             :                 __m128i s_128[4];
     624             :                 __m256i ss_256[4], r[2];
     625             : 
     626     1339940 :                 assert(w == 16);
     627             : 
     628     1339940 :                 s_128[0] =
     629     1339940 :                     _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
     630     1339940 :                 s_128[1] =
     631     1339940 :                     _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
     632     1339940 :                 s_128[2] =
     633     1339940 :                     _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
     634             : 
     635             :                 // Load lines a and b. Line a to lower 128, line b to upper 128
     636     1339940 :                 const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
     637     2679870 :                 const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
     638             : 
     639     1339940 :                 ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
     640     1339940 :                 ss_256[2] = _mm256_unpackhi_epi8(src01, src12);
     641             : 
     642             :                 do {
     643     2679750 :                     src_ptr += 2 * src_stride;
     644     2679750 :                     y_convolve_4tap_16x2_avx2(
     645             :                         src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
     646     2679670 :                     sr_y_round_store_16x2_avx2(r, dst, dst_stride);
     647             : 
     648     2679720 :                     ss_256[0] = ss_256[1];
     649     2679720 :                     ss_256[2] = ss_256[3];
     650     2679720 :                     dst += 2 * dst_stride;
     651     2679720 :                     y -= 2;
     652     2679720 :                 } while (y);
     653             :             }
     654             :             else {
      655             :                 // The AV1 standard itself never produces a 32x4 case.
      656             :                 // This path only serves an optimization feature that
      657             :                 // subsamples 32x8 to 32x4 and thus triggers the 4-tap filter.
     658             : 
     659             :                 __m256i s_256[4], ss_256[4], tt_256[4], r[4];
     660             : 
     661        5528 :                 assert(w == 32);
     662             : 
     663        5528 :                 s_256[0] =
     664        5528 :                     _mm256_loadu_si256((__m256i *)(src_ptr + 0 * src_stride));
     665        5528 :                 s_256[1] =
     666        5528 :                     _mm256_loadu_si256((__m256i *)(src_ptr + 1 * src_stride));
     667        5528 :                 s_256[2] =
     668        5528 :                     _mm256_loadu_si256((__m256i *)(src_ptr + 2 * src_stride));
     669             : 
     670        5528 :                 ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
     671        5528 :                 ss_256[2] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
     672             : 
     673        5528 :                 tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
     674       11056 :                 tt_256[2] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
     675             : 
     676             :                 do {
     677       11071 :                     src_ptr += 2 * src_stride;
     678       11071 :                     y_convolve_4tap_32x2_avx2(src_ptr,
     679             :                         src_stride,
     680             :                         coeffs_256,
     681             :                         s_256,
     682             :                         ss_256,
     683             :                         tt_256,
     684             :                         r);
     685       11086 :                     sr_y_round_store_32x2_avx2(r, dst, dst_stride);
     686             : 
     687       11086 :                     ss_256[0] = ss_256[1];
     688       11086 :                     ss_256[2] = ss_256[3];
     689             : 
     690       11086 :                     tt_256[0] = tt_256[1];
     691       11086 :                     tt_256[2] = tt_256[3];
     692       11086 :                     dst += 2 * dst_stride;
     693       11086 :                     y -= 2;
     694       11086 :                 } while (y);
     695             :             }
     696             :         }
     697             :     }
     698    25189000 :     else if (is_convolve_6tap(filter_params_y->filter_ptr)) {
     699             :         // vert_filt as 6 tap
     700    23346000 :         const uint8_t *src_ptr = src - 2 * src_stride;
     701             : 
     702    23346000 :         if (w <= 4) {
     703     2372640 :             prepare_half_coeffs_6tap_ssse3(
     704             :                 filter_params_y, subpel_y_q4, coeffs_128);
     705             : 
     706     2372800 :             y = h;
     707             : 
     708     2372800 :             if (w == 2) {
     709             :                 __m128i s_16[6], ss_128[3];
     710             : 
     711       16428 :                 s_16[0] =
     712       16428 :                     _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 0 * src_stride));
     713       16428 :                 s_16[1] =
     714       16428 :                     _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 1 * src_stride));
     715       16428 :                 s_16[2] =
     716       16428 :                     _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
     717       16428 :                 s_16[3] =
     718       16428 :                     _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 3 * src_stride));
     719       16428 :                 s_16[4] =
     720       16428 :                     _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 4 * src_stride));
     721             : 
     722       16428 :                 const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
     723       16428 :                 const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
     724       16428 :                 const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
     725       32856 :                 const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
     726             : 
     727       16428 :                 ss_128[0] = _mm_unpacklo_epi8(src01, src12);
     728       16428 :                 ss_128[1] = _mm_unpacklo_epi8(src23, src34);
     729             : 
     730             :                 do {
     731       65728 :                     src_ptr += 2 * src_stride;
     732       65728 :                     const __m128i res = y_convolve_6tap_2x2_ssse3(
     733             :                         src_ptr, src_stride, coeffs_128, s_16, ss_128);
     734       65728 :                     const __m128i r = sr_y_round_sse2(res);
     735       65728 :                     pack_store_2x2_sse2(r, dst, dst_stride);
     736             : 
     737       65728 :                     ss_128[0] = ss_128[1];
     738       65728 :                     ss_128[1] = ss_128[2];
     739       65728 :                     dst += 2 * dst_stride;
     740       65728 :                     y -= 2;
     741       65728 :                 } while (y);
     742             :             }
     743             :             else {
     744             :                 __m128i s_32[6], ss_128[3];
     745             : 
     746     2356370 :                 assert(w == 4);
     747             : 
     748     2356370 :                 s_32[0] =
     749     2356370 :                     _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 0 * src_stride));
     750     2356370 :                 s_32[1] =
     751     2356370 :                     _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 1 * src_stride));
     752     2356370 :                 s_32[2] =
     753     2356370 :                     _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 2 * src_stride));
     754     2356370 :                 s_32[3] =
     755     2356370 :                     _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 3 * src_stride));
     756     2356370 :                 s_32[4] =
     757     2356370 :                     _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 4 * src_stride));
     758             : 
     759     2356370 :                 const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
     760     2356370 :                 const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
     761     2356370 :                 const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
     762     4712740 :                 const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
     763             : 
     764     2356370 :                 ss_128[0] = _mm_unpacklo_epi8(src01, src12);
     765     2356370 :                 ss_128[1] = _mm_unpacklo_epi8(src23, src34);
     766             : 
     767             :                 do {
     768    15005000 :                     src_ptr += 2 * src_stride;
     769    15005000 :                     const __m128i res = y_convolve_6tap_4x2_ssse3(
     770             :                         src_ptr, src_stride, coeffs_128, s_32, ss_128);
     771    15005600 :                     const __m128i r = sr_y_round_sse2(res);
     772    15005100 :                     pack_store_4x2_sse2(r, dst, dst_stride);
     773             : 
     774    15004900 :                     ss_128[0] = ss_128[1];
     775    15004900 :                     ss_128[1] = ss_128[2];
     776    15004900 :                     dst += 2 * dst_stride;
     777    15004900 :                     y -= 2;
     778    15004900 :                 } while (y);
     779             :             }
     780             :         }
     781             :         else {
     782    20973300 :             prepare_half_coeffs_6tap_avx2(
     783             :                 filter_params_y, subpel_y_q4, coeffs_256);
     784             : 
     785    20984300 :             if (w == 8) {
     786             :                 __m128i s_64[6];
     787             :                 __m256i ss_256[3];
     788             : 
     789     9396180 :                 s_64[0] =
     790     9396180 :                     _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
     791     9396180 :                 s_64[1] =
     792     9396180 :                     _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
     793     9396180 :                 s_64[2] =
     794     9396180 :                     _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
     795     9396180 :                 s_64[3] =
     796     9396180 :                     _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
     797     9396180 :                 s_64[4] =
     798     9396180 :                     _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
     799             : 
     800             :                 // Load lines a and b. Line a to lower 128, line b to upper 128
     801     9396180 :                 const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
     802     9396180 :                 const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
     803     9396180 :                 const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
     804    18792400 :                 const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
     805             : 
     806     9396180 :                 ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
     807     9396180 :                 ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
     808             : 
     809     9396180 :                 y = h;
     810             :                 do {
     811    65996800 :                     src_ptr += 2 * src_stride;
     812    65996800 :                     const __m256i res = y_convolve_6tap_8x2_avx2(
     813             :                         src_ptr, src_stride, coeffs_256, s_64, ss_256);
     814    65993700 :                     sr_y_round_store_8x2_avx2(res, dst, dst_stride);
     815             : 
     816    65995500 :                     ss_256[0] = ss_256[1];
     817    65995500 :                     ss_256[1] = ss_256[2];
     818    65995500 :                     dst += 2 * dst_stride;
     819    65995500 :                     y -= 2;
     820    65995500 :                 } while (y);
     821             :             }
     822    11588100 :             else if (w == 16) {
     823             :                 __m128i s_128[6];
     824             :                 __m256i ss_256[6], r[2];
     825             : 
     826     7053300 :                 s_128[0] =
     827     7053300 :                     _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
     828     7053300 :                 s_128[1] =
     829     7053300 :                     _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
     830     7053300 :                 s_128[2] =
     831     7053300 :                     _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
     832     7053300 :                 s_128[3] =
     833     7053300 :                     _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
     834     7053300 :                 s_128[4] =
     835     7053300 :                     _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
     836             : 
     837             :                 // Load lines a and b. Line a to lower 128, line b to upper 128
     838     7053300 :                 const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
     839     7053300 :                 const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
     840     7053300 :                 const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
     841    14106600 :                 const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
     842             : 
     843     7053300 :                 ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
     844     7053300 :                 ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
     845             : 
     846     7053300 :                 ss_256[3] = _mm256_unpackhi_epi8(src01, src12);
     847     7053300 :                 ss_256[4] = _mm256_unpackhi_epi8(src23, src34);
     848             : 
     849     7053300 :                 y = h;
     850             :                 do {
     851    65767500 :                     src_ptr += 2 * src_stride;
     852    65767500 :                     y_convolve_6tap_16x2_avx2(
     853             :                         src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
     854    65760400 :                     sr_y_round_store_16x2_avx2(r, dst, dst_stride);
     855             : 
     856    65766900 :                     ss_256[0] = ss_256[1];
     857    65766900 :                     ss_256[1] = ss_256[2];
     858             : 
     859    65766900 :                     ss_256[3] = ss_256[4];
     860    65766900 :                     ss_256[4] = ss_256[5];
     861    65766900 :                     dst += 2 * dst_stride;
     862    65766900 :                     y -= 2;
     863    65766900 :                 } while (y);
     864             :             }
     865             :             else {
     866             :                 __m256i s_256[6], ss_256[6], tt_256[6], r[4];
     867             : 
     868     4534810 :                 assert(!(w % 32));
     869             : 
     870     4534810 :                 x = 0;
     871             :                 do {
     872     5456800 :                     const uint8_t *s = src_ptr + x;
     873     5456800 :                     uint8_t *d = dst + x;
     874             : 
     875     5456800 :                     s_256[0] =
     876     5456800 :                         _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
     877     5456800 :                     s_256[1] =
     878     5456800 :                         _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
     879     5456800 :                     s_256[2] =
     880     5456800 :                         _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
     881     5456800 :                     s_256[3] =
     882     5456800 :                         _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
     883     5456800 :                     s_256[4] =
     884     5456800 :                         _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
     885             : 
     886     5456800 :                     ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
     887    10913600 :                     ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
     888     5456800 :                     ss_256[3] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
     889     5456800 :                     ss_256[4] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
     890             : 
     891     5456800 :                     tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
     892     5456800 :                     tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
     893     5456800 :                     tt_256[3] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
     894     5456800 :                     tt_256[4] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
     895             : 
     896     5456800 :                     y = h;
     897             :                     do {
     898    67575000 :                         s += 2 * src_stride;
     899    67575000 :                         y_convolve_6tap_32x2_avx2(s,
     900             :                             src_stride,
     901             :                             coeffs_256,
     902             :                             s_256,
     903             :                             ss_256,
     904             :                             tt_256,
     905             :                             r);
     906    67553900 :                         sr_y_round_store_32x2_avx2(r, d, dst_stride);
     907             : 
     908    67579600 :                         ss_256[0] = ss_256[1];
     909    67579600 :                         ss_256[1] = ss_256[2];
     910    67579600 :                         ss_256[3] = ss_256[4];
     911    67579600 :                         ss_256[4] = ss_256[5];
     912             : 
     913    67579600 :                         tt_256[0] = tt_256[1];
     914    67579600 :                         tt_256[1] = tt_256[2];
     915    67579600 :                         tt_256[3] = tt_256[4];
     916    67579600 :                         tt_256[4] = tt_256[5];
     917    67579600 :                         d += 2 * dst_stride;
     918    67579600 :                         y -= 2;
     919    67579600 :                     } while (y);
     920             : 
     921     5461340 :                     x += 32;
     922     5461340 :                 } while (x < w);
     923             :             }
     924             :         }
     925             :     }
     926             :     else {
     927             :         // vert_filt as 8 tap
     928     1848400 :         const uint8_t *src_ptr = src - 3 * src_stride;
     929             : 
     930     1848400 :         if (w <= 4) {
     931        3958 :             prepare_half_coeffs_8tap_ssse3(
     932             :                 filter_params_y, subpel_y_q4, coeffs_128);
     933             : 
     934        3958 :             y = h;
     935             : 
     936        3958 :             if (w == 2) {
     937             :                 __m128i s_16[8], ss_128[4];
     938             : 
     939           0 :                 s_16[0] =
     940           0 :                     _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 0 * src_stride));
     941           0 :                 s_16[1] =
     942           0 :                     _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 1 * src_stride));
     943           0 :                 s_16[2] =
     944           0 :                     _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
     945           0 :                 s_16[3] =
     946           0 :                     _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 3 * src_stride));
     947           0 :                 s_16[4] =
     948           0 :                     _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 4 * src_stride));
     949           0 :                 s_16[5] =
     950           0 :                     _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 5 * src_stride));
     951           0 :                 s_16[6] =
     952           0 :                     _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 6 * src_stride));
     953             : 
     954           0 :                 const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
     955           0 :                 const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
     956           0 :                 const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
     957           0 :                 const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
     958           0 :                 const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
     959           0 :                 const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[6]);
     960             : 
     961           0 :                 ss_128[0] = _mm_unpacklo_epi8(src01, src12);
     962           0 :                 ss_128[1] = _mm_unpacklo_epi8(src23, src34);
     963           0 :                 ss_128[2] = _mm_unpacklo_epi8(src45, src56);
     964             : 
     965             :                 do {
     966           0 :                     const __m128i res = y_convolve_8tap_2x2_ssse3(
     967             :                         src_ptr, src_stride, coeffs_128, s_16, ss_128);
     968           0 :                     const __m128i r = sr_y_round_sse2(res);
     969           0 :                     pack_store_2x2_sse2(r, dst, dst_stride);
     970           0 :                     ss_128[0] = ss_128[1];
     971           0 :                     ss_128[1] = ss_128[2];
     972           0 :                     ss_128[2] = ss_128[3];
     973           0 :                     src_ptr += 2 * src_stride;
     974           0 :                     dst += 2 * dst_stride;
     975           0 :                     y -= 2;
     976           0 :                 } while (y);
     977             :             }
     978             :             else {
     979             :                 __m128i s_32[8], ss_128[4];
     980             : 
     981        3958 :                 assert(w == 4);
     982             : 
     983        3958 :                 s_32[0] =
     984        3958 :                     _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 0 * src_stride));
     985        3958 :                 s_32[1] =
     986        3958 :                     _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 1 * src_stride));
     987        3958 :                 s_32[2] =
     988        3958 :                     _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 2 * src_stride));
     989        3958 :                 s_32[3] =
     990        3958 :                     _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 3 * src_stride));
     991        3958 :                 s_32[4] =
     992        3958 :                     _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 4 * src_stride));
     993        3958 :                 s_32[5] =
     994        3958 :                     _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 5 * src_stride));
     995        3958 :                 s_32[6] =
     996        3958 :                     _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 6 * src_stride));
     997             : 
     998        3958 :                 const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
     999        3958 :                 const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
    1000        3958 :                 const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
    1001        3958 :                 const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
    1002        3958 :                 const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
    1003        7916 :                 const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);
    1004             : 
    1005        3958 :                 ss_128[0] = _mm_unpacklo_epi8(src01, src12);
    1006        7916 :                 ss_128[1] = _mm_unpacklo_epi8(src23, src34);
    1007        3958 :                 ss_128[2] = _mm_unpacklo_epi8(src45, src56);
    1008             : 
    1009             :                 do {
    1010       23472 :                     const __m128i res = y_convolve_8tap_4x2_ssse3(
    1011             :                         src_ptr, src_stride, coeffs_128, s_32, ss_128);
    1012       23472 :                     const __m128i r = sr_y_round_sse2(res);
    1013       23472 :                     pack_store_4x2_sse2(r, dst, dst_stride);
    1014       23472 :                     ss_128[0] = ss_128[1];
    1015       23472 :                     ss_128[1] = ss_128[2];
    1016       23472 :                     ss_128[2] = ss_128[3];
    1017       23472 :                     src_ptr += 2 * src_stride;
    1018       23472 :                     dst += 2 * dst_stride;
    1019       23472 :                     y -= 2;
    1020       23472 :                 } while (y);
    1021             :             }
    1022             :         }
    1023             :         else {
    1024     1844440 :             prepare_half_coeffs_8tap_avx2(
    1025             :                 filter_params_y, subpel_y_q4, coeffs_256);
    1026             : 
    1027     1851490 :             if (w == 8) {
    1028             :                 __m128i s_64[8];
    1029             :                 __m256i ss_256[4];
    1030             : 
    1031      863733 :                 s_64[0] =
    1032      863733 :                     _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
    1033      863733 :                 s_64[1] =
    1034      863733 :                     _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
    1035      863733 :                 s_64[2] =
    1036      863733 :                     _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
    1037      863733 :                 s_64[3] =
    1038      863733 :                     _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
    1039      863733 :                 s_64[4] =
    1040      863733 :                     _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
    1041      863733 :                 s_64[5] =
    1042      863733 :                     _mm_loadl_epi64((__m128i *)(src_ptr + 5 * src_stride));
    1043      863733 :                 s_64[6] =
    1044      863733 :                     _mm_loadl_epi64((__m128i *)(src_ptr + 6 * src_stride));
    1045             : 
    1046             :                 // Load lines a and b. Line a to lower 128, line b to upper 128
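                      :                 // Packing two rows per 256-bit register lets each convolve
                      :                 // call in the loop below produce two output rows at a time.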
    1047      863733 :                 const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
    1048      863733 :                 const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
    1049      863733 :                 const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
    1050      863733 :                 const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
    1051      863733 :                 const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
    1052     1727470 :                 const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[6]);
    1053             : 
    1054      863733 :                 ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
    1055      863733 :                 ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
    1056      863733 :                 ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
    1057             : 
    1058      863733 :                 y = h;
    1059             :                 do {
    1060     5833640 :                     const __m256i res = y_convolve_8tap_8x2_avx2(
    1061             :                         src_ptr, src_stride, coeffs_256, s_64, ss_256);
    1062     5833630 :                     sr_y_round_store_8x2_avx2(res, dst, dst_stride);
    1063     5833600 :                     ss_256[0] = ss_256[1];
    1064     5833600 :                     ss_256[1] = ss_256[2];
    1065     5833600 :                     ss_256[2] = ss_256[3];
    1066     5833600 :                     src_ptr += 2 * src_stride;
    1067     5833600 :                     dst += 2 * dst_stride;
    1068     5833600 :                     y -= 2;
    1069     5833600 :                 } while (y);
    1070             :             }
    1071      987756 :             else if (w == 16) {
    1072             :                 __m128i s_128[8];
    1073             :                 __m256i ss_256[8], r[2];
    1074             : 
    1075      619540 :                 s_128[0] =
    1076      619540 :                     _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
    1077      619540 :                 s_128[1] =
    1078      619540 :                     _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
    1079      619540 :                 s_128[2] =
    1080      619540 :                     _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
    1081      619540 :                 s_128[3] =
    1082      619540 :                     _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
    1083      619540 :                 s_128[4] =
    1084      619540 :                     _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
    1085      619540 :                 s_128[5] =
    1086      619540 :                     _mm_loadu_si128((__m128i *)(src_ptr + 5 * src_stride));
    1087      619540 :                 s_128[6] =
    1088      619540 :                     _mm_loadu_si128((__m128i *)(src_ptr + 6 * src_stride));
    1089             : 
    1090             :                 // Load lines a and b. Line a to lower 128, line b to upper 128
    1091      619540 :                 const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
    1092      619540 :                 const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
    1093      619540 :                 const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
    1094      619540 :                 const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
    1095      619540 :                 const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
    1096     1239080 :                 const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[6]);
    1097             : 
    1098      619540 :                 ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
    1099      619540 :                 ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
    1100      619540 :                 ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
    1101             : 
    1102      619540 :                 ss_256[4] = _mm256_unpackhi_epi8(src01, src12);
    1103      619540 :                 ss_256[5] = _mm256_unpackhi_epi8(src23, src34);
    1104      619540 :                 ss_256[6] = _mm256_unpackhi_epi8(src45, src56);
    1105             : 
    1106      619540 :                 y = h;
    1107             :                 do {
    1108     5751690 :                     y_convolve_8tap_16x2_avx2(
    1109             :                         src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
    1110     5751890 :                     sr_y_round_store_16x2_avx2(r, dst, dst_stride);
    1111             : 
    1112     5751670 :                     ss_256[0] = ss_256[1];
    1113     5751670 :                     ss_256[1] = ss_256[2];
    1114     5751670 :                     ss_256[2] = ss_256[3];
    1115             : 
    1116     5751670 :                     ss_256[4] = ss_256[5];
    1117     5751670 :                     ss_256[5] = ss_256[6];
    1118     5751670 :                     ss_256[6] = ss_256[7];
    1119     5751670 :                     src_ptr += 2 * src_stride;
    1120     5751670 :                     dst += 2 * dst_stride;
    1121     5751670 :                     y -= 2;
    1122     5751670 :                 } while (y);
    1123             :             }
    1124             :             else {
    1125             :                 __m256i s_256[8], ss_256[8], tt_256[8], r[4];
    1126             : 
    1127      368216 :                 assert(!(w % 32));
    1128             : 
    1129      368216 :                 x = 0;
    1130             :                 do {
    1131      443224 :                     const uint8_t *s = src_ptr + x;
    1132      443224 :                     uint8_t *d = dst + x;
    1133             : 
    1134      443224 :                     s_256[0] =
    1135      443224 :                         _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
    1136      443224 :                     s_256[1] =
    1137      443224 :                         _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
    1138      443224 :                     s_256[2] =
    1139      443224 :                         _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
    1140      443224 :                     s_256[3] =
    1141      443224 :                         _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
    1142      443224 :                     s_256[4] =
    1143      443224 :                         _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
    1144      443224 :                     s_256[5] =
    1145      443224 :                         _mm256_loadu_si256((__m256i *)(s + 5 * src_stride));
    1146      443224 :                     s_256[6] =
    1147      443224 :                         _mm256_loadu_si256((__m256i *)(s + 6 * src_stride));
    1148             : 
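                      :                     // Same ss_256/tt_256 double-interleave scheme as the 6-tap
                      :                     // 32-wide path above, widened to the 8-tap filter window.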
    1149      443224 :                     ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
    1150      443224 :                     ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
    1151      443224 :                     ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
    1152      443224 :                     ss_256[4] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
    1153      443224 :                     ss_256[5] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
    1154      443224 :                     ss_256[6] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
    1155             : 
    1156      443224 :                     tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
    1157      443224 :                     tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
    1158      443224 :                     tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[6]);
    1159      443224 :                     tt_256[4] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
    1160      443224 :                     tt_256[5] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
    1161      443224 :                     tt_256[6] = _mm256_unpackhi_epi8(s_256[5], s_256[6]);
    1162             : 
    1163      443224 :                     y = h;
    1164             :                     do {
    1165     5378280 :                         y_convolve_8tap_32x2_avx2(s,
    1166             :                             src_stride,
    1167             :                             coeffs_256,
    1168             :                             s_256,
    1169             :                             ss_256,
    1170             :                             tt_256,
    1171             :                             r);
    1172     5378190 :                         sr_y_round_store_32x2_avx2(r, d, dst_stride);
    1173             : 
    1174     5378350 :                         ss_256[0] = ss_256[1];
    1175     5378350 :                         ss_256[1] = ss_256[2];
    1176     5378350 :                         ss_256[2] = ss_256[3];
    1177     5378350 :                         ss_256[4] = ss_256[5];
    1178     5378350 :                         ss_256[5] = ss_256[6];
    1179     5378350 :                         ss_256[6] = ss_256[7];
    1180             : 
    1181     5378350 :                         tt_256[0] = tt_256[1];
    1182     5378350 :                         tt_256[1] = tt_256[2];
    1183     5378350 :                         tt_256[2] = tt_256[3];
    1184     5378350 :                         tt_256[4] = tt_256[5];
    1185     5378350 :                         tt_256[5] = tt_256[6];
    1186     5378350 :                         tt_256[6] = tt_256[7];
    1187     5378350 :                         s += 2 * src_stride;
    1188     5378350 :                         d += 2 * dst_stride;
    1189     5378350 :                         y -= 2;
    1190     5378350 :                     } while (y);
    1191             : 
    1192      443301 :                     x += 32;
    1193      443301 :                 } while (x < w);
    1194             :             }
    1195             :         }
    1196             :     }
    1197    31828400 : }
    1198             : 
    1199    15049100 : static INLINE void sr_x_2tap_32_avx2(const uint8_t *const src,
    1200             :     const __m256i coeffs[1],
    1201             :     uint8_t *const dst) {
    1202             :     __m256i r[2];
    1203             : 
    1204    15049100 :     x_convolve_2tap_32_avx2(src, coeffs, r);
    1205    15048100 :     sr_x_round_store_32_avx2(r, dst);
    1206    15049400 : }
    1207             : 
    1208     3521320 : static INLINE void sr_x_2tap_32_avg_avx2(const uint8_t *const src,
    1209             :     uint8_t *const dst) {
    1210     3521320 :     const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
    1211     7042630 :     const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
    1212     3521320 :     const __m256i d = _mm256_avg_epu8(s0, s1);
    1213             :     _mm256_storeu_si256((__m256i *)dst, d);
    1214     3521320 : }
    1215             : 
    1216   131651000 : static INLINE void sr_x_6tap_32_avx2(const uint8_t *const src,
    1217             :     const __m256i coeffs[3],
    1218             :     const __m256i *const filt,
    1219             :     uint8_t *const dst) {
    1220             :     __m256i r[2];
    1221             : 
    1222   131651000 :     x_convolve_6tap_16x2_avx2(src, 16, coeffs, filt, r);
    1223   131646000 :     sr_x_round_store_32_avx2(r, dst);
    1224   131676000 : }
    1225             : 
    1226             : SIMD_INLINE void sr_x_8tap_32_avx2(const uint8_t *const src,
    1227             :     const __m256i coeffs[4],
    1228             :     const __m256i *const filt,
    1229             :     uint8_t *const dst) {
    1230             :     __m256i r[2];
    1231             : 
    1232             :     x_convolve_8tap_16x2_avx2(src, 16, coeffs, filt, r);
    1233     9742950 :     sr_x_round_store_32_avx2(r, dst);
    1234     9742950 : }
    1235             : 
    1236    31589400 : void eb_av1_convolve_x_sr_avx2(const uint8_t *src, int32_t src_stride,
    1237             :     uint8_t *dst, int32_t dst_stride, int32_t w,
    1238             :     int32_t h, InterpFilterParams *filter_params_x,
    1239             :     InterpFilterParams *filter_params_y,
    1240             :     const int32_t subpel_x_q4,
    1241             :     const int32_t subpel_y_q4,
    1242             :     ConvolveParams *conv_params) {
    1243    31589400 :     int32_t y = h;
    1244             :     __m128i coeffs_128[4];
    1245             :     __m256i coeffs_256[4];
    1246             : 
    1247             :     (void)filter_params_y;
    1248             :     (void)subpel_y_q4;
    1249             :     (void)conv_params;
    1250             : 
    1251    31589400 :     assert(conv_params->round_0 == 3);
    1252    31589400 :     assert((FILTER_BITS - conv_params->round_1) >= 0 ||
    1253             :         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
    1254             : 
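                      :     // Dispatch on the x filter's tap count (2/4/6/8) and on block width:
                      :     // narrow blocks are filtered two rows per iteration, blocks 32 pixels
                      :     // wide or wider in 32-pixel chunks per row.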
    1255    31589400 :     if (is_convolve_2tap(filter_params_x->filter_ptr)) {
    1256             :         // horz_filt as 2 tap
    1257     3636860 :         const uint8_t *src_ptr = src;
    1258             : 
    1259     3636860 :         if (subpel_x_q4 != 8) {
    1260     2921240 :             if (w <= 8) {
    1261     1462860 :                 prepare_half_coeffs_2tap_ssse3(
    1262             :                     filter_params_x, subpel_x_q4, coeffs_128);
    1263             : 
    1264     1462860 :                 if (w == 2) {
    1265             :                     do {
    1266           0 :                         const __m128i res = x_convolve_2tap_2x2_sse4_1(
    1267             :                             src_ptr, src_stride, coeffs_128);
    1268           0 :                         const __m128i r = sr_x_round_sse2(res);
    1269           0 :                         pack_store_2x2_sse2(r, dst, dst_stride);
    1270           0 :                         src_ptr += 2 * src_stride;
    1271           0 :                         dst += 2 * dst_stride;
    1272           0 :                         y -= 2;
    1273           0 :                     } while (y);
    1274             :                 }
    1275     1462860 :                 else if (w == 4) {
    1276             :                     do {
    1277     1468820 :                         const __m128i res = x_convolve_2tap_4x2_ssse3(
    1278             :                             src_ptr, src_stride, coeffs_128);
    1279     1468840 :                         const __m128i r = sr_x_round_sse2(res);
    1280     1468840 :                         pack_store_4x2_sse2(r, dst, dst_stride);
    1281     1468830 :                         src_ptr += 2 * src_stride;
    1282     1468830 :                         dst += 2 * dst_stride;
    1283     1468830 :                         y -= 2;
    1284     1468830 :                     } while (y);
    1285             :                 }
    1286             :                 else {
    1287     1177460 :                     assert(w == 8);
    1288             : 
    1289             :                     do {
    1290             :                         __m128i res[2];
    1291             : 
    1292     7573850 :                         x_convolve_2tap_8x2_ssse3(
    1293             :                             src_ptr, src_stride, coeffs_128, res);
    1294     7573470 :                         res[0] = sr_x_round_sse2(res[0]);
    1295     7573430 :                         res[1] = sr_x_round_sse2(res[1]);
    1296    15147000 :                         const __m128i d = _mm_packus_epi16(res[0], res[1]);
    1297     7573500 :                         _mm_storel_epi64((__m128i *)dst, d);
    1298     7573500 :                         _mm_storeh_epi64((__m128i *)(dst + dst_stride), d);
    1299             : 
    1300     7573500 :                         src_ptr += 2 * src_stride;
    1301     7573500 :                         dst += 2 * dst_stride;
    1302     7573500 :                         y -= 2;
    1303     7573500 :                     } while (y);
    1304             :                 }
    1305             :             }
    1306             :             else {
    1307     1458380 :                 prepare_half_coeffs_2tap_avx2(
    1308             :                     filter_params_x, subpel_x_q4, coeffs_256);
    1309             : 
    1310     1458590 :                 if (w == 16) {
    1311             :                     do {
    1312             :                         __m256i r[2];
    1313             : 
    1314     7726420 :                         x_convolve_2tap_16x2_avx2(
    1315             :                             src_ptr, src_stride, coeffs_256, r);
    1316     7725890 :                         sr_x_round_store_16x2_avx2(r, dst, dst_stride);
    1317     7726200 :                         src_ptr += 2 * src_stride;
    1318     7726200 :                         dst += 2 * dst_stride;
    1319     7726200 :                         y -= 2;
    1320     7726200 :                     } while (y);
    1321             :                 }
    1322      503787 :                 else if (w == 32) {
    1323             :                     do {
    1324     9107900 :                         sr_x_2tap_32_avx2(src_ptr, coeffs_256, dst);
    1325     9107810 :                         src_ptr += src_stride;
    1326     9107810 :                         dst += dst_stride;
    1327     9107810 :                     } while (--y);
    1328             :                 }
    1329       93306 :                 else if (w == 64) {
    1330             :                     do {
    1331     2971890 :                         sr_x_2tap_32_avx2(
    1332             :                             src_ptr + 0 * 32, coeffs_256, dst + 0 * 32);
    1333     2971860 :                         sr_x_2tap_32_avx2(
    1334             :                             src_ptr + 1 * 32, coeffs_256, dst + 1 * 32);
    1335     2971870 :                         src_ptr += src_stride;
    1336     2971870 :                         dst += dst_stride;
    1337     2971870 :                     } while (--y);
    1338             :                 }
    1339             :                 else {
    1340           0 :                     assert(w == 128);
    1341             : 
    1342             :                     do {
    1343           0 :                         sr_x_2tap_32_avx2(
    1344             :                             src_ptr + 0 * 32, coeffs_256, dst + 0 * 32);
    1345           0 :                         sr_x_2tap_32_avx2(
    1346             :                             src_ptr + 1 * 32, coeffs_256, dst + 1 * 32);
    1347           0 :                         sr_x_2tap_32_avx2(
    1348             :                             src_ptr + 2 * 32, coeffs_256, dst + 2 * 32);
    1349           0 :                         sr_x_2tap_32_avx2(
    1350             :                             src_ptr + 3 * 32, coeffs_256, dst + 3 * 32);
    1351           0 :                         src_ptr += src_stride;
    1352           0 :                         dst += dst_stride;
    1353           0 :                     } while (--y);
    1354             :                 }
    1355             :             }
    1356             :         }
    1357             :         else {
    1358             :             // average to get half pel
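                      :             // At subpel_x_q4 == 8 the 2-tap kernel is (64, 64), so the result
                      :             // reduces to the rounded average (s[0] + s[1] + 1) >> 1, which
                      :             // _mm_avg_epu8 / _mm256_avg_epu8 compute directly.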
    1359      715617 :             if (w == 2) {
    1360             :                 do {
    1361             :                     __m128i s_128;
    1362             : 
    1363           1 :                     s_128 = load_u8_4x2_sse4_1(src_ptr, src_stride);
    1364           0 :                     const __m128i s1 = _mm_srli_si128(s_128, 1);
    1365           0 :                     const __m128i d = _mm_avg_epu8(s_128, s1);
    1366           0 :                     *(uint16_t *)dst = (uint16_t)_mm_cvtsi128_si32(d);
    1367           0 :                     *(uint16_t *)(dst + dst_stride) = _mm_extract_epi16(d, 2);
    1368             : 
    1369           0 :                     src_ptr += 2 * src_stride;
    1370           0 :                     dst += 2 * dst_stride;
    1371           0 :                     y -= 2;
    1372           0 :                 } while (y);
    1373             :             }
    1374      715616 :             else if (w == 4) {
    1375             :                 do {
    1376             :                     __m128i s_128;
    1377             : 
    1378      453792 :                     s_128 = load_u8_8x2_sse2(src_ptr, src_stride);
    1379      453794 :                     const __m128i s1 = _mm_srli_si128(s_128, 1);
    1380      453794 :                     const __m128i d = _mm_avg_epu8(s_128, s1);
    1381      453794 :                     xx_storel_32(dst, d);
    1382      453795 :                     *(int32_t *)(dst + dst_stride) = _mm_extract_epi32(d, 2);
    1383             : 
    1384      453795 :                     src_ptr += 2 * src_stride;
    1385      453795 :                     dst += 2 * dst_stride;
    1386      453795 :                     y -= 2;
    1387      453795 :                 } while (y);
    1388             :             }
    1389      616347 :             else if (w == 8) {
    1390             :                 do {
    1391     1703770 :                     const __m128i s00 = _mm_loadu_si128((__m128i *)src_ptr);
    1392             :                     const __m128i s10 =
    1393     1703770 :                         _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
    1394     1703770 :                     const __m128i s01 = _mm_srli_si128(s00, 1);
    1395     1703770 :                     const __m128i s11 = _mm_srli_si128(s10, 1);
    1396     1703770 :                     const __m128i d0 = _mm_avg_epu8(s00, s01);
    1397     1703770 :                     const __m128i d1 = _mm_avg_epu8(s10, s11);
    1398     1703770 :                     _mm_storel_epi64((__m128i *)dst, d0);
    1399     1703770 :                     _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);
    1400             : 
    1401     1703770 :                     src_ptr += 2 * src_stride;
    1402     1703770 :                     dst += 2 * dst_stride;
    1403     1703770 :                     y -= 2;
    1404     1703770 :                 } while (y);
    1405             :             }
    1406      337142 :             else if (w == 16) {
    1407             :                 do {
    1408     1793350 :                     const __m128i s00 = _mm_loadu_si128((__m128i *)src_ptr);
    1409             :                     const __m128i s01 =
    1410     1793350 :                         _mm_loadu_si128((__m128i *)(src_ptr + 1));
    1411             :                     const __m128i s10 =
    1412     1793350 :                         _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
    1413             :                     const __m128i s11 =
    1414     3586700 :                         _mm_loadu_si128((__m128i *)(src_ptr + src_stride + 1));
    1415     1793350 :                     const __m128i d0 = _mm_avg_epu8(s00, s01);
    1416     1793350 :                     const __m128i d1 = _mm_avg_epu8(s10, s11);
    1417             :                     _mm_storeu_si128((__m128i *)dst, d0);
    1418     1793350 :                     _mm_storeu_si128((__m128i *)(dst + dst_stride), d1);
    1419             : 
    1420     1793350 :                     src_ptr += 2 * src_stride;
    1421     1793350 :                     dst += 2 * dst_stride;
    1422     1793350 :                     y -= 2;
    1423     1793350 :                 } while (y);
    1424             :             }
    1425      110967 :             else if (w == 32) {
    1426             :                 do {
    1427     2033370 :                     sr_x_2tap_32_avg_avx2(src_ptr, dst);
    1428     2033370 :                     src_ptr += src_stride;
    1429     2033370 :                     dst += dst_stride;
    1430     2033370 :                 } while (--y);
    1431             :             }
    1432       22111 :             else if (w == 64) {
    1433             :                 do {
    1434      744004 :                     sr_x_2tap_32_avg_avx2(src_ptr + 0 * 32, dst + 0 * 32);
    1435      744001 :                     sr_x_2tap_32_avg_avx2(src_ptr + 1 * 32, dst + 1 * 32);
    1436      744002 :                     src_ptr += src_stride;
    1437      744002 :                     dst += dst_stride;
    1438      744002 :                 } while (--y);
    1439             :             }
    1440             :             else {
    1441           0 :                 assert(w == 128);
    1442             : 
    1443             :                 do {
    1444           0 :                     sr_x_2tap_32_avg_avx2(src_ptr + 0 * 32, dst + 0 * 32);
    1445           0 :                     sr_x_2tap_32_avg_avx2(src_ptr + 1 * 32, dst + 1 * 32);
    1446           0 :                     sr_x_2tap_32_avg_avx2(src_ptr + 2 * 32, dst + 2 * 32);
    1447           0 :                     sr_x_2tap_32_avg_avx2(src_ptr + 3 * 32, dst + 3 * 32);
    1448           0 :                     src_ptr += src_stride;
    1449           0 :                     dst += dst_stride;
    1450           0 :                 } while (--y);
    1451             :             }
    1452             :         }
    1453             :     }
    1454    27955100 :     else if (is_convolve_4tap(filter_params_x->filter_ptr)) {
    1455             :         // horz_filt as 4 tap
    1456     2876230 :         const uint8_t *src_ptr = src - 1;
    1457             : 
    1458     2876230 :         prepare_half_coeffs_4tap_ssse3(
    1459             :             filter_params_x, subpel_x_q4, coeffs_128);
    1460             : 
    1461     2881160 :         if (w == 2) {
    1462             :             do {
    1463             :                 const __m128i res =
    1464      104938 :                     x_convolve_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
    1465      104946 :                 const __m128i r = sr_x_round_sse2(res);
    1466      104946 :                 pack_store_2x2_sse2(r, dst, dst_stride);
    1467      104946 :                 src_ptr += 2 * src_stride;
    1468      104946 :                 dst += 2 * dst_stride;
    1469      104946 :                 y -= 2;
    1470      104946 :             } while (y);
    1471             :         }
    1472             :         else {
    1473     2846640 :             assert(w == 4);
    1474             : 
    1475             :             do {
    1476             :                 const __m128i res =
    1477    15446900 :                     x_convolve_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
    1478    15446100 :                 const __m128i r = sr_x_round_sse2(res);
    1479    15445500 :                 pack_store_4x2_sse2(r, dst, dst_stride);
    1480    15445800 :                 src_ptr += 2 * src_stride;
    1481    15445800 :                 dst += 2 * dst_stride;
    1482    15445800 :                 y -= 2;
    1483    15445800 :             } while (y);
    1484             :         }
    1485             :     }
    1486             :     else {
    1487             :         __m256i filt_256[4];
    1488             : 
    1489    25091400 :         filt_256[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
    1490    25091400 :         filt_256[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
    1491    25091400 :         filt_256[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
    1492             : 
    1493    25091400 :         if (is_convolve_6tap(filter_params_x->filter_ptr)) {
    1494             :             // horz_filt as 6 tap
    1495    23262900 :             const uint8_t *src_ptr = src - 2;
    1496             : 
    1497    23262900 :             prepare_half_coeffs_6tap_avx2(
    1498             :                 filter_params_x, subpel_x_q4, coeffs_256);
    1499             : 
    1500    23277900 :             if (w == 8) {
    1501             :                 do {
    1502    67736200 :                     const __m256i res = x_convolve_6tap_8x2_avx2(
    1503             :                         src_ptr, src_stride, coeffs_256, filt_256);
    1504    67726200 :                     sr_x_round_store_8x2_avx2(res, dst, dst_stride);
    1505    67723200 :                     src_ptr += 2 * src_stride;
    1506    67723200 :                     dst += 2 * dst_stride;
    1507    67723200 :                     y -= 2;
    1508    67723200 :                 } while (y);
    1509             :             }
    1510    12797200 :             else if (w == 16) {
    1511             :                 do {
    1512             :                     __m256i r[2];
    1513             : 
    1514    67176900 :                     x_convolve_6tap_16x2_avx2(
    1515             :                         src_ptr, src_stride, coeffs_256, filt_256, r);
    1516    67164900 :                     sr_x_round_store_16x2_avx2(r, dst, dst_stride);
    1517    67158500 :                     src_ptr += 2 * src_stride;
    1518    67158500 :                     dst += 2 * dst_stride;
    1519    67158500 :                     y -= 2;
    1520    67158500 :                 } while (y);
    1521             :             }
    1522     4382890 :             else if (w == 32) {
    1523             :                 do {
    1524    76682000 :                     sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
    1525    76674100 :                     src_ptr += src_stride;
    1526    76674100 :                     dst += dst_stride;
    1527    76674100 :                 } while (--y);
    1528             :             }
    1529      876998 :             else if (w == 64) {
    1530             :                 do {
    1531    27582800 :                     sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
    1532    27582200 :                     sr_x_6tap_32_avx2(
    1533             :                         src_ptr + 32, coeffs_256, filt_256, dst + 32);
    1534    27582100 :                     src_ptr += src_stride;
    1535    27582100 :                     dst += dst_stride;
    1536    27582100 :                 } while (--y);
    1537             :             }
    1538             :             else {
    1539           0 :                 assert(w == 128);
    1540             : 
    1541             :                 do {
    1542           0 :                     sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
    1543           0 :                     sr_x_6tap_32_avx2(
    1544             :                         src_ptr + 1 * 32, coeffs_256, filt_256, dst + 1 * 32);
    1545           0 :                     sr_x_6tap_32_avx2(
    1546             :                         src_ptr + 2 * 32, coeffs_256, filt_256, dst + 2 * 32);
    1547           0 :                     sr_x_6tap_32_avx2(
    1548             :                         src_ptr + 3 * 32, coeffs_256, filt_256, dst + 3 * 32);
    1549           0 :                     src_ptr += src_stride;
    1550           0 :                     dst += dst_stride;
    1551           0 :                 } while (--y);
    1552             :             }
    1553             :         }
    1554             :         else {
    1555             :             // horz_filt as 8 tap
    1556     1826660 :             const uint8_t *src_ptr = src - 3;
    1557             : 
    1558     1826660 :             filt_256[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
    1559             : 
    1560     1826660 :             prepare_half_coeffs_8tap_avx2(
    1561             :                 filter_params_x, subpel_x_q4, coeffs_256);
    1562             : 
    1563     1826810 :             if (w == 8) {
    1564             :                 do {
    1565     5729910 :                     const __m256i res = x_convolve_8tap_8x2_avx2(
    1566             :                         src_ptr, src_stride, coeffs_256, filt_256);
    1567     5766150 :                     sr_x_round_store_8x2_avx2(res, dst, dst_stride);
    1568     5766080 :                     src_ptr += 2 * src_stride;
    1569     5766080 :                     dst += 2 * dst_stride;
    1570     5766080 :                     y -= 2;
    1571     5766080 :                 } while (y);
    1572             :             }
    1573      957222 :             else if (w == 16) {
    1574             :                 do {
    1575             :                     __m256i r[2];
    1576             : 
    1577             :                     x_convolve_8tap_16x2_avx2(
    1578             :                         src_ptr, src_stride, coeffs_256, filt_256, r);
    1579     5540220 :                     sr_x_round_store_16x2_avx2(r, dst, dst_stride);
    1580     5540340 :                     src_ptr += 2 * src_stride;
    1581     5540340 :                     dst += 2 * dst_stride;
    1582     5540340 :                     y -= 2;
    1583     5540340 :                 } while (y);
    1584             :             }
    1585      344321 :             else if (w == 32) {
    1586             :                 do {
    1587             :                     sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
    1588     5889800 :                     src_ptr += src_stride;
    1589     5889800 :                     dst += dst_stride;
    1590     5889800 :                 } while (--y);
    1591             :             }
    1592       64873 :             else if (w == 64) {
    1593             :                 do {
    1594             :                     sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
    1595     1926560 :                     sr_x_8tap_32_avx2(
    1596             :                         src_ptr + 32, coeffs_256, filt_256, dst + 32);
    1597     1926580 :                     src_ptr += src_stride;
    1598     1926580 :                     dst += dst_stride;
    1599     1926580 :                 } while (--y);
    1600             :             }
    1601             :             else {
    1602           0 :                 assert(w == 128);
    1603             : 
    1604             :                 do {
    1605             :                     sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
    1606           0 :                     sr_x_8tap_32_avx2(
    1607             :                         src_ptr + 1 * 32, coeffs_256, filt_256, dst + 1 * 32);
    1608           0 :                     sr_x_8tap_32_avx2(
    1609             :                         src_ptr + 2 * 32, coeffs_256, filt_256, dst + 2 * 32);
    1610           0 :                     sr_x_8tap_32_avx2(
    1611             :                         src_ptr + 3 * 32, coeffs_256, filt_256, dst + 3 * 32);
    1612           0 :                     src_ptr += src_stride;
    1613           0 :                     dst += dst_stride;
    1614           0 :                 } while (--y);
    1615             :             }
    1616             :         }
    1617             :     }
    1618    31617800 : }
    1619             : 
    1620             : // Loads and stores to do away with the tedium of casting the address
    1621             : // to the right type.
    1622           0 : static INLINE __m128i xx_loadl_32(const void *a) {
    1623             :     int val;
    1624           0 :     memcpy(&val, a, sizeof(val));
    1625           0 :     return _mm_cvtsi32_si128(val);
    1626             : }
    1627  1190760000 : static INLINE __m128i xx_load_128(const void *a) {
    1628  1190760000 :     return _mm_load_si128((const __m128i *)a);
    1629             : }
    1630             : 
    1631  1032330000 : static INLINE __m256i calc_mask_avx2(const __m256i mask_base, const __m256i s0,
    1632             :     const __m256i s1) {
    1633  2064650000 :     const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(s0, s1));
    1634  3096980000 :     return _mm256_abs_epi16(
    1635             :         _mm256_add_epi16(mask_base, _mm256_srli_epi16(diff, 4)));
     1636             :     // clamp(diff, 0, 64) can be skipped because diff is always in the range (38, 54)
    1637             : }
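                      : // Per pixel this is |mask_base + (|s0 - s1| >> 4)|: with mask_base = 38 it
                      : // yields the DIFFWTD_38 weight directly, and with mask_base = 38 - 64 the
                      : // outer abs() produces the inverted weight 64 - (38 + (|s0 - s1| >> 4)).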
    1638    39720800 : void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask,
    1639             :     DIFFWTD_MASK_TYPE mask_type,
    1640             :     const uint8_t *src0, int src0_stride,
    1641             :     const uint8_t *src1, int src1_stride,
    1642             :     int h, int w) {
    1643    39720800 :     const int mb = (mask_type == DIFFWTD_38_INV) ? AOM_BLEND_A64_MAX_ALPHA : 0;
    1644    39720800 :     const __m256i y_mask_base = _mm256_set1_epi16(38 - mb);
    1645    39720800 :     int i = 0;
    1646    39720800 :     if (4 == w) {
    1647             :         do {
    1648           0 :             const __m128i s0A = xx_loadl_32(src0);
    1649           0 :             const __m128i s0B = xx_loadl_32(src0 + src0_stride);
    1650           0 :             const __m128i s0C = xx_loadl_32(src0 + src0_stride * 2);
    1651           0 :             const __m128i s0D = xx_loadl_32(src0 + src0_stride * 3);
    1652           0 :             const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B);
    1653           0 :             const __m128i s0CD = _mm_unpacklo_epi32(s0C, s0D);
    1654           0 :             const __m128i s0ABCD = _mm_unpacklo_epi64(s0AB, s0CD);
    1655           0 :             const __m256i s0ABCD_w = _mm256_cvtepu8_epi16(s0ABCD);
    1656             : 
    1657           0 :             const __m128i s1A = xx_loadl_32(src1);
    1658           0 :             const __m128i s1B = xx_loadl_32(src1 + src1_stride);
    1659           0 :             const __m128i s1C = xx_loadl_32(src1 + src1_stride * 2);
    1660           0 :             const __m128i s1D = xx_loadl_32(src1 + src1_stride * 3);
    1661           0 :             const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B);
    1662           0 :             const __m128i s1CD = _mm_unpacklo_epi32(s1C, s1D);
    1663           0 :             const __m128i s1ABCD = _mm_unpacklo_epi64(s1AB, s1CD);
    1664           0 :             const __m256i s1ABCD_w = _mm256_cvtepu8_epi16(s1ABCD);
    1665           0 :             const __m256i m16 = calc_mask_avx2(y_mask_base, s0ABCD_w, s1ABCD_w);
    1666           0 :             const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256());
    1667             :             const __m128i x_m8 =
    1668           0 :                 _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8));
    1669           0 :             xx_storeu_128(mask, x_m8);
    1670           0 :             src0 += (src0_stride << 2);
    1671           0 :             src1 += (src1_stride << 2);
    1672           0 :             mask += 16;
    1673           0 :             i += 4;
    1674           0 :         } while (i < h);
    1675             :     }
    1676    39734900 :     else if (8 == w) {
    1677             :         do {
    1678    62336500 :             const __m128i s0A = xx_loadl_64(src0);
    1679    62303500 :             const __m128i s0B = xx_loadl_64(src0 + src0_stride);
    1680    62292900 :             const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2);
    1681    62270900 :             const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3);
    1682   124545000 :             const __m256i s0AC_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0A, s0C));
    1683    62272300 :             const __m256i s0BD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0B, s0D));
    1684    62272300 :             const __m128i s1A = xx_loadl_64(src1);
    1685    62272500 :             const __m128i s1B = xx_loadl_64(src1 + src1_stride);
    1686    62260500 :             const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2);
    1687    62254900 :             const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3);
     1688   124529000 :             const __m256i s1AC_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1A, s1C));
     1689    62264300 :             const __m256i s1BD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1B, s1D));
     1690    62264300 :             const __m256i m16AC = calc_mask_avx2(y_mask_base, s0AC_w, s1AC_w);
     1691    62294500 :             const __m256i m16BD = calc_mask_avx2(y_mask_base, s0BD_w, s1BD_w);
    1692    62303400 :             const __m256i m8 = _mm256_packus_epi16(m16AC, m16BD);
    1693    62303400 :             yy_storeu_256(mask, m8);
    1694    62300900 :             src0 += src0_stride << 2;
    1695    62300900 :             src1 += src1_stride << 2;
    1696    62300900 :             mask += 32;
    1697    62300900 :             i += 4;
    1698    62300900 :         } while (i < h);
    1699             :     }
    1700    23000800 :     else if (16 == w) {
    1701             :         do {
    1702   129757000 :             const __m128i s0A = xx_load_128(src0);
    1703   129658000 :             const __m128i s0B = xx_load_128(src0 + src0_stride);
    1704   129595000 :             const __m128i s1A = xx_load_128(src1);
    1705   129525000 :             const __m128i s1B = xx_load_128(src1 + src1_stride);
    1706   129485000 :             const __m256i s0AL = _mm256_cvtepu8_epi16(s0A);
    1707   129485000 :             const __m256i s0BL = _mm256_cvtepu8_epi16(s0B);
    1708   129485000 :             const __m256i s1AL = _mm256_cvtepu8_epi16(s1A);
    1709   129485000 :             const __m256i s1BL = _mm256_cvtepu8_epi16(s1B);
    1710             : 
    1711   129485000 :             const __m256i m16AL = calc_mask_avx2(y_mask_base, s0AL, s1AL);
    1712   129583000 :             const __m256i m16BL = calc_mask_avx2(y_mask_base, s0BL, s1BL);
    1713             : 
    1714             :             const __m256i m8 =
    1715   129655000 :                 _mm256_permute4x64_epi64(_mm256_packus_epi16(m16AL, m16BL), 0xd8);
    1716   129655000 :             yy_storeu_256(mask, m8);
    1717   129670000 :             src0 += src0_stride << 1;
    1718   129670000 :             src1 += src1_stride << 1;
    1719   129670000 :             mask += 32;
    1720   129670000 :             i += 2;
    1721   129670000 :         } while (i < h);
    1722             :     }
    1723             :     else {
    1724             :         do {
    1725   250572000 :             int j = 0;
    1726             :             do {
    1727   330371000 :                 const __m256i s0 = yy_loadu_256(src0 + j);
    1728   330115000 :                 const __m256i s1 = yy_loadu_256(src1 + j);
    1729   659290000 :                 const __m256i s0L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s0));
    1730   329645000 :                 const __m256i s1L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s1));
    1731             :                 const __m256i s0H =
    1732   329645000 :                     _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s0, 1));
    1733             :                 const __m256i s1H =
    1734   329645000 :                     _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s1, 1));
    1735   329645000 :                 const __m256i m16L = calc_mask_avx2(y_mask_base, s0L, s1L);
    1736   330116000 :                 const __m256i m16H = calc_mask_avx2(y_mask_base, s0H, s1H);
    1737             :                 const __m256i m8 =
    1738   330583000 :                     _mm256_permute4x64_epi64(_mm256_packus_epi16(m16L, m16H), 0xd8);
    1739   330583000 :                 yy_storeu_256(mask + j, m8);
    1740   330376000 :                 j += 32;
    1741   330376000 :             } while (j < w);
    1742   250576000 :             src0 += src0_stride;
    1743   250576000 :             src1 += src1_stride;
    1744   250576000 :             mask += w;
    1745   250576000 :             i += 1;
    1746   250576000 :         } while (i < h);
    1747             :     }
    1748    39616700 : }
    1749             : ////////
    1750             : 
    1751   372286000 : static INLINE __m256i calc_mask_d16_avx2(const __m256i *data_src0,
    1752             :     const __m256i *data_src1,
    1753             :     const __m256i *round_const,
    1754             :     const __m256i *mask_base_16,
    1755             :     const __m256i *clip_diff, int round) {
    1756   372286000 :     const __m256i diffa = _mm256_subs_epu16(*data_src0, *data_src1);
    1757   744573000 :     const __m256i diffb = _mm256_subs_epu16(*data_src1, *data_src0);
    1758   372286000 :     const __m256i diff = _mm256_max_epu16(diffa, diffb);
    1759             :     const __m256i diff_round =
    1760  1116860000 :         _mm256_srli_epi16(_mm256_adds_epu16(diff, *round_const), round);
    1761   372286000 :     const __m256i diff_factor = _mm256_srli_epi16(diff_round, DIFF_FACTOR_LOG2);
    1762   372286000 :     const __m256i diff_mask = _mm256_adds_epi16(diff_factor, *mask_base_16);
    1763   372286000 :     const __m256i diff_clamp = _mm256_min_epi16(diff_mask, *clip_diff);
    1764   372286000 :     return diff_clamp;
    1765             : }
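                      : // Per pixel (with unsigned 16-bit saturation):
                      : //   mask = min(*clip_diff, *mask_base_16 +
                      : //              (((|s0 - s1| + *round_const) >> round) >> DIFF_FACTOR_LOG2))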
    1766             : 
    1767   325266000 : static INLINE __m256i calc_mask_d16_inv_avx2(const __m256i *data_src0,
    1768             :     const __m256i *data_src1,
    1769             :     const __m256i *round_const,
    1770             :     const __m256i *mask_base_16,
    1771             :     const __m256i *clip_diff,
    1772             :     int round) {
    1773   325266000 :     const __m256i diffa = _mm256_subs_epu16(*data_src0, *data_src1);
    1774   650533000 :     const __m256i diffb = _mm256_subs_epu16(*data_src1, *data_src0);
    1775   325266000 :     const __m256i diff = _mm256_max_epu16(diffa, diffb);
    1776             :     const __m256i diff_round =
    1777   975799000 :         _mm256_srli_epi16(_mm256_adds_epu16(diff, *round_const), round);
    1778   325266000 :     const __m256i diff_factor = _mm256_srli_epi16(diff_round, DIFF_FACTOR_LOG2);
    1779   325266000 :     const __m256i diff_mask = _mm256_adds_epi16(diff_factor, *mask_base_16);
    1780   325266000 :     const __m256i diff_clamp = _mm256_min_epi16(diff_mask, *clip_diff);
    1781   325266000 :     const __m256i diff_const_16 = _mm256_sub_epi16(*clip_diff, diff_clamp);
    1782   325266000 :     return diff_const_16;
    1783             : }
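                      : // Same computation as calc_mask_d16_avx2, but returns *clip_diff - mask,
                      : // i.e. the inverted (DIFFWTD_38_INV) weight.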
    1784             : 
    1785    14127100 : static INLINE void build_compound_diffwtd_mask_d16_avx2(
    1786             :     uint8_t *mask, const CONV_BUF_TYPE *src0, int src0_stride,
    1787             :     const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) {
    1788    14127100 :     const int mask_base = 38;
    1789    14127100 :     const __m256i _r = _mm256_set1_epi16((1 << shift) >> 1);
    1790    28254300 :     const __m256i y38 = _mm256_set1_epi16(mask_base);
    1791    14127100 :     const __m256i y64 = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
    1792    14127100 :     int i = 0;
    1793    14127100 :     if (w == 4) {
    1794             :         do {
    1795           0 :             const __m128i s0A = xx_loadl_64(src0);
    1796           0 :             const __m128i s0B = xx_loadl_64(src0 + src0_stride);
    1797           0 :             const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2);
    1798           0 :             const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3);
    1799           0 :             const __m128i s1A = xx_loadl_64(src1);
    1800           0 :             const __m128i s1B = xx_loadl_64(src1 + src1_stride);
    1801           0 :             const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2);
    1802           0 :             const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3);
    1803           0 :             const __m256i s0 = yy_set_m128i(_mm_unpacklo_epi64(s0C, s0D),
    1804             :                 _mm_unpacklo_epi64(s0A, s0B));
    1805           0 :             const __m256i s1 = yy_set_m128i(_mm_unpacklo_epi64(s1C, s1D),
    1806             :                 _mm_unpacklo_epi64(s1A, s1B));
    1807           0 :             const __m256i m16 = calc_mask_d16_avx2(&s0, &s1, &_r, &y38, &y64, shift);
    1808           0 :             const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256());
    1809           0 :             xx_storeu_128(mask,
    1810           0 :                 _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8)));
    1811           0 :             src0 += src0_stride << 2;
    1812           0 :             src1 += src1_stride << 2;
    1813           0 :             mask += 16;
    1814           0 :             i += 4;
    1815           0 :         } while (i < h);
    1816             :     }
    1817    14129300 :     else if (w == 8) {
    1818             :         do {
    1819    22611500 :             const __m256i s0AB = yy_loadu2_128(src0 + src0_stride, src0);
    1820    22600600 :             const __m256i s0CD =
    1821    22606700 :                 yy_loadu2_128(src0 + src0_stride * 3, src0 + src0_stride * 2);
    1822    22600600 :             const __m256i s1AB = yy_loadu2_128(src1 + src1_stride, src1);
    1823    22597500 :             const __m256i s1CD =
    1824    22597900 :                 yy_loadu2_128(src1 + src1_stride * 3, src1 + src1_stride * 2);
    1825             :             const __m256i m16AB =
    1826    22597500 :                 calc_mask_d16_avx2(&s0AB, &s1AB, &_r, &y38, &y64, shift);
    1827             :             const __m256i m16CD =
    1828    22608700 :                 calc_mask_d16_avx2(&s0CD, &s1CD, &_r, &y38, &y64, shift);
    1829    22610200 :             const __m256i m8 = _mm256_packus_epi16(m16AB, m16CD);
    1830    22610200 :             yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
    1831    22609400 :             src0 += src0_stride << 2;
    1832    22609400 :             src1 += src1_stride << 2;
    1833    22609400 :             mask += 32;
    1834    22609400 :             i += 4;
    1835    22609400 :         } while (i < h);
    1836             :     }
    1837     8209340 :     else if (w == 16) {
    1838             :         do {
    1839    46565600 :             const __m256i s0A = yy_loadu_256(src0);
    1840    46552800 :             const __m256i s0B = yy_loadu_256(src0 + src0_stride);
    1841    46539300 :             const __m256i s1A = yy_loadu_256(src1);
    1842    46530300 :             const __m256i s1B = yy_loadu_256(src1 + src1_stride);
    1843             :             const __m256i m16A =
    1844    46525200 :                 calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
    1845             :             const __m256i m16B =
    1846    46569400 :                 calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
    1847    46559300 :             const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
    1848    46559300 :             yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
    1849    46554700 :             src0 += src0_stride << 1;
    1850    46554700 :             src1 += src1_stride << 1;
    1851    46554700 :             mask += 32;
    1852    46554700 :             i += 2;
    1853    46554700 :         } while (i < h);
    1854             :     }
    1855     3680100 :     else if (w == 32) {
    1856             :         do {
    1857    61400100 :             const __m256i s0A = yy_loadu_256(src0);
    1858    61383800 :             const __m256i s0B = yy_loadu_256(src0 + 16);
    1859    61360900 :             const __m256i s1A = yy_loadu_256(src1);
    1860    61347700 :             const __m256i s1B = yy_loadu_256(src1 + 16);
    1861             :             const __m256i m16A =
    1862    61335000 :                 calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
    1863             :             const __m256i m16B =
    1864    61413900 :                 calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
    1865    61398400 :             const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
    1866    61398400 :             yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
    1867    61390000 :             src0 += src0_stride;
    1868    61390000 :             src1 += src1_stride;
    1869    61390000 :             mask += 32;
    1870    61390000 :             i += 1;
    1871    61390000 :         } while (i < h);
    1872             :     }
    1873      894399 :     else if (w == 64) {
    1874             :         do {
    1875    28302200 :             const __m256i s0A = yy_loadu_256(src0);
    1876    28295400 :             const __m256i s0B = yy_loadu_256(src0 + 16);
    1877    28288900 :             const __m256i s0C = yy_loadu_256(src0 + 32);
    1878    28283900 :             const __m256i s0D = yy_loadu_256(src0 + 48);
    1879    28278200 :             const __m256i s1A = yy_loadu_256(src1);
    1880    28276700 :             const __m256i s1B = yy_loadu_256(src1 + 16);
    1881    28276800 :             const __m256i s1C = yy_loadu_256(src1 + 32);
    1882    28276200 :             const __m256i s1D = yy_loadu_256(src1 + 48);
    1883             :             const __m256i m16A =
    1884    28275600 :                 calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
    1885             :             const __m256i m16B =
    1886    28309200 :                 calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
    1887             :             const __m256i m16C =
    1888    28316100 :                 calc_mask_d16_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
    1889             :             const __m256i m16D =
    1890    28316000 :                 calc_mask_d16_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
    1891    28310300 :             const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
    1892    28310300 :             const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
    1893    28310300 :             yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
    1894    28305600 :             yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
    1895    28297900 :             src0 += src0_stride;
    1896    28297900 :             src1 += src1_stride;
    1897    28297900 :             mask += 64;
    1898    28297900 :             i += 1;
    1899    28297900 :         } while (i < h);
    1900             :     }
    1901             :     else {
    1902             :         do {
    1903           0 :             const __m256i s0A = yy_loadu_256(src0);
    1904           0 :             const __m256i s0B = yy_loadu_256(src0 + 16);
    1905           0 :             const __m256i s0C = yy_loadu_256(src0 + 32);
    1906           0 :             const __m256i s0D = yy_loadu_256(src0 + 48);
    1907           0 :             const __m256i s0E = yy_loadu_256(src0 + 64);
    1908           0 :             const __m256i s0F = yy_loadu_256(src0 + 80);
    1909           0 :             const __m256i s0G = yy_loadu_256(src0 + 96);
    1910           0 :             const __m256i s0H = yy_loadu_256(src0 + 112);
    1911           0 :             const __m256i s1A = yy_loadu_256(src1);
    1912           0 :             const __m256i s1B = yy_loadu_256(src1 + 16);
    1913           0 :             const __m256i s1C = yy_loadu_256(src1 + 32);
    1914           0 :             const __m256i s1D = yy_loadu_256(src1 + 48);
    1915           0 :             const __m256i s1E = yy_loadu_256(src1 + 64);
    1916           0 :             const __m256i s1F = yy_loadu_256(src1 + 80);
    1917           0 :             const __m256i s1G = yy_loadu_256(src1 + 96);
    1918           0 :             const __m256i s1H = yy_loadu_256(src1 + 112);
    1919             :             const __m256i m16A =
    1920           0 :                 calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
    1921             :             const __m256i m16B =
    1922           0 :                 calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
    1923             :             const __m256i m16C =
    1924           0 :                 calc_mask_d16_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
    1925             :             const __m256i m16D =
    1926           0 :                 calc_mask_d16_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
    1927             :             const __m256i m16E =
    1928           0 :                 calc_mask_d16_avx2(&s0E, &s1E, &_r, &y38, &y64, shift);
    1929             :             const __m256i m16F =
    1930           0 :                 calc_mask_d16_avx2(&s0F, &s1F, &_r, &y38, &y64, shift);
    1931             :             const __m256i m16G =
    1932           0 :                 calc_mask_d16_avx2(&s0G, &s1G, &_r, &y38, &y64, shift);
    1933             :             const __m256i m16H =
    1934           0 :                 calc_mask_d16_avx2(&s0H, &s1H, &_r, &y38, &y64, shift);
    1935           0 :             const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
    1936           0 :             const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
    1937           0 :             const __m256i m8EF = _mm256_packus_epi16(m16E, m16F);
    1938           0 :             const __m256i m8GH = _mm256_packus_epi16(m16G, m16H);
    1939           0 :             yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
    1940           0 :             yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
    1941           0 :             yy_storeu_256(mask + 64, _mm256_permute4x64_epi64(m8EF, 0xd8));
    1942           0 :             yy_storeu_256(mask + 96, _mm256_permute4x64_epi64(m8GH, 0xd8));
    1943           0 :             src0 += src0_stride;
    1944           0 :             src1 += src1_stride;
    1945           0 :             mask += 128;
    1946           0 :             i += 1;
    1947           0 :         } while (i < h);
    1948             :     }
    1949    14101900 : }
    1950    12237500 : static INLINE void build_compound_diffwtd_mask_d16_inv_avx2(
    1951             :     uint8_t *mask, const CONV_BUF_TYPE *src0, int src0_stride,
    1952             :     const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) {
    1953    12237500 :     const int mask_base = 38;
    1954    12237500 :     const __m256i _r = _mm256_set1_epi16((1 << shift) >> 1);
    1955    24474900 :     const __m256i y38 = _mm256_set1_epi16(mask_base);
    1956    12237500 :     const __m256i y64 = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
    1957    12237500 :     int i = 0;
    1958    12237500 :     if (w == 4) {
    1959             :         do {
    1960           0 :             const __m128i s0A = xx_loadl_64(src0);
    1961           0 :             const __m128i s0B = xx_loadl_64(src0 + src0_stride);
    1962           0 :             const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2);
    1963           0 :             const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3);
    1964           0 :             const __m128i s1A = xx_loadl_64(src1);
    1965           0 :             const __m128i s1B = xx_loadl_64(src1 + src1_stride);
    1966           0 :             const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2);
    1967           0 :             const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3);
    1968           0 :             const __m256i s0 = yy_set_m128i(_mm_unpacklo_epi64(s0C, s0D),
    1969             :                 _mm_unpacklo_epi64(s0A, s0B));
    1970           0 :             const __m256i s1 = yy_set_m128i(_mm_unpacklo_epi64(s1C, s1D),
    1971             :                 _mm_unpacklo_epi64(s1A, s1B));
    1972             :             const __m256i m16 =
    1973           0 :                 calc_mask_d16_inv_avx2(&s0, &s1, &_r, &y38, &y64, shift);
    1974           0 :             const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256());
    1975           0 :             xx_storeu_128(mask,
    1976           0 :                 _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8)));
    1977           0 :             src0 += src0_stride << 2;
    1978           0 :             src1 += src1_stride << 2;
    1979           0 :             mask += 16;
    1980           0 :             i += 4;
    1981           0 :         } while (i < h);
    1982             :     }
    1983    12238700 :     else if (w == 8) {
    1984             :         do {
    1985    19382800 :             const __m256i s0AB = yy_loadu2_128(src0 + src0_stride, src0);
    1986    19374100 :             const __m256i s0CD =
    1987    19378600 :                 yy_loadu2_128(src0 + src0_stride * 3, src0 + src0_stride * 2);
    1988    19374100 :             const __m256i s1AB = yy_loadu2_128(src1 + src1_stride, src1);
    1989    19372200 :             const __m256i s1CD =
    1990    19372100 :                 yy_loadu2_128(src1 + src1_stride * 3, src1 + src1_stride * 2);
    1991             :             const __m256i m16AB =
    1992    19372200 :                 calc_mask_d16_inv_avx2(&s0AB, &s1AB, &_r, &y38, &y64, shift);
    1993             :             const __m256i m16CD =
    1994    19381300 :                 calc_mask_d16_inv_avx2(&s0CD, &s1CD, &_r, &y38, &y64, shift);
    1995    19382600 :             const __m256i m8 = _mm256_packus_epi16(m16AB, m16CD);
    1996    19382600 :             yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
    1997    19381000 :             src0 += src0_stride << 2;
    1998    19381000 :             src1 += src1_stride << 2;
    1999    19381000 :             mask += 32;
    2000    19381000 :             i += 4;
    2001    19381000 :         } while (i < h);
    2002             :     }
    2003     7126280 :     else if (w == 16) {
    2004             :         do {
    2005    40128200 :             const __m256i s0A = yy_loadu_256(src0);
    2006    40117000 :             const __m256i s0B = yy_loadu_256(src0 + src0_stride);
    2007    40103000 :             const __m256i s1A = yy_loadu_256(src1);
    2008    40095300 :             const __m256i s1B = yy_loadu_256(src1 + src1_stride);
    2009             :             const __m256i m16A =
    2010    40090600 :                 calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
    2011             :             const __m256i m16B =
    2012    40126200 :                 calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
    2013    40126000 :             const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
    2014    40126000 :             yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
    2015    40119700 :             src0 += src0_stride << 1;
    2016    40119700 :             src1 += src1_stride << 1;
    2017    40119700 :             mask += 32;
    2018    40119700 :             i += 2;
    2019    40119700 :         } while (i < h);
    2020             :     }
    2021     3194250 :     else if (w == 32) {
    2022             :         do {
    2023    53085200 :             const __m256i s0A = yy_loadu_256(src0);
    2024    53068600 :             const __m256i s0B = yy_loadu_256(src0 + 16);
    2025    53048600 :             const __m256i s1A = yy_loadu_256(src1);
    2026    53035300 :             const __m256i s1B = yy_loadu_256(src1 + 16);
    2027             :             const __m256i m16A =
    2028    53023400 :                 calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
    2029             :             const __m256i m16B =
    2030    53082200 :                 calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
    2031    53082400 :             const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
    2032    53082400 :             yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
    2033    53074400 :             src0 += src0_stride;
    2034    53074400 :             src1 += src1_stride;
    2035    53074400 :             mask += 32;
    2036    53074400 :             i += 1;
    2037    53074400 :         } while (i < h);
    2038             :     }
    2039      796986 :     else if (w == 64) {
    2040             :         do {
    2041    25452100 :             const __m256i s0A = yy_loadu_256(src0);
    2042    25446600 :             const __m256i s0B = yy_loadu_256(src0 + 16);
    2043    25439700 :             const __m256i s0C = yy_loadu_256(src0 + 32);
    2044    25432600 :             const __m256i s0D = yy_loadu_256(src0 + 48);
    2045    25424000 :             const __m256i s1A = yy_loadu_256(src1);
    2046    25424000 :             const __m256i s1B = yy_loadu_256(src1 + 16);
    2047    25423400 :             const __m256i s1C = yy_loadu_256(src1 + 32);
    2048    25423100 :             const __m256i s1D = yy_loadu_256(src1 + 48);
    2049             :             const __m256i m16A =
    2050    25422400 :                 calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
    2051             :             const __m256i m16B =
    2052    25459400 :                 calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
    2053             :             const __m256i m16C =
    2054    25461900 :                 calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
    2055             :             const __m256i m16D =
    2056    25464300 :                 calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
    2057    25464400 :             const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
    2058    25464400 :             const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
    2059    25464400 :             yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
    2060    25457600 :             yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
    2061    25448900 :             src0 += src0_stride;
    2062    25448900 :             src1 += src1_stride;
    2063    25448900 :             mask += 64;
    2064    25448900 :             i += 1;
    2065    25448900 :         } while (i < h);
    2066             :     }
    2067             :     else {
    2068             :         do {
    2069           0 :             const __m256i s0A = yy_loadu_256(src0);
    2070           0 :             const __m256i s0B = yy_loadu_256(src0 + 16);
    2071           0 :             const __m256i s0C = yy_loadu_256(src0 + 32);
    2072           0 :             const __m256i s0D = yy_loadu_256(src0 + 48);
    2073           0 :             const __m256i s0E = yy_loadu_256(src0 + 64);
    2074           0 :             const __m256i s0F = yy_loadu_256(src0 + 80);
    2075           0 :             const __m256i s0G = yy_loadu_256(src0 + 96);
    2076           0 :             const __m256i s0H = yy_loadu_256(src0 + 112);
    2077           0 :             const __m256i s1A = yy_loadu_256(src1);
    2078           0 :             const __m256i s1B = yy_loadu_256(src1 + 16);
    2079           0 :             const __m256i s1C = yy_loadu_256(src1 + 32);
    2080           0 :             const __m256i s1D = yy_loadu_256(src1 + 48);
    2081           0 :             const __m256i s1E = yy_loadu_256(src1 + 64);
    2082           0 :             const __m256i s1F = yy_loadu_256(src1 + 80);
    2083           0 :             const __m256i s1G = yy_loadu_256(src1 + 96);
    2084           0 :             const __m256i s1H = yy_loadu_256(src1 + 112);
    2085             :             const __m256i m16A =
    2086           0 :                 calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
    2087             :             const __m256i m16B =
    2088           0 :                 calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
    2089             :             const __m256i m16C =
    2090           0 :                 calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
    2091             :             const __m256i m16D =
    2092           0 :                 calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
    2093             :             const __m256i m16E =
    2094           0 :                 calc_mask_d16_inv_avx2(&s0E, &s1E, &_r, &y38, &y64, shift);
    2095             :             const __m256i m16F =
    2096           0 :                 calc_mask_d16_inv_avx2(&s0F, &s1F, &_r, &y38, &y64, shift);
    2097             :             const __m256i m16G =
    2098           0 :                 calc_mask_d16_inv_avx2(&s0G, &s1G, &_r, &y38, &y64, shift);
    2099             :             const __m256i m16H =
    2100           0 :                 calc_mask_d16_inv_avx2(&s0H, &s1H, &_r, &y38, &y64, shift);
    2101           0 :             const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
    2102           0 :             const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
    2103           0 :             const __m256i m8EF = _mm256_packus_epi16(m16E, m16F);
    2104           0 :             const __m256i m8GH = _mm256_packus_epi16(m16G, m16H);
    2105           0 :             yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
    2106           0 :             yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
    2107           0 :             yy_storeu_256(mask + 64, _mm256_permute4x64_epi64(m8EF, 0xd8));
    2108           0 :             yy_storeu_256(mask + 96, _mm256_permute4x64_epi64(m8GH, 0xd8));
    2109           0 :             src0 += src0_stride;
    2110           0 :             src1 += src1_stride;
    2111           0 :             mask += 128;
    2112           0 :             i += 1;
    2113           0 :         } while (i < h);
    2114             :     }
    2115    12214400 : }
    2116    26352300 : void av1_build_compound_diffwtd_mask_d16_avx2(
    2117             :     uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
    2118             :     int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
    2119             :     ConvolveParams *conv_params, int bd) {
    2120    26352300 :     const int shift =
    2121    26352300 :         2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
    2122             :     // Adding the rounding constant can overflow, but that much precision is
    2123             :     // not required. The code should also work for other values of
    2124             :     // DIFF_FACTOR_LOG2 and AOM_BLEND_A64_MAX_ALPHA, though corner-case bugs
    2125             :     // are possible there.
    2126             :     assert(DIFF_FACTOR_LOG2 == 4);
    2127             :     assert(AOM_BLEND_A64_MAX_ALPHA == 64);
    2128             : 
    2129    26352300 :     if (mask_type == DIFFWTD_38) {
    2130    14126700 :         build_compound_diffwtd_mask_d16_avx2(mask, src0, src0_stride, src1,
    2131             :             src1_stride, h, w, shift);
    2132             :     }
    2133             :     else {
    2134    12225600 :         build_compound_diffwtd_mask_d16_inv_avx2(mask, src0, src0_stride, src1,
    2135             :             src1_stride, h, w, shift);
    2136             :     }
    2137    26366700 : }
    2138             : 
    2139             : 
    2140             : #define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
    2141             : 
    2142             : /**
    2143             :  * See av1_wedge_sse_from_residuals_c
    2144             :  */
    2145   292732000 : uint64_t av1_wedge_sse_from_residuals_avx2(const int16_t *r1, const int16_t *d,
    2146             :     const uint8_t *m, int N) {
    2147   292732000 :     int n = -N;
    2148             : 
    2149             :     uint64_t csse;
    2150             : 
    2151   292732000 :     const __m256i v_mask_max_w = _mm256_set1_epi16(MAX_MASK_VALUE);
    2152   292732000 :     const __m256i v_zext_q = yy_set1_64_from_32i(0xffffffff);
    2153             : 
    2154   292890000 :     __m256i v_acc0_q = _mm256_setzero_si256();
    2155             : 
    2156   292890000 :     assert(N % 64 == 0);
    2157             : 
    2158   292890000 :     r1 += N;
    2159   292890000 :     d += N;
    2160   292890000 :     m += N;
    2161             : 
    2162             :     do {
    2163  4388270000 :         const __m256i v_r0_w = _mm256_lddqu_si256((__m256i *)(r1 + n));
    2164  4395960000 :         const __m256i v_d0_w = _mm256_lddqu_si256((__m256i *)(d + n));
    2165  8781470000 :         const __m128i v_m01_b = _mm_lddqu_si128((__m128i *)(m + n));
    2166             : 
    2167  4388170000 :         const __m256i v_rd0l_w = _mm256_unpacklo_epi16(v_d0_w, v_r0_w);
    2168  4388170000 :         const __m256i v_rd0h_w = _mm256_unpackhi_epi16(v_d0_w, v_r0_w);
    2169  4388170000 :         const __m256i v_m0_w = _mm256_cvtepu8_epi16(v_m01_b);
    2170             : 
    2171  4388170000 :         const __m256i v_m0l_w = _mm256_unpacklo_epi16(v_m0_w, v_mask_max_w);
    2172  4388170000 :         const __m256i v_m0h_w = _mm256_unpackhi_epi16(v_m0_w, v_mask_max_w);
    2173             : 
    2174  4388170000 :         const __m256i v_t0l_d = _mm256_madd_epi16(v_rd0l_w, v_m0l_w);
    2175  4388170000 :         const __m256i v_t0h_d = _mm256_madd_epi16(v_rd0h_w, v_m0h_w);
    2176             : 
    2177  4388170000 :         const __m256i v_t0_w = _mm256_packs_epi32(v_t0l_d, v_t0h_d);
    2178             : 
    2179  4388170000 :         const __m256i v_sq0_d = _mm256_madd_epi16(v_t0_w, v_t0_w);
    2180             : 
    2181 13164500000 :         const __m256i v_sum0_q = _mm256_add_epi64(
    2182             :             _mm256_and_si256(v_sq0_d, v_zext_q), _mm256_srli_epi64(v_sq0_d, 32));
    2183             : 
    2184  4388170000 :         v_acc0_q = _mm256_add_epi64(v_acc0_q, v_sum0_q);
    2185             : 
    2186  4388170000 :         n += 16;
    2187  4388170000 :     } while (n);
    2188             : 
    2189   585577000 :     v_acc0_q = _mm256_add_epi64(v_acc0_q, _mm256_srli_si256(v_acc0_q, 8));
    2190   292788000 :     __m128i v_acc_q_0 = _mm256_castsi256_si128(v_acc0_q);
    2191   292788000 :     __m128i v_acc_q_1 = _mm256_extracti128_si256(v_acc0_q, 1);
    2192   292788000 :     v_acc_q_0 = _mm_add_epi64(v_acc_q_0, v_acc_q_1);
    2193             : #if ARCH_X86_64
    2194             :     csse = (uint64_t)_mm_extract_epi64(v_acc_q_0, 0);
    2195             : #else
    2196   292788000 :     xx_storel_64(&csse, v_acc_q_0);
    2197             : #endif
    2198             : 
    2199   292216000 :     return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
    2200             : }
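Per sample, the loop above accumulates the square of the int16-saturated value MAX_MASK_VALUE * r1[i] + m[i] * d[i]. A minimal scalar sketch of that accumulation follows; the function name is illustrative, and it reuses the MAX_MASK_VALUE, WEDGE_WEIGHT_BITS and ROUND_POWER_OF_TWO macros already referenced above.

    static uint64_t wedge_sse_scalar(const int16_t *r1, const int16_t *d,
                                     const uint8_t *m, int N) {
        uint64_t csse = 0;
        for (int i = 0; i < N; ++i) {
            int32_t t = MAX_MASK_VALUE * r1[i] + m[i] * d[i];
            /* _mm256_packs_epi32 in the vector path saturates to int16. */
            if (t > 32767) t = 32767;
            if (t < -32768) t = -32768;
            csse += (uint64_t)((int64_t)t * t);
        }
        return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
    }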
    2201             : 
    2202             : 
    2203   458849000 : static INLINE void subtract32_avx2(int16_t *diff_ptr, const uint8_t *src_ptr,
    2204             :     const uint8_t *pred_ptr) {
    2205   458886000 :     __m256i s = _mm256_lddqu_si256((__m256i *)(src_ptr));
    2206   458884000 :     __m256i p = _mm256_lddqu_si256((__m256i *)(pred_ptr));
    2207   458884000 :     __m256i s_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s));
    2208   917768000 :     __m256i s_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s, 1));
    2209   458884000 :     __m256i p_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(p));
    2210   917768000 :     __m256i p_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(p, 1));
    2211   458884000 :     const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
    2212   458884000 :     const __m256i d_1 = _mm256_sub_epi16(s_1, p_1);
    2213             :     _mm256_storeu_si256((__m256i *)(diff_ptr), d_0);
    2214   458884000 :     _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d_1);
    2215   458884000 : }
    2216             : 
    2217             : 
    2218    29982100 : static INLINE void aom_subtract_block_16xn_avx2(
    2219             :     int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
    2220             :     ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
    2221   546887000 :     for (int32_t j = 0; j < rows; ++j) {
    2222   516815000 :         __m128i s = _mm_lddqu_si128((__m128i *)(src_ptr));
    2223   516905000 :         __m128i p = _mm_lddqu_si128((__m128i *)(pred_ptr));
    2224   516905000 :         __m256i s_0 = _mm256_cvtepu8_epi16(s);
    2225   516905000 :         __m256i p_0 = _mm256_cvtepu8_epi16(p);
    2226   516905000 :         const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
    2227             :         _mm256_storeu_si256((__m256i *)(diff_ptr), d_0);
    2228   516905000 :         src_ptr += src_stride;
    2229   516905000 :         pred_ptr += pred_stride;
    2230   516905000 :         diff_ptr += diff_stride;
    2231             :     }
    2232    30160900 : }
    2233             : 
    2234    12979000 : static INLINE void aom_subtract_block_32xn_avx2(
    2235             :     int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
    2236             :     ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
    2237   311854000 :     for (int32_t j = 0; j < rows; ++j) {
    2238   298992000 :         subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
    2239   298875000 :         src_ptr += src_stride;
    2240   298875000 :         pred_ptr += pred_stride;
    2241   298875000 :         diff_ptr += diff_stride;
    2242             :     }
    2243    12862300 : }
    2244     2476930 : static INLINE void aom_subtract_block_64xn_avx2(
    2245             :     int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
    2246             :     ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
    2247    83529900 :     for (int32_t j = 0; j < rows; ++j) {
    2248    81052800 :         subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
    2249    81052700 :         subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32);
    2250    81052900 :         src_ptr += src_stride;
    2251    81052900 :         pred_ptr += pred_stride;
    2252    81052900 :         diff_ptr += diff_stride;
    2253             :     }
    2254     2477080 : }
    2255           0 : static INLINE void aom_subtract_block_128xn_avx2(
    2256             :     int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
    2257             :     ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
    2258           0 :     for (int32_t j = 0; j < rows; ++j) {
    2259           0 :         subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
    2260           0 :         subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32);
    2261           0 :         subtract32_avx2(diff_ptr + 64, src_ptr + 64, pred_ptr + 64);
    2262           0 :         subtract32_avx2(diff_ptr + 96, src_ptr + 96, pred_ptr + 96);
    2263           0 :         src_ptr += src_stride;
    2264           0 :         pred_ptr += pred_stride;
    2265           0 :         diff_ptr += diff_stride;
    2266             :     }
    2267           0 : }
    2268    83024600 : void aom_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr,
    2269             :     ptrdiff_t diff_stride, const uint8_t *src_ptr,
    2270             :     ptrdiff_t src_stride, const uint8_t *pred_ptr,
    2271             :     ptrdiff_t pred_stride) {
    2272    83024600 :     switch (cols) {
    2273    29986600 :     case 16:
    2274    29986600 :         aom_subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
    2275             :             src_stride, pred_ptr, pred_stride);
    2276    29999000 :         break;
    2277    12979300 :     case 32:
    2278    12979300 :         aom_subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
    2279             :             src_stride, pred_ptr, pred_stride);
    2280    12981800 :         break;
    2281     2476940 :     case 64:
    2282     2476940 :         aom_subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
    2283             :             src_stride, pred_ptr, pred_stride);
    2284     2477060 :         break;
    2285           0 :     case 128:
    2286           0 :         aom_subtract_block_128xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
    2287             :             src_stride, pred_ptr, pred_stride);
    2288           0 :         break;
    2289    37581700 :     default:
    2290    37581700 :         eb_aom_subtract_block_sse2(rows, cols, diff_ptr, diff_stride, src_ptr,
    2291             :             src_stride, pred_ptr, pred_stride);
    2292    37750600 :         break;
    2293             :     }
    2294    83208500 : }
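All of the width-specialized subtract helpers above compute the same element-wise residual src - pred; a minimal scalar sketch for an arbitrary block size (name illustrative):

    static void subtract_block_scalar(int rows, int cols, int16_t *diff,
                                      ptrdiff_t diff_stride, const uint8_t *src,
                                      ptrdiff_t src_stride, const uint8_t *pred,
                                      ptrdiff_t pred_stride) {
        for (int r = 0; r < rows; ++r) {
            for (int c = 0; c < cols; ++c)
                diff[c] = (int16_t)((int)src[c] - (int)pred[c]);
            diff += diff_stride;
            src  += src_stride;
            pred += pred_stride;
        }
    }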
    2295             : 
    2296             : 
    2297             : 
    2298           0 : static INLINE void sse_w4x4_avx2(const uint8_t *a, int a_stride,
    2299             :     const uint8_t *b, int b_stride, __m256i *sum) {
    2300           0 :     const __m128i v_a0 = xx_loadl_32(a);
    2301           0 :     const __m128i v_a1 = xx_loadl_32(a + a_stride);
    2302           0 :     const __m128i v_a2 = xx_loadl_32(a + a_stride * 2);
    2303           0 :     const __m128i v_a3 = xx_loadl_32(a + a_stride * 3);
    2304           0 :     const __m128i v_b0 = xx_loadl_32(b);
    2305           0 :     const __m128i v_b1 = xx_loadl_32(b + b_stride);
    2306           0 :     const __m128i v_b2 = xx_loadl_32(b + b_stride * 2);
    2307           0 :     const __m128i v_b3 = xx_loadl_32(b + b_stride * 3);
    2308           0 :     const __m128i v_a0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_a0, v_a1),
    2309             :         _mm_unpacklo_epi32(v_a2, v_a3));
    2310           0 :     const __m128i v_b0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_b0, v_b1),
    2311             :         _mm_unpacklo_epi32(v_b2, v_b3));
    2312           0 :     const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0123);
    2313           0 :     const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0123);
    2314           0 :     const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
    2315           0 :     *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
    2316           0 : }
    2317             : 
    2318             : 
    2319    50679100 : static INLINE void sse_w8x2_avx2(const uint8_t *a, int a_stride,
    2320             :     const uint8_t *b, int b_stride, __m256i *sum) {
    2321    50679100 :     const __m128i v_a0 = xx_loadl_64(a);
    2322    50676400 :     const __m128i v_a1 = xx_loadl_64(a + a_stride);
    2323    50671200 :     const __m128i v_b0 = xx_loadl_64(b);
    2324    50668700 :     const __m128i v_b1 = xx_loadl_64(b + b_stride);
    2325   101339000 :     const __m256i v_a_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_a0, v_a1));
    2326   101339000 :     const __m256i v_b_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_b0, v_b1));
    2327    50669500 :     const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
    2328    50669500 :     *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
    2329    50669500 : }
    2330             : 
    2331    52427600 : static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a,
    2332             :     const uint8_t *b) {
    2333    52427600 :     const __m256i v_a0 = yy_loadu_256(a);
    2334    52422500 :     const __m256i v_b0 = yy_loadu_256(b);
    2335    52416100 :     const __m256i zero = _mm256_setzero_si256();
    2336    52416100 :     const __m256i v_a00_w = _mm256_unpacklo_epi8(v_a0, zero);
    2337    52416100 :     const __m256i v_a01_w = _mm256_unpackhi_epi8(v_a0, zero);
    2338    52416100 :     const __m256i v_b00_w = _mm256_unpacklo_epi8(v_b0, zero);
    2339    52416100 :     const __m256i v_b01_w = _mm256_unpackhi_epi8(v_b0, zero);
    2340    52416100 :     const __m256i v_d00_w = _mm256_sub_epi16(v_a00_w, v_b00_w);
    2341    52416100 :     const __m256i v_d01_w = _mm256_sub_epi16(v_a01_w, v_b01_w);
    2342   104832000 :     *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d00_w, v_d00_w));
    2343    52416100 :     *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d01_w, v_d01_w));
    2344    52416100 : }
    2345             : 
    2346    17811100 : static INLINE int64_t summary_all_avx2(const __m256i *sum_all) {
    2347             :     int64_t sum;
    2348    17811100 :     __m256i zero = _mm256_setzero_si256();
    2349    17811100 :     const __m256i sum0_4x64 = _mm256_unpacklo_epi32(*sum_all, zero);
    2350    35622100 :     const __m256i sum1_4x64 = _mm256_unpackhi_epi32(*sum_all, zero);
    2351    17811100 :     const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64);
    2352    17811100 :     const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64),
    2353    17811100 :         _mm256_extracti128_si256(sum_4x64, 1));
    2354    17811100 :     const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
    2355    17811100 :     xx_storel_64(&sum, sum_1x64);
    2356    17811000 :     return sum;
    2357             : }
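summary_all_avx2 zero-extends the eight 32-bit partial sums to 64 bits before adding them, so the lanes are effectively treated as unsigned. A minimal scalar sketch of that reduction (name illustrative):

    static int64_t reduce_sum_u32x8(const uint32_t lanes[8]) {
        int64_t sum = 0;
        for (int i = 0; i < 8; ++i)
            sum += lanes[i];    /* each lane zero-extended, as in the unpacks above */
        return sum;
    }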
    2358    17809400 : int64_t aom_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b,
    2359             :     int b_stride, int width, int height) {
    2360    17809400 :     int32_t y = 0;
    2361    17809400 :     int64_t sse = 0;
    2362    17809400 :     __m256i sum = _mm256_setzero_si256();
    2363    17809400 :     __m256i zero = _mm256_setzero_si256();
    2364    17809400 :     switch (width) {
    2365           0 :     case 4:
    2366             :         do {
    2367           0 :             sse_w4x4_avx2(a, a_stride, b, b_stride, &sum);
    2368           0 :             a += a_stride << 2;
    2369           0 :             b += b_stride << 2;
    2370           0 :             y += 4;
    2371           0 :         } while (y < height);
    2372           0 :         sse = summary_all_avx2(&sum);
    2373           0 :         break;
    2374    41940800 :     case 8:
    2375             :         do {
    2376    50676700 :             sse_w8x2_avx2(a, a_stride, b, b_stride, &sum);
    2377    50677100 :             a += a_stride << 1;
    2378    50677100 :             b += b_stride << 1;
    2379    50677100 :             y += 2;
    2380    50677100 :         } while (y < height);
    2381     8736220 :         sse = summary_all_avx2(&sum);
    2382     8736400 :         break;
    2383    45555500 :     case 16:
    2384             :         do {
    2385    52373900 :             const __m128i v_a0 = xx_loadu_128(a);
    2386    52388400 :             const __m128i v_a1 = xx_loadu_128(a + a_stride);
    2387    52384500 :             const __m128i v_b0 = xx_loadu_128(b);
    2388    52377600 :             const __m128i v_b1 = xx_loadu_128(b + b_stride);
    2389    52374300 :             const __m256i v_a =
    2390    52374300 :                 _mm256_insertf128_si256(_mm256_castsi128_si256(v_a0), v_a1, 0x01);
    2391    52374300 :             const __m256i v_b =
    2392   104749000 :                 _mm256_insertf128_si256(_mm256_castsi128_si256(v_b0), v_b1, 0x01);
    2393    52374300 :             const __m256i v_al = _mm256_unpacklo_epi8(v_a, zero);
    2394    52374300 :             const __m256i v_au = _mm256_unpackhi_epi8(v_a, zero);
    2395    52374300 :             const __m256i v_bl = _mm256_unpacklo_epi8(v_b, zero);
    2396    52374300 :             const __m256i v_bu = _mm256_unpackhi_epi8(v_b, zero);
    2397    52374300 :             const __m256i v_asub = _mm256_sub_epi16(v_al, v_bl);
    2398    52374300 :             const __m256i v_bsub = _mm256_sub_epi16(v_au, v_bu);
    2399             :             const __m256i temp =
    2400   104749000 :                 _mm256_add_epi32(_mm256_madd_epi16(v_asub, v_asub),
    2401             :                     _mm256_madd_epi16(v_bsub, v_bsub));
    2402    52374300 :             sum = _mm256_add_epi32(sum, temp);
    2403    52374300 :             a += a_stride << 1;
    2404    52374300 :             b += b_stride << 1;
    2405    52374300 :             y += 2;
    2406    52374300 :         } while (y < height);
    2407     6818770 :         sse = summary_all_avx2(&sum);
    2408     6818750 :         break;
    2409    50174400 :     case 32:
    2410             :         do {
    2411    52433200 :             sse_w32_avx2(&sum, a, b);
    2412    52433200 :             a += a_stride;
    2413    52433200 :             b += b_stride;
    2414    52433200 :             y += 1;
    2415    52433200 :         } while (y < height);
    2416     2258840 :         sse = summary_all_avx2(&sum);
    2417     2258840 :         break;
    2418           0 :     case 64:
    2419             :         do {
    2420           0 :             sse_w32_avx2(&sum, a, b);
    2421           0 :             sse_w32_avx2(&sum, a + 32, b + 32);
    2422           0 :             a += a_stride;
    2423           0 :             b += b_stride;
    2424           0 :             y += 1;
    2425           0 :         } while (y < height);
    2426           0 :         sse = summary_all_avx2(&sum);
    2427           0 :         break;
    2428           0 :     case 128:
    2429             :         do {
    2430           0 :             sse_w32_avx2(&sum, a, b);
    2431           0 :             sse_w32_avx2(&sum, a + 32, b + 32);
    2432           0 :             sse_w32_avx2(&sum, a + 64, b + 64);
    2433           0 :             sse_w32_avx2(&sum, a + 96, b + 96);
    2434           0 :             a += a_stride;
    2435           0 :             b += b_stride;
    2436           0 :             y += 1;
    2437           0 :         } while (y < height);
    2438           0 :         sse = summary_all_avx2(&sum);
    2439           0 :         break;
    2440           0 :     default:
    2441           0 :         if ((width & 0x07) == 0) {
    2442             :             do {
    2443           0 :                 int i = 0;
    2444             :                 do {
    2445           0 :                     sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum);
    2446           0 :                     i += 8;
    2447           0 :                 } while (i < width);
    2448           0 :                 a += a_stride << 1;
    2449           0 :                 b += b_stride << 1;
    2450           0 :                 y += 2;
    2451           0 :             } while (y < height);
    2452             :         }
    2453             :         else {
    2454             :             do {
    2455           0 :                 int i = 0;
    2456             :                 do {
    2457           0 :                     sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum);
    2458           0 :                     const uint8_t *a2 = a + i + (a_stride << 1);
    2459           0 :                     const uint8_t *b2 = b + i + (b_stride << 1);
    2460           0 :                     sse_w8x2_avx2(a2, a_stride, b2, b_stride, &sum);
    2461           0 :                     i += 8;
    2462           0 :                 } while (i + 4 < width);
    2463           0 :                 sse_w4x4_avx2(a + i, a_stride, b + i, b_stride, &sum);
    2464           0 :                 a += a_stride << 2;
    2465           0 :                 b += b_stride << 2;
    2466           0 :                 y += 4;
    2467           0 :             } while (y < height);
    2468             :         }
    2469           0 :         sse = summary_all_avx2(&sum);
    2470           0 :         break;
    2471             :     }
    2472             : 
    2473    17814000 :     return sse;
    2474             : }
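Every width case above reduces to the same sum of squared 8-bit pixel differences; a minimal scalar sketch (name illustrative):

    static int64_t sse_scalar(const uint8_t *a, int a_stride,
                              const uint8_t *b, int b_stride,
                              int width, int height) {
        int64_t sse = 0;
        for (int y = 0; y < height; ++y) {
            for (int x = 0; x < width; ++x) {
                const int d = (int)a[x] - (int)b[x];
                sse += d * d;
            }
            a += a_stride;
            b += b_stride;
        }
        return sse;
    }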
    2475             : 
    2476             : 
    2477    23060400 : static INLINE uint64_t xx_cvtsi128_si64(__m128i a) {
    2478             : #if ARCH_X86_64
    2479             :     return (uint64_t)_mm_cvtsi128_si64(a);
    2480             : #else
    2481             :     {
    2482             :         uint64_t tmp;
    2483    23060400 :         _mm_storel_epi64((__m128i *)&tmp, a);
    2484    23060400 :         return tmp;
    2485             :     }
    2486             : #endif
    2487             : 
    2488             : }
    2489    23051000 : static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) {
    2490    23051000 :     const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff);
    2491    23049100 :     __m128i v_acc0_q = _mm_setzero_si128();
    2492    23049100 :     __m128i v_acc1_q = _mm_setzero_si128();
    2493             : 
    2494    23049100 :     const int16_t *const end = src + n;
    2495             : 
    2496    23049100 :     assert(n % 64 == 0);
    2497             : 
    2498   107794000 :     while (src < end) {
    2499    84908900 :         const __m128i v_val_0_w = xx_load_128(src);
    2500    84893000 :         const __m128i v_val_1_w = xx_load_128(src + 8);
    2501    84857600 :         const __m128i v_val_2_w = xx_load_128(src + 16);
    2502    84817500 :         const __m128i v_val_3_w = xx_load_128(src + 24);
    2503    84783600 :         const __m128i v_val_4_w = xx_load_128(src + 32);
    2504    84763000 :         const __m128i v_val_5_w = xx_load_128(src + 40);
    2505    84753100 :         const __m128i v_val_6_w = xx_load_128(src + 48);
    2506    84744900 :         const __m128i v_val_7_w = xx_load_128(src + 56);
    2507             : 
    2508    84744500 :         const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
    2509    84744500 :         const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
    2510    84744500 :         const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
    2511    84744500 :         const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
    2512    84744500 :         const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w);
    2513    84744500 :         const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w);
    2514    84744500 :         const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w);
    2515    84744500 :         const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w);
    2516             : 
    2517    84744500 :         const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
    2518    84744500 :         const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
    2519    84744500 :         const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d);
    2520    84744500 :         const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d);
    2521             : 
    2522    84744500 :         const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
    2523    84744500 :         const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d);
    2524             : 
    2525    84744500 :         const __m128i v_sum_d = _mm_add_epi32(v_sum_0123_d, v_sum_4567_d);
    2526             : 
    2527   169489000 :         v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_and_si128(v_sum_d, v_zext_mask_q));
    2528    84744500 :         v_acc1_q = _mm_add_epi64(v_acc1_q, _mm_srli_epi64(v_sum_d, 32));
    2529             : 
    2530    84744500 :         src += 64;
    2531             :     }
    2532             : 
    2533    22884700 :     v_acc0_q = _mm_add_epi64(v_acc0_q, v_acc1_q);
    2534    22884700 :     v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8));
    2535    22884700 :     return xx_cvtsi128_si64(v_acc0_q);
    2536             : }
    2537             : 
    2538    23051400 : uint64_t aom_sum_squares_i16_sse2(const int16_t *src, uint32_t n) {
    2539    23051400 :     if (n % 64 == 0) {
    2540    23051800 :         return aom_sum_squares_i16_64n_sse2(src, n);
    2541             :     }
    2542           0 :     else if (n > 64) {
    2543           0 :         int k = n & ~(64 - 1);
    2544           0 :         return aom_sum_squares_i16_64n_sse2(src, k) +
    2545           0 :             aom_sum_squares_i16_c(src + k, n - k);
    2546             :     }
    2547             :     else {
    2548           0 :         return aom_sum_squares_i16_c(src, n);
    2549             :     }
    2550             : }
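The unrolled kernel above computes a plain sum of squares over blocks of 64 samples; a minimal scalar sketch of that sum (name illustrative, assumed to match the aom_sum_squares_i16_c fallback used for the tail):

    static uint64_t sum_squares_i16_scalar(const int16_t *src, uint32_t n) {
        uint64_t ss = 0;
        for (uint32_t i = 0; i < n; ++i)
            ss += (uint64_t)((int64_t)src[i] * src[i]);
        return ss;
    }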
    2551             : 
    2552             : 
    2553             : 
    2554             : 
    2555             : /**
    2556             :  * See av1_wedge_sign_from_residuals_c
    2557             :  */
    2558   183596000 : int8_t av1_wedge_sign_from_residuals_avx2(const int16_t *ds, const uint8_t *m,
    2559             :     int N, int64_t limit) {
    2560             :     int64_t acc;
    2561   183596000 :     __m256i v_acc0_d = _mm256_setzero_si256();
    2562             : 
    2563             :     // Input size is limited to 8192 by the use of 32-bit accumulators and by
    2564             :     // m lying in [0, 64]. Overflow might occur at larger sizes, though that is
    2565             :     // practically impossible on real video input.
    2566   183596000 :     assert(N < 8192);
    2567   183596000 :     assert(N % 64 == 0);
    2568             : 
    2569             :     do {
    2570   673061000 :         const __m256i v_m01_b = _mm256_lddqu_si256((__m256i *)(m));
    2571  1345000000 :         const __m256i v_m23_b = _mm256_lddqu_si256((__m256i *)(m + 32));
    2572             : 
    2573   671958000 :         const __m256i v_d0_w = _mm256_lddqu_si256((__m256i *)(ds));
    2574   671958000 :         const __m256i v_d1_w = _mm256_lddqu_si256((__m256i *)(ds + 16));
    2575   671816000 :         const __m256i v_d2_w = _mm256_lddqu_si256((__m256i *)(ds + 32));
    2576  1342960000 :         const __m256i v_d3_w = _mm256_lddqu_si256((__m256i *)(ds + 48));
    2577             : 
    2578             :         const __m256i v_m0_w =
    2579   671603000 :             _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_m01_b));
    2580             :         const __m256i v_m1_w =
    2581  1343210000 :             _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_m01_b, 1));
    2582             :         const __m256i v_m2_w =
    2583   671603000 :             _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_m23_b));
    2584             :         const __m256i v_m3_w =
    2585  1343210000 :             _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_m23_b, 1));
    2586             : 
    2587   671603000 :         const __m256i v_p0_d = _mm256_madd_epi16(v_d0_w, v_m0_w);
    2588   671603000 :         const __m256i v_p1_d = _mm256_madd_epi16(v_d1_w, v_m1_w);
    2589   671603000 :         const __m256i v_p2_d = _mm256_madd_epi16(v_d2_w, v_m2_w);
    2590   671603000 :         const __m256i v_p3_d = _mm256_madd_epi16(v_d3_w, v_m3_w);
    2591             : 
    2592   671603000 :         const __m256i v_p01_d = _mm256_add_epi32(v_p0_d, v_p1_d);
    2593   671603000 :         const __m256i v_p23_d = _mm256_add_epi32(v_p2_d, v_p3_d);
    2594             : 
    2595   671603000 :         const __m256i v_p0123_d = _mm256_add_epi32(v_p01_d, v_p23_d);
    2596             : 
    2597   671603000 :         v_acc0_d = _mm256_add_epi32(v_acc0_d, v_p0123_d);
    2598             : 
    2599   671603000 :         ds += 64;
    2600   671603000 :         m += 64;
    2601             : 
    2602   671603000 :         N -= 64;
    2603   671603000 :     } while (N);
    2604             : 
    2605   183873000 :     __m256i v_sign_d = _mm256_srai_epi32(v_acc0_d, 31);
    2606   367745000 :     v_acc0_d = _mm256_add_epi64(_mm256_unpacklo_epi32(v_acc0_d, v_sign_d),
    2607             :         _mm256_unpackhi_epi32(v_acc0_d, v_sign_d));
    2608             : 
    2609   367745000 :     __m256i v_acc_q = _mm256_add_epi64(v_acc0_d, _mm256_srli_si256(v_acc0_d, 8));
    2610             : 
    2611   183873000 :     __m128i v_acc_q_0 = _mm256_castsi256_si128(v_acc_q);
    2612   183873000 :     __m128i v_acc_q_1 = _mm256_extracti128_si256(v_acc_q, 1);
    2613   183873000 :     v_acc_q_0 = _mm_add_epi64(v_acc_q_0, v_acc_q_1);
    2614             : 
    2615             : #if ARCH_X86_64
    2616             :     acc = (uint64_t)_mm_extract_epi64(v_acc_q_0, 0);
    2617             : #else
    2618   183873000 :     xx_storel_64(&acc, v_acc_q_0);
    2619             : #endif
    2620             : 
    2621   183805000 :     return acc > limit;
    2622             : }
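The kernel above accumulates the dot product of the delta-squares with the mask and compares it against limit; a minimal scalar sketch (name illustrative):

    static int8_t wedge_sign_scalar(const int16_t *ds, const uint8_t *m,
                                    int N, int64_t limit) {
        int64_t acc = 0;
        for (int i = 0; i < N; ++i)
            acc += (int64_t)ds[i] * m[i];
        return acc > limit;
    }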
    2623             : 
    2624             : 
    2625             : /**
    2626             :  * See av1_wedge_compute_delta_squares_c
    2627             :  */
    2628    11531200 : void av1_wedge_compute_delta_squares_avx2(int16_t *d, const int16_t *a,
    2629             :     const int16_t *b, int N) {
    2630    11531200 :     const __m256i v_neg_w = _mm256_set1_epi32(0xffff0001);
    2631             : 
    2632    11531200 :     assert(N % 64 == 0);
    2633             : 
    2634             :     do {
    2635    42469800 :         const __m256i v_a0_w = _mm256_lddqu_si256((__m256i *)(a));
    2636    42470000 :         const __m256i v_b0_w = _mm256_lddqu_si256((__m256i *)(b));
    2637    42470000 :         const __m256i v_a1_w = _mm256_lddqu_si256((__m256i *)(a + 16));
    2638    42468100 :         const __m256i v_b1_w = _mm256_lddqu_si256((__m256i *)(b + 16));
    2639    42466000 :         const __m256i v_a2_w = _mm256_lddqu_si256((__m256i *)(a + 32));
    2640    42465400 :         const __m256i v_b2_w = _mm256_lddqu_si256((__m256i *)(b + 32));
    2641    42466500 :         const __m256i v_a3_w = _mm256_lddqu_si256((__m256i *)(a + 48));
    2642    84934500 :         const __m256i v_b3_w = _mm256_lddqu_si256((__m256i *)(b + 48));
    2643             : 
    2644    42465700 :         const __m256i v_ab0l_w = _mm256_unpacklo_epi16(v_a0_w, v_b0_w);
    2645    42465700 :         const __m256i v_ab0h_w = _mm256_unpackhi_epi16(v_a0_w, v_b0_w);
    2646    42465700 :         const __m256i v_ab1l_w = _mm256_unpacklo_epi16(v_a1_w, v_b1_w);
    2647    42465700 :         const __m256i v_ab1h_w = _mm256_unpackhi_epi16(v_a1_w, v_b1_w);
    2648    42465700 :         const __m256i v_ab2l_w = _mm256_unpacklo_epi16(v_a2_w, v_b2_w);
    2649    42465700 :         const __m256i v_ab2h_w = _mm256_unpackhi_epi16(v_a2_w, v_b2_w);
    2650    42465700 :         const __m256i v_ab3l_w = _mm256_unpacklo_epi16(v_a3_w, v_b3_w);
    2651    42465700 :         const __m256i v_ab3h_w = _mm256_unpackhi_epi16(v_a3_w, v_b3_w);
    2652             : 
    2653             :         // Negate the high (b) word of each pair so madd yields a*a - b*b.
    2654    42465700 :         const __m256i v_abl0n_w = _mm256_sign_epi16(v_ab0l_w, v_neg_w);
    2655    42465700 :         const __m256i v_abh0n_w = _mm256_sign_epi16(v_ab0h_w, v_neg_w);
    2656    42465700 :         const __m256i v_abl1n_w = _mm256_sign_epi16(v_ab1l_w, v_neg_w);
    2657    42465700 :         const __m256i v_abh1n_w = _mm256_sign_epi16(v_ab1h_w, v_neg_w);
    2658    42465700 :         const __m256i v_abl2n_w = _mm256_sign_epi16(v_ab2l_w, v_neg_w);
    2659    42465700 :         const __m256i v_abh2n_w = _mm256_sign_epi16(v_ab2h_w, v_neg_w);
    2660    42465700 :         const __m256i v_abl3n_w = _mm256_sign_epi16(v_ab3l_w, v_neg_w);
    2661    42465700 :         const __m256i v_abh3n_w = _mm256_sign_epi16(v_ab3h_w, v_neg_w);
    2662             : 
    2663    42465700 :         const __m256i v_r0l_w = _mm256_madd_epi16(v_ab0l_w, v_abl0n_w);
    2664    42465700 :         const __m256i v_r0h_w = _mm256_madd_epi16(v_ab0h_w, v_abh0n_w);
    2665    42465700 :         const __m256i v_r1l_w = _mm256_madd_epi16(v_ab1l_w, v_abl1n_w);
    2666    42465700 :         const __m256i v_r1h_w = _mm256_madd_epi16(v_ab1h_w, v_abh1n_w);
    2667    42465700 :         const __m256i v_r2l_w = _mm256_madd_epi16(v_ab2l_w, v_abl2n_w);
    2668    42465700 :         const __m256i v_r2h_w = _mm256_madd_epi16(v_ab2h_w, v_abh2n_w);
    2669    42465700 :         const __m256i v_r3l_w = _mm256_madd_epi16(v_ab3l_w, v_abl3n_w);
    2670    42465700 :         const __m256i v_r3h_w = _mm256_madd_epi16(v_ab3h_w, v_abh3n_w);
    2671             : 
    2672    42465700 :         const __m256i v_r0_w = _mm256_packs_epi32(v_r0l_w, v_r0h_w);
    2673    42465700 :         const __m256i v_r1_w = _mm256_packs_epi32(v_r1l_w, v_r1h_w);
    2674    42465700 :         const __m256i v_r2_w = _mm256_packs_epi32(v_r2l_w, v_r2h_w);
    2675    42465700 :         const __m256i v_r3_w = _mm256_packs_epi32(v_r3l_w, v_r3h_w);
    2676             : 
    2677             :         _mm256_storeu_si256((__m256i *)(d), v_r0_w);
    2678    42465700 :         _mm256_storeu_si256((__m256i *)(d + 16), v_r1_w);
    2679    42465700 :         _mm256_storeu_si256((__m256i *)(d + 32), v_r2_w);
    2680    42465700 :         _mm256_storeu_si256((__m256i *)(d + 48), v_r3_w);
    2681             : 
    2682    42465700 :         a += 64;
    2683    42465700 :         b += 64;
    2684    42465700 :         d += 64;
    2685    42465700 :         N -= 64;
    2686    42465700 :     } while (N);
    2687    11531200 : }
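With the sign trick above (the low a word of each pair is kept, the high b word negated), each madd produces a[i]*a[i] - b[i]*b[i], which the pack then saturates to int16. A minimal scalar sketch (name illustrative):

    static void wedge_delta_squares_scalar(int16_t *d, const int16_t *a,
                                           const int16_t *b, int N) {
        for (int i = 0; i < N; ++i) {
            int32_t t = (int32_t)a[i] * a[i] - (int32_t)b[i] * b[i];
            if (t > 32767) t = 32767;       /* _mm256_packs_epi32 saturation */
            if (t < -32768) t = -32768;
            d[i] = (int16_t)t;
        }
    }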

Generated by: LCOV version 1.14