LCOV - code coverage report
Current view: top level - ASM_AVX2 - jnt_convolve_avx2.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 1340 1723 77.8 %
Date: 2019-11-25 17:38:06 Functions: 23 23 100.0 %

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2018, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include <immintrin.h>
      13             : #include "aom_dsp_rtcd.h"
      14             : #include "convolve.h"
      15             : #include "convolve_avx2.h"
      16             : #include "EbDefinitions.h"
      17             : #include "EbMemory_SSE4_1.h"
      18             : 
      19             : SIMD_INLINE void jnt_y_comp_avg_2tap_32_avx2(
      20             :     const uint8_t *const src, const __m256i *const coeffs, const __m256i factor,
      21             :     const __m256i offset, const __m256i s0, __m256i *const s1,
      22             :     ConvBufType *const dst, uint8_t *const dst8) {
      23             :     __m256i r[2];
      24             : 
      25    22440900 :     y_convolve_2tap_32_avx2(src, coeffs, s0, s1, r);
      26             :     jnt_comp_avg_round_store_32_avx2(r, factor, offset, dst, dst8);
      27    22440700 : }
      28             : 
      29    22427700 : static INLINE void jnt_y_avg_2tap_32_avx2(const uint8_t *const src,
      30             :     const __m256i *const coeffs,
      31             :     const __m256i offset,
      32             :     const __m256i s0, __m256i *const s1,
      33             :     const ConvBufType *const dst,
      34             :     uint8_t *const dst8) {
      35             :     __m256i r[2];
      36             : 
      37    22427700 :     y_convolve_2tap_32_avx2(src, coeffs, s0, s1, r);
      38    22427800 :     jnt_avg_round_store_32_avx2(r, offset, dst, dst8);
      39    22426700 : }
      40             : 
      41   102196000 : static INLINE void jnt_y_no_avg_2tap_32_avx2(
      42             :     const uint8_t *const src, const __m256i *const coeffs, const __m256i offset,
      43             :     const __m256i s0, __m256i *const s1, ConvBufType *const dst) {
      44             :     __m256i r[2];
      45             : 
      46   102196000 :     y_convolve_2tap_32_avx2(src, coeffs, s0, s1, r);
      47   102192000 :     jnt_no_avg_round_store_32_avx2(r, offset, dst);
      48   102184000 : }
      49             : 
      50    20259700 : static void jnt_convolve_y_2tap_avx2(
      51             :     const uint8_t *const src, const int32_t src_stride, uint8_t *dst8,
      52             :     const int32_t dst8_stride, const int32_t w, const int32_t h,
      53             :     const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
      54             :     const ConvolveParams *const conv_params) {
      55    20259700 :     const uint8_t *src_ptr = src;
      56    20259700 :     const int32_t dst_stride = conv_params->dst_stride;
      57    20259700 :     const int32_t round_0 = 3;
      58    20259700 :     const int32_t round_1 = COMPOUND_ROUND1_BITS;
      59    20259700 :     const int32_t bits = FILTER_BITS - round_0;
      60    20259700 :     const int32_t bd = 8;
      61    20259700 :     const int32_t offset_bits = bd + 2 * FILTER_BITS - round_0;
      62    20259700 :     const int32_t round_offset =
      63    20259700 :         (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
      64    20259700 :     const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1;
      65    20259700 :     const int32_t offset_comp_avg =
      66    20259700 :         round_offset * conv_params->bck_offset +
      67    20259700 :         (1 << (round_bits + DIST_PRECISION_BITS - 1)) -
      68    20259700 :         (round_offset << DIST_PRECISION_BITS);
      69    20259700 :     const int16_t offset_avg = (1 << (FILTER_BITS - 1)) +
      70    20259700 :         (1 << (round_1 - bits - 2)) -
      71    20259700 :         (round_offset << (round_1 - bits - 1));
      72    20259700 :     const int16_t offset_no_avg =
      73    20259700 :         (round_offset << (round_1 - bits - 1)) + (1 << (round_1 - bits - 2));
      74    20259700 :     const int32_t factor =
      75    20259700 :         conv_params->fwd_offset | (conv_params->bck_offset << 16);
      76    20259700 :     const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);
      77    20259700 :     const __m128i offset_avg_128 = _mm_set1_epi16(offset_avg);
      78    40519400 :     const __m128i offset_no_avg_128 = _mm_set1_epi16(offset_no_avg);
      79    20259700 :     const __m128i factor_128 = _mm_set1_epi32(factor);
      80    20259700 :     const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
      81    20259700 :     const __m256i offset_avg_256 = _mm256_set1_epi16(offset_avg);
      82    40519400 :     const __m256i offset_no_avg_256 = _mm256_set1_epi16(offset_no_avg);
      83    20259700 :     const __m256i factor_256 = _mm256_set1_epi32(factor);
      84    20259700 :     ConvBufType *dst = conv_params->dst;
      85    20259700 :     int32_t y = h;
      86             :     __m128i coeffs_128[4];
      87             :     __m256i coeffs_256[4];
      88             : 
      89    20259700 :     if (w <= 4) {
      90           0 :         prepare_half_coeffs_2tap_ssse3(
      91             :             filter_params_y, subpel_y_q4, coeffs_128);
      92             : 
      93           0 :         if (w == 2) {
      94             :             __m128i s_16[2];
      95             : 
      96           0 :             s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
      97             : 
      98           0 :             if (conv_params->do_average) {
      99           0 :                 if (conv_params->use_jnt_comp_avg) {
     100             :                     do {
     101           0 :                         const __m128i res = y_convolve_2tap_2x2_ssse3(
     102             :                             src_ptr, src_stride, coeffs_128, s_16);
     103           0 :                         jnt_comp_avg_round_store_2x2_sse2(res,
     104             :                             factor_128,
     105             :                             offset_comp_avg_128,
     106             :                             dst,
     107             :                             dst_stride,
     108             :                             dst8,
     109             :                             dst8_stride);
     110           0 :                         src_ptr += 2 * src_stride;
     111           0 :                         dst += 2 * dst_stride;
     112           0 :                         dst8 += 2 * dst8_stride;
     113           0 :                         y -= 2;
     114           0 :                     } while (y);
     115             :                 }
     116             :                 else {
     117             :                     do {
     118           0 :                         const __m128i res = y_convolve_2tap_2x2_ssse3(
     119             :                             src_ptr, src_stride, coeffs_128, s_16);
     120           0 :                         jnt_avg_round_store_2x2_sse2(res,
     121             :                             offset_avg_128,
     122             :                             dst,
     123             :                             dst_stride,
     124             :                             dst8,
     125             :                             dst8_stride);
     126           0 :                         src_ptr += 2 * src_stride;
     127           0 :                         dst += 2 * dst_stride;
     128           0 :                         dst8 += 2 * dst8_stride;
     129           0 :                         y -= 2;
     130           0 :                     } while (y);
     131             :                 }
     132             :             }
     133             :             else {
     134             :                 do {
     135           0 :                     const __m128i res = y_convolve_2tap_2x2_ssse3(
     136             :                         src_ptr, src_stride, coeffs_128, s_16);
     137           0 :                     jnt_no_avg_round_store_2x2_sse2(
     138             :                         res, offset_no_avg_128, dst, dst_stride);
     139           0 :                     src_ptr += 2 * src_stride;
     140           0 :                     dst += 2 * dst_stride;
     141           0 :                     y -= 2;
     142           0 :                 } while (y);
     143             :             }
     144             :         }
     145             :         else {
     146             :             __m128i s_32[2];
     147             : 
     148           0 :             assert(w == 4);
     149             : 
     150           0 :             s_32[0] = _mm_cvtsi32_si128(*(int32_t *)src_ptr);
     151             : 
     152           0 :             if (conv_params->do_average) {
     153           0 :                 if (conv_params->use_jnt_comp_avg) {
     154             :                     do {
     155           0 :                         const __m128i res = y_convolve_2tap_4x2_ssse3(
     156             :                             src_ptr, src_stride, coeffs_128, s_32);
     157           0 :                         jnt_comp_avg_round_store_4x2_sse2(res,
     158             :                             factor_128,
     159             :                             offset_comp_avg_128,
     160             :                             dst,
     161             :                             dst_stride,
     162             :                             dst8,
     163             :                             dst8_stride);
     164           0 :                         src_ptr += 2 * src_stride;
     165           0 :                         dst += 2 * dst_stride;
     166           0 :                         dst8 += 2 * dst8_stride;
     167           0 :                         y -= 2;
     168           0 :                     } while (y);
     169             :                 }
     170             :                 else {
     171             :                     do {
     172           0 :                         const __m128i res = y_convolve_2tap_4x2_ssse3(
     173             :                             src_ptr, src_stride, coeffs_128, s_32);
     174           0 :                         jnt_avg_round_store_4x2_sse2(res,
     175             :                             offset_avg_128,
     176             :                             dst,
     177             :                             dst_stride,
     178             :                             dst8,
     179             :                             dst8_stride);
     180           0 :                         src_ptr += 2 * src_stride;
     181           0 :                         dst += 2 * dst_stride;
     182           0 :                         dst8 += 2 * dst8_stride;
     183           0 :                         y -= 2;
     184           0 :                     } while (y);
     185             :                 }
     186             :             }
     187             :             else {
     188             :                 do {
     189           0 :                     const __m128i res = y_convolve_2tap_4x2_ssse3(
     190             :                         src_ptr, src_stride, coeffs_128, s_32);
     191           0 :                     jnt_no_avg_round_store_4x2_sse2(
     192             :                         res, offset_no_avg_128, dst, dst_stride);
     193           0 :                     src_ptr += 2 * src_stride;
     194           0 :                     dst += 2 * dst_stride;
     195           0 :                     y -= 2;
     196           0 :                 } while (y);
     197             :             }
     198             :         }
     199             :     }
     200             :     else {
     201    20259700 :         prepare_half_coeffs_2tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
     202             : 
     203    20269000 :         if (w == 8) {
     204             :             __m128i s_64[2];
     205             : 
     206     8669440 :             s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
     207             : 
     208     8669440 :             if (conv_params->do_average) {
     209     2348170 :                 if (conv_params->use_jnt_comp_avg) {
     210             :                     do {
     211     8617400 :                         const __m256i res = y_convolve_2tap_8x2_avx2(
     212             :                             src_ptr, src_stride, coeffs_256, s_64);
     213     8617040 :                         jnt_comp_avg_round_store_8x2_avx2(res,
     214             :                             factor_256,
     215             :                             offset_comp_avg_256,
     216             :                             dst,
     217             :                             dst_stride,
     218             :                             dst8,
     219             :                             dst8_stride);
     220     8617030 :                         src_ptr += 2 * src_stride;
     221     8617030 :                         dst += 2 * dst_stride;
     222     8617030 :                         dst8 += 2 * dst8_stride;
     223     8617030 :                         y -= 2;
     224     8617030 :                     } while (y);
     225             :                 }
     226             :                 else {
     227             :                     do {
     228     8617090 :                         const __m256i res = y_convolve_2tap_8x2_avx2(
     229             :                             src_ptr, src_stride, coeffs_256, s_64);
     230     8616940 :                         jnt_avg_round_store_8x2_sse2(res,
     231             :                             offset_avg_256,
     232             :                             dst,
     233             :                             dst_stride,
     234             :                             dst8,
     235             :                             dst8_stride);
     236     8616700 :                         src_ptr += 2 * src_stride;
     237     8616700 :                         dst += 2 * dst_stride;
     238     8616700 :                         dst8 += 2 * dst8_stride;
     239     8616700 :                         y -= 2;
     240     8616700 :                     } while (y);
     241             :                 }
     242             :             }
     243             :             else {
     244             :                 do {
     245    43828800 :                     const __m256i res = y_convolve_2tap_8x2_avx2(
     246             :                         src_ptr, src_stride, coeffs_256, s_64);
     247    43832200 :                     jnt_no_avg_round_store_8x2_avx2(
     248             :                         res, offset_no_avg_256, dst, dst_stride);
     249    43829000 :                     src_ptr += 2 * src_stride;
     250    43829000 :                     dst += 2 * dst_stride;
     251    43829000 :                     y -= 2;
     252    43829000 :                 } while (y);
     253             :             }
     254             :         }
     255    11599600 :         else if (w == 16) {
     256             :             __m128i s_128[2];
     257             :             __m256i r[2];
     258             : 
     259     7005530 :             s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
     260             : 
     261     7005530 :             if (conv_params->do_average) {
     262     1860260 :                 if (conv_params->use_jnt_comp_avg) {
     263             :                     do {
     264     9114440 :                         y_convolve_2tap_16x2_avx2(
     265             :                             src_ptr, src_stride, coeffs_256, s_128, r);
     266             :                         jnt_comp_avg_round_store_16x2_avx2(r,
     267             :                             factor_256,
     268             :                             offset_comp_avg_256,
     269             :                             dst,
     270             :                             dst_stride,
     271             :                             dst8,
     272             :                             dst8_stride);
     273     9114160 :                         src_ptr += 2 * src_stride;
     274     9114160 :                         dst += 2 * dst_stride;
     275     9114160 :                         dst8 += 2 * dst8_stride;
     276     9114160 :                         y -= 2;
     277     9114160 :                     } while (y);
     278             :                 }
     279             :                 else {
     280             :                     do {
     281     9114470 :                         y_convolve_2tap_16x2_avx2(
     282             :                             src_ptr, src_stride, coeffs_256, s_128, r);
     283     9113920 :                         jnt_avg_round_store_16x2_avx2(r,
     284             :                             offset_avg_256,
     285             :                             dst,
     286             :                             dst_stride,
     287             :                             dst8,
     288             :                             dst8_stride);
     289     9114220 :                         src_ptr += 2 * src_stride;
     290     9114220 :                         dst += 2 * dst_stride;
     291     9114220 :                         dst8 += 2 * dst8_stride;
     292     9114220 :                         y -= 2;
     293     9114220 :                     } while (y);
     294             :                 }
     295             :             }
     296             :             else {
     297             :                 do {
     298    47448400 :                     y_convolve_2tap_16x2_avx2(
     299             :                         src_ptr, src_stride, coeffs_256, s_128, r);
     300    47458100 :                     jnt_no_avg_round_store_16x2_avx2(
     301             :                         r, offset_no_avg_256, dst, dst_stride);
     302    47448800 :                     src_ptr += 2 * src_stride;
     303    47448800 :                     dst += 2 * dst_stride;
     304    47448800 :                     y -= 2;
     305    47448800 :                 } while (y);
     306             :             }
     307             :         }
     308     4594040 :         else if (w == 32) {
     309             :             __m256i s_256[2];
     310             : 
     311     3606640 :             s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
     312             : 
     313     3606640 :             if (conv_params->do_average) {
     314     1052260 :                 if (conv_params->use_jnt_comp_avg) {
     315             :                     do {
     316     5832260 :                         jnt_y_comp_avg_2tap_32_avx2(src_ptr + src_stride,
     317             :                             coeffs_256,
     318             :                             factor_256,
     319             :                             offset_comp_avg_256,
     320             :                             s_256[0],
     321             :                             &s_256[1],
     322             :                             dst,
     323             :                             dst8);
     324     5832040 :                         jnt_y_comp_avg_2tap_32_avx2(src_ptr + 2 * src_stride,
     325             :                             coeffs_256,
     326             :                             factor_256,
     327             :                             offset_comp_avg_256,
     328             :                             s_256[1],
     329             :                             &s_256[0],
     330     5832040 :                             dst + dst_stride,
     331             :                             dst8 + dst8_stride);
     332     5832100 :                         src_ptr += 2 * src_stride;
     333     5832100 :                         dst += 2 * dst_stride;
     334     5832100 :                         dst8 += 2 * dst8_stride;
     335     5832100 :                         y -= 2;
     336     5832100 :                     } while (y);
     337             :                 }
     338             :                 else {
     339             :                     do {
     340     5832200 :                         jnt_y_avg_2tap_32_avx2(src_ptr + src_stride,
     341             :                             coeffs_256,
     342             :                             offset_avg_256,
     343             :                             s_256[0],
     344             :                             &s_256[1],
     345             :                             dst,
     346             :                             dst8);
     347     5831990 :                         jnt_y_avg_2tap_32_avx2(src_ptr + 2 * src_stride,
     348             :                             coeffs_256,
     349             :                             offset_avg_256,
     350             :                             s_256[1],
     351             :                             &s_256[0],
     352     5831990 :                             dst + dst_stride,
     353             :                             dst8 + dst8_stride);
     354     5832050 :                         src_ptr += 2 * src_stride;
     355     5832050 :                         dst += 2 * dst_stride;
     356     5832050 :                         dst8 += 2 * dst8_stride;
     357     5832050 :                         y -= 2;
     358     5832050 :                     } while (y);
     359             :                 }
     360             :             }
     361             :             else {
     362             :                 do {
     363    28920400 :                     jnt_y_no_avg_2tap_32_avx2(src_ptr + src_stride,
     364             :                         coeffs_256,
     365             :                         offset_no_avg_256,
     366             :                         s_256[0],
     367             :                         &s_256[1],
     368             :                         dst);
     369    28920700 :                     jnt_y_no_avg_2tap_32_avx2(src_ptr + 2 * src_stride,
     370             :                         coeffs_256,
     371             :                         offset_no_avg_256,
     372             :                         s_256[1],
     373             :                         &s_256[0],
     374    28920700 :                         dst + dst_stride);
     375    28920800 :                     src_ptr += 2 * src_stride;
     376    28920800 :                     dst += 2 * dst_stride;
     377    28920800 :                     y -= 2;
     378    28920800 :                 } while (y);
     379             :             }
     380             :         }
     381      987400 :         else if (w == 64) {
     382             :             __m256i s_256[2][2];
     383             : 
     384      995046 :             s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
     385      995046 :             s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
     386             : 
     387      995046 :             if (conv_params->do_average) {
     388      325072 :                 if (conv_params->use_jnt_comp_avg) {
     389             :                     do {
     390     2694190 :                         jnt_y_comp_avg_2tap_32_avx2(src_ptr + src_stride,
     391             :                             coeffs_256,
     392             :                             factor_256,
     393             :                             offset_comp_avg_256,
     394             :                             s_256[0][0],
     395             :                             &s_256[1][0],
     396             :                             dst,
     397             :                             dst8);
     398     2694120 :                         jnt_y_comp_avg_2tap_32_avx2(src_ptr + src_stride + 32,
     399             :                             coeffs_256,
     400             :                             factor_256,
     401             :                             offset_comp_avg_256,
     402             :                             s_256[0][1],
     403             :                             &s_256[1][1],
     404             :                             dst + 32,
     405             :                             dst8 + 32);
     406     2694170 :                         jnt_y_comp_avg_2tap_32_avx2(src_ptr + 2 * src_stride,
     407             :                             coeffs_256,
     408             :                             factor_256,
     409             :                             offset_comp_avg_256,
     410             :                             s_256[1][0],
     411             :                             &s_256[0][0],
     412     2694170 :                             dst + dst_stride,
     413             :                             dst8 + dst8_stride);
     414     2694140 :                         jnt_y_comp_avg_2tap_32_avx2(
     415     2694140 :                             src_ptr + 2 * src_stride + 32,
     416             :                             coeffs_256,
     417             :                             factor_256,
     418             :                             offset_comp_avg_256,
     419             :                             s_256[1][1],
     420             :                             &s_256[0][1],
     421     2694140 :                             dst + dst_stride + 32,
     422     2694140 :                             dst8 + dst8_stride + 32);
     423             : 
     424     2694160 :                         src_ptr += 2 * src_stride;
     425     2694160 :                         dst += 2 * dst_stride;
     426     2694160 :                         dst8 += 2 * dst8_stride;
     427     2694160 :                         y -= 2;
     428     2694160 :                     } while (y);
     429             :                 }
     430             :                 else {
     431             :                     do {
     432     2694040 :                         jnt_y_avg_2tap_32_avx2(src_ptr + src_stride,
     433             :                             coeffs_256,
     434             :                             offset_avg_256,
     435             :                             s_256[0][0],
     436             :                             &s_256[1][0],
     437             :                             dst,
     438             :                             dst8);
     439     2694050 :                         jnt_y_avg_2tap_32_avx2(src_ptr + src_stride + 32,
     440             :                             coeffs_256,
     441             :                             offset_avg_256,
     442             :                             s_256[0][1],
     443             :                             &s_256[1][1],
     444     2694050 :                             dst + 32,
     445             :                             dst8 + 32);
     446     2694060 :                         jnt_y_avg_2tap_32_avx2(src_ptr + 2 * src_stride,
     447             :                             coeffs_256,
     448             :                             offset_avg_256,
     449             :                             s_256[1][0],
     450             :                             &s_256[0][0],
     451     2694060 :                             dst + dst_stride,
     452             :                             dst8 + dst8_stride);
     453     2694050 :                         jnt_y_avg_2tap_32_avx2(src_ptr + 2 * src_stride + 32,
     454             :                             coeffs_256,
     455             :                             offset_avg_256,
     456             :                             s_256[1][1],
     457             :                             &s_256[0][1],
     458     2694050 :                             dst + dst_stride + 32,
     459     2694050 :                             dst8 + dst8_stride + 32);
     460             : 
     461     2694040 :                         src_ptr += 2 * src_stride;
     462     2694040 :                         dst += 2 * dst_stride;
     463     2694040 :                         dst8 += 2 * dst8_stride;
     464     2694040 :                         y -= 2;
     465     2694040 :                     } while (y);
     466             :                 }
     467             :             }
     468             :             else {
     469             :                 do {
     470    11137000 :                     jnt_y_no_avg_2tap_32_avx2(src_ptr + src_stride,
     471             :                         coeffs_256,
     472             :                         offset_no_avg_256,
     473             :                         s_256[0][0],
     474             :                         &s_256[1][0],
     475             :                         dst);
     476    11137100 :                     jnt_y_no_avg_2tap_32_avx2(src_ptr + src_stride + 32,
     477             :                         coeffs_256,
     478             :                         offset_no_avg_256,
     479             :                         s_256[0][1],
     480             :                         &s_256[1][1],
     481             :                         dst + 32);
     482    11137100 :                     jnt_y_no_avg_2tap_32_avx2(src_ptr + 2 * src_stride,
     483             :                         coeffs_256,
     484             :                         offset_no_avg_256,
     485             :                         s_256[1][0],
     486             :                         &s_256[0][0],
     487    11137100 :                         dst + dst_stride);
     488    11137000 :                     jnt_y_no_avg_2tap_32_avx2(src_ptr + 2 * src_stride + 32,
     489             :                         coeffs_256,
     490             :                         offset_no_avg_256,
     491             :                         s_256[1][1],
     492             :                         &s_256[0][1],
     493    11137000 :                         dst + dst_stride + 32);
     494             : 
     495    11137100 :                     src_ptr += 2 * src_stride;
     496    11137100 :                     dst += 2 * dst_stride;
     497    11137100 :                     y -= 2;
     498    11137100 :                 } while (y);
     499             :             }
     500             :         }
     501             :         else {
     502             :             __m256i s_256[2][4];
     503             : 
     504           0 :             assert(w == 128);
     505             : 
     506           0 :             s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
     507           0 :             s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
     508           0 :             s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
     509           0 :             s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
     510             : 
     511           0 :             if (conv_params->do_average) {
     512           0 :                 if (conv_params->use_jnt_comp_avg) {
     513             :                     do {
     514           0 :                         jnt_y_comp_avg_2tap_32_avx2(src_ptr + src_stride,
     515             :                             coeffs_256,
     516             :                             factor_256,
     517             :                             offset_comp_avg_256,
     518             :                             s_256[0][0],
     519             :                             &s_256[1][0],
     520             :                             dst,
     521             :                             dst8);
     522           0 :                         jnt_y_comp_avg_2tap_32_avx2(
     523           0 :                             src_ptr + src_stride + 1 * 32,
     524             :                             coeffs_256,
     525             :                             factor_256,
     526             :                             offset_comp_avg_256,
     527             :                             s_256[0][1],
     528             :                             &s_256[1][1],
     529             :                             dst + 1 * 32,
     530             :                             dst8 + 1 * 32);
     531           0 :                         jnt_y_comp_avg_2tap_32_avx2(
     532           0 :                             src_ptr + src_stride + 2 * 32,
     533             :                             coeffs_256,
     534             :                             factor_256,
     535             :                             offset_comp_avg_256,
     536             :                             s_256[0][2],
     537             :                             &s_256[1][2],
     538             :                             dst + 2 * 32,
     539             :                             dst8 + 2 * 32);
     540           0 :                         jnt_y_comp_avg_2tap_32_avx2(
     541           0 :                             src_ptr + src_stride + 3 * 32,
     542             :                             coeffs_256,
     543             :                             factor_256,
     544             :                             offset_comp_avg_256,
     545             :                             s_256[0][3],
     546             :                             &s_256[1][3],
     547             :                             dst + 3 * 32,
     548             :                             dst8 + 3 * 32);
     549           0 :                         jnt_y_comp_avg_2tap_32_avx2(src_ptr + 2 * src_stride,
     550             :                             coeffs_256,
     551             :                             factor_256,
     552             :                             offset_comp_avg_256,
     553             :                             s_256[1][0],
     554             :                             &s_256[0][0],
     555           0 :                             dst + dst_stride,
     556             :                             dst8 + dst8_stride);
     557           0 :                         jnt_y_comp_avg_2tap_32_avx2(
     558           0 :                             src_ptr + 2 * src_stride + 1 * 32,
     559             :                             coeffs_256,
     560             :                             factor_256,
     561             :                             offset_comp_avg_256,
     562             :                             s_256[1][1],
     563             :                             &s_256[0][1],
     564           0 :                             dst + dst_stride + 1 * 32,
     565           0 :                             dst8 + dst8_stride + 1 * 32);
     566           0 :                         jnt_y_comp_avg_2tap_32_avx2(
     567           0 :                             src_ptr + 2 * src_stride + 2 * 32,
     568             :                             coeffs_256,
     569             :                             factor_256,
     570             :                             offset_comp_avg_256,
     571             :                             s_256[1][2],
     572             :                             &s_256[0][2],
     573           0 :                             dst + dst_stride + 2 * 32,
     574           0 :                             dst8 + dst8_stride + 2 * 32);
     575           0 :                         jnt_y_comp_avg_2tap_32_avx2(
     576           0 :                             src_ptr + 2 * src_stride + 3 * 32,
     577             :                             coeffs_256,
     578             :                             factor_256,
     579             :                             offset_comp_avg_256,
     580             :                             s_256[1][3],
     581             :                             &s_256[0][3],
     582           0 :                             dst + dst_stride + 3 * 32,
     583           0 :                             dst8 + dst8_stride + 3 * 32);
     584             : 
     585           0 :                         src_ptr += 2 * src_stride;
     586           0 :                         dst += 2 * dst_stride;
     587           0 :                         dst8 += 2 * dst8_stride;
     588           0 :                         y -= 2;
     589           0 :                     } while (y);
     590             :                 }
     591             :                 else {
     592             :                     do {
     593           0 :                         jnt_y_avg_2tap_32_avx2(src_ptr + src_stride,
     594             :                             coeffs_256,
     595             :                             offset_avg_256,
     596             :                             s_256[0][0],
     597             :                             &s_256[1][0],
     598             :                             dst,
     599             :                             dst8);
     600           0 :                         jnt_y_avg_2tap_32_avx2(src_ptr + src_stride + 1 * 32,
     601             :                             coeffs_256,
     602             :                             offset_avg_256,
     603             :                             s_256[0][1],
     604             :                             &s_256[1][1],
     605           0 :                             dst + 1 * 32,
     606             :                             dst8 + 1 * 32);
     607           0 :                         jnt_y_avg_2tap_32_avx2(src_ptr + src_stride + 2 * 32,
     608             :                             coeffs_256,
     609             :                             offset_avg_256,
     610             :                             s_256[0][2],
     611             :                             &s_256[1][2],
     612           0 :                             dst + 2 * 32,
     613             :                             dst8 + 2 * 32);
     614           0 :                         jnt_y_avg_2tap_32_avx2(src_ptr + src_stride + 3 * 32,
     615             :                             coeffs_256,
     616             :                             offset_avg_256,
     617             :                             s_256[0][3],
     618             :                             &s_256[1][3],
     619           0 :                             dst + 3 * 32,
     620             :                             dst8 + 3 * 32);
     621           0 :                         jnt_y_avg_2tap_32_avx2(src_ptr + 2 * src_stride,
     622             :                             coeffs_256,
     623             :                             offset_avg_256,
     624             :                             s_256[1][0],
     625             :                             &s_256[0][0],
     626           0 :                             dst + dst_stride,
     627             :                             dst8 + dst8_stride);
     628           0 :                         jnt_y_avg_2tap_32_avx2(
     629           0 :                             src_ptr + 2 * src_stride + 1 * 32,
     630             :                             coeffs_256,
     631             :                             offset_avg_256,
     632             :                             s_256[1][1],
     633             :                             &s_256[0][1],
     634           0 :                             dst + dst_stride + 1 * 32,
     635           0 :                             dst8 + dst8_stride + 1 * 32);
     636           0 :                         jnt_y_avg_2tap_32_avx2(
     637           0 :                             src_ptr + 2 * src_stride + 2 * 32,
     638             :                             coeffs_256,
     639             :                             offset_avg_256,
     640             :                             s_256[1][2],
     641             :                             &s_256[0][2],
     642           0 :                             dst + dst_stride + 2 * 32,
     643           0 :                             dst8 + dst8_stride + 2 * 32);
     644           0 :                         jnt_y_avg_2tap_32_avx2(
     645           0 :                             src_ptr + 2 * src_stride + 3 * 32,
     646             :                             coeffs_256,
     647             :                             offset_avg_256,
     648             :                             s_256[1][3],
     649             :                             &s_256[0][3],
     650           0 :                             dst + dst_stride + 3 * 32,
     651           0 :                             dst8 + dst8_stride + 3 * 32);
     652             : 
     653           0 :                         src_ptr += 2 * src_stride;
     654           0 :                         dst += 2 * dst_stride;
     655           0 :                         dst8 += 2 * dst8_stride;
     656           0 :                         y -= 2;
     657           0 :                     } while (y);
     658             :                 }
     659             :             }
     660             :             else {
     661             :                 do {
     662           0 :                     jnt_y_no_avg_2tap_32_avx2(src_ptr + src_stride,
     663             :                         coeffs_256,
     664             :                         offset_no_avg_256,
     665             :                         s_256[0][0],
     666             :                         &s_256[1][0],
     667             :                         dst);
     668           0 :                     jnt_y_no_avg_2tap_32_avx2(src_ptr + src_stride + 1 * 32,
     669             :                         coeffs_256,
     670             :                         offset_no_avg_256,
     671             :                         s_256[0][1],
     672             :                         &s_256[1][1],
     673             :                         dst + 1 * 32);
     674           0 :                     jnt_y_no_avg_2tap_32_avx2(src_ptr + src_stride + 2 * 32,
     675             :                         coeffs_256,
     676             :                         offset_no_avg_256,
     677             :                         s_256[0][2],
     678             :                         &s_256[1][2],
     679             :                         dst + 2 * 32);
     680           0 :                     jnt_y_no_avg_2tap_32_avx2(src_ptr + src_stride + 3 * 32,
     681             :                         coeffs_256,
     682             :                         offset_no_avg_256,
     683             :                         s_256[0][3],
     684             :                         &s_256[1][3],
     685             :                         dst + 3 * 32);
     686           0 :                     jnt_y_no_avg_2tap_32_avx2(src_ptr + 2 * src_stride,
     687             :                         coeffs_256,
     688             :                         offset_no_avg_256,
     689             :                         s_256[1][0],
     690             :                         &s_256[0][0],
     691           0 :                         dst + dst_stride);
     692           0 :                     jnt_y_no_avg_2tap_32_avx2(src_ptr + 2 * src_stride + 1 * 32,
     693             :                         coeffs_256,
     694             :                         offset_no_avg_256,
     695             :                         s_256[1][1],
     696             :                         &s_256[0][1],
     697           0 :                         dst + dst_stride + 1 * 32);
     698           0 :                     jnt_y_no_avg_2tap_32_avx2(src_ptr + 2 * src_stride + 2 * 32,
     699             :                         coeffs_256,
     700             :                         offset_no_avg_256,
     701             :                         s_256[1][2],
     702             :                         &s_256[0][2],
     703           0 :                         dst + dst_stride + 2 * 32);
     704           0 :                     jnt_y_no_avg_2tap_32_avx2(src_ptr + 2 * src_stride + 3 * 32,
     705             :                         coeffs_256,
     706             :                         offset_no_avg_256,
     707             :                         s_256[1][3],
     708             :                         &s_256[0][3],
     709           0 :                         dst + dst_stride + 3 * 32);
     710             : 
     711           0 :                     src_ptr += 2 * src_stride;
     712           0 :                     dst += 2 * dst_stride;
     713           0 :                     y -= 2;
     714           0 :                 } while (y);
     715             :             }
     716             :         }
     717             :     }
     718    20276000 : }
     719             : 
     720      143763 : static void jnt_convolve_y_4tap_avx2(
     721             :     const uint8_t *const src, const int32_t src_stride, uint8_t *dst8,
     722             :     const int32_t dst8_stride, const int32_t w, const int32_t h,
     723             :     const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
     724             :     const ConvolveParams *const conv_params) {
     725      143763 :     const uint8_t *src_ptr = src - src_stride;
     726      143763 :     const int32_t dst_stride = conv_params->dst_stride;
     727      143763 :     const int32_t round_0 = 3;
     728      143763 :     const int32_t round_1 = COMPOUND_ROUND1_BITS;
     729      143763 :     const int32_t bits = FILTER_BITS - round_0;
     730      143763 :     const int32_t bd = 8;
     731      143763 :     const int32_t offset_bits = bd + 2 * FILTER_BITS - round_0;
     732      143763 :     const int32_t round_offset =
     733      143763 :         (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
     734      143763 :     const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1;
     735      143763 :     const int32_t offset_comp_avg =
     736      143763 :         round_offset * conv_params->bck_offset +
     737      143763 :         (1 << (round_bits + DIST_PRECISION_BITS - 1)) -
     738      143763 :         (round_offset << DIST_PRECISION_BITS);
     739      143763 :     const int16_t offset_avg = (1 << (FILTER_BITS - 1)) +
     740      143763 :         (1 << (round_1 - bits - 2)) -
     741      143763 :         (round_offset << (round_1 - bits - 1));
     742      143763 :     const int16_t offset_no_avg =
     743      143763 :         (round_offset << (round_1 - bits - 1)) + (1 << (round_1 - bits - 2));
     744      143763 :     const int32_t factor =
     745      143763 :         conv_params->fwd_offset | (conv_params->bck_offset << 16);
     746      143763 :     const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);
     747      143763 :     const __m128i offset_avg_128 = _mm_set1_epi16(offset_avg);
     748      287526 :     const __m128i offset_no_avg_128 = _mm_set1_epi16(offset_no_avg);
     749      143763 :     const __m128i factor_128 = _mm_set1_epi32(factor);
     750      143763 :     const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
     751      143763 :     const __m256i offset_avg_256 = _mm256_set1_epi16(offset_avg);
     752      287526 :     const __m256i offset_no_avg_256 = _mm256_set1_epi16(offset_no_avg);
     753      143763 :     const __m256i factor_256 = _mm256_set1_epi32(factor);
     754      143763 :     ConvBufType *dst = conv_params->dst;
     755      143763 :     int32_t y = h;
     756             :     __m128i coeffs_128[4];
     757             :     __m256i coeffs_256[4];
     758             : 
     759      143763 :     if (w <= 4) {
     760       62618 :         prepare_half_coeffs_4tap_ssse3(
     761             :             filter_params_y, subpel_y_q4, coeffs_128);
     762             : 
     763       62618 :         if (w == 2) {
     764             :             __m128i s_16[4], ss_128[2];
     765             : 
     766           0 :             s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 0 * src_stride));
     767           0 :             s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 1 * src_stride));
     768           0 :             s_16[2] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
     769             : 
     770           0 :             const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
     771           0 :             const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
     772             : 
     773           0 :             ss_128[0] = _mm_unpacklo_epi8(src01, src12);
     774             : 
     775           0 :             if (conv_params->do_average) {
     776           0 :                 if (conv_params->use_jnt_comp_avg) {
     777             :                     do {
     778           0 :                         src_ptr += 2 * src_stride;
     779           0 :                         const __m128i res = y_convolve_4tap_2x2_ssse3(
     780             :                             src_ptr, src_stride, coeffs_128, s_16, ss_128);
     781           0 :                         jnt_comp_avg_round_store_2x2_sse2(res,
     782             :                             factor_128,
     783             :                             offset_comp_avg_128,
     784             :                             dst,
     785             :                             dst_stride,
     786             :                             dst8,
     787             :                             dst8_stride);
     788           0 :                         ss_128[0] = ss_128[1];
     789           0 :                         dst += 2 * dst_stride;
     790           0 :                         dst8 += 2 * dst8_stride;
     791           0 :                         y -= 2;
     792           0 :                     } while (y);
     793             :                 }
     794             :                 else {
     795             :                     do {
     796           0 :                         src_ptr += 2 * src_stride;
     797           0 :                         const __m128i res = y_convolve_4tap_2x2_ssse3(
     798             :                             src_ptr, src_stride, coeffs_128, s_16, ss_128);
     799           0 :                         jnt_avg_round_store_2x2_sse2(res,
     800             :                             offset_avg_128,
     801             :                             dst,
     802             :                             dst_stride,
     803             :                             dst8,
     804             :                             dst8_stride);
     805           0 :                         ss_128[0] = ss_128[1];
     806           0 :                         dst += 2 * dst_stride;
     807           0 :                         dst8 += 2 * dst8_stride;
     808           0 :                         y -= 2;
     809           0 :                     } while (y);
     810             :                 }
     811             :             }
     812             :             else {
     813             :                 do {
     814           0 :                     src_ptr += 2 * src_stride;
     815           0 :                     const __m128i res = y_convolve_4tap_2x2_ssse3(
     816             :                         src_ptr, src_stride, coeffs_128, s_16, ss_128);
     817           0 :                     jnt_no_avg_round_store_2x2_sse2(
     818             :                         res, offset_no_avg_128, dst, dst_stride);
     819           0 :                     ss_128[0] = ss_128[1];
     820           0 :                     dst += 2 * dst_stride;
     821           0 :                     y -= 2;
     822           0 :                 } while (y);
     823             :             }
     824             :         }
     825             :         else {
     826             :             __m128i s_32[4], ss_128[2];
     827             : 
     828       62618 :             assert(w == 4);
     829             : 
     830       62618 :             s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 0 * src_stride));
     831       62618 :             s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 1 * src_stride));
     832       62618 :             s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 2 * src_stride));
     833             : 
     834       62618 :             const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
     835      125236 :             const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
     836             : 
     837       62618 :             ss_128[0] = _mm_unpacklo_epi8(src01, src12);
     838             : 
     839       62618 :             if (conv_params->do_average) {
     840       26716 :                 if (conv_params->use_jnt_comp_avg) {
     841             :                     do {
     842       19232 :                         src_ptr += 2 * src_stride;
     843       19232 :                         const __m128i res = y_convolve_4tap_4x2_ssse3(
     844             :                             src_ptr, src_stride, coeffs_128, s_32, ss_128);
     845       19232 :                         jnt_comp_avg_round_store_4x2_sse2(res,
     846             :                             factor_128,
     847             :                             offset_comp_avg_128,
     848             :                             dst,
     849             :                             dst_stride,
     850             :                             dst8,
     851             :                             dst8_stride);
     852       19232 :                         ss_128[0] = ss_128[1];
     853       19232 :                         dst += 2 * dst_stride;
     854       19232 :                         dst8 += 2 * dst8_stride;
     855       19232 :                         y -= 2;
     856       19232 :                     } while (y);
     857             :                 }
     858             :                 else {
     859             :                     do {
     860       34200 :                         src_ptr += 2 * src_stride;
     861       34200 :                         const __m128i res = y_convolve_4tap_4x2_ssse3(
     862             :                             src_ptr, src_stride, coeffs_128, s_32, ss_128);
     863       34200 :                         jnt_avg_round_store_4x2_sse2(res,
     864             :                             offset_avg_128,
     865             :                             dst,
     866             :                             dst_stride,
     867             :                             dst8,
     868             :                             dst8_stride);
     869       34200 :                         ss_128[0] = ss_128[1];
     870       34200 :                         dst += 2 * dst_stride;
     871       34200 :                         dst8 += 2 * dst8_stride;
     872       34200 :                         y -= 2;
     873       34200 :                     } while (y);
     874             :                 }
     875             :             }
     876             :             else {
     877             :                 do {
     878       71804 :                     src_ptr += 2 * src_stride;
     879       71804 :                     const __m128i res = y_convolve_4tap_4x2_ssse3(
     880             :                         src_ptr, src_stride, coeffs_128, s_32, ss_128);
     881       71804 :                     jnt_no_avg_round_store_4x2_sse2(
     882             :                         res, offset_no_avg_128, dst, dst_stride);
     883       71804 :                     ss_128[0] = ss_128[1];
     884       71804 :                     dst += 2 * dst_stride;
     885       71804 :                     y -= 2;
     886       71804 :                 } while (y);
     887             :             }
     888             :         }
     889             :     }
     890             :     else {
     891       81145 :         prepare_half_coeffs_4tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
     892             : 
     893       81146 :         if (w == 8) {
     894             :             __m128i s_64[4];
     895             :             __m256i ss_256[2];
     896             : 
     897       45428 :             s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
     898       45428 :             s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
     899       45428 :             s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
     900             : 
     901             :             // Load lines a and b. Line a to lower 128, line b to upper
     902             :             // 128
     903       45428 :             const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
     904       90856 :             const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
     905             : 
     906       45428 :             ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
     907             : 
     908       45428 :             if (conv_params->do_average) {
     909       19472 :                 if (conv_params->use_jnt_comp_avg) {
     910             :                     do {
     911       14528 :                         src_ptr += 2 * src_stride;
     912       14528 :                         const __m256i res = y_convolve_4tap_8x2_avx2(
     913             :                             src_ptr, src_stride, coeffs_256, s_64, ss_256);
     914       14528 :                         jnt_comp_avg_round_store_8x2_avx2(res,
     915             :                             factor_256,
     916             :                             offset_comp_avg_256,
     917             :                             dst,
     918             :                             dst_stride,
     919             :                             dst8,
     920             :                             dst8_stride);
     921       14528 :                         ss_256[0] = ss_256[1];
     922       14528 :                         dst += 2 * dst_stride;
     923       14528 :                         dst8 += 2 * dst8_stride;
     924       14528 :                         y -= 2;
     925       14528 :                     } while (y);
     926             :                 }
     927             :                 else {
     928             :                     do {
     929       24416 :                         src_ptr += 2 * src_stride;
     930       24416 :                         const __m256i res = y_convolve_4tap_8x2_avx2(
     931             :                             src_ptr, src_stride, coeffs_256, s_64, ss_256);
     932       24416 :                         jnt_avg_round_store_8x2_sse2(res,
     933             :                             offset_avg_256,
     934             :                             dst,
     935             :                             dst_stride,
     936             :                             dst8,
     937             :                             dst8_stride);
     938       24416 :                         ss_256[0] = ss_256[1];
     939       24416 :                         dst += 2 * dst_stride;
     940       24416 :                         dst8 += 2 * dst8_stride;
     941       24416 :                         y -= 2;
     942       24416 :                     } while (y);
     943             :                 }
     944             :             }
     945             :             else {
     946             :                 do {
     947       51912 :                     src_ptr += 2 * src_stride;
     948       51912 :                     const __m256i res = y_convolve_4tap_8x2_avx2(
     949             :                         src_ptr, src_stride, coeffs_256, s_64, ss_256);
     950       51912 :                     jnt_no_avg_round_store_8x2_avx2(
     951             :                         res, offset_no_avg_256, dst, dst_stride);
     952       51912 :                     ss_256[0] = ss_256[1];
     953       51912 :                     dst += 2 * dst_stride;
     954       51912 :                     y -= 2;
     955       51912 :                 } while (y);
     956             :             }
     957             :         }
     958             :         else {
     959             :             __m128i s_128[4];
     960             :             __m256i ss_256[4], r[2];
     961             : 
     962       35718 :             assert(w == 16);
     963             : 
     964       35718 :             s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
     965       35718 :             s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
     966       35718 :             s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
     967             : 
     968             :             // Load lines a and b. Line a to lower 128, line b to upper
     969             :             // 128
     970       35718 :             const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
     971       71436 :             const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
     972             : 
     973       35718 :             ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
     974       35718 :             ss_256[2] = _mm256_unpackhi_epi8(src01, src12);
     975             : 
     976       35718 :             if (conv_params->do_average) {
     977       15706 :                 if (conv_params->use_jnt_comp_avg) {
     978             :                     do {
     979       12316 :                         src_ptr += 2 * src_stride;
     980       12316 :                         y_convolve_4tap_16x2_avx2(
     981             :                             src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
     982             :                         jnt_comp_avg_round_store_16x2_avx2(r,
     983             :                             factor_256,
     984             :                             offset_comp_avg_256,
     985             :                             dst,
     986             :                             dst_stride,
     987             :                             dst8,
     988             :                             dst8_stride);
     989       12316 :                         ss_256[0] = ss_256[1];
     990       12316 :                         ss_256[2] = ss_256[3];
     991       12316 :                         dst += 2 * dst_stride;
     992       12316 :                         dst8 += 2 * dst8_stride;
     993       12316 :                         y -= 2;
     994       12316 :                     } while (y);
     995             :                 }
     996             :                 else {
     997             :                     do {
     998       19096 :                         src_ptr += 2 * src_stride;
     999       19096 :                         y_convolve_4tap_16x2_avx2(
    1000             :                             src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
    1001       19096 :                         jnt_avg_round_store_16x2_avx2(r,
    1002             :                             offset_avg_256,
    1003             :                             dst,
    1004             :                             dst_stride,
    1005             :                             dst8,
    1006             :                             dst8_stride);
    1007       19096 :                         ss_256[0] = ss_256[1];
    1008       19096 :                         ss_256[2] = ss_256[3];
    1009       19096 :                         dst += 2 * dst_stride;
    1010       19096 :                         dst8 += 2 * dst8_stride;
    1011       19096 :                         y -= 2;
    1012       19096 :                     } while (y);
    1013             :                 }
    1014             :             }
    1015             :             else {
    1016             :                 do {
    1017       40023 :                     src_ptr += 2 * src_stride;
    1018       40023 :                     y_convolve_4tap_16x2_avx2(
    1019             :                         src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
    1020       40024 :                     jnt_no_avg_round_store_16x2_avx2(
    1021             :                         r, offset_no_avg_256, dst, dst_stride);
    1022       40023 :                     ss_256[0] = ss_256[1];
    1023       40023 :                     ss_256[2] = ss_256[3];
    1024       40023 :                     dst += 2 * dst_stride;
    1025       40023 :                     y -= 2;
    1026       40023 :                 } while (y);
    1027             :             }
    1028             :         }
    1029             :     }
    1030      143764 : }
    1031             : 
    1032     7752470 : static void jnt_convolve_y_6tap_avx2(  // Vertical 6-tap convolution; w picks the kernel width, conv_params the store mode.
    1033             :     const uint8_t *const src, const int32_t src_stride, uint8_t *dst8,
    1034             :     const int32_t dst8_stride, const int32_t w, const int32_t h,  // w: 2, 4, 8, 16 or a multiple of 32
    1035             :     const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,  // both forwarded to prepare_half_coeffs_6tap_*
    1036             :     const ConvolveParams *const conv_params) {  // read here: dst, dst_stride, do_average, use_jnt_comp_avg, fwd_offset, bck_offset
    1037     7752470 :     const uint8_t *src_ptr = src - 2 * src_stride;  // first row read is two rows above src
    1038     7752470 :     const int32_t dst_stride = conv_params->dst_stride;
    1039     7752470 :     const int32_t round_0 = 3;
    1040     7752470 :     const int32_t round_1 = COMPOUND_ROUND1_BITS;
    1041     7752470 :     const int32_t bits = FILTER_BITS - round_0;
    1042     7752470 :     const int32_t bd = 8;
    1043     7752470 :     const int32_t offset_bits = bd + 2 * FILTER_BITS - round_0;
    1044     7752470 :     const int32_t round_offset =
    1045     7752470 :         (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
    1046     7752470 :     const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1;
    1047     7752470 :     const int32_t offset_comp_avg =
    1048     7752470 :         round_offset * conv_params->bck_offset +
    1049     7752470 :         (1 << (round_bits + DIST_PRECISION_BITS - 1)) -
    1050     7752470 :         (round_offset << DIST_PRECISION_BITS);
    1051     7752470 :     const int16_t offset_avg = (1 << (FILTER_BITS - 1)) +
    1052     7752470 :         (1 << (round_1 - bits - 2)) -
    1053     7752470 :         (round_offset << (round_1 - bits - 1));
    1054     7752470 :     const int16_t offset_no_avg =
    1055     7752470 :         (round_offset << (round_1 - bits - 1)) + (1 << (round_1 - bits - 2));
    1056     7752470 :     const int32_t factor =
    1057     7752470 :         conv_params->fwd_offset | (conv_params->bck_offset << 16);  // fwd_offset OR'ed with bck_offset shifted to bits 16 and up
    1058     7752470 :     const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);  // each constant is broadcast twice: *_128 for w <= 4, *_256 for wider blocks
    1059     7752470 :     const __m128i offset_avg_128 = _mm_set1_epi16(offset_avg);
    1060    15504900 :     const __m128i offset_no_avg_128 = _mm_set1_epi16(offset_no_avg);
    1061     7752470 :     const __m128i factor_128 = _mm_set1_epi32(factor);
    1062     7752470 :     const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
    1063     7752470 :     const __m256i offset_avg_256 = _mm256_set1_epi16(offset_avg);
    1064    15504900 :     const __m256i offset_no_avg_256 = _mm256_set1_epi16(offset_no_avg);
    1065     7752470 :     const __m256i factor_256 = _mm256_set1_epi32(factor);
    1066     7752470 :     ConvBufType *dst = conv_params->dst;
    1067             :     int32_t x;  // column offset of the current 32-wide strip (last branch only)
    1068     7752470 :     int32_t y = h;  // rows left; each loop subtracts 2 and stops only at exactly 0, so h must be even and > 0
    1069             :     __m128i coeffs_128[4];
    1070             :     __m256i coeffs_256[4];
    1071             :     // w <= 4 uses the 128-bit coefficients; wider blocks use the 256-bit set.
    1072     7752470 :     if (w <= 4) {
    1073       82460 :         prepare_half_coeffs_6tap_ssse3(
    1074             :             filter_params_y, subpel_y_q4, coeffs_128);
    1075             :         // Redundant: y was already initialised to h and has not changed.
    1076       82460 :         y = h;
    1077             : 
    1078       82460 :         if (w == 2) {
    1079             :             __m128i s_16[6], ss_128[3];
    1080             :             // Preload rows 0-4, two bytes per row; s_16[5] is not written in this function.
    1081           0 :             s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 0 * src_stride));
    1082           0 :             s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 1 * src_stride));
    1083           0 :             s_16[2] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
    1084           0 :             s_16[3] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 3 * src_stride));
    1085           0 :             s_16[4] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 4 * src_stride));
    1086             :             // Pair adjacent rows: (0,1), (1,2), (2,3), (3,4).
    1087           0 :             const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
    1088           0 :             const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
    1089           0 :             const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
    1090           0 :             const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
    1091             :             // Byte-interleave the pairs. ss_128[2] is not set here; presumably the convolve helper fills it - confirm.
    1092           0 :             ss_128[0] = _mm_unpacklo_epi8(src01, src12);
    1093           0 :             ss_128[1] = _mm_unpacklo_epi8(src23, src34);
    1094             :             // Store mode: comp-avg (factor + offset_comp_avg), avg (offset_avg) or no-avg (dst only, dst8 untouched).
    1095           0 :             if (conv_params->do_average) {
    1096           0 :                 if (conv_params->use_jnt_comp_avg) {
    1097             :                     do {
    1098           0 :                         src_ptr += 2 * src_stride;
    1099           0 :                         const __m128i res = y_convolve_6tap_2x2_ssse3(
    1100             :                             src_ptr, src_stride, coeffs_128, s_16, ss_128);
    1101           0 :                         jnt_comp_avg_round_store_2x2_sse2(res,
    1102             :                             factor_128,
    1103             :                             offset_comp_avg_128,
    1104             :                             dst,
    1105             :                             dst_stride,
    1106             :                             dst8,
    1107             :                             dst8_stride);
    1108           0 :                         ss_128[0] = ss_128[1];  // shift the window for the next two rows
    1109           0 :                         ss_128[1] = ss_128[2];
    1110           0 :                         dst += 2 * dst_stride;
    1111           0 :                         dst8 += 2 * dst8_stride;
    1112           0 :                         y -= 2;
    1113           0 :                     } while (y);
    1114             :                 }
    1115             :                 else {
    1116             :                     do {
    1117           0 :                         src_ptr += 2 * src_stride;
    1118           0 :                         const __m128i res = y_convolve_6tap_2x2_ssse3(
    1119             :                             src_ptr, src_stride, coeffs_128, s_16, ss_128);
    1120           0 :                         jnt_avg_round_store_2x2_sse2(res,
    1121             :                             offset_avg_128,
    1122             :                             dst,
    1123             :                             dst_stride,
    1124             :                             dst8,
    1125             :                             dst8_stride);
    1126           0 :                         ss_128[0] = ss_128[1];
    1127           0 :                         ss_128[1] = ss_128[2];
    1128           0 :                         dst += 2 * dst_stride;
    1129           0 :                         dst8 += 2 * dst8_stride;
    1130           0 :                         y -= 2;
    1131           0 :                     } while (y);
    1132             :                 }
    1133             :             }
    1134             :             else {
    1135             :                 do {
    1136           0 :                     src_ptr += 2 * src_stride;
    1137           0 :                     const __m128i res = y_convolve_6tap_2x2_ssse3(
    1138             :                         src_ptr, src_stride, coeffs_128, s_16, ss_128);
    1139           0 :                     jnt_no_avg_round_store_2x2_sse2(
    1140             :                         res, offset_no_avg_128, dst, dst_stride);
    1141           0 :                     ss_128[0] = ss_128[1];
    1142           0 :                     ss_128[1] = ss_128[2];
    1143           0 :                     dst += 2 * dst_stride;
    1144           0 :                     y -= 2;
    1145           0 :                 } while (y);
    1146             :             }
    1147             :         }
    1148             :         else {
    1149             :             __m128i s_32[6], ss_128[3];
    1150             : 
    1151       82460 :             assert(w == 4);
    1152             :             // Same layout as the w == 2 case, with four bytes read per row.
    1153       82460 :             s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 0 * src_stride));
    1154       82460 :             s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 1 * src_stride));
    1155       82460 :             s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 2 * src_stride));
    1156       82460 :             s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 3 * src_stride));
    1157       82460 :             s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 4 * src_stride));
    1158             :             // Pair adjacent rows: (0,1), (1,2), (2,3), (3,4).
    1159       82460 :             const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
    1160       82460 :             const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
    1161       82460 :             const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
    1162      164920 :             const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
    1163             : 
    1164       82460 :             ss_128[0] = _mm_unpacklo_epi8(src01, src12);
    1165       82460 :             ss_128[1] = _mm_unpacklo_epi8(src23, src34);
    1166             : 
    1167       82460 :             if (conv_params->do_average) {
    1168       36452 :                 if (conv_params->use_jnt_comp_avg) {
    1169             :                     do {
    1170       74704 :                         src_ptr += 2 * src_stride;
    1171       74704 :                         const __m128i res = y_convolve_6tap_4x2_ssse3(
    1172             :                             src_ptr, src_stride, coeffs_128, s_32, ss_128);
    1173       74704 :                         jnt_comp_avg_round_store_4x2_sse2(res,
    1174             :                             factor_128,
    1175             :                             offset_comp_avg_128,
    1176             :                             dst,
    1177             :                             dst_stride,
    1178             :                             dst8,
    1179             :                             dst8_stride);
    1180       74704 :                         ss_128[0] = ss_128[1];
    1181       74704 :                         ss_128[1] = ss_128[2];
    1182       74704 :                         dst += 2 * dst_stride;
    1183       74704 :                         dst8 += 2 * dst8_stride;
    1184       74704 :                         y -= 2;
    1185       74704 :                     } while (y);
    1186             :                 }
    1187             :                 else {
    1188             :                     do {
    1189      122920 :                         src_ptr += 2 * src_stride;
    1190      122920 :                         const __m128i res = y_convolve_6tap_4x2_ssse3(
    1191             :                             src_ptr, src_stride, coeffs_128, s_32, ss_128);
    1192      122920 :                         jnt_avg_round_store_4x2_sse2(res,
    1193             :                             offset_avg_128,
    1194             :                             dst,
    1195             :                             dst_stride,
    1196             :                             dst8,
    1197             :                             dst8_stride);
    1198      122920 :                         ss_128[0] = ss_128[1];
    1199      122920 :                         ss_128[1] = ss_128[2];
    1200      122920 :                         dst += 2 * dst_stride;
    1201      122920 :                         dst8 += 2 * dst8_stride;
    1202      122920 :                         y -= 2;
    1203      122920 :                     } while (y);
    1204             :                 }
    1205             :             }
    1206             :             else {
    1207             :                 do {
    1208      248624 :                     src_ptr += 2 * src_stride;
    1209      248624 :                     const __m128i res = y_convolve_6tap_4x2_ssse3(
    1210             :                         src_ptr, src_stride, coeffs_128, s_32, ss_128);
    1211      248624 :                     jnt_no_avg_round_store_4x2_sse2(
    1212             :                         res, offset_no_avg_128, dst, dst_stride);
    1213      248624 :                     ss_128[0] = ss_128[1];
    1214      248624 :                     ss_128[1] = ss_128[2];
    1215      248624 :                     dst += 2 * dst_stride;
    1216      248624 :                     y -= 2;
    1217      248624 :                 } while (y);
    1218             :             }
    1219             :         }
    1220             :     }
    1221             :     else {
    1222     7670010 :         prepare_half_coeffs_6tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
    1223             :         // Each width below preloads rows 0-4, then advances two rows per loop iteration.
    1224     7671280 :         if (w == 8) {
    1225             :             __m128i s_64[6];
    1226             :             __m256i ss_256[3];
    1227             :             // Preload rows 0-4, eight bytes per row (low half of each __m128i).
    1228     3378780 :             s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
    1229     3378780 :             s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
    1230     3378780 :             s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
    1231     3378780 :             s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
    1232     3378780 :             s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
    1233             : 
    1234             :             // Build row-pair registers: the earlier row goes to the lower 128
    1235             :             // bits and the later row to the upper 128 bits.
    1236     3378780 :             const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
    1237     3378780 :             const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
    1238     3378780 :             const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
    1239     6757560 :             const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
    1240             : 
    1241     3378780 :             ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
    1242     3378780 :             ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
    1243             :             // Redundant: y still equals h on this path.
    1244     3378780 :             y = h;
    1245             : 
    1246     3378780 :             if (conv_params->do_average) {
    1247     1347710 :                 if (conv_params->use_jnt_comp_avg) {
    1248             :                     do {
    1249     4385580 :                         src_ptr += 2 * src_stride;
    1250     4385580 :                         const __m256i res = y_convolve_6tap_8x2_avx2(
    1251             :                             src_ptr, src_stride, coeffs_256, s_64, ss_256);
    1252     4385540 :                         jnt_comp_avg_round_store_8x2_avx2(res,
    1253             :                             factor_256,
    1254             :                             offset_comp_avg_256,
    1255             :                             dst,
    1256             :                             dst_stride,
    1257             :                             dst8,
    1258             :                             dst8_stride);
    1259     4385510 :                         ss_256[0] = ss_256[1];
    1260     4385510 :                         ss_256[1] = ss_256[2];
    1261     4385510 :                         dst += 2 * dst_stride;
    1262     4385510 :                         dst8 += 2 * dst8_stride;
    1263     4385510 :                         y -= 2;
    1264     4385510 :                     } while (y);
    1265             :                 }
    1266             :                 else {
    1267             :                     do {
    1268     5559460 :                         src_ptr += 2 * src_stride;
    1269     5559460 :                         const __m256i res = y_convolve_6tap_8x2_avx2(
    1270             :                             src_ptr, src_stride, coeffs_256, s_64, ss_256);
    1271     5559510 :                         jnt_avg_round_store_8x2_sse2(res,  // NOTE(review): _sse2 suffix, yet res and the offset are __m256i - confirm the helper name
    1272             :                             offset_avg_256,
    1273             :                             dst,
    1274             :                             dst_stride,
    1275             :                             dst8,
    1276             :                             dst8_stride);
    1277     5559300 :                         ss_256[0] = ss_256[1];
    1278     5559300 :                         ss_256[1] = ss_256[2];
    1279     5559300 :                         dst += 2 * dst_stride;
    1280     5559300 :                         dst8 += 2 * dst8_stride;
    1281     5559300 :                         y -= 2;
    1282     5559300 :                     } while (y);
    1283             :                 }
    1284             :             }
    1285             :             else {
    1286             :                 do {
    1287    15063000 :                     src_ptr += 2 * src_stride;
    1288    15063000 :                     const __m256i res = y_convolve_6tap_8x2_avx2(
    1289             :                         src_ptr, src_stride, coeffs_256, s_64, ss_256);
    1290    15063600 :                     jnt_no_avg_round_store_8x2_avx2(
    1291             :                         res, offset_no_avg_256, dst, dst_stride);
    1292    15063100 :                     ss_256[0] = ss_256[1];
    1293    15063100 :                     ss_256[1] = ss_256[2];
    1294    15063100 :                     dst += 2 * dst_stride;
    1295    15063100 :                     y -= 2;
    1296    15063100 :                 } while (y);
    1297             :             }
    1298             :         }
    1299     4292500 :         else if (w == 16) {
    1300             :             __m128i s_128[6];
    1301             :             __m256i ss_256[6], r[2];
    1302             :             // Preload rows 0-4, sixteen bytes per row.
    1303     2423560 :             s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
    1304     2423560 :             s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
    1305     2423560 :             s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
    1306     2423560 :             s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
    1307     2423560 :             s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
    1308             : 
    1309             :             // Build row-pair registers: the earlier row goes to the lower 128
    1310             :             // bits and the later row to the upper 128 bits.
    1311     2423560 :             const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
    1312     2423560 :             const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
    1313     2423560 :             const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
    1314     4847120 :             const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
    1315             :             // ss_256[0..1]: unpacklo interleaves; ss_256[3..4]: unpackhi interleaves. Slots 2 and 5 are not written in this function.
    1316     2423560 :             ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
    1317     2423560 :             ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
    1318             : 
    1319     2423560 :             ss_256[3] = _mm256_unpackhi_epi8(src01, src12);
    1320     2423560 :             ss_256[4] = _mm256_unpackhi_epi8(src23, src34);
    1321             :             // Redundant: y still equals h on this path.
    1322     2423560 :             y = h;
    1323             : 
    1324     2423560 :             if (conv_params->do_average) {
    1325      953053 :                 if (conv_params->use_jnt_comp_avg) {
    1326             :                     do {
    1327     3759680 :                         src_ptr += 2 * src_stride;
    1328     3759680 :                         y_convolve_6tap_16x2_avx2(
    1329             :                             src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
    1330             :                         jnt_comp_avg_round_store_16x2_avx2(r,
    1331             :                             factor_256,
    1332             :                             offset_comp_avg_256,
    1333             :                             dst,
    1334             :                             dst_stride,
    1335             :                             dst8,
    1336             :                             dst8_stride);
    1337     3759650 :                         ss_256[0] = ss_256[1];
    1338     3759650 :                         ss_256[1] = ss_256[2];
    1339     3759650 :                         ss_256[3] = ss_256[4];
    1340     3759650 :                         ss_256[4] = ss_256[5];
    1341     3759650 :                         dst += 2 * dst_stride;
    1342     3759650 :                         dst8 += 2 * dst8_stride;
    1343     3759650 :                         y -= 2;
    1344     3759650 :                     } while (y);
    1345             :                 }
    1346             :                 else {
    1347             :                     do {
    1348     5641710 :                         src_ptr += 2 * src_stride;
    1349     5641710 :                         y_convolve_6tap_16x2_avx2(
    1350             :                             src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
    1351     5641570 :                         jnt_avg_round_store_16x2_avx2(r,
    1352             :                             offset_avg_256,
    1353             :                             dst,
    1354             :                             dst_stride,
    1355             :                             dst8,
    1356             :                             dst8_stride);
    1357     5641580 :                         ss_256[0] = ss_256[1];
    1358     5641580 :                         ss_256[1] = ss_256[2];
    1359     5641580 :                         ss_256[3] = ss_256[4];
    1360     5641580 :                         ss_256[4] = ss_256[5];
    1361     5641580 :                         dst += 2 * dst_stride;
    1362     5641580 :                         dst8 += 2 * dst8_stride;
    1363     5641580 :                         y -= 2;
    1364     5641580 :                     } while (y);
    1365             :                 }
    1366             :             }
    1367             :             else {
    1368             :                 do {
    1369    14654200 :                     src_ptr += 2 * src_stride;
    1370    14654200 :                     y_convolve_6tap_16x2_avx2(
    1371             :                         src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
    1372    14655500 :                     jnt_no_avg_round_store_16x2_avx2(
    1373             :                         r, offset_no_avg_256, dst, dst_stride);
    1374    14654300 :                     ss_256[0] = ss_256[1];
    1375    14654300 :                     ss_256[1] = ss_256[2];
    1376    14654300 :                     ss_256[3] = ss_256[4];
    1377    14654300 :                     ss_256[4] = ss_256[5];
    1378    14654300 :                     dst += 2 * dst_stride;
    1379    14654300 :                     y -= 2;
    1380    14654300 :                 } while (y);
    1381             :             }
    1382             :         }
    1383             :         else {
    1384             :             __m256i s_256[6], ss_256[6], tt_256[6], r[4];
    1385             : 
    1386     1868940 :             assert(!(w % 32));
    1387             :             // Walk the block in 32-column strips; s, d and d8 are the per-strip cursors.
    1388     1868940 :             x = 0;
    1389             :             do {
    1390     2304440 :                 const uint8_t *s = src_ptr + x;
    1391     2304440 :                 ConvBufType *d = dst + x;
    1392     2304440 :                 uint8_t *d8 = dst8 + x;
    1393             :                 // Preload rows 0-4 of this strip, 32 bytes per row.
    1394     2304440 :                 s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
    1395     2304440 :                 s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
    1396     2304440 :                 s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
    1397     2304440 :                 s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
    1398     2304440 :                 s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
    1399             :                 // ss_256 interleaves rows (0,1) and (2,3): slots 0-1 from unpacklo, slots 3-4 from unpackhi.
    1400     2304440 :                 ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
    1401     2304440 :                 ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
    1402     2304440 :                 ss_256[3] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
    1403     2304440 :                 ss_256[4] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
    1404             :                 // tt_256 does the same for rows (1,2) and (3,4).
    1405     2304440 :                 tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
    1406     2304440 :                 tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
    1407     2304440 :                 tt_256[3] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
    1408     2304440 :                 tt_256[4] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
    1409             :                 // Reset the row counter: the previous strip's loop left y at 0.
    1410     2304440 :                 y = h;
    1411             :                 // In every loop below, r[0..1] are stored at row d and r[2..3] at row d + dst_stride.
    1412     2304440 :                 if (conv_params->do_average) {
    1413      904976 :                     if (conv_params->use_jnt_comp_avg) {
    1414             :                         do {
    1415     4191810 :                             s += 2 * src_stride;
    1416     4191810 :                             y_convolve_6tap_32x2_avx2(s,
    1417             :                                 src_stride,
    1418             :                                 coeffs_256,
    1419             :                                 s_256,
    1420             :                                 ss_256,
    1421             :                                 tt_256,
    1422             :                                 r);
    1423             :                             jnt_comp_avg_round_store_32_avx2(
    1424             :                                 r, factor_256, offset_comp_avg_256, d, d8);
    1425     4191900 :                             jnt_comp_avg_round_store_32_avx2(
    1426             :                                 r + 2,
    1427             :                                 factor_256,
    1428             :                                 offset_comp_avg_256,
    1429     4191900 :                                 d + dst_stride,
    1430             :                                 d8 + dst8_stride);
    1431             :                             // Shift both windows for the next two rows.
    1432     4191800 :                             ss_256[0] = ss_256[1];
    1433     4191800 :                             ss_256[1] = ss_256[2];
    1434     4191800 :                             ss_256[3] = ss_256[4];
    1435     4191800 :                             ss_256[4] = ss_256[5];
    1436             : 
    1437     4191800 :                             tt_256[0] = tt_256[1];
    1438     4191800 :                             tt_256[1] = tt_256[2];
    1439     4191800 :                             tt_256[3] = tt_256[4];
    1440     4191800 :                             tt_256[4] = tt_256[5];
    1441     4191800 :                             d += 2 * dst_stride;
    1442     4191800 :                             d8 += 2 * dst8_stride;
    1443     4191800 :                             y -= 2;
    1444     4191800 :                         } while (y);
    1445             :                     }
    1446             :                     else {
    1447             :                         do {
    1448     6978040 :                             s += 2 * src_stride;
    1449     6978040 :                             y_convolve_6tap_32x2_avx2(s,
    1450             :                                 src_stride,
    1451             :                                 coeffs_256,
    1452             :                                 s_256,
    1453             :                                 ss_256,
    1454             :                                 tt_256,
    1455             :                                 r);
    1456     6978550 :                             jnt_avg_round_store_32_avx2(
    1457             :                                 r, offset_avg_256, d, d8);
    1458     6977890 :                             jnt_avg_round_store_32_avx2(r + 2,
    1459             :                                 offset_avg_256,
    1460     6977890 :                                 d + dst_stride,
    1461             :                                 d8 + dst8_stride);
    1462             : 
    1463     6977700 :                             ss_256[0] = ss_256[1];
    1464     6977700 :                             ss_256[1] = ss_256[2];
    1465     6977700 :                             ss_256[3] = ss_256[4];
    1466     6977700 :                             ss_256[4] = ss_256[5];
    1467             : 
    1468     6977700 :                             tt_256[0] = tt_256[1];
    1469     6977700 :                             tt_256[1] = tt_256[2];
    1470     6977700 :                             tt_256[3] = tt_256[4];
    1471     6977700 :                             tt_256[4] = tt_256[5];
    1472     6977700 :                             d += 2 * dst_stride;
    1473     6977700 :                             d8 += 2 * dst8_stride;
    1474     6977700 :                             y -= 2;
    1475     6977700 :                         } while (y);
    1476             :                     }
    1477             :                 }
    1478             :                 else {
    1479             :                     do {
    1480    17348300 :                         s += 2 * src_stride;
    1481    17348300 :                         y_convolve_6tap_32x2_avx2(s,
    1482             :                             src_stride,
    1483             :                             coeffs_256,
    1484             :                             s_256,
    1485             :                             ss_256,
    1486             :                             tt_256,
    1487             :                             r);
    1488    17352000 :                         jnt_no_avg_round_store_32_avx2(r, offset_no_avg_256, d);
    1489    17351000 :                         jnt_no_avg_round_store_32_avx2(
    1490    17351000 :                             r + 2, offset_no_avg_256, d + dst_stride);
    1491             : 
    1492    17349400 :                         ss_256[0] = ss_256[1];
    1493    17349400 :                         ss_256[1] = ss_256[2];
    1494    17349400 :                         ss_256[3] = ss_256[4];
    1495    17349400 :                         ss_256[4] = ss_256[5];
    1496             : 
    1497    17349400 :                         tt_256[0] = tt_256[1];
    1498    17349400 :                         tt_256[1] = tt_256[2];
    1499    17349400 :                         tt_256[3] = tt_256[4];
    1500    17349400 :                         tt_256[4] = tt_256[5];
    1501    17349400 :                         d += 2 * dst_stride;
    1502    17349400 :                         y -= 2;
    1503    17349400 :                     } while (y);
    1504             :                 }
    1505             :                 // Advance to the next 32-column strip.
    1506     2305160 :                 x += 32;
    1507     2305160 :             } while (x < w);
    1508             :         }
    1509             :     }
    1510     7754270 : }
    1511             : 
    1512     3388010 : static void jnt_convolve_y_8tap_avx2(
    1513             :     const uint8_t *const src, const int32_t src_stride, uint8_t *dst8,
    1514             :     const int32_t dst8_stride, const int32_t w, const int32_t h,
    1515             :     const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
    1516             :     const ConvolveParams *const conv_params) {
    1517     3388010 :     const uint8_t *src_ptr = src - 3 * src_stride;
    1518     3388010 :     const int32_t dst_stride = conv_params->dst_stride;
    1519     3388010 :     const int32_t round_0 = 3;
    1520     3388010 :     const int32_t round_1 = COMPOUND_ROUND1_BITS;
    1521     3388010 :     const int32_t bits = FILTER_BITS - round_0;
    1522     3388010 :     const int32_t bd = 8;
    1523     3388010 :     const int32_t offset_bits = bd + 2 * FILTER_BITS - round_0;
    1524     3388010 :     const int32_t round_offset =
    1525     3388010 :         (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
    1526     3388010 :     const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1;
    1527     3388010 :     const int32_t offset_comp_avg =
    1528     3388010 :         round_offset * conv_params->bck_offset +
    1529     3388010 :         (1 << (round_bits + DIST_PRECISION_BITS - 1)) -
    1530     3388010 :         (round_offset << DIST_PRECISION_BITS);
    1531     3388010 :     const int16_t offset_avg = (1 << (FILTER_BITS - 1)) +
    1532     3388010 :         (1 << (round_1 - bits - 2)) -
    1533     3388010 :         (round_offset << (round_1 - bits - 1));
    1534     3388010 :     const int16_t offset_no_avg =
    1535     3388010 :         (round_offset << (round_1 - bits - 1)) + (1 << (round_1 - bits - 2));
    1536     3388010 :     const int32_t factor =
    1537     3388010 :         conv_params->fwd_offset | (conv_params->bck_offset << 16);
    1538     3388010 :     const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);
    1539     3388010 :     const __m128i offset_avg_128 = _mm_set1_epi16(offset_avg);
    1540     6776020 :     const __m128i offset_no_avg_128 = _mm_set1_epi16(offset_no_avg);
    1541     3388010 :     const __m128i factor_128 = _mm_set1_epi32(factor);
    1542     3388010 :     const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
    1543     3388010 :     const __m256i offset_avg_256 = _mm256_set1_epi16(offset_avg);
    1544     6776020 :     const __m256i offset_no_avg_256 = _mm256_set1_epi16(offset_no_avg);
    1545     3388010 :     const __m256i factor_256 = _mm256_set1_epi32(factor);
    1546     3388010 :     ConvBufType *dst = conv_params->dst;
    1547             :     int32_t x;
    1548     3388010 :     int32_t y = h;
    1549             :     __m128i coeffs_128[4];
    1550             :     __m256i coeffs_256[4];
    1551             : 
    1552     3388010 :     if (w <= 4) {
    1553       11132 :         prepare_half_coeffs_8tap_ssse3(
    1554             :             filter_params_y, subpel_y_q4, coeffs_128);
    1555             : 
    1556       11132 :         y = h;
    1557             : 
    1558       11132 :         if (w == 2) {
    1559             :             __m128i s_16[8], ss_128[4];
    1560             : 
    1561           0 :             s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 0 * src_stride));
    1562           0 :             s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 1 * src_stride));
    1563           0 :             s_16[2] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
    1564           0 :             s_16[3] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 3 * src_stride));
    1565           0 :             s_16[4] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 4 * src_stride));
    1566           0 :             s_16[5] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 5 * src_stride));
    1567           0 :             s_16[6] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 6 * src_stride));
    1568             : 
    1569           0 :             const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
    1570           0 :             const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
    1571           0 :             const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
    1572           0 :             const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
    1573           0 :             const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
    1574           0 :             const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[6]);
    1575             : 
    1576           0 :             ss_128[0] = _mm_unpacklo_epi8(src01, src12);
    1577           0 :             ss_128[1] = _mm_unpacklo_epi8(src23, src34);
    1578           0 :             ss_128[2] = _mm_unpacklo_epi8(src45, src56);
    1579             : 
    1580           0 :             if (conv_params->do_average) {
    1581           0 :                 if (conv_params->use_jnt_comp_avg) {
    1582             :                     do {
    1583           0 :                         const __m128i res = y_convolve_8tap_2x2_ssse3(
    1584             :                             src_ptr, src_stride, coeffs_128, s_16, ss_128);
    1585           0 :                         jnt_comp_avg_round_store_2x2_sse2(res,
    1586             :                             factor_128,
    1587             :                             offset_comp_avg_128,
    1588             :                             dst,
    1589             :                             dst_stride,
    1590             :                             dst8,
    1591             :                             dst8_stride);
    1592           0 :                         ss_128[0] = ss_128[1];
    1593           0 :                         ss_128[1] = ss_128[2];
    1594           0 :                         ss_128[2] = ss_128[3];
    1595           0 :                         src_ptr += 2 * src_stride;
    1596           0 :                         dst += 2 * dst_stride;
    1597           0 :                         dst8 += 2 * dst8_stride;
    1598           0 :                         y -= 2;
    1599           0 :                     } while (y);
    1600             :                 }
    1601             :                 else {
    1602             :                     do {
    1603           0 :                         const __m128i res = y_convolve_8tap_2x2_ssse3(
    1604             :                             src_ptr, src_stride, coeffs_128, s_16, ss_128);
    1605           0 :                         jnt_avg_round_store_2x2_sse2(res,
    1606             :                             offset_avg_128,
    1607             :                             dst,
    1608             :                             dst_stride,
    1609             :                             dst8,
    1610             :                             dst8_stride);
    1611           0 :                         ss_128[0] = ss_128[1];
    1612           0 :                         ss_128[1] = ss_128[2];
    1613           0 :                         ss_128[2] = ss_128[3];
    1614           0 :                         src_ptr += 2 * src_stride;
    1615           0 :                         dst += 2 * dst_stride;
    1616           0 :                         dst8 += 2 * dst8_stride;
    1617           0 :                         y -= 2;
    1618           0 :                     } while (y);
    1619             :                 }
    1620             :             }
    1621             :             else {
    1622             :                 do {
    1623           0 :                     const __m128i res = y_convolve_8tap_2x2_ssse3(
    1624             :                         src_ptr, src_stride, coeffs_128, s_16, ss_128);
    1625           0 :                     jnt_no_avg_round_store_2x2_sse2(
    1626             :                         res, offset_no_avg_128, dst, dst_stride);
    1627           0 :                     ss_128[0] = ss_128[1];
    1628           0 :                     ss_128[1] = ss_128[2];
    1629           0 :                     ss_128[2] = ss_128[3];
    1630           0 :                     src_ptr += 2 * src_stride;
    1631           0 :                     dst += 2 * dst_stride;
    1632           0 :                     y -= 2;
    1633           0 :                 } while (y);
    1634             :             }
    1635             :         }
    1636             :         else {
    1637             :             __m128i s_32[8], ss_128[4];
    1638             : 
    1639       11132 :             assert(w == 4);
    1640             : 
    1641       11132 :             s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 0 * src_stride));
    1642       11132 :             s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 1 * src_stride));
    1643       11132 :             s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 2 * src_stride));
    1644       11132 :             s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 3 * src_stride));
    1645       11132 :             s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 4 * src_stride));
    1646       11132 :             s_32[5] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 5 * src_stride));
    1647       11132 :             s_32[6] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 6 * src_stride));
    1648             : 
    1649       11132 :             const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
    1650       11132 :             const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
    1651       11132 :             const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
    1652       11132 :             const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
    1653       11132 :             const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
    1654       22264 :             const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);
    1655             : 
    1656       11132 :             ss_128[0] = _mm_unpacklo_epi8(src01, src12);
    1657       11132 :             ss_128[1] = _mm_unpacklo_epi8(src23, src34);
    1658       11132 :             ss_128[2] = _mm_unpacklo_epi8(src45, src56);
    1659             : 
    1660       11132 :             if (conv_params->do_average) {
    1661        4458 :                 if (conv_params->use_jnt_comp_avg) {
    1662             :                     do {
    1663       13464 :                         const __m128i res = y_convolve_8tap_4x2_ssse3(
    1664             :                             src_ptr, src_stride, coeffs_128, s_32, ss_128);
    1665       13464 :                         jnt_comp_avg_round_store_4x2_sse2(res,
    1666             :                             factor_128,
    1667             :                             offset_comp_avg_128,
    1668             :                             dst,
    1669             :                             dst_stride,
    1670             :                             dst8,
    1671             :                             dst8_stride);
    1672       13464 :                         ss_128[0] = ss_128[1];
    1673       13464 :                         ss_128[1] = ss_128[2];
    1674       13464 :                         ss_128[2] = ss_128[3];
    1675       13464 :                         src_ptr += 2 * src_stride;
    1676       13464 :                         dst += 2 * dst_stride;
    1677       13464 :                         dst8 += 2 * dst8_stride;
    1678       13464 :                         y -= 2;
    1679       13464 :                     } while (y);
    1680             :                 }
    1681             :                 else {
    1682             :                     do {
    1683       12984 :                         const __m128i res = y_convolve_8tap_4x2_ssse3(
    1684             :                             src_ptr, src_stride, coeffs_128, s_32, ss_128);
    1685       12984 :                         jnt_avg_round_store_4x2_sse2(res,
    1686             :                             offset_avg_128,
    1687             :                             dst,
    1688             :                             dst_stride,
    1689             :                             dst8,
    1690             :                             dst8_stride);
    1691       12984 :                         ss_128[0] = ss_128[1];
    1692       12984 :                         ss_128[1] = ss_128[2];
    1693       12984 :                         ss_128[2] = ss_128[3];
    1694       12984 :                         src_ptr += 2 * src_stride;
    1695       12984 :                         dst += 2 * dst_stride;
    1696       12984 :                         dst8 += 2 * dst8_stride;
    1697       12984 :                         y -= 2;
    1698       12984 :                     } while (y);
    1699             :                 }
    1700             :             }
    1701             :             else {
    1702             :                 do {
    1703       39664 :                     const __m128i res = y_convolve_8tap_4x2_ssse3(
    1704             :                         src_ptr, src_stride, coeffs_128, s_32, ss_128);
    1705       39664 :                     jnt_no_avg_round_store_4x2_sse2(
    1706             :                         res, offset_no_avg_128, dst, dst_stride);
    1707       39664 :                     ss_128[0] = ss_128[1];
    1708       39664 :                     ss_128[1] = ss_128[2];
    1709       39664 :                     ss_128[2] = ss_128[3];
    1710       39664 :                     src_ptr += 2 * src_stride;
    1711       39664 :                     dst += 2 * dst_stride;
    1712       39664 :                     y -= 2;
    1713       39664 :                 } while (y);
    1714             :             }
    1715             :         }
    1716             :     }
    1717             :     else {
    1718     3376880 :         prepare_half_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
    1719             : 
    1720     3377070 :         if (w == 8) {
    1721             :             __m128i s_64[8];
    1722             :             __m256i ss_256[4];
    1723             : 
    1724     1412110 :             s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
    1725     1412110 :             s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
    1726     1412110 :             s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
    1727     1412110 :             s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
    1728     1412110 :             s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
    1729     1412110 :             s_64[5] = _mm_loadl_epi64((__m128i *)(src_ptr + 5 * src_stride));
    1730     1412110 :             s_64[6] = _mm_loadl_epi64((__m128i *)(src_ptr + 6 * src_stride));
    1731             : 
    1732             :             // Load lines a and b. Line a to lower 128, line b to upper
    1733             :             // 128
    1734     1412110 :             const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
    1735     1412110 :             const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
    1736     1412110 :             const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
    1737     1412110 :             const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
    1738     1412110 :             const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
    1739     2824220 :             const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[6]);
    1740             : 
    1741     1412110 :             ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
    1742     1412110 :             ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
    1743     1412110 :             ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
    1744             : 
    1745     1412110 :             y = h;
    1746             : 
    1747     1412110 :             if (conv_params->do_average) {
    1748      576251 :                 if (conv_params->use_jnt_comp_avg) {
    1749             :                     do {
    1750     1927600 :                         const __m256i res = y_convolve_8tap_8x2_avx2(
    1751             :                             src_ptr, src_stride, coeffs_256, s_64, ss_256);
    1752     1927600 :                         jnt_comp_avg_round_store_8x2_avx2(res,
    1753             :                             factor_256,
    1754             :                             offset_comp_avg_256,
    1755             :                             dst,
    1756             :                             dst_stride,
    1757             :                             dst8,
    1758             :                             dst8_stride);
    1759     1927580 :                         ss_256[0] = ss_256[1];
    1760     1927580 :                         ss_256[1] = ss_256[2];
    1761     1927580 :                         ss_256[2] = ss_256[3];
    1762     1927580 :                         src_ptr += 2 * src_stride;
    1763     1927580 :                         dst += 2 * dst_stride;
    1764     1927580 :                         dst8 += 2 * dst8_stride;
    1765     1927580 :                         y -= 2;
    1766     1927580 :                     } while (y);
    1767             :                 }
    1768             :                 else {
    1769             :                     do {
    1770     2456560 :                         const __m256i res = y_convolve_8tap_8x2_avx2(
    1771             :                             src_ptr, src_stride, coeffs_256, s_64, ss_256);
    1772     2456570 :                         jnt_avg_round_store_8x2_sse2(res,
    1773             :                             offset_avg_256,
    1774             :                             dst,
    1775             :                             dst_stride,
    1776             :                             dst8,
    1777             :                             dst8_stride);
    1778     2456550 :                         ss_256[0] = ss_256[1];
    1779     2456550 :                         ss_256[1] = ss_256[2];
    1780     2456550 :                         ss_256[2] = ss_256[3];
    1781     2456550 :                         src_ptr += 2 * src_stride;
    1782     2456550 :                         dst += 2 * dst_stride;
    1783     2456550 :                         dst8 += 2 * dst8_stride;
    1784     2456550 :                         y -= 2;
    1785     2456550 :                     } while (y);
    1786             :                 }
    1787             :             }
    1788             :             else {
    1789             :                 do {
    1790     6360280 :                     const __m256i res = y_convolve_8tap_8x2_avx2(
    1791             :                         src_ptr, src_stride, coeffs_256, s_64, ss_256);
    1792     6360390 :                     jnt_no_avg_round_store_8x2_avx2(
    1793             :                         res, offset_no_avg_256, dst, dst_stride);
    1794     6360300 :                     ss_256[0] = ss_256[1];
    1795     6360300 :                     ss_256[1] = ss_256[2];
    1796     6360300 :                     ss_256[2] = ss_256[3];
    1797     6360300 :                     src_ptr += 2 * src_stride;
    1798     6360300 :                     dst += 2 * dst_stride;
    1799     6360300 :                     y -= 2;
    1800     6360300 :                 } while (y);
    1801             :             }
    1802             :         }
    1803     1964960 :         else if (w == 16) {
    1804             :             __m128i s_128[8];
    1805             :             __m256i ss_256[8], r[2];
    1806             : 
    1807     1054100 :             s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
    1808     1054100 :             s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
    1809     1054100 :             s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
    1810     1054100 :             s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
    1811     1054100 :             s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
    1812     1054100 :             s_128[5] = _mm_loadu_si128((__m128i *)(src_ptr + 5 * src_stride));
    1813     1054100 :             s_128[6] = _mm_loadu_si128((__m128i *)(src_ptr + 6 * src_stride));
    1814             : 
    1815             :             // Load lines a and b. Line a to lower 128, line b to upper
    1816             :             // 128
    1817     1054100 :             const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
    1818     1054100 :             const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
    1819     1054100 :             const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
    1820     1054100 :             const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
    1821     1054100 :             const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
    1822     2108190 :             const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[6]);
    1823             : 
    1824     1054100 :             ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
    1825     1054100 :             ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
    1826     1054100 :             ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
    1827             : 
    1828     1054100 :             ss_256[4] = _mm256_unpackhi_epi8(src01, src12);
    1829     1054100 :             ss_256[5] = _mm256_unpackhi_epi8(src23, src34);
    1830     1054100 :             ss_256[6] = _mm256_unpackhi_epi8(src45, src56);
    1831             : 
    1832     1054100 :             y = h;
    1833             : 
    1834     1054100 :             if (conv_params->do_average) {
    1835      424222 :                 if (conv_params->use_jnt_comp_avg) {
    1836             :                     do {
    1837     1742750 :                         y_convolve_8tap_16x2_avx2(
    1838             :                             src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
    1839             :                         jnt_comp_avg_round_store_16x2_avx2(r,
    1840             :                             factor_256,
    1841             :                             offset_comp_avg_256,
    1842             :                             dst,
    1843             :                             dst_stride,
    1844             :                             dst8,
    1845             :                             dst8_stride);
    1846     1742760 :                         ss_256[0] = ss_256[1];
    1847     1742760 :                         ss_256[1] = ss_256[2];
    1848     1742760 :                         ss_256[2] = ss_256[3];
    1849     1742760 :                         ss_256[4] = ss_256[5];
    1850     1742760 :                         ss_256[5] = ss_256[6];
    1851     1742760 :                         ss_256[6] = ss_256[7];
    1852     1742760 :                         src_ptr += 2 * src_stride;
    1853     1742760 :                         dst += 2 * dst_stride;
    1854     1742760 :                         dst8 += 2 * dst8_stride;
    1855     1742760 :                         y -= 2;
    1856     1742760 :                     } while (y);
    1857             :                 }
    1858             :                 else {
    1859             :                     do {
    1860     2625770 :                         y_convolve_8tap_16x2_avx2(
    1861             :                             src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
    1862     2625800 :                         jnt_avg_round_store_16x2_avx2(r,
    1863             :                             offset_avg_256,
    1864             :                             dst,
    1865             :                             dst_stride,
    1866             :                             dst8,
    1867             :                             dst8_stride);
    1868     2625760 :                         ss_256[0] = ss_256[1];
    1869     2625760 :                         ss_256[1] = ss_256[2];
    1870     2625760 :                         ss_256[2] = ss_256[3];
    1871     2625760 :                         ss_256[4] = ss_256[5];
    1872     2625760 :                         ss_256[5] = ss_256[6];
    1873     2625760 :                         ss_256[6] = ss_256[7];
    1874     2625760 :                         src_ptr += 2 * src_stride;
    1875     2625760 :                         dst += 2 * dst_stride;
    1876     2625760 :                         dst8 += 2 * dst8_stride;
    1877     2625760 :                         y -= 2;
    1878     2625760 :                     } while (y);
    1879             :                 }
    1880             :             }
    1881             :             else {
    1882             :                 do {
    1883     6517960 :                     y_convolve_8tap_16x2_avx2(
    1884             :                         src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
    1885     6518300 :                     jnt_no_avg_round_store_16x2_avx2(
    1886             :                         r, offset_no_avg_256, dst, dst_stride);
    1887     6517960 :                     ss_256[0] = ss_256[1];
    1888     6517960 :                     ss_256[1] = ss_256[2];
    1889     6517960 :                     ss_256[2] = ss_256[3];
    1890     6517960 :                     ss_256[4] = ss_256[5];
    1891     6517960 :                     ss_256[5] = ss_256[6];
    1892     6517960 :                     ss_256[6] = ss_256[7];
    1893     6517960 :                     src_ptr += 2 * src_stride;
    1894     6517960 :                     dst += 2 * dst_stride;
    1895     6517960 :                     y -= 2;
    1896     6517960 :                 } while (y);
    1897             :             }
    1898             :         }
    1899             :         else {
    1900             :             __m256i s_256[8], ss_256[8], tt_256[8], r[4];
    1901             : 
    1902      910864 :             assert(!(w % 32));
    1903             : 
    1904      910864 :             x = 0;
    1905             :             do {
    1906     1142520 :                 const uint8_t *s = src_ptr + x;
    1907     1142520 :                 ConvBufType *d = dst + x;
    1908     1142520 :                 uint8_t *d8 = dst8 + x;
    1909             : 
    1910     1142520 :                 s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
    1911     1142520 :                 s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
    1912     1142520 :                 s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
    1913     1142520 :                 s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
    1914     1142520 :                 s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
    1915     1142520 :                 s_256[5] = _mm256_loadu_si256((__m256i *)(s + 5 * src_stride));
    1916     1142520 :                 s_256[6] = _mm256_loadu_si256((__m256i *)(s + 6 * src_stride));
    1917             : 
    1918     1142520 :                 ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
    1919     1142520 :                 ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
    1920     1142520 :                 ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
    1921     1142520 :                 ss_256[4] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
    1922     1142520 :                 ss_256[5] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
    1923     1142520 :                 ss_256[6] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
    1924             : 
    1925     1142520 :                 tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
    1926     1142520 :                 tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
    1927     1142520 :                 tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[6]);
    1928     1142520 :                 tt_256[4] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
    1929     1142520 :                 tt_256[5] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
    1930     1142520 :                 tt_256[6] = _mm256_unpackhi_epi8(s_256[5], s_256[6]);
    1931             : 
    1932     1142520 :                 y = h;
    1933             : 
    1934     1142520 :                 if (conv_params->do_average) {
    1935      460332 :                     if (conv_params->use_jnt_comp_avg) {
    1936             :                         do {
    1937     2229770 :                             y_convolve_8tap_32x2_avx2(s,
    1938             :                                 src_stride,
    1939             :                                 coeffs_256,
    1940             :                                 s_256,
    1941             :                                 ss_256,
    1942             :                                 tt_256,
    1943             :                                 r);
    1944             :                             jnt_comp_avg_round_store_32_avx2(
    1945             :                                 r, factor_256, offset_comp_avg_256, d, d8);
    1946     2229750 :                             jnt_comp_avg_round_store_32_avx2(
    1947             :                                 r + 2,
    1948             :                                 factor_256,
    1949             :                                 offset_comp_avg_256,
    1950     2229750 :                                 d + dst_stride,
    1951             :                                 d8 + dst8_stride);
    1952             : 
    1953     2229770 :                             ss_256[0] = ss_256[1];
    1954     2229770 :                             ss_256[1] = ss_256[2];
    1955     2229770 :                             ss_256[2] = ss_256[3];
    1956     2229770 :                             ss_256[4] = ss_256[5];
    1957     2229770 :                             ss_256[5] = ss_256[6];
    1958     2229770 :                             ss_256[6] = ss_256[7];
    1959             : 
    1960     2229770 :                             tt_256[0] = tt_256[1];
    1961     2229770 :                             tt_256[1] = tt_256[2];
    1962     2229770 :                             tt_256[2] = tt_256[3];
    1963     2229770 :                             tt_256[4] = tt_256[5];
    1964     2229770 :                             tt_256[5] = tt_256[6];
    1965     2229770 :                             tt_256[6] = tt_256[7];
    1966     2229770 :                             s += 2 * src_stride;
    1967     2229770 :                             d += 2 * dst_stride;
    1968     2229770 :                             d8 += 2 * dst8_stride;
    1969     2229770 :                             y -= 2;
    1970     2229770 :                         } while (y);
    1971             :                     }
    1972             :                     else {
    1973             :                         do {
    1974     3744240 :                             y_convolve_8tap_32x2_avx2(s,
    1975             :                                 src_stride,
    1976             :                                 coeffs_256,
    1977             :                                 s_256,
    1978             :                                 ss_256,
    1979             :                                 tt_256,
    1980             :                                 r);
    1981     3744290 :                             jnt_avg_round_store_32_avx2(
    1982             :                                 r, offset_avg_256, d, d8);
    1983     3744170 :                             jnt_avg_round_store_32_avx2(r + 2,
    1984             :                                 offset_avg_256,
    1985     3744170 :                                 d + dst_stride,
    1986             :                                 d8 + dst8_stride);
    1987             : 
    1988     3744130 :                             ss_256[0] = ss_256[1];
    1989     3744130 :                             ss_256[1] = ss_256[2];
    1990     3744130 :                             ss_256[2] = ss_256[3];
    1991     3744130 :                             ss_256[4] = ss_256[5];
    1992     3744130 :                             ss_256[5] = ss_256[6];
    1993     3744130 :                             ss_256[6] = ss_256[7];
    1994             : 
    1995     3744130 :                             tt_256[0] = tt_256[1];
    1996     3744130 :                             tt_256[1] = tt_256[2];
    1997     3744130 :                             tt_256[2] = tt_256[3];
    1998     3744130 :                             tt_256[4] = tt_256[5];
    1999     3744130 :                             tt_256[5] = tt_256[6];
    2000     3744130 :                             tt_256[6] = tt_256[7];
    2001     3744130 :                             s += 2 * src_stride;
    2002     3744130 :                             d += 2 * dst_stride;
    2003     3744130 :                             d8 += 2 * dst8_stride;
    2004     3744130 :                             y -= 2;
    2005     3744130 :                         } while (y);
    2006             :                     }
    2007             :                 }
    2008             :                 else {
    2009             :                     do {
    2010     8907200 :                         y_convolve_8tap_32x2_avx2(s,
    2011             :                             src_stride,
    2012             :                             coeffs_256,
    2013             :                             s_256,
    2014             :                             ss_256,
    2015             :                             tt_256,
    2016             :                             r);
    2017     8908280 :                         jnt_no_avg_round_store_32_avx2(r, offset_no_avg_256, d);
    2018     8907840 :                         jnt_no_avg_round_store_32_avx2(
    2019     8907840 :                             r + 2, offset_no_avg_256, d + dst_stride);
    2020             : 
    2021     8907480 :                         ss_256[0] = ss_256[1];
    2022     8907480 :                         ss_256[1] = ss_256[2];
    2023     8907480 :                         ss_256[2] = ss_256[3];
    2024     8907480 :                         ss_256[4] = ss_256[5];
    2025     8907480 :                         ss_256[5] = ss_256[6];
    2026     8907480 :                         ss_256[6] = ss_256[7];
    2027             : 
    2028     8907480 :                         tt_256[0] = tt_256[1];
    2029     8907480 :                         tt_256[1] = tt_256[2];
    2030     8907480 :                         tt_256[2] = tt_256[3];
    2031     8907480 :                         tt_256[4] = tt_256[5];
    2032     8907480 :                         tt_256[5] = tt_256[6];
    2033     8907480 :                         tt_256[6] = tt_256[7];
    2034     8907480 :                         s += 2 * src_stride;
    2035     8907480 :                         d += 2 * dst_stride;
    2036     8907480 :                         y -= 2;
    2037     8907480 :                     } while (y);
    2038             :                 }
    2039             : 
    2040     1142700 :                 x += 32;
    2041     1142700 :             } while (x < w);
    2042             :         }
    2043             :     }
    2044     3388370 : }
    2045             : 
    2046             : typedef void(*jnt_convolve_y_tap_func)(
    2047             :     const uint8_t *const src, const int32_t src_stride, uint8_t *dst8,
    2048             :     const int32_t dst8_stride, const int32_t w, const int32_t h,
    2049             :     const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
    2050             :     const ConvolveParams *const conv_params);
    2051             : 
    2052    31524800 : void eb_av1_jnt_convolve_y_avx2(const uint8_t *src, int32_t src_stride,
    2053             :     uint8_t *dst8, int32_t dst8_stride, int32_t w,
    2054             :     int32_t h, InterpFilterParams *filter_params_x,
    2055             :     InterpFilterParams *filter_params_y,
    2056             :     const int32_t subpel_x_q4,
    2057             :     const int32_t subpel_y_q4,
    2058             :     ConvolveParams *conv_params) {
    2059             :     static const jnt_convolve_y_tap_func
    2060             :         jnt_convolve_y_tap_func_table[MAX_FILTER_TAP + 1] = {
    2061             :             NULL,
    2062             :             NULL,
    2063             :             jnt_convolve_y_2tap_avx2,
    2064             :             NULL,
    2065             :             jnt_convolve_y_4tap_avx2,
    2066             :             NULL,
    2067             :             jnt_convolve_y_6tap_avx2,
    2068             :             NULL,
    2069             :             jnt_convolve_y_8tap_avx2 };
    2070    31524800 :     const int32_t tap_y = get_convolve_tap(filter_params_y->filter_ptr);
    2071             : 
    2072             :     (void)filter_params_x;
    2073             :     (void)subpel_x_q4;
    2074             : 
    2075    31521300 :     assert(conv_params->round_0 == 3);
    2076    31522000 :     assert(conv_params->round_1 == COMPOUND_ROUND1_BITS);
    2077             : 
    2078    31522000 :     jnt_convolve_y_tap_func_table[tap_y](src,
    2079             :         src_stride,
    2080             :         dst8,
    2081             :         dst8_stride,
    2082             :         w,
    2083             :         h,
    2084             :         filter_params_y,
    2085             :         subpel_y_q4,
    2086             :         conv_params);
    2087    31544900 : }
    2088             : 
    2089             : // =============================================================================
    2090             : 
    2091    12106500 : static INLINE __m256i unpack_weights_avx2(ConvolveParams *conv_params) {
    2092    12106500 :     const int32_t w0 = conv_params->fwd_offset;
    2093    12106500 :     const int32_t w1 = conv_params->bck_offset;
    2094    12106500 :     const __m256i wt0 = _mm256_set1_epi16(w0);
    2095    24213100 :     const __m256i wt1 = _mm256_set1_epi16(w1);
    2096    12106500 :     const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1);
    2097    12106500 :     return wt;
    2098             : }
    2099             : 
    2100    11208500 : static INLINE __m256i load_line2_avx2(const void *a, const void *b) {
    2101    44834200 :     return _mm256_permute2x128_si256(
    2102             :         _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)a)),
    2103             :         _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)b)), 0x20);
    2104             : }
    2105             : 
    2106    12107200 : void eb_av1_jnt_convolve_2d_copy_avx2(
    2107             :     const uint8_t *src, int32_t src_stride, uint8_t *dst0, int32_t dst_stride0,
    2108             :     int32_t w, int32_t h, InterpFilterParams *filter_params_x,
    2109             :     InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
    2110             :     const int32_t subpel_y_q4, ConvolveParams *conv_params) {
    2111    12107200 :     const int32_t bd = 8;
    2112    12107200 :     ConvBufType *dst = conv_params->dst;
    2113    12107200 :     int32_t dst_stride = conv_params->dst_stride;
    2114             :     (void)filter_params_x;
    2115             :     (void)filter_params_y;
    2116             :     (void)subpel_x_q4;
    2117             :     (void)subpel_y_q4;
    2118             : 
    2119    12107200 :     const int32_t bits =
    2120    12107200 :         FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
    2121    12107200 :     const __m128i left_shift = _mm_cvtsi32_si128(bits);
    2122    12107200 :     const int32_t do_average = conv_params->do_average;
    2123    12107200 :     const int32_t use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
    2124    12107200 :     const __m256i wt = unpack_weights_avx2(conv_params);
    2125    12109500 :     const __m256i zero = _mm256_setzero_si256();
    2126             : 
    2127    12109500 :     const int32_t offset_0 =
    2128    12109500 :         bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
    2129    12109500 :     const int32_t offset = (1 << offset_0) + (1 << (offset_0 - 1));
    2130    12109500 :     const __m256i offset_const = _mm256_set1_epi16(offset);
    2131    12109500 :     const int32_t rounding_shift =
    2132    12109500 :         2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
    2133    12109500 :     const __m256i rounding_const =
    2134    12109500 :         _mm256_set1_epi16((1 << rounding_shift) >> 1);
    2135             :     int32_t i, j;
    2136             : 
    2137    12109500 :     if (!(w % 16)) {
    2138   183824000 :         for (i = 0; i < h; i += 1) {
    2139   507682000 :             for (j = 0; j < w; j += 16) {
    2140   662560000 :                 const __m256i src_16bit = _mm256_cvtepu8_epi16(
    2141   331280000 :                     _mm_loadu_si128((__m128i *)(&src[i * src_stride + j])));
    2142             : 
    2143   331280000 :                 const __m256i res = _mm256_sll_epi16(src_16bit, left_shift);
    2144   331280000 :                 const __m256i res_unsigned =
    2145   331280000 :                     _mm256_add_epi16(res, offset_const);
    2146             : 
    2147   331280000 :                 if (do_average) {
    2148   117196000 :                     const __m256i data_ref_0 = _mm256_loadu_si256(
    2149   117196000 :                         (__m256i *)(&dst[i * dst_stride + j]));
    2150             : 
    2151   117196000 :                     const __m256i comp_avg_res = comp_avg(
    2152             :                         &data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
    2153             : 
    2154             :                     const __m256i round_result =
    2155   117185000 :                         convolve_rounding(&comp_avg_res,
    2156             :                             &offset_const,
    2157             :                             &rounding_const,
    2158             :                             rounding_shift);
    2159             : 
    2160             :                     const __m256i res_8 =
    2161   117145000 :                         _mm256_packus_epi16(round_result, round_result);
    2162   117145000 :                     const __m256i res_0 = _mm256_permute4x64_epi64(res_8, 0xD8);
    2163             : 
    2164   117145000 :                     _mm_storeu_si128((__m128i *)(&dst0[i * dst_stride0 + j]),
    2165             :                         _mm256_castsi256_si128(res_0));
    2166             :                 }
    2167             :                 else {
    2168   214084000 :                     _mm256_storeu_si256((__m256i *)(&dst[i * dst_stride + j]),
    2169             :                         res_unsigned);
    2170             :                 }
    2171             :             }
    2172             :         }
    2173             :     }
    2174     4688120 :     else if (!(w % 4)) {
    2175    38177000 :         for (i = 0; i < h; i += 2) {
    2176    66975300 :             for (j = 0; j < w; j += 8) {
    2177             :                 const __m128i src_row_0 =
    2178    33488200 :                     _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j]));
    2179    33488200 :                 const __m128i src_row_1 = _mm_loadl_epi64(
    2180    33488200 :                     (__m128i *)(&src[i * src_stride + j + src_stride]));
    2181             :                 // since not all compilers yet support _mm256_set_m128i()
    2182    66976400 :                 const __m256i src_10 = _mm256_insertf128_si256(
    2183             :                     _mm256_castsi128_si256(src_row_0), src_row_1, 1);
    2184             : 
    2185    33488200 :                 const __m256i src_16bit = _mm256_unpacklo_epi8(src_10, zero);
    2186             : 
    2187    33488200 :                 const __m256i res = _mm256_sll_epi16(src_16bit, left_shift);
    2188             : 
    2189    33488200 :                 const __m256i res_unsigned =
    2190    33488200 :                     _mm256_add_epi16(res, offset_const);
    2191             : 
    2192             :                 // Accumulate values into the destination buffer
    2193    33488200 :                 if (do_average) {
    2194    11208600 :                     const __m256i data_ref_0 =
    2195    11208400 :                         load_line2_avx2(&dst[i * dst_stride + j],
    2196    11208400 :                             &dst[i * dst_stride + j + dst_stride]);
    2197             : 
    2198    11208600 :                     const __m256i comp_avg_res = comp_avg(
    2199             :                         &data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
    2200             : 
    2201             :                     const __m256i round_result =
    2202    11208100 :                         convolve_rounding(&comp_avg_res,
    2203             :                             &offset_const,
    2204             :                             &rounding_const,
    2205             :                             rounding_shift);
    2206             : 
    2207             :                     const __m256i res_8 =
    2208    11207600 :                         _mm256_packus_epi16(round_result, round_result);
    2209    11207600 :                     const __m128i res_0 = _mm256_castsi256_si128(res_8);
    2210    11207600 :                     const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
    2211             : 
    2212    11207600 :                     if (w > 4) {
    2213    11181500 :                         _mm_storel_epi64(
    2214    11181500 :                             (__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
    2215    11181500 :                         _mm_storel_epi64(
    2216             :                             (__m128i *)((
    2217    11181500 :                                 &dst0[i * dst_stride0 + j + dst_stride0])),
    2218             :                             res_1);
    2219             :                     }
    2220             :                     else {
    2221       26189 :                         *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
    2222       26189 :                             _mm_cvtsi128_si32(res_0);
    2223             :                         *(uint32_t
    2224       26189 :                             *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
    2225       26189 :                             _mm_cvtsi128_si32(res_1);
    2226             :                     }
    2227             :                 }
    2228             :                 else {
    2229    22279800 :                     const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
    2230    22279800 :                     _mm_storeu_si128((__m128i *)(&dst[i * dst_stride + j]),
    2231             :                         res_0);
    2232             : 
    2233             :                     const __m128i res_1 =
    2234    22279800 :                         _mm256_extracti128_si256(res_unsigned, 1);
    2235             :                     _mm_storeu_si128(
    2236    22279800 :                         (__m128i *)(&dst[i * dst_stride + j + dst_stride]),
    2237             :                         res_1);
    2238             :                 }
    2239             :             }
    2240             :         }
    2241             :     }
    2242    12057600 : }
    2243             : 
    2244             : // =============================================================================
    2245             : 
    2246             : SIMD_INLINE void jnt_x_comp_avg_2tap_32_avx2(
    2247             :     const uint8_t *const src, const __m256i *const coeffs, const __m256i factor,
    2248             :     const __m256i offset, ConvBufType *const dst, uint8_t *const dst8) {
    2249             :     __m256i r[2];
    2250             : 
    2251    17989200 :     x_convolve_2tap_32_avx2(src, coeffs, r);
    2252             :     jnt_comp_avg_round_store_32_avx2(r, factor, offset, dst, dst8);
    2253    23975500 : }
    2254             : 
    2255    23960600 : static INLINE void jnt_x_avg_2tap_32_avx2(const uint8_t *const src,
    2256             :     const __m256i *const coeffs,
    2257             :     const __m256i offset,
    2258             :     const ConvBufType *const dst,
    2259             :     uint8_t *const dst8) {
    2260             :     __m256i r[2];
    2261             : 
    2262    23960600 :     x_convolve_2tap_32_avx2(src, coeffs, r);
    2263    23961800 :     jnt_avg_round_store_32_avx2(r, offset, dst, dst8);
    2264    23959800 : }
    2265             : 
    2266   109224000 : static INLINE void jnt_x_no_avg_2tap_32_avx2(const uint8_t *const src,
    2267             :     const __m256i *const coeffs,
    2268             :     const __m256i offset,
    2269             :     ConvBufType *const dst) {
    2270             :     __m256i r[2];
    2271             : 
    2272   109224000 :     x_convolve_2tap_32_avx2(src, coeffs, r);
    2273   109219000 :     jnt_no_avg_round_store_32_avx2(r, offset, dst);
    2274   109217000 : }
    2275             : 
    2276             : SIMD_INLINE void jnt_x_comp_avg_6tap_16x2_avx2(
    2277             :     const uint8_t *const src, const int32_t src_stride, const __m256i coeffs[3],
    2278             :     const __m256i *const filt, const __m256i factor, const __m256i offset,
    2279             :     ConvBufType *const dst, const int32_t dst_stride, uint8_t *const dst8,
    2280             :     const int32_t dst8_stride) {
    2281             :     __m256i r[2];
    2282             : 
    2283     5716750 :     x_convolve_6tap_16x2_avx2(src, src_stride, coeffs, filt, r);
    2284             :     jnt_comp_avg_round_store_16x2_avx2(
    2285             :         r, factor, offset, dst, dst_stride, dst8, dst8_stride);
    2286     7573690 : }
    2287             : 
    2288             : SIMD_INLINE void jnt_x_avg_6tap_16x2_avx2(
    2289             :     const uint8_t *const src, const int32_t src_stride, const __m256i coeffs[3],
    2290             :     const __m256i *const filt, const __m256i offset, ConvBufType *const dst,
    2291             :     const int32_t dst_stride, uint8_t *const dst8, const int32_t dst8_stride) {
    2292             :     __m256i r[2];
    2293             : 
    2294     8824300 :     x_convolve_6tap_16x2_avx2(src, src_stride, coeffs, filt, r);
    2295    12040100 :     jnt_avg_round_store_16x2_avx2(
    2296             :         r, offset, dst, dst_stride, dst8, dst8_stride);
    2297    12039900 : }
    2298             : 
    2299             : SIMD_INLINE void jnt_x_no_avg_6tap_16x2_avx2(
    2300             :     const uint8_t *const src, const int32_t src_stride, const __m256i coeffs[3],
    2301             :     const __m256i *const filt, const __m256i offset, ConvBufType *const dst,
    2302             :     const int32_t dst_stride) {
    2303             :     __m256i r[2];
    2304             : 
    2305    23049400 :     x_convolve_6tap_16x2_avx2(src, src_stride, coeffs, filt, r);
    2306    31148500 :     jnt_no_avg_round_store_16x2_avx2(r, offset, dst, dst_stride);
    2307    31147000 : }
    2308             : 
    2309     4622440 : static INLINE void jnt_x_comp_avg_6tap_32_avx2(
    2310             :     const uint8_t *const src, const __m256i coeffs[3],
    2311             :     const __m256i *const filt, const __m256i factor, const __m256i offset,
    2312             :     ConvBufType *const dst, uint8_t *const dst8) {
    2313             :     __m256i r[2];
    2314             : 
    2315     4622440 :     x_convolve_6tap_16x2_avx2(src, 16, coeffs, filt, r);
    2316             :     jnt_comp_avg_round_store_32_avx2(r, factor, offset, dst, dst8);
    2317     4622420 : }
    2318             : 
    2319     7053250 : static INLINE void jnt_x_avg_6tap_32_avx2(const uint8_t *const src,
    2320             :     const __m256i coeffs[3],
    2321             :     const __m256i *const filt,
    2322             :     const __m256i offset,
    2323             :     ConvBufType *const dst,
    2324             :     uint8_t *const dst8) {
    2325             :     __m256i r[2];
    2326             : 
    2327     7053250 :     x_convolve_6tap_16x2_avx2(src, 16, coeffs, filt, r);
    2328     7053590 :     jnt_avg_round_store_32_avx2(r, offset, dst, dst8);
    2329     7053190 : }
    2330             : 
    2331    19119200 : static INLINE void jnt_x_no_avg_6tap_32_avx2(const uint8_t *const src,
    2332             :     const __m256i coeffs[3],
    2333             :     const __m256i *const filt,
    2334             :     const __m256i offset,
    2335             :     ConvBufType *const dst) {
    2336             :     __m256i r[2];
    2337             : 
    2338    19119200 :     x_convolve_6tap_16x2_avx2(src, 16, coeffs, filt, r);
    2339    19120300 :     jnt_no_avg_round_store_32_avx2(r, offset, dst);
    2340    19119500 : }
    2341             : 
    2342     1690010 : static INLINE void jnt_x_comp_avg_8tap_16x2_avx2(
    2343             :     const uint8_t *const src, const int32_t src_stride, const __m256i coeffs[3],
    2344             :     const __m256i *const filt, const __m256i factor, const __m256i offset,
    2345             :     ConvBufType *const dst, const int32_t dst_stride, uint8_t *const dst8,
    2346             :     const int32_t dst8_stride) {
    2347             :     __m256i r[2];
    2348             : 
    2349             :     x_convolve_8tap_16x2_avx2(src, src_stride, coeffs, filt, r);
    2350             :     jnt_comp_avg_round_store_16x2_avx2(
    2351             :         r, factor, offset, dst, dst_stride, dst8, dst8_stride);
    2352     1690020 : }
    2353             : 
    2354     2376430 : static INLINE void jnt_x_avg_8tap_16x2_avx2(
    2355             :     const uint8_t *const src, const int32_t src_stride, const __m256i coeffs[3],
    2356             :     const __m256i *const filt, const __m256i offset, ConvBufType *const dst,
    2357             :     const int32_t dst_stride, uint8_t *const dst8, const int32_t dst8_stride) {
    2358             :     __m256i r[2];
    2359             : 
    2360             :     x_convolve_8tap_16x2_avx2(src, src_stride, coeffs, filt, r);
    2361     2376440 :     jnt_avg_round_store_16x2_avx2(
    2362             :         r, offset, dst, dst_stride, dst8, dst8_stride);
    2363     2376460 : }
    2364             : 
    2365     6335660 : static INLINE void jnt_x_no_avg_8tap_16x2_avx2(
    2366             :     const uint8_t *const src, const int32_t src_stride, const __m256i coeffs[3],
    2367             :     const __m256i *const filt, const __m256i offset, ConvBufType *const dst,
    2368             :     const int32_t dst_stride) {
    2369             :     __m256i r[2];
    2370             : 
    2371             :     x_convolve_8tap_16x2_avx2(src, src_stride, coeffs, filt, r);
    2372     6335660 :     jnt_no_avg_round_store_16x2_avx2(r, offset, dst, dst_stride);
    2373     6335540 : }
    2374             : 
    2375             : SIMD_INLINE void jnt_x_comp_avg_8tap_32_avx2(
    2376             :     const uint8_t *const src, const __m256i coeffs[4],
    2377             :     const __m256i *const filt, const __m256i factor, const __m256i offset,
    2378             :     ConvBufType *const dst, uint8_t *const dst8) {
    2379             :     __m256i r[2];
    2380             : 
    2381             :     x_convolve_8tap_16x2_avx2(src, 16, coeffs, filt, r);
    2382             :     jnt_comp_avg_round_store_32_avx2(r, factor, offset, dst, dst8);
    2383     3960570 : }
    2384             : 
    2385             : SIMD_INLINE void jnt_x_avg_8tap_32_avx2(const uint8_t *const src,
    2386             :     const __m256i coeffs[4],
    2387             :     const __m256i *const filt,
    2388             :     const __m256i offset,
    2389             :     ConvBufType *const dst,
    2390             :     uint8_t *const dst8) {
    2391             :     __m256i r[2];
    2392             : 
    2393             :     x_convolve_8tap_16x2_avx2(src, 16, coeffs, filt, r);
    2394     6115990 :     jnt_avg_round_store_32_avx2(r, offset, dst, dst8);
    2395     6115870 : }
    2396             : 
    2397             : SIMD_INLINE void jnt_x_no_avg_8tap_32_avx2(const uint8_t *const src,
    2398             :     const __m256i coeffs[4],
    2399             :     const __m256i *const filt,
    2400             :     const __m256i offset,
    2401             :     ConvBufType *const dst) {
    2402             :     __m256i r[2];
    2403             : 
    2404             :     x_convolve_8tap_16x2_avx2(src, 16, coeffs, filt, r);
    2405    16299200 :     jnt_no_avg_round_store_32_avx2(r, offset, dst);
    2406    16299100 : }
    2407             : 
    2408    20591400 : static void jnt_convolve_x_2tap_avx2(
    2409             :     const uint8_t *const src, const int32_t src_stride, uint8_t *dst8,
    2410             :     const int32_t dst8_stride, const int32_t w, const int32_t h,
    2411             :     const InterpFilterParams *const filter_params_x, const int32_t subpel_x_q4,
    2412             :     const ConvolveParams *const conv_params) {
    2413    20591400 :     const uint8_t *src_ptr = src;
    2414    20591400 :     const int32_t dst_stride = conv_params->dst_stride;
    2415    20591400 :     const int32_t round_0 = 3;
    2416    20591400 :     const int32_t round_1 = COMPOUND_ROUND1_BITS;
    2417    20591400 :     const int32_t bits = FILTER_BITS - round_1;
    2418    20591400 :     const int32_t bd = 8;
    2419    20591400 :     const int32_t offset_bits = bd + 2 * FILTER_BITS - round_0;
    2420    20591400 :     const int32_t round_offset =
    2421    20591400 :         (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
    2422    20591400 :     const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1;
    2423    20591400 :     const int32_t offset_comp_avg =
    2424    20591400 :         round_offset * conv_params->bck_offset +
    2425    20591400 :         (1 << (round_bits + DIST_PRECISION_BITS - 1)) -
    2426    20591400 :         (round_offset << DIST_PRECISION_BITS);
    2427    20591400 :     const int16_t offset_avg = (1 << (FILTER_BITS - 1)) +
    2428    20591400 :         (1 << (round_0 - bits - 2)) -
    2429    20591400 :         (round_offset << (round_0 - bits - 1));
    2430    20591400 :     const int16_t offset_no_avg =
    2431    20591400 :         (round_offset << (round_0 - bits - 1)) + (1 << (round_0 - bits - 2));
    2432    20591400 :     const int32_t factor =
    2433    20591400 :         conv_params->fwd_offset | (conv_params->bck_offset << 16);
    2434    20591400 :     const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);
    2435    20591400 :     const __m128i offset_avg_128 = _mm_set1_epi16(offset_avg);
    2436    41182700 :     const __m128i offset_no_avg_128 = _mm_set1_epi16(offset_no_avg);
    2437    20591400 :     const __m128i factor_128 = _mm_set1_epi32(factor);
    2438    20591400 :     const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
    2439    20591400 :     const __m256i offset_avg_256 = _mm256_set1_epi16(offset_avg);
    2440    41182700 :     const __m256i offset_no_avg_256 = _mm256_set1_epi16(offset_no_avg);
    2441    20591400 :     const __m256i factor_256 = _mm256_set1_epi32(factor);
    2442    20591400 :     ConvBufType *dst = conv_params->dst;
    2443    20591400 :     int32_t y = h;
    2444             :     __m128i coeffs_128[4];
    2445             :     __m256i coeffs_256[4];
    2446             : 
    2447    20591400 :     if (w <= 4) {
    2448           0 :         prepare_half_coeffs_2tap_ssse3(
    2449             :             filter_params_x, subpel_x_q4, coeffs_128);
    2450             : 
    2451           0 :         if (w == 2) {
    2452           0 :             if (conv_params->do_average) {
    2453           0 :                 if (conv_params->use_jnt_comp_avg) {
    2454             :                     do {
    2455           0 :                         const __m128i res = x_convolve_2tap_2x2_sse4_1(
    2456             :                             src_ptr, src_stride, coeffs_128);
    2457           0 :                         jnt_comp_avg_round_store_2x2_sse2(res,
    2458             :                             factor_128,
    2459             :                             offset_comp_avg_128,
    2460             :                             dst,
    2461             :                             dst_stride,
    2462             :                             dst8,
    2463             :                             dst8_stride);
    2464           0 :                         src_ptr += 2 * src_stride;
    2465           0 :                         dst += 2 * dst_stride;
    2466           0 :                         dst8 += 2 * dst8_stride;
    2467           0 :                         y -= 2;
    2468           0 :                     } while (y);
    2469             :                 }
    2470             :                 else {
    2471             :                     do {
    2472           0 :                         const __m128i res = x_convolve_2tap_2x2_sse4_1(
    2473             :                             src_ptr, src_stride, coeffs_128);
    2474           0 :                         jnt_avg_round_store_2x2_sse2(res,
    2475             :                             offset_avg_128,
    2476             :                             dst,
    2477             :                             dst_stride,
    2478             :                             dst8,
    2479             :                             dst8_stride);
    2480           0 :                         src_ptr += 2 * src_stride;
    2481           0 :                         dst += 2 * dst_stride;
    2482           0 :                         dst8 += 2 * dst8_stride;
    2483           0 :                         y -= 2;
    2484           0 :                     } while (y);
    2485             :                 }
    2486             :             }
    2487             :             else {
    2488             :                 do {
    2489           0 :                     const __m128i res = x_convolve_2tap_2x2_sse4_1(
    2490             :                         src_ptr, src_stride, coeffs_128);
    2491           0 :                     jnt_no_avg_round_store_2x2_sse2(
    2492             :                         res, offset_no_avg_128, dst, dst_stride);
    2493           0 :                     src_ptr += 2 * src_stride;
    2494           0 :                     dst += 2 * dst_stride;
    2495           0 :                     y -= 2;
    2496           0 :                 } while (y);
    2497             :             }
    2498             :         }
    2499           0 :         else if (w == 4) {
    2500           0 :             assert(w == 4);
    2501             : 
    2502           0 :             if (conv_params->do_average) {
    2503           0 :                 if (conv_params->use_jnt_comp_avg) {
    2504             :                     do {
    2505           0 :                         const __m128i res = x_convolve_2tap_4x2_ssse3(
    2506             :                             src_ptr, src_stride, coeffs_128);
    2507           0 :                         jnt_comp_avg_round_store_4x2_sse2(res,
    2508             :                             factor_128,
    2509             :                             offset_comp_avg_128,
    2510             :                             dst,
    2511             :                             dst_stride,
    2512             :                             dst8,
    2513             :                             dst8_stride);
    2514           0 :                         src_ptr += 2 * src_stride;
    2515           0 :                         dst += 2 * dst_stride;
    2516           0 :                         dst8 += 2 * dst8_stride;
    2517           0 :                         y -= 2;
    2518           0 :                     } while (y);
    2519             :                 }
    2520             :                 else {
    2521             :                     do {
    2522           0 :                         const __m128i res = x_convolve_2tap_4x2_ssse3(
    2523             :                             src_ptr, src_stride, coeffs_128);
    2524           0 :                         jnt_avg_round_store_4x2_sse2(res,
    2525             :                             offset_avg_128,
    2526             :                             dst,
    2527             :                             dst_stride,
    2528             :                             dst8,
    2529             :                             dst8_stride);
    2530           0 :                         src_ptr += 2 * src_stride;
    2531           0 :                         dst += 2 * dst_stride;
    2532           0 :                         dst8 += 2 * dst8_stride;
    2533           0 :                         y -= 2;
    2534           0 :                     } while (y);
    2535             :                 }
    2536             :             }
    2537             :             else {
    2538             :                 do {
    2539           0 :                     const __m128i res = x_convolve_2tap_4x2_ssse3(
    2540             :                         src_ptr, src_stride, coeffs_128);
    2541           0 :                     jnt_no_avg_round_store_4x2_sse2(
    2542             :                         res, offset_no_avg_128, dst, dst_stride);
    2543           0 :                     src_ptr += 2 * src_stride;
    2544           0 :                     dst += 2 * dst_stride;
    2545           0 :                     y -= 2;
    2546           0 :                 } while (y);
    2547             :             }
    2548             :         }
    2549             :         else {
    2550             :         }
    2551             :     }
    2552             :     else {
    2553             :         __m256i r[2];
    2554             : 
    2555    20591400 :         prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
    2556             : 
    2557    20600800 :         if (w == 8) {
    2558     8883610 :             if (conv_params->do_average) {
    2559     2415100 :                 if (conv_params->use_jnt_comp_avg) {
    2560             :                     do {
    2561     8673410 :                         const __m256i res = x_convolve_2tap_8x2_avx2(
    2562             :                             src_ptr, src_stride, coeffs_256);
    2563     8673080 :                         jnt_comp_avg_round_store_8x2_avx2(res,
    2564             :                             factor_256,
    2565             :                             offset_comp_avg_256,
    2566             :                             dst,
    2567             :                             dst_stride,
    2568             :                             dst8,
    2569             :                             dst8_stride);
    2570     8673030 :                         src_ptr += 2 * src_stride;
    2571     8673030 :                         dst += 2 * dst_stride;
    2572     8673030 :                         dst8 += 2 * dst8_stride;
    2573     8673030 :                         y -= 2;
    2574     8673030 :                     } while (y);
    2575             :                 }
    2576             :                 else {
    2577             :                     do {
    2578     8673460 :                         const __m256i res = x_convolve_2tap_8x2_avx2(
    2579             :                             src_ptr, src_stride, coeffs_256);
    2580     8673230 :                         jnt_avg_round_store_8x2_sse2(res,
    2581             :                             offset_avg_256,
    2582             :                             dst,
    2583             :                             dst_stride,
    2584             :                             dst8,
    2585             :                             dst8_stride);
    2586     8673150 :                         src_ptr += 2 * src_stride;
    2587     8673150 :                         dst += 2 * dst_stride;
    2588     8673150 :                         dst8 += 2 * dst8_stride;
    2589     8673150 :                         y -= 2;
    2590     8673150 :                     } while (y);
    2591             :                 }
    2592             :             }
    2593             :             else {
    2594             :                 do {
    2595    44150100 :                     const __m256i res = x_convolve_2tap_8x2_avx2(
    2596             :                         src_ptr, src_stride, coeffs_256);
    2597    44150900 :                     jnt_no_avg_round_store_8x2_avx2(
    2598             :                         res, offset_no_avg_256, dst, dst_stride);
    2599    44150100 :                     src_ptr += 2 * src_stride;
    2600    44150100 :                     dst += 2 * dst_stride;
    2601    44150100 :                     y -= 2;
    2602    44150100 :                 } while (y);
    2603             :             }
    2604             :         }
    2605    11717200 :         else if (w == 16) {
    2606     7093380 :             if (conv_params->do_average) {
    2607     1897170 :                 if (conv_params->use_jnt_comp_avg) {
    2608             :                     do {
    2609     9339940 :                         x_convolve_2tap_16x2_avx2(
    2610             :                             src_ptr, src_stride, coeffs_256, r);
    2611             :                         jnt_comp_avg_round_store_16x2_avx2(r,
    2612             :                             factor_256,
    2613             :                             offset_comp_avg_256,
    2614             :                             dst,
    2615             :                             dst_stride,
    2616             :                             dst8,
    2617             :                             dst8_stride);
    2618     9339590 :                         src_ptr += 2 * src_stride;
    2619     9339590 :                         dst += 2 * dst_stride;
    2620     9339590 :                         dst8 += 2 * dst8_stride;
    2621     9339590 :                         y -= 2;
    2622     9339590 :                     } while (y);
    2623             :                 }
    2624             :                 else {
    2625             :                     do {
    2626     9339930 :                         x_convolve_2tap_16x2_avx2(
    2627             :                             src_ptr, src_stride, coeffs_256, r);
    2628     9339370 :                         jnt_avg_round_store_16x2_avx2(r,
    2629             :                             offset_avg_256,
    2630             :                             dst,
    2631             :                             dst_stride,
    2632             :                             dst8,
    2633             :                             dst8_stride);
    2634     9339620 :                         src_ptr += 2 * src_stride;
    2635     9339620 :                         dst += 2 * dst_stride;
    2636     9339620 :                         dst8 += 2 * dst8_stride;
    2637     9339620 :                         y -= 2;
    2638     9339620 :                     } while (y);
    2639             :                 }
    2640             :             }
    2641             :             else {
    2642             :                 do {
    2643    48314700 :                     x_convolve_2tap_16x2_avx2(
    2644             :                         src_ptr, src_stride, coeffs_256, r);
    2645    48310900 :                     jnt_no_avg_round_store_16x2_avx2(
    2646             :                         r, offset_no_avg_256, dst, dst_stride);
    2647    48304900 :                     src_ptr += 2 * src_stride;
    2648    48304900 :                     dst += 2 * dst_stride;
    2649    48304900 :                     y -= 2;
    2650    48304900 :                 } while (y);
    2651             :             }
    2652             :         }
    2653     4623800 :         else if (w == 32) {
    2654     3565750 :             if (conv_params->do_average) {
    2655     1038530 :                 if (conv_params->use_jnt_comp_avg) {
    2656             :                     do {
    2657             :                         jnt_x_comp_avg_2tap_32_avx2(src_ptr,
    2658             :                             coeffs_256,
    2659             :                             factor_256,
    2660             :                             offset_comp_avg_256,
    2661             :                             dst,
    2662             :                             dst8);
    2663    12002100 :                         src_ptr += src_stride;
    2664    12002100 :                         dst += dst_stride;
    2665    12002100 :                         dst8 += dst8_stride;
    2666    12002100 :                     } while (--y);
    2667             :                 }
    2668             :                 else {
    2669             :                     do {
    2670    12001300 :                         jnt_x_avg_2tap_32_avx2(
    2671             :                             src_ptr, coeffs_256, offset_avg_256, dst, dst8);
    2672    12000900 :                         src_ptr += src_stride;
    2673    12000900 :                         dst += dst_stride;
    2674    12000900 :                         dst8 += dst8_stride;
    2675    12000900 :                     } while (--y);
    2676             :                 }
    2677             :             }
    2678             :             else {
    2679             :                 do {
    2680    59306200 :                     jnt_x_no_avg_2tap_32_avx2(
    2681             :                         src_ptr, coeffs_256, offset_no_avg_256, dst);
    2682    59299300 :                     src_ptr += src_stride;
    2683    59299300 :                     dst += dst_stride;
    2684    59299300 :                 } while (--y);
    2685             :             }
    2686             :         }
    2687     1058050 :         else if (w == 64) {
    2688     1058280 :             if (conv_params->do_average) {
    2689      342144 :                 if (conv_params->use_jnt_comp_avg) {
    2690             :                     do {
    2691             :                         jnt_x_comp_avg_2tap_32_avx2(src_ptr,
    2692             :                             coeffs_256,
    2693             :                             factor_256,
    2694             :                             offset_comp_avg_256,
    2695             :                             dst,
    2696             :                             dst8);
    2697     5986580 :                         jnt_x_comp_avg_2tap_32_avx2(src_ptr + 32,
    2698             :                             coeffs_256,
    2699             :                             factor_256,
    2700             :                             offset_comp_avg_256,
    2701             :                             dst + 32,
    2702             :                             dst8 + 32);
    2703     5986790 :                         src_ptr += src_stride;
    2704     5986790 :                         dst += dst_stride;
    2705     5986790 :                         dst8 += dst8_stride;
    2706     5986790 :                     } while (--y);
    2707             :                 }
    2708             :                 else {
    2709             :                     do {
    2710     5985770 :                         jnt_x_avg_2tap_32_avx2(
    2711             :                             src_ptr, coeffs_256, offset_avg_256, dst, dst8);
    2712     5985690 :                         jnt_x_avg_2tap_32_avx2(src_ptr + 32,
    2713             :                             coeffs_256,
    2714             :                             offset_avg_256,
    2715     5985690 :                             dst + 32,
    2716             :                             dst8 + 32);
    2717     5985680 :                         src_ptr += src_stride;
    2718     5985680 :                         dst += dst_stride;
    2719     5985680 :                         dst8 += dst8_stride;
    2720     5985680 :                     } while (--y);
    2721             :                 }
    2722             :             }
    2723             :             else {
    2724             :                 do {
    2725    25043100 :                     jnt_x_no_avg_2tap_32_avx2(
    2726             :                         src_ptr, coeffs_256, offset_no_avg_256, dst);
    2727    25042500 :                     jnt_x_no_avg_2tap_32_avx2(
    2728             :                         src_ptr + 32, coeffs_256, offset_no_avg_256, dst + 32);
    2729    25042200 :                     src_ptr += src_stride;
    2730    25042200 :                     dst += dst_stride;
    2731    25042200 :                 } while (--y);
    2732             :             }
    2733             :         }
    2734             :         else {
    2735           0 :             assert(w == 128);
    2736             : 
    2737           0 :             if (conv_params->do_average) {
    2738           0 :                 if (conv_params->use_jnt_comp_avg) {
    2739             :                     do {
    2740             :                         jnt_x_comp_avg_2tap_32_avx2(src_ptr,
    2741             :                             coeffs_256,
    2742             :                             factor_256,
    2743             :                             offset_comp_avg_256,
    2744             :                             dst,
    2745             :                             dst8);
    2746           0 :                         jnt_x_comp_avg_2tap_32_avx2(src_ptr + 1 * 32,
    2747             :                             coeffs_256,
    2748             :                             factor_256,
    2749             :                             offset_comp_avg_256,
    2750             :                             dst + 1 * 32,
    2751             :                             dst8 + 1 * 32);
    2752           0 :                         jnt_x_comp_avg_2tap_32_avx2(src_ptr + 2 * 32,
    2753             :                             coeffs_256,
    2754             :                             factor_256,
    2755             :                             offset_comp_avg_256,
    2756             :                             dst + 2 * 32,
    2757             :                             dst8 + 2 * 32);
    2758           0 :                         jnt_x_comp_avg_2tap_32_avx2(src_ptr + 3 * 32,
    2759             :                             coeffs_256,
    2760             :                             factor_256,
    2761             :                             offset_comp_avg_256,
    2762             :                             dst + 3 * 32,
    2763             :                             dst8 + 3 * 32);
    2764           0 :                         src_ptr += src_stride;
    2765           0 :                         dst += dst_stride;
    2766           0 :                         dst8 += dst8_stride;
    2767           0 :                     } while (--y);
    2768             :                 }
    2769             :                 else {
    2770             :                     do {
    2771           0 :                         jnt_x_avg_2tap_32_avx2(
    2772             :                             src_ptr, coeffs_256, offset_avg_256, dst, dst8);
    2773           0 :                         jnt_x_avg_2tap_32_avx2(src_ptr + 1 * 32,
    2774             :                             coeffs_256,
    2775             :                             offset_avg_256,
    2776           0 :                             dst + 1 * 32,
    2777             :                             dst8 + 1 * 32);
    2778           0 :                         jnt_x_avg_2tap_32_avx2(src_ptr + 2 * 32,
    2779             :                             coeffs_256,
    2780             :                             offset_avg_256,
    2781           0 :                             dst + 2 * 32,
    2782             :                             dst8 + 2 * 32);
    2783           0 :                         jnt_x_avg_2tap_32_avx2(src_ptr + 3 * 32,
    2784             :                             coeffs_256,
    2785             :                             offset_avg_256,
    2786           0 :                             dst + 3 * 32,
    2787             :                             dst8 + 3 * 32);
    2788           0 :                         src_ptr += src_stride;
    2789           0 :                         dst += dst_stride;
    2790           0 :                         dst8 += dst8_stride;
    2791           0 :                     } while (--y);
    2792             :                 }
    2793             :             }
    2794             :             else {
    2795             :                 do {
    2796           0 :                     jnt_x_no_avg_2tap_32_avx2(
    2797             :                         src_ptr, coeffs_256, offset_no_avg_256, dst);
    2798           0 :                     jnt_x_no_avg_2tap_32_avx2(src_ptr + 1 * 32,
    2799             :                         coeffs_256,
    2800             :                         offset_no_avg_256,
    2801             :                         dst + 1 * 32);
    2802           0 :                     jnt_x_no_avg_2tap_32_avx2(src_ptr + 2 * 32,
    2803             :                         coeffs_256,
    2804             :                         offset_no_avg_256,
    2805             :                         dst + 2 * 32);
    2806           0 :                     jnt_x_no_avg_2tap_32_avx2(src_ptr + 3 * 32,
    2807             :                         coeffs_256,
    2808             :                         offset_no_avg_256,
    2809             :                         dst + 3 * 32);
    2810           0 :                     src_ptr += src_stride;
    2811           0 :                     dst += dst_stride;
    2812           0 :                 } while (--y);
    2813             :             }
    2814             :         }
    2815             :     }
    2816    20602600 : }
    2817             : 
    2818      165000 : static void jnt_convolve_x_4tap_ssse3(
    2819             :     const uint8_t *const src, const int32_t src_stride, uint8_t *dst8,
    2820             :     const int32_t dst8_stride, const int32_t w, const int32_t h,
    2821             :     const InterpFilterParams *const filter_params_x, const int32_t subpel_x_q4,
    2822             :     const ConvolveParams *const conv_params) {
    2823      165000 :     const uint8_t *src_ptr = src - 1;
    2824      165000 :     const int32_t dst_stride = conv_params->dst_stride;
    2825      165000 :     const int32_t round_0 = 3;
    2826      165000 :     const int32_t round_1 = COMPOUND_ROUND1_BITS;
    2827      165000 :     const int32_t bits = FILTER_BITS - round_1;
    2828      165000 :     const int32_t bd = 8;
    2829      165000 :     const int32_t offset_bits = bd + 2 * FILTER_BITS - round_0;
    2830      165000 :     const int32_t round_offset =
    2831      165000 :         (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
    2832      165000 :     const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1;
    2833      165000 :     const int32_t offset_comp_avg =
    2834      165000 :         round_offset * conv_params->bck_offset +
    2835      165000 :         (1 << (round_bits + DIST_PRECISION_BITS - 1)) -
    2836      165000 :         (round_offset << DIST_PRECISION_BITS);
    2837      165000 :     const int16_t offset_avg = (1 << (FILTER_BITS - 1)) +
    2838      165000 :         (1 << (round_0 - bits - 2)) -
    2839      165000 :         (round_offset << (round_0 - bits - 1));
    2840      165000 :     const int16_t offset_no_avg =
    2841      165000 :         (round_offset << (round_0 - bits - 1)) + (1 << (round_0 - bits - 2));
    2842      165000 :     const int32_t factor =
    2843      165000 :         conv_params->fwd_offset | (conv_params->bck_offset << 16);
    2844      165000 :     const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);
    2845      165000 :     const __m128i offset_avg_128 = _mm_set1_epi16(offset_avg);
    2846      330000 :     const __m128i offset_no_avg_128 = _mm_set1_epi16(offset_no_avg);
    2847      165000 :     const __m128i factor_128 = _mm_set1_epi32(factor);
    2848      165000 :     ConvBufType *dst = conv_params->dst;
    2849      165000 :     int32_t y = h;
    2850             :     __m128i coeffs_128[4];
    2851             : 
    2852      165000 :     prepare_half_coeffs_4tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);
    2853             : 
    2854      165002 :     if (w == 2) {
    2855           1 :         if (conv_params->do_average) {
    2856           0 :             if (conv_params->use_jnt_comp_avg) {
    2857             :                 do {
    2858           0 :                     const __m128i res = x_convolve_4tap_2x2_ssse3(
    2859             :                         src_ptr, src_stride, coeffs_128);
    2860           0 :                     jnt_comp_avg_round_store_2x2_sse2(res,
    2861             :                         factor_128,
    2862             :                         offset_comp_avg_128,
    2863             :                         dst,
    2864             :                         dst_stride,
    2865             :                         dst8,
    2866             :                         dst8_stride);
    2867           0 :                     src_ptr += 2 * src_stride;
    2868           0 :                     dst += 2 * dst_stride;
    2869           0 :                     dst8 += 2 * dst8_stride;
    2870           0 :                     y -= 2;
    2871           0 :                 } while (y);
    2872             :             }
    2873             :             else {
    2874             :                 do {
    2875           0 :                     const __m128i res = x_convolve_4tap_2x2_ssse3(
    2876             :                         src_ptr, src_stride, coeffs_128);
    2877           0 :                     jnt_avg_round_store_2x2_sse2(res,
    2878             :                         offset_avg_128,
    2879             :                         dst,
    2880             :                         dst_stride,
    2881             :                         dst8,
    2882             :                         dst8_stride);
    2883           0 :                     src_ptr += 2 * src_stride;
    2884           0 :                     dst += 2 * dst_stride;
    2885           0 :                     dst8 += 2 * dst8_stride;
    2886           0 :                     y -= 2;
    2887           0 :                 } while (y);
    2888             :             }
    2889             :         }
    2890             :         else {
    2891             :             do {
    2892             :                 const __m128i res =
    2893           1 :                     x_convolve_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
    2894           0 :                 jnt_no_avg_round_store_2x2_sse2(
    2895             :                     res, offset_no_avg_128, dst, dst_stride);
    2896           0 :                 src_ptr += 2 * src_stride;
    2897           0 :                 dst += 2 * dst_stride;
    2898           0 :                 y -= 2;
    2899           0 :             } while (y);
    2900             :         }
    2901             :     }
    2902             :     else {
    2903      165001 :         assert(w == 4);
    2904             : 
    2905      165001 :         if (conv_params->do_average) {
    2906       73890 :             if (conv_params->use_jnt_comp_avg) {
    2907             :                 do {
    2908      117812 :                     const __m128i res = x_convolve_4tap_4x2_ssse3(
    2909             :                         src_ptr, src_stride, coeffs_128);
    2910      117812 :                     jnt_comp_avg_round_store_4x2_sse2(res,
    2911             :                         factor_128,
    2912             :                         offset_comp_avg_128,
    2913             :                         dst,
    2914             :                         dst_stride,
    2915             :                         dst8,
    2916             :                         dst8_stride);
    2917      117812 :                     src_ptr += 2 * src_stride;
    2918      117812 :                     dst += 2 * dst_stride;
    2919      117812 :                     dst8 += 2 * dst8_stride;
    2920      117812 :                     y -= 2;
    2921      117812 :                 } while (y);
    2922             :             }
    2923             :             else {
    2924             :                 do {
    2925      181511 :                     const __m128i res = x_convolve_4tap_4x2_ssse3(
    2926             :                         src_ptr, src_stride, coeffs_128);
    2927      181510 :                     jnt_avg_round_store_4x2_sse2(res,
    2928             :                         offset_avg_128,
    2929             :                         dst,
    2930             :                         dst_stride,
    2931             :                         dst8,
    2932             :                         dst8_stride);
    2933      181511 :                     src_ptr += 2 * src_stride;
    2934      181511 :                     dst += 2 * dst_stride;
    2935      181511 :                     dst8 += 2 * dst8_stride;
    2936      181511 :                     y -= 2;
    2937      181511 :                 } while (y);
    2938             :             }
    2939             :         }
    2940             :         else {
    2941             :             do {
    2942             :                 const __m128i res =
    2943      373179 :                     x_convolve_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
    2944      373184 :                 jnt_no_avg_round_store_4x2_sse2(
    2945             :                     res, offset_no_avg_128, dst, dst_stride);
    2946      373179 :                 src_ptr += 2 * src_stride;
    2947      373179 :                 dst += 2 * dst_stride;
    2948      373179 :                 y -= 2;
    2949      373179 :             } while (y);
    2950             :         }
    2951             :     }
    2952      165001 : }
    2953             : 
    2954     7846150 : static void jnt_convolve_x_6tap_avx2(
    2955             :     const uint8_t *const src, const int32_t src_stride, uint8_t *dst8,
    2956             :     const int32_t dst8_stride, const int32_t w, const int32_t h,
    2957             :     const InterpFilterParams *const filter_params_x, const int32_t subpel_x_q4,
    2958             :     const ConvolveParams *const conv_params) {
                           // x-direction 6-tap convolution for the compound ("jnt") path, written
                           // for 8-bit data (bd is fixed to 8 below) and for block widths
                           // 8, 16, 32, 64 and 128.
                           //
                           // src / src_stride    : 8-bit source pixels and their row stride.
                           // dst8 / dst8_stride  : 8-bit destination; only handed to the averaging
                           //                       store helpers (do_average paths).
                           // w, h                : block size; w selects the branch below, h is the
                           //                       row count that drives the loops.
                           // filter_params_x,
                           // subpel_x_q4         : forwarded to prepare_half_coeffs_6tap_avx2().
                           // conv_params         : supplies dst, dst_stride, fwd_offset, bck_offset,
                           //                       do_average and use_jnt_comp_avg.
                           //
                           // Every width has the same three store modes:
                           //   do_average && use_jnt_comp_avg : *_comp_avg_* helpers (factor + offset)
                           //   do_average only                : *_avg_* helpers (offset only)
                           //   otherwise                      : *_no_avg_* helpers, which receive only
                           //                                    conv_params->dst (dst8 is not passed).

                           // Start two bytes before src. NOTE(review): presumably the left extent
                           // of the 6-tap window -- confirm against x_convolve_6tap_*.
    2959     7846150 :     const uint8_t *src_ptr = src - 2;
    2960     7846150 :     const int32_t dst_stride = conv_params->dst_stride;
                           // Rounding set-up; round_0 and bd are hard-coded in this function.
    2961     7846150 :     const int32_t round_0 = 3;
    2962     7846150 :     const int32_t round_1 = COMPOUND_ROUND1_BITS;
    2963     7846150 :     const int32_t bits = FILTER_BITS - round_1;
    2964     7846150 :     const int32_t bd = 8;
    2965     7846150 :     const int32_t offset_bits = bd + 2 * FILTER_BITS - round_0;
    2966     7846150 :     const int32_t round_offset =
    2967     7846150 :         (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
    2968     7846150 :     const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1;
                           // One pre-combined offset per store mode (comp_avg / avg / no_avg).
    2969     7846150 :     const int32_t offset_comp_avg =
    2970     7846150 :         round_offset * conv_params->bck_offset +
    2971     7846150 :         (1 << (round_bits + DIST_PRECISION_BITS - 1)) -
    2972     7846150 :         (round_offset << DIST_PRECISION_BITS);
                           // The next two offsets are narrowed to int16_t because they are
                           // broadcast per 16-bit lane below.
    2973     7846150 :     const int16_t offset_avg = (1 << (FILTER_BITS - 1)) +
    2974     7846150 :         (1 << (round_0 - bits - 2)) -
    2975     7846150 :         (round_offset << (round_0 - bits - 1));
    2976     7846150 :     const int16_t offset_no_avg =
    2977     7846150 :         (round_offset << (round_0 - bits - 1)) + (1 << (round_0 - bits - 2));
                           // fwd_offset OR'ed with bck_offset shifted into the upper 16 bits; the
                           // result is broadcast to every 32-bit lane of factor_256.
    2978     7846150 :     const int32_t factor =
    2979     7846150 :         conv_params->fwd_offset | (conv_params->bck_offset << 16);
    2980     7846150 :     const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
    2981     7846150 :     const __m256i offset_avg_256 = _mm256_set1_epi16(offset_avg);
    2982    15692300 :     const __m256i offset_no_avg_256 = _mm256_set1_epi16(offset_no_avg);
    2983     7846150 :     const __m256i factor_256 = _mm256_set1_epi32(factor);
    2984     7846150 :     ConvBufType *dst = conv_params->dst;
    2985     7846150 :     int32_t y = h;
    2986             :     __m256i coeffs_256[4], filt_256[4];
    2987             : 
                           // Only filt_256[0..2] are loaded; filt_256[3] stays unset in this
                           // function. _mm256_load_si256 is an aligned load, so the
                           // filt*_global_avx2 tables must be 32-byte aligned.
    2988     7846150 :     filt_256[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
    2989     7846150 :     filt_256[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
    2990     7846150 :     filt_256[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
    2991             : 
    2992     7846150 :     prepare_half_coeffs_6tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
    2993             : 
                           // w == 8: two rows per iteration. Each loop ends only when y reaches
                           // exactly 0, so h is expected to be a positive even value.
    2994     7847420 :     if (w == 8) {
    2995     3505330 :         if (conv_params->do_average) {
    2996     1412780 :             if (conv_params->use_jnt_comp_avg) {
    2997             :                 do {
    2998     4458300 :                     const __m256i res = x_convolve_6tap_8x2_avx2(
    2999             :                         src_ptr, src_stride, coeffs_256, filt_256);
    3000     4458270 :                     jnt_comp_avg_round_store_8x2_avx2(res,
    3001             :                         factor_256,
    3002             :                         offset_comp_avg_256,
    3003             :                         dst,
    3004             :                         dst_stride,
    3005             :                         dst8,
    3006             :                         dst8_stride);
    3007     4458190 :                     src_ptr += 2 * src_stride;
    3008     4458190 :                     dst += 2 * dst_stride;
    3009     4458190 :                     dst8 += 2 * dst8_stride;
    3010     4458190 :                     y -= 2;
    3011     4458190 :                 } while (y);
    3012             :             }
    3013             :             else {
    3014             :                 do {
    3015     5683300 :                     const __m256i res = x_convolve_6tap_8x2_avx2(
    3016             :                         src_ptr, src_stride, coeffs_256, filt_256);
    3017     5683310 :                     jnt_avg_round_store_8x2_sse2(res,
    3018             :                         offset_avg_256,
    3019             :                         dst,
    3020             :                         dst_stride,
    3021             :                         dst8,
    3022             :                         dst8_stride);
    3023     5683100 :                     src_ptr += 2 * src_stride;
    3024     5683100 :                     dst += 2 * dst_stride;
    3025     5683100 :                     dst8 += 2 * dst8_stride;
    3026     5683100 :                     y -= 2;
    3027     5683100 :                 } while (y);
    3028             :             }
    3029             :         }
    3030             :         else {
    3031             :             do {
    3032    15283700 :                 const __m256i res = x_convolve_6tap_8x2_avx2(
    3033             :                     src_ptr, src_stride, coeffs_256, filt_256);
    3034    15284100 :                 jnt_no_avg_round_store_8x2_avx2(
    3035             :                     res, offset_no_avg_256, dst, dst_stride);
    3036    15283700 :                 src_ptr += 2 * src_stride;
    3037    15283700 :                 dst += 2 * dst_stride;
    3038    15283700 :                 y -= 2;
    3039    15283700 :             } while (y);
    3040             :         }
    3041             :     }
                           // w == 16: also two rows per iteration (same even-h expectation); the
                           // convolve and store steps are fused in the jnt_x_*_6tap_16x2 helpers.
    3042     4342080 :     else if (w == 16) {
    3043     2489280 :         if (conv_params->do_average) {
    3044      976559 :             if (conv_params->use_jnt_comp_avg) {
    3045             :                 do {
    3046             :                     jnt_x_comp_avg_6tap_16x2_avx2(src_ptr,
    3047             :                         src_stride,
    3048             :                         coeffs_256,
    3049             :                         filt_256,
    3050             :                         factor_256,
    3051             :                         offset_comp_avg_256,
    3052             :                         dst,
    3053             :                         dst_stride,
    3054             :                         dst8,
    3055             :                         dst8_stride);
    3056     3859710 :                     src_ptr += 2 * src_stride;
    3057     3859710 :                     dst += 2 * dst_stride;
    3058     3859710 :                     dst8 += 2 * dst8_stride;
    3059     3859710 :                     y -= 2;
    3060     3859710 :                 } while (y);
    3061             :             }
    3062             :             else {
    3063             :                 do {
    3064             :                     jnt_x_avg_6tap_16x2_avx2(src_ptr,
    3065             :                         src_stride,
    3066             :                         coeffs_256,
    3067             :                         filt_256,
    3068             :                         offset_avg_256,
    3069             :                         dst,
    3070             :                         dst_stride,
    3071             :                         dst8,
    3072             :                         dst8_stride);
    3073     5608480 :                     src_ptr += 2 * src_stride;
    3074     5608480 :                     dst += 2 * dst_stride;
    3075     5608480 :                     dst8 += 2 * dst8_stride;
    3076     5608480 :                     y -= 2;
    3077     5608480 :                 } while (y);
    3078             :             }
    3079             :         }
    3080             :         else {
    3081             :             do {
    3082             :                 jnt_x_no_avg_6tap_16x2_avx2(src_ptr,
    3083             :                     src_stride,
    3084             :                     coeffs_256,
    3085             :                     filt_256,
    3086             :                     offset_no_avg_256,
    3087             :                     dst,
    3088             :                     dst_stride);
    3089    14950000 :                 src_ptr += 2 * src_stride;
    3090    14950000 :                 dst += 2 * dst_stride;
    3091    14950000 :                 y -= 2;
    3092    14950000 :             } while (y);
    3093             :         }
    3094             :     }
                           // w == 32: one row per iteration via the jnt_x_*_6tap_32 helpers.
    3095     1852800 :     else if (w == 32) {
    3096     1435280 :         if (conv_params->do_average) {
    3097      547917 :             if (conv_params->use_jnt_comp_avg) {
    3098             :                 do {
    3099     4622450 :                     jnt_x_comp_avg_6tap_32_avx2(src_ptr,
    3100             :                         coeffs_256,
    3101             :                         filt_256,
    3102             :                         factor_256,
    3103             :                         offset_comp_avg_256,
    3104             :                         dst,
    3105             :                         dst8);
    3106     4622430 :                     src_ptr += src_stride;
    3107     4622430 :                     dst += dst_stride;
    3108     4622430 :                     dst8 += dst8_stride;
    3109     4622430 :                 } while (--y);
    3110             :             }
    3111             :             else {
    3112             :                 do {
    3113     7053320 :                     jnt_x_avg_6tap_32_avx2(src_ptr,
    3114             :                         coeffs_256,
    3115             :                         filt_256,
    3116             :                         offset_avg_256,
    3117             :                         dst,
    3118             :                         dst8);
    3119     7053200 :                     src_ptr += src_stride;
    3120     7053200 :                     dst += dst_stride;
    3121     7053200 :                     dst8 += dst8_stride;
    3122     7053200 :                 } while (--y);
    3123             :             }
    3124             :         }
    3125             :         else {
    3126             :             do {
    3127    19120200 :                 jnt_x_no_avg_6tap_32_avx2(
    3128             :                     src_ptr, coeffs_256, filt_256, offset_no_avg_256, dst);
    3129    19119000 :                 src_ptr += src_stride;
    3130    19119000 :                 dst += dst_stride;
    3131    19119000 :             } while (--y);
    3132             :         }
    3133             :     }
                           // w == 64: one row per iteration. The 16x2 helpers are reused with the
                           // literal 16 passed for every stride argument, at column offsets 0 and 32.
                           // NOTE(review): presumably each call then covers 32 contiguous pixels of
                           // the current row as two 16-pixel halves -- confirm against the helper.
    3134      417526 :     else if (w == 64) {
    3135      417546 :         if (conv_params->do_average) {
    3136      159491 :             if (conv_params->use_jnt_comp_avg) {
    3137             :                 do {
    3138             :                     jnt_x_comp_avg_6tap_16x2_avx2(src_ptr,
    3139             :                         16,
    3140             :                         coeffs_256,
    3141             :                         filt_256,
    3142             :                         factor_256,
    3143             :                         offset_comp_avg_256,
    3144             :                         dst,
    3145             :                         16,
    3146             :                         dst8,
    3147             :                         16);
    3148     1856980 :                     jnt_x_comp_avg_6tap_16x2_avx2(src_ptr + 32,
    3149             :                         16,
    3150             :                         coeffs_256,
    3151             :                         filt_256,
    3152             :                         factor_256,
    3153             :                         offset_comp_avg_256,
    3154             :                         dst + 32,
    3155             :                         16,
    3156             :                         dst8 + 32,
    3157             :                         16);
    3158     1856990 :                     src_ptr += src_stride;
    3159     1856990 :                     dst += dst_stride;
    3160     1856990 :                     dst8 += dst8_stride;
    3161     1856990 :                 } while (--y);
    3162             :             }
    3163             :             else {
    3164             :                 do {
    3165             :                     jnt_x_avg_6tap_16x2_avx2(src_ptr,
    3166             :                         16,
    3167             :                         coeffs_256,
    3168             :                         filt_256,
    3169             :                         offset_avg_256,
    3170             :                         dst,
    3171             :                         16,
    3172             :                         dst8,
    3173             :                         16);
    3174     3215680 :                     jnt_x_avg_6tap_16x2_avx2(src_ptr + 32,
    3175             :                         16,
    3176             :                         coeffs_256,
    3177             :                         filt_256,
    3178             :                         offset_avg_256,
    3179             :                         dst + 32,
    3180             :                         16,
    3181             :                         dst8 + 32,
    3182             :                         16);
    3183     3215700 :                     src_ptr += src_stride;
    3184     3215700 :                     dst += dst_stride;
    3185     3215700 :                     dst8 += dst8_stride;
    3186     3215700 :                 } while (--y);
    3187             :             }
    3188             :         }
    3189             :         else {
    3190             :             do {
    3191             :                 jnt_x_no_avg_6tap_16x2_avx2(src_ptr,
    3192             :                     16,
    3193             :                     coeffs_256,
    3194             :                     filt_256,
    3195             :                     offset_no_avg_256,
    3196             :                     dst,
    3197             :                     16);
    3198     8098490 :                 jnt_x_no_avg_6tap_16x2_avx2(src_ptr + 32,
    3199             :                     16,
    3200             :                     coeffs_256,
    3201             :                     filt_256,
    3202             :                     offset_no_avg_256,
    3203             :                     dst + 32,
    3204             :                     16);
    3205     8098480 :                 src_ptr += src_stride;
    3206     8098480 :                 dst += dst_stride;
    3207     8098480 :             } while (--y);
    3208             :         }
    3209             :     }
                           // Remaining case, w == 128: same scheme as w == 64 with four helper
                           // calls per row at column offsets 0, 32, 64 and 96. The assert is the
                           // only guard against any other width.
                           // NOTE(review): every executable line of this branch shows 0 hits in
                           // this coverage listing, so the path was not exercised by this run.
    3210             :     else {
    3211           0 :         assert(w == 128);
    3212             : 
    3213           0 :         if (conv_params->do_average) {
    3214           0 :             if (conv_params->use_jnt_comp_avg) {
    3215             :                 do {
    3216             :                     jnt_x_comp_avg_6tap_16x2_avx2(src_ptr,
    3217             :                         16,
    3218             :                         coeffs_256,
    3219             :                         filt_256,
    3220             :                         factor_256,
    3221             :                         offset_comp_avg_256,
    3222             :                         dst,
    3223             :                         16,
    3224             :                         dst8,
    3225             :                         16);
    3226           0 :                     jnt_x_comp_avg_6tap_16x2_avx2(src_ptr + 1 * 32,
    3227             :                         16,
    3228             :                         coeffs_256,
    3229             :                         filt_256,
    3230             :                         factor_256,
    3231             :                         offset_comp_avg_256,
    3232             :                         dst + 1 * 32,
    3233             :                         16,
    3234             :                         dst8 + 1 * 32,
    3235             :                         16);
    3236           0 :                     jnt_x_comp_avg_6tap_16x2_avx2(src_ptr + 2 * 32,
    3237             :                         16,
    3238             :                         coeffs_256,
    3239             :                         filt_256,
    3240             :                         factor_256,
    3241             :                         offset_comp_avg_256,
    3242             :                         dst + 2 * 32,
    3243             :                         16,
    3244             :                         dst8 + 2 * 32,
    3245             :                         16);
    3246           0 :                     jnt_x_comp_avg_6tap_16x2_avx2(src_ptr + 3 * 32,
    3247             :                         16,
    3248             :                         coeffs_256,
    3249             :                         filt_256,
    3250             :                         factor_256,
    3251             :                         offset_comp_avg_256,
    3252             :                         dst + 3 * 32,
    3253             :                         16,
    3254             :                         dst8 + 3 * 32,
    3255             :                         16);
    3256           0 :                     src_ptr += src_stride;
    3257           0 :                     dst += dst_stride;
    3258           0 :                     dst8 += dst8_stride;
    3259           0 :                 } while (--y);
    3260             :             }
    3261             :             else {
    3262             :                 do {
    3263             :                     jnt_x_avg_6tap_16x2_avx2(src_ptr,
    3264             :                         16,
    3265             :                         coeffs_256,
    3266             :                         filt_256,
    3267             :                         offset_avg_256,
    3268             :                         dst,
    3269             :                         16,
    3270             :                         dst8,
    3271             :                         16);
    3272           0 :                     jnt_x_avg_6tap_16x2_avx2(src_ptr + 1 * 32,
    3273             :                         16,
    3274             :                         coeffs_256,
    3275             :                         filt_256,
    3276             :                         offset_avg_256,
    3277             :                         dst + 1 * 32,
    3278             :                         16,
    3279             :                         dst8 + 1 * 32,
    3280             :                         16);
    3281           0 :                     jnt_x_avg_6tap_16x2_avx2(src_ptr + 2 * 32,
    3282             :                         16,
    3283             :                         coeffs_256,
    3284             :                         filt_256,
    3285             :                         offset_avg_256,
    3286             :                         dst + 2 * 32,
    3287             :                         16,
    3288             :                         dst8 + 2 * 32,
    3289             :                         16);
    3290           0 :                     jnt_x_avg_6tap_16x2_avx2(src_ptr + 3 * 32,
    3291             :                         16,
    3292             :                         coeffs_256,
    3293             :                         filt_256,
    3294             :                         offset_avg_256,
    3295             :                         dst + 3 * 32,
    3296             :                         16,
    3297             :                         dst8 + 3 * 32,
    3298             :                         16);
    3299           0 :                     src_ptr += src_stride;
    3300           0 :                     dst += dst_stride;
    3301           0 :                     dst8 += dst8_stride;
    3302           0 :                 } while (--y);
    3303             :             }
    3304             :         }
    3305             :         else {
    3306             :             do {
    3307             :                 jnt_x_no_avg_6tap_16x2_avx2(src_ptr,
    3308             :                     16,
    3309             :                     coeffs_256,
    3310             :                     filt_256,
    3311             :                     offset_no_avg_256,
    3312             :                     dst,
    3313             :                     16);
    3314           0 :                 jnt_x_no_avg_6tap_16x2_avx2(src_ptr + 1 * 32,
    3315             :                     16,
    3316             :                     coeffs_256,
    3317             :                     filt_256,
    3318             :                     offset_no_avg_256,
    3319             :                     dst + 1 * 32,
    3320             :                     16);
    3321           0 :                 jnt_x_no_avg_6tap_16x2_avx2(src_ptr + 2 * 32,
    3322             :                     16,
    3323             :                     coeffs_256,
    3324             :                     filt_256,
    3325             :                     offset_no_avg_256,
    3326             :                     dst + 2 * 32,
    3327             :                     16);
    3328           0 :                 jnt_x_no_avg_6tap_16x2_avx2(src_ptr + 3 * 32,
    3329             :                     16,
    3330             :                     coeffs_256,
    3331             :                     filt_256,
    3332             :                     offset_no_avg_256,
    3333             :                     dst + 3 * 32,
    3334             :                     16);
    3335           0 :                 src_ptr += src_stride;
    3336           0 :                 dst += dst_stride;
    3337           0 :             } while (--y);
    3338             :         }
    3339             :     }
    3340     7844520 : }
    3341             : 
    3342     3278940 : static void jnt_convolve_x_8tap_avx2(
    3343             :     const uint8_t *const src, const int32_t src_stride, uint8_t *dst8,
    3344             :     const int32_t dst8_stride, const int32_t w, const int32_t h,
    3345             :     const InterpFilterParams *const filter_params_x, const int32_t subpel_x_q4,
    3346             :     const ConvolveParams *const conv_params) {
    3347     3278940 :     const uint8_t *src_ptr = src - 3;
    3348     3278940 :     const int32_t dst_stride = conv_params->dst_stride;
    3349     3278940 :     const int32_t round_0 = 3;
    3350     3278940 :     const int32_t round_1 = COMPOUND_ROUND1_BITS;
    3351     3278940 :     const int32_t bits = FILTER_BITS - round_1;
    3352     3278940 :     const int32_t bd = 8;
    3353     3278940 :     const int32_t offset_bits = bd + 2 * FILTER_BITS - round_0;
    3354     3278940 :     const int32_t round_offset =
    3355     3278940 :         (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
    3356     3278940 :     const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1;
    3357     3278940 :     const int32_t offset_comp_avg =
    3358     3278940 :         round_offset * conv_params->bck_offset +
    3359     3278940 :         (1 << (round_bits + DIST_PRECISION_BITS - 1)) -
    3360     3278940 :         (round_offset << DIST_PRECISION_BITS);
    3361     3278940 :     const int16_t offset_avg = (1 << (FILTER_BITS - 1)) +
    3362     3278940 :         (1 << (round_0 - bits - 2)) -
    3363     3278940 :         (round_offset << (round_0 - bits - 1));
    3364     3278940 :     const int16_t offset_no_avg =
    3365     3278940 :         (round_offset << (round_0 - bits - 1)) + (1 << (round_0 - bits - 2));
    3366     3278940 :     const int32_t factor =
    3367     3278940 :         conv_params->fwd_offset | (conv_params->bck_offset << 16);
    3368     3278940 :     const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
    3369     3278940 :     const __m256i offset_avg_256 = _mm256_set1_epi16(offset_avg);
    3370     6557880 :     const __m256i offset_no_avg_256 = _mm256_set1_epi16(offset_no_avg);
    3371     3278940 :     const __m256i factor_256 = _mm256_set1_epi32(factor);
    3372     3278940 :     ConvBufType *dst = conv_params->dst;
    3373     3278940 :     int32_t y = h;
    3374             :     __m256i coeffs_256[4], filt_256[4];
    3375             : 
    3376     3278940 :     filt_256[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
    3377     3278940 :     filt_256[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
    3378     3278940 :     filt_256[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
    3379     3278940 :     filt_256[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
    3380             : 
    3381     3278940 :     prepare_half_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
    3382             : 
    3383     3279110 :     if (w == 8) {
    3384     1406050 :         if (conv_params->do_average) {
    3385      576810 :             if (conv_params->use_jnt_comp_avg) {
    3386             :                 do {
    3387     1889450 :                     const __m256i res = x_convolve_8tap_8x2_avx2(
    3388             :                         src_ptr, src_stride, coeffs_256, filt_256);
    3389     1889450 :                     jnt_comp_avg_round_store_8x2_avx2(res,
    3390             :                         factor_256,
    3391             :                         offset_comp_avg_256,
    3392             :                         dst,
    3393             :                         dst_stride,
    3394             :                         dst8,
    3395             :                         dst8_stride);
    3396     1889430 :                     src_ptr += 2 * src_stride;
    3397     1889430 :                     dst += 2 * dst_stride;
    3398     1889430 :                     dst8 += 2 * dst8_stride;
    3399     1889430 :                     y -= 2;
    3400     1889430 :                 } while (y);
    3401             :             }
    3402             :             else {
    3403             :                 do {
    3404     2383010 :                     const __m256i res = x_convolve_8tap_8x2_avx2(
    3405             :                         src_ptr, src_stride, coeffs_256, filt_256);
    3406     2383030 :                     jnt_avg_round_store_8x2_sse2(res,
    3407             :                         offset_avg_256,
    3408             :                         dst,
    3409             :                         dst_stride,
    3410             :                         dst8,
    3411             :                         dst8_stride);
    3412     2383000 :                     src_ptr += 2 * src_stride;
    3413     2383000 :                     dst += 2 * dst_stride;
    3414     2383000 :                     dst8 += 2 * dst8_stride;
    3415     2383000 :                     y -= 2;
    3416     2383000 :                 } while (y);
    3417             :             }
    3418             :         }
    3419             :         else {
    3420             :             do {
    3421     6212910 :                 const __m256i res = x_convolve_8tap_8x2_avx2(
    3422             :                     src_ptr, src_stride, coeffs_256, filt_256);
    3423     6213000 :                 jnt_no_avg_round_store_8x2_avx2(
    3424             :                     res, offset_no_avg_256, dst, dst_stride);
    3425     6212950 :                 src_ptr += 2 * src_stride;
    3426     6212950 :                 dst += 2 * dst_stride;
    3427     6212950 :                 y -= 2;
    3428     6212950 :             } while (y);
    3429             :         }
    3430             :     }
    3431     1873050 :     else if (w == 16) {
    3432     1034330 :         if (conv_params->do_average) {
    3433      409753 :             if (conv_params->use_jnt_comp_avg) {
    3434             :                 do {
    3435     1690030 :                     jnt_x_comp_avg_8tap_16x2_avx2(src_ptr,
    3436             :                         src_stride,
    3437             :                         coeffs_256,
    3438             :                         filt_256,
    3439             :                         factor_256,
    3440             :                         offset_comp_avg_256,
    3441             :                         dst,
    3442             :                         dst_stride,
    3443             :                         dst8,
    3444             :                         dst8_stride);
    3445     1690020 :                     src_ptr += 2 * src_stride;
    3446     1690020 :                     dst += 2 * dst_stride;
    3447     1690020 :                     dst8 += 2 * dst8_stride;
    3448     1690020 :                     y -= 2;
    3449     1690020 :                 } while (y);
    3450             :             }
    3451             :             else {
    3452             :                 do {
    3453     2376470 :                     jnt_x_avg_8tap_16x2_avx2(src_ptr,
    3454             :                         src_stride,
    3455             :                         coeffs_256,
    3456             :                         filt_256,
    3457             :                         offset_avg_256,
    3458             :                         dst,
    3459             :                         dst_stride,
    3460             :                         dst8,
    3461             :                         dst8_stride);
    3462     2376460 :                     src_ptr += 2 * src_stride;
    3463     2376460 :                     dst += 2 * dst_stride;
    3464     2376460 :                     dst8 += 2 * dst8_stride;
    3465     2376460 :                     y -= 2;
    3466     2376460 :                 } while (y);
    3467             :             }
    3468             :         }
    3469             :         else {
    3470             :             do {
    3471     6335670 :                 jnt_x_no_avg_8tap_16x2_avx2(src_ptr,
    3472             :                     src_stride,
    3473             :                     coeffs_256,
    3474             :                     filt_256,
    3475             :                     offset_no_avg_256,
    3476             :                     dst,
    3477             :                     dst_stride);
    3478     6335550 :                 src_ptr += 2 * src_stride;
    3479     6335550 :                 dst += 2 * dst_stride;
    3480     6335550 :                 y -= 2;
    3481     6335550 :             } while (y);
    3482             :         }
    3483             :     }
    3484      838721 :     else if (w == 32) {
    3485      642749 :         if (conv_params->do_average) {
    3486      248229 :             if (conv_params->use_jnt_comp_avg) {
    3487             :                 do {
    3488             :                     jnt_x_comp_avg_8tap_32_avx2(src_ptr,
    3489             :                         coeffs_256,
    3490             :                         filt_256,
    3491             :                         factor_256,
    3492             :                         offset_comp_avg_256,
    3493             :                         dst,
    3494             :                         dst8);
    3495     2157830 :                     src_ptr += src_stride;
    3496     2157830 :                     dst += dst_stride;
    3497     2157830 :                     dst8 += dst8_stride;
    3498     2157830 :                 } while (--y);
    3499             :             }
    3500             :             else {
    3501             :                 do {
    3502             :                     jnt_x_avg_8tap_32_avx2(src_ptr,
    3503             :                         coeffs_256,
    3504             :                         filt_256,
    3505             :                         offset_avg_256,
    3506             :                         dst,
    3507             :                         dst8);
    3508     3172140 :                     src_ptr += src_stride;
    3509     3172140 :                     dst += dst_stride;
    3510     3172140 :                     dst8 += dst8_stride;
    3511     3172140 :                 } while (--y);
    3512             :             }
    3513             :         }
    3514             :         else {
    3515             :             do {
    3516             :                 jnt_x_no_avg_8tap_32_avx2(
    3517             :                     src_ptr, coeffs_256, filt_256, offset_no_avg_256, dst);
    3518     8687870 :                 src_ptr += src_stride;
    3519     8687870 :                 dst += dst_stride;
    3520     8687870 :             } while (--y);
    3521             :         }
    3522             :     }
    3523      195972 :     else if (w == 64) {
    3524      195979 :         if (conv_params->do_average) {
    3525       74962 :             if (conv_params->use_jnt_comp_avg) {
    3526             :                 do {
    3527             :                     jnt_x_comp_avg_8tap_32_avx2(src_ptr,
    3528             :                         coeffs_256,
    3529             :                         filt_256,
    3530             :                         factor_256,
    3531             :                         offset_comp_avg_256,
    3532             :                         dst,
    3533             :                         dst8);
    3534      901365 :                     jnt_x_comp_avg_8tap_32_avx2(src_ptr + 32,
    3535             :                         coeffs_256,
    3536             :                         filt_256,
    3537             :                         factor_256,
    3538             :                         offset_comp_avg_256,
    3539             :                         dst + 32,
    3540             :                         dst8 + 32);
    3541      901378 :                     src_ptr += src_stride;
    3542      901378 :                     dst += dst_stride;
    3543      901378 :                     dst8 += dst8_stride;
    3544      901378 :                 } while (--y);
    3545             :             }
    3546             :             else {
    3547             :                 do {
    3548             :                     jnt_x_avg_8tap_32_avx2(src_ptr,
    3549             :                         coeffs_256,
    3550             :                         filt_256,
    3551             :                         offset_avg_256,
    3552             :                         dst,
    3553             :                         dst8);
    3554     1471870 :                     jnt_x_avg_8tap_32_avx2(src_ptr + 32,
    3555             :                         coeffs_256,
    3556             :                         filt_256,
    3557             :                         offset_avg_256,
    3558             :                         dst + 32,
    3559             :                         dst8 + 32);
    3560     1471860 :                     src_ptr += src_stride;
    3561     1471860 :                     dst += dst_stride;
    3562     1471860 :                     dst8 += dst8_stride;
    3563     1471860 :                 } while (--y);
    3564             :             }
    3565             :         }
    3566             :         else {
    3567             :             do {
    3568             :                 jnt_x_no_avg_8tap_32_avx2(
    3569             :                     src_ptr, coeffs_256, filt_256, offset_no_avg_256, dst);
    3570     3805590 :                 jnt_x_no_avg_8tap_32_avx2(src_ptr + 32,
    3571             :                     coeffs_256,
    3572             :                     filt_256,
    3573             :                     offset_no_avg_256,
    3574             :                     dst + 32);
    3575     3805620 :                 src_ptr += src_stride;
    3576     3805620 :                 dst += dst_stride;
    3577     3805620 :             } while (--y);
    3578             :         }
    3579             :     }
    3580             :     else {
    3581           0 :         assert(w == 128);
    3582             : 
    3583           0 :         if (conv_params->do_average) {
    3584           0 :             if (conv_params->use_jnt_comp_avg) {
    3585             :                 do {
    3586             :                     jnt_x_comp_avg_8tap_32_avx2(src_ptr,
    3587             :                         coeffs_256,
    3588             :                         filt_256,
    3589             :                         factor_256,
    3590             :                         offset_comp_avg_256,
    3591             :                         dst,
    3592             :                         dst8);
    3593           0 :                     jnt_x_comp_avg_8tap_32_avx2(src_ptr + 1 * 32,
    3594             :                         coeffs_256,
    3595             :                         filt_256,
    3596             :                         factor_256,
    3597             :                         offset_comp_avg_256,
    3598             :                         dst + 1 * 32,
    3599             :                         dst8 + 1 * 32);
    3600           0 :                     jnt_x_comp_avg_8tap_32_avx2(src_ptr + 2 * 32,
    3601             :                         coeffs_256,
    3602             :                         filt_256,
    3603             :                         factor_256,
    3604             :                         offset_comp_avg_256,
    3605             :                         dst + 2 * 32,
    3606             :                         dst8 + 2 * 32);
    3607           0 :                     jnt_x_comp_avg_8tap_32_avx2(src_ptr + 3 * 32,
    3608             :                         coeffs_256,
    3609             :                         filt_256,
    3610             :                         factor_256,
    3611             :                         offset_comp_avg_256,
    3612             :                         dst + 3 * 32,
    3613             :                         dst8 + 3 * 32);
    3614           0 :                     src_ptr += src_stride;
    3615           0 :                     dst += dst_stride;
    3616           0 :                     dst8 += dst8_stride;
    3617           0 :                 } while (--y);
    3618             :             }
    3619             :             else {
    3620             :                 do {
    3621             :                     jnt_x_avg_8tap_32_avx2(src_ptr,
    3622             :                         coeffs_256,
    3623             :                         filt_256,
    3624             :                         offset_avg_256,
    3625             :                         dst,
    3626             :                         dst8);
    3627           0 :                     jnt_x_avg_8tap_32_avx2(src_ptr + 1 * 32,
    3628             :                         coeffs_256,
    3629             :                         filt_256,
    3630             :                         offset_avg_256,
    3631             :                         dst + 1 * 32,
    3632             :                         dst8 + 1 * 32);
    3633           0 :                     jnt_x_avg_8tap_32_avx2(src_ptr + 2 * 32,
    3634             :                         coeffs_256,
    3635             :                         filt_256,
    3636             :                         offset_avg_256,
    3637             :                         dst + 2 * 32,
    3638             :                         dst8 + 2 * 32);
    3639           0 :                     jnt_x_avg_8tap_32_avx2(src_ptr + 3 * 32,
    3640             :                         coeffs_256,
    3641             :                         filt_256,
    3642             :                         offset_avg_256,
    3643             :                         dst + 3 * 32,
    3644             :                         dst8 + 3 * 32);
    3645           0 :                     src_ptr += src_stride;
    3646           0 :                     dst += dst_stride;
    3647           0 :                     dst8 += dst8_stride;
    3648           0 :                 } while (--y);
    3649             :             }
    3650             :         }
    3651             :         else {
    3652             :             do {
    3653             :                 jnt_x_no_avg_8tap_32_avx2(
    3654             :                     src_ptr, coeffs_256, filt_256, offset_no_avg_256, dst);
    3655           0 :                 jnt_x_no_avg_8tap_32_avx2(src_ptr + 1 * 32,
    3656             :                     coeffs_256,
    3657             :                     filt_256,
    3658             :                     offset_no_avg_256,
    3659             :                     dst + 1 * 32);
    3660           0 :                 jnt_x_no_avg_8tap_32_avx2(src_ptr + 2 * 32,
    3661             :                     coeffs_256,
    3662             :                     filt_256,
    3663             :                     offset_no_avg_256,
    3664             :                     dst + 2 * 32);
    3665           0 :                 jnt_x_no_avg_8tap_32_avx2(src_ptr + 3 * 32,
    3666             :                     coeffs_256,
    3667             :                     filt_256,
    3668             :                     offset_no_avg_256,
    3669             :                     dst + 3 * 32);
    3670           0 :                 src_ptr += src_stride;
    3671           0 :                 dst += dst_stride;
    3672           0 :             } while (--y);
    3673             :         }
    3674             :     }
    3675     3279040 : }
    3676             : 
    3677             : typedef void(*jnt_convolve_x_tap_func)(
    3678             :     const uint8_t *const src, const int32_t src_stride, uint8_t *dst8,
    3679             :     const int32_t dst8_stride, const int32_t w, const int32_t h,
    3680             :     const InterpFilterParams *const filter_params_x, const int32_t subpel_x_q4,
    3681             :     const ConvolveParams *const conv_params);
    3682             : 
    3683    31862100 : void eb_av1_jnt_convolve_x_avx2(const uint8_t *src, int32_t src_stride,
    3684             :     uint8_t *dst8, int32_t dst8_stride, int32_t w,
    3685             :     int32_t h, InterpFilterParams *filter_params_x,
    3686             :     InterpFilterParams *filter_params_y,
    3687             :     const int32_t subpel_x_q4,
    3688             :     const int32_t subpel_y_q4,
    3689             :     ConvolveParams *conv_params) {
    3690             :     static const jnt_convolve_x_tap_func
    3691             :         jnt_convolve_x_tap_func_table[MAX_FILTER_TAP + 1] = {
    3692             :             NULL,
    3693             :             NULL,
    3694             :             jnt_convolve_x_2tap_avx2,
    3695             :             NULL,
    3696             :             jnt_convolve_x_4tap_ssse3,
    3697             :             NULL,
    3698             :             jnt_convolve_x_6tap_avx2,
    3699             :             NULL,
    3700             :             jnt_convolve_x_8tap_avx2 };
    3701    31862100 :     const int32_t tap_x = get_convolve_tap(filter_params_x->filter_ptr);
    3702             : 
    3703             :     (void)filter_params_y;
    3704             :     (void)subpel_y_q4;
    3705             : 
    3706    31859200 :     assert(conv_params->round_0 == 3);
    3707    31860000 :     assert(conv_params->round_1 == COMPOUND_ROUND1_BITS);
    3708             : 
    3709    31860000 :     jnt_convolve_x_tap_func_table[tap_x](src,
    3710             :         src_stride,
    3711             :         dst8,
    3712             :         dst8_stride,
    3713             :         w,
    3714             :         h,
    3715             :         filter_params_x,
    3716             :         subpel_x_q4,
    3717             :         conv_params);
    3718    31883400 : }

Generated by: LCOV version 1.14